From dc2d86a874b610bd3afdeb01961a44ecbcb91a30 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Tue, 13 Dec 2022 12:43:21 -0800 Subject: [PATCH 001/457] ANDROID: GKI: Enable CONFIG_CFI_CLANG Enable KCFI. Depends on Clang >= 16. Bug: 239452773 Bug: 261067673 Change-Id: Ic27564a6a800120b53a932fb274daa47497e42ad Signed-off-by: Sami Tolvanen --- arch/arm64/configs/gki_defconfig | 1 + arch/x86/configs/gki_defconfig | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/arm64/configs/gki_defconfig b/arch/arm64/configs/gki_defconfig index b20f36830abd..c4919417aba5 100644 --- a/arch/arm64/configs/gki_defconfig +++ b/arch/arm64/configs/gki_defconfig @@ -78,6 +78,7 @@ CONFIG_KVM=y CONFIG_KPROBES=y CONFIG_JUMP_LABEL=y CONFIG_SHADOW_CALL_STACK=y +CONFIG_CFI_CLANG=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y CONFIG_MODVERSIONS=y diff --git a/arch/x86/configs/gki_defconfig b/arch/x86/configs/gki_defconfig index 7104cb48606c..32009d7e745c 100644 --- a/arch/x86/configs/gki_defconfig +++ b/arch/x86/configs/gki_defconfig @@ -68,6 +68,7 @@ CONFIG_KVM_INTEL=y CONFIG_KVM_AMD=y CONFIG_KPROBES=y CONFIG_JUMP_LABEL=y +CONFIG_CFI_CLANG=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y CONFIG_MODVERSIONS=y From 0f6815d63539cc738a2673834a7b3127228937a7 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 4 Oct 2021 18:48:44 +0100 Subject: [PATCH 002/457] FROMLIST: firmware/smccc: Call arch-specific hook on discovering KVM services arm64 will soon require its own callback to initialise services that are only available on this architecture. Introduce a hook that can be overloaded by the architecture. Link: https://lore.kernel.org/r/20211004174849.2831548-12-maz@kernel.org Signed-off-by: Marc Zyngier Signed-off-by: Will Deacon Bug: 233587962 Change-Id: I743e1786df1477b3c9fab0fe2e5ea52a7dcdf01f Signed-off-by: Will Deacon Signed-off-by: Quentin Perret --- arch/arm/include/asm/hypervisor.h | 1 + arch/arm64/include/asm/hypervisor.h | 1 + drivers/firmware/smccc/kvm_guest.c | 4 ++++ 3 files changed, 6 insertions(+) diff --git a/arch/arm/include/asm/hypervisor.h b/arch/arm/include/asm/hypervisor.h index bd61502b9715..8133c8c81a35 100644 --- a/arch/arm/include/asm/hypervisor.h +++ b/arch/arm/include/asm/hypervisor.h @@ -6,5 +6,6 @@ void kvm_init_hyp_services(void); bool kvm_arm_hyp_service_available(u32 func_id); +void kvm_arm_init_hyp_services(void); #endif diff --git a/arch/arm64/include/asm/hypervisor.h b/arch/arm64/include/asm/hypervisor.h index 0ae427f352c8..8e77f411903f 100644 --- a/arch/arm64/include/asm/hypervisor.h +++ b/arch/arm64/include/asm/hypervisor.h @@ -6,5 +6,6 @@ void kvm_init_hyp_services(void); bool kvm_arm_hyp_service_available(u32 func_id); +void kvm_arm_init_hyp_services(void); #endif diff --git a/drivers/firmware/smccc/kvm_guest.c b/drivers/firmware/smccc/kvm_guest.c index 89a68e7eeaa6..e2c9528b7b85 100644 --- a/drivers/firmware/smccc/kvm_guest.c +++ b/drivers/firmware/smccc/kvm_guest.c @@ -10,6 +10,8 @@ #include +void __weak kvm_arm_init_hyp_services(void) {} + static DECLARE_BITMAP(__kvm_arm_hyp_services, ARM_SMCCC_KVM_NUM_FUNCS) __ro_after_init = { }; void __init kvm_init_hyp_services(void) @@ -39,6 +41,8 @@ void __init kvm_init_hyp_services(void) pr_info("hypervisor services detected (0x%08lx 0x%08lx 0x%08lx 0x%08lx)\n", res.a3, res.a2, res.a1, res.a0); + + kvm_arm_init_hyp_services(); } bool kvm_arm_hyp_service_available(u32 func_id) From ead4a2adadb9e88f75b3e7ce8c528b4b63749640 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Thu, 15 Dec 2022 13:25:05 +0000 Subject: [PATCH 003/457] ANDROID: drivers: hv:
Include memory encryption header In order to prepare the ground for providing set_mem_{en,de}crypted() for arm64, make sure to include the right header to avoid allmodconfig build failures. Bug: 233587962 Change-Id: I6fb5a9e469a6fe8496dff81a8cda50b6c6c1ed7f Signed-off-by: Quentin Perret --- drivers/hv/channel.c | 1 + drivers/hv/connection.c | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c index 56f7e06c673e..0ffe09546a55 100644 --- a/drivers/hv/channel.c +++ b/drivers/hv/channel.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include "hyperv_vmbus.h" diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c index 9dc27e5d367a..074bffd90d43 100644 --- a/drivers/hv/connection.c +++ b/drivers/hv/connection.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include "hyperv_vmbus.h" From 2a6d4f48c81dd9a21d340c658160330d5304b9c6 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 5 Aug 2021 14:23:07 +0100 Subject: [PATCH 004/457] ANDROID: BACKPORT: arm64: mm: Implement memory encryption API using KVM sharing hypercalls When running as a protected guest, the KVM host does not have access to any pages mapped into the guest. Consequently, KVM exposes hypercalls to the guest so that pages can be shared back with the host for the purposes of shared memory communication such as virtio. Detect the presence of these hypercalls when running as a guest and use them to implement the memory encryption interfaces gated by CONFIG_ARCH_HAS_MEM_ENCRYPT which are called from the DMA layer to share SWIOTLB bounce buffers for virtio. Although no encryption is actually performed, "sharing" a page is akin to decryption, whereas "unsharing" a page maps to encryption, albeit without destruction of the underlying page contents. 
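As an illustration only (not part of this patch): a sketch of how a caller, for example a driver setting up a shared bounce buffer, might use the interface introduced below. Only set_memory_decrypted()/set_memory_encrypted() and asm/mem_encrypt.h come from this series; the wrapper names are hypothetical and the example assumes a page-aligned, page-backed allocation.

#include <linux/gfp.h>
#include <linux/bug.h>
#include <asm/mem_encrypt.h>

static void *example_share_with_host(unsigned int order)
{
        unsigned long va = __get_free_pages(GFP_KERNEL, order);

        if (!va)
                return NULL;

        /* "Decrypting" the pages shares them back with the KVM host. */
        if (set_memory_decrypted(va, 1 << order)) {
                free_pages(va, order);
                return NULL;
        }

        return (void *)va;
}

static void example_unshare_from_host(void *buf, unsigned int order)
{
        /* "Encrypting" the pages revokes the host's access again. */
        WARN_ON(set_memory_encrypted((unsigned long)buf, 1 << order));
        free_pages((unsigned long)buf, order);
}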
Signed-off-by: Will Deacon [willdeacon@: Use asm/mem_encrypt.h instead of asm/set_memory.h; Implement mem_encrypt_active(); Add hypercall IDs; Drop unneeded GIC change] [qperret@: Export set_memory_{en,de}crypted() to fix allmodconfig modpost failures] Bug: 233587962 Change-Id: I5955ff0dca65561183f9a60e94be87f28fbf14ec Signed-off-by: Will Deacon Signed-off-by: Quentin Perret --- arch/arm64/Kconfig | 1 + arch/arm64/include/asm/hypervisor.h | 1 + arch/arm64/include/asm/mem_encrypt.h | 9 ++ arch/arm64/kernel/setup.c | 6 ++ arch/arm64/mm/Makefile | 2 +- arch/arm64/mm/mem_encrypt.c | 134 +++++++++++++++++++++++++++ 6 files changed, 152 insertions(+), 1 deletion(-) create mode 100644 arch/arm64/include/asm/mem_encrypt.h create mode 100644 arch/arm64/mm/mem_encrypt.c diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 505c8a1ccbe0..31cee7a9a8c9 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -31,6 +31,7 @@ config ARM64 select ARCH_HAS_KCOV select ARCH_HAS_KEEPINITRD select ARCH_HAS_MEMBARRIER_SYNC_CORE + select ARCH_HAS_MEM_ENCRYPT select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE select ARCH_HAS_PTE_DEVMAP select ARCH_HAS_PTE_SPECIAL diff --git a/arch/arm64/include/asm/hypervisor.h b/arch/arm64/include/asm/hypervisor.h index 8e77f411903f..ef8e55e6f2f2 100644 --- a/arch/arm64/include/asm/hypervisor.h +++ b/arch/arm64/include/asm/hypervisor.h @@ -7,5 +7,6 @@ void kvm_init_hyp_services(void); bool kvm_arm_hyp_service_available(u32 func_id); void kvm_arm_init_hyp_services(void); +void kvm_init_memshare_services(void); #endif diff --git a/arch/arm64/include/asm/mem_encrypt.h b/arch/arm64/include/asm/mem_encrypt.h new file mode 100644 index 000000000000..300c8b8cbebe --- /dev/null +++ b/arch/arm64/include/asm/mem_encrypt.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef __ASM_MEM_ENCRYPT_H +#define __ASM_MEM_ENCRYPT_H + +bool mem_encrypt_active(void); +int set_memory_encrypted(unsigned long addr, int numpages); +int set_memory_decrypted(unsigned long addr, int numpages); + +#endif /* __ASM_MEM_ENCRYPT_H */ diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index fea3223704b6..0aa634a1bf66 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include #include @@ -438,3 +439,8 @@ static int __init register_arm64_panic_block(void) return 0; } device_initcall(register_arm64_panic_block); + +void kvm_arm_init_hyp_services(void) +{ + kvm_init_memshare_services(); +} diff --git a/arch/arm64/mm/Makefile b/arch/arm64/mm/Makefile index ff1e800ba7a1..f48be6c63938 100644 --- a/arch/arm64/mm/Makefile +++ b/arch/arm64/mm/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 obj-y := dma-mapping.o extable.o fault.o init.o \ cache.o copypage.o flush.o \ - ioremap.o mmap.o pgd.o mmu.o \ + ioremap.o mem_encrypt.o mmap.o pgd.o mmu.o \ context.o proc.o pageattr.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o obj-$(CONFIG_PTDUMP_CORE) += ptdump.o diff --git a/arch/arm64/mm/mem_encrypt.c b/arch/arm64/mm/mem_encrypt.c new file mode 100644 index 000000000000..fd59ab7d6880 --- /dev/null +++ b/arch/arm64/mm/mem_encrypt.c @@ -0,0 +1,134 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Implementation of the memory encryption/decryption API. + * + * Amusingly, no crypto is actually performed. Rather, we call into the + * hypervisor component of KVM to expose pages selectively to the host + * for virtio "DMA" operations. 
In other words, "encrypted" pages are + * not accessible to the host, whereas "decrypted" pages are. + * + * Author: Will Deacon + */ +#include +#include +#include +#include +#include +#include + +#include + +#ifndef ARM_SMCCC_KVM_FUNC_HYP_MEMINFO +#define ARM_SMCCC_KVM_FUNC_HYP_MEMINFO 2 + +#define ARM_SMCCC_VENDOR_HYP_KVM_HYP_MEMINFO_FUNC_ID \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_64, \ + ARM_SMCCC_OWNER_VENDOR_HYP, \ + ARM_SMCCC_KVM_FUNC_HYP_MEMINFO) +#endif /* ARM_SMCCC_KVM_FUNC_HYP_MEMINFO */ + +#ifndef ARM_SMCCC_KVM_FUNC_MEM_SHARE +#define ARM_SMCCC_KVM_FUNC_MEM_SHARE 3 + +#define ARM_SMCCC_VENDOR_HYP_KVM_MEM_SHARE_FUNC_ID \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_64, \ + ARM_SMCCC_OWNER_VENDOR_HYP, \ + ARM_SMCCC_KVM_FUNC_MEM_SHARE) +#endif /* ARM_SMCCC_KVM_FUNC_MEM_SHARE */ + +#ifndef ARM_SMCCC_KVM_FUNC_MEM_UNSHARE +#define ARM_SMCCC_KVM_FUNC_MEM_UNSHARE 4 + +#define ARM_SMCCC_VENDOR_HYP_KVM_MEM_UNSHARE_FUNC_ID \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_64, \ + ARM_SMCCC_OWNER_VENDOR_HYP, \ + ARM_SMCCC_KVM_FUNC_MEM_UNSHARE) +#endif /* ARM_SMCCC_KVM_FUNC_MEM_UNSHARE */ + +static unsigned long memshare_granule_sz; + +bool mem_encrypt_active(void) +{ + return memshare_granule_sz; +} +EXPORT_SYMBOL(mem_encrypt_active); + +void kvm_init_memshare_services(void) +{ + int i; + struct arm_smccc_res res; + const u32 funcs[] = { + ARM_SMCCC_KVM_FUNC_HYP_MEMINFO, + ARM_SMCCC_KVM_FUNC_MEM_SHARE, + ARM_SMCCC_KVM_FUNC_MEM_UNSHARE, + }; + + for (i = 0; i < ARRAY_SIZE(funcs); ++i) { + if (!kvm_arm_hyp_service_available(funcs[i])) + return; + } + + arm_smccc_1_1_invoke(ARM_SMCCC_VENDOR_HYP_KVM_HYP_MEMINFO_FUNC_ID, + 0, 0, 0, &res); + if (res.a0 > PAGE_SIZE) /* Includes error codes */ + return; + + memshare_granule_sz = res.a0; +} + +static int arm_smccc_share_unshare_page(u32 func_id, phys_addr_t phys) +{ + phys_addr_t end = phys + PAGE_SIZE; + + while (phys < end) { + struct arm_smccc_res res; + + arm_smccc_1_1_invoke(func_id, phys, 0, 0, &res); + if (res.a0 != SMCCC_RET_SUCCESS) + return -EPERM; + + phys += memshare_granule_sz; + } + + return 0; +} + +static int set_memory_xcrypted(u32 func_id, unsigned long start, int numpages) +{ + void *addr = (void *)start, *end = addr + numpages * PAGE_SIZE; + + while (addr < end) { + int err; + + err = arm_smccc_share_unshare_page(func_id, virt_to_phys(addr)); + if (err) + return err; + + addr += PAGE_SIZE; + } + + return 0; +} + +int set_memory_encrypted(unsigned long addr, int numpages) +{ + if (!memshare_granule_sz || WARN_ON(!PAGE_ALIGNED(addr))) + return 0; + + return set_memory_xcrypted(ARM_SMCCC_VENDOR_HYP_KVM_MEM_UNSHARE_FUNC_ID, + addr, numpages); +} +EXPORT_SYMBOL_GPL(set_memory_encrypted); + +int set_memory_decrypted(unsigned long addr, int numpages) +{ + if (!memshare_granule_sz || WARN_ON(!PAGE_ALIGNED(addr))) + return 0; + + return set_memory_xcrypted(ARM_SMCCC_VENDOR_HYP_KVM_MEM_SHARE_FUNC_ID, + addr, numpages); +} +EXPORT_SYMBOL_GPL(set_memory_decrypted); From 346b74c8fb8f43b4c3ace040409f27174b4f33a0 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 30 Jun 2021 13:24:32 +0100 Subject: [PATCH 005/457] ANDROID: mm/vmalloc: Add arch-specific callbacks to track io{remap,unmap} physical pages Add a pair of hooks (ioremap_phys_range_hook/iounmap_phys_range_hook) that can be implemented by an architecture. Contrary to the existing arch_sync_kernel_mappings(), this one tracks things at the physical address level. 
This is especially useful in virtualised environments where the guest has to tell the host whether (and how) it intends to use an MMIO device. Signed-off-by: Marc Zyngier Bug: 233587962 Change-Id: I970c2e632cb2b01060d5e66e4194fa9248188f43 Signed-off-by: Will Deacon [ qperret: Fixed conflict in vmalloc.c due to call to kmsan_ioremap_page_range ] Signed-off-by: Quentin Perret --- include/linux/io.h | 2 ++ mm/Kconfig | 5 +++++ mm/vmalloc.c | 12 +++++++++++- 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/include/linux/io.h b/include/linux/io.h index 308f4f0cfb93..f63b0968047e 100644 --- a/include/linux/io.h +++ b/include/linux/io.h @@ -21,6 +21,8 @@ void __ioread32_copy(void *to, const void __iomem *from, size_t count); void __iowrite64_copy(void __iomem *to, const void *from, size_t count); #ifdef CONFIG_MMU +void ioremap_phys_range_hook(phys_addr_t phys_addr, size_t size, pgprot_t prot); +void iounmap_phys_range_hook(phys_addr_t phys_addr, size_t size); int ioremap_page_range(unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot); #else diff --git a/mm/Kconfig b/mm/Kconfig index 57e1d8c5b505..9e8261dd622a 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1073,6 +1073,11 @@ config KMAP_LOCAL_NON_LINEAR_PTE_ARRAY config IO_MAPPING bool +# Some architectures want callbacks for all IO mappings in order to +# track the physical addresses that get used as devices. +config ARCH_HAS_IOREMAP_PHYS_HOOKS + bool + config SECRETMEM def_bool ARCH_HAS_SET_DIRECT_MAP && !EMBEDDED diff --git a/mm/vmalloc.c b/mm/vmalloc.c index ccaa461998f3..bdcbde04caa8 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include @@ -317,12 +318,17 @@ int ioremap_page_range(unsigned long addr, unsigned long end, { int err; - err = vmap_range_noflush(addr, end, phys_addr, pgprot_nx(prot), + prot = pgprot_nx(prot); + err = vmap_range_noflush(addr, end, phys_addr, prot, ioremap_max_page_shift); flush_cache_vmap(addr, end); if (!err) kmsan_ioremap_page_range(addr, end, phys_addr, prot, ioremap_max_page_shift); + + if (IS_ENABLED(CONFIG_ARCH_HAS_IOREMAP_PHYS_HOOKS) && !err) + ioremap_phys_range_hook(phys_addr, end - addr, prot); + return err; } @@ -2696,6 +2702,10 @@ static void __vunmap(const void *addr, int deallocate_pages) kasan_poison_vmalloc(area->addr, get_vm_area_size(area)); + if (IS_ENABLED(CONFIG_ARCH_HAS_IOREMAP_PHYS_HOOKS) && + area->flags & VM_IOREMAP) + iounmap_phys_range_hook(area->phys_addr, get_vm_area_size(area)); + vm_remove_mappings(area, deallocate_pages); if (deallocate_pages) { From 4c87d6516aa1c23363b3cd4431b8d0c10a245464 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 30 Jun 2021 13:31:56 +0100 Subject: [PATCH 006/457] ANDROID: BACKPORT: arm64: Implement ioremap/iounmap hooks calling into KVM's MMIO guard Implement the previously defined ioremap/iounmap hooks for arm64, calling into KVM's MMIO guard if available.
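As an illustration only (not part of this patch): the per-page hypercall at the core of the hook implemented below, reduced to its simplest form. The helper name is hypothetical, and pgprot_val() is used here so the attributes are passed as a plain integer; the hook in the patch additionally tracks each registered PFN in a refcounted xarray so that a page mapped by several ioremap() calls is only unregistered once.

#include <linux/arm-smccc.h>

static int example_mmio_guard_map_page(phys_addr_t phys, pgprot_t prot)
{
        struct arm_smccc_res res;

        /* Ask the hypervisor to allow this guest access to the MMIO page. */
        arm_smccc_1_1_hvc(ARM_SMCCC_VENDOR_HYP_KVM_MMIO_GUARD_MAP_FUNC_ID,
                          phys, pgprot_val(prot), &res);

        return res.a0 == SMCCC_RET_SUCCESS ? 0 : -EPERM;
}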
Signed-off-by: Marc Zyngier Bug: 233587962 Change-Id: I86a78f8941fb60078fb873a34c5eb32830a00259 [willdeacon@: Add hypercall IDs and slab_is_available() check] Signed-off-by: Will Deacon [ qperret@: Fixed context conflict ] Signed-off-by: Quentin Perret --- arch/arm64/mm/ioremap.c | 136 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 136 insertions(+) diff --git a/arch/arm64/mm/ioremap.c b/arch/arm64/mm/ioremap.c index c5af103d4ad4..89fbdff3afd0 100644 --- a/arch/arm64/mm/ioremap.c +++ b/arch/arm64/mm/ioremap.c @@ -1,7 +1,143 @@ // SPDX-License-Identifier: GPL-2.0-only +#define pr_fmt(fmt) "ioremap: " fmt + #include #include +#include +#include + +#include +#include +#include + +#ifndef ARM_SMCCC_KVM_FUNC_MMIO_GUARD_MAP +#define ARM_SMCCC_KVM_FUNC_MMIO_GUARD_MAP 7 + +#define ARM_SMCCC_VENDOR_HYP_KVM_MMIO_GUARD_MAP_FUNC_ID \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_64, \ + ARM_SMCCC_OWNER_VENDOR_HYP, \ + ARM_SMCCC_KVM_FUNC_MMIO_GUARD_MAP) +#endif /* ARM_SMCCC_KVM_FUNC_MMIO_GUARD_MAP */ + +#ifndef ARM_SMCCC_KVM_FUNC_MMIO_GUARD_UNMAP +#define ARM_SMCCC_KVM_FUNC_MMIO_GUARD_UNMAP 8 + +#define ARM_SMCCC_VENDOR_HYP_KVM_MMIO_GUARD_UNMAP_FUNC_ID \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_64, \ + ARM_SMCCC_OWNER_VENDOR_HYP, \ + ARM_SMCCC_KVM_FUNC_MMIO_GUARD_UNMAP) +#endif /* ARM_SMCCC_KVM_FUNC_MMIO_GUARD_UNMAP */ + +struct ioremap_guard_ref { + refcount_t count; +}; + +static DEFINE_STATIC_KEY_FALSE(ioremap_guard_key); +static DEFINE_XARRAY(ioremap_guard_array); +static DEFINE_MUTEX(ioremap_guard_lock); + +void ioremap_phys_range_hook(phys_addr_t phys_addr, size_t size, pgprot_t prot) +{ + if (!static_branch_unlikely(&ioremap_guard_key)) + return; + + if (pfn_valid(__phys_to_pfn(phys_addr))) + return; + + mutex_lock(&ioremap_guard_lock); + + while (size) { + u64 pfn = phys_addr >> PAGE_SHIFT; + struct ioremap_guard_ref *ref; + struct arm_smccc_res res; + + ref = xa_load(&ioremap_guard_array, pfn); + if (ref) { + refcount_inc(&ref->count); + goto next; + } + + /* + * It is acceptable for the allocation to fail, specially + * if trying to ioremap something very early on, like with + * earlycon, which happens long before kmem_cache_init. + * This page will be permanently accessible, similar to a + * saturated refcount. 
+ */ + if (slab_is_available()) + ref = kzalloc(sizeof(*ref), GFP_KERNEL); + if (ref) { + refcount_set(&ref->count, 1); + if (xa_err(xa_store(&ioremap_guard_array, pfn, ref, + GFP_KERNEL))) { + kfree(ref); + ref = NULL; + } + } + + arm_smccc_1_1_hvc(ARM_SMCCC_VENDOR_HYP_KVM_MMIO_GUARD_MAP_FUNC_ID, + phys_addr, prot, &res); + if (res.a0 != SMCCC_RET_SUCCESS) { + pr_warn_ratelimited("Failed to register %llx\n", + phys_addr); + xa_erase(&ioremap_guard_array, pfn); + kfree(ref); + goto out; + } + + next: + size -= PAGE_SIZE; + phys_addr += PAGE_SIZE; + } +out: + mutex_unlock(&ioremap_guard_lock); +} + +void iounmap_phys_range_hook(phys_addr_t phys_addr, size_t size) +{ + if (!static_branch_unlikely(&ioremap_guard_key)) + return; + + VM_BUG_ON(phys_addr & ~PAGE_MASK || size & ~PAGE_MASK); + + mutex_lock(&ioremap_guard_lock); + + while (size) { + u64 pfn = phys_addr >> PAGE_SHIFT; + struct ioremap_guard_ref *ref; + struct arm_smccc_res res; + + ref = xa_load(&ioremap_guard_array, pfn); + if (!ref) { + pr_warn_ratelimited("%llx not tracked, left mapped\n", + phys_addr); + goto next; + } + + if (!refcount_dec_and_test(&ref->count)) + goto next; + + xa_erase(&ioremap_guard_array, pfn); + kfree(ref); + + arm_smccc_1_1_hvc(ARM_SMCCC_VENDOR_HYP_KVM_MMIO_GUARD_UNMAP_FUNC_ID, + phys_addr, &res); + if (res.a0 != SMCCC_RET_SUCCESS) { + pr_warn_ratelimited("Failed to unregister %llx\n", + phys_addr); + goto out; + } + + next: + size -= PAGE_SIZE; + phys_addr += PAGE_SIZE; + } +out: + mutex_unlock(&ioremap_guard_lock); +} bool ioremap_allowed(phys_addr_t phys_addr, size_t size, unsigned long prot) { From f7c4ae96ad34c040a1650626aa99ea7bec59ba92 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 29 Sep 2021 15:39:47 +0100 Subject: [PATCH 007/457] ANDROID: BACKPORT: arm64: Enroll into KVM's MMIO guard if required Should a guest desire to enroll into the MMIO guard, allow it to do so with a command-line option. Signed-off-by: Marc Zyngier Bug: 233587962 Change-Id: Ia9a77f693531740500739693c52b4959abacafd4 [willdeacon@: Add hypercall IDs] Signed-off-by: Will Deacon Signed-off-by: Quentin Perret --- .../admin-guide/kernel-parameters.txt | 3 + arch/arm64/Kconfig | 1 + arch/arm64/include/asm/hypervisor.h | 1 + arch/arm64/kernel/setup.c | 2 + arch/arm64/mm/ioremap.c | 58 +++++++++++++++++++ 5 files changed, 65 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index b25d51b36676..72874edc0320 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2184,6 +2184,9 @@ 1 - Bypass the IOMMU for DMA. unset - Use value of CONFIG_IOMMU_DEFAULT_PASSTHROUGH. + ioremap_guard [ARM64] enable the KVM MMIO guard functionality + if available. + io7= [HW] IO7 for Marvel-based Alpha systems See comment before marvel_specify_io7 in arch/alpha/kernel/core_marvel.c. 
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 31cee7a9a8c9..9bf4dc17c574 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -28,6 +28,7 @@ config ARM64 select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_GIGANTIC_PAGE + select ARCH_HAS_IOREMAP_PHYS_HOOKS select ARCH_HAS_KCOV select ARCH_HAS_KEEPINITRD select ARCH_HAS_MEMBARRIER_SYNC_CORE diff --git a/arch/arm64/include/asm/hypervisor.h b/arch/arm64/include/asm/hypervisor.h index ef8e55e6f2f2..ee45ae14be27 100644 --- a/arch/arm64/include/asm/hypervisor.h +++ b/arch/arm64/include/asm/hypervisor.h @@ -8,5 +8,6 @@ void kvm_init_hyp_services(void); bool kvm_arm_hyp_service_available(u32 func_id); void kvm_arm_init_hyp_services(void); void kvm_init_memshare_services(void); +void kvm_init_ioremap_services(void); #endif diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 0aa634a1bf66..575eafcfcb66 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -50,6 +50,7 @@ #include #include #include +#include #include #include @@ -442,5 +443,6 @@ device_initcall(register_arm64_panic_block); void kvm_arm_init_hyp_services(void) { + kvm_init_ioremap_services(); kvm_init_memshare_services(); } diff --git a/arch/arm64/mm/ioremap.c b/arch/arm64/mm/ioremap.c index 89fbdff3afd0..8d0f8d7dbe53 100644 --- a/arch/arm64/mm/ioremap.c +++ b/arch/arm64/mm/ioremap.c @@ -11,6 +11,26 @@ #include #include +#ifndef ARM_SMCCC_KVM_FUNC_MMIO_GUARD_INFO +#define ARM_SMCCC_KVM_FUNC_MMIO_GUARD_INFO 5 + +#define ARM_SMCCC_VENDOR_HYP_KVM_MMIO_GUARD_INFO_FUNC_ID \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_64, \ + ARM_SMCCC_OWNER_VENDOR_HYP, \ + ARM_SMCCC_KVM_FUNC_MMIO_GUARD_INFO) +#endif /* ARM_SMCCC_KVM_FUNC_MMIO_GUARD_INFO */ + +#ifndef ARM_SMCCC_KVM_FUNC_MMIO_GUARD_ENROLL +#define ARM_SMCCC_KVM_FUNC_MMIO_GUARD_ENROLL 6 + +#define ARM_SMCCC_VENDOR_HYP_KVM_MMIO_GUARD_ENROLL_FUNC_ID \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_64, \ + ARM_SMCCC_OWNER_VENDOR_HYP, \ + ARM_SMCCC_KVM_FUNC_MMIO_GUARD_ENROLL) +#endif /* ARM_SMCCC_KVM_FUNC_MMIO_GUARD_ENROLL */ + #ifndef ARM_SMCCC_KVM_FUNC_MMIO_GUARD_MAP #define ARM_SMCCC_KVM_FUNC_MMIO_GUARD_MAP 7 @@ -39,6 +59,44 @@ static DEFINE_STATIC_KEY_FALSE(ioremap_guard_key); static DEFINE_XARRAY(ioremap_guard_array); static DEFINE_MUTEX(ioremap_guard_lock); +static bool ioremap_guard; +static int __init ioremap_guard_setup(char *str) +{ + ioremap_guard = true; + + return 0; +} +early_param("ioremap_guard", ioremap_guard_setup); + +void kvm_init_ioremap_services(void) +{ + struct arm_smccc_res res; + + if (!ioremap_guard) + return; + + /* We need all the functions to be implemented */ + if (!kvm_arm_hyp_service_available(ARM_SMCCC_KVM_FUNC_MMIO_GUARD_INFO) || + !kvm_arm_hyp_service_available(ARM_SMCCC_KVM_FUNC_MMIO_GUARD_ENROLL) || + !kvm_arm_hyp_service_available(ARM_SMCCC_KVM_FUNC_MMIO_GUARD_MAP) || + !kvm_arm_hyp_service_available(ARM_SMCCC_KVM_FUNC_MMIO_GUARD_UNMAP)) + return; + + arm_smccc_1_1_invoke(ARM_SMCCC_VENDOR_HYP_KVM_MMIO_GUARD_INFO_FUNC_ID, + 0, 0, 0, &res); + if (res.a0 != PAGE_SIZE) + return; + + arm_smccc_1_1_invoke(ARM_SMCCC_VENDOR_HYP_KVM_MMIO_GUARD_ENROLL_FUNC_ID, + &res); + if (res.a0 == SMCCC_RET_SUCCESS) { + static_branch_enable(&ioremap_guard_key); + pr_info("Using KVM MMIO guard for ioremap\n"); + } else { + pr_warn("KVM MMIO guard registration failed (%ld)\n", res.a0); + } +} + void ioremap_phys_range_hook(phys_addr_t phys_addr, size_t size, pgprot_t prot) { if 
(!static_branch_unlikely(&ioremap_guard_key)) From c5eb79ce1a4de1f38f13d9919b903775f71905c4 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 1 Jul 2021 09:30:45 +0100 Subject: [PATCH 008/457] ANDROID: arm64: Add a helper to retrieve the PTE of a fixmap In order to transfer the early mapping state into KVM's MMIO guard infrastructure, provide a small helper that will retrieve the associated PTE. Signed-off-by: Marc Zyngier Bug: 233587962 Change-Id: Iefc1c57d5e9476b718a8a68f60e562a57b09fb6a Signed-off-by: Will Deacon Signed-off-by: Quentin Perret --- arch/arm64/include/asm/fixmap.h | 2 ++ arch/arm64/mm/mmu.c | 15 +++++++++++++++ 2 files changed, 17 insertions(+) diff --git a/arch/arm64/include/asm/fixmap.h b/arch/arm64/include/asm/fixmap.h index 71ed5fdf718b..02bf6e47029a 100644 --- a/arch/arm64/include/asm/fixmap.h +++ b/arch/arm64/include/asm/fixmap.h @@ -109,6 +109,8 @@ void __init early_fixmap_init(void); extern void __set_fixmap(enum fixed_addresses idx, phys_addr_t phys, pgprot_t prot); +extern pte_t *__get_fixmap_pte(enum fixed_addresses idx); + #include #endif /* !__ASSEMBLY__ */ diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 9a7c38965154..ae25524dea6a 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -1351,6 +1351,21 @@ void __set_fixmap(enum fixed_addresses idx, } } +pte_t *__get_fixmap_pte(enum fixed_addresses idx) +{ + unsigned long addr = __fix_to_virt(idx); + pte_t *ptep; + + BUG_ON(idx <= FIX_HOLE || idx >= __end_of_fixed_addresses); + + ptep = fixmap_pte(addr); + + if (!pte_valid(*ptep)) + return NULL; + + return ptep; +} + void *__init fixmap_remap_fdt(phys_addr_t dt_phys, int *size, pgprot_t prot) { const u64 dt_virt_base = __fix_to_virt(FIX_FDT); From ef4a035f8aee9ae8c29200560dc74907d33a65af Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 1 Jul 2021 09:34:12 +0100 Subject: [PATCH 009/457] ANDROID: arm64: Register earlycon fixmap with the MMIO guard On initialising the MMIO guard infrastructure, register the earlycon mapping if present. 
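As an illustration (the device and address below are hypothetical, not taken from this patch): a guest booted with something like earlycon=pl011,0x09000000 maps its UART through the fixmap long before MMIO guard enrolment runs, so without the fix-up below that page would never be registered with the hypervisor and early console output could stop working once the guard's checks take effect.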
Signed-off-by: Marc Zyngier Bug: 233587962 Change-Id: I379387253d08e2414fa386a3360a45391da7d90d Signed-off-by: Will Deacon Signed-off-by: Quentin Perret --- arch/arm64/mm/ioremap.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/arch/arm64/mm/ioremap.c b/arch/arm64/mm/ioremap.c index 8d0f8d7dbe53..e91a79c5e3df 100644 --- a/arch/arm64/mm/ioremap.c +++ b/arch/arm64/mm/ioremap.c @@ -68,6 +68,17 @@ static int __init ioremap_guard_setup(char *str) } early_param("ioremap_guard", ioremap_guard_setup); +static void fixup_fixmap(void) +{ + pte_t *ptep = __get_fixmap_pte(FIX_EARLYCON_MEM_BASE); + + if (!ptep) + return; + + ioremap_phys_range_hook(__pte_to_phys(*ptep), PAGE_SIZE, + __pgprot(pte_val(*ptep) & PTE_ATTRINDX_MASK)); +} + void kvm_init_ioremap_services(void) { struct arm_smccc_res res; @@ -91,6 +102,7 @@ void kvm_init_ioremap_services(void) &res); if (res.a0 == SMCCC_RET_SUCCESS) { static_branch_enable(&ioremap_guard_key); + fixup_fixmap(); pr_info("Using KVM MMIO guard for ioremap\n"); } else { pr_warn("KVM MMIO guard registration failed (%ld)\n", res.a0); From 4b40706d71359a6254a3bf217986af262de59396 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Thu, 20 Oct 2022 14:38:03 +0100 Subject: [PATCH 010/457] FROMLIST: KVM: arm64: Move hyp refcount manipulation helpers to common header file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We will soon need to manipulate 'struct hyp_page' refcounts from outside page_alloc.c, so move the helpers to a common header file to allow them to be reused easily. Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Oliver Upton Tested-by: Vincent Donnefort Signed-off-by: Quentin Perret Signed-off-by: Will Deacon Link: https://lore.kernel.org/r/20221020133827.5541-2-will@kernel.org Signed-off-by: Will Deacon Bug: 233587962 Change-Id: I7a7b37043f3514fcb2e1815c06366e1399e90a15 Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/include/nvhe/memory.h | 22 ++++++++++++++++++++++ arch/arm64/kvm/hyp/nvhe/page_alloc.c | 19 ------------------- 2 files changed, 22 insertions(+), 19 deletions(-) diff --git a/arch/arm64/kvm/hyp/include/nvhe/memory.h b/arch/arm64/kvm/hyp/include/nvhe/memory.h index 592b7edb3edb..9422900e5c6a 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/memory.h +++ b/arch/arm64/kvm/hyp/include/nvhe/memory.h @@ -38,6 +38,10 @@ static inline phys_addr_t hyp_virt_to_phys(void *addr) #define hyp_page_to_virt(page) __hyp_va(hyp_page_to_phys(page)) #define hyp_page_to_pool(page) (((struct hyp_page *)page)->pool) +/* + * Refcounting for 'struct hyp_page'. + * hyp_pool::lock must be held if atomic access to the refcount is required. 
+ */ static inline int hyp_page_count(void *addr) { struct hyp_page *p = hyp_virt_to_page(addr); @@ -45,4 +49,22 @@ static inline int hyp_page_count(void *addr) return p->refcount; } +static inline void hyp_page_ref_inc(struct hyp_page *p) +{ + BUG_ON(p->refcount == USHRT_MAX); + p->refcount++; +} + +static inline int hyp_page_ref_dec_and_test(struct hyp_page *p) +{ + BUG_ON(!p->refcount); + p->refcount--; + return (p->refcount == 0); +} + +static inline void hyp_set_page_refcounted(struct hyp_page *p) +{ + BUG_ON(p->refcount); + p->refcount = 1; +} #endif /* __KVM_HYP_MEMORY_H */ diff --git a/arch/arm64/kvm/hyp/nvhe/page_alloc.c b/arch/arm64/kvm/hyp/nvhe/page_alloc.c index d40f0b30b534..1ded09fc9b10 100644 --- a/arch/arm64/kvm/hyp/nvhe/page_alloc.c +++ b/arch/arm64/kvm/hyp/nvhe/page_alloc.c @@ -144,25 +144,6 @@ static struct hyp_page *__hyp_extract_page(struct hyp_pool *pool, return p; } -static inline void hyp_page_ref_inc(struct hyp_page *p) -{ - BUG_ON(p->refcount == USHRT_MAX); - p->refcount++; -} - -static inline int hyp_page_ref_dec_and_test(struct hyp_page *p) -{ - BUG_ON(!p->refcount); - p->refcount--; - return (p->refcount == 0); -} - -static inline void hyp_set_page_refcounted(struct hyp_page *p) -{ - BUG_ON(p->refcount); - p->refcount = 1; -} - static void __hyp_put_page(struct hyp_pool *pool, struct hyp_page *p) { if (hyp_page_ref_dec_and_test(p)) From 8f505fc251c84b21eefa25077ac6d2639e7cfa8b Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Thu, 20 Oct 2022 14:38:04 +0100 Subject: [PATCH 011/457] FROMLIST: KVM: arm64: Allow attaching of non-coalescable pages to a hyp pool All the contiguous pages used to initialize a 'struct hyp_pool' are considered coalescable, which means that the hyp page allocator will actively try to merge them with their buddies on the hyp_put_page() path. However, using hyp_put_page() on a page that is not part of the initial memory range given to a hyp_pool() is currently unsupported. In order to allow dynamically extending hyp pools at run-time, add a check to __hyp_attach_page() to allow inserting 'external' pages into the free-list of order 0. This will be necessary to allow lazy donation of pages from the host to the hypervisor when allocating guest stage-2 page-table pages at EL2. Tested-by: Vincent Donnefort Signed-off-by: Quentin Perret Signed-off-by: Will Deacon Link: https://lore.kernel.org/r/20221020133827.5541-3-will@kernel.org Signed-off-by: Will Deacon Bug: 233587962 Change-Id: Ie417f839ae6d30705847ab3e27e2b3c3ac6ee8dc Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/nvhe/page_alloc.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/arm64/kvm/hyp/nvhe/page_alloc.c b/arch/arm64/kvm/hyp/nvhe/page_alloc.c index 1ded09fc9b10..0d15227aced8 100644 --- a/arch/arm64/kvm/hyp/nvhe/page_alloc.c +++ b/arch/arm64/kvm/hyp/nvhe/page_alloc.c @@ -93,11 +93,15 @@ static inline struct hyp_page *node_to_page(struct list_head *node) static void __hyp_attach_page(struct hyp_pool *pool, struct hyp_page *p) { + phys_addr_t phys = hyp_page_to_phys(p); unsigned short order = p->order; struct hyp_page *buddy; memset(hyp_page_to_virt(p), 0, PAGE_SIZE << p->order); + if (phys < pool->range_start || phys >= pool->range_end) + goto insert; + /* * Only the first struct hyp_page of a high-order page (otherwise known * as the 'head') should have p->order set.
The non-head pages should @@ -116,6 +120,7 @@ static void __hyp_attach_page(struct hyp_pool *pool, p = min(p, buddy); } +insert: /* Mark the new head, and insert it */ p->order = order; page_add_to_list(p, &pool->free_area[order]); From c066327789e5ee48fe21929a78874aa901d24dbb Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Thu, 20 Oct 2022 14:38:05 +0100 Subject: [PATCH 012/457] FROMLIST: KVM: arm64: Back the hypervisor 'struct hyp_page' array for all memory The EL2 'vmemmap' array in nVHE Protected mode is currently very sparse: only memory pages owned by the hypervisor itself have a matching 'struct hyp_page'. However, as the size of this struct has been reduced significantly since its introduction, it appears that we can now afford to back the vmemmap for all of memory. Having an easily accessible 'struct hyp_page' for every physical page in memory provides the hypervisor with a simple mechanism to store metadata (e.g. a refcount) that wouldn't otherwise fit in the very limited number of software bits available in the host stage-2 page-table entries. This will be used in subsequent patches when pinning host memory pages for use by the hypervisor at EL2. Tested-by: Vincent Donnefort Signed-off-by: Quentin Perret Signed-off-by: Will Deacon Link: https://lore.kernel.org/r/20221020133827.5541-4-will@kernel.org Signed-off-by: Will Deacon Bug: 233587962 Change-Id: Ifced0fb12faf2af6b757d68e18ba7cb74fdd0871 Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_pkvm.h | 26 +++++++++++++++++++++++ arch/arm64/kvm/hyp/include/nvhe/mm.h | 14 +------------ arch/arm64/kvm/hyp/nvhe/mm.c | 31 ++++++++++++++++++++++++---- arch/arm64/kvm/hyp/nvhe/page_alloc.c | 4 +--- arch/arm64/kvm/hyp/nvhe/setup.c | 7 +++---- arch/arm64/kvm/pkvm.c | 18 ++-------------- 6 files changed, 60 insertions(+), 40 deletions(-) diff --git a/arch/arm64/include/asm/kvm_pkvm.h b/arch/arm64/include/asm/kvm_pkvm.h index 9f4ad2a8df59..8f7b8a2314bb 100644 --- a/arch/arm64/include/asm/kvm_pkvm.h +++ b/arch/arm64/include/asm/kvm_pkvm.h @@ -14,6 +14,32 @@ extern struct memblock_region kvm_nvhe_sym(hyp_memory)[]; extern unsigned int kvm_nvhe_sym(hyp_memblock_nr); +static inline unsigned long +hyp_vmemmap_memblock_size(struct memblock_region *reg, size_t vmemmap_entry_size) +{ + unsigned long nr_pages = reg->size >> PAGE_SHIFT; + unsigned long start, end; + + start = (reg->base >> PAGE_SHIFT) * vmemmap_entry_size; + end = start + nr_pages * vmemmap_entry_size; + start = ALIGN_DOWN(start, PAGE_SIZE); + end = ALIGN(end, PAGE_SIZE); + + return end - start; +} + +static inline unsigned long hyp_vmemmap_pages(size_t vmemmap_entry_size) +{ + unsigned long res = 0, i; + + for (i = 0; i < kvm_nvhe_sym(hyp_memblock_nr); i++) { + res += hyp_vmemmap_memblock_size(&kvm_nvhe_sym(hyp_memory)[i], + vmemmap_entry_size); + } + + return res >> PAGE_SHIFT; +} + static inline unsigned long __hyp_pgtable_max_pages(unsigned long nr_pages) { unsigned long total = 0, i; diff --git a/arch/arm64/kvm/hyp/include/nvhe/mm.h b/arch/arm64/kvm/hyp/include/nvhe/mm.h index 42d8eb9bfe72..b2ee6d5df55b 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mm.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mm.h @@ -15,7 +15,7 @@ extern hyp_spinlock_t pkvm_pgd_lock; int hyp_create_idmap(u32 hyp_va_bits); int hyp_map_vectors(void); -int hyp_back_vmemmap(phys_addr_t phys, unsigned long size, phys_addr_t back); +int hyp_back_vmemmap(phys_addr_t back); int pkvm_cpu_set_vector(enum arm64_hyp_spectre_vector slot); int pkvm_create_mappings(void *from, void *to, enum kvm_pgtable_prot prot); 
int pkvm_create_mappings_locked(void *from, void *to, enum kvm_pgtable_prot prot); @@ -24,16 +24,4 @@ int __pkvm_create_private_mapping(phys_addr_t phys, size_t size, unsigned long *haddr); int pkvm_alloc_private_va_range(size_t size, unsigned long *haddr); -static inline void hyp_vmemmap_range(phys_addr_t phys, unsigned long size, - unsigned long *start, unsigned long *end) -{ - unsigned long nr_pages = size >> PAGE_SHIFT; - struct hyp_page *p = hyp_phys_to_page(phys); - - *start = (unsigned long)p; - *end = *start + nr_pages * sizeof(struct hyp_page); - *start = ALIGN_DOWN(*start, PAGE_SIZE); - *end = ALIGN(*end, PAGE_SIZE); -} - #endif /* __KVM_HYP_MM_H */ diff --git a/arch/arm64/kvm/hyp/nvhe/mm.c b/arch/arm64/kvm/hyp/nvhe/mm.c index 96193cb31a39..d3a3b47181de 100644 --- a/arch/arm64/kvm/hyp/nvhe/mm.c +++ b/arch/arm64/kvm/hyp/nvhe/mm.c @@ -129,13 +129,36 @@ int pkvm_create_mappings(void *from, void *to, enum kvm_pgtable_prot prot) return ret; } -int hyp_back_vmemmap(phys_addr_t phys, unsigned long size, phys_addr_t back) +int hyp_back_vmemmap(phys_addr_t back) { - unsigned long start, end; + unsigned long i, start, size, end = 0; + int ret; - hyp_vmemmap_range(phys, size, &start, &end); + for (i = 0; i < hyp_memblock_nr; i++) { + start = hyp_memory[i].base; + start = ALIGN_DOWN((u64)hyp_phys_to_page(start), PAGE_SIZE); + /* + * The begining of the hyp_vmemmap region for the current + * memblock may already be backed by the page backing the end + * the previous region, so avoid mapping it twice. + */ + start = max(start, end); - return __pkvm_create_mappings(start, end - start, back, PAGE_HYP); + end = hyp_memory[i].base + hyp_memory[i].size; + end = PAGE_ALIGN((u64)hyp_phys_to_page(end)); + if (start >= end) + continue; + + size = end - start; + ret = __pkvm_create_mappings(start, size, back, PAGE_HYP); + if (ret) + return ret; + + memset(hyp_phys_to_virt(back), 0, size); + back += size; + } + + return 0; } static void *__hyp_bp_vect_base; diff --git a/arch/arm64/kvm/hyp/nvhe/page_alloc.c b/arch/arm64/kvm/hyp/nvhe/page_alloc.c index 0d15227aced8..7804da89e55d 100644 --- a/arch/arm64/kvm/hyp/nvhe/page_alloc.c +++ b/arch/arm64/kvm/hyp/nvhe/page_alloc.c @@ -235,10 +235,8 @@ int hyp_pool_init(struct hyp_pool *pool, u64 pfn, unsigned int nr_pages, /* Init the vmemmap portion */ p = hyp_phys_to_page(phys); - for (i = 0; i < nr_pages; i++) { - p[i].order = 0; + for (i = 0; i < nr_pages; i++) hyp_set_page_refcounted(&p[i]); - } /* Attach the unused pages to the buddy tree */ for (i = reserved_pages; i < nr_pages; i++) diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c index e8d4ea2fcfa0..579eb4f73476 100644 --- a/arch/arm64/kvm/hyp/nvhe/setup.c +++ b/arch/arm64/kvm/hyp/nvhe/setup.c @@ -31,12 +31,11 @@ static struct hyp_pool hpool; static int divide_memory_pool(void *virt, unsigned long size) { - unsigned long vstart, vend, nr_pages; + unsigned long nr_pages; hyp_early_alloc_init(virt, size); - hyp_vmemmap_range(__hyp_pa(virt), size, &vstart, &vend); - nr_pages = (vend - vstart) >> PAGE_SHIFT; + nr_pages = hyp_vmemmap_pages(sizeof(struct hyp_page)); vmemmap_base = hyp_early_alloc_contig(nr_pages); if (!vmemmap_base) return -ENOMEM; @@ -78,7 +77,7 @@ static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size, if (ret) return ret; - ret = hyp_back_vmemmap(phys, size, hyp_virt_to_phys(vmemmap_base)); + ret = hyp_back_vmemmap(hyp_virt_to_phys(vmemmap_base)); if (ret) return ret; diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c index 
ebecb7c045f4..34229425b25d 100644 --- a/arch/arm64/kvm/pkvm.c +++ b/arch/arm64/kvm/pkvm.c @@ -53,7 +53,7 @@ static int __init register_memblock_regions(void) void __init kvm_hyp_reserve(void) { - u64 nr_pages, prev, hyp_mem_pages = 0; + u64 hyp_mem_pages = 0; int ret; if (!is_hyp_mode_available() || is_kernel_in_hyp_mode()) @@ -71,21 +71,7 @@ void __init kvm_hyp_reserve(void) hyp_mem_pages += hyp_s1_pgtable_pages(); hyp_mem_pages += host_s2_pgtable_pages(); - - /* - * The hyp_vmemmap needs to be backed by pages, but these pages - * themselves need to be present in the vmemmap, so compute the number - * of pages needed by looking for a fixed point. - */ - nr_pages = 0; - do { - prev = nr_pages; - nr_pages = hyp_mem_pages + prev; - nr_pages = DIV_ROUND_UP(nr_pages * STRUCT_HYP_PAGE_SIZE, - PAGE_SIZE); - nr_pages += __hyp_pgtable_max_pages(nr_pages); - } while (nr_pages != prev); - hyp_mem_pages += nr_pages; + hyp_mem_pages += hyp_vmemmap_pages(STRUCT_HYP_PAGE_SIZE); /* * Try to allocate a PMD-aligned region to reduce TLB pressure once From 27bae98c43d973c7642f9d2357dff91217661b28 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Thu, 20 Oct 2022 14:38:06 +0100 Subject: [PATCH 013/457] FROMLIST: KVM: arm64: Fix-up hyp stage-1 refcounts for all pages mapped at EL2 In order to allow unmapping arbitrary memory pages from the hypervisor stage-1 page-table, fix-up the initial refcount for pages that have been mapped before the 'vmemmap' array was up and running so that it accurately accounts for all existing hypervisor mappings. This is achieved by traversing the entire hypervisor stage-1 page-table during initialisation of EL2 and updating the corresponding 'struct hyp_page' for each valid mapping. Tested-by: Vincent Donnefort Signed-off-by: Quentin Perret Signed-off-by: Will Deacon Link: https://lore.kernel.org/r/20221020133827.5541-5-will@kernel.org Signed-off-by: Will Deacon Bug: 233587962 Change-Id: I10df07bab97198ac4fd9477091b2e9340cc441b0 Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/nvhe/setup.c | 62 +++++++++++++++++++++++---------- 1 file changed, 43 insertions(+), 19 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c index 579eb4f73476..8f2726d7e201 100644 --- a/arch/arm64/kvm/hyp/nvhe/setup.c +++ b/arch/arm64/kvm/hyp/nvhe/setup.c @@ -185,12 +185,11 @@ static void hpool_put_page(void *addr) hyp_put_page(&hpool, addr); } -static int finalize_host_mappings_walker(u64 addr, u64 end, u32 level, - kvm_pte_t *ptep, - enum kvm_pgtable_walk_flags flag, - void * const arg) +static int fix_host_ownership_walker(u64 addr, u64 end, u32 level, + kvm_pte_t *ptep, + enum kvm_pgtable_walk_flags flag, + void * const arg) { - struct kvm_pgtable_mm_ops *mm_ops = arg; enum kvm_pgtable_prot prot; enum pkvm_page_state state; kvm_pte_t pte = *ptep; @@ -199,15 +198,6 @@ static int finalize_host_mappings_walker(u64 addr, u64 end, u32 level, if (!kvm_pte_valid(pte)) return 0; - /* - * Fix-up the refcount for the page-table pages as the early allocator - * was unable to access the hyp_vmemmap and so the buddy allocator has - * initialised the refcount to '1'. 
- */ - mm_ops->get_page(ptep); - if (flag != KVM_PGTABLE_WALK_LEAF) - return 0; - if (level != (KVM_PGTABLE_MAX_LEVELS - 1)) return -EINVAL; @@ -236,12 +226,30 @@ static int finalize_host_mappings_walker(u64 addr, u64 end, u32 level, return host_stage2_idmap_locked(phys, PAGE_SIZE, prot); } -static int finalize_host_mappings(void) +static int fix_hyp_pgtable_refcnt_walker(u64 addr, u64 end, u32 level, + kvm_pte_t *ptep, + enum kvm_pgtable_walk_flags flag, + void * const arg) +{ + struct kvm_pgtable_mm_ops *mm_ops = arg; + kvm_pte_t pte = *ptep; + + /* + * Fix-up the refcount for the page-table pages as the early allocator + * was unable to access the hyp_vmemmap and so the buddy allocator has + * initialised the refcount to '1'. + */ + if (kvm_pte_valid(pte)) + mm_ops->get_page(ptep); + + return 0; +} + +static int fix_host_ownership(void) { struct kvm_pgtable_walker walker = { - .cb = finalize_host_mappings_walker, - .flags = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST, - .arg = pkvm_pgtable.mm_ops, + .cb = fix_host_ownership_walker, + .flags = KVM_PGTABLE_WALK_LEAF, }; int i, ret; @@ -257,6 +265,18 @@ static int finalize_host_mappings(void) return 0; } +static int fix_hyp_pgtable_refcnt(void) +{ + struct kvm_pgtable_walker walker = { + .cb = fix_hyp_pgtable_refcnt_walker, + .flags = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST, + .arg = pkvm_pgtable.mm_ops, + }; + + return kvm_pgtable_walk(&pkvm_pgtable, 0, BIT(pkvm_pgtable.ia_bits), + &walker); +} + void __noreturn __pkvm_init_finalise(void) { struct kvm_host_data *host_data = this_cpu_ptr(&kvm_host_data); @@ -286,7 +306,11 @@ void __noreturn __pkvm_init_finalise(void) }; pkvm_pgtable.mm_ops = &pkvm_pgtable_mm_ops; - ret = finalize_host_mappings(); + ret = fix_host_ownership(); + if (ret) + goto out; + + ret = fix_hyp_pgtable_refcnt(); if (ret) goto out; From 320597dd7b4d01508038174102de6ea0d2076b5d Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 20 Oct 2022 14:38:07 +0100 Subject: [PATCH 014/457] FROMLIST: KVM: arm64: Unify identifiers used to distinguish host and hypervisor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 'pkvm_component_id' enum type provides constants to refer to the host and the hypervisor, yet this information is duplicated by the 'pkvm_hyp_id' constant. Remove the definition of 'pkvm_hyp_id' and move the 'pkvm_component_id' type definition to 'mem_protect.h' so that it can be used outside of the memory protection code, for example when initialising the owner for hypervisor-owned pages. 
Reviewed-by: Philippe Mathieu-Daudé Tested-by: Vincent Donnefort Signed-off-by: Will Deacon Link: https://lore.kernel.org/r/20221020133827.5541-6-will@kernel.org Signed-off-by: Will Deacon Bug: 233587962 Change-Id: I78454dc315a0993808a741ebd7fabf9dbcaf09bc Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/include/nvhe/mem_protect.h | 6 +++++- arch/arm64/kvm/hyp/nvhe/mem_protect.c | 8 -------- arch/arm64/kvm/hyp/nvhe/setup.c | 2 +- 3 files changed, 6 insertions(+), 10 deletions(-) diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h index 80e99836eac7..f5705a1e972f 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h @@ -51,7 +51,11 @@ struct host_kvm { }; extern struct host_kvm host_kvm; -extern const u8 pkvm_hyp_id; +/* This corresponds to page-table locking order */ +enum pkvm_component_id { + PKVM_ID_HOST, + PKVM_ID_HYP, +}; int __pkvm_prot_finalize(void); int __pkvm_host_share_hyp(u64 pfn); diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 07f9dc9848ef..b5ab055c4674 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -26,8 +26,6 @@ struct host_kvm host_kvm; static struct hyp_pool host_s2_pool; -const u8 pkvm_hyp_id = 1; - static void host_lock_component(void) { hyp_spin_lock(&host_kvm.lock); @@ -380,12 +378,6 @@ void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt) BUG_ON(ret && ret != -EAGAIN); } -/* This corresponds to locking order */ -enum pkvm_component_id { - PKVM_ID_HOST, - PKVM_ID_HYP, -}; - struct pkvm_mem_transition { u64 nr_pages; diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c index 8f2726d7e201..0312c9c74a5a 100644 --- a/arch/arm64/kvm/hyp/nvhe/setup.c +++ b/arch/arm64/kvm/hyp/nvhe/setup.c @@ -212,7 +212,7 @@ static int fix_host_ownership_walker(u64 addr, u64 end, u32 level, state = pkvm_getstate(kvm_pgtable_hyp_pte_prot(pte)); switch (state) { case PKVM_PAGE_OWNED: - return host_stage2_set_owner_locked(phys, PAGE_SIZE, pkvm_hyp_id); + return host_stage2_set_owner_locked(phys, PAGE_SIZE, PKVM_ID_HYP); case PKVM_PAGE_SHARED_OWNED: prot = pkvm_mkstate(PKVM_HOST_MEM_PROT, PKVM_PAGE_SHARED_BORROWED); break; From 1d9cf02cffe41057d1375a8afee87dc618131e77 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 20 Oct 2022 14:38:08 +0100 Subject: [PATCH 015/457] FROMLIST: KVM: arm64: Implement do_donate() helper for donating memory Transferring ownership information of a memory region from one component to another can be achieved using a "donate" operation, which results in the previous owner losing access to the underlying pages entirely and the new owner having exclusive access to the page. Implement a do_donate() helper, along the same lines as do_{un,}share, and provide this functionality for the host-{to,from}-hyp cases as this will later be used to donate/reclaim memory pages to store VM metadata at EL2. In a similar manner to the sharing transitions, permission checks are performed by the hypervisor to ensure that the component initiating the transition really is the owner of the page and also that the completer does not currently have a page mapped at the target address. 
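A minimal usage sketch, not part of this patch: how a later EL2 caller might accept a page donated by the host using the helper added below. The wrapper function is hypothetical; 'pfn' would typically arrive as a hypercall argument, and hyp_pfn_to_phys()/hyp_phys_to_virt() are the existing nVHE address-translation helpers.

static int example_admit_host_page(u64 pfn)
{
        int ret = __pkvm_host_donate_hyp(pfn, 1);

        if (ret)
                return ret;

        /*
         * The page is now unmapped from the host stage-2, owned by the
         * hypervisor and mapped into the hyp stage-1, so it can be used
         * to hold EL2-private metadata.
         */
        memset(hyp_phys_to_virt(hyp_pfn_to_phys(pfn)), 0, PAGE_SIZE);
        return 0;
}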
Tested-by: Vincent Donnefort Co-developed-by: Quentin Perret Signed-off-by: Quentin Perret Signed-off-by: Will Deacon Link: https://lore.kernel.org/r/20221020133827.5541-7-will@kernel.org Signed-off-by: Will Deacon Bug: 233587962 Change-Id: I19c349d40f63c1bd5eae0baf9acd21697e1e8762 Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/include/nvhe/mem_protect.h | 2 + arch/arm64/kvm/hyp/nvhe/mem_protect.c | 239 ++++++++++++++++++ 2 files changed, 241 insertions(+) diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h index f5705a1e972f..c87b19b2d468 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h @@ -60,6 +60,8 @@ enum pkvm_component_id { int __pkvm_prot_finalize(void); int __pkvm_host_share_hyp(u64 pfn); int __pkvm_host_unshare_hyp(u64 pfn); +int __pkvm_host_donate_hyp(u64 pfn, u64 nr_pages); +int __pkvm_hyp_donate_host(u64 pfn, u64 nr_pages); bool addr_is_memory(phys_addr_t phys); int host_stage2_idmap_locked(phys_addr_t addr, u64 size, enum kvm_pgtable_prot prot); diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index b5ab055c4674..6fbd0e9f7d7f 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -391,6 +391,9 @@ struct pkvm_mem_transition { /* Address in the completer's address space */ u64 completer_addr; } host; + struct { + u64 completer_addr; + } hyp; }; } initiator; @@ -404,6 +407,10 @@ struct pkvm_mem_share { const enum kvm_pgtable_prot completer_prot; }; +struct pkvm_mem_donation { + const struct pkvm_mem_transition tx; +}; + struct check_walk_data { enum pkvm_page_state desired; enum pkvm_page_state (*get_page_state)(kvm_pte_t pte); @@ -503,6 +510,46 @@ static int host_initiate_unshare(u64 *completer_addr, return __host_set_page_state_range(addr, size, PKVM_PAGE_OWNED); } +static int host_initiate_donation(u64 *completer_addr, + const struct pkvm_mem_transition *tx) +{ + u8 owner_id = tx->completer.id; + u64 size = tx->nr_pages * PAGE_SIZE; + + *completer_addr = tx->initiator.host.completer_addr; + return host_stage2_set_owner_locked(tx->initiator.addr, size, owner_id); +} + +static bool __host_ack_skip_pgtable_check(const struct pkvm_mem_transition *tx) +{ + return !(IS_ENABLED(CONFIG_NVHE_EL2_DEBUG) || + tx->initiator.id != PKVM_ID_HYP); +} + +static int __host_ack_transition(u64 addr, const struct pkvm_mem_transition *tx, + enum pkvm_page_state state) +{ + u64 size = tx->nr_pages * PAGE_SIZE; + + if (__host_ack_skip_pgtable_check(tx)) + return 0; + + return __host_check_page_state_range(addr, size, state); +} + +static int host_ack_donation(u64 addr, const struct pkvm_mem_transition *tx) +{ + return __host_ack_transition(addr, tx, PKVM_NOPAGE); +} + +static int host_complete_donation(u64 addr, const struct pkvm_mem_transition *tx) +{ + u64 size = tx->nr_pages * PAGE_SIZE; + u8 host_id = tx->completer.id; + + return host_stage2_set_owner_locked(addr, size, host_id); +} + static enum pkvm_page_state hyp_get_page_state(kvm_pte_t pte) { if (!kvm_pte_valid(pte)) @@ -523,6 +570,27 @@ static int __hyp_check_page_state_range(u64 addr, u64 size, return check_page_state_range(&pkvm_pgtable, addr, size, &d); } +static int hyp_request_donation(u64 *completer_addr, + const struct pkvm_mem_transition *tx) +{ + u64 size = tx->nr_pages * PAGE_SIZE; + u64 addr = tx->initiator.addr; + + *completer_addr = tx->initiator.hyp.completer_addr; + return __hyp_check_page_state_range(addr, size, 
PKVM_PAGE_OWNED); +} + +static int hyp_initiate_donation(u64 *completer_addr, + const struct pkvm_mem_transition *tx) +{ + u64 size = tx->nr_pages * PAGE_SIZE; + int ret; + + *completer_addr = tx->initiator.hyp.completer_addr; + ret = kvm_pgtable_hyp_unmap(&pkvm_pgtable, tx->initiator.addr, size); + return (ret != size) ? -EFAULT : 0; +} + static bool __hyp_ack_skip_pgtable_check(const struct pkvm_mem_transition *tx) { return !(IS_ENABLED(CONFIG_NVHE_EL2_DEBUG) || @@ -554,6 +622,16 @@ static int hyp_ack_unshare(u64 addr, const struct pkvm_mem_transition *tx) PKVM_PAGE_SHARED_BORROWED); } +static int hyp_ack_donation(u64 addr, const struct pkvm_mem_transition *tx) +{ + u64 size = tx->nr_pages * PAGE_SIZE; + + if (__hyp_ack_skip_pgtable_check(tx)) + return 0; + + return __hyp_check_page_state_range(addr, size, PKVM_NOPAGE); +} + static int hyp_complete_share(u64 addr, const struct pkvm_mem_transition *tx, enum kvm_pgtable_prot perms) { @@ -572,6 +650,15 @@ static int hyp_complete_unshare(u64 addr, const struct pkvm_mem_transition *tx) return (ret != size) ? -EFAULT : 0; } +static int hyp_complete_donation(u64 addr, + const struct pkvm_mem_transition *tx) +{ + void *start = (void *)addr, *end = start + (tx->nr_pages * PAGE_SIZE); + enum kvm_pgtable_prot prot = pkvm_mkstate(PAGE_HYP, PKVM_PAGE_OWNED); + + return pkvm_create_mappings_locked(start, end, prot); +} + static int check_share(struct pkvm_mem_share *share) { const struct pkvm_mem_transition *tx = &share->tx; @@ -724,6 +811,94 @@ static int do_unshare(struct pkvm_mem_share *share) return WARN_ON(__do_unshare(share)); } +static int check_donation(struct pkvm_mem_donation *donation) +{ + const struct pkvm_mem_transition *tx = &donation->tx; + u64 completer_addr; + int ret; + + switch (tx->initiator.id) { + case PKVM_ID_HOST: + ret = host_request_owned_transition(&completer_addr, tx); + break; + case PKVM_ID_HYP: + ret = hyp_request_donation(&completer_addr, tx); + break; + default: + ret = -EINVAL; + } + + if (ret) + return ret; + + switch (tx->completer.id){ + case PKVM_ID_HOST: + ret = host_ack_donation(completer_addr, tx); + break; + case PKVM_ID_HYP: + ret = hyp_ack_donation(completer_addr, tx); + break; + default: + ret = -EINVAL; + } + + return ret; +} + +static int __do_donate(struct pkvm_mem_donation *donation) +{ + const struct pkvm_mem_transition *tx = &donation->tx; + u64 completer_addr; + int ret; + + switch (tx->initiator.id) { + case PKVM_ID_HOST: + ret = host_initiate_donation(&completer_addr, tx); + break; + case PKVM_ID_HYP: + ret = hyp_initiate_donation(&completer_addr, tx); + break; + default: + ret = -EINVAL; + } + + if (ret) + return ret; + + switch (tx->completer.id){ + case PKVM_ID_HOST: + ret = host_complete_donation(completer_addr, tx); + break; + case PKVM_ID_HYP: + ret = hyp_complete_donation(completer_addr, tx); + break; + default: + ret = -EINVAL; + } + + return ret; +} + +/* + * do_donate(): + * + * The page owner transfers ownership to another component, losing access + * as a consequence. 
+ * + * Initiator: OWNED => NOPAGE + * Completer: NOPAGE => OWNED + */ +static int do_donate(struct pkvm_mem_donation *donation) +{ + int ret; + + ret = check_donation(donation); + if (ret) + return ret; + + return WARN_ON(__do_donate(donation)); +} + int __pkvm_host_share_hyp(u64 pfn) { int ret; @@ -789,3 +964,67 @@ int __pkvm_host_unshare_hyp(u64 pfn) return ret; } + +int __pkvm_host_donate_hyp(u64 pfn, u64 nr_pages) +{ + int ret; + u64 host_addr = hyp_pfn_to_phys(pfn); + u64 hyp_addr = (u64)__hyp_va(host_addr); + struct pkvm_mem_donation donation = { + .tx = { + .nr_pages = nr_pages, + .initiator = { + .id = PKVM_ID_HOST, + .addr = host_addr, + .host = { + .completer_addr = hyp_addr, + }, + }, + .completer = { + .id = PKVM_ID_HYP, + }, + }, + }; + + host_lock_component(); + hyp_lock_component(); + + ret = do_donate(&donation); + + hyp_unlock_component(); + host_unlock_component(); + + return ret; +} + +int __pkvm_hyp_donate_host(u64 pfn, u64 nr_pages) +{ + int ret; + u64 host_addr = hyp_pfn_to_phys(pfn); + u64 hyp_addr = (u64)__hyp_va(host_addr); + struct pkvm_mem_donation donation = { + .tx = { + .nr_pages = nr_pages, + .initiator = { + .id = PKVM_ID_HYP, + .addr = hyp_addr, + .hyp = { + .completer_addr = host_addr, + }, + }, + .completer = { + .id = PKVM_ID_HOST, + }, + }, + }; + + host_lock_component(); + hyp_lock_component(); + + ret = do_donate(&donation); + + hyp_unlock_component(); + host_unlock_component(); + + return ret; +} From b0c1691474219032cc68fd91c6bd4f416867f97b Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Thu, 20 Oct 2022 14:38:09 +0100 Subject: [PATCH 016/457] FROMLIST: KVM: arm64: Prevent the donation of no-map pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Memory regions marked as "no-map" in the host device-tree routinely include TrustZone carve-outs and DMA pools. Although donating such pages to the hypervisor may not breach confidentiality, it could be used to corrupt its state in uncontrollable ways. To prevent this, let's block host-initiated memory transitions targeting "no-map" pages altogether in nVHE protected mode as there should be no valid reason to do this in current operation. Thankfully, the pKVM EL2 hypervisor has a full copy of the host's list of memblock regions, so we can easily check for the presence of the MEMBLOCK_NOMAP flag on a region containing pages being donated from the host.
Reviewed-by: Philippe Mathieu-Daudé Tested-by: Vincent Donnefort Signed-off-by: Quentin Perret Signed-off-by: Will Deacon Link: https://lore.kernel.org/r/20221020133827.5541-8-will@kernel.org Signed-off-by: Will Deacon Bug: 233587962 Change-Id: I21198f93a6d9c727b70c504ddd31345329eabb8f Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/nvhe/mem_protect.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 6fbd0e9f7d7f..d9f4174623e7 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -193,7 +193,7 @@ struct kvm_mem_range { u64 end; }; -static bool find_mem_range(phys_addr_t addr, struct kvm_mem_range *range) +static struct memblock_region *find_mem_range(phys_addr_t addr, struct kvm_mem_range *range) { int cur, left = 0, right = hyp_memblock_nr; struct memblock_region *reg; @@ -216,18 +216,28 @@ static bool find_mem_range(phys_addr_t addr, struct kvm_mem_range *range) } else { range->start = reg->base; range->end = end; - return true; + return reg; } } - return false; + return NULL; } bool addr_is_memory(phys_addr_t phys) { struct kvm_mem_range range; - return find_mem_range(phys, &range); + return !!find_mem_range(phys, &range); +} + +static bool addr_is_allowed_memory(phys_addr_t phys) +{ + struct memblock_region *reg; + struct kvm_mem_range range; + + reg = find_mem_range(phys, &range); + + return reg && !(reg->flags & MEMBLOCK_NOMAP); } static bool is_in_mem_range(u64 addr, struct kvm_mem_range *range) @@ -346,7 +356,7 @@ static bool host_stage2_force_pte_cb(u64 addr, u64 end, enum kvm_pgtable_prot pr static int host_stage2_idmap(u64 addr) { struct kvm_mem_range range; - bool is_memory = find_mem_range(addr, &range); + bool is_memory = !!find_mem_range(addr, &range); enum kvm_pgtable_prot prot; int ret; @@ -424,7 +434,7 @@ static int __check_page_state_visitor(u64 addr, u64 end, u32 level, struct check_walk_data *d = arg; kvm_pte_t pte = *ptep; - if (kvm_pte_valid(pte) && !addr_is_memory(kvm_pte_to_phys(pte))) + if (kvm_pte_valid(pte) && !addr_is_allowed_memory(kvm_pte_to_phys(pte))) return -EINVAL; return d->get_page_state(pte) == d->desired ? 0 : -EPERM; From b89b2f299b8dd9253d5d8d0a904a548cbd3946cf Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Thu, 20 Oct 2022 14:38:10 +0100 Subject: [PATCH 017/457] FROMLIST: KVM: arm64: Add helpers to pin memory shared with the hypervisor at EL2 Add helpers allowing the hypervisor to check whether a range of pages are currently shared by the host, and 'pin' them if so by blocking host unshare operations until the memory has been unpinned. This will allow the hypervisor to take references on host-provided data-structures (e.g. 'struct kvm') with the guarantee that these pages will remain in a stable state until the hypervisor decides to release them, for example during guest teardown. 
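As an example, a later patch in this series pins the host's 'struct kvm' for the lifetime of the corresponding hypervisor VM state; the expected pattern is roughly (illustrative sketch only, not part of this patch):

	/* Sketch: hold a reference on a host-provided object while EL2 uses it. */
	ret = hyp_pin_shared_mem(host_kvm, host_kvm + 1);
	if (ret)
		return ret;	/* range is not fully shared by the host */

	/* ... dereference host_kvm; the host cannot unshare it meanwhile ... */

	hyp_unpin_shared_mem(host_kvm, host_kvm + 1);

hyp_pin_shared_mem() only succeeds if the whole range is currently SHARED_OWNED by the host and SHARED_BORROWED by the hypervisor, and the elevated hyp_page refcount makes hyp_ack_unshare() return -EBUSY until the matching unpin.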
Tested-by: Vincent Donnefort Signed-off-by: Quentin Perret Signed-off-by: Will Deacon Link: https://lore.kernel.org/r/20221020133827.5541-9-will@kernel.org Signed-off-by: Will Deacon Bug: 233587962 Change-Id: I2918af9b4986b3d5d19ed1c6168b20b59b6a563e Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/include/nvhe/mem_protect.h | 3 ++ arch/arm64/kvm/hyp/include/nvhe/memory.h | 7 ++- arch/arm64/kvm/hyp/nvhe/mem_protect.c | 48 +++++++++++++++++++ 3 files changed, 57 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h index c87b19b2d468..998bf165af71 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h @@ -69,6 +69,9 @@ int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, u8 owner_id); int kvm_host_prepare_stage2(void *pgt_pool_base); void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt); +int hyp_pin_shared_mem(void *from, void *to); +void hyp_unpin_shared_mem(void *from, void *to); + static __always_inline void __load_host_stage2(void) { if (static_branch_likely(&kvm_protected_mode_initialized)) diff --git a/arch/arm64/kvm/hyp/include/nvhe/memory.h b/arch/arm64/kvm/hyp/include/nvhe/memory.h index 9422900e5c6a..ab205c4d6774 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/memory.h +++ b/arch/arm64/kvm/hyp/include/nvhe/memory.h @@ -55,10 +55,15 @@ static inline void hyp_page_ref_inc(struct hyp_page *p) p->refcount++; } -static inline int hyp_page_ref_dec_and_test(struct hyp_page *p) +static inline void hyp_page_ref_dec(struct hyp_page *p) { BUG_ON(!p->refcount); p->refcount--; +} + +static inline int hyp_page_ref_dec_and_test(struct hyp_page *p) +{ + hyp_page_ref_dec(p); return (p->refcount == 0); } diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index d9f4174623e7..1f2ffb10ef17 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -625,6 +625,9 @@ static int hyp_ack_unshare(u64 addr, const struct pkvm_mem_transition *tx) { u64 size = tx->nr_pages * PAGE_SIZE; + if (tx->initiator.id == PKVM_ID_HOST && hyp_page_count((void *)addr)) + return -EBUSY; + if (__hyp_ack_skip_pgtable_check(tx)) return 0; @@ -1038,3 +1041,48 @@ int __pkvm_hyp_donate_host(u64 pfn, u64 nr_pages) return ret; } + +int hyp_pin_shared_mem(void *from, void *to) +{ + u64 cur, start = ALIGN_DOWN((u64)from, PAGE_SIZE); + u64 end = PAGE_ALIGN((u64)to); + u64 size = end - start; + int ret; + + host_lock_component(); + hyp_lock_component(); + + ret = __host_check_page_state_range(__hyp_pa(start), size, + PKVM_PAGE_SHARED_OWNED); + if (ret) + goto unlock; + + ret = __hyp_check_page_state_range(start, size, + PKVM_PAGE_SHARED_BORROWED); + if (ret) + goto unlock; + + for (cur = start; cur < end; cur += PAGE_SIZE) + hyp_page_ref_inc(hyp_virt_to_page(cur)); + +unlock: + hyp_unlock_component(); + host_unlock_component(); + + return ret; +} + +void hyp_unpin_shared_mem(void *from, void *to) +{ + u64 cur, start = ALIGN_DOWN((u64)from, PAGE_SIZE); + u64 end = PAGE_ALIGN((u64)to); + + host_lock_component(); + hyp_lock_component(); + + for (cur = start; cur < end; cur += PAGE_SIZE) + hyp_page_ref_dec(hyp_virt_to_page(cur)); + + hyp_unlock_component(); + host_unlock_component(); +} From a7d9ff386da01ab81e6e064ccdc698dc01cb7106 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 20 Oct 2022 14:38:11 +0100 Subject: [PATCH 018/457] FROMLIST: KVM: arm64: Include asm/kvm_mmu.h in nvhe/mem_protect.h 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit nvhe/mem_protect.h refers to __load_stage2() in the definition of __load_host_stage2() but doesn't include the relevant header. Include asm/kvm_mmu.h in nvhe/mem_protect.h so that users of the latter don't have to do this themselves. Reviewed-by: Philippe Mathieu-Daudé Tested-by: Vincent Donnefort Signed-off-by: Will Deacon Link: https://lore.kernel.org/r/20221020133827.5541-10-will@kernel.org Signed-off-by: Will Deacon Bug: 233587962 Change-Id: I9f1eea7b57d6ca5cc5ac172ce5175272f166164a Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/include/nvhe/mem_protect.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h index 998bf165af71..3bea816296dc 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h @@ -8,6 +8,7 @@ #define __KVM_NVHE_MEM_PROTECT__ #include #include +#include #include #include #include From c458b03219f5cc6b93704deef65b2a6225068a2a Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Thu, 20 Oct 2022 14:38:12 +0100 Subject: [PATCH 019/457] FROMLIST: KVM: arm64: Add hyp_spinlock_t static initializer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce a static initializer macro for 'hyp_spinlock_t' so that it is straightforward to instantiate global locks at EL2. This will be later utilised for locking the VM table in the hypervisor. Reviewed-by: Philippe Mathieu-Daudé Tested-by: Vincent Donnefort Signed-off-by: Fuad Tabba Signed-off-by: Will Deacon Link: https://lore.kernel.org/r/20221020133827.5541-11-will@kernel.org Signed-off-by: Will Deacon Bug: 233587962 Change-Id: I5055011c6eb4376922e5b06871fa4bb3528f4e8f Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/include/nvhe/spinlock.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hyp/include/nvhe/spinlock.h b/arch/arm64/kvm/hyp/include/nvhe/spinlock.h index 4652fd04bdbe..7c7ea8c55405 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/spinlock.h +++ b/arch/arm64/kvm/hyp/include/nvhe/spinlock.h @@ -28,9 +28,17 @@ typedef union hyp_spinlock { }; } hyp_spinlock_t; +#define __HYP_SPIN_LOCK_INITIALIZER \ + { .__val = 0 } + +#define __HYP_SPIN_LOCK_UNLOCKED \ + ((hyp_spinlock_t) __HYP_SPIN_LOCK_INITIALIZER) + +#define DEFINE_HYP_SPINLOCK(x) hyp_spinlock_t x = __HYP_SPIN_LOCK_UNLOCKED + #define hyp_spin_lock_init(l) \ do { \ - *(l) = (hyp_spinlock_t){ .__val = 0 }; \ + *(l) = __HYP_SPIN_LOCK_UNLOCKED; \ } while (0) static inline void hyp_spin_lock(hyp_spinlock_t *lock) From 00578f682ac7edaedfb6526cd1e280347e63e28f Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 20 Oct 2022 14:38:13 +0100 Subject: [PATCH 020/457] FROMLIST: KVM: arm64: Rename 'host_kvm' to 'host_mmu' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In preparation for introducing VM and vCPU state at EL2, rename the existing 'struct host_kvm' and its singleton 'host_kvm' instance to 'host_mmu' so as to avoid confusion between the structure tracking the host stage-2 MMU state and the host instance of a 'struct kvm' for a protected guest. 
Reviewed-by: Philippe Mathieu-Daudé Tested-by: Vincent Donnefort Signed-off-by: Will Deacon Link: https://lore.kernel.org/r/20221020133827.5541-12-will@kernel.org Signed-off-by: Will Deacon Bug: 233587962 Change-Id: Iaf93fa2f37f1e87e56182b8494be808ce0dfcf95 Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/include/nvhe/mem_protect.h | 6 +-- arch/arm64/kvm/hyp/nvhe/mem_protect.c | 46 +++++++++---------- 2 files changed, 26 insertions(+), 26 deletions(-) diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h index 3bea816296dc..0a6d3e7f2a43 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h @@ -44,13 +44,13 @@ static inline enum pkvm_page_state pkvm_getstate(enum kvm_pgtable_prot prot) return prot & PKVM_PAGE_STATE_PROT_MASK; } -struct host_kvm { +struct host_mmu { struct kvm_arch arch; struct kvm_pgtable pgt; struct kvm_pgtable_mm_ops mm_ops; hyp_spinlock_t lock; }; -extern struct host_kvm host_kvm; +extern struct host_mmu host_mmu; /* This corresponds to page-table locking order */ enum pkvm_component_id { @@ -76,7 +76,7 @@ void hyp_unpin_shared_mem(void *from, void *to); static __always_inline void __load_host_stage2(void) { if (static_branch_likely(&kvm_protected_mode_initialized)) - __load_stage2(&host_kvm.arch.mmu, &host_kvm.arch); + __load_stage2(&host_mmu.arch.mmu, &host_mmu.arch); else write_sysreg(0, vttbr_el2); } diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 1f2ffb10ef17..2c612d11bf3d 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -22,18 +22,18 @@ #define KVM_HOST_S2_FLAGS (KVM_PGTABLE_S2_NOFWB | KVM_PGTABLE_S2_IDMAP) extern unsigned long hyp_nr_cpus; -struct host_kvm host_kvm; +struct host_mmu host_mmu; static struct hyp_pool host_s2_pool; static void host_lock_component(void) { - hyp_spin_lock(&host_kvm.lock); + hyp_spin_lock(&host_mmu.lock); } static void host_unlock_component(void) { - hyp_spin_unlock(&host_kvm.lock); + hyp_spin_unlock(&host_mmu.lock); } static void hyp_lock_component(void) @@ -88,7 +88,7 @@ static int prepare_s2_pool(void *pgt_pool_base) if (ret) return ret; - host_kvm.mm_ops = (struct kvm_pgtable_mm_ops) { + host_mmu.mm_ops = (struct kvm_pgtable_mm_ops) { .zalloc_pages_exact = host_s2_zalloc_pages_exact, .zalloc_page = host_s2_zalloc_page, .phys_to_virt = hyp_phys_to_virt, @@ -109,7 +109,7 @@ static void prepare_host_vtcr(void) parange = kvm_get_parange(id_aa64mmfr0_el1_sys_val); phys_shift = id_aa64mmfr0_parange_to_phys_shift(parange); - host_kvm.arch.vtcr = kvm_get_vtcr(id_aa64mmfr0_el1_sys_val, + host_mmu.arch.vtcr = kvm_get_vtcr(id_aa64mmfr0_el1_sys_val, id_aa64mmfr1_el1_sys_val, phys_shift); } @@ -117,25 +117,25 @@ static bool host_stage2_force_pte_cb(u64 addr, u64 end, enum kvm_pgtable_prot pr int kvm_host_prepare_stage2(void *pgt_pool_base) { - struct kvm_s2_mmu *mmu = &host_kvm.arch.mmu; + struct kvm_s2_mmu *mmu = &host_mmu.arch.mmu; int ret; prepare_host_vtcr(); - hyp_spin_lock_init(&host_kvm.lock); - mmu->arch = &host_kvm.arch; + hyp_spin_lock_init(&host_mmu.lock); + mmu->arch = &host_mmu.arch; ret = prepare_s2_pool(pgt_pool_base); if (ret) return ret; - ret = __kvm_pgtable_stage2_init(&host_kvm.pgt, mmu, - &host_kvm.mm_ops, KVM_HOST_S2_FLAGS, + ret = __kvm_pgtable_stage2_init(&host_mmu.pgt, mmu, + &host_mmu.mm_ops, KVM_HOST_S2_FLAGS, host_stage2_force_pte_cb); if (ret) return ret; - mmu->pgd_phys = __hyp_pa(host_kvm.pgt.pgd); - 
mmu->pgt = &host_kvm.pgt; + mmu->pgd_phys = __hyp_pa(host_mmu.pgt.pgd); + mmu->pgt = &host_mmu.pgt; atomic64_set(&mmu->vmid.id, 0); return 0; @@ -143,19 +143,19 @@ int kvm_host_prepare_stage2(void *pgt_pool_base) int __pkvm_prot_finalize(void) { - struct kvm_s2_mmu *mmu = &host_kvm.arch.mmu; + struct kvm_s2_mmu *mmu = &host_mmu.arch.mmu; struct kvm_nvhe_init_params *params = this_cpu_ptr(&kvm_init_params); if (params->hcr_el2 & HCR_VM) return -EPERM; params->vttbr = kvm_get_vttbr(mmu); - params->vtcr = host_kvm.arch.vtcr; + params->vtcr = host_mmu.arch.vtcr; params->hcr_el2 |= HCR_VM; kvm_flush_dcache_to_poc(params, sizeof(*params)); write_sysreg(params->hcr_el2, hcr_el2); - __load_stage2(&host_kvm.arch.mmu, &host_kvm.arch); + __load_stage2(&host_mmu.arch.mmu, &host_mmu.arch); /* * Make sure to have an ISB before the TLB maintenance below but only @@ -173,7 +173,7 @@ int __pkvm_prot_finalize(void) static int host_stage2_unmap_dev_all(void) { - struct kvm_pgtable *pgt = &host_kvm.pgt; + struct kvm_pgtable *pgt = &host_mmu.pgt; struct memblock_region *reg; u64 addr = 0; int i, ret; @@ -258,7 +258,7 @@ static bool range_is_memory(u64 start, u64 end) static inline int __host_stage2_idmap(u64 start, u64 end, enum kvm_pgtable_prot prot) { - return kvm_pgtable_stage2_map(&host_kvm.pgt, start, end - start, start, + return kvm_pgtable_stage2_map(&host_mmu.pgt, start, end - start, start, prot, &host_s2_pool); } @@ -271,7 +271,7 @@ static inline int __host_stage2_idmap(u64 start, u64 end, #define host_stage2_try(fn, ...) \ ({ \ int __ret; \ - hyp_assert_lock_held(&host_kvm.lock); \ + hyp_assert_lock_held(&host_mmu.lock); \ __ret = fn(__VA_ARGS__); \ if (__ret == -ENOMEM) { \ __ret = host_stage2_unmap_dev_all(); \ @@ -294,8 +294,8 @@ static int host_stage2_adjust_range(u64 addr, struct kvm_mem_range *range) u32 level; int ret; - hyp_assert_lock_held(&host_kvm.lock); - ret = kvm_pgtable_get_leaf(&host_kvm.pgt, addr, &pte, &level); + hyp_assert_lock_held(&host_mmu.lock); + ret = kvm_pgtable_get_leaf(&host_mmu.pgt, addr, &pte, &level); if (ret) return ret; @@ -327,7 +327,7 @@ int host_stage2_idmap_locked(phys_addr_t addr, u64 size, int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, u8 owner_id) { - return host_stage2_try(kvm_pgtable_stage2_set_owner, &host_kvm.pgt, + return host_stage2_try(kvm_pgtable_stage2_set_owner, &host_mmu.pgt, addr, size, &host_s2_pool, owner_id); } @@ -468,8 +468,8 @@ static int __host_check_page_state_range(u64 addr, u64 size, .get_page_state = host_get_page_state, }; - hyp_assert_lock_held(&host_kvm.lock); - return check_page_state_range(&host_kvm.pgt, addr, size, &d); + hyp_assert_lock_held(&host_mmu.lock); + return check_page_state_range(&host_mmu.pgt, addr, size, &d); } static int __host_set_page_state_range(u64 addr, u64 size, From ab3b93a33d0edd14cfaf857f3568af0acb02951b Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Thu, 20 Oct 2022 14:38:14 +0100 Subject: [PATCH 021/457] FROMLIST: KVM: arm64: Add infrastructure to create and track pKVM instances at EL2 Introduce a global table (and lock) to track pKVM instances at EL2, and provide hypercalls that can be used by the untrusted host to create and destroy pKVM VMs and their vCPUs. pKVM VM/vCPU state is directly accessible only by the trusted hypervisor (EL2). Each pKVM VM is directly associated with an untrusted host KVM instance, and is referenced by the host using an opaque handle. Future patches will provide hypercalls to allow the host to initialize/set/get pKVM VM/vCPU state using the opaque handle. 
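From the host's point of view the handle is simply threaded through the new hypercalls; the host-side plumbing added by the next patch reduces to something like this (simplified sketch, error handling trimmed):

	ret = kvm_call_hyp_nvhe(__pkvm_init_vm, host_kvm, hyp_vm, pgd);
	if (ret < 0)
		return ret;
	handle = ret;

	kvm_for_each_vcpu(idx, host_vcpu, host_kvm)
		ret = kvm_call_hyp_nvhe(__pkvm_init_vcpu, handle, host_vcpu, hyp_vcpu);

	/* ... and on teardown: */
	kvm_call_hyp_nvhe(__pkvm_teardown_vm, handle);

Internally the handle is just a VM table index offset by a constant (HANDLE_OFFSET), so lookups at EL2 are a bounds check plus an array access performed under 'vm_table_lock'.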
Tested-by: Vincent Donnefort Signed-off-by: Fuad Tabba Co-developed-by: Will Deacon Signed-off-by: Will Deacon Link: https://lore.kernel.org/r/20221020133827.5541-13-will@kernel.org Signed-off-by: Will Deacon Bug: 233587962 Change-Id: Ibe946e86f9a773b6024bf74faeccbb6929548a1f Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_asm.h | 3 + arch/arm64/include/asm/kvm_host.h | 8 + arch/arm64/include/asm/kvm_pgtable.h | 8 + arch/arm64/include/asm/kvm_pkvm.h | 8 + arch/arm64/kvm/hyp/include/nvhe/mem_protect.h | 3 + arch/arm64/kvm/hyp/include/nvhe/pkvm.h | 58 +++ arch/arm64/kvm/hyp/nvhe/hyp-main.c | 31 ++ arch/arm64/kvm/hyp/nvhe/mem_protect.c | 14 + arch/arm64/kvm/hyp/nvhe/pkvm.c | 379 ++++++++++++++++++ arch/arm64/kvm/hyp/nvhe/setup.c | 8 + arch/arm64/kvm/hyp/pgtable.c | 9 + arch/arm64/kvm/pkvm.c | 1 + 12 files changed, 530 insertions(+) create mode 100644 arch/arm64/kvm/hyp/include/nvhe/pkvm.h diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 53035763e48e..de52ba775d48 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -76,6 +76,9 @@ enum __kvm_host_smccc_func { __KVM_HOST_SMCCC_FUNC___vgic_v3_save_aprs, __KVM_HOST_SMCCC_FUNC___vgic_v3_restore_aprs, __KVM_HOST_SMCCC_FUNC___pkvm_vcpu_init_traps, + __KVM_HOST_SMCCC_FUNC___pkvm_init_vm, + __KVM_HOST_SMCCC_FUNC___pkvm_init_vcpu, + __KVM_HOST_SMCCC_FUNC___pkvm_teardown_vm, }; #define DECLARE_KVM_VHE_SYM(sym) extern char sym[] diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 45e2136322ba..d3dd7ab9c79e 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -115,6 +115,8 @@ struct kvm_smccc_features { unsigned long vendor_hyp_bmap; }; +typedef unsigned int pkvm_handle_t; + struct kvm_arch { struct kvm_s2_mmu mmu; @@ -166,6 +168,12 @@ struct kvm_arch { /* Hypercall features firmware registers' descriptor */ struct kvm_smccc_features smccc_feat; + + /* + * For an untrusted host VM, 'pkvm_handle' is used to lookup + * the associated pKVM instance in the hypervisor. + */ + pkvm_handle_t pkvm_handle; }; struct kvm_vcpu_fault_info { diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index 3252eb50ecfe..15c389db1931 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -296,6 +296,14 @@ u64 kvm_pgtable_hyp_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size); */ u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift); +/** + * kvm_pgtable_stage2_pgd_size() - Helper to compute size of a stage-2 PGD + * @vtcr: Content of the VTCR register. + * + * Return: the size (in bytes) of the stage-2 PGD + */ +size_t kvm_pgtable_stage2_pgd_size(u64 vtcr); + /** * __kvm_pgtable_stage2_init() - Initialise a guest stage-2 page-table. * @pgt: Uninitialised page-table structure to initialise. diff --git a/arch/arm64/include/asm/kvm_pkvm.h b/arch/arm64/include/asm/kvm_pkvm.h index 8f7b8a2314bb..f4e3133d6550 100644 --- a/arch/arm64/include/asm/kvm_pkvm.h +++ b/arch/arm64/include/asm/kvm_pkvm.h @@ -9,6 +9,9 @@ #include #include +/* Maximum number of VMs that can co-exist under pKVM. 
*/ +#define KVM_MAX_PVMS 255 + #define HYP_MEMBLOCK_REGIONS 128 extern struct memblock_region kvm_nvhe_sym(hyp_memory)[]; @@ -40,6 +43,11 @@ static inline unsigned long hyp_vmemmap_pages(size_t vmemmap_entry_size) return res >> PAGE_SHIFT; } +static inline unsigned long hyp_vm_table_pages(void) +{ + return PAGE_ALIGN(KVM_MAX_PVMS * sizeof(void *)) >> PAGE_SHIFT; +} + static inline unsigned long __hyp_pgtable_max_pages(unsigned long nr_pages) { unsigned long total = 0, i; diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h index 0a6d3e7f2a43..ce9a796a85ee 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h @@ -11,6 +11,7 @@ #include #include #include +#include #include /* @@ -68,10 +69,12 @@ bool addr_is_memory(phys_addr_t phys); int host_stage2_idmap_locked(phys_addr_t addr, u64 size, enum kvm_pgtable_prot prot); int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, u8 owner_id); int kvm_host_prepare_stage2(void *pgt_pool_base); +int kvm_guest_prepare_stage2(struct pkvm_hyp_vm *vm, void *pgd); void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt); int hyp_pin_shared_mem(void *from, void *to); void hyp_unpin_shared_mem(void *from, void *to); +void reclaim_guest_pages(struct pkvm_hyp_vm *vm); static __always_inline void __load_host_stage2(void) { diff --git a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h new file mode 100644 index 000000000000..8c653a3b9501 --- /dev/null +++ b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2021 Google LLC + * Author: Fuad Tabba + */ + +#ifndef __ARM64_KVM_NVHE_PKVM_H__ +#define __ARM64_KVM_NVHE_PKVM_H__ + +#include + +/* + * Holds the relevant data for maintaining the vcpu state completely at hyp. + */ +struct pkvm_hyp_vcpu { + struct kvm_vcpu vcpu; + + /* Backpointer to the host's (untrusted) vCPU instance. */ + struct kvm_vcpu *host_vcpu; +}; + +/* + * Holds the relevant data for running a protected vm. + */ +struct pkvm_hyp_vm { + struct kvm kvm; + + /* Backpointer to the host's (untrusted) KVM instance. */ + struct kvm *host_kvm; + + /* The guest's stage-2 page-table managed by the hypervisor. */ + struct kvm_pgtable pgt; + + /* + * The number of vcpus initialized and ready to run. + * Modifying this is protected by 'vm_table_lock'. + */ + unsigned int nr_vcpus; + + /* Array of the hyp vCPU structures for this VM. 
*/ + struct pkvm_hyp_vcpu *vcpus[]; +}; + +static inline struct pkvm_hyp_vm * +pkvm_hyp_vcpu_to_hyp_vm(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + return container_of(hyp_vcpu->vcpu.kvm, struct pkvm_hyp_vm, kvm); +} + +void pkvm_hyp_vm_table_init(void *tbl); + +int __pkvm_init_vm(struct kvm *host_kvm, unsigned long vm_hva, + unsigned long pgd_hva); +int __pkvm_init_vcpu(pkvm_handle_t handle, struct kvm_vcpu *host_vcpu, + unsigned long vcpu_hva); +int __pkvm_teardown_vm(pkvm_handle_t handle); + +#endif /* __ARM64_KVM_NVHE_PKVM_H__ */ diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 3cea4b6ac23e..b5f3fcfe9135 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -15,6 +15,7 @@ #include #include +#include #include DEFINE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params); @@ -191,6 +192,33 @@ static void handle___pkvm_vcpu_init_traps(struct kvm_cpu_context *host_ctxt) __pkvm_vcpu_init_traps(kern_hyp_va(vcpu)); } +static void handle___pkvm_init_vm(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(struct kvm *, host_kvm, host_ctxt, 1); + DECLARE_REG(unsigned long, vm_hva, host_ctxt, 2); + DECLARE_REG(unsigned long, pgd_hva, host_ctxt, 3); + + host_kvm = kern_hyp_va(host_kvm); + cpu_reg(host_ctxt, 1) = __pkvm_init_vm(host_kvm, vm_hva, pgd_hva); +} + +static void handle___pkvm_init_vcpu(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1); + DECLARE_REG(struct kvm_vcpu *, host_vcpu, host_ctxt, 2); + DECLARE_REG(unsigned long, vcpu_hva, host_ctxt, 3); + + host_vcpu = kern_hyp_va(host_vcpu); + cpu_reg(host_ctxt, 1) = __pkvm_init_vcpu(handle, host_vcpu, vcpu_hva); +} + +static void handle___pkvm_teardown_vm(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1); + + cpu_reg(host_ctxt, 1) = __pkvm_teardown_vm(handle); +} + typedef void (*hcall_t)(struct kvm_cpu_context *); #define HANDLE_FUNC(x) [__KVM_HOST_SMCCC_FUNC_##x] = (hcall_t)handle_##x @@ -220,6 +248,9 @@ static const hcall_t host_hcall[] = { HANDLE_FUNC(__vgic_v3_save_aprs), HANDLE_FUNC(__vgic_v3_restore_aprs), HANDLE_FUNC(__pkvm_vcpu_init_traps), + HANDLE_FUNC(__pkvm_init_vm), + HANDLE_FUNC(__pkvm_init_vcpu), + HANDLE_FUNC(__pkvm_teardown_vm), }; static void handle_host_hcall(struct kvm_cpu_context *host_ctxt) diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 2c612d11bf3d..dbfa7a2e8914 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -141,6 +141,20 @@ int kvm_host_prepare_stage2(void *pgt_pool_base) return 0; } +int kvm_guest_prepare_stage2(struct pkvm_hyp_vm *vm, void *pgd) +{ + vm->pgt.pgd = pgd; + return 0; +} + +void reclaim_guest_pages(struct pkvm_hyp_vm *vm) +{ + unsigned long nr_pages; + + nr_pages = kvm_pgtable_stage2_pgd_size(vm->kvm.arch.vtcr) >> PAGE_SHIFT; + WARN_ON(__pkvm_hyp_donate_host(hyp_virt_to_pfn(vm->pgt.pgd), nr_pages)); +} + int __pkvm_prot_finalize(void) { struct kvm_s2_mmu *mmu = &host_mmu.arch.mmu; diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 85d3b7ae720f..dcc7baeb8906 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -7,6 +7,9 @@ #include #include #include +#include +#include +#include #include /* @@ -183,3 +186,379 @@ void __pkvm_vcpu_init_traps(struct kvm_vcpu *vcpu) pvm_init_traps_aa64mmfr0(vcpu); pvm_init_traps_aa64mmfr1(vcpu); } + +/* + * Start the VM table handle at the offset defined instead of at 
0. + * Mainly for sanity checking and debugging. + */ +#define HANDLE_OFFSET 0x1000 + +static unsigned int vm_handle_to_idx(pkvm_handle_t handle) +{ + return handle - HANDLE_OFFSET; +} + +static pkvm_handle_t idx_to_vm_handle(unsigned int idx) +{ + return idx + HANDLE_OFFSET; +} + +/* + * Spinlock for protecting state related to the VM table. Protects writes + * to 'vm_table' and 'nr_table_entries' as well as reads and writes to + * 'last_hyp_vcpu_lookup'. + */ +static DEFINE_HYP_SPINLOCK(vm_table_lock); + +/* + * The table of VM entries for protected VMs in hyp. + * Allocated at hyp initialization and setup. + */ +static struct pkvm_hyp_vm **vm_table; + +void pkvm_hyp_vm_table_init(void *tbl) +{ + WARN_ON(vm_table); + vm_table = tbl; +} + +/* + * Return the hyp vm structure corresponding to the handle. + */ +static struct pkvm_hyp_vm *get_vm_by_handle(pkvm_handle_t handle) +{ + unsigned int idx = vm_handle_to_idx(handle); + + if (unlikely(idx >= KVM_MAX_PVMS)) + return NULL; + + return vm_table[idx]; +} + +static void unpin_host_vcpu(struct kvm_vcpu *host_vcpu) +{ + if (host_vcpu) + hyp_unpin_shared_mem(host_vcpu, host_vcpu + 1); +} + +static void unpin_host_vcpus(struct pkvm_hyp_vcpu *hyp_vcpus[], + unsigned int nr_vcpus) +{ + int i; + + for (i = 0; i < nr_vcpus; i++) + unpin_host_vcpu(hyp_vcpus[i]->host_vcpu); +} + +static void init_pkvm_hyp_vm(struct kvm *host_kvm, struct pkvm_hyp_vm *hyp_vm, + unsigned int nr_vcpus) +{ + hyp_vm->host_kvm = host_kvm; + hyp_vm->kvm.created_vcpus = nr_vcpus; + hyp_vm->kvm.arch.vtcr = host_mmu.arch.vtcr; +} + +static int init_pkvm_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu, + struct pkvm_hyp_vm *hyp_vm, + struct kvm_vcpu *host_vcpu, + unsigned int vcpu_idx) +{ + int ret = 0; + + if (hyp_pin_shared_mem(host_vcpu, host_vcpu + 1)) + return -EBUSY; + + if (host_vcpu->vcpu_idx != vcpu_idx) { + ret = -EINVAL; + goto done; + } + + hyp_vcpu->host_vcpu = host_vcpu; + + hyp_vcpu->vcpu.kvm = &hyp_vm->kvm; + hyp_vcpu->vcpu.vcpu_id = READ_ONCE(host_vcpu->vcpu_id); + hyp_vcpu->vcpu.vcpu_idx = vcpu_idx; + + hyp_vcpu->vcpu.arch.hw_mmu = &hyp_vm->kvm.arch.mmu; +done: + if (ret) + unpin_host_vcpu(host_vcpu); + return ret; +} + +static int find_free_vm_table_entry(struct kvm *host_kvm) +{ + int i; + + for (i = 0; i < KVM_MAX_PVMS; ++i) { + if (!vm_table[i]) + return i; + } + + return -ENOMEM; +} + +/* + * Allocate a VM table entry and insert a pointer to the new vm. + * + * Return a unique handle to the protected VM on success, + * negative error code on failure. + */ +static pkvm_handle_t insert_vm_table_entry(struct kvm *host_kvm, + struct pkvm_hyp_vm *hyp_vm) +{ + struct kvm_s2_mmu *mmu = &hyp_vm->kvm.arch.mmu; + int idx; + + hyp_assert_lock_held(&vm_table_lock); + + /* + * Initializing protected state might have failed, yet a malicious + * host could trigger this function. Thus, ensure that 'vm_table' + * exists. + */ + if (unlikely(!vm_table)) + return -EINVAL; + + idx = find_free_vm_table_entry(host_kvm); + if (idx < 0) + return idx; + + hyp_vm->kvm.arch.pkvm_handle = idx_to_vm_handle(idx); + + /* VMID 0 is reserved for the host */ + atomic64_set(&mmu->vmid.id, idx + 1); + + mmu->arch = &hyp_vm->kvm.arch; + mmu->pgt = &hyp_vm->pgt; + + vm_table[idx] = hyp_vm; + return hyp_vm->kvm.arch.pkvm_handle; +} + +/* + * Deallocate and remove the VM table entry corresponding to the handle. 
+ */ +static void remove_vm_table_entry(pkvm_handle_t handle) +{ + hyp_assert_lock_held(&vm_table_lock); + vm_table[vm_handle_to_idx(handle)] = NULL; +} + +static size_t pkvm_get_hyp_vm_size(unsigned int nr_vcpus) +{ + return size_add(sizeof(struct pkvm_hyp_vm), + size_mul(sizeof(struct pkvm_hyp_vcpu *), nr_vcpus)); +} + +static void *map_donated_memory_noclear(unsigned long host_va, size_t size) +{ + void *va = (void *)kern_hyp_va(host_va); + + if (!PAGE_ALIGNED(va)) + return NULL; + + if (__pkvm_host_donate_hyp(hyp_virt_to_pfn(va), + PAGE_ALIGN(size) >> PAGE_SHIFT)) + return NULL; + + return va; +} + +static void *map_donated_memory(unsigned long host_va, size_t size) +{ + void *va = map_donated_memory_noclear(host_va, size); + + if (va) + memset(va, 0, size); + + return va; +} + +static void __unmap_donated_memory(void *va, size_t size) +{ + WARN_ON(__pkvm_hyp_donate_host(hyp_virt_to_pfn(va), + PAGE_ALIGN(size) >> PAGE_SHIFT)); +} + +static void unmap_donated_memory(void *va, size_t size) +{ + if (!va) + return; + + memset(va, 0, size); + __unmap_donated_memory(va, size); +} + +static void unmap_donated_memory_noclear(void *va, size_t size) +{ + if (!va) + return; + + __unmap_donated_memory(va, size); +} + +/* + * Initialize the hypervisor copy of the protected VM state using the + * memory donated by the host. + * + * Unmaps the donated memory from the host at stage 2. + * + * kvm: A pointer to the host's struct kvm. + * vm_hva: The host va of the area being donated for the VM state. + * Must be page aligned. + * pgd_hva: The host va of the area being donated for the stage-2 PGD for + * the VM. Must be page aligned. Its size is implied by the VM's + * VTCR. + * + * Return a unique handle to the protected VM on success, + * negative error code on failure. + */ +int __pkvm_init_vm(struct kvm *host_kvm, unsigned long vm_hva, + unsigned long pgd_hva) +{ + struct pkvm_hyp_vm *hyp_vm = NULL; + size_t vm_size, pgd_size; + unsigned int nr_vcpus; + void *pgd = NULL; + int ret; + + ret = hyp_pin_shared_mem(host_kvm, host_kvm + 1); + if (ret) + return ret; + + nr_vcpus = READ_ONCE(host_kvm->created_vcpus); + if (nr_vcpus < 1) { + ret = -EINVAL; + goto err_unpin_kvm; + } + + vm_size = pkvm_get_hyp_vm_size(nr_vcpus); + pgd_size = kvm_pgtable_stage2_pgd_size(host_mmu.arch.vtcr); + + ret = -ENOMEM; + + hyp_vm = map_donated_memory(vm_hva, vm_size); + if (!hyp_vm) + goto err_remove_mappings; + + pgd = map_donated_memory_noclear(pgd_hva, pgd_size); + if (!pgd) + goto err_remove_mappings; + + init_pkvm_hyp_vm(host_kvm, hyp_vm, nr_vcpus); + + hyp_spin_lock(&vm_table_lock); + ret = insert_vm_table_entry(host_kvm, hyp_vm); + if (ret < 0) + goto err_unlock; + + ret = kvm_guest_prepare_stage2(hyp_vm, pgd); + if (ret) + goto err_remove_vm_table_entry; + hyp_spin_unlock(&vm_table_lock); + + return hyp_vm->kvm.arch.pkvm_handle; + +err_remove_vm_table_entry: + remove_vm_table_entry(hyp_vm->kvm.arch.pkvm_handle); +err_unlock: + hyp_spin_unlock(&vm_table_lock); +err_remove_mappings: + unmap_donated_memory(hyp_vm, vm_size); + unmap_donated_memory(pgd, pgd_size); +err_unpin_kvm: + hyp_unpin_shared_mem(host_kvm, host_kvm + 1); + return ret; +} + +/* + * Initialize the hypervisor copy of the protected vCPU state using the + * memory donated by the host. + * + * handle: The handle for the protected vm. + * host_vcpu: A pointer to the corresponding host vcpu. + * vcpu_hva: The host va of the area being donated for the vcpu state. + * Must be page aligned. 
The size of the area must be equal to + * the page-aligned size of 'struct pkvm_hyp_vcpu'. + * Return 0 on success, negative error code on failure. + */ +int __pkvm_init_vcpu(pkvm_handle_t handle, struct kvm_vcpu *host_vcpu, + unsigned long vcpu_hva) +{ + struct pkvm_hyp_vcpu *hyp_vcpu; + struct pkvm_hyp_vm *hyp_vm; + unsigned int idx; + int ret; + + hyp_vcpu = map_donated_memory(vcpu_hva, sizeof(*hyp_vcpu)); + if (!hyp_vcpu) + return -ENOMEM; + + hyp_spin_lock(&vm_table_lock); + + hyp_vm = get_vm_by_handle(handle); + if (!hyp_vm) { + ret = -ENOENT; + goto unlock; + } + + idx = hyp_vm->nr_vcpus; + if (idx >= hyp_vm->kvm.created_vcpus) { + ret = -EINVAL; + goto unlock; + } + + ret = init_pkvm_hyp_vcpu(hyp_vcpu, hyp_vm, host_vcpu, idx); + if (ret) + goto unlock; + + hyp_vm->vcpus[idx] = hyp_vcpu; + hyp_vm->nr_vcpus++; +unlock: + hyp_spin_unlock(&vm_table_lock); + + if (ret) + unmap_donated_memory(hyp_vcpu, sizeof(*hyp_vcpu)); + + return ret; +} + +int __pkvm_teardown_vm(pkvm_handle_t handle) +{ + struct pkvm_hyp_vm *hyp_vm; + size_t vm_size; + int err; + + hyp_spin_lock(&vm_table_lock); + hyp_vm = get_vm_by_handle(handle); + if (!hyp_vm) { + err = -ENOENT; + goto err_unlock; + } + + if (WARN_ON(hyp_page_count(hyp_vm))) { + err = -EBUSY; + goto err_unlock; + } + + /* Ensure the VMID is clean before it can be reallocated */ + __kvm_tlb_flush_vmid(&hyp_vm->kvm.arch.mmu); + remove_vm_table_entry(handle); + hyp_spin_unlock(&vm_table_lock); + + /* Reclaim guest pages (including page-table pages) */ + reclaim_guest_pages(hyp_vm); + unpin_host_vcpus(hyp_vm->vcpus, hyp_vm->nr_vcpus); + + /* Push the metadata pages to the teardown memcache */ + hyp_unpin_shared_mem(hyp_vm->host_kvm, hyp_vm->host_kvm + 1); + + vm_size = pkvm_get_hyp_vm_size(hyp_vm->kvm.created_vcpus); + unmap_donated_memory(hyp_vm, vm_size); + return 0; + +err_unlock: + hyp_spin_unlock(&vm_table_lock); + return err; +} diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c index 0312c9c74a5a..2be72fbe7279 100644 --- a/arch/arm64/kvm/hyp/nvhe/setup.c +++ b/arch/arm64/kvm/hyp/nvhe/setup.c @@ -16,6 +16,7 @@ #include #include #include +#include #include unsigned long hyp_nr_cpus; @@ -24,6 +25,7 @@ unsigned long hyp_nr_cpus; (unsigned long)__per_cpu_start) static void *vmemmap_base; +static void *vm_table_base; static void *hyp_pgt_base; static void *host_s2_pgt_base; static struct kvm_pgtable_mm_ops pkvm_pgtable_mm_ops; @@ -40,6 +42,11 @@ static int divide_memory_pool(void *virt, unsigned long size) if (!vmemmap_base) return -ENOMEM; + nr_pages = hyp_vm_table_pages(); + vm_table_base = hyp_early_alloc_contig(nr_pages); + if (!vm_table_base) + return -ENOMEM; + nr_pages = hyp_s1_pgtable_pages(); hyp_pgt_base = hyp_early_alloc_contig(nr_pages); if (!hyp_pgt_base) @@ -314,6 +321,7 @@ void __noreturn __pkvm_init_finalise(void) if (ret) goto out; + pkvm_hyp_vm_table_init(vm_table_base); out: /* * We tail-called to here from handle___pkvm_init() and will not return, diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index cdf8e76b0be1..a1a27f88a312 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -1200,6 +1200,15 @@ int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu, return 0; } +size_t kvm_pgtable_stage2_pgd_size(u64 vtcr) +{ + u32 ia_bits = VTCR_EL2_IPA(vtcr); + u32 sl0 = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr); + u32 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0; + + return kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE; +} + static int 
stage2_free_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, enum kvm_pgtable_walk_flags flag, void * const arg) diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c index 34229425b25d..71493136e59c 100644 --- a/arch/arm64/kvm/pkvm.c +++ b/arch/arm64/kvm/pkvm.c @@ -71,6 +71,7 @@ void __init kvm_hyp_reserve(void) hyp_mem_pages += hyp_s1_pgtable_pages(); hyp_mem_pages += host_s2_pgtable_pages(); + hyp_mem_pages += hyp_vm_table_pages(); hyp_mem_pages += hyp_vmemmap_pages(STRUCT_HYP_PAGE_SIZE); /* From 7c386abbe8b717eeeabd1d5a7713338e87dbac9c Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Thu, 20 Oct 2022 14:38:15 +0100 Subject: [PATCH 022/457] FROMLIST: KVM: arm64: Instantiate pKVM hypervisor VM and vCPU structures from EL1 With the pKVM hypervisor at EL2 now offering hypercalls to the host for creating and destroying VM and vCPU structures, plumb these in to the existing arm64 KVM backend to ensure that the hypervisor data structures are allocated and initialised on first vCPU run for a pKVM guest. In the host, 'struct kvm_protected_vm' is introduced to hold the handle of the pKVM VM instance as well as to track references to the memory donated to the hypervisor so that it can be freed back to the host allocator following VM teardown. The stage-2 page-table, hypervisor VM and vCPU structures are allocated separately so as to avoid the need for a large physically-contiguous allocation in the host at run-time. Tested-by: Vincent Donnefort Signed-off-by: Fuad Tabba Signed-off-by: Will Deacon Link: https://lore.kernel.org/r/20221020133827.5541-14-will@kernel.org Signed-off-by: Will Deacon Bug: 233587962 Change-Id: I2823f936b69f756d867bf584c645b29357c6abe9 Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_host.h | 14 ++- arch/arm64/include/asm/kvm_pkvm.h | 4 + arch/arm64/kvm/arm.c | 14 +++ arch/arm64/kvm/hyp/hyp-constants.c | 3 + arch/arm64/kvm/hyp/nvhe/pkvm.c | 15 +++- arch/arm64/kvm/pkvm.c | 138 +++++++++++++++++++++++++++++ 6 files changed, 182 insertions(+), 6 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index d3dd7ab9c79e..467393e7331f 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -117,6 +117,16 @@ struct kvm_smccc_features { typedef unsigned int pkvm_handle_t; +struct kvm_protected_vm { + pkvm_handle_t handle; + + struct { + void *pgd; + void *vm; + void *vcpus[KVM_MAX_VCPUS]; + } hyp_donations; +}; + struct kvm_arch { struct kvm_s2_mmu mmu; @@ -170,10 +180,10 @@ struct kvm_arch { struct kvm_smccc_features smccc_feat; /* - * For an untrusted host VM, 'pkvm_handle' is used to lookup + * For an untrusted host VM, 'pkvm.handle' is used to lookup * the associated pKVM instance in the hypervisor. 
*/ - pkvm_handle_t pkvm_handle; + struct kvm_protected_vm pkvm; }; struct kvm_vcpu_fault_info { diff --git a/arch/arm64/include/asm/kvm_pkvm.h b/arch/arm64/include/asm/kvm_pkvm.h index f4e3133d6550..01129b0d4c68 100644 --- a/arch/arm64/include/asm/kvm_pkvm.h +++ b/arch/arm64/include/asm/kvm_pkvm.h @@ -14,6 +14,10 @@ #define HYP_MEMBLOCK_REGIONS 128 +int pkvm_init_host_vm(struct kvm *kvm); +int pkvm_create_hyp_vm(struct kvm *kvm); +void pkvm_destroy_hyp_vm(struct kvm *kvm); + extern struct memblock_region kvm_nvhe_sym(hyp_memory)[]; extern unsigned int kvm_nvhe_sym(hyp_memblock_nr); diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 94d33e296e10..30d6fc5d3a93 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -150,6 +151,10 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) if (ret) goto out_free_stage2_pgd; + ret = pkvm_init_host_vm(kvm); + if (ret) + goto out_free_stage2_pgd; + if (!zalloc_cpumask_var(&kvm->arch.supported_cpus, GFP_KERNEL)) { ret = -ENOMEM; goto out_free_stage2_pgd; @@ -187,6 +192,9 @@ void kvm_arch_destroy_vm(struct kvm *kvm) kvm_vgic_destroy(kvm); + if (is_protected_kvm_enabled()) + pkvm_destroy_hyp_vm(kvm); + kvm_destroy_vcpus(kvm); kvm_unshare_hyp(kvm, kvm + 1); @@ -569,6 +577,12 @@ int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu) if (ret) return ret; + if (is_protected_kvm_enabled()) { + ret = pkvm_create_hyp_vm(kvm); + if (ret) + return ret; + } + if (!irqchip_in_kernel(kvm)) { /* * Tell the rest of the code that there are userspace irqchip diff --git a/arch/arm64/kvm/hyp/hyp-constants.c b/arch/arm64/kvm/hyp/hyp-constants.c index b3742a6691e8..b257a3b4bfc5 100644 --- a/arch/arm64/kvm/hyp/hyp-constants.c +++ b/arch/arm64/kvm/hyp/hyp-constants.c @@ -2,9 +2,12 @@ #include #include +#include int main(void) { DEFINE(STRUCT_HYP_PAGE_SIZE, sizeof(struct hyp_page)); + DEFINE(PKVM_HYP_VM_SIZE, sizeof(struct pkvm_hyp_vm)); + DEFINE(PKVM_HYP_VCPU_SIZE, sizeof(struct pkvm_hyp_vcpu)); return 0; } diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index dcc7baeb8906..bf0436f9f934 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -324,7 +324,7 @@ static pkvm_handle_t insert_vm_table_entry(struct kvm *host_kvm, if (idx < 0) return idx; - hyp_vm->kvm.arch.pkvm_handle = idx_to_vm_handle(idx); + hyp_vm->kvm.arch.pkvm.handle = idx_to_vm_handle(idx); /* VMID 0 is reserved for the host */ atomic64_set(&mmu->vmid.id, idx + 1); @@ -333,7 +333,7 @@ static pkvm_handle_t insert_vm_table_entry(struct kvm *host_kvm, mmu->pgt = &hyp_vm->pgt; vm_table[idx] = hyp_vm; - return hyp_vm->kvm.arch.pkvm_handle; + return hyp_vm->kvm.arch.pkvm.handle; } /* @@ -458,10 +458,10 @@ int __pkvm_init_vm(struct kvm *host_kvm, unsigned long vm_hva, goto err_remove_vm_table_entry; hyp_spin_unlock(&vm_table_lock); - return hyp_vm->kvm.arch.pkvm_handle; + return hyp_vm->kvm.arch.pkvm.handle; err_remove_vm_table_entry: - remove_vm_table_entry(hyp_vm->kvm.arch.pkvm_handle); + remove_vm_table_entry(hyp_vm->kvm.arch.pkvm.handle); err_unlock: hyp_spin_unlock(&vm_table_lock); err_remove_mappings: @@ -527,6 +527,7 @@ unlock: int __pkvm_teardown_vm(pkvm_handle_t handle) { struct pkvm_hyp_vm *hyp_vm; + unsigned int idx; size_t vm_size; int err; @@ -554,6 +555,12 @@ int __pkvm_teardown_vm(pkvm_handle_t handle) /* Push the metadata pages to the teardown memcache */ hyp_unpin_shared_mem(hyp_vm->host_kvm, hyp_vm->host_kvm + 1); + for (idx = 0; idx < 
hyp_vm->nr_vcpus; ++idx) { + struct pkvm_hyp_vcpu *hyp_vcpu = hyp_vm->vcpus[idx]; + + unmap_donated_memory(hyp_vcpu, sizeof(*hyp_vcpu)); + } + vm_size = pkvm_get_hyp_vm_size(hyp_vm->kvm.created_vcpus); unmap_donated_memory(hyp_vm, vm_size); return 0; diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c index 71493136e59c..8c443b915e43 100644 --- a/arch/arm64/kvm/pkvm.c +++ b/arch/arm64/kvm/pkvm.c @@ -6,6 +6,7 @@ #include #include +#include #include #include @@ -94,3 +95,140 @@ void __init kvm_hyp_reserve(void) kvm_info("Reserved %lld MiB at 0x%llx\n", hyp_mem_size >> 20, hyp_mem_base); } + +/* + * Allocates and donates memory for hypervisor VM structs at EL2. + * + * Allocates space for the VM state, which includes the hyp vm as well as + * the hyp vcpus. + * + * Stores an opaque handler in the kvm struct for future reference. + * + * Return 0 on success, negative error code on failure. + */ +static int __pkvm_create_hyp_vm(struct kvm *host_kvm) +{ + size_t pgd_sz, hyp_vm_sz, hyp_vcpu_sz; + struct kvm_vcpu *host_vcpu; + pkvm_handle_t handle; + void *pgd, *hyp_vm; + unsigned long idx; + int ret; + + if (host_kvm->created_vcpus < 1) + return -EINVAL; + + pgd_sz = kvm_pgtable_stage2_pgd_size(host_kvm->arch.vtcr); + + /* + * The PGD pages will be reclaimed using a hyp_memcache which implies + * page granularity. So, use alloc_pages_exact() to get individual + * refcounts. + */ + pgd = alloc_pages_exact(pgd_sz, GFP_KERNEL_ACCOUNT); + if (!pgd) + return -ENOMEM; + + /* Allocate memory to donate to hyp for vm and vcpu pointers. */ + hyp_vm_sz = PAGE_ALIGN(size_add(PKVM_HYP_VM_SIZE, + size_mul(sizeof(void *), + host_kvm->created_vcpus))); + hyp_vm = alloc_pages_exact(hyp_vm_sz, GFP_KERNEL_ACCOUNT); + if (!hyp_vm) { + ret = -ENOMEM; + goto free_pgd; + } + + /* Donate the VM memory to hyp and let hyp initialize it. */ + ret = kvm_call_hyp_nvhe(__pkvm_init_vm, host_kvm, hyp_vm, pgd); + if (ret < 0) + goto free_vm; + + handle = ret; + + host_kvm->arch.pkvm.handle = handle; + host_kvm->arch.pkvm.hyp_donations.pgd = pgd; + host_kvm->arch.pkvm.hyp_donations.vm = hyp_vm; + + /* Donate memory for the vcpus at hyp and initialize it. */ + hyp_vcpu_sz = PAGE_ALIGN(PKVM_HYP_VCPU_SIZE); + kvm_for_each_vcpu(idx, host_vcpu, host_kvm) { + void *hyp_vcpu; + + /* Indexing of the vcpus to be sequential starting at 0. 
*/ + if (WARN_ON(host_vcpu->vcpu_idx != idx)) { + ret = -EINVAL; + goto destroy_vm; + } + + hyp_vcpu = alloc_pages_exact(hyp_vcpu_sz, GFP_KERNEL_ACCOUNT); + if (!hyp_vcpu) { + ret = -ENOMEM; + goto destroy_vm; + } + + host_kvm->arch.pkvm.hyp_donations.vcpus[idx] = hyp_vcpu; + + ret = kvm_call_hyp_nvhe(__pkvm_init_vcpu, handle, host_vcpu, + hyp_vcpu); + if (ret) + goto destroy_vm; + } + + return 0; + +destroy_vm: + pkvm_destroy_hyp_vm(host_kvm); + return ret; +free_vm: + free_pages_exact(hyp_vm, hyp_vm_sz); +free_pgd: + free_pages_exact(pgd, pgd_sz); + return ret; +} + +int pkvm_create_hyp_vm(struct kvm *host_kvm) +{ + int ret = 0; + + mutex_lock(&host_kvm->lock); + if (!host_kvm->arch.pkvm.handle) + ret = __pkvm_create_hyp_vm(host_kvm); + mutex_unlock(&host_kvm->lock); + + return ret; +} + +void pkvm_destroy_hyp_vm(struct kvm *host_kvm) +{ + unsigned long idx, nr_vcpus = host_kvm->created_vcpus; + size_t pgd_sz, hyp_vm_sz; + + if (host_kvm->arch.pkvm.handle) + WARN_ON(kvm_call_hyp_nvhe(__pkvm_teardown_vm, + host_kvm->arch.pkvm.handle)); + + host_kvm->arch.pkvm.handle = 0; + + for (idx = 0; idx < nr_vcpus; ++idx) { + void *hyp_vcpu = host_kvm->arch.pkvm.hyp_donations.vcpus[idx]; + + if (!hyp_vcpu) + break; + + free_pages_exact(hyp_vcpu, PAGE_ALIGN(PKVM_HYP_VCPU_SIZE)); + } + + hyp_vm_sz = PAGE_ALIGN(size_add(PKVM_HYP_VM_SIZE, + size_mul(sizeof(void *), nr_vcpus))); + pgd_sz = kvm_pgtable_stage2_pgd_size(host_kvm->arch.vtcr); + + free_pages_exact(host_kvm->arch.pkvm.hyp_donations.vm, hyp_vm_sz); + free_pages_exact(host_kvm->arch.pkvm.hyp_donations.pgd, pgd_sz); +} + +int pkvm_init_host_vm(struct kvm *host_kvm) +{ + mutex_init(&host_kvm->lock); + return 0; +} From 1c3e6fdd03e9b7d646ac735e91c8e91350891787 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Thu, 20 Oct 2022 14:38:16 +0100 Subject: [PATCH 023/457] FROMLIST: KVM: arm64: Add per-cpu fixmap infrastructure at EL2 Mapping pages in a guest page-table from within the pKVM hypervisor at EL2 may require cache maintenance to ensure that the initialised page contents is visible even to non-cacheable (e.g. MMU-off) accesses from the guest. In preparation for performing this maintenance at EL2, introduce a per-vCPU fixmap which allows the pKVM hypervisor to map guest pages temporarily into its stage-1 page-table for the purposes of cache maintenance and, in future, poisoning on the reclaim path. The use of a fixmap avoids the need for memory allocation or locking on the map() path. 
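The usage discipline at EL2 is then simply the following (illustrative sketch; the first callers arrive later in the series):

	void *va = hyp_fixmap_map(phys);	/* per-CPU slot, no allocation or locking */

	memset(va, 0, PAGE_SIZE);		/* e.g. initialise or poison the page */
	/* ... plus whatever cache maintenance the caller needs while mapped ... */

	hyp_fixmap_unmap();			/* clear the slot and invalidate its TLB entry */

Because each CPU owns a dedicated slot, map() only rewrites a pre-allocated leaf PTE, and unmap() clears it again using the TLB invalidation sequence described in fixmap_clear_slot() below.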
Tested-by: Vincent Donnefort Signed-off-by: Quentin Perret Co-developed-by: Will Deacon Signed-off-by: Will Deacon Link: https://lore.kernel.org/r/20221020133827.5541-15-will@kernel.org Signed-off-by: Will Deacon Bug: 233587962 Change-Id: I8054dac551f4fd5c4afb5b497816589f1fae5285 Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_pgtable.h | 14 +++ arch/arm64/kvm/hyp/include/nvhe/mem_protect.h | 2 + arch/arm64/kvm/hyp/include/nvhe/mm.h | 4 + arch/arm64/kvm/hyp/nvhe/mem_protect.c | 1 - arch/arm64/kvm/hyp/nvhe/mm.c | 104 ++++++++++++++++++ arch/arm64/kvm/hyp/nvhe/setup.c | 4 + arch/arm64/kvm/hyp/pgtable.c | 12 -- 7 files changed, 128 insertions(+), 13 deletions(-) diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index 15c389db1931..34cb93f3c96d 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -42,6 +42,8 @@ typedef u64 kvm_pte_t; #define KVM_PTE_ADDR_MASK GENMASK(47, PAGE_SHIFT) #define KVM_PTE_ADDR_51_48 GENMASK(15, 12) +#define KVM_PHYS_INVALID (-1ULL) + static inline bool kvm_pte_valid(kvm_pte_t pte) { return pte & KVM_PTE_VALID; @@ -57,6 +59,18 @@ static inline u64 kvm_pte_to_phys(kvm_pte_t pte) return pa; } +static inline kvm_pte_t kvm_phys_to_pte(u64 pa) +{ + kvm_pte_t pte = pa & KVM_PTE_ADDR_MASK; + + if (PAGE_SHIFT == 16) { + pa &= GENMASK(51, 48); + pte |= FIELD_PREP(KVM_PTE_ADDR_51_48, pa >> 48); + } + + return pte; +} + static inline u64 kvm_granule_shift(u32 level) { /* Assumes KVM_PGTABLE_MAX_LEVELS is 4 */ diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h index ce9a796a85ee..ef31a1872c93 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h @@ -59,6 +59,8 @@ enum pkvm_component_id { PKVM_ID_HYP, }; +extern unsigned long hyp_nr_cpus; + int __pkvm_prot_finalize(void); int __pkvm_host_share_hyp(u64 pfn); int __pkvm_host_unshare_hyp(u64 pfn); diff --git a/arch/arm64/kvm/hyp/include/nvhe/mm.h b/arch/arm64/kvm/hyp/include/nvhe/mm.h index b2ee6d5df55b..d5ec972b5c1e 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mm.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mm.h @@ -13,6 +13,10 @@ extern struct kvm_pgtable pkvm_pgtable; extern hyp_spinlock_t pkvm_pgd_lock; +int hyp_create_pcpu_fixmap(void); +void *hyp_fixmap_map(phys_addr_t phys); +void hyp_fixmap_unmap(void); + int hyp_create_idmap(u32 hyp_va_bits); int hyp_map_vectors(void); int hyp_back_vmemmap(phys_addr_t back); diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index dbfa7a2e8914..f0555f200aaf 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -21,7 +21,6 @@ #define KVM_HOST_S2_FLAGS (KVM_PGTABLE_S2_NOFWB | KVM_PGTABLE_S2_IDMAP) -extern unsigned long hyp_nr_cpus; struct host_mmu host_mmu; static struct hyp_pool host_s2_pool; diff --git a/arch/arm64/kvm/hyp/nvhe/mm.c b/arch/arm64/kvm/hyp/nvhe/mm.c index d3a3b47181de..5648ac21e62d 100644 --- a/arch/arm64/kvm/hyp/nvhe/mm.c +++ b/arch/arm64/kvm/hyp/nvhe/mm.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -25,6 +26,12 @@ unsigned int hyp_memblock_nr; static u64 __io_map_base; +struct hyp_fixmap_slot { + u64 addr; + kvm_pte_t *ptep; +}; +static DEFINE_PER_CPU(struct hyp_fixmap_slot, fixmap_slots); + static int __pkvm_create_mappings(unsigned long start, unsigned long size, unsigned long phys, enum kvm_pgtable_prot prot) { @@ -212,6 +219,103 @@ int hyp_map_vectors(void) 
return 0; } +void *hyp_fixmap_map(phys_addr_t phys) +{ + struct hyp_fixmap_slot *slot = this_cpu_ptr(&fixmap_slots); + kvm_pte_t pte, *ptep = slot->ptep; + + pte = *ptep; + pte &= ~kvm_phys_to_pte(KVM_PHYS_INVALID); + pte |= kvm_phys_to_pte(phys) | KVM_PTE_VALID; + WRITE_ONCE(*ptep, pte); + dsb(ishst); + + return (void *)slot->addr; +} + +static void fixmap_clear_slot(struct hyp_fixmap_slot *slot) +{ + kvm_pte_t *ptep = slot->ptep; + u64 addr = slot->addr; + + WRITE_ONCE(*ptep, *ptep & ~KVM_PTE_VALID); + + /* + * Irritatingly, the architecture requires that we use inner-shareable + * broadcast TLB invalidation here in case another CPU speculates + * through our fixmap and decides to create an "amalagamation of the + * values held in the TLB" due to the apparent lack of a + * break-before-make sequence. + * + * https://lore.kernel.org/kvm/20221017115209.2099-1-will@kernel.org/T/#mf10dfbaf1eaef9274c581b81c53758918c1d0f03 + */ + dsb(ishst); + __tlbi_level(vale2is, __TLBI_VADDR(addr, 0), (KVM_PGTABLE_MAX_LEVELS - 1)); + dsb(ish); + isb(); +} + +void hyp_fixmap_unmap(void) +{ + fixmap_clear_slot(this_cpu_ptr(&fixmap_slots)); +} + +static int __create_fixmap_slot_cb(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, + enum kvm_pgtable_walk_flags flag, + void * const arg) +{ + struct hyp_fixmap_slot *slot = per_cpu_ptr(&fixmap_slots, (u64)arg); + + if (!kvm_pte_valid(*ptep) || level != KVM_PGTABLE_MAX_LEVELS - 1) + return -EINVAL; + + slot->addr = addr; + slot->ptep = ptep; + + /* + * Clear the PTE, but keep the page-table page refcount elevated to + * prevent it from ever being freed. This lets us manipulate the PTEs + * by hand safely without ever needing to allocate memory. + */ + fixmap_clear_slot(slot); + + return 0; +} + +static int create_fixmap_slot(u64 addr, u64 cpu) +{ + struct kvm_pgtable_walker walker = { + .cb = __create_fixmap_slot_cb, + .flags = KVM_PGTABLE_WALK_LEAF, + .arg = (void *)cpu, + }; + + return kvm_pgtable_walk(&pkvm_pgtable, addr, PAGE_SIZE, &walker); +} + +int hyp_create_pcpu_fixmap(void) +{ + unsigned long addr, i; + int ret; + + for (i = 0; i < hyp_nr_cpus; i++) { + ret = pkvm_alloc_private_va_range(PAGE_SIZE, &addr); + if (ret) + return ret; + + ret = kvm_pgtable_hyp_map(&pkvm_pgtable, addr, PAGE_SIZE, + __hyp_pa(__hyp_bss_start), PAGE_HYP); + if (ret) + return ret; + + ret = create_fixmap_slot(addr, i); + if (ret) + return ret; + } + + return 0; +} + int hyp_create_idmap(u32 hyp_va_bits) { unsigned long start, end; diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c index 2be72fbe7279..0f69c1393416 100644 --- a/arch/arm64/kvm/hyp/nvhe/setup.c +++ b/arch/arm64/kvm/hyp/nvhe/setup.c @@ -321,6 +321,10 @@ void __noreturn __pkvm_init_finalise(void) if (ret) goto out; + ret = hyp_create_pcpu_fixmap(); + if (ret) + goto out; + pkvm_hyp_vm_table_init(vm_table_base); out: /* diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index a1a27f88a312..2bcb2d5903ba 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -57,8 +57,6 @@ struct kvm_pgtable_walk_data { u64 end; }; -#define KVM_PHYS_INVALID (-1ULL) - static bool kvm_phys_is_valid(u64 phys) { return phys < BIT(id_aa64mmfr0_parange_to_phys_shift(ID_AA64MMFR0_EL1_PARANGE_MAX)); @@ -122,16 +120,6 @@ static bool kvm_pte_table(kvm_pte_t pte, u32 level) return FIELD_GET(KVM_PTE_TYPE, pte) == KVM_PTE_TYPE_TABLE; } -static kvm_pte_t kvm_phys_to_pte(u64 pa) -{ - kvm_pte_t pte = pa & KVM_PTE_ADDR_MASK; - - if (PAGE_SHIFT == 16) - pte |= 
FIELD_PREP(KVM_PTE_ADDR_51_48, pa >> 48); - - return pte; -} - static kvm_pte_t *kvm_pte_follow(kvm_pte_t pte, struct kvm_pgtable_mm_ops *mm_ops) { return mm_ops->phys_to_virt(kvm_pte_to_phys(pte)); From 1c4368d77b7236cdfc82d65a66cf346840f830ad Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 20 Oct 2022 14:38:17 +0100 Subject: [PATCH 024/457] FROMLIST: KVM: arm64: Initialise hypervisor copies of host symbols unconditionally MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The nVHE object at EL2 maintains its own copies of some host variables so that, when pKVM is enabled, the host cannot directly modify the hypervisor state. When running in normal nVHE mode, however, these variables are still mirrored at EL2 but are not initialised. Initialise the hypervisor symbols from the host copies regardless of pKVM, ensuring that any reference to this data at EL2 with normal nVHE will return a sensibly initialised value. Reviewed-by: Philippe Mathieu-Daudé Tested-by: Vincent Donnefort Signed-off-by: Will Deacon Link: https://lore.kernel.org/r/20221020133827.5541-16-will@kernel.org Signed-off-by: Will Deacon Bug: 233587962 Change-Id: I1e490d403e85ae5dbc984c1f0b8447c93cdf8700 Signed-off-by: Quentin Perret --- arch/arm64/kvm/arm.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 30d6fc5d3a93..584626e11797 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1884,11 +1884,8 @@ static int do_pkvm_init(u32 hyp_va_bits) return ret; } -static int kvm_hyp_init_protection(u32 hyp_va_bits) +static void kvm_hyp_init_symbols(void) { - void *addr = phys_to_virt(hyp_mem_base); - int ret; - kvm_nvhe_sym(id_aa64pfr0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1); kvm_nvhe_sym(id_aa64pfr1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1); kvm_nvhe_sym(id_aa64isar0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64ISAR0_EL1); @@ -1897,6 +1894,12 @@ static int kvm_hyp_init_protection(u32 hyp_va_bits) kvm_nvhe_sym(id_aa64mmfr0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); kvm_nvhe_sym(id_aa64mmfr1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); kvm_nvhe_sym(id_aa64mmfr2_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR2_EL1); +} + +static int kvm_hyp_init_protection(u32 hyp_va_bits) +{ + void *addr = phys_to_virt(hyp_mem_base); + int ret; ret = create_hyp_mappings(addr, addr + hyp_mem_size, PAGE_HYP); if (ret) @@ -2071,6 +2074,8 @@ static int init_hyp_mode(void) cpu_prepare_hyp_mode(cpu); } + kvm_hyp_init_symbols(); + if (is_protected_kvm_enabled()) { init_cpu_logical_map(); @@ -2078,9 +2083,7 @@ static int init_hyp_mode(void) err = -ENODEV; goto out_err; } - } - if (is_protected_kvm_enabled()) { err = kvm_hyp_init_protection(hyp_va_bits); if (err) { kvm_err("Failed to init hyp memory protection\n"); From 3f2e3bfcc75b3bf529a4f6172644af7d39541b4e Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 20 Oct 2022 14:38:18 +0100 Subject: [PATCH 025/457] FROMLIST: KVM: arm64: Provide I-cache invalidation by virtual address at EL2 In preparation for handling cache maintenance of guest pages from within the pKVM hypervisor at EL2, introduce an EL2 copy of icache_inval_pou() which will later be plumbed into the stage-2 page-table cache maintenance callbacks, ensuring that the initial contents of pages mapped as executable into the guest stage-2 page-table is visible to the instruction fetcher. 
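The intended use, once a later patch plumbs this into the stage-2 cache maintenance callbacks, is simply (sketch only, assuming 'va' is the hypervisor mapping of the page that was just written):

	/* Make freshly-written guest text visible to the instruction fetcher. */
	dcache_clean_inval_poc((unsigned long)va, (unsigned long)va + PAGE_SIZE);
	icache_inval_pou((unsigned long)va, (unsigned long)va + PAGE_SIZE);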
Tested-by: Vincent Donnefort Signed-off-by: Will Deacon Link: https://lore.kernel.org/r/20221020133827.5541-17-will@kernel.org Signed-off-by: Will Deacon Bug: 233587962 Change-Id: I0a1fb40d49bbc19eb53d59805acb93ebd7702b8b Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_hyp.h | 1 + arch/arm64/kernel/image-vars.h | 3 --- arch/arm64/kvm/arm.c | 1 + arch/arm64/kvm/hyp/nvhe/cache.S | 11 +++++++++++ arch/arm64/kvm/hyp/nvhe/pkvm.c | 3 +++ 5 files changed, 16 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index aa7fa2a08f06..fd99cf09972d 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -123,4 +123,5 @@ extern u64 kvm_nvhe_sym(id_aa64mmfr0_el1_sys_val); extern u64 kvm_nvhe_sym(id_aa64mmfr1_el1_sys_val); extern u64 kvm_nvhe_sym(id_aa64mmfr2_el1_sys_val); +extern unsigned long kvm_nvhe_sym(__icache_flags); #endif /* __ARM64_KVM_HYP_H__ */ diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index 8151412653de..7f4e43bfaade 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -71,9 +71,6 @@ KVM_NVHE_ALIAS(nvhe_hyp_panic_handler); /* Vectors installed by hyp-init on reset HVC. */ KVM_NVHE_ALIAS(__hyp_stub_vectors); -/* Kernel symbol used by icache_is_vpipt(). */ -KVM_NVHE_ALIAS(__icache_flags); - /* VMID bits set by the KVM VMID allocator */ KVM_NVHE_ALIAS(kvm_arm_vmid_bits); diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 584626e11797..d99e93e6ddf7 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1894,6 +1894,7 @@ static void kvm_hyp_init_symbols(void) kvm_nvhe_sym(id_aa64mmfr0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); kvm_nvhe_sym(id_aa64mmfr1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); kvm_nvhe_sym(id_aa64mmfr2_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR2_EL1); + kvm_nvhe_sym(__icache_flags) = __icache_flags; } static int kvm_hyp_init_protection(u32 hyp_va_bits) diff --git a/arch/arm64/kvm/hyp/nvhe/cache.S b/arch/arm64/kvm/hyp/nvhe/cache.S index 0c367eb5f4e2..85936c17ae40 100644 --- a/arch/arm64/kvm/hyp/nvhe/cache.S +++ b/arch/arm64/kvm/hyp/nvhe/cache.S @@ -12,3 +12,14 @@ SYM_FUNC_START(__pi_dcache_clean_inval_poc) ret SYM_FUNC_END(__pi_dcache_clean_inval_poc) SYM_FUNC_ALIAS(dcache_clean_inval_poc, __pi_dcache_clean_inval_poc) + +SYM_FUNC_START(__pi_icache_inval_pou) +alternative_if ARM64_HAS_CACHE_DIC + isb + ret +alternative_else_nop_endif + + invalidate_icache_by_line x0, x1, x2, x3 + ret +SYM_FUNC_END(__pi_icache_inval_pou) +SYM_FUNC_ALIAS(icache_inval_pou, __pi_icache_inval_pou) diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index bf0436f9f934..604505ed7727 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -12,6 +12,9 @@ #include #include +/* Used by icache_is_vpipt(). */ +unsigned long __icache_flags; + /* * Set trap register values based on features in ID_AA64PFR0. */ From f133e01b873cc33d4f4903dcc23471cc6aec2285 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Thu, 20 Oct 2022 14:38:19 +0100 Subject: [PATCH 026/457] FROMLIST: KVM: arm64: Add generic hyp_memcache helpers The host at EL1 and the pKVM hypervisor at EL2 will soon need to exchange memory pages dynamically for creating and destroying VM state. Indeed, the hypervisor will rely on the host to donate memory pages it can use to create guest stage-2 page-tables and to store VM and vCPU metadata. 
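In essence, the helper introduced for this purpose (see the next paragraph and
the hunks below) boils down to:

  struct kvm_hyp_memcache {
          phys_addr_t     head;           /* PA of the first page in the list */
          unsigned long   nr_pages;
  };

Each page in the cache stores the PA of the next one at its base, so the same
list can be walked from either exception level using that side's own
phys-to-virt conversion.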
In order to ease this process, introduce a 'struct hyp_memcache' which is essentially a linked list of available pages, indexed by physical addresses so that it can be passed meaningfully between the different virtual address spaces configured at EL1 and EL2. Tested-by: Vincent Donnefort Signed-off-by: Quentin Perret Signed-off-by: Will Deacon Link: https://lore.kernel.org/r/20221020133827.5541-18-will@kernel.org Signed-off-by: Will Deacon Bug: 233587962 Change-Id: I69996a4018e61473f25c5079f9ebcd82754914f4 Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_host.h | 57 +++++++++++++++++++ arch/arm64/kvm/hyp/include/nvhe/mem_protect.h | 2 + arch/arm64/kvm/hyp/nvhe/mm.c | 33 +++++++++++ arch/arm64/kvm/mmu.c | 26 +++++++++ 4 files changed, 118 insertions(+) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 467393e7331f..835987e0f868 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -73,6 +73,63 @@ u32 __attribute_const__ kvm_target_cpu(void); int kvm_reset_vcpu(struct kvm_vcpu *vcpu); void kvm_arm_vcpu_destroy(struct kvm_vcpu *vcpu); +struct kvm_hyp_memcache { + phys_addr_t head; + unsigned long nr_pages; +}; + +static inline void push_hyp_memcache(struct kvm_hyp_memcache *mc, + phys_addr_t *p, + phys_addr_t (*to_pa)(void *virt)) +{ + *p = mc->head; + mc->head = to_pa(p); + mc->nr_pages++; +} + +static inline void *pop_hyp_memcache(struct kvm_hyp_memcache *mc, + void *(*to_va)(phys_addr_t phys)) +{ + phys_addr_t *p = to_va(mc->head); + + if (!mc->nr_pages) + return NULL; + + mc->head = *p; + mc->nr_pages--; + + return p; +} + +static inline int __topup_hyp_memcache(struct kvm_hyp_memcache *mc, + unsigned long min_pages, + void *(*alloc_fn)(void *arg), + phys_addr_t (*to_pa)(void *virt), + void *arg) +{ + while (mc->nr_pages < min_pages) { + phys_addr_t *p = alloc_fn(arg); + + if (!p) + return -ENOMEM; + push_hyp_memcache(mc, p, to_pa); + } + + return 0; +} + +static inline void __free_hyp_memcache(struct kvm_hyp_memcache *mc, + void (*free_fn)(void *virt, void *arg), + void *(*to_va)(phys_addr_t phys), + void *arg) +{ + while (mc->nr_pages) + free_fn(pop_hyp_memcache(mc, to_va), arg); +} + +void free_hyp_memcache(struct kvm_hyp_memcache *mc); +int topup_hyp_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages); + struct kvm_vmid { atomic64_t id; }; diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h index ef31a1872c93..420b87e755a4 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h @@ -77,6 +77,8 @@ void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt); int hyp_pin_shared_mem(void *from, void *to); void hyp_unpin_shared_mem(void *from, void *to); void reclaim_guest_pages(struct pkvm_hyp_vm *vm); +int refill_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages, + struct kvm_hyp_memcache *host_mc); static __always_inline void __load_host_stage2(void) { diff --git a/arch/arm64/kvm/hyp/nvhe/mm.c b/arch/arm64/kvm/hyp/nvhe/mm.c index 5648ac21e62d..c80b2c007619 100644 --- a/arch/arm64/kvm/hyp/nvhe/mm.c +++ b/arch/arm64/kvm/hyp/nvhe/mm.c @@ -340,3 +340,36 @@ int hyp_create_idmap(u32 hyp_va_bits) return __pkvm_create_mappings(start, end - start, start, PAGE_HYP_EXEC); } + +static void *admit_host_page(void *arg) +{ + struct kvm_hyp_memcache *host_mc = arg; + + if (!host_mc->nr_pages) + return NULL; + + /* + * The host still owns the pages in its memcache, so we need to go + * 
through a full host-to-hyp donation cycle to change it. Fortunately, + * __pkvm_host_donate_hyp() takes care of races for us, so if it + * succeeds we're good to go. + */ + if (__pkvm_host_donate_hyp(hyp_phys_to_pfn(host_mc->head), 1)) + return NULL; + + return pop_hyp_memcache(host_mc, hyp_phys_to_virt); +} + +/* Refill our local memcache by poping pages from the one provided by the host. */ +int refill_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages, + struct kvm_hyp_memcache *host_mc) +{ + struct kvm_hyp_memcache tmp = *host_mc; + int ret; + + ret = __topup_hyp_memcache(mc, min_pages, admit_host_page, + hyp_virt_to_phys, &tmp); + *host_mc = tmp; + + return ret; +} diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 60ee3d9f01f8..18061163c607 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -807,6 +807,32 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu) } } +static void hyp_mc_free_fn(void *addr, void *unused) +{ + free_page((unsigned long)addr); +} + +static void *hyp_mc_alloc_fn(void *unused) +{ + return (void *)__get_free_page(GFP_KERNEL_ACCOUNT); +} + +void free_hyp_memcache(struct kvm_hyp_memcache *mc) +{ + if (is_protected_kvm_enabled()) + __free_hyp_memcache(mc, hyp_mc_free_fn, + kvm_host_va, NULL); +} + +int topup_hyp_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages) +{ + if (!is_protected_kvm_enabled()) + return 0; + + return __topup_hyp_memcache(mc, min_pages, hyp_mc_alloc_fn, + kvm_host_pa, NULL); +} + /** * kvm_phys_addr_ioremap - map a device range to guest IPA * From 0f23cd1e9d6b1e34a519bbdc1026e1a453eb55cd Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Thu, 20 Oct 2022 14:38:20 +0100 Subject: [PATCH 027/457] FROMLIST: KVM: arm64: Consolidate stage-2 initialisation into a single function The initialisation of guest stage-2 page-tables is currently split across two functions: kvm_init_stage2_mmu() and kvm_arm_setup_stage2(). That is presumably for historical reasons as kvm_arm_setup_stage2() originates from the (now defunct) KVM port for 32-bit Arm. Simplify this code path by merging both functions into one, taking care to map the 'struct kvm' into the hypervisor stage-1 early on in order to simplify the failure path. Tested-by: Vincent Donnefort Co-developed-by: Fuad Tabba Signed-off-by: Fuad Tabba Signed-off-by: Quentin Perret Signed-off-by: Will Deacon Link: https://lore.kernel.org/r/20221020133827.5541-19-will@kernel.org Signed-off-by: Will Deacon Bug: 233587962 Change-Id: I47433ed294fb5329b17b5e8b290e41b8e6d5b4a9 Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_arm.h | 2 +- arch/arm64/include/asm/kvm_host.h | 2 -- arch/arm64/include/asm/kvm_mmu.h | 2 +- arch/arm64/kvm/arm.c | 27 +++++++++++++-------------- arch/arm64/kvm/mmu.c | 27 ++++++++++++++++++++++++++- arch/arm64/kvm/reset.c | 29 ----------------------------- 6 files changed, 41 insertions(+), 48 deletions(-) diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index 8aa8492dafc0..89e63585dae4 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -135,7 +135,7 @@ * 40 bits wide (T0SZ = 24). Systems with a PARange smaller than 40 bits are * not known to exist and will break with this configuration. * - * The VTCR_EL2 is configured per VM and is initialised in kvm_arm_setup_stage2(). + * The VTCR_EL2 is configured per VM and is initialised in kvm_init_stage2_mmu. * * Note that when using 4K pages, we concatenate two first level page tables * together. 
With 16K pages, we concatenate 16 first level page tables. diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 835987e0f868..57218f0c449e 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -990,8 +990,6 @@ int kvm_set_ipa_limit(void); #define __KVM_HAVE_ARCH_VM_ALLOC struct kvm *kvm_arch_alloc_vm(void); -int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type); - static inline bool kvm_vm_is_protected(struct kvm *kvm) { return false; diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 7784081088e7..e4a7e6369499 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -166,7 +166,7 @@ int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size, void free_hyp_pgds(void); void stage2_unmap_vm(struct kvm *kvm); -int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu); +int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type); void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu); int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, phys_addr_t pa, unsigned long size, bool writable); diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index d99e93e6ddf7..f78eefa02f6b 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -139,28 +139,24 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { int ret; - ret = kvm_arm_setup_stage2(kvm, type); - if (ret) - return ret; - - ret = kvm_init_stage2_mmu(kvm, &kvm->arch.mmu); - if (ret) - return ret; - ret = kvm_share_hyp(kvm, kvm + 1); if (ret) - goto out_free_stage2_pgd; + return ret; ret = pkvm_init_host_vm(kvm); if (ret) - goto out_free_stage2_pgd; + goto err_unshare_kvm; if (!zalloc_cpumask_var(&kvm->arch.supported_cpus, GFP_KERNEL)) { ret = -ENOMEM; - goto out_free_stage2_pgd; + goto err_unshare_kvm; } cpumask_copy(kvm->arch.supported_cpus, cpu_possible_mask); + ret = kvm_init_stage2_mmu(kvm, &kvm->arch.mmu, type); + if (ret) + goto err_free_cpumask; + kvm_vgic_early_init(kvm); /* The maximum number of VCPUs is limited by the host's GIC model */ @@ -169,9 +165,12 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) set_default_spectre(kvm); kvm_arm_init_hypercalls(kvm); - return ret; -out_free_stage2_pgd: - kvm_free_stage2_pgd(&kvm->arch.mmu); + return 0; + +err_free_cpumask: + free_cpumask_var(kvm->arch.supported_cpus); +err_unshare_kvm: + kvm_unshare_hyp(kvm, kvm + 1); return ret; } diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 18061163c607..3e56c6393cae 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -675,15 +675,40 @@ static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = { * kvm_init_stage2_mmu - Initialise a S2 MMU structure * @kvm: The pointer to the KVM structure * @mmu: The pointer to the s2 MMU structure + * @type: The machine type of the virtual machine * * Allocates only the stage-2 HW PGD level table(s). * Note we don't need locking here as this is only called when the VM is * created, which can only be done once. 
*/ -int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu) +int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type) { + u32 kvm_ipa_limit = get_kvm_ipa_limit(); int cpu, err; struct kvm_pgtable *pgt; + u64 mmfr0, mmfr1; + u32 phys_shift; + + if (type & ~KVM_VM_TYPE_ARM_IPA_SIZE_MASK) + return -EINVAL; + + phys_shift = KVM_VM_TYPE_ARM_IPA_SIZE(type); + if (phys_shift) { + if (phys_shift > kvm_ipa_limit || + phys_shift < ARM64_MIN_PARANGE_BITS) + return -EINVAL; + } else { + phys_shift = KVM_PHYS_SHIFT; + if (phys_shift > kvm_ipa_limit) { + pr_warn_once("%s using unsupported default IPA limit, upgrade your VMM\n", + current->comm); + return -EINVAL; + } + } + + mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); + mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); + kvm->arch.vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift); if (mmu->pgt != NULL) { kvm_err("kvm_arch already initialized?\n"); diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index 5ae18472205a..e0267f672b8a 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -395,32 +395,3 @@ int kvm_set_ipa_limit(void) return 0; } - -int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type) -{ - u64 mmfr0, mmfr1; - u32 phys_shift; - - if (type & ~KVM_VM_TYPE_ARM_IPA_SIZE_MASK) - return -EINVAL; - - phys_shift = KVM_VM_TYPE_ARM_IPA_SIZE(type); - if (phys_shift) { - if (phys_shift > kvm_ipa_limit || - phys_shift < ARM64_MIN_PARANGE_BITS) - return -EINVAL; - } else { - phys_shift = KVM_PHYS_SHIFT; - if (phys_shift > kvm_ipa_limit) { - pr_warn_once("%s using unsupported default IPA limit, upgrade your VMM\n", - current->comm); - return -EINVAL; - } - } - - mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); - mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); - kvm->arch.vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift); - - return 0; -} From fac4b1b131b37beb4db98eb887156c180d85ae6e Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Thu, 20 Oct 2022 14:38:21 +0100 Subject: [PATCH 028/457] FROMLIST: KVM: arm64: Instantiate guest stage-2 page-tables at EL2 Extend the initialisation of guest data structures within the pKVM hypervisor at EL2 so that we instantiate a memory pool and a full 'struct kvm_s2_mmu' structure for each VM, with a stage-2 page-table entirely independent from the one managed by the host at EL1. The 'struct kvm_pgtable_mm_ops' used by the page-table code is populated with a set of callbacks that can manage guest pages in the hypervisor without any direct intervention from the host, allocating page-table pages from the provided pool and returning these to the host on VM teardown. To keep things simple, the stage-2 MMU for the guest is configured identically to the host stage-2 in the VTCR register and so the IPA size of the guest must match the PA size of the host. For now, the new page-table is unused as there is no way for the host to map anything into it. Yet. 
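Condensed from the hunks below, the EL2 side seeds a per-VM page allocator with
the pages backing the donated PGD and points the page-table code at EL2-local
callbacks, while the host side simply pins the IPA size when pKVM is enabled:

  /* EL2: kvm_guest_prepare_stage2(), simplified. */
  nr_pages = kvm_pgtable_stage2_pgd_size(vm->kvm.arch.vtcr) >> PAGE_SHIFT;
  ret = hyp_pool_init(&vm->pool, hyp_virt_to_pfn(pgd), nr_pages, 0);
  ...
  ret = __kvm_pgtable_stage2_init(mmu->pgt, mmu, &vm->mm_ops, 0,
                                  guest_stage2_force_pte_cb);

  /* EL1: kvm_init_stage2_mmu(), simplified. */
  if (is_protected_kvm_enabled())
          phys_shift = kvm_ipa_limit;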
Tested-by: Vincent Donnefort Signed-off-by: Quentin Perret Signed-off-by: Will Deacon Link: https://lore.kernel.org/r/20221020133827.5541-20-will@kernel.org Signed-off-by: Will Deacon Bug: 233587962 Change-Id: I2fc61f4f16d16b4786a50f4e2e4c5b06ed357b22 Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/include/nvhe/pkvm.h | 6 ++ arch/arm64/kvm/hyp/nvhe/mem_protect.c | 125 ++++++++++++++++++++++++- arch/arm64/kvm/mmu.c | 4 +- 3 files changed, 132 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h index 8c653a3b9501..d14dfbcb7da1 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h +++ b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h @@ -9,6 +9,9 @@ #include +#include +#include + /* * Holds the relevant data for maintaining the vcpu state completely at hyp. */ @@ -30,6 +33,9 @@ struct pkvm_hyp_vm { /* The guest's stage-2 page-table managed by the hypervisor. */ struct kvm_pgtable pgt; + struct kvm_pgtable_mm_ops mm_ops; + struct hyp_pool pool; + hyp_spinlock_t lock; /* * The number of vcpus initialized and ready to run. diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index f0555f200aaf..10a4af503d0d 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -25,6 +25,21 @@ struct host_mmu host_mmu; static struct hyp_pool host_s2_pool; +static DEFINE_PER_CPU(struct pkvm_hyp_vm *, __current_vm); +#define current_vm (*this_cpu_ptr(&__current_vm)) + +static void guest_lock_component(struct pkvm_hyp_vm *vm) +{ + hyp_spin_lock(&vm->lock); + current_vm = vm; +} + +static void guest_unlock_component(struct pkvm_hyp_vm *vm) +{ + current_vm = NULL; + hyp_spin_unlock(&vm->lock); +} + static void host_lock_component(void) { hyp_spin_lock(&host_mmu.lock); @@ -140,18 +155,124 @@ int kvm_host_prepare_stage2(void *pgt_pool_base) return 0; } +static bool guest_stage2_force_pte_cb(u64 addr, u64 end, + enum kvm_pgtable_prot prot) +{ + return true; +} + +static void *guest_s2_zalloc_pages_exact(size_t size) +{ + void *addr = hyp_alloc_pages(¤t_vm->pool, get_order(size)); + + WARN_ON(size != (PAGE_SIZE << get_order(size))); + hyp_split_page(hyp_virt_to_page(addr)); + + return addr; +} + +static void guest_s2_free_pages_exact(void *addr, unsigned long size) +{ + u8 order = get_order(size); + unsigned int i; + + for (i = 0; i < (1 << order); i++) + hyp_put_page(¤t_vm->pool, addr + (i * PAGE_SIZE)); +} + +static void *guest_s2_zalloc_page(void *mc) +{ + struct hyp_page *p; + void *addr; + + addr = hyp_alloc_pages(¤t_vm->pool, 0); + if (addr) + return addr; + + addr = pop_hyp_memcache(mc, hyp_phys_to_virt); + if (!addr) + return addr; + + memset(addr, 0, PAGE_SIZE); + p = hyp_virt_to_page(addr); + memset(p, 0, sizeof(*p)); + p->refcount = 1; + + return addr; +} + +static void guest_s2_get_page(void *addr) +{ + hyp_get_page(¤t_vm->pool, addr); +} + +static void guest_s2_put_page(void *addr) +{ + hyp_put_page(¤t_vm->pool, addr); +} + +static void clean_dcache_guest_page(void *va, size_t size) +{ + __clean_dcache_guest_page(hyp_fixmap_map(__hyp_pa(va)), size); + hyp_fixmap_unmap(); +} + +static void invalidate_icache_guest_page(void *va, size_t size) +{ + __invalidate_icache_guest_page(hyp_fixmap_map(__hyp_pa(va)), size); + hyp_fixmap_unmap(); +} + int kvm_guest_prepare_stage2(struct pkvm_hyp_vm *vm, void *pgd) { - vm->pgt.pgd = pgd; + struct kvm_s2_mmu *mmu = &vm->kvm.arch.mmu; + unsigned long nr_pages; + int ret; + + nr_pages = kvm_pgtable_stage2_pgd_size(vm->kvm.arch.vtcr) 
>> PAGE_SHIFT; + ret = hyp_pool_init(&vm->pool, hyp_virt_to_pfn(pgd), nr_pages, 0); + if (ret) + return ret; + + hyp_spin_lock_init(&vm->lock); + vm->mm_ops = (struct kvm_pgtable_mm_ops) { + .zalloc_pages_exact = guest_s2_zalloc_pages_exact, + .free_pages_exact = guest_s2_free_pages_exact, + .zalloc_page = guest_s2_zalloc_page, + .phys_to_virt = hyp_phys_to_virt, + .virt_to_phys = hyp_virt_to_phys, + .page_count = hyp_page_count, + .get_page = guest_s2_get_page, + .put_page = guest_s2_put_page, + .dcache_clean_inval_poc = clean_dcache_guest_page, + .icache_inval_pou = invalidate_icache_guest_page, + }; + + guest_lock_component(vm); + ret = __kvm_pgtable_stage2_init(mmu->pgt, mmu, &vm->mm_ops, 0, + guest_stage2_force_pte_cb); + guest_unlock_component(vm); + if (ret) + return ret; + + vm->kvm.arch.mmu.pgd_phys = __hyp_pa(vm->pgt.pgd); + return 0; } void reclaim_guest_pages(struct pkvm_hyp_vm *vm) { + void *pgd = vm->pgt.pgd; unsigned long nr_pages; nr_pages = kvm_pgtable_stage2_pgd_size(vm->kvm.arch.vtcr) >> PAGE_SHIFT; - WARN_ON(__pkvm_hyp_donate_host(hyp_virt_to_pfn(vm->pgt.pgd), nr_pages)); + + guest_lock_component(vm); + kvm_pgtable_stage2_destroy(&vm->pgt); + vm->kvm.arch.mmu.pgd_phys = 0ULL; + guest_unlock_component(vm); + + WARN_ON(__pkvm_hyp_donate_host(hyp_virt_to_pfn(pgd), nr_pages)); } int __pkvm_prot_finalize(void) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 3e56c6393cae..962f4472601b 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -693,7 +693,9 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t return -EINVAL; phys_shift = KVM_VM_TYPE_ARM_IPA_SIZE(type); - if (phys_shift) { + if (is_protected_kvm_enabled()) { + phys_shift = kvm_ipa_limit; + } else if (phys_shift) { if (phys_shift > kvm_ipa_limit || phys_shift < ARM64_MIN_PARANGE_BITS) return -EINVAL; From e5be78b7cb0a1d247f41c5fdcf158f880e8aaab9 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Thu, 20 Oct 2022 14:38:22 +0100 Subject: [PATCH 029/457] FROMLIST: BACKPORT: KVM: arm64: Return guest memory from EL2 via dedicated teardown memcache Rather than relying on the host to free the previously-donated pKVM hypervisor VM pages explicitly on teardown, introduce a dedicated teardown memcache which allows the host to reclaim guest memory resources without having to keep track of all of the allocations made by the pKVM hypervisor at EL2. 
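Condensed from the hunks below: once the guest stage-2 has been destroyed, EL2
drains every page left in the VM's hyp_pool into the new memcache, donating
each page back to the host as it goes, and the host later frees the whole cache
with a single call:

  /* EL2: reclaim_guest_pages(), after kvm_pgtable_stage2_destroy(). */
  addr = hyp_alloc_pages(&vm->pool, 0);
  while (addr) {
          memset(hyp_virt_to_page(addr), 0, sizeof(struct hyp_page));
          push_hyp_memcache(mc, addr, hyp_virt_to_phys);
          WARN_ON(__pkvm_hyp_donate_host(hyp_virt_to_pfn(addr), 1));
          addr = hyp_alloc_pages(&vm->pool, 0);
  }

  /* EL1: pkvm_destroy_hyp_vm(), once the teardown hypercall has returned. */
  free_hyp_memcache(&host_kvm->arch.pkvm.teardown_mc);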
Tested-by: Vincent Donnefort Co-developed-by: Fuad Tabba Signed-off-by: Fuad Tabba Signed-off-by: Quentin Perret Signed-off-by: Will Deacon Link: https://lore.kernel.org/r/20221020133827.5541-21-will@kernel.org [willdeacon@: Fix GCC compat error due to variable declaration in for loop initializer prior to C99] Signed-off-by: Will Deacon Bug: 233587962 Change-Id: Ib43b68f36fdaf5aac578f177ab8260c72acc6ed5 Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_host.h | 7 +---- arch/arm64/kvm/hyp/include/nvhe/mem_protect.h | 2 +- arch/arm64/kvm/hyp/nvhe/mem_protect.c | 17 ++++++---- arch/arm64/kvm/hyp/nvhe/pkvm.c | 22 +++++++++++-- arch/arm64/kvm/pkvm.c | 31 ++++--------------- 5 files changed, 38 insertions(+), 41 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 57218f0c449e..63307e7dc9c5 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -176,12 +176,7 @@ typedef unsigned int pkvm_handle_t; struct kvm_protected_vm { pkvm_handle_t handle; - - struct { - void *pgd; - void *vm; - void *vcpus[KVM_MAX_VCPUS]; - } hyp_donations; + struct kvm_hyp_memcache teardown_mc; }; struct kvm_arch { diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h index 420b87e755a4..b7bdbe63deed 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h @@ -76,7 +76,7 @@ void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt); int hyp_pin_shared_mem(void *from, void *to); void hyp_unpin_shared_mem(void *from, void *to); -void reclaim_guest_pages(struct pkvm_hyp_vm *vm); +void reclaim_guest_pages(struct pkvm_hyp_vm *vm, struct kvm_hyp_memcache *mc); int refill_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages, struct kvm_hyp_memcache *host_mc); diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 10a4af503d0d..f842d6e0dafd 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -260,19 +260,24 @@ int kvm_guest_prepare_stage2(struct pkvm_hyp_vm *vm, void *pgd) return 0; } -void reclaim_guest_pages(struct pkvm_hyp_vm *vm) +void reclaim_guest_pages(struct pkvm_hyp_vm *vm, struct kvm_hyp_memcache *mc) { - void *pgd = vm->pgt.pgd; - unsigned long nr_pages; - - nr_pages = kvm_pgtable_stage2_pgd_size(vm->kvm.arch.vtcr) >> PAGE_SHIFT; + void *addr; + /* Dump all pgtable pages in the hyp_pool */ guest_lock_component(vm); kvm_pgtable_stage2_destroy(&vm->pgt); vm->kvm.arch.mmu.pgd_phys = 0ULL; guest_unlock_component(vm); - WARN_ON(__pkvm_hyp_donate_host(hyp_virt_to_pfn(pgd), nr_pages)); + /* Drain the hyp_pool into the memcache */ + addr = hyp_alloc_pages(&vm->pool, 0); + while (addr) { + memset(hyp_virt_to_page(addr), 0, sizeof(struct hyp_page)); + push_hyp_memcache(mc, addr, hyp_virt_to_phys); + WARN_ON(__pkvm_hyp_donate_host(hyp_virt_to_pfn(addr), 1)); + addr = hyp_alloc_pages(&vm->pool, 0); + } } int __pkvm_prot_finalize(void) diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 604505ed7727..2ab262b342ee 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -527,8 +527,23 @@ unlock: return ret; } +static void +teardown_donated_memory(struct kvm_hyp_memcache *mc, void *addr, size_t size) +{ + void *start; + + size = PAGE_ALIGN(size); + memset(addr, 0, size); + + for (start = addr; start < addr + size; start += PAGE_SIZE) + push_hyp_memcache(mc, start, 
hyp_virt_to_phys); + + unmap_donated_memory_noclear(addr, size); +} + int __pkvm_teardown_vm(pkvm_handle_t handle) { + struct kvm_hyp_memcache *mc; struct pkvm_hyp_vm *hyp_vm; unsigned int idx; size_t vm_size; @@ -552,7 +567,8 @@ int __pkvm_teardown_vm(pkvm_handle_t handle) hyp_spin_unlock(&vm_table_lock); /* Reclaim guest pages (including page-table pages) */ - reclaim_guest_pages(hyp_vm); + mc = &hyp_vm->host_kvm->arch.pkvm.teardown_mc; + reclaim_guest_pages(hyp_vm, mc); unpin_host_vcpus(hyp_vm->vcpus, hyp_vm->nr_vcpus); /* Push the metadata pages to the teardown memcache */ @@ -561,11 +577,11 @@ int __pkvm_teardown_vm(pkvm_handle_t handle) for (idx = 0; idx < hyp_vm->nr_vcpus; ++idx) { struct pkvm_hyp_vcpu *hyp_vcpu = hyp_vm->vcpus[idx]; - unmap_donated_memory(hyp_vcpu, sizeof(*hyp_vcpu)); + teardown_donated_memory(mc, hyp_vcpu, sizeof(*hyp_vcpu)); } vm_size = pkvm_get_hyp_vm_size(hyp_vm->kvm.created_vcpus); - unmap_donated_memory(hyp_vm, vm_size); + teardown_donated_memory(mc, hyp_vm, vm_size); return 0; err_unlock: diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c index 8c443b915e43..cf56958b1492 100644 --- a/arch/arm64/kvm/pkvm.c +++ b/arch/arm64/kvm/pkvm.c @@ -147,8 +147,6 @@ static int __pkvm_create_hyp_vm(struct kvm *host_kvm) handle = ret; host_kvm->arch.pkvm.handle = handle; - host_kvm->arch.pkvm.hyp_donations.pgd = pgd; - host_kvm->arch.pkvm.hyp_donations.vm = hyp_vm; /* Donate memory for the vcpus at hyp and initialize it. */ hyp_vcpu_sz = PAGE_ALIGN(PKVM_HYP_VCPU_SIZE); @@ -167,12 +165,12 @@ static int __pkvm_create_hyp_vm(struct kvm *host_kvm) goto destroy_vm; } - host_kvm->arch.pkvm.hyp_donations.vcpus[idx] = hyp_vcpu; - ret = kvm_call_hyp_nvhe(__pkvm_init_vcpu, handle, host_vcpu, hyp_vcpu); - if (ret) + if (ret) { + free_pages_exact(hyp_vcpu, hyp_vcpu_sz); goto destroy_vm; + } } return 0; @@ -201,30 +199,13 @@ int pkvm_create_hyp_vm(struct kvm *host_kvm) void pkvm_destroy_hyp_vm(struct kvm *host_kvm) { - unsigned long idx, nr_vcpus = host_kvm->created_vcpus; - size_t pgd_sz, hyp_vm_sz; - - if (host_kvm->arch.pkvm.handle) + if (host_kvm->arch.pkvm.handle) { WARN_ON(kvm_call_hyp_nvhe(__pkvm_teardown_vm, host_kvm->arch.pkvm.handle)); - - host_kvm->arch.pkvm.handle = 0; - - for (idx = 0; idx < nr_vcpus; ++idx) { - void *hyp_vcpu = host_kvm->arch.pkvm.hyp_donations.vcpus[idx]; - - if (!hyp_vcpu) - break; - - free_pages_exact(hyp_vcpu, PAGE_ALIGN(PKVM_HYP_VCPU_SIZE)); } - hyp_vm_sz = PAGE_ALIGN(size_add(PKVM_HYP_VM_SIZE, - size_mul(sizeof(void *), nr_vcpus))); - pgd_sz = kvm_pgtable_stage2_pgd_size(host_kvm->arch.vtcr); - - free_pages_exact(host_kvm->arch.pkvm.hyp_donations.vm, hyp_vm_sz); - free_pages_exact(host_kvm->arch.pkvm.hyp_donations.pgd, pgd_sz); + host_kvm->arch.pkvm.handle = 0; + free_hyp_memcache(&host_kvm->arch.pkvm.teardown_mc); } int pkvm_init_host_vm(struct kvm *host_kvm) From e101352da908ba256e710342bc4d5786b9ee973f Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Thu, 20 Oct 2022 14:38:23 +0100 Subject: [PATCH 030/457] FROMLIST: KVM: arm64: Unmap 'kvm_arm_hyp_percpu_base' from the host When pKVM is enabled, the hypervisor at EL2 does not trust the host at EL1 and must therefore prevent it from having unrestricted access to internal hypervisor state. The 'kvm_arm_hyp_percpu_base' array holds the offsets for hypervisor per-cpu allocations, so move this this into the nVHE code where it cannot be modified by the untrusted host at EL1. 
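After this change the array is defined in the nVHE object and both sides refer
to it through the EL2 symbol, e.g. (from the hunks below):

  /* arch/arm64/kvm/hyp/nvhe/hyp-smp.c */
  unsigned long __ro_after_init kvm_arm_hyp_percpu_base[NR_CPUS];

  /* Host side, per_cpu_ptr_nvhe_sym() and friends: */
  base = kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu];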
Tested-by: Vincent Donnefort Signed-off-by: Quentin Perret Signed-off-by: Will Deacon Link: https://lore.kernel.org/r/20221020133827.5541-22-will@kernel.org Signed-off-by: Will Deacon Bug: 233587962 Change-Id: I8d67b2905ac97e15f4252d45c36b97e53d3072ce Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_asm.h | 4 ++-- arch/arm64/kernel/image-vars.h | 3 --- arch/arm64/kvm/arm.c | 9 ++++----- arch/arm64/kvm/hyp/nvhe/hyp-smp.c | 2 ++ 4 files changed, 8 insertions(+), 10 deletions(-) diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index de52ba775d48..43c3bc0f9544 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -109,7 +109,7 @@ enum __kvm_host_smccc_func { #define per_cpu_ptr_nvhe_sym(sym, cpu) \ ({ \ unsigned long base, off; \ - base = kvm_arm_hyp_percpu_base[cpu]; \ + base = kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu]; \ off = (unsigned long)&CHOOSE_NVHE_SYM(sym) - \ (unsigned long)&CHOOSE_NVHE_SYM(__per_cpu_start); \ base ? (typeof(CHOOSE_NVHE_SYM(sym))*)(base + off) : NULL; \ @@ -214,7 +214,7 @@ DECLARE_KVM_HYP_SYM(__kvm_hyp_vector); #define __kvm_hyp_init CHOOSE_NVHE_SYM(__kvm_hyp_init) #define __kvm_hyp_vector CHOOSE_HYP_SYM(__kvm_hyp_vector) -extern unsigned long kvm_arm_hyp_percpu_base[NR_CPUS]; +extern unsigned long kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[]; DECLARE_KVM_NVHE_SYM(__per_cpu_start); DECLARE_KVM_NVHE_SYM(__per_cpu_end); diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index 7f4e43bfaade..ae8f37f4aa8c 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -89,9 +89,6 @@ KVM_NVHE_ALIAS(gic_nonsecure_priorities); KVM_NVHE_ALIAS(__start___kvm_ex_table); KVM_NVHE_ALIAS(__stop___kvm_ex_table); -/* Array containing bases of nVHE per-CPU memory regions. 
*/ -KVM_NVHE_ALIAS(kvm_arm_hyp_percpu_base); - /* PMU available static key */ #ifdef CONFIG_HW_PERF_EVENTS KVM_NVHE_ALIAS(kvm_arm_pmu_available); diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index f78eefa02f6b..25467f24803d 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -51,7 +51,6 @@ DEFINE_STATIC_KEY_FALSE(kvm_protected_mode_initialized); DECLARE_KVM_HYP_PER_CPU(unsigned long, kvm_hyp_vector); DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page); -unsigned long kvm_arm_hyp_percpu_base[NR_CPUS]; DECLARE_KVM_NVHE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params); static bool vgic_present; @@ -1857,13 +1856,13 @@ static void teardown_hyp_mode(void) free_hyp_pgds(); for_each_possible_cpu(cpu) { free_page(per_cpu(kvm_arm_hyp_stack_page, cpu)); - free_pages(kvm_arm_hyp_percpu_base[cpu], nvhe_percpu_order()); + free_pages(kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu], nvhe_percpu_order()); } } static int do_pkvm_init(u32 hyp_va_bits) { - void *per_cpu_base = kvm_ksym_ref(kvm_arm_hyp_percpu_base); + void *per_cpu_base = kvm_ksym_ref(kvm_nvhe_sym(kvm_arm_hyp_percpu_base)); int ret; preempt_disable(); @@ -1967,7 +1966,7 @@ static int init_hyp_mode(void) page_addr = page_address(page); memcpy(page_addr, CHOOSE_NVHE_SYM(__per_cpu_start), nvhe_percpu_size()); - kvm_arm_hyp_percpu_base[cpu] = (unsigned long)page_addr; + kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu] = (unsigned long)page_addr; } /* @@ -2060,7 +2059,7 @@ static int init_hyp_mode(void) } for_each_possible_cpu(cpu) { - char *percpu_begin = (char *)kvm_arm_hyp_percpu_base[cpu]; + char *percpu_begin = (char *)kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu]; char *percpu_end = percpu_begin + nvhe_percpu_size(); /* Map Hyp percpu pages */ diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-smp.c b/arch/arm64/kvm/hyp/nvhe/hyp-smp.c index 9f54833af400..04d194583f1e 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-smp.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-smp.c @@ -23,6 +23,8 @@ u64 cpu_logical_map(unsigned int cpu) return hyp_cpu_logical_map[cpu]; } +unsigned long __ro_after_init kvm_arm_hyp_percpu_base[NR_CPUS]; + unsigned long __hyp_per_cpu_offset(unsigned int cpu) { unsigned long *cpu_base_array; From fa3678bc2b65c8aa4697d740d298b8c65b130a46 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 20 Oct 2022 14:38:24 +0100 Subject: [PATCH 031/457] FROMLIST: KVM: arm64: Maintain a copy of 'kvm_arm_vmid_bits' at EL2 Sharing 'kvm_arm_vmid_bits' between EL1 and EL2 allows the host to modify the variable arbitrarily, potentially leading to all sorts of shenanians as this is used to configure the VTTBR register for the guest stage-2. In preparation for unmapping host sections entirely from EL2, maintain a copy of 'kvm_arm_vmid_bits' in the pKVM hypervisor and initialise it from the host value while it is still trusted. 
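Concretely (from the hunks below), EL2 gains its own definition and the host
seeds it exactly once, while still trusted:

  /* arch/arm64/kvm/hyp/nvhe/pkvm.c: used by kvm_get_vttbr(). */
  unsigned int kvm_arm_vmid_bits;

  /* arch/arm64/kvm/arm.c, kvm_hyp_init_symbols(): */
  kvm_nvhe_sym(kvm_arm_vmid_bits) = kvm_arm_vmid_bits;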
Tested-by: Vincent Donnefort Signed-off-by: Will Deacon Link: https://lore.kernel.org/r/20221020133827.5541-23-will@kernel.org Signed-off-by: Will Deacon Bug: 233587962 Change-Id: I613e7c0ef747324e73cf4d4f543354a3641c7505 Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_hyp.h | 2 ++ arch/arm64/kernel/image-vars.h | 3 --- arch/arm64/kvm/arm.c | 1 + arch/arm64/kvm/hyp/nvhe/pkvm.c | 3 +++ 4 files changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index fd99cf09972d..6797eafe7890 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -124,4 +124,6 @@ extern u64 kvm_nvhe_sym(id_aa64mmfr1_el1_sys_val); extern u64 kvm_nvhe_sym(id_aa64mmfr2_el1_sys_val); extern unsigned long kvm_nvhe_sym(__icache_flags); +extern unsigned int kvm_nvhe_sym(kvm_arm_vmid_bits); + #endif /* __ARM64_KVM_HYP_H__ */ diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index ae8f37f4aa8c..31ad75da4d58 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -71,9 +71,6 @@ KVM_NVHE_ALIAS(nvhe_hyp_panic_handler); /* Vectors installed by hyp-init on reset HVC. */ KVM_NVHE_ALIAS(__hyp_stub_vectors); -/* VMID bits set by the KVM VMID allocator */ -KVM_NVHE_ALIAS(kvm_arm_vmid_bits); - /* Static keys which are set if a vGIC trap should be handled in hyp. */ KVM_NVHE_ALIAS(vgic_v2_cpuif_trap); KVM_NVHE_ALIAS(vgic_v3_cpuif_trap); diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 25467f24803d..1d4b8122d010 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1893,6 +1893,7 @@ static void kvm_hyp_init_symbols(void) kvm_nvhe_sym(id_aa64mmfr1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); kvm_nvhe_sym(id_aa64mmfr2_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR2_EL1); kvm_nvhe_sym(__icache_flags) = __icache_flags; + kvm_nvhe_sym(kvm_arm_vmid_bits) = kvm_arm_vmid_bits; } static int kvm_hyp_init_protection(u32 hyp_va_bits) diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 2ab262b342ee..4f97080f7532 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -15,6 +15,9 @@ /* Used by icache_is_vpipt(). */ unsigned long __icache_flags; +/* Used by kvm_get_vttbr(). */ +unsigned int kvm_arm_vmid_bits; + /* * Set trap register values based on features in ID_AA64PFR0. */ From 2daecbe40c967e4113e90b2b36e98111f59ebe1b Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Thu, 20 Oct 2022 14:38:25 +0100 Subject: [PATCH 032/457] FROMLIST: KVM: arm64: Explicitly map 'kvm_vgic_global_state' at EL2 The pkvm hypervisor at EL2 may need to read the 'kvm_vgic_global_state' variable from the host, for example when saving and restoring the state of the virtual GIC. Explicitly map 'kvm_vgic_global_state' in the stage-1 page-table of the pKVM hypervisor rather than relying on mapping all of the host '.rodata' section. 
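The mapping itself is a one-liner in recreate_hyp_mappings() (see the hunk
below), covering only the page(s) backing the variable rather than the whole of
'.rodata':

  ret = pkvm_create_mappings(&kvm_vgic_global_state,
                             &kvm_vgic_global_state + 1, prot);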
Tested-by: Vincent Donnefort Signed-off-by: Quentin Perret Signed-off-by: Will Deacon Link: https://lore.kernel.org/r/20221020133827.5541-24-will@kernel.org Signed-off-by: Will Deacon Bug: 233587962 Change-Id: I13a190d4164a4e0fd68dd5ec88ab5647dd4e73fc Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/nvhe/setup.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c index 0f69c1393416..5a371ab236db 100644 --- a/arch/arm64/kvm/hyp/nvhe/setup.c +++ b/arch/arm64/kvm/hyp/nvhe/setup.c @@ -161,6 +161,11 @@ static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size, if (ret) return ret; + ret = pkvm_create_mappings(&kvm_vgic_global_state, + &kvm_vgic_global_state + 1, prot); + if (ret) + return ret; + return 0; } From 75b3fd47d06e8f0ab3cd8b56e9363d9a569e85e0 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Thu, 20 Oct 2022 14:38:26 +0100 Subject: [PATCH 033/457] FROMLIST: KVM: arm64: Don't unnecessarily map host kernel sections at EL2 We no longer need to map the host's '.rodata' and '.bss' sections in the stage-1 page-table of the pKVM hypervisor at EL2, so remove those mappings and avoid creating any future dependencies at EL2 on host-controlled data structures. Tested-by: Vincent Donnefort Signed-off-by: Quentin Perret Signed-off-by: Will Deacon Link: https://lore.kernel.org/r/20221020133827.5541-25-will@kernel.org Signed-off-by: Will Deacon Bug: 233587962 Change-Id: Iddb819bc1ef4006b6ed7490476f28f0e880e1d8c Signed-off-by: Quentin Perret --- arch/arm64/kernel/image-vars.h | 6 ------ arch/arm64/kvm/hyp/nvhe/setup.c | 14 +++----------- 2 files changed, 3 insertions(+), 17 deletions(-) diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index 31ad75da4d58..e3f88b5836a2 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -102,12 +102,6 @@ KVM_NVHE_ALIAS_HYP(__memcpy, __pi_memcpy); KVM_NVHE_ALIAS_HYP(__memset, __pi_memset); #endif -/* Kernel memory sections */ -KVM_NVHE_ALIAS(__start_rodata); -KVM_NVHE_ALIAS(__end_rodata); -KVM_NVHE_ALIAS(__bss_start); -KVM_NVHE_ALIAS(__bss_stop); - /* Hyp memory sections */ KVM_NVHE_ALIAS(__hyp_idmap_text_start); KVM_NVHE_ALIAS(__hyp_idmap_text_end); diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c index 5a371ab236db..5cdf3fb09bb4 100644 --- a/arch/arm64/kvm/hyp/nvhe/setup.c +++ b/arch/arm64/kvm/hyp/nvhe/setup.c @@ -144,23 +144,15 @@ static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size, } /* - * Map the host's .bss and .rodata sections RO in the hypervisor, but - * transfer the ownership from the host to the hypervisor itself to - * make sure it can't be donated or shared with another entity. + * Map the host sections RO in the hypervisor, but transfer the + * ownership from the host to the hypervisor itself to make sure they + * can't be donated or shared with another entity. * * The ownership transition requires matching changes in the host * stage-2. This will be done later (see finalize_host_mappings()) once * the hyp_vmemmap is addressable. 
*/ prot = pkvm_mkstate(PAGE_HYP_RO, PKVM_PAGE_SHARED_OWNED); - ret = pkvm_create_mappings(__start_rodata, __end_rodata, prot); - if (ret) - return ret; - - ret = pkvm_create_mappings(__hyp_bss_end, __bss_stop, prot); - if (ret) - return ret; - ret = pkvm_create_mappings(&kvm_vgic_global_state, &kvm_vgic_global_state + 1, prot); if (ret) From 97cbc56f84058702a2205754abdbe22d2438fd2f Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 20 Oct 2022 14:38:27 +0100 Subject: [PATCH 034/457] FROMLIST: KVM: arm64: Use the pKVM hyp vCPU structure in handle___kvm_vcpu_run() As a stepping stone towards deprivileging the host's access to the guest's vCPU structures, introduce some naive flush/sync routines to copy most of the host vCPU into the hyp vCPU on vCPU run and back again on return to EL1. This allows us to run using the pKVM hyp structures when KVM is initialised in protected mode. Tested-by: Vincent Donnefort Co-developed-by: Fuad Tabba Signed-off-by: Fuad Tabba Signed-off-by: Will Deacon Link: https://lore.kernel.org/r/20221020133827.5541-26-will@kernel.org Signed-off-by: Will Deacon Bug: 233587962 Change-Id: Iaf1c07cbf58eaff8a2968e9dc6457d36dcef83cf Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/include/nvhe/pkvm.h | 4 ++ arch/arm64/kvm/hyp/nvhe/hyp-main.c | 79 +++++++++++++++++++++++++- arch/arm64/kvm/hyp/nvhe/pkvm.c | 28 +++++++++ 3 files changed, 109 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h index d14dfbcb7da1..82b3d62538a6 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h +++ b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h @@ -61,4 +61,8 @@ int __pkvm_init_vcpu(pkvm_handle_t handle, struct kvm_vcpu *host_vcpu, unsigned long vcpu_hva); int __pkvm_teardown_vm(pkvm_handle_t handle); +struct pkvm_hyp_vcpu *pkvm_load_hyp_vcpu(pkvm_handle_t handle, + unsigned int vcpu_idx); +void pkvm_put_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu); + #endif /* __ARM64_KVM_NVHE_PKVM_H__ */ diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index b5f3fcfe9135..728e01d4536b 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -22,11 +22,86 @@ DEFINE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params); void __kvm_hyp_host_forward_smc(struct kvm_cpu_context *host_ctxt); +static void flush_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; + + hyp_vcpu->vcpu.arch.ctxt = host_vcpu->arch.ctxt; + + hyp_vcpu->vcpu.arch.sve_state = kern_hyp_va(host_vcpu->arch.sve_state); + hyp_vcpu->vcpu.arch.sve_max_vl = host_vcpu->arch.sve_max_vl; + + hyp_vcpu->vcpu.arch.hw_mmu = host_vcpu->arch.hw_mmu; + + hyp_vcpu->vcpu.arch.hcr_el2 = host_vcpu->arch.hcr_el2; + hyp_vcpu->vcpu.arch.mdcr_el2 = host_vcpu->arch.mdcr_el2; + hyp_vcpu->vcpu.arch.cptr_el2 = host_vcpu->arch.cptr_el2; + + hyp_vcpu->vcpu.arch.iflags = host_vcpu->arch.iflags; + hyp_vcpu->vcpu.arch.fp_state = host_vcpu->arch.fp_state; + + hyp_vcpu->vcpu.arch.debug_ptr = kern_hyp_va(host_vcpu->arch.debug_ptr); + hyp_vcpu->vcpu.arch.host_fpsimd_state = host_vcpu->arch.host_fpsimd_state; + + hyp_vcpu->vcpu.arch.vsesr_el2 = host_vcpu->arch.vsesr_el2; + + hyp_vcpu->vcpu.arch.vgic_cpu.vgic_v3 = host_vcpu->arch.vgic_cpu.vgic_v3; +} + +static void sync_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; + struct vgic_v3_cpu_if *hyp_cpu_if = &hyp_vcpu->vcpu.arch.vgic_cpu.vgic_v3; + struct vgic_v3_cpu_if *host_cpu_if = 
&host_vcpu->arch.vgic_cpu.vgic_v3; + unsigned int i; + + host_vcpu->arch.ctxt = hyp_vcpu->vcpu.arch.ctxt; + + host_vcpu->arch.hcr_el2 = hyp_vcpu->vcpu.arch.hcr_el2; + host_vcpu->arch.cptr_el2 = hyp_vcpu->vcpu.arch.cptr_el2; + + host_vcpu->arch.fault = hyp_vcpu->vcpu.arch.fault; + + host_vcpu->arch.iflags = hyp_vcpu->vcpu.arch.iflags; + host_vcpu->arch.fp_state = hyp_vcpu->vcpu.arch.fp_state; + + host_cpu_if->vgic_hcr = hyp_cpu_if->vgic_hcr; + for (i = 0; i < hyp_cpu_if->used_lrs; ++i) + host_cpu_if->vgic_lr[i] = hyp_cpu_if->vgic_lr[i]; +} + static void handle___kvm_vcpu_run(struct kvm_cpu_context *host_ctxt) { - DECLARE_REG(struct kvm_vcpu *, vcpu, host_ctxt, 1); + DECLARE_REG(struct kvm_vcpu *, host_vcpu, host_ctxt, 1); + int ret; - cpu_reg(host_ctxt, 1) = __kvm_vcpu_run(kern_hyp_va(vcpu)); + host_vcpu = kern_hyp_va(host_vcpu); + + if (unlikely(is_protected_kvm_enabled())) { + struct pkvm_hyp_vcpu *hyp_vcpu; + struct kvm *host_kvm; + + host_kvm = kern_hyp_va(host_vcpu->kvm); + hyp_vcpu = pkvm_load_hyp_vcpu(host_kvm->arch.pkvm.handle, + host_vcpu->vcpu_idx); + if (!hyp_vcpu) { + ret = -EINVAL; + goto out; + } + + flush_hyp_vcpu(hyp_vcpu); + + ret = __kvm_vcpu_run(&hyp_vcpu->vcpu); + + sync_hyp_vcpu(hyp_vcpu); + pkvm_put_hyp_vcpu(hyp_vcpu); + } else { + /* The host is fully trusted, run its vCPU directly. */ + ret = __kvm_vcpu_run(host_vcpu); + } + +out: + cpu_reg(host_ctxt, 1) = ret; } static void handle___kvm_adjust_pc(struct kvm_cpu_context *host_ctxt) diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 4f97080f7532..1e9b507c4190 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -241,6 +241,33 @@ static struct pkvm_hyp_vm *get_vm_by_handle(pkvm_handle_t handle) return vm_table[idx]; } +struct pkvm_hyp_vcpu *pkvm_load_hyp_vcpu(pkvm_handle_t handle, + unsigned int vcpu_idx) +{ + struct pkvm_hyp_vcpu *hyp_vcpu = NULL; + struct pkvm_hyp_vm *hyp_vm; + + hyp_spin_lock(&vm_table_lock); + hyp_vm = get_vm_by_handle(handle); + if (!hyp_vm || hyp_vm->nr_vcpus <= vcpu_idx) + goto unlock; + + hyp_vcpu = hyp_vm->vcpus[vcpu_idx]; + hyp_page_ref_inc(hyp_virt_to_page(hyp_vm)); +unlock: + hyp_spin_unlock(&vm_table_lock); + return hyp_vcpu; +} + +void pkvm_put_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct pkvm_hyp_vm *hyp_vm = pkvm_hyp_vcpu_to_hyp_vm(hyp_vcpu); + + hyp_spin_lock(&vm_table_lock); + hyp_page_ref_dec(hyp_virt_to_page(hyp_vm)); + hyp_spin_unlock(&vm_table_lock); +} + static void unpin_host_vcpu(struct kvm_vcpu *host_vcpu) { if (host_vcpu) @@ -286,6 +313,7 @@ static int init_pkvm_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu, hyp_vcpu->vcpu.vcpu_idx = vcpu_idx; hyp_vcpu->vcpu.arch.hw_mmu = &hyp_vm->kvm.arch.mmu; + hyp_vcpu->vcpu.arch.cflags = READ_ONCE(host_vcpu->arch.cflags); done: if (ret) unpin_host_vcpu(host_vcpu); From 42e7f6a05438e52445afa893b5bd9e09759a98bf Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Thu, 27 May 2021 16:02:37 +0000 Subject: [PATCH 035/457] ANDROID: KVM: arm64: Repurpose a byte of 'order' for flags in 'struct hyp_page' In preparation for poisoning guest memory pages in the pKVM hypervisor when being reclaimed by the host, introduce a new 'flags' field in 'struct hyp_page' so that we will be able to track on a per-page basis whether or not poisoning is required. Rather than increase the total size of the structure, shrink the 16-bit 'order' field to a single byte and use the recovered space for the new field. 
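The resulting layout (from the hunk below) keeps the structure the same size:

  struct hyp_page {
          unsigned short  refcount;
          u8              order;  /* was unsigned short; HYP_NO_ORDER is now 0xff */
          u8              flags;  /* new: per-page state, e.g. poison-on-reclaim */
  };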
Signed-off-by: Quentin Perret Signed-off-by: Will Deacon Bug: 233587962 Change-Id: I8eb1f7ed8da0374b878a315eb1a6f867d0f379a9 Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/include/nvhe/gfp.h | 6 +++--- arch/arm64/kvm/hyp/include/nvhe/memory.h | 3 ++- arch/arm64/kvm/hyp/nvhe/page_alloc.c | 14 +++++++------- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/arch/arm64/kvm/hyp/include/nvhe/gfp.h b/arch/arm64/kvm/hyp/include/nvhe/gfp.h index 0a048dc06a7d..9330b13075f8 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/gfp.h +++ b/arch/arm64/kvm/hyp/include/nvhe/gfp.h @@ -7,7 +7,7 @@ #include #include -#define HYP_NO_ORDER USHRT_MAX +#define HYP_NO_ORDER 0xff struct hyp_pool { /* @@ -19,11 +19,11 @@ struct hyp_pool { struct list_head free_area[MAX_ORDER]; phys_addr_t range_start; phys_addr_t range_end; - unsigned short max_order; + u8 max_order; }; /* Allocation */ -void *hyp_alloc_pages(struct hyp_pool *pool, unsigned short order); +void *hyp_alloc_pages(struct hyp_pool *pool, u8 order); void hyp_split_page(struct hyp_page *page); void hyp_get_page(struct hyp_pool *pool, void *addr); void hyp_put_page(struct hyp_pool *pool, void *addr); diff --git a/arch/arm64/kvm/hyp/include/nvhe/memory.h b/arch/arm64/kvm/hyp/include/nvhe/memory.h index ab205c4d6774..51454ed29c8a 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/memory.h +++ b/arch/arm64/kvm/hyp/include/nvhe/memory.h @@ -9,7 +9,8 @@ struct hyp_page { unsigned short refcount; - unsigned short order; + u8 order; + u8 flags; }; extern u64 __hyp_vmemmap; diff --git a/arch/arm64/kvm/hyp/nvhe/page_alloc.c b/arch/arm64/kvm/hyp/nvhe/page_alloc.c index 7804da89e55d..01976a58d850 100644 --- a/arch/arm64/kvm/hyp/nvhe/page_alloc.c +++ b/arch/arm64/kvm/hyp/nvhe/page_alloc.c @@ -32,7 +32,7 @@ u64 __hyp_vmemmap; */ static struct hyp_page *__find_buddy_nocheck(struct hyp_pool *pool, struct hyp_page *p, - unsigned short order) + u8 order) { phys_addr_t addr = hyp_page_to_phys(p); @@ -51,7 +51,7 @@ static struct hyp_page *__find_buddy_nocheck(struct hyp_pool *pool, /* Find a buddy page currently available for allocation */ static struct hyp_page *__find_buddy_avail(struct hyp_pool *pool, struct hyp_page *p, - unsigned short order) + u8 order) { struct hyp_page *buddy = __find_buddy_nocheck(pool, p, order); @@ -94,8 +94,8 @@ static void __hyp_attach_page(struct hyp_pool *pool, struct hyp_page *p) { phys_addr_t phys = hyp_page_to_phys(p); - unsigned short order = p->order; struct hyp_page *buddy; + u8 order = p->order; memset(hyp_page_to_virt(p), 0, PAGE_SIZE << p->order); @@ -128,7 +128,7 @@ insert: static struct hyp_page *__hyp_extract_page(struct hyp_pool *pool, struct hyp_page *p, - unsigned short order) + u8 order) { struct hyp_page *buddy; @@ -182,7 +182,7 @@ void hyp_get_page(struct hyp_pool *pool, void *addr) void hyp_split_page(struct hyp_page *p) { - unsigned short order = p->order; + u8 order = p->order; unsigned int i; p->order = 0; @@ -194,10 +194,10 @@ void hyp_split_page(struct hyp_page *p) } } -void *hyp_alloc_pages(struct hyp_pool *pool, unsigned short order) +void *hyp_alloc_pages(struct hyp_pool *pool, u8 order) { - unsigned short i = order; struct hyp_page *p; + u8 i = order; hyp_spin_lock(&pool->lock); From 528846f42e1181acbf909c841216bae9a46b52ac Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sat, 11 Jun 2022 11:50:23 +0100 Subject: [PATCH 036/457] ANDROID: KVM: arm64: Add vcpu flag copy primitive Contrary to vanilla KVM, pKVM not only deals with flags in a vcpu, but also synchronises them across host and hypervisor views of the 
same vcpu. Most of the time, this is about copying flags from one vcpu structure to another, so let's offer a primitive that does this. Signed-off-by: Marc Zyngier Signed-off-by: Will Deacon Bug: 233587962 Change-Id: Icd67b617c1cd69706ccd99739756458864b422bb Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_host.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 63307e7dc9c5..c68f8e082867 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -544,9 +544,25 @@ struct kvm_vcpu_arch { *fset &= ~(m); \ } while (0) +#define __vcpu_copy_flag(vt, vs, flagset, f, m) \ + do { \ + typeof(vs->arch.flagset) tmp, val; \ + \ + __build_check_flag(vs, flagset, f, m); \ + \ + val = READ_ONCE(vs->arch.flagset); \ + val &= (m); \ + tmp = READ_ONCE(vt->arch.flagset); \ + tmp &= ~(m); \ + tmp |= val; \ + WRITE_ONCE(vt->arch.flagset, tmp); \ + } while (0) + + #define vcpu_get_flag(v, ...) __vcpu_get_flag((v), __VA_ARGS__) #define vcpu_set_flag(v, ...) __vcpu_set_flag((v), __VA_ARGS__) #define vcpu_clear_flag(v, ...) __vcpu_clear_flag((v), __VA_ARGS__) +#define vcpu_copy_flag(vt, vs,...) __vcpu_copy_flag((vt), (vs), __VA_ARGS__) /* SVE exposed to guest */ #define GUEST_HAS_SVE __vcpu_single_flag(cflags, BIT(0)) From d2dcc5fffa2664a3110c4fff0c7d557d940fa6c2 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sat, 11 Jun 2022 11:50:50 +0100 Subject: [PATCH 037/457] ANDROID: KVM: arm64: Add PC_UPDATE_REQ flags covering all PC updates In order to deal with PC updates (such as INCREMENT_PC and the collection of flags that come with PENDING_EXCEPTION), add a single mask that covers them all. This will be used to manipulate these flags as a single entity. Signed-off-by: Marc Zyngier Signed-off-by: Will Deacon Bug: 233587962 Change-Id: Id24f79f482911efe3374abbead8a70e46cf12725 Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_host.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index c68f8e082867..cfc25b45fd33 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -580,6 +580,8 @@ struct kvm_vcpu_arch { #define INCREMENT_PC __vcpu_single_flag(iflags, BIT(1)) /* Target EL/MODE (not a single flag, but let's abuse the macro) */ #define EXCEPT_MASK __vcpu_single_flag(iflags, GENMASK(3, 1)) +/* Cover both PENDING_EXCEPTION and EXCEPT_MASK for global operations */ +#define PC_UPDATE_REQ __vcpu_single_flag(iflags, GENMASK(3, 0)) /* Helpers to encode exceptions with minimum fuss */ #define __EXCEPT_MASK_VAL unpack_vcpu_flag(EXCEPT_MASK) From b0471af21c4a62f513050c4306f955e738fcfa22 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 6 Apr 2022 16:30:29 +0100 Subject: [PATCH 038/457] ANDROID: KVM: arm64: Provide a hypercall for the host to reclaim guest memory Implement a new hypercall, __pkvm_host_reclaim_page(), so that the host at EL1 can reclaim pages that were previously donated to EL2. This allows EL2 to defer clearing of guest memory on teardown and allows preemption in the host after reclaiming each page. 
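Condensed from the hunks below, the EL2 handler validates and scrubs a single
page per call, which is what allows the host to reschedule between pages:

  /* __pkvm_host_reclaim_page(pfn), simplified (host stage-2 lock held). */
  page = hyp_phys_to_page(addr);
  if (!(page->flags & HOST_PAGE_PENDING_RECLAIM))
          return -EPERM;

  if (page->flags & HOST_PAGE_NEED_POISONING) {
          hyp_zero_page(addr);            /* memset() + clean to the PoC */
          page->flags &= ~HOST_PAGE_NEED_POISONING;
  }

  host_stage2_set_owner_locked(addr, PAGE_SIZE, PKVM_ID_HOST);
  page->flags &= ~HOST_PAGE_PENDING_RECLAIM;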
Signed-off-by: Will Deacon Signed-off-by: Will Deacon Bug: 233587962 Change-Id: Ifbeafc5ed3e930307f9a9ae04d05ee06cb4451ac Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_asm.h | 1 + arch/arm64/kvm/hyp/include/nvhe/mem_protect.h | 1 + arch/arm64/kvm/hyp/include/nvhe/memory.h | 7 ++ arch/arm64/kvm/hyp/nvhe/hyp-main.c | 8 ++ arch/arm64/kvm/hyp/nvhe/mem_protect.c | 99 ++++++++++++++++++- 5 files changed, 115 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 43c3bc0f9544..2c1f6a571082 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -64,6 +64,7 @@ enum __kvm_host_smccc_func { /* Hypercalls available after pKVM finalisation */ __KVM_HOST_SMCCC_FUNC___pkvm_host_share_hyp, __KVM_HOST_SMCCC_FUNC___pkvm_host_unshare_hyp, + __KVM_HOST_SMCCC_FUNC___pkvm_host_reclaim_page, __KVM_HOST_SMCCC_FUNC___kvm_adjust_pc, __KVM_HOST_SMCCC_FUNC___kvm_vcpu_run, __KVM_HOST_SMCCC_FUNC___kvm_flush_vm_context, diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h index b7bdbe63deed..cc7aa2c1bba0 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h @@ -64,6 +64,7 @@ extern unsigned long hyp_nr_cpus; int __pkvm_prot_finalize(void); int __pkvm_host_share_hyp(u64 pfn); int __pkvm_host_unshare_hyp(u64 pfn); +int __pkvm_host_reclaim_page(u64 pfn); int __pkvm_host_donate_hyp(u64 pfn, u64 nr_pages); int __pkvm_hyp_donate_host(u64 pfn, u64 nr_pages); diff --git a/arch/arm64/kvm/hyp/include/nvhe/memory.h b/arch/arm64/kvm/hyp/include/nvhe/memory.h index 51454ed29c8a..a8d4a5b919d2 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/memory.h +++ b/arch/arm64/kvm/hyp/include/nvhe/memory.h @@ -7,6 +7,13 @@ #include +/* + * Accesses to struct hyp_page flags are serialized by the host stage-2 + * page-table lock. 
+ */ +#define HOST_PAGE_NEED_POISONING BIT(0) +#define HOST_PAGE_PENDING_RECLAIM BIT(1) + struct hyp_page { unsigned short refcount; u8 order; diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 728e01d4536b..a5bfe6dd948e 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -230,6 +230,13 @@ static void handle___pkvm_host_unshare_hyp(struct kvm_cpu_context *host_ctxt) cpu_reg(host_ctxt, 1) = __pkvm_host_unshare_hyp(pfn); } +static void handle___pkvm_host_reclaim_page(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(u64, pfn, host_ctxt, 1); + + cpu_reg(host_ctxt, 1) = __pkvm_host_reclaim_page(pfn); +} + static void handle___pkvm_create_private_mapping(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(phys_addr_t, phys, host_ctxt, 1); @@ -311,6 +318,7 @@ static const hcall_t host_hcall[] = { HANDLE_FUNC(__pkvm_host_share_hyp), HANDLE_FUNC(__pkvm_host_unshare_hyp), + HANDLE_FUNC(__pkvm_host_reclaim_page), HANDLE_FUNC(__kvm_adjust_pc), HANDLE_FUNC(__kvm_vcpu_run), HANDLE_FUNC(__kvm_flush_vm_context), diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index f842d6e0dafd..b18426727fec 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -260,15 +260,50 @@ int kvm_guest_prepare_stage2(struct pkvm_hyp_vm *vm, void *pgd) return 0; } +static int reclaim_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, + enum kvm_pgtable_walk_flags flag, void * const arg) +{ + kvm_pte_t pte = *ptep; + struct hyp_page *page; + + if (!kvm_pte_valid(pte)) + return 0; + + page = hyp_phys_to_page(kvm_pte_to_phys(pte)); + switch (pkvm_getstate(kvm_pgtable_stage2_pte_prot(pte))) { + case PKVM_PAGE_OWNED: + page->flags |= HOST_PAGE_NEED_POISONING; + fallthrough; + case PKVM_PAGE_SHARED_BORROWED: + case PKVM_PAGE_SHARED_OWNED: + page->flags |= HOST_PAGE_PENDING_RECLAIM; + break; + default: + return -EPERM; + } + + return 0; +} + void reclaim_guest_pages(struct pkvm_hyp_vm *vm, struct kvm_hyp_memcache *mc) { + + struct kvm_pgtable_walker walker = { + .cb = reclaim_walker, + .flags = KVM_PGTABLE_WALK_LEAF + }; void *addr; - /* Dump all pgtable pages in the hyp_pool */ + host_lock_component(); guest_lock_component(vm); + + /* Reclaim all guest pages and dump all pgtable pages in the hyp_pool */ + BUG_ON(kvm_pgtable_walk(&vm->pgt, 0, BIT(vm->pgt.ia_bits), &walker)); kvm_pgtable_stage2_destroy(&vm->pgt); vm->kvm.arch.mmu.pgd_phys = 0ULL; + guest_unlock_component(vm); + host_unlock_component(); /* Drain the hyp_pool into the memcache */ addr = hyp_alloc_pages(&vm->pool, 0); @@ -1225,3 +1260,65 @@ void hyp_unpin_shared_mem(void *from, void *to) hyp_unlock_component(); host_unlock_component(); } + +static int hyp_zero_page(phys_addr_t phys) +{ + void *addr; + + addr = hyp_fixmap_map(phys); + if (!addr) + return -EINVAL; + + memset(addr, 0, PAGE_SIZE); + + /* + * Prefer kvm_flush_dcache_to_poc() over __clean_dcache_guest_page() + * here as the latter may elide the CMO under the assumption that FWB + * will be enabled on CPUs that support it. This is incorrect for the + * host stage-2 and would otherwise lead to a malicious host potentially + * being able to read the contents of newly reclaimed guest pages. 
+ */ + kvm_flush_dcache_to_poc(addr, PAGE_SIZE); + hyp_fixmap_unmap(); + return 0; +} + +int __pkvm_host_reclaim_page(u64 pfn) +{ + u64 addr = hyp_pfn_to_phys(pfn); + struct hyp_page *page; + kvm_pte_t pte; + int ret; + + host_lock_component(); + + ret = kvm_pgtable_get_leaf(&host_mmu.pgt, addr, &pte, NULL); + if (ret) + goto unlock; + + if (host_get_page_state(pte) == PKVM_PAGE_OWNED) + goto unlock; + + page = hyp_phys_to_page(addr); + if (!(page->flags & HOST_PAGE_PENDING_RECLAIM)) { + ret = -EPERM; + goto unlock; + } + + if (page->flags & HOST_PAGE_NEED_POISONING) { + ret = hyp_zero_page(addr); + if (ret) + goto unlock; + page->flags &= ~HOST_PAGE_NEED_POISONING; + } + + ret = host_stage2_set_owner_locked(addr, PAGE_SIZE, PKVM_ID_HOST); + if (ret) + goto unlock; + page->flags &= ~HOST_PAGE_PENDING_RECLAIM; + +unlock: + host_unlock_component(); + + return ret; +} From 03a74c30ef209e2a9a1f57099095a99d1ac3a608 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 6 Apr 2022 16:52:57 +0100 Subject: [PATCH 039/457] ANDROID: KVM: arm64: Extend memory sharing to allow host-to-guest transitions In preparation for handling guest stage-2 mappings at EL2, extend our memory protection mechanisms to support sharing of pages from the host to a specific guest. Signed-off-by: Will Deacon Signed-off-by: Will Deacon Bug: 233587962 Change-Id: I8e1d7cf4db70ad55a29d935f60e6335fc83490eb Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_host.h | 8 +- arch/arm64/kvm/hyp/include/nvhe/mem_protect.h | 2 + arch/arm64/kvm/hyp/nvhe/mem_protect.c | 100 ++++++++++++++++++ 3 files changed, 108 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index cfc25b45fd33..1d4f14b4c2b5 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -469,8 +469,12 @@ struct kvm_vcpu_arch { /* vcpu power state */ struct kvm_mp_state mp_state; - /* Cache some mmu pages needed inside spinlock regions */ - struct kvm_mmu_memory_cache mmu_page_cache; + union { + /* Cache some mmu pages needed inside spinlock regions */ + struct kvm_mmu_memory_cache mmu_page_cache; + /* Pages to be donated to pkvm/EL2 if it runs out */ + struct kvm_hyp_memcache pkvm_memcache; + }; /* Target CPU and feature flags */ int target; diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h index cc7aa2c1bba0..a91c3ad6db0b 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h @@ -57,6 +57,7 @@ extern struct host_mmu host_mmu; enum pkvm_component_id { PKVM_ID_HOST, PKVM_ID_HYP, + PKVM_ID_GUEST, }; extern unsigned long hyp_nr_cpus; @@ -67,6 +68,7 @@ int __pkvm_host_unshare_hyp(u64 pfn); int __pkvm_host_reclaim_page(u64 pfn); int __pkvm_host_donate_hyp(u64 pfn, u64 nr_pages); int __pkvm_hyp_donate_host(u64 pfn, u64 nr_pages); +int __pkvm_host_share_guest(u64 pfn, u64 gfn, struct pkvm_hyp_vcpu *vcpu); bool addr_is_memory(phys_addr_t phys); int host_stage2_idmap_locked(phys_addr_t addr, u64 size, enum kvm_pgtable_prot prot); diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index b18426727fec..5736f700b451 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -578,11 +578,21 @@ struct pkvm_mem_transition { struct { u64 completer_addr; } hyp; + struct { + struct pkvm_hyp_vcpu *hyp_vcpu; + } guest; }; } initiator; struct { enum pkvm_component_id id; + + union { + struct { + struct 
pkvm_hyp_vcpu *hyp_vcpu; + phys_addr_t phys; + } guest; + }; } completer; }; @@ -846,6 +856,52 @@ static int hyp_complete_donation(u64 addr, return pkvm_create_mappings_locked(start, end, prot); } +static enum pkvm_page_state guest_get_page_state(kvm_pte_t pte) +{ + if (!kvm_pte_valid(pte)) + return PKVM_NOPAGE; + + return pkvm_getstate(kvm_pgtable_stage2_pte_prot(pte)); +} + +static int __guest_check_page_state_range(struct pkvm_hyp_vcpu *vcpu, u64 addr, + u64 size, enum pkvm_page_state state) +{ + struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); + struct check_walk_data d = { + .desired = state, + .get_page_state = guest_get_page_state, + }; + + hyp_assert_lock_held(&vm->lock); + return check_page_state_range(&vm->pgt, addr, size, &d); +} + +static int guest_ack_share(u64 addr, const struct pkvm_mem_transition *tx, + enum kvm_pgtable_prot perms) +{ + u64 size = tx->nr_pages * PAGE_SIZE; + + if (perms != KVM_PGTABLE_PROT_RWX) + return -EPERM; + + return __guest_check_page_state_range(tx->completer.guest.hyp_vcpu, + addr, size, PKVM_NOPAGE); +} + +static int guest_complete_share(u64 addr, const struct pkvm_mem_transition *tx, + enum kvm_pgtable_prot perms) +{ + struct pkvm_hyp_vcpu *vcpu = tx->completer.guest.hyp_vcpu; + struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); + u64 size = tx->nr_pages * PAGE_SIZE; + enum kvm_pgtable_prot prot; + + prot = pkvm_mkstate(perms, PKVM_PAGE_SHARED_BORROWED); + return kvm_pgtable_stage2_map(&vm->pgt, addr, size, tx->completer.guest.phys, + prot, &vcpu->vcpu.arch.pkvm_memcache); +} + static int check_share(struct pkvm_mem_share *share) { const struct pkvm_mem_transition *tx = &share->tx; @@ -867,6 +923,9 @@ static int check_share(struct pkvm_mem_share *share) case PKVM_ID_HYP: ret = hyp_ack_share(completer_addr, tx, share->completer_prot); break; + case PKVM_ID_GUEST: + ret = guest_ack_share(completer_addr, tx, share->completer_prot); + break; default: ret = -EINVAL; } @@ -895,6 +954,9 @@ static int __do_share(struct pkvm_mem_share *share) case PKVM_ID_HYP: ret = hyp_complete_share(completer_addr, tx, share->completer_prot); break; + case PKVM_ID_GUEST: + ret = guest_complete_share(completer_addr, tx, share->completer_prot); + break; default: ret = -EINVAL; } @@ -1261,6 +1323,44 @@ void hyp_unpin_shared_mem(void *from, void *to) host_unlock_component(); } +int __pkvm_host_share_guest(u64 pfn, u64 gfn, struct pkvm_hyp_vcpu *vcpu) +{ + int ret; + u64 host_addr = hyp_pfn_to_phys(pfn); + u64 guest_addr = hyp_pfn_to_phys(gfn); + struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); + struct pkvm_mem_share share = { + .tx = { + .nr_pages = 1, + .initiator = { + .id = PKVM_ID_HOST, + .addr = host_addr, + .host = { + .completer_addr = guest_addr, + }, + }, + .completer = { + .id = PKVM_ID_GUEST, + .guest = { + .hyp_vcpu = vcpu, + .phys = host_addr, + }, + }, + }, + .completer_prot = KVM_PGTABLE_PROT_RWX, + }; + + host_lock_component(); + guest_lock_component(vm); + + ret = do_share(&share); + + guest_unlock_component(vm); + host_unlock_component(); + + return ret; +} + static int hyp_zero_page(phys_addr_t phys) { void *addr; From cfa98233eb54bac373f53200e212ce2a5cd7f27a Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 21 Jun 2021 15:47:20 +0100 Subject: [PATCH 040/457] ANDROID: KVM: arm64: Check for PTE validity when checking for executable/cacheable Don't blindly assume that the PTE is valid when checking whether it describes an executable or cacheable mapping. This makes sure that we don't issue CMOs for invalid mappings. 
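For background (an illustrative sketch, not part of this patch): only the valid bit of a stage-2 descriptor says whether the rest of the entry is meaningful, so issuing a CMO for an invalid entry would follow a bogus output address. Roughly, assuming the usual definition of the helper in the pgtable headers:

	/*
	 * Sketch of the existing predicate: bit 0 is the valid bit, and the
	 * memory-attribute and XN bits tested by stage2_pte_cacheable() and
	 * stage2_pte_executable() carry no meaning when it is clear.
	 */
	static inline bool kvm_pte_valid(kvm_pte_t pte)
	{
		return pte & BIT(0);	/* KVM_PTE_VALID */
	}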
Suggested-by: Will Deacon Signed-off-by: Marc Zyngier Signed-off-by: Will Deacon Bug: 233587962 Change-Id: I6cbcdb97033ec7b2ed2c9dce0cfc91491e573908 Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/pgtable.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 2bcb2d5903ba..362a3fdf651f 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -688,12 +688,12 @@ static void stage2_put_pte(kvm_pte_t *ptep, struct kvm_s2_mmu *mmu, u64 addr, static bool stage2_pte_cacheable(struct kvm_pgtable *pgt, kvm_pte_t pte) { u64 memattr = pte & KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR; - return memattr == KVM_S2_MEMATTR(pgt, NORMAL); + return kvm_pte_valid(pte) && memattr == KVM_S2_MEMATTR(pgt, NORMAL); } static bool stage2_pte_executable(kvm_pte_t pte) { - return !(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN); + return kvm_pte_valid(pte) && !(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN); } static bool stage2_leaf_mapping_allowed(u64 addr, u64 end, u32 level, @@ -738,8 +738,7 @@ static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level, /* Perform CMOs before installation of the guest stage-2 PTE */ if (mm_ops->dcache_clean_inval_poc && stage2_pte_cacheable(pgt, new)) mm_ops->dcache_clean_inval_poc(kvm_pte_follow(new, mm_ops), - granule); - + granule); if (mm_ops->icache_inval_pou && stage2_pte_executable(new)) mm_ops->icache_inval_pou(kvm_pte_follow(new, mm_ops), granule); @@ -1136,7 +1135,7 @@ static int stage2_flush_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops; kvm_pte_t pte = *ptep; - if (!kvm_pte_valid(pte) || !stage2_pte_cacheable(pgt, pte)) + if (!stage2_pte_cacheable(pgt, pte)) return 0; if (mm_ops->dcache_clean_inval_poc) From f0b24085ffa7e7ddab53019f6f028777c8cf7e7c Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Thu, 7 Apr 2022 16:12:41 +0100 Subject: [PATCH 041/457] ANDROID: KVM: arm64: Do not allow memslot changes after first VM run under pKVM As the guest stage-2 page-tables will soon be managed entirely by EL2 when pKVM is enabled, guest memory will be pinned and the MMU notifiers in the host will be unable to reconfigure mappings at EL2 other than destroying the guest and reclaiming all of the memory. Forbid memslot move/delete operations for VMs that have run under pKVM, returning -EPERM to userspace if such an operation is requested. Signed-off-by: Fuad Tabba Signed-off-by: Will Deacon Bug: 233587962 Change-Id: I73650c1ac79d8c116a3f31d17ef2a4ef1b30a844 Signed-off-by: Quentin Perret --- arch/arm64/kvm/mmu.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 962f4472601b..dbee0a624cba 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1760,6 +1760,13 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, hva_t hva, reg_end; int ret = 0; + /* In protected mode, cannot modify memslots once a VM has run. */ + if (is_protected_kvm_enabled() && + (change == KVM_MR_DELETE || change == KVM_MR_MOVE) && + kvm->arch.pkvm.handle) { + return -EPERM; + } + if (change != KVM_MR_CREATE && change != KVM_MR_MOVE && change != KVM_MR_FLAGS_ONLY) return 0; @@ -1836,6 +1843,10 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm, gpa_t gpa = slot->base_gfn << PAGE_SHIFT; phys_addr_t size = slot->npages << PAGE_SHIFT; + /* Stage-2 is managed by hyp in protected mode. 
*/ + if (is_protected_kvm_enabled()) + return; + write_lock(&kvm->mmu_lock); unmap_stage2_range(&kvm->arch.mmu, gpa, size); write_unlock(&kvm->mmu_lock); From 9954ff8dcc938c6c61fdf088dcf674692a848ec5 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Fri, 6 May 2022 08:59:32 +0000 Subject: [PATCH 042/457] ANDROID: KVM: arm64: Disallow dirty logging and RO memslots with pKVM The current implementation of pKVM doesn't support dirty logging or read-only memslots. Although support for these features is desirable, this will require future work, so let's cleanly report the limitations to userspace by failing the ioctls until then. Signed-off-by: Quentin Perret Signed-off-by: Will Deacon Bug: 233587962 Change-Id: Ifc434c234ac58b46a244fdd44114bc9a51f53e19 Signed-off-by: Quentin Perret --- arch/arm64/kvm/mmu.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index dbee0a624cba..f545a59c0325 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1760,11 +1760,17 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, hva_t hva, reg_end; int ret = 0; - /* In protected mode, cannot modify memslots once a VM has run. */ - if (is_protected_kvm_enabled() && - (change == KVM_MR_DELETE || change == KVM_MR_MOVE) && - kvm->arch.pkvm.handle) { - return -EPERM; + if (is_protected_kvm_enabled()) { + /* In protected mode, cannot modify memslots once a VM has run. */ + if ((change == KVM_MR_DELETE || change == KVM_MR_MOVE) && + kvm->arch.pkvm.handle) { + return -EPERM; + } + + if (new && + new->flags & (KVM_MEM_LOG_DIRTY_PAGES | KVM_MEM_READONLY)) { + return -EPERM; + } } if (change != KVM_MR_CREATE && change != KVM_MR_MOVE && From 3b2d9cbcafbe6513e6423b4b7b790272c338126a Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Mon, 11 Apr 2022 14:22:33 +0100 Subject: [PATCH 043/457] ANDROID: KVM: arm64: Handle guest stage-2 page-tables entirely at EL2 Now that EL2 is able to manage guest stage-2 page-tables, avoid allocating a separate MMU structure in the host and instead introduce a new fault handler which responds to guest stage-2 faults by sharing GUP-pinned pages with the guest via a hypercall. These pages are recovered (and unpinned) on guest teardown via the page reclaim hypercall. 
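To illustrate the shape of the new path (a simplified sketch only: locking, memcache top-up, locked-vm accounting and the pinned-page bookkeeping in the diff below are omitted):

	static int pkvm_fault_flow_sketch(struct kvm_vcpu *vcpu,
					  phys_addr_t fault_ipa, unsigned long hva)
	{
		struct page *page;

		/* Long-term pin the backing page so the host cannot unmap or migrate it. */
		if (pin_user_pages(hva, 1, FOLL_LONGTERM | FOLL_WRITE, &page, NULL) != 1)
			return -EFAULT;

		/* Ask EL2 to install the pfn in the guest stage-2 at the faulting IPA. */
		return kvm_call_hyp_nvhe(__pkvm_host_map_guest,
					 page_to_pfn(page), fault_ipa >> PAGE_SHIFT, vcpu);
	}

Each pinned page is also kept on a per-VM list so that pkvm_destroy_hyp_vm() can reclaim and unpin it at teardown.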
Signed-off-by: Will Deacon Signed-off-by: Will Deacon Bug: 233587962 Change-Id: Ibbddc97cee322bf2db258b4f0848733e2efb1126 Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_asm.h | 1 + arch/arm64/include/asm/kvm_host.h | 6 ++ arch/arm64/kvm/arm.c | 10 ++- arch/arm64/kvm/hyp/nvhe/hyp-main.c | 44 +++++++++- arch/arm64/kvm/mmu.c | 136 +++++++++++++++++++++++++++-- arch/arm64/kvm/pkvm.c | 17 ++++ 6 files changed, 206 insertions(+), 8 deletions(-) diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 2c1f6a571082..36ef27dcd17c 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -65,6 +65,7 @@ enum __kvm_host_smccc_func { __KVM_HOST_SMCCC_FUNC___pkvm_host_share_hyp, __KVM_HOST_SMCCC_FUNC___pkvm_host_unshare_hyp, __KVM_HOST_SMCCC_FUNC___pkvm_host_reclaim_page, + __KVM_HOST_SMCCC_FUNC___pkvm_host_map_guest, __KVM_HOST_SMCCC_FUNC___kvm_adjust_pc, __KVM_HOST_SMCCC_FUNC___kvm_vcpu_run, __KVM_HOST_SMCCC_FUNC___kvm_flush_vm_context, diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 1d4f14b4c2b5..da3f38284e59 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -172,11 +172,17 @@ struct kvm_smccc_features { unsigned long vendor_hyp_bmap; }; +struct kvm_pinned_page { + struct list_head link; + struct page *page; +}; + typedef unsigned int pkvm_handle_t; struct kvm_protected_vm { pkvm_handle_t handle; struct kvm_hyp_memcache teardown_mc; + struct list_head pinned_pages; }; struct kvm_arch { diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 1d4b8122d010..6105e84628da 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -369,7 +369,11 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) if (vcpu_has_run_once(vcpu) && unlikely(!irqchip_in_kernel(vcpu->kvm))) static_branch_dec(&userspace_irqchip_in_use); - kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache); + if (is_protected_kvm_enabled()) + free_hyp_memcache(&vcpu->arch.pkvm_memcache); + else + kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache); + kvm_timer_vcpu_terminate(vcpu); kvm_pmu_vcpu_destroy(vcpu); @@ -391,6 +395,9 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) struct kvm_s2_mmu *mmu; int *last_ran; + if (is_protected_kvm_enabled()) + goto nommu; + mmu = vcpu->arch.hw_mmu; last_ran = this_cpu_ptr(mmu->last_vcpu_ran); @@ -408,6 +415,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) *last_ran = vcpu->vcpu_id; } +nommu: vcpu->cpu = cpu; kvm_vgic_load(vcpu); diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index a5bfe6dd948e..da618fa9982a 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -31,8 +31,6 @@ static void flush_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) hyp_vcpu->vcpu.arch.sve_state = kern_hyp_va(host_vcpu->arch.sve_state); hyp_vcpu->vcpu.arch.sve_max_vl = host_vcpu->arch.sve_max_vl; - hyp_vcpu->vcpu.arch.hw_mmu = host_vcpu->arch.hw_mmu; - hyp_vcpu->vcpu.arch.hcr_el2 = host_vcpu->arch.hcr_el2; hyp_vcpu->vcpu.arch.mdcr_el2 = host_vcpu->arch.mdcr_el2; hyp_vcpu->vcpu.arch.cptr_el2 = host_vcpu->arch.cptr_el2; @@ -104,6 +102,47 @@ out: cpu_reg(host_ctxt, 1) = ret; } +static int pkvm_refill_memcache(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct pkvm_hyp_vm *hyp_vm = pkvm_hyp_vcpu_to_hyp_vm(hyp_vcpu); + u64 nr_pages = VTCR_EL2_LVLS(hyp_vm->kvm.arch.vtcr) - 1; + struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; + + return refill_memcache(&hyp_vcpu->vcpu.arch.pkvm_memcache, 
nr_pages, + &host_vcpu->arch.pkvm_memcache); +} + +static void handle___pkvm_host_map_guest(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(u64, pfn, host_ctxt, 1); + DECLARE_REG(u64, gfn, host_ctxt, 2); + DECLARE_REG(struct kvm_vcpu *, host_vcpu, host_ctxt, 3); + struct pkvm_hyp_vcpu *hyp_vcpu; + struct kvm *host_kvm; + int ret = -EINVAL; + + if (!is_protected_kvm_enabled()) + goto out; + + host_vcpu = kern_hyp_va(host_vcpu); + host_kvm = kern_hyp_va(host_vcpu->kvm); + hyp_vcpu = pkvm_load_hyp_vcpu(host_kvm->arch.pkvm.handle, + host_vcpu->vcpu_idx); + if (!hyp_vcpu) + goto out; + + /* Top-up our per-vcpu memcache from the host's */ + ret = pkvm_refill_memcache(hyp_vcpu); + if (ret) + goto out_put_vcpu; + + ret = __pkvm_host_share_guest(pfn, gfn, hyp_vcpu); +out_put_vcpu: + pkvm_put_hyp_vcpu(hyp_vcpu); +out: + cpu_reg(host_ctxt, 1) = ret; +} + static void handle___kvm_adjust_pc(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(struct kvm_vcpu *, vcpu, host_ctxt, 1); @@ -319,6 +358,7 @@ static const hcall_t host_hcall[] = { HANDLE_FUNC(__pkvm_host_share_hyp), HANDLE_FUNC(__pkvm_host_unshare_hyp), HANDLE_FUNC(__pkvm_host_reclaim_page), + HANDLE_FUNC(__pkvm_host_map_guest), HANDLE_FUNC(__kvm_adjust_pc), HANDLE_FUNC(__kvm_vcpu_run), HANDLE_FUNC(__kvm_flush_vm_context), diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index f545a59c0325..474094291e9b 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -225,6 +225,22 @@ static void unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 si __unmap_stage2_range(mmu, start, size, true); } +static void pkvm_stage2_flush(struct kvm *kvm) +{ + struct kvm_pinned_page *ppage; + + /* + * Contrary to stage2_apply_range(), we don't need to check + * whether the VM is being torn down, as this is always called + * from a vcpu thread, and the list is only ever freed on VM + * destroy (which only occurs when all vcpu are gone). 
+ */ + list_for_each_entry(ppage, &kvm->arch.pkvm.pinned_pages, link) { + __clean_dcache_guest_page(page_address(ppage->page), PAGE_SIZE); + cond_resched_rwlock_write(&kvm->mmu_lock); + } +} + static void stage2_flush_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot) { @@ -250,9 +266,13 @@ static void stage2_flush_vm(struct kvm *kvm) idx = srcu_read_lock(&kvm->srcu); write_lock(&kvm->mmu_lock); - slots = kvm_memslots(kvm); - kvm_for_each_memslot(memslot, bkt, slots) - stage2_flush_memslot(kvm, memslot); + if (!is_protected_kvm_enabled()) { + slots = kvm_memslots(kvm); + kvm_for_each_memslot(memslot, bkt, slots) + stage2_flush_memslot(kvm, memslot); + } else if (!kvm_vm_is_protected(kvm)) { + pkvm_stage2_flush(kvm); + } write_unlock(&kvm->mmu_lock); srcu_read_unlock(&kvm->srcu, idx); @@ -711,6 +731,11 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); kvm->arch.vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift); + INIT_LIST_HEAD(&kvm->arch.pkvm.pinned_pages); + mmu->arch = &kvm->arch; + + if (is_protected_kvm_enabled()) + return 0; if (mmu->pgt != NULL) { kvm_err("kvm_arch already initialized?\n"); @@ -819,6 +844,9 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu) struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu); struct kvm_pgtable *pgt = NULL; + if (is_protected_kvm_enabled()) + return; + write_lock(&kvm->mmu_lock); pgt = mmu->pgt; if (pgt) { @@ -1172,6 +1200,99 @@ static int sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn, return 0; } +static int pkvm_host_map_guest(u64 pfn, u64 gfn, struct kvm_vcpu *vcpu) +{ + int ret = kvm_call_hyp_nvhe(__pkvm_host_map_guest, pfn, gfn, vcpu); + + /* + * Getting -EPERM at this point implies that the pfn has already been + * mapped. This should only ever happen when two vCPUs faulted on the + * same page, and the current one lost the race to do the mapping. + */ + return (ret == -EPERM) ? -EAGAIN : ret; +} + +static int pkvm_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, + unsigned long hva) +{ + struct kvm_hyp_memcache *hyp_memcache = &vcpu->arch.pkvm_memcache; + struct mm_struct *mm = current->mm; + unsigned int flags = FOLL_HWPOISON | FOLL_LONGTERM | FOLL_WRITE; + struct kvm_pinned_page *ppage; + struct kvm *kvm = vcpu->kvm; + struct page *page; + u64 pfn; + int ret; + + ret = topup_hyp_memcache(hyp_memcache, kvm_mmu_cache_min_pages(kvm)); + if (ret) + return -ENOMEM; + + ppage = kmalloc(sizeof(*ppage), GFP_KERNEL_ACCOUNT); + if (!ppage) + return -ENOMEM; + + ret = account_locked_vm(mm, 1, true); + if (ret) + goto free_ppage; + + mmap_read_lock(mm); + ret = pin_user_pages(hva, 1, flags, &page, NULL); + mmap_read_unlock(mm); + + if (ret == -EHWPOISON) { + kvm_send_hwpoison_signal(hva, PAGE_SHIFT); + ret = 0; + goto dec_account; + } else if (ret != 1) { + ret = -EFAULT; + goto dec_account; + } else if (!PageSwapBacked(page)) { + /* + * We really can't deal with page-cache pages returned by GUP + * because (a) we may trigger writeback of a page for which we + * no longer have access and (b) page_mkclean() won't find the + * stage-2 mapping in the rmap so we can get out-of-whack with + * the filesystem when marking the page dirty during unpinning + * (see cc5095747edf ("ext4: don't BUG if someone dirty pages + * without asking ext4 first")). + * + * Ideally we'd just restrict ourselves to anonymous pages, but + * we also want to allow memfd (i.e. 
shmem) pages, so check for + * pages backed by swap in the knowledge that the GUP pin will + * prevent try_to_unmap() from succeeding. + */ + ret = -EIO; + goto dec_account; + } + + write_lock(&kvm->mmu_lock); + pfn = page_to_pfn(page); + ret = pkvm_host_map_guest(pfn, fault_ipa >> PAGE_SHIFT, vcpu); + if (ret) { + if (ret == -EAGAIN) + ret = 0; + goto unpin; + } + + ppage->page = page; + INIT_LIST_HEAD(&ppage->link); + list_add(&ppage->link, &kvm->arch.pkvm.pinned_pages); + write_unlock(&kvm->mmu_lock); + + return 0; + +unpin: + write_unlock(&kvm->mmu_lock); + unpin_user_pages(&page, 1); +dec_account: + account_locked_vm(mm, 1, false); +free_ppage: + kfree(ppage); + + return ret; +} + static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, struct kvm_memory_slot *memslot, unsigned long hva, unsigned long fault_status) @@ -1455,7 +1576,8 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) } /* Falls between the IPA range and the PARange? */ - if (fault_ipa >= BIT_ULL(vcpu->arch.hw_mmu->pgt->ia_bits)) { + if (!is_protected_kvm_enabled() && + fault_ipa >= BIT_ULL(vcpu->arch.hw_mmu->pgt->ia_bits)) { fault_ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0); if (is_iabt) @@ -1551,7 +1673,11 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) goto out_unlock; } - ret = user_mem_abort(vcpu, fault_ipa, memslot, hva, fault_status); + if (is_protected_kvm_enabled()) + ret = pkvm_mem_abort(vcpu, fault_ipa, hva); + else + ret = user_mem_abort(vcpu, fault_ipa, memslot, hva, fault_status); + if (ret == 0) ret = 1; out: diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c index cf56958b1492..aea560873028 100644 --- a/arch/arm64/kvm/pkvm.c +++ b/arch/arm64/kvm/pkvm.c @@ -6,6 +6,7 @@ #include #include +#include #include #include @@ -199,6 +200,10 @@ int pkvm_create_hyp_vm(struct kvm *host_kvm) void pkvm_destroy_hyp_vm(struct kvm *host_kvm) { + struct kvm_pinned_page *ppage, *tmp; + struct mm_struct *mm = current->mm; + struct list_head *ppages; + if (host_kvm->arch.pkvm.handle) { WARN_ON(kvm_call_hyp_nvhe(__pkvm_teardown_vm, host_kvm->arch.pkvm.handle)); @@ -206,6 +211,18 @@ void pkvm_destroy_hyp_vm(struct kvm *host_kvm) host_kvm->arch.pkvm.handle = 0; free_hyp_memcache(&host_kvm->arch.pkvm.teardown_mc); + + ppages = &host_kvm->arch.pkvm.pinned_pages; + list_for_each_entry_safe(ppage, tmp, ppages, link) { + WARN_ON(kvm_call_hyp_nvhe(__pkvm_host_reclaim_page, + page_to_pfn(ppage->page))); + cond_resched(); + + account_locked_vm(mm, 1, false); + unpin_user_pages_dirty_lock(&ppage->page, 1, true); + list_del(&ppage->link); + kfree(ppage); + } } int pkvm_init_host_vm(struct kvm *host_kvm) From 0d40de553ecade142fcaa740a865f5e4fcadf008 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 7 Apr 2022 14:00:10 +0100 Subject: [PATCH 044/457] ANDROID: KVM: arm64: Extend memory donation to allow host-to-guest transitions In preparation for supporting protected guests, where guest memory defaults to being inaccessible to the host, extend our memory protection mechanisms to support donation of pages from the host to a specific guest. 
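The guest-side handling mirrors the sharing case introduced earlier; the difference is the page state installed in the guest's stage-2 (compare guest_complete_share() with the new guest_complete_donation() below):

	/* Sharing: the guest maps the page as borrowed from the host. */
	prot = pkvm_mkstate(perms, PKVM_PAGE_SHARED_BORROWED);

	/* Donation: the guest becomes the sole owner, and the host gives up
	 * its own access to the page altogether. */
	prot = pkvm_mkstate(KVM_PGTABLE_PROT_RWX, PKVM_PAGE_OWNED);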
Signed-off-by: Will Deacon Signed-off-by: Will Deacon Bug: 233587962 Change-Id: Ic397b6fd0f7b5f0911ddd8f457e40c6e6689673c Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/include/nvhe/mem_protect.h | 1 + arch/arm64/kvm/hyp/nvhe/mem_protect.c | 62 +++++++++++++++++++ arch/arm64/kvm/hyp/pgtable.c | 2 +- 3 files changed, 64 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h index a91c3ad6db0b..4d14b4cfb3b7 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h @@ -69,6 +69,7 @@ int __pkvm_host_reclaim_page(u64 pfn); int __pkvm_host_donate_hyp(u64 pfn, u64 nr_pages); int __pkvm_hyp_donate_host(u64 pfn, u64 nr_pages); int __pkvm_host_share_guest(u64 pfn, u64 gfn, struct pkvm_hyp_vcpu *vcpu); +int __pkvm_host_donate_guest(u64 pfn, u64 gfn, struct pkvm_hyp_vcpu *vcpu); bool addr_is_memory(phys_addr_t phys); int host_stage2_idmap_locked(phys_addr_t addr, u64 size, enum kvm_pgtable_prot prot); diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 5736f700b451..d73b88c7062e 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -889,6 +889,14 @@ static int guest_ack_share(u64 addr, const struct pkvm_mem_transition *tx, addr, size, PKVM_NOPAGE); } +static int guest_ack_donation(u64 addr, const struct pkvm_mem_transition *tx) +{ + u64 size = tx->nr_pages * PAGE_SIZE; + + return __guest_check_page_state_range(tx->completer.guest.hyp_vcpu, + addr, size, PKVM_NOPAGE); +} + static int guest_complete_share(u64 addr, const struct pkvm_mem_transition *tx, enum kvm_pgtable_prot perms) { @@ -902,6 +910,17 @@ static int guest_complete_share(u64 addr, const struct pkvm_mem_transition *tx, prot, &vcpu->vcpu.arch.pkvm_memcache); } +static int guest_complete_donation(u64 addr, const struct pkvm_mem_transition *tx) +{ + enum kvm_pgtable_prot prot = pkvm_mkstate(KVM_PGTABLE_PROT_RWX, PKVM_PAGE_OWNED); + struct pkvm_hyp_vcpu *vcpu = tx->completer.guest.hyp_vcpu; + struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); + u64 size = tx->nr_pages * PAGE_SIZE; + + return kvm_pgtable_stage2_map(&vm->pgt, addr, size, tx->completer.guest.phys, + prot, &vcpu->vcpu.arch.pkvm_memcache); +} + static int check_share(struct pkvm_mem_share *share) { const struct pkvm_mem_transition *tx = &share->tx; @@ -1087,6 +1106,9 @@ static int check_donation(struct pkvm_mem_donation *donation) case PKVM_ID_HYP: ret = hyp_ack_donation(completer_addr, tx); break; + case PKVM_ID_GUEST: + ret = guest_ack_donation(completer_addr, tx); + break; default: ret = -EINVAL; } @@ -1121,6 +1143,9 @@ static int __do_donate(struct pkvm_mem_donation *donation) case PKVM_ID_HYP: ret = hyp_complete_donation(completer_addr, tx); break; + case PKVM_ID_GUEST: + ret = guest_complete_donation(completer_addr, tx); + break; default: ret = -EINVAL; } @@ -1361,6 +1386,43 @@ int __pkvm_host_share_guest(u64 pfn, u64 gfn, struct pkvm_hyp_vcpu *vcpu) return ret; } +int __pkvm_host_donate_guest(u64 pfn, u64 gfn, struct pkvm_hyp_vcpu *vcpu) +{ + int ret; + u64 host_addr = hyp_pfn_to_phys(pfn); + u64 guest_addr = hyp_pfn_to_phys(gfn); + struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); + struct pkvm_mem_donation donation = { + .tx = { + .nr_pages = 1, + .initiator = { + .id = PKVM_ID_HOST, + .addr = host_addr, + .host = { + .completer_addr = guest_addr, + }, + }, + .completer = { + .id = PKVM_ID_GUEST, + .guest = { + .hyp_vcpu = vcpu, + .phys = 
host_addr, + }, + }, + }, + }; + + host_lock_component(); + guest_lock_component(vm); + + ret = do_donate(&donation); + + guest_unlock_component(vm); + host_unlock_component(); + + return ret; +} + static int hyp_zero_page(phys_addr_t phys) { void *addr; diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 362a3fdf651f..1597b388d12e 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -47,7 +47,7 @@ KVM_PTE_LEAF_ATTR_HI_S2_XN) #define KVM_INVALID_PTE_OWNER_MASK GENMASK(9, 2) -#define KVM_MAX_OWNER_ID 1 +#define KVM_MAX_OWNER_ID FIELD_MAX(KVM_INVALID_PTE_OWNER_MASK) struct kvm_pgtable_walk_data { struct kvm_pgtable *pgt; From 1b19100fcb2da43f7cf55ec12316045761f97ca2 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 12 Apr 2022 15:45:11 +0100 Subject: [PATCH 045/457] ANDROID: KVM: arm64: Split up nvhe/fixed_config.h In preparation for using some of the pKVM fixed configuration register definitions to filter the available VM CAPs in the host, split the nvhe/fixed_config.h header so that the definitions can be shared with the host, while keeping the hypervisor function prototypes in the nvhe/ namespace. Signed-off-by: Will Deacon Signed-off-by: Will Deacon Bug: 233587962 Change-Id: I33894868e7652f7b79caa91a007dccad997ef4ab Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_pkvm.h | 190 ++++++++++++++++ .../arm64/kvm/hyp/include/nvhe/fixed_config.h | 205 ------------------ arch/arm64/kvm/hyp/include/nvhe/pkvm.h | 5 + arch/arm64/kvm/hyp/nvhe/pkvm.c | 1 - arch/arm64/kvm/hyp/nvhe/setup.c | 1 - arch/arm64/kvm/hyp/nvhe/switch.c | 2 +- arch/arm64/kvm/hyp/nvhe/sys_regs.c | 2 +- 7 files changed, 197 insertions(+), 209 deletions(-) delete mode 100644 arch/arm64/kvm/hyp/include/nvhe/fixed_config.h diff --git a/arch/arm64/include/asm/kvm_pkvm.h b/arch/arm64/include/asm/kvm_pkvm.h index 01129b0d4c68..017b58a61a78 100644 --- a/arch/arm64/include/asm/kvm_pkvm.h +++ b/arch/arm64/include/asm/kvm_pkvm.h @@ -2,12 +2,14 @@ /* * Copyright (C) 2020 - Google LLC * Author: Quentin Perret + * Author: Fuad Tabba */ #ifndef __ARM64_KVM_PKVM_H__ #define __ARM64_KVM_PKVM_H__ #include #include +#include /* Maximum number of VMs that can co-exist under pKVM. */ #define KVM_MAX_PVMS 255 @@ -18,6 +20,194 @@ int pkvm_init_host_vm(struct kvm *kvm); int pkvm_create_hyp_vm(struct kvm *kvm); void pkvm_destroy_hyp_vm(struct kvm *kvm); +/* + * Definitions for features to be allowed or restricted for guest virtual + * machines, depending on the mode KVM is running in and on the type of guest + * that is running. + * + * The ALLOW masks represent a bitmask of feature fields that are allowed + * without any restrictions as long as they are supported by the system. + * + * The RESTRICT_UNSIGNED masks, if present, represent unsigned fields for + * features that are restricted to support at most the specified feature. + * + * If a feature field is not present in either, than it is not supported. 
+ * + * The approach taken for protected VMs is to allow features that are: + * - Needed by common Linux distributions (e.g., floating point) + * - Trivial to support, e.g., supporting the feature does not introduce or + * require tracking of additional state in KVM + * - Cannot be trapped or prevent the guest from using anyway + */ + +/* + * Allow for protected VMs: + * - Floating-point and Advanced SIMD + * - Data Independent Timing + */ +#define PVM_ID_AA64PFR0_ALLOW (\ + ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_FP) | \ + ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_AdvSIMD) | \ + ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_DIT) \ + ) + +/* + * Restrict to the following *unsigned* features for protected VMs: + * - AArch64 guests only (no support for AArch32 guests): + * AArch32 adds complexity in trap handling, emulation, condition codes, + * etc... + * - RAS (v1) + * Supported by KVM + */ +#define PVM_ID_AA64PFR0_RESTRICT_UNSIGNED (\ + FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL0), ID_AA64PFR0_EL1_ELx_64BIT_ONLY) | \ + FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL1), ID_AA64PFR0_EL1_ELx_64BIT_ONLY) | \ + FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL2), ID_AA64PFR0_EL1_ELx_64BIT_ONLY) | \ + FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL3), ID_AA64PFR0_EL1_ELx_64BIT_ONLY) | \ + FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_RAS), ID_AA64PFR0_EL1_RAS_IMP) \ + ) + +/* + * Allow for protected VMs: + * - Branch Target Identification + * - Speculative Store Bypassing + */ +#define PVM_ID_AA64PFR1_ALLOW (\ + ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_BT) | \ + ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_SSBS) \ + ) + +/* + * Allow for protected VMs: + * - Mixed-endian + * - Distinction between Secure and Non-secure Memory + * - Mixed-endian at EL0 only + * - Non-context synchronizing exception entry and exit + */ +#define PVM_ID_AA64MMFR0_ALLOW (\ + ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_BIGEND) | \ + ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_SNSMEM) | \ + ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_BIGENDEL0) | \ + ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_EXS) \ + ) + +/* + * Restrict to the following *unsigned* features for protected VMs: + * - 40-bit IPA + * - 16-bit ASID + */ +#define PVM_ID_AA64MMFR0_RESTRICT_UNSIGNED (\ + FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_PARANGE), ID_AA64MMFR0_EL1_PARANGE_40) | \ + FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_ASIDBITS), ID_AA64MMFR0_EL1_ASIDBITS_16) \ + ) + +/* + * Allow for protected VMs: + * - Hardware translation table updates to Access flag and Dirty state + * - Number of VMID bits from CPU + * - Hierarchical Permission Disables + * - Privileged Access Never + * - SError interrupt exceptions from speculative reads + * - Enhanced Translation Synchronization + */ +#define PVM_ID_AA64MMFR1_ALLOW (\ + ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_HAFDBS) | \ + ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_VMIDBits) | \ + ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_HPDS) | \ + ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_PAN) | \ + ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_SpecSEI) | \ + ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_ETS) \ + ) + +/* + * Allow for protected VMs: + * - Common not Private translations + * - User Access Override + * - IESB bit in the SCTLR_ELx registers + * - Unaligned single-copy atomicity and atomic functions + * - ESR_ELx.EC value on an exception by read access to feature ID space + * - TTL field in address operations. 
+ * - Break-before-make sequences when changing translation block size + * - E0PDx mechanism + */ +#define PVM_ID_AA64MMFR2_ALLOW (\ + ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_CnP) | \ + ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_UAO) | \ + ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_IESB) | \ + ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_AT) | \ + ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_IDS) | \ + ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_TTL) | \ + ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_BBM) | \ + ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_E0PD) \ + ) + +/* + * No support for Scalable Vectors for protected VMs: + * Requires additional support from KVM, e.g., context-switching and + * trapping at EL2 + */ +#define PVM_ID_AA64ZFR0_ALLOW (0ULL) + +/* + * No support for debug, including breakpoints, and watchpoints for protected + * VMs: + * The Arm architecture mandates support for at least the Armv8 debug + * architecture, which would include at least 2 hardware breakpoints and + * watchpoints. Providing that support to protected guests adds + * considerable state and complexity. Therefore, the reserved value of 0 is + * used for debug-related fields. + */ +#define PVM_ID_AA64DFR0_ALLOW (0ULL) +#define PVM_ID_AA64DFR1_ALLOW (0ULL) + +/* + * No support for implementation defined features. + */ +#define PVM_ID_AA64AFR0_ALLOW (0ULL) +#define PVM_ID_AA64AFR1_ALLOW (0ULL) + +/* + * No restrictions on instructions implemented in AArch64. + */ +#define PVM_ID_AA64ISAR0_ALLOW (\ + ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_AES) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_SHA1) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_SHA2) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_CRC32) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_ATOMIC) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_RDM) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_SHA3) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_SM3) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_SM4) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_DP) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_FHM) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_TS) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_TLB) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_RNDR) \ + ) + +#define PVM_ID_AA64ISAR1_ALLOW (\ + ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_DPB) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_APA) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_API) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_JSCVT) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_FCMA) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_LRCPC) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPA) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPI) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_FRINTTS) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_SB) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_SPECRES) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_BF16) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_DGH) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_I8MM) \ + ) + +#define PVM_ID_AA64ISAR2_ALLOW (\ + ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_GPA3) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_APA3) \ + ) + extern struct memblock_region kvm_nvhe_sym(hyp_memory)[]; extern unsigned int kvm_nvhe_sym(hyp_memblock_nr); diff --git a/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h b/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h deleted file mode 100644 index 07edfc7524c9..000000000000 --- a/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h +++ /dev/null @@ -1,205 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) 2021 Google LLC - * Author: Fuad Tabba - */ - -#ifndef __ARM64_KVM_FIXED_CONFIG_H__ -#define __ARM64_KVM_FIXED_CONFIG_H__ - -#include - -/* - * This file contains 
definitions for features to be allowed or restricted for - * guest virtual machines, depending on the mode KVM is running in and on the - * type of guest that is running. - * - * The ALLOW masks represent a bitmask of feature fields that are allowed - * without any restrictions as long as they are supported by the system. - * - * The RESTRICT_UNSIGNED masks, if present, represent unsigned fields for - * features that are restricted to support at most the specified feature. - * - * If a feature field is not present in either, than it is not supported. - * - * The approach taken for protected VMs is to allow features that are: - * - Needed by common Linux distributions (e.g., floating point) - * - Trivial to support, e.g., supporting the feature does not introduce or - * require tracking of additional state in KVM - * - Cannot be trapped or prevent the guest from using anyway - */ - -/* - * Allow for protected VMs: - * - Floating-point and Advanced SIMD - * - Data Independent Timing - */ -#define PVM_ID_AA64PFR0_ALLOW (\ - ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_FP) | \ - ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_AdvSIMD) | \ - ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_DIT) \ - ) - -/* - * Restrict to the following *unsigned* features for protected VMs: - * - AArch64 guests only (no support for AArch32 guests): - * AArch32 adds complexity in trap handling, emulation, condition codes, - * etc... - * - RAS (v1) - * Supported by KVM - */ -#define PVM_ID_AA64PFR0_RESTRICT_UNSIGNED (\ - FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL0), ID_AA64PFR0_EL1_ELx_64BIT_ONLY) | \ - FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL1), ID_AA64PFR0_EL1_ELx_64BIT_ONLY) | \ - FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL2), ID_AA64PFR0_EL1_ELx_64BIT_ONLY) | \ - FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL3), ID_AA64PFR0_EL1_ELx_64BIT_ONLY) | \ - FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_RAS), ID_AA64PFR0_EL1_RAS_IMP) \ - ) - -/* - * Allow for protected VMs: - * - Branch Target Identification - * - Speculative Store Bypassing - */ -#define PVM_ID_AA64PFR1_ALLOW (\ - ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_BT) | \ - ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_SSBS) \ - ) - -/* - * Allow for protected VMs: - * - Mixed-endian - * - Distinction between Secure and Non-secure Memory - * - Mixed-endian at EL0 only - * - Non-context synchronizing exception entry and exit - */ -#define PVM_ID_AA64MMFR0_ALLOW (\ - ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_BIGEND) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_SNSMEM) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_BIGENDEL0) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_EXS) \ - ) - -/* - * Restrict to the following *unsigned* features for protected VMs: - * - 40-bit IPA - * - 16-bit ASID - */ -#define PVM_ID_AA64MMFR0_RESTRICT_UNSIGNED (\ - FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_PARANGE), ID_AA64MMFR0_EL1_PARANGE_40) | \ - FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_ASIDBITS), ID_AA64MMFR0_EL1_ASIDBITS_16) \ - ) - -/* - * Allow for protected VMs: - * - Hardware translation table updates to Access flag and Dirty state - * - Number of VMID bits from CPU - * - Hierarchical Permission Disables - * - Privileged Access Never - * - SError interrupt exceptions from speculative reads - * - Enhanced Translation Synchronization - */ -#define PVM_ID_AA64MMFR1_ALLOW (\ - ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_HAFDBS) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_VMIDBits) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_HPDS) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_PAN) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_SpecSEI) | \ 
- ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_ETS) \ - ) - -/* - * Allow for protected VMs: - * - Common not Private translations - * - User Access Override - * - IESB bit in the SCTLR_ELx registers - * - Unaligned single-copy atomicity and atomic functions - * - ESR_ELx.EC value on an exception by read access to feature ID space - * - TTL field in address operations. - * - Break-before-make sequences when changing translation block size - * - E0PDx mechanism - */ -#define PVM_ID_AA64MMFR2_ALLOW (\ - ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_CnP) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_UAO) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_IESB) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_AT) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_IDS) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_TTL) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_BBM) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_E0PD) \ - ) - -/* - * No support for Scalable Vectors for protected VMs: - * Requires additional support from KVM, e.g., context-switching and - * trapping at EL2 - */ -#define PVM_ID_AA64ZFR0_ALLOW (0ULL) - -/* - * No support for debug, including breakpoints, and watchpoints for protected - * VMs: - * The Arm architecture mandates support for at least the Armv8 debug - * architecture, which would include at least 2 hardware breakpoints and - * watchpoints. Providing that support to protected guests adds - * considerable state and complexity. Therefore, the reserved value of 0 is - * used for debug-related fields. - */ -#define PVM_ID_AA64DFR0_ALLOW (0ULL) -#define PVM_ID_AA64DFR1_ALLOW (0ULL) - -/* - * No support for implementation defined features. - */ -#define PVM_ID_AA64AFR0_ALLOW (0ULL) -#define PVM_ID_AA64AFR1_ALLOW (0ULL) - -/* - * No restrictions on instructions implemented in AArch64. - */ -#define PVM_ID_AA64ISAR0_ALLOW (\ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_AES) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_SHA1) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_SHA2) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_CRC32) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_ATOMIC) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_RDM) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_SHA3) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_SM3) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_SM4) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_DP) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_FHM) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_TS) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_TLB) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_RNDR) \ - ) - -#define PVM_ID_AA64ISAR1_ALLOW (\ - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_DPB) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_APA) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_API) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_JSCVT) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_FCMA) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_LRCPC) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPA) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPI) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_FRINTTS) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_SB) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_SPECRES) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_BF16) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_DGH) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_I8MM) \ - ) - -#define PVM_ID_AA64ISAR2_ALLOW (\ - ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_GPA3) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_APA3) \ - ) - -u64 pvm_read_id_reg(const struct kvm_vcpu *vcpu, u32 id); -bool kvm_handle_pvm_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code); -bool kvm_handle_pvm_restricted(struct kvm_vcpu *vcpu, u64 *exit_code); -int 
kvm_check_pvm_sysreg_table(void); - -#endif /* __ARM64_KVM_FIXED_CONFIG_H__ */ diff --git a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h index 82b3d62538a6..3e5547004764 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h +++ b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h @@ -65,4 +65,9 @@ struct pkvm_hyp_vcpu *pkvm_load_hyp_vcpu(pkvm_handle_t handle, unsigned int vcpu_idx); void pkvm_put_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu); +u64 pvm_read_id_reg(const struct kvm_vcpu *vcpu, u32 id); +bool kvm_handle_pvm_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code); +bool kvm_handle_pvm_restricted(struct kvm_vcpu *vcpu, u64 *exit_code); +int kvm_check_pvm_sysreg_table(void); + #endif /* __ARM64_KVM_NVHE_PKVM_H__ */ diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 1e9b507c4190..2b248c56145c 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -6,7 +6,6 @@ #include #include -#include #include #include #include diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c index 5cdf3fb09bb4..ab9a0d3515af 100644 --- a/arch/arm64/kvm/hyp/nvhe/setup.c +++ b/arch/arm64/kvm/hyp/nvhe/setup.c @@ -11,7 +11,6 @@ #include #include -#include #include #include #include diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index c2cb46ca4fb6..9344688a9282 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -26,8 +26,8 @@ #include #include -#include #include +#include /* Non-VHE specific context */ DEFINE_PER_CPU(struct kvm_host_data, kvm_host_data); diff --git a/arch/arm64/kvm/hyp/nvhe/sys_regs.c b/arch/arm64/kvm/hyp/nvhe/sys_regs.c index 0f9ac25afdf4..e6270150b1cb 100644 --- a/arch/arm64/kvm/hyp/nvhe/sys_regs.c +++ b/arch/arm64/kvm/hyp/nvhe/sys_regs.c @@ -11,7 +11,7 @@ #include -#include +#include #include "../../sys_regs.h" From 0a0fb09992a6a5a25ba3ef5f65fa275f2580f427 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 13 Apr 2022 16:44:51 +0100 Subject: [PATCH 046/457] ANDROID: KVM: arm64: Make vcpu_{read,write}_sys_reg available to HYP code Allow vcpu_{read,write}_sys_reg() to be called from EL2 so that nVHE hyp code can reuse existing helper functions for operations such as resetting the vCPU state. Signed-off-by: Marc Zyngier Signed-off-by: Will Deacon Bug: 233587962 Change-Id: I5509ae1cc8d3fd9479fbe0b662bb62e31636eb77 Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_host.h | 25 ++++++++++++++++++++++--- arch/arm64/kvm/sys_regs.c | 20 -------------------- 2 files changed, 22 insertions(+), 23 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index da3f38284e59..ffd31910da52 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -699,9 +699,6 @@ struct kvm_vcpu_arch { #define __vcpu_sys_reg(v,r) (ctxt_sys_reg(&(v)->arch.ctxt, (r))) -u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg); -void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg); - static inline bool __vcpu_read_sys_reg_from_cpu(int reg, u64 *val) { /* @@ -793,6 +790,28 @@ static inline bool __vcpu_write_sys_reg_to_cpu(u64 val, int reg) return true; } +#define vcpu_read_sys_reg(__vcpu, reg) \ + ({ \ + u64 __val = 0x8badf00d8badf00d; \ + \ + /* SYSREGS_ON_CPU is only used in VHE */ \ + ((!is_nvhe_hyp_code() && \ + vcpu_get_flag(__vcpu, SYSREGS_ON_CPU) && \ + __vcpu_read_sys_reg_from_cpu(reg, &__val))) ? 
\ + __val \ + : \ + ctxt_sys_reg(&__vcpu->arch.ctxt, reg); \ + }) + +#define vcpu_write_sys_reg(__vcpu, __val, reg) \ + do { \ + /* SYSREGS_ON_CPU is only used in VHE */ \ + if (is_nvhe_hyp_code() || \ + !vcpu_get_flag(__vcpu, SYSREGS_ON_CPU) || \ + !__vcpu_write_sys_reg_to_cpu(__val, reg)) \ + ctxt_sys_reg(&__vcpu->arch.ctxt, reg) = __val; \ + } while (0) + struct kvm_vm_stat { struct kvm_vm_stat_generic generic; }; diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index f4a7c5abcbca..8dea7cd2c281 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -61,26 +61,6 @@ static bool write_to_read_only(struct kvm_vcpu *vcpu, return false; } -u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg) -{ - u64 val = 0x8badf00d8badf00d; - - if (vcpu_get_flag(vcpu, SYSREGS_ON_CPU) && - __vcpu_read_sys_reg_from_cpu(reg, &val)) - return val; - - return __vcpu_sys_reg(vcpu, reg); -} - -void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg) -{ - if (vcpu_get_flag(vcpu, SYSREGS_ON_CPU) && - __vcpu_write_sys_reg_to_cpu(val, reg)) - return; - - __vcpu_sys_reg(vcpu, reg) = val; -} - /* 3 bits per cache level, as per CLIDR, but non-existent caches always 0 */ static u32 cache_levels; From 4f1e3e2c1ec8720c9d106955091819fa5b58ab5d Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 13 Apr 2022 19:06:36 +0100 Subject: [PATCH 047/457] ANDROID: KVM: arm64: Simplify vgic-v3 hypercalls Consolidate the GICv3 VMCR accessor hypercalls into the APR save/restore hypercalls so that all of the EL2 GICv3 state is covered by a single pair of hypercalls. Signed-off-by: Marc Zyngier Signed-off-by: Will Deacon Bug: 233587962 Change-Id: Ifb109d1592a82d0858d5040482d5cf686f9e74e2 Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_asm.h | 8 ++------ arch/arm64/include/asm/kvm_hyp.h | 4 ++-- arch/arm64/kvm/arm.c | 7 +++---- arch/arm64/kvm/hyp/nvhe/hyp-main.c | 24 ++++++------------------ arch/arm64/kvm/hyp/vgic-v3-sr.c | 27 +++++++++++++++++++++++---- arch/arm64/kvm/vgic/vgic-v2.c | 9 +-------- arch/arm64/kvm/vgic/vgic-v3.c | 26 ++++---------------------- arch/arm64/kvm/vgic/vgic.c | 17 +++-------------- arch/arm64/kvm/vgic/vgic.h | 6 ++---- include/kvm/arm_vgic.h | 3 +-- 10 files changed, 47 insertions(+), 84 deletions(-) diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 36ef27dcd17c..cb479913d9fb 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -73,10 +73,8 @@ enum __kvm_host_smccc_func { __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid, __KVM_HOST_SMCCC_FUNC___kvm_flush_cpu_context, __KVM_HOST_SMCCC_FUNC___kvm_timer_set_cntvoff, - __KVM_HOST_SMCCC_FUNC___vgic_v3_read_vmcr, - __KVM_HOST_SMCCC_FUNC___vgic_v3_write_vmcr, - __KVM_HOST_SMCCC_FUNC___vgic_v3_save_aprs, - __KVM_HOST_SMCCC_FUNC___vgic_v3_restore_aprs, + __KVM_HOST_SMCCC_FUNC___vgic_v3_save_vmcr_aprs, + __KVM_HOST_SMCCC_FUNC___vgic_v3_restore_vmcr_aprs, __KVM_HOST_SMCCC_FUNC___pkvm_vcpu_init_traps, __KVM_HOST_SMCCC_FUNC___pkvm_init_vm, __KVM_HOST_SMCCC_FUNC___pkvm_init_vcpu, @@ -236,8 +234,6 @@ extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu); extern void __kvm_adjust_pc(struct kvm_vcpu *vcpu); extern u64 __vgic_v3_get_gic_config(void); -extern u64 __vgic_v3_read_vmcr(void); -extern void __vgic_v3_write_vmcr(u32 vmcr); extern void __vgic_v3_init_lrs(void); extern u64 __kvm_get_mdcr_el2(void); diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index 6797eafe7890..4adf7c2a77bd 100644 --- 
a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -61,8 +61,8 @@ void __vgic_v3_save_state(struct vgic_v3_cpu_if *cpu_if); void __vgic_v3_restore_state(struct vgic_v3_cpu_if *cpu_if); void __vgic_v3_activate_traps(struct vgic_v3_cpu_if *cpu_if); void __vgic_v3_deactivate_traps(struct vgic_v3_cpu_if *cpu_if); -void __vgic_v3_save_aprs(struct vgic_v3_cpu_if *cpu_if); -void __vgic_v3_restore_aprs(struct vgic_v3_cpu_if *cpu_if); +void __vgic_v3_save_vmcr_aprs(struct vgic_v3_cpu_if *cpu_if); +void __vgic_v3_restore_vmcr_aprs(struct vgic_v3_cpu_if *cpu_if); int __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu); #ifdef __KVM_NVHE_HYPERVISOR__ diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 6105e84628da..60d090ed6cbe 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -447,7 +447,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) if (has_vhe()) kvm_vcpu_put_sysregs_vhe(vcpu); kvm_timer_vcpu_put(vcpu); - kvm_vgic_put(vcpu); + kvm_vgic_put(vcpu, false); kvm_vcpu_pmu_restore_host(vcpu); kvm_arm_vmid_clear_active(); @@ -680,15 +680,14 @@ void kvm_vcpu_wfi(struct kvm_vcpu *vcpu) * doorbells to be signalled, should an interrupt become pending. */ preempt_disable(); - kvm_vgic_vmcr_sync(vcpu); - vgic_v4_put(vcpu, true); + kvm_vgic_put(vcpu, true); preempt_enable(); kvm_vcpu_halt(vcpu); vcpu_clear_flag(vcpu, IN_WFIT); preempt_disable(); - vgic_v4_load(vcpu); + kvm_vgic_load(vcpu); preempt_enable(); } diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index da618fa9982a..6bc7a1e82d91 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -197,16 +197,6 @@ static void handle___vgic_v3_get_gic_config(struct kvm_cpu_context *host_ctxt) cpu_reg(host_ctxt, 1) = __vgic_v3_get_gic_config(); } -static void handle___vgic_v3_read_vmcr(struct kvm_cpu_context *host_ctxt) -{ - cpu_reg(host_ctxt, 1) = __vgic_v3_read_vmcr(); -} - -static void handle___vgic_v3_write_vmcr(struct kvm_cpu_context *host_ctxt) -{ - __vgic_v3_write_vmcr(cpu_reg(host_ctxt, 1)); -} - static void handle___vgic_v3_init_lrs(struct kvm_cpu_context *host_ctxt) { __vgic_v3_init_lrs(); @@ -217,18 +207,18 @@ static void handle___kvm_get_mdcr_el2(struct kvm_cpu_context *host_ctxt) cpu_reg(host_ctxt, 1) = __kvm_get_mdcr_el2(); } -static void handle___vgic_v3_save_aprs(struct kvm_cpu_context *host_ctxt) +static void handle___vgic_v3_save_vmcr_aprs(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(struct vgic_v3_cpu_if *, cpu_if, host_ctxt, 1); - __vgic_v3_save_aprs(kern_hyp_va(cpu_if)); + __vgic_v3_save_vmcr_aprs(kern_hyp_va(cpu_if)); } -static void handle___vgic_v3_restore_aprs(struct kvm_cpu_context *host_ctxt) +static void handle___vgic_v3_restore_vmcr_aprs(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(struct vgic_v3_cpu_if *, cpu_if, host_ctxt, 1); - __vgic_v3_restore_aprs(kern_hyp_va(cpu_if)); + __vgic_v3_restore_vmcr_aprs(kern_hyp_va(cpu_if)); } static void handle___pkvm_init(struct kvm_cpu_context *host_ctxt) @@ -366,10 +356,8 @@ static const hcall_t host_hcall[] = { HANDLE_FUNC(__kvm_tlb_flush_vmid), HANDLE_FUNC(__kvm_flush_cpu_context), HANDLE_FUNC(__kvm_timer_set_cntvoff), - HANDLE_FUNC(__vgic_v3_read_vmcr), - HANDLE_FUNC(__vgic_v3_write_vmcr), - HANDLE_FUNC(__vgic_v3_save_aprs), - HANDLE_FUNC(__vgic_v3_restore_aprs), + HANDLE_FUNC(__vgic_v3_save_vmcr_aprs), + HANDLE_FUNC(__vgic_v3_restore_vmcr_aprs), HANDLE_FUNC(__pkvm_vcpu_init_traps), HANDLE_FUNC(__pkvm_init_vm), HANDLE_FUNC(__pkvm_init_vcpu), diff --git 
a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c index 6cb638b184b1..7b397fad26f2 100644 --- a/arch/arm64/kvm/hyp/vgic-v3-sr.c +++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c @@ -330,7 +330,7 @@ void __vgic_v3_deactivate_traps(struct vgic_v3_cpu_if *cpu_if) write_gicreg(0, ICH_HCR_EL2); } -void __vgic_v3_save_aprs(struct vgic_v3_cpu_if *cpu_if) +static void __vgic_v3_save_aprs(struct vgic_v3_cpu_if *cpu_if) { u64 val; u32 nr_pre_bits; @@ -363,7 +363,7 @@ void __vgic_v3_save_aprs(struct vgic_v3_cpu_if *cpu_if) } } -void __vgic_v3_restore_aprs(struct vgic_v3_cpu_if *cpu_if) +static void __vgic_v3_restore_aprs(struct vgic_v3_cpu_if *cpu_if) { u64 val; u32 nr_pre_bits; @@ -455,16 +455,35 @@ u64 __vgic_v3_get_gic_config(void) return val; } -u64 __vgic_v3_read_vmcr(void) +static u64 __vgic_v3_read_vmcr(void) { return read_gicreg(ICH_VMCR_EL2); } -void __vgic_v3_write_vmcr(u32 vmcr) +static void __vgic_v3_write_vmcr(u32 vmcr) { write_gicreg(vmcr, ICH_VMCR_EL2); } +void __vgic_v3_save_vmcr_aprs(struct vgic_v3_cpu_if *cpu_if) +{ + __vgic_v3_save_aprs(cpu_if); + if (cpu_if->vgic_sre) + cpu_if->vgic_vmcr = __vgic_v3_read_vmcr(); +} + +void __vgic_v3_restore_vmcr_aprs(struct vgic_v3_cpu_if *cpu_if) +{ + /* + * If dealing with a GICv2 emulation on GICv3, VMCR_EL2.VFIQen + * is dependent on ICC_SRE_EL1.SRE, and we have to perform the + * VMCR_EL2 save/restore in the world switch. + */ + if (cpu_if->vgic_sre) + __vgic_v3_write_vmcr(cpu_if->vgic_vmcr); + __vgic_v3_restore_aprs(cpu_if); +} + static int __vgic_v3_bpr_min(void) { /* See Pseudocode for VPriorityGroup */ diff --git a/arch/arm64/kvm/vgic/vgic-v2.c b/arch/arm64/kvm/vgic/vgic-v2.c index 645648349c99..4e8bb90bd96f 100644 --- a/arch/arm64/kvm/vgic/vgic-v2.c +++ b/arch/arm64/kvm/vgic/vgic-v2.c @@ -470,17 +470,10 @@ void vgic_v2_load(struct kvm_vcpu *vcpu) kvm_vgic_global_state.vctrl_base + GICH_APR); } -void vgic_v2_vmcr_sync(struct kvm_vcpu *vcpu) +void vgic_v2_put(struct kvm_vcpu *vcpu, bool blocking) { struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; cpu_if->vgic_vmcr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_VMCR); -} - -void vgic_v2_put(struct kvm_vcpu *vcpu) -{ - struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; - - vgic_v2_vmcr_sync(vcpu); cpu_if->vgic_apr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_APR); } diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index 826ff6f2a4e7..d5b0544be269 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -724,15 +724,7 @@ void vgic_v3_load(struct kvm_vcpu *vcpu) { struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; - /* - * If dealing with a GICv2 emulation on GICv3, VMCR_EL2.VFIQen - * is dependent on ICC_SRE_EL1.SRE, and we have to perform the - * VMCR_EL2 save/restore in the world switch. 
- */ - if (likely(cpu_if->vgic_sre)) - kvm_call_hyp(__vgic_v3_write_vmcr, cpu_if->vgic_vmcr); - - kvm_call_hyp(__vgic_v3_restore_aprs, cpu_if); + kvm_call_hyp(__vgic_v3_restore_vmcr_aprs, cpu_if); if (has_vhe()) __vgic_v3_activate_traps(cpu_if); @@ -740,23 +732,13 @@ void vgic_v3_load(struct kvm_vcpu *vcpu) WARN_ON(vgic_v4_load(vcpu)); } -void vgic_v3_vmcr_sync(struct kvm_vcpu *vcpu) +void vgic_v3_put(struct kvm_vcpu *vcpu, bool blocking) { struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; - if (likely(cpu_if->vgic_sre)) - cpu_if->vgic_vmcr = kvm_call_hyp_ret(__vgic_v3_read_vmcr); -} + WARN_ON(vgic_v4_put(vcpu, blocking)); -void vgic_v3_put(struct kvm_vcpu *vcpu) -{ - struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; - - WARN_ON(vgic_v4_put(vcpu, false)); - - vgic_v3_vmcr_sync(vcpu); - - kvm_call_hyp(__vgic_v3_save_aprs, cpu_if); + kvm_call_hyp(__vgic_v3_save_vmcr_aprs, cpu_if); if (has_vhe()) __vgic_v3_deactivate_traps(cpu_if); diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c index d97e6080b421..6189ad969675 100644 --- a/arch/arm64/kvm/vgic/vgic.c +++ b/arch/arm64/kvm/vgic/vgic.c @@ -931,26 +931,15 @@ void kvm_vgic_load(struct kvm_vcpu *vcpu) vgic_v3_load(vcpu); } -void kvm_vgic_put(struct kvm_vcpu *vcpu) +void kvm_vgic_put(struct kvm_vcpu *vcpu, bool blocking) { if (unlikely(!vgic_initialized(vcpu->kvm))) return; if (kvm_vgic_global_state.type == VGIC_V2) - vgic_v2_put(vcpu); + vgic_v2_put(vcpu, blocking); else - vgic_v3_put(vcpu); -} - -void kvm_vgic_vmcr_sync(struct kvm_vcpu *vcpu) -{ - if (unlikely(!irqchip_in_kernel(vcpu->kvm))) - return; - - if (kvm_vgic_global_state.type == VGIC_V2) - vgic_v2_vmcr_sync(vcpu); - else - vgic_v3_vmcr_sync(vcpu); + vgic_v3_put(vcpu, blocking); } int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu) diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h index 0c8da72953f0..5f8bb8c730ce 100644 --- a/arch/arm64/kvm/vgic/vgic.h +++ b/arch/arm64/kvm/vgic/vgic.h @@ -201,8 +201,7 @@ int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address, void vgic_v2_init_lrs(void); void vgic_v2_load(struct kvm_vcpu *vcpu); -void vgic_v2_put(struct kvm_vcpu *vcpu); -void vgic_v2_vmcr_sync(struct kvm_vcpu *vcpu); +void vgic_v2_put(struct kvm_vcpu *vcpu, bool blocking); void vgic_v2_save_state(struct kvm_vcpu *vcpu); void vgic_v2_restore_state(struct kvm_vcpu *vcpu); @@ -232,8 +231,7 @@ int vgic_register_redist_iodev(struct kvm_vcpu *vcpu); bool vgic_v3_check_base(struct kvm *kvm); void vgic_v3_load(struct kvm_vcpu *vcpu); -void vgic_v3_put(struct kvm_vcpu *vcpu); -void vgic_v3_vmcr_sync(struct kvm_vcpu *vcpu); +void vgic_v3_put(struct kvm_vcpu *vcpu, bool blocking); bool vgic_has_its(struct kvm *kvm); int kvm_vgic_register_its_device(void); diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index 4df9e73a8bb5..ffeffbc43ba3 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -384,8 +384,7 @@ bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int vintid); int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu); void kvm_vgic_load(struct kvm_vcpu *vcpu); -void kvm_vgic_put(struct kvm_vcpu *vcpu); -void kvm_vgic_vmcr_sync(struct kvm_vcpu *vcpu); +void kvm_vgic_put(struct kvm_vcpu *vcpu, bool blocking); #define irqchip_in_kernel(k) (!!((k)->arch.vgic.in_kernel)) #define vgic_initialized(k) ((k)->arch.vgic.initialized) From 938d9901973f32e506e93b49e963154e22f0fa17 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 14 Apr 2022 16:24:28 +0100 Subject: [PATCH 048/457] ANDROID: 
KVM: arm64: Add the {flush,sync}_hyp_vgic_state() primitives Rather than blindly copying the vGIC state to/from the host at EL2, introduce a couple of helpers to copy only what is needed and to sanitise untrusted data passed by the host kernel. Signed-off-by: Marc Zyngier Signed-off-by: Will Deacon Bug: 233587962 Change-Id: Ibab19f638a7d0646c4d17ce5dbd2d3c0be474eac Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/nvhe/hyp-main.c | 50 +++++++++++++++++++++++++----- 1 file changed, 43 insertions(+), 7 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 6bc7a1e82d91..39f1446c0954 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -18,10 +18,51 @@ #include #include +#include + DEFINE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params); void __kvm_hyp_host_forward_smc(struct kvm_cpu_context *host_ctxt); +static void flush_hyp_vgic_state(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; + struct vgic_v3_cpu_if *host_cpu_if, *hyp_cpu_if; + unsigned int used_lrs, max_lrs, i; + + host_cpu_if = &host_vcpu->arch.vgic_cpu.vgic_v3; + hyp_cpu_if = &hyp_vcpu->vcpu.arch.vgic_cpu.vgic_v3; + + max_lrs = (read_gicreg(ICH_VTR_EL2) & 0xf) + 1; + used_lrs = READ_ONCE(host_cpu_if->used_lrs); + used_lrs = min(used_lrs, max_lrs); + + hyp_cpu_if->vgic_hcr = READ_ONCE(host_cpu_if->vgic_hcr); + /* Should be a one-off */ + hyp_cpu_if->vgic_sre = (ICC_SRE_EL1_DIB | + ICC_SRE_EL1_DFB | + ICC_SRE_EL1_SRE); + hyp_cpu_if->used_lrs = used_lrs; + + for (i = 0; i < used_lrs; i++) + hyp_cpu_if->vgic_lr[i] = READ_ONCE(host_cpu_if->vgic_lr[i]); +} + +static void sync_hyp_vgic_state(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; + struct vgic_v3_cpu_if *host_cpu_if, *hyp_cpu_if; + unsigned int i; + + host_cpu_if = &host_vcpu->arch.vgic_cpu.vgic_v3; + hyp_cpu_if = &hyp_vcpu->vcpu.arch.vgic_cpu.vgic_v3; + + WRITE_ONCE(host_cpu_if->vgic_hcr, hyp_cpu_if->vgic_hcr); + + for (i = 0; i < hyp_cpu_if->used_lrs; i++) + WRITE_ONCE(host_cpu_if->vgic_lr[i], hyp_cpu_if->vgic_lr[i]); +} + static void flush_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) { struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; @@ -43,15 +84,12 @@ static void flush_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) hyp_vcpu->vcpu.arch.vsesr_el2 = host_vcpu->arch.vsesr_el2; - hyp_vcpu->vcpu.arch.vgic_cpu.vgic_v3 = host_vcpu->arch.vgic_cpu.vgic_v3; + flush_hyp_vgic_state(hyp_vcpu); } static void sync_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) { struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; - struct vgic_v3_cpu_if *hyp_cpu_if = &hyp_vcpu->vcpu.arch.vgic_cpu.vgic_v3; - struct vgic_v3_cpu_if *host_cpu_if = &host_vcpu->arch.vgic_cpu.vgic_v3; - unsigned int i; host_vcpu->arch.ctxt = hyp_vcpu->vcpu.arch.ctxt; @@ -63,9 +101,7 @@ static void sync_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) host_vcpu->arch.iflags = hyp_vcpu->vcpu.arch.iflags; host_vcpu->arch.fp_state = hyp_vcpu->vcpu.arch.fp_state; - host_cpu_if->vgic_hcr = hyp_cpu_if->vgic_hcr; - for (i = 0; i < hyp_cpu_if->used_lrs; ++i) - host_cpu_if->vgic_lr[i] = hyp_cpu_if->vgic_lr[i]; + sync_hyp_vgic_state(hyp_vcpu); } static void handle___kvm_vcpu_run(struct kvm_cpu_context *host_ctxt) From 9c16ae1dd849f70390655d7d5e5419769a37cda8 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 20 Apr 2022 11:10:17 +0100 Subject: [PATCH 049/457] ANDROID: KVM: arm64: Introduce predicates to check for protected state In order to determine whether or not a VM or (hyp) vCPU are 
protected, introduce a helper function to query this state. For now, these will always return 'false' as the underlying field is never configured. Signed-off-by: Marc Zyngier Signed-off-by: Will Deacon Bug: 233587962 Change-Id: Ib39d510d56b5d96d97526d725c7768d4fe5cf752 Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_host.h | 6 ++---- arch/arm64/kvm/hyp/include/nvhe/pkvm.h | 13 +++++++++++++ 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index ffd31910da52..4d414471142a 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -183,6 +183,7 @@ struct kvm_protected_vm { pkvm_handle_t handle; struct kvm_hyp_memcache teardown_mc; struct list_head pinned_pages; + bool enabled; }; struct kvm_arch { @@ -1032,10 +1033,7 @@ int kvm_set_ipa_limit(void); #define __KVM_HAVE_ARCH_VM_ALLOC struct kvm *kvm_arch_alloc_vm(void); -static inline bool kvm_vm_is_protected(struct kvm *kvm) -{ - return false; -} +#define kvm_vm_is_protected(kvm) ((kvm)->arch.pkvm.enabled) void kvm_init_protected_traps(struct kvm_vcpu *vcpu); diff --git a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h index 3e5547004764..0fc1cf5bae8c 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h +++ b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h @@ -53,6 +53,19 @@ pkvm_hyp_vcpu_to_hyp_vm(struct pkvm_hyp_vcpu *hyp_vcpu) return container_of(hyp_vcpu->vcpu.kvm, struct pkvm_hyp_vm, kvm); } +static inline bool vcpu_is_protected(struct kvm_vcpu *vcpu) +{ + if (!is_protected_kvm_enabled()) + return false; + + return vcpu->kvm->arch.pkvm.enabled; +} + +static inline bool pkvm_hyp_vcpu_is_protected(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + return vcpu_is_protected(&hyp_vcpu->vcpu); +} + void pkvm_hyp_vm_table_init(void *tbl); int __pkvm_init_vm(struct kvm *host_kvm, unsigned long vm_hva, From 0af9d0d627b0be9478687df45d6f1b6f140013c9 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 20 Apr 2022 12:02:15 +0100 Subject: [PATCH 050/457] ANDROID: KVM: arm64: Add the {flush,sync}_hyp_timer_state() primitives In preparation for save/restore of the timer state at EL2 for protected VMs, introduce a couple of sync/flush primitives for the architected timer, in much the same way as we have for the GIC. Signed-off-by: Marc Zyngier Signed-off-by: Will Deacon Bug: 233587962 Change-Id: I51fd848f12c71e2c6cb14d3db834a12f1a3226d8 Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/nvhe/hyp-main.c | 34 ++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 39f1446c0954..359d64f6885b 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -63,6 +63,38 @@ static void sync_hyp_vgic_state(struct pkvm_hyp_vcpu *hyp_vcpu) WRITE_ONCE(host_cpu_if->vgic_lr[i], hyp_cpu_if->vgic_lr[i]); } +static void flush_hyp_timer_state(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + if (!pkvm_hyp_vcpu_is_protected(hyp_vcpu)) + return; + + /* + * A hyp vcpu has no offset, and sees vtime == ptime. The + * ptimer is fully emulated by EL1 and cannot be trusted. 
+ */ + write_sysreg(0, cntvoff_el2); + isb(); + write_sysreg_el0(__vcpu_sys_reg(&hyp_vcpu->vcpu, CNTV_CVAL_EL0), + SYS_CNTV_CVAL); + write_sysreg_el0(__vcpu_sys_reg(&hyp_vcpu->vcpu, CNTV_CTL_EL0), + SYS_CNTV_CTL); +} + +static void sync_hyp_timer_state(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + if (!pkvm_hyp_vcpu_is_protected(hyp_vcpu)) + return; + + /* + * Preserve the vtimer state so that it is always correct, + * even if the host tries to make a mess. + */ + __vcpu_sys_reg(&hyp_vcpu->vcpu, CNTV_CVAL_EL0) = + read_sysreg_el0(SYS_CNTV_CVAL); + __vcpu_sys_reg(&hyp_vcpu->vcpu, CNTV_CTL_EL0) = + read_sysreg_el0(SYS_CNTV_CTL); +} + static void flush_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) { struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; @@ -85,6 +117,7 @@ static void flush_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) hyp_vcpu->vcpu.arch.vsesr_el2 = host_vcpu->arch.vsesr_el2; flush_hyp_vgic_state(hyp_vcpu); + flush_hyp_timer_state(hyp_vcpu); } static void sync_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) @@ -102,6 +135,7 @@ static void sync_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) host_vcpu->arch.fp_state = hyp_vcpu->vcpu.arch.fp_state; sync_hyp_vgic_state(hyp_vcpu); + sync_hyp_timer_state(hyp_vcpu); } static void handle___kvm_vcpu_run(struct kvm_cpu_context *host_ctxt) From 44a06e70d8e01992f624b3799f1c09d91d91831e Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 20 Apr 2022 16:05:17 +0100 Subject: [PATCH 051/457] ANDROID: KVM: arm64: Introduce the pkvm_vcpu_{load,put} hypercalls Rather than look-up the hyp vCPU on every run hypercall at EL2, introduce a per-CPU 'loaded_hyp_vcpu' tracking variable which is updated by a pair of load/put hypercalls called directly from kvm_arch_vcpu_{load,put}() when pKVM is enabled. Signed-off-by: Marc Zyngier Signed-off-by: Will Deacon Bug: 233587962 Change-Id: Ic640cb805d0f9610059713ff19918dcffc477d44 Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_asm.h | 2 + arch/arm64/kvm/arm.c | 14 ++++++ arch/arm64/kvm/hyp/include/nvhe/pkvm.h | 7 +++ arch/arm64/kvm/hyp/nvhe/hyp-main.c | 59 +++++++++++++++++--------- arch/arm64/kvm/hyp/nvhe/pkvm.c | 28 ++++++++++++ arch/arm64/kvm/mmu.c | 6 +-- arch/arm64/kvm/vgic/vgic-v3.c | 6 ++- 7 files changed, 98 insertions(+), 24 deletions(-) diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index cb479913d9fb..7939291f19a0 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -79,6 +79,8 @@ enum __kvm_host_smccc_func { __KVM_HOST_SMCCC_FUNC___pkvm_init_vm, __KVM_HOST_SMCCC_FUNC___pkvm_init_vcpu, __KVM_HOST_SMCCC_FUNC___pkvm_teardown_vm, + __KVM_HOST_SMCCC_FUNC___pkvm_vcpu_load, + __KVM_HOST_SMCCC_FUNC___pkvm_vcpu_put, }; #define DECLARE_KVM_VHE_SYM(sym) extern char sym[] diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 60d090ed6cbe..5af7e385702d 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -436,12 +436,26 @@ nommu: vcpu_ptrauth_disable(vcpu); kvm_arch_vcpu_load_debug_state_flags(vcpu); + if (is_protected_kvm_enabled()) { + kvm_call_hyp_nvhe(__pkvm_vcpu_load, + vcpu->kvm->arch.pkvm.handle, + vcpu->vcpu_idx, vcpu->arch.hcr_el2); + kvm_call_hyp(__vgic_v3_restore_vmcr_aprs, + &vcpu->arch.vgic_cpu.vgic_v3); + } + if (!cpumask_test_cpu(smp_processor_id(), vcpu->kvm->arch.supported_cpus)) vcpu_set_on_unsupported_cpu(vcpu); } void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { + if (is_protected_kvm_enabled()) { + kvm_call_hyp(__vgic_v3_save_vmcr_aprs, + &vcpu->arch.vgic_cpu.vgic_v3); + kvm_call_hyp_nvhe(__pkvm_vcpu_put); + } + 
kvm_arch_vcpu_put_debug_state_flags(vcpu); kvm_arch_vcpu_put_fp(vcpu); if (has_vhe()) diff --git a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h index 0fc1cf5bae8c..c9d357028fd9 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h +++ b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h @@ -20,6 +20,12 @@ struct pkvm_hyp_vcpu { /* Backpointer to the host's (untrusted) vCPU instance. */ struct kvm_vcpu *host_vcpu; + + /* + * If this hyp vCPU is loaded, then this is a backpointer to the + * per-cpu pointer tracking us. Otherwise, NULL if not loaded. + */ + struct pkvm_hyp_vcpu **loaded_hyp_vcpu; }; /* @@ -77,6 +83,7 @@ int __pkvm_teardown_vm(pkvm_handle_t handle); struct pkvm_hyp_vcpu *pkvm_load_hyp_vcpu(pkvm_handle_t handle, unsigned int vcpu_idx); void pkvm_put_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu); +struct pkvm_hyp_vcpu *pkvm_get_loaded_hyp_vcpu(void); u64 pvm_read_id_reg(const struct kvm_vcpu *vcpu, u32 id); bool kvm_handle_pvm_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code); diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 359d64f6885b..4883fd9c976a 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -138,20 +138,48 @@ static void sync_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) sync_hyp_timer_state(hyp_vcpu); } +static void handle___pkvm_vcpu_load(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1); + DECLARE_REG(unsigned int, vcpu_idx, host_ctxt, 2); + DECLARE_REG(u64, hcr_el2, host_ctxt, 3); + struct pkvm_hyp_vcpu *hyp_vcpu; + + if (!is_protected_kvm_enabled()) + return; + + hyp_vcpu = pkvm_load_hyp_vcpu(handle, vcpu_idx); + if (!hyp_vcpu) + return; + + if (pkvm_hyp_vcpu_is_protected(hyp_vcpu)) { + /* Propagate WFx trapping flags, trap ptrauth */ + hyp_vcpu->vcpu.arch.hcr_el2 &= ~(HCR_TWE | HCR_TWI | + HCR_API | HCR_APK); + hyp_vcpu->vcpu.arch.hcr_el2 |= hcr_el2 & (HCR_TWE | HCR_TWI); + } +} + +static void handle___pkvm_vcpu_put(struct kvm_cpu_context *host_ctxt) +{ + struct pkvm_hyp_vcpu *hyp_vcpu; + + if (!is_protected_kvm_enabled()) + return; + + hyp_vcpu = pkvm_get_loaded_hyp_vcpu(); + if (hyp_vcpu) + pkvm_put_hyp_vcpu(hyp_vcpu); +} + static void handle___kvm_vcpu_run(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(struct kvm_vcpu *, host_vcpu, host_ctxt, 1); int ret; - host_vcpu = kern_hyp_va(host_vcpu); - if (unlikely(is_protected_kvm_enabled())) { - struct pkvm_hyp_vcpu *hyp_vcpu; - struct kvm *host_kvm; + struct pkvm_hyp_vcpu *hyp_vcpu = pkvm_get_loaded_hyp_vcpu(); - host_kvm = kern_hyp_va(host_vcpu->kvm); - hyp_vcpu = pkvm_load_hyp_vcpu(host_kvm->arch.pkvm.handle, - host_vcpu->vcpu_idx); if (!hyp_vcpu) { ret = -EINVAL; goto out; @@ -162,12 +190,10 @@ static void handle___kvm_vcpu_run(struct kvm_cpu_context *host_ctxt) ret = __kvm_vcpu_run(&hyp_vcpu->vcpu); sync_hyp_vcpu(hyp_vcpu); - pkvm_put_hyp_vcpu(hyp_vcpu); } else { /* The host is fully trusted, run its vCPU directly. 
*/ - ret = __kvm_vcpu_run(host_vcpu); + ret = __kvm_vcpu_run(kern_hyp_va(host_vcpu)); } - out: cpu_reg(host_ctxt, 1) = ret; } @@ -186,29 +212,22 @@ static void handle___pkvm_host_map_guest(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(u64, pfn, host_ctxt, 1); DECLARE_REG(u64, gfn, host_ctxt, 2); - DECLARE_REG(struct kvm_vcpu *, host_vcpu, host_ctxt, 3); struct pkvm_hyp_vcpu *hyp_vcpu; - struct kvm *host_kvm; int ret = -EINVAL; if (!is_protected_kvm_enabled()) goto out; - host_vcpu = kern_hyp_va(host_vcpu); - host_kvm = kern_hyp_va(host_vcpu->kvm); - hyp_vcpu = pkvm_load_hyp_vcpu(host_kvm->arch.pkvm.handle, - host_vcpu->vcpu_idx); + hyp_vcpu = pkvm_get_loaded_hyp_vcpu(); if (!hyp_vcpu) goto out; /* Top-up our per-vcpu memcache from the host's */ ret = pkvm_refill_memcache(hyp_vcpu); if (ret) - goto out_put_vcpu; + goto out; ret = __pkvm_host_share_guest(pfn, gfn, hyp_vcpu); -out_put_vcpu: - pkvm_put_hyp_vcpu(hyp_vcpu); out: cpu_reg(host_ctxt, 1) = ret; } @@ -432,6 +451,8 @@ static const hcall_t host_hcall[] = { HANDLE_FUNC(__pkvm_init_vm), HANDLE_FUNC(__pkvm_init_vcpu), HANDLE_FUNC(__pkvm_teardown_vm), + HANDLE_FUNC(__pkvm_vcpu_load), + HANDLE_FUNC(__pkvm_vcpu_put), }; static void handle_host_hcall(struct kvm_cpu_context *host_ctxt) diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 2b248c56145c..3ce85f9caf92 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -17,6 +17,12 @@ unsigned long __icache_flags; /* Used by kvm_get_vttbr(). */ unsigned int kvm_arm_vmid_bits; +/* + * The currently loaded hyp vCPU for each physical CPU. Used only when + * protected KVM is enabled, but for both protected and non-protected VMs. + */ +static DEFINE_PER_CPU(struct pkvm_hyp_vcpu *, loaded_hyp_vcpu); + /* * Set trap register values based on features in ID_AA64PFR0. */ @@ -246,15 +252,30 @@ struct pkvm_hyp_vcpu *pkvm_load_hyp_vcpu(pkvm_handle_t handle, struct pkvm_hyp_vcpu *hyp_vcpu = NULL; struct pkvm_hyp_vm *hyp_vm; + /* Cannot load a new vcpu without putting the old one first. */ + if (__this_cpu_read(loaded_hyp_vcpu)) + return NULL; + hyp_spin_lock(&vm_table_lock); hyp_vm = get_vm_by_handle(handle); if (!hyp_vm || hyp_vm->nr_vcpus <= vcpu_idx) goto unlock; hyp_vcpu = hyp_vm->vcpus[vcpu_idx]; + + /* Ensure vcpu isn't loaded on more than one cpu simultaneously. 
*/ + if (unlikely(hyp_vcpu->loaded_hyp_vcpu)) { + hyp_vcpu = NULL; + goto unlock; + } + + hyp_vcpu->loaded_hyp_vcpu = this_cpu_ptr(&loaded_hyp_vcpu); hyp_page_ref_inc(hyp_virt_to_page(hyp_vm)); unlock: hyp_spin_unlock(&vm_table_lock); + + if (hyp_vcpu) + __this_cpu_write(loaded_hyp_vcpu, hyp_vcpu); return hyp_vcpu; } @@ -263,10 +284,17 @@ void pkvm_put_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) struct pkvm_hyp_vm *hyp_vm = pkvm_hyp_vcpu_to_hyp_vm(hyp_vcpu); hyp_spin_lock(&vm_table_lock); + hyp_vcpu->loaded_hyp_vcpu = NULL; + __this_cpu_write(loaded_hyp_vcpu, NULL); hyp_page_ref_dec(hyp_virt_to_page(hyp_vm)); hyp_spin_unlock(&vm_table_lock); } +struct pkvm_hyp_vcpu *pkvm_get_loaded_hyp_vcpu(void) +{ + return __this_cpu_read(loaded_hyp_vcpu); +} + static void unpin_host_vcpu(struct kvm_vcpu *host_vcpu) { if (host_vcpu) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 474094291e9b..3dc1ebc923d6 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1200,9 +1200,9 @@ static int sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn, return 0; } -static int pkvm_host_map_guest(u64 pfn, u64 gfn, struct kvm_vcpu *vcpu) +static int pkvm_host_map_guest(u64 pfn, u64 gfn) { - int ret = kvm_call_hyp_nvhe(__pkvm_host_map_guest, pfn, gfn, vcpu); + int ret = kvm_call_hyp_nvhe(__pkvm_host_map_guest, pfn, gfn); /* * Getting -EPERM at this point implies that the pfn has already been @@ -1268,7 +1268,7 @@ static int pkvm_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, write_lock(&kvm->mmu_lock); pfn = page_to_pfn(page); - ret = pkvm_host_map_guest(pfn, fault_ipa >> PAGE_SHIFT, vcpu); + ret = pkvm_host_map_guest(pfn, fault_ipa >> PAGE_SHIFT); if (ret) { if (ret == -EAGAIN) ret = 0; diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index d5b0544be269..53ab7ce32cfa 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -724,7 +724,8 @@ void vgic_v3_load(struct kvm_vcpu *vcpu) { struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; - kvm_call_hyp(__vgic_v3_restore_vmcr_aprs, cpu_if); + if (likely(!is_protected_kvm_enabled())) + kvm_call_hyp(__vgic_v3_restore_vmcr_aprs, cpu_if); if (has_vhe()) __vgic_v3_activate_traps(cpu_if); @@ -738,7 +739,8 @@ void vgic_v3_put(struct kvm_vcpu *vcpu, bool blocking) WARN_ON(vgic_v4_put(vcpu, blocking)); - kvm_call_hyp(__vgic_v3_save_vmcr_aprs, cpu_if); + if (likely(!is_protected_kvm_enabled())) + kvm_call_hyp(__vgic_v3_save_vmcr_aprs, cpu_if); if (has_vhe()) __vgic_v3_deactivate_traps(cpu_if); From 1670659d788992308813c3241264afd61b38fa75 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 9 May 2022 12:22:25 +0100 Subject: [PATCH 052/457] ANDROID: KVM: arm64: Add current host and hyp vCPU lookup primitive In order to be able to safely manipulate the loaded vCPU state, add a helper that always return the vcpu as mapped in the EL2 S1 address space as well as the pointer to the hyp vCPU if it exists. In case of failure, both pointers are returned as NULL values. Convert handle___kvm_vcpu_run() over to the new helper. 
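As a rough stand-alone sketch of the contract described above (all types and names below are simplified stand-ins for illustration, not the kernel's code): when protected mode is enabled, the host-supplied vCPU pointer is only honoured if it matches the hyp vCPU currently loaded on this CPU, and both pointers come back NULL otherwise.

#include <stddef.h>

struct kvm_vcpu { int id; };
struct pkvm_hyp_vcpu { struct kvm_vcpu vcpu; struct kvm_vcpu *host_vcpu; };

static struct pkvm_hyp_vcpu *loaded_hyp_vcpu;	/* set by the load hypercall */
static int protected_mode = 1;			/* stand-in for is_protected_kvm_enabled() */

static struct kvm_vcpu *lookup_host_hyp_vcpus(struct kvm_vcpu *host_vcpu,
					      struct pkvm_hyp_vcpu **hyp_vcpup)
{
	struct pkvm_hyp_vcpu *hyp_vcpu = NULL;

	if (protected_mode) {
		hyp_vcpu = loaded_hyp_vcpu;
		/* Nothing loaded, or a different vCPU: fail with both NULL. */
		if (!hyp_vcpu || hyp_vcpu->host_vcpu != host_vcpu) {
			hyp_vcpu = NULL;
			host_vcpu = NULL;
		}
	}
	*hyp_vcpup = hyp_vcpu;
	return host_vcpu;
}
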
Signed-off-by: Marc Zyngier Signed-off-by: Will Deacon Bug: 233587962 Change-Id: I90ba58c0e73a0544878f6b8514e3f91a9f83083d Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/nvhe/hyp-main.c | 41 ++++++++++++++++++++++++------ 1 file changed, 33 insertions(+), 8 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 4883fd9c976a..c69cd22d1078 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -172,19 +172,44 @@ static void handle___pkvm_vcpu_put(struct kvm_cpu_context *host_ctxt) pkvm_put_hyp_vcpu(hyp_vcpu); } -static void handle___kvm_vcpu_run(struct kvm_cpu_context *host_ctxt) +static struct kvm_vcpu *__get_host_hyp_vcpus(struct kvm_vcpu *arg, + struct pkvm_hyp_vcpu **hyp_vcpup) { - DECLARE_REG(struct kvm_vcpu *, host_vcpu, host_ctxt, 1); - int ret; + struct kvm_vcpu *host_vcpu = kern_hyp_va(arg); + struct pkvm_hyp_vcpu *hyp_vcpu = NULL; if (unlikely(is_protected_kvm_enabled())) { - struct pkvm_hyp_vcpu *hyp_vcpu = pkvm_get_loaded_hyp_vcpu(); + hyp_vcpu = pkvm_get_loaded_hyp_vcpu(); - if (!hyp_vcpu) { - ret = -EINVAL; - goto out; + if (!hyp_vcpu || hyp_vcpu->host_vcpu != host_vcpu) { + hyp_vcpu = NULL; + host_vcpu = NULL; } + } + *hyp_vcpup = hyp_vcpu; + return host_vcpu; +} + +#define get_host_hyp_vcpus(ctxt, regnr, hyp_vcpup) \ + ({ \ + DECLARE_REG(struct kvm_vcpu *, __vcpu, ctxt, regnr); \ + __get_host_hyp_vcpus(__vcpu, hyp_vcpup); \ + }) + +static void handle___kvm_vcpu_run(struct kvm_cpu_context *host_ctxt) +{ + struct pkvm_hyp_vcpu *hyp_vcpu; + struct kvm_vcpu *host_vcpu; + int ret; + + host_vcpu = get_host_hyp_vcpus(host_ctxt, 1, &hyp_vcpu); + if (!host_vcpu) { + ret = -EINVAL; + goto out; + } + + if (unlikely(hyp_vcpu)) { flush_hyp_vcpu(hyp_vcpu); ret = __kvm_vcpu_run(&hyp_vcpu->vcpu); @@ -192,7 +217,7 @@ static void handle___kvm_vcpu_run(struct kvm_cpu_context *host_ctxt) sync_hyp_vcpu(hyp_vcpu); } else { /* The host is fully trusted, run its vCPU directly. */ - ret = __kvm_vcpu_run(kern_hyp_va(host_vcpu)); + ret = __kvm_vcpu_run(host_vcpu); } out: cpu_reg(host_ctxt, 1) = ret; From d5c96bdc268e2387cfc4a66786ec149049d058d1 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 20 Apr 2022 17:12:23 +0100 Subject: [PATCH 053/457] ANDROID: KVM: arm64: Skip __kvm_adjust_pc() for protected vcpus Prevent the host from issuing arbitrary PC adjustments for protected vCPUs. 
Signed-off-by: Marc Zyngier Signed-off-by: Will Deacon Bug: 233587962 Change-Id: I28815d1c6782abf2654ae3e931548014c842d760 Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/nvhe/hyp-main.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index c69cd22d1078..167c9ad98e19 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -259,9 +259,22 @@ out: static void handle___kvm_adjust_pc(struct kvm_cpu_context *host_ctxt) { - DECLARE_REG(struct kvm_vcpu *, vcpu, host_ctxt, 1); + struct pkvm_hyp_vcpu *hyp_vcpu; + struct kvm_vcpu *host_vcpu; - __kvm_adjust_pc(kern_hyp_va(vcpu)); + host_vcpu = get_host_hyp_vcpus(host_ctxt, 1, &hyp_vcpu); + if (!host_vcpu) + return; + + if (hyp_vcpu) { + /* This only applies to non-protected VMs */ + if (pkvm_hyp_vcpu_is_protected(hyp_vcpu)) + return; + + __kvm_adjust_pc(&hyp_vcpu->vcpu); + } else { + __kvm_adjust_pc(host_vcpu); + } } static void handle___kvm_flush_vm_context(struct kvm_cpu_context *host_ctxt) From 3196b641e01ba851ea03120b3a6dda62845de359 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Mon, 9 May 2022 13:19:16 +0000 Subject: [PATCH 054/457] ANDROID: KVM: arm64: Add hyp per_cpu variable to track current physical cpu number Hyp cannot trust the equivalent variable at the host. Signed-off-by: Fuad Tabba Signed-off-by: Will Deacon Bug: 233587962 Change-Id: I3af48c36a700a08327c5f3163c2be1f9b9944816 Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_hyp.h | 3 +++ arch/arm64/kvm/arm.c | 4 ++++ arch/arm64/kvm/hyp/nvhe/hyp-smp.c | 2 ++ 3 files changed, 9 insertions(+) diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index 4adf7c2a77bd..e38869e88019 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -15,6 +15,9 @@ DECLARE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt); DECLARE_PER_CPU(unsigned long, kvm_hyp_vector); DECLARE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params); +DECLARE_PER_CPU(int, hyp_cpu_number); + +#define hyp_smp_processor_id() (__this_cpu_read(hyp_cpu_number)) #define read_sysreg_elx(r,nvh,vh) \ ({ \ diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 5af7e385702d..4faa5dae08bd 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -52,6 +52,7 @@ DECLARE_KVM_HYP_PER_CPU(unsigned long, kvm_hyp_vector); DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page); DECLARE_KVM_NVHE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params); +DECLARE_KVM_NVHE_PER_CPU(int, hyp_cpu_number); static bool vgic_present; @@ -1555,6 +1556,9 @@ static void cpu_prepare_hyp_mode(int cpu) { struct kvm_nvhe_init_params *params = per_cpu_ptr_nvhe_sym(kvm_init_params, cpu); unsigned long tcr; + int *hyp_cpu_number_ptr = per_cpu_ptr_nvhe_sym(hyp_cpu_number, cpu); + + *hyp_cpu_number_ptr = cpu; /* * Calculate the raw per-cpu offset without a translation from the diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-smp.c b/arch/arm64/kvm/hyp/nvhe/hyp-smp.c index 04d194583f1e..9fcb92abd0b5 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-smp.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-smp.c @@ -8,6 +8,8 @@ #include #include +DEFINE_PER_CPU(int, hyp_cpu_number); + /* * nVHE copy of data structures tracking available CPU cores. * Only entries for CPUs that were online at KVM init are populated. 
From df7acc62d4a543a5692492fc956ed066d48f2cc4 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Fri, 30 Sep 2022 14:45:21 +0100 Subject: [PATCH 055/457] ANDROID: KVM: arm64: Ensure that TLBs and I-cache are private to each vcpu Guarantee that both TLBs and I-cache are private to each vcpu. Flush the CPU context if a different vcpu from the same vm is loaded on the same physical CPU. Signed-off-by: Fuad Tabba Signed-off-by: Will Deacon Bug: 233587962 Change-Id: I870e3994c3094b43e1cc6fcaebdd167ebe1de394 Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/include/nvhe/pkvm.h | 2 +- arch/arm64/kvm/hyp/nvhe/hyp-main.c | 16 ++++++++++++- arch/arm64/kvm/hyp/nvhe/pkvm.c | 31 +++++++++++++++++++++----- arch/arm64/kvm/pkvm.c | 20 ++++++++++++----- 4 files changed, 56 insertions(+), 13 deletions(-) diff --git a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h index c9d357028fd9..65e09a589eaa 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h +++ b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h @@ -75,7 +75,7 @@ static inline bool pkvm_hyp_vcpu_is_protected(struct pkvm_hyp_vcpu *hyp_vcpu) void pkvm_hyp_vm_table_init(void *tbl); int __pkvm_init_vm(struct kvm *host_kvm, unsigned long vm_hva, - unsigned long pgd_hva); + unsigned long pgd_hva, unsigned long last_ran_hva); int __pkvm_init_vcpu(pkvm_handle_t handle, struct kvm_vcpu *host_vcpu, unsigned long vcpu_hva); int __pkvm_teardown_vm(pkvm_handle_t handle); diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 167c9ad98e19..76a96814e47b 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -144,6 +144,7 @@ static void handle___pkvm_vcpu_load(struct kvm_cpu_context *host_ctxt) DECLARE_REG(unsigned int, vcpu_idx, host_ctxt, 2); DECLARE_REG(u64, hcr_el2, host_ctxt, 3); struct pkvm_hyp_vcpu *hyp_vcpu; + int *last_ran; if (!is_protected_kvm_enabled()) return; @@ -152,6 +153,17 @@ static void handle___pkvm_vcpu_load(struct kvm_cpu_context *host_ctxt) if (!hyp_vcpu) return; + /* + * Guarantee that both TLBs and I-cache are private to each vcpu. If a + * vcpu from the same VM has previously run on the same physical CPU, + * nuke the relevant contexts. 
+ */ + last_ran = &hyp_vcpu->vcpu.arch.hw_mmu->last_vcpu_ran[hyp_smp_processor_id()]; + if (*last_ran != hyp_vcpu->vcpu.vcpu_id) { + __kvm_flush_cpu_context(hyp_vcpu->vcpu.arch.hw_mmu); + *last_ran = hyp_vcpu->vcpu.vcpu_id; + } + if (pkvm_hyp_vcpu_is_protected(hyp_vcpu)) { /* Propagate WFx trapping flags, trap ptrauth */ hyp_vcpu->vcpu.arch.hcr_el2 &= ~(HCR_TWE | HCR_TWI | @@ -435,9 +447,11 @@ static void handle___pkvm_init_vm(struct kvm_cpu_context *host_ctxt) DECLARE_REG(struct kvm *, host_kvm, host_ctxt, 1); DECLARE_REG(unsigned long, vm_hva, host_ctxt, 2); DECLARE_REG(unsigned long, pgd_hva, host_ctxt, 3); + DECLARE_REG(unsigned long, last_ran_hva, host_ctxt, 4); host_kvm = kern_hyp_va(host_kvm); - cpu_reg(host_ctxt, 1) = __pkvm_init_vm(host_kvm, vm_hva, pgd_hva); + cpu_reg(host_ctxt, 1) = __pkvm_init_vm(host_kvm, vm_hva, pgd_hva, + last_ran_hva); } static void handle___pkvm_init_vcpu(struct kvm_cpu_context *host_ctxt) diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 3ce85f9caf92..12d1f1a1f48c 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -310,12 +310,19 @@ static void unpin_host_vcpus(struct pkvm_hyp_vcpu *hyp_vcpus[], unpin_host_vcpu(hyp_vcpus[i]->host_vcpu); } +static size_t pkvm_get_last_ran_size(void) +{ + return array_size(hyp_nr_cpus, sizeof(int)); +} + static void init_pkvm_hyp_vm(struct kvm *host_kvm, struct pkvm_hyp_vm *hyp_vm, - unsigned int nr_vcpus) + int *last_ran, unsigned int nr_vcpus) { hyp_vm->host_kvm = host_kvm; hyp_vm->kvm.created_vcpus = nr_vcpus; hyp_vm->kvm.arch.vtcr = host_mmu.arch.vtcr; + hyp_vm->kvm.arch.mmu.last_vcpu_ran = last_ran; + memset(hyp_vm->kvm.arch.mmu.last_vcpu_ran, -1, pkvm_get_last_ran_size()); } static int init_pkvm_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu, @@ -471,15 +478,17 @@ static void unmap_donated_memory_noclear(void *va, size_t size) * pgd_hva: The host va of the area being donated for the stage-2 PGD for * the VM. Must be page aligned. Its size is implied by the VM's * VTCR. - * + * last_ran_hva: The host va of the area being donated for hyp to use to track + * the most recent physical cpu on which each vcpu has run. * Return a unique handle to the protected VM on success, * negative error code on failure. 
*/ int __pkvm_init_vm(struct kvm *host_kvm, unsigned long vm_hva, - unsigned long pgd_hva) + unsigned long pgd_hva, unsigned long last_ran_hva) { struct pkvm_hyp_vm *hyp_vm = NULL; - size_t vm_size, pgd_size; + void *last_ran = NULL; + size_t vm_size, pgd_size, last_ran_size; unsigned int nr_vcpus; void *pgd = NULL; int ret; @@ -495,6 +504,7 @@ int __pkvm_init_vm(struct kvm *host_kvm, unsigned long vm_hva, } vm_size = pkvm_get_hyp_vm_size(nr_vcpus); + last_ran_size = pkvm_get_last_ran_size(); pgd_size = kvm_pgtable_stage2_pgd_size(host_mmu.arch.vtcr); ret = -ENOMEM; @@ -503,11 +513,15 @@ int __pkvm_init_vm(struct kvm *host_kvm, unsigned long vm_hva, if (!hyp_vm) goto err_remove_mappings; + last_ran = map_donated_memory(last_ran_hva, last_ran_size); + if (!last_ran) + goto err_remove_mappings; + pgd = map_donated_memory_noclear(pgd_hva, pgd_size); if (!pgd) goto err_remove_mappings; - init_pkvm_hyp_vm(host_kvm, hyp_vm, nr_vcpus); + init_pkvm_hyp_vm(host_kvm, hyp_vm, last_ran, nr_vcpus); hyp_spin_lock(&vm_table_lock); ret = insert_vm_table_entry(host_kvm, hyp_vm); @@ -527,6 +541,7 @@ err_unlock: hyp_spin_unlock(&vm_table_lock); err_remove_mappings: unmap_donated_memory(hyp_vm, vm_size); + unmap_donated_memory(last_ran, last_ran_size); unmap_donated_memory(pgd, pgd_size); err_unpin_kvm: hyp_unpin_shared_mem(host_kvm, host_kvm + 1); @@ -601,10 +616,10 @@ teardown_donated_memory(struct kvm_hyp_memcache *mc, void *addr, size_t size) int __pkvm_teardown_vm(pkvm_handle_t handle) { + size_t vm_size, last_ran_size; struct kvm_hyp_memcache *mc; struct pkvm_hyp_vm *hyp_vm; unsigned int idx; - size_t vm_size; int err; hyp_spin_lock(&vm_table_lock); @@ -638,6 +653,10 @@ int __pkvm_teardown_vm(pkvm_handle_t handle) teardown_donated_memory(mc, hyp_vcpu, sizeof(*hyp_vcpu)); } + last_ran_size = pkvm_get_last_ran_size(); + teardown_donated_memory(mc, hyp_vm->kvm.arch.mmu.last_vcpu_ran, + last_ran_size); + vm_size = pkvm_get_hyp_vm_size(hyp_vm->kvm.created_vcpus); teardown_donated_memory(mc, hyp_vm, vm_size); return 0; diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c index aea560873028..d97d17c96321 100644 --- a/arch/arm64/kvm/pkvm.c +++ b/arch/arm64/kvm/pkvm.c @@ -109,10 +109,10 @@ void __init kvm_hyp_reserve(void) */ static int __pkvm_create_hyp_vm(struct kvm *host_kvm) { - size_t pgd_sz, hyp_vm_sz, hyp_vcpu_sz; + size_t pgd_sz, hyp_vm_sz, hyp_vcpu_sz, last_ran_sz; struct kvm_vcpu *host_vcpu; pkvm_handle_t handle; - void *pgd, *hyp_vm; + void *pgd, *hyp_vm, *last_ran; unsigned long idx; int ret; @@ -140,10 +140,18 @@ static int __pkvm_create_hyp_vm(struct kvm *host_kvm) goto free_pgd; } - /* Donate the VM memory to hyp and let hyp initialize it. */ - ret = kvm_call_hyp_nvhe(__pkvm_init_vm, host_kvm, hyp_vm, pgd); - if (ret < 0) + /* Allocate memory to donate to hyp for tracking mmu->last_vcpu_ran. */ + last_ran_sz = PAGE_ALIGN(array_size(num_possible_cpus(), sizeof(int))); + last_ran = alloc_pages_exact(last_ran_sz, GFP_KERNEL_ACCOUNT); + if (!last_ran) { + ret = -ENOMEM; goto free_vm; + } + + /* Donate the VM memory to hyp and let hyp initialize it. 
*/ + ret = kvm_call_hyp_nvhe(__pkvm_init_vm, host_kvm, hyp_vm, pgd, last_ran); + if (ret < 0) + goto free_last_ran; handle = ret; @@ -179,6 +187,8 @@ static int __pkvm_create_hyp_vm(struct kvm *host_kvm) destroy_vm: pkvm_destroy_hyp_vm(host_kvm); return ret; +free_last_ran: + free_pages_exact(last_ran, last_ran_sz); free_vm: free_pages_exact(hyp_vm, hyp_vm_sz); free_pgd: From 2413304588f113e9effc2b9cc39e9bd0383b9c2d Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sat, 11 Jun 2022 11:49:37 +0100 Subject: [PATCH 056/457] ANDROID: KVM: arm64: Introduce per-EC entry/exit handlers Introduce per-EC entry/exit handlers at EL2 and provide initial implementations to manage the 'flags' and fault information registers. Signed-off-by: Marc Zyngier Signed-off-by: Will Deacon Bug: 233587962 Change-Id: I402a48c77602da969fc04c393d0624d3b2f837df Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/include/nvhe/pkvm.h | 3 + arch/arm64/kvm/hyp/nvhe/hyp-main.c | 85 ++++++++++++++++++++++++-- 2 files changed, 82 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h index 65e09a589eaa..3f254497e22b 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h +++ b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h @@ -26,6 +26,9 @@ struct pkvm_hyp_vcpu { * per-cpu pointer tracking us. Otherwise, NULL if not loaded. */ struct pkvm_hyp_vcpu **loaded_hyp_vcpu; + + /* Tracks exit code for the protected guest. */ + u32 exit_code; }; /* diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 76a96814e47b..776b0c7335c6 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -24,6 +24,43 @@ DEFINE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params); void __kvm_hyp_host_forward_smc(struct kvm_cpu_context *host_ctxt); +typedef void (*hyp_entry_exit_handler_fn)(struct pkvm_hyp_vcpu *); + +static void handle_vm_entry_generic(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + vcpu_copy_flag(&hyp_vcpu->vcpu, hyp_vcpu->host_vcpu, PC_UPDATE_REQ); +} + +static void handle_vm_exit_generic(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + WRITE_ONCE(hyp_vcpu->host_vcpu->arch.fault.esr_el2, + hyp_vcpu->vcpu.arch.fault.esr_el2); +} + +static void handle_vm_exit_abt(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; + + WRITE_ONCE(host_vcpu->arch.fault.esr_el2, + hyp_vcpu->vcpu.arch.fault.esr_el2); + WRITE_ONCE(host_vcpu->arch.fault.far_el2, + hyp_vcpu->vcpu.arch.fault.far_el2); + WRITE_ONCE(host_vcpu->arch.fault.hpfar_el2, + hyp_vcpu->vcpu.arch.fault.hpfar_el2); + WRITE_ONCE(host_vcpu->arch.fault.disr_el1, + hyp_vcpu->vcpu.arch.fault.disr_el1); +} + +static const hyp_entry_exit_handler_fn entry_hyp_vm_handlers[] = { + [0 ... ESR_ELx_EC_MAX] = handle_vm_entry_generic, +}; + +static const hyp_entry_exit_handler_fn exit_hyp_vm_handlers[] = { + [0 ... 
ESR_ELx_EC_MAX] = handle_vm_exit_generic, + [ESR_ELx_EC_IABT_LOW] = handle_vm_exit_abt, + [ESR_ELx_EC_DABT_LOW] = handle_vm_exit_abt, +}; + static void flush_hyp_vgic_state(struct pkvm_hyp_vcpu *hyp_vcpu) { struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; @@ -98,6 +135,8 @@ static void sync_hyp_timer_state(struct pkvm_hyp_vcpu *hyp_vcpu) static void flush_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) { struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; + hyp_entry_exit_handler_fn ec_handler; + u8 esr_ec; hyp_vcpu->vcpu.arch.ctxt = host_vcpu->arch.ctxt; @@ -108,7 +147,6 @@ static void flush_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) hyp_vcpu->vcpu.arch.mdcr_el2 = host_vcpu->arch.mdcr_el2; hyp_vcpu->vcpu.arch.cptr_el2 = host_vcpu->arch.cptr_el2; - hyp_vcpu->vcpu.arch.iflags = host_vcpu->arch.iflags; hyp_vcpu->vcpu.arch.fp_state = host_vcpu->arch.fp_state; hyp_vcpu->vcpu.arch.debug_ptr = kern_hyp_va(host_vcpu->arch.debug_ptr); @@ -118,24 +156,59 @@ static void flush_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) flush_hyp_vgic_state(hyp_vcpu); flush_hyp_timer_state(hyp_vcpu); + + switch (ARM_EXCEPTION_CODE(hyp_vcpu->exit_code)) { + case ARM_EXCEPTION_IRQ: + case ARM_EXCEPTION_EL1_SERROR: + case ARM_EXCEPTION_IL: + break; + case ARM_EXCEPTION_TRAP: + esr_ec = ESR_ELx_EC(kvm_vcpu_get_esr(&hyp_vcpu->vcpu)); + ec_handler = entry_hyp_vm_handlers[esr_ec]; + if (ec_handler) + ec_handler(hyp_vcpu); + break; + default: + BUG(); + } + + hyp_vcpu->exit_code = 0; } -static void sync_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) +static void sync_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu, u32 exit_reason) { struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; + hyp_entry_exit_handler_fn ec_handler; + u8 esr_ec; host_vcpu->arch.ctxt = hyp_vcpu->vcpu.arch.ctxt; host_vcpu->arch.hcr_el2 = hyp_vcpu->vcpu.arch.hcr_el2; host_vcpu->arch.cptr_el2 = hyp_vcpu->vcpu.arch.cptr_el2; - host_vcpu->arch.fault = hyp_vcpu->vcpu.arch.fault; - - host_vcpu->arch.iflags = hyp_vcpu->vcpu.arch.iflags; host_vcpu->arch.fp_state = hyp_vcpu->vcpu.arch.fp_state; sync_hyp_vgic_state(hyp_vcpu); sync_hyp_timer_state(hyp_vcpu); + + switch (ARM_EXCEPTION_CODE(exit_reason)) { + case ARM_EXCEPTION_IRQ: + break; + case ARM_EXCEPTION_TRAP: + esr_ec = ESR_ELx_EC(kvm_vcpu_get_esr(&hyp_vcpu->vcpu)); + ec_handler = exit_hyp_vm_handlers[esr_ec]; + if (ec_handler) + ec_handler(hyp_vcpu); + break; + case ARM_EXCEPTION_EL1_SERROR: + case ARM_EXCEPTION_IL: + break; + default: + BUG(); + } + + vcpu_clear_flag(host_vcpu, PC_UPDATE_REQ); + hyp_vcpu->exit_code = exit_reason; } static void handle___pkvm_vcpu_load(struct kvm_cpu_context *host_ctxt) @@ -226,7 +299,7 @@ static void handle___kvm_vcpu_run(struct kvm_cpu_context *host_ctxt) ret = __kvm_vcpu_run(&hyp_vcpu->vcpu); - sync_hyp_vcpu(hyp_vcpu); + sync_hyp_vcpu(hyp_vcpu, ret); } else { /* The host is fully trusted, run its vCPU directly. */ ret = __kvm_vcpu_run(host_vcpu); From 31baf60fe4843e37de8b052b45589f4d8df29b51 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 25 Apr 2022 14:48:46 +0100 Subject: [PATCH 057/457] ANDROID: KVM: arm64: Introduce lazy-ish state sync for non-protected VMs Rather than blindly copying the register state between the hyp and host vCPU structures, abstract this code into some helpers which are called only for non-protected VMs running under pKVM. To faciliate host access to guest registers within a get/put sequence, introduce a new 'sync_state' hypercall to provide access to the registers of a non-protected VM when handling traps. 
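A rough, stand-alone model of the dirty-flag protocol described above (the structures and names are illustrative stand-ins, not the kernel's): the hyp copy is authoritative while the vCPU runs, the host copy is only refreshed on an explicit sync, and the flag records when the host copy must be copied back into hyp before the next run.

#include <stdbool.h>

struct regs { unsigned long x[31]; };

static struct regs host_copy, hyp_copy;
static bool host_dirty = true;		/* host copy starts authoritative */

static void run_vcpu(void)		/* models the flush-then-run path */
{
	if (host_dirty) {
		hyp_copy = host_copy;	/* flush: host -> hyp */
		host_dirty = false;
	}
	/* ... guest runs; only hyp_copy is updated at EL2 ... */
}

static void sync_state(void)		/* models the sync_state hypercall */
{
	host_copy = hyp_copy;		/* sync: hyp -> host, on demand */
	host_dirty = true;		/* host may now modify its copy */
}

In the series below, the same idea is split across flush_hyp_vcpu(), handle___pkvm_vcpu_put() and the trap-handling path on the host side.
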
Signed-off-by: Marc Zyngier Signed-off-by: Will Deacon Bug: 233587962 Change-Id: I5b0d874d2d2184c4da95a91c0b9b57af500cbce3 Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_asm.h | 1 + arch/arm64/include/asm/kvm_host.h | 2 + arch/arm64/kvm/arm.c | 7 ++++ arch/arm64/kvm/handle_exit.c | 22 ++++++++++ arch/arm64/kvm/hyp/nvhe/hyp-main.c | 67 +++++++++++++++++++++++++++++- 5 files changed, 97 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 7939291f19a0..c70f113905c4 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -81,6 +81,7 @@ enum __kvm_host_smccc_func { __KVM_HOST_SMCCC_FUNC___pkvm_teardown_vm, __KVM_HOST_SMCCC_FUNC___pkvm_vcpu_load, __KVM_HOST_SMCCC_FUNC___pkvm_vcpu_put, + __KVM_HOST_SMCCC_FUNC___pkvm_vcpu_sync_state, }; #define DECLARE_KVM_VHE_SYM(sym) extern char sym[] diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 4d414471142a..bfcc36c22345 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -624,6 +624,8 @@ struct kvm_vcpu_arch { #define DEBUG_STATE_SAVE_SPE __vcpu_single_flag(iflags, BIT(5)) /* Save TRBE context if active */ #define DEBUG_STATE_SAVE_TRBE __vcpu_single_flag(iflags, BIT(6)) +/* pKVM host vcpu state is dirty, needs resync */ +#define PKVM_HOST_STATE_DIRTY __vcpu_single_flag(iflags, BIT(7)) /* SVE enabled for host EL0 */ #define HOST_SVE_ENABLED __vcpu_single_flag(sflags, BIT(0)) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 4faa5dae08bd..6b5548f2a96b 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -455,6 +455,10 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) kvm_call_hyp(__vgic_v3_save_vmcr_aprs, &vcpu->arch.vgic_cpu.vgic_v3); kvm_call_hyp_nvhe(__pkvm_vcpu_put); + + /* __pkvm_vcpu_put implies a sync of the state */ + if (!kvm_vm_is_protected(vcpu->kvm)) + vcpu_set_flag(vcpu, PKVM_HOST_STATE_DIRTY); } kvm_arch_vcpu_put_debug_state_flags(vcpu); @@ -599,6 +603,9 @@ int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu) return ret; if (is_protected_kvm_enabled()) { + /* Start with the vcpu in a dirty state */ + if (!kvm_vm_is_protected(vcpu->kvm)) + vcpu_set_flag(vcpu, PKVM_HOST_STATE_DIRTY); ret = pkvm_create_hyp_vm(kvm); if (ret) return ret; diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index e778eefcf214..05b3ec8cf7e8 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -240,6 +240,21 @@ static int handle_trap_exceptions(struct kvm_vcpu *vcpu) { int handled; + /* + * If we run a non-protected VM when protection is enabled + * system-wide, resync the state from the hypervisor and mark + * it as dirty on the host side if it wasn't dirty already + * (which could happen if preemption has taken place). + */ + if (is_protected_kvm_enabled() && !kvm_vm_is_protected(vcpu->kvm)) { + preempt_disable(); + if (!(vcpu_get_flag(vcpu, PKVM_HOST_STATE_DIRTY))) { + kvm_call_hyp_nvhe(__pkvm_vcpu_sync_state); + vcpu_set_flag(vcpu, PKVM_HOST_STATE_DIRTY); + } + preempt_enable(); + } + /* * See ARM ARM B1.14.1: "Hyp traps on instructions * that fail their condition code check" @@ -307,6 +322,13 @@ int handle_exit(struct kvm_vcpu *vcpu, int exception_index) /* For exit types that need handling before we can be preempted */ void handle_exit_early(struct kvm_vcpu *vcpu, int exception_index) { + /* + * We just exited, so the state is clean from a hypervisor + * perspective. 
+ */ + if (is_protected_kvm_enabled()) + vcpu_clear_flag(vcpu, PKVM_HOST_STATE_DIRTY); + if (ARM_SERROR_PENDING(exception_index)) { if (this_cpu_has_cap(ARM64_HAS_RAS_EXTN)) { u64 disr = kvm_vcpu_get_disr(vcpu); diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 776b0c7335c6..71b92996f177 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -132,13 +132,53 @@ static void sync_hyp_timer_state(struct pkvm_hyp_vcpu *hyp_vcpu) read_sysreg_el0(SYS_CNTV_CTL); } +static void __copy_vcpu_state(const struct kvm_vcpu *from_vcpu, + struct kvm_vcpu *to_vcpu) +{ + int i; + + to_vcpu->arch.ctxt.regs = from_vcpu->arch.ctxt.regs; + to_vcpu->arch.ctxt.spsr_abt = from_vcpu->arch.ctxt.spsr_abt; + to_vcpu->arch.ctxt.spsr_und = from_vcpu->arch.ctxt.spsr_und; + to_vcpu->arch.ctxt.spsr_irq = from_vcpu->arch.ctxt.spsr_irq; + to_vcpu->arch.ctxt.spsr_fiq = from_vcpu->arch.ctxt.spsr_fiq; + + /* + * Copy the sysregs, but don't mess with the timer state which + * is directly handled by EL1 and is expected to be preserved. + */ + for (i = 1; i < NR_SYS_REGS; i++) { + if (i >= CNTVOFF_EL2 && i <= CNTP_CTL_EL0) + continue; + to_vcpu->arch.ctxt.sys_regs[i] = from_vcpu->arch.ctxt.sys_regs[i]; + } +} + +static void __sync_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + __copy_vcpu_state(&hyp_vcpu->vcpu, hyp_vcpu->host_vcpu); +} + +static void __flush_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + __copy_vcpu_state(hyp_vcpu->host_vcpu, &hyp_vcpu->vcpu); +} + static void flush_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) { struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; hyp_entry_exit_handler_fn ec_handler; u8 esr_ec; - hyp_vcpu->vcpu.arch.ctxt = host_vcpu->arch.ctxt; + /* + * If we deal with a non-protected guest and the state is potentially + * dirty (from a host perspective), copy the state back into the hyp + * vcpu. + */ + if (!pkvm_hyp_vcpu_is_protected(hyp_vcpu) && + vcpu_get_flag(host_vcpu, PKVM_HOST_STATE_DIRTY)) { + __flush_hyp_vcpu(hyp_vcpu); + } hyp_vcpu->vcpu.arch.sve_state = kern_hyp_va(host_vcpu->arch.sve_state); hyp_vcpu->vcpu.arch.sve_max_vl = host_vcpu->arch.sve_max_vl; @@ -253,8 +293,30 @@ static void handle___pkvm_vcpu_put(struct kvm_cpu_context *host_ctxt) return; hyp_vcpu = pkvm_get_loaded_hyp_vcpu(); - if (hyp_vcpu) + if (hyp_vcpu) { + struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; + + if (!pkvm_hyp_vcpu_is_protected(hyp_vcpu) && + !vcpu_get_flag(host_vcpu, PKVM_HOST_STATE_DIRTY)) { + __sync_hyp_vcpu(hyp_vcpu); + } + pkvm_put_hyp_vcpu(hyp_vcpu); + } +} + +static void handle___pkvm_vcpu_sync_state(struct kvm_cpu_context *host_ctxt) +{ + struct pkvm_hyp_vcpu *hyp_vcpu; + + if (!is_protected_kvm_enabled()) + return; + + hyp_vcpu = pkvm_get_loaded_hyp_vcpu(); + if (!hyp_vcpu || pkvm_hyp_vcpu_is_protected(hyp_vcpu)) + return; + + __sync_hyp_vcpu(hyp_vcpu); } static struct kvm_vcpu *__get_host_hyp_vcpus(struct kvm_vcpu *arg, @@ -578,6 +640,7 @@ static const hcall_t host_hcall[] = { HANDLE_FUNC(__pkvm_teardown_vm), HANDLE_FUNC(__pkvm_vcpu_load), HANDLE_FUNC(__pkvm_vcpu_put), + HANDLE_FUNC(__pkvm_vcpu_sync_state), }; static void handle_host_hcall(struct kvm_cpu_context *host_ctxt) From 0fd7165049ba42ee7a77e1da4da8c7d5780281b0 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Mon, 20 Jun 2022 13:13:12 +0000 Subject: [PATCH 058/457] ANDROID: KVM: arm64: Reintroduce __sve_save_state Now that the hypervisor is handling the guest state in protected mode, it needs to be able to save the guest state. 
This reverts commit e66425fc9ba3 ("KVM: arm64: Remove unused __sve_save_state"). Suggested-by: Marc Zyngier Signed-off-by: Fuad Tabba Signed-off-by: Will Deacon Bug: 233587962 Change-Id: Iada80e9355082e5576d016221fabc7d30ffde46b Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_hyp.h | 1 + arch/arm64/kvm/hyp/fpsimd.S | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index e38869e88019..1b597b7db99b 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -93,6 +93,7 @@ void __debug_restore_host_buffers_nvhe(struct kvm_vcpu *vcpu); void __fpsimd_save_state(struct user_fpsimd_state *fp_regs); void __fpsimd_restore_state(struct user_fpsimd_state *fp_regs); +void __sve_save_state(void *sve_pffr, u32 *fpsr); void __sve_restore_state(void *sve_pffr, u32 *fpsr); #ifndef __KVM_NVHE_HYPERVISOR__ diff --git a/arch/arm64/kvm/hyp/fpsimd.S b/arch/arm64/kvm/hyp/fpsimd.S index 61e6f3ba7b7d..e950875e31ce 100644 --- a/arch/arm64/kvm/hyp/fpsimd.S +++ b/arch/arm64/kvm/hyp/fpsimd.S @@ -25,3 +25,9 @@ SYM_FUNC_START(__sve_restore_state) sve_load 0, x1, x2, 3 ret SYM_FUNC_END(__sve_restore_state) + +SYM_FUNC_START(__sve_save_state) + mov x2, #1 + sve_save 0, x1, x2, 3 + ret +SYM_FUNC_END(__sve_save_state) From f753f65443721afa49897b65f3217f06fcc23c5d Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 26 Apr 2022 09:22:50 +0000 Subject: [PATCH 059/457] ANDROID: KVM: arm64: Lazy host FP save/restore Implement lazy save/restore of the host FPSIMD register state at EL2. This allows us to save/restore guest FPSIMD registers without involving the host and means that we can avoid having to repopulate the hyp vCPU register state on every flush. Signed-off-by: Marc Zyngier Signed-off-by: Will Deacon Bug: 233587962 Change-Id: I7e9827d7bf52656df69ece1844fc1b8bd7884175 Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/nvhe/hyp-main.c | 76 ++++++++++++++++++++++++++---- 1 file changed, 66 insertions(+), 10 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 71b92996f177..cfff5a19af19 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -20,6 +20,14 @@ #include +/* + * Host FPSIMD state. Written to when the guest accesses its own FPSIMD state, + * and read when the guest state is live and we need to switch back to the host. + * + * Only valid when (fp_state == FP_STATE_GUEST_OWNED) in the hyp vCPU structure. 
+ */ +static DEFINE_PER_CPU(struct user_fpsimd_state, loaded_host_fpsimd_state); + DEFINE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params); void __kvm_hyp_host_forward_smc(struct kvm_cpu_context *host_ctxt); @@ -185,12 +193,8 @@ static void flush_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) hyp_vcpu->vcpu.arch.hcr_el2 = host_vcpu->arch.hcr_el2; hyp_vcpu->vcpu.arch.mdcr_el2 = host_vcpu->arch.mdcr_el2; - hyp_vcpu->vcpu.arch.cptr_el2 = host_vcpu->arch.cptr_el2; - - hyp_vcpu->vcpu.arch.fp_state = host_vcpu->arch.fp_state; hyp_vcpu->vcpu.arch.debug_ptr = kern_hyp_va(host_vcpu->arch.debug_ptr); - hyp_vcpu->vcpu.arch.host_fpsimd_state = host_vcpu->arch.host_fpsimd_state; hyp_vcpu->vcpu.arch.vsesr_el2 = host_vcpu->arch.vsesr_el2; @@ -224,9 +228,6 @@ static void sync_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu, u32 exit_reason) host_vcpu->arch.ctxt = hyp_vcpu->vcpu.arch.ctxt; host_vcpu->arch.hcr_el2 = hyp_vcpu->vcpu.arch.hcr_el2; - host_vcpu->arch.cptr_el2 = hyp_vcpu->vcpu.arch.cptr_el2; - - host_vcpu->arch.fp_state = hyp_vcpu->vcpu.arch.fp_state; sync_hyp_vgic_state(hyp_vcpu); sync_hyp_timer_state(hyp_vcpu); @@ -251,6 +252,40 @@ static void sync_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu, u32 exit_reason) hyp_vcpu->exit_code = exit_reason; } +static void __hyp_sve_save_guest(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct kvm_vcpu *vcpu = &hyp_vcpu->vcpu; + + __sve_save_state(vcpu_sve_pffr(vcpu), &vcpu->arch.ctxt.fp_regs.fpsr); + __vcpu_sys_reg(vcpu, ZCR_EL1) = read_sysreg_el1(SYS_ZCR); + sve_cond_update_zcr_vq(vcpu_sve_max_vq(vcpu) - 1, SYS_ZCR_EL1); +} + +static void fpsimd_host_restore(void) +{ + sysreg_clear_set(cptr_el2, CPTR_EL2_TZ | CPTR_EL2_TFP, 0); + isb(); + + if (unlikely(is_protected_kvm_enabled())) { + struct pkvm_hyp_vcpu *hyp_vcpu = pkvm_get_loaded_hyp_vcpu(); + struct user_fpsimd_state *host_fpsimd_state; + + host_fpsimd_state = this_cpu_ptr(&loaded_host_fpsimd_state); + + if (vcpu_has_sve(&hyp_vcpu->vcpu)) + __hyp_sve_save_guest(hyp_vcpu); + else + __fpsimd_save_state(&hyp_vcpu->vcpu.arch.ctxt.fp_regs); + + __fpsimd_restore_state(host_fpsimd_state); + + hyp_vcpu->vcpu.arch.fp_state = FP_STATE_HOST_OWNED; + } + + if (system_supports_sve()) + sve_cond_update_zcr_vq(ZCR_ELx_LEN_MASK, SYS_ZCR_EL2); +} + static void handle___pkvm_vcpu_load(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1); @@ -277,6 +312,9 @@ static void handle___pkvm_vcpu_load(struct kvm_cpu_context *host_ctxt) *last_ran = hyp_vcpu->vcpu.vcpu_id; } + hyp_vcpu->vcpu.arch.host_fpsimd_state = this_cpu_ptr(&loaded_host_fpsimd_state); + hyp_vcpu->vcpu.arch.fp_state = FP_STATE_HOST_OWNED; + if (pkvm_hyp_vcpu_is_protected(hyp_vcpu)) { /* Propagate WFx trapping flags, trap ptrauth */ hyp_vcpu->vcpu.arch.hcr_el2 &= ~(HCR_TWE | HCR_TWI | @@ -296,6 +334,9 @@ static void handle___pkvm_vcpu_put(struct kvm_cpu_context *host_ctxt) if (hyp_vcpu) { struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; + if (hyp_vcpu->vcpu.arch.fp_state == FP_STATE_GUEST_OWNED) + fpsimd_host_restore(); + if (!pkvm_hyp_vcpu_is_protected(hyp_vcpu) && !vcpu_get_flag(host_vcpu, PKVM_HOST_STATE_DIRTY)) { __sync_hyp_vcpu(hyp_vcpu); @@ -316,6 +357,9 @@ static void handle___pkvm_vcpu_sync_state(struct kvm_cpu_context *host_ctxt) if (!hyp_vcpu || pkvm_hyp_vcpu_is_protected(hyp_vcpu)) return; + if (hyp_vcpu->vcpu.arch.fp_state == FP_STATE_GUEST_OWNED) + fpsimd_host_restore(); + __sync_hyp_vcpu(hyp_vcpu); } @@ -362,6 +406,19 @@ static void handle___kvm_vcpu_run(struct kvm_cpu_context *host_ctxt) ret = __kvm_vcpu_run(&hyp_vcpu->vcpu); 
sync_hyp_vcpu(hyp_vcpu, ret); + + if (hyp_vcpu->vcpu.arch.fp_state == FP_STATE_GUEST_OWNED) { + /* + * The guest has used the FP, trap all accesses + * from the host (both FP and SVE). + */ + u64 reg = CPTR_EL2_TFP; + + if (system_supports_sve()) + reg |= CPTR_EL2_TZ; + + sysreg_clear_set(cptr_el2, 0, reg); + } } else { /* The host is fully trusted, run its vCPU directly. */ ret = __kvm_vcpu_run(host_vcpu); @@ -706,10 +763,9 @@ void handle_trap(struct kvm_cpu_context *host_ctxt) case ESR_ELx_EC_SMC64: handle_host_smc(host_ctxt); break; + case ESR_ELx_EC_FP_ASIMD: case ESR_ELx_EC_SVE: - sysreg_clear_set(cptr_el2, CPTR_EL2_TZ, 0); - isb(); - sve_cond_update_zcr_vq(ZCR_ELx_LEN_MASK, SYS_ZCR_EL2); + fpsimd_host_restore(); break; case ESR_ELx_EC_IABT_LOW: case ESR_ELx_EC_DABT_LOW: From 8aa656b4379a1f28c5b3ac1c6575b58d6a22fef3 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 27 Apr 2022 10:23:22 +0000 Subject: [PATCH 060/457] ANDROID: KVM: arm64: Reduce host/hyp vcpu state copying When running with pKVM enabled, protected guests run with a fixed CPU configuration and therefore features such as hardware debug and SVE are unavailable and their state does not need to be copied from the host structures on each flush operation. Although non-protected guests do require the host and hyp structures to be kept in-sync with each other, we can defer writing back to the host to an explicit sync hypercall, rather than doing it after every vCPU run. Signed-off-by: Marc Zyngier Signed-off-by: Will Deacon Bug: 233587962 Change-Id: Ia80ae7bf8e374a50fda4ed5637abdfb82bcf3715 Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/nvhe/hyp-main.c | 33 +++++++++++++++--------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index cfff5a19af19..063eeff1dac6 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -183,20 +183,21 @@ static void flush_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) * dirty (from a host perspective), copy the state back into the hyp * vcpu. 
*/ - if (!pkvm_hyp_vcpu_is_protected(hyp_vcpu) && - vcpu_get_flag(host_vcpu, PKVM_HOST_STATE_DIRTY)) { - __flush_hyp_vcpu(hyp_vcpu); + if (!pkvm_hyp_vcpu_is_protected(hyp_vcpu)) { + if (vcpu_get_flag(host_vcpu, PKVM_HOST_STATE_DIRTY)) + __flush_hyp_vcpu(hyp_vcpu); + + hyp_vcpu->vcpu.arch.sve_state = kern_hyp_va(host_vcpu->arch.sve_state); + hyp_vcpu->vcpu.arch.sve_max_vl = host_vcpu->arch.sve_max_vl; + + hyp_vcpu->vcpu.arch.hcr_el2 = HCR_GUEST_FLAGS & ~(HCR_RW | HCR_TWI | HCR_TWE); + hyp_vcpu->vcpu.arch.hcr_el2 |= READ_ONCE(host_vcpu->arch.hcr_el2); + + hyp_vcpu->vcpu.arch.mdcr_el2 = host_vcpu->arch.mdcr_el2; + hyp_vcpu->vcpu.arch.debug_ptr = kern_hyp_va(host_vcpu->arch.debug_ptr); } - hyp_vcpu->vcpu.arch.sve_state = kern_hyp_va(host_vcpu->arch.sve_state); - hyp_vcpu->vcpu.arch.sve_max_vl = host_vcpu->arch.sve_max_vl; - - hyp_vcpu->vcpu.arch.hcr_el2 = host_vcpu->arch.hcr_el2; - hyp_vcpu->vcpu.arch.mdcr_el2 = host_vcpu->arch.mdcr_el2; - - hyp_vcpu->vcpu.arch.debug_ptr = kern_hyp_va(host_vcpu->arch.debug_ptr); - - hyp_vcpu->vcpu.arch.vsesr_el2 = host_vcpu->arch.vsesr_el2; + hyp_vcpu->vcpu.arch.vsesr_el2 = host_vcpu->arch.vsesr_el2; flush_hyp_vgic_state(hyp_vcpu); flush_hyp_timer_state(hyp_vcpu); @@ -225,10 +226,10 @@ static void sync_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu, u32 exit_reason) hyp_entry_exit_handler_fn ec_handler; u8 esr_ec; - host_vcpu->arch.ctxt = hyp_vcpu->vcpu.arch.ctxt; - - host_vcpu->arch.hcr_el2 = hyp_vcpu->vcpu.arch.hcr_el2; - + /* + * Don't sync the vcpu GPR/sysreg state after a run. Instead, + * leave it in the hyp vCPU until someone actually requires it. + */ sync_hyp_vgic_state(hyp_vcpu); sync_hyp_timer_state(hyp_vcpu); From 9035f1ad8e0da8d8fdd36d2eee880e74a4aa6d5d Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Tue, 26 Apr 2022 13:20:15 +0000 Subject: [PATCH 061/457] ANDROID: KVM: arm64: Check directly whether the vcpu is protected Simpler code and ensures we're always looking at hyp state. Signed-off-by: Fuad Tabba Signed-off-by: Will Deacon Bug: 233587962 Change-Id: Ib1de39e6d6600cc8b47857e293a7db716ca2d1bf Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/nvhe/switch.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index 9344688a9282..6bd7511b5e72 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -201,7 +201,7 @@ static const exit_handler_fn pvm_exit_handlers[] = { static const exit_handler_fn *kvm_get_exit_handler_array(struct kvm_vcpu *vcpu) { - if (unlikely(kvm_vm_is_protected(kern_hyp_va(vcpu->kvm)))) + if (unlikely(vcpu_is_protected(vcpu))) return pvm_exit_handlers; return hyp_exit_handlers; @@ -220,9 +220,7 @@ static const exit_handler_fn *kvm_get_exit_handler_array(struct kvm_vcpu *vcpu) */ static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code) { - struct kvm *kvm = kern_hyp_va(vcpu->kvm); - - if (kvm_vm_is_protected(kvm) && vcpu_mode_is_32bit(vcpu)) { + if (unlikely(vcpu_is_protected(vcpu) && vcpu_mode_is_32bit(vcpu))) { /* * As we have caught the guest red-handed, decide that it isn't * fit for purpose anymore by making the vcpu invalid. The VMM From 7507a1801efe5c05c195d29778f5b8839f21054b Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Mon, 25 Apr 2022 13:42:59 +0000 Subject: [PATCH 062/457] ANDROID: KVM: arm64: Trap debug break and watch from guest Debug and trace are not currently supported for protected guests, so trap accesses to the related registers and emulate them as RAZ/WI. 
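"RAZ/WI" here means the register reads back as zero and writes are silently discarded, so a protected guest can probe the debug registers without faulting while nothing leaks either way. A simplified sketch of such an access handler, with an illustrative name and a trimmed-down signature (the real change only adds RAZ_WI() descriptors to the table below):

    /*
     * Illustrative RAZ/WI handler: reads of the trapped register return
     * zero, writes are ignored, and the trap is reported as handled so
     * the guest simply continues.
     */
    static bool raz_wi_access(struct kvm_vcpu *vcpu, struct sys_reg_params *p)
    {
            if (!p->is_write)
                    p->regval = 0;  /* Read-As-Zero */

            /* Write-Ignore: nothing to update */
            return true;
    }
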
Signed-off-by: Fuad Tabba Signed-off-by: Will Deacon Bug: 233587962 Change-Id: I910be963754c7d98e4f1270d05427e65d4c1b253 Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/nvhe/pkvm.c | 2 +- arch/arm64/kvm/hyp/nvhe/sys_regs.c | 11 +++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 12d1f1a1f48c..5f86d034877c 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -108,7 +108,7 @@ static void pvm_init_traps_aa64dfr0(struct kvm_vcpu *vcpu) /* Trap Debug */ if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_DebugVer), feature_ids)) - mdcr_set |= MDCR_EL2_TDRA | MDCR_EL2_TDA | MDCR_EL2_TDE; + mdcr_set |= MDCR_EL2_TDRA | MDCR_EL2_TDA; /* Trap OS Double Lock */ if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_DoubleLock), feature_ids)) diff --git a/arch/arm64/kvm/hyp/nvhe/sys_regs.c b/arch/arm64/kvm/hyp/nvhe/sys_regs.c index e6270150b1cb..0ca44f587144 100644 --- a/arch/arm64/kvm/hyp/nvhe/sys_regs.c +++ b/arch/arm64/kvm/hyp/nvhe/sys_regs.c @@ -351,6 +351,17 @@ static const struct sys_reg_desc pvm_sys_reg_descs[] = { /* Cache maintenance by set/way operations are restricted. */ /* Debug and Trace Registers are restricted. */ + RAZ_WI(SYS_DBGBVRn_EL1(0)), + RAZ_WI(SYS_DBGBCRn_EL1(0)), + RAZ_WI(SYS_DBGWVRn_EL1(0)), + RAZ_WI(SYS_DBGWCRn_EL1(0)), + RAZ_WI(SYS_MDSCR_EL1), + RAZ_WI(SYS_OSLAR_EL1), + RAZ_WI(SYS_OSLSR_EL1), + RAZ_WI(SYS_OSDLR_EL1), + + /* Group 1 ID registers */ + RAZ_WI(SYS_REVIDR_EL1), /* AArch64 mappings of the AArch32 ID registers */ /* CRm=1 */ From 0d5d52ddb0ebf598fde363fa4fdad436c075720d Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Mon, 25 Apr 2022 13:33:24 +0000 Subject: [PATCH 063/457] ANDROID: KVM: arm64: Restrict protected VM capabilities Restrict protected VM capabilities based on the fixed-configuration for protected VMs. No functional change intended in current KVM-supported modes (nVHE, VHE). Signed-off-by: Fuad Tabba Signed-off-by: Will Deacon Bug: 233587962 Change-Id: I1df873d512754207decf9eedb50135ee2ae76b29 Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_pkvm.h | 27 ++++++++++++ arch/arm64/kvm/arm.c | 69 ++++++++++++++++++++++++++++++- 2 files changed, 95 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/kvm_pkvm.h b/arch/arm64/include/asm/kvm_pkvm.h index 017b58a61a78..0d0a0d151bd9 100644 --- a/arch/arm64/include/asm/kvm_pkvm.h +++ b/arch/arm64/include/asm/kvm_pkvm.h @@ -208,6 +208,33 @@ void pkvm_destroy_hyp_vm(struct kvm *kvm); ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_APA3) \ ) +/* + * Returns the maximum number of breakpoints supported for protected VMs. + */ +static inline int pkvm_get_max_brps(void) +{ + int num = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_BRPs), + PVM_ID_AA64DFR0_ALLOW); + + /* + * If breakpoints are supported, the maximum number is 1 + the field. + * Otherwise, return 0, which is not compliant with the architecture, + * but is reserved and is used here to indicate no debug support. + */ + return num ? num + 1 : 0; +} + +/* + * Returns the maximum number of watchpoints supported for protected VMs. + */ +static inline int pkvm_get_max_wrps(void) +{ + int num = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_WRPs), + PVM_ID_AA64DFR0_ALLOW); + + return num ? 
num + 1 : 0; +} + extern struct memblock_region kvm_nvhe_sym(hyp_memory)[]; extern unsigned int kvm_nvhe_sym(hyp_memblock_nr); diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 6b5548f2a96b..1580a7998f13 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -199,9 +199,10 @@ void kvm_arch_destroy_vm(struct kvm *kvm) kvm_unshare_hyp(kvm, kvm + 1); } -int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) +static int kvm_check_extension(struct kvm *kvm, long ext) { int r; + switch (ext) { case KVM_CAP_IRQCHIP: r = vgic_present; @@ -300,6 +301,72 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) return r; } +/* + * Checks whether the extension specified in ext is supported in protected + * mode for the specified vm. + * The capabilities supported by kvm in general are passed in kvm_cap. + */ +static int pkvm_check_extension(struct kvm *kvm, long ext, int kvm_cap) +{ + int r; + + switch (ext) { + case KVM_CAP_IRQCHIP: + case KVM_CAP_ARM_PSCI: + case KVM_CAP_ARM_PSCI_0_2: + case KVM_CAP_NR_VCPUS: + case KVM_CAP_MAX_VCPUS: + case KVM_CAP_MAX_VCPU_ID: + case KVM_CAP_MSI_DEVID: + case KVM_CAP_ARM_VM_IPA_SIZE: + r = kvm_cap; + break; + case KVM_CAP_GUEST_DEBUG_HW_BPS: + r = min(kvm_cap, pkvm_get_max_brps()); + break; + case KVM_CAP_GUEST_DEBUG_HW_WPS: + r = min(kvm_cap, pkvm_get_max_wrps()); + break; + case KVM_CAP_ARM_PMU_V3: + r = kvm_cap && FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer), + PVM_ID_AA64DFR0_ALLOW); + break; + case KVM_CAP_ARM_SVE: + r = kvm_cap && FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_SVE), + PVM_ID_AA64PFR0_RESTRICT_UNSIGNED); + break; + case KVM_CAP_ARM_PTRAUTH_ADDRESS: + r = kvm_cap && + FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_API), + PVM_ID_AA64ISAR1_ALLOW) && + FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_APA), + PVM_ID_AA64ISAR1_ALLOW); + break; + case KVM_CAP_ARM_PTRAUTH_GENERIC: + r = kvm_cap && + FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPI), + PVM_ID_AA64ISAR1_ALLOW) && + FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPA), + PVM_ID_AA64ISAR1_ALLOW); + break; + default: + r = 0; + break; + } + + return r; +} + +int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) +{ + int r = kvm_check_extension(kvm, ext); + + if (kvm && kvm_vm_is_protected(kvm)) + r = pkvm_check_extension(kvm, ext, r); + + return r; +} + long kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { From 1789ef05b5de3e380949ae81ade7496c7e1c47eb Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Mon, 9 May 2022 10:31:44 +0000 Subject: [PATCH 064/457] ANDROID: KVM: arm64: Do not support MTE for protected VMs Return an error (-EINVAL) if trying to enable MTE on a protected vm. 
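From the VMM's point of view the new check simply makes KVM_ENABLE_CAP fail. A minimal userspace sketch of the expected behaviour (vm_fd is assumed to refer to a VM created in protected mode; error handling is trimmed):

    #include <errno.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* Returns 0 on success; -EINVAL is the expected result for a pVM. */
    static int try_enable_mte(int vm_fd)
    {
            struct kvm_enable_cap cap = { .cap = KVM_CAP_ARM_MTE };

            if (ioctl(vm_fd, KVM_ENABLE_CAP, &cap) < 0)
                    return -errno;

            return 0;
    }
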
Signed-off-by: Fuad Tabba Signed-off-by: Will Deacon Bug: 233587962 Change-Id: I036282854169a341253869d67a3e55e6cec8f040 Signed-off-by: Quentin Perret --- arch/arm64/kvm/arm.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 1580a7998f13..efc15a26ada5 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -90,7 +90,9 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, break; case KVM_CAP_ARM_MTE: mutex_lock(&kvm->lock); - if (!system_supports_mte() || kvm->created_vcpus) { + if (!system_supports_mte() || + kvm_vm_is_protected(kvm) || + kvm->created_vcpus) { r = -EINVAL; } else { r = 0; From f3a13716921fcfca0506cc40a32b2352c8ba557b Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Mon, 25 Apr 2022 10:31:08 +0000 Subject: [PATCH 065/457] ANDROID: KVM: arm64: Refactor reset_mpidr to extract its computation Move the computation of the mpidr to its own function in a shared header, as the computation will be used by hyp in protected mode. No functional change intended. Signed-off-by: Fuad Tabba Signed-off-by: Will Deacon Bug: 233587962 Change-Id: I48c36ebb430c3322a6991eeb391d617903525304 Signed-off-by: Quentin Perret --- arch/arm64/kvm/sys_regs.c | 14 +------------- arch/arm64/kvm/sys_regs.h | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 8dea7cd2c281..daad344eb446 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -558,19 +558,7 @@ static void reset_actlr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) static void reset_mpidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { - u64 mpidr; - - /* - * Map the vcpu_id into the first three affinity level fields of - * the MPIDR. We limit the number of VCPUs in level 0 due to a - * limitation to 16 CPUs in that level in the ICC_SGIxR registers - * of the GICv3 to be able to address each CPU directly when - * sending IPIs. - */ - mpidr = (vcpu->vcpu_id & 0x0f) << MPIDR_LEVEL_SHIFT(0); - mpidr |= ((vcpu->vcpu_id >> 4) & 0xff) << MPIDR_LEVEL_SHIFT(1); - mpidr |= ((vcpu->vcpu_id >> 12) & 0xff) << MPIDR_LEVEL_SHIFT(2); - vcpu_write_sys_reg(vcpu, (1ULL << 31) | mpidr, MPIDR_EL1); + vcpu_write_sys_reg(vcpu, calculate_mpidr(vcpu), MPIDR_EL1); } static unsigned int pmu_visibility(const struct kvm_vcpu *vcpu, diff --git a/arch/arm64/kvm/sys_regs.h b/arch/arm64/kvm/sys_regs.h index e4ebb3a379fd..c10239980ce4 100644 --- a/arch/arm64/kvm/sys_regs.h +++ b/arch/arm64/kvm/sys_regs.h @@ -200,6 +200,25 @@ find_reg(const struct sys_reg_params *params, const struct sys_reg_desc table[], return __inline_bsearch((void *)pval, table, num, sizeof(table[0]), match_sys_reg); } +static inline u64 calculate_mpidr(const struct kvm_vcpu *vcpu) +{ + u64 mpidr; + + /* + * Map the vcpu_id into the first three affinity level fields of + * the MPIDR. We limit the number of VCPUs in level 0 due to a + * limitation to 16 CPUs in that level in the ICC_SGIxR registers + * of the GICv3 to be able to address each CPU directly when + * sending IPIs. 
+ */ + mpidr = (vcpu->vcpu_id & 0x0f) << MPIDR_LEVEL_SHIFT(0); + mpidr |= ((vcpu->vcpu_id >> 4) & 0xff) << MPIDR_LEVEL_SHIFT(1); + mpidr |= ((vcpu->vcpu_id >> 12) & 0xff) << MPIDR_LEVEL_SHIFT(2); + mpidr |= (1ULL << 31); + + return mpidr; +} + const struct sys_reg_desc *get_reg_by_id(u64 id, const struct sys_reg_desc table[], unsigned int num); From 0b9604999e55b12228f235d714e62c6804fbeaf9 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Mon, 25 Apr 2022 11:27:34 +0000 Subject: [PATCH 066/457] ANDROID: KVM: arm64: Reset sysregs for protected VMs Create a framework for resetting protected VM system registers to their architecturally defined reset values. No functional change intended as these are not hooked in yet. Signed-off-by: Fuad Tabba Signed-off-by: Will Deacon Bug: 233587962 Change-Id: Id812d1bbe81c7c0a544aba91b35831f486c208ba Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/include/nvhe/pkvm.h | 1 + arch/arm64/kvm/hyp/nvhe/sys_regs.c | 84 +++++++++++++++++++++++++- 2 files changed, 84 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h index 3f254497e22b..2e4842aa0c38 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h +++ b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h @@ -91,6 +91,7 @@ struct pkvm_hyp_vcpu *pkvm_get_loaded_hyp_vcpu(void); u64 pvm_read_id_reg(const struct kvm_vcpu *vcpu, u32 id); bool kvm_handle_pvm_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code); bool kvm_handle_pvm_restricted(struct kvm_vcpu *vcpu, u64 *exit_code); +void kvm_reset_pvm_sys_regs(struct kvm_vcpu *vcpu); int kvm_check_pvm_sysreg_table(void); #endif /* __ARM64_KVM_NVHE_PKVM_H__ */ diff --git a/arch/arm64/kvm/hyp/nvhe/sys_regs.c b/arch/arm64/kvm/hyp/nvhe/sys_regs.c index 0ca44f587144..d795f781b455 100644 --- a/arch/arm64/kvm/hyp/nvhe/sys_regs.c +++ b/arch/arm64/kvm/hyp/nvhe/sys_regs.c @@ -465,8 +465,85 @@ static const struct sys_reg_desc pvm_sys_reg_descs[] = { /* Performance Monitoring Registers are restricted. */ }; +/* A structure to track reset values for system registers in protected vcpus. */ +struct sys_reg_desc_reset { + /* Index into sys_reg[]. */ + int reg; + + /* Reset function. */ + void (*reset)(struct kvm_vcpu *, const struct sys_reg_desc_reset *); + + /* Reset value. */ + u64 value; +}; + +static void reset_actlr(struct kvm_vcpu *vcpu, const struct sys_reg_desc_reset *r) +{ + __vcpu_sys_reg(vcpu, r->reg) = read_sysreg(actlr_el1); +} + +static void reset_amair_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc_reset *r) +{ + __vcpu_sys_reg(vcpu, r->reg) = read_sysreg(amair_el1); +} + +static void reset_mpidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc_reset *r) +{ + __vcpu_sys_reg(vcpu, r->reg) = calculate_mpidr(vcpu); +} + +static void reset_value(struct kvm_vcpu *vcpu, const struct sys_reg_desc_reset *r) +{ + __vcpu_sys_reg(vcpu, r->reg) = r->value; +} + +/* Specify the register's reset value. */ +#define RESET_VAL(REG, RESET_VAL) { REG, reset_value, RESET_VAL } + +/* Specify a function that calculates the register's reset value. */ +#define RESET_FUNC(REG, RESET_FUNC) { REG, RESET_FUNC, 0 } + /* - * Checks that the sysreg table is unique and in-order. + * Architected system registers reset values for Protected VMs. 
+ * Important: Must be sorted ascending by REG (index into sys_reg[]) + */ +static const struct sys_reg_desc_reset pvm_sys_reg_reset_vals[] = { + RESET_FUNC(MPIDR_EL1, reset_mpidr), + RESET_VAL(SCTLR_EL1, 0x00C50078), + RESET_FUNC(ACTLR_EL1, reset_actlr), + RESET_VAL(CPACR_EL1, 0), + RESET_VAL(ZCR_EL1, 0), + RESET_VAL(TCR_EL1, 0), + RESET_VAL(VBAR_EL1, 0), + RESET_VAL(CONTEXTIDR_EL1, 0), + RESET_FUNC(AMAIR_EL1, reset_amair_el1), + RESET_VAL(CNTKCTL_EL1, 0), + RESET_VAL(MDSCR_EL1, 0), + RESET_VAL(MDCCINT_EL1, 0), + RESET_VAL(DISR_EL1, 0), + RESET_VAL(PMCCFILTR_EL0, 0), + RESET_VAL(PMUSERENR_EL0, 0), +}; + +/* + * Sets system registers to reset value + * + * This function finds the right entry and sets the registers on the protected + * vcpu to their architecturally defined reset values. + */ +void kvm_reset_pvm_sys_regs(struct kvm_vcpu *vcpu) +{ + unsigned long i; + + for (i = 0; i < ARRAY_SIZE(pvm_sys_reg_reset_vals); i++) { + const struct sys_reg_desc_reset *r = &pvm_sys_reg_reset_vals[i]; + + r->reset(vcpu, r); + } +} + +/* + * Checks that the sysreg tables are unique and in-order. * * Returns 0 if the table is consistent, or 1 otherwise. */ @@ -479,6 +556,11 @@ int kvm_check_pvm_sysreg_table(void) return 1; } + for (i = 1; i < ARRAY_SIZE(pvm_sys_reg_reset_vals); i++) { + if (pvm_sys_reg_reset_vals[i-1].reg >= pvm_sys_reg_reset_vals[i].reg) + return 1; + } + return 0; } From 494d5d4ff7bf4f89c6692fa876b48dc127c5f333 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Wed, 27 Apr 2022 09:36:47 +0000 Subject: [PATCH 067/457] ANDROID: KVM: arm64: Move pkvm_vcpu_init_traps to hyp vcpu init Move the initialization of traps to the initialization of the hyp vcpu, and remove the associated hypercall. No functional change intended. Signed-off-by: Fuad Tabba Signed-off-by: Will Deacon Bug: 233587962 Change-Id: I2e79a6cb494d9a778b46e481206d5c8fde6890fe Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_asm.h | 1 - arch/arm64/kvm/arm.c | 8 -------- arch/arm64/kvm/hyp/include/nvhe/trap_handler.h | 2 -- arch/arm64/kvm/hyp/nvhe/hyp-main.c | 8 -------- arch/arm64/kvm/hyp/nvhe/pkvm.c | 4 +++- 5 files changed, 3 insertions(+), 20 deletions(-) diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index c70f113905c4..b4aaa79f5919 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -75,7 +75,6 @@ enum __kvm_host_smccc_func { __KVM_HOST_SMCCC_FUNC___kvm_timer_set_cntvoff, __KVM_HOST_SMCCC_FUNC___vgic_v3_save_vmcr_aprs, __KVM_HOST_SMCCC_FUNC___vgic_v3_restore_vmcr_aprs, - __KVM_HOST_SMCCC_FUNC___pkvm_vcpu_init_traps, __KVM_HOST_SMCCC_FUNC___pkvm_init_vm, __KVM_HOST_SMCCC_FUNC___pkvm_init_vcpu, __KVM_HOST_SMCCC_FUNC___pkvm_teardown_vm, diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index efc15a26ada5..932e29aa30c3 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -688,14 +688,6 @@ int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu) static_branch_inc(&userspace_irqchip_in_use); } - /* - * Initialize traps for protected VMs. - * NOTE: Move to run in EL2 directly, rather than via a hypercall, once - * the code is in place for first run initialization at EL2. 
- */ - if (kvm_vm_is_protected(kvm)) - kvm_call_hyp_nvhe(__pkvm_vcpu_init_traps, vcpu); - mutex_lock(&kvm->lock); set_bit(KVM_ARCH_FLAG_HAS_RAN_ONCE, &kvm->arch.flags); mutex_unlock(&kvm->lock); diff --git a/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h b/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h index 45a84f0ade04..1e6d995968a1 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h +++ b/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h @@ -15,6 +15,4 @@ #define DECLARE_REG(type, name, ctxt, reg) \ type name = (type)cpu_reg(ctxt, (reg)) -void __pkvm_vcpu_init_traps(struct kvm_vcpu *vcpu); - #endif /* __ARM64_KVM_NVHE_TRAP_HANDLER_H__ */ diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 063eeff1dac6..5931951eb374 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -628,13 +628,6 @@ static void handle___pkvm_prot_finalize(struct kvm_cpu_context *host_ctxt) cpu_reg(host_ctxt, 1) = __pkvm_prot_finalize(); } -static void handle___pkvm_vcpu_init_traps(struct kvm_cpu_context *host_ctxt) -{ - DECLARE_REG(struct kvm_vcpu *, vcpu, host_ctxt, 1); - - __pkvm_vcpu_init_traps(kern_hyp_va(vcpu)); -} - static void handle___pkvm_init_vm(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(struct kvm *, host_kvm, host_ctxt, 1); @@ -692,7 +685,6 @@ static const hcall_t host_hcall[] = { HANDLE_FUNC(__kvm_timer_set_cntvoff), HANDLE_FUNC(__vgic_v3_save_vmcr_aprs), HANDLE_FUNC(__vgic_v3_restore_vmcr_aprs), - HANDLE_FUNC(__pkvm_vcpu_init_traps), HANDLE_FUNC(__pkvm_init_vm), HANDLE_FUNC(__pkvm_init_vcpu), HANDLE_FUNC(__pkvm_teardown_vm), diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 5f86d034877c..d7fd64b09f48 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -188,7 +188,7 @@ static void pvm_init_trap_regs(struct kvm_vcpu *vcpu) /* * Initialize trap register values for protected VMs. */ -void __pkvm_vcpu_init_traps(struct kvm_vcpu *vcpu) +static void pkvm_vcpu_init_traps(struct kvm_vcpu *vcpu) { pvm_init_trap_regs(vcpu); pvm_init_traps_aa64pfr0(vcpu); @@ -348,6 +348,8 @@ static int init_pkvm_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu, hyp_vcpu->vcpu.arch.hw_mmu = &hyp_vm->kvm.arch.mmu; hyp_vcpu->vcpu.arch.cflags = READ_ONCE(host_vcpu->arch.cflags); + + pkvm_vcpu_init_traps(&hyp_vcpu->vcpu); done: if (ret) unpin_host_vcpu(host_vcpu); From e21f000396a0e77b6e8b0f3235a36e8b03ba6445 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Wed, 27 Apr 2022 09:47:55 +0000 Subject: [PATCH 068/457] ANDROID: KVM: arm64: Fix initializing traps in protected mode The values of the trapping registers for protected VMs should be computed from the ground up, and not depend on potentially preexisting values. Moreover, non-protected VMs should not be restricted in protected mode in the same manner as protected VMs. 
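Condensed to its essence, the change below replaces an OR into a value partially inherited from the host with an assignment of a value that is fully determined by the hypervisor, roughly:

    /* Before: trap bits OR'd into whatever the host left behind. */
    vcpu->arch.hcr_el2 |= HCR_TID3 | HCR_TACR | HCR_TIDCP | HCR_TID1;

    /* After: computed from the ground up, independent of prior state. */
    vcpu->arch.hcr_el2 = HCR_GUEST_FLAGS |
                         HCR_TID3 | HCR_TACR | HCR_TIDCP | HCR_TID1;
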
Signed-off-by: Fuad Tabba Signed-off-by: Will Deacon Bug: 233587962 Change-Id: I689c6d48e8ebb533a86b78ebd6e1a1416cb8729b Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/nvhe/pkvm.c | 49 ++++++++++++++++++++++------------ 1 file changed, 32 insertions(+), 17 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index d7fd64b09f48..28b8c8db6339 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -168,34 +168,49 @@ static void pvm_init_traps_aa64mmfr1(struct kvm_vcpu *vcpu) */ static void pvm_init_trap_regs(struct kvm_vcpu *vcpu) { - const u64 hcr_trap_feat_regs = HCR_TID3; - const u64 hcr_trap_impdef = HCR_TACR | HCR_TIDCP | HCR_TID1; - /* * Always trap: * - Feature id registers: to control features exposed to guests * - Implementation-defined features */ - vcpu->arch.hcr_el2 |= hcr_trap_feat_regs | hcr_trap_impdef; + vcpu->arch.hcr_el2 = HCR_GUEST_FLAGS | + HCR_TID3 | HCR_TACR | HCR_TIDCP | HCR_TID1; - /* Clear res0 and set res1 bits to trap potential new features. */ - vcpu->arch.hcr_el2 &= ~(HCR_RES0); - vcpu->arch.mdcr_el2 &= ~(MDCR_EL2_RES0); - vcpu->arch.cptr_el2 |= CPTR_NVHE_EL2_RES1; - vcpu->arch.cptr_el2 &= ~(CPTR_NVHE_EL2_RES0); + if (cpus_have_const_cap(ARM64_HAS_RAS_EXTN)) { + /* route synchronous external abort exceptions to EL2 */ + vcpu->arch.hcr_el2 |= HCR_TEA; + /* trap error record accesses */ + vcpu->arch.hcr_el2 |= HCR_TERR; + } + + if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) + vcpu->arch.hcr_el2 |= HCR_FWB; + + if (cpus_have_const_cap(ARM64_MISMATCHED_CACHE_TYPE)) + vcpu->arch.hcr_el2 |= HCR_TID2; } /* * Initialize trap register values for protected VMs. */ -static void pkvm_vcpu_init_traps(struct kvm_vcpu *vcpu) +static void pkvm_vcpu_init_traps(struct pkvm_hyp_vcpu *hyp_vcpu) { - pvm_init_trap_regs(vcpu); - pvm_init_traps_aa64pfr0(vcpu); - pvm_init_traps_aa64pfr1(vcpu); - pvm_init_traps_aa64dfr0(vcpu); - pvm_init_traps_aa64mmfr0(vcpu); - pvm_init_traps_aa64mmfr1(vcpu); + hyp_vcpu->vcpu.arch.cptr_el2 = CPTR_EL2_DEFAULT; + hyp_vcpu->vcpu.arch.mdcr_el2 = 0; + + if (!pkvm_hyp_vcpu_is_protected(hyp_vcpu)) { + u64 hcr = READ_ONCE(hyp_vcpu->host_vcpu->arch.hcr_el2); + + hyp_vcpu->vcpu.arch.hcr_el2 = HCR_GUEST_FLAGS | hcr; + return; + } + + pvm_init_trap_regs(&hyp_vcpu->vcpu); + pvm_init_traps_aa64pfr0(&hyp_vcpu->vcpu); + pvm_init_traps_aa64pfr1(&hyp_vcpu->vcpu); + pvm_init_traps_aa64dfr0(&hyp_vcpu->vcpu); + pvm_init_traps_aa64mmfr0(&hyp_vcpu->vcpu); + pvm_init_traps_aa64mmfr1(&hyp_vcpu->vcpu); } /* @@ -349,7 +364,7 @@ static int init_pkvm_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu, hyp_vcpu->vcpu.arch.hw_mmu = &hyp_vm->kvm.arch.mmu; hyp_vcpu->vcpu.arch.cflags = READ_ONCE(host_vcpu->arch.cflags); - pkvm_vcpu_init_traps(&hyp_vcpu->vcpu); + pkvm_vcpu_init_traps(hyp_vcpu); done: if (ret) unpin_host_vcpu(host_vcpu); From f619565af542407292995cbb832ee8ed72e9dfaa Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Mon, 16 May 2022 22:41:51 +0100 Subject: [PATCH 069/457] ANDROID: KVM: arm64: Advertise GICv3 sysreg interface to protected guests Advertise the system register GICv3 CPU interface to protected guests as that is the only supported configuration under pKVM. 
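The allow mask takes effect when a protected guest reads the ID register: fields that are not present in PVM_ID_AA64PFR0_ALLOW read as zero, so adding the GIC field is what makes the sysreg CPU interface visible. Roughly, as a sketch only (the real read path is pvm_read_id_reg() and also applies the restricted-field handling not shown here):

    u64 guest_pfr0 = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1) &
                     PVM_ID_AA64PFR0_ALLOW;
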
Signed-off-by: Will Deacon Signed-off-by: Will Deacon Bug: 233587962 Change-Id: Iea2aeaae7776424727f6833c21597b6236284796 Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_pkvm.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/include/asm/kvm_pkvm.h b/arch/arm64/include/asm/kvm_pkvm.h index 0d0a0d151bd9..2da3f274fe11 100644 --- a/arch/arm64/include/asm/kvm_pkvm.h +++ b/arch/arm64/include/asm/kvm_pkvm.h @@ -43,11 +43,13 @@ void pkvm_destroy_hyp_vm(struct kvm *kvm); /* * Allow for protected VMs: * - Floating-point and Advanced SIMD + * - GICv3(+) system register interface * - Data Independent Timing */ #define PVM_ID_AA64PFR0_ALLOW (\ ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_FP) | \ ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_AdvSIMD) | \ + ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_GIC) | \ ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_DIT) \ ) From 3d2a02ffa2cd8af72360c5ce1280f8d0bbb2cb7c Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 26 Apr 2022 09:05:46 +0000 Subject: [PATCH 070/457] ANDROID: KVM: arm64: Force injection of a data abort on NISV MMIO exit If a vcpu exits for a data abort with an invalid syndrome, the expectations are that userspace has a chance to save the day if it has requested to see such exits. However, this is completely futile in the case of a protected VM, as none of the state is available. In this particular case, inject a data abort directly into the vcpu, consistent with what userspace could do. This also helps with pKVM, which discards all syndrome information when forwarding data aborts that are not known to be MMIO. Finally, hide the RETURN_NISV_IO_ABORT_TO_USER cap from userspace on protected VMs, and document this tweak to the API. Signed-off-by: Marc Zyngier Signed-off-by: Will Deacon Bug: 233587962 Change-Id: Ie081cf0b2fdd1ab374d479e3e355ab3cb536c960 Signed-off-by: Quentin Perret --- Documentation/virt/kvm/api.rst | 7 +++++++ arch/arm64/kvm/arm.c | 14 ++++++++++---- arch/arm64/kvm/mmio.c | 9 +++++++++ 3 files changed, 26 insertions(+), 4 deletions(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 896914e3a847..26694287d567 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6427,6 +6427,13 @@ Note that KVM does not skip the faulting instruction as it does for KVM_EXIT_MMIO, but userspace has to emulate any change to the processing state if it decides to decode and emulate the instruction. +This feature isn't available to protected VMs, as userspace does not +have access to the state that is required to perform the emulation. +Instead, a data abort exception is directly injected in the guest. +Note that although KVM_CAP_ARM_NISV_TO_USER will be reported if +queried outside of a protected VM context, the feature will not be +exposed if queried on a protected VM file descriptor. 
+ :: /* KVM_EXIT_X86_RDMSR / KVM_EXIT_X86_WRMSR */ diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 932e29aa30c3..9bbd10b2cdaf 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -84,9 +84,13 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, switch (cap->cap) { case KVM_CAP_ARM_NISV_TO_USER: - r = 0; - set_bit(KVM_ARCH_FLAG_RETURN_NISV_IO_ABORT_TO_USER, - &kvm->arch.flags); + if (kvm_vm_is_protected(kvm)) { + r = -EINVAL; + } else { + r = 0; + set_bit(KVM_ARCH_FLAG_RETURN_NISV_IO_ABORT_TO_USER, + &kvm->arch.flags); + } break; case KVM_CAP_ARM_MTE: mutex_lock(&kvm->lock); @@ -222,7 +226,6 @@ static int kvm_check_extension(struct kvm *kvm, long ext) case KVM_CAP_IMMEDIATE_EXIT: case KVM_CAP_VCPU_EVENTS: case KVM_CAP_ARM_IRQ_LINE_LAYOUT_2: - case KVM_CAP_ARM_NISV_TO_USER: case KVM_CAP_ARM_INJECT_EXT_DABT: case KVM_CAP_SET_GUEST_DEBUG: case KVM_CAP_VCPU_ATTRIBUTES: @@ -230,6 +233,9 @@ static int kvm_check_extension(struct kvm *kvm, long ext) case KVM_CAP_ARM_SYSTEM_SUSPEND: r = 1; break; + case KVM_CAP_ARM_NISV_TO_USER: + r = !kvm || !kvm_vm_is_protected(kvm); + break; case KVM_CAP_SET_GUEST_DEBUG2: return KVM_GUESTDBG_VALID_MASK; case KVM_CAP_ARM_SET_DEVICE_ADDR: diff --git a/arch/arm64/kvm/mmio.c b/arch/arm64/kvm/mmio.c index 3dd38a151d2a..db6630c70f8b 100644 --- a/arch/arm64/kvm/mmio.c +++ b/arch/arm64/kvm/mmio.c @@ -133,8 +133,17 @@ int io_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) /* * No valid syndrome? Ask userspace for help if it has * volunteered to do so, and bail out otherwise. + * + * In the protected VM case, there isn't much userspace can do + * though, so directly deliver an exception to the guest. */ if (!kvm_vcpu_dabt_isvalid(vcpu)) { + if (is_protected_kvm_enabled() && + kvm_vm_is_protected(vcpu->kvm)) { + kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu)); + return 1; + } + if (test_bit(KVM_ARCH_FLAG_RETURN_NISV_IO_ABORT_TO_USER, &vcpu->kvm->arch.flags)) { run->exit_reason = KVM_EXIT_ARM_NISV; From 6efdcd5ffb22c93162428c5faeba6e201d7b8e0b Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 26 Apr 2022 09:29:17 +0000 Subject: [PATCH 071/457] ANDROID: KVM: arm64: Donate memory to protected guests Instead of sharing memory with protected guests, which still leaves the host with r/w access, donate the underlying pages so that they are unmapped from the host stage-2. 
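The practical difference is in the resulting host stage-2 state: a shared page stays mapped (and writable) at the host, while a donated page is unmapped there and becomes guest-owned. A conceptual summary, with states simplified for illustration (the real tracking is done by the pKVM page-ownership machinery):

    /* Conceptual page states, simplified for illustration. */
    enum pkvm_page_owner_sketch {
            PAGE_HOST_OWNED,   /* mapped r/w at the host only             */
            PAGE_SHARED,       /* mapped at host and guest simultaneously */
            PAGE_GUEST_OWNED,  /* donated: unmapped from the host stage-2 */
    };
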
Signed-off-by: Marc Zyngier Signed-off-by: Will Deacon Bug: 233587962 Change-Id: I3e0d1d31877acf3978e82350ebbe92136919507c Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/nvhe/hyp-main.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 5931951eb374..15f053d6848c 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -457,7 +457,10 @@ static void handle___pkvm_host_map_guest(struct kvm_cpu_context *host_ctxt) if (ret) goto out; - ret = __pkvm_host_share_guest(pfn, gfn, hyp_vcpu); + if (pkvm_hyp_vcpu_is_protected(hyp_vcpu)) + ret = __pkvm_host_donate_guest(pfn, gfn, hyp_vcpu); + else + ret = __pkvm_host_share_guest(pfn, gfn, hyp_vcpu); out: cpu_reg(host_ctxt, 1) = ret; } From e4f7a40910ab26b4b1b9eb938572453b1fa48d5e Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Wed, 27 Apr 2022 10:01:34 +0000 Subject: [PATCH 072/457] ANDROID: KVM: arm64: Add EL2 entry/exit handlers for pKVM guests Introduce separate El2 entry/exit handlers for protected and non-protected guests under pKVM and hook up the protected handlers to expose the minimum amount of data to the host required for EL1 handling. Signed-off-by: Fuad Tabba Signed-off-by: Will Deacon Bug: 233587962 Change-Id: I6788edabb3849b661c05c4ce63ab17198f4ed1cd Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/nvhe/hyp-main.c | 218 ++++++++++++++++++++++++++++- 1 file changed, 216 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 15f053d6848c..68148a5b3946 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -20,6 +20,8 @@ #include +#include "../../sys_regs.h" + /* * Host FPSIMD state. Written to when the guest accesses its own FPSIMD state, * and read when the guest state is live and we need to switch back to the host. @@ -34,6 +36,192 @@ void __kvm_hyp_host_forward_smc(struct kvm_cpu_context *host_ctxt); typedef void (*hyp_entry_exit_handler_fn)(struct pkvm_hyp_vcpu *); +static void handle_pvm_entry_wfx(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + if (vcpu_get_flag(hyp_vcpu->host_vcpu, INCREMENT_PC)) { + vcpu_clear_flag(&hyp_vcpu->vcpu, PC_UPDATE_REQ); + kvm_incr_pc(&hyp_vcpu->vcpu); + } +} + +static void handle_pvm_entry_sys64(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; + + /* Exceptions have priority on anything else */ + if (vcpu_get_flag(host_vcpu, PENDING_EXCEPTION)) { + /* Exceptions caused by this should be undef exceptions. */ + u32 esr = (ESR_ELx_EC_UNKNOWN << ESR_ELx_EC_SHIFT); + + __vcpu_sys_reg(&hyp_vcpu->vcpu, ESR_EL1) = esr; + kvm_pend_exception(&hyp_vcpu->vcpu, EXCEPT_AA64_EL1_SYNC); + return; + } + + if (vcpu_get_flag(host_vcpu, INCREMENT_PC)) { + vcpu_clear_flag(&hyp_vcpu->vcpu, PC_UPDATE_REQ); + kvm_incr_pc(&hyp_vcpu->vcpu); + } + + if (!esr_sys64_to_params(hyp_vcpu->vcpu.arch.fault.esr_el2).is_write) { + /* r0 as transfer register between the guest and the host. */ + u64 rt_val = READ_ONCE(host_vcpu->arch.ctxt.regs.regs[0]); + int rt = kvm_vcpu_sys_get_rt(&hyp_vcpu->vcpu); + + vcpu_set_reg(&hyp_vcpu->vcpu, rt, rt_val); + } +} + +static void handle_pvm_entry_iabt(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + unsigned long cpsr = *vcpu_cpsr(&hyp_vcpu->vcpu); + u32 esr = ESR_ELx_IL; + + if (!vcpu_get_flag(hyp_vcpu->host_vcpu, PENDING_EXCEPTION)) + return; + + /* + * If the host wants to inject an exception, get syndrom and + * fault address. 
+ */ + if ((cpsr & PSR_MODE_MASK) == PSR_MODE_EL0t) + esr |= (ESR_ELx_EC_IABT_LOW << ESR_ELx_EC_SHIFT); + else + esr |= (ESR_ELx_EC_IABT_CUR << ESR_ELx_EC_SHIFT); + + esr |= ESR_ELx_FSC_EXTABT; + + __vcpu_sys_reg(&hyp_vcpu->vcpu, ESR_EL1) = esr; + __vcpu_sys_reg(&hyp_vcpu->vcpu, FAR_EL1) = + kvm_vcpu_get_hfar(&hyp_vcpu->vcpu); + + /* Tell the run loop that we want to inject something */ + kvm_pend_exception(&hyp_vcpu->vcpu, EXCEPT_AA64_EL1_SYNC); +} + +static void handle_pvm_entry_dabt(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; + bool pc_update; + + /* Exceptions have priority over anything else */ + if (vcpu_get_flag(host_vcpu, PENDING_EXCEPTION)) { + unsigned long cpsr = *vcpu_cpsr(&hyp_vcpu->vcpu); + u32 esr = ESR_ELx_IL; + + if ((cpsr & PSR_MODE_MASK) == PSR_MODE_EL0t) + esr |= (ESR_ELx_EC_DABT_LOW << ESR_ELx_EC_SHIFT); + else + esr |= (ESR_ELx_EC_DABT_CUR << ESR_ELx_EC_SHIFT); + + esr |= ESR_ELx_FSC_EXTABT; + + __vcpu_sys_reg(&hyp_vcpu->vcpu, ESR_EL1) = esr; + __vcpu_sys_reg(&hyp_vcpu->vcpu, FAR_EL1) = + kvm_vcpu_get_hfar(&hyp_vcpu->vcpu); + + /* Tell the run loop that we want to inject something */ + kvm_pend_exception(&hyp_vcpu->vcpu, EXCEPT_AA64_EL1_SYNC); + + /* Cancel potential in-flight MMIO */ + hyp_vcpu->vcpu.mmio_needed = false; + return; + } + + /* Handle PC increment on MMIO */ + pc_update = (hyp_vcpu->vcpu.mmio_needed && + vcpu_get_flag(host_vcpu, INCREMENT_PC)); + if (pc_update) { + vcpu_clear_flag(&hyp_vcpu->vcpu, PC_UPDATE_REQ); + kvm_incr_pc(&hyp_vcpu->vcpu); + } + + /* If we were doing an MMIO read access, update the register*/ + if (pc_update && !kvm_vcpu_dabt_iswrite(&hyp_vcpu->vcpu)) { + /* r0 as transfer register between the guest and the host. */ + u64 rd_val = READ_ONCE(host_vcpu->arch.ctxt.regs.regs[0]); + int rd = kvm_vcpu_dabt_get_rd(&hyp_vcpu->vcpu); + + vcpu_set_reg(&hyp_vcpu->vcpu, rd, rd_val); + } + + hyp_vcpu->vcpu.mmio_needed = false; +} + +static void handle_pvm_exit_wfx(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + WRITE_ONCE(hyp_vcpu->host_vcpu->arch.ctxt.regs.pstate, + hyp_vcpu->vcpu.arch.ctxt.regs.pstate & PSR_MODE_MASK); + WRITE_ONCE(hyp_vcpu->host_vcpu->arch.fault.esr_el2, + hyp_vcpu->vcpu.arch.fault.esr_el2); +} + +static void handle_pvm_exit_sys64(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; + u32 esr_el2 = hyp_vcpu->vcpu.arch.fault.esr_el2; + + /* r0 as transfer register between the guest and the host. */ + WRITE_ONCE(host_vcpu->arch.fault.esr_el2, + esr_el2 & ~ESR_ELx_SYS64_ISS_RT_MASK); + + /* The mode is required for the host to emulate some sysregs */ + WRITE_ONCE(host_vcpu->arch.ctxt.regs.pstate, + hyp_vcpu->vcpu.arch.ctxt.regs.pstate & PSR_MODE_MASK); + + if (esr_sys64_to_params(esr_el2).is_write) { + int rt = kvm_vcpu_sys_get_rt(&hyp_vcpu->vcpu); + u64 rt_val = vcpu_get_reg(&hyp_vcpu->vcpu, rt); + + WRITE_ONCE(host_vcpu->arch.ctxt.regs.regs[0], rt_val); + } +} + +static void handle_pvm_exit_iabt(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + WRITE_ONCE(hyp_vcpu->host_vcpu->arch.fault.esr_el2, + hyp_vcpu->vcpu.arch.fault.esr_el2); + WRITE_ONCE(hyp_vcpu->host_vcpu->arch.fault.hpfar_el2, + hyp_vcpu->vcpu.arch.fault.hpfar_el2); +} + +static void handle_pvm_exit_dabt(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; + + /* + * For now, we treat all data aborts as MMIO since we have no knowledge + * of the memslot configuration at EL2. 
+ */ + hyp_vcpu->vcpu.mmio_needed = true; + + if (hyp_vcpu->vcpu.mmio_needed) { + /* r0 as transfer register between the guest and the host. */ + WRITE_ONCE(host_vcpu->arch.fault.esr_el2, + hyp_vcpu->vcpu.arch.fault.esr_el2 & ~ESR_ELx_SRT_MASK); + + if (kvm_vcpu_dabt_iswrite(&hyp_vcpu->vcpu)) { + int rt = kvm_vcpu_dabt_get_rd(&hyp_vcpu->vcpu); + u64 rt_val = vcpu_get_reg(&hyp_vcpu->vcpu, rt); + + WRITE_ONCE(host_vcpu->arch.ctxt.regs.regs[0], rt_val); + } + } else { + WRITE_ONCE(host_vcpu->arch.fault.esr_el2, + hyp_vcpu->vcpu.arch.fault.esr_el2 & ~ESR_ELx_ISV); + } + + WRITE_ONCE(host_vcpu->arch.ctxt.regs.pstate, + hyp_vcpu->vcpu.arch.ctxt.regs.pstate & PSR_MODE_MASK); + WRITE_ONCE(host_vcpu->arch.fault.far_el2, + hyp_vcpu->vcpu.arch.fault.far_el2 & GENMASK(11, 0)); + WRITE_ONCE(host_vcpu->arch.fault.hpfar_el2, + hyp_vcpu->vcpu.arch.fault.hpfar_el2); + WRITE_ONCE(__vcpu_sys_reg(host_vcpu, SCTLR_EL1), + __vcpu_sys_reg(&hyp_vcpu->vcpu, SCTLR_EL1) & + (SCTLR_ELx_EE | SCTLR_EL1_E0E)); +} + static void handle_vm_entry_generic(struct pkvm_hyp_vcpu *hyp_vcpu) { vcpu_copy_flag(&hyp_vcpu->vcpu, hyp_vcpu->host_vcpu, PC_UPDATE_REQ); @@ -59,6 +247,22 @@ static void handle_vm_exit_abt(struct pkvm_hyp_vcpu *hyp_vcpu) hyp_vcpu->vcpu.arch.fault.disr_el1); } +static const hyp_entry_exit_handler_fn entry_hyp_pvm_handlers[] = { + [0 ... ESR_ELx_EC_MAX] = NULL, + [ESR_ELx_EC_WFx] = handle_pvm_entry_wfx, + [ESR_ELx_EC_SYS64] = handle_pvm_entry_sys64, + [ESR_ELx_EC_IABT_LOW] = handle_pvm_entry_iabt, + [ESR_ELx_EC_DABT_LOW] = handle_pvm_entry_dabt, +}; + +static const hyp_entry_exit_handler_fn exit_hyp_pvm_handlers[] = { + [0 ... ESR_ELx_EC_MAX] = NULL, + [ESR_ELx_EC_WFx] = handle_pvm_exit_wfx, + [ESR_ELx_EC_SYS64] = handle_pvm_exit_sys64, + [ESR_ELx_EC_IABT_LOW] = handle_pvm_exit_iabt, + [ESR_ELx_EC_DABT_LOW] = handle_pvm_exit_dabt, +}; + static const hyp_entry_exit_handler_fn entry_hyp_vm_handlers[] = { [0 ... ESR_ELx_EC_MAX] = handle_vm_entry_generic, }; @@ -209,7 +413,12 @@ static void flush_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) break; case ARM_EXCEPTION_TRAP: esr_ec = ESR_ELx_EC(kvm_vcpu_get_esr(&hyp_vcpu->vcpu)); - ec_handler = entry_hyp_vm_handlers[esr_ec]; + + if (pkvm_hyp_vcpu_is_protected(hyp_vcpu)) + ec_handler = entry_hyp_pvm_handlers[esr_ec]; + else + ec_handler = entry_hyp_vm_handlers[esr_ec]; + if (ec_handler) ec_handler(hyp_vcpu); break; @@ -238,7 +447,12 @@ static void sync_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu, u32 exit_reason) break; case ARM_EXCEPTION_TRAP: esr_ec = ESR_ELx_EC(kvm_vcpu_get_esr(&hyp_vcpu->vcpu)); - ec_handler = exit_hyp_vm_handlers[esr_ec]; + + if (pkvm_hyp_vcpu_is_protected(hyp_vcpu)) + ec_handler = exit_hyp_pvm_handlers[esr_ec]; + else + ec_handler = exit_hyp_vm_handlers[esr_ec]; + if (ec_handler) ec_handler(hyp_vcpu); break; From d8609278ec6e0a7e2ebda4415f82c095b6127dbc Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 26 Apr 2022 10:48:39 +0000 Subject: [PATCH 073/457] ANDROID: KVM: arm64: Move vgic state between host and hypervisor vcpu structures Since the world switch vgic code operates on the hypervisor data structure, move the state back and forth between the host and hypervisor vcpu. This is currently limited to the VMCR and APR registers, but further patches will deal with the rest of the state. Note that some of the control settings (such as SRE) are always set to the same value. This will eventually be moved to initialisation time for the hypervisor structures. 
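The state shuttled between the two structures at this point is deliberately small. For reference, the fields involved are listed below (types as in struct vgic_v3_cpu_if, shown only to make the copy loops in the hunks easier to follow):

    u32 vgic_vmcr;       /* virtual machine control register   */
    u32 vgic_ap0r[4];    /* group 0 active-priority registers  */
    u32 vgic_ap1r[4];    /* group 1 active-priority registers  */
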
Signed-off-by: Marc Zyngier Signed-off-by: Will Deacon Bug: 233587962 Change-Id: I8a3a9009ce3408fe06ea272504f4f71c3d47b7bf Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/nvhe/hyp-main.c | 65 ++++++++++++++++++++++++++++-- 1 file changed, 61 insertions(+), 4 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 68148a5b3946..d4ae517d597f 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -603,6 +603,16 @@ static struct kvm_vcpu *__get_host_hyp_vcpus(struct kvm_vcpu *arg, __get_host_hyp_vcpus(__vcpu, hyp_vcpup); \ }) +#define get_host_hyp_vcpus_from_vgic_v3_cpu_if(ctxt, regnr, hyp_vcpup) \ + ({ \ + DECLARE_REG(struct vgic_v3_cpu_if *, cif, ctxt, regnr); \ + struct kvm_vcpu *__vcpu = container_of(cif, \ + struct kvm_vcpu, \ + arch.vgic_cpu.vgic_v3); \ + \ + __get_host_hyp_vcpus(__vcpu, hyp_vcpup); \ + }) + static void handle___kvm_vcpu_run(struct kvm_cpu_context *host_ctxt) { struct pkvm_hyp_vcpu *hyp_vcpu; @@ -758,16 +768,63 @@ static void handle___kvm_get_mdcr_el2(struct kvm_cpu_context *host_ctxt) static void handle___vgic_v3_save_vmcr_aprs(struct kvm_cpu_context *host_ctxt) { - DECLARE_REG(struct vgic_v3_cpu_if *, cpu_if, host_ctxt, 1); + struct pkvm_hyp_vcpu *hyp_vcpu; + struct kvm_vcpu *host_vcpu; - __vgic_v3_save_vmcr_aprs(kern_hyp_va(cpu_if)); + host_vcpu = get_host_hyp_vcpus_from_vgic_v3_cpu_if(host_ctxt, 1, + &hyp_vcpu); + if (!host_vcpu) + return; + + if (unlikely(hyp_vcpu)) { + struct vgic_v3_cpu_if *hyp_cpu_if, *host_cpu_if; + int i; + + hyp_cpu_if = &hyp_vcpu->vcpu.arch.vgic_cpu.vgic_v3; + __vgic_v3_save_vmcr_aprs(hyp_cpu_if); + + host_cpu_if = &host_vcpu->arch.vgic_cpu.vgic_v3; + host_cpu_if->vgic_vmcr = hyp_cpu_if->vgic_vmcr; + for (i = 0; i < ARRAY_SIZE(host_cpu_if->vgic_ap0r); i++) { + host_cpu_if->vgic_ap0r[i] = hyp_cpu_if->vgic_ap0r[i]; + host_cpu_if->vgic_ap1r[i] = hyp_cpu_if->vgic_ap1r[i]; + } + } else { + __vgic_v3_save_vmcr_aprs(&host_vcpu->arch.vgic_cpu.vgic_v3); + } } static void handle___vgic_v3_restore_vmcr_aprs(struct kvm_cpu_context *host_ctxt) { - DECLARE_REG(struct vgic_v3_cpu_if *, cpu_if, host_ctxt, 1); + struct pkvm_hyp_vcpu *hyp_vcpu; + struct kvm_vcpu *host_vcpu; - __vgic_v3_restore_vmcr_aprs(kern_hyp_va(cpu_if)); + host_vcpu = get_host_hyp_vcpus_from_vgic_v3_cpu_if(host_ctxt, 1, + &hyp_vcpu); + if (!host_vcpu) + return; + + if (unlikely(hyp_vcpu)) { + struct vgic_v3_cpu_if *hyp_cpu_if, *host_cpu_if; + int i; + + hyp_cpu_if = &hyp_vcpu->vcpu.arch.vgic_cpu.vgic_v3; + host_cpu_if = &host_vcpu->arch.vgic_cpu.vgic_v3; + + hyp_cpu_if->vgic_vmcr = host_cpu_if->vgic_vmcr; + /* Should be a one-off */ + hyp_cpu_if->vgic_sre = (ICC_SRE_EL1_DIB | + ICC_SRE_EL1_DFB | + ICC_SRE_EL1_SRE); + for (i = 0; i < ARRAY_SIZE(host_cpu_if->vgic_ap0r); i++) { + hyp_cpu_if->vgic_ap0r[i] = host_cpu_if->vgic_ap0r[i]; + hyp_cpu_if->vgic_ap1r[i] = host_cpu_if->vgic_ap1r[i]; + } + + __vgic_v3_restore_vmcr_aprs(hyp_cpu_if); + } else { + __vgic_v3_restore_vmcr_aprs(&host_vcpu->arch.vgic_cpu.vgic_v3); + } } static void handle___pkvm_init(struct kvm_cpu_context *host_ctxt) From 94dfcd72f68f327e5b373dfdcb345f4ecba3ea77 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 26 Apr 2022 10:51:25 +0000 Subject: [PATCH 074/457] ANDROID: KVM: arm64: Do not update virtual timer state for protected VMs Protected vCPUs always run with a virtual counter offset of 0, so don't bother trying to update it from the host. 
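The guest-visible effect follows directly from the architecture: CNTVCT_EL0 = CNTPCT_EL0 - CNTVOFF_EL2, so with the offset forced to zero a protected guest simply reads the physical counter. As a one-line sketch:

    static inline u64 guest_cntvct(u64 cntpct, u64 cntvoff)
    {
            /* cntvoff is forced to 0 for protected VMs. */
            return cntpct - cntvoff;
    }
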
Signed-off-by: Marc Zyngier Signed-off-by: Will Deacon Bug: 233587962 Change-Id: I02a30687e36886aa5c97439874e3e4cf066fe6e7 Signed-off-by: Quentin Perret --- arch/arm64/kvm/arch_timer.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c index bb24a76b4224..3c7096cd2a6e 100644 --- a/arch/arm64/kvm/arch_timer.c +++ b/arch/arm64/kvm/arch_timer.c @@ -88,7 +88,9 @@ static u64 timer_get_offset(struct arch_timer_context *ctxt) switch(arch_timer_ctx_index(ctxt)) { case TIMER_VTIMER: - return __vcpu_sys_reg(vcpu, CNTVOFF_EL2); + if (likely(!kvm_vm_is_protected(vcpu->kvm))) + return __vcpu_sys_reg(vcpu, CNTVOFF_EL2); + fallthrough; default: return 0; } @@ -768,6 +770,9 @@ static void update_vtimer_cntvoff(struct kvm_vcpu *vcpu, u64 cntvoff) struct kvm *kvm = vcpu->kvm; struct kvm_vcpu *tmp; + if (unlikely(kvm_vm_is_protected(vcpu->kvm))) + cntvoff = 0; + mutex_lock(&kvm->lock); kvm_for_each_vcpu(i, tmp, kvm) timer_set_offset(vcpu_vtimer(tmp), cntvoff); From 34289140d06f935950c230f0fb5f9cc93064ee68 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Mon, 25 Apr 2022 11:13:46 +0000 Subject: [PATCH 075/457] ANDROID: KVM: arm64: Refactor kvm_vcpu_enable_ptrauth() for hyp use Move kvm_vcpu_enable_ptrauth() to a shared header to be used by hyp in protected mode. No functional change intended. Signed-off-by: Fuad Tabba Signed-off-by: Will Deacon Bug: 233587962 Change-Id: Idb90ae3228fc3acb1fe310227a4f606f47b026a5 Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_emulate.h | 16 ++++++++++++++++ arch/arm64/kvm/reset.c | 16 ---------------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index 9bdba47f7e14..1db0fc8063a9 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -490,4 +490,20 @@ static inline bool vcpu_has_feature(struct kvm_vcpu *vcpu, int feature) return test_bit(feature, vcpu->arch.features); } +static inline int kvm_vcpu_enable_ptrauth(struct kvm_vcpu *vcpu) +{ + /* + * For now make sure that both address/generic pointer authentication + * features are requested by the userspace together and the system + * supports these capabilities. + */ + if (!vcpu_has_feature(vcpu, KVM_ARM_VCPU_PTRAUTH_ADDRESS) || + !vcpu_has_feature(vcpu, KVM_ARM_VCPU_PTRAUTH_GENERIC) || + !system_has_full_ptr_auth()) + return -EINVAL; + + vcpu_set_flag(vcpu, GUEST_HAS_PTRAUTH); + return 0; +} + #endif /* __ARM64_KVM_EMULATE_H__ */ diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index e0267f672b8a..e072792aaafc 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -165,22 +165,6 @@ static void kvm_vcpu_reset_sve(struct kvm_vcpu *vcpu) memset(vcpu->arch.sve_state, 0, vcpu_sve_state_size(vcpu)); } -static int kvm_vcpu_enable_ptrauth(struct kvm_vcpu *vcpu) -{ - /* - * For now make sure that both address/generic pointer authentication - * features are requested by the userspace together and the system - * supports these capabilities. 
- */ - if (!test_bit(KVM_ARM_VCPU_PTRAUTH_ADDRESS, vcpu->arch.features) || - !test_bit(KVM_ARM_VCPU_PTRAUTH_GENERIC, vcpu->arch.features) || - !system_has_full_ptr_auth()) - return -EINVAL; - - vcpu_set_flag(vcpu, GUEST_HAS_PTRAUTH); - return 0; -} - /** * kvm_set_vm_width() - set the register width for the guest * @vcpu: Pointer to the vcpu being configured From f50ed63901b547acefd3408cda331b67cd01110a Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Wed, 27 Apr 2022 11:25:01 +0000 Subject: [PATCH 076/457] ANDROID: KVM: arm64: Initialize hypervisor vm state at EL2 Do not rely on the state of the vm as provided by the host, but initialize it instead at EL2 to a known good and safe state. Signed-off-by: Fuad Tabba Signed-off-by: Will Deacon Bug: 233587962 Change-Id: I8e0e9fd7cdf0b5b4d422260be06920d0550d5f91 Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/nvhe/pkvm.c | 85 ++++++++++++++++++++++++++++++++++ 1 file changed, 85 insertions(+) diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 28b8c8db6339..8f97ae28441b 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -6,6 +6,9 @@ #include #include + +#include + #include #include #include @@ -310,6 +313,79 @@ struct pkvm_hyp_vcpu *pkvm_get_loaded_hyp_vcpu(void) return __this_cpu_read(loaded_hyp_vcpu); } +static void pkvm_vcpu_init_features_from_host(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; + DECLARE_BITMAP(allowed_features, KVM_VCPU_MAX_FEATURES); + + /* No restrictions for non-protected VMs. */ + if (!pkvm_hyp_vcpu_is_protected(hyp_vcpu)) { + bitmap_copy(hyp_vcpu->vcpu.arch.features, + host_vcpu->arch.features, + KVM_VCPU_MAX_FEATURES); + return; + } + + bitmap_zero(allowed_features, KVM_VCPU_MAX_FEATURES); + + /* + * For protected vms, always allow: + * - CPU starting in poweroff state + * - PSCI v0.2 + */ + set_bit(KVM_ARM_VCPU_POWER_OFF, allowed_features); + set_bit(KVM_ARM_VCPU_PSCI_0_2, allowed_features); + + /* + * Check if remaining features are allowed: + * - Performance Monitoring + * - Scalable Vectors + * - Pointer Authentication + */ + if (FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer), PVM_ID_AA64DFR0_ALLOW)) + set_bit(KVM_ARM_VCPU_PMU_V3, allowed_features); + + if (FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_SVE), PVM_ID_AA64PFR0_ALLOW)) + set_bit(KVM_ARM_VCPU_SVE, allowed_features); + + if (FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_API), PVM_ID_AA64ISAR1_ALLOW) && + FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_APA), PVM_ID_AA64ISAR1_ALLOW)) + set_bit(KVM_ARM_VCPU_PTRAUTH_ADDRESS, allowed_features); + + if (FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPI), PVM_ID_AA64ISAR1_ALLOW) && + FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPA), PVM_ID_AA64ISAR1_ALLOW)) + set_bit(KVM_ARM_VCPU_PTRAUTH_GENERIC, allowed_features); + + bitmap_and(hyp_vcpu->vcpu.arch.features, host_vcpu->arch.features, + allowed_features, KVM_VCPU_MAX_FEATURES); + + /* + * Now sanitise the configuration flags that we have inherited + * from the host, as they may refer to features that protected + * mode doesn't support. 
+ */ + if (!vcpu_has_feature(&hyp_vcpu->vcpu,(KVM_ARM_VCPU_SVE))) { + vcpu_clear_flag(&hyp_vcpu->vcpu, GUEST_HAS_SVE); + vcpu_clear_flag(&hyp_vcpu->vcpu, VCPU_SVE_FINALIZED); + } + + if (!vcpu_has_feature(&hyp_vcpu->vcpu, KVM_ARM_VCPU_PTRAUTH_ADDRESS) || + !vcpu_has_feature(&hyp_vcpu->vcpu, KVM_ARM_VCPU_PTRAUTH_GENERIC)) + vcpu_clear_flag(&hyp_vcpu->vcpu, GUEST_HAS_PTRAUTH); +} + +static int pkvm_vcpu_init_ptrauth(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct kvm_vcpu *vcpu = &hyp_vcpu->vcpu; + int ret = 0; + + if (test_bit(KVM_ARM_VCPU_PTRAUTH_ADDRESS, vcpu->arch.features) || + test_bit(KVM_ARM_VCPU_PTRAUTH_GENERIC, vcpu->arch.features)) + ret = kvm_vcpu_enable_ptrauth(vcpu); + + return ret; +} + static void unpin_host_vcpu(struct kvm_vcpu *host_vcpu) { if (host_vcpu) @@ -336,6 +412,7 @@ static void init_pkvm_hyp_vm(struct kvm *host_kvm, struct pkvm_hyp_vm *hyp_vm, hyp_vm->host_kvm = host_kvm; hyp_vm->kvm.created_vcpus = nr_vcpus; hyp_vm->kvm.arch.vtcr = host_mmu.arch.vtcr; + hyp_vm->kvm.arch.pkvm.enabled = READ_ONCE(host_kvm->arch.pkvm.enabled); hyp_vm->kvm.arch.mmu.last_vcpu_ran = last_ran; memset(hyp_vm->kvm.arch.mmu.last_vcpu_ran, -1, pkvm_get_last_ran_size()); } @@ -363,8 +440,16 @@ static int init_pkvm_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu, hyp_vcpu->vcpu.arch.hw_mmu = &hyp_vm->kvm.arch.mmu; hyp_vcpu->vcpu.arch.cflags = READ_ONCE(host_vcpu->arch.cflags); + hyp_vcpu->vcpu.arch.mp_state.mp_state = KVM_MP_STATE_STOPPED; + + pkvm_vcpu_init_features_from_host(hyp_vcpu); + + ret = pkvm_vcpu_init_ptrauth(hyp_vcpu); + if (ret) + goto done; pkvm_vcpu_init_traps(hyp_vcpu); + kvm_reset_pvm_sys_regs(&hyp_vcpu->vcpu); done: if (ret) unpin_host_vcpu(host_vcpu); From 5281f6ba87562d6aae0cc29c3388d8d196b75c99 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 4 Apr 2022 16:10:48 +0100 Subject: [PATCH 077/457] ANDROID: KVM: arm64: Track the SVE state in the hypervisor vcpu structure When dealing with a guest with SVE enabled, make sure the host SVE state is pinned at EL2 S1, and that the hypervisor vCPU state is correctly initialised (and then unpinned on teardown). 
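Pinning asks the hypervisor's memory tracking to keep the host-provided sve_state pages shared with EL2 for the vcpu's lifetime, so the host cannot unmap or donate them away while EL2 dereferences them. The lifecycle pairs up as follows (a sketch of the calls added below, with the size spelled out):

    size_t sz = vcpu_sve_state_size(&hyp_vcpu->vcpu);
    void *sve = kern_hyp_va(hyp_vcpu->vcpu.arch.sve_state);

    hyp_pin_shared_mem(sve, sve + sz);     /* at vcpu init    */
    /* ... vcpu runs ...                                      */
    hyp_unpin_shared_mem(sve, sve + sz);   /* at VM teardown  */
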
Signed-off-by: Marc Zyngier Signed-off-by: Will Deacon Bug: 233587962 Change-Id: Ic4d0ef9a6124701026cd56f6725ab4737857ed5b Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/nvhe/hyp-main.c | 9 ++++--- arch/arm64/kvm/hyp/nvhe/pkvm.c | 40 ++++++++++++++++++++++++++++-- 2 files changed, 43 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index d4ae517d597f..4e4b2175dce0 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -391,9 +391,6 @@ static void flush_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) if (vcpu_get_flag(host_vcpu, PKVM_HOST_STATE_DIRTY)) __flush_hyp_vcpu(hyp_vcpu); - hyp_vcpu->vcpu.arch.sve_state = kern_hyp_va(host_vcpu->arch.sve_state); - hyp_vcpu->vcpu.arch.sve_max_vl = host_vcpu->arch.sve_max_vl; - hyp_vcpu->vcpu.arch.hcr_el2 = HCR_GUEST_FLAGS & ~(HCR_RW | HCR_TWI | HCR_TWE); hyp_vcpu->vcpu.arch.hcr_el2 |= READ_ONCE(host_vcpu->arch.hcr_el2); @@ -463,7 +460,11 @@ static void sync_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu, u32 exit_reason) BUG(); } - vcpu_clear_flag(host_vcpu, PC_UPDATE_REQ); + if (pkvm_hyp_vcpu_is_protected(hyp_vcpu)) + vcpu_clear_flag(host_vcpu, PC_UPDATE_REQ); + else + host_vcpu->arch.iflags = hyp_vcpu->vcpu.arch.iflags; + hyp_vcpu->exit_code = exit_reason; } diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 8f97ae28441b..251830e0938f 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -392,13 +392,29 @@ static void unpin_host_vcpu(struct kvm_vcpu *host_vcpu) hyp_unpin_shared_mem(host_vcpu, host_vcpu + 1); } +static void unpin_host_sve_state(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + void *sve_state; + + if (!test_bit(KVM_ARM_VCPU_SVE, hyp_vcpu->vcpu.arch.features)) + return; + + sve_state = kern_hyp_va(hyp_vcpu->vcpu.arch.sve_state); + hyp_unpin_shared_mem(sve_state, + sve_state + vcpu_sve_state_size(&hyp_vcpu->vcpu)); +} + static void unpin_host_vcpus(struct pkvm_hyp_vcpu *hyp_vcpus[], unsigned int nr_vcpus) { int i; - for (i = 0; i < nr_vcpus; i++) - unpin_host_vcpu(hyp_vcpus[i]->host_vcpu); + for (i = 0; i < nr_vcpus; i++) { + struct pkvm_hyp_vcpu *hyp_vcpu = hyp_vcpus[i]; + + unpin_host_vcpu(hyp_vcpu->host_vcpu); + unpin_host_sve_state(hyp_vcpu); + } } static size_t pkvm_get_last_ran_size(void) @@ -448,6 +464,26 @@ static int init_pkvm_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu, if (ret) goto done; + if (test_bit(KVM_ARM_VCPU_SVE, hyp_vcpu->vcpu.arch.features)) { + size_t sve_state_size; + void *sve_state; + + hyp_vcpu->vcpu.arch.sve_state = READ_ONCE(host_vcpu->arch.sve_state); + hyp_vcpu->vcpu.arch.sve_max_vl = READ_ONCE(host_vcpu->arch.sve_max_vl); + + sve_state = kern_hyp_va(hyp_vcpu->vcpu.arch.sve_state); + sve_state_size = vcpu_sve_state_size(&hyp_vcpu->vcpu); + + if (!hyp_vcpu->vcpu.arch.sve_state || !sve_state_size || + hyp_pin_shared_mem(sve_state, sve_state + sve_state_size)) { + clear_bit(KVM_ARM_VCPU_SVE, hyp_vcpu->vcpu.arch.features); + hyp_vcpu->vcpu.arch.sve_state = NULL; + hyp_vcpu->vcpu.arch.sve_max_vl = 0; + ret = -EINVAL; + goto done; + } + } + pkvm_vcpu_init_traps(hyp_vcpu); kvm_reset_pvm_sys_regs(&hyp_vcpu->vcpu); done: From 0faf32c7273ffe5dd7d71ec73d12db8eddfe18b6 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Tue, 26 Apr 2022 12:36:39 +0000 Subject: [PATCH 078/457] ANDROID: KVM: arm64: Add HVC handling for protected guests at EL2 Rather than forwarding guest hypercalls back to the host for handling, implement some basic handling at EL2 which will later be extending to 
provide additional functionality such as PSCI. Signed-off-by: Fuad Tabba Signed-off-by: Will Deacon Bug: 233587962 Change-Id: I14613c416078818b25bb29ed8899d7b71f8c40cc Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/include/nvhe/pkvm.h | 2 ++ arch/arm64/kvm/hyp/nvhe/hyp-main.c | 26 ++++++++++++++++++++++++++ arch/arm64/kvm/hyp/nvhe/pkvm.c | 22 ++++++++++++++++++++++ arch/arm64/kvm/hyp/nvhe/switch.c | 1 + 4 files changed, 51 insertions(+) diff --git a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h index 2e4842aa0c38..1168b9ffa77b 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h +++ b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h @@ -94,4 +94,6 @@ bool kvm_handle_pvm_restricted(struct kvm_vcpu *vcpu, u64 *exit_code); void kvm_reset_pvm_sys_regs(struct kvm_vcpu *vcpu); int kvm_check_pvm_sysreg_table(void); +bool kvm_handle_pvm_hvc64(struct kvm_vcpu *vcpu, u64 *exit_code); + #endif /* __ARM64_KVM_NVHE_PKVM_H__ */ diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 4e4b2175dce0..b825b6778ffb 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -4,6 +4,8 @@ * Author: Andrew Scull */ +#include + #include #include @@ -44,6 +46,13 @@ static void handle_pvm_entry_wfx(struct pkvm_hyp_vcpu *hyp_vcpu) } } +static void handle_pvm_entry_hvc64(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + u64 ret = READ_ONCE(hyp_vcpu->host_vcpu->arch.ctxt.regs.regs[0]); + + vcpu_set_reg(&hyp_vcpu->vcpu, 0, ret); +} + static void handle_pvm_entry_sys64(struct pkvm_hyp_vcpu *hyp_vcpu) { struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; @@ -177,6 +186,21 @@ static void handle_pvm_exit_sys64(struct pkvm_hyp_vcpu *hyp_vcpu) } } +static void handle_pvm_exit_hvc64(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; + int i; + + WRITE_ONCE(host_vcpu->arch.fault.esr_el2, + hyp_vcpu->vcpu.arch.fault.esr_el2); + + /* Pass the hvc function id (r0) as well as any potential arguments. */ + for (i = 0; i < 8; i++) { + WRITE_ONCE(host_vcpu->arch.ctxt.regs.regs[i], + vcpu_get_reg(&hyp_vcpu->vcpu, i)); + } +} + static void handle_pvm_exit_iabt(struct pkvm_hyp_vcpu *hyp_vcpu) { WRITE_ONCE(hyp_vcpu->host_vcpu->arch.fault.esr_el2, @@ -250,6 +274,7 @@ static void handle_vm_exit_abt(struct pkvm_hyp_vcpu *hyp_vcpu) static const hyp_entry_exit_handler_fn entry_hyp_pvm_handlers[] = { [0 ... ESR_ELx_EC_MAX] = NULL, [ESR_ELx_EC_WFx] = handle_pvm_entry_wfx, + [ESR_ELx_EC_HVC64] = handle_pvm_entry_hvc64, [ESR_ELx_EC_SYS64] = handle_pvm_entry_sys64, [ESR_ELx_EC_IABT_LOW] = handle_pvm_entry_iabt, [ESR_ELx_EC_DABT_LOW] = handle_pvm_entry_dabt, @@ -258,6 +283,7 @@ static const hyp_entry_exit_handler_fn entry_hyp_pvm_handlers[] = { static const hyp_entry_exit_handler_fn exit_hyp_pvm_handlers[] = { [0 ... ESR_ELx_EC_MAX] = NULL, [ESR_ELx_EC_WFx] = handle_pvm_exit_wfx, + [ESR_ELx_EC_HVC64] = handle_pvm_exit_hvc64, [ESR_ELx_EC_SYS64] = handle_pvm_exit_sys64, [ESR_ELx_EC_IABT_LOW] = handle_pvm_exit_iabt, [ESR_ELx_EC_DABT_LOW] = handle_pvm_exit_dabt, diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 251830e0938f..9de463ad0d3b 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -7,6 +7,8 @@ #include #include +#include + #include #include @@ -803,3 +805,23 @@ err_unlock: hyp_spin_unlock(&vm_table_lock); return err; } + +/* + * Handler for protected VM HVC calls. 
+ * + * Returns true if the hypervisor has handled the exit, and control should go + * back to the guest, or false if it hasn't. + */ +bool kvm_handle_pvm_hvc64(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + u32 fn = smccc_get_function(vcpu); + + switch (fn) { + case ARM_SMCCC_VERSION_FUNC_ID: + /* Nothing to be handled by the host. Go back to the guest. */ + smccc_set_retval(vcpu, ARM_SMCCC_VERSION_1_1, 0, 0, 0); + return true; + default: + return false; + } +} diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index 6bd7511b5e72..4b904252e1db 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -191,6 +191,7 @@ static const exit_handler_fn hyp_exit_handlers[] = { static const exit_handler_fn pvm_exit_handlers[] = { [0 ... ESR_ELx_EC_MAX] = NULL, + [ESR_ELx_EC_HVC64] = kvm_handle_pvm_hvc64, [ESR_ELx_EC_SYS64] = kvm_handle_pvm_sys64, [ESR_ELx_EC_SVE] = kvm_handle_pvm_restricted, [ESR_ELx_EC_FP_ASIMD] = kvm_hyp_handle_fpsimd, From a9c469cccefb9ad4a06ba45ea18763fc4b42e485 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Mon, 25 Apr 2022 10:26:34 +0000 Subject: [PATCH 079/457] ANDROID: KVM: arm64: Move pstate reset values to kvm_arm.h Move the macro defines of the pstate reset values to a shared header to be used by hyp in protected mode. No functional change intended. Signed-off-by: Fuad Tabba Signed-off-by: Will Deacon Bug: 233587962 Change-Id: Ib98734d2ced07a958427c6552a9c22d159b85ad1 Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_arm.h | 9 +++++++++ arch/arm64/kvm/reset.c | 9 --------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index 89e63585dae4..3a49a03513d0 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -361,4 +361,13 @@ #define CPACR_EL1_DEFAULT (CPACR_EL1_FPEN_EL0EN | CPACR_EL1_FPEN_EL1EN |\ CPACR_EL1_ZEN_EL1EN) +/* + * ARMv8 Reset Values + */ +#define VCPU_RESET_PSTATE_EL1 (PSR_MODE_EL1h | PSR_A_BIT | PSR_I_BIT | \ + PSR_F_BIT | PSR_D_BIT) + +#define VCPU_RESET_PSTATE_SVC (PSR_AA32_MODE_SVC | PSR_AA32_A_BIT | \ + PSR_AA32_I_BIT | PSR_AA32_F_BIT) + #endif /* __ARM64_KVM_ARM_H__ */ diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index e072792aaafc..f60b9b53ee65 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -32,15 +32,6 @@ /* Maximum phys_shift supported for any VM on this host */ static u32 kvm_ipa_limit; -/* - * ARMv8 Reset Values - */ -#define VCPU_RESET_PSTATE_EL1 (PSR_MODE_EL1h | PSR_A_BIT | PSR_I_BIT | \ - PSR_F_BIT | PSR_D_BIT) - -#define VCPU_RESET_PSTATE_SVC (PSR_AA32_MODE_SVC | PSR_AA32_A_BIT | \ - PSR_AA32_I_BIT | PSR_AA32_F_BIT) - unsigned int kvm_sve_max_vl; int kvm_arm_init_sve(void) From 9fb1fb85faee2cbd7afe8a0288d3cdee96018f03 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Mon, 25 Apr 2022 10:16:34 +0000 Subject: [PATCH 080/457] ANDROID: KVM: arm64: Move some kvm_psci functions to a shared header Move some PSCI functions and macros to a shared header to be used by hyp in protected mode. No functional change intended. 
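As an aside, the helpers being moved compose naturally at their call sites; the
wrapper below is purely illustrative (it is not part of this patch) and only
shows the intended use of psci_affinity_mask() for MPIDR matching:

	/* Illustrative only: does @mpidr match @target at @level? */
	static bool affinity_matches(u64 mpidr, unsigned long target,
				     unsigned long level)
	{
		unsigned long mask = psci_affinity_mask(level);

		/* e.g. level 1 clears Aff0, so all threads of a core match */
		return mask && ((mpidr & mask) == (target & mask));
	}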
Signed-off-by: Fuad Tabba Signed-off-by: Will Deacon Bug: 233587962 Change-Id: Ibe84564f423cd0281f3dc33d9801b474fe8f2db9 Signed-off-by: Quentin Perret --- arch/arm64/kvm/psci.c | 28 ---------------------------- include/kvm/arm_psci.h | 29 +++++++++++++++++++++++++++++ 2 files changed, 29 insertions(+), 28 deletions(-) diff --git a/arch/arm64/kvm/psci.c b/arch/arm64/kvm/psci.c index 7fbc4c1b9df0..0f7001d726a2 100644 --- a/arch/arm64/kvm/psci.c +++ b/arch/arm64/kvm/psci.c @@ -21,16 +21,6 @@ * as described in ARM document number ARM DEN 0022A. */ -#define AFFINITY_MASK(level) ~((0x1UL << ((level) * MPIDR_LEVEL_BITS)) - 1) - -static unsigned long psci_affinity_mask(unsigned long affinity_level) -{ - if (affinity_level <= 3) - return MPIDR_HWID_BITMASK & AFFINITY_MASK(affinity_level); - - return 0; -} - static unsigned long kvm_psci_vcpu_suspend(struct kvm_vcpu *vcpu) { /* @@ -51,12 +41,6 @@ static unsigned long kvm_psci_vcpu_suspend(struct kvm_vcpu *vcpu) return PSCI_RET_SUCCESS; } -static inline bool kvm_psci_valid_affinity(struct kvm_vcpu *vcpu, - unsigned long affinity) -{ - return !(affinity & ~MPIDR_HWID_BITMASK); -} - static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu) { struct vcpu_reset_state *reset_state; @@ -204,18 +188,6 @@ static void kvm_psci_system_suspend(struct kvm_vcpu *vcpu) run->exit_reason = KVM_EXIT_SYSTEM_EVENT; } -static void kvm_psci_narrow_to_32bit(struct kvm_vcpu *vcpu) -{ - int i; - - /* - * Zero the input registers' upper 32 bits. They will be fully - * zeroed on exit, so we're fine changing them in place. - */ - for (i = 1; i < 4; i++) - vcpu_set_reg(vcpu, i, lower_32_bits(vcpu_get_reg(vcpu, i))); -} - static unsigned long kvm_psci_check_allowed_function(struct kvm_vcpu *vcpu, u32 fn) { /* diff --git a/include/kvm/arm_psci.h b/include/kvm/arm_psci.h index 6e55b9283789..817093052396 100644 --- a/include/kvm/arm_psci.h +++ b/include/kvm/arm_psci.h @@ -36,6 +36,35 @@ static inline int kvm_psci_version(struct kvm_vcpu *vcpu) return KVM_ARM_PSCI_0_1; } +/* Narrow the PSCI register arguments (r1 to r3) to 32 bits. */ +static inline void kvm_psci_narrow_to_32bit(struct kvm_vcpu *vcpu) +{ + int i; + + /* + * Zero the input registers' upper 32 bits. They will be fully + * zeroed on exit, so we're fine changing them in place. + */ + for (i = 1; i < 4; i++) + vcpu_set_reg(vcpu, i, lower_32_bits(vcpu_get_reg(vcpu, i))); +} + +static inline bool kvm_psci_valid_affinity(struct kvm_vcpu *vcpu, + unsigned long affinity) +{ + return !(affinity & ~MPIDR_HWID_BITMASK); +} + + +#define AFFINITY_MASK(level) ~((0x1UL << ((level) * MPIDR_LEVEL_BITS)) - 1) + +static inline unsigned long psci_affinity_mask(unsigned long affinity_level) +{ + if (affinity_level <= 3) + return MPIDR_HWID_BITMASK & AFFINITY_MASK(affinity_level); + + return 0; +} int kvm_psci_call(struct kvm_vcpu *vcpu); From ba74c827975816471500f8b7317e2fd5afe91d7b Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Wed, 11 May 2022 09:39:53 +0000 Subject: [PATCH 081/457] ANDROID: KVM: arm64: Factor out vcpu_reset code for core registers and PSCI Factor out logic that resets a vcpu's core registers, including additional PSCI handling. This code will be reused when resetting VMs in protected mode. 
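A minimal sketch of the intended call pattern at a reset site, mirroring the
existing kvm_reset_vcpu() flow (ordering matters: the PSCI reset state is
applied after the sys_reg reset):

	kvm_reset_vcpu_core(vcpu);	/* zero GPRs/FP/SPSRs, set reset PSTATE */
	kvm_reset_sys_regs(vcpu);	/* architectural sys_reg reset values */
	if (reset_state->reset)
		kvm_reset_vcpu_psci(vcpu, reset_state);	/* PSCI CPU_ON pc/r0/endianness */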
Signed-off-by: Fuad Tabba Signed-off-by: Will Deacon Bug: 233587962 Change-Id: I22468be1d382e05e39557e32ea09a023173dbf48 Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_emulate.h | 41 +++++++++++++++++++++++++ arch/arm64/kvm/reset.c | 45 +++++----------------------- 2 files changed, 48 insertions(+), 38 deletions(-) diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index 1db0fc8063a9..d1cc41d494fa 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -506,4 +506,45 @@ static inline int kvm_vcpu_enable_ptrauth(struct kvm_vcpu *vcpu) return 0; } +/* Reset a vcpu's core registers. */ +static inline void kvm_reset_vcpu_core(struct kvm_vcpu *vcpu) +{ + u32 pstate; + + if (vcpu_el1_is_32bit(vcpu)) { + pstate = VCPU_RESET_PSTATE_SVC; + } else { + pstate = VCPU_RESET_PSTATE_EL1; + } + + /* Reset core registers */ + memset(vcpu_gp_regs(vcpu), 0, sizeof(*vcpu_gp_regs(vcpu))); + memset(&vcpu->arch.ctxt.fp_regs, 0, sizeof(vcpu->arch.ctxt.fp_regs)); + vcpu->arch.ctxt.spsr_abt = 0; + vcpu->arch.ctxt.spsr_und = 0; + vcpu->arch.ctxt.spsr_irq = 0; + vcpu->arch.ctxt.spsr_fiq = 0; + vcpu_gp_regs(vcpu)->pstate = pstate; +} + +/* PSCI reset handling for a vcpu. */ +static inline void kvm_reset_vcpu_psci(struct kvm_vcpu *vcpu, + struct vcpu_reset_state *reset_state) +{ + unsigned long target_pc = reset_state->pc; + + /* Gracefully handle Thumb2 entry point */ + if (vcpu_mode_is_32bit(vcpu) && (target_pc & 1)) { + target_pc &= ~1UL; + vcpu_set_thumb(vcpu); + } + + /* Propagate caller endianness */ + if (reset_state->be) + kvm_vcpu_set_be(vcpu); + + *vcpu_pc(vcpu) = target_pc; + vcpu_set_reg(vcpu, 0, reset_state->r0); +} + #endif /* __ARM64_KVM_EMULATE_H__ */ diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index f60b9b53ee65..29a185472f8b 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -109,7 +109,7 @@ static int kvm_vcpu_finalize_sve(struct kvm_vcpu *vcpu) kfree(buf); return ret; } - + vcpu->arch.sve_state = buf; vcpu_set_flag(vcpu, VCPU_SVE_FINALIZED); return 0; @@ -226,7 +226,6 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu) struct vcpu_reset_state reset_state; int ret; bool loaded; - u32 pstate; mutex_lock(&vcpu->kvm->lock); ret = kvm_set_vm_width(vcpu); @@ -265,29 +264,13 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu) } } - switch (vcpu->arch.target) { - default: - if (vcpu_el1_is_32bit(vcpu)) { - pstate = VCPU_RESET_PSTATE_SVC; - } else { - pstate = VCPU_RESET_PSTATE_EL1; - } - - if (kvm_vcpu_has_pmu(vcpu) && !kvm_arm_support_pmu_v3()) { - ret = -EINVAL; - goto out; - } - break; + if (kvm_vcpu_has_pmu(vcpu) && !kvm_arm_support_pmu_v3()) { + ret = -EINVAL; + goto out; } /* Reset core registers */ - memset(vcpu_gp_regs(vcpu), 0, sizeof(*vcpu_gp_regs(vcpu))); - memset(&vcpu->arch.ctxt.fp_regs, 0, sizeof(vcpu->arch.ctxt.fp_regs)); - vcpu->arch.ctxt.spsr_abt = 0; - vcpu->arch.ctxt.spsr_und = 0; - vcpu->arch.ctxt.spsr_irq = 0; - vcpu->arch.ctxt.spsr_fiq = 0; - vcpu_gp_regs(vcpu)->pstate = pstate; + kvm_reset_vcpu_core(vcpu); /* Reset system registers */ kvm_reset_sys_regs(vcpu); @@ -296,22 +279,8 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu) * Additional reset state handling that PSCI may have imposed on us. * Must be done after all the sys_reg reset. 
*/ - if (reset_state.reset) { - unsigned long target_pc = reset_state.pc; - - /* Gracefully handle Thumb2 entry point */ - if (vcpu_mode_is_32bit(vcpu) && (target_pc & 1)) { - target_pc &= ~1UL; - vcpu_set_thumb(vcpu); - } - - /* Propagate caller endianness */ - if (reset_state.be) - kvm_vcpu_set_be(vcpu); - - *vcpu_pc(vcpu) = target_pc; - vcpu_set_reg(vcpu, 0, reset_state.r0); - } + if (reset_state.reset) + kvm_reset_vcpu_psci(vcpu, &reset_state); /* Reset timer */ ret = kvm_timer_vcpu_reset(vcpu); From 8d7b422918064236c3e21b3fc4b90ea178e9c813 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Tue, 26 Apr 2022 12:51:01 +0000 Subject: [PATCH 082/457] ANDROID: KVM: arm64: Handle PSCI for protected VMs in EL2 Add PSCI 1.1 support for protected VMs at EL2. Signed-off-by: Fuad Tabba Signed-off-by: Will Deacon Bug: 233587962 Change-Id: I0e23fdc1f2d62563fd806400aff70be49337dd22 Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/include/nvhe/pkvm.h | 13 ++ arch/arm64/kvm/hyp/nvhe/hyp-main.c | 70 +++++- arch/arm64/kvm/hyp/nvhe/pkvm.c | 302 ++++++++++++++++++++++++- 3 files changed, 382 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h index 1168b9ffa77b..6160d1a34fa2 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h +++ b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h @@ -29,6 +29,15 @@ struct pkvm_hyp_vcpu { /* Tracks exit code for the protected guest. */ u32 exit_code; + + /* + * Track the power state transition of a protected vcpu. + * Can be in one of three states: + * PSCI_0_2_AFFINITY_LEVEL_ON + * PSCI_0_2_AFFINITY_LEVEL_OFF + * PSCI_0_2_AFFINITY_LEVEL_PENDING + */ + int power_state; }; /* @@ -94,6 +103,10 @@ bool kvm_handle_pvm_restricted(struct kvm_vcpu *vcpu, u64 *exit_code); void kvm_reset_pvm_sys_regs(struct kvm_vcpu *vcpu); int kvm_check_pvm_sysreg_table(void); +void pkvm_reset_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu); + bool kvm_handle_pvm_hvc64(struct kvm_vcpu *vcpu, u64 *exit_code); +struct pkvm_hyp_vcpu *pkvm_mpidr_to_hyp_vcpu(struct pkvm_hyp_vm *vm, u64 mpidr); + #endif /* __ARM64_KVM_NVHE_PKVM_H__ */ diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index b825b6778ffb..f70baf884323 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -21,6 +21,7 @@ #include #include +#include #include "../../sys_regs.h" @@ -48,8 +49,38 @@ static void handle_pvm_entry_wfx(struct pkvm_hyp_vcpu *hyp_vcpu) static void handle_pvm_entry_hvc64(struct pkvm_hyp_vcpu *hyp_vcpu) { + u32 psci_fn = smccc_get_function(&hyp_vcpu->vcpu); u64 ret = READ_ONCE(hyp_vcpu->host_vcpu->arch.ctxt.regs.regs[0]); + switch (psci_fn) { + case PSCI_0_2_FN_CPU_ON: + case PSCI_0_2_FN64_CPU_ON: + /* + * Check whether the cpu_on request to the host was successful. + * If not, reset the vcpu state from ON_PENDING to OFF. + * This could happen if this vcpu attempted to turn on the other + * vcpu while the other one is in the process of turning itself + * off. 
+ */ + if (ret != PSCI_RET_SUCCESS) { + unsigned long cpu_id = smccc_get_arg1(&hyp_vcpu->vcpu); + struct pkvm_hyp_vcpu *target_vcpu; + struct pkvm_hyp_vm *hyp_vm; + + hyp_vm = pkvm_hyp_vcpu_to_hyp_vm(hyp_vcpu); + target_vcpu = pkvm_mpidr_to_hyp_vcpu(hyp_vm, cpu_id); + + if (target_vcpu && READ_ONCE(target_vcpu->power_state) == PSCI_0_2_AFFINITY_LEVEL_ON_PENDING) + WRITE_ONCE(target_vcpu->power_state, PSCI_0_2_AFFINITY_LEVEL_OFF); + + ret = PSCI_RET_INTERNAL_FAILURE; + } + + break; + default: + break; + } + vcpu_set_reg(&hyp_vcpu->vcpu, 0, ret); } @@ -189,13 +220,45 @@ static void handle_pvm_exit_sys64(struct pkvm_hyp_vcpu *hyp_vcpu) static void handle_pvm_exit_hvc64(struct pkvm_hyp_vcpu *hyp_vcpu) { struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; - int i; + int n, i; + + switch (smccc_get_function(&hyp_vcpu->vcpu)) { + /* + * CPU_ON takes 3 arguments, however, to wake up the target vcpu the + * host only needs to know the target's cpu_id, which is passed as the + * first argument. The processing of the reset state is done at hyp. + */ + case PSCI_0_2_FN_CPU_ON: + case PSCI_0_2_FN64_CPU_ON: + n = 2; + break; + + case PSCI_0_2_FN_CPU_OFF: + case PSCI_0_2_FN_SYSTEM_OFF: + case PSCI_0_2_FN_SYSTEM_RESET: + case PSCI_0_2_FN_CPU_SUSPEND: + case PSCI_0_2_FN64_CPU_SUSPEND: + n = 1; + break; + + case PSCI_1_1_FN_SYSTEM_RESET2: + case PSCI_1_1_FN64_SYSTEM_RESET2: + n = 3; + break; + + /* + * The rest are either blocked or handled by HYP, so we should + * really never be here. + */ + default: + BUG(); + } WRITE_ONCE(host_vcpu->arch.fault.esr_el2, hyp_vcpu->vcpu.arch.fault.esr_el2); /* Pass the hvc function id (r0) as well as any potential arguments. */ - for (i = 0; i < 8; i++) { + for (i = 0; i < n; i++) { WRITE_ONCE(host_vcpu->arch.ctxt.regs.regs[i], vcpu_get_reg(&hyp_vcpu->vcpu, i)); } @@ -408,6 +471,9 @@ static void flush_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) hyp_entry_exit_handler_fn ec_handler; u8 esr_ec; + if (READ_ONCE(hyp_vcpu->power_state) == PSCI_0_2_AFFINITY_LEVEL_ON_PENDING) + pkvm_reset_vcpu(hyp_vcpu); + /* * If we deal with a non-protected guest and the state is potentially * dirty (from a host perspective), copy the state back into the hyp diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 9de463ad0d3b..89c3a0ad2933 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -8,6 +8,7 @@ #include #include +#include #include @@ -388,6 +389,23 @@ static int pkvm_vcpu_init_ptrauth(struct pkvm_hyp_vcpu *hyp_vcpu) return ret; } +static void pkvm_vcpu_init_psci(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct vcpu_reset_state *reset_state = &hyp_vcpu->vcpu.arch.reset_state; + + if (test_bit(KVM_ARM_VCPU_POWER_OFF, hyp_vcpu->vcpu.arch.features)) { + reset_state->reset = false; + hyp_vcpu->power_state = PSCI_0_2_AFFINITY_LEVEL_OFF; + } else { + struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; + + reset_state->pc = READ_ONCE(host_vcpu->arch.ctxt.regs.pc); + reset_state->r0 = READ_ONCE(host_vcpu->arch.ctxt.regs.regs[0]); + reset_state->reset = true; + hyp_vcpu->power_state = PSCI_0_2_AFFINITY_LEVEL_ON_PENDING; + } +} + static void unpin_host_vcpu(struct kvm_vcpu *host_vcpu) { if (host_vcpu) @@ -488,6 +506,7 @@ static int init_pkvm_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu, pkvm_vcpu_init_traps(hyp_vcpu); kvm_reset_pvm_sys_regs(&hyp_vcpu->vcpu); + pkvm_vcpu_init_psci(hyp_vcpu); done: if (ret) unpin_host_vcpu(host_vcpu); @@ -806,6 +825,284 @@ err_unlock: return err; } +/* + * This function sets the registers on the vcpu to their 
architecturally defined + * reset values. + * + * Note: Can only be called by the vcpu on itself, after it has been turned on. + */ +void pkvm_reset_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct vcpu_reset_state *reset_state = &hyp_vcpu->vcpu.arch.reset_state; + + WARN_ON(!reset_state->reset); + + pkvm_vcpu_init_ptrauth(hyp_vcpu); + kvm_reset_vcpu_core(&hyp_vcpu->vcpu); + kvm_reset_pvm_sys_regs(&hyp_vcpu->vcpu); + + /* Must be done after reseting sys registers. */ + kvm_reset_vcpu_psci(&hyp_vcpu->vcpu, reset_state); + + reset_state->reset = false; + + hyp_vcpu->exit_code = 0; + + WARN_ON(hyp_vcpu->power_state != PSCI_0_2_AFFINITY_LEVEL_ON_PENDING); + WRITE_ONCE(hyp_vcpu->vcpu.arch.mp_state.mp_state, KVM_MP_STATE_RUNNABLE); + WRITE_ONCE(hyp_vcpu->power_state, PSCI_0_2_AFFINITY_LEVEL_ON); +} + +struct pkvm_hyp_vcpu *pkvm_mpidr_to_hyp_vcpu(struct pkvm_hyp_vm *hyp_vm, + u64 mpidr) +{ + int i; + + mpidr &= MPIDR_HWID_BITMASK; + + for (i = 0; i < hyp_vm->nr_vcpus; i++) { + struct pkvm_hyp_vcpu *hyp_vcpu = hyp_vm->vcpus[i]; + + if (mpidr == kvm_vcpu_get_mpidr_aff(&hyp_vcpu->vcpu)) + return hyp_vcpu; + } + + return NULL; +} + +/* + * Returns true if the hypervisor has handled the PSCI call, and control should + * go back to the guest, or false if the host needs to do some additional work + * (i.e., wake up the vcpu). + */ +static bool pvm_psci_vcpu_on(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct pkvm_hyp_vm *hyp_vm = pkvm_hyp_vcpu_to_hyp_vm(hyp_vcpu); + struct vcpu_reset_state *reset_state; + struct pkvm_hyp_vcpu *target; + unsigned long cpu_id, ret; + int power_state; + + cpu_id = smccc_get_arg1(&hyp_vcpu->vcpu); + if (!kvm_psci_valid_affinity(&hyp_vcpu->vcpu, cpu_id)) { + ret = PSCI_RET_INVALID_PARAMS; + goto error; + } + + target = pkvm_mpidr_to_hyp_vcpu(hyp_vm, cpu_id); + if (!target) { + ret = PSCI_RET_INVALID_PARAMS; + goto error; + } + + /* + * Make sure the requested vcpu is not on to begin with. + * Atomic to avoid race between vcpus trying to power on the same vcpu. + */ + power_state = cmpxchg(&target->power_state, + PSCI_0_2_AFFINITY_LEVEL_OFF, + PSCI_0_2_AFFINITY_LEVEL_ON_PENDING); + switch (power_state) { + case PSCI_0_2_AFFINITY_LEVEL_ON_PENDING: + ret = PSCI_RET_ON_PENDING; + goto error; + case PSCI_0_2_AFFINITY_LEVEL_ON: + ret = PSCI_RET_ALREADY_ON; + goto error; + case PSCI_0_2_AFFINITY_LEVEL_OFF: + break; + default: + ret = PSCI_RET_INTERNAL_FAILURE; + goto error; + } + + reset_state = &target->vcpu.arch.reset_state; + reset_state->pc = smccc_get_arg2(&hyp_vcpu->vcpu); + reset_state->r0 = smccc_get_arg3(&hyp_vcpu->vcpu); + /* Propagate caller endianness */ + reset_state->be = kvm_vcpu_is_be(&hyp_vcpu->vcpu); + reset_state->reset = true; + + /* + * Return to the host, which should make the KVM_REQ_VCPU_RESET request + * as well as kvm_vcpu_wake_up() to schedule the vcpu. + */ + return false; + +error: + /* If there's an error go back straight to the guest. 
*/ + smccc_set_retval(&hyp_vcpu->vcpu, ret, 0, 0, 0); + return true; +} + +static bool pvm_psci_vcpu_affinity_info(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + unsigned long target_affinity_mask, target_affinity, lowest_affinity_level; + struct pkvm_hyp_vm *hyp_vm = pkvm_hyp_vcpu_to_hyp_vm(hyp_vcpu); + struct kvm_vcpu *vcpu = &hyp_vcpu->vcpu; + unsigned long mpidr, ret; + int i, matching_cpus = 0; + + target_affinity = smccc_get_arg1(vcpu); + lowest_affinity_level = smccc_get_arg2(vcpu); + if (!kvm_psci_valid_affinity(vcpu, target_affinity)) { + ret = PSCI_RET_INVALID_PARAMS; + goto done; + } + + /* Determine target affinity mask */ + target_affinity_mask = psci_affinity_mask(lowest_affinity_level); + if (!target_affinity_mask) { + ret = PSCI_RET_INVALID_PARAMS; + goto done; + } + + /* Ignore other bits of target affinity */ + target_affinity &= target_affinity_mask; + ret = PSCI_0_2_AFFINITY_LEVEL_OFF; + + /* + * If at least one vcpu matching target affinity is ON then return ON, + * then if at least one is PENDING_ON then return PENDING_ON. + * Otherwise, return OFF. + */ + for (i = 0; i < hyp_vm->nr_vcpus; i++) { + struct pkvm_hyp_vcpu *target = hyp_vm->vcpus[i]; + + mpidr = kvm_vcpu_get_mpidr_aff(&target->vcpu); + + if ((mpidr & target_affinity_mask) == target_affinity) { + int power_state; + + matching_cpus++; + power_state = READ_ONCE(target->power_state); + switch (power_state) { + case PSCI_0_2_AFFINITY_LEVEL_ON_PENDING: + ret = PSCI_0_2_AFFINITY_LEVEL_ON_PENDING; + break; + case PSCI_0_2_AFFINITY_LEVEL_ON: + ret = PSCI_0_2_AFFINITY_LEVEL_ON; + goto done; + case PSCI_0_2_AFFINITY_LEVEL_OFF: + break; + default: + ret = PSCI_RET_INTERNAL_FAILURE; + goto done; + } + } + } + + if (!matching_cpus) + ret = PSCI_RET_INVALID_PARAMS; + +done: + /* Nothing to be handled by the host. Go back to the guest. */ + smccc_set_retval(vcpu, ret, 0, 0, 0); + return true; +} + +/* + * Returns true if the hypervisor has handled the PSCI call, and control should + * go back to the guest, or false if the host needs to do some additional work + * (e.g., turn off and update vcpu scheduling status). + */ +static bool pvm_psci_vcpu_off(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct kvm_vcpu *vcpu = &hyp_vcpu->vcpu; + + WARN_ON(vcpu->arch.mp_state.mp_state == KVM_MP_STATE_STOPPED); + WARN_ON(hyp_vcpu->power_state != PSCI_0_2_AFFINITY_LEVEL_ON); + + WRITE_ONCE(vcpu->arch.mp_state.mp_state, KVM_MP_STATE_STOPPED); + WRITE_ONCE(hyp_vcpu->power_state, PSCI_0_2_AFFINITY_LEVEL_OFF); + + /* Return to the host so that it can finish powering off the vcpu. */ + return false; +} + +static bool pvm_psci_version(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + /* Nothing to be handled by the host. Go back to the guest. */ + smccc_set_retval(&hyp_vcpu->vcpu, KVM_ARM_PSCI_1_1, 0, 0, 0); + return true; +} + +static bool pvm_psci_not_supported(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + /* Nothing to be handled by the host. Go back to the guest. 
*/ + smccc_set_retval(&hyp_vcpu->vcpu, PSCI_RET_NOT_SUPPORTED, 0, 0, 0); + return true; +} + +static bool pvm_psci_features(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct kvm_vcpu *vcpu = &hyp_vcpu->vcpu; + u32 feature = smccc_get_arg1(vcpu); + unsigned long val; + + switch (feature) { + case PSCI_0_2_FN_PSCI_VERSION: + case PSCI_0_2_FN_CPU_SUSPEND: + case PSCI_0_2_FN64_CPU_SUSPEND: + case PSCI_0_2_FN_CPU_OFF: + case PSCI_0_2_FN_CPU_ON: + case PSCI_0_2_FN64_CPU_ON: + case PSCI_0_2_FN_AFFINITY_INFO: + case PSCI_0_2_FN64_AFFINITY_INFO: + case PSCI_0_2_FN_SYSTEM_OFF: + case PSCI_0_2_FN_SYSTEM_RESET: + case PSCI_1_0_FN_PSCI_FEATURES: + case PSCI_1_1_FN_SYSTEM_RESET2: + case PSCI_1_1_FN64_SYSTEM_RESET2: + case ARM_SMCCC_VERSION_FUNC_ID: + val = PSCI_RET_SUCCESS; + break; + default: + val = PSCI_RET_NOT_SUPPORTED; + break; + } + + /* Nothing to be handled by the host. Go back to the guest. */ + smccc_set_retval(vcpu, val, 0, 0, 0); + return true; +} + +static bool pkvm_handle_psci(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct kvm_vcpu *vcpu = &hyp_vcpu->vcpu; + u32 psci_fn = smccc_get_function(vcpu); + + switch (psci_fn) { + case PSCI_0_2_FN_CPU_ON: + kvm_psci_narrow_to_32bit(vcpu); + fallthrough; + case PSCI_0_2_FN64_CPU_ON: + return pvm_psci_vcpu_on(hyp_vcpu); + case PSCI_0_2_FN_CPU_OFF: + return pvm_psci_vcpu_off(hyp_vcpu); + case PSCI_0_2_FN_AFFINITY_INFO: + kvm_psci_narrow_to_32bit(vcpu); + fallthrough; + case PSCI_0_2_FN64_AFFINITY_INFO: + return pvm_psci_vcpu_affinity_info(hyp_vcpu); + case PSCI_0_2_FN_PSCI_VERSION: + return pvm_psci_version(hyp_vcpu); + case PSCI_1_0_FN_PSCI_FEATURES: + return pvm_psci_features(hyp_vcpu); + case PSCI_0_2_FN_SYSTEM_RESET: + case PSCI_0_2_FN_CPU_SUSPEND: + case PSCI_0_2_FN64_CPU_SUSPEND: + case PSCI_0_2_FN_SYSTEM_OFF: + case PSCI_1_1_FN_SYSTEM_RESET2: + case PSCI_1_1_FN64_SYSTEM_RESET2: + return false; /* Handled by the host. */ + default: + break; + } + + return pvm_psci_not_supported(hyp_vcpu); +} + /* * Handler for protected VM HVC calls. * @@ -815,6 +1112,9 @@ err_unlock: bool kvm_handle_pvm_hvc64(struct kvm_vcpu *vcpu, u64 *exit_code) { u32 fn = smccc_get_function(vcpu); + struct pkvm_hyp_vcpu *hyp_vcpu; + + hyp_vcpu = container_of(vcpu, struct pkvm_hyp_vcpu, vcpu); switch (fn) { case ARM_SMCCC_VERSION_FUNC_ID: @@ -822,6 +1122,6 @@ bool kvm_handle_pvm_hvc64(struct kvm_vcpu *vcpu, u64 *exit_code) smccc_set_retval(vcpu, ARM_SMCCC_VERSION_1_1, 0, 0, 0); return true; default: - return false; + return pkvm_handle_psci(hyp_vcpu); } } From 26f8573abf3f95fa5b663c3a838c293a079c0b82 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 9 Feb 2022 17:15:01 +0000 Subject: [PATCH 083/457] ANDROID: KVM: arm64: Don't expose TLBI hypercalls after de-privilege Now that TLBI invalidation is handled entirely at EL2 for both protected and non-protected guests when protected KVM has initialised, unplug the unused TLBI hypercalls. 
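The enum order is what makes this work: only hypercall IDs declared after
__pkvm_prot_finalize remain callable once pKVM has been finalised. The dispatch
gate itself is not part of this diff, so the sketch below (hcall_min and the ID
comparison) is an assumption about how the host hypercall handler enforces the
split:

	/* Assumed gate in the host hypercall dispatcher (illustrative only). */
	unsigned long hcall_min = 0;

	if (static_branch_unlikely(&kvm_protected_mode_initialized))
		hcall_min = __KVM_HOST_SMCCC_FUNC___pkvm_prot_finalize;

	if (id < hcall_min || id >= ARRAY_SIZE(host_hcall))
		goto inval;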
Signed-off-by: Will Deacon Signed-off-by: Will Deacon Bug: 233587962 Change-Id: I50ad4cb930c43f88e00320e47b358613224dd1cc Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_asm.h | 8 ++++---- arch/arm64/kvm/hyp/nvhe/hyp-main.c | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index b4aaa79f5919..12aa0ccc3b3d 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -59,6 +59,10 @@ enum __kvm_host_smccc_func { __KVM_HOST_SMCCC_FUNC___kvm_enable_ssbs, __KVM_HOST_SMCCC_FUNC___vgic_v3_init_lrs, __KVM_HOST_SMCCC_FUNC___vgic_v3_get_gic_config, + __KVM_HOST_SMCCC_FUNC___kvm_flush_vm_context, + __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid_ipa, + __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid, + __KVM_HOST_SMCCC_FUNC___kvm_flush_cpu_context, __KVM_HOST_SMCCC_FUNC___pkvm_prot_finalize, /* Hypercalls available after pKVM finalisation */ @@ -68,10 +72,6 @@ enum __kvm_host_smccc_func { __KVM_HOST_SMCCC_FUNC___pkvm_host_map_guest, __KVM_HOST_SMCCC_FUNC___kvm_adjust_pc, __KVM_HOST_SMCCC_FUNC___kvm_vcpu_run, - __KVM_HOST_SMCCC_FUNC___kvm_flush_vm_context, - __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid_ipa, - __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid, - __KVM_HOST_SMCCC_FUNC___kvm_flush_cpu_context, __KVM_HOST_SMCCC_FUNC___kvm_timer_set_cntvoff, __KVM_HOST_SMCCC_FUNC___vgic_v3_save_vmcr_aprs, __KVM_HOST_SMCCC_FUNC___vgic_v3_restore_vmcr_aprs, diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index f70baf884323..237f6b5a203f 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -1037,6 +1037,10 @@ static const hcall_t host_hcall[] = { HANDLE_FUNC(__kvm_enable_ssbs), HANDLE_FUNC(__vgic_v3_init_lrs), HANDLE_FUNC(__vgic_v3_get_gic_config), + HANDLE_FUNC(__kvm_flush_vm_context), + HANDLE_FUNC(__kvm_tlb_flush_vmid_ipa), + HANDLE_FUNC(__kvm_tlb_flush_vmid), + HANDLE_FUNC(__kvm_flush_cpu_context), HANDLE_FUNC(__pkvm_prot_finalize), HANDLE_FUNC(__pkvm_host_share_hyp), @@ -1045,10 +1049,6 @@ static const hcall_t host_hcall[] = { HANDLE_FUNC(__pkvm_host_map_guest), HANDLE_FUNC(__kvm_adjust_pc), HANDLE_FUNC(__kvm_vcpu_run), - HANDLE_FUNC(__kvm_flush_vm_context), - HANDLE_FUNC(__kvm_tlb_flush_vmid_ipa), - HANDLE_FUNC(__kvm_tlb_flush_vmid), - HANDLE_FUNC(__kvm_flush_cpu_context), HANDLE_FUNC(__kvm_timer_set_cntvoff), HANDLE_FUNC(__vgic_v3_save_vmcr_aprs), HANDLE_FUNC(__vgic_v3_restore_vmcr_aprs), From 2c8c86040fcfd1b7573228740fdc879b344d7e3d Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Tue, 22 Mar 2022 14:31:37 +0000 Subject: [PATCH 084/457] ANDROID: KVM: arm64: Add is_pkvm_initialized() helper Add a helper allowing to check when the pkvm static key is enabled to ease the introduction of pkvm hooks in other parts of the code. 
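A usage sketch for one such hook; the call site below is hypothetical and only
the helper itself is introduced here:

	/* Hypothetical pkvm hook: do nothing unless the hypervisor owns stage-2. */
	static void pkvm_hook_example(void)
	{
		if (!is_pkvm_initialized())
			return;

		/* pkvm-specific handling goes here. */
	}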
Signed-off-by: Quentin Perret Signed-off-by: Will Deacon Bug: 233587962 Change-Id: I8995021768def73bd7636a84059bdc43fa7ab2fc Signed-off-by: Quentin Perret --- arch/arm64/include/asm/virt.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/arch/arm64/include/asm/virt.h b/arch/arm64/include/asm/virt.h index 4eb601e7de50..44dbede8997e 100644 --- a/arch/arm64/include/asm/virt.h +++ b/arch/arm64/include/asm/virt.h @@ -81,6 +81,12 @@ void __hyp_reset_vectors(void); DECLARE_STATIC_KEY_FALSE(kvm_protected_mode_initialized); +static inline bool is_pkvm_initialized(void) +{ + return IS_ENABLED(CONFIG_KVM) && + static_branch_likely(&kvm_protected_mode_initialized); +} + /* Reports the availability of HYP mode */ static inline bool is_hyp_mode_available(void) { @@ -88,8 +94,7 @@ static inline bool is_hyp_mode_available(void) * If KVM protected mode is initialized, all CPUs must have been booted * in EL2. Avoid checking __boot_cpu_mode as CPUs now come up in EL1. */ - if (IS_ENABLED(CONFIG_KVM) && - static_branch_likely(&kvm_protected_mode_initialized)) + if (is_pkvm_initialized()) return true; return (__boot_cpu_mode[0] == BOOT_CPU_MODE_EL2 && @@ -103,8 +108,7 @@ static inline bool is_hyp_mode_mismatched(void) * If KVM protected mode is initialized, all CPUs must have been booted * in EL2. Avoid checking __boot_cpu_mode as CPUs now come up in EL1. */ - if (IS_ENABLED(CONFIG_KVM) && - static_branch_likely(&kvm_protected_mode_initialized)) + if (is_pkvm_initialized()) return false; return __boot_cpu_mode[0] != __boot_cpu_mode[1]; From 6c616c355d4504f16a824b2e57f6686c48eef93d Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Tue, 22 Mar 2022 14:33:37 +0000 Subject: [PATCH 085/457] ANDROID: KVM: arm64: Refactor enter_exception64() In order to simplify the injection of exceptions in the host in pkvm context, let's factor out of enter_exception64() the code calculating the exception offset from VBAR_EL1 and the cpsr. 
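A worked example of the factored-out offset computation, using the standard
AArch64 vector layout (the variable names below are illustrative; the real
callers pass live vcpu state):

	/* EL1h context taking a synchronous exception back into EL1h. */
	u64 off = get_except64_offset(PSR_MODE_EL1h, PSR_MODE_EL1h, except_type_sync);
	/* mode == target_mode -> CURRENT_EL_SP_ELx_VECTOR (0x200) + sync (0x0) */
	new_pc = vbar_el1 + off;	/* i.e. VBAR_EL1 + 0x200 */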
Signed-off-by: Quentin Perret Signed-off-by: Will Deacon Bug: 233587962 Change-Id: I50a2510b59311717c6e17ea4e45fc634b4b43073 Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_emulate.h | 5 ++ arch/arm64/kvm/hyp/exception.c | 89 ++++++++++++++++------------ 2 files changed, 57 insertions(+), 37 deletions(-) diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index d1cc41d494fa..c76f93b60244 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -42,6 +42,11 @@ void kvm_inject_dabt(struct kvm_vcpu *vcpu, unsigned long addr); void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr); void kvm_inject_size_fault(struct kvm_vcpu *vcpu); +unsigned long get_except64_offset(unsigned long psr, unsigned long target_mode, + enum exception_type type); +unsigned long get_except64_cpsr(unsigned long old, bool has_mte, + unsigned long sctlr, unsigned long mode); + void kvm_vcpu_wfi(struct kvm_vcpu *vcpu); #if defined(__KVM_VHE_HYPERVISOR__) || defined(__KVM_NVHE_HYPERVISOR__) diff --git a/arch/arm64/kvm/hyp/exception.c b/arch/arm64/kvm/hyp/exception.c index 791d3de76771..a5fa6a13c641 100644 --- a/arch/arm64/kvm/hyp/exception.c +++ b/arch/arm64/kvm/hyp/exception.c @@ -61,12 +61,25 @@ static void __vcpu_write_spsr_und(struct kvm_vcpu *vcpu, u64 val) vcpu->arch.ctxt.spsr_und = val; } +unsigned long get_except64_offset(unsigned long psr, unsigned long target_mode, + enum exception_type type) +{ + u64 mode = psr & (PSR_MODE_MASK | PSR_MODE32_BIT); + u64 exc_offset; + + if (mode == target_mode) + exc_offset = CURRENT_EL_SP_ELx_VECTOR; + else if ((mode | PSR_MODE_THREAD_BIT) == target_mode) + exc_offset = CURRENT_EL_SP_EL0_VECTOR; + else if (!(mode & PSR_MODE32_BIT)) + exc_offset = LOWER_EL_AArch64_VECTOR; + else + exc_offset = LOWER_EL_AArch32_VECTOR; + + return exc_offset + type; +} + /* - * This performs the exception entry at a given EL (@target_mode), stashing PC - * and PSTATE into ELR and SPSR respectively, and compute the new PC/PSTATE. - * The EL passed to this function *must* be a non-secure, privileged mode with - * bit 0 being set (PSTATE.SP == 1). - * * When an exception is taken, most PSTATE fields are left unchanged in the * handler. However, some are explicitly overridden (e.g. M[4:0]). Luckily all * of the inherited bits have the same position in the AArch64/AArch32 SPSR_ELx @@ -78,45 +91,17 @@ static void __vcpu_write_spsr_und(struct kvm_vcpu *vcpu, u64 val) * Here we manipulate the fields in order of the AArch64 SPSR_ELx layout, from * MSB to LSB. 
*/ -static void enter_exception64(struct kvm_vcpu *vcpu, unsigned long target_mode, - enum exception_type type) +unsigned long get_except64_cpsr(unsigned long old, bool has_mte, + unsigned long sctlr, unsigned long target_mode) { - unsigned long sctlr, vbar, old, new, mode; - u64 exc_offset; - - mode = *vcpu_cpsr(vcpu) & (PSR_MODE_MASK | PSR_MODE32_BIT); - - if (mode == target_mode) - exc_offset = CURRENT_EL_SP_ELx_VECTOR; - else if ((mode | PSR_MODE_THREAD_BIT) == target_mode) - exc_offset = CURRENT_EL_SP_EL0_VECTOR; - else if (!(mode & PSR_MODE32_BIT)) - exc_offset = LOWER_EL_AArch64_VECTOR; - else - exc_offset = LOWER_EL_AArch32_VECTOR; - - switch (target_mode) { - case PSR_MODE_EL1h: - vbar = __vcpu_read_sys_reg(vcpu, VBAR_EL1); - sctlr = __vcpu_read_sys_reg(vcpu, SCTLR_EL1); - __vcpu_write_sys_reg(vcpu, *vcpu_pc(vcpu), ELR_EL1); - break; - default: - /* Don't do that */ - BUG(); - } - - *vcpu_pc(vcpu) = vbar + exc_offset + type; - - old = *vcpu_cpsr(vcpu); - new = 0; + u64 new = 0; new |= (old & PSR_N_BIT); new |= (old & PSR_Z_BIT); new |= (old & PSR_C_BIT); new |= (old & PSR_V_BIT); - if (kvm_has_mte(kern_hyp_va(vcpu->kvm))) + if (has_mte) new |= PSR_TCO_BIT; new |= (old & PSR_DIT_BIT); @@ -152,6 +137,36 @@ static void enter_exception64(struct kvm_vcpu *vcpu, unsigned long target_mode, new |= target_mode; + return new; +} + +/* + * This performs the exception entry at a given EL (@target_mode), stashing PC + * and PSTATE into ELR and SPSR respectively, and compute the new PC/PSTATE. + * The EL passed to this function *must* be a non-secure, privileged mode with + * bit 0 being set (PSTATE.SP == 1). + */ +static void enter_exception64(struct kvm_vcpu *vcpu, unsigned long target_mode, + enum exception_type type) +{ + u64 offset = get_except64_offset(*vcpu_cpsr(vcpu), target_mode, type); + unsigned long sctlr, vbar, old, new; + + switch (target_mode) { + case PSR_MODE_EL1h: + vbar = __vcpu_read_sys_reg(vcpu, VBAR_EL1); + sctlr = __vcpu_read_sys_reg(vcpu, SCTLR_EL1); + __vcpu_write_sys_reg(vcpu, *vcpu_pc(vcpu), ELR_EL1); + break; + default: + /* Don't do that */ + BUG(); + } + + *vcpu_pc(vcpu) = vbar + offset; + + old = *vcpu_cpsr(vcpu); + new = get_except64_cpsr(old, kvm_has_mte(kern_hyp_va(vcpu->kvm)), sctlr, target_mode); *vcpu_cpsr(vcpu) = new; __vcpu_write_spsr(vcpu, old); } From 01803a8ee162fc0962a32209de3358116816eec9 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Tue, 22 Mar 2022 14:35:33 +0000 Subject: [PATCH 086/457] ANDROID: KVM: arm64: Inject SIGSEGV on illegal accesses The pKVM hypervisor will currently panic if the host tries to access memory that it doesn't own (e.g. protected guest memory). Sadly, as guest memory can still be mapped into the VMM's address space, userspace can trivially crash the kernel/hypervisor by poking into guest memory. To prevent this, inject the abort back in the host with S1PTW set in the ESR, hence allowing the host to differentiate this abort from normal userspace faults and inject a SIGSEGV cleanly. 
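Condensed, the host-side fault handling added by this patch boils down to the
following (a sketch of the do_page_fault() hunk further down; nothing beyond
what the diff itself does):

	if (is_pkvm_stage2_abort(esr)) {
		if (!user_mode(regs))
			goto no_context;	/* kernel access: fatal */
		arm64_force_sig_fault(SIGSEGV, SEGV_ACCERR, far, "stage-2 fault");
		return 0;			/* userspace access: SIGSEGV */
	}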
Signed-off-by: Quentin Perret Signed-off-by: Will Deacon Bug: 233587962 Change-Id: I09ee54fbf4c202dc3ac2e1b5eea264d4dc84f613 Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/nvhe/mem_protect.c | 50 ++++++++++++++++++++++++++- arch/arm64/mm/fault.c | 22 ++++++++++++ 2 files changed, 71 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index d73b88c7062e..0130d137173e 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -548,6 +548,50 @@ unlock: return ret; } +static void host_inject_abort(struct kvm_cpu_context *host_ctxt) +{ + u64 spsr = read_sysreg_el2(SYS_SPSR); + u64 esr = read_sysreg_el2(SYS_ESR); + u64 ventry, ec; + + /* Repaint the ESR to report a same-level fault if taken from EL1 */ + if ((spsr & PSR_MODE_MASK) != PSR_MODE_EL0t) { + ec = ESR_ELx_EC(esr); + if (ec == ESR_ELx_EC_DABT_LOW) + ec = ESR_ELx_EC_DABT_CUR; + else if (ec == ESR_ELx_EC_IABT_LOW) + ec = ESR_ELx_EC_IABT_CUR; + else + WARN_ON(1); + esr &= ~ESR_ELx_EC_MASK; + esr |= ec << ESR_ELx_EC_SHIFT; + } + + /* + * Since S1PTW should only ever be set for stage-2 faults, we're pretty + * much guaranteed that it won't be set in ESR_EL1 by the hardware. So, + * let's use that bit to allow the host abort handler to differentiate + * this abort from normal userspace faults. + * + * Note: although S1PTW is RES0 at EL1, it is guaranteed by the + * architecture to be backed by flops, so it should be safe to use. + */ + esr |= ESR_ELx_S1PTW; + + write_sysreg_el1(esr, SYS_ESR); + write_sysreg_el1(spsr, SYS_SPSR); + write_sysreg_el1(read_sysreg_el2(SYS_ELR), SYS_ELR); + write_sysreg_el1(read_sysreg_el2(SYS_FAR), SYS_FAR); + + ventry = read_sysreg_el1(SYS_VBAR); + ventry += get_except64_offset(spsr, PSR_MODE_EL1h, except_type_sync); + write_sysreg_el2(ventry, SYS_ELR); + + spsr = get_except64_cpsr(spsr, system_supports_mte(), + read_sysreg_el1(SYS_SCTLR), PSR_MODE_EL1h); + write_sysreg_el2(spsr, SYS_SPSR); +} + void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt) { struct kvm_vcpu_fault_info fault; @@ -559,7 +603,11 @@ void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt) addr = (fault.hpfar_el2 & HPFAR_MASK) << 8; ret = host_stage2_idmap(addr); - BUG_ON(ret && ret != -EAGAIN); + + if (ret == -EPERM) + host_inject_abort(host_ctxt); + else + BUG_ON(ret && ret != -EAGAIN); } struct pkvm_mem_transition { diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 5b391490e045..646b3b51fde0 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -41,6 +41,7 @@ #include #include #include +#include struct fault_info { int (*fn)(unsigned long far, unsigned long esr, @@ -257,6 +258,15 @@ static inline bool is_el1_permission_fault(unsigned long addr, unsigned long esr return false; } +static bool is_pkvm_stage2_abort(unsigned int esr) +{ + /* + * S1PTW should only ever be set in ESR_EL1 if the pkvm hypervisor + * injected a stage-2 abort -- see host_inject_abort(). 
+ */ + return is_pkvm_initialized() && (esr & ESR_ELx_S1PTW); +} + static bool __kprobes is_spurious_el1_translation_fault(unsigned long addr, unsigned long esr, struct pt_regs *regs) @@ -268,6 +278,9 @@ static bool __kprobes is_spurious_el1_translation_fault(unsigned long addr, (esr & ESR_ELx_FSC_TYPE) != ESR_ELx_FSC_FAULT) return false; + if (is_pkvm_stage2_abort(esr)) + return false; + local_irq_save(flags); asm volatile("at s1e1r, %0" :: "r" (addr)); isb(); @@ -384,6 +397,8 @@ static void __do_kernel_fault(unsigned long addr, unsigned long esr, msg = "read from unreadable memory"; } else if (addr < PAGE_SIZE) { msg = "NULL pointer dereference"; + } else if (is_pkvm_stage2_abort(esr)) { + msg = "access to hypervisor-protected memory"; } else { if (kfence_handle_page_fault(addr, esr & ESR_ELx_WNR, regs)) return; @@ -573,6 +588,13 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr, addr, esr, regs); } + if (is_pkvm_stage2_abort(esr)) { + if (!user_mode(regs)) + goto no_context; + arm64_force_sig_fault(SIGSEGV, SEGV_ACCERR, far, "stage-2 fault"); + return 0; + } + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr); /* From 47318559bc510b40f802ad7a8e9dca16972ced1a Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Mon, 24 Jan 2022 11:16:08 +0000 Subject: [PATCH 087/457] ANDROID: KVM: arm64: Support TLB invalidation in guest context Typically, TLB invalidation of guest stage-2 mappings using nVHE is performed by a hypercall originating from the host. For the invalidation instruction to be effective, therefore, __tlb_switch_to_{guest,host}() swizzle the active stage-2 context around the TLBI instruction. With guest-to-host memory sharing and unsharing hypercalls originating from the guest under pKVM, there is now a need to support both guest and host VMID invalidations issued from guest context. Replace the __tlb_switch_to_{guest,host}() functions with a more general {enter,exit}_vmid_context() implementation which supports being invoked from guest context and acts as a no-op if the target context matches the running context. Signed-off-by: Will Deacon Signed-off-by: Will Deacon Bug: 233587962 Change-Id: I92c6f48eb4c4b6286b930c2f0cda245bccc1927b Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/nvhe/tlb.c | 96 ++++++++++++++++++++++++++++------- 1 file changed, 78 insertions(+), 18 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/tlb.c b/arch/arm64/kvm/hyp/nvhe/tlb.c index d296d617f589..35092e154614 100644 --- a/arch/arm64/kvm/hyp/nvhe/tlb.c +++ b/arch/arm64/kvm/hyp/nvhe/tlb.c @@ -11,26 +11,62 @@ #include struct tlb_inv_context { - u64 tcr; + struct kvm_s2_mmu *mmu; + u64 tcr; + u64 sctlr; }; -static void __tlb_switch_to_guest(struct kvm_s2_mmu *mmu, - struct tlb_inv_context *cxt) +static void enter_vmid_context(struct kvm_s2_mmu *mmu, + struct tlb_inv_context *cxt) { + struct kvm_s2_mmu *host_s2_mmu = &host_mmu.arch.mmu; + struct kvm_cpu_context *host_ctxt; + struct kvm_vcpu *vcpu; + + host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; + vcpu = host_ctxt->__hyp_running_vcpu; + cxt->mmu = NULL; + + /* + * If we're already in the desired context, then there's nothing + * to do. 
+ */ + if (vcpu) { + if (mmu == vcpu->arch.hw_mmu || WARN_ON(mmu != host_s2_mmu)) + return; + } else if (mmu == host_s2_mmu) { + return; + } + + cxt->mmu = mmu; if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) { u64 val; /* * For CPUs that are affected by ARM 1319367, we need to - * avoid a host Stage-1 walk while we have the guest's - * VMID set in the VTTBR in order to invalidate TLBs. - * We're guaranteed that the S1 MMU is enabled, so we can - * simply set the EPD bits to avoid any further TLB fill. + * avoid a Stage-1 walk with the old VMID while we have + * the new VMID set in the VTTBR in order to invalidate TLBs. + * We're guaranteed that the host S1 MMU is enabled, so + * we can simply set the EPD bits to avoid any further + * TLB fill. For guests, we ensure that the S1 MMU is + * temporarily enabled in the next context. */ val = cxt->tcr = read_sysreg_el1(SYS_TCR); val |= TCR_EPD1_MASK | TCR_EPD0_MASK; write_sysreg_el1(val, SYS_TCR); isb(); + + if (vcpu) { + val = cxt->sctlr = read_sysreg_el1(SYS_SCTLR); + if (!(val & SCTLR_ELx_M)) { + val |= SCTLR_ELx_M; + write_sysreg_el1(val, SYS_SCTLR); + isb(); + } + } else { + /* The host S1 MMU is always enabled. */ + cxt->sctlr = SCTLR_ELx_M; + } } /* @@ -39,20 +75,44 @@ static void __tlb_switch_to_guest(struct kvm_s2_mmu *mmu, * ensuring that we always have an ISB, but not two ISBs back * to back. */ - __load_stage2(mmu, kern_hyp_va(mmu->arch)); + if (vcpu) + __load_host_stage2(); + else + __load_stage2(mmu, kern_hyp_va(mmu->arch)); + asm(ALTERNATIVE("isb", "nop", ARM64_WORKAROUND_SPECULATIVE_AT)); } -static void __tlb_switch_to_host(struct tlb_inv_context *cxt) +static void exit_vmid_context(struct tlb_inv_context *cxt) { - __load_host_stage2(); + struct kvm_s2_mmu *mmu = cxt->mmu; + struct kvm_cpu_context *host_ctxt; + struct kvm_vcpu *vcpu; + + host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; + vcpu = host_ctxt->__hyp_running_vcpu; + + if (!mmu) + return; + + if (vcpu) + __load_stage2(mmu, kern_hyp_va(mmu->arch)); + else + __load_host_stage2(); if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) { - /* Ensure write of the host VMID */ + /* Ensure write of the old VMID */ isb(); - /* Restore the host's TCR_EL1 */ + + if (!(cxt->sctlr & SCTLR_ELx_M)) { + write_sysreg_el1(cxt->sctlr, SYS_SCTLR); + isb(); + } + write_sysreg_el1(cxt->tcr, SYS_TCR); } + + cxt->mmu = NULL; } void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, @@ -63,7 +123,7 @@ void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, dsb(ishst); /* Switch to requested VMID */ - __tlb_switch_to_guest(mmu, &cxt); + enter_vmid_context(mmu, &cxt); /* * We could do so much better if we had the VA as well. 
@@ -106,7 +166,7 @@ void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, if (icache_is_vpipt()) icache_inval_all_pou(); - __tlb_switch_to_host(&cxt); + exit_vmid_context(&cxt); } void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu) @@ -116,13 +176,13 @@ void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu) dsb(ishst); /* Switch to requested VMID */ - __tlb_switch_to_guest(mmu, &cxt); + enter_vmid_context(mmu, &cxt); __tlbi(vmalls12e1is); dsb(ish); isb(); - __tlb_switch_to_host(&cxt); + exit_vmid_context(&cxt); } void __kvm_flush_cpu_context(struct kvm_s2_mmu *mmu) @@ -130,14 +190,14 @@ void __kvm_flush_cpu_context(struct kvm_s2_mmu *mmu) struct tlb_inv_context cxt; /* Switch to requested VMID */ - __tlb_switch_to_guest(mmu, &cxt); + enter_vmid_context(mmu, &cxt); __tlbi(vmalle1); asm volatile("ic iallu"); dsb(nsh); isb(); - __tlb_switch_to_host(&cxt); + exit_vmid_context(&cxt); } void __kvm_flush_vm_context(void) From b76dc613efab7261ac2661e069ccf913d65e811a Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 10 Nov 2021 12:31:25 +0000 Subject: [PATCH 088/457] ANDROID: KVM: arm64: Avoid BBM when changing only s/w bits in Stage-2 PTE Break-before-make (BBM) can be expensive, as transitioning via an invalid mapping (i.e. the "break" step) requires the completion of TLB invalidation and can also cause other agents to fault concurrently on the invalid mapping. Since BBM is not required when changing only the software bits of a PTE, avoid the sequence in this case and just update the PTE directly. Signed-off-by: Will Deacon Signed-off-by: Will Deacon Bug: 233587962 Change-Id: I91ec043a75154fa2ca732f5269c6ae1bceea4a93 Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/pgtable.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 1597b388d12e..b5e183f6f5a9 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -732,6 +732,13 @@ static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level, if (!stage2_pte_needs_update(old, new)) return -EAGAIN; + /* + * If we're only changing software bits, then we don't need to + * do anything else/ + */ + if (!((old ^ new) & ~KVM_PTE_LEAF_ATTR_HI_SW)) + goto out_set_pte; + stage2_put_pte(ptep, data->mmu, addr, level, mm_ops); } @@ -742,9 +749,11 @@ static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level, if (mm_ops->icache_inval_pou && stage2_pte_executable(new)) mm_ops->icache_inval_pou(kvm_pte_follow(new, mm_ops), granule); - smp_store_release(ptep, new); if (stage2_pte_is_counted(new)) mm_ops->get_page(ptep); + +out_set_pte: + smp_store_release(ptep, new); if (kvm_phys_is_valid(phys)) data->phys += granule; return 0; From 6d66cf9daf65e9ac2bacb136a90dd5b83be96816 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 29 Mar 2022 15:30:16 +0100 Subject: [PATCH 089/457] ANDROID: KVM: arm64: Extend memory sharing to allow guest-to-host transitions A guest that can only operate on private memory is pretty useless, as it has no way to share buffers with the host for things like virtio. Extend our memory protection mechanisms to support the sharing and unsharing of guest pages from the guest to the host. For now, this functionality is unused but will later be exposed to the guest via hypercalls. 
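A plausible shape for the eventual guest-facing plumbing, shown only as a
sketch (the hypercall handler below is hypothetical; this patch introduces just
__pkvm_guest_share_host()/__pkvm_guest_unshare_host()):

	/* Hypothetical hyp-side handler for a guest "share this IPA" hypercall. */
	static bool pkvm_handle_guest_share(struct pkvm_hyp_vcpu *hyp_vcpu)
	{
		u64 ipa = smccc_get_arg1(&hyp_vcpu->vcpu);
		int ret = __pkvm_guest_share_host(hyp_vcpu, ipa);

		smccc_set_retval(&hyp_vcpu->vcpu,
				 ret ? SMCCC_RET_INVALID_PARAMETER : SMCCC_RET_SUCCESS,
				 0, 0, 0);
		return true;	/* handled at EL2, return to the guest */
	}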
Signed-off-by: Will Deacon Signed-off-by: Will Deacon Bug: 233587962 Change-Id: I6b0d6f63348f3a2a847acf4d7bb87bd6e9742af0 Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/include/nvhe/mem_protect.h | 2 + arch/arm64/kvm/hyp/nvhe/mem_protect.c | 232 ++++++++++++++++++ 2 files changed, 234 insertions(+) diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h index 4d14b4cfb3b7..38e5e9b259fc 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h @@ -70,6 +70,8 @@ int __pkvm_host_donate_hyp(u64 pfn, u64 nr_pages); int __pkvm_hyp_donate_host(u64 pfn, u64 nr_pages); int __pkvm_host_share_guest(u64 pfn, u64 gfn, struct pkvm_hyp_vcpu *vcpu); int __pkvm_host_donate_guest(u64 pfn, u64 gfn, struct pkvm_hyp_vcpu *vcpu); +int __pkvm_guest_share_host(struct pkvm_hyp_vcpu *hyp_vcpu, u64 ipa); +int __pkvm_guest_unshare_host(struct pkvm_hyp_vcpu *hyp_vcpu, u64 ipa); bool addr_is_memory(phys_addr_t phys); int host_stage2_idmap_locked(phys_addr_t addr, u64 size, enum kvm_pgtable_prot prot); diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 0130d137173e..85289934cccd 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -779,11 +779,41 @@ static int __host_ack_transition(u64 addr, const struct pkvm_mem_transition *tx, return __host_check_page_state_range(addr, size, state); } +static int host_ack_share(u64 addr, const struct pkvm_mem_transition *tx, + enum kvm_pgtable_prot perms) +{ + if (perms != PKVM_HOST_MEM_PROT) + return -EPERM; + + return __host_ack_transition(addr, tx, PKVM_NOPAGE); +} + static int host_ack_donation(u64 addr, const struct pkvm_mem_transition *tx) { return __host_ack_transition(addr, tx, PKVM_NOPAGE); } +static int host_ack_unshare(u64 addr, const struct pkvm_mem_transition *tx) +{ + return __host_ack_transition(addr, tx, PKVM_PAGE_SHARED_BORROWED); +} + +static int host_complete_share(u64 addr, const struct pkvm_mem_transition *tx, + enum kvm_pgtable_prot perms) +{ + u64 size = tx->nr_pages * PAGE_SIZE; + + return __host_set_page_state_range(addr, size, PKVM_PAGE_SHARED_BORROWED); +} + +static int host_complete_unshare(u64 addr, const struct pkvm_mem_transition *tx) +{ + u8 owner_id = tx->initiator.id; + u64 size = tx->nr_pages * PAGE_SIZE; + + return host_stage2_set_owner_locked(addr, size, owner_id); +} + static int host_complete_donation(u64 addr, const struct pkvm_mem_transition *tx) { u64 size = tx->nr_pages * PAGE_SIZE; @@ -969,6 +999,120 @@ static int guest_complete_donation(u64 addr, const struct pkvm_mem_transition *t prot, &vcpu->vcpu.arch.pkvm_memcache); } +static int __guest_get_completer_addr(u64 *completer_addr, phys_addr_t phys, + const struct pkvm_mem_transition *tx) +{ + switch (tx->completer.id) { + case PKVM_ID_HOST: + *completer_addr = phys; + break; + case PKVM_ID_HYP: + *completer_addr = (u64)__hyp_va(phys); + break; + default: + return -EINVAL; + } + + return 0; +} + +static int __guest_request_page_transition(u64 *completer_addr, + const struct pkvm_mem_transition *tx, + enum pkvm_page_state desired) +{ + struct pkvm_hyp_vcpu *vcpu = tx->initiator.guest.hyp_vcpu; + struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); + enum pkvm_page_state state; + phys_addr_t phys; + kvm_pte_t pte; + u32 level; + int ret; + + if (tx->nr_pages != 1) + return -E2BIG; + + ret = kvm_pgtable_get_leaf(&vm->pgt, tx->initiator.addr, &pte, &level); + if (ret) + return ret; + + state = 
guest_get_page_state(pte); + if (state == PKVM_NOPAGE) + return -EFAULT; + + if (state != desired) + return -EPERM; + + /* + * We only deal with page granular mappings in the guest for now as + * the pgtable code relies on being able to recreate page mappings + * lazily after zapping a block mapping, which doesn't work once the + * pages have been donated. + */ + if (level != KVM_PGTABLE_MAX_LEVELS - 1) + return -EINVAL; + + phys = kvm_pte_to_phys(pte); + if (!addr_is_allowed_memory(phys)) + return -EINVAL; + + return __guest_get_completer_addr(completer_addr, phys, tx); +} + +static int guest_request_share(u64 *completer_addr, + const struct pkvm_mem_transition *tx) +{ + return __guest_request_page_transition(completer_addr, tx, + PKVM_PAGE_OWNED); +} + +static int guest_request_unshare(u64 *completer_addr, + const struct pkvm_mem_transition *tx) +{ + return __guest_request_page_transition(completer_addr, tx, + PKVM_PAGE_SHARED_OWNED); +} + +static int __guest_initiate_page_transition(u64 *completer_addr, + const struct pkvm_mem_transition *tx, + enum pkvm_page_state state) +{ + struct pkvm_hyp_vcpu *vcpu = tx->initiator.guest.hyp_vcpu; + struct kvm_hyp_memcache *mc = &vcpu->vcpu.arch.pkvm_memcache; + struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); + u64 size = tx->nr_pages * PAGE_SIZE; + u64 addr = tx->initiator.addr; + enum kvm_pgtable_prot prot; + phys_addr_t phys; + kvm_pte_t pte; + int ret; + + ret = kvm_pgtable_get_leaf(&vm->pgt, addr, &pte, NULL); + if (ret) + return ret; + + phys = kvm_pte_to_phys(pte); + prot = pkvm_mkstate(kvm_pgtable_stage2_pte_prot(pte), state); + ret = kvm_pgtable_stage2_map(&vm->pgt, addr, size, phys, prot, mc); + if (ret) + return ret; + + return __guest_get_completer_addr(completer_addr, phys, tx); +} + +static int guest_initiate_share(u64 *completer_addr, + const struct pkvm_mem_transition *tx) +{ + return __guest_initiate_page_transition(completer_addr, tx, + PKVM_PAGE_SHARED_OWNED); +} + +static int guest_initiate_unshare(u64 *completer_addr, + const struct pkvm_mem_transition *tx) +{ + return __guest_initiate_page_transition(completer_addr, tx, + PKVM_PAGE_OWNED); +} + static int check_share(struct pkvm_mem_share *share) { const struct pkvm_mem_transition *tx = &share->tx; @@ -979,6 +1123,9 @@ static int check_share(struct pkvm_mem_share *share) case PKVM_ID_HOST: ret = host_request_owned_transition(&completer_addr, tx); break; + case PKVM_ID_GUEST: + ret = guest_request_share(&completer_addr, tx); + break; default: ret = -EINVAL; } @@ -987,6 +1134,9 @@ static int check_share(struct pkvm_mem_share *share) return ret; switch (tx->completer.id) { + case PKVM_ID_HOST: + ret = host_ack_share(completer_addr, tx, share->completer_prot); + break; case PKVM_ID_HYP: ret = hyp_ack_share(completer_addr, tx, share->completer_prot); break; @@ -1010,6 +1160,9 @@ static int __do_share(struct pkvm_mem_share *share) case PKVM_ID_HOST: ret = host_initiate_share(&completer_addr, tx); break; + case PKVM_ID_GUEST: + ret = guest_initiate_share(&completer_addr, tx); + break; default: ret = -EINVAL; } @@ -1018,6 +1171,9 @@ static int __do_share(struct pkvm_mem_share *share) return ret; switch (tx->completer.id) { + case PKVM_ID_HOST: + ret = host_complete_share(completer_addr, tx, share->completer_prot); + break; case PKVM_ID_HYP: ret = hyp_complete_share(completer_addr, tx, share->completer_prot); break; @@ -1061,6 +1217,9 @@ static int check_unshare(struct pkvm_mem_share *share) case PKVM_ID_HOST: ret = host_request_unshare(&completer_addr, tx); break; + case 
PKVM_ID_GUEST: + ret = guest_request_unshare(&completer_addr, tx); + break; default: ret = -EINVAL; } @@ -1069,6 +1228,9 @@ static int check_unshare(struct pkvm_mem_share *share) return ret; switch (tx->completer.id) { + case PKVM_ID_HOST: + ret = host_ack_unshare(completer_addr, tx); + break; case PKVM_ID_HYP: ret = hyp_ack_unshare(completer_addr, tx); break; @@ -1089,6 +1251,9 @@ static int __do_unshare(struct pkvm_mem_share *share) case PKVM_ID_HOST: ret = host_initiate_unshare(&completer_addr, tx); break; + case PKVM_ID_GUEST: + ret = guest_initiate_unshare(&completer_addr, tx); + break; default: ret = -EINVAL; } @@ -1097,6 +1262,9 @@ static int __do_unshare(struct pkvm_mem_share *share) return ret; switch (tx->completer.id) { + case PKVM_ID_HOST: + ret = host_complete_unshare(completer_addr, tx); + break; case PKVM_ID_HYP: ret = hyp_complete_unshare(completer_addr, tx); break; @@ -1254,6 +1422,70 @@ int __pkvm_host_share_hyp(u64 pfn) return ret; } +int __pkvm_guest_share_host(struct pkvm_hyp_vcpu *vcpu, u64 ipa) +{ + int ret; + struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); + struct pkvm_mem_share share = { + .tx = { + .nr_pages = 1, + .initiator = { + .id = PKVM_ID_GUEST, + .addr = ipa, + .guest = { + .hyp_vcpu = vcpu, + }, + }, + .completer = { + .id = PKVM_ID_HOST, + }, + }, + .completer_prot = PKVM_HOST_MEM_PROT, + }; + + host_lock_component(); + guest_lock_component(vm); + + ret = do_share(&share); + + guest_unlock_component(vm); + host_unlock_component(); + + return ret; +} + +int __pkvm_guest_unshare_host(struct pkvm_hyp_vcpu *vcpu, u64 ipa) +{ + int ret; + struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); + struct pkvm_mem_share share = { + .tx = { + .nr_pages = 1, + .initiator = { + .id = PKVM_ID_GUEST, + .addr = ipa, + .guest = { + .hyp_vcpu = vcpu, + }, + }, + .completer = { + .id = PKVM_ID_HOST, + }, + }, + .completer_prot = PKVM_HOST_MEM_PROT, + }; + + host_lock_component(); + guest_lock_component(vm); + + ret = do_unshare(&share); + + guest_unlock_component(vm); + host_unlock_component(); + + return ret; +} + int __pkvm_host_unshare_hyp(u64 pfn) { int ret; From 43e7aea295332c4775845a8ac13029d6199d95c5 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 8 Jun 2022 12:53:42 +0100 Subject: [PATCH 090/457] ANDROID: KVM: arm64: Rename firmware pseudo-register documentation file In preparation for describing the guest view of KVM/arm64 hypercalls in hypercalls.rst, move the existing contents of the file concerning the firmware pseudo-registers elsewhere. Cc: Raghavendra Rao Ananta Signed-off-by: Will Deacon Signed-off-by: Will Deacon Bug: 233587962 Change-Id: Ie8931290b291c0ffd2f1f11265babe2475972868 Signed-off-by: Quentin Perret --- .../kvm/arm/{hypercalls.rst => fw-pseudo-registers.rst} | 6 +++--- Documentation/virt/kvm/arm/index.rst | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) rename Documentation/virt/kvm/arm/{hypercalls.rst => fw-pseudo-registers.rst} (97%) diff --git a/Documentation/virt/kvm/arm/hypercalls.rst b/Documentation/virt/kvm/arm/fw-pseudo-registers.rst similarity index 97% rename from Documentation/virt/kvm/arm/hypercalls.rst rename to Documentation/virt/kvm/arm/fw-pseudo-registers.rst index 3e23084644ba..b90fd0b0fa66 100644 --- a/Documentation/virt/kvm/arm/hypercalls.rst +++ b/Documentation/virt/kvm/arm/fw-pseudo-registers.rst @@ -1,8 +1,8 @@ .. 
SPDX-License-Identifier: GPL-2.0 -======================= -ARM Hypercall Interface -======================= +======================================= +ARM firmware pseudo-registers interface +======================================= KVM handles the hypercall services as requested by the guests. New hypercall services are regularly made available by the ARM specification or by KVM (as diff --git a/Documentation/virt/kvm/arm/index.rst b/Documentation/virt/kvm/arm/index.rst index e84848432158..5435b92baf4c 100644 --- a/Documentation/virt/kvm/arm/index.rst +++ b/Documentation/virt/kvm/arm/index.rst @@ -7,7 +7,7 @@ ARM .. toctree:: :maxdepth: 2 + fw-pseudo-registers hyp-abi - hypercalls pvtime ptp_kvm From a307199c6958a527aea3213b2844d1a8e6f362d7 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 14 Apr 2022 16:43:37 +0100 Subject: [PATCH 091/457] ANDROID: KVM: arm64: Document the KVM/arm64-specific calls in hypercalls.rst KVM/arm64 makes use of the SMCCC "Vendor Specific Hypervisor Service Call Range" to expose KVM-specific hypercalls to guests in a discoverable and extensible fashion. Document the existence of this interface and the discovery hypercall. Signed-off-by: Will Deacon Signed-off-by: Will Deacon Bug: 233587962 Change-Id: I5754589b1b695828eab7cb41c7aa6a0fb55ad273 Signed-off-by: Quentin Perret --- Documentation/virt/kvm/arm/hypercalls.rst | 46 +++++++++++++++++++++++ Documentation/virt/kvm/arm/index.rst | 1 + 2 files changed, 47 insertions(+) create mode 100644 Documentation/virt/kvm/arm/hypercalls.rst diff --git a/Documentation/virt/kvm/arm/hypercalls.rst b/Documentation/virt/kvm/arm/hypercalls.rst new file mode 100644 index 000000000000..17be111f493f --- /dev/null +++ b/Documentation/virt/kvm/arm/hypercalls.rst @@ -0,0 +1,46 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=============================================== +KVM/arm64-specific hypercalls exposed to guests +=============================================== + +This file documents the KVM/arm64-specific hypercalls which may be +exposed by KVM/arm64 to guest operating systems. These hypercalls are +issued using the HVC instruction according to version 1.1 of the Arm SMC +Calling Convention (DEN0028/C): + +https://developer.arm.com/docs/den0028/c + +All KVM/arm64-specific hypercalls are allocated within the "Vendor +Specific Hypervisor Service Call" range with a UID of +``28b46fb6-2ec5-11e9-a9ca-4b564d003a74``. This UID should be queried by the +guest using the standard "Call UID" function for the service range in +order to determine that the KVM/arm64-specific hypercalls are available. + +``ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID`` +--------------------------------------------- + +Provides a discovery mechanism for other KVM/arm64 hypercalls. 
+ ++---------------------+-------------------------------------------------------------+ +| Presence: | Mandatory for the KVM/arm64 UID | ++---------------------+-------------------------------------------------------------+ +| Calling convention: | HVC32 | ++---------------------+----------+--------------------------------------------------+ +| Function ID: | (uint32) | 0x86000000 | ++---------------------+----------+--------------------------------------------------+ +| Arguments: | None | ++---------------------+----------+----+---------------------------------------------+ +| Return Values: | (uint32) | R0 | Bitmap of available function numbers 0-31 | +| +----------+----+---------------------------------------------+ +| | (uint32) | R1 | Bitmap of available function numbers 32-63 | +| +----------+----+---------------------------------------------+ +| | (uint32) | R2 | Bitmap of available function numbers 64-95 | +| +----------+----+---------------------------------------------+ +| | (uint32) | R3 | Bitmap of available function numbers 96-127 | ++---------------------+----------+----+---------------------------------------------+ + +``ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID`` +---------------------------------------- + +See ptp_kvm.rst diff --git a/Documentation/virt/kvm/arm/index.rst b/Documentation/virt/kvm/arm/index.rst index 5435b92baf4c..1f8ad6393921 100644 --- a/Documentation/virt/kvm/arm/index.rst +++ b/Documentation/virt/kvm/arm/index.rst @@ -9,5 +9,6 @@ ARM fw-pseudo-registers hyp-abi + hypercalls pvtime ptp_kvm From 2f6e171dd35c8917b37a83fd109a1d0c29035135 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 14 Apr 2022 17:53:36 +0100 Subject: [PATCH 092/457] ANDROID: KVM: arm64: Reformat/beautify PTP hypercall documentation The PTP hypercall documentation doesn't produce the best-looking table when formatting in HTML as all of the return value definitions end up on the same line. Reformat the PTP hypercall documentation to follow the formatting used by hypercalls.rst. Signed-off-by: Will Deacon Signed-off-by: Will Deacon Bug: 233587962 Change-Id: Ic77cea5a621a9278d098afd80ef4c0e125760814 Signed-off-by: Quentin Perret --- Documentation/virt/kvm/arm/ptp_kvm.rst | 38 ++++++++++++++++---------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/Documentation/virt/kvm/arm/ptp_kvm.rst b/Documentation/virt/kvm/arm/ptp_kvm.rst index aecdc80ddcd8..7c0960970a0e 100644 --- a/Documentation/virt/kvm/arm/ptp_kvm.rst +++ b/Documentation/virt/kvm/arm/ptp_kvm.rst @@ -7,19 +7,29 @@ PTP_KVM is used for high precision time sync between host and guests. It relies on transferring the wall clock and counter value from the host to the guest using a KVM-specific hypercall. -* ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID: 0x86000001 +``ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID`` +---------------------------------------- -This hypercall uses the SMC32/HVC32 calling convention: +Retrieve current time information for the specific counter. There are no +endianness restrictions. -ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID - ============== ======== ===================================== - Function ID: (uint32) 0x86000001 - Arguments: (uint32) KVM_PTP_VIRT_COUNTER(0) - KVM_PTP_PHYS_COUNTER(1) - Return Values: (int32) NOT_SUPPORTED(-1) on error, or - (uint32) Upper 32 bits of wall clock time (r0) - (uint32) Lower 32 bits of wall clock time (r1) - (uint32) Upper 32 bits of counter (r2) - (uint32) Lower 32 bits of counter (r3) - Endianness: No Restrictions. 
- ============== ======== ===================================== ++---------------------+-------------------------------------------------------+ +| Presence: | Optional | ++---------------------+-------------------------------------------------------+ +| Calling convention: | HVC32 | ++---------------------+----------+--------------------------------------------+ +| Function ID: | (uint32) | 0x86000001 | ++---------------------+----------+----+---------------------------------------+ +| Arguments: | (uint32) | R1 | ``KVM_PTP_VIRT_COUNTER (0)`` | +| | | +---------------------------------------+ +| | | | ``KVM_PTP_PHYS_COUNTER (1)`` | ++---------------------+----------+----+---------------------------------------+ +| Return Values: | (int32) | R0 | ``NOT_SUPPORTED (-1)`` on error, else | +| | | | upper 32 bits of wall clock time | +| +----------+----+---------------------------------------+ +| | (uint32) | R1 | Lower 32 bits of wall clock time | +| +----------+----+---------------------------------------+ +| | (uint32) | R2 | Upper 32 bits of counter | +| +----------+----+---------------------------------------+ +| | (uint32) | R3 | Lower 32 bits of counter | ++---------------------+----------+----+---------------------------------------+ From 49d7c22232214c97647011c5675e9f24d2b0950e Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 29 Mar 2022 15:10:14 +0100 Subject: [PATCH 093/457] ANDROID: KVM: arm64: Expose memory sharing hypercalls to protected guests Extend our KVM "vendor" hypercalls to expose three new hypercalls to protected guests for the purpose of opening and closing shared memory windows with the host: MEMINFO: Query the stage-2 page size (i.e. the minimum granule at which memory can be shared) MEM_SHARE: Share a page RWX with the host, faulting the page in if necessary. MEM_UNSHARE: Unshare a page with the host. Subsequent host accesses to the page will result in a fault being injected by the hypervisor. Signed-off-by: Will Deacon Signed-off-by: Will Deacon Bug: 233587962 Change-Id: I80fe8af0bc0b3a40460c5065eabe26b1d9f634f2 Signed-off-by: Quentin Perret --- Documentation/virt/kvm/arm/hypercalls.rst | 72 ++++++++++++++ arch/arm64/kvm/hyp/nvhe/hyp-main.c | 24 ++++- arch/arm64/kvm/hyp/nvhe/pkvm.c | 112 +++++++++++++++++++++- include/linux/arm-smccc.h | 21 ++++ 4 files changed, 226 insertions(+), 3 deletions(-) diff --git a/Documentation/virt/kvm/arm/hypercalls.rst b/Documentation/virt/kvm/arm/hypercalls.rst index 17be111f493f..d96c9fd7d8c5 100644 --- a/Documentation/virt/kvm/arm/hypercalls.rst +++ b/Documentation/virt/kvm/arm/hypercalls.rst @@ -44,3 +44,75 @@ Provides a discovery mechanism for other KVM/arm64 hypercalls. ---------------------------------------- See ptp_kvm.rst + +``ARM_SMCCC_KVM_FUNC_HYP_MEMINFO`` +---------------------------------- + +Query the memory protection parameters for a protected virtual machine. + ++---------------------+-------------------------------------------------------------+ +| Presence: | Optional; protected guests only. 
| ++---------------------+-------------------------------------------------------------+ +| Calling convention: | HVC64 | ++---------------------+----------+--------------------------------------------------+ +| Function ID: | (uint32) | 0xC6000002 | ++---------------------+----------+----+---------------------------------------------+ +| Arguments: | (uint64) | R1 | Reserved / Must be zero | +| +----------+----+---------------------------------------------+ +| | (uint64) | R2 | Reserved / Must be zero | +| +----------+----+---------------------------------------------+ +| | (uint64) | R3 | Reserved / Must be zero | ++---------------------+----------+----+---------------------------------------------+ +| Return Values: | (int64) | R0 | ``INVALID_PARAMETER (-3)`` on error, else | +| | | | memory protection granule in bytes | ++---------------------+----------+----+---------------------------------------------+ + +``ARM_SMCCC_KVM_FUNC_MEM_SHARE`` +-------------------------------- + +Share a region of memory with the KVM host, granting it read, write and execute +permissions. The size of the region is equal to the memory protection granule +advertised by ``ARM_SMCCC_KVM_FUNC_HYP_MEMINFO``. + ++---------------------+-------------------------------------------------------------+ +| Presence: | Optional; protected guests only. | ++---------------------+-------------------------------------------------------------+ +| Calling convention: | HVC64 | ++---------------------+----------+--------------------------------------------------+ +| Function ID: | (uint32) | 0xC6000003 | ++---------------------+----------+----+---------------------------------------------+ +| Arguments: | (uint64) | R1 | Base IPA of memory region to share | +| +----------+----+---------------------------------------------+ +| | (uint64) | R2 | Reserved / Must be zero | +| +----------+----+---------------------------------------------+ +| | (uint64) | R3 | Reserved / Must be zero | ++---------------------+----------+----+---------------------------------------------+ +| Return Values: | (int64) | R0 | ``SUCCESS (0)`` | +| | | +---------------------------------------------+ +| | | | ``INVALID_PARAMETER (-3)`` | ++---------------------+----------+----+---------------------------------------------+ + +``ARM_SMCCC_KVM_FUNC_MEM_UNSHARE`` +---------------------------------- + +Revoke access permission from the KVM host to a memory region previously shared +with ``ARM_SMCCC_KVM_FUNC_MEM_SHARE``. The size of the region is equal to the +memory protection granule advertised by ``ARM_SMCCC_KVM_FUNC_HYP_MEMINFO``. + ++---------------------+-------------------------------------------------------------+ +| Presence: | Optional; protected guests only. 
| ++---------------------+-------------------------------------------------------------+ +| Calling convention: | HVC64 | ++---------------------+----------+--------------------------------------------------+ +| Function ID: | (uint32) | 0xC6000004 | ++---------------------+----------+----+---------------------------------------------+ +| Arguments: | (uint64) | R1 | Base IPA of memory region to unshare | +| +----------+----+---------------------------------------------+ +| | (uint64) | R2 | Reserved / Must be zero | +| +----------+----+---------------------------------------------+ +| | (uint64) | R3 | Reserved / Must be zero | ++---------------------+----------+----+---------------------------------------------+ +| Return Values: | (int64) | R0 | ``SUCCESS (0)`` | +| | | +---------------------------------------------+ +| | | | ``INVALID_PARAMETER (-3)`` | ++---------------------+----------+----+---------------------------------------------+ diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 237f6b5a203f..877dbb21da3a 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -47,7 +47,7 @@ static void handle_pvm_entry_wfx(struct pkvm_hyp_vcpu *hyp_vcpu) } } -static void handle_pvm_entry_hvc64(struct pkvm_hyp_vcpu *hyp_vcpu) +static void handle_pvm_entry_psci(struct pkvm_hyp_vcpu *hyp_vcpu) { u32 psci_fn = smccc_get_function(&hyp_vcpu->vcpu); u64 ret = READ_ONCE(hyp_vcpu->host_vcpu->arch.ctxt.regs.regs[0]); @@ -84,6 +84,22 @@ static void handle_pvm_entry_hvc64(struct pkvm_hyp_vcpu *hyp_vcpu) vcpu_set_reg(&hyp_vcpu->vcpu, 0, ret); } +static void handle_pvm_entry_hvc64(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + u32 fn = smccc_get_function(&hyp_vcpu->vcpu); + + switch (fn) { + case ARM_SMCCC_VENDOR_HYP_KVM_MEM_SHARE_FUNC_ID: + fallthrough; + case ARM_SMCCC_VENDOR_HYP_KVM_MEM_UNSHARE_FUNC_ID: + vcpu_set_reg(&hyp_vcpu->vcpu, 0, SMCCC_RET_SUCCESS); + break; + default: + handle_pvm_entry_psci(hyp_vcpu); + break; + } +} + static void handle_pvm_entry_sys64(struct pkvm_hyp_vcpu *hyp_vcpu) { struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; @@ -241,6 +257,12 @@ static void handle_pvm_exit_hvc64(struct pkvm_hyp_vcpu *hyp_vcpu) n = 1; break; + case ARM_SMCCC_VENDOR_HYP_KVM_MEM_SHARE_FUNC_ID: + fallthrough; + case ARM_SMCCC_VENDOR_HYP_KVM_MEM_UNSHARE_FUNC_ID: + n = 4; + break; + case PSCI_1_1_FN_SYSTEM_RESET2: case PSCI_1_1_FN64_SYSTEM_RESET2: n = 3; diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 89c3a0ad2933..52e45f42e1ec 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -1103,6 +1103,85 @@ static bool pkvm_handle_psci(struct pkvm_hyp_vcpu *hyp_vcpu) return pvm_psci_not_supported(hyp_vcpu); } +static u64 __pkvm_memshare_page_req(struct pkvm_hyp_vcpu *hyp_vcpu, u64 ipa) +{ + struct kvm_vcpu *vcpu = &hyp_vcpu->vcpu; + u64 elr; + + /* Fake up a data abort (Level 3 translation fault on write) */ + vcpu->arch.fault.esr_el2 = (u32)ESR_ELx_EC_DABT_LOW << ESR_ELx_EC_SHIFT | + ESR_ELx_WNR | ESR_ELx_FSC_FAULT | + FIELD_PREP(ESR_ELx_FSC_LEVEL, 3); + + /* Shuffle the IPA around into the HPFAR */ + vcpu->arch.fault.hpfar_el2 = (ipa >> 8) & HPFAR_MASK; + + /* This is a virtual address. 0's good. Let's go with 0. 
*/ + vcpu->arch.fault.far_el2 = 0; + + /* Rewind the ELR so we return to the HVC once the IPA is mapped */ + elr = read_sysreg(elr_el2); + elr -= 4; + write_sysreg(elr, elr_el2); + + return ARM_EXCEPTION_TRAP; +} + +static bool pkvm_memshare_call(struct pkvm_hyp_vcpu *hyp_vcpu, u64 *exit_code) +{ + struct kvm_vcpu *vcpu = &hyp_vcpu->vcpu; + u64 ipa = smccc_get_arg1(vcpu); + u64 arg2 = smccc_get_arg2(vcpu); + u64 arg3 = smccc_get_arg3(vcpu); + int err; + + if (arg2 || arg3) + goto out_guest_err; + + err = __pkvm_guest_share_host(hyp_vcpu, ipa); + switch (err) { + case 0: + /* Success! Now tell the host. */ + goto out_host; + case -EFAULT: + /* + * Convert the exception into a data abort so that the page + * being shared is mapped into the guest next time. + */ + *exit_code = __pkvm_memshare_page_req(hyp_vcpu, ipa); + goto out_host; + } + +out_guest_err: + smccc_set_retval(vcpu, SMCCC_RET_INVALID_PARAMETER, 0, 0, 0); + return true; + +out_host: + return false; +} + +static bool pkvm_memunshare_call(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct kvm_vcpu *vcpu = &hyp_vcpu->vcpu; + u64 ipa = smccc_get_arg1(vcpu); + u64 arg2 = smccc_get_arg2(vcpu); + u64 arg3 = smccc_get_arg3(vcpu); + int err; + + if (arg2 || arg3) + goto out_guest_err; + + err = __pkvm_guest_unshare_host(hyp_vcpu, ipa); + if (err) + goto out_guest_err; + + return false; + +out_guest_err: + smccc_set_retval(vcpu, SMCCC_RET_INVALID_PARAMETER, 0, 0, 0); + return true; +} + /* * Handler for protected VM HVC calls. * @@ -1111,6 +1190,7 @@ static bool pkvm_handle_psci(struct pkvm_hyp_vcpu *hyp_vcpu) */ bool kvm_handle_pvm_hvc64(struct kvm_vcpu *vcpu, u64 *exit_code) { + u64 val[4] = { SMCCC_RET_NOT_SUPPORTED }; u32 fn = smccc_get_function(vcpu); struct pkvm_hyp_vcpu *hyp_vcpu; @@ -1119,9 +1199,37 @@ bool kvm_handle_pvm_hvc64(struct kvm_vcpu *vcpu, u64 *exit_code) switch (fn) { case ARM_SMCCC_VERSION_FUNC_ID: /* Nothing to be handled by the host. Go back to the guest. 
*/ - smccc_set_retval(vcpu, ARM_SMCCC_VERSION_1_1, 0, 0, 0); - return true; + val[0] = ARM_SMCCC_VERSION_1_1; + break; + case ARM_SMCCC_VENDOR_HYP_CALL_UID_FUNC_ID: + val[0] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_0; + val[1] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_1; + val[2] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_2; + val[3] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_3; + break; + case ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID: + val[0] = BIT(ARM_SMCCC_KVM_FUNC_FEATURES); + val[0] |= BIT(ARM_SMCCC_KVM_FUNC_HYP_MEMINFO); + val[0] |= BIT(ARM_SMCCC_KVM_FUNC_MEM_SHARE); + val[0] |= BIT(ARM_SMCCC_KVM_FUNC_MEM_UNSHARE); + break; + case ARM_SMCCC_VENDOR_HYP_KVM_HYP_MEMINFO_FUNC_ID: + if (smccc_get_arg1(vcpu) || + smccc_get_arg2(vcpu) || + smccc_get_arg3(vcpu)) { + val[0] = SMCCC_RET_INVALID_PARAMETER; + } else { + val[0] = PAGE_SIZE; + } + break; + case ARM_SMCCC_VENDOR_HYP_KVM_MEM_SHARE_FUNC_ID: + return pkvm_memshare_call(hyp_vcpu, exit_code); + case ARM_SMCCC_VENDOR_HYP_KVM_MEM_UNSHARE_FUNC_ID: + return pkvm_memunshare_call(hyp_vcpu); default: return pkvm_handle_psci(hyp_vcpu); } + + smccc_set_retval(vcpu, val[0], val[1], val[2], val[3]); + return true; } diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h index 220c8c60e021..25c576a910df 100644 --- a/include/linux/arm-smccc.h +++ b/include/linux/arm-smccc.h @@ -112,6 +112,9 @@ /* KVM "vendor specific" services */ #define ARM_SMCCC_KVM_FUNC_FEATURES 0 #define ARM_SMCCC_KVM_FUNC_PTP 1 +#define ARM_SMCCC_KVM_FUNC_HYP_MEMINFO 2 +#define ARM_SMCCC_KVM_FUNC_MEM_SHARE 3 +#define ARM_SMCCC_KVM_FUNC_MEM_UNSHARE 4 #define ARM_SMCCC_KVM_FUNC_FEATURES_2 127 #define ARM_SMCCC_KVM_NUM_FUNCS 128 @@ -134,6 +137,24 @@ ARM_SMCCC_OWNER_VENDOR_HYP, \ ARM_SMCCC_KVM_FUNC_PTP) +#define ARM_SMCCC_VENDOR_HYP_KVM_HYP_MEMINFO_FUNC_ID \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_64, \ + ARM_SMCCC_OWNER_VENDOR_HYP, \ + ARM_SMCCC_KVM_FUNC_HYP_MEMINFO) + +#define ARM_SMCCC_VENDOR_HYP_KVM_MEM_SHARE_FUNC_ID \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_64, \ + ARM_SMCCC_OWNER_VENDOR_HYP, \ + ARM_SMCCC_KVM_FUNC_MEM_SHARE) + +#define ARM_SMCCC_VENDOR_HYP_KVM_MEM_UNSHARE_FUNC_ID \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_64, \ + ARM_SMCCC_OWNER_VENDOR_HYP, \ + ARM_SMCCC_KVM_FUNC_MEM_UNSHARE) + /* ptp_kvm counter type ID */ #define KVM_PTP_VIRT_COUNTER 0 #define KVM_PTP_PHYS_COUNTER 1 From f6ac1d52871c481d7bce78fb7579a8aedf265741 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Mon, 25 Apr 2022 15:20:12 +0000 Subject: [PATCH 094/457] ANDROID: BACKPORT: KVM: arm64: Introduce KVM_VM_TYPE_ARM_PROTECTED machine type for PVMs Introduce a new virtual machine type, KVM_VM_TYPE_ARM_PROTECTED, which specifies that the guest memory pages are to be unmapped from the host stage-2 by the hypervisor. 
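For illustration, a VMM opts in by passing the new flag in the machine type
identifier given to KVM_CREATE_VM. A minimal sketch follows (not taken from
any existing VMM; error handling and fd cleanup trimmed; the function name is
made up for the example):

  #include <fcntl.h>
  #include <sys/ioctl.h>
  #include <linux/kvm.h>

  static int create_protected_vm(void)
  {
  	int kvm_fd, vm_fd;

  	kvm_fd = open("/dev/kvm", O_RDWR | O_CLOEXEC);
  	if (kvm_fd < 0)
  		return kvm_fd;

  	/*
  	 * Leaving bits [7:0] clear requests the default IPA size; a specific
  	 * size could be OR'd into the low byte (KVM_VM_TYPE_ARM_IPA_SIZE_MASK).
  	 */
  	vm_fd = ioctl(kvm_fd, KVM_CREATE_VM, KVM_VM_TYPE_ARM_PROTECTED);

  	/* Fails (errno == EINVAL) unless booted with kvm-arm.mode=protected. */
  	return vm_fd;
  }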
Signed-off-by: Will Deacon [willdeacon@: Align KVM_VM_TYPE_ARM_PROTECTED value with android13 kernels] Signed-off-by: Will Deacon Bug: 233587962 Change-Id: Iabcd03865aed4a41637597ac247897fd185bfc4d Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_pkvm.h | 2 +- arch/arm64/kvm/arm.c | 5 ++++- arch/arm64/kvm/mmu.c | 3 --- arch/arm64/kvm/pkvm.c | 10 +++++++++- include/uapi/linux/kvm.h | 6 ++++++ 5 files changed, 20 insertions(+), 6 deletions(-) diff --git a/arch/arm64/include/asm/kvm_pkvm.h b/arch/arm64/include/asm/kvm_pkvm.h index 2da3f274fe11..014f13c04b4b 100644 --- a/arch/arm64/include/asm/kvm_pkvm.h +++ b/arch/arm64/include/asm/kvm_pkvm.h @@ -16,7 +16,7 @@ #define HYP_MEMBLOCK_REGIONS 128 -int pkvm_init_host_vm(struct kvm *kvm); +int pkvm_init_host_vm(struct kvm *kvm, unsigned long type); int pkvm_create_hyp_vm(struct kvm *kvm); void pkvm_destroy_hyp_vm(struct kvm *kvm); diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 9bbd10b2cdaf..79fe530159cc 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -145,11 +145,14 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { int ret; + if (type & ~KVM_VM_TYPE_MASK) + return -EINVAL; + ret = kvm_share_hyp(kvm, kvm + 1); if (ret) return ret; - ret = pkvm_init_host_vm(kvm); + ret = pkvm_init_host_vm(kvm, type); if (ret) goto err_unshare_kvm; diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 3dc1ebc923d6..d7138520e0f1 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -709,9 +709,6 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t u64 mmfr0, mmfr1; u32 phys_shift; - if (type & ~KVM_VM_TYPE_ARM_IPA_SIZE_MASK) - return -EINVAL; - phys_shift = KVM_VM_TYPE_ARM_IPA_SIZE(type); if (is_protected_kvm_enabled()) { phys_shift = kvm_ipa_limit; diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c index d97d17c96321..8e9bcf75e3e3 100644 --- a/arch/arm64/kvm/pkvm.c +++ b/arch/arm64/kvm/pkvm.c @@ -235,8 +235,16 @@ void pkvm_destroy_hyp_vm(struct kvm *host_kvm) } } -int pkvm_init_host_vm(struct kvm *host_kvm) +int pkvm_init_host_vm(struct kvm *host_kvm, unsigned long type) { mutex_init(&host_kvm->lock); + + if (!(type & KVM_VM_TYPE_ARM_PROTECTED)) + return 0; + + if (!is_protected_kvm_enabled()) + return -EINVAL; + + host_kvm->arch.pkvm.enabled = true; return 0; } diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 0d5d4419139a..48916ae8099c 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -911,6 +911,12 @@ struct kvm_ppc_resize_hpt { #define KVM_VM_TYPE_ARM_IPA_SIZE_MASK 0xffULL #define KVM_VM_TYPE_ARM_IPA_SIZE(x) \ ((x) & KVM_VM_TYPE_ARM_IPA_SIZE_MASK) + +#define KVM_VM_TYPE_ARM_PROTECTED (1UL << 31) + +#define KVM_VM_TYPE_MASK (KVM_VM_TYPE_ARM_IPA_SIZE_MASK | \ + KVM_VM_TYPE_ARM_PROTECTED) + /* * ioctls for /dev/kvm fds: */ From a7b23eb054de43a684403e62828557d37b6100ed Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 21 Apr 2022 14:44:11 +0100 Subject: [PATCH 095/457] ANDROID: Documentation: KVM: Add some documentation for Protected KVM on arm64 Add some initial documentation for the Protected KVM (pKVM) feature on arm64, describing the user ABI for creating protected VMs as well as their limitations. 
Signed-off-by: Will Deacon Signed-off-by: Will Deacon Bug: 233587962 Change-Id: I152af404f24b9aba3cc9be6acd8e26afcfa4b0a5 Signed-off-by: Quentin Perret --- .../admin-guide/kernel-parameters.txt | 4 +- Documentation/virt/kvm/arm/index.rst | 1 + Documentation/virt/kvm/arm/pkvm.rst | 96 +++++++++++++++++++ 3 files changed, 100 insertions(+), 1 deletion(-) create mode 100644 Documentation/virt/kvm/arm/pkvm.rst diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 72874edc0320..1ccd7e09a882 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2532,7 +2532,9 @@ protected guests. protected: nVHE-based mode with support for guests whose - state is kept private from the host. + state is kept private from the host. See + Documentation/virt/kvm/arm/pkvm.rst for more + information about this mode of operation. Defaults to VHE/nVHE based on hardware support. Setting mode to "protected" will disable kexec and hibernation diff --git a/Documentation/virt/kvm/arm/index.rst b/Documentation/virt/kvm/arm/index.rst index 1f8ad6393921..1e3f28722768 100644 --- a/Documentation/virt/kvm/arm/index.rst +++ b/Documentation/virt/kvm/arm/index.rst @@ -10,5 +10,6 @@ ARM fw-pseudo-registers hyp-abi hypercalls + pkvm pvtime ptp_kvm diff --git a/Documentation/virt/kvm/arm/pkvm.rst b/Documentation/virt/kvm/arm/pkvm.rst new file mode 100644 index 000000000000..64f099a5ac2e --- /dev/null +++ b/Documentation/virt/kvm/arm/pkvm.rst @@ -0,0 +1,96 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Protected virtual machines (pKVM) +================================= + +Introduction +------------ + +Protected KVM (pKVM) is a KVM/arm64 extension which uses the two-stage +translation capability of the Armv8 MMU to isolate guest memory from the host +system. This allows for the creation of a confidential computing environment +without relying on whizz-bang features in hardware, but still allowing room for +complementary technologies such as memory encryption and hardware-backed +attestation. + +The major implementation change brought about by pKVM is that the hypervisor +code running at EL2 is now largely independent of (and isolated from) the rest +of the host kernel running at EL1 and therefore additional hypercalls are +introduced to manage manipulation of guest stage-2 page tables, creation of VM +data structures and reclamation of memory on teardown. An immediate consequence +of this change is that the host itself runs with an identity mapping enabled +at stage-2, providing the hypervisor code with a mechanism to restrict host +access to an arbitrary physical page. + +Enabling pKVM +------------- + +The pKVM hypervisor is enabled by booting the host kernel at EL2 with +"``kvm-arm.mode=protected``" on the command-line. Once enabled, VMs can be spawned +in either protected or non-protected state, although the hypervisor is still +responsible for managing most of the VM metadata in either case. + +Limitations +----------- + +Enabling pKVM places some significant limitations on KVM guests, regardless of +whether they are spawned in protected state. It is therefore recommended only +to enable pKVM if protected VMs are required, with non-protected state acting +primarily as a debug and development aid. 
+ +If you're still keen, then here is an incomplete list of caveats that apply +to all VMs running under pKVM: + +- Guest memory cannot be file-backed (with the exception of shmem/memfd) and is + pinned as it is mapped into the guest. This prevents the host from + swapping-out, migrating, merging or generally doing anything useful with the + guest pages. It also requires that the VMM has either ``CAP_IPC_LOCK`` or + sufficient ``RLIMIT_MEMLOCK`` to account for this pinned memory. + +- GICv2 is not supported and therefore GICv3 hardware is required in order + to expose a virtual GICv3 to the guest. + +- Read-only memslots are unsupported and therefore dirty logging cannot be + enabled. + +- Memslot configuration is fixed once a VM has started running, with subsequent + move or deletion requests being rejected with ``-EPERM``. + +- There are probably many others. + +Since the host is unable to tear down the hypervisor when pKVM is enabled, +hibernation (``CONFIG_HIBERNATION``) and kexec (``CONFIG_KEXEC``) will fail +with ``-EBUSY``. + +If you are not happy with these limitations, then please don't enable pKVM :) + +VM creation +----------- + +When pKVM is enabled, protected VMs can be created by specifying the +``KVM_VM_TYPE_ARM_PROTECTED`` flag in the machine type identifier parameter +passed to ``KVM_CREATE_VM``. + +Protected VMs are instantiated according to a fixed vCPU configuration +described by the ID register definitions in +``arch/arm64/include/asm/kvm_pkvm.h``. Only a subset of the architectural +features that may be available to the host are exposed to the guest and the +capabilities advertised by ``KVM_CHECK_EXTENSION`` are limited accordingly, +with the vCPU registers being initialised to their architecturally-defined +values. + +Where not defined by the architecture, the registers of a protected vCPU +are reset to zero with the exception of the PC and X0 which can be set +either by the ``KVM_SET_ONE_REG`` interface or by a call to PSCI ``CPU_ON``. + +VM runtime +---------- + +By default, memory pages mapped into a protected guest are inaccessible to the +host and any attempt by the host to access such a page will result in the +injection of an abort at EL1 by the hypervisor. For accesses originating from +EL0, the host will then terminate the current task with a ``SIGSEGV``. + +pKVM exposes additional hypercalls to protected guests, primarily for the +purpose of establishing shared-memory regions with the host for communication +and I/O. These hypercalls are documented in hypercalls.rst. From 3255d10e84c9413aaf678ebaa741eaa9c15f4f87 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 17 Nov 2020 13:05:03 +0000 Subject: [PATCH 096/457] ANDROID: KVM: arm64: Parse reserved-memory node for pkvm guest firmware region Add support for a "linux,pkvm-guest-firmware-memory" reserved memory region, which can be used to identify a firmware image for protected VMs. If pKVM fails to initialise and a firmware region is advertised, then the memory is cleared during boot. 
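For reference, a devicetree fragment satisfying the checks below might look as
follows. This is a hypothetical example: the node name, base address and size
are placeholders; what the code actually requires is the compatible string, a
"no-map" property, no "reusable" property, and a page-aligned base and size:

  reserved-memory {
  	#address-cells = <2>;
  	#size-cells = <2>;
  	ranges;

  	pvmfw: pkvm-guest-firmware@80000000 {
  		compatible = "linux,pkvm-guest-firmware-memory";
  		reg = <0x0 0x80000000 0x0 0x200000>;	/* page-aligned base/size */
  		no-map;
  	};
  };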
Signed-off-by: Will Deacon Signed-off-by: Will Deacon Bug: 254819795 Change-Id: Ibfcc0ff00d4b8a42747452047856cb9ba8def4c4 Signed-off-by: Quentin Perret --- arch/arm64/kvm/pkvm.c | 61 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c index 8e9bcf75e3e3..f0cdb15e25b1 100644 --- a/arch/arm64/kvm/pkvm.c +++ b/arch/arm64/kvm/pkvm.c @@ -4,16 +4,21 @@ * Author: Quentin Perret */ +#include #include #include #include #include +#include +#include #include #include #include "hyp_constants.h" +static struct reserved_mem *pkvm_firmware_mem; + static struct memblock_region *hyp_memory = kvm_nvhe_sym(hyp_memory); static unsigned int *hyp_memblock_nr_ptr = &kvm_nvhe_sym(hyp_memblock_nr); @@ -248,3 +253,59 @@ int pkvm_init_host_vm(struct kvm *host_kvm, unsigned long type) host_kvm->arch.pkvm.enabled = true; return 0; } + +static int __init pkvm_firmware_rmem_err(struct reserved_mem *rmem, + const char *reason) +{ + phys_addr_t end = rmem->base + rmem->size; + + kvm_err("Ignoring pkvm guest firmware memory reservation [%pa - %pa]: %s\n", + &rmem->base, &end, reason); + return -EINVAL; +} + +static int __init pkvm_firmware_rmem_init(struct reserved_mem *rmem) +{ + unsigned long node = rmem->fdt_node; + + if (pkvm_firmware_mem) + return pkvm_firmware_rmem_err(rmem, "duplicate reservation"); + + if (!of_get_flat_dt_prop(node, "no-map", NULL)) + return pkvm_firmware_rmem_err(rmem, "missing \"no-map\" property"); + + if (of_get_flat_dt_prop(node, "reusable", NULL)) + return pkvm_firmware_rmem_err(rmem, "\"reusable\" property unsupported"); + + if (!PAGE_ALIGNED(rmem->base)) + return pkvm_firmware_rmem_err(rmem, "base is not page-aligned"); + + if (!PAGE_ALIGNED(rmem->size)) + return pkvm_firmware_rmem_err(rmem, "size is not page-aligned"); + + pkvm_firmware_mem = rmem; + return 0; +} +RESERVEDMEM_OF_DECLARE(pkvm_firmware, "linux,pkvm-guest-firmware-memory", + pkvm_firmware_rmem_init); + +static int __init pkvm_firmware_rmem_clear(void) +{ + void *addr; + phys_addr_t size; + + if (likely(!pkvm_firmware_mem) || is_protected_kvm_enabled()) + return 0; + + kvm_info("Clearing unused pKVM firmware memory\n"); + size = pkvm_firmware_mem->size; + addr = memremap(pkvm_firmware_mem->base, size, MEMREMAP_WB); + if (!addr) + return -EINVAL; + + memset(addr, 0, size); + dcache_clean_poc((unsigned long)addr, (unsigned long)addr + size); + memunmap(addr); + return 0; +} +device_initcall_sync(pkvm_firmware_rmem_clear); From d912c45d04730ad2a7a5acd7316c188dce3e3ad0 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 9 Dec 2021 11:51:12 +0000 Subject: [PATCH 097/457] ANDROID: KVM: arm64: Unmap PVM firmware from host stage-2 during de-privilege Unmap the PVM firmware memory from the pKVM host by transferring ownership of the pages to the hypervisor when the host deprivileges itself during boot. 
Signed-off-by: Will Deacon Signed-off-by: Will Deacon Bug: 254819795 Change-Id: I311642f543c0c73d0e0cf2ec051e8e2d9759c5d1 Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_pkvm.h | 3 +++ arch/arm64/kvm/hyp/nvhe/setup.c | 10 ++++++++++ arch/arm64/kvm/pkvm.c | 4 ++++ 3 files changed, 17 insertions(+) diff --git a/arch/arm64/include/asm/kvm_pkvm.h b/arch/arm64/include/asm/kvm_pkvm.h index 014f13c04b4b..542cf02ec6b0 100644 --- a/arch/arm64/include/asm/kvm_pkvm.h +++ b/arch/arm64/include/asm/kvm_pkvm.h @@ -240,6 +240,9 @@ static inline int pkvm_get_max_wrps(void) extern struct memblock_region kvm_nvhe_sym(hyp_memory)[]; extern unsigned int kvm_nvhe_sym(hyp_memblock_nr); +extern phys_addr_t kvm_nvhe_sym(pvmfw_base); +extern phys_addr_t kvm_nvhe_sym(pvmfw_size); + static inline unsigned long hyp_vmemmap_memblock_size(struct memblock_region *reg, size_t vmemmap_entry_size) { diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c index ab9a0d3515af..58be7ec39332 100644 --- a/arch/arm64/kvm/hyp/nvhe/setup.c +++ b/arch/arm64/kvm/hyp/nvhe/setup.c @@ -20,6 +20,9 @@ unsigned long hyp_nr_cpus; +phys_addr_t pvmfw_base; +phys_addr_t pvmfw_size; + #define hyp_percpu_size ((unsigned long)__per_cpu_end - \ (unsigned long)__per_cpu_start) @@ -157,6 +160,13 @@ static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size, if (ret) return ret; + start = hyp_phys_to_virt(pvmfw_base); + end = start + pvmfw_size; + prot = pkvm_mkstate(PAGE_HYP_RO, PKVM_PAGE_OWNED); + ret = pkvm_create_mappings(start, end, prot); + if (ret) + return ret; + return 0; } diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c index f0cdb15e25b1..7ee1b39daaff 100644 --- a/arch/arm64/kvm/pkvm.c +++ b/arch/arm64/kvm/pkvm.c @@ -18,6 +18,8 @@ #include "hyp_constants.h" static struct reserved_mem *pkvm_firmware_mem; +static phys_addr_t *pvmfw_base = &kvm_nvhe_sym(pvmfw_base); +static phys_addr_t *pvmfw_size = &kvm_nvhe_sym(pvmfw_size); static struct memblock_region *hyp_memory = kvm_nvhe_sym(hyp_memory); static unsigned int *hyp_memblock_nr_ptr = &kvm_nvhe_sym(hyp_memblock_nr); @@ -283,6 +285,8 @@ static int __init pkvm_firmware_rmem_init(struct reserved_mem *rmem) if (!PAGE_ALIGNED(rmem->size)) return pkvm_firmware_rmem_err(rmem, "size is not page-aligned"); + *pvmfw_size = rmem->size; + *pvmfw_base = rmem->base; pkvm_firmware_mem = rmem; return 0; } From 49e5a971bc30dfeb0a5cbcc5810e7715058bffa7 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Mon, 24 Oct 2022 11:26:07 +0100 Subject: [PATCH 098/457] ANDROID: KVM: arm64: Ignore length of 0 in kvm_flush_dcache_to_poc() kvm_flush_dcache_to_poc() converts its (start,len) parameters into (start,end) parameters for dcache_clean_inval_poc(). This mostly works out except for the case when 'len == 0', where dcache_clean_inval_poc() will still issue cache maintenance for the cache line containing 'start'. If 'start' is not mapped, then this can generate an unexpected fault. In preparation for cleaning the pvmfw memory pages to the PoC on system reset, tweak kvm_flush_dcache_to_poc() to act as a no-op when the supplied length is 0 and avoid having to check for this corner case in the caller. 
Signed-off-by: Will Deacon Signed-off-by: Will Deacon Bug: 254819795 Change-Id: Idae2b22289398e941938821d1d3b3a5a1da3fd8f Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_mmu.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index e4a7e6369499..d4b2114cef54 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -187,8 +187,13 @@ static inline void *__kvm_vector_slot2addr(void *base, struct kvm; -#define kvm_flush_dcache_to_poc(a,l) \ - dcache_clean_inval_poc((unsigned long)(a), (unsigned long)(a)+(l)) +#define kvm_flush_dcache_to_poc(a, l) do { \ + unsigned long __a = (unsigned long)(a); \ + unsigned long __l = (unsigned long)(l); \ + \ + if (__l) \ + dcache_clean_inval_poc(__a, __a + __l); \ +} while (0) static inline bool vcpu_has_cache_enabled(struct kvm_vcpu *vcpu) { From 5ae1450bd01df9ee677a22bb6208819c6e6a277f Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 24 Feb 2022 14:46:45 +0000 Subject: [PATCH 099/457] ANDROID: KVM: arm64: Clear pvmfw pages on clean host shutdown When the host shuts down cleanly under pKVM, it is EL2's responsibility to clear the pvmfw pages before forwarding the PSCI call onto EL3. Wipe the pvmfw pages on SYSTEM_OFF, SYSTEM_RESET and SYSTEM_RESET2 calls from the host, cleaning the zeroed memory to the PoC for good measure. Signed-off-by: Will Deacon Signed-off-by: Will Deacon Bug: 254819795 Change-Id: I0dd2757e355f384813319034c6eed0fa2c2328c2 Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/include/nvhe/pkvm.h | 2 ++ arch/arm64/kvm/hyp/nvhe/pkvm.c | 8 ++++++++ arch/arm64/kvm/hyp/nvhe/psci-relay.c | 6 +++++- 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h index 6160d1a34fa2..69f33647ce05 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h +++ b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h @@ -109,4 +109,6 @@ bool kvm_handle_pvm_hvc64(struct kvm_vcpu *vcpu, u64 *exit_code); struct pkvm_hyp_vcpu *pkvm_mpidr_to_hyp_vcpu(struct pkvm_hyp_vm *vm, u64 mpidr); +void pkvm_clear_pvmfw_pages(void); + #endif /* __ARM64_KVM_NVHE_PKVM_H__ */ diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 52e45f42e1ec..1f15646c3f98 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -825,6 +825,14 @@ err_unlock: return err; } +void pkvm_clear_pvmfw_pages(void) +{ + void *addr = hyp_phys_to_virt(pvmfw_base); + + memset(addr, 0, pvmfw_size); + kvm_flush_dcache_to_poc(addr, pvmfw_size); +} + /* * This function sets the registers on the vcpu to their architecturally defined * reset values. 
diff --git a/arch/arm64/kvm/hyp/nvhe/psci-relay.c b/arch/arm64/kvm/hyp/nvhe/psci-relay.c index 08508783ec3d..cee6d4a2821f 100644 --- a/arch/arm64/kvm/hyp/nvhe/psci-relay.c +++ b/arch/arm64/kvm/hyp/nvhe/psci-relay.c @@ -12,6 +12,7 @@ #include #include +#include #include void kvm_hyp_cpu_entry(unsigned long r0); @@ -249,6 +250,7 @@ static unsigned long psci_0_2_handler(u64 func_id, struct kvm_cpu_context *host_ */ case PSCI_0_2_FN_SYSTEM_OFF: case PSCI_0_2_FN_SYSTEM_RESET: + pkvm_clear_pvmfw_pages(); return psci_forward(host_ctxt); case PSCI_0_2_FN64_CPU_SUSPEND: return psci_cpu_suspend(func_id, host_ctxt); @@ -262,9 +264,11 @@ static unsigned long psci_0_2_handler(u64 func_id, struct kvm_cpu_context *host_ static unsigned long psci_1_0_handler(u64 func_id, struct kvm_cpu_context *host_ctxt) { switch (func_id) { + case PSCI_1_1_FN64_SYSTEM_RESET2: + pkvm_clear_pvmfw_pages(); + fallthrough; case PSCI_1_0_FN_PSCI_FEATURES: case PSCI_1_0_FN_SET_SUSPEND_MODE: - case PSCI_1_1_FN64_SYSTEM_RESET2: return psci_forward(host_ctxt); case PSCI_1_0_FN64_SYSTEM_SUSPEND: return psci_system_suspend(func_id, host_ctxt); From fffed9c1431cfd744073f810c33493d3504231f7 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 9 Dec 2021 11:59:38 +0000 Subject: [PATCH 100/457] ANDROID: KVM: arm64: Copy pvmfw into guest pages during donation from the host When the host donates a page to a protected guest at an IPA which coincides with the PVM firmware load address, copy-in the relevant firmware page after unmapping it from the host but before mapping it into the guest. Signed-off-by: Will Deacon Signed-off-by: Will Deacon Bug: 254819795 Change-Id: I8cec813fa52938945f3122655deb785523a96ec8 Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_host.h | 1 + arch/arm64/include/asm/kvm_pkvm.h | 1 + arch/arm64/kvm/hyp/include/nvhe/pkvm.h | 20 +++++++++++++ arch/arm64/kvm/hyp/nvhe/mem_protect.c | 16 +++++++++-- arch/arm64/kvm/hyp/nvhe/pkvm.c | 39 ++++++++++++++++++++++++++ 5 files changed, 75 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index bfcc36c22345..aeede8de9339 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -183,6 +183,7 @@ struct kvm_protected_vm { pkvm_handle_t handle; struct kvm_hyp_memcache teardown_mc; struct list_head pinned_pages; + gpa_t pvmfw_load_addr; bool enabled; }; diff --git a/arch/arm64/include/asm/kvm_pkvm.h b/arch/arm64/include/asm/kvm_pkvm.h index 542cf02ec6b0..e44c79558625 100644 --- a/arch/arm64/include/asm/kvm_pkvm.h +++ b/arch/arm64/include/asm/kvm_pkvm.h @@ -15,6 +15,7 @@ #define KVM_MAX_PVMS 255 #define HYP_MEMBLOCK_REGIONS 128 +#define PVMFW_INVALID_LOAD_ADDR (-1) int pkvm_init_host_vm(struct kvm *kvm, unsigned long type); int pkvm_create_hyp_vm(struct kvm *kvm); diff --git a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h index 69f33647ce05..ea1f340b63a3 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h +++ b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h @@ -84,6 +84,9 @@ static inline bool pkvm_hyp_vcpu_is_protected(struct pkvm_hyp_vcpu *hyp_vcpu) return vcpu_is_protected(&hyp_vcpu->vcpu); } +extern phys_addr_t pvmfw_base; +extern phys_addr_t pvmfw_size; + void pkvm_hyp_vm_table_init(void *tbl); int __pkvm_init_vm(struct kvm *host_kvm, unsigned long vm_hva, @@ -109,6 +112,23 @@ bool kvm_handle_pvm_hvc64(struct kvm_vcpu *vcpu, u64 *exit_code); struct pkvm_hyp_vcpu *pkvm_mpidr_to_hyp_vcpu(struct pkvm_hyp_vm *vm, u64 mpidr); +static inline bool 
pkvm_hyp_vm_has_pvmfw(struct pkvm_hyp_vm *vm) +{ + return vm->kvm.arch.pkvm.pvmfw_load_addr != PVMFW_INVALID_LOAD_ADDR; +} + +static inline bool pkvm_ipa_in_pvmfw_region(struct pkvm_hyp_vm *vm, u64 ipa) +{ + struct kvm_protected_vm *pkvm = &vm->kvm.arch.pkvm; + + if (!pkvm_hyp_vm_has_pvmfw(vm)) + return false; + + return ipa - pkvm->pvmfw_load_addr < pvmfw_size; +} + +int pkvm_load_pvmfw_pages(struct pkvm_hyp_vm *vm, u64 ipa, phys_addr_t phys, + u64 size); void pkvm_clear_pvmfw_pages(void); #endif /* __ARM64_KVM_NVHE_PKVM_H__ */ diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 85289934cccd..5ac5ad879419 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -993,10 +993,22 @@ static int guest_complete_donation(u64 addr, const struct pkvm_mem_transition *t enum kvm_pgtable_prot prot = pkvm_mkstate(KVM_PGTABLE_PROT_RWX, PKVM_PAGE_OWNED); struct pkvm_hyp_vcpu *vcpu = tx->completer.guest.hyp_vcpu; struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); + phys_addr_t phys = tx->completer.guest.phys; u64 size = tx->nr_pages * PAGE_SIZE; + int err; - return kvm_pgtable_stage2_map(&vm->pgt, addr, size, tx->completer.guest.phys, - prot, &vcpu->vcpu.arch.pkvm_memcache); + if (tx->initiator.id == PKVM_ID_HOST && + pkvm_ipa_in_pvmfw_region(vm, addr)) { + if (WARN_ON(!pkvm_hyp_vcpu_is_protected(vcpu))) + return -EPERM; + + err = pkvm_load_pvmfw_pages(vm, addr, phys, size); + if (err) + return err; + } + + return kvm_pgtable_stage2_map(&vm->pgt, addr, size, phys, prot, + &vcpu->vcpu.arch.pkvm_memcache); } static int __guest_get_completer_addr(u64 *completer_addr, phys_addr_t phys, diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 1f15646c3f98..34d6c2d55d62 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -14,6 +14,7 @@ #include #include +#include #include #include @@ -448,6 +449,7 @@ static void init_pkvm_hyp_vm(struct kvm *host_kvm, struct pkvm_hyp_vm *hyp_vm, hyp_vm->host_kvm = host_kvm; hyp_vm->kvm.created_vcpus = nr_vcpus; hyp_vm->kvm.arch.vtcr = host_mmu.arch.vtcr; + hyp_vm->kvm.arch.pkvm.pvmfw_load_addr = PVMFW_INVALID_LOAD_ADDR; hyp_vm->kvm.arch.pkvm.enabled = READ_ONCE(host_kvm->arch.pkvm.enabled); hyp_vm->kvm.arch.mmu.last_vcpu_ran = last_ran; memset(hyp_vm->kvm.arch.mmu.last_vcpu_ran, -1, pkvm_get_last_ran_size()); @@ -825,6 +827,43 @@ err_unlock: return err; } +int pkvm_load_pvmfw_pages(struct pkvm_hyp_vm *vm, u64 ipa, phys_addr_t phys, + u64 size) +{ + struct kvm_protected_vm *pkvm = &vm->kvm.arch.pkvm; + u64 npages, offset = ipa - pkvm->pvmfw_load_addr; + void *src = hyp_phys_to_virt(pvmfw_base) + offset; + + if (offset >= pvmfw_size) + return -EINVAL; + + size = min(size, pvmfw_size - offset); + if (!PAGE_ALIGNED(size) || !PAGE_ALIGNED(src)) + return -EINVAL; + + npages = size >> PAGE_SHIFT; + while (npages--) { + void *dst; + + dst = hyp_fixmap_map(phys); + if (!dst) + return -EINVAL; + + /* + * No need for cache maintenance here, as the pgtable code will + * take care of this when installing the pte in the guest's + * stage-2 page table. 
+ */ + memcpy(dst, src, PAGE_SIZE); + + hyp_fixmap_unmap(); + src += PAGE_SIZE; + phys += PAGE_SIZE; + } + + return 0; +} + void pkvm_clear_pvmfw_pages(void) { void *addr = hyp_phys_to_virt(pvmfw_base); From 6d697e8a6158c63d59d10aa4ebe13472e601c866 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 9 Dec 2021 12:46:14 +0000 Subject: [PATCH 101/457] ANDROID: KVM: arm64: Reset primary vCPU according to PVM firmware boot protocol When a PVM firmware image is present for a protected VM, treat the first running vCPU as the "primary" vCPU and reset its registers accordingly, in particular by initialising its PC to enter the firmware at startup. Signed-off-by: Will Deacon Signed-off-by: Will Deacon Bug: 254819795 Change-Id: I26676637145c7d809c5dc5ac0ad0e1fadaf275d2 Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/include/nvhe/pkvm.h | 3 +++ arch/arm64/kvm/hyp/nvhe/pkvm.c | 37 ++++++++++++++++++++++++-- 2 files changed, 38 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h index ea1f340b63a3..eea4c60ffb75 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h +++ b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h @@ -55,6 +55,9 @@ struct pkvm_hyp_vm { struct hyp_pool pool; hyp_spinlock_t lock; + /* Primary vCPU pending entry to the pvmfw */ + struct pkvm_hyp_vcpu *pvmfw_entry_vcpu; + /* * The number of vcpus initialized and ready to run. * Modifying this is protected by 'vm_table_lock'. diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 34d6c2d55d62..95b1393404be 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -390,13 +390,21 @@ static int pkvm_vcpu_init_ptrauth(struct pkvm_hyp_vcpu *hyp_vcpu) return ret; } -static void pkvm_vcpu_init_psci(struct pkvm_hyp_vcpu *hyp_vcpu) +static int pkvm_vcpu_init_psci(struct pkvm_hyp_vcpu *hyp_vcpu) { struct vcpu_reset_state *reset_state = &hyp_vcpu->vcpu.arch.reset_state; + struct pkvm_hyp_vm *hyp_vm = pkvm_hyp_vcpu_to_hyp_vm(hyp_vcpu); if (test_bit(KVM_ARM_VCPU_POWER_OFF, hyp_vcpu->vcpu.arch.features)) { reset_state->reset = false; hyp_vcpu->power_state = PSCI_0_2_AFFINITY_LEVEL_OFF; + } else if (pkvm_hyp_vm_has_pvmfw(hyp_vm)) { + if (hyp_vm->pvmfw_entry_vcpu) + return -EINVAL; + + hyp_vm->pvmfw_entry_vcpu = hyp_vcpu; + reset_state->reset = true; + hyp_vcpu->power_state = PSCI_0_2_AFFINITY_LEVEL_ON_PENDING; } else { struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; @@ -405,6 +413,8 @@ static void pkvm_vcpu_init_psci(struct pkvm_hyp_vcpu *hyp_vcpu) reset_state->reset = true; hyp_vcpu->power_state = PSCI_0_2_AFFINITY_LEVEL_ON_PENDING; } + + return 0; } static void unpin_host_vcpu(struct kvm_vcpu *host_vcpu) @@ -486,6 +496,10 @@ static int init_pkvm_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu, if (ret) goto done; + ret = pkvm_vcpu_init_psci(hyp_vcpu); + if (ret) + goto done; + if (test_bit(KVM_ARM_VCPU_SVE, hyp_vcpu->vcpu.arch.features)) { size_t sve_state_size; void *sve_state; @@ -508,7 +522,6 @@ static int init_pkvm_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu, pkvm_vcpu_init_traps(hyp_vcpu); kvm_reset_pvm_sys_regs(&hyp_vcpu->vcpu); - pkvm_vcpu_init_psci(hyp_vcpu); done: if (ret) unpin_host_vcpu(host_vcpu); @@ -881,6 +894,7 @@ void pkvm_clear_pvmfw_pages(void) void pkvm_reset_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) { struct vcpu_reset_state *reset_state = &hyp_vcpu->vcpu.arch.reset_state; + struct pkvm_hyp_vm *hyp_vm = pkvm_hyp_vcpu_to_hyp_vm(hyp_vcpu); WARN_ON(!reset_state->reset); @@ -890,6 +904,25 @@ void pkvm_reset_vcpu(struct 
pkvm_hyp_vcpu *hyp_vcpu) /* Must be done after reseting sys registers. */ kvm_reset_vcpu_psci(&hyp_vcpu->vcpu, reset_state); + if (hyp_vm->pvmfw_entry_vcpu == hyp_vcpu) { + struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; + u64 entry = hyp_vm->kvm.arch.pkvm.pvmfw_load_addr; + int i; + + /* X0 - X14 provided by the VMM (preserved) */ + for (i = 0; i <= 14; ++i) { + u64 val = vcpu_get_reg(host_vcpu, i); + + vcpu_set_reg(&hyp_vcpu->vcpu, i, val); + } + + /* X15: Boot protocol version */ + vcpu_set_reg(&hyp_vcpu->vcpu, 15, 0); + + /* PC: IPA of pvmfw base */ + *vcpu_pc(&hyp_vcpu->vcpu) = entry; + hyp_vm->pvmfw_entry_vcpu = NULL; + } reset_state->reset = false; From 2fa4eeb947ccf278b705debb1bf682de786149f0 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 9 Dec 2021 13:15:41 +0000 Subject: [PATCH 102/457] ANDROID: KVM: arm64: Introduce KVM_CAP_ARM_PROTECTED_VM to set/query PVM firmware Expose a new capability, KVM_CAP_ARM_PROTECTED_VM, for protected VMs which allows the size of the PVM firmware region to be discovered from userspace and for the firmware load address to be specified if it is required. Signed-off-by: Will Deacon Signed-off-by: Will Deacon Bug: 254819795 Change-Id: I819b9b2cfa227f1a0607a8f683aa01d4ae50704f Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_pkvm.h | 1 + arch/arm64/include/uapi/asm/kvm.h | 9 ++++++ arch/arm64/kvm/arm.c | 14 +++++++-- arch/arm64/kvm/hyp/nvhe/pkvm.c | 8 ++++- arch/arm64/kvm/pkvm.c | 52 +++++++++++++++++++++++++++++++ include/uapi/linux/kvm.h | 1 + 6 files changed, 82 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/kvm_pkvm.h b/arch/arm64/include/asm/kvm_pkvm.h index e44c79558625..bec69219b989 100644 --- a/arch/arm64/include/asm/kvm_pkvm.h +++ b/arch/arm64/include/asm/kvm_pkvm.h @@ -17,6 +17,7 @@ #define HYP_MEMBLOCK_REGIONS 128 #define PVMFW_INVALID_LOAD_ADDR (-1) +int pkvm_vm_ioctl_enable_cap(struct kvm *kvm,struct kvm_enable_cap *cap); int pkvm_init_host_vm(struct kvm *kvm, unsigned long type); int pkvm_create_hyp_vm(struct kvm *kvm); void pkvm_destroy_hyp_vm(struct kvm *kvm); diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h index 316917b98707..288f2da2cb48 100644 --- a/arch/arm64/include/uapi/asm/kvm.h +++ b/arch/arm64/include/uapi/asm/kvm.h @@ -457,6 +457,15 @@ enum { #define KVM_PSCI_RET_INVAL PSCI_RET_INVALID_PARAMS #define KVM_PSCI_RET_DENIED PSCI_RET_DENIED +/* Protected KVM */ +#define KVM_CAP_ARM_PROTECTED_VM_FLAGS_SET_FW_IPA 0 +#define KVM_CAP_ARM_PROTECTED_VM_FLAGS_INFO 1 + +struct kvm_protected_vm_info { + __u64 firmware_size; + __u64 __reserved[7]; +}; + /* arm64-specific kvm_run::system_event flags */ /* * Reset caused by a PSCI v1.1 SYSTEM_RESET2 call. 
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 79fe530159cc..efface47fac2 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -79,9 +79,16 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, { int r; - if (cap->flags) - return -EINVAL; + /* Capabilities with flags */ + switch (cap->cap) { + case KVM_CAP_ARM_PROTECTED_VM: + return pkvm_vm_ioctl_enable_cap(kvm, cap); + default: + if (cap->flags) + return -EINVAL; + } + /* Capabilities without flags */ switch (cap->cap) { case KVM_CAP_ARM_NISV_TO_USER: if (kvm_vm_is_protected(kvm)) { @@ -360,6 +367,9 @@ static int pkvm_check_extension(struct kvm *kvm, long ext, int kvm_cap) FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPA), PVM_ID_AA64ISAR1_ALLOW); break; + case KVM_CAP_ARM_PROTECTED_VM: + r = 1; + break; default: r = 0; break; diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 95b1393404be..be228104e6a3 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -456,11 +456,17 @@ static size_t pkvm_get_last_ran_size(void) static void init_pkvm_hyp_vm(struct kvm *host_kvm, struct pkvm_hyp_vm *hyp_vm, int *last_ran, unsigned int nr_vcpus) { + u64 pvmfw_load_addr = PVMFW_INVALID_LOAD_ADDR; + hyp_vm->host_kvm = host_kvm; hyp_vm->kvm.created_vcpus = nr_vcpus; hyp_vm->kvm.arch.vtcr = host_mmu.arch.vtcr; - hyp_vm->kvm.arch.pkvm.pvmfw_load_addr = PVMFW_INVALID_LOAD_ADDR; hyp_vm->kvm.arch.pkvm.enabled = READ_ONCE(host_kvm->arch.pkvm.enabled); + + if (hyp_vm->kvm.arch.pkvm.enabled) + pvmfw_load_addr = READ_ONCE(host_kvm->arch.pkvm.pvmfw_load_addr); + hyp_vm->kvm.arch.pkvm.pvmfw_load_addr = pvmfw_load_addr; + hyp_vm->kvm.arch.mmu.last_vcpu_ran = last_ran; memset(hyp_vm->kvm.arch.mmu.last_vcpu_ran, -1, pkvm_get_last_ran_size()); } diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c index 7ee1b39daaff..6cad8440e324 100644 --- a/arch/arm64/kvm/pkvm.c +++ b/arch/arm64/kvm/pkvm.c @@ -252,6 +252,7 @@ int pkvm_init_host_vm(struct kvm *host_kvm, unsigned long type) if (!is_protected_kvm_enabled()) return -EINVAL; + host_kvm->arch.pkvm.pvmfw_load_addr = PVMFW_INVALID_LOAD_ADDR; host_kvm->arch.pkvm.enabled = true; return 0; } @@ -313,3 +314,54 @@ static int __init pkvm_firmware_rmem_clear(void) return 0; } device_initcall_sync(pkvm_firmware_rmem_clear); + +static int pkvm_vm_ioctl_set_fw_ipa(struct kvm *kvm, u64 ipa) +{ + int ret = 0; + + if (!pkvm_firmware_mem) + return -EINVAL; + + mutex_lock(&kvm->lock); + if (kvm->arch.pkvm.handle) { + ret = -EBUSY; + goto out_unlock; + } + + kvm->arch.pkvm.pvmfw_load_addr = ipa; +out_unlock: + mutex_unlock(&kvm->lock); + return ret; +} + +static int pkvm_vm_ioctl_info(struct kvm *kvm, + struct kvm_protected_vm_info __user *info) +{ + struct kvm_protected_vm_info kinfo = { + .firmware_size = pkvm_firmware_mem ? + pkvm_firmware_mem->size : + 0, + }; + + return copy_to_user(info, &kinfo, sizeof(kinfo)) ? 
-EFAULT : 0; +} + +int pkvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) +{ + if (!kvm_vm_is_protected(kvm)) + return -EINVAL; + + if (cap->args[1] || cap->args[2] || cap->args[3]) + return -EINVAL; + + switch (cap->flags) { + case KVM_CAP_ARM_PROTECTED_VM_FLAGS_SET_FW_IPA: + return pkvm_vm_ioctl_set_fw_ipa(kvm, cap->args[0]); + case KVM_CAP_ARM_PROTECTED_VM_FLAGS_INFO: + return pkvm_vm_ioctl_info(kvm, (void __force __user *)cap->args[0]); + default: + return -EINVAL; + } + + return 0; +} diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 48916ae8099c..1c9279f7c09f 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1184,6 +1184,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_S390_ZPCI_OP 221 #define KVM_CAP_S390_CPU_TOPOLOGY 222 #define KVM_CAP_DIRTY_LOG_RING_ACQ_REL 223 +#define KVM_CAP_ARM_PROTECTED_VM 0xffbadab1 #ifdef KVM_CAP_IRQ_ROUTING From dd0b2eb0d56a82b953e0da64b38825eb7fbeb547 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 5 Jan 2022 14:10:54 +0000 Subject: [PATCH 103/457] ANDROID: KVM: arm64: relay entropy requests from protected guests directly to secure As pKVM does not trust the host, it should not be involved in the handling of, or be able to observe the response to entropy requests issues by protected guests. When an SMC-based implementation of the ARM SMCCC TRNG interface is present, pass any HVC-based requests directly on to the secure firmware. Co-developed-by: Ard Biesheuvel Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon Bug: 209580772 Change-Id: Ica492ce49fd059a62ecc31bb7ac13c9adb773a08 Signed-off-by: Will Deacon Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_hyp.h | 1 + arch/arm64/kvm/arm.c | 1 + arch/arm64/kvm/hyp/nvhe/pkvm.c | 34 ++++++++++++++++++++++++++++++++ 3 files changed, 36 insertions(+) diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index 1b597b7db99b..d450ed354d69 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -129,5 +129,6 @@ extern u64 kvm_nvhe_sym(id_aa64mmfr2_el1_sys_val); extern unsigned long kvm_nvhe_sym(__icache_flags); extern unsigned int kvm_nvhe_sym(kvm_arm_vmid_bits); +extern bool kvm_nvhe_sym(smccc_trng_available); #endif /* __ARM64_KVM_HYP_H__ */ diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index efface47fac2..51f83e6a13af 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -2006,6 +2006,7 @@ static void kvm_hyp_init_symbols(void) kvm_nvhe_sym(id_aa64mmfr2_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR2_EL1); kvm_nvhe_sym(__icache_flags) = __icache_flags; kvm_nvhe_sym(kvm_arm_vmid_bits) = kvm_arm_vmid_bits; + kvm_nvhe_sym(smccc_trng_available) = smccc_trng_available; } static int kvm_hyp_init_protection(u32 hyp_va_bits) diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index be228104e6a3..a91a5d9205e6 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -1268,6 +1268,35 @@ out_guest_err: return true; } +bool smccc_trng_available; + +static bool pkvm_forward_trng(struct kvm_vcpu *vcpu) +{ + u32 fn = smccc_get_function(vcpu); + struct arm_smccc_res res; + unsigned long arg1 = 0; + + /* + * Forward TRNG calls to EL3, as we can't trust the host to handle + * these for us. 
+ */ + switch (fn) { + case ARM_SMCCC_TRNG_FEATURES: + case ARM_SMCCC_TRNG_RND32: + case ARM_SMCCC_TRNG_RND64: + arg1 = smccc_get_arg1(vcpu); + fallthrough; + case ARM_SMCCC_TRNG_VERSION: + case ARM_SMCCC_TRNG_GET_UUID: + arm_smccc_1_1_smc(fn, arg1, &res); + smccc_set_retval(vcpu, res.a0, res.a1, res.a2, res.a3); + memzero_explicit(&res, sizeof(res)); + break; + } + + return true; +} + /* * Handler for protected VM HVC calls. * @@ -1312,6 +1341,11 @@ bool kvm_handle_pvm_hvc64(struct kvm_vcpu *vcpu, u64 *exit_code) return pkvm_memshare_call(hyp_vcpu, exit_code); case ARM_SMCCC_VENDOR_HYP_KVM_MEM_UNSHARE_FUNC_ID: return pkvm_memunshare_call(hyp_vcpu); + case ARM_SMCCC_TRNG_VERSION ... ARM_SMCCC_TRNG_RND32: + case ARM_SMCCC_TRNG_RND64: + if (smccc_trng_available) + return pkvm_forward_trng(vcpu); + break; default: return pkvm_handle_psci(hyp_vcpu); } From 223a3844b73e6d2e9b74208ab29d502bb8e316a1 Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Thu, 3 Feb 2022 13:57:57 +0000 Subject: [PATCH 104/457] ANDROID: KVM: arm64: Add .hyp.data section The hypervisor has not needed its own .data section because all globals were either .rodata or .bss. Linked lists are initialized with the head pointing to itself. To avoid having to work around this by initializing at runtime, add a .hyp.data section. Bug: 190463801 Signed-off-by: David Brazdil Change-Id: I7a56dc4c93e05bbef53c66837164d17c6103b6b8 Signed-off-by: Quentin Perret --- arch/arm64/include/asm/sections.h | 1 + arch/arm64/kernel/image-vars.h | 2 ++ arch/arm64/kernel/vmlinux.lds.S | 18 +++++++++++++++--- arch/arm64/kvm/arm.c | 7 +++++++ arch/arm64/kvm/hyp/nvhe/hyp.lds.S | 2 ++ arch/arm64/kvm/hyp/nvhe/setup.c | 4 ++++ 6 files changed, 31 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/sections.h b/arch/arm64/include/asm/sections.h index 40971ac1303f..51b0d594239e 100644 --- a/arch/arm64/include/asm/sections.h +++ b/arch/arm64/include/asm/sections.h @@ -11,6 +11,7 @@ extern char __alt_instructions[], __alt_instructions_end[]; extern char __hibernate_exit_text_start[], __hibernate_exit_text_end[]; extern char __hyp_idmap_text_start[], __hyp_idmap_text_end[]; extern char __hyp_text_start[], __hyp_text_end[]; +extern char __hyp_data_start[], __hyp_data_end[]; extern char __hyp_rodata_start[], __hyp_rodata_end[]; extern char __hyp_reloc_begin[], __hyp_reloc_end[]; extern char __hyp_bss_start[], __hyp_bss_end[]; diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index e3f88b5836a2..0b301150ebbf 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -109,6 +109,8 @@ KVM_NVHE_ALIAS(__hyp_text_start); KVM_NVHE_ALIAS(__hyp_text_end); KVM_NVHE_ALIAS(__hyp_bss_start); KVM_NVHE_ALIAS(__hyp_bss_end); +KVM_NVHE_ALIAS(__hyp_data_start); +KVM_NVHE_ALIAS(__hyp_data_end); KVM_NVHE_ALIAS(__hyp_rodata_start); KVM_NVHE_ALIAS(__hyp_rodata_end); diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 45131e354e27..dde6d49c2ef8 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -13,7 +13,7 @@ *(__kvm_ex_table) \ __stop___kvm_ex_table = .; -#define HYPERVISOR_DATA_SECTIONS \ +#define HYPERVISOR_RODATA_SECTIONS \ HYP_SECTION_NAME(.rodata) : { \ . = ALIGN(PAGE_SIZE); \ __hyp_rodata_start = .; \ @@ -23,6 +23,15 @@ __hyp_rodata_end = .; \ } +#define HYPERVISOR_DATA_SECTION \ + HYP_SECTION_NAME(.data) : { \ + . = ALIGN(PAGE_SIZE); \ + __hyp_data_start = .; \ + *(HYP_SECTION_NAME(.data)) \ + . 
= ALIGN(PAGE_SIZE); \ + __hyp_data_end = .; \ + } + #define HYPERVISOR_PERCPU_SECTION \ . = ALIGN(PAGE_SIZE); \ HYP_SECTION_NAME(.data..percpu) : { \ @@ -51,7 +60,8 @@ #define SBSS_ALIGN PAGE_SIZE #else /* CONFIG_KVM */ #define HYPERVISOR_EXTABLE -#define HYPERVISOR_DATA_SECTIONS +#define HYPERVISOR_RODATA_SECTIONS +#define HYPERVISOR_DATA_SECTION #define HYPERVISOR_PERCPU_SECTION #define HYPERVISOR_RELOC_SECTION #define SBSS_ALIGN 0 @@ -188,7 +198,7 @@ SECTIONS /* everything from this point to __init_begin will be marked RO NX */ RO_DATA(PAGE_SIZE) - HYPERVISOR_DATA_SECTIONS + HYPERVISOR_RODATA_SECTIONS /* code sections that are never executed via the kernel mapping */ .rodata.text : { @@ -276,6 +286,8 @@ SECTIONS _sdata = .; RW_DATA(L1_CACHE_BYTES, PAGE_SIZE, THREAD_ALIGN) + HYPERVISOR_DATA_SECTION + /* * Data written with the MMU off but read with the MMU on requires * cache lines to be invalidated, discarding up to a Cache Writeback diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 51f83e6a13af..af647ea938b2 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -2093,6 +2093,13 @@ static int init_hyp_mode(void) goto out_err; } + err = create_hyp_mappings(kvm_ksym_ref(__hyp_data_start), + kvm_ksym_ref(__hyp_data_end), PAGE_HYP); + if (err) { + kvm_err("Cannot map .hyp.data section\n"); + goto out_err; + } + err = create_hyp_mappings(kvm_ksym_ref(__hyp_rodata_start), kvm_ksym_ref(__hyp_rodata_end), PAGE_HYP_RO); if (err) { diff --git a/arch/arm64/kvm/hyp/nvhe/hyp.lds.S b/arch/arm64/kvm/hyp/nvhe/hyp.lds.S index f4562f417d3f..d724f6d69302 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp.lds.S +++ b/arch/arm64/kvm/hyp/nvhe/hyp.lds.S @@ -25,5 +25,7 @@ SECTIONS { BEGIN_HYP_SECTION(.data..percpu) PERCPU_INPUT(L1_CACHE_BYTES) END_HYP_SECTION + HYP_SECTION(.bss) + HYP_SECTION(.data) } diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c index 58be7ec39332..81c9a999ad44 100644 --- a/arch/arm64/kvm/hyp/nvhe/setup.c +++ b/arch/arm64/kvm/hyp/nvhe/setup.c @@ -94,6 +94,10 @@ static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size, if (ret) return ret; + ret = pkvm_create_mappings(__hyp_data_start, __hyp_data_end, PAGE_HYP); + if (ret) + return ret; + ret = pkvm_create_mappings(__hyp_rodata_start, __hyp_rodata_end, PAGE_HYP_RO); if (ret) return ret; From 71fb5851c5e21dcc118f3de87c57f46eb4eec367 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 11 Oct 2022 17:54:00 +0100 Subject: [PATCH 105/457] FROMLIST: KVM: arm64: pkvm: Fixup boot mode to reflect that the kernel resumes from EL1 The kernel has an awfully complicated boot sequence in order to cope with the various EL2 configurations, including those that "enhanced" the architecture. We go from EL2 to EL1, then back to EL2, staying at EL2 if VHE capable and otherwise go back to EL1. Here's a paracetamol tablet for you. The cpu_resume path follows the same logic, because coming up with two versions of a square wheel is hard. However, things aren't this straightforward with pKVM, as the host resume path is always proxied by the hypervisor, which means that the kernel is always entered at EL1. Which contradicts what the __boot_cpu_mode[] array contains (it obviously says EL2). This thus triggers a HVC call from EL1 to EL2 in a vain attempt to upgrade from EL1 to EL2 VHE, which we are, funnily enough, reluctant to grant to the host kernel. This is also completely unexpected, and puzzles your average EL2 hacker. Address it by fixing up the boot mode at the point the host gets deprivileged. 
is_hyp_mode_available() and co already have a static branch to deal with this, making it pretty safe. Cc: # 5.15+ Reported-by: Vincent Donnefort Signed-off-by: Marc Zyngier Tested-by: Vincent Donnefort Bug: 258157858 Link: https://lore.kernel.org/all/20221108100138.3887862-1-vdonnefort@google.com/ Change-Id: I4a2269402ececa0ec47cab88343c3c623b4b2e3d Signed-off-by: Quentin Perret --- arch/arm64/kvm/arm.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index af647ea938b2..1f74e8fd3bd7 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -2236,6 +2236,17 @@ static int pkvm_drop_host_privileges(void) * once the host stage 2 is installed. */ static_branch_enable(&kvm_protected_mode_initialized); + + /* + * Fixup the boot mode so that we don't take spurious round + * trips via EL2 on cpu_resume. Flush to the PoC for a good + * measure, so that it can be observed by a CPU coming out of + * suspend with the MMU off. + */ + __boot_cpu_mode[0] = __boot_cpu_mode[1] = BOOT_CPU_MODE_EL1; + dcache_clean_poc((unsigned long)__boot_cpu_mode, + (unsigned long)(__boot_cpu_mode + 2)); + on_each_cpu(_kvm_host_prot_finalize, &ret, 1); return ret; } From f3f6e133581b95b1d91f1320d43a44baad465037 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Wed, 6 Jul 2022 10:56:24 +0000 Subject: [PATCH 106/457] ANDROID: KVM: arm64: Turn llist of pinned pages into an rb-tree Indexed by IPA, so we can efficiently lookup. Bug: 240239989 Change-Id: I10ad6ad5a7a6aa34a0814ed334b20f4ae42ca830 Signed-off-by: Quentin Perret Signed-off-by: Keir Fraser Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_host.h | 5 +++-- arch/arm64/kvm/mmu.c | 30 ++++++++++++++++++++++++++---- arch/arm64/kvm/pkvm.c | 12 +++++++----- 3 files changed, 36 insertions(+), 11 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index aeede8de9339..26c145aa7fa6 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -173,8 +173,9 @@ struct kvm_smccc_features { }; struct kvm_pinned_page { - struct list_head link; + struct rb_node node; struct page *page; + u64 ipa; }; typedef unsigned int pkvm_handle_t; @@ -182,7 +183,7 @@ typedef unsigned int pkvm_handle_t; struct kvm_protected_vm { pkvm_handle_t handle; struct kvm_hyp_memcache teardown_mc; - struct list_head pinned_pages; + struct rb_root pinned_pages; gpa_t pvmfw_load_addr; bool enabled; }; diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index d7138520e0f1..610d8f146197 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -228,6 +228,7 @@ static void unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 si static void pkvm_stage2_flush(struct kvm *kvm) { struct kvm_pinned_page *ppage; + struct rb_node *node; /* * Contrary to stage2_apply_range(), we don't need to check @@ -235,7 +236,8 @@ static void pkvm_stage2_flush(struct kvm *kvm) * from a vcpu thread, and the list is only ever freed on VM * destroy (which only occurs when all vcpu are gone). 
*/ - list_for_each_entry(ppage, &kvm->arch.pkvm.pinned_pages, link) { + for (node = rb_first(&kvm->arch.pkvm.pinned_pages); node; node = rb_next(node)) { + ppage = rb_entry(node, struct kvm_pinned_page, node); __clean_dcache_guest_page(page_address(ppage->page), PAGE_SIZE); cond_resched_rwlock_write(&kvm->mmu_lock); } @@ -728,7 +730,7 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); kvm->arch.vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift); - INIT_LIST_HEAD(&kvm->arch.pkvm.pinned_pages); + kvm->arch.pkvm.pinned_pages = RB_ROOT; mmu->arch = &kvm->arch; if (is_protected_kvm_enabled()) @@ -1209,6 +1211,26 @@ static int pkvm_host_map_guest(u64 pfn, u64 gfn) return (ret == -EPERM) ? -EAGAIN : ret; } +static int cmp_ppages(struct rb_node *node, const struct rb_node *parent) +{ + struct kvm_pinned_page *a = container_of(node, struct kvm_pinned_page, node); + struct kvm_pinned_page *b = container_of(parent, struct kvm_pinned_page, node); + + if (a->ipa < b->ipa) + return -1; + if (a->ipa > b->ipa) + return 1; + return 0; +} + +static int insert_ppage(struct kvm *kvm, struct kvm_pinned_page *ppage) +{ + if (rb_find_add(&ppage->node, &kvm->arch.pkvm.pinned_pages, cmp_ppages)) + return -EEXIST; + + return 0; +} + static int pkvm_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, unsigned long hva) { @@ -1273,8 +1295,8 @@ static int pkvm_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, } ppage->page = page; - INIT_LIST_HEAD(&ppage->link); - list_add(&ppage->link, &kvm->arch.pkvm.pinned_pages); + ppage->ipa = fault_ipa; + WARN_ON(insert_ppage(kvm, ppage)); write_unlock(&kvm->mmu_lock); return 0; diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c index 6cad8440e324..1ef33dc9b0da 100644 --- a/arch/arm64/kvm/pkvm.c +++ b/arch/arm64/kvm/pkvm.c @@ -217,9 +217,9 @@ int pkvm_create_hyp_vm(struct kvm *host_kvm) void pkvm_destroy_hyp_vm(struct kvm *host_kvm) { - struct kvm_pinned_page *ppage, *tmp; + struct kvm_pinned_page *ppage; struct mm_struct *mm = current->mm; - struct list_head *ppages; + struct rb_node *node; if (host_kvm->arch.pkvm.handle) { WARN_ON(kvm_call_hyp_nvhe(__pkvm_teardown_vm, @@ -229,15 +229,17 @@ void pkvm_destroy_hyp_vm(struct kvm *host_kvm) host_kvm->arch.pkvm.handle = 0; free_hyp_memcache(&host_kvm->arch.pkvm.teardown_mc); - ppages = &host_kvm->arch.pkvm.pinned_pages; - list_for_each_entry_safe(ppage, tmp, ppages, link) { + node = rb_first(&host_kvm->arch.pkvm.pinned_pages); + while (node) { + ppage = rb_entry(node, struct kvm_pinned_page, node); WARN_ON(kvm_call_hyp_nvhe(__pkvm_host_reclaim_page, page_to_pfn(ppage->page))); cond_resched(); account_locked_vm(mm, 1, false); unpin_user_pages_dirty_lock(&ppage->page, 1, true); - list_del(&ppage->link); + node = rb_next(node); + rb_erase(&ppage->node, &host_kvm->arch.pkvm.pinned_pages); kfree(ppage); } } From 4fc49028bbddb8a22a4fcbf55eda72b37a346bbd Mon Sep 17 00:00:00 2001 From: Keir Fraser Date: Tue, 8 Nov 2022 11:04:49 +0000 Subject: [PATCH 107/457] ANDROID: KVM: arm64: Implement MEM_RELINQUISH SMCCC hypercall This allows a VM running on PKVM to notify the hypervisor (and host) that it is returning pages to host ownership. 
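For illustration only (not part of the diff below): from the guest's side, relinquishing a page boils down to a single SMCCC vendor-hyp call using the MEM_RELINQUISH function ID added to arm-smccc.h by this patch. A minimal sketch, assuming the page has already been taken out of normal use by the guest:

	struct arm_smccc_res res;

	/* arg1 is the guest-physical address of the page being returned */
	arm_smccc_1_1_invoke(ARM_SMCCC_VENDOR_HYP_KVM_MEM_RELINQUISH_FUNC_ID,
			     page_to_phys(page), 0, 0, &res);
	WARN_ON(res.a0 != SMCCC_RET_SUCCESS);

The hypervisor unmaps the page from the guest stage-2 and marks it for reclaim, then forwards the call to the host, which reclaims and unpins the page (see pkvm_host_reclaim_page() below).
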
Bug: 240239989 Change-Id: I4644736db04afacd7da4c6f465130c73c2e44b93 Signed-off-by: Keir Fraser Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_pkvm.h | 1 + arch/arm64/kvm/hyp/include/nvhe/mem_protect.h | 2 + arch/arm64/kvm/hyp/include/nvhe/pkvm.h | 1 + arch/arm64/kvm/hyp/nvhe/hyp-main.c | 4 + arch/arm64/kvm/hyp/nvhe/mem_protect.c | 33 ++++++++ arch/arm64/kvm/hyp/nvhe/pkvm.c | 83 +++++++++++++++++-- arch/arm64/kvm/hyp/nvhe/switch.c | 1 + arch/arm64/kvm/hypercalls.c | 19 ++++- arch/arm64/kvm/pkvm.c | 35 ++++++++ include/linux/arm-smccc.h | 7 ++ 10 files changed, 176 insertions(+), 10 deletions(-) diff --git a/arch/arm64/include/asm/kvm_pkvm.h b/arch/arm64/include/asm/kvm_pkvm.h index bec69219b989..86adf6778108 100644 --- a/arch/arm64/include/asm/kvm_pkvm.h +++ b/arch/arm64/include/asm/kvm_pkvm.h @@ -21,6 +21,7 @@ int pkvm_vm_ioctl_enable_cap(struct kvm *kvm,struct kvm_enable_cap *cap); int pkvm_init_host_vm(struct kvm *kvm, unsigned long type); int pkvm_create_hyp_vm(struct kvm *kvm); void pkvm_destroy_hyp_vm(struct kvm *kvm); +void pkvm_host_reclaim_page(struct kvm *host_kvm, phys_addr_t ipa); /* * Definitions for features to be allowed or restricted for guest virtual diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h index 38e5e9b259fc..81a2c9c9450a 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h @@ -72,6 +72,8 @@ int __pkvm_host_share_guest(u64 pfn, u64 gfn, struct pkvm_hyp_vcpu *vcpu); int __pkvm_host_donate_guest(u64 pfn, u64 gfn, struct pkvm_hyp_vcpu *vcpu); int __pkvm_guest_share_host(struct pkvm_hyp_vcpu *hyp_vcpu, u64 ipa); int __pkvm_guest_unshare_host(struct pkvm_hyp_vcpu *hyp_vcpu, u64 ipa); +int __pkvm_guest_relinquish_to_host(struct pkvm_hyp_vcpu *vcpu, + u64 ipa, u64 *ppa); bool addr_is_memory(phys_addr_t phys); int host_stage2_idmap_locked(phys_addr_t addr, u64 size, enum kvm_pgtable_prot prot); diff --git a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h index eea4c60ffb75..c880d6605453 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h +++ b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h @@ -112,6 +112,7 @@ int kvm_check_pvm_sysreg_table(void); void pkvm_reset_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu); bool kvm_handle_pvm_hvc64(struct kvm_vcpu *vcpu, u64 *exit_code); +bool kvm_hyp_handle_hvc64(struct kvm_vcpu *vcpu, u64 *exit_code); struct pkvm_hyp_vcpu *pkvm_mpidr_to_hyp_vcpu(struct pkvm_hyp_vm *vm, u64 mpidr); diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 877dbb21da3a..2c9ac32daf11 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -92,6 +92,8 @@ static void handle_pvm_entry_hvc64(struct pkvm_hyp_vcpu *hyp_vcpu) case ARM_SMCCC_VENDOR_HYP_KVM_MEM_SHARE_FUNC_ID: fallthrough; case ARM_SMCCC_VENDOR_HYP_KVM_MEM_UNSHARE_FUNC_ID: + fallthrough; + case ARM_SMCCC_VENDOR_HYP_KVM_MEM_RELINQUISH_FUNC_ID: vcpu_set_reg(&hyp_vcpu->vcpu, 0, SMCCC_RET_SUCCESS); break; default: @@ -260,6 +262,8 @@ static void handle_pvm_exit_hvc64(struct pkvm_hyp_vcpu *hyp_vcpu) case ARM_SMCCC_VENDOR_HYP_KVM_MEM_SHARE_FUNC_ID: fallthrough; case ARM_SMCCC_VENDOR_HYP_KVM_MEM_UNSHARE_FUNC_ID: + fallthrough; + case ARM_SMCCC_VENDOR_HYP_KVM_MEM_RELINQUISH_FUNC_ID: n = 4; break; diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 5ac5ad879419..2e934988adb2 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ 
b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -265,6 +265,7 @@ static int reclaim_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, { kvm_pte_t pte = *ptep; struct hyp_page *page; + u64 *pa = arg; if (!kvm_pte_valid(pte)) return 0; @@ -276,6 +277,8 @@ static int reclaim_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, fallthrough; case PKVM_PAGE_SHARED_BORROWED: case PKVM_PAGE_SHARED_OWNED: + if (pa) + *pa = kvm_pte_to_phys(pte); page->flags |= HOST_PAGE_PENDING_RECLAIM; break; default: @@ -315,6 +318,36 @@ void reclaim_guest_pages(struct pkvm_hyp_vm *vm, struct kvm_hyp_memcache *mc) } } +int __pkvm_guest_relinquish_to_host(struct pkvm_hyp_vcpu *vcpu, + u64 ipa, u64 *ppa) +{ + struct kvm_pgtable_walker walker = { + .cb = reclaim_walker, + .arg = ppa, + .flags = KVM_PGTABLE_WALK_LEAF + }; + struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); + int ret; + + host_lock_component(); + guest_lock_component(vm); + + /* Set default pa value to "not found". */ + *ppa = 0; + + /* If ipa is mapped: sets page flags, and gets the pa. */ + ret = kvm_pgtable_walk(&vm->pgt, ipa, PAGE_SIZE, &walker); + + /* Zap the guest stage2 pte. */ + if (!ret) + kvm_pgtable_stage2_unmap(&vm->pgt, ipa, PAGE_SIZE); + + guest_unlock_component(vm); + host_unlock_component(); + + return ret; +} + int __pkvm_prot_finalize(void) { struct kvm_s2_mmu *mmu = &host_mmu.arch.mmu; diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index a91a5d9205e6..94ec94d5c58d 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -1268,6 +1268,54 @@ out_guest_err: return true; } +static bool pkvm_meminfo_call(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct kvm_vcpu *vcpu = &hyp_vcpu->vcpu; + u64 arg1 = smccc_get_arg1(vcpu); + u64 arg2 = smccc_get_arg2(vcpu); + u64 arg3 = smccc_get_arg3(vcpu); + + if (arg1 || arg2 || arg3) + goto out_guest_err; + + smccc_set_retval(vcpu, PAGE_SIZE, 0, 0, 0); + return true; + +out_guest_err: + smccc_set_retval(vcpu, SMCCC_RET_INVALID_PARAMETER, 0, 0, 0); + return true; +} + +static bool pkvm_memrelinquish_call(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct kvm_vcpu *vcpu = &hyp_vcpu->vcpu; + u64 ipa = smccc_get_arg1(vcpu); + u64 arg2 = smccc_get_arg2(vcpu); + u64 arg3 = smccc_get_arg3(vcpu); + u64 pa = 0; + int ret; + + if (arg2 || arg3) + goto out_guest_err; + + ret = __pkvm_guest_relinquish_to_host(hyp_vcpu, ipa, &pa); + if (ret) + goto out_guest_err; + + if (pa != 0) { + /* Now pass to host. */ + return false; + } + + /* This was a NOP as no page was actually mapped at the IPA. 
*/ + smccc_set_retval(vcpu, 0, 0, 0, 0); + return true; + +out_guest_err: + smccc_set_retval(vcpu, SMCCC_RET_INVALID_PARAMETER, 0, 0, 0); + return true; +} + bool smccc_trng_available; static bool pkvm_forward_trng(struct kvm_vcpu *vcpu) @@ -1327,20 +1375,16 @@ bool kvm_handle_pvm_hvc64(struct kvm_vcpu *vcpu, u64 *exit_code) val[0] |= BIT(ARM_SMCCC_KVM_FUNC_HYP_MEMINFO); val[0] |= BIT(ARM_SMCCC_KVM_FUNC_MEM_SHARE); val[0] |= BIT(ARM_SMCCC_KVM_FUNC_MEM_UNSHARE); + val[0] |= BIT(ARM_SMCCC_KVM_FUNC_MEM_RELINQUISH); break; case ARM_SMCCC_VENDOR_HYP_KVM_HYP_MEMINFO_FUNC_ID: - if (smccc_get_arg1(vcpu) || - smccc_get_arg2(vcpu) || - smccc_get_arg3(vcpu)) { - val[0] = SMCCC_RET_INVALID_PARAMETER; - } else { - val[0] = PAGE_SIZE; - } - break; + return pkvm_meminfo_call(hyp_vcpu); case ARM_SMCCC_VENDOR_HYP_KVM_MEM_SHARE_FUNC_ID: return pkvm_memshare_call(hyp_vcpu, exit_code); case ARM_SMCCC_VENDOR_HYP_KVM_MEM_UNSHARE_FUNC_ID: return pkvm_memunshare_call(hyp_vcpu); + case ARM_SMCCC_VENDOR_HYP_KVM_MEM_RELINQUISH_FUNC_ID: + return pkvm_memrelinquish_call(hyp_vcpu); case ARM_SMCCC_TRNG_VERSION ... ARM_SMCCC_TRNG_RND32: case ARM_SMCCC_TRNG_RND64: if (smccc_trng_available) @@ -1353,3 +1397,26 @@ bool kvm_handle_pvm_hvc64(struct kvm_vcpu *vcpu, u64 *exit_code) smccc_set_retval(vcpu, val[0], val[1], val[2], val[3]); return true; } + +/* + * Handler for non-protected VM HVC calls. + * + * Returns true if the hypervisor has handled the exit, and control should go + * back to the guest, or false if it hasn't. + */ +bool kvm_hyp_handle_hvc64(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + u32 fn = smccc_get_function(vcpu); + struct pkvm_hyp_vcpu *hyp_vcpu; + + hyp_vcpu = container_of(vcpu, struct pkvm_hyp_vcpu, vcpu); + + switch (fn) { + case ARM_SMCCC_VENDOR_HYP_KVM_HYP_MEMINFO_FUNC_ID: + return pkvm_meminfo_call(hyp_vcpu); + case ARM_SMCCC_VENDOR_HYP_KVM_MEM_RELINQUISH_FUNC_ID: + return pkvm_memrelinquish_call(hyp_vcpu); + } + + return false; +} diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index 4b904252e1db..310aaf493909 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -181,6 +181,7 @@ static bool kvm_handle_pvm_sys64(struct kvm_vcpu *vcpu, u64 *exit_code) static const exit_handler_fn hyp_exit_handlers[] = { [0 ... 
ESR_ELx_EC_MAX] = NULL, [ESR_ELx_EC_CP15_32] = kvm_hyp_handle_cp15_32, + [ESR_ELx_EC_HVC64] = kvm_hyp_handle_hvc64, [ESR_ELx_EC_SYS64] = kvm_hyp_handle_sysreg, [ESR_ELx_EC_SVE] = kvm_hyp_handle_fpsimd, [ESR_ELx_EC_FP_ASIMD] = kvm_hyp_handle_fpsimd, diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c index c9f401fa01a9..2a92b9bd2a46 100644 --- a/arch/arm64/kvm/hypercalls.c +++ b/arch/arm64/kvm/hypercalls.c @@ -5,6 +5,7 @@ #include #include +#include #include #include @@ -13,8 +14,15 @@ GENMASK(KVM_REG_ARM_STD_BMAP_BIT_COUNT - 1, 0) #define KVM_ARM_SMCCC_STD_HYP_FEATURES \ GENMASK(KVM_REG_ARM_STD_HYP_BMAP_BIT_COUNT - 1, 0) -#define KVM_ARM_SMCCC_VENDOR_HYP_FEATURES \ - GENMASK(KVM_REG_ARM_VENDOR_HYP_BMAP_BIT_COUNT - 1, 0) +#define KVM_ARM_SMCCC_VENDOR_HYP_FEATURES ({ \ + unsigned long f; \ + f = GENMASK(KVM_REG_ARM_VENDOR_HYP_BMAP_BIT_COUNT - 1, 0); \ + if (is_protected_kvm_enabled()) { \ + f |= BIT(ARM_SMCCC_KVM_FUNC_HYP_MEMINFO); \ + f |= BIT(ARM_SMCCC_KVM_FUNC_MEM_RELINQUISH); \ + } \ + f; \ +}) static void kvm_ptp_get_time(struct kvm_vcpu *vcpu, u64 *val) { @@ -116,6 +124,9 @@ static bool kvm_hvc_call_allowed(struct kvm_vcpu *vcpu, u32 func_id) case ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID: return test_bit(KVM_REG_ARM_VENDOR_HYP_BIT_PTP, &smccc_feat->vendor_hyp_bmap); + case ARM_SMCCC_VENDOR_HYP_KVM_MEM_RELINQUISH_FUNC_ID: + return test_bit(ARM_SMCCC_KVM_FUNC_MEM_RELINQUISH, + &smccc_feat->vendor_hyp_bmap); default: return kvm_hvc_call_default_allowed(func_id); } @@ -213,6 +224,10 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu) case ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID: kvm_ptp_get_time(vcpu, val); break; + case ARM_SMCCC_VENDOR_HYP_KVM_MEM_RELINQUISH_FUNC_ID: + pkvm_host_reclaim_page(vcpu->kvm, smccc_get_arg1(vcpu)); + val[0] = SMCCC_RET_SUCCESS; + break; case ARM_SMCCC_TRNG_VERSION: case ARM_SMCCC_TRNG_FEATURES: case ARM_SMCCC_TRNG_GET_UUID: diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c index 1ef33dc9b0da..ea2ea5b61424 100644 --- a/arch/arm64/kvm/pkvm.c +++ b/arch/arm64/kvm/pkvm.c @@ -259,6 +259,41 @@ int pkvm_init_host_vm(struct kvm *host_kvm, unsigned long type) return 0; } +static int rb_ppage_cmp(const void *key, const struct rb_node *node) +{ + struct kvm_pinned_page *p = container_of(node, struct kvm_pinned_page, node); + phys_addr_t ipa = (phys_addr_t)key; + + return (ipa < p->ipa) ? 
-1 : (ipa > p->ipa); +} + +void pkvm_host_reclaim_page(struct kvm *host_kvm, phys_addr_t ipa) +{ + struct kvm_pinned_page *ppage; + struct mm_struct *mm = current->mm; + struct rb_node *node; + + write_lock(&host_kvm->mmu_lock); + node = rb_find((void *)ipa, &host_kvm->arch.pkvm.pinned_pages, + rb_ppage_cmp); + if (node) + rb_erase(node, &host_kvm->arch.pkvm.pinned_pages); + write_unlock(&host_kvm->mmu_lock); + + WARN_ON(!node); + if (!node) + return; + + ppage = container_of(node, struct kvm_pinned_page, node); + + WARN_ON(kvm_call_hyp_nvhe(__pkvm_host_reclaim_page, + page_to_pfn(ppage->page))); + + account_locked_vm(mm, 1, false); + unpin_user_pages_dirty_lock(&ppage->page, 1, true); + kfree(ppage); +} + static int __init pkvm_firmware_rmem_err(struct reserved_mem *rmem, const char *reason) { diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h index 25c576a910df..8224bed759ca 100644 --- a/include/linux/arm-smccc.h +++ b/include/linux/arm-smccc.h @@ -115,6 +115,7 @@ #define ARM_SMCCC_KVM_FUNC_HYP_MEMINFO 2 #define ARM_SMCCC_KVM_FUNC_MEM_SHARE 3 #define ARM_SMCCC_KVM_FUNC_MEM_UNSHARE 4 +#define ARM_SMCCC_KVM_FUNC_MEM_RELINQUISH 9 #define ARM_SMCCC_KVM_FUNC_FEATURES_2 127 #define ARM_SMCCC_KVM_NUM_FUNCS 128 @@ -155,6 +156,12 @@ ARM_SMCCC_OWNER_VENDOR_HYP, \ ARM_SMCCC_KVM_FUNC_MEM_UNSHARE) +#define ARM_SMCCC_VENDOR_HYP_KVM_MEM_RELINQUISH_FUNC_ID \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_64, \ + ARM_SMCCC_OWNER_VENDOR_HYP, \ + ARM_SMCCC_KVM_FUNC_MEM_RELINQUISH) + /* ptp_kvm counter type ID */ #define KVM_PTP_VIRT_COUNTER 0 #define KVM_PTP_PHYS_COUNTER 1 From 102dcecb67dc08ab3071a9ba4945297aaf6d389a Mon Sep 17 00:00:00 2001 From: Keir Fraser Date: Thu, 26 May 2022 14:22:34 +0000 Subject: [PATCH 108/457] ANDROID: Define mem_relinquish interface for releasing memory to a hypervisor. On PKVM/ARM64 this uses the ARM SMCCC relinquish hypercall when available. 
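Usage sketch (illustrative, not part of this patch): a caller handing a page back to the host simply does

	#include <linux/mem_relinquish.h>

	page_relinquish(page);

When the hypercall is not advertised by the hypervisor, or the architecture does not select ARCH_HAS_MEM_RELINQUISH, the call is a no-op, so callers do not need to guard it.
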
Bug: 240239989 Change-Id: Ifa85b641a48f348a2364cf8c6b06b6417f1eeedb Signed-off-by: Keir Fraser Signed-off-by: Quentin Perret --- arch/Kconfig | 3 ++ arch/arm64/Kconfig | 1 + arch/arm64/include/asm/hypervisor.h | 6 +++ arch/arm64/include/asm/mem_relinquish.h | 12 +++++ arch/arm64/kernel/setup.c | 1 + arch/arm64/mm/Makefile | 1 + arch/arm64/mm/mem_relinquish.c | 58 +++++++++++++++++++++++++ include/linux/mem_relinquish.h | 20 +++++++++ 8 files changed, 102 insertions(+) create mode 100644 arch/arm64/include/asm/mem_relinquish.h create mode 100644 arch/arm64/mm/mem_relinquish.c create mode 100644 include/linux/mem_relinquish.h diff --git a/arch/Kconfig b/arch/Kconfig index 8f138e580d1a..b5fb130a07e2 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -1316,6 +1316,9 @@ config RELR config ARCH_HAS_MEM_ENCRYPT bool +config ARCH_HAS_MEM_RELINQUISH + bool + config ARCH_HAS_CC_PLATFORM bool diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 9bf4dc17c574..1489ef400c6a 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -33,6 +33,7 @@ config ARM64 select ARCH_HAS_KEEPINITRD select ARCH_HAS_MEMBARRIER_SYNC_CORE select ARCH_HAS_MEM_ENCRYPT + select ARCH_HAS_MEM_RELINQUISH select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE select ARCH_HAS_PTE_DEVMAP select ARCH_HAS_PTE_SPECIAL diff --git a/arch/arm64/include/asm/hypervisor.h b/arch/arm64/include/asm/hypervisor.h index ee45ae14be27..aa577913d019 100644 --- a/arch/arm64/include/asm/hypervisor.h +++ b/arch/arm64/include/asm/hypervisor.h @@ -10,4 +10,10 @@ void kvm_arm_init_hyp_services(void); void kvm_init_memshare_services(void); void kvm_init_ioremap_services(void); +#ifdef CONFIG_MEMORY_BALLOON +void kvm_init_memrelinquish_services(void); +#else +static inline void kvm_init_memrelinquish_services(void) {} +#endif + #endif diff --git a/arch/arm64/include/asm/mem_relinquish.h b/arch/arm64/include/asm/mem_relinquish.h new file mode 100644 index 000000000000..a4ace9e6e413 --- /dev/null +++ b/arch/arm64/include/asm/mem_relinquish.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2022 Google LLC + * Author: Keir Fraser + */ + +#ifndef __ASM_MEM_RELINQUISH_H +#define __ASM_MEM_RELINQUISH_H + +void page_relinquish(struct page *page); + +#endif /* __ASM_MEM_RELINQUISH_H */ diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 575eafcfcb66..cfbf8947aa27 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -445,4 +445,5 @@ void kvm_arm_init_hyp_services(void) { kvm_init_ioremap_services(); kvm_init_memshare_services(); + kvm_init_memrelinquish_services(); } diff --git a/arch/arm64/mm/Makefile b/arch/arm64/mm/Makefile index f48be6c63938..542a4d0120d0 100644 --- a/arch/arm64/mm/Makefile +++ b/arch/arm64/mm/Makefile @@ -3,6 +3,7 @@ obj-y := dma-mapping.o extable.o fault.o init.o \ cache.o copypage.o flush.o \ ioremap.o mem_encrypt.o mmap.o pgd.o mmu.o \ context.o proc.o pageattr.o +obj-$(CONFIG_MEMORY_BALLOON) += mem_relinquish.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o obj-$(CONFIG_PTDUMP_CORE) += ptdump.o obj-$(CONFIG_PTDUMP_DEBUGFS) += ptdump_debugfs.o diff --git a/arch/arm64/mm/mem_relinquish.c b/arch/arm64/mm/mem_relinquish.c new file mode 100644 index 000000000000..c95bcbb14d92 --- /dev/null +++ b/arch/arm64/mm/mem_relinquish.c @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2022 Google LLC + * Author: Keir Fraser + */ + +#include +#include +#include +#include + +#include + +static unsigned long memshare_granule_sz; + +void 
kvm_init_memrelinquish_services(void) +{ + int i; + struct arm_smccc_res res; + const u32 funcs[] = { + ARM_SMCCC_KVM_FUNC_HYP_MEMINFO, + ARM_SMCCC_KVM_FUNC_MEM_RELINQUISH, + }; + + for (i = 0; i < ARRAY_SIZE(funcs); ++i) { + if (!kvm_arm_hyp_service_available(funcs[i])) + return; + } + + arm_smccc_1_1_invoke(ARM_SMCCC_VENDOR_HYP_KVM_HYP_MEMINFO_FUNC_ID, + 0, 0, 0, &res); + if (res.a0 > PAGE_SIZE) /* Includes error codes */ + return; + + memshare_granule_sz = res.a0; +} + +void page_relinquish(struct page *page) +{ + phys_addr_t phys, end; + u32 func_id = ARM_SMCCC_VENDOR_HYP_KVM_MEM_RELINQUISH_FUNC_ID; + + if (!memshare_granule_sz) + return; + + phys = page_to_phys(page); + end = phys + PAGE_SIZE; + + while (phys < end) { + struct arm_smccc_res res; + + arm_smccc_1_1_invoke(func_id, phys, 0, 0, &res); + BUG_ON(res.a0 != SMCCC_RET_SUCCESS); + + phys += memshare_granule_sz; + } +} +EXPORT_SYMBOL_GPL(page_relinquish); diff --git a/include/linux/mem_relinquish.h b/include/linux/mem_relinquish.h new file mode 100644 index 000000000000..6b7bf861d92d --- /dev/null +++ b/include/linux/mem_relinquish.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2022 Google LLC + * Author: Keir Fraser + */ + +#ifndef __MEM_RELINQUISH_H__ +#define __MEM_RELINQUISH_H__ + +#ifdef CONFIG_ARCH_HAS_MEM_RELINQUISH + +#include + +#else /* !CONFIG_ARCH_HAS_MEM_RELINQUISH */ + +static inline void page_relinquish(struct page *page) { } + +#endif /* CONFIG_ARCH_HAS_MEM_RELINQUISH */ + +#endif /* __MEM_RELINQUISH_H__ */ From 23cfd30bb3969f51ab77b9047e77ca01e0e0b350 Mon Sep 17 00:00:00 2001 From: Keir Fraser Date: Wed, 9 Nov 2022 13:53:28 +0000 Subject: [PATCH 109/457] ANDROID: KVM: arm64: memory balloon: Notify hyp when ballooning When running as a protected VM, the hypervisor isolates the VM's memory pages from the host. Returning ownership of a VM page therefore requires hypervisor involvement, and acknowledgement from the protected VM that it is voluntarily cooperating. To this end, notify pages via the new relinquish hypercall when they are entered into the memory balloon. Bug: 240239989 Change-Id: Ic89b45312a7478ddff081a934d99e693eded92dc Signed-off-by: Keir Fraser Signed-off-by: Quentin Perret --- include/linux/balloon_compaction.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h index 5ca2d5699620..318cfe8b57a6 100644 --- a/include/linux/balloon_compaction.h +++ b/include/linux/balloon_compaction.h @@ -43,6 +43,7 @@ #include #include #include +#include /* * Balloon device information descriptor. @@ -95,6 +96,7 @@ static inline void balloon_page_insert(struct balloon_dev_info *balloon, __SetPageMovable(page, &balloon_mops); set_page_private(page, (unsigned long)balloon); list_add(&page->lru, &balloon->pages); + page_relinquish(page); } /* @@ -139,6 +141,7 @@ static inline void balloon_page_insert(struct balloon_dev_info *balloon, { __SetPageOffline(page); list_add(&page->lru, &balloon->pages); + page_relinquish(page); } static inline void balloon_page_delete(struct page *page) From e95813e9da6b43edc63a7dd0e2f704428c992b4d Mon Sep 17 00:00:00 2001 From: Keir Fraser Date: Thu, 18 Aug 2022 10:41:35 +0000 Subject: [PATCH 110/457] ANDROID: KVM: arm64: balloon: Notify hyp before reporting free pages to host When running as a protected VM, the hypervisor isolates the VM's memory pages from the host. 
Returning ownership of a VM page therefore requires hypervisor involvement, and acknowledgement from the protected VM that it is voluntarily cooperating. To this end, notify pages via the new relinquish hypercall when they are being reported to the host as free and available for temporary reclaim. Bug: 240239989 Change-Id: I8718e468be63c3aacb2f79ff141fbcedd6d19b56 Signed-off-by: Keir Fraser Signed-off-by: Quentin Perret --- mm/page_reporting.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/mm/page_reporting.c b/mm/page_reporting.c index 382958eef8a9..5c4b1fb73187 100644 --- a/mm/page_reporting.c +++ b/mm/page_reporting.c @@ -7,6 +7,7 @@ #include #include #include +#include #include "page_reporting.h" #include "internal.h" @@ -120,7 +121,7 @@ page_reporting_cycle(struct page_reporting_dev_info *prdev, struct zone *zone, unsigned int page_len = PAGE_SIZE << order; struct page *page, *next; long budget; - int err = 0; + int i, err = 0; /* * Perform early check, if free area is empty there is @@ -175,6 +176,10 @@ page_reporting_cycle(struct page_reporting_dev_info *prdev, struct zone *zone, --(*offset); sg_set_page(&sgl[*offset], page, page_len, 0); + /* Notify hyp that these pages are reclaimable. */ + for (i = 0; i < (1< Date: Thu, 18 Aug 2022 11:20:25 +0000 Subject: [PATCH 111/457] ANDROID: virtio_balloon: Do not translate reported pages through DMA API The free-page reporting and hinting queues do not pass arrays of page addresses (like the basic inflate queue) but instead pass the free page ranges as buffers. This does not work well with DMA API: The host wants to know the GPA, not an IOVA. For these two virtqueues, disable DMA API and pass through buffers untranslated. Bug: 240239989 Change-Id: I2d13a8b7e8f6775819de7fe96f4579afa08b1300 Signed-off-by: Keir Fraser [ qperret@: Fixed minor context conflict in virtio.h ] Signed-off-by: Quentin Perret --- drivers/virtio/virtio_balloon.c | 8 ++++++-- drivers/virtio/virtio_ring.c | 10 ++++++++++ include/linux/virtio.h | 2 ++ 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index 3f78a3a1eb75..4c85349024c1 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -553,11 +553,15 @@ static int init_vqs(struct virtio_balloon *vb) virtqueue_kick(vb->stats_vq); } - if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) + if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) { vb->free_page_vq = vqs[VIRTIO_BALLOON_VQ_FREE_PAGE]; + virtqueue_disable_dma_api_for_buffers(vb->free_page_vq); + } - if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING)) + if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING)) { vb->reporting_vq = vqs[VIRTIO_BALLOON_VQ_REPORTING]; + virtqueue_disable_dma_api_for_buffers(vb->reporting_vq); + } return 0; } diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 2e7689bb933b..3eca7140bd80 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -2863,4 +2863,14 @@ const struct vring *virtqueue_get_vring(struct virtqueue *vq) } EXPORT_SYMBOL_GPL(virtqueue_get_vring); +/* + * Prevents use of DMA API for buffers passed via the specified virtqueue. + * DMA API may still be used for the vrings themselves. 
+ */ +void virtqueue_disable_dma_api_for_buffers(struct virtqueue *vq) +{ + to_vvq(vq)->use_dma_api = false; +} +EXPORT_SYMBOL_GPL(virtqueue_disable_dma_api_for_buffers); + MODULE_LICENSE("GPL"); diff --git a/include/linux/virtio.h b/include/linux/virtio.h index dcab9c7e8784..34e936343ae4 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -96,6 +96,8 @@ dma_addr_t virtqueue_get_used_addr(struct virtqueue *vq); int virtqueue_resize(struct virtqueue *vq, u32 num, void (*recycle)(struct virtqueue *vq, void *buf)); +void virtqueue_disable_dma_api_for_buffers(struct virtqueue *vq); + /** * struct virtio_device - representation of a device using virtio * @index: unique position on the virtio bus From f4b25ab1066d51c04bbd6a107f1baa5f16141e9a Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Tue, 8 Nov 2022 09:56:37 +0000 Subject: [PATCH 112/457] ANDROID: KVM: arm64: Flush the vcpu iflags for non-protected VMs The iflags are meant as input flags to the hypervisor, and will be used in future patches by calls to functions that sync debug and pmu state. Ensure that the hyp_vcpu copy is up-to-date with the host's on entry. Bug: 228011917 Signed-off-by: Fuad Tabba Change-Id: Id04d65ee084c3745ddc283ff5e30348511a4a1d2 Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/nvhe/hyp-main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 2c9ac32daf11..598590365a38 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -509,6 +509,7 @@ static void flush_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) if (vcpu_get_flag(host_vcpu, PKVM_HOST_STATE_DIRTY)) __flush_hyp_vcpu(hyp_vcpu); + hyp_vcpu->vcpu.arch.iflags = READ_ONCE(host_vcpu->arch.iflags); hyp_vcpu->vcpu.arch.hcr_el2 = HCR_GUEST_FLAGS & ~(HCR_RW | HCR_TWI | HCR_TWE); hyp_vcpu->vcpu.arch.hcr_el2 |= READ_ONCE(host_vcpu->arch.hcr_el2); From b34bc46188b2535bb0e9085b4d004f0015b85358 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Wed, 25 May 2022 14:52:00 +0000 Subject: [PATCH 113/457] ANDROID: KVM: arm64: Factor out code for saving/restoring guest debug regs This code will be reused when supporting debug for non-protected VMs in protected mode. 
No functional change intended Bug: 228011917 Signed-off-by: Fuad Tabba Change-Id: If05dc8fdb3fff8e811f06cf5050d3eaf0ce67116 Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_host.h | 17 +++++++++++++++++ arch/arm64/kvm/debug.c | 12 ++++-------- 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 26c145aa7fa6..bac1038ace93 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -991,9 +991,26 @@ void kvm_arm_setup_debug(struct kvm_vcpu *vcpu); void kvm_arm_clear_debug(struct kvm_vcpu *vcpu); void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu); +#define __vcpu_save_guest_debug_regs(vcpu) \ + do { \ + u64 val = vcpu_read_sys_reg(vcpu, MDSCR_EL1); \ + \ + (vcpu)->arch.guest_debug_preserved.mdscr_el1 = val; \ + } while(0) + +#define __vcpu_restore_guest_debug_regs(vcpu) \ + do { \ + u64 val = (vcpu)->arch.guest_debug_preserved.mdscr_el1; \ + \ + vcpu_write_sys_reg(vcpu, val, MDSCR_EL1); \ + } while (0) + #define kvm_vcpu_os_lock_enabled(vcpu) \ (!!(__vcpu_sys_reg(vcpu, OSLSR_EL1) & SYS_OSLSR_OSLK)) +#define kvm_vcpu_needs_debug_regs(vcpu) \ + ((vcpu)->guest_debug || kvm_vcpu_os_lock_enabled(vcpu)) + int kvm_arm_vcpu_arch_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr); int kvm_arm_vcpu_arch_get_attr(struct kvm_vcpu *vcpu, diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c index fccf9ec01813..81218d707b01 100644 --- a/arch/arm64/kvm/debug.c +++ b/arch/arm64/kvm/debug.c @@ -39,9 +39,7 @@ static DEFINE_PER_CPU(u64, mdcr_el2); */ static void save_guest_debug_regs(struct kvm_vcpu *vcpu) { - u64 val = vcpu_read_sys_reg(vcpu, MDSCR_EL1); - - vcpu->arch.guest_debug_preserved.mdscr_el1 = val; + __vcpu_save_guest_debug_regs(vcpu); trace_kvm_arm_set_dreg32("Saved MDSCR_EL1", vcpu->arch.guest_debug_preserved.mdscr_el1); @@ -52,9 +50,7 @@ static void save_guest_debug_regs(struct kvm_vcpu *vcpu) static void restore_guest_debug_regs(struct kvm_vcpu *vcpu) { - u64 val = vcpu->arch.guest_debug_preserved.mdscr_el1; - - vcpu_write_sys_reg(vcpu, val, MDSCR_EL1); + __vcpu_restore_guest_debug_regs(vcpu); trace_kvm_arm_set_dreg32("Restored MDSCR_EL1", vcpu_read_sys_reg(vcpu, MDSCR_EL1)); @@ -175,7 +171,7 @@ void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) kvm_arm_setup_mdcr_el2(vcpu); /* Check if we need to use the debug registers. */ - if (vcpu->guest_debug || kvm_vcpu_os_lock_enabled(vcpu)) { + if (kvm_vcpu_needs_debug_regs(vcpu)) { /* Save guest debug state */ save_guest_debug_regs(vcpu); @@ -284,7 +280,7 @@ void kvm_arm_clear_debug(struct kvm_vcpu *vcpu) /* * Restore the guest's debug registers if we were using them. */ - if (vcpu->guest_debug || kvm_vcpu_os_lock_enabled(vcpu)) { + if (kvm_vcpu_needs_debug_regs(vcpu)) { if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) { if (!(*vcpu_cpsr(vcpu) & DBG_SPSR_SS)) /* From 0a1f3a1f7b914e33b9ffbdac698f56260be5beea Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Tue, 8 Nov 2022 10:23:24 +0000 Subject: [PATCH 114/457] ANDROID: KVM: arm64: Monitor Debug support for non-protected guests Add monitor debug support for non-protected guests in protected mode. Save and restore the monitor debug state when running a non-protected guest, and propagate the monitor debug configuration of non-protected vcpus from the host. This patch assumes that the hyp vcpu debug iflags are kept in sync with the host. 
Bug: 228011917 Signed-off-by: Fuad Tabba Change-Id: Ie525693a6a6f236e388b16a1af297403e729057f Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/nvhe/hyp-main.c | 59 ++++++++++++++++++++++++++++-- arch/arm64/kvm/hyp/nvhe/pkvm.c | 1 + 2 files changed, 57 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 598590365a38..82b139255136 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -491,6 +491,57 @@ static void __flush_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) __copy_vcpu_state(hyp_vcpu->host_vcpu, &hyp_vcpu->vcpu); } +static void flush_debug_state(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct kvm_vcpu *vcpu = &hyp_vcpu->vcpu; + struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; + u64 mdcr_el2 = READ_ONCE(host_vcpu->arch.mdcr_el2); + + /* + * Propagate the monitor debug configuration of the vcpu from host. + * Preserve HPMN, which is set-up by some knowledgeable bootcode. + * Ensure that MDCR_EL2_E2PB_MASK and MDCR_EL2_E2TB_MASK are clear, + * as guests should not be able to access profiling and trace buffers. + * Ensure that RES0 bits are clear. + */ + mdcr_el2 &= ~(MDCR_EL2_RES0 | + MDCR_EL2_HPMN_MASK | + (MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT) | + (MDCR_EL2_E2TB_MASK << MDCR_EL2_E2TB_SHIFT)); + vcpu->arch.mdcr_el2 = read_sysreg(mdcr_el2) & MDCR_EL2_HPMN_MASK; + vcpu->arch.mdcr_el2 |= mdcr_el2; + + vcpu->arch.pmu = host_vcpu->arch.pmu; + vcpu->guest_debug = READ_ONCE(host_vcpu->guest_debug); + + if (!kvm_vcpu_needs_debug_regs(vcpu)) + return; + + __vcpu_save_guest_debug_regs(vcpu); + + /* Switch debug_ptr to the external_debug_state if done by the host. */ + if (kern_hyp_va(READ_ONCE(host_vcpu->arch.debug_ptr)) == + &host_vcpu->arch.external_debug_state) + vcpu->arch.debug_ptr = &host_vcpu->arch.external_debug_state; + + /* Propagate any special handling for single step from host. */ + vcpu_write_sys_reg(vcpu, vcpu_read_sys_reg(host_vcpu, MDSCR_EL1), + MDSCR_EL1); + *vcpu_cpsr(vcpu) = *vcpu_cpsr(host_vcpu); +} + +static void sync_debug_state(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct kvm_vcpu *vcpu = &hyp_vcpu->vcpu; + struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; + + if (!kvm_vcpu_needs_debug_regs(vcpu)) + return; + + __vcpu_restore_guest_debug_regs(vcpu); + vcpu->arch.debug_ptr = &host_vcpu->arch.vcpu_debug_state; +} + static void flush_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) { struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; @@ -510,11 +561,10 @@ static void flush_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) __flush_hyp_vcpu(hyp_vcpu); hyp_vcpu->vcpu.arch.iflags = READ_ONCE(host_vcpu->arch.iflags); + flush_debug_state(hyp_vcpu); + hyp_vcpu->vcpu.arch.hcr_el2 = HCR_GUEST_FLAGS & ~(HCR_RW | HCR_TWI | HCR_TWE); hyp_vcpu->vcpu.arch.hcr_el2 |= READ_ONCE(host_vcpu->arch.hcr_el2); - - hyp_vcpu->vcpu.arch.mdcr_el2 = host_vcpu->arch.mdcr_el2; - hyp_vcpu->vcpu.arch.debug_ptr = kern_hyp_va(host_vcpu->arch.debug_ptr); } hyp_vcpu->vcpu.arch.vsesr_el2 = host_vcpu->arch.vsesr_el2; @@ -551,6 +601,9 @@ static void sync_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu, u32 exit_reason) hyp_entry_exit_handler_fn ec_handler; u8 esr_ec; + if (!pkvm_hyp_vcpu_is_protected(hyp_vcpu)) + sync_debug_state(hyp_vcpu); + /* * Don't sync the vcpu GPR/sysreg state after a run. Instead, * leave it in the hyp vCPU until someone actually requires it. 
diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 94ec94d5c58d..d70ff178bb8e 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -495,6 +495,7 @@ static int init_pkvm_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu, hyp_vcpu->vcpu.arch.hw_mmu = &hyp_vm->kvm.arch.mmu; hyp_vcpu->vcpu.arch.cflags = READ_ONCE(host_vcpu->arch.cflags); hyp_vcpu->vcpu.arch.mp_state.mp_state = KVM_MP_STATE_STOPPED; + hyp_vcpu->vcpu.arch.debug_ptr = &host_vcpu->arch.vcpu_debug_state; pkvm_vcpu_init_features_from_host(hyp_vcpu); From 1f6276e0c58ad50bf9e0d1515fcae279a25330e1 Mon Sep 17 00:00:00 2001 From: Keir Fraser Date: Sun, 13 Nov 2022 10:34:43 +0000 Subject: [PATCH 115/457] ANDROID: memory relinquish: Fix build dependencies Memory relinquish interface is used by both memory ballooning and by page reporting. It must be built if either is specified. Bug: 258944680 Change-Id: I3b949dadbfc4a2b17dba1809a46f0a7386e70ebf Signed-off-by: Keir Fraser Signed-off-by: Quentin Perret --- arch/arm64/include/asm/hypervisor.h | 2 +- arch/arm64/mm/Makefile | 2 +- include/linux/mem_relinquish.h | 6 +++--- mm/Kconfig | 7 +++++++ 4 files changed, 12 insertions(+), 5 deletions(-) diff --git a/arch/arm64/include/asm/hypervisor.h b/arch/arm64/include/asm/hypervisor.h index aa577913d019..9b4e4ed79623 100644 --- a/arch/arm64/include/asm/hypervisor.h +++ b/arch/arm64/include/asm/hypervisor.h @@ -10,7 +10,7 @@ void kvm_arm_init_hyp_services(void); void kvm_init_memshare_services(void); void kvm_init_ioremap_services(void); -#ifdef CONFIG_MEMORY_BALLOON +#ifdef CONFIG_MEMORY_RELINQUISH void kvm_init_memrelinquish_services(void); #else static inline void kvm_init_memrelinquish_services(void) {} diff --git a/arch/arm64/mm/Makefile b/arch/arm64/mm/Makefile index 542a4d0120d0..675de58aea92 100644 --- a/arch/arm64/mm/Makefile +++ b/arch/arm64/mm/Makefile @@ -3,7 +3,7 @@ obj-y := dma-mapping.o extable.o fault.o init.o \ cache.o copypage.o flush.o \ ioremap.o mem_encrypt.o mmap.o pgd.o mmu.o \ context.o proc.o pageattr.o -obj-$(CONFIG_MEMORY_BALLOON) += mem_relinquish.o +obj-$(CONFIG_MEMORY_RELINQUISH) += mem_relinquish.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o obj-$(CONFIG_PTDUMP_CORE) += ptdump.o obj-$(CONFIG_PTDUMP_DEBUGFS) += ptdump_debugfs.o diff --git a/include/linux/mem_relinquish.h b/include/linux/mem_relinquish.h index 6b7bf861d92d..d3213cb77fc5 100644 --- a/include/linux/mem_relinquish.h +++ b/include/linux/mem_relinquish.h @@ -7,14 +7,14 @@ #ifndef __MEM_RELINQUISH_H__ #define __MEM_RELINQUISH_H__ -#ifdef CONFIG_ARCH_HAS_MEM_RELINQUISH +#ifdef CONFIG_MEMORY_RELINQUISH #include -#else /* !CONFIG_ARCH_HAS_MEM_RELINQUISH */ +#else /* !CONFIG_MEMORY_RELINQUISH */ static inline void page_relinquish(struct page *page) { } -#endif /* CONFIG_ARCH_HAS_MEM_RELINQUISH */ +#endif /* CONFIG_MEMORY_RELINQUISH */ #endif /* __MEM_RELINQUISH_H__ */ diff --git a/mm/Kconfig b/mm/Kconfig index 9e8261dd622a..f41b9630fe32 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -541,6 +541,13 @@ config ARCH_ENABLE_SPLIT_PMD_PTLOCK config MEMORY_BALLOON bool +# +# support for memory relinquish +config MEMORY_RELINQUISH + def_bool y + depends on ARCH_HAS_MEM_RELINQUISH + depends on MEMORY_BALLOON || PAGE_REPORTING + # # support for memory balloon compaction config BALLOON_COMPACTION From 2e5989b50317a14ced415b0944cfc002a788d9e9 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 21 Jun 2021 16:51:02 +0100 Subject: [PATCH 116/457] ANDROID: KVM: arm64: Turn kvm_pgtable_stage2_set_owner into 
kvm_pgtable_stage2_annotate kvm_pgtable_stage2_set_owner() could be generalised into a way to store up to 63 bits in the page tables, as long as we don't set bit 0. Let's just do that. Bug: 209580772 Change-Id: I4e42d149b457870c35a5ae0f77e14c95dee16b4d Signed-off-by: Marc Zyngier [tabba@: Fix conflict in host_stage2_set_owner_locked()] Signed-off-by: Fuad Tabba Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_pgtable.h | 12 +++++++----- arch/arm64/kvm/hyp/nvhe/mem_protect.c | 18 ++++++++++++++++-- arch/arm64/kvm/hyp/pgtable.c | 20 ++++++-------------- 3 files changed, 29 insertions(+), 21 deletions(-) diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index 34cb93f3c96d..02546b1082f4 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -379,14 +379,16 @@ int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size, void *mc); /** - * kvm_pgtable_stage2_set_owner() - Unmap and annotate pages in the IPA space to - * track ownership. + * kvm_pgtable_stage2_annotate() - Unmap and annotate pages in the IPA space + * to track ownership (and more). * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*(). * @addr: Base intermediate physical address to annotate. * @size: Size of the annotated range. * @mc: Cache of pre-allocated and zeroed memory from which to allocate * page-table pages. - * @owner_id: Unique identifier for the owner of the page. + * @annotation: A 63 bit value that will be stored in the page tables. + * @annotation[0] must be 0, and @annotation[63:1] is stored + * in the page tables. * * By default, all page-tables are owned by identifier 0. This function can be * used to mark portions of the IPA space as owned by other entities. When a @@ -395,8 +397,8 @@ int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size, * * Return: 0 on success, negative error code on failure. */ -int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size, - void *mc, u8 owner_id); +int kvm_pgtable_stage2_annotate(struct kvm_pgtable *pgt, u64 addr, u64 size, + void *mc, kvm_pte_t annotation); /** * kvm_pgtable_stage2_unmap() - Remove a mapping from a guest stage-2 page-table. 
diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 2e934988adb2..6c49d9d691ea 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -532,10 +532,24 @@ int host_stage2_idmap_locked(phys_addr_t addr, u64 size, return host_stage2_try(__host_stage2_idmap, addr, addr + size, prot); } +#define KVM_INVALID_PTE_OWNER_MASK GENMASK(9, 2) +#define KVM_MAX_OWNER_ID FIELD_MAX(KVM_INVALID_PTE_OWNER_MASK) +static kvm_pte_t kvm_init_invalid_leaf_owner(u8 owner_id) +{ + return FIELD_PREP(KVM_INVALID_PTE_OWNER_MASK, owner_id); +} + int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, u8 owner_id) { - return host_stage2_try(kvm_pgtable_stage2_set_owner, &host_mmu.pgt, - addr, size, &host_s2_pool, owner_id); + kvm_pte_t annotation; + + if (owner_id > KVM_MAX_OWNER_ID) + return -EINVAL; + + annotation = kvm_init_invalid_leaf_owner(owner_id); + + return host_stage2_try(kvm_pgtable_stage2_annotate, &host_mmu.pgt, + addr, size, &host_s2_pool, annotation); } static bool host_stage2_force_pte_cb(u64 addr, u64 end, enum kvm_pgtable_prot prot) diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index b5e183f6f5a9..5992b7a7cb1f 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -46,9 +46,6 @@ KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \ KVM_PTE_LEAF_ATTR_HI_S2_XN) -#define KVM_INVALID_PTE_OWNER_MASK GENMASK(9, 2) -#define KVM_MAX_OWNER_ID FIELD_MAX(KVM_INVALID_PTE_OWNER_MASK) - struct kvm_pgtable_walk_data { struct kvm_pgtable *pgt; struct kvm_pgtable_walker *walker; @@ -155,11 +152,6 @@ static kvm_pte_t kvm_init_valid_leaf_pte(u64 pa, kvm_pte_t attr, u32 level) return pte; } -static kvm_pte_t kvm_init_invalid_leaf_owner(u8 owner_id) -{ - return FIELD_PREP(KVM_INVALID_PTE_OWNER_MASK, owner_id); -} - static int kvm_pgtable_visitor_cb(struct kvm_pgtable_walk_data *data, u64 addr, u32 level, kvm_pte_t *ptep, enum kvm_pgtable_walk_flags flag) @@ -553,7 +545,7 @@ void kvm_pgtable_hyp_destroy(struct kvm_pgtable *pgt) struct stage2_map_data { u64 phys; kvm_pte_t attr; - u8 owner_id; + u64 annotation; kvm_pte_t *anchor; kvm_pte_t *childp; @@ -720,7 +712,7 @@ static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level, if (kvm_phys_is_valid(phys)) new = kvm_init_valid_leaf_pte(phys, data->attr, level); else - new = kvm_init_invalid_leaf_owner(data->owner_id); + new = data->annotation; if (stage2_pte_is_counted(old)) { /* @@ -918,8 +910,8 @@ int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size, return ret; } -int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size, - void *mc, u8 owner_id) +int kvm_pgtable_stage2_annotate(struct kvm_pgtable *pgt, u64 addr, u64 size, + void *mc, kvm_pte_t annotation) { int ret; struct stage2_map_data map_data = { @@ -927,8 +919,8 @@ int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size, .mmu = pgt->mmu, .memcache = mc, .mm_ops = pgt->mm_ops, - .owner_id = owner_id, .force_pte = true, + .annotation = annotation, }; struct kvm_pgtable_walker walker = { .cb = stage2_map_walker, @@ -938,7 +930,7 @@ int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size, .arg = &map_data, }; - if (owner_id > KVM_MAX_OWNER_ID) + if (annotation & PTE_VALID) return -EINVAL; ret = kvm_pgtable_walk(pgt, addr, size, &walker); From 7c4000b1ea6346cb45b8d476978baf52307cb112 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Tue, 12 Oct 2021 15:18:45 +0100 Subject: [PATCH 117/457] ANDROID: KVM: arm64: 
FAR_EL2 mask as a define Create a macro definition for the FAR_EL2 mask and use it instead of a hard-coded value, and put it in a share header to be used by hyp. No functional change intended. Bug: 209580772 Change-Id: Ib83932d670cba6bf8f1ed45d2c0e1ed34331d98d Signed-off-by: Fuad Tabba Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_arm.h | 2 ++ arch/arm64/kvm/mmu.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index 3a49a03513d0..7313fac7086d 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -344,6 +344,8 @@ #define PAR_TO_HPFAR(par) \ (((par) & GENMASK_ULL(PHYS_MASK_SHIFT - 1, 12)) >> 8) +#define FAR_MASK GENMASK_ULL(11, 0) + #define ECN(x) { ESR_ELx_EC_##x, #x } #define kvm_arm_exception_class \ diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 610d8f146197..946145a93d85 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1678,7 +1678,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) * faulting VA. This is always 12 bits, irrespective * of the page size. */ - fault_ipa |= kvm_vcpu_get_hfar(vcpu) & ((1 << 12) - 1); + fault_ipa |= kvm_vcpu_get_hfar(vcpu) & FAR_MASK; ret = io_mem_abort(vcpu, fault_ipa); goto out_unlock; } From 2b8c3bb99baf9ded64b733f617e06c5ce9232f6a Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 21 Jun 2021 19:49:45 +0100 Subject: [PATCH 118/457] ANDROID: KVM: arm64: Define MMIO guard hypercalls Define the handful of hypercalls that MMIO guard will require. Bug: 209580772 Change-Id: Iac312b2327c31a1532fdb38e8fa8066291d9f611 Signed-off-by: Marc Zyngier Signed-off-by: Fuad Tabba Signed-off-by: Quentin Perret --- include/linux/arm-smccc.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h index 8224bed759ca..b23906d0652f 100644 --- a/include/linux/arm-smccc.h +++ b/include/linux/arm-smccc.h @@ -115,6 +115,10 @@ #define ARM_SMCCC_KVM_FUNC_HYP_MEMINFO 2 #define ARM_SMCCC_KVM_FUNC_MEM_SHARE 3 #define ARM_SMCCC_KVM_FUNC_MEM_UNSHARE 4 +#define ARM_SMCCC_KVM_FUNC_MMIO_GUARD_INFO 5 +#define ARM_SMCCC_KVM_FUNC_MMIO_GUARD_ENROLL 6 +#define ARM_SMCCC_KVM_FUNC_MMIO_GUARD_MAP 7 +#define ARM_SMCCC_KVM_FUNC_MMIO_GUARD_UNMAP 8 #define ARM_SMCCC_KVM_FUNC_MEM_RELINQUISH 9 #define ARM_SMCCC_KVM_FUNC_FEATURES_2 127 #define ARM_SMCCC_KVM_NUM_FUNCS 128 @@ -166,6 +170,30 @@ #define KVM_PTP_VIRT_COUNTER 0 #define KVM_PTP_PHYS_COUNTER 1 +#define ARM_SMCCC_VENDOR_HYP_KVM_MMIO_GUARD_INFO_FUNC_ID \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_64, \ + ARM_SMCCC_OWNER_VENDOR_HYP, \ + ARM_SMCCC_KVM_FUNC_MMIO_GUARD_INFO) + +#define ARM_SMCCC_VENDOR_HYP_KVM_MMIO_GUARD_ENROLL_FUNC_ID \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_64, \ + ARM_SMCCC_OWNER_VENDOR_HYP, \ + ARM_SMCCC_KVM_FUNC_MMIO_GUARD_ENROLL) + +#define ARM_SMCCC_VENDOR_HYP_KVM_MMIO_GUARD_MAP_FUNC_ID \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_64, \ + ARM_SMCCC_OWNER_VENDOR_HYP, \ + ARM_SMCCC_KVM_FUNC_MMIO_GUARD_MAP) + +#define ARM_SMCCC_VENDOR_HYP_KVM_MMIO_GUARD_UNMAP_FUNC_ID \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_64, \ + ARM_SMCCC_OWNER_VENDOR_HYP, \ + ARM_SMCCC_KVM_FUNC_MMIO_GUARD_UNMAP) + /* Paravirtualised time calls (defined by ARM DEN0057A) */ #define ARM_SMCCC_HV_PV_TIME_FEATURES \ ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ From d8bf88fa79324c46b5cdecb065428abeee18b6f4 Mon Sep 17 00:00:00 2001 From: Marc 
Zyngier Date: Thu, 6 Jan 2022 10:01:01 +0000 Subject: [PATCH 119/457] ANDROID: KVM: arm64: Expose topup_hyp_memcache() to the rest of KVM In order to simplify the implementation of an EL2-only version of MMIO guard, expose topup_hyp_memcache() and simplify its usage by only requiring a vcpu. Bug: 209580772 Change-Id: I4f54c57a9693cf7a3450f99fedc15ae32af09a31 Signed-off-by: Marc Zyngier [tabba@: original patch did the same for free_hyp_memcache(), but it's already exposed] Signed-off-by: Fuad Tabba Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_host.h | 2 +- arch/arm64/kvm/mmu.c | 9 +++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index bac1038ace93..b7d175e73a1b 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -128,7 +128,7 @@ static inline void __free_hyp_memcache(struct kvm_hyp_memcache *mc, } void free_hyp_memcache(struct kvm_hyp_memcache *mc); -int topup_hyp_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages); +int topup_hyp_memcache(struct kvm_vcpu *vcpu); struct kvm_vmid { atomic64_t id; diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 946145a93d85..b23af71ebef6 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -878,12 +878,14 @@ void free_hyp_memcache(struct kvm_hyp_memcache *mc) kvm_host_va, NULL); } -int topup_hyp_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages) +int topup_hyp_memcache(struct kvm_vcpu *vcpu) { if (!is_protected_kvm_enabled()) return 0; - return __topup_hyp_memcache(mc, min_pages, hyp_mc_alloc_fn, + return __topup_hyp_memcache(&vcpu->arch.pkvm_memcache, + kvm_mmu_cache_min_pages(vcpu->kvm), + hyp_mc_alloc_fn, kvm_host_pa, NULL); } @@ -1234,7 +1236,6 @@ static int insert_ppage(struct kvm *kvm, struct kvm_pinned_page *ppage) static int pkvm_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, unsigned long hva) { - struct kvm_hyp_memcache *hyp_memcache = &vcpu->arch.pkvm_memcache; struct mm_struct *mm = current->mm; unsigned int flags = FOLL_HWPOISON | FOLL_LONGTERM | FOLL_WRITE; struct kvm_pinned_page *ppage; @@ -1243,7 +1244,7 @@ static int pkvm_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, u64 pfn; int ret; - ret = topup_hyp_memcache(hyp_memcache, kvm_mmu_cache_min_pages(kvm)); + ret = topup_hyp_memcache(vcpu); if (ret) return -ENOMEM; From e439ca081061b2b35ea70329adda50a607683c8b Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 6 Jan 2022 10:23:51 +0000 Subject: [PATCH 120/457] ANDROID: KVM: arm64: Introduce KVM_ARCH_FLAG_MMIO_GUARD flag Add a per-VM flag indicating that the guest has bought into the MMIO guard enforcement framework. 
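(For illustration only, not part of this patch: a minimal sketch of how such a per-VM flag is typically consumed, using the generic bitops already used for the other kvm_arch flags; the helper name is made up for this example.)

  /* Hypothetical helper: has this guest enrolled in MMIO guard? */
  static inline bool kvm_vm_has_mmio_guard(struct kvm *kvm)
  {
          return test_bit(KVM_ARCH_FLAG_MMIO_GUARD, &kvm->arch.flags);
  }

Later patches in the series set this bit from the ENROLL hypercall handler and test it before honouring MAP/UNMAP requests.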
Bug: 209580772 Change-Id: If60b2b38a419a9f44ebe9029f55dd016fd2444b5 Signed-off-by: Marc Zyngier [tabba@: had to assign it a new number since there are existing flags now] Signed-off-by: Fuad Tabba Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_host.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index b7d175e73a1b..6ebb2de3b120 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -222,7 +222,8 @@ struct kvm_arch { #define KVM_ARCH_FLAG_EL1_32BIT 4 /* PSCI SYSTEM_SUSPEND enabled for the guest */ #define KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED 5 - + /* Guest has bought into the MMIO guard extension */ +#define KVM_ARCH_FLAG_MMIO_GUARD 6 unsigned long flags; /* From 0e40cc466b287ddf9d1744887344a91ec82a62f6 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 6 Jan 2022 10:29:13 +0000 Subject: [PATCH 121/457] ANDROID: KVM: arm64: pkvm: Add MMIO guard infrastructure Introduce the infrastructure required to identify an IPA region that is expected to be used as an MMIO window. This include mapping, unmapping and checking the regions. Nothing calls into it yet, so no expected functional change. Bug: 209580772 Change-Id: I227eaa28b98e067e3daae4f9e1071eb37a6761cc Signed-off-by: Marc Zyngier [tabba@: use the new pkvm_hyp_* infrastructure, and remove redundant reassignment in __pkvm_remove_ioguard_page()] Signed-off-by: Fuad Tabba Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/include/nvhe/mem_protect.h | 3 + arch/arm64/kvm/hyp/nvhe/mem_protect.c | 100 ++++++++++++++++++ 2 files changed, 103 insertions(+) diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h index 81a2c9c9450a..b8e47a47c1e6 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h @@ -74,6 +74,9 @@ int __pkvm_guest_share_host(struct pkvm_hyp_vcpu *hyp_vcpu, u64 ipa); int __pkvm_guest_unshare_host(struct pkvm_hyp_vcpu *hyp_vcpu, u64 ipa); int __pkvm_guest_relinquish_to_host(struct pkvm_hyp_vcpu *vcpu, u64 ipa, u64 *ppa); +int __pkvm_install_ioguard_page(struct pkvm_hyp_vcpu *hyp_vcpu, u64 ipa); +int __pkvm_remove_ioguard_page(struct pkvm_hyp_vcpu *hyp_vcpu, u64 ipa); +bool __pkvm_check_ioguard_page(struct pkvm_hyp_vcpu *hyp_vcpu); bool addr_is_memory(phys_addr_t phys); int host_stage2_idmap_locked(phys_addr_t addr, u64 size, enum kvm_pgtable_prot prot); diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 6c49d9d691ea..ea87f5fd4cde 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -1823,3 +1823,103 @@ unlock: return ret; } + +/* Replace this with something more structured once day */ +#define MMIO_NOTE (('M' << 24 | 'M' << 16 | 'I' << 8 | 'O') << 1) + +static bool __check_ioguard_page(struct pkvm_hyp_vcpu *hyp_vcpu, u64 ipa) +{ + struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(hyp_vcpu); + kvm_pte_t pte; + u32 level; + int ret; + + ret = kvm_pgtable_get_leaf(&vm->pgt, ipa, &pte, &level); + if (ret) + return false; + + /* Must be a PAGE_SIZE mapping with our annotation */ + return (BIT(ARM64_HW_PGTABLE_LEVEL_SHIFT(level)) == PAGE_SIZE && + pte == MMIO_NOTE); +} + +int __pkvm_install_ioguard_page(struct pkvm_hyp_vcpu *hyp_vcpu, u64 ipa) +{ + struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(hyp_vcpu); + kvm_pte_t pte; + u32 level; + int ret; + + if (!test_bit(KVM_ARCH_FLAG_MMIO_GUARD, 
&vm->kvm.arch.flags)) + return -EINVAL; + + if (ipa & ~PAGE_MASK) + return -EINVAL; + + guest_lock_component(vm); + + ret = kvm_pgtable_get_leaf(&vm->pgt, ipa, &pte, &level); + if (ret) + goto unlock; + + if (pte && BIT(ARM64_HW_PGTABLE_LEVEL_SHIFT(level)) == PAGE_SIZE) { + /* + * Already flagged as MMIO, let's accept it, and fail + * otherwise + */ + if (pte != MMIO_NOTE) + ret = -EBUSY; + + goto unlock; + } + + ret = kvm_pgtable_stage2_annotate(&vm->pgt, ipa, PAGE_SIZE, + &hyp_vcpu->vcpu.arch.pkvm_memcache, + MMIO_NOTE); + +unlock: + guest_unlock_component(vm); + return ret; +} + +int __pkvm_remove_ioguard_page(struct pkvm_hyp_vcpu *hyp_vcpu, u64 ipa) +{ + struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(hyp_vcpu); + + if (!test_bit(KVM_ARCH_FLAG_MMIO_GUARD, &vm->kvm.arch.flags)) + return -EINVAL; + + guest_lock_component(vm); + + if (__check_ioguard_page(hyp_vcpu, ipa)) + WARN_ON(kvm_pgtable_stage2_unmap(&vm->pgt, + ALIGN_DOWN(ipa, PAGE_SIZE), PAGE_SIZE)); + + guest_unlock_component(vm); + return 0; +} + +bool __pkvm_check_ioguard_page(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(hyp_vcpu); + u64 ipa, end; + bool ret; + + if (!kvm_vcpu_dabt_isvalid(&hyp_vcpu->vcpu)) + return false; + + if (!test_bit(KVM_ARCH_FLAG_MMIO_GUARD, &vm->kvm.arch.flags)) + return true; + + ipa = kvm_vcpu_get_fault_ipa(&hyp_vcpu->vcpu); + ipa |= kvm_vcpu_get_hfar(&hyp_vcpu->vcpu) & FAR_MASK; + end = ipa + kvm_vcpu_dabt_get_as(&hyp_vcpu->vcpu) - 1; + + guest_lock_component(vm); + ret = __check_ioguard_page(hyp_vcpu, ipa); + if ((end & PAGE_MASK) != (ipa & PAGE_MASK)) + ret &= __check_ioguard_page(hyp_vcpu, end); + guest_unlock_component(vm); + + return ret; +} From 1ac4d8a25d9514c77a16d0391a6643cedfc68ce3 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 21 Jun 2021 19:49:45 +0100 Subject: [PATCH 122/457] ANDROID: KVM: arm64: pkvm: Wire MMIO guard hypercalls Plumb in the hypercall interface to allow a guest to discover, enroll, map and unmap MMIO regions. 
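(Guest-side illustration only, not part of this patch: a sketch of how a guest could ask for one granule-sized IPA range to be treated as MMIO. It assumes <linux/arm-smccc.h> and the function IDs defined earlier in the series; the mair_idx argument layout follows the MMIO guard documentation added later, and the helper name is invented for the example.)

  static int mmio_guard_map_granule(phys_addr_t ipa, u64 mair_idx)
  {
          struct arm_smccc_res res;

          /* r1: base of the granule-sized IPA range, r2: MAIR_EL1 index */
          arm_smccc_1_1_hvc(ARM_SMCCC_VENDOR_HYP_KVM_MMIO_GUARD_MAP_FUNC_ID,
                            ipa, mair_idx, &res);

          return res.a0 == SMCCC_RET_SUCCESS ? 0 : -EINVAL;
  }

The real in-kernel caller is the ioremap hook wired up elsewhere in the series.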
Bug: 209580772 Change-Id: I0390456ffde8ceca351d3d8e82fd1dddeb747fac Signed-off-by: Marc Zyngier Signed-off-by: Quentin Perret [tabba@: - use the new pkvm_hyp_* infrastructure - move pkvm_refill_memcache() up in file to expose it to handle_pvm_entry_hvc64() - include asm/stage2_pgtable.h in hypercalls.c for topup_hyp_memcache() - fix pkvm_install_ioguard_page() retval to u64, reported in b/253586500 and fixed in a separate patch before - fix smccc to return success, reported in b/251426790 and fixed in a separate patch before ] Signed-off-by: Fuad Tabba Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/nvhe/hyp-main.c | 27 ++++++++++++------- arch/arm64/kvm/hyp/nvhe/pkvm.c | 42 ++++++++++++++++++++++++++++++ arch/arm64/kvm/hypercalls.c | 4 +++ 3 files changed, 63 insertions(+), 10 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 82b139255136..94d81b094182 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -37,6 +37,16 @@ DEFINE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params); void __kvm_hyp_host_forward_smc(struct kvm_cpu_context *host_ctxt); +static int pkvm_refill_memcache(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct pkvm_hyp_vm *hyp_vm = pkvm_hyp_vcpu_to_hyp_vm(hyp_vcpu); + u64 nr_pages = VTCR_EL2_LVLS(hyp_vm->kvm.arch.vtcr) - 1; + struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; + + return refill_memcache(&hyp_vcpu->vcpu.arch.pkvm_memcache, nr_pages, + &host_vcpu->arch.pkvm_memcache); +} + typedef void (*hyp_entry_exit_handler_fn)(struct pkvm_hyp_vcpu *); static void handle_pvm_entry_wfx(struct pkvm_hyp_vcpu *hyp_vcpu) @@ -89,6 +99,9 @@ static void handle_pvm_entry_hvc64(struct pkvm_hyp_vcpu *hyp_vcpu) u32 fn = smccc_get_function(&hyp_vcpu->vcpu); switch (fn) { + case ARM_SMCCC_VENDOR_HYP_KVM_MMIO_GUARD_MAP_FUNC_ID: + pkvm_refill_memcache(hyp_vcpu); + break; case ARM_SMCCC_VENDOR_HYP_KVM_MEM_SHARE_FUNC_ID: fallthrough; case ARM_SMCCC_VENDOR_HYP_KVM_MEM_UNSHARE_FUNC_ID: @@ -267,6 +280,10 @@ static void handle_pvm_exit_hvc64(struct pkvm_hyp_vcpu *hyp_vcpu) n = 4; break; + case ARM_SMCCC_VENDOR_HYP_KVM_MMIO_GUARD_MAP_FUNC_ID: + n = 3; + break; + case PSCI_1_1_FN_SYSTEM_RESET2: case PSCI_1_1_FN64_SYSTEM_RESET2: n = 3; @@ -825,16 +842,6 @@ out: cpu_reg(host_ctxt, 1) = ret; } -static int pkvm_refill_memcache(struct pkvm_hyp_vcpu *hyp_vcpu) -{ - struct pkvm_hyp_vm *hyp_vm = pkvm_hyp_vcpu_to_hyp_vm(hyp_vcpu); - u64 nr_pages = VTCR_EL2_LVLS(hyp_vm->kvm.arch.vtcr) - 1; - struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; - - return refill_memcache(&hyp_vcpu->vcpu.arch.pkvm_memcache, nr_pages, - &host_vcpu->arch.pkvm_memcache); -} - static void handle___pkvm_host_map_guest(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(u64, pfn, host_ctxt, 1); diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index d70ff178bb8e..24c7f0486c65 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -1317,6 +1317,31 @@ out_guest_err: return true; } +static bool pkvm_install_ioguard_page(struct pkvm_hyp_vcpu *hyp_vcpu, u64 *exit_code) +{ + u64 retval = SMCCC_RET_SUCCESS; + u64 ipa = smccc_get_arg1(&hyp_vcpu->vcpu); + int ret; + + ret = __pkvm_install_ioguard_page(hyp_vcpu, ipa); + if (ret == -ENOMEM) { + /* + * We ran out of memcache, let's ask for more. Cancel + * the effects of the HVC that took us here, and + * forward the hypercall to the host for page donation + * purposes. 
+ */ + write_sysreg_el2(read_sysreg_el2(SYS_ELR) - 4, SYS_ELR); + return false; + } + + if (ret) + retval = SMCCC_RET_INVALID_PARAMETER; + + smccc_set_retval(&hyp_vcpu->vcpu, retval, 0, 0, 0); + return true; +} + bool smccc_trng_available; static bool pkvm_forward_trng(struct kvm_vcpu *vcpu) @@ -1377,7 +1402,24 @@ bool kvm_handle_pvm_hvc64(struct kvm_vcpu *vcpu, u64 *exit_code) val[0] |= BIT(ARM_SMCCC_KVM_FUNC_MEM_SHARE); val[0] |= BIT(ARM_SMCCC_KVM_FUNC_MEM_UNSHARE); val[0] |= BIT(ARM_SMCCC_KVM_FUNC_MEM_RELINQUISH); + val[0] |= BIT(ARM_SMCCC_KVM_FUNC_MMIO_GUARD_INFO); + val[0] |= BIT(ARM_SMCCC_KVM_FUNC_MMIO_GUARD_ENROLL); + val[0] |= BIT(ARM_SMCCC_KVM_FUNC_MMIO_GUARD_MAP); + val[0] |= BIT(ARM_SMCCC_KVM_FUNC_MMIO_GUARD_UNMAP); break; + case ARM_SMCCC_VENDOR_HYP_KVM_MMIO_GUARD_ENROLL_FUNC_ID: + set_bit(KVM_ARCH_FLAG_MMIO_GUARD, &vcpu->kvm->arch.flags); + val[0] = SMCCC_RET_SUCCESS; + break; + case ARM_SMCCC_VENDOR_HYP_KVM_MMIO_GUARD_MAP_FUNC_ID: + return pkvm_install_ioguard_page(hyp_vcpu, exit_code); + case ARM_SMCCC_VENDOR_HYP_KVM_MMIO_GUARD_UNMAP_FUNC_ID: + if (__pkvm_remove_ioguard_page(hyp_vcpu, vcpu_get_reg(vcpu, 1))) + val[0] = SMCCC_RET_INVALID_PARAMETER; + else + val[0] = SMCCC_RET_SUCCESS; + break; + case ARM_SMCCC_VENDOR_HYP_KVM_MMIO_GUARD_INFO_FUNC_ID: case ARM_SMCCC_VENDOR_HYP_KVM_HYP_MEMINFO_FUNC_ID: return pkvm_meminfo_call(hyp_vcpu); case ARM_SMCCC_VENDOR_HYP_KVM_MEM_SHARE_FUNC_ID: diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c index 2a92b9bd2a46..a92428ce1912 100644 --- a/arch/arm64/kvm/hypercalls.c +++ b/arch/arm64/kvm/hypercalls.c @@ -228,6 +228,10 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu) pkvm_host_reclaim_page(vcpu->kvm, smccc_get_arg1(vcpu)); val[0] = SMCCC_RET_SUCCESS; break; + case ARM_SMCCC_VENDOR_HYP_KVM_MMIO_GUARD_MAP_FUNC_ID: + if (kvm_vm_is_protected(vcpu->kvm) && !topup_hyp_memcache(vcpu)) + val[0] = SMCCC_RET_SUCCESS; + break; case ARM_SMCCC_TRNG_VERSION: case ARM_SMCCC_TRNG_FEATURES: case ARM_SMCCC_TRNG_GET_UUID: From b0af5b4925a5b27ebd38938fe7f47159d56f0372 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 21 Jun 2021 19:00:50 +0100 Subject: [PATCH 123/457] ANDROID: KVM: arm64: Plumb MMIO checking into the fault handling Plumb the MMIO checking code into the MMIO fault handling code. Any fault hitting outside of an MMIO region will now report an invalid syndrome, and won't leak any data from the guest. Bug: 209580772 Change-Id: I68bef2d0211a804aa1e598aeaa0c85dc4098f61e Signed-off-by: Marc Zyngier Signed-off-by: Fuad Tabba Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/nvhe/hyp-main.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 94d81b094182..b4719b3db2f9 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -319,11 +319,7 @@ static void handle_pvm_exit_dabt(struct pkvm_hyp_vcpu *hyp_vcpu) { struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; - /* - * For now, we treat all data aborts as MMIO since we have no knowledge - * of the memslot configuration at EL2. - */ - hyp_vcpu->vcpu.mmio_needed = true; + hyp_vcpu->vcpu.mmio_needed = __pkvm_check_ioguard_page(hyp_vcpu); if (hyp_vcpu->vcpu.mmio_needed) { /* r0 as transfer register between the guest and the host. 
*/ From 0dbb5f89f5eaaf7fbc29da2f586df4597adb9ca7 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 5 Jul 2021 10:33:15 +0100 Subject: [PATCH 124/457] ANDROID: KVM: arm64: Add some documentation for the MMIO guard feature Document the hypercalls user for the MMIO guard infrastructure. Bug: 209580772 Change-Id: I927bcd6c5e3ef932265d817288ff2b46b0e0db66 Signed-off-by: Marc Zyngier Signed-off-by: Fuad Tabba Signed-off-by: Quentin Perret --- Documentation/virt/kvm/arm/index.rst | 1 + Documentation/virt/kvm/arm/mmio-guard.rst | 74 +++++++++++++++++++++++ 2 files changed, 75 insertions(+) create mode 100644 Documentation/virt/kvm/arm/mmio-guard.rst diff --git a/Documentation/virt/kvm/arm/index.rst b/Documentation/virt/kvm/arm/index.rst index 1e3f28722768..4d795a7da2c8 100644 --- a/Documentation/virt/kvm/arm/index.rst +++ b/Documentation/virt/kvm/arm/index.rst @@ -13,3 +13,4 @@ ARM pkvm pvtime ptp_kvm + mmio-guard diff --git a/Documentation/virt/kvm/arm/mmio-guard.rst b/Documentation/virt/kvm/arm/mmio-guard.rst new file mode 100644 index 000000000000..8b3c852c5d92 --- /dev/null +++ b/Documentation/virt/kvm/arm/mmio-guard.rst @@ -0,0 +1,74 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============== +KVM MMIO guard +============== + +KVM implements device emulation by handling translation faults to any +IPA range that is not contained in a memory slot. Such a translation +fault is in most cases passed on to userspace (or in rare cases to the +host kernel) with the address, size and possibly data of the access +for emulation. + +Should the guest exit with an address that is not one that corresponds +to an emulatable device, userspace may take measures that are not the +most graceful as far as the guest is concerned (such as terminating it +or delivering a fatal exception). + +There is also an element of trust: by forwarding the request to +userspace, the kernel assumes that the guest trusts userspace to do +the right thing. + +The KVM MMIO guard offers a way to mitigate this last point: a guest +can request that only certain regions of the IPA space are valid as +MMIO. Only these regions will be handled as an MMIO, and any other +will result in an exception being delivered to the guest. + +This relies on a set of hypercalls defined in the KVM-specific range, +using the HVC64 calling convention. + +* ARM_SMCCC_KVM_FUNC_MMIO_GUARD_INFO + + ============== ======== ================================ + Function ID: (uint32) 0xC6000002 + Arguments: none + Return Values: (int64) NOT_SUPPORTED(-1) on error, or + (uint64) Protection Granule (PG) size in + bytes (r0) + ============== ======== ================================ + +* ARM_SMCCC_KVM_FUNC_MMIO_GUARD_ENROLL + + ============== ======== ============================== + Function ID: (uint32) 0xC6000003 + Arguments: none + Return Values: (int64) NOT_SUPPORTED(-1) on error, or + RET_SUCCESS(0) (r0) + ============== ======== ============================== + +* ARM_SMCCC_KVM_FUNC_MMIO_GUARD_MAP + + ============== ======== ==================================== + Function ID: (uint32) 0xC6000004 + Arguments: (uint64) The base of the PG-sized IPA range + that is allowed to be accessed as + MMIO. 
Must be aligned to the PG size + (r1) + (uint64) Index in the MAIR_EL1 register + providing the memory attribute that + is used by the guest (r2) + Return Values: (int64) NOT_SUPPORTED(-1) on error, or + RET_SUCCESS(0) (r0) + ============== ======== ==================================== + +* ARM_SMCCC_KVM_FUNC_MMIO_GUARD_UNMAP + + ============== ======== ====================================== + Function ID: (uint32) 0xC6000005 + Arguments: (uint64) PG-sized IPA range aligned to the PG + size which has been previously mapped. + Must be aligned to the PG size and + have been previously mapped (r1) + Return Values: (int64) NOT_SUPPORTED(-1) on error, or + RET_SUCCESS(0) (r0) + ============== ======== ====================================== From 437fc98ccb2dd2ac42f2e831260ef33a528ba7ce Mon Sep 17 00:00:00 2001 From: Sebastian Ene Date: Thu, 21 Apr 2022 10:45:51 +0000 Subject: [PATCH 125/457] ANDROID: arm64: Auto-enroll MMIO guard on protected vms Set the MMIO guard flag for protected vms prior to entering the guest for the first time. Bug: 216798684 Change-Id: I1448102ae85176d495ae7f8d6d20de4092049f0d Signed-off-by: Sebastian Ene Signed-off-by: Fuad Tabba Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/nvhe/pkvm.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 24c7f0486c65..18fc2d0a6e70 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -929,6 +929,9 @@ void pkvm_reset_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) /* PC: IPA of pvmfw base */ *vcpu_pc(&hyp_vcpu->vcpu) = entry; hyp_vm->pvmfw_entry_vcpu = NULL; + + /* Auto enroll MMIO guard */ + set_bit(KVM_ARCH_FLAG_MMIO_GUARD, &hyp_vm->kvm.arch.flags); } reset_state->reset = false; From 9720e4d3745d7ced5a9fd2fc22f36a56e787368b Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Wed, 9 Nov 2022 14:32:41 +0000 Subject: [PATCH 126/457] ANDROID: arm64: Check if pfn is valid for all ioremap loop iterations The size could cover pfns that are valid. Fixes: arm64: Implement ioremap/iounmap hooks calling into KVM's MMIO guard Bug: 251432016 Change-Id: Ie3b678d40d629ed610b0ee484b5007cb64437435 Signed-off-by: Fuad Tabba Signed-off-by: Quentin Perret --- arch/arm64/mm/ioremap.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm64/mm/ioremap.c b/arch/arm64/mm/ioremap.c index e91a79c5e3df..e0cc7b5bf4a3 100644 --- a/arch/arm64/mm/ioremap.c +++ b/arch/arm64/mm/ioremap.c @@ -114,9 +114,6 @@ void ioremap_phys_range_hook(phys_addr_t phys_addr, size_t size, pgprot_t prot) if (!static_branch_unlikely(&ioremap_guard_key)) return; - if (pfn_valid(__phys_to_pfn(phys_addr))) - return; - mutex_lock(&ioremap_guard_lock); while (size) { @@ -124,6 +121,9 @@ void ioremap_phys_range_hook(phys_addr_t phys_addr, size_t size, pgprot_t prot) struct ioremap_guard_ref *ref; struct arm_smccc_res res; + if (pfn_valid(__phys_to_pfn(phys_addr))) + goto next; + ref = xa_load(&ioremap_guard_array, pfn); if (ref) { refcount_inc(&ref->count); From eb0f5f7486f113af2b2ece434edfa831b47e4fa9 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Wed, 9 Nov 2022 14:40:29 +0000 Subject: [PATCH 127/457] ANDROID: arm64: ioremap/iounmap use stage-2 granule size When MMIO guard is queried, it advertises the guard granule size it uses. Use that value. 
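(Illustrative restatement only, not new logic: the granule validation below in positive form, plus a worked size example.)

  /* The advertised granule must be a non-zero power of two no larger than
   * PAGE_SIZE, so it always divides a kernel page exactly. */
  bool granule_ok = granule && !(granule & (granule - 1)) && granule <= PAGE_SIZE;

  /* e.g. 64KiB kernel pages over a 4KiB guard granule: 16 hypercalls/page */
  size_t calls_per_page = PAGE_SIZE / guard_granule;

This is why the map/unmap loops below step by guard_granule rather than PAGE_SIZE.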
Fixes: arm64: Implement ioremap/iounmap hooks calling into KVM's MMIO guard Bug: 251432016 Change-Id: Iff4dcb6229bf89aef832a29a98fecc041a1aec1b Signed-off-by: Fuad Tabba Signed-off-by: Quentin Perret --- arch/arm64/mm/ioremap.c | 40 ++++++++++++++++++++++++++++------------ 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/arch/arm64/mm/ioremap.c b/arch/arm64/mm/ioremap.c index e0cc7b5bf4a3..0d05e91491a2 100644 --- a/arch/arm64/mm/ioremap.c +++ b/arch/arm64/mm/ioremap.c @@ -59,6 +59,8 @@ static DEFINE_STATIC_KEY_FALSE(ioremap_guard_key); static DEFINE_XARRAY(ioremap_guard_array); static DEFINE_MUTEX(ioremap_guard_lock); +static size_t guard_granule; + static bool ioremap_guard; static int __init ioremap_guard_setup(char *str) { @@ -82,6 +84,7 @@ static void fixup_fixmap(void) void kvm_init_ioremap_services(void) { struct arm_smccc_res res; + size_t granule; if (!ioremap_guard) return; @@ -95,12 +98,18 @@ void kvm_init_ioremap_services(void) arm_smccc_1_1_invoke(ARM_SMCCC_VENDOR_HYP_KVM_MMIO_GUARD_INFO_FUNC_ID, 0, 0, 0, &res); - if (res.a0 != PAGE_SIZE) + granule = res.a0; + if (granule > PAGE_SIZE || !granule || (granule & (granule - 1))) { + pr_warn("KVM MMIO guard initialization failed: " + "guard granule (%lu), page size (%lu)\n", + granule, PAGE_SIZE); return; + } arm_smccc_1_1_invoke(ARM_SMCCC_VENDOR_HYP_KVM_MMIO_GUARD_ENROLL_FUNC_ID, &res); if (res.a0 == SMCCC_RET_SUCCESS) { + guard_granule = granule; static_branch_enable(&ioremap_guard_key); fixup_fixmap(); pr_info("Using KVM MMIO guard for ioremap\n"); @@ -111,20 +120,24 @@ void kvm_init_ioremap_services(void) void ioremap_phys_range_hook(phys_addr_t phys_addr, size_t size, pgprot_t prot) { + int guard_shift; + if (!static_branch_unlikely(&ioremap_guard_key)) return; + guard_shift = __builtin_ctzl(guard_granule); + mutex_lock(&ioremap_guard_lock); while (size) { - u64 pfn = phys_addr >> PAGE_SHIFT; + u64 guard_fn = phys_addr >> guard_shift; struct ioremap_guard_ref *ref; struct arm_smccc_res res; if (pfn_valid(__phys_to_pfn(phys_addr))) goto next; - ref = xa_load(&ioremap_guard_array, pfn); + ref = xa_load(&ioremap_guard_array, guard_fn); if (ref) { refcount_inc(&ref->count); goto next; @@ -141,7 +154,7 @@ void ioremap_phys_range_hook(phys_addr_t phys_addr, size_t size, pgprot_t prot) ref = kzalloc(sizeof(*ref), GFP_KERNEL); if (ref) { refcount_set(&ref->count, 1); - if (xa_err(xa_store(&ioremap_guard_array, pfn, ref, + if (xa_err(xa_store(&ioremap_guard_array, guard_fn, ref, GFP_KERNEL))) { kfree(ref); ref = NULL; @@ -153,14 +166,14 @@ void ioremap_phys_range_hook(phys_addr_t phys_addr, size_t size, pgprot_t prot) if (res.a0 != SMCCC_RET_SUCCESS) { pr_warn_ratelimited("Failed to register %llx\n", phys_addr); - xa_erase(&ioremap_guard_array, pfn); + xa_erase(&ioremap_guard_array, guard_fn); kfree(ref); goto out; } next: - size -= PAGE_SIZE; - phys_addr += PAGE_SIZE; + size -= guard_granule; + phys_addr += guard_granule; } out: mutex_unlock(&ioremap_guard_lock); @@ -168,19 +181,22 @@ out: void iounmap_phys_range_hook(phys_addr_t phys_addr, size_t size) { + int guard_shift; + if (!static_branch_unlikely(&ioremap_guard_key)) return; VM_BUG_ON(phys_addr & ~PAGE_MASK || size & ~PAGE_MASK); + guard_shift = __builtin_ctzl(guard_granule); mutex_lock(&ioremap_guard_lock); while (size) { - u64 pfn = phys_addr >> PAGE_SHIFT; + u64 guard_fn = phys_addr >> guard_shift; struct ioremap_guard_ref *ref; struct arm_smccc_res res; - ref = xa_load(&ioremap_guard_array, pfn); + ref = xa_load(&ioremap_guard_array, guard_fn); if (!ref) { 
pr_warn_ratelimited("%llx not tracked, left mapped\n", phys_addr); @@ -190,7 +206,7 @@ void iounmap_phys_range_hook(phys_addr_t phys_addr, size_t size) if (!refcount_dec_and_test(&ref->count)) goto next; - xa_erase(&ioremap_guard_array, pfn); + xa_erase(&ioremap_guard_array, guard_fn); kfree(ref); arm_smccc_1_1_hvc(ARM_SMCCC_VENDOR_HYP_KVM_MMIO_GUARD_UNMAP_FUNC_ID, @@ -202,8 +218,8 @@ void iounmap_phys_range_hook(phys_addr_t phys_addr, size_t size) } next: - size -= PAGE_SIZE; - phys_addr += PAGE_SIZE; + size -= guard_granule; + phys_addr += guard_granule; } out: mutex_unlock(&ioremap_guard_lock); From cafd6d7893a56e7631eae0b98865be466a52fc7d Mon Sep 17 00:00:00 2001 From: Keir Fraser Date: Wed, 16 Nov 2022 10:44:02 +0000 Subject: [PATCH 128/457] ANDROID: KVM: Include prototype for page_relinquish before definition Fixes build failure on -Werror=missing-prototypes. At the same time, make the header file more resilient to ordering by declaring 'struct page'. Bug: 240239989 Change-Id: I84d069bde5ff03d1afa55d25c01448b0d43042da Signed-off-by: Keir Fraser Signed-off-by: Quentin Perret --- arch/arm64/include/asm/mem_relinquish.h | 2 ++ arch/arm64/mm/mem_relinquish.c | 1 + 2 files changed, 3 insertions(+) diff --git a/arch/arm64/include/asm/mem_relinquish.h b/arch/arm64/include/asm/mem_relinquish.h index a4ace9e6e413..091de6767440 100644 --- a/arch/arm64/include/asm/mem_relinquish.h +++ b/arch/arm64/include/asm/mem_relinquish.h @@ -7,6 +7,8 @@ #ifndef __ASM_MEM_RELINQUISH_H #define __ASM_MEM_RELINQUISH_H +struct page; + void page_relinquish(struct page *page); #endif /* __ASM_MEM_RELINQUISH_H */ diff --git a/arch/arm64/mm/mem_relinquish.c b/arch/arm64/mm/mem_relinquish.c index c95bcbb14d92..feea3adc2acc 100644 --- a/arch/arm64/mm/mem_relinquish.c +++ b/arch/arm64/mm/mem_relinquish.c @@ -5,6 +5,7 @@ */ #include +#include #include #include #include From 15a4df5ad44253054a599efb599866f069abc6a4 Mon Sep 17 00:00:00 2001 From: Keir Fraser Date: Wed, 16 Nov 2022 10:48:54 +0000 Subject: [PATCH 129/457] ANDROID: KVM: arm64: Strictly check page type in MEM_RELINQUISH hypercall The VM should only relinquish "normal" pages. For a protected VM, this means PAGE_OWNED; For a normal VM, this means PAGE_SHARED_BORROWED. All other page types are rejected and failure is reported to the caller. 
Bug: 259217067 Change-Id: Icff3474dc2c975a6c5befe546c5521a05b3bd575 Signed-off-by: Keir Fraser Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/nvhe/mem_protect.c | 48 +++++++++++++++++++++++---- 1 file changed, 41 insertions(+), 7 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index ea87f5fd4cde..5023100f581d 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -265,7 +265,6 @@ static int reclaim_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, { kvm_pte_t pte = *ptep; struct hyp_page *page; - u64 *pa = arg; if (!kvm_pte_valid(pte)) return 0; @@ -277,8 +276,6 @@ static int reclaim_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, fallthrough; case PKVM_PAGE_SHARED_BORROWED: case PKVM_PAGE_SHARED_OWNED: - if (pa) - *pa = kvm_pte_to_phys(pte); page->flags |= HOST_PAGE_PENDING_RECLAIM; break; default: @@ -318,13 +315,44 @@ void reclaim_guest_pages(struct pkvm_hyp_vm *vm, struct kvm_hyp_memcache *mc) } } +struct relinquish_data { + enum pkvm_page_state expected_state; + u64 pa; +}; + +static int relinquish_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, + enum kvm_pgtable_walk_flags flag, void * const arg) +{ + kvm_pte_t pte = *ptep; + struct hyp_page *page; + struct relinquish_data *data = arg; + enum pkvm_page_state state; + + if (!kvm_pte_valid(pte)) + return 0; + + state = pkvm_getstate(kvm_pgtable_stage2_pte_prot(pte)); + if (state != data->expected_state) + return -EPERM; + + page = hyp_phys_to_page(kvm_pte_to_phys(pte)); + if (state == PKVM_PAGE_OWNED) + page->flags |= HOST_PAGE_NEED_POISONING; + page->flags |= HOST_PAGE_PENDING_RECLAIM; + + data->pa = kvm_pte_to_phys(pte); + + return 0; +} + int __pkvm_guest_relinquish_to_host(struct pkvm_hyp_vcpu *vcpu, u64 ipa, u64 *ppa) { + struct relinquish_data data; struct kvm_pgtable_walker walker = { - .cb = reclaim_walker, - .arg = ppa, - .flags = KVM_PGTABLE_WALK_LEAF + .cb = relinquish_walker, + .flags = KVM_PGTABLE_WALK_LEAF, + .arg = &data, }; struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); int ret; @@ -332,8 +360,13 @@ int __pkvm_guest_relinquish_to_host(struct pkvm_hyp_vcpu *vcpu, host_lock_component(); guest_lock_component(vm); + /* Expected page state depends on VM type. */ + data.expected_state = pkvm_hyp_vcpu_is_protected(vcpu) ? + PKVM_PAGE_OWNED : + PKVM_PAGE_SHARED_BORROWED; + /* Set default pa value to "not found". */ - *ppa = 0; + data.pa = 0; /* If ipa is mapped: sets page flags, and gets the pa. */ ret = kvm_pgtable_walk(&vm->pgt, ipa, PAGE_SIZE, &walker); @@ -345,6 +378,7 @@ int __pkvm_guest_relinquish_to_host(struct pkvm_hyp_vcpu *vcpu, guest_unlock_component(vm); host_unlock_component(); + *ppa = data.pa; return ret; } From 4a477ac0a9620c0cde1f4206f3ab8f0f02c72cf8 Mon Sep 17 00:00:00 2001 From: Keir Fraser Date: Wed, 16 Nov 2022 10:53:00 +0000 Subject: [PATCH 130/457] ANDROID: KVM: arm64: Avoid unnecessary unmap walk in MEM_RELINQUISH hypercall If the mapping is determined to be not present in an earlier walk, attempting the unmap is pointless. 
Bug: 259217067 Change-Id: I6fd939556b80d7a9a0731cab36166a652f7a7c6d Signed-off-by: Keir Fraser Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/nvhe/mem_protect.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 5023100f581d..0db8ebd60ba1 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -372,7 +372,7 @@ int __pkvm_guest_relinquish_to_host(struct pkvm_hyp_vcpu *vcpu, ret = kvm_pgtable_walk(&vm->pgt, ipa, PAGE_SIZE, &walker); /* Zap the guest stage2 pte. */ - if (!ret) + if (!ret && data.pa) kvm_pgtable_stage2_unmap(&vm->pgt, ipa, PAGE_SIZE); guest_unlock_component(vm); From bf76ea0a21ea967c40cea17703daf82d01dd831e Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Wed, 26 Oct 2022 17:44:14 +0100 Subject: [PATCH 131/457] ANDROID: KVM: arm64: Flush nVHE hyp_vcpu memcache When using nVHE in protected mode, the host donates pages through an arch specific memcache the hyp can then pours in its local vcpu copy. The latter should be flushed on VM teardown. Bug: 237506543 Change-Id: Ic37d794ac33e9f844fa6ae1b4943febcdad5b033 Signed-off-by: Vincent Donnefort Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/nvhe/pkvm.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 18fc2d0a6e70..b684671d1f7d 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -830,6 +830,15 @@ int __pkvm_teardown_vm(pkvm_handle_t handle) for (idx = 0; idx < hyp_vm->nr_vcpus; ++idx) { struct pkvm_hyp_vcpu *hyp_vcpu = hyp_vm->vcpus[idx]; + struct kvm_hyp_memcache *vcpu_mc; + void *addr; + + vcpu_mc = &hyp_vcpu->vcpu.arch.pkvm_memcache; + while (vcpu_mc->nr_pages) { + addr = pop_hyp_memcache(vcpu_mc, hyp_phys_to_virt); + push_hyp_memcache(mc, addr, hyp_virt_to_phys); + unmap_donated_memory_noclear(addr, PAGE_SIZE); + } teardown_donated_memory(mc, hyp_vcpu, sizeof(*hyp_vcpu)); } From e210ad7cd99fd6c2a54c1255d53bb767436f8f98 Mon Sep 17 00:00:00 2001 From: Keir Fraser Date: Thu, 17 Nov 2022 15:22:58 +0000 Subject: [PATCH 132/457] ANDROID: KVM: arm64: Introduce kvm_has_memrelinquish_services Used to determine whether memrelinquish services have been initialised. 
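(Usage sketch, assuming a driver probe path; the virtio_balloon change later in this series is the real user of this helper.)

  /* Bail out if the hypervisor never advertised MEM_RELINQUISH,
   * i.e. the share granule size was left at zero. */
  if (!kvm_has_memrelinquish_services())
          return -EINVAL;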
Bug: 240239989 Change-Id: I81dd23d8122ea54924d52b3fdc1fc4a8cdb28ea5 Signed-off-by: Keir Fraser Signed-off-by: Quentin Perret --- arch/arm64/include/asm/mem_relinquish.h | 1 + arch/arm64/mm/mem_relinquish.c | 6 ++++++ include/linux/mem_relinquish.h | 1 + 3 files changed, 8 insertions(+) diff --git a/arch/arm64/include/asm/mem_relinquish.h b/arch/arm64/include/asm/mem_relinquish.h index 091de6767440..ac51786a4a11 100644 --- a/arch/arm64/include/asm/mem_relinquish.h +++ b/arch/arm64/include/asm/mem_relinquish.h @@ -9,6 +9,7 @@ struct page; +bool kvm_has_memrelinquish_services(void); void page_relinquish(struct page *page); #endif /* __ASM_MEM_RELINQUISH_H */ diff --git a/arch/arm64/mm/mem_relinquish.c b/arch/arm64/mm/mem_relinquish.c index feea3adc2acc..7948098288e3 100644 --- a/arch/arm64/mm/mem_relinquish.c +++ b/arch/arm64/mm/mem_relinquish.c @@ -36,6 +36,12 @@ void kvm_init_memrelinquish_services(void) memshare_granule_sz = res.a0; } +bool kvm_has_memrelinquish_services(void) +{ + return !!memshare_granule_sz; +} +EXPORT_SYMBOL_GPL(kvm_has_memrelinquish_services); + void page_relinquish(struct page *page) { phys_addr_t phys, end; diff --git a/include/linux/mem_relinquish.h b/include/linux/mem_relinquish.h index d3213cb77fc5..b906c4196d11 100644 --- a/include/linux/mem_relinquish.h +++ b/include/linux/mem_relinquish.h @@ -13,6 +13,7 @@ #else /* !CONFIG_MEMORY_RELINQUISH */ +static inline bool kvm_has_memrelinquish_services(void) { return false; } static inline void page_relinquish(struct page *page) { } #endif /* CONFIG_MEMORY_RELINQUISH */ From a12648bd1e6da9bcdf3b6ac978c3fcfeaf3fca36 Mon Sep 17 00:00:00 2001 From: Keir Fraser Date: Thu, 17 Nov 2022 15:24:40 +0000 Subject: [PATCH 133/457] ANDROID: virtio_balloon: New module parameter "pkvm" This specifies that the driver is running on a PKVM hypervisor and must use the memrelinquish service to cooperatively release memory. If this service is unavailable, virtio_balloon cannot be used. Bug: 240239989 Change-Id: I8800c4435d8fae9df6f1ab108cc61c8f93020773 Signed-off-by: Keir Fraser Signed-off-by: Quentin Perret --- drivers/virtio/virtio_balloon.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index 4c85349024c1..18a6d8d78fe7 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -18,6 +18,11 @@ #include #include #include +#include + +static bool pkvm; +module_param(pkvm, bool, 0); +MODULE_PARM_DESC(pkvm, "Running on PKVM. Must use MEM_RELINQUISH."); /* * Balloon device works in 4K page units. So each page is pointed to by @@ -868,6 +873,12 @@ static int virtballoon_probe(struct virtio_device *vdev) struct virtio_balloon *vb; int err; + if (pkvm && !kvm_has_memrelinquish_services()) { + dev_err(&vdev->dev, "%s failure: pkvm but no memrelinquish\n", + __func__); + return -EINVAL; + } + if (!vdev->config->get) { dev_err(&vdev->dev, "%s failure: config access disabled\n", __func__); From 6fe4c366af6240671c071f63a25a9cf0f9cc230d Mon Sep 17 00:00:00 2001 From: Keir Fraser Date: Thu, 17 Nov 2022 14:59:38 +0000 Subject: [PATCH 134/457] ANDROID: virtio_balloon: Do not clear VIRTIO_F_ACCESS_PLATFORM This essentially reverts commit e41b1355508debe45fda33 "virtio_balloon: disable VIOMMU support". Although the virtio_balloon driver does not translate through a VIOMMU (or bounce buffer) the pages that it sends to the device, it *does* need to perform these translations on the virtio rings themselves. 
This fixes virtio_balloon initialisation inside a PKVM/ARM64 protected virtual machine. Bug: 240239989 Change-Id: I2a84eec870fd638223b231e5c4d1c27216dc40a2 Signed-off-by: Keir Fraser Signed-off-by: Quentin Perret --- drivers/virtio/virtio_balloon.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index 18a6d8d78fe7..eb27c82bf0dc 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -1108,7 +1108,6 @@ static int virtballoon_validate(struct virtio_device *vdev) else if (!virtio_has_feature(vdev, VIRTIO_BALLOON_F_PAGE_POISON)) __virtio_clear_bit(vdev, VIRTIO_BALLOON_F_REPORTING); - __virtio_clear_bit(vdev, VIRTIO_F_ACCESS_PLATFORM); return 0; } From 8f9c7f7acbabfb1367c4ada33b87899db9311a91 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 15 Nov 2022 14:24:02 +0000 Subject: [PATCH 135/457] ANDROID: KVM: arm64: Don't check for hyp_fixmap_map() returning NULL hyp_fixmap_map() never returns NULL, so remove the redundant checks for it and simplify the error handling in the callers. Signed-off-by: Will Deacon Signed-off-by: Will Deacon Bug: 254819795 Change-Id: Ie73a97cc3d9bded3750abe6e243003827393ee5e Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/nvhe/mem_protect.c | 14 +++----------- arch/arm64/kvm/hyp/nvhe/pkvm.c | 10 ++-------- 2 files changed, 5 insertions(+), 19 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 0db8ebd60ba1..298b2816b620 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -1796,16 +1796,11 @@ int __pkvm_host_donate_guest(u64 pfn, u64 gfn, struct pkvm_hyp_vcpu *vcpu) return ret; } -static int hyp_zero_page(phys_addr_t phys) +static void hyp_zero_page(phys_addr_t phys) { - void *addr; - - addr = hyp_fixmap_map(phys); - if (!addr) - return -EINVAL; + void *addr = hyp_fixmap_map(phys); memset(addr, 0, PAGE_SIZE); - /* * Prefer kvm_flush_dcache_to_poc() over __clean_dcache_guest_page() * here as the latter may elide the CMO under the assumption that FWB @@ -1815,7 +1810,6 @@ static int hyp_zero_page(phys_addr_t phys) */ kvm_flush_dcache_to_poc(addr, PAGE_SIZE); hyp_fixmap_unmap(); - return 0; } int __pkvm_host_reclaim_page(u64 pfn) @@ -1841,9 +1835,7 @@ int __pkvm_host_reclaim_page(u64 pfn) } if (page->flags & HOST_PAGE_NEED_POISONING) { - ret = hyp_zero_page(addr); - if (ret) - goto unlock; + hyp_zero_page(addr); page->flags &= ~HOST_PAGE_NEED_POISONING; } diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index b684671d1f7d..3ccfdf23e0dd 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -872,20 +872,14 @@ int pkvm_load_pvmfw_pages(struct pkvm_hyp_vm *vm, u64 ipa, phys_addr_t phys, npages = size >> PAGE_SHIFT; while (npages--) { - void *dst; - - dst = hyp_fixmap_map(phys); - if (!dst) - return -EINVAL; - /* * No need for cache maintenance here, as the pgtable code will * take care of this when installing the pte in the guest's * stage-2 page table. 
*/ - memcpy(dst, src, PAGE_SIZE); - + memcpy(hyp_fixmap_map(phys), src, PAGE_SIZE); hyp_fixmap_unmap(); + src += PAGE_SIZE; phys += PAGE_SIZE; } From 8b744c24f01c19b966f4b2400803239b2f7505e7 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 17 Nov 2022 11:34:59 +0000 Subject: [PATCH 136/457] ANDROID: KVM: arm64: Rename hyp_zero_page() and make available as helper hyp_zero_page() is used for poisoning memory, so rename it to hyp_poison_page() to avoid confusing with the concept of a "zero page" and make it available outside of mem_protect.c as it will be used to poison the pvmfw memory in a subsequent patch. Signed-off-by: Will Deacon Bug: 254819795 Change-Id: Ia4aec46437db3ffe466ae09bd180392fa06c0b46 Signed-off-by: Will Deacon Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/include/nvhe/mm.h | 1 + arch/arm64/kvm/hyp/nvhe/mem_protect.c | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/hyp/include/nvhe/mm.h b/arch/arm64/kvm/hyp/include/nvhe/mm.h index d5ec972b5c1e..164a0986dc52 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mm.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mm.h @@ -16,6 +16,7 @@ extern hyp_spinlock_t pkvm_pgd_lock; int hyp_create_pcpu_fixmap(void); void *hyp_fixmap_map(phys_addr_t phys); void hyp_fixmap_unmap(void); +void hyp_poison_page(phys_addr_t phys); int hyp_create_idmap(u32 hyp_va_bits); int hyp_map_vectors(void); diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 298b2816b620..ba7f7564adb8 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -1796,7 +1796,7 @@ int __pkvm_host_donate_guest(u64 pfn, u64 gfn, struct pkvm_hyp_vcpu *vcpu) return ret; } -static void hyp_zero_page(phys_addr_t phys) +void hyp_poison_page(phys_addr_t phys) { void *addr = hyp_fixmap_map(phys); @@ -1835,7 +1835,7 @@ int __pkvm_host_reclaim_page(u64 pfn) } if (page->flags & HOST_PAGE_NEED_POISONING) { - hyp_zero_page(addr); + hyp_poison_page(addr); page->flags &= ~HOST_PAGE_NEED_POISONING; } From a542a1202a036398d9f54847d2cd2fb0a6247a23 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 17 Nov 2022 11:40:01 +0000 Subject: [PATCH 137/457] ANDROID: KVM: arm64: Rename pkvm_clear_pvmfw_pages() pkvm_clear_pvmfw_pages() is used to poison the pvmfw pages during reset, so rename it to pkvm_poison_pvmfw_pages() instead. 
Signed-off-by: Will Deacon Bug: 254819795 Change-Id: Ie5b9c90f0707fa81d9099425cff35383bfb0d009 Signed-off-by: Will Deacon Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/include/nvhe/pkvm.h | 2 +- arch/arm64/kvm/hyp/nvhe/pkvm.c | 2 +- arch/arm64/kvm/hyp/nvhe/psci-relay.c | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h index c880d6605453..65861da8b460 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h +++ b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h @@ -133,6 +133,6 @@ static inline bool pkvm_ipa_in_pvmfw_region(struct pkvm_hyp_vm *vm, u64 ipa) int pkvm_load_pvmfw_pages(struct pkvm_hyp_vm *vm, u64 ipa, phys_addr_t phys, u64 size); -void pkvm_clear_pvmfw_pages(void); +void pkvm_poison_pvmfw_pages(void); #endif /* __ARM64_KVM_NVHE_PKVM_H__ */ diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 3ccfdf23e0dd..3236ed39e70f 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -887,7 +887,7 @@ int pkvm_load_pvmfw_pages(struct pkvm_hyp_vm *vm, u64 ipa, phys_addr_t phys, return 0; } -void pkvm_clear_pvmfw_pages(void) +void pkvm_poison_pvmfw_pages(void) { void *addr = hyp_phys_to_virt(pvmfw_base); diff --git a/arch/arm64/kvm/hyp/nvhe/psci-relay.c b/arch/arm64/kvm/hyp/nvhe/psci-relay.c index cee6d4a2821f..0bb1b66977b0 100644 --- a/arch/arm64/kvm/hyp/nvhe/psci-relay.c +++ b/arch/arm64/kvm/hyp/nvhe/psci-relay.c @@ -250,7 +250,7 @@ static unsigned long psci_0_2_handler(u64 func_id, struct kvm_cpu_context *host_ */ case PSCI_0_2_FN_SYSTEM_OFF: case PSCI_0_2_FN_SYSTEM_RESET: - pkvm_clear_pvmfw_pages(); + pkvm_poison_pvmfw_pages(); return psci_forward(host_ctxt); case PSCI_0_2_FN64_CPU_SUSPEND: return psci_cpu_suspend(func_id, host_ctxt); @@ -265,7 +265,7 @@ static unsigned long psci_1_0_handler(u64 func_id, struct kvm_cpu_context *host_ { switch (func_id) { case PSCI_1_1_FN64_SYSTEM_RESET2: - pkvm_clear_pvmfw_pages(); + pkvm_poison_pvmfw_pages(); fallthrough; case PSCI_1_0_FN_PSCI_FEATURES: case PSCI_1_0_FN_SET_SUSPEND_MODE: From 677980a696cf16bc7da1310d7749f8196bd44441 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 15 Nov 2022 14:24:02 +0000 Subject: [PATCH 138/457] ANDROID: KVM: arm64: Use fixmap when poisoning pvmfw pages When poisoning the pvmfw pages during system reset at EL2, ensure that we use a writable fixmap mapping rather than the persistent read-only mapping of the region. 
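(For reference, a sketch of the fixmap pattern this change relies on; hyp_poison_page() from the earlier patch already encapsulates it.)

  void *va = hyp_fixmap_map(phys);        /* transient, writable EL2 mapping */

  memset(va, 0, PAGE_SIZE);
  kvm_flush_dcache_to_poc(va, PAGE_SIZE); /* don't rely on FWB eliding CMOs */
  hyp_fixmap_unmap();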
Signed-off-by: Will Deacon Bug: 254819795 Change-Id: I4c8be092d3c822695afd7d03d0d64163664a9f64 Signed-off-by: Will Deacon Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/nvhe/pkvm.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 3236ed39e70f..26db4f6e0a14 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -889,10 +889,13 @@ int pkvm_load_pvmfw_pages(struct pkvm_hyp_vm *vm, u64 ipa, phys_addr_t phys, void pkvm_poison_pvmfw_pages(void) { - void *addr = hyp_phys_to_virt(pvmfw_base); + u64 npages = pvmfw_size >> PAGE_SHIFT; + phys_addr_t addr = pvmfw_base; - memset(addr, 0, pvmfw_size); - kvm_flush_dcache_to_poc(addr, pvmfw_size); + while (npages--) { + hyp_poison_page(addr); + addr += PAGE_SIZE; + } } /* From 22c8a338c7900455180e80d4dcda94a33c01fe0c Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 17 Nov 2022 15:32:08 +0000 Subject: [PATCH 139/457] ANDROID: KVM: arm64: Check IPA range for pvmfw during guest donation When donating pages to the guest, we only check the first IPA in the range against the pvmfw loading range. Although this is fine for the page-at-a-time faulting path, it doesn't fit with the rest of the mem protection logic, which deals with the possibility of an arbitrarily sized contiguous address range. Rework the logic so that we check the whole IPA range during guest donation and trigger the pvmfw loading path if any of the pages intersect with the pvmfw region. Signed-off-by: Will Deacon Bug: 254819795 Change-Id: I6fef9f1898e65a95cab7f6a0ffa8aa422a8d5a91 Signed-off-by: Will Deacon Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/include/nvhe/pkvm.h | 6 ++++-- arch/arm64/kvm/hyp/nvhe/mem_protect.c | 4 ++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h index 65861da8b460..644dce2f9256 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h +++ b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h @@ -121,14 +121,16 @@ static inline bool pkvm_hyp_vm_has_pvmfw(struct pkvm_hyp_vm *vm) return vm->kvm.arch.pkvm.pvmfw_load_addr != PVMFW_INVALID_LOAD_ADDR; } -static inline bool pkvm_ipa_in_pvmfw_region(struct pkvm_hyp_vm *vm, u64 ipa) +static inline bool pkvm_ipa_range_has_pvmfw(struct pkvm_hyp_vm *vm, + u64 ipa_start, u64 ipa_end) { struct kvm_protected_vm *pkvm = &vm->kvm.arch.pkvm; + u64 pvmfw_load_end = pkvm->pvmfw_load_addr + pvmfw_size; if (!pkvm_hyp_vm_has_pvmfw(vm)) return false; - return ipa - pkvm->pvmfw_load_addr < pvmfw_size; + return ipa_end > pkvm->pvmfw_load_addr && ipa_start < pvmfw_load_end; } int pkvm_load_pvmfw_pages(struct pkvm_hyp_vm *vm, u64 ipa, phys_addr_t phys, diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index ba7f7564adb8..abed3fd9e0a2 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -1078,11 +1078,11 @@ static int guest_complete_donation(u64 addr, const struct pkvm_mem_transition *t u64 size = tx->nr_pages * PAGE_SIZE; int err; - if (tx->initiator.id == PKVM_ID_HOST && - pkvm_ipa_in_pvmfw_region(vm, addr)) { + if (pkvm_ipa_range_has_pvmfw(vm, addr, addr + size)) { if (WARN_ON(!pkvm_hyp_vcpu_is_protected(vcpu))) return -EPERM; + WARN_ON(tx->initiator.id != PKVM_ID_HOST); err = pkvm_load_pvmfw_pages(vm, addr, phys, size); if (err) return err; From ffe9d28274d5a38e97ba0aae32f250c76e0fd381 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 25 
Mar 2022 15:01:38 +0000 Subject: [PATCH 140/457] ANDROID: KVM: arm64: Use PSCI MEM_PROTECT to zap guest pages on reset If a malicious/compromised host issues a PSCI SYSTEM_RESET call in the presence of guest-owned pages then the contents of those pages may be susceptible to cold-reboot attacks. Use the PSCI MEM_PROTECT call to ensure that volatile memory is wiped by the firmware if a SYSTEM_RESET occurs while unpoisoned guest pages exist in the system. Since this call does not offer protection for a "warm" reset initiated by SYSTEM_RESET2, detect this case in the PSCI relay and repaint the call to a standard SYSTEM_RESET instead. Signed-off-by: Will Deacon Bug: 254821051 Change-Id: I5c3dd93bc83ebcd0b6cea2ec734f6e3a77f0064e Signed-off-by: Will Deacon Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/include/nvhe/mem_protect.h | 3 ++ arch/arm64/kvm/hyp/nvhe/mem_protect.c | 34 ++++++++++++-- arch/arm64/kvm/hyp/nvhe/psci-relay.c | 44 +++++++++++++++++++ include/uapi/linux/psci.h | 2 + 4 files changed, 79 insertions(+), 4 deletions(-) diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h index b8e47a47c1e6..7d0b49800bd8 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h @@ -91,6 +91,9 @@ void reclaim_guest_pages(struct pkvm_hyp_vm *vm, struct kvm_hyp_memcache *mc); int refill_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages, struct kvm_hyp_memcache *host_mc); +void psci_mem_protect_inc(u64 n); +void psci_mem_protect_dec(u64 n); + static __always_inline void __load_host_stage2(void) { if (static_branch_likely(&kvm_protected_mode_initialized)) diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index abed3fd9e0a2..cdf965814724 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -883,8 +883,16 @@ static int host_complete_share(u64 addr, const struct pkvm_mem_transition *tx, enum kvm_pgtable_prot perms) { u64 size = tx->nr_pages * PAGE_SIZE; + int err; - return __host_set_page_state_range(addr, size, PKVM_PAGE_SHARED_BORROWED); + err = __host_set_page_state_range(addr, size, PKVM_PAGE_SHARED_BORROWED); + if (err) + return err; + + if (tx->initiator.id == PKVM_ID_GUEST) + psci_mem_protect_dec(tx->nr_pages); + + return 0; } static int host_complete_unshare(u64 addr, const struct pkvm_mem_transition *tx) @@ -892,6 +900,9 @@ static int host_complete_unshare(u64 addr, const struct pkvm_mem_transition *tx) u8 owner_id = tx->initiator.id; u64 size = tx->nr_pages * PAGE_SIZE; + if (tx->initiator.id == PKVM_ID_GUEST) + psci_mem_protect_inc(tx->nr_pages); + return host_stage2_set_owner_locked(addr, size, owner_id); } @@ -1078,18 +1089,32 @@ static int guest_complete_donation(u64 addr, const struct pkvm_mem_transition *t u64 size = tx->nr_pages * PAGE_SIZE; int err; + if (tx->initiator.id == PKVM_ID_HOST) + psci_mem_protect_inc(tx->nr_pages); + if (pkvm_ipa_range_has_pvmfw(vm, addr, addr + size)) { - if (WARN_ON(!pkvm_hyp_vcpu_is_protected(vcpu))) - return -EPERM; + if (WARN_ON(!pkvm_hyp_vcpu_is_protected(vcpu))) { + err = -EPERM; + goto err_undo_psci; + } WARN_ON(tx->initiator.id != PKVM_ID_HOST); err = pkvm_load_pvmfw_pages(vm, addr, phys, size); if (err) - return err; + goto err_undo_psci; } + /* + * If this fails, we effectively leak the pages since they're now + * owned by the guest but not mapped into its stage-2 page-table. 
+ */ return kvm_pgtable_stage2_map(&vm->pgt, addr, size, phys, prot, &vcpu->vcpu.arch.pkvm_memcache); + +err_undo_psci: + if (tx->initiator.id == PKVM_ID_HOST) + psci_mem_protect_dec(tx->nr_pages); + return err; } static int __guest_get_completer_addr(u64 *completer_addr, phys_addr_t phys, @@ -1837,6 +1862,7 @@ int __pkvm_host_reclaim_page(u64 pfn) if (page->flags & HOST_PAGE_NEED_POISONING) { hyp_poison_page(addr); page->flags &= ~HOST_PAGE_NEED_POISONING; + psci_mem_protect_dec(1); } ret = host_stage2_set_owner_locked(addr, PAGE_SIZE, PKVM_ID_HOST); diff --git a/arch/arm64/kvm/hyp/nvhe/psci-relay.c b/arch/arm64/kvm/hyp/nvhe/psci-relay.c index 0bb1b66977b0..4b26298e7927 100644 --- a/arch/arm64/kvm/hyp/nvhe/psci-relay.c +++ b/arch/arm64/kvm/hyp/nvhe/psci-relay.c @@ -11,6 +11,7 @@ #include #include +#include #include #include #include @@ -222,6 +223,44 @@ asmlinkage void __noreturn kvm_host_psci_cpu_entry(bool is_cpu_on) __host_enter(host_ctxt); } +static DEFINE_HYP_SPINLOCK(mem_protect_lock); + +static u64 psci_mem_protect(s64 offset) +{ + static u64 cnt; + u64 new = cnt + offset; + + hyp_assert_lock_held(&mem_protect_lock); + + if (!offset || kvm_host_psci_config.version < PSCI_VERSION(1, 1)) + return cnt; + + if (!cnt || !new) + psci_call(PSCI_1_1_FN64_MEM_PROTECT, offset < 0 ? 0 : 1, 0, 0); + + cnt = new; + return cnt; +} + +static bool psci_mem_protect_active(void) +{ + return psci_mem_protect(0); +} + +void psci_mem_protect_inc(u64 n) +{ + hyp_spin_lock(&mem_protect_lock); + psci_mem_protect(n); + hyp_spin_unlock(&mem_protect_lock); +} + +void psci_mem_protect_dec(u64 n) +{ + hyp_spin_lock(&mem_protect_lock); + psci_mem_protect(-n); + hyp_spin_unlock(&mem_protect_lock); +} + static unsigned long psci_0_1_handler(u64 func_id, struct kvm_cpu_context *host_ctxt) { if (is_psci_0_1(cpu_off, func_id) || is_psci_0_1(migrate, func_id)) @@ -251,6 +290,8 @@ static unsigned long psci_0_2_handler(u64 func_id, struct kvm_cpu_context *host_ case PSCI_0_2_FN_SYSTEM_OFF: case PSCI_0_2_FN_SYSTEM_RESET: pkvm_poison_pvmfw_pages(); + /* Avoid racing with a MEM_PROTECT call. */ + hyp_spin_lock(&mem_protect_lock); return psci_forward(host_ctxt); case PSCI_0_2_FN64_CPU_SUSPEND: return psci_cpu_suspend(func_id, host_ctxt); @@ -266,6 +307,9 @@ static unsigned long psci_1_0_handler(u64 func_id, struct kvm_cpu_context *host_ switch (func_id) { case PSCI_1_1_FN64_SYSTEM_RESET2: pkvm_poison_pvmfw_pages(); + hyp_spin_lock(&mem_protect_lock); + if (psci_mem_protect_active()) + cpu_reg(host_ctxt, 0) = PSCI_0_2_FN_SYSTEM_RESET; fallthrough; case PSCI_1_0_FN_PSCI_FEATURES: case PSCI_1_0_FN_SET_SUSPEND_MODE: diff --git a/include/uapi/linux/psci.h b/include/uapi/linux/psci.h index 3511095c2702..cec9c39d2f77 100644 --- a/include/uapi/linux/psci.h +++ b/include/uapi/linux/psci.h @@ -69,6 +69,8 @@ #define PSCI_1_1_FN64_SYSTEM_RESET2 PSCI_0_2_FN64(18) #define PSCI_1_1_FN64_MEM_PROTECT_CHECK_RANGE PSCI_0_2_FN64(19) +#define PSCI_1_1_FN64_MEM_PROTECT PSCI_0_2_FN64(19) + /* PSCI v0.2 power state encoding for CPU_SUSPEND function */ #define PSCI_0_2_POWER_STATE_ID_MASK 0xffff #define PSCI_0_2_POWER_STATE_ID_SHIFT 0 From 3c8afbbfa45b9bae1c95688e7e136c20cb601b8f Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Thu, 17 Nov 2022 17:22:34 +0000 Subject: [PATCH 141/457] ANDROID: KVM: arm64: Issue CMOs when tearing down shadow pages On the guest teardown path, pKVM will zero the pages used to back the guest shadow data structures before returning them to the host as they may contain secrets (e.g. in the vCPU registers). 
However, the zeroing is done using a cacheable alias, and CMOs are missing, hence giving the host a potential opportunity to read the original content of the shadow structs from memory. Fix this by issuing CMOs after zeroing the pages. [ qperret@: moved the CMOs to __unmap_donated_memory() to cover all callers, including the __pkvm_init_vm() error path ] Bug: 259551298 Change-Id: Id696d47d16e4c3fd870cb70b792eeb7f2282fc78 Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/nvhe/pkvm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 26db4f6e0a14..0aa6eba48a33 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -626,6 +626,7 @@ static void *map_donated_memory(unsigned long host_va, size_t size) static void __unmap_donated_memory(void *va, size_t size) { + kvm_flush_dcache_to_poc(va, size); WARN_ON(__pkvm_hyp_donate_host(hyp_virt_to_pfn(va), PAGE_ALIGN(size) >> PAGE_SHIFT)); } From 40e897b246a6528328fd5737bdf13efeba5110f1 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 16 Nov 2022 17:03:24 +0000 Subject: [PATCH 142/457] FROMLIST: firmware: arm_ffa: Move constants to header file FF-A function IDs and error codes will be needed in the hypervisor too, so move to them to the header file where they can be shared. Rename the version constants with an "FFA_" prefix so that they are less likely to clash with other code in the tree. Bug: 254811097 Co-developed-by: Andrew Walbran Change-Id: I00ed487279fdfb61ea34ae99140c6fac8ee89187 Signed-off-by: Andrew Walbran Signed-off-by: Will Deacon Signed-off-by: Quentin Perret Link: https://lore.kernel.org/r/20221116170335.2341003-2-qperret@google.com Signed-off-by: Quentin Perret --- drivers/firmware/arm_ffa/driver.c | 101 +++--------------------------- include/linux/arm_ffa.h | 83 ++++++++++++++++++++++++ 2 files changed, 93 insertions(+), 91 deletions(-) diff --git a/drivers/firmware/arm_ffa/driver.c b/drivers/firmware/arm_ffa/driver.c index d5e86ef40b89..fa85c64d3ded 100644 --- a/drivers/firmware/arm_ffa/driver.c +++ b/drivers/firmware/arm_ffa/driver.c @@ -36,81 +36,6 @@ #include "common.h" #define FFA_DRIVER_VERSION FFA_VERSION_1_0 - -#define FFA_SMC(calling_convention, func_num) \ - ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, (calling_convention), \ - ARM_SMCCC_OWNER_STANDARD, (func_num)) - -#define FFA_SMC_32(func_num) FFA_SMC(ARM_SMCCC_SMC_32, (func_num)) -#define FFA_SMC_64(func_num) FFA_SMC(ARM_SMCCC_SMC_64, (func_num)) - -#define FFA_ERROR FFA_SMC_32(0x60) -#define FFA_SUCCESS FFA_SMC_32(0x61) -#define FFA_INTERRUPT FFA_SMC_32(0x62) -#define FFA_VERSION FFA_SMC_32(0x63) -#define FFA_FEATURES FFA_SMC_32(0x64) -#define FFA_RX_RELEASE FFA_SMC_32(0x65) -#define FFA_RXTX_MAP FFA_SMC_32(0x66) -#define FFA_FN64_RXTX_MAP FFA_SMC_64(0x66) -#define FFA_RXTX_UNMAP FFA_SMC_32(0x67) -#define FFA_PARTITION_INFO_GET FFA_SMC_32(0x68) -#define FFA_ID_GET FFA_SMC_32(0x69) -#define FFA_MSG_POLL FFA_SMC_32(0x6A) -#define FFA_MSG_WAIT FFA_SMC_32(0x6B) -#define FFA_YIELD FFA_SMC_32(0x6C) -#define FFA_RUN FFA_SMC_32(0x6D) -#define FFA_MSG_SEND FFA_SMC_32(0x6E) -#define FFA_MSG_SEND_DIRECT_REQ FFA_SMC_32(0x6F) -#define FFA_FN64_MSG_SEND_DIRECT_REQ FFA_SMC_64(0x6F) -#define FFA_MSG_SEND_DIRECT_RESP FFA_SMC_32(0x70) -#define FFA_FN64_MSG_SEND_DIRECT_RESP FFA_SMC_64(0x70) -#define FFA_MEM_DONATE FFA_SMC_32(0x71) -#define FFA_FN64_MEM_DONATE FFA_SMC_64(0x71) -#define FFA_MEM_LEND FFA_SMC_32(0x72) -#define FFA_FN64_MEM_LEND FFA_SMC_64(0x72) -#define FFA_MEM_SHARE 
FFA_SMC_32(0x73) -#define FFA_FN64_MEM_SHARE FFA_SMC_64(0x73) -#define FFA_MEM_RETRIEVE_REQ FFA_SMC_32(0x74) -#define FFA_FN64_MEM_RETRIEVE_REQ FFA_SMC_64(0x74) -#define FFA_MEM_RETRIEVE_RESP FFA_SMC_32(0x75) -#define FFA_MEM_RELINQUISH FFA_SMC_32(0x76) -#define FFA_MEM_RECLAIM FFA_SMC_32(0x77) -#define FFA_MEM_OP_PAUSE FFA_SMC_32(0x78) -#define FFA_MEM_OP_RESUME FFA_SMC_32(0x79) -#define FFA_MEM_FRAG_RX FFA_SMC_32(0x7A) -#define FFA_MEM_FRAG_TX FFA_SMC_32(0x7B) -#define FFA_NORMAL_WORLD_RESUME FFA_SMC_32(0x7C) - -/* - * For some calls it is necessary to use SMC64 to pass or return 64-bit values. - * For such calls FFA_FN_NATIVE(name) will choose the appropriate - * (native-width) function ID. - */ -#ifdef CONFIG_64BIT -#define FFA_FN_NATIVE(name) FFA_FN64_##name -#else -#define FFA_FN_NATIVE(name) FFA_##name -#endif - -/* FFA error codes. */ -#define FFA_RET_SUCCESS (0) -#define FFA_RET_NOT_SUPPORTED (-1) -#define FFA_RET_INVALID_PARAMETERS (-2) -#define FFA_RET_NO_MEMORY (-3) -#define FFA_RET_BUSY (-4) -#define FFA_RET_INTERRUPTED (-5) -#define FFA_RET_DENIED (-6) -#define FFA_RET_RETRY (-7) -#define FFA_RET_ABORTED (-8) - -#define MAJOR_VERSION_MASK GENMASK(30, 16) -#define MINOR_VERSION_MASK GENMASK(15, 0) -#define MAJOR_VERSION(x) ((u16)(FIELD_GET(MAJOR_VERSION_MASK, (x)))) -#define MINOR_VERSION(x) ((u16)(FIELD_GET(MINOR_VERSION_MASK, (x)))) -#define PACK_VERSION_INFO(major, minor) \ - (FIELD_PREP(MAJOR_VERSION_MASK, (major)) | \ - FIELD_PREP(MINOR_VERSION_MASK, (minor))) -#define FFA_VERSION_1_0 PACK_VERSION_INFO(1, 0) #define FFA_MIN_VERSION FFA_VERSION_1_0 #define SENDER_ID_MASK GENMASK(31, 16) @@ -120,12 +45,6 @@ #define PACK_TARGET_INFO(s, r) \ (FIELD_PREP(SENDER_ID_MASK, (s)) | FIELD_PREP(RECEIVER_ID_MASK, (r))) -/* - * FF-A specification mentions explicitly about '4K pages'. This should - * not be confused with the kernel PAGE_SIZE, which is the translation - * granule kernel is configured and may be one among 4K, 16K and 64K. - */ -#define FFA_PAGE_SIZE SZ_4K /* * Keeping RX TX buffer size as 4K for now * 64K may be preferred to keep it min a page in 64K PAGE_SIZE config @@ -178,9 +97,9 @@ static struct ffa_drv_info *drv_info; */ static u32 ffa_compatible_version_find(u32 version) { - u16 major = MAJOR_VERSION(version), minor = MINOR_VERSION(version); - u16 drv_major = MAJOR_VERSION(FFA_DRIVER_VERSION); - u16 drv_minor = MINOR_VERSION(FFA_DRIVER_VERSION); + u16 major = FFA_MAJOR_VERSION(version), minor = FFA_MINOR_VERSION(version); + u16 drv_major = FFA_MAJOR_VERSION(FFA_DRIVER_VERSION); + u16 drv_minor = FFA_MINOR_VERSION(FFA_DRIVER_VERSION); if ((major < drv_major) || (major == drv_major && minor <= drv_minor)) return version; @@ -204,16 +123,16 @@ static int ffa_version_check(u32 *version) if (ver.a0 < FFA_MIN_VERSION) { pr_err("Incompatible v%d.%d! 
Earliest supported v%d.%d\n", - MAJOR_VERSION(ver.a0), MINOR_VERSION(ver.a0), - MAJOR_VERSION(FFA_MIN_VERSION), - MINOR_VERSION(FFA_MIN_VERSION)); + FFA_MAJOR_VERSION(ver.a0), FFA_MINOR_VERSION(ver.a0), + FFA_MAJOR_VERSION(FFA_MIN_VERSION), + FFA_MINOR_VERSION(FFA_MIN_VERSION)); return -EINVAL; } - pr_info("Driver version %d.%d\n", MAJOR_VERSION(FFA_DRIVER_VERSION), - MINOR_VERSION(FFA_DRIVER_VERSION)); - pr_info("Firmware version %d.%d found\n", MAJOR_VERSION(ver.a0), - MINOR_VERSION(ver.a0)); + pr_info("Driver version %d.%d\n", FFA_MAJOR_VERSION(FFA_DRIVER_VERSION), + FFA_MINOR_VERSION(FFA_DRIVER_VERSION)); + pr_info("Firmware version %d.%d found\n", FFA_MAJOR_VERSION(ver.a0), + FFA_MINOR_VERSION(ver.a0)); *version = ffa_compatible_version_find(ver.a0); return 0; diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h index 5f02d2e6b9d9..daff44d777fa 100644 --- a/include/linux/arm_ffa.h +++ b/include/linux/arm_ffa.h @@ -11,6 +11,89 @@ #include #include +#define FFA_SMC(calling_convention, func_num) \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, (calling_convention), \ + ARM_SMCCC_OWNER_STANDARD, (func_num)) + +#define FFA_SMC_32(func_num) FFA_SMC(ARM_SMCCC_SMC_32, (func_num)) +#define FFA_SMC_64(func_num) FFA_SMC(ARM_SMCCC_SMC_64, (func_num)) + +#define FFA_ERROR FFA_SMC_32(0x60) +#define FFA_SUCCESS FFA_SMC_32(0x61) +#define FFA_INTERRUPT FFA_SMC_32(0x62) +#define FFA_VERSION FFA_SMC_32(0x63) +#define FFA_FEATURES FFA_SMC_32(0x64) +#define FFA_RX_RELEASE FFA_SMC_32(0x65) +#define FFA_RXTX_MAP FFA_SMC_32(0x66) +#define FFA_FN64_RXTX_MAP FFA_SMC_64(0x66) +#define FFA_RXTX_UNMAP FFA_SMC_32(0x67) +#define FFA_PARTITION_INFO_GET FFA_SMC_32(0x68) +#define FFA_ID_GET FFA_SMC_32(0x69) +#define FFA_MSG_POLL FFA_SMC_32(0x6A) +#define FFA_MSG_WAIT FFA_SMC_32(0x6B) +#define FFA_YIELD FFA_SMC_32(0x6C) +#define FFA_RUN FFA_SMC_32(0x6D) +#define FFA_MSG_SEND FFA_SMC_32(0x6E) +#define FFA_MSG_SEND_DIRECT_REQ FFA_SMC_32(0x6F) +#define FFA_FN64_MSG_SEND_DIRECT_REQ FFA_SMC_64(0x6F) +#define FFA_MSG_SEND_DIRECT_RESP FFA_SMC_32(0x70) +#define FFA_FN64_MSG_SEND_DIRECT_RESP FFA_SMC_64(0x70) +#define FFA_MEM_DONATE FFA_SMC_32(0x71) +#define FFA_FN64_MEM_DONATE FFA_SMC_64(0x71) +#define FFA_MEM_LEND FFA_SMC_32(0x72) +#define FFA_FN64_MEM_LEND FFA_SMC_64(0x72) +#define FFA_MEM_SHARE FFA_SMC_32(0x73) +#define FFA_FN64_MEM_SHARE FFA_SMC_64(0x73) +#define FFA_MEM_RETRIEVE_REQ FFA_SMC_32(0x74) +#define FFA_FN64_MEM_RETRIEVE_REQ FFA_SMC_64(0x74) +#define FFA_MEM_RETRIEVE_RESP FFA_SMC_32(0x75) +#define FFA_MEM_RELINQUISH FFA_SMC_32(0x76) +#define FFA_MEM_RECLAIM FFA_SMC_32(0x77) +#define FFA_MEM_OP_PAUSE FFA_SMC_32(0x78) +#define FFA_MEM_OP_RESUME FFA_SMC_32(0x79) +#define FFA_MEM_FRAG_RX FFA_SMC_32(0x7A) +#define FFA_MEM_FRAG_TX FFA_SMC_32(0x7B) +#define FFA_NORMAL_WORLD_RESUME FFA_SMC_32(0x7C) + +/* + * For some calls it is necessary to use SMC64 to pass or return 64-bit values. + * For such calls FFA_FN_NATIVE(name) will choose the appropriate + * (native-width) function ID. + */ +#ifdef CONFIG_64BIT +#define FFA_FN_NATIVE(name) FFA_FN64_##name +#else +#define FFA_FN_NATIVE(name) FFA_##name +#endif + +/* FFA error codes. 
*/ +#define FFA_RET_SUCCESS (0) +#define FFA_RET_NOT_SUPPORTED (-1) +#define FFA_RET_INVALID_PARAMETERS (-2) +#define FFA_RET_NO_MEMORY (-3) +#define FFA_RET_BUSY (-4) +#define FFA_RET_INTERRUPTED (-5) +#define FFA_RET_DENIED (-6) +#define FFA_RET_RETRY (-7) +#define FFA_RET_ABORTED (-8) + +/* FFA version encoding */ +#define FFA_MAJOR_VERSION_MASK GENMASK(30, 16) +#define FFA_MINOR_VERSION_MASK GENMASK(15, 0) +#define FFA_MAJOR_VERSION(x) ((u16)(FIELD_GET(FFA_MAJOR_VERSION_MASK, (x)))) +#define FFA_MINOR_VERSION(x) ((u16)(FIELD_GET(FFA_MINOR_VERSION_MASK, (x)))) +#define FFA_PACK_VERSION_INFO(major, minor) \ + (FIELD_PREP(FFA_MAJOR_VERSION_MASK, (major)) | \ + FIELD_PREP(FFA_MINOR_VERSION_MASK, (minor))) +#define FFA_VERSION_1_0 FFA_PACK_VERSION_INFO(1, 0) + +/** + * FF-A specification mentions explicitly about '4K pages'. This should + * not be confused with the kernel PAGE_SIZE, which is the translation + * granule kernel is configured and may be one among 4K, 16K and 64K. + */ +#define FFA_PAGE_SIZE SZ_4K + /* FFA Bus/Device/Driver related */ struct ffa_device { int vm_id; From dba12d905917ce42ffa0c438d56482345ac30e0e Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 16 Nov 2022 17:03:25 +0000 Subject: [PATCH 143/457] FROMLIST: firmware: arm_ffa: Move comment before the field it is documenting This is consistent with the other comments in the struct. Bug: 254811097 Co-developed-by: Andrew Walbran Change-Id: I10e9014a0d505fe5e132fb1cd6105b95a3f5f2bf Signed-off-by: Andrew Walbran Signed-off-by: Will Deacon Signed-off-by: Quentin Perret Link: https://lore.kernel.org/r/20221116170335.2341003-3-qperret@google.com Signed-off-by: Quentin Perret --- include/linux/arm_ffa.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h index daff44d777fa..c87aeecaa9b2 100644 --- a/include/linux/arm_ffa.h +++ b/include/linux/arm_ffa.h @@ -244,11 +244,11 @@ struct ffa_mem_region_attributes { */ #define FFA_MEM_RETRIEVE_SELF_BORROWER BIT(0) u8 flag; - u32 composite_off; /* * Offset in bytes from the start of the outer `ffa_memory_region` to * an `struct ffa_mem_region_addr_range`. */ + u32 composite_off; u64 reserved; }; From 1e8c7d72162e2586d82a31020c547f8089228a6e Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 16 Nov 2022 17:03:26 +0000 Subject: [PATCH 144/457] FROMLIST: KVM: arm64: Block unsafe FF-A calls from the host When KVM is initialised in protected mode, we must take care to filter certain FFA calls from the host kernel so that the integrity of guest and hypervisor memory is maintained and is not made available to the secure world. As a first step, intercept and block all memory-related FF-A SMC calls from the host to EL3. This puts the framework in place for handling them properly. 
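[ Editor's note: the sketch below is illustrative only and is not part of this patch. It shows what a filtered call looks like from the host's side once the proxy is in place, using the constants moved into include/linux/arm_ffa.h earlier in this series; the function name is hypothetical. A blocked ID such as FFA_MEM_RELINQUISH is answered at EL2 with FFA_ERROR/FFA_RET_NOT_SUPPORTED and never reaches the SPMD at EL3. ]

static int host_probe_blocked_ffa_call(void)
{
	struct arm_smccc_res res;

	/* The SMC is trapped by the hypervisor's FF-A handler... */
	arm_smccc_1_1_smc(FFA_MEM_RELINQUISH, 0, 0, 0, 0, 0, 0, 0, &res);

	/* ...which rejects it instead of forwarding it to EL3. */
	if (res.a0 == FFA_ERROR)
		return (int)res.a2;	/* FFA_RET_NOT_SUPPORTED */

	return 0;
}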
Bug: 254811097 Co-developed-by: Andrew Walbran Change-Id: I5279bce56956c590862a68e8c4803dd2205e3f81 Signed-off-by: Andrew Walbran Signed-off-by: Will Deacon Signed-off-by: Quentin Perret Link: https://lore.kernel.org/r/20221116170335.2341003-4-qperret@google.com Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/include/nvhe/ffa.h | 16 ++++ arch/arm64/kvm/hyp/nvhe/Makefile | 2 +- arch/arm64/kvm/hyp/nvhe/ffa.c | 113 ++++++++++++++++++++++++++ arch/arm64/kvm/hyp/nvhe/hyp-main.c | 3 + 4 files changed, 133 insertions(+), 1 deletion(-) create mode 100644 arch/arm64/kvm/hyp/include/nvhe/ffa.h create mode 100644 arch/arm64/kvm/hyp/nvhe/ffa.c diff --git a/arch/arm64/kvm/hyp/include/nvhe/ffa.h b/arch/arm64/kvm/hyp/include/nvhe/ffa.h new file mode 100644 index 000000000000..fc09ec671e24 --- /dev/null +++ b/arch/arm64/kvm/hyp/include/nvhe/ffa.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2022 - Google LLC + * Author: Andrew Walbran + */ +#ifndef __KVM_HYP_FFA_H +#define __KVM_HYP_FFA_H + +#include + +#define FFA_MIN_FUNC_NUM 0x60 +#define FFA_MAX_FUNC_NUM 0x7F + +bool kvm_host_ffa_handler(struct kvm_cpu_context *host_ctxt); + +#endif /* __KVM_HYP_FFA_H */ diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile index be0a2bc3e20d..1b34d3ff57f3 100644 --- a/arch/arm64/kvm/hyp/nvhe/Makefile +++ b/arch/arm64/kvm/hyp/nvhe/Makefile @@ -22,7 +22,7 @@ lib-objs := $(addprefix ../../../lib/, $(lib-objs)) hyp-obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o \ hyp-main.o hyp-smp.o psci-relay.o early_alloc.o page_alloc.o \ - cache.o setup.o mm.o mem_protect.o sys_regs.o pkvm.o stacktrace.o + cache.o setup.o mm.o mem_protect.o sys_regs.o pkvm.o stacktrace.o ffa.o hyp-obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \ ../fpsimd.o ../hyp-entry.o ../exception.o ../pgtable.o hyp-obj-$(CONFIG_DEBUG_LIST) += list_debug.o diff --git a/arch/arm64/kvm/hyp/nvhe/ffa.c b/arch/arm64/kvm/hyp/nvhe/ffa.c new file mode 100644 index 000000000000..6ccf935d3b41 --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/ffa.c @@ -0,0 +1,113 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * FF-A v1.0 proxy to filter out invalid memory-sharing SMC calls issued by + * the host. FF-A is a slightly more palatable abbreviation of "Arm Firmware + * Framework for Arm A-profile", which is specified by Arm in document + * number DEN0077. + * + * Copyright (C) 2022 - Google LLC + * Author: Andrew Walbran + * + * This driver hooks into the SMC trapping logic for the host and intercepts + * all calls falling within the FF-A range. Each call is either: + * + * - Forwarded on unmodified to the SPMD at EL3 + * - Rejected as "unsupported" + * - Accompanied by a host stage-2 page-table check/update and reissued + * + * Consequently, any attempts by the host to make guest memory pages + * accessible to the secure world using FF-A will be detected either here + * (in the case that the memory is already owned by the guest) or during + * donation to the guest (in the case that the memory was previously shared + * with the secure world). + * + * To allow the rolling-back of page-table updates and FF-A calls in the + * event of failure, operations involving the RXTX buffers are locked for + * the duration and are therefore serialised. 
+ */ + +#include +#include +#include +#include + +static void ffa_to_smccc_error(struct arm_smccc_res *res, u64 ffa_errno) +{ + *res = (struct arm_smccc_res) { + .a0 = FFA_ERROR, + .a2 = ffa_errno, + }; +} + +static void ffa_set_retval(struct kvm_cpu_context *ctxt, + struct arm_smccc_res *res) +{ + cpu_reg(ctxt, 0) = res->a0; + cpu_reg(ctxt, 1) = res->a1; + cpu_reg(ctxt, 2) = res->a2; + cpu_reg(ctxt, 3) = res->a3; +} + +static bool is_ffa_call(u64 func_id) +{ + return ARM_SMCCC_IS_FAST_CALL(func_id) && + ARM_SMCCC_OWNER_NUM(func_id) == ARM_SMCCC_OWNER_STANDARD && + ARM_SMCCC_FUNC_NUM(func_id) >= FFA_MIN_FUNC_NUM && + ARM_SMCCC_FUNC_NUM(func_id) <= FFA_MAX_FUNC_NUM; +} + +static bool ffa_call_unsupported(u64 func_id) +{ + switch (func_id) { + /* Unsupported memory management calls */ + case FFA_FN64_MEM_RETRIEVE_REQ: + case FFA_MEM_RETRIEVE_RESP: + case FFA_MEM_RELINQUISH: + case FFA_MEM_OP_PAUSE: + case FFA_MEM_OP_RESUME: + case FFA_MEM_FRAG_RX: + case FFA_FN64_MEM_DONATE: + /* Indirect message passing via RX/TX buffers */ + case FFA_MSG_SEND: + case FFA_MSG_POLL: + case FFA_MSG_WAIT: + /* 32-bit variants of 64-bit calls */ + case FFA_MSG_SEND_DIRECT_REQ: + case FFA_MSG_SEND_DIRECT_RESP: + case FFA_RXTX_MAP: + case FFA_MEM_DONATE: + case FFA_MEM_RETRIEVE_REQ: + return true; + } + + return false; +} + +bool kvm_host_ffa_handler(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(u64, func_id, host_ctxt, 0); + struct arm_smccc_res res; + + if (!is_ffa_call(func_id)) + return false; + + switch (func_id) { + /* Memory management */ + case FFA_FN64_RXTX_MAP: + case FFA_RXTX_UNMAP: + case FFA_MEM_SHARE: + case FFA_FN64_MEM_SHARE: + case FFA_MEM_LEND: + case FFA_FN64_MEM_LEND: + case FFA_MEM_RECLAIM: + case FFA_MEM_FRAG_TX: + break; + } + + if (!ffa_call_unsupported(func_id)) + return false; /* Pass through */ + + ffa_to_smccc_error(&res, FFA_RET_NOT_SUPPORTED); + ffa_set_retval(host_ctxt, &res); + return true; +} diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index b4719b3db2f9..f10a7508921c 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -15,6 +15,7 @@ #include #include +#include #include #include #include @@ -1188,6 +1189,8 @@ static void handle_host_smc(struct kvm_cpu_context *host_ctxt) bool handled; handled = kvm_host_psci_handler(host_ctxt); + if (!handled) + handled = kvm_host_ffa_handler(host_ctxt); if (!handled) default_host_smc_handler(host_ctxt); From 376236b85886c63918de54c393b2eb986c854f1c Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 16 Nov 2022 17:03:27 +0000 Subject: [PATCH 145/457] BACKPORT: FROMLIST: KVM: arm64: Probe FF-A version and host/hyp partition ID during init Probe FF-A during pKVM initialisation so that we can detect any inconsistencies in the version or partition ID early on. 
[ qperret: BACKPORT due to trivial conflict with header includes in setup.c ] Bug: 254811097 Change-Id: I7def4c2c497017ba86621bc98298bc65ffdeefae Signed-off-by: Will Deacon Signed-off-by: Quentin Perret Link: https://lore.kernel.org/r/20221116170335.2341003-5-qperret@google.com Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_host.h | 1 + arch/arm64/kvm/arm.c | 1 + arch/arm64/kvm/hyp/include/nvhe/ffa.h | 1 + arch/arm64/kvm/hyp/nvhe/ffa.c | 30 +++++++++++++++++++++++++++ arch/arm64/kvm/hyp/nvhe/setup.c | 5 +++++ 5 files changed, 38 insertions(+) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 6ebb2de3b120..9409aa82ff6a 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -357,6 +357,7 @@ struct kvm_host_data { struct kvm_host_psci_config { /* PSCI version used by host. */ u32 version; + u32 smccc_version; /* Function IDs used by host if version is v0.1. */ struct psci_0_1_function_ids function_ids_0_1; diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 1f74e8fd3bd7..09f35ba6ed1d 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1903,6 +1903,7 @@ static bool init_psci_relay(void) } kvm_host_psci_config.version = psci_ops.get_version(); + kvm_host_psci_config.smccc_version = arm_smccc_get_version(); if (kvm_host_psci_config.version == PSCI_VERSION(0, 1)) { kvm_host_psci_config.function_ids_0_1 = get_psci_0_1_function_ids(); diff --git a/arch/arm64/kvm/hyp/include/nvhe/ffa.h b/arch/arm64/kvm/hyp/include/nvhe/ffa.h index fc09ec671e24..5c9b92430ff3 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/ffa.h +++ b/arch/arm64/kvm/hyp/include/nvhe/ffa.h @@ -11,6 +11,7 @@ #define FFA_MIN_FUNC_NUM 0x60 #define FFA_MAX_FUNC_NUM 0x7F +int hyp_ffa_init(void); bool kvm_host_ffa_handler(struct kvm_cpu_context *host_ctxt); #endif /* __KVM_HYP_FFA_H */ diff --git a/arch/arm64/kvm/hyp/nvhe/ffa.c b/arch/arm64/kvm/hyp/nvhe/ffa.c index 6ccf935d3b41..969d72390844 100644 --- a/arch/arm64/kvm/hyp/nvhe/ffa.c +++ b/arch/arm64/kvm/hyp/nvhe/ffa.c @@ -31,6 +31,12 @@ #include #include +/* + * "ID value 0 must be returned at the Non-secure physical FF-A instance" + * We share this ID with the host. 
+ */ +#define HOST_FFA_ID 0 + static void ffa_to_smccc_error(struct arm_smccc_res *res, u64 ffa_errno) { *res = (struct arm_smccc_res) { @@ -111,3 +117,27 @@ bool kvm_host_ffa_handler(struct kvm_cpu_context *host_ctxt) ffa_set_retval(host_ctxt, &res); return true; } + +int hyp_ffa_init(void) +{ + struct arm_smccc_res res; + + if (kvm_host_psci_config.smccc_version < ARM_SMCCC_VERSION_1_2) + return 0; + + arm_smccc_1_1_smc(FFA_VERSION, FFA_VERSION_1_0, 0, 0, 0, 0, 0, 0, &res); + if (res.a0 == FFA_RET_NOT_SUPPORTED) + return 0; + + if (res.a0 != FFA_VERSION_1_0) + return -EOPNOTSUPP; + + arm_smccc_1_1_smc(FFA_ID_GET, 0, 0, 0, 0, 0, 0, 0, &res); + if (res.a0 != FFA_SUCCESS) + return -EOPNOTSUPP; + + if (res.a2 != HOST_FFA_ID) + return -EINVAL; + + return 0; +} diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c index 81c9a999ad44..5669895803c0 100644 --- a/arch/arm64/kvm/hyp/nvhe/setup.c +++ b/arch/arm64/kvm/hyp/nvhe/setup.c @@ -11,6 +11,7 @@ #include #include +#include #include #include #include @@ -335,6 +336,10 @@ void __noreturn __pkvm_init_finalise(void) if (ret) goto out; + ret = hyp_ffa_init(); + if (ret) + goto out; + pkvm_hyp_vm_table_init(vm_table_base); out: /* From a805114bc5727e971d85a994b980dbd901f92253 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Wed, 16 Nov 2022 17:03:28 +0000 Subject: [PATCH 146/457] FROMLIST: KVM: arm64: Handle FFA_FEATURES call from the host Filter out advertising unsupported features, and only advertise features and properties that are supported by the hypervisor proxy. Bug: 254811097 Change-Id: I071766d6d241f4bdd00b8f80e6b237c184a1e59a Signed-off-by: Fuad Tabba Signed-off-by: Quentin Perret Link: https://lore.kernel.org/r/20221116170335.2341003-6-qperret@google.com Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/nvhe/ffa.c | 44 +++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/arch/arm64/kvm/hyp/nvhe/ffa.c b/arch/arm64/kvm/hyp/nvhe/ffa.c index 969d72390844..d199f868583e 100644 --- a/arch/arm64/kvm/hyp/nvhe/ffa.c +++ b/arch/arm64/kvm/hyp/nvhe/ffa.c @@ -45,6 +45,16 @@ static void ffa_to_smccc_error(struct arm_smccc_res *res, u64 ffa_errno) }; } +static void ffa_to_smccc_res_prop(struct arm_smccc_res *res, int ret, u64 prop) +{ + if (ret == FFA_RET_SUCCESS) { + *res = (struct arm_smccc_res) { .a0 = FFA_SUCCESS, + .a2 = prop }; + } else { + ffa_to_smccc_error(res, ret); + } +} + static void ffa_set_retval(struct kvm_cpu_context *ctxt, struct arm_smccc_res *res) { @@ -89,6 +99,35 @@ static bool ffa_call_unsupported(u64 func_id) return false; } +static bool do_ffa_features(struct arm_smccc_res *res, + struct kvm_cpu_context *ctxt) +{ + DECLARE_REG(u32, id, ctxt, 1); + u64 prop = 0; + int ret = 0; + + if (ffa_call_unsupported(id)) { + ret = FFA_RET_NOT_SUPPORTED; + goto out_handled; + } + + switch (id) { + case FFA_MEM_SHARE: + case FFA_FN64_MEM_SHARE: + case FFA_MEM_LEND: + case FFA_FN64_MEM_LEND: + ret = FFA_RET_SUCCESS; + prop = 0; /* No support for dynamic buffers */ + goto out_handled; + default: + return false; + } + +out_handled: + ffa_to_smccc_res_prop(res, ret, prop); + return true; +} + bool kvm_host_ffa_handler(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(u64, func_id, host_ctxt, 0); @@ -98,6 +137,10 @@ bool kvm_host_ffa_handler(struct kvm_cpu_context *host_ctxt) return false; switch (func_id) { + case FFA_FEATURES: + if (!do_ffa_features(&res, host_ctxt)) + return false; + goto out_handled; /* Memory management */ case FFA_FN64_RXTX_MAP: case FFA_RXTX_UNMAP: @@ -114,6 +157,7 @@ 
bool kvm_host_ffa_handler(struct kvm_cpu_context *host_ctxt) return false; /* Pass through */ ffa_to_smccc_error(&res, FFA_RET_NOT_SUPPORTED); +out_handled: ffa_set_retval(host_ctxt, &res); return true; } From 508713769a462b379dea47f4c5a5c6dd8e50cec2 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 16 Nov 2022 17:03:29 +0000 Subject: [PATCH 147/457] FROMLIST: KVM: arm64: Allocate pages for hypervisor FF-A mailboxes The FF-A proxy code needs to allocate its own buffer pair for communication with EL3 and for forwarding calls from the host at EL1. Reserve a couple of pages for this purpose and use them to initialise the hypervisor's FF-A buffer structure. Bug: 254811097 Co-developed-by: Andrew Walbran Change-Id: Id72cd7f59be20eb6d1faa6f1c5e64ecc8debf929 Signed-off-by: Andrew Walbran Signed-off-by: Will Deacon Signed-off-by: Quentin Perret Link: https://lore.kernel.org/r/20221116170335.2341003-7-qperret@google.com Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_pkvm.h | 8 ++++++++ arch/arm64/kvm/hyp/include/nvhe/ffa.h | 2 +- arch/arm64/kvm/hyp/nvhe/ffa.c | 24 +++++++++++++++++++++++- arch/arm64/kvm/hyp/nvhe/setup.c | 8 +++++++- arch/arm64/kvm/pkvm.c | 1 + 5 files changed, 40 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/kvm_pkvm.h b/arch/arm64/include/asm/kvm_pkvm.h index 86adf6778108..4e67a9446e55 100644 --- a/arch/arm64/include/asm/kvm_pkvm.h +++ b/arch/arm64/include/asm/kvm_pkvm.h @@ -331,4 +331,12 @@ static inline unsigned long host_s2_pgtable_pages(void) return res; } +#define KVM_FFA_MBOX_NR_PAGES 1 + +static inline unsigned long hyp_ffa_proxy_pages(void) +{ + /* A page each for the hypervisor's RX and TX mailboxes. */ + return 2 * KVM_FFA_MBOX_NR_PAGES; +} + #endif /* __ARM64_KVM_PKVM_H__ */ diff --git a/arch/arm64/kvm/hyp/include/nvhe/ffa.h b/arch/arm64/kvm/hyp/include/nvhe/ffa.h index 5c9b92430ff3..1becb10ecd80 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/ffa.h +++ b/arch/arm64/kvm/hyp/include/nvhe/ffa.h @@ -11,7 +11,7 @@ #define FFA_MIN_FUNC_NUM 0x60 #define FFA_MAX_FUNC_NUM 0x7F -int hyp_ffa_init(void); +int hyp_ffa_init(void *pages); bool kvm_host_ffa_handler(struct kvm_cpu_context *host_ctxt); #endif /* __KVM_HYP_FFA_H */ diff --git a/arch/arm64/kvm/hyp/nvhe/ffa.c b/arch/arm64/kvm/hyp/nvhe/ffa.c index d199f868583e..7cb9095e00ff 100644 --- a/arch/arm64/kvm/hyp/nvhe/ffa.c +++ b/arch/arm64/kvm/hyp/nvhe/ffa.c @@ -28,8 +28,11 @@ #include #include +#include + #include #include +#include /* * "ID value 0 must be returned at the Non-secure physical FF-A instance" @@ -37,6 +40,19 @@ */ #define HOST_FFA_ID 0 +struct kvm_ffa_buffers { + hyp_spinlock_t lock; + void *tx; + void *rx; +}; + +/* + * Note that we don't currently lock these buffers explicitly, instead + * relying on the locking of the host FFA buffers as we only have one + * client. 
+ */ +static struct kvm_ffa_buffers hyp_buffers; + static void ffa_to_smccc_error(struct arm_smccc_res *res, u64 ffa_errno) { *res = (struct arm_smccc_res) { @@ -162,7 +178,7 @@ out_handled: return true; } -int hyp_ffa_init(void) +int hyp_ffa_init(void *pages) { struct arm_smccc_res res; @@ -183,5 +199,11 @@ int hyp_ffa_init(void) if (res.a2 != HOST_FFA_ID) return -EINVAL; + hyp_buffers = (struct kvm_ffa_buffers) { + .lock = __HYP_SPIN_LOCK_UNLOCKED, + .tx = pages, + .rx = pages + (KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE), + }; + return 0; } diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c index 5669895803c0..bde9368725c4 100644 --- a/arch/arm64/kvm/hyp/nvhe/setup.c +++ b/arch/arm64/kvm/hyp/nvhe/setup.c @@ -31,6 +31,7 @@ static void *vmemmap_base; static void *vm_table_base; static void *hyp_pgt_base; static void *host_s2_pgt_base; +static void *ffa_proxy_pages; static struct kvm_pgtable_mm_ops pkvm_pgtable_mm_ops; static struct hyp_pool hpool; @@ -60,6 +61,11 @@ static int divide_memory_pool(void *virt, unsigned long size) if (!host_s2_pgt_base) return -ENOMEM; + nr_pages = hyp_ffa_proxy_pages(); + ffa_proxy_pages = hyp_early_alloc_contig(nr_pages); + if (!ffa_proxy_pages) + return -ENOMEM; + return 0; } @@ -336,7 +342,7 @@ void __noreturn __pkvm_init_finalise(void) if (ret) goto out; - ret = hyp_ffa_init(); + ret = hyp_ffa_init(ffa_proxy_pages); if (ret) goto out; diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c index ea2ea5b61424..a86328a92374 100644 --- a/arch/arm64/kvm/pkvm.c +++ b/arch/arm64/kvm/pkvm.c @@ -82,6 +82,7 @@ void __init kvm_hyp_reserve(void) hyp_mem_pages += host_s2_pgtable_pages(); hyp_mem_pages += hyp_vm_table_pages(); hyp_mem_pages += hyp_vmemmap_pages(STRUCT_HYP_PAGE_SIZE); + hyp_mem_pages += hyp_ffa_proxy_pages(); /* * Try to allocate a PMD-aligned region to reduce TLB pressure once From e7e5e9120f281bad3a6c8bd99fd4c6cab4cebb5d Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 16 Nov 2022 17:03:30 +0000 Subject: [PATCH 148/457] FROMLIST: KVM: arm64: Handle FFA_RXTX_MAP and FFA_RXTX_UNMAP calls from the host Handle FFA_RXTX_MAP and FFA_RXTX_UNMAP calls from the host by sharing the host's mailbox memory with the hypervisor and establishing a separate pair of mailboxes between the hypervisor and the SPMD at EL3. Bug: 254811097 Co-developed-by: Andrew Walbran Change-Id: Ib5fa89e9b01aa20f7c1b5b41df79d66e98d07f55 Signed-off-by: Andrew Walbran Signed-off-by: Will Deacon Signed-off-by: Quentin Perret Link: https://lore.kernel.org/r/20221116170335.2341003-8-qperret@google.com Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/nvhe/ffa.c | 173 ++++++++++++++++++++++++++++++++++ include/linux/arm_ffa.h | 8 ++ 2 files changed, 181 insertions(+) diff --git a/arch/arm64/kvm/hyp/nvhe/ffa.c b/arch/arm64/kvm/hyp/nvhe/ffa.c index 7cb9095e00ff..8388cc901c97 100644 --- a/arch/arm64/kvm/hyp/nvhe/ffa.c +++ b/arch/arm64/kvm/hyp/nvhe/ffa.c @@ -31,6 +31,8 @@ #include #include +#include +#include #include #include @@ -52,6 +54,7 @@ struct kvm_ffa_buffers { * client. 
*/ static struct kvm_ffa_buffers hyp_buffers; +static struct kvm_ffa_buffers host_buffers; static void ffa_to_smccc_error(struct arm_smccc_res *res, u64 ffa_errno) { @@ -71,6 +74,11 @@ static void ffa_to_smccc_res_prop(struct arm_smccc_res *res, int ret, u64 prop) } } +static void ffa_to_smccc_res(struct arm_smccc_res *res, int ret) +{ + ffa_to_smccc_res_prop(res, ret, 0); +} + static void ffa_set_retval(struct kvm_cpu_context *ctxt, struct arm_smccc_res *res) { @@ -88,6 +96,140 @@ static bool is_ffa_call(u64 func_id) ARM_SMCCC_FUNC_NUM(func_id) <= FFA_MAX_FUNC_NUM; } +static int spmd_map_ffa_buffers(u64 ffa_page_count) +{ + struct arm_smccc_res res; + + arm_smccc_1_1_smc(FFA_FN64_RXTX_MAP, + hyp_virt_to_phys(hyp_buffers.tx), + hyp_virt_to_phys(hyp_buffers.rx), + ffa_page_count, + 0, 0, 0, 0, + &res); + + return res.a0 == FFA_SUCCESS ? FFA_RET_SUCCESS : res.a2; +} + +static int spmd_unmap_ffa_buffers(void) +{ + struct arm_smccc_res res; + + arm_smccc_1_1_smc(FFA_RXTX_UNMAP, + HOST_FFA_ID, + 0, 0, 0, 0, 0, 0, + &res); + + return res.a0 == FFA_SUCCESS ? FFA_RET_SUCCESS : res.a2; +} + +static void do_ffa_rxtx_map(struct arm_smccc_res *res, + struct kvm_cpu_context *ctxt) +{ + DECLARE_REG(phys_addr_t, tx, ctxt, 1); + DECLARE_REG(phys_addr_t, rx, ctxt, 2); + DECLARE_REG(u32, npages, ctxt, 3); + int ret = 0; + void *rx_virt, *tx_virt; + + if (npages != (KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE) / FFA_PAGE_SIZE) { + ret = FFA_RET_INVALID_PARAMETERS; + goto out; + } + + if (!PAGE_ALIGNED(tx) || !PAGE_ALIGNED(rx)) { + ret = FFA_RET_INVALID_PARAMETERS; + goto out; + } + + hyp_spin_lock(&host_buffers.lock); + if (host_buffers.tx) { + ret = FFA_RET_DENIED; + goto out_unlock; + } + + ret = spmd_map_ffa_buffers(npages); + if (ret) + goto out_unlock; + + ret = __pkvm_host_share_hyp(hyp_phys_to_pfn(tx)); + if (ret) { + ret = FFA_RET_INVALID_PARAMETERS; + goto err_unmap; + } + + ret = __pkvm_host_share_hyp(hyp_phys_to_pfn(rx)); + if (ret) { + ret = FFA_RET_INVALID_PARAMETERS; + goto err_unshare_tx; + } + + tx_virt = hyp_phys_to_virt(tx); + ret = hyp_pin_shared_mem(tx_virt, tx_virt + 1); + if (ret) { + ret = FFA_RET_INVALID_PARAMETERS; + goto err_unshare_rx; + } + + rx_virt = hyp_phys_to_virt(rx); + ret = hyp_pin_shared_mem(rx_virt, rx_virt + 1); + if (ret) { + ret = FFA_RET_INVALID_PARAMETERS; + goto err_unpin_tx; + } + + host_buffers.tx = tx_virt; + host_buffers.rx = rx_virt; + +out_unlock: + hyp_spin_unlock(&host_buffers.lock); +out: + ffa_to_smccc_res(res, ret); + return; + +err_unpin_tx: + hyp_unpin_shared_mem(tx_virt, tx_virt + 1); +err_unshare_rx: + __pkvm_host_unshare_hyp(hyp_phys_to_pfn(rx)); +err_unshare_tx: + __pkvm_host_unshare_hyp(hyp_phys_to_pfn(tx)); +err_unmap: + spmd_unmap_ffa_buffers(); + goto out_unlock; +} + +static void do_ffa_rxtx_unmap(struct arm_smccc_res *res, + struct kvm_cpu_context *ctxt) +{ + DECLARE_REG(u32, id, ctxt, 1); + int ret = 0; + + if (id != HOST_FFA_ID) { + ret = FFA_RET_INVALID_PARAMETERS; + goto out; + } + + hyp_spin_lock(&host_buffers.lock); + if (!host_buffers.tx) { + ret = FFA_RET_INVALID_PARAMETERS; + goto out_unlock; + } + + hyp_unpin_shared_mem(host_buffers.tx, host_buffers.tx + 1); + WARN_ON(__pkvm_host_unshare_hyp(hyp_virt_to_pfn(host_buffers.tx))); + host_buffers.tx = NULL; + + hyp_unpin_shared_mem(host_buffers.rx, host_buffers.rx + 1); + WARN_ON(__pkvm_host_unshare_hyp(hyp_virt_to_pfn(host_buffers.rx))); + host_buffers.rx = NULL; + + spmd_unmap_ffa_buffers(); + +out_unlock: + hyp_spin_unlock(&host_buffers.lock); +out: + ffa_to_smccc_res(res, ret); +} + static 
bool ffa_call_unsupported(u64 func_id) { switch (func_id) { @@ -159,7 +301,11 @@ bool kvm_host_ffa_handler(struct kvm_cpu_context *host_ctxt) goto out_handled; /* Memory management */ case FFA_FN64_RXTX_MAP: + do_ffa_rxtx_map(&res, host_ctxt); + goto out_handled; case FFA_RXTX_UNMAP: + do_ffa_rxtx_unmap(&res, host_ctxt); + goto out_handled; case FFA_MEM_SHARE: case FFA_FN64_MEM_SHARE: case FFA_MEM_LEND: @@ -181,6 +327,7 @@ out_handled: int hyp_ffa_init(void *pages) { struct arm_smccc_res res; + size_t min_rxtx_sz; if (kvm_host_psci_config.smccc_version < ARM_SMCCC_VERSION_1_2) return 0; @@ -199,11 +346,37 @@ int hyp_ffa_init(void *pages) if (res.a2 != HOST_FFA_ID) return -EINVAL; + arm_smccc_1_1_smc(FFA_FEATURES, FFA_FN64_RXTX_MAP, + 0, 0, 0, 0, 0, 0, &res); + if (res.a0 != FFA_SUCCESS) + return -EOPNOTSUPP; + + switch (res.a2) { + case FFA_FEAT_RXTX_MIN_SZ_4K: + min_rxtx_sz = SZ_4K; + break; + case FFA_FEAT_RXTX_MIN_SZ_16K: + min_rxtx_sz = SZ_16K; + break; + case FFA_FEAT_RXTX_MIN_SZ_64K: + min_rxtx_sz = SZ_64K; + break; + default: + return -EINVAL; + } + + if (min_rxtx_sz > PAGE_SIZE) + return -EOPNOTSUPP; + hyp_buffers = (struct kvm_ffa_buffers) { .lock = __HYP_SPIN_LOCK_UNLOCKED, .tx = pages, .rx = pages + (KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE), }; + host_buffers = (struct kvm_ffa_buffers) { + .lock = __HYP_SPIN_LOCK_UNLOCKED, + }; + return 0; } diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h index c87aeecaa9b2..b9f81035eb41 100644 --- a/include/linux/arm_ffa.h +++ b/include/linux/arm_ffa.h @@ -94,6 +94,14 @@ */ #define FFA_PAGE_SIZE SZ_4K +/* + * Minimum buffer size/alignment encodings returned by an FFA_FEATURES + * query for FFA_RXTX_MAP. + */ +#define FFA_FEAT_RXTX_MIN_SZ_4K 0 +#define FFA_FEAT_RXTX_MIN_SZ_64K 1 +#define FFA_FEAT_RXTX_MIN_SZ_16K 2 + /* FFA Bus/Device/Driver related */ struct ffa_device { int vm_id; From 3399bf007b5f75ae4608f811917f72f879c676cb Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 16 Nov 2022 17:03:31 +0000 Subject: [PATCH 149/457] BACKPORT: FROMLIST: KVM: arm64: Add FF-A helpers to share/unshare memory with secure world Extend pKVM's memory protection code so that we can update the host's stage-2 page-table to track pages shared with secure world by the host using FF-A and prevent those pages from being mapped into a guest. 
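[ Editor's note: illustrative sketch only, not part of the patch. It shows the calling pattern the FF-A proxy adopts in the patches that follow: transition the host pages to SHARED_OWNED before forwarding the call to EL3, and undo the transition if the secure side rejects the transaction. The wrapper name and the spmd_accepted flag are hypothetical stand-ins for the forwarding step, which is elided here. ]

static int host_ffa_share_range(u64 phys, u64 nr_pages, bool spmd_accepted)
{
	u64 pfn = hyp_phys_to_pfn(phys);
	int ret;

	/* Mark the pages as shared with the secure world in the host stage-2. */
	ret = __pkvm_host_share_ffa(pfn, nr_pages);
	if (ret)
		return ret;

	/* Roll back if EL3 refuses the FF-A transaction. */
	if (!spmd_accepted)
		WARN_ON(__pkvm_host_unshare_ffa(pfn, nr_pages));

	return 0;
}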
[ qperret: BACKPORT due to context conflicts in mem_protect.c caused by the presence of guest-related memory transitions in the android kernel (host_donate_guest and friends) ] Bug: 254811097 Co-developed-by: Andrew Walbran Change-Id: Ib4d404cd1d4fa11d7bf8c1d0b8ec00838a8038a0 Signed-off-by: Andrew Walbran Signed-off-by: Will Deacon Signed-off-by: Quentin Perret Link: https://lore.kernel.org/r/20221116170335.2341003-9-qperret@google.com Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/include/nvhe/mem_protect.h | 3 + arch/arm64/kvm/hyp/nvhe/mem_protect.c | 69 +++++++++++++++++++ 2 files changed, 72 insertions(+) diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h index 7d0b49800bd8..abc366c4e8ad 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h @@ -58,6 +58,7 @@ enum pkvm_component_id { PKVM_ID_HOST, PKVM_ID_HYP, PKVM_ID_GUEST, + PKVM_ID_FFA, }; extern unsigned long hyp_nr_cpus; @@ -77,6 +78,8 @@ int __pkvm_guest_relinquish_to_host(struct pkvm_hyp_vcpu *vcpu, int __pkvm_install_ioguard_page(struct pkvm_hyp_vcpu *hyp_vcpu, u64 ipa); int __pkvm_remove_ioguard_page(struct pkvm_hyp_vcpu *hyp_vcpu, u64 ipa); bool __pkvm_check_ioguard_page(struct pkvm_hyp_vcpu *hyp_vcpu); +int __pkvm_host_share_ffa(u64 pfn, u64 nr_pages); +int __pkvm_host_unshare_ffa(u64 pfn, u64 nr_pages); bool addr_is_memory(phys_addr_t phys); int host_stage2_idmap_locked(phys_addr_t addr, u64 size, enum kvm_pgtable_prot prot); diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index cdf965814724..ab10ffb80558 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -1261,6 +1261,13 @@ static int check_share(struct pkvm_mem_share *share) case PKVM_ID_GUEST: ret = guest_ack_share(completer_addr, tx, share->completer_prot); break; + case PKVM_ID_FFA: + /* + * We only check the host; the secure side will check the other + * end when we forward the FFA call. + */ + ret = 0; + break; default: ret = -EINVAL; } @@ -1298,6 +1305,13 @@ static int __do_share(struct pkvm_mem_share *share) case PKVM_ID_GUEST: ret = guest_complete_share(completer_addr, tx, share->completer_prot); break; + case PKVM_ID_FFA: + /* + * We're not responsible for any secure page-tables, so there's + * nothing to do here.
+ */ + ret = 0; + break; default: ret = -EINVAL; } @@ -1352,6 +1366,10 @@ static int check_unshare(struct pkvm_mem_share *share) case PKVM_ID_HYP: ret = hyp_ack_unshare(completer_addr, tx); break; + case PKVM_ID_FFA: + /* See check_share() */ + ret = 0; + break; default: ret = -EINVAL; } @@ -1386,6 +1404,10 @@ static int __do_unshare(struct pkvm_mem_share *share) case PKVM_ID_HYP: ret = hyp_complete_unshare(completer_addr, tx); break; + case PKVM_ID_FFA: + /* See __do_share() */ + ret = 0; + break; default: ret = -EINVAL; } @@ -1821,6 +1843,53 @@ int __pkvm_host_donate_guest(u64 pfn, u64 gfn, struct pkvm_hyp_vcpu *vcpu) return ret; } +int __pkvm_host_share_ffa(u64 pfn, u64 nr_pages) +{ + int ret; + struct pkvm_mem_share share = { + .tx = { + .nr_pages = nr_pages, + .initiator = { + .id = PKVM_ID_HOST, + .addr = hyp_pfn_to_phys(pfn), + }, + .completer = { + .id = PKVM_ID_FFA, + }, + }, + }; + + host_lock_component(); + ret = do_share(&share); + host_unlock_component(); + + return ret; +} + + +int __pkvm_host_unshare_ffa(u64 pfn, u64 nr_pages) +{ + int ret; + struct pkvm_mem_share share = { + .tx = { + .nr_pages = nr_pages, + .initiator = { + .id = PKVM_ID_HOST, + .addr = hyp_pfn_to_phys(pfn), + }, + .completer = { + .id = PKVM_ID_FFA, + }, + }, + }; + + host_lock_component(); + ret = do_unshare(&share); + host_unlock_component(); + + return ret; +} + void hyp_poison_page(phys_addr_t phys) { void *addr = hyp_fixmap_map(phys); From 8453f39a4a178c9af1625a760214cb5bc1c55de9 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 16 Nov 2022 17:03:32 +0000 Subject: [PATCH 150/457] FROMLIST: KVM: arm64: Handle FFA_MEM_SHARE calls from the host Intercept FFA_MEM_SHARE/FFA_FN64_MEM_SHARE calls from the host and transition the host stage-2 page-table entries from the OWNED state to the SHARED_OWNED state prior to forwarding the call onto EL3. Bug: 254811097 Co-developed-by: Andrew Walbran Change-Id: Ic75a943c67e6cb96794c250dccf2a59362bf857e Signed-off-by: Andrew Walbran Signed-off-by: Will Deacon Signed-off-by: Quentin Perret Link: https://lore.kernel.org/r/20221116170335.2341003-10-qperret@google.com Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/nvhe/ffa.c | 153 ++++++++++++++++++++++++++++++++++ 1 file changed, 153 insertions(+) diff --git a/arch/arm64/kvm/hyp/nvhe/ffa.c b/arch/arm64/kvm/hyp/nvhe/ffa.c index 8388cc901c97..e6d85849c617 100644 --- a/arch/arm64/kvm/hyp/nvhe/ffa.c +++ b/arch/arm64/kvm/hyp/nvhe/ffa.c @@ -122,6 +122,14 @@ static int spmd_unmap_ffa_buffers(void) return res.a0 == FFA_SUCCESS ? 
FFA_RET_SUCCESS : res.a2; } +static void spmd_mem_share(struct arm_smccc_res *res, u32 len, u32 fraglen) +{ + arm_smccc_1_1_smc(FFA_FN64_MEM_SHARE, + len, fraglen, + 0, 0, 0, 0, 0, + res); +} + static void do_ffa_rxtx_map(struct arm_smccc_res *res, struct kvm_cpu_context *ctxt) { @@ -230,6 +238,149 @@ out: ffa_to_smccc_res(res, ret); } +static u32 __ffa_host_share_ranges(struct ffa_mem_region_addr_range *ranges, + u32 nranges) +{ + u32 i; + + for (i = 0; i < nranges; ++i) { + struct ffa_mem_region_addr_range *range = &ranges[i]; + u64 sz = (u64)range->pg_cnt * FFA_PAGE_SIZE; + u64 pfn = hyp_phys_to_pfn(range->address); + + if (!PAGE_ALIGNED(sz)) + break; + + if (__pkvm_host_share_ffa(pfn, sz / PAGE_SIZE)) + break; + } + + return i; +} + +static u32 __ffa_host_unshare_ranges(struct ffa_mem_region_addr_range *ranges, + u32 nranges) +{ + u32 i; + + for (i = 0; i < nranges; ++i) { + struct ffa_mem_region_addr_range *range = &ranges[i]; + u64 sz = (u64)range->pg_cnt * FFA_PAGE_SIZE; + u64 pfn = hyp_phys_to_pfn(range->address); + + if (!PAGE_ALIGNED(sz)) + break; + + if (__pkvm_host_unshare_ffa(pfn, sz / PAGE_SIZE)) + break; + } + + return i; +} + +static int ffa_host_share_ranges(struct ffa_mem_region_addr_range *ranges, + u32 nranges) +{ + u32 nshared = __ffa_host_share_ranges(ranges, nranges); + int ret = 0; + + if (nshared != nranges) { + WARN_ON(__ffa_host_unshare_ranges(ranges, nshared) != nshared); + ret = FFA_RET_DENIED; + } + + return ret; +} + +static int ffa_host_unshare_ranges(struct ffa_mem_region_addr_range *ranges, + u32 nranges) +{ + u32 nunshared = __ffa_host_unshare_ranges(ranges, nranges); + int ret = 0; + + if (nunshared != nranges) { + WARN_ON(__ffa_host_share_ranges(ranges, nunshared) != nunshared); + ret = FFA_RET_DENIED; + } + + return ret; +} + +static void do_ffa_mem_share(struct arm_smccc_res *res, + struct kvm_cpu_context *ctxt) +{ + DECLARE_REG(u32, len, ctxt, 1); + DECLARE_REG(u32, fraglen, ctxt, 2); + DECLARE_REG(u64, addr_mbz, ctxt, 3); + DECLARE_REG(u32, npages_mbz, ctxt, 4); + struct ffa_composite_mem_region *reg; + struct ffa_mem_region *buf; + int ret = 0; + u32 offset; + + if (addr_mbz || npages_mbz || fraglen > len || + fraglen > KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE) { + ret = FFA_RET_INVALID_PARAMETERS; + goto out; + } + + if (fraglen < len) { + ret = FFA_RET_ABORTED; + goto out; + } + + if (fraglen < sizeof(struct ffa_mem_region) + + sizeof(struct ffa_mem_region_attributes)) { + ret = FFA_RET_INVALID_PARAMETERS; + goto out; + } + + hyp_spin_lock(&host_buffers.lock); + if (!host_buffers.tx) { + ret = FFA_RET_INVALID_PARAMETERS; + goto out_unlock; + } + + buf = hyp_buffers.tx; + memcpy(buf, host_buffers.tx, fraglen); + + offset = buf->ep_mem_access[0].composite_off; + if (!offset || buf->ep_count != 1 || buf->sender_id != HOST_FFA_ID) { + ret = FFA_RET_INVALID_PARAMETERS; + goto out_unlock; + } + + if (fraglen < offset + sizeof(struct ffa_composite_mem_region)) { + ret = FFA_RET_INVALID_PARAMETERS; + goto out_unlock; + } + + reg = (void *)buf + offset; + if (fraglen < offset + sizeof(struct ffa_composite_mem_region) + + reg->addr_range_cnt * + sizeof(struct ffa_mem_region_addr_range)) { + ret = FFA_RET_INVALID_PARAMETERS; + goto out_unlock; + } + + ret = ffa_host_share_ranges(reg->constituents, reg->addr_range_cnt); + if (ret) + goto out_unlock; + + spmd_mem_share(res, len, fraglen); + if (res->a0 != FFA_SUCCESS) { + WARN_ON(ffa_host_unshare_ranges(reg->constituents, + reg->addr_range_cnt)); + } + +out_unlock: + hyp_spin_unlock(&host_buffers.lock); +out: + if 
(ret) + ffa_to_smccc_res(res, ret); + return; +} + static bool ffa_call_unsupported(u64 func_id) { switch (func_id) { @@ -308,6 +459,8 @@ bool kvm_host_ffa_handler(struct kvm_cpu_context *host_ctxt) goto out_handled; case FFA_MEM_SHARE: case FFA_FN64_MEM_SHARE: + do_ffa_mem_share(&res, host_ctxt); + goto out_handled; case FFA_MEM_LEND: case FFA_FN64_MEM_LEND: case FFA_MEM_RECLAIM: From 2e47f0ecdcc23144bb1c2716505f40e8a1883f22 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 16 Nov 2022 17:03:33 +0000 Subject: [PATCH 151/457] FROMLIST: KVM: arm64: Handle FFA_MEM_RECLAIM calls from the host Intercept FFA_MEM_RECLAIM calls from the host and transition the host stage-2 page-table entries from the SHARED_OWNED state back to the OWNED state once EL3 has confirmed that the secure mapping has been reclaimed. Bug: 254811097 Change-Id: I58365e1b3fafa47f290a292fe57f6d2ed7f9091b Signed-off-by: Will Deacon Signed-off-by: Quentin Perret Link: https://lore.kernel.org/r/20221116170335.2341003-11-qperret@google.com Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/nvhe/ffa.c | 80 ++++++++++++++++++++++++++++++++++- 1 file changed, 79 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hyp/nvhe/ffa.c b/arch/arm64/kvm/hyp/nvhe/ffa.c index e6d85849c617..8f690b80af60 100644 --- a/arch/arm64/kvm/hyp/nvhe/ffa.c +++ b/arch/arm64/kvm/hyp/nvhe/ffa.c @@ -130,6 +130,23 @@ static void spmd_mem_share(struct arm_smccc_res *res, u32 len, u32 fraglen) res); } +static void spmd_mem_reclaim(struct arm_smccc_res *res, u32 handle_lo, + u32 handle_hi, u32 flags) +{ + arm_smccc_1_1_smc(FFA_MEM_RECLAIM, + handle_lo, handle_hi, flags, + 0, 0, 0, 0, + res); +} + +static void spmd_retrieve_req(struct arm_smccc_res *res, u32 len) +{ + arm_smccc_1_1_smc(FFA_FN64_MEM_RETRIEVE_REQ, + len, len, + 0, 0, 0, 0, 0, + res); +} + static void do_ffa_rxtx_map(struct arm_smccc_res *res, struct kvm_cpu_context *ctxt) { @@ -381,6 +398,65 @@ out: return; } +static void do_ffa_mem_reclaim(struct arm_smccc_res *res, + struct kvm_cpu_context *ctxt) +{ + DECLARE_REG(u32, handle_lo, ctxt, 1); + DECLARE_REG(u32, handle_hi, ctxt, 2); + DECLARE_REG(u32, flags, ctxt, 3); + struct ffa_composite_mem_region *reg; + struct ffa_mem_region *buf; + int ret = 0; + u32 offset; + u64 handle; + + handle = PACK_HANDLE(handle_lo, handle_hi); + + hyp_spin_lock(&host_buffers.lock); + + buf = hyp_buffers.tx; + *buf = (struct ffa_mem_region) { + .sender_id = HOST_FFA_ID, + .handle = handle, + }; + + spmd_retrieve_req(res, sizeof(*buf)); + buf = hyp_buffers.rx; + if (res->a0 != FFA_MEM_RETRIEVE_RESP) + goto out_unlock; + + /* Check for fragmentation */ + if (res->a1 != res->a2) { + ret = FFA_RET_ABORTED; + goto out_unlock; + } + + offset = buf->ep_mem_access[0].composite_off; + /* + * We can trust the SPMD to get this right, but let's at least + * check that we end up with something that doesn't look _completely_ + * bogus. + */ + if (WARN_ON(offset > KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE)) { + ret = FFA_RET_ABORTED; + goto out_unlock; + } + + reg = (void *)buf + offset; + spmd_mem_reclaim(res, handle_lo, handle_hi, flags); + if (res->a0 != FFA_SUCCESS) + goto out_unlock; + + /* If the SPMD was happy, then we should be too.
*/ + WARN_ON(ffa_host_unshare_ranges(reg->constituents, + reg->addr_range_cnt)); +out_unlock: + hyp_spin_unlock(&host_buffers.lock); + + if (ret) + ffa_to_smccc_res(res, ret); +} + static bool ffa_call_unsupported(u64 func_id) { switch (func_id) { @@ -461,9 +537,11 @@ bool kvm_host_ffa_handler(struct kvm_cpu_context *host_ctxt) case FFA_FN64_MEM_SHARE: do_ffa_mem_share(&res, host_ctxt); goto out_handled; + case FFA_MEM_RECLAIM: + do_ffa_mem_reclaim(&res, host_ctxt); + goto out_handled; case FFA_MEM_LEND: case FFA_FN64_MEM_LEND: - case FFA_MEM_RECLAIM: case FFA_MEM_FRAG_TX: break; } From cb335ce6d0bb7bf07aed4db4a21c44cd6cdbba51 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 16 Nov 2022 17:03:34 +0000 Subject: [PATCH 152/457] FROMLIST: KVM: arm64: Handle FFA_MEM_LEND calls from the host Handle FFA_MEM_LEND calls from the host by treating them identically to FFA_MEM_SHARE calls for the purposes of the host stage-2 page-table, but forwarding on the original request to EL3. Bug: 254811097 Change-Id: I8f53bca6f0865fabd9938eefd8427fa0e78016ed Signed-off-by: Will Deacon Signed-off-by: Quentin Perret Link: https://lore.kernel.org/r/20221116170335.2341003-12-qperret@google.com Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/nvhe/ffa.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/ffa.c b/arch/arm64/kvm/hyp/nvhe/ffa.c index 8f690b80af60..84024cba12ff 100644 --- a/arch/arm64/kvm/hyp/nvhe/ffa.c +++ b/arch/arm64/kvm/hyp/nvhe/ffa.c @@ -122,10 +122,10 @@ static int spmd_unmap_ffa_buffers(void) return res.a0 == FFA_SUCCESS ? FFA_RET_SUCCESS : res.a2; } -static void spmd_mem_share(struct arm_smccc_res *res, u32 len, u32 fraglen) +static void spmd_mem_xfer(struct arm_smccc_res *res, u64 func_id, u32 len, + u32 fraglen) { - arm_smccc_1_1_smc(FFA_FN64_MEM_SHARE, - len, fraglen, + arm_smccc_1_1_smc(func_id, len, fraglen, 0, 0, 0, 0, 0, res); } @@ -323,8 +323,9 @@ static int ffa_host_unshare_ranges(struct ffa_mem_region_addr_range *ranges, return ret; } -static void do_ffa_mem_share(struct arm_smccc_res *res, - struct kvm_cpu_context *ctxt) +static __always_inline void do_ffa_mem_xfer(const u64 func_id, + struct arm_smccc_res *res, + struct kvm_cpu_context *ctxt) { DECLARE_REG(u32, len, ctxt, 1); DECLARE_REG(u32, fraglen, ctxt, 2); @@ -335,6 +336,9 @@ static void do_ffa_mem_share(struct arm_smccc_res *res, int ret = 0; u32 offset; + BUILD_BUG_ON(func_id != FFA_FN64_MEM_SHARE && + func_id != FFA_FN64_MEM_LEND); + if (addr_mbz || npages_mbz || fraglen > len || fraglen > KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE) { ret = FFA_RET_INVALID_PARAMETERS; @@ -384,7 +388,7 @@ static void do_ffa_mem_share(struct arm_smccc_res *res, if (ret) goto out_unlock; - spmd_mem_share(res, len, fraglen); + spmd_mem_xfer(res, func_id, len, fraglen); if (res->a0 != FFA_SUCCESS) { WARN_ON(ffa_host_unshare_ranges(reg->constituents, reg->addr_range_cnt)); @@ -535,13 +539,15 @@ bool kvm_host_ffa_handler(struct kvm_cpu_context *host_ctxt) goto out_handled; case FFA_MEM_SHARE: case FFA_FN64_MEM_SHARE: - do_ffa_mem_share(&res, host_ctxt); + do_ffa_mem_xfer(FFA_FN64_MEM_SHARE, &res, host_ctxt); goto out_handled; case FFA_MEM_RECLAIM: do_ffa_mem_reclaim(&res, host_ctxt); goto out_handled; case FFA_MEM_LEND: case FFA_FN64_MEM_LEND: + do_ffa_mem_xfer(FFA_FN64_MEM_LEND, &res, host_ctxt); + goto out_handled; case FFA_MEM_FRAG_TX: break; } From bfd7a52151df87e86f369d51d5bd43b81edf2934 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Wed, 16 Nov 2022 17:03:35 +0000 Subject: 
[PATCH 153/457] BACKPORT: FROMLIST: KVM: arm64: pkvm: Add support for fragmented FF-A descriptors FF-A memory descriptors may need to be sent in fragments when they don't fit in the mailboxes. Doing so involves using the FRAG_TX and FRAG_RX primitives defined in the FF-A protocol. Add support in the pKVM FF-A relayer for fragmented descriptors by monitoring outgoing FRAG_TX transactions and by buffering large descriptors on the reclaim path. [ qperret: BACKPORT because I removed the erroneous ANDROID tag from the patch title posted upstream ] Bug: 254811097 Co-developed-by: Andrew Walbran Change-Id: I701f279cd4820abb0b6d7c2572ee28e0f943edad Signed-off-by: Andrew Walbran Signed-off-by: Will Deacon Signed-off-by: Quentin Perret Link: https://lore.kernel.org/r/20221116170335.2341003-13-qperret@google.com Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_pkvm.h | 17 ++- arch/arm64/kvm/hyp/nvhe/ffa.c | 170 +++++++++++++++++++++++++----- 2 files changed, 161 insertions(+), 26 deletions(-) diff --git a/arch/arm64/include/asm/kvm_pkvm.h b/arch/arm64/include/asm/kvm_pkvm.h index 4e67a9446e55..708d6d5c27f4 100644 --- a/arch/arm64/include/asm/kvm_pkvm.h +++ b/arch/arm64/include/asm/kvm_pkvm.h @@ -7,7 +7,9 @@ #ifndef __ARM64_KVM_PKVM_H__ #define __ARM64_KVM_PKVM_H__ +#include #include +#include #include #include @@ -335,8 +337,19 @@ static inline unsigned long host_s2_pgtable_pages(void) static inline unsigned long hyp_ffa_proxy_pages(void) { - /* A page each for the hypervisor's RX and TX mailboxes. */ - return 2 * KVM_FFA_MBOX_NR_PAGES; + size_t desc_max; + + /* + * The hypervisor FFA proxy needs enough memory to buffer a fragmented + * descriptor returned from EL3 in response to a RETRIEVE_REQ call. + */ + desc_max = sizeof(struct ffa_mem_region) + + sizeof(struct ffa_mem_region_attributes) + + sizeof(struct ffa_composite_mem_region) + + SG_MAX_SEGMENTS * sizeof(struct ffa_mem_region_addr_range); + + /* Plus a page each for the hypervisor's RX and TX mailboxes. */ + return (2 * KVM_FFA_MBOX_NR_PAGES) + DIV_ROUND_UP(desc_max, PAGE_SIZE); } #endif /* __ARM64_KVM_PKVM_H__ */ diff --git a/arch/arm64/kvm/hyp/nvhe/ffa.c b/arch/arm64/kvm/hyp/nvhe/ffa.c index 84024cba12ff..61003c3b6445 100644 --- a/arch/arm64/kvm/hyp/nvhe/ffa.c +++ b/arch/arm64/kvm/hyp/nvhe/ffa.c @@ -42,6 +42,18 @@ */ #define HOST_FFA_ID 0 +/* + * A buffer to hold the maximum descriptor size we can see from the host, + * which is required when the SPMD returns a fragmented FFA_MEM_RETRIEVE_RESP + * when resolving the handle on the reclaim path. + */ +struct kvm_ffa_descriptor_buffer { + void *buf; + size_t len; +}; + +static struct kvm_ffa_descriptor_buffer ffa_desc_buf; + struct kvm_ffa_buffers { hyp_spinlock_t lock; void *tx; @@ -122,6 +134,24 @@ static int spmd_unmap_ffa_buffers(void) return res.a0 == FFA_SUCCESS ? 
FFA_RET_SUCCESS : res.a2; } +static void spmd_mem_frag_tx(struct arm_smccc_res *res, u32 handle_lo, + u32 handle_hi, u32 fraglen, u32 endpoint_id) +{ + arm_smccc_1_1_smc(FFA_MEM_FRAG_TX, + handle_lo, handle_hi, fraglen, endpoint_id, + 0, 0, 0, + res); +} + +static void spmd_mem_frag_rx(struct arm_smccc_res *res, u32 handle_lo, + u32 handle_hi, u32 fragoff) +{ + arm_smccc_1_1_smc(FFA_MEM_FRAG_RX, + handle_lo, handle_hi, fragoff, HOST_FFA_ID, + 0, 0, 0, + res); +} + static void spmd_mem_xfer(struct arm_smccc_res *res, u64 func_id, u32 len, u32 fraglen) { @@ -323,6 +353,64 @@ static int ffa_host_unshare_ranges(struct ffa_mem_region_addr_range *ranges, return ret; } +static void do_ffa_mem_frag_tx(struct arm_smccc_res *res, + struct kvm_cpu_context *ctxt) +{ + DECLARE_REG(u32, handle_lo, ctxt, 1); + DECLARE_REG(u32, handle_hi, ctxt, 2); + DECLARE_REG(u32, fraglen, ctxt, 3); + DECLARE_REG(u32, endpoint_id, ctxt, 4); + struct ffa_mem_region_addr_range *buf; + int ret = FFA_RET_INVALID_PARAMETERS; + u32 nr_ranges; + + if (fraglen > KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE) + goto out; + + if (fraglen % sizeof(*buf)) + goto out; + + hyp_spin_lock(&host_buffers.lock); + if (!host_buffers.tx) + goto out_unlock; + + buf = hyp_buffers.tx; + memcpy(buf, host_buffers.tx, fraglen); + nr_ranges = fraglen / sizeof(*buf); + + ret = ffa_host_share_ranges(buf, nr_ranges); + if (ret) { + /* + * We're effectively aborting the transaction, so we need + * to restore the global state back to what it was prior to + * transmission of the first fragment. + */ + spmd_mem_reclaim(res, handle_lo, handle_hi, 0); + WARN_ON(res->a0 != FFA_SUCCESS); + goto out_unlock; + } + + spmd_mem_frag_tx(res, handle_lo, handle_hi, fraglen, endpoint_id); + if (res->a0 != FFA_SUCCESS && res->a0 != FFA_MEM_FRAG_RX) + WARN_ON(ffa_host_unshare_ranges(buf, nr_ranges)); + +out_unlock: + hyp_spin_unlock(&host_buffers.lock); +out: + if (ret) + ffa_to_smccc_res(res, ret); + + /* + * If for any reason this did not succeed, we're in trouble as we have + * now lost the content of the previous fragments and we can't rollback + * the host stage-2 changes. The pages previously marked as shared will + * remain stuck in that state forever, hence preventing the host from + * sharing/donating them again and may possibly lead to subsequent + * failures, but this will not compromise confidentiality. 
+ */ + return; +} + static __always_inline void do_ffa_mem_xfer(const u64 func_id, struct arm_smccc_res *res, struct kvm_cpu_context *ctxt) @@ -333,8 +421,8 @@ static __always_inline void do_ffa_mem_xfer(const u64 func_id, DECLARE_REG(u32, npages_mbz, ctxt, 4); struct ffa_composite_mem_region *reg; struct ffa_mem_region *buf; + u32 offset, nr_ranges; int ret = 0; - u32 offset; BUILD_BUG_ON(func_id != FFA_FN64_MEM_SHARE && func_id != FFA_FN64_MEM_LEND); @@ -345,11 +433,6 @@ static __always_inline void do_ffa_mem_xfer(const u64 func_id, goto out; } - if (fraglen < len) { - ret = FFA_RET_ABORTED; - goto out; - } - if (fraglen < sizeof(struct ffa_mem_region) + sizeof(struct ffa_mem_region_attributes)) { ret = FFA_RET_INVALID_PARAMETERS; @@ -377,21 +460,26 @@ static __always_inline void do_ffa_mem_xfer(const u64 func_id, } reg = (void *)buf + offset; - if (fraglen < offset + sizeof(struct ffa_composite_mem_region) + - reg->addr_range_cnt * - sizeof(struct ffa_mem_region_addr_range)) { + nr_ranges = ((void *)buf + fraglen) - (void *)reg->constituents; + if (nr_ranges % sizeof(reg->constituents[0])) { ret = FFA_RET_INVALID_PARAMETERS; goto out_unlock; } - ret = ffa_host_share_ranges(reg->constituents, reg->addr_range_cnt); + nr_ranges /= sizeof(reg->constituents[0]); + ret = ffa_host_share_ranges(reg->constituents, nr_ranges); if (ret) goto out_unlock; spmd_mem_xfer(res, func_id, len, fraglen); - if (res->a0 != FFA_SUCCESS) { - WARN_ON(ffa_host_unshare_ranges(reg->constituents, - reg->addr_range_cnt)); + if (fraglen != len) { + if (res->a0 != FFA_MEM_FRAG_RX) + goto err_unshare; + + if (res->a3 != fraglen) + goto err_unshare; + } else if (res->a0 != FFA_SUCCESS) { + goto err_unshare; } out_unlock: @@ -400,6 +488,10 @@ out: if (ret) ffa_to_smccc_res(res, ret); return; + +err_unshare: + WARN_ON(ffa_host_unshare_ranges(reg->constituents, nr_ranges)); + goto out_unlock; } static void do_ffa_mem_reclaim(struct arm_smccc_res *res, @@ -409,9 +501,9 @@ static void do_ffa_mem_reclaim(struct arm_smccc_res *res, DECLARE_REG(u32, handle_hi, ctxt, 2); DECLARE_REG(u32, flags, ctxt, 3); struct ffa_composite_mem_region *reg; + u32 offset, len, fraglen, fragoff; struct ffa_mem_region *buf; int ret = 0; - u32 offset; u64 handle; handle = PACK_HANDLE(handle_lo, handle_hi); @@ -429,11 +521,8 @@ static void do_ffa_mem_reclaim(struct arm_smccc_res *res, if (res->a0 != FFA_MEM_RETRIEVE_RESP) goto out_unlock; - /* Check for fragmentation */ - if (res->a1 != res->a2) { - ret = FFA_RET_ABORTED; - goto out_unlock; - } + len = res->a1; + fraglen = res->a2; offset = buf->ep_mem_access[0].composite_off; /* @@ -441,16 +530,36 @@ static void do_ffa_mem_reclaim(struct arm_smccc_res *res, * check that we end up with something that doesn't look _completely_ * bogus. 
*/ - if (WARN_ON(offset > KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE)) { + if (WARN_ON(offset > len || + fraglen > KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE)) { ret = FFA_RET_ABORTED; goto out_unlock; } - reg = (void *)buf + offset; + if (len > ffa_desc_buf.len) { + ret = FFA_RET_NO_MEMORY; + goto out_unlock; + } + + buf = ffa_desc_buf.buf; + memcpy(buf, hyp_buffers.rx, fraglen); + + for (fragoff = fraglen; fragoff < len; fragoff += fraglen) { + spmd_mem_frag_rx(res, handle_lo, handle_hi, fragoff); + if (res->a0 != FFA_MEM_FRAG_TX) { + ret = FFA_RET_INVALID_PARAMETERS; + goto out_unlock; + } + + fraglen = res->a3; + memcpy((void *)buf + fragoff, hyp_buffers.rx, fraglen); + } + spmd_mem_reclaim(res, handle_lo, handle_hi, flags); if (res->a0 != FFA_SUCCESS) goto out_unlock; + reg = (void *)buf + offset; /* If the SPMD was happy, then we should be too. */ WARN_ON(ffa_host_unshare_ranges(reg->constituents, reg->addr_range_cnt)); @@ -549,7 +658,8 @@ bool kvm_host_ffa_handler(struct kvm_cpu_context *host_ctxt) do_ffa_mem_xfer(FFA_FN64_MEM_LEND, &res, host_ctxt); goto out_handled; case FFA_MEM_FRAG_TX: - break; + do_ffa_mem_frag_tx(&res, host_ctxt); + goto out_handled; } if (!ffa_call_unsupported(func_id)) @@ -565,6 +675,7 @@ int hyp_ffa_init(void *pages) { struct arm_smccc_res res; size_t min_rxtx_sz; + void *tx, *rx; if (kvm_host_psci_config.smccc_version < ARM_SMCCC_VERSION_1_2) return 0; @@ -605,10 +716,21 @@ int hyp_ffa_init(void *pages) if (min_rxtx_sz > PAGE_SIZE) return -EOPNOTSUPP; + tx = pages; + pages += KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE; + rx = pages; + pages += KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE; + + ffa_desc_buf = (struct kvm_ffa_descriptor_buffer) { + .buf = pages, + .len = PAGE_SIZE * + (hyp_ffa_proxy_pages() - (2 * KVM_FFA_MBOX_NR_PAGES)), + }; + hyp_buffers = (struct kvm_ffa_buffers) { .lock = __HYP_SPIN_LOCK_UNLOCKED, - .tx = pages, - .rx = pages + (KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE), + .tx = tx, + .rx = rx, }; host_buffers = (struct kvm_ffa_buffers) { From 9402f3997c1cc92522529cd2f7897e49c70b30ac Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Mon, 18 Jul 2022 14:44:32 +0000 Subject: [PATCH 154/457] ANDROID: KVM: arm64: Increase size of FF-A buffer As it turns out, the kernel's DMA code doesn't enforce the SG_MAX_SEGMENTS limit on the number of elements in an sglist, which can confuse the pKVM FF-A proxy which has a buffer sized to contain a descriptor of at most SG_MAX_SEGMENTS constituents. As the number of elements in an sglist doesn't seem to have an actual upper bound, let's paper over the issue for now by increasing the size of the pKVM buffer based on empirical 'measurements'. Longer term we might need to make this value configurable on the kernel's cmdline, or to rework the FF-A proxy to sanely handle large descriptors, although this is not clear how at the time of writing. Bug: 221256863 Signed-off-by: Quentin Perret Change-Id: If252f01bec8ae71c0fe1f7007a3ca7b037924c84 Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_pkvm.h | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/kvm_pkvm.h b/arch/arm64/include/asm/kvm_pkvm.h index 708d6d5c27f4..7ac74dd69b34 100644 --- a/arch/arm64/include/asm/kvm_pkvm.h +++ b/arch/arm64/include/asm/kvm_pkvm.h @@ -335,10 +335,27 @@ static inline unsigned long host_s2_pgtable_pages(void) #define KVM_FFA_MBOX_NR_PAGES 1 +/* + * Maximum number of consitutents allowed in a descriptor. This number is + * arbitrary, see comment below on SG_MAX_SEGMENTS in hyp_ffa_proxy_pages(). 
+ */ +#define KVM_FFA_MAX_NR_CONSTITUENTS 4096 + static inline unsigned long hyp_ffa_proxy_pages(void) { size_t desc_max; + /* + * SG_MAX_SEGMENTS is supposed to bound the number of elements in an + * sglist, which should match the number of consituents in the + * corresponding FFA descriptor. As such, the EL2 buffer needs to be + * large enough to hold a descriptor with SG_MAX_SEGMENTS consituents + * at least. But the kernel's DMA code doesn't enforce the limit, and + * it is sometimes abused, so let's allow larger descriptors and hope + * for the best. + */ + BUILD_BUG_ON(KVM_FFA_MAX_NR_CONSTITUENTS < SG_MAX_SEGMENTS); + /* * The hypervisor FFA proxy needs enough memory to buffer a fragmented * descriptor returned from EL3 in response to a RETRIEVE_REQ call. @@ -346,7 +363,7 @@ static inline unsigned long hyp_ffa_proxy_pages(void) desc_max = sizeof(struct ffa_mem_region) + sizeof(struct ffa_mem_region_attributes) + sizeof(struct ffa_composite_mem_region) + - SG_MAX_SEGMENTS * sizeof(struct ffa_mem_region_addr_range); + KVM_FFA_MAX_NR_CONSTITUENTS * sizeof(struct ffa_mem_region_addr_range); /* Plus a page each for the hypervisor's RX and TX mailboxes. */ return (2 * KVM_FFA_MBOX_NR_PAGES) + DIV_ROUND_UP(desc_max, PAGE_SIZE); From 8eaec3fe4a30a7ff6e8a3fbdf6f9208e1840037e Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 4 Mar 2022 17:24:25 +0000 Subject: [PATCH 155/457] ANDROID: KVM: arm64: Relax SMCCC version check during FF-A proxy init Although FF-A claims to require version v1.2 of SMCCC, in reality the current set of calls work just fine with v1.1 and some devices ship with EL3 firmware that advertises this configuration. Allow pKVM to proxy FF-A calls for these devices by relaxing our SMCCC version check to permit SMCCC v1.1+ Reported-by: Alan Stokes Bug: 222663556 Signed-off-by: Will Deacon Change-Id: I41e9ff35f169df3609acee7bbc67999c1d11c9d1 Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/nvhe/ffa.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hyp/nvhe/ffa.c b/arch/arm64/kvm/hyp/nvhe/ffa.c index 61003c3b6445..4d1c3c85b47d 100644 --- a/arch/arm64/kvm/hyp/nvhe/ffa.c +++ b/arch/arm64/kvm/hyp/nvhe/ffa.c @@ -677,7 +677,7 @@ int hyp_ffa_init(void *pages) size_t min_rxtx_sz; void *tx, *rx; - if (kvm_host_psci_config.smccc_version < ARM_SMCCC_VERSION_1_2) + if (kvm_host_psci_config.smccc_version < ARM_SMCCC_VERSION_1_1) return 0; arm_smccc_1_1_smc(FFA_VERSION, FFA_VERSION_1_0, 0, 0, 0, 0, 0, 0, &res); From 059a19c4ef8398a8d02d2e7435d6a8f54d00014e Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Wed, 16 Nov 2022 14:28:56 +0000 Subject: [PATCH 156/457] ANDROID: KVM: arm64: Fix sparse __percpu warning Force the cast to silence the warning. We don't have a proper way to dynamically allocate memory at EL2, and hence no proper way to dynamically allocate percpu fields. Instead, we rely on memory donated from the host and index it by hyp_smp_processor_id(). 
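The fix, condensed (illustrative; the real hunk is in handle___pkvm_vcpu_load() below): the declaration keeps its __percpu annotation so host-side users stay checked by sparse, and the single EL2 access site strips it. A plain cast would still trip sparse's address-space check, hence the __force:

    int __percpu *last_vcpu_ran = hyp_vcpu->vcpu.arch.hw_mmu->last_vcpu_ran;
    int *last_ran;

    /*
     * There is no percpu allocator at EL2: the "percpu" data is really a
     * plain array donated by the host, so drop the annotation explicitly
     * and index it by the hypervisor's view of the CPU id.
     */
    last_ran = (__force int *)&last_vcpu_ran[hyp_smp_processor_id()];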
Bug: 258616809 Reported-by: Todd Kjos Change-Id: I3f7efd4a41294a6696992ce0a49270c4f9468bb5 Signed-off-by: Fuad Tabba Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/nvhe/hyp-main.c | 4 +++- arch/arm64/kvm/hyp/nvhe/pkvm.c | 10 ++++++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index f10a7508921c..e44ce4e7e2a8 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -694,6 +694,7 @@ static void handle___pkvm_vcpu_load(struct kvm_cpu_context *host_ctxt) DECLARE_REG(unsigned int, vcpu_idx, host_ctxt, 2); DECLARE_REG(u64, hcr_el2, host_ctxt, 3); struct pkvm_hyp_vcpu *hyp_vcpu; + int __percpu *last_vcpu_ran; int *last_ran; if (!is_protected_kvm_enabled()) @@ -708,7 +709,8 @@ static void handle___pkvm_vcpu_load(struct kvm_cpu_context *host_ctxt) * vcpu from the same VM has previously run on the same physical CPU, * nuke the relevant contexts. */ - last_ran = &hyp_vcpu->vcpu.arch.hw_mmu->last_vcpu_ran[hyp_smp_processor_id()]; + last_vcpu_ran = hyp_vcpu->vcpu.arch.hw_mmu->last_vcpu_ran; + last_ran = (__force int *) &last_vcpu_ran[hyp_smp_processor_id()]; if (*last_ran != hyp_vcpu->vcpu.vcpu_id) { __kvm_flush_cpu_context(hyp_vcpu->vcpu.arch.hw_mmu); *last_ran = hyp_vcpu->vcpu.vcpu_id; diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 0aa6eba48a33..3da331b49168 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -467,8 +467,8 @@ static void init_pkvm_hyp_vm(struct kvm *host_kvm, struct pkvm_hyp_vm *hyp_vm, pvmfw_load_addr = READ_ONCE(host_kvm->arch.pkvm.pvmfw_load_addr); hyp_vm->kvm.arch.pkvm.pvmfw_load_addr = pvmfw_load_addr; - hyp_vm->kvm.arch.mmu.last_vcpu_ran = last_ran; - memset(hyp_vm->kvm.arch.mmu.last_vcpu_ran, -1, pkvm_get_last_ran_size()); + hyp_vm->kvm.arch.mmu.last_vcpu_ran = (int __percpu *)last_ran; + memset(last_ran, -1, pkvm_get_last_ran_size()); } static int init_pkvm_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu, @@ -669,7 +669,7 @@ int __pkvm_init_vm(struct kvm *host_kvm, unsigned long vm_hva, unsigned long pgd_hva, unsigned long last_ran_hva) { struct pkvm_hyp_vm *hyp_vm = NULL; - void *last_ran = NULL; + int *last_ran = NULL; size_t vm_size, pgd_size, last_ran_size; unsigned int nr_vcpus; void *pgd = NULL; @@ -799,6 +799,7 @@ teardown_donated_memory(struct kvm_hyp_memcache *mc, void *addr, size_t size) int __pkvm_teardown_vm(pkvm_handle_t handle) { size_t vm_size, last_ran_size; + int __percpu *last_vcpu_ran; struct kvm_hyp_memcache *mc; struct pkvm_hyp_vm *hyp_vm; unsigned int idx; @@ -844,8 +845,9 @@ int __pkvm_teardown_vm(pkvm_handle_t handle) teardown_donated_memory(mc, hyp_vcpu, sizeof(*hyp_vcpu)); } + last_vcpu_ran = hyp_vm->kvm.arch.mmu.last_vcpu_ran; last_ran_size = pkvm_get_last_ran_size(); - teardown_donated_memory(mc, hyp_vm->kvm.arch.mmu.last_vcpu_ran, + teardown_donated_memory(mc, (__force void *)last_vcpu_ran, last_ran_size); vm_size = pkvm_get_hyp_vm_size(hyp_vm->kvm.created_vcpus); From b75ae68d193758b2a2e18dfc9f0b1668735dbbae Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Mon, 4 Jul 2022 14:32:56 +0100 Subject: [PATCH 157/457] ANDROID: KVM: arm64: Add protected_hyp_mem VM statistic When using nVHE in protected mode, the host allocates memory for the hypervisor to store shadow structures and the stage-2 page tables. This has been proven to be an interesting value to follow, for debug and health purpose. 
Account for those allocations in bytes, in a newly created VM statistic "protected_hyp_mem". It is expected, on VM teardown to reclaim all that memory. Raise a warning if not all the donations are recovered. Bug: 222044477 Change-Id: I18657d275f2ced67ceb6d0e4bd5ce41cf1d41dc8 Signed-off-by: Vincent Donnefort Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_host.h | 3 +- arch/arm64/kvm/arm.c | 6 +++- arch/arm64/kvm/guest.c | 3 +- arch/arm64/kvm/mmu.c | 48 +++++++++++++++++++++++++------ arch/arm64/kvm/pkvm.c | 10 +++++-- 5 files changed, 57 insertions(+), 13 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 9409aa82ff6a..7b8b328ebf0c 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -127,7 +127,7 @@ static inline void __free_hyp_memcache(struct kvm_hyp_memcache *mc, free_fn(pop_hyp_memcache(mc, to_va), arg); } -void free_hyp_memcache(struct kvm_hyp_memcache *mc); +void free_hyp_memcache(struct kvm_hyp_memcache *mc, struct kvm *kvm); int topup_hyp_memcache(struct kvm_vcpu *vcpu); struct kvm_vmid { @@ -821,6 +821,7 @@ static inline bool __vcpu_write_sys_reg_to_cpu(u64 val, int reg) struct kvm_vm_stat { struct kvm_vm_stat_generic generic; + atomic64_t protected_hyp_mem; }; struct kvm_vcpu_stat { diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 09f35ba6ed1d..eee234b3db88 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -212,6 +212,10 @@ void kvm_arch_destroy_vm(struct kvm *kvm) kvm_destroy_vcpus(kvm); + if (atomic64_read(&kvm->stat.protected_hyp_mem)) + pr_warn("%lluB of donations to the nVHE hyp are missing\n", + atomic64_read(&kvm->stat.protected_hyp_mem)); + kvm_unshare_hyp(kvm, kvm + 1); } @@ -459,7 +463,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) static_branch_dec(&userspace_irqchip_in_use); if (is_protected_kvm_enabled()) - free_hyp_memcache(&vcpu->arch.pkvm_memcache); + free_hyp_memcache(&vcpu->arch.pkvm_memcache, vcpu->kvm); else kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache); diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 2ff13a3f8479..ee2bd0c0ad90 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -29,7 +29,8 @@ #include "trace.h" const struct _kvm_stats_desc kvm_vm_stats_desc[] = { - KVM_GENERIC_VM_STATS() + KVM_GENERIC_VM_STATS(), + STATS_DESC_ICOUNTER(VM, protected_hyp_mem), }; const struct kvm_stats_header kvm_vm_stats_header = { diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index b23af71ebef6..0de691fb3541 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -871,22 +871,54 @@ static void *hyp_mc_alloc_fn(void *unused) return (void *)__get_free_page(GFP_KERNEL_ACCOUNT); } -void free_hyp_memcache(struct kvm_hyp_memcache *mc) +static void account_hyp_memcache(struct kvm_hyp_memcache *mc, + unsigned long prev_nr_pages, + struct kvm *kvm) { - if (is_protected_kvm_enabled()) - __free_hyp_memcache(mc, hyp_mc_free_fn, - kvm_host_va, NULL); + unsigned long nr_pages = mc->nr_pages; + + if (prev_nr_pages == nr_pages) + return; + + if (nr_pages > prev_nr_pages) { + atomic64_add((nr_pages - prev_nr_pages) << PAGE_SHIFT, + &kvm->stat.protected_hyp_mem); + } else { + atomic64_sub((prev_nr_pages - nr_pages) << PAGE_SHIFT, + &kvm->stat.protected_hyp_mem); + } +} + +void free_hyp_memcache(struct kvm_hyp_memcache *mc, struct kvm *kvm) +{ + unsigned long prev_nr_pages; + + if (!is_protected_kvm_enabled()) + return; + + prev_nr_pages = mc->nr_pages; + __free_hyp_memcache(mc, hyp_mc_free_fn, 
kvm_host_va, NULL); + account_hyp_memcache(mc, prev_nr_pages, kvm); } int topup_hyp_memcache(struct kvm_vcpu *vcpu) { + struct kvm_hyp_memcache *mc = &vcpu->arch.pkvm_memcache; + unsigned long prev_nr_pages; + int err; + if (!is_protected_kvm_enabled()) return 0; - return __topup_hyp_memcache(&vcpu->arch.pkvm_memcache, - kvm_mmu_cache_min_pages(vcpu->kvm), - hyp_mc_alloc_fn, - kvm_host_pa, NULL); + prev_nr_pages = mc->nr_pages; + + err = __topup_hyp_memcache(mc, kvm_mmu_cache_min_pages(vcpu->kvm), + hyp_mc_alloc_fn, + kvm_host_pa, NULL); + if (!err) + account_hyp_memcache(mc, prev_nr_pages, vcpu->kvm); + + return err; } /** diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c index a86328a92374..3c0f232d2c1c 100644 --- a/arch/arm64/kvm/pkvm.c +++ b/arch/arm64/kvm/pkvm.c @@ -117,7 +117,7 @@ void __init kvm_hyp_reserve(void) */ static int __pkvm_create_hyp_vm(struct kvm *host_kvm) { - size_t pgd_sz, hyp_vm_sz, hyp_vcpu_sz, last_ran_sz; + size_t pgd_sz, hyp_vm_sz, hyp_vcpu_sz, last_ran_sz, total_sz; struct kvm_vcpu *host_vcpu; pkvm_handle_t handle; void *pgd, *hyp_vm, *last_ran; @@ -165,6 +165,8 @@ static int __pkvm_create_hyp_vm(struct kvm *host_kvm) host_kvm->arch.pkvm.handle = handle; + total_sz = hyp_vm_sz + last_ran_sz + pgd_sz; + /* Donate memory for the vcpus at hyp and initialize it. */ hyp_vcpu_sz = PAGE_ALIGN(PKVM_HYP_VCPU_SIZE); kvm_for_each_vcpu(idx, host_vcpu, host_kvm) { @@ -182,6 +184,8 @@ static int __pkvm_create_hyp_vm(struct kvm *host_kvm) goto destroy_vm; } + total_sz += hyp_vcpu_sz; + ret = kvm_call_hyp_nvhe(__pkvm_init_vcpu, handle, host_vcpu, hyp_vcpu); if (ret) { @@ -190,6 +194,8 @@ static int __pkvm_create_hyp_vm(struct kvm *host_kvm) } } + atomic64_set(&host_kvm->stat.protected_hyp_mem, total_sz); + return 0; destroy_vm: @@ -228,7 +234,7 @@ void pkvm_destroy_hyp_vm(struct kvm *host_kvm) } host_kvm->arch.pkvm.handle = 0; - free_hyp_memcache(&host_kvm->arch.pkvm.teardown_mc); + free_hyp_memcache(&host_kvm->arch.pkvm.teardown_mc, host_kvm); node = rb_first(&host_kvm->arch.pkvm.pinned_pages); while (node) { From 781b6882ba90bfae570b73f57c1131bc60804b01 Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Tue, 5 Jul 2022 11:51:12 +0100 Subject: [PATCH 158/457] ANDROID: KVM: arm64: count KVM s2 mmu usage in nVHE protected mode When using the nVHE protected mode, the stage-2 page tables are handled by the hypervisor, but are backed by memory donated by the host. That memory is accounted during the donation (add to the vCPUs hyp_memcache) under secondary pagetable stats. On VM teardown, those pages are mixed with others in the teardown_mc, so use a separated teardown_stage2_mc to deduct them from accounting after reclaim. 
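Put differently, only pages that come back through the stage-2 memcache should be deducted from the secondary page-table statistic when they are reclaimed. A simplified sketch of the per-page free path (the helper name is made up; the real code threads this choice through hyp_mc_free_fn()):

    static void reclaim_one_donated_page(void *addr, bool came_from_stage2)
    {
        /* Undo the secondary page-table accounting for stage-2 pages only. */
        if (came_from_stage2)
            kvm_account_pgtable_pages(addr, -1);

        free_page((unsigned long)addr);
    }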
Bug: 222044477 Change-Id: I2a45ce65c5ce9cf96aabd1b66d6f83ffe4808a0c Signed-off-by: Vincent Donnefort Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_host.h | 2 ++ arch/arm64/kvm/arm.c | 2 +- arch/arm64/kvm/hyp/nvhe/pkvm.c | 10 +++++---- arch/arm64/kvm/mmu.c | 37 +++++++++++++++++++++++++++---- arch/arm64/kvm/pkvm.c | 3 +++ 5 files changed, 45 insertions(+), 9 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 7b8b328ebf0c..e3b13f4967e0 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -128,6 +128,7 @@ static inline void __free_hyp_memcache(struct kvm_hyp_memcache *mc, } void free_hyp_memcache(struct kvm_hyp_memcache *mc, struct kvm *kvm); +void free_hyp_stage2_memcache(struct kvm_hyp_memcache *mc, struct kvm *kvm); int topup_hyp_memcache(struct kvm_vcpu *vcpu); struct kvm_vmid { @@ -183,6 +184,7 @@ typedef unsigned int pkvm_handle_t; struct kvm_protected_vm { pkvm_handle_t handle; struct kvm_hyp_memcache teardown_mc; + struct kvm_hyp_memcache teardown_stage2_mc; struct rb_root pinned_pages; gpa_t pvmfw_load_addr; bool enabled; diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index eee234b3db88..1b606bb1d285 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -463,7 +463,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) static_branch_dec(&userspace_irqchip_in_use); if (is_protected_kvm_enabled()) - free_hyp_memcache(&vcpu->arch.pkvm_memcache, vcpu->kvm); + free_hyp_stage2_memcache(&vcpu->arch.pkvm_memcache, vcpu->kvm); else kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache); diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 3da331b49168..cc7e92e93d05 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -798,9 +798,9 @@ teardown_donated_memory(struct kvm_hyp_memcache *mc, void *addr, size_t size) int __pkvm_teardown_vm(pkvm_handle_t handle) { + struct kvm_hyp_memcache *mc, *stage2_mc; size_t vm_size, last_ran_size; int __percpu *last_vcpu_ran; - struct kvm_hyp_memcache *mc; struct pkvm_hyp_vm *hyp_vm; unsigned int idx; int err; @@ -822,9 +822,11 @@ int __pkvm_teardown_vm(pkvm_handle_t handle) remove_vm_table_entry(handle); hyp_spin_unlock(&vm_table_lock); - /* Reclaim guest pages (including page-table pages) */ mc = &hyp_vm->host_kvm->arch.pkvm.teardown_mc; - reclaim_guest_pages(hyp_vm, mc); + stage2_mc = &hyp_vm->host_kvm->arch.pkvm.teardown_stage2_mc; + + /* Reclaim guest pages (including page-table pages) */ + reclaim_guest_pages(hyp_vm, stage2_mc); unpin_host_vcpus(hyp_vm->vcpus, hyp_vm->nr_vcpus); /* Push the metadata pages to the teardown memcache */ @@ -838,7 +840,7 @@ int __pkvm_teardown_vm(pkvm_handle_t handle) vcpu_mc = &hyp_vcpu->vcpu.arch.pkvm_memcache; while (vcpu_mc->nr_pages) { addr = pop_hyp_memcache(vcpu_mc, hyp_phys_to_virt); - push_hyp_memcache(mc, addr, hyp_virt_to_phys); + push_hyp_memcache(stage2_mc, addr, hyp_virt_to_phys); unmap_donated_memory_noclear(addr, PAGE_SIZE); } diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 0de691fb3541..49a56711c33c 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -861,14 +861,24 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu) } } -static void hyp_mc_free_fn(void *addr, void *unused) +static void hyp_mc_free_fn(void *addr, void *args) { + bool account_stage2 = (bool)args; + + if (account_stage2) + kvm_account_pgtable_pages(addr, -1); + free_page((unsigned long)addr); } static void *hyp_mc_alloc_fn(void *unused) { 
- return (void *)__get_free_page(GFP_KERNEL_ACCOUNT); + void *addr = (void *)__get_free_page(GFP_KERNEL_ACCOUNT); + + if (addr) + kvm_account_pgtable_pages(addr, 1); + + return addr; } static void account_hyp_memcache(struct kvm_hyp_memcache *mc, @@ -889,7 +899,9 @@ static void account_hyp_memcache(struct kvm_hyp_memcache *mc, } } -void free_hyp_memcache(struct kvm_hyp_memcache *mc, struct kvm *kvm) +static void __free_account_hyp_memcache(struct kvm_hyp_memcache *mc, + struct kvm *kvm, + bool account_stage2) { unsigned long prev_nr_pages; @@ -897,10 +909,27 @@ void free_hyp_memcache(struct kvm_hyp_memcache *mc, struct kvm *kvm) return; prev_nr_pages = mc->nr_pages; - __free_hyp_memcache(mc, hyp_mc_free_fn, kvm_host_va, NULL); + __free_hyp_memcache(mc, hyp_mc_free_fn, kvm_host_va, + (void *)account_stage2); account_hyp_memcache(mc, prev_nr_pages, kvm); } +void free_hyp_memcache(struct kvm_hyp_memcache *mc, struct kvm *kvm) +{ + __free_account_hyp_memcache(mc, kvm, false); +} + +/* + * All pages donated to the hypervisor through kvm_hyp_memcache are for the + * stage-2 page table. However, kvm_hyp_memcache is also a vehicule to retrieve + * meta-data from the hypervisor, hence the need for a stage2 specific free + * function. + */ +void free_hyp_stage2_memcache(struct kvm_hyp_memcache *mc, struct kvm *kvm) +{ + __free_account_hyp_memcache(mc, kvm, true); +} + int topup_hyp_memcache(struct kvm_vcpu *vcpu) { struct kvm_hyp_memcache *mc = &vcpu->arch.pkvm_memcache; diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c index 3c0f232d2c1c..8285eff0dcd7 100644 --- a/arch/arm64/kvm/pkvm.c +++ b/arch/arm64/kvm/pkvm.c @@ -195,6 +195,7 @@ static int __pkvm_create_hyp_vm(struct kvm *host_kvm) } atomic64_set(&host_kvm->stat.protected_hyp_mem, total_sz); + kvm_account_pgtable_pages(pgd, pgd_sz >> PAGE_SHIFT); return 0; @@ -235,6 +236,8 @@ void pkvm_destroy_hyp_vm(struct kvm *host_kvm) host_kvm->arch.pkvm.handle = 0; free_hyp_memcache(&host_kvm->arch.pkvm.teardown_mc, host_kvm); + free_hyp_stage2_memcache(&host_kvm->arch.pkvm.teardown_stage2_mc, + host_kvm); node = rb_first(&host_kvm->arch.pkvm.pinned_pages); while (node) { From b97d00514a10dae1210d03ab2bd7d20576e01a53 Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Wed, 26 Oct 2022 12:42:51 +0100 Subject: [PATCH 159/457] ANDROID: KVM: arm64: Add protected_shared_mem statistic When using nVHE in protected mode, protected memory can be between host and a guest. Tracking this value is interesting from a debug perspective, to identify potential leaks. Keeping the count of memory sharing is easy, each share/unshare will return to the host where the accounting will take place. 
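The bookkeeping itself boils down to a page-sized adjustment of the new statistic per guest call, roughly as below (illustrative; the real code in kvm_hvc_call_handler() also checks kvm_vm_is_protected() and sets the SMCCC return value):

    if (func_id == ARM_SMCCC_VENDOR_HYP_KVM_MEM_SHARE_FUNC_ID)
        atomic64_add(PAGE_SIZE, &vcpu->kvm->stat.protected_shared_mem);
    else /* ARM_SMCCC_VENDOR_HYP_KVM_MEM_UNSHARE_FUNC_ID */
        atomic64_sub(PAGE_SIZE, &vcpu->kvm->stat.protected_shared_mem);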
Bug: 222044477 Change-Id: I43dcd258789f79dbfe489e5bf721e606c5e6e022 Signed-off-by: Vincent Donnefort Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_host.h | 1 + arch/arm64/kvm/guest.c | 1 + arch/arm64/kvm/hypercalls.c | 12 ++++++++++++ 3 files changed, 14 insertions(+) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index e3b13f4967e0..a5aece891f59 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -824,6 +824,7 @@ static inline bool __vcpu_write_sys_reg_to_cpu(u64 val, int reg) struct kvm_vm_stat { struct kvm_vm_stat_generic generic; atomic64_t protected_hyp_mem; + atomic64_t protected_shared_mem; }; struct kvm_vcpu_stat { diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index ee2bd0c0ad90..89cb03e2ed06 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -31,6 +31,7 @@ const struct _kvm_stats_desc kvm_vm_stats_desc[] = { KVM_GENERIC_VM_STATS(), STATS_DESC_ICOUNTER(VM, protected_hyp_mem), + STATS_DESC_ICOUNTER(VM, protected_shared_mem), }; const struct kvm_stats_header kvm_vm_stats_header = { diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c index a92428ce1912..fc64e8358928 100644 --- a/arch/arm64/kvm/hypercalls.c +++ b/arch/arm64/kvm/hypercalls.c @@ -83,6 +83,8 @@ static bool kvm_hvc_call_default_allowed(u32 func_id) */ case ARM_SMCCC_VERSION_FUNC_ID: case ARM_SMCCC_ARCH_FEATURES_FUNC_ID: + case ARM_SMCCC_VENDOR_HYP_KVM_MEM_SHARE_FUNC_ID: + case ARM_SMCCC_VENDOR_HYP_KVM_MEM_UNSHARE_FUNC_ID: return true; default: /* PSCI 0.2 and up is in the 0:0x1f range */ @@ -224,6 +226,16 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu) case ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID: kvm_ptp_get_time(vcpu, val); break; + case ARM_SMCCC_VENDOR_HYP_KVM_MEM_SHARE_FUNC_ID: + case ARM_SMCCC_VENDOR_HYP_KVM_MEM_UNSHARE_FUNC_ID: + if (!kvm_vm_is_protected(vcpu->kvm)) + break; + atomic64_add( + func_id == ARM_SMCCC_VENDOR_HYP_KVM_MEM_SHARE_FUNC_ID ? + PAGE_SIZE : -PAGE_SIZE, + &vcpu->kvm->stat.protected_shared_mem); + val[0] = SMCCC_RET_SUCCESS; + break; case ARM_SMCCC_VENDOR_HYP_KVM_MEM_RELINQUISH_FUNC_ID: pkvm_host_reclaim_page(vcpu->kvm, smccc_get_arg1(vcpu)); val[0] = SMCCC_RET_SUCCESS; From a596e6423ea1ba390fa00479dee45b7627d053a2 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 23 Nov 2022 13:32:44 +0000 Subject: [PATCH 160/457] ANDROID: KVM: arm64: Update pKVM hyp state series to v6 aosp/2257747 merged v5 of the pKVM hypervisor state series as FROMLIST. Since then, version 6 was posted and queued by the upstream maintainer: https://lore.kernel.org/r/166819337067.3836113.13147674500457473286.b4-ty@kernel.org Rather than revert v5 from android (and the dozens of dependent patches), snap to v6 so that we're in-sync with upstream. 
Bug: 233587962 [willdeacon@: Fix conflicts with 'stage2_mc' introduced by accounting work] Signed-off-by: Will Deacon Change-Id: I137bbd611c180cbe03e63a55705150f8f9c2ae31 Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/nvhe/mem_protect.c | 4 ++-- arch/arm64/kvm/hyp/nvhe/page_alloc.c | 1 + arch/arm64/kvm/hyp/nvhe/pkvm.c | 12 +++++++----- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index ab10ffb80558..7d742c771a1b 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -1455,7 +1455,7 @@ static int check_donation(struct pkvm_mem_donation *donation) if (ret) return ret; - switch (tx->completer.id){ + switch (tx->completer.id) { case PKVM_ID_HOST: ret = host_ack_donation(completer_addr, tx); break; @@ -1492,7 +1492,7 @@ static int __do_donate(struct pkvm_mem_donation *donation) if (ret) return ret; - switch (tx->completer.id){ + switch (tx->completer.id) { case PKVM_ID_HOST: ret = host_complete_donation(completer_addr, tx); break; diff --git a/arch/arm64/kvm/hyp/nvhe/page_alloc.c b/arch/arm64/kvm/hyp/nvhe/page_alloc.c index 01976a58d850..11b190ff49d1 100644 --- a/arch/arm64/kvm/hyp/nvhe/page_alloc.c +++ b/arch/arm64/kvm/hyp/nvhe/page_alloc.c @@ -99,6 +99,7 @@ static void __hyp_attach_page(struct hyp_pool *pool, memset(hyp_page_to_virt(p), 0, PAGE_SIZE << p->order); + /* Skip coalescing for 'external' pages being freed into the pool. */ if (phys < pool->range_start || phys >= pool->range_end) goto insert; diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index cc7e92e93d05..12d5728d2b04 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -654,7 +654,7 @@ static void unmap_donated_memory_noclear(void *va, size_t size) * * Unmaps the donated memory from the host at stage 2. * - * kvm: A pointer to the host's struct kvm. + * host_kvm: A pointer to the host's struct kvm. * vm_hva: The host va of the area being donated for the VM state. * Must be page aligned. 
* pgd_hva: The host va of the area being donated for the stage-2 PGD for @@ -802,6 +802,7 @@ int __pkvm_teardown_vm(pkvm_handle_t handle) size_t vm_size, last_ran_size; int __percpu *last_vcpu_ran; struct pkvm_hyp_vm *hyp_vm; + struct kvm *host_kvm; unsigned int idx; int err; @@ -817,21 +818,21 @@ int __pkvm_teardown_vm(pkvm_handle_t handle) goto err_unlock; } + host_kvm = hyp_vm->host_kvm; + /* Ensure the VMID is clean before it can be reallocated */ __kvm_tlb_flush_vmid(&hyp_vm->kvm.arch.mmu); remove_vm_table_entry(handle); hyp_spin_unlock(&vm_table_lock); - mc = &hyp_vm->host_kvm->arch.pkvm.teardown_mc; - stage2_mc = &hyp_vm->host_kvm->arch.pkvm.teardown_stage2_mc; + mc = &host_kvm->arch.pkvm.teardown_mc; + stage2_mc = &host_kvm->arch.pkvm.teardown_stage2_mc; /* Reclaim guest pages (including page-table pages) */ reclaim_guest_pages(hyp_vm, stage2_mc); unpin_host_vcpus(hyp_vm->vcpus, hyp_vm->nr_vcpus); /* Push the metadata pages to the teardown memcache */ - hyp_unpin_shared_mem(hyp_vm->host_kvm, hyp_vm->host_kvm + 1); - for (idx = 0; idx < hyp_vm->nr_vcpus; ++idx) { struct pkvm_hyp_vcpu *hyp_vcpu = hyp_vm->vcpus[idx]; struct kvm_hyp_memcache *vcpu_mc; @@ -854,6 +855,7 @@ int __pkvm_teardown_vm(pkvm_handle_t handle) vm_size = pkvm_get_hyp_vm_size(hyp_vm->kvm.created_vcpus); teardown_donated_memory(mc, hyp_vm, vm_size); + hyp_unpin_shared_mem(host_kvm, host_kvm + 1); return 0; err_unlock: From dd5b25ca1fe2f7fc77183e0d6b879edd03b183d8 Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Tue, 29 Jun 2021 22:05:26 +0000 Subject: [PATCH 161/457] ANDROID: KVM: arm64: Introduce IOMMU driver infrastructure Bootstrap infrastructure for IOMMU drivers by introducing kvm_iommu_ops struct in EL2 that is populated based on a iommu_driver parameter to __pkvm_init hypercall and selected in EL1 early init. An 'init' operation is called in __pkvm_init_finalise, giving the driver an opportunity to initialize itself in EL2 and create any EL2 mappings that it will need. 'init' is specifically called before 'finalize_host_mappings' so that: (a) pages mapped by the driver change owner to hyp, (b) ownership changes in 'finalize_host_mappings' get reflected in IOMMU mappings (added in a future patch). 
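From a driver's point of view the contract is deliberately small: export a kvm_iommu_ops instance and let select_iommu_ops() copy it into the global when the matching driver ID is passed to __pkvm_init. A hypothetical driver (all my_iommu_* names are placeholders) would look roughly like:

    static int my_iommu_init(void)
    {
        /*
         * Called from __pkvm_init_finalise(), before host mappings are
         * finalized: create any EL2 mappings the driver needs here.
         */
        return 0;
    }

    const struct kvm_iommu_ops my_iommu_ops = {
        .init = my_iommu_init,
    };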
Test: builds, boots Bug: 190463801 Change-Id: I04c9f32c6eda846e6e377cb3d23330eb143b6242 Signed-off-by: David Brazdil (cherry picked from commit 79775d022591380b9419f9a1d95c1e03076f9c5c) Signed-off-by: Mostafa Saleh Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_host.h | 4 ++++ arch/arm64/include/asm/kvm_hyp.h | 8 +++++++- arch/arm64/kvm/arm.c | 15 ++++++++++++--- arch/arm64/kvm/hyp/nvhe/hyp-main.c | 5 ++++- arch/arm64/kvm/hyp/nvhe/setup.c | 24 +++++++++++++++++++++++- 5 files changed, 50 insertions(+), 6 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index a5aece891f59..41337d5ae41a 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -379,6 +379,10 @@ extern s64 kvm_nvhe_sym(hyp_physvirt_offset); extern u64 kvm_nvhe_sym(hyp_cpu_logical_map)[NR_CPUS]; #define hyp_cpu_logical_map CHOOSE_NVHE_SYM(hyp_cpu_logical_map) +enum kvm_iommu_driver { + KVM_IOMMU_DRIVER_NONE, +}; + struct vcpu_reset_state { unsigned long pc; unsigned long r0; diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index d450ed354d69..bf1ca0e6b97f 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -114,7 +114,8 @@ void __noreturn __hyp_do_panic(struct kvm_cpu_context *host_ctxt, u64 spsr, void __pkvm_init_switch_pgd(phys_addr_t phys, unsigned long size, phys_addr_t pgd, void *sp, void *cont_fn); int __pkvm_init(phys_addr_t phys, unsigned long size, unsigned long nr_cpus, - unsigned long *per_cpu_base, u32 hyp_va_bits); + unsigned long *per_cpu_base, u32 hyp_va_bits, + enum kvm_iommu_driver iommu_driver); void __noreturn __host_enter(struct kvm_cpu_context *host_ctxt); #endif @@ -130,5 +131,10 @@ extern u64 kvm_nvhe_sym(id_aa64mmfr2_el1_sys_val); extern unsigned long kvm_nvhe_sym(__icache_flags); extern unsigned int kvm_nvhe_sym(kvm_arm_vmid_bits); extern bool kvm_nvhe_sym(smccc_trng_available); +struct kvm_iommu_ops { + int (*init)(void); +}; + +extern struct kvm_iommu_ops kvm_iommu_ops; #endif /* __ARM64_KVM_HYP_H__ */ diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 1b606bb1d285..977a58b4c8a7 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1919,6 +1919,11 @@ static bool init_psci_relay(void) return true; } +static int init_stage2_iommu(void) +{ + return KVM_IOMMU_DRIVER_NONE; +} + static int init_subsystems(void) { int err = 0; @@ -1977,7 +1982,7 @@ static void teardown_hyp_mode(void) } } -static int do_pkvm_init(u32 hyp_va_bits) +static int do_pkvm_init(u32 hyp_va_bits, enum kvm_iommu_driver iommu_driver) { void *per_cpu_base = kvm_ksym_ref(kvm_nvhe_sym(kvm_arm_hyp_percpu_base)); int ret; @@ -1986,7 +1991,7 @@ static int do_pkvm_init(u32 hyp_va_bits) cpu_hyp_init_context(); ret = kvm_call_hyp_nvhe(__pkvm_init, hyp_mem_base, hyp_mem_size, num_possible_cpus(), kern_hyp_va(per_cpu_base), - hyp_va_bits); + hyp_va_bits, iommu_driver); cpu_hyp_init_features(); /* @@ -2023,7 +2028,11 @@ static int kvm_hyp_init_protection(u32 hyp_va_bits) if (ret) return ret; - ret = do_pkvm_init(hyp_va_bits); + ret = init_stage2_iommu(); + if (ret < 0) + return ret; + + ret = do_pkvm_init(hyp_va_bits, (enum kvm_iommu_driver)ret); if (ret) return ret; diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index e44ce4e7e2a8..cf14b4ed6a83 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -36,6 +36,8 @@ static DEFINE_PER_CPU(struct user_fpsimd_state, loaded_host_fpsimd_state); 
DEFINE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params); +struct kvm_iommu_ops kvm_iommu_ops; + void __kvm_hyp_host_forward_smc(struct kvm_cpu_context *host_ctxt); static int pkvm_refill_memcache(struct pkvm_hyp_vcpu *hyp_vcpu) @@ -1013,6 +1015,7 @@ static void handle___pkvm_init(struct kvm_cpu_context *host_ctxt) DECLARE_REG(unsigned long, nr_cpus, host_ctxt, 3); DECLARE_REG(unsigned long *, per_cpu_base, host_ctxt, 4); DECLARE_REG(u32, hyp_va_bits, host_ctxt, 5); + DECLARE_REG(enum kvm_iommu_driver, iommu_driver, host_ctxt, 6); /* * __pkvm_init() will return only if an error occurred, otherwise it @@ -1020,7 +1023,7 @@ static void handle___pkvm_init(struct kvm_cpu_context *host_ctxt) * with the host context directly. */ cpu_reg(host_ctxt, 1) = __pkvm_init(phys, size, nr_cpus, per_cpu_base, - hyp_va_bits); + hyp_va_bits, iommu_driver); } static void handle___pkvm_cpu_set_vector(struct kvm_cpu_context *host_ctxt) diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c index bde9368725c4..16363765ffec 100644 --- a/arch/arm64/kvm/hyp/nvhe/setup.c +++ b/arch/arm64/kvm/hyp/nvhe/setup.c @@ -301,6 +301,16 @@ static int fix_hyp_pgtable_refcnt(void) &walker); } +int select_iommu_ops(enum kvm_iommu_driver driver) +{ + switch (driver) { + case KVM_IOMMU_DRIVER_NONE: + return 0; + } + + return -EINVAL; +} + void __noreturn __pkvm_init_finalise(void) { struct kvm_host_data *host_data = this_cpu_ptr(&kvm_host_data); @@ -320,6 +330,13 @@ void __noreturn __pkvm_init_finalise(void) if (ret) goto out; + if (kvm_iommu_ops.init) { + ret = kvm_iommu_ops.init(); + if (ret) + goto out; + } + + pkvm_pgtable_mm_ops = (struct kvm_pgtable_mm_ops) { .zalloc_page = hyp_zalloc_hyp_page, .phys_to_virt = hyp_phys_to_virt, @@ -358,7 +375,8 @@ out: } int __pkvm_init(phys_addr_t phys, unsigned long size, unsigned long nr_cpus, - unsigned long *per_cpu_base, u32 hyp_va_bits) + unsigned long *per_cpu_base, u32 hyp_va_bits, + enum kvm_iommu_driver iommu_driver) { struct kvm_nvhe_init_params *params; void *virt = hyp_phys_to_virt(phys); @@ -381,6 +399,10 @@ int __pkvm_init(phys_addr_t phys, unsigned long size, unsigned long nr_cpus, if (ret) return ret; + ret = select_iommu_ops(iommu_driver); + if (ret) + return ret; + update_nvhe_init_params(); /* Jump in the idmap page to switch to the new page-tables */ From 90473cbd4be1236697e07d9449514d7d607eef04 Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Tue, 29 Jun 2021 19:14:45 +0000 Subject: [PATCH 162/457] ANDROID: KVM: arm64: Add 'host_smc_handler' to kvm_iommu_ops IOMMU drivers need to intercept power management SMCs between the host and EL3. Add a hook to hyp's 'handle_host_smc'. 
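The hook follows the same claimed/not-claimed convention as the PSCI and FF-A handlers it sits between. A hypothetical driver (the function and helper names are made up) might simply snoop power-management SMCs without consuming them:

    static bool my_iommu_host_smc_handler(struct kvm_cpu_context *host_ctxt)
    {
        DECLARE_REG(u64, func_id, host_ctxt, 0);

        if (func_id == PSCI_0_2_FN64_CPU_SUSPEND)
            my_iommu_save_state();  /* placeholder */

        /* Not consumed: let the SMC continue to the next handler / EL3. */
        return false;
    }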
Test: builds, boots Bug: 190463801 Change-Id: Ied34b60d4bb0e5ae0fbf03f8ce1dc22a09679e37 Signed-off-by: David Brazdil (cherry picked from commit d2efcdcb2b1874ad0d701cdb6d9ae4517fc0e117) Signed-off-by: Mostafa Saleh Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_hyp.h | 1 + arch/arm64/kvm/hyp/nvhe/hyp-main.c | 2 ++ 2 files changed, 3 insertions(+) diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index bf1ca0e6b97f..92a614b36625 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -133,6 +133,7 @@ extern unsigned int kvm_nvhe_sym(kvm_arm_vmid_bits); extern bool kvm_nvhe_sym(smccc_trng_available); struct kvm_iommu_ops { int (*init)(void); + bool (*host_smc_handler)(struct kvm_cpu_context *host_ctxt); }; extern struct kvm_iommu_ops kvm_iommu_ops; diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index cf14b4ed6a83..9e4dd62c1c58 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -1194,6 +1194,8 @@ static void handle_host_smc(struct kvm_cpu_context *host_ctxt) bool handled; handled = kvm_host_psci_handler(host_ctxt); + if (!handled && kvm_iommu_ops.host_smc_handler) + handled = kvm_iommu_ops.host_smc_handler(host_ctxt); if (!handled) handled = kvm_host_ffa_handler(host_ctxt); if (!handled) From 6cf8566b50e995207c2d3781510ddc8e80034e2b Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Thu, 24 Jun 2021 11:02:37 +0000 Subject: [PATCH 163/457] ANDROID: KVM: arm64: Add 'host_stage2_set_owner' to kvm_iommu_ops Add a new hook to kvm_iommu_ops that is invoked whenever a range of pages changes their owner in the host stage2. This is currently limited to finalize_host_mappings, which changes the owner of EL2-mapped pages from host to hyp. The driver is expected to apply corresponding changes in the IOMMU it controls, so that only the new owner can access the page range. 
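A driver implementation simply mirrors the ownership transition into whatever structure gates DMA; hypothetical sketch (my_iommu_set_dma_access() is a placeholder):

    static void my_iommu_host_stage2_set_owner(phys_addr_t addr, size_t size,
                                               u8 owner_id)
    {
        /* Pages that are no longer host-owned must not be DMA targets. */
        bool host_owned = (owner_id == PKVM_ID_HOST);

        my_iommu_set_dma_access(addr, size, host_owned);
    }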
Test: builds, boots Bug: 190463801 Change-Id: I0809f4859a9117d1a37506b7aa9e19c6bd25ffdb Signed-off-by: David Brazdil (cherry picked from commit 3cd8b5b00be97ba6567d319a8be8bf7b45929936) Signed-off-by: Mostafa Saleh Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_hyp.h | 1 + arch/arm64/kvm/hyp/nvhe/mem_protect.c | 10 ++++++++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index 92a614b36625..61b8354e1b7d 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -134,6 +134,7 @@ extern bool kvm_nvhe_sym(smccc_trng_available); struct kvm_iommu_ops { int (*init)(void); bool (*host_smc_handler)(struct kvm_cpu_context *host_ctxt); + void (*host_stage2_set_owner)(phys_addr_t addr, size_t size, u8 owner_id); }; extern struct kvm_iommu_ops kvm_iommu_ops; diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 7d742c771a1b..5ec7b76657b5 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -576,14 +576,20 @@ static kvm_pte_t kvm_init_invalid_leaf_owner(u8 owner_id) int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, u8 owner_id) { kvm_pte_t annotation; + int ret; if (owner_id > KVM_MAX_OWNER_ID) return -EINVAL; annotation = kvm_init_invalid_leaf_owner(owner_id); - return host_stage2_try(kvm_pgtable_stage2_annotate, &host_mmu.pgt, - addr, size, &host_s2_pool, annotation); + ret = host_stage2_try(kvm_pgtable_stage2_annotate, &host_mmu.pgt, + addr, size, &host_s2_pool, annotation); + + if (!ret && kvm_iommu_ops.host_stage2_set_owner) + kvm_iommu_ops.host_stage2_set_owner(addr, size, owner_id); + + return ret; } static bool host_stage2_force_pte_cb(u64 addr, u64 end, enum kvm_pgtable_prot prot) From 59d406c88deed2694c09b59c65b2fe240765b347 Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Thu, 28 Oct 2021 14:26:18 +0100 Subject: [PATCH 164/457] ANDROID: KVM: arm64: Add 'host_mmio_dabt_handler' to kvm_iommu_ops Add a new kvm_iommu_ops hook which allows the IOMMU driver to handle data aborts in unmapped device memory regions. If the abort is handled by the driver, the global abort handler will not attempt to map in the page. For example, this enables the IOMMU driver to virtualize access to the underlying IOMMU hardware, or to allow access to a subset of the functionality, eg. performance counters. 
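For instance, a driver could emulate host reads of a single counter register rather than letting the whole MMIO page be mapped; hypothetical sketch (the MY_IOMMU_* constants and base pointer are made up):

    static bool my_iommu_host_mmio_dabt_handler(struct kvm_cpu_context *host_ctxt,
                                                phys_addr_t fault_pa,
                                                unsigned int len,
                                                bool is_write, int rd)
    {
        if (fault_pa != MY_IOMMU_PMU_PA || is_write || len != sizeof(u32))
            return false;   /* not ours: fall back to the default path */

        cpu_reg(host_ctxt, rd) = readl_relaxed(my_iommu_va + MY_IOMMU_PMU_OFF);
        return true;        /* handled: the abort handler skips the insn */
    }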
Test: builds, boots Bug: 190463801 Change-Id: I84adbc992e577ac6ceb09f4856e1c648df580f76 Signed-off-by: David Brazdil (cherry picked from commit 25f81ec77b8915f45ba2d2c393cba37788b6eef8) Signed-off-by: Mostafa Saleh Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_hyp.h | 5 +++- arch/arm64/kvm/hyp/nvhe/mem_protect.c | 37 +++++++++++++++++++++++++-- 2 files changed, 39 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index 61b8354e1b7d..ffed0238279f 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -134,7 +134,10 @@ extern bool kvm_nvhe_sym(smccc_trng_available); struct kvm_iommu_ops { int (*init)(void); bool (*host_smc_handler)(struct kvm_cpu_context *host_ctxt); - void (*host_stage2_set_owner)(phys_addr_t addr, size_t size, u8 owner_id); + bool (*host_mmio_dabt_handler)(struct kvm_cpu_context *host_ctxt, + phys_addr_t fault_pa, unsigned int len, + bool is_write, int rd); + void (*host_stage2_set_owner)(phys_addr_t addr, size_t size, u32 owner_id); }; extern struct kvm_iommu_ops kvm_iommu_ops; diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 5ec7b76657b5..eb6fe11362d4 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -12,6 +12,7 @@ #include #include +#include #include #include @@ -679,17 +680,49 @@ static void host_inject_abort(struct kvm_cpu_context *host_ctxt) write_sysreg_el2(spsr, SYS_SPSR); } +static int host_mmio_dabt_handler(struct kvm_cpu_context *host_ctxt, u32 esr, + phys_addr_t addr) +{ + bool wnr = esr & ESR_ELx_WNR; + unsigned int len = BIT((esr & ESR_ELx_SAS) >> ESR_ELx_SAS_SHIFT); + int rd = (esr & ESR_ELx_SRT_MASK) >> ESR_ELx_SRT_SHIFT; + bool handled = false; + + if (kvm_iommu_ops.host_mmio_dabt_handler) { + handled = kvm_iommu_ops.host_mmio_dabt_handler(host_ctxt, addr, + len, wnr, rd); + } + + if (!handled) + return -EPERM; + + kvm_skip_host_instr(); + return 0; +} + +static bool is_dabt(u64 esr) +{ + return ESR_ELx_EC(esr) == ESR_ELx_EC_DABT_LOW; +} + void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt) { struct kvm_vcpu_fault_info fault; u64 esr, addr; - int ret = 0; + int ret = -EPERM; esr = read_sysreg_el2(SYS_ESR); BUG_ON(!__get_fault_info(esr, &fault)); addr = (fault.hpfar_el2 & HPFAR_MASK) << 8; - ret = host_stage2_idmap(addr); + + /* See if any subsystem can handle this abort. */ + if (is_dabt(esr) && !addr_is_memory(addr)) + ret = host_mmio_dabt_handler(host_ctxt, esr, addr); + + /* If not handled, attempt to map the page. */ + if (ret == -EPERM) + ret = host_stage2_idmap(addr); if (ret == -EPERM) host_inject_abort(host_ctxt); From 2bc6495fedcec3d3e8ab5bb4ce49435a08c6d3a4 Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Thu, 28 Oct 2021 14:23:05 +0100 Subject: [PATCH 165/457] ANDROID: KVM: arm64: Add 'host_stage2_adjust_mmio_range' to kvm_iommu_ops Add a new kvm_iommu_ops hook to the lower-EL instruction/data abort handler, which allows the IOMMU driver to restrict the region of device memory that is about to be mapped in the host stage-2. This can be used by the IOMMU driver to restrict access to the MMIO registers of the IOMMU itself. 
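A driver implementation typically rejects the fault outright if it targets the driver's own registers, and otherwise trims the range so the identity mapping stops short of them; hypothetical sketch (the MY_IOMMU_MMIO_* constants are placeholders):

    static int my_iommu_host_stage2_adjust_mmio_range(phys_addr_t addr,
                                                      phys_addr_t *start,
                                                      phys_addr_t *end)
    {
        phys_addr_t lo = MY_IOMMU_MMIO_BASE;
        phys_addr_t hi = MY_IOMMU_MMIO_BASE + MY_IOMMU_MMIO_SIZE;

        if (addr >= lo && addr < hi)
            return -EPERM;

        /* Trim the permitted range so it never covers our registers. */
        if (addr < lo)
            *end = min(*end, lo);
        else
            *start = max(*start, hi);

        return 0;
    }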
Test: builds, boots Bug: 190463801 Change-Id: I51cf3cfd84c889627e290d74579657447964ca16 Signed-off-by: David Brazdil (cherry picked from commit cc1ad46fb2618ea2346e52ace4b010b364d10e65) Signed-off-by: Mostafa Saleh Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_hyp.h | 2 ++ arch/arm64/kvm/hyp/nvhe/mem_protect.c | 11 +++++++++++ 2 files changed, 13 insertions(+) diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index ffed0238279f..4728b1321d51 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -138,6 +138,8 @@ struct kvm_iommu_ops { phys_addr_t fault_pa, unsigned int len, bool is_write, int rd); void (*host_stage2_set_owner)(phys_addr_t addr, size_t size, u32 owner_id); + int (*host_stage2_adjust_mmio_range)(phys_addr_t addr, phys_addr_t *start, + phys_addr_t *end); }; extern struct kvm_iommu_ops kvm_iommu_ops; diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index eb6fe11362d4..c6f40dfc63ef 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -624,6 +624,17 @@ static int host_stage2_idmap(u64 addr) prot = is_memory ? PKVM_HOST_MEM_PROT : PKVM_HOST_MMIO_PROT; + /** + * Let device drivers adjust the permitted range first. + * host_stage2_adjust_range() should be last to also properly align it. + */ + if (!is_memory && kvm_iommu_ops.host_stage2_adjust_mmio_range) { + ret = kvm_iommu_ops.host_stage2_adjust_mmio_range(addr, &range.start, + &range.end); + if (ret) + return ret; + } + host_lock_component(); ret = host_stage2_adjust_range(addr, &range); if (ret) From c4d2c4f644c993b715082abf28259be84717bc57 Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Wed, 7 Jul 2021 14:54:24 +0000 Subject: [PATCH 166/457] ANDROID: KVM: arm64: Create empty S2MPU driver Create a skeleton driver for the S2MPU - an EL1 portion called during KVM init which will parse the DT and configure the kernel, and an EL2 portion which will program the S2MPUs later at runtime. The code is behind CONFIG_KVM_S2MPU. 
Test: builds, boots Bug: 190463801 Change-Id: I58206535f3493e1d989576a9db2112d370a1cb4d Signed-off-by: David Brazdil (cherry picked from commit b2de5483b7c5977dccfa85bde756e47f53a0ab42) Signed-off-by: Mostafa Saleh Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_host.h | 7 +++++++ arch/arm64/include/asm/kvm_hyp.h | 1 + arch/arm64/kvm/Kconfig | 9 +++++++++ arch/arm64/kvm/Makefile | 2 +- arch/arm64/kvm/arm.c | 8 +++++++- arch/arm64/kvm/hyp/nvhe/Makefile | 2 ++ arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c | 11 +++++++++++ arch/arm64/kvm/hyp/nvhe/setup.c | 6 ++++++ arch/arm64/kvm/iommu/Makefile | 6 ++++++ arch/arm64/kvm/iommu/s2mpu.c | 13 +++++++++++++ 10 files changed, 63 insertions(+), 2 deletions(-) create mode 100644 arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c create mode 100644 arch/arm64/kvm/iommu/Makefile create mode 100644 arch/arm64/kvm/iommu/s2mpu.c diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 41337d5ae41a..36daec0bbb0f 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -381,8 +381,15 @@ extern u64 kvm_nvhe_sym(hyp_cpu_logical_map)[NR_CPUS]; enum kvm_iommu_driver { KVM_IOMMU_DRIVER_NONE, + KVM_IOMMU_DRIVER_S2MPU, }; +#ifdef CONFIG_KVM_S2MPU +int kvm_s2mpu_init(void); +#else +static inline int kvm_s2mpu_init(void) { return -ENODEV; } +#endif + struct vcpu_reset_state { unsigned long pc; unsigned long r0; diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index 4728b1321d51..4c343db80324 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -143,5 +143,6 @@ struct kvm_iommu_ops { }; extern struct kvm_iommu_ops kvm_iommu_ops; +extern const struct kvm_iommu_ops kvm_s2mpu_ops; #endif /* __ARM64_KVM_HYP_H__ */ diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index 815cc118c675..9d7b7718e3f0 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -69,4 +69,13 @@ config PROTECTED_NVHE_STACKTRACE If unsure, or not using protected nVHE (pKVM), say N. +config KVM_S2MPU + bool "Stage-2 Memory Protection Unit support" + depends on KVM + help + Support for the Stage-2 Memory Protection Unit (S2MPU) and Stream + Security Mapping Table (SSMT) devices in KVM. This allows the + hypervisor to restrict DMA access to its memory and the memory of + protected guests. + endif # VIRTUALIZATION diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile index 5e33c2d4645a..ccec383ad673 100644 --- a/arch/arm64/kvm/Makefile +++ b/arch/arm64/kvm/Makefile @@ -8,7 +8,7 @@ ccflags-y += -I $(srctree)/$(src) include $(srctree)/virt/kvm/Makefile.kvm obj-$(CONFIG_KVM) += kvm.o -obj-$(CONFIG_KVM) += hyp/ +obj-$(CONFIG_KVM) += hyp/ iommu/ kvm-y += arm.o mmu.o mmio.o psci.o hypercalls.o pvtime.o \ inject_fault.o va_layout.o handle_exit.o \ diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 977a58b4c8a7..efeeb56e823c 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1921,7 +1921,13 @@ static bool init_psci_relay(void) static int init_stage2_iommu(void) { - return KVM_IOMMU_DRIVER_NONE; + int ret; + + ret = kvm_s2mpu_init(); + if (!ret) + return KVM_IOMMU_DRIVER_S2MPU; + + return (ret == -ENODEV) ? 
KVM_IOMMU_DRIVER_NONE : ret; } static int init_subsystems(void) diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile index 1b34d3ff57f3..7086c6134331 100644 --- a/arch/arm64/kvm/hyp/nvhe/Makefile +++ b/arch/arm64/kvm/hyp/nvhe/Makefile @@ -28,6 +28,8 @@ hyp-obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \ hyp-obj-$(CONFIG_DEBUG_LIST) += list_debug.o hyp-obj-y += $(lib-objs) +hyp-obj-$(CONFIG_KVM_S2MPU) += iommu/s2mpu.o + ## ## Build rules for compiling nVHE hyp code ## Output of this folder is `kvm_nvhe.o`, a partially linked object diff --git a/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c b/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c new file mode 100644 index 000000000000..1731300e0bec --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c @@ -0,0 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2021 - Google LLC + * Author: David Brazdil + */ + +#include + +#include + +const struct kvm_iommu_ops kvm_s2mpu_ops = (struct kvm_iommu_ops){}; diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c index 16363765ffec..1716b70f9286 100644 --- a/arch/arm64/kvm/hyp/nvhe/setup.c +++ b/arch/arm64/kvm/hyp/nvhe/setup.c @@ -306,6 +306,12 @@ int select_iommu_ops(enum kvm_iommu_driver driver) switch (driver) { case KVM_IOMMU_DRIVER_NONE: return 0; + case KVM_IOMMU_DRIVER_S2MPU: + if (IS_ENABLED(CONFIG_KVM_S2MPU)) { + kvm_iommu_ops = kvm_s2mpu_ops; + return 0; + } + break; } return -EINVAL; diff --git a/arch/arm64/kvm/iommu/Makefile b/arch/arm64/kvm/iommu/Makefile new file mode 100644 index 000000000000..2a51f8cb2848 --- /dev/null +++ b/arch/arm64/kvm/iommu/Makefile @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for Kernel-based Virtual Machine module +# + +obj-$(CONFIG_KVM_S2MPU) += s2mpu.o diff --git a/arch/arm64/kvm/iommu/s2mpu.c b/arch/arm64/kvm/iommu/s2mpu.c new file mode 100644 index 000000000000..800d264e4be1 --- /dev/null +++ b/arch/arm64/kvm/iommu/s2mpu.c @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2021 - Google LLC + * Author: David Brazdil + */ + +#include + +int kvm_s2mpu_init(void) +{ + kvm_info("S2MPU driver initialized\n"); + return 0; +} From 3c1759f91d4fede602b7e106318bbac8efff9839 Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Thu, 17 Jun 2021 14:23:25 +0000 Subject: [PATCH 167/457] ANDROID: KVM: arm64: Parse S2MPU MMIO region Start EL1 portion of the S2MPU driver with an init function which probes the Device tree for nodes compatible with 'google,s2mpu'. Parse and check the base, size and power domain ID. 
Test: builds, boots Bug: 190463801 Change-Id: I5f0b32febb4e922fdfdfe10a9a9c823e20b8e26f Signed-off-by: David Brazdil (cherry picked from commit 4e91a0015386e3449c9da101175fb4c613df1179) Signed-off-by: Mostafa Saleh Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_s2mpu.h | 16 +++++++ arch/arm64/kvm/iommu/s2mpu.c | 77 +++++++++++++++++++++++++++++- 2 files changed, 92 insertions(+), 1 deletion(-) create mode 100644 arch/arm64/include/asm/kvm_s2mpu.h diff --git a/arch/arm64/include/asm/kvm_s2mpu.h b/arch/arm64/include/asm/kvm_s2mpu.h new file mode 100644 index 000000000000..ba0db7988117 --- /dev/null +++ b/arch/arm64/include/asm/kvm_s2mpu.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2021 - Google LLC + * Author: David Brazdil + */ + +#ifndef __ARM64_KVM_S2MPU_H__ +#define __ARM64_KVM_S2MPU_H__ + +enum s2mpu_power_state { + S2MPU_POWER_ALWAYS_ON, + S2MPU_POWER_ON, + S2MPU_POWER_OFF, +}; + +#endif /* __ARM64_KVM_S2MPU_H__ */ diff --git a/arch/arm64/kvm/iommu/s2mpu.c b/arch/arm64/kvm/iommu/s2mpu.c index 800d264e4be1..9a9e2db44037 100644 --- a/arch/arm64/kvm/iommu/s2mpu.c +++ b/arch/arm64/kvm/iommu/s2mpu.c @@ -5,9 +5,84 @@ */ #include +#include + +#include +#include + +#define S2MPU_MMIO_SIZE SZ_64K + +static int s2mpu_probe(struct platform_device *pdev) +{ + struct resource *res; + void __iomem *kaddr; + size_t res_size; + enum s2mpu_power_state power_state = S2MPU_POWER_ALWAYS_ON; + u32 power_domain_id = 0; + int ret; + + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!res) { + dev_err(&pdev->dev, "failed to parse 'reg'"); + return -EINVAL; + } + + /* devm_ioremap_resource internally calls devm_request_mem_region. */ + kaddr = devm_ioremap_resource(&pdev->dev, res); + if (IS_ERR(kaddr)) { + dev_err(&pdev->dev, "could not ioremap resource: %ld", + PTR_ERR(kaddr)); + return PTR_ERR(kaddr); + } + + if (!PAGE_ALIGNED(res->start)) { + dev_err(&pdev->dev, "base address must be page-aligned (0x%llx)", + res->start); + return -EINVAL; + } + + res_size = resource_size(res); + if (res_size != S2MPU_MMIO_SIZE) { + dev_err(&pdev->dev, + "unexpected device region size (expected=%u, actual=%lu)", + S2MPU_MMIO_SIZE, res_size); + return -EINVAL; + } + + ret = of_property_read_u32(pdev->dev.of_node, "power-domain-id", + &power_domain_id); + if (!ret) { + power_state = S2MPU_POWER_ON; + } else if (ret != -EINVAL) { + dev_err(&pdev->dev, "failed to parse power-domain-id: %d", ret); + return ret; + } + + return 0; +} + +static const struct of_device_id of_table[] = { + { .compatible = "google,s2mpu" }, + {}, +}; + +static struct platform_driver of_driver = { + .driver = { + .name = "kvm,s2mpu", + .of_match_table = of_table, + }, +}; int kvm_s2mpu_init(void) { + int ret; + + ret = platform_driver_probe(&of_driver, s2mpu_probe); + if (ret) + goto out; + kvm_info("S2MPU driver initialized\n"); - return 0; + +out: + return ret; } From ed7f0fcd8f6ffee7e6ecd664dd9b359d0c941a5b Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Fri, 9 Jul 2021 11:47:09 +0100 Subject: [PATCH 168/457] ANDROID: KVM: arm64: Read and check S2MPU_VERSION Read S2MPU_VERSION during driver init and check it against list of supported versions. The register fields are as follows: - MAJOR_ARCH_VER, - MINOR_ARCH_VER, - REV_ARCH_VER, - RTL_VER. Their exact use is not documented. For now, we mask out RTL_VER and expect a match on MAJOR_, MINOR_ and REV_ARCH_VER. This may be tweaked in the future. 
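As a worked example (the RTL revision value is made up):

    /*
     * VERSION register     = 0x20000042   (hypothetical RTL rev 0x42)
     * VERSION_CHECK_MASK   = MAJOR | MINOR | REV = 0xffff0000
     * VERSION & CHECK_MASK = 0x20000000  -> matches S2MPU_VERSION_9
     *
     * A pure RTL respin (e.g. 0x20000043) still matches, while a minor
     * architecture bump (0x21000000) is rejected until added to the list.
     */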
Test: builds, boots Bug: 190463801 Change-Id: I9709fde5f4d3ca4c23f84919c37b081302846917 Signed-off-by: David Brazdil (cherry picked from commit 4a7da93bdbfcd166f5bb67a69d6f017da158c7e2) Signed-off-by: Mostafa Saleh Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_s2mpu.h | 17 +++++++++++++++++ arch/arm64/kvm/iommu/s2mpu.c | 12 +++++++++++- 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/kvm_s2mpu.h b/arch/arm64/include/asm/kvm_s2mpu.h index ba0db7988117..2ae35aba43e6 100644 --- a/arch/arm64/include/asm/kvm_s2mpu.h +++ b/arch/arm64/include/asm/kvm_s2mpu.h @@ -7,6 +7,23 @@ #ifndef __ARM64_KVM_S2MPU_H__ #define __ARM64_KVM_S2MPU_H__ +#define REG_NS_VERSION 0x60 + +#define VERSION_MAJOR_ARCH_VER_MASK GENMASK(31, 28) +#define VERSION_MINOR_ARCH_VER_MASK GENMASK(27, 24) +#define VERSION_REV_ARCH_VER_MASK GENMASK(23, 16) +#define VERSION_RTL_VER_MASK GENMASK(7, 0) + +/* Ignore RTL version in driver version check. */ +#define VERSION_CHECK_MASK (VERSION_MAJOR_ARCH_VER_MASK | \ + VERSION_MINOR_ARCH_VER_MASK | \ + VERSION_REV_ARCH_VER_MASK) + +enum s2mpu_version { + S2MPU_VERSION_8 = 0x11000000, + S2MPU_VERSION_9 = 0x20000000, +}; + enum s2mpu_power_state { S2MPU_POWER_ALWAYS_ON, S2MPU_POWER_ON, diff --git a/arch/arm64/kvm/iommu/s2mpu.c b/arch/arm64/kvm/iommu/s2mpu.c index 9a9e2db44037..5d7a87dee0d3 100644 --- a/arch/arm64/kvm/iommu/s2mpu.c +++ b/arch/arm64/kvm/iommu/s2mpu.c @@ -18,7 +18,7 @@ static int s2mpu_probe(struct platform_device *pdev) void __iomem *kaddr; size_t res_size; enum s2mpu_power_state power_state = S2MPU_POWER_ALWAYS_ON; - u32 power_domain_id = 0; + u32 version, power_domain_id = 0; int ret; res = platform_get_resource(pdev, IORESOURCE_MEM, 0); @@ -58,6 +58,16 @@ static int s2mpu_probe(struct platform_device *pdev) return ret; } + version = readl_relaxed(kaddr + REG_NS_VERSION); + switch (version & VERSION_CHECK_MASK) { + case S2MPU_VERSION_8: + case S2MPU_VERSION_9: + break; + default: + dev_err(&pdev->dev, "unexpected version 0x%08x", version); + return -EINVAL; + } + return 0; } From 529308f73d6fd46f84efaaecf1baec9dffbf7391 Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Tue, 10 Aug 2021 12:23:39 +0100 Subject: [PATCH 169/457] ANDROID: KVM: arm64: Allocate context IDs for valid VIDs S2MPU_CONTEXT_CFG_VALID_VID register must be configured on v9, allocating a context ID in range 0 to S2MPU_NUM_CONTEXT to each valid VID. For now assume that all 8 VIDs are valid. This will change once the hypervisor takes control over SSMT configuration as well. If there are more VIDs than available context IDs, the driver prints a warning that DMA may be blocked and continues. 
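Each context occupies one nibble of CONTEXT_CFG_VALID_VID (bit 3 = valid, bits 2:0 = the VID it serves), so the two interesting configurations work out as follows (illustrative):

    /*
     * NUM_CONTEXT = 8, valid VID bitmap = 0xff:
     *   context i is assigned VID i, every nibble is (0x8 | vid)
     *   => CONTEXT_CFG_VALID_VID = 0xfedcba98
     *
     * NUM_CONTEXT = 2, valid VID bitmap = 0xff:
     *   only VIDs 0 and 1 get a context => 0x00000098, and the driver
     *   warns that DMA from the remaining VIDs may be blocked.
     */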
Test: builds, boots Bug: 190463801 Change-Id: I0c9e0a5c9470b27debaade2c4e02e16c6577fbfe Signed-off-by: David Brazdil (cherry picked from commit 923353be1e550f2f7da3529e0ddee2f4bc563f6d) Signed-off-by: Mostafa Saleh Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_s2mpu.h | 14 +++++++ arch/arm64/kvm/iommu/s2mpu.c | 60 ++++++++++++++++++++++++++++++ 2 files changed, 74 insertions(+) diff --git a/arch/arm64/include/asm/kvm_s2mpu.h b/arch/arm64/include/asm/kvm_s2mpu.h index 2ae35aba43e6..1c4f4a85cd53 100644 --- a/arch/arm64/include/asm/kvm_s2mpu.h +++ b/arch/arm64/include/asm/kvm_s2mpu.h @@ -7,7 +7,15 @@ #ifndef __ARM64_KVM_S2MPU_H__ #define __ARM64_KVM_S2MPU_H__ +#include + +#define NR_VIDS 8 +#define NR_CTX_IDS 8 + +#define ALL_VIDS_BITMAP GENMASK(NR_VIDS - 1, 0) + #define REG_NS_VERSION 0x60 +#define REG_NS_NUM_CONTEXT 0x100 #define VERSION_MAJOR_ARCH_VER_MASK GENMASK(31, 28) #define VERSION_MINOR_ARCH_VER_MASK GENMASK(27, 24) @@ -19,6 +27,12 @@ VERSION_MINOR_ARCH_VER_MASK | \ VERSION_REV_ARCH_VER_MASK) +#define NUM_CONTEXT_MASK GENMASK(3, 0) + +#define CONTEXT_CFG_VALID_VID_CTX_VALID(ctx) BIT((4 * (ctx)) + 3) +#define CONTEXT_CFG_VALID_VID_CTX_VID(ctx, vid) \ + FIELD_PREP(GENMASK((4 * (ctx) + 2), 4 * (ctx)), (vid)) + enum s2mpu_version { S2MPU_VERSION_8 = 0x11000000, S2MPU_VERSION_9 = 0x20000000, diff --git a/arch/arm64/kvm/iommu/s2mpu.c b/arch/arm64/kvm/iommu/s2mpu.c index 5d7a87dee0d3..9bbd473e276c 100644 --- a/arch/arm64/kvm/iommu/s2mpu.c +++ b/arch/arm64/kvm/iommu/s2mpu.c @@ -12,6 +12,62 @@ #define S2MPU_MMIO_SIZE SZ_64K +#define CTX_CFG_ENTRY(ctxid, nr_ctx, vid) \ + (CONTEXT_CFG_VALID_VID_CTX_VID(ctxid, vid) \ + | (((ctxid) < (nr_ctx)) ? CONTEXT_CFG_VALID_VID_CTX_VALID(ctxid) : 0)) + +static u32 gen_ctx_cfg_valid_vid(struct platform_device *pdev, + unsigned int num_ctx, u32 vid_bmap) +{ + u8 ctx_vid[NR_CTX_IDS] = { 0 }; + unsigned int vid, ctx = 0; + + /* Check NUM_CONTEXT value is within bounds. This should not happen. */ + if (WARN_ON(num_ctx > NR_CTX_IDS)) + num_ctx = NR_CTX_IDS; + + while (vid_bmap) { + /* Break if we cannot allocate more. */ + if (ctx >= num_ctx) { + dev_warn(&pdev->dev, + "could not allocate all context IDs, DMA may be blocked (VID bitmap: 0x%x)", + vid_bmap); + break; + } + + vid = __ffs(vid_bmap); + vid_bmap &= ~BIT(vid); + ctx_vid[ctx++] = vid; + } + + /* The following loop was unrolled so bitmasks are constant. 
*/ + BUILD_BUG_ON(NR_CTX_IDS != 8); + return CTX_CFG_ENTRY(0, ctx, ctx_vid[0]) + | CTX_CFG_ENTRY(1, ctx, ctx_vid[1]) + | CTX_CFG_ENTRY(2, ctx, ctx_vid[2]) + | CTX_CFG_ENTRY(3, ctx, ctx_vid[3]) + | CTX_CFG_ENTRY(4, ctx, ctx_vid[4]) + | CTX_CFG_ENTRY(5, ctx, ctx_vid[5]) + | CTX_CFG_ENTRY(6, ctx, ctx_vid[6]) + | CTX_CFG_ENTRY(7, ctx, ctx_vid[7]); +} + +static int s2mpu_probe_v9(struct platform_device *pdev, void __iomem *kaddr) +{ + unsigned int num_ctx; + u32 ssmt_valid_vid_bmap, ctx_cfg_valid_vid; + + ssmt_valid_vid_bmap = ALL_VIDS_BITMAP; + num_ctx = readl_relaxed(kaddr + REG_NS_NUM_CONTEXT) & NUM_CONTEXT_MASK; + ctx_cfg_valid_vid = gen_ctx_cfg_valid_vid(pdev, num_ctx, ssmt_valid_vid_bmap); + if (!ctx_cfg_valid_vid) { + dev_err(&pdev->dev, "failed to allocate context IDs"); + return -EINVAL; + } + + return 0; +} + static int s2mpu_probe(struct platform_device *pdev) { struct resource *res; @@ -61,7 +117,11 @@ static int s2mpu_probe(struct platform_device *pdev) version = readl_relaxed(kaddr + REG_NS_VERSION); switch (version & VERSION_CHECK_MASK) { case S2MPU_VERSION_8: + break; case S2MPU_VERSION_9: + ret = s2mpu_probe_v9(pdev, kaddr); + if (ret) + return ret; break; default: dev_err(&pdev->dev, "unexpected version 0x%08x", version); From 8877952284cffa066e7ed4061273170b29f3c061 Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Wed, 11 Aug 2021 16:04:46 +0100 Subject: [PATCH 170/457] ANDROID: KVM: arm64: Implement IRQ handler for S2MPU faults The S2MPU can be configured to trigger an interrupt on faults: access permission (both regular and during page table walks) and if no matching context ID is found for request's VID (v9 only). When interrupt information is provided in the S2MPU's DT node, parse the information and enable an IRQ handler. Later patch will enable the functionality in the S2MPU. Test: builds, boots Bug: 190463801 Change-Id: I11d1a896406011cff1506ee1bd124bfc66ffa914 Signed-off-by: David Brazdil (cherry picked from commit 2517c4e5f0cd5bb9e3c934273c562f1c650d6daa) Signed-off-by: Mostafa Saleh Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_s2mpu.h | 18 +++++ arch/arm64/kvm/iommu/s2mpu.c | 102 +++++++++++++++++++++++++++++ 2 files changed, 120 insertions(+) diff --git a/arch/arm64/include/asm/kvm_s2mpu.h b/arch/arm64/include/asm/kvm_s2mpu.h index 1c4f4a85cd53..d513c56b2c9c 100644 --- a/arch/arm64/include/asm/kvm_s2mpu.h +++ b/arch/arm64/include/asm/kvm_s2mpu.h @@ -14,8 +14,16 @@ #define ALL_VIDS_BITMAP GENMASK(NR_VIDS - 1, 0) +#define REG_NS_INTERRUPT_CLEAR 0x2c #define REG_NS_VERSION 0x60 #define REG_NS_NUM_CONTEXT 0x100 +#define REG_NS_FAULT_STATUS 0x2000 +#define REG_NS_FAULT_PA_LOW(vid) (0x2004 + ((vid) * 0x20)) +#define REG_NS_FAULT_PA_HIGH(vid) (0x2008 + ((vid) * 0x20)) +#define REG_NS_FAULT_INFO(vid) (0x2010 + ((vid) * 0x20)) + +/* For use with hi_lo_readq_relaxed(). 
*/ +#define REG_NS_FAULT_PA_HIGH_LOW(vid) REG_NS_FAULT_PA_LOW(vid) #define VERSION_MAJOR_ARCH_VER_MASK GENMASK(31, 28) #define VERSION_MINOR_ARCH_VER_MASK GENMASK(27, 24) @@ -33,6 +41,16 @@ #define CONTEXT_CFG_VALID_VID_CTX_VID(ctx, vid) \ FIELD_PREP(GENMASK((4 * (ctx) + 2), 4 * (ctx)), (vid)) +#define NR_FAULT_INFO_REGS 8 +#define FAULT_INFO_VID_MASK GENMASK(26, 24) +#define FAULT_INFO_TYPE_MASK GENMASK(23, 21) +#define FAULT_INFO_TYPE_CONTEXT 0x4 /* v9 only */ +#define FAULT_INFO_TYPE_AP 0x2 +#define FAULT_INFO_TYPE_MPTW 0x1 +#define FAULT_INFO_RW_BIT BIT(20) +#define FAULT_INFO_LEN_MASK GENMASK(19, 16) +#define FAULT_INFO_ID_MASK GENMASK(15, 0) + enum s2mpu_version { S2MPU_VERSION_8 = 0x11000000, S2MPU_VERSION_9 = 0x20000000, diff --git a/arch/arm64/kvm/iommu/s2mpu.c b/arch/arm64/kvm/iommu/s2mpu.c index 9bbd473e276c..5ac5544a30c9 100644 --- a/arch/arm64/kvm/iommu/s2mpu.c +++ b/arch/arm64/kvm/iommu/s2mpu.c @@ -4,6 +4,7 @@ * Author: David Brazdil */ +#include #include #include @@ -16,6 +17,62 @@ (CONTEXT_CFG_VALID_VID_CTX_VID(ctxid, vid) \ | (((ctxid) < (nr_ctx)) ? CONTEXT_CFG_VALID_VID_CTX_VALID(ctxid) : 0)) +struct s2mpu_irq_info { + struct device *dev; + void __iomem *va; +}; + +static irqreturn_t s2mpu_irq_handler(int irq, void *data) +{ + struct s2mpu_irq_info *info = data; + unsigned int vid; + u32 vid_bmap, fault_info; + phys_addr_t fault_pa; + const char *fault_type; + irqreturn_t ret = IRQ_NONE; + + while ((vid_bmap = readl_relaxed(info->va + REG_NS_FAULT_STATUS))) { + WARN_ON_ONCE(vid_bmap & (~ALL_VIDS_BITMAP)); + vid = __ffs(vid_bmap); + + fault_pa = hi_lo_readq_relaxed(info->va + REG_NS_FAULT_PA_HIGH_LOW(vid)); + fault_info = readl_relaxed(info->va + REG_NS_FAULT_INFO(vid)); + WARN_ON(FIELD_GET(FAULT_INFO_VID_MASK, fault_info) != vid); + + switch (FIELD_GET(FAULT_INFO_TYPE_MASK, fault_info)) { + case FAULT_INFO_TYPE_MPTW: + fault_type = "MPTW fault"; + break; + case FAULT_INFO_TYPE_AP: + fault_type = "access permission fault"; + break; + case FAULT_INFO_TYPE_CONTEXT: + fault_type = "context fault"; + break; + default: + fault_type = "unknown fault"; + break; + } + + dev_err(info->dev, "\n" + "============== S2MPU FAULT DETECTED ==============\n" + " PA=0x%pap, FAULT_INFO=0x%08x\n" + " DIRECTION: %s, TYPE: %s\n" + " VID=%u, REQ_LENGTH=%lu, REQ_AXI_ID=%lu\n" + "==================================================\n", + &fault_pa, fault_info, + (fault_info & FAULT_INFO_RW_BIT) ? "write" : "read", + fault_type, vid, + FIELD_GET(FAULT_INFO_LEN_MASK, fault_info), + FIELD_GET(FAULT_INFO_ID_MASK, fault_info)); + + writel_relaxed(BIT(vid), info->va + REG_NS_INTERRUPT_CLEAR); + ret = IRQ_HANDLED; + } + + return ret; +} + static u32 gen_ctx_cfg_valid_vid(struct platform_device *pdev, unsigned int num_ctx, u32 vid_bmap) { @@ -68,6 +125,44 @@ static int s2mpu_probe_v9(struct platform_device *pdev, void __iomem *kaddr) return 0; } +/** + * Parse interrupt information from DT and if found, register IRQ handler. + * This is considered optional and will not fail even if the initialization is + * unsuccessful. In that case the IRQ will remain masked. + */ +static void s2mpu_probe_irq(struct platform_device *pdev, void __iomem *kaddr) +{ + struct s2mpu_irq_info *irq_info; + int ret, irq; + + irq = platform_get_irq_optional(pdev, 0); + + if (irq == -ENXIO) + return; /* No IRQ specified. */ + + if (irq < 0) { + /* IRQ specified but failed to parse. 
*/ + dev_err(&pdev->dev, "failed to parse IRQ, IRQ not enabled"); + return; + } + + irq_info = devm_kmalloc(&pdev->dev, sizeof(*irq_info), GFP_KERNEL); + if (!irq_info) + return; + + *irq_info = (struct s2mpu_irq_info){ + .dev = &pdev->dev, + .va = kaddr, + }; + + ret = devm_request_irq(&pdev->dev, irq, s2mpu_irq_handler, 0, + dev_name(&pdev->dev), irq_info); + if (ret) { + dev_err(&pdev->dev, "failed to register IRQ, IRQ not enabled"); + return; + } +} + static int s2mpu_probe(struct platform_device *pdev) { struct resource *res; @@ -114,6 +209,13 @@ static int s2mpu_probe(struct platform_device *pdev) return ret; } + /* + * Try to parse IRQ information. This is optional as it only affects + * runtime fault reporting, and therefore errors do not fail the whole + * driver initialization. + */ + s2mpu_probe_irq(pdev, kaddr); + version = readl_relaxed(kaddr + REG_NS_VERSION); switch (version & VERSION_CHECK_MASK) { case S2MPU_VERSION_8: From 8930f3af8383136fc304f6f0cc44e6b28f567124 Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Thu, 17 Jun 2021 20:58:16 +0000 Subject: [PATCH 171/457] ANDROID: KVM: arm64: Copy S2MPU configuration to hyp Create variables in hyp that will hold the DT information about S2MPUs to use by hyp at runtime. Copy the information from EL1 to EL2. The EL1 code computes the size of the data and allocates a sufficient number of pages, which hyp will later take ownership of. Test: builds, boots Bug: 190463801 Change-Id: Ic3d4bfa3ec11f7c2e1b4474910e2f57a62139a75 Signed-off-by: David Brazdil (cherry picked from commit bc80f81582975ba3d19162d3c4ee1e725b9e36bd) Signed-off-by: Mostafa Saleh Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_s2mpu.h | 17 ++++- arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c | 3 + arch/arm64/kvm/iommu/s2mpu.c | 89 +++++++++++++++++++++++---- 3 files changed, 96 insertions(+), 13 deletions(-) diff --git a/arch/arm64/include/asm/kvm_s2mpu.h b/arch/arm64/include/asm/kvm_s2mpu.h index d513c56b2c9c..d1ed76c5f066 100644 --- a/arch/arm64/include/asm/kvm_s2mpu.h +++ b/arch/arm64/include/asm/kvm_s2mpu.h @@ -57,9 +57,24 @@ enum s2mpu_version { }; enum s2mpu_power_state { - S2MPU_POWER_ALWAYS_ON, + S2MPU_POWER_ALWAYS_ON = 0, S2MPU_POWER_ON, S2MPU_POWER_OFF, }; +struct s2mpu { + phys_addr_t pa; + void __iomem *va; + u32 version; + enum s2mpu_power_state power_state; + u32 power_domain_id; + u32 context_cfg_valid_vid; +}; + +extern size_t kvm_nvhe_sym(kvm_hyp_nr_s2mpus); +#define kvm_hyp_nr_s2mpus kvm_nvhe_sym(kvm_hyp_nr_s2mpus) + +extern struct s2mpu *kvm_nvhe_sym(kvm_hyp_s2mpus); +#define kvm_hyp_s2mpus kvm_nvhe_sym(kvm_hyp_s2mpus) + #endif /* __ARM64_KVM_S2MPU_H__ */ diff --git a/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c b/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c index 1731300e0bec..43fc1e693aef 100644 --- a/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c +++ b/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c @@ -8,4 +8,7 @@ #include +size_t __ro_after_init kvm_hyp_nr_s2mpus; +struct s2mpu __ro_after_init *kvm_hyp_s2mpus; + const struct kvm_iommu_ops kvm_s2mpu_ops = (struct kvm_iommu_ops){}; diff --git a/arch/arm64/kvm/iommu/s2mpu.c b/arch/arm64/kvm/iommu/s2mpu.c index 5ac5544a30c9..6d990c47146c 100644 --- a/arch/arm64/kvm/iommu/s2mpu.c +++ b/arch/arm64/kvm/iommu/s2mpu.c @@ -6,6 +6,7 @@ #include #include +#include #include #include @@ -22,6 +23,14 @@ struct s2mpu_irq_info { void __iomem *va; }; +struct s2mpu_list_entry { + struct list_head list; + struct device *dev; + struct s2mpu info; +}; + +static LIST_HEAD(s2mpu_list); + static irqreturn_t s2mpu_irq_handler(int irq, void 
*data) { struct s2mpu_irq_info *info = data; @@ -109,15 +118,16 @@ static u32 gen_ctx_cfg_valid_vid(struct platform_device *pdev, | CTX_CFG_ENTRY(7, ctx, ctx_vid[7]); } -static int s2mpu_probe_v9(struct platform_device *pdev, void __iomem *kaddr) +static int s2mpu_probe_v9(struct platform_device *pdev, void __iomem *kaddr, + struct s2mpu *info) { unsigned int num_ctx; - u32 ssmt_valid_vid_bmap, ctx_cfg_valid_vid; + u32 ssmt_valid_vid_bmap; ssmt_valid_vid_bmap = ALL_VIDS_BITMAP; num_ctx = readl_relaxed(kaddr + REG_NS_NUM_CONTEXT) & NUM_CONTEXT_MASK; - ctx_cfg_valid_vid = gen_ctx_cfg_valid_vid(pdev, num_ctx, ssmt_valid_vid_bmap); - if (!ctx_cfg_valid_vid) { + info->context_cfg_valid_vid = gen_ctx_cfg_valid_vid(pdev, num_ctx, ssmt_valid_vid_bmap); + if (!info->context_cfg_valid_vid) { dev_err(&pdev->dev, "failed to allocate context IDs"); return -EINVAL; } @@ -168,10 +178,16 @@ static int s2mpu_probe(struct platform_device *pdev) struct resource *res; void __iomem *kaddr; size_t res_size; - enum s2mpu_power_state power_state = S2MPU_POWER_ALWAYS_ON; - u32 version, power_domain_id = 0; + struct s2mpu_list_entry *entry; + struct s2mpu *info; int ret; + entry = devm_kzalloc(&pdev->dev, sizeof(*entry), GFP_KERNEL); + if (!entry) + return -ENOMEM; + entry->dev = &pdev->dev; + info = &entry->info; + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); if (!res) { dev_err(&pdev->dev, "failed to parse 'reg'"); @@ -191,6 +207,7 @@ static int s2mpu_probe(struct platform_device *pdev) res->start); return -EINVAL; } + info->pa = res->start; res_size = resource_size(res); if (res_size != S2MPU_MMIO_SIZE) { @@ -201,9 +218,9 @@ static int s2mpu_probe(struct platform_device *pdev) } ret = of_property_read_u32(pdev->dev.of_node, "power-domain-id", - &power_domain_id); + &info->power_domain_id); if (!ret) { - power_state = S2MPU_POWER_ON; + info->power_state = S2MPU_POWER_ON; } else if (ret != -EINVAL) { dev_err(&pdev->dev, "failed to parse power-domain-id: %d", ret); return ret; @@ -216,20 +233,23 @@ static int s2mpu_probe(struct platform_device *pdev) */ s2mpu_probe_irq(pdev, kaddr); - version = readl_relaxed(kaddr + REG_NS_VERSION); - switch (version & VERSION_CHECK_MASK) { + info->version = readl_relaxed(kaddr + REG_NS_VERSION); + switch (info->version & VERSION_CHECK_MASK) { case S2MPU_VERSION_8: break; case S2MPU_VERSION_9: - ret = s2mpu_probe_v9(pdev, kaddr); + ret = s2mpu_probe_v9(pdev, kaddr, info); if (ret) return ret; break; default: - dev_err(&pdev->dev, "unexpected version 0x%08x", version); + dev_err(&pdev->dev, "unexpected version 0x%08x", info->version); return -EINVAL; } + /* Insert successfully parsed devices to a list later copied to hyp. */ + list_add_tail(&entry->list, &s2mpu_list); + kvm_hyp_nr_s2mpus++; return 0; } @@ -245,16 +265,61 @@ static struct platform_driver of_driver = { }, }; +static struct s2mpu *alloc_s2mpu_array(void) +{ + unsigned int order; + + order = get_order(kvm_hyp_nr_s2mpus * sizeof(struct s2mpu)); + return (struct s2mpu *)__get_free_pages(GFP_KERNEL, order); +} + +static void free_s2mpu_array(struct s2mpu *array) +{ + unsigned int order; + + order = get_order(kvm_hyp_nr_s2mpus * sizeof(struct s2mpu)); + free_pages((unsigned long)array, order); +} + +static int create_s2mpu_array(struct s2mpu **array) +{ + struct s2mpu_list_entry *entry, *tmp; + size_t i; + + *array = alloc_s2mpu_array(); + if (!*array) + return -ENOMEM; + + /* Copy list to hyp array and destroy the list in the process. 
*/ + i = 0; + list_for_each_entry_safe(entry, tmp, &s2mpu_list, list) { + (*array)[i++] = entry->info; + list_del(&entry->list); + devm_kfree(entry->dev, entry); + } + WARN_ON(i != kvm_hyp_nr_s2mpus); + + kvm_hyp_s2mpus = kern_hyp_va(*array); + return 0; +} + int kvm_s2mpu_init(void) { + struct s2mpu *s2mpus = NULL; int ret; ret = platform_driver_probe(&of_driver, s2mpu_probe); if (ret) goto out; + ret = create_s2mpu_array(&s2mpus); + if (ret) + goto out; + kvm_info("S2MPU driver initialized\n"); out: + if (ret) + free_s2mpu_array(s2mpus); return ret; } From 563bfc1ade28b6c89c4385ddd5081aaf8dd9560c Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Wed, 23 Jun 2021 09:18:32 +0000 Subject: [PATCH 172/457] ANDROID: KVM: arm64: Enable S2MPUs in __pkvm_init_stage2_iommu Initialize the S2MPU driver in __pkvm_init_stage2_iommu if requested by the host. The driver sets kvm_iommu_ops and configures all S2MPUs which are powered on at that point (ie. all S2MPUs on currently supported devices). The S2MPU L1ENTRY registers are set to 1G granularity and R/W access. CTRL0/CTRL1/CFG as set to reasonable defaults, though the code relies on the reset state blocking all traffic as well. On fault the S2MPUs are configured to return SLVERR/DECERR (v8/9) to the master. Interrupts are enabled for all VIDs and trigger an IRQ handler if EL1 init registered a handler as a result of a DT interrupts entry. Because the host can configure the SSMTs freely, all permission bits are configured for all VIDs. For v9 CONTEXT_CFG_VALID_VIDS is set to the value precomputed at EL1, allocating a context ID to each VID. Test: builds, boots Bug: 190463801 Change-Id: I4a824e90b5d474dd83c97ef53e4df3c8b68da6ba Signed-off-by: David Brazdil (cherry picked from commit 8aa6c440da32317dcf821493b5cb08606fde3998) Signed-off-by: Mostafa Saleh Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_s2mpu.h | 52 +++++++++ arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c | 145 +++++++++++++++++++++++++- arch/arm64/kvm/iommu/s2mpu.c | 2 - 3 files changed, 194 insertions(+), 5 deletions(-) diff --git a/arch/arm64/include/asm/kvm_s2mpu.h b/arch/arm64/include/asm/kvm_s2mpu.h index d1ed76c5f066..b8a0cd6701bd 100644 --- a/arch/arm64/include/asm/kvm_s2mpu.h +++ b/arch/arm64/include/asm/kvm_s2mpu.h @@ -9,18 +9,41 @@ #include +#define S2MPU_MMIO_SIZE SZ_64K + #define NR_VIDS 8 #define NR_CTX_IDS 8 #define ALL_VIDS_BITMAP GENMASK(NR_VIDS - 1, 0) +#define REG_NS_CTRL0 0x0 +#define REG_NS_CTRL1 0x4 +#define REG_NS_CFG 0x10 +#define REG_NS_INTERRUPT_ENABLE_PER_VID_SET 0x20 #define REG_NS_INTERRUPT_CLEAR 0x2c #define REG_NS_VERSION 0x60 #define REG_NS_NUM_CONTEXT 0x100 +#define REG_NS_CONTEXT_CFG_VALID_VID 0x104 +#define REG_NS_ALL_INVALIDATION 0x1000 #define REG_NS_FAULT_STATUS 0x2000 #define REG_NS_FAULT_PA_LOW(vid) (0x2004 + ((vid) * 0x20)) #define REG_NS_FAULT_PA_HIGH(vid) (0x2008 + ((vid) * 0x20)) #define REG_NS_FAULT_INFO(vid) (0x2010 + ((vid) * 0x20)) +#define REG_NS_L1ENTRY_ATTR(vid, gb) (0x4004 + ((vid) * 0x200) + ((gb) * 0x8)) + +#define CTRL0_ENABLE BIT(0) +#define CTRL0_INTERRUPT_ENABLE BIT(1) +#define CTRL0_FAULT_RESP_TYPE_SLVERR BIT(2) /* for v8 */ +#define CTRL0_FAULT_RESP_TYPE_DECERR BIT(2) /* for v9 */ + +#define CTRL1_DISABLE_CHK_S1L1PTW BIT(0) +#define CTRL1_DISABLE_CHK_S1L2PTW BIT(1) +#define CTRL1_ENABLE_PAGE_SIZE_AWARENESS BIT(2) +#define CTRL1_DISABLE_CHK_USER_MATCHED_REQ BIT(3) + +#define CFG_MPTW_CACHE_OVERRIDE BIT(0) +#define CFG_MPTW_QOS_OVERRIDE BIT(8) +#define CFG_MPTW_SHAREABLE BIT(16) /* For use with hi_lo_readq_relaxed(). 
*/ #define REG_NS_FAULT_PA_HIGH_LOW(vid) REG_NS_FAULT_PA_LOW(vid) @@ -41,6 +64,8 @@ #define CONTEXT_CFG_VALID_VID_CTX_VID(ctx, vid) \ FIELD_PREP(GENMASK((4 * (ctx) + 2), 4 * (ctx)), (vid)) +#define INVALIDATION_INVALIDATE BIT(0) + #define NR_FAULT_INFO_REGS 8 #define FAULT_INFO_VID_MASK GENMASK(26, 24) #define FAULT_INFO_TYPE_MASK GENMASK(23, 21) @@ -51,6 +76,25 @@ #define FAULT_INFO_LEN_MASK GENMASK(19, 16) #define FAULT_INFO_ID_MASK GENMASK(15, 0) +#define L1ENTRY_ATTR_PROT(prot) FIELD_PREP(GENMASK(2, 1), prot) +#define L1ENTRY_ATTR_1G(prot) L1ENTRY_ATTR_PROT(prot) + +#define NR_GIGABYTES 64 +#define RO_GIGABYTES_FIRST 4 +#define RO_GIGABYTES_LAST 33 + +/* + * Iterate over S2MPU gigabyte regions. Skip those that cannot be modified + * (the MMIO registers are read only, with reset value MPT_PROT_NONE). + */ +#define for_each_gb_in_range(i, first, last) \ + for ((i) = (first); (i) <= (last) && (i) < NR_GIGABYTES; \ + (i) = (((i) + 1 == RO_GIGABYTES_FIRST) ? RO_GIGABYTES_LAST : (i)) + 1) + +#define for_each_gb(i) for_each_gb_in_range(i, 0, NR_GIGABYTES - 1) +#define for_each_vid(i) for ((i) = 0; (i) < NR_VIDS; (i)++) +#define for_each_gb_and_vid(gb, vid) for_each_vid((vid)) for_each_gb((gb)) + enum s2mpu_version { S2MPU_VERSION_8 = 0x11000000, S2MPU_VERSION_9 = 0x20000000, @@ -71,6 +115,14 @@ struct s2mpu { u32 context_cfg_valid_vid; }; +enum mpt_prot { + MPT_PROT_NONE = 0, + MPT_PROT_R = BIT(0), + MPT_PROT_W = BIT(1), + MPT_PROT_RW = MPT_PROT_R | MPT_PROT_W, + MPT_PROT_MASK = MPT_PROT_RW, +}; + extern size_t kvm_nvhe_sym(kvm_hyp_nr_s2mpus); #define kvm_hyp_nr_s2mpus kvm_nvhe_sym(kvm_hyp_nr_s2mpus) diff --git a/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c b/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c index 43fc1e693aef..86ac5783741a 100644 --- a/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c +++ b/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c @@ -6,9 +6,148 @@ #include +#include #include +#include +#include -size_t __ro_after_init kvm_hyp_nr_s2mpus; -struct s2mpu __ro_after_init *kvm_hyp_s2mpus; +#include -const struct kvm_iommu_ops kvm_s2mpu_ops = (struct kvm_iommu_ops){}; +#define for_each_s2mpu(i) \ + for ((i) = &kvm_hyp_s2mpus[0]; (i) != &kvm_hyp_s2mpus[kvm_hyp_nr_s2mpus]; (i)++) + +#define for_each_powered_s2mpu(i) \ + for_each_s2mpu((i)) if (is_powered_on((i))) + +size_t __ro_after_init kvm_hyp_nr_s2mpus; +struct s2mpu __ro_after_init *kvm_hyp_s2mpus; + +static bool is_version(struct s2mpu *dev, u32 version) +{ + return (dev->version & VERSION_CHECK_MASK) == version; +} + +static bool is_powered_on(struct s2mpu *dev) +{ + switch (dev->power_state) { + case S2MPU_POWER_ALWAYS_ON: + case S2MPU_POWER_ON: + return true; + case S2MPU_POWER_OFF: + return false; + default: + BUG(); + } +} + +/* + * Write CONTEXT_CFG_VALID_VID configuration before touching L1ENTRY* registers. + * Writes to those registers are ignored unless there is a context ID allocated + * to the corresponding VID (v9 only). + */ +static void __set_context_ids(struct s2mpu *dev) +{ + if (!is_version(dev, S2MPU_VERSION_9)) + return; + + writel_relaxed(dev->context_cfg_valid_vid, + dev->va + REG_NS_CONTEXT_CFG_VALID_VID); +} + +static void __set_control_regs(struct s2mpu *dev) +{ + u32 ctrl0 = 0, irq_vids; + + /* + * Note: We set the values of CTRL0, CTRL1 and CFG registers here but we + * still rely on the correctness of their reset values. S2MPUs *must* + * reset to a state where all DMA traffic is blocked until the hypervisor + * writes its configuration to the S2MPU. 
A malicious EL1 could otherwise + * attempt to bypass the permission checks in the window between powering + * on the S2MPU and this function being called. + */ + + /* Enable the S2MPU, otherwise all traffic would be allowed through. */ + ctrl0 |= CTRL0_ENABLE; + + /* + * Enable interrupts on fault for all VIDs. The IRQ must also be + * specified in DT to get unmasked in the GIC. + */ + ctrl0 |= CTRL0_INTERRUPT_ENABLE; + irq_vids = ALL_VIDS_BITMAP; + + /* Return SLVERR/DECERR to device on permission fault. */ + ctrl0 |= is_version(dev, S2MPU_VERSION_9) ? CTRL0_FAULT_RESP_TYPE_DECERR + : CTRL0_FAULT_RESP_TYPE_SLVERR; + + writel_relaxed(irq_vids, dev->va + REG_NS_INTERRUPT_ENABLE_PER_VID_SET); + writel_relaxed(0, dev->va + REG_NS_CFG); + writel_relaxed(0, dev->va + REG_NS_CTRL1); + writel_relaxed(ctrl0, dev->va + REG_NS_CTRL0); +} + +static void __all_invalidation(struct s2mpu *dev) +{ + writel_relaxed(INVALIDATION_INVALIDATE, + dev->va + REG_NS_ALL_INVALIDATION); +} + +static void __set_l1entry_attr_with_prot(struct s2mpu *dev, unsigned int gb, + unsigned int vid, enum mpt_prot prot) +{ + writel_relaxed(L1ENTRY_ATTR_1G(prot), + dev->va + REG_NS_L1ENTRY_ATTR(vid, gb)); +} + +/** + * Initialize S2MPU device and set all GB regions to 1G granularity with + * given protection bits. + */ +static void initialize_with_prot(struct s2mpu *dev, enum mpt_prot prot) +{ + unsigned int gb, vid; + + /* Must write CONTEXT_CFG_VALID_VID before setting L1ENTRY registers. */ + __set_context_ids(dev); + + for_each_gb_and_vid(gb, vid) + __set_l1entry_attr_with_prot(dev, gb, vid, prot); + __all_invalidation(dev); + + /* Set control registers, enable the S2MPU. */ + __set_control_regs(dev); +} + +static int s2mpu_init(void) +{ + struct s2mpu *dev; + int ret; + + /* Map data structures in EL2 stage-1. */ + ret = pkvm_create_mappings(kvm_hyp_s2mpus, + kvm_hyp_s2mpus + kvm_hyp_nr_s2mpus, + PAGE_HYP); + if (ret) + return ret; + + /* Map S2MPU MMIO regions in EL2 stage-1. */ + for_each_s2mpu(dev) { + ret = __pkvm_create_private_mapping( + dev->pa, S2MPU_MMIO_SIZE, PAGE_HYP_DEVICE,(unsigned long *)(&dev->va)); + if (ret) + return ret; + } + + /* + * Program all S2MPUs powered on at boot. Note that they may not be in + * the blocking reset state as the bootloader may have programmed them. + */ + for_each_powered_s2mpu(dev) + initialize_with_prot(dev, MPT_PROT_RW); + return 0; +} + +const struct kvm_iommu_ops kvm_s2mpu_ops = (struct kvm_iommu_ops){ + .init = s2mpu_init, +}; diff --git a/arch/arm64/kvm/iommu/s2mpu.c b/arch/arm64/kvm/iommu/s2mpu.c index 6d990c47146c..9704d7951517 100644 --- a/arch/arm64/kvm/iommu/s2mpu.c +++ b/arch/arm64/kvm/iommu/s2mpu.c @@ -12,8 +12,6 @@ #include #include -#define S2MPU_MMIO_SIZE SZ_64K - #define CTX_CFG_ENTRY(ctxid, nr_ctx, vid) \ (CONTEXT_CFG_VALID_VID_CTX_VID(ctxid, vid) \ | (((ctxid) < (nr_ctx)) ? CONTEXT_CFG_VALID_VID_CTX_VALID(ctxid) : 0)) From cf3b8fa669358f41b3db74c069b2cf4b4eca4cb8 Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Tue, 29 Jun 2021 17:15:17 +0000 Subject: [PATCH 173/457] ANDROID: KVM: arm64: Reprogram S2MPUs in 'host_smc_handler' Intercept SMCs known to be used by the host to inform EL3 about power events, either powering SoC blocks on or off. 
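For illustration, the handling described above condenses to the following flow
(a sketch only, not the code added by this patch; for_each_s2mpu_in_domain() is
a made-up placeholder for the power-domain match, the other names are the
driver's own):

    if (fn == SMC_CMD_PREPARE_PD_ONOFF) {
            /* Forward the power-domain request to EL3 first. */
            arm_smccc_1_1_smc(fn, mode, domain_id, group, &res);
            if (res.a0 == SMCCC_RET_SUCCESS) {
                    for_each_s2mpu_in_domain(dev, domain_id) {
                            if (mode == SMC_MODE_POWER_UP)
                                    initialize_with_prot(dev, MPT_PROT_RW);   /* unblock DMA */
                            else
                                    initialize_with_prot(dev, MPT_PROT_NONE); /* back to blocking */
                    }
            }
            return true;    /* SMC handled on behalf of the host */
    }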
Test: builds, boots Bug: 190463801 Change-Id: I306433c8c1b712df24569cbd4dc346f72b4c9650 Signed-off-by: David Brazdil (cherry picked from commit 8ca0b34fe468450b041cc7cf255b2f104ca084a1) Signed-off-by: Mostafa Saleh Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c | 76 +++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) diff --git a/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c b/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c index 86ac5783741a..f4a6dc39d92e 100644 --- a/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c +++ b/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c @@ -11,7 +11,14 @@ #include #include +#include + #include +#include +#include + +#define SMC_CMD_PREPARE_PD_ONOFF 0x82000410 +#define SMC_MODE_POWER_UP 1 #define for_each_s2mpu(i) \ for ((i) = &kvm_hyp_s2mpus[0]; (i) != &kvm_hyp_s2mpus[kvm_hyp_nr_s2mpus]; (i)++) @@ -22,6 +29,8 @@ size_t __ro_after_init kvm_hyp_nr_s2mpus; struct s2mpu __ro_after_init *kvm_hyp_s2mpus; +static hyp_spinlock_t s2mpu_lock; + static bool is_version(struct s2mpu *dev, u32 version) { return (dev->version & VERSION_CHECK_MASK) == version; @@ -40,6 +49,19 @@ static bool is_powered_on(struct s2mpu *dev) } } +static bool is_in_power_domain(struct s2mpu *dev, u64 power_domain_id) +{ + switch (dev->power_state) { + case S2MPU_POWER_ALWAYS_ON: + return false; + case S2MPU_POWER_ON: + case S2MPU_POWER_OFF: + return dev->power_domain_id == power_domain_id; + default: + BUG(); + } +} + /* * Write CONTEXT_CFG_VALID_VID configuration before touching L1ENTRY* registers. * Writes to those registers are ignored unless there is a context ID allocated @@ -119,6 +141,59 @@ static void initialize_with_prot(struct s2mpu *dev, enum mpt_prot prot) __set_control_regs(dev); } +static bool s2mpu_host_smc_handler(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(u64, fn, host_ctxt, 0); + DECLARE_REG(u64, mode, host_ctxt, 1); + DECLARE_REG(u64, domain_id, host_ctxt, 2); + DECLARE_REG(u64, group, host_ctxt, 3); + + struct arm_smccc_res res; + struct s2mpu *dev; + + if (fn != SMC_CMD_PREPARE_PD_ONOFF) + return false; /* SMC not handled */ + + /* + * Host is notifying EL3 that a power domain was turned on/off. + * Use this SMC as a trigger to program the S2MPUs. + * Note that the host may be malicious and issue this SMC arbitrarily. + * + * Power on: + * It is paramount that the S2MPU reset state is enabled and blocking + * all traffic. That way the host is forced to issue a power-on SMC to + * unblock the S2MPUs. + * + * Power down: + * A power-down SMC is a hint for hyp to stop updating the S2MPU, lest + * writes to powered-down MMIO registers produce SErrors in the host. + * However, hyp must perform one last update - putting the S2MPUs back + * to their blocking reset state - in case the host does not actually + * power them down and continues issuing DMA traffic. 
+ */ + + hyp_spin_lock(&s2mpu_lock); + arm_smccc_1_1_smc(fn, mode, domain_id, group, &res); + if (res.a0 == SMCCC_RET_SUCCESS) { + for_each_s2mpu(dev) { + if (!is_in_power_domain(dev, domain_id)) + continue; + + if (mode == SMC_MODE_POWER_UP) { + dev->power_state = S2MPU_POWER_ON; + initialize_with_prot(dev, MPT_PROT_RW); + } else { + initialize_with_prot(dev, MPT_PROT_NONE); + dev->power_state = S2MPU_POWER_OFF; + } + } + } + hyp_spin_unlock(&s2mpu_lock); + + cpu_reg(host_ctxt, 0) = res.a0; + return true; /* SMC handled */ +} + static int s2mpu_init(void) { struct s2mpu *dev; @@ -150,4 +225,5 @@ static int s2mpu_init(void) const struct kvm_iommu_ops kvm_s2mpu_ops = (struct kvm_iommu_ops){ .init = s2mpu_init, + .host_smc_handler = s2mpu_host_smc_handler, }; From 0512ea32b8a75a0b1bf0ceda0d6224966ff674b2 Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Fri, 25 Jun 2021 11:06:23 +0000 Subject: [PATCH 174/457] ANDROID: KVM: arm64: Set up S2MPU Memory Protection Table S2MPU Second-level Memory Protection Table is a PA-contiguous buffer containing an array of 2-bit read/write entries at given granularity for a given gigabyte physical address space region. The size of SMPT varies per granularity but at the finest 4K granularity it is 64KB PA-contiguous, aligned to 64KB. Allocate sufficient number of SMPT buffers for the S2MPU driver assuming 4K granularity for 4K/16K PAGE_SIZE, and 64K granularity for 64K PAGE_SIZE. We also assume that all S2MPUs share SMPTs for a given gigabyte region. There are 34 gigabyte regions that can be set by the driver (GBs 4-33 always block all traffic). Hyp takes ownership of the memory in s2mpu_init and assigns pointers to the buffers to L1ENTRY_L2TABLE_ADDR registers on init and power-on events. The pointers remain static as the driver will only change granularity between 1G and 4K/64K (depending on PAGE_SIZE). 
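As a quick sanity check on the sizes quoted above, using the SMPT_* definitions
introduced by this patch (worked numbers only, not part of the code):

    4K granularity (4K/16K PAGE_SIZE kernels):
        SMPT_NUM_ELEMS = SZ_1G / SZ_4K          = 262144 entries
        SMPT_SIZE      = 262144 * 2 bits / 8    = 65536 bytes (64KB, 64KB-aligned)

    64K granularity (64K PAGE_SIZE kernels):
        SMPT_NUM_ELEMS = SZ_1G / SZ_64K         = 16384 entries
        SMPT_SIZE      = 16384 * 2 bits / 8     = 4096 bytes

With 34 programmable gigabyte regions this amounts to roughly 2.1MB of SMPT
buffers at 4K granularity.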
Test: builds, boots Bug: 190463801 Change-Id: I3fcad8b3ce5d194a987b09d042bd56d59bb35e5e Signed-off-by: David Brazdil (cherry picked from commit f0e1de52ef17f205d499a3ee48a20d3c652d60f0) Signed-off-by: Mostafa Saleh Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_s2mpu.h | 42 +++++++++++++++++++ arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c | 59 ++++++++++++++++++++++++++- arch/arm64/kvm/iommu/s2mpu.c | 53 +++++++++++++++++++++++- 3 files changed, 151 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/kvm_s2mpu.h b/arch/arm64/include/asm/kvm_s2mpu.h index b8a0cd6701bd..ebddc28bb30e 100644 --- a/arch/arm64/include/asm/kvm_s2mpu.h +++ b/arch/arm64/include/asm/kvm_s2mpu.h @@ -29,6 +29,7 @@ #define REG_NS_FAULT_PA_LOW(vid) (0x2004 + ((vid) * 0x20)) #define REG_NS_FAULT_PA_HIGH(vid) (0x2008 + ((vid) * 0x20)) #define REG_NS_FAULT_INFO(vid) (0x2010 + ((vid) * 0x20)) +#define REG_NS_L1ENTRY_L2TABLE_ADDR(vid, gb) (0x4000 + ((vid) * 0x200) + ((gb) * 0x8)) #define REG_NS_L1ENTRY_ATTR(vid, gb) (0x4004 + ((vid) * 0x200) + ((gb) * 0x8)) #define CTRL0_ENABLE BIT(0) @@ -76,12 +77,40 @@ #define FAULT_INFO_LEN_MASK GENMASK(19, 16) #define FAULT_INFO_ID_MASK GENMASK(15, 0) +#define L1ENTRY_L2TABLE_ADDR(pa) ((pa) >> 4) + +#define L1ENTRY_ATTR_L2TABLE_EN BIT(0) +#define L1ENTRY_ATTR_GRAN_4K 0x0 +#define L1ENTRY_ATTR_GRAN_64K 0x1 +#define L1ENTRY_ATTR_GRAN_2M 0x2 #define L1ENTRY_ATTR_PROT(prot) FIELD_PREP(GENMASK(2, 1), prot) +#define L1ENTRY_ATTR_GRAN(gran) FIELD_PREP(GENMASK(5, 4), gran) #define L1ENTRY_ATTR_1G(prot) L1ENTRY_ATTR_PROT(prot) +#define L1ENTRY_ATTR_L2(gran) (L1ENTRY_ATTR_GRAN(gran) | \ + L1ENTRY_ATTR_L2TABLE_EN) #define NR_GIGABYTES 64 #define RO_GIGABYTES_FIRST 4 #define RO_GIGABYTES_LAST 33 +#define NR_RO_GIGABYTES (RO_GIGABYTES_LAST - RO_GIGABYTES_FIRST + 1) +#define NR_RW_GIGABYTES (NR_GIGABYTES - NR_RO_GIGABYTES) + +#ifdef CONFIG_ARM64_64K_PAGES +#define SMPT_GRAN SZ_64K +#define SMPT_GRAN_ATTR L1ENTRY_ATTR_GRAN_64K +#else +#define SMPT_GRAN SZ_4K +#define SMPT_GRAN_ATTR L1ENTRY_ATTR_GRAN_4K +#endif +static_assert(SMPT_GRAN <= PAGE_SIZE); + +#define MPT_PROT_BITS 2 +#define SMPT_WORD_SIZE sizeof(u32) +#define SMPT_ELEMS_PER_BYTE (BITS_PER_BYTE / MPT_PROT_BITS) +#define SMPT_NUM_ELEMS (SZ_1G / SMPT_GRAN) +#define SMPT_NUM_WORDS (SMPT_SIZE / SMPT_WORD_SIZE) +#define SMPT_SIZE (SMPT_NUM_ELEMS / SMPT_ELEMS_PER_BYTE) +#define SMPT_ORDER get_order(SMPT_SIZE) /* * Iterate over S2MPU gigabyte regions. 
Skip those that cannot be modified @@ -123,10 +152,23 @@ enum mpt_prot { MPT_PROT_MASK = MPT_PROT_RW, }; +struct fmpt { + u32 *smpt; + bool gran_1g; + enum mpt_prot prot; +}; + +struct mpt { + struct fmpt fmpt[NR_GIGABYTES]; +}; + extern size_t kvm_nvhe_sym(kvm_hyp_nr_s2mpus); #define kvm_hyp_nr_s2mpus kvm_nvhe_sym(kvm_hyp_nr_s2mpus) extern struct s2mpu *kvm_nvhe_sym(kvm_hyp_s2mpus); #define kvm_hyp_s2mpus kvm_nvhe_sym(kvm_hyp_s2mpus) +extern struct mpt kvm_nvhe_sym(kvm_hyp_host_mpt); +#define kvm_hyp_host_mpt kvm_nvhe_sym(kvm_hyp_host_mpt) + #endif /* __ARM64_KVM_S2MPU_H__ */ diff --git a/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c b/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c index f4a6dc39d92e..e36a5fe3f7f1 100644 --- a/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c +++ b/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c @@ -13,6 +13,7 @@ #include +#include #include #include #include @@ -28,6 +29,7 @@ size_t __ro_after_init kvm_hyp_nr_s2mpus; struct s2mpu __ro_after_init *kvm_hyp_s2mpus; +struct mpt kvm_hyp_host_mpt; static hyp_spinlock_t s2mpu_lock; @@ -122,6 +124,26 @@ static void __set_l1entry_attr_with_prot(struct s2mpu *dev, unsigned int gb, dev->va + REG_NS_L1ENTRY_ATTR(vid, gb)); } +static void __set_l1entry_attr_with_fmpt(struct s2mpu *dev, unsigned int gb, + unsigned int vid, struct fmpt *fmpt) +{ + if (fmpt->gran_1g) { + __set_l1entry_attr_with_prot(dev, gb, vid, fmpt->prot); + } else { + /* Order against writes to the SMPT. */ + writel(L1ENTRY_ATTR_L2(SMPT_GRAN_ATTR), + dev->va + REG_NS_L1ENTRY_ATTR(vid, gb)); + } +} + +static void __set_l1entry_l2table_addr(struct s2mpu *dev, unsigned int gb, + unsigned int vid, phys_addr_t addr) +{ + /* Order against writes to the SMPT. */ + writel(L1ENTRY_L2TABLE_ADDR(addr), + dev->va + REG_NS_L1ENTRY_L2TABLE_ADDR(vid, gb)); +} + /** * Initialize S2MPU device and set all GB regions to 1G granularity with * given protection bits. @@ -141,6 +163,29 @@ static void initialize_with_prot(struct s2mpu *dev, enum mpt_prot prot) __set_control_regs(dev); } +/** + * Initialize S2MPU device, set L2 table addresses and configure L1TABLE_ATTR + * registers according to the given MPT struct. + */ +static void initialize_with_mpt(struct s2mpu *dev, struct mpt *mpt) +{ + unsigned int gb, vid; + struct fmpt *fmpt; + + /* Must write CONTEXT_CFG_VALID_VID before setting L1ENTRY registers. */ + __set_context_ids(dev); + + for_each_gb_and_vid(gb, vid) { + fmpt = &mpt->fmpt[gb]; + __set_l1entry_l2table_addr(dev, gb, vid, __hyp_pa(fmpt->smpt)); + __set_l1entry_attr_with_fmpt(dev, gb, vid, fmpt); + } + __all_invalidation(dev); + + /* Set control registers, enable the S2MPU. */ + __set_control_regs(dev); +} + static bool s2mpu_host_smc_handler(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(u64, fn, host_ctxt, 0); @@ -181,7 +226,7 @@ static bool s2mpu_host_smc_handler(struct kvm_cpu_context *host_ctxt) if (mode == SMC_MODE_POWER_UP) { dev->power_state = S2MPU_POWER_ON; - initialize_with_prot(dev, MPT_PROT_RW); + initialize_with_mpt(dev, &kvm_hyp_host_mpt); } else { initialize_with_prot(dev, MPT_PROT_NONE); dev->power_state = S2MPU_POWER_OFF; @@ -197,6 +242,7 @@ static bool s2mpu_host_smc_handler(struct kvm_cpu_context *host_ctxt) static int s2mpu_init(void) { struct s2mpu *dev; + unsigned int gb; int ret; /* Map data structures in EL2 stage-1. 
*/ @@ -206,6 +252,15 @@ static int s2mpu_init(void) if (ret) return ret; + for_each_gb(gb) { + ret = pkvm_create_mappings( + kvm_hyp_host_mpt.fmpt[gb].smpt, + kvm_hyp_host_mpt.fmpt[gb].smpt + SMPT_NUM_WORDS, + PAGE_HYP); + if (ret) + return ret; + } + /* Map S2MPU MMIO regions in EL2 stage-1. */ for_each_s2mpu(dev) { ret = __pkvm_create_private_mapping( @@ -219,7 +274,7 @@ static int s2mpu_init(void) * the blocking reset state as the bootloader may have programmed them. */ for_each_powered_s2mpu(dev) - initialize_with_prot(dev, MPT_PROT_RW); + initialize_with_mpt(dev, &kvm_hyp_host_mpt); return 0; } diff --git a/arch/arm64/kvm/iommu/s2mpu.c b/arch/arm64/kvm/iommu/s2mpu.c index 9704d7951517..ce4b32453b7c 100644 --- a/arch/arm64/kvm/iommu/s2mpu.c +++ b/arch/arm64/kvm/iommu/s2mpu.c @@ -301,9 +301,54 @@ static int create_s2mpu_array(struct s2mpu **array) return 0; } +static int alloc_smpts(struct mpt *mpt) +{ + unsigned int gb; + + for_each_gb(gb) { + /* The returned buffer is aligned to its size, as required. */ + mpt->fmpt[gb].smpt = (u32 *)__get_free_pages(GFP_KERNEL, SMPT_ORDER); + if (!mpt->fmpt[gb].smpt) + return -ENOMEM; + } + + return 0; +} + +static void free_smpts(struct mpt *mpt) +{ + unsigned int gb; + + for_each_gb(gb) + free_pages((unsigned long)mpt->fmpt[gb].smpt, SMPT_ORDER); +} + +static int init_host_mpt(struct mpt *mpt) +{ + unsigned int gb; + int ret; + + ret = alloc_smpts(mpt); + if (ret) { + kvm_err("Cannot allocate memory for S2MPU host MPT"); + return ret; + } + + /* Initialize the host MPT. Use 1G mappings with RW permissions. */ + for_each_gb(gb) { + kvm_hyp_host_mpt.fmpt[gb] = (struct fmpt){ + .gran_1g = true, + .prot = MPT_PROT_RW, + .smpt = kern_hyp_va(mpt->fmpt[gb].smpt), + }; + } + return 0; +} + int kvm_s2mpu_init(void) { struct s2mpu *s2mpus = NULL; + struct mpt mpt = {}; int ret; ret = platform_driver_probe(&of_driver, s2mpu_probe); @@ -314,10 +359,16 @@ int kvm_s2mpu_init(void) if (ret) goto out; + ret = init_host_mpt(&mpt); + if (ret) + goto out; + kvm_info("S2MPU driver initialized\n"); out: - if (ret) + if (ret) { free_s2mpu_array(s2mpus); + free_smpts(&mpt); + } return ret; } From 95660558988782ea34a39892ddac088ca35f2c9d Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Tue, 29 Jun 2021 19:10:14 +0000 Subject: [PATCH 175/457] ANDROID: KVM: arm64: Modify S2MPU MPT in 'host_stage2_set_owner' The 'host_stage2_set_owner' callback indicates that a range of PA-contiguous pages changed owner. With all devices owned by the host, the driver sets the protection bits in the corresponding FMPT/SMPT to either MPT_PROT_RW if owned by the host or MPT_PROT_NONE otherwise. For each gigabyte region, the implementation will select between 1G and 4K/64K (depending on PAGE_SIZE) mappings and populate the L1ENTRY_ATTR register or SMPT bitmap, respectivelly. The driver never dynamically switches between two granularities which both require a SMPT. This is because the L1ENTRY_ATTR and L1ENTRY_L2TABLE_ADDR registers would need to be set atomically. 
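The per-gigabyte decision described above can be illustrated with a
self-contained toy model (plain C, not the driver code; the real
__set_fmpt_range() operates on 2-bit SMPT entries rather than this simplified
8-element array):

    #include <stdbool.h>

    enum prot { PROT_NONE, PROT_R, PROT_W, PROT_RW };
    #define UPDATE_L1 1   /* L1ENTRY_ATTR register must be rewritten */
    #define UPDATE_L2 2   /* SMPT contents must be made visible      */

    struct gb_region {
            bool gran_1g;
            enum prot prot;       /* valid while gran_1g is set   */
            enum prot smpt[8];    /* toy SMPT: 8 "pages" per 1GB  */
    };

    static int set_range(struct gb_region *gb, int first, int last, enum prot p)
    {
            int i;

            if (first == 0 && last == 7) {          /* whole GB -> 1G mapping */
                    if (gb->gran_1g && gb->prot == p)
                            return 0;
                    gb->gran_1g = true;
                    gb->prot = p;
                    return UPDATE_L1;
            }
            if (gb->gran_1g) {                      /* split 1G mapping into SMPT */
                    if (gb->prot == p)
                            return 0;
                    for (i = 0; i < 8; i++)
                            gb->smpt[i] = (i >= first && i <= last) ? p : gb->prot;
                    gb->gran_1g = false;
                    return UPDATE_L1 | UPDATE_L2;
            }
            for (i = first; i <= last; i++)         /* already SMPT: edit in place */
                    gb->smpt[i] = p;
            for (i = 0; i < 8; i++)
                    if (gb->smpt[i] != p)
                            return UPDATE_L2;       /* not uniform, keep page granularity */
            gb->gran_1g = true;                     /* collapse back to a 1G mapping */
            gb->prot = p;
            return UPDATE_L1;
    }

A caller ORs the returned flags per gigabyte touched, flushing the SMPT on
UPDATE_L2 and rewriting L1ENTRY_ATTR on UPDATE_L1.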
Test: builds, boots Bug: 190463801 Change-Id: Ifb0bdcaa143ef8eb213ba4133ac86d8b610a4bcf Signed-off-by: David Brazdil (cherry picked from commit 4475d993aa0c74460e8f90e88919dee7fe0e4258) Signed-off-by: Mostafa Saleh Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_s2mpu.h | 141 ++++++++++++++++++++++++++ arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c | 85 ++++++++++++++++ 2 files changed, 226 insertions(+) diff --git a/arch/arm64/include/asm/kvm_s2mpu.h b/arch/arm64/include/asm/kvm_s2mpu.h index ebddc28bb30e..6904ae20a99f 100644 --- a/arch/arm64/include/asm/kvm_s2mpu.h +++ b/arch/arm64/include/asm/kvm_s2mpu.h @@ -9,6 +9,8 @@ #include +#include + #define S2MPU_MMIO_SIZE SZ_64K #define NR_VIDS 8 @@ -25,6 +27,9 @@ #define REG_NS_NUM_CONTEXT 0x100 #define REG_NS_CONTEXT_CFG_VALID_VID 0x104 #define REG_NS_ALL_INVALIDATION 0x1000 +#define REG_NS_RANGE_INVALIDATION 0x1020 +#define REG_NS_RANGE_INVALIDATION_START_PPN 0x1024 +#define REG_NS_RANGE_INVALIDATION_END_PPN 0x1028 #define REG_NS_FAULT_STATUS 0x2000 #define REG_NS_FAULT_PA_LOW(vid) (0x2004 + ((vid) * 0x20)) #define REG_NS_FAULT_PA_HIGH(vid) (0x2008 + ((vid) * 0x20)) @@ -66,6 +71,7 @@ FIELD_PREP(GENMASK((4 * (ctx) + 2), 4 * (ctx)), (vid)) #define INVALIDATION_INVALIDATE BIT(0) +#define RANGE_INVALIDATION_PPN_SHIFT 12 #define NR_FAULT_INFO_REGS 8 #define FAULT_INFO_VID_MASK GENMASK(26, 24) @@ -107,6 +113,8 @@ static_assert(SMPT_GRAN <= PAGE_SIZE); #define MPT_PROT_BITS 2 #define SMPT_WORD_SIZE sizeof(u32) #define SMPT_ELEMS_PER_BYTE (BITS_PER_BYTE / MPT_PROT_BITS) +#define SMPT_ELEMS_PER_WORD (SMPT_WORD_SIZE * SMPT_ELEMS_PER_BYTE) +#define SMPT_WORD_BYTE_RANGE (SMPT_GRAN * SMPT_ELEMS_PER_WORD) #define SMPT_NUM_ELEMS (SZ_1G / SMPT_GRAN) #define SMPT_NUM_WORDS (SMPT_SIZE / SMPT_WORD_SIZE) #define SMPT_SIZE (SMPT_NUM_ELEMS / SMPT_ELEMS_PER_BYTE) @@ -152,6 +160,13 @@ enum mpt_prot { MPT_PROT_MASK = MPT_PROT_RW, }; +static const u64 mpt_prot_doubleword[] = { + [MPT_PROT_NONE] = 0x0000000000000000, + [MPT_PROT_R] = 0x5555555555555555, + [MPT_PROT_W] = 0xaaaaaaaaaaaaaaaa, + [MPT_PROT_RW] = 0xffffffffffffffff, +}; + struct fmpt { u32 *smpt; bool gran_1g; @@ -162,6 +177,11 @@ struct mpt { struct fmpt fmpt[NR_GIGABYTES]; }; +enum mpt_update_flags { + MPT_UPDATE_L1 = BIT(0), + MPT_UPDATE_L2 = BIT(1), +}; + extern size_t kvm_nvhe_sym(kvm_hyp_nr_s2mpus); #define kvm_hyp_nr_s2mpus kvm_nvhe_sym(kvm_hyp_nr_s2mpus) @@ -171,4 +191,125 @@ extern struct s2mpu *kvm_nvhe_sym(kvm_hyp_s2mpus); extern struct mpt kvm_nvhe_sym(kvm_hyp_host_mpt); #define kvm_hyp_host_mpt kvm_nvhe_sym(kvm_hyp_host_mpt) +/* Set protection bits of SMPT in a given range without using memset. */ +static inline void __set_smpt_range_slow(u32 *smpt, size_t start_gb_byte, + size_t end_gb_byte, enum mpt_prot prot) +{ + size_t i, start_word_byte, end_word_byte, word_idx, first_elem, last_elem; + u32 val; + + /* Iterate over u32 words. */ + start_word_byte = start_gb_byte; + while (start_word_byte < end_gb_byte) { + /* Determine the range of bytes covered by this word. */ + word_idx = start_word_byte / SMPT_WORD_BYTE_RANGE; + end_word_byte = min( + ALIGN(start_word_byte + 1, SMPT_WORD_BYTE_RANGE), + end_gb_byte); + + /* Identify protection bit offsets within the word. */ + first_elem = (start_word_byte / SMPT_GRAN) % SMPT_ELEMS_PER_WORD; + last_elem = ((end_word_byte - 1) / SMPT_GRAN) % SMPT_ELEMS_PER_WORD; + + /* Modify the corresponding word. 
*/ + val = READ_ONCE(smpt[word_idx]); + for (i = first_elem; i <= last_elem; i++) { + val &= ~(MPT_PROT_MASK << (i * MPT_PROT_BITS)); + val |= prot << (i * MPT_PROT_BITS); + } + WRITE_ONCE(smpt[word_idx], val); + + start_word_byte = end_word_byte; + } +} + +/* Set protection bits of SMPT in a given range. */ +static inline void __set_smpt_range(u32 *smpt, size_t start_gb_byte, + size_t end_gb_byte, enum mpt_prot prot) +{ + size_t interlude_start, interlude_end, interlude_bytes, word_idx; + char prot_byte = (char)mpt_prot_doubleword[prot]; + + if (start_gb_byte >= end_gb_byte) + return; + + /* Check if range spans at least one full u32 word. */ + interlude_start = ALIGN(start_gb_byte, SMPT_WORD_BYTE_RANGE); + interlude_end = ALIGN_DOWN(end_gb_byte, SMPT_WORD_BYTE_RANGE); + + /* If not, fall back to editing bits in the given range. */ + if (interlude_start >= interlude_end) { + __set_smpt_range_slow(smpt, start_gb_byte, end_gb_byte, prot); + return; + } + + /* Use bit-editing for prologue/epilogue, memset for interlude. */ + word_idx = interlude_start / SMPT_WORD_BYTE_RANGE; + interlude_bytes = (interlude_end - interlude_start) / SMPT_GRAN / SMPT_ELEMS_PER_BYTE; + + __set_smpt_range_slow(smpt, start_gb_byte, interlude_start, prot); + memset(&smpt[word_idx], prot_byte, interlude_bytes); + __set_smpt_range_slow(smpt, interlude_end, end_gb_byte, prot); +} + +/* Returns true if all SMPT protection bits match 'prot'. */ +static inline bool __is_smpt_uniform(u32 *smpt, enum mpt_prot prot) +{ + size_t i; + u64 *doublewords = (u64 *)smpt; + + for (i = 0; i < SMPT_NUM_WORDS / 2; i++) { + if (doublewords[i] != mpt_prot_doubleword[prot]) + return false; + } + return true; +} + +/** + * Set protection bits of FMPT/SMPT in a given range. + * Returns flags specifying whether L1/L2 changes need to be made visible + * to the device. + */ +static inline enum mpt_update_flags +__set_fmpt_range(struct fmpt *fmpt, size_t start_gb_byte, size_t end_gb_byte, + enum mpt_prot prot) +{ + if (start_gb_byte == 0 && end_gb_byte >= SZ_1G) { + /* Update covers the entire GB region. */ + if (fmpt->gran_1g && fmpt->prot == prot) + return 0; + + fmpt->gran_1g = true; + fmpt->prot = prot; + return MPT_UPDATE_L1; + } + + if (fmpt->gran_1g) { + /* GB region currently uses 1G mapping. */ + if (fmpt->prot == prot) + return 0; + + /* + * Range has different mapping than the rest of the GB. + * Convert to PAGE_SIZE mapping. + */ + fmpt->gran_1g = false; + __set_smpt_range(fmpt->smpt, 0, start_gb_byte, fmpt->prot); + __set_smpt_range(fmpt->smpt, start_gb_byte, end_gb_byte, prot); + __set_smpt_range(fmpt->smpt, end_gb_byte, SZ_1G, fmpt->prot); + return MPT_UPDATE_L1 | MPT_UPDATE_L2; + } + + /* GB region currently uses PAGE_SIZE mapping. */ + __set_smpt_range(fmpt->smpt, start_gb_byte, end_gb_byte, prot); + + /* Check if the entire GB region has the same prot bits. 
*/ + if (!__is_smpt_uniform(fmpt->smpt, prot)) + return MPT_UPDATE_L2; + + fmpt->gran_1g = true; + fmpt->prot = prot; + return MPT_UPDATE_L1; +} + #endif /* __ARM64_KVM_S2MPU_H__ */ diff --git a/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c b/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c index e36a5fe3f7f1..c2078c55e2f2 100644 --- a/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c +++ b/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c @@ -21,6 +21,8 @@ #define SMC_CMD_PREPARE_PD_ONOFF 0x82000410 #define SMC_MODE_POWER_UP 1 +#define PA_MAX ((phys_addr_t)SZ_1G * NR_GIGABYTES) + #define for_each_s2mpu(i) \ for ((i) = &kvm_hyp_s2mpus[0]; (i) != &kvm_hyp_s2mpus[kvm_hyp_nr_s2mpus]; (i)++) @@ -117,6 +119,17 @@ static void __all_invalidation(struct s2mpu *dev) dev->va + REG_NS_ALL_INVALIDATION); } +static void __range_invalidation(struct s2mpu *dev, phys_addr_t first_byte, + phys_addr_t last_byte) +{ + u32 start_ppn = first_byte >> RANGE_INVALIDATION_PPN_SHIFT; + u32 end_ppn = last_byte >> RANGE_INVALIDATION_PPN_SHIFT; + + writel_relaxed(start_ppn, dev->va + REG_NS_RANGE_INVALIDATION_START_PPN); + writel_relaxed(end_ppn, dev->va + REG_NS_RANGE_INVALIDATION_END_PPN); + writel_relaxed(INVALIDATION_INVALIDATE, dev->va + REG_NS_RANGE_INVALIDATION); +} + static void __set_l1entry_attr_with_prot(struct s2mpu *dev, unsigned int gb, unsigned int vid, enum mpt_prot prot) { @@ -186,6 +199,77 @@ static void initialize_with_mpt(struct s2mpu *dev, struct mpt *mpt) __set_control_regs(dev); } +/** + * Set MPT protection bits set to 'prot' in the give byte range (page-aligned). + * Update currently powered S2MPUs. + */ +static void set_mpt_range_locked(struct mpt *mpt, phys_addr_t first_byte, + phys_addr_t last_byte, enum mpt_prot prot) +{ + unsigned int first_gb = first_byte / SZ_1G; + unsigned int last_gb = last_byte / SZ_1G; + size_t start_gb_byte, end_gb_byte; + unsigned int gb, vid; + struct s2mpu *dev; + struct fmpt *fmpt; + enum mpt_update_flags flags; + + for_each_gb_in_range(gb, first_gb, last_gb) { + fmpt = &mpt->fmpt[gb]; + start_gb_byte = (gb == first_gb) ? first_byte % SZ_1G : 0; + end_gb_byte = (gb == last_gb) ? (last_byte % SZ_1G) + 1 : SZ_1G; + + flags = __set_fmpt_range(fmpt, start_gb_byte, end_gb_byte, prot); + + if (flags & MPT_UPDATE_L2) + kvm_flush_dcache_to_poc(fmpt->smpt, SMPT_SIZE); + + if (flags & MPT_UPDATE_L1) { + for_each_powered_s2mpu(dev) { + for_each_vid(vid) + __set_l1entry_attr_with_fmpt(dev, gb, vid, fmpt); + } + } + } + + /* Invalidate range in all powered S2MPUs. */ + for_each_powered_s2mpu(dev) + __range_invalidation(dev, first_byte, last_byte); +} + +static void s2mpu_host_stage2_set_owner(phys_addr_t addr, size_t size, u32 owner_id) +{ + /* Grant access only to the default owner of the page table (ID=0). */ + enum mpt_prot prot = owner_id ? MPT_PROT_NONE : MPT_PROT_RW; + + /* + * NOTE: The following code refers to 'end' as the exclusive upper + * bound and 'last' as the inclusive one. + */ + + /* + * Sanitize inputs with S2MPU-specific physical address space bounds. + * Ownership change requests outside this boundary will be ignored. + * The S2MPU also specifies that the PA region 4-34GB always maps to + * PROT_NONE and the corresponding MMIO registers are read-only. + * Ownership changes in this region will have no effect. 
+ */ + + if (addr >= PA_MAX) + return; + + size = min(size, (size_t)(PA_MAX - addr)); + if (size == 0) + return; + + hyp_spin_lock(&s2mpu_lock); + set_mpt_range_locked(&kvm_hyp_host_mpt, + ALIGN_DOWN(addr, SMPT_GRAN), + ALIGN(addr + size, SMPT_GRAN) - 1, + prot); + hyp_spin_unlock(&s2mpu_lock); +} + static bool s2mpu_host_smc_handler(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(u64, fn, host_ctxt, 0); @@ -281,4 +365,5 @@ static int s2mpu_init(void) const struct kvm_iommu_ops kvm_s2mpu_ops = (struct kvm_iommu_ops){ .init = s2mpu_init, .host_smc_handler = s2mpu_host_smc_handler, + .host_stage2_set_owner = s2mpu_host_stage2_set_owner, }; From 877e4ee07966a1e769654a1af0402a4e847e183a Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Wed, 13 Oct 2021 16:17:21 +0100 Subject: [PATCH 176/457] ANDROID: KVM: arm64: Implement MMIO handler in S2MPU driver The host should not have access to the vast majority of S2MPU MMIO registers. Currently it only needs access to fault information, in the future maybe also performance registers. Implement an MMIO trap handler for the S2MPU, allowing read-only access to FAULT_* registers, and a write-only access to INTERRUPT_CLEAR. Test: builds, boots Bug: 190463801 Change-Id: Ia482cc65642ba9ec303f443591e8f0fe192d4d27 Signed-off-by: David Brazdil (cherry picked from commit 81e70911d6022b3af8918b273c2b4743f37b267d) Signed-off-by: Mostafa Saleh Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_s2mpu.h | 3 ++ arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c | 66 +++++++++++++++++++++++++++ 2 files changed, 69 insertions(+) diff --git a/arch/arm64/include/asm/kvm_s2mpu.h b/arch/arm64/include/asm/kvm_s2mpu.h index 6904ae20a99f..fa10bc754f03 100644 --- a/arch/arm64/include/asm/kvm_s2mpu.h +++ b/arch/arm64/include/asm/kvm_s2mpu.h @@ -54,6 +54,9 @@ /* For use with hi_lo_readq_relaxed(). */ #define REG_NS_FAULT_PA_HIGH_LOW(vid) REG_NS_FAULT_PA_LOW(vid) +/* Mask used for extracting VID from FAULT_* register offset. */ +#define REG_NS_FAULT_VID_MASK GENMASK(7, 5) + #define VERSION_MAJOR_ARCH_VER_MASK GENMASK(31, 28) #define VERSION_MINOR_ARCH_VER_MASK GENMASK(27, 24) #define VERSION_REV_ARCH_VER_MASK GENMASK(23, 16) diff --git a/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c b/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c index c2078c55e2f2..f9db44b07985 100644 --- a/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c +++ b/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c @@ -323,6 +323,71 @@ static bool s2mpu_host_smc_handler(struct kvm_cpu_context *host_ctxt) return true; /* SMC handled */ } +static struct s2mpu *find_s2mpu_by_addr(phys_addr_t addr) +{ + struct s2mpu *dev; + + for_each_s2mpu(dev) { + if (dev->pa <= addr && addr < (dev->pa + S2MPU_MMIO_SIZE)) + return dev; + } + return NULL; +} + +static u32 host_mmio_reg_access_mask(size_t off, bool is_write) +{ + const u32 no_access = 0; + const u32 read_write = (u32)(-1); + const u32 read_only = is_write ? no_access : read_write; + const u32 write_only = is_write ? read_write : no_access; + u32 masked_off; + + /* IRQ handler can clear interrupts. */ + if (off == REG_NS_INTERRUPT_CLEAR) + return write_only & ALL_VIDS_BITMAP; + + /* IRQ handler can read bitmap of pending interrupts. */ + if (off == REG_NS_FAULT_STATUS) + return read_only & ALL_VIDS_BITMAP; + + /* IRQ handler can read fault information. 
*/ + masked_off = off & ~REG_NS_FAULT_VID_MASK; + if ((masked_off == REG_NS_FAULT_PA_LOW(0)) || + (masked_off == REG_NS_FAULT_PA_HIGH(0)) || + (masked_off == REG_NS_FAULT_INFO(0))) + return read_only; + + return no_access; +} + +static bool s2mpu_host_mmio_dabt_handler(struct kvm_cpu_context *host_ctxt, + phys_addr_t fault_pa, unsigned int len, + bool is_write, int rd) +{ + struct s2mpu *dev; + size_t off; + u32 mask; + + /* Only handle MMIO access with u32 size and alignment. */ + if ((len != sizeof(u32)) || (fault_pa & (sizeof(u32) - 1))) + return false; + + dev = find_s2mpu_by_addr(fault_pa); + if (!dev || !is_powered_on(dev)) + return false; + + off = fault_pa - dev->pa; + mask = host_mmio_reg_access_mask(off, is_write); + if (!mask) + return false; + + if (is_write) + writel_relaxed(cpu_reg(host_ctxt, rd) & mask, dev->va + off); + else + cpu_reg(host_ctxt, rd) = readl_relaxed(dev->va + off) & mask; + return true; +} + static int s2mpu_init(void) { struct s2mpu *dev; @@ -365,5 +430,6 @@ static int s2mpu_init(void) const struct kvm_iommu_ops kvm_s2mpu_ops = (struct kvm_iommu_ops){ .init = s2mpu_init, .host_smc_handler = s2mpu_host_smc_handler, + .host_mmio_dabt_handler = s2mpu_host_mmio_dabt_handler, .host_stage2_set_owner = s2mpu_host_stage2_set_owner, }; From 8c2b04dc1bb738cf1201e9cb804bc0c3ab1049a4 Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Wed, 13 Oct 2021 11:13:00 +0100 Subject: [PATCH 177/457] ANDROID: KVM: arm64: Unmap S2MPU MMIO registers from host stage-2 The S2MPU driver needs to protect its MMIO registers from the host. Implement the host_stage2_adjust_mmio_range callback and restrict the address range that is about to be mapped in to avoid the known S2MPU MMIO regions. Test: builds, boots Bug: 190463801 Change-Id: Ib46f5dd651b9368c31940035e4c28a7324fc4160 Signed-off-by: David Brazdil (cherry picked from commit 8f234061533dc798ac96fdb6b49b6b66584a8d0f) Signed-off-by: Mostafa Saleh Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c | 29 +++++++++++++++++++++++++++ arch/arm64/kvm/iommu/s2mpu.c | 11 ++++++++++ 2 files changed, 40 insertions(+) diff --git a/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c b/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c index f9db44b07985..fb9ce31a7ae0 100644 --- a/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c +++ b/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c @@ -270,6 +270,34 @@ static void s2mpu_host_stage2_set_owner(phys_addr_t addr, size_t size, u32 owner hyp_spin_unlock(&s2mpu_lock); } +static int s2mpu_host_stage2_adjust_mmio_range(phys_addr_t addr, phys_addr_t *start, + phys_addr_t *end) +{ + struct s2mpu *dev; + phys_addr_t dev_start, dev_end, int_start, int_end; + + /* Find the PA interval in the non-empty, sorted list of S2MPUs. 
*/ + int_start = 0; + for_each_s2mpu(dev) { + dev_start = dev->pa; + dev_end = dev_start + S2MPU_MMIO_SIZE; + int_end = dev_start; + + if (dev_start <= addr && addr < dev_end) + return -EPERM; + + if (int_start <= addr && addr < int_end) + break; + + int_start = dev_end; + int_end = PA_MAX; + } + + *start = max(*start, int_start); + *end = min(*end, int_end); + return 0; +} + static bool s2mpu_host_smc_handler(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(u64, fn, host_ctxt, 0); @@ -432,4 +460,5 @@ const struct kvm_iommu_ops kvm_s2mpu_ops = (struct kvm_iommu_ops){ .host_smc_handler = s2mpu_host_smc_handler, .host_mmio_dabt_handler = s2mpu_host_mmio_dabt_handler, .host_stage2_set_owner = s2mpu_host_stage2_set_owner, + .host_stage2_adjust_mmio_range = s2mpu_host_stage2_adjust_mmio_range, }; diff --git a/arch/arm64/kvm/iommu/s2mpu.c b/arch/arm64/kvm/iommu/s2mpu.c index ce4b32453b7c..12b387b351bc 100644 --- a/arch/arm64/kvm/iommu/s2mpu.c +++ b/arch/arm64/kvm/iommu/s2mpu.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -279,6 +280,13 @@ static void free_s2mpu_array(struct s2mpu *array) free_pages((unsigned long)array, order); } +static int cmp_s2mpu(const void *p1, const void *p2) +{ + const struct s2mpu *a = p1, *b = p2; + + return (a->pa > b->pa) - (a->pa < b->pa); +} + static int create_s2mpu_array(struct s2mpu **array) { struct s2mpu_list_entry *entry, *tmp; @@ -297,6 +305,9 @@ static int create_s2mpu_array(struct s2mpu **array) } WARN_ON(i != kvm_hyp_nr_s2mpus); + /* Searching through the list assumes that it is sorted. */ + sort(*array, kvm_hyp_nr_s2mpus, sizeof(struct s2mpu), cmp_s2mpu, NULL); + kvm_hyp_s2mpus = kern_hyp_va(*array); return 0; } From 516a77ed1091475e859fbc4753dbed76e610845b Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Thu, 28 Oct 2021 14:58:23 +0100 Subject: [PATCH 178/457] ANDROID: Enable KVM_S2MPU in gki_defconfig Enable the KVM S2MPU driver in GKI. Test: builds, boots Bug: 190463801 Change-Id: I653cac7622e8b6e7f6484d7d8d9ee0b192edb705 Signed-off-by: David Brazdil (cherry picked from commit 6d4485877385f32fdf45e24bde8e5b19cfac2691) Signed-off-by: Mostafa Saleh Signed-off-by: Quentin Perret --- arch/arm64/configs/gki_defconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/configs/gki_defconfig b/arch/arm64/configs/gki_defconfig index c4919417aba5..5383353f960f 100644 --- a/arch/arm64/configs/gki_defconfig +++ b/arch/arm64/configs/gki_defconfig @@ -75,6 +75,7 @@ CONFIG_ARM_SCPI_CPUFREQ=y CONFIG_ARM_SCMI_CPUFREQ=y CONFIG_VIRTUALIZATION=y CONFIG_KVM=y +CONFIG_KVM_S2MPU=y CONFIG_KPROBES=y CONFIG_JUMP_LABEL=y CONFIG_SHADOW_CALL_STACK=y From ea35246b8e6b875c5e53e3493b2c2e93144311e4 Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Mon, 8 Nov 2021 10:26:35 +0000 Subject: [PATCH 179/457] ANDROID: KVM: arm64: Mark select_iommu_ops static The function is only used in the compilation unit where it is defined. Silence a warning by marking it static. 
Test: builds Bug: 190463801 Change-Id: I296cffefdef4639ef2bab644d42f1374ee1a2f60 Signed-off-by: David Brazdil (cherry picked from commit 91abc8ece242eb595e7dd0b85eac27c51a27ba88) Signed-off-by: Mostafa Saleh Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/nvhe/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c index 1716b70f9286..6f8b2b5c1320 100644 --- a/arch/arm64/kvm/hyp/nvhe/setup.c +++ b/arch/arm64/kvm/hyp/nvhe/setup.c @@ -301,7 +301,7 @@ static int fix_hyp_pgtable_refcnt(void) &walker); } -int select_iommu_ops(enum kvm_iommu_driver driver) +static int select_iommu_ops(enum kvm_iommu_driver driver) { switch (driver) { case KVM_IOMMU_DRIVER_NONE: From c398dac459006a5e6fa9d8840ff66a0f1ed54307 Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Mon, 6 Dec 2021 11:03:06 +0000 Subject: [PATCH 180/457] ANDROID: KVM: arm64: Initialize pkvm_pgtable.mm_ops earlier The `init` callback of an IOMMU driver is called just before `finalize_host_mappings` so that EL2 mappings created by drivers are subsequently unmapped from host stage-2. However, at this point hyp has already switched to the buddy allocator, having reserved pages allocated by the early allocator, but `pkvm_pgtable.mm_ops` have not been switched to buddy allocator callbacks. As a result, pages allocated for EL2 mappings of the IOMMU driver are allocated by the obsoleted early allocator and remain treated as free by the buddy allocator. This likely leads to a corruption in the free page lists and a later hyp panic. Move the initialization of `pkvm_pgtable.mm_ops` before `finalize_host_mappings` and the call to IOMMU's `init`. Test: run a VM Test: adb shell cmd jobscheduler run -f android 5132250 Bug: 190463801 Bug: 209004831 Change-Id: I1f6e00bca087d889b0cad4bd43d044895e37006c Signed-off-by: David Brazdil (cherry picked from commit 395d0451237fb14d801591d0a42c2abaa6641d03) Signed-off-by: Mostafa Saleh Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/nvhe/setup.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c index 6f8b2b5c1320..13312c484fc3 100644 --- a/arch/arm64/kvm/hyp/nvhe/setup.c +++ b/arch/arm64/kvm/hyp/nvhe/setup.c @@ -336,13 +336,6 @@ void __noreturn __pkvm_init_finalise(void) if (ret) goto out; - if (kvm_iommu_ops.init) { - ret = kvm_iommu_ops.init(); - if (ret) - goto out; - } - - pkvm_pgtable_mm_ops = (struct kvm_pgtable_mm_ops) { .zalloc_page = hyp_zalloc_hyp_page, .phys_to_virt = hyp_phys_to_virt, @@ -353,6 +346,12 @@ void __noreturn __pkvm_init_finalise(void) }; pkvm_pgtable.mm_ops = &pkvm_pgtable_mm_ops; + if (kvm_iommu_ops.init) { + ret = kvm_iommu_ops.init(); + if (ret) + goto out; + } + ret = fix_host_ownership(); if (ret) goto out; From cb5df6595abfeea4b71ff046cb1636f237442f74 Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Fri, 14 Jan 2022 09:20:41 +0000 Subject: [PATCH 181/457] ANDROID: KVM: arm64: Remove kernel-doc in S2MPU driver Comments in S2MPU driver code were mistakenly prefixed with /**, denoting a kernel-doc comment. Since these do not match kernel-doc syntax, replace them with regular /* comments. 
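For reference, the distinction being corrected (illustrative snippet, not taken
from the patch):

    /**
     * kvm_example() - a '/**' opener marks this as kernel-doc
     * @arg: and the body must then follow kernel-doc syntax
     */

    /*
     * A regular multi-line comment starts with '/*'; use this form when the
     * text does not follow kernel-doc conventions.
     */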
Test: n/a Bug: 190463801 Change-Id: I0c68bb5d1c843caeb4d535430bdfc866ba8d119c Signed-off-by: David Brazdil (cherry picked from commit 4377d9dea9b2006e77ad82fd0c8120e40bc7d812) Signed-off-by: Mostafa Saleh Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_s2mpu.h | 2 +- arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c | 6 +++--- arch/arm64/kvm/iommu/s2mpu.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/arm64/include/asm/kvm_s2mpu.h b/arch/arm64/include/asm/kvm_s2mpu.h index fa10bc754f03..6b9218673031 100644 --- a/arch/arm64/include/asm/kvm_s2mpu.h +++ b/arch/arm64/include/asm/kvm_s2mpu.h @@ -268,7 +268,7 @@ static inline bool __is_smpt_uniform(u32 *smpt, enum mpt_prot prot) return true; } -/** +/* * Set protection bits of FMPT/SMPT in a given range. * Returns flags specifying whether L1/L2 changes need to be made visible * to the device. diff --git a/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c b/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c index fb9ce31a7ae0..89b2f0763780 100644 --- a/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c +++ b/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c @@ -157,7 +157,7 @@ static void __set_l1entry_l2table_addr(struct s2mpu *dev, unsigned int gb, dev->va + REG_NS_L1ENTRY_L2TABLE_ADDR(vid, gb)); } -/** +/* * Initialize S2MPU device and set all GB regions to 1G granularity with * given protection bits. */ @@ -176,7 +176,7 @@ static void initialize_with_prot(struct s2mpu *dev, enum mpt_prot prot) __set_control_regs(dev); } -/** +/* * Initialize S2MPU device, set L2 table addresses and configure L1TABLE_ATTR * registers according to the given MPT struct. */ @@ -199,7 +199,7 @@ static void initialize_with_mpt(struct s2mpu *dev, struct mpt *mpt) __set_control_regs(dev); } -/** +/* * Set MPT protection bits set to 'prot' in the give byte range (page-aligned). * Update currently powered S2MPUs. */ diff --git a/arch/arm64/kvm/iommu/s2mpu.c b/arch/arm64/kvm/iommu/s2mpu.c index 12b387b351bc..00111f237de3 100644 --- a/arch/arm64/kvm/iommu/s2mpu.c +++ b/arch/arm64/kvm/iommu/s2mpu.c @@ -134,7 +134,7 @@ static int s2mpu_probe_v9(struct platform_device *pdev, void __iomem *kaddr, return 0; } -/** +/* * Parse interrupt information from DT and if found, register IRQ handler. * This is considered optional and will not fail even if the initialization is * unsuccessful. In that case the IRQ will remain masked. From 2678f06afbb6795f12a414b8a790a487c84a8b7f Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Mon, 10 Jan 2022 11:17:54 +0000 Subject: [PATCH 182/457] ANDROID: KVM: arm64: Wait on S2MPU.STATUS after invalidation The S2MPU must wait for a v9 device to finish invalidation before accessing its SFRs. Failure to do so can result in memory transaction timeouts. Add a loop that polls the STATUS register while the return value has the BUSY and ON_INVALIDATING bits set. 
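Condensed, the wait added below amounts to the following (sketch only; the
real helper is __wait_while() in the diff, applied to v9 devices only):

    /* Spin while the S2MPU reports it is still busy invalidating. */
    while ((readl_relaxed(dev->va + REG_NS_STATUS) &
            (STATUS_BUSY | STATUS_ON_INVALIDATING)) ==
           (STATUS_BUSY | STATUS_ON_INVALIDATING))
            ;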
Test: builds, boots Bug: 190463801 Bug: 206761586 Change-Id: Ie8755bd3466b2c76ca05d6f3f2dd6e8e7bce592c Signed-off-by: David Brazdil (cherry picked from commit e149939df25f9677eab1f996b0aac7575ca2995e) Signed-off-by: Mostafa Saleh Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_s2mpu.h | 4 ++++ arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c | 21 +++++++++++++++++++-- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/kvm_s2mpu.h b/arch/arm64/include/asm/kvm_s2mpu.h index 6b9218673031..3d541accf982 100644 --- a/arch/arm64/include/asm/kvm_s2mpu.h +++ b/arch/arm64/include/asm/kvm_s2mpu.h @@ -24,6 +24,7 @@ #define REG_NS_INTERRUPT_ENABLE_PER_VID_SET 0x20 #define REG_NS_INTERRUPT_CLEAR 0x2c #define REG_NS_VERSION 0x60 +#define REG_NS_STATUS 0x68 #define REG_NS_NUM_CONTEXT 0x100 #define REG_NS_CONTEXT_CFG_VALID_VID 0x104 #define REG_NS_ALL_INVALIDATION 0x1000 @@ -67,6 +68,9 @@ VERSION_MINOR_ARCH_VER_MASK | \ VERSION_REV_ARCH_VER_MASK) +#define STATUS_BUSY BIT(0) +#define STATUS_ON_INVALIDATING BIT(1) + #define NUM_CONTEXT_MASK GENMASK(3, 0) #define CONTEXT_CFG_VALID_VID_CTX_VALID(ctx) BIT((4 * (ctx)) + 3) diff --git a/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c b/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c index 89b2f0763780..6fa701ded195 100644 --- a/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c +++ b/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c @@ -113,10 +113,26 @@ static void __set_control_regs(struct s2mpu *dev) writel_relaxed(ctrl0, dev->va + REG_NS_CTRL0); } +/* Poll the given SFR as long as its value has all bits of a given mask set. */ +static void __wait_while(void __iomem *addr, u32 mask) +{ + while ((readl_relaxed(addr) & mask) == mask) + continue; +} + +static void __wait_for_invalidation_complete(struct s2mpu *dev) +{ + /* Must not access SFRs while S2MPU is busy invalidating (v9 only). */ + if (is_version(dev, S2MPU_VERSION_9)) { + __wait_while(dev->va + REG_NS_STATUS, + STATUS_BUSY | STATUS_ON_INVALIDATING); + } +} + static void __all_invalidation(struct s2mpu *dev) { - writel_relaxed(INVALIDATION_INVALIDATE, - dev->va + REG_NS_ALL_INVALIDATION); + writel_relaxed(INVALIDATION_INVALIDATE, dev->va + REG_NS_ALL_INVALIDATION); + __wait_for_invalidation_complete(dev); } static void __range_invalidation(struct s2mpu *dev, phys_addr_t first_byte, @@ -128,6 +144,7 @@ static void __range_invalidation(struct s2mpu *dev, phys_addr_t first_byte, writel_relaxed(start_ppn, dev->va + REG_NS_RANGE_INVALIDATION_START_PPN); writel_relaxed(end_ppn, dev->va + REG_NS_RANGE_INVALIDATION_END_PPN); writel_relaxed(INVALIDATION_INVALIDATE, dev->va + REG_NS_RANGE_INVALIDATION); + __wait_for_invalidation_complete(dev); } static void __set_l1entry_attr_with_prot(struct s2mpu *dev, unsigned int gb, From 678ff6c4cb6a5d2e7e808e9026443a910ddb0975 Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Thu, 24 Feb 2022 14:46:16 +0000 Subject: [PATCH 183/457] ANDROID: KVM: arm64: Fix host MMIO DABT handler IPA The data abort fault IPA obtained from HFAR_EL2 has the bottom 12 bits zeroed out. This broke the host MMIO DABT handler because the offsets of accessed MMIO registers were rounded down to the nearest page. Include FAR_EL2 in the address to fix the issue. 
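In other words, HPFAR_EL2 supplies the faulting IPA page frame (its FIPA field is shifted left by 8 to rebuild a page-aligned address), while FAR_EL2 still carries the offset of the access within that page. A rough user-space sketch of the arithmetic, with assumed mask layouts used purely for illustration (the kernel's HPFAR_MASK and FAR_MASK definitions are authoritative):

#include <stdint.h>
#include <stdio.h>

/* Assumed layouts: FIPA at HPFAR_EL2[47:4], page offset at FAR_EL2[11:0]. */
#define EX_HPFAR_FIPA_MASK	0x0000fffffffffff0ULL
#define EX_FAR_OFFSET_MASK	0x0000000000000fffULL

int main(void)
{
	/* Pretend the host faulted on IPA 0x12345678: page 0x12345, offset 0x678. */
	uint64_t hpfar_el2 = (0x12345678ULL >> 12) << 4;	/* FIPA field holds IPA[...:12] */
	uint64_t far_el2   = 0xffff800000000678ULL;		/* faulting VA, same page offset */
	uint64_t addr;

	addr  = (hpfar_el2 & EX_HPFAR_FIPA_MASK) << 8;	/* page-aligned IPA only */
	addr |= far_el2 & EX_FAR_OFFSET_MASK;		/* restore the in-page offset */

	printf("reconstructed fault address: 0x%llx\n", (unsigned long long)addr);
	return 0;
}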
Bug: 220194478 Change-Id: I6473e2dfbe189c58c15c0e5647d695d07f88c5e0 Signed-off-by: David Brazdil (cherry picked from commit 346987baf5d756ddfabebd7fa44b9bf8103f2ea0) Signed-off-by: Mostafa Saleh Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/nvhe/mem_protect.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index c6f40dfc63ef..b1a02b996526 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -726,6 +726,7 @@ void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt) BUG_ON(!__get_fault_info(esr, &fault)); addr = (fault.hpfar_el2 & HPFAR_MASK) << 8; + addr |= fault.far_el2 & FAR_MASK; /* See if any subsystem can handle this abort. */ if (is_dabt(esr) && !addr_is_memory(addr)) From f5a49750dcfaaf74bbeec04ce4da1e26bc1ffcc7 Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Wed, 23 Feb 2022 19:52:31 +0000 Subject: [PATCH 184/457] ANDROID: KVM: arm64: iommu: Driver initialization hypcall Add '__pkvm_iommu_driver_init' hypcall and 'struct pkvm_iommu_ops' with an 'init' callback implemented by an EL2 driver. Driver-specific data can be passed to 'init' from the host. The memory is pinned while the callback processed it. Bug: 190463801 Change-Id: I1185350bb46d41ff060a207af8e6d1f2f8a3d32d Signed-off-by: David Brazdil (cherry picked from commit 1d9ae14c927282663397065a9c58adf399e7792d) Signed-off-by: Mostafa Saleh Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_asm.h | 1 + arch/arm64/include/asm/kvm_host.h | 6 ++ arch/arm64/include/asm/kvm_hyp.h | 13 --- arch/arm64/kvm/Makefile | 2 +- arch/arm64/kvm/hyp/include/nvhe/iommu.h | 38 +++++++++ arch/arm64/kvm/hyp/nvhe/Makefile | 2 +- arch/arm64/kvm/hyp/nvhe/hyp-main.c | 11 +++ arch/arm64/kvm/hyp/nvhe/iommu.c | 103 ++++++++++++++++++++++++ arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c | 4 +- arch/arm64/kvm/hyp/nvhe/mem_protect.c | 1 + arch/arm64/kvm/hyp/nvhe/setup.c | 1 + arch/arm64/kvm/iommu.c | 12 +++ 12 files changed, 178 insertions(+), 16 deletions(-) create mode 100644 arch/arm64/kvm/hyp/include/nvhe/iommu.h create mode 100644 arch/arm64/kvm/hyp/nvhe/iommu.c create mode 100644 arch/arm64/kvm/iommu.c diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 12aa0ccc3b3d..3a2055044968 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -81,6 +81,7 @@ enum __kvm_host_smccc_func { __KVM_HOST_SMCCC_FUNC___pkvm_vcpu_load, __KVM_HOST_SMCCC_FUNC___pkvm_vcpu_put, __KVM_HOST_SMCCC_FUNC___pkvm_vcpu_sync_state, + __KVM_HOST_SMCCC_FUNC___pkvm_iommu_driver_init, }; #define DECLARE_KVM_VHE_SYM(sym) extern char sym[] diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 36daec0bbb0f..ea7e80bdb8a9 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -390,6 +390,12 @@ int kvm_s2mpu_init(void); static inline int kvm_s2mpu_init(void) { return -ENODEV; } #endif +enum pkvm_iommu_driver_id { + PKVM_IOMMU_NR_DRIVERS, +}; + +int pkvm_iommu_driver_init(enum pkvm_iommu_driver_id drv_id, void *data, size_t size); + struct vcpu_reset_state { unsigned long pc; unsigned long r0; diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index 4c343db80324..f3e041cfe751 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -131,18 +131,5 @@ extern u64 kvm_nvhe_sym(id_aa64mmfr2_el1_sys_val); extern unsigned long kvm_nvhe_sym(__icache_flags); extern unsigned 
int kvm_nvhe_sym(kvm_arm_vmid_bits); extern bool kvm_nvhe_sym(smccc_trng_available); -struct kvm_iommu_ops { - int (*init)(void); - bool (*host_smc_handler)(struct kvm_cpu_context *host_ctxt); - bool (*host_mmio_dabt_handler)(struct kvm_cpu_context *host_ctxt, - phys_addr_t fault_pa, unsigned int len, - bool is_write, int rd); - void (*host_stage2_set_owner)(phys_addr_t addr, size_t size, u32 owner_id); - int (*host_stage2_adjust_mmio_range)(phys_addr_t addr, phys_addr_t *start, - phys_addr_t *end); -}; - -extern struct kvm_iommu_ops kvm_iommu_ops; -extern const struct kvm_iommu_ops kvm_s2mpu_ops; #endif /* __ARM64_KVM_HYP_H__ */ diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile index ccec383ad673..7da3971e6f29 100644 --- a/arch/arm64/kvm/Makefile +++ b/arch/arm64/kvm/Makefile @@ -14,7 +14,7 @@ kvm-y += arm.o mmu.o mmio.o psci.o hypercalls.o pvtime.o \ inject_fault.o va_layout.o handle_exit.o \ guest.o debug.o reset.o sys_regs.o stacktrace.o \ vgic-sys-reg-v3.o fpsimd.o pkvm.o \ - arch_timer.o trng.o vmid.o \ + arch_timer.o trng.o vmid.o iommu.o \ vgic/vgic.o vgic/vgic-init.o \ vgic/vgic-irqfd.o vgic/vgic-v2.o \ vgic/vgic-v3.o vgic/vgic-v4.o \ diff --git a/arch/arm64/kvm/hyp/include/nvhe/iommu.h b/arch/arm64/kvm/hyp/include/nvhe/iommu.h new file mode 100644 index 000000000000..b0a27f2ee9b7 --- /dev/null +++ b/arch/arm64/kvm/hyp/include/nvhe/iommu.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef __ARM64_KVM_NVHE_IOMMU_H__ +#define __ARM64_KVM_NVHE_IOMMU_H__ + +#include +#include + +#include + +struct pkvm_iommu_ops { + /* + * Global driver initialization called before devices are registered. + * Driver-specific arguments are passed in a buffer shared by the host. + * The buffer memory has been pinned in EL2 but host retains R/W access. + * Extra care must be taken when reading from it to avoid TOCTOU bugs. + * Driver initialization lock held during callback. 
+ */ + int (*init)(void *data, size_t size); +}; + +int __pkvm_iommu_driver_init(enum pkvm_iommu_driver_id id, void *data, size_t size); + +struct kvm_iommu_ops { + int (*init)(void); + bool (*host_smc_handler)(struct kvm_cpu_context *host_ctxt); + bool (*host_mmio_dabt_handler)(struct kvm_cpu_context *host_ctxt, + phys_addr_t fault_pa, unsigned int len, + bool is_write, int rd); + void (*host_stage2_set_owner)(phys_addr_t addr, size_t size, + enum pkvm_component_id owner_id); + int (*host_stage2_adjust_mmio_range)(phys_addr_t addr, phys_addr_t *start, + phys_addr_t *end); +}; + +extern struct kvm_iommu_ops kvm_iommu_ops; +extern const struct kvm_iommu_ops kvm_s2mpu_ops; + +#endif /* __ARM64_KVM_NVHE_IOMMU_H__ */ diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile index 7086c6134331..7772aedd9b2b 100644 --- a/arch/arm64/kvm/hyp/nvhe/Makefile +++ b/arch/arm64/kvm/hyp/nvhe/Makefile @@ -22,7 +22,7 @@ lib-objs := $(addprefix ../../../lib/, $(lib-objs)) hyp-obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o \ hyp-main.o hyp-smp.o psci-relay.o early_alloc.o page_alloc.o \ - cache.o setup.o mm.o mem_protect.o sys_regs.o pkvm.o stacktrace.o ffa.o + cache.o setup.o mm.o mem_protect.o sys_regs.o pkvm.o stacktrace.o ffa.o iommu.o hyp-obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \ ../fpsimd.o ../hyp-entry.o ../exception.o ../pgtable.o hyp-obj-$(CONFIG_DEBUG_LIST) += list_debug.o diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 9e4dd62c1c58..e758e81c3eb1 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -1113,6 +1114,15 @@ static void handle___pkvm_teardown_vm(struct kvm_cpu_context *host_ctxt) cpu_reg(host_ctxt, 1) = __pkvm_teardown_vm(handle); } +static void handle___pkvm_iommu_driver_init(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(enum pkvm_iommu_driver_id, id, host_ctxt, 1); + DECLARE_REG(void *, data, host_ctxt, 2); + DECLARE_REG(size_t, size, host_ctxt, 3); + + cpu_reg(host_ctxt, 1) = __pkvm_iommu_driver_init(id, data, size); +} + typedef void (*hcall_t)(struct kvm_cpu_context *); #define HANDLE_FUNC(x) [__KVM_HOST_SMCCC_FUNC_##x] = (hcall_t)handle_##x @@ -1147,6 +1157,7 @@ static const hcall_t host_hcall[] = { HANDLE_FUNC(__pkvm_vcpu_load), HANDLE_FUNC(__pkvm_vcpu_put), HANDLE_FUNC(__pkvm_vcpu_sync_state), + HANDLE_FUNC(__pkvm_iommu_driver_init), }; static void handle_host_hcall(struct kvm_cpu_context *host_ctxt) diff --git a/arch/arm64/kvm/hyp/nvhe/iommu.c b/arch/arm64/kvm/hyp/nvhe/iommu.c new file mode 100644 index 000000000000..3bf76be92821 --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/iommu.c @@ -0,0 +1,103 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2022 Google LLC + * Author: David Brazdil + */ + +#include + +#include +#include +#include + +#include + +enum { + IOMMU_DRIVER_NOT_READY = 0, + IOMMU_DRIVER_INITIALIZING, + IOMMU_DRIVER_READY, +}; + +struct pkvm_iommu_driver { + const struct pkvm_iommu_ops *ops; + atomic_t state; +}; + +static struct pkvm_iommu_driver iommu_drivers[PKVM_IOMMU_NR_DRIVERS]; + +/* + * Find IOMMU driver by its ID. The input ID is treated as unstrusted + * and is properly validated. 
+ */ +static inline struct pkvm_iommu_driver *get_driver(enum pkvm_iommu_driver_id id) +{ + size_t index = (size_t)id; + + if (index >= ARRAY_SIZE(iommu_drivers)) + return NULL; + + return &iommu_drivers[index]; +} + +static const struct pkvm_iommu_ops *get_driver_ops(enum pkvm_iommu_driver_id id) +{ + switch (id) { + default: + return NULL; + } +} + +static inline bool driver_acquire_init(struct pkvm_iommu_driver *drv) +{ + return atomic_cmpxchg_acquire(&drv->state, IOMMU_DRIVER_NOT_READY, + IOMMU_DRIVER_INITIALIZING) + == IOMMU_DRIVER_NOT_READY; +} + +static inline void driver_release_init(struct pkvm_iommu_driver *drv, + bool success) +{ + atomic_set_release(&drv->state, success ? IOMMU_DRIVER_READY + : IOMMU_DRIVER_NOT_READY); +} + +/* + * Initialize EL2 IOMMU driver. + * + * This is a common hypercall for driver initialization. Driver-specific + * arguments are passed in a shared memory buffer. The driver is expected to + * initialize it's page-table bookkeeping. + */ +int __pkvm_iommu_driver_init(enum pkvm_iommu_driver_id id, void *data, size_t size) +{ + struct pkvm_iommu_driver *drv; + const struct pkvm_iommu_ops *ops; + int ret = 0; + + data = kern_hyp_va(data); + + drv = get_driver(id); + ops = get_driver_ops(id); + if (!drv || !ops) + return -EINVAL; + + if (!driver_acquire_init(drv)) + return -EBUSY; + + drv->ops = ops; + + /* This can change stage-2 mappings. */ + if (ops->init) { + ret = hyp_pin_shared_mem(data, data + size); + if (!ret) { + ret = ops->init(data, size); + hyp_unpin_shared_mem(data, data + size); + } + if (ret) + goto out; + } + +out: + driver_release_init(drv, /*success=*/!ret); + return ret; +} diff --git a/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c b/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c index 6fa701ded195..f10326b7bf8b 100644 --- a/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c +++ b/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c @@ -13,6 +13,7 @@ #include +#include #include #include #include @@ -254,7 +255,8 @@ static void set_mpt_range_locked(struct mpt *mpt, phys_addr_t first_byte, __range_invalidation(dev, first_byte, last_byte); } -static void s2mpu_host_stage2_set_owner(phys_addr_t addr, size_t size, u32 owner_id) +static void s2mpu_host_stage2_set_owner(phys_addr_t addr, size_t size, + enum pkvm_component_id owner_id) { /* Grant access only to the default owner of the page table (ID=0). */ enum mpt_prot prot = owner_id ? 
MPT_PROT_NONE : MPT_PROT_RW; diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index b1a02b996526..3faf77a513d9 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -16,6 +16,7 @@ #include #include +#include #include #include #include diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c index 13312c484fc3..cae960251048 100644 --- a/arch/arm64/kvm/hyp/nvhe/setup.c +++ b/arch/arm64/kvm/hyp/nvhe/setup.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/arm64/kvm/iommu.c b/arch/arm64/kvm/iommu.c new file mode 100644 index 000000000000..edd7316bd61b --- /dev/null +++ b/arch/arm64/kvm/iommu.c @@ -0,0 +1,12 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2022 - Google LLC + * Author: David Brazdil + */ + +#include + +int pkvm_iommu_driver_init(enum pkvm_iommu_driver_id id, void *data, size_t size) +{ + return kvm_call_hyp_nvhe(__pkvm_iommu_driver_init, id, data, size); +} From 29249d8d2449df4b07857e79bcca7036da8c4e6b Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Mon, 14 Feb 2022 18:13:49 +0000 Subject: [PATCH 185/457] ANDROID: KVM: arm64: iommu: Avoid mapping devices in host stage-2 Introduce a linked list of IOMMU devices and 'pkvm_iommu_host_stage2_adjust_range' called from host DABT handler. The function will adjust the memory range that is about to be mapped to avoid MMIO regions of all devices in the linked list. If the host tried to access a device MMIO region, the access is declined. The function replaces the existing call to 'kvm_iommu.ops.host_stage2_adjust_mmio_range' callback. Bug: 190463801 Change-Id: Iacd6b74147fea2fef04846a91f0a5e550daaf074 Signed-off-by: David Brazdil (cherry picked from commit d7adab5f9fcb7f7efd76bdbc23a1b37322156112) Signed-off-by: Mostafa Saleh Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/include/nvhe/iommu.h | 8 +++++ arch/arm64/kvm/hyp/nvhe/iommu.c | 41 +++++++++++++++++++++++++ arch/arm64/kvm/hyp/nvhe/mem_protect.c | 12 ++++---- 3 files changed, 55 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kvm/hyp/include/nvhe/iommu.h b/arch/arm64/kvm/hyp/include/nvhe/iommu.h index b0a27f2ee9b7..54f5ce5976d7 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/iommu.h +++ b/arch/arm64/kvm/hyp/include/nvhe/iommu.h @@ -18,7 +18,15 @@ struct pkvm_iommu_ops { int (*init)(void *data, size_t size); }; +struct pkvm_iommu { + struct list_head list; + phys_addr_t pa; + size_t size; +}; + int __pkvm_iommu_driver_init(enum pkvm_iommu_driver_id id, void *data, size_t size); +int pkvm_iommu_host_stage2_adjust_range(phys_addr_t addr, phys_addr_t *start, + phys_addr_t *end); struct kvm_iommu_ops { int (*init)(void); diff --git a/arch/arm64/kvm/hyp/nvhe/iommu.c b/arch/arm64/kvm/hyp/nvhe/iommu.c index 3bf76be92821..f2b510fd2727 100644 --- a/arch/arm64/kvm/hyp/nvhe/iommu.c +++ b/arch/arm64/kvm/hyp/nvhe/iommu.c @@ -25,6 +25,14 @@ struct pkvm_iommu_driver { static struct pkvm_iommu_driver iommu_drivers[PKVM_IOMMU_NR_DRIVERS]; +/* IOMMU device list. Must only be accessed with host_mmu.lock held. */ +static LIST_HEAD(iommu_list); + +static void assert_host_component_locked(void) +{ + hyp_assert_lock_held(&host_mmu.lock); +} + /* * Find IOMMU driver by its ID. The input ID is treated as unstrusted * and is properly validated. @@ -101,3 +109,36 @@ out: driver_release_init(drv, /*success=*/!ret); return ret; } + +/* + * Check host memory access against IOMMUs' MMIO regions. 
+ * Returns -EPERM if the address is within the bounds of a registered device. + * Otherwise returns zero and adjusts boundaries of the new mapping to avoid + * MMIO regions of registered IOMMUs. + */ +int pkvm_iommu_host_stage2_adjust_range(phys_addr_t addr, phys_addr_t *start, + phys_addr_t *end) +{ + struct pkvm_iommu *dev; + phys_addr_t new_start = *start; + phys_addr_t new_end = *end; + phys_addr_t dev_start, dev_end; + + assert_host_component_locked(); + + list_for_each_entry(dev, &iommu_list, list) { + dev_start = dev->pa; + dev_end = dev_start + dev->size; + + if (addr < dev_start) + new_end = min(new_end, dev_start); + else if (addr >= dev_end) + new_start = max(new_start, dev_end); + else + return -EPERM; + } + + *start = new_start; + *end = new_end; + return 0; +} diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 3faf77a513d9..3711f626fb4a 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -625,13 +625,13 @@ static int host_stage2_idmap(u64 addr) prot = is_memory ? PKVM_HOST_MEM_PROT : PKVM_HOST_MMIO_PROT; - /** - * Let device drivers adjust the permitted range first. - * host_stage2_adjust_range() should be last to also properly align it. + /* + * Adjust against IOMMU devices first. host_stage2_adjust_range() should + * be called last for proper alignment. */ - if (!is_memory && kvm_iommu_ops.host_stage2_adjust_mmio_range) { - ret = kvm_iommu_ops.host_stage2_adjust_mmio_range(addr, &range.start, - &range.end); + if (!is_memory) { + ret = pkvm_iommu_host_stage2_adjust_range(addr, &range.start, + &range.end); if (ret) return ret; } From 69b7603406fa0f60fae129e05978f71f1ed27afc Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Mon, 14 Feb 2022 17:52:34 +0000 Subject: [PATCH 186/457] ANDROID: KVM: arm64: iommu: Register device hypcall Add '__pkvm_iommu_register' hypcall for registering a new IOMMU device. The handler allocates a linked-list entry for the device from a memory pool provided by the host. If the pool has run out, the handler returns -ENOMEM and expects the host to call it again with a fresh mem pool. The inputs are validated, eg. ID is unique and memory region does not overlap with existing IOMMUs. The driver can also implement a 'validate' callback for driver-specific input validation. If successful, the handler creates a private EL2 mapping for the device, forces the memory region is unmapped from host stage-2 and inserts the device into the linked list. Future attempts to map the MMIO region will fail because of pkvm_iommu_host_stage2_adjust_range. 
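For context, here is a hedged sketch of how an EL1 platform driver might consume this interface from its probe path. example_iommu_probe() and EXAMPLE_DRIVER_ID are illustrative names only (no real driver IDs exist in the enum at this point in the series), and the -ENOMEM retry is already handled inside pkvm_iommu_register():

#include <linux/platform_device.h>
#include <linux/ioport.h>
#include <asm/kvm_host.h>

static int example_iommu_probe(struct platform_device *pdev)
{
	struct resource *res;
	int ret;

	/* MMIO region that EL2 will map privately and hide from host stage-2. */
	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
	if (!res)
		return -ENODEV;

	/* EXAMPLE_DRIVER_ID stands in for a future enum pkvm_iommu_driver_id value. */
	ret = pkvm_iommu_register(&pdev->dev, EXAMPLE_DRIVER_ID,
				  res->start, resource_size(res));
	if (ret)
		dev_err(&pdev->dev, "pkvm IOMMU registration failed: %d\n", ret);

	return ret;
}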
Bug: 190463801 Change-Id: If6f707555c80ac164ff995f42260872896a84e3d Signed-off-by: David Brazdil (cherry picked from commit 78e0b7722c8d2961af0b3f2c8115df336cf7b492) Signed-off-by: Mostafa Saleh Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_asm.h | 1 + arch/arm64/include/asm/kvm_host.h | 2 + arch/arm64/kvm/hyp/include/nvhe/iommu.h | 17 ++ arch/arm64/kvm/hyp/include/nvhe/mem_protect.h | 1 + arch/arm64/kvm/hyp/nvhe/hyp-main.c | 14 ++ arch/arm64/kvm/hyp/nvhe/iommu.c | 179 ++++++++++++++++++ arch/arm64/kvm/hyp/nvhe/mem_protect.c | 15 +- arch/arm64/kvm/iommu.c | 30 +++ 8 files changed, 255 insertions(+), 4 deletions(-) diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 3a2055044968..64aa2f65ce4f 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -82,6 +82,7 @@ enum __kvm_host_smccc_func { __KVM_HOST_SMCCC_FUNC___pkvm_vcpu_put, __KVM_HOST_SMCCC_FUNC___pkvm_vcpu_sync_state, __KVM_HOST_SMCCC_FUNC___pkvm_iommu_driver_init, + __KVM_HOST_SMCCC_FUNC___pkvm_iommu_register, }; #define DECLARE_KVM_VHE_SYM(sym) extern char sym[] diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index ea7e80bdb8a9..e42b318baa78 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -395,6 +395,8 @@ enum pkvm_iommu_driver_id { }; int pkvm_iommu_driver_init(enum pkvm_iommu_driver_id drv_id, void *data, size_t size); +int pkvm_iommu_register(struct device *dev, enum pkvm_iommu_driver_id drv_id, + phys_addr_t pa, size_t size); struct vcpu_reset_state { unsigned long pc; diff --git a/arch/arm64/kvm/hyp/include/nvhe/iommu.h b/arch/arm64/kvm/hyp/include/nvhe/iommu.h index 54f5ce5976d7..51935ab93efa 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/iommu.h +++ b/arch/arm64/kvm/hyp/include/nvhe/iommu.h @@ -16,15 +16,32 @@ struct pkvm_iommu_ops { * Driver initialization lock held during callback. */ int (*init)(void *data, size_t size); + + /* + * Driver-specific validation of device registration inputs. + * This should be stateless. No locks are held at entry. + */ + int (*validate)(phys_addr_t base, size_t size); + + /* Amount of memory allocated per-device for use by the driver. 
*/ + size_t data_size; }; struct pkvm_iommu { struct list_head list; + unsigned long id; + const struct pkvm_iommu_ops *ops; phys_addr_t pa; + void *va; size_t size; + char data[]; }; int __pkvm_iommu_driver_init(enum pkvm_iommu_driver_id id, void *data, size_t size); +int __pkvm_iommu_register(unsigned long dev_id, + enum pkvm_iommu_driver_id drv_id, + phys_addr_t dev_pa, size_t dev_size, + void *kern_mem_va, size_t mem_size); int pkvm_iommu_host_stage2_adjust_range(phys_addr_t addr, phys_addr_t *start, phys_addr_t *end); diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h index abc366c4e8ad..a42090ab120c 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h @@ -84,6 +84,7 @@ int __pkvm_host_unshare_ffa(u64 pfn, u64 nr_pages); bool addr_is_memory(phys_addr_t phys); int host_stage2_idmap_locked(phys_addr_t addr, u64 size, enum kvm_pgtable_prot prot); int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, u8 owner_id); +int host_stage2_unmap_dev_locked(phys_addr_t start, u64 size); int kvm_host_prepare_stage2(void *pgt_pool_base); int kvm_guest_prepare_stage2(struct pkvm_hyp_vm *vm, void *pgd); void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt); diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index e758e81c3eb1..c9acd573b77a 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -1123,6 +1123,19 @@ static void handle___pkvm_iommu_driver_init(struct kvm_cpu_context *host_ctxt) cpu_reg(host_ctxt, 1) = __pkvm_iommu_driver_init(id, data, size); } +static void handle___pkvm_iommu_register(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(unsigned long, dev_id, host_ctxt, 1); + DECLARE_REG(enum pkvm_iommu_driver_id, drv_id, host_ctxt, 2); + DECLARE_REG(phys_addr_t, dev_pa, host_ctxt, 3); + DECLARE_REG(size_t, dev_size, host_ctxt, 4); + DECLARE_REG(void *, mem, host_ctxt, 5); + DECLARE_REG(size_t, mem_size, host_ctxt, 6); + + cpu_reg(host_ctxt, 1) = __pkvm_iommu_register(dev_id, drv_id, dev_pa, + dev_size, mem, mem_size); +} + typedef void (*hcall_t)(struct kvm_cpu_context *); #define HANDLE_FUNC(x) [__KVM_HOST_SMCCC_FUNC_##x] = (hcall_t)handle_##x @@ -1158,6 +1171,7 @@ static const hcall_t host_hcall[] = { HANDLE_FUNC(__pkvm_vcpu_put), HANDLE_FUNC(__pkvm_vcpu_sync_state), HANDLE_FUNC(__pkvm_iommu_driver_init), + HANDLE_FUNC(__pkvm_iommu_register), }; static void handle_host_hcall(struct kvm_cpu_context *host_ctxt) diff --git a/arch/arm64/kvm/hyp/nvhe/iommu.c b/arch/arm64/kvm/hyp/nvhe/iommu.c index f2b510fd2727..f2132c381766 100644 --- a/arch/arm64/kvm/hyp/nvhe/iommu.c +++ b/arch/arm64/kvm/hyp/nvhe/iommu.c @@ -9,8 +9,10 @@ #include #include #include +#include #include +#include enum { IOMMU_DRIVER_NOT_READY = 0, @@ -33,6 +35,16 @@ static void assert_host_component_locked(void) hyp_assert_lock_held(&host_mmu.lock); } +static void host_lock_component(void) +{ + hyp_spin_lock(&host_mmu.lock); +} + +static void host_unlock_component(void) +{ + hyp_spin_unlock(&host_mmu.lock); +} + /* * Find IOMMU driver by its ID. The input ID is treated as unstrusted * and is properly validated. @@ -69,6 +81,91 @@ static inline void driver_release_init(struct pkvm_iommu_driver *drv, : IOMMU_DRIVER_NOT_READY); } +static inline bool is_driver_ready(struct pkvm_iommu_driver *drv) +{ + return atomic_read(&drv->state) == IOMMU_DRIVER_READY; +} + +/* Global memory pool for allocating IOMMU list entry structs. 
*/ +static inline struct pkvm_iommu * +alloc_iommu_list_entry(struct pkvm_iommu_driver *drv, void *mem, size_t mem_size) +{ + static void *pool; + static size_t remaining; + static DEFINE_HYP_SPINLOCK(lock); + size_t size = sizeof(struct pkvm_iommu) + drv->ops->data_size; + void *ptr; + + size = ALIGN(size, sizeof(unsigned long)); + + hyp_spin_lock(&lock); + + /* + * If new memory is being provided, replace the existing pool with it. + * Any remaining memory in the pool is discarded. + */ + if (mem && mem_size) { + pool = mem; + remaining = mem_size; + } + + if (size <= remaining) { + ptr = pool; + pool += size; + remaining -= size; + } else { + ptr = NULL; + } + + hyp_spin_unlock(&lock); + return ptr; +} + +static bool is_overlap(phys_addr_t r1_start, size_t r1_size, + phys_addr_t r2_start, size_t r2_size) +{ + phys_addr_t r1_end = r1_start + r1_size; + phys_addr_t r2_end = r2_start + r2_size; + + return (r1_start < r2_end) && (r2_start < r1_end); +} + +static bool is_mmio_range(phys_addr_t base, size_t size) +{ + struct memblock_region *reg; + phys_addr_t limit = BIT(host_mmu.pgt.ia_bits); + size_t i; + + /* Check against limits of host IPA space. */ + if ((base >= limit) || !size || (size > limit - base)) + return false; + + for (i = 0; i < hyp_memblock_nr; i++) { + reg = &hyp_memory[i]; + if (is_overlap(base, size, reg->base, reg->size)) + return false; + } + return true; +} + +static bool validate_against_existing_iommus(struct pkvm_iommu *dev) +{ + struct pkvm_iommu *other; + + assert_host_component_locked(); + + list_for_each_entry(other, &iommu_list, list) { + /* Device ID must be unique. */ + if (dev->id == other->id) + return false; + + /* MMIO regions must not overlap. */ + if (is_overlap(dev->pa, dev->size, other->pa, other->size)) + return false; + } + return true; +} + /* * Initialize EL2 IOMMU driver. * @@ -110,6 +207,88 @@ out: return ret; } +int __pkvm_iommu_register(unsigned long dev_id, + enum pkvm_iommu_driver_id drv_id, + phys_addr_t dev_pa, size_t dev_size, + void *kern_mem_va, size_t mem_size) +{ + struct pkvm_iommu *dev = NULL; + struct pkvm_iommu_driver *drv; + void *dev_va, *mem_va = NULL; + int ret = 0; + + drv = get_driver(drv_id); + if (!drv || !is_driver_ready(drv)) + return -ENOENT; + + if (!PAGE_ALIGNED(dev_pa) || !PAGE_ALIGNED(dev_size)) + return -EINVAL; + + if (!is_mmio_range(dev_pa, dev_size)) + return -EINVAL; + + if (drv->ops->validate) { + ret = drv->ops->validate(dev_pa, dev_size); + if (ret) + return ret; + } + + /* + * Accept memory donation if the host is providing new memory. + * Note: We do not return the memory even if there is an error later. + */ + if (kern_mem_va && mem_size) { + mem_va = kern_hyp_va(kern_mem_va); + + if (!PAGE_ALIGNED(mem_va) || !PAGE_ALIGNED(mem_size)) + return -EINVAL; + + ret = __pkvm_host_donate_hyp(hyp_virt_to_pfn(mem_va), + mem_size >> PAGE_SHIFT); + if (ret) + return ret; + } + + /* Allocate memory for the new device entry. */ + dev = alloc_iommu_list_entry(drv, mem_va, mem_size); + if (!dev) + return -ENOMEM; + + /* Create EL2 mapping for the device. */ + ret = __pkvm_create_private_mapping(dev_pa, dev_size, + PAGE_HYP_DEVICE,(unsigned long *)&dev_va); + if (ret) + return ret; + + /* Populate the new device entry. */ + *dev = (struct pkvm_iommu){ + .id = dev_id, + .ops = drv->ops, + .pa = dev_pa, + .va = dev_va, + .size = dev_size, + }; + + /* Take the host_mmu lock to block host stage-2 changes. 
*/ + host_lock_component(); + if (!validate_against_existing_iommus(dev)) { + ret = -EBUSY; + goto out; + } + + /* Unmap the device's MMIO range from host stage-2. */ + ret = host_stage2_unmap_dev_locked(dev_pa, dev_size); + if (ret) + goto out; + + /* Register device and prevent host from mapping the MMIO range. */ + list_add_tail(&dev->list, &iommu_list); + +out: + host_unlock_component(); + return ret; +} + /* * Check host memory access against IOMMUs' MMIO regions. * Returns -EPERM if the address is within the bounds of a registered device. diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 3711f626fb4a..392f862d76ff 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -414,6 +414,13 @@ int __pkvm_prot_finalize(void) return 0; } +int host_stage2_unmap_dev_locked(phys_addr_t start, u64 size) +{ + hyp_assert_lock_held(&host_mmu.lock); + + return kvm_pgtable_stage2_unmap(&host_mmu.pgt, start, size); +} + static int host_stage2_unmap_dev_all(void) { struct kvm_pgtable *pgt = &host_mmu.pgt; @@ -424,11 +431,11 @@ static int host_stage2_unmap_dev_all(void) /* Unmap all non-memory regions to recycle the pages */ for (i = 0; i < hyp_memblock_nr; i++, addr = reg->base + reg->size) { reg = &hyp_memory[i]; - ret = kvm_pgtable_stage2_unmap(pgt, addr, reg->base - addr); + ret = host_stage2_unmap_dev_locked(addr, reg->base - addr); if (ret) return ret; } - return kvm_pgtable_stage2_unmap(pgt, addr, BIT(pgt->ia_bits) - addr); + return host_stage2_unmap_dev_locked(addr, BIT(pgt->ia_bits) - addr); } struct kvm_mem_range { @@ -625,6 +632,7 @@ static int host_stage2_idmap(u64 addr) prot = is_memory ? PKVM_HOST_MEM_PROT : PKVM_HOST_MMIO_PROT; + host_lock_component(); /* * Adjust against IOMMU devices first. host_stage2_adjust_range() should * be called last for proper alignment. @@ -633,10 +641,9 @@ static int host_stage2_idmap(u64 addr) ret = pkvm_iommu_host_stage2_adjust_range(addr, &range.start, &range.end); if (ret) - return ret; + goto unlock; } - host_lock_component(); ret = host_stage2_adjust_range(addr, &range); if (ret) goto unlock; diff --git a/arch/arm64/kvm/iommu.c b/arch/arm64/kvm/iommu.c index edd7316bd61b..a845be0c8fa9 100644 --- a/arch/arm64/kvm/iommu.c +++ b/arch/arm64/kvm/iommu.c @@ -6,7 +6,37 @@ #include +static unsigned long dev_to_id(struct device *dev) +{ + /* Use the struct device pointer as a unique identifier. */ + return (unsigned long)dev; +} + int pkvm_iommu_driver_init(enum pkvm_iommu_driver_id id, void *data, size_t size) { return kvm_call_hyp_nvhe(__pkvm_iommu_driver_init, id, data, size); } + +int pkvm_iommu_register(struct device *dev, enum pkvm_iommu_driver_id drv_id, + phys_addr_t pa, size_t size) +{ + void *mem; + int ret; + + /* + * Hypcall to register the device. It will return -ENOMEM if it needs + * more memory. In that case allocate a page and retry. + * We assume that hyp never allocates more than a page per hypcall. 
+ */ + ret = kvm_call_hyp_nvhe(__pkvm_iommu_register, dev_to_id(dev), + drv_id, pa, size, NULL, 0); + if (ret == -ENOMEM) { + mem = (void *)__get_free_page(GFP_KERNEL); + if (!mem) + return -ENOMEM; + + ret = kvm_call_hyp_nvhe(__pkvm_iommu_register, dev_to_id(dev), + drv_id, pa, size, mem, PAGE_SIZE); + } + return ret; +} From 3ab30989b45b19722d79f9367660df14a361cc73 Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Mon, 14 Feb 2022 20:58:22 +0000 Subject: [PATCH 187/457] ANDROID: KVM: arm64: iommu: Suspend/resume callbacks Add suspend/resume callbacks for IOMMU devices. The EL1 kernel driver is expected to call these when the IOMMU device is powered on but is about to be used or about to stop being used. pkvm_iommu_suspend/resume are exported for use by kernel modules. Bug: 190463801 Change-Id: Ia4ab37fe96879d451ce82f4278b3ff33a0b9685b Signed-off-by: David Brazdil (cherry picked from commit ca47ae70c77f327be95d8fb6930caa143bb200c9) Signed-off-by: Mostafa Saleh Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_asm.h | 1 + arch/arm64/include/asm/kvm_host.h | 7 +++++ arch/arm64/kvm/hyp/include/nvhe/iommu.h | 9 ++++++ arch/arm64/kvm/hyp/nvhe/hyp-main.c | 9 ++++++ arch/arm64/kvm/hyp/nvhe/iommu.c | 37 +++++++++++++++++++++++++ arch/arm64/kvm/iommu.c | 14 ++++++++++ 6 files changed, 77 insertions(+) diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 64aa2f65ce4f..fd9950148ca1 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -83,6 +83,7 @@ enum __kvm_host_smccc_func { __KVM_HOST_SMCCC_FUNC___pkvm_vcpu_sync_state, __KVM_HOST_SMCCC_FUNC___pkvm_iommu_driver_init, __KVM_HOST_SMCCC_FUNC___pkvm_iommu_register, + __KVM_HOST_SMCCC_FUNC___pkvm_iommu_pm_notify, }; #define DECLARE_KVM_VHE_SYM(sym) extern char sym[] diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index e42b318baa78..e8d92153f623 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -394,9 +394,16 @@ enum pkvm_iommu_driver_id { PKVM_IOMMU_NR_DRIVERS, }; +enum pkvm_iommu_pm_event { + PKVM_IOMMU_PM_SUSPEND, + PKVM_IOMMU_PM_RESUME, +}; + int pkvm_iommu_driver_init(enum pkvm_iommu_driver_id drv_id, void *data, size_t size); int pkvm_iommu_register(struct device *dev, enum pkvm_iommu_driver_id drv_id, phys_addr_t pa, size_t size); +int pkvm_iommu_suspend(struct device *dev); +int pkvm_iommu_resume(struct device *dev); struct vcpu_reset_state { unsigned long pc; diff --git a/arch/arm64/kvm/hyp/include/nvhe/iommu.h b/arch/arm64/kvm/hyp/include/nvhe/iommu.h index 51935ab93efa..452d5de0f215 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/iommu.h +++ b/arch/arm64/kvm/hyp/include/nvhe/iommu.h @@ -7,6 +7,8 @@ #include +struct pkvm_iommu; + struct pkvm_iommu_ops { /* * Global driver initialization called before devices are registered. @@ -23,6 +25,10 @@ struct pkvm_iommu_ops { */ int (*validate)(phys_addr_t base, size_t size); + /* Power management callbacks. Called with host lock held. */ + int (*suspend)(struct pkvm_iommu *dev); + int (*resume)(struct pkvm_iommu *dev); + /* Amount of memory allocated per-device for use by the driver. 
*/ size_t data_size; }; @@ -34,6 +40,7 @@ struct pkvm_iommu { phys_addr_t pa; void *va; size_t size; + bool powered; char data[]; }; @@ -42,6 +49,8 @@ int __pkvm_iommu_register(unsigned long dev_id, enum pkvm_iommu_driver_id drv_id, phys_addr_t dev_pa, size_t dev_size, void *kern_mem_va, size_t mem_size); +int __pkvm_iommu_pm_notify(unsigned long dev_id, + enum pkvm_iommu_pm_event event); int pkvm_iommu_host_stage2_adjust_range(phys_addr_t addr, phys_addr_t *start, phys_addr_t *end); diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index c9acd573b77a..d597ba27ee38 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -1136,6 +1136,14 @@ static void handle___pkvm_iommu_register(struct kvm_cpu_context *host_ctxt) dev_size, mem, mem_size); } +static void handle___pkvm_iommu_pm_notify(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(unsigned long, dev_id, host_ctxt, 1); + DECLARE_REG(enum pkvm_iommu_pm_event, event, host_ctxt, 2); + + cpu_reg(host_ctxt, 1) = __pkvm_iommu_pm_notify(dev_id, event); +} + typedef void (*hcall_t)(struct kvm_cpu_context *); #define HANDLE_FUNC(x) [__KVM_HOST_SMCCC_FUNC_##x] = (hcall_t)handle_##x @@ -1172,6 +1180,7 @@ static const hcall_t host_hcall[] = { HANDLE_FUNC(__pkvm_vcpu_sync_state), HANDLE_FUNC(__pkvm_iommu_driver_init), HANDLE_FUNC(__pkvm_iommu_register), + HANDLE_FUNC(__pkvm_iommu_pm_notify), }; static void handle_host_hcall(struct kvm_cpu_context *host_ctxt) diff --git a/arch/arm64/kvm/hyp/nvhe/iommu.c b/arch/arm64/kvm/hyp/nvhe/iommu.c index f2132c381766..3ad158369413 100644 --- a/arch/arm64/kvm/hyp/nvhe/iommu.c +++ b/arch/arm64/kvm/hyp/nvhe/iommu.c @@ -166,6 +166,19 @@ static bool validate_against_existing_iommus(struct pkvm_iommu *dev) return true; } +static struct pkvm_iommu *find_iommu_by_id(unsigned long id) +{ + struct pkvm_iommu *dev; + + assert_host_component_locked(); + + list_for_each_entry(dev, &iommu_list, list) { + if (dev->id == id) + return dev; + } + return NULL; +} + /* * Initialize EL2 IOMMU driver. * @@ -289,6 +302,30 @@ out: return ret; } +int __pkvm_iommu_pm_notify(unsigned long dev_id, enum pkvm_iommu_pm_event event) +{ + struct pkvm_iommu *dev; + int ret; + + host_lock_component(); + dev = find_iommu_by_id(dev_id); + if (dev) { + if (event == PKVM_IOMMU_PM_SUSPEND) { + ret = dev->ops->suspend ? dev->ops->suspend(dev) : 0; + dev->powered = !!ret; + } else if (event == PKVM_IOMMU_PM_RESUME) { + ret = dev->ops->resume ? dev->ops->resume(dev) : 0; + dev->powered = !ret; + } else { + ret = -EINVAL; + } + } else { + ret = -ENODEV; + } + host_unlock_component(); + return ret; +} + /* * Check host memory access against IOMMUs' MMIO regions. * Returns -EPERM if the address is within the bounds of a registered device. 
diff --git a/arch/arm64/kvm/iommu.c b/arch/arm64/kvm/iommu.c index a845be0c8fa9..e13d36ec57e7 100644 --- a/arch/arm64/kvm/iommu.c +++ b/arch/arm64/kvm/iommu.c @@ -40,3 +40,17 @@ int pkvm_iommu_register(struct device *dev, enum pkvm_iommu_driver_id drv_id, } return ret; } + +int pkvm_iommu_suspend(struct device *dev) +{ + return kvm_call_hyp_nvhe(__pkvm_iommu_pm_notify, dev_to_id(dev), + PKVM_IOMMU_PM_SUSPEND); +} +EXPORT_SYMBOL_GPL(pkvm_iommu_suspend); + +int pkvm_iommu_resume(struct device *dev) +{ + return kvm_call_hyp_nvhe(__pkvm_iommu_pm_notify, dev_to_id(dev), + PKVM_IOMMU_PM_RESUME); +} +EXPORT_SYMBOL_GPL(pkvm_iommu_resume); From d148d7a3e35cc6f7a0742ef7949d883af9bd409a Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Mon, 14 Feb 2022 22:09:46 +0000 Subject: [PATCH 188/457] ANDROID: KVM: arm64: iommu: DABT handler callback Replace the 'host_mmio_dabt_handler' hook in kvm_iommu_ops with an equivalent callback in the new pkvm_iommu_ops. The generic portion of the code finds the IOMMU device at the faulted address and invokes the callback on it. Bug: 190463801 Change-Id: I0ca008c3e1ae0ec12a259fa4ddac1aee65aaac5c Signed-off-by: David Brazdil (cherry picked from commit 5df451f35e66b890b086b417c60d42118266823f) Signed-off-by: Mostafa Saleh Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/include/nvhe/iommu.h | 10 ++++++ arch/arm64/kvm/hyp/nvhe/iommu.c | 22 ++++++++++++ arch/arm64/kvm/hyp/nvhe/mem_protect.c | 45 +++++++------------------ 3 files changed, 45 insertions(+), 32 deletions(-) diff --git a/arch/arm64/kvm/hyp/include/nvhe/iommu.h b/arch/arm64/kvm/hyp/include/nvhe/iommu.h index 452d5de0f215..c95f1c08371f 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/iommu.h +++ b/arch/arm64/kvm/hyp/include/nvhe/iommu.h @@ -29,6 +29,14 @@ struct pkvm_iommu_ops { int (*suspend)(struct pkvm_iommu *dev); int (*resume)(struct pkvm_iommu *dev); + /* + * Host data abort handler callback. Called with host lock held. + * Returns true if the data abort has been handled. + */ + bool (*host_dabt_handler)(struct pkvm_iommu *dev, + struct kvm_cpu_context *host_ctxt, + u32 esr, size_t off); + /* Amount of memory allocated per-device for use by the driver. 
*/ size_t data_size; }; @@ -53,6 +61,8 @@ int __pkvm_iommu_pm_notify(unsigned long dev_id, enum pkvm_iommu_pm_event event); int pkvm_iommu_host_stage2_adjust_range(phys_addr_t addr, phys_addr_t *start, phys_addr_t *end); +bool pkvm_iommu_host_dabt_handler(struct kvm_cpu_context *host_ctxt, u32 esr, + phys_addr_t fault_pa); struct kvm_iommu_ops { int (*init)(void); diff --git a/arch/arm64/kvm/hyp/nvhe/iommu.c b/arch/arm64/kvm/hyp/nvhe/iommu.c index 3ad158369413..2d3d05d7ac89 100644 --- a/arch/arm64/kvm/hyp/nvhe/iommu.c +++ b/arch/arm64/kvm/hyp/nvhe/iommu.c @@ -11,6 +11,7 @@ #include #include +#include #include #include @@ -358,3 +359,24 @@ int pkvm_iommu_host_stage2_adjust_range(phys_addr_t addr, phys_addr_t *start, *end = new_end; return 0; } + +bool pkvm_iommu_host_dabt_handler(struct kvm_cpu_context *host_ctxt, u32 esr, + phys_addr_t pa) +{ + struct pkvm_iommu *dev; + + assert_host_component_locked(); + + list_for_each_entry(dev, &iommu_list, list) { + if (pa < dev->pa || pa >= dev->pa + dev->size) + continue; + + if (!dev->powered || !dev->ops->host_dabt_handler || + !dev->ops->host_dabt_handler(dev, host_ctxt, esr, pa - dev->pa)) + return false; + + kvm_skip_host_instr(); + return true; + } + return false; +} diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 392f862d76ff..b1db2b8fb0fc 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -630,9 +630,9 @@ static int host_stage2_idmap(u64 addr) enum kvm_pgtable_prot prot; int ret; - prot = is_memory ? PKVM_HOST_MEM_PROT : PKVM_HOST_MMIO_PROT; + hyp_assert_lock_held(&host_mmu.lock); - host_lock_component(); + prot = is_memory ? PKVM_HOST_MEM_PROT : PKVM_HOST_MMIO_PROT; /* * Adjust against IOMMU devices first. host_stage2_adjust_range() should * be called last for proper alignment. @@ -641,18 +641,14 @@ static int host_stage2_idmap(u64 addr) ret = pkvm_iommu_host_stage2_adjust_range(addr, &range.start, &range.end); if (ret) - goto unlock; + return ret; } ret = host_stage2_adjust_range(addr, &range); if (ret) - goto unlock; + return ret; - ret = host_stage2_idmap_locked(range.start, range.end - range.start, prot); -unlock: - host_unlock_component(); - - return ret; + return host_stage2_idmap_locked(range.start, range.end - range.start, prot); } static void host_inject_abort(struct kvm_cpu_context *host_ctxt) @@ -699,26 +695,6 @@ static void host_inject_abort(struct kvm_cpu_context *host_ctxt) write_sysreg_el2(spsr, SYS_SPSR); } -static int host_mmio_dabt_handler(struct kvm_cpu_context *host_ctxt, u32 esr, - phys_addr_t addr) -{ - bool wnr = esr & ESR_ELx_WNR; - unsigned int len = BIT((esr & ESR_ELx_SAS) >> ESR_ELx_SAS_SHIFT); - int rd = (esr & ESR_ELx_SRT_MASK) >> ESR_ELx_SRT_SHIFT; - bool handled = false; - - if (kvm_iommu_ops.host_mmio_dabt_handler) { - handled = kvm_iommu_ops.host_mmio_dabt_handler(host_ctxt, addr, - len, wnr, rd); - } - - if (!handled) - return -EPERM; - - kvm_skip_host_instr(); - return 0; -} - static bool is_dabt(u64 esr) { return ESR_ELx_EC(esr) == ESR_ELx_EC_DABT_LOW; @@ -736,14 +712,19 @@ void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt) addr = (fault.hpfar_el2 & HPFAR_MASK) << 8; addr |= fault.far_el2 & FAR_MASK; - /* See if any subsystem can handle this abort. */ - if (is_dabt(esr) && !addr_is_memory(addr)) - ret = host_mmio_dabt_handler(host_ctxt, esr, addr); + host_lock_component(); + + /* Check if an IOMMU device can handle the DABT. 
*/ + if (is_dabt(esr) && !addr_is_memory(addr) && + pkvm_iommu_host_dabt_handler(host_ctxt, esr, addr)) + ret = 0; /* If not handled, attempt to map the page. */ if (ret == -EPERM) ret = host_stage2_idmap(addr); + host_unlock_component(); + if (ret == -EPERM) host_inject_abort(host_ctxt); else From d9bab2b5edec899c56ee82b7f5b5080df89c53dc Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Mon, 14 Feb 2022 22:30:37 +0000 Subject: [PATCH 189/457] ANDROID: KVM: arm64: iommu: Host stage-2 idmap callbacks Add IOMMU callbacks for host stage-2 idmap changes. 'host_stage2_idmap_prepare' is called first and is expected to apply the changes on the driver level, eg. update driver-specific page table information. If successful, the generic code invokes 'host_stage2_idmap_apply' on each currently powered IOMMU device associated with the driver to apply the changes. Bug: 190463801 Change-Id: Ifcc063896f6e8967c332dbaa5b7e7f2ba138abbf Signed-off-by: David Brazdil (cherry picked from commit 4395ddff4bc5ba21935e53861b86e7063447bf85) Signed-off-by: Mostafa Saleh Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/include/nvhe/iommu.h | 16 +++++++++++++ arch/arm64/kvm/hyp/nvhe/iommu.c | 21 +++++++++++++++++ arch/arm64/kvm/hyp/nvhe/mem_protect.c | 30 +++++++++++++++++++------ 3 files changed, 60 insertions(+), 7 deletions(-) diff --git a/arch/arm64/kvm/hyp/include/nvhe/iommu.h b/arch/arm64/kvm/hyp/include/nvhe/iommu.h index c95f1c08371f..53e65deb2c79 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/iommu.h +++ b/arch/arm64/kvm/hyp/include/nvhe/iommu.h @@ -25,6 +25,20 @@ struct pkvm_iommu_ops { */ int (*validate)(phys_addr_t base, size_t size); + /* + * Callback to apply a host stage-2 mapping change at driver level. + * Called before 'host_stage2_idmap_apply' with host lock held. + */ + void (*host_stage2_idmap_prepare)(phys_addr_t start, phys_addr_t end, + enum kvm_pgtable_prot prot); + + /* + * Callback to apply a host stage-2 mapping change at device level. + * Called after 'host_stage2_idmap_prepare' with host lock held. + */ + void (*host_stage2_idmap_apply)(struct pkvm_iommu *dev, + phys_addr_t start, phys_addr_t end); + /* Power management callbacks. Called with host lock held. 
*/ int (*suspend)(struct pkvm_iommu *dev); int (*resume)(struct pkvm_iommu *dev); @@ -63,6 +77,8 @@ int pkvm_iommu_host_stage2_adjust_range(phys_addr_t addr, phys_addr_t *start, phys_addr_t *end); bool pkvm_iommu_host_dabt_handler(struct kvm_cpu_context *host_ctxt, u32 esr, phys_addr_t fault_pa); +void pkvm_iommu_host_stage2_idmap(phys_addr_t start, phys_addr_t end, + enum kvm_pgtable_prot prot); struct kvm_iommu_ops { int (*init)(void); diff --git a/arch/arm64/kvm/hyp/nvhe/iommu.c b/arch/arm64/kvm/hyp/nvhe/iommu.c index 2d3d05d7ac89..c7f228a0a733 100644 --- a/arch/arm64/kvm/hyp/nvhe/iommu.c +++ b/arch/arm64/kvm/hyp/nvhe/iommu.c @@ -380,3 +380,24 @@ bool pkvm_iommu_host_dabt_handler(struct kvm_cpu_context *host_ctxt, u32 esr, } return false; } + +void pkvm_iommu_host_stage2_idmap(phys_addr_t start, phys_addr_t end, + enum kvm_pgtable_prot prot) +{ + struct pkvm_iommu_driver *drv; + struct pkvm_iommu *dev; + size_t i; + + assert_host_component_locked(); + + for (i = 0; i < ARRAY_SIZE(iommu_drivers); i++) { + drv = get_driver(i); + if (drv && is_driver_ready(drv) && drv->ops->host_stage2_idmap_prepare) + drv->ops->host_stage2_idmap_prepare(start, end, prot); + } + + list_for_each_entry(dev, &iommu_list, list) { + if (dev->powered && dev->ops->host_stage2_idmap_apply) + dev->ops->host_stage2_idmap_apply(dev, start, end); + } +} diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index b1db2b8fb0fc..0c3db72312ff 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -416,9 +416,16 @@ int __pkvm_prot_finalize(void) int host_stage2_unmap_dev_locked(phys_addr_t start, u64 size) { + int ret; + hyp_assert_lock_held(&host_mmu.lock); - return kvm_pgtable_stage2_unmap(&host_mmu.pgt, start, size); + ret = kvm_pgtable_stage2_unmap(&host_mmu.pgt, start, size); + if (ret) + return ret; + + pkvm_iommu_host_stage2_idmap(start, start + size, 0); + return 0; } static int host_stage2_unmap_dev_all(void) @@ -508,8 +515,15 @@ static bool range_is_memory(u64 start, u64 end) static inline int __host_stage2_idmap(u64 start, u64 end, enum kvm_pgtable_prot prot) { - return kvm_pgtable_stage2_map(&host_mmu.pgt, start, end - start, start, - prot, &host_s2_pool); + int ret; + + ret = kvm_pgtable_stage2_map(&host_mmu.pgt, start, end - start, start, + prot, &host_s2_pool); + if (ret) + return ret; + + pkvm_iommu_host_stage2_idmap(start, end, prot); + return 0; } /* @@ -585,6 +599,7 @@ static kvm_pte_t kvm_init_invalid_leaf_owner(u8 owner_id) int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, u8 owner_id) { kvm_pte_t annotation; + enum kvm_pgtable_prot prot; int ret; if (owner_id > KVM_MAX_OWNER_ID) @@ -594,11 +609,12 @@ int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, u8 owner_id) ret = host_stage2_try(kvm_pgtable_stage2_annotate, &host_mmu.pgt, addr, size, &host_s2_pool, annotation); + if (ret) + return ret; - if (!ret && kvm_iommu_ops.host_stage2_set_owner) - kvm_iommu_ops.host_stage2_set_owner(addr, size, owner_id); - - return ret; + prot = owner_id == PKVM_ID_HOST ? 
PKVM_HOST_MEM_PROT : 0; + pkvm_iommu_host_stage2_idmap(addr, addr + size, prot); + return 0; } static bool host_stage2_force_pte_cb(u64 addr, u64 end, enum kvm_pgtable_prot prot) From f78c4e5b3f48df72dd856985f811584ce3199a66 Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Mon, 14 Feb 2022 23:34:26 +0000 Subject: [PATCH 190/457] ANDROID: KVM: arm64: iommu: Snapshot host stage-2 at driver init IOMMU drivers may need to keep their own state of the host stage-2 mappings, eg. because they cannot share the PTs with the CPU. To this end, walk the host stage-2 at driver init time and pass the current state of host stage-2 mappings to the driver. The driver initialization lock is released together with host_kvm lock. That was the driver starts receiving stage-2 updates immediately after the snapshot is taken. Bug: 190463801 Change-Id: I5a5b0e064c5c88e210e28e343314318a2a1bffda Signed-off-by: David Brazdil (cherry picked from commit 1ec4b346d0b65a868c52e5ff11b30918ed29185c) Signed-off-by: Mostafa Saleh Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/nvhe/iommu.c | 47 ++++++++++++++++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hyp/nvhe/iommu.c b/arch/arm64/kvm/hyp/nvhe/iommu.c index c7f228a0a733..061630ebaaaf 100644 --- a/arch/arm64/kvm/hyp/nvhe/iommu.c +++ b/arch/arm64/kvm/hyp/nvhe/iommu.c @@ -149,6 +149,40 @@ static bool is_mmio_range(phys_addr_t base, size_t size) return true; } +static int __snapshot_host_stage2(u64 start, u64 end, u32 level, + kvm_pte_t *ptep, + enum kvm_pgtable_walk_flags flags, + void * const arg) +{ + struct pkvm_iommu_driver * const drv = arg; + enum kvm_pgtable_prot prot; + kvm_pte_t pte = *ptep; + + /* + * Valid stage-2 entries are created lazily, invalid ones eagerly. + * Note: In the future we may need to check if [start,end) is MMIO. + */ + prot = (!pte || kvm_pte_valid(pte)) ? PKVM_HOST_MEM_PROT : 0; + + drv->ops->host_stage2_idmap_prepare(start, end, prot); + return 0; +} + +static int snapshot_host_stage2(struct pkvm_iommu_driver * const drv) +{ + struct kvm_pgtable_walker walker = { + .cb = __snapshot_host_stage2, + .arg = drv, + .flags = KVM_PGTABLE_WALK_LEAF, + }; + struct kvm_pgtable *pgt = &host_mmu.pgt; + + if (!drv->ops->host_stage2_idmap_prepare) + return 0; + + return kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker); +} + static bool validate_against_existing_iommus(struct pkvm_iommu *dev) { struct pkvm_iommu *other; @@ -216,8 +250,19 @@ int __pkvm_iommu_driver_init(enum pkvm_iommu_driver_id id, void *data, size_t si goto out; } + /* + * Walk host stage-2 and pass current mappings to the driver. Start + * accepting host stage-2 updates as soon as the host lock is released. + */ + host_lock_component(); + ret = snapshot_host_stage2(drv); + if (!ret) + driver_release_init(drv, /*success=*/true); + host_unlock_component(); + out: - driver_release_init(drv, /*success=*/!ret); + if (ret) + driver_release_init(drv, /*success=*/false); return ret; } From 0f56c07d1cac6f8b1a5a41ac48fc34f4cf5da1dd Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Fri, 7 Jan 2022 11:33:38 +0000 Subject: [PATCH 191/457] ANDROID: KVM: arm64: s2mpu: Move SFR init to EL2 S2MPU code previously assumed that all S2MPUs were powered on at boot and would check the version register and precompute the value of S2MPU.CONTEXT_CFG_VALID_VID. With EL1 S2MPU code being removed, and to allow for S2MPUs not powered at boot, move the code to EL2 and run it on resume. 
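The hunk below precomputes CONTEXT_CFG_VALID_VID by handing out hardware contexts to VIDs in ascending order. A small stand-alone sketch of that packing with concrete numbers follows; it assumes each context occupies one 4-bit field with the VID in bits [2:0] and the valid flag in bit 3 (the CONTEXT_CFG_VALID_VID_CTX_VALID() definition places the valid flag at bit 4*ctx+3, while the exact VID field placement is an assumption made only for this example):

#include <stdint.h>
#include <stdio.h>

/* Assumed register layout: one nibble per context, VID in [2:0], valid flag in bit 3. */
#define EX_CTX_VID(ctx, vid)	((uint32_t)(vid) << (4 * (ctx)))
#define EX_CTX_VALID(ctx)	(1u << (4 * (ctx) + 3))

int main(void)
{
	uint32_t vid_bmap = 0xa5;	/* SSMTs may emit VIDs 0, 2, 5 and 7 */
	unsigned int num_ctx = 4;	/* pretend NUM_CONTEXT reported 4 contexts */
	unsigned int ctx = 0;
	uint32_t reg = 0;

	/* Allocate contexts to VIDs in ascending order until contexts run out. */
	while (vid_bmap && ctx < num_ctx) {
		unsigned int vid = __builtin_ctz(vid_bmap);

		vid_bmap &= ~(1u << vid);
		reg |= EX_CTX_VID(ctx, vid) | EX_CTX_VALID(ctx);
		ctx++;
	}

	printf("CONTEXT_CFG_VALID_VID = 0x%08x\n", reg);	/* 0x0000fda8 with these inputs */
	return 0;
}

VIDs left over once every context has been allocated simply receive no context, as in the EL1 implementation being replaced.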
Bug: 190463801 Change-Id: Icaccfd125a6be7bab336ca3ffee52f2a33cf43b2 Signed-off-by: David Brazdil (cherry picked from commit c823243791205b8219b60294bc7615e198c12382) Signed-off-by: Mostafa Saleh Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c | 117 +++++++++++++++++++++----- arch/arm64/kvm/iommu/s2mpu.c | 71 ---------------- 2 files changed, 95 insertions(+), 93 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c b/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c index f10326b7bf8b..c138c48eb14f 100644 --- a/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c +++ b/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c @@ -30,6 +30,10 @@ #define for_each_powered_s2mpu(i) \ for_each_s2mpu((i)) if (is_powered_on((i))) +#define CTX_CFG_ENTRY(ctxid, nr_ctx, vid) \ + (CONTEXT_CFG_VALID_VID_CTX_VID(ctxid, vid) \ + | (((ctxid) < (nr_ctx)) ? CONTEXT_CFG_VALID_VID_CTX_VALID(ctxid) : 0)) + size_t __ro_after_init kvm_hyp_nr_s2mpus; struct s2mpu __ro_after_init *kvm_hyp_s2mpus; struct mpt kvm_hyp_host_mpt; @@ -67,18 +71,75 @@ static bool is_in_power_domain(struct s2mpu *dev, u64 power_domain_id) } } -/* - * Write CONTEXT_CFG_VALID_VID configuration before touching L1ENTRY* registers. - * Writes to those registers are ignored unless there is a context ID allocated - * to the corresponding VID (v9 only). - */ -static void __set_context_ids(struct s2mpu *dev) +static u32 __context_cfg_valid_vid(struct s2mpu *dev, u32 vid_bmap) { - if (!is_version(dev, S2MPU_VERSION_9)) - return; + u8 ctx_vid[NR_CTX_IDS] = { 0 }; + unsigned int vid, ctx = 0; + unsigned int num_ctx; + u32 res; - writel_relaxed(dev->context_cfg_valid_vid, - dev->va + REG_NS_CONTEXT_CFG_VALID_VID); + /* Only initialize once. */ + if (dev->context_cfg_valid_vid) + return dev->context_cfg_valid_vid; + + num_ctx = readl_relaxed(dev->va + REG_NS_NUM_CONTEXT) & NUM_CONTEXT_MASK; + while (vid_bmap) { + /* Break if we cannot allocate more. */ + if (ctx >= num_ctx) + break; + + vid = __ffs(vid_bmap); + vid_bmap &= ~BIT(vid); + ctx_vid[ctx++] = vid; + } + + /* The following loop was unrolled so bitmasks are constant. */ + BUILD_BUG_ON(NR_CTX_IDS != 8); + res = CTX_CFG_ENTRY(0, ctx, ctx_vid[0]) + | CTX_CFG_ENTRY(1, ctx, ctx_vid[1]) + | CTX_CFG_ENTRY(2, ctx, ctx_vid[2]) + | CTX_CFG_ENTRY(3, ctx, ctx_vid[3]) + | CTX_CFG_ENTRY(4, ctx, ctx_vid[4]) + | CTX_CFG_ENTRY(5, ctx, ctx_vid[5]) + | CTX_CFG_ENTRY(6, ctx, ctx_vid[6]) + | CTX_CFG_ENTRY(7, ctx, ctx_vid[7]); + + dev->context_cfg_valid_vid = res; + return res; +} + +static int __initialize_v9(struct s2mpu *dev) +{ + u32 ssmt_valid_vid_bmap, ctx_cfg; + + /* Assume all VIDs may be generated by the connected SSMTs for now. */ + ssmt_valid_vid_bmap = ALL_VIDS_BITMAP; + ctx_cfg = __context_cfg_valid_vid(dev, ssmt_valid_vid_bmap); + if (!ctx_cfg) + return -EINVAL; + + /* + * Write CONTEXT_CFG_VALID_VID configuration before touching L1ENTRY* + * registers. Writes to those registers are ignored unless there is + * a context ID allocated to the corresponding VID (v9 only). 
+ */ + writel_relaxed(ctx_cfg, dev->va + REG_NS_CONTEXT_CFG_VALID_VID); + return 0; +} + +static int __initialize(struct s2mpu *dev) +{ + if (!dev->version) + dev->version = readl_relaxed(dev->va + REG_NS_VERSION); + + switch (dev->version & VERSION_CHECK_MASK) { + case S2MPU_VERSION_8: + return 0; + case S2MPU_VERSION_9: + return __initialize_v9(dev); + default: + return -EINVAL; + } } static void __set_control_regs(struct s2mpu *dev) @@ -179,12 +240,14 @@ static void __set_l1entry_l2table_addr(struct s2mpu *dev, unsigned int gb, * Initialize S2MPU device and set all GB regions to 1G granularity with * given protection bits. */ -static void initialize_with_prot(struct s2mpu *dev, enum mpt_prot prot) +static int initialize_with_prot(struct s2mpu *dev, enum mpt_prot prot) { unsigned int gb, vid; + int ret; - /* Must write CONTEXT_CFG_VALID_VID before setting L1ENTRY registers. */ - __set_context_ids(dev); + ret = __initialize(dev); + if (ret) + return ret; for_each_gb_and_vid(gb, vid) __set_l1entry_attr_with_prot(dev, gb, vid, prot); @@ -192,19 +255,22 @@ static void initialize_with_prot(struct s2mpu *dev, enum mpt_prot prot) /* Set control registers, enable the S2MPU. */ __set_control_regs(dev); + return 0; } /* * Initialize S2MPU device, set L2 table addresses and configure L1TABLE_ATTR * registers according to the given MPT struct. */ -static void initialize_with_mpt(struct s2mpu *dev, struct mpt *mpt) +static int initialize_with_mpt(struct s2mpu *dev, struct mpt *mpt) { unsigned int gb, vid; struct fmpt *fmpt; + int ret; - /* Must write CONTEXT_CFG_VALID_VID before setting L1ENTRY registers. */ - __set_context_ids(dev); + ret = __initialize(dev); + if (ret) + return ret; for_each_gb_and_vid(gb, vid) { fmpt = &mpt->fmpt[gb]; @@ -215,6 +281,7 @@ static void initialize_with_mpt(struct s2mpu *dev, struct mpt *mpt) /* Set control registers, enable the S2MPU. */ __set_control_regs(dev); + return 0; } /* @@ -326,6 +393,7 @@ static bool s2mpu_host_smc_handler(struct kvm_cpu_context *host_ctxt) struct arm_smccc_res res; struct s2mpu *dev; + int ret; if (fn != SMC_CMD_PREPARE_PD_ONOFF) return false; /* SMC not handled */ @@ -350,23 +418,25 @@ static bool s2mpu_host_smc_handler(struct kvm_cpu_context *host_ctxt) hyp_spin_lock(&s2mpu_lock); arm_smccc_1_1_smc(fn, mode, domain_id, group, &res); - if (res.a0 == SMCCC_RET_SUCCESS) { + ret = res.a0; + + if (ret == SMCCC_RET_SUCCESS) { for_each_s2mpu(dev) { if (!is_in_power_domain(dev, domain_id)) continue; if (mode == SMC_MODE_POWER_UP) { dev->power_state = S2MPU_POWER_ON; - initialize_with_mpt(dev, &kvm_hyp_host_mpt); + ret = initialize_with_mpt(dev, &kvm_hyp_host_mpt); } else { - initialize_with_prot(dev, MPT_PROT_NONE); + ret = initialize_with_prot(dev, MPT_PROT_NONE); dev->power_state = S2MPU_POWER_OFF; } } } hyp_spin_unlock(&s2mpu_lock); - cpu_reg(host_ctxt, 0) = res.a0; + cpu_reg(host_ctxt, 0) = ret; return true; /* SMC handled */ } @@ -469,8 +539,11 @@ static int s2mpu_init(void) * Program all S2MPUs powered on at boot. Note that they may not be in * the blocking reset state as the bootloader may have programmed them. 
*/ - for_each_powered_s2mpu(dev) - initialize_with_mpt(dev, &kvm_hyp_host_mpt); + for_each_powered_s2mpu(dev) { + ret = initialize_with_mpt(dev, &kvm_hyp_host_mpt); + if (ret) + return ret; + } return 0; } diff --git a/arch/arm64/kvm/iommu/s2mpu.c b/arch/arm64/kvm/iommu/s2mpu.c index 00111f237de3..79c33815267b 100644 --- a/arch/arm64/kvm/iommu/s2mpu.c +++ b/arch/arm64/kvm/iommu/s2mpu.c @@ -13,10 +13,6 @@ #include #include -#define CTX_CFG_ENTRY(ctxid, nr_ctx, vid) \ - (CONTEXT_CFG_VALID_VID_CTX_VID(ctxid, vid) \ - | (((ctxid) < (nr_ctx)) ? CONTEXT_CFG_VALID_VID_CTX_VALID(ctxid) : 0)) - struct s2mpu_irq_info { struct device *dev; void __iomem *va; @@ -81,59 +77,6 @@ static irqreturn_t s2mpu_irq_handler(int irq, void *data) return ret; } -static u32 gen_ctx_cfg_valid_vid(struct platform_device *pdev, - unsigned int num_ctx, u32 vid_bmap) -{ - u8 ctx_vid[NR_CTX_IDS] = { 0 }; - unsigned int vid, ctx = 0; - - /* Check NUM_CONTEXT value is within bounds. This should not happen. */ - if (WARN_ON(num_ctx > NR_CTX_IDS)) - num_ctx = NR_CTX_IDS; - - while (vid_bmap) { - /* Break if we cannot allocate more. */ - if (ctx >= num_ctx) { - dev_warn(&pdev->dev, - "could not allocate all context IDs, DMA may be blocked (VID bitmap: 0x%x)", - vid_bmap); - break; - } - - vid = __ffs(vid_bmap); - vid_bmap &= ~BIT(vid); - ctx_vid[ctx++] = vid; - } - - /* The following loop was unrolled so bitmasks are constant. */ - BUILD_BUG_ON(NR_CTX_IDS != 8); - return CTX_CFG_ENTRY(0, ctx, ctx_vid[0]) - | CTX_CFG_ENTRY(1, ctx, ctx_vid[1]) - | CTX_CFG_ENTRY(2, ctx, ctx_vid[2]) - | CTX_CFG_ENTRY(3, ctx, ctx_vid[3]) - | CTX_CFG_ENTRY(4, ctx, ctx_vid[4]) - | CTX_CFG_ENTRY(5, ctx, ctx_vid[5]) - | CTX_CFG_ENTRY(6, ctx, ctx_vid[6]) - | CTX_CFG_ENTRY(7, ctx, ctx_vid[7]); -} - -static int s2mpu_probe_v9(struct platform_device *pdev, void __iomem *kaddr, - struct s2mpu *info) -{ - unsigned int num_ctx; - u32 ssmt_valid_vid_bmap; - - ssmt_valid_vid_bmap = ALL_VIDS_BITMAP; - num_ctx = readl_relaxed(kaddr + REG_NS_NUM_CONTEXT) & NUM_CONTEXT_MASK; - info->context_cfg_valid_vid = gen_ctx_cfg_valid_vid(pdev, num_ctx, ssmt_valid_vid_bmap); - if (!info->context_cfg_valid_vid) { - dev_err(&pdev->dev, "failed to allocate context IDs"); - return -EINVAL; - } - - return 0; -} - /* * Parse interrupt information from DT and if found, register IRQ handler. * This is considered optional and will not fail even if the initialization is @@ -232,20 +175,6 @@ static int s2mpu_probe(struct platform_device *pdev) */ s2mpu_probe_irq(pdev, kaddr); - info->version = readl_relaxed(kaddr + REG_NS_VERSION); - switch (info->version & VERSION_CHECK_MASK) { - case S2MPU_VERSION_8: - break; - case S2MPU_VERSION_9: - ret = s2mpu_probe_v9(pdev, kaddr, info); - if (ret) - return ret; - break; - default: - dev_err(&pdev->dev, "unexpected version 0x%08x", info->version); - return -EINVAL; - } - /* Insert successfully parsed devices to a list later copied to hyp. */ list_add_tail(&entry->list, &s2mpu_list); kvm_hyp_nr_s2mpus++; From 03952689493e6a8189f45948486cf7ae39813fc1 Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Mon, 14 Feb 2022 11:04:28 +0000 Subject: [PATCH 192/457] ANDROID: KVM: arm64: s2mpu: Remove all EL1 code EL2 S2MPU driver relied on EL1 code which parsed the DT and populated EL2 driver data before deprivileging of the host. The driver is now moving to later initialization from kernel modules, which will take over the role of parsing the DT and power management. Remove the unused code. 
Bug: 190463801 Change-Id: Ie6e21ba02b84494e5066c7681f85612a09f93f6d Signed-off-by: David Brazdil (cherry picked from commit 167332a9fa748f7d9fd08643d0df5bbe5078f26d) Signed-off-by: Mostafa Saleh Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_host.h | 7 - arch/arm64/include/asm/kvm_s2mpu.h | 9 - arch/arm64/kvm/arm.c | 8 +- arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c | 25 +-- arch/arm64/kvm/hyp/nvhe/setup.c | 6 - arch/arm64/kvm/iommu/s2mpu.c | 306 -------------------------- 6 files changed, 13 insertions(+), 348 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index e8d92153f623..769033501880 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -381,15 +381,8 @@ extern u64 kvm_nvhe_sym(hyp_cpu_logical_map)[NR_CPUS]; enum kvm_iommu_driver { KVM_IOMMU_DRIVER_NONE, - KVM_IOMMU_DRIVER_S2MPU, }; -#ifdef CONFIG_KVM_S2MPU -int kvm_s2mpu_init(void); -#else -static inline int kvm_s2mpu_init(void) { return -ENODEV; } -#endif - enum pkvm_iommu_driver_id { PKVM_IOMMU_NR_DRIVERS, }; diff --git a/arch/arm64/include/asm/kvm_s2mpu.h b/arch/arm64/include/asm/kvm_s2mpu.h index 3d541accf982..0e262b0c032d 100644 --- a/arch/arm64/include/asm/kvm_s2mpu.h +++ b/arch/arm64/include/asm/kvm_s2mpu.h @@ -189,15 +189,6 @@ enum mpt_update_flags { MPT_UPDATE_L2 = BIT(1), }; -extern size_t kvm_nvhe_sym(kvm_hyp_nr_s2mpus); -#define kvm_hyp_nr_s2mpus kvm_nvhe_sym(kvm_hyp_nr_s2mpus) - -extern struct s2mpu *kvm_nvhe_sym(kvm_hyp_s2mpus); -#define kvm_hyp_s2mpus kvm_nvhe_sym(kvm_hyp_s2mpus) - -extern struct mpt kvm_nvhe_sym(kvm_hyp_host_mpt); -#define kvm_hyp_host_mpt kvm_nvhe_sym(kvm_hyp_host_mpt) - /* Set protection bits of SMPT in a given range without using memset. */ static inline void __set_smpt_range_slow(u32 *smpt, size_t start_gb_byte, size_t end_gb_byte, enum mpt_prot prot) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index efeeb56e823c..977a58b4c8a7 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1921,13 +1921,7 @@ static bool init_psci_relay(void) static int init_stage2_iommu(void) { - int ret; - - ret = kvm_s2mpu_init(); - if (!ret) - return KVM_IOMMU_DRIVER_S2MPU; - - return (ret == -ENODEV) ? KVM_IOMMU_DRIVER_NONE : ret; + return KVM_IOMMU_DRIVER_NONE; } static int init_subsystems(void) diff --git a/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c b/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c index c138c48eb14f..8228811d1c74 100644 --- a/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c +++ b/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c @@ -25,7 +25,7 @@ #define PA_MAX ((phys_addr_t)SZ_1G * NR_GIGABYTES) #define for_each_s2mpu(i) \ - for ((i) = &kvm_hyp_s2mpus[0]; (i) != &kvm_hyp_s2mpus[kvm_hyp_nr_s2mpus]; (i)++) + for ((i) = &s2mpus[0]; (i) != &s2mpus[nr_s2mpus]; (i)++) #define for_each_powered_s2mpu(i) \ for_each_s2mpu((i)) if (is_powered_on((i))) @@ -34,11 +34,10 @@ (CONTEXT_CFG_VALID_VID_CTX_VID(ctxid, vid) \ | (((ctxid) < (nr_ctx)) ? 
CONTEXT_CFG_VALID_VID_CTX_VALID(ctxid) : 0)) -size_t __ro_after_init kvm_hyp_nr_s2mpus; -struct s2mpu __ro_after_init *kvm_hyp_s2mpus; -struct mpt kvm_hyp_host_mpt; - -static hyp_spinlock_t s2mpu_lock; +static size_t __ro_after_init nr_s2mpus; +static struct s2mpu __ro_after_init *s2mpus; +static struct mpt host_mpt; +static hyp_spinlock_t s2mpu_lock; static bool is_version(struct s2mpu *dev, u32 version) { @@ -349,7 +348,7 @@ static void s2mpu_host_stage2_set_owner(phys_addr_t addr, size_t size, return; hyp_spin_lock(&s2mpu_lock); - set_mpt_range_locked(&kvm_hyp_host_mpt, + set_mpt_range_locked(&host_mpt, ALIGN_DOWN(addr, SMPT_GRAN), ALIGN(addr + size, SMPT_GRAN) - 1, prot); @@ -427,7 +426,7 @@ static bool s2mpu_host_smc_handler(struct kvm_cpu_context *host_ctxt) if (mode == SMC_MODE_POWER_UP) { dev->power_state = S2MPU_POWER_ON; - ret = initialize_with_mpt(dev, &kvm_hyp_host_mpt); + ret = initialize_with_mpt(dev, &host_mpt); } else { ret = initialize_with_prot(dev, MPT_PROT_NONE); dev->power_state = S2MPU_POWER_OFF; @@ -512,16 +511,16 @@ static int s2mpu_init(void) int ret; /* Map data structures in EL2 stage-1. */ - ret = pkvm_create_mappings(kvm_hyp_s2mpus, - kvm_hyp_s2mpus + kvm_hyp_nr_s2mpus, + ret = pkvm_create_mappings(s2mpus, + s2mpus + nr_s2mpus, PAGE_HYP); if (ret) return ret; for_each_gb(gb) { ret = pkvm_create_mappings( - kvm_hyp_host_mpt.fmpt[gb].smpt, - kvm_hyp_host_mpt.fmpt[gb].smpt + SMPT_NUM_WORDS, + host_mpt.fmpt[gb].smpt, + host_mpt.fmpt[gb].smpt + SMPT_NUM_WORDS, PAGE_HYP); if (ret) return ret; @@ -540,7 +539,7 @@ static int s2mpu_init(void) * the blocking reset state as the bootloader may have programmed them. */ for_each_powered_s2mpu(dev) { - ret = initialize_with_mpt(dev, &kvm_hyp_host_mpt); + ret = initialize_with_mpt(dev, &host_mpt); if (ret) return ret; } diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c index cae960251048..db6f747aad84 100644 --- a/arch/arm64/kvm/hyp/nvhe/setup.c +++ b/arch/arm64/kvm/hyp/nvhe/setup.c @@ -307,12 +307,6 @@ static int select_iommu_ops(enum kvm_iommu_driver driver) switch (driver) { case KVM_IOMMU_DRIVER_NONE: return 0; - case KVM_IOMMU_DRIVER_S2MPU: - if (IS_ENABLED(CONFIG_KVM_S2MPU)) { - kvm_iommu_ops = kvm_s2mpu_ops; - return 0; - } - break; } return -EINVAL; diff --git a/arch/arm64/kvm/iommu/s2mpu.c b/arch/arm64/kvm/iommu/s2mpu.c index 79c33815267b..496e75f76e56 100644 --- a/arch/arm64/kvm/iommu/s2mpu.c +++ b/arch/arm64/kvm/iommu/s2mpu.c @@ -4,311 +4,5 @@ * Author: David Brazdil */ -#include #include -#include -#include -#include - -#include #include - -struct s2mpu_irq_info { - struct device *dev; - void __iomem *va; -}; - -struct s2mpu_list_entry { - struct list_head list; - struct device *dev; - struct s2mpu info; -}; - -static LIST_HEAD(s2mpu_list); - -static irqreturn_t s2mpu_irq_handler(int irq, void *data) -{ - struct s2mpu_irq_info *info = data; - unsigned int vid; - u32 vid_bmap, fault_info; - phys_addr_t fault_pa; - const char *fault_type; - irqreturn_t ret = IRQ_NONE; - - while ((vid_bmap = readl_relaxed(info->va + REG_NS_FAULT_STATUS))) { - WARN_ON_ONCE(vid_bmap & (~ALL_VIDS_BITMAP)); - vid = __ffs(vid_bmap); - - fault_pa = hi_lo_readq_relaxed(info->va + REG_NS_FAULT_PA_HIGH_LOW(vid)); - fault_info = readl_relaxed(info->va + REG_NS_FAULT_INFO(vid)); - WARN_ON(FIELD_GET(FAULT_INFO_VID_MASK, fault_info) != vid); - - switch (FIELD_GET(FAULT_INFO_TYPE_MASK, fault_info)) { - case FAULT_INFO_TYPE_MPTW: - fault_type = "MPTW fault"; - break; - case FAULT_INFO_TYPE_AP: - fault_type = "access 
permission fault"; - break; - case FAULT_INFO_TYPE_CONTEXT: - fault_type = "context fault"; - break; - default: - fault_type = "unknown fault"; - break; - } - - dev_err(info->dev, "\n" - "============== S2MPU FAULT DETECTED ==============\n" - " PA=0x%pap, FAULT_INFO=0x%08x\n" - " DIRECTION: %s, TYPE: %s\n" - " VID=%u, REQ_LENGTH=%lu, REQ_AXI_ID=%lu\n" - "==================================================\n", - &fault_pa, fault_info, - (fault_info & FAULT_INFO_RW_BIT) ? "write" : "read", - fault_type, vid, - FIELD_GET(FAULT_INFO_LEN_MASK, fault_info), - FIELD_GET(FAULT_INFO_ID_MASK, fault_info)); - - writel_relaxed(BIT(vid), info->va + REG_NS_INTERRUPT_CLEAR); - ret = IRQ_HANDLED; - } - - return ret; -} - -/* - * Parse interrupt information from DT and if found, register IRQ handler. - * This is considered optional and will not fail even if the initialization is - * unsuccessful. In that case the IRQ will remain masked. - */ -static void s2mpu_probe_irq(struct platform_device *pdev, void __iomem *kaddr) -{ - struct s2mpu_irq_info *irq_info; - int ret, irq; - - irq = platform_get_irq_optional(pdev, 0); - - if (irq == -ENXIO) - return; /* No IRQ specified. */ - - if (irq < 0) { - /* IRQ specified but failed to parse. */ - dev_err(&pdev->dev, "failed to parse IRQ, IRQ not enabled"); - return; - } - - irq_info = devm_kmalloc(&pdev->dev, sizeof(*irq_info), GFP_KERNEL); - if (!irq_info) - return; - - *irq_info = (struct s2mpu_irq_info){ - .dev = &pdev->dev, - .va = kaddr, - }; - - ret = devm_request_irq(&pdev->dev, irq, s2mpu_irq_handler, 0, - dev_name(&pdev->dev), irq_info); - if (ret) { - dev_err(&pdev->dev, "failed to register IRQ, IRQ not enabled"); - return; - } -} - -static int s2mpu_probe(struct platform_device *pdev) -{ - struct resource *res; - void __iomem *kaddr; - size_t res_size; - struct s2mpu_list_entry *entry; - struct s2mpu *info; - int ret; - - entry = devm_kzalloc(&pdev->dev, sizeof(*entry), GFP_KERNEL); - if (!entry) - return -ENOMEM; - entry->dev = &pdev->dev; - info = &entry->info; - - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (!res) { - dev_err(&pdev->dev, "failed to parse 'reg'"); - return -EINVAL; - } - - /* devm_ioremap_resource internally calls devm_request_mem_region. */ - kaddr = devm_ioremap_resource(&pdev->dev, res); - if (IS_ERR(kaddr)) { - dev_err(&pdev->dev, "could not ioremap resource: %ld", - PTR_ERR(kaddr)); - return PTR_ERR(kaddr); - } - - if (!PAGE_ALIGNED(res->start)) { - dev_err(&pdev->dev, "base address must be page-aligned (0x%llx)", - res->start); - return -EINVAL; - } - info->pa = res->start; - - res_size = resource_size(res); - if (res_size != S2MPU_MMIO_SIZE) { - dev_err(&pdev->dev, - "unexpected device region size (expected=%u, actual=%lu)", - S2MPU_MMIO_SIZE, res_size); - return -EINVAL; - } - - ret = of_property_read_u32(pdev->dev.of_node, "power-domain-id", - &info->power_domain_id); - if (!ret) { - info->power_state = S2MPU_POWER_ON; - } else if (ret != -EINVAL) { - dev_err(&pdev->dev, "failed to parse power-domain-id: %d", ret); - return ret; - } - - /* - * Try to parse IRQ information. This is optional as it only affects - * runtime fault reporting, and therefore errors do not fail the whole - * driver initialization. - */ - s2mpu_probe_irq(pdev, kaddr); - - /* Insert successfully parsed devices to a list later copied to hyp. 
*/ - list_add_tail(&entry->list, &s2mpu_list); - kvm_hyp_nr_s2mpus++; - return 0; -} - -static const struct of_device_id of_table[] = { - { .compatible = "google,s2mpu" }, - {}, -}; - -static struct platform_driver of_driver = { - .driver = { - .name = "kvm,s2mpu", - .of_match_table = of_table, - }, -}; - -static struct s2mpu *alloc_s2mpu_array(void) -{ - unsigned int order; - - order = get_order(kvm_hyp_nr_s2mpus * sizeof(struct s2mpu)); - return (struct s2mpu *)__get_free_pages(GFP_KERNEL, order); -} - -static void free_s2mpu_array(struct s2mpu *array) -{ - unsigned int order; - - order = get_order(kvm_hyp_nr_s2mpus * sizeof(struct s2mpu)); - free_pages((unsigned long)array, order); -} - -static int cmp_s2mpu(const void *p1, const void *p2) -{ - const struct s2mpu *a = p1, *b = p2; - - return (a->pa > b->pa) - (a->pa < b->pa); -} - -static int create_s2mpu_array(struct s2mpu **array) -{ - struct s2mpu_list_entry *entry, *tmp; - size_t i; - - *array = alloc_s2mpu_array(); - if (!*array) - return -ENOMEM; - - /* Copy list to hyp array and destroy the list in the process. */ - i = 0; - list_for_each_entry_safe(entry, tmp, &s2mpu_list, list) { - (*array)[i++] = entry->info; - list_del(&entry->list); - devm_kfree(entry->dev, entry); - } - WARN_ON(i != kvm_hyp_nr_s2mpus); - - /* Searching through the list assumes that it is sorted. */ - sort(*array, kvm_hyp_nr_s2mpus, sizeof(struct s2mpu), cmp_s2mpu, NULL); - - kvm_hyp_s2mpus = kern_hyp_va(*array); - return 0; -} - -static int alloc_smpts(struct mpt *mpt) -{ - unsigned int gb; - - for_each_gb(gb) { - /* The returned buffer is aligned to its size, as required. */ - mpt->fmpt[gb].smpt = (u32 *)__get_free_pages(GFP_KERNEL, SMPT_ORDER); - if (!mpt->fmpt[gb].smpt) - return -ENOMEM; - } - - return 0; -} - -static void free_smpts(struct mpt *mpt) -{ - unsigned int gb; - - for_each_gb(gb) - free_pages((unsigned long)mpt->fmpt[gb].smpt, SMPT_ORDER); -} - -static int init_host_mpt(struct mpt *mpt) -{ - unsigned int gb; - int ret; - - ret = alloc_smpts(mpt); - if (ret) { - kvm_err("Cannot allocate memory for S2MPU host MPT"); - return ret; - } - - /* Initialize the host MPT. Use 1G mappings with RW permissions. */ - for_each_gb(gb) { - kvm_hyp_host_mpt.fmpt[gb] = (struct fmpt){ - .gran_1g = true, - .prot = MPT_PROT_RW, - .smpt = kern_hyp_va(mpt->fmpt[gb].smpt), - }; - } - return 0; -} - -int kvm_s2mpu_init(void) -{ - struct s2mpu *s2mpus = NULL; - struct mpt mpt = {}; - int ret; - - ret = platform_driver_probe(&of_driver, s2mpu_probe); - if (ret) - goto out; - - ret = create_s2mpu_array(&s2mpus); - if (ret) - goto out; - - ret = init_host_mpt(&mpt); - if (ret) - goto out; - - kvm_info("S2MPU driver initialized\n"); - -out: - if (ret) { - free_s2mpu_array(s2mpus); - free_smpts(&mpt); - } - return ret; -} From 390064d9e77c6d9dbf0892864e0dbcc18606dd14 Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Mon, 14 Feb 2022 21:35:53 +0000 Subject: [PATCH 193/457] ANDROID: KVM: arm64: s2mpu: Replace struct s2mpu with pkvm_iommu Replace all uses of 'struct s2mpu' with the generic 'struct pkvm_iommu'. 'struct s2mpu_drv_data' is created to accommodate driver-specific values associated with 'struct pkvm_iommu' and allocated by the generic code. These changes are safe because the S2MPU code is currently unused. The EL1 code that initialized it had been removed. 
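To illustrate the intended layout (a sketch only, not part of the diff: the dev_to_data() helper below is a made-up name, the driver itself open-codes the cast), driver-private state now hangs off the generic device as an opaque blob:

  struct s2mpu_drv_data {
          u32 version;
          u32 context_cfg_valid_vid;
  };

  /*
   * dev->data is allocated for the driver by the generic registration
   * code, so the driver only ever casts it to its private type.
   */
  static struct s2mpu_drv_data *dev_to_data(struct pkvm_iommu *dev)
  {
          return (struct s2mpu_drv_data *)dev->data;
  }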
Bug: 190463801 Change-Id: Ia634bac9b7dda333d87f7da0a02768df01d6bbd6 Signed-off-by: David Brazdil (cherry picked from commit a1ed8a1881f44cecaa1f73ffff36e4584ab37b07) Signed-off-by: Mostafa Saleh Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_s2mpu.h | 15 ---- arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c | 100 ++++++++++++-------------- 2 files changed, 47 insertions(+), 68 deletions(-) diff --git a/arch/arm64/include/asm/kvm_s2mpu.h b/arch/arm64/include/asm/kvm_s2mpu.h index 0e262b0c032d..3c61842a25ee 100644 --- a/arch/arm64/include/asm/kvm_s2mpu.h +++ b/arch/arm64/include/asm/kvm_s2mpu.h @@ -144,21 +144,6 @@ enum s2mpu_version { S2MPU_VERSION_9 = 0x20000000, }; -enum s2mpu_power_state { - S2MPU_POWER_ALWAYS_ON = 0, - S2MPU_POWER_ON, - S2MPU_POWER_OFF, -}; - -struct s2mpu { - phys_addr_t pa; - void __iomem *va; - u32 version; - enum s2mpu_power_state power_state; - u32 power_domain_id; - u32 context_cfg_valid_vid; -}; - enum mpt_prot { MPT_PROT_NONE = 0, MPT_PROT_R = BIT(0), diff --git a/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c b/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c index 8228811d1c74..8de8fe7528c4 100644 --- a/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c +++ b/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c @@ -34,52 +34,44 @@ (CONTEXT_CFG_VALID_VID_CTX_VID(ctxid, vid) \ | (((ctxid) < (nr_ctx)) ? CONTEXT_CFG_VALID_VID_CTX_VALID(ctxid) : 0)) -static size_t __ro_after_init nr_s2mpus; -static struct s2mpu __ro_after_init *s2mpus; -static struct mpt host_mpt; -static hyp_spinlock_t s2mpu_lock; +static size_t __ro_after_init nr_s2mpus; +static struct pkvm_iommu __ro_after_init *s2mpus; +static struct mpt host_mpt; +static hyp_spinlock_t s2mpu_lock; -static bool is_version(struct s2mpu *dev, u32 version) +struct s2mpu_drv_data { + u32 version; + u32 context_cfg_valid_vid; +}; + +static bool is_version(struct pkvm_iommu *dev, u32 version) { - return (dev->version & VERSION_CHECK_MASK) == version; + struct s2mpu_drv_data *data = (struct s2mpu_drv_data *)dev->data; + + return (data->version & VERSION_CHECK_MASK) == version; } -static bool is_powered_on(struct s2mpu *dev) +static bool is_powered_on(struct pkvm_iommu *dev) { - switch (dev->power_state) { - case S2MPU_POWER_ALWAYS_ON: - case S2MPU_POWER_ON: - return true; - case S2MPU_POWER_OFF: - return false; - default: - BUG(); - } + return dev->powered; } -static bool is_in_power_domain(struct s2mpu *dev, u64 power_domain_id) +static bool is_in_power_domain(struct pkvm_iommu *dev, u64 power_domain_id) { - switch (dev->power_state) { - case S2MPU_POWER_ALWAYS_ON: - return false; - case S2MPU_POWER_ON: - case S2MPU_POWER_OFF: - return dev->power_domain_id == power_domain_id; - default: - BUG(); - } + return false; } -static u32 __context_cfg_valid_vid(struct s2mpu *dev, u32 vid_bmap) +static u32 __context_cfg_valid_vid(struct pkvm_iommu *dev, u32 vid_bmap) { + struct s2mpu_drv_data *data = (struct s2mpu_drv_data *)dev->data; u8 ctx_vid[NR_CTX_IDS] = { 0 }; unsigned int vid, ctx = 0; unsigned int num_ctx; u32 res; /* Only initialize once. 
*/ - if (dev->context_cfg_valid_vid) - return dev->context_cfg_valid_vid; + if (data->context_cfg_valid_vid) + return data->context_cfg_valid_vid; num_ctx = readl_relaxed(dev->va + REG_NS_NUM_CONTEXT) & NUM_CONTEXT_MASK; while (vid_bmap) { @@ -103,11 +95,11 @@ static u32 __context_cfg_valid_vid(struct s2mpu *dev, u32 vid_bmap) | CTX_CFG_ENTRY(6, ctx, ctx_vid[6]) | CTX_CFG_ENTRY(7, ctx, ctx_vid[7]); - dev->context_cfg_valid_vid = res; + data->context_cfg_valid_vid = res; return res; } -static int __initialize_v9(struct s2mpu *dev) +static int __initialize_v9(struct pkvm_iommu *dev) { u32 ssmt_valid_vid_bmap, ctx_cfg; @@ -126,12 +118,14 @@ static int __initialize_v9(struct s2mpu *dev) return 0; } -static int __initialize(struct s2mpu *dev) +static int __initialize(struct pkvm_iommu *dev) { - if (!dev->version) - dev->version = readl_relaxed(dev->va + REG_NS_VERSION); + struct s2mpu_drv_data *data = (struct s2mpu_drv_data *)dev->data; - switch (dev->version & VERSION_CHECK_MASK) { + if (!data->version) + data->version = readl_relaxed(dev->va + REG_NS_VERSION); + + switch (data->version & VERSION_CHECK_MASK) { case S2MPU_VERSION_8: return 0; case S2MPU_VERSION_9: @@ -141,7 +135,7 @@ static int __initialize(struct s2mpu *dev) } } -static void __set_control_regs(struct s2mpu *dev) +static void __set_control_regs(struct pkvm_iommu *dev) { u32 ctrl0 = 0, irq_vids; @@ -181,7 +175,7 @@ static void __wait_while(void __iomem *addr, u32 mask) continue; } -static void __wait_for_invalidation_complete(struct s2mpu *dev) +static void __wait_for_invalidation_complete(struct pkvm_iommu *dev) { /* Must not access SFRs while S2MPU is busy invalidating (v9 only). */ if (is_version(dev, S2MPU_VERSION_9)) { @@ -190,13 +184,13 @@ static void __wait_for_invalidation_complete(struct s2mpu *dev) } } -static void __all_invalidation(struct s2mpu *dev) +static void __all_invalidation(struct pkvm_iommu *dev) { writel_relaxed(INVALIDATION_INVALIDATE, dev->va + REG_NS_ALL_INVALIDATION); __wait_for_invalidation_complete(dev); } -static void __range_invalidation(struct s2mpu *dev, phys_addr_t first_byte, +static void __range_invalidation(struct pkvm_iommu *dev, phys_addr_t first_byte, phys_addr_t last_byte) { u32 start_ppn = first_byte >> RANGE_INVALIDATION_PPN_SHIFT; @@ -208,14 +202,14 @@ static void __range_invalidation(struct s2mpu *dev, phys_addr_t first_byte, __wait_for_invalidation_complete(dev); } -static void __set_l1entry_attr_with_prot(struct s2mpu *dev, unsigned int gb, +static void __set_l1entry_attr_with_prot(struct pkvm_iommu *dev, unsigned int gb, unsigned int vid, enum mpt_prot prot) { writel_relaxed(L1ENTRY_ATTR_1G(prot), dev->va + REG_NS_L1ENTRY_ATTR(vid, gb)); } -static void __set_l1entry_attr_with_fmpt(struct s2mpu *dev, unsigned int gb, +static void __set_l1entry_attr_with_fmpt(struct pkvm_iommu *dev, unsigned int gb, unsigned int vid, struct fmpt *fmpt) { if (fmpt->gran_1g) { @@ -227,7 +221,7 @@ static void __set_l1entry_attr_with_fmpt(struct s2mpu *dev, unsigned int gb, } } -static void __set_l1entry_l2table_addr(struct s2mpu *dev, unsigned int gb, +static void __set_l1entry_l2table_addr(struct pkvm_iommu *dev, unsigned int gb, unsigned int vid, phys_addr_t addr) { /* Order against writes to the SMPT. */ @@ -239,7 +233,7 @@ static void __set_l1entry_l2table_addr(struct s2mpu *dev, unsigned int gb, * Initialize S2MPU device and set all GB regions to 1G granularity with * given protection bits. 
*/ -static int initialize_with_prot(struct s2mpu *dev, enum mpt_prot prot) +static int initialize_with_prot(struct pkvm_iommu *dev, enum mpt_prot prot) { unsigned int gb, vid; int ret; @@ -261,7 +255,7 @@ static int initialize_with_prot(struct s2mpu *dev, enum mpt_prot prot) * Initialize S2MPU device, set L2 table addresses and configure L1TABLE_ATTR * registers according to the given MPT struct. */ -static int initialize_with_mpt(struct s2mpu *dev, struct mpt *mpt) +static int initialize_with_mpt(struct pkvm_iommu *dev, struct mpt *mpt) { unsigned int gb, vid; struct fmpt *fmpt; @@ -294,7 +288,7 @@ static void set_mpt_range_locked(struct mpt *mpt, phys_addr_t first_byte, unsigned int last_gb = last_byte / SZ_1G; size_t start_gb_byte, end_gb_byte; unsigned int gb, vid; - struct s2mpu *dev; + struct pkvm_iommu *dev; struct fmpt *fmpt; enum mpt_update_flags flags; @@ -358,7 +352,7 @@ static void s2mpu_host_stage2_set_owner(phys_addr_t addr, size_t size, static int s2mpu_host_stage2_adjust_mmio_range(phys_addr_t addr, phys_addr_t *start, phys_addr_t *end) { - struct s2mpu *dev; + struct pkvm_iommu *dev; phys_addr_t dev_start, dev_end, int_start, int_end; /* Find the PA interval in the non-empty, sorted list of S2MPUs. */ @@ -391,7 +385,7 @@ static bool s2mpu_host_smc_handler(struct kvm_cpu_context *host_ctxt) DECLARE_REG(u64, group, host_ctxt, 3); struct arm_smccc_res res; - struct s2mpu *dev; + struct pkvm_iommu *dev; int ret; if (fn != SMC_CMD_PREPARE_PD_ONOFF) @@ -425,11 +419,11 @@ static bool s2mpu_host_smc_handler(struct kvm_cpu_context *host_ctxt) continue; if (mode == SMC_MODE_POWER_UP) { - dev->power_state = S2MPU_POWER_ON; + dev->powered = true; ret = initialize_with_mpt(dev, &host_mpt); } else { ret = initialize_with_prot(dev, MPT_PROT_NONE); - dev->power_state = S2MPU_POWER_OFF; + dev->powered = false; } } } @@ -439,9 +433,9 @@ static bool s2mpu_host_smc_handler(struct kvm_cpu_context *host_ctxt) return true; /* SMC handled */ } -static struct s2mpu *find_s2mpu_by_addr(phys_addr_t addr) +static struct pkvm_iommu *find_s2mpu_by_addr(phys_addr_t addr) { - struct s2mpu *dev; + struct pkvm_iommu *dev; for_each_s2mpu(dev) { if (dev->pa <= addr && addr < (dev->pa + S2MPU_MMIO_SIZE)) @@ -480,7 +474,7 @@ static bool s2mpu_host_mmio_dabt_handler(struct kvm_cpu_context *host_ctxt, phys_addr_t fault_pa, unsigned int len, bool is_write, int rd) { - struct s2mpu *dev; + struct pkvm_iommu *dev; size_t off; u32 mask; @@ -506,7 +500,7 @@ static bool s2mpu_host_mmio_dabt_handler(struct kvm_cpu_context *host_ctxt, static int s2mpu_init(void) { - struct s2mpu *dev; + struct pkvm_iommu *dev; unsigned int gb; int ret; From f274203091843a0ad421784365fe472bbdfcdd46 Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Mon, 14 Feb 2022 18:12:51 +0000 Subject: [PATCH 194/457] ANDROID: KVM: arm64: s2mpu: Remove host_stage2_adjust_mmio_range The function is superseded by the generic pkvm_iommu_host_stage2_adjust_range, remove it. 
Bug: 190463801 Change-Id: If42b40357f1d9a046ff20815215f927ac2a0d765 Signed-off-by: David Brazdil (cherry picked from commit 3da3f51b335de90b134efc3f8029ee6c769197f2) Signed-off-by: Mostafa Saleh Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c | 29 --------------------------- 1 file changed, 29 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c b/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c index 8de8fe7528c4..66090d00e810 100644 --- a/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c +++ b/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c @@ -349,34 +349,6 @@ static void s2mpu_host_stage2_set_owner(phys_addr_t addr, size_t size, hyp_spin_unlock(&s2mpu_lock); } -static int s2mpu_host_stage2_adjust_mmio_range(phys_addr_t addr, phys_addr_t *start, - phys_addr_t *end) -{ - struct pkvm_iommu *dev; - phys_addr_t dev_start, dev_end, int_start, int_end; - - /* Find the PA interval in the non-empty, sorted list of S2MPUs. */ - int_start = 0; - for_each_s2mpu(dev) { - dev_start = dev->pa; - dev_end = dev_start + S2MPU_MMIO_SIZE; - int_end = dev_start; - - if (dev_start <= addr && addr < dev_end) - return -EPERM; - - if (int_start <= addr && addr < int_end) - break; - - int_start = dev_end; - int_end = PA_MAX; - } - - *start = max(*start, int_start); - *end = min(*end, int_end); - return 0; -} - static bool s2mpu_host_smc_handler(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(u64, fn, host_ctxt, 0); @@ -545,5 +517,4 @@ const struct kvm_iommu_ops kvm_s2mpu_ops = (struct kvm_iommu_ops){ .host_smc_handler = s2mpu_host_smc_handler, .host_mmio_dabt_handler = s2mpu_host_mmio_dabt_handler, .host_stage2_set_owner = s2mpu_host_stage2_set_owner, - .host_stage2_adjust_mmio_range = s2mpu_host_stage2_adjust_mmio_range, }; From e254f79e082a508a6bd2410a6d2eb84cd3e7d15c Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Mon, 14 Feb 2022 15:01:28 +0000 Subject: [PATCH 195/457] ANDROID: KVM: arm64: s2mpu: Add driver initializer Create 'struct pkvm_iommu_ops' for the S2MPU and a new driver ID to the list of IOMMU drivers. Implement the 'init' callback, accepting donated memory from the host to back SMPTs. If the donation is successful, the SMPTs are assigned to 'host_mpt'. Export 'pkvm_iommu_s2mpu_register' for a kernel module to call to register an S2MPU device. First call to this function will also run the global S2MPU driver initializer. 
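For illustration, a module-side caller is expected to look roughly like the sketch below; the probe function and the driver around it are placeholders, only pkvm_iommu_s2mpu_register() is introduced by this patch:

  static int example_s2mpu_probe(struct platform_device *pdev)
  {
          struct resource *res;

          res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
          if (!res)
                  return -EINVAL;

          /*
           * Register the device with the hypervisor. The first call also
           * allocates the SMPT buffers and runs the EL2 driver initializer.
           */
          return pkvm_iommu_s2mpu_register(&pdev->dev, res->start);
  }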
Bug: 190463801 Change-Id: Icad06379e5cf695fba4f3a18a0773e302f3ead06 Signed-off-by: David Brazdil (cherry picked from commit 41707102f4fa94757ce9999e1555827512bd5192) Signed-off-by: Mostafa Saleh Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_host.h | 3 + arch/arm64/include/asm/kvm_s2mpu.h | 3 +- arch/arm64/kvm/hyp/include/nvhe/iommu.h | 2 + arch/arm64/kvm/hyp/nvhe/iommu.c | 2 + arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c | 82 ++++++++++++++++--------- arch/arm64/kvm/iommu/s2mpu.c | 78 +++++++++++++++++++++++ 6 files changed, 139 insertions(+), 31 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 769033501880..3cf7e69d4afe 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -384,6 +384,7 @@ enum kvm_iommu_driver { }; enum pkvm_iommu_driver_id { + PKVM_IOMMU_DRIVER_S2MPU, PKVM_IOMMU_NR_DRIVERS, }; @@ -398,6 +399,8 @@ int pkvm_iommu_register(struct device *dev, enum pkvm_iommu_driver_id drv_id, int pkvm_iommu_suspend(struct device *dev); int pkvm_iommu_resume(struct device *dev); +int pkvm_iommu_s2mpu_register(struct device *dev, phys_addr_t pa); + struct vcpu_reset_state { unsigned long pc; unsigned long r0; diff --git a/arch/arm64/include/asm/kvm_s2mpu.h b/arch/arm64/include/asm/kvm_s2mpu.h index 3c61842a25ee..0e322ede8421 100644 --- a/arch/arm64/include/asm/kvm_s2mpu.h +++ b/arch/arm64/include/asm/kvm_s2mpu.h @@ -123,8 +123,9 @@ static_assert(SMPT_GRAN <= PAGE_SIZE); #define SMPT_ELEMS_PER_WORD (SMPT_WORD_SIZE * SMPT_ELEMS_PER_BYTE) #define SMPT_WORD_BYTE_RANGE (SMPT_GRAN * SMPT_ELEMS_PER_WORD) #define SMPT_NUM_ELEMS (SZ_1G / SMPT_GRAN) -#define SMPT_NUM_WORDS (SMPT_SIZE / SMPT_WORD_SIZE) #define SMPT_SIZE (SMPT_NUM_ELEMS / SMPT_ELEMS_PER_BYTE) +#define SMPT_NUM_WORDS (SMPT_SIZE / SMPT_WORD_SIZE) +#define SMPT_NUM_PAGES (SMPT_SIZE / PAGE_SIZE) #define SMPT_ORDER get_order(SMPT_SIZE) /* diff --git a/arch/arm64/kvm/hyp/include/nvhe/iommu.h b/arch/arm64/kvm/hyp/include/nvhe/iommu.h index 53e65deb2c79..7f699e3d323e 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/iommu.h +++ b/arch/arm64/kvm/hyp/include/nvhe/iommu.h @@ -80,6 +80,8 @@ bool pkvm_iommu_host_dabt_handler(struct kvm_cpu_context *host_ctxt, u32 esr, void pkvm_iommu_host_stage2_idmap(phys_addr_t start, phys_addr_t end, enum kvm_pgtable_prot prot); +extern const struct pkvm_iommu_ops pkvm_s2mpu_ops; + struct kvm_iommu_ops { int (*init)(void); bool (*host_smc_handler)(struct kvm_cpu_context *host_ctxt); diff --git a/arch/arm64/kvm/hyp/nvhe/iommu.c b/arch/arm64/kvm/hyp/nvhe/iommu.c index 061630ebaaaf..c6224056dfa0 100644 --- a/arch/arm64/kvm/hyp/nvhe/iommu.c +++ b/arch/arm64/kvm/hyp/nvhe/iommu.c @@ -63,6 +63,8 @@ static inline struct pkvm_iommu_driver *get_driver(enum pkvm_iommu_driver_id id) static const struct pkvm_iommu_ops *get_driver_ops(enum pkvm_iommu_driver_id id) { switch (id) { + case PKVM_IOMMU_DRIVER_S2MPU: + return IS_ENABLED(CONFIG_KVM_S2MPU) ? 
&pkvm_s2mpu_ops : NULL; default: return NULL; } diff --git a/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c b/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c index 66090d00e810..310303ddb3c7 100644 --- a/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c +++ b/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c @@ -470,50 +470,72 @@ static bool s2mpu_host_mmio_dabt_handler(struct kvm_cpu_context *host_ctxt, return true; } -static int s2mpu_init(void) +static int s2mpu_init(void *data, size_t size) { - struct pkvm_iommu *dev; + struct mpt in_mpt; + u32 *smpt; + phys_addr_t pa; unsigned int gb; - int ret; + int ret = 0; - /* Map data structures in EL2 stage-1. */ - ret = pkvm_create_mappings(s2mpus, - s2mpus + nr_s2mpus, - PAGE_HYP); - if (ret) - return ret; + if (size != sizeof(in_mpt)) + return -EINVAL; + /* The host can concurrently modify 'data'. Copy it to avoid TOCTOU. */ + memcpy(&in_mpt, data, sizeof(in_mpt)); + + /* Take ownership of all SMPT buffers. This will also map them in. */ for_each_gb(gb) { - ret = pkvm_create_mappings( - host_mpt.fmpt[gb].smpt, - host_mpt.fmpt[gb].smpt + SMPT_NUM_WORDS, - PAGE_HYP); + smpt = kern_hyp_va(in_mpt.fmpt[gb].smpt); + pa = __hyp_pa(smpt); + + if (!IS_ALIGNED(pa, SMPT_SIZE)) { + ret = -EINVAL; + break; + } + + ret = __pkvm_host_donate_hyp(pa >> PAGE_SHIFT, SMPT_NUM_PAGES); if (ret) - return ret; + break; + + host_mpt.fmpt[gb] = (struct fmpt){ + .smpt = smpt, + .gran_1g = true, + .prot = MPT_PROT_NONE, + }; } - /* Map S2MPU MMIO regions in EL2 stage-1. */ - for_each_s2mpu(dev) { - ret = __pkvm_create_private_mapping( - dev->pa, S2MPU_MMIO_SIZE, PAGE_HYP_DEVICE,(unsigned long *)(&dev->va)); - if (ret) - return ret; + /* Try to return memory back if there was an error. */ + if (ret) { + for_each_gb(gb) { + smpt = host_mpt.fmpt[gb].smpt; + if (!smpt) + break; + + WARN_ON(__pkvm_hyp_donate_host(__hyp_pa(smpt) >> PAGE_SHIFT, + SMPT_NUM_PAGES)); + } + memset(&host_mpt, 0, sizeof(host_mpt)); } - /* - * Program all S2MPUs powered on at boot. Note that they may not be in - * the blocking reset state as the bootloader may have programmed them. - */ - for_each_powered_s2mpu(dev) { - ret = initialize_with_mpt(dev, &host_mpt); - if (ret) - return ret; - } + return ret; +} + +static int s2mpu_validate(phys_addr_t pa, size_t size) +{ + if (size != S2MPU_MMIO_SIZE) + return -EINVAL; + return 0; } -const struct kvm_iommu_ops kvm_s2mpu_ops = (struct kvm_iommu_ops){ +const struct pkvm_iommu_ops pkvm_s2mpu_ops = (struct pkvm_iommu_ops){ .init = s2mpu_init, + .validate = s2mpu_validate, + .data_size = sizeof(struct s2mpu_drv_data), +}; + +const struct kvm_iommu_ops kvm_s2mpu_ops = (struct kvm_iommu_ops){ .host_smc_handler = s2mpu_host_smc_handler, .host_mmio_dabt_handler = s2mpu_host_mmio_dabt_handler, .host_stage2_set_owner = s2mpu_host_stage2_set_owner, diff --git a/arch/arm64/kvm/iommu/s2mpu.c b/arch/arm64/kvm/iommu/s2mpu.c index 496e75f76e56..be2b1ad09480 100644 --- a/arch/arm64/kvm/iommu/s2mpu.c +++ b/arch/arm64/kvm/iommu/s2mpu.c @@ -6,3 +6,81 @@ #include #include + +static int init_s2mpu_driver(void) +{ + static DEFINE_MUTEX(lock); + static bool init_done; + + struct mpt *mpt; + unsigned int gb; + unsigned long addr; + u64 pfn; + int ret = 0; + + mutex_lock(&lock); + if (init_done) + goto out; + + /* Allocate a page for driver data. Must fit MPT descriptor. */ + BUILD_BUG_ON(sizeof(*mpt) > PAGE_SIZE); + addr = __get_free_page(GFP_KERNEL); + if (!addr) { + ret = -ENOMEM; + goto out; + } + + mpt = (struct mpt *)addr; + + /* Allocate SMPT buffers. 
*/ + for_each_gb(gb) { + addr = __get_free_pages(GFP_KERNEL, SMPT_ORDER); + if (!addr) { + ret = -ENOMEM; + goto out_free; + } + mpt->fmpt[gb].smpt = (u32 *)addr; + } + + /* Share MPT descriptor with hyp. */ + pfn = __pa(mpt) >> PAGE_SHIFT; + ret = kvm_call_hyp_nvhe(__pkvm_host_share_hyp, pfn); + if (ret) + goto out_free; + + /* Hypercall to initialize EL2 driver. */ + ret = pkvm_iommu_driver_init(PKVM_IOMMU_DRIVER_S2MPU, mpt, sizeof(*mpt)); + if (ret) + goto out_unshare; + + init_done = true; + +out_unshare: + WARN_ON(kvm_call_hyp_nvhe(__pkvm_host_unshare_hyp, pfn)); +out_free: + /* TODO - will driver return the memory? */ + if (ret) { + for_each_gb(gb) + free_pages((unsigned long)mpt->fmpt[gb].smpt, SMPT_ORDER); + free_page((unsigned long)mpt); + } +out: + mutex_unlock(&lock); + return ret; +} + +int pkvm_iommu_s2mpu_register(struct device *dev, phys_addr_t addr) +{ + int ret; + + if (!is_protected_kvm_enabled()) + return -ENODEV; + + ret = init_s2mpu_driver(); + if (ret) + return ret; + + return pkvm_iommu_register(dev, PKVM_IOMMU_DRIVER_S2MPU, + addr, S2MPU_MMIO_SIZE); +} +EXPORT_SYMBOL_GPL(pkvm_iommu_s2mpu_register); From 6581c17cf39116985ffc4bf5b9e763aaca08d78c Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Mon, 14 Feb 2022 21:40:32 +0000 Subject: [PATCH 196/457] ANDROID: KVM: arm64: s2mpu: Replace SMC handler with PM callbacks The host is now expected to notify EL2 about PM state changes of individual IOMMU devices. Remove the old code that intercepted SMCs and instead rely on callbacks from the core IOMMU code. Bug: 190463801 Change-Id: If0ca60f7fce7fd3f025599a472b02824354f0991 Signed-off-by: David Brazdil (cherry picked from commit 39d559ad46ebd1416d2d7c9041400f92c2c0f618) Signed-off-by: Mostafa Saleh Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c | 75 +++++++-------------------- 1 file changed, 19 insertions(+), 56 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c b/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c index 310303ddb3c7..939e21cd968f 100644 --- a/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c +++ b/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c @@ -56,11 +56,6 @@ static bool is_powered_on(struct pkvm_iommu *dev) return dev->powered; } -static bool is_in_power_domain(struct pkvm_iommu *dev, u64 power_domain_id) -{ - return false; -} - static u32 __context_cfg_valid_vid(struct pkvm_iommu *dev, u32 vid_bmap) { struct s2mpu_drv_data *data = (struct s2mpu_drv_data *)dev->data; @@ -349,60 +344,27 @@ static void s2mpu_host_stage2_set_owner(phys_addr_t addr, size_t size, hyp_spin_unlock(&s2mpu_lock); } -static bool s2mpu_host_smc_handler(struct kvm_cpu_context *host_ctxt) +static int s2mpu_resume(struct pkvm_iommu *dev) { - DECLARE_REG(u64, fn, host_ctxt, 0); - DECLARE_REG(u64, mode, host_ctxt, 1); - DECLARE_REG(u64, domain_id, host_ctxt, 2); - DECLARE_REG(u64, group, host_ctxt, 3); - - struct arm_smccc_res res; - struct pkvm_iommu *dev; - int ret; - - if (fn != SMC_CMD_PREPARE_PD_ONOFF) - return false; /* SMC not handled */ - /* - * Host is notifying EL3 that a power domain was turned on/off. - * Use this SMC as a trigger to program the S2MPUs. - * Note that the host may be malicious and issue this SMC arbitrarily. - * - * Power on: - * It is paramount that the S2MPU reset state is enabled and blocking - * all traffic. That way the host is forced to issue a power-on SMC to - * unblock the S2MPUs. - * - * Power down: - * A power-down SMC is a hint for hyp to stop updating the S2MPU, lest - * writes to powered-down MMIO registers produce SErrors in the host. 
- * However, hyp must perform one last update - putting the S2MPUs back - * to their blocking reset state - in case the host does not actually - * power them down and continues issuing DMA traffic. + * Initialize the S2MPU with the host stage-2 MPT. It is paramount + * that the S2MPU reset state is enabled and blocking all traffic, + * otherwise the host would not be forced to call the resume HVC + * before issuing DMA traffic. */ + return initialize_with_mpt(dev, &host_mpt); +} - hyp_spin_lock(&s2mpu_lock); - arm_smccc_1_1_smc(fn, mode, domain_id, group, &res); - ret = res.a0; - - if (ret == SMCCC_RET_SUCCESS) { - for_each_s2mpu(dev) { - if (!is_in_power_domain(dev, domain_id)) - continue; - - if (mode == SMC_MODE_POWER_UP) { - dev->powered = true; - ret = initialize_with_mpt(dev, &host_mpt); - } else { - ret = initialize_with_prot(dev, MPT_PROT_NONE); - dev->powered = false; - } - } - } - hyp_spin_unlock(&s2mpu_lock); - - cpu_reg(host_ctxt, 0) = ret; - return true; /* SMC handled */ +static int s2mpu_suspend(struct pkvm_iommu *dev) +{ + /* + * Stop updating the S2MPU when the host informs us about the intention + * to suspend it. Writes to powered-down MMIO registers would trigger + * SErrors in EL1 otherwise. However, hyp must put S2MPU back to + * blocking state first, in case the host does not actually power it + * down and continues issuing DMA traffic. + */ + return initialize_with_prot(dev, MPT_PROT_NONE); } static struct pkvm_iommu *find_s2mpu_by_addr(phys_addr_t addr) @@ -532,11 +494,12 @@ static int s2mpu_validate(phys_addr_t pa, size_t size) const struct pkvm_iommu_ops pkvm_s2mpu_ops = (struct pkvm_iommu_ops){ .init = s2mpu_init, .validate = s2mpu_validate, + .resume = s2mpu_resume, + .suspend = s2mpu_suspend, .data_size = sizeof(struct s2mpu_drv_data), }; const struct kvm_iommu_ops kvm_s2mpu_ops = (struct kvm_iommu_ops){ - .host_smc_handler = s2mpu_host_smc_handler, .host_mmio_dabt_handler = s2mpu_host_mmio_dabt_handler, .host_stage2_set_owner = s2mpu_host_stage2_set_owner, }; From 104e328f178c5b057f073d95b4f5ce1720d9098b Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Mon, 14 Feb 2022 22:10:20 +0000 Subject: [PATCH 197/457] ANDROID: KVM: arm64: s2mpu: Replace DABT handler with callback Previously the S2MPU DABT handler would be called directly from the host DABT handler and it would look up the corresponding S2MPU device. Now the lookup is done in the generic IOMMU DABT handler and only the actual S2MPU register access is left to the driver itself. 
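For reference, the generic dispatch this change relies on lives in nvhe/iommu.c and is not touched by this patch; it is assumed to look roughly like the sketch below. The exact prototype and member names (e.g. 'size', 'list') are illustrative, only the driver callback signature is taken from this patch:

  bool pkvm_iommu_host_dabt_handler(struct kvm_cpu_context *host_ctxt,
                                    u32 esr, phys_addr_t pa)
  {
          struct pkvm_iommu *dev;

          assert_host_component_locked();
          list_for_each_entry(dev, &iommu_list, list) {
                  if (!dev->powered || pa < dev->pa || pa >= dev->pa + dev->size)
                          continue;
                  /* Only the register access itself is left to the driver. */
                  return dev->ops->host_dabt_handler(dev, host_ctxt, esr,
                                                     pa - dev->pa);
          }
          return false;   /* not an IOMMU register access */
  }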
Bug: 190463801 Change-Id: If18ff6600cd3cc678eb59aed9c8269d13868827b Signed-off-by: David Brazdil (cherry picked from commit 667c7af5808ad6009bfe6e3e86f251421c79e96f) Signed-off-by: Mostafa Saleh Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c | 31 +++++++-------------------- 1 file changed, 8 insertions(+), 23 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c b/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c index 939e21cd968f..17c5f4548dc4 100644 --- a/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c +++ b/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c @@ -367,17 +367,6 @@ static int s2mpu_suspend(struct pkvm_iommu *dev) return initialize_with_prot(dev, MPT_PROT_NONE); } -static struct pkvm_iommu *find_s2mpu_by_addr(phys_addr_t addr) -{ - struct pkvm_iommu *dev; - - for_each_s2mpu(dev) { - if (dev->pa <= addr && addr < (dev->pa + S2MPU_MMIO_SIZE)) - return dev; - } - return NULL; -} - static u32 host_mmio_reg_access_mask(size_t off, bool is_write) { const u32 no_access = 0; @@ -404,23 +393,19 @@ static u32 host_mmio_reg_access_mask(size_t off, bool is_write) return no_access; } -static bool s2mpu_host_mmio_dabt_handler(struct kvm_cpu_context *host_ctxt, - phys_addr_t fault_pa, unsigned int len, - bool is_write, int rd) +static bool s2mpu_host_dabt_handler(struct pkvm_iommu *dev, + struct kvm_cpu_context *host_ctxt, + u32 esr, size_t off) { - struct pkvm_iommu *dev; - size_t off; + bool is_write = esr & ESR_ELx_WNR; + unsigned int len = BIT((esr & ESR_ELx_SAS) >> ESR_ELx_SAS_SHIFT); + int rd = (esr & ESR_ELx_SRT_MASK) >> ESR_ELx_SRT_SHIFT; u32 mask; /* Only handle MMIO access with u32 size and alignment. */ - if ((len != sizeof(u32)) || (fault_pa & (sizeof(u32) - 1))) + if ((len != sizeof(u32)) || (off & (sizeof(u32) - 1))) return false; - dev = find_s2mpu_by_addr(fault_pa); - if (!dev || !is_powered_on(dev)) - return false; - - off = fault_pa - dev->pa; mask = host_mmio_reg_access_mask(off, is_write); if (!mask) return false; @@ -496,10 +481,10 @@ const struct pkvm_iommu_ops pkvm_s2mpu_ops = (struct pkvm_iommu_ops){ .validate = s2mpu_validate, .resume = s2mpu_resume, .suspend = s2mpu_suspend, + .host_dabt_handler = s2mpu_host_dabt_handler, .data_size = sizeof(struct s2mpu_drv_data), }; const struct kvm_iommu_ops kvm_s2mpu_ops = (struct kvm_iommu_ops){ - .host_mmio_dabt_handler = s2mpu_host_mmio_dabt_handler, .host_stage2_set_owner = s2mpu_host_stage2_set_owner, }; From f68f711e09ffdbd84b6e07baf2d9c17d3c92cb14 Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Mon, 14 Feb 2022 22:40:09 +0000 Subject: [PATCH 198/457] ANDROID: KVM: arm64: s2mpu: Move mpt_update_flags into FMPT Core SMPT manipulation code returns mpt_update_flags, signalling whether the caller should flush the dcache (MPT_UPDATE_L2) or write new L1ATTR values to S2MPU MMIO registers (MPT_UPDATE_L1). In preparation for splitting the code into a driver-global and per-device portions, store the value in the corresponding FMPT. As long as the two code portions are called from a single critical section, the FMPT value is guaranteed to not change. 
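Condensed from set_mpt_range_locked() and its upcoming prepare/apply split, the consumer pattern that relies on the stored flags is sketched below; both halves are assumed to run under the same host stage-2 critical section:

  /* Driver-global phase: recompute the SMPT once, record what changed. */
  __set_fmpt_range(fmpt, start_gb_byte, end_gb_byte, prot); /* sets fmpt->flags */
  if (fmpt->flags & MPT_UPDATE_L2)
          kvm_flush_dcache_to_poc(fmpt->smpt, SMPT_SIZE);

  /* Per-device phase: replay the recorded L1 update on each powered S2MPU. */
  if (fmpt->flags & MPT_UPDATE_L1)
          for_each_vid(vid)
                  __set_l1entry_attr_with_fmpt(dev, gb, vid, fmpt);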
Bug: 190463801 Change-Id: Ic3cdba49e78d68053f7a3cfa84288b1f0a473f71 Signed-off-by: David Brazdil (cherry picked from commit 4c2ce4d16cc63f3d03da8411b876e9cc1fa3c52f) Signed-off-by: Mostafa Saleh Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_s2mpu.h | 42 ++++++++++++++++----------- arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c | 7 ++--- 2 files changed, 28 insertions(+), 21 deletions(-) diff --git a/arch/arm64/include/asm/kvm_s2mpu.h b/arch/arm64/include/asm/kvm_s2mpu.h index 0e322ede8421..4d517bc1d0eb 100644 --- a/arch/arm64/include/asm/kvm_s2mpu.h +++ b/arch/arm64/include/asm/kvm_s2mpu.h @@ -160,21 +160,22 @@ static const u64 mpt_prot_doubleword[] = { [MPT_PROT_RW] = 0xffffffffffffffff, }; +enum mpt_update_flags { + MPT_UPDATE_L1 = BIT(0), + MPT_UPDATE_L2 = BIT(1), +}; + struct fmpt { u32 *smpt; bool gran_1g; enum mpt_prot prot; + enum mpt_update_flags flags; }; struct mpt { struct fmpt fmpt[NR_GIGABYTES]; }; -enum mpt_update_flags { - MPT_UPDATE_L1 = BIT(0), - MPT_UPDATE_L2 = BIT(1), -}; - /* Set protection bits of SMPT in a given range without using memset. */ static inline void __set_smpt_range_slow(u32 *smpt, size_t start_gb_byte, size_t end_gb_byte, enum mpt_prot prot) @@ -254,24 +255,28 @@ static inline bool __is_smpt_uniform(u32 *smpt, enum mpt_prot prot) * Returns flags specifying whether L1/L2 changes need to be made visible * to the device. */ -static inline enum mpt_update_flags -__set_fmpt_range(struct fmpt *fmpt, size_t start_gb_byte, size_t end_gb_byte, - enum mpt_prot prot) +static inline void __set_fmpt_range(struct fmpt *fmpt, size_t start_gb_byte, + size_t end_gb_byte, enum mpt_prot prot) { if (start_gb_byte == 0 && end_gb_byte >= SZ_1G) { /* Update covers the entire GB region. */ - if (fmpt->gran_1g && fmpt->prot == prot) - return 0; + if (fmpt->gran_1g && fmpt->prot == prot) { + fmpt->flags = 0; + return; + } fmpt->gran_1g = true; fmpt->prot = prot; - return MPT_UPDATE_L1; + fmpt->flags = MPT_UPDATE_L1; + return; } if (fmpt->gran_1g) { /* GB region currently uses 1G mapping. */ - if (fmpt->prot == prot) - return 0; + if (fmpt->prot == prot) { + fmpt->flags = 0; + return; + } /* * Range has different mapping than the rest of the GB. @@ -281,19 +286,22 @@ __set_fmpt_range(struct fmpt *fmpt, size_t start_gb_byte, size_t end_gb_byte, __set_smpt_range(fmpt->smpt, 0, start_gb_byte, fmpt->prot); __set_smpt_range(fmpt->smpt, start_gb_byte, end_gb_byte, prot); __set_smpt_range(fmpt->smpt, end_gb_byte, SZ_1G, fmpt->prot); - return MPT_UPDATE_L1 | MPT_UPDATE_L2; + fmpt->flags = MPT_UPDATE_L1 | MPT_UPDATE_L2; + return; } /* GB region currently uses PAGE_SIZE mapping. */ __set_smpt_range(fmpt->smpt, start_gb_byte, end_gb_byte, prot); /* Check if the entire GB region has the same prot bits. 
*/ - if (!__is_smpt_uniform(fmpt->smpt, prot)) - return MPT_UPDATE_L2; + if (!__is_smpt_uniform(fmpt->smpt, prot)) { + fmpt->flags = MPT_UPDATE_L2; + return; + } fmpt->gran_1g = true; fmpt->prot = prot; - return MPT_UPDATE_L1; + fmpt->flags = MPT_UPDATE_L1; } #endif /* __ARM64_KVM_S2MPU_H__ */ diff --git a/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c b/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c index 17c5f4548dc4..16cf1503005e 100644 --- a/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c +++ b/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c @@ -285,19 +285,18 @@ static void set_mpt_range_locked(struct mpt *mpt, phys_addr_t first_byte, unsigned int gb, vid; struct pkvm_iommu *dev; struct fmpt *fmpt; - enum mpt_update_flags flags; for_each_gb_in_range(gb, first_gb, last_gb) { fmpt = &mpt->fmpt[gb]; start_gb_byte = (gb == first_gb) ? first_byte % SZ_1G : 0; end_gb_byte = (gb == last_gb) ? (last_byte % SZ_1G) + 1 : SZ_1G; - flags = __set_fmpt_range(fmpt, start_gb_byte, end_gb_byte, prot); + __set_fmpt_range(fmpt, start_gb_byte, end_gb_byte, prot); - if (flags & MPT_UPDATE_L2) + if (fmpt->flags & MPT_UPDATE_L2) kvm_flush_dcache_to_poc(fmpt->smpt, SMPT_SIZE); - if (flags & MPT_UPDATE_L1) { + if (fmpt->flags & MPT_UPDATE_L1) { for_each_powered_s2mpu(dev) { for_each_vid(vid) __set_l1entry_attr_with_fmpt(dev, gb, vid, fmpt); From 9bbbef3cd5812af149767b15dc02456d9f1c46cc Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Mon, 14 Feb 2022 22:59:04 +0000 Subject: [PATCH 199/457] ANDROID: KVM: arm64: s2mpu: Implement host stage2 idmap callbacks Remove the existing 's2mpu_host_stage2_set_owner' hook implementation and refactor the code to match the prepare/apply split of the generic IOMMU callbacks for updating host stage-2 mappings. Bug: 190463801 Change-Id: I36485b9cc6786243a77afb3eb18c63e38ec46404 Signed-off-by: David Brazdil (cherry picked from commit bb81c25bcbea0d1cc5457d958df6a39fa3546540) Signed-off-by: Mostafa Saleh Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c | 122 +++++++++++++------------- 1 file changed, 59 insertions(+), 63 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c b/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c index 16cf1503005e..3fbd68c84218 100644 --- a/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c +++ b/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c @@ -24,26 +24,23 @@ #define PA_MAX ((phys_addr_t)SZ_1G * NR_GIGABYTES) -#define for_each_s2mpu(i) \ - for ((i) = &s2mpus[0]; (i) != &s2mpus[nr_s2mpus]; (i)++) - -#define for_each_powered_s2mpu(i) \ - for_each_s2mpu((i)) if (is_powered_on((i))) - #define CTX_CFG_ENTRY(ctxid, nr_ctx, vid) \ (CONTEXT_CFG_VALID_VID_CTX_VID(ctxid, vid) \ | (((ctxid) < (nr_ctx)) ? CONTEXT_CFG_VALID_VID_CTX_VALID(ctxid) : 0)) -static size_t __ro_after_init nr_s2mpus; -static struct pkvm_iommu __ro_after_init *s2mpus; -static struct mpt host_mpt; -static hyp_spinlock_t s2mpu_lock; - struct s2mpu_drv_data { u32 version; u32 context_cfg_valid_vid; }; +static struct mpt host_mpt; + +static inline enum mpt_prot prot_to_mpt(enum kvm_pgtable_prot prot) +{ + return ((prot & KVM_PGTABLE_PROT_R) ? MPT_PROT_R : 0) | + ((prot & KVM_PGTABLE_PROT_W) ? 
MPT_PROT_W : 0); +} + static bool is_version(struct pkvm_iommu *dev, u32 version) { struct s2mpu_drv_data *data = (struct s2mpu_drv_data *)dev->data; @@ -51,11 +48,6 @@ static bool is_version(struct pkvm_iommu *dev, u32 version) return (data->version & VERSION_CHECK_MASK) == version; } -static bool is_powered_on(struct pkvm_iommu *dev) -{ - return dev->powered; -} - static u32 __context_cfg_valid_vid(struct pkvm_iommu *dev, u32 vid_bmap) { struct s2mpu_drv_data *data = (struct s2mpu_drv_data *)dev->data; @@ -272,18 +264,32 @@ static int initialize_with_mpt(struct pkvm_iommu *dev, struct mpt *mpt) return 0; } -/* - * Set MPT protection bits set to 'prot' in the give byte range (page-aligned). - * Update currently powered S2MPUs. - */ -static void set_mpt_range_locked(struct mpt *mpt, phys_addr_t first_byte, - phys_addr_t last_byte, enum mpt_prot prot) +static bool to_valid_range(phys_addr_t *start, phys_addr_t *end) +{ + phys_addr_t new_start = *start; + phys_addr_t new_end = *end; + + if (new_end > PA_MAX) + new_end = PA_MAX; + + new_start = ALIGN_DOWN(new_start, SMPT_GRAN); + new_end = ALIGN(new_end, SMPT_GRAN); + + if (new_start >= new_end) + return false; + + *start = new_start; + *end = new_end; + return true; +} + +static void __mpt_idmap_prepare(struct mpt *mpt, phys_addr_t first_byte, + phys_addr_t last_byte, enum mpt_prot prot) { unsigned int first_gb = first_byte / SZ_1G; unsigned int last_gb = last_byte / SZ_1G; size_t start_gb_byte, end_gb_byte; - unsigned int gb, vid; - struct pkvm_iommu *dev; + unsigned int gb; struct fmpt *fmpt; for_each_gb_in_range(gb, first_gb, last_gb) { @@ -295,52 +301,44 @@ static void set_mpt_range_locked(struct mpt *mpt, phys_addr_t first_byte, if (fmpt->flags & MPT_UPDATE_L2) kvm_flush_dcache_to_poc(fmpt->smpt, SMPT_SIZE); - - if (fmpt->flags & MPT_UPDATE_L1) { - for_each_powered_s2mpu(dev) { - for_each_vid(vid) - __set_l1entry_attr_with_fmpt(dev, gb, vid, fmpt); - } - } } - - /* Invalidate range in all powered S2MPUs. */ - for_each_powered_s2mpu(dev) - __range_invalidation(dev, first_byte, last_byte); } -static void s2mpu_host_stage2_set_owner(phys_addr_t addr, size_t size, - enum pkvm_component_id owner_id) +static void __mpt_idmap_apply(struct pkvm_iommu *dev, struct mpt *mpt, + phys_addr_t first_byte, phys_addr_t last_byte) { - /* Grant access only to the default owner of the page table (ID=0). */ - enum mpt_prot prot = owner_id ? MPT_PROT_NONE : MPT_PROT_RW; + unsigned int first_gb = first_byte / SZ_1G; + unsigned int last_gb = last_byte / SZ_1G; + unsigned int gb, vid; + struct fmpt *fmpt; - /* - * NOTE: The following code refers to 'end' as the exclusive upper - * bound and 'last' as the inclusive one. - */ + for_each_gb_in_range(gb, first_gb, last_gb) { + fmpt = &mpt->fmpt[gb]; - /* - * Sanitize inputs with S2MPU-specific physical address space bounds. - * Ownership change requests outside this boundary will be ignored. - * The S2MPU also specifies that the PA region 4-34GB always maps to - * PROT_NONE and the corresponding MMIO registers are read-only. - * Ownership changes in this region will have no effect. 
- */ + if (fmpt->flags & MPT_UPDATE_L1) { + for_each_vid(vid) + __set_l1entry_attr_with_fmpt(dev, gb, vid, fmpt); + } + } + __range_invalidation(dev, first_byte, last_byte); +} - if (addr >= PA_MAX) +static void s2mpu_host_stage2_idmap_prepare(phys_addr_t start, phys_addr_t end, + enum kvm_pgtable_prot prot) +{ + if (!to_valid_range(&start, &end)) return; - size = min(size, (size_t)(PA_MAX - addr)); - if (size == 0) + __mpt_idmap_prepare(&host_mpt, start, end - 1, prot_to_mpt(prot)); +} + +static void s2mpu_host_stage2_idmap_apply(struct pkvm_iommu *dev, + phys_addr_t start, phys_addr_t end) +{ + if (!to_valid_range(&start, &end)) return; - hyp_spin_lock(&s2mpu_lock); - set_mpt_range_locked(&host_mpt, - ALIGN_DOWN(addr, SMPT_GRAN), - ALIGN(addr + size, SMPT_GRAN) - 1, - prot); - hyp_spin_unlock(&s2mpu_lock); + __mpt_idmap_apply(dev, &host_mpt, start, end - 1); } static int s2mpu_resume(struct pkvm_iommu *dev) @@ -480,10 +478,8 @@ const struct pkvm_iommu_ops pkvm_s2mpu_ops = (struct pkvm_iommu_ops){ .validate = s2mpu_validate, .resume = s2mpu_resume, .suspend = s2mpu_suspend, + .host_stage2_idmap_prepare = s2mpu_host_stage2_idmap_prepare, + .host_stage2_idmap_apply = s2mpu_host_stage2_idmap_apply, .host_dabt_handler = s2mpu_host_dabt_handler, .data_size = sizeof(struct s2mpu_drv_data), }; - -const struct kvm_iommu_ops kvm_s2mpu_ops = (struct kvm_iommu_ops){ - .host_stage2_set_owner = s2mpu_host_stage2_set_owner, -}; From 197b8ad41c4e4b9d2e5f7a8fbf2f1871785874ec Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Wed, 23 Feb 2022 19:54:47 +0000 Subject: [PATCH 200/457] ANDROID: KVM: arm64: Remove unused IOMMU hooks, kvm_iommu_ops With new generic IOMMU code in place, and with all S2MPU code having been migrated to the new pkvm_iommu_ops callbacks, remove all the now unused code. 
Bug: 190463801 Change-Id: I83a1ac7c2e30ee76712079727da4607bda5de68a Signed-off-by: David Brazdil (cherry picked from commit e900628cfca812f9f4c4c93754d91be16be4e466) Signed-off-by: Mostafa Saleh Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_host.h | 4 ---- arch/arm64/include/asm/kvm_hyp.h | 3 +-- arch/arm64/kvm/arm.c | 15 +++------------ arch/arm64/kvm/hyp/include/nvhe/iommu.h | 15 --------------- arch/arm64/kvm/hyp/nvhe/hyp-main.c | 7 +------ arch/arm64/kvm/hyp/nvhe/setup.c | 23 +---------------------- 6 files changed, 6 insertions(+), 61 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 3cf7e69d4afe..48637d925ebe 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -379,10 +379,6 @@ extern s64 kvm_nvhe_sym(hyp_physvirt_offset); extern u64 kvm_nvhe_sym(hyp_cpu_logical_map)[NR_CPUS]; #define hyp_cpu_logical_map CHOOSE_NVHE_SYM(hyp_cpu_logical_map) -enum kvm_iommu_driver { - KVM_IOMMU_DRIVER_NONE, -}; - enum pkvm_iommu_driver_id { PKVM_IOMMU_DRIVER_S2MPU, PKVM_IOMMU_NR_DRIVERS, diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index f3e041cfe751..d450ed354d69 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -114,8 +114,7 @@ void __noreturn __hyp_do_panic(struct kvm_cpu_context *host_ctxt, u64 spsr, void __pkvm_init_switch_pgd(phys_addr_t phys, unsigned long size, phys_addr_t pgd, void *sp, void *cont_fn); int __pkvm_init(phys_addr_t phys, unsigned long size, unsigned long nr_cpus, - unsigned long *per_cpu_base, u32 hyp_va_bits, - enum kvm_iommu_driver iommu_driver); + unsigned long *per_cpu_base, u32 hyp_va_bits); void __noreturn __host_enter(struct kvm_cpu_context *host_ctxt); #endif diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 977a58b4c8a7..1b606bb1d285 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1919,11 +1919,6 @@ static bool init_psci_relay(void) return true; } -static int init_stage2_iommu(void) -{ - return KVM_IOMMU_DRIVER_NONE; -} - static int init_subsystems(void) { int err = 0; @@ -1982,7 +1977,7 @@ static void teardown_hyp_mode(void) } } -static int do_pkvm_init(u32 hyp_va_bits, enum kvm_iommu_driver iommu_driver) +static int do_pkvm_init(u32 hyp_va_bits) { void *per_cpu_base = kvm_ksym_ref(kvm_nvhe_sym(kvm_arm_hyp_percpu_base)); int ret; @@ -1991,7 +1986,7 @@ static int do_pkvm_init(u32 hyp_va_bits, enum kvm_iommu_driver iommu_driver) cpu_hyp_init_context(); ret = kvm_call_hyp_nvhe(__pkvm_init, hyp_mem_base, hyp_mem_size, num_possible_cpus(), kern_hyp_va(per_cpu_base), - hyp_va_bits, iommu_driver); + hyp_va_bits); cpu_hyp_init_features(); /* @@ -2028,11 +2023,7 @@ static int kvm_hyp_init_protection(u32 hyp_va_bits) if (ret) return ret; - ret = init_stage2_iommu(); - if (ret < 0) - return ret; - - ret = do_pkvm_init(hyp_va_bits, (enum kvm_iommu_driver)ret); + ret = do_pkvm_init(hyp_va_bits); if (ret) return ret; diff --git a/arch/arm64/kvm/hyp/include/nvhe/iommu.h b/arch/arm64/kvm/hyp/include/nvhe/iommu.h index 7f699e3d323e..2367fe153ad3 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/iommu.h +++ b/arch/arm64/kvm/hyp/include/nvhe/iommu.h @@ -82,19 +82,4 @@ void pkvm_iommu_host_stage2_idmap(phys_addr_t start, phys_addr_t end, extern const struct pkvm_iommu_ops pkvm_s2mpu_ops; -struct kvm_iommu_ops { - int (*init)(void); - bool (*host_smc_handler)(struct kvm_cpu_context *host_ctxt); - bool (*host_mmio_dabt_handler)(struct kvm_cpu_context *host_ctxt, - phys_addr_t fault_pa, 
unsigned int len, - bool is_write, int rd); - void (*host_stage2_set_owner)(phys_addr_t addr, size_t size, - enum pkvm_component_id owner_id); - int (*host_stage2_adjust_mmio_range)(phys_addr_t addr, phys_addr_t *start, - phys_addr_t *end); -}; - -extern struct kvm_iommu_ops kvm_iommu_ops; -extern const struct kvm_iommu_ops kvm_s2mpu_ops; - #endif /* __ARM64_KVM_NVHE_IOMMU_H__ */ diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index d597ba27ee38..0f576493e094 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -37,8 +37,6 @@ static DEFINE_PER_CPU(struct user_fpsimd_state, loaded_host_fpsimd_state); DEFINE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params); -struct kvm_iommu_ops kvm_iommu_ops; - void __kvm_hyp_host_forward_smc(struct kvm_cpu_context *host_ctxt); static int pkvm_refill_memcache(struct pkvm_hyp_vcpu *hyp_vcpu) @@ -1016,7 +1014,6 @@ static void handle___pkvm_init(struct kvm_cpu_context *host_ctxt) DECLARE_REG(unsigned long, nr_cpus, host_ctxt, 3); DECLARE_REG(unsigned long *, per_cpu_base, host_ctxt, 4); DECLARE_REG(u32, hyp_va_bits, host_ctxt, 5); - DECLARE_REG(enum kvm_iommu_driver, iommu_driver, host_ctxt, 6); /* * __pkvm_init() will return only if an error occurred, otherwise it @@ -1024,7 +1021,7 @@ static void handle___pkvm_init(struct kvm_cpu_context *host_ctxt) * with the host context directly. */ cpu_reg(host_ctxt, 1) = __pkvm_init(phys, size, nr_cpus, per_cpu_base, - hyp_va_bits, iommu_driver); + hyp_va_bits); } static void handle___pkvm_cpu_set_vector(struct kvm_cpu_context *host_ctxt) @@ -1228,8 +1225,6 @@ static void handle_host_smc(struct kvm_cpu_context *host_ctxt) bool handled; handled = kvm_host_psci_handler(host_ctxt); - if (!handled && kvm_iommu_ops.host_smc_handler) - handled = kvm_iommu_ops.host_smc_handler(host_ctxt); if (!handled) handled = kvm_host_ffa_handler(host_ctxt); if (!handled) diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c index db6f747aad84..b10d903e05f2 100644 --- a/arch/arm64/kvm/hyp/nvhe/setup.c +++ b/arch/arm64/kvm/hyp/nvhe/setup.c @@ -302,16 +302,6 @@ static int fix_hyp_pgtable_refcnt(void) &walker); } -static int select_iommu_ops(enum kvm_iommu_driver driver) -{ - switch (driver) { - case KVM_IOMMU_DRIVER_NONE: - return 0; - } - - return -EINVAL; -} - void __noreturn __pkvm_init_finalise(void) { struct kvm_host_data *host_data = this_cpu_ptr(&kvm_host_data); @@ -341,12 +331,6 @@ void __noreturn __pkvm_init_finalise(void) }; pkvm_pgtable.mm_ops = &pkvm_pgtable_mm_ops; - if (kvm_iommu_ops.init) { - ret = kvm_iommu_ops.init(); - if (ret) - goto out; - } - ret = fix_host_ownership(); if (ret) goto out; @@ -375,8 +359,7 @@ out: } int __pkvm_init(phys_addr_t phys, unsigned long size, unsigned long nr_cpus, - unsigned long *per_cpu_base, u32 hyp_va_bits, - enum kvm_iommu_driver iommu_driver) + unsigned long *per_cpu_base, u32 hyp_va_bits) { struct kvm_nvhe_init_params *params; void *virt = hyp_phys_to_virt(phys); @@ -399,10 +382,6 @@ int __pkvm_init(phys_addr_t phys, unsigned long size, unsigned long nr_cpus, if (ret) return ret; - ret = select_iommu_ops(iommu_driver); - if (ret) - return ret; - update_nvhe_init_params(); /* Jump in the idmap page to switch to the new page-tables */ From 9fdbe1f39bb4e4cd6b7765cdfba76b3421173d98 Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Tue, 15 Mar 2022 11:37:28 +0000 Subject: [PATCH 201/457] ANDROID: KVM: arm64: iommu: Harden __pkvm_iommu_pm_notify Currently __pkvm_iommu_pm_notify always 
changes the value of dev->powered following a suspend/resume attempt. This could potentially be abused to force the hypervisor to stop issuing updates to an S2MPU and preserving an old/invalid state. Modify to only update the power state if suspend/resume was successful. Bug: 190463801 Change-Id: I32d5ce7d18b55c3bebfa3273c9f5ca098d5e61f4 Signed-off-by: David Brazdil (cherry picked from commit 6eaed0b8b769b78fe237a13d757adb4bf1a883c6) Signed-off-by: Mostafa Saleh Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/nvhe/iommu.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/iommu.c b/arch/arm64/kvm/hyp/nvhe/iommu.c index c6224056dfa0..f8fc5ad4e040 100644 --- a/arch/arm64/kvm/hyp/nvhe/iommu.c +++ b/arch/arm64/kvm/hyp/nvhe/iommu.c @@ -360,10 +360,12 @@ int __pkvm_iommu_pm_notify(unsigned long dev_id, enum pkvm_iommu_pm_event event) if (dev) { if (event == PKVM_IOMMU_PM_SUSPEND) { ret = dev->ops->suspend ? dev->ops->suspend(dev) : 0; - dev->powered = !!ret; + if (!ret) + dev->powered = false; } else if (event == PKVM_IOMMU_PM_RESUME) { ret = dev->ops->resume ? dev->ops->resume(dev) : 0; - dev->powered = !ret; + if (!ret) + dev->powered = true; } else { ret = -EINVAL; } From e648f0fda00d7470c294712938855823229be3e1 Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Fri, 25 Mar 2022 08:33:25 +0000 Subject: [PATCH 202/457] ANDROID: KVM: arm64: iommu: Free memory on registration error Memory for IOMMU device entries gets allocated from a pool donated by the host. It is possible for pkvm_iommu_register() to allocate the memory and then fail, in which case the memory remains unused but not freed. Refactor the code such that the host lock covers the entire section where the memory is allocated. This way we can return the memory back to the linear allocator if an error is returned. Bug: 190463801 Change-Id: I881b7fd56fe7b70c1546e897d7d872af6e34e762 Signed-off-by: David Brazdil (cherry picked from commit acb9a25416ad028354393eb44adf512a4d7ce567) Signed-off-by: Mostafa Saleh Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/nvhe/iommu.c | 79 ++++++++++++++++++++++----------- 1 file changed, 52 insertions(+), 27 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/iommu.c b/arch/arm64/kvm/hyp/nvhe/iommu.c index f8fc5ad4e040..1b8283d3c5c5 100644 --- a/arch/arm64/kvm/hyp/nvhe/iommu.c +++ b/arch/arm64/kvm/hyp/nvhe/iommu.c @@ -31,6 +31,9 @@ static struct pkvm_iommu_driver iommu_drivers[PKVM_IOMMU_NR_DRIVERS]; /* IOMMU device list. Must only be accessed with host_mmu.lock held. */ static LIST_HEAD(iommu_list); +static void *iommu_mem_pool; +static size_t iommu_mem_remaining; + static void assert_host_component_locked(void) { hyp_assert_lock_held(&host_mmu.lock); @@ -89,41 +92,56 @@ static inline bool is_driver_ready(struct pkvm_iommu_driver *drv) return atomic_read(&drv->state) == IOMMU_DRIVER_READY; } -/* Global memory pool for allocating IOMMU list entry structs. */ -static inline struct pkvm_iommu * -alloc_iommu_list_entry(struct pkvm_iommu_driver *drv, void *mem, size_t mem_size) +static size_t __iommu_alloc_size(struct pkvm_iommu_driver *drv) { - static void *pool; - static size_t remaining; - static DEFINE_HYP_SPINLOCK(lock); - size_t size = sizeof(struct pkvm_iommu) + drv->ops->data_size; + return ALIGN(sizeof(struct pkvm_iommu) + drv->ops->data_size, + sizeof(unsigned long)); +} + +/* Global memory pool for allocating IOMMU list entry structs. 
*/ +static inline struct pkvm_iommu *alloc_iommu(struct pkvm_iommu_driver *drv, + void *mem, size_t mem_size) +{ + size_t size = __iommu_alloc_size(drv); void *ptr; - size = ALIGN(size, sizeof(unsigned long)); - - hyp_spin_lock(&lock); + assert_host_component_locked(); /* * If new memory is being provided, replace the existing pool with it. * Any remaining memory in the pool is discarded. */ if (mem && mem_size) { - pool = mem; - remaining = mem_size; + iommu_mem_pool = mem; + iommu_mem_remaining = mem_size; } - if (size <= remaining) { - ptr = pool; - pool += size; - remaining -= size; - } else { - ptr = NULL; - } + if (size > iommu_mem_remaining) + return NULL; - hyp_spin_unlock(&lock); + ptr = iommu_mem_pool; + iommu_mem_pool += size; + iommu_mem_remaining -= size; return ptr; } +static inline void free_iommu(struct pkvm_iommu_driver *drv, struct pkvm_iommu *ptr) +{ + size_t size = __iommu_alloc_size(drv); + + assert_host_component_locked(); + + if (!ptr) + return; + + /* Only allow freeing the last allocated buffer. */ + if ((void *)ptr + size != iommu_mem_pool) + return; + + iommu_mem_pool -= size; + iommu_mem_remaining += size; +} + static bool is_overlap(phys_addr_t r1_start, size_t r1_size, phys_addr_t r2_start, size_t r2_size) { @@ -310,16 +328,20 @@ int __pkvm_iommu_register(unsigned long dev_id, return ret; } + host_lock_component(); + /* Allocate memory for the new device entry. */ - dev = alloc_iommu_list_entry(drv, mem_va, mem_size); - if (!dev) - return -ENOMEM; + dev = alloc_iommu(drv, mem_va, mem_size); + if (!dev) { + ret = -ENOMEM; + goto out; + } /* Create EL2 mapping for the device. */ ret = __pkvm_create_private_mapping(dev_pa, dev_size, PAGE_HYP_DEVICE,(unsigned long *)&dev_va); if (ret) - return ret; + goto out; /* Populate the new device entry. */ *dev = (struct pkvm_iommu){ @@ -330,14 +352,15 @@ int __pkvm_iommu_register(unsigned long dev_id, .size = dev_size, }; - /* Take the host_mmu lock to block host stage-2 changes. */ - host_lock_component(); if (!validate_against_existing_iommus(dev)) { ret = -EBUSY; goto out; } - /* Unmap the device's MMIO range from host stage-2. */ + /* + * Unmap the device's MMIO range from host stage-2. Future attempts to + * map will be blocked by pkvm_iommu_host_stage2_adjust_range. + */ ret = host_stage2_unmap_dev_locked(dev_pa, dev_size); if (ret) goto out; @@ -346,6 +369,8 @@ int __pkvm_iommu_register(unsigned long dev_id, list_add_tail(&dev->list, &iommu_list); out: + if (ret) + free_iommu(drv, dev); host_unlock_component(); return ret; } From d4e1138d67ba8c6d81ef0f7883cf5a9b8c8574df Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Fri, 25 Mar 2022 19:38:21 +0000 Subject: [PATCH 203/457] ANDROID: KVM: arm64: iommu: Create private mapping last Private EL2 mappings currently cannot be removed. Move the creation of IOMMU device mappings at the end of the registration function so that other errors do not result in unnecessary mappings. 
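
The ordering rule is easiest to see in isolation. The sketch below is illustrative only (the helper names are made up, not the real hypervisor primitives): every fallible but reversible step runs before the one step that cannot be undone, so a mid-sequence error never leaves an orphaned EL2 mapping behind.

#include <errno.h>

/* Hypothetical stand-ins for the real checks and mappings. */
static int validate_inputs(void)        { return 0; /* or -EINVAL */ }
static int reserve_resources(void)      { return 0; } /* reversible */
static void unreserve_resources(void)   { }
static int create_private_mapping(void) { return 0; } /* cannot be undone */

static int register_device(void)
{
        int ret;

        ret = validate_inputs();
        if (ret)
                return ret;

        ret = reserve_resources();
        if (ret)
                return ret;

        /* Irreversible step last: nothing after it is allowed to fail. */
        ret = create_private_mapping();
        if (ret)
                unreserve_resources();

        return ret;
}
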
Bug: 190463801 Change-Id: If100e35790cf512cabfa9327f790d20540977b7d Signed-off-by: David Brazdil (cherry picked from commit e6574a68fafc1961150d657f0ebf882c68afbb5b) Signed-off-by: Mostafa Saleh Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/nvhe/iommu.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/iommu.c b/arch/arm64/kvm/hyp/nvhe/iommu.c index 1b8283d3c5c5..367a8eef39bb 100644 --- a/arch/arm64/kvm/hyp/nvhe/iommu.c +++ b/arch/arm64/kvm/hyp/nvhe/iommu.c @@ -293,7 +293,7 @@ int __pkvm_iommu_register(unsigned long dev_id, { struct pkvm_iommu *dev = NULL; struct pkvm_iommu_driver *drv; - void *dev_va, *mem_va = NULL; + void *mem_va = NULL; int ret = 0; drv = get_driver(drv_id); @@ -337,18 +337,11 @@ int __pkvm_iommu_register(unsigned long dev_id, goto out; } - /* Create EL2 mapping for the device. */ - ret = __pkvm_create_private_mapping(dev_pa, dev_size, - PAGE_HYP_DEVICE,(unsigned long *)&dev_va); - if (ret) - goto out; - /* Populate the new device entry. */ *dev = (struct pkvm_iommu){ .id = dev_id, .ops = drv->ops, .pa = dev_pa, - .va = dev_va, .size = dev_size, }; @@ -358,13 +351,21 @@ int __pkvm_iommu_register(unsigned long dev_id, } /* - * Unmap the device's MMIO range from host stage-2. Future attempts to - * map will be blocked by pkvm_iommu_host_stage2_adjust_range. + * Unmap the device's MMIO range from host stage-2. If registration + * is successful, future attempts to re-map will be blocked by + * pkvm_iommu_host_stage2_adjust_range. */ ret = host_stage2_unmap_dev_locked(dev_pa, dev_size); if (ret) goto out; + /* Create EL2 mapping for the device. */ + ret = __pkvm_create_private_mapping(dev_pa, dev_size, + PAGE_HYP_DEVICE, (unsigned long *)(&dev->va)); + if (ret){ + goto out; + } + /* Register device and prevent host from mapping the MMIO range. */ list_add_tail(&dev->list, &iommu_list); From 832a58028ca08b08536bb0821bb4d72e56651fe7 Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Tue, 15 Mar 2022 11:35:01 +0000 Subject: [PATCH 204/457] ANDROID: KVM: arm64: iommu: Run validate() on struct pkvm_iommu In preparation for needing to validate more aspects of a device that is about to be registered, change the callback to accept the to-be-added 'struct pkvm_iommu' rather than individual inputs. Bug: 190463801 Change-Id: I74663af26a06624a7f084b067de97f28baa6f262 Signed-off-by: David Brazdil (cherry picked from commit be84f2c770810494ef178cdb73e22b05f2dfc2b6) Signed-off-by: Mostafa Saleh Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/include/nvhe/iommu.h | 7 ++++--- arch/arm64/kvm/hyp/nvhe/iommu.c | 12 ++++++------ arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c | 4 ++-- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/arch/arm64/kvm/hyp/include/nvhe/iommu.h b/arch/arm64/kvm/hyp/include/nvhe/iommu.h index 2367fe153ad3..70d9c9e67991 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/iommu.h +++ b/arch/arm64/kvm/hyp/include/nvhe/iommu.h @@ -20,10 +20,11 @@ struct pkvm_iommu_ops { int (*init)(void *data, size_t size); /* - * Driver-specific validation of device registration inputs. - * This should be stateless. No locks are held at entry. + * Driver-specific validation of a device that is being registered. + * All fields of the device struct have been populated. + * Called with the host lock held. */ - int (*validate)(phys_addr_t base, size_t size); + int (*validate)(struct pkvm_iommu *dev); /* * Callback to apply a host stage-2 mapping change at driver level. 
diff --git a/arch/arm64/kvm/hyp/nvhe/iommu.c b/arch/arm64/kvm/hyp/nvhe/iommu.c index 367a8eef39bb..6b5bd157dd8d 100644 --- a/arch/arm64/kvm/hyp/nvhe/iommu.c +++ b/arch/arm64/kvm/hyp/nvhe/iommu.c @@ -306,12 +306,6 @@ int __pkvm_iommu_register(unsigned long dev_id, if (!is_mmio_range(dev_pa, dev_size)) return -EINVAL; - if (drv->ops->validate) { - ret = drv->ops->validate(dev_pa, dev_size); - if (ret) - return ret; - } - /* * Accept memory donation if the host is providing new memory. * Note: We do not return the memory even if there is an error later. @@ -350,6 +344,12 @@ int __pkvm_iommu_register(unsigned long dev_id, goto out; } + if (dev->ops->validate) { + ret = dev->ops->validate(dev); + if (ret) + goto out; + } + /* * Unmap the device's MMIO range from host stage-2. If registration * is successful, future attempts to re-map will be blocked by diff --git a/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c b/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c index 3fbd68c84218..dd9c81f15898 100644 --- a/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c +++ b/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c @@ -465,9 +465,9 @@ static int s2mpu_init(void *data, size_t size) return ret; } -static int s2mpu_validate(phys_addr_t pa, size_t size) +static int s2mpu_validate(struct pkvm_iommu *dev) { - if (size != S2MPU_MMIO_SIZE) + if (dev->size != S2MPU_MMIO_SIZE) return -EINVAL; return 0; From f7a8d059947290a70d31e8ccf492ef8655d9145b Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Tue, 15 Mar 2022 11:33:45 +0000 Subject: [PATCH 205/457] ANDROID: KVM: arm64: iommu: Create parent/child relation In preparation for adding new IOMMU devices that act as suppliers to others, add the notion of a parent IOMMU device. Such device must be registered after its parent and the driver of the parent device must validate the addition. The relation has no generic implications, it is up to drivers to make use of it. Bug: 190463801 Change-Id: I1e4be18a5ad826f84b0ea895129d2ef54ee17f85 Signed-off-by: David Brazdil (cherry picked from commit e69c61cf4ebab19f5abdfedb66c48e96345dbdf4) Signed-off-by: Mostafa Saleh Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_host.h | 2 +- arch/arm64/kvm/hyp/include/nvhe/iommu.h | 10 ++++++++++ arch/arm64/kvm/hyp/nvhe/hyp-main.c | 8 +++++--- arch/arm64/kvm/hyp/nvhe/iommu.c | 18 ++++++++++++++++++ arch/arm64/kvm/iommu.c | 7 ++++--- arch/arm64/kvm/iommu/s2mpu.c | 2 +- 6 files changed, 39 insertions(+), 8 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 48637d925ebe..d45487aa01c3 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -391,7 +391,7 @@ enum pkvm_iommu_pm_event { int pkvm_iommu_driver_init(enum pkvm_iommu_driver_id drv_id, void *data, size_t size); int pkvm_iommu_register(struct device *dev, enum pkvm_iommu_driver_id drv_id, - phys_addr_t pa, size_t size); + phys_addr_t pa, size_t size, struct device *parent); int pkvm_iommu_suspend(struct device *dev); int pkvm_iommu_resume(struct device *dev); diff --git a/arch/arm64/kvm/hyp/include/nvhe/iommu.h b/arch/arm64/kvm/hyp/include/nvhe/iommu.h index 70d9c9e67991..e9683d314938 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/iommu.h +++ b/arch/arm64/kvm/hyp/include/nvhe/iommu.h @@ -26,6 +26,12 @@ struct pkvm_iommu_ops { */ int (*validate)(struct pkvm_iommu *dev); + /* + * Validation of a new child device that is being register by + * the parent device the child selected. Called with the host lock held. 
+ */ + int (*validate_child)(struct pkvm_iommu *dev, struct pkvm_iommu *child); + /* * Callback to apply a host stage-2 mapping change at driver level. * Called before 'host_stage2_idmap_apply' with host lock held. @@ -57,7 +63,10 @@ struct pkvm_iommu_ops { }; struct pkvm_iommu { + struct pkvm_iommu *parent; struct list_head list; + struct list_head siblings; + struct list_head children; unsigned long id; const struct pkvm_iommu_ops *ops; phys_addr_t pa; @@ -71,6 +80,7 @@ int __pkvm_iommu_driver_init(enum pkvm_iommu_driver_id id, void *data, size_t si int __pkvm_iommu_register(unsigned long dev_id, enum pkvm_iommu_driver_id drv_id, phys_addr_t dev_pa, size_t dev_size, + unsigned long parent_id, void *kern_mem_va, size_t mem_size); int __pkvm_iommu_pm_notify(unsigned long dev_id, enum pkvm_iommu_pm_event event); diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 0f576493e094..b4b1936b690d 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -1126,11 +1126,13 @@ static void handle___pkvm_iommu_register(struct kvm_cpu_context *host_ctxt) DECLARE_REG(enum pkvm_iommu_driver_id, drv_id, host_ctxt, 2); DECLARE_REG(phys_addr_t, dev_pa, host_ctxt, 3); DECLARE_REG(size_t, dev_size, host_ctxt, 4); - DECLARE_REG(void *, mem, host_ctxt, 5); - DECLARE_REG(size_t, mem_size, host_ctxt, 6); + DECLARE_REG(unsigned long, parent_id, host_ctxt, 5); + DECLARE_REG(void *, mem, host_ctxt, 6); + DECLARE_REG(size_t, mem_size, host_ctxt, 7); cpu_reg(host_ctxt, 1) = __pkvm_iommu_register(dev_id, drv_id, dev_pa, - dev_size, mem, mem_size); + dev_size, parent_id, + mem, mem_size); } static void handle___pkvm_iommu_pm_notify(struct kvm_cpu_context *host_ctxt) diff --git a/arch/arm64/kvm/hyp/nvhe/iommu.c b/arch/arm64/kvm/hyp/nvhe/iommu.c index 6b5bd157dd8d..bce13e28f1ce 100644 --- a/arch/arm64/kvm/hyp/nvhe/iommu.c +++ b/arch/arm64/kvm/hyp/nvhe/iommu.c @@ -289,6 +289,7 @@ out: int __pkvm_iommu_register(unsigned long dev_id, enum pkvm_iommu_driver_id drv_id, phys_addr_t dev_pa, size_t dev_size, + unsigned long parent_id, void *kern_mem_va, size_t mem_size) { struct pkvm_iommu *dev = NULL; @@ -333,6 +334,7 @@ int __pkvm_iommu_register(unsigned long dev_id, /* Populate the new device entry. */ *dev = (struct pkvm_iommu){ + .children = LIST_HEAD_INIT(dev->children), .id = dev_id, .ops = drv->ops, .pa = dev_pa, @@ -344,6 +346,20 @@ int __pkvm_iommu_register(unsigned long dev_id, goto out; } + if (parent_id) { + dev->parent = find_iommu_by_id(parent_id); + if (!dev->parent) { + ret = -EINVAL; + goto out; + } + + if (dev->parent->ops->validate_child) { + ret = dev->parent->ops->validate_child(dev->parent, dev); + if (ret) + goto out; + } + } + if (dev->ops->validate) { ret = dev->ops->validate(dev); if (ret) @@ -368,6 +384,8 @@ int __pkvm_iommu_register(unsigned long dev_id, /* Register device and prevent host from mapping the MMIO range. 
*/ list_add_tail(&dev->list, &iommu_list); + if (dev->parent) + list_add_tail(&dev->siblings, &dev->parent->children); out: if (ret) diff --git a/arch/arm64/kvm/iommu.c b/arch/arm64/kvm/iommu.c index e13d36ec57e7..01176199e08f 100644 --- a/arch/arm64/kvm/iommu.c +++ b/arch/arm64/kvm/iommu.c @@ -18,7 +18,7 @@ int pkvm_iommu_driver_init(enum pkvm_iommu_driver_id id, void *data, size_t size } int pkvm_iommu_register(struct device *dev, enum pkvm_iommu_driver_id drv_id, - phys_addr_t pa, size_t size) + phys_addr_t pa, size_t size, struct device *parent) { void *mem; int ret; @@ -29,14 +29,15 @@ int pkvm_iommu_register(struct device *dev, enum pkvm_iommu_driver_id drv_id, * We assume that hyp never allocates more than a page per hypcall. */ ret = kvm_call_hyp_nvhe(__pkvm_iommu_register, dev_to_id(dev), - drv_id, pa, size, NULL, 0); + drv_id, pa, size, dev_to_id(parent), NULL, 0); if (ret == -ENOMEM) { mem = (void *)__get_free_page(GFP_KERNEL); if (!mem) return -ENOMEM; ret = kvm_call_hyp_nvhe(__pkvm_iommu_register, dev_to_id(dev), - drv_id, pa, size, mem, PAGE_SIZE); + drv_id, pa, size, dev_to_id(parent), + mem, PAGE_SIZE); } return ret; } diff --git a/arch/arm64/kvm/iommu/s2mpu.c b/arch/arm64/kvm/iommu/s2mpu.c index be2b1ad09480..7d989afde0fb 100644 --- a/arch/arm64/kvm/iommu/s2mpu.c +++ b/arch/arm64/kvm/iommu/s2mpu.c @@ -81,6 +81,6 @@ int pkvm_iommu_s2mpu_register(struct device *dev, phys_addr_t addr) return ret; return pkvm_iommu_register(dev, PKVM_IOMMU_DRIVER_S2MPU, - addr, S2MPU_MMIO_SIZE); + addr, S2MPU_MMIO_SIZE, NULL); } EXPORT_SYMBOL_GPL(pkvm_iommu_s2mpu_register); From 5c2780d09c826b897a0f4538700e1e4a884abbd6 Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Tue, 15 Mar 2022 11:39:02 +0000 Subject: [PATCH 206/457] ANDROID: KVM: arm64: s2mpu: Create SysMMU_SYNC driver SysMMU_SYNC devices expose an interface to start a sync counter and poll its SFR until the device signals that all memory transactions in flight at the start have drained. This gives the hypervisor a reliable indicator that S2MPU invalidation has fully completed and all new transactions will use the new MPTs. Add a new pKVM IOMMU driver that the host can use to register SysMMU_SYNCs. Each device is expected to be a supplier to exactly one S2MPU (parent), but multiple SYNCs can supply a single S2MPU. To keep things simple, the SYNCs do not implement suspend/resume and are assumed to follow the power transitions of their parent. Following an invalidation, the S2MPU driver iterates over its children and waits for each SYNC to signal that its transactions have drained. The algorithm currently waits on each SYNC in turn. If latency proves to be an issue, this could be optimized to initiate a SYNC on all powered devices before starting to poll. 
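
For reference, the drain handshake described above can be sketched as follows. This is a minimal illustration, not the hypervisor driver: it assumes the SYNC_CMD/SYNC_COMP layout introduced by this patch and uses plain volatile accesses in place of writel_relaxed()/readl_relaxed().

#include <stdint.h>

#define REG_NS_SYNC_CMD     0x0        /* relative to the S2 window */
#define REG_NS_SYNC_COMP    0x4
#define SYNC_CMD_SYNC       (1u << 0)
#define SYNC_COMP_COMPLETE  (1u << 0)

/* Kick the sync counter, then spin until in-flight transactions drain. */
static void sysmmu_sync_drain(volatile uint32_t *sync)
{
        sync[REG_NS_SYNC_CMD / 4] = SYNC_CMD_SYNC;

        while (!(sync[REG_NS_SYNC_COMP / 4] & SYNC_COMP_COMPLETE))
                ;
}

After an S2MPU invalidation, the parent driver would run this loop once per registered child SYNC before returning to the host.
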
Bug: 190463801 Change-Id: I0006602bb5a683d39a6542b61b5ece13ebc28c3f Signed-off-by: David Brazdil (cherry picked from commit 57381d548d9de5382047ac9602da5487a2f78383) Signed-off-by: Mostafa Saleh Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_host.h | 3 ++ arch/arm64/include/asm/kvm_s2mpu.h | 11 ++++++ arch/arm64/kvm/hyp/include/nvhe/iommu.h | 1 + arch/arm64/kvm/hyp/nvhe/iommu.c | 2 ++ arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c | 45 +++++++++++++++++++++++++ arch/arm64/kvm/iommu/s2mpu.c | 34 +++++++++++++++++++ 6 files changed, 96 insertions(+) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index d45487aa01c3..bc268b7e8c62 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -381,6 +381,7 @@ extern u64 kvm_nvhe_sym(hyp_cpu_logical_map)[NR_CPUS]; enum pkvm_iommu_driver_id { PKVM_IOMMU_DRIVER_S2MPU, + PKVM_IOMMU_DRIVER_SYSMMU_SYNC, PKVM_IOMMU_NR_DRIVERS, }; @@ -396,6 +397,8 @@ int pkvm_iommu_suspend(struct device *dev); int pkvm_iommu_resume(struct device *dev); int pkvm_iommu_s2mpu_register(struct device *dev, phys_addr_t pa); +int pkvm_iommu_sysmmu_sync_register(struct device *dev, phys_addr_t pa, + struct device *parent); struct vcpu_reset_state { unsigned long pc; diff --git a/arch/arm64/include/asm/kvm_s2mpu.h b/arch/arm64/include/asm/kvm_s2mpu.h index 4d517bc1d0eb..0804ece03cfd 100644 --- a/arch/arm64/include/asm/kvm_s2mpu.h +++ b/arch/arm64/include/asm/kvm_s2mpu.h @@ -12,6 +12,10 @@ #include #define S2MPU_MMIO_SIZE SZ_64K +#define SYSMMU_SYNC_MMIO_SIZE SZ_64K +#define SYSMMU_SYNC_S2_OFFSET SZ_32K +#define SYSMMU_SYNC_S2_MMIO_SIZE (SYSMMU_SYNC_MMIO_SIZE - \ + SYSMMU_SYNC_S2_OFFSET) #define NR_VIDS 8 #define NR_CTX_IDS 8 @@ -128,6 +132,13 @@ static_assert(SMPT_GRAN <= PAGE_SIZE); #define SMPT_NUM_PAGES (SMPT_SIZE / PAGE_SIZE) #define SMPT_ORDER get_order(SMPT_SIZE) +/* SysMMU_SYNC registers, relative to SYSMMU_SYNC_S2_OFFSET. */ +#define REG_NS_SYNC_CMD 0x0 +#define REG_NS_SYNC_COMP 0x4 + +#define SYNC_CMD_SYNC BIT(0) +#define SYNC_COMP_COMPLETE BIT(0) + /* * Iterate over S2MPU gigabyte regions. Skip those that cannot be modified * (the MMIO registers are read only, with reset value MPT_PROT_NONE). diff --git a/arch/arm64/kvm/hyp/include/nvhe/iommu.h b/arch/arm64/kvm/hyp/include/nvhe/iommu.h index e9683d314938..07fe3db958c3 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/iommu.h +++ b/arch/arm64/kvm/hyp/include/nvhe/iommu.h @@ -92,5 +92,6 @@ void pkvm_iommu_host_stage2_idmap(phys_addr_t start, phys_addr_t end, enum kvm_pgtable_prot prot); extern const struct pkvm_iommu_ops pkvm_s2mpu_ops; +extern const struct pkvm_iommu_ops pkvm_sysmmu_sync_ops; #endif /* __ARM64_KVM_NVHE_IOMMU_H__ */ diff --git a/arch/arm64/kvm/hyp/nvhe/iommu.c b/arch/arm64/kvm/hyp/nvhe/iommu.c index bce13e28f1ce..0f9f6950bdce 100644 --- a/arch/arm64/kvm/hyp/nvhe/iommu.c +++ b/arch/arm64/kvm/hyp/nvhe/iommu.c @@ -68,6 +68,8 @@ static const struct pkvm_iommu_ops *get_driver_ops(enum pkvm_iommu_driver_id id) switch (id) { case PKVM_IOMMU_DRIVER_S2MPU: return IS_ENABLED(CONFIG_KVM_S2MPU) ? &pkvm_s2mpu_ops : NULL; + case PKVM_IOMMU_DRIVER_SYSMMU_SYNC: + return IS_ENABLED(CONFIG_KVM_S2MPU) ? &pkvm_sysmmu_sync_ops : NULL; default: return NULL; } diff --git a/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c b/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c index dd9c81f15898..01cd8a97fa15 100644 --- a/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c +++ b/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c @@ -28,6 +28,9 @@ (CONTEXT_CFG_VALID_VID_CTX_VID(ctxid, vid) \ | (((ctxid) < (nr_ctx)) ? 
CONTEXT_CFG_VALID_VID_CTX_VALID(ctxid) : 0)) +#define for_each_child(child, dev) \ + list_for_each_entry((child), &(dev)->children, siblings) + struct s2mpu_drv_data { u32 version; u32 context_cfg_valid_vid; @@ -155,6 +158,13 @@ static void __set_control_regs(struct pkvm_iommu *dev) writel_relaxed(ctrl0, dev->va + REG_NS_CTRL0); } +/* Poll the given SFR until its value has all bits of a given mask set. */ +static void __wait_until(void __iomem *addr, u32 mask) +{ + while ((readl_relaxed(addr) & mask) != mask) + continue; +} + /* Poll the given SFR as long as its value has all bits of a given mask set. */ static void __wait_while(void __iomem *addr, u32 mask) { @@ -164,6 +174,17 @@ static void __wait_while(void __iomem *addr, u32 mask) static void __wait_for_invalidation_complete(struct pkvm_iommu *dev) { + struct pkvm_iommu *sync; + + /* + * Wait for transactions to drain if SysMMU_SYNCs were registered. + * Assumes that they are in the same power domain as the S2MPU. + */ + for_each_child(sync, dev) { + writel_relaxed(SYNC_CMD_SYNC, sync->va + REG_NS_SYNC_CMD); + __wait_until(sync->va + REG_NS_SYNC_COMP, SYNC_COMP_COMPLETE); + } + /* Must not access SFRs while S2MPU is busy invalidating (v9 only). */ if (is_version(dev, S2MPU_VERSION_9)) { __wait_while(dev->va + REG_NS_STATUS, @@ -473,9 +494,29 @@ static int s2mpu_validate(struct pkvm_iommu *dev) return 0; } +static int s2mpu_validate_child(struct pkvm_iommu *dev, struct pkvm_iommu *child) +{ + if (child->ops != &pkvm_sysmmu_sync_ops) + return -EINVAL; + + return 0; +} + +static int sysmmu_sync_validate(struct pkvm_iommu *dev) +{ + if (dev->size != SYSMMU_SYNC_S2_MMIO_SIZE) + return -EINVAL; + + if (!dev->parent || dev->parent->ops != &pkvm_s2mpu_ops) + return -EINVAL; + + return 0; +} + const struct pkvm_iommu_ops pkvm_s2mpu_ops = (struct pkvm_iommu_ops){ .init = s2mpu_init, .validate = s2mpu_validate, + .validate_child = s2mpu_validate_child, .resume = s2mpu_resume, .suspend = s2mpu_suspend, .host_stage2_idmap_prepare = s2mpu_host_stage2_idmap_prepare, @@ -483,3 +524,7 @@ const struct pkvm_iommu_ops pkvm_s2mpu_ops = (struct pkvm_iommu_ops){ .host_dabt_handler = s2mpu_host_dabt_handler, .data_size = sizeof(struct s2mpu_drv_data), }; + +const struct pkvm_iommu_ops pkvm_sysmmu_sync_ops = (struct pkvm_iommu_ops){ + .validate = sysmmu_sync_validate, +}; diff --git a/arch/arm64/kvm/iommu/s2mpu.c b/arch/arm64/kvm/iommu/s2mpu.c index 7d989afde0fb..733451d74100 100644 --- a/arch/arm64/kvm/iommu/s2mpu.c +++ b/arch/arm64/kvm/iommu/s2mpu.c @@ -84,3 +84,37 @@ int pkvm_iommu_s2mpu_register(struct device *dev, phys_addr_t addr) addr, S2MPU_MMIO_SIZE, NULL); } EXPORT_SYMBOL_GPL(pkvm_iommu_s2mpu_register); + +static int init_sysmmu_sync_driver(void) +{ + static DEFINE_MUTEX(lock); + static bool init_done; + + int ret = 0; + + mutex_lock(&lock); + if (!init_done) { + ret = pkvm_iommu_driver_init(PKVM_IOMMU_DRIVER_SYSMMU_SYNC, NULL, 0); + init_done = !ret; + } + mutex_unlock(&lock); + return ret; +} + +int pkvm_iommu_sysmmu_sync_register(struct device *dev, phys_addr_t addr, + struct device *parent) +{ + int ret; + + if (!is_protected_kvm_enabled()) + return -ENODEV; + + ret = init_sysmmu_sync_driver(); + if (ret) + return ret; + + return pkvm_iommu_register(dev, PKVM_IOMMU_DRIVER_SYSMMU_SYNC, + addr + SYSMMU_SYNC_S2_OFFSET, + SYSMMU_SYNC_S2_MMIO_SIZE, parent); +} +EXPORT_SYMBOL_GPL(pkvm_iommu_sysmmu_sync_register); From 1e45bfa3e64da597e2e5573ab2b06641aaa1493e Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Tue, 29 Mar 2022 16:56:10 +0100 Subject: 
[PATCH 207/457] ANDROID: KVM: arm64: iommu: No powered check in DABT handler The IOMMU DABT handler currently checks if the device is considered powered by hyp before resolving the request. If the power tracking does not reflect reality, the IOMMU may trigger issues in the host but the incorrect state prevents it from diagnosing the issue. Drop the powered check from the generic IOMMU code. The host accessing the device's SFR means that it assumes it is powered, and individual drivers can choose to reject that DABT request. Bug: 224891559 Bug: 190463801 Change-Id: I3418a29c7deb7e3e866f89d933b9dad0aaa06365 Signed-off-by: David Brazdil (cherry picked from commit 798c4ea545e98e04797424c98c4329e6a34db60e) Signed-off-by: Mostafa Saleh Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/nvhe/iommu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hyp/nvhe/iommu.c b/arch/arm64/kvm/hyp/nvhe/iommu.c index 0f9f6950bdce..702af436277d 100644 --- a/arch/arm64/kvm/hyp/nvhe/iommu.c +++ b/arch/arm64/kvm/hyp/nvhe/iommu.c @@ -466,7 +466,8 @@ bool pkvm_iommu_host_dabt_handler(struct kvm_cpu_context *host_ctxt, u32 esr, if (pa < dev->pa || pa >= dev->pa + dev->size) continue; - if (!dev->powered || !dev->ops->host_dabt_handler || + /* No 'powered' check - the host assumes it is powered. */ + if (!dev->ops->host_dabt_handler || !dev->ops->host_dabt_handler(dev, host_ctxt, esr, pa - dev->pa)) return false; From a3aa8872df9c0a7f377a9a7c3db2b1c1a256bc16 Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Mon, 28 Mar 2022 11:02:34 +0100 Subject: [PATCH 208/457] ANDROID: KVM: arm64: iommu: Add pkvm_iommu_finalize Add new hypercall that the host can use to inform the hypervisor that all hypervisor-controlled IOMMUs have been registered and no new registrations should be allowed. This will typically be called at the end of kernel module initialization phase. Bug: 190463801 Change-Id: I124e6693a3ac0b1a81988640fcb9e09f08bf4ccf Signed-off-by: David Brazdil (cherry picked from commit 8fd93b0ef92202f16008d97ebcb92de367751491) Signed-off-by: Mostafa Saleh Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_asm.h | 1 + arch/arm64/include/asm/kvm_host.h | 2 + arch/arm64/kvm/hyp/include/nvhe/iommu.h | 1 + arch/arm64/kvm/hyp/nvhe/hyp-main.c | 6 ++ arch/arm64/kvm/hyp/nvhe/iommu.c | 94 +++++++++++++++++++------ arch/arm64/kvm/iommu.c | 6 ++ 6 files changed, 87 insertions(+), 23 deletions(-) diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index fd9950148ca1..81cd42bc0bc0 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -84,6 +84,7 @@ enum __kvm_host_smccc_func { __KVM_HOST_SMCCC_FUNC___pkvm_iommu_driver_init, __KVM_HOST_SMCCC_FUNC___pkvm_iommu_register, __KVM_HOST_SMCCC_FUNC___pkvm_iommu_pm_notify, + __KVM_HOST_SMCCC_FUNC___pkvm_iommu_finalize, }; #define DECLARE_KVM_VHE_SYM(sym) extern char sym[] diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index bc268b7e8c62..e93c4b2a4c1b 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -399,6 +399,8 @@ int pkvm_iommu_resume(struct device *dev); int pkvm_iommu_s2mpu_register(struct device *dev, phys_addr_t pa); int pkvm_iommu_sysmmu_sync_register(struct device *dev, phys_addr_t pa, struct device *parent); +/* Reject future calls to pkvm_iommu_driver_init() and pkvm_iommu_register(). 
*/ +int pkvm_iommu_finalize(void); struct vcpu_reset_state { unsigned long pc; diff --git a/arch/arm64/kvm/hyp/include/nvhe/iommu.h b/arch/arm64/kvm/hyp/include/nvhe/iommu.h index 07fe3db958c3..d794f705fb1a 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/iommu.h +++ b/arch/arm64/kvm/hyp/include/nvhe/iommu.h @@ -84,6 +84,7 @@ int __pkvm_iommu_register(unsigned long dev_id, void *kern_mem_va, size_t mem_size); int __pkvm_iommu_pm_notify(unsigned long dev_id, enum pkvm_iommu_pm_event event); +int __pkvm_iommu_finalize(void); int pkvm_iommu_host_stage2_adjust_range(phys_addr_t addr, phys_addr_t *start, phys_addr_t *end); bool pkvm_iommu_host_dabt_handler(struct kvm_cpu_context *host_ctxt, u32 esr, diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index b4b1936b690d..a65301e85f7c 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -1143,6 +1143,11 @@ static void handle___pkvm_iommu_pm_notify(struct kvm_cpu_context *host_ctxt) cpu_reg(host_ctxt, 1) = __pkvm_iommu_pm_notify(dev_id, event); } +static void handle___pkvm_iommu_finalize(struct kvm_cpu_context *host_ctxt) +{ + cpu_reg(host_ctxt, 1) = __pkvm_iommu_finalize(); +} + typedef void (*hcall_t)(struct kvm_cpu_context *); #define HANDLE_FUNC(x) [__KVM_HOST_SMCCC_FUNC_##x] = (hcall_t)handle_##x @@ -1180,6 +1185,7 @@ static const hcall_t host_hcall[] = { HANDLE_FUNC(__pkvm_iommu_driver_init), HANDLE_FUNC(__pkvm_iommu_register), HANDLE_FUNC(__pkvm_iommu_pm_notify), + HANDLE_FUNC(__pkvm_iommu_finalize), }; static void handle_host_hcall(struct kvm_cpu_context *host_ctxt) diff --git a/arch/arm64/kvm/hyp/nvhe/iommu.c b/arch/arm64/kvm/hyp/nvhe/iommu.c index 702af436277d..630b5e1a643a 100644 --- a/arch/arm64/kvm/hyp/nvhe/iommu.c +++ b/arch/arm64/kvm/hyp/nvhe/iommu.c @@ -31,6 +31,9 @@ static struct pkvm_iommu_driver iommu_drivers[PKVM_IOMMU_NR_DRIVERS]; /* IOMMU device list. Must only be accessed with host_mmu.lock held. */ static LIST_HEAD(iommu_list); +static bool iommu_finalized; +static DEFINE_HYP_SPINLOCK(iommu_registration_lock); + static void *iommu_mem_pool; static size_t iommu_mem_remaining; @@ -251,13 +254,24 @@ int __pkvm_iommu_driver_init(enum pkvm_iommu_driver_id id, void *data, size_t si data = kern_hyp_va(data); + /* New driver initialization not allowed after __pkvm_iommu_finalize(). */ + hyp_spin_lock(&iommu_registration_lock); + if (iommu_finalized) { + ret = -EPERM; + goto out_unlock; + } + drv = get_driver(id); ops = get_driver_ops(id); - if (!drv || !ops) - return -EINVAL; + if (!drv || !ops) { + ret = -EINVAL; + goto out_unlock; + } - if (!driver_acquire_init(drv)) - return -EBUSY; + if (!driver_acquire_init(drv)) { + ret = -EBUSY; + goto out_unlock; + } drv->ops = ops; @@ -269,7 +283,7 @@ int __pkvm_iommu_driver_init(enum pkvm_iommu_driver_id id, void *data, size_t si hyp_unpin_shared_mem(data, data + size); } if (ret) - goto out; + goto out_release; } /* @@ -282,9 +296,12 @@ int __pkvm_iommu_driver_init(enum pkvm_iommu_driver_id id, void *data, size_t si driver_release_init(drv, /*success=*/true); host_unlock_component(); -out: +out_release: if (ret) driver_release_init(drv, /*success=*/false); + +out_unlock: + hyp_spin_unlock(&iommu_registration_lock); return ret; } @@ -299,15 +316,28 @@ int __pkvm_iommu_register(unsigned long dev_id, void *mem_va = NULL; int ret = 0; + /* New device registration not allowed after __pkvm_iommu_finalize(). 
*/ + hyp_spin_lock(&iommu_registration_lock); + if (iommu_finalized) { + ret = -EPERM; + goto out_unlock; + } + drv = get_driver(drv_id); - if (!drv || !is_driver_ready(drv)) - return -ENOENT; + if (!drv || !is_driver_ready(drv)) { + ret = -ENOENT; + goto out_unlock; + } - if (!PAGE_ALIGNED(dev_pa) || !PAGE_ALIGNED(dev_size)) - return -EINVAL; + if (!PAGE_ALIGNED(dev_pa) || !PAGE_ALIGNED(dev_size)) { + ret = -EINVAL; + goto out_unlock; + } - if (!is_mmio_range(dev_pa, dev_size)) - return -EINVAL; + if (!is_mmio_range(dev_pa, dev_size)) { + ret = -EINVAL; + goto out_unlock; + } /* * Accept memory donation if the host is providing new memory. @@ -316,13 +346,15 @@ int __pkvm_iommu_register(unsigned long dev_id, if (kern_mem_va && mem_size) { mem_va = kern_hyp_va(kern_mem_va); - if (!PAGE_ALIGNED(mem_va) || !PAGE_ALIGNED(mem_size)) - return -EINVAL; + if (!PAGE_ALIGNED(mem_va) || !PAGE_ALIGNED(mem_size)) { + ret = -EINVAL; + goto out_unlock; + } ret = __pkvm_host_donate_hyp(hyp_virt_to_pfn(mem_va), mem_size >> PAGE_SHIFT); if (ret) - return ret; + goto out_unlock; } host_lock_component(); @@ -331,7 +363,7 @@ int __pkvm_iommu_register(unsigned long dev_id, dev = alloc_iommu(drv, mem_va, mem_size); if (!dev) { ret = -ENOMEM; - goto out; + goto out_free; } /* Populate the new device entry. */ @@ -345,27 +377,27 @@ int __pkvm_iommu_register(unsigned long dev_id, if (!validate_against_existing_iommus(dev)) { ret = -EBUSY; - goto out; + goto out_free; } if (parent_id) { dev->parent = find_iommu_by_id(parent_id); if (!dev->parent) { ret = -EINVAL; - goto out; + goto out_free; } if (dev->parent->ops->validate_child) { ret = dev->parent->ops->validate_child(dev->parent, dev); if (ret) - goto out; + goto out_free; } } if (dev->ops->validate) { ret = dev->ops->validate(dev); if (ret) - goto out; + goto out_free; } /* @@ -375,13 +407,13 @@ int __pkvm_iommu_register(unsigned long dev_id, */ ret = host_stage2_unmap_dev_locked(dev_pa, dev_size); if (ret) - goto out; + goto out_free; /* Create EL2 mapping for the device. */ ret = __pkvm_create_private_mapping(dev_pa, dev_size, PAGE_HYP_DEVICE, (unsigned long *)(&dev->va)); if (ret){ - goto out; + goto out_free; } /* Register device and prevent host from mapping the MMIO range. 
*/ @@ -389,10 +421,26 @@ int __pkvm_iommu_register(unsigned long dev_id, if (dev->parent) list_add_tail(&dev->siblings, &dev->parent->children); -out: +out_free: if (ret) free_iommu(drv, dev); host_unlock_component(); + +out_unlock: + hyp_spin_unlock(&iommu_registration_lock); + return ret; +} + +int __pkvm_iommu_finalize(void) +{ + int ret = 0; + + hyp_spin_lock(&iommu_registration_lock); + if (!iommu_finalized) + iommu_finalized = true; + else + ret = -EPERM; + hyp_spin_unlock(&iommu_registration_lock); return ret; } diff --git a/arch/arm64/kvm/iommu.c b/arch/arm64/kvm/iommu.c index 01176199e08f..6ca171327b28 100644 --- a/arch/arm64/kvm/iommu.c +++ b/arch/arm64/kvm/iommu.c @@ -55,3 +55,9 @@ int pkvm_iommu_resume(struct device *dev) PKVM_IOMMU_PM_RESUME); } EXPORT_SYMBOL_GPL(pkvm_iommu_resume); + +int pkvm_iommu_finalize(void) +{ + return kvm_call_hyp_nvhe(__pkvm_iommu_finalize); +} +EXPORT_SYMBOL_GPL(pkvm_iommu_finalize); From 8fda1979fd77996ec048d18764a0c62aa5b76081 Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Wed, 6 Apr 2022 16:19:58 +0100 Subject: [PATCH 209/457] ANDROID: KVM: arm64: iommu: Fix upper bound of PT walk The second argument of the kvm_pgtable_walker callback was misinterpreted as the end of the current entry, where in fact it is the end of the walked memory region. Fix this by computing the end of the current entry from the start and the level. This did not affect correctness, as the code iterates linarly over the entire address space, but it did affect boot time. Bug: 190463801 Bug: 218012133 Signed-off-by: David Brazdil Change-Id: I6d189b87645f47cd215a783c1bc9e1f032ff8c62 (cherry picked from commit 58f8121600276e32e0c91d075f273b836360b27a) Signed-off-by: Mostafa Saleh Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/nvhe/iommu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hyp/nvhe/iommu.c b/arch/arm64/kvm/hyp/nvhe/iommu.c index 630b5e1a643a..3a56ea453745 100644 --- a/arch/arm64/kvm/hyp/nvhe/iommu.c +++ b/arch/arm64/kvm/hyp/nvhe/iommu.c @@ -174,12 +174,13 @@ static bool is_mmio_range(phys_addr_t base, size_t size) return true; } -static int __snapshot_host_stage2(u64 start, u64 end, u32 level, +static int __snapshot_host_stage2(u64 start, u64 pa_max, u32 level, kvm_pte_t *ptep, enum kvm_pgtable_walk_flags flags, void * const arg) { struct pkvm_iommu_driver * const drv = arg; + u64 end = start + kvm_granule_size(level); enum kvm_pgtable_prot prot; kvm_pte_t pte = *ptep; From c87a42ddbf893d14b83a70213c1f1e17eedfcfc0 Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Wed, 6 Apr 2022 16:19:58 +0100 Subject: [PATCH 210/457] ANDROID: KVM: arm64: iommu: Optimize snapshot_host_stage2 Currently the generic IOMMU code lets the driver initialize its PT and then invokes callbacks to set the permissions across the entire PA range. Optimize this by making it a requirement on the driver to initialize its PTs to all memory owned by the host. snapshot_host_stage2 then only calls the driver's callback for memory regions not owned by the host. 
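
The resulting walker predicate is small enough to sketch on its own. The fragment below is only an illustration of the ownership test; the pte encoding and the prepare hook are stand-ins for the hypervisor's kvm_pte_valid() and the driver callback.

#include <stdbool.h>
#include <stdint.h>

typedef uint64_t kvm_pte_t;

/* Stand-in: bit 0 marks a valid (host-mapped) entry in this sketch. */
static bool pte_valid(kvm_pte_t pte)
{
        return pte & 1;
}

static void prepare_idmap(uint64_t start, uint64_t end, unsigned int prot)
{
        /* Driver callback: apply 'prot' to the identity range [start, end). */
        (void)start; (void)end; (void)prot;
}

static void snapshot_entry(uint64_t start, uint64_t granule, kvm_pte_t pte)
{
        /*
         * Empty entries are host memory whose valid mapping is created
         * lazily, and valid entries already map host memory; only a
         * non-empty, invalid entry marks pages taken away from the host,
         * so that is the only case the driver needs to hear about.
         */
        if (pte && !pte_valid(pte))
                prepare_idmap(start, start + granule, /* prot */ 0);
}
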
Bug: 190463801 Bug: 218012133 Change-Id: I99a826d921d494269078c3a84d90323455a0b769 Signed-off-by: David Brazdil (cherry picked from commit 4e56697b422df13df9f25074f6d7710acd784394) Signed-off-by: Mostafa Saleh Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/include/nvhe/iommu.h | 2 ++ arch/arm64/kvm/hyp/nvhe/iommu.c | 8 ++++---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/arm64/kvm/hyp/include/nvhe/iommu.h b/arch/arm64/kvm/hyp/include/nvhe/iommu.h index d794f705fb1a..69bce1f01717 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/iommu.h +++ b/arch/arm64/kvm/hyp/include/nvhe/iommu.h @@ -15,6 +15,8 @@ struct pkvm_iommu_ops { * Driver-specific arguments are passed in a buffer shared by the host. * The buffer memory has been pinned in EL2 but host retains R/W access. * Extra care must be taken when reading from it to avoid TOCTOU bugs. + * If the driver maintains its own page tables, it is expected to + * initialize them to all memory owned by the host. * Driver initialization lock held during callback. */ int (*init)(void *data, size_t size); diff --git a/arch/arm64/kvm/hyp/nvhe/iommu.c b/arch/arm64/kvm/hyp/nvhe/iommu.c index 3a56ea453745..d72ea5602eaf 100644 --- a/arch/arm64/kvm/hyp/nvhe/iommu.c +++ b/arch/arm64/kvm/hyp/nvhe/iommu.c @@ -181,16 +181,16 @@ static int __snapshot_host_stage2(u64 start, u64 pa_max, u32 level, { struct pkvm_iommu_driver * const drv = arg; u64 end = start + kvm_granule_size(level); - enum kvm_pgtable_prot prot; kvm_pte_t pte = *ptep; /* * Valid stage-2 entries are created lazily, invalid ones eagerly. * Note: In the future we may need to check if [start,end) is MMIO. + * Note: Drivers initialize their PTs to all memory owned by the host, + * so we only call the driver on regions where that is not the case. */ - prot = (!pte || kvm_pte_valid(pte)) ? PKVM_HOST_MEM_PROT : 0; - - drv->ops->host_stage2_idmap_prepare(start, end, prot); + if (pte && !kvm_pte_valid(pte)) + drv->ops->host_stage2_idmap_prepare(start, end, /*prot*/ 0); return 0; } From 6997a1047311d15f8a322bfed23f85822b55ef5b Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Wed, 6 Apr 2022 16:42:36 +0100 Subject: [PATCH 211/457] ANDROID: KVM: arm64: s2mpu: Initialize MPTs to PROT_RW Change the permissions that MPTs are initialized with from PROT_NONE to PROT_RW. No functional change intended as the generic IOMMU code sets permissions for the entire address space later. This will allow to optimize boot time by only unmapping pages not available to host. 
Bug: 190463801 Bug: 218012133 Change-Id: Id3891f9de3afe88a0008e8488b3172b73fa1bb69 Signed-off-by: David Brazdil (cherry picked from commit 174ac5b7c56fe039fc04ab70073223dd1780e0de) Signed-off-by: Mostafa Saleh Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c b/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c index 01cd8a97fa15..508f5fd5ad66 100644 --- a/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c +++ b/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c @@ -466,7 +466,7 @@ static int s2mpu_init(void *data, size_t size) host_mpt.fmpt[gb] = (struct fmpt){ .smpt = smpt, .gran_1g = true, - .prot = MPT_PROT_NONE, + .prot = MPT_PROT_RW, }; } From 9c859fbed20fdc921d8cbe7a18833f9d6220c250 Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Wed, 20 Apr 2022 14:42:50 +0100 Subject: [PATCH 212/457] ANDROID: KVM: arm64: s2mpu: Extract L1ENTRY_* consts Extract the L1ENTRY_ATTR_{PRON,GRAN}_MASK constants out of macros that create the corresponding constants. This will allow EL1 users to use the masks to get the fields out of register values. Also extract L1ENTRY_L2TABLE_ADDR_SHIFT for adjusting the L2 table address. Bug: 190463801 Change-Id: I5ad6657c7b12d10b81217ee80583eade1e17aeb8 Signed-off-by: David Brazdil (cherry picked from commit c43dfe89feb794ef85a90f3e3fbf0fe6965fefb2) Signed-off-by: Mostafa Saleh Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_s2mpu.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/kvm_s2mpu.h b/arch/arm64/include/asm/kvm_s2mpu.h index 0804ece03cfd..d60790029e1f 100644 --- a/arch/arm64/include/asm/kvm_s2mpu.h +++ b/arch/arm64/include/asm/kvm_s2mpu.h @@ -94,14 +94,17 @@ #define FAULT_INFO_LEN_MASK GENMASK(19, 16) #define FAULT_INFO_ID_MASK GENMASK(15, 0) -#define L1ENTRY_L2TABLE_ADDR(pa) ((pa) >> 4) +#define L1ENTRY_L2TABLE_ADDR_SHIFT 4 +#define L1ENTRY_L2TABLE_ADDR(pa) ((pa) >> L1ENTRY_L2TABLE_ADDR_SHIFT) #define L1ENTRY_ATTR_L2TABLE_EN BIT(0) #define L1ENTRY_ATTR_GRAN_4K 0x0 #define L1ENTRY_ATTR_GRAN_64K 0x1 #define L1ENTRY_ATTR_GRAN_2M 0x2 -#define L1ENTRY_ATTR_PROT(prot) FIELD_PREP(GENMASK(2, 1), prot) -#define L1ENTRY_ATTR_GRAN(gran) FIELD_PREP(GENMASK(5, 4), gran) +#define L1ENTRY_ATTR_PROT_MASK GENMASK(2, 1) +#define L1ENTRY_ATTR_GRAN_MASK GENMASK(5, 4) +#define L1ENTRY_ATTR_PROT(prot) FIELD_PREP(L1ENTRY_ATTR_PROT_MASK, prot) +#define L1ENTRY_ATTR_GRAN(gran) FIELD_PREP(L1ENTRY_ATTR_GRAN_MASK, gran) #define L1ENTRY_ATTR_1G(prot) L1ENTRY_ATTR_PROT(prot) #define L1ENTRY_ATTR_L2(gran) (L1ENTRY_ATTR_GRAN(gran) | \ L1ENTRY_ATTR_L2TABLE_EN) From 627a64b5ad0406c4a7187f0cb9294ba69a5e0bed Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Thu, 21 Apr 2022 11:00:33 +0100 Subject: [PATCH 213/457] ANDROID: KVM: arm64: s2mpu: Refactor DABT handler In preparation for adding more entries to the list of S2MPU registers accessible to the host, refactor the code to use a switch instead of a series of ifs. No functional change intended. 
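
The shape of the resulting helper is roughly the following. This is an illustration only, not the hypervisor function: the two offsets are the ones from kvm_s2mpu.h, ALL_VIDS_BITMAP is assumed to be the low eight bits (NR_VIDS is 8), and the real handler keeps matching fault-register ranges after the switch instead of returning "no access".

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#define REG_NS_INTERRUPT_CLEAR  0x2c
#define REG_NS_FAULT_STATUS     0x2000
#define ALL_VIDS_BITMAP         0xffu   /* assumed: one bit per VID */

static uint32_t reg_access_mask(size_t off, bool is_write)
{
        const uint32_t no_access  = 0;
        const uint32_t read_write = ~0u;
        const uint32_t read_only  = is_write ? no_access : read_write;
        const uint32_t write_only = is_write ? read_write : no_access;

        switch (off) {
        /* IRQ handler may clear pending interrupts. */
        case REG_NS_INTERRUPT_CLEAR:
                return write_only & ALL_VIDS_BITMAP;
        /* IRQ handler may read the bitmap of pending interrupts. */
        case REG_NS_FAULT_STATUS:
                return read_only & ALL_VIDS_BITMAP;
        }

        /* The real handler goes on to match fault-info registers here. */
        return no_access;
}

Each new host-visible register then becomes one more case label, which is what the following patches rely on.
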
Bug: 190463801 Change-Id: I9075a7f080b3391c98fa10f26a943bce0b183394 Signed-off-by: David Brazdil (cherry picked from commit 96767ad7bec869c1aee13f0c9680b2ed3d895db9) Signed-off-by: Mostafa Saleh Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c b/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c index 508f5fd5ad66..8ff0daeae176 100644 --- a/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c +++ b/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c @@ -393,15 +393,16 @@ static u32 host_mmio_reg_access_mask(size_t off, bool is_write) const u32 write_only = is_write ? read_write : no_access; u32 masked_off; - /* IRQ handler can clear interrupts. */ - if (off == REG_NS_INTERRUPT_CLEAR) + switch (off) { + /* Allow EL1 IRQ handler to clear interrupts. */ + case REG_NS_INTERRUPT_CLEAR: return write_only & ALL_VIDS_BITMAP; - - /* IRQ handler can read bitmap of pending interrupts. */ - if (off == REG_NS_FAULT_STATUS) + /* Allow EL1 IRQ handler to read bitmap of pending interrupts. */ + case REG_NS_FAULT_STATUS: return read_only & ALL_VIDS_BITMAP; + } - /* IRQ handler can read fault information. */ + /* Allow EL1 IRQ handler to read fault information. */ masked_off = off & ~REG_NS_FAULT_VID_MASK; if ((masked_off == REG_NS_FAULT_PA_LOW(0)) || (masked_off == REG_NS_FAULT_PA_HIGH(0)) || From bc018bea9bcc24edb0e795a6d8b0858d530a8460 Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Wed, 20 Apr 2022 14:43:46 +0100 Subject: [PATCH 214/457] ANDROID: KVM: arm64: s2mpu: Allow L1ENTRY_* r/o access Allow read-only access to L1ENTRY_ATTR and L1ENTRY_L2TABLE S2MPU registers. This allows the host to dump the register state for debugging purposes. It is safe because the state of the S2MPU is known to the host anyway. Bug: 190463801 Change-Id: I44b3633dbad3c122ce521c37813dbf2ae690a678 Signed-off-by: David Brazdil (cherry picked from commit e56d9603a633e7320ea1973ec0c4474b4cb7392c) Signed-off-by: Mostafa Saleh Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c b/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c index 8ff0daeae176..89967f2fefcd 100644 --- a/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c +++ b/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c @@ -402,6 +402,11 @@ static u32 host_mmio_reg_access_mask(size_t off, bool is_write) return read_only & ALL_VIDS_BITMAP; } + /* Allow reading L1ENTRY registers for debugging. */ + if (off >= REG_NS_L1ENTRY_L2TABLE_ADDR(0, 0) && + off < REG_NS_L1ENTRY_ATTR(NR_VIDS, 0)) + return read_only; + /* Allow EL1 IRQ handler to read fault information. */ masked_off = off & ~REG_NS_FAULT_VID_MASK; if ((masked_off == REG_NS_FAULT_PA_LOW(0)) || From 144f77f652a54500fd9223c8c0fec236b0f7c9a2 Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Wed, 20 Apr 2022 15:48:16 +0100 Subject: [PATCH 215/457] ANDROID: KVM: arm64: s2mpu: Allow reading MPTC entries The state of the S2MPU does not need to be kept secret from the host as it merely reflects the permissions that the host has and knows about. To make debugging DMA issues easier, allow the host to query entries from the MPTC cache. This involves writing the set and way IDs of the query to the READ_MPTC register and then reading the MPTC entry information from READ_MPTC_TAG_PPN/TAG_OTHERS/DATA. Modify the S2MPU DABT handler to allow this register access pattern. 
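
The host-side sequence this enables can be sketched as below. It is illustrative only: the offsets and field positions come from the constants added in this patch, and mmio_read32()/mmio_write32() stand in for readl_relaxed()/writel_relaxed() on the mapped S2MPU.

#include <stdint.h>
#include <stdio.h>

#define REG_NS_READ_MPTC             0x3000
#define REG_NS_READ_MPTC_TAG_PPN     0x3004
#define REG_NS_READ_MPTC_TAG_OTHERS  0x3008
#define REG_NS_READ_MPTC_DATA        0x3010

#define READ_MPTC_SET(set)  ((uint32_t)(set) & 0xffffu)        /* bits 15:0 */
#define READ_MPTC_WAY(way)  (((uint32_t)(way) & 0x7u) << 16)   /* bits 18:16 */

static void mmio_write32(volatile uint32_t *base, uint32_t off, uint32_t val)
{
        base[off / 4] = val;
}

static uint32_t mmio_read32(volatile uint32_t *base, uint32_t off)
{
        return base[off / 4];
}

static void dump_mptc_entry(volatile uint32_t *s2mpu, unsigned int set, unsigned int way)
{
        /* Select the (set, way) to query... */
        mmio_write32(s2mpu, REG_NS_READ_MPTC,
                     READ_MPTC_SET(set) | READ_MPTC_WAY(way));

        /* ...then read back the cached tag and data. */
        printf("mptc[%u][%u]: ppn=%#x others=%#x data=%#x\n", set, way,
               mmio_read32(s2mpu, REG_NS_READ_MPTC_TAG_PPN),
               mmio_read32(s2mpu, REG_NS_READ_MPTC_TAG_OTHERS),
               mmio_read32(s2mpu, REG_NS_READ_MPTC_DATA));
}
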
Bug: 190463801 Bug: 229793579 Change-Id: I5661c09a284a4796e18b346f9e294ba64d7d8c0c Signed-off-by: David Brazdil (cherry picked from commit d5c0f0f937af0826e549503897fb9e9c9b22722a) Signed-off-by: Mostafa Saleh Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_s2mpu.h | 21 +++++++++++++++++++++ arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c | 16 ++++++++++++++++ 2 files changed, 37 insertions(+) diff --git a/arch/arm64/include/asm/kvm_s2mpu.h b/arch/arm64/include/asm/kvm_s2mpu.h index d60790029e1f..d8c552da273f 100644 --- a/arch/arm64/include/asm/kvm_s2mpu.h +++ b/arch/arm64/include/asm/kvm_s2mpu.h @@ -28,6 +28,7 @@ #define REG_NS_INTERRUPT_ENABLE_PER_VID_SET 0x20 #define REG_NS_INTERRUPT_CLEAR 0x2c #define REG_NS_VERSION 0x60 +#define REG_NS_INFO 0x64 #define REG_NS_STATUS 0x68 #define REG_NS_NUM_CONTEXT 0x100 #define REG_NS_CONTEXT_CFG_VALID_VID 0x104 @@ -39,6 +40,10 @@ #define REG_NS_FAULT_PA_LOW(vid) (0x2004 + ((vid) * 0x20)) #define REG_NS_FAULT_PA_HIGH(vid) (0x2008 + ((vid) * 0x20)) #define REG_NS_FAULT_INFO(vid) (0x2010 + ((vid) * 0x20)) +#define REG_NS_READ_MPTC 0x3000 +#define REG_NS_READ_MPTC_TAG_PPN 0x3004 +#define REG_NS_READ_MPTC_TAG_OTHERS 0x3008 +#define REG_NS_READ_MPTC_DATA 0x3010 #define REG_NS_L1ENTRY_L2TABLE_ADDR(vid, gb) (0x4000 + ((vid) * 0x200) + ((gb) * 0x8)) #define REG_NS_L1ENTRY_ATTR(vid, gb) (0x4004 + ((vid) * 0x200) + ((gb) * 0x8)) @@ -72,6 +77,8 @@ VERSION_MINOR_ARCH_VER_MASK | \ VERSION_REV_ARCH_VER_MASK) +#define INFO_NUM_SET_MASK GENMASK(15, 0) + #define STATUS_BUSY BIT(0) #define STATUS_ON_INVALIDATING BIT(1) @@ -97,6 +104,20 @@ #define L1ENTRY_L2TABLE_ADDR_SHIFT 4 #define L1ENTRY_L2TABLE_ADDR(pa) ((pa) >> L1ENTRY_L2TABLE_ADDR_SHIFT) +#define READ_MPTC_WAY_MASK GENMASK(18, 16) +#define READ_MPTC_SET_MASK GENMASK(15, 0) +#define READ_MPTC_MASK (READ_MPTC_WAY_MASK | READ_MPTC_SET_MASK) +#define READ_MPTC_WAY(way) FIELD_PREP(READ_MPTC_WAY_MASK, (way)) +#define READ_MPTC_SET(set) FIELD_PREP(READ_MPTC_SET_MASK, (set)) +#define READ_MPTC(set, way) (READ_MPTC_SET(set) | READ_MPTC_WAY(way)) +#define READ_MPTC_TAG_PPN_MASK GENMASK(23, 0) +#define READ_MPTC_TAG_OTHERS_VID_MASK GENMASK(10, 8) +#define READ_MPTC_TAG_OTHERS_GRAN_MASK GENMASK(5, 4) +#define READ_MPTC_TAG_OTHERS_VALID_BIT BIT(0) +#define READ_MPTC_TAG_OTHERS_MASK (READ_MPTC_TAG_OTHERS_VID_MASK | \ + READ_MPTC_TAG_OTHERS_GRAN_MASK | \ + READ_MPTC_TAG_OTHERS_VALID_BIT) + #define L1ENTRY_ATTR_L2TABLE_EN BIT(0) #define L1ENTRY_ATTR_GRAN_4K 0x0 #define L1ENTRY_ATTR_GRAN_64K 0x1 diff --git a/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c b/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c index 89967f2fefcd..50f842a3b0de 100644 --- a/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c +++ b/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c @@ -397,9 +397,25 @@ static u32 host_mmio_reg_access_mask(size_t off, bool is_write) /* Allow EL1 IRQ handler to clear interrupts. */ case REG_NS_INTERRUPT_CLEAR: return write_only & ALL_VIDS_BITMAP; + /* Allow reading number of sets used by MPTC. */ + case REG_NS_INFO: + return read_only & INFO_NUM_SET_MASK; /* Allow EL1 IRQ handler to read bitmap of pending interrupts. */ case REG_NS_FAULT_STATUS: return read_only & ALL_VIDS_BITMAP; + /* + * Allow reading MPTC entries for debugging. 
That involves: + * - writing (set,way) to READ_MPTC + * - reading READ_MPTC_* + */ + case REG_NS_READ_MPTC: + return write_only & READ_MPTC_MASK; + case REG_NS_READ_MPTC_TAG_PPN: + return read_only & READ_MPTC_TAG_PPN_MASK; + case REG_NS_READ_MPTC_TAG_OTHERS: + return read_only & READ_MPTC_TAG_OTHERS_MASK; + case REG_NS_READ_MPTC_DATA: + return read_only; } /* Allow reading L1ENTRY registers for debugging. */ From 786cfb3bde81a961f7c16a81c65729cd57d8c716 Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Thu, 21 Apr 2022 10:56:36 +0100 Subject: [PATCH 216/457] ANDROID: KVM: arm64: s2mpu: Allow r/o access to control regs To ease debugging, allow the host to read the state of S2MPU's control registers. These values do not need to be kept secret from the host. Bug: 190463801 Change-Id: I5df7de81caceaddfaf2bfb0948ff2d167de487c3 Signed-off-by: David Brazdil (cherry picked from commit 5d6831add7c1bb441fe410481d055e2b1c486684) Signed-off-by: Mostafa Saleh Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_s2mpu.h | 15 +++++++++++++++ arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c | 7 +++++++ 2 files changed, 22 insertions(+) diff --git a/arch/arm64/include/asm/kvm_s2mpu.h b/arch/arm64/include/asm/kvm_s2mpu.h index d8c552da273f..b1075abd604c 100644 --- a/arch/arm64/include/asm/kvm_s2mpu.h +++ b/arch/arm64/include/asm/kvm_s2mpu.h @@ -51,15 +51,30 @@ #define CTRL0_INTERRUPT_ENABLE BIT(1) #define CTRL0_FAULT_RESP_TYPE_SLVERR BIT(2) /* for v8 */ #define CTRL0_FAULT_RESP_TYPE_DECERR BIT(2) /* for v9 */ +#define CTRL0_MASK (CTRL0_ENABLE | \ + CTRL0_INTERRUPT_ENABLE | \ + CTRL0_FAULT_RESP_TYPE_SLVERR | \ + CTRL0_FAULT_RESP_TYPE_DECERR) #define CTRL1_DISABLE_CHK_S1L1PTW BIT(0) #define CTRL1_DISABLE_CHK_S1L2PTW BIT(1) #define CTRL1_ENABLE_PAGE_SIZE_AWARENESS BIT(2) #define CTRL1_DISABLE_CHK_USER_MATCHED_REQ BIT(3) +#define CTRL1_MASK (CTRL1_DISABLE_CHK_S1L1PTW | \ + CTRL1_DISABLE_CHK_S1L2PTW | \ + CTRL1_ENABLE_PAGE_SIZE_AWARENESS | \ + CTRL1_DISABLE_CHK_USER_MATCHED_REQ) #define CFG_MPTW_CACHE_OVERRIDE BIT(0) +#define CFG_MPTW_CACHE_VALUE GENMASK(7, 4) #define CFG_MPTW_QOS_OVERRIDE BIT(8) +#define CFG_MPTW_QOS_VALUE GENMASK(15, 12) #define CFG_MPTW_SHAREABLE BIT(16) +#define CFG_MASK (CFG_MPTW_CACHE_OVERRIDE | \ + CFG_MPTW_CACHE_VALUE | \ + CFG_MPTW_QOS_OVERRIDE | \ + CFG_MPTW_QOS_VALUE | \ + CFG_MPTW_SHAREABLE) /* For use with hi_lo_readq_relaxed(). */ #define REG_NS_FAULT_PA_HIGH_LOW(vid) REG_NS_FAULT_PA_LOW(vid) diff --git a/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c b/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c index 50f842a3b0de..ff5d7d1044e5 100644 --- a/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c +++ b/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c @@ -394,6 +394,13 @@ static u32 host_mmio_reg_access_mask(size_t off, bool is_write) u32 masked_off; switch (off) { + /* Allow reading control registers for debugging. */ + case REG_NS_CTRL0: + return read_only & CTRL0_MASK; + case REG_NS_CTRL1: + return read_only & CTRL1_MASK; + case REG_NS_CFG: + return read_only & CFG_MASK; /* Allow EL1 IRQ handler to clear interrupts. */ case REG_NS_INTERRUPT_CLEAR: return write_only & ALL_VIDS_BITMAP; From d5eb62c94926b72599431ff24458cd0b0d14f101 Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Thu, 13 Oct 2022 14:20:53 +0100 Subject: [PATCH 217/457] ANDROID: KVM: arm64: s2mpu: Add SysMMU_SYNC timeout The SysMMU_SYNC provides an invalidation-complete signal to the hypervisor. Currently the hypervisor will wait indefinitely for the SYNC to set the SYNC_COMP_COMPLETE bit. 
In practice, this can deadlock as the hypervisor holds the host lock while waiting for the SYNC. To avoid deadlock, adjust the algorithm to time out after a given number of reads of the SYNC_COMP register (new constant SYNC_TIMEOUT). This can be a small number as most attempts succeed after a single read of the SFR. If the wait-loop times out, the hypervisor will try again, multiplying the maximum number of SFR reads by SYNC_TIMEOUT_MULTIPLIER each time. This number was selected to grow quickly, in case there is a lot of DMA traffic that would be slowing down the SYNC request. Finally, if the hardware does not set the bit even after SYNC_MAX_RETRIES, the algorithm will give up to avoid deadlock. The value was selected so that the worst-case time spent in __wait_for_invalidation_complete() remains tolerable. Bug: 250727777 Change-Id: I00230e6dd71de12bac223e4fe118806bdc3872f4 Signed-off-by: David Brazdil (cherry picked from commit 992b5f98cad8a8bd0da106a113c25429aeb183cd) Signed-off-by: Mostafa Saleh Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c | 38 ++++++++++++++++++++++----- 1 file changed, 32 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c b/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c index ff5d7d1044e5..20c65f7489ed 100644 --- a/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c +++ b/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c @@ -24,6 +24,10 @@ #define PA_MAX ((phys_addr_t)SZ_1G * NR_GIGABYTES) +#define SYNC_MAX_RETRIES 5 +#define SYNC_TIMEOUT 5 +#define SYNC_TIMEOUT_MULTIPLIER 3 + #define CTX_CFG_ENTRY(ctxid, nr_ctx, vid) \ (CONTEXT_CFG_VALID_VID_CTX_VID(ctxid, vid) \ | (((ctxid) < (nr_ctx)) ? CONTEXT_CFG_VALID_VID_CTX_VALID(ctxid) : 0)) @@ -158,11 +162,20 @@ static void __set_control_regs(struct pkvm_iommu *dev) writel_relaxed(ctrl0, dev->va + REG_NS_CTRL0); } -/* Poll the given SFR until its value has all bits of a given mask set. */ -static void __wait_until(void __iomem *addr, u32 mask) +/* + * Poll the given SFR until its value has all bits of a given mask set. + * Returns true if successful, false if not successful after a given number of + * attempts. + */ +static bool __wait_until(void __iomem *addr, u32 mask, size_t max_attempts) { - while ((readl_relaxed(addr) & mask) != mask) - continue; + size_t i; + + for (i = 0; i < max_attempts; i++) { + if ((readl_relaxed(addr) & mask) == mask) + return true; + } + return false; } /* Poll the given SFR as long as its value has all bits of a given mask set. */ @@ -175,14 +188,27 @@ static void __wait_while(void __iomem *addr, u32 mask) static void __wait_for_invalidation_complete(struct pkvm_iommu *dev) { struct pkvm_iommu *sync; + size_t i, timeout; /* * Wait for transactions to drain if SysMMU_SYNCs were registered. * Assumes that they are in the same power domain as the S2MPU. + * + * The algorithm will try initiating the SYNC if the SYNC_COMP_COMPLETE + * bit has not been set after a given number of attempts, increasing the + * timeout exponentially each time. If this cycle fails a given number + * of times, the algorithm will give up completely to avoid deadlock. 
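+ * + * As an illustration of the constants above: with SYNC_TIMEOUT = 5, + * SYNC_TIMEOUT_MULTIPLIER = 3 and SYNC_MAX_RETRIES = 5, the per-attempt read + * budget grows as 5, 15, 45, 135, 405 reads of SYNC_COMP, i.e. at most 605 + * reads per SysMMU_SYNC before the loop gives up.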
*/ for_each_child(sync, dev) { - writel_relaxed(SYNC_CMD_SYNC, sync->va + REG_NS_SYNC_CMD); - __wait_until(sync->va + REG_NS_SYNC_COMP, SYNC_COMP_COMPLETE); + timeout = SYNC_TIMEOUT; + for (i = 0; i < SYNC_MAX_RETRIES; i++) { + writel_relaxed(SYNC_CMD_SYNC, sync->va + REG_NS_SYNC_CMD); + if (__wait_until(sync->va + REG_NS_SYNC_COMP, + SYNC_COMP_COMPLETE, timeout)) { + break; + } + timeout *= SYNC_TIMEOUT_MULTIPLIER; + } } /* Must not access SFRs while S2MPU is busy invalidating (v9 only). */ From 4dcd45d0cb7d8957c9c90590cc97e49c61f2559c Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Tue, 7 Jun 2022 12:00:07 +0000 Subject: [PATCH 218/457] ANDROID: KVM: arm64: Don't update IOMMUs unnecessarily When handling host stage-2 faults the hypervisor currently updates the CPU _and_ IOMMUs page-tables. However, since we currently proactively map accessible PA ranges into IOMMUs, updating them during stage-2 faults is unnecessary -- it only needs to be done during ownership transitions. Optimize this by skipping the IOMMU updates from the host memory abort path, which also reduces contention on the host stage-2 lock during boot and saves up to 1.1 sec of boot time on Pixel 6. Bug: 232879742 Change-Id: Icb6f1b2d10de8cc2f976bbc94b9b79ad7aaa6135 Signed-off-by: Quentin Perret (cherry picked from commit 20c6e1ba55403fad0e7f039cc5d7e6f8c0fffa50) Signed-off-by: Mostafa Saleh Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/include/nvhe/mem_protect.h | 3 ++- arch/arm64/kvm/hyp/nvhe/mem_protect.c | 14 ++++++++------ arch/arm64/kvm/hyp/nvhe/setup.c | 2 +- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h index a42090ab120c..18109fc7cc4d 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h @@ -82,7 +82,8 @@ int __pkvm_host_share_ffa(u64 pfn, u64 nr_pages); int __pkvm_host_unshare_ffa(u64 pfn, u64 nr_pages); bool addr_is_memory(phys_addr_t phys); -int host_stage2_idmap_locked(phys_addr_t addr, u64 size, enum kvm_pgtable_prot prot); +int host_stage2_idmap_locked(phys_addr_t addr, u64 size, enum kvm_pgtable_prot prot, + bool update_iommu); int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, u8 owner_id); int host_stage2_unmap_dev_locked(phys_addr_t start, u64 size); int kvm_host_prepare_stage2(void *pgt_pool_base); diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 0c3db72312ff..5c0aff3e7694 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -513,7 +513,8 @@ static bool range_is_memory(u64 start, u64 end) } static inline int __host_stage2_idmap(u64 start, u64 end, - enum kvm_pgtable_prot prot) + enum kvm_pgtable_prot prot, + bool update_iommu) { int ret; @@ -522,7 +523,8 @@ static inline int __host_stage2_idmap(u64 start, u64 end, if (ret) return ret; - pkvm_iommu_host_stage2_idmap(start, end, prot); + if (update_iommu) + pkvm_iommu_host_stage2_idmap(start, end, prot); return 0; } @@ -584,9 +586,9 @@ static int host_stage2_adjust_range(u64 addr, struct kvm_mem_range *range) } int host_stage2_idmap_locked(phys_addr_t addr, u64 size, - enum kvm_pgtable_prot prot) + enum kvm_pgtable_prot prot, bool update_iommu) { - return host_stage2_try(__host_stage2_idmap, addr, addr + size, prot); + return host_stage2_try(__host_stage2_idmap, addr, addr + size, prot, update_iommu); } #define KVM_INVALID_PTE_OWNER_MASK GENMASK(9, 2) @@ -664,7 +666,7 @@ static 
int host_stage2_idmap(u64 addr) if (ret) return ret; - return host_stage2_idmap_locked(range.start, range.end - range.start, prot); + return host_stage2_idmap_locked(range.start, range.end - range.start, prot, false); } static void host_inject_abort(struct kvm_cpu_context *host_ctxt) @@ -846,7 +848,7 @@ static int __host_set_page_state_range(u64 addr, u64 size, { enum kvm_pgtable_prot prot = pkvm_mkstate(PKVM_HOST_MEM_PROT, state); - return host_stage2_idmap_locked(addr, size, prot); + return host_stage2_idmap_locked(addr, size, prot, true); } static int host_request_owned_transition(u64 *completer_addr, diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c index b10d903e05f2..66392411ecaa 100644 --- a/arch/arm64/kvm/hyp/nvhe/setup.c +++ b/arch/arm64/kvm/hyp/nvhe/setup.c @@ -248,7 +248,7 @@ static int fix_host_ownership_walker(u64 addr, u64 end, u32 level, return -EINVAL; } - return host_stage2_idmap_locked(phys, PAGE_SIZE, prot); + return host_stage2_idmap_locked(phys, PAGE_SIZE, prot, false); } static int fix_hyp_pgtable_refcnt_walker(u64 addr, u64 end, u32 level, From f21a6c3cb9ab02890e48954bf8435661d270960f Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Wed, 16 Nov 2022 17:54:51 +0000 Subject: [PATCH 219/457] ANDROID: KVM: arm64: iommu: Add host_stage2_idmap_complete Add a new callback to pkvm_iommu_ops called after host_stage2_idmap_apply on all IOMMU devices. This allows the drivers to complete operations like invalidation in two stages. Bug: 249161451 Signed-off-by: David Brazdil Change-Id: I9c077fd2b18ce54ad67eb34ef16bc94428797419 (cherry picked from commit ce39549d9283e32cee6a476e4603e7b586957ecf) Signed-off-by: Mostafa Saleh Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/include/nvhe/iommu.h | 6 ++++++ arch/arm64/kvm/hyp/nvhe/iommu.c | 5 +++++ 2 files changed, 11 insertions(+) diff --git a/arch/arm64/kvm/hyp/include/nvhe/iommu.h b/arch/arm64/kvm/hyp/include/nvhe/iommu.h index 69bce1f01717..f9a75f49c499 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/iommu.h +++ b/arch/arm64/kvm/hyp/include/nvhe/iommu.h @@ -48,6 +48,12 @@ struct pkvm_iommu_ops { void (*host_stage2_idmap_apply)(struct pkvm_iommu *dev, phys_addr_t start, phys_addr_t end); + /* + * Callback to finish a host stage-2 mapping change at device level. + * Called after 'host_stage2_idmap_apply' with host lock held. + */ + void (*host_stage2_idmap_complete)(struct pkvm_iommu *dev); + /* Power management callbacks. Called with host lock held. */ int (*suspend)(struct pkvm_iommu *dev); int (*resume)(struct pkvm_iommu *dev); diff --git a/arch/arm64/kvm/hyp/nvhe/iommu.c b/arch/arm64/kvm/hyp/nvhe/iommu.c index d72ea5602eaf..2cac6aea8c8c 100644 --- a/arch/arm64/kvm/hyp/nvhe/iommu.c +++ b/arch/arm64/kvm/hyp/nvhe/iommu.c @@ -545,4 +545,9 @@ void pkvm_iommu_host_stage2_idmap(phys_addr_t start, phys_addr_t end, if (dev->powered && dev->ops->host_stage2_idmap_apply) dev->ops->host_stage2_idmap_apply(dev, start, end); } + + list_for_each_entry(dev, &iommu_list, list) { + if (dev->powered && dev->ops->host_stage2_idmap_complete) + dev->ops->host_stage2_idmap_complete(dev); + } } From 5584b944ab0ddccf8aca4f082143936d5f376283 Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Mon, 26 Sep 2022 18:46:08 +0100 Subject: [PATCH 220/457] ANDROID: KVM: arm64: s2mpu: Fix SYNC latency regression SysMMU_SYNCs provide an invalidation-complete signal to the S2MPU driver but the latency can be quite high. 
Improve this by waiting for all the SYNCs in parallel - separate the initiation of invalidation barrier from waiting for completion. This way we initiate invalidation on all SYNCs first, then wait for all of them to complete. The previously introduced exponential-backoff only kicks in if the SYNC_COMP_COMPLETE bit is not set after the parallel invalidation. Bug: 249161451 Signed-off-by: David Brazdil Change-Id: I9d544bc65f8633d376c7ccd65ea23195ca432964 (cherry picked from commit 1da102d4e3d54285f1d2071a7bee79fbb4c9ce71) Signed-off-by: Mostafa Saleh Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c | 69 ++++++++++++++++++++------- 1 file changed, 53 insertions(+), 16 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c b/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c index 20c65f7489ed..8da0e43bab32 100644 --- a/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c +++ b/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c @@ -185,9 +185,13 @@ static void __wait_while(void __iomem *addr, u32 mask) continue; } -static void __wait_for_invalidation_complete(struct pkvm_iommu *dev) +static void __sync_cmd_start(struct pkvm_iommu *sync) +{ + writel_relaxed(SYNC_CMD_SYNC, sync->va + REG_NS_SYNC_CMD); +} + +static void __invalidation_barrier_slow(struct pkvm_iommu *sync) { - struct pkvm_iommu *sync; size_t i, timeout; /* @@ -199,16 +203,36 @@ static void __wait_for_invalidation_complete(struct pkvm_iommu *dev) * timeout exponentially each time. If this cycle fails a given number * of times, the algorithm will give up completely to avoid deadlock. */ + timeout = SYNC_TIMEOUT; + for (i = 0; i < SYNC_MAX_RETRIES; i++) { + __sync_cmd_start(sync); + if (__wait_until(sync->va + REG_NS_SYNC_COMP, SYNC_COMP_COMPLETE, timeout)) + break; + timeout *= SYNC_TIMEOUT_MULTIPLIER; + } +} + +/* Initiate invalidation barrier. */ +static void __invalidation_barrier_init(struct pkvm_iommu *dev) +{ + struct pkvm_iommu *sync; + + for_each_child(sync, dev) + __sync_cmd_start(sync); +} + +/* Wait for invalidation to complete. */ +static void __invalidation_barrier_complete(struct pkvm_iommu *dev) +{ + struct pkvm_iommu *sync; + + /* + * Check if the SYNC_COMP_COMPLETE bit has been set for individual + * devices. If not, fall back to non-parallel invalidation. + */ for_each_child(sync, dev) { - timeout = SYNC_TIMEOUT; - for (i = 0; i < SYNC_MAX_RETRIES; i++) { - writel_relaxed(SYNC_CMD_SYNC, sync->va + REG_NS_SYNC_CMD); - if (__wait_until(sync->va + REG_NS_SYNC_COMP, - SYNC_COMP_COMPLETE, timeout)) { - break; - } - timeout *= SYNC_TIMEOUT_MULTIPLIER; - } + if (!(readl_relaxed(sync->va + REG_NS_SYNC_COMP) & SYNC_COMP_COMPLETE)) + __invalidation_barrier_slow(sync); } /* Must not access SFRs while S2MPU is busy invalidating (v9 only). 
*/ @@ -221,11 +245,12 @@ static void __wait_for_invalidation_complete(struct pkvm_iommu *dev) static void __all_invalidation(struct pkvm_iommu *dev) { writel_relaxed(INVALIDATION_INVALIDATE, dev->va + REG_NS_ALL_INVALIDATION); - __wait_for_invalidation_complete(dev); + __invalidation_barrier_init(dev); + __invalidation_barrier_complete(dev); } -static void __range_invalidation(struct pkvm_iommu *dev, phys_addr_t first_byte, - phys_addr_t last_byte) +static void __range_invalidation_init(struct pkvm_iommu *dev, phys_addr_t first_byte, + phys_addr_t last_byte) { u32 start_ppn = first_byte >> RANGE_INVALIDATION_PPN_SHIFT; u32 end_ppn = last_byte >> RANGE_INVALIDATION_PPN_SHIFT; @@ -233,7 +258,7 @@ static void __range_invalidation(struct pkvm_iommu *dev, phys_addr_t first_byte, writel_relaxed(start_ppn, dev->va + REG_NS_RANGE_INVALIDATION_START_PPN); writel_relaxed(end_ppn, dev->va + REG_NS_RANGE_INVALIDATION_END_PPN); writel_relaxed(INVALIDATION_INVALIDATE, dev->va + REG_NS_RANGE_INVALIDATION); - __wait_for_invalidation_complete(dev); + __invalidation_barrier_init(dev); } static void __set_l1entry_attr_with_prot(struct pkvm_iommu *dev, unsigned int gb, @@ -367,7 +392,13 @@ static void __mpt_idmap_apply(struct pkvm_iommu *dev, struct mpt *mpt, __set_l1entry_attr_with_fmpt(dev, gb, vid, fmpt); } } - __range_invalidation(dev, first_byte, last_byte); + /* Initiate invalidation, completed in __mdt_idmap_complete. */ + __range_invalidation_init(dev, first_byte, last_byte); +} + +static void __mpt_idmap_complete(struct pkvm_iommu *dev, struct mpt *mpt) +{ + __invalidation_barrier_complete(dev); } static void s2mpu_host_stage2_idmap_prepare(phys_addr_t start, phys_addr_t end, @@ -388,6 +419,11 @@ static void s2mpu_host_stage2_idmap_apply(struct pkvm_iommu *dev, __mpt_idmap_apply(dev, &host_mpt, start, end - 1); } +static void s2mpu_host_stage2_idmap_complete(struct pkvm_iommu *dev) +{ + __mpt_idmap_complete(dev, &host_mpt); +} + static int s2mpu_resume(struct pkvm_iommu *dev) { /* @@ -576,6 +612,7 @@ const struct pkvm_iommu_ops pkvm_s2mpu_ops = (struct pkvm_iommu_ops){ .suspend = s2mpu_suspend, .host_stage2_idmap_prepare = s2mpu_host_stage2_idmap_prepare, .host_stage2_idmap_apply = s2mpu_host_stage2_idmap_apply, + .host_stage2_idmap_complete = s2mpu_host_stage2_idmap_complete, .host_dabt_handler = s2mpu_host_dabt_handler, .data_size = sizeof(struct s2mpu_drv_data), }; From 8260bd357c7a894b9bc9177ee9fc1880f2d5d5ca Mon Sep 17 00:00:00 2001 From: Keir Fraser Date: Wed, 23 Nov 2022 14:38:03 +0000 Subject: [PATCH 221/457] Revert "ANDROID: virtio_balloon: New module parameter "pkvm"" This reverts commit 87bcd3edf3ee7b9d5e76a8090b3f05ea1535026d. Reason for revert: Memory reclaim capability will be checked by the host before configuring the virtio_balloon device. Bug: 240239989 Change-Id: I03e7c39ec6d671babeace4040138b416c7e201cf Signed-off-by: Keir Fraser Signed-off-by: Quentin Perret --- drivers/virtio/virtio_balloon.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index eb27c82bf0dc..72fe24005bc0 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -18,11 +18,6 @@ #include #include #include -#include - -static bool pkvm; -module_param(pkvm, bool, 0); -MODULE_PARM_DESC(pkvm, "Running on PKVM. Must use MEM_RELINQUISH."); /* * Balloon device works in 4K page units. 
So each page is pointed to by @@ -873,12 +868,6 @@ static int virtballoon_probe(struct virtio_device *vdev) struct virtio_balloon *vb; int err; - if (pkvm && !kvm_has_memrelinquish_services()) { - dev_err(&vdev->dev, "%s failure: pkvm but no memrelinquish\n", - __func__); - return -EINVAL; - } - if (!vdev->config->get) { dev_err(&vdev->dev, "%s failure: config access disabled\n", __func__); From fe3157f3286b8486e25733c9442f1f46e22102ae Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 24 Nov 2022 11:38:28 +0000 Subject: [PATCH 222/457] ANDROID: KVM: arm64: Use 32-bit function ID for PSCI MEM_PROTECT call The PSCI specification defines only a 32-bit function ID for the MEM_PROTECT call used to protect against cold reboot attacks. Fix the pKVM hypervisor invocation of MEM_PROTECT to use the 32-bit function ID instead of the unallocated 64-bit flavour. [ qperret: dropped the change to include/uapi/linux/psci.h as the 32-bit variant of the call has been introduced upstream by 3137f2e60098 ("firmware/psci: Add debugfs support to ease debugging") ] Bug: 260316363 Signed-off-by: Will Deacon Change-Id: I675a57419064f7f006960ca5370e9dc2d5279a90 Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/nvhe/psci-relay.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hyp/nvhe/psci-relay.c b/arch/arm64/kvm/hyp/nvhe/psci-relay.c index 4b26298e7927..0fa155323e19 100644 --- a/arch/arm64/kvm/hyp/nvhe/psci-relay.c +++ b/arch/arm64/kvm/hyp/nvhe/psci-relay.c @@ -236,7 +236,7 @@ static u64 psci_mem_protect(s64 offset) return cnt; if (!cnt || !new) - psci_call(PSCI_1_1_FN64_MEM_PROTECT, offset < 0 ? 0 : 1, 0, 0); + psci_call(PSCI_1_1_FN_MEM_PROTECT, offset < 0 ? 0 : 1, 0, 0); cnt = new; return cnt; From 5747f9215c52ae7dfea6e29b5c444355b1b6fcb9 Mon Sep 17 00:00:00 2001 From: Mostafa Saleh Date: Thu, 10 Nov 2022 12:08:05 +0000 Subject: [PATCH 223/457] ANDROID: KVM: arm64: iommu: Support dynamic driver registration in IOMMU layer This will allow IOMMU drivers to be loaded as EL2 modules. To achieve that, EL2 module init should register and init the driver using pkvm_iommu_driver_init. To avoid having to hardcode S2MPU drivers in IOMMU EL2 code, pkvm_iommu_driver_init now takes a pointer to the driver struct instead of an id. The id is removed from pkvm_iommu_driver and the address of the struct is treated as the id of the driver: the kernel is expected to pass the linear-map address of the driver and the hypervisor is expected to use the hyp address. 
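As a rough sketch of the intended usage from a driver's point of view (the pkvm_my_iommu_* names are illustrative only; the calls mirror how the in-tree S2MPU driver below uses the new API, including its ksym_ref_addr_nvhe() helper for referencing the nVHE symbol from the kernel):

/* EL2 (nVHE) side: the driver object, whose hyp address doubles as its id. */
struct pkvm_iommu_driver pkvm_my_iommu_driver = {
        .ops = &pkvm_my_iommu_ops,
};

/*
 * EL1 side: pass the kernel linear-map address of the hyp symbol; the
 * hypervisor converts it with kern_hyp_va() before using it as the id.
 */
static int my_iommu_probe(struct device *dev, phys_addr_t mmio_pa, size_t mmio_size)
{
        int ret;

        /* One-time driver init; a config blob may be donated instead of NULL/0. */
        ret = pkvm_iommu_driver_init(ksym_ref_addr_nvhe(pkvm_my_iommu_driver), NULL, 0);
        if (ret)
                return ret;

        /* Register one device instance handled by that driver. */
        return pkvm_iommu_register(dev, ksym_ref_addr_nvhe(pkvm_my_iommu_driver),
                                   mmio_pa, mmio_size, NULL);
}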
Bug: 249481474 Change-Id: I49db5b5d88a691d66fa2d5302198785da94d70d2 Signed-off-by: Mostafa Saleh Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_host.h | 21 +++--- arch/arm64/kvm/hyp/include/nvhe/iommu.h | 8 +-- arch/arm64/kvm/hyp/nvhe/hyp-main.c | 12 +++- arch/arm64/kvm/hyp/nvhe/iommu.c | 93 +++++++++++++++---------- arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c | 9 +++ arch/arm64/kvm/iommu.c | 12 ++-- arch/arm64/kvm/iommu/s2mpu.c | 15 ++-- 7 files changed, 105 insertions(+), 65 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index e93c4b2a4c1b..8888316043e5 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -379,19 +379,24 @@ extern s64 kvm_nvhe_sym(hyp_physvirt_offset); extern u64 kvm_nvhe_sym(hyp_cpu_logical_map)[NR_CPUS]; #define hyp_cpu_logical_map CHOOSE_NVHE_SYM(hyp_cpu_logical_map) -enum pkvm_iommu_driver_id { - PKVM_IOMMU_DRIVER_S2MPU, - PKVM_IOMMU_DRIVER_SYSMMU_SYNC, - PKVM_IOMMU_NR_DRIVERS, -}; - enum pkvm_iommu_pm_event { PKVM_IOMMU_PM_SUSPEND, PKVM_IOMMU_PM_RESUME, }; -int pkvm_iommu_driver_init(enum pkvm_iommu_driver_id drv_id, void *data, size_t size); -int pkvm_iommu_register(struct device *dev, enum pkvm_iommu_driver_id drv_id, +struct pkvm_iommu_ops; + +struct pkvm_iommu_driver { + const struct pkvm_iommu_ops *ops; + struct list_head list; + atomic_t state; +}; + +extern struct pkvm_iommu_driver kvm_nvhe_sym(pkvm_s2mpu_driver); +extern struct pkvm_iommu_driver kvm_nvhe_sym(pkvm_sysmmu_sync_driver); + +int pkvm_iommu_driver_init(struct pkvm_iommu_driver *drv, void *data, size_t size); +int pkvm_iommu_register(struct device *dev, struct pkvm_iommu_driver *drv, phys_addr_t pa, size_t size, struct device *parent); int pkvm_iommu_suspend(struct device *dev); int pkvm_iommu_resume(struct device *dev); diff --git a/arch/arm64/kvm/hyp/include/nvhe/iommu.h b/arch/arm64/kvm/hyp/include/nvhe/iommu.h index f9a75f49c499..639277053e6d 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/iommu.h +++ b/arch/arm64/kvm/hyp/include/nvhe/iommu.h @@ -84,9 +84,8 @@ struct pkvm_iommu { char data[]; }; -int __pkvm_iommu_driver_init(enum pkvm_iommu_driver_id id, void *data, size_t size); -int __pkvm_iommu_register(unsigned long dev_id, - enum pkvm_iommu_driver_id drv_id, +int __pkvm_iommu_driver_init(struct pkvm_iommu_driver *drv, void *data, size_t size); +int __pkvm_iommu_register(unsigned long dev_id, unsigned long drv_id, phys_addr_t dev_pa, size_t dev_size, unsigned long parent_id, void *kern_mem_va, size_t mem_size); @@ -100,7 +99,4 @@ bool pkvm_iommu_host_dabt_handler(struct kvm_cpu_context *host_ctxt, u32 esr, void pkvm_iommu_host_stage2_idmap(phys_addr_t start, phys_addr_t end, enum kvm_pgtable_prot prot); -extern const struct pkvm_iommu_ops pkvm_s2mpu_ops; -extern const struct pkvm_iommu_ops pkvm_sysmmu_sync_ops; - #endif /* __ARM64_KVM_NVHE_IOMMU_H__ */ diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index a65301e85f7c..a2854ea1a3ad 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -1113,23 +1113,29 @@ static void handle___pkvm_teardown_vm(struct kvm_cpu_context *host_ctxt) static void handle___pkvm_iommu_driver_init(struct kvm_cpu_context *host_ctxt) { - DECLARE_REG(enum pkvm_iommu_driver_id, id, host_ctxt, 1); + DECLARE_REG(struct pkvm_iommu_driver*, drv, host_ctxt, 1); DECLARE_REG(void *, data, host_ctxt, 2); DECLARE_REG(size_t, size, host_ctxt, 3); - cpu_reg(host_ctxt, 1) = __pkvm_iommu_driver_init(id, data, size); + /* 
__pkvm_iommu_driver_init expects hyp_va as it can be called from EL2 as a function. */ + drv = kern_hyp_va(drv); + data = kern_hyp_va(data); + + cpu_reg(host_ctxt, 1) = __pkvm_iommu_driver_init(drv, data, size); } static void handle___pkvm_iommu_register(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(unsigned long, dev_id, host_ctxt, 1); - DECLARE_REG(enum pkvm_iommu_driver_id, drv_id, host_ctxt, 2); + DECLARE_REG(unsigned long, drv_id, host_ctxt, 2); DECLARE_REG(phys_addr_t, dev_pa, host_ctxt, 3); DECLARE_REG(size_t, dev_size, host_ctxt, 4); DECLARE_REG(unsigned long, parent_id, host_ctxt, 5); DECLARE_REG(void *, mem, host_ctxt, 6); DECLARE_REG(size_t, mem_size, host_ctxt, 7); + /* drv_id is the hyp address of the driver. */ + drv_id = kern_hyp_va(drv_id); cpu_reg(host_ctxt, 1) = __pkvm_iommu_register(dev_id, drv_id, dev_pa, dev_size, parent_id, mem, mem_size); diff --git a/arch/arm64/kvm/hyp/nvhe/iommu.c b/arch/arm64/kvm/hyp/nvhe/iommu.c index 2cac6aea8c8c..df974eba1e0f 100644 --- a/arch/arm64/kvm/hyp/nvhe/iommu.c +++ b/arch/arm64/kvm/hyp/nvhe/iommu.c @@ -15,24 +15,22 @@ #include #include +#define DRV_ID(drv_addr) ((unsigned long)drv_addr) + enum { IOMMU_DRIVER_NOT_READY = 0, IOMMU_DRIVER_INITIALIZING, IOMMU_DRIVER_READY, }; -struct pkvm_iommu_driver { - const struct pkvm_iommu_ops *ops; - atomic_t state; -}; - -static struct pkvm_iommu_driver iommu_drivers[PKVM_IOMMU_NR_DRIVERS]; - +/* List of registered IOMMU drivers, protected with iommu_drv_lock. */ +static LIST_HEAD(iommu_drivers); /* IOMMU device list. Must only be accessed with host_mmu.lock held. */ static LIST_HEAD(iommu_list); static bool iommu_finalized; static DEFINE_HYP_SPINLOCK(iommu_registration_lock); +static DEFINE_HYP_SPINLOCK(iommu_drv_lock); static void *iommu_mem_pool; static size_t iommu_mem_remaining; @@ -56,26 +54,19 @@ static void host_unlock_component(void) * Find IOMMU driver by its ID. The input ID is treated as unstrusted * and is properly validated. */ -static inline struct pkvm_iommu_driver *get_driver(enum pkvm_iommu_driver_id id) +static inline struct pkvm_iommu_driver *get_driver(unsigned long id) { - size_t index = (size_t)id; + struct pkvm_iommu_driver *drv, *ret = NULL; - if (index >= ARRAY_SIZE(iommu_drivers)) - return NULL; - - return &iommu_drivers[index]; -} - -static const struct pkvm_iommu_ops *get_driver_ops(enum pkvm_iommu_driver_id id) -{ - switch (id) { - case PKVM_IOMMU_DRIVER_S2MPU: - return IS_ENABLED(CONFIG_KVM_S2MPU) ? &pkvm_s2mpu_ops : NULL; - case PKVM_IOMMU_DRIVER_SYSMMU_SYNC: - return IS_ENABLED(CONFIG_KVM_S2MPU) ? 
&pkvm_sysmmu_sync_ops : NULL; - default: - return NULL; + hyp_spin_lock(&iommu_drv_lock); + list_for_each_entry(drv, &iommu_drivers, list) { + if (DRV_ID(drv) == id) { + ret = drv; + break; + } } + hyp_spin_unlock(&iommu_drv_lock); + return ret; } static inline bool driver_acquire_init(struct pkvm_iommu_driver *drv) @@ -103,6 +94,35 @@ static size_t __iommu_alloc_size(struct pkvm_iommu_driver *drv) sizeof(unsigned long)); } +static bool validate_driver_id_unique(struct pkvm_iommu_driver *drv) +{ + struct pkvm_iommu_driver *cur; + + hyp_assert_lock_held(&iommu_drv_lock); + list_for_each_entry(cur, &iommu_drivers, list) { + if (DRV_ID(drv) == DRV_ID(cur)) + return false; + } + return true; +} + +static int __pkvm_register_iommu_driver(struct pkvm_iommu_driver *drv) +{ + int ret = 0; + + if (!drv) + return -EINVAL; + + hyp_assert_lock_held(&iommu_registration_lock); + hyp_spin_lock(&iommu_drv_lock); + if (validate_driver_id_unique(drv)) + list_add_tail(&drv->list, &iommu_drivers); + else + ret = -EEXIST; + hyp_spin_unlock(&iommu_drv_lock); + return ret; +} + /* Global memory pool for allocating IOMMU list entry structs. */ static inline struct pkvm_iommu *alloc_iommu(struct pkvm_iommu_driver *drv, void *mem, size_t mem_size) @@ -247,14 +267,11 @@ static struct pkvm_iommu *find_iommu_by_id(unsigned long id) * arguments are passed in a shared memory buffer. The driver is expected to * initialize it's page-table bookkeeping. */ -int __pkvm_iommu_driver_init(enum pkvm_iommu_driver_id id, void *data, size_t size) +int __pkvm_iommu_driver_init(struct pkvm_iommu_driver *drv, void *data, size_t size) { - struct pkvm_iommu_driver *drv; const struct pkvm_iommu_ops *ops; int ret = 0; - data = kern_hyp_va(data); - /* New driver initialization not allowed after __pkvm_iommu_finalize(). */ hyp_spin_lock(&iommu_registration_lock); if (iommu_finalized) { @@ -262,9 +279,11 @@ int __pkvm_iommu_driver_init(enum pkvm_iommu_driver_id id, void *data, size_t si goto out_unlock; } - drv = get_driver(id); - ops = get_driver_ops(id); - if (!drv || !ops) { + ret = __pkvm_register_iommu_driver(drv); + if (ret) + return ret; + + if (!drv->ops) { ret = -EINVAL; goto out_unlock; } @@ -274,7 +293,7 @@ int __pkvm_iommu_driver_init(enum pkvm_iommu_driver_id id, void *data, size_t si goto out_unlock; } - drv->ops = ops; + ops = drv->ops; /* This can change stage-2 mappings. 
*/ if (ops->init) { @@ -306,8 +325,7 @@ out_unlock: return ret; } -int __pkvm_iommu_register(unsigned long dev_id, - enum pkvm_iommu_driver_id drv_id, +int __pkvm_iommu_register(unsigned long dev_id, unsigned long drv_id, phys_addr_t dev_pa, size_t dev_size, unsigned long parent_id, void *kern_mem_va, size_t mem_size) @@ -531,15 +549,14 @@ void pkvm_iommu_host_stage2_idmap(phys_addr_t start, phys_addr_t end, { struct pkvm_iommu_driver *drv; struct pkvm_iommu *dev; - size_t i; assert_host_component_locked(); - - for (i = 0; i < ARRAY_SIZE(iommu_drivers); i++) { - drv = get_driver(i); + hyp_spin_lock(&iommu_drv_lock); + list_for_each_entry(drv, &iommu_drivers, list) { if (drv && is_driver_ready(drv) && drv->ops->host_stage2_idmap_prepare) drv->ops->host_stage2_idmap_prepare(start, end, prot); } + hyp_spin_unlock(&iommu_drv_lock); list_for_each_entry(dev, &iommu_list, list) { if (dev->powered && dev->ops->host_stage2_idmap_apply) diff --git a/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c b/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c index 8da0e43bab32..aefbcd7a4750 100644 --- a/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c +++ b/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c @@ -42,6 +42,9 @@ struct s2mpu_drv_data { static struct mpt host_mpt; +const struct pkvm_iommu_ops pkvm_s2mpu_ops; +const struct pkvm_iommu_ops pkvm_sysmmu_sync_ops; + static inline enum mpt_prot prot_to_mpt(enum kvm_pgtable_prot prot) { return ((prot & KVM_PGTABLE_PROT_R) ? MPT_PROT_R : 0) | @@ -620,3 +623,9 @@ const struct pkvm_iommu_ops pkvm_s2mpu_ops = (struct pkvm_iommu_ops){ const struct pkvm_iommu_ops pkvm_sysmmu_sync_ops = (struct pkvm_iommu_ops){ .validate = sysmmu_sync_validate, }; +struct pkvm_iommu_driver pkvm_s2mpu_driver = (struct pkvm_iommu_driver){ + .ops = &pkvm_s2mpu_ops, +}; +struct pkvm_iommu_driver pkvm_sysmmu_sync_driver = (struct pkvm_iommu_driver){ + .ops = &pkvm_sysmmu_sync_ops, +}; diff --git a/arch/arm64/kvm/iommu.c b/arch/arm64/kvm/iommu.c index 6ca171327b28..f827edb92eb6 100644 --- a/arch/arm64/kvm/iommu.c +++ b/arch/arm64/kvm/iommu.c @@ -12,13 +12,13 @@ static unsigned long dev_to_id(struct device *dev) return (unsigned long)dev; } -int pkvm_iommu_driver_init(enum pkvm_iommu_driver_id id, void *data, size_t size) +int pkvm_iommu_driver_init(struct pkvm_iommu_driver *drv, void *data, size_t size) { - return kvm_call_hyp_nvhe(__pkvm_iommu_driver_init, id, data, size); + return kvm_call_hyp_nvhe(__pkvm_iommu_driver_init, drv, data, size); } -int pkvm_iommu_register(struct device *dev, enum pkvm_iommu_driver_id drv_id, - phys_addr_t pa, size_t size, struct device *parent) +int pkvm_iommu_register(struct device *dev, struct pkvm_iommu_driver *drv, phys_addr_t pa, + size_t size, struct device *parent) { void *mem; int ret; @@ -29,14 +29,14 @@ int pkvm_iommu_register(struct device *dev, enum pkvm_iommu_driver_id drv_id, * We assume that hyp never allocates more than a page per hypcall. 
*/ ret = kvm_call_hyp_nvhe(__pkvm_iommu_register, dev_to_id(dev), - drv_id, pa, size, dev_to_id(parent), NULL, 0); + drv, pa, size, dev_to_id(parent), NULL, 0); if (ret == -ENOMEM) { mem = (void *)__get_free_page(GFP_KERNEL); if (!mem) return -ENOMEM; ret = kvm_call_hyp_nvhe(__pkvm_iommu_register, dev_to_id(dev), - drv_id, pa, size, dev_to_id(parent), + drv, pa, size, dev_to_id(parent), mem, PAGE_SIZE); } return ret; diff --git a/arch/arm64/kvm/iommu/s2mpu.c b/arch/arm64/kvm/iommu/s2mpu.c index 733451d74100..cbe28dd8b660 100644 --- a/arch/arm64/kvm/iommu/s2mpu.c +++ b/arch/arm64/kvm/iommu/s2mpu.c @@ -6,6 +6,11 @@ #include #include +#include +#include + +/* For an nvhe symbol get the kernel linear address of it. */ +#define ksym_ref_addr_nvhe(x) kvm_ksym_ref(&kvm_nvhe_sym(x)) static int init_s2mpu_driver(void) { @@ -49,7 +54,8 @@ static int init_s2mpu_driver(void) goto out_free; /* Hypercall to initialize EL2 driver. */ - ret = pkvm_iommu_driver_init(PKVM_IOMMU_DRIVER_S2MPU, mpt, sizeof(*mpt)); + ret = pkvm_iommu_driver_init(ksym_ref_addr_nvhe(pkvm_s2mpu_driver), + mpt, sizeof(*mpt)); if (ret) goto out_unshare; @@ -80,7 +86,7 @@ int pkvm_iommu_s2mpu_register(struct device *dev, phys_addr_t addr) if (ret) return ret; - return pkvm_iommu_register(dev, PKVM_IOMMU_DRIVER_S2MPU, + return pkvm_iommu_register(dev, ksym_ref_addr_nvhe(pkvm_s2mpu_driver), addr, S2MPU_MMIO_SIZE, NULL); } EXPORT_SYMBOL_GPL(pkvm_iommu_s2mpu_register); @@ -94,7 +100,8 @@ static int init_sysmmu_sync_driver(void) mutex_lock(&lock); if (!init_done) { - ret = pkvm_iommu_driver_init(PKVM_IOMMU_DRIVER_SYSMMU_SYNC, NULL, 0); + ret = pkvm_iommu_driver_init(ksym_ref_addr_nvhe(pkvm_sysmmu_sync_driver), + NULL, 0); init_done = !ret; } mutex_unlock(&lock); @@ -113,7 +120,7 @@ int pkvm_iommu_sysmmu_sync_register(struct device *dev, phys_addr_t addr, if (ret) return ret; - return pkvm_iommu_register(dev, PKVM_IOMMU_DRIVER_SYSMMU_SYNC, + return pkvm_iommu_register(dev, ksym_ref_addr_nvhe(pkvm_sysmmu_sync_driver), addr + SYSMMU_SYNC_S2_OFFSET, SYSMMU_SYNC_S2_MMIO_SIZE, parent); } From ace1a8e10342eca8730f747c504c79798ee5586d Mon Sep 17 00:00:00 2001 From: Mostafa Saleh Date: Tue, 15 Nov 2022 10:38:33 +0000 Subject: [PATCH 224/457] ANDROID: KVM: arm64: s2mpu: Abstract page table ops Create a seprate file where S2MPU page table operations are allocated based on configuration This will be helpful when supporting S2MPU versions that have different SMPT and FMPT formats Bug: 255731794 Change-Id: I0b80a329d4357511baf6847baf84dc4644bb835f Signed-off-by: Mostafa Saleh Signed-off-by: Quentin Perret --- arch/arm64/include/asm/io-mpt-s2mpu.h | 27 +++++ arch/arm64/kvm/hyp/nvhe/Makefile | 1 + arch/arm64/kvm/hyp/nvhe/iommu/io-mpt-s2mpu.c | 103 +++++++++++++++++++ arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c | 82 ++++----------- 4 files changed, 150 insertions(+), 63 deletions(-) create mode 100644 arch/arm64/include/asm/io-mpt-s2mpu.h create mode 100644 arch/arm64/kvm/hyp/nvhe/iommu/io-mpt-s2mpu.c diff --git a/arch/arm64/include/asm/io-mpt-s2mpu.h b/arch/arm64/include/asm/io-mpt-s2mpu.h new file mode 100644 index 000000000000..382422b26ed6 --- /dev/null +++ b/arch/arm64/include/asm/io-mpt-s2mpu.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2022 - Google LLC + */ + +#ifndef __IO_MPT_S2MPU_H__ +#define __IO_MPT_S2MPU_H__ + +#include +#include +#include + +struct s2mpu_mpt_cfg { + enum s2mpu_version version; +}; + +struct s2mpu_mpt_ops { + void (*init_with_prot)(void *dev_va, enum mpt_prot prot); + void 
(*init_with_mpt)(void *dev_va, struct mpt *mpt); + void (*apply_range)(void *dev_va, struct mpt *mpt, u32 first_gb, u32 last_gb); + void (*prepare_range)(struct mpt *mpt, phys_addr_t first_byte, + phys_addr_t last_byte, enum mpt_prot prot); +}; + +const struct s2mpu_mpt_ops *s2mpu_get_mpt_ops(struct s2mpu_mpt_cfg cfg); + +#endif /* __IO_MPT_S2MPU_H__ */ diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile index 7772aedd9b2b..01fd26cff95a 100644 --- a/arch/arm64/kvm/hyp/nvhe/Makefile +++ b/arch/arm64/kvm/hyp/nvhe/Makefile @@ -29,6 +29,7 @@ hyp-obj-$(CONFIG_DEBUG_LIST) += list_debug.o hyp-obj-y += $(lib-objs) hyp-obj-$(CONFIG_KVM_S2MPU) += iommu/s2mpu.o +hyp-obj-$(CONFIG_KVM_S2MPU) += iommu/io-mpt-s2mpu.o ## ## Build rules for compiling nVHE hyp code diff --git a/arch/arm64/kvm/hyp/nvhe/iommu/io-mpt-s2mpu.c b/arch/arm64/kvm/hyp/nvhe/iommu/io-mpt-s2mpu.c new file mode 100644 index 000000000000..5ec941d842a5 --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/iommu/io-mpt-s2mpu.c @@ -0,0 +1,103 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2022 - Google LLC + */ + +#include + +static void __set_l1entry_attr_with_prot(void *dev_va, unsigned int gb, + unsigned int vid, enum mpt_prot prot) +{ + writel_relaxed(L1ENTRY_ATTR_1G(prot), + dev_va + REG_NS_L1ENTRY_ATTR(vid, gb)); +} + +static void __set_l1entry_attr_with_fmpt(void *dev_va, unsigned int gb, + unsigned int vid, struct fmpt *fmpt) +{ + if (fmpt->gran_1g) { + __set_l1entry_attr_with_prot(dev_va, gb, vid, fmpt->prot); + } else { + /* Order against writes to the SMPT. */ + writel(L1ENTRY_ATTR_L2(SMPT_GRAN_ATTR), + dev_va + REG_NS_L1ENTRY_ATTR(vid, gb)); + } +} + +static void __set_l1entry_l2table_addr(void *dev_va, unsigned int gb, + unsigned int vid, phys_addr_t addr) +{ + /* Order against writes to the SMPT. */ + writel(L1ENTRY_L2TABLE_ADDR(addr), + dev_va + REG_NS_L1ENTRY_L2TABLE_ADDR(vid, gb)); +} + +static void init_with_prot(void *dev_va, enum mpt_prot prot) +{ + unsigned int gb, vid; + + for_each_gb_and_vid(gb, vid) + __set_l1entry_attr_with_prot(dev_va, gb, vid, prot); +} + +static void init_with_mpt(void *dev_va, struct mpt *mpt) +{ + unsigned int gb, vid; + struct fmpt *fmpt; + + for_each_gb_and_vid(gb, vid) { + fmpt = &mpt->fmpt[gb]; + __set_l1entry_l2table_addr(dev_va, gb, vid, __hyp_pa(fmpt->smpt)); + __set_l1entry_attr_with_fmpt(dev_va, gb, vid, fmpt); + } +} + +static void apply_range(void *dev_va, struct mpt *mpt, u32 first_gb, u32 last_gb) +{ + unsigned int gb, vid; + struct fmpt *fmpt; + + for_each_gb_in_range(gb, first_gb, last_gb) { + fmpt = &mpt->fmpt[gb]; + if (fmpt->flags & MPT_UPDATE_L1) { + for_each_vid(vid) + __set_l1entry_attr_with_fmpt(dev_va, gb, vid, fmpt); + } + } +} + +static void prepare_range(struct mpt *mpt, phys_addr_t first_byte, + phys_addr_t last_byte, enum mpt_prot prot) +{ + unsigned int first_gb = first_byte / SZ_1G; + unsigned int last_gb = last_byte / SZ_1G; + size_t start_gb_byte, end_gb_byte; + unsigned int gb; + struct fmpt *fmpt; + + for_each_gb_in_range(gb, first_gb, last_gb) { + fmpt = &mpt->fmpt[gb]; + start_gb_byte = (gb == first_gb) ? first_byte % SZ_1G : 0; + end_gb_byte = (gb == last_gb) ? 
(last_byte % SZ_1G) + 1 : SZ_1G; + + __set_fmpt_range(fmpt, start_gb_byte, end_gb_byte, prot); + + if (fmpt->flags & MPT_UPDATE_L2) + kvm_flush_dcache_to_poc(fmpt->smpt, SMPT_SIZE); + } +} + +static const struct s2mpu_mpt_ops this_ops = { + .init_with_prot = init_with_prot, + .init_with_mpt = init_with_mpt, + .apply_range = apply_range, + .prepare_range = prepare_range, +}; + +const struct s2mpu_mpt_ops *s2mpu_get_mpt_ops(struct s2mpu_mpt_cfg cfg) +{ + if ((cfg.version == S2MPU_VERSION_8) || (cfg.version == S2MPU_VERSION_9)) + return &this_ops; + + return NULL; +} diff --git a/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c b/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c index aefbcd7a4750..b76d2cef6919 100644 --- a/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c +++ b/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c @@ -18,6 +18,7 @@ #include #include #include +#include #define SMC_CMD_PREPARE_PD_ONOFF 0x82000410 #define SMC_MODE_POWER_UP 1 @@ -40,6 +41,7 @@ struct s2mpu_drv_data { u32 context_cfg_valid_vid; }; +static const struct s2mpu_mpt_ops *mpt_ops; static struct mpt host_mpt; const struct pkvm_iommu_ops pkvm_s2mpu_ops; @@ -264,48 +266,19 @@ static void __range_invalidation_init(struct pkvm_iommu *dev, phys_addr_t first_ __invalidation_barrier_init(dev); } -static void __set_l1entry_attr_with_prot(struct pkvm_iommu *dev, unsigned int gb, - unsigned int vid, enum mpt_prot prot) -{ - writel_relaxed(L1ENTRY_ATTR_1G(prot), - dev->va + REG_NS_L1ENTRY_ATTR(vid, gb)); -} - -static void __set_l1entry_attr_with_fmpt(struct pkvm_iommu *dev, unsigned int gb, - unsigned int vid, struct fmpt *fmpt) -{ - if (fmpt->gran_1g) { - __set_l1entry_attr_with_prot(dev, gb, vid, fmpt->prot); - } else { - /* Order against writes to the SMPT. */ - writel(L1ENTRY_ATTR_L2(SMPT_GRAN_ATTR), - dev->va + REG_NS_L1ENTRY_ATTR(vid, gb)); - } -} - -static void __set_l1entry_l2table_addr(struct pkvm_iommu *dev, unsigned int gb, - unsigned int vid, phys_addr_t addr) -{ - /* Order against writes to the SMPT. */ - writel(L1ENTRY_L2TABLE_ADDR(addr), - dev->va + REG_NS_L1ENTRY_L2TABLE_ADDR(vid, gb)); -} - /* * Initialize S2MPU device and set all GB regions to 1G granularity with * given protection bits. */ static int initialize_with_prot(struct pkvm_iommu *dev, enum mpt_prot prot) { - unsigned int gb, vid; int ret; ret = __initialize(dev); if (ret) return ret; - for_each_gb_and_vid(gb, vid) - __set_l1entry_attr_with_prot(dev, gb, vid, prot); + mpt_ops->init_with_prot(dev->va, prot); __all_invalidation(dev); /* Set control registers, enable the S2MPU. */ @@ -319,19 +292,13 @@ static int initialize_with_prot(struct pkvm_iommu *dev, enum mpt_prot prot) */ static int initialize_with_mpt(struct pkvm_iommu *dev, struct mpt *mpt) { - unsigned int gb, vid; - struct fmpt *fmpt; int ret; ret = __initialize(dev); if (ret) return ret; - for_each_gb_and_vid(gb, vid) { - fmpt = &mpt->fmpt[gb]; - __set_l1entry_l2table_addr(dev, gb, vid, __hyp_pa(fmpt->smpt)); - __set_l1entry_attr_with_fmpt(dev, gb, vid, fmpt); - } + mpt_ops->init_with_mpt(dev->va, mpt); __all_invalidation(dev); /* Set control registers, enable the S2MPU. 
*/ @@ -361,22 +328,7 @@ static bool to_valid_range(phys_addr_t *start, phys_addr_t *end) static void __mpt_idmap_prepare(struct mpt *mpt, phys_addr_t first_byte, phys_addr_t last_byte, enum mpt_prot prot) { - unsigned int first_gb = first_byte / SZ_1G; - unsigned int last_gb = last_byte / SZ_1G; - size_t start_gb_byte, end_gb_byte; - unsigned int gb; - struct fmpt *fmpt; - - for_each_gb_in_range(gb, first_gb, last_gb) { - fmpt = &mpt->fmpt[gb]; - start_gb_byte = (gb == first_gb) ? first_byte % SZ_1G : 0; - end_gb_byte = (gb == last_gb) ? (last_byte % SZ_1G) + 1 : SZ_1G; - - __set_fmpt_range(fmpt, start_gb_byte, end_gb_byte, prot); - - if (fmpt->flags & MPT_UPDATE_L2) - kvm_flush_dcache_to_poc(fmpt->smpt, SMPT_SIZE); - } + mpt_ops->prepare_range(mpt, first_byte, last_byte, prot); } static void __mpt_idmap_apply(struct pkvm_iommu *dev, struct mpt *mpt, @@ -384,17 +336,8 @@ static void __mpt_idmap_apply(struct pkvm_iommu *dev, struct mpt *mpt, { unsigned int first_gb = first_byte / SZ_1G; unsigned int last_gb = last_byte / SZ_1G; - unsigned int gb, vid; - struct fmpt *fmpt; - for_each_gb_in_range(gb, first_gb, last_gb) { - fmpt = &mpt->fmpt[gb]; - - if (fmpt->flags & MPT_UPDATE_L1) { - for_each_vid(vid) - __set_l1entry_attr_with_fmpt(dev, gb, vid, fmpt); - } - } + mpt_ops->apply_range(dev->va, mpt, first_gb, last_gb); /* Initiate invalidation, completed in __mdt_idmap_complete. */ __range_invalidation_init(dev, first_byte, last_byte); } @@ -536,6 +479,7 @@ static int s2mpu_init(void *data, size_t size) phys_addr_t pa; unsigned int gb; int ret = 0; + struct s2mpu_mpt_cfg cfg; if (size != sizeof(in_mpt)) return -EINVAL; @@ -543,6 +487,18 @@ static int s2mpu_init(void *data, size_t size) /* The host can concurrently modify 'data'. Copy it to avoid TOCTOU. */ memcpy(&in_mpt, data, sizeof(in_mpt)); + /* + * Only v8/v9 are supported at this point so hardcode the version + * as there is not way to get the version required from the kernel yet, + * v8/v9 are compatible so using any of them will work. + */ + cfg.version = S2MPU_VERSION_8; + /* Get page table operations for this version. */ + mpt_ops = s2mpu_get_mpt_ops(cfg); + /* If version is wrong return. */ + if (!mpt_ops) + return -EINVAL; + /* Take ownership of all SMPT buffers. This will also map them in. 
*/ for_each_gb(gb) { smpt = kern_hyp_va(in_mpt.fmpt[gb].smpt); From 781efc8f25e7cfe233ed45245b509c626e46a13b Mon Sep 17 00:00:00 2001 From: Mostafa Saleh Date: Tue, 15 Nov 2022 11:18:33 +0000 Subject: [PATCH 225/457] ANDROID: KVM: arm64: s2mpu: Add SMPT and MPT functions to pgtable abstraction No functional change Move SMPT and MPT functions to io-pgtable-s2mpu.c as they will need to change later when new version of S2MPU is added Bug: 255731794 Change-Id: Ie890bd4e085c1e23a0d033147f955ba8789b8a28 Signed-off-by: Mostafa Saleh Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_s2mpu.h | 135 ------------------- arch/arm64/kvm/hyp/nvhe/iommu/io-mpt-s2mpu.c | 135 +++++++++++++++++++ 2 files changed, 135 insertions(+), 135 deletions(-) diff --git a/arch/arm64/include/asm/kvm_s2mpu.h b/arch/arm64/include/asm/kvm_s2mpu.h index b1075abd604c..2d9cd5509b16 100644 --- a/arch/arm64/include/asm/kvm_s2mpu.h +++ b/arch/arm64/include/asm/kvm_s2mpu.h @@ -203,13 +203,6 @@ enum mpt_prot { MPT_PROT_MASK = MPT_PROT_RW, }; -static const u64 mpt_prot_doubleword[] = { - [MPT_PROT_NONE] = 0x0000000000000000, - [MPT_PROT_R] = 0x5555555555555555, - [MPT_PROT_W] = 0xaaaaaaaaaaaaaaaa, - [MPT_PROT_RW] = 0xffffffffffffffff, -}; - enum mpt_update_flags { MPT_UPDATE_L1 = BIT(0), MPT_UPDATE_L2 = BIT(1), @@ -226,132 +219,4 @@ struct mpt { struct fmpt fmpt[NR_GIGABYTES]; }; -/* Set protection bits of SMPT in a given range without using memset. */ -static inline void __set_smpt_range_slow(u32 *smpt, size_t start_gb_byte, - size_t end_gb_byte, enum mpt_prot prot) -{ - size_t i, start_word_byte, end_word_byte, word_idx, first_elem, last_elem; - u32 val; - - /* Iterate over u32 words. */ - start_word_byte = start_gb_byte; - while (start_word_byte < end_gb_byte) { - /* Determine the range of bytes covered by this word. */ - word_idx = start_word_byte / SMPT_WORD_BYTE_RANGE; - end_word_byte = min( - ALIGN(start_word_byte + 1, SMPT_WORD_BYTE_RANGE), - end_gb_byte); - - /* Identify protection bit offsets within the word. */ - first_elem = (start_word_byte / SMPT_GRAN) % SMPT_ELEMS_PER_WORD; - last_elem = ((end_word_byte - 1) / SMPT_GRAN) % SMPT_ELEMS_PER_WORD; - - /* Modify the corresponding word. */ - val = READ_ONCE(smpt[word_idx]); - for (i = first_elem; i <= last_elem; i++) { - val &= ~(MPT_PROT_MASK << (i * MPT_PROT_BITS)); - val |= prot << (i * MPT_PROT_BITS); - } - WRITE_ONCE(smpt[word_idx], val); - - start_word_byte = end_word_byte; - } -} - -/* Set protection bits of SMPT in a given range. */ -static inline void __set_smpt_range(u32 *smpt, size_t start_gb_byte, - size_t end_gb_byte, enum mpt_prot prot) -{ - size_t interlude_start, interlude_end, interlude_bytes, word_idx; - char prot_byte = (char)mpt_prot_doubleword[prot]; - - if (start_gb_byte >= end_gb_byte) - return; - - /* Check if range spans at least one full u32 word. */ - interlude_start = ALIGN(start_gb_byte, SMPT_WORD_BYTE_RANGE); - interlude_end = ALIGN_DOWN(end_gb_byte, SMPT_WORD_BYTE_RANGE); - - /* If not, fall back to editing bits in the given range. */ - if (interlude_start >= interlude_end) { - __set_smpt_range_slow(smpt, start_gb_byte, end_gb_byte, prot); - return; - } - - /* Use bit-editing for prologue/epilogue, memset for interlude. 
*/ - word_idx = interlude_start / SMPT_WORD_BYTE_RANGE; - interlude_bytes = (interlude_end - interlude_start) / SMPT_GRAN / SMPT_ELEMS_PER_BYTE; - - __set_smpt_range_slow(smpt, start_gb_byte, interlude_start, prot); - memset(&smpt[word_idx], prot_byte, interlude_bytes); - __set_smpt_range_slow(smpt, interlude_end, end_gb_byte, prot); -} - -/* Returns true if all SMPT protection bits match 'prot'. */ -static inline bool __is_smpt_uniform(u32 *smpt, enum mpt_prot prot) -{ - size_t i; - u64 *doublewords = (u64 *)smpt; - - for (i = 0; i < SMPT_NUM_WORDS / 2; i++) { - if (doublewords[i] != mpt_prot_doubleword[prot]) - return false; - } - return true; -} - -/* - * Set protection bits of FMPT/SMPT in a given range. - * Returns flags specifying whether L1/L2 changes need to be made visible - * to the device. - */ -static inline void __set_fmpt_range(struct fmpt *fmpt, size_t start_gb_byte, - size_t end_gb_byte, enum mpt_prot prot) -{ - if (start_gb_byte == 0 && end_gb_byte >= SZ_1G) { - /* Update covers the entire GB region. */ - if (fmpt->gran_1g && fmpt->prot == prot) { - fmpt->flags = 0; - return; - } - - fmpt->gran_1g = true; - fmpt->prot = prot; - fmpt->flags = MPT_UPDATE_L1; - return; - } - - if (fmpt->gran_1g) { - /* GB region currently uses 1G mapping. */ - if (fmpt->prot == prot) { - fmpt->flags = 0; - return; - } - - /* - * Range has different mapping than the rest of the GB. - * Convert to PAGE_SIZE mapping. - */ - fmpt->gran_1g = false; - __set_smpt_range(fmpt->smpt, 0, start_gb_byte, fmpt->prot); - __set_smpt_range(fmpt->smpt, start_gb_byte, end_gb_byte, prot); - __set_smpt_range(fmpt->smpt, end_gb_byte, SZ_1G, fmpt->prot); - fmpt->flags = MPT_UPDATE_L1 | MPT_UPDATE_L2; - return; - } - - /* GB region currently uses PAGE_SIZE mapping. */ - __set_smpt_range(fmpt->smpt, start_gb_byte, end_gb_byte, prot); - - /* Check if the entire GB region has the same prot bits. */ - if (!__is_smpt_uniform(fmpt->smpt, prot)) { - fmpt->flags = MPT_UPDATE_L2; - return; - } - - fmpt->gran_1g = true; - fmpt->prot = prot; - fmpt->flags = MPT_UPDATE_L1; -} - #endif /* __ARM64_KVM_S2MPU_H__ */ diff --git a/arch/arm64/kvm/hyp/nvhe/iommu/io-mpt-s2mpu.c b/arch/arm64/kvm/hyp/nvhe/iommu/io-mpt-s2mpu.c index 5ec941d842a5..a7715ac8f24e 100644 --- a/arch/arm64/kvm/hyp/nvhe/iommu/io-mpt-s2mpu.c +++ b/arch/arm64/kvm/hyp/nvhe/iommu/io-mpt-s2mpu.c @@ -5,6 +5,141 @@ #include +static const u64 mpt_prot_doubleword[] = { + [MPT_PROT_NONE] = 0x0000000000000000, + [MPT_PROT_R] = 0x5555555555555555, + [MPT_PROT_W] = 0xaaaaaaaaaaaaaaaa, + [MPT_PROT_RW] = 0xffffffffffffffff, +}; + +/* Set protection bits of SMPT in a given range without using memset. */ +static void __set_smpt_range_slow(u32 *smpt, size_t start_gb_byte, + size_t end_gb_byte, enum mpt_prot prot) +{ + size_t i, start_word_byte, end_word_byte, word_idx, first_elem, last_elem; + u32 val; + + /* Iterate over u32 words. */ + start_word_byte = start_gb_byte; + while (start_word_byte < end_gb_byte) { + /* Determine the range of bytes covered by this word. */ + word_idx = start_word_byte / SMPT_WORD_BYTE_RANGE; + end_word_byte = min( + ALIGN(start_word_byte + 1, SMPT_WORD_BYTE_RANGE), + end_gb_byte); + + /* Identify protection bit offsets within the word. */ + first_elem = (start_word_byte / SMPT_GRAN) % SMPT_ELEMS_PER_WORD; + last_elem = ((end_word_byte - 1) / SMPT_GRAN) % SMPT_ELEMS_PER_WORD; + + /* Modify the corresponding word. 
*/ + val = READ_ONCE(smpt[word_idx]); + for (i = first_elem; i <= last_elem; i++) { + val &= ~(MPT_PROT_MASK << (i * MPT_PROT_BITS)); + val |= prot << (i * MPT_PROT_BITS); + } + WRITE_ONCE(smpt[word_idx], val); + + start_word_byte = end_word_byte; + } +} + +/* Set protection bits of SMPT in a given range. */ +static void __set_smpt_range(u32 *smpt, size_t start_gb_byte, + size_t end_gb_byte, enum mpt_prot prot) +{ + size_t interlude_start, interlude_end, interlude_bytes, word_idx; + char prot_byte = (char)mpt_prot_doubleword[prot]; + + if (start_gb_byte >= end_gb_byte) + return; + + /* Check if range spans at least one full u32 word. */ + interlude_start = ALIGN(start_gb_byte, SMPT_WORD_BYTE_RANGE); + interlude_end = ALIGN_DOWN(end_gb_byte, SMPT_WORD_BYTE_RANGE); + + /* If not, fall back to editing bits in the given range. */ + if (interlude_start >= interlude_end) { + __set_smpt_range_slow(smpt, start_gb_byte, end_gb_byte, prot); + return; + } + + /* Use bit-editing for prologue/epilogue, memset for interlude. */ + word_idx = interlude_start / SMPT_WORD_BYTE_RANGE; + interlude_bytes = (interlude_end - interlude_start) / SMPT_GRAN / SMPT_ELEMS_PER_BYTE; + + __set_smpt_range_slow(smpt, start_gb_byte, interlude_start, prot); + memset(&smpt[word_idx], prot_byte, interlude_bytes); + __set_smpt_range_slow(smpt, interlude_end, end_gb_byte, prot); +} + +/* Returns true if all SMPT protection bits match 'prot'. */ +static bool __is_smpt_uniform(u32 *smpt, enum mpt_prot prot) +{ + size_t i; + u64 *doublewords = (u64 *)smpt; + + for (i = 0; i < SMPT_NUM_WORDS / 2; i++) { + if (doublewords[i] != mpt_prot_doubleword[prot]) + return false; + } + return true; +} + +/* + * Set protection bits of FMPT/SMPT in a given range. + * Returns flags specifying whether L1/L2 changes need to be made visible + * to the device. + */ +static void __set_fmpt_range(struct fmpt *fmpt, size_t start_gb_byte, + size_t end_gb_byte, enum mpt_prot prot) +{ + if (start_gb_byte == 0 && end_gb_byte >= SZ_1G) { + /* Update covers the entire GB region. */ + if (fmpt->gran_1g && fmpt->prot == prot) { + fmpt->flags = 0; + return; + } + + fmpt->gran_1g = true; + fmpt->prot = prot; + fmpt->flags = MPT_UPDATE_L1; + return; + } + + if (fmpt->gran_1g) { + /* GB region currently uses 1G mapping. */ + if (fmpt->prot == prot) { + fmpt->flags = 0; + return; + } + + /* + * Range has different mapping than the rest of the GB. + * Convert to PAGE_SIZE mapping. + */ + fmpt->gran_1g = false; + __set_smpt_range(fmpt->smpt, 0, start_gb_byte, fmpt->prot); + __set_smpt_range(fmpt->smpt, start_gb_byte, end_gb_byte, prot); + __set_smpt_range(fmpt->smpt, end_gb_byte, SZ_1G, fmpt->prot); + fmpt->flags = MPT_UPDATE_L1 | MPT_UPDATE_L2; + return; + } + + /* GB region currently uses PAGE_SIZE mapping. */ + __set_smpt_range(fmpt->smpt, start_gb_byte, end_gb_byte, prot); + + /* Check if the entire GB region has the same prot bits. */ + if (!__is_smpt_uniform(fmpt->smpt, prot)) { + fmpt->flags = MPT_UPDATE_L2; + return; + } + + fmpt->gran_1g = true; + fmpt->prot = prot; + fmpt->flags = MPT_UPDATE_L1; +} + static void __set_l1entry_attr_with_prot(void *dev_va, unsigned int gb, unsigned int vid, enum mpt_prot prot) { From fd3eeecd1169eec02fbf438a291ba67a8a484d0e Mon Sep 17 00:00:00 2001 From: Mostafa Saleh Date: Tue, 15 Nov 2022 11:40:51 +0000 Subject: [PATCH 226/457] ANDROID: KVM: arm64: s2mpu: Pass driver version during init Add new a function to explicitly init S2MPU driver from EL1 pkvm_iommu_s2mpu_init. 
Instead of being called implicitly from register function, it now should be called from EL1 driver register function, and it should pass the expected version of S2MPUs. EL2 driver will work only on a set of compatible S2MPUs at run time matching the version at init. Bug: 255731794 Change-Id: I01a39aa357e368b636fe3c9347651e92f3c62fc2 Signed-off-by: Mostafa Saleh Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_host.h | 1 + arch/arm64/include/asm/kvm_s2mpu.h | 1 + arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c | 7 +------ arch/arm64/kvm/iommu/s2mpu.c | 19 +++++++++++-------- 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 8888316043e5..4a9dc8bc8124 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -401,6 +401,7 @@ int pkvm_iommu_register(struct device *dev, struct pkvm_iommu_driver *drv, int pkvm_iommu_suspend(struct device *dev); int pkvm_iommu_resume(struct device *dev); +int pkvm_iommu_s2mpu_init(u32 version); int pkvm_iommu_s2mpu_register(struct device *dev, phys_addr_t pa); int pkvm_iommu_sysmmu_sync_register(struct device *dev, phys_addr_t pa, struct device *parent); diff --git a/arch/arm64/include/asm/kvm_s2mpu.h b/arch/arm64/include/asm/kvm_s2mpu.h index 2d9cd5509b16..fa3cc25f1080 100644 --- a/arch/arm64/include/asm/kvm_s2mpu.h +++ b/arch/arm64/include/asm/kvm_s2mpu.h @@ -217,6 +217,7 @@ struct fmpt { struct mpt { struct fmpt fmpt[NR_GIGABYTES]; + enum s2mpu_version version; }; #endif /* __ARM64_KVM_S2MPU_H__ */ diff --git a/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c b/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c index b76d2cef6919..fbaa950e5856 100644 --- a/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c +++ b/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c @@ -487,12 +487,7 @@ static int s2mpu_init(void *data, size_t size) /* The host can concurrently modify 'data'. Copy it to avoid TOCTOU. */ memcpy(&in_mpt, data, sizeof(in_mpt)); - /* - * Only v8/v9 are supported at this point so hardcode the version - * as there is not way to get the version required from the kernel yet, - * v8/v9 are compatible so using any of them will work. - */ - cfg.version = S2MPU_VERSION_8; + cfg.version = in_mpt.version; /* Get page table operations for this version. */ mpt_ops = s2mpu_get_mpt_ops(cfg); /* If version is wrong return. */ diff --git a/arch/arm64/kvm/iommu/s2mpu.c b/arch/arm64/kvm/iommu/s2mpu.c index cbe28dd8b660..fd63ce1fc6e4 100644 --- a/arch/arm64/kvm/iommu/s2mpu.c +++ b/arch/arm64/kvm/iommu/s2mpu.c @@ -12,7 +12,7 @@ /* For an nvhe symbol get the kernel linear address of it. */ #define ksym_ref_addr_nvhe(x) kvm_ksym_ref(&kvm_nvhe_sym(x)) -static int init_s2mpu_driver(void) +static int init_s2mpu_driver(u32 version) { static DEFINE_MUTEX(lock); static bool init_done; @@ -46,6 +46,7 @@ static int init_s2mpu_driver(void) } mpt->fmpt[gb].smpt = (u32 *)addr; } + mpt->version = version; /* Share MPT descriptor with hyp. 
*/ pfn = __pa(mpt) >> PAGE_SHIFT; @@ -74,17 +75,19 @@ out: mutex_unlock(&lock); return ret; } - -int pkvm_iommu_s2mpu_register(struct device *dev, phys_addr_t addr) +int pkvm_iommu_s2mpu_init(u32 version) { - int ret; - if (!is_protected_kvm_enabled()) return -ENODEV; - ret = init_s2mpu_driver(); - if (ret) - return ret; + return init_s2mpu_driver(version); +} +EXPORT_SYMBOL_GPL(pkvm_iommu_s2mpu_init); + +int pkvm_iommu_s2mpu_register(struct device *dev, phys_addr_t addr) +{ + if (!is_protected_kvm_enabled()) + return -ENODEV; return pkvm_iommu_register(dev, ksym_ref_addr_nvhe(pkvm_s2mpu_driver), addr, S2MPU_MMIO_SIZE, NULL); From 7868d051b03f427a8f023352515137026cf4613e Mon Sep 17 00:00:00 2001 From: Mostafa Saleh Date: Tue, 15 Nov 2022 12:17:32 +0000 Subject: [PATCH 227/457] ANDROID: KVM: arm64: s2mpu: Abstract register initialization with version_ops As initialization sequence differ between versions, add a struct version_ops to abstract init operations This is just a clean before pushing new S2MPU code. No functional changes. Bug: 255731794 Change-Id: I67cc2fb351981280e23860d6f83bdc5632f3abc1 Signed-off-by: Mostafa Saleh Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c | 75 +++++++++++++++++++-------- 1 file changed, 53 insertions(+), 22 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c b/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c index fbaa950e5856..9eca868a0dd5 100644 --- a/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c +++ b/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c @@ -36,12 +36,20 @@ #define for_each_child(child, dev) \ list_for_each_entry((child), &(dev)->children, siblings) +/* HW version-specific operations. */ +struct s2mpu_reg_ops { + int (*init)(struct pkvm_iommu *dev); + void (*set_control_regs)(struct pkvm_iommu *dev); + u32 (*host_mmio_reg_access_mask)(size_t off, bool is_write); +}; + struct s2mpu_drv_data { u32 version; u32 context_cfg_valid_vid; }; static const struct s2mpu_mpt_ops *mpt_ops; +static const struct s2mpu_reg_ops *reg_ops; static struct mpt host_mpt; const struct pkvm_iommu_ops pkvm_s2mpu_ops; @@ -274,7 +282,7 @@ static int initialize_with_prot(struct pkvm_iommu *dev, enum mpt_prot prot) { int ret; - ret = __initialize(dev); + ret = reg_ops->init(dev); if (ret) return ret; @@ -282,7 +290,7 @@ static int initialize_with_prot(struct pkvm_iommu *dev, enum mpt_prot prot) __all_invalidation(dev); /* Set control registers, enable the S2MPU. */ - __set_control_regs(dev); + reg_ops->set_control_regs(dev); return 0; } @@ -294,7 +302,7 @@ static int initialize_with_mpt(struct pkvm_iommu *dev, struct mpt *mpt) { int ret; - ret = __initialize(dev); + ret = reg_ops->init(dev); if (ret) return ret; @@ -302,7 +310,7 @@ static int initialize_with_mpt(struct pkvm_iommu *dev, struct mpt *mpt) __all_invalidation(dev); /* Set control registers, enable the S2MPU. */ - __set_control_regs(dev); + reg_ops->set_control_regs(dev); return 0; } @@ -393,13 +401,12 @@ static int s2mpu_suspend(struct pkvm_iommu *dev) return initialize_with_prot(dev, MPT_PROT_NONE); } -static u32 host_mmio_reg_access_mask(size_t off, bool is_write) +static u32 host_mmio_reg_access_mask_v8_v9(size_t off, bool is_write) { - const u32 no_access = 0; + const u32 no_access = 0; const u32 read_write = (u32)(-1); - const u32 read_only = is_write ? no_access : read_write; + const u32 read_only = is_write ? no_access : read_write; const u32 write_only = is_write ? read_write : no_access; - u32 masked_off; switch (off) { /* Allow reading control registers for debugging. 
*/ @@ -407,19 +414,7 @@ static u32 host_mmio_reg_access_mask(size_t off, bool is_write) return read_only & CTRL0_MASK; case REG_NS_CTRL1: return read_only & CTRL1_MASK; - case REG_NS_CFG: - return read_only & CFG_MASK; - /* Allow EL1 IRQ handler to clear interrupts. */ - case REG_NS_INTERRUPT_CLEAR: - return write_only & ALL_VIDS_BITMAP; - /* Allow reading number of sets used by MPTC. */ - case REG_NS_INFO: - return read_only & INFO_NUM_SET_MASK; - /* Allow EL1 IRQ handler to read bitmap of pending interrupts. */ - case REG_NS_FAULT_STATUS: - return read_only & ALL_VIDS_BITMAP; - /* - * Allow reading MPTC entries for debugging. That involves: + /* Allow reading MPTC entries for debugging. That involves: * - writing (set,way) to READ_MPTC * - reading READ_MPTC_* */ @@ -431,6 +426,30 @@ static u32 host_mmio_reg_access_mask(size_t off, bool is_write) return read_only & READ_MPTC_TAG_OTHERS_MASK; case REG_NS_READ_MPTC_DATA: return read_only; + }; + return no_access; +} + +static u32 host_mmio_reg_access_mask(size_t off, bool is_write) +{ + const u32 no_access = 0; + const u32 read_write = (u32)(-1); + const u32 read_only = is_write ? no_access : read_write; + const u32 write_only = is_write ? read_write : no_access; + u32 masked_off; + + switch (off) { + case REG_NS_CFG: + return read_only & CFG_MASK; + /* Allow EL1 IRQ handler to clear interrupts. */ + case REG_NS_INTERRUPT_CLEAR: + return write_only & ALL_VIDS_BITMAP; + /* Allow reading number of sets used by MPTC. */ + case REG_NS_INFO: + return read_only & INFO_NUM_SET_MASK; + /* Allow EL1 IRQ handler to read bitmap of pending interrupts. */ + case REG_NS_FAULT_STATUS: + return read_only & ALL_VIDS_BITMAP; } /* Allow reading L1ENTRY registers for debugging. */ @@ -445,7 +464,8 @@ static u32 host_mmio_reg_access_mask(size_t off, bool is_write) (masked_off == REG_NS_FAULT_INFO(0))) return read_only; - return no_access; + /* Check version-specific registers. */ + return reg_ops->host_mmio_reg_access_mask(off, is_write); } static bool s2mpu_host_dabt_handler(struct pkvm_iommu *dev, @@ -472,6 +492,12 @@ static bool s2mpu_host_dabt_handler(struct pkvm_iommu *dev, return true; } +const struct s2mpu_reg_ops ops_v8_v9 = { + .init = __initialize, + .host_mmio_reg_access_mask = host_mmio_reg_access_mask_v8_v9, + .set_control_regs = __set_control_regs, +}; + static int s2mpu_init(void *data, size_t size) { struct mpt in_mpt; @@ -488,6 +514,11 @@ static int s2mpu_init(void *data, size_t size) memcpy(&in_mpt, data, sizeof(in_mpt)); cfg.version = in_mpt.version; + if ((in_mpt.version == S2MPU_VERSION_8) || (in_mpt.version == S2MPU_VERSION_9)) + reg_ops = &ops_v8_v9; + else + return -ENODEV; + /* Get page table operations for this version. */ mpt_ops = s2mpu_get_mpt_ops(cfg); /* If version is wrong return. 
*/ From 6b6a75221782ddda7a0747276171bc2737e48173 Mon Sep 17 00:00:00 2001 From: Mostafa Saleh Date: Tue, 15 Nov 2022 12:27:52 +0000 Subject: [PATCH 228/457] ANDROID: KVM: arm64: s2mpu: rename versions to match major arch No functional change Rename versions to match MAJOR_ARCH_VER[31:28] in user manual 0x11000000: is v1 0x20000000: is v2 Bug: 255731794 Change-Id: I30762293a3fab8194b616b4520b047baf3a94790 Signed-off-by: Mostafa Saleh Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_s2mpu.h | 10 ++++---- arch/arm64/kvm/hyp/nvhe/iommu/io-mpt-s2mpu.c | 2 +- arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c | 26 ++++++++++---------- 3 files changed, 19 insertions(+), 19 deletions(-) diff --git a/arch/arm64/include/asm/kvm_s2mpu.h b/arch/arm64/include/asm/kvm_s2mpu.h index fa3cc25f1080..48c405adde96 100644 --- a/arch/arm64/include/asm/kvm_s2mpu.h +++ b/arch/arm64/include/asm/kvm_s2mpu.h @@ -49,8 +49,8 @@ #define CTRL0_ENABLE BIT(0) #define CTRL0_INTERRUPT_ENABLE BIT(1) -#define CTRL0_FAULT_RESP_TYPE_SLVERR BIT(2) /* for v8 */ -#define CTRL0_FAULT_RESP_TYPE_DECERR BIT(2) /* for v9 */ +#define CTRL0_FAULT_RESP_TYPE_SLVERR BIT(2) /* for v1 */ +#define CTRL0_FAULT_RESP_TYPE_DECERR BIT(2) /* for v2 */ #define CTRL0_MASK (CTRL0_ENABLE | \ CTRL0_INTERRUPT_ENABLE | \ CTRL0_FAULT_RESP_TYPE_SLVERR | \ @@ -109,7 +109,7 @@ #define NR_FAULT_INFO_REGS 8 #define FAULT_INFO_VID_MASK GENMASK(26, 24) #define FAULT_INFO_TYPE_MASK GENMASK(23, 21) -#define FAULT_INFO_TYPE_CONTEXT 0x4 /* v9 only */ +#define FAULT_INFO_TYPE_CONTEXT 0x4 /* v2 only */ #define FAULT_INFO_TYPE_AP 0x2 #define FAULT_INFO_TYPE_MPTW 0x1 #define FAULT_INFO_RW_BIT BIT(20) @@ -191,8 +191,8 @@ static_assert(SMPT_GRAN <= PAGE_SIZE); #define for_each_gb_and_vid(gb, vid) for_each_vid((vid)) for_each_gb((gb)) enum s2mpu_version { - S2MPU_VERSION_8 = 0x11000000, - S2MPU_VERSION_9 = 0x20000000, + S2MPU_VERSION_1 = 0x11000000, + S2MPU_VERSION_2 = 0x20000000, }; enum mpt_prot { diff --git a/arch/arm64/kvm/hyp/nvhe/iommu/io-mpt-s2mpu.c b/arch/arm64/kvm/hyp/nvhe/iommu/io-mpt-s2mpu.c index a7715ac8f24e..832368171e05 100644 --- a/arch/arm64/kvm/hyp/nvhe/iommu/io-mpt-s2mpu.c +++ b/arch/arm64/kvm/hyp/nvhe/iommu/io-mpt-s2mpu.c @@ -231,7 +231,7 @@ static const struct s2mpu_mpt_ops this_ops = { const struct s2mpu_mpt_ops *s2mpu_get_mpt_ops(struct s2mpu_mpt_cfg cfg) { - if ((cfg.version == S2MPU_VERSION_8) || (cfg.version == S2MPU_VERSION_9)) + if ((cfg.version == S2MPU_VERSION_1) || (cfg.version == S2MPU_VERSION_2)) return &this_ops; return NULL; diff --git a/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c b/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c index 9eca868a0dd5..f86a525344e8 100644 --- a/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c +++ b/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c @@ -106,7 +106,7 @@ static u32 __context_cfg_valid_vid(struct pkvm_iommu *dev, u32 vid_bmap) return res; } -static int __initialize_v9(struct pkvm_iommu *dev) +static int __initialize_v2(struct pkvm_iommu *dev) { u32 ssmt_valid_vid_bmap, ctx_cfg; @@ -119,7 +119,7 @@ static int __initialize_v9(struct pkvm_iommu *dev) /* * Write CONTEXT_CFG_VALID_VID configuration before touching L1ENTRY* * registers. Writes to those registers are ignored unless there is - * a context ID allocated to the corresponding VID (v9 only). + * a context ID allocated to the corresponding VID (v2 only). 
*/ writel_relaxed(ctx_cfg, dev->va + REG_NS_CONTEXT_CFG_VALID_VID); return 0; @@ -133,10 +133,10 @@ static int __initialize(struct pkvm_iommu *dev) data->version = readl_relaxed(dev->va + REG_NS_VERSION); switch (data->version & VERSION_CHECK_MASK) { - case S2MPU_VERSION_8: + case S2MPU_VERSION_1: return 0; - case S2MPU_VERSION_9: - return __initialize_v9(dev); + case S2MPU_VERSION_2: + return __initialize_v2(dev); default: return -EINVAL; } @@ -166,7 +166,7 @@ static void __set_control_regs(struct pkvm_iommu *dev) irq_vids = ALL_VIDS_BITMAP; /* Return SLVERR/DECERR to device on permission fault. */ - ctrl0 |= is_version(dev, S2MPU_VERSION_9) ? CTRL0_FAULT_RESP_TYPE_DECERR + ctrl0 |= is_version(dev, S2MPU_VERSION_2) ? CTRL0_FAULT_RESP_TYPE_DECERR : CTRL0_FAULT_RESP_TYPE_SLVERR; writel_relaxed(irq_vids, dev->va + REG_NS_INTERRUPT_ENABLE_PER_VID_SET); @@ -248,8 +248,8 @@ static void __invalidation_barrier_complete(struct pkvm_iommu *dev) __invalidation_barrier_slow(sync); } - /* Must not access SFRs while S2MPU is busy invalidating (v9 only). */ - if (is_version(dev, S2MPU_VERSION_9)) { + /* Must not access SFRs while S2MPU is busy invalidating (v2 only). */ + if (is_version(dev, S2MPU_VERSION_2)) { __wait_while(dev->va + REG_NS_STATUS, STATUS_BUSY | STATUS_ON_INVALIDATING); } @@ -401,7 +401,7 @@ static int s2mpu_suspend(struct pkvm_iommu *dev) return initialize_with_prot(dev, MPT_PROT_NONE); } -static u32 host_mmio_reg_access_mask_v8_v9(size_t off, bool is_write) +static u32 host_mmio_reg_access_mask_v1_v2(size_t off, bool is_write) { const u32 no_access = 0; const u32 read_write = (u32)(-1); @@ -492,9 +492,9 @@ static bool s2mpu_host_dabt_handler(struct pkvm_iommu *dev, return true; } -const struct s2mpu_reg_ops ops_v8_v9 = { +const struct s2mpu_reg_ops ops_v1_v2 = { .init = __initialize, - .host_mmio_reg_access_mask = host_mmio_reg_access_mask_v8_v9, + .host_mmio_reg_access_mask = host_mmio_reg_access_mask_v1_v2, .set_control_regs = __set_control_regs, }; @@ -514,8 +514,8 @@ static int s2mpu_init(void *data, size_t size) memcpy(&in_mpt, data, sizeof(in_mpt)); cfg.version = in_mpt.version; - if ((in_mpt.version == S2MPU_VERSION_8) || (in_mpt.version == S2MPU_VERSION_9)) - reg_ops = &ops_v8_v9; + if ((cfg.version == S2MPU_VERSION_1) || (cfg.version == S2MPU_VERSION_2)) + reg_ops = &ops_v1_v2; else return -ENODEV; From 61ca64906f8e2878395733a630bb1f293e78be16 Mon Sep 17 00:00:00 2001 From: Mostafa Saleh Date: Tue, 15 Nov 2022 12:37:23 +0000 Subject: [PATCH 229/457] ANDROID: KVM: arm64: s2mpu: Add MMIO and defines for V9 S2MPU No functional change This commit defines the new registers, fields and constants of V9 S2MPU Bug: 255731794 Change-Id: Ic2e0f276d83604efd7534f74de862df05d4d5696 Signed-off-by: Mostafa Saleh Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_s2mpu.h | 180 +++++++++++++++++++++++++++++ 1 file changed, 180 insertions(+) diff --git a/arch/arm64/include/asm/kvm_s2mpu.h b/arch/arm64/include/asm/kvm_s2mpu.h index 48c405adde96..13104e2f18d7 100644 --- a/arch/arm64/include/asm/kvm_s2mpu.h +++ b/arch/arm64/include/asm/kvm_s2mpu.h @@ -22,6 +22,185 @@ #define ALL_VIDS_BITMAP GENMASK(NR_VIDS - 1, 0) + +/* + * S2MPU V9 specific values (some new and some different from old versions) + * to avoid any confusion all names are prefixed with V9. 
+ */ +#define REG_NS_V9_CTRL_PROT_EN_PER_VID_SET 0x50 +#define REG_NS_V9_CTRL_ERR_RESP_T_PER_VID_SET 0x70 +#define REG_NS_V9_CFG_MPTW_ATTRIBUTE 0x10 + +#define REG_NS_V9_READ_MPTC 0x3014 +#define REG_NS_V9_READ_MPTC_TAG_PPN 0x3018 +#define REG_NS_V9_READ_MPTC_TAG_OTHERS 0x301C +#define REG_NS_V9_READ_MPTC_DATA 0x3020 +#define REG_NS_V9_READ_PTLB 0x3030 +#define REG_NS_V9_READ_PTLB_TAG 0x3034 +#define REG_NS_V9_READ_PTLB_DATA_S1_EN_PPN_AP 0x3040 +#define REG_NS_V9_READ_PTLB_DATA_S1_DIS_AP_LIST 0x3044 +#define REG_NS_V9_PMMU_INDICATOR 0x3050 +#define REG_NS_V9_PMMU_INFO 0x3100 +#define REG_NS_V9_PMMU_PTLB_INFO(n) (0x3400 + (n)*0x4) +#define REG_NS_V9_SWALKER_INFO 0x3104 +#define REG_NS_V9_MPTC_INFO 0x3C00 + +/* V9 Masks */ +#define V9_READ_MPTC_TAG_PPN_VALID_MASK BIT(28) +#define V9_READ_MPTC_TAG_PPN_TPN_PPN_MASK GENMASK(23, 0) +#define V9_READ_MPTC_TAG_PPN_MASK (V9_READ_MPTC_TAG_PPN_VALID_MASK | \ + V9_READ_MPTC_TAG_PPN_TPN_PPN_MASK) + +#define V9_READ_MPTC_TAG_OTHERS_VID_MASK GENMASK(10, 8) +#define V9_READ_MPTC_TAG_OTHERS_PAGE_GRAN_MASK GENMASK(5, 4) +#define V9_READ_MPTC_TAG_OTHERS_MASK (V9_READ_MPTC_TAG_OTHERS_VID_MASK | \ + V9_READ_MPTC_TAG_OTHERS_PAGE_GRAN_MASK) + +#define V9_READ_PTLB_WAY_MASK GENMASK(31, 24) +#define V9_READ_PTLB_SET_MASK GENMASK(23, 16) +#define V9_READ_PTLB_PTLB_MASK GENMASK(15, 4) +#define V9_READ_PTLB_PMMU_MASK GENMASK(3, 0) +#define V9_READ_PTLB_MASK (V9_READ_PTLB_WAY_MASK | V9_READ_PTLB_SET_MASK | \ + V9_READ_PTLB_PTLB_MASK | V9_READ_PTLB_PMMU_MASK) + +#define V9_READ_PTLB_TAG_VALID_MASK BIT(31) +#define V9_READ_PTLB_TAG_PAGE_SIZE_MASK GENMASK(30, 28) +#define V9_READ_PTLB_TAG_STAGE1_ENABLED_MASK BIT(27) +#define V9_READ_PTLB_TAG_VID_MASK GENMASK(26, 24) +#define V9_READ_PTLB_TAG_TPN_MASK GENMASK(23, 0) +#define V9_READ_PTLB_TAG_MASK (V9_READ_PTLB_TAG_VALID_MASK | \ + V9_READ_PTLB_TAG_TPN_MASK | \ + V9_READ_PTLB_TAG_VID_MASK | \ + V9_READ_PTLB_TAG_PAGE_SIZE_MASK | \ + V9_READ_PTLB_TAG_STAGE1_ENABLED_MASK) + +#define V9_READ_PTLB_DTA_S1_EN_PPN_AP_S2AP_MASK GENMASK(25, 24) +#define V9_READ_PTLB_DTA_S1_EN_PPN_AP_PPN_MASK GENMASK(23, 0) + +#define V9_READ_PTLB_DATA_S1_ENABLE_PPN_AP_MASK (V9_READ_PTLB_DTA_S1_EN_PPN_AP_S2AP_MASK | \ + V9_READ_PTLB_DTA_S1_EN_PPN_AP_PPN_MASK) + +#define V9_READ_MPTC_INFO_NUM_MPTC_SET GENMASK(31, 16) +#define V9_READ_MPTC_INFO_NUM_MPTC_WAY GENMASK(15, 12) +#define V9_READ_MPTC_INFO_MASK (V9_READ_MPTC_INFO_NUM_MPTC_SET | \ + V9_READ_MPTC_INFO_NUM_MPTC_SET) + +#define V9_READ_PMMU_INFO_NUM_PTLB GENMASK(15, 1) +#define V9_READ_PMMU_INFO_VA_WIDTH BIT(0) +#define V9_READ_PMMU_INFO_NUM_STREAM_TABLE GENMASK(31, 16) +#define V9_READ_PMMU_INFO_MASK (V9_READ_PMMU_INFO_NUM_PTLB | \ + V9_READ_PMMU_INFO_VA_WIDTH | \ + V9_READ_PMMU_INFO_NUM_STREAM_TABLE) + +#define V9_READ_PMMU_PTLB_INFO_NUM_WAY GENMASK(31, 16) +#define V9_READ_PMMU_PTLB_INFO_NUM_SET GENMASK(15, 0) +#define V9_READ_PMMU_PTLB_INFO_MASK (V9_READ_PMMU_PTLB_INFO_NUM_WAY | \ + V9_READ_PMMU_PTLB_INFO_NUM_SET) + +#define V9_READ_PMMU_INDICATOR_PMMU_NUM GENMASK(3, 0) +#define V9_READ_PMMU_INDICATOR_MASK V9_READ_PMMU_INDICATOR_PMMU_NUM + +#define V9_READ_MPTC_WAY_MASK GENMASK(17, 16) +#define V9_READ_MPTC_SET_MASK GENMASK(15, 0) +#define V9_READ_MPTC_MASK (V9_READ_MPTC_WAY_MASK | \ + V9_READ_MPTC_SET_MASK) +#define V9_READ_MPTC_WAY(way) FIELD_PREP(V9_READ_MPTC_WAY_MASK, (way)) +#define V9_READ_MPTC_SET(set) FIELD_PREP(V9_READ_MPTC_SET_MASK, (set)) +#define V9_READ_MPTC(set, way) (V9_READ_MPTC_SET(set) | V9_READ_MPTC_WAY(way)) + +#define V9_READ_PTLB_WAY(x) FIELD_PREP(V9_READ_PTLB_WAY_MASK, 
(x)) +#define V9_READ_PTLB_SET(x) FIELD_PREP(V9_READ_PTLB_SET_MASK, (x)) +#define V9_READ_PTLB_PTLB(x) FIELD_PREP(V9_READ_PTLB_PTLB_MASK, (x)) +#define V9_READ_PTLB_PMMU(x) FIELD_PREP(V9_READ_PTLB_PMMU_MASK, (x)) +#define V9_READ_PTLB(pu_i, pb_i, s, w) (V9_READ_PTLB_WAY(w) | V9_READ_PTLB_SET(s) | \ + V9_READ_PTLB_PTLB(pb_i) | V9_READ_PTLB_PMMU(pu_i)) + +#define V9_READ_SLTB_INFO_SET_MASK GENMASK(15, 0) +#define V9_READ_SLTB_INFO_WAY_MASK GENMASK(31, 16) +#define V9_READ_SLTB_INFO_MASK (V9_READ_SLTB_INFO_SET_MASK | \ + V9_READ_SLTB_INFO_WAY_MASK) + +#define V9_SWALKER_INFO_NUM_STLB_MASK GENMASK(31, 16) +#define V9_SWALKER_INFO_NUM_PMMU_MASK GENMASK(15, 0) +#define V9_SWALKER_INFO_MASK (V9_SWALKER_INFO_NUM_STLB_MASK | \ + V9_SWALKER_INFO_NUM_PMMU_MASK) + +/* + * STLB has 2 types: A,B based on how S2MPU is connected + * registers or masks that vary based on type are suffixed with + * either TYPEA or TYPEB. + */ +#define REG_NS_V9_READ_STLB 0x3000 +#define REG_NS_V9_READ_STLB_TPN 0x3004 +#define REG_NS_V9_READ_STLB_TAG_PPN 0x3008 +#define REG_NS_V9_READ_STLB_TAG_OTHERS 0x300C +#define REG_NS_V9_READ_STLB_DATA 0x3010 +#define REG_NS_V9_STLB_INFO(n) (0x3800 + (n)*0x4) + +#define V9_READ_STLB_SET_MASK_TYPEA GENMASK(7, 0) +#define V9_READ_STLB_WAY_MASK_TYPEA GENMASK(15, 8) +#define V9_READ_STLB_SUBLINE_MASK_TYPEA GENMASK(31, 20) +#define V9_READ_STLB_STLBID_MASK_TYPEA GENMASK(17, 16) +#define V9_READ_STLB_MASK_TYPEA (V9_READ_STLB_SET_MASK_TYPEA | \ + V9_READ_STLB_WAY_MASK_TYPEA | \ + V9_READ_STLB_SUBLINE_MASK_TYPEA | \ + V9_READ_STLB_STLBID_MASK_TYPEA) + +#define V9_READ_STLB_SET_MASK_TYPEB GENMASK(15, 0) +#define V9_READ_STLB_WAY_MASK_TYPEB GENMASK(17, 16) +#define V9_READ_STLB_STLBID_MASK_TYPEB GENMASK(31, 20) +#define V9_READ_STLB_MASK_TYPEB (V9_READ_STLB_SET_MASK_TYPEB | \ + V9_READ_STLB_WAY_MASK_TYPEB | \ + V9_READ_STLB_STLBID_MASK_TYPEB) + +#define V9_READ_STLB_TPN_TPN_MASK GENMASK(23, 0) +#define V9_READ_STLB_TPN_S2VALID_MASK BIT(24) +#define V9_READ_STLB_TPN_STAGE1_ENABLED_MASK BIT(27) +#define V9_READ_STLB_TPN_VALID_MASK BIT(28) +#define V9_READ_STLB_TPN_MASK (V9_READ_STLB_TPN_TPN_MASK | \ + V9_READ_STLB_TPN_S2VALID_MASK | \ + V9_READ_STLB_TPN_STAGE1_ENABLED_MASK | \ + V9_READ_STLB_TPN_VALID_MASK) + +#define V9_READ_STLB_TAG_PPN_VALID_MASK_TYPEB BIT(28) +#define V9_READ_STLB_TAG_PPN_PPN_MASK GENMASK(23, 0) +#define V9_READ_STLB_TAG_PPN_MASK (V9_READ_STLB_TAG_PPN_PPN_MASK | \ + V9_READ_STLB_TAG_PPN_VALID_MASK_TYPEB) + +#define V9_READ_STLB_TAG_OTHERS_S2AP_MASK_TYPEA GENMASK(1, 0) +#define V9_READ_STLB_TAG_OTHERS_PS_MASK GENMASK(10, 8) +#define V9_READ_STLB_TAG_OTHERS_BPS_MASK BIT(12) +#define V9_READ_STLB_TAG_OTHERS_VID_MASK GENMASK(23, 20) +#define V9_READ_STLB_TAG_OTHERS_MASK (V9_READ_STLB_TAG_OTHERS_S2AP_MASK_TYPEA | \ + V9_READ_STLB_TAG_OTHERS_PS_MASK | \ + V9_READ_STLB_TAG_OTHERS_BPS_MASK | \ + V9_READ_STLB_TAG_OTHERS_VID_MASK) + +#define V9_READ_STLB_WAY_TYPEA(x) FIELD_PREP(V9_READ_STLB_WAY_MASK_TYPEA, (x)) +#define V9_READ_STLB_SET_TYPEA(x) FIELD_PREP(V9_READ_STLB_SET_MASK_TYPEA, (x)) +#define V9_READ_STLB_STLBID_TYPEA(x) FIELD_PREP(V9_READ_STLB_STLBID_MASK_TYPEA, (x)) +#define V9_READ_STLB_SUBLINE_TYPEA(x) FIELD_PREP(V9_READ_STLB_SUBLINE_MASK_TYPEA, (x)) + +#define V9_READ_STLB_TYPEA(s_i, sub, s, w) (V9_READ_STLB_WAY_TYPEA(w) | \ + V9_READ_STLB_SET_TYPEA(s) | \ + V9_READ_STLB_STLBID_TYPEA(s_i) | \ + V9_READ_STLB_SUBLINE_TYPEA(sub)) + +#define V9_READ_STLB_WAY_TYPEB(x) FIELD_PREP(V9_READ_STLB_WAY_MASK_TYPEB, (x)) +#define V9_READ_STLB_SET_TYPEB(x) 
FIELD_PREP(V9_READ_STLB_SET_MASK_TYPEB, (x)) +#define V9_READ_STLB_STLBID_TYPEB(x) FIELD_PREP(V9_READ_STLB_STLBID_MASK_TYPEB, (x)) + +#define V9_READ_STLB_TYPEB(s_i, s, w) (V9_READ_STLB_WAY_TYPEB(w) | \ + V9_READ_STLB_SET_TYPEB(s) | \ + V9_READ_STLB_STLBID_TYPEB(s_i)) + +#define V9_MAX_PTLB_NUM 0x100 +#define V9_MAX_STLB_NUM 0x100 + +#define V9_L1ENTRY_ATTR_GRAN_MASK BIT(3) +#define V9_MPT_PROT_BITS 4 +#define V9_MPT_ACCESS_SHIFT 2 + #define REG_NS_CTRL0 0x0 #define REG_NS_CTRL1 0x4 #define REG_NS_CFG 0x10 @@ -193,6 +372,7 @@ static_assert(SMPT_GRAN <= PAGE_SIZE); enum s2mpu_version { S2MPU_VERSION_1 = 0x11000000, S2MPU_VERSION_2 = 0x20000000, + S2MPU_VERSION_9 = 0x90000000, }; enum mpt_prot { From e3262b342c7dead031ac7c5792a8a358cffa7ab8 Mon Sep 17 00:00:00 2001 From: Mostafa Saleh Date: Tue, 15 Nov 2022 13:03:36 +0000 Subject: [PATCH 230/457] ANDROID: KVM: arm64: s2mpu: S2MPU V9 code Add S2MPU V9 code with current page table ops and version ops. Most SMPT_* macros are now function of protection bits To keep logic modification minimal and avoid duplicate code SMPT and FMPT function are kept the same and the values that changed between S2MPU versions are used as variables instead of macros Bug: 255731794 Change-Id: I2a1b8bab630032d8c923c23e96e1182ce5f734ff Signed-off-by: Mostafa Saleh Signed-off-by: Quentin Perret --- arch/arm64/include/asm/io-mpt-s2mpu.h | 2 + arch/arm64/include/asm/kvm_s2mpu.h | 59 +++++++-- arch/arm64/kvm/hyp/nvhe/iommu/io-mpt-s2mpu.c | 119 ++++++++++++++++--- arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c | 104 ++++++++++++++-- arch/arm64/kvm/iommu/s2mpu.c | 5 +- 5 files changed, 249 insertions(+), 40 deletions(-) diff --git a/arch/arm64/include/asm/io-mpt-s2mpu.h b/arch/arm64/include/asm/io-mpt-s2mpu.h index 382422b26ed6..0dfff4c08ec8 100644 --- a/arch/arm64/include/asm/io-mpt-s2mpu.h +++ b/arch/arm64/include/asm/io-mpt-s2mpu.h @@ -15,11 +15,13 @@ struct s2mpu_mpt_cfg { }; struct s2mpu_mpt_ops { + u32 (*smpt_size)(void); void (*init_with_prot)(void *dev_va, enum mpt_prot prot); void (*init_with_mpt)(void *dev_va, struct mpt *mpt); void (*apply_range)(void *dev_va, struct mpt *mpt, u32 first_gb, u32 last_gb); void (*prepare_range)(struct mpt *mpt, phys_addr_t first_byte, phys_addr_t last_byte, enum mpt_prot prot); + int (*pte_from_addr_smpt)(u32 *smpt, u64 addr); }; const struct s2mpu_mpt_ops *s2mpu_get_mpt_ops(struct s2mpu_mpt_cfg cfg); diff --git a/arch/arm64/include/asm/kvm_s2mpu.h b/arch/arm64/include/asm/kvm_s2mpu.h index 13104e2f18d7..674963c879d1 100644 --- a/arch/arm64/include/asm/kvm_s2mpu.h +++ b/arch/arm64/include/asm/kvm_s2mpu.h @@ -197,10 +197,32 @@ #define V9_MAX_PTLB_NUM 0x100 #define V9_MAX_STLB_NUM 0x100 -#define V9_L1ENTRY_ATTR_GRAN_MASK BIT(3) -#define V9_MPT_PROT_BITS 4 +#define V9_CTRL0_DIS_CHK_S1L1PTW_MASK BIT(0) +#define V9_CTRL0_DIS_CHK_S1L2PTW_MASK BIT(1) +#define V9_CTRL0_DIS_CHK_USR_MARCHED_REQ_MASK BIT(3) +#define V9_CTRL0_FAULT_MODE_MASK BIT(4) +#define V9_CTRL0_ENF_FLT_MODE_S1_NONSEC_MASK BIT(5) +#define V9_CTRL0_DESTRUCTIVE_AP_CHK_MODE_MASK BIT(6) +#define V9_CTRL0_MASK (V9_CTRL0_DIS_CHK_S1L1PTW_MASK | \ + V9_CTRL0_DESTRUCTIVE_AP_CHK_MODE_MASK | \ + V9_CTRL0_DIS_CHK_USR_MARCHED_REQ_MASK | \ + V9_CTRL0_DIS_CHK_S1L2PTW_MASK | \ + V9_CTRL0_ENF_FLT_MODE_S1_NONSEC_MASK | \ + V9_CTRL0_FAULT_MODE_MASK) + +/* + * S2MPU V9 specific values (some new and some different from old versions) + * to avoid any confusion all names are prefixed with V9. 
+ */ +#define V9_L1ENTRY_ATTR_GRAN_MASK BIT(3) +#define V9_MPT_PROT_BITS 4 #define V9_MPT_ACCESS_SHIFT 2 +/* V1,V2 variants. */ +#define MPT_ACCESS_SHIFT 0 +#define L1ENTRY_ATTR_GRAN_MASK GENMASK(5, 4) +#define MPT_PROT_BITS 2 + #define REG_NS_CTRL0 0x0 #define REG_NS_CTRL1 0x4 #define REG_NS_CFG 0x10 @@ -316,12 +338,11 @@ #define L1ENTRY_ATTR_GRAN_4K 0x0 #define L1ENTRY_ATTR_GRAN_64K 0x1 #define L1ENTRY_ATTR_GRAN_2M 0x2 +#define L1ENTRY_ATTR_GRAN(gran, msk) FIELD_PREP(msk, gran) #define L1ENTRY_ATTR_PROT_MASK GENMASK(2, 1) -#define L1ENTRY_ATTR_GRAN_MASK GENMASK(5, 4) #define L1ENTRY_ATTR_PROT(prot) FIELD_PREP(L1ENTRY_ATTR_PROT_MASK, prot) -#define L1ENTRY_ATTR_GRAN(gran) FIELD_PREP(L1ENTRY_ATTR_GRAN_MASK, gran) #define L1ENTRY_ATTR_1G(prot) L1ENTRY_ATTR_PROT(prot) -#define L1ENTRY_ATTR_L2(gran) (L1ENTRY_ATTR_GRAN(gran) | \ +#define L1ENTRY_ATTR_L2(gran, msk) (L1ENTRY_ATTR_GRAN(gran, msk) | \ L1ENTRY_ATTR_L2TABLE_EN) #define NR_GIGABYTES 64 @@ -339,16 +360,19 @@ #endif static_assert(SMPT_GRAN <= PAGE_SIZE); -#define MPT_PROT_BITS 2 + #define SMPT_WORD_SIZE sizeof(u32) -#define SMPT_ELEMS_PER_BYTE (BITS_PER_BYTE / MPT_PROT_BITS) -#define SMPT_ELEMS_PER_WORD (SMPT_WORD_SIZE * SMPT_ELEMS_PER_BYTE) -#define SMPT_WORD_BYTE_RANGE (SMPT_GRAN * SMPT_ELEMS_PER_WORD) +#define SMPT_ELEMS_PER_BYTE(prot_bits) (BITS_PER_BYTE / (prot_bits)) +#define SMPT_ELEMS_PER_WORD(prot_bits) (SMPT_WORD_SIZE * SMPT_ELEMS_PER_BYTE(prot_bits)) +#define SMPT_WORD_BYTE_RANGE(prot_bits) (SMPT_GRAN * SMPT_ELEMS_PER_WORD(prot_bits)) #define SMPT_NUM_ELEMS (SZ_1G / SMPT_GRAN) -#define SMPT_SIZE (SMPT_NUM_ELEMS / SMPT_ELEMS_PER_BYTE) -#define SMPT_NUM_WORDS (SMPT_SIZE / SMPT_WORD_SIZE) -#define SMPT_NUM_PAGES (SMPT_SIZE / PAGE_SIZE) -#define SMPT_ORDER get_order(SMPT_SIZE) +#define SMPT_SIZE(prot_bits) (SMPT_NUM_ELEMS / SMPT_ELEMS_PER_BYTE(prot_bits)) +#define SMPT_NUM_WORDS(prot_bits) (SMPT_SIZE(prot_bits) / SMPT_WORD_SIZE) +#define SMPT_NUM_PAGES(prot_bits) (SMPT_SIZE(prot_bits) / PAGE_SIZE) +#define SMPT_ORDER(prot_bits) get_order(SMPT_SIZE(prot_bits)) + + +#define SMPT_GRAN_MASK GENMASK(1, 0) /* SysMMU_SYNC registers, relative to SYSMMU_SYNC_S2_OFFSET. */ #define REG_NS_SYNC_CMD 0x0 @@ -375,6 +399,15 @@ enum s2mpu_version { S2MPU_VERSION_9 = 0x90000000, }; +static inline int smpt_order_from_version(enum s2mpu_version version) +{ + if (version == S2MPU_VERSION_9) + return SMPT_ORDER(V9_MPT_PROT_BITS); + else if ((version == S2MPU_VERSION_1) || (version == S2MPU_VERSION_2)) + return SMPT_ORDER(MPT_PROT_BITS); + BUG(); +} + enum mpt_prot { MPT_PROT_NONE = 0, MPT_PROT_R = BIT(0), diff --git a/arch/arm64/kvm/hyp/nvhe/iommu/io-mpt-s2mpu.c b/arch/arm64/kvm/hyp/nvhe/iommu/io-mpt-s2mpu.c index 832368171e05..e101c4c4c0b4 100644 --- a/arch/arm64/kvm/hyp/nvhe/iommu/io-mpt-s2mpu.c +++ b/arch/arm64/kvm/hyp/nvhe/iommu/io-mpt-s2mpu.c @@ -5,6 +5,37 @@ #include +#define GRAN_BYTE(gran) ((gran << V9_MPT_PROT_BITS) | (gran)) +#define GRAN_HWORD(gran) ((GRAN_BYTE(gran) << 8) | (GRAN_BYTE(gran))) +#define GRAN_WORD(gran) (((u32)(GRAN_HWORD(gran) << 16) | (GRAN_HWORD(gran)))) +#define GRAN_DWORD(gran) ((u64)((u64)GRAN_WORD(gran) << 32) | (u64)(GRAN_WORD(gran))) + +#define SMPT_NUM_TO_BYTE(x) ((x) / SMPT_GRAN / SMPT_ELEMS_PER_BYTE(config_prot_bits)) +#define BYTE_TO_SMPT_INDEX(x) ((x) / SMPT_WORD_BYTE_RANGE(config_prot_bits)) + + +/* + * MPT table ops can be configured only for one version at runtime, + * these variables will hold version specific data set a run time init, to avoid + * having duplicate code or unnessery check during operations. 
+ */ +static u32 config_prot_bits; +static u32 config_access_shift; +static const u64 *config_lut_prot; +static u32 config_gran_mask; +static u32 this_version; + +/* + * page table entries for different protection look up table + * granularity is compile time config, so we can do this also for + * this array without having duplicate arrays + */ +static const u64 v9_mpt_prot_doubleword[] = { + [MPT_PROT_NONE] = 0x0000000000000000 | GRAN_DWORD(SMPT_GRAN_ATTR), + [MPT_PROT_R] = 0x4444444444444444 | GRAN_DWORD(SMPT_GRAN_ATTR), + [MPT_PROT_W] = 0x8888888888888888 | GRAN_DWORD(SMPT_GRAN_ATTR), + [MPT_PROT_RW] = 0xcccccccccccccccc | GRAN_DWORD(SMPT_GRAN_ATTR), +}; static const u64 mpt_prot_doubleword[] = { [MPT_PROT_NONE] = 0x0000000000000000, [MPT_PROT_R] = 0x5555555555555555, @@ -12,6 +43,25 @@ static const u64 mpt_prot_doubleword[] = { [MPT_PROT_RW] = 0xffffffffffffffff, }; +static inline int pte_from_addr_smpt(u32 *smpt, u64 addr) +{ + u32 word_idx, idx, pte, val; + + word_idx = BYTE_TO_SMPT_INDEX(addr); + val = READ_ONCE(smpt[word_idx]); + idx = (addr / SMPT_GRAN) % SMPT_ELEMS_PER_WORD(config_prot_bits); + + pte = (val >> (idx * config_prot_bits)) & ((1 << config_prot_bits)-1); + return pte; +} + +static inline int prot_from_addr_smpt(u32 *smpt, u64 addr) +{ + int pte = pte_from_addr_smpt(smpt, addr); + + return (pte >> config_access_shift); +} + /* Set protection bits of SMPT in a given range without using memset. */ static void __set_smpt_range_slow(u32 *smpt, size_t start_gb_byte, size_t end_gb_byte, enum mpt_prot prot) @@ -23,20 +73,21 @@ static void __set_smpt_range_slow(u32 *smpt, size_t start_gb_byte, start_word_byte = start_gb_byte; while (start_word_byte < end_gb_byte) { /* Determine the range of bytes covered by this word. */ - word_idx = start_word_byte / SMPT_WORD_BYTE_RANGE; + word_idx = BYTE_TO_SMPT_INDEX(start_word_byte); end_word_byte = min( - ALIGN(start_word_byte + 1, SMPT_WORD_BYTE_RANGE), + ALIGN(start_word_byte + 1, SMPT_WORD_BYTE_RANGE(config_prot_bits)), end_gb_byte); /* Identify protection bit offsets within the word. */ - first_elem = (start_word_byte / SMPT_GRAN) % SMPT_ELEMS_PER_WORD; - last_elem = ((end_word_byte - 1) / SMPT_GRAN) % SMPT_ELEMS_PER_WORD; + first_elem = (start_word_byte / SMPT_GRAN) % SMPT_ELEMS_PER_WORD(config_prot_bits); + last_elem = + ((end_word_byte - 1) / SMPT_GRAN) % SMPT_ELEMS_PER_WORD(config_prot_bits); /* Modify the corresponding word. */ val = READ_ONCE(smpt[word_idx]); for (i = first_elem; i <= last_elem; i++) { - val &= ~(MPT_PROT_MASK << (i * MPT_PROT_BITS)); - val |= prot << (i * MPT_PROT_BITS); + val &= ~(MPT_PROT_MASK << (i * config_prot_bits + config_access_shift)); + val |= prot << (i * config_prot_bits + config_access_shift); } WRITE_ONCE(smpt[word_idx], val); @@ -49,25 +100,33 @@ static void __set_smpt_range(u32 *smpt, size_t start_gb_byte, size_t end_gb_byte, enum mpt_prot prot) { size_t interlude_start, interlude_end, interlude_bytes, word_idx; - char prot_byte = (char)mpt_prot_doubleword[prot]; + + char prot_byte = (char)config_lut_prot[prot]; if (start_gb_byte >= end_gb_byte) return; /* Check if range spans at least one full u32 word. */ - interlude_start = ALIGN(start_gb_byte, SMPT_WORD_BYTE_RANGE); - interlude_end = ALIGN_DOWN(end_gb_byte, SMPT_WORD_BYTE_RANGE); + interlude_start = ALIGN(start_gb_byte, SMPT_WORD_BYTE_RANGE(config_prot_bits)); + interlude_end = ALIGN_DOWN(end_gb_byte, SMPT_WORD_BYTE_RANGE(config_prot_bits)); - /* If not, fall back to editing bits in the given range. 
*/ + /* + * If not, fall back to editing bits in the given range. + * sets bit for PTEs that are in less than 32 bits (can't be done by memset) + */ if (interlude_start >= interlude_end) { __set_smpt_range_slow(smpt, start_gb_byte, end_gb_byte, prot); return; } /* Use bit-editing for prologue/epilogue, memset for interlude. */ - word_idx = interlude_start / SMPT_WORD_BYTE_RANGE; - interlude_bytes = (interlude_end - interlude_start) / SMPT_GRAN / SMPT_ELEMS_PER_BYTE; + word_idx = BYTE_TO_SMPT_INDEX(interlude_start); + interlude_bytes = SMPT_NUM_TO_BYTE(interlude_end - interlude_start); + /* + * These are pages in the start and at then end that are + * not part of full 32 bit SMPT word. + */ __set_smpt_range_slow(smpt, start_gb_byte, interlude_start, prot); memset(&smpt[word_idx], prot_byte, interlude_bytes); __set_smpt_range_slow(smpt, interlude_end, end_gb_byte, prot); @@ -79,8 +138,8 @@ static bool __is_smpt_uniform(u32 *smpt, enum mpt_prot prot) size_t i; u64 *doublewords = (u64 *)smpt; - for (i = 0; i < SMPT_NUM_WORDS / 2; i++) { - if (doublewords[i] != mpt_prot_doubleword[prot]) + for (i = 0; i < SMPT_NUM_WORDS(config_prot_bits) / 2; i++) { + if (doublewords[i] != config_lut_prot[prot]) return false; } return true; @@ -140,6 +199,11 @@ static void __set_fmpt_range(struct fmpt *fmpt, size_t start_gb_byte, fmpt->flags = MPT_UPDATE_L1; } +static u32 smpt_size(void) +{ + return SMPT_SIZE(config_prot_bits); +} + static void __set_l1entry_attr_with_prot(void *dev_va, unsigned int gb, unsigned int vid, enum mpt_prot prot) { @@ -154,7 +218,7 @@ static void __set_l1entry_attr_with_fmpt(void *dev_va, unsigned int gb, __set_l1entry_attr_with_prot(dev_va, gb, vid, fmpt->prot); } else { /* Order against writes to the SMPT. */ - writel(L1ENTRY_ATTR_L2(SMPT_GRAN_ATTR), + writel(config_gran_mask | L1ENTRY_ATTR_L2TABLE_EN, dev_va + REG_NS_L1ENTRY_ATTR(vid, gb)); } } @@ -218,21 +282,40 @@ static void prepare_range(struct mpt *mpt, phys_addr_t first_byte, __set_fmpt_range(fmpt, start_gb_byte, end_gb_byte, prot); if (fmpt->flags & MPT_UPDATE_L2) - kvm_flush_dcache_to_poc(fmpt->smpt, SMPT_SIZE); + kvm_flush_dcache_to_poc(fmpt->smpt, smpt_size()); } } static const struct s2mpu_mpt_ops this_ops = { + .smpt_size = smpt_size, .init_with_prot = init_with_prot, .init_with_mpt = init_with_mpt, .apply_range = apply_range, .prepare_range = prepare_range, + .pte_from_addr_smpt = pte_from_addr_smpt, }; const struct s2mpu_mpt_ops *s2mpu_get_mpt_ops(struct s2mpu_mpt_cfg cfg) { - if ((cfg.version == S2MPU_VERSION_1) || (cfg.version == S2MPU_VERSION_2)) - return &this_ops; + /* If called before with different version return NULL. 
*/ + if (WARN_ON(this_version && (this_version != cfg.version))) + return NULL; + /* 2MB granularity not supported in V9 */ + if ((cfg.version == S2MPU_VERSION_9) && (SMPT_GRAN_ATTR != L1ENTRY_ATTR_GRAN_2M)) { + config_prot_bits = V9_MPT_PROT_BITS; + config_access_shift = V9_MPT_ACCESS_SHIFT; + config_lut_prot = v9_mpt_prot_doubleword; + config_gran_mask = L1ENTRY_ATTR_GRAN(SMPT_GRAN_ATTR, V9_L1ENTRY_ATTR_GRAN_MASK); + this_version = cfg.version; + return &this_ops; + } else if ((cfg.version == S2MPU_VERSION_2) || (cfg.version == S2MPU_VERSION_1)) { + config_prot_bits = MPT_PROT_BITS; + config_access_shift = MPT_ACCESS_SHIFT; + config_lut_prot = mpt_prot_doubleword; + config_gran_mask = L1ENTRY_ATTR_GRAN(SMPT_GRAN_ATTR, L1ENTRY_ATTR_GRAN_MASK); + this_version = cfg.version; + return &this_ops; + } return NULL; } diff --git a/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c b/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c index f86a525344e8..d52890b1c5b2 100644 --- a/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c +++ b/arch/arm64/kvm/hyp/nvhe/iommu/s2mpu.c @@ -174,6 +174,23 @@ static void __set_control_regs(struct pkvm_iommu *dev) writel_relaxed(0, dev->va + REG_NS_CTRL1); writel_relaxed(ctrl0, dev->va + REG_NS_CTRL0); } +static void __set_control_regs_v9(struct pkvm_iommu *dev) +{ + /* Return DECERR to device on permission fault. */ + writel_relaxed(ALL_VIDS_BITMAP, + dev->va + REG_NS_V9_CTRL_ERR_RESP_T_PER_VID_SET); + /* + * Enable interrupts on fault for all VIDs. The IRQ must also be + * specified in DT to get unmasked in the GIC. + */ + writel_relaxed(ALL_VIDS_BITMAP, + dev->va + REG_NS_INTERRUPT_ENABLE_PER_VID_SET); + writel_relaxed(0, dev->va + REG_NS_CTRL0); + /* Enable the S2MPU, otherwise all traffic would be allowed through. */ + writel_relaxed(ALL_VIDS_BITMAP, + dev->va + REG_NS_V9_CTRL_PROT_EN_PER_VID_SET); + writel_relaxed(0, dev->va + REG_NS_V9_CFG_MPTW_ATTRIBUTE); +} /* * Poll the given SFR until its value has all bits of a given mask set. @@ -248,8 +265,8 @@ static void __invalidation_barrier_complete(struct pkvm_iommu *dev) __invalidation_barrier_slow(sync); } - /* Must not access SFRs while S2MPU is busy invalidating (v2 only). */ - if (is_version(dev, S2MPU_VERSION_2)) { + /* Must not access SFRs while S2MPU is busy invalidating */ + if (is_version(dev, S2MPU_VERSION_2) || is_version(dev, S2MPU_VERSION_9)) { __wait_while(dev->va + REG_NS_STATUS, STATUS_BUSY | STATUS_ON_INVALIDATING); } @@ -401,6 +418,64 @@ static int s2mpu_suspend(struct pkvm_iommu *dev) return initialize_with_prot(dev, MPT_PROT_NONE); } +static u32 host_mmio_reg_access_mask_v9(size_t off, bool is_write) +{ + const u32 no_access = 0; + const u32 read_write = (u32)(-1); + const u32 read_only = is_write ? no_access : read_write; + const u32 write_only = is_write ? read_write : no_access; + + switch (off) { + /* Allow reading control registers for debugging. 
*/ + case REG_NS_CTRL0: + return read_only & V9_CTRL0_MASK; + case REG_NS_V9_CTRL_ERR_RESP_T_PER_VID_SET: + return read_only & ALL_VIDS_BITMAP; + case REG_NS_V9_CTRL_PROT_EN_PER_VID_SET: + return read_only & ALL_VIDS_BITMAP; + case REG_NS_V9_READ_STLB: + return write_only & (V9_READ_STLB_MASK_TYPEA|V9_READ_STLB_MASK_TYPEB); + case REG_NS_V9_READ_STLB_TPN: + return read_only & V9_READ_STLB_TPN_MASK; + case REG_NS_V9_READ_STLB_TAG_PPN: + return read_only & V9_READ_STLB_TAG_PPN_MASK; + case REG_NS_V9_READ_STLB_TAG_OTHERS: + return read_only & V9_READ_STLB_TAG_OTHERS_MASK; + case REG_NS_V9_READ_STLB_DATA: + return read_only; + case REG_NS_V9_MPTC_INFO: + return read_only & V9_READ_MPTC_INFO_MASK; + case REG_NS_V9_READ_MPTC: + return write_only & V9_READ_MPTC_MASK; + case REG_NS_V9_READ_MPTC_TAG_PPN: + return read_only & V9_READ_MPTC_TAG_PPN_MASK; + case REG_NS_V9_READ_MPTC_TAG_OTHERS: + return read_only & V9_READ_MPTC_TAG_OTHERS_MASK; + case REG_NS_V9_READ_MPTC_DATA: + return read_only; + case REG_NS_V9_PMMU_INFO: + return read_only & V9_READ_PMMU_INFO_MASK; + case REG_NS_V9_READ_PTLB: + return write_only & V9_READ_PTLB_MASK; + case REG_NS_V9_READ_PTLB_TAG: + return read_only & V9_READ_PTLB_TAG_MASK; + case REG_NS_V9_READ_PTLB_DATA_S1_EN_PPN_AP: + return read_only & V9_READ_PTLB_DATA_S1_ENABLE_PPN_AP_MASK; + case REG_NS_V9_READ_PTLB_DATA_S1_DIS_AP_LIST: + return read_only; + case REG_NS_V9_PMMU_INDICATOR: + return read_only & V9_READ_PMMU_INDICATOR_MASK; + case REG_NS_V9_SWALKER_INFO: + return read_only&V9_SWALKER_INFO_MASK; + }; + if (off >= REG_NS_V9_PMMU_PTLB_INFO(0) && off < REG_NS_V9_PMMU_PTLB_INFO(V9_MAX_PTLB_NUM)) + return read_only&V9_READ_PMMU_PTLB_INFO_MASK; + if (off >= REG_NS_V9_STLB_INFO(0) && off < REG_NS_V9_STLB_INFO(V9_MAX_STLB_NUM)) + return read_only&V9_READ_SLTB_INFO_MASK; + + return no_access; +} + static u32 host_mmio_reg_access_mask_v1_v2(size_t off, bool is_write) { const u32 no_access = 0; @@ -491,12 +566,20 @@ static bool s2mpu_host_dabt_handler(struct pkvm_iommu *dev, cpu_reg(host_ctxt, rd) = readl_relaxed(dev->va + off) & mask; return true; } - -const struct s2mpu_reg_ops ops_v1_v2 = { +/* + * Operations that differ between versions. We need to maintain + * old behaviour were v1 and v2 can be used together. + */ +const struct s2mpu_reg_ops ops_v1_v2 = { .init = __initialize, .host_mmio_reg_access_mask = host_mmio_reg_access_mask_v1_v2, .set_control_regs = __set_control_regs, }; +const struct s2mpu_reg_ops ops_v9 = { + .init = __initialize_v2, + .host_mmio_reg_access_mask = host_mmio_reg_access_mask_v9, + .set_control_regs = __set_control_regs_v9, +}; static int s2mpu_init(void *data, size_t size) { @@ -505,6 +588,7 @@ static int s2mpu_init(void *data, size_t size) phys_addr_t pa; unsigned int gb; int ret = 0; + int smpt_nr_pages, smpt_size; struct s2mpu_mpt_cfg cfg; if (size != sizeof(in_mpt)) @@ -514,8 +598,11 @@ static int s2mpu_init(void *data, size_t size) memcpy(&in_mpt, data, sizeof(in_mpt)); cfg.version = in_mpt.version; + /* Make sure the version sent is supported by the driver. */ if ((cfg.version == S2MPU_VERSION_1) || (cfg.version == S2MPU_VERSION_2)) reg_ops = &ops_v1_v2; + else if (cfg.version == S2MPU_VERSION_9) + reg_ops = &ops_v9; else return -ENODEV; @@ -525,17 +612,20 @@ static int s2mpu_init(void *data, size_t size) if (!mpt_ops) return -EINVAL; + smpt_size = mpt_ops->smpt_size(); + smpt_nr_pages = smpt_size / PAGE_SIZE; + /* Take ownership of all SMPT buffers. This will also map them in. 
*/ for_each_gb(gb) { smpt = kern_hyp_va(in_mpt.fmpt[gb].smpt); pa = __hyp_pa(smpt); - if (!IS_ALIGNED(pa, SMPT_SIZE)) { + if (!IS_ALIGNED(pa, smpt_size)) { ret = -EINVAL; break; } - ret = __pkvm_host_donate_hyp(pa >> PAGE_SHIFT, SMPT_NUM_PAGES); + ret = __pkvm_host_donate_hyp(pa >> PAGE_SHIFT, smpt_nr_pages); if (ret) break; @@ -554,7 +644,7 @@ static int s2mpu_init(void *data, size_t size) break; WARN_ON(__pkvm_hyp_donate_host(__hyp_pa(smpt) >> PAGE_SHIFT, - SMPT_NUM_PAGES)); + smpt_nr_pages)); } memset(&host_mpt, 0, sizeof(host_mpt)); } diff --git a/arch/arm64/kvm/iommu/s2mpu.c b/arch/arm64/kvm/iommu/s2mpu.c index fd63ce1fc6e4..b9dcc3469f06 100644 --- a/arch/arm64/kvm/iommu/s2mpu.c +++ b/arch/arm64/kvm/iommu/s2mpu.c @@ -22,6 +22,7 @@ static int init_s2mpu_driver(u32 version) unsigned long addr; u64 pfn; int ret = 0; + const int smpt_order = smpt_order_from_version(version); mutex_lock(&lock); if (init_done) @@ -39,7 +40,7 @@ static int init_s2mpu_driver(u32 version) /* Allocate SMPT buffers. */ for_each_gb(gb) { - addr = __get_free_pages(GFP_KERNEL, SMPT_ORDER); + addr = __get_free_pages(GFP_KERNEL, smpt_order); if (!addr) { ret = -ENOMEM; goto out_free; @@ -68,7 +69,7 @@ out_free: /* TODO - will driver return the memory? */ if (ret) { for_each_gb(gb) - free_pages((unsigned long)mpt->fmpt[gb].smpt, SMPT_ORDER); + free_pages((unsigned long)mpt->fmpt[gb].smpt, smpt_order); free_page((unsigned long)mpt); } out: From a9a4f4f9584b131370ec710864898d48a146e4c5 Mon Sep 17 00:00:00 2001 From: Mostafa Saleh Date: Tue, 22 Nov 2022 09:46:13 +0000 Subject: [PATCH 231/457] ANDROID: KVM: arm64: Use correct pkvm owners type Update host_stage2_set_owner_locked() and caller functions to use enum pkvm_component_id instead of u8, as IDs are not defined as const variables anymore. and all other functions and structs use this enum consistently. Use the correct max number of IDs as it is not calculated from invalid PTEs anymore. 
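For illustration only (not part of this change), a typical caller now passes
the enum value directly instead of a raw u8, e.g. when handing a page back to
the host:

	ret = host_stage2_set_owner_locked(addr, size, PKVM_ID_HOST);
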
Test: MicrodroidHostTestCases Bug: 255731794 Change-Id: Ib45dc34ff9ff20716adb79920f32875034587343 Signed-off-by: Mostafa Saleh Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/include/nvhe/mem_protect.h | 3 ++- arch/arm64/kvm/hyp/nvhe/mem_protect.c | 13 ++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h index 18109fc7cc4d..e28316bc1a18 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h @@ -59,6 +59,7 @@ enum pkvm_component_id { PKVM_ID_HYP, PKVM_ID_GUEST, PKVM_ID_FFA, + PKVM_ID_MAX = PKVM_ID_FFA, }; extern unsigned long hyp_nr_cpus; @@ -84,7 +85,7 @@ int __pkvm_host_unshare_ffa(u64 pfn, u64 nr_pages); bool addr_is_memory(phys_addr_t phys); int host_stage2_idmap_locked(phys_addr_t addr, u64 size, enum kvm_pgtable_prot prot, bool update_iommu); -int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, u8 owner_id); +int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, enum pkvm_component_id owner_id); int host_stage2_unmap_dev_locked(phys_addr_t start, u64 size); int kvm_host_prepare_stage2(void *pgt_pool_base); int kvm_guest_prepare_stage2(struct pkvm_hyp_vm *vm, void *pgd); diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 5c0aff3e7694..88b92197526a 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -592,19 +592,18 @@ int host_stage2_idmap_locked(phys_addr_t addr, u64 size, } #define KVM_INVALID_PTE_OWNER_MASK GENMASK(9, 2) -#define KVM_MAX_OWNER_ID FIELD_MAX(KVM_INVALID_PTE_OWNER_MASK) -static kvm_pte_t kvm_init_invalid_leaf_owner(u8 owner_id) +static kvm_pte_t kvm_init_invalid_leaf_owner(enum pkvm_component_id owner_id) { return FIELD_PREP(KVM_INVALID_PTE_OWNER_MASK, owner_id); } -int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, u8 owner_id) +int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, enum pkvm_component_id owner_id) { kvm_pte_t annotation; enum kvm_pgtable_prot prot; int ret; - if (owner_id > KVM_MAX_OWNER_ID) + if (owner_id > PKVM_ID_MAX) return -EINVAL; annotation = kvm_init_invalid_leaf_owner(owner_id); @@ -894,7 +893,7 @@ static int host_initiate_unshare(u64 *completer_addr, static int host_initiate_donation(u64 *completer_addr, const struct pkvm_mem_transition *tx) { - u8 owner_id = tx->completer.id; + enum pkvm_component_id owner_id = tx->completer.id; u64 size = tx->nr_pages * PAGE_SIZE; *completer_addr = tx->initiator.host.completer_addr; @@ -955,7 +954,7 @@ static int host_complete_share(u64 addr, const struct pkvm_mem_transition *tx, static int host_complete_unshare(u64 addr, const struct pkvm_mem_transition *tx) { - u8 owner_id = tx->initiator.id; + enum pkvm_component_id owner_id = tx->initiator.id; u64 size = tx->nr_pages * PAGE_SIZE; if (tx->initiator.id == PKVM_ID_GUEST) @@ -967,7 +966,7 @@ static int host_complete_unshare(u64 addr, const struct pkvm_mem_transition *tx) static int host_complete_donation(u64 addr, const struct pkvm_mem_transition *tx) { u64 size = tx->nr_pages * PAGE_SIZE; - u8 host_id = tx->completer.id; + enum pkvm_component_id host_id = tx->completer.id; return host_stage2_set_owner_locked(addr, size, host_id); } From 378e0dbd918b7f3406d7a0876bb793d85a966b4d Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Mon, 5 Dec 2022 15:48:59 +0000 Subject: [PATCH 232/457] ANDROID: arm64: patching: Refactor __aarch64_insn_write() In order to allow 
the re-use of __arch64_insn_write() to fixup relocations in pKVM modules, refactor it into a more abstract __arch64_text_write() function which also takes a size as parameter. No functional changes intended. Bug: 244543039 Bug: 244373730 Change-Id: I60d789d8a4b1271deeb2f6ac6d3b7fc55bdbb465 Signed-off-by: Quentin Perret --- arch/arm64/kernel/patching.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kernel/patching.c b/arch/arm64/kernel/patching.c index 33e0fabc0b79..b949db008562 100644 --- a/arch/arm64/kernel/patching.c +++ b/arch/arm64/kernel/patching.c @@ -66,16 +66,16 @@ int __kprobes aarch64_insn_read(void *addr, u32 *insnp) return ret; } -static int __kprobes __aarch64_insn_write(void *addr, __le32 insn) +static int __kprobes __aarch64_text_write(void *dst, void *src, size_t size) { - void *waddr = addr; - unsigned long flags = 0; + unsigned long flags; + void *waddr; int ret; raw_spin_lock_irqsave(&patch_lock, flags); - waddr = patch_map(addr, FIX_TEXT_POKE0); + waddr = patch_map(dst, FIX_TEXT_POKE0); - ret = copy_to_kernel_nofault(waddr, &insn, AARCH64_INSN_SIZE); + ret = copy_to_kernel_nofault(waddr, src, size); patch_unmap(FIX_TEXT_POKE0); raw_spin_unlock_irqrestore(&patch_lock, flags); @@ -85,7 +85,9 @@ static int __kprobes __aarch64_insn_write(void *addr, __le32 insn) int __kprobes aarch64_insn_write(void *addr, u32 insn) { - return __aarch64_insn_write(addr, cpu_to_le32(insn)); + __le32 __insn = cpu_to_le32(insn); + + return __aarch64_text_write(addr, &__insn, AARCH64_INSN_SIZE); } int __kprobes aarch64_insn_patch_text_nosync(void *addr, u32 insn) From d996511e4327ee2437d2078a1a8cd9898d22003b Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Mon, 5 Dec 2022 15:52:57 +0000 Subject: [PATCH 233/457] ANDROID: arm64: patching: Add aarch64_addr_write() The process of applying hypervisor relocations involves patching addresses in the hypervisor object. In the existing KVM nVHE relocation procedure, the relocations are applied early enough for write-permission not to be a problem when touching e.g. the rodata section. But applying relocations on hypervisor modules embedded in a kernel module proves more challenging, as the kernel module loader will actively map text and rodata sections read-only. In order to allow patching in those sections, let's introduce a new helper function using the text fixmap to temporarily map the relevant pages writable. 
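A minimal usage sketch (the helper name is hypothetical, for illustration
only): when a relocation targets a 64-bit pointer located in a read-only
section of an already-loaded module, the fixup can be applied with the new
interface:

	/* Hypothetical wrapper, for illustration only. */
	static int fixup_abs64(u64 *where, u64 hyp_va)
	{
		/* Temporarily maps the page via the text fixmap, then writes. */
		return aarch64_addr_write(where, hyp_va);
	}
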
Bug: 244543039 Bug: 244373730 Change-Id: I9dcdade1927e5bc121db87bc950fb70a374c44cd Signed-off-by: Quentin Perret --- arch/arm64/include/asm/patching.h | 1 + arch/arm64/kernel/patching.c | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/arch/arm64/include/asm/patching.h b/arch/arm64/include/asm/patching.h index 6bf5adc56295..82b1e0c66809 100644 --- a/arch/arm64/include/asm/patching.h +++ b/arch/arm64/include/asm/patching.h @@ -6,6 +6,7 @@ int aarch64_insn_read(void *addr, u32 *insnp); int aarch64_insn_write(void *addr, u32 insn); +int aarch64_addr_write(void *addr, u64 dst); int aarch64_insn_patch_text_nosync(void *addr, u32 insn); int aarch64_insn_patch_text(void *addrs[], u32 insns[], int cnt); diff --git a/arch/arm64/kernel/patching.c b/arch/arm64/kernel/patching.c index b949db008562..b336073fe6b1 100644 --- a/arch/arm64/kernel/patching.c +++ b/arch/arm64/kernel/patching.c @@ -90,6 +90,11 @@ int __kprobes aarch64_insn_write(void *addr, u32 insn) return __aarch64_text_write(addr, &__insn, AARCH64_INSN_SIZE); } +int __kprobes aarch64_addr_write(void *addr, u64 dst) +{ + return __aarch64_text_write(addr, &dst, sizeof(dst)); +} + int __kprobes aarch64_insn_patch_text_nosync(void *addr, u32 insn) { u32 *tp = addr; From 8a3e189fbd0f4e11630800508c2b34198a8dfd7a Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Mon, 27 Jun 2022 14:03:28 +0100 Subject: [PATCH 234/457] ANDROID: KVM: arm64: Add mapping removal interface for nVHE hyp The new pkvm_remove_mappings() allows the caller to unmap a memory range from the hypervisor. This is to allow later introduction of in-hypervisor tracing. Bug: 244543039 Bug: 229972309 Change-Id: I9edc7c2ae55c4b7f5d464d26ce3351b5fd4bf9f3 Signed-off-by: Vincent Donnefort Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/include/nvhe/mm.h | 1 + arch/arm64/kvm/hyp/nvhe/mm.c | 9 +++++++++ 2 files changed, 10 insertions(+) diff --git a/arch/arm64/kvm/hyp/include/nvhe/mm.h b/arch/arm64/kvm/hyp/include/nvhe/mm.h index 164a0986dc52..3ceb29b36069 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mm.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mm.h @@ -28,5 +28,6 @@ int __pkvm_create_private_mapping(phys_addr_t phys, size_t size, enum kvm_pgtable_prot prot, unsigned long *haddr); int pkvm_alloc_private_va_range(size_t size, unsigned long *haddr); +void pkvm_remove_mappings(void *from, void *to); #endif /* __KVM_HYP_MM_H */ diff --git a/arch/arm64/kvm/hyp/nvhe/mm.c b/arch/arm64/kvm/hyp/nvhe/mm.c index c80b2c007619..ca4ccc4e9894 100644 --- a/arch/arm64/kvm/hyp/nvhe/mm.c +++ b/arch/arm64/kvm/hyp/nvhe/mm.c @@ -136,6 +136,15 @@ int pkvm_create_mappings(void *from, void *to, enum kvm_pgtable_prot prot) return ret; } +void pkvm_remove_mappings(void *from, void *to) +{ + unsigned long size = (unsigned long)to - (unsigned long)from; + + hyp_spin_lock(&pkvm_pgd_lock); + WARN_ON(kvm_pgtable_hyp_unmap(&pkvm_pgtable, (u64)from, size) != size); + hyp_spin_unlock(&pkvm_pgd_lock); +} + int hyp_back_vmemmap(phys_addr_t back) { unsigned long i, start, size, end = 0; From a8d5d29e85048ec29761e90c38cde2eda7a5a3dc Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Fri, 9 Sep 2022 16:31:33 +0000 Subject: [PATCH 235/457] ANDROID: KVM: arm64: Move gen-hyprel into a tool directory In order to allow re-use of the gen-hyprel tool to build hypervisor modules in the future, move it up to the arm64 tools folder. 
Bug: 244543039 Bug: 244373730 Change-Id: I188a2dac1acf4974213499970cc29552807497eb Signed-off-by: Quentin Perret --- arch/arm64/Makefile | 5 +++++ arch/arm64/kvm/hyp/nvhe/.gitignore | 1 - arch/arm64/kvm/hyp/nvhe/Makefile | 7 +++---- arch/arm64/tools/.gitignore | 2 ++ arch/arm64/tools/Makefile | 4 ++++ arch/arm64/{kvm/hyp/nvhe => tools}/gen-hyprel.c | 0 6 files changed, 14 insertions(+), 5 deletions(-) create mode 100644 arch/arm64/tools/.gitignore rename arch/arm64/{kvm/hyp/nvhe => tools}/gen-hyprel.c (100%) diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile index 4ce6fd353ba4..34691c2cd8f4 100644 --- a/arch/arm64/Makefile +++ b/arch/arm64/Makefile @@ -190,6 +190,11 @@ ifeq ($(CONFIG_ARM64_USE_LSE_ATOMICS),y) endif endif +ifeq ($(CONFIG_KVM),y) +archscripts: + $(Q)$(MAKE) $(build)=arch/arm64/tools gen-hyprel +endif + ifeq ($(KBUILD_EXTMOD),) # We need to generate vdso-offsets.h before compiling certain files in kernel/. # In order to do that, we should use the archprepare target, but we can't since diff --git a/arch/arm64/kvm/hyp/nvhe/.gitignore b/arch/arm64/kvm/hyp/nvhe/.gitignore index 5b6c43cc96f8..899547d88045 100644 --- a/arch/arm64/kvm/hyp/nvhe/.gitignore +++ b/arch/arm64/kvm/hyp/nvhe/.gitignore @@ -1,4 +1,3 @@ # SPDX-License-Identifier: GPL-2.0-only -gen-hyprel hyp.lds hyp-reloc.S diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile index 01fd26cff95a..de1f2551acef 100644 --- a/arch/arm64/kvm/hyp/nvhe/Makefile +++ b/arch/arm64/kvm/hyp/nvhe/Makefile @@ -14,8 +14,7 @@ ccflags-y += -fno-stack-protector \ -DDISABLE_BRANCH_PROFILING \ $(DISABLE_STACKLEAK_PLUGIN) -hostprogs := gen-hyprel -HOST_EXTRACFLAGS += -I$(objtree)/include +HYPREL := arch/arm64/tools/gen-hyprel lib-objs := clear_page.o copy_page.o memcpy.o memset.o lib-objs := $(addprefix ../../../lib/, $(lib-objs)) @@ -66,7 +65,7 @@ $(obj)/kvm_nvhe.tmp.o: $(obj)/hyp.lds $(addprefix $(obj)/,$(hyp-obj)) FORCE # runtime. Because the hypervisor is part of the kernel binary, relocations # produce a kernel VA. We enumerate relocations targeting hyp at build time # and convert the kernel VAs at those positions to hyp VAs. -$(obj)/hyp-reloc.S: $(obj)/kvm_nvhe.tmp.o $(obj)/gen-hyprel FORCE +$(obj)/hyp-reloc.S: $(obj)/kvm_nvhe.tmp.o FORCE $(call if_changed,hyprel) # 5) Compile hyp-reloc.S and link it into the existing partially linked object. @@ -85,7 +84,7 @@ $(obj)/kvm_nvhe.o: $(obj)/kvm_nvhe.rel.o FORCE # The HYPREL command calls `gen-hyprel` to generate an assembly file with # a list of relocations targeting hyp code/data. quiet_cmd_hyprel = HYPREL $@ - cmd_hyprel = $(obj)/gen-hyprel $< > $@ + cmd_hyprel = $(HYPREL) $< > $@ # The HYPCOPY command uses `objcopy` to prefix all ELF symbol names # to avoid clashes with VHE code/data. 
diff --git a/arch/arm64/tools/.gitignore b/arch/arm64/tools/.gitignore new file mode 100644 index 000000000000..1ddedddfffbe --- /dev/null +++ b/arch/arm64/tools/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +gen-hyprel diff --git a/arch/arm64/tools/Makefile b/arch/arm64/tools/Makefile index 07a93ab21a62..3b604506343e 100644 --- a/arch/arm64/tools/Makefile +++ b/arch/arm64/tools/Makefile @@ -22,3 +22,7 @@ $(kapi)/cpucaps.h: $(src)/gen-cpucaps.awk $(src)/cpucaps FORCE $(kapi)/sysreg-defs.h: $(src)/gen-sysreg.awk $(src)/sysreg FORCE $(call if_changed,gen_sysreg) + +HOST_EXTRACFLAGS += -I$(objtree)/include +hostprogs += gen-hyprel +gen-hyprel: $(obj)/gen-hyprel diff --git a/arch/arm64/kvm/hyp/nvhe/gen-hyprel.c b/arch/arm64/tools/gen-hyprel.c similarity index 100% rename from arch/arm64/kvm/hyp/nvhe/gen-hyprel.c rename to arch/arm64/tools/gen-hyprel.c From f347aa8c3da299a25c49dbfa533958985cfa46be Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Mon, 19 Sep 2022 15:10:31 +0000 Subject: [PATCH 236/457] ANDROID: KVM: arm64: Make gen-hyprel emit delimiters The start and end pointers of the hyp relocs section are currently specified in the vmlinux linker script. In order to ease re-using the same relocation procedure for hypervisor modules, emit the delimiters from the generated hyp-reloc.S file directly. Bug: 244543039 Bug: 244373730 Change-Id: I845af2d40e1dd13301069537c6325f6a6f381ce4 Signed-off-by: Quentin Perret --- arch/arm64/tools/gen-hyprel.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/arm64/tools/gen-hyprel.c b/arch/arm64/tools/gen-hyprel.c index 6bc88a756cb7..fa719b6c6d54 100644 --- a/arch/arm64/tools/gen-hyprel.c +++ b/arch/arm64/tools/gen-hyprel.c @@ -296,8 +296,10 @@ static void init_elf(const char *path) /* Print the prologue of the output ASM file. */ static void emit_prologue(void) { - printf(".data\n" - ".pushsection " HYP_RELOC_SECTION ", \"a\"\n"); + printf("#include \n" + ".data\n" + ".pushsection " HYP_RELOC_SECTION ", \"a\"\n" + "SYM_ENTRY(__hyprel_start, SYM_L_GLOBAL, SYM_A_NONE)\n"); } /* Print ASM statements needed as a prologue to a processed hyp section. */ @@ -347,7 +349,8 @@ static void emit_rela_abs64(Elf64_Rela *rela, const char *sh_orig_name) /* Print the epilogue of the output ASM file. */ static void emit_epilogue(void) { - printf(".popsection\n"); + printf("SYM_ENTRY(__hyprel_end, SYM_L_GLOBAL, SYM_A_NONE)\n" + ".popsection\n"); } /* From 32e2601b60ae4372c4500e538aec4006f2f20fd1 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Wed, 7 Dec 2022 12:43:53 +0000 Subject: [PATCH 237/457] ANDROID: KVM: arm64: Refactor nvhe Makefile Building and linking hypervisor modules into kernel modules involves very similar problems to building and linking the KVM nVHE hypervisor object into the kernel image. In order to re-use the same building procedure, let's factor out and generalize the nVHE makefile in a way that allows it to be re-used for module builds. 
Bug: 244543039 Bug: 244373730 Change-Id: I89b2630f8cfd3ce5b624300fc277749be4fc9e04 Signed-off-by: Quentin Perret [vdonnefort@: Merge with updated nvhe/Makefile] Signed-off-by: Vincent Donnefort --- arch/arm64/kvm/hyp/nvhe/Makefile | 96 +-------------------------- arch/arm64/kvm/hyp/nvhe/Makefile.nvhe | 94 ++++++++++++++++++++++++++ 2 files changed, 96 insertions(+), 94 deletions(-) create mode 100644 arch/arm64/kvm/hyp/nvhe/Makefile.nvhe diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile index de1f2551acef..d3a98aa3d672 100644 --- a/arch/arm64/kvm/hyp/nvhe/Makefile +++ b/arch/arm64/kvm/hyp/nvhe/Makefile @@ -1,20 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 -# -# Makefile for Kernel-based Virtual Machine module, HYP/nVHE part -# - -asflags-y := -D__KVM_NVHE_HYPERVISOR__ -D__DISABLE_EXPORTS - -# Tracepoint and MMIO logging symbols should not be visible at nVHE KVM as -# there is no way to execute them and any such MMIO access from nVHE KVM -# will explode instantly (Words of Marc Zyngier). So introduce a generic flag -# __DISABLE_TRACE_MMIO__ to disable MMIO tracing for nVHE KVM. -ccflags-y := -D__KVM_NVHE_HYPERVISOR__ -D__DISABLE_EXPORTS -D__DISABLE_TRACE_MMIO__ -ccflags-y += -fno-stack-protector \ - -DDISABLE_BRANCH_PROFILING \ - $(DISABLE_STACKLEAK_PLUGIN) - -HYPREL := arch/arm64/tools/gen-hyprel lib-objs := clear_page.o copy_page.o memcpy.o memset.o lib-objs := $(addprefix ../../../lib/, $(lib-objs)) @@ -30,84 +14,8 @@ hyp-obj-y += $(lib-objs) hyp-obj-$(CONFIG_KVM_S2MPU) += iommu/s2mpu.o hyp-obj-$(CONFIG_KVM_S2MPU) += iommu/io-mpt-s2mpu.o -## -## Build rules for compiling nVHE hyp code -## Output of this folder is `kvm_nvhe.o`, a partially linked object -## file containing all nVHE hyp code and data. -## - -hyp-obj := $(patsubst %.o,%.nvhe.o,$(hyp-obj-y)) -obj-y := kvm_nvhe.o -targets += $(hyp-obj) kvm_nvhe.tmp.o kvm_nvhe.rel.o hyp.lds hyp-reloc.S hyp-reloc.o - -# 1) Compile all source files to `.nvhe.o` object files. The file extension -# avoids file name clashes for files shared with VHE. -$(obj)/%.nvhe.o: $(src)/%.c FORCE - $(call if_changed_rule,cc_o_c) -$(obj)/%.nvhe.o: $(src)/%.S FORCE - $(call if_changed_rule,as_o_S) - -# 2) Compile linker script. $(obj)/hyp.lds: $(src)/hyp.lds.S FORCE $(call if_changed_dep,cpp_lds_S) -# 3) Partially link all '.nvhe.o' files and apply the linker script. -# Prefixes names of ELF sections with '.hyp', eg. '.hyp.text'. -# Note: The following rule assumes that the 'ld' rule puts LDFLAGS before -# the list of dependencies to form '-T $(obj)/hyp.lds'. This is to -# keep the dependency on the target while avoiding an error from -# GNU ld if the linker script is passed to it twice. -LDFLAGS_kvm_nvhe.tmp.o := -r -T -$(obj)/kvm_nvhe.tmp.o: $(obj)/hyp.lds $(addprefix $(obj)/,$(hyp-obj)) FORCE - $(call if_changed,ld) - -# 4) Generate list of hyp code/data positions that need to be relocated at -# runtime. Because the hypervisor is part of the kernel binary, relocations -# produce a kernel VA. We enumerate relocations targeting hyp at build time -# and convert the kernel VAs at those positions to hyp VAs. -$(obj)/hyp-reloc.S: $(obj)/kvm_nvhe.tmp.o FORCE - $(call if_changed,hyprel) - -# 5) Compile hyp-reloc.S and link it into the existing partially linked object. -# The object file now contains a section with pointers to hyp positions that -# will contain kernel VAs at runtime. These pointers have relocations on them -# so that they get updated as the hyp object is linked into `vmlinux`. 
-LDFLAGS_kvm_nvhe.rel.o := -r -$(obj)/kvm_nvhe.rel.o: $(obj)/kvm_nvhe.tmp.o $(obj)/hyp-reloc.o FORCE - $(call if_changed,ld) - -# 6) Produce the final 'kvm_nvhe.o', ready to be linked into 'vmlinux'. -# Prefixes names of ELF symbols with '__kvm_nvhe_'. -$(obj)/kvm_nvhe.o: $(obj)/kvm_nvhe.rel.o FORCE - $(call if_changed,hypcopy) - -# The HYPREL command calls `gen-hyprel` to generate an assembly file with -# a list of relocations targeting hyp code/data. -quiet_cmd_hyprel = HYPREL $@ - cmd_hyprel = $(HYPREL) $< > $@ - -# The HYPCOPY command uses `objcopy` to prefix all ELF symbol names -# to avoid clashes with VHE code/data. -quiet_cmd_hypcopy = HYPCOPY $@ - cmd_hypcopy = $(OBJCOPY) --prefix-symbols=__kvm_nvhe_ $< $@ - -# Remove ftrace, Shadow Call Stack, and CFI CFLAGS. -# This is equivalent to the 'notrace', '__noscs', and '__nocfi' annotations. -KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_FTRACE) $(CC_FLAGS_SCS) $(CC_FLAGS_CFI), $(KBUILD_CFLAGS)) -# Starting from 13.0.0 llvm emits SHT_REL section '.llvm.call-graph-profile' -# when profile optimization is applied. gen-hyprel does not support SHT_REL and -# causes a build failure. Remove profile optimization flags. -KBUILD_CFLAGS := $(filter-out -fprofile-sample-use=% -fprofile-use=%, $(KBUILD_CFLAGS)) - -# KVM nVHE code is run at a different exception code with a different map, so -# compiler instrumentation that inserts callbacks or checks into the code may -# cause crashes. Just disable it. -GCOV_PROFILE := n -KASAN_SANITIZE := n -KCSAN_SANITIZE := n -UBSAN_SANITIZE := n -KCOV_INSTRUMENT := n - -# Skip objtool checking for this directory because nVHE code is compiled with -# non-standard build rules. -OBJECT_FILES_NON_STANDARD := y +include $(srctree)/arch/arm64/kvm/hyp/nvhe/Makefile.nvhe +obj-y := kvm_nvhe.o diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile.nvhe b/arch/arm64/kvm/hyp/nvhe/Makefile.nvhe new file mode 100644 index 000000000000..381e610042eb --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/Makefile.nvhe @@ -0,0 +1,94 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for Kernel-based Virtual Machine module, HYP/nVHE part +# + +asflags-y := -D__KVM_NVHE_HYPERVISOR__ -D__DISABLE_EXPORTS + +# Tracepoint and MMIO logging symbols should not be visible at nVHE KVM as +# there is no way to execute them and any such MMIO access from nVHE KVM +# will explode instantly (Words of Marc Zyngier). So introduce a generic flag +# __DISABLE_TRACE_MMIO__ to disable MMIO tracing for nVHE KVM. +ccflags-y := -D__KVM_NVHE_HYPERVISOR__ -D__DISABLE_EXPORTS -D__DISABLE_TRACE_MMIO__ +ccflags-y += -fno-stack-protector \ + -DDISABLE_BRANCH_PROFILING \ + $(DISABLE_STACKLEAK_PLUGIN) + +HYPREL := arch/arm64/tools/gen-hyprel + +## +## Build rules for compiling nVHE hyp code +## Output of this folder is `kvm_nvhe.o`, a partially linked object +## file containing all nVHE hyp code and data. +## + +hyp-obj := $(patsubst %.o,%.nvhe.o,$(hyp-obj-y)) +targets += $(hyp-obj) kvm_nvhe.tmp.o kvm_nvhe.rel.o hyp.lds hyp-reloc.S hyp-reloc.o + +# 1) Compile all source files to `.nvhe.o` object files. The file extension +# avoids file name clashes for files shared with VHE. +$(obj)/%.nvhe.o: $(src)/%.c FORCE + $(call if_changed_rule,cc_o_c) +$(obj)/%.nvhe.o: $(src)/%.S FORCE + $(call if_changed_rule,as_o_S) + +# 2) Partially link all '.nvhe.o' files and apply the linker script. +# Prefixes names of ELF sections with '.hyp', eg. '.hyp.text'. +# Note: The following rule assumes that the 'ld' rule puts LDFLAGS before +# the list of dependencies to form '-T $(obj)/hyp.lds'. 
This is to +# keep the dependency on the target while avoiding an error from +# GNU ld if the linker script is passed to it twice. +LDFLAGS_kvm_nvhe.tmp.o := -r -T +$(obj)/kvm_nvhe.tmp.o: $(obj)/hyp.lds $(addprefix $(obj)/,$(hyp-obj)) FORCE + $(call if_changed,ld) + +# 3) Generate list of hyp code/data positions that need to be relocated at +# runtime. Because the hypervisor is part of the kernel binary, relocations +# produce a kernel VA. We enumerate relocations targeting hyp at build time +# and convert the kernel VAs at those positions to hyp VAs. +$(obj)/hyp-reloc.S: $(obj)/kvm_nvhe.tmp.o FORCE + $(call if_changed,hyprel) + +# 4) Compile hyp-reloc.S and link it into the existing partially linked object. +# The object file now contains a section with pointers to hyp positions that +# will contain kernel VAs at runtime. These pointers have relocations on them +# so that they get updated as the hyp object is linked into `vmlinux`. +LDFLAGS_kvm_nvhe.rel.o := -r +$(obj)/kvm_nvhe.rel.o: $(obj)/kvm_nvhe.tmp.o $(obj)/hyp-reloc.o FORCE + $(call if_changed,ld) + +# 5) Produce the final 'kvm_nvhe.o', ready to be linked into 'vmlinux'. +# Prefixes names of ELF symbols with '__kvm_nvhe_'. +$(obj)/kvm_nvhe.o: $(obj)/kvm_nvhe.rel.o FORCE + $(call if_changed,hypcopy) + +# The HYPREL command calls `gen-hyprel` to generate an assembly file with +# a list of relocations targeting hyp code/data. +quiet_cmd_hyprel = HYPREL $@ + cmd_hyprel = $(HYPREL) $< > $@ + +# The HYPCOPY command uses `objcopy` to prefix all ELF symbol names +# to avoid clashes with VHE code/data. +quiet_cmd_hypcopy = HYPCOPY $@ + cmd_hypcopy = $(OBJCOPY) --prefix-symbols=__kvm_nvhe_ $< $@ + +# Remove ftrace, Shadow Call Stack, and CFI CFLAGS. +# This is equivalent to the 'notrace', '__noscs', and '__nocfi' annotations. +KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_FTRACE) $(CC_FLAGS_SCS) $(CC_FLAGS_CFI), $(KBUILD_CFLAGS)) +# Starting from 13.0.0 llvm emits SHT_REL section '.llvm.call-graph-profile' +# when profile optimization is applied. gen-hyprel does not support SHT_REL and +# causes a build failure. Remove profile optimization flags. +KBUILD_CFLAGS := $(filter-out -fprofile-sample-use=% -fprofile-use=%, $(KBUILD_CFLAGS)) + +# KVM nVHE code is run at a different exception code with a different map, so +# compiler instrumentation that inserts callbacks or checks into the code may +# cause crashes. Just disable it. +GCOV_PROFILE := n +KASAN_SANITIZE := n +KCSAN_SANITIZE := n +UBSAN_SANITIZE := n +KCOV_INSTRUMENT := n + +# Skip objtool checking for this directory because nVHE code is compiled with +# non-standard build rules. +OBJECT_FILES_NON_STANDARD := y From 6e84750015875de36602317606839b1855e4f96a Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Thu, 6 Oct 2022 11:57:45 +0100 Subject: [PATCH 238/457] ANDROID: KVM: arm64: Allow loading modules to the pKVM hypervisor All nVHE hypervisor code is currently required to be statically linked into the kernel image. Sadly, scaling pKVM will inevitably require running _some_ hardware-specific code in the hypervisor due to the absence of architecture requirements regarding IOMMU implementations or power management, for example. In order to address this issue, introduce the ability to load modules in the pKVM hypervisor at run-time. pKVM modules are expected to be embedded inside kernel modules, and to be loaded into pKVM when their kernel counterpart is loaded at EL1. 
pKVM module loading is defined as a privileged operation -- all of them must be loaded while the host kernel is still part of the trusted computing base. Bug: 244543039 Bug: 244373730 Co-authored-by: Vincent Donnefort Signed-off-by: Vincent Donnefort Signed-off-by: Quentin Perret Change-Id: If8e5d3ac0a2893c892aff09e5b51d3b8e14693f8 Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_asm.h | 4 + arch/arm64/include/asm/kvm_mmu.h | 4 + arch/arm64/include/asm/kvm_pkvm_module.h | 61 ++++++++++ arch/arm64/include/asm/module.lds.h | 23 ++++ arch/arm64/kvm/hyp/include/nvhe/mm.h | 4 + arch/arm64/kvm/hyp/include/nvhe/modules.h | 5 + arch/arm64/kvm/hyp/nvhe/Makefile | 1 + arch/arm64/kvm/hyp/nvhe/Makefile.module | 6 + arch/arm64/kvm/hyp/nvhe/hyp-main.c | 36 ++++++ arch/arm64/kvm/hyp/nvhe/mm.c | 27 +++++ arch/arm64/kvm/hyp/nvhe/module.lds.S | 37 ++++++ arch/arm64/kvm/hyp/nvhe/modules.c | 21 ++++ arch/arm64/kvm/pkvm.c | 139 ++++++++++++++++++++++ arch/arm64/kvm/va_layout.c | 24 ++++ 14 files changed, 392 insertions(+) create mode 100644 arch/arm64/include/asm/kvm_pkvm_module.h create mode 100644 arch/arm64/kvm/hyp/include/nvhe/modules.h create mode 100644 arch/arm64/kvm/hyp/nvhe/Makefile.module create mode 100644 arch/arm64/kvm/hyp/nvhe/module.lds.S create mode 100644 arch/arm64/kvm/hyp/nvhe/modules.c diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 81cd42bc0bc0..bf1ace7d3efc 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -63,6 +63,10 @@ enum __kvm_host_smccc_func { __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid_ipa, __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid, __KVM_HOST_SMCCC_FUNC___kvm_flush_cpu_context, + __KVM_HOST_SMCCC_FUNC___pkvm_alloc_module_va, + __KVM_HOST_SMCCC_FUNC___pkvm_map_module_page, + __KVM_HOST_SMCCC_FUNC___pkvm_unmap_module_page, + __KVM_HOST_SMCCC_FUNC___pkvm_init_module, __KVM_HOST_SMCCC_FUNC___pkvm_prot_finalize, /* Hypercalls available after pKVM finalisation */ diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index d4b2114cef54..ecbe1e623685 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -116,11 +116,15 @@ alternative_cb_end #include #include #include +#include void kvm_update_va_mask(struct alt_instr *alt, __le32 *origptr, __le32 *updptr, int nr_inst); void kvm_compute_layout(void); void kvm_apply_hyp_relocations(void); +void kvm_apply_hyp_module_relocations(void *mod_start, void *hyp_va, + kvm_nvhe_reloc_t *begin, + kvm_nvhe_reloc_t *end); #define __hyp_pa(x) (((phys_addr_t)(x)) + hyp_physvirt_offset) diff --git a/arch/arm64/include/asm/kvm_pkvm_module.h b/arch/arm64/include/asm/kvm_pkvm_module.h new file mode 100644 index 000000000000..7d0b680d38e1 --- /dev/null +++ b/arch/arm64/include/asm/kvm_pkvm_module.h @@ -0,0 +1,61 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __ARM64_KVM_PKVM_MODULE_H__ +#define __ARM64_KVM_PKVM_MODULE_H__ + +#include + +struct pkvm_module_ops { +}; + +struct pkvm_module_section { + void *start; + void *end; +}; + +typedef s32 kvm_nvhe_reloc_t; + +struct pkvm_el2_module { + struct pkvm_module_section text; + struct pkvm_module_section bss; + struct pkvm_module_section rodata; + struct pkvm_module_section data; + kvm_nvhe_reloc_t *relocs; + unsigned int nr_relocs; + int (*init)(const struct pkvm_module_ops *ops); +}; + +#ifdef MODULE +int __pkvm_load_el2_module(struct pkvm_el2_module *mod, struct module *this); + +#define pkvm_load_el2_module(init_fn) \ +({ \ + extern char 
__kvm_nvhe___hypmod_text_start[]; \ + extern char __kvm_nvhe___hypmod_text_end[]; \ + extern char __kvm_nvhe___hypmod_bss_start[]; \ + extern char __kvm_nvhe___hypmod_bss_end[]; \ + extern char __kvm_nvhe___hypmod_rodata_start[]; \ + extern char __kvm_nvhe___hypmod_rodata_end[]; \ + extern char __kvm_nvhe___hypmod_data_start[]; \ + extern char __kvm_nvhe___hypmod_data_end[]; \ + extern char __kvm_nvhe___hyprel_start[]; \ + extern char __kvm_nvhe___hyprel_end[]; \ + struct pkvm_el2_module mod; \ + \ + mod.text.start = __kvm_nvhe___hypmod_text_start; \ + mod.text.end = __kvm_nvhe___hypmod_text_end; \ + mod.bss.start = __kvm_nvhe___hypmod_bss_start; \ + mod.bss.end = __kvm_nvhe___hypmod_bss_end; \ + mod.rodata.start = __kvm_nvhe___hypmod_rodata_start; \ + mod.rodata.end = __kvm_nvhe___hypmod_rodata_end; \ + mod.data.start = __kvm_nvhe___hypmod_data_start; \ + mod.data.end = __kvm_nvhe___hypmod_data_end; \ + mod.relocs = (kvm_nvhe_reloc_t *)__kvm_nvhe___hyprel_start; \ + mod.nr_relocs = (__kvm_nvhe___hyprel_end - __kvm_nvhe___hyprel_start) / \ + sizeof(*mod.relocs); \ + mod.init = init_fn; \ + \ + __pkvm_load_el2_module(&mod, THIS_MODULE); \ +}) +#endif +#endif diff --git a/arch/arm64/include/asm/module.lds.h b/arch/arm64/include/asm/module.lds.h index 094701ec5500..f11d92211862 100644 --- a/arch/arm64/include/asm/module.lds.h +++ b/arch/arm64/include/asm/module.lds.h @@ -1,3 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#include + SECTIONS { #ifdef CONFIG_ARM64_MODULE_PLTS .plt 0 : { BYTE(0) } @@ -17,4 +20,24 @@ SECTIONS { */ .text.hot : { *(.text.hot) } #endif + +#ifdef CONFIG_KVM + .hyp.text : ALIGN(PAGE_SIZE) { + *(.hyp.text) + . = ALIGN(PAGE_SIZE); + } + .hyp.bss : ALIGN(PAGE_SIZE) { + *(.hyp.bss) + . = ALIGN(PAGE_SIZE); + } + .hyp.rodata : ALIGN(PAGE_SIZE) { + *(.hyp.rodata) + . = ALIGN(PAGE_SIZE); + } + .hyp.data : ALIGN(PAGE_SIZE) { + *(.hyp.data) + . 
= ALIGN(PAGE_SIZE); + } + .hyp.reloc : ALIGN(4) { *(.hyp.reloc) } +#endif } diff --git a/arch/arm64/kvm/hyp/include/nvhe/mm.h b/arch/arm64/kvm/hyp/include/nvhe/mm.h index 3ceb29b36069..6afc4b1e5d04 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mm.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mm.h @@ -12,6 +12,7 @@ extern struct kvm_pgtable pkvm_pgtable; extern hyp_spinlock_t pkvm_pgd_lock; +extern const struct pkvm_module_ops module_ops; int hyp_create_pcpu_fixmap(void); void *hyp_fixmap_map(phys_addr_t phys); @@ -30,4 +31,7 @@ int __pkvm_create_private_mapping(phys_addr_t phys, size_t size, int pkvm_alloc_private_va_range(size_t size, unsigned long *haddr); void pkvm_remove_mappings(void *from, void *to); +int __pkvm_map_module_page(u64 pfn, void *va, enum kvm_pgtable_prot prot); +void __pkvm_unmap_module_page(u64 pfn, void *va); +void *__pkvm_alloc_module_va(u64 nr_pages); #endif /* __KVM_HYP_MM_H */ diff --git a/arch/arm64/kvm/hyp/include/nvhe/modules.h b/arch/arm64/kvm/hyp/include/nvhe/modules.h new file mode 100644 index 000000000000..e9e05ab58594 --- /dev/null +++ b/arch/arm64/kvm/hyp/include/nvhe/modules.h @@ -0,0 +1,5 @@ +#ifdef CONFIG_MODULES +int __pkvm_init_module(void *module_init); +#else +static inline int __pkvm_init_module(void *module_init); { return -EOPNOTSUPP; } +#endif diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile index d3a98aa3d672..52cbf4f06aab 100644 --- a/arch/arm64/kvm/hyp/nvhe/Makefile +++ b/arch/arm64/kvm/hyp/nvhe/Makefile @@ -9,6 +9,7 @@ hyp-obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o hyp-obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \ ../fpsimd.o ../hyp-entry.o ../exception.o ../pgtable.o hyp-obj-$(CONFIG_DEBUG_LIST) += list_debug.o +hyp-obj-$(CONFIG_MODULES) += modules.o hyp-obj-y += $(lib-objs) hyp-obj-$(CONFIG_KVM_S2MPU) += iommu/s2mpu.o diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile.module b/arch/arm64/kvm/hyp/nvhe/Makefile.module new file mode 100644 index 000000000000..d3ad4468b50d --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/Makefile.module @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: GPL-2.0 + +$(obj)/hyp.lds: arch/arm64/kvm/hyp/nvhe/module.lds.S FORCE + $(call if_changed_dep,cpp_lds_S) + +include $(srctree)/arch/arm64/kvm/hyp/nvhe/Makefile.nvhe diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index a2854ea1a3ad..5be78f35ea09 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -1154,6 +1155,37 @@ static void handle___pkvm_iommu_finalize(struct kvm_cpu_context *host_ctxt) cpu_reg(host_ctxt, 1) = __pkvm_iommu_finalize(); } +static void handle___pkvm_alloc_module_va(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(u64, nr_pages, host_ctxt, 1); + + cpu_reg(host_ctxt, 1) = (u64)__pkvm_alloc_module_va(nr_pages); +} + +static void handle___pkvm_map_module_page(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(u64, pfn, host_ctxt, 1); + DECLARE_REG(void *, va, host_ctxt, 2); + DECLARE_REG(enum kvm_pgtable_prot, prot, host_ctxt, 3); + + cpu_reg(host_ctxt, 1) = (u64)__pkvm_map_module_page(pfn, va, prot); +} + +static void handle___pkvm_unmap_module_page(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(u64, pfn, host_ctxt, 1); + DECLARE_REG(void *, va, host_ctxt, 2); + + __pkvm_unmap_module_page(pfn, va); +} + +static void handle___pkvm_init_module(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(void *, ptr, 
host_ctxt, 1); + + cpu_reg(host_ctxt, 1) = __pkvm_init_module(ptr); +} + typedef void (*hcall_t)(struct kvm_cpu_context *); #define HANDLE_FUNC(x) [__KVM_HOST_SMCCC_FUNC_##x] = (hcall_t)handle_##x @@ -1192,6 +1224,10 @@ static const hcall_t host_hcall[] = { HANDLE_FUNC(__pkvm_iommu_register), HANDLE_FUNC(__pkvm_iommu_pm_notify), HANDLE_FUNC(__pkvm_iommu_finalize), + HANDLE_FUNC(__pkvm_alloc_module_va), + HANDLE_FUNC(__pkvm_map_module_page), + HANDLE_FUNC(__pkvm_unmap_module_page), + HANDLE_FUNC(__pkvm_init_module), }; static void handle_host_hcall(struct kvm_cpu_context *host_ctxt) diff --git a/arch/arm64/kvm/hyp/nvhe/mm.c b/arch/arm64/kvm/hyp/nvhe/mm.c index ca4ccc4e9894..81e39e080ac5 100644 --- a/arch/arm64/kvm/hyp/nvhe/mm.c +++ b/arch/arm64/kvm/hyp/nvhe/mm.c @@ -100,6 +100,33 @@ int __pkvm_create_private_mapping(phys_addr_t phys, size_t size, return err; } +void *__pkvm_alloc_module_va(u64 nr_pages) +{ + unsigned long addr; + int ret; + + ret = pkvm_alloc_private_va_range(nr_pages << PAGE_SHIFT, &addr); + + return ret ? NULL : (void *)addr; +} + +int __pkvm_map_module_page(u64 pfn, void *va, enum kvm_pgtable_prot prot) +{ + int ret; + + ret = __pkvm_host_donate_hyp(pfn, 1); + if (ret) + return ret; + + return __pkvm_create_mappings((unsigned long)va, PAGE_SIZE, hyp_pfn_to_phys(pfn), prot); +} + +void __pkvm_unmap_module_page(u64 pfn, void *va) +{ + WARN_ON(__pkvm_hyp_donate_host(pfn, 1)); + pkvm_remove_mappings(va, va + PAGE_SIZE); +} + int pkvm_create_mappings_locked(void *from, void *to, enum kvm_pgtable_prot prot) { unsigned long start = (unsigned long)from; diff --git a/arch/arm64/kvm/hyp/nvhe/module.lds.S b/arch/arm64/kvm/hyp/nvhe/module.lds.S new file mode 100644 index 000000000000..d463a58f2197 --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/module.lds.S @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include +#include + +SECTIONS { + .hyp.text : { + HYP_SECTION_SYMBOL_NAME(.text) = .; + __hypmod_text_start = .; + *(.text .text.*) + __hypmod_text_end = .; + } + + .hyp.bss : { + HYP_SECTION_SYMBOL_NAME(.bss) = .; + __hypmod_bss_start = .; + *(.bss .bss.*) + FILL(0) + __hypmod_bss_end = .; + } + + .hyp.rodata : { + HYP_SECTION_SYMBOL_NAME(.rodata) = .; + __hypmod_rodata_start = .; + *(.rodata .rodata.*) + BYTE(0) + __hypmod_rodata_end = .; + } + + .hyp.data : { + HYP_SECTION_SYMBOL_NAME(.data) = .; + __hypmod_data_start = .; + *(.data .data.*) + BYTE(0) + __hypmod_data_end = .; + } +} diff --git a/arch/arm64/kvm/hyp/nvhe/modules.c b/arch/arm64/kvm/hyp/nvhe/modules.c new file mode 100644 index 000000000000..9e695d9a4243 --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/modules.c @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2022 Google LLC + */ +#include +#include + +#include + +const struct pkvm_module_ops module_ops = { +}; + +int __pkvm_init_module(void *module_init) +{ + int (*do_module_init)(const struct pkvm_module_ops *ops) = module_init; + int ret; + + ret = do_module_init(&module_ops); + + return ret; +} diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c index 8285eff0dcd7..4e14e055b724 100644 --- a/arch/arm64/kvm/pkvm.c +++ b/arch/arm64/kvm/pkvm.c @@ -13,7 +13,9 @@ #include #include +#include #include +#include #include "hyp_constants.h" @@ -412,3 +414,140 @@ int pkvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) return 0; } + +struct pkvm_mod_sec_mapping { + struct pkvm_module_section *sec; + enum kvm_pgtable_prot prot; +}; + +static void pkvm_unmap_module_pages(void *kern_va, void *hyp_va, size_t size) 
+{ + size_t offset; + u64 pfn; + + for (offset = 0; offset < size; offset += PAGE_SIZE) { + pfn = vmalloc_to_pfn(kern_va + offset); + kvm_call_hyp_nvhe(__pkvm_unmap_module_page, pfn, + hyp_va + offset); + } +} + +static void pkvm_unmap_module_sections(struct pkvm_mod_sec_mapping *secs_map, void *hyp_va_base, int nr_secs) +{ + size_t offset, size; + void *start; + int i; + + for (i = 0; i < nr_secs; i++) { + start = secs_map[i].sec->start; + size = secs_map[i].sec->end - start; + offset = start - secs_map[0].sec->start; + pkvm_unmap_module_pages(start, hyp_va_base + offset, size); + } +} + +static int pkvm_map_module_section(struct pkvm_mod_sec_mapping *sec_map, void *hyp_va) +{ + size_t offset, size = sec_map->sec->end - sec_map->sec->start; + int ret; + u64 pfn; + + for (offset = 0; offset < size; offset += PAGE_SIZE) { + pfn = vmalloc_to_pfn(sec_map->sec->start + offset); + ret = kvm_call_hyp_nvhe(__pkvm_map_module_page, pfn, + hyp_va + offset, sec_map->prot); + if (ret) { + pkvm_unmap_module_pages(sec_map->sec->start, hyp_va, offset); + return ret; + } + } + + return 0; +} + +static int pkvm_map_module_sections(struct pkvm_mod_sec_mapping *secs_map, void *hyp_va_base, int nr_secs) +{ + size_t offset; + int i, ret; + + for (i = 0; i < nr_secs; i++) { + offset = secs_map[i].sec->start - secs_map[0].sec->start; + ret = pkvm_map_module_section(&secs_map[i], hyp_va_base + offset); + if (ret) { + pkvm_unmap_module_sections(secs_map, hyp_va_base, i); + return ret; + } + } + + return 0; +} + +static int __pkvm_cmp_mod_sec(const void *p1, const void *p2) +{ + struct pkvm_mod_sec_mapping const *s1 = p1; + struct pkvm_mod_sec_mapping const *s2 = p2; + + return s1->sec->start < s2->sec->start ? -1 : s1->sec->start > s2->sec->start; +} + +int __pkvm_load_el2_module(struct pkvm_el2_module *mod, struct module *this) +{ + struct pkvm_mod_sec_mapping secs_map[] = { + { &mod->text, KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_X }, + { &mod->bss, KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W }, + { &mod->rodata, KVM_PGTABLE_PROT_R }, + { &mod->data, KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W }, + }; + void *start, *end, *hyp_va; + kvm_nvhe_reloc_t *endrel; + size_t offset, size; + int ret, i; + + if (!is_protected_kvm_enabled()) + return -EOPNOTSUPP; + + for (i = 0; i < ARRAY_SIZE(secs_map); i++) { + if (!PAGE_ALIGNED(secs_map[i].sec->start)) { + kvm_err("EL2 sections are not page-aligned\n"); + return -EINVAL; + } + } + + if (!try_module_get(this)) { + kvm_err("Kernel module has been unloaded\n"); + return -ENODEV; + } + + sort(secs_map, ARRAY_SIZE(secs_map), sizeof(secs_map[0]), __pkvm_cmp_mod_sec, NULL); + start = secs_map[0].sec->start; + end = secs_map[ARRAY_SIZE(secs_map) - 1].sec->end; + size = PAGE_ALIGN(end - start); + + hyp_va = (void *)kvm_call_hyp_nvhe(__pkvm_alloc_module_va, size >> PAGE_SHIFT); + if (!hyp_va) { + kvm_err("Failed to allocate hypervisor VA space for EL2 module\n"); + module_put(this); + return -ENOMEM; + } + endrel = (void *)mod->relocs + mod->nr_relocs * sizeof(*endrel); + kvm_apply_hyp_module_relocations(start, hyp_va, mod->relocs, endrel); + + ret = pkvm_map_module_sections(secs_map, hyp_va, ARRAY_SIZE(secs_map)); + if (ret) { + kvm_err("Failed to map EL2 module page: %d\n", ret); + module_put(this); + return ret; + } + + offset = (size_t)((void *)mod->init - start); + ret = kvm_call_hyp_nvhe(__pkvm_init_module, hyp_va + offset); + if (ret) { + kvm_err("Failed to init EL2 module: %d\n", ret); + pkvm_unmap_module_sections(secs_map, hyp_va, ARRAY_SIZE(secs_map)); + module_put(this); + return 
ret; + } + + return 0; +} +EXPORT_SYMBOL_GPL(__pkvm_load_el2_module); diff --git a/arch/arm64/kvm/va_layout.c b/arch/arm64/kvm/va_layout.c index 91b22a014610..2923da080521 100644 --- a/arch/arm64/kvm/va_layout.c +++ b/arch/arm64/kvm/va_layout.c @@ -12,6 +12,7 @@ #include #include #include +#include /* * The LSB of the HYP VA tag @@ -109,6 +110,29 @@ __init void kvm_apply_hyp_relocations(void) } } +void kvm_apply_hyp_module_relocations(void *mod_start, void *hyp_va, + kvm_nvhe_reloc_t *begin, + kvm_nvhe_reloc_t *end) +{ + kvm_nvhe_reloc_t *rel; + + for (rel = begin; rel < end; ++rel) { + u32 **ptr, *va; + + /* + * Each entry contains a 32-bit relative offset from itself + * to a VA position in the module area. + */ + ptr = (u32 **)((char *)rel + *rel); + + /* Read the module VA value at the relocation address. */ + va = *ptr; + + /* Convert the module VA of the reloc to a hyp VA */ + WARN_ON(aarch64_addr_write(ptr, (u64)(((void *)va - mod_start) + hyp_va))); + } +} + static u32 compute_instruction(int n, u32 rd, u32 rn) { u32 insn = AARCH64_BREAK_FAULT; From 4deb454ebf4482aaf3fd7c008fb7a7add6266dff Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Wed, 7 Dec 2022 09:24:51 +0000 Subject: [PATCH 239/457] ANDROID: KVM: arm64: Include .note.gnu.property in .hyp.rodata Since .hyp.rodata sections of pKVM modules are emitted with SHT_MERGE, ld.ldd feels free to attempt merging it with other sections. Unfortunately, the pKVM module linker script doesn't always place them in output sections, hence causing link failures: ld.lld: error: drivers/misc/pkvm-pl011/hyp/kvm_nvhe.tmp.o:(.hyp.rodata): offset is outside the section In practice, ld.ldd only seems to attempt merging .note.gnu.property with .hyp.rodata. To work around the problem, make sure to explicitely place the .note.gnu.property in .hyp.rodata from the start, hence preventing ld.ldd from trying to optimize further. A preferable solution would be to teach ld.lld that merging pKVM modules sections is a bad idea, or to make sure the sections are not emitted with SHT_MERGE to begin with, but we couldn't find an obvious way to make that happen. This workaround is nothing more than a pratical compromise. Bug: 244543039 Reported-by: Will Deacon Suggested-by: Will Deacon Change-Id: Iae902bdfd21915f552e218515cd77881a95fef2d Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/nvhe/module.lds.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hyp/nvhe/module.lds.S b/arch/arm64/kvm/hyp/nvhe/module.lds.S index d463a58f2197..696ab5408265 100644 --- a/arch/arm64/kvm/hyp/nvhe/module.lds.S +++ b/arch/arm64/kvm/hyp/nvhe/module.lds.S @@ -22,7 +22,7 @@ SECTIONS { .hyp.rodata : { HYP_SECTION_SYMBOL_NAME(.rodata) = .; __hypmod_rodata_start = .; - *(.rodata .rodata.*) + *(.rodata .rodata.* .note.gnu.property) BYTE(0) __hypmod_rodata_end = .; } From c3db83c87f5eec9f3976a75ef718285b028e970a Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Mon, 19 Sep 2022 13:12:44 +0000 Subject: [PATCH 240/457] ANDROID: KVM: arm64: Expose __pkvm_create_private_mapping to pKVM modules pKVM has an internal API allowing to create mappings in the 'private' range of the hypervisor VA space for which there are no rules constraining the VA-PA relation (hence comparable to the vmalloc area in the kernel). This will be a useful API for hypervisor modules, so expose it in the recently introduced module ops struct. 
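For illustration only, a module's EL2 init could use the new ops entry roughly as follows (the physical address below is a made-up placeholder):

  static int my_hyp_init(const struct pkvm_module_ops *ops)
  {
          unsigned long va;
          int ret;

          ret = ops->create_private_mapping(0x09000000, PAGE_SIZE,
                                            KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W,
                                            &va);
          if (ret)
                  return ret;

          /* 'va' is now a hypervisor-private mapping of that page. */
          return 0;
  }
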
Bug: 244543039 Bug: 244373730 Change-Id: I2a8f958a02c3c3b9871224b65b00b207820a507a Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_pkvm_module.h | 4 ++++ arch/arm64/kvm/hyp/nvhe/modules.c | 2 ++ 2 files changed, 6 insertions(+) diff --git a/arch/arm64/include/asm/kvm_pkvm_module.h b/arch/arm64/include/asm/kvm_pkvm_module.h index 7d0b680d38e1..8b45704529e7 100644 --- a/arch/arm64/include/asm/kvm_pkvm_module.h +++ b/arch/arm64/include/asm/kvm_pkvm_module.h @@ -3,9 +3,13 @@ #ifndef __ARM64_KVM_PKVM_MODULE_H__ #define __ARM64_KVM_PKVM_MODULE_H__ +#include #include struct pkvm_module_ops { + int (*create_private_mapping)(phys_addr_t phys, size_t size, + enum kvm_pgtable_prot prot, + unsigned long *haddr); }; struct pkvm_module_section { diff --git a/arch/arm64/kvm/hyp/nvhe/modules.c b/arch/arm64/kvm/hyp/nvhe/modules.c index 9e695d9a4243..43c670d4f7cf 100644 --- a/arch/arm64/kvm/hyp/nvhe/modules.c +++ b/arch/arm64/kvm/hyp/nvhe/modules.c @@ -6,8 +6,10 @@ #include #include +#include const struct pkvm_module_ops module_ops = { + .create_private_mapping = __pkvm_create_private_mapping, }; int __pkvm_init_module(void *module_init) From b21ae4963d0d204a1405c4aedd76eb01cd79dfdd Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Mon, 19 Sep 2022 13:35:54 +0000 Subject: [PATCH 241/457] ANDROID: KVM: arm64: Add serial framework for pKVM Debugging a hypervisor tends to be trickier than normal system code such as the kernel. The lack of availability of a UART framework is a significant contributor to that. In order to address this, introduce a framework allowing to load serial drivers into the hypervisor. Bug: 244543039 Bug: 244373730 Change-Id: I2e7a1fd9abc9d5aa9d95f1d271a997d54a8fd582 Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_pkvm_module.h | 1 + arch/arm64/kvm/hyp/include/nvhe/serial.h | 11 ++++ arch/arm64/kvm/hyp/nvhe/Makefile | 3 +- arch/arm64/kvm/hyp/nvhe/modules.c | 2 + arch/arm64/kvm/hyp/nvhe/serial.c | 68 ++++++++++++++++++++++++ arch/arm64/kvm/hyp/nvhe/setup.c | 1 + 6 files changed, 85 insertions(+), 1 deletion(-) create mode 100644 arch/arm64/kvm/hyp/include/nvhe/serial.h create mode 100644 arch/arm64/kvm/hyp/nvhe/serial.c diff --git a/arch/arm64/include/asm/kvm_pkvm_module.h b/arch/arm64/include/asm/kvm_pkvm_module.h index 8b45704529e7..27bc000132c3 100644 --- a/arch/arm64/include/asm/kvm_pkvm_module.h +++ b/arch/arm64/include/asm/kvm_pkvm_module.h @@ -10,6 +10,7 @@ struct pkvm_module_ops { int (*create_private_mapping)(phys_addr_t phys, size_t size, enum kvm_pgtable_prot prot, unsigned long *haddr); + int (*register_serial_driver)(void (*hyp_putc_cb)(char)); }; struct pkvm_module_section { diff --git a/arch/arm64/kvm/hyp/include/nvhe/serial.h b/arch/arm64/kvm/hyp/include/nvhe/serial.h new file mode 100644 index 000000000000..85ff8fdd6c9c --- /dev/null +++ b/arch/arm64/kvm/hyp/include/nvhe/serial.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef __ARM64_KVM_NVHE_SERIAL_H__ +#define __ARM64_KVM_NVHE_SERIAL_H__ + +void hyp_puts(const char *s); +void hyp_putx64(u64 x); +void hyp_putc(char c); +int __pkvm_register_serial_driver(void (*driver_cb)(char)); + +#endif diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile index 52cbf4f06aab..2e08dacb7b6f 100644 --- a/arch/arm64/kvm/hyp/nvhe/Makefile +++ b/arch/arm64/kvm/hyp/nvhe/Makefile @@ -5,7 +5,8 @@ lib-objs := $(addprefix ../../../lib/, $(lib-objs)) hyp-obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o \ hyp-main.o hyp-smp.o psci-relay.o 
early_alloc.o page_alloc.o \ - cache.o setup.o mm.o mem_protect.o sys_regs.o pkvm.o stacktrace.o ffa.o iommu.o + cache.o setup.o mm.o mem_protect.o sys_regs.o pkvm.o stacktrace.o ffa.o iommu.o \ + serial.o hyp-obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \ ../fpsimd.o ../hyp-entry.o ../exception.o ../pgtable.o hyp-obj-$(CONFIG_DEBUG_LIST) += list_debug.o diff --git a/arch/arm64/kvm/hyp/nvhe/modules.c b/arch/arm64/kvm/hyp/nvhe/modules.c index 43c670d4f7cf..b845fe559307 100644 --- a/arch/arm64/kvm/hyp/nvhe/modules.c +++ b/arch/arm64/kvm/hyp/nvhe/modules.c @@ -7,9 +7,11 @@ #include #include +#include const struct pkvm_module_ops module_ops = { .create_private_mapping = __pkvm_create_private_mapping, + .register_serial_driver = __pkvm_register_serial_driver, }; int __pkvm_init_module(void *module_init) diff --git a/arch/arm64/kvm/hyp/nvhe/serial.c b/arch/arm64/kvm/hyp/nvhe/serial.c new file mode 100644 index 000000000000..0b2cf3b6d6a5 --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/serial.c @@ -0,0 +1,68 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2022 - Google LLC + */ + +#include +#include + +static void (*__hyp_putc)(char c); + +static inline void __hyp_putx4(unsigned int x) +{ + x &= 0xf; + if (x <= 9) + x += '0'; + else + x += ('a' - 0xa); + + __hyp_putc(x); +} + +static inline void __hyp_putx4n(unsigned long x, int n) +{ + int i = n >> 2; + + __hyp_putc('0'); + __hyp_putc('x'); + + while (i--) + __hyp_putx4(x >> (4 * i)); + + __hyp_putc('\n'); + __hyp_putc('\r'); +} + +static inline bool hyp_serial_enabled(void) +{ + return !!READ_ONCE(__hyp_putc); +} + +void hyp_puts(const char *s) +{ + if (!hyp_serial_enabled()) + return; + + while (*s) + __hyp_putc(*s++); + + __hyp_putc('\n'); + __hyp_putc('\r'); +} + +void hyp_putx64(u64 x) +{ + if (hyp_serial_enabled()) + __hyp_putx4n(x, 64); +} + +void hyp_putc(char c) +{ + if (hyp_serial_enabled()) + __hyp_putc(c); +} + +int __pkvm_register_serial_driver(void (*cb)(char)) +{ + return cmpxchg(&__hyp_putc, NULL, cb) ? -EBUSY : 0; +} diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c index 66392411ecaa..5e379f03fcca 100644 --- a/arch/arm64/kvm/hyp/nvhe/setup.c +++ b/arch/arm64/kvm/hyp/nvhe/setup.c @@ -18,6 +18,7 @@ #include #include #include +#include #include unsigned long hyp_nr_cpus; From 29dceaf3d563c522e23052b1ec6b85ba3d358773 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Tue, 25 Oct 2022 13:03:38 +0000 Subject: [PATCH 242/457] ANDROID: KVM: arm64: Expose puts and putx64 in pKVM ABI Expose the hyp_puts() and hyp_putx64() helpers in the module_ops struct to allow logging messages on the UART from pKVM modules. 
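For illustration only, an EL2 module init could then log like this (strings and values are arbitrary):

  static int my_hyp_init(const struct pkvm_module_ops *ops)
  {
          ops->puts("pkvm module: hello from EL2");
          ops->putx64(0x1234abcd);
          return 0;
  }

Note that both calls are silently dropped until a driver has been registered with register_serial_driver().
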
Bug: 244543039 Bug: 244373730 Change-Id: Ica578667297e5a1f94c370603c29482be89982a9 Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_pkvm_module.h | 2 ++ arch/arm64/kvm/hyp/nvhe/modules.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/arch/arm64/include/asm/kvm_pkvm_module.h b/arch/arm64/include/asm/kvm_pkvm_module.h index 27bc000132c3..a9dabdaf79ba 100644 --- a/arch/arm64/include/asm/kvm_pkvm_module.h +++ b/arch/arm64/include/asm/kvm_pkvm_module.h @@ -11,6 +11,8 @@ struct pkvm_module_ops { enum kvm_pgtable_prot prot, unsigned long *haddr); int (*register_serial_driver)(void (*hyp_putc_cb)(char)); + void (*puts)(const char *str); + void (*putx64)(u64 num); }; struct pkvm_module_section { diff --git a/arch/arm64/kvm/hyp/nvhe/modules.c b/arch/arm64/kvm/hyp/nvhe/modules.c index b845fe559307..f0514ee2593b 100644 --- a/arch/arm64/kvm/hyp/nvhe/modules.c +++ b/arch/arm64/kvm/hyp/nvhe/modules.c @@ -12,6 +12,8 @@ const struct pkvm_module_ops module_ops = { .create_private_mapping = __pkvm_create_private_mapping, .register_serial_driver = __pkvm_register_serial_driver, + .puts = hyp_puts, + .putx64 = hyp_putx64, }; int __pkvm_init_module(void *module_init) From 4d4c9f98296e82e0ef92acade21209c9a0febdfc Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Wed, 26 Oct 2022 08:44:34 +0000 Subject: [PATCH 243/457] ANDROID: KVM: arm64: Expose hyp fixmap helpers in module_ops Expose the hyp_fixmap helpers in the module_ops struct to allow dynamic mapping and unmapping of pages from pKVM modules. Bug: 244543039 Bug: 244373730 Change-Id: I201db6044ed5eb4c2821a64a6b650b931dd2e389 Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_pkvm_module.h | 2 ++ arch/arm64/kvm/hyp/nvhe/modules.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/arch/arm64/include/asm/kvm_pkvm_module.h b/arch/arm64/include/asm/kvm_pkvm_module.h index a9dabdaf79ba..af8491c1ccae 100644 --- a/arch/arm64/include/asm/kvm_pkvm_module.h +++ b/arch/arm64/include/asm/kvm_pkvm_module.h @@ -13,6 +13,8 @@ struct pkvm_module_ops { int (*register_serial_driver)(void (*hyp_putc_cb)(char)); void (*puts)(const char *str); void (*putx64)(u64 num); + void *(*fixmap_map)(phys_addr_t phys); + void (*fixmap_unmap)(void); }; struct pkvm_module_section { diff --git a/arch/arm64/kvm/hyp/nvhe/modules.c b/arch/arm64/kvm/hyp/nvhe/modules.c index f0514ee2593b..d8e11922597a 100644 --- a/arch/arm64/kvm/hyp/nvhe/modules.c +++ b/arch/arm64/kvm/hyp/nvhe/modules.c @@ -14,6 +14,8 @@ const struct pkvm_module_ops module_ops = { .register_serial_driver = __pkvm_register_serial_driver, .puts = hyp_puts, .putx64 = hyp_putx64, + .fixmap_map = hyp_fixmap_map, + .fixmap_unmap = hyp_fixmap_unmap, }; int __pkvm_init_module(void *module_init) From 4f73cf46ab6f7040f4ace9ba581cf554b01fd6a4 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Wed, 26 Oct 2022 10:32:15 +0000 Subject: [PATCH 244/457] ANDROID: KVM: arm64: Expose kvm_flush_dcache_to_poc() in module_ops Expose kvm_flush_dcache_to_poc() in the module_ops struct to allow CMOs from pKVM modules. 
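For illustration only, a module could combine this with the fixmap helpers exposed above to clean a page it has just written ('phys' is assumed to be a page the module is allowed to touch):

  static void my_write_and_clean(const struct pkvm_module_ops *ops, phys_addr_t phys)
  {
          void *va = ops->fixmap_map(phys);

          memset(va, 0, PAGE_SIZE);
          ops->flush_dcache_to_poc(va, PAGE_SIZE);
          ops->fixmap_unmap();
  }
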
Bug: 244543039 Bug: 244373730 Change-Id: I91d57a94effd2710d868591c6baf4a5672d149a4 Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_pkvm_module.h | 1 + arch/arm64/kvm/hyp/nvhe/modules.c | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/arch/arm64/include/asm/kvm_pkvm_module.h b/arch/arm64/include/asm/kvm_pkvm_module.h index af8491c1ccae..49f3b0ff2549 100644 --- a/arch/arm64/include/asm/kvm_pkvm_module.h +++ b/arch/arm64/include/asm/kvm_pkvm_module.h @@ -15,6 +15,7 @@ struct pkvm_module_ops { void (*putx64)(u64 num); void *(*fixmap_map)(phys_addr_t phys); void (*fixmap_unmap)(void); + void (*flush_dcache_to_poc)(void *addr, size_t size); }; struct pkvm_module_section { diff --git a/arch/arm64/kvm/hyp/nvhe/modules.c b/arch/arm64/kvm/hyp/nvhe/modules.c index d8e11922597a..c91488716557 100644 --- a/arch/arm64/kvm/hyp/nvhe/modules.c +++ b/arch/arm64/kvm/hyp/nvhe/modules.c @@ -9,6 +9,11 @@ #include #include +static void __kvm_flush_dcache_to_poc(void *addr, size_t size) +{ + kvm_flush_dcache_to_poc((unsigned long)addr, (unsigned long)size); +} + const struct pkvm_module_ops module_ops = { .create_private_mapping = __pkvm_create_private_mapping, .register_serial_driver = __pkvm_register_serial_driver, @@ -16,6 +21,7 @@ const struct pkvm_module_ops module_ops = { .putx64 = hyp_putx64, .fixmap_map = hyp_fixmap_map, .fixmap_unmap = hyp_fixmap_unmap, + .flush_dcache_to_poc = __kvm_flush_dcache_to_poc, }; int __pkvm_init_module(void *module_init) From c0faf0f3e756aee933e77f8e7e21895c2ae9c0f1 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Tue, 25 Oct 2022 15:22:50 +0000 Subject: [PATCH 245/457] ANDROID: KVM: arm64: Introduce PKVM_PAGE_RESTRICTED_PROT pKVM currently doesn't map pages with reduced permissions in stage2 page-tables, for both host and guests. But in preparation for allowing that to happen, introduce a new page 'meta-state' that is set whenever a page has non-default permissions. Introducing it ensures that e.g. host-to-guest donations or shares will fail for a page that the host has read-only, as the page will no longer be PKVM_PAGE_OWNED only. 
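Concretely, a host page mapped with reduced permissions at stage-2 now reports PKVM_PAGE_OWNED | PKVM_PAGE_RESTRICTED_PROT, so an ownership check of the following form (sketch of the resulting behaviour, not a literal hunk) rejects it:

  if (host_get_page_state(pte) != PKVM_PAGE_OWNED)
          return -EPERM;  /* donating or sharing a restricted page now fails */
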
Bug: 244543039 Bug: 244373730 Change-Id: Ib3b20c7edabafb9a305f000a37bbdb4bcc8fdbbb Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/include/nvhe/mem_protect.h | 1 + arch/arm64/kvm/hyp/nvhe/mem_protect.c | 27 ++++++++++++++++--- 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h index e28316bc1a18..e9a3c0740cf6 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h @@ -31,6 +31,7 @@ enum pkvm_page_state { /* Meta-states which aren't encoded directly in the PTE's SW bits */ PKVM_NOPAGE, + PKVM_PAGE_RESTRICTED_PROT, }; #define PKVM_PAGE_STATE_PROT_MASK (KVM_PGTABLE_PROT_SW0 | KVM_PGTABLE_PROT_SW1) diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 88b92197526a..fbebff2e088c 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -824,10 +824,17 @@ static int check_page_state_range(struct kvm_pgtable *pgt, u64 addr, u64 size, static enum pkvm_page_state host_get_page_state(kvm_pte_t pte) { + enum pkvm_page_state state = 0; + enum kvm_pgtable_prot prot; + if (!kvm_pte_valid(pte) && pte) return PKVM_NOPAGE; - return pkvm_getstate(kvm_pgtable_stage2_pte_prot(pte)); + prot = kvm_pgtable_stage2_pte_prot(pte); + if (kvm_pte_valid(pte) && ((prot & KVM_PGTABLE_PROT_RWX) != PKVM_HOST_MEM_PROT)) + state = PKVM_PAGE_RESTRICTED_PROT; + + return state | pkvm_getstate(prot); } static int __host_check_page_state_range(u64 addr, u64 size, @@ -973,10 +980,17 @@ static int host_complete_donation(u64 addr, const struct pkvm_mem_transition *tx static enum pkvm_page_state hyp_get_page_state(kvm_pte_t pte) { + enum pkvm_page_state state = 0; + enum kvm_pgtable_prot prot; + if (!kvm_pte_valid(pte)) return PKVM_NOPAGE; - return pkvm_getstate(kvm_pgtable_hyp_pte_prot(pte)); + prot = kvm_pgtable_hyp_pte_prot(pte); + if (kvm_pte_valid(pte) && ((prot & KVM_PGTABLE_PROT_RWX) != PAGE_HYP)) + state = PKVM_PAGE_RESTRICTED_PROT; + + return state | pkvm_getstate(prot); } static int __hyp_check_page_state_range(u64 addr, u64 size, @@ -1085,10 +1099,17 @@ static int hyp_complete_donation(u64 addr, static enum pkvm_page_state guest_get_page_state(kvm_pte_t pte) { + enum pkvm_page_state state = 0; + enum kvm_pgtable_prot prot; + if (!kvm_pte_valid(pte)) return PKVM_NOPAGE; - return pkvm_getstate(kvm_pgtable_stage2_pte_prot(pte)); + prot = kvm_pgtable_stage2_pte_prot(pte); + if (kvm_pte_valid(pte) && ((prot & KVM_PGTABLE_PROT_RWX) != KVM_PGTABLE_PROT_RWX)) + state = PKVM_PAGE_RESTRICTED_PROT; + + return state | pkvm_getstate(prot); } static int __guest_check_page_state_range(struct pkvm_hyp_vcpu *vcpu, u64 addr, From 5b2287bddcae9078d9f82ac4bc5c7c8e56ef45a6 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Tue, 25 Oct 2022 16:06:39 +0000 Subject: [PATCH 246/457] ANDROID: KVM: arm64: Add a permission fault handler In preparation for allowing to restrict host permissions at stage-2 for certain pages, introduce some infrastructure allowing a pKVM module to register a permission fault handler. 
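For illustration only, a module could hook this up as below. Names are made up; returning 0 is taken here to mean the fault was handled, while -EPERM preserves the default behaviour of injecting an abort into the host:

  static int my_perm_fault_handler(struct kvm_cpu_context *ctxt, u64 esr, u64 addr)
  {
          /* e.g. emulate the access or relax the protection, then return 0 */
          return -EPERM;
  }

  static int my_hyp_init(const struct pkvm_module_ops *ops)
  {
          return ops->register_host_perm_fault_handler(my_perm_fault_handler);
  }
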
Bug: 244543039 Bug: 244373730 Change-Id: I8035c64969cc0ebb01c8936b1974b3bc103ba84f Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_pkvm_module.h | 1 + arch/arm64/kvm/hyp/include/nvhe/mem_protect.h | 1 + arch/arm64/kvm/hyp/nvhe/mem_protect.c | 18 ++++++++++++++++++ arch/arm64/kvm/hyp/nvhe/modules.c | 1 + 4 files changed, 21 insertions(+) diff --git a/arch/arm64/include/asm/kvm_pkvm_module.h b/arch/arm64/include/asm/kvm_pkvm_module.h index 49f3b0ff2549..9967b3c2eac7 100644 --- a/arch/arm64/include/asm/kvm_pkvm_module.h +++ b/arch/arm64/include/asm/kvm_pkvm_module.h @@ -16,6 +16,7 @@ struct pkvm_module_ops { void *(*fixmap_map)(phys_addr_t phys); void (*fixmap_unmap)(void); void (*flush_dcache_to_poc)(void *addr, size_t size); + int (*register_host_perm_fault_handler)(int (*cb)(struct kvm_cpu_context *ctxt, u64 esr, u64 addr)); }; struct pkvm_module_section { diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h index e9a3c0740cf6..d532d4a53b3f 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h @@ -92,6 +92,7 @@ int kvm_host_prepare_stage2(void *pgt_pool_base); int kvm_guest_prepare_stage2(struct pkvm_hyp_vm *vm, void *pgd); void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt); +int hyp_register_host_perm_fault_handler(int (*cb)(struct kvm_cpu_context *ctxt, u64 esr, u64 addr)); int hyp_pin_shared_mem(void *from, void *to); void hyp_unpin_shared_mem(void *from, void *to); void reclaim_guest_pages(struct pkvm_hyp_vm *vm, struct kvm_hyp_memcache *mc); diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index fbebff2e088c..f46f5af4f840 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -717,6 +717,21 @@ static bool is_dabt(u64 esr) return ESR_ELx_EC(esr) == ESR_ELx_EC_DABT_LOW; } +static int (*perm_fault_handler)(struct kvm_cpu_context *host_ctxt, u64 esr, u64 addr); + +int hyp_register_host_perm_fault_handler(int (*cb)(struct kvm_cpu_context *ctxt, u64 esr, u64 addr)) +{ + return cmpxchg(&perm_fault_handler, NULL, cb) ? -EBUSY : 0; +} + +static int handle_host_perm_fault(struct kvm_cpu_context *host_ctxt, u64 esr, u64 addr) +{ + int (*cb)(struct kvm_cpu_context *host_ctxt, u64 esr, u64 addr); + + cb = READ_ONCE(perm_fault_handler); + return cb ? 
cb(host_ctxt, esr, addr) : -EPERM; +} + void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt) { struct kvm_vcpu_fault_info fault; @@ -742,6 +757,9 @@ void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt) host_unlock_component(); + if ((esr & ESR_ELx_FSC_TYPE) == FSC_PERM) + ret = handle_host_perm_fault(host_ctxt, esr, addr); + if (ret == -EPERM) host_inject_abort(host_ctxt); else diff --git a/arch/arm64/kvm/hyp/nvhe/modules.c b/arch/arm64/kvm/hyp/nvhe/modules.c index c91488716557..e4bbfc60c6c4 100644 --- a/arch/arm64/kvm/hyp/nvhe/modules.c +++ b/arch/arm64/kvm/hyp/nvhe/modules.c @@ -22,6 +22,7 @@ const struct pkvm_module_ops module_ops = { .fixmap_map = hyp_fixmap_map, .fixmap_unmap = hyp_fixmap_unmap, .flush_dcache_to_poc = __kvm_flush_dcache_to_poc, + .register_host_perm_fault_handler = hyp_register_host_perm_fault_handler, }; int __pkvm_init_module(void *module_init) From fc7032826c98097ee234d919929b1c50965e7bc5 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Tue, 25 Oct 2022 15:48:36 +0000 Subject: [PATCH 247/457] ANDROID: KVM: arm64: Introduce hyp_protect_host_page() Introduce a new helper allowing to map host-owned pages with reduced permissions (e.g. RO) at stage-2. Bug: 244543039 Bug: 244373730 Change-Id: I96f732eb044b1fcbdc3d7db51d406580f0142ad4 Signed-off-by: Quentin Perret [vdonnefort@: Fix for host_stage2_idmap_locked merge conflict] Signed-off-by: Vincent Donnefort Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_pkvm_module.h | 1 + arch/arm64/kvm/hyp/include/nvhe/mem_protect.h | 1 + arch/arm64/kvm/hyp/nvhe/mem_protect.c | 32 +++++++++++++++++++ arch/arm64/kvm/hyp/nvhe/modules.c | 2 ++ 4 files changed, 36 insertions(+) diff --git a/arch/arm64/include/asm/kvm_pkvm_module.h b/arch/arm64/include/asm/kvm_pkvm_module.h index 9967b3c2eac7..92bb90954cfd 100644 --- a/arch/arm64/include/asm/kvm_pkvm_module.h +++ b/arch/arm64/include/asm/kvm_pkvm_module.h @@ -17,6 +17,7 @@ struct pkvm_module_ops { void (*fixmap_unmap)(void); void (*flush_dcache_to_poc)(void *addr, size_t size); int (*register_host_perm_fault_handler)(int (*cb)(struct kvm_cpu_context *ctxt, u64 esr, u64 addr)); + int (*protect_host_page)(u64 pfn, enum kvm_pgtable_prot prot); }; struct pkvm_module_section { diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h index d532d4a53b3f..64d643355759 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h @@ -96,6 +96,7 @@ int hyp_register_host_perm_fault_handler(int (*cb)(struct kvm_cpu_context *ctxt, int hyp_pin_shared_mem(void *from, void *to); void hyp_unpin_shared_mem(void *from, void *to); void reclaim_guest_pages(struct pkvm_hyp_vm *vm, struct kvm_hyp_memcache *mc); +int hyp_protect_host_page(u64 pfn, enum kvm_pgtable_prot prot); int refill_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages, struct kvm_hyp_memcache *host_mc); diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index f46f5af4f840..bc9c7edf9df3 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -1819,6 +1819,38 @@ int __pkvm_hyp_donate_host(u64 pfn, u64 nr_pages) return ret; } +int hyp_protect_host_page(u64 pfn, enum kvm_pgtable_prot prot) +{ + u64 addr = hyp_pfn_to_phys(pfn); + kvm_pte_t pte; + u32 level; + int ret; + + if ((prot & KVM_PGTABLE_PROT_RWX) != prot || prot == KVM_PGTABLE_PROT_RWX) + return -EINVAL; + + host_lock_component(); + ret = 
kvm_pgtable_get_leaf(&host_mmu.pgt, addr, &pte, &level); + if (ret) + goto unlock; + + if (host_get_page_state(pte) != PKVM_PAGE_OWNED) { + ret = -EPERM; + goto unlock; + } + + /* XXX: optimize ... */ + if (kvm_pte_valid(pte) && (level == KVM_PGTABLE_MAX_LEVELS - 1)) + ret = kvm_pgtable_stage2_unmap(&host_mmu.pgt, addr, PAGE_SIZE); + if (!ret) + ret = host_stage2_idmap_locked(addr, PAGE_SIZE, prot, false); + +unlock: + host_unlock_component(); + + return ret; +} + int hyp_pin_shared_mem(void *from, void *to) { u64 cur, start = ALIGN_DOWN((u64)from, PAGE_SIZE); diff --git a/arch/arm64/kvm/hyp/nvhe/modules.c b/arch/arm64/kvm/hyp/nvhe/modules.c index e4bbfc60c6c4..b3c6fbb452d9 100644 --- a/arch/arm64/kvm/hyp/nvhe/modules.c +++ b/arch/arm64/kvm/hyp/nvhe/modules.c @@ -5,6 +5,7 @@ #include #include +#include #include #include #include @@ -23,6 +24,7 @@ const struct pkvm_module_ops module_ops = { .fixmap_unmap = hyp_fixmap_unmap, .flush_dcache_to_poc = __kvm_flush_dcache_to_poc, .register_host_perm_fault_handler = hyp_register_host_perm_fault_handler, + .protect_host_page = hyp_protect_host_page, }; int __pkvm_init_module(void *module_init) From a8f7fefd69019e76d0c02abb236ddb7f7827cd49 Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Fri, 25 Nov 2022 18:02:30 +0000 Subject: [PATCH 248/457] ANDROID: KVM: arm64: Return a token for a pKVM module registration Later introduction of custom HVCs will require to store some data. Conviniently, the only thing we need is the start addr (in the hyp VA) of the text section. So use this value as a token to avoid having to store it anywhere. Bug: 244543039 Bug: 244373730 Change-Id: Idd0bcbbb36d189aa4932833ca5b40382c2cddb08 Signed-off-by: Vincent Donnefort Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_pkvm_module.h | 7 ++++--- arch/arm64/kvm/pkvm.c | 12 +++++++++++- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/arch/arm64/include/asm/kvm_pkvm_module.h b/arch/arm64/include/asm/kvm_pkvm_module.h index 92bb90954cfd..2a8ced4b53d5 100644 --- a/arch/arm64/include/asm/kvm_pkvm_module.h +++ b/arch/arm64/include/asm/kvm_pkvm_module.h @@ -38,9 +38,10 @@ struct pkvm_el2_module { }; #ifdef MODULE -int __pkvm_load_el2_module(struct pkvm_el2_module *mod, struct module *this); +int __pkvm_load_el2_module(struct pkvm_el2_module *mod, struct module *this, + unsigned long *token); -#define pkvm_load_el2_module(init_fn) \ +#define pkvm_load_el2_module(init_fn, token) \ ({ \ extern char __kvm_nvhe___hypmod_text_start[]; \ extern char __kvm_nvhe___hypmod_text_end[]; \ @@ -67,7 +68,7 @@ int __pkvm_load_el2_module(struct pkvm_el2_module *mod, struct module *this); sizeof(*mod.relocs); \ mod.init = init_fn; \ \ - __pkvm_load_el2_module(&mod, THIS_MODULE); \ + __pkvm_load_el2_module(&mod, THIS_MODULE, token); \ }) #endif #endif diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c index 4e14e055b724..77af44dc0019 100644 --- a/arch/arm64/kvm/pkvm.c +++ b/arch/arm64/kvm/pkvm.c @@ -490,7 +490,8 @@ static int __pkvm_cmp_mod_sec(const void *p1, const void *p2) return s1->sec->start < s2->sec->start ? 
-1 : s1->sec->start > s2->sec->start; } -int __pkvm_load_el2_module(struct pkvm_el2_module *mod, struct module *this) +int __pkvm_load_el2_module(struct pkvm_el2_module *mod, struct module *this, + unsigned long *token) { struct pkvm_mod_sec_mapping secs_map[] = { { &mod->text, KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_X }, @@ -529,6 +530,15 @@ int __pkvm_load_el2_module(struct pkvm_el2_module *mod, struct module *this) module_put(this); return -ENOMEM; } + + /* + * The token can be used for other calls related to this module. + * Conveniently the only information needed is this addr so let's use it + * as an identifier. + */ + if (token) + *token = (unsigned long)hyp_va; + endrel = (void *)mod->relocs + mod->nr_relocs * sizeof(*endrel); kvm_apply_hyp_module_relocations(start, hyp_va, mod->relocs, endrel); From e2eb8807e6b795dfcef162f7c4a45954077401c5 Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Tue, 27 Sep 2022 17:15:49 +0100 Subject: [PATCH 249/457] ANDROID: KVM: arm64: Add support for custom hypercall registration When pKVM is in use, allow pKVM modules to register custom hypercall handlers: * pkvm_register_el2_call(): Give a handler to the hypervisor and gets in return the newly registered hypercall number. * pkvm_el2_mod_call(): Call the previously registered hypercall handler. There is a limit of 128 hypercalls that can be registered. Bug: 244543039 Bug: 244373730 Change-Id: I3d6c89675efe5f65f6b53c3b45ae155d1a00164c Signed-off-by: Vincent Donnefort Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_asm.h | 7 +++ arch/arm64/include/asm/kvm_pkvm_module.h | 25 ++++++++ arch/arm64/kvm/hyp/include/nvhe/modules.h | 13 +++++ arch/arm64/kvm/hyp/nvhe/hyp-main.c | 11 ++++ arch/arm64/kvm/hyp/nvhe/modules.c | 71 ++++++++++++++++++++++- arch/arm64/kvm/pkvm.c | 15 +++++ 6 files changed, 141 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index bf1ace7d3efc..b3b20a886892 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -89,6 +89,13 @@ enum __kvm_host_smccc_func { __KVM_HOST_SMCCC_FUNC___pkvm_iommu_register, __KVM_HOST_SMCCC_FUNC___pkvm_iommu_pm_notify, __KVM_HOST_SMCCC_FUNC___pkvm_iommu_finalize, + __KVM_HOST_SMCCC_FUNC___pkvm_register_hcall, + + /* + * Start of the dynamically registered hypercalls. Start a bit + * further, just in case some modules... + */ + __KVM_HOST_SMCCC_FUNC___dynamic_hcalls = 128, }; #define DECLARE_KVM_VHE_SYM(sym) extern char sym[] diff --git a/arch/arm64/include/asm/kvm_pkvm_module.h b/arch/arm64/include/asm/kvm_pkvm_module.h index 2a8ced4b53d5..86ba74e01492 100644 --- a/arch/arm64/include/asm/kvm_pkvm_module.h +++ b/arch/arm64/include/asm/kvm_pkvm_module.h @@ -6,6 +6,8 @@ #include #include +typedef void (*dyn_hcall_t)(struct kvm_cpu_context *); + struct pkvm_module_ops { int (*create_private_mapping)(phys_addr_t phys, size_t size, enum kvm_pgtable_prot prot, @@ -70,5 +72,28 @@ int __pkvm_load_el2_module(struct pkvm_el2_module *mod, struct module *this, \ __pkvm_load_el2_module(&mod, THIS_MODULE, token); \ }) + +int __pkvm_register_el2_call(dyn_hcall_t hfn, unsigned long token, + unsigned long hyp_text_kern_va); + +#define pkvm_register_el2_mod_call(hfn, token) \ +({ \ + extern char __kvm_nvhe___hypmod_text_start[]; \ + unsigned long hyp_text_kern_va = \ + (unsigned long)__kvm_nvhe___hypmod_text_start; \ + __pkvm_register_el2_call(function_nocfi(hfn), token, \ + hyp_text_kern_va); \ +}) + +#define pkvm_el2_mod_call(id, ...) 
\ + ({ \ + struct arm_smccc_res res; \ + \ + arm_smccc_1_1_hvc(KVM_HOST_SMCCC_ID(id), \ + ##__VA_ARGS__, &res); \ + WARN_ON(res.a0 != SMCCC_RET_SUCCESS); \ + \ + res.a1; \ + }) #endif #endif diff --git a/arch/arm64/kvm/hyp/include/nvhe/modules.h b/arch/arm64/kvm/hyp/include/nvhe/modules.h index e9e05ab58594..d4d77ea1b3f6 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/modules.h +++ b/arch/arm64/kvm/hyp/include/nvhe/modules.h @@ -1,5 +1,18 @@ +#include + +#define HCALL_HANDLED 0 +#define HCALL_UNHANDLED -1 + #ifdef CONFIG_MODULES int __pkvm_init_module(void *module_init); +int __pkvm_register_hcall(unsigned long hfn_hyp_va); +int handle_host_dynamic_hcall(struct kvm_cpu_context *host_ctxt); #else static inline int __pkvm_init_module(void *module_init); { return -EOPNOTSUPP; } +static inline int +__pkvm_register_hcall(unsigned long hfn_hyp_va) { return -EOPNOTSUPP; } +static inline int handle_host_dynamic_hcall(struct kvm_cpu_context *host_ctxt) +{ + return HCALL_UNHANDLED; +} #endif diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 5be78f35ea09..9ee73e630ec5 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -1186,6 +1186,13 @@ static void handle___pkvm_init_module(struct kvm_cpu_context *host_ctxt) cpu_reg(host_ctxt, 1) = __pkvm_init_module(ptr); } +static void handle___pkvm_register_hcall(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(unsigned long, hfn_hyp_va, host_ctxt, 1); + + cpu_reg(host_ctxt, 1) = __pkvm_register_hcall(hfn_hyp_va); +} + typedef void (*hcall_t)(struct kvm_cpu_context *); #define HANDLE_FUNC(x) [__KVM_HOST_SMCCC_FUNC_##x] = (hcall_t)handle_##x @@ -1228,6 +1235,7 @@ static const hcall_t host_hcall[] = { HANDLE_FUNC(__pkvm_map_module_page), HANDLE_FUNC(__pkvm_unmap_module_page), HANDLE_FUNC(__pkvm_init_module), + HANDLE_FUNC(__pkvm_register_hcall), }; static void handle_host_hcall(struct kvm_cpu_context *host_ctxt) @@ -1236,6 +1244,9 @@ static void handle_host_hcall(struct kvm_cpu_context *host_ctxt) unsigned long hcall_min = 0; hcall_t hfn; + if (handle_host_dynamic_hcall(host_ctxt) == HCALL_HANDLED) + return; + /* * If pKVM has been initialised then reject any calls to the * early "privileged" hypercalls. Note that we cannot reject diff --git a/arch/arm64/kvm/hyp/nvhe/modules.c b/arch/arm64/kvm/hyp/nvhe/modules.c index b3c6fbb452d9..ee2c963a7c90 100644 --- a/arch/arm64/kvm/hyp/nvhe/modules.c +++ b/arch/arm64/kvm/hyp/nvhe/modules.c @@ -9,6 +9,8 @@ #include #include #include +#include +#include static void __kvm_flush_dcache_to_poc(void *addr, size_t size) { @@ -33,6 +35,73 @@ int __pkvm_init_module(void *module_init) int ret; ret = do_module_init(&module_ops); - return ret; } + +#define MAX_DYNAMIC_HCALLS 128 + +atomic_t num_dynamic_hcalls = ATOMIC_INIT(0); +DEFINE_HYP_SPINLOCK(dyn_hcall_lock); + +static dyn_hcall_t host_dynamic_hcalls[MAX_DYNAMIC_HCALLS]; + +int handle_host_dynamic_hcall(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(unsigned long, id, host_ctxt, 0); + dyn_hcall_t hfn; + int dyn_id; + + /* + * TODO: static key to protect when no dynamic hcall is registered? + */ + + dyn_id = (int)(id - KVM_HOST_SMCCC_ID(0)) - + __KVM_HOST_SMCCC_FUNC___dynamic_hcalls; + if (dyn_id < 0) + return HCALL_UNHANDLED; + + cpu_reg(host_ctxt, 0) = SMCCC_RET_NOT_SUPPORTED; + + /* + * Order access to num_dynamic_hcalls and host_dynamic_hcalls. Paired + * with __pkvm_register_hcall(). 
+ */ + if (dyn_id >= atomic_read_acquire(&num_dynamic_hcalls)) + goto end; + + hfn = READ_ONCE(host_dynamic_hcalls[dyn_id]); + if (!hfn) + goto end; + + cpu_reg(host_ctxt, 0) = SMCCC_RET_SUCCESS; + hfn(host_ctxt); +end: + return HCALL_HANDLED; +} + +int __pkvm_register_hcall(unsigned long hvn_hyp_va) +{ + dyn_hcall_t hfn = (void *)hvn_hyp_va; + int reserved_id; + + hyp_spin_lock(&dyn_hcall_lock); + + reserved_id = atomic_read(&num_dynamic_hcalls); + + if (reserved_id >= MAX_DYNAMIC_HCALLS) { + hyp_spin_unlock(&dyn_hcall_lock); + return -ENOMEM; + } + + WRITE_ONCE(host_dynamic_hcalls[reserved_id], hfn); + + /* + * Order access to num_dynamic_hcalls and host_dynamic_hcalls. Paired + * with handle_host_dynamic_hcall. + */ + atomic_set_release(&num_dynamic_hcalls, reserved_id + 1); + + hyp_spin_unlock(&dyn_hcall_lock); + + return reserved_id + __KVM_HOST_SMCCC_FUNC___dynamic_hcalls; +}; diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c index 77af44dc0019..3587fea8e555 100644 --- a/arch/arm64/kvm/pkvm.c +++ b/arch/arm64/kvm/pkvm.c @@ -561,3 +561,18 @@ int __pkvm_load_el2_module(struct pkvm_el2_module *mod, struct module *this, return 0; } EXPORT_SYMBOL_GPL(__pkvm_load_el2_module); + +int __pkvm_register_el2_call(dyn_hcall_t hfn, unsigned long token, + unsigned long hyp_text_kern_va) +{ + unsigned long hfn_hyp_va, offset, text_hyp_va = token; + int ret; + + offset = (unsigned long)hfn - hyp_text_kern_va; + hfn_hyp_va = text_hyp_va + offset; + + ret = kvm_call_hyp_nvhe(__pkvm_register_hcall, + (unsigned long)hfn_hyp_va); + return ret; +} +EXPORT_SYMBOL_GPL(__pkvm_register_el2_call); From 9896e587b013221f4a6e8673fbfef2306fe7a233 Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Fri, 15 Jul 2022 16:50:38 +0100 Subject: [PATCH 250/457] ANDROID: KVM: arm64: Support unaligned fixmap in the nVHE hyp Return the fixmap VA with the page offset, instead of the page base address. Bug: 244543039 Bug: 229972309 Change-Id: I40c342f84e3cd395156ef846f0434c028d4e3fa3 Signed-off-by: Vincent Donnefort Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/nvhe/mm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hyp/nvhe/mm.c b/arch/arm64/kvm/hyp/nvhe/mm.c index 81e39e080ac5..d4472de5b284 100644 --- a/arch/arm64/kvm/hyp/nvhe/mm.c +++ b/arch/arm64/kvm/hyp/nvhe/mm.c @@ -266,7 +266,7 @@ void *hyp_fixmap_map(phys_addr_t phys) WRITE_ONCE(*ptep, pte); dsb(ishst); - return (void *)slot->addr; + return (void *)slot->addr + offset_in_page(phys); } static void fixmap_clear_slot(struct hyp_fixmap_slot *slot) From ec51620fe192c3e4b3935144175998d153a83b80 Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Thu, 24 Nov 2022 15:23:26 +0000 Subject: [PATCH 251/457] ANDROID: KVM: arm64: Block module loading based on cmdline or HVC Disable module loading, based on the cmdline option kvm-arm.protected_modules. If enabled, the feature can later be irreversibly disabled with the HVC __pkvm_close_module_registration(). 
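With this change, pKVM modules can only be loaded when the kernel was booted with kvm-arm.protected_modules on the command line, and EL1 can later blow the fuse along these lines (kvm_call_hyp_nvhe() is assumed here for illustration; the actual caller is not part of this patch):

  /* Irreversibly close pKVM module registration until the next reset. */
  ret = kvm_call_hyp_nvhe(__pkvm_close_module_registration);
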
Bug: 244543039 Bug: 244373730 Change-Id: I693a1db350ae15bcaab59103a68b60f087dcc6b4 Signed-off-by: Vincent Donnefort Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_asm.h | 9 ++-- arch/arm64/include/asm/kvm_hyp.h | 2 + arch/arm64/kvm/hyp/include/nvhe/modules.h | 8 +++ arch/arm64/kvm/hyp/nvhe/hyp-main.c | 7 +++ arch/arm64/kvm/hyp/nvhe/mm.c | 34 +++++++++---- arch/arm64/kvm/hyp/nvhe/modules.c | 62 +++++++++++++++++++++-- arch/arm64/kvm/pkvm.c | 9 ++++ 7 files changed, 114 insertions(+), 17 deletions(-) diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index b3b20a886892..75de6e58cffa 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -63,10 +63,6 @@ enum __kvm_host_smccc_func { __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid_ipa, __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid, __KVM_HOST_SMCCC_FUNC___kvm_flush_cpu_context, - __KVM_HOST_SMCCC_FUNC___pkvm_alloc_module_va, - __KVM_HOST_SMCCC_FUNC___pkvm_map_module_page, - __KVM_HOST_SMCCC_FUNC___pkvm_unmap_module_page, - __KVM_HOST_SMCCC_FUNC___pkvm_init_module, __KVM_HOST_SMCCC_FUNC___pkvm_prot_finalize, /* Hypercalls available after pKVM finalisation */ @@ -90,6 +86,11 @@ enum __kvm_host_smccc_func { __KVM_HOST_SMCCC_FUNC___pkvm_iommu_pm_notify, __KVM_HOST_SMCCC_FUNC___pkvm_iommu_finalize, __KVM_HOST_SMCCC_FUNC___pkvm_register_hcall, + __KVM_HOST_SMCCC_FUNC___pkvm_alloc_module_va, + __KVM_HOST_SMCCC_FUNC___pkvm_map_module_page, + __KVM_HOST_SMCCC_FUNC___pkvm_unmap_module_page, + __KVM_HOST_SMCCC_FUNC___pkvm_init_module, + __KVM_HOST_SMCCC_FUNC___pkvm_close_module_registration, /* * Start of the dynamically registered hypercalls. Start a bit diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index d450ed354d69..d2a89623da91 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -131,4 +131,6 @@ extern unsigned long kvm_nvhe_sym(__icache_flags); extern unsigned int kvm_nvhe_sym(kvm_arm_vmid_bits); extern bool kvm_nvhe_sym(smccc_trng_available); +extern bool kvm_nvhe_sym(__pkvm_modules_enabled); + #endif /* __ARM64_KVM_HYP_H__ */ diff --git a/arch/arm64/kvm/hyp/include/nvhe/modules.h b/arch/arm64/kvm/hyp/include/nvhe/modules.h index d4d77ea1b3f6..2453301352bb 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/modules.h +++ b/arch/arm64/kvm/hyp/include/nvhe/modules.h @@ -7,6 +7,10 @@ int __pkvm_init_module(void *module_init); int __pkvm_register_hcall(unsigned long hfn_hyp_va); int handle_host_dynamic_hcall(struct kvm_cpu_context *host_ctxt); +void pkvm_modules_lock(void); +void pkvm_modules_unlock(void); +bool pkvm_modules_enabled(void); +int __pkvm_close_module_registration(void); #else static inline int __pkvm_init_module(void *module_init); { return -EOPNOTSUPP; } static inline int @@ -15,4 +19,8 @@ static inline int handle_host_dynamic_hcall(struct kvm_cpu_context *host_ctxt) { return HCALL_UNHANDLED; } +static inline void pkvm_modules_lock(void) { } +static inline void pkvm_modules_unlock(void) { } +static inline bool pkvm_modules_enabled(void) { return false; } +static inline int __pkvm_close_module_registration(void) { return -EOPNOTSUPP; } #endif diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 9ee73e630ec5..f15232433454 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -1193,6 +1193,12 @@ static void handle___pkvm_register_hcall(struct kvm_cpu_context *host_ctxt) cpu_reg(host_ctxt, 1) = __pkvm_register_hcall(hfn_hyp_va); } 
+static void +handle___pkvm_close_module_registration(struct kvm_cpu_context *host_ctxt) +{ + cpu_reg(host_ctxt, 1) = __pkvm_close_module_registration(); +} + typedef void (*hcall_t)(struct kvm_cpu_context *); #define HANDLE_FUNC(x) [__KVM_HOST_SMCCC_FUNC_##x] = (hcall_t)handle_##x @@ -1236,6 +1242,7 @@ static const hcall_t host_hcall[] = { HANDLE_FUNC(__pkvm_unmap_module_page), HANDLE_FUNC(__pkvm_init_module), HANDLE_FUNC(__pkvm_register_hcall), + HANDLE_FUNC(__pkvm_close_module_registration), }; static void handle_host_hcall(struct kvm_cpu_context *host_ctxt) diff --git a/arch/arm64/kvm/hyp/nvhe/mm.c b/arch/arm64/kvm/hyp/nvhe/mm.c index d4472de5b284..6ed9bf0819bc 100644 --- a/arch/arm64/kvm/hyp/nvhe/mm.c +++ b/arch/arm64/kvm/hyp/nvhe/mm.c @@ -16,6 +16,7 @@ #include #include #include +#include #include struct kvm_pgtable pkvm_pgtable; @@ -102,29 +103,44 @@ int __pkvm_create_private_mapping(phys_addr_t phys, size_t size, void *__pkvm_alloc_module_va(u64 nr_pages) { - unsigned long addr; - int ret; + unsigned long addr = 0; - ret = pkvm_alloc_private_va_range(nr_pages << PAGE_SHIFT, &addr); + pkvm_modules_lock(); + if (pkvm_modules_enabled()) + pkvm_alloc_private_va_range(nr_pages << PAGE_SHIFT, &addr); + pkvm_modules_unlock(); - return ret ? NULL : (void *)addr; + return (void *)addr; } int __pkvm_map_module_page(u64 pfn, void *va, enum kvm_pgtable_prot prot) { - int ret; + int ret = -EACCES; + + pkvm_modules_lock(); + + if (!pkvm_modules_enabled()) + goto err; ret = __pkvm_host_donate_hyp(pfn, 1); if (ret) - return ret; + goto err; - return __pkvm_create_mappings((unsigned long)va, PAGE_SIZE, hyp_pfn_to_phys(pfn), prot); + ret = __pkvm_create_mappings((unsigned long)va, PAGE_SIZE, hyp_pfn_to_phys(pfn), prot); +err: + pkvm_modules_unlock(); + + return ret; } void __pkvm_unmap_module_page(u64 pfn, void *va) { - WARN_ON(__pkvm_hyp_donate_host(pfn, 1)); - pkvm_remove_mappings(va, va + PAGE_SIZE); + pkvm_modules_lock(); + if (pkvm_modules_enabled()) { + WARN_ON(__pkvm_hyp_donate_host(pfn, 1)); + pkvm_remove_mappings(va, va + PAGE_SIZE); + } + pkvm_modules_unlock(); } int pkvm_create_mappings_locked(void *from, void *to, enum kvm_pgtable_prot prot) diff --git a/arch/arm64/kvm/hyp/nvhe/modules.c b/arch/arm64/kvm/hyp/nvhe/modules.c index ee2c963a7c90..b71ce3880f3b 100644 --- a/arch/arm64/kvm/hyp/nvhe/modules.c +++ b/arch/arm64/kvm/hyp/nvhe/modules.c @@ -17,6 +17,42 @@ static void __kvm_flush_dcache_to_poc(void *addr, size_t size) kvm_flush_dcache_to_poc((unsigned long)addr, (unsigned long)size); } +DEFINE_HYP_SPINLOCK(modules_lock); + +bool __pkvm_modules_enabled __ro_after_init; + +void pkvm_modules_lock(void) +{ + hyp_spin_lock(&modules_lock); +} + +void pkvm_modules_unlock(void) +{ + hyp_spin_unlock(&modules_lock); +} + +bool pkvm_modules_enabled(void) +{ + return __pkvm_modules_enabled; +} + +int __pkvm_close_module_registration(void) +{ + int ret; + + pkvm_modules_lock(); + ret = __pkvm_modules_enabled ? 0 : -EACCES; + if (!ret) { + void *addr = hyp_fixmap_map(__hyp_pa(&__pkvm_modules_enabled)); + *(bool *)addr = false; + hyp_fixmap_unmap(); + } + pkvm_modules_unlock(); + + /* The fuse is blown! 
No way back until reset */ + return ret; +} + const struct pkvm_module_ops module_ops = { .create_private_mapping = __pkvm_create_private_mapping, .register_serial_driver = __pkvm_register_serial_driver, @@ -34,7 +70,15 @@ int __pkvm_init_module(void *module_init) int (*do_module_init)(const struct pkvm_module_ops *ops) = module_init; int ret; + pkvm_modules_lock(); + if (!pkvm_modules_enabled()) { + ret = -EACCES; + goto err; + } ret = do_module_init(&module_ops); +err: + pkvm_modules_unlock(); + return ret; } @@ -82,15 +126,21 @@ end: int __pkvm_register_hcall(unsigned long hvn_hyp_va) { dyn_hcall_t hfn = (void *)hvn_hyp_va; - int reserved_id; + int reserved_id, ret; + + pkvm_modules_lock(); + if (!pkvm_modules_enabled()) { + ret = -EACCES; + goto err; + } hyp_spin_lock(&dyn_hcall_lock); reserved_id = atomic_read(&num_dynamic_hcalls); if (reserved_id >= MAX_DYNAMIC_HCALLS) { - hyp_spin_unlock(&dyn_hcall_lock); - return -ENOMEM; + ret = -ENOMEM; + goto err_hcall_unlock; } WRITE_ONCE(host_dynamic_hcalls[reserved_id], hfn); @@ -101,7 +151,11 @@ int __pkvm_register_hcall(unsigned long hvn_hyp_va) */ atomic_set_release(&num_dynamic_hcalls, reserved_id + 1); + ret = reserved_id + __KVM_HOST_SMCCC_FUNC___dynamic_hcalls; +err_hcall_unlock: hyp_spin_unlock(&dyn_hcall_lock); +err: + pkvm_modules_unlock(); - return reserved_id + __KVM_HOST_SMCCC_FUNC___dynamic_hcalls; + return ret; }; diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c index 3587fea8e555..8800c4865651 100644 --- a/arch/arm64/kvm/pkvm.c +++ b/arch/arm64/kvm/pkvm.c @@ -13,6 +13,7 @@ #include #include +#include #include #include #include @@ -415,6 +416,14 @@ int pkvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) return 0; } +static int __init early_pkvm_enable_modules(char *arg) +{ + kvm_nvhe_sym(__pkvm_modules_enabled) = true; + + return 0; +} +early_param("kvm-arm.protected_modules", early_pkvm_enable_modules); + struct pkvm_mod_sec_mapping { struct pkvm_module_section *sec; enum kvm_pgtable_prot prot; From 4287da7498176a833d06e9f6b35d761f969ffc62 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Wed, 7 Dec 2022 17:43:13 +0000 Subject: [PATCH 252/457] ANDROID: KVM: arm64: Fix build with CONFIG_MODULES=n The recently introduced support for pKVM modules has clearly lacked a bit of testing with CONFIG_MODULES=n. Fix the broken __pkvm_init_module() stub to make the bots happy. 
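The breakage was a stray semicolon between the stub's parameter list and its body, which turns the intended inline stub into a prototype followed by an invalid file-scope block. A hedged, self-contained sketch of the corrected pattern (generic names, not the pKVM header):

#include <errno.h>
#include <stdio.h>

#ifdef CONFIG_FEATURE
int feature_init(void *arg);            /* real implementation elsewhere */
#else
/*
 * Correct stub: no semicolon before the body. Writing
 * "static inline int feature_init(void *arg); { return -EOPNOTSUPP; }"
 * declares a prototype and then opens a stray brace block, which does
 * not compile.
 */
static inline int feature_init(void *arg)
{
        (void)arg;
        return -EOPNOTSUPP;
}
#endif

int main(void)
{
        printf("feature_init() -> %d\n", feature_init(NULL));
        return 0;
}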
Bug: 244543039 Bug: 244373730 Reported-by: kernel test robot Change-Id: Ie59cbf46442721de78ef51523debadc4c156530e Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/include/nvhe/modules.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hyp/include/nvhe/modules.h b/arch/arm64/kvm/hyp/include/nvhe/modules.h index 2453301352bb..d69aa744b82f 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/modules.h +++ b/arch/arm64/kvm/hyp/include/nvhe/modules.h @@ -12,7 +12,7 @@ void pkvm_modules_unlock(void); bool pkvm_modules_enabled(void); int __pkvm_close_module_registration(void); #else -static inline int __pkvm_init_module(void *module_init); { return -EOPNOTSUPP; } +static inline int __pkvm_init_module(void *module_init) { return -EOPNOTSUPP; } static inline int __pkvm_register_hcall(unsigned long hfn_hyp_va) { return -EOPNOTSUPP; } static inline int handle_host_dynamic_hcall(struct kvm_cpu_context *host_ctxt) From ed83f265a5f3fc466080cac01b815ed974414c15 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Wed, 7 Dec 2022 17:50:33 +0000 Subject: [PATCH 253/457] ANDROID: KVM: arm64: Fix link with CONFIG_MODULES=n The recently introduced support for pKVM modules has clearly lacked a bit of testing with CONFIG_MODULES=n. Make sure to not access __pkvm_modules_enabled when CONFIG_MODULES=n to fix the link. Bug: 244543039 Bug: 244373730 Reported-by: kernel test robot Change-Id: I65c6ad7c07c2812b85dcd435b24b3444c0ce7f1e Signed-off-by: Quentin Perret --- arch/arm64/kvm/pkvm.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c index 8800c4865651..f53776f04a95 100644 --- a/arch/arm64/kvm/pkvm.c +++ b/arch/arm64/kvm/pkvm.c @@ -416,6 +416,7 @@ int pkvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) return 0; } +#ifdef CONFIG_MODULES static int __init early_pkvm_enable_modules(char *arg) { kvm_nvhe_sym(__pkvm_modules_enabled) = true; @@ -423,6 +424,7 @@ static int __init early_pkvm_enable_modules(char *arg) return 0; } early_param("kvm-arm.protected_modules", early_pkvm_enable_modules); +#endif struct pkvm_mod_sec_mapping { struct pkvm_module_section *sec; From 901f361acc109d38dffc0171e01c231b2dddd4ec Mon Sep 17 00:00:00 2001 From: Sebastian Ene Date: Wed, 10 Aug 2022 13:14:17 +0000 Subject: [PATCH 254/457] ANDROID: KVM: arm64: Pass the pagetable struct as an argument to the freewalker Extend the scope of the stage2_freewalker by passing the pgt instead of the mm_ops callbacks. This will later be used by the stage2_pte_is_counted function. 
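Handing the callback the whole table context rather than one of its members is the usual way to let a walker grow new dependencies without another signature change. A small generic C sketch of the pattern (hypothetical names; not the KVM walker API):

#include <stdio.h>

struct mm_ops {
        void (*put_page)(void *p);
};

/* The walker argument is now the whole table, not just mm_ops. */
struct pgtable {
        struct mm_ops *mm_ops;
        int owner_id;           /* extra state a future callback can use */
};

static void put_page(void *p)
{
        printf("put %p\n", p);
}

static int free_walker(void *entry, void *arg)
{
        struct pgtable *pgt = arg;

        /* Derive mm_ops from the table; other fields stay reachable too. */
        pgt->mm_ops->put_page(entry);
        return 0;
}

int main(void)
{
        struct mm_ops ops = { .put_page = put_page };
        struct pgtable pgt = { .mm_ops = &ops, .owner_id = 0 };
        int entry;

        return free_walker(&entry, &pgt);
}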
Bug: 222044487 Signed-off-by: Sebastian Ene Change-Id: I390661eb106cbdb863cbb1832e39ec155c439091 Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/pgtable.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 5992b7a7cb1f..ead0c5e5c6ba 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -1201,7 +1201,8 @@ static int stage2_free_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, enum kvm_pgtable_walk_flags flag, void * const arg) { - struct kvm_pgtable_mm_ops *mm_ops = arg; + struct kvm_pgtable *pgt = arg; + struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops; kvm_pte_t pte = *ptep; if (!stage2_pte_is_counted(pte)) @@ -1222,7 +1223,7 @@ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt) .cb = stage2_free_walker, .flags = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST, - .arg = pgt->mm_ops, + .arg = pgt, }; WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker)); From ca5e2707ba4cc26f61680231f4501eb5c55f76f5 Mon Sep 17 00:00:00 2001 From: Sebastian Ene Date: Wed, 10 Aug 2022 13:38:34 +0000 Subject: [PATCH 255/457] ANDROID: KVM: arm64: Split stage2_put_pte function Refactor the code and add stage2_clear_pte(..) which removes the PTE without dropping the refcount for an entry. Bug: 222044487 Signed-off-by: Sebastian Ene Change-Id: Ia2cb47f2ffad6faa5c6b4ec8a37bcbe61be0bc2f Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/pgtable.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index ead0c5e5c6ba..6dddfc20c7dc 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -662,6 +662,16 @@ static bool stage2_pte_is_counted(kvm_pte_t pte) return !!pte; } +static void stage2_clear_pte(kvm_pte_t *ptep, struct kvm_s2_mmu *mmu, u64 addr, + u32 level) +{ + if (!kvm_pte_valid(*ptep)) + return; + + kvm_clear_pte(ptep); + kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, addr, level); +} + static void stage2_put_pte(kvm_pte_t *ptep, struct kvm_s2_mmu *mmu, u64 addr, u32 level, struct kvm_pgtable_mm_ops *mm_ops) { @@ -669,11 +679,7 @@ static void stage2_put_pte(kvm_pte_t *ptep, struct kvm_s2_mmu *mmu, u64 addr, * Clear the existing PTE, and perform break-before-make with * TLB maintenance if it was valid. */ - if (kvm_pte_valid(*ptep)) { - kvm_clear_pte(ptep); - kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, addr, level); - } - + stage2_clear_pte(ptep, mmu, addr, level); mm_ops->put_page(ptep); } From c4714ab75ddab2513e89c519847ad9d060f6aae7 Mon Sep 17 00:00:00 2001 From: Sebastian Ene Date: Mon, 14 Nov 2022 18:23:05 +0000 Subject: [PATCH 256/457] ANDROID: KVM: arm64: Move PTE attributes definitions to the common header Make PTE attribute definitions available from kvm_pgtable.h and take them out of the pagetable code. These attributes will be used later in mem_protect.c to construct different masks during the PTE manipulation callbacks. 
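With the field definitions exported by the header, other translation units can assemble attribute masks directly from them. A self-contained sketch in the same GENMASK/BIT style (the helpers are re-defined locally here purely for illustration; the kernel gets them from its own headers):

#include <stdint.h>
#include <stdio.h>

/* Local stand-ins for the kernel's BIT()/GENMASK() helpers. */
#define BIT(n)          (1ULL << (n))
#define GENMASK(h, l)   (((~0ULL) << (l)) & (~0ULL >> (63 - (h))))

/* Same shape as the stage-2 low-attribute fields listed above. */
#define PTE_ATTR_LO_S2_MEMATTR  GENMASK(5, 2)
#define PTE_ATTR_LO_S2_S2AP_R   BIT(6)
#define PTE_ATTR_LO_S2_S2AP_W   BIT(7)
#define PTE_ATTR_LO_S2_SH       GENMASK(9, 8)
#define PTE_ATTR_LO_S2_AF       BIT(10)

/* A "default memory PTE" mask composed from the individual fields. */
#define DEFAULT_MEM_PTE (PTE_ATTR_LO_S2_MEMATTR | PTE_ATTR_LO_S2_S2AP_R | \
                         PTE_ATTR_LO_S2_S2AP_W | PTE_ATTR_LO_S2_SH | \
                         PTE_ATTR_LO_S2_AF)

int main(void)
{
        printf("default mem PTE attribute mask: 0x%llx\n",
               (unsigned long long)DEFAULT_MEM_PTE);
        return 0;
}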
Bug: 222044487 Signed-off-by: Sebastian Ene Change-Id: I2f7108815ef0fa536e7f3314762a412119400fe9 Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_pgtable.h | 25 +++++++++++++++++++++++++ arch/arm64/kvm/hyp/pgtable.c | 25 ------------------------- 2 files changed, 25 insertions(+), 25 deletions(-) diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index 02546b1082f4..8419fb3e7bbe 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -44,6 +44,31 @@ typedef u64 kvm_pte_t; #define KVM_PHYS_INVALID (-1ULL) +#define KVM_PTE_LEAF_ATTR_LO GENMASK(11, 2) + +#define KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX GENMASK(4, 2) +#define KVM_PTE_LEAF_ATTR_LO_S1_AP GENMASK(7, 6) +#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RO 3 +#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RW 1 +#define KVM_PTE_LEAF_ATTR_LO_S1_SH GENMASK(9, 8) +#define KVM_PTE_LEAF_ATTR_LO_S1_SH_IS 3 +#define KVM_PTE_LEAF_ATTR_LO_S1_AF BIT(10) + +#define KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR GENMASK(5, 2) +#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R BIT(6) +#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W BIT(7) +#define KVM_PTE_LEAF_ATTR_LO_S2_SH GENMASK(9, 8) +#define KVM_PTE_LEAF_ATTR_LO_S2_SH_IS 3 +#define KVM_PTE_LEAF_ATTR_LO_S2_AF BIT(10) + +#define KVM_PTE_LEAF_ATTR_HI GENMASK(63, 51) + +#define KVM_PTE_LEAF_ATTR_HI_SW GENMASK(58, 55) + +#define KVM_PTE_LEAF_ATTR_HI_S1_XN BIT(54) + +#define KVM_PTE_LEAF_ATTR_HI_S2_XN BIT(54) + static inline bool kvm_pte_valid(kvm_pte_t pte) { return pte & KVM_PTE_VALID; diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 6dddfc20c7dc..bf0c1c0ee7ad 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -17,31 +17,6 @@ #define KVM_PTE_TYPE_PAGE 1 #define KVM_PTE_TYPE_TABLE 1 -#define KVM_PTE_LEAF_ATTR_LO GENMASK(11, 2) - -#define KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX GENMASK(4, 2) -#define KVM_PTE_LEAF_ATTR_LO_S1_AP GENMASK(7, 6) -#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RO 3 -#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RW 1 -#define KVM_PTE_LEAF_ATTR_LO_S1_SH GENMASK(9, 8) -#define KVM_PTE_LEAF_ATTR_LO_S1_SH_IS 3 -#define KVM_PTE_LEAF_ATTR_LO_S1_AF BIT(10) - -#define KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR GENMASK(5, 2) -#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R BIT(6) -#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W BIT(7) -#define KVM_PTE_LEAF_ATTR_LO_S2_SH GENMASK(9, 8) -#define KVM_PTE_LEAF_ATTR_LO_S2_SH_IS 3 -#define KVM_PTE_LEAF_ATTR_LO_S2_AF BIT(10) - -#define KVM_PTE_LEAF_ATTR_HI GENMASK(63, 51) - -#define KVM_PTE_LEAF_ATTR_HI_SW GENMASK(58, 55) - -#define KVM_PTE_LEAF_ATTR_HI_S1_XN BIT(54) - -#define KVM_PTE_LEAF_ATTR_HI_S2_XN BIT(54) - #define KVM_PTE_LEAF_ATTR_S2_PERMS (KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | \ KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \ KVM_PTE_LEAF_ATTR_HI_S2_XN) From d096d35445d714c30ef6fdd395d597ab4b8b5828 Mon Sep 17 00:00:00 2001 From: Sebastian Ene Date: Mon, 14 Nov 2022 17:02:17 +0000 Subject: [PATCH 257/457] ANDROID: KVM: arm64: Have different callbacks for PTE manipulation Move the host specific code for PTE reference counting out of the pagetable code and define a new structure that wraps all the PTE manipulation callbacks. This structure will be passed during the pagetable code initialization and it allows to register different callback for [guest|host]. 
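The shape of the change is an ops structure of function pointers that each page-table user fills with its own policy. A hedged standalone C sketch (toy policies and generic names, not the kvm_pgtable API):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef uint64_t pte_t;

struct pte_ops {
        bool (*force_pte)(uint64_t addr, uint64_t end);
        bool (*pte_is_counted)(pte_t pte, unsigned int level);
};

/* Toy "host" policy: allow blocks, refcount any non-zero PTE. */
static bool host_force_pte(uint64_t addr, uint64_t end) { return false; }
static bool pte_is_counted_nonzero(pte_t pte, unsigned int level) { return pte != 0; }

/* Toy "guest" policy: always force page-granularity mappings. */
static bool guest_force_pte(uint64_t addr, uint64_t end) { return true; }

static const struct pte_ops host_ops = {
        .force_pte = host_force_pte,
        .pte_is_counted = pte_is_counted_nonzero,
};

static const struct pte_ops guest_ops = {
        .force_pte = guest_force_pte,
        .pte_is_counted = pte_is_counted_nonzero,
};

/* The mapping path consults whichever policy was registered. */
static void map_range(const struct pte_ops *ops, uint64_t addr, uint64_t end)
{
        printf("force_pte=%d\n", ops->force_pte(addr, end));
}

int main(void)
{
        map_range(&host_ops, 0, 1ULL << 21);
        map_range(&guest_ops, 0, 1ULL << 21);
        return 0;
}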
Bug: 222044487 Signed-off-by: Sebastian Ene Change-Id: I116e8322935762df2f2be6e8d51a3f0c140b3d36 Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_pgtable.h | 30 +++++++++++++++------ arch/arm64/kvm/hyp/nvhe/mem_protect.c | 38 +++++++++++++++++++++++---- arch/arm64/kvm/hyp/pgtable.c | 38 +++++++++++++-------------- arch/arm64/kvm/mmu.c | 20 +++++++++++++- 4 files changed, 92 insertions(+), 34 deletions(-) diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index 8419fb3e7bbe..26c15fc77711 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -200,6 +200,22 @@ enum kvm_pgtable_prot { typedef bool (*kvm_pgtable_force_pte_cb_t)(u64 addr, u64 end, enum kvm_pgtable_prot prot); +typedef bool (*kvm_pgtable_pte_is_counted_cb_t)(kvm_pte_t pte, u32 level); + +/** + * struct kvm_pgtable_pte_ops - PTE callbacks. + * @force_pte_cb: Force the mapping granularity to pages and + * return true if we support this instead of + * block mappings. + * @pte_is_counted_cb Verify the attributes of the @pte argument + * and return true if the descriptor needs to be + * refcounted, otherwise return false. + */ +struct kvm_pgtable_pte_ops { + kvm_pgtable_force_pte_cb_t force_pte_cb; + kvm_pgtable_pte_is_counted_cb_t pte_is_counted_cb; +}; + /** * struct kvm_pgtable - KVM page-table. * @ia_bits: Maximum input address size, in bits. @@ -208,8 +224,7 @@ typedef bool (*kvm_pgtable_force_pte_cb_t)(u64 addr, u64 end, * @mm_ops: Memory management callbacks. * @mmu: Stage-2 KVM MMU struct. Unused for stage-1 page-tables. * @flags: Stage-2 page-table flags. - * @force_pte_cb: Function that returns true if page level mappings must - * be used instead of block mappings. + * @pte_ops: PTE callbacks. */ struct kvm_pgtable { u32 ia_bits; @@ -220,7 +235,7 @@ struct kvm_pgtable { /* Stage-2 only */ struct kvm_s2_mmu *mmu; enum kvm_pgtable_stage2_flags flags; - kvm_pgtable_force_pte_cb_t force_pte_cb; + struct kvm_pgtable_pte_ops *pte_ops; }; /** @@ -349,18 +364,17 @@ size_t kvm_pgtable_stage2_pgd_size(u64 vtcr); * @mmu: S2 MMU context for this S2 translation * @mm_ops: Memory management callbacks. * @flags: Stage-2 configuration flags. - * @force_pte_cb: Function that returns true if page level mappings must - * be used instead of block mappings. + * @pte_ops: PTE callbacks. * * Return: 0 on success, negative error code on failure. */ int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu, struct kvm_pgtable_mm_ops *mm_ops, enum kvm_pgtable_stage2_flags flags, - kvm_pgtable_force_pte_cb_t force_pte_cb); + struct kvm_pgtable_pte_ops *pte_ops); -#define kvm_pgtable_stage2_init(pgt, mmu, mm_ops) \ - __kvm_pgtable_stage2_init(pgt, mmu, mm_ops, 0, NULL) +#define kvm_pgtable_stage2_init(pgt, mmu, mm_ops, pte_ops) \ + __kvm_pgtable_stage2_init(pgt, mmu, mm_ops, 0, pte_ops) /** * kvm_pgtable_stage2_destroy() - Destroy an unused guest stage-2 page-table. 
diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index bc9c7edf9df3..c201da97edad 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -30,6 +30,18 @@ static struct hyp_pool host_s2_pool; static DEFINE_PER_CPU(struct pkvm_hyp_vm *, __current_vm); #define current_vm (*this_cpu_ptr(&__current_vm)) +static struct kvm_pgtable_pte_ops host_s2_pte_ops; +static bool host_stage2_force_pte(u64 addr, u64 end, enum kvm_pgtable_prot prot); +static bool host_stage2_pte_is_counted(kvm_pte_t pte, u32 level); +static bool guest_stage2_force_pte_cb(u64 addr, u64 end, + enum kvm_pgtable_prot prot); +static bool guest_stage2_pte_is_counted(kvm_pte_t pte, u32 level); + +static struct kvm_pgtable_pte_ops guest_s2_pte_ops = { + .force_pte_cb = guest_stage2_force_pte_cb, + .pte_is_counted_cb = guest_stage2_pte_is_counted +}; + static void guest_lock_component(struct pkvm_hyp_vm *vm) { hyp_spin_lock(&vm->lock); @@ -129,8 +141,6 @@ static void prepare_host_vtcr(void) id_aa64mmfr1_el1_sys_val, phys_shift); } -static bool host_stage2_force_pte_cb(u64 addr, u64 end, enum kvm_pgtable_prot prot); - int kvm_host_prepare_stage2(void *pgt_pool_base) { struct kvm_s2_mmu *mmu = &host_mmu.arch.mmu; @@ -144,9 +154,12 @@ int kvm_host_prepare_stage2(void *pgt_pool_base) if (ret) return ret; + host_s2_pte_ops.force_pte_cb = host_stage2_force_pte; + host_s2_pte_ops.pte_is_counted_cb = host_stage2_pte_is_counted; + ret = __kvm_pgtable_stage2_init(&host_mmu.pgt, mmu, &host_mmu.mm_ops, KVM_HOST_S2_FLAGS, - host_stage2_force_pte_cb); + &host_s2_pte_ops); if (ret) return ret; @@ -163,6 +176,11 @@ static bool guest_stage2_force_pte_cb(u64 addr, u64 end, return true; } +static bool guest_stage2_pte_is_counted(kvm_pte_t pte, u32 level) +{ + return host_stage2_pte_is_counted(pte, level); +} + static void *guest_s2_zalloc_pages_exact(size_t size) { void *addr = hyp_alloc_pages(¤t_vm->pool, get_order(size)); @@ -252,7 +270,7 @@ int kvm_guest_prepare_stage2(struct pkvm_hyp_vm *vm, void *pgd) guest_lock_component(vm); ret = __kvm_pgtable_stage2_init(mmu->pgt, mmu, &vm->mm_ops, 0, - guest_stage2_force_pte_cb); + &guest_s2_pte_ops); guest_unlock_component(vm); if (ret) return ret; @@ -618,7 +636,7 @@ int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, enum pkvm_component return 0; } -static bool host_stage2_force_pte_cb(u64 addr, u64 end, enum kvm_pgtable_prot prot) +static bool host_stage2_force_pte(u64 addr, u64 end, enum kvm_pgtable_prot prot) { /* * Block mappings must be used with care in the host stage-2 as a @@ -640,6 +658,16 @@ static bool host_stage2_force_pte_cb(u64 addr, u64 end, enum kvm_pgtable_prot pr return prot != PKVM_HOST_MMIO_PROT; } +static bool host_stage2_pte_is_counted(kvm_pte_t pte, u32 level) +{ + /* + * The refcount tracks valid entries as well as invalid entries if they + * encode ownership of a page to another entity than the page-table + * owner, whose id is 0. 
+ */ + return !!pte; +} + static int host_stage2_idmap(u64 addr) { struct kvm_mem_range range; diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index bf0c1c0ee7ad..5668f06f9a5c 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -482,7 +482,7 @@ int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits, pgt->start_level = KVM_PGTABLE_MAX_LEVELS - levels; pgt->mm_ops = mm_ops; pgt->mmu = NULL; - pgt->force_pte_cb = NULL; + pgt->pte_ops = NULL; return 0; } @@ -627,16 +627,6 @@ static bool stage2_pte_needs_update(kvm_pte_t old, kvm_pte_t new) return ((old ^ new) & (~KVM_PTE_LEAF_ATTR_S2_PERMS)); } -static bool stage2_pte_is_counted(kvm_pte_t pte) -{ - /* - * The refcount tracks valid entries as well as invalid entries if they - * encode ownership of a page to another entity than the page-table - * owner, whose id is 0. - */ - return !!pte; -} - static void stage2_clear_pte(kvm_pte_t *ptep, struct kvm_s2_mmu *mmu, u64 addr, u32 level) { @@ -685,6 +675,7 @@ static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level, kvm_pte_t new, old = *ptep; u64 granule = kvm_granule_size(level), phys = data->phys; struct kvm_pgtable *pgt = data->mmu->pgt; + struct kvm_pgtable_pte_ops *pte_ops = pgt->pte_ops; struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops; if (!stage2_leaf_mapping_allowed(addr, end, level, data)) @@ -695,7 +686,7 @@ static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level, else new = data->annotation; - if (stage2_pte_is_counted(old)) { + if (pte_ops->pte_is_counted_cb(old, level)) { /* * Skip updating the PTE if we are trying to recreate the exact * same mapping or only change the access permissions. Instead, @@ -722,7 +713,7 @@ static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level, if (mm_ops->icache_inval_pou && stage2_pte_executable(new)) mm_ops->icache_inval_pou(kvm_pte_follow(new, mm_ops), granule); - if (stage2_pte_is_counted(new)) + if (pte_ops->pte_is_counted_cb(new, level)) mm_ops->get_page(ptep); out_set_pte: @@ -759,11 +750,13 @@ static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, struct stage2_map_data *data) { struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops; + struct kvm_pgtable *pgt = data->mmu->pgt; + struct kvm_pgtable_pte_ops *pte_ops = pgt->pte_ops; kvm_pte_t *childp, pte = *ptep; int ret; if (data->anchor) { - if (stage2_pte_is_counted(pte)) + if (pte_ops->pte_is_counted_cb(pte, level)) mm_ops->put_page(ptep); return 0; @@ -788,7 +781,7 @@ static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, * a table. Accesses beyond 'end' that fall within the new table * will be mapped lazily. 
*/ - if (stage2_pte_is_counted(pte)) + if (pte_ops->pte_is_counted_cb(pte, level)) stage2_put_pte(ptep, data->mmu, addr, level, mm_ops); kvm_set_table_pte(ptep, childp, mm_ops); @@ -864,12 +857,12 @@ int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size, void *mc) { int ret; + struct kvm_pgtable_pte_ops *pte_ops = pgt->pte_ops; struct stage2_map_data map_data = { .phys = ALIGN_DOWN(phys, PAGE_SIZE), .mmu = pgt->mmu, .memcache = mc, .mm_ops = pgt->mm_ops, - .force_pte = pgt->force_pte_cb && pgt->force_pte_cb(addr, addr + size, prot), }; struct kvm_pgtable_walker walker = { .cb = stage2_map_walker, @@ -879,6 +872,9 @@ int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size, .arg = &map_data, }; + if (pte_ops->force_pte_cb) + map_data.force_pte = pte_ops->force_pte_cb(addr, addr + size, prot); + if (WARN_ON((pgt->flags & KVM_PGTABLE_S2_IDMAP) && (addr != phys))) return -EINVAL; @@ -925,11 +921,12 @@ static int stage2_unmap_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, struct kvm_pgtable *pgt = arg; struct kvm_s2_mmu *mmu = pgt->mmu; struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops; + struct kvm_pgtable_pte_ops *pte_ops = pgt->pte_ops; kvm_pte_t pte = *ptep, *childp = NULL; bool need_flush = false; if (!kvm_pte_valid(pte)) { - if (stage2_pte_is_counted(pte)) { + if (pte_ops->pte_is_counted_cb(pte, level)) { kvm_clear_pte(ptep); mm_ops->put_page(ptep); } @@ -1144,7 +1141,7 @@ int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size) int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu, struct kvm_pgtable_mm_ops *mm_ops, enum kvm_pgtable_stage2_flags flags, - kvm_pgtable_force_pte_cb_t force_pte_cb) + struct kvm_pgtable_pte_ops *pte_ops) { size_t pgd_sz; u64 vtcr = mmu->arch->vtcr; @@ -1162,7 +1159,7 @@ int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu, pgt->mm_ops = mm_ops; pgt->mmu = mmu; pgt->flags = flags; - pgt->force_pte_cb = force_pte_cb; + pgt->pte_ops = pte_ops; /* Ensure zeroed PGD pages are visible to the hardware walker */ dsb(ishst); @@ -1184,9 +1181,10 @@ static int stage2_free_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, { struct kvm_pgtable *pgt = arg; struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops; + struct kvm_pgtable_pte_ops *pte_ops = pgt->pte_ops; kvm_pte_t pte = *ptep; - if (!stage2_pte_is_counted(pte)) + if (!pte_ops->pte_is_counted_cb(pte, level)) return 0; mm_ops->put_page(ptep); diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 49a56711c33c..9b6df5286dd5 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -680,6 +680,17 @@ static int get_user_mapping_size(struct kvm *kvm, u64 addr) return BIT(ARM64_HW_PGTABLE_LEVEL_SHIFT(level)); } +static bool stage2_force_pte_cb(u64 addr, u64 end, enum kvm_pgtable_prot prot) +{ + return true; +} + +static bool stage2_pte_is_counted(kvm_pte_t pte, u32 level) + +{ + return !!pte; +} + static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = { .zalloc_page = stage2_memcache_zalloc_page, .zalloc_pages_exact = kvm_s2_zalloc_pages_exact, @@ -693,6 +704,12 @@ static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = { .icache_inval_pou = invalidate_icache_guest_page, }; +static struct kvm_pgtable_pte_ops kvm_s2_pte_ops = { + .force_pte_cb = stage2_force_pte_cb, + .pte_is_counted_cb = stage2_pte_is_counted + +}; + /** * kvm_init_stage2_mmu - Initialise a S2 MMU structure * @kvm: The pointer to the KVM structure @@ -746,7 +763,8 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t return 
-ENOMEM; mmu->arch = &kvm->arch; - err = kvm_pgtable_stage2_init(pgt, mmu, &kvm_s2_mm_ops); + err = kvm_pgtable_stage2_init(pgt, mmu, &kvm_s2_mm_ops, + &kvm_s2_pte_ops); if (err) goto out_free_pgtable; From 9a46f648c83904bdfc35af3aaec72ac84613892a Mon Sep 17 00:00:00 2001 From: Sebastian Ene Date: Thu, 8 Dec 2022 10:54:54 +0000 Subject: [PATCH 258/457] ANDROID: KVM: arm64: Move kvm_pte_table to the common header In preparation for the coalescing algorithm implementation, move the function which verifies whether a page table entry is a table to the common header. Bug: 222044487 Change-Id: I4124b7727e91f61b8f0a7e44cd91403d09d83c3c Signed-off-by: Sebastian Ene Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_pgtable.h | 16 ++++++++++++++++ arch/arm64/kvm/hyp/pgtable.c | 16 ---------------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index 26c15fc77711..4f857405ae85 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -44,6 +44,11 @@ typedef u64 kvm_pte_t; #define KVM_PHYS_INVALID (-1ULL) +#define KVM_PTE_TYPE BIT(1) +#define KVM_PTE_TYPE_BLOCK 0 +#define KVM_PTE_TYPE_PAGE 1 +#define KVM_PTE_TYPE_TABLE 1 + #define KVM_PTE_LEAF_ATTR_LO GENMASK(11, 2) #define KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX GENMASK(4, 2) @@ -112,6 +117,17 @@ static inline bool kvm_level_supports_block_mapping(u32 level) return level >= KVM_PGTABLE_MIN_BLOCK_LEVEL; } +static inline bool kvm_pte_table(kvm_pte_t pte, u32 level) +{ + if (level == KVM_PGTABLE_MAX_LEVELS - 1) + return false; + + if (!kvm_pte_valid(pte)) + return false; + + return FIELD_GET(KVM_PTE_TYPE, pte) == KVM_PTE_TYPE_TABLE; +} + /** * struct kvm_pgtable_mm_ops - Memory management callbacks. * @zalloc_page: Allocate a single zeroed memory page. diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 5668f06f9a5c..82b55bd8d570 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -12,11 +12,6 @@ #include -#define KVM_PTE_TYPE BIT(1) -#define KVM_PTE_TYPE_BLOCK 0 -#define KVM_PTE_TYPE_PAGE 1 -#define KVM_PTE_TYPE_TABLE 1 - #define KVM_PTE_LEAF_ATTR_S2_PERMS (KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | \ KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \ KVM_PTE_LEAF_ATTR_HI_S2_XN) @@ -81,17 +76,6 @@ static u32 kvm_pgd_pages(u32 ia_bits, u32 start_level) return __kvm_pgd_page_idx(&pgt, -1ULL) + 1; } -static bool kvm_pte_table(kvm_pte_t pte, u32 level) -{ - if (level == KVM_PGTABLE_MAX_LEVELS - 1) - return false; - - if (!kvm_pte_valid(pte)) - return false; - - return FIELD_GET(KVM_PTE_TYPE, pte) == KVM_PTE_TYPE_TABLE; -} - static kvm_pte_t *kvm_pte_follow(kvm_pte_t pte, struct kvm_pgtable_mm_ops *mm_ops) { return mm_ops->phys_to_virt(kvm_pte_to_phys(pte)); From 1af7ed3212d977869ed312e1f9b43ab259d36727 Mon Sep 17 00:00:00 2001 From: Sebastian Ene Date: Thu, 8 Dec 2022 11:19:17 +0000 Subject: [PATCH 259/457] ANDROID: KVM: arm64: Coalesce host stage2 entries on ownership reclaim This optimization allows us to re-create higher-order block mappings in the host stage2 pagetables after we tear down a guest VM. When the host reclaims ownership during guest teardown, the page table walker drops the refcount of the counted entries and clears out unreferenced entries (refcount == 1). Clearing out the entry installs a zero PTE. When the host stage2 receives a data abort because there is no associated mapping, it will try to create the largest possible block mapping from the leaf entry that was found.
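The payoff of coalescing is that the level at which the fault handler finds a leaf decides how much address space one new entry can cover. A hedged userspace sketch of that relationship for a 4K granule (illustrative arithmetic only, not the fault-handling code):

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT      12
#define LEVELS          4

/* Bytes mapped by a single entry at a given lookup level (4K granule). */
static uint64_t block_size(unsigned int level)
{
        return 1ULL << (PAGE_SHIFT + 9 * (LEVELS - 1 - level));
}

/*
 * Model of the idea above: the shallower the leaf left behind after
 * teardown, the larger the block the host stage-2 can reinstall on the
 * next data abort, and the fewer TLB entries the region needs.
 */
int main(void)
{
        for (unsigned int lvl = 1; lvl < LEVELS; lvl++)
                printf("leaf found at level %u -> one entry maps %llu KiB\n",
                       lvl, (unsigned long long)(block_size(lvl) >> 10));
        return 0;
}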
With the current patch, we increase the chances of finding a leaf entry that has level < 3 if the requested region comes from memory reclaimed from a torn-down VM. This has the advantage of reducing the TLB pressure at host stage2. To increase the coalescing chances, we modify the way we refcount page table descriptors for host stage2: - non-zero invalid PTEs - any of the reserved-high bits (58-55) toggled - non-default attribute mappings - page table descriptors Bug: 222044487 Test: dump the host stage2 pagetables and view the mapping Signed-off-by: Sebastian Ene Change-Id: I90ff4ec2185e9a76d7ad17e77ef9bdd8ce3e8698 Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_pgtable.h | 18 ++++++++++++ arch/arm64/kvm/hyp/nvhe/mem_protect.c | 18 ++++++++++-- arch/arm64/kvm/hyp/pgtable.c | 41 ++++++++++++++++++++++++++- 3 files changed, 74 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index 4f857405ae85..d150b1dcc4df 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -208,6 +208,24 @@ enum kvm_pgtable_prot { #define PKVM_HOST_MEM_PROT KVM_PGTABLE_PROT_RWX #define PKVM_HOST_MMIO_PROT KVM_PGTABLE_PROT_RW +#define KVM_HOST_S2_DEFAULT_ATTR (KVM_PTE_LEAF_ATTR_HI | \ + KVM_PTE_LEAF_ATTR_LO) + +#define KVM_HOST_S2_DEFAULT_MEM_PTE \ + (KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR | \ + KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | \ + KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \ + KVM_PTE_LEAF_ATTR_LO_S2_SH | \ + KVM_PTE_LEAF_ATTR_LO_S2_AF) + +#define KVM_HOST_S2_DEFAULT_MMIO_PTE \ + (KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR | \ + KVM_PTE_LEAF_ATTR_HI_S2_XN | \ + KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | \ + KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \ + KVM_PTE_LEAF_ATTR_LO_S2_SH | \ + KVM_PTE_LEAF_ATTR_LO_S2_AF) + #define PAGE_HYP KVM_PGTABLE_PROT_RW #define PAGE_HYP_EXEC (KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_X) #define PAGE_HYP_RO (KVM_PGTABLE_PROT_R) diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index c201da97edad..d3a1a8d84640 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -178,7 +178,7 @@ static bool guest_stage2_force_pte_cb(u64 addr, u64 end, static bool guest_stage2_pte_is_counted(kvm_pte_t pte, u32 level) { - return host_stage2_pte_is_counted(pte, level); + return !!pte; } static void *guest_s2_zalloc_pages_exact(size_t size) @@ -660,12 +660,26 @@ static bool host_stage2_force_pte(u64 addr, u64 end, enum kvm_pgtable_prot prot) static bool host_stage2_pte_is_counted(kvm_pte_t pte, u32 level) { + u64 phys; + /* * The refcount tracks valid entries as well as invalid entries if they * encode ownership of a page to another entity than the page-table * owner, whose id is 0.
*/ - return !!pte; + if (!kvm_pte_valid(pte)) + return !!pte; + + if (kvm_pte_table(pte, level)) + return true; + + phys = kvm_pte_to_phys(pte); + if (!addr_is_memory(phys)) + return (pte & KVM_HOST_S2_DEFAULT_ATTR) != + KVM_HOST_S2_DEFAULT_MMIO_PTE; + else + return (pte & KVM_HOST_S2_DEFAULT_ATTR) != + KVM_HOST_S2_DEFAULT_MEM_PTE; } static int host_stage2_idmap(u64 addr) diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 82b55bd8d570..7808591e2a09 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -767,6 +767,13 @@ static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, */ if (pte_ops->pte_is_counted_cb(pte, level)) stage2_put_pte(ptep, data->mmu, addr, level, mm_ops); + else { + /* + * On non-refcounted PTEs we just clear them out without + * dropping the refcount. + */ + stage2_clear_pte(ptep, data->mmu, addr, level); + } kvm_set_table_pte(ptep, childp, mm_ops); mm_ops->get_page(ptep); @@ -774,6 +781,35 @@ static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, return 0; } +static void stage2_coalesce_walk_table_post(u64 addr, u64 end, u32 level, + kvm_pte_t *ptep, + struct stage2_map_data *data) +{ + struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops; + kvm_pte_t *childp = kvm_pte_follow(*ptep, mm_ops); + + /* + * Decrement the refcount only on the set ownership path to avoid a + * loop situation when the following happens: + * 1. We take a host stage2 fault and we create a small mapping which + * has default attributes (is not refcounted). + * 2. On the way back we execute the post handler and we zap the + * table that holds our mapping. + */ + if (kvm_phys_is_valid(data->phys) || + !kvm_level_supports_block_mapping(level)) + return; + + /* + * Free a page that is not referenced anymore and drop the reference + * of the page table page. + */ + if (mm_ops->page_count(childp) == 1) { + stage2_put_pte(ptep, data->mmu, addr, level, mm_ops); + mm_ops->put_page(childp); + } +} + static int stage2_map_walk_table_post(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, struct stage2_map_data *data) @@ -782,8 +818,11 @@ static int stage2_map_walk_table_post(u64 addr, u64 end, u32 level, kvm_pte_t *childp; int ret = 0; - if (!data->anchor) + if (!data->anchor) { + stage2_coalesce_walk_table_post(addr, end, level, ptep, + data); return 0; + } if (data->anchor == ptep) { childp = data->childp; From e38c4149ed85c027a8ee363d2db79cfe80f01dca Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 15 Dec 2022 12:45:24 +0000 Subject: [PATCH 260/457] ANDROID: KVM: arm64: Don't filter out KVM_FUNC_MMIO_GUARD_MAP hypercalls If a KVM_FUNC_MMIO_GUARD_MAP hypercall from a protected guest fails at EL2 due to running out of page-table memory, the call is forwarded to the host so that additional memory can be donated using the vCPU's memcache. Unfortunately, the host filters out these calls the hypervisor will replay the guest's HVC instruction forever, making no progress because it will fail each time. Avoid filtering out KVM_FUNC_MMIO_GUARD_MAP, in the same way as we handle the SHARE and UNSHARE hypercalls. 
Bug: 262700476 Cc: Keir Fraser Signed-off-by: Will Deacon Change-Id: Idd14c6bc08a4232939676e3566b79cbc7c927a3a --- arch/arm64/kvm/hypercalls.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c index fc64e8358928..b4712bcc697d 100644 --- a/arch/arm64/kvm/hypercalls.c +++ b/arch/arm64/kvm/hypercalls.c @@ -85,6 +85,7 @@ static bool kvm_hvc_call_default_allowed(u32 func_id) case ARM_SMCCC_ARCH_FEATURES_FUNC_ID: case ARM_SMCCC_VENDOR_HYP_KVM_MEM_SHARE_FUNC_ID: case ARM_SMCCC_VENDOR_HYP_KVM_MEM_UNSHARE_FUNC_ID: + case ARM_SMCCC_VENDOR_HYP_KVM_MMIO_GUARD_MAP_FUNC_ID: return true; default: /* PSCI 0.2 and up is in the 0:0x1f range */ From 72cc19df8b71095f9740ff0ca6a75bf7ed27b0cd Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 13 Dec 2022 21:19:25 +0000 Subject: [PATCH 261/457] ANDROID: KVM: arm64: Add support for non-cacheable mappings Hypervisor vendor modules may need to create non-cacheable mappings in the hypervisor stage-1 for interacting with devices such as IOMMUs. Add support for this memory type to the KVM pgtable API and implement it for both stage-1 and stage-2. Bug: 244373730 Signed-off-by: Will Deacon Change-Id: I2f88db7fe47e16366018e3e48f30d09b299ae6e4 --- arch/arm64/include/asm/kvm_pgtable.h | 2 ++ arch/arm64/include/asm/memory.h | 2 ++ arch/arm64/kvm/hyp/pgtable.c | 34 ++++++++++++++++++++++------ 3 files changed, 31 insertions(+), 7 deletions(-) diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index d150b1dcc4df..8e8cd6bc6433 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -184,6 +184,7 @@ enum kvm_pgtable_stage2_flags { * @KVM_PGTABLE_PROT_W: Write permission. * @KVM_PGTABLE_PROT_R: Read permission. * @KVM_PGTABLE_PROT_DEVICE: Device attributes. + * @KVM_PGTABLE_PROT_NC: Normal non-cacheable attributes. * @KVM_PGTABLE_PROT_SW0: Software bit 0. * @KVM_PGTABLE_PROT_SW1: Software bit 1. * @KVM_PGTABLE_PROT_SW2: Software bit 2. @@ -195,6 +196,7 @@ enum kvm_pgtable_prot { KVM_PGTABLE_PROT_R = BIT(2), KVM_PGTABLE_PROT_DEVICE = BIT(3), + KVM_PGTABLE_PROT_NC = BIT(4), KVM_PGTABLE_PROT_SW0 = BIT(55), KVM_PGTABLE_PROT_SW1 = BIT(56), diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index 9dd08cd339c3..cb7055dc6045 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -147,6 +147,7 @@ * Memory types for Stage-2 translation */ #define MT_S2_NORMAL 0xf +#define MT_S2_NORMAL_NC 0x5 #define MT_S2_DEVICE_nGnRE 0x1 /* @@ -154,6 +155,7 @@ * Stage-2 enforces Normal-WB and Device-nGnRE */ #define MT_S2_FWB_NORMAL 6 +#define MT_S2_FWB_NORMAL_NC 5 #define MT_S2_FWB_DEVICE_nGnRE 1 #ifdef CONFIG_ARM64_4K_PAGES diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 7808591e2a09..6aac30b3ba7f 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -273,16 +273,26 @@ struct hyp_map_data { static int hyp_set_prot_attr(enum kvm_pgtable_prot prot, kvm_pte_t *ptep) { - bool device = prot & KVM_PGTABLE_PROT_DEVICE; - u32 mtype = device ? MT_DEVICE_nGnRE : MT_NORMAL; - kvm_pte_t attr = FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX, mtype); - u32 sh = KVM_PTE_LEAF_ATTR_LO_S1_SH_IS; u32 ap = (prot & KVM_PGTABLE_PROT_W) ? 
KVM_PTE_LEAF_ATTR_LO_S1_AP_RW : KVM_PTE_LEAF_ATTR_LO_S1_AP_RO; + bool device = prot & KVM_PGTABLE_PROT_DEVICE; + u32 sh = KVM_PTE_LEAF_ATTR_LO_S1_SH_IS; + bool nc = prot & KVM_PGTABLE_PROT_NC; + kvm_pte_t attr; + u32 mtype; - if (!(prot & KVM_PGTABLE_PROT_R)) + if (!(prot & KVM_PGTABLE_PROT_R) || (device && nc)) return -EINVAL; + if (device) + mtype = MT_DEVICE_nGnRnE; + else if (nc) + mtype = MT_NORMAL_NC; + else + mtype = MT_NORMAL; + + attr = FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX, mtype); + if (prot & KVM_PGTABLE_PROT_X) { if (prot & KVM_PGTABLE_PROT_W) return -EINVAL; @@ -563,9 +573,19 @@ static int stage2_set_prot_attr(struct kvm_pgtable *pgt, enum kvm_pgtable_prot p kvm_pte_t *ptep) { bool device = prot & KVM_PGTABLE_PROT_DEVICE; - kvm_pte_t attr = device ? KVM_S2_MEMATTR(pgt, DEVICE_nGnRE) : - KVM_S2_MEMATTR(pgt, NORMAL); u32 sh = KVM_PTE_LEAF_ATTR_LO_S2_SH_IS; + bool nc = prot & KVM_PGTABLE_PROT_NC; + kvm_pte_t attr; + + if (device && nc) + return -EINVAL; + + if (device) + attr = KVM_S2_MEMATTR(pgt, DEVICE_nGnRE); + else if (nc) + attr = KVM_S2_MEMATTR(pgt, NORMAL_NC); + else + attr = KVM_S2_MEMATTR(pgt, NORMAL); if (!(prot & KVM_PGTABLE_PROT_X)) attr |= KVM_PTE_LEAF_ATTR_HI_S2_XN; From 6b510a1423eefaa5c0423735c71c87141aaa7dd1 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 13 Nov 2022 16:12:33 -0800 Subject: [PATCH 262/457] UPSTREAM: crypto: api - optimize algorithm registration when self-tests disabled Currently, registering an algorithm with the crypto API always causes a notification to be posted to the "cryptomgr", which then creates a kthread to self-test the algorithm. However, if self-tests are disabled in the kconfig (as is the default option), then this kthread just notifies waiters that the algorithm has been tested, then exits. This causes a significant amount of overhead, especially in the kthread creation and destruction, which is not necessary at all. For example, in a quick test I found that booting a "minimum" x86_64 kernel with all the crypto options enabled (except for the self-tests) takes about 400ms until PID 1 can start. Of that, a full 13ms is spent just doing this pointless dance, involving a kthread being created, run, and destroyed over 200 times. That's over 3% of the entire kernel start time. Fix this by just skipping the creation of the test larval and the posting of the registration notification entirely, when self-tests are disabled. Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu Bug: 256875295 (cherry picked from commit a7008584ab19d2df05caa95634cd72bc41f4cad3) Change-Id: Ia6be068618e9286c1be01415a6766ba2fa94fc0d Signed-off-by: Eric Biggers --- crypto/algapi.c | 156 +++++++++++++++++++++++++++--------------------- crypto/api.c | 3 - 2 files changed, 87 insertions(+), 72 deletions(-) diff --git a/crypto/algapi.c b/crypto/algapi.c index 5c69ff8e8fa5..950195e90bfc 100644 --- a/crypto/algapi.c +++ b/crypto/algapi.c @@ -222,12 +222,64 @@ void crypto_remove_spawns(struct crypto_alg *alg, struct list_head *list, } EXPORT_SYMBOL_GPL(crypto_remove_spawns); +static void crypto_alg_finish_registration(struct crypto_alg *alg, + bool fulfill_requests, + struct list_head *algs_to_put) +{ + struct crypto_alg *q; + + list_for_each_entry(q, &crypto_alg_list, cra_list) { + if (q == alg) + continue; + + if (crypto_is_moribund(q)) + continue; + + if (crypto_is_larval(q)) { + struct crypto_larval *larval = (void *)q; + + /* + * Check to see if either our generic name or + * specific name can satisfy the name requested + * by the larval entry q. 
+ */ + if (strcmp(alg->cra_name, q->cra_name) && + strcmp(alg->cra_driver_name, q->cra_name)) + continue; + + if (larval->adult) + continue; + if ((q->cra_flags ^ alg->cra_flags) & larval->mask) + continue; + + if (fulfill_requests && crypto_mod_get(alg)) + larval->adult = alg; + else + larval->adult = ERR_PTR(-EAGAIN); + + continue; + } + + if (strcmp(alg->cra_name, q->cra_name)) + continue; + + if (strcmp(alg->cra_driver_name, q->cra_driver_name) && + q->cra_priority > alg->cra_priority) + continue; + + crypto_remove_spawns(q, algs_to_put, alg); + } + + crypto_notify(CRYPTO_MSG_ALG_LOADED, alg); +} + static struct crypto_larval *crypto_alloc_test_larval(struct crypto_alg *alg) { struct crypto_larval *larval; - if (!IS_ENABLED(CONFIG_CRYPTO_MANAGER)) - return NULL; + if (!IS_ENABLED(CONFIG_CRYPTO_MANAGER) || + IS_ENABLED(CONFIG_CRYPTO_MANAGER_DISABLE_TESTS)) + return NULL; /* No self-test needed */ larval = crypto_larval_alloc(alg->cra_name, alg->cra_flags | CRYPTO_ALG_TESTED, 0); @@ -248,7 +300,8 @@ static struct crypto_larval *crypto_alloc_test_larval(struct crypto_alg *alg) return larval; } -static struct crypto_larval *__crypto_register_alg(struct crypto_alg *alg) +static struct crypto_larval * +__crypto_register_alg(struct crypto_alg *alg, struct list_head *algs_to_put) { struct crypto_alg *q; struct crypto_larval *larval; @@ -259,9 +312,6 @@ static struct crypto_larval *__crypto_register_alg(struct crypto_alg *alg) INIT_LIST_HEAD(&alg->cra_users); - /* No cheating! */ - alg->cra_flags &= ~CRYPTO_ALG_TESTED; - ret = -EEXIST; list_for_each_entry(q, &crypto_alg_list, cra_list) { @@ -288,13 +338,18 @@ static struct crypto_larval *__crypto_register_alg(struct crypto_alg *alg) list_add(&alg->cra_list, &crypto_alg_list); - if (larval) - list_add(&larval->alg.cra_list, &crypto_alg_list); - else - alg->cra_flags |= CRYPTO_ALG_TESTED; - crypto_stats_init(alg); + if (larval) { + /* No cheating! */ + alg->cra_flags &= ~CRYPTO_ALG_TESTED; + + list_add(&larval->alg.cra_list, &crypto_alg_list); + } else { + alg->cra_flags |= CRYPTO_ALG_TESTED; + crypto_alg_finish_registration(alg, true, algs_to_put); + } + out: return larval; @@ -341,7 +396,10 @@ found: alg->cra_flags |= CRYPTO_ALG_TESTED; - /* Only satisfy larval waiters if we are the best. */ + /* + * If a higher-priority implementation of the same algorithm is + * currently being tested, then don't fulfill request larvals. + */ best = true; list_for_each_entry(q, &crypto_alg_list, cra_list) { if (crypto_is_moribund(q) || !crypto_is_larval(q)) @@ -356,47 +414,7 @@ found: } } - list_for_each_entry(q, &crypto_alg_list, cra_list) { - if (q == alg) - continue; - - if (crypto_is_moribund(q)) - continue; - - if (crypto_is_larval(q)) { - struct crypto_larval *larval = (void *)q; - - /* - * Check to see if either our generic name or - * specific name can satisfy the name requested - * by the larval entry q. 
- */ - if (strcmp(alg->cra_name, q->cra_name) && - strcmp(alg->cra_driver_name, q->cra_name)) - continue; - - if (larval->adult) - continue; - if ((q->cra_flags ^ alg->cra_flags) & larval->mask) - continue; - - if (best && crypto_mod_get(alg)) - larval->adult = alg; - else - larval->adult = ERR_PTR(-EAGAIN); - - continue; - } - - if (strcmp(alg->cra_name, q->cra_name)) - continue; - - if (strcmp(alg->cra_driver_name, q->cra_driver_name) && - q->cra_priority > alg->cra_priority) - continue; - - crypto_remove_spawns(q, &list, alg); - } + crypto_alg_finish_registration(alg, best, &list); complete: complete_all(&test->completion); @@ -423,7 +441,8 @@ EXPORT_SYMBOL_GPL(crypto_remove_final); int crypto_register_alg(struct crypto_alg *alg) { struct crypto_larval *larval; - bool test_started; + LIST_HEAD(algs_to_put); + bool test_started = false; int err; alg->cra_flags &= ~CRYPTO_ALG_DEAD; @@ -432,17 +451,18 @@ int crypto_register_alg(struct crypto_alg *alg) return err; down_write(&crypto_alg_sem); - larval = __crypto_register_alg(alg); - test_started = static_key_enabled(&crypto_boot_test_finished); - if (!IS_ERR_OR_NULL(larval)) + larval = __crypto_register_alg(alg, &algs_to_put); + if (!IS_ERR_OR_NULL(larval)) { + test_started = static_key_enabled(&crypto_boot_test_finished); larval->test_started = test_started; + } up_write(&crypto_alg_sem); - if (IS_ERR_OR_NULL(larval)) + if (IS_ERR(larval)) return PTR_ERR(larval); - if (test_started) crypto_wait_for_test(larval); + crypto_remove_final(&algs_to_put); return 0; } EXPORT_SYMBOL_GPL(crypto_register_alg); @@ -619,6 +639,7 @@ int crypto_register_instance(struct crypto_template *tmpl, struct crypto_larval *larval; struct crypto_spawn *spawn; u32 fips_internal = 0; + LIST_HEAD(algs_to_put); int err; err = crypto_check_alg(&inst->alg); @@ -650,7 +671,7 @@ int crypto_register_instance(struct crypto_template *tmpl, inst->alg.cra_flags |= (fips_internal & CRYPTO_ALG_FIPS_INTERNAL); - larval = __crypto_register_alg(&inst->alg); + larval = __crypto_register_alg(&inst->alg, &algs_to_put); if (IS_ERR(larval)) goto unlock; else if (larval) @@ -662,15 +683,12 @@ int crypto_register_instance(struct crypto_template *tmpl, unlock: up_write(&crypto_alg_sem); - err = PTR_ERR(larval); - if (IS_ERR_OR_NULL(larval)) - goto err; - - crypto_wait_for_test(larval); - err = 0; - -err: - return err; + if (IS_ERR(larval)) + return PTR_ERR(larval); + if (larval) + crypto_wait_for_test(larval); + crypto_remove_final(&algs_to_put); + return 0; } EXPORT_SYMBOL_GPL(crypto_register_instance); diff --git a/crypto/api.c b/crypto/api.c index 64f2d365a8e9..52ce10a35366 100644 --- a/crypto/api.c +++ b/crypto/api.c @@ -172,9 +172,6 @@ void crypto_wait_for_test(struct crypto_larval *larval) err = wait_for_completion_killable(&larval->completion); WARN_ON(err); - if (!err) - crypto_notify(CRYPTO_MSG_ALG_LOADED, larval); - out: crypto_larval_kill(&larval->alg); } From 8c54209f4a5ed5db16813a4164fec9afe133b4a8 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 13 Nov 2022 16:12:34 -0800 Subject: [PATCH 263/457] UPSTREAM: crypto: algboss - optimize registration of internal algorithms Since algboss always skips testing of algorithms with the CRYPTO_ALG_INTERNAL flag, there is no need to go through the dance of creating the test kthread, which creates a lot of overhead. Instead, we can just directly finish the algorithm registration, like is now done when self-tests are disabled entirely. 
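The decision boils down to one predicate at registration time: only allocate a test larval when self-tests are compiled in and the algorithm is not internal. A compact userspace sketch of that predicate (hypothetical names, not the crypto API):

#include <stdbool.h>
#include <stdio.h>

#define ALG_INTERNAL    0x1

static bool self_tests_enabled = true; /* CONFIG_CRYPTO_MANAGER_DISABLE_TESTS unset */

static bool needs_test_larval(unsigned int flags)
{
        if (!self_tests_enabled)
                return false;           /* no self-test needed */
        if (flags & ALG_INTERNAL)
                return false;           /* internal algorithms are never tested */
        return true;
}

static void register_alg(const char *name, unsigned int flags)
{
        if (needs_test_larval(flags))
                printf("%s: schedule self-test\n", name);
        else
                printf("%s: finish registration immediately\n", name);
}

int main(void)
{
        register_alg("sha256-generic", 0);
        register_alg("aes-internal", ALG_INTERNAL);
        return 0;
}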
Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu Bug: 256875295 (cherry picked from commit 9cadd73adef1e1d53ea100f28e3e258698b92418) Change-Id: I10f814cd6903d41265f69297d8568b43ec30012e Signed-off-by: Eric Biggers --- crypto/algapi.c | 3 ++- crypto/algboss.c | 13 +------------ 2 files changed, 3 insertions(+), 13 deletions(-) diff --git a/crypto/algapi.c b/crypto/algapi.c index 950195e90bfc..851b247f043d 100644 --- a/crypto/algapi.c +++ b/crypto/algapi.c @@ -278,7 +278,8 @@ static struct crypto_larval *crypto_alloc_test_larval(struct crypto_alg *alg) struct crypto_larval *larval; if (!IS_ENABLED(CONFIG_CRYPTO_MANAGER) || - IS_ENABLED(CONFIG_CRYPTO_MANAGER_DISABLE_TESTS)) + IS_ENABLED(CONFIG_CRYPTO_MANAGER_DISABLE_TESTS) || + (alg->cra_flags & CRYPTO_ALG_INTERNAL)) return NULL; /* No self-test needed */ larval = crypto_larval_alloc(alg->cra_name, diff --git a/crypto/algboss.c b/crypto/algboss.c index eb5fe84efb83..13d37320a66e 100644 --- a/crypto/algboss.c +++ b/crypto/algboss.c @@ -181,12 +181,8 @@ static int cryptomgr_test(void *data) goto skiptest; #endif - if (type & CRYPTO_ALG_TESTED) - goto skiptest; - err = alg_test(param->driver, param->alg, type, CRYPTO_ALG_TESTED); -skiptest: crypto_alg_tested(param->driver, err); kfree(param); @@ -197,7 +193,6 @@ static int cryptomgr_schedule_test(struct crypto_alg *alg) { struct task_struct *thread; struct crypto_test_param *param; - u32 type; if (!try_module_get(THIS_MODULE)) goto err; @@ -208,13 +203,7 @@ static int cryptomgr_schedule_test(struct crypto_alg *alg) memcpy(param->driver, alg->cra_driver_name, sizeof(param->driver)); memcpy(param->alg, alg->cra_name, sizeof(param->alg)); - type = alg->cra_flags; - - /* Do not test internal algorithms. */ - if (type & CRYPTO_ALG_INTERNAL) - type |= CRYPTO_ALG_TESTED; - - param->type = type; + param->type = alg->cra_flags; thread = kthread_run(cryptomgr_test, param, "cryptomgr_test"); if (IS_ERR(thread)) From 1e15da5148f593374a744405efb133a84fe47d01 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 13 Nov 2022 16:12:35 -0800 Subject: [PATCH 264/457] UPSTREAM: crypto: api - compile out crypto_boot_test_finished when tests disabled The crypto_boot_test_finished static key is unnecessary when self-tests are disabled in the kconfig, so optimize it out accordingly, along with the entirety of crypto_start_tests(). This mainly avoids the overhead of an unnecessary static_branch_enable() on every boot. 
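The pattern used here is to hide the flag behind a pair of config-dependent inline helpers so that, with tests disabled, the predicate folds to a compile-time constant. A userspace model of the same wrappers (a plain bool stands in for the kernel's static key):

#include <stdbool.h>
#include <stdio.h>

#ifdef TESTS_DISABLED
static inline bool boot_test_finished(void) { return true; }
static inline void set_boot_test_finished(void) { }
#else
static bool test_finished_flag;
static inline bool boot_test_finished(void) { return test_finished_flag; }
static inline void set_boot_test_finished(void) { test_finished_flag = true; }
#endif

int main(void)
{
        /* With -DTESTS_DISABLED the first line already prints 1. */
        printf("finished at start: %d\n", boot_test_finished());
        set_boot_test_finished();
        printf("finished after set: %d\n", boot_test_finished());
        return 0;
}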
Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu Bug: 256875295 (cherry picked from commit 06bd9c967eaac5484c31c3dc6dfbef6183819508) Change-Id: I68eff9772dc219a8786bf410cb4e946052ea7811 Signed-off-by: Eric Biggers --- crypto/algapi.c | 7 +++++-- crypto/api.c | 8 +++++--- crypto/internal.h | 20 +++++++++++++++++++- 3 files changed, 29 insertions(+), 6 deletions(-) diff --git a/crypto/algapi.c b/crypto/algapi.c index 851b247f043d..d08f864f08be 100644 --- a/crypto/algapi.c +++ b/crypto/algapi.c @@ -454,7 +454,7 @@ int crypto_register_alg(struct crypto_alg *alg) down_write(&crypto_alg_sem); larval = __crypto_register_alg(alg, &algs_to_put); if (!IS_ERR_OR_NULL(larval)) { - test_started = static_key_enabled(&crypto_boot_test_finished); + test_started = crypto_boot_test_finished(); larval->test_started = test_started; } up_write(&crypto_alg_sem); @@ -1253,6 +1253,9 @@ EXPORT_SYMBOL_GPL(crypto_stats_skcipher_decrypt); static void __init crypto_start_tests(void) { + if (IS_ENABLED(CONFIG_CRYPTO_MANAGER_DISABLE_TESTS)) + return; + for (;;) { struct crypto_larval *larval = NULL; struct crypto_alg *q; @@ -1286,7 +1289,7 @@ static void __init crypto_start_tests(void) crypto_wait_for_test(larval); } - static_branch_enable(&crypto_boot_test_finished); + set_crypto_boot_test_finished(); } static int __init crypto_algapi_init(void) diff --git a/crypto/api.c b/crypto/api.c index 52ce10a35366..b022702f6436 100644 --- a/crypto/api.c +++ b/crypto/api.c @@ -31,8 +31,10 @@ EXPORT_SYMBOL_GPL(crypto_alg_sem); BLOCKING_NOTIFIER_HEAD(crypto_chain); EXPORT_SYMBOL_GPL(crypto_chain); -DEFINE_STATIC_KEY_FALSE(crypto_boot_test_finished); -EXPORT_SYMBOL_GPL(crypto_boot_test_finished); +#ifndef CONFIG_CRYPTO_MANAGER_DISABLE_TESTS +DEFINE_STATIC_KEY_FALSE(__crypto_boot_test_finished); +EXPORT_SYMBOL_GPL(__crypto_boot_test_finished); +#endif static struct crypto_alg *crypto_larval_wait(struct crypto_alg *alg); @@ -202,7 +204,7 @@ static struct crypto_alg *crypto_larval_wait(struct crypto_alg *alg) struct crypto_larval *larval = (void *)alg; long timeout; - if (!static_branch_likely(&crypto_boot_test_finished)) + if (!crypto_boot_test_finished()) crypto_start_test(larval); timeout = wait_for_completion_killable_timeout( diff --git a/crypto/internal.h b/crypto/internal.h index c08385571853..932f0aafddc3 100644 --- a/crypto/internal.h +++ b/crypto/internal.h @@ -47,7 +47,25 @@ extern struct list_head crypto_alg_list; extern struct rw_semaphore crypto_alg_sem; extern struct blocking_notifier_head crypto_chain; -DECLARE_STATIC_KEY_FALSE(crypto_boot_test_finished); +#ifdef CONFIG_CRYPTO_MANAGER_DISABLE_TESTS +static inline bool crypto_boot_test_finished(void) +{ + return true; +} +static inline void set_crypto_boot_test_finished(void) +{ +} +#else +DECLARE_STATIC_KEY_FALSE(__crypto_boot_test_finished); +static inline bool crypto_boot_test_finished(void) +{ + return static_branch_likely(&__crypto_boot_test_finished); +} +static inline void set_crypto_boot_test_finished(void) +{ + static_branch_enable(&__crypto_boot_test_finished); +} +#endif /* !CONFIG_CRYPTO_MANAGER_DISABLE_TESTS */ #ifdef CONFIG_PROC_FS void __init crypto_init_proc(void); From 4ce9b3423d941ab90775fb483f144f50353b0340 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 13 Nov 2022 16:12:36 -0800 Subject: [PATCH 265/457] UPSTREAM: crypto: kdf - skip self-test when tests disabled Make kdf_sp800108 honor the CONFIG_CRYPTO_MANAGER_DISABLE_TESTS kconfig option, so that it doesn't always waste time running its self-test. 
Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu Bug: 256875295 (cherry picked from commit 0bf365c0efdd8fc03cb82e381ea4d76196c66bc2) Change-Id: Ib189f727a7bb1a231cc8ec4ca6450e685bd678f7 Signed-off-by: Eric Biggers --- crypto/kdf_sp800108.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/crypto/kdf_sp800108.c b/crypto/kdf_sp800108.c index 58edf7797abf..c6e3ad82d5f7 100644 --- a/crypto/kdf_sp800108.c +++ b/crypto/kdf_sp800108.c @@ -125,9 +125,13 @@ static const struct kdf_testvec kdf_ctr_hmac_sha256_tv_template[] = { static int __init crypto_kdf108_init(void) { - int ret = kdf_test(&kdf_ctr_hmac_sha256_tv_template[0], "hmac(sha256)", - crypto_kdf108_setkey, crypto_kdf108_ctr_generate); + int ret; + if (IS_ENABLED(CONFIG_CRYPTO_MANAGER_DISABLE_TESTS)) + return 0; + + ret = kdf_test(&kdf_ctr_hmac_sha256_tv_template[0], "hmac(sha256)", + crypto_kdf108_setkey, crypto_kdf108_ctr_generate); if (ret) { if (fips_enabled) panic("alg: self-tests for CTR-KDF (hmac(sha256)) failed (rc=%d)\n", From 93b8f2a16fd9dc00649923369917eb3619590433 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 13 Nov 2022 16:12:37 -0800 Subject: [PATCH 266/457] UPSTREAM: crypto: kdf - silence noisy self-test Make the kdf_sp800108 self-test only print a message on success when fips_enabled, so that it's consistent with testmgr.c and doesn't spam the kernel log with a message that isn't really important. Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu Bug: 256875295 (cherry picked from commit 790c4c9f532318e3fe8c6f0b498072abc80e1195) Change-Id: Icb2c93c0ed51789fd425d36ebcaebf01af6b4c78 Signed-off-by: Eric Biggers --- crypto/kdf_sp800108.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crypto/kdf_sp800108.c b/crypto/kdf_sp800108.c index c6e3ad82d5f7..c3f9938e1ad2 100644 --- a/crypto/kdf_sp800108.c +++ b/crypto/kdf_sp800108.c @@ -140,7 +140,7 @@ static int __init crypto_kdf108_init(void) WARN(1, "alg: self-tests for CTR-KDF (hmac(sha256)) failed (rc=%d)\n", ret); - } else { + } else if (fips_enabled) { pr_info("alg: self-tests for CTR-KDF (hmac(sha256)) passed\n"); } From 72c995ce885fdb6f53cc7919147ed801037b52d7 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 13 Nov 2022 16:12:38 -0800 Subject: [PATCH 267/457] UPSTREAM: crypto: algboss - compile out test-related code when tests disabled When CONFIG_CRYPTO_MANAGER_DISABLE_TESTS is set, the code in algboss.c that handles CRYPTO_MSG_ALG_REGISTER is unnecessary, so make it be compiled out. 
Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu Bug: 256875295 (cherry picked from commit 441cb1b730006bd2d636f72dc7f6e11a8a0ecce5) Change-Id: I11ebf60e1915ad5d13bd16a26d6c2c0944b4c401 Signed-off-by: Eric Biggers --- crypto/algboss.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/crypto/algboss.c b/crypto/algboss.c index 13d37320a66e..0de1e6697949 100644 --- a/crypto/algboss.c +++ b/crypto/algboss.c @@ -175,11 +175,7 @@ static int cryptomgr_test(void *data) { struct crypto_test_param *param = data; u32 type = param->type; - int err = 0; - -#ifdef CONFIG_CRYPTO_MANAGER_DISABLE_TESTS - goto skiptest; -#endif + int err; err = alg_test(param->driver, param->alg, type, CRYPTO_ALG_TESTED); @@ -194,6 +190,9 @@ static int cryptomgr_schedule_test(struct crypto_alg *alg) struct task_struct *thread; struct crypto_test_param *param; + if (IS_ENABLED(CONFIG_CRYPTO_MANAGER_DISABLE_TESTS)) + return NOTIFY_DONE; + if (!try_module_get(THIS_MODULE)) goto err; From 552a021d7dfdf4a12c3d2553b4da8919a2a29f3f Mon Sep 17 00:00:00 2001 From: Ramji Jiyani Date: Fri, 16 Dec 2022 06:32:13 +0000 Subject: [PATCH 268/457] ANDROID: GKI: Protect exports of protected GKI modules Implement support for protecting the exported symbols of protected GKI modules. Only signed GKI modules are permitted to export symbols listed in the android/abi_gki_protected_exports file. Attempting to export these symbols from an unsigned module will result in the module failing to load, with a 'Permission denied' error message. Bug: 232430739 Test: TH Change-Id: I3e8b330938e116bb2e022d356ac0d55108a84a01 Signed-off-by: Ramji Jiyani (cherry picked from commit fd1e768866606ff230280f26faca797bb47bd8c0) --- android/abi_gki_protected_exports | 0 kernel/module/Makefile | 10 ++++++++-- kernel/module/gki_module.c | 21 ++++++++++++++++++--- kernel/module/internal.h | 5 +++++ kernel/module/main.c | 8 ++++++++ scripts/gen_gki_modules_headers.sh | 23 +++++++++++++++-------- 6 files changed, 54 insertions(+), 13 deletions(-) create mode 100644 android/abi_gki_protected_exports diff --git a/android/abi_gki_protected_exports b/android/abi_gki_protected_exports new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/kernel/module/Makefile b/kernel/module/Makefile index 756419001b30..06201b9a0574 100644 --- a/kernel/module/Makefile +++ b/kernel/module/Makefile @@ -25,9 +25,15 @@ obj-$(CONFIG_MODULE_UNLOAD_TAINT_TRACKING) += tracking.o # ANDROID: GKI: Generate headerfile required for gki_module.o # # Dependencies on generated files need to be listed explicitly -$(obj)/gki_module.o: $(obj)/gki_module_unprotected.h +$(obj)/gki_module.o: $(obj)/gki_module_protected_exports.h \ + $(obj)/gki_module_unprotected.h $(obj)/gki_module_unprotected.h: $(srctree)/scripts/gen_gki_modules_headers.sh \ - $(if $(wildcard ${OUT_DIR}/abi_symbollist.raw), ${OUT_DIR}/abi_symbollist.raw) + $(if $(wildcard ${OUT_DIR}/abi_symbollist.raw), ${OUT_DIR}/abi_symbollist.raw) + $(Q)$(CONFIG_SHELL) $(srctree)/scripts/gen_gki_modules_headers.sh $@ \ + "$(srctree)" + +$(obj)/gki_module_protected_exports.h: $(srctree)/android/abi_gki_protected_exports \ + $(srctree)/scripts/gen_gki_modules_headers.sh $(Q)$(CONFIG_SHELL) $(srctree)/scripts/gen_gki_modules_headers.sh $@ \ "$(srctree)" diff --git a/kernel/module/gki_module.c b/kernel/module/gki_module.c index f2367d77e3a2..fc2968c3791e 100644 --- a/kernel/module/gki_module.c +++ b/kernel/module/gki_module.c @@ -13,14 +13,29 @@ /* * Build time generated header files * + * gki_module_protected_exports.h -- 
Symbols protected from _export_ by unsigned modules * gki_module_unprotected.h -- Symbols allowed to _access_ by unsigned modules */ +#include "gki_module_protected_exports.h" #include "gki_module_unprotected.h" +#define MAX_STRCMP_LEN (max(MAX_UNPROTECTED_NAME_LEN, MAX_PROTECTED_EXPORTS_NAME_LEN)) + /* bsearch() comparision callback */ static int cmp_name(const void *sym, const void *protected_sym) { - return strncmp(sym, protected_sym, MAX_UNPROTECTED_NAME_LEN); + return strncmp(sym, protected_sym, MAX_STRCMP_LEN); +} + +/** + * gki_is_module_protected_export - Is a symbol exported from a protected GKI module? + * + * @name: Symbol being checked against exported symbols from protected GKI modules + */ +bool gki_is_module_protected_export(const char *name) +{ + return bsearch(name, gki_protected_exports_symbols, NR_PROTECTED_EXPORTS_SYMBOLS, + MAX_PROTECTED_EXPORTS_NAME_LEN, cmp_name) != NULL; } /** @@ -30,8 +45,8 @@ static int cmp_name(const void *sym, const void *protected_sym) */ bool gki_is_module_unprotected_symbol(const char *name) { - if (NO_OF_UNPROTECTED_SYMBOLS) { - return bsearch(name, gki_unprotected_symbols, NO_OF_UNPROTECTED_SYMBOLS, + if (NR_UNPROTECTED_SYMBOLS) { + return bsearch(name, gki_unprotected_symbols, NR_UNPROTECTED_SYMBOLS, MAX_UNPROTECTED_NAME_LEN, cmp_name) != NULL; } else { /* diff --git a/kernel/module/internal.h b/kernel/module/internal.h index 6f96a966f387..d51b047b4a54 100644 --- a/kernel/module/internal.h +++ b/kernel/module/internal.h @@ -306,9 +306,14 @@ static inline int same_magic(const char *amagic, const char *bmagic, bool has_cr #ifdef CONFIG_MODULE_SIG_PROTECT extern bool gki_is_module_unprotected_symbol(const char *name); +extern bool gki_is_module_protected_export(const char *name); #else static inline bool gki_is_module_unprotected_symbol(const char *name) { return true; } +static inline bool gki_is_module_protected_export(const char *name) +{ + return false; +} #endif /* CONFIG_MODULE_SIG_PROTECT */ diff --git a/kernel/module/main.c b/kernel/module/main.c index 0087414df044..3aaa90ed7929 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -1267,6 +1267,14 @@ static int verify_exported_symbols(struct module *mod) .name = kernel_symbol_name(s), .gplok = true, }; + + if (!mod->sig_ok && gki_is_module_protected_export( + kernel_symbol_name(s))) { + pr_err("%s: exports protected symbol %s\n", + mod->name, kernel_symbol_name(s)); + return -EACCES; + } + if (find_symbol(&fsa)) { pr_err("%s: exports duplicate symbol %s" " (owned by %s)\n", diff --git a/scripts/gen_gki_modules_headers.sh b/scripts/gen_gki_modules_headers.sh index 99f82bab8a02..4fe9d6faa19b 100755 --- a/scripts/gen_gki_modules_headers.sh +++ b/scripts/gen_gki_modules_headers.sh @@ -49,11 +49,11 @@ generate_header() { # Find Maximum symbol name length if valid symbol_file exist if [ -s "${symbol_file}" ]; then - # Skip 1st line (symbol header), Trim white spaces & +1 for null termination + # Trim white spaces & +1 for null termination local max_name_len=$(awk ' { $1=$1; - if ( length > L && NR > 1) { + if ( length > L ) { L=length } } END { print ++L }' "${symbol_file}") @@ -67,11 +67,11 @@ generate_header() { /* * DO NOT EDIT * - * Build generated header file with unprotected symbols/exports + * Build generated header file with ${symbol_type} */ - #define NO_OF_$(printf ${symbol_type} | tr [:lower:] [:upper:])_SYMBOLS \\ - $(printf '\t')(sizeof(gki_${symbol_type}_symbols) / sizeof(gki_${symbol_type}_symbols[0])) + #define NR_$(printf ${symbol_type} | tr [:lower:] 
[:upper:])_SYMBOLS \\ + $(printf '\t')(ARRAY_SIZE(gki_${symbol_type}_symbols)) #define MAX_$(printf ${symbol_type} | tr [:lower:] [:upper:])_NAME_LEN (${max_name_len}) static const char gki_${symbol_type}_symbols[][MAX_$(printf ${symbol_type} | @@ -87,8 +87,15 @@ generate_header() { echo "};" >> "${header_file}" } -# Sorted list of vendor symbols -GKI_VENDOR_SYMBOLS="${OUT_DIR}/abi_symbollist.raw" +if [ "$(basename "${TARGET}")" = "gki_module_unprotected.h" ]; then + # Sorted list of vendor symbols + GKI_VENDOR_SYMBOLS="${OUT_DIR}/abi_symbollist.raw" -generate_header "${TARGET}" "${GKI_VENDOR_SYMBOLS}" "unprotected" + generate_header "${TARGET}" "${GKI_VENDOR_SYMBOLS}" "unprotected" +else + # Sorted list of exported symbols + GKI_EXPORTED_SYMBOLS="${SRCTREE}/android/abi_gki_protected_exports" + + generate_header "${TARGET}" "${GKI_EXPORTED_SYMBOLS}" "protected_exports" +fi From e4a6cb847fc47fa294a9db9bbd64128976b1bbf4 Mon Sep 17 00:00:00 2001 From: Ramji Jiyani Date: Sun, 18 Dec 2022 23:09:16 -0800 Subject: [PATCH 269/457] ANDROID: GKI: Only protect exports if KMI symbols are present Only enforce export protection if there are symbols in the unprotected list for the Kernel Module Interface (KMI). This is only relevant for targets like arm64 that have defined ABI symbol lists. This allows non-GKI targets like arm and x86 to continue using GKI source code without disabling the feature for those targets. Bug: 232430739 Test: TH Fixes: fd1e76886660 ("ANDROID: GKI: Protect exports of protected GKI modules") Change-Id: Ie89e8f63eda99d9b7aacd1bb76d036b3ff4ba37c Signed-off-by: Ramji Jiyani (cherry picked from commit a6eaf3db80788a1445a7a2e05503f32610d79486) --- kernel/module/gki_module.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/kernel/module/gki_module.c b/kernel/module/gki_module.c index fc2968c3791e..4f124f9a14ec 100644 --- a/kernel/module/gki_module.c +++ b/kernel/module/gki_module.c @@ -34,8 +34,17 @@ static int cmp_name(const void *sym, const void *protected_sym) */ bool gki_is_module_protected_export(const char *name) { - return bsearch(name, gki_protected_exports_symbols, NR_PROTECTED_EXPORTS_SYMBOLS, + if (NR_UNPROTECTED_SYMBOLS) { + return bsearch(name, gki_protected_exports_symbols, NR_PROTECTED_EXPORTS_SYMBOLS, MAX_PROTECTED_EXPORTS_NAME_LEN, cmp_name) != NULL; + } else { + /* + * If there are no symbols in unprotected list; We don't need to + * protect exports as there is no KMI enforcement. + * Treat everything exportable in this case. + */ + return false; + } } /** @@ -52,7 +61,7 @@ bool gki_is_module_unprotected_symbol(const char *name) /* * If there are no symbols in unprotected list; * there isn't a KMI enforcement for the kernel. - * Treat evertything accessible in this case. + * Treat everything accessible in this case. */ return true; } From 84b1c3a4293a0237f8756fed5c89cebf008ca9c6 Mon Sep 17 00:00:00 2001 From: Ramji Jiyani Date: Fri, 16 Dec 2022 10:17:09 +0000 Subject: [PATCH 270/457] ANDROID: GKI: Add list of protected GKI modules android/gki_protected_modules serves as a running list of protected GKI modules. This list is being used as an input to generate list of protected GKI modules exports at android/abi_gki_protected_exports All GKI modules are protected except zram.ko & zsmalloc.ko as baseline in this list. 
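These module paths are only the input; at build time gen_gki_modules_headers.sh turns the corresponding symbol list into a sorted, fixed-width string table, and gki_module.c resolves lookups with bsearch() over that table, which is why the generated lists must stay sorted. The same technique reduced to a small userspace program, using a hypothetical table with three real exported symbol names purely as examples:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    #define MAX_NAME_LEN 32     /* stand-in for MAX_PROTECTED_EXPORTS_NAME_LEN */

    /* Must stay lexicographically sorted, like the generated gki_*_symbols[] tables. */
    static const char symbols[][MAX_NAME_LEN] = {
            "cfg80211_get_bss",
            "hci_register_dev",
            "ppp_input",
    };

    static int cmp_name(const void *sym, const void *tbl_entry)
    {
            return strncmp(sym, tbl_entry, MAX_NAME_LEN);
    }

    static int is_protected_export(const char *name)
    {
            return bsearch(name, symbols, sizeof(symbols) / sizeof(symbols[0]),
                           MAX_NAME_LEN, cmp_name) != NULL;
    }

    int main(void)
    {
            printf("%d %d\n", is_protected_export("hci_register_dev"),
                   is_protected_export("made_up_symbol"));      /* prints "1 0" */
            return 0;
    }

In the kernel the same comparison callback serves both tables, which is why cmp_name() above was widened to MAX_STRCMP_LEN, the larger of the two per-table name lengths.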
Bug: 232430739 Test: TH Change-Id: I0c993769b9d07543755fd056199b0e4d10d27f77 Signed-off-by: Ramji Jiyani (cherry picked from commit 90ff743687851d57282e3bc1f75875c660998e46) --- android/gki_protected_modules | 47 +++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 android/gki_protected_modules diff --git a/android/gki_protected_modules b/android/gki_protected_modules new file mode 100644 index 000000000000..fa169fc3f527 --- /dev/null +++ b/android/gki_protected_modules @@ -0,0 +1,47 @@ +drivers/bluetooth/btbcm.ko +drivers/bluetooth/btqca.ko +drivers/bluetooth/btsdio.ko +drivers/bluetooth/hci_uart.ko +drivers/net/can/dev/can-dev.ko +drivers/net/can/slcan/slcan.ko +drivers/net/can/vcan.ko +drivers/net/ppp/bsd_comp.ko +drivers/net/ppp/ppp_deflate.ko +drivers/net/ppp/ppp_generic.ko +drivers/net/ppp/ppp_mppe.ko +drivers/net/ppp/pppox.ko +drivers/net/ppp/pptp.ko +drivers/net/slip/slhc.ko +drivers/usb/class/cdc-acm.ko +drivers/usb/serial/ftdi_sio.ko +drivers/usb/serial/usbserial.ko +lib/crypto/libarc4.ko +net/6lowpan/6lowpan.ko +net/6lowpan/nhc_dest.ko +net/6lowpan/nhc_fragment.ko +net/6lowpan/nhc_hop.ko +net/6lowpan/nhc_ipv6.ko +net/6lowpan/nhc_mobility.ko +net/6lowpan/nhc_routing.ko +net/6lowpan/nhc_udp.ko +net/8021q/8021q.ko +net/bluetooth/bluetooth.ko +net/bluetooth/hidp/hidp.ko +net/bluetooth/rfcomm/rfcomm.ko +net/can/can.ko +net/can/can-bcm.ko +net/can/can-gw.ko +net/can/can-raw.ko +net/ieee802154/6lowpan/ieee802154_6lowpan.ko +net/ieee802154/ieee802154.ko +net/ieee802154/ieee802154_socket.ko +net/l2tp/l2tp_core.ko +net/l2tp/l2tp_ppp.ko +net/mac80211/mac80211.ko +net/mac802154/mac802154.ko +net/nfc/nfc.ko +net/rfkill/rfkill.ko +net/tipc/diag.ko +net/tipc/tipc.ko +net/wireless/cfg80211.ko + From 1a89a59e1aa3e9809b7fea075039be8a733b308a Mon Sep 17 00:00:00 2001 From: Ramji Jiyani Date: Fri, 16 Dec 2022 10:25:26 +0000 Subject: [PATCH 271/457] ANDROID: GKI: Update GKI modules protected exports UPDATED: Cherry pick updated with exports for the protected modules for kernel 6.1 Update protected export symbols list with exports from list of protected modules at android/gki_protected_modules. It includes symbols from every GKI modules except zram & zsmalloc; and serves as a baseline. 
Bug: 232430739 Test: TH Change-Id: Iec33dfe093b4e9e0281b910b2b3bf998cef55394 Signed-off-by: Ramji Jiyani (cherry picked from commit 16c63232db1d56d03522a0e1945f3ec6a0e3ee8d) --- android/abi_gki_protected_exports | 538 ++++++++++++++++++++++++++++++ 1 file changed, 538 insertions(+) diff --git a/android/abi_gki_protected_exports b/android/abi_gki_protected_exports index e69de29bb2d1..0cfdef38622b 100644 --- a/android/abi_gki_protected_exports +++ b/android/abi_gki_protected_exports @@ -0,0 +1,538 @@ +__cfg80211_alloc_event_skb +__cfg80211_alloc_reply_skb +__cfg80211_radar_event +__cfg80211_send_event_skb +__hci_cmd_send +__hci_cmd_sync +__hci_cmd_sync_ev +__hci_cmd_sync_sk +__hci_cmd_sync_status +__hci_cmd_sync_status_sk +__ieee80211_schedule_txq +__nfc_alloc_vendor_cmd_reply_skb +alloc_can_err_skb +alloc_can_skb +alloc_candev_mqs +alloc_canfd_skb +alloc_canxl_skb +arc4_crypt +arc4_setkey +baswap +bridge_tunnel_header +bt_accept_dequeue +bt_accept_enqueue +bt_accept_unlink +bt_debugfs +bt_err +bt_err_ratelimited +bt_info +bt_procfs_cleanup +bt_procfs_init +bt_sock_ioctl +bt_sock_link +bt_sock_poll +bt_sock_reclassify_lock +bt_sock_recvmsg +bt_sock_register +bt_sock_stream_recvmsg +bt_sock_unlink +bt_sock_unregister +bt_sock_wait_ready +bt_sock_wait_state +bt_status +bt_to_errno +bt_warn +bt_warn_ratelimited +btbcm_check_bdaddr +btbcm_finalize +btbcm_initialize +btbcm_patchram +btbcm_read_pcm_int_params +btbcm_set_bdaddr +btbcm_setup_apple +btbcm_setup_patchram +btbcm_write_pcm_int_params +can_bus_off +can_change_mtu +can_change_state +can_dropped_invalid_skb +can_eth_ioctl_hwts +can_ethtool_op_get_ts_info_hwts +can_fd_dlc2len +can_fd_len2dlc +can_free_echo_skb +can_get_echo_skb +can_get_state_str +can_proto_register +can_proto_unregister +can_put_echo_skb +can_rx_offload_add_fifo +can_rx_offload_add_manual +can_rx_offload_add_timestamp +can_rx_offload_del +can_rx_offload_enable +can_rx_offload_get_echo_skb +can_rx_offload_irq_finish +can_rx_offload_irq_offload_fifo +can_rx_offload_irq_offload_timestamp +can_rx_offload_queue_tail +can_rx_offload_queue_timestamp +can_rx_offload_threaded_irq_finish +can_rx_register +can_rx_unregister +can_send +can_skb_get_frame_len +can_sock_destruct +cfg80211_any_usable_channels +cfg80211_assoc_comeback +cfg80211_assoc_failure +cfg80211_auth_timeout +cfg80211_background_cac_abort +cfg80211_bss_color_notify +cfg80211_bss_flush +cfg80211_bss_iter +cfg80211_cac_event +cfg80211_calculate_bitrate +cfg80211_ch_switch_notify +cfg80211_ch_switch_started_notify +cfg80211_chandef_compatible +cfg80211_chandef_create +cfg80211_chandef_dfs_required +cfg80211_chandef_usable +cfg80211_chandef_valid +cfg80211_check_combinations +cfg80211_check_station_change +cfg80211_classify8021d +cfg80211_conn_failed +cfg80211_connect_done +cfg80211_control_port_tx_status +cfg80211_cqm_beacon_loss_notify +cfg80211_cqm_pktloss_notify +cfg80211_cqm_rssi_notify +cfg80211_cqm_txe_notify +cfg80211_crit_proto_stopped +cfg80211_del_sta_sinfo +cfg80211_disconnected +cfg80211_external_auth_request +cfg80211_find_elem_match +cfg80211_find_vendor_elem +cfg80211_free_nan_func +cfg80211_ft_event +cfg80211_get_bss +cfg80211_get_drvinfo +cfg80211_get_ies_channel_number +cfg80211_get_iftype_ext_capa +cfg80211_get_p2p_attr +cfg80211_get_station +cfg80211_gtk_rekey_notify +cfg80211_ibss_joined +cfg80211_iftype_allowed +cfg80211_inform_bss_data +cfg80211_inform_bss_frame_data +cfg80211_is_element_inherited +cfg80211_iter_combinations +cfg80211_merge_profile +cfg80211_mgmt_tx_status_ext 
+cfg80211_michael_mic_failure +cfg80211_nan_func_terminated +cfg80211_nan_match +cfg80211_new_sta +cfg80211_notify_new_peer_candidate +cfg80211_pmksa_candidate_notify +cfg80211_pmsr_complete +cfg80211_pmsr_report +cfg80211_port_authorized +cfg80211_probe_status +cfg80211_put_bss +cfg80211_ready_on_channel +cfg80211_ref_bss +cfg80211_reg_can_beacon +cfg80211_reg_can_beacon_relax +cfg80211_register_netdevice +cfg80211_remain_on_channel_expired +cfg80211_report_obss_beacon_khz +cfg80211_report_wowlan_wakeup +cfg80211_roamed +cfg80211_rx_assoc_resp +cfg80211_rx_control_port +cfg80211_rx_mgmt_ext +cfg80211_rx_mlme_mgmt +cfg80211_rx_spurious_frame +cfg80211_rx_unexpected_4addr_frame +cfg80211_rx_unprot_mlme_mgmt +cfg80211_scan_done +cfg80211_sched_scan_results +cfg80211_sched_scan_stopped +cfg80211_sched_scan_stopped_locked +cfg80211_send_layer2_update +cfg80211_shutdown_all_interfaces +cfg80211_sinfo_alloc_tid_stats +cfg80211_sta_opmode_change_notify +cfg80211_stop_iface +cfg80211_tdls_oper_request +cfg80211_tx_mgmt_expired +cfg80211_tx_mlme_mgmt +cfg80211_unlink_bss +cfg80211_unregister_wdev +cfg80211_update_owe_info_event +cfg80211_vendor_cmd_get_sender +cfg80211_vendor_cmd_reply +close_candev +free_candev +freq_reg_info +get_wiphy_regdom +h4_recv_buf +hci_alloc_dev_priv +hci_cmd_sync +hci_cmd_sync_cancel +hci_cmd_sync_queue +hci_conn_check_secure +hci_conn_security +hci_conn_switch_role +hci_free_dev +hci_get_route +hci_mgmt_chan_register +hci_mgmt_chan_unregister +hci_recv_diag +hci_recv_frame +hci_register_cb +hci_register_dev +hci_release_dev +hci_reset_dev +hci_resume_dev +hci_set_fw_info +hci_set_hw_info +hci_suspend_dev +hci_uart_register_device +hci_uart_tx_wakeup +hci_uart_unregister_device +hci_unregister_cb +hci_unregister_dev +hidp_hid_driver +ieee80211_alloc_hw_nm +ieee80211_amsdu_to_8023s +ieee80211_ap_probereq_get +ieee80211_ave_rssi +ieee80211_beacon_cntdwn_is_complete +ieee80211_beacon_get_template +ieee80211_beacon_get_tim +ieee80211_beacon_loss +ieee80211_beacon_set_cntdwn +ieee80211_beacon_update_cntdwn +ieee80211_bss_get_elem +ieee80211_calc_rx_airtime +ieee80211_calc_tx_airtime +ieee80211_chandef_to_operating_class +ieee80211_channel_switch_disconnect +ieee80211_channel_to_freq_khz +ieee80211_chswitch_done +ieee80211_color_change_finish +ieee80211_connection_loss +ieee80211_cqm_beacon_loss_notify +ieee80211_cqm_rssi_notify +ieee80211_csa_finish +ieee80211_ctstoself_duration +ieee80211_ctstoself_get +ieee80211_data_to_8023_exthdr +ieee80211_disable_rssi_reports +ieee80211_disconnect +ieee80211_enable_rssi_reports +ieee80211_find_sta +ieee80211_find_sta_by_ifaddr +ieee80211_find_sta_by_link_addrs +ieee80211_free_hw +ieee80211_free_txskb +ieee80211_freq_khz_to_channel +ieee80211_generic_frame_duration +ieee80211_get_bssid +ieee80211_get_buffered_bc +ieee80211_get_channel_khz +ieee80211_get_fils_discovery_tmpl +ieee80211_get_hdrlen_from_skb +ieee80211_get_key_rx_seq +ieee80211_get_mesh_hdrlen +ieee80211_get_num_supported_channels +ieee80211_get_response_rate +ieee80211_get_tkip_p1k_iv +ieee80211_get_tkip_p2k +ieee80211_get_tkip_rx_p1k +ieee80211_get_tx_rates +ieee80211_get_unsol_bcast_probe_resp_tmpl +ieee80211_get_vht_max_nss +ieee80211_gtk_rekey_add +ieee80211_gtk_rekey_notify +ieee80211_hdrlen +ieee80211_hw_restart_disconnect +ieee80211_ie_split_ric +ieee80211_iter_chan_contexts_atomic +ieee80211_iter_keys +ieee80211_iter_keys_rcu +ieee80211_iterate_active_interfaces_atomic +ieee80211_iterate_active_interfaces_mtx +ieee80211_iterate_interfaces +ieee80211_iterate_stations 
+ieee80211_iterate_stations_atomic +ieee80211_key_mic_failure +ieee80211_key_replay +ieee80211_manage_rx_ba_offl +ieee80211_mandatory_rates +ieee80211_mark_rx_ba_filtered_frames +ieee80211_nan_func_match +ieee80211_nan_func_terminated +ieee80211_next_txq +ieee80211_nullfunc_get +ieee80211_operating_class_to_band +ieee80211_parse_p2p_noa +ieee80211_probereq_get +ieee80211_proberesp_get +ieee80211_pspoll_get +ieee80211_queue_delayed_work +ieee80211_queue_stopped +ieee80211_queue_work +ieee80211_radar_detected +ieee80211_radiotap_iterator_init +ieee80211_radiotap_iterator_next +ieee80211_rate_control_register +ieee80211_rate_control_unregister +ieee80211_ready_on_channel +ieee80211_register_hw +ieee80211_remain_on_channel_expired +ieee80211_remove_key +ieee80211_report_low_ack +ieee80211_report_wowlan_wakeup +ieee80211_request_smps +ieee80211_reserve_tid +ieee80211_restart_hw +ieee80211_resume_disconnect +ieee80211_rts_duration +ieee80211_rts_get +ieee80211_rx_ba_timer_expired +ieee80211_rx_irqsafe +ieee80211_rx_list +ieee80211_rx_napi +ieee80211_s1g_channel_width +ieee80211_scan_completed +ieee80211_sched_scan_results +ieee80211_sched_scan_stopped +ieee80211_send_bar +ieee80211_send_eosp_nullfunc +ieee80211_set_active_links +ieee80211_set_active_links_async +ieee80211_set_key_rx_seq +ieee80211_sta_block_awake +ieee80211_sta_eosp +ieee80211_sta_ps_transition +ieee80211_sta_pspoll +ieee80211_sta_recalc_aggregates +ieee80211_sta_register_airtime +ieee80211_sta_set_buffered +ieee80211_sta_uapsd_trigger +ieee80211_start_tx_ba_cb_irqsafe +ieee80211_start_tx_ba_session +ieee80211_stop_queue +ieee80211_stop_queues +ieee80211_stop_rx_ba_session +ieee80211_stop_tx_ba_cb_irqsafe +ieee80211_stop_tx_ba_session +ieee80211_tdls_oper_request +ieee80211_tkip_add_iv +ieee80211_tx_dequeue +ieee80211_tx_prepare_skb +ieee80211_tx_rate_update +ieee80211_tx_status +ieee80211_tx_status_8023 +ieee80211_tx_status_ext +ieee80211_tx_status_irqsafe +ieee80211_txq_airtime_check +ieee80211_txq_get_depth +ieee80211_txq_may_transmit +ieee80211_txq_schedule_start +ieee80211_unregister_hw +ieee80211_unreserve_tid +ieee80211_update_mu_groups +ieee80211_update_p2p_noa +ieee80211_vif_to_wdev +ieee80211_wake_queue +ieee80211_wake_queues +ieee802154_alloc_hw +ieee802154_configure_durations +ieee802154_free_hw +ieee802154_hdr_peek +ieee802154_hdr_peek_addrs +ieee802154_hdr_pull +ieee802154_hdr_push +ieee802154_max_payload +ieee802154_register_hw +ieee802154_rx_irqsafe +ieee802154_stop_queue +ieee802154_unregister_hw +ieee802154_wake_queue +ieee802154_xmit_complete +ieee802154_xmit_error +ieee802154_xmit_hw_error +ieeee80211_obss_color_collision_notify +l2cap_add_psm +l2cap_chan_close +l2cap_chan_connect +l2cap_chan_create +l2cap_chan_del +l2cap_chan_list +l2cap_chan_put +l2cap_chan_send +l2cap_chan_set_defaults +l2cap_conn_get +l2cap_conn_put +l2cap_is_socket +l2cap_register_user +l2cap_unregister_user +l2tp_recv_common +l2tp_session_create +l2tp_session_dec_refcount +l2tp_session_delete +l2tp_session_get +l2tp_session_get_by_ifname +l2tp_session_get_nth +l2tp_session_inc_refcount +l2tp_session_register +l2tp_session_set_header_len +l2tp_sk_to_tunnel +l2tp_tunnel_create +l2tp_tunnel_dec_refcount +l2tp_tunnel_delete +l2tp_tunnel_get +l2tp_tunnel_get_nth +l2tp_tunnel_get_session +l2tp_tunnel_inc_refcount +l2tp_tunnel_register +l2tp_udp_encap_recv +l2tp_xmit_skb +lowpan_header_compress +lowpan_header_decompress +lowpan_nhc_add +lowpan_nhc_del +lowpan_register_netdev +lowpan_register_netdevice +lowpan_unregister_netdev 
+lowpan_unregister_netdevice +nfc_add_se +nfc_alloc_recv_skb +nfc_allocate_device +nfc_class +nfc_dep_link_is_up +nfc_driver_failure +nfc_find_se +nfc_fw_download_done +nfc_get_local_general_bytes +nfc_proto_register +nfc_proto_unregister +nfc_register_device +nfc_remove_se +nfc_se_connectivity +nfc_se_transaction +nfc_send_to_raw_sock +nfc_set_remote_general_bytes +nfc_target_lost +nfc_targets_found +nfc_tm_activated +nfc_tm_data_received +nfc_tm_deactivated +nfc_unregister_device +nfc_vendor_cmd_reply +of_can_transceiver +open_candev +ppp_channel_index +ppp_dev_name +ppp_input +ppp_input_error +ppp_output_wakeup +ppp_register_channel +ppp_register_compressor +ppp_register_net_channel +ppp_unit_number +ppp_unregister_channel +ppp_unregister_compressor +pppox_compat_ioctl +pppox_ioctl +pppox_unbind_sock +qca_read_soc_version +qca_send_pre_shutdown_cmd +qca_set_bdaddr +qca_set_bdaddr_rome +qca_uart_setup +rate_control_set_rates +reg_initiator_name +reg_query_regdb_wmm +register_candev +register_pppox_proto +regulatory_hint +regulatory_pre_cac_allowed +regulatory_set_wiphy_regd +regulatory_set_wiphy_regd_sync +rfc1042_header +rfkill_alloc +rfkill_blocked +rfkill_destroy +rfkill_find_type +rfkill_get_led_trigger_name +rfkill_init_sw_state +rfkill_pause_polling +rfkill_register +rfkill_resume_polling +rfkill_set_hw_state_reason +rfkill_set_led_trigger_name +rfkill_set_states +rfkill_set_sw_state +rfkill_soft_blocked +rfkill_unregister +safe_candev_priv +slhc_compress +slhc_free +slhc_init +slhc_remember +slhc_toss +slhc_uncompress +tipc_dump_done +tipc_dump_start +tipc_nl_sk_walk +tipc_sk_fill_sock_diag +unregister_candev +unregister_pppox_proto +usb_serial_claim_interface +usb_serial_deregister_drivers +usb_serial_generic_chars_in_buffer +usb_serial_generic_close +usb_serial_generic_get_icount +usb_serial_generic_open +usb_serial_generic_process_read_urb +usb_serial_generic_read_bulk_callback +usb_serial_generic_resume +usb_serial_generic_submit_read_urbs +usb_serial_generic_throttle +usb_serial_generic_tiocmiwait +usb_serial_generic_unthrottle +usb_serial_generic_wait_until_sent +usb_serial_generic_write +usb_serial_generic_write_bulk_callback +usb_serial_generic_write_start +usb_serial_handle_dcd_change +usb_serial_port_softint +usb_serial_register_drivers +usb_serial_resume +usb_serial_suspend +wdev_chandef +wdev_to_ieee80211_vif +wiphy_apply_custom_regulatory +wiphy_free +wiphy_new_nm +wiphy_read_of_freq_limits +wiphy_register +wiphy_rfkill_set_hw_state_reason +wiphy_rfkill_start_polling +wiphy_to_ieee80211_hw +wiphy_unregister +wpan_phy_find +wpan_phy_for_each +wpan_phy_free +wpan_phy_new +wpan_phy_register +wpan_phy_unregister \ No newline at end of file From fa2ace3965b40dd87eb09c1888c0698a2321859a Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Mon, 24 Oct 2022 19:30:12 +0200 Subject: [PATCH 272/457] f2fs: should put a page when checking the summary info The commit introduces another bug. 
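Concretely, is_alive() takes a reference on the summary node page via f2fs_get_node_page(), and the sanity check added by the earlier commit returned without dropping it, leaking the page whenever an inconsistent summary was seen. The shape of the fix, as a sketch rather than the full function:

    node_page = f2fs_get_node_page(sbi, nid);
    if (IS_ERR(node_page))
            return false;

    if (ofs_in_node >= max_addrs) {
            /* log the inconsistency, then bail out */
            f2fs_put_page(node_page, 1);    /* the release that was missing */
            return false;
    }

Pairing every early return with f2fs_put_page() keeps the node page's reference count balanced.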
Cc: stable@vger.kernel.org Fixes: c6ad7fd16657e ("f2fs: fix to do sanity check on summary info") Signed-off-by: Pavel Machek Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 4546e01b2ee0..dab794225cce 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1110,6 +1110,7 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, if (ofs_in_node >= max_addrs) { f2fs_err(sbi, "Inconsistent ofs_in_node:%u in summary, ino:%u, nid:%u, max:%u", ofs_in_node, dni->ino, dni->nid, max_addrs); + f2fs_put_page(node_page, 1); return false; } From eaf4d7509b5720c20110424680dfb9edc96d3059 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 17 Oct 2022 17:52:05 -0700 Subject: [PATCH 273/457] f2fs: let's avoid to get cp_rwsem twice by f2fs_evict_inode by d_invalidate f2fs_unlink -> f2fs_lock_op -> d_invalidate -> shrink_dentry_list -> iput_final -> f2fs_evict_inode -> f2fs_lock_op Reviewed-by: Chao Yu Tested-by: Yangtao Li Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index a389772fd212..e104409c3a0e 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -632,6 +632,8 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) goto fail; } f2fs_delete_entry(de, page, dir, inode); + f2fs_unlock_op(sbi); + #if IS_ENABLED(CONFIG_UNICODE) /* VFS negative dentries are incompatible with Encoding and * Case-insensitiveness. Eventually we'll want avoid @@ -642,8 +644,6 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) if (IS_CASEFOLDED(dir)) d_invalidate(dentry); #endif - f2fs_unlock_op(sbi); - if (IS_DIRSYNC(dir)) f2fs_sync_fs(sbi->sb, 1); fail: From 8dfb15856f21da556b47a4e8096af80b09cfc444 Mon Sep 17 00:00:00 2001 From: Zhang Qilong Date: Tue, 18 Oct 2022 10:45:32 +0800 Subject: [PATCH 274/457] f2fs: Fix the race condition of resize flag between resizefs Because the set/clear SBI_IS_RESIZEFS flag not between any locks, In the following case: thread1 thread2 ->ioctl(resizefs) ->set RESIZEFS flag ->ioctl(resizefs) ... ->set RESIZEFS flag ->clear RESIZEFS flag ->resizefs stream # No RESIZEFS flag in the stream Also before freeze_super, the resizefs not started, we should not set the SBI_IS_RESIZEFS flag. So move the set/clear SBI_IS_RESIZEFS flag between the cp_mutex and gc_lock. 
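Under the old placement the flag was filesystem-wide state toggled outside any lock, so a second resizefs ioctl could clear SBI_IS_RESIZEFS while the first one was still writing its checkpoint, and that checkpoint then went out without the flag. With the fix, the flag only changes while gc_lock and cp_global_sem are held. A sketch of the resulting f2fs_resize_fs() ordering, with error handling omitted:

    freeze_super(sbi->sb);
    f2fs_down_write(&sbi->gc_lock);
    f2fs_down_write(&sbi->cp_global_sem);

    set_sbi_flag(sbi, SBI_IS_RESIZEFS);     /* set only after both locks are taken */
    err = free_segment_range(sbi, secs, false);
    /* ... shrink metadata, write the checkpoint, update the superblock ... */
    clear_sbi_flag(sbi, SBI_IS_RESIZEFS);   /* cleared before the locks are dropped */

    f2fs_up_write(&sbi->cp_global_sem);
    f2fs_up_write(&sbi->gc_lock);
    thaw_super(sbi->sb);

The point is that no concurrent resize can observe or modify the flag mid-operation.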
Fixes: b4b10061ef98 ("f2fs: refactor resize_fs to avoid meta updates in progress") Signed-off-by: Zhang Xiaoxu Signed-off-by: Zhang Qilong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index dab794225cce..7b4be412cec0 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -2134,8 +2134,6 @@ out_unlock: if (err) return err; - set_sbi_flag(sbi, SBI_IS_RESIZEFS); - freeze_super(sbi->sb); f2fs_down_write(&sbi->gc_lock); f2fs_down_write(&sbi->cp_global_sem); @@ -2151,6 +2149,7 @@ out_unlock: if (err) goto out_err; + set_sbi_flag(sbi, SBI_IS_RESIZEFS); err = free_segment_range(sbi, secs, false); if (err) goto recover_out; @@ -2174,6 +2173,7 @@ out_unlock: f2fs_commit_super(sbi, false); } recover_out: + clear_sbi_flag(sbi, SBI_IS_RESIZEFS); if (err) { set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_err(sbi, "resize_fs failed, should run fsck to repair!"); @@ -2186,6 +2186,5 @@ out_err: f2fs_up_write(&sbi->cp_global_sem); f2fs_up_write(&sbi->gc_lock); thaw_super(sbi->sb); - clear_sbi_flag(sbi, SBI_IS_RESIZEFS); return err; } From 49bebf3d4f7632baed57f73449a20f2899fb2fdd Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 21 Oct 2022 10:34:22 +0800 Subject: [PATCH 275/457] f2fs: fix to invalidate dcc->f2fs_issue_discard in error path Syzbot reports a NULL pointer dereference issue as below: __refcount_add include/linux/refcount.h:193 [inline] __refcount_inc include/linux/refcount.h:250 [inline] refcount_inc include/linux/refcount.h:267 [inline] get_task_struct include/linux/sched/task.h:110 [inline] kthread_stop+0x34/0x1c0 kernel/kthread.c:703 f2fs_stop_discard_thread+0x3c/0x5c fs/f2fs/segment.c:1638 kill_f2fs_super+0x5c/0x194 fs/f2fs/super.c:4522 deactivate_locked_super+0x70/0xe8 fs/super.c:332 deactivate_super+0xd0/0xd4 fs/super.c:363 cleanup_mnt+0x1f8/0x234 fs/namespace.c:1186 __cleanup_mnt+0x20/0x30 fs/namespace.c:1193 task_work_run+0xc4/0x14c kernel/task_work.c:177 exit_task_work include/linux/task_work.h:38 [inline] do_exit+0x26c/0xbe0 kernel/exit.c:795 do_group_exit+0x60/0xe8 kernel/exit.c:925 __do_sys_exit_group kernel/exit.c:936 [inline] __se_sys_exit_group kernel/exit.c:934 [inline] __wake_up_parent+0x0/0x40 kernel/exit.c:934 __invoke_syscall arch/arm64/kernel/syscall.c:38 [inline] invoke_syscall arch/arm64/kernel/syscall.c:52 [inline] el0_svc_common+0x138/0x220 arch/arm64/kernel/syscall.c:142 do_el0_svc+0x48/0x164 arch/arm64/kernel/syscall.c:206 el0_svc+0x58/0x150 arch/arm64/kernel/entry-common.c:636 el0t_64_sync_handler+0x84/0xf0 arch/arm64/kernel/entry-common.c:654 el0t_64_sync+0x18c/0x190 arch/arm64/kernel/entry.S:581 The root cause of this issue is in error path of f2fs_start_discard_thread(), it missed to invalidate dcc->f2fs_issue_discard, later kthread_stop() may access invalid pointer. 
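In other words, an ERR_PTR() value is not NULL, so the later shutdown path handed the bogus pointer straight to kthread_stop(), which dereferenced it. Nulling the field in the error path is enough because the stop side already guards on the pointer; roughly, for context:

    void f2fs_stop_discard_thread(struct f2fs_sb_info *sbi)
    {
            struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;

            if (dcc && dcc->f2fs_issue_discard) {
                    struct task_struct *discard_thread = dcc->f2fs_issue_discard;

                    dcc->f2fs_issue_discard = NULL;
                    kthread_stop(discard_thread);
            }
    }

The general rule holds for any kthread_run() caller: store either a valid task_struct pointer or NULL, never the ERR_PTR returned on failure.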
Fixes: 4d67490498ac ("f2fs: Don't create discard thread when device doesn't support realtime discard") Reported-by: syzbot+035a381ea1afb63f098d@syzkaller.appspotmail.com Reported-by: syzbot+729c925c2d9fc495ddee@syzkaller.appspotmail.com Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index acf3d3fa4363..7a4f7c88b8b9 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2025,8 +2025,10 @@ int f2fs_start_discard_thread(struct f2fs_sb_info *sbi) dcc->f2fs_issue_discard = kthread_run(issue_discard_thread, sbi, "f2fs_discard-%u:%u", MAJOR(dev), MINOR(dev)); - if (IS_ERR(dcc->f2fs_issue_discard)) + if (IS_ERR(dcc->f2fs_issue_discard)) { err = PTR_ERR(dcc->f2fs_issue_discard); + dcc->f2fs_issue_discard = NULL; + } return err; } From 10a14afbdbafb812646954ac6356ea55bea50c1c Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 6 Oct 2022 23:09:28 +0800 Subject: [PATCH 276/457] f2fs: support fault injection for f2fs_is_valid_blkaddr() This patch supports to inject fault into f2fs_is_valid_blkaddr() to simulate accessing inconsistent data/meta block addressses from caller. Usage: a) echo 262144 > /sys/fs/f2fs//inject_type or b) mount -o fault_type=262144 Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.rst | 1 + fs/f2fs/checkpoint.c | 5 +++++ fs/f2fs/f2fs.h | 1 + fs/f2fs/super.c | 1 + 4 files changed, 8 insertions(+) diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index 17df9a02ccff..b797e8ec96ed 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -199,6 +199,7 @@ fault_type=%d Support configuring fault injection type, should be FAULT_SLAB_ALLOC 0x000008000 FAULT_DQUOT_INIT 0x000010000 FAULT_LOCK_OP 0x000020000 + FAULT_BLKADDR 0x000040000 =================== =========== mode=%s Control block allocation mode which supports "adaptive" and "lfs". In "lfs" mode, there should be no random diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 0c82dae082aa..c00694a50222 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -171,6 +171,11 @@ static bool __is_bitmap_valid(struct f2fs_sb_info *sbi, block_t blkaddr, bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type) { + if (time_to_inject(sbi, FAULT_BLKADDR)) { + f2fs_show_injection_info(sbi, FAULT_BLKADDR); + return false; + } + switch (type) { case META_NAT: break; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index e6355a5683b7..f57cb49dc383 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -60,6 +60,7 @@ enum { FAULT_SLAB_ALLOC, FAULT_DQUOT_INIT, FAULT_LOCK_OP, + FAULT_BLKADDR, FAULT_MAX, }; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 3834ead04620..df26fbe2bf58 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -61,6 +61,7 @@ const char *f2fs_fault_name[FAULT_MAX] = { [FAULT_SLAB_ALLOC] = "slab alloc", [FAULT_DQUOT_INIT] = "dquot initialize", [FAULT_LOCK_OP] = "lock_op", + [FAULT_BLKADDR] = "invalid blkaddr", }; void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate, From a4cc4f13faae7d4074eab57ed2d6bf326fc5fa9e Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Tue, 25 Oct 2022 11:08:31 +0800 Subject: [PATCH 277/457] f2fs: remove batched_trim_sections node commit 377224c47118("f2fs: don't split checkpoint in fstrim") obsolete batch mode and related sysfs entry. 
Since this testing sysfs node has been deprecated for a long time, let's remove it. Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 3 --- fs/f2fs/sysfs.c | 5 ----- 2 files changed, 8 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index f57cb49dc383..e990870bdab9 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1063,9 +1063,6 @@ struct f2fs_sm_info { /* a threshold to reclaim prefree segments */ unsigned int rec_prefree_segments; - /* for batched trimming */ - unsigned int trim_sections; /* # of sections to trim */ - struct list_head sit_entry_set; /* sit entry set list */ unsigned int ipu_policy; /* in-place-update policy */ diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index df27afd71ef4..926b7a844362 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -488,9 +488,6 @@ out: return -EINVAL; } - if (!strcmp(a->attr.name, "trim_sections")) - return -EINVAL; - if (!strcmp(a->attr.name, "gc_urgent")) { if (t == 0) { sbi->gc_mode = GC_NORMAL; @@ -790,7 +787,6 @@ F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, mid_discard_issue_time, mid_discard_ F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_discard_issue_time, max_discard_issue_time); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, discard_granularity, discard_granularity); F2FS_RW_ATTR(RESERVED_BLOCKS, f2fs_sb_info, reserved_blocks, reserved_blocks); -F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, batched_trim_sections, trim_sections); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks); @@ -919,7 +915,6 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(max_discard_issue_time), ATTR_LIST(discard_granularity), ATTR_LIST(pending_discard), - ATTR_LIST(batched_trim_sections), ATTR_LIST(ipu_policy), ATTR_LIST(min_ipu_util), ATTR_LIST(min_fsync_blocks), From 34517f1529581a42d0b62478d43eaa1eaefafb92 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Tue, 25 Oct 2022 14:50:24 +0800 Subject: [PATCH 278/457] f2fs: fix gc mode when gc_urgent_high_remaining is 1 Under the current logic, when gc_urgent_high_remaining is set to 1, the mode will be switched to normal at the beginning, instead of running in gc_urgent mode. Let's switch the gc mode back to normal when the gc ends. Fixes: 265576181b4a ("f2fs: remove gc_urgent_high_limited for cleanup") Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 7b4be412cec0..d2e9c280773f 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -96,16 +96,6 @@ static int gc_thread_func(void *data) * invalidated soon after by user update or deletion. * So, I'd like to wait some time to collect dirty segments. 
*/ - if (sbi->gc_mode == GC_URGENT_HIGH) { - spin_lock(&sbi->gc_urgent_high_lock); - if (sbi->gc_urgent_high_remaining) { - sbi->gc_urgent_high_remaining--; - if (!sbi->gc_urgent_high_remaining) - sbi->gc_mode = GC_NORMAL; - } - spin_unlock(&sbi->gc_urgent_high_lock); - } - if (sbi->gc_mode == GC_URGENT_HIGH || sbi->gc_mode == GC_URGENT_MID) { wait_ms = gc_th->urgent_sleep_time; @@ -162,6 +152,15 @@ do_gc: /* balancing f2fs's metadata periodically */ f2fs_balance_fs_bg(sbi, true); next: + if (sbi->gc_mode == GC_URGENT_HIGH) { + spin_lock(&sbi->gc_urgent_high_lock); + if (sbi->gc_urgent_high_remaining) { + sbi->gc_urgent_high_remaining--; + if (!sbi->gc_urgent_high_remaining) + sbi->gc_mode = GC_NORMAL; + } + spin_unlock(&sbi->gc_urgent_high_lock); + } sb_end_write(sbi->sb); } while (!kthread_should_stop()); From a2f3be3942f714c020c03f05812809e302134bbf Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Tue, 25 Oct 2022 16:05:26 +0800 Subject: [PATCH 279/457] f2fs: cleanup in f2fs_create_flush_cmd_control() Just cleanup for readable, no functional changes. Suggested-by: Chao Yu Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 7a4f7c88b8b9..0df47ad80efb 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -620,12 +620,12 @@ int f2fs_create_flush_cmd_control(struct f2fs_sb_info *sbi) { dev_t dev = sbi->sb->s_bdev->bd_dev; struct flush_cmd_control *fcc; - int err = 0; + int err; if (SM_I(sbi)->fcc_info) { fcc = SM_I(sbi)->fcc_info; if (fcc->f2fs_issue_flush) - return err; + return 0; goto init_thread; } @@ -638,7 +638,7 @@ int f2fs_create_flush_cmd_control(struct f2fs_sb_info *sbi) init_llist_head(&fcc->issue_list); SM_I(sbi)->fcc_info = fcc; if (!test_opt(sbi, FLUSH_MERGE)) - return err; + return 0; init_thread: fcc->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi, @@ -650,7 +650,7 @@ init_thread: return err; } - return err; + return 0; } void f2fs_destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free) From 465e787ef52bb9cef4adf674f86de89a8585297b Mon Sep 17 00:00:00 2001 From: Dongdong Zhang Date: Tue, 25 Oct 2022 17:40:36 +0800 Subject: [PATCH 280/457] f2fs: fix normal discard process In the DPOLICY_BG mode, there is a conflict between the two conditions "i + 1 < dpolicy->granularity" and "i < DEFAULT_DISCARD_GRANULARITY". If i = 15, the first condition is false, it will enter the second condition and dispatch all small granularity discards in function __issue_discard_cmd_orderly. The restrictive effect of the first condition to small discards will be invalidated. These two conditions should align. 
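With the background policy's default granularity of 16, i == 15 does not trip the first guard (15 + 1 < 16 is false, so the loop does not break), but the old second guard (15 < 16) then diverted into __issue_discard_cmd_orderly(), which issues exactly the small, LBA-ordered discards the first guard was meant to hold back. Using i + 1 in both comparisons keeps the guards consistent; the loop shape, heavily abridged:

    for (i = MAX_PLIST_NUM - 1; i >= 0; i--) {
            if (i + 1 < dpolicy->granularity)
                    break;                  /* lists below this policy's granularity */

            if (i + 1 < DEFAULT_DISCARD_GRANULARITY && dpolicy->ordered)
                    return __issue_discard_cmd_orderly(sbi, dpolicy);

            /* ... issue commands queued on dcc->pend_list[i] ... */
    }

A later patch in this series replaces the DEFAULT_DISCARD_GRANULARITY constant with the tunable dcc->max_ordered_discard, but the alignment of the two conditions is preserved.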
Fixes: 20ee4382322c ("f2fs: issue small discard by LBA order") Signed-off-by: Dongdong Zhang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 0df47ad80efb..38f6a2bcb158 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1448,7 +1448,7 @@ retry: if (i + 1 < dpolicy->granularity) break; - if (i < DEFAULT_DISCARD_GRANULARITY && dpolicy->ordered) + if (i + 1 < DEFAULT_DISCARD_GRANULARITY && dpolicy->ordered) return __issue_discard_cmd_orderly(sbi, dpolicy); pend_list = &dcc->pend_list[i]; From 9240403e144fd01ac35ff90f81756d16fc30726f Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Tue, 25 Oct 2022 01:54:01 +0800 Subject: [PATCH 281/457] f2fs: add barrier mount option This patch adds a mount option, barrier, in f2fs. The barrier option is the opposite of nobarrier. If this option is set, cache_flush commands are allowed to be issued. Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.rst | 2 ++ fs/f2fs/super.c | 7 +++++++ 2 files changed, 9 insertions(+) diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index b797e8ec96ed..6e67c5e6c7c3 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -154,6 +154,8 @@ nobarrier This option can be used if underlying storage guarantees If this option is set, no cache_flush commands are issued but f2fs still guarantees the write ordering of all the data writes. +barrier If this option is set, cache_flush commands are allowed to be + issued. fastboot This option is used when a system wants to reduce mount time as much as possible, even though normal performance can be sacrificed. diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index df26fbe2bf58..a247027711d8 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -111,6 +111,7 @@ enum { Opt_noinline_dentry, Opt_flush_merge, Opt_noflush_merge, + Opt_barrier, Opt_nobarrier, Opt_fastboot, Opt_extent_cache, @@ -187,6 +188,7 @@ static match_table_t f2fs_tokens = { {Opt_noinline_dentry, "noinline_dentry"}, {Opt_flush_merge, "flush_merge"}, {Opt_noflush_merge, "noflush_merge"}, + {Opt_barrier, "barrier"}, {Opt_nobarrier, "nobarrier"}, {Opt_fastboot, "fastboot"}, {Opt_extent_cache, "extent_cache"}, @@ -807,6 +809,9 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) case Opt_nobarrier: set_opt(sbi, NOBARRIER); break; + case Opt_barrier: + clear_opt(sbi, NOBARRIER); + break; case Opt_fastboot: set_opt(sbi, FASTBOOT); break; @@ -1940,6 +1945,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",flush_merge"); if (test_opt(sbi, NOBARRIER)) seq_puts(seq, ",nobarrier"); + else + seq_puts(seq, ",barrier"); if (test_opt(sbi, FASTBOOT)) seq_puts(seq, ",fastboot"); if (test_opt(sbi, EXTENT_CACHE)) From 9856de5d78af879d2e5ae365ce20f6289ed087ae Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 24 Oct 2022 16:00:35 -0700 Subject: [PATCH 282/457] f2fs: allow to set compression for inlined file The below commit disallows to set compression on empty created file which has a inline_data. Let's fix it. 
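The conversion matters because a freshly created file starts with inline data, and after the commit referenced below the compression checks refuse an inode that still holds an inline payload, so the flag could never be set on an empty new file. Converting the inline inode first clears that state; the enabling branch now reads roughly like this (a sketch of the relevant part of f2fs_setflags_common(), flag bookkeeping omitted):

    /* turning compression on for this inode */
    int err = f2fs_convert_inline_inode(inode);     /* move inline data into a block first */
    if (err)
            return err;

    if (!f2fs_may_compress(inode))
            return -EINVAL;
    if (S_ISREG(inode->i_mode) && F2FS_HAS_BLOCKS(inode))
            return -EINVAL;                         /* only empty regular files qualify */

The ordering is the whole fix: the inline-data check must run after the conversion, not before it.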
Fixes: 7165841d578e ("f2fs: fix to check inline_data during compressed inode conversion") Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 82cda1258227..f96bbfa8b399 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1915,6 +1915,10 @@ static int f2fs_setflags_common(struct inode *inode, u32 iflags, u32 mask) if (!f2fs_disable_compressed_file(inode)) return -EINVAL; } else { + /* try to convert inline_data to support compression */ + int err = f2fs_convert_inline_inode(inode); + if (err) + return err; if (!f2fs_may_compress(inode)) return -EINVAL; if (S_ISREG(inode->i_mode) && F2FS_HAS_BLOCKS(inode)) From e79b89649ac5dbe46148aa7f8d0ab57d41b5b9fb Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Tue, 25 Oct 2022 16:32:26 +0800 Subject: [PATCH 283/457] f2fs: introduce max_ordered_discard sysfs node The current max_ordered_discard is a fixed value, change it to be configurable through the sys node. Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 6 ++++++ fs/f2fs/f2fs.h | 3 +++ fs/f2fs/segment.c | 3 ++- fs/f2fs/sysfs.c | 11 +++++++++++ 4 files changed, 22 insertions(+), 1 deletion(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 483639fb727b..53f70eadec96 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -99,6 +99,12 @@ Description: Controls the issue rate of discard commands that consist of small checkpoint is triggered, and issued during the checkpoint. By default, it is disabled with 0. +What: /sys/fs/f2fs//max_ordered_discard +Date: October 2022 +Contact: "Yangtao Li" +Description: Controls the maximum ordered discard, the unit size is one block(4KB). + Set it to 16 by default. + What: /sys/fs/f2fs//max_discard_request Date: December 2021 Contact: "Konstantin Vyshetsky" diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index e990870bdab9..fa8dc00dfb2b 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -331,6 +331,8 @@ struct discard_entry { /* default discard granularity of inner discard thread, unit: block count */ #define DEFAULT_DISCARD_GRANULARITY 16 +/* default maximum discard granularity of ordered discard, unit: block count */ +#define DEFAULT_MAX_ORDERED_DISCARD_GRANULARITY 16 /* max discard pend list number */ #define MAX_PLIST_NUM 512 @@ -410,6 +412,7 @@ struct discard_cmd_control { unsigned int mid_discard_issue_time; /* mid. interval between discard issue */ unsigned int max_discard_issue_time; /* max. 
interval between discard issue */ unsigned int discard_granularity; /* discard granularity */ + unsigned int max_ordered_discard; /* maximum discard granularity issued by lba order */ unsigned int undiscard_blks; /* # of undiscard blocks */ unsigned int next_pos; /* next discard position */ atomic_t issued_discard; /* # of issued discard */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 38f6a2bcb158..c470b443615f 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1448,7 +1448,7 @@ retry: if (i + 1 < dpolicy->granularity) break; - if (i + 1 < DEFAULT_DISCARD_GRANULARITY && dpolicy->ordered) + if (i + 1 < dcc->max_ordered_discard && dpolicy->ordered) return __issue_discard_cmd_orderly(sbi, dpolicy); pend_list = &dcc->pend_list[i]; @@ -2048,6 +2048,7 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) return -ENOMEM; dcc->discard_granularity = DEFAULT_DISCARD_GRANULARITY; + dcc->max_ordered_discard = DEFAULT_MAX_ORDERED_DISCARD_GRANULARITY; if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SEGMENT) dcc->discard_granularity = sbi->blocks_per_seg; else if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SECTION) diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 926b7a844362..8095345ebdad 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -483,6 +483,15 @@ out: return count; } + if (!strcmp(a->attr.name, "max_ordered_discard")) { + if (t == 0 || t > MAX_PLIST_NUM) + return -EINVAL; + if (!f2fs_block_unit_discard(sbi)) + return -EINVAL; + *ui = t; + return count; + } + if (!strcmp(a->attr.name, "migration_granularity")) { if (t == 0 || t > sbi->segs_per_sec) return -EINVAL; @@ -786,6 +795,7 @@ F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, min_discard_issue_time, min_discard_ F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, mid_discard_issue_time, mid_discard_issue_time); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_discard_issue_time, max_discard_issue_time); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, discard_granularity, discard_granularity); +F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_ordered_discard, max_ordered_discard); F2FS_RW_ATTR(RESERVED_BLOCKS, f2fs_sb_info, reserved_blocks, reserved_blocks); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); @@ -914,6 +924,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(mid_discard_issue_time), ATTR_LIST(max_discard_issue_time), ATTR_LIST(discard_granularity), + ATTR_LIST(max_ordered_discard), ATTR_LIST(pending_discard), ATTR_LIST(ipu_policy), ATTR_LIST(min_ipu_util), From 948e16866ee3834f497bfd7597b6b06d9795e522 Mon Sep 17 00:00:00 2001 From: Keoseong Park Date: Thu, 27 Oct 2022 20:01:05 +0900 Subject: [PATCH 284/457] f2fs: Fix typo in comments Change "truncateion" to "truncation". 
Signed-off-by: Keoseong Park Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index f96bbfa8b399..c605a4f2bce2 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -571,7 +571,7 @@ void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count) raw_node = F2FS_NODE(dn->node_page); addr = blkaddr_in_node(raw_node) + base + ofs; - /* Assumption: truncateion starts with cluster */ + /* Assumption: truncation starts with cluster */ for (; count > 0; count--, addr++, dn->ofs_in_node++, cluster_index++) { block_t blkaddr = le32_to_cpu(*addr); From d7c136b3c0fb8c71a8ad841b0a287953dca71603 Mon Sep 17 00:00:00 2001 From: Mukesh Ojha Date: Thu, 27 Oct 2022 14:42:40 +0530 Subject: [PATCH 285/457] f2fs: fix the assign logic of iocb commit 18ae8d12991b ("f2fs: show more DIO information in tracepoint") introduces iocb field in 'f2fs_direct_IO_enter' trace event And it only assigns the pointer and later it accesses its field in trace print log. Unable to handle kernel paging request at virtual address ffffffc04cef3d30 Mem abort info: ESR = 0x96000007 EC = 0x25: DABT (current EL), IL = 32 bits pc : trace_raw_output_f2fs_direct_IO_enter+0x54/0xa4 lr : trace_raw_output_f2fs_direct_IO_enter+0x2c/0xa4 sp : ffffffc0443cbbd0 x29: ffffffc0443cbbf0 x28: ffffff8935b120d0 x27: ffffff8935b12108 x26: ffffff8935b120f0 x25: ffffff8935b12100 x24: ffffff8935b110c0 x23: ffffff8935b10000 x22: ffffff88859a936c x21: ffffff88859a936c x20: ffffff8935b110c0 x19: ffffff8935b10000 x18: ffffffc03b195060 x17: ffffff8935b11e76 x16: 00000000000000cc x15: ffffffef855c4f2c x14: 0000000000000001 x13: 000000000000004e x12: ffff0000ffffff00 x11: ffffffef86c350d0 x10: 00000000000010c0 x9 : 000000000fe0002c x8 : ffffffc04cef3d28 x7 : 7f7f7f7f7f7f7f7f x6 : 0000000002000000 x5 : ffffff8935b11e9a x4 : 0000000000006250 x3 : ffff0a00ffffff04 x2 : 0000000000000002 x1 : ffffffef86a0a31f x0 : ffffff8935b10000 Call trace: trace_raw_output_f2fs_direct_IO_enter+0x54/0xa4 print_trace_fmt+0x9c/0x138 print_trace_line+0x154/0x254 tracing_read_pipe+0x21c/0x380 vfs_read+0x108/0x3ac ksys_read+0x7c/0xec __arm64_sys_read+0x20/0x30 invoke_syscall+0x60/0x150 el0_svc_common.llvm.1237943816091755067+0xb8/0xf8 do_el0_svc+0x28/0xa0 Fix it by copying the required variables for printing and while at it fix the similar issue at some other places in the same file. 
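The rule being applied is that TP_fast_assign() runs when the event fires, but TP_printk() can run much later, when the trace buffer is read, by which time a stored struct kiocb * or dentry-name pointer may point at freed memory; only data copied into the ring-buffer entry is safe to format. A minimal illustration of the idiom, using a hypothetical event name rather than a real f2fs tracepoint:

    TRACE_EVENT(sample_event,
            TP_PROTO(struct inode *inode, const char *msg),
            TP_ARGS(inode, msg),

            TP_STRUCT__entry(
                    __field(ino_t, ino)
                    __string(msg, msg)              /* reserves space for a copy */
            ),

            TP_fast_assign(
                    __entry->ino = inode->i_ino;
                    __assign_str(msg, msg);         /* copy while msg is still valid */
            ),

            TP_printk("ino = %lu msg = %s",
                      (unsigned long)__entry->ino, __get_str(msg))
    );

The f2fs_direct_IO_enter fix follows the same logic with plain __field() copies of ki_pos, ki_flags and ki_ioprio, since those are scalars rather than strings.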
Fixes: bd984c03097b ("f2fs: show more DIO information in tracepoint") Signed-off-by: Mukesh Ojha Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- include/trace/events/f2fs.h | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index c6b372401c27..ff57e7f9914c 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -322,7 +322,7 @@ TRACE_EVENT(f2fs_unlink_enter, __field(ino_t, ino) __field(loff_t, size) __field(blkcnt_t, blocks) - __field(const char *, name) + __string(name, dentry->d_name.name) ), TP_fast_assign( @@ -330,7 +330,7 @@ TRACE_EVENT(f2fs_unlink_enter, __entry->ino = dir->i_ino; __entry->size = dir->i_size; __entry->blocks = dir->i_blocks; - __entry->name = dentry->d_name.name; + __assign_str(name, dentry->d_name.name); ), TP_printk("dev = (%d,%d), dir ino = %lu, i_size = %lld, " @@ -338,7 +338,7 @@ TRACE_EVENT(f2fs_unlink_enter, show_dev_ino(__entry), __entry->size, (unsigned long long)__entry->blocks, - __entry->name) + __get_str(name)) ); DEFINE_EVENT(f2fs__inode_exit, f2fs_unlink_exit, @@ -940,25 +940,29 @@ TRACE_EVENT(f2fs_direct_IO_enter, TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) - __field(struct kiocb *, iocb) + __field(loff_t, ki_pos) + __field(int, ki_flags) + __field(u16, ki_ioprio) __field(unsigned long, len) __field(int, rw) ), TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; - __entry->ino = inode->i_ino; - __entry->iocb = iocb; - __entry->len = len; - __entry->rw = rw; + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->ki_pos = iocb->ki_pos; + __entry->ki_flags = iocb->ki_flags; + __entry->ki_ioprio = iocb->ki_ioprio; + __entry->len = len; + __entry->rw = rw; ), TP_printk("dev = (%d,%d), ino = %lu pos = %lld len = %lu ki_flags = %x ki_ioprio = %x rw = %d", show_dev_ino(__entry), - __entry->iocb->ki_pos, + __entry->ki_pos, __entry->len, - __entry->iocb->ki_flags, - __entry->iocb->ki_ioprio, + __entry->ki_flags, + __entry->ki_ioprio, __entry->rw) ); @@ -1407,19 +1411,19 @@ TRACE_EVENT(f2fs_write_checkpoint, TP_STRUCT__entry( __field(dev_t, dev) __field(int, reason) - __field(char *, msg) + __string(dest_msg, msg) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->reason = reason; - __entry->msg = msg; + __assign_str(dest_msg, msg); ), TP_printk("dev = (%d,%d), checkpoint for %s, state = %s", show_dev(__entry->dev), show_cpreason(__entry->reason), - __entry->msg) + __get_str(dest_msg)) ); DECLARE_EVENT_CLASS(f2fs_discard, From 62cab7f374aba0759f129d8026e590a7c89cb1e3 Mon Sep 17 00:00:00 2001 From: Mukesh Ojha Date: Thu, 27 Oct 2022 14:42:41 +0530 Subject: [PATCH 286/457] f2fs: fix the msg data type Data type of msg in f2fs_write_checkpoint trace should be const char * instead of char *. 
Signed-off-by: Mukesh Ojha Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- include/trace/events/f2fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index ff57e7f9914c..7fbfce498472 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -1404,7 +1404,7 @@ TRACE_EVENT(f2fs_readpages, TRACE_EVENT(f2fs_write_checkpoint, - TP_PROTO(struct super_block *sb, int reason, char *msg), + TP_PROTO(struct super_block *sb, int reason, const char *msg), TP_ARGS(sb, reason, msg), From e4e33a3c566e32cb179b2b0d660c346907b5ef39 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Thu, 27 Oct 2022 18:24:46 +0800 Subject: [PATCH 287/457] f2fs: fix return val in f2fs_start_ckpt_thread() Return PTR_ERR(cprc->f2fs_issue_ckpt) instead of -ENOMEM; Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 4 +++- fs/f2fs/gc.c | 15 +++++++-------- fs/f2fs/segment.c | 4 ++-- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index c00694a50222..56f7d0d6a8b2 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1902,8 +1902,10 @@ int f2fs_start_ckpt_thread(struct f2fs_sb_info *sbi) cprc->f2fs_issue_ckpt = kthread_run(issue_checkpoint_thread, sbi, "f2fs_ckpt-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(cprc->f2fs_issue_ckpt)) { + int err = PTR_ERR(cprc->f2fs_issue_ckpt); + cprc->f2fs_issue_ckpt = NULL; - return -ENOMEM; + return err; } set_task_ioprio(cprc->f2fs_issue_ckpt, cprc->ckpt_thread_ioprio); diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index d2e9c280773f..15f56859966c 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -171,13 +171,10 @@ int f2fs_start_gc_thread(struct f2fs_sb_info *sbi) { struct f2fs_gc_kthread *gc_th; dev_t dev = sbi->sb->s_bdev->bd_dev; - int err = 0; gc_th = f2fs_kmalloc(sbi, sizeof(struct f2fs_gc_kthread), GFP_KERNEL); - if (!gc_th) { - err = -ENOMEM; - goto out; - } + if (!gc_th) + return -ENOMEM; gc_th->urgent_sleep_time = DEF_GC_THREAD_URGENT_SLEEP_TIME; gc_th->min_sleep_time = DEF_GC_THREAD_MIN_SLEEP_TIME; @@ -192,12 +189,14 @@ int f2fs_start_gc_thread(struct f2fs_sb_info *sbi) sbi->gc_thread->f2fs_gc_task = kthread_run(gc_thread_func, sbi, "f2fs_gc-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(gc_th->f2fs_gc_task)) { - err = PTR_ERR(gc_th->f2fs_gc_task); + int err = PTR_ERR(gc_th->f2fs_gc_task); + kfree(gc_th); sbi->gc_thread = NULL; + return err; } -out: - return err; + + return 0; } void f2fs_stop_gc_thread(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index c470b443615f..c4270cd6eaab 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -620,7 +620,6 @@ int f2fs_create_flush_cmd_control(struct f2fs_sb_info *sbi) { dev_t dev = sbi->sb->s_bdev->bd_dev; struct flush_cmd_control *fcc; - int err; if (SM_I(sbi)->fcc_info) { fcc = SM_I(sbi)->fcc_info; @@ -644,7 +643,8 @@ init_thread: fcc->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi, "f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(fcc->f2fs_issue_flush)) { - err = PTR_ERR(fcc->f2fs_issue_flush); + int err = PTR_ERR(fcc->f2fs_issue_flush); + kfree(fcc); SM_I(sbi)->fcc_info = NULL; return err; From fde8597ba51c302215bed83e2311faaa346d198a Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 28 Oct 2022 17:30:26 +0800 Subject: [PATCH 288/457] f2fs: fix to destroy sbi->post_read_wq in error path of f2fs_fill_super() In error path of f2fs_fill_super(), this patch fixes to call 
f2fs_destroy_post_read_wq() once if we fail in f2fs_start_ckpt_thread(). Fixes: 261eeb9c1585 ("f2fs: introduce checkpoint_merge mount option") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index a247027711d8..e6365f040171 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -4531,9 +4531,9 @@ free_nm: f2fs_destroy_node_manager(sbi); free_sm: f2fs_destroy_segment_manager(sbi); - f2fs_destroy_post_read_wq(sbi); stop_ckpt_thread: f2fs_stop_ckpt_thread(sbi); + f2fs_destroy_post_read_wq(sbi); free_devices: destroy_device_list(sbi); kvfree(sbi->ckpt); From 6b11fb814e0aab500472773ddc942a4171fad706 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Tue, 25 Oct 2022 11:32:16 +0800 Subject: [PATCH 289/457] f2fs: introduce gc_mode sysfs node Revert "f2fs: make gc_urgent and gc_segment_mode sysfs node readable". Add a gc_mode sysfs node to show the current gc_mode as a string. Signed-off-by: Yangtao Li Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 6 ++++++ fs/f2fs/f2fs.h | 1 + fs/f2fs/sysfs.c | 15 +++++++++------ 3 files changed, 16 insertions(+), 6 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 53f70eadec96..ef2b3572ba18 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -640,3 +640,9 @@ Date: July 2022 Contact: "Daeho Jeong" Description: Show the accumulated total revoked atomic write block count after boot. If you write "0" here, you can initialize to "0". + +What: /sys/fs/f2fs//gc_mode +Date: October 2022 +Contact: "Yangtao Li" +Description: Show the current gc_mode as a string. + This is a read-only entry. 
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index fa8dc00dfb2b..662b27c19de1 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1319,6 +1319,7 @@ enum { MAX_TIME, }; +/* Note that you need to keep synchronization with this gc_mode_names array */ enum { GC_NORMAL, GC_IDLE_CB, diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 8095345ebdad..1fbd41c48328 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -143,6 +143,12 @@ static ssize_t pending_discard_show(struct f2fs_attr *a, &SM_I(sbi)->dcc_info->discard_cmd_cnt)); } +static ssize_t gc_mode_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + return sysfs_emit(buf, "%s\n", gc_mode_names[sbi->gc_mode]); +} + static ssize_t features_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { @@ -332,13 +338,8 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a, return sysfs_emit(buf, "%u\n", sbi->compr_new_inode); #endif - if (!strcmp(a->attr.name, "gc_urgent")) - return sysfs_emit(buf, "%s\n", - gc_mode_names[sbi->gc_mode]); - if (!strcmp(a->attr.name, "gc_segment_mode")) - return sysfs_emit(buf, "%s\n", - gc_mode_names[sbi->gc_segment_mode]); + return sysfs_emit(buf, "%u\n", sbi->gc_segment_mode); if (!strcmp(a->attr.name, "gc_reclaimed_segments")) { return sysfs_emit(buf, "%u\n", @@ -844,6 +845,7 @@ F2FS_GENERAL_RO_ATTR(encoding); F2FS_GENERAL_RO_ATTR(mounted_time_sec); F2FS_GENERAL_RO_ATTR(main_blkaddr); F2FS_GENERAL_RO_ATTR(pending_discard); +F2FS_GENERAL_RO_ATTR(gc_mode); #ifdef CONFIG_F2FS_STAT_FS F2FS_STAT_ATTR(STAT_INFO, f2fs_stat_info, cp_foreground_calls, cp_count); F2FS_STAT_ATTR(STAT_INFO, f2fs_stat_info, cp_background_calls, bg_cp_count); @@ -926,6 +928,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(discard_granularity), ATTR_LIST(max_ordered_discard), ATTR_LIST(pending_discard), + ATTR_LIST(gc_mode), ATTR_LIST(ipu_policy), ATTR_LIST(min_ipu_util), ATTR_LIST(min_fsync_blocks), From 19ca1814ba8facb838c9d7d528e114271a823b88 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 28 Oct 2022 09:49:53 -0700 Subject: [PATCH 290/457] f2fs: use sysfs_emit instead of sprintf Let's use sysfs_emit. 
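As an illustrative aside (not part of the patch), the conversion is mechanical: every ->show() callback fills a PAGE_SIZE sysfs buffer, and sysfs_emit() enforces that bound and the buffer alignment instead of trusting a bare sprintf(). A minimal sketch of the resulting pattern, with a made-up attribute name, in the context of fs/f2fs/sysfs.c:

	static ssize_t example_show(struct f2fs_attr *a,
				    struct f2fs_sb_info *sbi, char *buf)
	{
		/* sysfs_emit() never writes past PAGE_SIZE and warns if 'buf'
		 * is not the start of a sysfs page, unlike sprintf(). */
		return sysfs_emit(buf, "%u\n", sbi->current_reserved_blocks);
	}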
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/sysfs.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 1fbd41c48328..af23ed6121b0 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -95,28 +95,28 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) static ssize_t dirty_segments_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - return sprintf(buf, "%llu\n", + return sysfs_emit(buf, "%llu\n", (unsigned long long)(dirty_segments(sbi))); } static ssize_t free_segments_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - return sprintf(buf, "%llu\n", + return sysfs_emit(buf, "%llu\n", (unsigned long long)(free_segments(sbi))); } static ssize_t ovp_segments_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - return sprintf(buf, "%llu\n", + return sysfs_emit(buf, "%llu\n", (unsigned long long)(overprovision_segments(sbi))); } static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - return sprintf(buf, "%llu\n", + return sysfs_emit(buf, "%llu\n", (unsigned long long)(sbi->kbytes_written + ((f2fs_get_sectors_written(sbi) - sbi->sectors_written_start) >> 1))); @@ -125,13 +125,13 @@ static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a, static ssize_t sb_status_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - return sprintf(buf, "%lx\n", sbi->s_flag); + return sysfs_emit(buf, "%lx\n", sbi->s_flag); } static ssize_t cp_status_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - return sprintf(buf, "%x\n", le32_to_cpu(F2FS_CKPT(sbi)->ckpt_flags)); + return sysfs_emit(buf, "%x\n", le32_to_cpu(F2FS_CKPT(sbi)->ckpt_flags)); } static ssize_t pending_discard_show(struct f2fs_attr *a, @@ -139,7 +139,7 @@ static ssize_t pending_discard_show(struct f2fs_attr *a, { if (!SM_I(sbi)->dcc_info) return -EINVAL; - return sprintf(buf, "%llu\n", (unsigned long long)atomic_read( + return sysfs_emit(buf, "%llu\n", (unsigned long long)atomic_read( &SM_I(sbi)->dcc_info->discard_cmd_cnt)); } @@ -205,7 +205,7 @@ static ssize_t features_show(struct f2fs_attr *a, static ssize_t current_reserved_blocks_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - return sprintf(buf, "%u\n", sbi->current_reserved_blocks); + return sysfs_emit(buf, "%u\n", sbi->current_reserved_blocks); } static ssize_t unusable_show(struct f2fs_attr *a, @@ -217,7 +217,7 @@ static ssize_t unusable_show(struct f2fs_attr *a, unusable = sbi->unusable_block_count; else unusable = f2fs_get_unusable_blocks(sbi); - return sprintf(buf, "%llu\n", (unsigned long long)unusable); + return sysfs_emit(buf, "%llu\n", (unsigned long long)unusable); } static ssize_t encoding_show(struct f2fs_attr *a, @@ -232,13 +232,13 @@ static ssize_t encoding_show(struct f2fs_attr *a, (sb->s_encoding->version >> 8) & 0xff, sb->s_encoding->version & 0xff); #endif - return sprintf(buf, "(none)"); + return sysfs_emit(buf, "(none)"); } static ssize_t mounted_time_sec_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - return sprintf(buf, "%llu", SIT_I(sbi)->mounted_time); + return sysfs_emit(buf, "%llu", SIT_I(sbi)->mounted_time); } #ifdef CONFIG_F2FS_STAT_FS @@ -247,7 +247,7 @@ static ssize_t moved_blocks_foreground_show(struct f2fs_attr *a, { struct f2fs_stat_info *si = F2FS_STAT(sbi); - return sprintf(buf, "%llu\n", + return sysfs_emit(buf, "%llu\n", (unsigned long long)(si->tot_blks - 
(si->bg_data_blks + si->bg_node_blks))); } @@ -257,7 +257,7 @@ static ssize_t moved_blocks_background_show(struct f2fs_attr *a, { struct f2fs_stat_info *si = F2FS_STAT(sbi); - return sprintf(buf, "%llu\n", + return sysfs_emit(buf, "%llu\n", (unsigned long long)(si->bg_data_blks + si->bg_node_blks)); } @@ -268,7 +268,7 @@ static ssize_t avg_vblocks_show(struct f2fs_attr *a, si->dirty_count = dirty_segments(sbi); f2fs_update_sit_info(sbi); - return sprintf(buf, "%llu\n", (unsigned long long)(si->avg_vblocks)); + return sysfs_emit(buf, "%llu\n", (unsigned long long)(si->avg_vblocks)); } #endif @@ -363,7 +363,7 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a, ui = (unsigned int *)(ptr + a->offset); - return sprintf(buf, "%u\n", *ui); + return sysfs_emit(buf, "%u\n", *ui); } static ssize_t __sbi_store(struct f2fs_attr *a, @@ -728,7 +728,7 @@ static void f2fs_sb_release(struct kobject *kobj) static ssize_t f2fs_feature_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - return sprintf(buf, "supported\n"); + return sysfs_emit(buf, "supported\n"); } #define F2FS_FEATURE_RO_ATTR(_name) \ @@ -741,8 +741,8 @@ static ssize_t f2fs_sb_feature_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { if (F2FS_HAS_FEATURE(sbi, a->id)) - return sprintf(buf, "supported\n"); - return sprintf(buf, "unsupported\n"); + return sysfs_emit(buf, "supported\n"); + return sysfs_emit(buf, "unsupported\n"); } #define F2FS_SB_FEATURE_RO_ATTR(_name, _feat) \ From a4e29eb0aab4b63e92852cc4361a0fd2284a8b5d Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 28 Oct 2022 10:07:13 -0700 Subject: [PATCH 291/457] f2fs: add missing bracket in doc Let's add missing <>. Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index ef2b3572ba18..a6a60268dcc5 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -241,7 +241,7 @@ Description: Shows total written kbytes issued to disk. What: /sys/fs/f2fs//features Date: July 2017 Contact: "Jaegeuk Kim" -Description: /feature_list/ +Description: /feature_list/> Shows all enabled features in current device. Supported features: encryption, blkzoned, extra_attr, projquota, inode_checksum, From f7c4090cbd7b29578ce629ea9c8f7283431899dd Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Tue, 25 Oct 2022 14:50:25 +0800 Subject: [PATCH 292/457] f2fs: replace gc_urgent_high_remaining with gc_remaining_trials The user can set the trial count limit for GC urgent and idle mode with replaced gc_remaining_trials.. If GC thread gets to the limit, the mode will turn back to GC normal mode finally. It was applied only to GC_URGENT, while this patch expands it for GC_IDLE. Signed-off-by: Yangtao Li Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 8 ++++---- fs/f2fs/f2fs.h | 5 +++-- fs/f2fs/gc.c | 12 ++++++------ fs/f2fs/super.c | 2 +- fs/f2fs/sysfs.c | 12 ++++++------ 5 files changed, 20 insertions(+), 19 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index a6a60268dcc5..24e7cb77f265 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -598,10 +598,10 @@ Description: With "mode=fragment:block" mount options, we can scatter block allo in the length of 1.. by turns. This value can be set between 1..512 and the default value is 4. 
-What: /sys/fs/f2fs//gc_urgent_high_remaining -Date: December 2021 -Contact: "Daeho Jeong" -Description: You can set the trial count limit for GC urgent high mode with this value. +What: /sys/fs/f2fs//gc_remaining_trials +Date: October 2022 +Contact: "Yangtao Li" +Description: You can set the trial count limit for GC urgent and idle mode with this value. If GC thread gets to the limit, the mode will turn back to GC normal mode. By default, the value is zero, which means there is no limit like before. diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 662b27c19de1..04ef4cce3d7f 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1736,8 +1736,9 @@ struct f2fs_sb_info { unsigned int cur_victim_sec; /* current victim section num */ unsigned int gc_mode; /* current GC state */ unsigned int next_victim_seg[2]; /* next segment in victim section */ - spinlock_t gc_urgent_high_lock; - unsigned int gc_urgent_high_remaining; /* remaining trial count for GC_URGENT_HIGH */ + spinlock_t gc_remaining_trials_lock; + /* remaining trial count for GC_URGENT_* and GC_IDLE_* */ + unsigned int gc_remaining_trials; /* for skip statistic */ unsigned long long skipped_gc_rwsem; /* FG_GC only */ diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 15f56859966c..6466db75af5d 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -152,14 +152,14 @@ do_gc: /* balancing f2fs's metadata periodically */ f2fs_balance_fs_bg(sbi, true); next: - if (sbi->gc_mode == GC_URGENT_HIGH) { - spin_lock(&sbi->gc_urgent_high_lock); - if (sbi->gc_urgent_high_remaining) { - sbi->gc_urgent_high_remaining--; - if (!sbi->gc_urgent_high_remaining) + if (sbi->gc_mode != GC_NORMAL) { + spin_lock(&sbi->gc_remaining_trials_lock); + if (sbi->gc_remaining_trials) { + sbi->gc_remaining_trials--; + if (!sbi->gc_remaining_trials) sbi->gc_mode = GC_NORMAL; } - spin_unlock(&sbi->gc_urgent_high_lock); + spin_unlock(&sbi->gc_remaining_trials_lock); } sb_end_write(sbi->sb); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index e6365f040171..a43d8a46a6e5 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3624,7 +3624,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->seq_file_ra_mul = MIN_RA_MUL; sbi->max_fragment_chunk = DEF_FRAGMENT_SIZE; sbi->max_fragment_hole = DEF_FRAGMENT_SIZE; - spin_lock_init(&sbi->gc_urgent_high_lock); + spin_lock_init(&sbi->gc_remaining_trials_lock); atomic64_set(&sbi->current_atomic_write, 0); sbi->dir_level = DEF_DIR_LEVEL; diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index af23ed6121b0..032c03e09580 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -538,10 +538,10 @@ out: return count; } - if (!strcmp(a->attr.name, "gc_urgent_high_remaining")) { - spin_lock(&sbi->gc_urgent_high_lock); - sbi->gc_urgent_high_remaining = t; - spin_unlock(&sbi->gc_urgent_high_lock); + if (!strcmp(a->attr.name, "gc_remaining_trials")) { + spin_lock(&sbi->gc_remaining_trials_lock); + sbi->gc_remaining_trials = t; + spin_unlock(&sbi->gc_remaining_trials_lock); return count; } @@ -832,7 +832,7 @@ F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type); #endif F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, data_io_flag, data_io_flag); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, node_io_flag, node_io_flag); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_urgent_high_remaining, gc_urgent_high_remaining); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_remaining_trials, gc_remaining_trials); F2FS_RW_ATTR(CPRC_INFO, ckpt_req_control, ckpt_thread_ioprio, ckpt_thread_ioprio); F2FS_GENERAL_RO_ATTR(dirty_segments); F2FS_GENERAL_RO_ATTR(free_segments); @@ -961,7 
+961,7 @@ static struct attribute *f2fs_attrs[] = { #endif ATTR_LIST(data_io_flag), ATTR_LIST(node_io_flag), - ATTR_LIST(gc_urgent_high_remaining), + ATTR_LIST(gc_remaining_trials), ATTR_LIST(ckpt_thread_ioprio), ATTR_LIST(dirty_segments), ATTR_LIST(free_segments), From 0417feccbab50424461d40e991e3f14530c5b3a5 Mon Sep 17 00:00:00 2001 From: "wangkailong@jari.cn" Date: Sat, 29 Oct 2022 22:49:30 +0800 Subject: [PATCH 293/457] f2fs: replace ternary operator with max() Fix the following coccicheck warning: ./fs/f2fs/segment.c:877:24-25: WARNING opportunity for max() Signed-off-by: KaiLong Wang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index c4270cd6eaab..aa4be7f25963 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -856,7 +856,7 @@ block_t f2fs_get_unusable_blocks(struct f2fs_sb_info *sbi) } mutex_unlock(&dirty_i->seglist_lock); - unusable = holes[DATA] > holes[NODE] ? holes[DATA] : holes[NODE]; + unusable = max(holes[DATA], holes[NODE]); if (unusable > ovp_holes) return unusable - ovp_holes; return 0; From 30385b5011c80dff6e50202f5b562c7cabb9170d Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 8 Nov 2022 17:59:34 -0800 Subject: [PATCH 294/457] f2fs: allow to read node block after shutdown If block address is still alive, we should give a valid node block even after shutdown. Otherwise, we can see zero data when reading out a file. Cc: stable@vger.kernel.org Fixes: 83a3bfdb5a8a ("f2fs: indicate shutdown f2fs to allow unmount successfully") Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 983572f23896..b9ee5a1176a0 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1360,8 +1360,7 @@ static int read_node_page(struct page *page, blk_opf_t op_flags) return err; /* NEW_ADDR can be seen, after cp_error drops some dirty node pages */ - if (unlikely(ni.blk_addr == NULL_ADDR || ni.blk_addr == NEW_ADDR) || - is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN)) { + if (unlikely(ni.blk_addr == NULL_ADDR || ni.blk_addr == NEW_ADDR)) { ClearPageUptodate(page); return -ENOENT; } From 9b392d1718fedb5210282f087629848164365279 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Tue, 25 Oct 2022 21:26:38 +0800 Subject: [PATCH 295/457] f2fs: add proc entry to show discard_plist info This patch adds a new proc entry to show discard_plist information in more detail, which is very helpful to know the discard pend list count clearly. Such as: Discard pend list(Show diacrd_cmd count on each entry, .:not exist): 0 390 156 85 67 46 37 26 14 8 17 12 9 9 6 12 11 10 16 5 9 2 4 8 3 4 1 24 3 2 2 5 2 4 5 4 32 3 3 2 3 . 3 3 1 40 . 4 1 3 2 1 2 1 48 1 . 1 1 . 1 1 . 56 . 1 1 1 . 2 . 1 64 1 2 . . . . . . 72 . 1 . . . . . . 80 3 1 . . 1 1 . . 88 1 . . . 1 . . 1 ...... 
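For illustration only (not part of the patch; the device name below is a placeholder), the new entry can be dumped from userspace by reading /proc/fs/f2fs/<dev>/discard_plist_info:

	#include <stdio.h>

	int main(void)
	{
		/* Replace "sda1" with the actual f2fs block device name. */
		FILE *f = fopen("/proc/fs/f2fs/sda1/discard_plist_info", "r");
		char line[256];

		if (!f) {
			perror("fopen");
			return 1;
		}
		while (fgets(line, sizeof(line), f))
			fputs(line, stdout);
		fclose(f);
		return 0;
	}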
Signed-off-by: Yangtao Li Signed-off-by: Jaegeuk Kim --- fs/f2fs/sysfs.c | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 032c03e09580..97bf0dbb0974 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -1252,6 +1252,44 @@ static int __maybe_unused victim_bits_seq_show(struct seq_file *seq, return 0; } +static int __maybe_unused discard_plist_seq_show(struct seq_file *seq, + void *offset) +{ + struct super_block *sb = seq->private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + int i, count; + + seq_puts(seq, "Discard pend list(Show diacrd_cmd count on each entry, .:not exist):\n"); + if (!f2fs_realtime_discard_enable(sbi)) + return 0; + + if (dcc) { + mutex_lock(&dcc->cmd_lock); + for (i = 0; i < MAX_PLIST_NUM; i++) { + struct list_head *pend_list; + struct discard_cmd *dc, *tmp; + + if (i % 8 == 0) + seq_printf(seq, " %-3d", i); + count = 0; + pend_list = &dcc->pend_list[i]; + list_for_each_entry_safe(dc, tmp, pend_list, list) + count++; + if (count) + seq_printf(seq, " %7d", count); + else + seq_puts(seq, " ."); + if (i % 8 == 7) + seq_putc(seq, '\n'); + } + seq_putc(seq, '\n'); + mutex_unlock(&dcc->cmd_lock); + } + + return 0; +} + int __init f2fs_init_sysfs(void) { int ret; @@ -1322,6 +1360,8 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi) #endif proc_create_single_data("victim_bits", 0444, sbi->s_proc, victim_bits_seq_show, sb); + proc_create_single_data("discard_plist_info", 0444, sbi->s_proc, + discard_plist_seq_show, sb); } return 0; put_feature_list_kobj: @@ -1345,6 +1385,7 @@ void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi) remove_proc_entry("segment_info", sbi->s_proc); remove_proc_entry("segment_bits", sbi->s_proc); remove_proc_entry("victim_bits", sbi->s_proc); + remove_proc_entry("discard_plist_info", sbi->s_proc); remove_proc_entry(sbi->sb->s_id, f2fs_proc_root); } From b390af7241d7a231e56a4d49e47626c1fc426df1 Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Mon, 31 Oct 2022 12:24:15 -0700 Subject: [PATCH 296/457] f2fs: correct i_size change for atomic writes We need to make sure i_size doesn't change until atomic write commit is successful and restore it when commit is failed. 
Signed-off-by: Daeho Jeong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 8 ++++++++ fs/f2fs/file.c | 18 +++++++++++------- fs/f2fs/inode.c | 5 ++++- fs/f2fs/segment.c | 14 ++++++++++---- 4 files changed, 33 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 04ef4cce3d7f..11c475beca2c 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -768,6 +768,7 @@ enum { FI_COMPRESS_RELEASED, /* compressed blocks were released */ FI_ALIGNED_WRITE, /* enable aligned write */ FI_COW_FILE, /* indicate COW file */ + FI_ATOMIC_COMMITTED, /* indicate atomic commit completed except disk sync */ FI_MAX, /* max flag, never be used */ }; @@ -826,6 +827,7 @@ struct f2fs_inode_info { unsigned int i_cluster_size; /* cluster size */ unsigned int atomic_write_cnt; + loff_t original_i_size; /* original i_size before atomic write */ }; static inline void get_extent_info(struct extent_info *ext, @@ -3075,6 +3077,8 @@ static inline void f2fs_i_blocks_write(struct inode *inode, set_inode_flag(inode, FI_AUTO_RECOVER); } +static inline bool f2fs_is_atomic_file(struct inode *inode); + static inline void f2fs_i_size_write(struct inode *inode, loff_t i_size) { bool clean = !is_inode_flag_set(inode, FI_DIRTY_INODE); @@ -3084,6 +3088,10 @@ static inline void f2fs_i_size_write(struct inode *inode, loff_t i_size) return; i_size_write(inode, i_size); + + if (f2fs_is_atomic_file(inode)) + return; + f2fs_mark_inode_dirty_sync(inode, true); if (clean || recover) set_inode_flag(inode, FI_AUTO_RECOVER); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index c605a4f2bce2..28f586e77999 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2041,6 +2041,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) struct f2fs_inode_info *fi = F2FS_I(inode); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct inode *pinode; + loff_t isize; int ret; if (!inode_owner_or_capable(mnt_userns, inode)) @@ -2099,7 +2100,12 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) f2fs_up_write(&fi->i_gc_rwsem[WRITE]); goto out; } - f2fs_i_size_write(fi->cow_inode, i_size_read(inode)); + + f2fs_write_inode(inode, NULL); + + isize = i_size_read(inode); + fi->original_i_size = isize; + f2fs_i_size_write(fi->cow_inode, isize); stat_inc_atomic_inode(inode); @@ -2137,16 +2143,14 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) if (f2fs_is_atomic_file(inode)) { ret = f2fs_commit_atomic_write(inode); - if (ret) - goto unlock_out; - - ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); if (!ret) - f2fs_abort_atomic_write(inode, false); + ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); + + f2fs_abort_atomic_write(inode, ret); } else { ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 1, false); } -unlock_out: + inode_unlock(inode); mnt_drop_write_file(filp); return ret; diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 9f0d3864d9f1..577f109b4e1d 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -621,9 +621,12 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page) ri->i_uid = cpu_to_le32(i_uid_read(inode)); ri->i_gid = cpu_to_le32(i_gid_read(inode)); ri->i_links = cpu_to_le32(inode->i_nlink); - ri->i_size = cpu_to_le64(i_size_read(inode)); ri->i_blocks = cpu_to_le64(SECTOR_TO_BLOCK(inode->i_blocks) + 1); + if (!f2fs_is_atomic_file(inode) || + is_inode_flag_set(inode, FI_ATOMIC_COMMITTED)) + ri->i_size = cpu_to_le64(i_size_read(inode)); + if (et) { read_lock(&et->lock); set_raw_extent(&et->largest, &ri->i_ext); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 
aa4be7f25963..8aa81238c770 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -192,14 +192,18 @@ void f2fs_abort_atomic_write(struct inode *inode, bool clean) if (!f2fs_is_atomic_file(inode)) return; - if (clean) - truncate_inode_pages_final(inode->i_mapping); clear_inode_flag(fi->cow_inode, FI_COW_FILE); iput(fi->cow_inode); fi->cow_inode = NULL; release_atomic_write_cnt(inode); + clear_inode_flag(inode, FI_ATOMIC_COMMITTED); clear_inode_flag(inode, FI_ATOMIC_FILE); stat_dec_atomic_inode(inode); + + if (clean) { + truncate_inode_pages_final(inode->i_mapping); + f2fs_i_size_write(inode, fi->original_i_size); + } } static int __replace_atomic_write_block(struct inode *inode, pgoff_t index, @@ -335,10 +339,12 @@ next: } out: - if (ret) + if (ret) { sbi->revoked_atomic_block += fi->atomic_write_cnt; - else + } else { sbi->committed_atomic_block += fi->atomic_write_cnt; + set_inode_flag(inode, FI_ATOMIC_COMMITTED); + } __complete_revoke_list(inode, &revoke_list, ret ? true : false); From 1d56d3276474b144de03fcb2afa0301010cfc269 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 6 Nov 2022 21:25:44 +0800 Subject: [PATCH 297/457] f2fs: fix to avoid accessing uninitialized spinlock syzbot reports a kernel bug: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0x1e3/0x2cb lib/dump_stack.c:106 assign_lock_key+0x22a/0x240 kernel/locking/lockdep.c:981 register_lock_class+0x287/0x9b0 kernel/locking/lockdep.c:1294 __lock_acquire+0xe4/0x1f60 kernel/locking/lockdep.c:4934 lock_acquire+0x1a7/0x400 kernel/locking/lockdep.c:5668 __raw_spin_lock include/linux/spinlock_api_smp.h:133 [inline] _raw_spin_lock+0x2a/0x40 kernel/locking/spinlock.c:154 spin_lock include/linux/spinlock.h:350 [inline] f2fs_save_errors fs/f2fs/super.c:3868 [inline] f2fs_handle_error+0x29/0x230 fs/f2fs/super.c:3896 f2fs_iget+0x215/0x4bb0 fs/f2fs/inode.c:516 f2fs_fill_super+0x47d3/0x7b50 fs/f2fs/super.c:4222 mount_bdev+0x26c/0x3a0 fs/super.c:1401 legacy_get_tree+0xea/0x180 fs/fs_context.c:610 vfs_get_tree+0x88/0x270 fs/super.c:1531 do_new_mount+0x289/0xad0 fs/namespace.c:3040 do_mount fs/namespace.c:3383 [inline] __do_sys_mount fs/namespace.c:3591 [inline] __se_sys_mount+0x2e3/0x3d0 fs/namespace.c:3568 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x2b/0x70 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd F2FS-fs (loop1): Failed to read F2FS meta data inode The root cause is if sbi->error_lock may be accessed before its initialization, fix it. 
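As a generic illustration (names made up, not part of the patch), the rule being enforced is that spin_lock_init() must run before the lock is reachable from any other path, normally right after the containing object is allocated:

	#include <linux/slab.h>
	#include <linux/spinlock.h>

	struct foo {
		spinlock_t lock;
		unsigned int counter;
	};

	static struct foo *foo_alloc(void)
	{
		struct foo *f = kzalloc(sizeof(*f), GFP_KERNEL);

		if (!f)
			return NULL;
		/* Init the lock before error/handler paths can take it;
		 * lockdep reports a lock used before initialization, which
		 * is exactly the assign_lock_key splat quoted above. */
		spin_lock_init(&f->lock);
		return f;
	}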
Link: https://lore.kernel.org/linux-f2fs-devel/0000000000007edb6605ecbb6442@google.com/T/#u Reported-by: syzbot+40642be9b7e0bb28e0df@syzkaller.appspotmail.com Fixes: 95fa90c9e5a7 ("f2fs: support recording errors into superblock") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index a43d8a46a6e5..68a6c2eedcac 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -4196,6 +4196,9 @@ try_onemore: if (err) goto free_bio_info; + spin_lock_init(&sbi->error_lock); + memcpy(sbi->errors, raw_super->s_errors, MAX_F2FS_ERRORS); + init_f2fs_rwsem(&sbi->cp_rwsem); init_f2fs_rwsem(&sbi->quota_sem); init_waitqueue_head(&sbi->cp_wait); @@ -4263,9 +4266,6 @@ try_onemore: goto free_devices; } - spin_lock_init(&sbi->error_lock); - memcpy(sbi->errors, raw_super->s_errors, MAX_F2FS_ERRORS); - sbi->total_valid_node_count = le32_to_cpu(sbi->ckpt->valid_node_count); percpu_counter_set(&sbi->total_valid_inode_count, From 0e5801e1b97692668a9d91c1c3660ea888c11bde Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 8 Nov 2022 22:33:21 +0800 Subject: [PATCH 298/457] f2fs: optimize iteration over sparse directories Wei Chen reports a kernel bug as blew: INFO: task syz-executor.0:29056 blocked for more than 143 seconds. Not tainted 5.15.0-rc5 #1 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. task:syz-executor.0 state:D stack:14632 pid:29056 ppid: 6574 flags:0x00000004 Call Trace: __schedule+0x4a1/0x1720 schedule+0x36/0xe0 rwsem_down_write_slowpath+0x322/0x7a0 fscrypt_ioctl_set_policy+0x11f/0x2a0 __f2fs_ioctl+0x1a9f/0x5780 f2fs_ioctl+0x89/0x3a0 __x64_sys_ioctl+0xe8/0x140 do_syscall_64+0x34/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xae Eric did some investigation on this issue, quoted from reply of Eric: "Well, the quality of this bug report has a lot to be desired (not on upstream kernel, reproducer is full of totally irrelevant stuff, not sent to the mailing list of the filesystem whose disk image is being fuzzed, etc.). But what is going on is that f2fs_empty_dir() doesn't consider the case of a directory with an extremely large i_size on a malicious disk image. Specifically, the reproducer mounts an f2fs image with a directory that has an i_size of 14814520042850357248, then calls FS_IOC_SET_ENCRYPTION_POLICY on it. That results in a call to f2fs_empty_dir() to check whether the directory is empty. f2fs_empty_dir() then iterates through all 3616826182336513 blocks the directory allegedly contains to check whether any contain anything. i_rwsem is held during this, so anything else that tries to take it will hang." In order to solve this issue, let's use f2fs_get_next_page_offset() to speed up iteration by skipping holes for all below functions: - f2fs_empty_dir - f2fs_readdir - find_in_level The way why we can speed up iteration was described in 'commit 3cf4574705b4 ("f2fs: introduce get_next_page_offset to speed up SEEK_DATA")'. Meanwhile, in f2fs_empty_dir(), let's use f2fs_find_data_page() instead f2fs_get_lock_data_page(), due to i_rwsem was held in caller of f2fs_empty_dir(), there shouldn't be any races, so it's fine to not lock dentry page during lookuping dirents in the page. 
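The core of the change, distilled (sketch only; error handling trimmed, see the real hunks below): on -ENOENT the lookup helper now reports the next allocated page offset, so the caller jumps over the hole instead of probing every index of a sparse directory:

	pgoff_t bidx = 0, next_pgofs;

	while (bidx < nblock) {
		struct page *page = f2fs_find_data_page(dir, bidx, &next_pgofs);

		if (IS_ERR(page)) {
			if (PTR_ERR(page) == -ENOENT) {
				bidx = next_pgofs;	/* skip the hole */
				continue;
			}
			return false;
		}
		/* ... scan the dentry block ... */
		f2fs_put_page(page, 0);
		bidx++;
	}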
Link: https://lore.kernel.org/lkml/536944df-a0ae-1dd8-148f-510b476e1347@kernel.org/T/ Reported-by: Wei Chen Cc: Eric Biggers Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 17 ++++++++++++----- fs/f2fs/dir.c | 34 ++++++++++++++++++++++++---------- fs/f2fs/f2fs.h | 5 +++-- fs/f2fs/gc.c | 4 ++-- 4 files changed, 41 insertions(+), 19 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index a71e818cd67b..9b47ded653d1 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1206,7 +1206,8 @@ int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index) } struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, - blk_opf_t op_flags, bool for_write) + blk_opf_t op_flags, bool for_write, + pgoff_t *next_pgofs) { struct address_space *mapping = inode->i_mapping; struct dnode_of_data dn; @@ -1232,12 +1233,17 @@ struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, set_new_dnode(&dn, inode, NULL, NULL, 0); err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE); - if (err) + if (err) { + if (err == -ENOENT && next_pgofs) + *next_pgofs = f2fs_get_next_page_offset(&dn, index); goto put_err; + } f2fs_put_dnode(&dn); if (unlikely(dn.data_blkaddr == NULL_ADDR)) { err = -ENOENT; + if (next_pgofs) + *next_pgofs = index + 1; goto put_err; } if (dn.data_blkaddr != NEW_ADDR && @@ -1281,7 +1287,8 @@ put_err: return ERR_PTR(err); } -struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index) +struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index, + pgoff_t *next_pgofs) { struct address_space *mapping = inode->i_mapping; struct page *page; @@ -1291,7 +1298,7 @@ struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index) return page; f2fs_put_page(page, 0); - page = f2fs_get_read_data_page(inode, index, 0, false); + page = f2fs_get_read_data_page(inode, index, 0, false, next_pgofs); if (IS_ERR(page)) return page; @@ -1317,7 +1324,7 @@ struct page *f2fs_get_lock_data_page(struct inode *inode, pgoff_t index, struct address_space *mapping = inode->i_mapping; struct page *page; repeat: - page = f2fs_get_read_data_page(inode, index, 0, for_write); + page = f2fs_get_read_data_page(inode, index, 0, for_write, NULL); if (IS_ERR(page)) return page; diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 21960a899b6a..030b7fd4142f 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -340,6 +340,7 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, unsigned int bidx, end_block; struct page *dentry_page; struct f2fs_dir_entry *de = NULL; + pgoff_t next_pgofs; bool room = false; int max_slots; @@ -350,12 +351,13 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, le32_to_cpu(fname->hash) % nbucket); end_block = bidx + nblock; - for (; bidx < end_block; bidx++) { + while (bidx < end_block) { /* no need to allocate new dentry pages to all the indices */ - dentry_page = f2fs_find_data_page(dir, bidx); + dentry_page = f2fs_find_data_page(dir, bidx, &next_pgofs); if (IS_ERR(dentry_page)) { if (PTR_ERR(dentry_page) == -ENOENT) { room = true; + bidx = next_pgofs; continue; } else { *res_page = dentry_page; @@ -376,6 +378,8 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, if (max_slots >= s) room = true; f2fs_put_page(dentry_page, 0); + + bidx++; } if (!de && room && F2FS_I(dir)->chash != fname->hash) { @@ -956,7 +960,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, bool f2fs_empty_dir(struct inode *dir) { - unsigned long bidx; + unsigned long bidx = 0; struct page *dentry_page; 
unsigned int bit_pos; struct f2fs_dentry_block *dentry_blk; @@ -965,13 +969,17 @@ bool f2fs_empty_dir(struct inode *dir) if (f2fs_has_inline_dentry(dir)) return f2fs_empty_inline_dir(dir); - for (bidx = 0; bidx < nblock; bidx++) { - dentry_page = f2fs_get_lock_data_page(dir, bidx, false); + while (bidx < nblock) { + pgoff_t next_pgofs; + + dentry_page = f2fs_find_data_page(dir, bidx, &next_pgofs); if (IS_ERR(dentry_page)) { - if (PTR_ERR(dentry_page) == -ENOENT) + if (PTR_ERR(dentry_page) == -ENOENT) { + bidx = next_pgofs; continue; - else + } else { return false; + } } dentry_blk = page_address(dentry_page); @@ -983,10 +991,12 @@ bool f2fs_empty_dir(struct inode *dir) NR_DENTRY_IN_BLOCK, bit_pos); - f2fs_put_page(dentry_page, 1); + f2fs_put_page(dentry_page, 0); if (bit_pos < NR_DENTRY_IN_BLOCK) return false; + + bidx++; } return true; } @@ -1104,7 +1114,8 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) goto out_free; } - for (; n < npages; n++, ctx->pos = n * NR_DENTRY_IN_BLOCK) { + for (; n < npages; ctx->pos = n * NR_DENTRY_IN_BLOCK) { + pgoff_t next_pgofs; /* allow readdir() to be interrupted */ if (fatal_signal_pending(current)) { @@ -1118,11 +1129,12 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) page_cache_sync_readahead(inode->i_mapping, ra, file, n, min(npages - n, (pgoff_t)MAX_DIR_RA_PAGES)); - dentry_page = f2fs_find_data_page(inode, n); + dentry_page = f2fs_find_data_page(inode, n, &next_pgofs); if (IS_ERR(dentry_page)) { err = PTR_ERR(dentry_page); if (err == -ENOENT) { err = 0; + n = next_pgofs; continue; } else { goto out_free; @@ -1141,6 +1153,8 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) } f2fs_put_page(dentry_page, 0); + + n++; } out_free: fscrypt_fname_free_buffer(&fstr); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 11c475beca2c..6a8cbf5bb187 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3807,8 +3807,9 @@ int f2fs_reserve_new_block(struct dnode_of_data *dn); int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index); int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index); struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, - blk_opf_t op_flags, bool for_write); -struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index); + blk_opf_t op_flags, bool for_write, pgoff_t *next_pgofs); +struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index, + pgoff_t *next_pgofs); struct page *f2fs_get_lock_data_page(struct inode *inode, pgoff_t index, bool for_write); struct page *f2fs_get_new_data_page(struct inode *inode, diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 6466db75af5d..f1a46519a5fe 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1562,8 +1562,8 @@ next_step: continue; } - data_page = f2fs_get_read_data_page(inode, - start_bidx, REQ_RAHEAD, true); + data_page = f2fs_get_read_data_page(inode, start_bidx, + REQ_RAHEAD, true, NULL); f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); if (IS_ERR(data_page)) { iput(inode); From 98c2bb9001712af3c6f550aef0edefc308e833ba Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Wed, 9 Nov 2022 07:04:42 +0900 Subject: [PATCH 299/457] f2fs: initialize locks earlier in f2fs_fill_super() syzbot is reporting lockdep warning at f2fs_handle_error() [1], for spin_lock(&sbi->error_lock) is called before spin_lock_init() is called. For safe locking in error handling, move initialization of locks (and obvious structures) in f2fs_fill_super() to immediately after memory allocation. 
Link: https://syzkaller.appspot.com/bug?extid=40642be9b7e0bb28e0df [1] Reported-by: syzbot Signed-off-by: Tetsuo Handa Tested-by: syzbot Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 68a6c2eedcac..8f4fc3ad6765 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -4103,6 +4103,24 @@ try_onemore: sbi->sb = sb; + /* initialize locks within allocated memory */ + init_f2fs_rwsem(&sbi->gc_lock); + mutex_init(&sbi->writepages); + init_f2fs_rwsem(&sbi->cp_global_sem); + init_f2fs_rwsem(&sbi->node_write); + init_f2fs_rwsem(&sbi->node_change); + spin_lock_init(&sbi->stat_lock); + init_f2fs_rwsem(&sbi->cp_rwsem); + init_f2fs_rwsem(&sbi->quota_sem); + init_waitqueue_head(&sbi->cp_wait); + spin_lock_init(&sbi->error_lock); + + for (i = 0; i < NR_INODE_TYPE; i++) { + INIT_LIST_HEAD(&sbi->inode_list[i]); + spin_lock_init(&sbi->inode_lock[i]); + } + mutex_init(&sbi->flush_lock); + /* Load the checksum driver */ sbi->s_chksum_driver = crypto_alloc_shash("crc32", 0, 0); if (IS_ERR(sbi->s_chksum_driver)) { @@ -4126,6 +4144,8 @@ try_onemore: sb->s_fs_info = sbi; sbi->raw_super = raw_super; + memcpy(sbi->errors, raw_super->s_errors, MAX_F2FS_ERRORS); + /* precompute checksum seed for metadata */ if (f2fs_sb_has_inode_chksum(sbi)) sbi->s_chksum_seed = f2fs_chksum(sbi, ~0, raw_super->uuid, @@ -4182,26 +4202,14 @@ try_onemore: /* init f2fs-specific super block info */ sbi->valid_super_block = valid_super_block; - init_f2fs_rwsem(&sbi->gc_lock); - mutex_init(&sbi->writepages); - init_f2fs_rwsem(&sbi->cp_global_sem); - init_f2fs_rwsem(&sbi->node_write); - init_f2fs_rwsem(&sbi->node_change); /* disallow all the data/node/meta page writes */ set_sbi_flag(sbi, SBI_POR_DOING); - spin_lock_init(&sbi->stat_lock); err = f2fs_init_write_merge_io(sbi); if (err) goto free_bio_info; - spin_lock_init(&sbi->error_lock); - memcpy(sbi->errors, raw_super->s_errors, MAX_F2FS_ERRORS); - - init_f2fs_rwsem(&sbi->cp_rwsem); - init_f2fs_rwsem(&sbi->quota_sem); - init_waitqueue_head(&sbi->cp_wait); init_sb_info(sbi); err = f2fs_init_iostat(sbi); @@ -4279,12 +4287,6 @@ try_onemore: limit_reserve_root(sbi); adjust_unusable_cap_perc(sbi); - for (i = 0; i < NR_INODE_TYPE; i++) { - INIT_LIST_HEAD(&sbi->inode_list[i]); - spin_lock_init(&sbi->inode_lock[i]); - } - mutex_init(&sbi->flush_lock); - f2fs_init_extent_cache_info(sbi); f2fs_init_ino_entry_info(sbi); From 097dd939a6b0f7324940567587664d50c8a3b9aa Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Thu, 10 Nov 2022 17:15:01 +0800 Subject: [PATCH 300/457] f2fs: fix to set flush_merge opt and show noflush_merge Some minor modifications to flush_merge and related parameters: 1.The FLUSH_MERGE opt is set by default only in non-ro mode. 2.When ro and merge are set at the same time, an error is reported. 3.Display noflush_merge mount opt. 
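For illustration (not part of the patch; device and mount point are placeholders), point 2 means an explicit flush_merge request on a read-only mount now fails with EINVAL instead of being accepted silently:

	#include <stdio.h>
	#include <sys/mount.h>

	int main(void)
	{
		/* flush_merge is rejected here because a read-only mount
		 * never issues the flushes that the option would merge. */
		if (mount("/dev/sdb1", "/mnt/f2fs", "f2fs", MS_RDONLY,
			  "flush_merge"))
			perror("mount");
		return 0;
	}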
Suggested-by: Chao Yu Signed-off-by: Yangtao Li Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 8f4fc3ad6765..75027ff85cd9 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1353,6 +1353,12 @@ default_check: return -EINVAL; } + if ((f2fs_sb_has_readonly(sbi) || f2fs_readonly(sbi->sb)) && + test_opt(sbi, FLUSH_MERGE)) { + f2fs_err(sbi, "FLUSH_MERGE not compatible with readonly mode"); + return -EINVAL; + } + if (f2fs_sb_has_readonly(sbi) && !f2fs_readonly(sbi->sb)) { f2fs_err(sbi, "Allow to mount readonly mode only"); return -EROFS; @@ -1941,8 +1947,10 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",inline_dentry"); else seq_puts(seq, ",noinline_dentry"); - if (!f2fs_readonly(sbi->sb) && test_opt(sbi, FLUSH_MERGE)) + if (test_opt(sbi, FLUSH_MERGE)) seq_puts(seq, ",flush_merge"); + else + seq_puts(seq, ",noflush_merge"); if (test_opt(sbi, NOBARRIER)) seq_puts(seq, ",nobarrier"); else @@ -2073,7 +2081,8 @@ static void default_options(struct f2fs_sb_info *sbi) set_opt(sbi, MERGE_CHECKPOINT); F2FS_OPTION(sbi).unusable_cap = 0; sbi->sb->s_flags |= SB_LAZYTIME; - set_opt(sbi, FLUSH_MERGE); + if (!f2fs_sb_has_readonly(sbi) && !f2fs_readonly(sbi->sb)) + set_opt(sbi, FLUSH_MERGE); if (f2fs_hw_support_discard(sbi) || f2fs_hw_should_discard(sbi)) set_opt(sbi, DISCARD); if (f2fs_sb_has_blkzoned(sbi)) { From c9bf02a2965159196838b59faed4ab566cb23cc6 Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Fri, 11 Nov 2022 09:04:06 -0800 Subject: [PATCH 301/457] f2fs: introduce F2FS_IOC_START_ATOMIC_REPLACE introduce a new ioctl to replace the whole content of a file atomically, which means it induces truncate and content update at the same time. We can start it with F2FS_IOC_START_ATOMIC_REPLACE and complete it with F2FS_IOC_COMMIT_ATOMIC_WRITE. Or abort it with F2FS_IOC_ABORT_ATOMIC_WRITE. 
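As a usage sketch (not part of the patch; it assumes a uapi header that already carries the new ioctl number added below), a whole-file atomic replace from userspace looks like:

	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/f2fs.h>		/* F2FS_IOC_* */

	static int replace_file_atomically(const char *path,
					   const void *buf, size_t len)
	{
		int fd = open(path, O_WRONLY);

		if (fd < 0)
			return -1;
		/* Truncates the visible content and opens the atomic context. */
		if (ioctl(fd, F2FS_IOC_START_ATOMIC_REPLACE) < 0)
			goto fail;
		if (write(fd, buf, len) != (ssize_t)len)
			goto fail;
		/* Publish the new content in one step... */
		if (ioctl(fd, F2FS_IOC_COMMIT_ATOMIC_WRITE) < 0)
			goto fail;
		return close(fd);
	fail:
		/* ...or roll everything back, leaving the old content. */
		ioctl(fd, F2FS_IOC_ABORT_ATOMIC_WRITE);
		close(fd);
		return -1;
	}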
Signed-off-by: Daeho Jeong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 3 +++ fs/f2fs/f2fs.h | 1 + fs/f2fs/file.c | 21 +++++++++++++++------ fs/f2fs/segment.c | 13 ++++++++++++- include/uapi/linux/f2fs.h | 1 + 5 files changed, 32 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 9b47ded653d1..560fa80590e9 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3466,6 +3466,9 @@ static int prepare_atomic_write_begin(struct f2fs_sb_info *sbi, else if (*blk_addr != NULL_ADDR) return 0; + if (is_inode_flag_set(inode, FI_ATOMIC_REPLACE)) + goto reserve_block; + /* Look for the block in the original inode */ err = __find_data_block(inode, index, &ori_blk_addr); if (err) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 6a8cbf5bb187..b89b5d755ce0 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -769,6 +769,7 @@ enum { FI_ALIGNED_WRITE, /* enable aligned write */ FI_COW_FILE, /* indicate COW file */ FI_ATOMIC_COMMITTED, /* indicate atomic commit completed except disk sync */ + FI_ATOMIC_REPLACE, /* indicate atomic replace */ FI_MAX, /* max flag, never be used */ }; diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 28f586e77999..ab0a0d3730f6 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2034,7 +2034,7 @@ static int f2fs_ioc_getversion(struct file *filp, unsigned long arg) return put_user(inode->i_generation, (int __user *)arg); } -static int f2fs_ioc_start_atomic_write(struct file *filp) +static int f2fs_ioc_start_atomic_write(struct file *filp, bool truncate) { struct inode *inode = file_inode(filp); struct user_namespace *mnt_userns = file_mnt_user_ns(filp); @@ -2103,15 +2103,22 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) f2fs_write_inode(inode, NULL); - isize = i_size_read(inode); - fi->original_i_size = isize; - f2fs_i_size_write(fi->cow_inode, isize); - stat_inc_atomic_inode(inode); set_inode_flag(inode, FI_ATOMIC_FILE); set_inode_flag(fi->cow_inode, FI_COW_FILE); clear_inode_flag(fi->cow_inode, FI_INLINE_DATA); + + isize = i_size_read(inode); + fi->original_i_size = isize; + if (truncate) { + set_inode_flag(inode, FI_ATOMIC_REPLACE); + truncate_inode_pages_final(inode->i_mapping); + f2fs_i_size_write(inode, 0); + isize = 0; + } + f2fs_i_size_write(fi->cow_inode, isize); + f2fs_up_write(&fi->i_gc_rwsem[WRITE]); f2fs_update_time(sbi, REQ_TIME); @@ -4139,7 +4146,9 @@ static long __f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case FS_IOC_GETVERSION: return f2fs_ioc_getversion(filp, arg); case F2FS_IOC_START_ATOMIC_WRITE: - return f2fs_ioc_start_atomic_write(filp); + return f2fs_ioc_start_atomic_write(filp, false); + case F2FS_IOC_START_ATOMIC_REPLACE: + return f2fs_ioc_start_atomic_write(filp, true); case F2FS_IOC_COMMIT_ATOMIC_WRITE: return f2fs_ioc_commit_atomic_write(filp); case F2FS_IOC_ABORT_ATOMIC_WRITE: diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 8aa81238c770..5ac026a57228 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -197,6 +197,7 @@ void f2fs_abort_atomic_write(struct inode *inode, bool clean) fi->cow_inode = NULL; release_atomic_write_cnt(inode); clear_inode_flag(inode, FI_ATOMIC_COMMITTED); + clear_inode_flag(inode, FI_ATOMIC_REPLACE); clear_inode_flag(inode, FI_ATOMIC_FILE); stat_dec_atomic_inode(inode); @@ -261,14 +262,24 @@ static void __complete_revoke_list(struct inode *inode, struct list_head *head, bool revoke) { struct revoke_entry *cur, *tmp; + pgoff_t start_index = 0; + bool truncate = is_inode_flag_set(inode, FI_ATOMIC_REPLACE); 
list_for_each_entry_safe(cur, tmp, head, list) { - if (revoke) + if (revoke) { __replace_atomic_write_block(inode, cur->index, cur->old_addr, NULL, true); + } else if (truncate) { + f2fs_truncate_hole(inode, start_index, cur->index); + start_index = cur->index + 1; + } + list_del(&cur->list); kmem_cache_free(revoke_entry_slab, cur); } + + if (!revoke && truncate) + f2fs_do_truncate_blocks(inode, start_index * PAGE_SIZE, false); } static int __f2fs_commit_atomic_write(struct inode *inode) diff --git a/include/uapi/linux/f2fs.h b/include/uapi/linux/f2fs.h index 3121d127d5aa..955d440be104 100644 --- a/include/uapi/linux/f2fs.h +++ b/include/uapi/linux/f2fs.h @@ -42,6 +42,7 @@ struct f2fs_comp_option) #define F2FS_IOC_DECOMPRESS_FILE _IO(F2FS_IOCTL_MAGIC, 23) #define F2FS_IOC_COMPRESS_FILE _IO(F2FS_IOCTL_MAGIC, 24) +#define F2FS_IOC_START_ATOMIC_REPLACE _IO(F2FS_IOCTL_MAGIC, 25) /* * should be same as XFS_IOC_GOINGDOWN. From 6c38400dc0ee8c2dca6d95cf04f5875613dbf399 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 15 Nov 2022 00:08:47 +0800 Subject: [PATCH 302/457] f2fs: fix to do sanity check on i_extra_isize in is_alive() syzbot found a f2fs bug: BUG: KASAN: slab-out-of-bounds in data_blkaddr fs/f2fs/f2fs.h:2891 [inline] BUG: KASAN: slab-out-of-bounds in is_alive fs/f2fs/gc.c:1117 [inline] BUG: KASAN: slab-out-of-bounds in gc_data_segment fs/f2fs/gc.c:1520 [inline] BUG: KASAN: slab-out-of-bounds in do_garbage_collect+0x386a/0x3df0 fs/f2fs/gc.c:1734 Read of size 4 at addr ffff888076557568 by task kworker/u4:3/52 CPU: 1 PID: 52 Comm: kworker/u4:3 Not tainted 6.1.0-rc4-syzkaller-00362-gfef7fd48922d #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/26/2022 Workqueue: writeback wb_workfn (flush-7:0) Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106 print_address_description mm/kasan/report.c:284 [inline] print_report+0x15e/0x45d mm/kasan/report.c:395 kasan_report+0xbb/0x1f0 mm/kasan/report.c:495 data_blkaddr fs/f2fs/f2fs.h:2891 [inline] is_alive fs/f2fs/gc.c:1117 [inline] gc_data_segment fs/f2fs/gc.c:1520 [inline] do_garbage_collect+0x386a/0x3df0 fs/f2fs/gc.c:1734 f2fs_gc+0x88c/0x20a0 fs/f2fs/gc.c:1831 f2fs_balance_fs+0x544/0x6b0 fs/f2fs/segment.c:410 f2fs_write_inode+0x57e/0xe20 fs/f2fs/inode.c:753 write_inode fs/fs-writeback.c:1440 [inline] __writeback_single_inode+0xcfc/0x1440 fs/fs-writeback.c:1652 writeback_sb_inodes+0x54d/0xf90 fs/fs-writeback.c:1870 wb_writeback+0x2c5/0xd70 fs/fs-writeback.c:2044 wb_do_writeback fs/fs-writeback.c:2187 [inline] wb_workfn+0x2dc/0x12f0 fs/fs-writeback.c:2227 process_one_work+0x9bf/0x1710 kernel/workqueue.c:2289 worker_thread+0x665/0x1080 kernel/workqueue.c:2436 kthread+0x2e4/0x3a0 kernel/kthread.c:376 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:306 The root cause is that we forgot to do sanity check on .i_extra_isize in below path, result in accessing invalid address later, fix it. 
- gc_data_segment - is_alive - data_blkaddr - offset_in_addr Reported-by: syzbot+f8f3dfa4abc489e768a1@syzkaller.appspotmail.com Link: https://lore.kernel.org/linux-f2fs-devel/0000000000003cb3c405ed5c17f9@google.com/T/#u Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index f1a46519a5fe..0f967b1e98f2 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1077,7 +1077,7 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, { struct page *node_page; nid_t nid; - unsigned int ofs_in_node, max_addrs; + unsigned int ofs_in_node, max_addrs, base; block_t source_blkaddr; nid = le32_to_cpu(sum->nid); @@ -1103,11 +1103,17 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, return false; } - max_addrs = IS_INODE(node_page) ? DEF_ADDRS_PER_INODE : - DEF_ADDRS_PER_BLOCK; - if (ofs_in_node >= max_addrs) { - f2fs_err(sbi, "Inconsistent ofs_in_node:%u in summary, ino:%u, nid:%u, max:%u", - ofs_in_node, dni->ino, dni->nid, max_addrs); + if (IS_INODE(node_page)) { + base = offset_in_addr(F2FS_INODE(node_page)); + max_addrs = DEF_ADDRS_PER_INODE; + } else { + base = 0; + max_addrs = DEF_ADDRS_PER_BLOCK; + } + + if (base + ofs_in_node >= max_addrs) { + f2fs_err(sbi, "Inconsistent blkaddr offset: base:%u, ofs_in_node:%u, max:%u, ino:%u, nid:%u", + base, ofs_in_node, max_addrs, dni->ino, dni->nid); f2fs_put_page(node_page, 1); return false; } From 4d8c7abfee10095c5c18da0a331729aeed24cbc2 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Sat, 12 Nov 2022 00:13:49 +0800 Subject: [PATCH 303/457] f2fs: remove submit label in __submit_discard_cmd() Complaint from Matthew Wilcox in another similar place: "submit? You don't submit anything at the 'submit' label. it should be called 'skip' or something. But I think this is just badly written and you don't need a goto at all." Let's remove the submit label for readability. Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 5ac026a57228..8b0b76550578 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1143,13 +1143,12 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi, if (time_to_inject(sbi, FAULT_DISCARD)) { f2fs_show_injection_info(sbi, FAULT_DISCARD); err = -EIO; - goto submit; - } - err = __blkdev_issue_discard(bdev, + } else { + err = __blkdev_issue_discard(bdev, SECTOR_FROM_BLOCK(start), SECTOR_FROM_BLOCK(len), GFP_NOFS, &bio); -submit: + } if (err) { spin_lock_irqsave(&dc->lock, flags); if (dc->state == D_PARTIAL) From e3bef45cf6121c70ded71e5349b8a3dcdd4dbcec Mon Sep 17 00:00:00 2001 From: Yuwei Guan Date: Tue, 15 Nov 2022 14:35:35 +0800 Subject: [PATCH 304/457] f2fs: fix to alloc_mode changed after remount on a small volume device Commit 84b89e5d943d8 ("f2fs: add auto tuning for small devices") added tuning for small volume devices, so that alloc_mode is tuned to 'reuse' when the volume is small. However, alloc_mode changes back to 'default' when such a small device is remounted. This patch fixes alloc_mode being changed on remount of a small volume device.
Signed-off-by: Yuwei Guan Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 75027ff85cd9..96cfe626a670 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2059,7 +2059,11 @@ static void default_options(struct f2fs_sb_info *sbi) F2FS_OPTION(sbi).active_logs = NR_CURSEG_PERSIST_TYPE; F2FS_OPTION(sbi).inline_xattr_size = DEFAULT_INLINE_XATTR_ADDRS; - F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_DEFAULT; + if (le32_to_cpu(F2FS_RAW_SUPER(sbi)->segment_count_main) <= + SMALL_VOLUME_SEGMENTS) + F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_REUSE; + else + F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_DEFAULT; F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_POSIX; F2FS_OPTION(sbi).s_resuid = make_kuid(&init_user_ns, F2FS_DEF_RESUID); F2FS_OPTION(sbi).s_resgid = make_kgid(&init_user_ns, F2FS_DEF_RESGID); @@ -4077,7 +4081,6 @@ static void f2fs_tuning_parameters(struct f2fs_sb_info *sbi) /* adjust parameters according to the volume size */ if (sm_i->main_segments <= SMALL_VOLUME_SEGMENTS) { - F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_REUSE; if (f2fs_block_unit_discard(sbi)) sm_i->dcc_info->discard_granularity = 1; sm_i->ipu_policy = 1 << F2FS_IPU_FORCE | From 56f4ea601ecae23692c8fd823e18e31b46870c0f Mon Sep 17 00:00:00 2001 From: Yuwei Guan Date: Tue, 15 Nov 2022 14:35:36 +0800 Subject: [PATCH 305/457] f2fs: cleanup for 'f2fs_tuning_parameters' function A cleanup patch for 'f2fs_tuning_parameters' function. Signed-off-by: Yuwei Guan Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 96cfe626a670..05101cd4140b 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -4077,13 +4077,11 @@ static int f2fs_setup_casefold(struct f2fs_sb_info *sbi) static void f2fs_tuning_parameters(struct f2fs_sb_info *sbi) { - struct f2fs_sm_info *sm_i = SM_I(sbi); - /* adjust parameters according to the volume size */ - if (sm_i->main_segments <= SMALL_VOLUME_SEGMENTS) { + if (MAIN_SEGS(sbi) <= SMALL_VOLUME_SEGMENTS) { if (f2fs_block_unit_discard(sbi)) - sm_i->dcc_info->discard_granularity = 1; - sm_i->ipu_policy = 1 << F2FS_IPU_FORCE | + SM_I(sbi)->dcc_info->discard_granularity = 1; + SM_I(sbi)->ipu_policy = 1 << F2FS_IPU_FORCE | 1 << F2FS_IPU_HONOR_OPU_WRITE; } From c3c23a99d5fa3ea051c030a3e20838909bafe199 Mon Sep 17 00:00:00 2001 From: Yuwei Guan Date: Tue, 15 Nov 2022 14:35:37 +0800 Subject: [PATCH 306/457] f2fs: change type for 'sbi->readdir_ra' Before this patch, the variable 'readdir_ra' only takes effect based on whether it equals '1', so we can change its type from 'int' to 'bool'.
Signed-off-by: Yuwei Guan Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 2 +- fs/f2fs/f2fs.h | 2 +- fs/f2fs/super.c | 2 +- fs/f2fs/sysfs.c | 5 +++++ 4 files changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 030b7fd4142f..8e025157f35c 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -1010,7 +1010,7 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, struct fscrypt_str de_name = FSTR_INIT(NULL, 0); struct f2fs_sb_info *sbi = F2FS_I_SB(d->inode); struct blk_plug plug; - bool readdir_ra = sbi->readdir_ra == 1; + bool readdir_ra = sbi->readdir_ra; bool found_valid_dirent = false; int err = 0; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index b89b5d755ce0..96bd3461c0bb 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1698,7 +1698,7 @@ struct f2fs_sb_info { unsigned int total_node_count; /* total node block count */ unsigned int total_valid_node_count; /* valid node block count */ int dir_level; /* directory level */ - int readdir_ra; /* readahead inode in readdir */ + bool readdir_ra; /* readahead inode in readdir */ u64 max_io_bytes; /* max io bytes to merge IOs */ block_t user_block_count; /* # of user blocks */ diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 05101cd4140b..31435c8645c8 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -4085,7 +4085,7 @@ static void f2fs_tuning_parameters(struct f2fs_sb_info *sbi) 1 << F2FS_IPU_HONOR_OPU_WRITE; } - sbi->readdir_ra = 1; + sbi->readdir_ra = true; } static int f2fs_fill_super(struct super_block *sb, void *data, int silent) diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 97bf0dbb0974..33ec467b3772 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -656,6 +656,11 @@ out: return count; } + if (!strcmp(a->attr.name, "readdir_ra")) { + sbi->readdir_ra = !!t; + return count; + } + *ui = (unsigned int)t; return count; From 001c982b36659b25aad770f1515fc3347cbfd076 Mon Sep 17 00:00:00 2001 From: Sheng Yong Date: Thu, 17 Nov 2022 23:10:54 +0800 Subject: [PATCH 307/457] f2fs: set zstd compress level correctly Fixes: cf30f6a5f0c6 ("lib: zstd: Add kernel-specific API") Signed-off-by: Sheng Yong Reviewed-by: Chao Yu Reviewed-by: Nick Terrell Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index d315c2de136f..74d3f2d2271f 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -346,7 +346,7 @@ static int zstd_init_compress_ctx(struct compress_ctx *cc) if (!level) level = F2FS_ZSTD_DEFAULT_CLEVEL; - params = zstd_get_params(F2FS_ZSTD_DEFAULT_CLEVEL, cc->rlen); + params = zstd_get_params(level, cc->rlen); workspace_size = zstd_cstream_workspace_bound(¶ms.cParams); workspace = f2fs_kvmalloc(F2FS_I_SB(cc->inode), From 8d9e35642f4d006895d3d9befb15b4e9b1d61723 Mon Sep 17 00:00:00 2001 From: Sheng Yong Date: Fri, 11 Nov 2022 18:08:29 +0800 Subject: [PATCH 308/457] f2fs: fix to enable compress for newly created file if extension matches If compress_extension is set, and a newly created file matches the extension, the file could be marked as compression file. However, if inline_data is also enabled, there is no chance to check its extension since f2fs_should_compress() always returns false. This patch moves set_compress_inode(), which do extension check, in f2fs_should_compress() to check extensions before setting inline data flag. 
Fixes: 7165841d578e ("f2fs: fix to check inline_data during compressed inode conversion") Signed-off-by: Sheng Yong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- fs/f2fs/namei.c | 329 ++++++++++++++++++++++++------------------------ 2 files changed, 164 insertions(+), 167 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 96bd3461c0bb..f0833638f59e 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2980,7 +2980,7 @@ static inline void f2fs_change_bit(unsigned int nr, char *addr) /* Flags that should be inherited by new inodes from their parent. */ #define F2FS_FL_INHERITED (F2FS_SYNC_FL | F2FS_NODUMP_FL | F2FS_NOATIME_FL | \ F2FS_DIRSYNC_FL | F2FS_PROJINHERIT_FL | \ - F2FS_CASEFOLD_FL | F2FS_COMPR_FL | F2FS_NOCOMP_FL) + F2FS_CASEFOLD_FL) /* Flags that are appropriate for regular files (all but dir-specific ones). */ #define F2FS_REG_FLMASK (~(F2FS_DIRSYNC_FL | F2FS_PROJINHERIT_FL | \ diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index e104409c3a0e..54448dccbb6a 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -22,8 +22,163 @@ #include "acl.h" #include +static inline int is_extension_exist(const unsigned char *s, const char *sub, + bool tmp_ext) +{ + size_t slen = strlen(s); + size_t sublen = strlen(sub); + int i; + + if (sublen == 1 && *sub == '*') + return 1; + + /* + * filename format of multimedia file should be defined as: + * "filename + '.' + extension + (optional: '.' + temp extension)". + */ + if (slen < sublen + 2) + return 0; + + if (!tmp_ext) { + /* file has no temp extension */ + if (s[slen - sublen - 1] != '.') + return 0; + return !strncasecmp(s + slen - sublen, sub, sublen); + } + + for (i = 1; i < slen - sublen; i++) { + if (s[i] != '.') + continue; + if (!strncasecmp(s + i + 1, sub, sublen)) + return 1; + } + + return 0; +} + +int f2fs_update_extension_list(struct f2fs_sb_info *sbi, const char *name, + bool hot, bool set) +{ + __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list; + int cold_count = le32_to_cpu(sbi->raw_super->extension_count); + int hot_count = sbi->raw_super->hot_ext_count; + int total_count = cold_count + hot_count; + int start, count; + int i; + + if (set) { + if (total_count == F2FS_MAX_EXTENSION) + return -EINVAL; + } else { + if (!hot && !cold_count) + return -EINVAL; + if (hot && !hot_count) + return -EINVAL; + } + + if (hot) { + start = cold_count; + count = total_count; + } else { + start = 0; + count = cold_count; + } + + for (i = start; i < count; i++) { + if (strcmp(name, extlist[i])) + continue; + + if (set) + return -EINVAL; + + memcpy(extlist[i], extlist[i + 1], + F2FS_EXTENSION_LEN * (total_count - i - 1)); + memset(extlist[total_count - 1], 0, F2FS_EXTENSION_LEN); + if (hot) + sbi->raw_super->hot_ext_count = hot_count - 1; + else + sbi->raw_super->extension_count = + cpu_to_le32(cold_count - 1); + return 0; + } + + if (!set) + return -EINVAL; + + if (hot) { + memcpy(extlist[count], name, strlen(name)); + sbi->raw_super->hot_ext_count = hot_count + 1; + } else { + char buf[F2FS_MAX_EXTENSION][F2FS_EXTENSION_LEN]; + + memcpy(buf, &extlist[cold_count], + F2FS_EXTENSION_LEN * hot_count); + memset(extlist[cold_count], 0, F2FS_EXTENSION_LEN); + memcpy(extlist[cold_count], name, strlen(name)); + memcpy(&extlist[cold_count + 1], buf, + F2FS_EXTENSION_LEN * hot_count); + sbi->raw_super->extension_count = cpu_to_le32(cold_count + 1); + } + return 0; +} + +static void set_compress_new_inode(struct f2fs_sb_info *sbi, struct inode *dir, + struct inode *inode, const unsigned char *name) +{ 
+ __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list; + unsigned char (*noext)[F2FS_EXTENSION_LEN] = + F2FS_OPTION(sbi).noextensions; + unsigned char (*ext)[F2FS_EXTENSION_LEN] = F2FS_OPTION(sbi).extensions; + unsigned char ext_cnt = F2FS_OPTION(sbi).compress_ext_cnt; + unsigned char noext_cnt = F2FS_OPTION(sbi).nocompress_ext_cnt; + int i, cold_count, hot_count; + + if (!f2fs_sb_has_compression(sbi)) + return; + + if (S_ISDIR(inode->i_mode)) + goto inherit_comp; + + /* This name comes only from normal files. */ + if (!name) + return; + + /* Don't compress hot files. */ + f2fs_down_read(&sbi->sb_lock); + cold_count = le32_to_cpu(sbi->raw_super->extension_count); + hot_count = sbi->raw_super->hot_ext_count; + for (i = cold_count; i < cold_count + hot_count; i++) + if (is_extension_exist(name, extlist[i], false)) + break; + f2fs_up_read(&sbi->sb_lock); + if (i < (cold_count + hot_count)) + return; + + /* Don't compress unallowed extension. */ + for (i = 0; i < noext_cnt; i++) + if (is_extension_exist(name, noext[i], false)) + return; + + /* Compress wanting extension. */ + for (i = 0; i < ext_cnt; i++) { + if (is_extension_exist(name, ext[i], false)) { + set_compress_context(inode); + return; + } + } +inherit_comp: + /* Inherit the {no-}compression flag in directory */ + if (F2FS_I(dir)->i_flags & F2FS_NOCOMP_FL) { + F2FS_I(inode)->i_flags |= F2FS_NOCOMP_FL; + f2fs_mark_inode_dirty_sync(inode, true); + } else if (F2FS_I(dir)->i_flags & F2FS_COMPR_FL) { + set_compress_context(inode); + } +} + static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns, - struct inode *dir, umode_t mode) + struct inode *dir, umode_t mode, + const char *name) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); nid_t ino; @@ -114,12 +269,8 @@ static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns, if (F2FS_I(inode)->i_flags & F2FS_PROJINHERIT_FL) set_inode_flag(inode, FI_PROJ_INHERIT); - if (f2fs_sb_has_compression(sbi)) { - /* Inherit the compression flag in directory */ - if ((F2FS_I(dir)->i_flags & F2FS_COMPR_FL) && - f2fs_may_compress(inode)) - set_compress_context(inode); - } + /* Check compression first. */ + set_compress_new_inode(sbi, dir, inode, name); /* Should enable inline_data after compression set */ if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode)) @@ -153,40 +304,6 @@ fail_drop: return ERR_PTR(err); } -static inline int is_extension_exist(const unsigned char *s, const char *sub, - bool tmp_ext) -{ - size_t slen = strlen(s); - size_t sublen = strlen(sub); - int i; - - if (sublen == 1 && *sub == '*') - return 1; - - /* - * filename format of multimedia file should be defined as: - * "filename + '.' + extension + (optional: '.' + temp extension)". 
- */ - if (slen < sublen + 2) - return 0; - - if (!tmp_ext) { - /* file has no temp extension */ - if (s[slen - sublen - 1] != '.') - return 0; - return !strncasecmp(s + slen - sublen, sub, sublen); - } - - for (i = 1; i < slen - sublen; i++) { - if (s[i] != '.') - continue; - if (!strncasecmp(s + i + 1, sub, sublen)) - return 1; - } - - return 0; -} - /* * Set file's temperature for hot/cold data separation */ @@ -217,124 +334,6 @@ static inline void set_file_temperature(struct f2fs_sb_info *sbi, struct inode * file_set_hot(inode); } -int f2fs_update_extension_list(struct f2fs_sb_info *sbi, const char *name, - bool hot, bool set) -{ - __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list; - int cold_count = le32_to_cpu(sbi->raw_super->extension_count); - int hot_count = sbi->raw_super->hot_ext_count; - int total_count = cold_count + hot_count; - int start, count; - int i; - - if (set) { - if (total_count == F2FS_MAX_EXTENSION) - return -EINVAL; - } else { - if (!hot && !cold_count) - return -EINVAL; - if (hot && !hot_count) - return -EINVAL; - } - - if (hot) { - start = cold_count; - count = total_count; - } else { - start = 0; - count = cold_count; - } - - for (i = start; i < count; i++) { - if (strcmp(name, extlist[i])) - continue; - - if (set) - return -EINVAL; - - memcpy(extlist[i], extlist[i + 1], - F2FS_EXTENSION_LEN * (total_count - i - 1)); - memset(extlist[total_count - 1], 0, F2FS_EXTENSION_LEN); - if (hot) - sbi->raw_super->hot_ext_count = hot_count - 1; - else - sbi->raw_super->extension_count = - cpu_to_le32(cold_count - 1); - return 0; - } - - if (!set) - return -EINVAL; - - if (hot) { - memcpy(extlist[count], name, strlen(name)); - sbi->raw_super->hot_ext_count = hot_count + 1; - } else { - char buf[F2FS_MAX_EXTENSION][F2FS_EXTENSION_LEN]; - - memcpy(buf, &extlist[cold_count], - F2FS_EXTENSION_LEN * hot_count); - memset(extlist[cold_count], 0, F2FS_EXTENSION_LEN); - memcpy(extlist[cold_count], name, strlen(name)); - memcpy(&extlist[cold_count + 1], buf, - F2FS_EXTENSION_LEN * hot_count); - sbi->raw_super->extension_count = cpu_to_le32(cold_count + 1); - } - return 0; -} - -static void set_compress_inode(struct f2fs_sb_info *sbi, struct inode *inode, - const unsigned char *name) -{ - __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list; - unsigned char (*noext)[F2FS_EXTENSION_LEN] = F2FS_OPTION(sbi).noextensions; - unsigned char (*ext)[F2FS_EXTENSION_LEN] = F2FS_OPTION(sbi).extensions; - unsigned char ext_cnt = F2FS_OPTION(sbi).compress_ext_cnt; - unsigned char noext_cnt = F2FS_OPTION(sbi).nocompress_ext_cnt; - int i, cold_count, hot_count; - - if (!f2fs_sb_has_compression(sbi) || - F2FS_I(inode)->i_flags & F2FS_NOCOMP_FL || - !f2fs_may_compress(inode) || - (!ext_cnt && !noext_cnt)) - return; - - f2fs_down_read(&sbi->sb_lock); - - cold_count = le32_to_cpu(sbi->raw_super->extension_count); - hot_count = sbi->raw_super->hot_ext_count; - - for (i = cold_count; i < cold_count + hot_count; i++) { - if (is_extension_exist(name, extlist[i], false)) { - f2fs_up_read(&sbi->sb_lock); - return; - } - } - - f2fs_up_read(&sbi->sb_lock); - - for (i = 0; i < noext_cnt; i++) { - if (is_extension_exist(name, noext[i], false)) { - f2fs_disable_compressed_file(inode); - return; - } - } - - if (is_inode_flag_set(inode, FI_COMPRESSED_FILE)) - return; - - for (i = 0; i < ext_cnt; i++) { - if (!is_extension_exist(name, ext[i], false)) - continue; - - /* Do not use inline_data with compression */ - stat_dec_inline_inode(inode); - clear_inode_flag(inode, 
FI_INLINE_DATA); - set_compress_context(inode); - return; - } -} - static int f2fs_create(struct user_namespace *mnt_userns, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { @@ -352,15 +351,13 @@ static int f2fs_create(struct user_namespace *mnt_userns, struct inode *dir, if (err) return err; - inode = f2fs_new_inode(mnt_userns, dir, mode); + inode = f2fs_new_inode(mnt_userns, dir, mode, dentry->d_name.name); if (IS_ERR(inode)) return PTR_ERR(inode); if (!test_opt(sbi, DISABLE_EXT_IDENTIFY)) set_file_temperature(sbi, inode, dentry->d_name.name); - set_compress_inode(sbi, inode, dentry->d_name.name); - inode->i_op = &f2fs_file_inode_operations; inode->i_fop = &f2fs_file_operations; inode->i_mapping->a_ops = &f2fs_dblock_aops; @@ -689,7 +686,7 @@ static int f2fs_symlink(struct user_namespace *mnt_userns, struct inode *dir, if (err) return err; - inode = f2fs_new_inode(mnt_userns, dir, S_IFLNK | S_IRWXUGO); + inode = f2fs_new_inode(mnt_userns, dir, S_IFLNK | S_IRWXUGO, NULL); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -760,7 +757,7 @@ static int f2fs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, if (err) return err; - inode = f2fs_new_inode(mnt_userns, dir, S_IFDIR | mode); + inode = f2fs_new_inode(mnt_userns, dir, S_IFDIR | mode, NULL); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -817,7 +814,7 @@ static int f2fs_mknod(struct user_namespace *mnt_userns, struct inode *dir, if (err) return err; - inode = f2fs_new_inode(mnt_userns, dir, mode); + inode = f2fs_new_inode(mnt_userns, dir, mode, NULL); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -856,7 +853,7 @@ static int __f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, if (err) return err; - inode = f2fs_new_inode(mnt_userns, dir, mode); + inode = f2fs_new_inode(mnt_userns, dir, mode, NULL); if (IS_ERR(inode)) return PTR_ERR(inode); From eba4e4aacc25a4b55cb1a5222d38d8488fe63150 Mon Sep 17 00:00:00 2001 From: Sheng Yong Date: Fri, 11 Nov 2022 18:08:30 +0800 Subject: [PATCH 309/457] f2fs: move set_file_temperature into f2fs_new_inode Since the file name has already passed to f2fs_new_inode(), let's move set_file_temperature() into f2fs_new_inode(). 
Signed-off-by: Sheng Yong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 62 +++++++++++++++++++++++-------------------------- 1 file changed, 29 insertions(+), 33 deletions(-) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 54448dccbb6a..58a91ce8fe08 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -176,6 +176,32 @@ inherit_comp: } } +/* + * Set file's temperature for hot/cold data separation + */ +static void set_file_temperature(struct f2fs_sb_info *sbi, struct inode *inode, + const unsigned char *name) +{ + __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list; + int i, cold_count, hot_count; + + f2fs_down_read(&sbi->sb_lock); + cold_count = le32_to_cpu(sbi->raw_super->extension_count); + hot_count = sbi->raw_super->hot_ext_count; + for (i = 0; i < cold_count + hot_count; i++) + if (is_extension_exist(name, extlist[i], true)) + break; + f2fs_up_read(&sbi->sb_lock); + + if (i == cold_count + hot_count) + return; + + if (i < cold_count) + file_set_cold(inode); + else + file_set_hot(inode); +} + static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns, struct inode *dir, umode_t mode, const char *name) @@ -276,6 +302,9 @@ static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns, if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode)) set_inode_flag(inode, FI_INLINE_DATA); + if (name && !test_opt(sbi, DISABLE_EXT_IDENTIFY)) + set_file_temperature(sbi, inode, name); + stat_inc_inline_xattr(inode); stat_inc_inline_inode(inode); stat_inc_inline_dir(inode); @@ -304,36 +333,6 @@ fail_drop: return ERR_PTR(err); } -/* - * Set file's temperature for hot/cold data separation - */ -static inline void set_file_temperature(struct f2fs_sb_info *sbi, struct inode *inode, - const unsigned char *name) -{ - __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list; - int i, cold_count, hot_count; - - f2fs_down_read(&sbi->sb_lock); - - cold_count = le32_to_cpu(sbi->raw_super->extension_count); - hot_count = sbi->raw_super->hot_ext_count; - - for (i = 0; i < cold_count + hot_count; i++) { - if (is_extension_exist(name, extlist[i], true)) - break; - } - - f2fs_up_read(&sbi->sb_lock); - - if (i == cold_count + hot_count) - return; - - if (i < cold_count) - file_set_cold(inode); - else - file_set_hot(inode); -} - static int f2fs_create(struct user_namespace *mnt_userns, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { @@ -355,9 +354,6 @@ static int f2fs_create(struct user_namespace *mnt_userns, struct inode *dir, if (IS_ERR(inode)) return PTR_ERR(inode); - if (!test_opt(sbi, DISABLE_EXT_IDENTIFY)) - set_file_temperature(sbi, inode, dentry->d_name.name); - inode->i_op = &f2fs_file_inode_operations; inode->i_fop = &f2fs_file_operations; inode->i_mapping->a_ops = &f2fs_dblock_aops; From 7b2c7dcdf42236a998331f2c2473ff38ad30b304 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Sat, 19 Nov 2022 01:40:28 +0800 Subject: [PATCH 310/457] f2fs: fix description about discard_granularity node Let's fix the inconsistency in the text description. Default discard granularity is 16. For small devices, default value is 1. 
Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 24e7cb77f265..32404781e76f 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -138,7 +138,8 @@ Contact: "Chao Yu" Description: Controls discard granularity of inner discard thread. Inner thread will not issue discards with size that is smaller than granularity. The unit size is one block(4KB), now only support configuring - in range of [1, 512]. Default value is 4(=16KB). + in range of [1, 512]. Default value is 16. + For small devices, default value is 1. What: /sys/fs/f2fs//umount_discard_timeout Date: January 2019 From afcde7f31aca0cad0fd89e1b6f3d77eed9b0a885 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Thu, 17 Nov 2022 01:10:45 +0800 Subject: [PATCH 311/457] f2fs: make __queue_discard_cmd() return void Since __queue_discard_cmd() never returns an error, let's make it return void. Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 8b0b76550578..14ece4bf7c7e 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1358,13 +1358,13 @@ static void __update_discard_tree_range(struct f2fs_sb_info *sbi, } } -static int __queue_discard_cmd(struct f2fs_sb_info *sbi, +static void __queue_discard_cmd(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkstart, block_t blklen) { block_t lblkstart = blkstart; if (!f2fs_bdev_support_discard(bdev)) - return 0; + return; trace_f2fs_queue_discard(bdev, blkstart, blklen); @@ -1376,7 +1376,6 @@ static int __queue_discard_cmd(struct f2fs_sb_info *sbi, mutex_lock(&SM_I(sbi)->dcc_info->cmd_lock); __update_discard_tree_range(sbi, bdev, lblkstart, blkstart, blklen); mutex_unlock(&SM_I(sbi)->dcc_info->cmd_lock); - return 0; } static unsigned int __issue_discard_cmd_orderly(struct f2fs_sb_info *sbi, @@ -1776,7 +1775,8 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, } /* For conventional zones, use regular discard if supported */ - return __queue_discard_cmd(sbi, bdev, lblkstart, blklen); + __queue_discard_cmd(sbi, bdev, lblkstart, blklen); + return 0; } #endif @@ -1787,7 +1787,8 @@ static int __issue_discard_async(struct f2fs_sb_info *sbi, if (f2fs_sb_has_blkzoned(sbi) && bdev_is_zoned(bdev)) return __f2fs_issue_discard_zone(sbi, bdev, blkstart, blklen); #endif - return __queue_discard_cmd(sbi, bdev, blkstart, blklen); + __queue_discard_cmd(sbi, bdev, blkstart, blklen); + return 0; } static int f2fs_issue_discard(struct f2fs_sb_info *sbi, From 9cd485417ad37039a807bbb5e69ad9317f419e29 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 23 Nov 2022 06:42:52 +0800 Subject: [PATCH 312/457] f2fs: truncate blocks in batch in __complete_revoke_list() Use f2fs_do_truncate_blocks() to truncate all blocks in-batch in __complete_revoke_list(). 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 14ece4bf7c7e..37c721e1eb03 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -262,24 +262,19 @@ static void __complete_revoke_list(struct inode *inode, struct list_head *head, bool revoke) { struct revoke_entry *cur, *tmp; - pgoff_t start_index = 0; bool truncate = is_inode_flag_set(inode, FI_ATOMIC_REPLACE); list_for_each_entry_safe(cur, tmp, head, list) { - if (revoke) { + if (revoke) __replace_atomic_write_block(inode, cur->index, cur->old_addr, NULL, true); - } else if (truncate) { - f2fs_truncate_hole(inode, start_index, cur->index); - start_index = cur->index + 1; - } list_del(&cur->list); kmem_cache_free(revoke_entry_slab, cur); } if (!revoke && truncate) - f2fs_do_truncate_blocks(inode, start_index * PAGE_SIZE, false); + f2fs_do_truncate_blocks(inode, 0, false); } static int __f2fs_commit_atomic_write(struct inode *inode) From 32cd796c728ba8d7f6f27fae2719b841adc5e51e Mon Sep 17 00:00:00 2001 From: Yonggil Song Date: Tue, 22 Nov 2022 18:03:20 +0900 Subject: [PATCH 313/457] f2fs: avoid victim selection from previous victim section When f2fs chooses GC victim in large section & LFS mode, next_victim_seg[gc_type] is referenced first. After segment is freed, next_victim_seg[gc_type] has the next segment number. However, next_victim_seg[gc_type] still has the last segment number even after the last segment of section is freed. In this case, when f2fs chooses a victim for the next GC round, the last segment of previous victim section is chosen as a victim. Initialize next_victim_seg[gc_type] to NULL_SEGNO for the last segment in large section. Fixes: e3080b0120a1 ("f2fs: support subsectional garbage collection") Signed-off-by: Yonggil Song Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 0f967b1e98f2..f1b68eda2235 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1749,8 +1749,9 @@ freed: get_valid_blocks(sbi, segno, false) == 0) seg_freed++; - if (__is_large_section(sbi) && segno + 1 < end_segno) - sbi->next_victim_seg[gc_type] = segno + 1; + if (__is_large_section(sbi)) + sbi->next_victim_seg[gc_type] = + (segno + 1 < end_segno) ? segno + 1 : NULL_SEGNO; skip: f2fs_put_page(sum_page, 0); } From a69b03e0839abf25ad88a78e7c1052d4465b090d Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Fri, 18 Nov 2022 11:46:00 +0800 Subject: [PATCH 314/457] f2fs: init discard policy after thread wakeup Under the current logic, after the discard thread wakes up, it will not run according to the expected policy, but will use the expected policy before sleep. Move the strategy selection to after the thread wakes up, so that the running state of the thread meets expectations. 
Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 37c721e1eb03..73ad8dc9a4d3 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1679,6 +1679,11 @@ static int issue_discard_thread(void *data) set_freezable(); do { + wait_event_interruptible_timeout(*q, + kthread_should_stop() || freezing(current) || + dcc->discard_wake, + msecs_to_jiffies(wait_ms)); + if (sbi->gc_mode == GC_URGENT_HIGH || !f2fs_available_free_memory(sbi, DISCARD_CACHE)) __init_discard_policy(sbi, &dpolicy, DPOLICY_FORCE, 1); @@ -1686,14 +1691,6 @@ static int issue_discard_thread(void *data) __init_discard_policy(sbi, &dpolicy, DPOLICY_BG, dcc->discard_granularity); - if (!atomic_read(&dcc->discard_cmd_cnt)) - wait_ms = dpolicy.max_interval; - - wait_event_interruptible_timeout(*q, - kthread_should_stop() || freezing(current) || - dcc->discard_wake, - msecs_to_jiffies(wait_ms)); - if (dcc->discard_wake) dcc->discard_wake = 0; @@ -1707,12 +1704,11 @@ static int issue_discard_thread(void *data) continue; if (kthread_should_stop()) return 0; - if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) { + if (is_sbi_flag_set(sbi, SBI_NEED_FSCK) || + !atomic_read(&dcc->discard_cmd_cnt)) { wait_ms = dpolicy.max_interval; continue; } - if (!atomic_read(&dcc->discard_cmd_cnt)) - continue; sb_start_intwrite(sbi->sb); @@ -1727,6 +1723,8 @@ static int issue_discard_thread(void *data) } else { wait_ms = dpolicy.max_interval; } + if (!atomic_read(&dcc->discard_cmd_cnt)) + wait_ms = dpolicy.max_interval; sb_end_intwrite(sbi->sb); From b7d98b7ba208b44c1447a20135c887194b0505b8 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Thu, 24 Nov 2022 00:44:01 +0800 Subject: [PATCH 315/457] f2fs: define MIN_DISCARD_GRANULARITY macro Do cleanup in f2fs_tuning_parameters() and __init_discard_policy(), let's use macro instead of number. 
Suggested-by: Chao Yu Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 ++ fs/f2fs/segment.c | 4 ++-- fs/f2fs/super.c | 3 ++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index f0833638f59e..4694b55b6df4 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -329,6 +329,8 @@ struct discard_entry { unsigned char discard_map[SIT_VBLOCK_MAP_SIZE]; /* segment discard bitmap */ }; +/* minimum discard granularity, unit: block count */ +#define MIN_DISCARD_GRANULARITY 1 /* default discard granularity of inner discard thread, unit: block count */ #define DEFAULT_DISCARD_GRANULARITY 16 /* default maximum discard granularity of ordered discard, unit: block count */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 73ad8dc9a4d3..c7afcc6cd75b 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1065,7 +1065,7 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi, dpolicy->sync = false; dpolicy->ordered = true; if (utilization(sbi) > DEF_DISCARD_URGENT_UTIL) { - dpolicy->granularity = 1; + dpolicy->granularity = MIN_DISCARD_GRANULARITY; if (atomic_read(&dcc->discard_cmd_cnt)) dpolicy->max_interval = dcc->min_discard_issue_time; @@ -1080,7 +1080,7 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi, } else if (discard_type == DPOLICY_UMOUNT) { dpolicy->io_aware = false; /* we need to issue all to keep CP_TRIMMED_FLAG */ - dpolicy->granularity = 1; + dpolicy->granularity = MIN_DISCARD_GRANULARITY; dpolicy->timeout = true; } } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 31435c8645c8..daf14b55a972 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -4080,7 +4080,8 @@ static void f2fs_tuning_parameters(struct f2fs_sb_info *sbi) /* adjust parameters according to the volume size */ if (MAIN_SEGS(sbi) <= SMALL_VOLUME_SEGMENTS) { if (f2fs_block_unit_discard(sbi)) - SM_I(sbi)->dcc_info->discard_granularity = 1; + SM_I(sbi)->dcc_info->discard_granularity = + MIN_DISCARD_GRANULARITY; SM_I(sbi)->ipu_policy = 1 << F2FS_IPU_FORCE | 1 << F2FS_IPU_HONOR_OPU_WRITE; } From 3da0529e345074b0c37dca777c0e6f7333f953ce Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Thu, 24 Nov 2022 00:44:02 +0800 Subject: [PATCH 316/457] f2fs: introduce discard_urgent_util sysfs node Through this node, you can control the background discard to run more aggressively or not aggressively when reach the utilization rate of the space. Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 8 ++++++++ fs/f2fs/f2fs.h | 1 + fs/f2fs/segment.c | 3 ++- fs/f2fs/sysfs.c | 9 +++++++++ 4 files changed, 20 insertions(+), 1 deletion(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 32404781e76f..84a009aab1a1 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -647,3 +647,11 @@ Date: October 2022 Contact: "Yangtao Li" Description: Show the current gc_mode as a string. This is a read-only entry. + +What: /sys/fs/f2fs//discard_urgent_util +Date: November 2022 +Contact: "Yangtao Li" +Description: When space utilization exceeds this, do background DISCARD aggressively. + Does DISCARD forcibly in a period of given min_discard_issue_time when the number + of discards is not 0 and set discard granularity to 1. 
+ Default: 80 diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 4694b55b6df4..296683648d4f 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -413,6 +413,7 @@ struct discard_cmd_control { unsigned int min_discard_issue_time; /* min. interval between discard issue */ unsigned int mid_discard_issue_time; /* mid. interval between discard issue */ unsigned int max_discard_issue_time; /* max. interval between discard issue */ + unsigned int discard_urgent_util; /* utilization which issue discard proactively */ unsigned int discard_granularity; /* discard granularity */ unsigned int max_ordered_discard; /* maximum discard granularity issued by lba order */ unsigned int undiscard_blks; /* # of undiscard blocks */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index c7afcc6cd75b..0ff451ea18f6 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1064,7 +1064,7 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi, dpolicy->io_aware = true; dpolicy->sync = false; dpolicy->ordered = true; - if (utilization(sbi) > DEF_DISCARD_URGENT_UTIL) { + if (utilization(sbi) > dcc->discard_urgent_util) { dpolicy->granularity = MIN_DISCARD_GRANULARITY; if (atomic_read(&dcc->discard_cmd_cnt)) dpolicy->max_interval = @@ -2079,6 +2079,7 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) dcc->min_discard_issue_time = DEF_MIN_DISCARD_ISSUE_TIME; dcc->mid_discard_issue_time = DEF_MID_DISCARD_ISSUE_TIME; dcc->max_discard_issue_time = DEF_MAX_DISCARD_ISSUE_TIME; + dcc->discard_urgent_util = DEF_DISCARD_URGENT_UTIL; dcc->undiscard_blks = 0; dcc->next_pos = 0; dcc->root = RB_ROOT_CACHED; diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 33ec467b3772..a4745d596310 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -493,6 +493,13 @@ out: return count; } + if (!strcmp(a->attr.name, "discard_urgent_util")) { + if (t > 100) + return -EINVAL; + *ui = t; + return count; + } + if (!strcmp(a->attr.name, "migration_granularity")) { if (t == 0 || t > sbi->segs_per_sec) return -EINVAL; @@ -800,6 +807,7 @@ F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_discard_request, max_discard_req F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, min_discard_issue_time, min_discard_issue_time); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, mid_discard_issue_time, mid_discard_issue_time); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_discard_issue_time, max_discard_issue_time); +F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, discard_urgent_util, discard_urgent_util); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, discard_granularity, discard_granularity); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_ordered_discard, max_ordered_discard); F2FS_RW_ATTR(RESERVED_BLOCKS, f2fs_sb_info, reserved_blocks, reserved_blocks); @@ -930,6 +938,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(min_discard_issue_time), ATTR_LIST(mid_discard_issue_time), ATTR_LIST(max_discard_issue_time), + ATTR_LIST(discard_urgent_util), ATTR_LIST(discard_granularity), ATTR_LIST(max_ordered_discard), ATTR_LIST(pending_discard), From 3b38f331c40a3b7eb11b2f090ab222b880faa2a3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 28 Nov 2022 10:43:44 +0100 Subject: [PATCH 317/457] f2fs: remove struct segment_allocation default_salloc_ops There is only single instance of these ops, so remove the indirection and call allocate_segment_by_default directly. 
Signed-off-by: Christoph Hellwig Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 11 ++--------- fs/f2fs/segment.h | 6 ------ 2 files changed, 2 insertions(+), 15 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 0ff451ea18f6..bbe6556799ce 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2926,7 +2926,7 @@ static void __allocate_new_segment(struct f2fs_sb_info *sbi, int type, return; alloc: old_segno = curseg->segno; - SIT_I(sbi)->s_ops->allocate_segment(sbi, type, true); + allocate_segment_by_default(sbi, type, true); locate_dirty_segment(sbi, old_segno); } @@ -2957,10 +2957,6 @@ void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi) f2fs_up_read(&SM_I(sbi)->curseg_lock); } -static const struct segment_allocation default_salloc_ops = { - .allocate_segment = allocate_segment_by_default, -}; - bool f2fs_exist_trim_candidates(struct f2fs_sb_info *sbi, struct cp_control *cpc) { @@ -3284,7 +3280,7 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, get_atssr_segment(sbi, type, se->type, AT_SSR, se->mtime); else - sit_i->s_ops->allocate_segment(sbi, type, false); + allocate_segment_by_default(sbi, type, false); } /* * segment dirty status should be updated after segment allocation, @@ -4270,9 +4266,6 @@ static int build_sit_info(struct f2fs_sb_info *sbi) return -ENOMEM; #endif - /* init SIT information */ - sit_i->s_ops = &default_salloc_ops; - sit_i->sit_base_addr = le32_to_cpu(raw_super->sit_blkaddr); sit_i->sit_blocks = sit_segs << sbi->log_blocks_per_seg; sit_i->written_valid_blocks = 0; diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index be8f2d7d007b..3ad1b7b6fa94 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -222,10 +222,6 @@ struct sec_entry { unsigned int valid_blocks; /* # of valid blocks in a section */ }; -struct segment_allocation { - void (*allocate_segment)(struct f2fs_sb_info *, int, bool); -}; - #define MAX_SKIP_GC_COUNT 16 struct revoke_entry { @@ -235,8 +231,6 @@ struct revoke_entry { }; struct sit_info { - const struct segment_allocation *s_ops; - block_t sit_base_addr; /* start block address of SIT area */ block_t sit_blocks; /* # of blocks used by SIT area */ block_t written_valid_blocks; /* # of valid blocks in main area */ From 8c5ac3020331759bafa78caa46518c419fc8cf58 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 28 Nov 2022 10:43:45 +0100 Subject: [PATCH 318/457] f2fs: open code allocate_segment_by_default allocate_segment_by_default has just two callers, which use very different code pathes inside it based on the force paramter. Just open code the logic in the two callers using a new helper to decided if a new segment should be allocated. 
Signed-off-by: Christoph Hellwig Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 50 +++++++++++++++++++++++------------------------ 1 file changed, 24 insertions(+), 26 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index bbe6556799ce..c4e118eb7d19 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2849,31 +2849,20 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type, return 0; } -/* - * flush out current segment and replace it with new segment - * This function should be returned with success, otherwise BUG - */ -static void allocate_segment_by_default(struct f2fs_sb_info *sbi, - int type, bool force) +static bool need_new_seg(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg = CURSEG_I(sbi, type); - if (force) - new_curseg(sbi, type, true); - else if (!is_set_ckpt_flags(sbi, CP_CRC_RECOVERY_FLAG) && - curseg->seg_type == CURSEG_WARM_NODE) - new_curseg(sbi, type, false); - else if (curseg->alloc_type == LFS && - is_next_segment_free(sbi, curseg, type) && - likely(!is_sbi_flag_set(sbi, SBI_CP_DISABLED))) - new_curseg(sbi, type, false); - else if (f2fs_need_SSR(sbi) && - get_ssr_segment(sbi, type, SSR, 0)) - change_curseg(sbi, type, true); - else - new_curseg(sbi, type, false); - - stat_inc_seg_type(sbi, curseg); + if (!is_set_ckpt_flags(sbi, CP_CRC_RECOVERY_FLAG) && + curseg->seg_type == CURSEG_WARM_NODE) + return true; + if (curseg->alloc_type == LFS && + is_next_segment_free(sbi, curseg, type) && + likely(!is_sbi_flag_set(sbi, SBI_CP_DISABLED))) + return true; + if (!f2fs_need_SSR(sbi) || !get_ssr_segment(sbi, type, SSR, 0)) + return true; + return false; } void f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type, @@ -2926,7 +2915,8 @@ static void __allocate_new_segment(struct f2fs_sb_info *sbi, int type, return; alloc: old_segno = curseg->segno; - allocate_segment_by_default(sbi, type, true); + new_curseg(sbi, type, true); + stat_inc_seg_type(sbi, curseg); locate_dirty_segment(sbi, old_segno); } @@ -3276,11 +3266,19 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, update_sit_entry(sbi, old_blkaddr, -1); if (!__has_curseg_space(sbi, curseg)) { - if (from_gc) + /* + * Flush out current segment and replace it with new segment. 
+ */ + if (from_gc) { get_atssr_segment(sbi, type, se->type, AT_SSR, se->mtime); - else - allocate_segment_by_default(sbi, type, false); + } else { + if (need_new_seg(sbi, type)) + new_curseg(sbi, type, false); + else + change_curseg(sbi, type, true); + stat_inc_seg_type(sbi, curseg); + } } /* * segment dirty status should be updated after segment allocation, From 7d61fab4797f69f50357f060bd75641cb24a2706 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 28 Nov 2022 10:43:46 +0100 Subject: [PATCH 319/457] f2fs: remove the unused flush argument to change_curseg Signed-off-by: Christoph Hellwig Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index c4e118eb7d19..9486ca49ecb1 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2656,7 +2656,7 @@ bool f2fs_segment_has_free_slot(struct f2fs_sb_info *sbi, int segno) * This function always allocates a used segment(from dirty seglist) by SSR * manner, so it should recover the existing segment information of valid blocks */ -static void change_curseg(struct f2fs_sb_info *sbi, int type, bool flush) +static void change_curseg(struct f2fs_sb_info *sbi, int type) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, type); @@ -2664,9 +2664,7 @@ static void change_curseg(struct f2fs_sb_info *sbi, int type, bool flush) struct f2fs_summary_block *sum_node; struct page *sum_page; - if (flush) - write_sum_page(sbi, curseg->sum_blk, - GET_SUM_BLOCK(sbi, curseg->segno)); + write_sum_page(sbi, curseg->sum_blk, GET_SUM_BLOCK(sbi, curseg->segno)); __set_test_and_inuse(sbi, new_segno); @@ -2705,7 +2703,7 @@ static void get_atssr_segment(struct f2fs_sb_info *sbi, int type, struct seg_entry *se = get_seg_entry(sbi, curseg->next_segno); curseg->seg_type = se->type; - change_curseg(sbi, type, true); + change_curseg(sbi, type); } else { /* allocate cold segment by default */ curseg->seg_type = CURSEG_COLD_DATA; @@ -2880,7 +2878,7 @@ void f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type, goto unlock; if (f2fs_need_SSR(sbi) && get_ssr_segment(sbi, type, SSR, 0)) - change_curseg(sbi, type, true); + change_curseg(sbi, type); else new_curseg(sbi, type, true); @@ -3276,7 +3274,7 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, if (need_new_seg(sbi, type)) new_curseg(sbi, type, false); else - change_curseg(sbi, type, true); + change_curseg(sbi, type); stat_inc_seg_type(sbi, curseg); } } @@ -3539,7 +3537,7 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, /* change the current segment */ if (segno != curseg->segno) { curseg->next_segno = segno; - change_curseg(sbi, type, true); + change_curseg(sbi, type); } curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr); @@ -3567,7 +3565,7 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, if (recover_curseg) { if (old_cursegno != curseg->segno) { curseg->next_segno = old_cursegno; - change_curseg(sbi, type, true); + change_curseg(sbi, type); } curseg->next_blkoff = old_blkoff; curseg->alloc_type = old_alloc_type; From 9c6b2ad7917c7522b27e4ab619640254e3eaa7ff Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 26 Nov 2022 10:38:07 +0800 Subject: [PATCH 320/457] MAINTAINERS: Add f2fs bug tracker link As f2fs component in bugzilla.kernel.org was created and used since 2018-7. 
Signed-off-by: Chao Yu Signed-off-by: Yangtao Li Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.rst | 6 +++++- MAINTAINERS | 1 + 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index 6e67c5e6c7c3..67e1f3e86f32 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -25,10 +25,14 @@ a consistency checking tool (fsck.f2fs), and a debugging tool (dump.f2fs). - git://git.kernel.org/pub/scm/linux/kernel/git/jaegeuk/f2fs-tools.git -For reporting bugs and sending patches, please use the following mailing list: +For sending patches, please use the following mailing list: - linux-f2fs-devel@lists.sourceforge.net +For reporting bugs, please use the following f2fs bug tracker link: + +- https://bugzilla.kernel.org/enter_bug.cgi?product=File%20System&component=f2fs + Background and Design issues ============================ diff --git a/MAINTAINERS b/MAINTAINERS index 886d3f69ee64..82c29270b7cd 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7831,6 +7831,7 @@ M: Chao Yu L: linux-f2fs-devel@lists.sourceforge.net S: Maintained W: https://f2fs.wiki.kernel.org/ +B: https://bugzilla.kernel.org/enter_bug.cgi?product=File%20System&component=f2fs T: git git://git.kernel.org/pub/scm/linux/kernel/git/jaegeuk/f2fs.git F: Documentation/ABI/testing/sysfs-fs-f2fs F: Documentation/filesystems/f2fs.rst From 14599434a25b2c02e89a8fb2f08391290be19da6 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Fri, 25 Nov 2022 19:47:36 +0800 Subject: [PATCH 321/457] f2fs: do some cleanup for f2fs module init Just for cleanup, no functional changes. Signed-off-by: Yangtao Li Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 46 ++++++---------------------------------------- fs/f2fs/data.c | 14 ++++---------- fs/f2fs/gc.c | 4 +--- fs/f2fs/recovery.c | 4 +--- fs/f2fs/super.c | 8 ++------ 5 files changed, 14 insertions(+), 62 deletions(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 74d3f2d2271f..9723f0bed923 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -567,10 +567,7 @@ MODULE_PARM_DESC(num_compress_pages, int f2fs_init_compress_mempool(void) { compress_page_pool = mempool_create_page_pool(num_compress_pages, 0); - if (!compress_page_pool) - return -ENOMEM; - - return 0; + return compress_page_pool ? 0 : -ENOMEM; } void f2fs_destroy_compress_mempool(void) @@ -1983,9 +1980,7 @@ int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi) sbi->page_array_slab = f2fs_kmem_cache_create(slab_name, sbi->page_array_slab_size); - if (!sbi->page_array_slab) - return -ENOMEM; - return 0; + return sbi->page_array_slab ? 
0 : -ENOMEM; } void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi) @@ -1993,53 +1988,24 @@ void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi) kmem_cache_destroy(sbi->page_array_slab); } -static int __init f2fs_init_cic_cache(void) +int __init f2fs_init_compress_cache(void) { cic_entry_slab = f2fs_kmem_cache_create("f2fs_cic_entry", sizeof(struct compress_io_ctx)); if (!cic_entry_slab) return -ENOMEM; - return 0; -} - -static void f2fs_destroy_cic_cache(void) -{ - kmem_cache_destroy(cic_entry_slab); -} - -static int __init f2fs_init_dic_cache(void) -{ dic_entry_slab = f2fs_kmem_cache_create("f2fs_dic_entry", sizeof(struct decompress_io_ctx)); if (!dic_entry_slab) - return -ENOMEM; - return 0; -} - -static void f2fs_destroy_dic_cache(void) -{ - kmem_cache_destroy(dic_entry_slab); -} - -int __init f2fs_init_compress_cache(void) -{ - int err; - - err = f2fs_init_cic_cache(); - if (err) - goto out; - err = f2fs_init_dic_cache(); - if (err) goto free_cic; return 0; free_cic: - f2fs_destroy_cic_cache(); -out: + kmem_cache_destroy(cic_entry_slab); return -ENOMEM; } void f2fs_destroy_compress_cache(void) { - f2fs_destroy_dic_cache(); - f2fs_destroy_cic_cache(); + kmem_cache_destroy(dic_entry_slab); + kmem_cache_destroy(cic_entry_slab); } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 560fa80590e9..35c19248b1e2 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -39,10 +39,8 @@ static struct bio_set f2fs_bioset; int __init f2fs_init_bioset(void) { - if (bioset_init(&f2fs_bioset, F2FS_BIO_POOL_SIZE, - 0, BIOSET_NEED_BVECS)) - return -ENOMEM; - return 0; + return bioset_init(&f2fs_bioset, F2FS_BIO_POOL_SIZE, + 0, BIOSET_NEED_BVECS); } void f2fs_destroy_bioset(void) @@ -4090,9 +4088,7 @@ int f2fs_init_post_read_wq(struct f2fs_sb_info *sbi) sbi->post_read_wq = alloc_workqueue("f2fs_post_read_wq", WQ_UNBOUND | WQ_HIGHPRI, num_online_cpus()); - if (!sbi->post_read_wq) - return -ENOMEM; - return 0; + return sbi->post_read_wq ? 0 : -ENOMEM; } void f2fs_destroy_post_read_wq(struct f2fs_sb_info *sbi) @@ -4105,9 +4101,7 @@ int __init f2fs_init_bio_entry_cache(void) { bio_entry_slab = f2fs_kmem_cache_create("f2fs_bio_entry_slab", sizeof(struct bio_entry)); - if (!bio_entry_slab) - return -ENOMEM; - return 0; + return bio_entry_slab ? 0 : -ENOMEM; } void f2fs_destroy_bio_entry_cache(void) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index f1b68eda2235..d19e26b2e875 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1904,9 +1904,7 @@ int __init f2fs_create_garbage_collection_cache(void) { victim_entry_slab = f2fs_kmem_cache_create("f2fs_victim_entry", sizeof(struct victim_entry)); - if (!victim_entry_slab) - return -ENOMEM; - return 0; + return victim_entry_slab ? 0 : -ENOMEM; } void f2fs_destroy_garbage_collection_cache(void) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index dea95b48b647..77fd453949b1 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -923,9 +923,7 @@ int __init f2fs_create_recovery_cache(void) { fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry", sizeof(struct fsync_inode_entry)); - if (!fsync_entry_slab) - return -ENOMEM; - return 0; + return fsync_entry_slab ? 
0 : -ENOMEM; } void f2fs_destroy_recovery_cache(void) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index daf14b55a972..a5f6f632cf7c 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -288,9 +288,7 @@ static int __init f2fs_create_casefold_cache(void) { f2fs_cf_name_slab = f2fs_kmem_cache_create("f2fs_casefolded_name", F2FS_NAME_LEN); - if (!f2fs_cf_name_slab) - return -ENOMEM; - return 0; + return f2fs_cf_name_slab ? 0 : -ENOMEM; } static void f2fs_destroy_casefold_cache(void) @@ -4647,9 +4645,7 @@ static int __init init_inodecache(void) f2fs_inode_cachep = kmem_cache_create("f2fs_inode_cache", sizeof(struct f2fs_inode_info), 0, SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT, NULL); - if (!f2fs_inode_cachep) - return -ENOMEM; - return 0; + return f2fs_inode_cachep ? 0 : -ENOMEM; } static void destroy_inodecache(void) From 71f36b65b9ad1fd0e94b4a89adaf7e1695390ea6 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Thu, 24 Nov 2022 11:37:08 +0800 Subject: [PATCH 322/457] f2fs: remove F2FS_SET_FEATURE() and F2FS_CLEAR_FEATURE() macro F2FS_SET_FEATURE() and F2FS_CLEAR_FEATURE() have never been used since they were introduced by this commit 76f105a2dbcd("f2fs: add feature facility in superblock"). So let's remove them. BTW, convert f2fs_sb_has_##name to return bool. Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 296683648d4f..cf738f1275b2 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -203,10 +203,6 @@ struct f2fs_mount_info { #define __F2FS_HAS_FEATURE(raw_super, mask) \ ((raw_super->feature & cpu_to_le32(mask)) != 0) #define F2FS_HAS_FEATURE(sbi, mask) __F2FS_HAS_FEATURE(sbi->raw_super, mask) -#define F2FS_SET_FEATURE(sbi, mask) \ - (sbi->raw_super->feature |= cpu_to_le32(mask)) -#define F2FS_CLEAR_FEATURE(sbi, mask) \ - (sbi->raw_super->feature &= ~cpu_to_le32(mask)) /* * Default values for user and/or group using reserved blocks @@ -4387,7 +4383,7 @@ static inline bool f2fs_disable_compressed_file(struct inode *inode) } #define F2FS_FEATURE_FUNCS(name, flagname) \ -static inline int f2fs_sb_has_##name(struct f2fs_sb_info *sbi) \ +static inline bool f2fs_sb_has_##name(struct f2fs_sb_info *sbi) \ { \ return F2FS_HAS_FEATURE(sbi, F2FS_FEATURE_##flagname); \ } From 5a74b63ce612ae28b94099770dd0053e0050b271 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Thu, 24 Nov 2022 10:48:42 +0800 Subject: [PATCH 323/457] f2fs: introduce f2fs_is_readonly() for readability Introduce f2fs_is_readonly() and use it to simplify code. 
Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 5 +++++ fs/f2fs/super.c | 5 ++--- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index cf738f1275b2..eb8c27c4e5fc 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -4575,6 +4575,11 @@ static inline void f2fs_handle_page_eio(struct f2fs_sb_info *sbi, pgoff_t ofs, } } +static inline bool f2fs_is_readonly(struct f2fs_sb_info *sbi) +{ + return f2fs_sb_has_readonly(sbi) || f2fs_readonly(sbi->sb); +} + #define EFSBADCRC EBADMSG /* Bad CRC detected */ #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index a5f6f632cf7c..79bf1faf4161 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1351,8 +1351,7 @@ default_check: return -EINVAL; } - if ((f2fs_sb_has_readonly(sbi) || f2fs_readonly(sbi->sb)) && - test_opt(sbi, FLUSH_MERGE)) { + if (f2fs_is_readonly(sbi) && test_opt(sbi, FLUSH_MERGE)) { f2fs_err(sbi, "FLUSH_MERGE not compatible with readonly mode"); return -EINVAL; } @@ -2083,7 +2082,7 @@ static void default_options(struct f2fs_sb_info *sbi) set_opt(sbi, MERGE_CHECKPOINT); F2FS_OPTION(sbi).unusable_cap = 0; sbi->sb->s_flags |= SB_LAZYTIME; - if (!f2fs_sb_has_readonly(sbi) && !f2fs_readonly(sbi->sb)) + if (!f2fs_is_readonly(sbi)) set_opt(sbi, FLUSH_MERGE); if (f2fs_hw_support_discard(sbi) || f2fs_hw_should_discard(sbi)) set_opt(sbi, DISCARD); From deb572354fecd80b4887d638e3f4a7a06c570c8d Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 30 Nov 2022 09:36:43 -0800 Subject: [PATCH 324/457] f2fs: specify extent cache for read explicitly Let's descrbie it's read extent cache. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 4 ++-- fs/f2fs/f2fs.h | 10 +++++----- fs/f2fs/inode.c | 2 +- fs/f2fs/node.c | 2 +- fs/f2fs/node.h | 2 +- fs/f2fs/segment.c | 4 ++-- fs/f2fs/super.c | 12 ++++++------ 7 files changed, 18 insertions(+), 18 deletions(-) diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 932c070173b9..8cd87aee0292 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -383,7 +383,7 @@ static void __f2fs_init_extent_tree(struct inode *inode, struct page *ipage) if (!i_ext || !i_ext->len) return; - get_extent_info(&ei, i_ext); + get_read_extent_info(&ei, i_ext); write_lock(&et->lock); if (atomic_read(&et->node_cnt)) @@ -710,7 +710,7 @@ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) unsigned int node_cnt = 0, tree_cnt = 0; int remained; - if (!test_opt(sbi, EXTENT_CACHE)) + if (!test_opt(sbi, READ_EXTENT_CACHE)) return 0; if (!atomic_read(&sbi->total_zombie_tree)) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index eb8c27c4e5fc..1c39f8145b61 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -92,7 +92,7 @@ extern const char *f2fs_fault_name[FAULT_MAX]; #define F2FS_MOUNT_FLUSH_MERGE 0x00000400 #define F2FS_MOUNT_NOBARRIER 0x00000800 #define F2FS_MOUNT_FASTBOOT 0x00001000 -#define F2FS_MOUNT_EXTENT_CACHE 0x00002000 +#define F2FS_MOUNT_READ_EXTENT_CACHE 0x00002000 #define F2FS_MOUNT_DATA_FLUSH 0x00008000 #define F2FS_MOUNT_FAULT_INJECTION 0x00010000 #define F2FS_MOUNT_USRQUOTA 0x00080000 @@ -600,7 +600,7 @@ enum { #define F2FS_MIN_EXTENT_LEN 64 /* minimum extent length */ /* number of extent info in extent cache we try to shrink */ -#define EXTENT_CACHE_SHRINK_NUMBER 128 +#define READ_EXTENT_CACHE_SHRINK_NUMBER 128 #define RECOVERY_MAX_RA_BLOCKS BIO_MAX_VECS #define RECOVERY_MIN_RA_BLOCKS 1 @@ -830,7 +830,7 @@ struct 
f2fs_inode_info { loff_t original_i_size; /* original i_size before atomic write */ }; -static inline void get_extent_info(struct extent_info *ext, +static inline void get_read_extent_info(struct extent_info *ext, struct f2fs_extent *i_ext) { ext->fofs = le32_to_cpu(i_ext->fofs); @@ -838,7 +838,7 @@ static inline void get_extent_info(struct extent_info *ext, ext->len = le32_to_cpu(i_ext->len); } -static inline void set_raw_extent(struct extent_info *ext, +static inline void set_raw_read_extent(struct extent_info *ext, struct f2fs_extent *i_ext) { i_ext->fofs = cpu_to_le32(ext->fofs); @@ -4407,7 +4407,7 @@ static inline bool f2fs_may_extent_tree(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - if (!test_opt(sbi, EXTENT_CACHE) || + if (!test_opt(sbi, READ_EXTENT_CACHE) || is_inode_flag_set(inode, FI_NO_EXTENT) || (is_inode_flag_set(inode, FI_COMPRESSED_FILE) && !f2fs_sb_has_readonly(sbi))) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 577f109b4e1d..2c705c60019b 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -629,7 +629,7 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page) if (et) { read_lock(&et->lock); - set_raw_extent(&et->largest, &ri->i_ext); + set_raw_read_extent(&et->largest, &ri->i_ext); read_unlock(&et->lock); } else { memset(&ri->i_ext, 0, sizeof(ri->i_ext)); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index b9ee5a1176a0..84b147966080 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -85,7 +85,7 @@ bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type) sizeof(struct ino_entry); mem_size >>= PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); - } else if (type == EXTENT_CACHE) { + } else if (type == READ_EXTENT_CACHE) { mem_size = (atomic_read(&sbi->total_ext_tree) * sizeof(struct extent_tree) + atomic_read(&sbi->total_ext_node) * diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 3c09cae058b0..0aa48704c77a 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -146,7 +146,7 @@ enum mem_type { NAT_ENTRIES, /* indicates the cached nat entry */ DIRTY_DENTS, /* indicates dirty dentry pages */ INO_ENTRIES, /* indicates inode entries */ - EXTENT_CACHE, /* indicates extent cache */ + READ_EXTENT_CACHE, /* indicates read extent cache */ DISCARD_CACHE, /* indicates memory of cached discard cmds */ COMPRESS_PAGE, /* indicates memory of cached compressed pages */ BASE_CHECK, /* check kernel status */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 9486ca49ecb1..51de358bc452 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -449,8 +449,8 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg) return; /* try to shrink extent cache when there is no enough memory */ - if (!f2fs_available_free_memory(sbi, EXTENT_CACHE)) - f2fs_shrink_extent_tree(sbi, EXTENT_CACHE_SHRINK_NUMBER); + if (!f2fs_available_free_memory(sbi, READ_EXTENT_CACHE)) + f2fs_shrink_extent_tree(sbi, READ_EXTENT_CACHE_SHRINK_NUMBER); /* check the # of cached NAT entries */ if (!f2fs_available_free_memory(sbi, NAT_ENTRIES)) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 79bf1faf4161..412c2e7352c0 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -814,10 +814,10 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) set_opt(sbi, FASTBOOT); break; case Opt_extent_cache: - set_opt(sbi, EXTENT_CACHE); + set_opt(sbi, READ_EXTENT_CACHE); break; case Opt_noextent_cache: - clear_opt(sbi, EXTENT_CACHE); + clear_opt(sbi, READ_EXTENT_CACHE); break; case Opt_noinline_data: clear_opt(sbi, 
INLINE_DATA); @@ -1954,7 +1954,7 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",barrier"); if (test_opt(sbi, FASTBOOT)) seq_puts(seq, ",fastboot"); - if (test_opt(sbi, EXTENT_CACHE)) + if (test_opt(sbi, READ_EXTENT_CACHE)) seq_puts(seq, ",extent_cache"); else seq_puts(seq, ",noextent_cache"); @@ -2076,7 +2076,7 @@ static void default_options(struct f2fs_sb_info *sbi) set_opt(sbi, INLINE_XATTR); set_opt(sbi, INLINE_DATA); set_opt(sbi, INLINE_DENTRY); - set_opt(sbi, EXTENT_CACHE); + set_opt(sbi, READ_EXTENT_CACHE); set_opt(sbi, NOHEAP); clear_opt(sbi, DISABLE_CHECKPOINT); set_opt(sbi, MERGE_CHECKPOINT); @@ -2218,7 +2218,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) bool need_restart_ckpt = false, need_stop_ckpt = false; bool need_restart_flush = false, need_stop_flush = false; bool need_restart_discard = false, need_stop_discard = false; - bool no_extent_cache = !test_opt(sbi, EXTENT_CACHE); + bool no_read_extent_cache = !test_opt(sbi, READ_EXTENT_CACHE); bool enable_checkpoint = !test_opt(sbi, DISABLE_CHECKPOINT); bool no_io_align = !F2FS_IO_ALIGNED(sbi); bool no_atgc = !test_opt(sbi, ATGC); @@ -2308,7 +2308,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) } /* disallow enable/disable extent_cache dynamically */ - if (no_extent_cache == !!test_opt(sbi, EXTENT_CACHE)) { + if (no_read_extent_cache == !!test_opt(sbi, READ_EXTENT_CACHE)) { err = -EINVAL; f2fs_warn(sbi, "switch extent_cache option is not allowed"); goto restore_opts; From a69d59cdb22c34b0cc84439c22b942c0874aaae6 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 30 Nov 2022 09:44:58 -0800 Subject: [PATCH 325/457] f2fs: move internal functions into extent_cache.c No functional change. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 88 +++++++++++++++++++++++++++++++++++++----- fs/f2fs/f2fs.h | 69 +-------------------------------- 2 files changed, 81 insertions(+), 76 deletions(-) diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 8cd87aee0292..2a8e31e6d518 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -15,6 +15,77 @@ #include "node.h" #include +static void __set_extent_info(struct extent_info *ei, + unsigned int fofs, unsigned int len, + block_t blk, bool keep_clen) +{ + ei->fofs = fofs; + ei->blk = blk; + ei->len = len; + + if (keep_clen) + return; + +#ifdef CONFIG_F2FS_FS_COMPRESSION + ei->c_len = 0; +#endif +} + +static bool f2fs_may_extent_tree(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + /* + * for recovered files during mount do not create extents + * if shrinker is not registered. 
+ */ + if (list_empty(&sbi->s_list)) + return false; + + if (!test_opt(sbi, READ_EXTENT_CACHE) || + is_inode_flag_set(inode, FI_NO_EXTENT) || + (is_inode_flag_set(inode, FI_COMPRESSED_FILE) && + !f2fs_sb_has_readonly(sbi))) + return false; + + return S_ISREG(inode->i_mode); +} + +static void __try_update_largest_extent(struct extent_tree *et, + struct extent_node *en) +{ + if (en->ei.len <= et->largest.len) + return; + + et->largest = en->ei; + et->largest_updated = true; +} + +static bool __is_extent_mergeable(struct extent_info *back, + struct extent_info *front) +{ +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (back->c_len && back->len != back->c_len) + return false; + if (front->c_len && front->len != front->c_len) + return false; +#endif + return (back->fofs + back->len == front->fofs && + back->blk + back->len == front->blk); +} + +static bool __is_back_mergeable(struct extent_info *cur, + struct extent_info *back) +{ + return __is_extent_mergeable(back, cur); +} + +static bool __is_front_mergeable(struct extent_info *cur, + struct extent_info *front) +{ + return __is_extent_mergeable(cur, front); +} + static struct rb_entry *__lookup_rb_tree_fast(struct rb_entry *cached_re, unsigned int ofs) { @@ -591,16 +662,16 @@ static void f2fs_update_extent_tree_range(struct inode *inode, if (end < org_end && org_end - end >= F2FS_MIN_EXTENT_LEN) { if (parts) { - set_extent_info(&ei, end, - end - dei.fofs + dei.blk, - org_end - end); + __set_extent_info(&ei, + end, org_end - end, + end - dei.fofs + dei.blk, false); en1 = __insert_extent_tree(sbi, et, &ei, NULL, NULL, true); next_en = en1; } else { - en->ei.fofs = end; - en->ei.blk += end - dei.fofs; - en->ei.len -= end - dei.fofs; + __set_extent_info(&en->ei, + end, en->ei.len - (end - dei.fofs), + en->ei.blk + (end - dei.fofs), true); next_en = en; } parts++; @@ -632,8 +703,7 @@ static void f2fs_update_extent_tree_range(struct inode *inode, /* 3. 
update extent in extent cache */ if (blkaddr) { - - set_extent_info(&ei, fofs, blkaddr, len); + __set_extent_info(&ei, fofs, len, blkaddr, false); if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en)) __insert_extent_tree(sbi, et, &ei, insert_p, insert_parent, leftmost); @@ -692,7 +762,7 @@ void f2fs_update_extent_tree_range_compressed(struct inode *inode, if (en) goto unlock_out; - set_extent_info(&ei, fofs, blkaddr, llen); + __set_extent_info(&ei, fofs, llen, blkaddr, true); ei.c_len = c_len; if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en)) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 1c39f8145b61..04fdf010bb77 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -621,7 +621,7 @@ struct rb_entry { struct extent_info { unsigned int fofs; /* start offset in a file */ unsigned int len; /* length of the extent */ - u32 blk; /* start block address of the extent */ + block_t blk; /* start block address of the extent */ #ifdef CONFIG_F2FS_FS_COMPRESSION unsigned int c_len; /* physical extent length of compressed blocks */ #endif @@ -846,17 +846,6 @@ static inline void set_raw_read_extent(struct extent_info *ext, i_ext->len = cpu_to_le32(ext->len); } -static inline void set_extent_info(struct extent_info *ei, unsigned int fofs, - u32 blk, unsigned int len) -{ - ei->fofs = fofs; - ei->blk = blk; - ei->len = len; -#ifdef CONFIG_F2FS_FS_COMPRESSION - ei->c_len = 0; -#endif -} - static inline bool __is_discard_mergeable(struct discard_info *back, struct discard_info *front, unsigned int max_len) { @@ -876,41 +865,6 @@ static inline bool __is_discard_front_mergeable(struct discard_info *cur, return __is_discard_mergeable(cur, front, max_len); } -static inline bool __is_extent_mergeable(struct extent_info *back, - struct extent_info *front) -{ -#ifdef CONFIG_F2FS_FS_COMPRESSION - if (back->c_len && back->len != back->c_len) - return false; - if (front->c_len && front->len != front->c_len) - return false; -#endif - return (back->fofs + back->len == front->fofs && - back->blk + back->len == front->blk); -} - -static inline bool __is_back_mergeable(struct extent_info *cur, - struct extent_info *back) -{ - return __is_extent_mergeable(back, cur); -} - -static inline bool __is_front_mergeable(struct extent_info *cur, - struct extent_info *front) -{ - return __is_extent_mergeable(cur, front); -} - -extern void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync); -static inline void __try_update_largest_extent(struct extent_tree *et, - struct extent_node *en) -{ - if (en->ei.len > et->largest.len) { - et->largest = en->ei; - et->largest_updated = true; - } -} - /* * For free nid management */ @@ -2581,6 +2535,7 @@ static inline block_t __start_sum_addr(struct f2fs_sb_info *sbi) return le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_start_sum); } +extern void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync); static inline int inc_valid_node_count(struct f2fs_sb_info *sbi, struct inode *inode, bool is_inode) { @@ -4403,26 +4358,6 @@ F2FS_FEATURE_FUNCS(casefold, CASEFOLD); F2FS_FEATURE_FUNCS(compression, COMPRESSION); F2FS_FEATURE_FUNCS(readonly, RO); -static inline bool f2fs_may_extent_tree(struct inode *inode) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - - if (!test_opt(sbi, READ_EXTENT_CACHE) || - is_inode_flag_set(inode, FI_NO_EXTENT) || - (is_inode_flag_set(inode, FI_COMPRESSED_FILE) && - !f2fs_sb_has_readonly(sbi))) - return false; - - /* - * for recovered files during mount do not create extents - * if shrinker is not registered. 
- */ - if (list_empty(&sbi->s_list)) - return false; - - return S_ISREG(inode->i_mode); -} - #ifdef CONFIG_BLK_DEV_ZONED static inline bool f2fs_blkz_is_seq(struct f2fs_sb_info *sbi, int devi, block_t blkaddr) From 5dee55abf2fbfcea515b348a9253ad0833facac4 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 30 Nov 2022 10:01:18 -0800 Subject: [PATCH 326/457] f2fs: remove unnecessary __init_extent_tree Added into the caller. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 2a8e31e6d518..c6810347e205 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -386,21 +386,6 @@ static struct extent_tree *__grab_extent_tree(struct inode *inode) return et; } -static struct extent_node *__init_extent_tree(struct f2fs_sb_info *sbi, - struct extent_tree *et, struct extent_info *ei) -{ - struct rb_node **p = &et->root.rb_root.rb_node; - struct extent_node *en; - - en = __attach_extent_node(sbi, et, ei, NULL, p, true); - if (!en) - return NULL; - - et->largest = en->ei; - et->cached_en = en; - return en; -} - static unsigned int __free_extent_tree(struct f2fs_sb_info *sbi, struct extent_tree *et) { @@ -460,8 +445,12 @@ static void __f2fs_init_extent_tree(struct inode *inode, struct page *ipage) if (atomic_read(&et->node_cnt)) goto out; - en = __init_extent_tree(sbi, et, &ei); + en = __attach_extent_node(sbi, et, &ei, NULL, + &et->root.rb_root.rb_node, true); if (en) { + et->largest = en->ei; + et->cached_en = en; + spin_lock(&sbi->extent_lock); list_add_tail(&en->list, &sbi->extent_list); spin_unlock(&sbi->extent_lock); From 62488df1b92d4730f2270c73fcc26c72b56e2901 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 30 Nov 2022 09:26:29 -0800 Subject: [PATCH 327/457] f2fs: refactor extent_cache to support for read and more This patch prepares extent_cache to be ready for addition. 
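
The core of the change: the read extent cache state that previously lived as individual fields in struct f2fs_sb_info (extent_tree_root, extent_list, total_ext_tree and friends) is gathered into a per-type struct extent_tree_info, and the superblock info now holds an array of those indexed by the new enum extent_type. Only EX_READ is populated at this point, so a few paths still assert type == EX_READ; further cache types can later be added by extending the enum instead of duplicating fields and locks. As a rough illustration only, the following minimal standalone C sketch shows that layout. The names enum extent_type, struct extent_tree_info, EX_READ and NR_EXTENT_CACHES mirror the patch, while struct sb_info, get_eti() and the plain long counters are simplified stand-ins; the kernel versions use radix trees, rb-trees, atomics and per-tree locks.

    #include <stdio.h>

    /* one slot per cache type; this patch only wires up EX_READ */
    enum extent_type { EX_READ, NR_EXTENT_CACHES };

    /* per-type bookkeeping that used to be loose fields in f2fs_sb_info */
    struct extent_tree_info {
            long total_ext_tree;    /* # of per-inode extent trees */
            long total_ext_node;    /* # of cached extent nodes */
    };

    /* stand-in for f2fs_sb_info: one extent_tree_info per cache type */
    struct sb_info {
            struct extent_tree_info extent_tree[NR_EXTENT_CACHES];
    };

    /* callers pass the cache type and get the matching slot back */
    static struct extent_tree_info *get_eti(struct sb_info *sbi,
                                            enum extent_type type)
    {
            return &sbi->extent_tree[type];
    }

    int main(void)
    {
            struct sb_info sbi = { 0 };

            get_eti(&sbi, EX_READ)->total_ext_node++;
            printf("read extent nodes cached: %ld\n",
                   get_eti(&sbi, EX_READ)->total_ext_node);
            return 0;
    }

That single array index is what leaves room for the addition the message above refers to: a new cache type only needs a new enum value plus its own lookup and update policy, while the shrinker, debug and stat plumbing stay shared.
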
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 20 +- fs/f2fs/debug.c | 65 +++-- fs/f2fs/extent_cache.c | 465 +++++++++++++++++++++--------------- fs/f2fs/f2fs.h | 119 +++++---- fs/f2fs/file.c | 8 +- fs/f2fs/gc.c | 4 +- fs/f2fs/inode.c | 6 +- fs/f2fs/node.c | 8 +- fs/f2fs/segment.c | 3 +- fs/f2fs/shrinker.c | 19 +- include/trace/events/f2fs.h | 62 +++-- 11 files changed, 471 insertions(+), 308 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 35c19248b1e2..75abd450730b 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1126,7 +1126,7 @@ void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr) { dn->data_blkaddr = blkaddr; f2fs_set_data_blkaddr(dn); - f2fs_update_extent_cache(dn); + f2fs_update_read_extent_cache(dn); } /* dn->ofs_in_node will be returned with up-to-date last block pointer */ @@ -1195,7 +1195,7 @@ int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index) struct extent_info ei = {0, }; struct inode *inode = dn->inode; - if (f2fs_lookup_extent_cache(inode, index, &ei)) { + if (f2fs_lookup_read_extent_cache(inode, index, &ei)) { dn->data_blkaddr = ei.blk + index - ei.fofs; return 0; } @@ -1217,7 +1217,7 @@ struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, if (!page) return ERR_PTR(-ENOMEM); - if (f2fs_lookup_extent_cache(inode, index, &ei)) { + if (f2fs_lookup_read_extent_cache(inode, index, &ei)) { dn.data_blkaddr = ei.blk + index - ei.fofs; if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), dn.data_blkaddr, DATA_GENERIC_ENHANCE_READ)) { @@ -1485,7 +1485,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, pgofs = (pgoff_t)map->m_lblk; end = pgofs + maxblocks; - if (!create && f2fs_lookup_extent_cache(inode, pgofs, &ei)) { + if (!create && f2fs_lookup_read_extent_cache(inode, pgofs, &ei)) { if (f2fs_lfs_mode(sbi) && flag == F2FS_GET_BLOCK_DIO && map->m_may_create) goto next_dnode; @@ -1695,7 +1695,7 @@ skip: if (map->m_flags & F2FS_MAP_MAPPED) { unsigned int ofs = start_pgofs - map->m_lblk; - f2fs_update_extent_cache_range(&dn, + f2fs_update_read_extent_cache_range(&dn, start_pgofs, map->m_pblk + ofs, map->m_len - ofs); } @@ -1740,7 +1740,7 @@ sync_out: if (map->m_flags & F2FS_MAP_MAPPED) { unsigned int ofs = start_pgofs - map->m_lblk; - f2fs_update_extent_cache_range(&dn, + f2fs_update_read_extent_cache_range(&dn, start_pgofs, map->m_pblk + ofs, map->m_len - ofs); } @@ -2201,7 +2201,7 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, if (f2fs_cluster_is_empty(cc)) goto out; - if (f2fs_lookup_extent_cache(inode, start_idx, &ei)) + if (f2fs_lookup_read_extent_cache(inode, start_idx, &ei)) from_dnode = false; if (!from_dnode) @@ -2635,7 +2635,7 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio) set_new_dnode(&dn, inode, NULL, NULL, 0); if (need_inplace_update(fio) && - f2fs_lookup_extent_cache(inode, page->index, &ei)) { + f2fs_lookup_read_extent_cache(inode, page->index, &ei)) { fio->old_blkaddr = ei.blk + page->index - ei.fofs; if (!f2fs_is_valid_blkaddr(fio->sbi, fio->old_blkaddr, @@ -3359,7 +3359,7 @@ restart: } else if (locked) { err = f2fs_get_block(&dn, index); } else { - if (f2fs_lookup_extent_cache(inode, index, &ei)) { + if (f2fs_lookup_read_extent_cache(inode, index, &ei)) { dn.data_blkaddr = ei.blk + index - ei.fofs; } else { /* hole case */ @@ -3400,7 +3400,7 @@ static int __find_data_block(struct inode *inode, pgoff_t index, set_new_dnode(&dn, inode, ipage, ipage, 0); - if (f2fs_lookup_extent_cache(inode, index, &ei)) { + if 
(f2fs_lookup_read_extent_cache(inode, index, &ei)) { dn.data_blkaddr = ei.blk + index - ei.fofs; } else { /* hole case */ diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index a216dcdf6941..a9baa121d829 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -72,15 +72,23 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->main_area_zones = si->main_area_sections / le32_to_cpu(raw_super->secs_per_zone); - /* validation check of the segment numbers */ + /* general extent cache stats */ + for (i = 0; i < NR_EXTENT_CACHES; i++) { + struct extent_tree_info *eti = &sbi->extent_tree[i]; + + si->hit_cached[i] = atomic64_read(&sbi->read_hit_cached[i]); + si->hit_rbtree[i] = atomic64_read(&sbi->read_hit_rbtree[i]); + si->total_ext[i] = atomic64_read(&sbi->total_hit_ext[i]); + si->hit_total[i] = si->hit_cached[i] + si->hit_rbtree[i]; + si->ext_tree[i] = atomic_read(&eti->total_ext_tree); + si->zombie_tree[i] = atomic_read(&eti->total_zombie_tree); + si->ext_node[i] = atomic_read(&eti->total_ext_node); + } + /* read extent_cache only */ si->hit_largest = atomic64_read(&sbi->read_hit_largest); - si->hit_cached = atomic64_read(&sbi->read_hit_cached); - si->hit_rbtree = atomic64_read(&sbi->read_hit_rbtree); - si->hit_total = si->hit_largest + si->hit_cached + si->hit_rbtree; - si->total_ext = atomic64_read(&sbi->total_hit_ext); - si->ext_tree = atomic_read(&sbi->total_ext_tree); - si->zombie_tree = atomic_read(&sbi->total_zombie_tree); - si->ext_node = atomic_read(&sbi->total_ext_node); + si->hit_total[EX_READ] += si->hit_largest; + + /* validation check of the segment numbers */ si->ndirty_node = get_pages(sbi, F2FS_DIRTY_NODES); si->ndirty_dent = get_pages(sbi, F2FS_DIRTY_DENTS); si->ndirty_meta = get_pages(sbi, F2FS_DIRTY_META); @@ -294,10 +302,16 @@ get_cache: sizeof(struct nat_entry_set); for (i = 0; i < MAX_INO_ENTRY; i++) si->cache_mem += sbi->im[i].ino_num * sizeof(struct ino_entry); - si->cache_mem += atomic_read(&sbi->total_ext_tree) * + + for (i = 0; i < NR_EXTENT_CACHES; i++) { + struct extent_tree_info *eti = &sbi->extent_tree[i]; + + si->ext_mem[i] = atomic_read(&eti->total_ext_tree) * sizeof(struct extent_tree); - si->cache_mem += atomic_read(&sbi->total_ext_node) * + si->ext_mem[i] += atomic_read(&eti->total_ext_node) * sizeof(struct extent_node); + si->cache_mem += si->ext_mem[i]; + } si->page_mem = 0; if (sbi->node_inode) { @@ -490,16 +504,18 @@ static int stat_show(struct seq_file *s, void *v) si->bg_node_blks); seq_printf(s, "BG skip : IO: %u, Other: %u\n", si->io_skip_bggc, si->other_skip_bggc); - seq_puts(s, "\nExtent Cache:\n"); + seq_puts(s, "\nExtent Cache (Read):\n"); seq_printf(s, " - Hit Count: L1-1:%llu L1-2:%llu L2:%llu\n", - si->hit_largest, si->hit_cached, - si->hit_rbtree); + si->hit_largest, si->hit_cached[EX_READ], + si->hit_rbtree[EX_READ]); seq_printf(s, " - Hit Ratio: %llu%% (%llu / %llu)\n", - !si->total_ext ? 0 : - div64_u64(si->hit_total * 100, si->total_ext), - si->hit_total, si->total_ext); + !si->total_ext[EX_READ] ? 
0 : + div64_u64(si->hit_total[EX_READ] * 100, + si->total_ext[EX_READ]), + si->hit_total[EX_READ], si->total_ext[EX_READ]); seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n", - si->ext_tree, si->zombie_tree, si->ext_node); + si->ext_tree[EX_READ], si->zombie_tree[EX_READ], + si->ext_node[EX_READ]); seq_puts(s, "\nBalancing F2FS Async:\n"); seq_printf(s, " - DIO (R: %4d, W: %4d)\n", si->nr_dio_read, si->nr_dio_write); @@ -566,8 +582,10 @@ static int stat_show(struct seq_file *s, void *v) (si->base_mem + si->cache_mem + si->page_mem) >> 10); seq_printf(s, " - static: %llu KB\n", si->base_mem >> 10); - seq_printf(s, " - cached: %llu KB\n", + seq_printf(s, " - cached all: %llu KB\n", si->cache_mem >> 10); + seq_printf(s, " - read extent cache: %llu KB\n", + si->ext_mem[EX_READ] >> 10); seq_printf(s, " - paged : %llu KB\n", si->page_mem >> 10); } @@ -600,10 +618,15 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) si->sbi = sbi; sbi->stat_info = si; - atomic64_set(&sbi->total_hit_ext, 0); - atomic64_set(&sbi->read_hit_rbtree, 0); + /* general extent cache stats */ + for (i = 0; i < NR_EXTENT_CACHES; i++) { + atomic64_set(&sbi->total_hit_ext[i], 0); + atomic64_set(&sbi->read_hit_rbtree[i], 0); + atomic64_set(&sbi->read_hit_cached[i], 0); + } + + /* read extent_cache only */ atomic64_set(&sbi->read_hit_largest, 0); - atomic64_set(&sbi->read_hit_cached, 0); atomic_set(&sbi->inline_xattr, 0); atomic_set(&sbi->inline_inode, 0); diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index c6810347e205..654a14ab8977 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -17,21 +17,37 @@ static void __set_extent_info(struct extent_info *ei, unsigned int fofs, unsigned int len, - block_t blk, bool keep_clen) + block_t blk, bool keep_clen, + enum extent_type type) { ei->fofs = fofs; - ei->blk = blk; ei->len = len; - if (keep_clen) - return; - + if (type == EX_READ) { + ei->blk = blk; + if (keep_clen) + return; #ifdef CONFIG_F2FS_FS_COMPRESSION - ei->c_len = 0; + ei->c_len = 0; #endif + } } -static bool f2fs_may_extent_tree(struct inode *inode) +static bool __may_read_extent_tree(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + if (!test_opt(sbi, READ_EXTENT_CACHE)) + return false; + if (is_inode_flag_set(inode, FI_NO_EXTENT)) + return false; + if (is_inode_flag_set(inode, FI_COMPRESSED_FILE) && + !f2fs_sb_has_readonly(sbi)) + return false; + return S_ISREG(inode->i_mode); +} + +static bool __may_extent_tree(struct inode *inode, enum extent_type type) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -42,18 +58,16 @@ static bool f2fs_may_extent_tree(struct inode *inode) if (list_empty(&sbi->s_list)) return false; - if (!test_opt(sbi, READ_EXTENT_CACHE) || - is_inode_flag_set(inode, FI_NO_EXTENT) || - (is_inode_flag_set(inode, FI_COMPRESSED_FILE) && - !f2fs_sb_has_readonly(sbi))) - return false; - - return S_ISREG(inode->i_mode); + if (type == EX_READ) + return __may_read_extent_tree(inode); + return false; } static void __try_update_largest_extent(struct extent_tree *et, struct extent_node *en) { + if (et->type != EX_READ) + return; if (en->ei.len <= et->largest.len) return; @@ -62,28 +76,31 @@ static void __try_update_largest_extent(struct extent_tree *et, } static bool __is_extent_mergeable(struct extent_info *back, - struct extent_info *front) + struct extent_info *front, enum extent_type type) { + if (type == EX_READ) { #ifdef CONFIG_F2FS_FS_COMPRESSION - if (back->c_len && back->len != back->c_len) - return false; - if (front->c_len && 
front->len != front->c_len) - return false; + if (back->c_len && back->len != back->c_len) + return false; + if (front->c_len && front->len != front->c_len) + return false; #endif - return (back->fofs + back->len == front->fofs && - back->blk + back->len == front->blk); + return (back->fofs + back->len == front->fofs && + back->blk + back->len == front->blk); + } + return false; } static bool __is_back_mergeable(struct extent_info *cur, - struct extent_info *back) + struct extent_info *back, enum extent_type type) { - return __is_extent_mergeable(back, cur); + return __is_extent_mergeable(back, cur, type); } static bool __is_front_mergeable(struct extent_info *cur, - struct extent_info *front) + struct extent_info *front, enum extent_type type) { - return __is_extent_mergeable(cur, front); + return __is_extent_mergeable(cur, front, type); } static struct rb_entry *__lookup_rb_tree_fast(struct rb_entry *cached_re, @@ -308,6 +325,7 @@ static struct extent_node *__attach_extent_node(struct f2fs_sb_info *sbi, struct rb_node *parent, struct rb_node **p, bool leftmost) { + struct extent_tree_info *eti = &sbi->extent_tree[et->type]; struct extent_node *en; en = f2fs_kmem_cache_alloc(extent_node_slab, GFP_ATOMIC, false, sbi); @@ -321,16 +339,18 @@ static struct extent_node *__attach_extent_node(struct f2fs_sb_info *sbi, rb_link_node(&en->rb_node, parent, p); rb_insert_color_cached(&en->rb_node, &et->root, leftmost); atomic_inc(&et->node_cnt); - atomic_inc(&sbi->total_ext_node); + atomic_inc(&eti->total_ext_node); return en; } static void __detach_extent_node(struct f2fs_sb_info *sbi, struct extent_tree *et, struct extent_node *en) { + struct extent_tree_info *eti = &sbi->extent_tree[et->type]; + rb_erase_cached(&en->rb_node, &et->root); atomic_dec(&et->node_cnt); - atomic_dec(&sbi->total_ext_node); + atomic_dec(&eti->total_ext_node); if (et->cached_en == en) et->cached_en = NULL; @@ -346,42 +366,47 @@ static void __detach_extent_node(struct f2fs_sb_info *sbi, static void __release_extent_node(struct f2fs_sb_info *sbi, struct extent_tree *et, struct extent_node *en) { - spin_lock(&sbi->extent_lock); + struct extent_tree_info *eti = &sbi->extent_tree[et->type]; + + spin_lock(&eti->extent_lock); f2fs_bug_on(sbi, list_empty(&en->list)); list_del_init(&en->list); - spin_unlock(&sbi->extent_lock); + spin_unlock(&eti->extent_lock); __detach_extent_node(sbi, et, en); } -static struct extent_tree *__grab_extent_tree(struct inode *inode) +static struct extent_tree *__grab_extent_tree(struct inode *inode, + enum extent_type type) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree_info *eti = &sbi->extent_tree[type]; struct extent_tree *et; nid_t ino = inode->i_ino; - mutex_lock(&sbi->extent_tree_lock); - et = radix_tree_lookup(&sbi->extent_tree_root, ino); + mutex_lock(&eti->extent_tree_lock); + et = radix_tree_lookup(&eti->extent_tree_root, ino); if (!et) { et = f2fs_kmem_cache_alloc(extent_tree_slab, GFP_NOFS, true, NULL); - f2fs_radix_tree_insert(&sbi->extent_tree_root, ino, et); + f2fs_radix_tree_insert(&eti->extent_tree_root, ino, et); memset(et, 0, sizeof(struct extent_tree)); et->ino = ino; + et->type = type; et->root = RB_ROOT_CACHED; et->cached_en = NULL; rwlock_init(&et->lock); INIT_LIST_HEAD(&et->list); atomic_set(&et->node_cnt, 0); - atomic_inc(&sbi->total_ext_tree); + atomic_inc(&eti->total_ext_tree); } else { - atomic_dec(&sbi->total_zombie_tree); + atomic_dec(&eti->total_zombie_tree); list_del_init(&et->list); } - mutex_unlock(&sbi->extent_tree_lock); + 
mutex_unlock(&eti->extent_tree_lock); /* never died until evict_inode */ - F2FS_I(inode)->extent_tree = et; + F2FS_I(inode)->extent_tree[type] = et; return et; } @@ -415,35 +440,38 @@ static void __drop_largest_extent(struct extent_tree *et, } /* return true, if inode page is changed */ -static void __f2fs_init_extent_tree(struct inode *inode, struct page *ipage) +static void __f2fs_init_extent_tree(struct inode *inode, struct page *ipage, + enum extent_type type) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree_info *eti = &sbi->extent_tree[type]; struct f2fs_extent *i_ext = ipage ? &F2FS_INODE(ipage)->i_ext : NULL; struct extent_tree *et; struct extent_node *en; struct extent_info ei; - if (!f2fs_may_extent_tree(inode)) { - /* drop largest extent */ - if (i_ext && i_ext->len) { + if (!__may_extent_tree(inode, type)) { + /* drop largest read extent */ + if (type == EX_READ && i_ext && i_ext->len) { f2fs_wait_on_page_writeback(ipage, NODE, true, true); i_ext->len = 0; set_page_dirty(ipage); - return; } - return; + goto out; } - et = __grab_extent_tree(inode); + et = __grab_extent_tree(inode, type); if (!i_ext || !i_ext->len) - return; + goto out; + + BUG_ON(type != EX_READ); get_read_extent_info(&ei, i_ext); write_lock(&et->lock); if (atomic_read(&et->node_cnt)) - goto out; + goto unlock_out; en = __attach_extent_node(sbi, et, &ei, NULL, &et->root.rb_root.rb_node, true); @@ -451,37 +479,40 @@ static void __f2fs_init_extent_tree(struct inode *inode, struct page *ipage) et->largest = en->ei; et->cached_en = en; - spin_lock(&sbi->extent_lock); - list_add_tail(&en->list, &sbi->extent_list); - spin_unlock(&sbi->extent_lock); + spin_lock(&eti->extent_lock); + list_add_tail(&en->list, &eti->extent_list); + spin_unlock(&eti->extent_lock); } -out: +unlock_out: write_unlock(&et->lock); +out: + if (type == EX_READ && !F2FS_I(inode)->extent_tree[EX_READ]) + set_inode_flag(inode, FI_NO_EXTENT); } void f2fs_init_extent_tree(struct inode *inode, struct page *ipage) { - __f2fs_init_extent_tree(inode, ipage); - - if (!F2FS_I(inode)->extent_tree) - set_inode_flag(inode, FI_NO_EXTENT); + /* initialize read cache */ + __f2fs_init_extent_tree(inode, ipage, EX_READ); } -static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs, - struct extent_info *ei) +static bool __lookup_extent_tree(struct inode *inode, pgoff_t pgofs, + struct extent_info *ei, enum extent_type type) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct extent_tree *et = F2FS_I(inode)->extent_tree; + struct extent_tree_info *eti = &sbi->extent_tree[type]; + struct extent_tree *et = F2FS_I(inode)->extent_tree[type]; struct extent_node *en; bool ret = false; f2fs_bug_on(sbi, !et); - trace_f2fs_lookup_extent_tree_start(inode, pgofs); + trace_f2fs_lookup_extent_tree_start(inode, pgofs, type); read_lock(&et->lock); - if (et->largest.fofs <= pgofs && + if (type == EX_READ && + et->largest.fofs <= pgofs && et->largest.fofs + et->largest.len > pgofs) { *ei = et->largest; ret = true; @@ -495,23 +526,24 @@ static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs, goto out; if (en == et->cached_en) - stat_inc_cached_node_hit(sbi); + stat_inc_cached_node_hit(sbi, type); else - stat_inc_rbtree_node_hit(sbi); + stat_inc_rbtree_node_hit(sbi, type); *ei = en->ei; - spin_lock(&sbi->extent_lock); + spin_lock(&eti->extent_lock); if (!list_empty(&en->list)) { - list_move_tail(&en->list, &sbi->extent_list); + list_move_tail(&en->list, &eti->extent_list); et->cached_en = en; } - spin_unlock(&sbi->extent_lock); + 
spin_unlock(&eti->extent_lock); ret = true; out: - stat_inc_total_hit(sbi); + stat_inc_total_hit(sbi, type); read_unlock(&et->lock); - trace_f2fs_lookup_extent_tree_end(inode, pgofs, ei); + if (type == EX_READ) + trace_f2fs_lookup_read_extent_tree_end(inode, pgofs, ei); return ret; } @@ -520,18 +552,20 @@ static struct extent_node *__try_merge_extent_node(struct f2fs_sb_info *sbi, struct extent_node *prev_ex, struct extent_node *next_ex) { + struct extent_tree_info *eti = &sbi->extent_tree[et->type]; struct extent_node *en = NULL; - if (prev_ex && __is_back_mergeable(ei, &prev_ex->ei)) { + if (prev_ex && __is_back_mergeable(ei, &prev_ex->ei, et->type)) { prev_ex->ei.len += ei->len; ei = &prev_ex->ei; en = prev_ex; } - if (next_ex && __is_front_mergeable(ei, &next_ex->ei)) { + if (next_ex && __is_front_mergeable(ei, &next_ex->ei, et->type)) { next_ex->ei.fofs = ei->fofs; - next_ex->ei.blk = ei->blk; next_ex->ei.len += ei->len; + if (et->type == EX_READ) + next_ex->ei.blk = ei->blk; if (en) __release_extent_node(sbi, et, prev_ex); @@ -543,12 +577,12 @@ static struct extent_node *__try_merge_extent_node(struct f2fs_sb_info *sbi, __try_update_largest_extent(et, en); - spin_lock(&sbi->extent_lock); + spin_lock(&eti->extent_lock); if (!list_empty(&en->list)) { - list_move_tail(&en->list, &sbi->extent_list); + list_move_tail(&en->list, &eti->extent_list); et->cached_en = en; } - spin_unlock(&sbi->extent_lock); + spin_unlock(&eti->extent_lock); return en; } @@ -558,6 +592,7 @@ static struct extent_node *__insert_extent_tree(struct f2fs_sb_info *sbi, struct rb_node *insert_parent, bool leftmost) { + struct extent_tree_info *eti = &sbi->extent_tree[et->type]; struct rb_node **p; struct rb_node *parent = NULL; struct extent_node *en = NULL; @@ -580,48 +615,51 @@ do_insert: __try_update_largest_extent(et, en); /* update in global extent list */ - spin_lock(&sbi->extent_lock); - list_add_tail(&en->list, &sbi->extent_list); + spin_lock(&eti->extent_lock); + list_add_tail(&en->list, &eti->extent_list); et->cached_en = en; - spin_unlock(&sbi->extent_lock); + spin_unlock(&eti->extent_lock); return en; } -static void f2fs_update_extent_tree_range(struct inode *inode, - pgoff_t fofs, block_t blkaddr, unsigned int len) +static void __update_extent_tree_range(struct inode *inode, + struct extent_info *tei, enum extent_type type) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct extent_tree *et = F2FS_I(inode)->extent_tree; + struct extent_tree *et = F2FS_I(inode)->extent_tree[type]; struct extent_node *en = NULL, *en1 = NULL; struct extent_node *prev_en = NULL, *next_en = NULL; struct extent_info ei, dei, prev; struct rb_node **insert_p = NULL, *insert_parent = NULL; + unsigned int fofs = tei->fofs, len = tei->len; unsigned int end = fofs + len; - unsigned int pos = (unsigned int)fofs; bool updated = false; bool leftmost = false; if (!et) return; - trace_f2fs_update_extent_tree_range(inode, fofs, blkaddr, len, 0); - + if (type == EX_READ) + trace_f2fs_update_read_extent_tree_range(inode, fofs, len, + tei->blk, 0); write_lock(&et->lock); - if (is_inode_flag_set(inode, FI_NO_EXTENT)) { - write_unlock(&et->lock); - return; + if (type == EX_READ) { + if (is_inode_flag_set(inode, FI_NO_EXTENT)) { + write_unlock(&et->lock); + return; + } + + prev = et->largest; + dei.len = 0; + + /* + * drop largest extent before lookup, in case it's already + * been shrunk from extent tree + */ + __drop_largest_extent(et, fofs, len); } - prev = et->largest; - dei.len = 0; - - /* - * drop largest extent before lookup, in case 
it's already - * been shrunk from extent tree - */ - __drop_largest_extent(et, fofs, len); - /* 1. lookup first extent node in range [fofs, fofs + len - 1] */ en = (struct extent_node *)f2fs_lookup_rb_tree_ret(&et->root, (struct rb_entry *)et->cached_en, fofs, @@ -641,26 +679,30 @@ static void f2fs_update_extent_tree_range(struct inode *inode, dei = en->ei; org_end = dei.fofs + dei.len; - f2fs_bug_on(sbi, pos >= org_end); + f2fs_bug_on(sbi, fofs >= org_end); - if (pos > dei.fofs && pos - dei.fofs >= F2FS_MIN_EXTENT_LEN) { - en->ei.len = pos - en->ei.fofs; + if (fofs > dei.fofs && (type != EX_READ || + fofs - dei.fofs >= F2FS_MIN_EXTENT_LEN)) { + en->ei.len = fofs - en->ei.fofs; prev_en = en; parts = 1; } - if (end < org_end && org_end - end >= F2FS_MIN_EXTENT_LEN) { + if (end < org_end && (type != EX_READ || + org_end - end >= F2FS_MIN_EXTENT_LEN)) { if (parts) { __set_extent_info(&ei, end, org_end - end, - end - dei.fofs + dei.blk, false); + end - dei.fofs + dei.blk, false, + type); en1 = __insert_extent_tree(sbi, et, &ei, NULL, NULL, true); next_en = en1; } else { __set_extent_info(&en->ei, end, en->ei.len - (end - dei.fofs), - en->ei.blk + (end - dei.fofs), true); + en->ei.blk + (end - dei.fofs), true, + type); next_en = en; } parts++; @@ -690,9 +732,11 @@ static void f2fs_update_extent_tree_range(struct inode *inode, en = next_en; } - /* 3. update extent in extent cache */ - if (blkaddr) { - __set_extent_info(&ei, fofs, len, blkaddr, false); + /* 3. update extent in read extent cache */ + BUG_ON(type != EX_READ); + + if (tei->blk) { + __set_extent_info(&ei, fofs, len, tei->blk, false, EX_READ); if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en)) __insert_extent_tree(sbi, et, &ei, insert_p, insert_parent, leftmost); @@ -722,19 +766,20 @@ static void f2fs_update_extent_tree_range(struct inode *inode, } #ifdef CONFIG_F2FS_FS_COMPRESSION -void f2fs_update_extent_tree_range_compressed(struct inode *inode, +void f2fs_update_read_extent_tree_range_compressed(struct inode *inode, pgoff_t fofs, block_t blkaddr, unsigned int llen, unsigned int c_len) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct extent_tree *et = F2FS_I(inode)->extent_tree; + struct extent_tree *et = F2FS_I(inode)->extent_tree[EX_READ]; struct extent_node *en = NULL; struct extent_node *prev_en = NULL, *next_en = NULL; struct extent_info ei; struct rb_node **insert_p = NULL, *insert_parent = NULL; bool leftmost = false; - trace_f2fs_update_extent_tree_range(inode, fofs, blkaddr, llen, c_len); + trace_f2fs_update_read_extent_tree_range(inode, fofs, llen, + blkaddr, c_len); /* it is safe here to check FI_NO_EXTENT w/o et->lock in ro image */ if (is_inode_flag_set(inode, FI_NO_EXTENT)) @@ -751,7 +796,7 @@ void f2fs_update_extent_tree_range_compressed(struct inode *inode, if (en) goto unlock_out; - __set_extent_info(&ei, fofs, llen, blkaddr, true); + __set_extent_info(&ei, fofs, llen, blkaddr, true, EX_READ); ei.c_len = c_len; if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en)) @@ -762,24 +807,43 @@ unlock_out: } #endif -unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) +static void __update_extent_cache(struct dnode_of_data *dn, enum extent_type type) { + struct extent_info ei; + + if (!__may_extent_tree(dn->inode, type)) + return; + + ei.fofs = f2fs_start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) + + dn->ofs_in_node; + ei.len = 1; + + if (type == EX_READ) { + if (dn->data_blkaddr == NEW_ADDR) + ei.blk = NULL_ADDR; + else + ei.blk = dn->data_blkaddr; + } + 
__update_extent_tree_range(dn->inode, &ei, type); +} + +static unsigned int __shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink, + enum extent_type type) +{ + struct extent_tree_info *eti = &sbi->extent_tree[type]; struct extent_tree *et, *next; struct extent_node *en; unsigned int node_cnt = 0, tree_cnt = 0; int remained; - if (!test_opt(sbi, READ_EXTENT_CACHE)) - return 0; - - if (!atomic_read(&sbi->total_zombie_tree)) + if (!atomic_read(&eti->total_zombie_tree)) goto free_node; - if (!mutex_trylock(&sbi->extent_tree_lock)) + if (!mutex_trylock(&eti->extent_tree_lock)) goto out; /* 1. remove unreferenced extent tree */ - list_for_each_entry_safe(et, next, &sbi->zombie_list, list) { + list_for_each_entry_safe(et, next, &eti->zombie_list, list) { if (atomic_read(&et->node_cnt)) { write_lock(&et->lock); node_cnt += __free_extent_tree(sbi, et); @@ -787,61 +851,100 @@ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) } f2fs_bug_on(sbi, atomic_read(&et->node_cnt)); list_del_init(&et->list); - radix_tree_delete(&sbi->extent_tree_root, et->ino); + radix_tree_delete(&eti->extent_tree_root, et->ino); kmem_cache_free(extent_tree_slab, et); - atomic_dec(&sbi->total_ext_tree); - atomic_dec(&sbi->total_zombie_tree); + atomic_dec(&eti->total_ext_tree); + atomic_dec(&eti->total_zombie_tree); tree_cnt++; if (node_cnt + tree_cnt >= nr_shrink) goto unlock_out; cond_resched(); } - mutex_unlock(&sbi->extent_tree_lock); + mutex_unlock(&eti->extent_tree_lock); free_node: /* 2. remove LRU extent entries */ - if (!mutex_trylock(&sbi->extent_tree_lock)) + if (!mutex_trylock(&eti->extent_tree_lock)) goto out; remained = nr_shrink - (node_cnt + tree_cnt); - spin_lock(&sbi->extent_lock); + spin_lock(&eti->extent_lock); for (; remained > 0; remained--) { - if (list_empty(&sbi->extent_list)) + if (list_empty(&eti->extent_list)) break; - en = list_first_entry(&sbi->extent_list, + en = list_first_entry(&eti->extent_list, struct extent_node, list); et = en->et; if (!write_trylock(&et->lock)) { /* refresh this extent node's position in extent list */ - list_move_tail(&en->list, &sbi->extent_list); + list_move_tail(&en->list, &eti->extent_list); continue; } list_del_init(&en->list); - spin_unlock(&sbi->extent_lock); + spin_unlock(&eti->extent_lock); __detach_extent_node(sbi, et, en); write_unlock(&et->lock); node_cnt++; - spin_lock(&sbi->extent_lock); + spin_lock(&eti->extent_lock); } - spin_unlock(&sbi->extent_lock); + spin_unlock(&eti->extent_lock); unlock_out: - mutex_unlock(&sbi->extent_tree_lock); + mutex_unlock(&eti->extent_tree_lock); out: - trace_f2fs_shrink_extent_tree(sbi, node_cnt, tree_cnt); + trace_f2fs_shrink_extent_tree(sbi, node_cnt, tree_cnt, type); return node_cnt + tree_cnt; } -unsigned int f2fs_destroy_extent_node(struct inode *inode) +/* read extent cache operations */ +bool f2fs_lookup_read_extent_cache(struct inode *inode, pgoff_t pgofs, + struct extent_info *ei) +{ + if (!__may_extent_tree(inode, EX_READ)) + return false; + + return __lookup_extent_tree(inode, pgofs, ei, EX_READ); +} + +void f2fs_update_read_extent_cache(struct dnode_of_data *dn) +{ + return __update_extent_cache(dn, EX_READ); +} + +void f2fs_update_read_extent_cache_range(struct dnode_of_data *dn, + pgoff_t fofs, block_t blkaddr, unsigned int len) +{ + struct extent_info ei = { + .fofs = fofs, + .len = len, + .blk = blkaddr, + }; + + if (!__may_extent_tree(dn->inode, EX_READ)) + return; + + __update_extent_tree_range(dn->inode, &ei, EX_READ); +} + +unsigned int 
f2fs_shrink_read_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) +{ + if (!test_opt(sbi, READ_EXTENT_CACHE)) + return 0; + + return __shrink_extent_tree(sbi, nr_shrink, EX_READ); +} + +static unsigned int __destroy_extent_node(struct inode *inode, + enum extent_type type) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct extent_tree *et = F2FS_I(inode)->extent_tree; + struct extent_tree *et = F2FS_I(inode)->extent_tree[type]; unsigned int node_cnt = 0; if (!et || !atomic_read(&et->node_cnt)) @@ -854,31 +957,44 @@ unsigned int f2fs_destroy_extent_node(struct inode *inode) return node_cnt; } -void f2fs_drop_extent_tree(struct inode *inode) +void f2fs_destroy_extent_node(struct inode *inode) +{ + __destroy_extent_node(inode, EX_READ); +} + +static void __drop_extent_tree(struct inode *inode, enum extent_type type) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct extent_tree *et = F2FS_I(inode)->extent_tree; + struct extent_tree *et = F2FS_I(inode)->extent_tree[type]; bool updated = false; - if (!f2fs_may_extent_tree(inode)) + if (!__may_extent_tree(inode, type)) return; write_lock(&et->lock); - set_inode_flag(inode, FI_NO_EXTENT); __free_extent_tree(sbi, et); - if (et->largest.len) { - et->largest.len = 0; - updated = true; + if (type == EX_READ) { + set_inode_flag(inode, FI_NO_EXTENT); + if (et->largest.len) { + et->largest.len = 0; + updated = true; + } } write_unlock(&et->lock); if (updated) f2fs_mark_inode_dirty_sync(inode, true); } -void f2fs_destroy_extent_tree(struct inode *inode) +void f2fs_drop_extent_tree(struct inode *inode) +{ + __drop_extent_tree(inode, EX_READ); +} + +static void __destroy_extent_tree(struct inode *inode, enum extent_type type) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct extent_tree *et = F2FS_I(inode)->extent_tree; + struct extent_tree_info *eti = &sbi->extent_tree[type]; + struct extent_tree *et = F2FS_I(inode)->extent_tree[type]; unsigned int node_cnt = 0; if (!et) @@ -886,76 +1002,49 @@ void f2fs_destroy_extent_tree(struct inode *inode) if (inode->i_nlink && !is_bad_inode(inode) && atomic_read(&et->node_cnt)) { - mutex_lock(&sbi->extent_tree_lock); - list_add_tail(&et->list, &sbi->zombie_list); - atomic_inc(&sbi->total_zombie_tree); - mutex_unlock(&sbi->extent_tree_lock); + mutex_lock(&eti->extent_tree_lock); + list_add_tail(&et->list, &eti->zombie_list); + atomic_inc(&eti->total_zombie_tree); + mutex_unlock(&eti->extent_tree_lock); return; } /* free all extent info belong to this extent tree */ - node_cnt = f2fs_destroy_extent_node(inode); + node_cnt = __destroy_extent_node(inode, type); /* delete extent tree entry in radix tree */ - mutex_lock(&sbi->extent_tree_lock); + mutex_lock(&eti->extent_tree_lock); f2fs_bug_on(sbi, atomic_read(&et->node_cnt)); - radix_tree_delete(&sbi->extent_tree_root, inode->i_ino); + radix_tree_delete(&eti->extent_tree_root, inode->i_ino); kmem_cache_free(extent_tree_slab, et); - atomic_dec(&sbi->total_ext_tree); - mutex_unlock(&sbi->extent_tree_lock); + atomic_dec(&eti->total_ext_tree); + mutex_unlock(&eti->extent_tree_lock); - F2FS_I(inode)->extent_tree = NULL; + F2FS_I(inode)->extent_tree[type] = NULL; - trace_f2fs_destroy_extent_tree(inode, node_cnt); + trace_f2fs_destroy_extent_tree(inode, node_cnt, type); } -bool f2fs_lookup_extent_cache(struct inode *inode, pgoff_t pgofs, - struct extent_info *ei) +void f2fs_destroy_extent_tree(struct inode *inode) { - if (!f2fs_may_extent_tree(inode)) - return false; - - return f2fs_lookup_extent_tree(inode, pgofs, ei); + __destroy_extent_tree(inode, 
EX_READ); } -void f2fs_update_extent_cache(struct dnode_of_data *dn) +static void __init_extent_tree_info(struct extent_tree_info *eti) { - pgoff_t fofs; - block_t blkaddr; - - if (!f2fs_may_extent_tree(dn->inode)) - return; - - if (dn->data_blkaddr == NEW_ADDR) - blkaddr = NULL_ADDR; - else - blkaddr = dn->data_blkaddr; - - fofs = f2fs_start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) + - dn->ofs_in_node; - f2fs_update_extent_tree_range(dn->inode, fofs, blkaddr, 1); -} - -void f2fs_update_extent_cache_range(struct dnode_of_data *dn, - pgoff_t fofs, block_t blkaddr, unsigned int len) - -{ - if (!f2fs_may_extent_tree(dn->inode)) - return; - - f2fs_update_extent_tree_range(dn->inode, fofs, blkaddr, len); + INIT_RADIX_TREE(&eti->extent_tree_root, GFP_NOIO); + mutex_init(&eti->extent_tree_lock); + INIT_LIST_HEAD(&eti->extent_list); + spin_lock_init(&eti->extent_lock); + atomic_set(&eti->total_ext_tree, 0); + INIT_LIST_HEAD(&eti->zombie_list); + atomic_set(&eti->total_zombie_tree, 0); + atomic_set(&eti->total_ext_node, 0); } void f2fs_init_extent_cache_info(struct f2fs_sb_info *sbi) { - INIT_RADIX_TREE(&sbi->extent_tree_root, GFP_NOIO); - mutex_init(&sbi->extent_tree_lock); - INIT_LIST_HEAD(&sbi->extent_list); - spin_lock_init(&sbi->extent_lock); - atomic_set(&sbi->total_ext_tree, 0); - INIT_LIST_HEAD(&sbi->zombie_list); - atomic_set(&sbi->total_zombie_tree, 0); - atomic_set(&sbi->total_ext_node, 0); + __init_extent_tree_info(&sbi->extent_tree[EX_READ]); } int __init f2fs_create_extent_cache(void) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 04fdf010bb77..7c68bedee649 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -596,16 +596,22 @@ enum { /* dirty segments threshold for triggering CP */ #define DEFAULT_DIRTY_THRESHOLD 4 +#define RECOVERY_MAX_RA_BLOCKS BIO_MAX_VECS +#define RECOVERY_MIN_RA_BLOCKS 1 + +#define F2FS_ONSTACK_PAGES 16 /* nr of onstack pages */ + /* for in-memory extent cache entry */ #define F2FS_MIN_EXTENT_LEN 64 /* minimum extent length */ /* number of extent info in extent cache we try to shrink */ #define READ_EXTENT_CACHE_SHRINK_NUMBER 128 -#define RECOVERY_MAX_RA_BLOCKS BIO_MAX_VECS -#define RECOVERY_MIN_RA_BLOCKS 1 - -#define F2FS_ONSTACK_PAGES 16 /* nr of onstack pages */ +/* extent cache type */ +enum extent_type { + EX_READ, + NR_EXTENT_CACHES, +}; struct rb_entry { struct rb_node rb_node; /* rb node located in rb-tree */ @@ -621,10 +627,17 @@ struct rb_entry { struct extent_info { unsigned int fofs; /* start offset in a file */ unsigned int len; /* length of the extent */ - block_t blk; /* start block address of the extent */ + union { + /* read extent_cache */ + struct { + /* start block address of the extent */ + block_t blk; #ifdef CONFIG_F2FS_FS_COMPRESSION - unsigned int c_len; /* physical extent length of compressed blocks */ + /* physical extent length of compressed blocks */ + unsigned int c_len; #endif + }; + }; }; struct extent_node { @@ -636,13 +649,25 @@ struct extent_node { struct extent_tree { nid_t ino; /* inode number */ + enum extent_type type; /* keep the extent tree type */ struct rb_root_cached root; /* root of extent info rb-tree */ struct extent_node *cached_en; /* recently accessed extent node */ - struct extent_info largest; /* largested extent info */ struct list_head list; /* to be used by sbi->zombie_list */ rwlock_t lock; /* protect extent info rb-tree */ atomic_t node_cnt; /* # of extent node in rb-tree*/ bool largest_updated; /* largest extent updated */ + struct extent_info largest; /* largest cached extent for EX_READ */ 
+}; + +struct extent_tree_info { + struct radix_tree_root extent_tree_root;/* cache extent cache entries */ + struct mutex extent_tree_lock; /* locking extent radix tree */ + struct list_head extent_list; /* lru list for shrinker */ + spinlock_t extent_lock; /* locking extent lru list */ + atomic_t total_ext_tree; /* extent tree count */ + struct list_head zombie_list; /* extent zombie tree list */ + atomic_t total_zombie_tree; /* extent zombie tree count */ + atomic_t total_ext_node; /* extent info count */ }; /* @@ -805,7 +830,8 @@ struct f2fs_inode_info { struct list_head dirty_list; /* dirty list for dirs and files */ struct list_head gdirty_list; /* linked in global dirty list */ struct task_struct *atomic_write_task; /* store atomic write task */ - struct extent_tree *extent_tree; /* cached extent_tree entry */ + struct extent_tree *extent_tree[NR_EXTENT_CACHES]; + /* cached extent_tree entry */ struct inode *cow_inode; /* copy-on-write inode for atomic write */ /* avoid racing between foreground op and gc */ @@ -1626,14 +1652,7 @@ struct f2fs_sb_info { struct mutex flush_lock; /* for flush exclusion */ /* for extent tree cache */ - struct radix_tree_root extent_tree_root;/* cache extent cache entries */ - struct mutex extent_tree_lock; /* locking extent radix tree */ - struct list_head extent_list; /* lru list for shrinker */ - spinlock_t extent_lock; /* locking extent lru list */ - atomic_t total_ext_tree; /* extent tree count */ - struct list_head zombie_list; /* extent zombie tree list */ - atomic_t total_zombie_tree; /* extent zombie tree count */ - atomic_t total_ext_node; /* extent info count */ + struct extent_tree_info extent_tree[NR_EXTENT_CACHES]; /* basic filesystem units */ unsigned int log_sectors_per_block; /* log2 sectors per block */ @@ -1718,10 +1737,14 @@ struct f2fs_sb_info { unsigned int segment_count[2]; /* # of allocated segments */ unsigned int block_count[2]; /* # of allocated blocks */ atomic_t inplace_count; /* # of inplace update */ - atomic64_t total_hit_ext; /* # of lookup extent cache */ - atomic64_t read_hit_rbtree; /* # of hit rbtree extent node */ - atomic64_t read_hit_largest; /* # of hit largest extent node */ - atomic64_t read_hit_cached; /* # of hit cached extent node */ + /* # of lookup extent cache */ + atomic64_t total_hit_ext[NR_EXTENT_CACHES]; + /* # of hit rbtree extent node */ + atomic64_t read_hit_rbtree[NR_EXTENT_CACHES]; + /* # of hit cached extent node */ + atomic64_t read_hit_cached[NR_EXTENT_CACHES]; + /* # of hit largest extent node in read extent cache */ + atomic64_t read_hit_largest; atomic_t inline_xattr; /* # of inline_xattr inodes */ atomic_t inline_inode; /* # of inline_data inodes */ atomic_t inline_dir; /* # of inline_dentry inodes */ @@ -3823,9 +3846,17 @@ struct f2fs_stat_info { struct f2fs_sb_info *sbi; int all_area_segs, sit_area_segs, nat_area_segs, ssa_area_segs; int main_area_segs, main_area_sections, main_area_zones; - unsigned long long hit_largest, hit_cached, hit_rbtree; - unsigned long long hit_total, total_ext; - int ext_tree, zombie_tree, ext_node; + unsigned long long hit_cached[NR_EXTENT_CACHES]; + unsigned long long hit_rbtree[NR_EXTENT_CACHES]; + unsigned long long total_ext[NR_EXTENT_CACHES]; + unsigned long long hit_total[NR_EXTENT_CACHES]; + int ext_tree[NR_EXTENT_CACHES]; + int zombie_tree[NR_EXTENT_CACHES]; + int ext_node[NR_EXTENT_CACHES]; + /* to count memory footprint */ + unsigned long long ext_mem[NR_EXTENT_CACHES]; + /* for read extent cache */ + unsigned long long hit_largest; int ndirty_node, 
ndirty_dent, ndirty_meta, ndirty_imeta; int ndirty_data, ndirty_qdata; unsigned int ndirty_dirs, ndirty_files, nquota_files, ndirty_all; @@ -3884,10 +3915,10 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) #define stat_other_skip_bggc_count(sbi) ((sbi)->other_skip_bggc++) #define stat_inc_dirty_inode(sbi, type) ((sbi)->ndirty_inode[type]++) #define stat_dec_dirty_inode(sbi, type) ((sbi)->ndirty_inode[type]--) -#define stat_inc_total_hit(sbi) (atomic64_inc(&(sbi)->total_hit_ext)) -#define stat_inc_rbtree_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_rbtree)) +#define stat_inc_total_hit(sbi, type) (atomic64_inc(&(sbi)->total_hit_ext[type])) +#define stat_inc_rbtree_node_hit(sbi, type) (atomic64_inc(&(sbi)->read_hit_rbtree[type])) #define stat_inc_largest_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_largest)) -#define stat_inc_cached_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_cached)) +#define stat_inc_cached_node_hit(sbi, type) (atomic64_inc(&(sbi)->read_hit_cached[type])) #define stat_inc_inline_xattr(inode) \ do { \ if (f2fs_has_inline_xattr(inode)) \ @@ -4010,10 +4041,10 @@ void f2fs_update_sit_info(struct f2fs_sb_info *sbi); #define stat_other_skip_bggc_count(sbi) do { } while (0) #define stat_inc_dirty_inode(sbi, type) do { } while (0) #define stat_dec_dirty_inode(sbi, type) do { } while (0) -#define stat_inc_total_hit(sbi) do { } while (0) -#define stat_inc_rbtree_node_hit(sbi) do { } while (0) +#define stat_inc_total_hit(sbi, type) do { } while (0) +#define stat_inc_rbtree_node_hit(sbi, type) do { } while (0) #define stat_inc_largest_node_hit(sbi) do { } while (0) -#define stat_inc_cached_node_hit(sbi) do { } while (0) +#define stat_inc_cached_node_hit(sbi, type) do { } while (0) #define stat_inc_inline_xattr(inode) do { } while (0) #define stat_dec_inline_xattr(inode) do { } while (0) #define stat_inc_inline_inode(inode) do { } while (0) @@ -4119,20 +4150,23 @@ struct rb_entry *f2fs_lookup_rb_tree_ret(struct rb_root_cached *root, bool force, bool *leftmost); bool f2fs_check_rb_tree_consistence(struct f2fs_sb_info *sbi, struct rb_root_cached *root, bool check_key); -unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink); void f2fs_init_extent_tree(struct inode *inode, struct page *ipage); void f2fs_drop_extent_tree(struct inode *inode); -unsigned int f2fs_destroy_extent_node(struct inode *inode); +void f2fs_destroy_extent_node(struct inode *inode); void f2fs_destroy_extent_tree(struct inode *inode); -bool f2fs_lookup_extent_cache(struct inode *inode, pgoff_t pgofs, - struct extent_info *ei); -void f2fs_update_extent_cache(struct dnode_of_data *dn); -void f2fs_update_extent_cache_range(struct dnode_of_data *dn, - pgoff_t fofs, block_t blkaddr, unsigned int len); void f2fs_init_extent_cache_info(struct f2fs_sb_info *sbi); int __init f2fs_create_extent_cache(void); void f2fs_destroy_extent_cache(void); +/* read extent cache ops */ +bool f2fs_lookup_read_extent_cache(struct inode *inode, pgoff_t pgofs, + struct extent_info *ei); +void f2fs_update_read_extent_cache(struct dnode_of_data *dn); +void f2fs_update_read_extent_cache_range(struct dnode_of_data *dn, + pgoff_t fofs, block_t blkaddr, unsigned int len); +unsigned int f2fs_shrink_read_extent_tree(struct f2fs_sb_info *sbi, + int nr_shrink); + /* * sysfs.c */ @@ -4202,9 +4236,9 @@ int f2fs_write_multi_pages(struct compress_ctx *cc, struct writeback_control *wbc, enum iostat_type io_type); int f2fs_is_compressed_cluster(struct inode *inode, pgoff_t index); -void 
f2fs_update_extent_tree_range_compressed(struct inode *inode, - pgoff_t fofs, block_t blkaddr, unsigned int llen, - unsigned int c_len); +void f2fs_update_read_extent_tree_range_compressed(struct inode *inode, + pgoff_t fofs, block_t blkaddr, + unsigned int llen, unsigned int c_len); int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, unsigned nr_pages, sector_t *last_block_in_bio, bool is_readahead, bool for_write); @@ -4285,9 +4319,10 @@ static inline bool f2fs_load_compressed_page(struct f2fs_sb_info *sbi, static inline void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi, nid_t ino) { } #define inc_compr_inode_stat(inode) do { } while (0) -static inline void f2fs_update_extent_tree_range_compressed(struct inode *inode, - pgoff_t fofs, block_t blkaddr, unsigned int llen, - unsigned int c_len) { } +static inline void f2fs_update_read_extent_tree_range_compressed( + struct inode *inode, + pgoff_t fofs, block_t blkaddr, + unsigned int llen, unsigned int c_len) { } #endif static inline int set_compress_context(struct inode *inode) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index ab0a0d3730f6..cbe7c24065c7 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -618,7 +618,7 @@ void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count) */ fofs = f2fs_start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) + ofs; - f2fs_update_extent_cache_range(dn, fofs, 0, len); + f2fs_update_read_extent_cache_range(dn, fofs, 0, len); dec_valid_block_count(sbi, dn->inode, nr_free); } dn->ofs_in_node = ofs; @@ -1496,7 +1496,7 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start, f2fs_set_data_blkaddr(dn); } - f2fs_update_extent_cache_range(dn, start, 0, index - start); + f2fs_update_read_extent_cache_range(dn, start, 0, index - start); return ret; } @@ -2558,7 +2558,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, struct f2fs_map_blocks map = { .m_next_extent = NULL, .m_seg_type = NO_CHECK_TYPE, .m_may_create = false }; - struct extent_info ei = {0, 0, 0}; + struct extent_info ei = {0, }; pgoff_t pg_start, pg_end, next_pgofs; unsigned int blk_per_seg = sbi->blocks_per_seg; unsigned int total = 0, sec_num; @@ -2590,7 +2590,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, * lookup mapping info in extent cache, skip defragmenting if physical * block addresses are continuous. 
*/ - if (f2fs_lookup_extent_cache(inode, pg_start, &ei)) { + if (f2fs_lookup_read_extent_cache(inode, pg_start, &ei)) { if (ei.fofs + ei.len >= pg_end) goto out; } diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index d19e26b2e875..f0c6506d8975 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1146,7 +1146,7 @@ static int ra_data_block(struct inode *inode, pgoff_t index) struct address_space *mapping = inode->i_mapping; struct dnode_of_data dn; struct page *page; - struct extent_info ei = {0, 0, 0}; + struct extent_info ei = {0, }; struct f2fs_io_info fio = { .sbi = sbi, .ino = inode->i_ino, @@ -1164,7 +1164,7 @@ static int ra_data_block(struct inode *inode, pgoff_t index) if (!page) return -ENOMEM; - if (f2fs_lookup_extent_cache(inode, index, &ei)) { + if (f2fs_lookup_read_extent_cache(inode, index, &ei)) { dn.data_blkaddr = ei.blk + index - ei.fofs; if (unlikely(!f2fs_is_valid_blkaddr(sbi, dn.data_blkaddr, DATA_GENERIC_ENHANCE_READ))) { diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 2c705c60019b..086f201f15a0 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -262,8 +262,8 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) return false; } - if (fi->extent_tree) { - struct extent_info *ei = &fi->extent_tree->largest; + if (fi->extent_tree[EX_READ]) { + struct extent_info *ei = &fi->extent_tree[EX_READ]->largest; if (ei->len && (!f2fs_is_valid_blkaddr(sbi, ei->blk, @@ -607,7 +607,7 @@ retry: void f2fs_update_inode(struct inode *inode, struct page *node_page) { struct f2fs_inode *ri; - struct extent_tree *et = F2FS_I(inode)->extent_tree; + struct extent_tree *et = F2FS_I(inode)->extent_tree[EX_READ]; f2fs_wait_on_page_writeback(node_page, NODE, true, true); set_page_dirty(node_page); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 84b147966080..07419c3e42a5 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -86,9 +86,11 @@ bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type) mem_size >>= PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); } else if (type == READ_EXTENT_CACHE) { - mem_size = (atomic_read(&sbi->total_ext_tree) * + struct extent_tree_info *eti = &sbi->extent_tree[EX_READ]; + + mem_size = (atomic_read(&eti->total_ext_tree) * sizeof(struct extent_tree) + - atomic_read(&sbi->total_ext_node) * + atomic_read(&eti->total_ext_node) * sizeof(struct extent_node)) >> PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); } else if (type == DISCARD_CACHE) { @@ -859,7 +861,7 @@ int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) blkaddr = data_blkaddr(dn->inode, dn->node_page, dn->ofs_in_node + 1); - f2fs_update_extent_tree_range_compressed(dn->inode, + f2fs_update_read_extent_tree_range_compressed(dn->inode, index, blkaddr, F2FS_I(dn->inode)->i_cluster_size, c_len); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 51de358bc452..8722d1a13c17 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -450,7 +450,8 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg) /* try to shrink extent cache when there is no enough memory */ if (!f2fs_available_free_memory(sbi, READ_EXTENT_CACHE)) - f2fs_shrink_extent_tree(sbi, READ_EXTENT_CACHE_SHRINK_NUMBER); + f2fs_shrink_read_extent_tree(sbi, + READ_EXTENT_CACHE_SHRINK_NUMBER); /* check the # of cached NAT entries */ if (!f2fs_available_free_memory(sbi, NAT_ENTRIES)) diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c index dd3c3c7a90ec..33c490e69ae3 100644 --- a/fs/f2fs/shrinker.c +++ b/fs/f2fs/shrinker.c @@ 
-28,10 +28,13 @@ static unsigned long __count_free_nids(struct f2fs_sb_info *sbi) return count > 0 ? count : 0; } -static unsigned long __count_extent_cache(struct f2fs_sb_info *sbi) +static unsigned long __count_extent_cache(struct f2fs_sb_info *sbi, + enum extent_type type) { - return atomic_read(&sbi->total_zombie_tree) + - atomic_read(&sbi->total_ext_node); + struct extent_tree_info *eti = &sbi->extent_tree[type]; + + return atomic_read(&eti->total_zombie_tree) + + atomic_read(&eti->total_ext_node); } unsigned long f2fs_shrink_count(struct shrinker *shrink, @@ -53,8 +56,8 @@ unsigned long f2fs_shrink_count(struct shrinker *shrink, } spin_unlock(&f2fs_list_lock); - /* count extent cache entries */ - count += __count_extent_cache(sbi); + /* count read extent cache entries */ + count += __count_extent_cache(sbi, EX_READ); /* count clean nat cache entries */ count += __count_nat_entries(sbi); @@ -99,8 +102,8 @@ unsigned long f2fs_shrink_scan(struct shrinker *shrink, sbi->shrinker_run_no = run_no; - /* shrink extent cache entries */ - freed += f2fs_shrink_extent_tree(sbi, nr >> 1); + /* shrink read extent cache entries */ + freed += f2fs_shrink_read_extent_tree(sbi, nr >> 1); /* shrink clean nat cache entries */ if (freed < nr) @@ -130,7 +133,7 @@ void f2fs_join_shrinker(struct f2fs_sb_info *sbi) void f2fs_leave_shrinker(struct f2fs_sb_info *sbi) { - f2fs_shrink_extent_tree(sbi, __count_extent_cache(sbi)); + f2fs_shrink_read_extent_tree(sbi, __count_extent_cache(sbi, EX_READ)); spin_lock(&f2fs_list_lock); list_del_init(&sbi->s_list); diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 7fbfce498472..2bb37892d2ba 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -48,6 +48,7 @@ TRACE_DEFINE_ENUM(CP_DISCARD); TRACE_DEFINE_ENUM(CP_TRIMMED); TRACE_DEFINE_ENUM(CP_PAUSE); TRACE_DEFINE_ENUM(CP_RESIZE); +TRACE_DEFINE_ENUM(EX_READ); #define show_block_type(type) \ __print_symbolic(type, \ @@ -1522,28 +1523,31 @@ TRACE_EVENT(f2fs_issue_flush, TRACE_EVENT(f2fs_lookup_extent_tree_start, - TP_PROTO(struct inode *inode, unsigned int pgofs), + TP_PROTO(struct inode *inode, unsigned int pgofs, enum extent_type type), - TP_ARGS(inode, pgofs), + TP_ARGS(inode, pgofs, type), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(unsigned int, pgofs) + __field(enum extent_type, type) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->pgofs = pgofs; + __entry->type = type; ), - TP_printk("dev = (%d,%d), ino = %lu, pgofs = %u", + TP_printk("dev = (%d,%d), ino = %lu, pgofs = %u, type = %s", show_dev_ino(__entry), - __entry->pgofs) + __entry->pgofs, + __entry->type == EX_READ ? 
"Read" : "N/A") ); -TRACE_EVENT_CONDITION(f2fs_lookup_extent_tree_end, +TRACE_EVENT_CONDITION(f2fs_lookup_read_extent_tree_end, TP_PROTO(struct inode *inode, unsigned int pgofs, struct extent_info *ei), @@ -1557,8 +1561,8 @@ TRACE_EVENT_CONDITION(f2fs_lookup_extent_tree_end, __field(ino_t, ino) __field(unsigned int, pgofs) __field(unsigned int, fofs) - __field(u32, blk) __field(unsigned int, len) + __field(u32, blk) ), TP_fast_assign( @@ -1566,26 +1570,26 @@ TRACE_EVENT_CONDITION(f2fs_lookup_extent_tree_end, __entry->ino = inode->i_ino; __entry->pgofs = pgofs; __entry->fofs = ei->fofs; - __entry->blk = ei->blk; __entry->len = ei->len; + __entry->blk = ei->blk; ), TP_printk("dev = (%d,%d), ino = %lu, pgofs = %u, " - "ext_info(fofs: %u, blk: %u, len: %u)", + "read_ext_info(fofs: %u, len: %u, blk: %u)", show_dev_ino(__entry), __entry->pgofs, __entry->fofs, - __entry->blk, - __entry->len) + __entry->len, + __entry->blk) ); -TRACE_EVENT(f2fs_update_extent_tree_range, +TRACE_EVENT(f2fs_update_read_extent_tree_range, - TP_PROTO(struct inode *inode, unsigned int pgofs, block_t blkaddr, - unsigned int len, + TP_PROTO(struct inode *inode, unsigned int pgofs, unsigned int len, + block_t blkaddr, unsigned int c_len), - TP_ARGS(inode, pgofs, blkaddr, len, c_len), + TP_ARGS(inode, pgofs, len, blkaddr, c_len), TP_STRUCT__entry( __field(dev_t, dev) @@ -1600,67 +1604,73 @@ TRACE_EVENT(f2fs_update_extent_tree_range, __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->pgofs = pgofs; - __entry->blk = blkaddr; __entry->len = len; + __entry->blk = blkaddr; __entry->c_len = c_len; ), TP_printk("dev = (%d,%d), ino = %lu, pgofs = %u, " - "blkaddr = %u, len = %u, " - "c_len = %u", + "len = %u, blkaddr = %u, c_len = %u", show_dev_ino(__entry), __entry->pgofs, - __entry->blk, __entry->len, + __entry->blk, __entry->c_len) ); TRACE_EVENT(f2fs_shrink_extent_tree, TP_PROTO(struct f2fs_sb_info *sbi, unsigned int node_cnt, - unsigned int tree_cnt), + unsigned int tree_cnt, enum extent_type type), - TP_ARGS(sbi, node_cnt, tree_cnt), + TP_ARGS(sbi, node_cnt, tree_cnt, type), TP_STRUCT__entry( __field(dev_t, dev) __field(unsigned int, node_cnt) __field(unsigned int, tree_cnt) + __field(enum extent_type, type) ), TP_fast_assign( __entry->dev = sbi->sb->s_dev; __entry->node_cnt = node_cnt; __entry->tree_cnt = tree_cnt; + __entry->type = type; ), - TP_printk("dev = (%d,%d), shrunk: node_cnt = %u, tree_cnt = %u", + TP_printk("dev = (%d,%d), shrunk: node_cnt = %u, tree_cnt = %u, type = %s", show_dev(__entry->dev), __entry->node_cnt, - __entry->tree_cnt) + __entry->tree_cnt, + __entry->type == EX_READ ? "Read" : "N/A") ); TRACE_EVENT(f2fs_destroy_extent_tree, - TP_PROTO(struct inode *inode, unsigned int node_cnt), + TP_PROTO(struct inode *inode, unsigned int node_cnt, + enum extent_type type), - TP_ARGS(inode, node_cnt), + TP_ARGS(inode, node_cnt, type), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(unsigned int, node_cnt) + __field(enum extent_type, type) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->node_cnt = node_cnt; + __entry->type = type; ), - TP_printk("dev = (%d,%d), ino = %lu, destroyed: node_cnt = %u", + TP_printk("dev = (%d,%d), ino = %lu, destroyed: node_cnt = %u, type = %s", show_dev_ino(__entry), - __entry->node_cnt) + __entry->node_cnt, + __entry->type == EX_READ ? 
"Read" : "N/A") ); DECLARE_EVENT_CLASS(f2fs_sync_dirty_inodes, From 5ce7de6415c02b5d02731c7b2f27ba10ad566c1a Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 2 Dec 2022 13:51:09 -0800 Subject: [PATCH 328/457] f2fs: allocate the extent_cache by default Let's allocate it to remove the runtime complexity. Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 38 +++++++++++++++++++------------------- fs/f2fs/f2fs.h | 3 ++- fs/f2fs/inode.c | 6 ++++-- fs/f2fs/namei.c | 4 ++-- 4 files changed, 27 insertions(+), 24 deletions(-) diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 654a14ab8977..305f969e3ad1 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -47,20 +47,23 @@ static bool __may_read_extent_tree(struct inode *inode) return S_ISREG(inode->i_mode); } +static bool __init_may_extent_tree(struct inode *inode, enum extent_type type) +{ + if (type == EX_READ) + return __may_read_extent_tree(inode); + return false; +} + static bool __may_extent_tree(struct inode *inode, enum extent_type type) { - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - /* * for recovered files during mount do not create extents * if shrinker is not registered. */ - if (list_empty(&sbi->s_list)) + if (list_empty(&F2FS_I_SB(inode)->s_list)) return false; - if (type == EX_READ) - return __may_read_extent_tree(inode); - return false; + return __init_may_extent_tree(inode, type); } static void __try_update_largest_extent(struct extent_tree *et, @@ -439,20 +442,18 @@ static void __drop_largest_extent(struct extent_tree *et, } } -/* return true, if inode page is changed */ -static void __f2fs_init_extent_tree(struct inode *inode, struct page *ipage, - enum extent_type type) +void f2fs_init_read_extent_tree(struct inode *inode, struct page *ipage) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct extent_tree_info *eti = &sbi->extent_tree[type]; - struct f2fs_extent *i_ext = ipage ? 
&F2FS_INODE(ipage)->i_ext : NULL; + struct extent_tree_info *eti = &sbi->extent_tree[EX_READ]; + struct f2fs_extent *i_ext = &F2FS_INODE(ipage)->i_ext; struct extent_tree *et; struct extent_node *en; struct extent_info ei; - if (!__may_extent_tree(inode, type)) { + if (!__may_extent_tree(inode, EX_READ)) { /* drop largest read extent */ - if (type == EX_READ && i_ext && i_ext->len) { + if (i_ext && i_ext->len) { f2fs_wait_on_page_writeback(ipage, NODE, true, true); i_ext->len = 0; set_page_dirty(ipage); @@ -460,13 +461,11 @@ static void __f2fs_init_extent_tree(struct inode *inode, struct page *ipage, goto out; } - et = __grab_extent_tree(inode, type); + et = __grab_extent_tree(inode, EX_READ); if (!i_ext || !i_ext->len) goto out; - BUG_ON(type != EX_READ); - get_read_extent_info(&ei, i_ext); write_lock(&et->lock); @@ -486,14 +485,15 @@ static void __f2fs_init_extent_tree(struct inode *inode, struct page *ipage, unlock_out: write_unlock(&et->lock); out: - if (type == EX_READ && !F2FS_I(inode)->extent_tree[EX_READ]) + if (!F2FS_I(inode)->extent_tree[EX_READ]) set_inode_flag(inode, FI_NO_EXTENT); } -void f2fs_init_extent_tree(struct inode *inode, struct page *ipage) +void f2fs_init_extent_tree(struct inode *inode) { /* initialize read cache */ - __f2fs_init_extent_tree(inode, ipage, EX_READ); + if (__init_may_extent_tree(inode, EX_READ)) + __grab_extent_tree(inode, EX_READ); } static bool __lookup_extent_tree(struct inode *inode, pgoff_t pgofs, diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 7c68bedee649..ec52e06f8e61 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -4150,7 +4150,7 @@ struct rb_entry *f2fs_lookup_rb_tree_ret(struct rb_root_cached *root, bool force, bool *leftmost); bool f2fs_check_rb_tree_consistence(struct f2fs_sb_info *sbi, struct rb_root_cached *root, bool check_key); -void f2fs_init_extent_tree(struct inode *inode, struct page *ipage); +void f2fs_init_extent_tree(struct inode *inode); void f2fs_drop_extent_tree(struct inode *inode); void f2fs_destroy_extent_node(struct inode *inode); void f2fs_destroy_extent_tree(struct inode *inode); @@ -4159,6 +4159,7 @@ int __init f2fs_create_extent_cache(void); void f2fs_destroy_extent_cache(void); /* read extent cache ops */ +void f2fs_init_read_extent_tree(struct inode *inode, struct page *ipage); bool f2fs_lookup_read_extent_cache(struct inode *inode, pgoff_t pgofs, struct extent_info *ei); void f2fs_update_read_extent_cache(struct dnode_of_data *dn); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 086f201f15a0..c845c16f97d0 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -392,8 +392,6 @@ static int do_read_inode(struct inode *inode) fi->i_pino = le32_to_cpu(ri->i_pino); fi->i_dir_level = ri->i_dir_level; - f2fs_init_extent_tree(inode, node_page); - get_inline_info(inode, ri); fi->i_extra_isize = f2fs_has_extra_attr(inode) ? 
@@ -479,6 +477,10 @@ static int do_read_inode(struct inode *inode) } init_idisk_time(inode); + + /* Need all the flag bits */ + f2fs_init_read_extent_tree(inode, node_page); + f2fs_put_page(node_page, 1); stat_inc_inline_xattr(inode); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 58a91ce8fe08..46de782c2baa 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -284,8 +284,6 @@ static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns, } F2FS_I(inode)->i_inline_xattr_size = xattr_size; - f2fs_init_extent_tree(inode, NULL); - F2FS_I(inode)->i_flags = f2fs_mask_flags(mode, F2FS_I(dir)->i_flags & F2FS_FL_INHERITED); @@ -311,6 +309,8 @@ static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns, f2fs_set_inode_flags(inode); + f2fs_init_extent_tree(inode); + trace_f2fs_new_inode(inode, 0); return inode; From 62ee04f1fd03e7410ff555e4d95de4f4b22c9b7e Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 1 Dec 2022 17:37:15 -0800 Subject: [PATCH 329/457] f2fs: add block_age-based extent cache This patch introduces a runtime hot/cold data separation method for f2fs, in order to improve the accuracy for data temperature classification, reduce the garbage collection overhead after long-term data updates. Enhanced hot/cold data separation can record data block update frequency as "age" of the extent per inode, and take use of the age info to indicate better temperature type for data block allocation: - It records total data blocks allocated since mount; - When file extent has been updated, it calculate the count of data blocks allocated since last update as the age of the extent; - Before the data block allocated, it searches for the age info and chooses the suitable segment for allocation. Test and result: - Prepare: create about 30000 files * 3% for cold files (with cold file extension like .apk, from 3M to 10M) * 50% for warm files (with random file extension like .FcDxq, from 1K to 4M) * 47% for hot files (with hot file extension like .db, from 1K to 256K) - create(5%)/random update(90%)/delete(5%) the files * total write amount is about 70G * fsync will be called for .db files, and buffered write will be used for other files The storage of test device is large enough(128G) so that it will not switch to SSR mode during the test. Benefit: dirty segment count increment reduce about 14% - before: Dirty +21110 - after: Dirty +18286 Signed-off-by: qixiaoyu1 Signed-off-by: xiongping1 Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 14 ++ Documentation/filesystems/f2fs.rst | 4 + fs/f2fs/debug.c | 21 +++ fs/f2fs/extent_cache.c | 183 +++++++++++++++++++++++- fs/f2fs/f2fs.h | 38 +++++ fs/f2fs/file.c | 1 + fs/f2fs/inode.c | 1 + fs/f2fs/node.c | 10 +- fs/f2fs/node.h | 1 + fs/f2fs/segment.c | 33 +++++ fs/f2fs/shrinker.c | 10 +- fs/f2fs/super.c | 14 ++ fs/f2fs/sysfs.c | 24 ++++ include/trace/events/f2fs.h | 86 ++++++++++- 14 files changed, 430 insertions(+), 10 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 84a009aab1a1..9e3756625a81 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -655,3 +655,17 @@ Description: When space utilization exceeds this, do background DISCARD aggressi Does DISCARD forcibly in a period of given min_discard_issue_time when the number of discards is not 0 and set discard granularity to 1. 
Default: 80 + +What: /sys/fs/f2fs//hot_data_age_threshold +Date: November 2022 +Contact: "Ping Xiong" +Description: When DATA SEPARATION is on, it controls the age threshold to indicate + the data blocks as hot. By default it was initialized as 262144 blocks + (equals to 1GB). + +What: /sys/fs/f2fs//warm_data_age_threshold +Date: November 2022 +Contact: "Ping Xiong" +Description: When DATA SEPARATION is on, it controls the age threshold to indicate + the data blocks as warm. By default it was initialized as 2621440 blocks + (equals to 10GB). diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index 67e1f3e86f32..220f3e0d3f55 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -347,6 +347,10 @@ memory=%s Control memory mode. This supports "normal" and "low" modes. Because of the nature of low memory devices, in this mode, f2fs will try to save memory sometimes by sacrificing performance. "normal" mode is the default mode and same as before. +age_extent_cache Enable an age extent cache based on rb-tree. It records + data block update frequency of the extent per inode, in + order to provide better temperature hints for data block + allocation. ======================== ============================================================ Debugfs Entries diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index a9baa121d829..8f1ef742551f 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -88,6 +88,9 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->hit_largest = atomic64_read(&sbi->read_hit_largest); si->hit_total[EX_READ] += si->hit_largest; + /* block age extent_cache only */ + si->allocated_data_blocks = atomic64_read(&sbi->allocated_data_blocks); + /* validation check of the segment numbers */ si->ndirty_node = get_pages(sbi, F2FS_DIRTY_NODES); si->ndirty_dent = get_pages(sbi, F2FS_DIRTY_DENTS); @@ -516,6 +519,22 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n", si->ext_tree[EX_READ], si->zombie_tree[EX_READ], si->ext_node[EX_READ]); + seq_puts(s, "\nExtent Cache (Block Age):\n"); + seq_printf(s, " - Allocated Data Blocks: %llu\n", + si->allocated_data_blocks); + seq_printf(s, " - Hit Count: L1:%llu L2:%llu\n", + si->hit_cached[EX_BLOCK_AGE], + si->hit_rbtree[EX_BLOCK_AGE]); + seq_printf(s, " - Hit Ratio: %llu%% (%llu / %llu)\n", + !si->total_ext[EX_BLOCK_AGE] ? 0 : + div64_u64(si->hit_total[EX_BLOCK_AGE] * 100, + si->total_ext[EX_BLOCK_AGE]), + si->hit_total[EX_BLOCK_AGE], + si->total_ext[EX_BLOCK_AGE]); + seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n", + si->ext_tree[EX_BLOCK_AGE], + si->zombie_tree[EX_BLOCK_AGE], + si->ext_node[EX_BLOCK_AGE]); seq_puts(s, "\nBalancing F2FS Async:\n"); seq_printf(s, " - DIO (R: %4d, W: %4d)\n", si->nr_dio_read, si->nr_dio_write); @@ -586,6 +605,8 @@ static int stat_show(struct seq_file *s, void *v) si->cache_mem >> 10); seq_printf(s, " - read extent cache: %llu KB\n", si->ext_mem[EX_READ] >> 10); + seq_printf(s, " - block age extent cache: %llu KB\n", + si->ext_mem[EX_BLOCK_AGE] >> 10); seq_printf(s, " - paged : %llu KB\n", si->page_mem >> 10); } diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 305f969e3ad1..1bd38a78ebba 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -6,6 +6,10 @@ * Copyright (c) 2015 Samsung Electronics * Authors: Jaegeuk Kim * Chao Yu + * + * block_age-based extent cache added by: + * Copyright (c) 2022 xiaomi Co., Ltd. 
+ * http://www.xiaomi.com/ */ #include @@ -18,6 +22,7 @@ static void __set_extent_info(struct extent_info *ei, unsigned int fofs, unsigned int len, block_t blk, bool keep_clen, + unsigned long age, unsigned long last_blocks, enum extent_type type) { ei->fofs = fofs; @@ -30,6 +35,9 @@ static void __set_extent_info(struct extent_info *ei, #ifdef CONFIG_F2FS_FS_COMPRESSION ei->c_len = 0; #endif + } else if (type == EX_BLOCK_AGE) { + ei->age = age; + ei->last_blocks = last_blocks; } } @@ -47,10 +55,27 @@ static bool __may_read_extent_tree(struct inode *inode) return S_ISREG(inode->i_mode); } +static bool __may_age_extent_tree(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + if (!test_opt(sbi, AGE_EXTENT_CACHE)) + return false; + /* don't cache block age info for cold file */ + if (is_inode_flag_set(inode, FI_COMPRESSED_FILE)) + return false; + if (file_is_cold(inode)) + return false; + + return S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode); +} + static bool __init_may_extent_tree(struct inode *inode, enum extent_type type) { if (type == EX_READ) return __may_read_extent_tree(inode); + else if (type == EX_BLOCK_AGE) + return __may_age_extent_tree(inode); return false; } @@ -90,6 +115,11 @@ static bool __is_extent_mergeable(struct extent_info *back, #endif return (back->fofs + back->len == front->fofs && back->blk + back->len == front->blk); + } else if (type == EX_BLOCK_AGE) { + return (back->fofs + back->len == front->fofs && + abs(back->age - front->age) <= SAME_AGE_REGION && + abs(back->last_blocks - front->last_blocks) <= + SAME_AGE_REGION); } return false; } @@ -489,11 +519,22 @@ out: set_inode_flag(inode, FI_NO_EXTENT); } +void f2fs_init_age_extent_tree(struct inode *inode) +{ + if (!__init_may_extent_tree(inode, EX_BLOCK_AGE)) + return; + __grab_extent_tree(inode, EX_BLOCK_AGE); +} + void f2fs_init_extent_tree(struct inode *inode) { /* initialize read cache */ if (__init_may_extent_tree(inode, EX_READ)) __grab_extent_tree(inode, EX_READ); + + /* initialize block age cache */ + if (__init_may_extent_tree(inode, EX_BLOCK_AGE)) + __grab_extent_tree(inode, EX_BLOCK_AGE); } static bool __lookup_extent_tree(struct inode *inode, pgoff_t pgofs, @@ -544,6 +585,8 @@ out: if (type == EX_READ) trace_f2fs_lookup_read_extent_tree_end(inode, pgofs, ei); + else if (type == EX_BLOCK_AGE) + trace_f2fs_lookup_age_extent_tree_end(inode, pgofs, ei); return ret; } @@ -642,6 +685,10 @@ static void __update_extent_tree_range(struct inode *inode, if (type == EX_READ) trace_f2fs_update_read_extent_tree_range(inode, fofs, len, tei->blk, 0); + else if (type == EX_BLOCK_AGE) + trace_f2fs_update_age_extent_tree_range(inode, fofs, len, + tei->age, tei->last_blocks); + write_lock(&et->lock); if (type == EX_READ) { @@ -694,6 +741,7 @@ static void __update_extent_tree_range(struct inode *inode, __set_extent_info(&ei, end, org_end - end, end - dei.fofs + dei.blk, false, + dei.age, dei.last_blocks, type); en1 = __insert_extent_tree(sbi, et, &ei, NULL, NULL, true); @@ -702,6 +750,7 @@ static void __update_extent_tree_range(struct inode *inode, __set_extent_info(&en->ei, end, en->ei.len - (end - dei.fofs), en->ei.blk + (end - dei.fofs), true, + dei.age, dei.last_blocks, type); next_en = en; } @@ -732,11 +781,15 @@ static void __update_extent_tree_range(struct inode *inode, en = next_en; } + if (type == EX_BLOCK_AGE) + goto update_age_extent_cache; + /* 3. 
update extent in read extent cache */ BUG_ON(type != EX_READ); if (tei->blk) { - __set_extent_info(&ei, fofs, len, tei->blk, false, EX_READ); + __set_extent_info(&ei, fofs, len, tei->blk, false, + 0, 0, EX_READ); if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en)) __insert_extent_tree(sbi, et, &ei, insert_p, insert_parent, leftmost); @@ -758,7 +811,17 @@ static void __update_extent_tree_range(struct inode *inode, et->largest_updated = false; updated = true; } + goto out_read_extent_cache; +update_age_extent_cache: + if (!tei->last_blocks) + goto out_read_extent_cache; + __set_extent_info(&ei, fofs, len, 0, false, + tei->age, tei->last_blocks, EX_BLOCK_AGE); + if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en)) + __insert_extent_tree(sbi, et, &ei, + insert_p, insert_parent, leftmost); +out_read_extent_cache: write_unlock(&et->lock); if (updated) @@ -796,7 +859,7 @@ void f2fs_update_read_extent_tree_range_compressed(struct inode *inode, if (en) goto unlock_out; - __set_extent_info(&ei, fofs, llen, blkaddr, true, EX_READ); + __set_extent_info(&ei, fofs, llen, blkaddr, true, 0, 0, EX_READ); ei.c_len = c_len; if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en)) @@ -807,6 +870,72 @@ unlock_out: } #endif +static unsigned long long __calculate_block_age(unsigned long long new, + unsigned long long old) +{ + unsigned long long diff; + + diff = (new >= old) ? new - (new - old) : new + (old - new); + + return div_u64(diff * LAST_AGE_WEIGHT, 100); +} + +/* This returns a new age and allocated blocks in ei */ +static int __get_new_block_age(struct inode *inode, struct extent_info *ei) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + loff_t f_size = i_size_read(inode); + unsigned long long cur_blocks = + atomic64_read(&sbi->allocated_data_blocks); + + /* + * When I/O is not aligned to a PAGE_SIZE, update will happen to the last + * file block even in seq write. So don't record age for newly last file + * block here. 
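+	 * (Presumably this avoids treating the still-growing tail block of a
+	 *  sequential write as frequently-updated, i.e. hot, data.)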
+ */ + if ((f_size >> PAGE_SHIFT) == ei->fofs && f_size & (PAGE_SIZE - 1) && + ei->blk == NEW_ADDR) + return -EINVAL; + + if (__lookup_extent_tree(inode, ei->fofs, ei, EX_BLOCK_AGE)) { + unsigned long long cur_age; + + if (cur_blocks >= ei->last_blocks) + cur_age = cur_blocks - ei->last_blocks; + else + /* allocated_data_blocks overflow */ + cur_age = ULLONG_MAX - ei->last_blocks + cur_blocks; + + if (ei->age) + ei->age = __calculate_block_age(cur_age, ei->age); + else + ei->age = cur_age; + ei->last_blocks = cur_blocks; + WARN_ON(ei->age > cur_blocks); + return 0; + } + + f2fs_bug_on(sbi, ei->blk == NULL_ADDR); + + /* the data block was allocated for the first time */ + if (ei->blk == NEW_ADDR) + goto out; + + if (__is_valid_data_blkaddr(ei->blk) && + !f2fs_is_valid_blkaddr(sbi, ei->blk, DATA_GENERIC_ENHANCE)) { + f2fs_bug_on(sbi, 1); + return -EINVAL; + } +out: + /* + * init block age with zero, this can happen when the block age extent + * was reclaimed due to memory constraint or system reboot + */ + ei->age = 0; + ei->last_blocks = cur_blocks; + return 0; +} + static void __update_extent_cache(struct dnode_of_data *dn, enum extent_type type) { struct extent_info ei; @@ -823,6 +952,10 @@ static void __update_extent_cache(struct dnode_of_data *dn, enum extent_type typ ei.blk = NULL_ADDR; else ei.blk = dn->data_blkaddr; + } else if (type == EX_BLOCK_AGE) { + ei.blk = dn->data_blkaddr; + if (__get_new_block_age(dn->inode, &ei)) + return; } __update_extent_tree_range(dn->inode, &ei, type); } @@ -940,6 +1073,43 @@ unsigned int f2fs_shrink_read_extent_tree(struct f2fs_sb_info *sbi, int nr_shrin return __shrink_extent_tree(sbi, nr_shrink, EX_READ); } +/* block age extent cache operations */ +bool f2fs_lookup_age_extent_cache(struct inode *inode, pgoff_t pgofs, + struct extent_info *ei) +{ + if (!__may_extent_tree(inode, EX_BLOCK_AGE)) + return false; + + return __lookup_extent_tree(inode, pgofs, ei, EX_BLOCK_AGE); +} + +void f2fs_update_age_extent_cache(struct dnode_of_data *dn) +{ + return __update_extent_cache(dn, EX_BLOCK_AGE); +} + +void f2fs_update_age_extent_cache_range(struct dnode_of_data *dn, + pgoff_t fofs, unsigned int len) +{ + struct extent_info ei = { + .fofs = fofs, + .len = len, + }; + + if (!__may_extent_tree(dn->inode, EX_BLOCK_AGE)) + return; + + __update_extent_tree_range(dn->inode, &ei, EX_BLOCK_AGE); +} + +unsigned int f2fs_shrink_age_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) +{ + if (!test_opt(sbi, AGE_EXTENT_CACHE)) + return 0; + + return __shrink_extent_tree(sbi, nr_shrink, EX_BLOCK_AGE); +} + static unsigned int __destroy_extent_node(struct inode *inode, enum extent_type type) { @@ -960,6 +1130,7 @@ static unsigned int __destroy_extent_node(struct inode *inode, void f2fs_destroy_extent_node(struct inode *inode) { __destroy_extent_node(inode, EX_READ); + __destroy_extent_node(inode, EX_BLOCK_AGE); } static void __drop_extent_tree(struct inode *inode, enum extent_type type) @@ -988,6 +1159,7 @@ static void __drop_extent_tree(struct inode *inode, enum extent_type type) void f2fs_drop_extent_tree(struct inode *inode) { __drop_extent_tree(inode, EX_READ); + __drop_extent_tree(inode, EX_BLOCK_AGE); } static void __destroy_extent_tree(struct inode *inode, enum extent_type type) @@ -1028,6 +1200,7 @@ static void __destroy_extent_tree(struct inode *inode, enum extent_type type) void f2fs_destroy_extent_tree(struct inode *inode) { __destroy_extent_tree(inode, EX_READ); + __destroy_extent_tree(inode, EX_BLOCK_AGE); } static void __init_extent_tree_info(struct 
extent_tree_info *eti) @@ -1045,6 +1218,12 @@ static void __init_extent_tree_info(struct extent_tree_info *eti) void f2fs_init_extent_cache_info(struct f2fs_sb_info *sbi) { __init_extent_tree_info(&sbi->extent_tree[EX_READ]); + __init_extent_tree_info(&sbi->extent_tree[EX_BLOCK_AGE]); + + /* initialize for block age extents */ + atomic64_set(&sbi->allocated_data_blocks, 0); + sbi->hot_data_age_threshold = DEF_HOT_DATA_AGE_THRESHOLD; + sbi->warm_data_age_threshold = DEF_WARM_DATA_AGE_THRESHOLD; } int __init f2fs_create_extent_cache(void) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index ec52e06f8e61..e8953c3dc81a 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -107,6 +107,7 @@ extern const char *f2fs_fault_name[FAULT_MAX]; #define F2FS_MOUNT_MERGE_CHECKPOINT 0x10000000 #define F2FS_MOUNT_GC_MERGE 0x20000000 #define F2FS_MOUNT_COMPRESS_CACHE 0x40000000 +#define F2FS_MOUNT_AGE_EXTENT_CACHE 0x80000000 #define F2FS_OPTION(sbi) ((sbi)->mount_opt) #define clear_opt(sbi, option) (F2FS_OPTION(sbi).opt &= ~F2FS_MOUNT_##option) @@ -607,9 +608,22 @@ enum { /* number of extent info in extent cache we try to shrink */ #define READ_EXTENT_CACHE_SHRINK_NUMBER 128 +/* number of age extent info in extent cache we try to shrink */ +#define AGE_EXTENT_CACHE_SHRINK_NUMBER 128 +#define LAST_AGE_WEIGHT 30 +#define SAME_AGE_REGION 1024 + +/* + * Define data block with age less than 1GB as hot data + * define data block with age less than 10GB but more than 1GB as warm data + */ +#define DEF_HOT_DATA_AGE_THRESHOLD 262144 +#define DEF_WARM_DATA_AGE_THRESHOLD 2621440 + /* extent cache type */ enum extent_type { EX_READ, + EX_BLOCK_AGE, NR_EXTENT_CACHES, }; @@ -637,6 +651,13 @@ struct extent_info { unsigned int c_len; #endif }; + /* block age extent_cache */ + struct { + /* block age of the extent */ + unsigned long long age; + /* last total blocks allocated */ + unsigned long long last_blocks; + }; }; }; @@ -1653,6 +1674,11 @@ struct f2fs_sb_info { /* for extent tree cache */ struct extent_tree_info extent_tree[NR_EXTENT_CACHES]; + atomic64_t allocated_data_blocks; /* for block age extent_cache */ + + /* The threshold used for hot and warm data seperation*/ + unsigned int hot_data_age_threshold; + unsigned int warm_data_age_threshold; /* basic filesystem units */ unsigned int log_sectors_per_block; /* log2 sectors per block */ @@ -3857,6 +3883,8 @@ struct f2fs_stat_info { unsigned long long ext_mem[NR_EXTENT_CACHES]; /* for read extent cache */ unsigned long long hit_largest; + /* for block age extent cache */ + unsigned long long allocated_data_blocks; int ndirty_node, ndirty_dent, ndirty_meta, ndirty_imeta; int ndirty_data, ndirty_qdata; unsigned int ndirty_dirs, ndirty_files, nquota_files, ndirty_all; @@ -4168,6 +4196,16 @@ void f2fs_update_read_extent_cache_range(struct dnode_of_data *dn, unsigned int f2fs_shrink_read_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink); +/* block age extent cache ops */ +void f2fs_init_age_extent_tree(struct inode *inode); +bool f2fs_lookup_age_extent_cache(struct inode *inode, pgoff_t pgofs, + struct extent_info *ei); +void f2fs_update_age_extent_cache(struct dnode_of_data *dn); +void f2fs_update_age_extent_cache_range(struct dnode_of_data *dn, + pgoff_t fofs, unsigned int len); +unsigned int f2fs_shrink_age_extent_tree(struct f2fs_sb_info *sbi, + int nr_shrink); + /* * sysfs.c */ diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index cbe7c24065c7..56c23b5e9d65 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -619,6 +619,7 @@ void f2fs_truncate_data_blocks_range(struct 
dnode_of_data *dn, int count) fofs = f2fs_start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) + ofs; f2fs_update_read_extent_cache_range(dn, fofs, 0, len); + f2fs_update_age_extent_cache_range(dn, fofs, nr_free); dec_valid_block_count(sbi, dn->inode, nr_free); } dn->ofs_in_node = ofs; diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index c845c16f97d0..ff6cf66ed46b 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -480,6 +480,7 @@ static int do_read_inode(struct inode *inode) /* Need all the flag bits */ f2fs_init_read_extent_tree(inode, node_page); + f2fs_init_age_extent_tree(inode); f2fs_put_page(node_page, 1); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 07419c3e42a5..dde4c0458704 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -60,7 +60,7 @@ bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type) avail_ram = val.totalram - val.totalhigh; /* - * give 25%, 25%, 50%, 50%, 50% memory for each components respectively + * give 25%, 25%, 50%, 50%, 25%, 25% memory for each components respectively */ if (type == FREE_NIDS) { mem_size = (nm_i->nid_cnt[FREE_NID] * @@ -85,14 +85,16 @@ bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type) sizeof(struct ino_entry); mem_size >>= PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); - } else if (type == READ_EXTENT_CACHE) { - struct extent_tree_info *eti = &sbi->extent_tree[EX_READ]; + } else if (type == READ_EXTENT_CACHE || type == AGE_EXTENT_CACHE) { + enum extent_type etype = type == READ_EXTENT_CACHE ? + EX_READ : EX_BLOCK_AGE; + struct extent_tree_info *eti = &sbi->extent_tree[etype]; mem_size = (atomic_read(&eti->total_ext_tree) * sizeof(struct extent_tree) + atomic_read(&eti->total_ext_node) * sizeof(struct extent_node)) >> PAGE_SHIFT; - res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); + res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2); } else if (type == DISCARD_CACHE) { mem_size = (atomic_read(&dcc->discard_cmd_cnt) * sizeof(struct discard_cmd)) >> PAGE_SHIFT; diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 0aa48704c77a..99454d46a939 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -147,6 +147,7 @@ enum mem_type { DIRTY_DENTS, /* indicates dirty dentry pages */ INO_ENTRIES, /* indicates inode entries */ READ_EXTENT_CACHE, /* indicates read extent cache */ + AGE_EXTENT_CACHE, /* indicates age extent cache */ DISCARD_CACHE, /* indicates memory of cached discard cmds */ COMPRESS_PAGE, /* indicates memory of cached compressed pages */ BASE_CHECK, /* check kernel status */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 8722d1a13c17..dee712f7225f 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -453,6 +453,11 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg) f2fs_shrink_read_extent_tree(sbi, READ_EXTENT_CACHE_SHRINK_NUMBER); + /* try to shrink age extent cache when there is no enough memory */ + if (!f2fs_available_free_memory(sbi, AGE_EXTENT_CACHE)) + f2fs_shrink_age_extent_tree(sbi, + AGE_EXTENT_CACHE_SHRINK_NUMBER); + /* check the # of cached NAT entries */ if (!f2fs_available_free_memory(sbi, NAT_ENTRIES)) f2fs_try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK); @@ -3151,10 +3156,28 @@ static int __get_segment_type_4(struct f2fs_io_info *fio) } } +static int __get_age_segment_type(struct inode *inode, pgoff_t pgofs) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_info ei; + + if (f2fs_lookup_age_extent_cache(inode, pgofs, &ei)) { + if (!ei.age) + return NO_CHECK_TYPE; + if (ei.age <= 
sbi->hot_data_age_threshold) + return CURSEG_HOT_DATA; + if (ei.age <= sbi->warm_data_age_threshold) + return CURSEG_WARM_DATA; + return CURSEG_COLD_DATA; + } + return NO_CHECK_TYPE; +} + static int __get_segment_type_6(struct f2fs_io_info *fio) { if (fio->type == DATA) { struct inode *inode = fio->page->mapping->host; + int type; if (is_inode_flag_set(inode, FI_ALIGNED_WRITE)) return CURSEG_COLD_DATA_PINNED; @@ -3169,6 +3192,11 @@ static int __get_segment_type_6(struct f2fs_io_info *fio) } if (file_is_cold(inode) || f2fs_need_compress_data(inode)) return CURSEG_COLD_DATA; + + type = __get_age_segment_type(inode, fio->page->index); + if (type != NO_CHECK_TYPE) + return type; + if (file_is_hot(inode) || is_inode_flag_set(inode, FI_HOT_DATA) || f2fs_is_cow_file(inode)) @@ -3287,6 +3315,9 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr)); locate_dirty_segment(sbi, GET_SEGNO(sbi, *new_blkaddr)); + if (IS_DATASEG(type)) + atomic64_inc(&sbi->allocated_data_blocks); + up_write(&sit_i->sentry_lock); if (page && IS_NODESEG(type)) { @@ -3414,6 +3445,8 @@ void f2fs_outplace_write_data(struct dnode_of_data *dn, struct f2fs_summary sum; f2fs_bug_on(sbi, dn->data_blkaddr == NULL_ADDR); + if (fio->io_type == FS_DATA_IO || fio->io_type == FS_CP_DATA_IO) + f2fs_update_age_extent_cache(dn); set_summary(&sum, dn->nid, dn->ofs_in_node, fio->version); do_write_page(&sum, fio); f2fs_update_data_blkaddr(dn, fio->new_blkaddr); diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c index 33c490e69ae3..83d6fb97dcae 100644 --- a/fs/f2fs/shrinker.c +++ b/fs/f2fs/shrinker.c @@ -59,6 +59,9 @@ unsigned long f2fs_shrink_count(struct shrinker *shrink, /* count read extent cache entries */ count += __count_extent_cache(sbi, EX_READ); + /* count block age extent cache entries */ + count += __count_extent_cache(sbi, EX_BLOCK_AGE); + /* count clean nat cache entries */ count += __count_nat_entries(sbi); @@ -102,8 +105,11 @@ unsigned long f2fs_shrink_scan(struct shrinker *shrink, sbi->shrinker_run_no = run_no; + /* shrink extent cache entries */ + freed += f2fs_shrink_age_extent_tree(sbi, nr >> 2); + /* shrink read extent cache entries */ - freed += f2fs_shrink_read_extent_tree(sbi, nr >> 1); + freed += f2fs_shrink_read_extent_tree(sbi, nr >> 2); /* shrink clean nat cache entries */ if (freed < nr) @@ -134,6 +140,8 @@ void f2fs_join_shrinker(struct f2fs_sb_info *sbi) void f2fs_leave_shrinker(struct f2fs_sb_info *sbi) { f2fs_shrink_read_extent_tree(sbi, __count_extent_cache(sbi, EX_READ)); + f2fs_shrink_age_extent_tree(sbi, + __count_extent_cache(sbi, EX_BLOCK_AGE)); spin_lock(&f2fs_list_lock); list_del_init(&sbi->s_list); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 412c2e7352c0..180d8b804d13 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -163,6 +163,7 @@ enum { Opt_nogc_merge, Opt_discard_unit, Opt_memory_mode, + Opt_age_extent_cache, Opt_err, }; @@ -241,6 +242,7 @@ static match_table_t f2fs_tokens = { {Opt_nogc_merge, "nogc_merge"}, {Opt_discard_unit, "discard_unit=%s"}, {Opt_memory_mode, "memory=%s"}, + {Opt_age_extent_cache, "age_extent_cache"}, {Opt_err, NULL}, }; @@ -1257,6 +1259,9 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) } kfree(name); break; + case Opt_age_extent_cache: + set_opt(sbi, AGE_EXTENT_CACHE); + break; default: f2fs_err(sbi, "Unrecognized mount option \"%s\" or missing value", p); @@ -1958,6 +1963,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) 
seq_puts(seq, ",extent_cache"); else seq_puts(seq, ",noextent_cache"); + if (test_opt(sbi, AGE_EXTENT_CACHE)) + seq_puts(seq, ",age_extent_cache"); if (test_opt(sbi, DATA_FLUSH)) seq_puts(seq, ",data_flush"); @@ -2219,6 +2226,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) bool need_restart_flush = false, need_stop_flush = false; bool need_restart_discard = false, need_stop_discard = false; bool no_read_extent_cache = !test_opt(sbi, READ_EXTENT_CACHE); + bool no_age_extent_cache = !test_opt(sbi, AGE_EXTENT_CACHE); bool enable_checkpoint = !test_opt(sbi, DISABLE_CHECKPOINT); bool no_io_align = !F2FS_IO_ALIGNED(sbi); bool no_atgc = !test_opt(sbi, ATGC); @@ -2313,6 +2321,12 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) f2fs_warn(sbi, "switch extent_cache option is not allowed"); goto restore_opts; } + /* disallow enable/disable age extent_cache dynamically */ + if (no_age_extent_cache == !!test_opt(sbi, AGE_EXTENT_CACHE)) { + err = -EINVAL; + f2fs_warn(sbi, "switch age_extent_cache option is not allowed"); + goto restore_opts; + } if (no_io_align == !!F2FS_IO_ALIGNED(sbi)) { err = -EINVAL; diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index a4745d596310..2ab215110596 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -668,6 +668,24 @@ out: return count; } + if (!strcmp(a->attr.name, "hot_data_age_threshold")) { + if (t == 0 || t >= sbi->warm_data_age_threshold) + return -EINVAL; + if (t == *ui) + return count; + *ui = (unsigned int)t; + return count; + } + + if (!strcmp(a->attr.name, "warm_data_age_threshold")) { + if (t == 0 || t <= sbi->hot_data_age_threshold) + return -EINVAL; + if (t == *ui) + return count; + *ui = (unsigned int)t; + return count; + } + *ui = (unsigned int)t; return count; @@ -923,6 +941,10 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, peak_atomic_write, peak_atomic_write); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, committed_atomic_block, committed_atomic_block); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, revoked_atomic_block, revoked_atomic_block); +/* For block age extent cache */ +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, hot_data_age_threshold, hot_data_age_threshold); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, warm_data_age_threshold, warm_data_age_threshold); + #define ATTR_LIST(name) (&f2fs_attr_##name.attr) static struct attribute *f2fs_attrs[] = { ATTR_LIST(gc_urgent_sleep_time), @@ -1018,6 +1040,8 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(peak_atomic_write), ATTR_LIST(committed_atomic_block), ATTR_LIST(revoked_atomic_block), + ATTR_LIST(hot_data_age_threshold), + ATTR_LIST(warm_data_age_threshold), NULL, }; ATTRIBUTE_GROUPS(f2fs); diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 2bb37892d2ba..31d994e6b4ca 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -49,6 +49,7 @@ TRACE_DEFINE_ENUM(CP_TRIMMED); TRACE_DEFINE_ENUM(CP_PAUSE); TRACE_DEFINE_ENUM(CP_RESIZE); TRACE_DEFINE_ENUM(EX_READ); +TRACE_DEFINE_ENUM(EX_BLOCK_AGE); #define show_block_type(type) \ __print_symbolic(type, \ @@ -155,6 +156,11 @@ TRACE_DEFINE_ENUM(EX_READ); { COMPRESS_ZSTD, "ZSTD" }, \ { COMPRESS_LZORLE, "LZO-RLE" }) +#define show_extent_type(type) \ + __print_symbolic(type, \ + { EX_READ, "Read" }, \ + { EX_BLOCK_AGE, "Block Age" }) + struct f2fs_sb_info; struct f2fs_io_info; struct extent_info; @@ -1544,7 +1550,7 @@ TRACE_EVENT(f2fs_lookup_extent_tree_start, TP_printk("dev = (%d,%d), ino = %lu, pgofs = %u, type = %s", show_dev_ino(__entry), __entry->pgofs, - __entry->type == EX_READ ? 
"Read" : "N/A") + show_extent_type(__entry->type)) ); TRACE_EVENT_CONDITION(f2fs_lookup_read_extent_tree_end, @@ -1583,6 +1589,45 @@ TRACE_EVENT_CONDITION(f2fs_lookup_read_extent_tree_end, __entry->blk) ); +TRACE_EVENT_CONDITION(f2fs_lookup_age_extent_tree_end, + + TP_PROTO(struct inode *inode, unsigned int pgofs, + struct extent_info *ei), + + TP_ARGS(inode, pgofs, ei), + + TP_CONDITION(ei), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(ino_t, ino) + __field(unsigned int, pgofs) + __field(unsigned int, fofs) + __field(unsigned int, len) + __field(unsigned long long, age) + __field(unsigned long long, blocks) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->pgofs = pgofs; + __entry->fofs = ei->fofs; + __entry->len = ei->len; + __entry->age = ei->age; + __entry->blocks = ei->last_blocks; + ), + + TP_printk("dev = (%d,%d), ino = %lu, pgofs = %u, " + "age_ext_info(fofs: %u, len: %u, age: %llu, blocks: %llu)", + show_dev_ino(__entry), + __entry->pgofs, + __entry->fofs, + __entry->len, + __entry->age, + __entry->blocks) +); + TRACE_EVENT(f2fs_update_read_extent_tree_range, TP_PROTO(struct inode *inode, unsigned int pgofs, unsigned int len, @@ -1618,6 +1663,41 @@ TRACE_EVENT(f2fs_update_read_extent_tree_range, __entry->c_len) ); +TRACE_EVENT(f2fs_update_age_extent_tree_range, + + TP_PROTO(struct inode *inode, unsigned int pgofs, unsigned int len, + unsigned long long age, + unsigned long long last_blks), + + TP_ARGS(inode, pgofs, len, age, last_blks), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(ino_t, ino) + __field(unsigned int, pgofs) + __field(unsigned int, len) + __field(unsigned long long, age) + __field(unsigned long long, blocks) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->pgofs = pgofs; + __entry->len = len; + __entry->age = age; + __entry->blocks = last_blks; + ), + + TP_printk("dev = (%d,%d), ino = %lu, pgofs = %u, " + "len = %u, age = %llu, blocks = %llu", + show_dev_ino(__entry), + __entry->pgofs, + __entry->len, + __entry->age, + __entry->blocks) +); + TRACE_EVENT(f2fs_shrink_extent_tree, TP_PROTO(struct f2fs_sb_info *sbi, unsigned int node_cnt, @@ -1643,7 +1723,7 @@ TRACE_EVENT(f2fs_shrink_extent_tree, show_dev(__entry->dev), __entry->node_cnt, __entry->tree_cnt, - __entry->type == EX_READ ? "Read" : "N/A") + show_extent_type(__entry->type)) ); TRACE_EVENT(f2fs_destroy_extent_tree, @@ -1670,7 +1750,7 @@ TRACE_EVENT(f2fs_destroy_extent_tree, TP_printk("dev = (%d,%d), ino = %lu, destroyed: node_cnt = %u, type = %s", show_dev_ino(__entry), __entry->node_cnt, - __entry->type == EX_READ ? "Read" : "N/A") + show_extent_type(__entry->type)) ); DECLARE_EVENT_CLASS(f2fs_sync_dirty_inodes, From ae89758cb880a531cdb886b04072b32e5aeede06 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 7 Dec 2022 13:42:17 +0000 Subject: [PATCH 330/457] f2fs: Fix spelling mistake in label: free_bio_enrty_cache -> free_bio_entry_cache There is a spelling mistake in a label name. Fix it. 
Signed-off-by: Colin Ian King Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 180d8b804d13..c02a717cf880 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -4723,7 +4723,7 @@ static int __init init_f2fs_fs(void) goto free_iostat; err = f2fs_init_bioset(); if (err) - goto free_bio_enrty_cache; + goto free_bio_entry_cache; err = f2fs_init_compress_mempool(); if (err) goto free_bioset; @@ -4740,7 +4740,7 @@ free_compress_mempool: f2fs_destroy_compress_mempool(); free_bioset: f2fs_destroy_bioset(); -free_bio_enrty_cache: +free_bio_entry_cache: f2fs_destroy_bio_entry_cache(); free_iostat: f2fs_destroy_iostat_processing(); From 9a7d434257f41d056d1c56766f7a1e10234b7a3d Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Mon, 5 Dec 2022 22:56:03 +0800 Subject: [PATCH 331/457] f2fs: fix iostat parameter for discard Just like other data we count uses the number of bytes as the basic unit, but discard uses the number of cmds as the statistical unit. In fact the discard command contains the number of blocks, so let's change to the number of bytes as the base unit. Fixes: b0af6d491a6b ("f2fs: add app/fs io stat") Signed-off-by: Yangtao Li Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index dee712f7225f..f1845a032885 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1187,7 +1187,7 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi, atomic_inc(&dcc->issued_discard); - f2fs_update_iostat(sbi, NULL, FS_DISCARD, 1); + f2fs_update_iostat(sbi, NULL, FS_DISCARD, len * F2FS_BLKSIZE); lstart += len; start += len; From 9d44110b4c4decd062548bc9bbad7ccadace16d7 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Fri, 2 Dec 2022 12:58:41 +0800 Subject: [PATCH 332/457] f2fs: don't call f2fs_issue_discard_timeout() when discard_cmd_cnt is 0 in f2fs_put_super() No need to call f2fs_issue_discard_timeout() in f2fs_put_super, when no discard command requires issue. Since the caller of f2fs_issue_discard_timeout() usually judges the number of discard commands before using it. Let's move this logic to f2fs_issue_discard_timeout(). By the way, use f2fs_realtime_discard_enable to simplify the code. Reported-by: kernel test robot Signed-off-by: Yangtao Li Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 6 ++++-- fs/f2fs/super.c | 8 ++------ 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index f1845a032885..a9099a754dd2 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1661,6 +1661,9 @@ bool f2fs_issue_discard_timeout(struct f2fs_sb_info *sbi) struct discard_policy dpolicy; bool dropped; + if (!atomic_read(&dcc->discard_cmd_cnt)) + return false; + __init_discard_policy(sbi, &dpolicy, DPOLICY_UMOUNT, dcc->discard_granularity); __issue_discard_cmd(sbi, &dpolicy); @@ -2116,8 +2119,7 @@ static void destroy_discard_cmd_control(struct f2fs_sb_info *sbi) * Recovery can cache discard commands, so in error path of * fill_super(), it needs to give a chance to handle them. 
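+	 * (f2fs_issue_discard_timeout() now returns early when no discard
+	 *  commands are queued, so it is safe to call it unconditionally here.)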
*/ - if (unlikely(atomic_read(&dcc->discard_cmd_cnt))) - f2fs_issue_discard_timeout(sbi); + f2fs_issue_discard_timeout(sbi); kfree(dcc); SM_I(sbi)->dcc_info = NULL; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index c02a717cf880..1f812b9ce985 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1581,8 +1581,7 @@ static void f2fs_put_super(struct super_block *sb) /* be sure to wait for any on-going discard commands */ dropped = f2fs_issue_discard_timeout(sbi); - if ((f2fs_hw_support_discard(sbi) || f2fs_hw_should_discard(sbi)) && - !sbi->discard_blks && !dropped) { + if (f2fs_realtime_discard_enable(sbi) && !sbi->discard_blks && !dropped) { struct cp_control cpc = { .reason = CP_UMOUNT | CP_TRIMMED, }; @@ -2233,7 +2232,6 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) bool no_discard = !test_opt(sbi, DISCARD); bool no_compress_cache = !test_opt(sbi, COMPRESS_CACHE); bool block_unit_discard = f2fs_block_unit_discard(sbi); - struct discard_cmd_control *dcc; #ifdef CONFIG_QUOTA int i, j; #endif @@ -2420,10 +2418,8 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) goto restore_flush; need_stop_discard = true; } else { - dcc = SM_I(sbi)->dcc_info; f2fs_stop_discard_thread(sbi); - if (atomic_read(&dcc->discard_cmd_cnt)) - f2fs_issue_discard_timeout(sbi); + f2fs_issue_discard_timeout(sbi); need_restart_discard = true; } } From 913051f69f1d8538e3f15a827ca2a60e1952d447 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Tue, 29 Nov 2022 12:15:23 +0800 Subject: [PATCH 333/457] f2fs: fix some format WARNING in debug.c and sysfs.c To fix: WARNING: function definition argument 'struct f2fs_attr *' should also have an identifier name + ssize_t (*show)(struct f2fs_attr *, struct f2fs_sb_info *, char *); WARNING: return sysfs_emit(...) 
formats should include a terminating newline + return sysfs_emit(buf, "(none)"); WARNING: Prefer 'unsigned int' to bare use of 'unsigned' + unsigned npages = NODE_MAPPING(sbi)->nrpages; WARNING: Missing a blank line after declarations + unsigned npages = COMPRESS_MAPPING(sbi)->nrpages; + si->page_mem += (unsigned long long)npages << PAGE_SHIFT; WARNING: quoted string split across lines + seq_printf(s, "CP merge (Queued: %4d, Issued: %4d, Total: %4d, " + "Cur time: %4d(ms), Peak time: %4d(ms))\n", Signed-off-by: Yangtao Li Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 45 +++++++++++++++++++++++---------------------- fs/f2fs/sysfs.c | 10 +++++----- 2 files changed, 28 insertions(+), 27 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 8f1ef742551f..32af4f0c5735 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -318,18 +318,19 @@ get_cache: si->page_mem = 0; if (sbi->node_inode) { - unsigned npages = NODE_MAPPING(sbi)->nrpages; + unsigned long npages = NODE_MAPPING(sbi)->nrpages; si->page_mem += (unsigned long long)npages << PAGE_SHIFT; } if (sbi->meta_inode) { - unsigned npages = META_MAPPING(sbi)->nrpages; + unsigned long npages = META_MAPPING(sbi)->nrpages; si->page_mem += (unsigned long long)npages << PAGE_SHIFT; } #ifdef CONFIG_F2FS_FS_COMPRESSION if (sbi->compress_inode) { - unsigned npages = COMPRESS_MAPPING(sbi)->nrpages; + unsigned long npages = COMPRESS_MAPPING(sbi)->nrpages; + si->page_mem += (unsigned long long)npages << PAGE_SHIFT; } #endif @@ -477,28 +478,28 @@ static int stat_show(struct seq_file *s, void *v) si->meta_count[META_NAT]); seq_printf(s, " - ssa blocks : %u\n", si->meta_count[META_SSA]); - seq_printf(s, "CP merge (Queued: %4d, Issued: %4d, Total: %4d, " - "Cur time: %4d(ms), Peak time: %4d(ms))\n", - si->nr_queued_ckpt, si->nr_issued_ckpt, - si->nr_total_ckpt, si->cur_ckpt_time, - si->peak_ckpt_time); + seq_puts(s, "CP merge:\n"); + seq_printf(s, " - Queued : %4d\n", si->nr_queued_ckpt); + seq_printf(s, " - Issued : %4d\n", si->nr_issued_ckpt); + seq_printf(s, " - Total : %4d\n", si->nr_total_ckpt); + seq_printf(s, " - Cur time : %4d(ms)\n", si->cur_ckpt_time); + seq_printf(s, " - Peak time : %4d(ms)\n", si->peak_ckpt_time); seq_printf(s, "GC calls: %d (BG: %d)\n", si->call_count, si->bg_gc); seq_printf(s, " - data segments : %d (%d)\n", si->data_segs, si->bg_data_segs); seq_printf(s, " - node segments : %d (%d)\n", si->node_segs, si->bg_node_segs); - seq_printf(s, " - Reclaimed segs : Normal (%d), Idle CB (%d), " - "Idle Greedy (%d), Idle AT (%d), " - "Urgent High (%d), Urgent Mid (%d), " - "Urgent Low (%d)\n", - si->sbi->gc_reclaimed_segs[GC_NORMAL], - si->sbi->gc_reclaimed_segs[GC_IDLE_CB], - si->sbi->gc_reclaimed_segs[GC_IDLE_GREEDY], - si->sbi->gc_reclaimed_segs[GC_IDLE_AT], - si->sbi->gc_reclaimed_segs[GC_URGENT_HIGH], - si->sbi->gc_reclaimed_segs[GC_URGENT_MID], - si->sbi->gc_reclaimed_segs[GC_URGENT_LOW]); + seq_puts(s, " - Reclaimed segs :\n"); + seq_printf(s, " - Normal : %d\n", si->sbi->gc_reclaimed_segs[GC_NORMAL]); + seq_printf(s, " - Idle CB : %d\n", si->sbi->gc_reclaimed_segs[GC_IDLE_CB]); + seq_printf(s, " - Idle Greedy : %d\n", + si->sbi->gc_reclaimed_segs[GC_IDLE_GREEDY]); + seq_printf(s, " - Idle AT : %d\n", si->sbi->gc_reclaimed_segs[GC_IDLE_AT]); + seq_printf(s, " - Urgent High : %d\n", + si->sbi->gc_reclaimed_segs[GC_URGENT_HIGH]); + seq_printf(s, " - Urgent Mid : %d\n", si->sbi->gc_reclaimed_segs[GC_URGENT_MID]); + seq_printf(s, " - Urgent Low : %d\n", si->sbi->gc_reclaimed_segs[GC_URGENT_LOW]); seq_printf(s, "Try to 
move %d blocks (BG: %d)\n", si->tot_blks, si->bg_data_blks + si->bg_node_blks); seq_printf(s, " - data blocks : %d (%d)\n", si->data_blks, @@ -540,11 +541,11 @@ static int stat_show(struct seq_file *s, void *v) si->nr_dio_read, si->nr_dio_write); seq_printf(s, " - IO_R (Data: %4d, Node: %4d, Meta: %4d\n", si->nr_rd_data, si->nr_rd_node, si->nr_rd_meta); - seq_printf(s, " - IO_W (CP: %4d, Data: %4d, Flush: (%4d %4d %4d), " - "Discard: (%4d %4d)) cmd: %4d undiscard:%4u\n", + seq_printf(s, " - IO_W (CP: %4d, Data: %4d, Flush: (%4d %4d %4d), ", si->nr_wb_cp_data, si->nr_wb_data, si->nr_flushing, si->nr_flushed, - si->flush_list_empty, + si->flush_list_empty); + seq_printf(s, "Discard: (%4d %4d)) cmd: %4d undiscard:%4u\n", si->nr_discarding, si->nr_discarded, si->nr_discard_cmd, si->undiscard_blks); seq_printf(s, " - atomic IO: %4d (Max. %4d)\n", diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 2ab215110596..83a366f3ee80 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -53,9 +53,9 @@ static const char *gc_mode_names[MAX_GC_MODE] = { struct f2fs_attr { struct attribute attr; - ssize_t (*show)(struct f2fs_attr *, struct f2fs_sb_info *, char *); - ssize_t (*store)(struct f2fs_attr *, struct f2fs_sb_info *, - const char *, size_t); + ssize_t (*show)(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf); + ssize_t (*store)(struct f2fs_attr *a, struct f2fs_sb_info *sbi, + const char *buf, size_t len); int struct_type; int offset; int id; @@ -232,13 +232,13 @@ static ssize_t encoding_show(struct f2fs_attr *a, (sb->s_encoding->version >> 8) & 0xff, sb->s_encoding->version & 0xff); #endif - return sysfs_emit(buf, "(none)"); + return sysfs_emit(buf, "(none)\n"); } static ssize_t mounted_time_sec_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - return sysfs_emit(buf, "%llu", SIT_I(sbi)->mounted_time); + return sysfs_emit(buf, "%llu\n", SIT_I(sbi)->mounted_time); } #ifdef CONFIG_F2FS_STAT_FS From be14cf2267e6ec5ffec5ad974a469461e7de4894 Mon Sep 17 00:00:00 2001 From: Yuwei Guan Date: Sun, 11 Dec 2022 21:08:41 +0800 Subject: [PATCH 334/457] f2fs: reset wait_ms to default if any of the victims have been selected In non-foreground gc mode, if no victim is selected, the gc process will wait for no_gc_sleep_time before waking up again. In this subsequent time, even though a victim will be selected, the gc process still waits for no_gc_sleep_time before waking up. The configuration of wait_ms is not reasonable. After any of the victims have been selected, we need to reset wait_ms to default sleep time from no_gc_sleep_time. Signed-off-by: Yuwei Guan Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index f0c6506d8975..d7a9d84ba57c 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -141,6 +141,10 @@ do_gc: /* don't bother wait_ms by foreground gc */ if (!foreground) wait_ms = gc_th->no_gc_sleep_time; + } else { + /* reset wait_ms to default sleep time */ + if (wait_ms == gc_th->no_gc_sleep_time) + wait_ms = gc_th->min_sleep_time; } if (foreground) From 792217ddb38ded7a2d45407f87e8c46ce5660bca Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 28 Nov 2022 23:04:01 -0800 Subject: [PATCH 335/457] fsverity: stop using PG_error to track error status As a step towards freeing the PG_error flag for other uses, change ext4 and f2fs to stop using PG_error to track verity errors. Instead, if a verity error occurs, just mark the whole bio as failed. 
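In code terms, the read-path pattern changes roughly as follows (an
illustrative sketch only, not verbatim from either filesystem; the real
hunks are in the diff below):

	/* before: record the verity failure on the individual page */
	if (!fsverity_verify_page(page))
		SetPageError(page);	/* the read completion path checked PageError() */

	/* after: fail the whole read bio instead */
	if (!fsverity_verify_page(page)) {
		bio->bi_status = BLK_STS_IOERR;
		break;
	}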
The coarser granularity isn't really a problem since it isn't any worse than what the block layer provides, and errors from a multi-page readahead aren't reported to applications unless a single-page read fails too. f2fs supports compression, which makes the f2fs changes a bit more complicated than desired, but the basic premise still works. Note: there are still a few uses of PageError in f2fs, but they are on the write path, so they are unrelated and this patch doesn't touch them. Reviewed-by: Chao Yu Acked-by: Jaegeuk Kim Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20221129070401.156114-1-ebiggers@kernel.org --- fs/ext4/readpage.c | 8 ++---- fs/f2fs/compress.c | 68 ++++++++++++++++++++++------------------------ fs/f2fs/data.c | 53 ++++++++++++++++++++++-------------- fs/verity/verify.c | 12 ++++---- 4 files changed, 74 insertions(+), 67 deletions(-) diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index 3d21eae267fc..e604ea4e102b 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -75,14 +75,10 @@ static void __read_end_io(struct bio *bio) bio_for_each_segment_all(bv, bio, iter_all) { page = bv->bv_page; - /* PG_error was set if verity failed. */ - if (bio->bi_status || PageError(page)) { + if (bio->bi_status) ClearPageUptodate(page); - /* will re-read again later */ - ClearPageError(page); - } else { + else SetPageUptodate(page); - } unlock_page(page); } if (bio->bi_private) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 9723f0bed923..2532f369cb10 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1708,50 +1708,27 @@ static void f2fs_put_dic(struct decompress_io_ctx *dic, bool in_task) } } -/* - * Update and unlock the cluster's pagecache pages, and release the reference to - * the decompress_io_ctx that was being held for I/O completion. - */ -static void __f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed, - bool in_task) -{ - int i; - - for (i = 0; i < dic->cluster_size; i++) { - struct page *rpage = dic->rpages[i]; - - if (!rpage) - continue; - - /* PG_error was set if verity failed. */ - if (failed || PageError(rpage)) { - ClearPageUptodate(rpage); - /* will re-read again later */ - ClearPageError(rpage); - } else { - SetPageUptodate(rpage); - } - unlock_page(rpage); - } - - f2fs_put_dic(dic, in_task); -} - static void f2fs_verify_cluster(struct work_struct *work) { struct decompress_io_ctx *dic = container_of(work, struct decompress_io_ctx, verity_work); int i; - /* Verify the cluster's decompressed pages with fs-verity. */ + /* Verify, update, and unlock the decompressed pages. 
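+	 * (Verification results are recorded via each page's Uptodate flag
+	 *  rather than PG_error.)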
*/ for (i = 0; i < dic->cluster_size; i++) { struct page *rpage = dic->rpages[i]; - if (rpage && !fsverity_verify_page(rpage)) - SetPageError(rpage); + if (!rpage) + continue; + + if (fsverity_verify_page(rpage)) + SetPageUptodate(rpage); + else + ClearPageUptodate(rpage); + unlock_page(rpage); } - __f2fs_decompress_end_io(dic, false, true); + f2fs_put_dic(dic, true); } /* @@ -1761,6 +1738,8 @@ static void f2fs_verify_cluster(struct work_struct *work) void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed, bool in_task) { + int i; + if (!failed && dic->need_verity) { /* * Note that to avoid deadlocks, the verity work can't be done @@ -1770,9 +1749,28 @@ void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed, */ INIT_WORK(&dic->verity_work, f2fs_verify_cluster); fsverity_enqueue_verify_work(&dic->verity_work); - } else { - __f2fs_decompress_end_io(dic, failed, in_task); + return; } + + /* Update and unlock the cluster's pagecache pages. */ + for (i = 0; i < dic->cluster_size; i++) { + struct page *rpage = dic->rpages[i]; + + if (!rpage) + continue; + + if (failed) + ClearPageUptodate(rpage); + else + SetPageUptodate(rpage); + unlock_page(rpage); + } + + /* + * Release the reference to the decompress_io_ctx that was being held + * for I/O completion. + */ + f2fs_put_dic(dic, in_task); } /* diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 75abd450730b..6e43e19c7d1c 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -114,43 +114,56 @@ struct bio_post_read_ctx { struct f2fs_sb_info *sbi; struct work_struct work; unsigned int enabled_steps; + /* + * decompression_attempted keeps track of whether + * f2fs_end_read_compressed_page() has been called on the pages in the + * bio that belong to a compressed cluster yet. + */ + bool decompression_attempted; block_t fs_blkaddr; }; +/* + * Update and unlock a bio's pages, and free the bio. + * + * This marks pages up-to-date only if there was no error in the bio (I/O error, + * decryption error, or verity error), as indicated by bio->bi_status. + * + * "Compressed pages" (pagecache pages backed by a compressed cluster on-disk) + * aren't marked up-to-date here, as decompression is done on a per-compression- + * cluster basis rather than a per-bio basis. Instead, we only must do two + * things for each compressed page here: call f2fs_end_read_compressed_page() + * with failed=true if an error occurred before it would have normally gotten + * called (i.e., I/O error or decryption error, but *not* verity error), and + * release the bio's reference to the decompress_io_ctx of the page's cluster. + */ static void f2fs_finish_read_bio(struct bio *bio, bool in_task) { struct bio_vec *bv; struct bvec_iter_all iter_all; + struct bio_post_read_ctx *ctx = bio->bi_private; - /* - * Update and unlock the bio's pagecache pages, and put the - * decompression context for any compressed pages. - */ bio_for_each_segment_all(bv, bio, iter_all) { struct page *page = bv->bv_page; if (f2fs_is_compressed_page(page)) { - if (bio->bi_status) + if (ctx && !ctx->decompression_attempted) f2fs_end_read_compressed_page(page, true, 0, in_task); f2fs_put_page_dic(page, in_task); continue; } - /* PG_error was set if verity failed. 
*/ - if (bio->bi_status || PageError(page)) { + if (bio->bi_status) ClearPageUptodate(page); - /* will re-read again later */ - ClearPageError(page); - } else { + else SetPageUptodate(page); - } dec_page_count(F2FS_P_SB(page), __read_io_type(page)); unlock_page(page); } - if (bio->bi_private) - mempool_free(bio->bi_private, bio_post_read_ctx_pool); + if (ctx) + mempool_free(ctx, bio_post_read_ctx_pool); bio_put(bio); } @@ -183,8 +196,10 @@ static void f2fs_verify_bio(struct work_struct *work) struct page *page = bv->bv_page; if (!f2fs_is_compressed_page(page) && - !fsverity_verify_page(page)) - SetPageError(page); + !fsverity_verify_page(page)) { + bio->bi_status = BLK_STS_IOERR; + break; + } } } else { fsverity_verify_bio(bio); @@ -243,6 +258,8 @@ static void f2fs_handle_step_decompress(struct bio_post_read_ctx *ctx, blkaddr++; } + ctx->decompression_attempted = true; + /* * Optimization: if all the bio's pages are compressed, then scheduling * the per-bio verity work is unnecessary, as verity will be fully @@ -1060,6 +1077,7 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, ctx->sbi = sbi; ctx->enabled_steps = post_read_steps; ctx->fs_blkaddr = blkaddr; + ctx->decompression_attempted = false; bio->bi_private = ctx; } iostat_alloc_and_bind_ctx(sbi, bio, ctx); @@ -1087,7 +1105,6 @@ static int f2fs_submit_page_read(struct inode *inode, struct page *page, bio_put(bio); return -EFAULT; } - ClearPageError(page); inc_page_count(sbi, F2FS_RD_DATA); f2fs_update_iostat(sbi, NULL, FS_DATA_READ_IO, F2FS_BLKSIZE); __submit_bio(sbi, bio, DATA); @@ -2146,7 +2163,6 @@ submit_and_realloc: inc_page_count(F2FS_I_SB(inode), F2FS_RD_DATA); f2fs_update_iostat(F2FS_I_SB(inode), NULL, FS_DATA_READ_IO, F2FS_BLKSIZE); - ClearPageError(page); *last_block_in_bio = block_nr; goto out; out: @@ -2294,7 +2310,6 @@ submit_and_realloc: inc_page_count(sbi, F2FS_RD_DATA); f2fs_update_iostat(sbi, inode, FS_DATA_READ_IO, F2FS_BLKSIZE); - ClearPageError(page); *last_block_in_bio = blkaddr; } @@ -2311,7 +2326,6 @@ out: for (i = 0; i < cc->cluster_size; i++) { if (cc->rpages[i]) { ClearPageUptodate(cc->rpages[i]); - ClearPageError(cc->rpages[i]); unlock_page(cc->rpages[i]); } } @@ -2408,7 +2422,6 @@ read_single_page: #ifdef CONFIG_F2FS_FS_COMPRESSION set_error_page: #endif - SetPageError(page); zero_user_segment(page, 0, PAGE_SIZE); unlock_page(page); } diff --git a/fs/verity/verify.c b/fs/verity/verify.c index bde8c9b7d25f..961ba248021f 100644 --- a/fs/verity/verify.c +++ b/fs/verity/verify.c @@ -200,9 +200,8 @@ EXPORT_SYMBOL_GPL(fsverity_verify_page); * @bio: the bio to verify * * Verify a set of pages that have just been read from a verity file. The pages - * must be pagecache pages that are still locked and not yet uptodate. Pages - * that fail verification are set to the Error state. Verification is skipped - * for pages already in the Error state, e.g. due to fscrypt decryption failure. + * must be pagecache pages that are still locked and not yet uptodate. If a + * page fails verification, then bio->bi_status is set to an error status. * * This is a helper function for use by the ->readahead() method of filesystems * that issue bios to read data directly into the page cache. 
Filesystems that @@ -244,9 +243,10 @@ void fsverity_verify_bio(struct bio *bio) unsigned long level0_ra_pages = min(max_ra_pages, params->level0_blocks - level0_index); - if (!PageError(page) && - !verify_page(inode, vi, req, page, level0_ra_pages)) - SetPageError(page); + if (!verify_page(inode, vi, req, page, level0_ra_pages)) { + bio->bi_status = BLK_STS_IOERR; + break; + } } fsverity_free_hash_request(params->hash_alg, req); From c969b3e7fcade7ee15b376f5d6e7e6f68ca8a517 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 28 Nov 2022 20:51:39 -0800 Subject: [PATCH 336/457] fsverity: simplify fsverity_get_digest() Instead of looking up the algorithm by name in hash_algo_name[] to get its hash_algo ID, just store the hash_algo ID in the fsverity_hash_alg struct. Verify at boot time that every fsverity_hash_alg has a valid hash_algo ID with matching digest size. Remove an unnecessary memset() of the whole digest array to 0 before the digest is copied into it. Finally, remove the pr_debug statement. There is already a pr_debug for the fsverity digest when the file is opened. Signed-off-by: Eric Biggers Reviewed-by: Mimi Zohar Link: https://lore.kernel.org/r/20221129045139.69803-1-ebiggers@kernel.org --- fs/verity/fsverity_private.h | 5 +++++ fs/verity/hash_algs.c | 6 ++++++ fs/verity/measure.c | 19 ++----------------- 3 files changed, 13 insertions(+), 17 deletions(-) diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h index dbe1ce5b450a..c7fcb855e068 100644 --- a/fs/verity/fsverity_private.h +++ b/fs/verity/fsverity_private.h @@ -32,6 +32,11 @@ struct fsverity_hash_alg { unsigned int digest_size; /* digest size in bytes, e.g. 32 for SHA-256 */ unsigned int block_size; /* block size in bytes, e.g. 64 for SHA-256 */ mempool_t req_pool; /* mempool with a preallocated hash request */ + /* + * The HASH_ALGO_* constant for this algorithm. This is different from + * FS_VERITY_HASH_ALG_*, which uses a different numbering scheme. + */ + enum hash_algo algo_id; }; /* Merkle tree parameters: hash algorithm, initial hash state, and topology */ diff --git a/fs/verity/hash_algs.c b/fs/verity/hash_algs.c index 71d0fccb6d4c..6f8170cf4ae7 100644 --- a/fs/verity/hash_algs.c +++ b/fs/verity/hash_algs.c @@ -16,11 +16,13 @@ struct fsverity_hash_alg fsverity_hash_algs[] = { .name = "sha256", .digest_size = SHA256_DIGEST_SIZE, .block_size = SHA256_BLOCK_SIZE, + .algo_id = HASH_ALGO_SHA256, }, [FS_VERITY_HASH_ALG_SHA512] = { .name = "sha512", .digest_size = SHA512_DIGEST_SIZE, .block_size = SHA512_BLOCK_SIZE, + .algo_id = HASH_ALGO_SHA512, }, }; @@ -324,5 +326,9 @@ void __init fsverity_check_hash_algs(void) */ BUG_ON(!is_power_of_2(alg->digest_size)); BUG_ON(!is_power_of_2(alg->block_size)); + + /* Verify that there is a valid mapping to HASH_ALGO_*. */ + BUG_ON(alg->algo_id == 0); + BUG_ON(alg->digest_size != hash_digest_size[alg->algo_id]); } } diff --git a/fs/verity/measure.c b/fs/verity/measure.c index e99c00350c28..5c79ea1b2468 100644 --- a/fs/verity/measure.c +++ b/fs/verity/measure.c @@ -65,8 +65,7 @@ EXPORT_SYMBOL_GPL(fsverity_ioctl_measure); * @alg: (out) pointer to the hash algorithm enumeration * * Return the file hash algorithm and digest of an fsverity protected file. - * Assumption: before calling fsverity_get_digest(), the file must have been - * opened. + * Assumption: before calling this, the file must have been opened. 
* * Return: 0 on success, -errno on failure */ @@ -76,27 +75,13 @@ int fsverity_get_digest(struct inode *inode, { const struct fsverity_info *vi; const struct fsverity_hash_alg *hash_alg; - int i; vi = fsverity_get_info(inode); if (!vi) return -ENODATA; /* not a verity file */ hash_alg = vi->tree_params.hash_alg; - memset(digest, 0, FS_VERITY_MAX_DIGEST_SIZE); - - /* convert the verity hash algorithm name to a hash_algo_name enum */ - i = match_string(hash_algo_name, HASH_ALGO__LAST, hash_alg->name); - if (i < 0) - return -EINVAL; - *alg = i; - - if (WARN_ON_ONCE(hash_alg->digest_size != hash_digest_size[*alg])) - return -EINVAL; memcpy(digest, vi->file_digest, hash_alg->digest_size); - - pr_debug("file digest %s:%*phN\n", hash_algo_name[*alg], - hash_digest_size[*alg], digest); - + *alg = hash_alg->algo_id; return 0; } From b6d0e23df0ad9f76757f94637f9d66a02c9bd07e Mon Sep 17 00:00:00 2001 From: Sai Harshini Nimmala Date: Thu, 27 Oct 2022 23:08:10 -0700 Subject: [PATCH 337/457] ANDROID: cpuset: Make cpusets restore on hotplug This deliberately changes the behavior of the per-cpuset cpus file to not be effected by hotplug. When a cpu is offlined, it will be removed from the cpuset/cpus file. When a cpu is onlined, if the cpuset originally requested that cpu be a part of the cpuset, that cpu will be restored to the cpuset. The cpus files still have to be hierachical, but the ranges no longer have to be out of the currently online cpus, just the physically present cpus. This reverts commit 3fc3fe757fde ("Revert "ANDROID: cpuset: Make cpusets restore on hotplug""). Reverting the revert effectively bringing back the original change. Bug: 174125747 Bug: 120444281 Signed-off-by: Dmitry Shmidt [AmitP: Refactored original changes to align with upstream commit 201af4c0fab0 ("cgroup: move cgroup files under kernel/cgroup/")] Signed-off-by: Amit Pundir [satyap@codeaurora.org: port to android-mainline kernel] Signed-off-by: Satya Durga Srinivasu Prabhala [SaiHarshiniN: Resolved merge conflict post 6.1-rc1] Signed-off-by: Sai Harshini Nimmala Change-Id: I588f6172c15b48ecadb85f161dae948ce9aeca93 --- kernel/cgroup/cpuset.c | 39 ++++++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 13 deletions(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 08cff7ac6b1c..de18cb9e4f9e 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -137,6 +137,7 @@ struct cpuset { /* user-configured CPUs and Memory Nodes allow to tasks */ cpumask_var_t cpus_allowed; + cpumask_var_t cpus_requested; nodemask_t mems_allowed; /* effective CPUs and Memory Nodes allow to tasks */ @@ -565,7 +566,7 @@ static void cpuset_update_task_spread_flag(struct cpuset *cs, static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) { - return cpumask_subset(p->cpus_allowed, q->cpus_allowed) && + return cpumask_subset(p->cpus_requested, q->cpus_requested) && nodes_subset(p->mems_allowed, q->mems_allowed) && is_cpu_exclusive(p) <= is_cpu_exclusive(q) && is_mem_exclusive(p) <= is_mem_exclusive(q); @@ -602,8 +603,13 @@ static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp) if (!zalloc_cpumask_var(pmask3, GFP_KERNEL)) goto free_two; + if (cs && !zalloc_cpumask_var(&cs->cpus_requested, GFP_KERNEL)) + goto free_three; + return 0; +free_three: + free_cpumask_var(*pmask3); free_two: free_cpumask_var(*pmask2); free_one: @@ -620,6 +626,7 @@ static inline void free_cpumasks(struct cpuset *cs, struct tmpmasks *tmp) { if (cs) { free_cpumask_var(cs->cpus_allowed); + 
free_cpumask_var(cs->cpus_requested); free_cpumask_var(cs->effective_cpus); free_cpumask_var(cs->subparts_cpus); } @@ -648,6 +655,7 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs) } cpumask_copy(trial->cpus_allowed, cs->cpus_allowed); + cpumask_copy(trial->cpus_requested, cs->cpus_requested); cpumask_copy(trial->effective_cpus, cs->effective_cpus); return trial; } @@ -762,7 +770,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) cpuset_for_each_child(c, css, par) { if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && c != cur && - cpumask_intersects(trial->cpus_allowed, c->cpus_allowed)) + cpumask_intersects(trial->cpus_requested, c->cpus_requested)) goto out; if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) && c != cur && @@ -1232,10 +1240,10 @@ static void compute_effective_cpumask(struct cpumask *new_cpus, if (parent->nr_subparts_cpus) { cpumask_or(new_cpus, parent->effective_cpus, parent->subparts_cpus); - cpumask_and(new_cpus, new_cpus, cs->cpus_allowed); + cpumask_and(new_cpus, new_cpus, cs->cpus_requested); cpumask_and(new_cpus, new_cpus, cpu_active_mask); } else { - cpumask_and(new_cpus, cs->cpus_allowed, parent->effective_cpus); + cpumask_and(new_cpus, cs->cpus_requested, parent_cs(cs)->effective_cpus); } } @@ -1727,25 +1735,26 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, return -EACCES; /* - * An empty cpus_allowed is ok only if the cpuset has no tasks. + * An empty cpus_requested is ok only if the cpuset has no tasks. * Since cpulist_parse() fails on an empty mask, we special case * that parsing. The validate_change() call ensures that cpusets * with tasks have cpus. */ if (!*buf) { - cpumask_clear(trialcs->cpus_allowed); + cpumask_clear(trialcs->cpus_requested); } else { - retval = cpulist_parse(buf, trialcs->cpus_allowed); + retval = cpulist_parse(buf, trialcs->cpus_requested); if (retval < 0) return retval; - - if (!cpumask_subset(trialcs->cpus_allowed, - top_cpuset.cpus_allowed)) - return -EINVAL; } + if (!cpumask_subset(trialcs->cpus_requested, cpu_present_mask)) + return -EINVAL; + + cpumask_and(trialcs->cpus_allowed, trialcs->cpus_requested, cpu_active_mask); + /* Nothing to do if the cpus didn't change */ - if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed)) + if (cpumask_equal(cs->cpus_requested, trialcs->cpus_requested)) return 0; #ifdef CONFIG_CPUMASK_OFFSTACK @@ -1800,6 +1809,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, parent_cs(cs)); spin_lock_irq(&callback_lock); cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); + cpumask_copy(cs->cpus_requested, trialcs->cpus_requested); /* * Make sure that subparts_cpus, if not empty, is a subset of @@ -2741,7 +2751,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v) switch (type) { case FILE_CPULIST: - seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_allowed)); + seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_requested)); break; case FILE_MEMLIST: seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->mems_allowed)); @@ -3130,6 +3140,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) cs->mems_allowed = parent->mems_allowed; cs->effective_mems = parent->mems_allowed; cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); + cpumask_copy(cs->cpus_requested, parent->cpus_requested); cpumask_copy(cs->effective_cpus, parent->cpus_allowed); spin_unlock_irq(&callback_lock); out_unlock: @@ -3246,8 +3257,10 @@ int __init cpuset_init(void) BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, 
GFP_KERNEL)); BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL)); BUG_ON(!zalloc_cpumask_var(&top_cpuset.subparts_cpus, GFP_KERNEL)); + BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_requested, GFP_KERNEL)); cpumask_setall(top_cpuset.cpus_allowed); + cpumask_setall(top_cpuset.cpus_requested); nodes_setall(top_cpuset.mems_allowed); cpumask_setall(top_cpuset.effective_cpus); nodes_setall(top_cpuset.effective_mems); From 8ecd88d9d38f849d5aac59596ebc7a8679405f5b Mon Sep 17 00:00:00 2001 From: Sai Harshini Nimmala Date: Thu, 27 Oct 2022 12:52:45 -0700 Subject: [PATCH 338/457] ANDROID: sched/cpuset: Add vendor hook to change tasks affinity Vendors might want to change tasks affinity settings when they are moving from one cpuset into the other. Add vendor hook to give control to vendor to implement what they need. This feature is necessary to control hotplug behaviour within Qualcomm's proprietary load tracking scheme, WALT. This reverts commit 034ddf86f70f ("Revert "ANDROID: sched/cpuset: Add vendor hook to change tasks affinity"") to effectively bring back the original change. Bug: 174125747 Signed-off-by: Satya Durga Srinivasu Prabhala Signed-off-by: Sai Harshini Nimmala Change-Id: Id4e9c3e47e3b4e041804bdf10cbd9e36179bc172 --- include/trace/hooks/sched.h | 5 +++++ kernel/cgroup/cpuset.c | 18 ++++++++++++++++-- kernel/sched/vendor_hooks.c | 1 + 3 files changed, 22 insertions(+), 2 deletions(-) diff --git a/include/trace/hooks/sched.h b/include/trace/hooks/sched.h index c6fa88f03013..0773eae433a6 100644 --- a/include/trace/hooks/sched.h +++ b/include/trace/hooks/sched.h @@ -193,6 +193,11 @@ DECLARE_RESTRICTED_HOOK(android_rvh_update_misfit_status, TP_PROTO(struct task_struct *p, struct rq *rq, bool *need_update), TP_ARGS(p, rq, need_update), 1); +DECLARE_RESTRICTED_HOOK(android_rvh_update_cpus_allowed, + TP_PROTO(struct task_struct *p, cpumask_var_t cpus_requested, + const struct cpumask *new_mask, int *ret), + TP_ARGS(p, cpus_requested, new_mask, ret), 1); + DECLARE_RESTRICTED_HOOK(android_rvh_sched_fork_init, TP_PROTO(struct task_struct *p), TP_ARGS(p), 1); diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index de18cb9e4f9e..24a75f5c2920 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -67,6 +67,8 @@ #include #include +#include + DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key); DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key); @@ -1196,6 +1198,18 @@ void rebuild_sched_domains(void) } EXPORT_SYMBOL_GPL(rebuild_sched_domains); +static int update_cpus_allowed(struct cpuset *cs, struct task_struct *p, + const struct cpumask *new_mask) +{ + int ret = -EINVAL; + + trace_android_rvh_update_cpus_allowed(p, cs->cpus_requested, new_mask, &ret); + if (!ret) + return ret; + + return set_cpus_allowed_ptr(p, new_mask); +} + /** * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed @@ -1218,7 +1232,7 @@ static void update_tasks_cpumask(struct cpuset *cs) if (top_cs && (task->flags & PF_KTHREAD) && kthread_is_per_cpu(task)) continue; - set_cpus_allowed_ptr(task, cs->effective_cpus); + update_cpus_allowed(cs, task, cs->effective_cpus); } css_task_iter_end(&it); } @@ -2527,7 +2541,7 @@ static void cpuset_attach(struct cgroup_taskset *tset) * can_attach beforehand should guarantee that this doesn't * fail. 
TODO: have a better way to handle failure here */ - WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach)); + WARN_ON_ONCE(update_cpus_allowed(cs, task, cpus_attach)); cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to); cpuset_update_task_spread_flag(cs, task); diff --git a/kernel/sched/vendor_hooks.c b/kernel/sched/vendor_hooks.c index 9fc169565287..a5a581cf888c 100644 --- a/kernel/sched/vendor_hooks.c +++ b/kernel/sched/vendor_hooks.c @@ -35,6 +35,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_find_busiest_queue); EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_migrate_queued_task); EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_cpu_overutilized); EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_sched_setaffinity); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_update_cpus_allowed); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_build_sched_domains); EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_check_preempt_tick); EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_check_preempt_wakeup_ignore); From 55bc1d91e401392b069098184eec997c52dae2f6 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 20 Sep 2022 21:43:51 +0000 Subject: [PATCH 339/457] FROMLIST: softirq: Add generic accessor to percpu softirq_pending data In a previous iteration of this patch series, I was checking: per_cpu(irq_stat, cpu).__softirq_pending which resulted in build errors on s390. This patch tries to create a generic accessor to this percpu softirq_pending data. This interface is inherently racy as its reading percpu data without a lock. However, being able to peek at the softirq pending data allows us to make better decisions about rt task placement vs just ignoring it. On s390 this call returns 0, which maybe isn't ideal but results in no functional change from what we do now. TODO: Heiko suggested changing s390 to use a proper per-cpu irqstat variable instead. Feedback or suggestions for better approach here would be welcome! 
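For illustration only, a minimal sketch of how a scheduler-side caller could consume this accessor, combining it with the per-cpu active_softirqs mask and the LONG_SOFTIRQ_MASK that later patches in this series introduce; the helper name cpu_handling_long_softirqs() is illustrative, and the cpu_busy_with_softirqs() helper added two patches later does essentially this:

	/*
	 * Sketch: racy by design, just peek at softirqs currently being
	 * handled on @cpu plus the ones still pending there.
	 */
	static bool cpu_handling_long_softirqs(int cpu)
	{
		u32 softirqs = per_cpu(active_softirqs, cpu) |
			       __cpu_softirq_pending(cpu);

		return softirqs & LONG_SOFTIRQ_MASK;
	}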
Cc: John Dias Cc: Connor O'Brien Cc: Rick Yiu Cc: John Kacur Cc: Qais Yousef Cc: Chris Redpath Cc: Abhijeet Dharmapurikar Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Juri Lelli Cc: Vincent Guittot Cc: Dietmar Eggemann Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Joel Fernandes Cc: Alexander Gordeev Cc: kernel-team@android.com Reported-by: kernel test robot Signed-off-by: John Stultz Link: https://lore.kernel.org/lkml/20221116075929.453876-2-jstultz@google.com/ Change-Id: I333565755ef3804d1669e1c8328b2a5d72dd3a75 Bug: 168521633 --- arch/s390/include/asm/hardirq.h | 6 ++++++ include/linux/interrupt.h | 11 +++++++++++ 2 files changed, 17 insertions(+) diff --git a/arch/s390/include/asm/hardirq.h b/arch/s390/include/asm/hardirq.h index 58668ffb5488..cd9cc11588ab 100644 --- a/arch/s390/include/asm/hardirq.h +++ b/arch/s390/include/asm/hardirq.h @@ -16,6 +16,12 @@ #define local_softirq_pending() (S390_lowcore.softirq_pending) #define set_softirq_pending(x) (S390_lowcore.softirq_pending = (x)) #define or_softirq_pending(x) (S390_lowcore.softirq_pending |= (x)) +/* + * Not sure what the right thing is here for s390, + * but returning 0 will result in no logical change + * from what happens now + */ +#define __cpu_softirq_pending(x) (0) #define __ARCH_IRQ_STAT #define __ARCH_IRQ_EXIT_IRQS_DISABLED diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index a92bce40b04b..a749a8663841 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -527,6 +527,17 @@ DECLARE_STATIC_KEY_FALSE(force_irqthreads_key); #define set_softirq_pending(x) (__this_cpu_write(local_softirq_pending_ref, (x))) #define or_softirq_pending(x) (__this_cpu_or(local_softirq_pending_ref, (x))) +/** + * __cpu_softirq_pending() - Checks to see if softirq is pending on a cpu + * + * This helper is inherently racy, as we're accessing per-cpu data w/o locks. + * But peeking at the flag can still be useful when deciding where to place a + * task. + */ +static inline u32 __cpu_softirq_pending(int cpu) +{ + return (u32)per_cpu(local_softirq_pending_ref, cpu); +} #endif /* local_softirq_pending */ /* Some architectures might implement lazy enabling/disabling of From 35a06697da792f6a8e56a600c671bdd980c10d36 Mon Sep 17 00:00:00 2001 From: Connor O'Brien Date: Tue, 10 May 2022 13:07:09 -0700 Subject: [PATCH 340/457] FROMLIST: sched: Avoid placing RT threads on cores handling long softirqs In certain audio use cases, scheduling RT threads on cores that are handling softirqs can lead to glitches. Prevent this behavior in cases where the softirq is likely to take a long time. To avoid unnecessary migrations, the old behavior is preserved for RCU, SCHED and TIMER irqs which are expected to be relatively quick. This patch reworks and combines two related changes originally by John Dias Cc: John Dias Cc: Connor O'Brien Cc: Rick Yiu Cc: John Kacur Cc: Qais Yousef Cc: Chris Redpath Cc: Abhijeet Dharmapurikar Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Juri Lelli Cc: Vincent Guittot Cc: Dietmar Eggemann Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Joel Fernandes Cc: Alexander Gordeev Cc: kernel-team@android.com Signed-off-by: John Dias [elavila: Port to mainline, amend commit text] Signed-off-by: J. 
Avila [connoro: Reworked, simplified, and merged two patches together] Signed-off-by: Connor O'Brien [jstultz: Further simplified and fixed issues, reworded commit message, removed arm64-isms] Signed-off-by: John Stultz Link: https://lore.kernel.org/lkml/20221116075929.453876-3-jstultz@google.com/ Change-Id: Iea6719e1d900be4a3492949685f1218f299f29e8 Bug: 168521633 --- v2: * Reformatted Kconfig entry to match coding style (Reported-by: Randy Dunlap ) * Made rt_task_fits_capacity_and_may_preempt static to avoid warnings (Reported-by: kernel test robot ) * Rework to use preempt_count and drop kconfig dependency on ARM64 v3: * Use introduced __cpu_softirq_pending() to avoid s390 build issues (Reported-by: kernel test robot ) v4: * Drop TASKLET_SOFTIRQ from LONG_SOFTIRQS (suggested by Qais) * Depend on !PREEMPT_RT (Suggested by Qais) * Larger simplification of logic (suggested by Qais) * Rework LONG_SOFTIRQS to use BIT() macros * Rename task_may_preempt() to cpu_busy_with_softirqs() v5: * Conditionalize active_softirqs handling (suggested by Alexander Gordeev ) * Reorder rt_task_fits_cpu to have the "fast" function first (Suggested by Alexander Gordeev ) * Fix bug I introduced in v2 condensing task_thread_info(task)->preempt_count to preempt_count() (Reported-by: Alexander Gordeev ) * Tweak comment discription to remove the vauge "slow" descriptor of softirqs being run by ksoftirqd (Suggested by Alexander Gordeev ) * Switch to using CONFIG_RT_SOFTIRQ_AWARE_SCHED (suggested by Joel Fernandes ) * Simplify cpu_busy_with_softirqs() logic as pointed out by Alexander Gordeev * Switch to using IS_ENABLED rather then defining my own macro (suggsted by Joel Fernandes ) --- include/linux/interrupt.h | 9 +++++++ init/Kconfig | 10 ++++++++ kernel/sched/rt.c | 49 ++++++++++++++++++++++++++++++++------- kernel/softirq.c | 17 ++++++++++++++ 4 files changed, 76 insertions(+), 9 deletions(-) diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index a749a8663841..7d09eb998d4c 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -582,6 +582,11 @@ enum * _ IRQ_POLL: irq_poll_cpu_dead() migrates the queue */ #define SOFTIRQ_HOTPLUG_SAFE_MASK (BIT(RCU_SOFTIRQ) | BIT(IRQ_POLL_SOFTIRQ)) +/* Softirq's where the handling might be long: */ +#define LONG_SOFTIRQ_MASK (BIT(NET_TX_SOFTIRQ) | \ + BIT(NET_RX_SOFTIRQ) | \ + BIT(BLOCK_SOFTIRQ) | \ + BIT(IRQ_POLL_SOFTIRQ)) /* map softirq index to softirq name. update 'softirq_to_name' in * kernel/softirq.c when adding a new softirq. @@ -618,6 +623,10 @@ extern void raise_softirq(unsigned int nr); DECLARE_PER_CPU(struct task_struct *, ksoftirqd); +#ifdef CONFIG_RT_SOFTIRQ_AWARE_SCHED +DECLARE_PER_CPU(u32, active_softirqs); +#endif + static inline struct task_struct *this_cpu_ksoftirqd(void) { return this_cpu_read(ksoftirqd); diff --git a/init/Kconfig b/init/Kconfig index 971df9846f20..dc8ee459392b 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1291,6 +1291,16 @@ config SCHED_AUTOGROUP desktop applications. Task group autogeneration is currently based upon task session. +config RT_SOFTIRQ_AWARE_SCHED + bool "Improve RT scheduling during long softirq execution" + depends on SMP && !PREEMPT_RT + default n + help + Enable an optimization which tries to avoid placing RT tasks on CPUs + occupied by nonpreemptible tasks, such as a long softirq or CPUs + which may soon block preemptions, such as a CPU running a ksoftirq + thread which handles slow softirqs. 
+ config SYSFS_DEPRECATED bool "Enable deprecated sysfs features to support old userspace tools" depends on SYSFS diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 18ad8d7b39a0..3f02aa9eaffd 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1627,6 +1627,32 @@ static void yield_task_rt(struct rq *rq) #ifdef CONFIG_SMP static int find_lowest_rq(struct task_struct *task); +#ifdef CONFIG_RT_SOFTIRQ_AWARE_SCHED +/* + * Return whether the given cpu is currently non-preemptible + * while handling a potentially long softirq, or if the current + * task is likely to block preemptions soon because it is a + * ksoftirq thread that is handling softirqs. + */ +static bool cpu_busy_with_softirqs(int cpu) +{ + u32 softirqs = per_cpu(active_softirqs, cpu) | + __cpu_softirq_pending(cpu); + + return softirqs & LONG_SOFTIRQ_MASK; +} +#else +static bool cpu_busy_with_softirqs(int cpu) +{ + return false; +} +#endif /* CONFIG_RT_SOFTIRQ_AWARE_SCHED */ + +static bool rt_task_fits_cpu(struct task_struct *p, int cpu) +{ + return rt_task_fits_capacity(p, cpu) && !cpu_busy_with_softirqs(cpu); +} + static int select_task_rq_rt(struct task_struct *p, int cpu, int flags) { @@ -1676,9 +1702,11 @@ select_task_rq_rt(struct task_struct *p, int cpu, int flags) * This test is optimistic, if we get it wrong the load-balancer * will have to sort it out. * - * We take into account the capacity of the CPU to ensure it fits the - * requirement of the task - which is only important on heterogeneous - * systems like big.LITTLE. + * We use rt_task_fits_cpu() to evaluate if the CPU is busy with + * potentially long-running softirq work, as well as take into + * account the capacity of the CPU to ensure it fits the + * requirement of the task - which is only important on + * heterogeneous systems like big.LITTLE. */ test = curr && unlikely(rt_task(curr)) && @@ -1693,14 +1721,14 @@ select_task_rq_rt(struct task_struct *p, int cpu, int flags) goto out_unlock; } - if (test || !rt_task_fits_capacity(p, cpu)) { + if (test || !rt_task_fits_cpu(p, cpu)) { int target = find_lowest_rq(p); /* * Bail out if we were forcing a migration to find a better * fitting CPU but our search failed. */ - if (!test && target != -1 && !rt_task_fits_capacity(p, target)) + if (!test && target != -1 && !rt_task_fits_cpu(p, target)) goto out_unlock; /* @@ -1947,14 +1975,17 @@ static int find_lowest_rq(struct task_struct *task) return -1; /* No other targets possible */ /* - * If we're on asym system ensure we consider the different capacities - * of the CPUs when searching for the lowest_mask. + * If we're using the softirq optimization or if we are + * on asym system, ensure we consider the softirq processing + * or different capacities of the CPUs when searching for the + * lowest_mask. 
*/ - if (sched_asym_cpucap_active()) { + if (IS_ENABLED(CONFIG_RT_SOFTIRQ_AWARE_SCHED) || + sched_asym_cpucap_active()) { ret = cpupri_find_fitness(&task_rq(task)->rd->cpupri, task, lowest_mask, - rt_task_fits_capacity); + rt_task_fits_cpu); } else { ret = cpupri_find(&task_rq(task)->rd->cpupri, diff --git a/kernel/softirq.c b/kernel/softirq.c index 4a4625b4e678..f861dade8a1a 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -63,6 +63,21 @@ static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp DEFINE_PER_CPU(struct task_struct *, ksoftirqd); EXPORT_PER_CPU_SYMBOL_GPL(ksoftirqd); +#ifdef CONFIG_RT_SOFTIRQ_AWARE_SCHED +/* + * active_softirqs -- per cpu, a mask of softirqs that are being handled, + * with the expectation that approximate answers are acceptable and therefore + * no synchronization. + */ +DEFINE_PER_CPU(u32, active_softirqs); +static inline void set_active_softirqs(u32 pending) +{ + __this_cpu_write(active_softirqs, pending); +} +#else /* CONFIG_RT_SOFTIRQ_AWARE_SCHED */ +static inline void set_active_softirqs(u32 pending) {}; +#endif /* CONFIG_RT_SOFTIRQ_AWARE_SCHED */ + const char * const softirq_to_name[NR_SOFTIRQS] = { "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "IRQ_POLL", "TASKLET", "SCHED", "HRTIMER", "RCU" @@ -554,6 +569,7 @@ asmlinkage __visible void __softirq_entry __do_softirq(void) restart: /* Reset the pending bitmask before enabling irqs */ set_softirq_pending(0); + set_active_softirqs(pending); local_irq_enable(); @@ -583,6 +599,7 @@ restart: pending >>= softirq_bit; } + set_active_softirqs(0); if (!IS_ENABLED(CONFIG_PREEMPT_RT) && __this_cpu_read(ksoftirqd) == current) rcu_softirq_qs(); From f302663810a643c6af4dd84989916c43b323d4d2 Mon Sep 17 00:00:00 2001 From: Pavankumar Kondeti Date: Wed, 28 Jun 2017 12:00:31 +0530 Subject: [PATCH 341/457] FROMLIST: softirq: defer softirq processing to ksoftirqd if CPU is busy with RT Defer the softirq processing to ksoftirqd if a RT task is running or queued on the current CPU. This complements the RT task placement algorithm which tries to find a CPU that is not currently busy with softirqs. Currently NET_TX, NET_RX, BLOCK and IRQ_POLL softirqs are only deferred as they can potentially run for long time. Additionally, this patch stubs out ksoftirqd_running() logic, in the CONFIG_RT_SOFTIRQ_AWARE_SCHED case, as deferring potentially long-running softirqs will cause the logic to not process shorter-running softirqs immediately. By stubbing it out the potentially long running softirqs are deferred, but the shorter running ones can still run immediately. This patch includes folded-in fixes by: Lingutla Chandrasekhar Satya Durga Srinivasu Prabhala J. Avila Cc: John Dias Cc: Connor O'Brien Cc: Rick Yiu Cc: John Kacur Cc: Qais Yousef Cc: Chris Redpath Cc: Abhijeet Dharmapurikar Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Juri Lelli Cc: Vincent Guittot Cc: Dietmar Eggemann Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Joel Fernandes Cc: Alexander Gordeev Cc: kernel-team@android.com Signed-off-by: Pavankumar Kondeti [satyap@codeaurora.org: trivial merge conflict resolution.] Signed-off-by: Satya Durga Srinivasu Prabhala [elavila: Port to mainline, squash with bugfix] Signed-off-by: J. 
Avila [jstultz: Rebase to linus/HEAD, minor rearranging of code, included bug fix Reported-by: Qais Yousef ] Signed-off-by: John Stultz Link: https://lore.kernel.org/lkml/20221116075929.453876-4-jstultz@google.com/ Change-Id: Iedb590216e373788950243165ff83f4c43015f31 Bug: 168521633 --- v4: * Fix commit message to accurately note long-running softirqs (suggested by Qais) * Switch to using rt_task(current) (suggested by Qais) v5: * Switch to using CONFIG_RT_SOFTIRQ_AWARE_SCHED (suggested by Joel Fernandes ) --- kernel/softirq.c | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/kernel/softirq.c b/kernel/softirq.c index f861dade8a1a..411491a6f7d7 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -98,6 +98,7 @@ static void wakeup_softirqd(void) wake_up_process(tsk); } +#ifndef CONFIG_RT_SOFTIRQ_AWARE_SCHED /* * If ksoftirqd is scheduled, we do not want to process pending softirqs * right now. Let ksoftirqd handle this at its own rate, to get fairness, @@ -112,6 +113,9 @@ static bool ksoftirqd_running(unsigned long pending) return false; return tsk && task_is_running(tsk) && !__kthread_should_park(tsk); } +#else +#define ksoftirqd_running(pending) (false) +#endif /* CONFIG_RT_SOFTIRQ_AWARE_SCHED */ #ifdef CONFIG_TRACE_IRQFLAGS DEFINE_PER_CPU(int, hardirqs_enabled); @@ -543,6 +547,21 @@ static inline bool lockdep_softirq_start(void) { return false; } static inline void lockdep_softirq_end(bool in_hardirq) { } #endif +#ifdef CONFIG_RT_SOFTIRQ_AWARE_SCHED +static __u32 softirq_deferred_for_rt(__u32 *pending) +{ + __u32 deferred = 0; + + if (rt_task(current)) { + deferred = *pending & LONG_SOFTIRQ_MASK; + *pending &= ~LONG_SOFTIRQ_MASK; + } + return deferred; +} +#else +#define softirq_deferred_for_rt(x) (0) +#endif + asmlinkage __visible void __softirq_entry __do_softirq(void) { unsigned long end = jiffies + MAX_SOFTIRQ_TIME; @@ -550,6 +569,7 @@ asmlinkage __visible void __softirq_entry __do_softirq(void) int max_restart = MAX_SOFTIRQ_RESTART; struct softirq_action *h; bool in_hardirq; + __u32 deferred; __u32 pending; int softirq_bit; @@ -561,14 +581,16 @@ asmlinkage __visible void __softirq_entry __do_softirq(void) current->flags &= ~PF_MEMALLOC; pending = local_softirq_pending(); + deferred = softirq_deferred_for_rt(&pending); softirq_handle_begin(); + in_hardirq = lockdep_softirq_start(); account_softirq_enter(current); restart: /* Reset the pending bitmask before enabling irqs */ - set_softirq_pending(0); + set_softirq_pending(deferred); set_active_softirqs(pending); local_irq_enable(); @@ -607,14 +629,17 @@ restart: local_irq_disable(); pending = local_softirq_pending(); + deferred = softirq_deferred_for_rt(&pending); + if (pending) { if (time_before(jiffies, end) && !need_resched() && --max_restart) goto restart; - - wakeup_softirqd(); } + if (pending | deferred) + wakeup_softirqd(); + account_softirq_exit(current); lockdep_softirq_end(in_hardirq); softirq_handle_end(); From b20490aa50898338c4e7936e32c4a3a19cc3db11 Mon Sep 17 00:00:00 2001 From: Lingutla Chandrasekhar Date: Fri, 1 Mar 2019 17:17:09 +0530 Subject: [PATCH 342/457] FROMLIST: trace: Add trace points for tasklet entry/exit Tasklets are supposed to finish their work quickly and should not block the current running process, but it is not guaranteed that. Currently softirq_entry/exit can be used to know total tasklets execution time, but not helpful to track individual tasklet's execution time. With that we can't find any culprit tasklet function, which is taking more time. 
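With the tasklet_entry/exit tracepoints added by this patch, per-tasklet runtime can be measured from probes. A rough sketch follows; the probe and variable names are illustrative, the single timestamp ignores per-CPU and nested-tasklet concurrency for brevity, and registration assumes the code is built in (or that the tracepoints are exported to modules):

	#include <linux/init.h>
	#include <linux/ktime.h>
	#include <linux/printk.h>
	#include <trace/events/irq.h>

	static ktime_t tasklet_start;	/* simplification: one tasklet at a time */

	static void probe_tasklet_entry(void *data, void *func)
	{
		tasklet_start = ktime_get();
	}

	static void probe_tasklet_exit(void *data, void *func)
	{
		pr_info("tasklet %ps ran for %lld ns\n", func,
			ktime_to_ns(ktime_sub(ktime_get(), tasklet_start)));
	}

	static int __init tasklet_lat_init(void)
	{
		register_trace_tasklet_entry(probe_tasklet_entry, NULL);
		register_trace_tasklet_exit(probe_tasklet_exit, NULL);
		return 0;
	}
	late_initcall(tasklet_lat_init);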
Add tasklet_entry/exit trace point support to track individual tasklet execution. This patch has been carried in the Android tree for awhile so I wanted to submit it for review upstream. Feedback would be appreciated! Cc: Steven Rostedt Cc: Masami Hiramatsu Cc: Thomas Gleixner Cc: "Paul E. McKenney" Cc: Connor O'Brien Cc: kernel-team@android.com Signed-off-by: Lingutla Chandrasekhar [elavila: Port to android-mainline] Signed-off-by: J. Avila [jstultz: Rebased to upstream, cut unused trace points, added comments for the tracepoints, reworded commit] Signed-off-by: John Stultz Link: https://lore.kernel.org/lkml/20221213185310.1315794-1-jstultz@google.com/ Bug: 168521633 Change-Id: I3944fcedffae54a5f761d0b18ff1c41d2c3e4aeb --- include/trace/events/irq.h | 43 ++++++++++++++++++++++++++++++++++++++ kernel/softirq.c | 9 ++++++-- 2 files changed, 50 insertions(+), 2 deletions(-) diff --git a/include/trace/events/irq.h b/include/trace/events/irq.h index eeceafaaea4c..da85851d4ec1 100644 --- a/include/trace/events/irq.h +++ b/include/trace/events/irq.h @@ -160,6 +160,49 @@ DEFINE_EVENT(softirq, softirq_raise, TP_ARGS(vec_nr) ); +DECLARE_EVENT_CLASS(tasklet, + + TP_PROTO(void *func), + + TP_ARGS(func), + + TP_STRUCT__entry( + __field( void *, func) + ), + + TP_fast_assign( + __entry->func = func; + ), + + TP_printk("function=%ps", __entry->func) +); + +/** + * tasklet_entry - called immediately before the tasklet is run + * @func: tasklet callback or function being run + * + * Used to find individual tasklet execution time + */ +DEFINE_EVENT(tasklet, tasklet_entry, + + TP_PROTO(void *func), + + TP_ARGS(func) +); + +/** + * tasklet_exit - called immediately after the tasklet is run + * @func: tasklet callback or function being run + * + * Used to find individual tasklet execution time + */ +DEFINE_EVENT(tasklet, tasklet_exit, + + TP_PROTO(void *func), + + TP_ARGS(func) +); + #endif /* _TRACE_IRQ_H */ /* This part must be outside protection */ diff --git a/kernel/softirq.c b/kernel/softirq.c index 411491a6f7d7..eb6d0821bc8c 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -838,10 +838,15 @@ static void tasklet_action_common(struct softirq_action *a, if (tasklet_trylock(t)) { if (!atomic_read(&t->count)) { if (tasklet_clear_sched(t)) { - if (t->use_callback) + if (t->use_callback) { + trace_tasklet_entry(t->callback); t->callback(t); - else + trace_tasklet_exit(t->callback); + } else { + trace_tasklet_entry(t->func); t->func(t->data); + trace_tasklet_exit(t->func); + } } tasklet_unlock(t); continue; From a3a71b3b0bf755abb9e8011fd6b1eb8e7729c842 Mon Sep 17 00:00:00 2001 From: "J. Avila" Date: Sat, 7 Nov 2020 00:24:15 +0000 Subject: [PATCH 343/457] ANDROID: GKI: Enable CONFIG_RT_SOFTIRQ_AWARE_SCHED This config helps address audio buffer underrun issues on arm64 targets. Bug: 168521633 Signed-off-by: J. 
Avila [jstultz: Rebased to android-mainline, also enabled on x86] Signed-off-by: John Stultz Change-Id: Iffb39b2c1d55f5d88d1475f68d5ed5a3bba90a2b --- arch/arm64/configs/gki_defconfig | 1 + arch/x86/configs/gki_defconfig | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/arm64/configs/gki_defconfig b/arch/arm64/configs/gki_defconfig index 5383353f960f..76cae674749c 100644 --- a/arch/arm64/configs/gki_defconfig +++ b/arch/arm64/configs/gki_defconfig @@ -31,6 +31,7 @@ CONFIG_CGROUP_CPUACCT=y CONFIG_CGROUP_BPF=y CONFIG_NAMESPACES=y # CONFIG_PID_NS is not set +CONFIG_RT_SOFTIRQ_AWARE_SCHED=y # CONFIG_RD_BZIP2 is not set # CONFIG_RD_LZMA is not set # CONFIG_RD_XZ is not set diff --git a/arch/x86/configs/gki_defconfig b/arch/x86/configs/gki_defconfig index 32009d7e745c..a4ccc6d4d307 100644 --- a/arch/x86/configs/gki_defconfig +++ b/arch/x86/configs/gki_defconfig @@ -34,6 +34,7 @@ CONFIG_CGROUP_BPF=y CONFIG_NAMESPACES=y # CONFIG_TIME_NS is not set # CONFIG_PID_NS is not set +CONFIG_RT_SOFTIRQ_AWARE_SCHED=y # CONFIG_RD_BZIP2 is not set # CONFIG_RD_LZMA is not set # CONFIG_RD_XZ is not set From ce17e299d0f6d17350d53e2097bea45420553dca Mon Sep 17 00:00:00 2001 From: Prasad Sodagudi Date: Tue, 10 May 2022 13:47:07 -0700 Subject: [PATCH 344/457] ANDROID: firmware_loader: Add support for customer firmware paths Currently firmware_class.patch commandline can take a single path for loading firmwares on custom paths. SoC vendors and oems can have firmwares in multiple file system paths. So add support for paassing multiple paths through command line for firmware loader. Add a getter function to read the class path. For example - firmware_class.path="/vendor,/vendor/firmware_mnt, /oem/firmware". firmware_class.path can take upto 10 file system paths with ',' separation. 
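As a sketch only, the same comma-splitting could be expressed with strsep(); this reuses the PATH_SIZE, CUSTOM_FW_PATH_COUNT and fw_path_para definitions from the patch below, while the patch itself uses an explicit strchr() loop, and fw_path_split() is an illustrative name:

	static int fw_path_split(const char *val)
	{
		char buf[PATH_SIZE * CUSTOM_FW_PATH_COUNT];
		char *rest, *tok;
		int i = 0;

		strscpy(buf, val, sizeof(buf));
		rest = strim(buf);		/* drop leading/trailing spaces */

		while (i < CUSTOM_FW_PATH_COUNT && (tok = strsep(&rest, ","))) {
			if (*tok == '\0')	/* skip empty tokens from ',,' runs */
				continue;
			strscpy(fw_path_para[i++], tok, PATH_SIZE);
		}
		return 0;
	}

Because the parameter is registered with module_param_cb() and permission 0644, the path list should also be readable and writable at runtime via /sys/module/firmware_class/parameters/path, in addition to the firmware_class.path= command line usage described above.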
Bug: 259298396 Change-Id: I31d1470d7dd0255c7aefd856f3c129bdb4b7f2e8 Signed-off-by: Prasad Sodagudi Signed-off-by: Vamsi Krishna Lanka --- drivers/base/firmware_loader/main.c | 74 +++++++++++++++++++++++++++-- 1 file changed, 69 insertions(+), 5 deletions(-) diff --git a/drivers/base/firmware_loader/main.c b/drivers/base/firmware_loader/main.c index 7c3590fd97c2..6966a1854832 100644 --- a/drivers/base/firmware_loader/main.c +++ b/drivers/base/firmware_loader/main.c @@ -467,21 +467,85 @@ static int fw_decompress_xz(struct device *dev, struct fw_priv *fw_priv, #endif /* CONFIG_FW_LOADER_COMPRESS_XZ */ /* direct firmware loading support */ -static char fw_path_para[256]; +#define CUSTOM_FW_PATH_COUNT 10 +#define PATH_SIZE 255 +static char fw_path_para[CUSTOM_FW_PATH_COUNT][PATH_SIZE]; static const char * const fw_path[] = { - fw_path_para, + fw_path_para[0], + fw_path_para[1], + fw_path_para[2], + fw_path_para[3], + fw_path_para[4], + fw_path_para[5], + fw_path_para[6], + fw_path_para[7], + fw_path_para[8], + fw_path_para[9], "/lib/firmware/updates/" UTS_RELEASE, "/lib/firmware/updates", "/lib/firmware/" UTS_RELEASE, "/lib/firmware" }; +static char strpath[PATH_SIZE * CUSTOM_FW_PATH_COUNT]; +static int firmware_param_path_set(const char *val, const struct kernel_param *kp) +{ + int i; + char *path, *end; + + strscpy(strpath, val, sizeof(strpath)); + /* Remove leading and trailing spaces from path */ + path = strim(strpath); + for (i = 0; path && i < CUSTOM_FW_PATH_COUNT; i++) { + end = strchr(path, ','); + + /* Skip continuous token case, for example ',,,' */ + if (end == path) { + i--; + path = ++end; + continue; + } + + if (end != NULL) + *end = '\0'; + else { + /* end of the string reached and no other tockens ',' */ + strscpy(fw_path_para[i], path, PATH_SIZE); + break; + } + + strscpy(fw_path_para[i], path, PATH_SIZE); + path = ++end; + } + + return 0; +} + +static int firmware_param_path_get(char *buffer, const struct kernel_param *kp) +{ + int count = 0, i; + + for (i = 0; i < CUSTOM_FW_PATH_COUNT; i++) { + if (strlen(fw_path_para[i]) != 0) + count += scnprintf(buffer + count, PATH_SIZE, "%s%s", fw_path_para[i], ","); + } + + buffer[count - 1] = '\0'; + + return count - 1; +} /* - * Typical usage is that passing 'firmware_class.path=$CUSTOMIZED_PATH' + * Typical usage is that passing 'firmware_class.path=/vendor,/firwmare_mnt' * from kernel command line because firmware_class is generally built in - * kernel instead of module. + * kernel instead of module. ',' is used as delimiter for setting 10 + * custom paths for firmware loader. */ -module_param_string(path, fw_path_para, sizeof(fw_path_para), 0644); + +static const struct kernel_param_ops firmware_param_ops = { + .set = firmware_param_path_set, + .get = firmware_param_path_get, +}; +module_param_cb(path, &firmware_param_ops, NULL, 0644); MODULE_PARM_DESC(path, "customized firmware image search path with a higher priority than default path"); static int From 94fa0a083126c79067558354cbdcb9009c503588 Mon Sep 17 00:00:00 2001 From: Naman Jain Date: Wed, 14 Dec 2022 10:17:45 +0530 Subject: [PATCH 345/457] FROMGIT: asm-generic/io: Add _RET_IP_ to MMIO trace for more accurate debug info Due to compiler optimizations like inlining, there are cases where MMIO traces using _THIS_IP_ for caller information might not be sufficient to provide accurate debug traces. 
1) With optimizations (Seen with GCC): In this case, _THIS_IP_ works fine and prints the caller information since it will be inlined into the caller and we get the debug traces on who made the MMIO access, for ex: rwmmio_read: qcom_smmu_tlb_sync+0xe0/0x1b0 width=32 addr=0xffff8000087447f4 rwmmio_post_read: qcom_smmu_tlb_sync+0xe0/0x1b0 width=32 val=0x0 addr=0xffff8000087447f4 2) Without optimizations (Seen with Clang): _THIS_IP_ will not be sufficient in this case as it will print only the MMIO accessors itself which is of not much use since it is not inlined as below for example: rwmmio_read: readl+0x4/0x80 width=32 addr=0xffff8000087447f4 rwmmio_post_read: readl+0x48/0x80 width=32 val=0x4 addr=0xffff8000087447f4 So in order to handle this second case as well irrespective of the compiler optimizations, add _RET_IP_ to MMIO trace to make it provide more accurate debug information in all these scenarios. Before: rwmmio_read: readl+0x4/0x80 width=32 addr=0xffff8000087447f4 rwmmio_post_read: readl+0x48/0x80 width=32 val=0x4 addr=0xffff8000087447f4 After: rwmmio_read: qcom_smmu_tlb_sync+0xe0/0x1b0 -> readl+0x4/0x80 width=32 addr=0xffff8000087447f4 rwmmio_post_read: qcom_smmu_tlb_sync+0xe0/0x1b0 -> readl+0x4/0x80 width=32 val=0x0 addr=0xffff8000087447f4 Fixes: 210031971cdd ("asm-generic/io: Add logging support for MMIO accessors") Signed-off-by: Sai Prakash Ranjan Signed-off-by: Arnd Bergmann Bug: 262467838 Change-Id: I04e50dfbc59574a06ed5def8027fc4f48f422b1d (cherry picked from commit 5e5ff73c2e5863f93fc5fd78d178cd8f2af12464 https://git.kernel.org/pub/scm/linux/kernel/git/arnd/asm-generic.git master) Signed-off-by: Naman Jain --- include/asm-generic/io.h | 80 +++++++++++++++++------------------ include/trace/events/rwmmio.h | 43 ++++++++++++------- lib/trace_readwrite.c | 16 +++---- 3 files changed, 75 insertions(+), 64 deletions(-) diff --git a/include/asm-generic/io.h b/include/asm-generic/io.h index a68f8fbf423b..4c44a29b5e8e 100644 --- a/include/asm-generic/io.h +++ b/include/asm-generic/io.h @@ -80,24 +80,24 @@ DECLARE_TRACEPOINT(rwmmio_read); DECLARE_TRACEPOINT(rwmmio_post_read); void log_write_mmio(u64 val, u8 width, volatile void __iomem *addr, - unsigned long caller_addr); + unsigned long caller_addr, unsigned long caller_addr0); void log_post_write_mmio(u64 val, u8 width, volatile void __iomem *addr, - unsigned long caller_addr); + unsigned long caller_addr, unsigned long caller_addr0); void log_read_mmio(u8 width, const volatile void __iomem *addr, - unsigned long caller_addr); + unsigned long caller_addr, unsigned long caller_addr0); void log_post_read_mmio(u64 val, u8 width, const volatile void __iomem *addr, - unsigned long caller_addr); + unsigned long caller_addr, unsigned long caller_addr0); #else static inline void log_write_mmio(u64 val, u8 width, volatile void __iomem *addr, - unsigned long caller_addr) {} + unsigned long caller_addr, unsigned long caller_addr0) {} static inline void log_post_write_mmio(u64 val, u8 width, volatile void __iomem *addr, - unsigned long caller_addr) {} + unsigned long caller_addr, unsigned long caller_addr0) {} static inline void log_read_mmio(u8 width, const volatile void __iomem *addr, - unsigned long caller_addr) {} + unsigned long caller_addr, unsigned long caller_addr0) {} static inline void log_post_read_mmio(u64 val, u8 width, const volatile void __iomem *addr, - unsigned long caller_addr) {} + unsigned long caller_addr, unsigned long caller_addr0) {} #endif /* CONFIG_TRACE_MMIO_ACCESS */ @@ -188,11 +188,11 @@ static inline u8 
readb(const volatile void __iomem *addr) { u8 val; - log_read_mmio(8, addr, _THIS_IP_); + log_read_mmio(8, addr, _THIS_IP_, _RET_IP_); __io_br(); val = __raw_readb(addr); __io_ar(val); - log_post_read_mmio(val, 8, addr, _THIS_IP_); + log_post_read_mmio(val, 8, addr, _THIS_IP_, _RET_IP_); return val; } #endif @@ -203,11 +203,11 @@ static inline u16 readw(const volatile void __iomem *addr) { u16 val; - log_read_mmio(16, addr, _THIS_IP_); + log_read_mmio(16, addr, _THIS_IP_, _RET_IP_); __io_br(); val = __le16_to_cpu((__le16 __force)__raw_readw(addr)); __io_ar(val); - log_post_read_mmio(val, 16, addr, _THIS_IP_); + log_post_read_mmio(val, 16, addr, _THIS_IP_, _RET_IP_); return val; } #endif @@ -218,11 +218,11 @@ static inline u32 readl(const volatile void __iomem *addr) { u32 val; - log_read_mmio(32, addr, _THIS_IP_); + log_read_mmio(32, addr, _THIS_IP_, _RET_IP_); __io_br(); val = __le32_to_cpu((__le32 __force)__raw_readl(addr)); __io_ar(val); - log_post_read_mmio(val, 32, addr, _THIS_IP_); + log_post_read_mmio(val, 32, addr, _THIS_IP_, _RET_IP_); return val; } #endif @@ -234,11 +234,11 @@ static inline u64 readq(const volatile void __iomem *addr) { u64 val; - log_read_mmio(64, addr, _THIS_IP_); + log_read_mmio(64, addr, _THIS_IP_, _RET_IP_); __io_br(); val = __le64_to_cpu(__raw_readq(addr)); __io_ar(val); - log_post_read_mmio(val, 64, addr, _THIS_IP_); + log_post_read_mmio(val, 64, addr, _THIS_IP_, _RET_IP_); return val; } #endif @@ -248,11 +248,11 @@ static inline u64 readq(const volatile void __iomem *addr) #define writeb writeb static inline void writeb(u8 value, volatile void __iomem *addr) { - log_write_mmio(value, 8, addr, _THIS_IP_); + log_write_mmio(value, 8, addr, _THIS_IP_, _RET_IP_); __io_bw(); __raw_writeb(value, addr); __io_aw(); - log_post_write_mmio(value, 8, addr, _THIS_IP_); + log_post_write_mmio(value, 8, addr, _THIS_IP_, _RET_IP_); } #endif @@ -260,11 +260,11 @@ static inline void writeb(u8 value, volatile void __iomem *addr) #define writew writew static inline void writew(u16 value, volatile void __iomem *addr) { - log_write_mmio(value, 16, addr, _THIS_IP_); + log_write_mmio(value, 16, addr, _THIS_IP_, _RET_IP_); __io_bw(); __raw_writew((u16 __force)cpu_to_le16(value), addr); __io_aw(); - log_post_write_mmio(value, 16, addr, _THIS_IP_); + log_post_write_mmio(value, 16, addr, _THIS_IP_, _RET_IP_); } #endif @@ -272,11 +272,11 @@ static inline void writew(u16 value, volatile void __iomem *addr) #define writel writel static inline void writel(u32 value, volatile void __iomem *addr) { - log_write_mmio(value, 32, addr, _THIS_IP_); + log_write_mmio(value, 32, addr, _THIS_IP_, _RET_IP_); __io_bw(); __raw_writel((u32 __force)__cpu_to_le32(value), addr); __io_aw(); - log_post_write_mmio(value, 32, addr, _THIS_IP_); + log_post_write_mmio(value, 32, addr, _THIS_IP_, _RET_IP_); } #endif @@ -285,11 +285,11 @@ static inline void writel(u32 value, volatile void __iomem *addr) #define writeq writeq static inline void writeq(u64 value, volatile void __iomem *addr) { - log_write_mmio(value, 64, addr, _THIS_IP_); + log_write_mmio(value, 64, addr, _THIS_IP_, _RET_IP_); __io_bw(); __raw_writeq(__cpu_to_le64(value), addr); __io_aw(); - log_post_write_mmio(value, 64, addr, _THIS_IP_); + log_post_write_mmio(value, 64, addr, _THIS_IP_, _RET_IP_); } #endif #endif /* CONFIG_64BIT */ @@ -305,9 +305,9 @@ static inline u8 readb_relaxed(const volatile void __iomem *addr) { u8 val; - log_read_mmio(8, addr, _THIS_IP_); + log_read_mmio(8, addr, _THIS_IP_, _RET_IP_); val = __raw_readb(addr); - 
log_post_read_mmio(val, 8, addr, _THIS_IP_); + log_post_read_mmio(val, 8, addr, _THIS_IP_, _RET_IP_); return val; } #endif @@ -318,9 +318,9 @@ static inline u16 readw_relaxed(const volatile void __iomem *addr) { u16 val; - log_read_mmio(16, addr, _THIS_IP_); + log_read_mmio(16, addr, _THIS_IP_, _RET_IP_); val = __le16_to_cpu(__raw_readw(addr)); - log_post_read_mmio(val, 16, addr, _THIS_IP_); + log_post_read_mmio(val, 16, addr, _THIS_IP_, _RET_IP_); return val; } #endif @@ -331,9 +331,9 @@ static inline u32 readl_relaxed(const volatile void __iomem *addr) { u32 val; - log_read_mmio(32, addr, _THIS_IP_); + log_read_mmio(32, addr, _THIS_IP_, _RET_IP_); val = __le32_to_cpu(__raw_readl(addr)); - log_post_read_mmio(val, 32, addr, _THIS_IP_); + log_post_read_mmio(val, 32, addr, _THIS_IP_, _RET_IP_); return val; } #endif @@ -344,9 +344,9 @@ static inline u64 readq_relaxed(const volatile void __iomem *addr) { u64 val; - log_read_mmio(64, addr, _THIS_IP_); + log_read_mmio(64, addr, _THIS_IP_, _RET_IP_); val = __le64_to_cpu(__raw_readq(addr)); - log_post_read_mmio(val, 64, addr, _THIS_IP_); + log_post_read_mmio(val, 64, addr, _THIS_IP_, _RET_IP_); return val; } #endif @@ -355,9 +355,9 @@ static inline u64 readq_relaxed(const volatile void __iomem *addr) #define writeb_relaxed writeb_relaxed static inline void writeb_relaxed(u8 value, volatile void __iomem *addr) { - log_write_mmio(value, 8, addr, _THIS_IP_); + log_write_mmio(value, 8, addr, _THIS_IP_, _RET_IP_); __raw_writeb(value, addr); - log_post_write_mmio(value, 8, addr, _THIS_IP_); + log_post_write_mmio(value, 8, addr, _THIS_IP_, _RET_IP_); } #endif @@ -365,9 +365,9 @@ static inline void writeb_relaxed(u8 value, volatile void __iomem *addr) #define writew_relaxed writew_relaxed static inline void writew_relaxed(u16 value, volatile void __iomem *addr) { - log_write_mmio(value, 16, addr, _THIS_IP_); + log_write_mmio(value, 16, addr, _THIS_IP_, _RET_IP_); __raw_writew(cpu_to_le16(value), addr); - log_post_write_mmio(value, 16, addr, _THIS_IP_); + log_post_write_mmio(value, 16, addr, _THIS_IP_, _RET_IP_); } #endif @@ -375,9 +375,9 @@ static inline void writew_relaxed(u16 value, volatile void __iomem *addr) #define writel_relaxed writel_relaxed static inline void writel_relaxed(u32 value, volatile void __iomem *addr) { - log_write_mmio(value, 32, addr, _THIS_IP_); + log_write_mmio(value, 32, addr, _THIS_IP_, _RET_IP_); __raw_writel(__cpu_to_le32(value), addr); - log_post_write_mmio(value, 32, addr, _THIS_IP_); + log_post_write_mmio(value, 32, addr, _THIS_IP_, _RET_IP_); } #endif @@ -385,9 +385,9 @@ static inline void writel_relaxed(u32 value, volatile void __iomem *addr) #define writeq_relaxed writeq_relaxed static inline void writeq_relaxed(u64 value, volatile void __iomem *addr) { - log_write_mmio(value, 64, addr, _THIS_IP_); + log_write_mmio(value, 64, addr, _THIS_IP_, _RET_IP_); __raw_writeq(__cpu_to_le64(value), addr); - log_post_write_mmio(value, 64, addr, _THIS_IP_); + log_post_write_mmio(value, 64, addr, _THIS_IP_, _RET_IP_); } #endif diff --git a/include/trace/events/rwmmio.h b/include/trace/events/rwmmio.h index de41159216c1..a43e5dd7436b 100644 --- a/include/trace/events/rwmmio.h +++ b/include/trace/events/rwmmio.h @@ -12,12 +12,14 @@ DECLARE_EVENT_CLASS(rwmmio_rw_template, - TP_PROTO(unsigned long caller, u64 val, u8 width, volatile void __iomem *addr), + TP_PROTO(unsigned long caller, unsigned long caller0, u64 val, u8 width, + volatile void __iomem *addr), - TP_ARGS(caller, val, width, addr), + TP_ARGS(caller, caller0, val, width, 
addr), TP_STRUCT__entry( __field(unsigned long, caller) + __field(unsigned long, caller0) __field(unsigned long, addr) __field(u64, val) __field(u8, width) @@ -25,56 +27,64 @@ DECLARE_EVENT_CLASS(rwmmio_rw_template, TP_fast_assign( __entry->caller = caller; + __entry->caller0 = caller0; __entry->val = val; __entry->addr = (unsigned long)addr; __entry->width = width; ), - TP_printk("%pS width=%d val=%#llx addr=%#lx", - (void *)__entry->caller, __entry->width, + TP_printk("%pS -> %pS width=%d val=%#llx addr=%#lx", + (void *)__entry->caller0, (void *)__entry->caller, __entry->width, __entry->val, __entry->addr) ); DEFINE_EVENT(rwmmio_rw_template, rwmmio_write, - TP_PROTO(unsigned long caller, u64 val, u8 width, volatile void __iomem *addr), - TP_ARGS(caller, val, width, addr) + TP_PROTO(unsigned long caller, unsigned long caller0, u64 val, u8 width, + volatile void __iomem *addr), + TP_ARGS(caller, caller0, val, width, addr) ); DEFINE_EVENT(rwmmio_rw_template, rwmmio_post_write, - TP_PROTO(unsigned long caller, u64 val, u8 width, volatile void __iomem *addr), - TP_ARGS(caller, val, width, addr) + TP_PROTO(unsigned long caller, unsigned long caller0, u64 val, u8 width, + volatile void __iomem *addr), + TP_ARGS(caller, caller0, val, width, addr) ); TRACE_EVENT(rwmmio_read, - TP_PROTO(unsigned long caller, u8 width, const volatile void __iomem *addr), + TP_PROTO(unsigned long caller, unsigned long caller0, u8 width, + const volatile void __iomem *addr), - TP_ARGS(caller, width, addr), + TP_ARGS(caller, caller0, width, addr), TP_STRUCT__entry( __field(unsigned long, caller) + __field(unsigned long, caller0) __field(unsigned long, addr) __field(u8, width) ), TP_fast_assign( __entry->caller = caller; + __entry->caller0 = caller0; __entry->addr = (unsigned long)addr; __entry->width = width; ), - TP_printk("%pS width=%d addr=%#lx", - (void *)__entry->caller, __entry->width, __entry->addr) + TP_printk("%pS -> %pS width=%d addr=%#lx", + (void *)__entry->caller0, (void *)__entry->caller, __entry->width, __entry->addr) ); TRACE_EVENT(rwmmio_post_read, - TP_PROTO(unsigned long caller, u64 val, u8 width, const volatile void __iomem *addr), + TP_PROTO(unsigned long caller, unsigned long caller0, u64 val, u8 width, + const volatile void __iomem *addr), - TP_ARGS(caller, val, width, addr), + TP_ARGS(caller, caller0, val, width, addr), TP_STRUCT__entry( __field(unsigned long, caller) + __field(unsigned long, caller0) __field(unsigned long, addr) __field(u64, val) __field(u8, width) @@ -82,13 +92,14 @@ TRACE_EVENT(rwmmio_post_read, TP_fast_assign( __entry->caller = caller; + __entry->caller0 = caller0; __entry->val = val; __entry->addr = (unsigned long)addr; __entry->width = width; ), - TP_printk("%pS width=%d val=%#llx addr=%#lx", - (void *)__entry->caller, __entry->width, + TP_printk("%pS -> %pS width=%d val=%#llx addr=%#lx", + (void *)__entry->caller0, (void *)__entry->caller, __entry->width, __entry->val, __entry->addr) ); diff --git a/lib/trace_readwrite.c b/lib/trace_readwrite.c index 88637038b30c..62b4e8b3c733 100644 --- a/lib/trace_readwrite.c +++ b/lib/trace_readwrite.c @@ -14,33 +14,33 @@ #ifdef CONFIG_TRACE_MMIO_ACCESS void log_write_mmio(u64 val, u8 width, volatile void __iomem *addr, - unsigned long caller_addr) + unsigned long caller_addr, unsigned long caller_addr0) { - trace_rwmmio_write(caller_addr, val, width, addr); + trace_rwmmio_write(caller_addr, caller_addr0, val, width, addr); } EXPORT_SYMBOL_GPL(log_write_mmio); EXPORT_TRACEPOINT_SYMBOL_GPL(rwmmio_write); void log_post_write_mmio(u64 
val, u8 width, volatile void __iomem *addr, - unsigned long caller_addr) + unsigned long caller_addr, unsigned long caller_addr0) { - trace_rwmmio_post_write(caller_addr, val, width, addr); + trace_rwmmio_post_write(caller_addr, caller_addr0, val, width, addr); } EXPORT_SYMBOL_GPL(log_post_write_mmio); EXPORT_TRACEPOINT_SYMBOL_GPL(rwmmio_post_write); void log_read_mmio(u8 width, const volatile void __iomem *addr, - unsigned long caller_addr) + unsigned long caller_addr, unsigned long caller_addr0) { - trace_rwmmio_read(caller_addr, width, addr); + trace_rwmmio_read(caller_addr, caller_addr0, width, addr); } EXPORT_SYMBOL_GPL(log_read_mmio); EXPORT_TRACEPOINT_SYMBOL_GPL(rwmmio_read); void log_post_read_mmio(u64 val, u8 width, const volatile void __iomem *addr, - unsigned long caller_addr) + unsigned long caller_addr, unsigned long caller_addr0) { - trace_rwmmio_post_read(caller_addr, val, width, addr); + trace_rwmmio_post_read(caller_addr, caller_addr0, val, width, addr); } EXPORT_SYMBOL_GPL(log_post_read_mmio); EXPORT_TRACEPOINT_SYMBOL_GPL(rwmmio_post_read); From 8689a2067103d14812e7bc958e75c1b56c58d429 Mon Sep 17 00:00:00 2001 From: Doug Anderson Date: Thu, 2 Feb 2012 22:58:28 -0800 Subject: [PATCH 346/457] ANDROID: of: Support CONFIG_CMDLINE_EXTEND config option The old logic assumes CMDLINE_FROM_BOOTLOADER vs. CMDLINE_FORCE and ignores CMDLINE_EXTEND. Here's the old logic: - CONFIG_CMDLINE_FORCE=true CONFIG_CMDLINE - dt bootargs=non-empty: dt bootargs - dt bootargs=empty, @data is non-empty string @data is left unchanged - dt bootargs=empty, @data is empty string CONFIG_CMDLINE (or "" if that's not defined) The new logic is now documented in of_fdt.h and is copied here for reference: - CONFIG_CMDLINE_FORCE=true CONFIG_CMDLINE - CONFIG_CMDLINE_EXTEND=true, @data is non-empty string @data + dt bootargs (even if dt bootargs are empty) - CONFIG_CMDLINE_EXTEND=true, @data is empty string CONFIG_CMDLINE + dt bootargs (even if dt bootargs are empty) - CMDLINE_FROM_BOOTLOADER=true, dt bootargs=non-empty: dt bootargs - CMDLINE_FROM_BOOTLOADER=true, dt bootargs=empty, @data is non-empty string @data is left unchanged - CMDLINE_FROM_BOOTLOADER=true, dt bootargs=empty, @data is empty string CONFIG_CMDLINE (or "" if that's not defined) Signed-off-by: Doug Anderson CC: devicetree-discuss@lists.ozlabs.org CC: Grant Likely CC: Benjamin Herrenschmidt CC: Rob Herring Bug: 120440972 Bug: 261894961 Change-Id: I40ace250847f813358125dfcaa8998fd32cf7ea3 Signed-off-by: Colin Cross [AmitP: Folded following android-4.9 commit changes into this patch e820270abb5d ("ANDROID: of: fix CONFIG_CMDLINE_EXTEND") 9a4a74055444 ("ANDROID: of: Fix build warnings")] Signed-off-by: Amit Pundir [eberman: Resolve conflicts due to commit 60f20d84dc81 ("of/fdt: Rework early_init_dt_scan_chosen() to call directly") commit 7a12dd077e52 ("of: move from strlcpy with unused retval to strscpy")] Signed-off-by: Elliot Berman --- drivers/of/fdt.c | 68 ++++++++++++++++++++++++++++-------------- include/linux/of_fdt.h | 21 +++++++++++++ 2 files changed, 66 insertions(+), 23 deletions(-) diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c index 7b571a631639..bbf65f636337 100644 --- a/drivers/of/fdt.c +++ b/drivers/of/fdt.c @@ -1150,10 +1150,32 @@ int __init early_init_dt_scan_memory(void) return 0; } +/* + * Convert configs to something easy to use in C code + */ +#if defined(CONFIG_CMDLINE_FORCE) +static const int overwrite_incoming_cmdline = 1; +static const int read_dt_cmdline; +static const int concat_cmdline; +#elif 
defined(CONFIG_CMDLINE_EXTEND) +static const int overwrite_incoming_cmdline; +static const int read_dt_cmdline = 1; +static const int concat_cmdline = 1; +#else /* CMDLINE_FROM_BOOTLOADER */ +static const int overwrite_incoming_cmdline; +static const int read_dt_cmdline = 1; +static const int concat_cmdline; +#endif +#ifdef CONFIG_CMDLINE +static const char *config_cmdline = CONFIG_CMDLINE; +#else +static const char *config_cmdline = ""; +#endif + int __init early_init_dt_scan_chosen(char *cmdline) { - int l, node; - const char *p; + int l = 0, node; + const char *p = NULL; const void *rng_seed; const void *fdt = initial_boot_params; @@ -1168,28 +1190,28 @@ int __init early_init_dt_scan_chosen(char *cmdline) early_init_dt_check_for_initrd(node); early_init_dt_check_for_elfcorehdr(node); - /* Retrieve command line */ - p = of_get_flat_dt_prop(node, "bootargs", &l); - if (p != NULL && l > 0) - strscpy(cmdline, p, min(l, COMMAND_LINE_SIZE)); - /* - * CONFIG_CMDLINE is meant to be a default in case nothing else - * managed to set the command line, unless CONFIG_CMDLINE_FORCE - * is set in which case we override whatever was found earlier. - */ -#ifdef CONFIG_CMDLINE -#if defined(CONFIG_CMDLINE_EXTEND) - strlcat(cmdline, " ", COMMAND_LINE_SIZE); - strlcat(cmdline, CONFIG_CMDLINE, COMMAND_LINE_SIZE); -#elif defined(CONFIG_CMDLINE_FORCE) - strscpy(cmdline, CONFIG_CMDLINE, COMMAND_LINE_SIZE); -#else - /* No arguments from boot loader, use kernel's cmdl*/ - if (!((char *)cmdline)[0]) - strscpy(cmdline, CONFIG_CMDLINE, COMMAND_LINE_SIZE); -#endif -#endif /* CONFIG_CMDLINE */ + /* Put CONFIG_CMDLINE in if forced or if data had nothing in it to start */ + if (overwrite_incoming_cmdline || !cmdline[0]) + strscpy(cmdline, config_cmdline, COMMAND_LINE_SIZE); + + /* Retrieve command line unless forcing */ + if (read_dt_cmdline) + p = of_get_flat_dt_prop(node, "bootargs", &l); + if (p != NULL && l > 0) { + if (concat_cmdline) { + int cmdline_len; + int copy_len; + strlcat(cmdline, " ", COMMAND_LINE_SIZE); + cmdline_len = strlen(cmdline); + copy_len = COMMAND_LINE_SIZE - cmdline_len - 1; + copy_len = min((int)l, copy_len); + strncpy(cmdline + cmdline_len, p, copy_len); + cmdline[cmdline_len + copy_len] = '\0'; + } else { + strscpy(cmdline, p, min(l, COMMAND_LINE_SIZE)); + } + } pr_debug("Command line is: %s\n", (char *)cmdline); diff --git a/include/linux/of_fdt.h b/include/linux/of_fdt.h index d69ad5bb1eb1..991e4c1c87ba 100644 --- a/include/linux/of_fdt.h +++ b/include/linux/of_fdt.h @@ -58,6 +58,27 @@ extern int of_flat_dt_is_compatible(unsigned long node, const char *name); extern unsigned long of_get_flat_dt_root(void); extern uint32_t of_get_flat_dt_phandle(unsigned long node); +/* + * early_init_dt_scan_chosen - scan the device tree for ramdisk and bootargs + * + * The boot arguments will be placed into the memory pointed to by @data. + * That memory should be COMMAND_LINE_SIZE big and initialized to be a valid + * (possibly empty) string. 
Logic for what will be in @data after this + * function finishes: + * + * - CONFIG_CMDLINE_FORCE=true + * CONFIG_CMDLINE + * - CONFIG_CMDLINE_EXTEND=true, @data is non-empty string + * @data + dt bootargs (even if dt bootargs are empty) + * - CONFIG_CMDLINE_EXTEND=true, @data is empty string + * CONFIG_CMDLINE + dt bootargs (even if dt bootargs are empty) + * - CMDLINE_FROM_BOOTLOADER=true, dt bootargs=non-empty: + * dt bootargs + * - CMDLINE_FROM_BOOTLOADER=true, dt bootargs=empty, @data is non-empty string + * @data is left unchanged + * - CMDLINE_FROM_BOOTLOADER=true, dt bootargs=empty, @data is empty string + * CONFIG_CMDLINE (or "" if that's not defined) + */ extern int early_init_dt_scan_chosen(char *cmdline); extern int early_init_dt_scan_memory(void); extern void early_init_dt_check_for_usable_mem_range(void); From 0778b7e91cd26987dc3cbfbfbeeae884036a4194 Mon Sep 17 00:00:00 2001 From: Carlos Llamas Date: Sun, 19 Sep 2021 01:30:28 +0000 Subject: [PATCH 347/457] ANDROID: Revert "arm64: Drop support for CMDLINE_EXTEND" This reverts commit cae118b6acc309539b33339e846cbb19187c164c. Upstream removed the support for CMDLINE_EXTEND on arm64 due to some inconsistencies in the way command-line was assembled together. However, the patch inadvertently deleted same config option from the android tree which was first introduced by commit 056f12819b75 ("ANDROID: arm64: copy CONFIG_CMDLINE_EXTEND from ARM") back in 2014. Upstream is currently working on a solution to fix the divergence by allowing prepend and append functionalities. While we wait for the new series to land let's bring back CMDLINE_EXTEND option for android, so arm64 can actually use of the functionality provided by d9a2a3f2c2c4 ("ANDROID: of: Support CONFIG_CMDLINE_EXTEND config option"). Bug: 120440972 Bug: 261894961 Change-Id: I3b7fd55e95af49b64f284582dee40392074d32f1 Signed-off-by: Carlos Llamas Signed-off-by: Elliot Berman --- arch/arm64/Kconfig | 6 ++++++ arch/arm64/kernel/idreg-override.c | 5 ++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 1489ef400c6a..1478a5393b36 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -2195,6 +2195,12 @@ config CMDLINE_FROM_BOOTLOADER the boot loader doesn't provide any, the default kernel command string provided in CMDLINE will be used. +config CMDLINE_EXTEND + bool "Extend bootloader kernel arguments" + help + The command-line arguments provided by the boot loader will be + appended to the default kernel command string. + config CMDLINE_FORCE bool "Always use the default kernel command string" help diff --git a/arch/arm64/kernel/idreg-override.c b/arch/arm64/kernel/idreg-override.c index 95133765ed29..1af3ac47faa0 100644 --- a/arch/arm64/kernel/idreg-override.c +++ b/arch/arm64/kernel/idreg-override.c @@ -287,8 +287,11 @@ static __init void parse_cmdline(void) { const u8 *prop = get_bootargs_cmdline(); - if (IS_ENABLED(CONFIG_CMDLINE_FORCE) || !prop) + if (IS_ENABLED(CONFIG_CMDLINE_EXTEND) || + IS_ENABLED(CONFIG_CMDLINE_FORCE) || + !prop) { __parse_cmdline(CONFIG_CMDLINE, true); + } if (!IS_ENABLED(CONFIG_CMDLINE_FORCE) && prop) __parse_cmdline(prop, true); From 9933cd0873aa445bb081d227c39146b2237b152f Mon Sep 17 00:00:00 2001 From: Carlos Llamas Date: Sun, 19 Sep 2021 15:32:02 +0000 Subject: [PATCH 348/457] Revert "ANDROID: GKI: remove CONFIG_CMDLINE_EXTEND from arm64 gki_defconfig" This reverts commit 4c2fb69cfa1f9c1605b00a5cc90f9398692d8366. 
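For context on what CMDLINE_EXTEND actually does, here is a minimal userspace sketch of the assembly rules documented in the previous patch. The mode names, buffer size and simplified string handling are illustrative only and are not taken from the kernel sources.

#include <stdio.h>
#include <string.h>

#define COMMAND_LINE_SIZE 256

enum cmdline_mode { MODE_FORCE, MODE_EXTEND, MODE_FROM_BOOTLOADER };

static void assemble_cmdline(char *out, const char *builtin,
			     const char *dt_bootargs, enum cmdline_mode mode)
{
	/* Start from the built-in string when forcing or when @out is empty. */
	if (mode == MODE_FORCE || !out[0]) {
		strncpy(out, builtin, COMMAND_LINE_SIZE - 1);
		out[COMMAND_LINE_SIZE - 1] = '\0';
	}

	if (mode == MODE_FORCE || !dt_bootargs || !dt_bootargs[0])
		return;

	if (mode == MODE_EXTEND) {
		/* EXTEND: append the DT bootargs to what we already have. */
		strncat(out, " ", COMMAND_LINE_SIZE - strlen(out) - 1);
		strncat(out, dt_bootargs, COMMAND_LINE_SIZE - strlen(out) - 1);
	} else {
		/* FROM_BOOTLOADER: non-empty DT bootargs win outright. */
		strncpy(out, dt_bootargs, COMMAND_LINE_SIZE - 1);
		out[COMMAND_LINE_SIZE - 1] = '\0';
	}
}

int main(void)
{
	char cmdline[COMMAND_LINE_SIZE] = "";

	assemble_cmdline(cmdline, "console=ttynull", "root=/dev/vda", MODE_EXTEND);
	printf("EXTEND:          %s\n", cmdline);

	cmdline[0] = '\0';
	assemble_cmdline(cmdline, "console=ttynull", "root=/dev/vda", MODE_FROM_BOOTLOADER);
	printf("FROM_BOOTLOADER: %s\n", cmdline);
	return 0;
}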
Bring back CONFIG_CMDLINE_EXTEND while we wait for an alternative solution to be accepted upstream. Bug: 120440972 Bug: 261894961 [cmllamas: fix trivial revert conflict] Signed-off-by: Carlos Llamas [eberman: fix trivial conflict] Signed-off-by: Elliot Berman Change-Id: Ifcb2f9b345d9826e6cffa4cd380c68bb35c60b31 --- arch/arm64/configs/gki_defconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/configs/gki_defconfig b/arch/arm64/configs/gki_defconfig index 76cae674749c..1004c61cbebf 100644 --- a/arch/arm64/configs/gki_defconfig +++ b/arch/arm64/configs/gki_defconfig @@ -58,6 +58,7 @@ CONFIG_SETEND_EMULATION=y CONFIG_RANDOMIZE_BASE=y # CONFIG_RANDOMIZE_MODULE_REGION_FULL is not set CONFIG_CMDLINE="console=ttynull stack_depot_disable=on cgroup_disable=pressure kasan.stacktrace=off kvm-arm.mode=protected bootconfig ioremap_guard" +CONFIG_CMDLINE_EXTEND=y # CONFIG_DMI is not set CONFIG_PM_WAKELOCKS=y CONFIG_PM_WAKELOCKS_LIMIT=0 From 96c5043a4d6443d56b92b5740e8e7d25907d1c66 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Tue, 8 Nov 2022 17:03:07 -0700 Subject: [PATCH 349/457] x86/vdso: Conditionally export __vdso_sgx_enter_enclave() commit 45be2ad007a9c6bea70249c4cf3e4905afe4caeb upstream. Recently, ld.lld moved from '--undefined-version' to '--no-undefined-version' as the default, which breaks building the vDSO when CONFIG_X86_SGX is not set: ld.lld: error: version script assignment of 'LINUX_2.6' to symbol '__vdso_sgx_enter_enclave' failed: symbol not defined __vdso_sgx_enter_enclave is only included in the vDSO when CONFIG_X86_SGX is set. Only export it if it will be present in the final object, which clears up the error. Fixes: 8466436952017 ("x86/vdso: Implement a vDSO for Intel SGX enclave call") Signed-off-by: Nathan Chancellor Signed-off-by: Thomas Gleixner Reviewed-by: Nick Desaulniers Link: https://github.com/ClangBuiltLinux/linux/issues/1756 Link: https://lore.kernel.org/r/20221109000306.1407357-1-nathan@kernel.org Signed-off-by: Greg Kroah-Hartman --- arch/x86/entry/vdso/vdso.lds.S | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/entry/vdso/vdso.lds.S b/arch/x86/entry/vdso/vdso.lds.S index 4bf48462fca7..e8c60ae7a7c8 100644 --- a/arch/x86/entry/vdso/vdso.lds.S +++ b/arch/x86/entry/vdso/vdso.lds.S @@ -27,7 +27,9 @@ VERSION { __vdso_time; clock_getres; __vdso_clock_getres; +#ifdef CONFIG_X86_SGX __vdso_sgx_enter_enclave; +#endif local: *; }; } From 76c6303530ebcb1459302d8943527cba04baf42e Mon Sep 17 00:00:00 2001 From: David Michael Date: Sun, 13 Nov 2022 15:52:17 -0500 Subject: [PATCH 350/457] libbpf: Fix uninitialized warning in btf_dump_dump_type_data MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit dfd0afbf151d85411b371e841f62b81ee5d1ca54 upstream. GCC 11.3.0 fails to compile btf_dump.c due to the following error, which seems to originate in btf_dump_struct_data where the returned value would be uninitialized if btf_vlen returns zero. 
btf_dump.c: In function ‘btf_dump_dump_type_data’: btf_dump.c:2363:12: error: ‘err’ may be used uninitialized in this function [-Werror=maybe-uninitialized] 2363 | if (err < 0) | ^ Fixes: 920d16af9b42 ("libbpf: BTF dumper support for typed data") Signed-off-by: David Michael Signed-off-by: Daniel Borkmann Acked-by: Stanislav Fomichev Acked-by: Alan Maguire Link: https://lore.kernel.org/bpf/87zgcu60hq.fsf@gmail.com Signed-off-by: Greg Kroah-Hartman --- tools/lib/bpf/btf_dump.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c index 4221f73a74d0..3937f66c7f8d 100644 --- a/tools/lib/bpf/btf_dump.c +++ b/tools/lib/bpf/btf_dump.c @@ -1963,7 +1963,7 @@ static int btf_dump_struct_data(struct btf_dump *d, { const struct btf_member *m = btf_members(t); __u16 n = btf_vlen(t); - int i, err; + int i, err = 0; /* note that we increment depth before calling btf_dump_print() below; * this is intentional. btf_dump_data_newline() will not print a From a4997bae1b5b012c8a6e2643e26578a7bc2cae36 Mon Sep 17 00:00:00 2001 From: John Thomson Date: Tue, 6 Dec 2022 06:46:45 +1000 Subject: [PATCH 351/457] PCI: mt7621: Add sentinel to quirks table commit 19098934f910b4d47cb30251dd39ffa57bef9523 upstream. Current driver is missing a sentinel in the struct soc_device_attribute array, which causes an oops when assessed by the soc_device_match(mt7621_pcie_quirks_match) call. This was only exposed once the CONFIG_SOC_MT7621 mt7621 soc_dev_attr was fixed to register the SOC as a device, in: commit 7c18b64bba3b ("mips: ralink: mt7621: do not use kzalloc too early") Fix it by adding the required sentinel. Link: https://lore.kernel.org/lkml/26ebbed1-0fe9-4af9-8466-65f841d0b382@app.fastmail.com Link: https://lore.kernel.org/r/20221205204645.301301-1-git@johnthomson.fastmail.com.au Fixes: b483b4e4d3f6 ("staging: mt7621-pci: add quirks for 'E2' revision using 'soc_device_attribute'") Signed-off-by: John Thomson Signed-off-by: Lorenzo Pieralisi Acked-by: Sergio Paracuellos Signed-off-by: Greg Kroah-Hartman --- drivers/pci/controller/pcie-mt7621.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/pci/controller/pcie-mt7621.c b/drivers/pci/controller/pcie-mt7621.c index 4bd1abf26008..ee7aad09d627 100644 --- a/drivers/pci/controller/pcie-mt7621.c +++ b/drivers/pci/controller/pcie-mt7621.c @@ -466,7 +466,8 @@ static int mt7621_pcie_register_host(struct pci_host_bridge *host) } static const struct soc_device_attribute mt7621_pcie_quirks_match[] = { - { .soc_id = "mt7621", .revision = "E2" } + { .soc_id = "mt7621", .revision = "E2" }, + { /* sentinel */ } }; static int mt7621_pcie_probe(struct platform_device *pdev) From a1d9199ba485e55f52e985ee13c2a653d259593f Mon Sep 17 00:00:00 2001 From: John Thomson Date: Mon, 14 Nov 2022 11:56:56 +1000 Subject: [PATCH 352/457] mips: ralink: mt7621: define MT7621_SYSC_BASE with __iomem commit a2cab953b4c077cc02878d424466d3a6eac32aaf upstream. 
So that MT7621_SYSC_BASE can be used later in multiple functions without needing to repeat this __iomem declaration each time Signed-off-by: John Thomson Signed-off-by: Thomas Bogendoerfer Signed-off-by: Greg Kroah-Hartman --- arch/mips/include/asm/mach-ralink/mt7621.h | 4 +++- arch/mips/ralink/mt7621.c | 7 +++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/arch/mips/include/asm/mach-ralink/mt7621.h b/arch/mips/include/asm/mach-ralink/mt7621.h index 6bbf082dd149..79d5bb0e06d6 100644 --- a/arch/mips/include/asm/mach-ralink/mt7621.h +++ b/arch/mips/include/asm/mach-ralink/mt7621.h @@ -7,10 +7,12 @@ #ifndef _MT7621_REGS_H_ #define _MT7621_REGS_H_ +#define IOMEM(x) ((void __iomem *)(KSEG1ADDR(x))) + #define MT7621_PALMBUS_BASE 0x1C000000 #define MT7621_PALMBUS_SIZE 0x03FFFFFF -#define MT7621_SYSC_BASE 0x1E000000 +#define MT7621_SYSC_BASE IOMEM(0x1E000000) #define SYSC_REG_CHIP_NAME0 0x00 #define SYSC_REG_CHIP_NAME1 0x04 diff --git a/arch/mips/ralink/mt7621.c b/arch/mips/ralink/mt7621.c index fb0565bc34fd..17dbf28897e0 100644 --- a/arch/mips/ralink/mt7621.c +++ b/arch/mips/ralink/mt7621.c @@ -126,7 +126,6 @@ static void soc_dev_init(struct ralink_soc_info *soc_info, u32 rev) void __init prom_soc_init(struct ralink_soc_info *soc_info) { - void __iomem *sysc = (void __iomem *) KSEG1ADDR(MT7621_SYSC_BASE); unsigned char *name = NULL; u32 n0; u32 n1; @@ -154,8 +153,8 @@ void __init prom_soc_init(struct ralink_soc_info *soc_info) __sync(); } - n0 = __raw_readl(sysc + SYSC_REG_CHIP_NAME0); - n1 = __raw_readl(sysc + SYSC_REG_CHIP_NAME1); + n0 = __raw_readl(MT7621_SYSC_BASE + SYSC_REG_CHIP_NAME0); + n1 = __raw_readl(MT7621_SYSC_BASE + SYSC_REG_CHIP_NAME1); if (n0 == MT7621_CHIP_NAME0 && n1 == MT7621_CHIP_NAME1) { name = "MT7621"; @@ -164,7 +163,7 @@ void __init prom_soc_init(struct ralink_soc_info *soc_info) panic("mt7621: unknown SoC, n0:%08x n1:%08x\n", n0, n1); } ralink_soc = MT762X_SOC_MT7621AT; - rev = __raw_readl(sysc + SYSC_REG_CHIP_REV); + rev = __raw_readl(MT7621_SYSC_BASE + SYSC_REG_CHIP_REV); snprintf(soc_info->sys_type, RAMIPS_SYS_TYPE_LEN, "MediaTek %s ver:%u eco:%u", From ee11da28a62e8b1fbaf471027257c44b40e7f244 Mon Sep 17 00:00:00 2001 From: John Thomson Date: Mon, 14 Nov 2022 11:56:57 +1000 Subject: [PATCH 353/457] mips: ralink: mt7621: soc queries and tests as functions commit b4767d4c072583dec987225b6fe3f5524a735f42 upstream. 
Move the SoC register value queries and tests to specific functions, to remove repetition of logic No functional changes intended Signed-off-by: John Thomson Signed-off-by: Thomas Bogendoerfer Signed-off-by: Greg Kroah-Hartman --- arch/mips/ralink/mt7621.c | 86 +++++++++++++++++++++++++++------------ 1 file changed, 61 insertions(+), 25 deletions(-) diff --git a/arch/mips/ralink/mt7621.c b/arch/mips/ralink/mt7621.c index 17dbf28897e0..6e126f570f0c 100644 --- a/arch/mips/ralink/mt7621.c +++ b/arch/mips/ralink/mt7621.c @@ -97,7 +97,57 @@ void __init ralink_of_remap(void) panic("Failed to remap core resources"); } -static void soc_dev_init(struct ralink_soc_info *soc_info, u32 rev) +static unsigned int __init mt7621_get_soc_name0(void) +{ + return __raw_readl(MT7621_SYSC_BASE + SYSC_REG_CHIP_NAME0); +} + +static unsigned int __init mt7621_get_soc_name1(void) +{ + return __raw_readl(MT7621_SYSC_BASE + SYSC_REG_CHIP_NAME1); +} + +static bool __init mt7621_soc_valid(void) +{ + if (mt7621_get_soc_name0() == MT7621_CHIP_NAME0 && + mt7621_get_soc_name1() == MT7621_CHIP_NAME1) + return true; + else + return false; +} + +static const char __init *mt7621_get_soc_id(void) +{ + if (mt7621_soc_valid()) + return "MT7621"; + else + return "invalid"; +} + +static unsigned int __init mt7621_get_soc_rev(void) +{ + return __raw_readl(MT7621_SYSC_BASE + SYSC_REG_CHIP_REV); +} + +static unsigned int __init mt7621_get_soc_ver(void) +{ + return (mt7621_get_soc_rev() >> CHIP_REV_VER_SHIFT) & CHIP_REV_VER_MASK; +} + +static unsigned int __init mt7621_get_soc_eco(void) +{ + return (mt7621_get_soc_rev() & CHIP_REV_ECO_MASK); +} + +static const char __init *mt7621_get_soc_revision(void) +{ + if (mt7621_get_soc_rev() == 1 && mt7621_get_soc_eco() == 1) + return "E2"; + else + return "E1"; +} + +static void soc_dev_init(struct ralink_soc_info *soc_info) { struct soc_device *soc_dev; struct soc_device_attribute *soc_dev_attr; @@ -108,12 +158,7 @@ static void soc_dev_init(struct ralink_soc_info *soc_info, u32 rev) soc_dev_attr->soc_id = "mt7621"; soc_dev_attr->family = "Ralink"; - - if (((rev >> CHIP_REV_VER_SHIFT) & CHIP_REV_VER_MASK) == 1 && - (rev & CHIP_REV_ECO_MASK) == 1) - soc_dev_attr->revision = "E2"; - else - soc_dev_attr->revision = "E1"; + soc_dev_attr->revision = mt7621_get_soc_revision(); soc_dev_attr->data = soc_info; @@ -126,11 +171,6 @@ static void soc_dev_init(struct ralink_soc_info *soc_info, u32 rev) void __init prom_soc_init(struct ralink_soc_info *soc_info) { - unsigned char *name = NULL; - u32 n0; - u32 n1; - u32 rev; - /* Early detection of CMP support */ mips_cm_probe(); mips_cpc_probe(); @@ -153,27 +193,23 @@ void __init prom_soc_init(struct ralink_soc_info *soc_info) __sync(); } - n0 = __raw_readl(MT7621_SYSC_BASE + SYSC_REG_CHIP_NAME0); - n1 = __raw_readl(MT7621_SYSC_BASE + SYSC_REG_CHIP_NAME1); - - if (n0 == MT7621_CHIP_NAME0 && n1 == MT7621_CHIP_NAME1) { - name = "MT7621"; + if (mt7621_soc_valid()) soc_info->compatible = "mediatek,mt7621-soc"; - } else { - panic("mt7621: unknown SoC, n0:%08x n1:%08x\n", n0, n1); - } + else + panic("mt7621: unknown SoC, n0:%08x n1:%08x\n", + mt7621_get_soc_name0(), + mt7621_get_soc_name1()); ralink_soc = MT762X_SOC_MT7621AT; - rev = __raw_readl(MT7621_SYSC_BASE + SYSC_REG_CHIP_REV); snprintf(soc_info->sys_type, RAMIPS_SYS_TYPE_LEN, "MediaTek %s ver:%u eco:%u", - name, - (rev >> CHIP_REV_VER_SHIFT) & CHIP_REV_VER_MASK, - (rev & CHIP_REV_ECO_MASK)); + mt7621_get_soc_id(), + mt7621_get_soc_ver(), + mt7621_get_soc_eco()); soc_info->mem_detect = mt7621_memory_detect; 
- soc_dev_init(soc_info, rev); + soc_dev_init(soc_info); if (!register_cps_smp_ops()) return; From 18301e16ea553cf3cd4352d5948722a42034f284 Mon Sep 17 00:00:00 2001 From: John Thomson Date: Mon, 14 Nov 2022 11:56:58 +1000 Subject: [PATCH 354/457] mips: ralink: mt7621: do not use kzalloc too early commit 7c18b64bba3bcad1be94b404f47b94a04b91ce79 upstream. With CONFIG_SLUB=y, following commit 6edf2576a6cc ("mm/slub: enable debugging memory wasting of kmalloc") mt7621 failed to boot very early, without showing any console messages. This exposed the pre-existing bug of mt7621.c using kzalloc before normal memory management was available. Prior to this slub change, there existed the unintended protection against "kmem_cache *s" being NULL as slab_pre_alloc_hook() happened to return NULL and bailed out of slab_alloc_node(). This allowed mt7621 prom_soc_init to fail in the soc_dev_init kzalloc, but continue booting without the SOC_BUS driver device registered. Console output from a DEBUG_ZBOOT vmlinuz kernel loading, with mm/slub modified to warn on kmem_cache zero or null: zimage at: 80B842A0 810B4BC0 Uncompressing Linux at load address 80001000 Copy device tree to address 80B80EE0 Now, booting the kernel... [ 0.000000] Linux version 6.1.0-rc3+ (john@john) (mipsel-buildroot-linux-gnu-gcc.br_real (Buildroot 2021.11-4428-g6b6741b) 12.2.0, GNU ld (GNU Binutils) 2.39) #73 SMP Wed Nov 2 05:10:01 AEST 2022 [ 0.000000] ------------[ cut here ]------------ [ 0.000000] WARNING: CPU: 0 PID: 0 at mm/slub.c:3416 kmem_cache_alloc+0x5a4/0x5e8 [ 0.000000] Modules linked in: [ 0.000000] CPU: 0 PID: 0 Comm: swapper Not tainted 6.1.0-rc3+ #73 [ 0.000000] Stack : 810fff78 80084d98 00000000 00000004 00000000 00000000 80889d04 80c90000 [ 0.000000] 80920000 807bd328 8089d368 80923bd3 00000000 00000001 80889cb0 00000000 [ 0.000000] 00000000 00000000 807bd328 8084bcb1 00000002 00000002 00000001 6d6f4320 [ 0.000000] 00000000 80c97d3d 80c97d68 fffffffc 807bd328 00000000 00000000 00000000 [ 0.000000] 00000000 a0000000 80910000 8110a0b4 00000000 00000020 80010000 80010000 [ 0.000000] ... [ 0.000000] Call Trace: [ 0.000000] [<80008260>] show_stack+0x28/0xf0 [ 0.000000] [<8070c958>] dump_stack_lvl+0x60/0x80 [ 0.000000] [<8002e184>] __warn+0xc4/0xf8 [ 0.000000] [<8002e210>] warn_slowpath_fmt+0x58/0xa4 [ 0.000000] [<801c0fac>] kmem_cache_alloc+0x5a4/0x5e8 [ 0.000000] [<8092856c>] prom_soc_init+0x1fc/0x2b4 [ 0.000000] [<80928060>] prom_init+0x44/0xf0 [ 0.000000] [<80929214>] setup_arch+0x4c/0x6a8 [ 0.000000] [<809257e0>] start_kernel+0x88/0x7c0 [ 0.000000] [ 0.000000] ---[ end trace 0000000000000000 ]--- [ 0.000000] SoC Type: MediaTek MT7621 ver:1 eco:3 [ 0.000000] printk: bootconsole [early0] enabled Allowing soc_device_register to work exposed oops in the mt7621 phy pci, and pci controller drivers from soc_device_match_attr, due to missing sentinels in the quirks tables. 
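The following sketch, built around a hypothetical table, shows why the match walk needs such a sentinel; it is an illustration only, not the driver's actual code or data.

#include <stdio.h>
#include <string.h>

struct soc_attr {
	const char *soc_id;
	const char *revision;
};

static const struct soc_attr quirks[] = {
	{ .soc_id = "mt7621", .revision = "E2" },
	{ /* sentinel: terminates the walk below */ }
};

/*
 * Walks the table until an entry with a NULL soc_id is found. Without
 * the sentinel entry this loop would read past the end of the array.
 */
static const struct soc_attr *soc_match(const struct soc_attr *table,
					const char *soc_id)
{
	for (; table->soc_id; table++)
		if (!strcmp(table->soc_id, soc_id))
			return table;
	return NULL;
}

int main(void)
{
	const struct soc_attr *m = soc_match(quirks, "mt7621");

	printf("matched: %s rev %s\n", m ? m->soc_id : "none",
	       m ? m->revision : "-");
	return 0;
}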
These were fixed with: commit 819b885cd886 ("phy: ralink: mt7621-pci: add sentinel to quirks table") not yet applied ("PCI: mt7621: add sentinel to quirks table") Link: https://lore.kernel.org/linux-mm/becf2ac3-2a90-4f3a-96d9-a70f67c66e4a@app.fastmail.com/ Fixes: 71b9b5e0130d ("MIPS: ralink: mt7621: introduce 'soc_device' initialization") Signed-off-by: John Thomson Signed-off-by: Thomas Bogendoerfer Signed-off-by: Greg Kroah-Hartman --- arch/mips/ralink/mt7621.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/arch/mips/ralink/mt7621.c b/arch/mips/ralink/mt7621.c index 6e126f570f0c..bbf5811afbf2 100644 --- a/arch/mips/ralink/mt7621.c +++ b/arch/mips/ralink/mt7621.c @@ -25,6 +25,7 @@ #define MT7621_MEM_TEST_PATTERN 0xaa5555aa static u32 detect_magic __initdata; +static struct ralink_soc_info *soc_info_ptr; int pcibios_root_bridge_prepare(struct pci_host_bridge *bridge) { @@ -147,27 +148,30 @@ static const char __init *mt7621_get_soc_revision(void) return "E1"; } -static void soc_dev_init(struct ralink_soc_info *soc_info) +static int __init mt7621_soc_dev_init(void) { struct soc_device *soc_dev; struct soc_device_attribute *soc_dev_attr; soc_dev_attr = kzalloc(sizeof(*soc_dev_attr), GFP_KERNEL); if (!soc_dev_attr) - return; + return -ENOMEM; soc_dev_attr->soc_id = "mt7621"; soc_dev_attr->family = "Ralink"; soc_dev_attr->revision = mt7621_get_soc_revision(); - soc_dev_attr->data = soc_info; + soc_dev_attr->data = soc_info_ptr; soc_dev = soc_device_register(soc_dev_attr); if (IS_ERR(soc_dev)) { kfree(soc_dev_attr); - return; + return PTR_ERR(soc_dev); } + + return 0; } +device_initcall(mt7621_soc_dev_init); void __init prom_soc_init(struct ralink_soc_info *soc_info) { @@ -209,7 +213,7 @@ void __init prom_soc_init(struct ralink_soc_info *soc_info) soc_info->mem_detect = mt7621_memory_detect; - soc_dev_init(soc_info); + soc_info_ptr = soc_info; if (!register_cps_smp_ops()) return; From fb9b502cf9113c835321e1294c8c6d9629864686 Mon Sep 17 00:00:00 2001 From: Sean Anderson Date: Thu, 1 Dec 2022 16:28:07 -0500 Subject: [PATCH 355/457] irqchip/ls-extirq: Fix endianness detection commit 3ae977d0e4e3a2a2ccc912ca2d20c9430508ecdd upstream. parent is the interrupt parent, not the parent of node. Use node->parent. This fixes endianness detection on big-endian platforms. Fixes: 1b00adce8afd ("irqchip/ls-extirq: Fix invalid wait context by avoiding to use regmap") Signed-off-by: Sean Anderson Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221201212807.616191-1-sean.anderson@seco.com Signed-off-by: Greg Kroah-Hartman --- drivers/irqchip/irq-ls-extirq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/irqchip/irq-ls-extirq.c b/drivers/irqchip/irq-ls-extirq.c index d8d48b1f7c29..139f26b0a6ef 100644 --- a/drivers/irqchip/irq-ls-extirq.c +++ b/drivers/irqchip/irq-ls-extirq.c @@ -203,7 +203,7 @@ ls_extirq_of_init(struct device_node *node, struct device_node *parent) if (ret) goto err_parse_map; - priv->big_endian = of_device_is_big_endian(parent); + priv->big_endian = of_device_is_big_endian(node->parent); priv->is_ls1021a_or_ls1043a = of_device_is_compatible(node, "fsl,ls1021a-extirq") || of_device_is_compatible(node, "fsl,ls1043a-extirq"); raw_spin_lock_init(&priv->lock); From e6b01f6a0e774b4f45759791dff5bd4f98c64226 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 7 Dec 2022 18:17:34 +0100 Subject: [PATCH 356/457] udf: Discard preallocation before extending file with a hole commit 16d0556568148bdcaa45d077cac9f8f7077cf70a upstream. 
When extending file with a hole, we tried to preserve existing preallocation for the file. However that is not very useful and complicates code because the previous extent may need to be rounded to block boundary as well (which we forgot to do thus causing data corruption for sequence like: xfs_io -f -c "pwrite 0x75e63 11008" -c "truncate 0x7b24b" \ -c "truncate 0xabaa3" -c "pwrite 0xac70b 22954" \ -c "pwrite 0x93a43 11358" -c "pwrite 0xb8e65 52211" file with 512-byte block size. Just discard preallocation before extending file to simplify things and also fix this data corruption. CC: stable@vger.kernel.org Signed-off-by: Jan Kara Signed-off-by: Greg Kroah-Hartman --- fs/udf/inode.c | 46 ++++++++++++++++++---------------------------- 1 file changed, 18 insertions(+), 28 deletions(-) diff --git a/fs/udf/inode.c b/fs/udf/inode.c index dce6ae9ae306..9b18cd258c68 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -439,6 +439,12 @@ static int udf_get_block(struct inode *inode, sector_t block, iinfo->i_next_alloc_goal++; } + /* + * Block beyond EOF and prealloc extents? Just discard preallocation + * as it is not useful and complicates things. + */ + if (((loff_t)block) << inode->i_blkbits > iinfo->i_lenExtents) + udf_discard_prealloc(inode); udf_clear_extent_cache(inode); phys = inode_getblk(inode, block, &err, &new); if (!phys) @@ -488,8 +494,6 @@ static int udf_do_extend_file(struct inode *inode, uint32_t add; int count = 0, fake = !(last_ext->extLength & UDF_EXTENT_LENGTH_MASK); struct super_block *sb = inode->i_sb; - struct kernel_lb_addr prealloc_loc = {}; - uint32_t prealloc_len = 0; struct udf_inode_info *iinfo; int err; @@ -510,19 +514,6 @@ static int udf_do_extend_file(struct inode *inode, ~(sb->s_blocksize - 1); } - /* Last extent are just preallocated blocks? */ - if ((last_ext->extLength & UDF_EXTENT_FLAG_MASK) == - EXT_NOT_RECORDED_ALLOCATED) { - /* Save the extent so that we can reattach it to the end */ - prealloc_loc = last_ext->extLocation; - prealloc_len = last_ext->extLength; - /* Mark the extent as a hole */ - last_ext->extLength = EXT_NOT_RECORDED_NOT_ALLOCATED | - (last_ext->extLength & UDF_EXTENT_LENGTH_MASK); - last_ext->extLocation.logicalBlockNum = 0; - last_ext->extLocation.partitionReferenceNum = 0; - } - /* Can we merge with the previous extent? */ if ((last_ext->extLength & UDF_EXTENT_FLAG_MASK) == EXT_NOT_RECORDED_NOT_ALLOCATED) { @@ -550,7 +541,7 @@ static int udf_do_extend_file(struct inode *inode, * more extents, we may need to enter possible following * empty indirect extent. */ - if (new_block_bytes || prealloc_len) + if (new_block_bytes) udf_next_aext(inode, last_pos, &tmploc, &tmplen, 0); } @@ -584,17 +575,6 @@ static int udf_do_extend_file(struct inode *inode, } out: - /* Do we have some preallocated blocks saved? */ - if (prealloc_len) { - err = udf_add_aext(inode, last_pos, &prealloc_loc, - prealloc_len, 1); - if (err) - return err; - last_ext->extLocation = prealloc_loc; - last_ext->extLength = prealloc_len; - count++; - } - /* last_pos should point to the last written extent... */ if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) last_pos->offset -= sizeof(struct short_ad); @@ -647,8 +627,17 @@ static int udf_extend_file(struct inode *inode, loff_t newsize) else BUG(); + /* + * When creating hole in file, just don't bother with preserving + * preallocation. It likely won't be very useful anyway. 
+ */ + udf_discard_prealloc(inode); + etype = inode_bmap(inode, first_block, &epos, &eloc, &elen, &offset); within_final_block = (etype != -1); + /* We don't expect extents past EOF... */ + WARN_ON_ONCE(etype != -1 && + elen > ((loff_t)offset + 1) << inode->i_blkbits); if ((!epos.bh && epos.offset == udf_file_entry_alloc_offset(inode)) || (epos.bh && epos.offset == sizeof(struct allocExtDesc))) { @@ -777,10 +766,11 @@ static sector_t inode_getblk(struct inode *inode, sector_t block, goto out_free; } - /* Are we beyond EOF? */ + /* Are we beyond EOF and preallocated extent? */ if (etype == -1) { int ret; loff_t hole_len; + isBeyondEOF = true; if (count) { if (c) From 12a88f572d6d94b5c0b72e2d1782cc2e96ac06cf Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 7 Dec 2022 17:25:10 +0100 Subject: [PATCH 357/457] udf: Fix preallocation discarding at indirect extent boundary commit cfe4c1b25dd6d2f056afc00b7c98bcb3dd0b1fc3 upstream. When preallocation extent is the first one in the extent block, the code would corrupt extent tree header instead. Fix the problem and use udf_delete_aext() for deleting extent to avoid some code duplication. CC: stable@vger.kernel.org Signed-off-by: Jan Kara Signed-off-by: Greg Kroah-Hartman --- fs/udf/truncate.c | 45 +++++++++++++-------------------------------- 1 file changed, 13 insertions(+), 32 deletions(-) diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c index 532cda99644e..a9790fb32f5f 100644 --- a/fs/udf/truncate.c +++ b/fs/udf/truncate.c @@ -120,60 +120,41 @@ void udf_truncate_tail_extent(struct inode *inode) void udf_discard_prealloc(struct inode *inode) { - struct extent_position epos = { NULL, 0, {0, 0} }; + struct extent_position epos = {}; + struct extent_position prev_epos = {}; struct kernel_lb_addr eloc; uint32_t elen; uint64_t lbcount = 0; int8_t etype = -1, netype; - int adsize; struct udf_inode_info *iinfo = UDF_I(inode); if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB || inode->i_size == iinfo->i_lenExtents) return; - if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) - adsize = sizeof(struct short_ad); - else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) - adsize = sizeof(struct long_ad); - else - adsize = 0; - epos.block = iinfo->i_location; /* Find the last extent in the file */ - while ((netype = udf_next_aext(inode, &epos, &eloc, &elen, 1)) != -1) { - etype = netype; + while ((netype = udf_next_aext(inode, &epos, &eloc, &elen, 0)) != -1) { + brelse(prev_epos.bh); + prev_epos = epos; + if (prev_epos.bh) + get_bh(prev_epos.bh); + + etype = udf_next_aext(inode, &epos, &eloc, &elen, 1); lbcount += elen; } if (etype == (EXT_NOT_RECORDED_ALLOCATED >> 30)) { - epos.offset -= adsize; lbcount -= elen; - extent_trunc(inode, &epos, &eloc, etype, elen, 0); - if (!epos.bh) { - iinfo->i_lenAlloc = - epos.offset - - udf_file_entry_alloc_offset(inode); - mark_inode_dirty(inode); - } else { - struct allocExtDesc *aed = - (struct allocExtDesc *)(epos.bh->b_data); - aed->lengthAllocDescs = - cpu_to_le32(epos.offset - - sizeof(struct allocExtDesc)); - if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) || - UDF_SB(inode->i_sb)->s_udfrev >= 0x0201) - udf_update_tag(epos.bh->b_data, epos.offset); - else - udf_update_tag(epos.bh->b_data, - sizeof(struct allocExtDesc)); - mark_buffer_dirty_inode(epos.bh, inode); - } + udf_delete_aext(inode, prev_epos); + udf_free_blocks(inode->i_sb, inode, &eloc, 0, + DIV_ROUND_UP(elen, 1 << inode->i_blkbits)); } /* This inode entry is in-memory only and thus we don't have to mark * the inode dirty */ iinfo->i_lenExtents = lbcount; 
brelse(epos.bh); + brelse(prev_epos.bh); } static void udf_update_alloc_ext_desc(struct inode *inode, From 1cd3e9297d44a29bf9106f87e94c5ef6b248effe Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 7 Dec 2022 17:34:33 +0100 Subject: [PATCH 358/457] udf: Do not bother looking for prealloc extents if i_lenExtents matches i_size commit 6ad53f0f71c52871202a7bf096feb2c59db33fc5 upstream. If rounded block-rounded i_lenExtents matches block rounded i_size, there are no preallocation extents. Do not bother walking extent linked list. CC: stable@vger.kernel.org Signed-off-by: Jan Kara Signed-off-by: Greg Kroah-Hartman --- fs/udf/truncate.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c index a9790fb32f5f..036ebd892b85 100644 --- a/fs/udf/truncate.c +++ b/fs/udf/truncate.c @@ -127,9 +127,10 @@ void udf_discard_prealloc(struct inode *inode) uint64_t lbcount = 0; int8_t etype = -1, netype; struct udf_inode_info *iinfo = UDF_I(inode); + int bsize = 1 << inode->i_blkbits; if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB || - inode->i_size == iinfo->i_lenExtents) + ALIGN(inode->i_size, bsize) == ALIGN(iinfo->i_lenExtents, bsize)) return; epos.block = iinfo->i_location; From 2cd2e9322726a487acf224431fbfc96f6c8880c9 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 8 Dec 2022 13:03:30 +0100 Subject: [PATCH 359/457] udf: Fix extending file within last block commit 1f3868f06855c97a4954c99b36f3fc9eb8f60326 upstream. When extending file within last block it can happen that the extent is already rounded to the blocksize and thus contains the offset we want to grow up to. In such case we would mistakenly expand the last extent and make it one block longer than it should be, exposing unallocated block in a file and causing data corruption. Fix the problem by properly detecting this case and bailing out. CC: stable@vger.kernel.org Signed-off-by: Jan Kara Signed-off-by: Greg Kroah-Hartman --- fs/udf/inode.c | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 9b18cd258c68..f713d108f21d 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -590,13 +590,17 @@ out: static void udf_do_extend_final_block(struct inode *inode, struct extent_position *last_pos, struct kernel_long_ad *last_ext, - uint32_t final_block_len) + uint32_t new_elen) { - struct super_block *sb = inode->i_sb; uint32_t added_bytes; - added_bytes = final_block_len - - (last_ext->extLength & (sb->s_blocksize - 1)); + /* + * Extent already large enough? It may be already rounded up to block + * size... 
+ */ + if (new_elen <= (last_ext->extLength & UDF_EXTENT_LENGTH_MASK)) + return; + added_bytes = (last_ext->extLength & UDF_EXTENT_LENGTH_MASK) - new_elen; last_ext->extLength += added_bytes; UDF_I(inode)->i_lenExtents += added_bytes; @@ -613,12 +617,12 @@ static int udf_extend_file(struct inode *inode, loff_t newsize) int8_t etype; struct super_block *sb = inode->i_sb; sector_t first_block = newsize >> sb->s_blocksize_bits, offset; - unsigned long partial_final_block; + loff_t new_elen; int adsize; struct udf_inode_info *iinfo = UDF_I(inode); struct kernel_long_ad extent; int err = 0; - int within_final_block; + bool within_last_ext; if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) adsize = sizeof(struct short_ad); @@ -634,9 +638,9 @@ static int udf_extend_file(struct inode *inode, loff_t newsize) udf_discard_prealloc(inode); etype = inode_bmap(inode, first_block, &epos, &eloc, &elen, &offset); - within_final_block = (etype != -1); + within_last_ext = (etype != -1); /* We don't expect extents past EOF... */ - WARN_ON_ONCE(etype != -1 && + WARN_ON_ONCE(within_last_ext && elen > ((loff_t)offset + 1) << inode->i_blkbits); if ((!epos.bh && epos.offset == udf_file_entry_alloc_offset(inode)) || @@ -653,19 +657,17 @@ static int udf_extend_file(struct inode *inode, loff_t newsize) extent.extLength |= etype << 30; } - partial_final_block = newsize & (sb->s_blocksize - 1); + new_elen = ((loff_t)offset << inode->i_blkbits) | + (newsize & (sb->s_blocksize - 1)); /* File has extent covering the new size (could happen when extending * inside a block)? */ - if (within_final_block) { + if (within_last_ext) { /* Extending file within the last file block */ - udf_do_extend_final_block(inode, &epos, &extent, - partial_final_block); + udf_do_extend_final_block(inode, &epos, &extent, new_elen); } else { - loff_t add = ((loff_t)offset << sb->s_blocksize_bits) | - partial_final_block; - err = udf_do_extend_file(inode, &epos, &extent, add); + err = udf_do_extend_file(inode, &epos, &extent, new_elen); } if (err < 0) From d1a92bb8d697f170d93fe922da763d7d156b8841 Mon Sep 17 00:00:00 2001 From: Szymon Heidrich Date: Tue, 6 Dec 2022 15:13:01 +0100 Subject: [PATCH 360/457] usb: gadget: uvc: Prevent buffer overflow in setup handler commit 4c92670b16727365699fe4b19ed32013bab2c107 upstream. Setup function uvc_function_setup permits control transfer requests with up to 64 bytes of payload (UVC_MAX_REQUEST_SIZE), data stage handler for OUT transfer uses memcpy to copy req->actual bytes to uvc_event->data.data array of size 60. This may result in an overflow of 4 bytes. 
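A minimal userspace sketch of the bounded-copy pattern the fix applies: clamp the length to the destination size before the memcpy. The structure below is a stand-in, not the real UVC event definition.

#include <stdio.h>
#include <string.h>

#define DATA_MAX 60	/* destination capacity, as in the overflow above */

struct fake_event {
	unsigned int length;
	unsigned char data[DATA_MAX];
};

static void copy_request(struct fake_event *ev,
			 const unsigned char *req_buf, size_t req_actual)
{
	/* Never copy more than the destination can hold. */
	size_t len = req_actual < sizeof(ev->data) ?
		     req_actual : sizeof(ev->data);

	ev->length = (unsigned int)len;
	memcpy(ev->data, req_buf, len);
}

int main(void)
{
	unsigned char req[64];	/* control requests may carry up to 64 bytes */
	struct fake_event ev;

	memset(req, 0xab, sizeof(req));
	copy_request(&ev, req, sizeof(req));
	printf("copied %u of %zu bytes\n", ev.length, sizeof(req));
	return 0;
}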
Fixes: cdda479f15cd ("USB gadget: video class function driver") Cc: stable Reviewed-by: Laurent Pinchart Reviewed-by: Daniel Scally Signed-off-by: Szymon Heidrich Link: https://lore.kernel.org/r/20221206141301.51305-1-szymon.heidrich@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/function/f_uvc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/usb/gadget/function/f_uvc.c b/drivers/usb/gadget/function/f_uvc.c index 6e196e06181e..4419b7972e78 100644 --- a/drivers/usb/gadget/function/f_uvc.c +++ b/drivers/usb/gadget/function/f_uvc.c @@ -216,8 +216,9 @@ uvc_function_ep0_complete(struct usb_ep *ep, struct usb_request *req) memset(&v4l2_event, 0, sizeof(v4l2_event)); v4l2_event.type = UVC_EVENT_DATA; - uvc_event->data.length = req->actual; - memcpy(&uvc_event->data.data, req->buf, req->actual); + uvc_event->data.length = min_t(unsigned int, req->actual, + sizeof(uvc_event->data.data)); + memcpy(&uvc_event->data.data, req->buf, uvc_event->data.length); v4l2_event_queue(&uvc->vdev, &v4l2_event); } } From 0b63d587c0d2f87ef15ade56f14b3bede6918636 Mon Sep 17 00:00:00 2001 From: Duke Xin Date: Sat, 19 Nov 2022 17:44:47 +0800 Subject: [PATCH 361/457] USB: serial: option: add Quectel EM05-G modem commit f0052d7a1edb3d8921b4e154aa8c46c4845b3714 upstream. The EM05-G modem has 2 USB configurations that are configurable via the AT command AT+QCFG="usbnet",[ 0 | 2 ] which make the modem enumerate with the following interfaces, respectively: "RMNET" : AT + DIAG + NMEA + Modem + QMI "MBIM" : MBIM + AT + DIAG + NMEA + Modem The detailed description of the USB configuration for each mode as follows: RMNET Mode -------------- T: Bus=01 Lev=01 Prnt=01 Port=00 Cnt=01 Dev#= 21 Spd=480 MxCh= 0 D: Ver= 2.00 Cls=ef(misc ) Sub=02 Prot=01 MxPS=64 #Cfgs= 1 P: Vendor=2c7c ProdID=0311 Rev= 3.18 S: Manufacturer=Quectel S: Product=Quectel EM05-G C:* #Ifs= 5 Cfg#= 1 Atr=a0 MxPwr=500mA I:* If#= 3 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 4 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=83(I) Atr=03(Int.) MxPS= 10 Ivl=32ms E: Ad=82(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=85(I) Atr=03(Int.) MxPS= 10 Ivl=32ms E: Ad=84(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 5 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=87(I) Atr=03(Int.) MxPS= 10 Ivl=32ms E: Ad=86(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 6 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=ff Driver=(none) E: Ad=89(I) Atr=03(Int.) MxPS= 8 Ivl=32ms E: Ad=88(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=05(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms MBIM Mode -------------- T: Bus=01 Lev=01 Prnt=01 Port=00 Cnt=01 Dev#= 16 Spd=480 MxCh= 0 D: Ver= 2.00 Cls=ef(misc ) Sub=02 Prot=01 MxPS=64 #Cfgs= 1 P: Vendor=2c7c ProdID=0311 Rev= 3.18 S: Manufacturer=Quectel S: Product=Quectel EM05-G C:* #Ifs= 6 Cfg#= 1 Atr=a0 MxPwr=500mA A: FirstIf#= 0 IfCount= 2 Cls=02(comm.) Sub=0e Prot=00 I:* If#= 3 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=option E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 4 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=83(I) Atr=03(Int.) 
MxPS= 10 Ivl=32ms E: Ad=82(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=85(I) Atr=03(Int.) MxPS= 10 Ivl=32ms E: Ad=84(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 5 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=87(I) Atr=03(Int.) MxPS= 10 Ivl=32ms E: Ad=86(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 0 Alt= 0 #EPs= 1 Cls=02(comm.) Sub=0e Prot=00 Driver=cdc_mbim E: Ad=89(I) Atr=03(Int.) MxPS= 64 Ivl=32ms I: If#= 1 Alt= 0 #EPs= 0 Cls=0a(data ) Sub=00 Prot=02 Driver=cdc_mbim I:* If#= 1 Alt= 1 #EPs= 2 Cls=0a(data ) Sub=00 Prot=02 Driver=cdc_mbim E: Ad=88(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=05(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms Signed-off-by: Duke Xin Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/option.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index c3b7f1d98e78..dee79c7d82d5 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -255,6 +255,7 @@ static void option_instat_callback(struct urb *urb); #define QUECTEL_PRODUCT_EP06 0x0306 #define QUECTEL_PRODUCT_EM05G 0x030a #define QUECTEL_PRODUCT_EM060K 0x030b +#define QUECTEL_PRODUCT_EM05G_SG 0x0311 #define QUECTEL_PRODUCT_EM12 0x0512 #define QUECTEL_PRODUCT_RM500Q 0x0800 #define QUECTEL_PRODUCT_RM520N 0x0801 @@ -1160,6 +1161,8 @@ static const struct usb_device_id option_ids[] = { { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EP06, 0xff, 0, 0) }, { USB_DEVICE_INTERFACE_CLASS(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM05G, 0xff), .driver_info = RSVD(6) | ZLP }, + { USB_DEVICE_INTERFACE_CLASS(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM05G_SG, 0xff), + .driver_info = RSVD(6) | ZLP }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM060K, 0xff, 0x00, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM060K, 0xff, 0xff, 0x30) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM060K, 0xff, 0xff, 0x40) }, From 736f626ea8b87ab7080d62e06d4dd6422f604ad8 Mon Sep 17 00:00:00 2001 From: Bruno Thomsen Date: Sun, 27 Nov 2022 18:08:11 +0100 Subject: [PATCH 362/457] USB: serial: cp210x: add Kamstrup RF sniffer PIDs commit e88906b169ebcb8046e8f0ad76edd09ab41cfdfe upstream. The RF sniffers are based on cp210x where the RF frontends are based on a different USB stack. RF sniffers can analyze packets meta data including power level and perform packet injection. Can be used to perform RF frontend self-test when connected to a concentrator, ex. 
arch/arm/boot/dts/imx7d-flex-concentrator.dts Signed-off-by: Bruno Thomsen Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/cp210x.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/usb/serial/cp210x.c b/drivers/usb/serial/cp210x.c index 3bcec419f463..f6fb23620e87 100644 --- a/drivers/usb/serial/cp210x.c +++ b/drivers/usb/serial/cp210x.c @@ -195,6 +195,8 @@ static const struct usb_device_id id_table[] = { { USB_DEVICE(0x16DC, 0x0015) }, /* W-IE-NE-R Plein & Baus GmbH CML Control, Monitoring and Data Logger */ { USB_DEVICE(0x17A8, 0x0001) }, /* Kamstrup Optical Eye/3-wire */ { USB_DEVICE(0x17A8, 0x0005) }, /* Kamstrup M-Bus Master MultiPort 250D */ + { USB_DEVICE(0x17A8, 0x0011) }, /* Kamstrup 444 MHz RF sniffer */ + { USB_DEVICE(0x17A8, 0x0013) }, /* Kamstrup 870 MHz RF sniffer */ { USB_DEVICE(0x17A8, 0x0101) }, /* Kamstrup 868 MHz wM-Bus C-Mode Meter Reader (Int Ant) */ { USB_DEVICE(0x17A8, 0x0102) }, /* Kamstrup 868 MHz wM-Bus C-Mode Meter Reader (Ext Ant) */ { USB_DEVICE(0x17F4, 0xAAAA) }, /* Wavesense Jazz blood glucose meter */ From 3c8b21ee14048e5e13d4e5c1f4d6e288519a13a8 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Tue, 29 Nov 2022 15:17:49 +0100 Subject: [PATCH 363/457] USB: serial: f81232: fix division by zero on line-speed change commit a08ca6ebafe615c9028c53fc4c9e6c9b2b1f2888 upstream. The driver leaves the line speed unchanged in case a requested speed is not supported. Make sure to handle the case where the current speed is B0 (hangup) without dividing by zero when determining the clock source. Fixes: 268ddb5e9b62 ("USB: serial: f81232: add high baud rate support") Cc: stable@vger.kernel.org # 5.2 Cc: Ji-Ze Hong (Peter Hong) Reviewed-by: Greg Kroah-Hartman Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/f81232.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/usb/serial/f81232.c b/drivers/usb/serial/f81232.c index 2dd58cd9f0cc..891fb1fe69df 100644 --- a/drivers/usb/serial/f81232.c +++ b/drivers/usb/serial/f81232.c @@ -130,9 +130,6 @@ static u8 const clock_table[] = { F81232_CLK_1_846_MHZ, F81232_CLK_14_77_MHZ, static int calc_baud_divisor(speed_t baudrate, speed_t clockrate) { - if (!baudrate) - return 0; - return DIV_ROUND_CLOSEST(clockrate, baudrate); } @@ -498,9 +495,14 @@ static void f81232_set_baudrate(struct tty_struct *tty, speed_t baud_list[] = { baudrate, old_baudrate, F81232_DEF_BAUDRATE }; for (i = 0; i < ARRAY_SIZE(baud_list); ++i) { - idx = f81232_find_clk(baud_list[i]); + baudrate = baud_list[i]; + if (baudrate == 0) { + tty_encode_baud_rate(tty, 0, 0); + return; + } + + idx = f81232_find_clk(baudrate); if (idx >= 0) { - baudrate = baud_list[i]; tty_encode_baud_rate(tty, baudrate, baudrate); break; } From c0815ea0854831454aa8160bae641eb63914ec5e Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Tue, 29 Nov 2022 15:18:19 +0100 Subject: [PATCH 364/457] USB: serial: f81534: fix division by zero on line-speed change commit 188c9c2e0c7f4ae864113f80c40bafb394062271 upstream. The driver leaves the line speed unchanged in case a requested speed is not supported. Make sure to handle the case where the current speed is B0 (hangup) without dividing by zero when determining the clock source. 
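A minimal userspace sketch of the guard both fixes apply: bail out when the requested rate is B0 (hangup) instead of feeding zero into the divisor calculation. The clock value below is made up for the demo; the real drivers additionally report the rate back with tty_encode_baud_rate().

#include <stdio.h>

#define DIV_ROUND_CLOSEST(x, d)	(((x) + (d) / 2) / (d))

static int pick_divisor(unsigned int baudrate, unsigned int clockrate,
			unsigned int *divisor)
{
	if (baudrate == 0)	/* B0 means "hang up", not "divide by zero" */
		return -1;

	*divisor = DIV_ROUND_CLOSEST(clockrate, baudrate);
	return 0;
}

int main(void)
{
	unsigned int rates[] = { 115200, 0, 9600 };
	unsigned int div, i;

	for (i = 0; i < 3; i++) {
		if (pick_divisor(rates[i], 1846000, &div))
			printf("rate %u: skipped (B0/hangup)\n", rates[i]);
		else
			printf("rate %u: divisor %u\n", rates[i], div);
	}
	return 0;
}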
Fixes: 3aacac02f385 ("USB: serial: f81534: add high baud rate support") Cc: stable@vger.kernel.org # 4.16 Cc: Ji-Ze Hong (Peter Hong) Reviewed-by: Greg Kroah-Hartman Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/f81534.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/usb/serial/f81534.c b/drivers/usb/serial/f81534.c index ddfcd72eb0ae..4083ae961be4 100644 --- a/drivers/usb/serial/f81534.c +++ b/drivers/usb/serial/f81534.c @@ -536,9 +536,6 @@ static int f81534_submit_writer(struct usb_serial_port *port, gfp_t mem_flags) static u32 f81534_calc_baud_divisor(u32 baudrate, u32 clockrate) { - if (!baudrate) - return 0; - /* Round to nearest divisor */ return DIV_ROUND_CLOSEST(clockrate, baudrate); } @@ -568,9 +565,14 @@ static int f81534_set_port_config(struct usb_serial_port *port, u32 baud_list[] = {baudrate, old_baudrate, F81534_DEFAULT_BAUD_RATE}; for (i = 0; i < ARRAY_SIZE(baud_list); ++i) { - idx = f81534_find_clk(baud_list[i]); + baudrate = baud_list[i]; + if (baudrate == 0) { + tty_encode_baud_rate(tty, 0, 0); + return 0; + } + + idx = f81534_find_clk(baudrate); if (idx >= 0) { - baudrate = baud_list[i]; tty_encode_baud_rate(tty, baudrate, baudrate); break; } From c0d91ec1a16a2def5eed92972da448a3d52542b0 Mon Sep 17 00:00:00 2001 From: Andy Chi Date: Mon, 28 Nov 2022 10:28:47 +0800 Subject: [PATCH 365/457] ALSA: hda/realtek: fix mute/micmute LEDs for a HP ProBook commit 1d8025ec722d5e011f9299c46274eb21fb54a428 upstream. There is a HP ProBook which using ALC236 codec and need the ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF quirk to make mute LED and micmute LED work. Signed-off-by: Andy Chi Cc: Link: https://lore.kernel.org/r/20221128022849.13759-1-andy.chi@canonical.com Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/pci/hda/patch_realtek.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index e5c036385666..cf7c825078dc 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -9354,6 +9354,8 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x103c, 0x8abb, "HP ZBook Firefly 14 G9", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8ad1, "HP EliteBook 840 14 inch G9 Notebook PC", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8ad2, "HP EliteBook 860 16 inch G9 Notebook PC", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), + SND_PCI_QUIRK(0x103c, 0x8b5d, "HP", ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF), + SND_PCI_QUIRK(0x103c, 0x8b5e, "HP", ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF), SND_PCI_QUIRK(0x1043, 0x103e, "ASUS X540SA", ALC256_FIXUP_ASUS_MIC), SND_PCI_QUIRK(0x1043, 0x103f, "ASUS TX300", ALC282_FIXUP_ASUS_TX300), SND_PCI_QUIRK(0x1043, 0x106d, "Asus K53BE", ALC269_FIXUP_LIMIT_INT_MIC_BOOST), From c9cacc0ab1ea5457f7bd454c57f60d31a8771fd4 Mon Sep 17 00:00:00 2001 From: Reka Norman Date: Wed, 30 Nov 2022 11:19:40 +0200 Subject: [PATCH 366/457] xhci: Apply XHCI_RESET_TO_DEFAULT quirk to ADL-N commit fed70b61ef2c0aed54456db3d485b215f6cc3209 upstream. ADL-N systems have the same issue as ADL-P, where a large boot firmware delay is seen if USB ports are left in U3 at shutdown. So apply the XHCI_RESET_TO_DEFAULT quirk to ADL-N as well. This patch depends on commit 34cd2db408d5 ("xhci: Add quirk to reset host back to default state at shutdown"). The issue it fixes is a ~20s boot time delay when booting from S5. 
It affects ADL-N devices, and ADL-N support was added starting from v5.16. Cc: stable@vger.kernel.org Signed-off-by: Reka Norman Signed-off-by: Mathias Nyman Link: https://lore.kernel.org/r/20221130091944.2171610-3-mathias.nyman@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci-pci.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/usb/host/xhci-pci.c b/drivers/usb/host/xhci-pci.c index 7bccbe50bab1..f98cf30a3c1a 100644 --- a/drivers/usb/host/xhci-pci.c +++ b/drivers/usb/host/xhci-pci.c @@ -59,6 +59,7 @@ #define PCI_DEVICE_ID_INTEL_TIGER_LAKE_XHCI 0x9a13 #define PCI_DEVICE_ID_INTEL_MAPLE_RIDGE_XHCI 0x1138 #define PCI_DEVICE_ID_INTEL_ALDER_LAKE_PCH_XHCI 0x51ed +#define PCI_DEVICE_ID_INTEL_ALDER_LAKE_N_PCH_XHCI 0x54ed #define PCI_DEVICE_ID_AMD_RENOIR_XHCI 0x1639 #define PCI_DEVICE_ID_AMD_PROMONTORYA_4 0x43b9 @@ -246,7 +247,8 @@ static void xhci_pci_quirks(struct device *dev, struct xhci_hcd *xhci) xhci->quirks |= XHCI_MISSING_CAS; if (pdev->vendor == PCI_VENDOR_ID_INTEL && - pdev->device == PCI_DEVICE_ID_INTEL_ALDER_LAKE_PCH_XHCI) + (pdev->device == PCI_DEVICE_ID_INTEL_ALDER_LAKE_PCH_XHCI || + pdev->device == PCI_DEVICE_ID_INTEL_ALDER_LAKE_N_PCH_XHCI)) xhci->quirks |= XHCI_RESET_TO_DEFAULT; if (pdev->vendor == PCI_VENDOR_ID_INTEL && From 52d5896ba2a4f939d7ff2f26f8e6cba60a8068c4 Mon Sep 17 00:00:00 2001 From: Martin Kaiser Date: Sat, 15 Oct 2022 17:11:06 +0200 Subject: [PATCH 367/457] staging: r8188eu: fix led register settings commit 12c6223fc1804fd9295dc50d358294539b4a4184 upstream. Using an InterTech DMG-02 dongle, the led remains on when the system goes into standby mode. After wakeup, it's no longer possible to control the led. It turned out that the register settings to enable or disable the led were not correct. They worked for some dongles like the Edimax V2 but not for others like the InterTech DMG-02. This patch fixes the register settings. Bit 3 in the led_cfg2 register controls the led status, bit 5 must always be set to be able to control the led, bit 6 has no influence on the led. Setting the mac_pinmux_cfg register is not necessary. These settings were tested with Edimax V2 and InterTech DMG-02. Cc: stable@vger.kernel.org Fixes: 8cd574e6af54 ("staging: r8188eu: introduce new hal dir for RTL8188eu driver") Suggested-by: Michael Straube Signed-off-by: Martin Kaiser Tested-by: Michael Straube # InterTech DMG-02, Tested-by: Philipp Hortmann # Edimax N150 Link: https://lore.kernel.org/r/20221015151115.232095-2-martin@kaiser.cx Signed-off-by: Greg Kroah-Hartman --- drivers/staging/r8188eu/core/rtw_led.c | 25 ++----------------------- 1 file changed, 2 insertions(+), 23 deletions(-) diff --git a/drivers/staging/r8188eu/core/rtw_led.c b/drivers/staging/r8188eu/core/rtw_led.c index 1e316e6358ea..48c5db69929c 100644 --- a/drivers/staging/r8188eu/core/rtw_led.c +++ b/drivers/staging/r8188eu/core/rtw_led.c @@ -32,40 +32,19 @@ static void ResetLedStatus(struct led_priv *pLed) static void SwLedOn(struct adapter *padapter, struct led_priv *pLed) { - u8 LedCfg; - int res; - if (padapter->bDriverStopped) return; - res = rtw_read8(padapter, REG_LEDCFG2, &LedCfg); - if (res) - return; - - rtw_write8(padapter, REG_LEDCFG2, (LedCfg & 0xf0) | BIT(5) | BIT(6)); /* SW control led0 on. */ + rtw_write8(padapter, REG_LEDCFG2, BIT(5)); /* SW control led0 on. 
*/ pLed->bLedOn = true; } static void SwLedOff(struct adapter *padapter, struct led_priv *pLed) { - u8 LedCfg; - int res; - if (padapter->bDriverStopped) goto exit; - res = rtw_read8(padapter, REG_LEDCFG2, &LedCfg);/* 0x4E */ - if (res) - goto exit; - - LedCfg &= 0x90; /* Set to software control. */ - rtw_write8(padapter, REG_LEDCFG2, (LedCfg | BIT(3))); - res = rtw_read8(padapter, REG_MAC_PINMUX_CFG, &LedCfg); - if (res) - goto exit; - - LedCfg &= 0xFE; - rtw_write8(padapter, REG_MAC_PINMUX_CFG, LedCfg); + rtw_write8(padapter, REG_LEDCFG2, BIT(5) | BIT(3)); exit: pLed->bLedOn = false; } From c383c7c35c7bc15e07a04eefa060a8a80cbeae29 Mon Sep 17 00:00:00 2001 From: Tony Nguyen Date: Mon, 12 Dec 2022 11:00:31 -0800 Subject: [PATCH 368/457] igb: Initialize mailbox message for VF reset commit de5dc44370fbd6b46bd7f1a1e00369be54a041c8 upstream. When a MAC address is not assigned to the VF, that portion of the message sent to the VF is not set. The memory, however, is allocated from the stack meaning that information may be leaked to the VM. Initialize the message buffer to 0 so that no information is passed to the VM in this case. Fixes: 6ddbc4cf1f4d ("igb: Indicate failure on vf reset for empty mac address") Reported-by: Akihiko Odaki Signed-off-by: Tony Nguyen Reviewed-by: Akihiko Odaki Reviewed-by: Leon Romanovsky Link: https://lore.kernel.org/r/20221212190031.3983342-1-anthony.l.nguyen@intel.com Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/intel/igb/igb_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index f8e32833226c..473158c09f1d 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -7521,7 +7521,7 @@ static void igb_vf_reset_msg(struct igb_adapter *adapter, u32 vf) { struct e1000_hw *hw = &adapter->hw; unsigned char *vf_mac = adapter->vf_data[vf].vf_mac_addresses; - u32 reg, msgbuf[3]; + u32 reg, msgbuf[3] = {}; u8 *addr = (u8 *)(&msgbuf[1]); /* process all the same items cleared in a function level reset */ From 9222912924fcf56e2d166a503eddbdb5ffd2005f Mon Sep 17 00:00:00 2001 From: Heikki Krogerus Date: Wed, 23 Nov 2022 11:30:21 +0200 Subject: [PATCH 369/457] usb: typec: ucsi: Resume in separate work commit e0dced9c7d4763fd97c86a13902d135f03cc42eb upstream. It can take more than one second to check each connector when the system is resumed. So if you have, say, eight connectors, it may take eight seconds for ucsi_resume() to finish. That's a bit too much. This will modify ucsi_resume() so that it schedules a work where the interface is actually resumed instead of checking the connectors directly. The connections will also be checked in separate tasks which are queued for each connector separately. 
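A self-contained module sketch of the same deferral pattern (illustration only, not the UCSI code): the fast path merely queues a work item on system_long_wq and the slow part runs later in process context.

#include <linux/module.h>
#include <linux/workqueue.h>
#include <linux/delay.h>

static void slow_resume_work(struct work_struct *work)
{
	/* Stand-in for re-enabling notifications and checking connectors. */
	msleep(1000);
	pr_info("deferred resume work finished\n");
}

static DECLARE_WORK(resume_work, slow_resume_work);

static int __init demo_init(void)
{
	/* What a resume callback would do: queue the work and return fast. */
	queue_work(system_long_wq, &resume_work);
	return 0;
}

static void __exit demo_exit(void)
{
	cancel_work_sync(&resume_work);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("deferred-resume work pattern sketch");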
Link: https://bugzilla.kernel.org/show_bug.cgi?id=216706 Fixes: 99f6d4361113 ("usb: typec: ucsi: Check the connection on resume") Cc: Reported-by: Todd Brandt Signed-off-by: Heikki Krogerus Link: https://lore.kernel.org/r/20221123093021.25981-1-heikki.krogerus@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/ucsi/ucsi.c | 17 +++++++++++++---- drivers/usb/typec/ucsi/ucsi.h | 1 + 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/drivers/usb/typec/ucsi/ucsi.c b/drivers/usb/typec/ucsi/ucsi.c index a7987fc764cc..eabe519013e7 100644 --- a/drivers/usb/typec/ucsi/ucsi.c +++ b/drivers/usb/typec/ucsi/ucsi.c @@ -1270,8 +1270,9 @@ err: return ret; } -int ucsi_resume(struct ucsi *ucsi) +static void ucsi_resume_work(struct work_struct *work) { + struct ucsi *ucsi = container_of(work, struct ucsi, resume_work); struct ucsi_connector *con; u64 command; int ret; @@ -1279,15 +1280,21 @@ int ucsi_resume(struct ucsi *ucsi) /* Restore UCSI notification enable mask after system resume */ command = UCSI_SET_NOTIFICATION_ENABLE | ucsi->ntfy; ret = ucsi_send_command(ucsi, command, NULL, 0); - if (ret < 0) - return ret; + if (ret < 0) { + dev_err(ucsi->dev, "failed to re-enable notifications (%d)\n", ret); + return; + } for (con = ucsi->connector; con->port; con++) { mutex_lock(&con->lock); - ucsi_check_connection(con); + ucsi_partner_task(con, ucsi_check_connection, 1, 0); mutex_unlock(&con->lock); } +} +int ucsi_resume(struct ucsi *ucsi) +{ + queue_work(system_long_wq, &ucsi->resume_work); return 0; } EXPORT_SYMBOL_GPL(ucsi_resume); @@ -1347,6 +1354,7 @@ struct ucsi *ucsi_create(struct device *dev, const struct ucsi_operations *ops) if (!ucsi) return ERR_PTR(-ENOMEM); + INIT_WORK(&ucsi->resume_work, ucsi_resume_work); INIT_DELAYED_WORK(&ucsi->work, ucsi_init_work); mutex_init(&ucsi->ppm_lock); ucsi->dev = dev; @@ -1401,6 +1409,7 @@ void ucsi_unregister(struct ucsi *ucsi) /* Make sure that we are not in the middle of driver initialization */ cancel_delayed_work_sync(&ucsi->work); + cancel_work_sync(&ucsi->resume_work); /* Disable notifications */ ucsi->ops->async_write(ucsi, UCSI_CONTROL, &cmd, sizeof(cmd)); diff --git a/drivers/usb/typec/ucsi/ucsi.h b/drivers/usb/typec/ucsi/ucsi.h index 8eb391e3e592..c968474ee547 100644 --- a/drivers/usb/typec/ucsi/ucsi.h +++ b/drivers/usb/typec/ucsi/ucsi.h @@ -287,6 +287,7 @@ struct ucsi { struct ucsi_capability cap; struct ucsi_connector *connector; + struct work_struct resume_work; struct delayed_work work; int work_count; #define UCSI_ROLE_SWITCH_RETRY_PER_HZ 10 From 4e453324803d7a5166eb6e062dd631a47ef46204 Mon Sep 17 00:00:00 2001 From: Shruthi Sanil Date: Fri, 25 Nov 2022 16:23:27 +0530 Subject: [PATCH 370/457] usb: dwc3: pci: Update PCIe device ID for USB3 controller on CPU sub-system for Raptor Lake commit f05f80f217bf52443a2582bca19fd78188333f25 upstream. The device ID 0xa70e is defined for the USB3 device controller in the CPU sub-system of Raptor Lake platform. Hence updating the ID accordingly. 
Fixes: bad0d1d726ac ("usb: dwc3: pci: Add support for Intel Raptor Lake") Cc: stable Reviewed-by: Heikki Krogerus Signed-off-by: Shruthi Sanil Link: https://lore.kernel.org/r/20221125105327.27945-1-shruthi.sanil@intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc3/dwc3-pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/dwc3/dwc3-pci.c b/drivers/usb/dwc3/dwc3-pci.c index fb14511b1e10..89c9ab2b19f8 100644 --- a/drivers/usb/dwc3/dwc3-pci.c +++ b/drivers/usb/dwc3/dwc3-pci.c @@ -45,7 +45,7 @@ #define PCI_DEVICE_ID_INTEL_ADLN 0x465e #define PCI_DEVICE_ID_INTEL_ADLN_PCH 0x54ee #define PCI_DEVICE_ID_INTEL_ADLS 0x7ae1 -#define PCI_DEVICE_ID_INTEL_RPL 0x460e +#define PCI_DEVICE_ID_INTEL_RPL 0xa70e #define PCI_DEVICE_ID_INTEL_RPLS 0x7a61 #define PCI_DEVICE_ID_INTEL_MTLP 0x7ec1 #define PCI_DEVICE_ID_INTEL_MTL 0x7e7e From e8d16a54842d609fd4a3ed2d81d4333d6329aa94 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Sun, 11 Dec 2022 18:18:55 -0300 Subject: [PATCH 371/457] cifs: fix oops during encryption commit f7f291e14dde32a07b1f0aa06921d28f875a7b54 upstream. When running xfstests against Azure the following oops occurred on an arm64 system Unable to handle kernel write to read-only memory at virtual address ffff0001221cf000 Mem abort info: ESR = 0x9600004f EC = 0x25: DABT (current EL), IL = 32 bits SET = 0, FnV = 0 EA = 0, S1PTW = 0 FSC = 0x0f: level 3 permission fault Data abort info: ISV = 0, ISS = 0x0000004f CM = 0, WnR = 1 swapper pgtable: 4k pages, 48-bit VAs, pgdp=00000000294f3000 [ffff0001221cf000] pgd=18000001ffff8003, p4d=18000001ffff8003, pud=18000001ff82e003, pmd=18000001ff71d003, pte=00600001221cf787 Internal error: Oops: 9600004f [#1] PREEMPT SMP ... pstate: 80000005 (Nzcv daif -PAN -UAO -TCO BTYPE=--) pc : __memcpy+0x40/0x230 lr : scatterwalk_copychunks+0xe0/0x200 sp : ffff800014e92de0 x29: ffff800014e92de0 x28: ffff000114f9de80 x27: 0000000000000008 x26: 0000000000000008 x25: ffff800014e92e78 x24: 0000000000000008 x23: 0000000000000001 x22: 0000040000000000 x21: ffff000000000000 x20: 0000000000000001 x19: ffff0001037c4488 x18: 0000000000000014 x17: 235e1c0d6efa9661 x16: a435f9576b6edd6c x15: 0000000000000058 x14: 0000000000000001 x13: 0000000000000008 x12: ffff000114f2e590 x11: ffffffffffffffff x10: 0000040000000000 x9 : ffff8000105c3580 x8 : 2e9413b10000001a x7 : 534b4410fb86b005 x6 : 534b4410fb86b005 x5 : ffff0001221cf008 x4 : ffff0001037c4490 x3 : 0000000000000001 x2 : 0000000000000008 x1 : ffff0001037c4488 x0 : ffff0001221cf000 Call trace: __memcpy+0x40/0x230 scatterwalk_map_and_copy+0x98/0x100 crypto_ccm_encrypt+0x150/0x180 crypto_aead_encrypt+0x2c/0x40 crypt_message+0x750/0x880 smb3_init_transform_rq+0x298/0x340 smb_send_rqst.part.11+0xd8/0x180 smb_send_rqst+0x3c/0x100 compound_send_recv+0x534/0xbc0 smb2_query_info_compound+0x32c/0x440 smb2_set_ea+0x438/0x4c0 cifs_xattr_set+0x5d4/0x7c0 This is because in scatterwalk_copychunks(), we attempted to write to a buffer (@sign) that was allocated in the stack (vmalloc area) by crypt_message() and thus accessing its remaining 8 (x2) bytes ended up crossing a page boundary. To simply fix it, we could just pass @sign kmalloc'd from crypt_message() and then we're done. Luckily, we don't seem to pass any other vmalloc'd buffers in smb_rqst::rq_iov... Instead, let's map the correct pages and offsets from vmalloc buffers as well in cifs_sg_set_buf() and then avoiding such oopses. 
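The core idea, sketched in isolation: a vmalloc/vmap address (including a VMAP_STACK stack object) has no linear mapping, so the buffer must be added to the scatterlist one page at a time via vmalloc_to_page(), instead of assuming virt_to_page() covers the whole range. The helper name sg_set_any_buf() below is hypothetical; the real helper added by this patch is cifs_sg_set_buf() in the diff that follows.

        /* Sketch: add a buffer that may live in vmalloc/vmap space to a scatterlist. */
        static struct scatterlist *sg_set_any_buf(struct scatterlist *sg,
                                                  const void *buf, unsigned int len)
        {
                unsigned int off = offset_in_page(buf);

                if (is_vmalloc_addr(buf)) {
                        while (len) {
                                unsigned int n = min_t(unsigned int, len,
                                                       PAGE_SIZE - off);

                                sg_set_page(sg++, vmalloc_to_page(buf), n, off);
                                buf += n;
                                len -= n;
                                off = 0;
                        }
                } else {
                        /* lowmem is physically contiguous; one entry is enough */
                        sg_set_page(sg++, virt_to_page(buf), len, off);
                }
                return sg;
        }
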
Signed-off-by: Paulo Alcantara (SUSE) Cc: stable@vger.kernel.org Signed-off-by: Steve French Signed-off-by: Greg Kroah-Hartman --- fs/cifs/cifsglob.h | 68 ++++++++++++++++++++ fs/cifs/cifsproto.h | 4 +- fs/cifs/misc.c | 4 +- fs/cifs/smb2ops.c | 147 +++++++++++++++++++++----------------------- 4 files changed, 142 insertions(+), 81 deletions(-) diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 1420acf987f0..157d3c0e3cc7 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -13,6 +13,8 @@ #include #include #include +#include +#include #include #include #include @@ -2137,4 +2139,70 @@ static inline void move_cifs_info_to_smb2(struct smb2_file_all_info *dst, const dst->FileNameLength = src->FileNameLength; } +static inline unsigned int cifs_get_num_sgs(const struct smb_rqst *rqst, + int num_rqst, + const u8 *sig) +{ + unsigned int len, skip; + unsigned int nents = 0; + unsigned long addr; + int i, j; + + /* Assumes the first rqst has a transform header as the first iov. + * I.e. + * rqst[0].rq_iov[0] is transform header + * rqst[0].rq_iov[1+] data to be encrypted/decrypted + * rqst[1+].rq_iov[0+] data to be encrypted/decrypted + */ + for (i = 0; i < num_rqst; i++) { + /* + * The first rqst has a transform header where the + * first 20 bytes are not part of the encrypted blob. + */ + for (j = 0; j < rqst[i].rq_nvec; j++) { + struct kvec *iov = &rqst[i].rq_iov[j]; + + skip = (i == 0) && (j == 0) ? 20 : 0; + addr = (unsigned long)iov->iov_base + skip; + if (unlikely(is_vmalloc_addr((void *)addr))) { + len = iov->iov_len - skip; + nents += DIV_ROUND_UP(offset_in_page(addr) + len, + PAGE_SIZE); + } else { + nents++; + } + } + nents += rqst[i].rq_npages; + } + nents += DIV_ROUND_UP(offset_in_page(sig) + SMB2_SIGNATURE_SIZE, PAGE_SIZE); + return nents; +} + +/* We can not use the normal sg_set_buf() as we will sometimes pass a + * stack object as buf. 
+ */ +static inline struct scatterlist *cifs_sg_set_buf(struct scatterlist *sg, + const void *buf, + unsigned int buflen) +{ + unsigned long addr = (unsigned long)buf; + unsigned int off = offset_in_page(addr); + + addr &= PAGE_MASK; + if (unlikely(is_vmalloc_addr((void *)addr))) { + do { + unsigned int len = min_t(unsigned int, buflen, PAGE_SIZE - off); + + sg_set_page(sg++, vmalloc_to_page((void *)addr), len, off); + + off = 0; + addr += PAGE_SIZE; + buflen -= len; + } while (buflen); + } else { + sg_set_page(sg++, virt_to_page(addr), buflen, off); + } + return sg; +} + #endif /* _CIFS_GLOB_H */ diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 83e83d8beabb..eb1a0de9dd55 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -600,8 +600,8 @@ int setup_aio_ctx_iter(struct cifs_aio_ctx *ctx, struct iov_iter *iter, int rw); int cifs_alloc_hash(const char *name, struct shash_desc **sdesc); void cifs_free_hash(struct shash_desc **sdesc); -extern void rqst_page_get_length(struct smb_rqst *rqst, unsigned int page, - unsigned int *len, unsigned int *offset); +void rqst_page_get_length(const struct smb_rqst *rqst, unsigned int page, + unsigned int *len, unsigned int *offset); struct cifs_chan * cifs_ses_find_chan(struct cifs_ses *ses, struct TCP_Server_Info *server); int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses); diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 3e68d8208cf5..1cbecd64d697 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -1136,8 +1136,8 @@ cifs_free_hash(struct shash_desc **sdesc) * @len: Where to store the length for this page: * @offset: Where to store the offset for this page */ -void rqst_page_get_length(struct smb_rqst *rqst, unsigned int page, - unsigned int *len, unsigned int *offset) +void rqst_page_get_length(const struct smb_rqst *rqst, unsigned int page, + unsigned int *len, unsigned int *offset) { *len = rqst->rq_pagesz; *offset = (page == 0) ? rqst->rq_offset : 0; diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index bfaafd02fb1f..b24e68b5ccd6 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -4204,69 +4204,82 @@ fill_transform_hdr(struct smb2_transform_hdr *tr_hdr, unsigned int orig_len, memcpy(&tr_hdr->SessionId, &shdr->SessionId, 8); } -/* We can not use the normal sg_set_buf() as we will sometimes pass a - * stack object as buf. - */ -static inline void smb2_sg_set_buf(struct scatterlist *sg, const void *buf, - unsigned int buflen) +static void *smb2_aead_req_alloc(struct crypto_aead *tfm, const struct smb_rqst *rqst, + int num_rqst, const u8 *sig, u8 **iv, + struct aead_request **req, struct scatterlist **sgl, + unsigned int *num_sgs) { - void *addr; - /* - * VMAP_STACK (at least) puts stack into the vmalloc address space - */ - if (is_vmalloc_addr(buf)) - addr = vmalloc_to_page(buf); - else - addr = virt_to_page(buf); - sg_set_page(sg, addr, buflen, offset_in_page(buf)); -} + unsigned int req_size = sizeof(**req) + crypto_aead_reqsize(tfm); + unsigned int iv_size = crypto_aead_ivsize(tfm); + unsigned int len; + u8 *p; -/* Assumes the first rqst has a transform header as the first iov. - * I.e. 
- * rqst[0].rq_iov[0] is transform header - * rqst[0].rq_iov[1+] data to be encrypted/decrypted - * rqst[1+].rq_iov[0+] data to be encrypted/decrypted - */ -static struct scatterlist * -init_sg(int num_rqst, struct smb_rqst *rqst, u8 *sign) -{ - unsigned int sg_len; - struct scatterlist *sg; - unsigned int i; - unsigned int j; - unsigned int idx = 0; - int skip; + *num_sgs = cifs_get_num_sgs(rqst, num_rqst, sig); - sg_len = 1; - for (i = 0; i < num_rqst; i++) - sg_len += rqst[i].rq_nvec + rqst[i].rq_npages; + len = iv_size; + len += crypto_aead_alignmask(tfm) & ~(crypto_tfm_ctx_alignment() - 1); + len = ALIGN(len, crypto_tfm_ctx_alignment()); + len += req_size; + len = ALIGN(len, __alignof__(struct scatterlist)); + len += *num_sgs * sizeof(**sgl); - sg = kmalloc_array(sg_len, sizeof(struct scatterlist), GFP_KERNEL); - if (!sg) + p = kmalloc(len, GFP_ATOMIC); + if (!p) return NULL; - sg_init_table(sg, sg_len); + *iv = (u8 *)PTR_ALIGN(p, crypto_aead_alignmask(tfm) + 1); + *req = (struct aead_request *)PTR_ALIGN(*iv + iv_size, + crypto_tfm_ctx_alignment()); + *sgl = (struct scatterlist *)PTR_ALIGN((u8 *)*req + req_size, + __alignof__(struct scatterlist)); + return p; +} + +static void *smb2_get_aead_req(struct crypto_aead *tfm, const struct smb_rqst *rqst, + int num_rqst, const u8 *sig, u8 **iv, + struct aead_request **req, struct scatterlist **sgl) +{ + unsigned int off, len, skip; + struct scatterlist *sg; + unsigned int num_sgs; + unsigned long addr; + int i, j; + void *p; + + p = smb2_aead_req_alloc(tfm, rqst, num_rqst, sig, iv, req, sgl, &num_sgs); + if (!p) + return NULL; + + sg_init_table(*sgl, num_sgs); + sg = *sgl; + + /* Assumes the first rqst has a transform header as the first iov. + * I.e. + * rqst[0].rq_iov[0] is transform header + * rqst[0].rq_iov[1+] data to be encrypted/decrypted + * rqst[1+].rq_iov[0+] data to be encrypted/decrypted + */ for (i = 0; i < num_rqst; i++) { + /* + * The first rqst has a transform header where the + * first 20 bytes are not part of the encrypted blob. + */ for (j = 0; j < rqst[i].rq_nvec; j++) { - /* - * The first rqst has a transform header where the - * first 20 bytes are not part of the encrypted blob - */ + struct kvec *iov = &rqst[i].rq_iov[j]; + skip = (i == 0) && (j == 0) ? 
20 : 0; - smb2_sg_set_buf(&sg[idx++], - rqst[i].rq_iov[j].iov_base + skip, - rqst[i].rq_iov[j].iov_len - skip); - } - + addr = (unsigned long)iov->iov_base + skip; + len = iov->iov_len - skip; + sg = cifs_sg_set_buf(sg, (void *)addr, len); + } for (j = 0; j < rqst[i].rq_npages; j++) { - unsigned int len, offset; - - rqst_page_get_length(&rqst[i], j, &len, &offset); - sg_set_page(&sg[idx++], rqst[i].rq_pages[j], len, offset); + rqst_page_get_length(&rqst[i], j, &len, &off); + sg_set_page(sg++, rqst[i].rq_pages[j], len, off); } } - smb2_sg_set_buf(&sg[idx], sign, SMB2_SIGNATURE_SIZE); - return sg; + cifs_sg_set_buf(sg, sig, SMB2_SIGNATURE_SIZE); + + return p; } static int @@ -4314,11 +4327,11 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst, u8 sign[SMB2_SIGNATURE_SIZE] = {}; u8 key[SMB3_ENC_DEC_KEY_SIZE]; struct aead_request *req; - char *iv; - unsigned int iv_len; + u8 *iv; DECLARE_CRYPTO_WAIT(wait); struct crypto_aead *tfm; unsigned int crypt_len = le32_to_cpu(tr_hdr->OriginalMessageSize); + void *creq; rc = smb2_get_enc_key(server, le64_to_cpu(tr_hdr->SessionId), enc, key); if (rc) { @@ -4352,32 +4365,15 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst, return rc; } - req = aead_request_alloc(tfm, GFP_KERNEL); - if (!req) { - cifs_server_dbg(VFS, "%s: Failed to alloc aead request\n", __func__); + creq = smb2_get_aead_req(tfm, rqst, num_rqst, sign, &iv, &req, &sg); + if (unlikely(!creq)) return -ENOMEM; - } if (!enc) { memcpy(sign, &tr_hdr->Signature, SMB2_SIGNATURE_SIZE); crypt_len += SMB2_SIGNATURE_SIZE; } - sg = init_sg(num_rqst, rqst, sign); - if (!sg) { - cifs_server_dbg(VFS, "%s: Failed to init sg\n", __func__); - rc = -ENOMEM; - goto free_req; - } - - iv_len = crypto_aead_ivsize(tfm); - iv = kzalloc(iv_len, GFP_KERNEL); - if (!iv) { - cifs_server_dbg(VFS, "%s: Failed to alloc iv\n", __func__); - rc = -ENOMEM; - goto free_sg; - } - if ((server->cipher_type == SMB2_ENCRYPTION_AES128_GCM) || (server->cipher_type == SMB2_ENCRYPTION_AES256_GCM)) memcpy(iv, (char *)tr_hdr->Nonce, SMB3_AES_GCM_NONCE); @@ -4386,6 +4382,7 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst, memcpy(iv + 1, (char *)tr_hdr->Nonce, SMB3_AES_CCM_NONCE); } + aead_request_set_tfm(req, tfm); aead_request_set_crypt(req, sg, sg, crypt_len, iv); aead_request_set_ad(req, assoc_data_len); @@ -4398,11 +4395,7 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst, if (!rc && enc) memcpy(&tr_hdr->Signature, sign, SMB2_SIGNATURE_SIZE); - kfree_sensitive(iv); -free_sg: - kfree_sensitive(sg); -free_req: - kfree_sensitive(req); + kfree_sensitive(creq); return rc; } From 1d1a710c1983819bdceaaae83cda309a84f51ea7 Mon Sep 17 00:00:00 2001 From: Nikolaus Voss Date: Wed, 19 Oct 2022 18:38:20 +0200 Subject: [PATCH 372/457] KEYS: encrypted: fix key instantiation with user-provided data commit 5adedd42245af0860ebda8fe0949f24f5204c1b1 upstream. Commit cd3bc044af48 ("KEYS: encrypted: Instantiate key with user-provided decrypted data") added key instantiation with user provided decrypted data. The user data is hex-ascii-encoded but was just memcpy'ed to the binary buffer. Fix this to use hex2bin instead. Old keys created from user provided decrypted data saved with "keyctl pipe" are still valid, however if the key is recreated from decrypted data the old key must be converted to the correct format. 
This can be done with a small shell script, e.g.: BROKENKEY=abcdefABCDEF1234567890aaaaaaaaaa NEWKEY=$(echo -ne $BROKENKEY | xxd -p -c32) keyctl add user masterkey "$(cat masterkey.bin)" @u keyctl add encrypted testkey "new user:masterkey 32 $NEWKEY" @u However, NEWKEY is still broken: If for BROKENKEY 32 bytes were specified, a brute force attacker knowing the key properties would only need to try at most 2^(16*8) keys, as if the key was only 16 bytes long. The security issue is a result of the combination of limiting the input range to hex-ascii and using memcpy() instead of hex2bin(). It could have been fixed either by allowing binary input or using hex2bin() (and doubling the ascii input key length). This patch implements the latter. The corresponding test for the Linux Test Project ltp has also been fixed (see link below). Fixes: cd3bc044af48 ("KEYS: encrypted: Instantiate key with user-provided decrypted data") Cc: stable@kernel.org Link: https://lore.kernel.org/ltp/20221006081709.92303897@mail.steuer-voss.de/ Reviewed-by: Mimi Zohar Signed-off-by: Nikolaus Voss Signed-off-by: Mimi Zohar Signed-off-by: Greg Kroah-Hartman --- Documentation/security/keys/trusted-encrypted.rst | 3 ++- security/keys/encrypted-keys/encrypted.c | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/Documentation/security/keys/trusted-encrypted.rst b/Documentation/security/keys/trusted-encrypted.rst index 0bfb4c339748..9bc9db8ec651 100644 --- a/Documentation/security/keys/trusted-encrypted.rst +++ b/Documentation/security/keys/trusted-encrypted.rst @@ -350,7 +350,8 @@ Load an encrypted key "evm" from saved blob:: Instantiate an encrypted key "evm" using user-provided decrypted data:: - $ keyctl add encrypted evm "new default user:kmk 32 `cat evm_decrypted_data.blob`" @u + $ evmkey=$(dd if=/dev/urandom bs=1 count=32 | xxd -c32 -p) + $ keyctl add encrypted evm "new default user:kmk 32 $evmkey" @u 794890253 $ keyctl print 794890253 diff --git a/security/keys/encrypted-keys/encrypted.c b/security/keys/encrypted-keys/encrypted.c index e05cfc2e49ae..1e313982af02 100644 --- a/security/keys/encrypted-keys/encrypted.c +++ b/security/keys/encrypted-keys/encrypted.c @@ -627,7 +627,7 @@ static struct encrypted_key_payload *encrypted_key_alloc(struct key *key, pr_err("encrypted key: instantiation of keys using provided decrypted data is disabled since CONFIG_USER_DECRYPTED_DATA is set to false\n"); return ERR_PTR(-EINVAL); } - if (strlen(decrypted_data) != decrypted_datalen) { + if (strlen(decrypted_data) != decrypted_datalen * 2) { pr_err("encrypted key: decrypted data provided does not match decrypted data length provided\n"); return ERR_PTR(-EINVAL); } @@ -791,8 +791,8 @@ static int encrypted_init(struct encrypted_key_payload *epayload, ret = encrypted_key_decrypt(epayload, format, hex_encoded_iv); } else if (decrypted_data) { get_random_bytes(epayload->iv, ivsize); - memcpy(epayload->decrypted_data, decrypted_data, - epayload->decrypted_datalen); + ret = hex2bin(epayload->decrypted_data, decrypted_data, + epayload->decrypted_datalen); } else { get_random_bytes(epayload->iv, ivsize); get_random_bytes(epayload->decrypted_data, epayload->decrypted_datalen); From ebdb69c5b054f115ef5ff72f0bb2aaa1718904e6 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 21 Dec 2022 17:48:12 +0100 Subject: [PATCH 373/457] Linux 6.1.1 Link: https://lore.kernel.org/r/20221219182943.395169070@linuxfoundation.org Tested-by: Ronald Warsow Tested-by: Shuah Khan Tested-by: Florian Fainelli Tested-by: Bagas Sanjaya 
Tested-by: Ron Economos Tested-by: Rudi Heitbaum Tested-by: Linux Kernel Functional Testing Tested-by: Sudip Mukherjee Tested-by: Jon Hunter Tested-by: Allen Pais Tested-by: Slade Watkins Tested-by: Justin M. Forbes Signed-off-by: Greg Kroah-Hartman --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 997b67722292..7307ae6c2ef7 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 VERSION = 6 PATCHLEVEL = 1 -SUBLEVEL = 0 +SUBLEVEL = 1 EXTRAVERSION = NAME = Hurr durr I'ma ninja sloth From ae23e29f7a152de2cc45ac7e78d0700c3b629f18 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 10 Nov 2022 00:29:42 -0800 Subject: [PATCH 374/457] fscrypt: pass super_block to fscrypt_put_master_key_activeref() As this code confused Linus [1], pass the super_block as an argument to fscrypt_put_master_key_activeref(). This removes the need to have the back-pointer ->mk_sb, so remove that. [1] https://lore.kernel.org/linux-fscrypt/CAHk-=wgud4Bc_um+htgfagYpZAnOoCb3NUoW67hc9LhOKsMtJg@mail.gmail.com Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20221110082942.351615-1-ebiggers@kernel.org --- fs/crypto/fscrypt_private.h | 13 ++++--------- fs/crypto/keyring.c | 14 ++++++-------- fs/crypto/keysetup.c | 2 +- 3 files changed, 11 insertions(+), 18 deletions(-) diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index d5f68a0c5d15..316a778cec0f 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -439,13 +439,7 @@ struct fscrypt_master_key_secret { struct fscrypt_master_key { /* - * Back-pointer to the super_block of the filesystem to which this - * master key has been added. Only valid if ->mk_active_refs > 0. - */ - struct super_block *mk_sb; - - /* - * Link in ->mk_sb->s_master_keys->key_hashtable. + * Link in ->s_master_keys->key_hashtable. * Only valid if ->mk_active_refs > 0. */ struct hlist_node mk_node; @@ -456,7 +450,7 @@ struct fscrypt_master_key { /* * Active and structural reference counts. An active ref guarantees * that the struct continues to exist, continues to be in the keyring - * ->mk_sb->s_master_keys, and that any embedded subkeys (e.g. + * ->s_master_keys, and that any embedded subkeys (e.g. * ->mk_direct_keys) that have been prepared continue to exist. * A structural ref only guarantees that the struct continues to exist. * @@ -569,7 +563,8 @@ static inline int master_key_spec_len(const struct fscrypt_key_specifier *spec) void fscrypt_put_master_key(struct fscrypt_master_key *mk); -void fscrypt_put_master_key_activeref(struct fscrypt_master_key *mk); +void fscrypt_put_master_key_activeref(struct super_block *sb, + struct fscrypt_master_key *mk); struct fscrypt_master_key * fscrypt_find_master_key(struct super_block *sb, diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c index 2a24b1f0ae68..78dd2ff306bd 100644 --- a/fs/crypto/keyring.c +++ b/fs/crypto/keyring.c @@ -79,10 +79,9 @@ void fscrypt_put_master_key(struct fscrypt_master_key *mk) call_rcu(&mk->mk_rcu_head, fscrypt_free_master_key); } -void fscrypt_put_master_key_activeref(struct fscrypt_master_key *mk) +void fscrypt_put_master_key_activeref(struct super_block *sb, + struct fscrypt_master_key *mk) { - struct super_block *sb = mk->mk_sb; - struct fscrypt_keyring *keyring = sb->s_master_keys; size_t i; if (!refcount_dec_and_test(&mk->mk_active_refs)) @@ -93,9 +92,9 @@ void fscrypt_put_master_key_activeref(struct fscrypt_master_key *mk) * destroying any subkeys embedded in it. 
*/ - spin_lock(&keyring->lock); + spin_lock(&sb->s_master_keys->lock); hlist_del_rcu(&mk->mk_node); - spin_unlock(&keyring->lock); + spin_unlock(&sb->s_master_keys->lock); /* * ->mk_active_refs == 0 implies that ->mk_secret is not present and @@ -243,7 +242,7 @@ void fscrypt_destroy_keyring(struct super_block *sb) WARN_ON(refcount_read(&mk->mk_struct_refs) != 1); WARN_ON(!is_master_key_secret_present(&mk->mk_secret)); wipe_master_key_secret(&mk->mk_secret); - fscrypt_put_master_key_activeref(mk); + fscrypt_put_master_key_activeref(sb, mk); } } kfree_sensitive(keyring); @@ -424,7 +423,6 @@ static int add_new_master_key(struct super_block *sb, if (!mk) return -ENOMEM; - mk->mk_sb = sb; init_rwsem(&mk->mk_sem); refcount_set(&mk->mk_struct_refs, 1); mk->mk_spec = *mk_spec; @@ -1068,7 +1066,7 @@ static int do_remove_key(struct file *filp, void __user *_uarg, bool all_users) err = -ENOKEY; if (is_master_key_secret_present(&mk->mk_secret)) { wipe_master_key_secret(&mk->mk_secret); - fscrypt_put_master_key_activeref(mk); + fscrypt_put_master_key_activeref(sb, mk); err = 0; } inodes_remain = refcount_read(&mk->mk_active_refs) > 0; diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c index f7407071a952..9e44dc078a81 100644 --- a/fs/crypto/keysetup.c +++ b/fs/crypto/keysetup.c @@ -509,7 +509,7 @@ static void put_crypt_info(struct fscrypt_info *ci) spin_lock(&mk->mk_decrypted_inodes_lock); list_del(&ci->ci_master_key_link); spin_unlock(&mk->mk_decrypted_inodes_lock); - fscrypt_put_master_key_activeref(mk); + fscrypt_put_master_key_activeref(ci->ci_inode->i_sb, mk); } memzero_explicit(ci, sizeof(*ci)); kmem_cache_free(fscrypt_info_cachep, ci); From 8a2a58063603f4e6ca8e8405aa692e98794090d8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 14 Nov 2022 05:29:42 +0100 Subject: [PATCH 375/457] blk-crypto: don't use struct request_queue for public interfaces Switch all public blk-crypto interfaces to use struct block_device arguments to specify the device they operate on instead of th request_queue, which is a block layer implementation detail. Signed-off-by: Christoph Hellwig Reviewed-by: Eric Biggers Link: https://lore.kernel.org/r/20221114042944.1009870-2-hch@lst.de Signed-off-by: Jens Axboe --- Documentation/block/inline-encryption.rst | 12 ++++++------ block/blk-crypto.c | 24 +++++++++++++---------- drivers/md/dm-table.c | 2 +- fs/crypto/inline_crypt.c | 8 +++----- include/linux/blk-crypto.h | 11 ++++------- 5 files changed, 28 insertions(+), 29 deletions(-) diff --git a/Documentation/block/inline-encryption.rst b/Documentation/block/inline-encryption.rst index 4d151fbe2058..f9bf18ea6509 100644 --- a/Documentation/block/inline-encryption.rst +++ b/Documentation/block/inline-encryption.rst @@ -142,7 +142,7 @@ Therefore, we also introduce *blk-crypto-fallback*, which is an implementation of inline encryption using the kernel crypto API. blk-crypto-fallback is built into the block layer, so it works on any block device without any special setup. Essentially, when a bio with an encryption context is submitted to a -request_queue that doesn't support that encryption context, the block layer will +block_device that doesn't support that encryption context, the block layer will handle en/decryption of the bio using blk-crypto-fallback. 
For encryption, the data cannot be encrypted in-place, as callers usually rely @@ -187,7 +187,7 @@ API presented to users of the block layer ``blk_crypto_config_supported()`` allows users to check ahead of time whether inline encryption with particular crypto settings will work on a particular -request_queue -- either via hardware or via blk-crypto-fallback. This function +block_device -- either via hardware or via blk-crypto-fallback. This function takes in a ``struct blk_crypto_config`` which is like blk_crypto_key, but omits the actual bytes of the key and instead just contains the algorithm, data unit size, etc. This function can be useful if blk-crypto-fallback is disabled. @@ -195,7 +195,7 @@ size, etc. This function can be useful if blk-crypto-fallback is disabled. ``blk_crypto_init_key()`` allows users to initialize a blk_crypto_key. Users must call ``blk_crypto_start_using_key()`` before actually starting to use -a blk_crypto_key on a request_queue (even if ``blk_crypto_config_supported()`` +a blk_crypto_key on a block_device (even if ``blk_crypto_config_supported()`` was called earlier). This is needed to initialize blk-crypto-fallback if it will be needed. This must not be called from the data path, as this may have to allocate resources, which may deadlock in that case. @@ -207,7 +207,7 @@ for en/decryption. Users don't need to worry about freeing the bio_crypt_ctx later, as that happens automatically when the bio is freed or reset. Finally, when done using inline encryption with a blk_crypto_key on a -request_queue, users must call ``blk_crypto_evict_key()``. This ensures that +block_device, users must call ``blk_crypto_evict_key()``. This ensures that the key is evicted from all keyslots it may be programmed into and unlinked from any kernel data structures it may be linked into. @@ -221,9 +221,9 @@ as follows: 5. ``blk_crypto_evict_key()`` (after all I/O has completed) 6. Zeroize the blk_crypto_key (this has no dedicated function) -If a blk_crypto_key is being used on multiple request_queues, then +If a blk_crypto_key is being used on multiple block_devices, then ``blk_crypto_config_supported()`` (if used), ``blk_crypto_start_using_key()``, -and ``blk_crypto_evict_key()`` must be called on each request_queue. +and ``blk_crypto_evict_key()`` must be called on each block_device. API presented to device drivers =============================== diff --git a/block/blk-crypto.c b/block/blk-crypto.c index a496aaef85ba..0047436b6337 100644 --- a/block/blk-crypto.c +++ b/block/blk-crypto.c @@ -354,20 +354,21 @@ int blk_crypto_init_key(struct blk_crypto_key *blk_key, const u8 *raw_key, /* * Check if bios with @cfg can be en/decrypted by blk-crypto (i.e. either the - * request queue it's submitted to supports inline crypto, or the + * block_device it's submitted to supports inline crypto, or the * blk-crypto-fallback is enabled and supports the cfg). 
*/ -bool blk_crypto_config_supported(struct request_queue *q, +bool blk_crypto_config_supported(struct block_device *bdev, const struct blk_crypto_config *cfg) { return IS_ENABLED(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) || - __blk_crypto_cfg_supported(q->crypto_profile, cfg); + __blk_crypto_cfg_supported(bdev_get_queue(bdev)->crypto_profile, + cfg); } /** * blk_crypto_start_using_key() - Start using a blk_crypto_key on a device + * @bdev: block device to operate on * @key: A key to use on the device - * @q: the request queue for the device * * Upper layers must call this function to ensure that either the hardware * supports the key's crypto settings, or the crypto API fallback has transforms @@ -379,10 +380,11 @@ bool blk_crypto_config_supported(struct request_queue *q, * blk-crypto-fallback is either disabled or the needed algorithm * is disabled in the crypto API; or another -errno code. */ -int blk_crypto_start_using_key(const struct blk_crypto_key *key, - struct request_queue *q) +int blk_crypto_start_using_key(struct block_device *bdev, + const struct blk_crypto_key *key) { - if (__blk_crypto_cfg_supported(q->crypto_profile, &key->crypto_cfg)) + if (__blk_crypto_cfg_supported(bdev_get_queue(bdev)->crypto_profile, + &key->crypto_cfg)) return 0; return blk_crypto_fallback_start_using_mode(key->crypto_cfg.crypto_mode); } @@ -390,7 +392,7 @@ int blk_crypto_start_using_key(const struct blk_crypto_key *key, /** * blk_crypto_evict_key() - Evict a key from any inline encryption hardware * it may have been programmed into - * @q: The request queue who's associated inline encryption hardware this key + * @bdev: The block_device who's associated inline encryption hardware this key * might have been programmed into * @key: The key to evict * @@ -400,14 +402,16 @@ int blk_crypto_start_using_key(const struct blk_crypto_key *key, * * Return: 0 on success or if the key wasn't in any keyslot; -errno on error. */ -int blk_crypto_evict_key(struct request_queue *q, +int blk_crypto_evict_key(struct block_device *bdev, const struct blk_crypto_key *key) { + struct request_queue *q = bdev_get_queue(bdev); + if (__blk_crypto_cfg_supported(q->crypto_profile, &key->crypto_cfg)) return __blk_crypto_evict_key(q->crypto_profile, key); /* - * If the request_queue didn't support the key, then blk-crypto-fallback + * If the block_device didn't support the key, then blk-crypto-fallback * may have been used, so try to evict the key from blk-crypto-fallback. */ return blk_crypto_fallback_evict_key(key); diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 078da18bb86d..8541d5688f3a 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1215,7 +1215,7 @@ static int dm_keyslot_evict_callback(struct dm_target *ti, struct dm_dev *dev, struct dm_keyslot_evict_args *args = data; int err; - err = blk_crypto_evict_key(bdev_get_queue(dev->bdev), args->key); + err = blk_crypto_evict_key(dev->bdev, args->key); if (!args->err) args->err = err; /* Always try to evict the key from all devices. 
*/ diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c index cea8b14007e6..55c4d8c23d30 100644 --- a/fs/crypto/inline_crypt.c +++ b/fs/crypto/inline_crypt.c @@ -139,8 +139,7 @@ int fscrypt_select_encryption_impl(struct fscrypt_info *ci) return PTR_ERR(devs); for (i = 0; i < num_devs; i++) { - if (!blk_crypto_config_supported(bdev_get_queue(devs[i]), - &crypto_cfg)) + if (!blk_crypto_config_supported(devs[i], &crypto_cfg)) goto out_free_devs; } @@ -184,8 +183,7 @@ int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key, goto fail; } for (i = 0; i < num_devs; i++) { - err = blk_crypto_start_using_key(blk_key, - bdev_get_queue(devs[i])); + err = blk_crypto_start_using_key(devs[i], blk_key); if (err) break; } @@ -224,7 +222,7 @@ void fscrypt_destroy_inline_crypt_key(struct super_block *sb, devs = fscrypt_get_devices(sb, &num_devs); if (!IS_ERR(devs)) { for (i = 0; i < num_devs; i++) - blk_crypto_evict_key(bdev_get_queue(devs[i]), blk_key); + blk_crypto_evict_key(devs[i], blk_key); kfree(devs); } kfree_sensitive(blk_key); diff --git a/include/linux/blk-crypto.h b/include/linux/blk-crypto.h index 69b24fe92cbf..561ca92e204d 100644 --- a/include/linux/blk-crypto.h +++ b/include/linux/blk-crypto.h @@ -71,9 +71,6 @@ struct bio_crypt_ctx { #include #include -struct request; -struct request_queue; - #ifdef CONFIG_BLK_INLINE_ENCRYPTION static inline bool bio_has_crypt_ctx(struct bio *bio) @@ -94,13 +91,13 @@ int blk_crypto_init_key(struct blk_crypto_key *blk_key, const u8 *raw_key, unsigned int dun_bytes, unsigned int data_unit_size); -int blk_crypto_start_using_key(const struct blk_crypto_key *key, - struct request_queue *q); +int blk_crypto_start_using_key(struct block_device *bdev, + const struct blk_crypto_key *key); -int blk_crypto_evict_key(struct request_queue *q, +int blk_crypto_evict_key(struct block_device *bdev, const struct blk_crypto_key *key); -bool blk_crypto_config_supported(struct request_queue *q, +bool blk_crypto_config_supported(struct block_device *bdev, const struct blk_crypto_config *cfg); #else /* CONFIG_BLK_INLINE_ENCRYPTION */ From bc46a95e090af35f715bde739f952e4978a6e391 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 14 Nov 2022 05:29:43 +0100 Subject: [PATCH 376/457] blk-crypto: add a blk_crypto_config_supported_natively helper Add a blk_crypto_config_supported_natively helper that wraps __blk_crypto_cfg_supported to retrieve the crypto_profile from the request queue. With this fscrypt can stop including blk-crypto-profile.h and rely on the public consumer interface in blk-crypto.h. Signed-off-by: Christoph Hellwig Reviewed-by: Eric Biggers Link: https://lore.kernel.org/r/20221114042944.1009870-3-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-crypto.c | 21 ++++++++++++--------- fs/crypto/inline_crypt.c | 6 ++---- include/linux/blk-crypto.h | 2 ++ 3 files changed, 16 insertions(+), 13 deletions(-) diff --git a/block/blk-crypto.c b/block/blk-crypto.c index 0047436b6337..6a461f4d676a 100644 --- a/block/blk-crypto.c +++ b/block/blk-crypto.c @@ -267,7 +267,6 @@ bool __blk_crypto_bio_prep(struct bio **bio_ptr) { struct bio *bio = *bio_ptr; const struct blk_crypto_key *bc_key = bio->bi_crypt_context->bc_key; - struct blk_crypto_profile *profile; /* Error if bio has no data. */ if (WARN_ON_ONCE(!bio_has_data(bio))) { @@ -284,10 +283,9 @@ bool __blk_crypto_bio_prep(struct bio **bio_ptr) * Success if device supports the encryption context, or if we succeeded * in falling back to the crypto API. 
*/ - profile = bdev_get_queue(bio->bi_bdev)->crypto_profile; - if (__blk_crypto_cfg_supported(profile, &bc_key->crypto_cfg)) + if (blk_crypto_config_supported_natively(bio->bi_bdev, + &bc_key->crypto_cfg)) return true; - if (blk_crypto_fallback_bio_prep(bio_ptr)) return true; fail: @@ -352,6 +350,13 @@ int blk_crypto_init_key(struct blk_crypto_key *blk_key, const u8 *raw_key, return 0; } +bool blk_crypto_config_supported_natively(struct block_device *bdev, + const struct blk_crypto_config *cfg) +{ + return __blk_crypto_cfg_supported(bdev_get_queue(bdev)->crypto_profile, + cfg); +} + /* * Check if bios with @cfg can be en/decrypted by blk-crypto (i.e. either the * block_device it's submitted to supports inline crypto, or the @@ -361,8 +366,7 @@ bool blk_crypto_config_supported(struct block_device *bdev, const struct blk_crypto_config *cfg) { return IS_ENABLED(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) || - __blk_crypto_cfg_supported(bdev_get_queue(bdev)->crypto_profile, - cfg); + blk_crypto_config_supported_natively(bdev, cfg); } /** @@ -383,8 +387,7 @@ bool blk_crypto_config_supported(struct block_device *bdev, int blk_crypto_start_using_key(struct block_device *bdev, const struct blk_crypto_key *key) { - if (__blk_crypto_cfg_supported(bdev_get_queue(bdev)->crypto_profile, - &key->crypto_cfg)) + if (blk_crypto_config_supported_natively(bdev, &key->crypto_cfg)) return 0; return blk_crypto_fallback_start_using_mode(key->crypto_cfg.crypto_mode); } @@ -407,7 +410,7 @@ int blk_crypto_evict_key(struct block_device *bdev, { struct request_queue *q = bdev_get_queue(bdev); - if (__blk_crypto_cfg_supported(q->crypto_profile, &key->crypto_cfg)) + if (blk_crypto_config_supported_natively(bdev, &key->crypto_cfg)) return __blk_crypto_evict_key(q->crypto_profile, key); /* diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c index 55c4d8c23d30..8bfb3ce86476 100644 --- a/fs/crypto/inline_crypt.c +++ b/fs/crypto/inline_crypt.c @@ -12,7 +12,7 @@ * provides the key and IV to use. 
*/ -#include +#include #include #include #include @@ -77,10 +77,8 @@ static void fscrypt_log_blk_crypto_impl(struct fscrypt_mode *mode, unsigned int i; for (i = 0; i < num_devs; i++) { - struct request_queue *q = bdev_get_queue(devs[i]); - if (!IS_ENABLED(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) || - __blk_crypto_cfg_supported(q->crypto_profile, cfg)) { + blk_crypto_config_supported_natively(devs[i], cfg)) { if (!xchg(&mode->logged_blk_crypto_native, 1)) pr_info("fscrypt: %s using blk-crypto (native)\n", mode->friendly_name); diff --git a/include/linux/blk-crypto.h b/include/linux/blk-crypto.h index 561ca92e204d..a33d32f5c268 100644 --- a/include/linux/blk-crypto.h +++ b/include/linux/blk-crypto.h @@ -97,6 +97,8 @@ int blk_crypto_start_using_key(struct block_device *bdev, int blk_crypto_evict_key(struct block_device *bdev, const struct blk_crypto_key *key); +bool blk_crypto_config_supported_natively(struct block_device *bdev, + const struct blk_crypto_config *cfg); bool blk_crypto_config_supported(struct block_device *bdev, const struct blk_crypto_config *cfg); From 9d624c603f0b59f2bdd57cbe7cf1207d67f2dbc3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 14 Nov 2022 05:29:44 +0100 Subject: [PATCH 377/457] blk-crypto: move internal only declarations to blk-crypto-internal.h blk_crypto_get_keyslot, blk_crypto_put_keyslot, __blk_crypto_evict_key and __blk_crypto_cfg_supported are only used internally by the blk-crypto code, so move the out of blk-crypto-profile.h, which is included by drivers that supply blk-crypto functionality. Signed-off-by: Christoph Hellwig Reviewed-by: Eric Biggers Link: https://lore.kernel.org/r/20221114042944.1009870-4-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-crypto-internal.h | 12 ++++++++++++ include/linux/blk-crypto-profile.h | 12 ------------ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/block/blk-crypto-internal.h b/block/blk-crypto-internal.h index e6818ffaddbf..d31fa80454e4 100644 --- a/block/blk-crypto-internal.h +++ b/block/blk-crypto-internal.h @@ -65,6 +65,18 @@ static inline bool blk_crypto_rq_is_encrypted(struct request *rq) return rq->crypt_ctx; } +blk_status_t blk_crypto_get_keyslot(struct blk_crypto_profile *profile, + const struct blk_crypto_key *key, + struct blk_crypto_keyslot **slot_ptr); + +void blk_crypto_put_keyslot(struct blk_crypto_keyslot *slot); + +int __blk_crypto_evict_key(struct blk_crypto_profile *profile, + const struct blk_crypto_key *key); + +bool __blk_crypto_cfg_supported(struct blk_crypto_profile *profile, + const struct blk_crypto_config *cfg); + #else /* CONFIG_BLK_INLINE_ENCRYPTION */ static inline int blk_crypto_sysfs_register(struct request_queue *q) diff --git a/include/linux/blk-crypto-profile.h b/include/linux/blk-crypto-profile.h index bbab65bd5428..e6802b69cdd6 100644 --- a/include/linux/blk-crypto-profile.h +++ b/include/linux/blk-crypto-profile.h @@ -138,18 +138,6 @@ int devm_blk_crypto_profile_init(struct device *dev, unsigned int blk_crypto_keyslot_index(struct blk_crypto_keyslot *slot); -blk_status_t blk_crypto_get_keyslot(struct blk_crypto_profile *profile, - const struct blk_crypto_key *key, - struct blk_crypto_keyslot **slot_ptr); - -void blk_crypto_put_keyslot(struct blk_crypto_keyslot *slot); - -bool __blk_crypto_cfg_supported(struct blk_crypto_profile *profile, - const struct blk_crypto_config *cfg); - -int __blk_crypto_evict_key(struct blk_crypto_profile *profile, - const struct blk_crypto_key *key); - void blk_crypto_reprogram_all_keys(struct blk_crypto_profile *profile); 
void blk_crypto_profile_destroy(struct blk_crypto_profile *profile); From e389a678609ed4d62f969407717c1cce0f2ecc44 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 23 Nov 2022 09:29:23 -0800 Subject: [PATCH 378/457] blk-crypto: Add a missing include directive MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Allow the compiler to verify consistency of function declarations and function definitions. This patch fixes the following sparse errors: block/blk-crypto-profile.c:241:14: error: no previous prototype for ‘blk_crypto_get_keyslot’ [-Werror=missing-prototypes] 241 | blk_status_t blk_crypto_get_keyslot(struct blk_crypto_profile *profile, | ^~~~~~~~~~~~~~~~~~~~~~ block/blk-crypto-profile.c:318:6: error: no previous prototype for ‘blk_crypto_put_keyslot’ [-Werror=missing-prototypes] 318 | void blk_crypto_put_keyslot(struct blk_crypto_keyslot *slot) | ^~~~~~~~~~~~~~~~~~~~~~ block/blk-crypto-profile.c:344:6: error: no previous prototype for ‘__blk_crypto_cfg_supported’ [-Werror=missing-prototypes] 344 | bool __blk_crypto_cfg_supported(struct blk_crypto_profile *profile, | ^~~~~~~~~~~~~~~~~~~~~~~~~~ block/blk-crypto-profile.c:373:5: error: no previous prototype for ‘__blk_crypto_evict_key’ [-Werror=missing-prototypes] 373 | int __blk_crypto_evict_key(struct blk_crypto_profile *profile, | ^~~~~~~~~~~~~~~~~~~~~~ Cc: Eric Biggers Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20221123172923.434339-1-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/blk-crypto-profile.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/blk-crypto-profile.c b/block/blk-crypto-profile.c index 96c511967386..0307fb0d95d3 100644 --- a/block/blk-crypto-profile.c +++ b/block/blk-crypto-profile.c @@ -32,6 +32,7 @@ #include #include #include +#include "blk-crypto-internal.h" struct blk_crypto_keyslot { atomic_t slot_refs; From fa68c31c3b0420d73639ea8fd231f796e52f8c23 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 25 Nov 2022 11:20:47 -0800 Subject: [PATCH 379/457] fscrypt: add comment for fscrypt_valid_enc_modes_v1() Make it clear that nothing new should be added to this function. Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20221125192047.18916-1-ebiggers@kernel.org --- fs/crypto/policy.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index 46757c3052ef..84fa51604b15 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -61,6 +61,13 @@ fscrypt_get_dummy_policy(struct super_block *sb) return sb->s_cop->get_dummy_policy(sb); } +/* + * Return %true if the given combination of encryption modes is supported for v1 + * (and later) encryption policies. + * + * Do *not* add anything new here, since v1 encryption policies are deprecated. + * New combinations of modes should go in fscrypt_valid_enc_modes_v2() only. + */ static bool fscrypt_valid_enc_modes_v1(u32 contents_mode, u32 filenames_mode) { if (contents_mode == FSCRYPT_MODE_AES_256_XTS && From 0953e23b2a09da57b6bf81db1940e046e4106374 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 14 Nov 2022 05:26:33 +0100 Subject: [PATCH 380/457] blk-crypto: pass a gendisk to blk_crypto_sysfs_{,un}register Prepare for changes to the block layer sysfs handling by passing the readily available gendisk to blk_crypto_sysfs_{,un}register. 
Signed-off-by: Christoph Hellwig Reviewed-by: Eric Biggers Link: https://lore.kernel.org/r/20221114042637.1009333-2-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-crypto-internal.h | 10 ++++++---- block/blk-crypto-sysfs.c | 7 ++++--- block/blk-sysfs.c | 4 ++-- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/block/blk-crypto-internal.h b/block/blk-crypto-internal.h index d31fa80454e4..a8cdaf26851e 100644 --- a/block/blk-crypto-internal.h +++ b/block/blk-crypto-internal.h @@ -21,9 +21,9 @@ extern const struct blk_crypto_mode blk_crypto_modes[]; #ifdef CONFIG_BLK_INLINE_ENCRYPTION -int blk_crypto_sysfs_register(struct request_queue *q); +int blk_crypto_sysfs_register(struct gendisk *disk); -void blk_crypto_sysfs_unregister(struct request_queue *q); +void blk_crypto_sysfs_unregister(struct gendisk *disk); void bio_crypt_dun_increment(u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE], unsigned int inc); @@ -79,12 +79,14 @@ bool __blk_crypto_cfg_supported(struct blk_crypto_profile *profile, #else /* CONFIG_BLK_INLINE_ENCRYPTION */ -static inline int blk_crypto_sysfs_register(struct request_queue *q) +static inline int blk_crypto_sysfs_register(struct gendisk *disk) { return 0; } -static inline void blk_crypto_sysfs_unregister(struct request_queue *q) { } +static inline void blk_crypto_sysfs_unregister(struct gendisk *disk) +{ +} static inline bool bio_crypt_rq_ctx_compatible(struct request *rq, struct bio *bio) diff --git a/block/blk-crypto-sysfs.c b/block/blk-crypto-sysfs.c index fd93bd2f33b7..e05f145cd797 100644 --- a/block/blk-crypto-sysfs.c +++ b/block/blk-crypto-sysfs.c @@ -126,8 +126,9 @@ static struct kobj_type blk_crypto_ktype = { * If the request_queue has a blk_crypto_profile, create the "crypto" * subdirectory in sysfs (/sys/block/$disk/queue/crypto/). */ -int blk_crypto_sysfs_register(struct request_queue *q) +int blk_crypto_sysfs_register(struct gendisk *disk) { + struct request_queue *q = disk->queue; struct blk_crypto_kobj *obj; int err; @@ -149,9 +150,9 @@ int blk_crypto_sysfs_register(struct request_queue *q) return 0; } -void blk_crypto_sysfs_unregister(struct request_queue *q) +void blk_crypto_sysfs_unregister(struct gendisk *disk) { - kobject_put(q->crypto_kobject); + kobject_put(disk->queue->crypto_kobject); } static int __init blk_crypto_sysfs_init(void) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index e71b3b43927c..bc39225667d0 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -838,7 +838,7 @@ int blk_register_queue(struct gendisk *disk) goto put_dev; } - ret = blk_crypto_sysfs_register(q); + ret = blk_crypto_sysfs_register(disk); if (ret) goto put_dev; @@ -915,7 +915,7 @@ void blk_unregister_queue(struct gendisk *disk) */ if (queue_is_mq(q)) blk_mq_sysfs_unregister(disk); - blk_crypto_sysfs_unregister(q); + blk_crypto_sysfs_unregister(disk); mutex_lock(&q->sysfs_lock); elv_unregister_queue(q); From 034c75ce0e364e209cb7dff10191bc7b38a9f388 Mon Sep 17 00:00:00 2001 From: Tianjia Zhang Date: Thu, 1 Dec 2022 20:58:18 +0800 Subject: [PATCH 381/457] blk-crypto: Add support for SM4-XTS blk crypto mode SM4 is a symmetric cipher algorithm widely used in China. The SM4-XTS variant is used to encrypt length-preserving data. This is the mandatory algorithm in some special scenarios. Add support for the algorithm to block inline encryption. This is needed for the inlinecrypt mount option to be supported via blk-crypto-fallback, as it is for the other fscrypt modes. 
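A hedged sketch of how an upper layer could probe for the new mode through the public blk-crypto API introduced earlier in this series; the helper name sm4_xts_usable() and the exact struct blk_crypto_config field layout (crypto_mode, data_unit_size, dun_bytes) are assumptions made for illustration:

        static bool sm4_xts_usable(struct block_device *bdev)
        {
                const struct blk_crypto_config cfg = {
                        .crypto_mode    = BLK_ENCRYPTION_MODE_SM4_XTS,
                        .data_unit_size = 4096,
                        .dun_bytes      = 8,
                };

                /* true if the hardware supports SM4-XTS, or blk-crypto-fallback can */
                return blk_crypto_config_supported(bdev, &cfg);
        }
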
Signed-off-by: Tianjia Zhang Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20221201125819.36932-2-tianjia.zhang@linux.alibaba.com --- block/blk-crypto.c | 6 ++++++ include/linux/blk-crypto.h | 1 + 2 files changed, 7 insertions(+) diff --git a/block/blk-crypto.c b/block/blk-crypto.c index 6a461f4d676a..45378586151f 100644 --- a/block/blk-crypto.c +++ b/block/blk-crypto.c @@ -36,6 +36,12 @@ const struct blk_crypto_mode blk_crypto_modes[] = { .keysize = 32, .ivsize = 32, }, + [BLK_ENCRYPTION_MODE_SM4_XTS] = { + .name = "SM4-XTS", + .cipher_str = "xts(sm4)", + .keysize = 32, + .ivsize = 16, + }, }; /* diff --git a/include/linux/blk-crypto.h b/include/linux/blk-crypto.h index a33d32f5c268..1e3e5d0adf12 100644 --- a/include/linux/blk-crypto.h +++ b/include/linux/blk-crypto.h @@ -13,6 +13,7 @@ enum blk_crypto_mode_num { BLK_ENCRYPTION_MODE_AES_256_XTS, BLK_ENCRYPTION_MODE_AES_128_CBC_ESSIV, BLK_ENCRYPTION_MODE_ADIANTUM, + BLK_ENCRYPTION_MODE_SM4_XTS, BLK_ENCRYPTION_MODE_MAX, }; From 0cd8fad6c6ebb74ac8684e395e149a1f62e40814 Mon Sep 17 00:00:00 2001 From: Tianjia Zhang Date: Thu, 1 Dec 2022 20:58:19 +0800 Subject: [PATCH 382/457] fscrypt: Add SM4 XTS/CTS symmetric algorithm support Add support for XTS and CTS mode variant of SM4 algorithm. The former is used to encrypt file contents, while the latter (SM4-CTS-CBC) is used to encrypt filenames. SM4 is a symmetric algorithm widely used in China, and is even mandatory algorithm in some special scenarios. We need to provide these users with the ability to encrypt files or disks using SM4-XTS. Signed-off-by: Tianjia Zhang Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20221201125819.36932-3-tianjia.zhang@linux.alibaba.com --- Documentation/filesystems/fscrypt.rst | 1 + fs/crypto/keysetup.c | 15 +++++++++++++++ fs/crypto/policy.c | 5 +++++ include/uapi/linux/fscrypt.h | 2 ++ 4 files changed, 23 insertions(+) diff --git a/Documentation/filesystems/fscrypt.rst b/Documentation/filesystems/fscrypt.rst index 5ba5817c17c2..c0784ec05553 100644 --- a/Documentation/filesystems/fscrypt.rst +++ b/Documentation/filesystems/fscrypt.rst @@ -338,6 +338,7 @@ Currently, the following pairs of encryption modes are supported: - AES-128-CBC for contents and AES-128-CTS-CBC for filenames - Adiantum for both contents and filenames - AES-256-XTS for contents and AES-256-HCTR2 for filenames (v2 policies only) +- SM4-XTS for contents and SM4-CTS-CBC for filenames (v2 policies only) If unsure, you should use the (AES-256-XTS, AES-256-CTS-CBC) pair. 
diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c index 9e44dc078a81..94757ccd3056 100644 --- a/fs/crypto/keysetup.c +++ b/fs/crypto/keysetup.c @@ -44,6 +44,21 @@ struct fscrypt_mode fscrypt_modes[] = { .security_strength = 16, .ivsize = 16, }, + [FSCRYPT_MODE_SM4_XTS] = { + .friendly_name = "SM4-XTS", + .cipher_str = "xts(sm4)", + .keysize = 32, + .security_strength = 16, + .ivsize = 16, + .blk_crypto_mode = BLK_ENCRYPTION_MODE_SM4_XTS, + }, + [FSCRYPT_MODE_SM4_CTS] = { + .friendly_name = "SM4-CTS-CBC", + .cipher_str = "cts(cbc(sm4))", + .keysize = 16, + .security_strength = 16, + .ivsize = 16, + }, [FSCRYPT_MODE_ADIANTUM] = { .friendly_name = "Adiantum", .cipher_str = "adiantum(xchacha12,aes)", diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index 84fa51604b15..893661b52376 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -90,6 +90,11 @@ static bool fscrypt_valid_enc_modes_v2(u32 contents_mode, u32 filenames_mode) if (contents_mode == FSCRYPT_MODE_AES_256_XTS && filenames_mode == FSCRYPT_MODE_AES_256_HCTR2) return true; + + if (contents_mode == FSCRYPT_MODE_SM4_XTS && + filenames_mode == FSCRYPT_MODE_SM4_CTS) + return true; + return fscrypt_valid_enc_modes_v1(contents_mode, filenames_mode); } diff --git a/include/uapi/linux/fscrypt.h b/include/uapi/linux/fscrypt.h index a756b29afcc2..47dbd1994bfe 100644 --- a/include/uapi/linux/fscrypt.h +++ b/include/uapi/linux/fscrypt.h @@ -26,6 +26,8 @@ #define FSCRYPT_MODE_AES_256_CTS 4 #define FSCRYPT_MODE_AES_128_CBC 5 #define FSCRYPT_MODE_AES_128_CTS 6 +#define FSCRYPT_MODE_SM4_XTS 7 +#define FSCRYPT_MODE_SM4_CTS 8 #define FSCRYPT_MODE_ADIANTUM 9 #define FSCRYPT_MODE_AES_256_HCTR2 10 /* If adding a mode number > 10, update FSCRYPT_MODE_MAX in fscrypt_private.h */ From 8e0547a4a43a12eb2f146c8bc4f0538b48171532 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 1 Dec 2022 19:55:29 -0800 Subject: [PATCH 383/457] fscrypt: remove unused Speck definitions These old unused definitions were originally left around to prevent the same mode numbers from being reused. However, we've now decided to reuse the mode numbers anyway. So let's completely remove these old unused definitions to avoid confusion. There is no reason for any code to be using these constants in any way; and indeed, Debian Code Search shows no uses of them (other than in copies or translations of the header). So this should be perfectly safe. 
Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20221202035529.55992-1-ebiggers@kernel.org --- include/uapi/linux/fscrypt.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/uapi/linux/fscrypt.h b/include/uapi/linux/fscrypt.h index 47dbd1994bfe..fd1fb0d5389d 100644 --- a/include/uapi/linux/fscrypt.h +++ b/include/uapi/linux/fscrypt.h @@ -187,8 +187,6 @@ struct fscrypt_get_key_status_arg { #define FS_ENCRYPTION_MODE_AES_256_CTS FSCRYPT_MODE_AES_256_CTS #define FS_ENCRYPTION_MODE_AES_128_CBC FSCRYPT_MODE_AES_128_CBC #define FS_ENCRYPTION_MODE_AES_128_CTS FSCRYPT_MODE_AES_128_CTS -#define FS_ENCRYPTION_MODE_SPECK128_256_XTS 7 /* removed */ -#define FS_ENCRYPTION_MODE_SPECK128_256_CTS 8 /* removed */ #define FS_ENCRYPTION_MODE_ADIANTUM FSCRYPT_MODE_ADIANTUM #define FS_KEY_DESC_PREFIX FSCRYPT_KEY_DESC_PREFIX #define FS_KEY_DESC_PREFIX_SIZE FSCRYPT_KEY_DESC_PREFIX_SIZE From 74c681d0f3bf12014625a814dafd90ed3ccb0eea Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 1 Dec 2022 11:14:52 -0800 Subject: [PATCH 384/457] fscrypt: add additional documentation for SM4 support Add a paragraph about SM4, like there is for the other modes. Signed-off-by: Eric Biggers Reviewed-by: Tianjia Zhang Link: https://lore.kernel.org/r/20221201191452.6557-1-ebiggers@kernel.org --- Documentation/filesystems/fscrypt.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Documentation/filesystems/fscrypt.rst b/Documentation/filesystems/fscrypt.rst index c0784ec05553..ef183387da20 100644 --- a/Documentation/filesystems/fscrypt.rst +++ b/Documentation/filesystems/fscrypt.rst @@ -370,6 +370,12 @@ CONFIG_CRYPTO_HCTR2 must be enabled. Also, fast implementations of XCTR and POLYVAL should be enabled, e.g. CRYPTO_POLYVAL_ARM64_CE and CRYPTO_AES_ARM64_CE_BLK for ARM64. +SM4 is a Chinese block cipher that is an alternative to AES. It has +not seen as much security review as AES, and it only has a 128-bit key +size. It may be useful in cases where its use is mandated. +Otherwise, it should not be used. For SM4 support to be available, it +also needs to be enabled in the kernel crypto API. + New encryption modes can be added relatively easily, without changes to individual filesystems. However, authenticated encryption (AE) modes are not currently supported because of the difficulty of dealing From 08d23cafc54efdb00fe010aab9d2ed48b4bcd91a Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Wed, 7 Dec 2022 13:25:36 +0000 Subject: [PATCH 385/457] ANDROID: KVM: arm64: Allow SMC handling from pKVM modules Introduce a new default SMC handler for the host that can be set from modules. 
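A hedged sketch of module-side usage, assuming the module's hyp init function is handed the pkvm_module_ops table; the handler and init function names here are hypothetical, only the register_host_smc_handler() op and its callback signature are taken from this patch:

        static bool my_smc_handler(struct kvm_cpu_context *host_ctxt)
        {
                /* Inspect the SMC; return true if handled, false to forward to EL3. */
                return false;
        }

        int my_module_hyp_init(const struct pkvm_module_ops *ops)
        {
                return ops->register_host_smc_handler(my_smc_handler);
        }
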
Bug: 244543039 Bug: 245034629 Change-Id: I8481bfb1926a3cb433b15de5c1a99e3550710689 Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_pkvm_module.h | 1 + arch/arm64/kvm/hyp/include/nvhe/modules.h | 2 ++ arch/arm64/kvm/hyp/nvhe/hyp-main.c | 16 ++++++++++------ arch/arm64/kvm/hyp/nvhe/modules.c | 1 + 4 files changed, 14 insertions(+), 6 deletions(-) diff --git a/arch/arm64/include/asm/kvm_pkvm_module.h b/arch/arm64/include/asm/kvm_pkvm_module.h index 86ba74e01492..128117c00356 100644 --- a/arch/arm64/include/asm/kvm_pkvm_module.h +++ b/arch/arm64/include/asm/kvm_pkvm_module.h @@ -20,6 +20,7 @@ struct pkvm_module_ops { void (*flush_dcache_to_poc)(void *addr, size_t size); int (*register_host_perm_fault_handler)(int (*cb)(struct kvm_cpu_context *ctxt, u64 esr, u64 addr)); int (*protect_host_page)(u64 pfn, enum kvm_pgtable_prot prot); + int (*register_host_smc_handler)(bool (*cb)(struct kvm_cpu_context *)); }; struct pkvm_module_section { diff --git a/arch/arm64/kvm/hyp/include/nvhe/modules.h b/arch/arm64/kvm/hyp/include/nvhe/modules.h index d69aa744b82f..4a6162512a79 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/modules.h +++ b/arch/arm64/kvm/hyp/include/nvhe/modules.h @@ -3,6 +3,8 @@ #define HCALL_HANDLED 0 #define HCALL_UNHANDLED -1 +int __pkvm_register_host_smc_handler(bool (*cb)(struct kvm_cpu_context *)); + #ifdef CONFIG_MODULES int __pkvm_init_module(void *module_init); int __pkvm_register_hcall(unsigned long hfn_hyp_va); diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index f15232433454..f3b75cd4e98b 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -40,6 +40,13 @@ DEFINE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params); void __kvm_hyp_host_forward_smc(struct kvm_cpu_context *host_ctxt); +static bool (*default_host_smc_handler)(struct kvm_cpu_context *host_ctxt); + +int __pkvm_register_host_smc_handler(bool (*cb)(struct kvm_cpu_context *)) +{ + return cmpxchg(&default_host_smc_handler, NULL, cb) ? -EBUSY : 0; +} + static int pkvm_refill_memcache(struct pkvm_hyp_vcpu *hyp_vcpu) { struct pkvm_hyp_vm *hyp_vm = pkvm_hyp_vcpu_to_hyp_vm(hyp_vcpu); @@ -1283,11 +1290,6 @@ inval: cpu_reg(host_ctxt, 0) = SMCCC_RET_NOT_SUPPORTED; } -static void default_host_smc_handler(struct kvm_cpu_context *host_ctxt) -{ - __kvm_hyp_host_forward_smc(host_ctxt); -} - static void handle_host_smc(struct kvm_cpu_context *host_ctxt) { bool handled; @@ -1295,8 +1297,10 @@ static void handle_host_smc(struct kvm_cpu_context *host_ctxt) handled = kvm_host_psci_handler(host_ctxt); if (!handled) handled = kvm_host_ffa_handler(host_ctxt); + if (!handled && READ_ONCE(default_host_smc_handler)) + handled = default_host_smc_handler(host_ctxt); if (!handled) - default_host_smc_handler(host_ctxt); + __kvm_hyp_host_forward_smc(host_ctxt); /* SMC was trapped, move ELR past the current PC. 
*/ kvm_skip_host_instr(); diff --git a/arch/arm64/kvm/hyp/nvhe/modules.c b/arch/arm64/kvm/hyp/nvhe/modules.c index b71ce3880f3b..401159a2f526 100644 --- a/arch/arm64/kvm/hyp/nvhe/modules.c +++ b/arch/arm64/kvm/hyp/nvhe/modules.c @@ -63,6 +63,7 @@ const struct pkvm_module_ops module_ops = { .flush_dcache_to_poc = __kvm_flush_dcache_to_poc, .register_host_perm_fault_handler = hyp_register_host_perm_fault_handler, .protect_host_page = hyp_protect_host_page, + .register_host_smc_handler = __pkvm_register_host_smc_handler, }; int __pkvm_init_module(void *module_init) From f3472b47c8078c3887e34070a9498e62ec294f9d Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Wed, 7 Dec 2022 13:45:12 +0000 Subject: [PATCH 386/457] ANDROID: KVM: arm64: Allow handling illegal aborts from pKVM modules Introduce a new handler allowing to notify pKVM modules when pKVM detects an illegal access from the host. Bug: 244543039 Bug: 245034629 Change-Id: I62133a8d967d91437e5216b307e449f8c83dfab6 Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_pkvm_module.h | 1 + arch/arm64/kvm/hyp/include/nvhe/modules.h | 1 + arch/arm64/kvm/hyp/nvhe/mem_protect.c | 10 ++++++++++ arch/arm64/kvm/hyp/nvhe/modules.c | 1 + 4 files changed, 13 insertions(+) diff --git a/arch/arm64/include/asm/kvm_pkvm_module.h b/arch/arm64/include/asm/kvm_pkvm_module.h index 128117c00356..8b88727038b0 100644 --- a/arch/arm64/include/asm/kvm_pkvm_module.h +++ b/arch/arm64/include/asm/kvm_pkvm_module.h @@ -21,6 +21,7 @@ struct pkvm_module_ops { int (*register_host_perm_fault_handler)(int (*cb)(struct kvm_cpu_context *ctxt, u64 esr, u64 addr)); int (*protect_host_page)(u64 pfn, enum kvm_pgtable_prot prot); int (*register_host_smc_handler)(bool (*cb)(struct kvm_cpu_context *)); + int (*register_illegal_abt_notifier)(void (*cb)(struct kvm_cpu_context *)); }; struct pkvm_module_section { diff --git a/arch/arm64/kvm/hyp/include/nvhe/modules.h b/arch/arm64/kvm/hyp/include/nvhe/modules.h index 4a6162512a79..48eae5f57321 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/modules.h +++ b/arch/arm64/kvm/hyp/include/nvhe/modules.h @@ -4,6 +4,7 @@ #define HCALL_UNHANDLED -1 int __pkvm_register_host_smc_handler(bool (*cb)(struct kvm_cpu_context *)); +int __pkvm_register_illegal_abt_notifier(void (*cb)(struct kvm_cpu_context *)); #ifdef CONFIG_MODULES int __pkvm_init_module(void *module_init); diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index d3a1a8d84640..9ea3970ad293 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -710,12 +710,22 @@ static int host_stage2_idmap(u64 addr) return host_stage2_idmap_locked(range.start, range.end - range.start, prot, false); } +static void (*illegal_abt_notifier)(struct kvm_cpu_context *host_ctxt); + +int __pkvm_register_illegal_abt_notifier(void (*cb)(struct kvm_cpu_context *)) +{ + return cmpxchg(&illegal_abt_notifier, NULL, cb) ? 
-EBUSY : 0; +} + static void host_inject_abort(struct kvm_cpu_context *host_ctxt) { u64 spsr = read_sysreg_el2(SYS_SPSR); u64 esr = read_sysreg_el2(SYS_ESR); u64 ventry, ec; + if (READ_ONCE(illegal_abt_notifier)) + illegal_abt_notifier(host_ctxt); + /* Repaint the ESR to report a same-level fault if taken from EL1 */ if ((spsr & PSR_MODE_MASK) != PSR_MODE_EL0t) { ec = ESR_ELx_EC(esr); diff --git a/arch/arm64/kvm/hyp/nvhe/modules.c b/arch/arm64/kvm/hyp/nvhe/modules.c index 401159a2f526..47ee9b83a28c 100644 --- a/arch/arm64/kvm/hyp/nvhe/modules.c +++ b/arch/arm64/kvm/hyp/nvhe/modules.c @@ -64,6 +64,7 @@ const struct pkvm_module_ops module_ops = { .register_host_perm_fault_handler = hyp_register_host_perm_fault_handler, .protect_host_page = hyp_protect_host_page, .register_host_smc_handler = __pkvm_register_host_smc_handler, + .register_illegal_abt_notifier = __pkvm_register_illegal_abt_notifier, }; int __pkvm_init_module(void *module_init) From 2829c605886e4e984533dfd6617822f585334f29 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Wed, 7 Dec 2022 14:41:56 +0000 Subject: [PATCH 387/457] ANDROID: KVM: arm64: Notify pKVM modules of PSCI events Introduce a notifier allowing a pKVM module to be notified for major PSCI events: {CPU,SYSTEM}_SUSPEND, as well as on the resume path. Bug: 244543039 Bug: 245034629 Change-Id: Ia82923445214925fc77e321457c8eab31f9d42e8 Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_pkvm_module.h | 7 +++++++ arch/arm64/kvm/hyp/include/nvhe/modules.h | 3 +++ arch/arm64/kvm/hyp/nvhe/modules.c | 1 + arch/arm64/kvm/hyp/nvhe/psci-relay.c | 18 ++++++++++++++++++ 4 files changed, 29 insertions(+) diff --git a/arch/arm64/include/asm/kvm_pkvm_module.h b/arch/arm64/include/asm/kvm_pkvm_module.h index 8b88727038b0..ee0da3a99cbf 100644 --- a/arch/arm64/include/asm/kvm_pkvm_module.h +++ b/arch/arm64/include/asm/kvm_pkvm_module.h @@ -8,6 +8,12 @@ typedef void (*dyn_hcall_t)(struct kvm_cpu_context *); +enum pkvm_psci_notification { + PKVM_PSCI_CPU_SUSPEND, + PKVM_PSCI_SYSTEM_SUSPEND, + PKVM_PSCI_CPU_ENTRY, +}; + struct pkvm_module_ops { int (*create_private_mapping)(phys_addr_t phys, size_t size, enum kvm_pgtable_prot prot, @@ -22,6 +28,7 @@ struct pkvm_module_ops { int (*protect_host_page)(u64 pfn, enum kvm_pgtable_prot prot); int (*register_host_smc_handler)(bool (*cb)(struct kvm_cpu_context *)); int (*register_illegal_abt_notifier)(void (*cb)(struct kvm_cpu_context *)); + int (*register_psci_notifier)(void (*cb)(enum pkvm_psci_notification, struct kvm_cpu_context *)); }; struct pkvm_module_section { diff --git a/arch/arm64/kvm/hyp/include/nvhe/modules.h b/arch/arm64/kvm/hyp/include/nvhe/modules.h index 48eae5f57321..bde933e0861f 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/modules.h +++ b/arch/arm64/kvm/hyp/include/nvhe/modules.h @@ -6,6 +6,9 @@ int __pkvm_register_host_smc_handler(bool (*cb)(struct kvm_cpu_context *)); int __pkvm_register_illegal_abt_notifier(void (*cb)(struct kvm_cpu_context *)); +enum pkvm_psci_notification; +int __pkvm_register_psci_notifier(void (*cb)(enum pkvm_psci_notification, struct kvm_cpu_context *)); + #ifdef CONFIG_MODULES int __pkvm_init_module(void *module_init); int __pkvm_register_hcall(unsigned long hfn_hyp_va); diff --git a/arch/arm64/kvm/hyp/nvhe/modules.c b/arch/arm64/kvm/hyp/nvhe/modules.c index 47ee9b83a28c..99fa311daade 100644 --- a/arch/arm64/kvm/hyp/nvhe/modules.c +++ b/arch/arm64/kvm/hyp/nvhe/modules.c @@ -65,6 +65,7 @@ const struct pkvm_module_ops module_ops = { .protect_host_page = hyp_protect_host_page, 
.register_host_smc_handler = __pkvm_register_host_smc_handler, .register_illegal_abt_notifier = __pkvm_register_illegal_abt_notifier, + .register_psci_notifier = __pkvm_register_psci_notifier, }; int __pkvm_init_module(void *module_init) diff --git a/arch/arm64/kvm/hyp/nvhe/psci-relay.c b/arch/arm64/kvm/hyp/nvhe/psci-relay.c index 0fa155323e19..c5b439fd0066 100644 --- a/arch/arm64/kvm/hyp/nvhe/psci-relay.c +++ b/arch/arm64/kvm/hyp/nvhe/psci-relay.c @@ -24,6 +24,18 @@ void __noreturn __host_enter(struct kvm_cpu_context *host_ctxt); /* Config options set by the host. */ struct kvm_host_psci_config __ro_after_init kvm_host_psci_config; +static void (*pkvm_psci_notifier)(enum pkvm_psci_notification, struct kvm_cpu_context *); +static void pkvm_psci_notify(enum pkvm_psci_notification notif, struct kvm_cpu_context *host_ctxt) +{ + if (READ_ONCE(pkvm_psci_notifier)) + pkvm_psci_notifier(notif, host_ctxt); +} + +int __pkvm_register_psci_notifier(void (*cb)(enum pkvm_psci_notification, struct kvm_cpu_context *)) +{ + return cmpxchg(&pkvm_psci_notifier, NULL, cb) ? -EBUSY : 0; +} + #define INVALID_CPU_ID UINT_MAX struct psci_boot_args { @@ -169,6 +181,8 @@ static int psci_cpu_suspend(u64 func_id, struct kvm_cpu_context *host_ctxt) boot_args->pc = pc; boot_args->r0 = r0; + pkvm_psci_notify(PKVM_PSCI_CPU_SUSPEND, host_ctxt); + /* * Will either return if shallow sleep state, or wake up into the entry * point if it is a deep sleep state. @@ -196,6 +210,8 @@ static int psci_system_suspend(u64 func_id, struct kvm_cpu_context *host_ctxt) boot_args->pc = pc; boot_args->r0 = r0; + pkvm_psci_notify(PKVM_PSCI_SYSTEM_SUSPEND, host_ctxt); + /* Will only return on error. */ return psci_call(func_id, __hyp_pa(&kvm_hyp_cpu_resume), @@ -220,6 +236,8 @@ asmlinkage void __noreturn kvm_host_psci_cpu_entry(bool is_cpu_on) if (is_cpu_on) release_boot_args(boot_args); + pkvm_psci_notify(PKVM_PSCI_CPU_ENTRY, host_ctxt); + __host_enter(host_ctxt); } From 00fca26bd78f75ed95bba3612febc862890172d9 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Wed, 7 Dec 2022 14:54:43 +0000 Subject: [PATCH 388/457] ANDROID: KVM: arm64: Allow trap handling from pKVM modules Introduce a new default trap handler for the host that can be set from modules. 
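As an illustration of how the registration hooks added in this and the preceding patches are meant to be consumed, a hypothetical EL2 module's init callback might wire up its handlers as sketched below. Only the pkvm_module_ops methods and their callback signatures come from this series; the my_*() functions are placeholders.

#include <asm/kvm_pkvm_module.h>

static bool my_host_smc_handler(struct kvm_cpu_context *host_ctxt)
{
	/* Return true to consume the SMC, false to let pKVM forward it to EL3. */
	return false;
}

static bool my_default_trap_handler(struct kvm_cpu_context *host_ctxt)
{
	/*
	 * Return true if the otherwise-unhandled host trap was dealt with
	 * here; returning false keeps the existing BUG() behaviour.
	 */
	return false;
}

/* Passed as the .init member of struct pkvm_el2_module. */
static int my_module_hyp_init(const struct pkvm_module_ops *ops)
{
	int ret;

	ret = ops->register_host_smc_handler(my_host_smc_handler);
	if (ret)
		return ret;	/* -EBUSY if a handler is already registered */

	return ops->register_default_trap_handler(my_default_trap_handler);
}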
Bug: 244543039 Bug: 245034629 Change-Id: Iaabfa44f5f2c41af51f36ed4eec8762e7c951c01 Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_pkvm_module.h | 1 + arch/arm64/kvm/hyp/include/nvhe/modules.h | 1 + arch/arm64/kvm/hyp/nvhe/hyp-main.c | 8 +++++++- arch/arm64/kvm/hyp/nvhe/modules.c | 1 + 4 files changed, 10 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/kvm_pkvm_module.h b/arch/arm64/include/asm/kvm_pkvm_module.h index ee0da3a99cbf..c19441709edb 100644 --- a/arch/arm64/include/asm/kvm_pkvm_module.h +++ b/arch/arm64/include/asm/kvm_pkvm_module.h @@ -27,6 +27,7 @@ struct pkvm_module_ops { int (*register_host_perm_fault_handler)(int (*cb)(struct kvm_cpu_context *ctxt, u64 esr, u64 addr)); int (*protect_host_page)(u64 pfn, enum kvm_pgtable_prot prot); int (*register_host_smc_handler)(bool (*cb)(struct kvm_cpu_context *)); + int (*register_default_trap_handler)(bool (*cb)(struct kvm_cpu_context *)); int (*register_illegal_abt_notifier)(void (*cb)(struct kvm_cpu_context *)); int (*register_psci_notifier)(void (*cb)(enum pkvm_psci_notification, struct kvm_cpu_context *)); }; diff --git a/arch/arm64/kvm/hyp/include/nvhe/modules.h b/arch/arm64/kvm/hyp/include/nvhe/modules.h index bde933e0861f..681fd233fd2b 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/modules.h +++ b/arch/arm64/kvm/hyp/include/nvhe/modules.h @@ -4,6 +4,7 @@ #define HCALL_UNHANDLED -1 int __pkvm_register_host_smc_handler(bool (*cb)(struct kvm_cpu_context *)); +int __pkvm_register_default_trap_handler(bool (*cb)(struct kvm_cpu_context *)); int __pkvm_register_illegal_abt_notifier(void (*cb)(struct kvm_cpu_context *)); enum pkvm_psci_notification; diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index f3b75cd4e98b..040b98c0eebd 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -41,12 +41,18 @@ DEFINE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params); void __kvm_hyp_host_forward_smc(struct kvm_cpu_context *host_ctxt); static bool (*default_host_smc_handler)(struct kvm_cpu_context *host_ctxt); +static bool (*default_trap_handler)(struct kvm_cpu_context *host_ctxt); int __pkvm_register_host_smc_handler(bool (*cb)(struct kvm_cpu_context *)) { return cmpxchg(&default_host_smc_handler, NULL, cb) ? -EBUSY : 0; } +int __pkvm_register_default_trap_handler(bool (*cb)(struct kvm_cpu_context *)) +{ + return cmpxchg(&default_trap_handler, NULL, cb) ? 
-EBUSY : 0; +} + static int pkvm_refill_memcache(struct pkvm_hyp_vcpu *hyp_vcpu) { struct pkvm_hyp_vm *hyp_vm = pkvm_hyp_vcpu_to_hyp_vm(hyp_vcpu); @@ -1326,6 +1332,6 @@ void handle_trap(struct kvm_cpu_context *host_ctxt) handle_host_mem_abort(host_ctxt); break; default: - BUG(); + BUG_ON(!READ_ONCE(default_trap_handler) || !default_trap_handler(host_ctxt)); } } diff --git a/arch/arm64/kvm/hyp/nvhe/modules.c b/arch/arm64/kvm/hyp/nvhe/modules.c index 99fa311daade..3b033b7d6c19 100644 --- a/arch/arm64/kvm/hyp/nvhe/modules.c +++ b/arch/arm64/kvm/hyp/nvhe/modules.c @@ -64,6 +64,7 @@ const struct pkvm_module_ops module_ops = { .register_host_perm_fault_handler = hyp_register_host_perm_fault_handler, .protect_host_page = hyp_protect_host_page, .register_host_smc_handler = __pkvm_register_host_smc_handler, + .register_default_trap_handler = __pkvm_register_default_trap_handler, .register_illegal_abt_notifier = __pkvm_register_illegal_abt_notifier, .register_psci_notifier = __pkvm_register_psci_notifier, }; From a6c94a925ccf512ab965e27b050c0b94e0308745 Mon Sep 17 00:00:00 2001 From: Pavankumar Kondeti Date: Wed, 9 Nov 2022 10:48:36 +0530 Subject: [PATCH 389/457] UPSTREAM: mm/madvise: fix madvise_pageout for private file mappings When MADV_PAGEOUT is called on a private file mapping VMA region, we bail out early if the process is neither owner nor write capable of the file. However, this VMA may have both private/shared clean pages and private dirty pages. The opportunity of paging out the private dirty pages (Anon pages) is missed. Fix this behavior by allowing private file mappings pageout further and perform the file access check along with PageAnon() during page walk. We observe ~10% improvement in zram usage, thus leaving more available memory on a 4GB RAM system running Android. [quic_pkondeti@quicinc.com: v2] Link: https://lkml.kernel.org/r/1669962597-27724-1-git-send-email-quic_pkondeti@quicinc.com Link: https://lkml.kernel.org/r/1667971116-12900-1-git-send-email-quic_pkondeti@quicinc.com Signed-off-by: Pavankumar Kondeti Cc: Charan Teja Kalla Cc: Minchan Kim Cc: Suren Baghdasaryan Cc: David Hildenbrand Signed-off-by: Andrew Morton Bug: 259329159 (cherry picked from commit fd3b1bc3c86ee11ba77421b00c70280605b521c6) Change-Id: I5f2d425aec94e5a75ebeaf90f9f5d7adf1975c59 Signed-off-by: Pavankumar Kondeti --- mm/madvise.c | 53 ++++++++++++++++++++++++++++++++++------------------ 1 file changed, 35 insertions(+), 18 deletions(-) diff --git a/mm/madvise.c b/mm/madvise.c index b913ba6efc10..381d11184526 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -321,6 +321,21 @@ static long madvise_willneed(struct vm_area_struct *vma, return 0; } +static inline bool can_do_file_pageout(struct vm_area_struct *vma) +{ + if (!vma->vm_file) + return false; + /* + * paging out pagecache only for non-anonymous mappings that correspond + * to the files the calling process could (if tried) open for writing; + * otherwise we'd be including shared non-exclusive mappings, which + * opens a side channel. 
+ */ + return inode_owner_or_capable(&init_user_ns, + file_inode(vma->vm_file)) || + file_permission(vma->vm_file, MAY_WRITE) == 0; +} + static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) @@ -334,10 +349,14 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, spinlock_t *ptl; struct page *page = NULL; LIST_HEAD(page_list); + bool pageout_anon_only_filter; if (fatal_signal_pending(current)) return -EINTR; + pageout_anon_only_filter = pageout && !vma_is_anonymous(vma) && + !can_do_file_pageout(vma); + #ifdef CONFIG_TRANSPARENT_HUGEPAGE if (pmd_trans_huge(*pmd)) { pmd_t orig_pmd; @@ -364,6 +383,9 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, if (page_mapcount(page) != 1) goto huge_unlock; + if (pageout_anon_only_filter && !PageAnon(page)) + goto huge_unlock; + if (next - addr != HPAGE_PMD_SIZE) { int err; @@ -432,6 +454,8 @@ regular_page: if (PageTransCompound(page)) { if (page_mapcount(page) != 1) break; + if (pageout_anon_only_filter && !PageAnon(page)) + break; get_page(page); if (!trylock_page(page)) { put_page(page); @@ -459,6 +483,9 @@ regular_page: if (!PageLRU(page) || page_mapcount(page) != 1) continue; + if (pageout_anon_only_filter && !PageAnon(page)) + continue; + VM_BUG_ON_PAGE(PageTransCompound(page), page); if (pte_young(ptent)) { @@ -553,23 +580,6 @@ static void madvise_pageout_page_range(struct mmu_gather *tlb, tlb_end_vma(tlb, vma); } -static inline bool can_do_pageout(struct vm_area_struct *vma) -{ - if (vma_is_anonymous(vma)) - return true; - if (!vma->vm_file) - return false; - /* - * paging out pagecache only for non-anonymous mappings that correspond - * to the files the calling process could (if tried) open for writing; - * otherwise we'd be including shared non-exclusive mappings, which - * opens a side channel. - */ - return inode_owner_or_capable(&init_user_ns, - file_inode(vma->vm_file)) || - file_permission(vma->vm_file, MAY_WRITE) == 0; -} - static long madvise_pageout(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start_addr, unsigned long end_addr) @@ -581,7 +591,14 @@ static long madvise_pageout(struct vm_area_struct *vma, if (!can_madv_lru_vma(vma)) return -EINVAL; - if (!can_do_pageout(vma)) + /* + * If the VMA belongs to a private file mapping, there can be private + * dirty pages which can be paged out if even this process is neither + * owner nor write capable of the file. We allow private file mappings + * further to pageout dirty anon pages. + */ + if (!vma_is_anonymous(vma) && (!can_do_file_pageout(vma) && + (vma->vm_flags & VM_MAYSHARE))) return 0; lru_add_drain(); From 30b7becfd649a55e364253b80cdf938c05c6e3e7 Mon Sep 17 00:00:00 2001 From: Lina Iyer Date: Tue, 15 Dec 2020 13:33:02 -0700 Subject: [PATCH 390/457] ANDROID: cpuidle: export cpuidle_driver_state_disabled Export cpuidle_driver_state_disabled() so that CPU idle states may be disabled at runtime for debugging CPU and cluster idle states. 
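As a sketch of the intended debugging use (not part of the patch), a test module could disable an idle state at runtime roughly as follows. This assumes cpuidle_get_driver() is reachable from module context and picks the deepest state purely for illustration.

#include <linux/cpuidle.h>
#include <linux/module.h>

static int __init idle_debug_init(void)
{
	struct cpuidle_driver *drv = cpuidle_get_driver();

	if (!drv)
		return -ENODEV;

	/* Disable the deepest idle state of the current CPU's driver. */
	cpuidle_driver_state_disabled(drv, drv->state_count - 1, true);
	return 0;
}
module_init(idle_debug_init);
MODULE_LICENSE("GPL");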
Bug: 263691252 Signed-off-by: Lina Iyer Change-Id: Id9038074d64fb6c0444d9aca68420414c3223e93 (cherry picked from commit de93734e224be27681d2367bc6275a4dc58e6dc6) (cherry picked from commit f154a239c34df6de48f5f4eab5b95755235e1d43) --- drivers/cpuidle/driver.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/cpuidle/driver.c b/drivers/cpuidle/driver.c index f70aa17e2a8e..5bfdb8663e99 100644 --- a/drivers/cpuidle/driver.c +++ b/drivers/cpuidle/driver.c @@ -385,3 +385,4 @@ unlock: mutex_unlock(&cpuidle_lock); } +EXPORT_SYMBOL_GPL(cpuidle_driver_state_disabled); From 49d7088c5fd459895e1b2780971460ab49a2fddb Mon Sep 17 00:00:00 2001 From: "Isaac J. Manjarres" Date: Fri, 16 Dec 2022 16:30:10 -0800 Subject: [PATCH 391/457] ANDROID: GKI: Source GKI_BUILD_CONFIG_FRAGMENT after setting all variables build.config.gki sources a GKI_BUILD_CONFIG_FRAGMENT before all of the variables that are considered as part of a GKI kernel build are declared. This reduces the effectiveness of a GKI_BUILD_CONFIG_FRAGMENT, as it is only able to modify a subset of the build variables. Thus, move the logic to source GKI_BUILD_CONFIG_FRAGMENT to the end of the GKI build config files to provide more flexibility for a GKI_BUILD_CONFIG_FRAGMENT. Bug: 262930113 Change-Id: I74abb45f9043acce04cb0052f54fded4340a9366 [isaacmanjarres: Modified build.config.gki.riscv64, since that file did not exist on android13-5.15.] Signed-off-by: Isaac J. Manjarres (cherry picked from commit 69fefbb3db711e543ff0676526b7d285a4d10a14) --- build.config.gki | 4 ---- build.config.gki.aarch64 | 4 ++++ build.config.gki.riscv64 | 4 ++++ build.config.gki.x86_64 | 4 ++++ 4 files changed, 12 insertions(+), 4 deletions(-) diff --git a/build.config.gki b/build.config.gki index 8a13ddd696ab..4b931d9eb333 100644 --- a/build.config.gki +++ b/build.config.gki @@ -1,6 +1,2 @@ DEFCONFIG=gki_defconfig POST_DEFCONFIG_CMDS="check_defconfig" - -if [ -n "${GKI_BUILD_CONFIG_FRAGMENT}" ]; then -source ${GKI_BUILD_CONFIG_FRAGMENT} -fi diff --git a/build.config.gki.aarch64 b/build.config.gki.aarch64 index 17dec7a82c7a..709af184d7aa 100644 --- a/build.config.gki.aarch64 +++ b/build.config.gki.aarch64 @@ -21,3 +21,7 @@ BUILD_GKI_ARTIFACTS=1 BUILD_GKI_BOOT_IMG_SIZE=67108864 BUILD_GKI_BOOT_IMG_GZ_SIZE=47185920 BUILD_GKI_BOOT_IMG_LZ4_SIZE=53477376 + +if [ -n "${GKI_BUILD_CONFIG_FRAGMENT}" ]; then +source ${GKI_BUILD_CONFIG_FRAGMENT} +fi diff --git a/build.config.gki.riscv64 b/build.config.gki.riscv64 index 0eb01c4975de..8215d16e95ba 100644 --- a/build.config.gki.riscv64 +++ b/build.config.gki.riscv64 @@ -24,3 +24,7 @@ BUILD_GKI_BOOT_IMG_LZ4_SIZE=53477376 PRE_DEFCONFIG_CMDS="mkdir -p \${OUT_DIR}/arch/riscv/configs/ && cat ${ROOT_DIR}/${KERNEL_DIR}/arch/riscv/configs/gki_defconfig ${ROOT_DIR}/${KERNEL_DIR}/arch/riscv/configs/64-bit.config ${ROOT_DIR}/${KERNEL_DIR}/arch/riscv/configs/gki.config > \${OUT_DIR}/arch/riscv/configs/${DEFCONFIG};" POST_DEFCONFIG_CMDS="" + +if [ -n "${GKI_BUILD_CONFIG_FRAGMENT}" ]; then +source ${GKI_BUILD_CONFIG_FRAGMENT} +fi diff --git a/build.config.gki.x86_64 b/build.config.gki.x86_64 index 3d81fea6aacb..93f492cabec8 100644 --- a/build.config.gki.x86_64 +++ b/build.config.gki.x86_64 @@ -9,3 +9,7 @@ BUILD_GKI_CERTIFICATION_TOOLS=1 BUILD_GKI_ARTIFACTS=1 BUILD_GKI_BOOT_IMG_SIZE=67108864 + +if [ -n "${GKI_BUILD_CONFIG_FRAGMENT}" ]; then +source ${GKI_BUILD_CONFIG_FRAGMENT} +fi From 803b66f506969637cc2fb9f0c680fe0146700022 Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Thu, 22 Dec 2022 16:18:53 +0000 Subject: [PATCH 392/457] ANDROID: 
KVM: arm64: Always declare pKVM module loading functions Move __pkvm_register_el2_call and __pkvm_load_el2_module out of the MODULE ifdef so the associated EXPORT_SYMBOL are never alone. Bug: 244543039 Bug: 244373730 Reported-by: kernel test robot Change-Id: Icdac2ccd32d09388472c6500d4af951cc23439fb Signed-off-by: Vincent Donnefort --- arch/arm64/include/asm/kvm_pkvm_module.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/arm64/include/asm/kvm_pkvm_module.h b/arch/arm64/include/asm/kvm_pkvm_module.h index c19441709edb..c76bd3276f06 100644 --- a/arch/arm64/include/asm/kvm_pkvm_module.h +++ b/arch/arm64/include/asm/kvm_pkvm_module.h @@ -49,10 +49,12 @@ struct pkvm_el2_module { int (*init)(const struct pkvm_module_ops *ops); }; -#ifdef MODULE int __pkvm_load_el2_module(struct pkvm_el2_module *mod, struct module *this, unsigned long *token); +int __pkvm_register_el2_call(dyn_hcall_t hfn, unsigned long token, + unsigned long hyp_text_kern_va); +#ifdef MODULE #define pkvm_load_el2_module(init_fn, token) \ ({ \ extern char __kvm_nvhe___hypmod_text_start[]; \ @@ -83,9 +85,6 @@ int __pkvm_load_el2_module(struct pkvm_el2_module *mod, struct module *this, __pkvm_load_el2_module(&mod, THIS_MODULE, token); \ }) -int __pkvm_register_el2_call(dyn_hcall_t hfn, unsigned long token, - unsigned long hyp_text_kern_va); - #define pkvm_register_el2_mod_call(hfn, token) \ ({ \ extern char __kvm_nvhe___hypmod_text_start[]; \ From 4350e1821b1d324904e45defa48c084c9e9208b7 Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Fri, 23 Dec 2022 10:53:16 +0000 Subject: [PATCH 393/457] ANDROID: KVM: Remove function_nocfi() leftover in pKVM modules While porting pKVM modules from 5.15 to 6.1, a now deprecated function_nocfi() has been forgotten. Bug: 244543039 Bug: 244373730 Change-Id: I00d89c661216753ef52ab6f76d118bf761c8b5d7 Signed-off-by: Vincent Donnefort --- arch/arm64/include/asm/kvm_pkvm_module.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/kvm_pkvm_module.h b/arch/arm64/include/asm/kvm_pkvm_module.h index c76bd3276f06..2fa7aa81a0c4 100644 --- a/arch/arm64/include/asm/kvm_pkvm_module.h +++ b/arch/arm64/include/asm/kvm_pkvm_module.h @@ -90,7 +90,7 @@ int __pkvm_register_el2_call(dyn_hcall_t hfn, unsigned long token, extern char __kvm_nvhe___hypmod_text_start[]; \ unsigned long hyp_text_kern_va = \ (unsigned long)__kvm_nvhe___hypmod_text_start; \ - __pkvm_register_el2_call(function_nocfi(hfn), token, \ + __pkvm_register_el2_call(hfn, token, \ hyp_text_kern_va); \ }) From 731835eae3bf37d4f7c70172990053da65a21e28 Mon Sep 17 00:00:00 2001 From: Charan Teja Reddy Date: Tue, 16 Feb 2021 13:59:45 +0530 Subject: [PATCH 394/457] ANDROID: implement wrapper for reverse migration Reverse migration is used to do the balancing the occupancy of memory zones in a node in the system whose imabalance may be caused by migration of pages to other zones by an operation, eg: hotremove and then hotadding the same memory. In this case there is a lot of free memory in newly hotadd memory which can be filled up by the previous migrated pages(as part of offline/hotremove) thus may free up some pressure in other zones of the node. 
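A rough sketch of the driver-side flow these exports enable is shown below; it is illustrative only. alloc_target() is a hypothetical callback standing in for a policy that allocates from the destination (e.g. newly hot-added) zone, and error handling is abbreviated.

#include <linux/gfp.h>
#include <linux/migrate.h>
#include <linux/mmzone.h>

static struct page *alloc_target(struct page *page, unsigned long private)
{
	return alloc_page(GFP_KERNEL);	/* placeholder allocation policy */
}

static int reverse_migrate_page(struct page *page)
{
	LIST_HEAD(pagelist);
	int ret;

	ret = isolate_anon_lru_page(page);
	if (ret)
		return ret;

	list_add_tail(&page->lru, &pagelist);
	ret = migrate_pages(&pagelist, alloc_target, NULL, 0,
			    MIGRATE_SYNC, MR_MEMORY_HOTPLUG, NULL);
	if (ret)
		/* Pages that could not be migrated are still on the list. */
		putback_movable_pages(&pagelist);

	return ret;
}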
Upstream discussion: https://lore.kernel.org/all/ee78c83d-da9b-f6d1-4f66-934b7782acfb@codeaurora.org/ Bug: 201263307 Change-Id: Ib3137dab0db66ecf6858c4077dcadb9dfd0c6b1c Signed-off-by: Charan Teja Reddy Signed-off-by: Sukadev Bhattiprolu --- include/linux/compaction.h | 8 ++++++++ include/linux/mmzone.h | 1 + mm/compaction.c | 23 +++++++++++++++++++++++ mm/migrate.c | 2 ++ mm/page_alloc.c | 18 ++++++++++++++++++ 5 files changed, 52 insertions(+) diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 52a9ff65faee..499308b3933b 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -180,6 +180,8 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order, extern void kcompactd_run(int nid); extern void kcompactd_stop(int nid); extern void wakeup_kcompactd(pg_data_t *pgdat, int order, int highest_zoneidx); +extern unsigned long isolate_and_split_free_page(struct page *page, + struct list_head *list); #else static inline void reset_isolation_suitable(pg_data_t *pgdat) @@ -224,6 +226,12 @@ static inline void wakeup_kcompactd(pg_data_t *pgdat, { } +static unsigned long isolate_and_split_free_page(struct page *page, + struct list_head *list) +{ + return 0; +} + #endif /* CONFIG_COMPACTION */ struct node; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index fb1580f99d49..f944387c2914 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1407,6 +1407,7 @@ static inline struct pglist_data *NODE_DATA(int nid) extern struct pglist_data *first_online_pgdat(void); extern struct pglist_data *next_online_pgdat(struct pglist_data *pgdat); extern struct zone *next_zone(struct zone *zone); +extern int isolate_anon_lru_page(struct page *page); /** * for_each_online_pgdat - helper macro to iterate over all online nodes diff --git a/mm/compaction.c b/mm/compaction.c index 1f6da31dd9a5..cc76fd756185 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -744,6 +744,29 @@ isolate_freepages_range(struct compact_control *cc, return pfn; } +unsigned long isolate_and_split_free_page(struct page *page, + struct list_head *list) +{ + unsigned long isolated; + unsigned int order; + + if (!PageBuddy(page)) + return 0; + + order = buddy_order(page); + isolated = __isolate_free_page(page, order); + if (!isolated) + return 0; + + set_page_private(page, order); + list_add(&page->lru, list); + + split_map_pages(list); + + return isolated; +} +EXPORT_SYMBOL_GPL(isolate_and_split_free_page); + /* Similar to reclaim, but different enough that they don't share logic */ static bool too_many_isolated(pg_data_t *pgdat) { diff --git a/mm/migrate.c b/mm/migrate.c index dff333593a8a..91e8ef362016 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -167,6 +167,7 @@ void putback_movable_pages(struct list_head *l) } } } +EXPORT_SYMBOL_GPL(putback_movable_pages); /* * Restore a potential migration pte to a working pte entry @@ -1602,6 +1603,7 @@ out: return rc; } +EXPORT_SYMBOL_GPL(migrate_pages); struct page *alloc_migration_target(struct page *page, unsigned long private) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6e60657875d3..285628e90117 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -585,6 +585,24 @@ unsigned long get_pfnblock_flags_mask(const struct page *page, { return __get_pfnblock_flags_mask(page, pfn, mask); } +EXPORT_SYMBOL_GPL(get_pfnblock_flags_mask); + +int isolate_anon_lru_page(struct page *page) +{ + int ret; + + if (!PageLRU(page) || !PageAnon(page)) + return -EINVAL; + + if (!get_page_unless_zero(page)) + return -EINVAL; + + 
ret = isolate_lru_page(page); + put_page(page); + + return ret; +} +EXPORT_SYMBOL_GPL(isolate_anon_lru_page); static __always_inline int get_pfnblock_migratetype(const struct page *page, unsigned long pfn) From 8a98feb47ec08674a15c094db2dead5b94d2190a Mon Sep 17 00:00:00 2001 From: Carlos Llamas Date: Fri, 23 Apr 2021 19:20:54 +0000 Subject: [PATCH 395/457] ANDROID: mm: compaction: fix isolate_and_split_free_page() redefinition MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Guard isolate_and_split_free_page() with CONFIG_COMPACTION. This fixes the follwoing build error as the function collides with its inline stub from the header file: mm/compaction.c:766:15: error: redefinition of ‘isolate_and_split_free_page’ 766 | unsigned long isolate_and_split_free_page(struct page *page, | ^~~~~~~~~~~~~~~~~~~~~~~~~~~ In file included from mm/compaction.c:14: ./include/linux/compaction.h:241:29: note: previous definition of ‘isolate_and_split_free_page’ was here 241 | static inline unsigned long isolate_and_split_free_page(struct page *page, | ^~~~~~~~~~~~~~~~~~~~~~~~~~~ Bug: 201263307 Change-Id: Ie8f3fedcc9d4af5cfdcfd5829377671745ab77d6 Fixes: 8cd9aa93b726 ("ANDROID: implement wrapper for reverse migration") Reported-by: kernelci.org bot Reported-by: kernel test robot Signed-off-by: Carlos Llamas Signed-off-by: Sukadev Bhattiprolu --- mm/compaction.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/compaction.c b/mm/compaction.c index cc76fd756185..7509bacad1bf 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -744,6 +744,7 @@ isolate_freepages_range(struct compact_control *cc, return pfn; } +#ifdef CONFIG_COMPACTION unsigned long isolate_and_split_free_page(struct page *page, struct list_head *list) { @@ -766,6 +767,7 @@ unsigned long isolate_and_split_free_page(struct page *page, return isolated; } EXPORT_SYMBOL_GPL(isolate_and_split_free_page); +#endif /* Similar to reclaim, but different enough that they don't share logic */ static bool too_many_isolated(pg_data_t *pgdat) From d0652b3a8559416c98b940dd6f2f6816d6fa0b27 Mon Sep 17 00:00:00 2001 From: Elliot Berman Date: Wed, 21 Apr 2021 10:10:24 -0700 Subject: [PATCH 396/457] ANDROID: inline isolate_and_split_free_page Add inline keyword to pass compilation when CONFIG_COMPACTION is not enabled. include/linux/compaction.h:241:22: warning: 'isolate_and_split_free_page' defined but not used [-Wunused-function] Bug: 175403896 Change-Id: I6bec2789a68e1b8fc9f4ea8f7267571a333d68ef Fixes: 061e34c52e0f ("ANDROID: mm: compaction: fix isolate_and_split_free_page() redefinition") Signed-off-by: Elliot Berman (cherry picked from commit b9836d4090eb918a90e99f590e470e40eb8e200e) Signed-off-by: Sukadev Bhattiprolu --- include/linux/compaction.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 499308b3933b..fb8a8f3157de 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -226,7 +226,7 @@ static inline void wakeup_kcompactd(pg_data_t *pgdat, { } -static unsigned long isolate_and_split_free_page(struct page *page, +static inline unsigned long isolate_and_split_free_page(struct page *page, struct list_head *list) { return 0; From e486eabf3a8c5c95b44aa2f540f3cc25c4d34f8b Mon Sep 17 00:00:00 2001 From: Will McVicker Date: Tue, 3 Jan 2023 15:49:19 -0800 Subject: [PATCH 397/457] ANDROID: update the BRANCH constant ... as part of setting up this branch. 
Bug: 259701619 Change-Id: I4bfee9b34d63034e563af95c1aa0696418c6ec65 Signed-off-by: Will McVicker --- build.config.constants | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build.config.constants b/build.config.constants index 3ac4d70578ec..ea2a05e143db 100644 --- a/build.config.constants +++ b/build.config.constants @@ -1,2 +1,2 @@ -BRANCH=android-mainline +BRANCH=android14-6.1 CLANG_VERSION=r475365 From 51d2f7a941229c246350ec8a978aef2635d6e381 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 30 Dec 2022 23:43:32 +0800 Subject: [PATCH 398/457] f2fs: fix to avoid NULL pointer dereference in f2fs_issue_flush() With below two cases, it will cause NULL pointer dereference when accessing SM_I(sbi)->fcc_info in f2fs_issue_flush(). a) If kthread_run() fails in f2fs_create_flush_cmd_control(), it will release SM_I(sbi)->fcc_info, - mount -o noflush_merge /dev/vda /mnt/f2fs - mount -o remount,flush_merge /dev/vda /mnt/f2fs -- kthread_run() fails - dd if=/dev/zero of=/mnt/f2fs/file bs=4k count=1 conv=fsync b) we will never allocate memory for SM_I(sbi)->fcc_info w/ below testcase, - mount -o ro /dev/vda /mnt/f2fs - mount -o rw,remount /dev/vda /mnt/f2fs - dd if=/dev/zero of=/mnt/f2fs/file bs=4k count=1 conv=fsync In order to fix this issue, let change as below: - fix error path handling in f2fs_create_flush_cmd_control(). - allocate SM_I(sbi)->fcc_info even if readonly is on. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index a9099a754dd2..661cbcd5ebad 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -663,8 +663,7 @@ init_thread: if (IS_ERR(fcc->f2fs_issue_flush)) { int err = PTR_ERR(fcc->f2fs_issue_flush); - kfree(fcc); - SM_I(sbi)->fcc_info = NULL; + fcc->f2fs_issue_flush = NULL; return err; } @@ -5138,11 +5137,9 @@ int f2fs_build_segment_manager(struct f2fs_sb_info *sbi) init_f2fs_rwsem(&sm_info->curseg_lock); - if (!f2fs_readonly(sbi->sb)) { - err = f2fs_create_flush_cmd_control(sbi); - if (err) - return err; - } + err = f2fs_create_flush_cmd_control(sbi); + if (err) + return err; err = create_discard_cmd_control(sbi); if (err) From 48bab6221f5b904ee355aa92a288c9920e61d22f Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 16 Dec 2022 14:05:44 -0800 Subject: [PATCH 399/457] f2fs: initialize extent_cache parameter This can avoid confusing tracepoint values. 
Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- fs/f2fs/extent_cache.c | 2 +- fs/f2fs/file.c | 2 +- fs/f2fs/segment.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 6e43e19c7d1c..97e816590cd9 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2183,7 +2183,7 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, sector_t last_block_in_file; const unsigned blocksize = blks_to_bytes(inode, 1); struct decompress_io_ctx *dic = NULL; - struct extent_info ei = {0, }; + struct extent_info ei = {}; bool from_dnode = true; int i; int ret = 0; diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 1bd38a78ebba..3aa2f8296045 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -938,7 +938,7 @@ out: static void __update_extent_cache(struct dnode_of_data *dn, enum extent_type type) { - struct extent_info ei; + struct extent_info ei = {}; if (!__may_extent_tree(dn->inode, type)) return; diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 56c23b5e9d65..d8d8773fb819 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2559,7 +2559,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, struct f2fs_map_blocks map = { .m_next_extent = NULL, .m_seg_type = NO_CHECK_TYPE, .m_may_create = false }; - struct extent_info ei = {0, }; + struct extent_info ei = {}; pgoff_t pg_start, pg_end, next_pgofs; unsigned int blk_per_seg = sbi->blocks_per_seg; unsigned int total = 0, sec_num; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 661cbcd5ebad..0f457f5f524f 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3160,7 +3160,7 @@ static int __get_segment_type_4(struct f2fs_io_info *fio) static int __get_age_segment_type(struct inode *inode, pgoff_t pgofs) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct extent_info ei; + struct extent_info ei = {}; if (f2fs_lookup_age_extent_cache(inode, pgofs, &ei)) { if (!ei.age) From e982e6c834ba5a66be83f71d2ba5db2ee0f6a2d3 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 16 Dec 2022 14:41:54 -0800 Subject: [PATCH 400/457] f2fs: don't mix to use union values in extent_info Let's explicitly use the defined values in block_age case only. Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 3aa2f8296045..cc3fed04dd6f 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -881,7 +881,8 @@ static unsigned long long __calculate_block_age(unsigned long long new, } /* This returns a new age and allocated blocks in ei */ -static int __get_new_block_age(struct inode *inode, struct extent_info *ei) +static int __get_new_block_age(struct inode *inode, struct extent_info *ei, + block_t blkaddr) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); loff_t f_size = i_size_read(inode); @@ -894,7 +895,7 @@ static int __get_new_block_age(struct inode *inode, struct extent_info *ei) * block here. 
*/ if ((f_size >> PAGE_SHIFT) == ei->fofs && f_size & (PAGE_SIZE - 1) && - ei->blk == NEW_ADDR) + blkaddr == NEW_ADDR) return -EINVAL; if (__lookup_extent_tree(inode, ei->fofs, ei, EX_BLOCK_AGE)) { @@ -915,14 +916,14 @@ static int __get_new_block_age(struct inode *inode, struct extent_info *ei) return 0; } - f2fs_bug_on(sbi, ei->blk == NULL_ADDR); + f2fs_bug_on(sbi, blkaddr == NULL_ADDR); /* the data block was allocated for the first time */ - if (ei->blk == NEW_ADDR) + if (blkaddr == NEW_ADDR) goto out; - if (__is_valid_data_blkaddr(ei->blk) && - !f2fs_is_valid_blkaddr(sbi, ei->blk, DATA_GENERIC_ENHANCE)) { + if (__is_valid_data_blkaddr(blkaddr) && + !f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE)) { f2fs_bug_on(sbi, 1); return -EINVAL; } @@ -953,8 +954,7 @@ static void __update_extent_cache(struct dnode_of_data *dn, enum extent_type typ else ei.blk = dn->data_blkaddr; } else if (type == EX_BLOCK_AGE) { - ei.blk = dn->data_blkaddr; - if (__get_new_block_age(dn->inode, &ei)) + if (__get_new_block_age(dn->inode, &ei, dn->data_blkaddr)) return; } __update_extent_tree_range(dn->inode, &ei, type); From 5336f0776253ea00da46487386b2f1f6c40069b3 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 16 Dec 2022 16:36:36 -0800 Subject: [PATCH 401/457] f2fs: should use a temp extent_info for lookup Otherwise, __lookup_extent_tree() will override the given extent_info which will be used by caller. Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index cc3fed04dd6f..7b191ff65631 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -888,6 +888,7 @@ static int __get_new_block_age(struct inode *inode, struct extent_info *ei, loff_t f_size = i_size_read(inode); unsigned long long cur_blocks = atomic64_read(&sbi->allocated_data_blocks); + struct extent_info tei = *ei; /* only fofs and len are valid */ /* * When I/O is not aligned to a PAGE_SIZE, update will happen to the last @@ -898,17 +899,17 @@ static int __get_new_block_age(struct inode *inode, struct extent_info *ei, blkaddr == NEW_ADDR) return -EINVAL; - if (__lookup_extent_tree(inode, ei->fofs, ei, EX_BLOCK_AGE)) { + if (__lookup_extent_tree(inode, ei->fofs, &tei, EX_BLOCK_AGE)) { unsigned long long cur_age; - if (cur_blocks >= ei->last_blocks) - cur_age = cur_blocks - ei->last_blocks; + if (cur_blocks >= tei.last_blocks) + cur_age = cur_blocks - tei.last_blocks; else /* allocated_data_blocks overflow */ - cur_age = ULLONG_MAX - ei->last_blocks + cur_blocks; + cur_age = ULLONG_MAX - tei.last_blocks + cur_blocks; - if (ei->age) - ei->age = __calculate_block_age(cur_age, ei->age); + if (tei.age) + ei->age = __calculate_block_age(cur_age, tei.age); else ei->age = cur_age; ei->last_blocks = cur_blocks; From 628815291f91af92bb6fc9339359daad8f48f72c Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 21 Dec 2022 16:14:10 -0800 Subject: [PATCH 402/457] f2fs: let's avoid panic if extent_tree is not created This patch avoids the below panic. 
pc : __lookup_extent_tree+0xd8/0x760 lr : f2fs_do_write_data_page+0x104/0x87c sp : ffffffc010cbb3c0 x29: ffffffc010cbb3e0 x28: 0000000000000000 x27: ffffff8803e7f020 x26: ffffff8803e7ed40 x25: ffffff8803e7f020 x24: ffffffc010cbb460 x23: ffffffc010cbb480 x22: 0000000000000000 x21: 0000000000000000 x20: ffffffff22e90900 x19: 0000000000000000 x18: ffffffc010c5d080 x17: 0000000000000000 x16: 0000000000000020 x15: ffffffdb1acdbb88 x14: ffffff888759e2b0 x13: 0000000000000000 x12: ffffff802da49000 x11: 000000000a001200 x10: ffffff8803e7ed40 x9 : ffffff8023195800 x8 : ffffff802da49078 x7 : 0000000000000001 x6 : 0000000000000000 x5 : 0000000000000006 x4 : ffffffc010cbba28 x3 : 0000000000000000 x2 : ffffffc010cbb480 x1 : 0000000000000000 x0 : ffffff8803e7ed40 Call trace: __lookup_extent_tree+0xd8/0x760 f2fs_do_write_data_page+0x104/0x87c f2fs_write_single_data_page+0x420/0xb60 f2fs_write_cache_pages+0x418/0xb1c __f2fs_write_data_pages+0x428/0x58c f2fs_write_data_pages+0x30/0x40 do_writepages+0x88/0x190 __writeback_single_inode+0x48/0x448 writeback_sb_inodes+0x468/0x9e8 __writeback_inodes_wb+0xb8/0x2a4 wb_writeback+0x33c/0x740 wb_do_writeback+0x2b4/0x400 wb_workfn+0xe4/0x34c process_one_work+0x24c/0x5bc worker_thread+0x3e8/0xa50 kthread+0x150/0x1b4 Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 7b191ff65631..342af24b2f8c 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -546,7 +546,8 @@ static bool __lookup_extent_tree(struct inode *inode, pgoff_t pgofs, struct extent_node *en; bool ret = false; - f2fs_bug_on(sbi, !et); + if (!et) + return false; trace_f2fs_lookup_extent_tree_start(inode, pgofs, type); From 66d68a50dbcdf62ab36dd977867685f7df7fcbd7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 14 Nov 2022 05:29:42 +0100 Subject: [PATCH 403/457] BACKPORT: blk-crypto: don't use struct request_queue for public interfaces Switch all public blk-crypto interfaces to use struct block_device arguments to specify the device they operate on instead of th request_queue, which is a block layer implementation detail. Signed-off-by: Christoph Hellwig Reviewed-by: Eric Biggers Link: https://lore.kernel.org/r/20221114042944.1009870-2-hch@lst.de Signed-off-by: Jens Axboe (cherry picked from commit fce3caea0f241f5d34855c82c399d5e0e2d91f07) (resolved conflict in blk_crypto_config_supported()) Change-Id: Ifde7cf1c8a2a5ddfb2fde4e5fb118269a3bfcdb0 Signed-off-by: Eric Biggers --- Documentation/block/inline-encryption.rst | 12 ++++++------ block/blk-crypto.c | 24 +++++++++++++---------- drivers/md/dm-table.c | 2 +- fs/crypto/inline_crypt.c | 8 +++----- include/linux/blk-crypto.h | 11 ++++------- 5 files changed, 28 insertions(+), 29 deletions(-) diff --git a/Documentation/block/inline-encryption.rst b/Documentation/block/inline-encryption.rst index c2bb3c5fc41a..7ab203be0117 100644 --- a/Documentation/block/inline-encryption.rst +++ b/Documentation/block/inline-encryption.rst @@ -142,7 +142,7 @@ Therefore, we also introduce *blk-crypto-fallback*, which is an implementation of inline encryption using the kernel crypto API. blk-crypto-fallback is built into the block layer, so it works on any block device without any special setup. 
Essentially, when a bio with an encryption context is submitted to a -request_queue that doesn't support that encryption context, the block layer will +block_device that doesn't support that encryption context, the block layer will handle en/decryption of the bio using blk-crypto-fallback. For encryption, the data cannot be encrypted in-place, as callers usually rely @@ -187,7 +187,7 @@ API presented to users of the block layer ``blk_crypto_config_supported()`` allows users to check ahead of time whether inline encryption with particular crypto settings will work on a particular -request_queue -- either via hardware or via blk-crypto-fallback. This function +block_device -- either via hardware or via blk-crypto-fallback. This function takes in a ``struct blk_crypto_config`` which is like blk_crypto_key, but omits the actual bytes of the key and instead just contains the algorithm, data unit size, etc. This function can be useful if blk-crypto-fallback is disabled. @@ -195,7 +195,7 @@ size, etc. This function can be useful if blk-crypto-fallback is disabled. ``blk_crypto_init_key()`` allows users to initialize a blk_crypto_key. Users must call ``blk_crypto_start_using_key()`` before actually starting to use -a blk_crypto_key on a request_queue (even if ``blk_crypto_config_supported()`` +a blk_crypto_key on a block_device (even if ``blk_crypto_config_supported()`` was called earlier). This is needed to initialize blk-crypto-fallback if it will be needed. This must not be called from the data path, as this may have to allocate resources, which may deadlock in that case. @@ -207,7 +207,7 @@ for en/decryption. Users don't need to worry about freeing the bio_crypt_ctx later, as that happens automatically when the bio is freed or reset. Finally, when done using inline encryption with a blk_crypto_key on a -request_queue, users must call ``blk_crypto_evict_key()``. This ensures that +block_device, users must call ``blk_crypto_evict_key()``. This ensures that the key is evicted from all keyslots it may be programmed into and unlinked from any kernel data structures it may be linked into. @@ -221,9 +221,9 @@ as follows: 5. ``blk_crypto_evict_key()`` (after all I/O has completed) 6. Zeroize the blk_crypto_key (this has no dedicated function) -If a blk_crypto_key is being used on multiple request_queues, then +If a blk_crypto_key is being used on multiple block_devices, then ``blk_crypto_config_supported()`` (if used), ``blk_crypto_start_using_key()``, -and ``blk_crypto_evict_key()`` must be called on each request_queue. +and ``blk_crypto_evict_key()`` must be called on each block_device. API presented to device drivers =============================== diff --git a/block/blk-crypto.c b/block/blk-crypto.c index c37a8f1f7b4b..bec643f92830 100644 --- a/block/blk-crypto.c +++ b/block/blk-crypto.c @@ -377,22 +377,23 @@ EXPORT_SYMBOL_GPL(blk_crypto_init_key); /* * Check if bios with @cfg can be en/decrypted by blk-crypto (i.e. either the - * request queue it's submitted to supports inline crypto, or the + * block_device it's submitted to supports inline crypto, or the * blk-crypto-fallback is enabled and supports the cfg). 
*/ -bool blk_crypto_config_supported(struct request_queue *q, +bool blk_crypto_config_supported(struct block_device *bdev, const struct blk_crypto_config *cfg) { if (IS_ENABLED(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) && cfg->key_type == BLK_CRYPTO_KEY_TYPE_STANDARD) return true; - return __blk_crypto_cfg_supported(q->crypto_profile, cfg); + return __blk_crypto_cfg_supported(bdev_get_queue(bdev)->crypto_profile, + cfg); } /** * blk_crypto_start_using_key() - Start using a blk_crypto_key on a device + * @bdev: block device to operate on * @key: A key to use on the device - * @q: the request queue for the device * * Upper layers must call this function to ensure that either the hardware * supports the key's crypto settings, or the crypto API fallback has transforms @@ -404,10 +405,11 @@ bool blk_crypto_config_supported(struct request_queue *q, * blk-crypto-fallback is either disabled or the needed algorithm * is disabled in the crypto API; or another -errno code. */ -int blk_crypto_start_using_key(const struct blk_crypto_key *key, - struct request_queue *q) +int blk_crypto_start_using_key(struct block_device *bdev, + const struct blk_crypto_key *key) { - if (__blk_crypto_cfg_supported(q->crypto_profile, &key->crypto_cfg)) + if (__blk_crypto_cfg_supported(bdev_get_queue(bdev)->crypto_profile, + &key->crypto_cfg)) return 0; if (key->crypto_cfg.key_type != BLK_CRYPTO_KEY_TYPE_STANDARD) { pr_warn_once("tried to use wrapped key, but hardware doesn't support it\n"); @@ -420,7 +422,7 @@ EXPORT_SYMBOL_GPL(blk_crypto_start_using_key); /** * blk_crypto_evict_key() - Evict a key from any inline encryption hardware * it may have been programmed into - * @q: The request queue who's associated inline encryption hardware this key + * @bdev: The block_device who's associated inline encryption hardware this key * might have been programmed into * @key: The key to evict * @@ -430,14 +432,16 @@ EXPORT_SYMBOL_GPL(blk_crypto_start_using_key); * * Return: 0 on success or if the key wasn't in any keyslot; -errno on error. */ -int blk_crypto_evict_key(struct request_queue *q, +int blk_crypto_evict_key(struct block_device *bdev, const struct blk_crypto_key *key) { + struct request_queue *q = bdev_get_queue(bdev); + if (__blk_crypto_cfg_supported(q->crypto_profile, &key->crypto_cfg)) return __blk_crypto_evict_key(q->crypto_profile, key); /* - * If the request_queue didn't support the key, then blk-crypto-fallback + * If the block_device didn't support the key, then blk-crypto-fallback * may have been used, so try to evict the key from blk-crypto-fallback. */ return blk_crypto_fallback_evict_key(key); diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index f53a64c81d23..77c7bdc5409b 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1215,7 +1215,7 @@ static int dm_keyslot_evict_callback(struct dm_target *ti, struct dm_dev *dev, struct dm_keyslot_evict_args *args = data; int err; - err = blk_crypto_evict_key(bdev_get_queue(dev->bdev), args->key); + err = blk_crypto_evict_key(dev->bdev, args->key); if (!args->err) args->err = err; /* Always try to evict the key from all devices. 
*/ diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c index 5a6abb8c1470..9d50514b71ec 100644 --- a/fs/crypto/inline_crypt.c +++ b/fs/crypto/inline_crypt.c @@ -143,8 +143,7 @@ int fscrypt_select_encryption_impl(struct fscrypt_info *ci, return PTR_ERR(devs); for (i = 0; i < num_devs; i++) { - if (!blk_crypto_config_supported(bdev_get_queue(devs[i]), - &crypto_cfg)) + if (!blk_crypto_config_supported(devs[i], &crypto_cfg)) goto out_free_devs; } @@ -193,8 +192,7 @@ int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key, goto fail; } for (i = 0; i < num_devs; i++) { - err = blk_crypto_start_using_key(blk_key, - bdev_get_queue(devs[i])); + err = blk_crypto_start_using_key(devs[i], blk_key); if (err) break; } @@ -233,7 +231,7 @@ void fscrypt_destroy_inline_crypt_key(struct super_block *sb, devs = fscrypt_get_devices(sb, &num_devs); if (!IS_ERR(devs)) { for (i = 0; i < num_devs; i++) - blk_crypto_evict_key(bdev_get_queue(devs[i]), blk_key); + blk_crypto_evict_key(devs[i], blk_key); kfree(devs); } kfree_sensitive(blk_key); diff --git a/include/linux/blk-crypto.h b/include/linux/blk-crypto.h index 812e5172ff56..f25c6317a6f7 100644 --- a/include/linux/blk-crypto.h +++ b/include/linux/blk-crypto.h @@ -125,9 +125,6 @@ struct bio_crypt_ctx { #include #include -struct request; -struct request_queue; - #ifdef CONFIG_BLK_INLINE_ENCRYPTION static inline bool bio_has_crypt_ctx(struct bio *bio) @@ -150,13 +147,13 @@ int blk_crypto_init_key(struct blk_crypto_key *blk_key, unsigned int dun_bytes, unsigned int data_unit_size); -int blk_crypto_start_using_key(const struct blk_crypto_key *key, - struct request_queue *q); +int blk_crypto_start_using_key(struct block_device *bdev, + const struct blk_crypto_key *key); -int blk_crypto_evict_key(struct request_queue *q, +int blk_crypto_evict_key(struct block_device *bdev, const struct blk_crypto_key *key); -bool blk_crypto_config_supported(struct request_queue *q, +bool blk_crypto_config_supported(struct block_device *bdev, const struct blk_crypto_config *cfg); #else /* CONFIG_BLK_INLINE_ENCRYPTION */ From 121484d910bbf1190c87b876e32355433cae2839 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 14 Nov 2022 05:29:43 +0100 Subject: [PATCH 404/457] BACKPORT: blk-crypto: add a blk_crypto_config_supported_natively helper Add a blk_crypto_config_supported_natively helper that wraps __blk_crypto_cfg_supported to retrieve the crypto_profile from the request queue. With this fscrypt can stop including blk-crypto-profile.h and rely on the public consumer interface in blk-crypto.h. Signed-off-by: Christoph Hellwig Reviewed-by: Eric Biggers Link: https://lore.kernel.org/r/20221114042944.1009870-3-hch@lst.de Signed-off-by: Jens Axboe (cherry picked from commit 6715c98b6cf003f26b1b2f655393134e9d999a05) (resolved conflict in blk_crypto_config_supported()) Change-Id: I40c4ab6bd9a108661c40c837227b6aed64685ae7 Signed-off-by: Eric Biggers --- block/blk-crypto.c | 21 ++++++++++++--------- fs/crypto/inline_crypt.c | 6 ++---- include/linux/blk-crypto.h | 2 ++ 3 files changed, 16 insertions(+), 13 deletions(-) diff --git a/block/blk-crypto.c b/block/blk-crypto.c index bec643f92830..bf28d3b31cba 100644 --- a/block/blk-crypto.c +++ b/block/blk-crypto.c @@ -274,7 +274,6 @@ bool __blk_crypto_bio_prep(struct bio **bio_ptr) { struct bio *bio = *bio_ptr; const struct blk_crypto_key *bc_key = bio->bi_crypt_context->bc_key; - struct blk_crypto_profile *profile; /* Error if bio has no data. 
*/ if (WARN_ON_ONCE(!bio_has_data(bio))) { @@ -291,10 +290,9 @@ bool __blk_crypto_bio_prep(struct bio **bio_ptr) * Success if device supports the encryption context, or if we succeeded * in falling back to the crypto API. */ - profile = bdev_get_queue(bio->bi_bdev)->crypto_profile; - if (__blk_crypto_cfg_supported(profile, &bc_key->crypto_cfg)) + if (blk_crypto_config_supported_natively(bio->bi_bdev, + &bc_key->crypto_cfg)) return true; - if (blk_crypto_fallback_bio_prep(bio_ptr)) return true; fail: @@ -375,6 +373,13 @@ int blk_crypto_init_key(struct blk_crypto_key *blk_key, } EXPORT_SYMBOL_GPL(blk_crypto_init_key); +bool blk_crypto_config_supported_natively(struct block_device *bdev, + const struct blk_crypto_config *cfg) +{ + return __blk_crypto_cfg_supported(bdev_get_queue(bdev)->crypto_profile, + cfg); +} + /* * Check if bios with @cfg can be en/decrypted by blk-crypto (i.e. either the * block_device it's submitted to supports inline crypto, or the @@ -386,8 +391,7 @@ bool blk_crypto_config_supported(struct block_device *bdev, if (IS_ENABLED(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) && cfg->key_type == BLK_CRYPTO_KEY_TYPE_STANDARD) return true; - return __blk_crypto_cfg_supported(bdev_get_queue(bdev)->crypto_profile, - cfg); + return blk_crypto_config_supported_natively(bdev, cfg); } /** @@ -408,8 +412,7 @@ bool blk_crypto_config_supported(struct block_device *bdev, int blk_crypto_start_using_key(struct block_device *bdev, const struct blk_crypto_key *key) { - if (__blk_crypto_cfg_supported(bdev_get_queue(bdev)->crypto_profile, - &key->crypto_cfg)) + if (blk_crypto_config_supported_natively(bdev, &key->crypto_cfg)) return 0; if (key->crypto_cfg.key_type != BLK_CRYPTO_KEY_TYPE_STANDARD) { pr_warn_once("tried to use wrapped key, but hardware doesn't support it\n"); @@ -437,7 +440,7 @@ int blk_crypto_evict_key(struct block_device *bdev, { struct request_queue *q = bdev_get_queue(bdev); - if (__blk_crypto_cfg_supported(q->crypto_profile, &key->crypto_cfg)) + if (blk_crypto_config_supported_natively(bdev, &key->crypto_cfg)) return __blk_crypto_evict_key(q->crypto_profile, key); /* diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c index 9d50514b71ec..77d0d86c578c 100644 --- a/fs/crypto/inline_crypt.c +++ b/fs/crypto/inline_crypt.c @@ -12,7 +12,7 @@ * provides the key and IV to use. 
*/ -#include +#include #include #include #include @@ -77,10 +77,8 @@ static void fscrypt_log_blk_crypto_impl(struct fscrypt_mode *mode, unsigned int i; for (i = 0; i < num_devs; i++) { - struct request_queue *q = bdev_get_queue(devs[i]); - if (!IS_ENABLED(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) || - __blk_crypto_cfg_supported(q->crypto_profile, cfg)) { + blk_crypto_config_supported_natively(devs[i], cfg)) { if (!xchg(&mode->logged_blk_crypto_native, 1)) pr_info("fscrypt: %s using blk-crypto (native)\n", mode->friendly_name); diff --git a/include/linux/blk-crypto.h b/include/linux/blk-crypto.h index f25c6317a6f7..f2b269d695f1 100644 --- a/include/linux/blk-crypto.h +++ b/include/linux/blk-crypto.h @@ -153,6 +153,8 @@ int blk_crypto_start_using_key(struct block_device *bdev, int blk_crypto_evict_key(struct block_device *bdev, const struct blk_crypto_key *key); +bool blk_crypto_config_supported_natively(struct block_device *bdev, + const struct blk_crypto_config *cfg); bool blk_crypto_config_supported(struct block_device *bdev, const struct blk_crypto_config *cfg); From fb321781a52bbfbc53d3042638b06f7a4caa3758 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 14 Nov 2022 05:29:44 +0100 Subject: [PATCH 405/457] UPSTREAM: blk-crypto: move internal only declarations to blk-crypto-internal.h blk_crypto_get_keyslot, blk_crypto_put_keyslot, __blk_crypto_evict_key and __blk_crypto_cfg_supported are only used internally by the blk-crypto code, so move the out of blk-crypto-profile.h, which is included by drivers that supply blk-crypto functionality. Signed-off-by: Christoph Hellwig Reviewed-by: Eric Biggers Link: https://lore.kernel.org/r/20221114042944.1009870-4-hch@lst.de Signed-off-by: Jens Axboe (cherry picked from commit 3569788c08235c6f3e9e6ca724b2df44787ff487) Change-Id: I80b07a1c3b6e6f41ffe48adbdb27a3ca4480ff75 Signed-off-by: Eric Biggers --- block/blk-crypto-internal.h | 12 ++++++++++++ include/linux/blk-crypto-profile.h | 12 ------------ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/block/blk-crypto-internal.h b/block/blk-crypto-internal.h index 5958ad9cc7f6..633b4b07704b 100644 --- a/block/blk-crypto-internal.h +++ b/block/blk-crypto-internal.h @@ -66,6 +66,18 @@ static inline bool blk_crypto_rq_is_encrypted(struct request *rq) return rq->crypt_ctx; } +blk_status_t blk_crypto_get_keyslot(struct blk_crypto_profile *profile, + const struct blk_crypto_key *key, + struct blk_crypto_keyslot **slot_ptr); + +void blk_crypto_put_keyslot(struct blk_crypto_keyslot *slot); + +int __blk_crypto_evict_key(struct blk_crypto_profile *profile, + const struct blk_crypto_key *key); + +bool __blk_crypto_cfg_supported(struct blk_crypto_profile *profile, + const struct blk_crypto_config *cfg); + #else /* CONFIG_BLK_INLINE_ENCRYPTION */ static inline int blk_crypto_sysfs_register(struct request_queue *q) diff --git a/include/linux/blk-crypto-profile.h b/include/linux/blk-crypto-profile.h index 706de8c91ec9..189f9b65b764 100644 --- a/include/linux/blk-crypto-profile.h +++ b/include/linux/blk-crypto-profile.h @@ -159,18 +159,6 @@ int devm_blk_crypto_profile_init(struct device *dev, unsigned int blk_crypto_keyslot_index(struct blk_crypto_keyslot *slot); -blk_status_t blk_crypto_get_keyslot(struct blk_crypto_profile *profile, - const struct blk_crypto_key *key, - struct blk_crypto_keyslot **slot_ptr); - -void blk_crypto_put_keyslot(struct blk_crypto_keyslot *slot); - -bool __blk_crypto_cfg_supported(struct blk_crypto_profile *profile, - const struct blk_crypto_config *cfg); - -int 
__blk_crypto_evict_key(struct blk_crypto_profile *profile, - const struct blk_crypto_key *key); - void blk_crypto_reprogram_all_keys(struct blk_crypto_profile *profile); void blk_crypto_profile_destroy(struct blk_crypto_profile *profile); From 3ec40c5b27de019cfb2b0be5d41dae56a2922c67 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 23 Nov 2022 09:29:23 -0800 Subject: [PATCH 406/457] UPSTREAM: blk-crypto: Add a missing include directive MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Allow the compiler to verify consistency of function declarations and function definitions. This patch fixes the following sparse errors: block/blk-crypto-profile.c:241:14: error: no previous prototype for ‘blk_crypto_get_keyslot’ [-Werror=missing-prototypes] 241 | blk_status_t blk_crypto_get_keyslot(struct blk_crypto_profile *profile, | ^~~~~~~~~~~~~~~~~~~~~~ block/blk-crypto-profile.c:318:6: error: no previous prototype for ‘blk_crypto_put_keyslot’ [-Werror=missing-prototypes] 318 | void blk_crypto_put_keyslot(struct blk_crypto_keyslot *slot) | ^~~~~~~~~~~~~~~~~~~~~~ block/blk-crypto-profile.c:344:6: error: no previous prototype for ‘__blk_crypto_cfg_supported’ [-Werror=missing-prototypes] 344 | bool __blk_crypto_cfg_supported(struct blk_crypto_profile *profile, | ^~~~~~~~~~~~~~~~~~~~~~~~~~ block/blk-crypto-profile.c:373:5: error: no previous prototype for ‘__blk_crypto_evict_key’ [-Werror=missing-prototypes] 373 | int __blk_crypto_evict_key(struct blk_crypto_profile *profile, | ^~~~~~~~~~~~~~~~~~~~~~ Cc: Eric Biggers Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20221123172923.434339-1-bvanassche@acm.org Signed-off-by: Jens Axboe (cherry picked from commit 85168d416e5d3184b77dbec8fee75c9439894afa) Change-Id: I797a99bc00c114dc86e74e1d5b1f7866f7e64a10 Signed-off-by: Eric Biggers --- block/blk-crypto-profile.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/blk-crypto-profile.c b/block/blk-crypto-profile.c index 1ea9ec06096a..cf4fd75e465e 100644 --- a/block/blk-crypto-profile.c +++ b/block/blk-crypto-profile.c @@ -32,6 +32,7 @@ #include #include #include +#include "blk-crypto-internal.h" struct blk_crypto_keyslot { atomic_t slot_refs; From 2ef8812e375828afb246aae8af26f73a6cc2f54b Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 16 Dec 2022 23:30:40 +0000 Subject: [PATCH 407/457] ANDROID: dm-default-key: update for blk-crypto changes The prototypes of blk_crypto_evict_key() and blk_crypto_start_using_key() changed, so update the callers in dm-default-key which is not upstream. 
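For other out-of-tree callers making the same conversion, the pattern is simply to pass the struct block_device instead of its request_queue. A minimal sketch, mirroring the dm-default-key changes above (the wrapper functions are hypothetical; the two blk-crypto calls follow the new prototypes):

#include <linux/blk-crypto.h>
#include <linux/printk.h>

static int start_inline_crypto(struct block_device *bdev,
			       struct blk_crypto_key *key)
{
	/* The key must already have been set up with blk_crypto_init_key(). */
	return blk_crypto_start_using_key(bdev, key);
}

static void stop_inline_crypto(struct block_device *bdev,
			       struct blk_crypto_key *key)
{
	int err = blk_crypto_evict_key(bdev, key);

	/* -ENOKEY just means the key was never programmed into this device. */
	if (err && err != -ENOKEY)
		pr_warn("failed to evict inline crypto key: %d\n", err);
}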
Bug: 160885805 Change-Id: Ie39a298d8aca77c042f11bbfa25fd9bf50593c52 Signed-off-by: Eric Biggers --- drivers/md/dm-default-key.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/md/dm-default-key.c b/drivers/md/dm-default-key.c index d19f9b0fb8c1..577c5fdb6bbc 100644 --- a/drivers/md/dm-default-key.c +++ b/drivers/md/dm-default-key.c @@ -68,8 +68,7 @@ static void default_key_dtr(struct dm_target *ti) int err; if (dkc->dev) { - err = blk_crypto_evict_key(bdev_get_queue(dkc->dev->bdev), - &dkc->key); + err = blk_crypto_evict_key(dkc->dev->bdev, &dkc->key); if (err && err != -ENOKEY) DMWARN("Failed to evict crypto key: %d", err); dm_put_device(ti, dkc->dev); @@ -245,8 +244,7 @@ static int default_key_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad; } - err = blk_crypto_start_using_key(&dkc->key, - bdev_get_queue(dkc->dev->bdev)); + err = blk_crypto_start_using_key(dkc->dev->bdev, &dkc->key); if (err) { ti->error = "Error starting to use blk-crypto"; goto bad; From 139dbaa221e23ec77a884c7314e1d06d79dc39db Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 16 Dec 2022 22:52:12 +0000 Subject: [PATCH 408/457] ANDROID: update "block: add basic hardware-wrapped key support" to v7 The hardware-wrapped key support in this branch is based on my patch "[RFC PATCH v3 1/3] block: add basic hardware-wrapped key support" (https://lore.kernel.org/all/20211021181608.54127-2-ebiggers@kernel.org). I've since made several updates to that patch and it is now at v7. This commit brings in the updates from v3 to v7. The main change is making blk_crypto_derive_sw_secret() operate on a struct block_device, and adding blk_crypto_hw_wrapped_keys_compatible(). This aligns with changes upstream in v6.1 and v6.2 that removed block-layer internal structures from the API that blk-crypto exposes to upper layers. There's also a slight change in prototype for ->derive_sw_secret, so a couple out-of-tree drivers will need to be updated, but people maintaining out-of-tree drivers know what they are dealing with anyway. Bug: 160883801 Link: https://lore.kernel.org/r/20221216203636.81491-2-ebiggers@kernel.org Change-Id: I0f285c11c2764064cd4a9d6eac0089099a9601ed Signed-off-by: Eric Biggers --- Documentation/block/inline-encryption.rst | 11 ++-- block/blk-crypto-fallback.c | 2 +- block/blk-crypto-profile.c | 66 ++++++++++++++--------- block/blk-crypto.c | 7 ++- include/linux/blk-crypto-profile.h | 12 ++--- include/linux/blk-crypto.h | 11 +++- 6 files changed, 66 insertions(+), 43 deletions(-) diff --git a/Documentation/block/inline-encryption.rst b/Documentation/block/inline-encryption.rst index 7ab203be0117..4fd2b1171301 100644 --- a/Documentation/block/inline-encryption.rst +++ b/Documentation/block/inline-encryption.rst @@ -388,8 +388,8 @@ such as in file-based encryption. Key wrapping is a commonly used technique.) The key which wraps (encrypts) hardware-wrapped keys is a hardware-internal key that is never exposed to software; it is either a persistent key (a "long-term wrapping key") or a per-boot key (an "ephemeral wrapping key"). The long-term -wrapped form of the key is what is initially unlocked, but it is discarded as -soon as it is converted into an ephemerally-wrapped key. In-use +wrapped form of the key is what is initially unlocked, but it is erased from +memory as soon as it is converted into an ephemerally-wrapped key. In-use hardware-wrapped keys are always ephemerally-wrapped, not long-term wrapped. 
As inline encryption hardware can only be used to encrypt/decrypt data on-disk, @@ -442,8 +442,8 @@ The components are: for cryptographic applications that require up to a 256-bit security strength. Some use cases (e.g. full-disk encryption) won't require the software secret. -Example: in the case of fscrypt, the fscrypt master key (the key used to unlock -a particular set of encrypted directories) is made hardware-wrapped. The inline +Example: in the case of fscrypt, the fscrypt master key (the key that protects a +particular set of encrypted directories) is made hardware-wrapped. The inline encryption key is used as the file contents encryption key, while the software secret (rather than the master key directly) is used to key fscrypt's KDF (HKDF-SHA512) to derive other subkeys such as filenames encryption keys. @@ -512,5 +512,6 @@ the hardware RNG and its use to generate the key, as well as the testing of the "import" mode as that should cover all parts other than the key generation. For an example of a test that verifies the ciphertext written to disk in the -"import" mode, see `Android's vts_kernel_encryption_test +"import" mode, see the fscrypt hardware-wrapped key tests in xfstests, or +`Android's vts_kernel_encryption_test `_. diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c index c5c47f875951..243953d9705d 100644 --- a/block/blk-crypto-fallback.c +++ b/block/blk-crypto-fallback.c @@ -541,7 +541,7 @@ static int blk_crypto_fallback_init(void) if (blk_crypto_fallback_inited) return 0; - get_random_bytes(blank_key, BLK_CRYPTO_MAX_STANDARD_KEY_SIZE); + get_random_bytes(blank_key, sizeof(blank_key)); err = bioset_init(&crypto_bio_split, 64, 0, 0); if (err) diff --git a/block/blk-crypto-profile.c b/block/blk-crypto-profile.c index cf4fd75e465e..6b9ac2596dba 100644 --- a/block/blk-crypto-profile.c +++ b/block/blk-crypto-profile.c @@ -467,42 +467,60 @@ bool blk_crypto_register(struct blk_crypto_profile *profile, EXPORT_SYMBOL_GPL(blk_crypto_register); /** - * blk_crypto_derive_sw_secret() - Derive software secret from hardware-wrapped - * key - * @profile: the crypto profile of the device the key will be used on - * @wrapped_key: the hardware-wrapped key - * @wrapped_key_size: size of @wrapped_key in bytes + * blk_crypto_derive_sw_secret() - Derive software secret from wrapped key + * @bdev: a block device whose hardware-wrapped keys implementation is + * compatible (blk_crypto_hw_wrapped_keys_compatible()) with all block + * devices on which the key will be used. + * @eph_key: the hardware-wrapped key in ephemerally-wrapped form + * @eph_key_size: size of @eph_key in bytes * @sw_secret: (output) the software secret * - * Given a hardware-wrapped key, ask the hardware to derive the secret which - * software can use for cryptographic tasks other than inline encryption. This - * secret is guaranteed to be cryptographically isolated from the inline - * encryption key, i.e. derived with a different KDF context. + * Given a hardware-wrapped key in ephemerally-wrapped form (the same form that + * it is used for I/O), ask the hardware to derive the secret which software can + * use for cryptographic tasks other than inline encryption. This secret is + * guaranteed to be cryptographically isolated from the inline encryption key, + * i.e. derived with a different KDF context. 
* - * Return: 0 on success, -EOPNOTSUPP if the given @profile doesn't support - * hardware-wrapped keys (or is NULL), -EBADMSG if the key isn't a valid + * Return: 0 on success, -EOPNOTSUPP if the block device doesn't support + * hardware-wrapped keys, -EBADMSG if the key isn't a valid * hardware-wrapped key, or another -errno code. */ -int blk_crypto_derive_sw_secret(struct blk_crypto_profile *profile, - const u8 *wrapped_key, - unsigned int wrapped_key_size, +int blk_crypto_derive_sw_secret(struct block_device *bdev, + const u8 *eph_key, size_t eph_key_size, u8 sw_secret[BLK_CRYPTO_SW_SECRET_SIZE]) { - int err = -EOPNOTSUPP; + struct blk_crypto_profile *profile = + bdev_get_queue(bdev)->crypto_profile; + int err; - if (profile && - (profile->key_types_supported & BLK_CRYPTO_KEY_TYPE_HW_WRAPPED) && - profile->ll_ops.derive_sw_secret) { - blk_crypto_hw_enter(profile); - err = profile->ll_ops.derive_sw_secret(profile, wrapped_key, - wrapped_key_size, - sw_secret); - blk_crypto_hw_exit(profile); - } + if (!profile) + return -EOPNOTSUPP; + if (!(profile->key_types_supported & BLK_CRYPTO_KEY_TYPE_HW_WRAPPED)) + return -EOPNOTSUPP; + if (!profile->ll_ops.derive_sw_secret) + return -EOPNOTSUPP; + blk_crypto_hw_enter(profile); + err = profile->ll_ops.derive_sw_secret(profile, eph_key, eph_key_size, + sw_secret); + blk_crypto_hw_exit(profile); return err; } EXPORT_SYMBOL_GPL(blk_crypto_derive_sw_secret); +/** + * blk_crypto_hw_wrapped_keys_compatible() - Check HW-wrapped key compatibility + * @bdev1: the first block device + * @bdev2: the second block device + * + * Return: true if HW-wrapped keys used on @bdev1 can also be used on @bdev2. + */ +bool blk_crypto_hw_wrapped_keys_compatible(struct block_device *bdev1, + struct block_device *bdev2) +{ + return bdev_get_queue(bdev1)->crypto_profile == + bdev_get_queue(bdev2)->crypto_profile; +} + /** * blk_crypto_intersect_capabilities() - restrict supported crypto capabilities * by child device diff --git a/block/blk-crypto.c b/block/blk-crypto.c index bf28d3b31cba..87b7b88345ff 100644 --- a/block/blk-crypto.c +++ b/block/blk-crypto.c @@ -72,7 +72,10 @@ static int __init bio_crypt_ctx_init(void) /* This is assumed in various places. */ BUILD_BUG_ON(BLK_ENCRYPTION_MODE_INVALID != 0); - /* Sanity check that no algorithm exceeds the defined limits. */ + /* + * Validate the crypto mode properties. This ideally would be done with + * static assertions, but boot-time checks are the next best thing. + */ for (i = 0; i < BLK_ENCRYPTION_MODE_MAX; i++) { BUG_ON(blk_crypto_modes[i].keysize > BLK_CRYPTO_MAX_STANDARD_KEY_SIZE); @@ -327,7 +330,7 @@ int __blk_crypto_rq_bio_prep(struct request *rq, struct bio *bio, * zeroizing both blk_key and raw_key when done with them. */ int blk_crypto_init_key(struct blk_crypto_key *blk_key, - const u8 *raw_key, unsigned int raw_key_size, + const u8 *raw_key, size_t raw_key_size, enum blk_crypto_key_type key_type, enum blk_crypto_mode_num crypto_mode, unsigned int dun_bytes, diff --git a/include/linux/blk-crypto-profile.h b/include/linux/blk-crypto-profile.h index 189f9b65b764..8b30d04ef008 100644 --- a/include/linux/blk-crypto-profile.h +++ b/include/linux/blk-crypto-profile.h @@ -60,7 +60,7 @@ struct blk_crypto_ll_ops { /** * @derive_sw_secret: Derive the software secret from a hardware-wrapped - * key. + * key in ephemerally-wrapped form. * * This only needs to be implemented if BLK_CRYPTO_KEY_TYPE_HW_WRAPPED * is supported. @@ -69,8 +69,7 @@ struct blk_crypto_ll_ops { * -errno code on other errors. 
*/ int (*derive_sw_secret)(struct blk_crypto_profile *profile, - const u8 *wrapped_key, - unsigned int wrapped_key_size, + const u8 *eph_key, size_t eph_key_size, u8 sw_secret[BLK_CRYPTO_SW_SECRET_SIZE]); }; @@ -100,7 +99,7 @@ struct blk_crypto_profile { unsigned int max_dun_bytes_supported; /** - * @key_types_supported: Supported types of keys -- + * @key_types_supported: A bitmask of the supported key types: * BLK_CRYPTO_KEY_TYPE_STANDARD and/or BLK_CRYPTO_KEY_TYPE_HW_WRAPPED. */ unsigned int key_types_supported; @@ -163,11 +162,6 @@ void blk_crypto_reprogram_all_keys(struct blk_crypto_profile *profile); void blk_crypto_profile_destroy(struct blk_crypto_profile *profile); -int blk_crypto_derive_sw_secret(struct blk_crypto_profile *profile, - const u8 *wrapped_key, - unsigned int wrapped_key_size, - u8 sw_secret[BLK_CRYPTO_SW_SECRET_SIZE]); - void blk_crypto_intersect_capabilities(struct blk_crypto_profile *parent, const struct blk_crypto_profile *child); diff --git a/include/linux/blk-crypto.h b/include/linux/blk-crypto.h index f2b269d695f1..44d19e451b63 100644 --- a/include/linux/blk-crypto.h +++ b/include/linux/blk-crypto.h @@ -17,7 +17,7 @@ enum blk_crypto_mode_num { }; /* - * Supported types of keys. Must be bit-flags due to their use in + * Supported types of keys. Must be bitflags due to their use in * blk_crypto_profile::key_types_supported. */ enum blk_crypto_key_type { @@ -141,7 +141,7 @@ bool bio_crypt_dun_is_contiguous(const struct bio_crypt_ctx *bc, const u64 next_dun[BLK_CRYPTO_DUN_ARRAY_SIZE]); int blk_crypto_init_key(struct blk_crypto_key *blk_key, - const u8 *raw_key, unsigned int raw_key_size, + const u8 *raw_key, size_t raw_key_size, enum blk_crypto_key_type key_type, enum blk_crypto_mode_num crypto_mode, unsigned int dun_bytes, @@ -158,6 +158,13 @@ bool blk_crypto_config_supported_natively(struct block_device *bdev, bool blk_crypto_config_supported(struct block_device *bdev, const struct blk_crypto_config *cfg); +int blk_crypto_derive_sw_secret(struct block_device *bdev, + const u8 *eph_key, size_t eph_key_size, + u8 sw_secret[BLK_CRYPTO_SW_SECRET_SIZE]); + +bool blk_crypto_hw_wrapped_keys_compatible(struct block_device *bdev1, + struct block_device *bdev2); + #else /* CONFIG_BLK_INLINE_ENCRYPTION */ static inline bool bio_has_crypt_ctx(struct bio *bio) From be794c1c9d45fe6d4b60c4e92366d9813a227efa Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 16 Dec 2022 22:52:13 +0000 Subject: [PATCH 409/457] ANDROID: update "dm: add support for passing through derive_sw_secret" Update this code to be compatible with the updated version of "block: add basic hardware-wrapped key support". 
Bug: 160883801 Change-Id: Ic6991ad163035870ace3cd468f53b21a824c5359 Signed-off-by: Eric Biggers --- drivers/md/dm-table.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 77c7bdc5409b..e91afd6932e5 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1252,9 +1252,9 @@ static int dm_keyslot_evict(struct blk_crypto_profile *profile, } struct dm_derive_sw_secret_args { - const u8 *wrapped_key; - unsigned int wrapped_key_size; - u8 *secret; + const u8 *eph_key; + size_t eph_key_size; + u8 *sw_secret; int err; }; @@ -1263,15 +1263,14 @@ static int dm_derive_sw_secret_callback(struct dm_target *ti, sector_t len, void *data) { struct dm_derive_sw_secret_args *args = data; - struct request_queue *q = bdev_get_queue(dev->bdev); if (!args->err) return 0; - args->err = blk_crypto_derive_sw_secret(q->crypto_profile, - args->wrapped_key, - args->wrapped_key_size, - args->secret); + args->err = blk_crypto_derive_sw_secret(dev->bdev, + args->eph_key, + args->eph_key_size, + args->sw_secret); /* Try another device in case this fails. */ return 0; } @@ -1282,16 +1281,15 @@ static int dm_derive_sw_secret_callback(struct dm_target *ti, * first device that supports derive_sw_secret(). */ static int dm_derive_sw_secret(struct blk_crypto_profile *profile, - const u8 *wrapped_key, - unsigned int wrapped_key_size, - u8 secret[BLK_CRYPTO_SW_SECRET_SIZE]) + const u8 *eph_key, size_t eph_key_size, + u8 sw_secret[BLK_CRYPTO_SW_SECRET_SIZE]) { struct mapped_device *md = container_of(profile, struct dm_crypto_profile, profile)->md; struct dm_derive_sw_secret_args args = { - .wrapped_key = wrapped_key, - .wrapped_key_size = wrapped_key_size, - .secret = secret, + .eph_key = eph_key, + .eph_key_size = eph_key_size, + .sw_secret = sw_secret, .err = -EOPNOTSUPP, }; struct dm_table *t; From 113b9b5027bc56151dc31687dc28c25dd8f06786 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 16 Dec 2022 22:52:13 +0000 Subject: [PATCH 410/457] ANDROID: update "fscrypt: add support for hardware-wrapped keys" to v7 The hardware-wrapped key support in this branch is based on my patch "[RFC PATCH v3 3/3] fscrypt: add support for hardware-wrapped keys" (https://lore.kernel.org/r/20211021181608.54127-4-ebiggers@kernel.org) I've since made several updates to that patch and it is now at v7. This commit brings in the updates from v3 to v7, to the extent possible while retaining compatibility with the UAPI and on-disk format used for this feature in Android. This mainly includes some improved log messages, and compatibility with the blk-crypto updates. 
Bug: 160883801 Link: https://lore.kernel.org/all/20221216203636.81491-5-ebiggers@kernel.org Change-Id: I1c43ca55ec7e95dd06f8f7944100ffd14771d3a7 Signed-off-by: Eric Biggers --- fs/crypto/fscrypt_private.h | 15 +++++++-------- fs/crypto/inline_crypt.c | 28 +++++++++++++++++----------- fs/crypto/keyring.c | 6 +++--- fs/crypto/keysetup.c | 33 +++++++++++---------------------- 4 files changed, 38 insertions(+), 44 deletions(-) diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index f252d4eda314..418a50a41747 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -363,16 +363,15 @@ fscrypt_using_inline_encryption(const struct fscrypt_info *ci) } int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key, - const u8 *raw_key, - unsigned int raw_key_size, + const u8 *raw_key, size_t raw_key_size, bool is_hw_wrapped, const struct fscrypt_info *ci); void fscrypt_destroy_inline_crypt_key(struct super_block *sb, struct fscrypt_prepared_key *prep_key); -int fscrypt_derive_sw_secret(struct super_block *sb, const u8 *wrapped_key, - unsigned int wrapped_key_size, +int fscrypt_derive_sw_secret(struct super_block *sb, + const u8 *wrapped_key, size_t wrapped_key_size, u8 sw_secret[BLK_CRYPTO_SW_SECRET_SIZE]); /* @@ -385,7 +384,7 @@ fscrypt_is_key_prepared(struct fscrypt_prepared_key *prep_key, { /* * The two smp_load_acquire()'s here pair with the smp_store_release()'s - * in fscrypt_prepare_inline_crypt_key() and __fscrypt_prepare_key(). + * in fscrypt_prepare_inline_crypt_key() and fscrypt_prepare_key(). * I.e., in some cases (namely, if this prep_key is a per-mode * encryption key) another task can publish blk_key or tfm concurrently, * executing a RELEASE barrier. We need to use smp_load_acquire() here @@ -412,7 +411,7 @@ fscrypt_using_inline_encryption(const struct fscrypt_info *ci) static inline int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key, - const u8 *raw_key, unsigned int raw_key_size, + const u8 *raw_key, size_t raw_key_size, bool is_hw_wrapped, const struct fscrypt_info *ci) { @@ -427,8 +426,8 @@ fscrypt_destroy_inline_crypt_key(struct super_block *sb, } static inline int -fscrypt_derive_sw_secret(struct super_block *sb, const u8 *wrapped_key, - unsigned int wrapped_key_size, +fscrypt_derive_sw_secret(struct super_block *sb, + const u8 *wrapped_key, size_t wrapped_key_size, u8 sw_secret[BLK_CRYPTO_SW_SECRET_SIZE]) { fscrypt_warn(NULL, "kernel doesn't support hardware-wrapped keys"); diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c index 77d0d86c578c..8d33accf0670 100644 --- a/fs/crypto/inline_crypt.c +++ b/fs/crypto/inline_crypt.c @@ -155,8 +155,7 @@ out_free_devs: } int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key, - const u8 *raw_key, - unsigned int raw_key_size, + const u8 *raw_key, size_t raw_key_size, bool is_hw_wrapped, const struct fscrypt_info *ci) { @@ -240,19 +239,22 @@ void fscrypt_destroy_inline_crypt_key(struct super_block *sb, * hardware-wrapped key. Returns -EOPNOTSUPP if hardware-wrapped keys aren't * supported on this filesystem or hardware. 
*/ -int fscrypt_derive_sw_secret(struct super_block *sb, const u8 *wrapped_key, - unsigned int wrapped_key_size, +int fscrypt_derive_sw_secret(struct super_block *sb, + const u8 *wrapped_key, size_t wrapped_key_size, u8 sw_secret[BLK_CRYPTO_SW_SECRET_SIZE]) { - struct blk_crypto_profile *profile; struct block_device **devs; unsigned int num_devs; unsigned int i; int err; - /* The filesystem must be mounted with -o inlinecrypt */ - if (!(sb->s_flags & SB_INLINECRYPT)) + /* The filesystem must be mounted with -o inlinecrypt. */ + if (!(sb->s_flags & SB_INLINECRYPT)) { + fscrypt_warn(NULL, + "%s: filesystem not mounted with inlinecrypt\n", + sb->s_id); return -EOPNOTSUPP; + } /* * Hardware-wrapped keys might be specific to a particular storage @@ -263,17 +265,21 @@ int fscrypt_derive_sw_secret(struct super_block *sb, const u8 *wrapped_key, devs = fscrypt_get_devices(sb, &num_devs); if (IS_ERR(devs)) return PTR_ERR(devs); - profile = bdev_get_queue(devs[0])->crypto_profile; for (i = 1; i < num_devs; i++) { - if (bdev_get_queue(devs[i])->crypto_profile != profile) { + if (!blk_crypto_hw_wrapped_keys_compatible(devs[0], devs[i])) { fscrypt_warn(NULL, - "unsupported multi-device configuration for hardware-wrapped keys"); + "%s: unsupported multi-device configuration for hardware-wrapped keys", + sb->s_id); kfree(devs); return -EOPNOTSUPP; } } - err = blk_crypto_derive_sw_secret(profile, wrapped_key, + err = blk_crypto_derive_sw_secret(devs[0], wrapped_key, wrapped_key_size, sw_secret); + if (err == -EOPNOTSUPP) + fscrypt_warn(NULL, + "%s: block device doesn't support hardware-wrapped keys\n", + sb->s_id); kfree(devs); return err; } diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c index aeb28d091a1e..672c3fbda0f3 100644 --- a/fs/crypto/keyring.c +++ b/fs/crypto/keyring.c @@ -782,11 +782,11 @@ fscrypt_get_test_dummy_secret(struct fscrypt_master_key_secret *secret) { static u8 test_key[FSCRYPT_MAX_STANDARD_KEY_SIZE]; - get_random_once(test_key, FSCRYPT_MAX_STANDARD_KEY_SIZE); + get_random_once(test_key, sizeof(test_key)); memset(secret, 0, sizeof(*secret)); - secret->size = FSCRYPT_MAX_STANDARD_KEY_SIZE; - memcpy(secret->raw, test_key, FSCRYPT_MAX_STANDARD_KEY_SIZE); + secret->size = sizeof(test_key); + memcpy(secret->raw, test_key, sizeof(test_key)); } int fscrypt_get_test_dummy_key_identifier( diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c index 8b0237241c2f..2c375f09af1e 100644 --- a/fs/crypto/keysetup.c +++ b/fs/crypto/keysetup.c @@ -130,23 +130,17 @@ err_free_tfm: * Prepare the crypto transform object or blk-crypto key in @prep_key, given the * raw key, encryption mode (@ci->ci_mode), flag indicating which encryption * implementation (fs-layer or blk-crypto) will be used (@ci->ci_inlinecrypt), - * and IV generation method (@ci->ci_policy.flags). The raw key can be either a - * standard key or a hardware-wrapped key, as indicated by @is_hw_wrapped; it - * can only be a hardware-wrapped key if blk-crypto will be used. + * and IV generation method (@ci->ci_policy.flags). 
*/ -static int __fscrypt_prepare_key(struct fscrypt_prepared_key *prep_key, - const u8 *raw_key, unsigned int raw_key_size, - bool is_hw_wrapped, - const struct fscrypt_info *ci) +int fscrypt_prepare_key(struct fscrypt_prepared_key *prep_key, + const u8 *raw_key, const struct fscrypt_info *ci) { struct crypto_skcipher *tfm; if (fscrypt_using_inline_encryption(ci)) - return fscrypt_prepare_inline_crypt_key(prep_key, - raw_key, raw_key_size, is_hw_wrapped, ci); - - if (WARN_ON(is_hw_wrapped || raw_key_size != ci->ci_mode->keysize)) - return -EINVAL; + return fscrypt_prepare_inline_crypt_key(prep_key, raw_key, + ci->ci_mode->keysize, + false, ci); tfm = fscrypt_allocate_skcipher(ci->ci_mode, raw_key, ci->ci_inode); if (IS_ERR(tfm)) @@ -161,13 +155,6 @@ static int __fscrypt_prepare_key(struct fscrypt_prepared_key *prep_key, return 0; } -int fscrypt_prepare_key(struct fscrypt_prepared_key *prep_key, - const u8 *raw_key, const struct fscrypt_info *ci) -{ - return __fscrypt_prepare_key(prep_key, raw_key, ci->ci_mode->keysize, - false, ci); -} - /* Destroy a crypto transform object and/or blk-crypto key. */ void fscrypt_destroy_prepared_key(struct super_block *sb, struct fscrypt_prepared_key *prep_key) @@ -208,7 +195,7 @@ static int setup_per_mode_enc_key(struct fscrypt_info *ci, if (!fscrypt_using_inline_encryption(ci)) { if (sb->s_flags & SB_INLINECRYPT) fscrypt_warn(ci->ci_inode, - "Hardware-wrapped key required, but no suitable inline encryption hardware is available"); + "Hardware-wrapped key required, but no suitable inline encryption capabilities are available"); else fscrypt_warn(ci->ci_inode, "Hardware-wrapped keys require inline encryption (-o inlinecrypt)"); @@ -229,8 +216,10 @@ static int setup_per_mode_enc_key(struct fscrypt_info *ci, goto done_unlock; if (use_hw_wrapped_key) { - err = __fscrypt_prepare_key(prep_key, mk->mk_secret.raw, - mk->mk_secret.size, true, ci); + err = fscrypt_prepare_inline_crypt_key(prep_key, + mk->mk_secret.raw, + mk->mk_secret.size, true, + ci); if (err) goto out_unlock; goto done_unlock; From c0b208dbee699ab3fdf9625a3890974fcf911621 Mon Sep 17 00:00:00 2001 From: Chun-Hung Wu Date: Thu, 23 Dec 2021 18:59:35 +0800 Subject: [PATCH 411/457] ANDROID: GKI: Export clocksource_mmio_init Export clocksource_mmio_init and clocksource_mmio_readl_up to support building clocksource driver as module, such as timer-mediatek.c. 
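To illustrate what the export enables, a loadable clocksource driver can now register a simple MMIO up-counter along these lines. This is only a sketch: the device name, clock rate and rating are made up, and the counter is assumed to have been ioremap()ed during probe.

#include <linux/clocksource.h>
#include <linux/io.h>
#include <linux/module.h>

static void __iomem *example_timer_base;	/* assumed mapped during probe */

static int __init example_clksrc_init(void)
{
	/* 13 MHz free-running 32-bit up-counter, read via the exported helper. */
	return clocksource_mmio_init(example_timer_base, "example-timer",
				     13000000, 300, 32,
				     clocksource_mmio_readl_up);
}
module_init(example_clksrc_init);
MODULE_LICENSE("GPL");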
Bug: 161675989 Signed-off-by: Chun-Hung Wu Change-Id: I7df2f2ac62f2322c1d32686907cb0bd87639f6fc (cherry picked from commit bd213d9a67050fdd380a1c1d94a5388f8041f28a) --- drivers/clocksource/mmio.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/clocksource/mmio.c b/drivers/clocksource/mmio.c index 9de751531831..826dcc42629c 100644 --- a/drivers/clocksource/mmio.c +++ b/drivers/clocksource/mmio.c @@ -21,6 +21,7 @@ u64 clocksource_mmio_readl_up(struct clocksource *c) { return (u64)readl_relaxed(to_mmio_clksrc(c)->reg); } +EXPORT_SYMBOL_GPL(clocksource_mmio_readl_up); u64 clocksource_mmio_readl_down(struct clocksource *c) { @@ -46,7 +47,7 @@ u64 clocksource_mmio_readw_down(struct clocksource *c) * @bits: Number of valid bits * @read: One of clocksource_mmio_read*() above */ -int __init clocksource_mmio_init(void __iomem *base, const char *name, +int clocksource_mmio_init(void __iomem *base, const char *name, unsigned long hz, int rating, unsigned bits, u64 (*read)(struct clocksource *)) { @@ -68,3 +69,4 @@ int __init clocksource_mmio_init(void __iomem *base, const char *name, return clocksource_register_hz(&cs->clksrc, hz); } +EXPORT_SYMBOL_GPL(clocksource_mmio_init); From 0baa11384bf87212556a5cdce26a471ccadd11c7 Mon Sep 17 00:00:00 2001 From: Jing-Ting Wu Date: Thu, 5 Jan 2023 18:09:06 +0800 Subject: [PATCH 412/457] FROMLIST: sched/pelt: Introduce PELT multiplier The new sysctl sched_pelt_multiplier allows a user to set a clock multiplier to x2 or x4 (x1 being the default). This clock multiplier artificially speeds up PELT ramp up/down similarly to use a faster half-life than the default 32ms. - x1: 32ms half-life - x2: 16ms half-life - x4: 8ms half-life Internally, a new clock is created: rq->clock_task_mult. It sits in the clock hierarchy between rq->clock_task and rq->clock_pelt. 
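Since the knob is a plain sysctl, nothing beyond a write to procfs is needed on the consumer side; only the values 1, 2 and 4 are accepted, anything else is rejected with -EINVAL. A minimal userspace sketch (the path follows from the sysctl registration below):

#include <fcntl.h>
#include <unistd.h>

static int request_16ms_pelt_half_life(void)
{
	int fd = open("/proc/sys/kernel/sched_pelt_multiplier", O_WRONLY);
	ssize_t ret;

	if (fd < 0)
		return -1;
	/* "1" = 32 ms (default), "2" = 16 ms, "4" = 8 ms half-life. */
	ret = write(fd, "2", 1);
	close(fd);
	return ret == 1 ? 0 : -1;
}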
Signed-off-by: Vincent Donnefort Signed-off-by: Dietmar Eggemann Bug: 263742061 Link: https://lore.kernel.org/lkml/20220829055450.1703092-2-dietmar.eggemann@arm.com Change-Id: Id379ff3cf07733ae63a854bc1e5af64426576788 Signed-off-by: Jing-Ting Wu --- kernel/sched/core.c | 2 +- kernel/sched/pelt.c | 60 ++++++++++++++++++++++++++++++++++++++++++++ kernel/sched/pelt.h | 42 ++++++++++++++++++++++++++++--- kernel/sched/sched.h | 1 + 4 files changed, 100 insertions(+), 5 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index bbaea2749434..911e29318645 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -733,7 +733,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY)) update_irq_load_avg(rq, irq_delta + steal); #endif - update_rq_clock_pelt(rq, delta); + update_rq_clock_task_mult(rq, delta); } void update_rq_clock(struct rq *rq) diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index 7a64adec8fb1..3e94c96f0c81 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -468,3 +468,63 @@ int update_irq_load_avg(struct rq *rq, u64 running) return ret; } #endif + +__read_mostly unsigned int sched_pelt_lshift; + +#ifdef CONFIG_SYSCTL +static unsigned int sysctl_sched_pelt_multiplier = 1; + +int sched_pelt_multiplier(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) +{ + static DEFINE_MUTEX(mutex); + unsigned int old; + int ret; + + mutex_lock(&mutex); + old = sysctl_sched_pelt_multiplier; + ret = proc_dointvec(table, write, buffer, lenp, ppos); + if (ret) + goto undo; + if (!write) + goto done; + + switch (sysctl_sched_pelt_multiplier) { + case 1: + fallthrough; + case 2: + fallthrough; + case 4: + WRITE_ONCE(sched_pelt_lshift, + sysctl_sched_pelt_multiplier >> 1); + goto done; + default: + ret = -EINVAL; + } + +undo: + sysctl_sched_pelt_multiplier = old; +done: + mutex_unlock(&mutex); + + return ret; +} + +static struct ctl_table sched_pelt_sysctls[] = { + { + .procname = "sched_pelt_multiplier", + .data = &sysctl_sched_pelt_multiplier, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_pelt_multiplier, + }, + {} +}; + +static int __init sched_pelt_sysctl_init(void) +{ + register_sysctl_init("kernel", sched_pelt_sysctls); + return 0; +} +late_initcall(sched_pelt_sysctl_init); +#endif diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h index 3a0e0dc28721..9b35b5072bae 100644 --- a/kernel/sched/pelt.h +++ b/kernel/sched/pelt.h @@ -61,6 +61,14 @@ static inline void cfs_se_util_change(struct sched_avg *avg) WRITE_ONCE(avg->util_est.enqueued, enqueued); } +static inline u64 rq_clock_task_mult(struct rq *rq) +{ + lockdep_assert_rq_held(rq); + assert_clock_updated(rq); + + return rq->clock_task_mult; +} + static inline u64 rq_clock_pelt(struct rq *rq) { lockdep_assert_rq_held(rq); @@ -72,7 +80,7 @@ static inline u64 rq_clock_pelt(struct rq *rq) /* The rq is idle, we can sync to clock_task */ static inline void _update_idle_rq_clock_pelt(struct rq *rq) { - rq->clock_pelt = rq_clock_task(rq); + rq->clock_pelt = rq_clock_task_mult(rq); u64_u32_store(rq->clock_idle, rq_clock(rq)); /* Paired with smp_rmb in migrate_se_pelt_lag() */ @@ -121,6 +129,27 @@ static inline void update_rq_clock_pelt(struct rq *rq, s64 delta) rq->clock_pelt += delta; } +extern unsigned int sched_pelt_lshift; + +/* + * absolute time |1 |2 |3 |4 |5 |6 | + * @ mult = 1 --------****************--------****************- + * @ mult = 2 --------********----------------********--------- + 
* @ mult = 4 --------****--------------------****------------- + * clock task mult + * @ mult = 2 | | |2 |3 | | | | |5 |6 | | | + * @ mult = 4 | | | | |2|3| | | | | | | | | | |5|6| | | | | | | + * + */ +static inline void update_rq_clock_task_mult(struct rq *rq, s64 delta) +{ + delta <<= READ_ONCE(sched_pelt_lshift); + + rq->clock_task_mult += delta; + + update_rq_clock_pelt(rq, delta); +} + /* * When rq becomes idle, we have to check if it has lost idle time * because it was fully busy. A rq is fully used when the /Sum util_sum @@ -147,7 +176,7 @@ static inline void update_idle_rq_clock_pelt(struct rq *rq) * rq's clock_task. */ if (util_sum >= divider) - rq->lost_idle_time += rq_clock_task(rq) - rq->clock_pelt; + rq->lost_idle_time += rq_clock_task_mult(rq) - rq->clock_pelt; _update_idle_rq_clock_pelt(rq); } @@ -218,13 +247,18 @@ update_irq_load_avg(struct rq *rq, u64 running) return 0; } -static inline u64 rq_clock_pelt(struct rq *rq) +static inline u64 rq_clock_task_mult(struct rq *rq) { return rq_clock_task(rq); } +static inline u64 rq_clock_pelt(struct rq *rq) +{ + return rq_clock_task_mult(rq); +} + static inline void -update_rq_clock_pelt(struct rq *rq, s64 delta) { } +update_rq_clock_task_mult(struct rq *rq, s64 delta) { } static inline void update_idle_rq_clock_pelt(struct rq *rq) { } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 895ee9942331..78e9ef569ac7 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1022,6 +1022,7 @@ struct rq { u64 clock; /* Ensure that all clocks are in the same cache line */ u64 clock_task ____cacheline_aligned; + u64 clock_task_mult; u64 clock_pelt; unsigned long lost_idle_time; u64 clock_pelt_idle; From 1072495f4a7cc8eaf1e699d4680879b38ab0a48f Mon Sep 17 00:00:00 2001 From: JianMin Liu Date: Tue, 21 Jul 2020 13:36:22 +0800 Subject: [PATCH 413/457] ANDROID: rwsem: Add vendor hook to the rw-semaphore Add the hook to apply vendor's performance tune for owner of rwsem. 
Add the hook for the waiter list of rwsem to allow vendor perform waiting queue enhancement ANDROID_VENDOR_DATA added to rw_semaphore Bug: 264007752 Signed-off-by: JianMin Liu Signed-off-by: Guanwun Chen Change-Id: I007a5e26f3db2adaeaf4e5ccea414ce7abfa83b8 --- drivers/android/vendor_hooks.c | 5 +++++ include/linux/rwsem.h | 2 ++ include/trace/hooks/rwsem.h | 30 ++++++++++++++++++++++++++++++ kernel/locking/rwsem.c | 20 ++++++++++++++++++-- 4 files changed, 55 insertions(+), 2 deletions(-) create mode 100644 include/trace/hooks/rwsem.h diff --git a/drivers/android/vendor_hooks.c b/drivers/android/vendor_hooks.c index 2b226e7d3d8c..fd232db7c293 100644 --- a/drivers/android/vendor_hooks.c +++ b/drivers/android/vendor_hooks.c @@ -36,6 +36,7 @@ #include #include #include +#include /* * Export tracepoints that act as a bare tracehook (ie: have no trace event @@ -108,3 +109,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_check_file_open); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_check_bpf_syscall); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rproc_recovery); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rproc_recovery_set); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rwsem_init); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rwsem_wake); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rwsem_write_finished); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_alter_rwsem_list_add); diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index efa5c324369a..34bd4eebcfaf 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -31,6 +31,7 @@ #ifdef CONFIG_RWSEM_SPIN_ON_OWNER #include #endif +#include /* * For an uncontended rwsem, count and owner are the only fields a task @@ -63,6 +64,7 @@ struct rw_semaphore { #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; #endif + ANDROID_VENDOR_DATA(1); }; /* In all implementations count != 0 means locked */ diff --git a/include/trace/hooks/rwsem.h b/include/trace/hooks/rwsem.h new file mode 100644 index 000000000000..71215dfd5eff --- /dev/null +++ b/include/trace/hooks/rwsem.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM rwsem +#define TRACE_INCLUDE_PATH trace/hooks +#if !defined(_TRACE_HOOK_RWSEM_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HOOK_RWSEM_H +#include +/* + * Following tracepoints are not exported in tracefs and provide a + * mechanism for vendor modules to hook and extend functionality + */ +struct rw_semaphore; +struct rwsem_waiter; +DECLARE_HOOK(android_vh_rwsem_init, + TP_PROTO(struct rw_semaphore *sem), + TP_ARGS(sem)); +DECLARE_HOOK(android_vh_rwsem_wake, + TP_PROTO(struct rw_semaphore *sem), + TP_ARGS(sem)); +DECLARE_HOOK(android_vh_rwsem_write_finished, + TP_PROTO(struct rw_semaphore *sem), + TP_ARGS(sem)); +DECLARE_HOOK(android_vh_alter_rwsem_list_add, + TP_PROTO(struct rwsem_waiter *waiter, + struct rw_semaphore *sem, + bool *already_on_list), + TP_ARGS(waiter, sem, already_on_list)); +#endif /* _TRACE_HOOK_RWSEM_H */ +/* This part must be outside protection */ +#include diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 44873594de03..7e2a83a5d025 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -31,6 +31,7 @@ #ifndef CONFIG_PREEMPT_RT #include "lock_events.h" +#include /* * The least significant 2 bits of the owner value has the following @@ -330,6 +331,7 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name, #ifdef CONFIG_RWSEM_SPIN_ON_OWNER osq_lock_init(&sem->osq); #endif + trace_android_vh_rwsem_init(sem); } EXPORT_SYMBOL(__init_rwsem); @@ 
-1008,6 +1010,7 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, unsigned int stat long rcnt = (count >> RWSEM_READER_SHIFT); struct rwsem_waiter waiter; DEFINE_WAKE_Q(wake_q); + bool already_on_list = false; /* * To prevent a constant stream of readers from starving a sleeping @@ -1064,12 +1067,17 @@ queue: } adjustment += RWSEM_FLAG_WAITERS; } - rwsem_add_waiter(sem, &waiter); + trace_android_vh_alter_rwsem_list_add( + &waiter, + sem, &already_on_list); + if (!already_on_list) + rwsem_add_waiter(sem, &waiter); /* we're now waiting on the lock, but no longer actively locking */ count = atomic_long_add_return(adjustment, &sem->count); rwsem_cond_wake_waiter(sem, count, &wake_q); + trace_android_vh_rwsem_wake(sem); raw_spin_unlock_irq(&sem->wait_lock); if (!wake_q_empty(&wake_q)) @@ -1117,6 +1125,7 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) { struct rwsem_waiter waiter; DEFINE_WAKE_Q(wake_q); + bool already_on_list = false; /* do optimistic spinning and steal lock if possible */ if (rwsem_can_spin_on_owner(sem) && rwsem_optimistic_spin(sem)) { @@ -1134,7 +1143,11 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) waiter.handoff_set = false; raw_spin_lock_irq(&sem->wait_lock); - rwsem_add_waiter(sem, &waiter); + trace_android_vh_alter_rwsem_list_add( + &waiter, + sem, &already_on_list); + if (!already_on_list) + rwsem_add_waiter(sem, &waiter); /* we're now waiting on the lock */ if (rwsem_first_waiter(sem) != &waiter) { @@ -1153,6 +1166,7 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) atomic_long_or(RWSEM_FLAG_WAITERS, &sem->count); } + trace_android_vh_rwsem_wake(sem); /* wait until we successfully acquire the lock */ set_current_state(state); trace_contention_begin(sem, LCB_F_WRITE); @@ -1612,6 +1626,7 @@ EXPORT_SYMBOL(up_read); void up_write(struct rw_semaphore *sem) { rwsem_release(&sem->dep_map, _RET_IP_); + trace_android_vh_rwsem_write_finished(sem); __up_write(sem); } EXPORT_SYMBOL(up_write); @@ -1622,6 +1637,7 @@ EXPORT_SYMBOL(up_write); void downgrade_write(struct rw_semaphore *sem) { lock_downgrade(&sem->dep_map, _RET_IP_); + trace_android_vh_rwsem_write_finished(sem); __downgrade_write(sem); } EXPORT_SYMBOL(downgrade_write); From 634004c4c8a476c46bda197b0067e9b9a5fd7188 Mon Sep 17 00:00:00 2001 From: JianMin Liu Date: Tue, 11 Aug 2020 15:24:06 +0800 Subject: [PATCH 414/457] ANDROID: futex: Add vendor hook for wait queue Add the hook for the waiter list of futex to allow vendor perform wait queue enhancement Bug: 264007752 Signed-off-by: JianMin Liu Signed-off-by: Guanwun Chen Change-Id: I68218b89c35b23aa5529099bb0bbbd031bdeafef --- drivers/android/vendor_hooks.c | 2 ++ include/trace/hooks/futex.h | 26 ++++++++++++++++++++++++++ kernel/futex/core.c | 6 +++++- 3 files changed, 33 insertions(+), 1 deletion(-) create mode 100644 include/trace/hooks/futex.h diff --git a/drivers/android/vendor_hooks.c b/drivers/android/vendor_hooks.c index fd232db7c293..d4cb5666ee69 100644 --- a/drivers/android/vendor_hooks.c +++ b/drivers/android/vendor_hooks.c @@ -37,6 +37,7 @@ #include #include #include +#include /* * Export tracepoints that act as a bare tracehook (ie: have no trace event @@ -113,3 +114,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rwsem_init); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rwsem_wake); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rwsem_write_finished); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_alter_rwsem_list_add); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_alter_futex_plist_add); diff --git 
a/include/trace/hooks/futex.h b/include/trace/hooks/futex.h new file mode 100644 index 000000000000..cbf6abb66a57 --- /dev/null +++ b/include/trace/hooks/futex.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM futex +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH trace/hooks +#if !defined(_TRACE_HOOK_FUTEX_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HOOK_FUTEX_H +#include +#include +#include +/* + * Following tracepoints are not exported in tracefs and provide a + * mechanism for vendor modules to hook and extend functionality + */ +#if defined(CONFIG_TRACEPOINTS) && defined(CONFIG_ANDROID_VENDOR_HOOKS) +DECLARE_HOOK(android_vh_alter_futex_plist_add, + TP_PROTO(struct plist_node *node, + struct plist_head *head, + bool *already_on_hb), + TP_ARGS(node, head, already_on_hb)); +#else +#define trace_android_vh_alter_futex_plist_add(node, head, already_on_hb) +#endif +#endif /* _TRACE_HOOK_FUTEX_H */ +/* This part must be outside protection */ +#include diff --git a/kernel/futex/core.c b/kernel/futex/core.c index b22ef1efe751..0457455a31d7 100644 --- a/kernel/futex/core.c +++ b/kernel/futex/core.c @@ -40,6 +40,7 @@ #include "futex.h" #include "../locking/rtmutex_common.h" +#include /* * The base of the bucket array and its size are always used together @@ -543,6 +544,7 @@ void futex_q_unlock(struct futex_hash_bucket *hb) void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb) { int prio; + bool already_on_hb = false; /* * The priority used to register this element is @@ -555,7 +557,9 @@ void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb) prio = min(current->normal_prio, MAX_RT_PRIO); plist_node_init(&q->list, prio); - plist_add(&q->list, &hb->chain); + trace_android_vh_alter_futex_plist_add(&q->list, &hb->chain, &already_on_hb); + if (!already_on_hb) + plist_add(&q->list, &hb->chain); q->task = current; } From e97fed285679eee64d142b9e6e286896021ac141 Mon Sep 17 00:00:00 2001 From: Namkyu Kim Date: Thu, 4 Mar 2021 09:31:56 +0900 Subject: [PATCH 415/457] ANDROID: Add a vendor hook that allow a module to modify the wake flag android_vh_do_wake_up_sync: To modify the mode value of __wake_up_sync_key android_vh_set_wake_flags: To modify the wake flag from a module Bug: 181743516 Bug: 263838089 Signed-off-by: Namkyu Kim Change-Id: I972e2469c3f139373d21f1e8c85974763388a693 (cherry picked from commit 97368fc2dcc29777e8d3d637d0afdef90e611763) (cherry picked from commit 0d0f0c5020bc425c9a51c8d17b16ca831c2598fb) [Dongseok Yi: Moved into kernel/sched/vendor_hooks.c per commit 5f657b04f4f2 ("ANDROID: subsystem-specific vendor_hooks.c for sched")] Signed-off-by: Dongseok Yi --- include/trace/hooks/sched.h | 8 ++++++++ kernel/sched/vendor_hooks.c | 2 ++ kernel/sched/wait.c | 6 +++++- net/core/sock.c | 13 ++++++++++++- 4 files changed, 27 insertions(+), 2 deletions(-) diff --git a/include/trace/hooks/sched.h b/include/trace/hooks/sched.h index 0773eae433a6..c94e0484cb39 100644 --- a/include/trace/hooks/sched.h +++ b/include/trace/hooks/sched.h @@ -317,6 +317,14 @@ DECLARE_RESTRICTED_HOOK(android_rvh_update_thermal_stats, TP_PROTO(int cpu), TP_ARGS(cpu), 1); +DECLARE_HOOK(android_vh_do_wake_up_sync, + TP_PROTO(struct wait_queue_head *wq_head, int *done), + TP_ARGS(wq_head, done)); + +DECLARE_HOOK(android_vh_set_wake_flags, + TP_PROTO(int *wake_flags, unsigned int *mode), + TP_ARGS(wake_flags, mode)); + /* macro versions of hooks are no longer required */ #endif /* _TRACE_HOOK_SCHED_H */ diff --git 
a/kernel/sched/vendor_hooks.c b/kernel/sched/vendor_hooks.c index a5a581cf888c..26dc388f983d 100644 --- a/kernel/sched/vendor_hooks.c +++ b/kernel/sched/vendor_hooks.c @@ -82,3 +82,5 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_schedule_bug); EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_sched_exec); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_update_topology_flags_workfn); EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_update_thermal_stats); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_do_wake_up_sync); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_set_wake_flags); diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 9860bb9a847c..ba93b6cba096 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -4,6 +4,7 @@ * * (C) 2004 Nadia Yvette Chambers, Oracle */ +#include void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, struct lock_class_key *key) { @@ -198,10 +199,13 @@ EXPORT_SYMBOL_GPL(__wake_up_locked_key_bookmark); void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void *key) { + int wake_flags = WF_SYNC; + if (unlikely(!wq_head)) return; - __wake_up_common_lock(wq_head, mode, 1, WF_SYNC, key); + trace_android_vh_set_wake_flags(&wake_flags, &mode); + __wake_up_common_lock(wq_head, mode, 1, wake_flags, key); } EXPORT_SYMBOL_GPL(__wake_up_sync_key); diff --git a/net/core/sock.c b/net/core/sock.c index a3ba0358c77c..587e8dbe9cac 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -135,6 +135,7 @@ #include #include +#include #include #include @@ -3271,9 +3272,19 @@ void sock_def_readable(struct sock *sk) rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); - if (skwq_has_sleeper(wq)) + + if (skwq_has_sleeper(wq)) { + int done = 0; + + trace_android_vh_do_wake_up_sync(&wq->wait, &done); + if (done) + goto out; + wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI | EPOLLRDNORM | EPOLLRDBAND); + } + +out: sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); rcu_read_unlock(); } From ffb7f4adfcc79afbecba8633160219d23a610647 Mon Sep 17 00:00:00 2001 From: Namkyu Kim Date: Wed, 2 Jun 2021 14:56:16 +0900 Subject: [PATCH 416/457] ANDROID: scheduler: add vendor-specific wake flag It comes with the commit efbb82d3b0db ("ANDROID: Add a vendor hook that allow a module to modify the wake flag"). A vendor who want to hook sock_def_readable can set this vendor-specific wake flag. sock_def_readable -> (vendor hook will call __wake_up_sync_key with a custom mode) -> __wake_up_sync_key -> (vendor hook changes wake_flags per the custom mode) -> __wake_up_common -> default_wake_function It is an Android-specific since it works with a vendor hook. 
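As a rough sketch of the intended use (not part of this patch): a vendor module attaches a probe with the register_trace_android_vh_*() helpers generated by DECLARE_HOOK(). The probe name and policy below are hypothetical, and the flag constant mirrors WF_ANDROID_VENDOR, which lives in the scheduler's private sched.h and is therefore redefined here.

#include <linux/module.h>
#include <trace/hooks/sched.h>

#define EXAMPLE_WF_ANDROID_VENDOR	0x1000	/* same value as WF_ANDROID_VENDOR */

static void example_set_wake_flags(void *unused, int *wake_flags,
				   unsigned int *mode)
{
	/* Hypothetical policy: always take the vendor wakeup path. */
	*wake_flags |= EXAMPLE_WF_ANDROID_VENDOR;
}

static int __init example_init(void)
{
	return register_trace_android_vh_set_wake_flags(example_set_wake_flags,
							NULL);
}
module_init(example_init);
MODULE_LICENSE("GPL");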
Bug: 189858948 Bug: 226256614 Bug: 263838089 Signed-off-by: Namkyu Kim Change-Id: Idc23c1c47f7d83b298c0b2560859f1ce2761fd85 (cherry picked from commit 4c1097df5d9bca84a1922adc752794db9b615a3e) (cherry picked from commit 87b89ce83b2c0228b5a8300dbf5a63027eabc036) Signed-off-by: Dongseok Yi --- kernel/sched/core.c | 2 +- kernel/sched/sched.h | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 911e29318645..e8ea66a42fef 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6949,7 +6949,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void) int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags, void *key) { - WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~WF_SYNC); + WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~(WF_SYNC | WF_ANDROID_VENDOR)); return try_to_wake_up(curr->private, mode, wake_flags); } EXPORT_SYMBOL(default_wake_function); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 78e9ef569ac7..dd57df69a497 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2100,6 +2100,8 @@ static inline int task_on_rq_migrating(struct task_struct *p) #define WF_SYNC 0x10 /* Waker goes to sleep after wakeup */ #define WF_MIGRATED 0x20 /* Internal use, task got migrated */ +#define WF_ANDROID_VENDOR 0x1000 /* Vendor specific for Android */ + #ifdef CONFIG_SMP static_assert(WF_EXEC == SD_BALANCE_EXEC); static_assert(WF_FORK == SD_BALANCE_FORK); From 07274b5ec46d69eefc95ce561903206e9f6a342a Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Fri, 23 Dec 2022 10:18:06 +0000 Subject: [PATCH 417/457] ANDROID: KVM: arm64: Expose linear map APIs to pKVM modules pKVM modules may need to temporarily map large-ish physically contiguous regions of memory when bootstrapping themselves. In order to support this use-case, introduce two new APIs in the module_ops struct allowing to map and unmap pages in pKVM's linear map range. Since pKVM's page ownership infrastructure relies on linear map PTEs, this needs to be done with special care. To avoid any problem, let's count the number of pages mapped by modules and unsure they have been unmapped before reaching the point of deprivilege. 
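A minimal sketch of the intended usage from a module's EL2 init path follows. The entry-point shape, physical address and size are assumptions for illustration only; both address and size must be page aligned, and every early mapping has to be torn down again before deprivilege.

static int example_hyp_init(const struct pkvm_module_ops *ops)
{
	size_t size = SZ_2M;
	void *va;

	/* Hypothetical physically contiguous blob handed over by EL1. */
	va = ops->linear_map_early(0x880000000ULL, size, KVM_PGTABLE_PROT_R);
	if (!va)
		return -EINVAL;

	/* ... bootstrap from the mapped data ... */

	/* Leaving this mapped would trip the WARN_ON() when registration closes. */
	ops->linear_unmap_early(va, size);
	return 0;
}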
Bug: 244373730 Change-Id: I4aecb93f5c9ba08d9f830d1f0976704688b98509 Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_pkvm_module.h | 2 ++ arch/arm64/kvm/hyp/nvhe/modules.c | 42 ++++++++++++++++++++++++ 2 files changed, 44 insertions(+) diff --git a/arch/arm64/include/asm/kvm_pkvm_module.h b/arch/arm64/include/asm/kvm_pkvm_module.h index 2fa7aa81a0c4..b77d5e8382e4 100644 --- a/arch/arm64/include/asm/kvm_pkvm_module.h +++ b/arch/arm64/include/asm/kvm_pkvm_module.h @@ -23,6 +23,8 @@ struct pkvm_module_ops { void (*putx64)(u64 num); void *(*fixmap_map)(phys_addr_t phys); void (*fixmap_unmap)(void); + void *(*linear_map_early)(phys_addr_t phys, size_t size, enum kvm_pgtable_prot prot); + void (*linear_unmap_early)(void *addr, size_t size); void (*flush_dcache_to_poc)(void *addr, size_t size); int (*register_host_perm_fault_handler)(int (*cb)(struct kvm_cpu_context *ctxt, u64 esr, u64 addr)); int (*protect_host_page)(u64 pfn, enum kvm_pgtable_prot prot); diff --git a/arch/arm64/kvm/hyp/nvhe/modules.c b/arch/arm64/kvm/hyp/nvhe/modules.c index 3b033b7d6c19..5b657f80d134 100644 --- a/arch/arm64/kvm/hyp/nvhe/modules.c +++ b/arch/arm64/kvm/hyp/nvhe/modules.c @@ -36,11 +36,51 @@ bool pkvm_modules_enabled(void) return __pkvm_modules_enabled; } +static u64 early_lm_pages; +static void *__pkvm_linear_map_early(phys_addr_t phys, size_t size, enum kvm_pgtable_prot prot) +{ + void *addr = NULL; + int ret; + + if (!PAGE_ALIGNED(phys) || !PAGE_ALIGNED(size)) + return NULL; + + pkvm_modules_lock(); + if (!__pkvm_modules_enabled) + goto out; + + addr = __hyp_va(phys); + ret = pkvm_create_mappings(addr, addr + size, prot); + if (ret) + addr = NULL; + else + early_lm_pages += size >> PAGE_SHIFT; +out: + pkvm_modules_unlock(); + + return addr; +} + +static void __pkvm_linear_unmap_early(void *addr, size_t size) +{ + pkvm_modules_lock(); + pkvm_remove_mappings(addr, addr + size); + early_lm_pages -= size >> PAGE_SHIFT; + pkvm_modules_unlock(); +} + int __pkvm_close_module_registration(void) { int ret; pkvm_modules_lock(); + /* + * Page ownership tracking might go out of sync if there are stale + * entries in pKVM's linear map range, so they must really be gone by + * now. + */ + WARN_ON(early_lm_pages); + ret = __pkvm_modules_enabled ? 0 : -EACCES; if (!ret) { void *addr = hyp_fixmap_map(__hyp_pa(&__pkvm_modules_enabled)); @@ -60,6 +100,8 @@ const struct pkvm_module_ops module_ops = { .putx64 = hyp_putx64, .fixmap_map = hyp_fixmap_map, .fixmap_unmap = hyp_fixmap_unmap, + .linear_map_early = __pkvm_linear_map_early, + .linear_unmap_early = __pkvm_linear_unmap_early, .flush_dcache_to_poc = __kvm_flush_dcache_to_poc, .register_host_perm_fault_handler = hyp_register_host_perm_fault_handler, .protect_host_page = hyp_protect_host_page, From 7d969932eec583efdec695b4ee5d57ba6a88a186 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Fri, 23 Dec 2022 11:13:28 +0000 Subject: [PATCH 418/457] ANDROID: KVM: arm64: Introduce a hyp panic module notifier pKVM modules may need to be notified in case of unexpected same-level EL2 exceptions, which result in a hyp panic. To do so, introduce a new notifier on the hyp_panic path. 
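For illustration only, a module would hook the panic path roughly as follows; the handler and entry-point names are hypothetical, and since only a single notifier can be installed, registration fails with -EBUSY if another module got there first.

static void example_hyp_panic_cb(struct kvm_cpu_context *host_ctxt)
{
	/*
	 * Runs at EL2 on the hyp_panic() path, before the host context is
	 * restored; keep it minimal, e.g. record module state for post-mortem
	 * debugging.
	 */
}

static int example_hyp_init(const struct pkvm_module_ops *ops)
{
	return ops->register_hyp_panic_notifier(example_hyp_panic_cb);
}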
Bug: 244373730 Change-Id: I144609a933d648ddf2aebcd950e64d6035bf8be3 Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_pkvm_module.h | 1 + arch/arm64/kvm/hyp/include/nvhe/modules.h | 1 + arch/arm64/kvm/hyp/nvhe/modules.c | 1 + arch/arm64/kvm/hyp/nvhe/switch.c | 9 +++++++++ 4 files changed, 12 insertions(+) diff --git a/arch/arm64/include/asm/kvm_pkvm_module.h b/arch/arm64/include/asm/kvm_pkvm_module.h index b77d5e8382e4..68b0fbbf2fd4 100644 --- a/arch/arm64/include/asm/kvm_pkvm_module.h +++ b/arch/arm64/include/asm/kvm_pkvm_module.h @@ -32,6 +32,7 @@ struct pkvm_module_ops { int (*register_default_trap_handler)(bool (*cb)(struct kvm_cpu_context *)); int (*register_illegal_abt_notifier)(void (*cb)(struct kvm_cpu_context *)); int (*register_psci_notifier)(void (*cb)(enum pkvm_psci_notification, struct kvm_cpu_context *)); + int (*register_hyp_panic_notifier)(void (*cb)(struct kvm_cpu_context *host_ctxt)); }; struct pkvm_module_section { diff --git a/arch/arm64/kvm/hyp/include/nvhe/modules.h b/arch/arm64/kvm/hyp/include/nvhe/modules.h index 681fd233fd2b..278e53d29f16 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/modules.h +++ b/arch/arm64/kvm/hyp/include/nvhe/modules.h @@ -6,6 +6,7 @@ int __pkvm_register_host_smc_handler(bool (*cb)(struct kvm_cpu_context *)); int __pkvm_register_default_trap_handler(bool (*cb)(struct kvm_cpu_context *)); int __pkvm_register_illegal_abt_notifier(void (*cb)(struct kvm_cpu_context *)); +int __pkvm_register_hyp_panic_notifier(void (*cb)(struct kvm_cpu_context *)); enum pkvm_psci_notification; int __pkvm_register_psci_notifier(void (*cb)(enum pkvm_psci_notification, struct kvm_cpu_context *)); diff --git a/arch/arm64/kvm/hyp/nvhe/modules.c b/arch/arm64/kvm/hyp/nvhe/modules.c index 5b657f80d134..075dfc2b142d 100644 --- a/arch/arm64/kvm/hyp/nvhe/modules.c +++ b/arch/arm64/kvm/hyp/nvhe/modules.c @@ -109,6 +109,7 @@ const struct pkvm_module_ops module_ops = { .register_default_trap_handler = __pkvm_register_default_trap_handler, .register_illegal_abt_notifier = __pkvm_register_illegal_abt_notifier, .register_psci_notifier = __pkvm_register_psci_notifier, + .register_hyp_panic_notifier = __pkvm_register_hyp_panic_notifier, }; int __pkvm_init_module(void *module_init) diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index 310aaf493909..0409fb351d85 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -333,6 +333,12 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) return exit_code; } +static void (*hyp_panic_notifier)(struct kvm_cpu_context *host_ctxt); +int __pkvm_register_hyp_panic_notifier(void (*cb)(struct kvm_cpu_context *host_ctxt)) +{ + return cmpxchg(&hyp_panic_notifier, NULL, cb) ? -EBUSY : 0; +} + asmlinkage void __noreturn hyp_panic(void) { u64 spsr = read_sysreg_el2(SYS_SPSR); @@ -344,6 +350,9 @@ asmlinkage void __noreturn hyp_panic(void) host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; vcpu = host_ctxt->__hyp_running_vcpu; + if (READ_ONCE(hyp_panic_notifier)) + hyp_panic_notifier(host_ctxt); + if (vcpu) { __timer_disable_traps(vcpu); __deactivate_traps(vcpu); From e6af8b20245772df240114b922b9e1d0a101553f Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Thu, 24 Nov 2022 10:09:12 +0000 Subject: [PATCH 419/457] ANDROID: KVM: arm64: Introduce default_host_prot() pKVM uses different default permissions for memory and non-memory regions of the PA space. To avoid scattering this logic around, introduce a default_host_prot() helper function. Non functional changes intended. 
Bug: 244543039 Bug: 244373730 Change-Id: I36cdbb26a2cb0d54b5641f945f6ede4ffe371045 Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/nvhe/mem_protect.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 9ea3970ad293..d0e59fbc75bc 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -498,6 +498,11 @@ static struct memblock_region *find_mem_range(phys_addr_t addr, struct kvm_mem_r return NULL; } +static enum kvm_pgtable_prot default_host_prot(bool is_memory) +{ + return is_memory ? PKVM_HOST_MEM_PROT : PKVM_HOST_MMIO_PROT; +} + bool addr_is_memory(phys_addr_t phys) { struct kvm_mem_range range; @@ -652,10 +657,7 @@ static bool host_stage2_force_pte(u64 addr, u64 end, enum kvm_pgtable_prot prot) * mappings, hence avoiding to lose the state because of side-effects in * kvm_pgtable_stage2_map(). */ - if (range_is_memory(addr, end)) - return prot != PKVM_HOST_MEM_PROT; - else - return prot != PKVM_HOST_MMIO_PROT; + return prot != default_host_prot(range_is_memory(addr, end)); } static bool host_stage2_pte_is_counted(kvm_pte_t pte, u32 level) @@ -686,12 +688,11 @@ static int host_stage2_idmap(u64 addr) { struct kvm_mem_range range; bool is_memory = !!find_mem_range(addr, &range); - enum kvm_pgtable_prot prot; + enum kvm_pgtable_prot prot = default_host_prot(is_memory); int ret; hyp_assert_lock_held(&host_mmu.lock); - prot = is_memory ? PKVM_HOST_MEM_PROT : PKVM_HOST_MMIO_PROT; /* * Adjust against IOMMU devices first. host_stage2_adjust_range() should * be called last for proper alignment. From 39c484f30110fd4dd704b1204dec7a45bcf52894 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Thu, 24 Nov 2022 10:12:02 +0000 Subject: [PATCH 420/457] ANDROID: KVM: arm64: Correctly flag MMIO pages as PKVM_PAGE_RESTRICTED_PROT The host_get_page_state() logic has currently a baked in assumption that it will only be used on memory, and checks against the default memory permssions to flag pages as having a RESTRICTED_PROT state. Add support for correctly flagging non-memory pages to prepare the ground for future patches. 
Bug: 244543039 Bug: 244373730 Change-Id: Idaaef96cb98c147c8b793059438064cf770af525 Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/nvhe/mem_protect.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index d0e59fbc75bc..53cc9b80b93d 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -897,13 +897,17 @@ static enum pkvm_page_state host_get_page_state(kvm_pte_t pte) { enum pkvm_page_state state = 0; enum kvm_pgtable_prot prot; + phys_addr_t phys; if (!kvm_pte_valid(pte) && pte) return PKVM_NOPAGE; prot = kvm_pgtable_stage2_pte_prot(pte); - if (kvm_pte_valid(pte) && ((prot & KVM_PGTABLE_PROT_RWX) != PKVM_HOST_MEM_PROT)) - state = PKVM_PAGE_RESTRICTED_PROT; + if (kvm_pte_valid(pte)) { + phys = kvm_pte_to_phys(pte); + if ((prot & KVM_PGTABLE_PROT_RWX) != default_host_prot(addr_is_memory(phys))) + state = PKVM_PAGE_RESTRICTED_PROT; + } return state | pkvm_getstate(prot); } From 51a84221b108408dfd76e08652698ccf371e60b9 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Mon, 12 Dec 2022 18:09:47 +0000 Subject: [PATCH 421/457] ANDROID: KVM: arm64: Introduce concept of pKVM moveable regions The pKVM memory pool is currently sized to allow page-granularity mapping in the host stage-2 page-table of all the memory as well as up to 1GiB of MMIO range. Indeed, pKVM currently assumes that MMIO regions are completely and solely owned by the host for the entire lifetime of the system. As such, the pages used to map MMIO regions can always be recycled to allow forward progress if the memory pool ran out of pages -- pKVM can unmap MMIO ranges at stage-2 without fearing to loose important information about the state of the underlying page, and those mappings can always be reconstructed later. In order to allow transitioning the ownership of non-memory regions, introduce a concept of pkvm 'moveable' regions, which represents regions of the physical address space which can be 'moved' from an ownership perspective. These moveable regions are used to size the hyp memory pool. In a first step, the list of moveable regions is equal to the memblock list, but it will be extended in subsequent changes. No functional changes intended. Bug: 244543039 Bug: 244373730 Change-Id: I7f451924b1eed9579868e6ff8c7adc7b4a5a0ae1 Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_pkvm.h | 28 ++++++--- arch/arm64/kvm/hyp/include/nvhe/mem_protect.h | 2 +- arch/arm64/kvm/hyp/nvhe/iommu.c | 2 +- arch/arm64/kvm/hyp/nvhe/mem_protect.c | 36 +++++++----- arch/arm64/kvm/pkvm.c | 57 +++++++++++++++++++ 5 files changed, 101 insertions(+), 24 deletions(-) diff --git a/arch/arm64/include/asm/kvm_pkvm.h b/arch/arm64/include/asm/kvm_pkvm.h index 7ac74dd69b34..49aa99fad1b3 100644 --- a/arch/arm64/include/asm/kvm_pkvm.h +++ b/arch/arm64/include/asm/kvm_pkvm.h @@ -242,6 +242,20 @@ static inline int pkvm_get_max_wrps(void) return num ? 
num + 1 : 0; } +enum pkvm_moveable_reg_type { + PKVM_MREG_MEMORY, +}; + +struct pkvm_moveable_reg { + phys_addr_t start; + u64 size; + enum pkvm_moveable_reg_type type; +}; + +#define PKVM_NR_MOVEABLE_REGS 512 +extern struct pkvm_moveable_reg kvm_nvhe_sym(pkvm_moveable_regs)[]; +extern unsigned int kvm_nvhe_sym(pkvm_moveable_regs_nr); + extern struct memblock_region kvm_nvhe_sym(hyp_memory)[]; extern unsigned int kvm_nvhe_sym(hyp_memblock_nr); @@ -292,13 +306,13 @@ static inline unsigned long __hyp_pgtable_max_pages(unsigned long nr_pages) return total; } -static inline unsigned long __hyp_pgtable_total_pages(void) +static inline unsigned long __hyp_pgtable_moveable_regs_pages(void) { unsigned long res = 0, i; - /* Cover all of memory with page-granularity */ - for (i = 0; i < kvm_nvhe_sym(hyp_memblock_nr); i++) { - struct memblock_region *reg = &kvm_nvhe_sym(hyp_memory)[i]; + /* Cover all of moveable regions with page-granularity */ + for (i = 0; i < kvm_nvhe_sym(pkvm_moveable_regs_nr); i++) { + struct pkvm_moveable_reg *reg = &kvm_nvhe_sym(pkvm_moveable_regs)[i]; res += __hyp_pgtable_max_pages(reg->size >> PAGE_SHIFT); } @@ -309,7 +323,7 @@ static inline unsigned long hyp_s1_pgtable_pages(void) { unsigned long res; - res = __hyp_pgtable_total_pages(); + res = __hyp_pgtable_moveable_regs_pages(); /* Allow 1 GiB for private mappings */ res += __hyp_pgtable_max_pages(SZ_1G >> PAGE_SHIFT); @@ -325,9 +339,9 @@ static inline unsigned long host_s2_pgtable_pages(void) * Include an extra 16 pages to safely upper-bound the worst case of * concatenated pgds. */ - res = __hyp_pgtable_total_pages() + 16; + res = __hyp_pgtable_moveable_regs_pages() + 16; - /* Allow 1 GiB for MMIO mappings */ + /* Allow 1 GiB for non-moveable regions */ res += __hyp_pgtable_max_pages(SZ_1G >> PAGE_SHIFT); return res; diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h index 64d643355759..751f860770ac 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h @@ -87,7 +87,7 @@ bool addr_is_memory(phys_addr_t phys); int host_stage2_idmap_locked(phys_addr_t addr, u64 size, enum kvm_pgtable_prot prot, bool update_iommu); int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, enum pkvm_component_id owner_id); -int host_stage2_unmap_dev_locked(phys_addr_t start, u64 size); +int host_stage2_unmap_reg_locked(phys_addr_t start, u64 size); int kvm_host_prepare_stage2(void *pgt_pool_base); int kvm_guest_prepare_stage2(struct pkvm_hyp_vm *vm, void *pgd); void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt); diff --git a/arch/arm64/kvm/hyp/nvhe/iommu.c b/arch/arm64/kvm/hyp/nvhe/iommu.c index df974eba1e0f..a6073f443c8f 100644 --- a/arch/arm64/kvm/hyp/nvhe/iommu.c +++ b/arch/arm64/kvm/hyp/nvhe/iommu.c @@ -424,7 +424,7 @@ int __pkvm_iommu_register(unsigned long dev_id, unsigned long drv_id, * is successful, future attempts to re-map will be blocked by * pkvm_iommu_host_stage2_adjust_range. 
*/ - ret = host_stage2_unmap_dev_locked(dev_pa, dev_size); + ret = host_stage2_unmap_reg_locked(dev_pa, dev_size); if (ret) goto out_free; diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 53cc9b80b93d..c1f457a56176 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -25,6 +25,9 @@ struct host_mmu host_mmu; +struct pkvm_moveable_reg pkvm_moveable_regs[PKVM_NR_MOVEABLE_REGS]; +unsigned int pkvm_moveable_regs_nr; + static struct hyp_pool host_s2_pool; static DEFINE_PER_CPU(struct pkvm_hyp_vm *, __current_vm); @@ -432,7 +435,7 @@ int __pkvm_prot_finalize(void) return 0; } -int host_stage2_unmap_dev_locked(phys_addr_t start, u64 size) +int host_stage2_unmap_reg_locked(phys_addr_t start, u64 size) { int ret; @@ -446,21 +449,24 @@ int host_stage2_unmap_dev_locked(phys_addr_t start, u64 size) return 0; } -static int host_stage2_unmap_dev_all(void) +static int host_stage2_unmap_unmoveable_regs(void) { struct kvm_pgtable *pgt = &host_mmu.pgt; - struct memblock_region *reg; + struct pkvm_moveable_reg *reg; u64 addr = 0; int i, ret; - /* Unmap all non-memory regions to recycle the pages */ - for (i = 0; i < hyp_memblock_nr; i++, addr = reg->base + reg->size) { - reg = &hyp_memory[i]; - ret = host_stage2_unmap_dev_locked(addr, reg->base - addr); - if (ret) - return ret; + /* Unmap all unmoveable regions to recycle the pages */ + for (i = 0; i < pkvm_moveable_regs_nr; i++) { + reg = &pkvm_moveable_regs[i]; + if (reg->start > addr) { + ret = host_stage2_unmap_reg_locked(addr, reg->start - addr); + if (ret) + return ret; + } + addr = max(addr, reg->start + reg->size); } - return host_stage2_unmap_dev_locked(addr, BIT(pgt->ia_bits) - addr); + return host_stage2_unmap_reg_locked(addr, BIT(pgt->ia_bits) - addr); } struct kvm_mem_range { @@ -552,10 +558,10 @@ static inline int __host_stage2_idmap(u64 start, u64 end, } /* - * The pool has been provided with enough pages to cover all of memory with - * page granularity, but it is difficult to know how much of the MMIO range - * we will need to cover upfront, so we may need to 'recycle' the pages if we - * run out. + * The pool has been provided with enough pages to cover all of moveable regions + * with page granularity, but it is difficult to know how much of the + * non-moveable regions we will need to cover upfront, so we may need to + * 'recycle' the pages if we run out. */ #define host_stage2_try(fn, ...) 
\ ({ \ @@ -563,7 +569,7 @@ static inline int __host_stage2_idmap(u64 start, u64 end, hyp_assert_lock_held(&host_mmu.lock); \ __ret = fn(__VA_ARGS__); \ if (__ret == -ENOMEM) { \ - __ret = host_stage2_unmap_dev_all(); \ + __ret = host_stage2_unmap_unmoveable_regs(); \ if (!__ret) \ __ret = fn(__VA_ARGS__); \ } \ diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c index f53776f04a95..c2979cc77804 100644 --- a/arch/arm64/kvm/pkvm.c +++ b/arch/arm64/kvm/pkvm.c @@ -24,6 +24,7 @@ static struct reserved_mem *pkvm_firmware_mem; static phys_addr_t *pvmfw_base = &kvm_nvhe_sym(pvmfw_base); static phys_addr_t *pvmfw_size = &kvm_nvhe_sym(pvmfw_size); +static struct pkvm_moveable_reg *moveable_regs = kvm_nvhe_sym(pkvm_moveable_regs); static struct memblock_region *hyp_memory = kvm_nvhe_sym(hyp_memory); static unsigned int *hyp_memblock_nr_ptr = &kvm_nvhe_sym(hyp_memblock_nr); @@ -63,6 +64,55 @@ static int __init register_memblock_regions(void) return 0; } +static int cmp_moveable_reg(const void *p1, const void *p2) +{ + const struct pkvm_moveable_reg *r1 = p1; + const struct pkvm_moveable_reg *r2 = p2; + + /* + * Moveable regions may overlap, so put the largest one first when start + * addresses are equal to allow a simpler walk from e.g. + * host_stage2_unmap_unmoveable_regs(). + */ + if (r1->start < r2->start) + return -1; + else if (r1->start > r2->start) + return 1; + else if (r1->size > r2->size) + return -1; + else if (r1->size < r2->size) + return 1; + return 0; +} + +static void __init sort_moveable_regs(void) +{ + sort(moveable_regs, + kvm_nvhe_sym(pkvm_moveable_regs_nr), + sizeof(struct pkvm_moveable_reg), + cmp_moveable_reg, + NULL); +} + +static int __init register_moveable_regions(void) +{ + struct memblock_region *reg; + int i = 0; + + for_each_mem_region(reg) { + if (i >= PKVM_NR_MOVEABLE_REGS) + return -ENOMEM; + moveable_regs[i].start = reg->base; + moveable_regs[i].size = reg->size; + moveable_regs[i].type = PKVM_MREG_MEMORY; + i++; + } + kvm_nvhe_sym(pkvm_moveable_regs_nr) = i; + sort_moveable_regs(); + + return 0; +} + void __init kvm_hyp_reserve(void) { u64 hyp_mem_pages = 0; @@ -81,6 +131,13 @@ void __init kvm_hyp_reserve(void) return; } + ret = register_moveable_regions(); + if (ret) { + *hyp_memblock_nr_ptr = 0; + kvm_err("Failed to register pkvm moveable regions: %d\n", ret); + return; + } + hyp_mem_pages += hyp_s1_pgtable_pages(); hyp_mem_pages += host_s2_pgtable_pages(); hyp_mem_pages += hyp_vm_table_pages(); From 9d56fc78909afe95f69d5496a74295b8deb8bbe1 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Mon, 12 Dec 2022 18:47:56 +0000 Subject: [PATCH 422/457] ANDROID: KVM: arm64: Specify stage-2-protected regions in DT Parse the devicetree during pKVM init to find nodes with the "pkvm,protected-region" compatible string. These nodes specify, in their reg property, a physical address range that must always be mapped as invalid in the host stage-2 page table when running under pKVM.
Example DT: pkvm_prot_reg: pkvm_prot_reg@80000000 { compatible = "pkvm,protected-region"; reg = <0x00 0x80000000 0x00 0x200000>; }; Bug: 244543039 Bug: 244373730 Change-Id: I102cd16c91d96e5283cdd1a4fa58836cc4834eac Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_pkvm.h | 1 + arch/arm64/kvm/hyp/include/nvhe/mem_protect.h | 4 ++- arch/arm64/kvm/hyp/nvhe/mem_protect.c | 13 ++++++++++ arch/arm64/kvm/hyp/nvhe/setup.c | 21 +++++++++++++++ arch/arm64/kvm/pkvm.c | 26 +++++++++++++++++++ 5 files changed, 64 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/kvm_pkvm.h b/arch/arm64/include/asm/kvm_pkvm.h index 49aa99fad1b3..6cd974fc584f 100644 --- a/arch/arm64/include/asm/kvm_pkvm.h +++ b/arch/arm64/include/asm/kvm_pkvm.h @@ -244,6 +244,7 @@ static inline int pkvm_get_max_wrps(void) enum pkvm_moveable_reg_type { PKVM_MREG_MEMORY, + PKVM_MREG_PROTECTED_RANGE, }; struct pkvm_moveable_reg { diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h index 751f860770ac..6c7419abd54d 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h @@ -60,7 +60,8 @@ enum pkvm_component_id { PKVM_ID_HYP, PKVM_ID_GUEST, PKVM_ID_FFA, - PKVM_ID_MAX = PKVM_ID_FFA, + PKVM_ID_PROTECTED, + PKVM_ID_MAX = PKVM_ID_PROTECTED, }; extern unsigned long hyp_nr_cpus; @@ -87,6 +88,7 @@ bool addr_is_memory(phys_addr_t phys); int host_stage2_idmap_locked(phys_addr_t addr, u64 size, enum kvm_pgtable_prot prot, bool update_iommu); int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, enum pkvm_component_id owner_id); +int host_stage2_protect_pages_locked(phys_addr_t addr, u64 size); int host_stage2_unmap_reg_locked(phys_addr_t start, u64 size); int kvm_host_prepare_stage2(void *pgt_pool_base); int kvm_guest_prepare_stage2(struct pkvm_hyp_vm *vm, void *pgd); diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index c1f457a56176..a0107f99abfa 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -2235,3 +2235,16 @@ bool __pkvm_check_ioguard_page(struct pkvm_hyp_vcpu *hyp_vcpu) return ret; } + +int host_stage2_protect_pages_locked(phys_addr_t addr, u64 size) +{ + int ret; + + hyp_assert_lock_held(&host_mmu.lock); + + ret = __host_check_page_state_range(addr, size, PKVM_PAGE_OWNED); + if (!ret) + ret = host_stage2_set_owner_locked(addr, size, PKVM_ID_PROTECTED); + + return ret; +} diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c index 5e379f03fcca..a3f787859fa5 100644 --- a/arch/arm64/kvm/hyp/nvhe/setup.c +++ b/arch/arm64/kvm/hyp/nvhe/setup.c @@ -303,6 +303,23 @@ static int fix_hyp_pgtable_refcnt(void) &walker); } +static int unmap_protected_regions(void) +{ + struct pkvm_moveable_reg *reg; + int i, ret; + + for (i = 0; i < pkvm_moveable_regs_nr; i++) { + reg = &pkvm_moveable_regs[i]; + if (reg->type != PKVM_MREG_PROTECTED_RANGE) + continue; + ret = host_stage2_protect_pages_locked(reg->start, reg->size); + if (ret) + return ret; + } + + return 0; +} + void __noreturn __pkvm_init_finalise(void) { struct kvm_host_data *host_data = this_cpu_ptr(&kvm_host_data); @@ -344,6 +361,10 @@ void __noreturn __pkvm_init_finalise(void) if (ret) goto out; + ret = unmap_protected_regions(); + if (ret) + goto out; + ret = hyp_ffa_init(ffa_proxy_pages); if (ret) goto out; diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c index c2979cc77804..2ff0eb18a57d 100644 --- a/arch/arm64/kvm/pkvm.c +++ 
b/arch/arm64/kvm/pkvm.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -97,6 +98,7 @@ static void __init sort_moveable_regs(void) static int __init register_moveable_regions(void) { struct memblock_region *reg; + struct device_node *np; int i = 0; for_each_mem_region(reg) { @@ -107,6 +109,30 @@ static int __init register_moveable_regions(void) moveable_regs[i].type = PKVM_MREG_MEMORY; i++; } + + for_each_compatible_node(np, NULL, "pkvm,protected-region") { + struct resource res; + u64 start, size; + int ret; + + if (i >= PKVM_NR_MOVEABLE_REGS) + return -ENOMEM; + + ret = of_address_to_resource(np, 0, &res); + if (ret) + return ret; + + start = res.start; + size = resource_size(&res); + if (!PAGE_ALIGNED(start) || !PAGE_ALIGNED(size)) + return -EINVAL; + + moveable_regs[i].start = start; + moveable_regs[i].size = size; + moveable_regs[i].type = PKVM_MREG_PROTECTED_RANGE; + i++; + } + kvm_nvhe_sym(pkvm_moveable_regs_nr) = i; sort_moveable_regs(); From b773c2285613cb3b20abc1d236c534fa2d8f4670 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Tue, 13 Dec 2022 09:13:39 +0000 Subject: [PATCH 423/457] ANDROID: KVM: arm64: Keep the pKVM private range under 1GiB The hypervisor memory pool is sized to allow mapping up to 1GiB of data in the 'private' range of the hypervisor. However, this is currently not enforced in any way, which might become a problem as private range mappings are used more and more (e.g. from pKVM modules). Enforce the 1GiB limit at allocation time, and while at it, rename __io_map_base to __private_range_base for consistency. Bug: 244543039 Change-Id: I32c9145ba331309b49428ff461a41c94ea0c1512 Signed-off-by: Quentin Perret --- arch/arm64/include/asm/kvm_pkvm.h | 5 +++-- arch/arm64/kvm/hyp/nvhe/mm.c | 26 ++++++++++++++------------ 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/arch/arm64/include/asm/kvm_pkvm.h b/arch/arm64/include/asm/kvm_pkvm.h index 6cd974fc584f..2c7652bfe362 100644 --- a/arch/arm64/include/asm/kvm_pkvm.h +++ b/arch/arm64/include/asm/kvm_pkvm.h @@ -320,14 +320,15 @@ static inline unsigned long __hyp_pgtable_moveable_regs_pages(void) return res; } +#define __PKVM_PRIVATE_SZ SZ_1G + static inline unsigned long hyp_s1_pgtable_pages(void) { unsigned long res; res = __hyp_pgtable_moveable_regs_pages(); - /* Allow 1 GiB for private mappings */ - res += __hyp_pgtable_max_pages(SZ_1G >> PAGE_SHIFT); + res += __hyp_pgtable_max_pages(__PKVM_PRIVATE_SZ >> PAGE_SHIFT); return res; } diff --git a/arch/arm64/kvm/hyp/nvhe/mm.c b/arch/arm64/kvm/hyp/nvhe/mm.c index 6ed9bf0819bc..58ddd7d1711e 100644 --- a/arch/arm64/kvm/hyp/nvhe/mm.c +++ b/arch/arm64/kvm/hyp/nvhe/mm.c @@ -25,7 +25,8 @@ hyp_spinlock_t pkvm_pgd_lock; struct memblock_region hyp_memory[HYP_MEMBLOCK_REGIONS]; unsigned int hyp_memblock_nr; -static u64 __io_map_base; +static u64 __private_range_base; +static u64 __private_range_cur; struct hyp_fixmap_slot { u64 addr; @@ -50,29 +51,29 @@ static int __pkvm_create_mappings(unsigned long start, unsigned long size, * @size: The size of the VA range to reserve. * @haddr: The hypervisor virtual start address of the allocation. * - * The private virtual address (VA) range is allocated above __io_map_base + * The private virtual address (VA) range is allocated above __private_range_base * and aligned based on the order of @size. * * Return: 0 on success or negative error code on failure. 
*/ int pkvm_alloc_private_va_range(size_t size, unsigned long *haddr) { - unsigned long base, addr; + unsigned long cur, addr; int ret = 0; hyp_spin_lock(&pkvm_pgd_lock); /* Align the allocation based on the order of its size */ - addr = ALIGN(__io_map_base, PAGE_SIZE << get_order(size)); + addr = ALIGN(__private_range_cur, PAGE_SIZE << get_order(size)); /* The allocated size is always a multiple of PAGE_SIZE */ - base = addr + PAGE_ALIGN(size); + cur = addr + PAGE_ALIGN(size); - /* Are we overflowing on the vmemmap ? */ - if (!addr || base > __hyp_vmemmap) + /* Has the private range grown too large ? */ + if (!addr || cur > __hyp_vmemmap || (cur - __private_range_base) > __PKVM_PRIVATE_SZ) { ret = -ENOMEM; - else { - __io_map_base = base; + } else { + __private_range_cur = cur; *haddr = addr; } @@ -386,9 +387,10 @@ int hyp_create_idmap(u32 hyp_va_bits) * with the idmap to place the IOs and the vmemmap. IOs use the lower * half of the quarter and the vmemmap the upper half. */ - __io_map_base = start & BIT(hyp_va_bits - 2); - __io_map_base ^= BIT(hyp_va_bits - 2); - __hyp_vmemmap = __io_map_base | BIT(hyp_va_bits - 3); + __private_range_base = start & BIT(hyp_va_bits - 2); + __private_range_base ^= BIT(hyp_va_bits - 2); + __private_range_cur = __private_range_base; + __hyp_vmemmap = __private_range_base | BIT(hyp_va_bits - 3); return __pkvm_create_mappings(start, end - start, start, PAGE_HYP_EXEC); } From 0f2c334fd25a95b2752345ce75839d5abc8b4567 Mon Sep 17 00:00:00 2001 From: Sebastian Ene Date: Fri, 6 Jan 2023 15:34:52 +0000 Subject: [PATCH 424/457] Revert "ANDROID: KVM: arm64: Coalesce host stage2 entries on ownership reclaim" This reverts commit 1af7ed3212d977869ed312e1f9b43ab259d36727. Bug: 264333547 Test: /data/local/tmp/sebastianene/tests/test_host_app Change-Id: Id88b705dd725cc8720913fd2909030c2f2fb597f Signed-off-by: Sebastian Ene --- arch/arm64/include/asm/kvm_pgtable.h | 18 ------------ arch/arm64/kvm/hyp/nvhe/mem_protect.c | 18 ++---------- arch/arm64/kvm/hyp/pgtable.c | 41 +-------------------------- 3 files changed, 3 insertions(+), 74 deletions(-) diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index 8e8cd6bc6433..054612a2a7fc 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -210,24 +210,6 @@ enum kvm_pgtable_prot { #define PKVM_HOST_MEM_PROT KVM_PGTABLE_PROT_RWX #define PKVM_HOST_MMIO_PROT KVM_PGTABLE_PROT_RW -#define KVM_HOST_S2_DEFAULT_ATTR (KVM_PTE_LEAF_ATTR_HI | \ - KVM_PTE_LEAF_ATTR_LO) - -#define KVM_HOST_S2_DEFAULT_MEM_PTE \ - (KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR | \ - KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | \ - KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \ - KVM_PTE_LEAF_ATTR_LO_S2_SH | \ - KVM_PTE_LEAF_ATTR_LO_S2_AF) - -#define KVM_HOST_S2_DEFAULT_MMIO_PTE \ - (KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR | \ - KVM_PTE_LEAF_ATTR_HI_S2_XN | \ - KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | \ - KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \ - KVM_PTE_LEAF_ATTR_LO_S2_SH | \ - KVM_PTE_LEAF_ATTR_LO_S2_AF) - #define PAGE_HYP KVM_PGTABLE_PROT_RW #define PAGE_HYP_EXEC (KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_X) #define PAGE_HYP_RO (KVM_PGTABLE_PROT_R) diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index a0107f99abfa..098ff16f408d 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -181,7 +181,7 @@ static bool guest_stage2_force_pte_cb(u64 addr, u64 end, static bool guest_stage2_pte_is_counted(kvm_pte_t pte, u32 level) { - return !!pte; + return 
host_stage2_pte_is_counted(pte, level); } static void *guest_s2_zalloc_pages_exact(size_t size) @@ -668,26 +668,12 @@ static bool host_stage2_force_pte(u64 addr, u64 end, enum kvm_pgtable_prot prot) static bool host_stage2_pte_is_counted(kvm_pte_t pte, u32 level) { - u64 phys; - /* * The refcount tracks valid entries as well as invalid entries if they * encode ownership of a page to another entity than the page-table * owner, whose id is 0. */ - if (!kvm_pte_valid(pte)) - return !!pte; - - if (kvm_pte_table(pte, level)) - return true; - - phys = kvm_pte_to_phys(pte); - if (!addr_is_memory(phys)) - return (pte & KVM_HOST_S2_DEFAULT_ATTR) != - KVM_HOST_S2_DEFAULT_MMIO_PTE; - else - return (pte & KVM_HOST_S2_DEFAULT_ATTR) != - KVM_HOST_S2_DEFAULT_MEM_PTE; + return !!pte; } static int host_stage2_idmap(u64 addr) diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 6aac30b3ba7f..e48b66b744d5 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -787,13 +787,6 @@ static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, */ if (pte_ops->pte_is_counted_cb(pte, level)) stage2_put_pte(ptep, data->mmu, addr, level, mm_ops); - else { - /* - * On non-refcounted PTEs we just clear them out without - * dropping the refcount. - */ - stage2_clear_pte(ptep, data->mmu, addr, level); - } kvm_set_table_pte(ptep, childp, mm_ops); mm_ops->get_page(ptep); @@ -801,35 +794,6 @@ static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, return 0; } -static void stage2_coalesce_walk_table_post(u64 addr, u64 end, u32 level, - kvm_pte_t *ptep, - struct stage2_map_data *data) -{ - struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops; - kvm_pte_t *childp = kvm_pte_follow(*ptep, mm_ops); - - /* - * Decrement the refcount only on the set ownership path to avoid a - * loop situation when the following happens: - * 1. We take a host stage2 fault and we create a small mapping which - * has default attributes (is not refcounted). - * 2. On the way back we execute the post handler and we zap the - * table that holds our mapping. - */ - if (kvm_phys_is_valid(data->phys) || - !kvm_level_supports_block_mapping(level)) - return; - - /* - * Free a page that is not referenced anymore and drop the reference - * of the page table page. - */ - if (mm_ops->page_count(childp) == 1) { - stage2_put_pte(ptep, data->mmu, addr, level, mm_ops); - mm_ops->put_page(childp); - } -} - static int stage2_map_walk_table_post(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, struct stage2_map_data *data) @@ -838,11 +802,8 @@ static int stage2_map_walk_table_post(u64 addr, u64 end, u32 level, kvm_pte_t *childp; int ret = 0; - if (!data->anchor) { - stage2_coalesce_walk_table_post(addr, end, level, ptep, - data); + if (!data->anchor) return 0; - } if (data->anchor == ptep) { childp = data->childp; From bd4ccca4a7c8e05097ba832705556e0ddd767a97 Mon Sep 17 00:00:00 2001 From: "Isaac J. Manjarres" Date: Thu, 5 Jan 2023 17:53:25 -0800 Subject: [PATCH 425/457] ANDROID: kbuild: Search external devicetree path when running clean target When running the clean target, kbuild is supposed to remove dtbs and dtbos. However, kbuild only searches for those devicetree build artifacts in the directory that the kernel binaries or kernel modules are output to. This is not sufficient in cases where an external devicetree is used, so include the external devicetree path when searching for dtb and dtbo build artifacts. 
Bug: 264602319 Fixes: 3d42cc9e75a3 ("ANDROID: kbuild: add support for compiling external device trees") Change-Id: I45fdfdef09c3d57401d98e5db731273147d7d265 [isaacmanjarres: resolved trivial merge conflict] Signed-off-by: Isaac J. Manjarres --- Makefile | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 228fa03ca139..bfd92cb667f6 100644 --- a/Makefile +++ b/Makefile @@ -2039,7 +2039,9 @@ $(clean-dirs): clean: $(clean-dirs) $(call cmd,rmfiles) - @find $(or $(KBUILD_EXTMOD), .) $(RCS_FIND_IGNORE) \ + @find $(or $(KBUILD_EXTMOD), .) \ + $(if $(filter-out arch/$(SRCARCH)/boot/dts, $(dtstree)), $(dtstree)) \ + $(RCS_FIND_IGNORE) \ \( -name '*.[aios]' -o -name '*.rsi' -o -name '*.ko' -o -name '.*.cmd' \ -o -name '*.ko.*' \ -o -name '*.dtb' -o -name '*.dtbo' -o -name '*.dtb.S' -o -name '*.dt.yaml' \ From 8382f516fb5b62bfc77439d5776fa1c6e45bd2d0 Mon Sep 17 00:00:00 2001 From: Ray Chi Date: Thu, 13 May 2021 13:47:45 +0800 Subject: [PATCH 426/457] ANDROID: usb: gadget: f_accessory: update SS/SSP descriptors Currently, only the HS descriptors are updated with the endpoint address during the binding process. According to the current max_speed setting in configfs, this patch also updates the SS/SSP descriptors with the endpoint address. Bug: 162562782 Signed-off-by: Ray Chi Change-Id: I67983ef47df7ac567ec1d3af80921c39c98a545d (cherry picked from commit 41fe558317e9ffdc07326e8ef9ca6ea596d9a518) (cherry picked from commit ba3ec687b701c60f3b9336e6549b94ed84d8fe77) --- drivers/usb/gadget/function/f_accessory.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/drivers/usb/gadget/function/f_accessory.c b/drivers/usb/gadget/function/f_accessory.c index 36f775790ed9..3510f6d39f0c 100644 --- a/drivers/usb/gadget/function/f_accessory.c +++ b/drivers/usb/gadget/function/f_accessory.c @@ -1125,12 +1125,22 @@ __acc_function_bind(struct usb_configuration *c, return ret; /* support high speed hardware */ - if (gadget_is_dualspeed(c->cdev->gadget)) { - acc_highspeed_in_desc.bEndpointAddress = - acc_fullspeed_in_desc.bEndpointAddress; - acc_highspeed_out_desc.bEndpointAddress = - acc_fullspeed_out_desc.bEndpointAddress; - } + acc_highspeed_in_desc.bEndpointAddress = + acc_fullspeed_in_desc.bEndpointAddress; + acc_highspeed_out_desc.bEndpointAddress = + acc_fullspeed_out_desc.bEndpointAddress; + + /* support super speed hardware */ + acc_superspeed_in_desc.bEndpointAddress = + acc_fullspeed_in_desc.bEndpointAddress; + acc_superspeed_out_desc.bEndpointAddress = + acc_fullspeed_out_desc.bEndpointAddress; + + /* support super speed plus hardware */ + acc_superspeedplus_in_desc.bEndpointAddress = + acc_fullspeed_in_desc.bEndpointAddress; + acc_superspeedplus_out_desc.bEndpointAddress = + acc_fullspeed_out_desc.bEndpointAddress; DBG(cdev, "%s speed %s: IN/%s, OUT/%s\n", gadget_is_dualspeed(c->cdev->gadget) ? "dual" : "full", From 3f48f34ca5d1ae44374555bd2fabff361be57a7b Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Thu, 22 Dec 2022 13:16:29 -0800 Subject: [PATCH 427/457] ANDROID: iommu: Add a vendor field in iova_domain The alloc_iova() implementation uses a first-fit algorithm when allocating an IOVA. On some devices, especially those with a 32-bit IOVA space, this can lead to fragmentation and result in larger IOVA allocations failing. For such devices, the best-fit algorithm works better. Add a vendor field to iova_domain that can be used to indicate that the best-fit algorithm should be used when allocating IOVAs for this device (iova_domain).
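As a rough illustration of why the placement policy matters in a small IOVA space, the standalone sketch below contrasts the two policies; it is not kernel code and not the vendor implementation (that is hooked in by the following patches): first-fit takes the first free gap that is large enough, while best-fit takes the smallest gap that still fits, leaving large gaps intact for large allocations.

/* Standalone sketch of first-fit vs best-fit placement; illustration only. */
#include <stdio.h>
#include <stddef.h>

struct gap { unsigned long start, len; };

/* First-fit: the first gap big enough wins. */
static const struct gap *first_fit(const struct gap *g, size_t n, unsigned long size)
{
	for (size_t i = 0; i < n; i++)
		if (g[i].len >= size)
			return &g[i];
	return NULL;
}

/* Best-fit: the smallest gap that still fits wins. */
static const struct gap *best_fit(const struct gap *g, size_t n, unsigned long size)
{
	const struct gap *best = NULL;

	for (size_t i = 0; i < n; i++)
		if (g[i].len >= size && (!best || g[i].len < best->len))
			best = &g[i];
	return best;
}

int main(void)
{
	const struct gap gaps[] = { { 0x1000, 0x8000 }, { 0xa000, 0x2000 } };

	/* First-fit splits the large gap; best-fit picks the exact-size one. */
	printf("first-fit: 0x%lx\n", first_fit(gaps, 2, 0x2000)->start);
	printf("best-fit:  0x%lx\n", best_fit(gaps, 2, 0x2000)->start);
	return 0;
}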
Bug: 263499813 Bug: 190519428 Bug: 149544392 Change-Id: Ie7dec70ee158075804209f83ae68e5ae0cc20775 Signed-off-by: Sukadev Bhattiprolu --- drivers/iommu/iova.c | 1 + include/linux/iova.h | 3 +++ 2 files changed, 4 insertions(+) diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c index a44ad92fc5eb..602888ec9572 100644 --- a/drivers/iommu/iova.c +++ b/drivers/iommu/iova.c @@ -70,6 +70,7 @@ init_iova_domain(struct iova_domain *iovad, unsigned long granule, iovad->anchor.pfn_lo = iovad->anchor.pfn_hi = IOVA_ANCHOR; rb_link_node(&iovad->anchor.node, NULL, &iovad->rbroot.rb_node); rb_insert_color(&iovad->anchor.node, &iovad->rbroot); + android_init_vendor_data(iovad, 1); } EXPORT_SYMBOL_GPL(init_iova_domain); diff --git a/include/linux/iova.h b/include/linux/iova.h index 83c00fac2acb..54fb81b2f5c8 100644 --- a/include/linux/iova.h +++ b/include/linux/iova.h @@ -13,6 +13,7 @@ #include #include #include +#include /* iova structure */ struct iova { @@ -38,6 +39,8 @@ struct iova_domain { struct iova_rcache *rcaches; struct hlist_node cpuhp_dead; + + ANDROID_VENDOR_DATA(1); }; static inline unsigned long iova_size(struct iova *iova) From 72b891ab7f27fcf6baf0c2317f800fa4c3bf2a84 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Thu, 22 Dec 2022 13:48:15 -0800 Subject: [PATCH 428/457] ANDROID: iommu: Add vendor hook to select alloc_iova algorithm Add a vendor hook that allows initializing the iovad->android_vendor_data1 field to indicate whether to use the first-fit or best-fit algorithm for this device when allocating IOVAs. Bug: 263499813 Bug: 190519428 Bug: 149544392 Change-Id: I0c894b32416a459d7a58aa076770daedcce67cc3 Signed-off-by: Sukadev Bhattiprolu --- drivers/android/vendor_hooks.c | 5 +++++ drivers/iommu/dma-iommu.c | 3 +++ include/trace/hooks/iommu.h | 4 ++++ 3 files changed, 12 insertions(+) diff --git a/drivers/android/vendor_hooks.c b/drivers/android/vendor_hooks.c index d4cb5666ee69..d41c18cc3407 100644 --- a/drivers/android/vendor_hooks.c +++ b/drivers/android/vendor_hooks.c @@ -6,6 +6,10 @@ * Copyright 2020 Google LLC */ +#ifndef __GENKSYSMS__ +#include +#endif + #define CREATE_TRACE_POINTS #include #include @@ -83,6 +87,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cgroup_attach); EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_iommu_setup_dma_ops); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_iommu_iovad_alloc_iova); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_iommu_iovad_free_iova); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_iommu_iovad_init_alloc_algo); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ptype_head); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_allow_domain_state); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cpuidle_psci_enter); diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 851bfc2cf074..7ac6dc205984 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -579,6 +579,9 @@ static int iommu_dma_init_domain(struct iommu_domain *domain, dma_addr_t base, } init_iova_domain(iovad, 1UL << order, base_pfn); + + trace_android_rvh_iommu_iovad_init_alloc_algo(dev, iovad); + ret = iova_domain_init_rcaches(iovad); if (ret) goto done_unlock; diff --git a/include/trace/hooks/iommu.h b/include/trace/hooks/iommu.h index 6357a6bc8f96..86b12dbb6d59 100644 --- a/include/trace/hooks/iommu.h +++ b/include/trace/hooks/iommu.h @@ -23,6 +23,10 @@ DECLARE_HOOK(android_vh_iommu_iovad_free_iova, TP_PROTO(struct iova_domain *iovad, dma_addr_t iova, size_t size), TP_ARGS(iovad, iova, size)); +DECLARE_RESTRICTED_HOOK(android_rvh_iommu_iovad_init_alloc_algo, + TP_PROTO(struct 
device *dev, struct iova_domain *iovad), + TP_ARGS(dev, iovad), 1); + #endif /* _TRACE_HOOK_IOMMU_H */ /* This part must be outside protection */ From 7e3069667d833d55c1d8c1921eebdacddf1aacc0 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Thu, 22 Dec 2022 14:00:46 -0800 Subject: [PATCH 429/457] ANDROID: iommu: Add vendor hook to alloc_iova() Add a vendor hook that allows overriding the default alloc_iova() algorithm. The vendor hook can use the iovad->android_vendor_data1 field, initialized in the previous patch, to determine whether to use the best-fit algorithm to allocate an IOVA for this device. If an IOVA was successfully allocated, the vendor hook returns 0. Otherwise it returns non-zero and we fall back to the default, first-fit algorithm. On some 32-bit devices, using a best-fit algorithm reduces the chances of allocation failure due to fragmentation. We submitted patches (listed in Links below) with two different approaches to the community, but they were not accepted, in part because we were not able to reproduce the problem with upstream-only code. Links: [1] Use a DT property: https://lore.kernel.org/lkml/20221213163506.GA2011062-robh@kernel.org/ [2] Add an API function to be called by user drivers: https://lore.kernel.org/lkml/20200217080339.GC10342@infradead.org/ Bug: 263499813 Bug: 190519428 Bug: 149544392 Change-Id: I8e948a08e79089d3fb7356eeb7f85b58784688f2 Signed-off-by: Sukadev Bhattiprolu --- drivers/android/vendor_hooks.c | 1 + drivers/iommu/iova.c | 11 ++++++++--- include/trace/hooks/iommu.h | 8 ++++++++ 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/drivers/android/vendor_hooks.c b/drivers/android/vendor_hooks.c index d41c18cc3407..4501d20feb6a 100644 --- a/drivers/android/vendor_hooks.c +++ b/drivers/android/vendor_hooks.c @@ -85,6 +85,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_send_tm_command); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ufs_check_int_errors); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_cgroup_attach); EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_iommu_setup_dma_ops); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_iommu_alloc_insert_iova); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_iommu_iovad_alloc_iova); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_iommu_iovad_free_iova); EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_iommu_iovad_init_alloc_algo); diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c index 602888ec9572..889eae11b096 100644 --- a/drivers/iommu/iova.c +++ b/drivers/iommu/iova.c @@ -11,6 +11,7 @@ #include #include #include +#include /* The anchor node sits above the top of the usable address space */ #define IOVA_ANCHOR ~0UL @@ -317,14 +318,18 @@ alloc_iova(struct iova_domain *iovad, unsigned long size, bool size_aligned) { struct iova *new_iova; - int ret; + int ret = -1; new_iova = alloc_iova_mem(); if (!new_iova) return NULL; - ret = __alloc_and_insert_iova_range(iovad, size, limit_pfn + 1, - new_iova, size_aligned); + trace_android_rvh_iommu_alloc_insert_iova(iovad, size, limit_pfn + 1, + new_iova, size_aligned, &ret); + if (ret) { + ret = __alloc_and_insert_iova_range(iovad, size, + limit_pfn + 1, new_iova, size_aligned); + } if (ret) { free_iova_mem(new_iova); diff --git a/include/trace/hooks/iommu.h b/include/trace/hooks/iommu.h index 86b12dbb6d59..ed02144539f9 100644 --- a/include/trace/hooks/iommu.h +++ b/include/trace/hooks/iommu.h @@ -14,6 +14,14 @@ DECLARE_RESTRICTED_HOOK(android_rvh_iommu_setup_dma_ops, TP_ARGS(dev, dma_base, dma_limit), 1); struct iova_domain; +struct iova; +
+DECLARE_RESTRICTED_HOOK(android_rvh_iommu_alloc_insert_iova, + TP_PROTO(struct iova_domain *iovad, unsigned long size, + unsigned long limit_pfn, struct iova *new_iova, + bool size_aligned, int *ret), + TP_ARGS(iovad, size, limit_pfn, new_iova, size_aligned, ret), + 1); DECLARE_HOOK(android_vh_iommu_iovad_alloc_iova, TP_PROTO(struct device *dev, struct iova_domain *iovad, dma_addr_t iova, size_t size), From 510e65b2c19315a537f83649a6afd8e20dd31730 Mon Sep 17 00:00:00 2001 From: "Isaac J. Manjarres" Date: Mon, 9 Jan 2023 10:30:26 -0800 Subject: [PATCH 430/457] ANDROID: GKI: Remove usage of __GENKSYMS__ in vendor hooks source On older kernel branches, the inclusion of kernel headers in the vendor hooks source file (drivers/android/vendor_hooks.c) was guarded with `#ifndef __GENKSYMS__` since the headers were added to the source file after those branches were KMI frozen. If the header inclusions were not guarded by `#ifndef __GENKSYMS__` then the existing CRC values of the symbols in the KMI would have been impacted, resulting in a KMI break. Given that this problem does not exist on android14-6.1, remove the usage of `#ifndef __GENKSYMS__` in drivers/android/vendor_hooks.c. Bug: 264895944 Change-Id: Iaf051ec4ab00a8239b8e1bc74034717e8bbdc588 Signed-off-by: Isaac J. Manjarres --- drivers/android/vendor_hooks.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/android/vendor_hooks.c b/drivers/android/vendor_hooks.c index 4501d20feb6a..19b2c4466d08 100644 --- a/drivers/android/vendor_hooks.c +++ b/drivers/android/vendor_hooks.c @@ -6,9 +6,7 @@ * Copyright 2020 Google LLC */ -#ifndef __GENKSYSMS__ #include -#endif #define CREATE_TRACE_POINTS #include From 3a49c6f70d4fbda47610e85400e8bea89c70e8c7 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 23 Mar 2021 10:54:38 +0100 Subject: [PATCH 431/457] ANDROID: kbuild: customize module linker script for fips140 module To meet FIPS requirements, fips140.ko must check its own integrity at load time. This requires that it know where its .text and .rodata sections are. To allow this, make the module linker script support defining symbols that enclose these sections. In addition, support creating an .initcalls section, so that fips140.ko can include code from what would normally be multiple modules by compiling it as "built-in" code. [ebiggers: Separated this out from the original commit "ANDROID: crypto: fips140 - perform load time integrity check" and folded in later changes to the script. See below.] 
Original commits: android12-5.10: 6be141eb36fe ("ANDROID: crypto: fips140 - perform load time integrity check") e8d56bd78b6e ("ANDROID: module: apply special LTO treatment to .text even if CFI is disabled") 109f31ac23f5 ("ANDROID: fips140: add userspace interface for evaluation testing") android14-5.15: 57be8919bf33 ("ANDROID: fips140: consolidate linker script changes into module.lds.S") d4966a820397 ("ANDROID: fips140: remove CONFIG_CRYPTO_FIPS140 option") 6da26b8750f5 ("ANDROID: fips140: require 'm' to enable CRYPTO_FIPS140_MOD") ae4ca7a09bb6 ("ANDROID: fips140: allow building without LTO") Bug: 153614920 Bug: 188620248 Change-Id: I22209ff4e6444f9115eca6909bcb653fd5d14aec Signed-off-by: Ard Biesheuvel Signed-off-by: Eric Biggers --- scripts/module.lds.S | 40 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/scripts/module.lds.S b/scripts/module.lds.S index da4bddd26171..023c4986ad75 100644 --- a/scripts/module.lds.S +++ b/scripts/module.lds.S @@ -27,7 +27,36 @@ SECTIONS { __kcfi_traps : { KEEP(*(.kcfi_traps)) } #endif -#ifdef CONFIG_LTO_CLANG +#if IS_ENABLED(CONFIG_CRYPTO_FIPS140_MOD) + /* + * The FIPS140 module incorporates copies of builtin code, which gets + * integrity checked at module load time, and registered in a way that + * ensures that the integrity checked versions supersede the builtin + * ones. These objects are compiled as builtin code, and so their init + * hooks will be exported from the binary in the same way as builtin + * initcalls are, i.e., annotated with a level that defines the order + * in which the hooks are expected to be invoked. + */ +#define INIT_CALLS_LEVEL(level) \ + KEEP(*(.initcall##level##.init*)) \ + KEEP(*(.initcall##level##s.init*)) + + .initcalls : { + *(.initcalls._start) + INIT_CALLS_LEVEL(0) + INIT_CALLS_LEVEL(1) + INIT_CALLS_LEVEL(2) + INIT_CALLS_LEVEL(3) + INIT_CALLS_LEVEL(4) + INIT_CALLS_LEVEL(5) + INIT_CALLS_LEVEL(rootfs) + INIT_CALLS_LEVEL(6) + INIT_CALLS_LEVEL(7) + *(.initcalls._end) + } +#endif + +#if defined(CONFIG_LTO_CLANG) || IS_ENABLED(CONFIG_CRYPTO_FIPS140_MOD) /* * With CONFIG_LTO_CLANG, LLD always enables -fdata-sections and * -ffunction-sections, which increases the size of the final module. @@ -44,8 +73,17 @@ SECTIONS { } .rodata : { + *(.rodata.._start) *(.rodata .rodata.[0-9a-zA-Z_]*) *(.rodata..L*) + *(.rodata.._end) + } + + .text : { + *(.text.._start) + *(.text .text.[0-9a-zA-Z_]*) + *(.text.._end) + *(.text.._fips140_unchecked) } #endif } From 1984e62b1075053a4ed3102043eb8ac71f81c5cb Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 12 Apr 2021 12:51:16 +0200 Subject: [PATCH 432/457] ANDROID: crypto: lib/sha256 - add vendor hook for sha256() routine Add a vendor hook that will allow the FIPS140 kernel module to override the implementation of the sha256() library routine. The FIPS 140 version is identical to the normal one, but its code and rodata will have been integrity checked at module load time. 
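For context, a consumer of this hook would look roughly like the sketch below. This is not part of the patch: the register_trace_android_vh_sha256() helper is the one generated by DECLARE_HOOK(), and fips140_integrity_checked_sha256() is a made-up name standing in for the module's integrity-checked implementation.

#include <trace/hooks/fips140.h>

static void fips140_sha256_hook(void *data, const u8 *in, unsigned int len,
				u8 *out, int *hook_inuse)
{
	/* Compute the digest with the integrity-checked code ... */
	fips140_integrity_checked_sha256(in, len, out);
	/* ... and tell lib/crypto/sha256.c to return early. */
	*hook_inuse = 1;
}

static int __init fips140_register_sha256_hook(void)
{
	return register_trace_android_vh_sha256(fips140_sha256_hook, NULL);
}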
Original commits: android12-5.10: 1e351b98e7c7 ("ANDROID: crypto: lib/sha256 - add vendor hook for sha256() routine") android14-5.15: 0ef21e1c1ae5 ("ANDROID: vendor_hooks: Reduce pointless modversions CRC churn") d4966a820397 ("ANDROID: fips140: remove CONFIG_CRYPTO_FIPS140 option") Bug: 153614920 Bug: 188620248 Change-Id: I8ccc4f0cc8206af39fa922134b438dacac2a614a Signed-off-by: Ard Biesheuvel Signed-off-by: Eric Biggers --- drivers/android/vendor_hooks.c | 2 ++ include/trace/hooks/fips140.h | 26 ++++++++++++++++++++++++++ lib/crypto/sha256.c | 9 +++++++++ 3 files changed, 37 insertions(+) create mode 100644 include/trace/hooks/fips140.h diff --git a/drivers/android/vendor_hooks.c b/drivers/android/vendor_hooks.c index 19b2c4466d08..f03a1ec54440 100644 --- a/drivers/android/vendor_hooks.c +++ b/drivers/android/vendor_hooks.c @@ -40,6 +40,7 @@ #include #include #include +#include /* * Export tracepoints that act as a bare tracehook (ie: have no trace event @@ -119,3 +120,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rwsem_wake); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rwsem_write_finished); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_alter_rwsem_list_add); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_alter_futex_plist_add); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_sha256); diff --git a/include/trace/hooks/fips140.h b/include/trace/hooks/fips140.h new file mode 100644 index 000000000000..100ddc14a0c8 --- /dev/null +++ b/include/trace/hooks/fips140.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM fips140 +#define TRACE_INCLUDE_PATH trace/hooks + +#if !defined(_TRACE_HOOK_FIPS140_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HOOK_FIPS140_H +#include + +/* + * This hook exists only for the benefit of the FIPS140 crypto module, which + * uses it to swap out the underlying implementation with one that is integrity + * checked as per FIPS 140 requirements. No other uses are allowed or + * supported. + */ +DECLARE_HOOK(android_vh_sha256, + TP_PROTO(const u8 *data, + unsigned int len, + u8 *out, + int *hook_inuse), + TP_ARGS(data, len, out, hook_inuse)); + +#endif /* _TRACE_HOOK_FIPS140_H */ + +/* This part must be outside protection */ +#include diff --git a/lib/crypto/sha256.c b/lib/crypto/sha256.c index 72a4b0b1df28..3f0475055059 100644 --- a/lib/crypto/sha256.c +++ b/lib/crypto/sha256.c @@ -17,6 +17,7 @@ #include #include #include +#include static const u32 SHA256_K[] = { 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, @@ -200,6 +201,14 @@ void sha256(const u8 *data, unsigned int len, u8 *out) { struct sha256_state sctx; +#ifndef __DISABLE_EXPORTS + int hook_inuse = 0; + + trace_android_vh_sha256(data, len, out, &hook_inuse); + if (hook_inuse) + return; +#endif + sha256_init(&sctx); sha256_update(&sctx, data, len); sha256_final(&sctx, out); From 1c0ab9432e0a96d7f4430d388e376608db6d30b5 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 12 Apr 2021 13:05:54 +0200 Subject: [PATCH 433/457] ANDROID: crypto: lib/aes - add vendor hooks for AES library routines Add vendor hooks that will allow the FIPS140 kernel module to override the implementations of the AES library routines. The FIPS 140 versions are identical to the normal ones, but their code and rodata will have been integrity checked at module load time. 
Original commits: android12-5.10: 9c556792b713 ("ANDROID: crypto: lib/aes - add vendor hooks for AES library routines") android14-5.15: d4966a820397 ("ANDROID: fips140: remove CONFIG_CRYPTO_FIPS140 option") Bug: 153614920 Bug: 188620248 Change-Id: I5711fc42eced903565fd3c8d41ca7cdd82641148 Signed-off-by: Ard Biesheuvel Signed-off-by: Eric Biggers --- drivers/android/vendor_hooks.c | 3 +++ include/trace/hooks/fips140.h | 28 ++++++++++++++++++++++++++-- lib/crypto/aes.c | 22 ++++++++++++++++++++++ 3 files changed, 51 insertions(+), 2 deletions(-) diff --git a/drivers/android/vendor_hooks.c b/drivers/android/vendor_hooks.c index f03a1ec54440..f900944fe250 100644 --- a/drivers/android/vendor_hooks.c +++ b/drivers/android/vendor_hooks.c @@ -121,3 +121,6 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rwsem_write_finished); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_alter_rwsem_list_add); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_alter_futex_plist_add); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_sha256); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_aes_expandkey); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_aes_encrypt); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_aes_decrypt); diff --git a/include/trace/hooks/fips140.h b/include/trace/hooks/fips140.h index 100ddc14a0c8..fd4a42c013c7 100644 --- a/include/trace/hooks/fips140.h +++ b/include/trace/hooks/fips140.h @@ -7,12 +7,15 @@ #define _TRACE_HOOK_FIPS140_H #include +struct crypto_aes_ctx; + /* - * This hook exists only for the benefit of the FIPS140 crypto module, which - * uses it to swap out the underlying implementation with one that is integrity + * These hooks exist only for the benefit of the FIPS140 crypto module, which + * uses them to swap out the underlying implementation with one that is integrity * checked as per FIPS 140 requirements. No other uses are allowed or * supported. 
*/ + DECLARE_HOOK(android_vh_sha256, TP_PROTO(const u8 *data, unsigned int len, @@ -20,6 +23,27 @@ DECLARE_HOOK(android_vh_sha256, int *hook_inuse), TP_ARGS(data, len, out, hook_inuse)); +DECLARE_HOOK(android_vh_aes_expandkey, + TP_PROTO(struct crypto_aes_ctx *ctx, + const u8 *in_key, + unsigned int key_len, + int *err), + TP_ARGS(ctx, in_key, key_len, err)); + +DECLARE_HOOK(android_vh_aes_encrypt, + TP_PROTO(const struct crypto_aes_ctx *ctx, + u8 *out, + const u8 *in, + int *hook_inuse), + TP_ARGS(ctx, out, in, hook_inuse)); + +DECLARE_HOOK(android_vh_aes_decrypt, + TP_PROTO(const struct crypto_aes_ctx *ctx, + u8 *out, + const u8 *in, + int *hook_inuse), + TP_ARGS(ctx, out, in, hook_inuse)); + #endif /* _TRACE_HOOK_FIPS140_H */ /* This part must be outside protection */ diff --git a/lib/crypto/aes.c b/lib/crypto/aes.c index 827fe89922ff..5fc78e50d693 100644 --- a/lib/crypto/aes.c +++ b/lib/crypto/aes.c @@ -7,6 +7,7 @@ #include #include #include +#include /* * Emit the sbox as volatile const to prevent the compiler from doing @@ -189,6 +190,13 @@ int aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key, u32 rc, i, j; int err; +#ifndef __DISABLE_EXPORTS + err = -(MAX_ERRNO + 1); + trace_android_vh_aes_expandkey(ctx, in_key, key_len, &err); + if (err != -(MAX_ERRNO + 1)) + return err; +#endif + err = aes_check_keylen(key_len); if (err) return err; @@ -261,6 +269,13 @@ void aes_encrypt(const struct crypto_aes_ctx *ctx, u8 *out, const u8 *in) int rounds = 6 + ctx->key_length / 4; u32 st0[4], st1[4]; int round; +#ifndef __DISABLE_EXPORTS + int hook_inuse = 0; + + trace_android_vh_aes_encrypt(ctx, out, in, &hook_inuse); + if (hook_inuse) + return; +#endif st0[0] = ctx->key_enc[0] ^ get_unaligned_le32(in); st0[1] = ctx->key_enc[1] ^ get_unaligned_le32(in + 4); @@ -312,6 +327,13 @@ void aes_decrypt(const struct crypto_aes_ctx *ctx, u8 *out, const u8 *in) int rounds = 6 + ctx->key_length / 4; u32 st0[4], st1[4]; int round; +#ifndef __DISABLE_EXPORTS + int hook_inuse = 0; + + trace_android_vh_aes_decrypt(ctx, out, in, &hook_inuse); + if (hook_inuse) + return; +#endif st0[0] = ctx->key_dec[0] ^ get_unaligned_le32(in); st0[1] = ctx->key_dec[1] ^ get_unaligned_le32(in + 4); From 9871f4d6af79c37d3352359b9f909476069871bf Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 8 Jul 2021 14:46:46 -0700 Subject: [PATCH 434/457] ANDROID: crypto: define fips_enabled to 1 in fips140.ko In fips140.ko, enable the behavior that the upstream fips_enabled flag controls, such as the XTS weak key check which apparently is required. Note that some of this behavior, such as the DRBG continuity check, is allegedly not required. But to ensure we don't miss anything that was already handled upstream, it seems best to define fips_enabled to 1. We can still disable anything that turns out to be problematic. Bug: 153614920 Bug: 188620248 Change-Id: Idcded9e69e7d7cdf7f2937009af209857b0c08e2 Signed-off-by: Eric Biggers --- include/linux/fips.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/include/linux/fips.h b/include/linux/fips.h index c6961e932fef..83f5d6f7c62d 100644 --- a/include/linux/fips.h +++ b/include/linux/fips.h @@ -2,7 +2,15 @@ #ifndef _FIPS_H #define _FIPS_H -#ifdef CONFIG_CRYPTO_FIPS +#ifdef BUILD_FIPS140_KO +/* + * In fips140.ko, enable the behavior that the upstream fips_enabled flag + * controls, such as the XTS weak key check. 
+ */ +#define fips_enabled 1 +#define CONFIG_CRYPTO_FIPS 1 + +#elif defined(CONFIG_CRYPTO_FIPS) extern int fips_enabled; extern struct atomic_notifier_head fips_fail_notif_chain; From ff4aa3372440a94311c59ada261367caba7bd52c Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 8 Jul 2021 14:46:44 -0700 Subject: [PATCH 435/457] ANDROID: jump_label: disable jump labels in fips140.ko The fips140 module doesn't support jump labels, as they would invalidate the hash of the .text section. So when building the module, switch to the generic implementation that does not rely on arch-specific code patching support. This fixes a failure in check_fips140_module_hmac() caused by the module containing a call to crypto_alg_put(), which is an inline function that calls refcount_dec_and_test(), which on arm64 uses a jump label. Note that the optimized definition of struct static_key is retained, to ensure ABI compatibility across the FIPS140 module boundary. To ensure that static keys and their associated jump labels remain in a consistent state, the fips140 module will not be able to manipulate static keys, but only to check their state. Bug: 153614920 Bug: 188620248 Change-Id: Ie834bbf2eed5d09bfae7f387b711a934bedf390d Signed-off-by: Eric Biggers [ardb: disable jump labels in generic code not in arm64 arch code] Signed-off-by: Ard Biesheuvel --- include/linux/jump_label.h | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index 570831ca9951..27c9ce32d9fc 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -108,7 +108,7 @@ struct static_key { #endif /* __ASSEMBLY__ */ -#ifdef CONFIG_JUMP_LABEL +#if defined(CONFIG_JUMP_LABEL) && !defined(BUILD_FIPS140_KO) #include #ifndef __ASSEMBLY__ @@ -195,7 +195,30 @@ enum jump_label_type { struct module; -#ifdef CONFIG_JUMP_LABEL +#ifdef BUILD_FIPS140_KO + +#include + +static inline int static_key_count(struct static_key *key) +{ + return arch_atomic_read(&key->enabled); +} + +static __always_inline bool static_key_false(struct static_key *key) +{ + if (unlikely(static_key_count(key) > 0)) + return true; + return false; +} + +static __always_inline bool static_key_true(struct static_key *key) +{ + if (likely(static_key_count(key) > 0)) + return true; + return false; +} + +#elif defined(CONFIG_JUMP_LABEL) #define JUMP_TYPE_FALSE 0UL #define JUMP_TYPE_TRUE 1UL @@ -408,7 +431,7 @@ extern bool ____wrong_branch_error(void); static_key_count((struct static_key *)x) > 0; \ }) -#ifdef CONFIG_JUMP_LABEL +#if defined(CONFIG_JUMP_LABEL) && !defined(BUILD_FIPS140_KO) /* * Combine the right initial value (type) with the right branch order From 77e257420d1ff78f2afa38db38a80badc1519355 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 3 Jun 2021 15:18:35 +0000 Subject: [PATCH 436/457] ANDROID: arm64: only permit certain alternatives in the FIPS140 module The FIPS140 crypto module takes a HMAC digest of its own .text and .rodata section in its module_init() hook. This digest is compared to a digest taken at build time, which means that we need to take some extra care to ensure that the build time and runtime versions line up. One thing we cannot tolerate in this case is alternatives patching. 
In the general case, we cannot simply ignore alternatives, but fortunately, there is only a small subset that actually gets instantiated in the FIPS140 module, and all of these can be ignored if we are willing to accept that the FIPS140 module does not support VHE hardware, and does not work when running with pseudo-NMI support enabled. None of this is important for the use case targeted by the FIPS140 module, so this is something we should be able to live with. Bug: 153614920 Bug: 188620248 Change-Id: Ie6666e01d5524a3c33aa451609bab2f29b612f8c Signed-off-by: Ard Biesheuvel Signed-off-by: Eric Biggers --- arch/arm64/include/asm/alternative-macros.h | 47 +++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/arch/arm64/include/asm/alternative-macros.h b/arch/arm64/include/asm/alternative-macros.h index 3622e9f4fb44..c7842fd06ad6 100644 --- a/arch/arm64/include/asm/alternative-macros.h +++ b/arch/arm64/include/asm/alternative-macros.h @@ -19,6 +19,7 @@ #error "cpucaps have overflown ARM64_CB_BIT" #endif +#ifndef BUILD_FIPS140_KO #ifndef __ASSEMBLY__ #include @@ -261,4 +262,50 @@ l_yes: #endif /* __ASSEMBLY__ */ +#else + +/* + * The FIPS140 module does not support alternatives patching, as this + * invalidates the HMAC digest of the .text section. However, some alternatives + * are known to be irrelevant so we can tolerate them in the FIPS140 module, as + * they will never be applied in the first place in the use cases that the + * FIPS140 module targets (Android running on a production phone). Any other + * uses of alternatives should be avoided, as it is not safe in the general + * case to simply use the default sequence in one place (the fips module) and + * the alternative sequence everywhere else. + * + * Below is an allowlist of features that we can ignore, by simply taking the + * safe default instruction sequence. Note that this implies that the FIPS140 + * module is not compatible with VHE, or with pseudo-NMI support. + */ + +#define __ALT_ARM64_HAS_LDAPR 0, +#define __ALT_ARM64_HAS_VIRT_HOST_EXTN 0, +#define __ALT_ARM64_HAS_IRQ_PRIO_MASKING 0, + +#define ALTERNATIVE(oldinstr, newinstr, feature, ...) \ + _ALTERNATIVE(oldinstr, __ALT_ ## feature, #feature) + +#define _ALTERNATIVE(oldinstr, feature, feature_str) \ + __take_second_arg(feature oldinstr, \ + ".err Feature " feature_str " not supported in fips140 module") + +#ifndef __ASSEMBLY__ + +#include + +static __always_inline bool +alternative_has_feature_likely(unsigned long feature) +{ + return feature == ARM64_HAS_LDAPR || + feature == ARM64_HAS_VIRT_HOST_EXTN || + feature == ARM64_HAS_IRQ_PRIO_MASKING; +} + +#define alternative_has_feature_unlikely alternative_has_feature_likely + +#endif /* !__ASSEMBLY__ */ + +#endif /* BUILD_FIPS140_KO */ + #endif /* __ASM_ALTERNATIVE_MACROS_H */ From 4dc1a5b9559230994affffca7c5ce45ea11406ad Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 12 Jul 2021 07:49:03 +0000 Subject: [PATCH 437/457] ANDROID: arm64: disable LSE when building the FIPS140 module The arm64 LSE atomics implementation uses both alternatives patching and jump label patching, both of which need to be selectively disabled when building the FIPS140 module, or the hashing of the .text section no longer works. We already disable jump labels in generic code, but this uncovers a rather nasty circular include dependency, as the jump label fallback code uses atomics, which are provided by the LSE code if enabled. 
So let's disable LSE as well when building the FIPS140 module: this does not have any impact on the code, as no code patching goes on in this module anyway, but it avoids #include hell. Bug: 153614920 Bug: 188620248 Change-Id: Ia3d823fa3a309777f0c955d619ae8b139dc74061 Signed-off-by: Ard Biesheuvel Signed-off-by: Eric Biggers --- arch/arm64/include/asm/lse.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/lse.h b/arch/arm64/include/asm/lse.h index c503db8e73b0..28751299fa4b 100644 --- a/arch/arm64/include/asm/lse.h +++ b/arch/arm64/include/asm/lse.h @@ -4,7 +4,7 @@ #include -#ifdef CONFIG_ARM64_LSE_ATOMICS +#if defined(CONFIG_ARM64_LSE_ATOMICS) && !defined(BUILD_FIPS140_KO) #define __LSE_PREAMBLE ".arch_extension lse\n" From 2bacdab575b7f16d7b82f6547c17c72ab4781e6b Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 8 Apr 2021 12:08:18 +0200 Subject: [PATCH 438/457] ANDROID: arm64: simd: omit capability check in may_use_simd() may_use_simd() should only be called by code that may use FP/SIMD when it is available, and so checking whether the system supports FP/SIMD in the first place should be redundant - the caller in question (e.g., a SIMD crypto algorithm) should never be initialized in the first place. Checking the system capability involves jump labels and therefore code patching, which interferes with our ability to perform an integrity check on some of the crypto code. So let's get rid of the capability check altogether. Bug: 153614920 Bug: 188620248 Change-Id: Ia8df624f4648cc980a12a44eeb82e8f186d5f961 Signed-off-by: Ard Biesheuvel Signed-off-by: Eric Biggers --- arch/arm64/include/asm/simd.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/simd.h b/arch/arm64/include/asm/simd.h index 6a75d7ecdcaa..543fa59e72e0 100644 --- a/arch/arm64/include/asm/simd.h +++ b/arch/arm64/include/asm/simd.h @@ -35,9 +35,7 @@ static __must_check inline bool may_use_simd(void) * migrated, and if it's clear we cannot be migrated to a CPU * where it is set. */ - return !WARN_ON(!system_capabilities_finalized()) && - system_supports_fpsimd() && - !in_hardirq() && !irqs_disabled() && !in_nmi() && + return !in_hardirq() && !irqs_disabled() && !in_nmi() && !this_cpu_read(fpsimd_context_busy); } From c6d5a767217ce93403528e99f1a1eb401a26eeb3 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 23 Mar 2021 10:54:38 +0100 Subject: [PATCH 439/457] ANDROID: fips140: add kernel crypto module To meet FIPS 140 requirements, add support for building a kernel module "fips140.ko" that contains various cryptographic algorithms built from existing kernel source files. At load time, the module checks its own integrity and self-tests its algorithms, then registers the algorithms with the crypto API to supersede the original algorithms provided by the kernel itself. [ebiggers: this commit originated from "ANDROID: crypto: fips140 - perform load time integrity check", but I've folded many later commits into it to make forward porting easier. 
See below] Original commits: android12-5.10: 6be141eb36fe ("ANDROID: crypto: fips140 - perform load time integrity check") 868be244bbed ("ANDROID: inject correct HMAC digest into fips140.ko at build time") 091338cb398e ("ANDROID: fips140: add missing static keyword to fips140_init()") c799c6644b52 ("ANDROID: fips140: adjust some log messages") 92de53472e68 ("ANDROID: fips140: log already-live algorithms") 0af06624eadc ("ANDROID: fips140: check for errors from initcalls") 634445a640a4 ("ANDROID: fips140: fix deadlock in unregister_existing_fips140_algos()") e886dd4c339e ("ANDROID: fips140: unregister existing DRBG algorithms") b7397e89db29 ("ANDROID: fips140: add power-up cryptographic self-tests") 50661975be74 ("ANDROID: fips140: add/update module help text") b397a0387cb2 ("ANDROID: fips140: test all implementations") 17ccefe14021 ("ANDROID: fips140: use full 16-byte IV") 1be58af0776a ("ANDROID: fips140: remove non-prediction-resistant DRBG test") 2b5843ae2d90 ("ANDROID: fips140: add AES-CBC-CTS") 2ee56aad318c ("ANDROID: fips140: add AES-CMAC") 960ebb2b565b ("ANDROID: fips140: add jitterentropy to fips140 module") e5b14396f9d2 ("ANDROID: fips140: take into account AES-GCM not being approvable") 52b70d491bd4 ("ANDROID: fips140: use FIPS140_CFLAGS when compiling fips140-selftests.c") 6b995f5a5403 ("ANDROID: fips140: preserve RELA sections without relying on the module loader") e45108ecff64 ("ANDROID: fips140: block crypto operations until tests complete") ecf9341134d1 ("ANDROID: fips140: remove in-place updating of live algorithms") 482b0323cf29 ("ANDROID: fips140: zeroize temporary values from integrity check") 64d769e53f20 ("ANDROID: fips140: add service indicators") 8d7f609cdaa4 ("ANDROID: fips140: add name and version, and a function to retrieve them") 6b7c37f6c449 ("ANDROID: fips140: use UTS_RELEASE as FIPS version") 903e97a0ca6d ("ANDROID: fips140: refactor evaluation testing support") 97fb2104fe22 ("ANDROID: fips140: add support for injecting integrity error") 109f31ac23f5 ("ANDROID: fips140: add userspace interface for evaluation testing") android14-5.15: 84572a0c7981 ("ANDROID: fips140: split dump-section+add-section into 2 ops") b0f8873811d4 ("ANDROID: kleaf: convert fips140 to kleaf") 2535deae8069 ("ANDROID: GKI: Source GKI_BUILD_CONFIG_FRAGMENT after setting all variables") 685a2ade28bb ("ANDROID: fips140: add crypto_memneq() back to the module") 320dfca58a3d ("ANDROID: fips140: fix in-tree builds") d4966a820397 ("ANDROID: fips140: remove CONFIG_CRYPTO_FIPS140 option") 6da26b8750f5 ("ANDROID: fips140: require 'm' to enable CRYPTO_FIPS140_MOD") bfcfcce3803b ("ANDROID: fips140: unapply ABS32 relocations generated by KCFI") 63f46b45dda2 ("ANDROID: fips140: eliminate crypto-fips.a build step") ae4ca7a09bb6 ("ANDROID: fips140: allow building without LTO") Bug: 153614920 Bug: 188620248 Test: tested that the module builds and can be loaded on raven. 
Change-Id: I3fde49dbc3d16b149b072a27ba5b4c6219015c94 Signed-off-by: Ard Biesheuvel Signed-off-by: Eric Biggers --- BUILD.bazel | 17 + arch/arm64/Makefile.postlink | 49 ++ arch/arm64/configs/fips140_gki.fragment | 2 + arch/arm64/crypto/Kbuild.fips140 | 52 ++ build.config.gki.aarch64.fips140 | 27 + crypto/Kconfig | 25 + crypto/Makefile | 53 ++ crypto/fips140-alg-registration.c | 388 +++++++++ crypto/fips140-defs.h | 58 ++ crypto/fips140-eval-testing-uapi.h | 30 + crypto/fips140-eval-testing.c | 129 +++ crypto/fips140-generated-testvecs.h | 68 ++ crypto/fips140-module.c | 611 +++++++++++++++ crypto/fips140-module.h | 50 ++ crypto/fips140-refs.S | 34 + crypto/fips140-selftests.c | 998 ++++++++++++++++++++++++ crypto/fips140_gen_hmac.c | 194 +++++ tools/crypto/gen_fips140_testvecs.py | 125 +++ 18 files changed, 2910 insertions(+) create mode 100644 arch/arm64/Makefile.postlink create mode 100644 arch/arm64/configs/fips140_gki.fragment create mode 100644 arch/arm64/crypto/Kbuild.fips140 create mode 100644 build.config.gki.aarch64.fips140 create mode 100644 crypto/fips140-alg-registration.c create mode 100644 crypto/fips140-defs.h create mode 100644 crypto/fips140-eval-testing-uapi.h create mode 100644 crypto/fips140-eval-testing.c create mode 100644 crypto/fips140-generated-testvecs.h create mode 100644 crypto/fips140-module.c create mode 100644 crypto/fips140-module.h create mode 100644 crypto/fips140-refs.S create mode 100644 crypto/fips140-selftests.c create mode 100644 crypto/fips140_gen_hmac.c create mode 100755 tools/crypto/gen_fips140_testvecs.py diff --git a/BUILD.bazel b/BUILD.bazel index c89e71f59c41..723c46b17611 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -317,6 +317,23 @@ copy_to_dist_dir( flat = True, ) +kernel_build( + name = "fips140", + outs = [], + base_kernel = ":kernel_aarch64", + build_config = "build.config.gki.aarch64.fips140", + module_outs = ["crypto/fips140.ko"], +) + +copy_to_dist_dir( + name = "fips140_dist", + data = [ + ":fips140", + ], + dist_dir = "out/fips140/dist", + flat = True, +) + # allmodconfig build tests. # These are build tests only, so: # - outs are intentionally set to empty to not copy anything to DIST_DIR diff --git a/arch/arm64/Makefile.postlink b/arch/arm64/Makefile.postlink new file mode 100644 index 000000000000..8cf297fb7dd9 --- /dev/null +++ b/arch/arm64/Makefile.postlink @@ -0,0 +1,49 @@ +# SPDX-License-Identifier: GPL-2.0 + +# +# This file is included by the generic Kbuild makefile to permit the +# architecture to perform postlink actions on vmlinux and any .ko module file. +# In this case, we only need it for fips140.ko, which needs some postprocessing +# for the integrity check mandated by FIPS. This involves making copies of the +# relocation sections so that the module will have access to them at +# initialization time, and calculating and injecting a HMAC digest into the +# module. All other targets are NOPs. 
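+#
+# Roughly, the cmd_gen_hmac rule below first uses $(OBJCOPY) to dump the
+# module's .rela.text and .rela.rodata sections to files and re-add them as
+# .init.rela.text / .init.rela.rodata, and then runs the fips140_gen_hmac
+# host tool to compute the digest and inject it into the module.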
+# + +PHONY := __archpost +__archpost: + +-include include/config/auto.conf +include scripts/Kbuild.include + +CMD_FIPS140_GEN_HMAC = crypto/fips140_gen_hmac +quiet_cmd_gen_hmac = HMAC $@ + cmd_gen_hmac = $(OBJCOPY) $@ \ + --dump-section=$(shell $(READELF) -SW $@|grep -Eo '\.rela\.text\S*')=$@.rela.text \ + --dump-section=$(shell $(READELF) -SW $@|grep -Eo '\.rela\.rodata\S*')=$@.rela.rodata && \ + $(OBJCOPY) $@ \ + --add-section=.init.rela.text=$@.rela.text \ + --add-section=.init.rela.rodata=$@.rela.rodata \ + --set-section-flags=.init.rela.text=alloc,readonly \ + --set-section-flags=.init.rela.rodata=alloc,readonly && \ + $(CMD_FIPS140_GEN_HMAC) $@ + +# `@true` prevents complaints when there is nothing to be done + +vmlinux: FORCE + @true + +$(objtree)/crypto/fips140.ko: FORCE + $(call cmd,gen_hmac) + +%.ko: FORCE + @true + +clean: + rm -f $(objtree)/crypto/fips140.ko.rela.* + +PHONY += FORCE clean + +FORCE: + +.PHONY: $(PHONY) diff --git a/arch/arm64/configs/fips140_gki.fragment b/arch/arm64/configs/fips140_gki.fragment new file mode 100644 index 000000000000..aa8444d34e56 --- /dev/null +++ b/arch/arm64/configs/fips140_gki.fragment @@ -0,0 +1,2 @@ +CONFIG_CRYPTO_FIPS140_MOD=m +# CONFIG_MODULE_SIG_ALL is not set diff --git a/arch/arm64/crypto/Kbuild.fips140 b/arch/arm64/crypto/Kbuild.fips140 new file mode 100644 index 000000000000..9aa0af602130 --- /dev/null +++ b/arch/arm64/crypto/Kbuild.fips140 @@ -0,0 +1,52 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# Create a separate FIPS archive that duplicates the modules that are relevant +# for FIPS 140 certification as builtin objects +# + +sha1-ce-y := sha1-ce-glue.o sha1-ce-core.o +sha2-ce-y := sha2-ce-glue.o sha2-ce-core.o +sha512-ce-y := sha512-ce-glue.o sha512-ce-core.o +ghash-ce-y := ghash-ce-glue.o ghash-ce-core.o +aes-ce-cipher-y := aes-ce-core.o aes-ce-glue.o +aes-ce-blk-y := aes-glue-ce.o aes-ce.o +aes-neon-blk-y := aes-glue-neon.o aes-neon.o +sha256-arm64-y := sha256-glue.o sha256-core.o +sha512-arm64-y := sha512-glue.o sha512-core.o +aes-arm64-y := aes-cipher-core.o aes-cipher-glue.o +aes-neon-bs-y := aes-neonbs-core.o aes-neonbs-glue.o + +crypto-arm64-fips-src := $(srctree)/arch/arm64/crypto/ +crypto-arm64-fips-modules := sha1-ce.o sha2-ce.o sha512-ce.o ghash-ce.o \ + aes-ce-cipher.o aes-ce-blk.o aes-neon-blk.o \ + sha256-arm64.o sha512-arm64.o aes-arm64.o \ + aes-neon-bs.o + +crypto-fips-objs += $(foreach o,$(crypto-arm64-fips-modules),$($(o:.o=-y):.o=-fips-arch.o)) + +CFLAGS_aes-glue-ce-fips-arch.o := -DUSE_V8_CRYPTO_EXTENSIONS + +$(obj)/aes-glue-%-fips-arch.o: KBUILD_CFLAGS += $(FIPS140_CFLAGS) +$(obj)/aes-glue-%-fips-arch.o: $(crypto-arm64-fips-src)/aes-glue.c FORCE + $(call if_changed_rule,cc_o_c) + +$(obj)/%-fips-arch.o: KBUILD_CFLAGS += $(FIPS140_CFLAGS) +$(obj)/%-fips-arch.o: $(crypto-arm64-fips-src)/%.c FORCE + $(call if_changed_rule,cc_o_c) + +$(obj)/%-fips-arch.o: $(crypto-arm64-fips-src)/%.S FORCE + $(call if_changed_rule,as_o_S) + +quiet_cmd_perlasm = PERLASM $@ + cmd_perlasm = $(PERL) $(<) void $(@) + +$(obj)/%-core.S: $(crypto-arm64-fips-src)/%-armv8.pl + $(call cmd,perlasm) + +$(obj)/sha256-core.S: $(crypto-arm64-fips-src)/sha512-armv8.pl + $(call cmd,perlasm) + +clean-files += sha256-core.S sha512-core.S + +$(obj)/%-fips-arch.o: $(obj)/%.S FORCE + $(call if_changed_rule,as_o_S) diff --git a/build.config.gki.aarch64.fips140 b/build.config.gki.aarch64.fips140 new file mode 100644 index 000000000000..ec493efc20cf --- /dev/null +++ b/build.config.gki.aarch64.fips140 @@ -0,0 +1,27 @@ +. 
${ROOT_DIR}/${KERNEL_DIR}/build.config.common +. ${ROOT_DIR}/${KERNEL_DIR}/build.config.aarch64 +. ${ROOT_DIR}/${KERNEL_DIR}/build.config.gki + +FILES=" +crypto/fips140.ko +" + +MAKE_GOALS=" +modules +" + +if [ "${LTO}" = "none" ]; then + echo "The FIPS140 module needs LTO to be enabled." + exit 1 +fi + +MODULES_ORDER=android/gki_aarch64_fips140_modules +KERNEL_DIR=common + +DEFCONFIG=fips140_gki_defconfig +PRE_DEFCONFIG_CMDS="mkdir -p \${OUT_DIR}/arch/arm64/configs/ && KCONFIG_CONFIG=\${OUT_DIR}/arch/arm64/configs/${DEFCONFIG} ${ROOT_DIR}/${KERNEL_DIR}/scripts/kconfig/merge_config.sh -m -r ${ROOT_DIR}/${KERNEL_DIR}/arch/arm64/configs/gki_defconfig ${ROOT_DIR}/${KERNEL_DIR}/arch/arm64/configs/fips140_gki.fragment" +POST_DEFCONFIG_CMDS="" + +if [ -n "${GKI_BUILD_CONFIG_FRAGMENT}" ]; then +source ${GKI_BUILD_CONFIG_FRAGMENT} +fi diff --git a/crypto/Kconfig b/crypto/Kconfig index d779667671b2..65f71ac48961 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -54,6 +54,31 @@ config CRYPTO_FIPS_VERSION This option provides the ability to override the FIPS Module Version. By default the KERNELRELEASE value is used. +config CRYPTO_FIPS140_MOD + tristate "Enable FIPS 140 cryptographic module" + depends on ARM64 && ARM64_MODULE_PLTS + depends on m + help + This option enables building a loadable module fips140.ko, which + contains various crypto algorithms that are also built into vmlinux. + At load time, this module overrides the built-in implementations of + these algorithms with its implementations. It also runs self-tests on + these algorithms and verifies the integrity of its code and data. If + either of these steps fails, the kernel will panic. + + This module is intended to be loaded at early boot time in order to + meet FIPS 140 and NIAP FPT_TST_EXT.1 requirements. It shouldn't be + used if you don't need to meet these requirements. + +config CRYPTO_FIPS140_MOD_EVAL_TESTING + bool "Enable evaluation testing features in FIPS 140 module" + depends on CRYPTO_FIPS140_MOD + help + This option adds some features to the FIPS 140 module which are needed + for lab evaluation testing of the module, e.g. support for injecting + errors and support for a userspace interface to some of the module's + services. This option should not be enabled in production builds. + config CRYPTO_ALGAPI tristate select CRYPTO_ALGAPI2 diff --git a/crypto/Makefile b/crypto/Makefile index 303b21c43df0..1fa1ea16e0c5 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -212,3 +212,56 @@ obj-$(CONFIG_CRYPTO_SIMD) += crypto_simd.o # Key derivation function # obj-$(CONFIG_CRYPTO_KDF800108_CTR) += kdf_sp800108.o + +ifneq ($(CONFIG_CRYPTO_FIPS140_MOD),) + +FIPS140_CFLAGS := -DBUILD_FIPS140_KO -include $(srctree)/crypto/fips140-defs.h + +CFLAGS_jitterentropy-fips.o := -O0 +KASAN_SANITIZE_jitterentropy-fips.o = n +UBSAN_SANITIZE_jitterentropy-fips.o = n + +# Compile an extra copy of various crypto algorithms into the fips140 module. +# +# Note: the module will still work if some files are removed from here. +# However, it may affect FIPS certifiability. Don't remove files from here +# without considering impact on FIPS certifiability. 
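+#
+# For example, "sha256_generic.o" in the list below becomes
+# "sha256_generic-fips.o", built from crypto/sha256_generic.c with
+# $(FIPS140_CFLAGS) added via the pattern rules further down.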
+ +crypto-fips-objs := drbg.o ecb.o cbc.o ctr.o cts.o gcm.o xts.o hmac.o cmac.o \ + gf128mul.o aes_generic.o lib-crypto-aes.o \ + jitterentropy.o jitterentropy-kcapi.o \ + sha1_generic.o sha256_generic.o sha512_generic.o \ + lib-crypto-memneq.o lib-crypto-sha1.o lib-crypto-sha256.o \ + lib-crypto-utils.o +crypto-fips-objs := $(foreach o,$(crypto-fips-objs),$(o:.o=-fips.o)) + +# get the arch to add its objects to $(crypto-fips-objs) +include $(srctree)/arch/$(ARCH)/crypto/Kbuild.fips140 + +$(obj)/%-fips.o: KBUILD_CFLAGS += $(FIPS140_CFLAGS) +$(obj)/%-fips.o: $(src)/%.c FORCE + $(call if_changed_rule,cc_o_c) +$(obj)/lib-%-fips.o: $(srctree)/lib/%.c FORCE + $(call if_changed_rule,cc_o_c) +$(obj)/lib-crypto-%-fips.o: $(srctree)/lib/crypto/%.c FORCE + $(call if_changed_rule,cc_o_c) + +fips140-objs := \ + fips140-alg-registration.o \ + fips140-module.o \ + fips140-refs.o \ + fips140-selftests.o \ + $(crypto-fips-objs) +fips140-$(CONFIG_CRYPTO_FIPS140_MOD_EVAL_TESTING) += \ + fips140-eval-testing.o +obj-m += fips140.o + +CFLAGS_fips140-alg-registration.o += $(FIPS140_CFLAGS) +CFLAGS_fips140-module.o += $(FIPS140_CFLAGS) +CFLAGS_fips140-selftests.o += $(FIPS140_CFLAGS) +CFLAGS_fips140-eval-testing.o += $(FIPS140_CFLAGS) + +hostprogs-always-y := fips140_gen_hmac +HOSTLDLIBS_fips140_gen_hmac := -lcrypto -lelf + +endif diff --git a/crypto/fips140-alg-registration.c b/crypto/fips140-alg-registration.c new file mode 100644 index 000000000000..03757f88890b --- /dev/null +++ b/crypto/fips140-alg-registration.c @@ -0,0 +1,388 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Block crypto operations until tests complete + * + * Copyright 2021 Google LLC + * + * This file defines the fips140_crypto_register_*() functions, to which all + * calls to crypto_register_*() in the module are redirected. These functions + * override the tfm initialization function of each algorithm to insert a wait + * for the module having completed its self-tests and integrity check. + * + * The exact field that we override depends on the algorithm type. For + * algorithm types that have a strongly-typed initialization function pointer + * (e.g. skcipher), we must override that, since cra_init isn't guaranteed to be + * called for those despite the field being present in the base struct. For the + * other algorithm types (e.g. "cipher") we must override cra_init. + * + * All of this applies to both normal algorithms and template instances. + * + * The purpose of all of this is to meet a FIPS requirement where the module + * must not produce any output from cryptographic algorithms until it completes + * its tests. Technically this is impossible, but this solution meets the + * intent of the requirement, assuming the user makes a supported sequence of + * API calls. Note that we can't simply run the tests before registering the + * algorithms, as the algorithms must be registered in order to run the tests. + * + * It would be much easier to handle this in the kernel's crypto API framework. + * Unfortunately, that was deemed insufficient because the module itself is + * required to do the enforcement. What is *actually* required is still very + * vague, but the approach implemented here should meet the requirement. + */ + +/* + * This file is the one place in fips140.ko that needs to call the kernel's real + * algorithm registration functions, so #undefine all the macros from + * fips140-defs.h so that the "fips140_" prefix doesn't automatically get added. 
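+ *
+ * (For example, crypto_register_shash() in the rest of the module is rewritten
+ * by fips140-defs.h into fips140_crypto_register_shash(), which is defined
+ * below; this file needs to reach the kernel's real registration function.)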
+ */ +#undef aead_register_instance +#undef ahash_register_instance +#undef crypto_register_aead +#undef crypto_register_aeads +#undef crypto_register_ahash +#undef crypto_register_ahashes +#undef crypto_register_alg +#undef crypto_register_algs +#undef crypto_register_rng +#undef crypto_register_rngs +#undef crypto_register_shash +#undef crypto_register_shashes +#undef crypto_register_skcipher +#undef crypto_register_skciphers +#undef shash_register_instance +#undef skcipher_register_instance + +#include +#include +#include +#include +#include +#include + +#include "fips140-module.h" + +/* Indicates whether the self-tests and integrity check have completed */ +DECLARE_COMPLETION(fips140_tests_done); + +/* The thread running the self-tests and integrity check */ +struct task_struct *fips140_init_thread; + +/* + * Map from crypto_alg to original initialization function (possibly NULL) + * + * Note: unregistering an algorithm will leak its map entry, as we don't bother + * to remove it. This should be fine since fips140.ko can't be unloaded. The + * proper solution would be to store the original function pointer in a new + * field in 'struct crypto_alg', but that would require kernel support. + */ +static DEFINE_XARRAY(fips140_init_func_map); + +static bool fips140_ready(void) +{ + return completion_done(&fips140_tests_done); +} + +/* + * Wait until crypto operations are allowed to proceed. Return true if the + * tests are done, or false if the caller is the thread running the tests so it + * is allowed to proceed anyway. + */ +static bool fips140_wait_until_ready(struct crypto_alg *alg) +{ + if (fips140_ready()) + return true; + /* + * The thread running the tests must not wait. Since tfms can only be + * allocated in task context, we can reliably determine whether the + * invocation is from that thread or not by checking 'current'. + */ + if (current == fips140_init_thread) + return false; + + pr_info("blocking user of %s until tests complete\n", + alg->cra_driver_name); + wait_for_completion(&fips140_tests_done); + pr_info("tests done, allowing %s to proceed\n", alg->cra_driver_name); + return true; +} + +static int fips140_store_init_function(struct crypto_alg *alg, void *func) +{ + void *ret; + + /* + * The XArray API requires 4-byte aligned values. Although function + * pointers in general aren't guaranteed to be 4-byte aligned, it should + * be the case for the platforms this module is used on. + */ + if (WARN_ON((unsigned long)func & 3)) + return -EINVAL; + + ret = xa_store(&fips140_init_func_map, (unsigned long)alg, func, + GFP_KERNEL); + return xa_err(ret); +} + +/* Get the algorithm's original initialization function (possibly NULL) */ +static void *fips140_load_init_function(struct crypto_alg *alg) +{ + return xa_load(&fips140_init_func_map, (unsigned long)alg); +} + +/* tfm initialization function overrides */ + +static int fips140_alg_init_tfm(struct crypto_tfm *tfm) +{ + struct crypto_alg *alg = tfm->__crt_alg; + int (*cra_init)(struct crypto_tfm *tfm) = + fips140_load_init_function(alg); + + if (fips140_wait_until_ready(alg)) + WRITE_ONCE(alg->cra_init, cra_init); + return cra_init ? cra_init(tfm) : 0; +} + +static int fips140_aead_init_tfm(struct crypto_aead *tfm) +{ + struct aead_alg *alg = crypto_aead_alg(tfm); + int (*init)(struct crypto_aead *tfm) = + fips140_load_init_function(&alg->base); + + if (fips140_wait_until_ready(&alg->base)) + WRITE_ONCE(alg->init, init); + return init ? 
init(tfm) : 0; +} + +static int fips140_ahash_init_tfm(struct crypto_ahash *tfm) +{ + struct hash_alg_common *halg = crypto_hash_alg_common(tfm); + struct ahash_alg *alg = container_of(halg, struct ahash_alg, halg); + int (*init_tfm)(struct crypto_ahash *tfm) = + fips140_load_init_function(&halg->base); + + if (fips140_wait_until_ready(&halg->base)) + WRITE_ONCE(alg->init_tfm, init_tfm); + return init_tfm ? init_tfm(tfm) : 0; +} + +static int fips140_shash_init_tfm(struct crypto_shash *tfm) +{ + struct shash_alg *alg = crypto_shash_alg(tfm); + int (*init_tfm)(struct crypto_shash *tfm) = + fips140_load_init_function(&alg->base); + + if (fips140_wait_until_ready(&alg->base)) + WRITE_ONCE(alg->init_tfm, init_tfm); + return init_tfm ? init_tfm(tfm) : 0; +} + +static int fips140_skcipher_init_tfm(struct crypto_skcipher *tfm) +{ + struct skcipher_alg *alg = crypto_skcipher_alg(tfm); + int (*init)(struct crypto_skcipher *tfm) = + fips140_load_init_function(&alg->base); + + if (fips140_wait_until_ready(&alg->base)) + WRITE_ONCE(alg->init, init); + return init ? init(tfm) : 0; +} + +/* Single algorithm registration */ + +#define prepare_alg(alg, base_alg, field, wrapper_func) \ +({ \ + int err = 0; \ + \ + if (!fips140_ready() && alg->field != wrapper_func) { \ + err = fips140_store_init_function(base_alg, alg->field);\ + if (err == 0) \ + alg->field = wrapper_func; \ + } \ + err; \ +}) + +static int fips140_prepare_alg(struct crypto_alg *alg) +{ + /* + * Override cra_init. This is only for algorithm types like cipher and + * rng that don't have a strongly-typed initialization function. + */ + return prepare_alg(alg, alg, cra_init, fips140_alg_init_tfm); +} + +static int fips140_prepare_aead_alg(struct aead_alg *alg) +{ + return prepare_alg(alg, &alg->base, init, fips140_aead_init_tfm); +} + +static int fips140_prepare_ahash_alg(struct ahash_alg *alg) +{ + return prepare_alg(alg, &alg->halg.base, init_tfm, + fips140_ahash_init_tfm); +} + +static int fips140_prepare_rng_alg(struct rng_alg *alg) +{ + /* + * rng doesn't have a strongly-typed initialization function, so we must + * treat rng algorithms as "generic" algorithms. 
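+	 * Concretely, they go through fips140_prepare_alg() and get the
+	 * generic fips140_alg_init_tfm() override on cra_init.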
+ */ + return fips140_prepare_alg(&alg->base); +} + +static int fips140_prepare_shash_alg(struct shash_alg *alg) +{ + return prepare_alg(alg, &alg->base, init_tfm, fips140_shash_init_tfm); +} + +static int fips140_prepare_skcipher_alg(struct skcipher_alg *alg) +{ + return prepare_alg(alg, &alg->base, init, fips140_skcipher_init_tfm); +} + +int fips140_crypto_register_alg(struct crypto_alg *alg) +{ + return fips140_prepare_alg(alg) ?: crypto_register_alg(alg); +} + +int fips140_crypto_register_aead(struct aead_alg *alg) +{ + return fips140_prepare_aead_alg(alg) ?: crypto_register_aead(alg); +} + +int fips140_crypto_register_ahash(struct ahash_alg *alg) +{ + return fips140_prepare_ahash_alg(alg) ?: crypto_register_ahash(alg); +} + +int fips140_crypto_register_rng(struct rng_alg *alg) +{ + return fips140_prepare_rng_alg(alg) ?: crypto_register_rng(alg); +} + +int fips140_crypto_register_shash(struct shash_alg *alg) +{ + return fips140_prepare_shash_alg(alg) ?: crypto_register_shash(alg); +} + +int fips140_crypto_register_skcipher(struct skcipher_alg *alg) +{ + return fips140_prepare_skcipher_alg(alg) ?: + crypto_register_skcipher(alg); +} + +/* Instance registration */ + +int fips140_aead_register_instance(struct crypto_template *tmpl, + struct aead_instance *inst) +{ + return fips140_prepare_aead_alg(&inst->alg) ?: + aead_register_instance(tmpl, inst); +} + +int fips140_ahash_register_instance(struct crypto_template *tmpl, + struct ahash_instance *inst) +{ + return fips140_prepare_ahash_alg(&inst->alg) ?: + ahash_register_instance(tmpl, inst); +} + +int fips140_shash_register_instance(struct crypto_template *tmpl, + struct shash_instance *inst) +{ + return fips140_prepare_shash_alg(&inst->alg) ?: + shash_register_instance(tmpl, inst); +} + +int fips140_skcipher_register_instance(struct crypto_template *tmpl, + struct skcipher_instance *inst) +{ + return fips140_prepare_skcipher_alg(&inst->alg) ?: + skcipher_register_instance(tmpl, inst); +} + +/* Bulk algorithm registration */ + +int fips140_crypto_register_algs(struct crypto_alg *algs, int count) +{ + int i; + int err; + + for (i = 0; i < count; i++) { + err = fips140_prepare_alg(&algs[i]); + if (err) + return err; + } + + return crypto_register_algs(algs, count); +} + +int fips140_crypto_register_aeads(struct aead_alg *algs, int count) +{ + int i; + int err; + + for (i = 0; i < count; i++) { + err = fips140_prepare_aead_alg(&algs[i]); + if (err) + return err; + } + + return crypto_register_aeads(algs, count); +} + +int fips140_crypto_register_ahashes(struct ahash_alg *algs, int count) +{ + int i; + int err; + + for (i = 0; i < count; i++) { + err = fips140_prepare_ahash_alg(&algs[i]); + if (err) + return err; + } + + return crypto_register_ahashes(algs, count); +} + +int fips140_crypto_register_rngs(struct rng_alg *algs, int count) +{ + int i; + int err; + + for (i = 0; i < count; i++) { + err = fips140_prepare_rng_alg(&algs[i]); + if (err) + return err; + } + + return crypto_register_rngs(algs, count); +} + +int fips140_crypto_register_shashes(struct shash_alg *algs, int count) +{ + int i; + int err; + + for (i = 0; i < count; i++) { + err = fips140_prepare_shash_alg(&algs[i]); + if (err) + return err; + } + + return crypto_register_shashes(algs, count); +} + +int fips140_crypto_register_skciphers(struct skcipher_alg *algs, int count) +{ + int i; + int err; + + for (i = 0; i < count; i++) { + err = fips140_prepare_skcipher_alg(&algs[i]); + if (err) + return err; + } + + return crypto_register_skciphers(algs, count); +} diff --git 
a/crypto/fips140-defs.h b/crypto/fips140-defs.h new file mode 100644 index 000000000000..9005f9513308 --- /dev/null +++ b/crypto/fips140-defs.h @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright 2021 Google LLC + * + * This file is automatically included by all files built into fips140.ko, via + * the "-include" compiler flag. + */ + +/* + * fips140.ko is built from various unmodified or minimally modified kernel + * source files, many of which are normally meant to be buildable into different + * modules themselves. That results in conflicting instances of module_init() + * and related macros such as MODULE_LICENSE(). + * + * To solve that, we undefine MODULE to trick the kernel headers into thinking + * the code is being compiled as built-in. That causes module_init() and + * related macros to be expanded as they would be for built-in code; e.g., + * module_init() adds the function to the .initcalls section of the binary. + * + * The .c file that contains the real module_init() for fips140.ko is then + * responsible for redefining MODULE, and the real module_init() is responsible + * for executing all the initcalls that were collected into .initcalls. + */ +#undef MODULE + +/* + * Defining KBUILD_MODFILE is also required, since the kernel headers expect it + * to be defined when code that can be a module is compiled as built-in. + */ +#define KBUILD_MODFILE "crypto/fips140" + +/* + * Disable symbol exports by default. fips140.ko includes various files that + * use EXPORT_SYMBOL*(), but it's unwanted to export any symbols from fips140.ko + * except where explicitly needed for FIPS certification reasons. + */ +#define __DISABLE_EXPORTS + +/* + * Redirect all calls to algorithm registration functions to the wrapper + * functions defined within the module. + */ +#define aead_register_instance fips140_aead_register_instance +#define ahash_register_instance fips140_ahash_register_instance +#define crypto_register_aead fips140_crypto_register_aead +#define crypto_register_aeads fips140_crypto_register_aeads +#define crypto_register_ahash fips140_crypto_register_ahash +#define crypto_register_ahashes fips140_crypto_register_ahashes +#define crypto_register_alg fips140_crypto_register_alg +#define crypto_register_algs fips140_crypto_register_algs +#define crypto_register_rng fips140_crypto_register_rng +#define crypto_register_rngs fips140_crypto_register_rngs +#define crypto_register_shash fips140_crypto_register_shash +#define crypto_register_shashes fips140_crypto_register_shashes +#define crypto_register_skcipher fips140_crypto_register_skcipher +#define crypto_register_skciphers fips140_crypto_register_skciphers +#define shash_register_instance fips140_shash_register_instance +#define skcipher_register_instance fips140_skcipher_register_instance diff --git a/crypto/fips140-eval-testing-uapi.h b/crypto/fips140-eval-testing-uapi.h new file mode 100644 index 000000000000..04e6cf633594 --- /dev/null +++ b/crypto/fips140-eval-testing-uapi.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ + +#ifndef _CRYPTO_FIPS140_EVAL_TESTING_H +#define _CRYPTO_FIPS140_EVAL_TESTING_H + +#include + +/* + * This header defines the ioctls that are available on the fips140 character + * device. These ioctls expose some of the module's services to userspace so + * that they can be tested by the FIPS certification lab; this is a required + * part of getting a FIPS 140 certification. 
These ioctls do not have any other + * purpose, and they do not need to be present in production builds. + */ + +/* + * Call the fips140_is_approved_service() function. The argument must be the + * service name as a NUL-terminated string. The return value will be 1 if + * fips140_is_approved_service() returned true, or 0 if it returned false. + */ +#define FIPS140_IOCTL_IS_APPROVED_SERVICE _IO('F', 0) + +/* + * Call the fips140_module_version() function. The argument must be a pointer + * to a buffer of size >= 256 chars. The NUL-terminated string returned by + * fips140_module_version() will be written to this buffer. + */ +#define FIPS140_IOCTL_MODULE_VERSION _IOR('F', 1, char[256]) + +#endif /* _CRYPTO_FIPS140_EVAL_TESTING_H */ diff --git a/crypto/fips140-eval-testing.c b/crypto/fips140-eval-testing.c new file mode 100644 index 000000000000..ea3cd653983a --- /dev/null +++ b/crypto/fips140-eval-testing.c @@ -0,0 +1,129 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2021 Google LLC + * + * This file can optionally be built into fips140.ko in order to support certain + * types of testing that the FIPS lab has to do to evaluate the module. It + * should not be included in production builds of the module. + */ + +/* + * We have to redefine inline to mean always_inline, so that _copy_to_user() + * gets inlined. This is needed for it to be placed into the correct section. + * See fips140_copy_to_user(). + * + * We also need to undefine BUILD_FIPS140_KO to allow the use of the code + * patching which copy_to_user() requires. + */ +#undef inline +#define inline inline __attribute__((__always_inline__)) __gnu_inline \ + __inline_maybe_unused notrace +#undef BUILD_FIPS140_KO + +#include +#include +#include +#include + +#include "fips140-module.h" +#include "fips140-eval-testing-uapi.h" + +/* + * This option allows deliberately failing the self-tests for a particular + * algorithm. + */ +static char *fips140_fail_selftest; +module_param_named(fail_selftest, fips140_fail_selftest, charp, 0); + +/* This option allows deliberately failing the integrity check. */ +static bool fips140_fail_integrity_check; +module_param_named(fail_integrity_check, fips140_fail_integrity_check, bool, 0); + +static dev_t fips140_devnum; +static struct cdev fips140_cdev; + +/* Inject a self-test failure (via corrupting the result) if requested. */ +void fips140_inject_selftest_failure(const char *impl, u8 *result) +{ + if (fips140_fail_selftest && strcmp(impl, fips140_fail_selftest) == 0) + result[0] ^= 0xff; +} + +/* Inject an integrity check failure (via corrupting the text) if requested. */ +void fips140_inject_integrity_failure(u8 *textcopy) +{ + if (fips140_fail_integrity_check) + textcopy[0] ^= 0xff; +} + +static long fips140_ioctl_is_approved_service(unsigned long arg) +{ + const char *service_name = strndup_user((const char __user *)arg, 256); + long ret; + + if (IS_ERR(service_name)) + return PTR_ERR(service_name); + + ret = fips140_is_approved_service(service_name); + + kfree(service_name); + return ret; +} + +/* + * Code in fips140.ko is covered by an integrity check by default, and this + * check breaks if copy_to_user() is called. This is because copy_to_user() is + * an inline function that relies on code patching. However, since this is + * "evaluation testing" code which isn't included in the production builds of + * fips140.ko, it's acceptable to just exclude it from the integrity check. 
+ */ +static noinline unsigned long __section("text.._fips140_unchecked") +fips140_copy_to_user(void __user *to, const void *from, unsigned long n) +{ + return copy_to_user(to, from, n); +} + +static long fips140_ioctl_module_version(unsigned long arg) +{ + const char *version = fips140_module_version(); + size_t len = strlen(version) + 1; + + if (len > 256) + return -EOVERFLOW; + + if (fips140_copy_to_user((void __user *)arg, version, len)) + return -EFAULT; + + return 0; +} + +static long fips140_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + switch (cmd) { + case FIPS140_IOCTL_IS_APPROVED_SERVICE: + return fips140_ioctl_is_approved_service(arg); + case FIPS140_IOCTL_MODULE_VERSION: + return fips140_ioctl_module_version(arg); + default: + return -ENOTTY; + } +} + +static const struct file_operations fips140_fops = { + .unlocked_ioctl = fips140_ioctl, +}; + +bool fips140_eval_testing_init(void) +{ + if (alloc_chrdev_region(&fips140_devnum, 1, 1, "fips140") != 0) { + pr_err("failed to allocate device number\n"); + return false; + } + cdev_init(&fips140_cdev, &fips140_fops); + if (cdev_add(&fips140_cdev, fips140_devnum, 1) != 0) { + pr_err("failed to add fips140 character device\n"); + return false; + } + return true; +} diff --git a/crypto/fips140-generated-testvecs.h b/crypto/fips140-generated-testvecs.h new file mode 100644 index 000000000000..d4ccd77eb97f --- /dev/null +++ b/crypto/fips140-generated-testvecs.h @@ -0,0 +1,68 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright 2021 Google LLC */ + +/* + * This header was automatically generated by gen_fips140_testvecs.py. + * Don't edit it directly. + */ + +static const u8 fips_message[32] __initconst = + "This is a 32-byte test message."; + +static const u8 fips_aes_key[16] __initconst = "128-bit AES key"; + +static const u8 fips_aes_iv[16] __initconst = "ABCDEFGHIJKLMNOP"; + +static const u8 fips_aes_cbc_ciphertext[32] __initconst = + "\x4c\x3e\xeb\x38\x8d\x1f\x28\xfd\xa2\x3b\xa9\xda\x36\xf2\x99\xe2" + "\x84\x84\x66\x37\x0a\x53\x68\x2f\x17\x95\x8d\x7f\xca\x5a\x68\x4e"; + +static const u8 fips_aes_ecb_ciphertext[32] __initconst = + "\xc1\x9d\xe6\xb8\xb2\x90\xff\xfe\xf2\x77\x18\xb0\x55\xd3\xee\xa9" + "\xe2\x6f\x4a\x32\x67\xfd\xb7\xa5\x2f\x4b\x6e\x1a\x86\x2b\x6e\x3a"; + +static const u8 fips_aes_ctr_ciphertext[32] __initconst = + "\xed\x06\x2c\xd0\xbc\x48\xd1\x2e\x6a\x4e\x13\xe9\xaa\x17\x40\xca" + "\x00\xb4\xaf\x3b\x4f\xee\x73\xd6\x6c\x41\xf6\x4c\x8b\x0d\x6a\x0f"; + +static const u8 fips_aes_gcm_assoc[22] __initconst = "associated data string"; + +static const u8 fips_aes_gcm_ciphertext[48] __initconst = + "\x37\x88\x3e\x1d\x58\x50\xda\x10\x07\xeb\x52\xdf\xea\x0a\x54\xd4" + "\x44\xbf\x88\x2a\xf3\x03\x03\x84\xaf\x8b\x96\xbd\xea\x65\x60\x6f" + "\x82\xfa\x51\xf4\x28\xad\x0c\xf1\xce\x0f\x91\xdd\x1a\x4c\x77\x5f"; + +static const u8 fips_aes_xts_key[32] __initconst = + "This is an AES-128-XTS key."; + +static const u8 fips_aes_xts_ciphertext[32] __initconst = + "\x4f\xf7\x9f\x6c\x00\xa8\x30\xdf\xff\xf3\x25\x9c\xf6\x0b\x1b\xfd" + "\x3b\x34\x5e\x67\x7c\xf8\x8b\x68\x9a\xb9\x5a\x89\x51\x51\xbd\x35"; + +static const u8 fips_aes_cmac_digest[16] __initconst = + "\x0c\x05\xda\x64\x51\x0c\x8e\x6c\x86\x52\x46\xa8\x2d\xb1\xfe\x0f"; + +static const u8 fips_hmac_key[16] __initconst = "128-bit HMAC key"; + +static const u8 fips_sha1_digest[20] __initconst = + "\x1b\x78\xc7\x4b\xd5\xd4\x83\xb1\x58\xc5\x96\x83\x4f\x16\x8d\x15" + "\xb4\xaa\x22\x8c"; + +static const u8 fips_sha256_digest[32] __initconst = + 
"\x4e\x11\x83\x0c\x53\x80\x1e\x5f\x9b\x38\x33\x38\xe8\x74\x43\xb0" + "\xc1\x3a\xbe\xbf\x75\xf0\x12\x0f\x21\x33\xf5\x16\x33\xf1\xb0\x81"; + +static const u8 fips_hmac_sha256_digest[32] __initconst = + "\x63\x0e\xb5\x73\x79\xfc\xaf\x5f\x86\xe3\xaf\xf0\xc8\x36\xef\xd5" + "\x35\x8d\x40\x25\x38\xb3\x65\x72\x98\xf3\x59\xd8\x1e\x54\x4c\xa1"; + +static const u8 fips_sha512_digest[64] __initconst = + "\x32\xe0\x44\x23\xbd\xe3\xec\x28\xbf\xf1\x34\x11\xd5\xae\xbf\xd5" + "\xc0\x8e\xb5\xa1\x04\xef\x2f\x07\x84\xf1\xd9\x83\x0f\x6c\x31\xab" + "\xf7\xe7\x57\xfa\xf7\xae\xf0\x6f\xb2\x16\x08\x32\xcf\xc7\xef\x35" + "\xb3\x3b\x51\xb9\xfd\xe7\xff\x5e\xb2\x8b\xc6\x79\xe6\x14\x04\xb4"; + +/* + * This header was automatically generated by gen_fips140_testvecs.py. + * Don't edit it directly. + */ diff --git a/crypto/fips140-module.c b/crypto/fips140-module.c new file mode 100644 index 000000000000..5c2a594dd26b --- /dev/null +++ b/crypto/fips140-module.c @@ -0,0 +1,611 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2021 Google LLC + * Author: Ard Biesheuvel + * + * This file is the core of fips140.ko, which contains various crypto algorithms + * that are also built into vmlinux. At load time, this module overrides the + * built-in implementations of these algorithms with its implementations. It + * also runs self-tests on these algorithms and verifies the integrity of its + * code and data. If either of these steps fails, the kernel will panic. + * + * This module is intended to be loaded at early boot time in order to meet + * FIPS 140 and NIAP FPT_TST_EXT.1 requirements. It shouldn't be used if you + * don't need to meet these requirements. + */ + +/* + * Since this .c file is the real entry point of fips140.ko, it needs to be + * compiled normally, so undo the hacks that were done in fips140-defs.h. + */ +#define MODULE +#undef KBUILD_MODFILE +#undef __DISABLE_EXPORTS + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "fips140-module.h" +#include "internal.h" + +/* + * FIPS 140-2 prefers the use of HMAC with a public key over a plain hash. + */ +u8 __initdata fips140_integ_hmac_key[] = "The quick brown fox jumps over the lazy dog"; + +/* this is populated by the build tool */ +u8 __initdata fips140_integ_hmac_digest[SHA256_DIGEST_SIZE]; + +const u32 __initcall_start_marker __section(".initcalls._start"); +const u32 __initcall_end_marker __section(".initcalls._end"); + +const u8 __fips140_text_start __section(".text.._start"); +const u8 __fips140_text_end __section(".text.._end"); + +const u8 __fips140_rodata_start __section(".rodata.._start"); +const u8 __fips140_rodata_end __section(".rodata.._end"); + +/* + * We need this little detour to prevent Clang from detecting out of bounds + * accesses to __fips140_text_start and __fips140_rodata_start, which only exist + * to delineate the section, and so their sizes are not relevant to us. + */ +const u32 *__initcall_start = &__initcall_start_marker; + +const u8 *__text_start = &__fips140_text_start; +const u8 *__rodata_start = &__fips140_rodata_start; + +/* + * The list of the crypto API algorithms (by cra_name) that will be unregistered + * by this module, in preparation for the module registering its own + * implementation(s) of them. + * + * All algorithms that will be declared as FIPS-approved in the module + * certification must be listed here, to ensure that the non-FIPS-approved + * implementations of these algorithms in the kernel image aren't used. 
+ * + * For every algorithm in this list, the module should contain all the "same" + * implementations that the kernel image does, including the C implementation as + * well as any architecture-specific implementations. This is needed to avoid + * performance regressions as well as the possibility of an algorithm being + * unavailable on some CPUs. E.g., "xcbc(aes)" isn't in this list, as the + * module doesn't have a C implementation of it (and it won't be FIPS-approved). + * + * Due to a quirk in the FIPS requirements, "gcm(aes)" isn't actually able to be + * FIPS-approved. However, we otherwise treat it the same as the algorithms + * that will be FIPS-approved, and therefore it's included in this list. + * + * When adding a new algorithm here, make sure to consider whether it needs a + * self-test added to fips140_selftests[] as well. + */ +static const struct { + const char *name; + bool approved; +} fips140_algs_to_replace[] = { + {"aes", true}, + + {"cmac(aes)", true}, + {"ecb(aes)", true}, + + {"cbc(aes)", true}, + {"cts(cbc(aes))", true}, + {"ctr(aes)", true}, + {"xts(aes)", true}, + {"gcm(aes)", false}, + + {"hmac(sha1)", true}, + {"hmac(sha224)", true}, + {"hmac(sha256)", true}, + {"hmac(sha384)", true}, + {"hmac(sha512)", true}, + {"sha1", true}, + {"sha224", true}, + {"sha256", true}, + {"sha384", true}, + {"sha512", true}, + + {"stdrng", true}, + {"jitterentropy_rng", false}, +}; + +static bool __init fips140_should_unregister_alg(struct crypto_alg *alg) +{ + int i; + + /* + * All software algorithms are synchronous, hardware algorithms must + * be covered by their own FIPS 140 certification. + */ + if (alg->cra_flags & CRYPTO_ALG_ASYNC) + return false; + + for (i = 0; i < ARRAY_SIZE(fips140_algs_to_replace); i++) { + if (!strcmp(alg->cra_name, fips140_algs_to_replace[i].name)) + return true; + } + return false; +} + +/* + * FIPS 140-3 service indicators. FIPS 140-3 requires that all services + * "provide an indicator when the service utilises an approved cryptographic + * algorithm, security function or process in an approved manner". What this + * means is very debatable, even with the help of the FIPS 140-3 Implementation + * Guidance document. However, it was decided that a function that takes in an + * algorithm name and returns whether that algorithm is approved or not will + * meet this requirement. Note, this relies on some properties of the module: + * + * - The module doesn't distinguish between "services" and "algorithms"; its + * services are simply its algorithms. + * + * - The status of an approved algorithm is never non-approved, since (a) the + * module doesn't support operating in a non-approved mode, such as a mode + * where the self-tests are skipped; (b) there are no cases where the module + * supports non-approved settings for approved algorithms, e.g. + * non-approved key sizes; and (c) this function isn't available to be + * called until the module_init function has completed, so it's guaranteed + * that the self-tests and integrity check have already passed. + * + * - The module does support some non-approved algorithms, so a single static + * indicator ("return true;") would not be acceptable. 
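+ *
+ * Illustrative examples, based on fips140_algs_to_replace[] above:
+ *
+ *   fips140_is_approved_service("cbc(aes)")   returns true
+ *   fips140_is_approved_service("gcm(aes)")   returns false (not approvable)
+ *   fips140_is_approved_service("xcbc(aes)")  returns false (not in the list)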
+ */ +bool fips140_is_approved_service(const char *name) +{ + size_t i; + + for (i = 0; i < ARRAY_SIZE(fips140_algs_to_replace); i++) { + if (!strcmp(name, fips140_algs_to_replace[i].name)) + return fips140_algs_to_replace[i].approved; + } + return false; +} +EXPORT_SYMBOL_GPL(fips140_is_approved_service); + +/* + * FIPS 140-3 requires that modules provide a "service" that outputs "the name + * or module identifier and the versioning information that can be correlated + * with a validation record". This function meets that requirement. + * + * Note: the module also prints this same information to the kernel log when it + * is loaded. That might meet the requirement by itself. However, given the + * vagueness of what counts as a "service", we provide this function too, just + * in case the certification lab or CMVP is happier with an explicit function. + * + * Note: /sys/modules/fips140/scmversion also provides versioning information + * about the module. However that file just shows the bare git commit ID, so it + * probably isn't sufficient to meet the FIPS requirement, which seems to want + * the "official" module name and version number used in the FIPS certificate. + */ +const char *fips140_module_version(void) +{ + return FIPS140_MODULE_NAME " " FIPS140_MODULE_VERSION; +} +EXPORT_SYMBOL_GPL(fips140_module_version); + +static LIST_HEAD(existing_live_algos); + +/* + * Release a list of algorithms which have been removed from crypto_alg_list. + * + * Note that even though the list is a private list, we have to hold + * crypto_alg_sem while iterating through it because crypto_unregister_alg() may + * run concurrently (as we haven't taken a reference to the algorithms on the + * list), and crypto_unregister_alg() will remove the algorithm from whichever + * list it happens to be on, while holding crypto_alg_sem. That's okay, since + * in that case crypto_unregister_alg() will handle the crypto_alg_put(). + */ +static void fips140_remove_final(struct list_head *list) +{ + struct crypto_alg *alg; + struct crypto_alg *n; + + /* + * We need to take crypto_alg_sem to safely traverse the list (see + * comment above), but we have to drop it when doing each + * crypto_alg_put() as that may take crypto_alg_sem again. + */ + down_write(&crypto_alg_sem); + list_for_each_entry_safe(alg, n, list, cra_list) { + list_del_init(&alg->cra_list); + up_write(&crypto_alg_sem); + + crypto_alg_put(alg); + + down_write(&crypto_alg_sem); + } + up_write(&crypto_alg_sem); +} + +static void __init unregister_existing_fips140_algos(void) +{ + struct crypto_alg *alg, *tmp; + LIST_HEAD(remove_list); + LIST_HEAD(spawns); + + down_write(&crypto_alg_sem); + + /* + * Find all registered algorithms that we care about, and move them to a + * private list so that they are no longer exposed via the algo lookup + * API. Subsequently, we will unregister them if they are not in active + * use. If they are, we can't fully unregister them but we can ensure + * that new users won't use them. + */ + list_for_each_entry_safe(alg, tmp, &crypto_alg_list, cra_list) { + if (!fips140_should_unregister_alg(alg)) + continue; + if (refcount_read(&alg->cra_refcnt) == 1) { + /* + * This algorithm is not currently in use, but there may + * be template instances holding references to it via + * spawns. So let's tear it down like + * crypto_unregister_alg() would, but without releasing + * the lock, to prevent races with concurrent TFM + * allocations. 
+ */ + alg->cra_flags |= CRYPTO_ALG_DEAD; + list_move(&alg->cra_list, &remove_list); + crypto_remove_spawns(alg, &spawns, NULL); + } else { + /* + * This algorithm is live, i.e. it has TFMs allocated, + * so we can't fully unregister it. It's not necessary + * to dynamically redirect existing users to the FIPS + * code, given that they can't be relying on FIPS + * certified crypto in the first place. However, we do + * need to ensure that new users will get the FIPS code. + * + * In most cases, setting alg->cra_priority to 0 + * achieves this. However, that isn't enough for + * algorithms like "hmac(sha256)" that need to be + * instantiated from a template, since existing + * algorithms always take priority over a template being + * instantiated. Therefore, we move the algorithm to + * a private list so that algorithm lookups won't find + * it anymore. To further distinguish it from the FIPS + * algorithms, we also append "+orig" to its name. + */ + pr_info("found already-live algorithm '%s' ('%s')\n", + alg->cra_name, alg->cra_driver_name); + alg->cra_priority = 0; + strlcat(alg->cra_name, "+orig", CRYPTO_MAX_ALG_NAME); + strlcat(alg->cra_driver_name, "+orig", + CRYPTO_MAX_ALG_NAME); + list_move(&alg->cra_list, &existing_live_algos); + } + } + up_write(&crypto_alg_sem); + + fips140_remove_final(&remove_list); + fips140_remove_final(&spawns); +} + +static void __init unapply_text_relocations(void *section, int section_size, + const Elf64_Rela *rela, int numrels) +{ + while (numrels--) { + u32 *place = (u32 *)(section + rela->r_offset); + + BUG_ON(rela->r_offset >= section_size); + + switch (ELF64_R_TYPE(rela->r_info)) { +#ifdef CONFIG_ARM64 + case R_AARCH64_ABS32: /* for KCFI */ + *place = 0; + break; + + case R_AARCH64_JUMP26: + case R_AARCH64_CALL26: + *place &= ~GENMASK(25, 0); + break; + + case R_AARCH64_ADR_PREL_LO21: + case R_AARCH64_ADR_PREL_PG_HI21: + case R_AARCH64_ADR_PREL_PG_HI21_NC: + *place &= ~(GENMASK(30, 29) | GENMASK(23, 5)); + break; + + case R_AARCH64_ADD_ABS_LO12_NC: + case R_AARCH64_LDST8_ABS_LO12_NC: + case R_AARCH64_LDST16_ABS_LO12_NC: + case R_AARCH64_LDST32_ABS_LO12_NC: + case R_AARCH64_LDST64_ABS_LO12_NC: + case R_AARCH64_LDST128_ABS_LO12_NC: + *place &= ~GENMASK(21, 10); + break; + default: + pr_err("unhandled relocation type %llu\n", + ELF64_R_TYPE(rela->r_info)); + BUG(); +#else +#error +#endif + } + rela++; + } +} + +static void __init unapply_rodata_relocations(void *section, int section_size, + const Elf64_Rela *rela, int numrels) +{ + while (numrels--) { + void *place = section + rela->r_offset; + + BUG_ON(rela->r_offset >= section_size); + + switch (ELF64_R_TYPE(rela->r_info)) { +#ifdef CONFIG_ARM64 + case R_AARCH64_ABS64: + *(u64 *)place = 0; + break; + default: + pr_err("unhandled relocation type %llu\n", + ELF64_R_TYPE(rela->r_info)); + BUG(); +#else +#error +#endif + } + rela++; + } +} + +extern struct { + u32 offset; + u32 count; +} fips140_rela_text, fips140_rela_rodata; + +static bool __init check_fips140_module_hmac(void) +{ + struct crypto_shash *tfm = NULL; + SHASH_DESC_ON_STACK(desc, dontcare); + u8 digest[SHA256_DIGEST_SIZE]; + void *textcopy, *rodatacopy; + int textsize, rodatasize; + bool ok = false; + int err; + + textsize = &__fips140_text_end - &__fips140_text_start; + rodatasize = &__fips140_rodata_end - &__fips140_rodata_start; + + pr_info("text size : 0x%x\n", textsize); + pr_info("rodata size: 0x%x\n", rodatasize); + + textcopy = kmalloc(textsize + rodatasize, GFP_KERNEL); + if (!textcopy) { + pr_err("Failed to allocate memory for copy 
of .text\n"); + goto out; + } + + rodatacopy = textcopy + textsize; + + memcpy(textcopy, __text_start, textsize); + memcpy(rodatacopy, __rodata_start, rodatasize); + + // apply the relocations in reverse on the copies of .text and .rodata + unapply_text_relocations(textcopy, textsize, + offset_to_ptr(&fips140_rela_text.offset), + fips140_rela_text.count); + + unapply_rodata_relocations(rodatacopy, rodatasize, + offset_to_ptr(&fips140_rela_rodata.offset), + fips140_rela_rodata.count); + + fips140_inject_integrity_failure(textcopy); + + tfm = crypto_alloc_shash("hmac(sha256)", 0, 0); + if (IS_ERR(tfm)) { + pr_err("failed to allocate hmac tfm (%ld)\n", PTR_ERR(tfm)); + tfm = NULL; + goto out; + } + desc->tfm = tfm; + + pr_info("using '%s' for integrity check\n", + crypto_shash_driver_name(tfm)); + + err = crypto_shash_setkey(tfm, fips140_integ_hmac_key, + strlen(fips140_integ_hmac_key)) ?: + crypto_shash_init(desc) ?: + crypto_shash_update(desc, textcopy, textsize) ?: + crypto_shash_finup(desc, rodatacopy, rodatasize, digest); + + /* Zeroizing this is important; see the comment below. */ + shash_desc_zero(desc); + + if (err) { + pr_err("failed to calculate hmac shash (%d)\n", err); + goto out; + } + + if (memcmp(digest, fips140_integ_hmac_digest, sizeof(digest))) { + pr_err("provided_digest : %*phN\n", (int)sizeof(digest), + fips140_integ_hmac_digest); + + pr_err("calculated digest: %*phN\n", (int)sizeof(digest), + digest); + goto out; + } + ok = true; +out: + /* + * FIPS 140-3 requires that all "temporary value(s) generated during the + * integrity test" be zeroized (ref: FIPS 140-3 IG 9.7.B). There is no + * technical reason to do this given that these values are public + * information, but this is the requirement so we follow it. + */ + crypto_free_shash(tfm); + memzero_explicit(digest, sizeof(digest)); + kfree_sensitive(textcopy); + return ok; +} + +static void fips140_sha256(void *p, const u8 *data, unsigned int len, u8 *out, + int *hook_inuse) +{ + sha256(data, len, out); + *hook_inuse = 1; +} + +static void fips140_aes_expandkey(void *p, struct crypto_aes_ctx *ctx, + const u8 *in_key, unsigned int key_len, + int *err) +{ + *err = aes_expandkey(ctx, in_key, key_len); +} + +static void fips140_aes_encrypt(void *priv, const struct crypto_aes_ctx *ctx, + u8 *out, const u8 *in, int *hook_inuse) +{ + aes_encrypt(ctx, out, in); + *hook_inuse = 1; +} + +static void fips140_aes_decrypt(void *priv, const struct crypto_aes_ctx *ctx, + u8 *out, const u8 *in, int *hook_inuse) +{ + aes_decrypt(ctx, out, in); + *hook_inuse = 1; +} + +static bool update_fips140_library_routines(void) +{ + int ret; + + ret = register_trace_android_vh_sha256(fips140_sha256, NULL) ?: + register_trace_android_vh_aes_expandkey(fips140_aes_expandkey, NULL) ?: + register_trace_android_vh_aes_encrypt(fips140_aes_encrypt, NULL) ?: + register_trace_android_vh_aes_decrypt(fips140_aes_decrypt, NULL); + + return ret == 0; +} + +/* + * Initialize the FIPS 140 module. + * + * Note: this routine iterates over the contents of the initcall section, which + * consists of an array of function pointers that was emitted by the linker + * rather than the compiler. This means that these function pointers lack the + * usual CFI stubs that the compiler emits when CFI codegen is enabled. So + * let's disable CFI locally when handling the initcall array, to avoid + * surpises. 
+ */ +static int __init __attribute__((__no_sanitize__("cfi"))) +fips140_init(void) +{ + const u32 *initcall; + + pr_info("loading " FIPS140_MODULE_NAME " " FIPS140_MODULE_VERSION "\n"); + fips140_init_thread = current; + + unregister_existing_fips140_algos(); + + /* iterate over all init routines present in this module and call them */ + for (initcall = __initcall_start + 1; + initcall < &__initcall_end_marker; + initcall++) { + int (*init)(void) = offset_to_ptr(initcall); + int err = init(); + + /* + * ENODEV is expected from initcalls that only register + * algorithms that depend on non-present CPU features. Besides + * that, errors aren't expected here. + */ + if (err && err != -ENODEV) { + pr_err("initcall %ps() failed: %d\n", init, err); + goto panic; + } + } + + if (!fips140_run_selftests()) + goto panic; + + /* + * It may seem backward to perform the integrity check last, but this + * is intentional: the check itself uses hmac(sha256) which is one of + * the algorithms that are replaced with versions from this module, and + * the integrity check must use the replacement version. Also, to be + * ready for FIPS 140-3, the integrity check algorithm must have already + * been self-tested. + */ + + if (!check_fips140_module_hmac()) { + pr_crit("integrity check failed -- giving up!\n"); + goto panic; + } + pr_info("integrity check passed\n"); + + complete_all(&fips140_tests_done); + + if (!update_fips140_library_routines()) + goto panic; + + if (!fips140_eval_testing_init()) + goto panic; + + pr_info("module successfully loaded\n"); + return 0; + +panic: + panic("FIPS 140 module load failure"); +} + +module_init(fips140_init); + +MODULE_IMPORT_NS(CRYPTO_INTERNAL); +MODULE_LICENSE("GPL v2"); + +/* + * Below are copies of some selected "crypto-related" helper functions that are + * used by fips140.ko but are not already built into it, due to them being + * defined in a file that cannot easily be built into fips140.ko (e.g., + * crypto/algapi.c) instead of one that can (e.g., most files in lib/). + * + * There is no hard rule about what needs to be included here, as this is for + * FIPS certifiability, not any technical reason. FIPS modules are supposed to + * implement the "crypto" themselves, but to do so they are allowed to call + * non-cryptographic helper functions from outside the module. Something like + * memcpy() is "clearly" non-cryptographic. However, there is is ambiguity + * about functions like crypto_inc() which aren't cryptographic by themselves, + * but are more closely associated with cryptography than e.g. memcpy(). To err + * on the side of caution, we define copies of some selected functions below so + * that calls to them from within fips140.ko will remain in fips140.ko. 
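+ *
+ * (crypto_inc(), copied below, is one such case: it is the big-endian counter
+ * increment helper used by the "ctr(aes)" code built into this module.)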
+ */ + +static inline void crypto_inc_byte(u8 *a, unsigned int size) +{ + u8 *b = (a + size); + u8 c; + + for (; size; size--) { + c = *--b + 1; + *b = c; + if (c) + break; + } +} + +void crypto_inc(u8 *a, unsigned int size) +{ + __be32 *b = (__be32 *)(a + size); + u32 c; + + if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) || + IS_ALIGNED((unsigned long)b, __alignof__(*b))) + for (; size >= 4; size -= 4) { + c = be32_to_cpu(*--b) + 1; + *b = cpu_to_be32(c); + if (likely(c)) + return; + } + + crypto_inc_byte(a, size); +} diff --git a/crypto/fips140-module.h b/crypto/fips140-module.h new file mode 100644 index 000000000000..a2a63194eb64 --- /dev/null +++ b/crypto/fips140-module.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright 2021 Google LLC + */ + +#ifndef _CRYPTO_FIPS140_MODULE_H +#define _CRYPTO_FIPS140_MODULE_H + +#include +#include +#include + +#undef pr_fmt +#define pr_fmt(fmt) "fips140: " fmt + +/* + * This is the name and version number of the module that are shown on the FIPS + * certificate. + */ +#define FIPS140_MODULE_NAME "Android Kernel Cryptographic Module" +#define FIPS140_MODULE_VERSION UTS_RELEASE + +/* fips140-eval-testing.c */ +#ifdef CONFIG_CRYPTO_FIPS140_MOD_EVAL_TESTING +void fips140_inject_selftest_failure(const char *impl, u8 *result); +void fips140_inject_integrity_failure(u8 *textcopy); +bool fips140_eval_testing_init(void); +#else +static inline void fips140_inject_selftest_failure(const char *impl, u8 *result) +{ +} +static inline void fips140_inject_integrity_failure(u8 *textcopy) +{ +} +static inline bool fips140_eval_testing_init(void) +{ + return true; +} +#endif /* !CONFIG_CRYPTO_FIPS140_MOD_EVAL_TESTING */ + +/* fips140-module.c */ +extern struct completion fips140_tests_done; +extern struct task_struct *fips140_init_thread; +bool fips140_is_approved_service(const char *name); +const char *fips140_module_version(void); + +/* fips140-selftests.c */ +bool __init __must_check fips140_run_selftests(void); + +#endif /* _CRYPTO_FIPS140_MODULE_H */ diff --git a/crypto/fips140-refs.S b/crypto/fips140-refs.S new file mode 100644 index 000000000000..fcbd52776323 --- /dev/null +++ b/crypto/fips140-refs.S @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright 2021 Google LLC + * Author: Ard Biesheuvel + * + * This file contains the variable definitions that will be used by the FIPS140 + * s/w module to access the RELA sections in the ELF image. These are used to + * apply the relocations applied by the module loader in reverse, so that we + * can reconstruct the image that was used to derive the HMAC used by the + * integrity check. + * + * The first .long of each entry will be populated by the module loader based + * on the actual placement of the respective RELA section in memory. The second + * .long carries the RELA entry count, and is populated by the host tool that + * also generates the HMAC of the contents of .text and .rodata. + */ + +#include +#include + + .section ".init.rodata", "a" + + .align 2 + .globl fips140_rela_text +fips140_rela_text: + .weak __sec_rela_text + .long __sec_rela_text - . + .long 0 + + .globl fips140_rela_rodata +fips140_rela_rodata: + .weak __sec_rela_rodata + .long __sec_rela_rodata - . 
+ .long 0 diff --git a/crypto/fips140-selftests.c b/crypto/fips140-selftests.c new file mode 100644 index 000000000000..9663b1a33bd4 --- /dev/null +++ b/crypto/fips140-selftests.c @@ -0,0 +1,998 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2021 Google LLC + * + * Authors: Elena Petrova , + * Eric Biggers + * + * Self-tests of fips140.ko cryptographic functionality. These are run at + * module load time to fulfill FIPS 140 and NIAP FPT_TST_EXT.1 requirements. + * + * The actual requirements for these self-tests are somewhat vague, but + * section 9 ("Self-Tests") of the FIPS 140-2 Implementation Guidance document + * (https://csrc.nist.gov/csrc/media/projects/cryptographic-module-validation-program/documents/fips140-2/fips1402ig.pdf) + * is somewhat helpful. Basically, all implementations of all FIPS approved + * algorithms (including modes of operation) must be tested. However: + * + * - There are provisions for skipping tests that are already sufficiently + * covered by other tests. E.g., HMAC-SHA256 may cover SHA-256. + * + * - Only one test vector is required per algorithm, and it can be generated + * by any known-good implementation or taken from any official document. + * + * - For ciphers, both encryption and decryption must be tested. + * + * - Only one key size per algorithm needs to be tested. + * + * There is some ambiguity about whether all implementations of each algorithm + * must be tested, or whether it is sufficient to test just the highest priority + * implementation. To be safe we test all implementations, except ones that can + * be excluded by one of the rules above. + * + * See fips140_selftests[] for the list of tests we've selected. Currently, all + * our test vectors except the AES-CBC-CTS and DRBG ones were generated by the + * script tools/crypto/gen_fips140_testvecs.py, using the known-good + * implementations in the Python packages hashlib, pycryptodome, and + * cryptography. + * + * Note that we don't reuse the upstream crypto API's self-tests + * (crypto/testmgr.{c,h}), for several reasons: + * + * - To meet FIPS requirements, the self-tests must be located within the FIPS + * module boundary (fips140.ko). But testmgr is integrated into the crypto + * API framework and can't be extracted into the module. + * + * - testmgr is much more heavyweight than required for FIPS and NIAP; it + * tests more algorithms and does more tests per algorithm, as it's meant to + * do proper testing and not just meet certification requirements. We need + * tests that can run with minimal overhead on every boot-up. + * + * - Despite being more heavyweight in general, testmgr doesn't test the + * SHA-256 and AES library APIs, despite that being needed here. 
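+ *   (The AES library API, for instance, is exercised here by
+ *   fips_test_aes_library() below.)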
+ */ +#include +#include +#include +#include +#include +#include +#include + +#include "fips140-module.h" + +/* Test vector for an AEAD algorithm */ +struct aead_testvec { + const u8 *key; + size_t key_size; + const u8 *iv; + size_t iv_size; + const u8 *assoc; + size_t assoc_size; + const u8 *plaintext; + size_t plaintext_size; + const u8 *ciphertext; + size_t ciphertext_size; +}; + +/* Test vector for a length-preserving encryption algorithm */ +struct skcipher_testvec { + const u8 *key; + size_t key_size; + const u8 *iv; + size_t iv_size; + const u8 *plaintext; + const u8 *ciphertext; + size_t message_size; +}; + +/* Test vector for a hash algorithm */ +struct hash_testvec { + const u8 *key; + size_t key_size; + const u8 *message; + size_t message_size; + const u8 *digest; + size_t digest_size; +}; + +/* Test vector for a DRBG algorithm */ +struct drbg_testvec { + const u8 *entropy; + size_t entropy_size; + const u8 *pers; + size_t pers_size; + const u8 *entpr_a; + const u8 *entpr_b; + size_t entpr_size; + const u8 *add_a; + const u8 *add_b; + size_t add_size; + const u8 *output; + size_t out_size; +}; + +struct fips_test { + /* The name of the algorithm, in crypto API syntax */ + const char *alg; + + /* + * The optional list of implementations to test. @func will be called + * once per implementation, or once with @alg if this list is empty. + * The implementation names must be given in crypto API syntax, or in + * the case of a library implementation should have "-lib" appended. + */ + const char *impls[8]; + + /* + * The test function. It should execute a known-answer test on an + * algorithm implementation, using the below test vector. + */ + int __must_check (*func)(const struct fips_test *test, + const char *impl); + + /* The test vector, with a format specific to the type of algorithm */ + union { + struct aead_testvec aead; + struct skcipher_testvec skcipher; + struct hash_testvec hash; + struct drbg_testvec drbg; + }; +}; + +/* Maximum IV size (in bytes) among any algorithm tested here */ +#define MAX_IV_SIZE 16 + +static int __init __must_check +fips_check_result(u8 *result, const u8 *expected_result, size_t result_size, + const char *impl, const char *operation) +{ + fips140_inject_selftest_failure(impl, result); + if (memcmp(result, expected_result, result_size) != 0) { + pr_err("wrong result from %s %s\n", impl, operation); + return -EBADMSG; + } + return 0; +} + +/* + * None of the algorithms should be ASYNC, as the FIPS module doesn't register + * any ASYNC algorithms. (The ASYNC flag is only declared by hardware + * algorithms, which would need their own FIPS certification.) + * + * Ideally we would verify alg->cra_module == THIS_MODULE here as well, but that + * doesn't work because the files are compiled as built-in code. + */ +static int __init __must_check +fips_validate_alg(const struct crypto_alg *alg) +{ + if (alg->cra_flags & CRYPTO_ALG_ASYNC) { + pr_err("unexpectedly got async implementation of %s (%s)\n", + alg->cra_name, alg->cra_driver_name); + return -EINVAL; + } + return 0; +} + +static int __init __must_check +fips_handle_alloc_tfm_error(const char *impl, int err) +{ + if (err == -ENOENT) { + /* + * The requested implementation of the algorithm wasn't found. + * This is expected if the CPU lacks a feature the + * implementation needs, such as the ARMv8 Crypto Extensions. + * + * When this happens, the implementation isn't available for + * use, so we can't test it, nor do we need to. So we just skip + * the test. 
+ */ + pr_info("%s is unavailable (no CPU support?), skipping testing it\n", + impl); + return 0; + } + pr_err("failed to allocate %s tfm: %d\n", impl, err); + return err; +} + +static int __init __must_check +fips_test_aes_library(const struct fips_test *test, const char *impl) +{ + const struct skcipher_testvec *vec = &test->skcipher; + struct crypto_aes_ctx ctx; + u8 block[AES_BLOCK_SIZE]; + int err; + + if (WARN_ON(vec->message_size != AES_BLOCK_SIZE)) + return -EINVAL; + + err = aes_expandkey(&ctx, vec->key, vec->key_size); + if (err) { + pr_err("aes_expandkey() failed: %d\n", err); + return err; + } + aes_encrypt(&ctx, block, vec->plaintext); + err = fips_check_result(block, vec->ciphertext, AES_BLOCK_SIZE, + impl, "encryption"); + if (err) + return err; + aes_decrypt(&ctx, block, block); + return fips_check_result(block, vec->plaintext, AES_BLOCK_SIZE, + impl, "decryption"); +} + +/* Test a length-preserving symmetric cipher using the crypto_skcipher API. */ +static int __init __must_check +fips_test_skcipher(const struct fips_test *test, const char *impl) +{ + const struct skcipher_testvec *vec = &test->skcipher; + struct crypto_skcipher *tfm; + struct skcipher_request *req = NULL; + u8 *message = NULL; + struct scatterlist sg; + u8 iv[MAX_IV_SIZE]; + int err; + + if (WARN_ON(vec->iv_size > MAX_IV_SIZE)) + return -EINVAL; + if (WARN_ON(vec->message_size <= 0)) + return -EINVAL; + + tfm = crypto_alloc_skcipher(impl, 0, 0); + if (IS_ERR(tfm)) + return fips_handle_alloc_tfm_error(impl, PTR_ERR(tfm)); + err = fips_validate_alg(&crypto_skcipher_alg(tfm)->base); + if (err) + goto out; + if (crypto_skcipher_ivsize(tfm) != vec->iv_size) { + pr_err("%s has wrong IV size\n", impl); + err = -EINVAL; + goto out; + } + + req = skcipher_request_alloc(tfm, GFP_KERNEL); + message = kmemdup(vec->plaintext, vec->message_size, GFP_KERNEL); + if (!req || !message) { + err = -ENOMEM; + goto out; + } + sg_init_one(&sg, message, vec->message_size); + + skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, + NULL, NULL); + skcipher_request_set_crypt(req, &sg, &sg, vec->message_size, iv); + + err = crypto_skcipher_setkey(tfm, vec->key, vec->key_size); + if (err) { + pr_err("failed to set %s key: %d\n", impl, err); + goto out; + } + + /* Encrypt the plaintext, then verify the resulting ciphertext. */ + memcpy(iv, vec->iv, vec->iv_size); + err = crypto_skcipher_encrypt(req); + if (err) { + pr_err("%s encryption failed: %d\n", impl, err); + goto out; + } + err = fips_check_result(message, vec->ciphertext, vec->message_size, + impl, "encryption"); + if (err) + goto out; + + /* Decrypt the ciphertext, then verify the resulting plaintext. */ + memcpy(iv, vec->iv, vec->iv_size); + err = crypto_skcipher_decrypt(req); + if (err) { + pr_err("%s decryption failed: %d\n", impl, err); + goto out; + } + err = fips_check_result(message, vec->plaintext, vec->message_size, + impl, "decryption"); +out: + kfree(message); + skcipher_request_free(req); + crypto_free_skcipher(tfm); + return err; +} + +/* Test an AEAD using the crypto_aead API. 
*/ +static int __init __must_check +fips_test_aead(const struct fips_test *test, const char *impl) +{ + const struct aead_testvec *vec = &test->aead; + const int tag_size = vec->ciphertext_size - vec->plaintext_size; + struct crypto_aead *tfm; + struct aead_request *req = NULL; + u8 *assoc = NULL; + u8 *message = NULL; + struct scatterlist sg[2]; + int sg_idx = 0; + u8 iv[MAX_IV_SIZE]; + int err; + + if (WARN_ON(vec->iv_size > MAX_IV_SIZE)) + return -EINVAL; + if (WARN_ON(vec->ciphertext_size <= vec->plaintext_size)) + return -EINVAL; + + tfm = crypto_alloc_aead(impl, 0, 0); + if (IS_ERR(tfm)) + return fips_handle_alloc_tfm_error(impl, PTR_ERR(tfm)); + err = fips_validate_alg(&crypto_aead_alg(tfm)->base); + if (err) + goto out; + if (crypto_aead_ivsize(tfm) != vec->iv_size) { + pr_err("%s has wrong IV size\n", impl); + err = -EINVAL; + goto out; + } + + req = aead_request_alloc(tfm, GFP_KERNEL); + assoc = kmemdup(vec->assoc, vec->assoc_size, GFP_KERNEL); + message = kzalloc(vec->ciphertext_size, GFP_KERNEL); + if (!req || !assoc || !message) { + err = -ENOMEM; + goto out; + } + memcpy(message, vec->plaintext, vec->plaintext_size); + + sg_init_table(sg, ARRAY_SIZE(sg)); + if (vec->assoc_size) + sg_set_buf(&sg[sg_idx++], assoc, vec->assoc_size); + sg_set_buf(&sg[sg_idx++], message, vec->ciphertext_size); + + aead_request_set_ad(req, vec->assoc_size); + aead_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL); + + err = crypto_aead_setkey(tfm, vec->key, vec->key_size); + if (err) { + pr_err("failed to set %s key: %d\n", impl, err); + goto out; + } + + err = crypto_aead_setauthsize(tfm, tag_size); + if (err) { + pr_err("failed to set %s authentication tag size: %d\n", + impl, err); + goto out; + } + + /* + * Encrypt the plaintext, then verify the resulting ciphertext (which + * includes the authentication tag). + */ + memcpy(iv, vec->iv, vec->iv_size); + aead_request_set_crypt(req, sg, sg, vec->plaintext_size, iv); + err = crypto_aead_encrypt(req); + if (err) { + pr_err("%s encryption failed: %d\n", impl, err); + goto out; + } + err = fips_check_result(message, vec->ciphertext, vec->ciphertext_size, + impl, "encryption"); + if (err) + goto out; + + /* + * Decrypt the ciphertext (which includes the authentication tag), then + * verify the resulting plaintext. + */ + memcpy(iv, vec->iv, vec->iv_size); + aead_request_set_crypt(req, sg, sg, vec->ciphertext_size, iv); + err = crypto_aead_decrypt(req); + if (err) { + pr_err("%s decryption failed: %d\n", impl, err); + goto out; + } + err = fips_check_result(message, vec->plaintext, vec->plaintext_size, + impl, "decryption"); +out: + kfree(message); + kfree(assoc); + aead_request_free(req); + crypto_free_aead(tfm); + return err; +} + +/* + * Test a hash algorithm using the crypto_shash API. + * + * Note that we don't need to test the crypto_ahash API too, since none of the + * hash algorithms in the FIPS module have the ASYNC flag, and thus there will + * be no hash algorithms that can be accessed only through crypto_ahash. 
+ */ +static int __init __must_check +fips_test_hash(const struct fips_test *test, const char *impl) +{ + const struct hash_testvec *vec = &test->hash; + struct crypto_shash *tfm; + u8 digest[HASH_MAX_DIGESTSIZE]; + int err; + + if (WARN_ON(vec->digest_size > HASH_MAX_DIGESTSIZE)) + return -EINVAL; + + tfm = crypto_alloc_shash(impl, 0, 0); + if (IS_ERR(tfm)) + return fips_handle_alloc_tfm_error(impl, PTR_ERR(tfm)); + err = fips_validate_alg(&crypto_shash_alg(tfm)->base); + if (err) + goto out; + if (crypto_shash_digestsize(tfm) != vec->digest_size) { + pr_err("%s has wrong digest size\n", impl); + err = -EINVAL; + goto out; + } + + if (vec->key) { + err = crypto_shash_setkey(tfm, vec->key, vec->key_size); + if (err) { + pr_err("failed to set %s key: %d\n", impl, err); + goto out; + } + } + + err = crypto_shash_tfm_digest(tfm, vec->message, vec->message_size, + digest); + if (err) { + pr_err("%s digest computation failed: %d\n", impl, err); + goto out; + } + err = fips_check_result(digest, vec->digest, vec->digest_size, + impl, "digest"); +out: + crypto_free_shash(tfm); + return err; +} + +static int __init __must_check +fips_test_sha256_library(const struct fips_test *test, const char *impl) +{ + const struct hash_testvec *vec = &test->hash; + u8 digest[SHA256_DIGEST_SIZE]; + + if (WARN_ON(vec->digest_size != SHA256_DIGEST_SIZE)) + return -EINVAL; + + sha256(vec->message, vec->message_size, digest); + return fips_check_result(digest, vec->digest, vec->digest_size, + impl, "digest"); +} + +/* Test a DRBG using the crypto_rng API. */ +static int __init __must_check +fips_test_drbg(const struct fips_test *test, const char *impl) +{ + const struct drbg_testvec *vec = &test->drbg; + struct crypto_rng *rng; + u8 *output = NULL; + struct drbg_test_data test_data; + struct drbg_string addtl, pers, testentropy; + int err; + + rng = crypto_alloc_rng(impl, 0, 0); + if (IS_ERR(rng)) + return fips_handle_alloc_tfm_error(impl, PTR_ERR(rng)); + err = fips_validate_alg(&crypto_rng_alg(rng)->base); + if (err) + goto out; + + output = kzalloc(vec->out_size, GFP_KERNEL); + if (!output) { + err = -ENOMEM; + goto out; + } + + /* + * Initialize the DRBG with the entropy and personalization string given + * in the test vector. + */ + test_data.testentropy = &testentropy; + drbg_string_fill(&testentropy, vec->entropy, vec->entropy_size); + drbg_string_fill(&pers, vec->pers, vec->pers_size); + err = crypto_drbg_reset_test(rng, &pers, &test_data); + if (err) { + pr_err("failed to reset %s\n", impl); + goto out; + } + + /* + * Generate some random bytes using the additional data string provided + * in the test vector. Also use the additional entropy if provided + * (relevant for the prediction-resistant DRBG variants only). + */ + drbg_string_fill(&addtl, vec->add_a, vec->add_size); + if (vec->entpr_size) { + drbg_string_fill(&testentropy, vec->entpr_a, vec->entpr_size); + err = crypto_drbg_get_bytes_addtl_test(rng, output, + vec->out_size, &addtl, + &test_data); + } else { + err = crypto_drbg_get_bytes_addtl(rng, output, vec->out_size, + &addtl); + } + if (err) { + pr_err("failed to get bytes from %s (try 1): %d\n", + impl, err); + goto out; + } + + /* + * Do the same again, using a second additional data string, and (when + * applicable) a second additional entropy string. 
+ */ + drbg_string_fill(&addtl, vec->add_b, vec->add_size); + if (test->drbg.entpr_size) { + drbg_string_fill(&testentropy, vec->entpr_b, vec->entpr_size); + err = crypto_drbg_get_bytes_addtl_test(rng, output, + vec->out_size, &addtl, + &test_data); + } else { + err = crypto_drbg_get_bytes_addtl(rng, output, vec->out_size, + &addtl); + } + if (err) { + pr_err("failed to get bytes from %s (try 2): %d\n", + impl, err); + goto out; + } + + /* Check that the DRBG generated the expected output. */ + err = fips_check_result(output, vec->output, vec->out_size, + impl, "get_bytes"); +out: + kfree(output); + crypto_free_rng(rng); + return err; +} + +/* Include the test vectors generated by the Python script. */ +#include "fips140-generated-testvecs.h" + +/* + * List of all self-tests. Keep this in sync with fips140_algorithms[]. + * + * When possible, we have followed the FIPS 140-2 Implementation Guidance (IG) + * document when creating this list of tests. The result is intended to be a + * list of tests that is near-minimal (and thus minimizes runtime overhead) + * while complying with all requirements. For additional details, see the + * comment at the beginning of this file. + */ +static const struct fips_test fips140_selftests[] __initconst = { + /* + * Test for the AES library API. + * + * Since the AES library API may use its own AES implementation and the + * module provides no support for composing it with a mode of operation + * (it's just plain AES), we must test it directly. + * + * In contrast, we don't need to directly test the "aes" ciphers that + * are accessible through the crypto_cipher API (e.g. "aes-ce"), as they + * are covered indirectly by AES-CMAC and AES-ECB tests. + */ + { + .alg = "aes", + .impls = {"aes-lib"}, + .func = fips_test_aes_library, + .skcipher = { + .key = fips_aes_key, + .key_size = sizeof(fips_aes_key), + .plaintext = fips_message, + .ciphertext = fips_aes_ecb_ciphertext, + .message_size = 16, + } + }, + /* + * Tests for AES-CMAC, a.k.a. "cmac(aes)" in crypto API syntax. + * + * The IG requires that each underlying AES implementation be tested in + * an authenticated mode, if implemented. Of such modes, this module + * implements AES-GCM and AES-CMAC. However, AES-GCM doesn't "count" + * because this module's implementations of AES-GCM won't actually be + * FIPS-approved, due to a quirk in the FIPS requirements. + * + * Therefore, for us this requirement applies to AES-CMAC, so we must + * test the "cmac" template composed with each "aes" implementation. + * + * Separately from the above, we also must test all standalone + * implementations of "cmac(aes)" such as "cmac-aes-ce", as they don't + * reuse another full AES implementation and thus can't be covered by + * another test. + */ + { + .alg = "cmac(aes)", + .impls = { + /* "cmac" template with all "aes" implementations */ + "cmac(aes-generic)", + "cmac(aes-arm64)", + "cmac(aes-ce)", + /* All standalone implementations of "cmac(aes)" */ + "cmac-aes-neon", + "cmac-aes-ce", + }, + .func = fips_test_hash, + .hash = { + .key = fips_aes_key, + .key_size = sizeof(fips_aes_key), + .message = fips_message, + .message_size = sizeof(fips_message), + .digest = fips_aes_cmac_digest, + .digest_size = sizeof(fips_aes_cmac_digest), + } + }, + /* + * Tests for AES-ECB, a.k.a. "ecb(aes)" in crypto API syntax. + * + * The IG requires that each underlying AES implementation be tested in + * a mode that exercises the encryption direction of AES and in a mode + * that exercises the decryption direction of AES. 
CMAC only covers the + * encryption direction, so we choose ECB to test decryption. Thus, we + * test the "ecb" template composed with each "aes" implementation. + * + * Separately from the above, we also must test all standalone + * implementations of "ecb(aes)" such as "ecb-aes-ce", as they don't + * reuse another full AES implementation and thus can't be covered by + * another test. + */ + { + .alg = "ecb(aes)", + .impls = { + /* "ecb" template with all "aes" implementations */ + "ecb(aes-generic)", + "ecb(aes-arm64)", + "ecb(aes-ce)", + /* All standalone implementations of "ecb(aes)" */ + "ecb-aes-neon", + "ecb-aes-neonbs", + "ecb-aes-ce", + }, + .func = fips_test_skcipher, + .skcipher = { + .key = fips_aes_key, + .key_size = sizeof(fips_aes_key), + .plaintext = fips_message, + .ciphertext = fips_aes_ecb_ciphertext, + .message_size = sizeof(fips_message) + } + }, + /* + * Tests for AES-CBC, AES-CBC-CTS, AES-CTR, AES-XTS, and AES-GCM. + * + * According to the IG, an AES mode of operation doesn't need to have + * its own test, provided that (a) both the encryption and decryption + * directions of the underlying AES implementation are already tested + * via other mode(s), and (b) in the case of an authenticated mode, at + * least one other authenticated mode is already tested. The tests of + * the "cmac" and "ecb" templates fulfill these conditions; therefore, + * we don't need to test any other AES mode templates. + * + * This does *not* apply to standalone implementations of these modes + * such as "cbc-aes-ce", as such implementations don't reuse another + * full AES implementation and thus can't be covered by another test. + * We must test all such standalone implementations. + * + * The AES-GCM test isn't actually required, as it's expected that this + * module's AES-GCM implementation won't actually be able to be + * FIPS-approved. This is unfortunate; it's caused by the FIPS + * requirements for GCM being incompatible with GCM implementations that + * don't generate their own IVs. We choose to still include the AES-GCM + * test to keep it on par with the other FIPS-approved algorithms, in + * case it turns out that AES-GCM can be approved after all. 
+ */ + { + .alg = "cbc(aes)", + .impls = { + /* All standalone implementations of "cbc(aes)" */ + "cbc-aes-neon", + "cbc-aes-neonbs", + "cbc-aes-ce", + }, + .func = fips_test_skcipher, + .skcipher = { + .key = fips_aes_key, + .key_size = sizeof(fips_aes_key), + .iv = fips_aes_iv, + .iv_size = sizeof(fips_aes_iv), + .plaintext = fips_message, + .ciphertext = fips_aes_cbc_ciphertext, + .message_size = sizeof(fips_message), + } + }, { + .alg = "cts(cbc(aes))", + .impls = { + /* All standalone implementations of "cts(cbc(aes))" */ + "cts-cbc-aes-neon", + "cts-cbc-aes-ce", + }, + .func = fips_test_skcipher, + /* Test vector taken from RFC 3962 */ + .skcipher = { + .key = "\x63\x68\x69\x63\x6b\x65\x6e\x20" + "\x74\x65\x72\x69\x79\x61\x6b\x69", + .key_size = 16, + .iv = "\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00", + .iv_size = 16, + .plaintext = "\x49\x20\x77\x6f\x75\x6c\x64\x20" + "\x6c\x69\x6b\x65\x20\x74\x68\x65" + "\x20\x47\x65\x6e\x65\x72\x61\x6c" + "\x20\x47\x61\x75\x27\x73\x20", + .ciphertext = "\xfc\x00\x78\x3e\x0e\xfd\xb2\xc1" + "\xd4\x45\xd4\xc8\xef\xf7\xed\x22" + "\x97\x68\x72\x68\xd6\xec\xcc\xc0" + "\xc0\x7b\x25\xe2\x5e\xcf\xe5", + .message_size = 31, + } + }, { + .alg = "ctr(aes)", + .impls = { + /* All standalone implementations of "ctr(aes)" */ + "ctr-aes-neon", + "ctr-aes-neonbs", + "ctr-aes-ce", + }, + .func = fips_test_skcipher, + .skcipher = { + .key = fips_aes_key, + .key_size = sizeof(fips_aes_key), + .iv = fips_aes_iv, + .iv_size = sizeof(fips_aes_iv), + .plaintext = fips_message, + .ciphertext = fips_aes_ctr_ciphertext, + .message_size = sizeof(fips_message), + } + }, { + .alg = "xts(aes)", + .impls = { + /* All standalone implementations of "xts(aes)" */ + "xts-aes-neon", + "xts-aes-neonbs", + "xts-aes-ce", + }, + .func = fips_test_skcipher, + .skcipher = { + .key = fips_aes_xts_key, + .key_size = sizeof(fips_aes_xts_key), + .iv = fips_aes_iv, + .iv_size = sizeof(fips_aes_iv), + .plaintext = fips_message, + .ciphertext = fips_aes_xts_ciphertext, + .message_size = sizeof(fips_message), + } + }, { + .alg = "gcm(aes)", + .impls = { + /* All standalone implementations of "gcm(aes)" */ + "gcm-aes-ce", + }, + .func = fips_test_aead, + .aead = { + .key = fips_aes_key, + .key_size = sizeof(fips_aes_key), + .iv = fips_aes_iv, + /* The GCM implementations assume an IV size of 12. */ + .iv_size = 12, + .assoc = fips_aes_gcm_assoc, + .assoc_size = sizeof(fips_aes_gcm_assoc), + .plaintext = fips_message, + .plaintext_size = sizeof(fips_message), + .ciphertext = fips_aes_gcm_ciphertext, + .ciphertext_size = sizeof(fips_aes_gcm_ciphertext), + } + }, + + /* Tests for SHA-1 */ + { + .alg = "sha1", + .impls = { + /* All implementations of "sha1" */ + "sha1-generic", + "sha1-ce" + }, + .func = fips_test_hash, + .hash = { + .message = fips_message, + .message_size = sizeof(fips_message), + .digest = fips_sha1_digest, + .digest_size = sizeof(fips_sha1_digest) + } + }, + /* + * Tests for all SHA-256 implementations other than the sha256() library + * function. As per the IG, these tests also fulfill the tests for the + * corresponding SHA-224 implementations. + */ + { + .alg = "sha256", + .impls = { + /* All implementations of "sha256" */ + "sha256-generic", + "sha256-arm64", + "sha256-ce", + }, + .func = fips_test_hash, + .hash = { + .message = fips_message, + .message_size = sizeof(fips_message), + .digest = fips_sha256_digest, + .digest_size = sizeof(fips_sha256_digest) + } + }, + /* + * Test for the sha256() library function. 
This must be tested + * separately because it may use its own SHA-256 implementation. + */ + { + .alg = "sha256", + .impls = {"sha256-lib"}, + .func = fips_test_sha256_library, + .hash = { + .message = fips_message, + .message_size = sizeof(fips_message), + .digest = fips_sha256_digest, + .digest_size = sizeof(fips_sha256_digest) + } + }, + /* + * Tests for all SHA-512 implementations. As per the IG, these tests + * also fulfill the tests for the corresponding SHA-384 implementations. + */ + { + .alg = "sha512", + .impls = { + /* All implementations of "sha512" */ + "sha512-generic", + "sha512-arm64", + "sha512-ce", + }, + .func = fips_test_hash, + .hash = { + .message = fips_message, + .message_size = sizeof(fips_message), + .digest = fips_sha512_digest, + .digest_size = sizeof(fips_sha512_digest) + } + }, + /* + * Test for HMAC. As per the IG, only one HMAC test is required, + * provided that the same HMAC code is shared by all HMAC-SHA*. This is + * true in our case. We choose HMAC-SHA256 for the test. + * + * Note that as per the IG, this can fulfill the test for the underlying + * SHA. However, we don't currently rely on this. + */ + { + .alg = "hmac(sha256)", + .func = fips_test_hash, + .hash = { + .key = fips_hmac_key, + .key_size = sizeof(fips_hmac_key), + .message = fips_message, + .message_size = sizeof(fips_message), + .digest = fips_hmac_sha256_digest, + .digest_size = sizeof(fips_hmac_sha256_digest) + } + }, + /* + * Known-answer tests for the SP800-90A DRBG algorithms. + * + * These test vectors were manually extracted from + * https://csrc.nist.gov/CSRC/media/Projects/Cryptographic-Algorithm-Validation-Program/documents/drbg/drbgtestvectors.zip. + * + * The selection of these tests follows the FIPS 140-2 IG as well as + * Section 11 of SP800-90A: + * + * - We must test all DRBG types (HMAC, Hash, and CTR) that the module + * implements. However, currently the module only implements + * HMAC_DRBG (since CONFIG_CRYPTO_DRBG_CTR and CONFIG_CRYPTO_DRBG_HASH + * aren't enabled). Therefore, we only need to test HMAC_DRBG. + * + * - We only need to test one HMAC variant. + * + * - We must test all DRBG operations: Instantiate(), Reseed(), and + * Generate(). However, a single test sequence with a single output + * comparison may cover all three operations, and this is what we do. + * Note that Reseed() happens implicitly via the use of the additional + * input and also via the use of prediction resistance when enabled. + * + * - The personalization string, additional input, and prediction + * resistance support must be tested. Therefore we have chosen test + * vectors that have a nonempty personalization string and nonempty + * additional input, and we test the prediction-resistant variant. + * Testing the non-prediction-resistant variant is not required. 
+ */ + { + .alg = "drbg_pr_hmac_sha256", + .func = fips_test_drbg, + .drbg = { + .entropy = + "\xc7\xcc\xbc\x67\x7e\x21\x66\x1e\x27\x2b\x63\xdd" + "\x3a\x78\xdc\xdf\x66\x6d\x3f\x24\xae\xcf\x37\x01" + "\xa9\x0d\x89\x8a\xa7\xdc\x81\x58\xae\xb2\x10\x15" + "\x7e\x18\x44\x6d\x13\xea\xdf\x37\x85\xfe\x81\xfb", + .entropy_size = 48, + .entpr_a = + "\x7b\xa1\x91\x5b\x3c\x04\xc4\x1b\x1d\x19\x2f\x1a" + "\x18\x81\x60\x3c\x6c\x62\x91\xb7\xe9\xf5\xcb\x96" + "\xbb\x81\x6a\xcc\xb5\xae\x55\xb6", + .entpr_b = + "\x99\x2c\xc7\x78\x7e\x3b\x88\x12\xef\xbe\xd3\xd2" + "\x7d\x2a\xa5\x86\xda\x8d\x58\x73\x4a\x0a\xb2\x2e" + "\xbb\x4c\x7e\xe3\x9a\xb6\x81\xc1", + .entpr_size = 32, + .output = + "\x95\x6f\x95\xfc\x3b\xb7\xfe\x3e\xd0\x4e\x1a\x14" + "\x6c\x34\x7f\x7b\x1d\x0d\x63\x5e\x48\x9c\x69\xe6" + "\x46\x07\xd2\x87\xf3\x86\x52\x3d\x98\x27\x5e\xd7" + "\x54\xe7\x75\x50\x4f\xfb\x4d\xfd\xac\x2f\x4b\x77" + "\xcf\x9e\x8e\xcc\x16\xa2\x24\xcd\x53\xde\x3e\xc5" + "\x55\x5d\xd5\x26\x3f\x89\xdf\xca\x8b\x4e\x1e\xb6" + "\x88\x78\x63\x5c\xa2\x63\x98\x4e\x6f\x25\x59\xb1" + "\x5f\x2b\x23\xb0\x4b\xa5\x18\x5d\xc2\x15\x74\x40" + "\x59\x4c\xb4\x1e\xcf\x9a\x36\xfd\x43\xe2\x03\xb8" + "\x59\x91\x30\x89\x2a\xc8\x5a\x43\x23\x7c\x73\x72" + "\xda\x3f\xad\x2b\xba\x00\x6b\xd1", + .out_size = 128, + .add_a = + "\x18\xe8\x17\xff\xef\x39\xc7\x41\x5c\x73\x03\x03" + "\xf6\x3d\xe8\x5f\xc8\xab\xe4\xab\x0f\xad\xe8\xd6" + "\x86\x88\x55\x28\xc1\x69\xdd\x76", + .add_b = + "\xac\x07\xfc\xbe\x87\x0e\xd3\xea\x1f\x7e\xb8\xe7" + "\x9d\xec\xe8\xe7\xbc\xf3\x18\x25\x77\x35\x4a\xaa" + "\x00\x99\x2a\xdd\x0a\x00\x50\x82", + .add_size = 32, + .pers = + "\xbc\x55\xab\x3c\xf6\x52\xb0\x11\x3d\x7b\x90\xb8" + "\x24\xc9\x26\x4e\x5a\x1e\x77\x0d\x3d\x58\x4a\xda" + "\xd1\x81\xe9\xf8\xeb\x30\x8f\x6f", + .pers_size = 32, + } + } +}; + +static int __init __must_check +fips_run_test(const struct fips_test *test) +{ + int i; + int err; + + /* + * If no implementations were specified, then just test the default one. + * Otherwise, test the specified list of implementations. + */ + + if (test->impls[0] == NULL) { + err = test->func(test, test->alg); + if (err) + pr_emerg("self-tests failed for algorithm %s: %d\n", + test->alg, err); + return err; + } + + for (i = 0; i < ARRAY_SIZE(test->impls) && test->impls[i] != NULL; + i++) { + err = test->func(test, test->impls[i]); + if (err) { + pr_emerg("self-tests failed for algorithm %s, implementation %s: %d\n", + test->alg, test->impls[i], err); + return err; + } + } + return 0; +} + +bool __init fips140_run_selftests(void) +{ + int i; + + pr_info("running self-tests\n"); + for (i = 0; i < ARRAY_SIZE(fips140_selftests); i++) { + if (fips_run_test(&fips140_selftests[i]) != 0) { + /* The caller is responsible for calling panic(). */ + return false; + } + } + pr_info("all self-tests passed\n"); + return true; +} diff --git a/crypto/fips140_gen_hmac.c b/crypto/fips140_gen_hmac.c new file mode 100644 index 000000000000..69f754d38a1d --- /dev/null +++ b/crypto/fips140_gen_hmac.c @@ -0,0 +1,194 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2021 - Google LLC + * Author: Ard Biesheuvel + * + * This is a host tool that is intended to be used to take the HMAC digest of + * the .text and .rodata sections of the fips140.ko module, and store it inside + * the module. The module will perform an integrity selfcheck at module_init() + * time, by recalculating the digest and comparing it with the value calculated + * here. 
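+ *
+ * As a usage illustration only (the exact build-system integration is not
+ * shown here), the tool takes the module file as its single argument and
+ * rewrites the stored digest in place, e.g.:
+ *
+ *	./fips140_gen_hmac fips140.ko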
+ * + * Note that the peculiar way an HMAC is being used as a digest with a public + * key rather than as a symmetric key signature is mandated by FIPS 140-2. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +static Elf64_Ehdr *ehdr; +static Elf64_Shdr *shdr; +static int num_shdr; +static const char *strtab, *shstrtab; +static Elf64_Sym *syms; +static int num_syms; + +static Elf64_Shdr *find_symtab_section(void) +{ + int i; + + for (i = 0; i < num_shdr; i++) + if (shdr[i].sh_type == SHT_SYMTAB) + return &shdr[i]; + return NULL; +} + +static int get_section_idx(const char *name) +{ + int i; + + for (i = 0; i < num_shdr; i++) + if (!strcmp(shstrtab + shdr[i].sh_name, name)) + return i; + return -1; +} + +static int get_sym_idx(const char *sym_name) +{ + int i; + + for (i = 0; i < num_syms; i++) + if (!strcmp(strtab + syms[i].st_name, sym_name)) + return i; + return -1; +} + +static void *get_sym_addr(const char *sym_name) +{ + int i = get_sym_idx(sym_name); + + if (i >= 0) + return (void *)ehdr + shdr[syms[i].st_shndx].sh_offset + + syms[i].st_value; + return NULL; +} + +static int update_rela_ref(const char *name) +{ + /* + * We need to do a couple of things to ensure that the copied RELA data + * is accessible to the module itself at module init time: + * - the associated entry in the symbol table needs to refer to the + * correct section index, and have SECTION type and GLOBAL linkage. + * - the 'count' global variable in the module need to be set to the + * right value based on the size of the RELA section. + */ + unsigned int *size_var; + int sec_idx, sym_idx; + char str[32]; + + sprintf(str, "fips140_rela_%s", name); + size_var = get_sym_addr(str); + if (!size_var) { + printf("variable '%s' not found, disregarding .%s section\n", + str, name); + return 1; + } + + sprintf(str, "__sec_rela_%s", name); + sym_idx = get_sym_idx(str); + + sprintf(str, ".init.rela.%s", name); + sec_idx = get_section_idx(str); + + if (sec_idx < 0 || sym_idx < 0) { + fprintf(stderr, "failed to locate metadata for .%s section in binary\n", + name); + return 0; + } + + syms[sym_idx].st_shndx = sec_idx; + syms[sym_idx].st_info = (STB_GLOBAL << 4) | STT_SECTION; + + size_var[1] = shdr[sec_idx].sh_size / sizeof(Elf64_Rela); + + return 1; +} + +static void hmac_section(HMAC_CTX *hmac, const char *start, const char *end) +{ + void *start_addr = get_sym_addr(start); + void *end_addr = get_sym_addr(end); + + HMAC_Update(hmac, start_addr, end_addr - start_addr); +} + +int main(int argc, char **argv) +{ + Elf64_Shdr *symtab_shdr; + const char *hmac_key; + unsigned char *dg; + unsigned int dglen; + struct stat stat; + HMAC_CTX *hmac; + int fd, ret; + + if (argc < 2) { + fprintf(stderr, "file argument missing\n"); + exit(EXIT_FAILURE); + } + + fd = open(argv[1], O_RDWR); + if (fd < 0) { + fprintf(stderr, "failed to open %s\n", argv[1]); + exit(EXIT_FAILURE); + } + + ret = fstat(fd, &stat); + if (ret < 0) { + fprintf(stderr, "failed to stat() %s\n", argv[1]); + exit(EXIT_FAILURE); + } + + ehdr = mmap(0, stat.st_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if (ehdr == MAP_FAILED) { + fprintf(stderr, "failed to mmap() %s\n", argv[1]); + exit(EXIT_FAILURE); + } + + shdr = (void *)ehdr + ehdr->e_shoff; + num_shdr = ehdr->e_shnum; + + symtab_shdr = find_symtab_section(); + + syms = (void *)ehdr + symtab_shdr->sh_offset; + num_syms = symtab_shdr->sh_size / sizeof(Elf64_Sym); + + strtab = (void *)ehdr + shdr[symtab_shdr->sh_link].sh_offset; + shstrtab = (void 
*)ehdr + shdr[ehdr->e_shstrndx].sh_offset; + + if (!update_rela_ref("text") || !update_rela_ref("rodata")) + exit(EXIT_FAILURE); + + hmac_key = get_sym_addr("fips140_integ_hmac_key"); + if (!hmac_key) { + fprintf(stderr, "failed to locate HMAC key in binary\n"); + exit(EXIT_FAILURE); + } + + dg = get_sym_addr("fips140_integ_hmac_digest"); + if (!dg) { + fprintf(stderr, "failed to locate HMAC digest in binary\n"); + exit(EXIT_FAILURE); + } + + hmac = HMAC_CTX_new(); + HMAC_Init_ex(hmac, hmac_key, strlen(hmac_key), EVP_sha256(), NULL); + + hmac_section(hmac, "__fips140_text_start", "__fips140_text_end"); + hmac_section(hmac, "__fips140_rodata_start", "__fips140_rodata_end"); + + HMAC_Final(hmac, dg, &dglen); + + close(fd); + return 0; +} diff --git a/tools/crypto/gen_fips140_testvecs.py b/tools/crypto/gen_fips140_testvecs.py new file mode 100755 index 000000000000..825c4872235a --- /dev/null +++ b/tools/crypto/gen_fips140_testvecs.py @@ -0,0 +1,125 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0-only +# +# Copyright 2021 Google LLC +# +# Generate most of the test vectors for the FIPS 140 cryptographic self-tests. +# +# Usage: +# tools/crypto/gen_fips140_testvecs.py > crypto/fips140-generated-testvecs.h +# +# Prerequisites: +# Debian: apt-get install python3-pycryptodome python3-cryptography +# Arch Linux: pacman -S python-pycryptodomex python-cryptography + +import hashlib +import hmac +import os + +import Cryptodome.Cipher.AES +import Cryptodome.Util.Counter + +import cryptography.hazmat.primitives.ciphers +import cryptography.hazmat.primitives.ciphers.algorithms +import cryptography.hazmat.primitives.ciphers.modes + +scriptname = os.path.basename(__file__) + +message = bytes('This is a 32-byte test message.\0', 'ascii') +aes_key = bytes('128-bit AES key\0', 'ascii') +aes_xts_key = bytes('This is an AES-128-XTS key.\0\0\0\0\0', 'ascii') +aes_iv = bytes('ABCDEFGHIJKLMNOP', 'ascii') +assoc = bytes('associated data string', 'ascii') +hmac_key = bytes('128-bit HMAC key', 'ascii') + +def warn_generated(): + print(f'''/* + * This header was automatically generated by {scriptname}. + * Don't edit it directly. 
+ */''') + +def is_string_value(value): + return (value.isascii() and + all(c == '\x00' or c.isprintable() for c in str(value, 'ascii'))) + +def format_value(value, is_string): + if is_string: + return value + hexstr = '' + for byte in value: + hexstr += f'\\x{byte:02x}' + return hexstr + +def print_value(name, value): + is_string = is_string_value(value) + hdr = f'static const u8 fips_{name}[{len(value)}] __initconst =' + print(hdr, end='') + if is_string: + value = str(value, 'ascii').rstrip('\x00') + chars_per_byte = 1 + else: + chars_per_byte = 4 + bytes_per_line = 64 // chars_per_byte + + if len(hdr) + (chars_per_byte * len(value)) + 4 <= 80: + print(f' "{format_value(value, is_string)}"', end='') + else: + for chunk in [value[i:i+bytes_per_line] + for i in range(0, len(value), bytes_per_line)]: + print(f'\n\t"{format_value(chunk, is_string)}"', end='') + print(';') + print('') + +def generate_aes_testvecs(): + print_value('aes_key', aes_key) + print_value('aes_iv', aes_iv) + + cbc = Cryptodome.Cipher.AES.new(aes_key, Cryptodome.Cipher.AES.MODE_CBC, + iv=aes_iv) + print_value('aes_cbc_ciphertext', cbc.encrypt(message)) + + ecb = Cryptodome.Cipher.AES.new(aes_key, Cryptodome.Cipher.AES.MODE_ECB) + print_value('aes_ecb_ciphertext', ecb.encrypt(message)) + + ctr = Cryptodome.Cipher.AES.new(aes_key, Cryptodome.Cipher.AES.MODE_CTR, + nonce=bytes(), initial_value=aes_iv) + print_value('aes_ctr_ciphertext', ctr.encrypt(message)) + + print_value('aes_gcm_assoc', assoc) + gcm = Cryptodome.Cipher.AES.new(aes_key, Cryptodome.Cipher.AES.MODE_GCM, + nonce=aes_iv[:12], mac_len=16) + gcm.update(assoc) + raw_ciphertext, tag = gcm.encrypt_and_digest(message) + print_value('aes_gcm_ciphertext', raw_ciphertext + tag) + + # Unfortunately, pycryptodome doesn't support XTS, so for it we need to use + # a different Python package (the "cryptography" package). + print_value('aes_xts_key', aes_xts_key) + xts = cryptography.hazmat.primitives.ciphers.Cipher( + cryptography.hazmat.primitives.ciphers.algorithms.AES(aes_xts_key), + cryptography.hazmat.primitives.ciphers.modes.XTS(aes_iv)).encryptor() + ciphertext = xts.update(message) + xts.finalize() + print_value('aes_xts_ciphertext', ciphertext) + + cmac = Cryptodome.Hash.CMAC.new(aes_key, ciphermod=Cryptodome.Cipher.AES) + cmac.update(message) + print_value('aes_cmac_digest', cmac.digest()) + +def generate_sha_testvecs(): + print_value('hmac_key', hmac_key) + for alg in ['sha1', 'sha256', 'hmac_sha256', 'sha512']: + if alg.startswith('hmac_'): + h = hmac.new(hmac_key, message, alg.removeprefix('hmac_')) + else: + h = hashlib.new(alg, message) + print_value(f'{alg}_digest', h.digest()) + +print('/* SPDX-License-Identifier: GPL-2.0-only */') +print('/* Copyright 2021 Google LLC */') +print('') +warn_generated() +print('') +print_value('message', message) +generate_aes_testvecs() +generate_sha_testvecs() +warn_generated() From 41d708af25a9fff133a94d80a35f71f2f0d26a88 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 18 Nov 2021 16:09:11 -0800 Subject: [PATCH 440/457] ANDROID: fips140: add fips140_lab_util program Add a sample program that supports various tests that the FIPS certification lab is required to do on fips140.ko. To do its work it uses AF_ALG, as well as the /dev/fips140 device node provided by a build of fips140.ko with CONFIG_CRYPTO_FIPS140_MOD_EVAL_TESTING enabled. 
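For illustration, here are a couple of hypothetical invocations (after
building and pushing the program to the device as described in its header
comment):

  adb shell /data/local/tmp/fips140_lab_util show_module_version
  adb shell /data/local/tmp/fips140_lab_util show_service_indicators sha256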
Original commits: android12-5.10: 109f31ac23f5 ("ANDROID: fips140: add userspace interface for evaluation testing") a481d4352121 ("ANDROID: fips140: refactor and rename fips140_lab_test") 3a624c9ccdd7 ("ANDROID: fips140: add show_invalid_inputs command to fips140_lab_util") fe60669d0308 ("ANDROID: fips140: add dump_jitterentropy command to fips140_lab_util") Bug: 153614920 Bug: 188620248 Change-Id: Ide1875f39d439c3955d03a5f41160382544d47bd Signed-off-by: Eric Biggers --- samples/crypto/fips140_lab_util.c | 638 ++++++++++++++++++++++++++++++ 1 file changed, 638 insertions(+) create mode 100644 samples/crypto/fips140_lab_util.c diff --git a/samples/crypto/fips140_lab_util.c b/samples/crypto/fips140_lab_util.c new file mode 100644 index 000000000000..5f8e9018013a --- /dev/null +++ b/samples/crypto/fips140_lab_util.c @@ -0,0 +1,638 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2021 Google LLC + * + * This program provides commands that dump certain types of output from the + * fips140 kernel module, as required by the FIPS lab for evaluation purposes. + * + * While the fips140 kernel module can only be accessed directly by other kernel + * code, an easy-to-use userspace utility program was desired for lab testing. + * When possible, this program uses AF_ALG to access the crypto algorithms; this + * requires that the kernel has AF_ALG enabled. Where AF_ALG isn't sufficient, + * a custom device node /dev/fips140 is used instead; this requires that the + * fips140 module is loaded and has evaluation testing support compiled in. + * + * This program can be compiled and run on an Android device as follows: + * + * NDK_DIR=$HOME/android-ndk-r23b # adjust directory path as needed + * $NDK_DIR/toolchains/llvm/prebuilt/linux-x86_64/bin/aarch64-linux-android31-clang \ + * fips140_lab_util.c -O2 -Wall -o fips140_lab_util + * adb push fips140_lab_util /data/local/tmp/ + * adb root + * adb shell /data/local/tmp/fips140_lab_util + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../../crypto/fips140-eval-testing-uapi.h" + +/* --------------------------------------------------------------------------- + * Utility functions + * ---------------------------------------------------------------------------*/ + +#define ARRAY_SIZE(A) (sizeof(A) / sizeof((A)[0])) +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +#define MAX(a, b) ((a) > (b) ? (a) : (b)) + +static void __attribute__((noreturn)) +do_die(const char *format, va_list va, int err) +{ + fputs("ERROR: ", stderr); + vfprintf(stderr, format, va); + if (err) + fprintf(stderr, ": %s", strerror(err)); + putc('\n', stderr); + exit(1); +} + +static void __attribute__((noreturn, format(printf, 1, 2))) +die_errno(const char *format, ...) +{ + va_list va; + + va_start(va, format); + do_die(format, va, errno); + va_end(va); +} + +static void __attribute__((noreturn, format(printf, 1, 2))) +die(const char *format, ...) +{ + va_list va; + + va_start(va, format); + do_die(format, va, 0); + va_end(va); +} + +static void __attribute__((noreturn)) +assertion_failed(const char *expr, const char *file, int line) +{ + die("Assertion failed: %s at %s:%d", expr, file, line); +} + +#define ASSERT(e) ({ if (!(e)) assertion_failed(#e, __FILE__, __LINE__); }) + +static void rand_bytes(uint8_t *bytes, size_t count) +{ + size_t i; + + for (i = 0; i < count; i++) + bytes[i] = rand(); +} + +static const char *booltostr(bool b) +{ + return b ? 
"true" : "false"; +} + +static const char *bytes_to_hex(const uint8_t *bytes, size_t count) +{ + static char hex[1025]; + size_t i; + + ASSERT(count <= 512); + for (i = 0; i < count; i++) + sprintf(&hex[2*i], "%02x", bytes[i]); + return hex; +} + +static void full_write(int fd, const void *buf, size_t count) +{ + while (count) { + ssize_t ret = write(fd, buf, count); + + if (ret < 0) + die_errno("write failed"); + buf += ret; + count -= ret; + } +} + +enum { + OPT_AMOUNT, + OPT_ITERATIONS, +}; + +static void usage(void); + +/* --------------------------------------------------------------------------- + * /dev/fips140 ioctls + * ---------------------------------------------------------------------------*/ + +static int get_fips140_device_number(void) +{ + FILE *f; + char line[128]; + int number; + char name[32]; + + f = fopen("/proc/devices", "r"); + if (!f) + die_errno("Failed to open /proc/devices"); + while (fgets(line, sizeof(line), f)) { + if (sscanf(line, "%d %31s", &number, name) == 2 && + strcmp(name, "fips140") == 0) + return number; + } + fclose(f); + die("fips140 device node is unavailable.\n" +"The fips140 device node is only available when the fips140 module is loaded\n" +"and has been built with evaluation testing support."); +} + +static void create_fips140_node_if_needed(void) +{ + struct stat stbuf; + int major; + + if (stat("/dev/fips140", &stbuf) == 0) + return; + + major = get_fips140_device_number(); + if (mknod("/dev/fips140", S_IFCHR | 0600, makedev(major, 1)) != 0) + die_errno("Failed to create fips140 device node"); +} + +static int fips140_dev_fd = -1; + +static int fips140_ioctl(int cmd, const void *arg) +{ + if (fips140_dev_fd < 0) { + create_fips140_node_if_needed(); + fips140_dev_fd = open("/dev/fips140", O_RDONLY); + if (fips140_dev_fd < 0) + die_errno("Failed to open /dev/fips140"); + } + return ioctl(fips140_dev_fd, cmd, arg); +} + +static bool fips140_is_approved_service(const char *name) +{ + int ret = fips140_ioctl(FIPS140_IOCTL_IS_APPROVED_SERVICE, name); + + if (ret < 0) + die_errno("FIPS140_IOCTL_IS_APPROVED_SERVICE unexpectedly failed"); + if (ret == 1) + return true; + if (ret == 0) + return false; + die("FIPS140_IOCTL_IS_APPROVED_SERVICE returned unexpected value %d", + ret); +} + +static const char *fips140_module_version(void) +{ + static char buf[256]; + int ret; + + memset(buf, 0, sizeof(buf)); + ret = fips140_ioctl(FIPS140_IOCTL_MODULE_VERSION, buf); + if (ret < 0) + die_errno("FIPS140_IOCTL_MODULE_VERSION unexpectedly failed"); + if (ret != 0) + die("FIPS140_IOCTL_MODULE_VERSION returned unexpected value %d", + ret); + return buf; +} + +/* --------------------------------------------------------------------------- + * AF_ALG utilities + * ---------------------------------------------------------------------------*/ + +#define AF_ALG_MAX_RNG_REQUEST_SIZE 128 + +static int get_alg_fd(const char *alg_type, const char *alg_name) +{ + struct sockaddr_alg addr = {}; + int alg_fd; + + alg_fd = socket(AF_ALG, SOCK_SEQPACKET, 0); + if (alg_fd < 0) + die("Failed to create AF_ALG socket.\n" +"AF_ALG is only available when it has been enabled in the kernel.\n"); + + strncpy((char *)addr.salg_type, alg_type, sizeof(addr.salg_type) - 1); + strncpy((char *)addr.salg_name, alg_name, sizeof(addr.salg_name) - 1); + + if (bind(alg_fd, (void *)&addr, sizeof(addr)) != 0) + die_errno("Failed to bind AF_ALG socket to %s %s", + alg_type, alg_name); + return alg_fd; +} + +static int get_req_fd(int alg_fd, const char *alg_name) +{ + int req_fd = accept(alg_fd, NULL, 
NULL); + + if (req_fd < 0) + die_errno("Failed to get request file descriptor for %s", + alg_name); + return req_fd; +} + +/* --------------------------------------------------------------------------- + * dump_jitterentropy command + * ---------------------------------------------------------------------------*/ + +static void dump_from_jent_fd(int fd, size_t count) +{ + uint8_t buf[AF_ALG_MAX_RNG_REQUEST_SIZE]; + + while (count) { + ssize_t ret; + + memset(buf, 0, sizeof(buf)); + ret = read(fd, buf, MIN(count, sizeof(buf))); + if (ret < 0) + die_errno("error reading from jitterentropy_rng"); + full_write(STDOUT_FILENO, buf, ret); + count -= ret; + } +} + +static int cmd_dump_jitterentropy(int argc, char *argv[]) +{ + static const struct option longopts[] = { + { "amount", required_argument, NULL, OPT_AMOUNT }, + { "iterations", required_argument, NULL, OPT_ITERATIONS }, + { NULL, 0, NULL, 0 }, + }; + size_t amount = 128; + size_t iterations = 1; + size_t i; + int c; + + while ((c = getopt_long(argc, argv, "", longopts, NULL)) != -1) { + switch (c) { + case OPT_AMOUNT: + amount = strtoul(optarg, NULL, 0); + if (amount <= 0 || amount >= ULONG_MAX) + die("invalid argument to --amount"); + break; + case OPT_ITERATIONS: + iterations = strtoul(optarg, NULL, 0); + if (iterations <= 0 || iterations >= ULONG_MAX) + die("invalid argument to --iterations"); + break; + default: + usage(); + return 1; + } + } + + for (i = 0; i < iterations; i++) { + int alg_fd = get_alg_fd("rng", "jitterentropy_rng"); + int req_fd = get_req_fd(alg_fd, "jitterentropy_rng"); + + dump_from_jent_fd(req_fd, amount); + + close(req_fd); + close(alg_fd); + } + return 0; +} + +/* --------------------------------------------------------------------------- + * show_invalid_inputs command + * ---------------------------------------------------------------------------*/ + +enum direction { + UNSPECIFIED, + DECRYPT, + ENCRYPT, +}; + +static const struct invalid_input_test { + const char *alg_type; + const char *alg_name; + const char *key; + size_t key_size; + const char *msg; + size_t msg_size; + const char *iv; + size_t iv_size; + enum direction direction; + int setkey_error; + int crypt_error; +} invalid_input_tests[] = { + { + .alg_type = "skcipher", + .alg_name = "cbc(aes)", + .key_size = 16, + }, { + .alg_type = "skcipher", + .alg_name = "cbc(aes)", + .key_size = 17, + .setkey_error = EINVAL, + }, { + .alg_type = "skcipher", + .alg_name = "cbc(aes)", + .key_size = 24, + }, { + .alg_type = "skcipher", + .alg_name = "cbc(aes)", + .key_size = 32, + }, { + .alg_type = "skcipher", + .alg_name = "cbc(aes)", + .key_size = 33, + .setkey_error = EINVAL, + }, { + .alg_type = "skcipher", + .alg_name = "cbc(aes)", + .key_size = 16, + .msg_size = 1, + .direction = DECRYPT, + .crypt_error = EINVAL, + }, { + .alg_type = "skcipher", + .alg_name = "cbc(aes)", + .key_size = 16, + .msg_size = 16, + .direction = ENCRYPT, + }, { + .alg_type = "skcipher", + .alg_name = "cbc(aes)", + .key_size = 16, + .msg_size = 17, + .direction = ENCRYPT, + .crypt_error = EINVAL, + }, { + .alg_type = "hash", + .alg_name = "cmac(aes)", + .key_size = 29, + .setkey_error = EINVAL, + }, { + .alg_type = "skcipher", + .alg_name = "xts(aes)", + .key_size = 32, + }, { + .alg_type = "skcipher", + .alg_name = "xts(aes)", + .key = "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0" + "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0", + .key_size = 32, + .setkey_error = EINVAL, + } +}; + +static const char *describe_crypt_op(const struct invalid_input_test *t) +{ + if (t->direction == ENCRYPT) + return 
"encryption"; + if (t->direction == DECRYPT) + return "decryption"; + if (strcmp(t->alg_type, "hash") == 0) + return "hashing"; + ASSERT(0); +} + +static bool af_alg_setkey(const struct invalid_input_test *t, int alg_fd) +{ + const uint8_t *key = (const uint8_t *)t->key; + uint8_t _key[t->key_size]; + + if (t->key_size == 0) + return true; + + if (t->key == NULL) { + rand_bytes(_key, t->key_size); + key = _key; + } + if (setsockopt(alg_fd, SOL_ALG, ALG_SET_KEY, key, t->key_size) != 0) { + printf("%s: setting %zu-byte key failed with error '%s'\n", + t->alg_name, t->key_size, strerror(errno)); + printf("\tkey was %s\n\n", bytes_to_hex(key, t->key_size)); + ASSERT(t->setkey_error == errno); + return false; + } + printf("%s: setting %zu-byte key succeeded\n", + t->alg_name, t->key_size); + printf("\tkey was %s\n\n", bytes_to_hex(key, t->key_size)); + ASSERT(t->setkey_error == 0); + return true; +} + +static void af_alg_process_msg(const struct invalid_input_test *t, int alg_fd) +{ + struct iovec iov; + struct msghdr hdr = { + .msg_iov = &iov, + .msg_iovlen = 1, + }; + const uint8_t *msg = (const uint8_t *)t->msg; + uint8_t *_msg = NULL; + uint8_t *output = NULL; + uint8_t *control = NULL; + size_t controllen = 0; + struct cmsghdr *cmsg; + int req_fd; + + if (t->msg_size == 0) + return; + + req_fd = get_req_fd(alg_fd, t->alg_name); + + if (t->msg == NULL) { + _msg = malloc(t->msg_size); + rand_bytes(_msg, t->msg_size); + msg = _msg; + } + output = malloc(t->msg_size); + iov.iov_base = (void *)msg; + iov.iov_len = t->msg_size; + + if (t->direction != UNSPECIFIED) + controllen += CMSG_SPACE(sizeof(uint32_t)); + if (t->iv_size) + controllen += CMSG_SPACE(sizeof(struct af_alg_iv) + t->iv_size); + control = calloc(1, controllen); + hdr.msg_control = control; + hdr.msg_controllen = controllen; + cmsg = CMSG_FIRSTHDR(&hdr); + if (t->direction != UNSPECIFIED) { + cmsg->cmsg_level = SOL_ALG; + cmsg->cmsg_type = ALG_SET_OP; + cmsg->cmsg_len = CMSG_LEN(sizeof(uint32_t)); + *(uint32_t *)CMSG_DATA(cmsg) = t->direction == DECRYPT ? 
+ ALG_OP_DECRYPT : ALG_OP_ENCRYPT; + cmsg = CMSG_NXTHDR(&hdr, cmsg); + } + if (t->iv_size) { + struct af_alg_iv *alg_iv; + + cmsg->cmsg_level = SOL_ALG; + cmsg->cmsg_type = ALG_SET_IV; + cmsg->cmsg_len = CMSG_LEN(sizeof(*alg_iv) + t->iv_size); + alg_iv = (struct af_alg_iv *)CMSG_DATA(cmsg); + alg_iv->ivlen = t->iv_size; + memcpy(alg_iv->iv, t->iv, t->iv_size); + } + + if (sendmsg(req_fd, &hdr, 0) != t->msg_size) + die_errno("sendmsg failed"); + + if (read(req_fd, output, t->msg_size) != t->msg_size) { + printf("%s: %s of %zu-byte message failed with error '%s'\n", + t->alg_name, describe_crypt_op(t), t->msg_size, + strerror(errno)); + printf("\tmessage was %s\n\n", bytes_to_hex(msg, t->msg_size)); + ASSERT(t->crypt_error == errno); + } else { + printf("%s: %s of %zu-byte message succeeded\n", + t->alg_name, describe_crypt_op(t), t->msg_size); + printf("\tmessage was %s\n\n", bytes_to_hex(msg, t->msg_size)); + ASSERT(t->crypt_error == 0); + } + free(_msg); + free(output); + free(control); + close(req_fd); +} + +static void test_invalid_input(const struct invalid_input_test *t) +{ + int alg_fd = get_alg_fd(t->alg_type, t->alg_name); + + if (af_alg_setkey(t, alg_fd)) + af_alg_process_msg(t, alg_fd); + + close(alg_fd); +} + +static int cmd_show_invalid_inputs(int argc, char *argv[]) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(invalid_input_tests); i++) + test_invalid_input(&invalid_input_tests[i]); + return 0; +} + +/* --------------------------------------------------------------------------- + * show_module_version command + * ---------------------------------------------------------------------------*/ + +static int cmd_show_module_version(int argc, char *argv[]) +{ + printf("fips140_module_version() => \"%s\"\n", + fips140_module_version()); + return 0; +} + +/* --------------------------------------------------------------------------- + * show_service_indicators command + * ---------------------------------------------------------------------------*/ + +static const char * const default_services_to_show[] = { + "aes", + "cbc(aes)", + "cbcmac(aes)", + "cmac(aes)", + "ctr(aes)", + "cts(cbc(aes))", + "ecb(aes)", + "essiv(cbc(aes),sha256)", + "gcm(aes)", + "hmac(sha1)", + "hmac(sha224)", + "hmac(sha256)", + "hmac(sha384)", + "hmac(sha512)", + "jitterentropy_rng", + "sha1", + "sha224", + "sha256", + "sha384", + "sha512", + "stdrng", + "xcbc(aes)", + "xts(aes)", +}; + +static int cmd_show_service_indicators(int argc, char *argv[]) +{ + const char * const *services = default_services_to_show; + int count = ARRAY_SIZE(default_services_to_show); + int i; + + if (argc > 1) { + services = (const char **)(argv + 1); + count = argc - 1; + } + for (i = 0; i < count; i++) { + printf("fips140_is_approved_service(\"%s\") => %s\n", + services[i], + booltostr(fips140_is_approved_service(services[i]))); + } + return 0; +} + +/* --------------------------------------------------------------------------- + * main() + * ---------------------------------------------------------------------------*/ + +static const struct command { + const char *name; + int (*func)(int argc, char *argv[]); +} commands[] = { + { "dump_jitterentropy", cmd_dump_jitterentropy }, + { "show_invalid_inputs", cmd_show_invalid_inputs }, + { "show_module_version", cmd_show_module_version }, + { "show_service_indicators", cmd_show_service_indicators }, +}; + +static void usage(void) +{ + fprintf(stderr, +"Usage:\n" +" fips140_lab_util dump_jitterentropy [OPTION]...\n" +" fips140_lab_util show_invalid_inputs\n" +" fips140_lab_util 
show_module_version\n" +" fips140_lab_util show_service_indicators [SERVICE]...\n" +"\n" +"Options for dump_jitterentropy:\n" +" --amount=AMOUNT Amount to dump in bytes per iteration (default 128)\n" +" --iterations=COUNT Number of start-up iterations (default 1)\n" + ); +} + +int main(int argc, char *argv[]) +{ + int i; + + if (argc < 2) { + usage(); + return 2; + } + for (i = 1; i < argc; i++) { + if (strcmp(argv[i], "--help") == 0) { + usage(); + return 2; + } + } + + for (i = 0; i < ARRAY_SIZE(commands); i++) { + if (strcmp(commands[i].name, argv[1]) == 0) + return commands[i].func(argc - 1, argv + 1); + } + fprintf(stderr, "Unknown command: %s\n\n", argv[1]); + usage(); + return 2; +} From 9972a4f7df2f40cb752cd3b366d506ba0e515cf9 Mon Sep 17 00:00:00 2001 From: Chris Goldsworthy Date: Wed, 27 Jul 2022 16:58:09 -0700 Subject: [PATCH 441/457] ANDROID: dma-buf: Add vendorhook to allow mmaping more memory than a DMA-BUF holds Add vendorhook to allow mmaping more memory than a DMA-BUF holds. The implementor of the vmap callback for the DMA-BUF is responsible for ensuring that all pages are backed by memory. The hook takes as input a DMA-BUF to allow the VMA bounds check to be done on a case-by-case basis for DMA-BUFs. Note that if the override is allowed to go through for a given DMA-BUF, then it can be the case that the size of this mmaped DMA-BUF is reported incorrectly when looking at /proc/pid/maps for the owning process. Bug: 234753494 Change-Id: Iba8cc8adfd2290e4dc7ef04fce5d6a80ac92e0b3 Signed-off-by: Chris Goldsworthy Signed-off-by: Sukadev Bhattiprolu --- drivers/android/vendor_hooks.c | 3 +++ drivers/dma-buf/dma-buf.c | 8 ++++++-- include/trace/hooks/dmabuf.h | 21 +++++++++++++++++++++ 3 files changed, 30 insertions(+), 2 deletions(-) create mode 100644 include/trace/hooks/dmabuf.h diff --git a/drivers/android/vendor_hooks.c b/drivers/android/vendor_hooks.c index f900944fe250..994e19d08480 100644 --- a/drivers/android/vendor_hooks.c +++ b/drivers/android/vendor_hooks.c @@ -7,6 +7,7 @@ */ #include +#include #define CREATE_TRACE_POINTS #include @@ -41,6 +42,7 @@ #include #include #include +#include /* * Export tracepoints that act as a bare tracehook (ie: have no trace event @@ -113,6 +115,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_selinux_is_initialized); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_check_mmap_file); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_check_file_open); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_check_bpf_syscall); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ignore_dmabuf_vmap_bounds); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rproc_recovery); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rproc_recovery_set); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rwsem_init); diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c index 639d254964a2..3ca807615488 100644 --- a/drivers/dma-buf/dma-buf.c +++ b/drivers/dma-buf/dma-buf.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -130,6 +131,7 @@ static struct file_system_type dma_buf_fs_type = { static int dma_buf_mmap_internal(struct file *file, struct vm_area_struct *vma) { struct dma_buf *dmabuf; + bool ignore_bounds = false; if (!is_dma_buf_file(file)) return -EINVAL; @@ -140,9 +142,11 @@ static int dma_buf_mmap_internal(struct file *file, struct vm_area_struct *vma) if (!dmabuf->ops->mmap) return -EINVAL; + trace_android_vh_ignore_dmabuf_vmap_bounds(dmabuf, &ignore_bounds); + /* check for overflowing the buffer's size */ - if (vma->vm_pgoff + vma_pages(vma) > - dmabuf->size >> PAGE_SHIFT) + if 
((vma->vm_pgoff + vma_pages(vma) > + dmabuf->size >> PAGE_SHIFT) && !ignore_bounds) return -EINVAL; return dmabuf->ops->mmap(dmabuf, vma); diff --git a/include/trace/hooks/dmabuf.h b/include/trace/hooks/dmabuf.h new file mode 100644 index 000000000000..85688ebd703f --- /dev/null +++ b/include/trace/hooks/dmabuf.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM dmabuf + +#define TRACE_INCLUDE_PATH trace/hooks + +#if !defined(_TRACE_HOOK_DMA_BUF_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HOOK_DMA_BUF_H + +struct dma_buf; + +#include + +DECLARE_HOOK(android_vh_ignore_dmabuf_vmap_bounds, + TP_PROTO(struct dma_buf *dma_buf, bool *ignore_bounds), + TP_ARGS(dma_buf, ignore_bounds)); + +#endif /* _TRACE_HOOK_DMA_BUF_H */ + +/* This part must be outside protection */ +#include From 631f92ce30a9ee2c3ecfabf34500fb2e0f6d93f1 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 5 Jan 2023 23:24:26 +0000 Subject: [PATCH 442/457] ANDROID: KVM: arm64: Resolve hyp module addresses using ELF sections Resolving the addresses of the hypervisor sections within a loadable module using symbol assignment is fragile, particularly in the face of mergeable sections (i.e. those emitted with SHF_MERGE by the compiler). Instead, parse the ELF .hyp.* sections directly and remove the need for global symbols in the hypervisor module linker script. Signed-off-by: Will Deacon Signed-off-by: Will Deacon Bug: 261855285 Change-Id: I91d88e1a341b91ffe52ffc770dddc9b46ccb3aa4 --- arch/arm64/Kconfig | 2 +- arch/arm64/include/asm/kvm_mmu.h | 3 - arch/arm64/include/asm/kvm_pkvm_module.h | 71 +++++++----------------- arch/arm64/include/asm/module.h | 48 ++++++++++++++-- arch/arm64/kernel/module.c | 60 +++++++++++++++++++- arch/arm64/kvm/hyp/nvhe/module.lds.S | 13 +---- arch/arm64/kvm/hyp/nvhe/psci-relay.c | 2 + arch/arm64/kvm/pkvm.c | 8 +-- 8 files changed, 131 insertions(+), 76 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 1478a5393b36..460761d22ef8 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -144,6 +144,7 @@ config ARM64 select GENERIC_GETTIMEOFDAY select GENERIC_VDSO_TIME_NS select HARDIRQS_SW_RESEND + select HAVE_MOD_ARCH_SPECIFIC if (ARM64_MODULE_PLTS || KVM) select HAVE_MOVE_PMD select HAVE_MOVE_PUD select HAVE_PCI @@ -2049,7 +2050,6 @@ config ARM64_SME config ARM64_MODULE_PLTS bool "Use PLTs to allow module memory to spill over into vmalloc area" depends on MODULES - select HAVE_MOD_ARCH_SPECIFIC help Allocate PLTs when loading modules so that jumps and calls whose targets are too far away for their relative offsets to be encoded diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index ecbe1e623685..0f2cfa107f5d 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -122,9 +122,6 @@ void kvm_update_va_mask(struct alt_instr *alt, __le32 *origptr, __le32 *updptr, int nr_inst); void kvm_compute_layout(void); void kvm_apply_hyp_relocations(void); -void kvm_apply_hyp_module_relocations(void *mod_start, void *hyp_va, - kvm_nvhe_reloc_t *begin, - kvm_nvhe_reloc_t *end); #define __hyp_pa(x) (((phys_addr_t)(x)) + hyp_physvirt_offset) diff --git a/arch/arm64/include/asm/kvm_pkvm_module.h b/arch/arm64/include/asm/kvm_pkvm_module.h index 68b0fbbf2fd4..3e0ebde74e03 100644 --- a/arch/arm64/include/asm/kvm_pkvm_module.h +++ b/arch/arm64/include/asm/kvm_pkvm_module.h @@ -14,6 +14,7 @@ enum pkvm_psci_notification { PKVM_PSCI_CPU_ENTRY, }; +#ifdef CONFIG_MODULES struct 
pkvm_module_ops { int (*create_private_mapping)(phys_addr_t phys, size_t size, enum kvm_pgtable_prot prot, @@ -35,66 +36,36 @@ struct pkvm_module_ops { int (*register_hyp_panic_notifier)(void (*cb)(struct kvm_cpu_context *host_ctxt)); }; -struct pkvm_module_section { - void *start; - void *end; -}; - -typedef s32 kvm_nvhe_reloc_t; - -struct pkvm_el2_module { - struct pkvm_module_section text; - struct pkvm_module_section bss; - struct pkvm_module_section rodata; - struct pkvm_module_section data; - kvm_nvhe_reloc_t *relocs; - unsigned int nr_relocs; - int (*init)(const struct pkvm_module_ops *ops); -}; - -int __pkvm_load_el2_module(struct pkvm_el2_module *mod, struct module *this, - unsigned long *token); +int __pkvm_load_el2_module(struct module *this, unsigned long *token); int __pkvm_register_el2_call(dyn_hcall_t hfn, unsigned long token, unsigned long hyp_text_kern_va); +#else +static inline int __pkvm_load_el2_module(struct module *this, + unsigned long *token) +{ + return -ENOSYS; +} + +static inline int __pkvm_register_el2_call(dyn_hcall_t hfn, unsigned long token, + unsigned long hyp_text_kern_va) +{ + return -ENOSYS; +} +#endif /* CONFIG_MODULES */ + #ifdef MODULE #define pkvm_load_el2_module(init_fn, token) \ ({ \ - extern char __kvm_nvhe___hypmod_text_start[]; \ - extern char __kvm_nvhe___hypmod_text_end[]; \ - extern char __kvm_nvhe___hypmod_bss_start[]; \ - extern char __kvm_nvhe___hypmod_bss_end[]; \ - extern char __kvm_nvhe___hypmod_rodata_start[]; \ - extern char __kvm_nvhe___hypmod_rodata_end[]; \ - extern char __kvm_nvhe___hypmod_data_start[]; \ - extern char __kvm_nvhe___hypmod_data_end[]; \ - extern char __kvm_nvhe___hyprel_start[]; \ - extern char __kvm_nvhe___hyprel_end[]; \ - struct pkvm_el2_module mod; \ - \ - mod.text.start = __kvm_nvhe___hypmod_text_start; \ - mod.text.end = __kvm_nvhe___hypmod_text_end; \ - mod.bss.start = __kvm_nvhe___hypmod_bss_start; \ - mod.bss.end = __kvm_nvhe___hypmod_bss_end; \ - mod.rodata.start = __kvm_nvhe___hypmod_rodata_start; \ - mod.rodata.end = __kvm_nvhe___hypmod_rodata_end; \ - mod.data.start = __kvm_nvhe___hypmod_data_start; \ - mod.data.end = __kvm_nvhe___hypmod_data_end; \ - mod.relocs = (kvm_nvhe_reloc_t *)__kvm_nvhe___hyprel_start; \ - mod.nr_relocs = (__kvm_nvhe___hyprel_end - __kvm_nvhe___hyprel_start) / \ - sizeof(*mod.relocs); \ - mod.init = init_fn; \ - \ - __pkvm_load_el2_module(&mod, THIS_MODULE, token); \ + THIS_MODULE->arch.hyp.init = init_fn; \ + __pkvm_load_el2_module(THIS_MODULE, token); \ }) #define pkvm_register_el2_mod_call(hfn, token) \ ({ \ - extern char __kvm_nvhe___hypmod_text_start[]; \ - unsigned long hyp_text_kern_va = \ - (unsigned long)__kvm_nvhe___hypmod_text_start; \ - __pkvm_register_el2_call(hfn, token, \ - hyp_text_kern_va); \ + unsigned long hyp_text_kern_va; \ + hyp_text_kern_va = THIS_MODULE->arch.hyp.text.start; \ + __pkvm_register_el2_call(hfn, token, hyp_text_kern_va); \ }) #define pkvm_el2_mod_call(id, ...) 
\ diff --git a/arch/arm64/include/asm/module.h b/arch/arm64/include/asm/module.h index 18734fed3bdd..3a505c18343a 100644 --- a/arch/arm64/include/asm/module.h +++ b/arch/arm64/include/asm/module.h @@ -14,12 +14,50 @@ struct mod_plt_sec { int plt_max_entries; }; -struct mod_arch_specific { - struct mod_plt_sec core; - struct mod_plt_sec init; - - /* for CONFIG_DYNAMIC_FTRACE */ +#define ARM64_MODULE_PLTS_ARCHDATA \ + struct mod_plt_sec core; \ + struct mod_plt_sec init; \ + \ + /* for CONFIG_DYNAMIC_FTRACE */ \ struct plt_entry *ftrace_trampolines; +#else +#define ARM64_MODULE_PLTS_ARCHDATA +#endif + +#ifdef CONFIG_KVM +struct pkvm_module_section { + void *start; + void *end; +}; + +typedef s32 kvm_nvhe_reloc_t; +struct pkvm_module_ops; + +struct pkvm_el2_module { + struct pkvm_module_section text; + struct pkvm_module_section bss; + struct pkvm_module_section rodata; + struct pkvm_module_section data; + kvm_nvhe_reloc_t *relocs; + unsigned int nr_relocs; + int (*init)(const struct pkvm_module_ops *ops); +}; + +void kvm_apply_hyp_module_relocations(void *mod_start, void *hyp_va, + kvm_nvhe_reloc_t *begin, + kvm_nvhe_reloc_t *end); + +#define ARM64_MODULE_KVM_ARCHDATA \ + /* For pKVM hypervisor modules */ \ + struct pkvm_el2_module hyp; +#else +#define ARM64_MODULE_KVM_ARCHDATA +#endif + +#ifdef CONFIG_HAVE_MOD_ARCH_SPECIFIC +struct mod_arch_specific { + ARM64_MODULE_PLTS_ARCHDATA + ARM64_MODULE_KVM_ARCHDATA }; #endif diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c index 76b41e4ca9fa..1f10cd529b48 100644 --- a/arch/arm64/kernel/module.c +++ b/arch/arm64/kernel/module.c @@ -505,14 +505,72 @@ static int module_init_ftrace_plt(const Elf_Ehdr *hdr, return 0; } +static int module_init_hyp(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, + struct module *mod) +{ +#ifdef CONFIG_KVM + const Elf_Shdr *s; + + s = find_section(hdr, sechdrs, ".hyp.text"); + if (!s) + return -ENOEXEC; + + mod->arch.hyp.text = (struct pkvm_module_section) { + .start = (void *)s->sh_addr, + .end = (void *)s->sh_addr + s->sh_size, + }; + + s = find_section(hdr, sechdrs, ".hyp.bss"); + if (!s) + return -ENOEXEC; + + mod->arch.hyp.bss = (struct pkvm_module_section) { + .start = (void *)s->sh_addr, + .end = (void *)s->sh_addr + s->sh_size, + }; + + s = find_section(hdr, sechdrs, ".hyp.rodata"); + if (!s) + return -ENOEXEC; + + mod->arch.hyp.rodata = (struct pkvm_module_section) { + .start = (void *)s->sh_addr, + .end = (void *)s->sh_addr + s->sh_size, + }; + + s = find_section(hdr, sechdrs, ".hyp.data"); + if (!s) + return -ENOEXEC; + + mod->arch.hyp.data = (struct pkvm_module_section) { + .start = (void *)s->sh_addr, + .end = (void *)s->sh_addr + s->sh_size, + }; + + s = find_section(hdr, sechdrs, ".hyp.reloc"); + if (!s) + return -ENOEXEC; + + mod->arch.hyp.relocs = (void *)s->sh_addr; + mod->arch.hyp.nr_relocs = s->sh_size / sizeof(mod->arch.hyp.relocs); +#endif + return 0; +} + int module_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, struct module *me) { + int err; const Elf_Shdr *s; + s = find_section(hdr, sechdrs, ".altinstructions"); if (s) apply_alternatives_module((void *)s->sh_addr, s->sh_size); - return module_init_ftrace_plt(hdr, sechdrs, me); + err = module_init_ftrace_plt(hdr, sechdrs, me); + if (err) + return err; + + return module_init_hyp(hdr, sechdrs, me); } diff --git a/arch/arm64/kvm/hyp/nvhe/module.lds.S b/arch/arm64/kvm/hyp/nvhe/module.lds.S index 696ab5408265..645080c681cd 100644 --- a/arch/arm64/kvm/hyp/nvhe/module.lds.S +++ b/arch/arm64/kvm/hyp/nvhe/module.lds.S @@ -6,32 
+6,21 @@ SECTIONS { .hyp.text : { HYP_SECTION_SYMBOL_NAME(.text) = .; - __hypmod_text_start = .; *(.text .text.*) - __hypmod_text_end = .; } .hyp.bss : { HYP_SECTION_SYMBOL_NAME(.bss) = .; - __hypmod_bss_start = .; *(.bss .bss.*) - FILL(0) - __hypmod_bss_end = .; } .hyp.rodata : { HYP_SECTION_SYMBOL_NAME(.rodata) = .; - __hypmod_rodata_start = .; - *(.rodata .rodata.* .note.gnu.property) - BYTE(0) - __hypmod_rodata_end = .; + *(.rodata .rodata.*) } .hyp.data : { HYP_SECTION_SYMBOL_NAME(.data) = .; - __hypmod_data_start = .; *(.data .data.*) - BYTE(0) - __hypmod_data_end = .; } } diff --git a/arch/arm64/kvm/hyp/nvhe/psci-relay.c b/arch/arm64/kvm/hyp/nvhe/psci-relay.c index c5b439fd0066..0a10582e8607 100644 --- a/arch/arm64/kvm/hyp/nvhe/psci-relay.c +++ b/arch/arm64/kvm/hyp/nvhe/psci-relay.c @@ -31,10 +31,12 @@ static void pkvm_psci_notify(enum pkvm_psci_notification notif, struct kvm_cpu_c pkvm_psci_notifier(notif, host_ctxt); } +#ifdef CONFIG_MODULES int __pkvm_register_psci_notifier(void (*cb)(enum pkvm_psci_notification, struct kvm_cpu_context *)) { return cmpxchg(&pkvm_psci_notifier, NULL, cb) ? -EBUSY : 0; } +#endif #define INVALID_CPU_ID UINT_MAX diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c index 2ff0eb18a57d..57395cb8f7a9 100644 --- a/arch/arm64/kvm/pkvm.c +++ b/arch/arm64/kvm/pkvm.c @@ -507,7 +507,6 @@ static int __init early_pkvm_enable_modules(char *arg) return 0; } early_param("kvm-arm.protected_modules", early_pkvm_enable_modules); -#endif struct pkvm_mod_sec_mapping { struct pkvm_module_section *sec; @@ -584,9 +583,9 @@ static int __pkvm_cmp_mod_sec(const void *p1, const void *p2) return s1->sec->start < s2->sec->start ? -1 : s1->sec->start > s2->sec->start; } -int __pkvm_load_el2_module(struct pkvm_el2_module *mod, struct module *this, - unsigned long *token) +int __pkvm_load_el2_module(struct module *this, unsigned long *token) { + struct pkvm_el2_module *mod = &this->arch.hyp; struct pkvm_mod_sec_mapping secs_map[] = { { &mod->text, KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_X }, { &mod->bss, KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W }, @@ -616,7 +615,7 @@ int __pkvm_load_el2_module(struct pkvm_el2_module *mod, struct module *this, sort(secs_map, ARRAY_SIZE(secs_map), sizeof(secs_map[0]), __pkvm_cmp_mod_sec, NULL); start = secs_map[0].sec->start; end = secs_map[ARRAY_SIZE(secs_map) - 1].sec->end; - size = PAGE_ALIGN(end - start); + size = end - start; hyp_va = (void *)kvm_call_hyp_nvhe(__pkvm_alloc_module_va, size >> PAGE_SHIFT); if (!hyp_va) { @@ -670,3 +669,4 @@ int __pkvm_register_el2_call(dyn_hcall_t hfn, unsigned long token, return ret; } EXPORT_SYMBOL_GPL(__pkvm_register_el2_call); +#endif /* CONFIG_MODULES */ From 0ead19c440f79f82a7773c4d1001e03669f3c53a Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 6 Jan 2023 17:53:04 +0000 Subject: [PATCH 443/457] Revert "ANDROID: KVM: arm64: Make gen-hyprel emit delimiters" This reverts commit f347aa8c3da299a25c49dbfa533958985cfa46be. The '__hyprel_{start,end}' symbols are no longer used, so don't bother generating them. 
Signed-off-by: Will Deacon Signed-off-by: Will Deacon Bug: 261855285 Change-Id: I8e8dc5c94a9e67400e73e362e4377032328d86d4 --- arch/arm64/tools/gen-hyprel.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/arch/arm64/tools/gen-hyprel.c b/arch/arm64/tools/gen-hyprel.c index fa719b6c6d54..6bc88a756cb7 100644 --- a/arch/arm64/tools/gen-hyprel.c +++ b/arch/arm64/tools/gen-hyprel.c @@ -296,10 +296,8 @@ static void init_elf(const char *path) /* Print the prologue of the output ASM file. */ static void emit_prologue(void) { - printf("#include \n" - ".data\n" - ".pushsection " HYP_RELOC_SECTION ", \"a\"\n" - "SYM_ENTRY(__hyprel_start, SYM_L_GLOBAL, SYM_A_NONE)\n"); + printf(".data\n" + ".pushsection " HYP_RELOC_SECTION ", \"a\"\n"); } /* Print ASM statements needed as a prologue to a processed hyp section. */ @@ -349,8 +347,7 @@ static void emit_rela_abs64(Elf64_Rela *rela, const char *sh_orig_name) /* Print the epilogue of the output ASM file. */ static void emit_epilogue(void) { - printf("SYM_ENTRY(__hyprel_end, SYM_L_GLOBAL, SYM_A_NONE)\n" - ".popsection\n"); + printf(".popsection\n"); } /* From b17ff311f3cc2c812901ca85260e533474f14271 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 5 Jan 2023 23:24:26 +0000 Subject: [PATCH 444/457] ANDROID: KVM: arm64: Ignore modules with empty .hyp.text section Modules with an empty '.hyp.text' section do not contain any EL2 code and should therefore be ignored for the purposes of hypervisor module loading. Failing to ignore such modules will likely result in a later loading failure due to the absence of '.hyp.reloc', which is not present for non-hypervisor modules. Don't bother parsing the other '.hyp.*' sections for modules with an empty '.hyp.text' section and return early success to allow the module to load as a normal kernel module. Fixes: 3dc729d157a7 ("ANDROID: KVM: arm64: Resolve hyp module addresses using ELF sections") Signed-off-by: Will Deacon Signed-off-by: Will Deacon Bug: 261855285 Change-Id: Idc24f95881c520b40038f77cd5af5ccc1d23624f --- arch/arm64/kernel/module.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c index 1f10cd529b48..e727c51a53eb 100644 --- a/arch/arm64/kernel/module.c +++ b/arch/arm64/kernel/module.c @@ -511,9 +511,13 @@ static int module_init_hyp(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, #ifdef CONFIG_KVM const Elf_Shdr *s; + /* + * If the .hyp.text is missing or empty, this is not a hypervisor + * module so ignore the rest of it. + */ s = find_section(hdr, sechdrs, ".hyp.text"); - if (!s) - return -ENOEXEC; + if (!s || !s->sh_size) + return 0; mod->arch.hyp.text = (struct pkvm_module_section) { .start = (void *)s->sh_addr, From 955a8699b86ac3576d26c1ad77cccb2f5199b5a9 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 5 Jan 2023 23:24:26 +0000 Subject: [PATCH 445/457] ANDROID: KVM: arm64: Fix calculation for number of relocs in .hyp.reloc Fix the calculation to determine the number of module relocs present in the '.hyp.reloc' section to divide by the size of 'kvm_nvhe_reloc_t' (4) instead of the size of a pointer (8). 
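For illustration only (a user-space sketch, not kernel code): because 'relocs' is declared as 'kvm_nvhe_reloc_t *', sizeof(relocs) measures the pointer (8 bytes on arm64) rather than one 4-byte entry, so the buggy division reports half the relocations:

	#include <stdint.h>
	#include <stdio.h>

	typedef int32_t kvm_nvhe_reloc_t;	/* 4-byte reloc entries */

	int main(void)
	{
		kvm_nvhe_reloc_t table[6];	/* stand-in for a 24-byte .hyp.reloc section */
		kvm_nvhe_reloc_t *relocs = table;
		size_t sh_size = sizeof(table);

		/* buggy: divides by the pointer size (8 on a 64-bit host) -> reports 3 */
		printf("wrong: %zu\n", sh_size / sizeof(relocs));
		/* fixed: divides by the element size (4) -> reports 6 */
		printf("right: %zu\n", sh_size / sizeof(*relocs));
		return 0;
	}
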
Fixes: 3dc729d157a7 ("ANDROID: KVM: arm64: Resolve hyp module addresses using ELF sections") Signed-off-by: Will Deacon Signed-off-by: Will Deacon Bug: 261855285 Change-Id: Ia7afc508039d549ae061793afa39fde9d844c069 --- arch/arm64/kernel/module.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c index e727c51a53eb..f01d8b3328cb 100644 --- a/arch/arm64/kernel/module.c +++ b/arch/arm64/kernel/module.c @@ -556,7 +556,7 @@ static int module_init_hyp(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, return -ENOEXEC; mod->arch.hyp.relocs = (void *)s->sh_addr; - mod->arch.hyp.nr_relocs = s->sh_size / sizeof(mod->arch.hyp.relocs); + mod->arch.hyp.nr_relocs = s->sh_size / sizeof(*mod->arch.hyp.relocs); #endif return 0; } From 1c28907d7396680dbbfcb341340369e69aa670f4 Mon Sep 17 00:00:00 2001 From: Huang Yiwei Date: Mon, 26 Sep 2022 14:53:11 +0800 Subject: [PATCH 446/457] ANDROID: timer: Add vendor hook for timer calc index timer wheel calculates the index for any timer based on the expiry value and level granularity of the timer. Due to the level granularity timer will not fire at the exact time instead expire at a time value expires + granularity. This is done in the timer code when the index for each timer is calculated based on the expiry and granularity at each level: expires = (expires >> LVL_SHIFT(lvl)) + 1; For devfreq drivers the requirement is to fire the timer at the exact time. If the timer does not expire at the exact time then it'll take much longer to react and increase the device frequency. Devfreq driver registers timer for 10ms expiry and due to slack in timer code the expirty happens at 20 ms. For eg: Frame rendering time is 16ms. If devfreq driver reacts after 20ms instead of 10ms, that's way past a frame rendering time. Timers with 10ms to 630ms expiry fall under level 0, to overcome the granularity issue for level 0 with low expirty values do not add the granularity by introducing a new calc_index vendor hook. 
Bug: 178758017 Change-Id: I13cdf541e4c1bd426ce28b7a8a17cb8381eb2a92 Signed-off-by: Huang Yiwei (cherry picked from commit 18550710107dcdd4a370ad43a8ff346074df4faa) [quic_satyap@quicinc.com: fix minor merge conflict] Signed-off-by: Satya Durga Srinivasu Prabhala --- drivers/android/vendor_hooks.c | 2 ++ include/trace/hooks/timer.h | 18 ++++++++++++++++++ kernel/time/timer.c | 3 +++ 3 files changed, 23 insertions(+) create mode 100644 include/trace/hooks/timer.h diff --git a/drivers/android/vendor_hooks.c b/drivers/android/vendor_hooks.c index 994e19d08480..6d2133d63b49 100644 --- a/drivers/android/vendor_hooks.c +++ b/drivers/android/vendor_hooks.c @@ -43,6 +43,7 @@ #include #include #include +#include /* * Export tracepoints that act as a bare tracehook (ie: have no trace event @@ -127,3 +128,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_sha256); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_aes_expandkey); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_aes_encrypt); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_aes_decrypt); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_timer_calc_index); diff --git a/include/trace/hooks/timer.h b/include/trace/hooks/timer.h new file mode 100644 index 000000000000..67ef865dad4a --- /dev/null +++ b/include/trace/hooks/timer.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM timer + +#define TRACE_INCLUDE_PATH trace/hooks + +#if !defined(_TRACE_HOOK_TIMER_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HOOK_TIMER_H + +#include + +DECLARE_HOOK(android_vh_timer_calc_index, + TP_PROTO(unsigned int lvl, unsigned long *expires), + TP_ARGS(lvl, expires)); + +#endif /* _TRACE_HOOK_TIMER_H */ +/* This part must be outside protection */ +#include diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 717fcb9fb14a..0ba4c1615f1c 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -56,6 +56,8 @@ #define CREATE_TRACE_POINTS #include +#undef CREATE_TRACE_POINTS +#include __visible u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; @@ -525,6 +527,7 @@ static inline unsigned calc_index(unsigned long expires, unsigned lvl, * * Round up with level granularity to prevent this. */ + trace_android_vh_timer_calc_index(lvl, &expires); expires = (expires >> LVL_SHIFT(lvl)) + 1; *bucket_expiry = expires << LVL_SHIFT(lvl); return LVL_OFFS(lvl) + (expires & LVL_MASK); From fb5ea70e2e33932b5b35fedd7a30cf5d9170126c Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Fri, 9 Dec 2022 18:10:43 +0000 Subject: [PATCH 447/457] ANDROID: KVM: arm64: Add helper for pKVM modules addr conversion pKVM modules can't rely on the usual hyp function kern_hyp_va() to convert addr from the kernel space to the hyp's. Instead, provide pkvm_el2_mod_va() that will do the conversion using the token provided by pkvm_load_el2_module(). 
Bug: 244543039 Bug: 244373730 Change-Id: I7423b40f1107bb92cd732843c5cdbf1d45662f00 Signed-off-by: Vincent Donnefort --- arch/arm64/include/asm/kvm_pkvm_module.h | 23 ++++++++++++++++------- arch/arm64/kvm/pkvm.c | 13 ++----------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/arch/arm64/include/asm/kvm_pkvm_module.h b/arch/arm64/include/asm/kvm_pkvm_module.h index 3e0ebde74e03..a9f7994189a7 100644 --- a/arch/arm64/include/asm/kvm_pkvm_module.h +++ b/arch/arm64/include/asm/kvm_pkvm_module.h @@ -38,8 +38,7 @@ struct pkvm_module_ops { int __pkvm_load_el2_module(struct module *this, unsigned long *token); -int __pkvm_register_el2_call(dyn_hcall_t hfn, unsigned long token, - unsigned long hyp_text_kern_va); +int __pkvm_register_el2_call(unsigned long hfn_hyp_va); #else static inline int __pkvm_load_el2_module(struct module *this, unsigned long *token) @@ -47,14 +46,26 @@ static inline int __pkvm_load_el2_module(struct module *this, return -ENOSYS; } -static inline int __pkvm_register_el2_call(dyn_hcall_t hfn, unsigned long token, - unsigned long hyp_text_kern_va) +static inline int __pkvm_register_el2_call(unsigned long hfn_hyp_va) { return -ENOSYS; } #endif /* CONFIG_MODULES */ #ifdef MODULE +/* + * Convert an EL2 module addr from the kernel VA to the hyp VA + */ +#define pkvm_el2_mod_va(kern_va, token) \ +({ \ + unsigned long hyp_text_kern_va = \ + (unsigned long)THIS_MODULE->arch.hyp.text.start; \ + unsigned long offset; \ + \ + offset = (unsigned long)kern_va - hyp_text_kern_va; \ + token + offset; \ +}) + #define pkvm_load_el2_module(init_fn, token) \ ({ \ THIS_MODULE->arch.hyp.init = init_fn; \ @@ -63,9 +74,7 @@ static inline int __pkvm_register_el2_call(dyn_hcall_t hfn, unsigned long token, #define pkvm_register_el2_mod_call(hfn, token) \ ({ \ - unsigned long hyp_text_kern_va; \ - hyp_text_kern_va = THIS_MODULE->arch.hyp.text.start; \ - __pkvm_register_el2_call(hfn, token, hyp_text_kern_va); \ + __pkvm_register_el2_call(pkvm_el2_mod_va(hfn, token)); \ }) #define pkvm_el2_mod_call(id, ...) 
\ diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c index 57395cb8f7a9..7e2aa36b74ac 100644 --- a/arch/arm64/kvm/pkvm.c +++ b/arch/arm64/kvm/pkvm.c @@ -655,18 +655,9 @@ int __pkvm_load_el2_module(struct module *this, unsigned long *token) } EXPORT_SYMBOL_GPL(__pkvm_load_el2_module); -int __pkvm_register_el2_call(dyn_hcall_t hfn, unsigned long token, - unsigned long hyp_text_kern_va) +int __pkvm_register_el2_call(unsigned long hfn_hyp_va) { - unsigned long hfn_hyp_va, offset, text_hyp_va = token; - int ret; - - offset = (unsigned long)hfn - hyp_text_kern_va; - hfn_hyp_va = text_hyp_va + offset; - - ret = kvm_call_hyp_nvhe(__pkvm_register_hcall, - (unsigned long)hfn_hyp_va); - return ret; + return kvm_call_hyp_nvhe(__pkvm_register_hcall, hfn_hyp_va); } EXPORT_SYMBOL_GPL(__pkvm_register_el2_call); #endif /* CONFIG_MODULES */ From 57f3ff9648991998d008ecf32f2f9e78a08bfb8b Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Thu, 2 Dec 2021 13:50:02 -0800 Subject: [PATCH 448/457] ANDROID: fuse-bpf v1.1 This is a squash of these changes cherry-picked from common-android13-5.10 ANDROID: fuse-bpf: Make compile and pass test ANDROID: fuse-bpf: set error_in to ENOENT in negative lookup ANDROID: fuse-bpf: Add ability to run ranges of tests to fuse_test ANDROID: fuse-bpf: Add test for lookup postfilter ANDROID: fuse-bpf: readddir postfilter fixes ANDROID: fix kernelci error in fs/fuse/dir.c ANDROID: fuse-bpf: Fix RCU/reference issue ANDROID: fuse-bpf: Always call revalidate for backing ANDROID: fuse-bpf: Adjust backing handle funcs ANDROID: fuse-bpf: Fix revalidate error path and backing handling ANDROID: fuse-bpf: Fix use of get_fuse_inode ANDROID: fuse: Don't use readdirplus w/ nodeid 0 ANDROID: fuse-bpf: Introduce readdirplus test case for fuse bpf ANDROID: fuse-bpf: Make sure force_again flag is false by default ANDROID: fuse-bpf: Make inodes with backing_fd reachable for regular FUSE fuse_iget Revert "ANDROID: fuse-bpf: use target instead of parent inode to execute backing revalidate" ANDROID: fuse-bpf: use target instead of parent inode to execute backing revalidate ANDROID: fuse-bpf: Fix misuse of args.out_args ANDROID: fuse-bpf: Fix non-fusebpf build ANDROID: fuse-bpf: Use fuse_bpf_args in uapi ANDROID: fuse-bpf: Fix read_iter ANDROID: fuse-bpf: Use cache and refcount ANDROID: fuse-bpf: Rename iocb_fuse to iocb_orig ANDROID: fuse-bpf: Fix fixattr in rename ANDROID: fuse-bpf: Fix readdir ANDROID: fuse-bpf: Fix lseek return value for offset 0 ANDROID: fuse-bpf: fix read_iter and write_iter ANDROID: fuse-bpf: fix special devices ANDROID: fuse-bpf: support FUSE_LSEEK ANDROID: fuse-bpf: Add support for FUSE_COPY_FILE_RANGE ANDROID: fuse-bpf: Report errors to finalize ANDROID: fuse-bpf: Avoid reusing uint64_t for file ANDROID: fuse-bpf: Fix CONFIG_FUSE_BPF typo in FUSE_FSYNCDIR ANDROID: fuse-bpf: Move fd operations to be synchronous ANDROID: fuse-bpf: Invalidate if lower is unhashed ANDROID: fuse-bpf: Move bpf earlier in fuse_permission ANDROID: fuse-bpf: Update attributes on file write ANDROID: fuse: allow mounting with no userspace daemon ANDROID: fuse-bpf: Support FUSE_STATFS ANDROID: fuse-bpf: Fix filldir ANDROID: fuse-bpf: fix fuse_create_open_finalize ANDROID: fuse: add bpf support for removexattr ANDROID: fuse-bpf: Fix truncate ANDROID: fuse-bpf: Support inotify ANDROID: fuse-bpf: Make compile with CONFIG_FUSE but no CONFIG_FUSE_BPF ANDROID: fuse-bpf: Fix perms on readdir ANDROID: fuse: Fix umasking in backing ANDROID: fs/fuse: Backing move returns EXDEV if TO not backed ANDROID: 
bpf-fuse: Fix Setattr ANDROID: fuse-bpf: Check if mkdir dentry setup ANDROID: fuse-bpf: Close backing fds in fuse_dentry_revalidate ANDROID: fuse-bpf: Close backing-fd on both paths ANDROID: fuse-bpf: Partial fix for mmap'd files ANDROID: fuse-bpf: Restore a missing const ANDROID: Add fuse-bpf self tests ANDROID: Add FUSE_BPF to gki_defconfig ANDROID: fuse-bpf v1 ANDROID: fuse: Move functions in preparation for fuse-bpf Bug: 202785178 Bug: 265206112 Test: test_fuse passes on linux. On cuttlefish, atest android.scopedstorage.cts.host.ScopedStorageHostTest passes with fuse-bpf enabled and disabled Change-Id: Idb099c281f9b39ff2c46fa3ebc63e508758416ee Signed-off-by: Paul Lawrence Signed-off-by: Daniel Rosenberg --- arch/arm64/configs/gki_defconfig | 1 + arch/x86/configs/gki_defconfig | 1 + fs/fuse/Kconfig | 8 + fs/fuse/Makefile | 1 + fs/fuse/backing.c | 2468 +++++++++++++++++ fs/fuse/control.c | 2 +- fs/fuse/dev.c | 19 + fs/fuse/dir.c | 530 +++- fs/fuse/file.c | 130 + fs/fuse/fuse_i.h | 720 ++++- fs/fuse/inode.c | 322 ++- fs/fuse/passthrough.c | 2 +- fs/fuse/readdir.c | 22 + fs/fuse/xattr.c | 40 + include/linux/bpf_types.h | 3 + include/uapi/linux/android_fuse.h | 95 + include/uapi/linux/bpf.h | 10 + kernel/bpf/Makefile | 3 + kernel/bpf/bpf_fuse.c | 128 + kernel/bpf/btf.c | 1 + .../selftests/filesystems/fuse/.gitignore | 2 + .../selftests/filesystems/fuse/Makefile | 34 + .../testing/selftests/filesystems/fuse/OWNERS | 2 + .../selftests/filesystems/fuse/bpf_loader.c | 791 ++++++ .../testing/selftests/filesystems/fuse/fd.txt | 21 + .../selftests/filesystems/fuse/fd_bpf.c | 252 ++ .../selftests/filesystems/fuse/fuse_daemon.c | 294 ++ .../selftests/filesystems/fuse/fuse_test.c | 2142 ++++++++++++++ .../selftests/filesystems/fuse/test_bpf.c | 507 ++++ .../filesystems/fuse/test_framework.h | 179 ++ .../selftests/filesystems/fuse/test_fuse.h | 337 +++ .../filesystems/fuse/test_fuse_bpf.h | 65 + 32 files changed, 8928 insertions(+), 204 deletions(-) create mode 100644 fs/fuse/backing.c create mode 100644 include/uapi/linux/android_fuse.h create mode 100644 kernel/bpf/bpf_fuse.c create mode 100644 tools/testing/selftests/filesystems/fuse/.gitignore create mode 100644 tools/testing/selftests/filesystems/fuse/Makefile create mode 100644 tools/testing/selftests/filesystems/fuse/OWNERS create mode 100644 tools/testing/selftests/filesystems/fuse/bpf_loader.c create mode 100644 tools/testing/selftests/filesystems/fuse/fd.txt create mode 100644 tools/testing/selftests/filesystems/fuse/fd_bpf.c create mode 100644 tools/testing/selftests/filesystems/fuse/fuse_daemon.c create mode 100644 tools/testing/selftests/filesystems/fuse/fuse_test.c create mode 100644 tools/testing/selftests/filesystems/fuse/test_bpf.c create mode 100644 tools/testing/selftests/filesystems/fuse/test_framework.h create mode 100644 tools/testing/selftests/filesystems/fuse/test_fuse.h create mode 100644 tools/testing/selftests/filesystems/fuse/test_fuse_bpf.h diff --git a/arch/arm64/configs/gki_defconfig b/arch/arm64/configs/gki_defconfig index 1004c61cbebf..b733a1dad78e 100644 --- a/arch/arm64/configs/gki_defconfig +++ b/arch/arm64/configs/gki_defconfig @@ -569,6 +569,7 @@ CONFIG_QUOTA=y CONFIG_QFMT_V2=y CONFIG_FUSE_FS=y CONFIG_VIRTIO_FS=y +CONFIG_FUSE_BPF=y CONFIG_OVERLAY_FS=y CONFIG_INCREMENTAL_FS=y CONFIG_MSDOS_FS=y diff --git a/arch/x86/configs/gki_defconfig b/arch/x86/configs/gki_defconfig index a4ccc6d4d307..0978b0ab7dd2 100644 --- a/arch/x86/configs/gki_defconfig +++ b/arch/x86/configs/gki_defconfig @@ -509,6 +509,7 @@ CONFIG_QUOTA=y 
CONFIG_QFMT_V2=y CONFIG_FUSE_FS=y CONFIG_VIRTIO_FS=y +CONFIG_FUSE_BPF=y CONFIG_OVERLAY_FS=y CONFIG_INCREMENTAL_FS=y CONFIG_MSDOS_FS=y diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig index 038ed0b9aaa5..3a64fa73e591 100644 --- a/fs/fuse/Kconfig +++ b/fs/fuse/Kconfig @@ -52,3 +52,11 @@ config FUSE_DAX If you want to allow mounting a Virtio Filesystem with the "dax" option, answer Y. + +config FUSE_BPF + bool "Adds BPF to fuse" + depends on FUSE_FS + depends on BPF + help + Extends FUSE by adding BPF to prefilter calls and potentially pass to a + backing file system diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile index d9e1b47382f3..096bd78f06e7 100644 --- a/fs/fuse/Makefile +++ b/fs/fuse/Makefile @@ -10,5 +10,6 @@ obj-$(CONFIG_VIRTIO_FS) += virtiofs.o fuse-y := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o ioctl.o fuse-y += passthrough.o fuse-$(CONFIG_FUSE_DAX) += dax.o +fuse-$(CONFIG_FUSE_BPF) += backing.o virtiofs-y := virtio_fs.o diff --git a/fs/fuse/backing.c b/fs/fuse/backing.c new file mode 100644 index 000000000000..22656a0624ba --- /dev/null +++ b/fs/fuse/backing.c @@ -0,0 +1,2468 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * FUSE-BPF: Filesystem in Userspace with BPF + * Copyright (c) 2021 Google LLC + */ + +#include "fuse_i.h" + +#include +#include +#include +#include + +#include "../internal.h" + +#define FUSE_BPF_IOCB_MASK (IOCB_APPEND | IOCB_DSYNC | IOCB_HIPRI | IOCB_NOWAIT | IOCB_SYNC) + +struct fuse_bpf_aio_req { + struct kiocb iocb; + refcount_t ref; + struct kiocb *iocb_orig; +}; + +static struct kmem_cache *fuse_bpf_aio_request_cachep; + +static void fuse_stat_to_attr(struct fuse_conn *fc, struct inode *inode, + struct kstat *stat, struct fuse_attr *attr); + +static void fuse_file_accessed(struct file *dst_file, struct file *src_file) +{ + struct inode *dst_inode; + struct inode *src_inode; + + if (dst_file->f_flags & O_NOATIME) + return; + + dst_inode = file_inode(dst_file); + src_inode = file_inode(src_file); + + if ((!timespec64_equal(&dst_inode->i_mtime, &src_inode->i_mtime) || + !timespec64_equal(&dst_inode->i_ctime, &src_inode->i_ctime))) { + dst_inode->i_mtime = src_inode->i_mtime; + dst_inode->i_ctime = src_inode->i_ctime; + } + + touch_atime(&dst_file->f_path); +} + +int fuse_open_initialize(struct fuse_bpf_args *fa, struct fuse_open_io *foio, + struct inode *inode, struct file *file, bool isdir) +{ + foio->foi = (struct fuse_open_in) { + .flags = file->f_flags & ~(O_CREAT | O_EXCL | O_NOCTTY), + }; + + foio->foo = (struct fuse_open_out) {0}; + + *fa = (struct fuse_bpf_args) { + .nodeid = get_fuse_inode(inode)->nodeid, + .opcode = isdir ? 
FUSE_OPENDIR : FUSE_OPEN, + .in_numargs = 1, + .out_numargs = 1, + .in_args[0] = (struct fuse_bpf_in_arg) { + .size = sizeof(foio->foi), + .value = &foio->foi, + }, + .out_args[0] = (struct fuse_bpf_arg) { + .size = sizeof(foio->foo), + .value = &foio->foo, + }, + }; + + return 0; +} + +int fuse_open_backing(struct fuse_bpf_args *fa, + struct inode *inode, struct file *file, bool isdir) +{ + struct fuse_mount *fm = get_fuse_mount(inode); + const struct fuse_open_in *foi = fa->in_args[0].value; + struct fuse_file *ff; + int retval; + int mask; + struct fuse_dentry *fd = get_fuse_dentry(file->f_path.dentry); + struct file *backing_file; + + ff = fuse_file_alloc(fm); + if (!ff) + return -ENOMEM; + file->private_data = ff; + + switch (foi->flags & O_ACCMODE) { + case O_RDONLY: + mask = MAY_READ; + break; + + case O_WRONLY: + mask = MAY_WRITE; + break; + + case O_RDWR: + mask = MAY_READ | MAY_WRITE; + break; + + default: + return -EINVAL; + } + + retval = inode_permission(&init_user_ns, + get_fuse_inode(inode)->backing_inode, mask); + if (retval) + return retval; + + backing_file = dentry_open(&fd->backing_path, + foi->flags, + current_cred()); + + if (IS_ERR(backing_file)) { + fuse_file_free(ff); + file->private_data = NULL; + return PTR_ERR(backing_file); + } + ff->backing_file = backing_file; + + return 0; +} + +void *fuse_open_finalize(struct fuse_bpf_args *fa, + struct inode *inode, struct file *file, bool isdir) +{ + struct fuse_file *ff = file->private_data; + struct fuse_open_out *foo = fa->out_args[0].value; + + if (ff) { + ff->fh = foo->fh; + ff->nodeid = get_fuse_inode(inode)->nodeid; + } + return 0; +} + +int fuse_create_open_initialize( + struct fuse_bpf_args *fa, struct fuse_create_open_io *fcoio, + struct inode *dir, struct dentry *entry, + struct file *file, unsigned int flags, umode_t mode) +{ + fcoio->fci = (struct fuse_create_in) { + .flags = file->f_flags & ~(O_CREAT | O_EXCL | O_NOCTTY), + .mode = mode, + }; + + fcoio->feo = (struct fuse_entry_out) {0}; + fcoio->foo = (struct fuse_open_out) {0}; + + *fa = (struct fuse_bpf_args) { + .nodeid = get_node_id(dir), + .opcode = FUSE_CREATE, + .in_numargs = 2, + .out_numargs = 2, + .in_args[0] = (struct fuse_bpf_in_arg) { + .size = sizeof(fcoio->fci), + .value = &fcoio->fci, + }, + .in_args[1] = (struct fuse_bpf_in_arg) { + .size = entry->d_name.len + 1, + .value = entry->d_name.name, + }, + .out_args[0] = (struct fuse_bpf_arg) { + .size = sizeof(fcoio->feo), + .value = &fcoio->feo, + }, + .out_args[1] = (struct fuse_bpf_arg) { + .size = sizeof(fcoio->foo), + .value = &fcoio->foo, + }, + }; + + return 0; +} + +static int fuse_open_file_backing(struct inode *inode, struct file *file) +{ + struct fuse_mount *fm = get_fuse_mount(inode); + struct dentry *entry = file->f_path.dentry; + struct fuse_dentry *fuse_dentry = get_fuse_dentry(entry); + struct fuse_file *fuse_file; + struct file *backing_file; + + fuse_file = fuse_file_alloc(fm); + if (!fuse_file) + return -ENOMEM; + file->private_data = fuse_file; + + backing_file = dentry_open(&fuse_dentry->backing_path, file->f_flags, + current_cred()); + if (IS_ERR(backing_file)) { + fuse_file_free(fuse_file); + file->private_data = NULL; + return PTR_ERR(backing_file); + } + fuse_file->backing_file = backing_file; + + return 0; +} + +int fuse_create_open_backing( + struct fuse_bpf_args *fa, + struct inode *dir, struct dentry *entry, + struct file *file, unsigned int flags, umode_t mode) +{ + struct fuse_inode *dir_fuse_inode = get_fuse_inode(dir); + struct fuse_dentry *dir_fuse_dentry = 
get_fuse_dentry(entry->d_parent); + struct dentry *backing_dentry = NULL; + struct inode *inode = NULL; + struct dentry *newent; + int err = 0; + const struct fuse_create_in *fci = fa->in_args[0].value; + struct inode *d_inode = entry->d_inode; + u64 target_nodeid = 0; + + if (!dir_fuse_inode || !dir_fuse_dentry) + return -EIO; + + inode_lock_nested(dir_fuse_inode->backing_inode, I_MUTEX_PARENT); + backing_dentry = lookup_one_len(fa->in_args[1].value, + dir_fuse_dentry->backing_path.dentry, + strlen(fa->in_args[1].value)); + inode_unlock(dir_fuse_inode->backing_inode); + + if (IS_ERR(backing_dentry)) + return PTR_ERR(backing_dentry); + + if (d_really_is_positive(backing_dentry)) { + err = -EIO; + goto out; + } + + err = vfs_create(&init_user_ns, dir_fuse_inode->backing_inode, + backing_dentry, fci->mode, true); + if (err) + goto out; + + if (get_fuse_dentry(entry)->backing_path.dentry) + path_put(&get_fuse_dentry(entry)->backing_path); + get_fuse_dentry(entry)->backing_path = (struct path) { + .mnt = dir_fuse_dentry->backing_path.mnt, + .dentry = backing_dentry, + }; + path_get(&get_fuse_dentry(entry)->backing_path); + + if (d_inode) + target_nodeid = get_fuse_inode(d_inode)->nodeid; + + inode = fuse_iget_backing(dir->i_sb, target_nodeid, + get_fuse_dentry(entry)->backing_path.dentry->d_inode); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out; + } + + if (get_fuse_inode(inode)->bpf) + bpf_prog_put(get_fuse_inode(inode)->bpf); + get_fuse_inode(inode)->bpf = dir_fuse_inode->bpf; + if (get_fuse_inode(inode)->bpf) + bpf_prog_inc(dir_fuse_inode->bpf); + + newent = d_splice_alias(inode, entry); + if (IS_ERR(newent)) { + err = PTR_ERR(newent); + goto out; + } + + entry = newent ? newent : entry; + err = finish_open(file, entry, fuse_open_file_backing); + +out: + dput(backing_dentry); + return err; +} + +void *fuse_create_open_finalize( + struct fuse_bpf_args *fa, + struct inode *dir, struct dentry *entry, + struct file *file, unsigned int flags, umode_t mode) +{ + struct fuse_file *ff = file->private_data; + struct fuse_inode *fi = get_fuse_inode(file->f_inode); + struct fuse_entry_out *feo = fa->out_args[0].value; + struct fuse_open_out *foo = fa->out_args[1].value; + + if (fi) + fi->nodeid = feo->nodeid; + if (ff) + ff->fh = foo->fh; + return 0; +} + +int fuse_release_initialize(struct fuse_bpf_args *fa, struct fuse_release_in *fri, + struct inode *inode, struct file *file) +{ + struct fuse_file *fuse_file = file->private_data; + + /* Always put backing file whatever bpf/userspace says */ + fput(fuse_file->backing_file); + + *fri = (struct fuse_release_in) { + .fh = ((struct fuse_file *)(file->private_data))->fh, + }; + + *fa = (struct fuse_bpf_args) { + .nodeid = get_fuse_inode(inode)->nodeid, + .opcode = FUSE_RELEASE, + .in_numargs = 1, + .in_args[0].size = sizeof(*fri), + .in_args[0].value = fri, + }; + + return 0; +} + +int fuse_releasedir_initialize(struct fuse_bpf_args *fa, + struct fuse_release_in *fri, + struct inode *inode, struct file *file) +{ + struct fuse_file *fuse_file = file->private_data; + + /* Always put backing file whatever bpf/userspace says */ + fput(fuse_file->backing_file); + + *fri = (struct fuse_release_in) { + .fh = ((struct fuse_file *)(file->private_data))->fh, + }; + + *fa = (struct fuse_bpf_args) { + .nodeid = get_fuse_inode(inode)->nodeid, + .opcode = FUSE_RELEASEDIR, + .in_numargs = 1, + .in_args[0].size = sizeof(*fri), + .in_args[0].value = fri, + }; + + return 0; +} + +int fuse_release_backing(struct fuse_bpf_args *fa, + struct inode *inode, struct 
file *file) +{ + return 0; +} + +void *fuse_release_finalize(struct fuse_bpf_args *fa, + struct inode *inode, struct file *file) +{ + fuse_file_free(file->private_data); + return NULL; +} + +int fuse_flush_initialize(struct fuse_bpf_args *fa, struct fuse_flush_in *ffi, + struct file *file, fl_owner_t id) +{ + struct fuse_file *fuse_file = file->private_data; + + *ffi = (struct fuse_flush_in) { + .fh = fuse_file->fh, + }; + + *fa = (struct fuse_bpf_args) { + .nodeid = get_node_id(file->f_inode), + .opcode = FUSE_FLUSH, + .in_numargs = 1, + .in_args[0].size = sizeof(*ffi), + .in_args[0].value = ffi, + .flags = FUSE_BPF_FORCE, + }; + + return 0; +} + +int fuse_flush_backing(struct fuse_bpf_args *fa, struct file *file, fl_owner_t id) +{ + struct fuse_file *fuse_file = file->private_data; + struct file *backing_file = fuse_file->backing_file; + + if (backing_file->f_op->flush) + return backing_file->f_op->flush(backing_file, id); + return 0; +} + +void *fuse_flush_finalize(struct fuse_bpf_args *fa, struct file *file, fl_owner_t id) +{ + return NULL; +} + +int fuse_lseek_initialize(struct fuse_bpf_args *fa, struct fuse_lseek_io *flio, + struct file *file, loff_t offset, int whence) +{ + struct fuse_file *fuse_file = file->private_data; + + flio->fli = (struct fuse_lseek_in) { + .fh = fuse_file->fh, + .offset = offset, + .whence = whence, + }; + + *fa = (struct fuse_bpf_args) { + .nodeid = get_node_id(file->f_inode), + .opcode = FUSE_LSEEK, + .in_numargs = 1, + .in_args[0].size = sizeof(flio->fli), + .in_args[0].value = &flio->fli, + .out_numargs = 1, + .out_args[0].size = sizeof(flio->flo), + .out_args[0].value = &flio->flo, + }; + + return 0; +} + +int fuse_lseek_backing(struct fuse_bpf_args *fa, struct file *file, loff_t offset, int whence) +{ + const struct fuse_lseek_in *fli = fa->in_args[0].value; + struct fuse_lseek_out *flo = fa->out_args[0].value; + struct fuse_file *fuse_file = file->private_data; + struct file *backing_file = fuse_file->backing_file; + loff_t ret; + + /* TODO: Handle changing of the file handle */ + if (offset == 0) { + if (whence == SEEK_CUR) { + flo->offset = file->f_pos; + return flo->offset; + } + + if (whence == SEEK_SET) { + flo->offset = vfs_setpos(file, 0, 0); + return flo->offset; + } + } + + inode_lock(file->f_inode); + backing_file->f_pos = file->f_pos; + ret = vfs_llseek(backing_file, fli->offset, fli->whence); + flo->offset = ret; + inode_unlock(file->f_inode); + return ret; +} + +void *fuse_lseek_finalize(struct fuse_bpf_args *fa, struct file *file, loff_t offset, int whence) +{ + struct fuse_lseek_out *flo = fa->out_args[0].value; + + if (!fa->error_in) + file->f_pos = flo->offset; + return ERR_PTR(flo->offset); +} + +int fuse_copy_file_range_initialize(struct fuse_bpf_args *fa, struct fuse_copy_file_range_io *fcf, + struct file *file_in, loff_t pos_in, struct file *file_out, + loff_t pos_out, size_t len, unsigned int flags) +{ + struct fuse_file *fuse_file_in = file_in->private_data; + struct fuse_file *fuse_file_out = file_out->private_data; + + + fcf->fci = (struct fuse_copy_file_range_in) { + .fh_in = fuse_file_in->fh, + .off_in = pos_in, + .nodeid_out = fuse_file_out->nodeid, + .fh_out = fuse_file_out->fh, + .off_out = pos_out, + .len = len, + .flags = flags, + }; + + *fa = (struct fuse_bpf_args) { + .nodeid = get_node_id(file_in->f_inode), + .opcode = FUSE_COPY_FILE_RANGE, + .in_numargs = 1, + .in_args[0].size = sizeof(fcf->fci), + .in_args[0].value = &fcf->fci, + .out_numargs = 1, + .out_args[0].size = sizeof(fcf->fwo), + .out_args[0].value = 
&fcf->fwo, + }; + + return 0; +} + +int fuse_copy_file_range_backing(struct fuse_bpf_args *fa, struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, size_t len, + unsigned int flags) +{ + const struct fuse_copy_file_range_in *fci = fa->in_args[0].value; + struct fuse_file *fuse_file_in = file_in->private_data; + struct file *backing_file_in = fuse_file_in->backing_file; + struct fuse_file *fuse_file_out = file_out->private_data; + struct file *backing_file_out = fuse_file_out->backing_file; + + /* TODO: Handle changing of in/out files */ + if (backing_file_out) + return vfs_copy_file_range(backing_file_in, fci->off_in, backing_file_out, + fci->off_out, fci->len, fci->flags); + else + return generic_copy_file_range(file_in, pos_in, file_out, pos_out, len, + flags); +} + +void *fuse_copy_file_range_finalize(struct fuse_bpf_args *fa, struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, size_t len, + unsigned int flags) +{ + return NULL; +} + +int fuse_fsync_initialize(struct fuse_bpf_args *fa, struct fuse_fsync_in *ffi, + struct file *file, loff_t start, loff_t end, int datasync) +{ + struct fuse_file *fuse_file = file->private_data; + + *ffi = (struct fuse_fsync_in) { + .fh = fuse_file->fh, + .fsync_flags = datasync ? FUSE_FSYNC_FDATASYNC : 0, + }; + + *fa = (struct fuse_bpf_args) { + .nodeid = get_fuse_inode(file->f_inode)->nodeid, + .opcode = FUSE_FSYNC, + .in_numargs = 1, + .in_args[0].size = sizeof(*ffi), + .in_args[0].value = ffi, + .flags = FUSE_BPF_FORCE, + }; + + return 0; +} + +int fuse_fsync_backing(struct fuse_bpf_args *fa, + struct file *file, loff_t start, loff_t end, int datasync) +{ + struct fuse_file *fuse_file = file->private_data; + struct file *backing_file = fuse_file->backing_file; + const struct fuse_fsync_in *ffi = fa->in_args[0].value; + int new_datasync = (ffi->fsync_flags & FUSE_FSYNC_FDATASYNC) ? 1 : 0; + + return vfs_fsync(backing_file, new_datasync); +} + +void *fuse_fsync_finalize(struct fuse_bpf_args *fa, + struct file *file, loff_t start, loff_t end, int datasync) +{ + return NULL; +} + +int fuse_dir_fsync_initialize(struct fuse_bpf_args *fa, struct fuse_fsync_in *ffi, + struct file *file, loff_t start, loff_t end, int datasync) +{ + struct fuse_file *fuse_file = file->private_data; + + *ffi = (struct fuse_fsync_in) { + .fh = fuse_file->fh, + .fsync_flags = datasync ? FUSE_FSYNC_FDATASYNC : 0, + }; + + *fa = (struct fuse_bpf_args) { + .nodeid = get_fuse_inode(file->f_inode)->nodeid, + .opcode = FUSE_FSYNCDIR, + .in_numargs = 1, + .in_args[0].size = sizeof(*ffi), + .in_args[0].value = ffi, + .flags = FUSE_BPF_FORCE, + }; + + return 0; +} + +int fuse_getxattr_initialize(struct fuse_bpf_args *fa, + struct fuse_getxattr_io *fgio, + struct dentry *dentry, const char *name, void *value, + size_t size) +{ + *fgio = (struct fuse_getxattr_io) { + .fgi.size = size, + }; + + *fa = (struct fuse_bpf_args) { + .nodeid = get_fuse_inode(dentry->d_inode)->nodeid, + .opcode = FUSE_GETXATTR, + .in_numargs = 2, + .out_numargs = 1, + .in_args[0] = (struct fuse_bpf_in_arg) { + .size = sizeof(fgio->fgi), + .value = &fgio->fgi, + }, + .in_args[1] = (struct fuse_bpf_in_arg) { + .size = strlen(name) + 1, + .value = name, + }, + .flags = size ? FUSE_BPF_OUT_ARGVAR : 0, + .out_args[0].size = size ? size : sizeof(fgio->fgo), + .out_args[0].value = size ? 
value : &fgio->fgo, + }; + return 0; +} + +int fuse_getxattr_backing(struct fuse_bpf_args *fa, + struct dentry *dentry, const char *name, void *value, + size_t size) +{ + ssize_t ret = vfs_getxattr(&init_user_ns, + get_fuse_dentry(dentry)->backing_path.dentry, + fa->in_args[1].value, value, size); + + if (fa->flags & FUSE_BPF_OUT_ARGVAR) + fa->out_args[0].size = ret; + else + ((struct fuse_getxattr_out *)fa->out_args[0].value)->size = ret; + + return 0; +} + +void *fuse_getxattr_finalize(struct fuse_bpf_args *fa, + struct dentry *dentry, const char *name, void *value, + size_t size) +{ + struct fuse_getxattr_out *fgo; + + if (fa->flags & FUSE_BPF_OUT_ARGVAR) + return ERR_PTR(fa->out_args[0].size); + + fgo = fa->out_args[0].value; + + return ERR_PTR(fgo->size); + +} + +int fuse_listxattr_initialize(struct fuse_bpf_args *fa, + struct fuse_getxattr_io *fgio, + struct dentry *dentry, char *list, size_t size) +{ + *fgio = (struct fuse_getxattr_io){ + .fgi.size = size, + }; + + *fa = (struct fuse_bpf_args){ + .nodeid = get_fuse_inode(dentry->d_inode)->nodeid, + .opcode = FUSE_LISTXATTR, + .in_numargs = 1, + .out_numargs = 1, + .in_args[0] = + (struct fuse_bpf_in_arg){ + .size = sizeof(fgio->fgi), + .value = &fgio->fgi, + }, + .flags = size ? FUSE_BPF_OUT_ARGVAR : 0, + .out_args[0].size = size ? size : sizeof(fgio->fgo), + .out_args[0].value = size ? (void *)list : &fgio->fgo, + }; + + return 0; +} + +int fuse_listxattr_backing(struct fuse_bpf_args *fa, struct dentry *dentry, + char *list, size_t size) +{ + ssize_t ret = + vfs_listxattr(get_fuse_dentry(dentry)->backing_path.dentry, + list, size); + + if (ret < 0) + return ret; + + if (fa->flags & FUSE_BPF_OUT_ARGVAR) + fa->out_args[0].size = ret; + else + ((struct fuse_getxattr_out *)fa->out_args[0].value)->size = ret; + + return ret; +} + +void *fuse_listxattr_finalize(struct fuse_bpf_args *fa, struct dentry *dentry, + char *list, size_t size) +{ + struct fuse_getxattr_out *fgo; + + if (fa->error_in) + return NULL; + + if (fa->flags & FUSE_BPF_OUT_ARGVAR) + return ERR_PTR(fa->out_args[0].size); + + fgo = fa->out_args[0].value; + return ERR_PTR(fgo->size); +} + +int fuse_setxattr_initialize(struct fuse_bpf_args *fa, + struct fuse_setxattr_in *fsxi, + struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + *fsxi = (struct fuse_setxattr_in) { + .size = size, + .flags = flags, + }; + + *fa = (struct fuse_bpf_args) { + .nodeid = get_fuse_inode(dentry->d_inode)->nodeid, + .opcode = FUSE_SETXATTR, + .in_numargs = 3, + .in_args[0] = (struct fuse_bpf_in_arg) { + .size = sizeof(*fsxi), + .value = fsxi, + }, + .in_args[1] = (struct fuse_bpf_in_arg) { + .size = strlen(name) + 1, + .value = name, + }, + .in_args[2] = (struct fuse_bpf_in_arg) { + .size = size, + .value = value, + }, + }; + + return 0; +} + +int fuse_setxattr_backing(struct fuse_bpf_args *fa, struct dentry *dentry, + const char *name, const void *value, size_t size, + int flags) +{ + return vfs_setxattr(&init_user_ns, + get_fuse_dentry(dentry)->backing_path.dentry, name, + value, size, flags); +} + +void *fuse_setxattr_finalize(struct fuse_bpf_args *fa, struct dentry *dentry, + const char *name, const void *value, size_t size, + int flags) +{ + return NULL; +} + +int fuse_removexattr_initialize(struct fuse_bpf_args *fa, + struct fuse_dummy_io *unused, + struct dentry *dentry, const char *name) +{ + *fa = (struct fuse_bpf_args) { + .nodeid = get_fuse_inode(dentry->d_inode)->nodeid, + .opcode = FUSE_REMOVEXATTR, + .in_numargs = 1, + .in_args[0] = (struct 
fuse_bpf_in_arg) { + .size = strlen(name) + 1, + .value = name, + }, + }; + + return 0; +} + +int fuse_removexattr_backing(struct fuse_bpf_args *fa, + struct dentry *dentry, const char *name) +{ + struct path *backing_path = + &get_fuse_dentry(dentry)->backing_path; + + /* TODO account for changes of the name by prefilter */ + return vfs_removexattr(&init_user_ns, backing_path->dentry, name); +} + +void *fuse_removexattr_finalize(struct fuse_bpf_args *fa, + struct dentry *dentry, const char *name) +{ + return NULL; +} + +static inline void fuse_bpf_aio_put(struct fuse_bpf_aio_req *aio_req) +{ + if (refcount_dec_and_test(&aio_req->ref)) + kmem_cache_free(fuse_bpf_aio_request_cachep, aio_req); +} + +static void fuse_bpf_aio_cleanup_handler(struct fuse_bpf_aio_req *aio_req) +{ + struct kiocb *iocb = &aio_req->iocb; + struct kiocb *iocb_orig = aio_req->iocb_orig; + + if (iocb->ki_flags & IOCB_WRITE) { + __sb_writers_acquired(file_inode(iocb->ki_filp)->i_sb, + SB_FREEZE_WRITE); + file_end_write(iocb->ki_filp); + fuse_copyattr(iocb_orig->ki_filp, iocb->ki_filp); + } + iocb_orig->ki_pos = iocb->ki_pos; + fuse_bpf_aio_put(aio_req); +} + +static void fuse_bpf_aio_rw_complete(struct kiocb *iocb, long res) +{ + struct fuse_bpf_aio_req *aio_req = + container_of(iocb, struct fuse_bpf_aio_req, iocb); + struct kiocb *iocb_orig = aio_req->iocb_orig; + + fuse_bpf_aio_cleanup_handler(aio_req); + iocb_orig->ki_complete(iocb_orig, res); +} + + +int fuse_file_read_iter_initialize( + struct fuse_bpf_args *fa, struct fuse_file_read_iter_io *fri, + struct kiocb *iocb, struct iov_iter *to) +{ + struct file *file = iocb->ki_filp; + struct fuse_file *ff = file->private_data; + + fri->fri = (struct fuse_read_in) { + .fh = ff->fh, + .offset = iocb->ki_pos, + .size = to->count, + }; + + fri->frio = (struct fuse_read_iter_out) { + .ret = fri->fri.size, + }; + + /* TODO we can't assume 'to' is a kvec */ + /* TODO we also can't assume the vector has only one component */ + *fa = (struct fuse_bpf_args) { + .opcode = FUSE_READ, + .nodeid = ff->nodeid, + .in_numargs = 1, + .in_args[0].size = sizeof(fri->fri), + .in_args[0].value = &fri->fri, + .out_numargs = 1, + .out_args[0].size = sizeof(fri->frio), + .out_args[0].value = &fri->frio, + /* + * TODO Design this properly. 
+ * Possible approach: do not pass buf to bpf + * If going to userland, do a deep copy + * For extra credit, do that to/from the vector, rather than + * making an extra copy in the kernel + */ + }; + + return 0; +} + +int fuse_file_read_iter_backing(struct fuse_bpf_args *fa, + struct kiocb *iocb, struct iov_iter *to) +{ + struct fuse_read_iter_out *frio = fa->out_args[0].value; + struct file *file = iocb->ki_filp; + struct fuse_file *ff = file->private_data; + ssize_t ret; + + if (!iov_iter_count(to)) + return 0; + + if ((iocb->ki_flags & IOCB_DIRECT) && + (!ff->backing_file->f_mapping->a_ops || + !ff->backing_file->f_mapping->a_ops->direct_IO)) + return -EINVAL; + + /* TODO This just plain ignores any change to fuse_read_in */ + if (is_sync_kiocb(iocb)) { + ret = vfs_iter_read(ff->backing_file, to, &iocb->ki_pos, + iocb_to_rw_flags(iocb->ki_flags, FUSE_BPF_IOCB_MASK)); + } else { + struct fuse_bpf_aio_req *aio_req; + + ret = -ENOMEM; + aio_req = kmem_cache_zalloc(fuse_bpf_aio_request_cachep, GFP_KERNEL); + if (!aio_req) + goto out; + + aio_req->iocb_orig = iocb; + kiocb_clone(&aio_req->iocb, iocb, ff->backing_file); + aio_req->iocb.ki_complete = fuse_bpf_aio_rw_complete; + refcount_set(&aio_req->ref, 2); + ret = vfs_iocb_iter_read(ff->backing_file, &aio_req->iocb, to); + fuse_bpf_aio_put(aio_req); + if (ret != -EIOCBQUEUED) + fuse_bpf_aio_cleanup_handler(aio_req); + } + + frio->ret = ret; + + /* TODO Need to point value at the buffer for post-modification */ + +out: + fuse_file_accessed(file, ff->backing_file); + + return ret; +} + +void *fuse_file_read_iter_finalize(struct fuse_bpf_args *fa, + struct kiocb *iocb, struct iov_iter *to) +{ + struct fuse_read_iter_out *frio = fa->out_args[0].value; + + return ERR_PTR(frio->ret); +} + +int fuse_file_write_iter_initialize( + struct fuse_bpf_args *fa, struct fuse_file_write_iter_io *fwio, + struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct fuse_file *ff = file->private_data; + + *fwio = (struct fuse_file_write_iter_io) { + .fwi.fh = ff->fh, + .fwi.offset = iocb->ki_pos, + .fwi.size = from->count, + }; + + /* TODO we can't assume 'from' is a kvec */ + *fa = (struct fuse_bpf_args) { + .opcode = FUSE_WRITE, + .nodeid = ff->nodeid, + .in_numargs = 2, + .in_args[0].size = sizeof(fwio->fwi), + .in_args[0].value = &fwio->fwi, + .in_args[1].size = fwio->fwi.size, + .in_args[1].value = from->kvec->iov_base, + .out_numargs = 1, + .out_args[0].size = sizeof(fwio->fwio), + .out_args[0].value = &fwio->fwio, + }; + + return 0; +} + +int fuse_file_write_iter_backing(struct fuse_bpf_args *fa, + struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct fuse_file *ff = file->private_data; + struct fuse_write_iter_out *fwio = fa->out_args[0].value; + ssize_t ret; + + if (!iov_iter_count(from)) + return 0; + + /* TODO This just plain ignores any change to fuse_write_in */ + /* TODO uint32_t seems smaller than ssize_t.... right? 
*/ + inode_lock(file_inode(file)); + + fuse_copyattr(file, ff->backing_file); + + if (is_sync_kiocb(iocb)) { + file_start_write(ff->backing_file); + ret = vfs_iter_write(ff->backing_file, from, &iocb->ki_pos, + iocb_to_rw_flags(iocb->ki_flags, FUSE_BPF_IOCB_MASK)); + file_end_write(ff->backing_file); + + /* Must reflect change in size of backing file to upper file */ + if (ret > 0) + fuse_copyattr(file, ff->backing_file); + } else { + struct fuse_bpf_aio_req *aio_req; + + ret = -ENOMEM; + aio_req = kmem_cache_zalloc(fuse_bpf_aio_request_cachep, GFP_KERNEL); + if (!aio_req) + goto out; + + file_start_write(ff->backing_file); + __sb_writers_release(file_inode(ff->backing_file)->i_sb, SB_FREEZE_WRITE); + aio_req->iocb_orig = iocb; + kiocb_clone(&aio_req->iocb, iocb, ff->backing_file); + aio_req->iocb.ki_complete = fuse_bpf_aio_rw_complete; + refcount_set(&aio_req->ref, 2); + ret = vfs_iocb_iter_write(ff->backing_file, &aio_req->iocb, from); + fuse_bpf_aio_put(aio_req); + if (ret != -EIOCBQUEUED) + fuse_bpf_aio_cleanup_handler(aio_req); + } + +out: + inode_unlock(file_inode(file)); + fwio->ret = ret; + if (ret < 0) + return ret; + return 0; +} + +void *fuse_file_write_iter_finalize(struct fuse_bpf_args *fa, + struct kiocb *iocb, struct iov_iter *from) +{ + struct fuse_write_iter_out *fwio = fa->out_args[0].value; + + return ERR_PTR(fwio->ret); +} + +ssize_t fuse_backing_mmap(struct file *file, struct vm_area_struct *vma) +{ + int ret; + struct fuse_file *ff = file->private_data; + struct inode *fuse_inode = file_inode(file); + struct file *backing_file = ff->backing_file; + struct inode *backing_inode = file_inode(backing_file); + + if (!backing_file->f_op->mmap) + return -ENODEV; + + if (WARN_ON(file != vma->vm_file)) + return -EIO; + + vma->vm_file = get_file(backing_file); + + ret = call_mmap(vma->vm_file, vma); + + if (ret) + fput(backing_file); + else + fput(file); + + if (file->f_flags & O_NOATIME) + return ret; + + if ((!timespec64_equal(&fuse_inode->i_mtime, + &backing_inode->i_mtime) || + !timespec64_equal(&fuse_inode->i_ctime, + &backing_inode->i_ctime))) { + fuse_inode->i_mtime = backing_inode->i_mtime; + fuse_inode->i_ctime = backing_inode->i_ctime; + } + touch_atime(&file->f_path); + + return ret; +} + +int fuse_file_fallocate_initialize(struct fuse_bpf_args *fa, + struct fuse_fallocate_in *ffi, + struct file *file, int mode, loff_t offset, loff_t length) +{ + struct fuse_file *ff = file->private_data; + + *ffi = (struct fuse_fallocate_in) { + .fh = ff->fh, + .offset = offset, + .length = length, + .mode = mode + }; + + *fa = (struct fuse_bpf_args) { + .opcode = FUSE_FALLOCATE, + .nodeid = ff->nodeid, + .in_numargs = 1, + .in_args[0].size = sizeof(*ffi), + .in_args[0].value = ffi, + }; + + return 0; +} + +int fuse_file_fallocate_backing(struct fuse_bpf_args *fa, + struct file *file, int mode, loff_t offset, loff_t length) +{ + const struct fuse_fallocate_in *ffi = fa->in_args[0].value; + struct fuse_file *ff = file->private_data; + + return vfs_fallocate(ff->backing_file, ffi->mode, ffi->offset, + ffi->length); +} + +void *fuse_file_fallocate_finalize(struct fuse_bpf_args *fa, + struct file *file, int mode, loff_t offset, loff_t length) +{ + return NULL; +} + +/******************************************************************************* + * Directory operations after here * + ******************************************************************************/ + +int fuse_lookup_initialize(struct fuse_bpf_args *fa, struct fuse_lookup_io *fli, + struct inode *dir, struct dentry 
*entry, unsigned int flags) +{ + *fa = (struct fuse_bpf_args) { + .nodeid = get_fuse_inode(dir)->nodeid, + .opcode = FUSE_LOOKUP, + .in_numargs = 1, + .out_numargs = 2, + .flags = FUSE_BPF_OUT_ARGVAR, + .in_args[0] = (struct fuse_bpf_in_arg) { + .size = entry->d_name.len + 1, + .value = entry->d_name.name, + }, + .out_args[0] = (struct fuse_bpf_arg) { + .size = sizeof(fli->feo), + .value = &fli->feo, + }, + .out_args[1] = (struct fuse_bpf_arg) { + .size = sizeof(fli->feb.out), + .value = &fli->feb.out, + }, + }; + + return 0; +} + +int fuse_lookup_backing(struct fuse_bpf_args *fa, struct inode *dir, + struct dentry *entry, unsigned int flags) +{ + struct fuse_dentry *fuse_entry = get_fuse_dentry(entry); + struct fuse_dentry *dir_fuse_entry = get_fuse_dentry(entry->d_parent); + struct dentry *dir_backing_entry = dir_fuse_entry->backing_path.dentry; + struct inode *dir_backing_inode = dir_backing_entry->d_inode; + struct dentry *backing_entry; + struct fuse_entry_out *feo = (void *)fa->out_args[0].value; + struct kstat stat; + int err; + + /* TODO this will not handle lookups over mount points */ + inode_lock_nested(dir_backing_inode, I_MUTEX_PARENT); + backing_entry = lookup_one_len(entry->d_name.name, dir_backing_entry, + strlen(entry->d_name.name)); + inode_unlock(dir_backing_inode); + + if (IS_ERR(backing_entry)) + return PTR_ERR(backing_entry); + + fuse_entry->backing_path = (struct path) { + .dentry = backing_entry, + .mnt = mntget(dir_fuse_entry->backing_path.mnt), + }; + + if (d_is_negative(backing_entry)) { + fa->error_in = -ENOENT; + return 0; + } + + err = vfs_getattr(&fuse_entry->backing_path, &stat, + STATX_BASIC_STATS, 0); + if (err) { + path_put_init(&fuse_entry->backing_path); + return err; + } + + fuse_stat_to_attr(get_fuse_conn(dir), + backing_entry->d_inode, &stat, &feo->attr); + return 0; +} + +int fuse_handle_backing(struct fuse_entry_bpf *feb, struct inode **backing_inode, + struct path *backing_path) +{ + switch (feb->out.backing_action) { + case FUSE_ACTION_KEEP: + /* backing inode/path are added in fuse_lookup_backing */ + break; + + case FUSE_ACTION_REMOVE: + iput(*backing_inode); + *backing_inode = NULL; + path_put_init(backing_path); + break; + + case FUSE_ACTION_REPLACE: { + struct file *backing_file = feb->backing_file; + + if (!backing_file) + return -EINVAL; + if (IS_ERR(backing_file)) + return PTR_ERR(backing_file); + + if (backing_inode) + iput(*backing_inode); + *backing_inode = backing_file->f_inode; + ihold(*backing_inode); + + path_put(backing_path); + *backing_path = backing_file->f_path; + path_get(backing_path); + + fput(backing_file); + break; + } + + default: + return -EINVAL; + } + + return 0; +} + +int fuse_handle_bpf_prog(struct fuse_entry_bpf *feb, struct inode *parent, + struct bpf_prog **bpf) +{ + struct fuse_inode *pi; + + // Parent isn't presented, but we want to keep + // Don't touch bpf program at all in this case + if (feb->out.bpf_action == FUSE_ACTION_KEEP && !parent) + goto out; + + if (*bpf) { + bpf_prog_put(*bpf); + *bpf = NULL; + } + + switch (feb->out.bpf_action) { + case FUSE_ACTION_KEEP: + pi = get_fuse_inode(parent); + *bpf = pi->bpf; + if (*bpf) + bpf_prog_inc(*bpf); + break; + + case FUSE_ACTION_REMOVE: + break; + + case FUSE_ACTION_REPLACE: { + struct file *bpf_file = feb->bpf_file; + struct bpf_prog *bpf_prog = ERR_PTR(-EINVAL); + + if (bpf_file && !IS_ERR(bpf_file)) + bpf_prog = fuse_get_bpf_prog(bpf_file); + + if (IS_ERR(bpf_prog)) + return PTR_ERR(bpf_prog); + + *bpf = bpf_prog; + break; + } + + default: + return -EINVAL; 
+ } + +out: + return 0; +} + +struct dentry *fuse_lookup_finalize(struct fuse_bpf_args *fa, struct inode *dir, + struct dentry *entry, unsigned int flags) +{ + struct fuse_dentry *fd; + struct dentry *bd; + struct inode *inode, *backing_inode; + struct inode *d_inode = entry->d_inode; + struct fuse_entry_out *feo = fa->out_args[0].value; + struct fuse_entry_bpf_out *febo = fa->out_args[1].value; + struct fuse_entry_bpf *feb = container_of(febo, struct fuse_entry_bpf, out); + int error = -1; + u64 target_nodeid = 0; + + fd = get_fuse_dentry(entry); + if (!fd) + return ERR_PTR(-EIO); + bd = fd->backing_path.dentry; + if (!bd) + return ERR_PTR(-ENOENT); + backing_inode = bd->d_inode; + if (!backing_inode) + return 0; + + if (d_inode) + target_nodeid = get_fuse_inode(d_inode)->nodeid; + + inode = fuse_iget_backing(dir->i_sb, target_nodeid, backing_inode); + + if (IS_ERR(inode)) + return ERR_PTR(PTR_ERR(inode)); + + error = fuse_handle_bpf_prog(feb, dir, &get_fuse_inode(inode)->bpf); + if (error) + return ERR_PTR(error); + + error = fuse_handle_backing(feb, &get_fuse_inode(inode)->backing_inode, &fd->backing_path); + if (error) + return ERR_PTR(error); + + get_fuse_inode(inode)->nodeid = feo->nodeid; + + return d_splice_alias(inode, entry); +} + +int fuse_revalidate_backing(struct dentry *entry, unsigned int flags) +{ + struct fuse_dentry *fuse_dentry = get_fuse_dentry(entry); + struct dentry *backing_entry = fuse_dentry->backing_path.dentry; + + spin_lock(&backing_entry->d_lock); + if (d_unhashed(backing_entry)) { + spin_unlock(&backing_entry->d_lock); + return 0; + } + spin_unlock(&backing_entry->d_lock); + + if (unlikely(backing_entry->d_flags & DCACHE_OP_REVALIDATE)) + return backing_entry->d_op->d_revalidate(backing_entry, flags); + return 1; +} + +int fuse_canonical_path_initialize(struct fuse_bpf_args *fa, + struct fuse_dummy_io *fdi, + const struct path *path, + struct path *canonical_path) +{ + fa->opcode = FUSE_CANONICAL_PATH; + return 0; +} + +int fuse_canonical_path_backing(struct fuse_bpf_args *fa, const struct path *path, + struct path *canonical_path) +{ + get_fuse_backing_path(path->dentry, canonical_path); + return 0; +} + +void *fuse_canonical_path_finalize(struct fuse_bpf_args *fa, + const struct path *path, + struct path *canonical_path) +{ + return NULL; +} + +int fuse_mknod_initialize( + struct fuse_bpf_args *fa, struct fuse_mknod_in *fmi, + struct inode *dir, struct dentry *entry, umode_t mode, dev_t rdev) +{ + *fmi = (struct fuse_mknod_in) { + .mode = mode, + .rdev = new_encode_dev(rdev), + .umask = current_umask(), + }; + *fa = (struct fuse_bpf_args) { + .nodeid = get_node_id(dir), + .opcode = FUSE_MKNOD, + .in_numargs = 2, + .in_args[0] = (struct fuse_bpf_in_arg) { + .size = sizeof(*fmi), + .value = fmi, + }, + .in_args[1] = (struct fuse_bpf_in_arg) { + .size = entry->d_name.len + 1, + .value = entry->d_name.name, + }, + }; + + return 0; +} + +int fuse_mknod_backing( + struct fuse_bpf_args *fa, + struct inode *dir, struct dentry *entry, umode_t mode, dev_t rdev) +{ + int err = 0; + const struct fuse_mknod_in *fmi = fa->in_args[0].value; + struct fuse_inode *fuse_inode = get_fuse_inode(dir); + struct inode *backing_inode = fuse_inode->backing_inode; + struct path backing_path = {}; + struct inode *inode = NULL; + + //TODO Actually deal with changing the backing entry in mknod + get_fuse_backing_path(entry, &backing_path); + if (!backing_path.dentry) + return -EBADF; + + inode_lock_nested(backing_inode, I_MUTEX_PARENT); + mode = fmi->mode; + if 
(!IS_POSIXACL(backing_inode)) + mode &= ~fmi->umask; + err = vfs_mknod(&init_user_ns, backing_inode, backing_path.dentry, + mode, new_decode_dev(fmi->rdev)); + inode_unlock(backing_inode); + if (err) + goto out; + if (d_really_is_negative(backing_path.dentry) || + unlikely(d_unhashed(backing_path.dentry))) { + err = -EINVAL; + /** + * TODO: overlayfs responds to this situation with a + * lookupOneLen. Should we do that too? + */ + goto out; + } + inode = fuse_iget_backing(dir->i_sb, fuse_inode->nodeid, backing_inode); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out; + } + d_instantiate(entry, inode); +out: + path_put(&backing_path); + return err; +} + +void *fuse_mknod_finalize( + struct fuse_bpf_args *fa, + struct inode *dir, struct dentry *entry, umode_t mode, dev_t rdev) +{ + return NULL; +} + +int fuse_mkdir_initialize( + struct fuse_bpf_args *fa, struct fuse_mkdir_in *fmi, + struct inode *dir, struct dentry *entry, umode_t mode) +{ + *fmi = (struct fuse_mkdir_in) { + .mode = mode, + .umask = current_umask(), + }; + *fa = (struct fuse_bpf_args) { + .nodeid = get_node_id(dir), + .opcode = FUSE_MKDIR, + .in_numargs = 2, + .in_args[0] = (struct fuse_bpf_in_arg) { + .size = sizeof(*fmi), + .value = fmi, + }, + .in_args[1] = (struct fuse_bpf_in_arg) { + .size = entry->d_name.len + 1, + .value = entry->d_name.name, + }, + }; + + return 0; +} + +int fuse_mkdir_backing( + struct fuse_bpf_args *fa, + struct inode *dir, struct dentry *entry, umode_t mode) +{ + int err = 0; + const struct fuse_mkdir_in *fmi = fa->in_args[0].value; + struct fuse_inode *fuse_inode = get_fuse_inode(dir); + struct inode *backing_inode = fuse_inode->backing_inode; + struct path backing_path = {}; + struct inode *inode = NULL; + struct dentry *d; + + //TODO Actually deal with changing the backing entry in mkdir + get_fuse_backing_path(entry, &backing_path); + if (!backing_path.dentry) + return -EBADF; + + inode_lock_nested(backing_inode, I_MUTEX_PARENT); + mode = fmi->mode; + if (!IS_POSIXACL(backing_inode)) + mode &= ~fmi->umask; + err = vfs_mkdir(&init_user_ns, backing_inode, backing_path.dentry, mode); + if (err) + goto out; + if (d_really_is_negative(backing_path.dentry) || + unlikely(d_unhashed(backing_path.dentry))) { + d = lookup_one_len(entry->d_name.name, backing_path.dentry->d_parent, + entry->d_name.len); + if (IS_ERR(d)) { + err = PTR_ERR(d); + goto out; + } + dput(backing_path.dentry); + backing_path.dentry = d; + } + inode = fuse_iget_backing(dir->i_sb, fuse_inode->nodeid, backing_inode); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out; + } + d_instantiate(entry, inode); +out: + inode_unlock(backing_inode); + path_put(&backing_path); + return err; +} + +void *fuse_mkdir_finalize( + struct fuse_bpf_args *fa, + struct inode *dir, struct dentry *entry, umode_t mode) +{ + return NULL; +} + +int fuse_rmdir_initialize( + struct fuse_bpf_args *fa, struct fuse_dummy_io *dummy, + struct inode *dir, struct dentry *entry) +{ + *fa = (struct fuse_bpf_args) { + .nodeid = get_node_id(dir), + .opcode = FUSE_RMDIR, + .in_numargs = 1, + .in_args[0] = (struct fuse_bpf_in_arg) { + .size = entry->d_name.len + 1, + .value = entry->d_name.name, + }, + }; + + return 0; +} + +int fuse_rmdir_backing( + struct fuse_bpf_args *fa, + struct inode *dir, struct dentry *entry) +{ + int err = 0; + struct path backing_path = {}; + struct dentry *backing_parent_dentry; + struct inode *backing_inode; + + /* TODO Actually deal with changing the backing entry in rmdir */ + get_fuse_backing_path(entry, &backing_path); + if 
(!backing_path.dentry) + return -EBADF; + + /* TODO Not sure if we should reverify like overlayfs, or get inode from d_parent */ + backing_parent_dentry = dget_parent(backing_path.dentry); + backing_inode = d_inode(backing_parent_dentry); + + inode_lock_nested(backing_inode, I_MUTEX_PARENT); + err = vfs_rmdir(&init_user_ns, backing_inode, backing_path.dentry); + inode_unlock(backing_inode); + + dput(backing_parent_dentry); + if (!err) + d_drop(entry); + path_put(&backing_path); + return err; +} + +void *fuse_rmdir_finalize( + struct fuse_bpf_args *fa, + struct inode *dir, struct dentry *entry) +{ + return NULL; +} + +static int fuse_rename_backing_common( + struct inode *olddir, struct dentry *oldent, + struct inode *newdir, struct dentry *newent, + unsigned int flags) +{ + int err = 0; + struct path old_backing_path; + struct path new_backing_path; + struct dentry *old_backing_dir_dentry; + struct dentry *old_backing_dentry; + struct dentry *new_backing_dir_dentry; + struct dentry *new_backing_dentry; + struct dentry *trap = NULL; + struct inode *target_inode; + struct renamedata rd; + + //TODO Actually deal with changing anything that isn't a flag + get_fuse_backing_path(oldent, &old_backing_path); + if (!old_backing_path.dentry) + return -EBADF; + get_fuse_backing_path(newent, &new_backing_path); + if (!new_backing_path.dentry) { + /* + * TODO A file being moved from a backing path to another + * backing path which is not yet instrumented with FUSE-BPF. + * This may be slow and should be substituted with something + * more clever. + */ + err = -EXDEV; + goto put_old_path; + } + if (new_backing_path.mnt != old_backing_path.mnt) { + err = -EXDEV; + goto put_new_path; + } + old_backing_dentry = old_backing_path.dentry; + new_backing_dentry = new_backing_path.dentry; + old_backing_dir_dentry = dget_parent(old_backing_dentry); + new_backing_dir_dentry = dget_parent(new_backing_dentry); + target_inode = d_inode(newent); + + trap = lock_rename(old_backing_dir_dentry, new_backing_dir_dentry); + if (trap == old_backing_dentry) { + err = -EINVAL; + goto put_parents; + } + if (trap == new_backing_dentry) { + err = -ENOTEMPTY; + goto put_parents; + } + rd = (struct renamedata) { + .old_mnt_userns = &init_user_ns, + .old_dir = d_inode(old_backing_dir_dentry), + .old_dentry = old_backing_dentry, + .new_mnt_userns = &init_user_ns, + .new_dir = d_inode(new_backing_dir_dentry), + .new_dentry = new_backing_dentry, + .flags = flags, + }; + err = vfs_rename(&rd); + if (err) + goto unlock; + if (target_inode) + fsstack_copy_attr_all(target_inode, + get_fuse_inode(target_inode)->backing_inode); + fsstack_copy_attr_all(d_inode(oldent), d_inode(old_backing_dentry)); +unlock: + unlock_rename(old_backing_dir_dentry, new_backing_dir_dentry); +put_parents: + dput(new_backing_dir_dentry); + dput(old_backing_dir_dentry); +put_new_path: + path_put(&new_backing_path); +put_old_path: + path_put(&old_backing_path); + return err; +} + +int fuse_rename2_initialize(struct fuse_bpf_args *fa, struct fuse_rename2_in *fri, + struct inode *olddir, struct dentry *oldent, + struct inode *newdir, struct dentry *newent, + unsigned int flags) +{ + *fri = (struct fuse_rename2_in) { + .newdir = get_node_id(newdir), + .flags = flags, + }; + *fa = (struct fuse_bpf_args) { + .nodeid = get_node_id(olddir), + .opcode = FUSE_RENAME2, + .in_numargs = 3, + .in_args[0] = (struct fuse_bpf_in_arg) { + .size = sizeof(*fri), + .value = fri, + }, + .in_args[1] = (struct fuse_bpf_in_arg) { + .size = oldent->d_name.len + 1, + .value = 
oldent->d_name.name, + }, + .in_args[2] = (struct fuse_bpf_in_arg) { + .size = newent->d_name.len + 1, + .value = newent->d_name.name, + }, + }; + + return 0; +} + +int fuse_rename2_backing(struct fuse_bpf_args *fa, + struct inode *olddir, struct dentry *oldent, + struct inode *newdir, struct dentry *newent, + unsigned int flags) +{ + const struct fuse_rename2_in *fri = fa->in_args[0].value; + + /* TODO: deal with changing dirs/ents */ + return fuse_rename_backing_common(olddir, oldent, newdir, newent, fri->flags); +} + +void *fuse_rename2_finalize(struct fuse_bpf_args *fa, + struct inode *olddir, struct dentry *oldent, + struct inode *newdir, struct dentry *newent, + unsigned int flags) +{ + return NULL; +} + +int fuse_rename_initialize(struct fuse_bpf_args *fa, struct fuse_rename_in *fri, + struct inode *olddir, struct dentry *oldent, + struct inode *newdir, struct dentry *newent) +{ + *fri = (struct fuse_rename_in) { + .newdir = get_node_id(newdir), + }; + *fa = (struct fuse_bpf_args) { + .nodeid = get_node_id(olddir), + .opcode = FUSE_RENAME, + .in_numargs = 3, + .in_args[0] = (struct fuse_bpf_in_arg) { + .size = sizeof(*fri), + .value = fri, + }, + .in_args[1] = (struct fuse_bpf_in_arg) { + .size = oldent->d_name.len + 1, + .value = oldent->d_name.name, + }, + .in_args[2] = (struct fuse_bpf_in_arg) { + .size = newent->d_name.len + 1, + .value = newent->d_name.name, + }, + }; + + return 0; +} + +int fuse_rename_backing(struct fuse_bpf_args *fa, + struct inode *olddir, struct dentry *oldent, + struct inode *newdir, struct dentry *newent) +{ + /* TODO: deal with changing dirs/ents */ + return fuse_rename_backing_common(olddir, oldent, newdir, newent, 0); +} + +void *fuse_rename_finalize(struct fuse_bpf_args *fa, + struct inode *olddir, struct dentry *oldent, + struct inode *newdir, struct dentry *newent) +{ + return NULL; +} + +int fuse_unlink_initialize( + struct fuse_bpf_args *fa, struct fuse_dummy_io *dummy, + struct inode *dir, struct dentry *entry) +{ + *fa = (struct fuse_bpf_args) { + .nodeid = get_node_id(dir), + .opcode = FUSE_UNLINK, + .in_numargs = 1, + .in_args[0] = (struct fuse_bpf_in_arg) { + .size = entry->d_name.len + 1, + .value = entry->d_name.name, + }, + }; + + return 0; +} + +int fuse_unlink_backing( + struct fuse_bpf_args *fa, + struct inode *dir, struct dentry *entry) +{ + int err = 0; + struct path backing_path = {}; + struct dentry *backing_parent_dentry; + struct inode *backing_inode; + + /* TODO Actually deal with changing the backing entry in unlink */ + get_fuse_backing_path(entry, &backing_path); + if (!backing_path.dentry) + return -EBADF; + + /* TODO Not sure if we should reverify like overlayfs, or get inode from d_parent */ + backing_parent_dentry = dget_parent(backing_path.dentry); + backing_inode = d_inode(backing_parent_dentry); + + inode_lock_nested(backing_inode, I_MUTEX_PARENT); + err = vfs_unlink(&init_user_ns, backing_inode, backing_path.dentry, NULL); + inode_unlock(backing_inode); + + dput(backing_parent_dentry); + if (!err) + d_drop(entry); + path_put(&backing_path); + return err; +} + +void *fuse_unlink_finalize( + struct fuse_bpf_args *fa, + struct inode *dir, struct dentry *entry) +{ + return NULL; +} + +int fuse_link_initialize(struct fuse_bpf_args *fa, struct fuse_link_in *fli, + struct dentry *entry, struct inode *dir, + struct dentry *newent) +{ + struct inode *src_inode = entry->d_inode; + + *fli = (struct fuse_link_in){ + .oldnodeid = get_node_id(src_inode), + }; + + fa->opcode = FUSE_LINK; + fa->in_numargs = 2; + fa->in_args[0].size = 
sizeof(*fli); + fa->in_args[0].value = fli; + fa->in_args[1].size = newent->d_name.len + 1; + fa->in_args[1].value = newent->d_name.name; + + return 0; +} + +int fuse_link_backing(struct fuse_bpf_args *fa, struct dentry *entry, + struct inode *dir, struct dentry *newent) +{ + int err = 0; + struct path backing_old_path = {}; + struct path backing_new_path = {}; + struct dentry *backing_dir_dentry; + struct inode *fuse_new_inode = NULL; + struct fuse_inode *fuse_dir_inode = get_fuse_inode(dir); + struct inode *backing_dir_inode = fuse_dir_inode->backing_inode; + + get_fuse_backing_path(entry, &backing_old_path); + if (!backing_old_path.dentry) + return -EBADF; + + get_fuse_backing_path(newent, &backing_new_path); + if (!backing_new_path.dentry) { + err = -EBADF; + goto err_dst_path; + } + + backing_dir_dentry = dget_parent(backing_new_path.dentry); + backing_dir_inode = d_inode(backing_dir_dentry); + + inode_lock_nested(backing_dir_inode, I_MUTEX_PARENT); + err = vfs_link(backing_old_path.dentry, &init_user_ns, + backing_dir_inode, backing_new_path.dentry, NULL); + inode_unlock(backing_dir_inode); + if (err) + goto out; + + if (d_really_is_negative(backing_new_path.dentry) || + unlikely(d_unhashed(backing_new_path.dentry))) { + err = -EINVAL; + /** + * TODO: overlayfs responds to this situation with a + * lookupOneLen. Should we do that too? + */ + goto out; + } + + fuse_new_inode = fuse_iget_backing(dir->i_sb, fuse_dir_inode->nodeid, backing_dir_inode); + if (IS_ERR(fuse_new_inode)) { + err = PTR_ERR(fuse_new_inode); + goto out; + } + d_instantiate(newent, fuse_new_inode); + +out: + dput(backing_dir_dentry); + path_put(&backing_new_path); +err_dst_path: + path_put(&backing_old_path); + return err; +} + +void *fuse_link_finalize(struct fuse_bpf_args *fa, struct dentry *entry, + struct inode *dir, struct dentry *newent) +{ + return NULL; +} + +int fuse_getattr_initialize(struct fuse_bpf_args *fa, struct fuse_getattr_io *fgio, + const struct dentry *entry, struct kstat *stat, + u32 request_mask, unsigned int flags) +{ + fgio->fgi = (struct fuse_getattr_in) { + .getattr_flags = flags, + .fh = -1, /* TODO is this OK? 
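+		 * (the backing implementation below never reads fh; it calls
+		 *  vfs_getattr() on the backing path instead)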
*/ + }; + + fgio->fao = (struct fuse_attr_out) {0}; + + *fa = (struct fuse_bpf_args) { + .nodeid = get_node_id(entry->d_inode), + .opcode = FUSE_GETATTR, + .in_numargs = 1, + .out_numargs = 1, + .in_args[0] = (struct fuse_bpf_in_arg) { + .size = sizeof(fgio->fgi), + .value = &fgio->fgi, + }, + .out_args[0] = (struct fuse_bpf_arg) { + .size = sizeof(fgio->fao), + .value = &fgio->fao, + }, + }; + + return 0; +} + +static void fuse_stat_to_attr(struct fuse_conn *fc, struct inode *inode, + struct kstat *stat, struct fuse_attr *attr) +{ + unsigned int blkbits; + + /* see the comment in fuse_change_attributes() */ + if (fc->writeback_cache && S_ISREG(inode->i_mode)) { + stat->size = i_size_read(inode); + stat->mtime.tv_sec = inode->i_mtime.tv_sec; + stat->mtime.tv_nsec = inode->i_mtime.tv_nsec; + stat->ctime.tv_sec = inode->i_ctime.tv_sec; + stat->ctime.tv_nsec = inode->i_ctime.tv_nsec; + } + + attr->ino = stat->ino; + attr->mode = (inode->i_mode & S_IFMT) | (stat->mode & 07777); + attr->nlink = stat->nlink; + attr->uid = from_kuid(fc->user_ns, stat->uid); + attr->gid = from_kgid(fc->user_ns, stat->gid); + attr->atime = stat->atime.tv_sec; + attr->atimensec = stat->atime.tv_nsec; + attr->mtime = stat->mtime.tv_sec; + attr->mtimensec = stat->mtime.tv_nsec; + attr->ctime = stat->ctime.tv_sec; + attr->ctimensec = stat->ctime.tv_nsec; + attr->size = stat->size; + attr->blocks = stat->blocks; + + if (stat->blksize != 0) + blkbits = ilog2(stat->blksize); + else + blkbits = inode->i_sb->s_blocksize_bits; + + attr->blksize = 1 << blkbits; +} + +int fuse_getattr_backing(struct fuse_bpf_args *fa, + const struct dentry *entry, struct kstat *stat, + u32 request_mask, unsigned int flags) +{ + struct path *backing_path = + &get_fuse_dentry(entry)->backing_path; + struct inode *backing_inode = backing_path->dentry->d_inode; + struct fuse_attr_out *fao = fa->out_args[0].value; + struct kstat tmp; + int err; + + if (!stat) + stat = &tmp; + + err = vfs_getattr(backing_path, stat, request_mask, flags); + + if (!err) + fuse_stat_to_attr(get_fuse_conn(entry->d_inode), + backing_inode, stat, &fao->attr); + + return err; +} + +void *fuse_getattr_finalize(struct fuse_bpf_args *fa, + const struct dentry *entry, struct kstat *stat, + u32 request_mask, unsigned int flags) +{ + struct fuse_attr_out *outarg = fa->out_args[0].value; + struct inode *inode = entry->d_inode; + u64 attr_version = fuse_get_attr_version(get_fuse_mount(inode)->fc); + int err = 0; + + /* TODO: Ensure this doesn't happen if we had an error getting attrs in + * backing. 
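+	 * (fa->error_in is not inspected here; finalize_attr() runs regardless)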
+ */ + err = finalize_attr(inode, outarg, attr_version, stat); + return ERR_PTR(err); +} + +static void fattr_to_iattr(struct fuse_conn *fc, + const struct fuse_setattr_in *arg, + struct iattr *iattr) +{ + unsigned int fvalid = arg->valid; + + if (fvalid & FATTR_MODE) + iattr->ia_valid |= ATTR_MODE, iattr->ia_mode = arg->mode; + if (fvalid & FATTR_UID) { + iattr->ia_valid |= ATTR_UID; + iattr->ia_uid = make_kuid(fc->user_ns, arg->uid); + } + if (fvalid & FATTR_GID) { + iattr->ia_valid |= ATTR_GID; + iattr->ia_gid = make_kgid(fc->user_ns, arg->gid); + } + if (fvalid & FATTR_SIZE) + iattr->ia_valid |= ATTR_SIZE, iattr->ia_size = arg->size; + if (fvalid & FATTR_ATIME) { + iattr->ia_valid |= ATTR_ATIME; + iattr->ia_atime.tv_sec = arg->atime; + iattr->ia_atime.tv_nsec = arg->atimensec; + if (!(fvalid & FATTR_ATIME_NOW)) + iattr->ia_valid |= ATTR_ATIME_SET; + } + if (fvalid & FATTR_MTIME) { + iattr->ia_valid |= ATTR_MTIME; + iattr->ia_mtime.tv_sec = arg->mtime; + iattr->ia_mtime.tv_nsec = arg->mtimensec; + if (!(fvalid & FATTR_MTIME_NOW)) + iattr->ia_valid |= ATTR_MTIME_SET; + } + if (fvalid & FATTR_CTIME) { + iattr->ia_valid |= ATTR_CTIME; + iattr->ia_ctime.tv_sec = arg->ctime; + iattr->ia_ctime.tv_nsec = arg->ctimensec; + } +} + +int fuse_setattr_initialize(struct fuse_bpf_args *fa, struct fuse_setattr_io *fsio, + struct dentry *dentry, struct iattr *attr, struct file *file) +{ + struct fuse_conn *fc = get_fuse_conn(dentry->d_inode); + + *fsio = (struct fuse_setattr_io) {0}; + iattr_to_fattr(fc, attr, &fsio->fsi, true); + + *fa = (struct fuse_bpf_args) { + .opcode = FUSE_SETATTR, + .nodeid = get_node_id(dentry->d_inode), + .in_numargs = 1, + .in_args[0].size = sizeof(fsio->fsi), + .in_args[0].value = &fsio->fsi, + .out_numargs = 1, + .out_args[0].size = sizeof(fsio->fao), + .out_args[0].value = &fsio->fao, + }; + + return 0; +} + +int fuse_setattr_backing(struct fuse_bpf_args *fa, + struct dentry *dentry, struct iattr *attr, struct file *file) +{ + struct fuse_conn *fc = get_fuse_conn(dentry->d_inode); + const struct fuse_setattr_in *fsi = fa->in_args[0].value; + struct iattr new_attr = {0}; + struct path *backing_path = &get_fuse_dentry(dentry)->backing_path; + int res; + + fattr_to_iattr(fc, fsi, &new_attr); + /* TODO: Some info doesn't get saved by the attr->fattr->attr transition + * When we actually allow the bpf to change these, we may have to consider + * the extra flags more, or pass more info into the bpf. Until then we can + * keep everything except for ATTR_FILE, since we'd need a file on the + * lower fs. For what it's worth, neither f2fs nor ext4 make use of that + * even if it is present. 
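+	 * (hence ATTR_FILE is masked out of ia_valid before notify_change() below)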
+ */ + new_attr.ia_valid = attr->ia_valid & ~ATTR_FILE; + inode_lock(d_inode(backing_path->dentry)); + res = notify_change(&init_user_ns, backing_path->dentry, &new_attr, + NULL); + inode_unlock(d_inode(backing_path->dentry)); + + if (res == 0 && (new_attr.ia_valid & ATTR_SIZE)) + i_size_write(dentry->d_inode, new_attr.ia_size); + return res; +} + +void *fuse_setattr_finalize(struct fuse_bpf_args *fa, + struct dentry *dentry, struct iattr *attr, struct file *file) +{ + return NULL; +} + +int fuse_statfs_initialize( + struct fuse_bpf_args *fa, struct fuse_statfs_out *fso, + struct dentry *dentry, struct kstatfs *buf) +{ + *fso = (struct fuse_statfs_out) {0}; + *fa = (struct fuse_bpf_args) { + .nodeid = get_node_id(d_inode(dentry)), + .opcode = FUSE_STATFS, + .out_numargs = 1, + .out_numargs = 1, + .out_args[0].size = sizeof(fso), + .out_args[0].value = fso, + }; + + return 0; +} + +int fuse_statfs_backing( + struct fuse_bpf_args *fa, + struct dentry *dentry, struct kstatfs *buf) +{ + int err = 0; + struct path backing_path; + struct fuse_statfs_out *fso = fa->out_args[0].value; + + get_fuse_backing_path(dentry, &backing_path); + if (!backing_path.dentry) + return -EBADF; + err = vfs_statfs(&backing_path, buf); + path_put(&backing_path); + buf->f_type = FUSE_SUPER_MAGIC; + + //TODO Provide postfilter opportunity to modify + if (!err) + convert_statfs_to_fuse(&fso->st, buf); + + return err; +} + +void *fuse_statfs_finalize( + struct fuse_bpf_args *fa, + struct dentry *dentry, struct kstatfs *buf) +{ + struct fuse_statfs_out *fso = fa->out_args[0].value; + + if (!fa->error_in) + convert_fuse_statfs(buf, &fso->st); + return NULL; +} + +int fuse_get_link_initialize(struct fuse_bpf_args *fa, struct fuse_dummy_io *unused, + struct inode *inode, struct dentry *dentry, + struct delayed_call *callback, const char **out) +{ + /* + * TODO + * If we want to handle changing these things, we'll need to copy + * the lower fs's data into our own buffer, and provide our own callback + * to free that buffer. 
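+	 * (for now the backing implementation simply returns vfs_get_link() on the
+	 *  backing dentry and lets the lower filesystem's delayed_call release it)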
+ * + * Pre could change the name we're looking at + * postfilter can change the name we return + * + * We ought to only make that buffer if it's been requested, so leaving + * this unimplemented for the moment + */ + *fa = (struct fuse_bpf_args) { + .opcode = FUSE_READLINK, + .nodeid = get_node_id(inode), + .in_numargs = 1, + .in_args[0] = (struct fuse_bpf_in_arg) { + .size = dentry->d_name.len + 1, + .value = dentry->d_name.name, + }, + /* + * .out_argvar = 1, + * .out_numargs = 1, + * .out_args[0].size = , + * .out_args[0].value = , + */ + }; + + return 0; +} + +int fuse_get_link_backing(struct fuse_bpf_args *fa, + struct inode *inode, struct dentry *dentry, + struct delayed_call *callback, const char **out) +{ + struct path backing_path; + + if (!dentry) { + *out = ERR_PTR(-ECHILD); + return PTR_ERR(*out); + } + + get_fuse_backing_path(dentry, &backing_path); + if (!backing_path.dentry) { + *out = ERR_PTR(-ECHILD); + return PTR_ERR(*out); + } + + /* + * TODO: If we want to do our own thing, copy the data and then call the + * callback + */ + *out = vfs_get_link(backing_path.dentry, callback); + + path_put(&backing_path); + return 0; +} + +void *fuse_get_link_finalize(struct fuse_bpf_args *fa, + struct inode *inode, struct dentry *dentry, + struct delayed_call *callback, const char **out) +{ + return NULL; +} + +int fuse_symlink_initialize( + struct fuse_bpf_args *fa, struct fuse_dummy_io *unused, + struct inode *dir, struct dentry *entry, const char *link, int len) +{ + *fa = (struct fuse_bpf_args) { + .nodeid = get_node_id(dir), + .opcode = FUSE_SYMLINK, + .in_numargs = 2, + .in_args[0] = (struct fuse_bpf_in_arg) { + .size = entry->d_name.len + 1, + .value = entry->d_name.name, + }, + .in_args[1] = (struct fuse_bpf_in_arg) { + .size = len, + .value = link, + }, + }; + + return 0; +} + +int fuse_symlink_backing( + struct fuse_bpf_args *fa, + struct inode *dir, struct dentry *entry, const char *link, int len) +{ + int err = 0; + struct fuse_inode *fuse_inode = get_fuse_inode(dir); + struct inode *backing_inode = fuse_inode->backing_inode; + struct path backing_path = {}; + struct inode *inode = NULL; + + //TODO Actually deal with changing the backing entry in symlink + get_fuse_backing_path(entry, &backing_path); + if (!backing_path.dentry) + return -EBADF; + + inode_lock_nested(backing_inode, I_MUTEX_PARENT); + err = vfs_symlink(&init_user_ns, backing_inode, backing_path.dentry, + link); + inode_unlock(backing_inode); + if (err) + goto out; + if (d_really_is_negative(backing_path.dentry) || + unlikely(d_unhashed(backing_path.dentry))) { + err = -EINVAL; + /** + * TODO: overlayfs responds to this situation with a + * lookupOneLen. Should we do that too? 
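+		 * (fuse_mkdir_backing() above handles the same case by re-looking
+		 *  the entry up with lookup_one_len())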
+ */ + goto out; + } + inode = fuse_iget_backing(dir->i_sb, fuse_inode->nodeid, backing_inode); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out; + } + d_instantiate(entry, inode); +out: + path_put(&backing_path); + return err; +} + +void *fuse_symlink_finalize( + struct fuse_bpf_args *fa, + struct inode *dir, struct dentry *entry, const char *link, int len) +{ + return NULL; +} + +int fuse_readdir_initialize(struct fuse_bpf_args *fa, struct fuse_read_io *frio, + struct file *file, struct dir_context *ctx, + bool *force_again, bool *allow_force, bool is_continued) +{ + struct fuse_file *ff = file->private_data; + u8 *page = (u8 *)__get_free_page(GFP_KERNEL); + + if (!page) + return -ENOMEM; + + *fa = (struct fuse_bpf_args) { + .nodeid = ff->nodeid, + .opcode = FUSE_READDIR, + .in_numargs = 1, + .flags = FUSE_BPF_OUT_ARGVAR, + .out_numargs = 2, + .in_args[0] = (struct fuse_bpf_in_arg) { + .size = sizeof(frio->fri), + .value = &frio->fri, + }, + .out_args[0] = (struct fuse_bpf_arg) { + .size = sizeof(frio->fro), + .value = &frio->fro, + }, + .out_args[1] = (struct fuse_bpf_arg) { + .size = PAGE_SIZE, + .value = page, + }, + }; + + frio->fri = (struct fuse_read_in) { + .fh = ff->fh, + .offset = ctx->pos, + .size = PAGE_SIZE, + }; + frio->fro = (struct fuse_read_out) { + .again = 0, + .offset = 0, + }; + *force_again = false; + *allow_force = true; + return 0; +} + +struct extfuse_ctx { + struct dir_context ctx; + u8 *addr; + size_t offset; +}; + +static bool filldir(struct dir_context *ctx, const char *name, int namelen, + loff_t offset, u64 ino, unsigned int d_type) +{ + struct extfuse_ctx *ec = container_of(ctx, struct extfuse_ctx, ctx); + struct fuse_dirent *fd = (struct fuse_dirent *) (ec->addr + ec->offset); + + if (ec->offset + sizeof(struct fuse_dirent) + namelen > PAGE_SIZE) + return false; + + *fd = (struct fuse_dirent) { + .ino = ino, + .off = offset, + .namelen = namelen, + .type = d_type, + }; + + memcpy(fd->name, name, namelen); + ec->offset += FUSE_DIRENT_SIZE(fd); + + return true; +} + +static int parse_dirfile(char *buf, size_t nbytes, struct dir_context *ctx) +{ + while (nbytes >= FUSE_NAME_OFFSET) { + struct fuse_dirent *dirent = (struct fuse_dirent *) buf; + size_t reclen = FUSE_DIRENT_SIZE(dirent); + + if (!dirent->namelen || dirent->namelen > FUSE_NAME_MAX) + return -EIO; + if (reclen > nbytes) + break; + if (memchr(dirent->name, '/', dirent->namelen) != NULL) + return -EIO; + + ctx->pos = dirent->off; + if (!dir_emit(ctx, dirent->name, dirent->namelen, dirent->ino, + dirent->type)) + break; + + buf += reclen; + nbytes -= reclen; + } + + return 0; +} + + +int fuse_readdir_backing(struct fuse_bpf_args *fa, + struct file *file, struct dir_context *ctx, + bool *force_again, bool *allow_force, bool is_continued) +{ + struct fuse_file *ff = file->private_data; + struct file *backing_dir = ff->backing_file; + struct fuse_read_out *fro = fa->out_args[0].value; + struct extfuse_ctx ec; + int err; + + ec = (struct extfuse_ctx) { + .ctx.actor = filldir, + .ctx.pos = ctx->pos, + .addr = fa->out_args[1].value, + }; + + if (!ec.addr) + return -ENOMEM; + + if (!is_continued) + backing_dir->f_pos = file->f_pos; + + err = iterate_dir(backing_dir, &ec.ctx); + if (ec.offset == 0) + *allow_force = false; + fa->out_args[1].size = ec.offset; + + fro->offset = ec.ctx.pos; + fro->again = false; + return err; +} + +void *fuse_readdir_finalize(struct fuse_bpf_args *fa, + struct file *file, struct dir_context *ctx, + bool *force_again, bool *allow_force, bool is_continued) +{ + struct 
fuse_read_out *fro = fa->out_args[0].value; + struct fuse_file *ff = file->private_data; + struct file *backing_dir = ff->backing_file; + int err = 0; + + err = parse_dirfile(fa->out_args[1].value, fa->out_args[1].size, ctx); + *force_again = !!fro->again; + if (*force_again && !*allow_force) + err = -EINVAL; + + ctx->pos = fro->offset; + backing_dir->f_pos = fro->offset; + + free_page((unsigned long) fa->out_args[1].value); + return ERR_PTR(err); +} + +int fuse_access_initialize(struct fuse_bpf_args *fa, struct fuse_access_in *fai, + struct inode *inode, int mask) +{ + *fai = (struct fuse_access_in) { + .mask = mask, + }; + + *fa = (struct fuse_bpf_args) { + .opcode = FUSE_ACCESS, + .nodeid = get_node_id(inode), + .in_numargs = 1, + .in_args[0].size = sizeof(*fai), + .in_args[0].value = fai, + }; + + return 0; +} + +int fuse_access_backing(struct fuse_bpf_args *fa, struct inode *inode, int mask) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + const struct fuse_access_in *fai = fa->in_args[0].value; + + return inode_permission(&init_user_ns, fi->backing_inode, fai->mask); +} + +void *fuse_access_finalize(struct fuse_bpf_args *fa, struct inode *inode, int mask) +{ + return NULL; +} + +int __init fuse_bpf_init(void) +{ + fuse_bpf_aio_request_cachep = kmem_cache_create("fuse_bpf_aio_req", + sizeof(struct fuse_bpf_aio_req), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (!fuse_bpf_aio_request_cachep) + return -ENOMEM; + + return 0; +} + +void __exit fuse_bpf_cleanup(void) +{ + kmem_cache_destroy(fuse_bpf_aio_request_cachep); +} + +ssize_t fuse_bpf_simple_request(struct fuse_mount *fm, struct fuse_bpf_args *bpf_args) +{ + int i; + ssize_t res; + struct fuse_args args = { + .nodeid = bpf_args->nodeid, + .opcode = bpf_args->opcode, + .error_in = bpf_args->error_in, + .in_numargs = bpf_args->in_numargs, + .out_numargs = bpf_args->out_numargs, + .force = !!(bpf_args->flags & FUSE_BPF_FORCE), + .out_argvar = !!(bpf_args->flags & FUSE_BPF_OUT_ARGVAR), + }; + + for (i = 0; i < args.in_numargs; ++i) + args.in_args[i] = (struct fuse_in_arg) { + .size = bpf_args->in_args[i].size, + .value = bpf_args->in_args[i].value, + }; + for (i = 0; i < args.out_numargs; ++i) + args.out_args[i] = (struct fuse_arg) { + .size = bpf_args->out_args[i].size, + .value = bpf_args->out_args[i].value, + }; + + res = fuse_simple_request(fm, &args); + + *bpf_args = (struct fuse_bpf_args) { + .nodeid = args.nodeid, + .opcode = args.opcode, + .error_in = args.error_in, + .in_numargs = args.in_numargs, + .out_numargs = args.out_numargs, + }; + if (args.force) + bpf_args->flags |= FUSE_BPF_FORCE; + if (args.out_argvar) + bpf_args->flags |= FUSE_BPF_OUT_ARGVAR; + for (i = 0; i < args.in_numargs; ++i) + bpf_args->in_args[i] = (struct fuse_bpf_in_arg) { + .size = args.in_args[i].size, + .value = args.in_args[i].value, + }; + for (i = 0; i < args.out_numargs; ++i) + bpf_args->out_args[i] = (struct fuse_bpf_arg) { + .size = args.out_args[i].size, + .value = args.out_args[i].value, + }; + return res; +} diff --git a/fs/fuse/control.c b/fs/fuse/control.c index 247ef4f76761..685552453751 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -378,7 +378,7 @@ int __init fuse_ctl_init(void) return register_filesystem(&fuse_ctl_fs_type); } -void __exit fuse_ctl_cleanup(void) +void fuse_ctl_cleanup(void) { unregister_filesystem(&fuse_ctl_fs_type); } diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index aac986034464..155e0732a46a 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -242,6 +242,11 @@ void fuse_queue_forget(struct fuse_conn *fc, struct 
fuse_forget_link *forget, { struct fuse_iqueue *fiq = &fc->iq; + if (nodeid == 0) { + kfree(forget); + return; + } + forget->forget_one.nodeid = nodeid; forget->forget_one.nlookup = nlookup; @@ -479,6 +484,7 @@ static void fuse_args_to_req(struct fuse_req *req, struct fuse_args *args) { req->in.h.opcode = args->opcode; req->in.h.nodeid = args->nodeid; + req->in.h.padding = args->error_in; req->args = args; if (args->end) __set_bit(FR_ASYNC, &req->flags); @@ -1934,6 +1940,19 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud, kern_path(path, 0, req->args->canonical_path); } + if (!err && (req->in.h.opcode == FUSE_LOOKUP || + req->in.h.opcode == (FUSE_LOOKUP | FUSE_POSTFILTER)) && + req->args->out_args[1].size == sizeof(struct fuse_entry_bpf_out)) { + struct fuse_entry_bpf_out *febo = (struct fuse_entry_bpf_out *) + req->args->out_args[1].value; + struct fuse_entry_bpf *feb = container_of(febo, struct fuse_entry_bpf, out); + + if (febo->backing_action == FUSE_ACTION_REPLACE) + feb->backing_file = fget(febo->backing_fd); + if (febo->bpf_action == FUSE_ACTION_REPLACE) + feb->bpf_file = fget(febo->bpf_fd); + } + spin_lock(&fpq->lock); clear_bit(FR_LOCKED, &req->flags); if (!fpq->connected) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index fe49b2cb98e4..d7c90d733bb7 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -8,8 +8,10 @@ #include "fuse_i.h" +#include #include #include +#include #include #include #include @@ -27,6 +29,8 @@ module_param(allow_sys_admin_access, bool, 0644); MODULE_PARM_DESC(allow_sys_admin_access, "Allow users with CAP_SYS_ADMIN in initial userns to bypass allow_other access check"); +#include "../internal.h" + static void fuse_advise_use_readdirplus(struct inode *dir) { struct fuse_inode *fi = get_fuse_inode(dir); @@ -34,7 +38,7 @@ static void fuse_advise_use_readdirplus(struct inode *dir) set_bit(FUSE_I_ADVISE_RDPLUS, &fi->state); } -#if BITS_PER_LONG >= 64 +#if BITS_PER_LONG >= 64 && !defined(CONFIG_FUSE_BPF) static inline void __fuse_dentry_settime(struct dentry *entry, u64 time) { entry->d_fsdata = (void *) time; @@ -46,19 +50,15 @@ static inline u64 fuse_dentry_time(const struct dentry *entry) } #else -union fuse_dentry { - u64 time; - struct rcu_head rcu; -}; static inline void __fuse_dentry_settime(struct dentry *dentry, u64 time) { - ((union fuse_dentry *) dentry->d_fsdata)->time = time; + ((struct fuse_dentry *) dentry->d_fsdata)->time = time; } static inline u64 fuse_dentry_time(const struct dentry *entry) { - return ((union fuse_dentry *) entry->d_fsdata)->time; + return ((struct fuse_dentry *) entry->d_fsdata)->time; } #endif @@ -83,26 +83,16 @@ static void fuse_dentry_settime(struct dentry *dentry, u64 time) __fuse_dentry_settime(dentry, time); } -/* - * FUSE caches dentries and attributes with separate timeout. The - * time in jiffies until the dentry/attributes are valid is stored in - * dentry->d_fsdata and fuse_inode->i_time respectively. 
- */ - -/* - * Calculate the time in jiffies until a dentry/attributes are valid - */ -static u64 time_to_jiffies(u64 sec, u32 nsec) +void fuse_init_dentry_root(struct dentry *root, struct file *backing_dir) { - if (sec || nsec) { - struct timespec64 ts = { - sec, - min_t(u32, nsec, NSEC_PER_SEC - 1) - }; +#ifdef CONFIG_FUSE_BPF + struct fuse_dentry *fuse_dentry = root->d_fsdata; - return get_jiffies_64() + timespec64_to_jiffies(&ts); - } else - return 0; + if (backing_dir) { + fuse_dentry->backing_path = backing_dir->f_path; + path_get(&fuse_dentry->backing_path); + } +#endif } /* @@ -115,11 +105,6 @@ void fuse_change_entry_timeout(struct dentry *entry, struct fuse_entry_out *o) time_to_jiffies(o->entry_valid, o->entry_valid_nsec)); } -static u64 attr_timeout(struct fuse_attr_out *o) -{ - return time_to_jiffies(o->attr_valid, o->attr_valid_nsec); -} - u64 entry_attr_timeout(struct fuse_entry_out *o) { return time_to_jiffies(o->attr_valid, o->attr_valid_nsec); @@ -180,7 +165,8 @@ static void fuse_invalidate_entry(struct dentry *entry) static void fuse_lookup_init(struct fuse_conn *fc, struct fuse_args *args, u64 nodeid, const struct qstr *name, - struct fuse_entry_out *outarg) + struct fuse_entry_out *outarg, + struct fuse_entry_bpf_out *bpf_outarg) { memset(outarg, 0, sizeof(struct fuse_entry_out)); args->opcode = FUSE_LOOKUP; @@ -188,11 +174,52 @@ static void fuse_lookup_init(struct fuse_conn *fc, struct fuse_args *args, args->in_numargs = 1; args->in_args[0].size = name->len + 1; args->in_args[0].value = name->name; - args->out_numargs = 1; + args->out_argvar = true; + args->out_numargs = 2; args->out_args[0].size = sizeof(struct fuse_entry_out); args->out_args[0].value = outarg; + args->out_args[1].size = sizeof(struct fuse_entry_bpf_out); + args->out_args[1].value = bpf_outarg; } +#ifdef CONFIG_FUSE_BPF +static bool backing_data_changed(struct fuse_inode *fi, struct dentry *entry, + struct fuse_entry_bpf *bpf_arg) +{ + struct path new_backing_path; + struct inode *new_backing_inode; + struct bpf_prog *bpf = NULL; + int err; + bool ret = true; + + if (!entry) + return false; + + get_fuse_backing_path(entry, &new_backing_path); + new_backing_inode = fi->backing_inode; + ihold(new_backing_inode); + + err = fuse_handle_backing(bpf_arg, &new_backing_inode, &new_backing_path); + + if (err) + goto put_inode; + + err = fuse_handle_bpf_prog(bpf_arg, entry->d_parent->d_inode, &bpf); + if (err) + goto put_bpf; + + ret = (bpf != fi->bpf || fi->backing_inode != new_backing_inode || + !path_equal(&get_fuse_dentry(entry)->backing_path, &new_backing_path)); +put_bpf: + if (bpf) + bpf_prog_put(bpf); +put_inode: + iput(new_backing_inode); + path_put(&new_backing_path); + return ret; +} +#endif + /* * Check whether the dentry is still valid * @@ -213,9 +240,23 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) inode = d_inode_rcu(entry); if (inode && fuse_is_bad(inode)) goto invalid; - else if (time_before64(fuse_dentry_time(entry), get_jiffies_64()) || + +#ifdef CONFIG_FUSE_BPF + /* TODO: Do we need bpf support for revalidate? + * If the lower filesystem says the entry is invalid, FUSE probably shouldn't + * try to fix that without going through the normal lookup path... 
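+	 * For now, a zero or negative result from the backing ->d_revalidate()
+	 * is returned to the caller unchanged.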
+ */ + if (get_fuse_dentry(entry)->backing_path.dentry) { + ret = fuse_revalidate_backing(entry, flags); + if (ret <= 0) { + goto out; + } + } +#endif + if (time_before64(fuse_dentry_time(entry), get_jiffies_64()) || (flags & (LOOKUP_EXCL | LOOKUP_REVAL))) { struct fuse_entry_out outarg; + struct fuse_entry_bpf bpf_arg; FUSE_ARGS(args); struct fuse_forget_link *forget; u64 attr_version; @@ -227,27 +268,44 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) ret = -ECHILD; if (flags & LOOKUP_RCU) goto out; - fm = get_fuse_mount(inode); + parent = dget_parent(entry); + +#ifdef CONFIG_FUSE_BPF + /* TODO: Once we're handling timeouts for backing inodes, do a + * bpf based lookup_revalidate here. + */ + if (get_fuse_inode(parent->d_inode)->backing_inode) { + dput(parent); + ret = 1; + goto out; + } +#endif forget = fuse_alloc_forget(); ret = -ENOMEM; - if (!forget) + if (!forget) { + dput(parent); goto out; + } attr_version = fuse_get_attr_version(fm->fc); - parent = dget_parent(entry); fuse_lookup_init(fm->fc, &args, get_node_id(d_inode(parent)), - &entry->d_name, &outarg); + &entry->d_name, &outarg, &bpf_arg.out); ret = fuse_simple_request(fm, &args); dput(parent); + /* Zero nodeid is same as -ENOENT */ if (!ret && !outarg.nodeid) ret = -ENOENT; - if (!ret) { + if (!ret || ret == sizeof(bpf_arg.out)) { fi = get_fuse_inode(inode); if (outarg.nodeid != get_node_id(inode) || +#ifdef CONFIG_FUSE_BPF + (ret == sizeof(bpf_arg.out) && + backing_data_changed(fi, entry, &bpf_arg)) || +#endif (bool) IS_AUTOMOUNT(inode) != (bool) (outarg.attr.flags & FUSE_ATTR_SUBMOUNT)) { fuse_queue_forget(fm->fc, forget, outarg.nodeid, 1); @@ -289,17 +347,20 @@ invalid: goto out; } -#if BITS_PER_LONG < 64 +#if BITS_PER_LONG < 64 || defined(CONFIG_FUSE_BPF) static int fuse_dentry_init(struct dentry *dentry) { - dentry->d_fsdata = kzalloc(sizeof(union fuse_dentry), + dentry->d_fsdata = kzalloc(sizeof(struct fuse_dentry), GFP_KERNEL_ACCOUNT | __GFP_RECLAIMABLE); return dentry->d_fsdata ? 
0 : -ENOMEM; } static void fuse_dentry_release(struct dentry *dentry) { - union fuse_dentry *fd = dentry->d_fsdata; + struct fuse_dentry *fd = dentry->d_fsdata; + + if (fd && fd->backing_path.dentry) + path_put(&fd->backing_path); kfree_rcu(fd, rcu); } @@ -353,6 +414,18 @@ static void fuse_dentry_canonical_path(const struct path *path, char *path_name; int err; +#ifdef CONFIG_FUSE_BPF + struct fuse_err_ret fer; + + fer = fuse_bpf_backing(inode, struct fuse_dummy_io, + fuse_canonical_path_initialize, + fuse_canonical_path_backing, + fuse_canonical_path_finalize, path, + canonical_path); + if (fer.ret) + return; +#endif + path_name = (char *)get_zeroed_page(GFP_KERNEL); if (!path_name) goto default_path; @@ -379,7 +452,7 @@ default_path: const struct dentry_operations fuse_dentry_operations = { .d_revalidate = fuse_dentry_revalidate, .d_delete = fuse_dentry_delete, -#if BITS_PER_LONG < 64 +#if BITS_PER_LONG < 64 || defined(CONFIG_FUSE_BPF) .d_init = fuse_dentry_init, .d_release = fuse_dentry_release, #endif @@ -388,7 +461,7 @@ const struct dentry_operations fuse_dentry_operations = { }; const struct dentry_operations fuse_root_dentry_operations = { -#if BITS_PER_LONG < 64 +#if BITS_PER_LONG < 64 || defined(CONFIG_FUSE_BPF) .d_init = fuse_dentry_init, .d_release = fuse_dentry_release, #endif @@ -407,10 +480,13 @@ bool fuse_invalid_attr(struct fuse_attr *attr) } int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name, - struct fuse_entry_out *outarg, struct inode **inode) + struct fuse_entry_out *outarg, + struct dentry *entry, + struct inode **inode) { struct fuse_mount *fm = get_fuse_mount_super(sb); FUSE_ARGS(args); + struct fuse_entry_bpf bpf_arg = {0}; struct fuse_forget_link *forget; u64 attr_version; int err; @@ -428,23 +504,68 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name attr_version = fuse_get_attr_version(fm->fc); - fuse_lookup_init(fm->fc, &args, nodeid, name, outarg); + fuse_lookup_init(fm->fc, &args, nodeid, name, outarg, &bpf_arg.out); err = fuse_simple_request(fm, &args); - /* Zero nodeid is same as -ENOENT, but with valid timeout */ - if (err || !outarg->nodeid) - goto out_put_forget; - err = -EIO; - if (!outarg->nodeid) - goto out_put_forget; - if (fuse_invalid_attr(&outarg->attr)) - goto out_put_forget; +#ifdef CONFIG_FUSE_BPF + if (err == sizeof(bpf_arg.out)) { + /* TODO Make sure this handles invalid handles */ + struct file *backing_file; + struct inode *backing_inode; + + err = -ENOENT; + if (!entry) + goto out_queue_forget; + + err = -EINVAL; + backing_file = bpf_arg.backing_file; + if (!backing_file) + goto out_queue_forget; + + if (IS_ERR(backing_file)) { + err = PTR_ERR(backing_file); + goto out_queue_forget; + } + + backing_inode = backing_file->f_inode; + *inode = fuse_iget_backing(sb, outarg->nodeid, backing_inode); + if (!*inode) + goto bpf_arg_out; + + err = fuse_handle_backing(&bpf_arg, + &get_fuse_inode(*inode)->backing_inode, + &get_fuse_dentry(entry)->backing_path); + if (err) + goto out; + + err = fuse_handle_bpf_prog(&bpf_arg, NULL, &get_fuse_inode(*inode)->bpf); + if (err) + goto out; +bpf_arg_out: + fput(backing_file); + } else +#endif + { + /* Zero nodeid is same as -ENOENT, but with valid timeout */ + if (err || !outarg->nodeid) + goto out_put_forget; + + err = -EIO; + if (!outarg->nodeid) + goto out_put_forget; + if (fuse_invalid_attr(&outarg->attr)) + goto out_put_forget; + + *inode = fuse_iget(sb, outarg->nodeid, outarg->generation, + &outarg->attr, entry_attr_timeout(outarg), + 
attr_version); + } - *inode = fuse_iget(sb, outarg->nodeid, outarg->generation, - &outarg->attr, entry_attr_timeout(outarg), - attr_version); err = -ENOMEM; - if (!*inode) { +#ifdef CONFIG_FUSE_BPF +out_queue_forget: +#endif + if (!*inode && outarg->nodeid) { fuse_queue_forget(fm->fc, forget, outarg->nodeid, 1); goto out; } @@ -466,12 +587,23 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, bool outarg_valid = true; bool locked; +#ifdef CONFIG_FUSE_BPF + struct fuse_err_ret fer; + + fer = fuse_bpf_backing(dir, struct fuse_lookup_io, + fuse_lookup_initialize, fuse_lookup_backing, + fuse_lookup_finalize, + dir, entry, flags); + if (fer.ret) + return fer.result; +#endif + if (fuse_is_bad(dir)) return ERR_PTR(-EIO); locked = fuse_lock_inode(dir); err = fuse_lookup_name(dir->i_sb, get_node_id(dir), &entry->d_name, - &outarg, &inode); + &outarg, entry, &inode); fuse_unlock_inode(dir, locked); if (err == -ENOENT) { outarg_valid = false; @@ -589,6 +721,20 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, /* Userspace expects S_IFREG in create mode */ BUG_ON((mode & S_IFMT) != S_IFREG); +#ifdef CONFIG_FUSE_BPF + { + struct fuse_err_ret fer; + + fer = fuse_bpf_backing(dir, struct fuse_create_open_io, + fuse_create_open_initialize, + fuse_create_open_backing, + fuse_create_open_finalize, + dir, entry, file, flags, mode); + if (fer.ret) + return PTR_ERR(fer.result); + } +#endif + forget = fuse_alloc_forget(); err = -ENOMEM; if (!forget) @@ -822,6 +968,17 @@ static int fuse_mknod(struct user_namespace *mnt_userns, struct inode *dir, struct fuse_mount *fm = get_fuse_mount(dir); FUSE_ARGS(args); +#ifdef CONFIG_FUSE_BPF + struct fuse_err_ret fer; + + fer = fuse_bpf_backing(dir, struct fuse_mknod_in, + fuse_mknod_initialize, fuse_mknod_backing, + fuse_mknod_finalize, + dir, entry, mode, rdev); + if (fer.ret) + return PTR_ERR(fer.result); +#endif + if (!fm->fc->dont_mask) mode &= ~current_umask(); @@ -868,6 +1025,17 @@ static int fuse_mkdir(struct user_namespace *mnt_userns, struct inode *dir, struct fuse_mount *fm = get_fuse_mount(dir); FUSE_ARGS(args); +#ifdef CONFIG_FUSE_BPF + struct fuse_err_ret fer; + + fer = fuse_bpf_backing(dir, struct fuse_mkdir_in, + fuse_mkdir_initialize, fuse_mkdir_backing, + fuse_mkdir_finalize, + dir, entry, mode); + if (fer.ret) + return PTR_ERR(fer.result); +#endif + if (!fm->fc->dont_mask) mode &= ~current_umask(); @@ -890,6 +1058,17 @@ static int fuse_symlink(struct user_namespace *mnt_userns, struct inode *dir, unsigned len = strlen(link) + 1; FUSE_ARGS(args); +#ifdef CONFIG_FUSE_BPF + struct fuse_err_ret fer; + + fer = fuse_bpf_backing(dir, struct fuse_dummy_io, + fuse_symlink_initialize, fuse_symlink_backing, + fuse_symlink_finalize, + dir, entry, link, len); + if (fer.ret) + return PTR_ERR(fer.result); +#endif + args.opcode = FUSE_SYMLINK; args.in_numargs = 2; args.in_args[0].size = entry->d_name.len + 1; @@ -953,6 +1132,20 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry) if (fuse_is_bad(dir)) return -EIO; +#ifdef CONFIG_FUSE_BPF + { + struct fuse_err_ret fer; + + fer = fuse_bpf_backing(dir, struct fuse_dummy_io, + fuse_unlink_initialize, + fuse_unlink_backing, + fuse_unlink_finalize, + dir, entry); + if (fer.ret) + return PTR_ERR(fer.result); + } +#endif + args.opcode = FUSE_UNLINK; args.nodeid = get_node_id(dir); args.in_numargs = 1; @@ -976,6 +1169,20 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry) if (fuse_is_bad(dir)) return -EIO; +#ifdef CONFIG_FUSE_BPF + { + struct fuse_err_ret 
fer; + + fer = fuse_bpf_backing(dir, struct fuse_dummy_io, + fuse_rmdir_initialize, + fuse_rmdir_backing, + fuse_rmdir_finalize, + dir, entry); + if (fer.ret) + return PTR_ERR(fer.result); + } +#endif + args.opcode = FUSE_RMDIR; args.nodeid = get_node_id(dir); args.in_numargs = 1; @@ -1054,6 +1261,18 @@ static int fuse_rename2(struct user_namespace *mnt_userns, struct inode *olddir, return -EINVAL; if (flags) { +#ifdef CONFIG_FUSE_BPF + struct fuse_err_ret fer; + + fer = fuse_bpf_backing(olddir, struct fuse_rename2_in, + fuse_rename2_initialize, fuse_rename2_backing, + fuse_rename2_finalize, + olddir, oldent, newdir, newent, flags); + if (fer.ret) + return PTR_ERR(fer.result); +#endif + + /* TODO: how should this go with bpfs involved? */ if (fc->no_rename2 || fc->minor < 23) return -EINVAL; @@ -1065,6 +1284,17 @@ static int fuse_rename2(struct user_namespace *mnt_userns, struct inode *olddir, err = -EINVAL; } } else { +#ifdef CONFIG_FUSE_BPF + struct fuse_err_ret fer; + + fer = fuse_bpf_backing(olddir, struct fuse_rename_in, + fuse_rename_initialize, fuse_rename_backing, + fuse_rename_finalize, + olddir, oldent, newdir, newent); + if (fer.ret) + return PTR_ERR(fer.result); +#endif + err = fuse_rename_common(olddir, oldent, newdir, newent, 0, FUSE_RENAME, sizeof(struct fuse_rename_in)); @@ -1082,6 +1312,16 @@ static int fuse_link(struct dentry *entry, struct inode *newdir, struct fuse_mount *fm = get_fuse_mount(inode); FUSE_ARGS(args); +#ifdef CONFIG_FUSE_BPF + struct fuse_err_ret fer; + + fer = fuse_bpf_backing(inode, struct fuse_link_in, fuse_link_initialize, + fuse_link_backing, fuse_link_finalize, entry, + newdir, newent); + if (fer.ret) + return PTR_ERR(fer.result); +#endif + memset(&inarg, 0, sizeof(inarg)); inarg.oldnodeid = get_node_id(inode); args.opcode = FUSE_LINK; @@ -1099,7 +1339,7 @@ static int fuse_link(struct dentry *entry, struct inode *newdir, return err; } -static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr, +void fuse_fillattr(struct inode *inode, struct fuse_attr *attr, struct kstat *stat) { unsigned int blkbits; @@ -1159,23 +1399,13 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat, args.out_args[0].size = sizeof(outarg); args.out_args[0].value = &outarg; err = fuse_simple_request(fm, &args); - if (!err) { - if (fuse_invalid_attr(&outarg.attr) || - inode_wrong_type(inode, outarg.attr.mode)) { - fuse_make_bad(inode); - err = -EIO; - } else { - fuse_change_attributes(inode, &outarg.attr, - attr_timeout(&outarg), - attr_version); - if (stat) - fuse_fillattr(inode, &outarg.attr, stat); - } - } + if (!err) + err = finalize_attr(inode, &outarg, attr_version, stat); return err; } static int fuse_update_get_attr(struct inode *inode, struct file *file, + const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { @@ -1185,6 +1415,17 @@ static int fuse_update_get_attr(struct inode *inode, struct file *file, u32 inval_mask = READ_ONCE(fi->inval_mask); u32 cache_mask = fuse_get_cache_mask(inode); +#ifdef CONFIG_FUSE_BPF + struct fuse_err_ret fer; + + fer = fuse_bpf_backing(inode, struct fuse_getattr_io, + fuse_getattr_initialize, fuse_getattr_backing, + fuse_getattr_finalize, + path->dentry, stat, request_mask, flags); + if (fer.ret) + return PTR_ERR(fer.result); +#endif + if (flags & AT_STATX_FORCE_SYNC) sync = true; else if (flags & AT_STATX_DONT_SYNC) @@ -1208,7 +1449,9 @@ static int fuse_update_get_attr(struct inode *inode, struct file *file, int fuse_update_attributes(struct inode *inode, struct file *file, u32 
mask) { - return fuse_update_get_attr(inode, file, NULL, mask, 0); + /* Do *not* need to get atime for internal purposes */ + return fuse_update_get_attr(inode, file, &file->f_path, NULL, + mask & ~STATX_ATIME, 0); } int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid, @@ -1319,6 +1562,16 @@ static int fuse_access(struct inode *inode, int mask) struct fuse_access_in inarg; int err; +#ifdef CONFIG_FUSE_BPF + struct fuse_err_ret fer; + + fer = fuse_bpf_backing(inode, struct fuse_access_in, + fuse_access_initialize, fuse_access_backing, + fuse_access_finalize, inode, mask); + if (fer.ret) + return PTR_ERR(fer.result); +#endif + BUG_ON(mask & MAY_NOT_BLOCK); if (fm->fc->no_access) @@ -1367,6 +1620,10 @@ static int fuse_permission(struct user_namespace *mnt_userns, struct fuse_conn *fc = get_fuse_conn(inode); bool refreshed = false; int err = 0; + struct fuse_inode *fi = get_fuse_inode(inode); +#ifdef CONFIG_FUSE_BPF + struct fuse_err_ret fer; +#endif if (fuse_is_bad(inode)) return -EIO; @@ -1374,12 +1631,19 @@ static int fuse_permission(struct user_namespace *mnt_userns, if (!fuse_allow_current_process(fc)) return -EACCES; +#ifdef CONFIG_FUSE_BPF + fer = fuse_bpf_backing(inode, struct fuse_access_in, + fuse_access_initialize, fuse_access_backing, + fuse_access_finalize, inode, mask); + if (fer.ret) + return PTR_ERR(fer.result); +#endif + /* * If attributes are needed, refresh them before proceeding */ if (fc->default_permissions || ((mask & MAY_EXEC) && S_ISREG(inode->i_mode))) { - struct fuse_inode *fi = get_fuse_inode(inode); u32 perm_mask = STATX_MODE | STATX_UID | STATX_GID; if (perm_mask & READ_ONCE(fi->inval_mask) || @@ -1470,6 +1734,21 @@ static const char *fuse_get_link(struct dentry *dentry, struct inode *inode, if (fuse_is_bad(inode)) goto out_err; +#ifdef CONFIG_FUSE_BPF + { + struct fuse_err_ret fer; + const char *out = NULL; + + fer = fuse_bpf_backing(inode, struct fuse_dummy_io, + fuse_get_link_initialize, + fuse_get_link_backing, + fuse_get_link_finalize, + inode, dentry, callback, &out); + if (fer.ret) + return fer.result ?: out; + } +#endif + if (fc->cache_symlinks) return page_get_link(dentry, inode, callback); @@ -1503,8 +1782,18 @@ static int fuse_dir_open(struct inode *inode, struct file *file) static int fuse_dir_release(struct inode *inode, struct file *file) { - fuse_release_common(file, true); +#ifdef CONFIG_FUSE_BPF + struct fuse_err_ret fer; + fer = fuse_bpf_backing(inode, struct fuse_release_in, + fuse_releasedir_initialize, fuse_release_backing, + fuse_release_finalize, + inode, file); + if (fer.ret) + return PTR_ERR(fer.result); +#endif + + fuse_release_common(file, true); return 0; } @@ -1518,6 +1807,19 @@ static int fuse_dir_fsync(struct file *file, loff_t start, loff_t end, if (fuse_is_bad(inode)) return -EIO; +#ifdef CONFIG_FUSE_BPF + { + struct fuse_err_ret fer; + + fer = fuse_bpf_backing(inode, struct fuse_fsync_in, + fuse_dir_fsync_initialize, fuse_fsync_backing, + fuse_fsync_finalize, + file, start, end, datasync); + if (fer.ret) + return PTR_ERR(fer.result); + } +#endif + if (fc->no_fsyncdir) return 0; @@ -1556,58 +1858,6 @@ static long fuse_dir_compat_ioctl(struct file *file, unsigned int cmd, FUSE_IOCTL_COMPAT | FUSE_IOCTL_DIR); } -static bool update_mtime(unsigned ivalid, bool trust_local_mtime) -{ - /* Always update if mtime is explicitly set */ - if (ivalid & ATTR_MTIME_SET) - return true; - - /* Or if kernel i_mtime is the official one */ - if (trust_local_mtime) - return true; - - /* If it's an open(O_TRUNC) or an ftruncate(), 
don't update */ - if ((ivalid & ATTR_SIZE) && (ivalid & (ATTR_OPEN | ATTR_FILE))) - return false; - - /* In all other cases update */ - return true; -} - -static void iattr_to_fattr(struct fuse_conn *fc, struct iattr *iattr, - struct fuse_setattr_in *arg, bool trust_local_cmtime) -{ - unsigned ivalid = iattr->ia_valid; - - if (ivalid & ATTR_MODE) - arg->valid |= FATTR_MODE, arg->mode = iattr->ia_mode; - if (ivalid & ATTR_UID) - arg->valid |= FATTR_UID, arg->uid = from_kuid(fc->user_ns, iattr->ia_uid); - if (ivalid & ATTR_GID) - arg->valid |= FATTR_GID, arg->gid = from_kgid(fc->user_ns, iattr->ia_gid); - if (ivalid & ATTR_SIZE) - arg->valid |= FATTR_SIZE, arg->size = iattr->ia_size; - if (ivalid & ATTR_ATIME) { - arg->valid |= FATTR_ATIME; - arg->atime = iattr->ia_atime.tv_sec; - arg->atimensec = iattr->ia_atime.tv_nsec; - if (!(ivalid & ATTR_ATIME_SET)) - arg->valid |= FATTR_ATIME_NOW; - } - if ((ivalid & ATTR_MTIME) && update_mtime(ivalid, trust_local_cmtime)) { - arg->valid |= FATTR_MTIME; - arg->mtime = iattr->ia_mtime.tv_sec; - arg->mtimensec = iattr->ia_mtime.tv_nsec; - if (!(ivalid & ATTR_MTIME_SET) && !trust_local_cmtime) - arg->valid |= FATTR_MTIME_NOW; - } - if ((ivalid & ATTR_CTIME) && trust_local_cmtime) { - arg->valid |= FATTR_CTIME; - arg->ctime = iattr->ia_ctime.tv_sec; - arg->ctimensec = iattr->ia_ctime.tv_nsec; - } -} - /* * Prevent concurrent writepages on inode * @@ -1722,6 +1972,16 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, bool trust_local_cmtime = is_wb; bool fault_blocked = false; +#ifdef CONFIG_FUSE_BPF + struct fuse_err_ret fer; + + fer = fuse_bpf_backing(inode, struct fuse_setattr_io, + fuse_setattr_initialize, fuse_setattr_backing, + fuse_setattr_finalize, dentry, attr, file); + if (fer.ret) + return PTR_ERR(fer.result); +#endif + if (!fc->default_permissions) attr->ia_valid |= ATTR_FORCE; @@ -1897,11 +2157,22 @@ static int fuse_setattr(struct user_namespace *mnt_userns, struct dentry *entry, * This should be done on write(), truncate() and chown(). */ if (!fc->handle_killpriv && !fc->handle_killpriv_v2) { +#ifdef CONFIG_FUSE_BPF + struct fuse_err_ret fer; + /* * ia_mode calculation may have used stale i_mode. * Refresh and recalculate. 
*/ - ret = fuse_do_getattr(inode, NULL, file); + fer = fuse_bpf_backing(inode, struct fuse_getattr_io, + fuse_getattr_initialize, fuse_getattr_backing, + fuse_getattr_finalize, + entry, NULL, 0, 0); + if (fer.ret) + ret = PTR_ERR(fer.result); + else +#endif + ret = fuse_do_getattr(inode, NULL, file); if (ret) return ret; @@ -1958,7 +2229,8 @@ static int fuse_getattr(struct user_namespace *mnt_userns, return -EACCES; } - return fuse_update_get_attr(inode, NULL, stat, request_mask, flags); + return fuse_update_get_attr(inode, NULL, path, stat, request_mask, + flags); } static const struct inode_operations fuse_dir_inode_operations = { diff --git a/fs/fuse/file.c b/fs/fuse/file.c index edceaa0f0c14..bccf5cf7ec2a 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -8,6 +8,7 @@ #include "fuse_i.h" +#include #include #include #include @@ -235,6 +236,20 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir) if (err) return err; +#ifdef CONFIG_FUSE_BPF + { + struct fuse_err_ret fer; + + fer = fuse_bpf_backing(inode, struct fuse_open_io, + fuse_open_initialize, + fuse_open_backing, + fuse_open_finalize, + inode, file, isdir); + if (fer.ret) + return PTR_ERR(fer.result); + } +#endif + if (is_wb_truncate || dax_truncate) inode_lock(inode); @@ -346,6 +361,17 @@ static int fuse_release(struct inode *inode, struct file *file) { struct fuse_conn *fc = get_fuse_conn(inode); +#ifdef CONFIG_FUSE_BPF + struct fuse_err_ret fer; + + fer = fuse_bpf_backing(inode, struct fuse_release_in, + fuse_release_initialize, fuse_release_backing, + fuse_release_finalize, + inode, file); + if (fer.ret) + return PTR_ERR(fer.result); +#endif + /* * Dirty pages might remain despite write_inode_now() call from * fuse_flush() due to writes racing with the close. @@ -488,6 +514,17 @@ static int fuse_flush(struct file *file, fl_owner_t id) FUSE_ARGS(args); int err; +#ifdef CONFIG_FUSE_BPF + struct fuse_err_ret fer; + + fer = fuse_bpf_backing(file->f_inode, struct fuse_flush_in, + fuse_flush_initialize, fuse_flush_backing, + fuse_flush_finalize, + file, id); + if (fer.ret) + return PTR_ERR(fer.result); +#endif + if (fuse_is_bad(inode)) return -EIO; @@ -563,6 +600,17 @@ static int fuse_fsync(struct file *file, loff_t start, loff_t end, struct fuse_conn *fc = get_fuse_conn(inode); int err; +#ifdef CONFIG_FUSE_BPF + struct fuse_err_ret fer; + + fer = fuse_bpf_backing(inode, struct fuse_fsync_in, + fuse_fsync_initialize, fuse_fsync_backing, + fuse_fsync_finalize, + file, start, end, datasync); + if (fer.ret) + return PTR_ERR(fer.result); +#endif + if (fuse_is_bad(inode)) return -EIO; @@ -1600,6 +1648,20 @@ static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to) if (FUSE_IS_DAX(inode)) return fuse_dax_read_iter(iocb, to); +#ifdef CONFIG_FUSE_BPF + { + struct fuse_err_ret fer; + + fer = fuse_bpf_backing(inode, struct fuse_file_read_iter_io, + fuse_file_read_iter_initialize, + fuse_file_read_iter_backing, + fuse_file_read_iter_finalize, + iocb, to); + if (fer.ret) + return PTR_ERR(fer.result); + } +#endif + if (ff->passthrough.filp) return fuse_passthrough_read_iter(iocb, to); else if (!(ff->open_flags & FOPEN_DIRECT_IO)) @@ -1620,6 +1682,20 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (FUSE_IS_DAX(inode)) return fuse_dax_write_iter(iocb, from); +#ifdef CONFIG_FUSE_BPF + { + struct fuse_err_ret fer; + + fer = fuse_bpf_backing(inode, struct fuse_file_write_iter_io, + fuse_file_write_iter_initialize, + fuse_file_write_iter_backing, + 
fuse_file_write_iter_finalize, + iocb, from); + if (fer.ret) + return PTR_ERR(fer.result); + } +#endif + if (ff->passthrough.filp) return fuse_passthrough_write_iter(iocb, from); else if (!(ff->open_flags & FOPEN_DIRECT_IO)) @@ -1868,6 +1944,19 @@ int fuse_write_inode(struct inode *inode, struct writeback_control *wbc) struct fuse_file *ff; int err; + /** + * TODO - fully understand why this is necessary + * + * With fuse-bpf, fsstress fails if rename is enabled without this + * + * We are getting writes here on directory inodes, which do not have an + * initialized file list so crash. + * + * The question is why we are getting those writes + */ + if (!S_ISREG(inode->i_mode)) + return 0; + /* * Inode is always written before the last reference is dropped and * hence this should not be reached from reclaim. @@ -2439,6 +2528,12 @@ static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) if (FUSE_IS_DAX(file_inode(file))) return fuse_dax_mmap(file, vma); +#ifdef CONFIG_FUSE_BPF + /* TODO - this is simply passthrough, not a proper BPF filter */ + if (ff->backing_file) + return fuse_backing_mmap(file, vma); +#endif + if (ff->passthrough.filp) return fuse_passthrough_mmap(file, vma); @@ -2687,6 +2782,17 @@ static loff_t fuse_file_llseek(struct file *file, loff_t offset, int whence) { loff_t retval; struct inode *inode = file_inode(file); +#ifdef CONFIG_FUSE_BPF + struct fuse_err_ret fer; + + fer = fuse_bpf_backing(inode, struct fuse_lseek_io, + fuse_lseek_initialize, + fuse_lseek_backing, + fuse_lseek_finalize, + file, offset, whence); + if (fer.ret) + return PTR_ERR(fer.result); +#endif switch (whence) { case SEEK_SET: @@ -2976,6 +3082,18 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, (!(mode & FALLOC_FL_KEEP_SIZE) || (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE))); +#ifdef CONFIG_FUSE_BPF + struct fuse_err_ret fer; + + fer = fuse_bpf_backing(inode, struct fuse_fallocate_in, + fuse_file_fallocate_initialize, + fuse_file_fallocate_backing, + fuse_file_fallocate_finalize, + file, mode, offset, length); + if (fer.ret) + return PTR_ERR(fer.result); +#endif + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) return -EOPNOTSUPP; @@ -3079,6 +3197,18 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in, bool is_unstable = (!fc->writeback_cache) && ((pos_out + len) > inode_out->i_size); +#ifdef CONFIG_FUSE_BPF + struct fuse_err_ret fer; + + fer = fuse_bpf_backing(file_in->f_inode, struct fuse_copy_file_range_io, + fuse_copy_file_range_initialize, + fuse_copy_file_range_backing, + fuse_copy_file_range_finalize, + file_in, pos_in, file_out, pos_out, len, flags); + if (fer.ret) + return PTR_ERR(fer.result); +#endif + if (fc->no_copy_file_range) return -EOPNOTSUPP; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 4d8a98646c0c..637a30e0cbcb 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -13,6 +13,9 @@ # define pr_fmt(fmt) "fuse: " fmt #endif +#include +#include +#include #include #include #include @@ -31,6 +34,9 @@ #include #include #include +#include + +#define FUSE_SUPER_MAGIC 0x65735546 /** Default max number of pages that can be used in a single read request */ #define FUSE_DEFAULT_MAX_PAGES_PER_REQ 32 @@ -63,11 +69,57 @@ struct fuse_forget_link { struct fuse_forget_link *next; }; +/** FUSE specific dentry data */ +#if BITS_PER_LONG < 64 || defined(CONFIG_FUSE_BPF) +struct fuse_dentry { + union { + u64 time; + struct rcu_head rcu; + }; + struct path backing_path; +}; + +static 
inline struct fuse_dentry *get_fuse_dentry(const struct dentry *entry) +{ + return entry->d_fsdata; +} +#endif + +#ifdef CONFIG_FUSE_BPF +static inline void get_fuse_backing_path(const struct dentry *d, + struct path *path) +{ + struct fuse_dentry *di = get_fuse_dentry(d); + + if (!di) { + *path = (struct path) {}; + return; + } + + *path = di->backing_path; + path_get(path); +} +#endif + /** FUSE inode */ struct fuse_inode { /** Inode data */ struct inode inode; +#ifdef CONFIG_FUSE_BPF + /** + * Backing inode, if this inode is from a backing file system. + * If this is set, nodeid is 0. + */ + struct inode *backing_inode; + + /** + * bpf_prog, run on all operations to determine whether to pass through + * or handle in place + */ + struct bpf_prog *bpf; +#endif + /** Unique ID, which identifies the inode between userspace * and kernel */ u64 nodeid; @@ -232,6 +284,14 @@ struct fuse_file { /** Container for data related to the passthrough functionality */ struct fuse_passthrough passthrough; +#ifdef CONFIG_FUSE_BPF + /** + * TODO: Reconcile with passthrough file + * backing file when in bpf mode + */ + struct file *backing_file; +#endif + /** RB node to be linked on fuse_conn->polled_files */ struct rb_node polled_node; @@ -263,6 +323,7 @@ struct fuse_page_desc { struct fuse_args { uint64_t nodeid; uint32_t opcode; + uint32_t error_in; unsigned short in_numargs; unsigned short out_numargs; bool force:1; @@ -275,8 +336,8 @@ struct fuse_args { bool page_zeroing:1; bool page_replace:1; bool may_block:1; - struct fuse_in_arg in_args[3]; - struct fuse_arg out_args[2]; + struct fuse_in_arg in_args[FUSE_MAX_IN_ARGS]; + struct fuse_arg out_args[FUSE_MAX_OUT_ARGS]; void (*end)(struct fuse_mount *fm, struct fuse_args *args, int error); /* Path used for completing d_canonical_path */ @@ -526,9 +587,12 @@ struct fuse_fs_context { bool no_force_umount:1; bool legacy_opts_show:1; enum fuse_dax_mode dax_mode; + bool no_daemon:1; unsigned int max_read; unsigned int blksize; const char *subtype; + struct bpf_prog *root_bpf; + struct file *root_dir; /* DAX device, may be NULL */ struct dax_device *dax_dev; @@ -805,6 +869,9 @@ struct fuse_conn { /* Is tmpfile not implemented by fs? 
*/ unsigned int no_tmpfile:1; + /** BPF Only, no Daemon running */ + unsigned int no_daemon:1; + /** The number of requests waiting for completion */ atomic_t num_waiting; @@ -979,14 +1046,18 @@ extern const struct dentry_operations fuse_dentry_operations; extern const struct dentry_operations fuse_root_dentry_operations; /** - * Get a filled in inode + * Get a filled-in inode */ +struct inode *fuse_iget_backing(struct super_block *sb, + u64 nodeid, + struct inode *backing_inode); struct inode *fuse_iget(struct super_block *sb, u64 nodeid, int generation, struct fuse_attr *attr, u64 attr_valid, u64 attr_version); int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name, - struct fuse_entry_out *outarg, struct inode **inode); + struct fuse_entry_out *outarg, + struct dentry *entry, struct inode **inode); /** * Send FORGET command @@ -1023,7 +1094,6 @@ struct fuse_io_args { void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos, size_t count, int opcode); - /** * Send OPEN or OPENDIR request */ @@ -1095,7 +1165,7 @@ int fuse_dev_init(void); void fuse_dev_cleanup(void); int fuse_ctl_init(void); -void __exit fuse_ctl_cleanup(void); +void fuse_ctl_cleanup(void); /** * Simple request sending that does request allocation and freeing @@ -1131,6 +1201,7 @@ void fuse_invalidate_entry_cache(struct dentry *entry); void fuse_invalidate_atime(struct inode *inode); u64 entry_attr_timeout(struct fuse_entry_out *o); +void fuse_init_dentry_root(struct dentry *root, struct file *backing_dir); void fuse_change_entry_timeout(struct dentry *entry, struct fuse_entry_out *o); /** @@ -1344,6 +1415,7 @@ void fuse_file_release(struct inode *inode, struct fuse_file *ff, unsigned int open_flags, fl_owner_t id, bool isdir); /* passthrough.c */ +void fuse_copyattr(struct file *dst_file, struct file *src_file); int fuse_passthrough_open(struct fuse_dev *fud, u32 lower_fd); int fuse_passthrough_setup(struct fuse_conn *fc, struct fuse_file *ff, struct fuse_open_out *openarg); @@ -1352,4 +1424,640 @@ ssize_t fuse_passthrough_read_iter(struct kiocb *iocb, struct iov_iter *to); ssize_t fuse_passthrough_write_iter(struct kiocb *iocb, struct iov_iter *from); ssize_t fuse_passthrough_mmap(struct file *file, struct vm_area_struct *vma); +/* backing.c */ + +struct bpf_prog *fuse_get_bpf_prog(struct file *file); + +/* + * Dummy io passed to fuse_bpf_backing when io operation needs no scratch space + */ +struct fuse_dummy_io { + int unused; +}; + +struct fuse_open_io { + struct fuse_open_in foi; + struct fuse_open_out foo; +}; + +int fuse_open_initialize(struct fuse_bpf_args *fa, struct fuse_open_io *foi, + struct inode *inode, struct file *file, bool isdir); +int fuse_open_backing(struct fuse_bpf_args *fa, + struct inode *inode, struct file *file, bool isdir); +void *fuse_open_finalize(struct fuse_bpf_args *fa, + struct inode *inode, struct file *file, bool isdir); + +struct fuse_create_open_io { + struct fuse_create_in fci; + struct fuse_entry_out feo; + struct fuse_open_out foo; +}; + +int fuse_create_open_initialize( + struct fuse_bpf_args *fa, struct fuse_create_open_io *fcoi, + struct inode *dir, struct dentry *entry, + struct file *file, unsigned int flags, umode_t mode); +int fuse_create_open_backing( + struct fuse_bpf_args *fa, + struct inode *dir, struct dentry *entry, + struct file *file, unsigned int flags, umode_t mode); +void *fuse_create_open_finalize( + struct fuse_bpf_args *fa, + struct inode *dir, struct dentry *entry, + struct file *file, unsigned int flags, umode_t 
mode); + +int fuse_mknod_initialize( + struct fuse_bpf_args *fa, struct fuse_mknod_in *fmi, + struct inode *dir, struct dentry *entry, umode_t mode, dev_t rdev); +int fuse_mknod_backing( + struct fuse_bpf_args *fa, + struct inode *dir, struct dentry *entry, umode_t mode, dev_t rdev); +void *fuse_mknod_finalize( + struct fuse_bpf_args *fa, + struct inode *dir, struct dentry *entry, umode_t mode, dev_t rdev); + +int fuse_mkdir_initialize( + struct fuse_bpf_args *fa, struct fuse_mkdir_in *fmi, + struct inode *dir, struct dentry *entry, umode_t mode); +int fuse_mkdir_backing( + struct fuse_bpf_args *fa, + struct inode *dir, struct dentry *entry, umode_t mode); +void *fuse_mkdir_finalize( + struct fuse_bpf_args *fa, + struct inode *dir, struct dentry *entry, umode_t mode); + +int fuse_rmdir_initialize( + struct fuse_bpf_args *fa, struct fuse_dummy_io *fmi, + struct inode *dir, struct dentry *entry); +int fuse_rmdir_backing( + struct fuse_bpf_args *fa, + struct inode *dir, struct dentry *entry); +void *fuse_rmdir_finalize( + struct fuse_bpf_args *fa, + struct inode *dir, struct dentry *entry); + +int fuse_rename2_initialize(struct fuse_bpf_args *fa, struct fuse_rename2_in *fri, + struct inode *olddir, struct dentry *oldent, + struct inode *newdir, struct dentry *newent, + unsigned int flags); +int fuse_rename2_backing(struct fuse_bpf_args *fa, + struct inode *olddir, struct dentry *oldent, + struct inode *newdir, struct dentry *newent, + unsigned int flags); +void *fuse_rename2_finalize(struct fuse_bpf_args *fa, + struct inode *olddir, struct dentry *oldent, + struct inode *newdir, struct dentry *newent, + unsigned int flags); + +int fuse_rename_initialize(struct fuse_bpf_args *fa, struct fuse_rename_in *fri, + struct inode *olddir, struct dentry *oldent, + struct inode *newdir, struct dentry *newent); +int fuse_rename_backing(struct fuse_bpf_args *fa, + struct inode *olddir, struct dentry *oldent, + struct inode *newdir, struct dentry *newent); +void *fuse_rename_finalize(struct fuse_bpf_args *fa, + struct inode *olddir, struct dentry *oldent, + struct inode *newdir, struct dentry *newent); + +int fuse_unlink_initialize( + struct fuse_bpf_args *fa, struct fuse_dummy_io *fmi, + struct inode *dir, struct dentry *entry); +int fuse_unlink_backing( + struct fuse_bpf_args *fa, + struct inode *dir, struct dentry *entry); +void *fuse_unlink_finalize( + struct fuse_bpf_args *fa, + struct inode *dir, struct dentry *entry); + +int fuse_link_initialize(struct fuse_bpf_args *fa, struct fuse_link_in *fli, + struct dentry *entry, struct inode *dir, + struct dentry *newent); +int fuse_link_backing(struct fuse_bpf_args *fa, struct dentry *entry, + struct inode *dir, struct dentry *newent); +void *fuse_link_finalize(struct fuse_bpf_args *fa, struct dentry *entry, + struct inode *dir, struct dentry *newent); + +int fuse_release_initialize(struct fuse_bpf_args *fa, struct fuse_release_in *fri, + struct inode *inode, struct file *file); +int fuse_releasedir_initialize(struct fuse_bpf_args *fa, + struct fuse_release_in *fri, + struct inode *inode, struct file *file); +int fuse_release_backing(struct fuse_bpf_args *fa, + struct inode *inode, struct file *file); +void *fuse_release_finalize(struct fuse_bpf_args *fa, + struct inode *inode, struct file *file); + +int fuse_flush_initialize(struct fuse_bpf_args *fa, struct fuse_flush_in *ffi, + struct file *file, fl_owner_t id); +int fuse_flush_backing(struct fuse_bpf_args *fa, struct file *file, fl_owner_t id); +void *fuse_flush_finalize(struct fuse_bpf_args *fa, + 
struct file *file, fl_owner_t id); + +struct fuse_lseek_io { + struct fuse_lseek_in fli; + struct fuse_lseek_out flo; +}; + +int fuse_lseek_initialize(struct fuse_bpf_args *fa, struct fuse_lseek_io *fli, + struct file *file, loff_t offset, int whence); +int fuse_lseek_backing(struct fuse_bpf_args *fa, struct file *file, loff_t offset, int whence); +void *fuse_lseek_finalize(struct fuse_bpf_args *fa, struct file *file, loff_t offset, int whence); + +struct fuse_copy_file_range_io { + struct fuse_copy_file_range_in fci; + struct fuse_write_out fwo; +}; + +int fuse_copy_file_range_initialize(struct fuse_bpf_args *fa, + struct fuse_copy_file_range_io *fcf, + struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + size_t len, unsigned int flags); +int fuse_copy_file_range_backing(struct fuse_bpf_args *fa, + struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + size_t len, unsigned int flags); +void *fuse_copy_file_range_finalize(struct fuse_bpf_args *fa, + struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + size_t len, unsigned int flags); + +int fuse_fsync_initialize(struct fuse_bpf_args *fa, struct fuse_fsync_in *ffi, + struct file *file, loff_t start, loff_t end, int datasync); +int fuse_fsync_backing(struct fuse_bpf_args *fa, + struct file *file, loff_t start, loff_t end, int datasync); +void *fuse_fsync_finalize(struct fuse_bpf_args *fa, + struct file *file, loff_t start, loff_t end, int datasync); +int fuse_dir_fsync_initialize(struct fuse_bpf_args *fa, struct fuse_fsync_in *ffi, + struct file *file, loff_t start, loff_t end, int datasync); + +struct fuse_getxattr_io { + struct fuse_getxattr_in fgi; + struct fuse_getxattr_out fgo; +}; + +int fuse_getxattr_initialize( + struct fuse_bpf_args *fa, struct fuse_getxattr_io *fgio, + struct dentry *dentry, const char *name, void *value, + size_t size); +int fuse_getxattr_backing( + struct fuse_bpf_args *fa, + struct dentry *dentry, const char *name, void *value, + size_t size); +void *fuse_getxattr_finalize( + struct fuse_bpf_args *fa, + struct dentry *dentry, const char *name, void *value, + size_t size); + +int fuse_listxattr_initialize(struct fuse_bpf_args *fa, + struct fuse_getxattr_io *fgio, + struct dentry *dentry, char *list, size_t size); +int fuse_listxattr_backing(struct fuse_bpf_args *fa, struct dentry *dentry, + char *list, size_t size); +void *fuse_listxattr_finalize(struct fuse_bpf_args *fa, struct dentry *dentry, + char *list, size_t size); + +int fuse_setxattr_initialize(struct fuse_bpf_args *fa, + struct fuse_setxattr_in *fsxi, + struct dentry *dentry, const char *name, + const void *value, size_t size, int flags); +int fuse_setxattr_backing(struct fuse_bpf_args *fa, struct dentry *dentry, + const char *name, const void *value, size_t size, + int flags); +void *fuse_setxattr_finalize(struct fuse_bpf_args *fa, struct dentry *dentry, + const char *name, const void *value, size_t size, + int flags); + +int fuse_removexattr_initialize(struct fuse_bpf_args *fa, + struct fuse_dummy_io *unused, + struct dentry *dentry, const char *name); +int fuse_removexattr_backing(struct fuse_bpf_args *fa, + struct dentry *dentry, const char *name); +void *fuse_removexattr_finalize(struct fuse_bpf_args *fa, + struct dentry *dentry, const char *name); + +struct fuse_read_iter_out { + uint64_t ret; +}; +struct fuse_file_read_iter_io { + struct fuse_read_in fri; + struct fuse_read_iter_out frio; +}; + +int fuse_file_read_iter_initialize( + struct fuse_bpf_args *fa, struct 
fuse_file_read_iter_io *fri, + struct kiocb *iocb, struct iov_iter *to); +int fuse_file_read_iter_backing(struct fuse_bpf_args *fa, + struct kiocb *iocb, struct iov_iter *to); +void *fuse_file_read_iter_finalize(struct fuse_bpf_args *fa, + struct kiocb *iocb, struct iov_iter *to); + +struct fuse_write_iter_out { + uint64_t ret; +}; +struct fuse_file_write_iter_io { + struct fuse_write_in fwi; + struct fuse_write_out fwo; + struct fuse_write_iter_out fwio; +}; + +int fuse_file_write_iter_initialize( + struct fuse_bpf_args *fa, struct fuse_file_write_iter_io *fwio, + struct kiocb *iocb, struct iov_iter *from); +int fuse_file_write_iter_backing(struct fuse_bpf_args *fa, + struct kiocb *iocb, struct iov_iter *from); +void *fuse_file_write_iter_finalize(struct fuse_bpf_args *fa, + struct kiocb *iocb, struct iov_iter *from); + +ssize_t fuse_backing_mmap(struct file *file, struct vm_area_struct *vma); + +int fuse_file_fallocate_initialize(struct fuse_bpf_args *fa, + struct fuse_fallocate_in *ffi, + struct file *file, int mode, loff_t offset, loff_t length); +int fuse_file_fallocate_backing(struct fuse_bpf_args *fa, + struct file *file, int mode, loff_t offset, loff_t length); +void *fuse_file_fallocate_finalize(struct fuse_bpf_args *fa, + struct file *file, int mode, loff_t offset, loff_t length); + +struct fuse_lookup_io { + struct fuse_entry_out feo; + struct fuse_entry_bpf feb; +}; + +int fuse_handle_backing(struct fuse_entry_bpf *feb, struct inode **backing_inode, + struct path *backing_path); +int fuse_handle_bpf_prog(struct fuse_entry_bpf *feb, struct inode *parent, + struct bpf_prog **bpf); + +int fuse_lookup_initialize(struct fuse_bpf_args *fa, struct fuse_lookup_io *feo, + struct inode *dir, struct dentry *entry, unsigned int flags); +int fuse_lookup_backing(struct fuse_bpf_args *fa, struct inode *dir, + struct dentry *entry, unsigned int flags); +struct dentry *fuse_lookup_finalize(struct fuse_bpf_args *fa, struct inode *dir, + struct dentry *entry, unsigned int flags); +int fuse_revalidate_backing(struct dentry *entry, unsigned int flags); + +int fuse_canonical_path_initialize(struct fuse_bpf_args *fa, + struct fuse_dummy_io *fdi, + const struct path *path, + struct path *canonical_path); +int fuse_canonical_path_backing(struct fuse_bpf_args *fa, const struct path *path, + struct path *canonical_path); +void *fuse_canonical_path_finalize(struct fuse_bpf_args *fa, + const struct path *path, + struct path *canonical_path); + +struct fuse_getattr_io { + struct fuse_getattr_in fgi; + struct fuse_attr_out fao; +}; +int fuse_getattr_initialize(struct fuse_bpf_args *fa, struct fuse_getattr_io *fgio, + const struct dentry *entry, struct kstat *stat, + u32 request_mask, unsigned int flags); +int fuse_getattr_backing(struct fuse_bpf_args *fa, + const struct dentry *entry, struct kstat *stat, + u32 request_mask, unsigned int flags); +void *fuse_getattr_finalize(struct fuse_bpf_args *fa, + const struct dentry *entry, struct kstat *stat, + u32 request_mask, unsigned int flags); + +struct fuse_setattr_io { + struct fuse_setattr_in fsi; + struct fuse_attr_out fao; +}; + +int fuse_setattr_initialize(struct fuse_bpf_args *fa, struct fuse_setattr_io *fsi, + struct dentry *dentry, struct iattr *attr, struct file *file); +int fuse_setattr_backing(struct fuse_bpf_args *fa, + struct dentry *dentry, struct iattr *attr, struct file *file); +void *fuse_setattr_finalize(struct fuse_bpf_args *fa, + struct dentry *dentry, struct iattr *attr, struct file *file); + +int fuse_statfs_initialize(struct fuse_bpf_args 
*fa, struct fuse_statfs_out *fso, + struct dentry *dentry, struct kstatfs *buf); +int fuse_statfs_backing(struct fuse_bpf_args *fa, + struct dentry *dentry, struct kstatfs *buf); +void *fuse_statfs_finalize(struct fuse_bpf_args *fa, + struct dentry *dentry, struct kstatfs *buf); + +int fuse_get_link_initialize(struct fuse_bpf_args *fa, struct fuse_dummy_io *dummy, + struct inode *inode, struct dentry *dentry, + struct delayed_call *callback, const char **out); +int fuse_get_link_backing(struct fuse_bpf_args *fa, + struct inode *inode, struct dentry *dentry, + struct delayed_call *callback, const char **out); +void *fuse_get_link_finalize(struct fuse_bpf_args *fa, + struct inode *inode, struct dentry *dentry, + struct delayed_call *callback, const char **out); + +int fuse_symlink_initialize( + struct fuse_bpf_args *fa, struct fuse_dummy_io *unused, + struct inode *dir, struct dentry *entry, const char *link, int len); +int fuse_symlink_backing( + struct fuse_bpf_args *fa, + struct inode *dir, struct dentry *entry, const char *link, int len); +void *fuse_symlink_finalize( + struct fuse_bpf_args *fa, + struct inode *dir, struct dentry *entry, const char *link, int len); + +struct fuse_read_io { + struct fuse_read_in fri; + struct fuse_read_out fro; +}; + +int fuse_readdir_initialize(struct fuse_bpf_args *fa, struct fuse_read_io *frio, + struct file *file, struct dir_context *ctx, + bool *force_again, bool *allow_force, bool is_continued); +int fuse_readdir_backing(struct fuse_bpf_args *fa, + struct file *file, struct dir_context *ctx, + bool *force_again, bool *allow_force, bool is_continued); +void *fuse_readdir_finalize(struct fuse_bpf_args *fa, + struct file *file, struct dir_context *ctx, + bool *force_again, bool *allow_force, bool is_continued); + +int fuse_access_initialize(struct fuse_bpf_args *fa, struct fuse_access_in *fai, + struct inode *inode, int mask); +int fuse_access_backing(struct fuse_bpf_args *fa, struct inode *inode, int mask); +void *fuse_access_finalize(struct fuse_bpf_args *fa, struct inode *inode, int mask); + +/* + * FUSE caches dentries and attributes with separate timeout. The + * time in jiffies until the dentry/attributes are valid is stored in + * dentry->d_fsdata and fuse_inode->i_time respectively. 
+ */ + +/* + * Calculate the time in jiffies until a dentry/attributes are valid + */ +static inline u64 time_to_jiffies(u64 sec, u32 nsec) +{ + if (sec || nsec) { + struct timespec64 ts = { + sec, + min_t(u32, nsec, NSEC_PER_SEC - 1) + }; + + return get_jiffies_64() + timespec64_to_jiffies(&ts); + } else + return 0; +} + +static inline u64 attr_timeout(struct fuse_attr_out *o) +{ + return time_to_jiffies(o->attr_valid, o->attr_valid_nsec); +} + +static inline bool update_mtime(unsigned int ivalid, bool trust_local_mtime) +{ + /* Always update if mtime is explicitly set */ + if (ivalid & ATTR_MTIME_SET) + return true; + + /* Or if kernel i_mtime is the official one */ + if (trust_local_mtime) + return true; + + /* If it's an open(O_TRUNC) or an ftruncate(), don't update */ + if ((ivalid & ATTR_SIZE) && (ivalid & (ATTR_OPEN | ATTR_FILE))) + return false; + + /* In all other cases update */ + return true; +} + +void fuse_fillattr(struct inode *inode, struct fuse_attr *attr, + struct kstat *stat); + +static inline void iattr_to_fattr(struct fuse_conn *fc, struct iattr *iattr, + struct fuse_setattr_in *arg, bool trust_local_cmtime) +{ + unsigned int ivalid = iattr->ia_valid; + + if (ivalid & ATTR_MODE) + arg->valid |= FATTR_MODE, arg->mode = iattr->ia_mode; + if (ivalid & ATTR_UID) + arg->valid |= FATTR_UID, arg->uid = from_kuid(fc->user_ns, iattr->ia_uid); + if (ivalid & ATTR_GID) + arg->valid |= FATTR_GID, arg->gid = from_kgid(fc->user_ns, iattr->ia_gid); + if (ivalid & ATTR_SIZE) + arg->valid |= FATTR_SIZE, arg->size = iattr->ia_size; + if (ivalid & ATTR_ATIME) { + arg->valid |= FATTR_ATIME; + arg->atime = iattr->ia_atime.tv_sec; + arg->atimensec = iattr->ia_atime.tv_nsec; + if (!(ivalid & ATTR_ATIME_SET)) + arg->valid |= FATTR_ATIME_NOW; + } + if ((ivalid & ATTR_MTIME) && update_mtime(ivalid, trust_local_cmtime)) { + arg->valid |= FATTR_MTIME; + arg->mtime = iattr->ia_mtime.tv_sec; + arg->mtimensec = iattr->ia_mtime.tv_nsec; + if (!(ivalid & ATTR_MTIME_SET) && !trust_local_cmtime) + arg->valid |= FATTR_MTIME_NOW; + } + if ((ivalid & ATTR_CTIME) && trust_local_cmtime) { + arg->valid |= FATTR_CTIME; + arg->ctime = iattr->ia_ctime.tv_sec; + arg->ctimensec = iattr->ia_ctime.tv_nsec; + } +} + +static inline int finalize_attr(struct inode *inode, struct fuse_attr_out *outarg, + u64 attr_version, struct kstat *stat) +{ + int err = 0; + + if (fuse_invalid_attr(&outarg->attr) || + ((inode->i_mode ^ outarg->attr.mode) & S_IFMT)) { + fuse_make_bad(inode); + err = -EIO; + } else { + fuse_change_attributes(inode, &outarg->attr, + attr_timeout(outarg), + attr_version); + if (stat) + fuse_fillattr(inode, &outarg->attr, stat); + } + return err; +} + +static inline void convert_statfs_to_fuse(struct fuse_kstatfs *attr, struct kstatfs *stbuf) +{ + attr->bsize = stbuf->f_bsize; + attr->frsize = stbuf->f_frsize; + attr->blocks = stbuf->f_blocks; + attr->bfree = stbuf->f_bfree; + attr->bavail = stbuf->f_bavail; + attr->files = stbuf->f_files; + attr->ffree = stbuf->f_ffree; + attr->namelen = stbuf->f_namelen; + /* fsid is left zero */ +} + +static inline void convert_fuse_statfs(struct kstatfs *stbuf, struct fuse_kstatfs *attr) +{ + stbuf->f_type = FUSE_SUPER_MAGIC; + stbuf->f_bsize = attr->bsize; + stbuf->f_frsize = attr->frsize; + stbuf->f_blocks = attr->blocks; + stbuf->f_bfree = attr->bfree; + stbuf->f_bavail = attr->bavail; + stbuf->f_files = attr->files; + stbuf->f_ffree = attr->ffree; + stbuf->f_namelen = attr->namelen; + /* fsid is left zero */ +} + +#ifdef CONFIG_FUSE_BPF +struct fuse_err_ret { + void 
*result; + bool ret; +}; + +int __init fuse_bpf_init(void); +void __exit fuse_bpf_cleanup(void); + +ssize_t fuse_bpf_simple_request(struct fuse_mount *fm, struct fuse_bpf_args *args); + +/* + * expression statement to wrap the backing filter logic + * struct inode *inode: inode with bpf and backing inode + * typedef io: (typically complex) type whose components fuse_args can point to. + * An instance of this type is created locally and passed to initialize + * void initialize(struct fuse_bpf_args *fa, io *in_out, args...): function that sets + * up fa and io based on args + * int backing(struct fuse_bpf_args *fa, args...): function that actually performs + * the backing io operation + * void *finalize(struct fuse_bpf_args *, args...): function that performs any final + * work needed to commit the backing io + */ +#define fuse_bpf_backing(inode, io, initialize, backing, finalize, \ + args...) \ +({ \ + struct fuse_err_ret fer = {0}; \ + int ext_flags; \ + struct fuse_inode *fuse_inode = get_fuse_inode(inode); \ + struct fuse_mount *fm = get_fuse_mount(inode); \ + io feo = {0}; \ + struct fuse_bpf_args fa = {0}, fa_backup = {0}; \ + bool locked; \ + ssize_t res; \ + void *err; \ + int i; \ + bool initialized = false; \ + \ + do { \ + if (!fuse_inode || !fuse_inode->backing_inode) \ + break; \ + \ + err = ERR_PTR(initialize(&fa, &feo, args)); \ + if (err) { \ + fer = (struct fuse_err_ret) { \ + err, \ + true, \ + }; \ + break; \ + } \ + initialized = true; \ + \ + fa_backup = fa; \ + fa.opcode |= FUSE_PREFILTER; \ + for (i = 0; i < fa.in_numargs; ++i) \ + fa.out_args[i] = (struct fuse_bpf_arg) { \ + .size = fa.in_args[i].size, \ + .value = (void *)fa.in_args[i].value, \ + }; \ + fa.out_numargs = fa.in_numargs; \ + \ + ext_flags = fuse_inode->bpf ? \ + bpf_prog_run(fuse_inode->bpf, &fa) : \ + FUSE_BPF_BACKING; \ + if (ext_flags < 0) { \ + fer = (struct fuse_err_ret) { \ + ERR_PTR(ext_flags), \ + true, \ + }; \ + break; \ + } \ + \ + if (ext_flags & FUSE_BPF_USER_FILTER) { \ + locked = fuse_lock_inode(inode); \ + res = fuse_bpf_simple_request(fm, &fa); \ + fuse_unlock_inode(inode, locked); \ + if (res < 0) { \ + fer = (struct fuse_err_ret) { \ + ERR_PTR(res), \ + true, \ + }; \ + break; \ + } \ + } \ + \ + if (!(ext_flags & FUSE_BPF_BACKING)) \ + break; \ + \ + fa.opcode &= ~FUSE_PREFILTER; \ + for (i = 0; i < fa.in_numargs; ++i) \ + fa.in_args[i] = (struct fuse_bpf_in_arg) { \ + .size = fa.out_args[i].size, \ + .value = fa.out_args[i].value, \ + }; \ + for (i = 0; i < fa_backup.out_numargs; ++i) \ + fa.out_args[i] = (struct fuse_bpf_arg) { \ + .size = fa_backup.out_args[i].size, \ + .value = fa_backup.out_args[i].value, \ + }; \ + fa.out_numargs = fa_backup.out_numargs; \ + \ + fer = (struct fuse_err_ret) { \ + ERR_PTR(backing(&fa, args)), \ + true, \ + }; \ + if (IS_ERR(fer.result)) \ + fa.error_in = PTR_ERR(fer.result); \ + if (!(ext_flags & FUSE_BPF_POST_FILTER)) \ + break; \ + \ + fa.opcode |= FUSE_POSTFILTER; \ + for (i = 0; i < fa.out_numargs; ++i) \ + fa.in_args[fa.in_numargs++] = \ + (struct fuse_bpf_in_arg) { \ + .size = fa.out_args[i].size, \ + .value = fa.out_args[i].value, \ + }; \ + ext_flags = bpf_prog_run(fuse_inode->bpf, &fa); \ + if (ext_flags < 0) { \ + fer = (struct fuse_err_ret) { \ + ERR_PTR(ext_flags), \ + true, \ + }; \ + break; \ + } \ + if (!(ext_flags & FUSE_BPF_USER_FILTER)) \ + break; \ + \ + fa.out_args[0].size = fa_backup.out_args[0].size; \ + fa.out_args[1].size = fa_backup.out_args[1].size; \ + fa.out_numargs = fa_backup.out_numargs; \ + locked = 
fuse_lock_inode(inode); \ + res = fuse_bpf_simple_request(fm, &fa); \ + fuse_unlock_inode(inode, locked); \ + if (res < 0) { \ + fer.result = ERR_PTR(res); \ + break; \ + } \ + } while (false); \ + \ + if (initialized && fer.ret) { \ + err = finalize(&fa, args); \ + if (err) \ + fer.result = err; \ + } \ + \ + fer; \ +}) + +struct bpf_prog *fuse_get_bpf_prog(struct file *file); +#endif /* CONFIG_FUSE_BPF */ + #endif /* _FS_FUSE_I_H */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 632f94b69e88..6ccb7fc7c121 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -78,6 +78,10 @@ static struct inode *fuse_alloc_inode(struct super_block *sb) fi->i_time = 0; fi->inval_mask = 0; +#ifdef CONFIG_FUSE_BPF + fi->backing_inode = NULL; + fi->bpf = NULL; +#endif fi->nodeid = 0; fi->nlookup = 0; fi->attr_version = 0; @@ -120,6 +124,12 @@ static void fuse_evict_inode(struct inode *inode) /* Will write inode on close/munmap and in all other dirtiers */ WARN_ON(inode->i_state & I_DIRTY_INODE); +#ifdef CONFIG_FUSE_BPF + iput(fi->backing_inode); + if (fi->bpf) + bpf_prog_put(fi->bpf); + fi->bpf = NULL; +#endif truncate_inode_pages_final(&inode->i_data); clear_inode(inode); if (inode->i_sb->s_flags & SB_ACTIVE) { @@ -162,6 +172,28 @@ static ino_t fuse_squash_ino(u64 ino64) return ino; } +static void fuse_fill_attr_from_inode(struct fuse_attr *attr, + const struct inode *inode) +{ + *attr = (struct fuse_attr){ + .ino = inode->i_ino, + .size = inode->i_size, + .blocks = inode->i_blocks, + .atime = inode->i_atime.tv_sec, + .mtime = inode->i_mtime.tv_sec, + .ctime = inode->i_ctime.tv_sec, + .atimensec = inode->i_atime.tv_nsec, + .mtimensec = inode->i_mtime.tv_nsec, + .ctimensec = inode->i_ctime.tv_nsec, + .mode = inode->i_mode, + .nlink = inode->i_nlink, + .uid = inode->i_uid.val, + .gid = inode->i_gid.val, + .rdev = inode->i_rdev, + .blksize = 1u << inode->i_blkbits, + }; +} + void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, u64 attr_valid, u32 cache_mask) { @@ -329,28 +361,104 @@ static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr) else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { fuse_init_common(inode); - init_special_inode(inode, inode->i_mode, - new_decode_dev(attr->rdev)); + init_special_inode(inode, inode->i_mode, attr->rdev); } else BUG(); } +struct fuse_inode_identifier { + u64 nodeid; + struct inode *backing_inode; +}; + static int fuse_inode_eq(struct inode *inode, void *_nodeidp) { - u64 nodeid = *(u64 *) _nodeidp; - if (get_node_id(inode) == nodeid) - return 1; - else - return 0; + struct fuse_inode_identifier *fii = + (struct fuse_inode_identifier *) _nodeidp; + struct fuse_inode *fi = get_fuse_inode(inode); + + return fii->nodeid == fi->nodeid; +} + +static int fuse_inode_backing_eq(struct inode *inode, void *_nodeidp) +{ + struct fuse_inode_identifier *fii = + (struct fuse_inode_identifier *) _nodeidp; + struct fuse_inode *fi = get_fuse_inode(inode); + + return fii->nodeid == fi->nodeid +#ifdef CONFIG_FUSE_BPF + && fii->backing_inode == fi->backing_inode +#endif + ; } static int fuse_inode_set(struct inode *inode, void *_nodeidp) { - u64 nodeid = *(u64 *) _nodeidp; - get_fuse_inode(inode)->nodeid = nodeid; + struct fuse_inode_identifier *fii = + (struct fuse_inode_identifier *) _nodeidp; + struct fuse_inode *fi = get_fuse_inode(inode); + + fi->nodeid = fii->nodeid; + return 0; } +static int fuse_inode_backing_set(struct inode *inode, void *_nodeidp) +{ + struct 
fuse_inode_identifier *fii = + (struct fuse_inode_identifier *) _nodeidp; + struct fuse_inode *fi = get_fuse_inode(inode); + + fi->nodeid = fii->nodeid; +#ifdef CONFIG_FUSE_BPF + fi->backing_inode = fii->backing_inode; + if (fi->backing_inode) + ihold(fi->backing_inode); +#endif + + return 0; +} + +struct inode *fuse_iget_backing(struct super_block *sb, u64 nodeid, + struct inode *backing_inode) +{ + struct inode *inode; + struct fuse_inode *fi; + struct fuse_conn *fc = get_fuse_conn_super(sb); + struct fuse_inode_identifier fii = { + .nodeid = nodeid, + .backing_inode = backing_inode, + }; + struct fuse_attr attr; + unsigned long hash = (unsigned long) backing_inode; + + if (nodeid) + hash = nodeid; + + fuse_fill_attr_from_inode(&attr, backing_inode); + inode = iget5_locked(sb, hash, fuse_inode_backing_eq, + fuse_inode_backing_set, &fii); + if (!inode) + return NULL; + + if ((inode->i_state & I_NEW)) { + inode->i_flags |= S_NOATIME; + if (!fc->writeback_cache) + inode->i_flags |= S_NOCMTIME; + fuse_init_common(inode); + unlock_new_inode(inode); + } + + fi = get_fuse_inode(inode); + fuse_init_inode(inode, &attr); + spin_lock(&fi->lock); + fi->nlookup++; + spin_unlock(&fi->lock); + + return inode; +} + struct inode *fuse_iget(struct super_block *sb, u64 nodeid, int generation, struct fuse_attr *attr, u64 attr_valid, u64 attr_version) @@ -358,6 +466,9 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid, struct inode *inode; struct fuse_inode *fi; struct fuse_conn *fc = get_fuse_conn_super(sb); + struct fuse_inode_identifier fii = { + .nodeid = nodeid, + }; /* * Auto mount points get their node id from the submount root, which is @@ -379,7 +490,7 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid, } retry: - inode = iget5_locked(sb, nodeid, fuse_inode_eq, fuse_inode_set, &nodeid); + inode = iget5_locked(sb, nodeid, fuse_inode_eq, fuse_inode_set, &fii); if (!inode) return NULL; @@ -411,13 +522,16 @@ struct inode *fuse_ilookup(struct fuse_conn *fc, u64 nodeid, { struct fuse_mount *fm_iter; struct inode *inode; + struct fuse_inode_identifier fii = { + .nodeid = nodeid, + }; WARN_ON(!rwsem_is_locked(&fc->killsb)); list_for_each_entry(fm_iter, &fc->mounts, fc_entry) { if (!fm_iter->sb) continue; - inode = ilookup5(fm_iter->sb, nodeid, fuse_inode_eq, &nodeid); + inode = ilookup5(fm_iter->sb, nodeid, fuse_inode_eq, &fii); if (inode) { if (fm) *fm = fm_iter; @@ -504,20 +618,6 @@ static void fuse_send_destroy(struct fuse_mount *fm) } } -static void convert_fuse_statfs(struct kstatfs *stbuf, struct fuse_kstatfs *attr) -{ - stbuf->f_type = FUSE_SUPER_MAGIC; - stbuf->f_bsize = attr->bsize; - stbuf->f_frsize = attr->frsize; - stbuf->f_blocks = attr->blocks; - stbuf->f_bfree = attr->bfree; - stbuf->f_bavail = attr->bavail; - stbuf->f_files = attr->files; - stbuf->f_ffree = attr->ffree; - stbuf->f_namelen = attr->namelen; - /* fsid is left zero */ -} - static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; @@ -525,12 +625,24 @@ static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf) FUSE_ARGS(args); struct fuse_statfs_out outarg; int err; +#ifdef CONFIG_FUSE_BPF + struct fuse_err_ret fer; +#endif if (!fuse_allow_current_process(fm->fc)) { buf->f_type = FUSE_SUPER_MAGIC; return 0; } +#ifdef CONFIG_FUSE_BPF + fer = fuse_bpf_backing(dentry->d_inode, struct fuse_statfs_out, + fuse_statfs_initialize, fuse_statfs_backing, + fuse_statfs_finalize, + dentry, buf); + if (fer.ret) + return PTR_ERR(fer.result); +#endif + 
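/*
 * A minimal sketch of the initialize/backing/finalize contract that the
 * fuse_bpf_backing() macro above expects, using statfs as the example.
 * The example_* names are hypothetical; the real callbacks are only
 * declared in fuse_i.h in this patch (implemented in backing.c) and may
 * differ in detail.
 */
#include <linux/statfs.h>
#include "fuse_i.h"

static int example_statfs_initialize(struct fuse_bpf_args *fa,
                                     struct fuse_statfs_out *fso,
                                     struct dentry *dentry, struct kstatfs *buf)
{
        /* Describe the request so a BPF prefilter can inspect or veto it */
        *fa = (struct fuse_bpf_args) {
                .opcode = FUSE_STATFS,
                .out_numargs = 1,
                .out_args[0].size = sizeof(*fso),
                .out_args[0].value = fso,
        };
        return 0;
}

static int example_statfs_backing(struct fuse_bpf_args *fa,
                                  struct dentry *dentry, struct kstatfs *buf)
{
        struct path backing_path;
        int err;

        /* Resolve the lower (backing) path recorded in the fuse dentry */
        get_fuse_backing_path(dentry, &backing_path);
        if (!backing_path.dentry)
                return -EBADF;

        /* Perform the operation directly on the backing filesystem */
        err = vfs_statfs(&backing_path, buf);
        path_put(&backing_path);
        return err;
}

static void *example_statfs_finalize(struct fuse_bpf_args *fa,
                                     struct dentry *dentry, struct kstatfs *buf)
{
        /* Nothing to commit for statfs; an ERR_PTR would be returned on failure */
        return NULL;
}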
memset(&outarg, 0, sizeof(outarg)); args.in_numargs = 0; args.opcode = FUSE_STATFS; @@ -647,6 +759,9 @@ enum { OPT_ALLOW_OTHER, OPT_MAX_READ, OPT_BLKSIZE, + OPT_ROOT_BPF, + OPT_ROOT_DIR, + OPT_NO_DAEMON, OPT_ERR }; @@ -661,6 +776,9 @@ static const struct fs_parameter_spec fuse_fs_parameters[] = { fsparam_u32 ("max_read", OPT_MAX_READ), fsparam_u32 ("blksize", OPT_BLKSIZE), fsparam_string ("subtype", OPT_SUBTYPE), + fsparam_u32 ("root_bpf", OPT_ROOT_BPF), + fsparam_u32 ("root_dir", OPT_ROOT_DIR), + fsparam_flag ("no_daemon", OPT_NO_DAEMON), {} }; @@ -744,6 +862,26 @@ static int fuse_parse_param(struct fs_context *fsc, struct fs_parameter *param) ctx->blksize = result.uint_32; break; + case OPT_ROOT_BPF: + ctx->root_bpf = bpf_prog_get_type_dev(result.uint_32, + BPF_PROG_TYPE_FUSE, false); + if (IS_ERR(ctx->root_bpf)) { + ctx->root_bpf = NULL; + return invalfc(fsc, "Unable to open bpf program"); + } + break; + + case OPT_ROOT_DIR: + ctx->root_dir = fget(result.uint_32); + if (!ctx->root_dir) + return invalfc(fsc, "Unable to open root directory"); + break; + + case OPT_NO_DAEMON: + ctx->no_daemon = true; + ctx->fd_present = true; + break; + default: return -EINVAL; } @@ -756,6 +894,10 @@ static void fuse_free_fsc(struct fs_context *fsc) struct fuse_fs_context *ctx = fsc->fs_private; if (ctx) { + if (ctx->root_dir) + fput(ctx->root_dir); + if (ctx->root_bpf) + bpf_prog_put(ctx->root_bpf); kfree(ctx->subtype); kfree(ctx); } @@ -885,15 +1027,34 @@ struct fuse_conn *fuse_conn_get(struct fuse_conn *fc) } EXPORT_SYMBOL_GPL(fuse_conn_get); -static struct inode *fuse_get_root_inode(struct super_block *sb, unsigned mode) +static struct inode *fuse_get_root_inode(struct super_block *sb, + unsigned int mode, + struct bpf_prog *root_bpf, + struct file *backing_fd) { struct fuse_attr attr; - memset(&attr, 0, sizeof(attr)); + struct inode *inode; + memset(&attr, 0, sizeof(attr)); attr.mode = mode; attr.ino = FUSE_ROOT_ID; attr.nlink = 1; - return fuse_iget(sb, 1, 0, &attr, 0, 0); + inode = fuse_iget(sb, 1, 0, &attr, 0, 0); + if (!inode) + return NULL; + +#ifdef CONFIG_FUSE_BPF + get_fuse_inode(inode)->bpf = root_bpf; + if (root_bpf) + bpf_prog_inc(root_bpf); + + if (backing_fd) { + get_fuse_inode(inode)->backing_inode = backing_fd->f_inode; + ihold(backing_fd->f_inode); + } +#endif + + return inode; } struct fuse_inode_handle { @@ -908,11 +1069,14 @@ static struct dentry *fuse_get_dentry(struct super_block *sb, struct inode *inode; struct dentry *entry; int err = -ESTALE; + struct fuse_inode_identifier fii = { + .nodeid = handle->nodeid, + }; if (handle->nodeid == 0) goto out_err; - inode = ilookup5(sb, handle->nodeid, fuse_inode_eq, &handle->nodeid); + inode = ilookup5(sb, handle->nodeid, fuse_inode_eq, &fii); if (!inode) { struct fuse_entry_out outarg; const struct qstr name = QSTR_INIT(".", 1); @@ -921,7 +1085,7 @@ static struct dentry *fuse_get_dentry(struct super_block *sb, goto out_err; err = fuse_lookup_name(sb, handle->nodeid, &name, &outarg, - &inode); + NULL, &inode); if (err && err != -ENOENT) goto out_err; if (err || !inode) { @@ -1015,13 +1179,14 @@ static struct dentry *fuse_get_parent(struct dentry *child) struct inode *inode; struct dentry *parent; struct fuse_entry_out outarg; + const struct qstr name = QSTR_INIT("..", 2); int err; if (!fc->export_support) return ERR_PTR(-ESTALE); err = fuse_lookup_name(child_inode->i_sb, get_node_id(child_inode), - &dotdot_name, &outarg, &inode); + &name, &outarg, NULL, &inode); if (err) { if (err == -ENOENT) return ERR_PTR(-ESTALE); @@ -1284,7 +1449,7 @@ 
void fuse_send_init(struct fuse_mount *fm) ia->args.nocreds = true; ia->args.end = process_init_reply; - if (fuse_simple_background(fm, &ia->args, GFP_KERNEL) != 0) + if (unlikely(fm->fc->no_daemon) || fuse_simple_background(fm, &ia->args, GFP_KERNEL) != 0) process_init_reply(fm, &ia->args, -ENOTCONN); } EXPORT_SYMBOL_GPL(fuse_send_init); @@ -1408,28 +1573,6 @@ void fuse_dev_free(struct fuse_dev *fud) } EXPORT_SYMBOL_GPL(fuse_dev_free); -static void fuse_fill_attr_from_inode(struct fuse_attr *attr, - const struct fuse_inode *fi) -{ - *attr = (struct fuse_attr){ - .ino = fi->inode.i_ino, - .size = fi->inode.i_size, - .blocks = fi->inode.i_blocks, - .atime = fi->inode.i_atime.tv_sec, - .mtime = fi->inode.i_mtime.tv_sec, - .ctime = fi->inode.i_ctime.tv_sec, - .atimensec = fi->inode.i_atime.tv_nsec, - .mtimensec = fi->inode.i_mtime.tv_nsec, - .ctimensec = fi->inode.i_ctime.tv_nsec, - .mode = fi->inode.i_mode, - .nlink = fi->inode.i_nlink, - .uid = fi->inode.i_uid.val, - .gid = fi->inode.i_gid.val, - .rdev = fi->inode.i_rdev, - .blksize = 1u << fi->inode.i_blkbits, - }; -} - static void fuse_sb_defaults(struct super_block *sb) { sb->s_magic = FUSE_SUPER_MAGIC; @@ -1473,7 +1616,7 @@ static int fuse_fill_super_submount(struct super_block *sb, if (parent_sb->s_subtype && !sb->s_subtype) return -ENOMEM; - fuse_fill_attr_from_inode(&root_attr, parent_fi); + fuse_fill_attr_from_inode(&root_attr, &parent_fi->inode); root = fuse_iget(sb, parent_fi->nodeid, 0, &root_attr, 0, 0); /* * This inode is just a duplicate, so it is not looked up and @@ -1600,13 +1743,16 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) fc->destroy = ctx->destroy; fc->no_control = ctx->no_control; fc->no_force_umount = ctx->no_force_umount; + fc->no_daemon = ctx->no_daemon; err = -ENOMEM; - root = fuse_get_root_inode(sb, ctx->rootmode); + root = fuse_get_root_inode(sb, ctx->rootmode, ctx->root_bpf, + ctx->root_dir); sb->s_d_op = &fuse_root_dentry_operations; root_dentry = d_make_root(root); if (!root_dentry) goto err_dev_free; + fuse_init_dentry_root(root_dentry, ctx->root_dir); /* Root dentry doesn't have .d_revalidate */ sb->s_d_op = &fuse_dentry_operations; @@ -1645,18 +1791,20 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) struct fuse_fs_context *ctx = fsc->fs_private; int err; - if (!ctx->file || !ctx->rootmode_present || - !ctx->user_id_present || !ctx->group_id_present) - return -EINVAL; + if (!ctx->no_daemon) { + if (!ctx->file || !ctx->rootmode_present || + !ctx->user_id_present || !ctx->group_id_present) + return -EINVAL; - /* - * Require mount to happen from the same user namespace which - * opened /dev/fuse to prevent potential attacks. - */ - if ((ctx->file->f_op != &fuse_dev_operations) || - (ctx->file->f_cred->user_ns != sb->s_user_ns)) - return -EINVAL; - ctx->fudptr = &ctx->file->private_data; + /* + * Require mount to happen from the same user namespace which + * opened /dev/fuse to prevent potential attacks. 
+ */ + if ((ctx->file->f_op != &fuse_dev_operations) || + (ctx->file->f_cred->user_ns != sb->s_user_ns)) + return -EINVAL; + ctx->fudptr = &ctx->file->private_data; + } err = fuse_fill_super_common(sb, ctx); if (err) @@ -1937,6 +2085,26 @@ static void fuse_fs_cleanup(void) static struct kobject *fuse_kobj; +/* TODO Remove this once BPF_PROG_TYPE_FUSE is upstreamed */ +static ssize_t bpf_prog_type_fuse_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buff) +{ + return sysfs_emit(buff, "%d\n", BPF_PROG_TYPE_FUSE); +} + +static struct kobj_attribute bpf_prog_type_fuse_attr = + __ATTR_RO(bpf_prog_type_fuse); + +static struct attribute *bpf_attributes[] = { + &bpf_prog_type_fuse_attr.attr, + NULL, +}; + +static const struct attribute_group bpf_attr_group = { + .attrs = bpf_attributes, +}; +/* TODO remove to here */ + static int fuse_sysfs_init(void) { int err; @@ -1951,8 +2119,15 @@ static int fuse_sysfs_init(void) if (err) goto out_fuse_unregister; + /* TODO Remove when BPF_PROG_TYPE_FUSE is upstreamed */ + err = sysfs_create_group(fuse_kobj, &bpf_attr_group); + if (err) + goto out_fuse_remove_mount_point; + return 0; + out_fuse_remove_mount_point: + sysfs_remove_mount_point(fuse_kobj, "connections"); out_fuse_unregister: kobject_put(fuse_kobj); out_err: @@ -1989,11 +2164,21 @@ static int __init fuse_init(void) if (res) goto err_sysfs_cleanup; +#ifdef CONFIG_FUSE_BPF + res = fuse_bpf_init(); + if (res) + goto err_ctl_cleanup; +#endif + sanitize_global_limit(&max_user_bgreq); sanitize_global_limit(&max_user_congthresh); return 0; +#ifdef CONFIG_FUSE_BPF + err_ctl_cleanup: + fuse_ctl_cleanup(); +#endif err_sysfs_cleanup: fuse_sysfs_cleanup(); err_dev_cleanup: @@ -2011,6 +2196,9 @@ static void __exit fuse_exit(void) fuse_ctl_cleanup(); fuse_sysfs_cleanup(); fuse_fs_cleanup(); +#ifdef CONFIG_FUSE_BPF + fuse_bpf_cleanup(); +#endif fuse_dev_cleanup(); } diff --git a/fs/fuse/passthrough.c b/fs/fuse/passthrough.c index 630fa2880f8b..c0ae3069c6c8 100644 --- a/fs/fuse/passthrough.c +++ b/fs/fuse/passthrough.c @@ -35,7 +35,7 @@ static void fuse_file_accessed(struct file *dst_file, struct file *src_file) touch_atime(&dst_file->f_path); } -static void fuse_copyattr(struct file *dst_file, struct file *src_file) +void fuse_copyattr(struct file *dst_file, struct file *src_file) { struct inode *dst = file_inode(dst_file); struct inode *src = file_inode(src_file); diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c index e8deaacf1832..4d97cddf6e0d 100644 --- a/fs/fuse/readdir.c +++ b/fs/fuse/readdir.c @@ -20,6 +20,8 @@ static bool fuse_use_readdirplus(struct inode *dir, struct dir_context *ctx) if (!fc->do_readdirplus) return false; + if (fi->nodeid == 0) + return false; if (!fc->readdirplus_auto) return true; if (test_and_clear_bit(FUSE_I_ADVISE_RDPLUS, &fi->state)) @@ -579,6 +581,26 @@ int fuse_readdir(struct file *file, struct dir_context *ctx) struct inode *inode = file_inode(file); int err; +#ifdef CONFIG_FUSE_BPF + struct fuse_err_ret fer; + bool allow_force; + bool force_again = false; + bool is_continued = false; + +again: + fer = fuse_bpf_backing(inode, struct fuse_read_io, + fuse_readdir_initialize, fuse_readdir_backing, + fuse_readdir_finalize, + file, ctx, &force_again, &allow_force, is_continued); + if (force_again && !IS_ERR(fer.result)) { + is_continued = true; + goto again; + } + + if (fer.ret) + return PTR_ERR(fer.result); +#endif + if (fuse_is_bad(inode)) return -EIO; diff --git a/fs/fuse/xattr.c b/fs/fuse/xattr.c index 0d3e7177fce0..3029d1e2bda2 100644 --- a/fs/fuse/xattr.c +++ 
b/fs/fuse/xattr.c @@ -115,6 +115,17 @@ ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size) struct fuse_getxattr_out outarg; ssize_t ret; +#ifdef CONFIG_FUSE_BPF + struct fuse_err_ret fer; + + fer = fuse_bpf_backing(inode, struct fuse_getxattr_io, + fuse_listxattr_initialize, + fuse_listxattr_backing, fuse_listxattr_finalize, + entry, list, size); + if (fer.ret) + return PTR_ERR(fer.result); +#endif + if (fuse_is_bad(inode)) return -EIO; @@ -182,6 +193,17 @@ static int fuse_xattr_get(const struct xattr_handler *handler, struct dentry *dentry, struct inode *inode, const char *name, void *value, size_t size) { +#ifdef CONFIG_FUSE_BPF + struct fuse_err_ret fer; + + fer = fuse_bpf_backing(inode, struct fuse_getxattr_io, + fuse_getxattr_initialize, fuse_getxattr_backing, + fuse_getxattr_finalize, + dentry, name, value, size); + if (fer.ret) + return PTR_ERR(fer.result); +#endif + if (fuse_is_bad(inode)) return -EIO; @@ -194,6 +216,24 @@ static int fuse_xattr_set(const struct xattr_handler *handler, const char *name, const void *value, size_t size, int flags) { +#ifdef CONFIG_FUSE_BPF + struct fuse_err_ret fer; + + if (value) + fer = fuse_bpf_backing(inode, struct fuse_setxattr_in, + fuse_setxattr_initialize, fuse_setxattr_backing, + fuse_setxattr_finalize, dentry, name, value, + size, flags); + else + fer = fuse_bpf_backing(inode, struct fuse_dummy_io, + fuse_removexattr_initialize, + fuse_removexattr_backing, + fuse_removexattr_finalize, + dentry, name); + if (fer.ret) + return PTR_ERR(fer.result); +#endif + if (fuse_is_bad(inode)) return -EIO; diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 2c6a4f2562a7..1c612043e327 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -79,6 +79,9 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_LSM, lsm, #endif BPF_PROG_TYPE(BPF_PROG_TYPE_SYSCALL, bpf_syscall, void *, void *) +#ifdef CONFIG_FUSE_BPF +BPF_PROG_TYPE(BPF_PROG_TYPE_FUSE, fuse, struct fuse_bpf_args, struct fuse_bpf_args) +#endif BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops) diff --git a/include/uapi/linux/android_fuse.h b/include/uapi/linux/android_fuse.h new file mode 100644 index 000000000000..630b752d34b7 --- /dev/null +++ b/include/uapi/linux/android_fuse.h @@ -0,0 +1,95 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause WITH Linux-syscall-note */ +/* Copyright (c) 2022 Google LLC */ + +#ifndef _LINUX_ANDROID_FUSE_H +#define _LINUX_ANDROID_FUSE_H + +#ifdef __KERNEL__ +#include +#else +#include +#endif + +#define FUSE_ACTION_KEEP 0 +#define FUSE_ACTION_REMOVE 1 +#define FUSE_ACTION_REPLACE 2 + +struct fuse_entry_bpf_out { + uint64_t backing_action; + uint64_t backing_fd; + uint64_t bpf_action; + uint64_t bpf_fd; +}; + +struct fuse_entry_bpf { + struct fuse_entry_bpf_out out; + struct file *backing_file; + struct file *bpf_file; +}; + +struct fuse_read_out { + uint64_t offset; + uint32_t again; + uint32_t padding; +}; + +struct fuse_in_postfilter_header { + uint32_t len; + uint32_t opcode; + uint64_t unique; + uint64_t nodeid; + uint32_t uid; + uint32_t gid; + uint32_t pid; + uint32_t error_in; +}; + +/* + * Fuse BPF Args + * + * Used to communicate with bpf programs to allow checking or altering certain values. + * The end_offset allows the bpf verifier to check boundaries statically. This reflects + * the ends of the buffer. size shows the length that was actually used. 
+ * + */ + +/** One input argument of a request */ +struct fuse_bpf_in_arg { + uint32_t size; + const void *value; + const void *end_offset; +}; + +/** One output argument of a request */ +struct fuse_bpf_arg { + uint32_t size; + void *value; + void *end_offset; +}; + +#define FUSE_MAX_IN_ARGS 5 +#define FUSE_MAX_OUT_ARGS 3 + +#define FUSE_BPF_FORCE (1 << 0) +#define FUSE_BPF_OUT_ARGVAR (1 << 6) + +struct fuse_bpf_args { + uint64_t nodeid; + uint32_t opcode; + uint32_t error_in; + uint32_t in_numargs; + uint32_t out_numargs; + uint32_t flags; + struct fuse_bpf_in_arg in_args[FUSE_MAX_IN_ARGS]; + struct fuse_bpf_arg out_args[FUSE_MAX_OUT_ARGS]; +}; + +#define FUSE_BPF_USER_FILTER 1 +#define FUSE_BPF_BACKING 2 +#define FUSE_BPF_POST_FILTER 4 + +#define FUSE_OPCODE_FILTER 0x0ffff +#define FUSE_PREFILTER 0x10000 +#define FUSE_POSTFILTER 0x20000 + +#endif // _LINUX_ANDROID_FUSE_H diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 51b9aa640ad2..a8b5bf3363b8 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -978,6 +978,16 @@ enum bpf_prog_type { BPF_PROG_TYPE_LSM, BPF_PROG_TYPE_SK_LOOKUP, BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */ + + /* + * Until fuse-bpf is upstreamed, this value must be at the end to allow for + * other recently-added upstreamed values to be correct. + * This works because no one should use this value directly, rather they must + * read the value from /sys/fs/fuse/bpf_prog_type_fuse + * Please maintain this value at the end of the list until fuse-bpf is + * upstreamed. + */ + BPF_PROG_TYPE_FUSE, }; enum bpf_attach_type { diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 341c94f208f4..0f0b2836be66 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -43,3 +43,6 @@ obj-$(CONFIG_BPF_PRELOAD) += preload/ obj-$(CONFIG_BPF_SYSCALL) += relo_core.o $(obj)/relo_core.o: $(srctree)/tools/lib/bpf/relo_core.c FORCE $(call if_changed_rule,cc_o_c) +ifeq ($(CONFIG_FUSE_BPF),y) +obj-$(CONFIG_BPF_SYSCALL) += bpf_fuse.o +endif diff --git a/kernel/bpf/bpf_fuse.c b/kernel/bpf/bpf_fuse.c new file mode 100644 index 000000000000..c6aa670bc54c --- /dev/null +++ b/kernel/bpf/bpf_fuse.c @@ -0,0 +1,128 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2021 Google LLC + +#include +#include + +static const struct bpf_func_proto * +fuse_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_trace_printk: + return bpf_get_trace_printk_proto(); + + case BPF_FUNC_get_current_uid_gid: + return &bpf_get_current_uid_gid_proto; + + case BPF_FUNC_get_current_pid_tgid: + return &bpf_get_current_pid_tgid_proto; + + case BPF_FUNC_map_lookup_elem: + return &bpf_map_lookup_elem_proto; + + case BPF_FUNC_map_update_elem: + return &bpf_map_update_elem_proto; + + default: + pr_debug("Invalid fuse bpf func %d\n", func_id); + return NULL; + } +} + +static bool fuse_prog_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + int i; + + if (off < 0 || off > offsetofend(struct fuse_bpf_args, out_args)) + return false; + + /* TODO This is garbage. 
Do it properly */ + for (i = 0; i < 5; i++) { + if (off == offsetof(struct fuse_bpf_args, in_args[i].value)) { + info->reg_type = PTR_TO_BUF; + info->ctx_field_size = 256; + if (type != BPF_READ) + return false; + return true; + } + } + for (i = 0; i < 3; i++) { + if (off == offsetof(struct fuse_bpf_args, out_args[i].value)) { + info->reg_type = PTR_TO_BUF; + info->ctx_field_size = 256; + return true; + } + } + if (type != BPF_READ) + return false; + + return true; +} + +const struct bpf_verifier_ops fuse_verifier_ops = { + .get_func_proto = fuse_prog_func_proto, + .is_valid_access = fuse_prog_is_valid_access, +}; + +const struct bpf_prog_ops fuse_prog_ops = { +}; + +struct bpf_prog *fuse_get_bpf_prog(struct file *file) +{ + struct bpf_prog *bpf_prog = ERR_PTR(-EINVAL); + + if (!file || IS_ERR(file)) + return bpf_prog; + /** + * Two ways of getting a bpf prog from another task's fd, since + * bpf_prog_get_type_dev only works with an fd + * + * 1) Duplicate a little of the needed code. Requires access to + * bpf_prog_fops for validation, which is not exported for modules + * 2) Insert the bpf_file object into a fd from the current task + * Stupidly complex, but I think OK, as security checks are not run + * during the existence of the handle + * + * Best would be to upstream 1) into kernel/bpf/syscall.c and export it + * for use here. Failing that, we have to use 2, since fuse must be + * compilable as a module. + */ +#if 1 + if (file->f_op != &bpf_prog_fops) + goto out; + + bpf_prog = file->private_data; + if (bpf_prog->type == BPF_PROG_TYPE_FUSE) + bpf_prog_inc(bpf_prog); + else + bpf_prog = ERR_PTR(-EINVAL); + +#else + { + int task_fd = get_unused_fd_flags(file->f_flags); + + if (task_fd < 0) + goto out; + + fd_install(task_fd, file); + + bpf_prog = bpf_prog_get_type_dev(task_fd, BPF_PROG_TYPE_FUSE, + false); + + /* Close the fd, which also closes the file */ + __close_fd(current->files, task_fd); + file = NULL; + } +#endif + +out: + if (file) + fput(file); + return bpf_prog; +} +EXPORT_SYMBOL(fuse_get_bpf_prog); + + diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 35c07afac924..092c285c5cc7 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3,6 +3,7 @@ #include #include +#include #include #include #include diff --git a/tools/testing/selftests/filesystems/fuse/.gitignore b/tools/testing/selftests/filesystems/fuse/.gitignore new file mode 100644 index 000000000000..3ee9a27fe66a --- /dev/null +++ b/tools/testing/selftests/filesystems/fuse/.gitignore @@ -0,0 +1,2 @@ +fuse_test +*.raw diff --git a/tools/testing/selftests/filesystems/fuse/Makefile b/tools/testing/selftests/filesystems/fuse/Makefile new file mode 100644 index 000000000000..261d7606560a --- /dev/null +++ b/tools/testing/selftests/filesystems/fuse/Makefile @@ -0,0 +1,34 @@ +# SPDX-License-Identifier: GPL-2.0 +CFLAGS += -D_FILE_OFFSET_BITS=64 -Wall -Werror -I../.. -I../../../../.. 
-I../../../../include +LDLIBS := -lpthread -lelf +TEST_GEN_PROGS := fuse_test fuse_daemon +TEST_GEN_FILES := \ + test_bpf.bpf \ + fd_bpf.bpf \ + fd.sh \ + +EXTRA_CLEAN := *.bpf +BPF_FLAGS = -Wall -Werror -O2 -g -emit-llvm \ + -I ../../../../../include \ + -idirafter /usr/lib/gcc/x86_64-linux-gnu/10/include \ + -idirafter /usr/local/include \ + -idirafter /usr/include/x86_64-linux-gnu \ + -idirafter /usr/include \ + +include ../../lib.mk + +# Put after include ../../lib.mk since that changes $(TEST_GEN_PROGS) +# Otherwise you get multiple targets, this becomes the default, and it's a mess +EXTRA_SOURCES := bpf_loader.c +$(TEST_GEN_PROGS) : $(EXTRA_SOURCES) + +$(OUTPUT)/%.ir: %.c + clang $(BPF_FLAGS) -c $< -o $@ + +$(OUTPUT)/%.bpf: $(OUTPUT)/%.ir + llc -march=bpf -filetype=obj -o $@ $< + +$(OUTPUT)/fd.sh: fd.txt + cp $< $@ + chmod 755 $@ + diff --git a/tools/testing/selftests/filesystems/fuse/OWNERS b/tools/testing/selftests/filesystems/fuse/OWNERS new file mode 100644 index 000000000000..5eb371e1a5a3 --- /dev/null +++ b/tools/testing/selftests/filesystems/fuse/OWNERS @@ -0,0 +1,2 @@ +# include OWNERS from the authoritative android-mainline branch +include kernel/common:android-mainline:/tools/testing/selftests/filesystems/incfs/OWNERS diff --git a/tools/testing/selftests/filesystems/fuse/bpf_loader.c b/tools/testing/selftests/filesystems/fuse/bpf_loader.c new file mode 100644 index 000000000000..5bf26eadd421 --- /dev/null +++ b/tools/testing/selftests/filesystems/fuse/bpf_loader.c @@ -0,0 +1,791 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2021 Google LLC + */ + +#include "test_fuse.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include +#include + +struct _test_options test_options; + +struct s s(const char *s1) +{ + struct s s = {0}; + + if (!s1) + return s; + + s.s = malloc(strlen(s1) + 1); + if (!s.s) + return s; + + strcpy(s.s, s1); + return s; +} + +struct s sn(const char *s1, const char *s2) +{ + struct s s = {0}; + + if (!s1) + return s; + + s.s = malloc(s2 - s1 + 1); + if (!s.s) + return s; + + strncpy(s.s, s1, s2 - s1); + s.s[s2 - s1] = 0; + return s; +} + +int s_cmp(struct s s1, struct s s2) +{ + int result = -1; + + if (!s1.s || !s2.s) + goto out; + result = strcmp(s1.s, s2.s); +out: + free(s1.s); + free(s2.s); + return result; +} + +struct s s_cat(struct s s1, struct s s2) +{ + struct s s = {0}; + + if (!s1.s || !s2.s) + goto out; + + s.s = malloc(strlen(s1.s) + strlen(s2.s) + 1); + if (!s.s) + goto out; + + strcpy(s.s, s1.s); + strcat(s.s, s2.s); +out: + free(s1.s); + free(s2.s); + return s; +} + +struct s s_splitleft(struct s s1, char c) +{ + struct s s = {0}; + char *split; + + if (!s1.s) + return s; + + split = strchr(s1.s, c); + if (split) + s = sn(s1.s, split); + + free(s1.s); + return s; +} + +struct s s_splitright(struct s s1, char c) +{ + struct s s2 = {0}; + char *split; + + if (!s1.s) + return s2; + + split = strchr(s1.s, c); + if (split) + s2 = s(split + 1); + + free(s1.s); + return s2; +} + +struct s s_word(struct s s1, char c, size_t n) +{ + while (n--) + s1 = s_splitright(s1, c); + return s_splitleft(s1, c); +} + +struct s s_path(struct s s1, struct s s2) +{ + return s_cat(s_cat(s1, s("/")), s2); +} + +struct s s_pathn(size_t n, struct s s1, ...) 
+{ + va_list argp; + + va_start(argp, s1); + while (--n) + s1 = s_path(s1, va_arg(argp, struct s)); + va_end(argp); + return s1; +} + +int s_link(struct s src_pathname, struct s dst_pathname) +{ + int res; + + if (src_pathname.s && dst_pathname.s) { + res = link(src_pathname.s, dst_pathname.s); + } else { + res = -1; + errno = ENOMEM; + } + + free(src_pathname.s); + free(dst_pathname.s); + return res; +} + +int s_symlink(struct s src_pathname, struct s dst_pathname) +{ + int res; + + if (src_pathname.s && dst_pathname.s) { + res = symlink(src_pathname.s, dst_pathname.s); + } else { + res = -1; + errno = ENOMEM; + } + + free(src_pathname.s); + free(dst_pathname.s); + return res; +} + + +int s_mkdir(struct s pathname, mode_t mode) +{ + int res; + + if (!pathname.s) { + errno = ENOMEM; + return -1; + } + + res = mkdir(pathname.s, mode); + free(pathname.s); + return res; +} + +int s_rmdir(struct s pathname) +{ + int res; + + if (!pathname.s) { + errno = ENOMEM; + return -1; + } + + res = rmdir(pathname.s); + free(pathname.s); + return res; +} + +int s_unlink(struct s pathname) +{ + int res; + + if (!pathname.s) { + errno = ENOMEM; + return -1; + } + + res = unlink(pathname.s); + free(pathname.s); + return res; +} + +int s_open(struct s pathname, int flags, ...) +{ + va_list ap; + int res; + + va_start(ap, flags); + if (!pathname.s) { + errno = ENOMEM; + return -1; + } + + if (flags & (O_CREAT | O_TMPFILE)) + res = open(pathname.s, flags, va_arg(ap, mode_t)); + else + res = open(pathname.s, flags); + + free(pathname.s); + va_end(ap); + return res; +} + +int s_openat(int dirfd, struct s pathname, int flags, ...) +{ + va_list ap; + int res; + + va_start(ap, flags); + if (!pathname.s) { + errno = ENOMEM; + return -1; + } + + if (flags & (O_CREAT | O_TMPFILE)) + res = openat(dirfd, pathname.s, flags, va_arg(ap, mode_t)); + else + res = openat(dirfd, pathname.s, flags); + + free(pathname.s); + va_end(ap); + return res; +} + +int s_creat(struct s pathname, mode_t mode) +{ + int res; + + if (!pathname.s) { + errno = ENOMEM; + return -1; + } + + res = open(pathname.s, O_WRONLY | O_CREAT | O_TRUNC | O_CLOEXEC, mode); + free(pathname.s); + return res; +} + +int s_mkfifo(struct s pathname, mode_t mode) +{ + int res; + + if (!pathname.s) { + errno = ENOMEM; + return -1; + } + + res = mknod(pathname.s, S_IFIFO | mode, 0); + free(pathname.s); + return res; +} + +int s_stat(struct s pathname, struct stat *st) +{ + int res; + + if (!pathname.s) { + errno = ENOMEM; + return -1; + } + + res = stat(pathname.s, st); + free(pathname.s); + return res; +} + +int s_statfs(struct s pathname, struct statfs *st) +{ + int res; + + if (!pathname.s) { + errno = ENOMEM; + return -1; + } + + res = statfs(pathname.s, st); + free(pathname.s); + return res; +} + +DIR *s_opendir(struct s pathname) +{ + DIR *res; + + res = opendir(pathname.s); + free(pathname.s); + return res; +} + +int s_getxattr(struct s pathname, const char name[], void *value, size_t size, + ssize_t *ret_size) +{ + if (!pathname.s) { + errno = ENOMEM; + return -1; + } + + *ret_size = getxattr(pathname.s, name, value, size); + free(pathname.s); + return *ret_size >= 0 ? 0 : -1; +} + +int s_listxattr(struct s pathname, void *list, size_t size, ssize_t *ret_size) +{ + if (!pathname.s) { + errno = ENOMEM; + return -1; + } + + *ret_size = listxattr(pathname.s, list, size); + free(pathname.s); + return *ret_size >= 0 ? 
0 : -1; +} + +int s_setxattr(struct s pathname, const char name[], const void *value, size_t size, int flags) +{ + int res; + + if (!pathname.s) { + errno = ENOMEM; + return -1; + } + + res = setxattr(pathname.s, name, value, size, flags); + free(pathname.s); + return res; +} + +int s_removexattr(struct s pathname, const char name[]) +{ + int res; + + if (!pathname.s) { + errno = ENOMEM; + return -1; + } + + res = removexattr(pathname.s, name); + free(pathname.s); + return res; +} + +int s_rename(struct s oldpathname, struct s newpathname) +{ + int res; + + if (!oldpathname.s || !newpathname.s) { + errno = ENOMEM; + return -1; + } + + res = rename(oldpathname.s, newpathname.s); + free(oldpathname.s); + free(newpathname.s); + return res; +} + +int s_fuse_attr(struct s pathname, struct fuse_attr *fuse_attr_out) +{ + + struct stat st; + int result = TEST_FAILURE; + + TESTSYSCALL(s_stat(pathname, &st)); + + fuse_attr_out->ino = st.st_ino; + fuse_attr_out->mode = st.st_mode; + fuse_attr_out->nlink = st.st_nlink; + fuse_attr_out->uid = st.st_uid; + fuse_attr_out->gid = st.st_gid; + fuse_attr_out->rdev = st.st_rdev; + fuse_attr_out->size = st.st_size; + fuse_attr_out->blksize = st.st_blksize; + fuse_attr_out->blocks = st.st_blocks; + fuse_attr_out->atime = st.st_atime; + fuse_attr_out->mtime = st.st_mtime; + fuse_attr_out->ctime = st.st_ctime; + fuse_attr_out->atimensec = UINT32_MAX; + fuse_attr_out->mtimensec = UINT32_MAX; + fuse_attr_out->ctimensec = UINT32_MAX; + + result = TEST_SUCCESS; +out: + return result; +} + +struct s tracing_folder(void) +{ + struct s trace = {0}; + FILE *mounts = NULL; + char *line = NULL; + size_t size = 0; + + TEST(mounts = fopen("/proc/mounts", "re"), mounts); + while (getline(&line, &size, mounts) != -1) { + if (!s_cmp(s_word(sn(line, line + size), ' ', 2), + s("tracefs"))) { + trace = s_word(sn(line, line + size), ' ', 1); + break; + } + + if (!s_cmp(s_word(sn(line, line + size), ' ', 2), s("debugfs"))) + trace = s_path(s_word(sn(line, line + size), ' ', 1), + s("tracing")); + } + +out: + free(line); + fclose(mounts); + return trace; +} + +int tracing_on(void) +{ + int result = TEST_FAILURE; + int tracing_on = -1; + + TEST(tracing_on = s_open(s_path(tracing_folder(), s("tracing_on")), + O_WRONLY | O_CLOEXEC), + tracing_on != -1); + TESTEQUAL(write(tracing_on, "1", 1), 1); + result = TEST_SUCCESS; +out: + close(tracing_on); + return result; +} + +char *concat_file_name(const char *dir, const char *file) +{ + char full_name[FILENAME_MAX] = ""; + + if (snprintf(full_name, ARRAY_SIZE(full_name), "%s/%s", dir, file) < 0) + return NULL; + return strdup(full_name); +} + +char *setup_mount_dir(const char *name) +{ + struct stat st; + char *current_dir = getcwd(NULL, 0); + char *mount_dir = concat_file_name(current_dir, name); + + free(current_dir); + if (stat(mount_dir, &st) == 0) { + if (S_ISDIR(st.st_mode)) + return mount_dir; + + ksft_print_msg("%s is a file, not a dir.\n", mount_dir); + return NULL; + } + + if (mkdir(mount_dir, 0777)) { + ksft_print_msg("Can't create mount dir."); + return NULL; + } + + return mount_dir; +} + +int delete_dir_tree(const char *dir_path, bool remove_root) +{ + DIR *dir = NULL; + struct dirent *dp; + int result = 0; + + dir = opendir(dir_path); + if (!dir) { + result = -errno; + goto out; + } + + while ((dp = readdir(dir))) { + char *full_path; + + if (!strcmp(dp->d_name, ".") || !strcmp(dp->d_name, "..")) + continue; + + full_path = concat_file_name(dir_path, dp->d_name); + if (dp->d_type == DT_DIR) + result = delete_dir_tree(full_path, 
true); + else + result = unlink(full_path); + free(full_path); + if (result) + goto out; + } + +out: + if (dir) + closedir(dir); + if (!result && remove_root) + rmdir(dir_path); + return result; +} + +static int mount_fuse_maybe_init(const char *mount_dir, int bpf_fd, int dir_fd, + int *fuse_dev_ptr, bool init) +{ + int result = TEST_FAILURE; + int fuse_dev = -1; + char options[FILENAME_MAX]; + uint8_t bytes_in[FUSE_MIN_READ_BUFFER]; + uint8_t bytes_out[FUSE_MIN_READ_BUFFER]; + + DECL_FUSE_IN(init); + + TEST(fuse_dev = open("/dev/fuse", O_RDWR | O_CLOEXEC), fuse_dev != -1); + snprintf(options, FILENAME_MAX, "fd=%d,user_id=0,group_id=0,rootmode=0040000", + fuse_dev); + if (bpf_fd != -1) + snprintf(options + strlen(options), + sizeof(options) - strlen(options), + ",root_bpf=%d", bpf_fd); + if (dir_fd != -1) + snprintf(options + strlen(options), + sizeof(options) - strlen(options), + ",root_dir=%d", dir_fd); + TESTSYSCALL(mount("ABC", mount_dir, "fuse", 0, options)); + + if (init) { + TESTFUSEIN(FUSE_INIT, init_in); + TESTEQUAL(init_in->major, FUSE_KERNEL_VERSION); + TESTEQUAL(init_in->minor, FUSE_KERNEL_MINOR_VERSION); + TESTFUSEOUT1(fuse_init_out, ((struct fuse_init_out) { + .major = FUSE_KERNEL_VERSION, + .minor = FUSE_KERNEL_MINOR_VERSION, + .max_readahead = 4096, + .flags = 0, + .max_background = 0, + .congestion_threshold = 0, + .max_write = 4096, + .time_gran = 1000, + .max_pages = 12, + .map_alignment = 4096, + })); + } + + *fuse_dev_ptr = fuse_dev; + fuse_dev = -1; + result = TEST_SUCCESS; +out: + close(fuse_dev); + return result; +} + +int mount_fuse(const char *mount_dir, int bpf_fd, int dir_fd, int *fuse_dev_ptr) +{ + return mount_fuse_maybe_init(mount_dir, bpf_fd, dir_fd, fuse_dev_ptr, + true); +} + +int mount_fuse_no_init(const char *mount_dir, int bpf_fd, int dir_fd, + int *fuse_dev_ptr) +{ + return mount_fuse_maybe_init(mount_dir, bpf_fd, dir_fd, fuse_dev_ptr, + false); +} + +struct fuse_bpf_map { + unsigned int map_type; + size_t key_size; + size_t value_size; + unsigned int max_entries; +}; + +static int install_maps(Elf_Data *maps, int maps_index, Elf *elf, + Elf_Data *symbols, int symbol_index, + struct map_relocation **mr, size_t *map_count) +{ + int result = TEST_FAILURE; + int i; + GElf_Sym symbol; + + TESTNE((void *)symbols, NULL); + + for (i = 0; i < symbols->d_size / sizeof(symbol); ++i) { + TESTNE((void *)gelf_getsym(symbols, i, &symbol), 0); + if (symbol.st_shndx == maps_index) { + struct fuse_bpf_map *map; + union bpf_attr attr; + int map_fd; + + map = (struct fuse_bpf_map *) + ((char *)maps->d_buf + symbol.st_value); + + attr = (union bpf_attr) { + .map_type = map->map_type, + .key_size = map->key_size, + .value_size = map->value_size, + .max_entries = map->max_entries, + }; + + TEST(*mr = realloc(*mr, ++*map_count * + sizeof(struct fuse_bpf_map)), + *mr); + TEST(map_fd = syscall(__NR_bpf, BPF_MAP_CREATE, + &attr, sizeof(attr)), + map_fd != -1); + (*mr)[*map_count - 1] = (struct map_relocation) { + .name = strdup(elf_strptr(elf, symbol_index, + symbol.st_name)), + .fd = map_fd, + .value = symbol.st_value, + }; + } + } + + result = TEST_SUCCESS; +out: + return result; +} + +static inline int relocate_maps(GElf_Shdr *rel_header, Elf_Data *rel_data, + Elf_Data *prog_data, Elf_Data *symbol_data, + struct map_relocation *map_relocations, + size_t map_count) +{ + int result = TEST_FAILURE; + int i; + struct bpf_insn *insns = (struct bpf_insn *) prog_data->d_buf; + + for (i = 0; i < rel_header->sh_size / rel_header->sh_entsize; ++i) { + GElf_Sym sym; + GElf_Rel rel; + 
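+		/* Each relocation entry names a map symbol; the corresponding load instruction is rewritten below to carry that map's fd (BPF_PSEUDO_MAP_FD). */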
unsigned int insn_idx; + int map_idx; + + gelf_getrel(rel_data, i, &rel); + insn_idx = rel.r_offset / sizeof(struct bpf_insn); + insns[insn_idx].src_reg = BPF_PSEUDO_MAP_FD; + + gelf_getsym(symbol_data, GELF_R_SYM(rel.r_info), &sym); + for (map_idx = 0; map_idx < map_count; map_idx++) { + if (map_relocations[map_idx].value == sym.st_value) { + insns[insn_idx].imm = + map_relocations[map_idx].fd; + break; + } + } + TESTNE(map_idx, map_count); + } + + result = TEST_SUCCESS; +out: + return result; +} + +int install_elf_bpf(const char *file, const char *section, int *fd, + struct map_relocation **map_relocations, size_t *map_count) +{ + int result = TEST_FAILURE; + char path[PATH_MAX] = {}; + char *last_slash; + int filter_fd = -1; + union bpf_attr bpf_attr; + static char log[1 << 20]; + Elf *elf = NULL; + GElf_Ehdr ehdr; + Elf_Data *data_prog = NULL, *data_maps = NULL, *data_symbols = NULL; + int maps_index, symbol_index, prog_index; + int i; + int bpf_prog_type_fuse_fd = -1; + char buffer[10] = {0}; + int bpf_prog_type_fuse; + + TESTNE(readlink("/proc/self/exe", path, PATH_MAX), -1); + TEST(last_slash = strrchr(path, '/'), last_slash); + strcpy(last_slash + 1, file); + TEST(filter_fd = open(path, O_RDONLY | O_CLOEXEC), filter_fd != -1); + TESTNE(elf_version(EV_CURRENT), EV_NONE); + TEST(elf = elf_begin(filter_fd, ELF_C_READ, NULL), elf); + TESTEQUAL((void *) gelf_getehdr(elf, &ehdr), &ehdr); + for (i = 1; i < ehdr.e_shnum; i++) { + char *shname; + GElf_Shdr shdr; + Elf_Scn *scn; + + TEST(scn = elf_getscn(elf, i), scn); + TESTEQUAL((void *)gelf_getshdr(scn, &shdr), &shdr); + TEST(shname = elf_strptr(elf, ehdr.e_shstrndx, shdr.sh_name), + shname); + + if (!strcmp(shname, "maps")) { + TEST(data_maps = elf_getdata(scn, 0), data_maps); + maps_index = i; + } else if (shdr.sh_type == SHT_SYMTAB) { + TEST(data_symbols = elf_getdata(scn, 0), data_symbols); + symbol_index = shdr.sh_link; + } else if (!strcmp(shname, section)) { + TEST(data_prog = elf_getdata(scn, 0), data_prog); + prog_index = i; + } + } + TESTNE((void *) data_prog, NULL); + + if (data_maps) + TESTEQUAL(install_maps(data_maps, maps_index, elf, + data_symbols, symbol_index, + map_relocations, map_count), 0); + + /* Now relocate maps */ + for (i = 1; i < ehdr.e_shnum; i++) { + GElf_Shdr rel_header; + Elf_Scn *scn; + Elf_Data *rel_data; + + TEST(scn = elf_getscn(elf, i), scn); + TESTEQUAL((void *)gelf_getshdr(scn, &rel_header), + &rel_header); + if (rel_header.sh_type != SHT_REL) + continue; + TEST(rel_data = elf_getdata(scn, 0), rel_data); + + if (rel_header.sh_info != prog_index) + continue; + TESTEQUAL(relocate_maps(&rel_header, rel_data, + data_prog, data_symbols, + *map_relocations, *map_count), + 0); + } + + TEST(bpf_prog_type_fuse_fd = open("/sys/fs/fuse/bpf_prog_type_fuse", + O_RDONLY | O_CLOEXEC), + bpf_prog_type_fuse_fd != -1); + TESTGE(read(bpf_prog_type_fuse_fd, buffer, sizeof(buffer)), 1); + TEST(bpf_prog_type_fuse = strtol(buffer, NULL, 10), + bpf_prog_type_fuse != 0); + + bpf_attr = (union bpf_attr) { + .prog_type = bpf_prog_type_fuse, + .insn_cnt = data_prog->d_size / 8, + .insns = ptr_to_u64(data_prog->d_buf), + .license = ptr_to_u64("GPL"), + .log_buf = test_options.verbose ? ptr_to_u64(log) : 0, + .log_size = test_options.verbose ? sizeof(log) : 0, + .log_level = test_options.verbose ? 
2 : 0, + }; + *fd = syscall(__NR_bpf, BPF_PROG_LOAD, &bpf_attr, sizeof(bpf_attr)); + if (test_options.verbose) + ksft_print_msg("%s\n", log); + if (*fd == -1 && errno == ENOSPC) + ksft_print_msg("bpf log size too small!\n"); + TESTNE(*fd, -1); + + result = TEST_SUCCESS; +out: + close(filter_fd); + close(bpf_prog_type_fuse_fd); + return result; +} + + diff --git a/tools/testing/selftests/filesystems/fuse/fd.txt b/tools/testing/selftests/filesystems/fuse/fd.txt new file mode 100644 index 000000000000..15ce77180d55 --- /dev/null +++ b/tools/testing/selftests/filesystems/fuse/fd.txt @@ -0,0 +1,21 @@ +fuse_daemon $* +cd fd-dst +ls +cd show +ls +fsstress -s 123 -d . -p 4 -n 100 -l5 +echo test > wibble +ls +cat wibble +fallocate -l 1000 wobble +mkdir testdir +mkdir tmpdir +rmdir tmpdir +touch tmp +mv tmp tmp2 +rm tmp2 + +# FUSE_LINK +echo "ln_src contents" > ln_src +ln ln_src ln_link +cat ln_link diff --git a/tools/testing/selftests/filesystems/fuse/fd_bpf.c b/tools/testing/selftests/filesystems/fuse/fd_bpf.c new file mode 100644 index 000000000000..3cd82d67e759 --- /dev/null +++ b/tools/testing/selftests/filesystems/fuse/fd_bpf.c @@ -0,0 +1,252 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause +// Copyright (c) 2021 Google LLC + +#include "test_fuse_bpf.h" + +SEC("maps") struct fuse_bpf_map test_map = { + BPF_MAP_TYPE_ARRAY, + sizeof(uint32_t), + sizeof(uint32_t), + 1000, +}; + +SEC("maps") struct fuse_bpf_map test_map2 = { + BPF_MAP_TYPE_HASH, + sizeof(uint32_t), + sizeof(uint64_t), + 76, +}; + +SEC("test_daemon") int trace_daemon(struct fuse_bpf_args *fa) +{ + uint64_t uid_gid = bpf_get_current_uid_gid(); + uint32_t uid = uid_gid & 0xffffffff; + uint64_t pid_tgid = bpf_get_current_pid_tgid(); + uint32_t pid = pid_tgid & 0xffffffff; + uint32_t key = 23; + uint32_t *pvalue; + + pvalue = bpf_map_lookup_elem(&test_map, &key); + if (pvalue) { + uint32_t value = *pvalue; + + bpf_printk("pid %u uid %u value %u", pid, uid, value); + value++; + bpf_map_update_elem(&test_map, &key, &value, BPF_ANY); + } + + switch (fa->opcode) { + case FUSE_ACCESS | FUSE_PREFILTER: { + bpf_printk("Access: %d", fa->nodeid); + return FUSE_BPF_BACKING; + } + + case FUSE_GETATTR | FUSE_PREFILTER: { + const struct fuse_getattr_in *fgi = fa->in_args[0].value; + + bpf_printk("Get Attr %d", fgi->fh); + return FUSE_BPF_BACKING; + } + + case FUSE_SETATTR | FUSE_PREFILTER: { + const struct fuse_setattr_in *fsi = fa->in_args[0].value; + + bpf_printk("Set Attr %d", fsi->fh); + return FUSE_BPF_BACKING; + } + + case FUSE_OPENDIR | FUSE_PREFILTER: { + bpf_printk("Open Dir: %d", fa->nodeid); + return FUSE_BPF_BACKING; + } + + case FUSE_READDIR | FUSE_PREFILTER: { + const struct fuse_read_in *fri = fa->in_args[0].value; + + bpf_printk("Read Dir: fh: %lu", fri->fh, fri->offset); + return FUSE_BPF_BACKING; + } + + case FUSE_LOOKUP | FUSE_PREFILTER: { + const char *name = fa->in_args[0].value; + + bpf_printk("Lookup: %lx %s", fa->nodeid, name); + if (fa->nodeid == 1) + return FUSE_BPF_USER_FILTER | FUSE_BPF_BACKING; + else + return FUSE_BPF_BACKING; + } + + case FUSE_MKNOD | FUSE_PREFILTER: { + const struct fuse_mknod_in *fmi = fa->in_args[0].value; + const char *name = fa->in_args[1].value; + + bpf_printk("mknod %s %x %x", name, fmi->rdev | fmi->mode, fmi->umask); + return FUSE_BPF_BACKING; + } + + case FUSE_MKDIR | FUSE_PREFILTER: { + const struct fuse_mkdir_in *fmi = fa->in_args[0].value; + const char *name = fa->in_args[1].value; + + bpf_printk("mkdir: %s %x %x", name, fmi->mode, fmi->umask); + return FUSE_BPF_BACKING; + } + + 
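+	/* The remaining prefilter cases follow the same pattern: log the interesting arguments, then return FUSE_BPF_BACKING so the request is served by the backing filesystem. */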
case FUSE_RMDIR | FUSE_PREFILTER: { + const char *name = fa->in_args[0].value; + + bpf_printk("rmdir: %s", name); + return FUSE_BPF_BACKING; + } + + case FUSE_RENAME | FUSE_PREFILTER: { + const char *oldname = fa->in_args[1].value; + const char *newname = fa->in_args[2].value; + + bpf_printk("rename from %s", oldname); + bpf_printk("rename to %s", newname); + return FUSE_BPF_BACKING; + } + + case FUSE_RENAME2 | FUSE_PREFILTER: { + const struct fuse_rename2_in *fri = fa->in_args[0].value; + uint32_t flags = fri->flags; + const char *oldname = fa->in_args[1].value; + const char *newname = fa->in_args[2].value; + + bpf_printk("rename(%x) from %s", flags, oldname); + bpf_printk("rename to %s", newname); + return FUSE_BPF_BACKING; + } + + case FUSE_UNLINK | FUSE_PREFILTER: { + const char *name = fa->in_args[0].value; + + bpf_printk("unlink: %s", name); + return FUSE_BPF_BACKING; + } + + case FUSE_LINK | FUSE_PREFILTER: { + const struct fuse_link_in *fli = fa->in_args[0].value; + const char *dst_name = fa->in_args[1].value; + + bpf_printk("Link: %d %s", fli->oldnodeid, dst_name); + return FUSE_BPF_BACKING; + } + + case FUSE_SYMLINK | FUSE_PREFILTER: { + const char *link_name = fa->in_args[0].value; + const char *link_dest = fa->in_args[1].value; + + bpf_printk("symlink from %s", link_name); + bpf_printk("symlink to %s", link_dest); + return FUSE_BPF_BACKING; + } + + case FUSE_READLINK | FUSE_PREFILTER: { + const char *link_name = fa->in_args[0].value; + + bpf_printk("readlink from %s", link_name); + return FUSE_BPF_BACKING; + } + + case FUSE_RELEASE | FUSE_PREFILTER: { + const struct fuse_release_in *fri = fa->in_args[0].value; + + bpf_printk("Release: %d", fri->fh); + return FUSE_BPF_BACKING; + } + + case FUSE_RELEASEDIR | FUSE_PREFILTER: { + const struct fuse_release_in *fri = fa->in_args[0].value; + + bpf_printk("Release Dir: %d", fri->fh); + return FUSE_BPF_BACKING; + } + + case FUSE_CREATE | FUSE_PREFILTER: { + bpf_printk("Create %s", fa->in_args[1].value); + return FUSE_BPF_BACKING; + } + + case FUSE_OPEN | FUSE_PREFILTER: { + bpf_printk("Open: %d", fa->nodeid); + return FUSE_BPF_BACKING; + } + + case FUSE_READ | FUSE_PREFILTER: { + const struct fuse_read_in *fri = fa->in_args[0].value; + + bpf_printk("Read: fh: %lu, offset %lu, size %lu", + fri->fh, fri->offset, fri->size); + return FUSE_BPF_BACKING; + } + + case FUSE_WRITE | FUSE_PREFILTER: { + const struct fuse_write_in *fwi = fa->in_args[0].value; + + bpf_printk("Write: fh: %lu, offset %lu, size %lu", + fwi->fh, fwi->offset, fwi->size); + return FUSE_BPF_BACKING; + } + + case FUSE_FLUSH | FUSE_PREFILTER: { + const struct fuse_flush_in *ffi = fa->in_args[0].value; + + bpf_printk("Flush %d", ffi->fh); + return FUSE_BPF_BACKING; + } + + case FUSE_FALLOCATE | FUSE_PREFILTER: { + const struct fuse_fallocate_in *ffa = fa->in_args[0].value; + + bpf_printk("Fallocate %d %lu", ffa->fh, ffa->length); + return FUSE_BPF_BACKING; + } + + case FUSE_GETXATTR | FUSE_PREFILTER: { + const char *name = fa->in_args[1].value; + + bpf_printk("Getxattr %d %s", fa->nodeid, name); + return FUSE_BPF_BACKING; + } + + case FUSE_LISTXATTR | FUSE_PREFILTER: { + const char *name = fa->in_args[1].value; + + bpf_printk("Listxattr %d %s", fa->nodeid, name); + return FUSE_BPF_BACKING; + } + + case FUSE_SETXATTR | FUSE_PREFILTER: { + const char *name = fa->in_args[1].value; + + bpf_printk("Setxattr %d %s", fa->nodeid, name); + return FUSE_BPF_BACKING; + } + + case FUSE_STATFS | FUSE_PREFILTER: { + bpf_printk("statfs %d", fa->nodeid); + return FUSE_BPF_BACKING; + } + + 
case FUSE_LSEEK | FUSE_PREFILTER: { + const struct fuse_lseek_in *fli = fa->in_args[0].value; + + bpf_printk("lseek type:%d, offset:%lld", fli->whence, fli->offset); + return FUSE_BPF_BACKING; + } + + default: + if (fa->opcode & FUSE_PREFILTER) + bpf_printk("prefilter *** UNKNOWN *** opcode: %d", + fa->opcode & FUSE_OPCODE_FILTER); + else if (fa->opcode & FUSE_POSTFILTER) + bpf_printk("postfilter *** UNKNOWN *** opcode: %d", + fa->opcode & FUSE_OPCODE_FILTER); + else + bpf_printk("*** UNKNOWN *** opcode: %d", fa->opcode); + return FUSE_BPF_BACKING; + } +} diff --git a/tools/testing/selftests/filesystems/fuse/fuse_daemon.c b/tools/testing/selftests/filesystems/fuse/fuse_daemon.c new file mode 100644 index 000000000000..1b6f8c2acf2b --- /dev/null +++ b/tools/testing/selftests/filesystems/fuse/fuse_daemon.c @@ -0,0 +1,294 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2021 Google LLC + */ + +#include "test_fuse.h" + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include +#include + +bool user_messages; +bool kernel_messages; + +static int display_trace(void) +{ + int pid = -1; + int tp = -1; + char c; + ssize_t bytes_read; + static char line[256] = {0}; + + if (!kernel_messages) + return TEST_SUCCESS; + + TEST(pid = fork(), pid != -1); + if (pid != 0) + return pid; + + TESTEQUAL(tracing_on(), 0); + TEST(tp = s_open(s_path(tracing_folder(), s("trace_pipe")), + O_RDONLY | O_CLOEXEC), tp != -1); + for (;;) { + TEST(bytes_read = read(tp, &c, sizeof(c)), + bytes_read == 1); + if (c == '\n') { + printf("%s\n", line); + line[0] = 0; + } else + sprintf(line + strlen(line), "%c", c); + } +out: + if (pid == 0) { + close(tp); + exit(TEST_FAILURE); + } + return pid; +} + +static const char *fuse_opcode_to_string(int opcode) +{ + switch (opcode & FUSE_OPCODE_FILTER) { + case FUSE_LOOKUP: + return "FUSE_LOOKUP"; + case FUSE_FORGET: + return "FUSE_FORGET"; + case FUSE_GETATTR: + return "FUSE_GETATTR"; + case FUSE_SETATTR: + return "FUSE_SETATTR"; + case FUSE_READLINK: + return "FUSE_READLINK"; + case FUSE_SYMLINK: + return "FUSE_SYMLINK"; + case FUSE_MKNOD: + return "FUSE_MKNOD"; + case FUSE_MKDIR: + return "FUSE_MKDIR"; + case FUSE_UNLINK: + return "FUSE_UNLINK"; + case FUSE_RMDIR: + return "FUSE_RMDIR"; + case FUSE_RENAME: + return "FUSE_RENAME"; + case FUSE_LINK: + return "FUSE_LINK"; + case FUSE_OPEN: + return "FUSE_OPEN"; + case FUSE_READ: + return "FUSE_READ"; + case FUSE_WRITE: + return "FUSE_WRITE"; + case FUSE_STATFS: + return "FUSE_STATFS"; + case FUSE_RELEASE: + return "FUSE_RELEASE"; + case FUSE_FSYNC: + return "FUSE_FSYNC"; + case FUSE_SETXATTR: + return "FUSE_SETXATTR"; + case FUSE_GETXATTR: + return "FUSE_GETXATTR"; + case FUSE_LISTXATTR: + return "FUSE_LISTXATTR"; + case FUSE_REMOVEXATTR: + return "FUSE_REMOVEXATTR"; + case FUSE_FLUSH: + return "FUSE_FLUSH"; + case FUSE_INIT: + return "FUSE_INIT"; + case FUSE_OPENDIR: + return "FUSE_OPENDIR"; + case FUSE_READDIR: + return "FUSE_READDIR"; + case FUSE_RELEASEDIR: + return "FUSE_RELEASEDIR"; + case FUSE_FSYNCDIR: + return "FUSE_FSYNCDIR"; + case FUSE_GETLK: + return "FUSE_GETLK"; + case FUSE_SETLK: + return "FUSE_SETLK"; + case FUSE_SETLKW: + return "FUSE_SETLKW"; + case FUSE_ACCESS: + return "FUSE_ACCESS"; + case FUSE_CREATE: + return "FUSE_CREATE"; + case FUSE_INTERRUPT: + return "FUSE_INTERRUPT"; + case FUSE_BMAP: + return "FUSE_BMAP"; + case FUSE_DESTROY: + return "FUSE_DESTROY"; + case FUSE_IOCTL: + return "FUSE_IOCTL"; + case FUSE_POLL: + return "FUSE_POLL"; + case 
FUSE_NOTIFY_REPLY: + return "FUSE_NOTIFY_REPLY"; + case FUSE_BATCH_FORGET: + return "FUSE_BATCH_FORGET"; + case FUSE_FALLOCATE: + return "FUSE_FALLOCATE"; + case FUSE_READDIRPLUS: + return "FUSE_READDIRPLUS"; + case FUSE_RENAME2: + return "FUSE_RENAME2"; + case FUSE_LSEEK: + return "FUSE_LSEEK"; + case FUSE_COPY_FILE_RANGE: + return "FUSE_COPY_FILE_RANGE"; + case FUSE_SETUPMAPPING: + return "FUSE_SETUPMAPPING"; + case FUSE_REMOVEMAPPING: + return "FUSE_REMOVEMAPPING"; + //case FUSE_SYNCFS: + // return "FUSE_SYNCFS"; + case CUSE_INIT: + return "CUSE_INIT"; + case CUSE_INIT_BSWAP_RESERVED: + return "CUSE_INIT_BSWAP_RESERVED"; + case FUSE_INIT_BSWAP_RESERVED: + return "FUSE_INIT_BSWAP_RESERVED"; + } + return "?"; +} + +static int parse_options(int argc, char *const *argv) +{ + signed char c; + + while ((c = getopt(argc, argv, "kuv")) != -1) + switch (c) { + case 'v': + test_options.verbose = true; + break; + + case 'u': + user_messages = true; + break; + + case 'k': + kernel_messages = true; + break; + + default: + return -EINVAL; + } + + return 0; +} + +int main(int argc, char *argv[]) +{ + int result = TEST_FAILURE; + int trace_pid = -1; + char *mount_dir = NULL; + char *src_dir = NULL; + int bpf_fd = -1; + int src_fd = -1; + int fuse_dev = -1; + struct map_relocation *map_relocations = NULL; + size_t map_count = 0; + int i; + + if (geteuid() != 0) + ksft_print_msg("Not a root, might fail to mount.\n"); + TESTEQUAL(parse_options(argc, argv), 0); + + TEST(trace_pid = display_trace(), trace_pid != -1); + + delete_dir_tree("fd-src", true); + TEST(src_dir = setup_mount_dir("fd-src"), src_dir); + delete_dir_tree("fd-dst", true); + TEST(mount_dir = setup_mount_dir("fd-dst"), mount_dir); + + TESTEQUAL(install_elf_bpf("fd_bpf.bpf", "test_daemon", &bpf_fd, + &map_relocations, &map_count), 0); + + TEST(src_fd = open("fd-src", O_DIRECTORY | O_RDONLY | O_CLOEXEC), + src_fd != -1); + TESTSYSCALL(mkdirat(src_fd, "show", 0777)); + TESTSYSCALL(mkdirat(src_fd, "hide", 0777)); + + for (i = 0; i < map_count; ++i) + if (!strcmp(map_relocations[i].name, "test_map")) { + uint32_t key = 23; + uint32_t value = 1234; + union bpf_attr attr = { + .map_fd = map_relocations[i].fd, + .key = ptr_to_u64(&key), + .value = ptr_to_u64(&value), + .flags = BPF_ANY, + }; + TESTSYSCALL(syscall(__NR_bpf, BPF_MAP_UPDATE_ELEM, + &attr, sizeof(attr))); + } + + TESTEQUAL(mount_fuse(mount_dir, bpf_fd, src_fd, &fuse_dev), 0); + + if (fork()) + return 0; + + for (;;) { + uint8_t bytes_in[FUSE_MIN_READ_BUFFER]; + uint8_t bytes_out[FUSE_MIN_READ_BUFFER] __maybe_unused; + struct fuse_in_header *in_header = + (struct fuse_in_header *)bytes_in; + ssize_t res = read(fuse_dev, bytes_in, sizeof(bytes_in)); + + if (res == -1) + break; + + switch (in_header->opcode) { + case FUSE_LOOKUP | FUSE_PREFILTER: { + char *name = (char *)(bytes_in + sizeof(*in_header)); + + if (user_messages) + printf("Lookup %s\n", name); + if (!strcmp(name, "hide")) + TESTFUSEOUTERROR(-ENOENT); + else + TESTFUSEOUTREAD(name, strlen(name) + 1); + break; + } + default: + if (user_messages) { + printf("opcode is %d (%s)\n", in_header->opcode, + fuse_opcode_to_string( + in_header->opcode)); + } + break; + } + } + + result = TEST_SUCCESS; + +out: + for (i = 0; i < map_count; ++i) { + free(map_relocations[i].name); + close(map_relocations[i].fd); + } + free(map_relocations); + umount2(mount_dir, MNT_FORCE); + delete_dir_tree(mount_dir, true); + free(mount_dir); + delete_dir_tree(src_dir, true); + free(src_dir); + if (trace_pid != -1) + kill(trace_pid, SIGKILL); + return 
result; +} diff --git a/tools/testing/selftests/filesystems/fuse/fuse_test.c b/tools/testing/selftests/filesystems/fuse/fuse_test.c new file mode 100644 index 000000000000..c23f75be15d5 --- /dev/null +++ b/tools/testing/selftests/filesystems/fuse/fuse_test.c @@ -0,0 +1,2142 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2021 Google LLC + */ +#define _GNU_SOURCE + +#include "test_fuse.h" + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +static const char *ft_src = "ft-src"; +static const char *ft_dst = "ft-dst"; + +static void fill_buffer(uint8_t *data, size_t len, int file, int block) +{ + int i; + int seed = 7919 * file + block; + + for (i = 0; i < len; i++) { + seed = 1103515245 * seed + 12345; + data[i] = (uint8_t)(seed >> (i % 13)); + } +} + +static bool test_buffer(uint8_t *data, size_t len, int file, int block) +{ + int i; + int seed = 7919 * file + block; + + for (i = 0; i < len; i++) { + seed = 1103515245 * seed + 12345; + if (data[i] != (uint8_t)(seed >> (i % 13))) + return false; + } + + return true; +} + +static int create_file(int dir, struct s name, int index, size_t blocks) +{ + int result = TEST_FAILURE; + int fd = -1; + int i; + uint8_t data[PAGE_SIZE]; + + TEST(fd = s_openat(dir, name, O_CREAT | O_WRONLY, 0777), fd != -1); + for (i = 0; i < blocks; ++i) { + fill_buffer(data, PAGE_SIZE, index, i); + TESTEQUAL(write(fd, data, sizeof(data)), PAGE_SIZE); + } + TESTSYSCALL(close(fd)); + result = TEST_SUCCESS; + +out: + close(fd); + return result; +} + +static int bpf_clear_trace(void) +{ + int result = TEST_FAILURE; + int tp = -1; + + TEST(tp = s_open(s_path(tracing_folder(), s("trace")), + O_WRONLY | O_TRUNC | O_CLOEXEC), tp != -1); + + result = TEST_SUCCESS; +out: + close(tp); + return result; +} + +static int bpf_test_trace_maybe(const char *substr, bool present) +{ + int result = TEST_FAILURE; + int tp = -1; + char trace_buffer[4096] = {}; + ssize_t bytes_read; + + TEST(tp = s_open(s_path(tracing_folder(), s("trace_pipe")), + O_RDONLY | O_CLOEXEC), + tp != -1); + fcntl(tp, F_SETFL, O_NONBLOCK); + + for (;;) { + bytes_read = read(tp, trace_buffer, sizeof(trace_buffer)); + if (present) + TESTCOND(bytes_read > 0); + else if (bytes_read <= 0) { + result = TEST_SUCCESS; + break; + } + + if (test_options.verbose) + ksft_print_msg("%s\n", trace_buffer); + + if (strstr(trace_buffer, substr)) { + if (present) + result = TEST_SUCCESS; + break; + } + } +out: + close(tp); + return result; +} + +static int bpf_test_trace(const char *substr) +{ + return bpf_test_trace_maybe(substr, true); +} + +static int bpf_test_no_trace(const char *substr) +{ + return bpf_test_trace_maybe(substr, false); +} + +static int basic_test(const char *mount_dir) +{ + const char *test_name = "test"; + const char *test_data = "data"; + + int result = TEST_FAILURE; + int fuse_dev = -1; + char *filename = NULL; + int fd = -1; + FUSE_DECLARE_DAEMON; + + TESTEQUAL(mount_fuse(mount_dir, -1, -1, &fuse_dev), 0); + FUSE_START_DAEMON(); + if (action) { + char data[256]; + + filename = concat_file_name(mount_dir, test_name); + TESTERR(fd = open(filename, O_RDONLY | O_CLOEXEC), fd != -1); + TESTEQUAL(read(fd, data, strlen(test_data)), strlen(test_data)); + TESTCOND(!strcmp(data, test_data)); + TESTSYSCALL(close(fd)); + fd = -1; + } else { + DECL_FUSE_IN(open); + DECL_FUSE_IN(read); + DECL_FUSE_IN(flush); + DECL_FUSE_IN(release); + + TESTFUSELOOKUP(test_name, 0); + TESTFUSEOUT1(fuse_entry_out, ((struct 
fuse_entry_out) { + .nodeid = 2, + .generation = 1, + .attr.ino = 100, + .attr.size = 4, + .attr.blksize = 512, + .attr.mode = S_IFREG | 0777, + })); + + TESTFUSEIN(FUSE_OPEN, open_in); + TESTFUSEOUT1(fuse_open_out, ((struct fuse_open_out) { + .fh = 1, + .open_flags = open_in->flags, + })); + + //TESTFUSEINNULL(FUSE_CANONICAL_PATH); + //TESTFUSEOUTREAD("ignored", 7); + + TESTFUSEIN(FUSE_READ, read_in); + TESTFUSEOUTREAD(test_data, strlen(test_data)); + + TESTFUSEIN(FUSE_FLUSH, flush_in); + TESTFUSEOUTEMPTY(); + + TESTFUSEIN(FUSE_RELEASE, release_in); + TESTFUSEOUTEMPTY(); + exit(TEST_SUCCESS); + } + FUSE_END_DAEMON(); + close(fuse_dev); + close(fd); + free(filename); + umount(mount_dir); + return result; +} + +static int bpf_test_real(const char *mount_dir) +{ + const char *test_name = "real"; + const char *test_data = "Weebles wobble but they don't fall down"; + int result = TEST_FAILURE; + int bpf_fd = -1; + int src_fd = -1; + int fuse_dev = -1; + char *filename = NULL; + int fd = -1; + char read_buffer[256] = {}; + ssize_t bytes_read; + + TEST(src_fd = open(ft_src, O_DIRECTORY | O_RDONLY | O_CLOEXEC), + src_fd != -1); + TEST(fd = openat(src_fd, test_name, O_CREAT | O_RDWR | O_CLOEXEC, 0777), + fd != -1); + TESTEQUAL(write(fd, test_data, strlen(test_data)), strlen(test_data)); + TESTSYSCALL(close(fd)); + fd = -1; + + TESTEQUAL(install_elf_bpf("test_bpf.bpf", "test_trace", + &bpf_fd, NULL, NULL), 0); + TESTEQUAL(mount_fuse(mount_dir, bpf_fd, src_fd, &fuse_dev), 0); + + filename = concat_file_name(mount_dir, test_name); + TESTERR(fd = open(filename, O_RDONLY | O_CLOEXEC), fd != -1); + bytes_read = read(fd, read_buffer, strlen(test_data)); + TESTEQUAL(bytes_read, strlen(test_data)); + TESTEQUAL(strcmp(test_data, read_buffer), 0); + TESTEQUAL(bpf_test_trace("read"), 0); + + result = TEST_SUCCESS; +out: + close(fuse_dev); + close(fd); + free(filename); + umount(mount_dir); + close(src_fd); + close(bpf_fd); + return result; +} + + +static int bpf_test_partial(const char *mount_dir) +{ + const char *test_name = "partial"; + int result = TEST_FAILURE; + int bpf_fd = -1; + int src_fd = -1; + int fuse_dev = -1; + char *filename = NULL; + int fd = -1; + FUSE_DECLARE_DAEMON; + + TEST(src_fd = open(ft_src, O_DIRECTORY | O_RDONLY | O_CLOEXEC), + src_fd != -1); + TESTEQUAL(create_file(src_fd, s(test_name), 1, 2), 0); + TESTEQUAL(install_elf_bpf("test_bpf.bpf", "test_trace", + &bpf_fd, NULL, NULL), 0); + TESTEQUAL(mount_fuse(mount_dir, bpf_fd, src_fd, &fuse_dev), 0); + + FUSE_START_DAEMON(); + if (action) { + uint8_t data[PAGE_SIZE]; + + TEST(filename = concat_file_name(mount_dir, test_name), + filename); + TESTERR(fd = open(filename, O_RDONLY | O_CLOEXEC), fd != -1); + TESTEQUAL(read(fd, data, PAGE_SIZE), PAGE_SIZE); + TESTEQUAL(bpf_test_trace("read"), 0); + TESTCOND(test_buffer(data, PAGE_SIZE, 2, 0)); + TESTCOND(!test_buffer(data, PAGE_SIZE, 1, 0)); + TESTEQUAL(read(fd, data, PAGE_SIZE), PAGE_SIZE); + TESTCOND(test_buffer(data, PAGE_SIZE, 1, 1)); + TESTCOND(!test_buffer(data, PAGE_SIZE, 2, 1)); + TESTSYSCALL(close(fd)); + fd = -1; + } else { + DECL_FUSE(open); + DECL_FUSE(read); + DECL_FUSE(release); + uint8_t data[PAGE_SIZE]; + + TESTFUSEIN2(FUSE_OPEN | FUSE_POSTFILTER, open_in, open_out); + TESTFUSEOUT1(fuse_open_out, ((struct fuse_open_out) { + .fh = 1, + .open_flags = open_in->flags, + })); + + TESTFUSEIN(FUSE_READ, read_in); + fill_buffer(data, PAGE_SIZE, 2, 0); + TESTFUSEOUTREAD(data, PAGE_SIZE); + + TESTFUSEIN(FUSE_RELEASE, release_in); + TESTFUSEOUTEMPTY(); + exit(TEST_SUCCESS); + } + 
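+	/* The else branch above answers the expected FUSE requests and exits; the test branch continues to the shared cleanup below. */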
FUSE_END_DAEMON(); + close(fuse_dev); + close(fd); + free(filename); + umount(mount_dir); + close(src_fd); + close(bpf_fd); + return result; +} + +static int bpf_test_attrs(const char *mount_dir) +{ + const char *test_name = "partial"; + int result = TEST_FAILURE; + int bpf_fd = -1; + int src_fd = -1; + int fuse_dev = -1; + char *filename = NULL; + struct stat st; + + TEST(src_fd = open(ft_src, O_DIRECTORY | O_RDONLY | O_CLOEXEC), + src_fd != -1); + TESTEQUAL(create_file(src_fd, s(test_name), 1, 2), 0); + TESTEQUAL(install_elf_bpf("test_bpf.bpf", "test_trace", + &bpf_fd, NULL, NULL), 0); + TESTEQUAL(mount_fuse(mount_dir, bpf_fd, src_fd, &fuse_dev), 0); + + TEST(filename = concat_file_name(mount_dir, test_name), filename); + TESTSYSCALL(stat(filename, &st)); + TESTSYSCALL(chmod(filename, 0111)); + TESTSYSCALL(stat(filename, &st)); + TESTEQUAL(st.st_mode & 0777, 0111); + TESTSYSCALL(chmod(filename, 0777)); + TESTSYSCALL(stat(filename, &st)); + TESTEQUAL(st.st_mode & 0777, 0777); + TESTSYSCALL(chown(filename, 5, 6)); + TESTSYSCALL(stat(filename, &st)); + TESTEQUAL(st.st_uid, 5); + TESTEQUAL(st.st_gid, 6); + + result = TEST_SUCCESS; +out: + close(fuse_dev); + free(filename); + umount(mount_dir); + close(src_fd); + close(bpf_fd); + return result; +} + +static int bpf_test_readdir(const char *mount_dir) +{ + static const char * const names[] = { + "real", "partial", "fake", ".", ".." + }; + bool used[ARRAY_SIZE(names)] = { false }; + int result = TEST_FAILURE; + int bpf_fd = -1; + int src_fd = -1; + int fuse_dev = -1; + DIR *dir = NULL; + struct dirent *dirent; + FUSE_DECLARE_DAEMON; + + TEST(src_fd = open(ft_src, O_DIRECTORY | O_RDONLY | O_CLOEXEC), + src_fd != -1); + TESTEQUAL(create_file(src_fd, s(names[0]), 1, 2), 0); + TESTEQUAL(create_file(src_fd, s(names[1]), 1, 2), 0); + TESTEQUAL(install_elf_bpf("test_bpf.bpf", "test_trace", + &bpf_fd, NULL, NULL), 0); + TESTEQUAL(mount_fuse(mount_dir, bpf_fd, src_fd, &fuse_dev), 0); + + FUSE_START_DAEMON(); + if (action) { + int i, j; + + TEST(dir = s_opendir(s(mount_dir)), dir); + TESTEQUAL(bpf_test_trace("opendir"), 0); + + for (i = 0; i < ARRAY_SIZE(names); ++i) { + TEST(dirent = readdir(dir), dirent); + + for (j = 0; j < ARRAY_SIZE(names); ++j) + if (!used[j] && + strcmp(names[j], dirent->d_name) == 0) { + used[j] = true; + break; + } + TESTNE(j, ARRAY_SIZE(names)); + } + TEST(dirent = readdir(dir), dirent == NULL); + TESTSYSCALL(closedir(dir)); + dir = NULL; + TESTEQUAL(bpf_test_trace("readdir"), 0); + } else { + struct fuse_in_header *in_header = + (struct fuse_in_header *)bytes_in; + ssize_t res = read(fuse_dev, bytes_in, sizeof(bytes_in)); + struct fuse_read_out *read_out = + (struct fuse_read_out *) (bytes_in + + sizeof(*in_header) + + sizeof(struct fuse_read_in)); + struct fuse_dirent *fuse_dirent = + (struct fuse_dirent *) (bytes_in + res); + + TESTGE(res, sizeof(*in_header) + sizeof(struct fuse_read_in)); + TESTEQUAL(in_header->opcode, FUSE_READDIR | FUSE_POSTFILTER); + *fuse_dirent = (struct fuse_dirent) { + .ino = 100, + .off = 5, + .namelen = strlen("fake"), + .type = DT_REG, + }; + strcpy((char *)(bytes_in + res + sizeof(*fuse_dirent)), "fake"); + res += FUSE_DIRENT_ALIGN(sizeof(*fuse_dirent) + strlen("fake") + + 1); + TESTFUSEDIROUTREAD(read_out, + bytes_in + + sizeof(struct fuse_in_header) + + sizeof(struct fuse_read_in) + + sizeof(struct fuse_read_out), + res - sizeof(struct fuse_in_header) - + sizeof(struct fuse_read_in) - + sizeof(struct fuse_read_out)); + res = read(fuse_dev, bytes_in, sizeof(bytes_in)); + TESTEQUAL(res, 
sizeof(*in_header) + + sizeof(struct fuse_read_in) + + sizeof(struct fuse_read_out)); + TESTEQUAL(in_header->opcode, FUSE_READDIR | FUSE_POSTFILTER); + TESTFUSEDIROUTREAD(read_out, bytes_in, 0); + exit(TEST_SUCCESS); + } + FUSE_END_DAEMON(); + closedir(dir); + close(fuse_dev); + umount(mount_dir); + close(src_fd); + close(bpf_fd); + return result; +} + +static int bpf_test_redact_readdir(const char *mount_dir) +{ + static const char * const names[] = { + "f1", "f2", "f3", "f4", "f5", "f6", ".", ".." + }; + bool used[ARRAY_SIZE(names)] = { false }; + int num_shown = (ARRAY_SIZE(names) - 2) / 2 + 2; + int result = TEST_FAILURE; + int bpf_fd = -1; + int src_fd = -1; + int fuse_dev = -1; + DIR *dir = NULL; + struct dirent *dirent; + int i; + int count = 0; + FUSE_DECLARE_DAEMON; + + TEST(src_fd = open(ft_src, O_DIRECTORY | O_RDONLY | O_CLOEXEC), + src_fd != -1); + for (i = 0; i < ARRAY_SIZE(names) - 2; i++) + TESTEQUAL(create_file(src_fd, s(names[i]), 1, 2), 0); + + TESTEQUAL(install_elf_bpf("test_bpf.bpf", "test_readdir_redact", + &bpf_fd, NULL, NULL), 0); + TESTEQUAL(mount_fuse(mount_dir, bpf_fd, src_fd, &fuse_dev), 0); + + FUSE_START_DAEMON(); + if (action) { + int j; + + TEST(dir = s_opendir(s(mount_dir)), dir); + while ((dirent = readdir(dir))) { + errno = 0; + TESTEQUAL(errno, 0); + + for (j = 0; j < ARRAY_SIZE(names); ++j) + if (!used[j] && + strcmp(names[j], dirent->d_name) == 0) { + used[j] = true; + count++; + break; + } + TESTNE(j, ARRAY_SIZE(names)); + TESTGE(num_shown, count); + } + TESTEQUAL(count, num_shown); + TESTSYSCALL(closedir(dir)); + dir = NULL; + } else { + bool skip = true; + + for (int i = 0; i < ARRAY_SIZE(names) + 1; i++) { + uint8_t bytes_in[FUSE_MIN_READ_BUFFER]; + uint8_t bytes_out[FUSE_MIN_READ_BUFFER]; + struct fuse_in_header *in_header = + (struct fuse_in_header *)bytes_in; + ssize_t res = read(fuse_dev, bytes_in, sizeof(bytes_in)); + int length_out = 0; + uint8_t *pos; + uint8_t *dirs_in; + uint8_t *dirs_out; + struct fuse_read_in *fuse_read_in; + struct fuse_read_out *fuse_read_out_in; + struct fuse_read_out *fuse_read_out_out; + struct fuse_dirent *fuse_dirent_in = NULL; + struct fuse_dirent *next = NULL; + bool again = false; + int dir_ent_len = 0; + + TESTGE(res, sizeof(struct fuse_in_header) + + sizeof(struct fuse_read_in) + + sizeof(struct fuse_read_out)); + + pos = bytes_in + sizeof(struct fuse_in_header); + fuse_read_in = (struct fuse_read_in *) pos; + pos += sizeof(*fuse_read_in); + fuse_read_out_in = (struct fuse_read_out *) pos; + pos += sizeof(*fuse_read_out_in); + dirs_in = pos; + + pos = bytes_out + sizeof(struct fuse_out_header); + fuse_read_out_out = (struct fuse_read_out *) pos; + pos += sizeof(*fuse_read_out_out); + dirs_out = pos; + + if (dirs_in < bytes_in + res) { + bool is_dot; + + fuse_dirent_in = (struct fuse_dirent *) dirs_in; + is_dot = (fuse_dirent_in->namelen == 1 && + !strncmp(fuse_dirent_in->name, ".", 1)) || + (fuse_dirent_in->namelen == 2 && + !strncmp(fuse_dirent_in->name, "..", 2)); + + dir_ent_len = FUSE_DIRENT_ALIGN( + sizeof(*fuse_dirent_in) + + fuse_dirent_in->namelen); + + if (dirs_in + dir_ent_len < bytes_in + res) + next = (struct fuse_dirent *) + (dirs_in + dir_ent_len); + + if (!skip || is_dot) { + memcpy(dirs_out, fuse_dirent_in, + sizeof(struct fuse_dirent) + + fuse_dirent_in->namelen); + length_out += dir_ent_len; + } + again = ((skip && !is_dot) && next); + + if (!is_dot) + skip = !skip; + } + + fuse_read_out_out->offset = next ? 
next->off : + fuse_read_out_in->offset; + fuse_read_out_out->again = again; + + { + struct fuse_out_header *out_header = + (struct fuse_out_header *)bytes_out; + + *out_header = (struct fuse_out_header) { + .len = sizeof(*out_header) + + sizeof(*fuse_read_out_out) + length_out, + .unique = in_header->unique, + }; + TESTEQUAL(write(fuse_dev, bytes_out, out_header->len), + out_header->len); + } + } + exit(TEST_SUCCESS); + } + FUSE_END_DAEMON(); + closedir(dir); + close(fuse_dev); + umount(mount_dir); + close(src_fd); + close(bpf_fd); + return result; +} + +/* + * This test is more to show what classic fuse does with a creat in a subdir + * than a test of any new functionality + */ +static int bpf_test_creat(const char *mount_dir) +{ + const char *dir_name = "show"; + const char *file_name = "file"; + int result = TEST_FAILURE; + int fuse_dev = -1; + int fd = -1; + FUSE_DECLARE_DAEMON; + + TESTEQUAL(mount_fuse(mount_dir, -1, -1, &fuse_dev), 0); + + FUSE_START_DAEMON(); + if (action) { + TEST(fd = s_creat(s_path(s_path(s(mount_dir), s(dir_name)), + s(file_name)), + 0777), + fd != -1); + TESTSYSCALL(close(fd)); + } else { + DECL_FUSE_IN(create); + DECL_FUSE_IN(release); + DECL_FUSE_IN(flush); + + TESTFUSELOOKUP(dir_name, 0); + TESTFUSEOUT1(fuse_entry_out, ((struct fuse_entry_out) { + .nodeid = 3, + .generation = 1, + .attr.ino = 100, + .attr.size = 4, + .attr.blksize = 512, + .attr.mode = S_IFDIR | 0777, + })); + + TESTFUSELOOKUP(file_name, 0); + TESTFUSEOUTERROR(-ENOENT); + + TESTFUSEINEXT(FUSE_CREATE, create_in, strlen(file_name) + 1); + TESTFUSEOUT2(fuse_entry_out, ((struct fuse_entry_out) { + .nodeid = 2, + .generation = 1, + .attr.ino = 200, + .attr.size = 4, + .attr.blksize = 512, + .attr.mode = S_IFREG, + }), + fuse_open_out, ((struct fuse_open_out) { + .fh = 1, + .open_flags = create_in->flags, + })); + + //TESTFUSEINNULL(FUSE_CANONICAL_PATH); + //TESTFUSEOUTREAD("ignored", 7); + + TESTFUSEIN(FUSE_FLUSH, flush_in); + TESTFUSEOUTEMPTY(); + + TESTFUSEIN(FUSE_RELEASE, release_in); + TESTFUSEOUTEMPTY(); + exit(TEST_SUCCESS); + } + FUSE_END_DAEMON(); + close(fuse_dev); + umount(mount_dir); + return result; +} + +static int bpf_test_hidden_entries(const char *mount_dir) +{ + static const char * const dir_names[] = { + "show", + "hide", + }; + const char *file_name = "file"; + const char *data = "The quick brown fox jumps over the lazy dog\n"; + int result = TEST_FAILURE; + int src_fd = -1; + int bpf_fd = -1; + int fuse_dev = -1; + int fd = -1; + + TEST(src_fd = open(ft_src, O_DIRECTORY | O_RDONLY | O_CLOEXEC), + src_fd != -1); + TESTSYSCALL(mkdirat(src_fd, dir_names[0], 0777)); + TESTSYSCALL(mkdirat(src_fd, dir_names[1], 0777)); + TESTEQUAL(install_elf_bpf("test_bpf.bpf", "test_hidden", + &bpf_fd, NULL, NULL), 0); + TESTEQUAL(mount_fuse(mount_dir, bpf_fd, src_fd, &fuse_dev), 0); + + TEST(fd = s_creat(s_path(s_path(s(mount_dir), s(dir_names[0])), + s(file_name)), + 0777), + fd != -1); + TESTSYSCALL(fallocate(fd, 0, 0, 4096)); + TEST(write(fd, data, strlen(data)), strlen(data)); + TESTSYSCALL(close(fd)); + TESTEQUAL(bpf_test_trace("Create"), 0); + + result = TEST_SUCCESS; +out: + close(fuse_dev); + umount(mount_dir); + close(bpf_fd); + close(src_fd); + return result; +} + +static int bpf_test_dir(const char *mount_dir) +{ + const char *dir_name = "dir"; + int result = TEST_FAILURE; + int src_fd = -1; + int bpf_fd = -1; + int fuse_dev = -1; + struct stat st; + + TEST(src_fd = open(ft_src, O_DIRECTORY | O_RDONLY | O_CLOEXEC), + src_fd != -1); + TESTEQUAL(install_elf_bpf("test_bpf.bpf", 
"test_trace", + &bpf_fd, NULL, NULL), 0); + TESTEQUAL(mount_fuse(mount_dir, bpf_fd, src_fd, &fuse_dev), 0); + + TESTSYSCALL(s_mkdir(s_path(s(mount_dir), s(dir_name)), 0777)); + TESTEQUAL(bpf_test_trace("mkdir"), 0); + TESTSYSCALL(s_stat(s_path(s(ft_src), s(dir_name)), &st)); + TESTSYSCALL(s_rmdir(s_path(s(mount_dir), s(dir_name)))); + TESTEQUAL(s_stat(s_path(s(ft_src), s(dir_name)), &st), -1); + TESTEQUAL(errno, ENOENT); + result = TEST_SUCCESS; +out: + close(fuse_dev); + umount(mount_dir); + close(bpf_fd); + close(src_fd); + return result; +} + +static int bpf_test_file(const char *mount_dir, bool close_first) +{ + const char *file_name = "real"; + int result = TEST_FAILURE; + int src_fd = -1; + int bpf_fd = -1; + int fuse_dev = -1; + int fd = -1; + struct stat st; + + TEST(src_fd = open(ft_src, O_DIRECTORY | O_RDONLY | O_CLOEXEC), + src_fd != -1); + TESTEQUAL(install_elf_bpf("test_bpf.bpf", "test_trace", + &bpf_fd, NULL, NULL), 0); + TESTEQUAL(mount_fuse(mount_dir, bpf_fd, src_fd, &fuse_dev), 0); + + TEST(fd = s_creat(s_path(s(mount_dir), s(file_name)), + 0777), + fd != -1); + TESTEQUAL(bpf_test_trace("Create"), 0); + if (close_first) { + TESTSYSCALL(close(fd)); + fd = -1; + } + TESTSYSCALL(s_stat(s_path(s(ft_src), s(file_name)), &st)); + TESTSYSCALL(s_unlink(s_path(s(mount_dir), s(file_name)))); + TESTEQUAL(bpf_test_trace("unlink"), 0); + TESTEQUAL(s_stat(s_path(s(ft_src), s(file_name)), &st), -1); + TESTEQUAL(errno, ENOENT); + if (!close_first) { + TESTSYSCALL(close(fd)); + fd = -1; + } + result = TEST_SUCCESS; +out: + close(fd); + close(fuse_dev); + umount(mount_dir); + close(bpf_fd); + close(src_fd); + return result; +} + +static int bpf_test_file_early_close(const char *mount_dir) +{ + return bpf_test_file(mount_dir, true); +} + +static int bpf_test_file_late_close(const char *mount_dir) +{ + return bpf_test_file(mount_dir, false); +} + +static int bpf_test_alter_errcode_bpf(const char *mount_dir) +{ + const char *dir_name = "dir"; + int result = TEST_FAILURE; + int src_fd = -1; + int bpf_fd = -1; + int fuse_dev = -1; + struct stat st; + + TEST(src_fd = open(ft_src, O_DIRECTORY | O_RDONLY | O_CLOEXEC), + src_fd != -1); + TESTEQUAL(install_elf_bpf("test_bpf.bpf", "test_error", + &bpf_fd, NULL, NULL), 0); + TESTEQUAL(mount_fuse(mount_dir, bpf_fd, src_fd, &fuse_dev), 0); + + TESTSYSCALL(s_mkdir(s_path(s(mount_dir), s(dir_name)), 0777)); + //TESTEQUAL(bpf_test_trace("mkdir"), 0); + TESTSYSCALL(s_stat(s_path(s(ft_src), s(dir_name)), &st)); + TESTEQUAL(s_mkdir(s_path(s(mount_dir), s(dir_name)), 0777), -EPERM); + TESTSYSCALL(s_rmdir(s_path(s(mount_dir), s(dir_name)))); + TESTEQUAL(s_stat(s_path(s(ft_src), s(dir_name)), &st), -1); + TESTEQUAL(errno, ENOENT); + result = TEST_SUCCESS; +out: + close(fuse_dev); + umount(mount_dir); + close(bpf_fd); + close(src_fd); + return result; +} + +static int bpf_test_alter_errcode_userspace(const char *mount_dir) +{ + const char *dir_name = "doesnotexist"; + int result = TEST_FAILURE; + int src_fd = -1; + int bpf_fd = -1; + int fuse_dev = -1; + FUSE_DECLARE_DAEMON; + + TEST(src_fd = open(ft_src, O_DIRECTORY | O_RDONLY | O_CLOEXEC), + src_fd != -1); + TESTEQUAL(install_elf_bpf("test_bpf.bpf", "test_error", + &bpf_fd, NULL, NULL), 0); + TESTEQUAL(mount_fuse(mount_dir, bpf_fd, src_fd, &fuse_dev), 0); + + FUSE_START_DAEMON(); + if (action) { + TESTEQUAL(s_unlink(s_path(s(mount_dir), s(dir_name))), + -1); + TESTEQUAL(errno, ENOMEM); + } else { + TESTFUSELOOKUP("doesnotexist", FUSE_POSTFILTER); + TESTFUSEOUTERROR(-ENOMEM); + exit(TEST_SUCCESS); + } + 
FUSE_END_DAEMON(); + close(fuse_dev); + umount(mount_dir); + close(bpf_fd); + close(src_fd); + return result; +} + +static int bpf_test_mknod(const char *mount_dir) +{ + const char *file_name = "real"; + int result = TEST_FAILURE; + int src_fd = -1; + int bpf_fd = -1; + int fuse_dev = -1; + struct stat st; + + TEST(src_fd = open(ft_src, O_DIRECTORY | O_RDONLY | O_CLOEXEC), + src_fd != -1); + TESTEQUAL(install_elf_bpf("test_bpf.bpf", "test_trace", + &bpf_fd, NULL, NULL), 0); + TESTEQUAL(mount_fuse(mount_dir, bpf_fd, src_fd, &fuse_dev), 0); + + TESTSYSCALL(s_mkfifo(s_path(s(mount_dir), s(file_name)), 0777)); + TESTEQUAL(bpf_test_trace("mknod"), 0); + TESTSYSCALL(s_stat(s_path(s(ft_src), s(file_name)), &st)); + TESTSYSCALL(s_unlink(s_path(s(mount_dir), s(file_name)))); + TESTEQUAL(bpf_test_trace("unlink"), 0); + TESTEQUAL(s_stat(s_path(s(ft_src), s(file_name)), &st), -1); + TESTEQUAL(errno, ENOENT); + result = TEST_SUCCESS; +out: + close(fuse_dev); + umount(mount_dir); + close(bpf_fd); + close(src_fd); + return result; +} + +static int bpf_test_largedir(const char *mount_dir) +{ + const char *show = "show"; + const int files = 1000; + + int result = TEST_FAILURE; + int src_fd = -1; + int bpf_fd = -1; + int fuse_dev = -1; + struct map_relocation *map_relocations = NULL; + size_t map_count = 0; + FUSE_DECLARE_DAEMON; + + TEST(src_fd = open(ft_src, O_DIRECTORY | O_RDONLY | O_CLOEXEC), + src_fd != -1); + TESTEQUAL(install_elf_bpf("fd_bpf.bpf", "test_daemon", + &bpf_fd, &map_relocations, &map_count), 0); + TESTEQUAL(mount_fuse(mount_dir, bpf_fd, src_fd, &fuse_dev), 0); + + FUSE_START_DAEMON(); + if (action) { + int i; + int fd; + DIR *dir = NULL; + struct dirent *dirent; + + TESTSYSCALL(s_mkdir(s_path(s(mount_dir), s(show)), 0777)); + for (i = 0; i < files; ++i) { + char filename[NAME_MAX]; + + sprintf(filename, "%d", i); + TEST(fd = s_creat(s_path(s_path(s(mount_dir), s(show)), + s(filename)), 0777), fd != -1); + TESTSYSCALL(close(fd)); + } + + TEST(dir = s_opendir(s_path(s(mount_dir), s(show))), dir); + for (dirent = readdir(dir); dirent; dirent = readdir(dir)) + ; + closedir(dir); + } else { + int i; + + for (i = 0; i < files + 2; ++i) { + TESTFUSELOOKUP(show, FUSE_PREFILTER); + TESTFUSEOUTREAD(show, 5); + } + exit(TEST_SUCCESS); + } + FUSE_END_DAEMON(); + close(fuse_dev); + umount(mount_dir); + close(bpf_fd); + close(src_fd); + return result; +} + +static int bpf_test_link(const char *mount_dir) +{ + const char *file_name = "real"; + const char *link_name = "partial"; + int result = TEST_FAILURE; + int fd = -1; + int src_fd = -1; + int bpf_fd = -1; + int fuse_dev = -1; + struct stat st; + + TEST(src_fd = open(ft_src, O_DIRECTORY | O_RDONLY | O_CLOEXEC), + src_fd != -1); + TESTEQUAL(install_elf_bpf("test_bpf.bpf", "test_trace", &bpf_fd, NULL, + NULL), + 0); + TESTEQUAL(mount_fuse(mount_dir, bpf_fd, src_fd, &fuse_dev), 0); + + TEST(fd = s_creat(s_path(s(mount_dir), s(file_name)), 0777), fd != -1); + TESTEQUAL(bpf_test_trace("Create"), 0); + TESTSYSCALL(s_stat(s_path(s(ft_src), s(file_name)), &st)); + + TESTSYSCALL(s_link(s_path(s(mount_dir), s(file_name)), + s_path(s(mount_dir), s(link_name)))); + + TESTEQUAL(bpf_test_trace("link"), 0); + TESTSYSCALL(s_stat(s_path(s(ft_src), s(link_name)), &st)); + + TESTSYSCALL(s_unlink(s_path(s(mount_dir), s(link_name)))); + TESTEQUAL(bpf_test_trace("unlink"), 0); + TESTEQUAL(s_stat(s_path(s(ft_src), s(link_name)), &st), -1); + TESTEQUAL(errno, ENOENT); + + TESTSYSCALL(s_unlink(s_path(s(mount_dir), s(file_name)))); + TESTEQUAL(bpf_test_trace("unlink"), 0); + 
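+	/* Unlinking through the mount must also remove the file from the backing ft-src directory. */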
TESTEQUAL(s_stat(s_path(s(ft_src), s(file_name)), &st), -1); + TESTEQUAL(errno, ENOENT); + + result = TEST_SUCCESS; +out: + close(fd); + close(fuse_dev); + umount(mount_dir); + close(bpf_fd); + close(src_fd); + return result; +} + +static int bpf_test_symlink(const char *mount_dir) +{ + const char *test_name = "real"; + const char *symlink_name = "partial"; + const char *test_data = "Weebles wobble but they don't fall down"; + int result = TEST_FAILURE; + int bpf_fd = -1; + int src_fd = -1; + int fuse_dev = -1; + int fd = -1; + char read_buffer[256] = {}; + ssize_t bytes_read; + + TEST(src_fd = open(ft_src, O_DIRECTORY | O_RDONLY | O_CLOEXEC), + src_fd != -1); + TEST(fd = openat(src_fd, test_name, O_CREAT | O_RDWR | O_CLOEXEC, 0777), + fd != -1); + TESTEQUAL(write(fd, test_data, strlen(test_data)), strlen(test_data)); + TESTSYSCALL(close(fd)); + fd = -1; + + TESTEQUAL(install_elf_bpf("test_bpf.bpf", "test_trace", + &bpf_fd, NULL, NULL), 0); + TESTEQUAL(mount_fuse(mount_dir, bpf_fd, src_fd, &fuse_dev), 0); + + TESTSYSCALL(s_symlink(s_path(s(mount_dir), s(test_name)), + s_path(s(mount_dir), s(symlink_name)))); + TESTEQUAL(bpf_test_trace("symlink"), 0); + + TESTERR(fd = s_open(s_path(s(mount_dir), s(symlink_name)), O_RDONLY | O_CLOEXEC), fd != -1); + bytes_read = read(fd, read_buffer, strlen(test_data)); + TESTEQUAL(bpf_test_trace("readlink"), 0); + TESTEQUAL(bytes_read, strlen(test_data)); + TESTEQUAL(strcmp(test_data, read_buffer), 0); + + result = TEST_SUCCESS; +out: + close(fuse_dev); + close(fd); + umount(mount_dir); + close(src_fd); + close(bpf_fd); + return result; +} + +static int bpf_test_xattr(const char *mount_dir) +{ + static const char file_name[] = "real"; + static const char xattr_name[] = "user.xattr_test_name"; + static const char xattr_value[] = "this_is_a_test"; + const size_t xattr_size = sizeof(xattr_value); + char xattr_value_ret[256]; + ssize_t xattr_size_ret; + int result = TEST_FAILURE; + int fd = -1; + int src_fd = -1; + int bpf_fd = -1; + int fuse_dev = -1; + struct stat st; + + memset(xattr_value_ret, '\0', sizeof(xattr_value_ret)); + + TEST(src_fd = open(ft_src, O_DIRECTORY | O_RDONLY | O_CLOEXEC), + src_fd != -1); + TESTEQUAL(install_elf_bpf("test_bpf.bpf", "test_trace", &bpf_fd, NULL, + NULL), + 0); + TESTEQUAL(mount_fuse(mount_dir, bpf_fd, src_fd, &fuse_dev), 0); + + TEST(fd = s_creat(s_path(s(mount_dir), s(file_name)), 0777), fd != -1); + TESTEQUAL(bpf_test_trace("Create"), 0); + TESTSYSCALL(close(fd)); + + TESTSYSCALL(s_stat(s_path(s(ft_src), s(file_name)), &st)); + TEST(result = s_getxattr(s_path(s(mount_dir), s(file_name)), xattr_name, + xattr_value_ret, sizeof(xattr_value_ret), + &xattr_size_ret), + result == -1); + TESTEQUAL(errno, ENODATA); + TESTEQUAL(bpf_test_trace("getxattr"), 0); + + TESTSYSCALL(s_listxattr(s_path(s(mount_dir), s(file_name)), + xattr_value_ret, sizeof(xattr_value_ret), + &xattr_size_ret)); + TESTEQUAL(bpf_test_trace("listxattr"), 0); + TESTEQUAL(xattr_size_ret, 0); + + TESTSYSCALL(s_setxattr(s_path(s(mount_dir), s(file_name)), xattr_name, + xattr_value, xattr_size, 0)); + TESTEQUAL(bpf_test_trace("setxattr"), 0); + + TESTSYSCALL(s_listxattr(s_path(s(mount_dir), s(file_name)), + xattr_value_ret, sizeof(xattr_value_ret), + &xattr_size_ret)); + TESTEQUAL(bpf_test_trace("listxattr"), 0); + TESTEQUAL(xattr_size_ret, sizeof(xattr_name)); + TESTEQUAL(strcmp(xattr_name, xattr_value_ret), 0); + + TESTSYSCALL(s_getxattr(s_path(s(mount_dir), s(file_name)), xattr_name, + xattr_value_ret, sizeof(xattr_value_ret), + &xattr_size_ret)); + 
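+	/* the value read back through the mount must match what setxattr stored */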
TESTEQUAL(bpf_test_trace("getxattr"), 0); + TESTEQUAL(xattr_size, xattr_size_ret); + TESTEQUAL(strcmp(xattr_value, xattr_value_ret), 0); + + TESTSYSCALL(s_removexattr(s_path(s(mount_dir), s(file_name)), xattr_name)); + TESTEQUAL(bpf_test_trace("removexattr"), 0); + + TESTEQUAL(s_getxattr(s_path(s(mount_dir), s(file_name)), xattr_name, + xattr_value_ret, sizeof(xattr_value_ret), + &xattr_size_ret), -1); + TESTEQUAL(errno, ENODATA); + + TESTSYSCALL(s_unlink(s_path(s(mount_dir), s(file_name)))); + TESTEQUAL(bpf_test_trace("unlink"), 0); + TESTEQUAL(s_stat(s_path(s(ft_src), s(file_name)), &st), -1); + TESTEQUAL(errno, ENOENT); + + result = TEST_SUCCESS; +out: + close(fuse_dev); + umount(mount_dir); + close(bpf_fd); + close(src_fd); + return result; +} + +static int bpf_test_set_backing(const char *mount_dir) +{ + const char *backing_name = "backing"; + const char *test_data = "data"; + const char *test_name = "test"; + + int result = TEST_FAILURE; + int fuse_dev = -1; + int fd = -1; + FUSE_DECLARE_DAEMON; + + TESTEQUAL(mount_fuse_no_init(mount_dir, -1, -1, &fuse_dev), 0); + FUSE_START_DAEMON(); + if (action) { + char data[256] = {0}; + + TESTERR(fd = s_open(s_path(s(mount_dir), s(test_name)), + O_RDONLY | O_CLOEXEC), fd != -1); + TESTEQUAL(read(fd, data, strlen(test_data)), strlen(test_data)); + TESTCOND(!strcmp(data, test_data)); + TESTSYSCALL(close(fd)); + fd = -1; + TESTSYSCALL(umount(mount_dir)); + } else { + int bpf_fd = -1; + int backing_fd = -1; + + TESTERR(backing_fd = s_creat(s_path(s(ft_src), s(backing_name)), 0777), + backing_fd != -1); + TESTEQUAL(write(backing_fd, test_data, strlen(test_data)), + strlen(test_data)); + TESTEQUAL(install_elf_bpf("test_bpf.bpf", "test_simple", + &bpf_fd, NULL, NULL), 0); + + TESTFUSEINIT(); + TESTFUSELOOKUP(test_name, 0); + TESTFUSEOUT2(fuse_entry_out, ((struct fuse_entry_out) {0}), + fuse_entry_bpf_out, ((struct fuse_entry_bpf_out) { + .backing_action = FUSE_ACTION_REPLACE, + .backing_fd = backing_fd, + .bpf_action = FUSE_ACTION_REPLACE, + .bpf_fd = bpf_fd, + })); + read(fuse_dev, bytes_in, sizeof(bytes_in)); + TESTSYSCALL(close(bpf_fd)); + TESTSYSCALL(close(backing_fd)); + exit(TEST_SUCCESS); + } + FUSE_END_DAEMON(); + close(fuse_dev); + close(fd); + umount(mount_dir); + return result; +} + +static int bpf_test_remove_backing(const char *mount_dir) +{ + const char *folder1 = "folder1"; + const char *folder2 = "folder2"; + const char *file = "file1"; + const char *contents1 = "contents1"; + const char *contents2 = "contents2"; + + int result = TEST_FAILURE; + int fuse_dev = -1; + int fd = -1; + int src_fd = -1; + int bpf_fd = -1; + char data[256] = {0}; + FUSE_DECLARE_DAEMON; + + /* + * Create folder1/file + * folder2/file + * + * test will install bpf into mount. + * bpf will postfilter root lookup to daemon. + * daemon will remove bpf and redirect opens on folder1 to folder2. + * test will open folder1/file which will be redirected to folder2. + * test will check no traces for file, and contents are folder2/file. 
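+	 * (the daemon does this by answering the lookup postfilter with
+	 *  bpf_action FUSE_ACTION_REMOVE and a backing fd open on folder2)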
+ */ + TESTEQUAL(bpf_clear_trace(), 0); + TESTSYSCALL(s_mkdir(s_path(s(ft_src), s(folder1)), 0777)); + TEST(fd = s_creat(s_pathn(3, s(ft_src), s(folder1), s(file)), 0777), + fd != -1); + TESTEQUAL(write(fd, contents1, strlen(contents1)), strlen(contents1)); + TESTSYSCALL(close(fd)); + TESTSYSCALL(s_mkdir(s_path(s(ft_src), s(folder2)), 0777)); + TEST(fd = s_creat(s_pathn(3, s(ft_src), s(folder2), s(file)), 0777), + fd != -1); + TESTEQUAL(write(fd, contents2, strlen(contents2)), strlen(contents2)); + TESTSYSCALL(close(fd)); + + TEST(src_fd = open(ft_src, O_DIRECTORY | O_RDONLY | O_CLOEXEC), + src_fd != -1); + TESTEQUAL(install_elf_bpf("test_bpf.bpf", "test_passthrough", &bpf_fd, + NULL, NULL), 0); + TESTEQUAL(mount_fuse_no_init(mount_dir, bpf_fd, src_fd, &fuse_dev), 0); + + FUSE_START_DAEMON(); + if (action) { + TESTERR(fd = s_open(s_pathn(3, s(mount_dir), s(folder1), + s(file)), + O_RDONLY | O_CLOEXEC), fd != -1); + TESTEQUAL(read(fd, data, sizeof(data)), strlen(contents2)); + TESTCOND(!strcmp(data, contents2)); + TESTEQUAL(bpf_test_no_trace("file"), 0); + TESTSYSCALL(close(fd)); + fd = -1; + TESTSYSCALL(umount(mount_dir)); + } else { + struct { + char name[8]; + struct fuse_entry_out feo; + struct fuse_entry_bpf_out febo; + } __packed in; + int backing_fd = -1; + + TESTFUSEINIT(); + TESTFUSEIN(FUSE_LOOKUP | FUSE_POSTFILTER, &in); + TEST(backing_fd = s_open(s_path(s(ft_src), s(folder2)), + O_DIRECTORY | O_RDONLY | O_CLOEXEC), + backing_fd != -1); + TESTFUSEOUT2(fuse_entry_out, ((struct fuse_entry_out) {0}), + fuse_entry_bpf_out, ((struct fuse_entry_bpf_out) { + .bpf_action = FUSE_ACTION_REMOVE, + .backing_action = FUSE_ACTION_REPLACE, + .backing_fd = backing_fd, + })); + + while (read(fuse_dev, bytes_in, sizeof(bytes_in)) != -1) + ; + TESTSYSCALL(close(backing_fd)); + exit(TEST_SUCCESS); + } + FUSE_END_DAEMON(); + close(fuse_dev); + close(fd); + close(src_fd); + close(bpf_fd); + umount(mount_dir); + return result; +} + +static int bpf_test_dir_rename(const char *mount_dir) +{ + const char *dir_name = "dir"; + const char *dir_name2 = "dir2"; + int result = TEST_FAILURE; + int src_fd = -1; + int bpf_fd = -1; + int fuse_dev = -1; + struct stat st; + + TEST(src_fd = open(ft_src, O_DIRECTORY | O_RDONLY | O_CLOEXEC), + src_fd != -1); + TESTEQUAL(install_elf_bpf("test_bpf.bpf", "test_trace", + &bpf_fd, NULL, NULL), 0); + TESTEQUAL(mount_fuse(mount_dir, bpf_fd, src_fd, &fuse_dev), 0); + + TESTSYSCALL(s_mkdir(s_path(s(mount_dir), s(dir_name)), 0777)); + TESTEQUAL(bpf_test_trace("mkdir"), 0); + TESTSYSCALL(s_stat(s_path(s(ft_src), s(dir_name)), &st)); + TESTSYSCALL(s_rename(s_path(s(mount_dir), s(dir_name)), + s_path(s(mount_dir), s(dir_name2)))); + TESTEQUAL(s_stat(s_path(s(ft_src), s(dir_name)), &st), -1); + TESTEQUAL(errno, ENOENT); + TESTSYSCALL(s_stat(s_path(s(ft_src), s(dir_name2)), &st)); + result = TEST_SUCCESS; +out: + close(fuse_dev); + umount(mount_dir); + close(bpf_fd); + close(src_fd); + return result; +} + +static int bpf_test_file_rename(const char *mount_dir) +{ + const char *dir = "dir"; + const char *file1 = "file1"; + const char *file2 = "file2"; + int result = TEST_FAILURE; + int src_fd = -1; + int bpf_fd = -1; + int fuse_dev = -1; + int fd = -1; + + TEST(src_fd = open(ft_src, O_DIRECTORY | O_RDONLY | O_CLOEXEC), + src_fd != -1); + TESTEQUAL(install_elf_bpf("test_bpf.bpf", "test_trace", + &bpf_fd, NULL, NULL), 0); + TESTEQUAL(mount_fuse(mount_dir, bpf_fd, src_fd, &fuse_dev), 0); + + TESTSYSCALL(s_mkdir(s_path(s(mount_dir), s(dir)), 0777)); + TEST(fd = s_creat(s_pathn(3, 
s(mount_dir), s(dir), s(file1)), 0777), + fd != -1); + TESTSYSCALL(s_rename(s_pathn(3, s(mount_dir), s(dir), s(file1)), + s_pathn(3, s(mount_dir), s(dir), s(file2)))); + result = TEST_SUCCESS; +out: + close(fd); + umount(mount_dir); + close(fuse_dev); + close(bpf_fd); + close(src_fd); + return result; +} + +static int mmap_test(const char *mount_dir) +{ + const char *file = "file"; + int result = TEST_FAILURE; + int src_fd = -1; + int fuse_dev = -1; + int fd = -1; + char *addr = NULL; + + TEST(src_fd = open(ft_src, O_DIRECTORY | O_RDONLY | O_CLOEXEC), + src_fd != -1); + TESTEQUAL(mount_fuse(mount_dir, -1, src_fd, &fuse_dev), 0); + TEST(fd = s_open(s_path(s(mount_dir), s(file)), + O_CREAT | O_RDWR | O_CLOEXEC, 0777), + fd != -1); + TESTSYSCALL(fallocate(fd, 0, 4096, SEEK_CUR)); + TEST(addr = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0), + addr != (void *) -1); + memset(addr, 'a', 4096); + + result = TEST_SUCCESS; +out: + munmap(addr, 4096); + close(fd); + umount(mount_dir); + close(fuse_dev); + close(src_fd); + return result; +} + +static int readdir_perms_test(const char *mount_dir) +{ + int result = TEST_FAILURE; + struct __user_cap_header_struct uchs = { _LINUX_CAPABILITY_VERSION_3 }; + struct __user_cap_data_struct ucds[2]; + int src_fd = -1; + int fuse_dev = -1; + DIR *dir = NULL; + + /* Must remove capabilities for this test. */ + TESTSYSCALL(syscall(SYS_capget, &uchs, ucds)); + ucds[0].effective &= ~(1 << CAP_DAC_OVERRIDE | 1 << CAP_DAC_READ_SEARCH); + TESTSYSCALL(syscall(SYS_capset, &uchs, ucds)); + + /* This is what we are testing in fuseland. First test without fuse, */ + TESTSYSCALL(mkdir("test", 0111)); + TEST(dir = opendir("test"), dir == NULL); + closedir(dir); + dir = NULL; + + TEST(src_fd = open(ft_src, O_DIRECTORY | O_RDONLY | O_CLOEXEC), + src_fd != -1); + TESTEQUAL(mount_fuse(mount_dir, -1, src_fd, &fuse_dev), 0); + + TESTSYSCALL(s_mkdir(s_path(s(mount_dir), s("test")), 0111)); + TEST(dir = s_opendir(s_path(s(mount_dir), s("test"))), dir == NULL); + + result = TEST_SUCCESS; +out: + ucds[0].effective |= 1 << CAP_DAC_OVERRIDE | 1 << CAP_DAC_READ_SEARCH; + syscall(SYS_capset, &uchs, ucds); + + closedir(dir); + s_rmdir(s_path(s(mount_dir), s("test"))); + umount(mount_dir); + close(fuse_dev); + close(src_fd); + rmdir("test"); + return result; +} + +static int inotify_test(const char *mount_dir) +{ + int result = TEST_FAILURE; + int src_fd = -1; + int fuse_dev = -1; + struct s dir; + int inotify_fd = -1; + int watch; + int fd = -1; + char buffer[sizeof(struct inotify_event) + NAME_MAX + 1]; + + TEST(src_fd = open(ft_src, O_DIRECTORY | O_RDONLY | O_CLOEXEC), + src_fd != -1); + TESTEQUAL(mount_fuse(mount_dir, -1, src_fd, &fuse_dev), 0); + + TEST(inotify_fd = inotify_init1(IN_CLOEXEC), inotify_fd != -1); + dir = s_path(s(mount_dir), s("dir")); + TESTSYSCALL(mkdir(dir.s, 0777)); + TEST(watch = inotify_add_watch(inotify_fd, dir.s, IN_CREATE), watch); + TEST(fd = s_creat(s_path(s(ft_src), s("dir/file")), 0777), fd != -1); + // buffer will be two struct lengths, as "file" gets rounded up to the + // next multiple of struct inotify_event + TESTEQUAL(read(inotify_fd, &buffer, sizeof(buffer)), + sizeof(struct inotify_event) * 2); + + result = TEST_SUCCESS; +out: + close(fd); + s_unlink(s_path(s(ft_src), s("dir/file"))); + close(inotify_fd); + rmdir(dir.s); + free(dir.s); + umount(mount_dir); + close(fuse_dev); + close(src_fd); + return result; +} + +static int bpf_test_statfs(const char *mount_dir) +{ + int result = TEST_FAILURE; + int src_fd = -1; + int bpf_fd = -1; + int 
fuse_dev = -1; + int fd = -1; + struct statfs st; + + TEST(src_fd = open(ft_src, O_DIRECTORY | O_RDONLY | O_CLOEXEC), + src_fd != -1); + TESTEQUAL(install_elf_bpf("test_bpf.bpf", "test_trace", + &bpf_fd, NULL, NULL), 0); + TESTEQUAL(mount_fuse(mount_dir, bpf_fd, src_fd, &fuse_dev), 0); + + TESTSYSCALL(s_statfs(s(mount_dir), &st)); + TESTEQUAL(bpf_test_trace("statfs"), 0); + TESTEQUAL(st.f_type, 0x65735546); + result = TEST_SUCCESS; +out: + close(fd); + umount(mount_dir); + close(fuse_dev); + close(bpf_fd); + close(src_fd); + return result; +} + +static int bpf_test_lseek(const char *mount_dir) +{ + const char *file = "real"; + const char *test_data = "data"; + int result = TEST_FAILURE; + int src_fd = -1; + int bpf_fd = -1; + int fuse_dev = -1; + int fd = -1; + + TEST(src_fd = open(ft_src, O_DIRECTORY | O_RDONLY | O_CLOEXEC), + src_fd != -1); + TEST(fd = openat(src_fd, file, O_CREAT | O_RDWR | O_CLOEXEC, 0777), + fd != -1); + TESTEQUAL(write(fd, test_data, strlen(test_data)), strlen(test_data)); + TESTSYSCALL(close(fd)); + fd = -1; + TESTEQUAL(install_elf_bpf("test_bpf.bpf", "test_trace", + &bpf_fd, NULL, NULL), 0); + TESTEQUAL(mount_fuse(mount_dir, bpf_fd, src_fd, &fuse_dev), 0); + + TEST(fd = s_open(s_path(s(mount_dir), s(file)), O_RDONLY | O_CLOEXEC), + fd != -1); + TESTEQUAL(lseek(fd, 3, SEEK_SET), 3); + TESTEQUAL(bpf_test_trace("lseek"), 0); + TESTEQUAL(lseek(fd, 5, SEEK_END), 9); + TESTEQUAL(bpf_test_trace("lseek"), 0); + TESTEQUAL(lseek(fd, 1, SEEK_CUR), 10); + TESTEQUAL(bpf_test_trace("lseek"), 0); + TESTEQUAL(lseek(fd, 1, SEEK_DATA), 1); + TESTEQUAL(bpf_test_trace("lseek"), 0); + result = TEST_SUCCESS; +out: + close(fd); + umount(mount_dir); + close(fuse_dev); + close(bpf_fd); + close(src_fd); + return result; +} + +/* + * State: + * Original: dst/folder1/content.txt + * ^ + * | + * | + * Backing: src/folder1/content.txt + * + * Step 1: open(folder1) - set backing to src/folder1 + * Check 1: cat(content.txt) - check not receiving call on the fuse daemon + * and content is the same + * Step 2: readdirplus(dst) + * Check 2: cat(content.txt) - check not receiving call on the fuse daemon + * and content is the same + */ +static int bpf_test_readdirplus_not_overriding_backing(const char *mount_dir) +{ + const char *folder1 = "folder1"; + const char *content_file = "content.txt"; + const char *content = "hello world"; + + int result = TEST_FAILURE; + int fuse_dev = -1; + int src_fd = -1; + int content_fd = -1; + FUSE_DECLARE_DAEMON; + + TESTSYSCALL(s_mkdir(s_path(s(ft_src), s(folder1)), 0777)); + TEST(content_fd = s_creat(s_pathn(3, s(ft_src), s(folder1), s(content_file)), 0777), + content_fd != -1); + TESTEQUAL(write(content_fd, content, strlen(content)), strlen(content)); + TESTEQUAL(mount_fuse_no_init(mount_dir, -1, -1, &fuse_dev), 0); + + FUSE_START_DAEMON(); + if (action) { + DIR *open_mount_dir = NULL; + struct dirent *mount_dirent; + int dst_folder1_fd = -1; + int dst_content_fd = -1; + char content_buffer[12]; + + // Step 1: Lookup folder1 + TESTERR(dst_folder1_fd = s_open(s_path(s(mount_dir), s(folder1)), + O_RDONLY | O_CLOEXEC), dst_folder1_fd != -1); + + // Check 1: Read content file (backed) + TESTERR(dst_content_fd = + s_open(s_pathn(3, s(mount_dir), s(folder1), s(content_file)), + O_RDONLY | O_CLOEXEC), dst_content_fd != -1); + + TESTEQUAL(read(dst_content_fd, content_buffer, strlen(content)), + strlen(content)); + TESTEQUAL(strncmp(content, content_buffer, strlen(content)), 0); + + TESTSYSCALL(close(dst_content_fd)); + dst_content_fd = -1; + 
TESTSYSCALL(close(dst_folder1_fd)); + dst_folder1_fd = -1; + memset(content_buffer, 0, strlen(content)); + + // Step 2: readdir folder 1 + TEST(open_mount_dir = s_opendir(s(mount_dir)), + open_mount_dir != NULL); + TEST(mount_dirent = readdir(open_mount_dir), mount_dirent != NULL); + TESTSYSCALL(closedir(open_mount_dir)); + open_mount_dir = NULL; + + // Check 2: Read content file again (must be backed) + TESTERR(dst_content_fd = + s_open(s_pathn(3, s(mount_dir), s(folder1), s(content_file)), + O_RDONLY | O_CLOEXEC), dst_content_fd != -1); + + TESTEQUAL(read(dst_content_fd, content_buffer, strlen(content)), + strlen(content)); + TESTEQUAL(strncmp(content, content_buffer, strlen(content)), 0); + + TESTSYSCALL(close(dst_content_fd)); + dst_content_fd = -1; + } else { + size_t read_size = 0; + struct fuse_in_header *in_header = (struct fuse_in_header *)bytes_in; + struct fuse_read_out *read_out = NULL; + struct fuse_attr attr = {}; + int backing_fd = -1; + DECL_FUSE_IN(open); + DECL_FUSE_IN(getattr); + + TESTFUSEINITFLAGS(FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO); + + // Step 1: Lookup folder 1 with backing + TESTFUSELOOKUP(folder1, 0); + TESTSYSCALL(s_fuse_attr(s_path(s(ft_src), s(folder1)), &attr)); + TEST(backing_fd = s_open(s_path(s(ft_src), s(folder1)), + O_DIRECTORY | O_RDONLY | O_CLOEXEC), + backing_fd != -1); + TESTFUSEOUT2(fuse_entry_out, ((struct fuse_entry_out) { + .nodeid = attr.ino, + .generation = 0, + .entry_valid = UINT64_MAX, + .attr_valid = UINT64_MAX, + .entry_valid_nsec = UINT32_MAX, + .attr_valid_nsec = UINT32_MAX, + .attr = attr, + }), fuse_entry_bpf_out, ((struct fuse_entry_bpf_out) { + .backing_action = FUSE_ACTION_REPLACE, + .backing_fd = backing_fd, + })); + TESTSYSCALL(close(backing_fd)); + + // Step 2: Open root dir + TESTFUSEIN(FUSE_OPENDIR, open_in); + TESTFUSEOUT1(fuse_open_out, ((struct fuse_open_out) { + .fh = 100, + .open_flags = open_in->flags + })); + + // Step 2: Handle getattr + TESTFUSEIN(FUSE_GETATTR, getattr_in); + TESTSYSCALL(s_fuse_attr(s(ft_src), &attr)); + TESTFUSEOUT1(fuse_attr_out, ((struct fuse_attr_out) { + .attr_valid = UINT64_MAX, + .attr_valid_nsec = UINT32_MAX, + .attr = attr + })); + + // Step 2: Handle readdirplus + read_size = read(fuse_dev, bytes_in, sizeof(bytes_in)); + TESTEQUAL(in_header->opcode, FUSE_READDIRPLUS); + + struct fuse_direntplus *dirent_plus = + (struct fuse_direntplus *) (bytes_in + read_size); + struct fuse_dirent dirent; + struct fuse_entry_out entry_out; + + read_out = (struct fuse_read_out *) (bytes_in + + sizeof(*in_header) + + sizeof(struct fuse_read_in)); + + TESTSYSCALL(s_fuse_attr(s_path(s(ft_src), s(folder1)), &attr)); + + dirent = (struct fuse_dirent) { + .ino = attr.ino, + .off = 1, + .namelen = strlen(folder1), + .type = DT_REG + }; + entry_out = (struct fuse_entry_out) { + .nodeid = attr.ino, + .generation = 0, + .entry_valid = UINT64_MAX, + .attr_valid = UINT64_MAX, + .entry_valid_nsec = UINT32_MAX, + .attr_valid_nsec = UINT32_MAX, + .attr = attr + }; + *dirent_plus = (struct fuse_direntplus) { + .dirent = dirent, + .entry_out = entry_out + }; + + strcpy((char *)(bytes_in + read_size + sizeof(*dirent_plus)), folder1); + read_size += FUSE_DIRENT_ALIGN(sizeof(*dirent_plus) + strlen(folder1) + + 1); + TESTFUSEDIROUTREAD(read_out, + bytes_in + + sizeof(struct fuse_in_header) + + sizeof(struct fuse_read_in) + + sizeof(struct fuse_read_out), + read_size - sizeof(struct fuse_in_header) - + sizeof(struct fuse_read_in) - + sizeof(struct fuse_read_out)); + exit(TEST_SUCCESS); + } + FUSE_END_DAEMON(); + 
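+	/* common exit path: also reached via the out: label inside FUSE_END_DAEMON() */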
close(fuse_dev); + close(content_fd); + close(src_fd); + umount(mount_dir); + return result; +} + +static int bpf_test_no_readdirplus_without_nodeid(const char *mount_dir) +{ + const char *folder1 = "folder1"; + const char *folder2 = "folder2"; + int result = TEST_FAILURE; + int fuse_dev = -1; + int src_fd = -1; + int content_fd = -1; + int bpf_fd = -1; + FUSE_DECLARE_DAEMON; + + TESTEQUAL(install_elf_bpf("test_bpf.bpf", "test_readdirplus", + &bpf_fd, NULL, NULL), 0); + TESTSYSCALL(s_mkdir(s_path(s(ft_src), s(folder1)), 0777)); + TESTSYSCALL(s_mkdir(s_path(s(ft_src), s(folder2)), 0777)); + TESTEQUAL(mount_fuse_no_init(mount_dir, -1, -1, &fuse_dev), 0); + FUSE_START_DAEMON(); + if (action) { + DIR *open_dir = NULL; + struct dirent *dirent; + + // Folder 1: Readdir with no nodeid + TEST(open_dir = s_opendir(s_path(s(ft_dst), s(folder1))), + open_dir != NULL); + TEST(dirent = readdir(open_dir), dirent == NULL); + TESTCOND(errno == EINVAL); + TESTSYSCALL(closedir(open_dir)); + open_dir = NULL; + + // Folder 2: Readdir with a nodeid + TEST(open_dir = s_opendir(s_path(s(ft_dst), s(folder2))), + open_dir != NULL); + TEST(dirent = readdir(open_dir), dirent == NULL); + TESTCOND(errno == EINVAL); + TESTSYSCALL(closedir(open_dir)); + open_dir = NULL; + } else { + size_t read_size; + struct fuse_in_header *in_header = (struct fuse_in_header *)bytes_in; + struct fuse_attr attr = {}; + int backing_fd = -1; + + TESTFUSEINITFLAGS(FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO); + + // folder 1: Set 0 as nodeid, Expect READDIR + TESTFUSELOOKUP(folder1, 0); + TEST(backing_fd = s_open(s_path(s(ft_src), s(folder1)), + O_DIRECTORY | O_RDONLY | O_CLOEXEC), + backing_fd != -1); + TESTFUSEOUT2(fuse_entry_out, ((struct fuse_entry_out) { + .nodeid = 0, + .generation = 0, + .entry_valid = UINT64_MAX, + .attr_valid = UINT64_MAX, + .entry_valid_nsec = UINT32_MAX, + .attr_valid_nsec = UINT32_MAX, + .attr = attr, + }), fuse_entry_bpf_out, ((struct fuse_entry_bpf_out) { + .backing_action = FUSE_ACTION_REPLACE, + .backing_fd = backing_fd, + .bpf_action = FUSE_ACTION_REPLACE, + .bpf_fd = bpf_fd, + })); + TESTSYSCALL(close(backing_fd)); + TEST(read_size = read(fuse_dev, bytes_in, sizeof(bytes_in)), read_size > 0); + TESTEQUAL(in_header->opcode, FUSE_READDIR); + TESTFUSEOUTERROR(-EINVAL); + + // folder 2: Set 10 as nodeid, Expect READDIRPLUS + TESTFUSELOOKUP(folder2, 0); + TEST(backing_fd = s_open(s_path(s(ft_src), s(folder2)), + O_DIRECTORY | O_RDONLY | O_CLOEXEC), + backing_fd != -1); + TESTFUSEOUT2(fuse_entry_out, ((struct fuse_entry_out) { + .nodeid = 10, + .generation = 0, + .entry_valid = UINT64_MAX, + .attr_valid = UINT64_MAX, + .entry_valid_nsec = UINT32_MAX, + .attr_valid_nsec = UINT32_MAX, + .attr = attr, + }), fuse_entry_bpf_out, ((struct fuse_entry_bpf_out) { + .backing_action = FUSE_ACTION_REPLACE, + .backing_fd = backing_fd, + .bpf_action = FUSE_ACTION_REPLACE, + .bpf_fd = bpf_fd, + })); + TESTSYSCALL(close(backing_fd)); + TEST(read_size = read(fuse_dev, bytes_in, sizeof(bytes_in)), read_size > 0); + TESTEQUAL(in_header->opcode, FUSE_READDIRPLUS); + TESTFUSEOUTERROR(-EINVAL); + exit(TEST_SUCCESS); + } + FUSE_END_DAEMON(); + close(fuse_dev); + close(content_fd); + close(src_fd); + close(bpf_fd); + umount(mount_dir); + return result; +} + +/* + * State: + * Original: dst/folder1/content.txt + * ^ + * | + * | + * Backing: src/folder1/content.txt + * + * Step 1: open(folder1) - lookup folder1 with entry_timeout set to 0 + * Step 2: open(folder1) - lookup folder1 again to trigger revalidate wich will + * set backing fd 
+ * + * Check 1: cat(content.txt) - check not receiving call on the fuse daemon + * and content is the same + */ +static int bpf_test_revalidate_handle_backing_fd(const char *mount_dir) +{ + const char *folder1 = "folder1"; + const char *content_file = "content.txt"; + const char *content = "hello world"; + int result = TEST_FAILURE; + int fuse_dev = -1; + int src_fd = -1; + int content_fd = -1; + FUSE_DECLARE_DAEMON; + + TESTSYSCALL(s_mkdir(s_path(s(ft_src), s(folder1)), 0777)); + TEST(content_fd = s_creat(s_pathn(3, s(ft_src), s(folder1), s(content_file)), 0777), + content_fd != -1); + TESTEQUAL(write(content_fd, content, strlen(content)), strlen(content)); + TESTSYSCALL(close(content_fd)); + content_fd = -1; + TESTEQUAL(mount_fuse_no_init(mount_dir, -1, -1, &fuse_dev), 0); + FUSE_START_DAEMON(); + if (action) { + int dst_folder1_fd = -1; + int dst_content_fd = -1; + char content_buffer[9] = {0}; + // Step 1: Lookup folder1 + TESTERR(dst_folder1_fd = s_open(s_path(s(mount_dir), s(folder1)), + O_RDONLY | O_CLOEXEC), dst_folder1_fd != -1); + TESTSYSCALL(close(dst_folder1_fd)); + dst_folder1_fd = -1; + // Step 2: Lookup folder1 again + TESTERR(dst_folder1_fd = s_open(s_path(s(mount_dir), s(folder1)), + O_RDONLY | O_CLOEXEC), dst_folder1_fd != -1); + TESTSYSCALL(close(dst_folder1_fd)); + dst_folder1_fd = -1; + // Check 1: Read content file (must be backed) + TESTERR(dst_content_fd = + s_open(s_pathn(3, s(mount_dir), s(folder1), s(content_file)), + O_RDONLY | O_CLOEXEC), dst_content_fd != -1); + TESTEQUAL(read(dst_content_fd, content_buffer, strlen(content)), + strlen(content)); + TESTEQUAL(strncmp(content, content_buffer, strlen(content)), 0); + TESTSYSCALL(close(dst_content_fd)); + dst_content_fd = -1; + } else { + struct fuse_attr attr = {}; + int backing_fd = -1; + + TESTFUSEINITFLAGS(FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO); + // Step 1: Lookup folder1 set entry_timeout to 0 to trigger + // revalidate later + TESTFUSELOOKUP(folder1, 0); + TESTSYSCALL(s_fuse_attr(s_path(s(ft_src), s(folder1)), &attr)); + TEST(backing_fd = s_open(s_path(s(ft_src), s(folder1)), + O_DIRECTORY | O_RDONLY | O_CLOEXEC), + backing_fd != -1); + TESTFUSEOUT2(fuse_entry_out, ((struct fuse_entry_out) { + .nodeid = attr.ino, + .generation = 0, + .entry_valid = 0, + .attr_valid = UINT64_MAX, + .entry_valid_nsec = 0, + .attr_valid_nsec = UINT32_MAX, + .attr = attr, + }), fuse_entry_bpf_out, ((struct fuse_entry_bpf_out) { + .backing_action = FUSE_ACTION_REPLACE, + .backing_fd = backing_fd, + })); + TESTSYSCALL(close(backing_fd)); + // Step 1: Lookup folder1 as a reaction to revalidate call + // This attempts to change the backing node, which is not allowed on revalidate + TESTFUSELOOKUP(folder1, 0); + TESTSYSCALL(s_fuse_attr(s_path(s(ft_src), s(folder1)), &attr)); + TEST(backing_fd = s_open(s_path(s(ft_src), s(folder1)), + O_DIRECTORY | O_RDONLY | O_CLOEXEC), + backing_fd != -1); + TESTFUSEOUT2(fuse_entry_out, ((struct fuse_entry_out) { + .nodeid = attr.ino, + .generation = 0, + .entry_valid = UINT64_MAX, + .attr_valid = UINT64_MAX, + .entry_valid_nsec = UINT32_MAX, + .attr_valid_nsec = UINT32_MAX, + .attr = attr, + }), fuse_entry_bpf_out, ((struct fuse_entry_bpf_out) { + .backing_action = FUSE_ACTION_REPLACE, + .backing_fd = backing_fd, + })); + TESTSYSCALL(close(backing_fd)); + + // Lookup folder1 as a reaction to failed revalidate + TESTFUSELOOKUP(folder1, 0); + TESTSYSCALL(s_fuse_attr(s_path(s(ft_src), s(folder1)), &attr)); + TEST(backing_fd = s_open(s_path(s(ft_src), s(folder1)), + O_DIRECTORY | O_RDONLY | 
O_CLOEXEC), + backing_fd != -1); + TESTFUSEOUT2(fuse_entry_out, ((struct fuse_entry_out) { + .nodeid = attr.ino, + .generation = 0, + .entry_valid = UINT64_MAX, + .attr_valid = UINT64_MAX, + .entry_valid_nsec = UINT32_MAX, + .attr_valid_nsec = UINT32_MAX, + .attr = attr, + }), fuse_entry_bpf_out, ((struct fuse_entry_bpf_out) { + .backing_action = FUSE_ACTION_REPLACE, + .backing_fd = backing_fd, + })); + TESTSYSCALL(close(backing_fd)); + exit(TEST_SUCCESS); + } + FUSE_END_DAEMON(); + close(fuse_dev); + close(content_fd); + close(src_fd); + umount(mount_dir); + return result; +} + +static int bpf_test_lookup_postfilter(const char *mount_dir) +{ + const char *file1_name = "file1"; + const char *file2_name = "file2"; + const char *file3_name = "file3"; + int result = TEST_FAILURE; + int bpf_fd = -1; + int src_fd = -1; + int fuse_dev = -1; + int file_fd = -1; + FUSE_DECLARE_DAEMON; + + TEST(file_fd = s_creat(s_path(s(ft_src), s(file1_name)), 0777), + file_fd != -1); + TESTSYSCALL(close(file_fd)); + TEST(file_fd = s_creat(s_path(s(ft_src), s(file2_name)), 0777), + file_fd != -1); + TESTSYSCALL(close(file_fd)); + file_fd = -1; + TESTEQUAL(install_elf_bpf("test_bpf.bpf", "test_lookup_postfilter", + &bpf_fd, NULL, NULL), 0); + TEST(src_fd = open(ft_src, O_DIRECTORY | O_RDONLY | O_CLOEXEC), + src_fd != -1); + TESTEQUAL(mount_fuse(mount_dir, bpf_fd, src_fd, &fuse_dev), 0); + FUSE_START_DAEMON(); + if (action) { + int fd = -1; + + TESTEQUAL(s_open(s_path(s(mount_dir), s(file1_name)), O_RDONLY), + -1); + TESTEQUAL(errno, ENOENT); + TEST(fd = s_open(s_path(s(mount_dir), s(file2_name)), O_RDONLY), + fd != -1); + TESTSYSCALL(close(fd)); + TESTEQUAL(s_open(s_path(s(mount_dir), s(file3_name)), O_RDONLY), + -1); + } else { + struct fuse_in_postfilter_header *in_header = + (struct fuse_in_postfilter_header *)bytes_in; + struct fuse_entry_out *feo; + struct fuse_entry_bpf_out *febo; + + TESTFUSELOOKUP(file1_name, FUSE_POSTFILTER); + TESTFUSEOUTERROR(-ENOENT); + + TESTFUSELOOKUP(file2_name, FUSE_POSTFILTER); + feo = (struct fuse_entry_out *) (bytes_in + + sizeof(struct fuse_in_header) + strlen(file2_name) + 1); + febo = (struct fuse_entry_bpf_out *) ((char *)feo + + sizeof(*feo)); + TESTFUSEOUT2(fuse_entry_out, *feo, fuse_entry_bpf_out, *febo); + + TESTFUSELOOKUP(file3_name, FUSE_POSTFILTER); + TESTEQUAL(in_header->error_in, -ENOENT); + TESTFUSEOUTERROR(-ENOENT); + exit(TEST_SUCCESS); + } + FUSE_END_DAEMON(); + close(file_fd); + close(fuse_dev); + umount(mount_dir); + close(src_fd); + close(bpf_fd); + return result; +} + +static void parse_range(const char *ranges, bool *run_test, size_t tests) +{ + size_t i; + char *range; + + for (i = 0; i < tests; ++i) + run_test[i] = false; + + range = strtok(optarg, ","); + while (range) { + char *dash = strchr(range, '-'); + + if (dash) { + size_t start = 1, end = tests; + char *end_ptr; + + if (dash > range) { + start = strtol(range, &end_ptr, 10); + if (*end_ptr != '-' || start <= 0 || start > tests) + ksft_exit_fail_msg("Bad range\n"); + } + + if (dash[1]) { + end = strtol(dash + 1, &end_ptr, 10); + if (*end_ptr || end <= start || end > tests) + ksft_exit_fail_msg("Bad range\n"); + } + + for (i = start; i <= end; ++i) + run_test[i - 1] = true; + } else { + char *end; + long value = strtol(range, &end, 10); + + if (*end || value <= 0 || value > tests) + ksft_exit_fail_msg("Bad range\n"); + run_test[value - 1] = true; + } + range = strtok(NULL, ","); + } +} + +static int parse_options(int argc, char *const *argv, bool *run_test, + size_t tests) +{ + signed char c; + + 
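+	/*
+	 * -f <n>     stored in test_options.file for tests that use it
+	 * -t <list>  comma-separated test numbers or ranges (see parse_range())
+	 * -v         verbose output
+	 */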
while ((c = getopt(argc, argv, "f:t:v")) != -1) + switch (c) { + case 'f': + test_options.file = strtol(optarg, NULL, 10); + break; + + case 't': + parse_range(optarg, run_test, tests); + break; + + case 'v': + test_options.verbose = true; + break; + + default: + return -EINVAL; + } + + return 0; +} + +struct test_case { + int (*pfunc)(const char *dir); + const char *name; +}; + +static void run_one_test(const char *mount_dir, + const struct test_case *test_case) +{ + ksft_print_msg("Running %s\n", test_case->name); + if (test_case->pfunc(mount_dir) == TEST_SUCCESS) + ksft_test_result_pass("%s\n", test_case->name); + else + ksft_test_result_fail("%s\n", test_case->name); +} + +int main(int argc, char *argv[]) +{ + char *mount_dir = NULL; + char *src_dir = NULL; + int i; + int fd, count; + +#define MAKE_TEST(test) \ + { \ + test, #test \ + } + const struct test_case cases[] = { + MAKE_TEST(basic_test), + MAKE_TEST(bpf_test_real), + MAKE_TEST(bpf_test_partial), + MAKE_TEST(bpf_test_attrs), + MAKE_TEST(bpf_test_readdir), + MAKE_TEST(bpf_test_creat), + MAKE_TEST(bpf_test_hidden_entries), + MAKE_TEST(bpf_test_dir), + MAKE_TEST(bpf_test_file_early_close), + MAKE_TEST(bpf_test_file_late_close), + MAKE_TEST(bpf_test_mknod), + MAKE_TEST(bpf_test_largedir), + MAKE_TEST(bpf_test_link), + MAKE_TEST(bpf_test_symlink), + MAKE_TEST(bpf_test_xattr), + MAKE_TEST(bpf_test_redact_readdir), + MAKE_TEST(bpf_test_set_backing), + MAKE_TEST(bpf_test_remove_backing), + MAKE_TEST(bpf_test_dir_rename), + MAKE_TEST(bpf_test_file_rename), + MAKE_TEST(bpf_test_alter_errcode_bpf), + MAKE_TEST(bpf_test_alter_errcode_userspace), + MAKE_TEST(mmap_test), + MAKE_TEST(readdir_perms_test), + MAKE_TEST(inotify_test), + MAKE_TEST(bpf_test_statfs), + MAKE_TEST(bpf_test_lseek), + MAKE_TEST(bpf_test_readdirplus_not_overriding_backing), + MAKE_TEST(bpf_test_no_readdirplus_without_nodeid), + MAKE_TEST(bpf_test_revalidate_handle_backing_fd), + MAKE_TEST(bpf_test_lookup_postfilter), + }; +#undef MAKE_TEST + + bool run_test[ARRAY_SIZE(cases)]; + + for (int i = 0; i < ARRAY_SIZE(cases); ++i) + run_test[i] = true; + + if (parse_options(argc, argv, run_test, ARRAY_SIZE(cases))) + ksft_exit_fail_msg("Bad options\n"); + + // Seed randomness pool for testing on QEMU + // NOTE - this abuses the concept of randomness - do *not* ever do this + // on a machine for production use - the device will think it has good + // randomness when it does not. + fd = open("/dev/urandom", O_WRONLY | O_CLOEXEC); + count = 4096; + for (int i = 0; i < 128; ++i) + ioctl(fd, RNDADDTOENTCNT, &count); + close(fd); + + ksft_print_header(); + + if (geteuid() != 0) + ksft_print_msg("Not a root, might fail to mount.\n"); + + if (tracing_on() != TEST_SUCCESS) + ksft_exit_fail_msg("Can't turn on tracing\n"); + + src_dir = setup_mount_dir(ft_src); + mount_dir = setup_mount_dir(ft_dst); + if (src_dir == NULL || mount_dir == NULL) + ksft_exit_fail_msg("Can't create a mount dir\n"); + + ksft_set_plan(ARRAY_SIZE(run_test)); + + for (i = 0; i < ARRAY_SIZE(run_test); ++i) + if (run_test[i]) { + delete_dir_tree(mount_dir, false); + delete_dir_tree(src_dir, false); + run_one_test(mount_dir, &cases[i]); + } else + ksft_cnt.ksft_xskip++; + + umount2(mount_dir, MNT_FORCE); + delete_dir_tree(mount_dir, true); + delete_dir_tree(src_dir, true); + return !ksft_get_fail_cnt() ? 
ksft_exit_pass() : ksft_exit_fail(); +} diff --git a/tools/testing/selftests/filesystems/fuse/test_bpf.c b/tools/testing/selftests/filesystems/fuse/test_bpf.c new file mode 100644 index 000000000000..032cb1178f9f --- /dev/null +++ b/tools/testing/selftests/filesystems/fuse/test_bpf.c @@ -0,0 +1,507 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause +// Copyright (c) 2022 Google LLC + +#include "test_fuse_bpf.h" + +SEC("test_readdir_redact") +/* return FUSE_BPF_BACKING to use backing fs, 0 to pass to usermode */ +int readdir_test(struct fuse_bpf_args *fa) +{ + switch (fa->opcode) { + case FUSE_READDIR | FUSE_PREFILTER: { + const struct fuse_read_in *fri = fa->in_args[0].value; + + bpf_printk("readdir %d", fri->fh); + return FUSE_BPF_BACKING | FUSE_BPF_POST_FILTER; + } + + case FUSE_READDIR | FUSE_POSTFILTER: { + const struct fuse_read_in *fri = fa->in_args[0].value; + + bpf_printk("readdir postfilter %x", fri->fh); + return FUSE_BPF_USER_FILTER; + } + + default: + bpf_printk("opcode %d", fa->opcode); + return FUSE_BPF_BACKING; + } +} + +SEC("test_trace") +/* return FUSE_BPF_BACKING to use backing fs, 0 to pass to usermode */ +int trace_test(struct fuse_bpf_args *fa) +{ + switch (fa->opcode) { + case FUSE_LOOKUP | FUSE_PREFILTER: { + /* real and partial use backing file */ + const char *name = fa->in_args[0].value; + bool backing = false; + + if (strcmp(name, "real") == 0 || strcmp(name, "partial") == 0) + backing = true; + + if (strcmp(name, "dir") == 0) + backing = true; + if (strcmp(name, "dir2") == 0) + backing = true; + + if (strcmp(name, "file1") == 0) + backing = true; + if (strcmp(name, "file2") == 0) + backing = true; + + bpf_printk("lookup %s %d", name, backing); + return backing ? FUSE_BPF_BACKING | FUSE_BPF_POST_FILTER : 0; + } + + case FUSE_LOOKUP | FUSE_POSTFILTER: { + const char *name = fa->in_args[0].value; + struct fuse_entry_out *feo = fa->out_args[0].value; + + if (strcmp(name, "real") == 0) + feo->nodeid = 5; + else if (strcmp(name, "partial") == 0) + feo->nodeid = 6; + + bpf_printk("post-lookup %s %d", name, feo->nodeid); + return 0; + } + + case FUSE_ACCESS | FUSE_PREFILTER: { + bpf_printk("Access: %d", fa->nodeid); + return FUSE_BPF_BACKING; + } + + case FUSE_CREATE | FUSE_PREFILTER: + bpf_printk("Create: %d", fa->nodeid); + return FUSE_BPF_BACKING; + + case FUSE_MKNOD | FUSE_PREFILTER: { + const struct fuse_mknod_in *fmi = fa->in_args[0].value; + const char *name = fa->in_args[1].value; + + bpf_printk("mknod %s %x %x", name, fmi->rdev | fmi->mode, fmi->umask); + return FUSE_BPF_BACKING; + } + + case FUSE_MKDIR | FUSE_PREFILTER: { + const struct fuse_mkdir_in *fmi = fa->in_args[0].value; + const char *name = fa->in_args[1].value; + + bpf_printk("mkdir %s %x %x", name, fmi->mode, fmi->umask); + return FUSE_BPF_BACKING; + } + + case FUSE_RMDIR | FUSE_PREFILTER: { + const char *name = fa->in_args[0].value; + + bpf_printk("rmdir %s", name); + return FUSE_BPF_BACKING; + } + + case FUSE_RENAME | FUSE_PREFILTER: { + const char *oldname = fa->in_args[1].value; + const char *newname = fa->in_args[2].value; + + bpf_printk("rename from %s", oldname); + bpf_printk("rename to %s", newname); + return FUSE_BPF_BACKING; + } + + case FUSE_RENAME2 | FUSE_PREFILTER: { + const struct fuse_rename2_in *fri = fa->in_args[0].value; + uint32_t flags = fri->flags; + const char *oldname = fa->in_args[1].value; + const char *newname = fa->in_args[2].value; + + bpf_printk("rename(%x) from %s", flags, oldname); + bpf_printk("rename to %s", newname); + return FUSE_BPF_BACKING; + } + + case 
FUSE_UNLINK | FUSE_PREFILTER: { + const char *name = fa->in_args[0].value; + + bpf_printk("unlink %s", name); + return FUSE_BPF_BACKING; + } + + case FUSE_LINK | FUSE_PREFILTER: { + const struct fuse_link_in *fli = fa->in_args[0].value; + const char *link_name = fa->in_args[1].value; + + bpf_printk("link %d %s", fli->oldnodeid, link_name); + return FUSE_BPF_BACKING; + } + + case FUSE_SYMLINK | FUSE_PREFILTER: { + const char *link_name = fa->in_args[0].value; + const char *link_dest = fa->in_args[1].value; + + bpf_printk("symlink from %s", link_name); + bpf_printk("symlink to %s", link_dest); + return FUSE_BPF_BACKING; + } + + case FUSE_READLINK | FUSE_PREFILTER: { + const char *link_name = fa->in_args[0].value; + + bpf_printk("readlink from", link_name); + return FUSE_BPF_BACKING; + } + + case FUSE_OPEN | FUSE_PREFILTER: { + int backing = 0; + + switch (fa->nodeid) { + case 5: + backing = FUSE_BPF_BACKING; + break; + + case 6: + backing = FUSE_BPF_BACKING | FUSE_BPF_POST_FILTER; + break; + + default: + break; + } + + bpf_printk("open %d %d", fa->nodeid, backing); + return backing; + } + + case FUSE_OPEN | FUSE_POSTFILTER: + bpf_printk("open postfilter"); + return FUSE_BPF_USER_FILTER; + + case FUSE_READ | FUSE_PREFILTER: { + const struct fuse_read_in *fri = fa->in_args[0].value; + + bpf_printk("read %llu %llu", fri->fh, fri->offset); + if (fri->fh == 1 && fri->offset == 0) + return 0; + return FUSE_BPF_BACKING; + } + + case FUSE_GETATTR | FUSE_PREFILTER: { + /* real and partial use backing file */ + int backing = 0; + + switch (fa->nodeid) { + case 1: + case 5: + case 6: + /* + * TODO: Find better solution + * Add 100 to stop clang compiling to jump table which bpf hates + */ + case 100: + backing = FUSE_BPF_BACKING; + break; + } + + bpf_printk("getattr %d %d", fa->nodeid, backing); + return backing; + } + + case FUSE_SETATTR | FUSE_PREFILTER: { + /* real and partial use backing file */ + int backing = 0; + + switch (fa->nodeid) { + case 1: + case 5: + case 6: + /* TODO See above */ + case 100: + backing = FUSE_BPF_BACKING; + break; + } + + bpf_printk("setattr %d %d", fa->nodeid, backing); + return backing; + } + + case FUSE_OPENDIR | FUSE_PREFILTER: { + int backing = 0; + + switch (fa->nodeid) { + case 1: + backing = FUSE_BPF_BACKING | FUSE_BPF_POST_FILTER; + break; + } + + bpf_printk("opendir %d %d", fa->nodeid, backing); + return backing; + } + + case FUSE_OPENDIR | FUSE_POSTFILTER: { + struct fuse_open_out *foo = fa->out_args[0].value; + + foo->fh = 2; + bpf_printk("opendir postfilter"); + return 0; + } + + case FUSE_READDIR | FUSE_PREFILTER: { + const struct fuse_read_in *fri = fa->in_args[0].value; + int backing = 0; + + if (fri->fh == 2) + backing = FUSE_BPF_BACKING | FUSE_BPF_POST_FILTER; + + bpf_printk("readdir %d %d", fri->fh, backing); + return backing; + } + + case FUSE_READDIR | FUSE_POSTFILTER: { + const struct fuse_read_in *fri = fa->in_args[0].value; + int backing = 0; + + if (fri->fh == 2) + backing = FUSE_BPF_USER_FILTER | FUSE_BPF_BACKING | + FUSE_BPF_POST_FILTER; + + bpf_printk("readdir postfilter %d %d", fri->fh, backing); + return backing; + } + + case FUSE_FLUSH | FUSE_PREFILTER: { + const struct fuse_flush_in *ffi = fa->in_args[0].value; + + bpf_printk("Flush %d", ffi->fh); + return FUSE_BPF_BACKING; + } + + case FUSE_GETXATTR | FUSE_PREFILTER: { + const struct fuse_flush_in *ffi = fa->in_args[0].value; + const char *name = fa->in_args[1].value; + + bpf_printk("getxattr %d %s", ffi->fh, name); + return FUSE_BPF_BACKING; + } + + case FUSE_LISTXATTR | FUSE_PREFILTER: 
{ + const struct fuse_flush_in *ffi = fa->in_args[0].value; + const char *name = fa->in_args[1].value; + + bpf_printk("listxattr %d %s", ffi->fh, name); + return FUSE_BPF_BACKING; + } + + case FUSE_SETXATTR | FUSE_PREFILTER: { + const struct fuse_flush_in *ffi = fa->in_args[0].value; + const char *name = fa->in_args[1].value; + unsigned int size = fa->in_args[2].size; + + bpf_printk("setxattr %d %s %u", ffi->fh, name, size); + return FUSE_BPF_BACKING; + } + + case FUSE_REMOVEXATTR | FUSE_PREFILTER: { + const char *name = fa->in_args[0].value; + + bpf_printk("removexattr %s", name); + return FUSE_BPF_BACKING; + } + + case FUSE_CANONICAL_PATH | FUSE_PREFILTER: { + bpf_printk("canonical_path"); + return FUSE_BPF_BACKING; + } + + case FUSE_STATFS | FUSE_PREFILTER: { + bpf_printk("statfs"); + return FUSE_BPF_BACKING; + } + + case FUSE_LSEEK | FUSE_PREFILTER: { + const struct fuse_lseek_in *fli = fa->in_args[0].value; + + bpf_printk("lseek type:%d, offset:%lld", fli->whence, fli->offset); + return FUSE_BPF_BACKING; + } + + default: + bpf_printk("Unknown opcode %d", fa->opcode); + return 0; + } +} + +SEC("test_hidden") +int trace_hidden(struct fuse_bpf_args *fa) +{ + switch (fa->opcode) { + case FUSE_LOOKUP | FUSE_PREFILTER: { + const char *name = fa->in_args[0].value; + + bpf_printk("Lookup: %s", name); + if (!strcmp(name, "show")) + return FUSE_BPF_BACKING; + if (!strcmp(name, "hide")) + return -ENOENT; + + return FUSE_BPF_BACKING; + } + + case FUSE_ACCESS | FUSE_PREFILTER: { + bpf_printk("Access: %d", fa->nodeid); + return FUSE_BPF_BACKING; + } + + case FUSE_CREATE | FUSE_PREFILTER: + bpf_printk("Create: %d", fa->nodeid); + return FUSE_BPF_BACKING; + + case FUSE_WRITE | FUSE_PREFILTER: + // TODO: Clang combines similar printk calls, causing BPF to complain + // bpf_printk("Write: %d", fa->nodeid); + return FUSE_BPF_BACKING; + + case FUSE_FLUSH | FUSE_PREFILTER: { + // const struct fuse_flush_in *ffi = fa->in_args[0].value; + + // bpf_printk("Flush %d", ffi->fh); + return FUSE_BPF_BACKING; + } + + case FUSE_RELEASE | FUSE_PREFILTER: { + // const struct fuse_release_in *fri = fa->in_args[0].value; + + // bpf_printk("Release %d", fri->fh); + return FUSE_BPF_BACKING; + } + + case FUSE_FALLOCATE | FUSE_PREFILTER: + // bpf_printk("fallocate %d", fa->nodeid); + return FUSE_BPF_BACKING; + + case FUSE_CANONICAL_PATH | FUSE_PREFILTER: { + return FUSE_BPF_BACKING; + } + default: + bpf_printk("Unknown opcode: %d", fa->opcode); + return 0; + } +} + +SEC("test_simple") +int trace_simple(struct fuse_bpf_args *fa) +{ + if (fa->opcode & FUSE_PREFILTER) + bpf_printk("prefilter opcode: %d", + fa->opcode & FUSE_OPCODE_FILTER); + else if (fa->opcode & FUSE_POSTFILTER) + bpf_printk("postfilter opcode: %d", + fa->opcode & FUSE_OPCODE_FILTER); + else + bpf_printk("*** UNKNOWN *** opcode: %d", fa->opcode); + return FUSE_BPF_BACKING; +} + +SEC("test_passthrough") +int trace_daemon(struct fuse_bpf_args *fa) +{ + switch (fa->opcode) { + case FUSE_LOOKUP | FUSE_PREFILTER: { + const char *name = fa->in_args[0].value; + + bpf_printk("Lookup prefilter: %lx %s", fa->nodeid, name); + return FUSE_BPF_BACKING | FUSE_BPF_POST_FILTER; + } + + case FUSE_LOOKUP | FUSE_POSTFILTER: { + const char *name = fa->in_args[0].value; + struct fuse_entry_bpf_out *febo = fa->out_args[1].value; + + bpf_printk("Lookup postfilter: %lx %s %lu", fa->nodeid, name); + febo->bpf_action = FUSE_ACTION_REMOVE; + + return FUSE_BPF_USER_FILTER; + } + + default: + if (fa->opcode & FUSE_PREFILTER) + bpf_printk("prefilter opcode: %d", + fa->opcode & 
FUSE_OPCODE_FILTER); + else if (fa->opcode & FUSE_POSTFILTER) + bpf_printk("postfilter opcode: %d", + fa->opcode & FUSE_OPCODE_FILTER); + else + bpf_printk("*** UNKNOWN *** opcode: %d", fa->opcode); + return FUSE_BPF_BACKING; + } +} + +SEC("test_error") +/* return FUSE_BPF_BACKING to use backing fs, 0 to pass to usermode */ +int error_test(struct fuse_bpf_args *fa) +{ + switch (fa->opcode) { + case FUSE_MKDIR | FUSE_PREFILTER: { + bpf_printk("mkdir"); + return FUSE_BPF_BACKING | FUSE_BPF_POST_FILTER; + } + case FUSE_MKDIR | FUSE_POSTFILTER: { + bpf_printk("mkdir postfilter"); + if (fa->error_in == -EEXIST) + return -EPERM; + + return 0; + } + + case FUSE_LOOKUP | FUSE_PREFILTER: { + const char *name = fa->in_args[0].value; + + bpf_printk("lookup prefilter %s", name); + return FUSE_BPF_BACKING | FUSE_BPF_POST_FILTER; + } + case FUSE_LOOKUP | FUSE_POSTFILTER: { + const char *name = fa->in_args[0].value; + + bpf_printk("lookup postfilter %s %d", name, fa->error_in); + if (strcmp(name, "doesnotexist") == 0/* && fa->error_in == -EEXIST*/) { + bpf_printk("lookup postfilter doesnotexist"); + return FUSE_BPF_USER_FILTER; + } + bpf_printk("meh"); + return 0; + } + + default: + if (fa->opcode & FUSE_PREFILTER) + bpf_printk("prefilter opcode: %d", + fa->opcode & FUSE_OPCODE_FILTER); + else if (fa->opcode & FUSE_POSTFILTER) + bpf_printk("postfilter opcode: %d", + fa->opcode & FUSE_OPCODE_FILTER); + else + bpf_printk("*** UNKNOWN *** opcode: %d", fa->opcode); + return FUSE_BPF_BACKING; + } +} + +SEC("test_readdirplus") +int readdirplus_test(struct fuse_bpf_args *fa) +{ + switch (fa->opcode) { + case FUSE_READDIR | FUSE_PREFILTER: { + return 0; + } + } + return FUSE_BPF_BACKING; +} + +SEC("test_lookup_postfilter") +int lookuppostfilter_test(struct fuse_bpf_args *fa) +{ + switch (fa->opcode) { + case FUSE_LOOKUP | FUSE_PREFILTER: + return FUSE_BPF_BACKING | FUSE_BPF_POST_FILTER; + case FUSE_LOOKUP | FUSE_POSTFILTER: + return FUSE_BPF_USER_FILTER; + default: + return FUSE_BPF_BACKING; + } +} diff --git a/tools/testing/selftests/filesystems/fuse/test_framework.h b/tools/testing/selftests/filesystems/fuse/test_framework.h new file mode 100644 index 000000000000..efc6f53ea803 --- /dev/null +++ b/tools/testing/selftests/filesystems/fuse/test_framework.h @@ -0,0 +1,179 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2021 Google LLC + */ + +#ifndef _TEST_FRAMEWORK_H +#define _TEST_FRAMEWORK_H + +#include +#include +#include + +#ifdef __ANDROID__ +static int test_case_pass; +static int test_case_fail; +#define ksft_print_msg printf +#define ksft_test_result_pass(...) ({test_case_pass++; printf(__VA_ARGS__); }) +#define ksft_test_result_fail(...) ({test_case_fail++; printf(__VA_ARGS__); }) +#define ksft_exit_fail_msg(...) printf(__VA_ARGS__) +#define ksft_print_header() +#define ksft_set_plan(cnt) +#define ksft_get_fail_cnt() test_case_fail +#define ksft_exit_pass() 0 +#define ksft_exit_fail() 1 +#else +#include +#endif + +#define TEST_FAILURE 1 +#define TEST_SUCCESS 0 + +#define ptr_to_u64(p) ((__u64)p) + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +#define le16_to_cpu(x) (x) +#define le32_to_cpu(x) (x) +#define le64_to_cpu(x) (x) +#else +#error Big endian not supported! 
+#endif + +struct _test_options { + int file; + bool verbose; +}; + +extern struct _test_options test_options; + +#define TESTCOND(condition) \ + do { \ + if (!(condition)) { \ + ksft_print_msg("%s failed %d\n", \ + __func__, __LINE__); \ + goto out; \ + } else if (test_options.verbose) \ + ksft_print_msg("%s succeeded %d\n", \ + __func__, __LINE__); \ + } while (false) + +#define TESTCONDERR(condition) \ + do { \ + if (!(condition)) { \ + ksft_print_msg("%s failed %d\n", \ + __func__, __LINE__); \ + ksft_print_msg("Error %d (\"%s\")\n", \ + errno, strerror(errno)); \ + goto out; \ + } else if (test_options.verbose) \ + ksft_print_msg("%s succeeded %d\n", \ + __func__, __LINE__); \ + } while (false) + +#define TEST(statement, condition) \ + do { \ + statement; \ + TESTCOND(condition); \ + } while (false) + +#define TESTERR(statement, condition) \ + do { \ + statement; \ + TESTCONDERR(condition); \ + } while (false) + +enum _operator { + _eq, + _ne, + _ge, +}; + +static const char * const _operator_name[] = { + "==", + "!=", + ">=", +}; + +#define _TEST_OPERATOR(name, _type, format_specifier) \ +static inline int _test_operator_##name(const char *func, int line, \ + _type a, _type b, enum _operator o) \ +{ \ + bool pass; \ + switch (o) { \ + case _eq: \ + pass = a == b; \ + break; \ + case _ne: \ + pass = a != b; \ + break; \ + case _ge: \ + pass = a >= b; \ + break; \ + } \ + \ + if (!pass) \ + ksft_print_msg("Failed: %s at line %d, " \ + format_specifier " %s " \ + format_specifier "\n", \ + func, line, a, _operator_name[o], b); \ + else if (test_options.verbose) \ + ksft_print_msg("Passed: %s at line %d, " \ + format_specifier " %s " \ + format_specifier "\n", \ + func, line, a, _operator_name[o], b); \ + \ + return pass ? TEST_SUCCESS : TEST_FAILURE; \ +} + +_TEST_OPERATOR(i, int, "%d") +_TEST_OPERATOR(ui, unsigned int, "%u") +_TEST_OPERATOR(lui, unsigned long, "%lu") +_TEST_OPERATOR(ss, ssize_t, "%zd") +_TEST_OPERATOR(vp, void *, "%px") +_TEST_OPERATOR(cp, char *, "%px") + +#define _CALL_TO(_type, name, a, b, o) \ + _test_operator_##name(__func__, __LINE__, \ + (_type) (long long) (a), \ + (_type) (long long) (b), o) + +#define TESTOPERATOR(a, b, o) \ + do { \ + if (_Generic((a), \ + int : _CALL_TO(int, i, a, b, o), \ + unsigned int : _CALL_TO(unsigned int, ui, a, b, o), \ + unsigned long : _CALL_TO(unsigned long, lui, a, b, o), \ + ssize_t : _CALL_TO(ssize_t, ss, a, b, o), \ + void * : _CALL_TO(void *, vp, a, b, o), \ + char * : _CALL_TO(char *, cp, a, b, o) \ + )) \ + goto out; \ + } while (false) + +#define TESTEQUAL(a, b) TESTOPERATOR(a, b, _eq) +#define TESTNE(a, b) TESTOPERATOR(a, b, _ne) +#define TESTGE(a, b) TESTOPERATOR(a, b, _ge) + +/* For testing a syscall that returns 0 on success and sets errno otherwise */ +#define TESTSYSCALL(statement) TESTCONDERR((statement) == 0) + +static inline void print_bytes(const void *data, size_t size) +{ + const char *bytes = data; + int i; + + for (i = 0; i < size; ++i) { + if (i % 0x10 == 0) + printf("%08x:", i); + printf("%02x ", (unsigned int) (unsigned char) bytes[i]); + if (i % 0x10 == 0x0f) + printf("\n"); + } + + if (i % 0x10 != 0) + printf("\n"); +} + + + +#endif diff --git a/tools/testing/selftests/filesystems/fuse/test_fuse.h b/tools/testing/selftests/filesystems/fuse/test_fuse.h new file mode 100644 index 000000000000..69dadc9c7e45 --- /dev/null +++ b/tools/testing/selftests/filesystems/fuse/test_fuse.h @@ -0,0 +1,337 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2021 Google LLC + */ + +#ifndef TEST_FUSE__H 
+#define TEST_FUSE__H + +#define _GNU_SOURCE + +#include "test_framework.h" + +#include +#include +#include +#include + +#include +#include + +#define PAGE_SIZE 4096 +#define FUSE_POSTFILTER 0x20000 + +extern struct _test_options test_options; + +/* Slow but semantically easy string functions */ + +/* + * struct s just wraps a char pointer + * It is a pointer to a malloc'd string, or null + * All consumers handle null input correctly + * All consumers free the string + */ +struct s { + char *s; +}; + +struct s s(const char *s1); +struct s sn(const char *s1, const char *s2); +int s_cmp(struct s s1, struct s s2); +struct s s_cat(struct s s1, struct s s2); +struct s s_splitleft(struct s s1, char c); +struct s s_splitright(struct s s1, char c); +struct s s_word(struct s s1, char c, size_t n); +struct s s_path(struct s s1, struct s s2); +struct s s_pathn(size_t n, struct s s1, ...); +int s_link(struct s src_pathname, struct s dst_pathname); +int s_symlink(struct s src_pathname, struct s dst_pathname); +int s_mkdir(struct s pathname, mode_t mode); +int s_rmdir(struct s pathname); +int s_unlink(struct s pathname); +int s_open(struct s pathname, int flags, ...); +int s_openat(int dirfd, struct s pathname, int flags, ...); +int s_creat(struct s pathname, mode_t mode); +int s_mkfifo(struct s pathname, mode_t mode); +int s_stat(struct s pathname, struct stat *st); +int s_statfs(struct s pathname, struct statfs *st); +int s_fuse_attr(struct s pathname, struct fuse_attr *fuse_attr_out); +DIR *s_opendir(struct s pathname); +int s_getxattr(struct s pathname, const char name[], void *value, size_t size, + ssize_t *ret_size); +int s_listxattr(struct s pathname, void *list, size_t size, ssize_t *ret_size); +int s_setxattr(struct s pathname, const char name[], const void *value, + size_t size, int flags); +int s_removexattr(struct s pathname, const char name[]); +int s_rename(struct s oldpathname, struct s newpathname); + +struct s tracing_folder(void); +int tracing_on(void); + +char *concat_file_name(const char *dir, const char *file); +char *setup_mount_dir(const char *name); +int delete_dir_tree(const char *dir_path, bool remove_root); + +#define TESTFUSEINNULL(_opcode) \ + do { \ + struct fuse_in_header *in_header = \ + (struct fuse_in_header *)bytes_in; \ + ssize_t res = read(fuse_dev, &bytes_in, \ + sizeof(bytes_in)); \ + \ + TESTEQUAL(in_header->opcode, _opcode); \ + TESTEQUAL(res, sizeof(*in_header)); \ + } while (false) + +#define TESTFUSEIN(_opcode, in_struct) \ + do { \ + struct fuse_in_header *in_header = \ + (struct fuse_in_header *)bytes_in; \ + ssize_t res = read(fuse_dev, &bytes_in, \ + sizeof(bytes_in)); \ + \ + TESTEQUAL(in_header->opcode, _opcode); \ + TESTEQUAL(res, sizeof(*in_header) + sizeof(*in_struct));\ + } while (false) + +#define TESTFUSEIN2(_opcode, in_struct1, in_struct2) \ + do { \ + struct fuse_in_header *in_header = \ + (struct fuse_in_header *)bytes_in; \ + ssize_t res = read(fuse_dev, &bytes_in, \ + sizeof(bytes_in)); \ + \ + TESTEQUAL(in_header->opcode, _opcode); \ + TESTEQUAL(res, sizeof(*in_header) + sizeof(*in_struct1) \ + + sizeof(*in_struct2)); \ + in_struct1 = (void *)(bytes_in + sizeof(*in_header)); \ + in_struct2 = (void *)(bytes_in + sizeof(*in_header) \ + + sizeof(*in_struct1)); \ + } while (false) + +#define TESTFUSEINEXT(_opcode, in_struct, extra) \ + do { \ + struct fuse_in_header *in_header = \ + (struct fuse_in_header *)bytes_in; \ + ssize_t res = read(fuse_dev, &bytes_in, \ + sizeof(bytes_in)); \ + \ + TESTEQUAL(in_header->opcode, _opcode); \ + 
TESTEQUAL(res, \ + sizeof(*in_header) + sizeof(*in_struct) + extra);\ + } while (false) + +#define TESTFUSEINUNKNOWN() \ + do { \ + struct fuse_in_header *in_header = \ + (struct fuse_in_header *)bytes_in; \ + ssize_t res = read(fuse_dev, &bytes_in, \ + sizeof(bytes_in)); \ + \ + TESTGE(res, sizeof(*in_header)); \ + TESTEQUAL(in_header->opcode, -1); \ + } while (false) + +/* Special case lookup since it is asymmetric */ +#define TESTFUSELOOKUP(expected, filter) \ + do { \ + struct fuse_in_header *in_header = \ + (struct fuse_in_header *)bytes_in; \ + char *name = (char *) (bytes_in + sizeof(*in_header)); \ + ssize_t res; \ + \ + TEST(res = read(fuse_dev, &bytes_in, sizeof(bytes_in)), \ + res != -1); \ + /* TODO once we handle forgets properly, remove */ \ + if (in_header->opcode == FUSE_FORGET) \ + continue; \ + if (in_header->opcode == FUSE_BATCH_FORGET) \ + continue; \ + TESTGE(res, sizeof(*in_header)); \ + TESTEQUAL(in_header->opcode, \ + FUSE_LOOKUP | filter); \ + TESTEQUAL(res, \ + sizeof(*in_header) + strlen(expected) + 1 + \ + (filter == FUSE_POSTFILTER ? \ + sizeof(struct fuse_entry_out) + \ + sizeof(struct fuse_entry_bpf_out) : 0));\ + TESTCOND(!strcmp(name, expected)); \ + break; \ + } while (true) + +#define TESTFUSEOUTEMPTY() \ + do { \ + struct fuse_in_header *in_header = \ + (struct fuse_in_header *)bytes_in; \ + struct fuse_out_header *out_header = \ + (struct fuse_out_header *)bytes_out; \ + \ + *out_header = (struct fuse_out_header) { \ + .len = sizeof(*out_header), \ + .unique = in_header->unique, \ + }; \ + TESTEQUAL(write(fuse_dev, bytes_out, out_header->len), \ + out_header->len); \ + } while (false) + +#define TESTFUSEOUTERROR(errno) \ + do { \ + struct fuse_in_header *in_header = \ + (struct fuse_in_header *)bytes_in; \ + struct fuse_out_header *out_header = \ + (struct fuse_out_header *)bytes_out; \ + \ + *out_header = (struct fuse_out_header) { \ + .len = sizeof(*out_header), \ + .error = errno, \ + .unique = in_header->unique, \ + }; \ + TESTEQUAL(write(fuse_dev, bytes_out, out_header->len), \ + out_header->len); \ + } while (false) + +#define TESTFUSEOUTREAD(data, length) \ + do { \ + struct fuse_in_header *in_header = \ + (struct fuse_in_header *)bytes_in; \ + struct fuse_out_header *out_header = \ + (struct fuse_out_header *)bytes_out; \ + \ + *out_header = (struct fuse_out_header) { \ + .len = sizeof(*out_header) + length, \ + .unique = in_header->unique, \ + }; \ + memcpy(bytes_out + sizeof(*out_header), data, length); \ + TESTEQUAL(write(fuse_dev, bytes_out, out_header->len), \ + out_header->len); \ + } while (false) + +#define TESTFUSEDIROUTREAD(read_out, data, length) \ + do { \ + struct fuse_in_header *in_header = \ + (struct fuse_in_header *)bytes_in; \ + struct fuse_out_header *out_header = \ + (struct fuse_out_header *)bytes_out; \ + \ + *out_header = (struct fuse_out_header) { \ + .len = sizeof(*out_header) + \ + sizeof(*read_out) + length, \ + .unique = in_header->unique, \ + }; \ + memcpy(bytes_out + sizeof(*out_header) + \ + sizeof(*read_out), data, length); \ + memcpy(bytes_out + sizeof(*out_header), \ + read_out, sizeof(*read_out)); \ + TESTEQUAL(write(fuse_dev, bytes_out, out_header->len), \ + out_header->len); \ + } while (false) + +#define TESTFUSEOUT1(type1, obj1) \ + do { \ + *(struct fuse_out_header *) bytes_out \ + = (struct fuse_out_header) { \ + .len = sizeof(struct fuse_out_header) \ + + sizeof(struct type1), \ + .unique = ((struct fuse_in_header *) \ + bytes_in)->unique, \ + }; \ + *(struct type1 *) (bytes_out \ + + sizeof(struct 
fuse_out_header)) \ + = obj1; \ + TESTEQUAL(write(fuse_dev, bytes_out, \ + ((struct fuse_out_header *)bytes_out)->len), \ + ((struct fuse_out_header *)bytes_out)->len); \ + } while (false) + +#define TESTFUSEOUT2(type1, obj1, type2, obj2) \ + do { \ + *(struct fuse_out_header *) bytes_out \ + = (struct fuse_out_header) { \ + .len = sizeof(struct fuse_out_header) \ + + sizeof(struct type1) \ + + sizeof(struct type2), \ + .unique = ((struct fuse_in_header *) \ + bytes_in)->unique, \ + }; \ + *(struct type1 *) (bytes_out \ + + sizeof(struct fuse_out_header)) \ + = obj1; \ + *(struct type2 *) (bytes_out \ + + sizeof(struct fuse_out_header) \ + + sizeof(struct type1)) \ + = obj2; \ + TESTEQUAL(write(fuse_dev, bytes_out, \ + ((struct fuse_out_header *)bytes_out)->len), \ + ((struct fuse_out_header *)bytes_out)->len); \ + } while (false) + +#define TESTFUSEINITFLAGS(fuse_connection_flags) \ + do { \ + DECL_FUSE_IN(init); \ + \ + TESTFUSEIN(FUSE_INIT, init_in); \ + TESTEQUAL(init_in->major, FUSE_KERNEL_VERSION); \ + TESTEQUAL(init_in->minor, FUSE_KERNEL_MINOR_VERSION); \ + TESTFUSEOUT1(fuse_init_out, ((struct fuse_init_out) { \ + .major = FUSE_KERNEL_VERSION, \ + .minor = FUSE_KERNEL_MINOR_VERSION, \ + .max_readahead = 4096, \ + .flags = fuse_connection_flags, \ + .max_background = 0, \ + .congestion_threshold = 0, \ + .max_write = 4096, \ + .time_gran = 1000, \ + .max_pages = 12, \ + .map_alignment = 4096, \ + })); \ + } while (false) + +#define TESTFUSEINIT() \ + TESTFUSEINITFLAGS(0) + +#define DECL_FUSE_IN(name) \ + struct fuse_##name##_in *name##_in = \ + (struct fuse_##name##_in *) \ + (bytes_in + sizeof(struct fuse_in_header)) + +#define DECL_FUSE(name) \ + struct fuse_##name##_in *name##_in __maybe_unused; \ + struct fuse_##name##_out *name##_out __maybe_unused + +#define FUSE_DECLARE_DAEMON \ + int daemon = -1; \ + int status; \ + bool action; \ + uint8_t bytes_in[FUSE_MIN_READ_BUFFER] __maybe_unused; \ + uint8_t bytes_out[FUSE_MIN_READ_BUFFER] __maybe_unused + +#define FUSE_START_DAEMON() \ + do { \ + TEST(daemon = fork(), daemon != -1); \ + action = daemon != 0; \ + } while (false) + +#define FUSE_END_DAEMON() \ + do { \ + TESTEQUAL(waitpid(daemon, &status, 0), daemon); \ + TESTEQUAL(status, TEST_SUCCESS); \ + result = TEST_SUCCESS; \ +out: \ + if (!daemon) \ + exit(TEST_FAILURE); \ + } while (false) + + +struct map_relocation { + char *name; + int fd; + int value; +}; + +int mount_fuse(const char *mount_dir, int bpf_fd, int dir_fd, + int *fuse_dev_ptr); +int mount_fuse_no_init(const char *mount_dir, int bpf_fd, int dir_fd, + int *fuse_dev_ptr); +int install_elf_bpf(const char *file, const char *section, int *fd, + struct map_relocation **map_relocations, size_t *map_count); +#endif diff --git a/tools/testing/selftests/filesystems/fuse/test_fuse_bpf.h b/tools/testing/selftests/filesystems/fuse/test_fuse_bpf.h new file mode 100644 index 000000000000..9097626b7c4d --- /dev/null +++ b/tools/testing/selftests/filesystems/fuse/test_fuse_bpf.h @@ -0,0 +1,65 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2022 Google LLC + */ + +#ifndef TEST_FUSE__BPF__H +#define TEST_FUSE__BPF__H + +#define __EXPORTED_HEADERS__ +#define __KERNEL__ + +#ifdef __ANDROID__ +#include +#endif + +#include +#include +#include +#include +#include + +#define SEC(NAME) __section(NAME) + +struct fuse_bpf_map { + int map_type; + size_t key_size; + size_t value_size; + int max_entries; +}; + +static void *(*bpf_map_lookup_elem)(struct fuse_bpf_map *map, void *key) + = (void *) 1; + +static void 
*(*bpf_map_update_elem)(struct fuse_bpf_map *map, void *key, + void *value, int flags) + = (void *) 2; + +static long (*bpf_trace_printk)(const char *fmt, __u32 fmt_size, ...) + = (void *) 6; + +static long (*bpf_get_current_pid_tgid)() + = (void *) 14; + +static long (*bpf_get_current_uid_gid)() + = (void *) 15; + +#define bpf_printk(fmt, ...) \ + ({ \ + char ____fmt[] = fmt; \ + bpf_trace_printk(____fmt, sizeof(____fmt), \ + ##__VA_ARGS__); \ + }) + +SEC("dummy") inline int strcmp(const char *a, const char *b) +{ + int i; + + for (i = 0; i < __builtin_strlen(b) + 1; ++i) + if (a[i] != b[i]) + return -1; + + return 0; +} + +#endif From f6d21159ccbd638ac6e9de50fb5085ce54fb3735 Mon Sep 17 00:00:00 2001 From: Paul Lawrence Date: Mon, 5 Dec 2022 14:11:13 -0800 Subject: [PATCH 449/457] ANDROID: fuse-bpf: Make sure to declare functions Bug: 202785178 Bug: 265206112 Test: Compiles Reported-by: kernel test robot Change-Id: I70983d0d66d88008af3a1d51ab0de564c20312e9 Signed-off-by: Paul Lawrence --- fs/fuse/fuse_i.h | 3 --- include/uapi/linux/android_fuse.h | 4 +++- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 637a30e0cbcb..3e2131faa5ed 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -1426,8 +1426,6 @@ ssize_t fuse_passthrough_mmap(struct file *file, struct vm_area_struct *vma); /* backing.c */ -struct bpf_prog *fuse_get_bpf_prog(struct file *file); - /* * Dummy io passed to fuse_bpf_backing when io operation needs no scratch space */ @@ -2057,7 +2055,6 @@ ssize_t fuse_bpf_simple_request(struct fuse_mount *fm, struct fuse_bpf_args *arg fer; \ }) -struct bpf_prog *fuse_get_bpf_prog(struct file *file); #endif /* CONFIG_FUSE_BPF */ #endif /* _FS_FUSE_I_H */ diff --git a/include/uapi/linux/android_fuse.h b/include/uapi/linux/android_fuse.h index 630b752d34b7..221e30ea7f01 100644 --- a/include/uapi/linux/android_fuse.h +++ b/include/uapi/linux/android_fuse.h @@ -92,4 +92,6 @@ struct fuse_bpf_args { #define FUSE_PREFILTER 0x10000 #define FUSE_POSTFILTER 0x20000 -#endif // _LINUX_ANDROID_FUSE_H +struct bpf_prog *fuse_get_bpf_prog(struct file *file); + +#endif /* _LINUX_ANDROID_FUSE_H */ From ae696a5a231d6d9e5ae318b48179c96557412db0 Mon Sep 17 00:00:00 2001 From: Paul Lawrence Date: Mon, 5 Dec 2022 15:22:09 -0800 Subject: [PATCH 450/457] ANDROID: fuse-bpf: Add /sys/fs flags for fuse-bpf version Note that this is specific for the non-upstreamed version Bug: 202785178 Bug: 265206112 Test: cat /sys/fs/fuse/fuse_bpf_major_version Change-Id: I68f9ca56778874975428839dfc1fd8f48b11bd75 Signed-off-by: Paul Lawrence --- Documentation/ABI/testing/sysfs-fs-fuse | 19 +++++++++++++ fs/fuse/inode.c | 36 +++++++++++++++++++++++-- 2 files changed, 53 insertions(+), 2 deletions(-) create mode 100644 Documentation/ABI/testing/sysfs-fs-fuse diff --git a/Documentation/ABI/testing/sysfs-fs-fuse b/Documentation/ABI/testing/sysfs-fs-fuse new file mode 100644 index 000000000000..b9956842b36f --- /dev/null +++ b/Documentation/ABI/testing/sysfs-fs-fuse @@ -0,0 +1,19 @@ +What: /sys/fs/fuse/features/fuse_bpf +Date: December 2022 +Contact: Paul Lawrence +Description: + Read-only file that contains the word 'supported' if fuse-bpf is + supported, does not exist otherwise + +What: /sys/fs/fuse/bpf_prog_type_fuse +Date: December 2022 +Contact: Paul Lawrence +Description: + bpf_prog_type_fuse defines the program type of bpf programs that + may be passed to fuse-bpf. For upstream bpf program types, this + is a constant defined in a contiguous array of constants. 
+ bpf_prog_type_fuse is appended to the end of the list, so it may + change and therefore its value must be read from this file. + + Contents is ASCII decimal representation of bpf_prog_type_fuse + diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 6ccb7fc7c121..1f593230bd4a 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -2085,7 +2085,31 @@ static void fuse_fs_cleanup(void) static struct kobject *fuse_kobj; -/* TODO Remove this once BPF_PROG_TYPE_FUSE is upstreamed */ +static ssize_t fuse_bpf_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buff) +{ + return sysfs_emit(buff, "supported\n"); +} + +static struct kobj_attribute fuse_bpf_attr = + __ATTR_RO(fuse_bpf); + +static struct attribute *bpf_features[] = { + &fuse_bpf_attr.attr, + NULL, +}; + +static const struct attribute_group bpf_features_group = { + .name = "features", + .attrs = bpf_features, +}; + +/* + * TODO Remove this once fuse-bpf is upstreamed + * + * bpf_prog_type_fuse exports the bpf_prog_type_fuse 'constant', which cannot be + * constant until the code is upstreamed + */ static ssize_t bpf_prog_type_fuse_show(struct kobject *kobj, struct kobj_attribute *attr, char *buff) { @@ -2103,6 +2127,13 @@ static struct attribute *bpf_attributes[] = { static const struct attribute_group bpf_attr_group = { .attrs = bpf_attributes, }; + +static const struct attribute_group *attribute_groups[] = { + &bpf_features_group, + &bpf_attr_group, + NULL +}; + /* TODO remove to here */ static int fuse_sysfs_init(void) @@ -2120,7 +2151,7 @@ static int fuse_sysfs_init(void) goto out_fuse_unregister; /* TODO Remove when BPF_PROG_TYPE_FUSE is upstreamed */ - err = sysfs_create_group(fuse_kobj, &bpf_attr_group); + err = sysfs_create_groups(fuse_kobj, attribute_groups); if (err) goto out_fuse_remove_mount_point; @@ -2136,6 +2167,7 @@ static int fuse_sysfs_init(void) static void fuse_sysfs_cleanup(void) { + sysfs_remove_groups(fuse_kobj, attribute_groups); sysfs_remove_mount_point(fuse_kobj, "connections"); kobject_put(fuse_kobj); } From 1a11a5283818fddcd82993c39fb1d16a55dfa2a7 Mon Sep 17 00:00:00 2001 From: Paul Lawrence Date: Wed, 11 Jan 2023 15:40:14 -0800 Subject: [PATCH 451/457] ANDROID: fuse-bpf: Fix crash from assuming iter is kvec Note that this just stops the crash, it does not correctly set the buffer. However, since no current use case in Android requires the buffer, this is tolerable for now. Bug: 265206112 Test: atest android.scopedstorage.cts.host.ScopedStorageHostTest crashes without this, passes with it Change-Id: I25efac2b1d38fa54b9f26a3f297196fa79e5e7c3 Signed-off-by: Paul Lawrence --- fs/fuse/backing.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/fuse/backing.c b/fs/fuse/backing.c index 22656a0624ba..e784c17306db 100644 --- a/fs/fuse/backing.c +++ b/fs/fuse/backing.c @@ -920,7 +920,8 @@ int fuse_file_write_iter_initialize( .in_args[0].size = sizeof(fwio->fwi), .in_args[0].value = &fwio->fwi, .in_args[1].size = fwio->fwi.size, - .in_args[1].value = from->kvec->iov_base, + .in_args[1].value = iov_iter_is_kvec(from) + ? from->kvec->iov_base : NULL, .out_numargs = 1, .out_args[0].size = sizeof(fwio->fwio), .out_args[0].value = &fwio->fwio, From 683603b6bf70eeb9bf6843dbd46d3c2507960d86 Mon Sep 17 00:00:00 2001 From: Chris Goldsworthy Date: Tue, 1 Nov 2022 12:19:14 -0700 Subject: [PATCH 452/457] ANDROID: mm: arm64: Allow remapping logical mappings as uncached Add a function to remap memory in the logical mapping as uncached. 
This is intended to be called before calling remove_memory() in instances where clean evictions cannot be tolerated after logical mapping removed. Bug: 256898253 Change-Id: I10090c7423c5dcf43d4e706ea62f86eab7b997b7 Signed-off-by: Chris Goldsworthy --- arch/arm64/include/asm/set_memory.h | 1 + arch/arm64/mm/pageattr.c | 17 +++++++++++++++++ include/linux/set_memory.h | 2 ++ mm/memory.c | 11 +++++++++++ 4 files changed, 31 insertions(+) diff --git a/arch/arm64/include/asm/set_memory.h b/arch/arm64/include/asm/set_memory.h index 0f740b781187..0ad879e4893f 100644 --- a/arch/arm64/include/asm/set_memory.h +++ b/arch/arm64/include/asm/set_memory.h @@ -10,6 +10,7 @@ bool can_set_direct_map(void); int set_memory_valid(unsigned long addr, int numpages, int enable); +int arch_set_direct_map_range_uncached(unsigned long addr, unsigned long numpages); int set_direct_map_invalid_noflush(struct page *page); int set_direct_map_default_noflush(struct page *page); bool kernel_page_present(struct page *page); diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index 5922178d7a06..debdecfd0d4d 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c @@ -160,6 +160,23 @@ int set_memory_valid(unsigned long addr, int numpages, int enable) __pgprot(PTE_VALID)); } +/* + * Only to be used with memory in the logical map (e.g. vmapped memory will + * face coherency issues as we don't call vm_unmap_aliases()). Only to be used + * whilst accesses are not ongoing to the region, as we do not follow the + * make-before-break sequence in order to cut down the run time of this + * function. + */ +int arch_set_direct_map_range_uncached(unsigned long addr, unsigned long numpages) +{ + if (!can_set_direct_map()) + return 0; + + return __change_memory_common(addr, PAGE_SIZE * numpages, + __pgprot(PTE_ATTRINDX(MT_NORMAL_NC)), + __pgprot(PTE_ATTRINDX_MASK)); +} + int set_direct_map_invalid_noflush(struct page *page) { struct page_change_data data = { diff --git a/include/linux/set_memory.h b/include/linux/set_memory.h index 369769ce7399..68c9116a0805 100644 --- a/include/linux/set_memory.h +++ b/include/linux/set_memory.h @@ -56,6 +56,8 @@ static inline int clear_mce_nospec(unsigned long pfn) } #endif +int set_direct_map_range_uncached(unsigned long addr, unsigned long numpages); + #ifndef CONFIG_ARCH_HAS_MEM_ENCRYPT static inline int set_memory_encrypted(unsigned long addr, int numpages) { diff --git a/mm/memory.c b/mm/memory.c index dcb42980fddf..26835349cace 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -77,6 +77,7 @@ #include #include #include +#include #include @@ -5851,3 +5852,13 @@ void ptlock_free(struct page *page) kmem_cache_free(page_ptl_cachep, page->ptl); } #endif + +int set_direct_map_range_uncached(unsigned long addr, unsigned long numpages) +{ +#ifdef CONFIG_ARM64 + return arch_set_direct_map_range_uncached(addr, numpages); +#else + return -EOPNOTSUPP; +#endif +} +EXPORT_SYMBOL_GPL(set_direct_map_range_uncached); From f70e13c34987fda9e65ae7c18b31fa2deb8c0b7c Mon Sep 17 00:00:00 2001 From: Subash Abhinov Kasiviswanathan Date: Tue, 3 Jan 2023 12:32:15 -0700 Subject: [PATCH 453/457] ANDROID: GKI: Enable CONFIG_NF_CONNTRACK_PROCFS This is needed for connection tracking functionality. 
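As a side note (not part of this change): CONFIG_NF_CONNTRACK_PROCFS exposes the legacy procfs view of the conntrack table, so once the option is enabled the tracked connections can be inspected from userspace by reading /proc/net/nf_conntrack. A minimal reader sketch, assuming only standard libc:

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/net/nf_conntrack", "r");
	char line[512];

	if (!f) {
		perror("fopen /proc/net/nf_conntrack");
		return 1;
	}
	/* each line describes one tracked connection */
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}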
Bug: 262352213 Change-Id: Ifb44cdb7e93a6b89c7e071ab6fdf737cf2461221 Signed-off-by: Subash Abhinov Kasiviswanathan --- arch/arm64/configs/gki_defconfig | 1 + arch/x86/configs/gki_defconfig | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/arm64/configs/gki_defconfig b/arch/arm64/configs/gki_defconfig index b733a1dad78e..e6836ce9c0c2 100644 --- a/arch/arm64/configs/gki_defconfig +++ b/arch/arm64/configs/gki_defconfig @@ -144,6 +144,7 @@ CONFIG_IPV6_MROUTE=y CONFIG_NETFILTER=y CONFIG_NF_CONNTRACK=y CONFIG_NF_CONNTRACK_SECMARK=y +CONFIG_NF_CONNTRACK_PROCFS=y CONFIG_NF_CONNTRACK_EVENTS=y CONFIG_NF_CONNTRACK_AMANDA=y CONFIG_NF_CONNTRACK_FTP=y diff --git a/arch/x86/configs/gki_defconfig b/arch/x86/configs/gki_defconfig index 0978b0ab7dd2..d3c59b668ae0 100644 --- a/arch/x86/configs/gki_defconfig +++ b/arch/x86/configs/gki_defconfig @@ -132,6 +132,7 @@ CONFIG_IPV6_MROUTE=y CONFIG_NETFILTER=y CONFIG_NF_CONNTRACK=y CONFIG_NF_CONNTRACK_SECMARK=y +CONFIG_NF_CONNTRACK_PROCFS=y CONFIG_NF_CONNTRACK_EVENTS=y CONFIG_NF_CONNTRACK_AMANDA=y CONFIG_NF_CONNTRACK_FTP=y From 4dea3c22ab65899917b6e68a2277c19e5469f67d Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 13 Jan 2023 00:23:41 +0000 Subject: [PATCH 454/457] ANDROID: fscrypt, blk-crypto: drop HW-wrapped key compatibility check In the updated HW-wrapped key code in the android14 kernels, HW-wrapped keys are only allowed on a multi-block-device filesystem if they have a compatible HW-wrapped keys implementation. While in principle this is a good thing to check, my implementation of it, which simply checks whether the block devices have the same crypto profiles, doesn't work when device-mapper is being used. To actually do that check correctly, I think we'd need to add a HW-wrapped keys implementation name or ID to the crypto capabilities. That being said, in Android the HW-wrapped keys implementation is a global thing anyway. So in the interest of not overcomplicating things, for now let's just drop these extra checks that are causing problems. Bug: 160883801 Bug: 265180564 Fixes: 2fd53f809834 ("ANDROID: fscrypt: add support for hardware-wrapped keys") Fixes: 139dbaa221e2 ("ANDROID: update "block: add basic hardware-wrapped key support" to v7") Change-Id: Ia49d62cc2c56447fb898f19bf67df1a38af379f8 Signed-off-by: Eric Biggers --- block/blk-crypto-profile.c | 18 +----------------- fs/crypto/inline_crypt.c | 24 +----------------------- include/linux/blk-crypto.h | 3 --- 3 files changed, 2 insertions(+), 43 deletions(-) diff --git a/block/blk-crypto-profile.c b/block/blk-crypto-profile.c index 6b9ac2596dba..757d0fc4fc00 100644 --- a/block/blk-crypto-profile.c +++ b/block/blk-crypto-profile.c @@ -468,9 +468,7 @@ EXPORT_SYMBOL_GPL(blk_crypto_register); /** * blk_crypto_derive_sw_secret() - Derive software secret from wrapped key - * @bdev: a block device whose hardware-wrapped keys implementation is - * compatible (blk_crypto_hw_wrapped_keys_compatible()) with all block - * devices on which the key will be used. 
+ * @bdev: a block device that supports hardware-wrapped keys * @eph_key: the hardware-wrapped key in ephemerally-wrapped form * @eph_key_size: size of @eph_key in bytes * @sw_secret: (output) the software secret @@ -507,20 +505,6 @@ int blk_crypto_derive_sw_secret(struct block_device *bdev, } EXPORT_SYMBOL_GPL(blk_crypto_derive_sw_secret); -/** - * blk_crypto_hw_wrapped_keys_compatible() - Check HW-wrapped key compatibility - * @bdev1: the first block device - * @bdev2: the second block device - * - * Return: true if HW-wrapped keys used on @bdev1 can also be used on @bdev2. - */ -bool blk_crypto_hw_wrapped_keys_compatible(struct block_device *bdev1, - struct block_device *bdev2) -{ - return bdev_get_queue(bdev1)->crypto_profile == - bdev_get_queue(bdev2)->crypto_profile; -} - /** * blk_crypto_intersect_capabilities() - restrict supported crypto capabilities * by child device diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c index 8d33accf0670..64fff3895a22 100644 --- a/fs/crypto/inline_crypt.c +++ b/fs/crypto/inline_crypt.c @@ -243,9 +243,6 @@ int fscrypt_derive_sw_secret(struct super_block *sb, const u8 *wrapped_key, size_t wrapped_key_size, u8 sw_secret[BLK_CRYPTO_SW_SECRET_SIZE]) { - struct block_device **devs; - unsigned int num_devs; - unsigned int i; int err; /* The filesystem must be mounted with -o inlinecrypt. */ @@ -256,31 +253,12 @@ int fscrypt_derive_sw_secret(struct super_block *sb, return -EOPNOTSUPP; } - /* - * Hardware-wrapped keys might be specific to a particular storage - * device, so for now we don't allow them to be used if the filesystem - * uses block devices with different crypto profiles. This way, there - * is no ambiguity about which ->derive_sw_secret method to call. - */ - devs = fscrypt_get_devices(sb, &num_devs); - if (IS_ERR(devs)) - return PTR_ERR(devs); - for (i = 1; i < num_devs; i++) { - if (!blk_crypto_hw_wrapped_keys_compatible(devs[0], devs[i])) { - fscrypt_warn(NULL, - "%s: unsupported multi-device configuration for hardware-wrapped keys", - sb->s_id); - kfree(devs); - return -EOPNOTSUPP; - } - } - err = blk_crypto_derive_sw_secret(devs[0], wrapped_key, + err = blk_crypto_derive_sw_secret(sb->s_bdev, wrapped_key, wrapped_key_size, sw_secret); if (err == -EOPNOTSUPP) fscrypt_warn(NULL, "%s: block device doesn't support hardware-wrapped keys\n", sb->s_id); - kfree(devs); return err; } diff --git a/include/linux/blk-crypto.h b/include/linux/blk-crypto.h index a8a7cd233b26..ef771c94f59a 100644 --- a/include/linux/blk-crypto.h +++ b/include/linux/blk-crypto.h @@ -163,9 +163,6 @@ int blk_crypto_derive_sw_secret(struct block_device *bdev, const u8 *eph_key, size_t eph_key_size, u8 sw_secret[BLK_CRYPTO_SW_SECRET_SIZE]); -bool blk_crypto_hw_wrapped_keys_compatible(struct block_device *bdev1, - struct block_device *bdev2); - #else /* CONFIG_BLK_INLINE_ENCRYPTION */ static inline bool bio_has_crypt_ctx(struct bio *bio) From a2a9e34d164e90fc08d35fd097a164b9101d72ef Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 19 Dec 2022 19:09:18 +0100 Subject: [PATCH 455/457] FROMLIST: kasan: allow sampling page_alloc allocations for HW_TAGS [The patch is in mm-unstable tree.] As Hardware Tag-Based KASAN is intended to be used in production, its performance impact is crucial. As page_alloc allocations tend to be big, tagging and checking all such allocations can introduce a significant slowdown. 
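For illustration only (the values here are examples, not recommendations made by this patch), the two boot parameters introduced below would typically be set on the kernel command line, e.g.:

	kasan.page_alloc.sample=10 kasan.page_alloc.sample.order=3

With these values, only every 10th page_alloc allocation of order 3 or higher is tagged and checked, while allocations below order 3 are still tagged unconditionally.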
Add two new boot parameters that allow to alleviate that slowdown: - kasan.page_alloc.sample, which makes Hardware Tag-Based KASAN tag only every Nth page_alloc allocation with the order configured by the second added parameter (default: tag every such allocation). - kasan.page_alloc.sample.order, which makes sampling enabled by the first parameter only affect page_alloc allocations with the order equal or greater than the specified value (default: 3, see below). The exact performance improvement caused by using the new parameters depends on their values and the applied workload. The chosen default value for kasan.page_alloc.sample.order is 3, which matches both PAGE_ALLOC_COSTLY_ORDER and SKB_FRAG_PAGE_ORDER. This is done for two reasons: 1. PAGE_ALLOC_COSTLY_ORDER is "the order at which allocations are deemed costly to service", which corresponds to the idea that only large and thus costly allocations are supposed to sampled. 2. One of the workloads targeted by this patch is a benchmark that sends a large amount of data over a local loopback connection. Most multi-page data allocations in the networking subsystem have the order of SKB_FRAG_PAGE_ORDER (or PAGE_ALLOC_COSTLY_ORDER). When running a local loopback test on a testing MTE-enabled device in sync mode, enabling Hardware Tag-Based KASAN introduces a ~50% slowdown. Applying this patch and setting kasan.page_alloc.sampling to a value higher than 1 allows to lower the slowdown. The performance improvement saturates around the sampling interval value of 10 with the default sampling page order of 3. This lowers the slowdown to ~20%. The slowdown in real scenarios involving the network will likely be better. Enabling page_alloc sampling has a downside: KASAN misses bad accesses to a page_alloc allocation that has not been tagged. This lowers the value of KASAN as a security mitigation. However, based on measuring the number of page_alloc allocations of different orders during boot in a test build, sampling with the default kasan.page_alloc.sample.order value affects only ~7% of allocations. The rest ~93% of allocations are still checked deterministically. Link: https://lkml.kernel.org/r/129da0614123bb85ed4dd61ae30842b2dd7c903f.1671471846.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Jann Horn Cc: Mark Brand Cc: Peter Collingbourne Signed-off-by: Andrew Morton Bug: 238286329 Bug: 264310057 Link: https://lore.kernel.org/all/129da0614123bb85ed4dd61ae30842b2dd7c903f.1671471846.git.andreyknvl@google.com Change-Id: Icc7befe61848021c68a12034f426f1c300181ad6 Signed-off-by: Andrey Konovalov --- Documentation/dev-tools/kasan.rst | 17 +++++++++ include/linux/kasan.h | 14 +++++--- mm/kasan/common.c | 9 +++-- mm/kasan/hw_tags.c | 60 +++++++++++++++++++++++++++++++ mm/kasan/kasan.h | 27 ++++++++++++++ mm/page_alloc.c | 43 ++++++++++++++-------- 6 files changed, 149 insertions(+), 21 deletions(-) diff --git a/Documentation/dev-tools/kasan.rst b/Documentation/dev-tools/kasan.rst index 5c93ab915049..e66916a483cd 100644 --- a/Documentation/dev-tools/kasan.rst +++ b/Documentation/dev-tools/kasan.rst @@ -140,6 +140,23 @@ disabling KASAN altogether or controlling its features: - ``kasan.vmalloc=off`` or ``=on`` disables or enables tagging of vmalloc allocations (default: ``on``). 
+- ``kasan.page_alloc.sample=`` makes KASAN tag only every + Nth page_alloc allocation with the order equal or greater than + ``kasan.page_alloc.sample.order``, where N is the value of the ``sample`` + parameter (default: ``1``, or tag every such allocation). + This parameter is intended to mitigate the performance overhead introduced + by KASAN. + Note that enabling this parameter makes Hardware Tag-Based KASAN skip checks + of allocations chosen by sampling and thus miss bad accesses to these + allocations. Use the default value for accurate bug detection. + +- ``kasan.page_alloc.sample.order=`` specifies the minimum + order of allocations that are affected by sampling (default: ``3``). + Only applies when ``kasan.page_alloc.sample`` is set to a value greater + than ``1``. + This parameter is intended to allow sampling only large page_alloc + allocations, which is the biggest source of the performance overhead. + Error reports ~~~~~~~~~~~~~ diff --git a/include/linux/kasan.h b/include/linux/kasan.h index d811b3d7d2a1..629309182a78 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -120,12 +120,13 @@ static __always_inline void kasan_poison_pages(struct page *page, __kasan_poison_pages(page, order, init); } -void __kasan_unpoison_pages(struct page *page, unsigned int order, bool init); -static __always_inline void kasan_unpoison_pages(struct page *page, +bool __kasan_unpoison_pages(struct page *page, unsigned int order, bool init); +static __always_inline bool kasan_unpoison_pages(struct page *page, unsigned int order, bool init) { if (kasan_enabled()) - __kasan_unpoison_pages(page, order, init); + return __kasan_unpoison_pages(page, order, init); + return false; } void __kasan_cache_create_kmalloc(struct kmem_cache *cache); @@ -249,8 +250,11 @@ static __always_inline bool kasan_check_byte(const void *addr) static inline void kasan_unpoison_range(const void *address, size_t size) {} static inline void kasan_poison_pages(struct page *page, unsigned int order, bool init) {} -static inline void kasan_unpoison_pages(struct page *page, unsigned int order, - bool init) {} +static inline bool kasan_unpoison_pages(struct page *page, unsigned int order, + bool init) +{ + return false; +} static inline void kasan_cache_create_kmalloc(struct kmem_cache *cache) {} static inline void kasan_poison_slab(struct slab *slab) {} static inline void kasan_unpoison_object_data(struct kmem_cache *cache, diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 833bf2cfd2a3..1d0008e1c420 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -95,19 +95,24 @@ asmlinkage void kasan_unpoison_task_stack_below(const void *watermark) } #endif /* CONFIG_KASAN_STACK */ -void __kasan_unpoison_pages(struct page *page, unsigned int order, bool init) +bool __kasan_unpoison_pages(struct page *page, unsigned int order, bool init) { u8 tag; unsigned long i; if (unlikely(PageHighMem(page))) - return; + return false; + + if (!kasan_sample_page_alloc(order)) + return false; tag = kasan_random_tag(); kasan_unpoison(set_tag(page_address(page), tag), PAGE_SIZE << order, init); for (i = 0; i < (1 << order); i++) page_kasan_tag_set(page + i, tag); + + return true; } void __kasan_poison_pages(struct page *page, unsigned int order, bool init) diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c index b22c4f461cb0..d1bcb0205327 100644 --- a/mm/kasan/hw_tags.c +++ b/mm/kasan/hw_tags.c @@ -59,6 +59,24 @@ EXPORT_SYMBOL_GPL(kasan_mode); /* Whether to enable vmalloc tagging. 
*/ DEFINE_STATIC_KEY_TRUE(kasan_flag_vmalloc); +#define PAGE_ALLOC_SAMPLE_DEFAULT 1 +#define PAGE_ALLOC_SAMPLE_ORDER_DEFAULT 3 + +/* + * Sampling interval of page_alloc allocation (un)poisoning. + * Defaults to no sampling. + */ +unsigned long kasan_page_alloc_sample = PAGE_ALLOC_SAMPLE_DEFAULT; + +/* + * Minimum order of page_alloc allocations to be affected by sampling. + * The default value is chosen to match both + * PAGE_ALLOC_COSTLY_ORDER and SKB_FRAG_PAGE_ORDER. + */ +unsigned int kasan_page_alloc_sample_order = PAGE_ALLOC_SAMPLE_ORDER_DEFAULT; + +DEFINE_PER_CPU(long, kasan_page_alloc_skip); + /* kasan=off/on */ static int __init early_kasan_flag(char *arg) { @@ -122,6 +140,48 @@ static inline const char *kasan_mode_info(void) return "sync"; } +/* kasan.page_alloc.sample= */ +static int __init early_kasan_flag_page_alloc_sample(char *arg) +{ + int rv; + + if (!arg) + return -EINVAL; + + rv = kstrtoul(arg, 0, &kasan_page_alloc_sample); + if (rv) + return rv; + + if (!kasan_page_alloc_sample || kasan_page_alloc_sample > LONG_MAX) { + kasan_page_alloc_sample = PAGE_ALLOC_SAMPLE_DEFAULT; + return -EINVAL; + } + + return 0; +} +early_param("kasan.page_alloc.sample", early_kasan_flag_page_alloc_sample); + +/* kasan.page_alloc.sample.order= */ +static int __init early_kasan_flag_page_alloc_sample_order(char *arg) +{ + int rv; + + if (!arg) + return -EINVAL; + + rv = kstrtouint(arg, 0, &kasan_page_alloc_sample_order); + if (rv) + return rv; + + if (kasan_page_alloc_sample_order > INT_MAX) { + kasan_page_alloc_sample_order = PAGE_ALLOC_SAMPLE_ORDER_DEFAULT; + return -EINVAL; + } + + return 0; +} +early_param("kasan.page_alloc.sample.order", early_kasan_flag_page_alloc_sample_order); + /* * kasan_init_hw_tags_cpu() is called for each CPU. * Not marked as __init as a CPU can be hot-plugged after boot. diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index abbcc1b0eec5..8b6e09839d8d 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -42,6 +42,10 @@ enum kasan_mode { extern enum kasan_mode kasan_mode __ro_after_init; +extern unsigned long kasan_page_alloc_sample; +extern unsigned int kasan_page_alloc_sample_order; +DECLARE_PER_CPU(long, kasan_page_alloc_skip); + static inline bool kasan_vmalloc_enabled(void) { return static_branch_likely(&kasan_flag_vmalloc); @@ -57,6 +61,24 @@ static inline bool kasan_sync_fault_possible(void) return kasan_mode == KASAN_MODE_SYNC || kasan_mode == KASAN_MODE_ASYMM; } +static inline bool kasan_sample_page_alloc(unsigned int order) +{ + /* Fast-path for when sampling is disabled. */ + if (kasan_page_alloc_sample == 1) + return true; + + if (order < kasan_page_alloc_sample_order) + return true; + + if (this_cpu_dec_return(kasan_page_alloc_skip) < 0) { + this_cpu_write(kasan_page_alloc_skip, + kasan_page_alloc_sample - 1); + return true; + } + + return false; +} + #else /* CONFIG_KASAN_HW_TAGS */ static inline bool kasan_async_fault_possible(void) @@ -69,6 +91,11 @@ static inline bool kasan_sync_fault_possible(void) return true; } +static inline bool kasan_sample_page_alloc(unsigned int order) +{ + return true; +} + #endif /* CONFIG_KASAN_HW_TAGS */ #ifdef CONFIG_KASAN_GENERIC diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 285628e90117..0876589bad80 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1385,6 +1385,8 @@ out: * see the comment next to it. * 3. Skipping poisoning is requested via __GFP_SKIP_KASAN_POISON, * see the comment next to it. + * 4. 
The allocation is excluded from being checked due to sampling, + * see the call to kasan_unpoison_pages. * * Poisoning pages during deferred memory init will greatly lengthen the * process and cause problem in large memory systems as the deferred pages @@ -2494,7 +2496,8 @@ inline void post_alloc_hook(struct page *page, unsigned int order, { bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags) && !should_skip_init(gfp_flags); - bool init_tags = init && (gfp_flags & __GFP_ZEROTAGS); + bool zero_tags = init && (gfp_flags & __GFP_ZEROTAGS); + bool reset_tags = !zero_tags; int i; set_page_private(page, 0); @@ -2517,30 +2520,42 @@ inline void post_alloc_hook(struct page *page, unsigned int order, */ /* - * If memory tags should be zeroed (which happens only when memory - * should be initialized as well). + * If memory tags should be zeroed + * (which happens only when memory should be initialized as well). */ - if (init_tags) { + if (zero_tags) { /* Initialize both memory and tags. */ for (i = 0; i != 1 << order; ++i) tag_clear_highpage(page + i); - /* Note that memory is already initialized by the loop above. */ + /* Take note that memory was initialized by the loop above. */ init = false; } if (!should_skip_kasan_unpoison(gfp_flags)) { - /* Unpoison shadow memory or set memory tags. */ - kasan_unpoison_pages(page, order, init); - - /* Note that memory is already initialized by KASAN. */ - if (kasan_has_integrated_init()) - init = false; - } else { - /* Ensure page_address() dereferencing does not fault. */ + /* Try unpoisoning (or setting tags) and initializing memory. */ + if (kasan_unpoison_pages(page, order, init)) { + /* Take note that memory was initialized by KASAN. */ + if (kasan_has_integrated_init()) + init = false; + /* Take note that memory tags were set by KASAN. */ + reset_tags = false; + } else { + /* + * KASAN decided to exclude this allocation from being + * poisoned due to sampling. Skip poisoning as well. + */ + SetPageSkipKASanPoison(page); + } + } + /* + * If memory tags have not been set, reset the page tags to ensure + * page_address() dereferencing does not fault. + */ + if (reset_tags) { for (i = 0; i != 1 << order; ++i) page_kasan_tag_reset(page + i); } - /* If memory is still not initialized, do it now. */ + /* If memory is still not initialized, initialize it now. */ if (init) kernel_init_pages(page, 1 << order); /* Propagate __GFP_SKIP_KASAN_POISON to page flags. */ From 435e2a6a6c8ba8d0eb55f9aaade53e7a3957322b Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 11 Jan 2023 02:19:46 +0100 Subject: [PATCH 456/457] ANDROID: gki_defconfig: sample large page_alloc allocations with HW_TAGS KASAN Add kasan.page_alloc.sample=10 to CONFIG_CMDLINE in gki_defconfig to make Hardware Tag-Based (MTE) KASAN tag only one out of every 10 page_alloc allocations with the order equal or larger than 3, which the omitted default value for the kasan.page_alloc.sample.order parameter. As Hardware Tag-Based KASAN is intended to be used in production, its performance impact is crucial. As page_alloc allocations tend to be big, tagging and checking all such allocations can introduce a significant slowdown. When running a local loopback test on a testing MTE-enabled device in sync mode, enabling Hardware Tag-Based KASAN introduces a ~50% slowdown. Setting kasan.page_alloc.sampling to a value higher than 1 allows to lower the slowdown. The performance improvement saturates around the sampling interval value of 10 with the default sampling page order of 3, see b/238286329. 
This lowers the slowdown to ~20%. The slowdown in real scenarios involving the network will likely be better. Enabling page_alloc sampling has a downside: KASAN misses bad accesses to a page_alloc allocation that has not been tagged. This lowers the value of KASAN as a security mitigation. However, based on measuring the number of page_alloc allocations of different orders during boot in a test build, sampling with the default kasan.page_alloc.sample.order value affects only ~7% of allocations. The rest ~93% of allocations are still checked deterministically. Bug: 238286329 Bug: 264310057 Change-Id: Idfc8600d1f7cc7af28482ff2c8e8ad5ad5948058 Signed-off-by: Andrey Konovalov --- arch/arm64/configs/gki_defconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/configs/gki_defconfig b/arch/arm64/configs/gki_defconfig index e6836ce9c0c2..364672cccecc 100644 --- a/arch/arm64/configs/gki_defconfig +++ b/arch/arm64/configs/gki_defconfig @@ -57,7 +57,7 @@ CONFIG_CP15_BARRIER_EMULATION=y CONFIG_SETEND_EMULATION=y CONFIG_RANDOMIZE_BASE=y # CONFIG_RANDOMIZE_MODULE_REGION_FULL is not set -CONFIG_CMDLINE="console=ttynull stack_depot_disable=on cgroup_disable=pressure kasan.stacktrace=off kvm-arm.mode=protected bootconfig ioremap_guard" +CONFIG_CMDLINE="console=ttynull stack_depot_disable=on cgroup_disable=pressure kasan.page_alloc.sample=10 kasan.stacktrace=off kvm-arm.mode=protected bootconfig ioremap_guard" CONFIG_CMDLINE_EXTEND=y # CONFIG_DMI is not set CONFIG_PM_WAKELOCKS=y From 09ad10d4ee63f8983acad5515463dd202cc83054 Mon Sep 17 00:00:00 2001 From: Peter Griffin Date: Fri, 6 Jan 2023 23:36:38 +0000 Subject: [PATCH 457/457] ANDROID: GKI: Remove CONFIG_LOCALVERSION="-mainline" on 6.1 branch Appending -mainline should not be done on the v6.1 branch. Since the BRANCH constant was correctly set in e486eabf3a8c5c95b44aa2f540f3cc25c4d34f8b having CONFIG_LOCALVERSION also set causes the bazel //common:kernel_aarch64_test test_vmlinux_contains_scmversion test to fail. Bug: 264649337 Bug: 259701619 Change-Id: I9b7235bcbfdb2fafb4699dcc0e7bf809c68b608a Signed-off-by: Peter Griffin --- arch/arm64/configs/gki_defconfig | 1 - arch/x86/configs/gki_defconfig | 1 - 2 files changed, 2 deletions(-) diff --git a/arch/arm64/configs/gki_defconfig b/arch/arm64/configs/gki_defconfig index 364672cccecc..69d2ec28f629 100644 --- a/arch/arm64/configs/gki_defconfig +++ b/arch/arm64/configs/gki_defconfig @@ -1,5 +1,4 @@ CONFIG_UAPI_HEADER_TEST=y -CONFIG_LOCALVERSION="-mainline" CONFIG_AUDIT=y CONFIG_NO_HZ=y CONFIG_HIGH_RES_TIMERS=y diff --git a/arch/x86/configs/gki_defconfig b/arch/x86/configs/gki_defconfig index d3c59b668ae0..109d0ec44fce 100644 --- a/arch/x86/configs/gki_defconfig +++ b/arch/x86/configs/gki_defconfig @@ -1,5 +1,4 @@ CONFIG_UAPI_HEADER_TEST=y -CONFIG_LOCALVERSION="-mainline" CONFIG_KERNEL_LZ4=y CONFIG_AUDIT=y CONFIG_NO_HZ=y