From ddf142e5a817e3a260e5dedce0cd29db6fbaa010 Mon Sep 17 00:00:00 2001
From: Jong eon Park
Date: Wed, 29 Nov 2023 16:50:44 +0900
Subject: [PATCH 001/139] ANDROID: netlink: add netlink poll and hooks

On systems that generate huge numbers of uevents, especially for user
apps with a small rcvbuf socket size, it has been reported that netlink
overruns happen quite frequently. Moreover, if a user app has no handler
for the POLLERR caused by such a netlink overrun, the system can become
almost stuck calling 'poll' repeatedly.

Regarding this issue, I sent a kernel netlink patch to the Linux
maintainers and was told that this is strictly a user-app problem and
must not be addressed in the kernel. Until the Android team looks into
this issue and a fix comes out, we need a temporary kernel patch.

To minimize the effect of this patch on others who have never hit this
issue, I would like to just add netlink's dedicated poll and its hooks.

Please refer to the v1/v2 patch links below for the history.
v1: https://lore.kernel.org/netdev/20231110110002.7279f895@kernel.org/T/#t
v2: https://lore.kernel.org/netdev/d599922fd89b3e61c7cf531a03ea8b81cbcb003e.camel@redhat.com/T/#t

Bug: 300009377
Link: https://lore.kernel.org/netdev/d599922fd89b3e61c7cf531a03ea8b81cbcb003e.camel@redhat.com/T/#t
Change-Id: I4f11399d61c10332ba05bac64cfa1e92bb111565
Signed-off-by: Jong eon Park
---
 drivers/android/vendor_hooks.c |  1 +
 include/trace/hooks/net.h      |  7 +++++++
 net/netlink/af_netlink.c       | 14 ++++++++++++--
 3 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/drivers/android/vendor_hooks.c b/drivers/android/vendor_hooks.c
index 8425e8709b41..789fa7beea83 100644
--- a/drivers/android/vendor_hooks.c
+++ b/drivers/android/vendor_hooks.c
@@ -364,3 +364,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mmc_blk_mq_rw_recovery);
 EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_sd_update_bus_speed_mode);
 EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_slab_folio_alloced);
 EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_kmalloc_large_alloced);
+EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_netlink_poll);
diff --git a/include/trace/hooks/net.h b/include/trace/hooks/net.h
index 50988f672216..835943c31f3d 100644
--- a/include/trace/hooks/net.h
+++ b/include/trace/hooks/net.h
@@ -25,6 +25,13 @@ DECLARE_RESTRICTED_HOOK(android_rvh_sk_alloc,
 DECLARE_RESTRICTED_HOOK(android_rvh_sk_free,
 	TP_PROTO(struct sock *sock),
 	TP_ARGS(sock), 1);
+struct poll_table_struct;
+typedef struct poll_table_struct poll_table;
+DECLARE_HOOK(android_vh_netlink_poll,
+	TP_PROTO(struct file *file, struct socket *sock, poll_table *wait,
+		__poll_t *mask),
+	TP_ARGS(file, sock, wait, mask));
+
 /* macro versions of hooks are no longer required */
 
 #endif /* _TRACE_HOOK_NET_VH_H */
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index cb833302270a..5b328a82ea70 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -71,7 +71,8 @@
 #include
 #define CREATE_TRACE_POINTS
 #include
-
+#undef CREATE_TRACE_POINTS
+#include
 #include "af_netlink.h"
 
 struct listeners {
@@ -1966,6 +1967,15 @@ out:
 	return err ? 
: copied; } +static __poll_t netlink_poll(struct file *file, struct socket *sock, + poll_table *wait) +{ + __poll_t mask = datagram_poll(file, sock, wait); + + trace_android_vh_netlink_poll(file, sock, wait, &mask); + return mask; +} + static void netlink_data_ready(struct sock *sk) { BUG(); @@ -2766,7 +2776,7 @@ static const struct proto_ops netlink_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = netlink_getname, - .poll = datagram_poll, + .poll = netlink_poll, .ioctl = netlink_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, From 2d8a5ddebb22fd1dab4c2fe3a7ba978be62423a3 Mon Sep 17 00:00:00 2001 From: Richard Chang Date: Thu, 7 Dec 2023 07:31:31 +0000 Subject: [PATCH 002/139] ANDROID: Update the ABI symbol list Adding the following symbol: - __cma_alloc Bug: 308881290 Change-Id: I5b3ffb0c804dc636355c1462aaa6e96b1189446b Signed-off-by: Richard Chang --- android/abi_gki_aarch64.stg | 18 ++++++++++++++++++ android/abi_gki_aarch64_pixel | 1 + 2 files changed, 19 insertions(+) diff --git a/android/abi_gki_aarch64.stg b/android/abi_gki_aarch64.stg index 2a708ffe2af0..b6bcc62d2c90 100644 --- a/android/abi_gki_aarch64.stg +++ b/android/abi_gki_aarch64.stg @@ -327602,6 +327602,14 @@ function { parameter_id: 0x4585663f parameter_id: 0x6d7f5ff6 } +function { + id: 0xb94f7fed + return_type_id: 0x06835e9c + parameter_id: 0x1023f4f6 + parameter_id: 0x33756485 + parameter_id: 0x4585663f + parameter_id: 0xf1a6dfed +} function { id: 0xb957d705 return_type_id: 0x6720d32f @@ -331936,6 +331944,15 @@ elf_symbol { type_id: 0x9b8e2bf2 full_name: "__clocksource_register_scale" } +elf_symbol { + id: 0xc7d06fb9 + name: "__cma_alloc" + is_defined: true + symbol_type: FUNCTION + crc: 0x5dbf1ea9 + type_id: 0xb94f7fed + full_name: "__cma_alloc" +} elf_symbol { id: 0xac1ff1ce name: "__const_udelay" @@ -397960,6 +397977,7 @@ interface { symbol_id: 0x6a30419a symbol_id: 0x021741b4 symbol_id: 0x9339caba + symbol_id: 0xc7d06fb9 symbol_id: 0xac1ff1ce symbol_id: 0xba429af2 symbol_id: 0xe495eb53 diff --git a/android/abi_gki_aarch64_pixel b/android/abi_gki_aarch64_pixel index 8da36c314c22..d19d3f7e39aa 100644 --- a/android/abi_gki_aarch64_pixel +++ b/android/abi_gki_aarch64_pixel @@ -188,6 +188,7 @@ clockevents_config_and_register clocks_calc_mult_shift __clocksource_register_scale + __cma_alloc cma_alloc cma_for_each_area cma_get_name From a9567a35d0b87f17387ee2a86f6092aa6c1c85d0 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Mon, 11 Dec 2023 19:53:17 +0000 Subject: [PATCH 003/139] ANDROID: arm64: Disable workaround for CPU errata 2441007 and 2441009 CPU errata 2441007 (Cortex-A55) and 2441009 (Cortex-A510) are categorised as "rare" by Arm and consequently the workaround is not intended to be deployed in practice as the issue is not expected to occur in real-world environments. Given that the cost of the workaround, which issues additional broadcast TLB invalidation requests, has been shown to impact kswapd significantly on Pixel devices, disable the workaround following Arm's recommendation. 
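Note that only the "default y" lines are removed below; both options remain
selectable, so a platform that still wants the mitigation can opt back in
explicitly. An illustrative defconfig fragment (assumed usage, not part of
this patch):

    CONFIG_ARM64_ERRATUM_2441007=y
    CONFIG_ARM64_ERRATUM_2441009=y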
Bug: 306231846
Signed-off-by: Will Deacon
Change-Id: I39b6d9736cfa79827321151b45774f62c8d1a747
(cherry picked from commit 4ba6c3197cb6f0e11cb8af10bb0924ba9d73c110)
---
 arch/arm64/Kconfig | 2 --
 1 file changed, 2 deletions(-)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 7dafeacab872..ce95f67faa64 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -642,7 +642,6 @@ config ARM64_WORKAROUND_REPEAT_TLBI
 
 config ARM64_ERRATUM_2441007
 	bool "Cortex-A55: Completion of affected memory accesses might not be guaranteed by completion of a TLBI"
-	default y
 	select ARM64_WORKAROUND_REPEAT_TLBI
 	help
 	  This option adds a workaround for ARM Cortex-A55 erratum #2441007.
@@ -881,7 +880,6 @@ config ARM64_ERRATUM_2224489
 
 config ARM64_ERRATUM_2441009
 	bool "Cortex-A510: Completion of affected memory accesses might not be guaranteed by completion of a TLBI"
-	default y
 	select ARM64_WORKAROUND_REPEAT_TLBI
 	help
 	  This option adds a workaround for ARM Cortex-A510 erratum #2441009.

From 071c14698cf57357fba712330849a1515a993960 Mon Sep 17 00:00:00 2001
From: Guan-Yu Lin
Date: Thu, 16 Nov 2023 16:32:16 +0800
Subject: [PATCH 004/139] FROMGIT: usb: typec: tcpm: skip checking
 port->send_discover in PD3.0

The original Collision Avoidance mechanism, port->send_discover, avoids
the conflict when port partners start AMS at almost the same time.
However, this mechanism is replaced by SINK_TX_OK and SINK_TX_NG. Skip
the check in PD3.0 to avoid the deadlock when the source is requesting
DR_SWAP while the sink is requesting DISCOVER_IDENTITY.

Signed-off-by: Guan-Yu Lin
Reviewed-by: Heikki Krogerus
Reviewed-by: Guenter Roeck
Link: https://lore.kernel.org/r/20231116083221.1201892-1-guanyulin@google.com
Signed-off-by: Greg Kroah-Hartman
Bug: 292178486
(cherry picked from commit e0cc05d52ad310cced029449bcda0f9fc847097c
 https://git.kernel.org/pub/scm/linux/kernel/git/gregkh/usb.git/ usb-next)
Change-Id: I4691628d8085dfa7be9189b2bd598896664c38b5
Signed-off-by: Guan-Yu Lin
---
 drivers/usb/typec/tcpm/tcpm.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/usb/typec/tcpm/tcpm.c b/drivers/usb/typec/tcpm/tcpm.c
index f00e69bf4c64..c3dd132f02fe 100644
--- a/drivers/usb/typec/tcpm/tcpm.c
+++ b/drivers/usb/typec/tcpm/tcpm.c
@@ -2855,7 +2855,7 @@ static void tcpm_pd_ctrl_request(struct tcpm_port *port,
 					   PD_MSG_CTRL_NOT_SUPP,
 					   NONE_AMS);
 		} else {
-			if (port->send_discover) {
+			if (port->send_discover && port->negotiated_rev < PD_REV30) {
 				tcpm_queue_message(port, PD_MSG_CTRL_WAIT);
 				break;
 			}
@@ -2871,7 +2871,7 @@ static void tcpm_pd_ctrl_request(struct tcpm_port *port,
 					   PD_MSG_CTRL_NOT_SUPP,
 					   NONE_AMS);
 		} else {
-			if (port->send_discover) {
+			if (port->send_discover && port->negotiated_rev < PD_REV30) {
 				tcpm_queue_message(port, PD_MSG_CTRL_WAIT);
 				break;
 			}
@@ -2880,7 +2880,7 @@ static void tcpm_pd_ctrl_request(struct tcpm_port *port,
 		}
 		break;
 	case PD_CTRL_VCONN_SWAP:
-		if (port->send_discover) {
+		if (port->send_discover && port->negotiated_rev < PD_REV30) {
 			tcpm_queue_message(port, PD_MSG_CTRL_WAIT);
 			break;
 		}

From b2b3a1e6d1bb749bda9eb920ac70a92ff04c41b9 Mon Sep 17 00:00:00 2001
From: Joerg Roedel
Date: Wed, 21 Jun 2023 17:42:42 +0200
Subject: [PATCH 005/139] UPSTREAM: x86/sev: Check IOBM for IOIO exceptions
 from user-space

Upstream commit: b9cb9c45583b911e0db71d09caa6b56469eb2bdf

Check the IO permission bitmap (if present) before emulating IOIO #VC
exceptions for user-space. 
These permissions are checked by hardware already before the #VC is raised, but due to the VC-handler decoding race it needs to be checked again in software. Bug: 309733863 Fixes: 25189d08e516 ("x86/sev-es: Add support for handling IOIO exceptions") Reported-by: Tom Dohrmann Signed-off-by: Joerg Roedel Signed-off-by: Borislav Petkov (AMD) Tested-by: Tom Dohrmann Cc: Signed-off-by: Greg Kroah-Hartman (cherry picked from commit def94eb9a804acdcdba5b959ad72cf9119f03f3b) Signed-off-by: Lee Jones Change-Id: Ia520acc67da21353148fd07a3a8e48ee8a97d364 --- arch/x86/boot/compressed/sev.c | 5 +++++ arch/x86/kernel/sev-shared.c | 22 +++++++++++++++------- arch/x86/kernel/sev.c | 27 +++++++++++++++++++++++++++ 3 files changed, 47 insertions(+), 7 deletions(-) diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index e65f0968e0d9..b9b8ff3fe8e9 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -103,6 +103,11 @@ static enum es_result vc_read_mem(struct es_em_ctxt *ctxt, return ES_OK; } +static enum es_result vc_ioio_check(struct es_em_ctxt *ctxt, u16 port, size_t size) +{ + return ES_OK; +} + #undef __init #undef __pa #define __init diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c index 7dce812ce253..abbe7af14d92 100644 --- a/arch/x86/kernel/sev-shared.c +++ b/arch/x86/kernel/sev-shared.c @@ -693,6 +693,9 @@ static enum es_result vc_insn_string_write(struct es_em_ctxt *ctxt, static enum es_result vc_ioio_exitinfo(struct es_em_ctxt *ctxt, u64 *exitinfo) { struct insn *insn = &ctxt->insn; + size_t size; + u64 port; + *exitinfo = 0; switch (insn->opcode.bytes[0]) { @@ -701,7 +704,7 @@ static enum es_result vc_ioio_exitinfo(struct es_em_ctxt *ctxt, u64 *exitinfo) case 0x6d: *exitinfo |= IOIO_TYPE_INS; *exitinfo |= IOIO_SEG_ES; - *exitinfo |= (ctxt->regs->dx & 0xffff) << 16; + port = ctxt->regs->dx & 0xffff; break; /* OUTS opcodes */ @@ -709,41 +712,43 @@ static enum es_result vc_ioio_exitinfo(struct es_em_ctxt *ctxt, u64 *exitinfo) case 0x6f: *exitinfo |= IOIO_TYPE_OUTS; *exitinfo |= IOIO_SEG_DS; - *exitinfo |= (ctxt->regs->dx & 0xffff) << 16; + port = ctxt->regs->dx & 0xffff; break; /* IN immediate opcodes */ case 0xe4: case 0xe5: *exitinfo |= IOIO_TYPE_IN; - *exitinfo |= (u8)insn->immediate.value << 16; + port = (u8)insn->immediate.value & 0xffff; break; /* OUT immediate opcodes */ case 0xe6: case 0xe7: *exitinfo |= IOIO_TYPE_OUT; - *exitinfo |= (u8)insn->immediate.value << 16; + port = (u8)insn->immediate.value & 0xffff; break; /* IN register opcodes */ case 0xec: case 0xed: *exitinfo |= IOIO_TYPE_IN; - *exitinfo |= (ctxt->regs->dx & 0xffff) << 16; + port = ctxt->regs->dx & 0xffff; break; /* OUT register opcodes */ case 0xee: case 0xef: *exitinfo |= IOIO_TYPE_OUT; - *exitinfo |= (ctxt->regs->dx & 0xffff) << 16; + port = ctxt->regs->dx & 0xffff; break; default: return ES_DECODE_FAILED; } + *exitinfo |= port << 16; + switch (insn->opcode.bytes[0]) { case 0x6c: case 0x6e: @@ -753,12 +758,15 @@ static enum es_result vc_ioio_exitinfo(struct es_em_ctxt *ctxt, u64 *exitinfo) case 0xee: /* Single byte opcodes */ *exitinfo |= IOIO_DATA_8; + size = 1; break; default: /* Length determined by instruction parsing */ *exitinfo |= (insn->opnd_bytes == 2) ? IOIO_DATA_16 : IOIO_DATA_32; + size = (insn->opnd_bytes == 2) ? 
2 : 4; } + switch (insn->addr_bytes) { case 2: *exitinfo |= IOIO_ADDR_16; @@ -774,7 +782,7 @@ static enum es_result vc_ioio_exitinfo(struct es_em_ctxt *ctxt, u64 *exitinfo) if (insn_has_rep_prefix(insn)) *exitinfo |= IOIO_REP; - return ES_OK; + return vc_ioio_check(ctxt, (u16)port, size); } static enum es_result vc_handle_ioio(struct ghcb *ghcb, struct es_em_ctxt *ctxt) diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index afda719dd725..392097f7c241 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -512,6 +512,33 @@ static enum es_result vc_slow_virt_to_phys(struct ghcb *ghcb, struct es_em_ctxt return ES_OK; } +static enum es_result vc_ioio_check(struct es_em_ctxt *ctxt, u16 port, size_t size) +{ + BUG_ON(size > 4); + + if (user_mode(ctxt->regs)) { + struct thread_struct *t = ¤t->thread; + struct io_bitmap *iobm = t->io_bitmap; + size_t idx; + + if (!iobm) + goto fault; + + for (idx = port; idx < port + size; ++idx) { + if (test_bit(idx, iobm->bitmap)) + goto fault; + } + } + + return ES_OK; + +fault: + ctxt->fi.vector = X86_TRAP_GP; + ctxt->fi.error_code = 0; + + return ES_EXCEPTION; +} + /* Include code shared with pre-decompression boot stage */ #include "sev-shared.c" From b2b3a1e6d1bb749bda9eb920ac70a92ff04c41b9 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 16 Oct 2023 14:42:50 +0200 Subject: [PATCH 006/139] UPSTREAM: x86/sev: Check for user-space IOIO pointing to kernel space Upstream commit: 63e44bc52047f182601e7817da969a105aa1f721 Check the memory operand of INS/OUTS before emulating the instruction. The #VC exception can get raised from user-space, but the memory operand can be manipulated to access kernel memory before the emulation actually begins and after the exception handler has run. [ bp: Massage commit message. 
] Bug: 309733863 Fixes: 597cfe48212a ("x86/boot/compressed/64: Setup a GHCB-based VC Exception handler") Reported-by: Tom Dohrmann Signed-off-by: Joerg Roedel Signed-off-by: Borislav Petkov (AMD) Cc: Signed-off-by: Greg Kroah-Hartman (cherry picked from commit 57d0639f60f1ff04cbe7fd52823b94b894d7f812) Signed-off-by: Lee Jones Change-Id: Iac1c2f15cc922ab215d57654b004d020a0b65e53 --- arch/x86/boot/compressed/sev.c | 5 +++++ arch/x86/kernel/sev-shared.c | 31 +++++++++++++++++++++++++++++-- 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index b9b8ff3fe8e9..9c91cc40f456 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -108,6 +108,11 @@ static enum es_result vc_ioio_check(struct es_em_ctxt *ctxt, u16 port, size_t si return ES_OK; } +static bool fault_in_kernel_space(unsigned long address) +{ + return false; +} + #undef __init #undef __pa #define __init diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c index abbe7af14d92..71d8698702ce 100644 --- a/arch/x86/kernel/sev-shared.c +++ b/arch/x86/kernel/sev-shared.c @@ -629,6 +629,23 @@ fail: sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); } +static enum es_result vc_insn_string_check(struct es_em_ctxt *ctxt, + unsigned long address, + bool write) +{ + if (user_mode(ctxt->regs) && fault_in_kernel_space(address)) { + ctxt->fi.vector = X86_TRAP_PF; + ctxt->fi.error_code = X86_PF_USER; + ctxt->fi.cr2 = address; + if (write) + ctxt->fi.error_code |= X86_PF_WRITE; + + return ES_EXCEPTION; + } + + return ES_OK; +} + static enum es_result vc_insn_string_read(struct es_em_ctxt *ctxt, void *src, char *buf, unsigned int data_size, @@ -636,7 +653,12 @@ static enum es_result vc_insn_string_read(struct es_em_ctxt *ctxt, bool backwards) { int i, b = backwards ? -1 : 1; - enum es_result ret = ES_OK; + unsigned long address = (unsigned long)src; + enum es_result ret; + + ret = vc_insn_string_check(ctxt, address, false); + if (ret != ES_OK) + return ret; for (i = 0; i < count; i++) { void *s = src + (i * data_size * b); @@ -657,7 +679,12 @@ static enum es_result vc_insn_string_write(struct es_em_ctxt *ctxt, bool backwards) { int i, s = backwards ? -1 : 1; - enum es_result ret = ES_OK; + unsigned long address = (unsigned long)dst; + enum es_result ret; + + ret = vc_insn_string_check(ctxt, address, true); + if (ret != ES_OK) + return ret; for (i = 0; i < count; i++) { void *d = dst + (i * data_size * s); From bcc758eed789329e391a9a0180262ba51a9e14f9 Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Wed, 13 Dec 2023 09:40:49 +0000 Subject: [PATCH 007/139] Reapply "binder: fix UAF caused by faulty buffer cleanup" This reverts commit 9f67f4f5007d5a8ecc26486f23c1f1d5093e3e9e. Vanir complained that this fix was missing, but only from this branch. Let's bring it back and see how the ABI checker behaves. 
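A quick worked example of the cleanup boundary computed by the helper this
change restores (the numbers are illustrative, not taken from a real
transaction):

    /* With buffer->data_size = 52 and buffer->offsets_size = 16 on a
     * 64-bit kernel:
     *   off_end_offset = ALIGN(52, sizeof(void *)) + 16 = 56 + 16 = 72
     * so binder_release_entire_buffer() walks the offsets section
     * [56, 72) of the buffer, instead of relying on a caller-supplied
     * failed_at of 0. */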
Bug: 275041864 Bug: 308350116 Signed-off-by: Lee Jones Change-Id: I1fc248582347a295d9168bbd8e55dbd6880e34ed --- drivers/android/binder.c | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 766b9d5dffb1..b0188e8ee00b 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -2127,24 +2127,23 @@ static void binder_deferred_fd_close(int fd) static void binder_transaction_buffer_release(struct binder_proc *proc, struct binder_thread *thread, struct binder_buffer *buffer, - binder_size_t failed_at, + binder_size_t off_end_offset, bool is_failure) { int debug_id = buffer->debug_id; - binder_size_t off_start_offset, buffer_offset, off_end_offset; + binder_size_t off_start_offset, buffer_offset; binder_debug(BINDER_DEBUG_TRANSACTION, "%d buffer release %d, size %zd-%zd, failed at %llx\n", proc->pid, buffer->debug_id, buffer->data_size, buffer->offsets_size, - (unsigned long long)failed_at); + (unsigned long long)off_end_offset); if (buffer->target_node) binder_dec_node(buffer->target_node, 1, 0); off_start_offset = ALIGN(buffer->data_size, sizeof(void *)); - off_end_offset = is_failure && failed_at ? failed_at : - off_start_offset + buffer->offsets_size; + for (buffer_offset = off_start_offset; buffer_offset < off_end_offset; buffer_offset += sizeof(binder_size_t)) { struct binder_object_header *hdr; @@ -2304,6 +2303,21 @@ static void binder_transaction_buffer_release(struct binder_proc *proc, } } +/* Clean up all the objects in the buffer */ +static inline void binder_release_entire_buffer(struct binder_proc *proc, + struct binder_thread *thread, + struct binder_buffer *buffer, + bool is_failure) +{ + binder_size_t off_end_offset; + + off_end_offset = ALIGN(buffer->data_size, sizeof(void *)); + off_end_offset += buffer->offsets_size; + + binder_transaction_buffer_release(proc, thread, buffer, + off_end_offset, is_failure); +} + static int binder_translate_binder(struct flat_binder_object *fp, struct binder_transaction *t, struct binder_thread *thread) @@ -3013,7 +3027,7 @@ static int binder_proc_transaction(struct binder_transaction *t, t_outdated->buffer = NULL; buffer->transaction = NULL; trace_binder_transaction_update_buffer_release(buffer); - binder_transaction_buffer_release(proc, NULL, buffer, 0, 0); + binder_release_entire_buffer(proc, NULL, buffer, false); binder_alloc_free_buf(&proc->alloc, buffer); kfree(t_outdated); binder_stats_deleted(BINDER_STAT_TRANSACTION); @@ -4004,7 +4018,7 @@ binder_free_buf(struct binder_proc *proc, binder_node_inner_unlock(buf_node); } trace_binder_transaction_buffer_release(buffer); - binder_transaction_buffer_release(proc, thread, buffer, 0, is_failure); + binder_release_entire_buffer(proc, thread, buffer, is_failure); binder_alloc_free_buf(&proc->alloc, buffer); } From d8d2b95fd0b415ea700d02348ac852f463908950 Mon Sep 17 00:00:00 2001 From: DooHyun Hwang Date: Wed, 13 Dec 2023 15:56:40 +0900 Subject: [PATCH 008/139] ANDROID: ABI: update symbol list for galaxy 2 function symbol(s) added 'int scsi_device_quiesce(struct scsi_device*)' 'void scsi_device_resume(struct scsi_device*)' Bug: 316076675 Change-Id: I301b9445f41736ae485c3779b7164962c17117b2 Signed-off-by: DooHyun Hwang --- android/abi_gki_aarch64.stg | 20 ++++++++++++++++++++ android/abi_gki_aarch64_galaxy | 2 ++ 2 files changed, 22 insertions(+) diff --git a/android/abi_gki_aarch64.stg b/android/abi_gki_aarch64.stg index b6bcc62d2c90..541a5f2beb27 100644 --- a/android/abi_gki_aarch64.stg 
+++ b/android/abi_gki_aarch64.stg
@@ -382267,6 +382267,24 @@ elf_symbol {
   type_id: 0x19c71538
   full_name: "scsi_device_put"
 }
+elf_symbol {
+  id: 0x61df84bc
+  name: "scsi_device_quiesce"
+  is_defined: true
+  symbol_type: FUNCTION
+  crc: 0x0daef571
+  type_id: 0x94dfa784
+  full_name: "scsi_device_quiesce"
+}
+elf_symbol {
+  id: 0x054c0bba
+  name: "scsi_device_resume"
+  is_defined: true
+  symbol_type: FUNCTION
+  crc: 0x069ea5a4
+  type_id: 0x19c71538
+  full_name: "scsi_device_resume"
+}
 elf_symbol {
   id: 0xf10245da
   name: "scsi_dma_map"
@@ -403567,6 +403585,8 @@ interface {
   symbol_id: 0x76dea2aa
   symbol_id: 0x14eb95fa
   symbol_id: 0x474e9bcc
+  symbol_id: 0x61df84bc
+  symbol_id: 0x054c0bba
   symbol_id: 0xf10245da
   symbol_id: 0x18cbd7f9
   symbol_id: 0x30f6b9b1
diff --git a/android/abi_gki_aarch64_galaxy b/android/abi_gki_aarch64_galaxy
index 01f6927b0592..4dfd51b49f67 100644
--- a/android/abi_gki_aarch64_galaxy
+++ b/android/abi_gki_aarch64_galaxy
@@ -274,6 +274,8 @@
   sched_clock
   sched_show_task
   scnprintf
+  scsi_device_quiesce
+  scsi_device_resume
   seq_hex_dump
   seq_lseek
   seq_printf

From 956a0d3998cdc9bfc3608bc358d0fab974bf4905 Mon Sep 17 00:00:00 2001
From: Manish Varma
Date: Mon, 15 Mar 2021 21:40:24 -0700
Subject: [PATCH 009/139] ANDROID: fs: Add vendor hooks for
 ep_create_wakeup_source & timerfd_create

timerfd doesn't create any wakelocks, but eventpoll can. When it does,
it names them after the underlying file descriptor, and since all
timerfd file descriptors are named "[timerfd]" (which saves memory on
systems like desktops with potentially many timerfd instances), all
wakesources created as a result of using the eventpoll-on-timerfd idiom
are called... "[timerfd]". As a result, it becomes impossible to tell
which "[timerfd]" wakesource is affiliated with which process, and hence
troubleshooting is difficult.

Add vendor hooks to allow vendors to assign appropriate names to
timerfd descriptors and eventpoll wakeup sources. 
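A minimal sketch of how a vendor might consume the new hook (illustrative
module code, not part of this patch; the android_vh_netlink_poll hook added
earlier in this series is consumed the same way). The probe appends the
caller's pid/comm so "[timerfd]"-style wakeup sources become
distinguishable:

    #include <linux/kernel.h>
    #include <linux/module.h>
    #include <linux/sched.h>
    #include <linux/string.h>
    #include <trace/hooks/fs.h>

    /* DECLARE_HOOK(android_vh_ep_create_wakeup_source,
     * TP_PROTO(char *name, int len), ...) generates a registration
     * function whose probe takes (void *data, char *name, int len). */
    static void example_ep_ws_name(void *data, char *name, int len)
    {
    	int used = strlen(name);

    	/* Best effort: append ":<pid>:<comm>" only if there is room. */
    	if (used + 1 < len)
    		scnprintf(name + used, len - used, ":%d:%s",
    			  current->pid, current->comm);
    }

    static int __init example_init(void)
    {
    	return register_trace_android_vh_ep_create_wakeup_source(
    			example_ep_ws_name, NULL);
    }
    module_init(example_init);
    MODULE_LICENSE("GPL");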
Bug: 155142106 Signed-off-by: Manish Varma Change-Id: I330a42ab48bed4b26d5eb2f636925c66061165ec (cherry picked from commit 0ff110fbb309be385126a42ac9f7004ba9b0644e) --- drivers/android/vendor_hooks.c | 3 +++ fs/eventpoll.c | 11 +++++++++-- fs/timerfd.c | 9 +++++++-- include/trace/hooks/fs.h | 23 +++++++++++++++++++++++ 4 files changed, 42 insertions(+), 4 deletions(-) create mode 100644 include/trace/hooks/fs.h diff --git a/drivers/android/vendor_hooks.c b/drivers/android/vendor_hooks.c index 789fa7beea83..43eb748c1103 100644 --- a/drivers/android/vendor_hooks.c +++ b/drivers/android/vendor_hooks.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -365,3 +366,5 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_sd_update_bus_speed_mode); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_slab_folio_alloced); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_kmalloc_large_alloced); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_netlink_poll); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ep_create_wakeup_source); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_timerfd_create); diff --git a/fs/eventpoll.c b/fs/eventpoll.c index eccecd3fac90..30217f0fed81 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -39,6 +39,8 @@ #include #include +#include + /* * LOCKING: * There are three level of locking required by epoll : @@ -1373,15 +1375,20 @@ static int ep_create_wakeup_source(struct epitem *epi) { struct name_snapshot n; struct wakeup_source *ws; + char ws_name[64]; + strlcpy(ws_name, "eventpoll", sizeof(ws_name)); + trace_android_vh_ep_create_wakeup_source(ws_name, sizeof(ws_name)); if (!epi->ep->ws) { - epi->ep->ws = wakeup_source_register(NULL, "eventpoll"); + epi->ep->ws = wakeup_source_register(NULL, ws_name); if (!epi->ep->ws) return -ENOMEM; } take_dentry_name_snapshot(&n, epi->ffd.file->f_path.dentry); - ws = wakeup_source_register(NULL, n.name.name); + strlcpy(ws_name, n.name.name, sizeof(ws_name)); + trace_android_vh_ep_create_wakeup_source(ws_name, sizeof(ws_name)); + ws = wakeup_source_register(NULL, ws_name); release_dentry_name_snapshot(&n); if (!ws) diff --git a/fs/timerfd.c b/fs/timerfd.c index e9c96a0c79f1..de8e736bbf7b 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -28,6 +28,8 @@ #include #include +#include + struct timerfd_ctx { union { struct hrtimer tmr; @@ -407,6 +409,7 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags) { int ufd; struct timerfd_ctx *ctx; + char file_name_buf[32]; /* Check the TFD_* constants for consistency. 
*/ BUILD_BUG_ON(TFD_CLOEXEC != O_CLOEXEC); @@ -443,7 +446,9 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags) ctx->moffs = ktime_mono_to_real(0); - ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx, + strlcpy(file_name_buf, "[timerfd]", sizeof(file_name_buf)); + trace_android_vh_timerfd_create(file_name_buf, sizeof(file_name_buf)); + ufd = anon_inode_getfd(file_name_buf, &timerfd_fops, ctx, O_RDWR | (flags & TFD_SHARED_FCNTL_FLAGS)); if (ufd < 0) kfree(ctx); @@ -451,7 +456,7 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags) return ufd; } -static int do_timerfd_settime(int ufd, int flags, +static int do_timerfd_settime(int ufd, int flags, const struct itimerspec64 *new, struct itimerspec64 *old) { diff --git a/include/trace/hooks/fs.h b/include/trace/hooks/fs.h new file mode 100644 index 000000000000..bb8f177db5c1 --- /dev/null +++ b/include/trace/hooks/fs.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM fs + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH trace/hooks + +#if !defined(_TRACE_HOOK_FS_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HOOK_FS_H + +#include + +DECLARE_HOOK(android_vh_ep_create_wakeup_source, + TP_PROTO(char *name, int len), + TP_ARGS(name, len)); + +DECLARE_HOOK(android_vh_timerfd_create, + TP_PROTO(char *name, int len), + TP_ARGS(name, len)); +#endif /* _TRACE_HOOK_FS_H */ + +/* This part must be outside protection */ +#include \ No newline at end of file From cc294d9503f8aa03d45318c0bf2a7870cea9c930 Mon Sep 17 00:00:00 2001 From: Benjamin Schwartz Date: Wed, 13 Dec 2023 16:12:48 -0800 Subject: [PATCH 010/139] ANDROID: Update the ABI symbol list Adding the following symbols: - __traceiter_android_vh_ep_create_wakeup_source - __traceiter_android_vh_timerfd_create - __tracepoint_android_vh_ep_create_wakeup_source - __tracepoint_android_vh_timerfd_create Bug: 155142106 Change-Id: Ie895faefacd62674ac58783ba6a3cd5c3bc46637 Signed-off-by: Benjamin Schwartz --- android/abi_gki_aarch64.stg | 47 +++++++++++++++++++++++++++++++++++ android/abi_gki_aarch64_pixel | 8 ++++++ 2 files changed, 55 insertions(+) diff --git a/android/abi_gki_aarch64.stg b/android/abi_gki_aarch64.stg index 541a5f2beb27..60b8a56bcba4 100644 --- a/android/abi_gki_aarch64.stg +++ b/android/abi_gki_aarch64.stg @@ -318689,6 +318689,13 @@ function { parameter_id: 0x3e10b518 parameter_id: 0x33756485 } +function { + id: 0x9ba47dcc + return_type_id: 0x6720d32f + parameter_id: 0x18bd6530 + parameter_id: 0x0483e6f8 + parameter_id: 0x6720d32f +} function { id: 0x9ba4eebd return_type_id: 0x6720d32f @@ -336481,6 +336488,15 @@ elf_symbol { type_id: 0x9bcd4ff7 full_name: "__traceiter_android_vh_encrypt_page" } +elf_symbol { + id: 0x7f1591a1 + name: "__traceiter_android_vh_ep_create_wakeup_source" + is_defined: true + symbol_type: FUNCTION + crc: 0x1e8ed582 + type_id: 0x9ba47dcc + full_name: "__traceiter_android_vh_ep_create_wakeup_source" +} elf_symbol { id: 0x1921d10d name: "__traceiter_android_vh_exit_check" @@ -337777,6 +337793,15 @@ elf_symbol { type_id: 0x9ab83ca3 full_name: "__traceiter_android_vh_timer_calc_index" } +elf_symbol { + id: 0x641d703d + name: "__traceiter_android_vh_timerfd_create" + is_defined: true + symbol_type: FUNCTION + crc: 0x8c68d59c + type_id: 0x9ba47dcc + full_name: "__traceiter_android_vh_timerfd_create" +} elf_symbol { id: 0x2bc25325 name: "__traceiter_android_vh_try_to_freeze_todo" @@ -340459,6 +340484,15 @@ elf_symbol { type_id: 0x18ccbd2c full_name: "__tracepoint_android_vh_encrypt_page" 
} +elf_symbol { + id: 0xdef7c547 + name: "__tracepoint_android_vh_ep_create_wakeup_source" + is_defined: true + symbol_type: OBJECT + crc: 0x7db48833 + type_id: 0x18ccbd2c + full_name: "__tracepoint_android_vh_ep_create_wakeup_source" +} elf_symbol { id: 0x684e5f4f name: "__tracepoint_android_vh_exit_check" @@ -341755,6 +341789,15 @@ elf_symbol { type_id: 0x18ccbd2c full_name: "__tracepoint_android_vh_timer_calc_index" } +elf_symbol { + id: 0x2df766e3 + name: "__tracepoint_android_vh_timerfd_create" + is_defined: true + symbol_type: OBJECT + crc: 0x181a4352 + type_id: 0x18ccbd2c + full_name: "__tracepoint_android_vh_timerfd_create" +} elf_symbol { id: 0xd9d2bcff name: "__tracepoint_android_vh_try_to_freeze_todo" @@ -398499,6 +398542,7 @@ interface { symbol_id: 0xdcaa59a3 symbol_id: 0x7ebac47a symbol_id: 0xf586d5b6 + symbol_id: 0x7f1591a1 symbol_id: 0x1921d10d symbol_id: 0x1f554c2a symbol_id: 0x343adff1 @@ -398643,6 +398687,7 @@ interface { symbol_id: 0x226cc38b symbol_id: 0xeecc1529 symbol_id: 0xfeff2e7f + symbol_id: 0x641d703d symbol_id: 0x2bc25325 symbol_id: 0x0119fc41 symbol_id: 0xd9f43028 @@ -398941,6 +398986,7 @@ interface { symbol_id: 0x54b2cd01 symbol_id: 0x188eab44 symbol_id: 0xe7584e1c + symbol_id: 0xdef7c547 symbol_id: 0x684e5f4f symbol_id: 0x0d418d38 symbol_id: 0x2121385f @@ -399085,6 +399131,7 @@ interface { symbol_id: 0xa5c71571 symbol_id: 0xfa3284c7 symbol_id: 0x69721329 + symbol_id: 0x2df766e3 symbol_id: 0xd9d2bcff symbol_id: 0x09ba106b symbol_id: 0xf9580976 diff --git a/android/abi_gki_aarch64_pixel b/android/abi_gki_aarch64_pixel index d19d3f7e39aa..7153beba5ba6 100644 --- a/android/abi_gki_aarch64_pixel +++ b/android/abi_gki_aarch64_pixel @@ -1162,7 +1162,10 @@ kernel_param_lock kernel_param_unlock kernel_restart + kernfs_find_and_get_ns + kernfs_notify kernfs_path_from_node + kernfs_put key_create_or_update key_put keyring_alloc @@ -2150,6 +2153,7 @@ thermal_zone_get_temp thermal_zone_get_zone_by_name thread_group_cputime_adjusted + tick_nohz_get_idle_calls_cpu time64_to_tm topology_update_thermal_pressure _totalram_pages @@ -2209,6 +2213,7 @@ __traceiter_android_vh_dup_task_struct __traceiter_android_vh_early_resume_begin __traceiter_android_vh_enable_thermal_genl_check + __traceiter_android_vh_ep_create_wakeup_source __traceiter_android_vh_filemap_get_folio __traceiter_android_vh_ipi_stop __traceiter_android_vh_meminfo_proc_show @@ -2222,6 +2227,7 @@ __traceiter_android_vh_setscheduler_uclamp __traceiter_android_vh_si_meminfo_adjust __traceiter_android_vh_sysrq_crash + __traceiter_android_vh_timerfd_create __traceiter_android_vh_typec_store_partner_src_caps __traceiter_android_vh_typec_tcpci_override_toggling __traceiter_android_vh_typec_tcpm_get_timer @@ -2316,6 +2322,7 @@ __tracepoint_android_vh_dup_task_struct __tracepoint_android_vh_early_resume_begin __tracepoint_android_vh_enable_thermal_genl_check + __tracepoint_android_vh_ep_create_wakeup_source __tracepoint_android_vh_filemap_get_folio __tracepoint_android_vh_ipi_stop __tracepoint_android_vh_meminfo_proc_show @@ -2329,6 +2336,7 @@ __tracepoint_android_vh_setscheduler_uclamp __tracepoint_android_vh_si_meminfo_adjust __tracepoint_android_vh_sysrq_crash + __tracepoint_android_vh_timerfd_create __tracepoint_android_vh_typec_store_partner_src_caps __tracepoint_android_vh_typec_tcpci_override_toggling __tracepoint_android_vh_typec_tcpm_get_timer From cc653d701f72a92103ae30b7573129258d34e5e9 Mon Sep 17 00:00:00 2001 From: Elliot Berman Date: Thu, 14 Dec 2023 15:16:26 -0800 Subject: [PATCH 011/139] ANDROID: virt: 
gunyah: Zero state_data after vcpu_run

Do not re-use stale state_data on subsequent vcpu runs, as the stale
data could be interpreted by Gunyah and rejected.

Bug: 268234781
Change-Id: I3d4bf7a922da1e0e85006ffa58b64a74e320d3c9
Signed-off-by: Elliot Berman
---
 drivers/virt/gunyah/gunyah_vcpu.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/virt/gunyah/gunyah_vcpu.c b/drivers/virt/gunyah/gunyah_vcpu.c
index 82a0cbf55caf..bb13a1aed2e4 100644
--- a/drivers/virt/gunyah/gunyah_vcpu.c
+++ b/drivers/virt/gunyah/gunyah_vcpu.c
@@ -196,6 +196,7 @@ static int gh_vcpu_run(struct gh_vcpu *vcpu)
 	}
 	gh_error = gh_hypercall_vcpu_run(vcpu->rsc->capid, state_data,
 					 &vcpu_run_resp);
+	memset(state_data, 0, sizeof(state_data));
 	if (gh_error == GH_ERROR_OK) {
 		switch (vcpu_run_resp.state) {
 		case GH_VCPU_STATE_READY:

From 01dd8c280b9cfa4b3bbd4a2ffbaa0e07567a5163 Mon Sep 17 00:00:00 2001
From: Will Deacon
Date: Wed, 25 Oct 2023 13:52:57 +0100
Subject: [PATCH 012/139] ANDROID: KVM: arm64: Prefault entries when splitting
 a block mapping

When splitting a block mapping, we install a table entry pointing to an
empty page and recreate the new entries lazily as we fault them in. For
page-tables with the KVM_PGTABLE_S2_IDMAP flag, this can result in
unnecessary translation faults.

When splitting a block for a page-table with KVM_PGTABLE_S2_IDMAP set,
pre-populate the newly allocated page-table page with contiguous ptes
based on the attributes of the block.

Bug: 308373293
Change-Id: I0c53d048de913e193830caef93d75755270db709
Signed-off-by: Will Deacon
Signed-off-by: Keir Fraser
---
 arch/arm64/kvm/hyp/pgtable.c | 22 ++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index b7e8faa894c9..2d11455aabe8 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -775,6 +775,22 @@ static int stage2_map_walk_table_pre(u64 addr, u64 end, u32 level,
 	return 0;
 }
 
+static void stage2_map_prefault_idmap(u64 addr, u32 level, kvm_pte_t *ptep,
+				      kvm_pte_t attr)
+{
+	u64 granule = kvm_granule_size(level);
+	int i;
+
+	if (!kvm_pte_valid(attr))
+		return;
+
+	for (i = 0; i < PTRS_PER_PTE; ++i, ++ptep, addr += granule) {
+		kvm_pte_t pte = kvm_init_valid_leaf_pte(addr, attr, level);
+		/* We can write non-atomically: ptep isn't yet live. */
+		*ptep = pte;
+	}
+}
+
 static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
 				struct stage2_map_data *data)
 {
@@ -805,6 +821,12 @@ static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
 	if (!childp)
 		return -ENOMEM;
 
+	if (pgt->flags & KVM_PGTABLE_S2_IDMAP) {
+		WARN_ON(pte_ops->pte_is_counted_cb(pte, level));
+		addr = ALIGN_DOWN(addr, kvm_granule_size(level));
+		stage2_map_prefault_idmap(addr, level + 1, childp, pte);
+	}
+
 	/*
 	 * If we've run into an existing block mapping then replace it with
 	 * a table. Accesses beyond 'end' that fall within the new table

From f082d22541bf871efd1b7d56d0bf51ffa0e01bfd Mon Sep 17 00:00:00 2001
From: Keir Fraser
Date: Tue, 31 Oct 2023 17:09:30 +0000
Subject: [PATCH 013/139] ANDROID: KVM: arm64: Optimise
 module_change_host_page_prot

Merge the relaxation and restriction paths so that both only need to
adjust permissions. This avoids an unmap + remap on the restriction
path, and avoids installing an annotated entry on the relaxation path
(which will cause a translation fault on first access by the host). 
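To illustrate the effect (a sketch only, not code from this series;
KVM_PGTABLE_PROT_R is assumed from the standard kvm_pgtable prot flags):

    /* Both directions are now in-place permission updates on the
     * existing mapping, rather than an unmap followed by a remap: */
    module_change_host_page_prot(pfn, KVM_PGTABLE_PROT_R);    /* restrict */
    module_change_host_page_prot(pfn, KVM_PGTABLE_PROT_RWX);  /* relax */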
Bug: 308373293 Change-Id: I9c7a6ac149aad64b19a5ce7808334188475b27cc Signed-off-by: Keir Fraser --- arch/arm64/kvm/hyp/nvhe/mem_protect.c | 25 ++++++------------------- arch/arm64/kvm/hyp/pgtable.c | 20 ++++++++++++++------ 2 files changed, 20 insertions(+), 25 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 86cd64130328..4ba504f5f4bd 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -2008,19 +2008,6 @@ int __pkvm_hyp_donate_host(u64 pfn, u64 nr_pages) return ret; } -static int restrict_host_page_perms(u64 addr, kvm_pte_t pte, u32 level, enum kvm_pgtable_prot prot) -{ - int ret = 0; - - /* XXX: optimize ... */ - if (kvm_pte_valid(pte) && (level == KVM_PGTABLE_MAX_LEVELS - 1)) - ret = kvm_pgtable_stage2_unmap(&host_mmu.pgt, addr, PAGE_SIZE); - if (!ret) - ret = host_stage2_idmap_locked(addr, PAGE_SIZE, prot, false); - - return ret; -} - #define MODULE_PROT_ALLOWLIST (KVM_PGTABLE_PROT_RWX | \ KVM_PGTABLE_PROT_DEVICE |\ KVM_PGTABLE_PROT_NC | \ @@ -2065,12 +2052,12 @@ int module_change_host_page_prot(u64 pfn, enum kvm_pgtable_prot prot) } update: - if (prot == default_host_prot(!!page)) - ret = host_stage2_set_owner_locked(addr, PAGE_SIZE, PKVM_ID_HOST); - else if (!prot) - ret = host_stage2_set_owner_locked(addr, PAGE_SIZE, PKVM_ID_PROTECTED); - else - ret = restrict_host_page_perms(addr, pte, level, prot); + if (!prot) { + ret = host_stage2_set_owner_locked(addr, PAGE_SIZE, + PKVM_ID_PROTECTED); + } else { + ret = host_stage2_idmap_locked(addr, PAGE_SIZE, prot, false); + } if (ret || !page) goto unlock; diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 2d11455aabe8..b9140293da7d 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -645,8 +645,13 @@ enum kvm_pgtable_prot kvm_pgtable_stage2_pte_prot(kvm_pte_t pte) return prot; } -static bool stage2_pte_needs_update(kvm_pte_t old, kvm_pte_t new) +static bool stage2_pte_needs_update(struct kvm_pgtable *pgt, + kvm_pte_t old, kvm_pte_t new) { + /* Following filter logic applies only to guest stage-2 entries. */ + if (pgt->flags & KVM_PGTABLE_S2_IDMAP) + return true; + if (!kvm_pte_valid(old) || !kvm_pte_valid(new)) return true; @@ -715,12 +720,15 @@ static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level, new = data->annotation; /* - * Skip updating the PTE if we are trying to recreate the exact - * same mapping or only change the access permissions. Instead, - * the vCPU will exit one more time from guest if still needed - * and then go through the path of relaxing permissions. + * Skip updating a guest PTE if we are trying to recreate the exact + * same mapping or change only the access permissions. Instead, + * the vCPU will exit one more time from the guest if still needed + * and then go through the path of relaxing permissions. This applies + * only to guest PTEs; Host PTEs are unconditionally updated. The + * host cannot livelock because the abort handler has done prior + * checks before calling here. */ - if (!stage2_pte_needs_update(old, new)) + if (!stage2_pte_needs_update(pgt, old, new)) return -EAGAIN; if (pte_ops->pte_is_counted_cb(old, level)) From fd720ebc6a4049162e3bec2f16b95fd57931164d Mon Sep 17 00:00:00 2001 From: Keir Fraser Date: Wed, 1 Nov 2023 15:54:56 +0000 Subject: [PATCH 014/139] ANDROID: KVM: arm64: Relax checks in module_change_host_page_prot Modules can only relax permissions to RWX. This seems rather arbitrary. 
Instead, allow any valid permissions to be set, as long as the page is a pristine host page, or already module owned. Bug: 308373293 Change-Id: I905786fad6543f47a00bd9b9f07e17dd660d457c Signed-off-by: Keir Fraser --- arch/arm64/kvm/hyp/nvhe/mem_protect.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 4ba504f5f4bd..f7f1c184b4fd 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -2041,15 +2041,12 @@ int module_change_host_page_prot(u64 pfn, enum kvm_pgtable_prot prot) page = hyp_phys_to_page(addr); /* - * Modules can only relax permissions of pages they own, and restrict - * permissions of pristine pages. + * Modules can only modify pages they already own, and pristine host + * pages. */ - if (prot == KVM_PGTABLE_PROT_RWX) { - if (!(page->flags & MODULE_OWNED_PAGE)) - goto unlock; - } else if (host_get_page_state(pte, addr) != PKVM_PAGE_OWNED) { + if (!(page->flags & MODULE_OWNED_PAGE) && + (host_get_page_state(pte, addr) != PKVM_PAGE_OWNED)) goto unlock; - } update: if (!prot) { From fbc707442cb65d1368b82291c5975cdea1cc6c74 Mon Sep 17 00:00:00 2001 From: Keir Fraser Date: Thu, 2 Nov 2023 16:26:11 +0000 Subject: [PATCH 015/139] ANDROID: KVM: arm64: Introduce module_change_host_prot_range This allows protection attributes to be changed for a range of pages via a single module API call. The original API call modifying a single page is now implemented as a shim on top of the new range-based call. The ABI STG is also fixed up: type 'struct pkvm_module_ops' changed member 'union { int(* host_stage2_mod_prot_range)(u64, enum kvm_pgtable_prot, u64); struct { u64 android_kabi_reserved1; }; union { }; }' was added member 'u64 android_kabi_reserved1' was removed Bug: 308373293 Change-Id: I6fbb2e0b325aa972148f48746565dcc10d74edaf Signed-off-by: Keir Fraser --- android/abi_gki_aarch64.stg | 34 +++++++++- arch/arm64/include/asm/kvm_pkvm_module.h | 3 +- arch/arm64/kvm/hyp/include/nvhe/mem_protect.h | 1 + arch/arm64/kvm/hyp/nvhe/mem_protect.c | 66 +++++++++++++------ arch/arm64/kvm/hyp/nvhe/modules.c | 1 + 5 files changed, 82 insertions(+), 23 deletions(-) diff --git a/android/abi_gki_aarch64.stg b/android/abi_gki_aarch64.stg index 60b8a56bcba4..0ceacc54c6de 100644 --- a/android/abi_gki_aarch64.stg +++ b/android/abi_gki_aarch64.stg @@ -12283,6 +12283,11 @@ pointer_reference { kind: POINTER pointee_type_id: 0xb94739b9 } +pointer_reference { + id: 0x24c218d7 + kind: POINTER + pointee_type_id: 0xb94885c2 +} pointer_reference { id: 0x24c6c7eb kind: POINTER @@ -40287,6 +40292,11 @@ member { type_id: 0x797868f8 offset: 32 } +member { + id: 0x3dbb0f88 + type_id: 0x79c25039 + offset: 2048 +} member { id: 0x3dbd80ff type_id: 0x79d85976 @@ -100848,6 +100858,11 @@ member { type_id: 0x24cb3ae4 offset: 896 } +member { + id: 0xbcc50199 + name: "host_stage2_mod_prot_range" + type_id: 0x24c218d7 +} member { id: 0xedc7b540 name: "host_status" @@ -215152,6 +215167,16 @@ struct_union { member_id: 0x3bfa35f3 } } +struct_union { + id: 0x79c25039 + kind: UNION + definition { + bytesize: 8 + member_id: 0xbcc50199 + member_id: 0x27000c61 + member_id: 0x36752b74 + } +} struct_union { id: 0x79d85976 kind: UNION @@ -247063,7 +247088,7 @@ struct_union { member_id: 0x636da10f member_id: 0x6f066e7f member_id: 0x3afd0925 - member_id: 0x2d0812b0 + member_id: 0x3dbb0f88 member_id: 0x637607e0 member_id: 0xac894cc9 member_id: 0xe0f63db8 @@ -327601,6 +327626,13 @@ function { 
parameter_id: 0x18bd6530 parameter_id: 0x310ec01d } +function { + id: 0xb94885c2 + return_type_id: 0x6720d32f + parameter_id: 0x92233392 + parameter_id: 0x1908b154 + parameter_id: 0x92233392 +} function { id: 0xb94d0c8b return_type_id: 0x06835e9c diff --git a/arch/arm64/include/asm/kvm_pkvm_module.h b/arch/arm64/include/asm/kvm_pkvm_module.h index 5752e1d11abd..bf68d862b7d8 100644 --- a/arch/arm64/include/asm/kvm_pkvm_module.h +++ b/arch/arm64/include/asm/kvm_pkvm_module.h @@ -153,7 +153,8 @@ struct pkvm_module_ops { void* (*hyp_va)(phys_addr_t phys); unsigned long (*kern_hyp_va)(unsigned long x); - ANDROID_KABI_RESERVE(1); + ANDROID_KABI_USE(1, int (*host_stage2_mod_prot_range)(u64 pfn, enum kvm_pgtable_prot prot, u64 nr_pages)); + ANDROID_KABI_RESERVE(2); ANDROID_KABI_RESERVE(3); ANDROID_KABI_RESERVE(4); diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h index b0eabed053d2..f80d15d52be6 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h @@ -104,6 +104,7 @@ int refill_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages, struct kvm_hyp_memcache *host_mc); int module_change_host_page_prot(u64 pfn, enum kvm_pgtable_prot prot); +int module_change_host_page_prot_range(u64 pfn, enum kvm_pgtable_prot prot, u64 nr_pages); void destroy_hyp_vm_pgt(struct pkvm_hyp_vm *vm); void drain_hyp_pool(struct pkvm_hyp_vm *vm, struct kvm_hyp_memcache *mc); diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index f7f1c184b4fd..2c1032a59826 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -2013,56 +2013,75 @@ int __pkvm_hyp_donate_host(u64 pfn, u64 nr_pages) KVM_PGTABLE_PROT_NC | \ KVM_PGTABLE_PROT_PXN | \ KVM_PGTABLE_PROT_UXN) -int module_change_host_page_prot(u64 pfn, enum kvm_pgtable_prot prot) + +int module_change_host_page_prot_range(u64 pfn, enum kvm_pgtable_prot prot, u64 nr_pages) { - u64 addr = hyp_pfn_to_phys(pfn); + u64 i, addr = hyp_pfn_to_phys(pfn); + u64 end = addr + nr_pages * PAGE_SIZE; struct hyp_page *page = NULL; - kvm_pte_t pte; - u32 level; + struct kvm_mem_range range; + bool is_mmio; int ret; if ((prot & MODULE_PROT_ALLOWLIST) != prot) return -EINVAL; + is_mmio = !find_mem_range(addr, &range); + if (end > range.end) { + /* Specified range not in a single mmio or memory block. */ + return -EPERM; + } + host_lock_component(); - ret = kvm_pgtable_get_leaf(&host_mmu.pgt, addr, &pte, &level); - if (ret) - goto unlock; /* * There is no hyp_vmemmap covering MMIO regions, which makes tracking * of module-owned MMIO regions hard, so we trust the modules not to * mess things up. */ - if (!addr_is_memory(addr)) + if (is_mmio) goto update; - ret = -EPERM; + /* Range is memory: we can track module ownership. */ page = hyp_phys_to_page(addr); /* * Modules can only modify pages they already own, and pristine host - * pages. + * pages. The entire range must be consistently one or the other. */ - if (!(page->flags & MODULE_OWNED_PAGE) && - (host_get_page_state(pte, addr) != PKVM_PAGE_OWNED)) - goto unlock; + if (page->flags & MODULE_OWNED_PAGE) { + /* The entire range must be module-owned. */ + ret = -EPERM; + for (i = 1; i < nr_pages; i++) { + if (!(page[i].flags & MODULE_OWNED_PAGE)) + goto unlock; + } + } else { + /* The entire range must be pristine. 
*/ + ret = __host_check_page_state_range( + addr, nr_pages << PAGE_SHIFT, PKVM_PAGE_OWNED); + if (ret) + goto unlock; + } update: if (!prot) { - ret = host_stage2_set_owner_locked(addr, PAGE_SIZE, - PKVM_ID_PROTECTED); + ret = host_stage2_set_owner_locked( + addr, nr_pages << PAGE_SHIFT, PKVM_ID_PROTECTED); } else { - ret = host_stage2_idmap_locked(addr, PAGE_SIZE, prot, false); + ret = host_stage2_idmap_locked( + addr, nr_pages << PAGE_SHIFT, prot, false); } - if (ret || !page) + if (WARN_ON(ret) || !page) goto unlock; - if (prot != KVM_PGTABLE_PROT_RWX) - hyp_phys_to_page(addr)->flags |= MODULE_OWNED_PAGE; - else - hyp_phys_to_page(addr)->flags &= ~MODULE_OWNED_PAGE; + for (i = 0; i < nr_pages; i++) { + if (prot != KVM_PGTABLE_PROT_RWX) + page[i].flags |= MODULE_OWNED_PAGE; + else + page[i].flags &= ~MODULE_OWNED_PAGE; + } unlock: host_unlock_component(); @@ -2070,6 +2089,11 @@ unlock: return ret; } +int module_change_host_page_prot(u64 pfn, enum kvm_pgtable_prot prot) +{ + return module_change_host_page_prot_range(pfn, prot, 1); +} + int hyp_pin_shared_mem(void *from, void *to) { u64 cur, start = ALIGN_DOWN((u64)from, PAGE_SIZE); diff --git a/arch/arm64/kvm/hyp/nvhe/modules.c b/arch/arm64/kvm/hyp/nvhe/modules.c index 49e6c2c2e2ae..862e7b7a75ff 100644 --- a/arch/arm64/kvm/hyp/nvhe/modules.c +++ b/arch/arm64/kvm/hyp/nvhe/modules.c @@ -115,6 +115,7 @@ const struct pkvm_module_ops module_ops = { .hyp_pa = hyp_virt_to_phys, .hyp_va = hyp_phys_to_virt, .kern_hyp_va = __kern_hyp_va, + .host_stage2_mod_prot_range = module_change_host_page_prot_range, }; int __pkvm_init_module(void *module_init) From 4fa87d4d8fce9c45afcbcb03258b7be84052f7c0 Mon Sep 17 00:00:00 2001 From: Keir Fraser Date: Tue, 7 Nov 2023 15:40:41 +0000 Subject: [PATCH 016/139] ANDROID: KVM: arm64: Skip prefaulting ptes which will be modified later Block mappings can be split as part of a page table update. When prefaulting entries during the split, it is pointless to install valid ptes which will later be modified by the same walk. At the same time, push the check for pte_is_counted into the prefault handler, where it logically belongs. Bug: 308373293 Change-Id: If4599b2860aa62d82ce8db019a8410c2d883de71 Signed-off-by: Keir Fraser --- arch/arm64/kvm/hyp/pgtable.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index b9140293da7d..64387388584c 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -783,19 +783,27 @@ static int stage2_map_walk_table_pre(u64 addr, u64 end, u32 level, return 0; } -static void stage2_map_prefault_idmap(u64 addr, u32 level, kvm_pte_t *ptep, - kvm_pte_t attr) +static void stage2_map_prefault_idmap(struct kvm_pgtable_pte_ops *pte_ops, + u64 addr, u64 end, u32 level, + kvm_pte_t *ptep, kvm_pte_t block_pte) { - u64 granule = kvm_granule_size(level); + u64 pa, granule; int i; - if (!kvm_pte_valid(attr)) + WARN_ON(pte_ops->pte_is_counted_cb(block_pte, level-1)); + + if (!kvm_pte_valid(block_pte)) return; - for (i = 0; i < PTRS_PER_PTE; ++i, ++ptep, addr += granule) { - kvm_pte_t pte = kvm_init_valid_leaf_pte(addr, attr, level); - /* We can write non-atomically: ptep isn't yet live. */ - *ptep = pte; + pa = ALIGN_DOWN(addr, kvm_granule_size(level-1)); + granule = kvm_granule_size(level); + for (i = 0; i < PTRS_PER_PTE; ++i, ++ptep, pa += granule) { + kvm_pte_t pte = kvm_init_valid_leaf_pte(pa, block_pte, level); + /* Skip ptes in the range being modified by the caller. 
*/ + if ((pa < addr) || (pa >= end)) { + /* We can write non-atomically: ptep isn't yet live. */ + *ptep = pte; + } } } @@ -830,9 +838,8 @@ static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, return -ENOMEM; if (pgt->flags & KVM_PGTABLE_S2_IDMAP) { - WARN_ON(pte_ops->pte_is_counted_cb(pte, level)); - addr = ALIGN_DOWN(addr, kvm_granule_size(level)); - stage2_map_prefault_idmap(addr, level + 1, childp, pte); + stage2_map_prefault_idmap(pte_ops, addr, end, level + 1, + childp, pte); } /* From ac1031618a7d1cfb0a804439212de64f115a7773 Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Fri, 15 Dec 2023 09:36:46 +0000 Subject: [PATCH 017/139] ANDROID: Snapshot Mainline's version of checkpatch.pl Nothing fancy here. Keeping full history is not required. `git checkout mainline/master -- scripts/checkpatch.pl` This may need to be done periodically. Bug: 316492624 Signed-off-by: Lee Jones Change-Id: I4c90b50197ca7277c59e96bf332ecf795c4f3d12 --- scripts/checkpatch.pl | 170 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 157 insertions(+), 13 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 1e5e66ae5a52..25fdb7fda112 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -74,6 +74,8 @@ my $git_command ='export LANGUAGE=en_US.UTF-8; git'; my $tabsize = 8; my ${CONFIG_} = "CONFIG_"; +my %maybe_linker_symbol; # for externs in c exceptions, when seen in *vmlinux.lds.h + sub help { my ($exitcode) = @_; @@ -620,6 +622,22 @@ our $signature_tags = qr{(?xi: Cc: )}; +our @link_tags = qw(Link Closes); + +#Create a search and print patterns for all these strings to be used directly below +our $link_tags_search = ""; +our $link_tags_print = ""; +foreach my $entry (@link_tags) { + if ($link_tags_search ne "") { + $link_tags_search .= '|'; + $link_tags_print .= ' or '; + } + $entry .= ':'; + $link_tags_search .= $entry; + $link_tags_print .= "'$entry'"; +} +$link_tags_search = "(?:${link_tags_search})"; + our $tracing_logging_tags = qr{(?xi: [=-]*> | <[=-]* | @@ -702,6 +720,17 @@ sub find_standard_signature { return ""; } +our $obsolete_archives = qr{(?xi: + \Qfreedesktop.org/archives/dri-devel\E | + \Qlists.infradead.org\E | + \Qlkml.org\E | + \Qmail-archive.com\E | + \Qmailman.alsa-project.org/pipermail\E | + \Qmarc.info\E | + \Qozlabs.org/pipermail\E | + \Qspinics.net\E +)}; + our @typeListMisordered = ( qr{char\s+(?:un)?signed}, qr{int\s+(?:(?:un)?signed\s+)?short\s}, @@ -812,7 +841,9 @@ our %deprecated_apis = ( "get_state_synchronize_sched" => "get_state_synchronize_rcu", "cond_synchronize_sched" => "cond_synchronize_rcu", "kmap" => "kmap_local_page", + "kunmap" => "kunmap_local", "kmap_atomic" => "kmap_local_page", + "kunmap_atomic" => "kunmap_local", ); #Create a search pattern for all these strings to speed up a loop below @@ -3131,21 +3162,33 @@ sub process { if ($sign_off =~ /^co-developed-by:$/i) { if ($email eq $author) { WARN("BAD_SIGN_OFF", - "Co-developed-by: should not be used to attribute nominal patch author '$author'\n" . "$here\n" . $rawline); + "Co-developed-by: should not be used to attribute nominal patch author '$author'\n" . $herecurr); } if (!defined $lines[$linenr]) { WARN("BAD_SIGN_OFF", - "Co-developed-by: must be immediately followed by Signed-off-by:\n" . "$here\n" . $rawline); - } elsif ($rawlines[$linenr] !~ /^\s*signed-off-by:\s*(.*)/i) { + "Co-developed-by: must be immediately followed by Signed-off-by:\n" . 
$herecurr); + } elsif ($rawlines[$linenr] !~ /^signed-off-by:\s*(.*)/i) { WARN("BAD_SIGN_OFF", - "Co-developed-by: must be immediately followed by Signed-off-by:\n" . "$here\n" . $rawline . "\n" .$rawlines[$linenr]); + "Co-developed-by: must be immediately followed by Signed-off-by:\n" . $herecurr . $rawlines[$linenr] . "\n"); } elsif ($1 ne $email) { WARN("BAD_SIGN_OFF", - "Co-developed-by and Signed-off-by: name/email do not match \n" . "$here\n" . $rawline . "\n" .$rawlines[$linenr]); + "Co-developed-by and Signed-off-by: name/email do not match\n" . $herecurr . $rawlines[$linenr] . "\n"); + } + } + +# check if Reported-by: is followed by a Closes: tag + if ($sign_off =~ /^reported(?:|-and-tested)-by:$/i) { + if (!defined $lines[$linenr]) { + WARN("BAD_REPORTED_BY_LINK", + "Reported-by: should be immediately followed by Closes: with a URL to the report\n" . $herecurr . "\n"); + } elsif ($rawlines[$linenr] !~ /^closes:\s*/i) { + WARN("BAD_REPORTED_BY_LINK", + "Reported-by: should be immediately followed by Closes: with a URL to the report\n" . $herecurr . $rawlines[$linenr] . "\n"); } } } + # Check Fixes: styles is correct if (!$in_header_lines && $line =~ /^\s*fixes:?\s*(?:commit\s*)?[0-9a-f]{5,}\b/i) { @@ -3225,11 +3268,11 @@ sub process { # file delta changes $line =~ /^\s*(?:[\w\.\-\+]*\/)++[\w\.\-\+]+:/ || # filename then : - $line =~ /^\s*(?:Fixes:|Link:|$signature_tags)/i || - # A Fixes: or Link: line or signature tag line + $line =~ /^\s*(?:Fixes:|$link_tags_search|$signature_tags)/i || + # A Fixes:, link or signature tag line $commit_log_possible_stack_dump)) { WARN("COMMIT_LOG_LONG_LINE", - "Possible unwrapped commit description (prefer a maximum 75 chars per line)\n" . $herecurr); + "Prefer a maximum 75 chars per line (possible unwrapped commit description?)\n" . $herecurr); $commit_log_long_line = 1; } @@ -3239,6 +3282,29 @@ sub process { $commit_log_possible_stack_dump = 0; } +# Check for odd tags before a URI/URL + if ($in_commit_log && + $line =~ /^\s*(\w+:)\s*http/ && $1 !~ /^$link_tags_search$/) { + if ($1 =~ /^v(?:ersion)?\d+/i) { + WARN("COMMIT_LOG_VERSIONING", + "Patch version information should be after the --- line\n" . $herecurr); + } else { + WARN("COMMIT_LOG_USE_LINK", + "Unknown link reference '$1', use $link_tags_print instead\n" . $herecurr); + } + } + +# Check for misuse of the link tags + if ($in_commit_log && + $line =~ /^\s*(\w+:)\s*(\S+)/) { + my $tag = $1; + my $value = $2; + if ($tag =~ /^$link_tags_search$/ && $value !~ m{^https?://}) { + WARN("COMMIT_LOG_WRONG_LINK", + "'$tag' should be followed by a public http(s) link\n" . $herecurr); + } + } + # Check for lines starting with a # if ($in_commit_log && $line =~ /^#/) { if (WARN("COMMIT_COMMENT_SYMBOL", @@ -3324,6 +3390,12 @@ sub process { $last_git_commit_id_linenr = $linenr if ($line =~ /\bcommit\s*$/i); } +# Check for mailing list archives other than lore.kernel.org + if ($rawline =~ m{http.*\b$obsolete_archives}) { + WARN("PREFER_LORE_ARCHIVE", + "Use lore.kernel.org archive links when possible - see https://lore.kernel.org/lists.html\n" . $herecurr); + } + # Check for added, moved or deleted files if (!$reported_maintainer_file && !$in_commit_log && ($line =~ /^(?:new|deleted) file mode\s*\d+\s*$/ || @@ -3693,7 +3765,7 @@ sub process { "'$spdx_license' is not supported in LICENSES/...\n" . $herecurr); } if ($realfile =~ m@^Documentation/devicetree/bindings/@ && - not $spdx_license =~ /GPL-2\.0.*BSD-2-Clause/) { + $spdx_license !~ /GPL-2\.0(?:-only)? 
OR BSD-2-Clause/) { my $msg_level = \&WARN; $msg_level = \&CHK if ($file); if (&{$msg_level}("SPDX_LICENSE_TAG", @@ -3703,12 +3775,17 @@ sub process { $fixed[$fixlinenr] =~ s/SPDX-License-Identifier: .*/SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)/; } } + if ($realfile =~ m@^include/dt-bindings/@ && + $spdx_license !~ /GPL-2\.0(?:-only)? OR \S+/) { + WARN("SPDX_LICENSE_TAG", + "DT binding headers should be licensed (GPL-2.0-only OR .*)\n" . $herecurr); + } } } } # check for embedded filenames - if ($rawline =~ /^\+.*\Q$realfile\E/) { + if ($rawline =~ /^\+.*\b\Q$realfile\E\b/) { WARN("EMBEDDED_FILENAME", "It's generally not useful to have the filename in the file\n" . $herecurr); } @@ -4971,7 +5048,7 @@ sub process { if|for|while|switch|return|case| volatile|__volatile__| __attribute__|format|__extension__| - asm|__asm__)$/x) + asm|__asm__|scoped_guard)$/x) { # cpp #define statements have non-optional spaces, ie # if there is a space between the name and the open @@ -5766,6 +5843,8 @@ sub process { $var !~ /^(?:[A-Z]+_){1,5}[A-Z]{1,3}[a-z]/ && #Ignore Page variants $var !~ /^(?:Clear|Set|TestClear|TestSet|)Page[A-Z]/ && +#Ignore ETHTOOL_LINK_MODE_ variants + $var !~ /^ETHTOOL_LINK_MODE_/ && #Ignore SI style variants like nS, mV and dB #(ie: max_uV, regulator_min_uA_show, RANGE_mA_VALUE) $var !~ /^(?:[a-z0-9_]*|[A-Z0-9_]*)?_?[a-z][A-Z](?:_[a-z0-9_]+|_[A-Z0-9_]+)?$/ && @@ -5901,6 +5980,7 @@ sub process { $dstat !~ /$exceptions/ && $dstat !~ /^\.$Ident\s*=/ && # .foo = $dstat !~ /^(?:\#\s*$Ident|\#\s*$Constant)\s*$/ && # stringification #foo + $dstat !~ /^case\b/ && # case ... $dstat !~ /^do\s*$Constant\s*while\s*$Constant;?$/ && # do {...} while (...); // do {...} while (...) $dstat !~ /^while\s*$Constant\s*$Constant\s*$/ && # while (...) {...} $dstat !~ /^for\s*$Constant$/ && # for (...) @@ -5973,6 +6053,9 @@ sub process { # check for line continuations outside of #defines, preprocessor #, and asm + } elsif ($realfile =~ m@/vmlinux.lds.h$@) { + $line =~ s/(\w+)/$maybe_linker_symbol{$1}++/ge; + #print "REAL: $realfile\nln: $line\nkeys:", sort keys %maybe_linker_symbol; } else { if ($prevline !~ /^..*\\$/ && $line !~ /^\+\s*\#.*\\$/ && # preprocessor @@ -6910,10 +6993,22 @@ sub process { # } # } +# strcpy uses that should likely be strscpy + if ($line =~ /\bstrcpy\s*\(/) { + WARN("STRCPY", + "Prefer strscpy over strcpy - see: https://github.com/KSPP/linux/issues/88\n" . $herecurr); + } + # strlcpy uses that should likely be strscpy if ($line =~ /\bstrlcpy\s*\(/) { WARN("STRLCPY", - "Prefer strscpy over strlcpy - see: https://lore.kernel.org/r/CAHk-=wgfRnXz0W3D37d01q3JFkr_i_uTL=V6A6G1oUZcprmknw\@mail.gmail.com/\n" . $herecurr); + "Prefer strscpy over strlcpy - see: https://github.com/KSPP/linux/issues/89\n" . $herecurr); + } + +# strncpy uses that should likely be strscpy or strscpy_pad + if ($line =~ /\bstrncpy\s*\(/) { + WARN("STRNCPY", + "Prefer strscpy, strscpy_pad, or __nonstring over strncpy - see: https://github.com/KSPP/linux/issues/90\n" . $herecurr); } # typecasts on min/max could be min_t/max_t @@ -7020,6 +7115,21 @@ sub process { "arguments for function declarations should follow identifier\n" . $herecurr); } + } elsif ($realfile =~ /\.c$/ && defined $stat && + $stat =~ /^\+extern struct\s+(\w+)\s+(\w+)\[\];/) + { + my ($st_type, $st_name) = ($1, $2); + + for my $s (keys %maybe_linker_symbol) { + #print "Linker symbol? 
$st_name : $s\n";
+			goto LIKELY_LINKER_SYMBOL
+				if $st_name =~ /$s/;
+		}
+		WARN("AVOID_EXTERNS",
+		     "found a file-scoped extern type:$st_type name:$st_name in .c file\n"
+		     . "is this a linker symbol ?\n" . $herecurr);
+	  LIKELY_LINKER_SYMBOL:
+
 	} elsif ($realfile =~ /\.c$/ && defined $stat &&
 	    $stat =~ /^.\s*extern\s+/)
 	{
@@ -7128,7 +7238,7 @@ sub process {
 	}
 
 # check for alloc argument mismatch
-		if ($line =~ /\b((?:devm_)?(?:kcalloc|kmalloc_array))\s*\(\s*sizeof\b/) {
+		if ($line =~ /\b((?:devm_)?((?:k|kv)?(calloc|malloc_array)(?:_node)?))\s*\(\s*sizeof\b/) {
 			WARN("ALLOC_ARRAY_ARGS",
 				"$1 uses number as first arg, sizeof is generally wrong\n" . $herecurr);
 		}
@@ -7331,6 +7441,16 @@ sub process {
 		}
 	}
 
+# check for array definition/declarations that should use flexible arrays instead
+	if ($sline =~ /^[\+ ]\s*\}(?:\s*__packed)?\s*;\s*$/ &&
+	    $prevline =~ /^\+\s*(?:\}(?:\s*__packed\s*)?|$Type)\s*$Ident\s*\[\s*(0|1)\s*\]\s*;\s*$/) {
+		if (ERROR("FLEXIBLE_ARRAY",
+			  "Use C99 flexible arrays - see https://docs.kernel.org/process/deprecated.html#zero-length-and-one-element-arrays\n" . $hereprev) &&
+		    $1 == '0' && $fix) {
+			$fixed[$fixlinenr - 1] =~ s/\[\s*0\s*\]/[]/;
+		}
+	}
+
 # nested likely/unlikely calls
 	if ($line =~ /\b(?:(?:un)?likely)\s*\(\s*!?\s*(IS_ERR(?:_OR_NULL|_VALUE)?|WARN)/) {
 		WARN("LIKELY_MISUSE",
@@ -7348,6 +7468,30 @@ sub process {
 		}
 	}
 
+# Complain about RCU Tasks Trace used outside of BPF (and of course, RCU).
+	our $rcu_trace_funcs = qr{(?x:
+		rcu_read_lock_trace |
+		rcu_read_lock_trace_held |
+		rcu_read_unlock_trace |
+		call_rcu_tasks_trace |
+		synchronize_rcu_tasks_trace |
+		rcu_barrier_tasks_trace |
+		rcu_request_urgent_qs_task
+	)};
+	our $rcu_trace_paths = qr{(?x:
+		kernel/bpf/ |
+		include/linux/bpf |
+		net/bpf/ |
+		kernel/rcu/ |
+		include/linux/rcu
+	)};
+	if ($line =~ /\b($rcu_trace_funcs)\s*\(/) {
+		if ($realfile !~ m{^$rcu_trace_paths}) {
+			WARN("RCU_TASKS_TRACE",
+			     "use of RCU tasks trace is incorrect outside BPF or core RCU code\n" . $herecurr);
+		}
+	}
+
 # check for lockdep_set_novalidate_class
 	if ($line =~ /^.\s*lockdep_set_novalidate_class\s*\(/ ||
 	    $line =~ /__lockdep_no_validate__\s*\)/ ) {

From 6c7495f04a3c2fc58077131ca46e399c0aa83564 Mon Sep 17 00:00:00 2001
From: SeongJae Park
Date: Wed, 26 Oct 2022 22:59:32 +0000
Subject: [PATCH 018/139] UPSTREAM: mm/damon/core: split out DAMOS-charged
 region skip logic into a new function

Patch series "mm/damon: cleanup and refactoring code", v2.

This patchset cleans up and refactors a range of DAMON code including
the core, DAMON sysfs interface, and DAMON modules, for better
readability and convenient future feature implementations.

In detail, this patchset splits unnecessarily long and complex functions
in core into smaller functions (patches 1-4).  Then, it cleans up the
DAMON sysfs interface by using more type-safe code (patch 5) and
removing unnecessary function parameters (patch 6).  Further, it
refactors the code by distributing it into multiple files (patches
7-10).  The last two patches (11 and 12) deduplicate and remove
unnecessary header inclusions in the DAMON modules (reclaim and
lru_sort).

This patch (of 12):

The DAMOS action applying function, 'damon_do_apply_schemes()', is quite
long and not so simple.  Split out the already quota-charged region skip
code, which is not a small amount of simple code, into a new function
with some comments for better readability.
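Concretely, the per-scheme loop in damon_do_apply_schemes() ends up with
roughly the following shape (a condensed sketch only; the exact code is
in the diff below):

	damon_for_each_scheme(s, c) {
		struct damos_quota *quota = &s->quota;

		if (!s->wmarks.activated)
			continue;
		/* skip if the quota of this charge window is used up */
		if (quota->esz && quota->charged_sz >= quota->esz)
			continue;
		/* new helper: all charge-window skip/split logic */
		if (damos_skip_charged_region(t, &r, s))
			continue;
		if (!damos_valid_target(c, t, r, s))
			continue;
		/* apply the scheme action to r */
	}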
Link: https://lkml.kernel.org/r/20221026225943.100429-1-sj@kernel.org
Link: https://lkml.kernel.org/r/20221026225943.100429-2-sj@kernel.org
Signed-off-by: SeongJae Park
Signed-off-by: Andrew Morton
(cherry picked from commit 2ea3498980f5e6f3001f2984b0b92736bf1b78cb)
Bug: 300502883
Change-Id: I416f3c24d8a6e41df5512a8cfee57a9de26ae185
Signed-off-by: cui yangpei
---
 mm/damon/core.c | 96 +++++++++++++++++++++++++++++++++----------------
 1 file changed, 65 insertions(+), 31 deletions(-)

diff --git a/mm/damon/core.c b/mm/damon/core.c
index 36d098d06c55..06b50ede9cc6 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -694,6 +694,67 @@ static bool damos_valid_target(struct damon_ctx *c, struct damon_target *t,
 	return c->ops.get_scheme_score(c, t, r, s) >= s->quota.min_score;
 }
 
+/*
+ * damos_skip_charged_region() - Check if the given region or starting part of
+ * it is already charged for the DAMOS quota.
+ * @t:	The target of the region.
+ * @rp:	The pointer to the region.
+ * @s:	The scheme to be applied.
+ *
+ * If a quota of a scheme has been exceeded in a quota charge window, the
+ * scheme's action would be applied to only a part of the target access
+ * pattern fulfilling regions.  To avoid applying the scheme action to only
+ * already applied regions, DAMON skips applying the scheme action to the
+ * regions that were charged in the previous charge window.
+ *
+ * This function checks if a given region should be skipped or not for that
+ * reason.  If only the starting part of the region has previously been
+ * charged, this function splits the region into two so that the second one
+ * covers the area that was not charged in the previous charge window, saves
+ * the second region in *rp, and returns false, so that the caller can apply
+ * the DAMON action to the second one.
+ *
+ * Return: true if the region should be entirely skipped, false otherwise.
+ */ +static bool damos_skip_charged_region(struct damon_target *t, + struct damon_region **rp, struct damos *s) +{ + struct damon_region *r = *rp; + struct damos_quota *quota = &s->quota; + unsigned long sz_to_skip; + + /* Skip previously charged regions */ + if (quota->charge_target_from) { + if (t != quota->charge_target_from) + return true; + if (r == damon_last_region(t)) { + quota->charge_target_from = NULL; + quota->charge_addr_from = 0; + return true; + } + if (quota->charge_addr_from && + r->ar.end <= quota->charge_addr_from) + return true; + + if (quota->charge_addr_from && r->ar.start < + quota->charge_addr_from) { + sz_to_skip = ALIGN_DOWN(quota->charge_addr_from - + r->ar.start, DAMON_MIN_REGION); + if (!sz_to_skip) { + if (damon_sz_region(r) <= DAMON_MIN_REGION) + return true; + sz_to_skip = DAMON_MIN_REGION; + } + damon_split_region_at(t, r, sz_to_skip); + r = damon_next_region(r); + *rp = r; + } + quota->charge_target_from = NULL; + quota->charge_addr_from = 0; + } + return false; +} + static void damon_do_apply_schemes(struct damon_ctx *c, struct damon_target *t, struct damon_region *r) @@ -702,7 +763,7 @@ static void damon_do_apply_schemes(struct damon_ctx *c, damon_for_each_scheme(s, c) { struct damos_quota *quota = &s->quota; - unsigned long sz = damon_sz_region(r); + unsigned long sz; struct timespec64 begin, end; unsigned long sz_applied = 0; @@ -713,41 +774,14 @@ static void damon_do_apply_schemes(struct damon_ctx *c, if (quota->esz && quota->charged_sz >= quota->esz) continue; - /* Skip previously charged regions */ - if (quota->charge_target_from) { - if (t != quota->charge_target_from) - continue; - if (r == damon_last_region(t)) { - quota->charge_target_from = NULL; - quota->charge_addr_from = 0; - continue; - } - if (quota->charge_addr_from && - r->ar.end <= quota->charge_addr_from) - continue; - - if (quota->charge_addr_from && r->ar.start < - quota->charge_addr_from) { - sz = ALIGN_DOWN(quota->charge_addr_from - - r->ar.start, DAMON_MIN_REGION); - if (!sz) { - if (damon_sz_region(r) <= - DAMON_MIN_REGION) - continue; - sz = DAMON_MIN_REGION; - } - damon_split_region_at(t, r, sz); - r = damon_next_region(r); - sz = damon_sz_region(r); - } - quota->charge_target_from = NULL; - quota->charge_addr_from = 0; - } + if (damos_skip_charged_region(t, &r, s)) + continue; if (!damos_valid_target(c, t, r, s)) continue; /* Apply the scheme */ + sz = damon_sz_region(r); if (c->ops.apply_scheme) { if (quota->esz && quota->charged_sz + sz > quota->esz) { From 0b0a43029ed4941a2f9faca14375e69062fe2e27 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 26 Oct 2022 22:59:33 +0000 Subject: [PATCH 019/139] UPSTREAM: mm/damon/core: split damos application logic into a new function The DAMOS action applying function, 'damon_do_apply_schemes()', is still long and not easy to read. Split out the code for applying a single action to a single region into a new function for better readability. 
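Condensed, the new helper takes over everything the loop body used to do
inline; its shape is roughly the following (a sketch only; the exact code
is in the diff below):

	static void damos_apply_scheme(struct damon_ctx *c,
			struct damon_target *t, struct damon_region *r,
			struct damos *s)
	{
		unsigned long sz = damon_sz_region(r);

		/* 1. clamp sz to the remaining quota, splitting r if needed */
		/* 2. time c->ops.apply_scheme() and charge the consumed quota */
		/* 3. reset r->age unless the action is DAMOS_STAT */
		/* 4. update the scheme's tried/applied statistics */
	}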
Link: https://lkml.kernel.org/r/20221026225943.100429-3-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton (cherry picked from commit e63a30c51f8400915db401c05d3c4db6743857e8) Bug: 300502883 Change-Id: Iea228c7ed452fffb72f3d9a94078ad00dabf3269 Signed-off-by: cui yangpei --- mm/damon/core.c | 73 ++++++++++++++++++++++++++----------------------- 1 file changed, 39 insertions(+), 34 deletions(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index 06b50ede9cc6..c1a912bc46ae 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -755,6 +755,44 @@ static bool damos_skip_charged_region(struct damon_target *t, return false; } +static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t, + struct damon_region *r, struct damos *s) +{ + struct damos_quota *quota = &s->quota; + unsigned long sz = damon_sz_region(r); + struct timespec64 begin, end; + unsigned long sz_applied = 0; + + if (c->ops.apply_scheme) { + if (quota->esz && quota->charged_sz + sz > quota->esz) { + sz = ALIGN_DOWN(quota->esz - quota->charged_sz, + DAMON_MIN_REGION); + if (!sz) + goto update_stat; + damon_split_region_at(t, r, sz); + } + ktime_get_coarse_ts64(&begin); + sz_applied = c->ops.apply_scheme(c, t, r, s); + ktime_get_coarse_ts64(&end); + quota->total_charged_ns += timespec64_to_ns(&end) - + timespec64_to_ns(&begin); + quota->charged_sz += sz; + if (quota->esz && quota->charged_sz >= quota->esz) { + quota->charge_target_from = t; + quota->charge_addr_from = r->ar.end + 1; + } + } + if (s->action != DAMOS_STAT) + r->age = 0; + +update_stat: + s->stat.nr_tried++; + s->stat.sz_tried += sz; + if (sz_applied) + s->stat.nr_applied++; + s->stat.sz_applied += sz_applied; +} + static void damon_do_apply_schemes(struct damon_ctx *c, struct damon_target *t, struct damon_region *r) @@ -763,9 +801,6 @@ static void damon_do_apply_schemes(struct damon_ctx *c, damon_for_each_scheme(s, c) { struct damos_quota *quota = &s->quota; - unsigned long sz; - struct timespec64 begin, end; - unsigned long sz_applied = 0; if (!s->wmarks.activated) continue; @@ -780,37 +815,7 @@ static void damon_do_apply_schemes(struct damon_ctx *c, if (!damos_valid_target(c, t, r, s)) continue; - /* Apply the scheme */ - sz = damon_sz_region(r); - if (c->ops.apply_scheme) { - if (quota->esz && - quota->charged_sz + sz > quota->esz) { - sz = ALIGN_DOWN(quota->esz - quota->charged_sz, - DAMON_MIN_REGION); - if (!sz) - goto update_stat; - damon_split_region_at(t, r, sz); - } - ktime_get_coarse_ts64(&begin); - sz_applied = c->ops.apply_scheme(c, t, r, s); - ktime_get_coarse_ts64(&end); - quota->total_charged_ns += timespec64_to_ns(&end) - - timespec64_to_ns(&begin); - quota->charged_sz += sz; - if (quota->esz && quota->charged_sz >= quota->esz) { - quota->charge_target_from = t; - quota->charge_addr_from = r->ar.end + 1; - } - } - if (s->action != DAMOS_STAT) - r->age = 0; - -update_stat: - s->stat.nr_tried++; - s->stat.sz_tried += sz; - if (sz_applied) - s->stat.nr_applied++; - s->stat.sz_applied += sz_applied; + damos_apply_scheme(c, t, r, s); } } From 43475d970841cd755d36285a5b9a249c83bf1df6 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 26 Oct 2022 22:59:34 +0000 Subject: [PATCH 020/139] UPSTREAM: mm/damon/core: split out scheme stat update logic into a new function The function for applying a given DAMON scheme action to a given DAMON region, 'damos_apply_scheme()' is not quite short. Make it better to read by splitting out the stat update logic into a new function. 
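With the helper, the stat update at the tail of damos_apply_scheme()
reduces to a single call:

	update_stat:
		damos_update_stat(s, sz, sz_applied);

The helper unconditionally accounts the tried region (nr_tried and
sz_tried), and bumps nr_applied/sz_applied only when the operation
actually applied something.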
Link: https://lkml.kernel.org/r/20221026225943.100429-4-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton (cherry picked from commit d1cbbf621fc25950938be74a228ef518d05d93a1) Bug: 300502883 Change-Id: I1502a102cdd6959494c249b3633ff97af6ccb94c Signed-off-by: cui yangpei --- mm/damon/core.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index c1a912bc46ae..3a810c6e26bc 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -755,6 +755,16 @@ static bool damos_skip_charged_region(struct damon_target *t, return false; } +static void damos_update_stat(struct damos *s, + unsigned long sz_tried, unsigned long sz_applied) +{ + s->stat.nr_tried++; + s->stat.sz_tried += sz_tried; + if (sz_applied) + s->stat.nr_applied++; + s->stat.sz_applied += sz_applied; +} + static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t, struct damon_region *r, struct damos *s) { @@ -786,11 +796,7 @@ static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t, r->age = 0; update_stat: - s->stat.nr_tried++; - s->stat.sz_tried += sz; - if (sz_applied) - s->stat.nr_applied++; - s->stat.sz_applied += sz_applied; + damos_update_stat(s, sz, sz_applied); } static void damon_do_apply_schemes(struct damon_ctx *c, From b6e6b1dbf82480e44d2fdf0649edc853befdd45b Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 26 Oct 2022 22:59:35 +0000 Subject: [PATCH 021/139] UPSTREAM: mm/damon/core: split out scheme quota adjustment logic into a new function DAMOS quota adjustment logic in 'kdamond_apply_schemes()', has some amount of code, and the logic is not so straightforward. Split it out to a new function for better readability. Link: https://lkml.kernel.org/r/20221026225943.100429-5-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton (cherry picked from commit 898810e5ca54691f4e173f5ffc92bbce0335bc69) Bug: 300502883 Change-Id: I2d13cf290774d36884b533fe703eb01dfc47094c Signed-off-by: cui yangpei --- mm/damon/core.c | 91 ++++++++++++++++++++++++++----------------------- 1 file changed, 48 insertions(+), 43 deletions(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index 3a810c6e26bc..80d5937fe337 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -848,6 +848,53 @@ static void damos_set_effective_quota(struct damos_quota *quota) quota->esz = esz; } +static void damos_adjust_quota(struct damon_ctx *c, struct damos *s) +{ + struct damos_quota *quota = &s->quota; + struct damon_target *t; + struct damon_region *r; + unsigned long cumulated_sz; + unsigned int score, max_score = 0; + + if (!quota->ms && !quota->sz) + return; + + /* New charge window starts */ + if (time_after_eq(jiffies, quota->charged_from + + msecs_to_jiffies(quota->reset_interval))) { + if (quota->esz && quota->charged_sz >= quota->esz) + s->stat.qt_exceeds++; + quota->total_charged_sz += quota->charged_sz; + quota->charged_from = jiffies; + quota->charged_sz = 0; + damos_set_effective_quota(quota); + } + + if (!c->ops.get_scheme_score) + return; + + /* Fill up the score histogram */ + memset(quota->histogram, 0, sizeof(quota->histogram)); + damon_for_each_target(t, c) { + damon_for_each_region(r, t) { + if (!__damos_valid_target(r, s)) + continue; + score = c->ops.get_scheme_score(c, t, r, s); + quota->histogram[score] += damon_sz_region(r); + if (score > max_score) + max_score = score; + } + } + + /* Set the min score limit */ + for (cumulated_sz = 0, score = max_score; ; score--) { + cumulated_sz += 
quota->histogram[score]; + if (cumulated_sz >= quota->esz || !score) + break; + } + quota->min_score = score; +} + static void kdamond_apply_schemes(struct damon_ctx *c) { struct damon_target *t; @@ -855,52 +902,10 @@ static void kdamond_apply_schemes(struct damon_ctx *c) struct damos *s; damon_for_each_scheme(s, c) { - struct damos_quota *quota = &s->quota; - unsigned long cumulated_sz; - unsigned int score, max_score = 0; - if (!s->wmarks.activated) continue; - if (!quota->ms && !quota->sz) - continue; - - /* New charge window starts */ - if (time_after_eq(jiffies, quota->charged_from + - msecs_to_jiffies( - quota->reset_interval))) { - if (quota->esz && quota->charged_sz >= quota->esz) - s->stat.qt_exceeds++; - quota->total_charged_sz += quota->charged_sz; - quota->charged_from = jiffies; - quota->charged_sz = 0; - damos_set_effective_quota(quota); - } - - if (!c->ops.get_scheme_score) - continue; - - /* Fill up the score histogram */ - memset(quota->histogram, 0, sizeof(quota->histogram)); - damon_for_each_target(t, c) { - damon_for_each_region(r, t) { - if (!__damos_valid_target(r, s)) - continue; - score = c->ops.get_scheme_score( - c, t, r, s); - quota->histogram[score] += damon_sz_region(r); - if (score > max_score) - max_score = score; - } - } - - /* Set the min score limit */ - for (cumulated_sz = 0, score = max_score; ; score--) { - cumulated_sz += quota->histogram[score]; - if (cumulated_sz >= quota->esz || !score) - break; - } - quota->min_score = score; + damos_adjust_quota(c, s); } damon_for_each_target(t, c) { From 19364f11a4db80547dfd7bad92cea4ebd133505c Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 26 Oct 2022 22:59:36 +0000 Subject: [PATCH 022/139] UPSTREAM: mm/damon/sysfs: use damon_addr_range for region's start and end values DAMON has a struct for each address range but DAMON sysfs interface is using the low type (unsigned long) for storing the start and end addresses of regions. Use the dedicated struct for better type safety. 
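For reference, the dedicated struct is the one DAMON core already uses
(from include/linux/damon.h):

	struct damon_addr_range {
		unsigned long start;
		unsigned long end;
	};

so the change below is mostly mechanical: accesses like region->start and
region->end become region->ar.start and region->ar.end.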
Link: https://lkml.kernel.org/r/20221026225943.100429-6-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton (cherry picked from commit 789a230613c8dd14bdd41653de0c22783726276f) Bug: 300502883 Change-Id: I3bcd00e51144728c7daace32bd1f5283a1ff9e3f Signed-off-by: cui yangpei --- mm/damon/sysfs.c | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 07e5f1bdf025..a5ef503d8444 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1062,13 +1062,11 @@ static struct kobj_type damon_sysfs_schemes_ktype = { struct damon_sysfs_region { struct kobject kobj; - unsigned long start; - unsigned long end; + struct damon_addr_range ar; }; static struct damon_sysfs_region *damon_sysfs_region_alloc( - unsigned long start, - unsigned long end) + struct damon_addr_range ar) { struct damon_sysfs_region *region = kmalloc(sizeof(*region), GFP_KERNEL); @@ -1076,8 +1074,7 @@ static struct damon_sysfs_region *damon_sysfs_region_alloc( if (!region) return NULL; region->kobj = (struct kobject){}; - region->start = start; - region->end = end; + region->ar = ar; return region; } @@ -1087,7 +1084,7 @@ static ssize_t start_show(struct kobject *kobj, struct kobj_attribute *attr, struct damon_sysfs_region *region = container_of(kobj, struct damon_sysfs_region, kobj); - return sysfs_emit(buf, "%lu\n", region->start); + return sysfs_emit(buf, "%lu\n", region->ar.start); } static ssize_t start_store(struct kobject *kobj, struct kobj_attribute *attr, @@ -1095,7 +1092,7 @@ static ssize_t start_store(struct kobject *kobj, struct kobj_attribute *attr, { struct damon_sysfs_region *region = container_of(kobj, struct damon_sysfs_region, kobj); - int err = kstrtoul(buf, 0, ®ion->start); + int err = kstrtoul(buf, 0, ®ion->ar.start); return err ? err : count; } @@ -1106,7 +1103,7 @@ static ssize_t end_show(struct kobject *kobj, struct kobj_attribute *attr, struct damon_sysfs_region *region = container_of(kobj, struct damon_sysfs_region, kobj); - return sysfs_emit(buf, "%lu\n", region->end); + return sysfs_emit(buf, "%lu\n", region->ar.end); } static ssize_t end_store(struct kobject *kobj, struct kobj_attribute *attr, @@ -1114,7 +1111,7 @@ static ssize_t end_store(struct kobject *kobj, struct kobj_attribute *attr, { struct damon_sysfs_region *region = container_of(kobj, struct damon_sysfs_region, kobj); - int err = kstrtoul(buf, 0, ®ion->end); + int err = kstrtoul(buf, 0, ®ion->ar.end); return err ? 
err : count;
}

@@ -1187,7 +1184,7 @@ static int damon_sysfs_regions_add_dirs(struct damon_sysfs_regions *regions,
 	regions->regions_arr = regions_arr;
 
 	for (i = 0; i < nr_regions; i++) {
-		region = damon_sysfs_region_alloc(0, 0);
+		region = damon_sysfs_region_alloc((struct damon_addr_range){});
 		if (!region) {
 			damon_sysfs_regions_rm_dirs(regions);
 			return -ENOMEM;
@@ -2147,11 +2144,11 @@ static int damon_sysfs_set_regions(struct damon_target *t,
 		struct damon_sysfs_region *sys_region =
 			sysfs_regions->regions_arr[i];
 
-		if (sys_region->start > sys_region->end)
+		if (sys_region->ar.start > sys_region->ar.end)
 			goto out;
 
-		ranges[i].start = sys_region->start;
-		ranges[i].end = sys_region->end;
+		ranges[i].start = sys_region->ar.start;
+		ranges[i].end = sys_region->ar.end;
 		if (i == 0)
 			continue;
 		if (ranges[i - 1].end > ranges[i].start)

From b7fc8d59a55859c15122aa8f63543f865d3819b7 Mon Sep 17 00:00:00 2001
From: SeongJae Park
Date: Wed, 26 Oct 2022 22:59:37 +0000
Subject: [PATCH 023/139] UPSTREAM: mm/damon/sysfs: remove parameters of
 damon_sysfs_region_alloc()

'damon_sysfs_region_alloc()' is always called with a zero-filled
'struct damon_addr_range', because the start and end addresses should
be set by users.  Remove the unnecessary parameters of the function and
simplify the body by using 'kzalloc()'.

Link: https://lkml.kernel.org/r/20221026225943.100429-7-sj@kernel.org
Signed-off-by: SeongJae Park
Signed-off-by: Andrew Morton
(cherry picked from commit 1f71981408ef5696ad8544f282d336d4fc60a807)
Bug: 300502883
Change-Id: I6fa54a7474851598fda50a07f5b6b2465bf4cd1f
Signed-off-by: cui yangpei
---
 mm/damon/sysfs.c | 14 +++-----------
 1 file changed, 3 insertions(+), 11 deletions(-)

diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c
index a5ef503d8444..f3d7b34ea0ab 100644
--- a/mm/damon/sysfs.c
+++ b/mm/damon/sysfs.c
@@ -1065,17 +1065,9 @@ struct damon_sysfs_region {
 	struct damon_addr_range ar;
 };
 
-static struct damon_sysfs_region *damon_sysfs_region_alloc(
-		struct damon_addr_range ar)
+static struct damon_sysfs_region *damon_sysfs_region_alloc(void)
 {
-	struct damon_sysfs_region *region = kmalloc(sizeof(*region),
-			GFP_KERNEL);
-
-	if (!region)
-		return NULL;
-	region->kobj = (struct kobject){};
-	region->ar = ar;
-	return region;
+	return kzalloc(sizeof(struct damon_sysfs_region), GFP_KERNEL);
 }
 
 static ssize_t start_show(struct kobject *kobj, struct kobj_attribute *attr,
@@ -1184,7 +1176,7 @@ static int damon_sysfs_regions_add_dirs(struct damon_sysfs_regions *regions,
 	regions->regions_arr = regions_arr;
 
 	for (i = 0; i < nr_regions; i++) {
-		region = damon_sysfs_region_alloc((struct damon_addr_range){});
+		region = damon_sysfs_region_alloc();
 		if (!region) {
 			damon_sysfs_regions_rm_dirs(regions);
 			return -ENOMEM;

From c5038d80cebc558ec97786181cd90ab5ce3fd823 Mon Sep 17 00:00:00 2001
From: SeongJae Park
Date: Wed, 26 Oct 2022 22:59:38 +0000
Subject: [PATCH 024/139] UPSTREAM: mm/damon/sysfs: move sysfs_lock to common
 module

The DAMON sysfs interface is implemented in a single file, sysfs.c,
which has about 2,800 lines of code.  As the interface is hierarchical
and some of the code can be reused by different hierarchies, it would
make more sense to split the implementation into common parts and
hierarchy-specific parts across multiple files.

As the beginning of that work, create files for the common code and
move the global mutex that protects directory modifications into the
new file.
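The locking convention at the call sites does not change, only the
lock's home does.  A typical store handler of this interface takes the
now-shared lock in the following pattern (a sketch based on handlers
appearing later in this series, e.g. nr_schemes_store()):

	if (!mutex_trylock(&damon_sysfs_lock))
		return -EBUSY;	/* bail out instead of sleeping in sysfs */
	/* ... add or remove kobject directories ... */
	mutex_unlock(&damon_sysfs_lock);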
Link: https://lkml.kernel.org/r/20221026225943.100429-8-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton (cherry picked from commit 39240595917ec0c4f71d7b9dd7909790715968b5) Bug: 300502883 Change-Id: I8e0eb32f2b90a907d917f684c35f2a321d944c0e Signed-off-by: cui yangpei --- mm/damon/Makefile | 2 +- mm/damon/sysfs-common.c | 11 +++++++++++ mm/damon/sysfs-common.h | 11 +++++++++++ mm/damon/sysfs.c | 4 +--- 4 files changed, 24 insertions(+), 4 deletions(-) create mode 100644 mm/damon/sysfs-common.c create mode 100644 mm/damon/sysfs-common.h diff --git a/mm/damon/Makefile b/mm/damon/Makefile index 3e6b8ad73858..f8d535a6253b 100644 --- a/mm/damon/Makefile +++ b/mm/damon/Makefile @@ -3,7 +3,7 @@ obj-y := core.o obj-$(CONFIG_DAMON_VADDR) += ops-common.o vaddr.o obj-$(CONFIG_DAMON_PADDR) += ops-common.o paddr.o -obj-$(CONFIG_DAMON_SYSFS) += sysfs.o +obj-$(CONFIG_DAMON_SYSFS) += sysfs-common.o sysfs.o obj-$(CONFIG_DAMON_DBGFS) += dbgfs.o obj-$(CONFIG_DAMON_RECLAIM) += reclaim.o obj-$(CONFIG_DAMON_LRU_SORT) += lru_sort.o diff --git a/mm/damon/sysfs-common.c b/mm/damon/sysfs-common.c new file mode 100644 index 000000000000..9dc743868d5b --- /dev/null +++ b/mm/damon/sysfs-common.c @@ -0,0 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Common Primitives for DAMON Sysfs Interface + * + * Author: SeongJae Park + */ + +#include "sysfs-common.h" + +DEFINE_MUTEX(damon_sysfs_lock); + diff --git a/mm/damon/sysfs-common.h b/mm/damon/sysfs-common.h new file mode 100644 index 000000000000..745a918b94f5 --- /dev/null +++ b/mm/damon/sysfs-common.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Common Primitives for DAMON Sysfs Interface + * + * Author: SeongJae Park + */ + +#include +#include + +extern struct mutex damon_sysfs_lock; diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index f3d7b34ea0ab..a847b9159718 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -5,13 +5,11 @@ * Copyright (c) 2022 SeongJae Park */ -#include -#include #include #include #include -static DEFINE_MUTEX(damon_sysfs_lock); +#include "sysfs-common.h" /* * unsigned long range directory From a45dff567c923884760f51e6b111697c6b92a52c Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 26 Oct 2022 22:59:39 +0000 Subject: [PATCH 025/139] UPSTREAM: mm/damon/sysfs: move unsigned long range directory to common module The implementation of unsigned long type range directories can be reused by multiple DAMON sysfs directories including those for DAMON-based Operation Schemes and the range of number of monitoring regions. Move the code into the files for DAMON sysfs common logics. 
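A client of the now-common helpers creates one min/max directory in the
pattern below; parent_kobj and the "sz" name are placeholders here, but
this is the same pattern damon_sysfs_access_pattern_add_range_dir() in a
later patch of this series follows:

	struct damon_sysfs_ul_range *range = damon_sysfs_ul_range_alloc(0, 0);
	int err;

	if (!range)
		return -ENOMEM;
	/* creates the sz/ directory with its min and max files */
	err = kobject_init_and_add(&range->kobj, &damon_sysfs_ul_range_ktype,
			parent_kobj, "sz");
	if (err)
		kobject_put(&range->kobj);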
Link: https://lkml.kernel.org/r/20221026225943.100429-9-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton (cherry picked from commit d332fe11debe69fee3de4c2d84fa0b6649678ad2) Bug: 300502883 Change-Id: I35f23cb5070fb3fb62a52fd2652e7f054d019201 Signed-off-by: cui yangpei --- mm/damon/sysfs-common.c | 96 ++++++++++++++++++++++++++++++++++++++ mm/damon/sysfs-common.h | 13 ++++++ mm/damon/sysfs.c | 100 ---------------------------------------- 3 files changed, 109 insertions(+), 100 deletions(-) diff --git a/mm/damon/sysfs-common.c b/mm/damon/sysfs-common.c index 9dc743868d5b..52bebf242f74 100644 --- a/mm/damon/sysfs-common.c +++ b/mm/damon/sysfs-common.c @@ -5,7 +5,103 @@ * Author: SeongJae Park */ +#include + #include "sysfs-common.h" DEFINE_MUTEX(damon_sysfs_lock); +/* + * unsigned long range directory + */ + +struct damon_sysfs_ul_range *damon_sysfs_ul_range_alloc( + unsigned long min, + unsigned long max) +{ + struct damon_sysfs_ul_range *range = kmalloc(sizeof(*range), + GFP_KERNEL); + + if (!range) + return NULL; + range->kobj = (struct kobject){}; + range->min = min; + range->max = max; + + return range; +} + +static ssize_t min_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_ul_range *range = container_of(kobj, + struct damon_sysfs_ul_range, kobj); + + return sysfs_emit(buf, "%lu\n", range->min); +} + +static ssize_t min_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct damon_sysfs_ul_range *range = container_of(kobj, + struct damon_sysfs_ul_range, kobj); + unsigned long min; + int err; + + err = kstrtoul(buf, 0, &min); + if (err) + return err; + + range->min = min; + return count; +} + +static ssize_t max_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_ul_range *range = container_of(kobj, + struct damon_sysfs_ul_range, kobj); + + return sysfs_emit(buf, "%lu\n", range->max); +} + +static ssize_t max_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct damon_sysfs_ul_range *range = container_of(kobj, + struct damon_sysfs_ul_range, kobj); + unsigned long max; + int err; + + err = kstrtoul(buf, 0, &max); + if (err) + return err; + + range->max = max; + return count; +} + +void damon_sysfs_ul_range_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_ul_range, kobj)); +} + +static struct kobj_attribute damon_sysfs_ul_range_min_attr = + __ATTR_RW_MODE(min, 0600); + +static struct kobj_attribute damon_sysfs_ul_range_max_attr = + __ATTR_RW_MODE(max, 0600); + +static struct attribute *damon_sysfs_ul_range_attrs[] = { + &damon_sysfs_ul_range_min_attr.attr, + &damon_sysfs_ul_range_max_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_ul_range); + +struct kobj_type damon_sysfs_ul_range_ktype = { + .release = damon_sysfs_ul_range_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_ul_range_groups, +}; + diff --git a/mm/damon/sysfs-common.h b/mm/damon/sysfs-common.h index 745a918b94f5..56e6a99e353b 100644 --- a/mm/damon/sysfs-common.h +++ b/mm/damon/sysfs-common.h @@ -9,3 +9,16 @@ #include extern struct mutex damon_sysfs_lock; + +struct damon_sysfs_ul_range { + struct kobject kobj; + unsigned long min; + unsigned long max; +}; + +struct damon_sysfs_ul_range *damon_sysfs_ul_range_alloc( + unsigned long min, + unsigned long max); +void damon_sysfs_ul_range_release(struct kobject *kobj); + +extern struct kobj_type damon_sysfs_ul_range_ktype; 
diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index a847b9159718..6774a669962e 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -11,106 +11,6 @@ #include "sysfs-common.h" -/* - * unsigned long range directory - */ - -struct damon_sysfs_ul_range { - struct kobject kobj; - unsigned long min; - unsigned long max; -}; - -static struct damon_sysfs_ul_range *damon_sysfs_ul_range_alloc( - unsigned long min, - unsigned long max) -{ - struct damon_sysfs_ul_range *range = kmalloc(sizeof(*range), - GFP_KERNEL); - - if (!range) - return NULL; - range->kobj = (struct kobject){}; - range->min = min; - range->max = max; - - return range; -} - -static ssize_t min_show(struct kobject *kobj, struct kobj_attribute *attr, - char *buf) -{ - struct damon_sysfs_ul_range *range = container_of(kobj, - struct damon_sysfs_ul_range, kobj); - - return sysfs_emit(buf, "%lu\n", range->min); -} - -static ssize_t min_store(struct kobject *kobj, struct kobj_attribute *attr, - const char *buf, size_t count) -{ - struct damon_sysfs_ul_range *range = container_of(kobj, - struct damon_sysfs_ul_range, kobj); - unsigned long min; - int err; - - err = kstrtoul(buf, 0, &min); - if (err) - return err; - - range->min = min; - return count; -} - -static ssize_t max_show(struct kobject *kobj, struct kobj_attribute *attr, - char *buf) -{ - struct damon_sysfs_ul_range *range = container_of(kobj, - struct damon_sysfs_ul_range, kobj); - - return sysfs_emit(buf, "%lu\n", range->max); -} - -static ssize_t max_store(struct kobject *kobj, struct kobj_attribute *attr, - const char *buf, size_t count) -{ - struct damon_sysfs_ul_range *range = container_of(kobj, - struct damon_sysfs_ul_range, kobj); - unsigned long max; - int err; - - err = kstrtoul(buf, 0, &max); - if (err) - return err; - - range->max = max; - return count; -} - -static void damon_sysfs_ul_range_release(struct kobject *kobj) -{ - kfree(container_of(kobj, struct damon_sysfs_ul_range, kobj)); -} - -static struct kobj_attribute damon_sysfs_ul_range_min_attr = - __ATTR_RW_MODE(min, 0600); - -static struct kobj_attribute damon_sysfs_ul_range_max_attr = - __ATTR_RW_MODE(max, 0600); - -static struct attribute *damon_sysfs_ul_range_attrs[] = { - &damon_sysfs_ul_range_min_attr.attr, - &damon_sysfs_ul_range_max_attr.attr, - NULL, -}; -ATTRIBUTE_GROUPS(damon_sysfs_ul_range); - -static struct kobj_type damon_sysfs_ul_range_ktype = { - .release = damon_sysfs_ul_range_release, - .sysfs_ops = &kobj_sysfs_ops, - .default_groups = damon_sysfs_ul_range_groups, -}; - /* * schemes/stats directory */ From 0b17df8a4fee5ed733973b5afc72240362a385ca Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 26 Oct 2022 22:59:40 +0000 Subject: [PATCH 026/139] UPSTREAM: mm/damon/sysfs: split out kdamond-independent schemes stats update logic into a new function 'damon_sysfs_schemes_update_stats()' is coupled with both damon_sysfs_kdamond and damon_sysfs_schemes. It's a wide range of types dependency. It makes splitting the logics a little bit distracting. Split the function so that each function is coupled with smaller range of types. 
Link: https://lkml.kernel.org/r/20221026225943.100429-10-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton (cherry picked from commit 4acd715ff57fd05a481c64d074db68f2cf5711aa) Bug: 300502883 Change-Id: I3b0ab08f44a5cd0e6ab4dc65f8db8024a26a461e Signed-off-by: cui yangpei --- mm/damon/sysfs.c | 37 ++++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 6774a669962e..836df19a7d86 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -2246,25 +2246,13 @@ static void damon_sysfs_before_terminate(struct damon_ctx *ctx) mutex_unlock(&ctx->kdamond_lock); } -/* - * damon_sysfs_upd_schemes_stats() - Update schemes stats sysfs files. - * @kdamond: The kobject wrapper that associated to the kdamond thread. - * - * This function reads the schemes stats of specific kdamond and update the - * related values for sysfs files. This function should be called from DAMON - * callbacks while holding ``damon_syfs_lock``, to safely access the DAMON - * contexts-internal data and DAMON sysfs variables. - */ -static int damon_sysfs_upd_schemes_stats(struct damon_sysfs_kdamond *kdamond) +static void damon_sysfs_schemes_update_stats( + struct damon_sysfs_schemes *sysfs_schemes, + struct damon_ctx *ctx) { - struct damon_ctx *ctx = kdamond->damon_ctx; - struct damon_sysfs_schemes *sysfs_schemes; struct damos *scheme; int schemes_idx = 0; - if (!ctx) - return -EINVAL; - sysfs_schemes = kdamond->contexts->contexts_arr[0]->schemes; damon_for_each_scheme(scheme, ctx) { struct damon_sysfs_stats *sysfs_stats; @@ -2279,6 +2267,25 @@ static int damon_sysfs_upd_schemes_stats(struct damon_sysfs_kdamond *kdamond) sysfs_stats->sz_applied = scheme->stat.sz_applied; sysfs_stats->qt_exceeds = scheme->stat.qt_exceeds; } +} + +/* + * damon_sysfs_upd_schemes_stats() - Update schemes stats sysfs files. + * @kdamond: The kobject wrapper that associated to the kdamond thread. + * + * This function reads the schemes stats of specific kdamond and update the + * related values for sysfs files. This function should be called from DAMON + * callbacks while holding ``damon_syfs_lock``, to safely access the DAMON + * contexts-internal data and DAMON sysfs variables. + */ +static int damon_sysfs_upd_schemes_stats(struct damon_sysfs_kdamond *kdamond) +{ + struct damon_ctx *ctx = kdamond->damon_ctx; + + if (!ctx) + return -EINVAL; + damon_sysfs_schemes_update_stats( + kdamond->contexts->contexts_arr[0]->schemes, ctx); return 0; } From 67ef7b0f42a143e2cc471777397c2f6571e5dddd Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 26 Oct 2022 22:59:41 +0000 Subject: [PATCH 027/139] UPSTREAM: mm/damon/sysfs: split out schemes directory implementation to separate file DAMON sysfs interface for 'schemes' directory is implemented using about one thousand lines of code. It has no strong dependency with other parts of its file, so split it out to another file for better code management. 
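After this patch, the DAMON sysfs implementation is spread over the
following files (per the Makefile change below):

	mm/damon/sysfs-common.c		- shared lock and the unsigned long
					  range directory
	mm/damon/sysfs-common.h		- declarations shared among the files
	mm/damon/sysfs-schemes.c	- the schemes/ directory (new in this
					  patch)
	mm/damon/sysfs.c		- the rest of the DAMON sysfs interface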
Link: https://lkml.kernel.org/r/20221026225943.100429-11-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton (cherry picked from commit c8e7b4d0ba348a8ef14956a80c780f152f433764) Bug: 300502883 Change-Id: I3a8f63dc4f6de6f4662b6f1a7d12afe1e25b36d2 Signed-off-by: cui yangpei --- mm/damon/Makefile | 2 +- mm/damon/sysfs-common.h | 22 + mm/damon/sysfs-schemes.c | 1068 ++++++++++++++++++++++++++++++++++++++ mm/damon/sysfs.c | 1064 ------------------------------------- 4 files changed, 1091 insertions(+), 1065 deletions(-) create mode 100644 mm/damon/sysfs-schemes.c diff --git a/mm/damon/Makefile b/mm/damon/Makefile index f8d535a6253b..1e86f5253d7f 100644 --- a/mm/damon/Makefile +++ b/mm/damon/Makefile @@ -3,7 +3,7 @@ obj-y := core.o obj-$(CONFIG_DAMON_VADDR) += ops-common.o vaddr.o obj-$(CONFIG_DAMON_PADDR) += ops-common.o paddr.o -obj-$(CONFIG_DAMON_SYSFS) += sysfs-common.o sysfs.o +obj-$(CONFIG_DAMON_SYSFS) += sysfs-common.o sysfs-schemes.o sysfs.o obj-$(CONFIG_DAMON_DBGFS) += dbgfs.o obj-$(CONFIG_DAMON_RECLAIM) += reclaim.o obj-$(CONFIG_DAMON_LRU_SORT) += lru_sort.o diff --git a/mm/damon/sysfs-common.h b/mm/damon/sysfs-common.h index 56e6a99e353b..4626b2784404 100644 --- a/mm/damon/sysfs-common.h +++ b/mm/damon/sysfs-common.h @@ -22,3 +22,25 @@ struct damon_sysfs_ul_range *damon_sysfs_ul_range_alloc( void damon_sysfs_ul_range_release(struct kobject *kobj); extern struct kobj_type damon_sysfs_ul_range_ktype; + +/* + * schemes directory + */ + +struct damon_sysfs_schemes { + struct kobject kobj; + struct damon_sysfs_scheme **schemes_arr; + int nr; +}; + +struct damon_sysfs_schemes *damon_sysfs_schemes_alloc(void); +void damon_sysfs_schemes_rm_dirs(struct damon_sysfs_schemes *schemes); + +extern struct kobj_type damon_sysfs_schemes_ktype; + +int damon_sysfs_set_schemes(struct damon_ctx *ctx, + struct damon_sysfs_schemes *sysfs_schemes); + +void damon_sysfs_schemes_update_stats( + struct damon_sysfs_schemes *sysfs_schemes, + struct damon_ctx *ctx); diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c new file mode 100644 index 000000000000..9509d5c1e7fc --- /dev/null +++ b/mm/damon/sysfs-schemes.c @@ -0,0 +1,1068 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * DAMON sysfs Interface + * + * Copyright (c) 2022 SeongJae Park + */ + +#include + +#include "sysfs-common.h" + +/* + * schemes/stats directory + */ + +struct damon_sysfs_stats { + struct kobject kobj; + unsigned long nr_tried; + unsigned long sz_tried; + unsigned long nr_applied; + unsigned long sz_applied; + unsigned long qt_exceeds; +}; + +static struct damon_sysfs_stats *damon_sysfs_stats_alloc(void) +{ + return kzalloc(sizeof(struct damon_sysfs_stats), GFP_KERNEL); +} + +static ssize_t nr_tried_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_stats *stats = container_of(kobj, + struct damon_sysfs_stats, kobj); + + return sysfs_emit(buf, "%lu\n", stats->nr_tried); +} + +static ssize_t sz_tried_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_stats *stats = container_of(kobj, + struct damon_sysfs_stats, kobj); + + return sysfs_emit(buf, "%lu\n", stats->sz_tried); +} + +static ssize_t nr_applied_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_stats *stats = container_of(kobj, + struct damon_sysfs_stats, kobj); + + return sysfs_emit(buf, "%lu\n", stats->nr_applied); +} + +static ssize_t sz_applied_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct 
damon_sysfs_stats *stats = container_of(kobj, + struct damon_sysfs_stats, kobj); + + return sysfs_emit(buf, "%lu\n", stats->sz_applied); +} + +static ssize_t qt_exceeds_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_stats *stats = container_of(kobj, + struct damon_sysfs_stats, kobj); + + return sysfs_emit(buf, "%lu\n", stats->qt_exceeds); +} + +static void damon_sysfs_stats_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_stats, kobj)); +} + +static struct kobj_attribute damon_sysfs_stats_nr_tried_attr = + __ATTR_RO_MODE(nr_tried, 0400); + +static struct kobj_attribute damon_sysfs_stats_sz_tried_attr = + __ATTR_RO_MODE(sz_tried, 0400); + +static struct kobj_attribute damon_sysfs_stats_nr_applied_attr = + __ATTR_RO_MODE(nr_applied, 0400); + +static struct kobj_attribute damon_sysfs_stats_sz_applied_attr = + __ATTR_RO_MODE(sz_applied, 0400); + +static struct kobj_attribute damon_sysfs_stats_qt_exceeds_attr = + __ATTR_RO_MODE(qt_exceeds, 0400); + +static struct attribute *damon_sysfs_stats_attrs[] = { + &damon_sysfs_stats_nr_tried_attr.attr, + &damon_sysfs_stats_sz_tried_attr.attr, + &damon_sysfs_stats_nr_applied_attr.attr, + &damon_sysfs_stats_sz_applied_attr.attr, + &damon_sysfs_stats_qt_exceeds_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_stats); + +static struct kobj_type damon_sysfs_stats_ktype = { + .release = damon_sysfs_stats_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_stats_groups, +}; + +/* + * watermarks directory + */ + +struct damon_sysfs_watermarks { + struct kobject kobj; + enum damos_wmark_metric metric; + unsigned long interval_us; + unsigned long high; + unsigned long mid; + unsigned long low; +}; + +static struct damon_sysfs_watermarks *damon_sysfs_watermarks_alloc( + enum damos_wmark_metric metric, unsigned long interval_us, + unsigned long high, unsigned long mid, unsigned long low) +{ + struct damon_sysfs_watermarks *watermarks = kmalloc( + sizeof(*watermarks), GFP_KERNEL); + + if (!watermarks) + return NULL; + watermarks->kobj = (struct kobject){}; + watermarks->metric = metric; + watermarks->interval_us = interval_us; + watermarks->high = high; + watermarks->mid = mid; + watermarks->low = low; + return watermarks; +} + +/* Should match with enum damos_wmark_metric */ +static const char * const damon_sysfs_wmark_metric_strs[] = { + "none", + "free_mem_rate", +}; + +static ssize_t metric_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + + return sysfs_emit(buf, "%s\n", + damon_sysfs_wmark_metric_strs[watermarks->metric]); +} + +static ssize_t metric_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + enum damos_wmark_metric metric; + + for (metric = 0; metric < NR_DAMOS_WMARK_METRICS; metric++) { + if (sysfs_streq(buf, damon_sysfs_wmark_metric_strs[metric])) { + watermarks->metric = metric; + return count; + } + } + return -EINVAL; +} + +static ssize_t interval_us_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + + return sysfs_emit(buf, "%lu\n", watermarks->interval_us); +} + +static ssize_t interval_us_store(struct kobject *kobj, + struct kobj_attribute *attr, 
const char *buf, size_t count) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + int err = kstrtoul(buf, 0, &watermarks->interval_us); + + return err ? err : count; +} + +static ssize_t high_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + + return sysfs_emit(buf, "%lu\n", watermarks->high); +} + +static ssize_t high_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + int err = kstrtoul(buf, 0, &watermarks->high); + + return err ? err : count; +} + +static ssize_t mid_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + + return sysfs_emit(buf, "%lu\n", watermarks->mid); +} + +static ssize_t mid_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + int err = kstrtoul(buf, 0, &watermarks->mid); + + return err ? err : count; +} + +static ssize_t low_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + + return sysfs_emit(buf, "%lu\n", watermarks->low); +} + +static ssize_t low_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + int err = kstrtoul(buf, 0, &watermarks->low); + + return err ? 
err : count; +} + +static void damon_sysfs_watermarks_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_watermarks, kobj)); +} + +static struct kobj_attribute damon_sysfs_watermarks_metric_attr = + __ATTR_RW_MODE(metric, 0600); + +static struct kobj_attribute damon_sysfs_watermarks_interval_us_attr = + __ATTR_RW_MODE(interval_us, 0600); + +static struct kobj_attribute damon_sysfs_watermarks_high_attr = + __ATTR_RW_MODE(high, 0600); + +static struct kobj_attribute damon_sysfs_watermarks_mid_attr = + __ATTR_RW_MODE(mid, 0600); + +static struct kobj_attribute damon_sysfs_watermarks_low_attr = + __ATTR_RW_MODE(low, 0600); + +static struct attribute *damon_sysfs_watermarks_attrs[] = { + &damon_sysfs_watermarks_metric_attr.attr, + &damon_sysfs_watermarks_interval_us_attr.attr, + &damon_sysfs_watermarks_high_attr.attr, + &damon_sysfs_watermarks_mid_attr.attr, + &damon_sysfs_watermarks_low_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_watermarks); + +static struct kobj_type damon_sysfs_watermarks_ktype = { + .release = damon_sysfs_watermarks_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_watermarks_groups, +}; + +/* + * scheme/weights directory + */ + +struct damon_sysfs_weights { + struct kobject kobj; + unsigned int sz; + unsigned int nr_accesses; + unsigned int age; +}; + +static struct damon_sysfs_weights *damon_sysfs_weights_alloc(unsigned int sz, + unsigned int nr_accesses, unsigned int age) +{ + struct damon_sysfs_weights *weights = kmalloc(sizeof(*weights), + GFP_KERNEL); + + if (!weights) + return NULL; + weights->kobj = (struct kobject){}; + weights->sz = sz; + weights->nr_accesses = nr_accesses; + weights->age = age; + return weights; +} + +static ssize_t sz_permil_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_weights *weights = container_of(kobj, + struct damon_sysfs_weights, kobj); + + return sysfs_emit(buf, "%u\n", weights->sz); +} + +static ssize_t sz_permil_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_weights *weights = container_of(kobj, + struct damon_sysfs_weights, kobj); + int err = kstrtouint(buf, 0, &weights->sz); + + return err ? err : count; +} + +static ssize_t nr_accesses_permil_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_weights *weights = container_of(kobj, + struct damon_sysfs_weights, kobj); + + return sysfs_emit(buf, "%u\n", weights->nr_accesses); +} + +static ssize_t nr_accesses_permil_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_weights *weights = container_of(kobj, + struct damon_sysfs_weights, kobj); + int err = kstrtouint(buf, 0, &weights->nr_accesses); + + return err ? err : count; +} + +static ssize_t age_permil_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_weights *weights = container_of(kobj, + struct damon_sysfs_weights, kobj); + + return sysfs_emit(buf, "%u\n", weights->age); +} + +static ssize_t age_permil_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_weights *weights = container_of(kobj, + struct damon_sysfs_weights, kobj); + int err = kstrtouint(buf, 0, &weights->age); + + return err ? 
err : count; +} + +static void damon_sysfs_weights_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_weights, kobj)); +} + +static struct kobj_attribute damon_sysfs_weights_sz_attr = + __ATTR_RW_MODE(sz_permil, 0600); + +static struct kobj_attribute damon_sysfs_weights_nr_accesses_attr = + __ATTR_RW_MODE(nr_accesses_permil, 0600); + +static struct kobj_attribute damon_sysfs_weights_age_attr = + __ATTR_RW_MODE(age_permil, 0600); + +static struct attribute *damon_sysfs_weights_attrs[] = { + &damon_sysfs_weights_sz_attr.attr, + &damon_sysfs_weights_nr_accesses_attr.attr, + &damon_sysfs_weights_age_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_weights); + +static struct kobj_type damon_sysfs_weights_ktype = { + .release = damon_sysfs_weights_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_weights_groups, +}; + +/* + * quotas directory + */ + +struct damon_sysfs_quotas { + struct kobject kobj; + struct damon_sysfs_weights *weights; + unsigned long ms; + unsigned long sz; + unsigned long reset_interval_ms; +}; + +static struct damon_sysfs_quotas *damon_sysfs_quotas_alloc(void) +{ + return kzalloc(sizeof(struct damon_sysfs_quotas), GFP_KERNEL); +} + +static int damon_sysfs_quotas_add_dirs(struct damon_sysfs_quotas *quotas) +{ + struct damon_sysfs_weights *weights; + int err; + + weights = damon_sysfs_weights_alloc(0, 0, 0); + if (!weights) + return -ENOMEM; + + err = kobject_init_and_add(&weights->kobj, &damon_sysfs_weights_ktype, + "as->kobj, "weights"); + if (err) + kobject_put(&weights->kobj); + else + quotas->weights = weights; + return err; +} + +static void damon_sysfs_quotas_rm_dirs(struct damon_sysfs_quotas *quotas) +{ + kobject_put("as->weights->kobj); +} + +static ssize_t ms_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_quotas *quotas = container_of(kobj, + struct damon_sysfs_quotas, kobj); + + return sysfs_emit(buf, "%lu\n", quotas->ms); +} + +static ssize_t ms_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct damon_sysfs_quotas *quotas = container_of(kobj, + struct damon_sysfs_quotas, kobj); + int err = kstrtoul(buf, 0, "as->ms); + + if (err) + return -EINVAL; + return count; +} + +static ssize_t bytes_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_quotas *quotas = container_of(kobj, + struct damon_sysfs_quotas, kobj); + + return sysfs_emit(buf, "%lu\n", quotas->sz); +} + +static ssize_t bytes_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_quotas *quotas = container_of(kobj, + struct damon_sysfs_quotas, kobj); + int err = kstrtoul(buf, 0, "as->sz); + + if (err) + return -EINVAL; + return count; +} + +static ssize_t reset_interval_ms_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_quotas *quotas = container_of(kobj, + struct damon_sysfs_quotas, kobj); + + return sysfs_emit(buf, "%lu\n", quotas->reset_interval_ms); +} + +static ssize_t reset_interval_ms_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_quotas *quotas = container_of(kobj, + struct damon_sysfs_quotas, kobj); + int err = kstrtoul(buf, 0, "as->reset_interval_ms); + + if (err) + return -EINVAL; + return count; +} + +static void damon_sysfs_quotas_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_quotas, kobj)); +} + 
+static struct kobj_attribute damon_sysfs_quotas_ms_attr = + __ATTR_RW_MODE(ms, 0600); + +static struct kobj_attribute damon_sysfs_quotas_sz_attr = + __ATTR_RW_MODE(bytes, 0600); + +static struct kobj_attribute damon_sysfs_quotas_reset_interval_ms_attr = + __ATTR_RW_MODE(reset_interval_ms, 0600); + +static struct attribute *damon_sysfs_quotas_attrs[] = { + &damon_sysfs_quotas_ms_attr.attr, + &damon_sysfs_quotas_sz_attr.attr, + &damon_sysfs_quotas_reset_interval_ms_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_quotas); + +static struct kobj_type damon_sysfs_quotas_ktype = { + .release = damon_sysfs_quotas_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_quotas_groups, +}; + +/* + * access_pattern directory + */ + +struct damon_sysfs_access_pattern { + struct kobject kobj; + struct damon_sysfs_ul_range *sz; + struct damon_sysfs_ul_range *nr_accesses; + struct damon_sysfs_ul_range *age; +}; + +static +struct damon_sysfs_access_pattern *damon_sysfs_access_pattern_alloc(void) +{ + struct damon_sysfs_access_pattern *access_pattern = + kmalloc(sizeof(*access_pattern), GFP_KERNEL); + + if (!access_pattern) + return NULL; + access_pattern->kobj = (struct kobject){}; + return access_pattern; +} + +static int damon_sysfs_access_pattern_add_range_dir( + struct damon_sysfs_access_pattern *access_pattern, + struct damon_sysfs_ul_range **range_dir_ptr, + char *name) +{ + struct damon_sysfs_ul_range *range = damon_sysfs_ul_range_alloc(0, 0); + int err; + + if (!range) + return -ENOMEM; + err = kobject_init_and_add(&range->kobj, &damon_sysfs_ul_range_ktype, + &access_pattern->kobj, name); + if (err) + kobject_put(&range->kobj); + else + *range_dir_ptr = range; + return err; +} + +static int damon_sysfs_access_pattern_add_dirs( + struct damon_sysfs_access_pattern *access_pattern) +{ + int err; + + err = damon_sysfs_access_pattern_add_range_dir(access_pattern, + &access_pattern->sz, "sz"); + if (err) + goto put_sz_out; + + err = damon_sysfs_access_pattern_add_range_dir(access_pattern, + &access_pattern->nr_accesses, "nr_accesses"); + if (err) + goto put_nr_accesses_sz_out; + + err = damon_sysfs_access_pattern_add_range_dir(access_pattern, + &access_pattern->age, "age"); + if (err) + goto put_age_nr_accesses_sz_out; + return 0; + +put_age_nr_accesses_sz_out: + kobject_put(&access_pattern->age->kobj); + access_pattern->age = NULL; +put_nr_accesses_sz_out: + kobject_put(&access_pattern->nr_accesses->kobj); + access_pattern->nr_accesses = NULL; +put_sz_out: + kobject_put(&access_pattern->sz->kobj); + access_pattern->sz = NULL; + return err; +} + +static void damon_sysfs_access_pattern_rm_dirs( + struct damon_sysfs_access_pattern *access_pattern) +{ + kobject_put(&access_pattern->sz->kobj); + kobject_put(&access_pattern->nr_accesses->kobj); + kobject_put(&access_pattern->age->kobj); +} + +static void damon_sysfs_access_pattern_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_access_pattern, kobj)); +} + +static struct attribute *damon_sysfs_access_pattern_attrs[] = { + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_access_pattern); + +static struct kobj_type damon_sysfs_access_pattern_ktype = { + .release = damon_sysfs_access_pattern_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_access_pattern_groups, +}; + +/* + * scheme directory + */ + +struct damon_sysfs_scheme { + struct kobject kobj; + enum damos_action action; + struct damon_sysfs_access_pattern *access_pattern; + struct damon_sysfs_quotas *quotas; + struct 
damon_sysfs_watermarks *watermarks;
+ struct damon_sysfs_stats *stats;
+};
+
+/* This should match with enum damos_action */
+static const char * const damon_sysfs_damos_action_strs[] = {
+ "willneed",
+ "cold",
+ "pageout",
+ "hugepage",
+ "nohugepage",
+ "lru_prio",
+ "lru_deprio",
+ "stat",
+};
+
+static struct damon_sysfs_scheme *damon_sysfs_scheme_alloc(
+ enum damos_action action)
+{
+ struct damon_sysfs_scheme *scheme = kmalloc(sizeof(*scheme),
+ GFP_KERNEL);
+
+ if (!scheme)
+ return NULL;
+ scheme->kobj = (struct kobject){};
+ scheme->action = action;
+ return scheme;
+}
+
+static int damon_sysfs_scheme_set_access_pattern(
+ struct damon_sysfs_scheme *scheme)
+{
+ struct damon_sysfs_access_pattern *access_pattern;
+ int err;
+
+ access_pattern = damon_sysfs_access_pattern_alloc();
+ if (!access_pattern)
+ return -ENOMEM;
+ err = kobject_init_and_add(&access_pattern->kobj,
+ &damon_sysfs_access_pattern_ktype, &scheme->kobj,
+ "access_pattern");
+ if (err)
+ goto out;
+ err = damon_sysfs_access_pattern_add_dirs(access_pattern);
+ if (err)
+ goto out;
+ scheme->access_pattern = access_pattern;
+ return 0;
+
+out:
+ kobject_put(&access_pattern->kobj);
+ return err;
+}
+
+static int damon_sysfs_scheme_set_quotas(struct damon_sysfs_scheme *scheme)
+{
+ struct damon_sysfs_quotas *quotas = damon_sysfs_quotas_alloc();
+ int err;
+
+ if (!quotas)
+ return -ENOMEM;
+ err = kobject_init_and_add(&quotas->kobj, &damon_sysfs_quotas_ktype,
+ &scheme->kobj, "quotas");
+ if (err)
+ goto out;
+ err = damon_sysfs_quotas_add_dirs(quotas);
+ if (err)
+ goto out;
+ scheme->quotas = quotas;
+ return 0;
+
+out:
+ kobject_put(&quotas->kobj);
+ return err;
+}
+
+static int damon_sysfs_scheme_set_watermarks(struct damon_sysfs_scheme *scheme)
+{
+ struct damon_sysfs_watermarks *watermarks =
+ damon_sysfs_watermarks_alloc(DAMOS_WMARK_NONE, 0, 0, 0, 0);
+ int err;
+
+ if (!watermarks)
+ return -ENOMEM;
+ err = kobject_init_and_add(&watermarks->kobj,
+ &damon_sysfs_watermarks_ktype, &scheme->kobj,
+ "watermarks");
+ if (err)
+ kobject_put(&watermarks->kobj);
+ else
+ scheme->watermarks = watermarks;
+ return err;
+}
+
+static int damon_sysfs_scheme_set_stats(struct damon_sysfs_scheme *scheme)
+{
+ struct damon_sysfs_stats *stats = damon_sysfs_stats_alloc();
+ int err;
+
+ if (!stats)
+ return -ENOMEM;
+ err = kobject_init_and_add(&stats->kobj, &damon_sysfs_stats_ktype,
+ &scheme->kobj, "stats");
+ if (err)
+ kobject_put(&stats->kobj);
+ else
+ scheme->stats = stats;
+ return err;
+}
+
+static int damon_sysfs_scheme_add_dirs(struct damon_sysfs_scheme *scheme)
+{
+ int err;
+
+ err = damon_sysfs_scheme_set_access_pattern(scheme);
+ if (err)
+ return err;
+ err = damon_sysfs_scheme_set_quotas(scheme);
+ if (err)
+ goto put_access_pattern_out;
+ err = damon_sysfs_scheme_set_watermarks(scheme);
+ if (err)
+ goto put_quotas_access_pattern_out;
+ err = damon_sysfs_scheme_set_stats(scheme);
+ if (err)
+ goto put_watermarks_quotas_access_pattern_out;
+ return 0;
+
+put_watermarks_quotas_access_pattern_out:
+ kobject_put(&scheme->watermarks->kobj);
+ scheme->watermarks = NULL;
+put_quotas_access_pattern_out:
+ kobject_put(&scheme->quotas->kobj);
+ scheme->quotas = NULL;
+put_access_pattern_out:
+ kobject_put(&scheme->access_pattern->kobj);
+ scheme->access_pattern = NULL;
+ return err;
+}
+
+static void damon_sysfs_scheme_rm_dirs(struct damon_sysfs_scheme *scheme)
+{
+ damon_sysfs_access_pattern_rm_dirs(scheme->access_pattern);
+ kobject_put(&scheme->access_pattern->kobj);
+ damon_sysfs_quotas_rm_dirs(scheme->quotas);
+
kobject_put(&scheme->quotas->kobj); + kobject_put(&scheme->watermarks->kobj); + kobject_put(&scheme->stats->kobj); +} + +static ssize_t action_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_scheme *scheme = container_of(kobj, + struct damon_sysfs_scheme, kobj); + + return sysfs_emit(buf, "%s\n", + damon_sysfs_damos_action_strs[scheme->action]); +} + +static ssize_t action_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct damon_sysfs_scheme *scheme = container_of(kobj, + struct damon_sysfs_scheme, kobj); + enum damos_action action; + + for (action = 0; action < NR_DAMOS_ACTIONS; action++) { + if (sysfs_streq(buf, damon_sysfs_damos_action_strs[action])) { + scheme->action = action; + return count; + } + } + return -EINVAL; +} + +static void damon_sysfs_scheme_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_scheme, kobj)); +} + +static struct kobj_attribute damon_sysfs_scheme_action_attr = + __ATTR_RW_MODE(action, 0600); + +static struct attribute *damon_sysfs_scheme_attrs[] = { + &damon_sysfs_scheme_action_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_scheme); + +static struct kobj_type damon_sysfs_scheme_ktype = { + .release = damon_sysfs_scheme_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_scheme_groups, +}; + +/* + * schemes directory + */ + +struct damon_sysfs_schemes *damon_sysfs_schemes_alloc(void) +{ + return kzalloc(sizeof(struct damon_sysfs_schemes), GFP_KERNEL); +} + +void damon_sysfs_schemes_rm_dirs(struct damon_sysfs_schemes *schemes) +{ + struct damon_sysfs_scheme **schemes_arr = schemes->schemes_arr; + int i; + + for (i = 0; i < schemes->nr; i++) { + damon_sysfs_scheme_rm_dirs(schemes_arr[i]); + kobject_put(&schemes_arr[i]->kobj); + } + schemes->nr = 0; + kfree(schemes_arr); + schemes->schemes_arr = NULL; +} + +static int damon_sysfs_schemes_add_dirs(struct damon_sysfs_schemes *schemes, + int nr_schemes) +{ + struct damon_sysfs_scheme **schemes_arr, *scheme; + int err, i; + + damon_sysfs_schemes_rm_dirs(schemes); + if (!nr_schemes) + return 0; + + schemes_arr = kmalloc_array(nr_schemes, sizeof(*schemes_arr), + GFP_KERNEL | __GFP_NOWARN); + if (!schemes_arr) + return -ENOMEM; + schemes->schemes_arr = schemes_arr; + + for (i = 0; i < nr_schemes; i++) { + scheme = damon_sysfs_scheme_alloc(DAMOS_STAT); + if (!scheme) { + damon_sysfs_schemes_rm_dirs(schemes); + return -ENOMEM; + } + + err = kobject_init_and_add(&scheme->kobj, + &damon_sysfs_scheme_ktype, &schemes->kobj, + "%d", i); + if (err) + goto out; + err = damon_sysfs_scheme_add_dirs(scheme); + if (err) + goto out; + + schemes_arr[i] = scheme; + schemes->nr++; + } + return 0; + +out: + damon_sysfs_schemes_rm_dirs(schemes); + kobject_put(&scheme->kobj); + return err; +} + +static ssize_t nr_schemes_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_schemes *schemes = container_of(kobj, + struct damon_sysfs_schemes, kobj); + + return sysfs_emit(buf, "%d\n", schemes->nr); +} + +static ssize_t nr_schemes_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_schemes *schemes; + int nr, err = kstrtoint(buf, 0, &nr); + + if (err) + return err; + if (nr < 0) + return -EINVAL; + + schemes = container_of(kobj, struct damon_sysfs_schemes, kobj); + + if (!mutex_trylock(&damon_sysfs_lock)) + return -EBUSY; + err = damon_sysfs_schemes_add_dirs(schemes, nr); + 
mutex_unlock(&damon_sysfs_lock);
+ if (err)
+ return err;
+ return count;
+}
+
+static void damon_sysfs_schemes_release(struct kobject *kobj)
+{
+ kfree(container_of(kobj, struct damon_sysfs_schemes, kobj));
+}
+
+static struct kobj_attribute damon_sysfs_schemes_nr_attr =
+ __ATTR_RW_MODE(nr_schemes, 0600);
+
+static struct attribute *damon_sysfs_schemes_attrs[] = {
+ &damon_sysfs_schemes_nr_attr.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(damon_sysfs_schemes);
+
+struct kobj_type damon_sysfs_schemes_ktype = {
+ .release = damon_sysfs_schemes_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = damon_sysfs_schemes_groups,
+};
+
+static struct damos *damon_sysfs_mk_scheme(
+ struct damon_sysfs_scheme *sysfs_scheme)
+{
+ struct damon_sysfs_access_pattern *access_pattern =
+ sysfs_scheme->access_pattern;
+ struct damon_sysfs_quotas *sysfs_quotas = sysfs_scheme->quotas;
+ struct damon_sysfs_weights *sysfs_weights = sysfs_quotas->weights;
+ struct damon_sysfs_watermarks *sysfs_wmarks = sysfs_scheme->watermarks;
+
+ struct damos_access_pattern pattern = {
+ .min_sz_region = access_pattern->sz->min,
+ .max_sz_region = access_pattern->sz->max,
+ .min_nr_accesses = access_pattern->nr_accesses->min,
+ .max_nr_accesses = access_pattern->nr_accesses->max,
+ .min_age_region = access_pattern->age->min,
+ .max_age_region = access_pattern->age->max,
+ };
+ struct damos_quota quota = {
+ .ms = sysfs_quotas->ms,
+ .sz = sysfs_quotas->sz,
+ .reset_interval = sysfs_quotas->reset_interval_ms,
+ .weight_sz = sysfs_weights->sz,
+ .weight_nr_accesses = sysfs_weights->nr_accesses,
+ .weight_age = sysfs_weights->age,
+ };
+ struct damos_watermarks wmarks = {
+ .metric = sysfs_wmarks->metric,
+ .interval = sysfs_wmarks->interval_us,
+ .high = sysfs_wmarks->high,
+ .mid = sysfs_wmarks->mid,
+ .low = sysfs_wmarks->low,
+ };
+
+ return damon_new_scheme(&pattern, sysfs_scheme->action, &quota,
+ &wmarks);
+}
+
+static void damon_sysfs_update_scheme(struct damos *scheme,
+ struct damon_sysfs_scheme *sysfs_scheme)
+{
+ struct damon_sysfs_access_pattern *access_pattern =
+ sysfs_scheme->access_pattern;
+ struct damon_sysfs_quotas *sysfs_quotas = sysfs_scheme->quotas;
+ struct damon_sysfs_weights *sysfs_weights = sysfs_quotas->weights;
+ struct damon_sysfs_watermarks *sysfs_wmarks = sysfs_scheme->watermarks;
+
+ scheme->pattern.min_sz_region = access_pattern->sz->min;
+ scheme->pattern.max_sz_region = access_pattern->sz->max;
+ scheme->pattern.min_nr_accesses = access_pattern->nr_accesses->min;
+ scheme->pattern.max_nr_accesses = access_pattern->nr_accesses->max;
+ scheme->pattern.min_age_region = access_pattern->age->min;
+ scheme->pattern.max_age_region = access_pattern->age->max;
+
+ scheme->action = sysfs_scheme->action;
+
+ scheme->quota.ms = sysfs_quotas->ms;
+ scheme->quota.sz = sysfs_quotas->sz;
+ scheme->quota.reset_interval = sysfs_quotas->reset_interval_ms;
+ scheme->quota.weight_sz = sysfs_weights->sz;
+ scheme->quota.weight_nr_accesses = sysfs_weights->nr_accesses;
+ scheme->quota.weight_age = sysfs_weights->age;
+
+ scheme->wmarks.metric = sysfs_wmarks->metric;
+ scheme->wmarks.interval = sysfs_wmarks->interval_us;
+ scheme->wmarks.high = sysfs_wmarks->high;
+ scheme->wmarks.mid = sysfs_wmarks->mid;
+ scheme->wmarks.low = sysfs_wmarks->low;
+}
+
+int damon_sysfs_set_schemes(struct damon_ctx *ctx,
+ struct damon_sysfs_schemes *sysfs_schemes)
+{
+ struct damos *scheme, *next;
+ int i = 0;
+
+ damon_for_each_scheme_safe(scheme, next, ctx) {
+ if (i < sysfs_schemes->nr)
+ damon_sysfs_update_scheme(scheme,
+
sysfs_schemes->schemes_arr[i]); + else + damon_destroy_scheme(scheme); + i++; + } + + for (; i < sysfs_schemes->nr; i++) { + struct damos *scheme, *next; + + scheme = damon_sysfs_mk_scheme(sysfs_schemes->schemes_arr[i]); + if (!scheme) { + damon_for_each_scheme_safe(scheme, next, ctx) + damon_destroy_scheme(scheme); + return -ENOMEM; + } + damon_add_scheme(ctx, scheme); + } + return 0; +} + +void damon_sysfs_schemes_update_stats( + struct damon_sysfs_schemes *sysfs_schemes, + struct damon_ctx *ctx) +{ + struct damos *scheme; + int schemes_idx = 0; + + damon_for_each_scheme(scheme, ctx) { + struct damon_sysfs_stats *sysfs_stats; + + /* user could have removed the scheme sysfs dir */ + if (schemes_idx >= sysfs_schemes->nr) + break; + + sysfs_stats = sysfs_schemes->schemes_arr[schemes_idx++]->stats; + sysfs_stats->nr_tried = scheme->stat.nr_tried; + sysfs_stats->sz_tried = scheme->stat.sz_tried; + sysfs_stats->nr_applied = scheme->stat.nr_applied; + sysfs_stats->sz_applied = scheme->stat.sz_applied; + sysfs_stats->qt_exceeds = scheme->stat.qt_exceeds; + } +} diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 836df19a7d86..284daf274b3e 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -11,949 +11,6 @@ #include "sysfs-common.h" -/* - * schemes/stats directory - */ - -struct damon_sysfs_stats { - struct kobject kobj; - unsigned long nr_tried; - unsigned long sz_tried; - unsigned long nr_applied; - unsigned long sz_applied; - unsigned long qt_exceeds; -}; - -static struct damon_sysfs_stats *damon_sysfs_stats_alloc(void) -{ - return kzalloc(sizeof(struct damon_sysfs_stats), GFP_KERNEL); -} - -static ssize_t nr_tried_show(struct kobject *kobj, struct kobj_attribute *attr, - char *buf) -{ - struct damon_sysfs_stats *stats = container_of(kobj, - struct damon_sysfs_stats, kobj); - - return sysfs_emit(buf, "%lu\n", stats->nr_tried); -} - -static ssize_t sz_tried_show(struct kobject *kobj, struct kobj_attribute *attr, - char *buf) -{ - struct damon_sysfs_stats *stats = container_of(kobj, - struct damon_sysfs_stats, kobj); - - return sysfs_emit(buf, "%lu\n", stats->sz_tried); -} - -static ssize_t nr_applied_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct damon_sysfs_stats *stats = container_of(kobj, - struct damon_sysfs_stats, kobj); - - return sysfs_emit(buf, "%lu\n", stats->nr_applied); -} - -static ssize_t sz_applied_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct damon_sysfs_stats *stats = container_of(kobj, - struct damon_sysfs_stats, kobj); - - return sysfs_emit(buf, "%lu\n", stats->sz_applied); -} - -static ssize_t qt_exceeds_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct damon_sysfs_stats *stats = container_of(kobj, - struct damon_sysfs_stats, kobj); - - return sysfs_emit(buf, "%lu\n", stats->qt_exceeds); -} - -static void damon_sysfs_stats_release(struct kobject *kobj) -{ - kfree(container_of(kobj, struct damon_sysfs_stats, kobj)); -} - -static struct kobj_attribute damon_sysfs_stats_nr_tried_attr = - __ATTR_RO_MODE(nr_tried, 0400); - -static struct kobj_attribute damon_sysfs_stats_sz_tried_attr = - __ATTR_RO_MODE(sz_tried, 0400); - -static struct kobj_attribute damon_sysfs_stats_nr_applied_attr = - __ATTR_RO_MODE(nr_applied, 0400); - -static struct kobj_attribute damon_sysfs_stats_sz_applied_attr = - __ATTR_RO_MODE(sz_applied, 0400); - -static struct kobj_attribute damon_sysfs_stats_qt_exceeds_attr = - __ATTR_RO_MODE(qt_exceeds, 0400); - -static struct attribute 
*damon_sysfs_stats_attrs[] = { - &damon_sysfs_stats_nr_tried_attr.attr, - &damon_sysfs_stats_sz_tried_attr.attr, - &damon_sysfs_stats_nr_applied_attr.attr, - &damon_sysfs_stats_sz_applied_attr.attr, - &damon_sysfs_stats_qt_exceeds_attr.attr, - NULL, -}; -ATTRIBUTE_GROUPS(damon_sysfs_stats); - -static struct kobj_type damon_sysfs_stats_ktype = { - .release = damon_sysfs_stats_release, - .sysfs_ops = &kobj_sysfs_ops, - .default_groups = damon_sysfs_stats_groups, -}; - -/* - * watermarks directory - */ - -struct damon_sysfs_watermarks { - struct kobject kobj; - enum damos_wmark_metric metric; - unsigned long interval_us; - unsigned long high; - unsigned long mid; - unsigned long low; -}; - -static struct damon_sysfs_watermarks *damon_sysfs_watermarks_alloc( - enum damos_wmark_metric metric, unsigned long interval_us, - unsigned long high, unsigned long mid, unsigned long low) -{ - struct damon_sysfs_watermarks *watermarks = kmalloc( - sizeof(*watermarks), GFP_KERNEL); - - if (!watermarks) - return NULL; - watermarks->kobj = (struct kobject){}; - watermarks->metric = metric; - watermarks->interval_us = interval_us; - watermarks->high = high; - watermarks->mid = mid; - watermarks->low = low; - return watermarks; -} - -/* Should match with enum damos_wmark_metric */ -static const char * const damon_sysfs_wmark_metric_strs[] = { - "none", - "free_mem_rate", -}; - -static ssize_t metric_show(struct kobject *kobj, struct kobj_attribute *attr, - char *buf) -{ - struct damon_sysfs_watermarks *watermarks = container_of(kobj, - struct damon_sysfs_watermarks, kobj); - - return sysfs_emit(buf, "%s\n", - damon_sysfs_wmark_metric_strs[watermarks->metric]); -} - -static ssize_t metric_store(struct kobject *kobj, struct kobj_attribute *attr, - const char *buf, size_t count) -{ - struct damon_sysfs_watermarks *watermarks = container_of(kobj, - struct damon_sysfs_watermarks, kobj); - enum damos_wmark_metric metric; - - for (metric = 0; metric < NR_DAMOS_WMARK_METRICS; metric++) { - if (sysfs_streq(buf, damon_sysfs_wmark_metric_strs[metric])) { - watermarks->metric = metric; - return count; - } - } - return -EINVAL; -} - -static ssize_t interval_us_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct damon_sysfs_watermarks *watermarks = container_of(kobj, - struct damon_sysfs_watermarks, kobj); - - return sysfs_emit(buf, "%lu\n", watermarks->interval_us); -} - -static ssize_t interval_us_store(struct kobject *kobj, - struct kobj_attribute *attr, const char *buf, size_t count) -{ - struct damon_sysfs_watermarks *watermarks = container_of(kobj, - struct damon_sysfs_watermarks, kobj); - int err = kstrtoul(buf, 0, &watermarks->interval_us); - - return err ? err : count; -} - -static ssize_t high_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct damon_sysfs_watermarks *watermarks = container_of(kobj, - struct damon_sysfs_watermarks, kobj); - - return sysfs_emit(buf, "%lu\n", watermarks->high); -} - -static ssize_t high_store(struct kobject *kobj, - struct kobj_attribute *attr, const char *buf, size_t count) -{ - struct damon_sysfs_watermarks *watermarks = container_of(kobj, - struct damon_sysfs_watermarks, kobj); - int err = kstrtoul(buf, 0, &watermarks->high); - - return err ? 
err : count; -} - -static ssize_t mid_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct damon_sysfs_watermarks *watermarks = container_of(kobj, - struct damon_sysfs_watermarks, kobj); - - return sysfs_emit(buf, "%lu\n", watermarks->mid); -} - -static ssize_t mid_store(struct kobject *kobj, - struct kobj_attribute *attr, const char *buf, size_t count) -{ - struct damon_sysfs_watermarks *watermarks = container_of(kobj, - struct damon_sysfs_watermarks, kobj); - int err = kstrtoul(buf, 0, &watermarks->mid); - - return err ? err : count; -} - -static ssize_t low_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct damon_sysfs_watermarks *watermarks = container_of(kobj, - struct damon_sysfs_watermarks, kobj); - - return sysfs_emit(buf, "%lu\n", watermarks->low); -} - -static ssize_t low_store(struct kobject *kobj, - struct kobj_attribute *attr, const char *buf, size_t count) -{ - struct damon_sysfs_watermarks *watermarks = container_of(kobj, - struct damon_sysfs_watermarks, kobj); - int err = kstrtoul(buf, 0, &watermarks->low); - - return err ? err : count; -} - -static void damon_sysfs_watermarks_release(struct kobject *kobj) -{ - kfree(container_of(kobj, struct damon_sysfs_watermarks, kobj)); -} - -static struct kobj_attribute damon_sysfs_watermarks_metric_attr = - __ATTR_RW_MODE(metric, 0600); - -static struct kobj_attribute damon_sysfs_watermarks_interval_us_attr = - __ATTR_RW_MODE(interval_us, 0600); - -static struct kobj_attribute damon_sysfs_watermarks_high_attr = - __ATTR_RW_MODE(high, 0600); - -static struct kobj_attribute damon_sysfs_watermarks_mid_attr = - __ATTR_RW_MODE(mid, 0600); - -static struct kobj_attribute damon_sysfs_watermarks_low_attr = - __ATTR_RW_MODE(low, 0600); - -static struct attribute *damon_sysfs_watermarks_attrs[] = { - &damon_sysfs_watermarks_metric_attr.attr, - &damon_sysfs_watermarks_interval_us_attr.attr, - &damon_sysfs_watermarks_high_attr.attr, - &damon_sysfs_watermarks_mid_attr.attr, - &damon_sysfs_watermarks_low_attr.attr, - NULL, -}; -ATTRIBUTE_GROUPS(damon_sysfs_watermarks); - -static struct kobj_type damon_sysfs_watermarks_ktype = { - .release = damon_sysfs_watermarks_release, - .sysfs_ops = &kobj_sysfs_ops, - .default_groups = damon_sysfs_watermarks_groups, -}; - -/* - * scheme/weights directory - */ - -struct damon_sysfs_weights { - struct kobject kobj; - unsigned int sz; - unsigned int nr_accesses; - unsigned int age; -}; - -static struct damon_sysfs_weights *damon_sysfs_weights_alloc(unsigned int sz, - unsigned int nr_accesses, unsigned int age) -{ - struct damon_sysfs_weights *weights = kmalloc(sizeof(*weights), - GFP_KERNEL); - - if (!weights) - return NULL; - weights->kobj = (struct kobject){}; - weights->sz = sz; - weights->nr_accesses = nr_accesses; - weights->age = age; - return weights; -} - -static ssize_t sz_permil_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct damon_sysfs_weights *weights = container_of(kobj, - struct damon_sysfs_weights, kobj); - - return sysfs_emit(buf, "%u\n", weights->sz); -} - -static ssize_t sz_permil_store(struct kobject *kobj, - struct kobj_attribute *attr, const char *buf, size_t count) -{ - struct damon_sysfs_weights *weights = container_of(kobj, - struct damon_sysfs_weights, kobj); - int err = kstrtouint(buf, 0, &weights->sz); - - return err ? 
err : count;
-}
-
-static ssize_t nr_accesses_permil_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
-{
- struct damon_sysfs_weights *weights = container_of(kobj,
- struct damon_sysfs_weights, kobj);
-
- return sysfs_emit(buf, "%u\n", weights->nr_accesses);
-}
-
-static ssize_t nr_accesses_permil_store(struct kobject *kobj,
- struct kobj_attribute *attr, const char *buf, size_t count)
-{
- struct damon_sysfs_weights *weights = container_of(kobj,
- struct damon_sysfs_weights, kobj);
- int err = kstrtouint(buf, 0, &weights->nr_accesses);
-
- return err ? err : count;
-}
-
-static ssize_t age_permil_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
-{
- struct damon_sysfs_weights *weights = container_of(kobj,
- struct damon_sysfs_weights, kobj);
-
- return sysfs_emit(buf, "%u\n", weights->age);
-}
-
-static ssize_t age_permil_store(struct kobject *kobj,
- struct kobj_attribute *attr, const char *buf, size_t count)
-{
- struct damon_sysfs_weights *weights = container_of(kobj,
- struct damon_sysfs_weights, kobj);
- int err = kstrtouint(buf, 0, &weights->age);
-
- return err ? err : count;
-}
-
-static void damon_sysfs_weights_release(struct kobject *kobj)
-{
- kfree(container_of(kobj, struct damon_sysfs_weights, kobj));
-}
-
-static struct kobj_attribute damon_sysfs_weights_sz_attr =
- __ATTR_RW_MODE(sz_permil, 0600);
-
-static struct kobj_attribute damon_sysfs_weights_nr_accesses_attr =
- __ATTR_RW_MODE(nr_accesses_permil, 0600);
-
-static struct kobj_attribute damon_sysfs_weights_age_attr =
- __ATTR_RW_MODE(age_permil, 0600);
-
-static struct attribute *damon_sysfs_weights_attrs[] = {
- &damon_sysfs_weights_sz_attr.attr,
- &damon_sysfs_weights_nr_accesses_attr.attr,
- &damon_sysfs_weights_age_attr.attr,
- NULL,
-};
-ATTRIBUTE_GROUPS(damon_sysfs_weights);
-
-static struct kobj_type damon_sysfs_weights_ktype = {
- .release = damon_sysfs_weights_release,
- .sysfs_ops = &kobj_sysfs_ops,
- .default_groups = damon_sysfs_weights_groups,
-};
-
-/*
- * quotas directory
- */
-
-struct damon_sysfs_quotas {
- struct kobject kobj;
- struct damon_sysfs_weights *weights;
- unsigned long ms;
- unsigned long sz;
- unsigned long reset_interval_ms;
-};
-
-static struct damon_sysfs_quotas *damon_sysfs_quotas_alloc(void)
-{
- return kzalloc(sizeof(struct damon_sysfs_quotas), GFP_KERNEL);
-}
-
-static int damon_sysfs_quotas_add_dirs(struct damon_sysfs_quotas *quotas)
-{
- struct damon_sysfs_weights *weights;
- int err;
-
- weights = damon_sysfs_weights_alloc(0, 0, 0);
- if (!weights)
- return -ENOMEM;
-
- err = kobject_init_and_add(&weights->kobj, &damon_sysfs_weights_ktype,
- &quotas->kobj, "weights");
- if (err)
- kobject_put(&weights->kobj);
- else
- quotas->weights = weights;
- return err;
-}
-
-static void damon_sysfs_quotas_rm_dirs(struct damon_sysfs_quotas *quotas)
-{
- kobject_put(&quotas->weights->kobj);
-}
-
-static ssize_t ms_show(struct kobject *kobj, struct kobj_attribute *attr,
- char *buf)
-{
- struct damon_sysfs_quotas *quotas = container_of(kobj,
- struct damon_sysfs_quotas, kobj);
-
- return sysfs_emit(buf, "%lu\n", quotas->ms);
-}
-
-static ssize_t ms_store(struct kobject *kobj, struct kobj_attribute *attr,
- const char *buf, size_t count)
-{
- struct damon_sysfs_quotas *quotas = container_of(kobj,
- struct damon_sysfs_quotas, kobj);
- int err = kstrtoul(buf, 0, &quotas->ms);
-
- if (err)
- return -EINVAL;
- return count;
-}
-
-static ssize_t bytes_show(struct kobject *kobj, struct kobj_attribute *attr,
- char *buf)
-{
- struct damon_sysfs_quotas *quotas =
container_of(kobj,
- struct damon_sysfs_quotas, kobj);
-
- return sysfs_emit(buf, "%lu\n", quotas->sz);
-}
-
-static ssize_t bytes_store(struct kobject *kobj,
- struct kobj_attribute *attr, const char *buf, size_t count)
-{
- struct damon_sysfs_quotas *quotas = container_of(kobj,
- struct damon_sysfs_quotas, kobj);
- int err = kstrtoul(buf, 0, &quotas->sz);
-
- if (err)
- return -EINVAL;
- return count;
-}
-
-static ssize_t reset_interval_ms_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
-{
- struct damon_sysfs_quotas *quotas = container_of(kobj,
- struct damon_sysfs_quotas, kobj);
-
- return sysfs_emit(buf, "%lu\n", quotas->reset_interval_ms);
-}
-
-static ssize_t reset_interval_ms_store(struct kobject *kobj,
- struct kobj_attribute *attr, const char *buf, size_t count)
-{
- struct damon_sysfs_quotas *quotas = container_of(kobj,
- struct damon_sysfs_quotas, kobj);
- int err = kstrtoul(buf, 0, &quotas->reset_interval_ms);
-
- if (err)
- return -EINVAL;
- return count;
-}
-
-static void damon_sysfs_quotas_release(struct kobject *kobj)
-{
- kfree(container_of(kobj, struct damon_sysfs_quotas, kobj));
-}
-
-static struct kobj_attribute damon_sysfs_quotas_ms_attr =
- __ATTR_RW_MODE(ms, 0600);
-
-static struct kobj_attribute damon_sysfs_quotas_sz_attr =
- __ATTR_RW_MODE(bytes, 0600);
-
-static struct kobj_attribute damon_sysfs_quotas_reset_interval_ms_attr =
- __ATTR_RW_MODE(reset_interval_ms, 0600);
-
-static struct attribute *damon_sysfs_quotas_attrs[] = {
- &damon_sysfs_quotas_ms_attr.attr,
- &damon_sysfs_quotas_sz_attr.attr,
- &damon_sysfs_quotas_reset_interval_ms_attr.attr,
- NULL,
-};
-ATTRIBUTE_GROUPS(damon_sysfs_quotas);
-
-static struct kobj_type damon_sysfs_quotas_ktype = {
- .release = damon_sysfs_quotas_release,
- .sysfs_ops = &kobj_sysfs_ops,
- .default_groups = damon_sysfs_quotas_groups,
-};
-
-/*
- * access_pattern directory
- */
-
-struct damon_sysfs_access_pattern {
- struct kobject kobj;
- struct damon_sysfs_ul_range *sz;
- struct damon_sysfs_ul_range *nr_accesses;
- struct damon_sysfs_ul_range *age;
-};
-
-static
-struct damon_sysfs_access_pattern *damon_sysfs_access_pattern_alloc(void)
-{
- struct damon_sysfs_access_pattern *access_pattern =
- kmalloc(sizeof(*access_pattern), GFP_KERNEL);
-
- if (!access_pattern)
- return NULL;
- access_pattern->kobj = (struct kobject){};
- return access_pattern;
-}
-
-static int damon_sysfs_access_pattern_add_range_dir(
- struct damon_sysfs_access_pattern *access_pattern,
- struct damon_sysfs_ul_range **range_dir_ptr,
- char *name)
-{
- struct damon_sysfs_ul_range *range = damon_sysfs_ul_range_alloc(0, 0);
- int err;
-
- if (!range)
- return -ENOMEM;
- err = kobject_init_and_add(&range->kobj, &damon_sysfs_ul_range_ktype,
- &access_pattern->kobj, name);
- if (err)
- kobject_put(&range->kobj);
- else
- *range_dir_ptr = range;
- return err;
-}
-
-static int damon_sysfs_access_pattern_add_dirs(
- struct damon_sysfs_access_pattern *access_pattern)
-{
- int err;
-
- err = damon_sysfs_access_pattern_add_range_dir(access_pattern,
- &access_pattern->sz, "sz");
- if (err)
- goto put_sz_out;
-
- err = damon_sysfs_access_pattern_add_range_dir(access_pattern,
- &access_pattern->nr_accesses, "nr_accesses");
- if (err)
- goto put_nr_accesses_sz_out;
-
- err = damon_sysfs_access_pattern_add_range_dir(access_pattern,
- &access_pattern->age, "age");
- if (err)
- goto put_age_nr_accesses_sz_out;
- return 0;
-
-put_age_nr_accesses_sz_out:
- kobject_put(&access_pattern->age->kobj);
- access_pattern->age = NULL;
-put_nr_accesses_sz_out:
-
kobject_put(&access_pattern->nr_accesses->kobj);
- access_pattern->nr_accesses = NULL;
-put_sz_out:
- kobject_put(&access_pattern->sz->kobj);
- access_pattern->sz = NULL;
- return err;
-}
-
-static void damon_sysfs_access_pattern_rm_dirs(
- struct damon_sysfs_access_pattern *access_pattern)
-{
- kobject_put(&access_pattern->sz->kobj);
- kobject_put(&access_pattern->nr_accesses->kobj);
- kobject_put(&access_pattern->age->kobj);
-}
-
-static void damon_sysfs_access_pattern_release(struct kobject *kobj)
-{
- kfree(container_of(kobj, struct damon_sysfs_access_pattern, kobj));
-}
-
-static struct attribute *damon_sysfs_access_pattern_attrs[] = {
- NULL,
-};
-ATTRIBUTE_GROUPS(damon_sysfs_access_pattern);
-
-static struct kobj_type damon_sysfs_access_pattern_ktype = {
- .release = damon_sysfs_access_pattern_release,
- .sysfs_ops = &kobj_sysfs_ops,
- .default_groups = damon_sysfs_access_pattern_groups,
-};
-
-/*
- * scheme directory
- */
-
-struct damon_sysfs_scheme {
- struct kobject kobj;
- enum damos_action action;
- struct damon_sysfs_access_pattern *access_pattern;
- struct damon_sysfs_quotas *quotas;
- struct damon_sysfs_watermarks *watermarks;
- struct damon_sysfs_stats *stats;
-};
-
-/* This should match with enum damos_action */
-static const char * const damon_sysfs_damos_action_strs[] = {
- "willneed",
- "cold",
- "pageout",
- "hugepage",
- "nohugepage",
- "lru_prio",
- "lru_deprio",
- "stat",
-};
-
-static struct damon_sysfs_scheme *damon_sysfs_scheme_alloc(
- enum damos_action action)
-{
- struct damon_sysfs_scheme *scheme = kmalloc(sizeof(*scheme),
- GFP_KERNEL);
-
- if (!scheme)
- return NULL;
- scheme->kobj = (struct kobject){};
- scheme->action = action;
- return scheme;
-}
-
-static int damon_sysfs_scheme_set_access_pattern(
- struct damon_sysfs_scheme *scheme)
-{
- struct damon_sysfs_access_pattern *access_pattern;
- int err;
-
- access_pattern = damon_sysfs_access_pattern_alloc();
- if (!access_pattern)
- return -ENOMEM;
- err = kobject_init_and_add(&access_pattern->kobj,
- &damon_sysfs_access_pattern_ktype, &scheme->kobj,
- "access_pattern");
- if (err)
- goto out;
- err = damon_sysfs_access_pattern_add_dirs(access_pattern);
- if (err)
- goto out;
- scheme->access_pattern = access_pattern;
- return 0;
-
-out:
- kobject_put(&access_pattern->kobj);
- return err;
-}
-
-static int damon_sysfs_scheme_set_quotas(struct damon_sysfs_scheme *scheme)
-{
- struct damon_sysfs_quotas *quotas = damon_sysfs_quotas_alloc();
- int err;
-
- if (!quotas)
- return -ENOMEM;
- err = kobject_init_and_add(&quotas->kobj, &damon_sysfs_quotas_ktype,
- &scheme->kobj, "quotas");
- if (err)
- goto out;
- err = damon_sysfs_quotas_add_dirs(quotas);
- if (err)
- goto out;
- scheme->quotas = quotas;
- return 0;
-
-out:
- kobject_put(&quotas->kobj);
- return err;
-}
-
-static int damon_sysfs_scheme_set_watermarks(struct damon_sysfs_scheme *scheme)
-{
- struct damon_sysfs_watermarks *watermarks =
- damon_sysfs_watermarks_alloc(DAMOS_WMARK_NONE, 0, 0, 0, 0);
- int err;
-
- if (!watermarks)
- return -ENOMEM;
- err = kobject_init_and_add(&watermarks->kobj,
- &damon_sysfs_watermarks_ktype, &scheme->kobj,
- "watermarks");
- if (err)
- kobject_put(&watermarks->kobj);
- else
- scheme->watermarks = watermarks;
- return err;
-}
-
-static int damon_sysfs_scheme_set_stats(struct damon_sysfs_scheme *scheme)
-{
- struct damon_sysfs_stats *stats = damon_sysfs_stats_alloc();
- int err;
-
- if (!stats)
- return -ENOMEM;
- err = kobject_init_and_add(&stats->kobj, &damon_sysfs_stats_ktype,
- &scheme->kobj, "stats");
- if (err)
-
kobject_put(&stats->kobj); - else - scheme->stats = stats; - return err; -} - -static int damon_sysfs_scheme_add_dirs(struct damon_sysfs_scheme *scheme) -{ - int err; - - err = damon_sysfs_scheme_set_access_pattern(scheme); - if (err) - return err; - err = damon_sysfs_scheme_set_quotas(scheme); - if (err) - goto put_access_pattern_out; - err = damon_sysfs_scheme_set_watermarks(scheme); - if (err) - goto put_quotas_access_pattern_out; - err = damon_sysfs_scheme_set_stats(scheme); - if (err) - goto put_watermarks_quotas_access_pattern_out; - return 0; - -put_watermarks_quotas_access_pattern_out: - kobject_put(&scheme->watermarks->kobj); - scheme->watermarks = NULL; -put_quotas_access_pattern_out: - kobject_put(&scheme->quotas->kobj); - scheme->quotas = NULL; -put_access_pattern_out: - kobject_put(&scheme->access_pattern->kobj); - scheme->access_pattern = NULL; - return err; -} - -static void damon_sysfs_scheme_rm_dirs(struct damon_sysfs_scheme *scheme) -{ - damon_sysfs_access_pattern_rm_dirs(scheme->access_pattern); - kobject_put(&scheme->access_pattern->kobj); - damon_sysfs_quotas_rm_dirs(scheme->quotas); - kobject_put(&scheme->quotas->kobj); - kobject_put(&scheme->watermarks->kobj); - kobject_put(&scheme->stats->kobj); -} - -static ssize_t action_show(struct kobject *kobj, struct kobj_attribute *attr, - char *buf) -{ - struct damon_sysfs_scheme *scheme = container_of(kobj, - struct damon_sysfs_scheme, kobj); - - return sysfs_emit(buf, "%s\n", - damon_sysfs_damos_action_strs[scheme->action]); -} - -static ssize_t action_store(struct kobject *kobj, struct kobj_attribute *attr, - const char *buf, size_t count) -{ - struct damon_sysfs_scheme *scheme = container_of(kobj, - struct damon_sysfs_scheme, kobj); - enum damos_action action; - - for (action = 0; action < NR_DAMOS_ACTIONS; action++) { - if (sysfs_streq(buf, damon_sysfs_damos_action_strs[action])) { - scheme->action = action; - return count; - } - } - return -EINVAL; -} - -static void damon_sysfs_scheme_release(struct kobject *kobj) -{ - kfree(container_of(kobj, struct damon_sysfs_scheme, kobj)); -} - -static struct kobj_attribute damon_sysfs_scheme_action_attr = - __ATTR_RW_MODE(action, 0600); - -static struct attribute *damon_sysfs_scheme_attrs[] = { - &damon_sysfs_scheme_action_attr.attr, - NULL, -}; -ATTRIBUTE_GROUPS(damon_sysfs_scheme); - -static struct kobj_type damon_sysfs_scheme_ktype = { - .release = damon_sysfs_scheme_release, - .sysfs_ops = &kobj_sysfs_ops, - .default_groups = damon_sysfs_scheme_groups, -}; - -/* - * schemes directory - */ - -struct damon_sysfs_schemes { - struct kobject kobj; - struct damon_sysfs_scheme **schemes_arr; - int nr; -}; - -static struct damon_sysfs_schemes *damon_sysfs_schemes_alloc(void) -{ - return kzalloc(sizeof(struct damon_sysfs_schemes), GFP_KERNEL); -} - -static void damon_sysfs_schemes_rm_dirs(struct damon_sysfs_schemes *schemes) -{ - struct damon_sysfs_scheme **schemes_arr = schemes->schemes_arr; - int i; - - for (i = 0; i < schemes->nr; i++) { - damon_sysfs_scheme_rm_dirs(schemes_arr[i]); - kobject_put(&schemes_arr[i]->kobj); - } - schemes->nr = 0; - kfree(schemes_arr); - schemes->schemes_arr = NULL; -} - -static int damon_sysfs_schemes_add_dirs(struct damon_sysfs_schemes *schemes, - int nr_schemes) -{ - struct damon_sysfs_scheme **schemes_arr, *scheme; - int err, i; - - damon_sysfs_schemes_rm_dirs(schemes); - if (!nr_schemes) - return 0; - - schemes_arr = kmalloc_array(nr_schemes, sizeof(*schemes_arr), - GFP_KERNEL | __GFP_NOWARN); - if (!schemes_arr) - return -ENOMEM; - 
schemes->schemes_arr = schemes_arr;
-
- for (i = 0; i < nr_schemes; i++) {
- scheme = damon_sysfs_scheme_alloc(DAMOS_STAT);
- if (!scheme) {
- damon_sysfs_schemes_rm_dirs(schemes);
- return -ENOMEM;
- }
-
- err = kobject_init_and_add(&scheme->kobj,
- &damon_sysfs_scheme_ktype, &schemes->kobj,
- "%d", i);
- if (err)
- goto out;
- err = damon_sysfs_scheme_add_dirs(scheme);
- if (err)
- goto out;
-
- schemes_arr[i] = scheme;
- schemes->nr++;
- }
- return 0;
-
-out:
- damon_sysfs_schemes_rm_dirs(schemes);
- kobject_put(&scheme->kobj);
- return err;
-}
-
-static ssize_t nr_schemes_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
-{
- struct damon_sysfs_schemes *schemes = container_of(kobj,
- struct damon_sysfs_schemes, kobj);
-
- return sysfs_emit(buf, "%d\n", schemes->nr);
-}
-
-static ssize_t nr_schemes_store(struct kobject *kobj,
- struct kobj_attribute *attr, const char *buf, size_t count)
-{
- struct damon_sysfs_schemes *schemes;
- int nr, err = kstrtoint(buf, 0, &nr);
-
- if (err)
- return err;
- if (nr < 0)
- return -EINVAL;
-
- schemes = container_of(kobj, struct damon_sysfs_schemes, kobj);
-
- if (!mutex_trylock(&damon_sysfs_lock))
- return -EBUSY;
- err = damon_sysfs_schemes_add_dirs(schemes, nr);
- mutex_unlock(&damon_sysfs_lock);
- if (err)
- return err;
- return count;
-}
-
-static void damon_sysfs_schemes_release(struct kobject *kobj)
-{
- kfree(container_of(kobj, struct damon_sysfs_schemes, kobj));
-}
-
-static struct kobj_attribute damon_sysfs_schemes_nr_attr =
- __ATTR_RW_MODE(nr_schemes, 0600);
-
-static struct attribute *damon_sysfs_schemes_attrs[] = {
- &damon_sysfs_schemes_nr_attr.attr,
- NULL,
-};
-ATTRIBUTE_GROUPS(damon_sysfs_schemes);
-
-static struct kobj_type damon_sysfs_schemes_ktype = {
- .release = damon_sysfs_schemes_release,
- .sysfs_ops = &kobj_sysfs_ops,
- .default_groups = damon_sysfs_schemes_groups,
-};
-
 /*
 * init region directory
 */
@@ -2133,104 +1190,6 @@ static int damon_sysfs_set_targets(struct damon_ctx *ctx,
 return 0;
 }
-static struct damos *damon_sysfs_mk_scheme(
- struct damon_sysfs_scheme *sysfs_scheme)
-{
- struct damon_sysfs_access_pattern *access_pattern =
- sysfs_scheme->access_pattern;
- struct damon_sysfs_quotas *sysfs_quotas = sysfs_scheme->quotas;
- struct damon_sysfs_weights *sysfs_weights = sysfs_quotas->weights;
- struct damon_sysfs_watermarks *sysfs_wmarks = sysfs_scheme->watermarks;
-
- struct damos_access_pattern pattern = {
- .min_sz_region = access_pattern->sz->min,
- .max_sz_region = access_pattern->sz->max,
- .min_nr_accesses = access_pattern->nr_accesses->min,
- .max_nr_accesses = access_pattern->nr_accesses->max,
- .min_age_region = access_pattern->age->min,
- .max_age_region = access_pattern->age->max,
- };
- struct damos_quota quota = {
- .ms = sysfs_quotas->ms,
- .sz = sysfs_quotas->sz,
- .reset_interval = sysfs_quotas->reset_interval_ms,
- .weight_sz = sysfs_weights->sz,
- .weight_nr_accesses = sysfs_weights->nr_accesses,
- .weight_age = sysfs_weights->age,
- };
- struct damos_watermarks wmarks = {
- .metric = sysfs_wmarks->metric,
- .interval = sysfs_wmarks->interval_us,
- .high = sysfs_wmarks->high,
- .mid = sysfs_wmarks->mid,
- .low = sysfs_wmarks->low,
- };
-
- return damon_new_scheme(&pattern, sysfs_scheme->action, &quota,
- &wmarks);
-}
-
-static void damon_sysfs_update_scheme(struct damos *scheme,
- struct damon_sysfs_scheme *sysfs_scheme)
-{
- struct damon_sysfs_access_pattern *access_pattern =
- sysfs_scheme->access_pattern;
- struct damon_sysfs_quotas *sysfs_quotas = sysfs_scheme->quotas;
- struct
damon_sysfs_weights *sysfs_weights = sysfs_quotas->weights;
- struct damon_sysfs_watermarks *sysfs_wmarks = sysfs_scheme->watermarks;
-
- scheme->pattern.min_sz_region = access_pattern->sz->min;
- scheme->pattern.max_sz_region = access_pattern->sz->max;
- scheme->pattern.min_nr_accesses = access_pattern->nr_accesses->min;
- scheme->pattern.max_nr_accesses = access_pattern->nr_accesses->max;
- scheme->pattern.min_age_region = access_pattern->age->min;
- scheme->pattern.max_age_region = access_pattern->age->max;
-
- scheme->action = sysfs_scheme->action;
-
- scheme->quota.ms = sysfs_quotas->ms;
- scheme->quota.sz = sysfs_quotas->sz;
- scheme->quota.reset_interval = sysfs_quotas->reset_interval_ms;
- scheme->quota.weight_sz = sysfs_weights->sz;
- scheme->quota.weight_nr_accesses = sysfs_weights->nr_accesses;
- scheme->quota.weight_age = sysfs_weights->age;
-
- scheme->wmarks.metric = sysfs_wmarks->metric;
- scheme->wmarks.interval = sysfs_wmarks->interval_us;
- scheme->wmarks.high = sysfs_wmarks->high;
- scheme->wmarks.mid = sysfs_wmarks->mid;
- scheme->wmarks.low = sysfs_wmarks->low;
-}
-
-static int damon_sysfs_set_schemes(struct damon_ctx *ctx,
- struct damon_sysfs_schemes *sysfs_schemes)
-{
- struct damos *scheme, *next;
- int i = 0;
-
- damon_for_each_scheme_safe(scheme, next, ctx) {
- if (i < sysfs_schemes->nr)
- damon_sysfs_update_scheme(scheme,
- sysfs_schemes->schemes_arr[i]);
- else
- damon_destroy_scheme(scheme);
- i++;
- }
-
- for (; i < sysfs_schemes->nr; i++) {
- struct damos *scheme, *next;
-
- scheme = damon_sysfs_mk_scheme(sysfs_schemes->schemes_arr[i]);
- if (!scheme) {
- damon_for_each_scheme_safe(scheme, next, ctx)
- damon_destroy_scheme(scheme);
- return -ENOMEM;
- }
- damon_add_scheme(ctx, scheme);
- }
- return 0;
-}
-
 static void damon_sysfs_before_terminate(struct damon_ctx *ctx)
 {
 struct damon_target *t, *next;
@@ -2246,29 +1205,6 @@ static void damon_sysfs_before_terminate(struct damon_ctx *ctx)
 mutex_unlock(&ctx->kdamond_lock);
 }
-static void damon_sysfs_schemes_update_stats(
- struct damon_sysfs_schemes *sysfs_schemes,
- struct damon_ctx *ctx)
-{
- struct damos *scheme;
- int schemes_idx = 0;
-
- damon_for_each_scheme(scheme, ctx) {
- struct damon_sysfs_stats *sysfs_stats;
-
- /* user could have removed the scheme sysfs dir */
- if (schemes_idx >= sysfs_schemes->nr)
- break;
-
- sysfs_stats = sysfs_schemes->schemes_arr[schemes_idx++]->stats;
- sysfs_stats->nr_tried = scheme->stat.nr_tried;
- sysfs_stats->sz_tried = scheme->stat.sz_tried;
- sysfs_stats->nr_applied = scheme->stat.nr_applied;
- sysfs_stats->sz_applied = scheme->stat.sz_applied;
- sysfs_stats->qt_exceeds = scheme->stat.qt_exceeds;
- }
-}
-
 /*
 * damon_sysfs_upd_schemes_stats() - Update schemes stats sysfs files.
 * @kdamond: The kobject wrapper that associated to the kdamond thread.

From 3c0bc73f6e2d8f625fa571decabb0cc1706a7485 Mon Sep 17 00:00:00 2001
From: SeongJae Park
Date: Wed, 26 Oct 2022 22:59:42 +0000
Subject: [PATCH 028/139] UPSTREAM: mm/damon/modules: deduplicate init steps for DAMON context setup

DAMON_RECLAIM and DAMON_LRU_SORT have duplicated code for DAMON context and target initializations. Deduplicate the part by implementing a function for the initialization in 'modules-common.c' and using it.
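As a rough sketch of the resulting pattern (not part of this patch; 'my_module_init' and 'my_after_aggregation' are hypothetical stand-ins for each module's own init function and callback), a module built on the new helper boils down to:

	static struct damon_ctx *ctx;
	static struct damon_target *target;

	static int __init my_module_init(void)
	{
		/* allocate a physical address space context with one target */
		int err = damon_modules_new_paddr_ctx_target(&ctx, &target);

		if (err)
			return err;
		ctx->callback.after_aggregation = my_after_aggregation;
		return 0;
	}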
Link: https://lkml.kernel.org/r/20221026225943.100429-12-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton (cherry picked from commit 7ae2c17f53d5054d1fe5c1a103ad46068034617d) Bug: 300502883 Change-Id: I3679fce4da1d68f3ba49bb623d1b1b69b0cb6bab Signed-off-by: cui yangpei --- mm/damon/Makefile | 4 ++-- mm/damon/lru_sort.c | 17 +++------------- mm/damon/modules-common.c | 42 +++++++++++++++++++++++++++++++++++++++ mm/damon/modules-common.h | 3 +++ mm/damon/reclaim.c | 17 +++------------- 5 files changed, 53 insertions(+), 30 deletions(-) create mode 100644 mm/damon/modules-common.c diff --git a/mm/damon/Makefile b/mm/damon/Makefile index 1e86f5253d7f..f7add3f4aa79 100644 --- a/mm/damon/Makefile +++ b/mm/damon/Makefile @@ -5,5 +5,5 @@ obj-$(CONFIG_DAMON_VADDR) += ops-common.o vaddr.o obj-$(CONFIG_DAMON_PADDR) += ops-common.o paddr.o obj-$(CONFIG_DAMON_SYSFS) += sysfs-common.o sysfs-schemes.o sysfs.o obj-$(CONFIG_DAMON_DBGFS) += dbgfs.o -obj-$(CONFIG_DAMON_RECLAIM) += reclaim.o -obj-$(CONFIG_DAMON_LRU_SORT) += lru_sort.o +obj-$(CONFIG_DAMON_RECLAIM) += modules-common.o reclaim.o +obj-$(CONFIG_DAMON_LRU_SORT) += modules-common.o lru_sort.o diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index efbc2bda8b9c..a1896c5acfe9 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -314,25 +314,14 @@ static int damon_lru_sort_after_wmarks_check(struct damon_ctx *c) static int __init damon_lru_sort_init(void) { - ctx = damon_new_ctx(); - if (!ctx) - return -ENOMEM; + int err = damon_modules_new_paddr_ctx_target(&ctx, &target); - if (damon_select_ops(ctx, DAMON_OPS_PADDR)) { - damon_destroy_ctx(ctx); - return -EINVAL; - } + if (err) + return err; ctx->callback.after_wmarks_check = damon_lru_sort_after_wmarks_check; ctx->callback.after_aggregation = damon_lru_sort_after_aggregation; - target = damon_new_target(); - if (!target) { - damon_destroy_ctx(ctx); - return -ENOMEM; - } - damon_add_target(ctx, target); - schedule_delayed_work(&damon_lru_sort_timer, 0); damon_lru_sort_initialized = true; diff --git a/mm/damon/modules-common.c b/mm/damon/modules-common.c new file mode 100644 index 000000000000..b2381a8466ec --- /dev/null +++ b/mm/damon/modules-common.c @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Common Primitives for DAMON Modules + * + * Author: SeongJae Park + */ + +#include + +#include "modules-common.h" + +/* + * Allocate, set, and return a DAMON context for the physical address space. 
+ * @ctxp: Pointer to save the point to the newly created context + * @targetp: Pointer to save the point to the newly created target + */ +int damon_modules_new_paddr_ctx_target(struct damon_ctx **ctxp, + struct damon_target **targetp) +{ + struct damon_ctx *ctx; + struct damon_target *target; + + ctx = damon_new_ctx(); + if (!ctx) + return -ENOMEM; + + if (damon_select_ops(ctx, DAMON_OPS_PADDR)) { + damon_destroy_ctx(ctx); + return -EINVAL; + } + + target = damon_new_target(); + if (!target) { + damon_destroy_ctx(ctx); + return -ENOMEM; + } + damon_add_target(ctx, target); + + *ctxp = ctx; + *targetp = target; + return 0; +} diff --git a/mm/damon/modules-common.h b/mm/damon/modules-common.h index 5a4921851d32..f49cdb417005 100644 --- a/mm/damon/modules-common.h +++ b/mm/damon/modules-common.h @@ -44,3 +44,6 @@ 0400); \ module_param_named(nr_##qt_exceed_name, stat.qt_exceeds, ulong, \ 0400); + +int damon_modules_new_paddr_ctx_target(struct damon_ctx **ctxp, + struct damon_target **targetp); diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 162c9b1ca00f..3173f373435c 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -256,25 +256,14 @@ static int damon_reclaim_after_wmarks_check(struct damon_ctx *c) static int __init damon_reclaim_init(void) { - ctx = damon_new_ctx(); - if (!ctx) - return -ENOMEM; + int err = damon_modules_new_paddr_ctx_target(&ctx, &target); - if (damon_select_ops(ctx, DAMON_OPS_PADDR)) { - damon_destroy_ctx(ctx); - return -EINVAL; - } + if (err) + return err; ctx->callback.after_wmarks_check = damon_reclaim_after_wmarks_check; ctx->callback.after_aggregation = damon_reclaim_after_aggregation; - target = damon_new_target(); - if (!target) { - damon_destroy_ctx(ctx); - return -ENOMEM; - } - damon_add_target(ctx, target); - schedule_delayed_work(&damon_reclaim_timer, 0); damon_reclaim_initialized = true; From 4e2d3f8e31d925e3edb1e1d05df2137c1d9e6725 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 26 Oct 2022 22:59:43 +0000 Subject: [PATCH 029/139] UPSTREAM: mm/damon/{reclaim,lru_sort}: remove unnecessarily included headers Some headers that 'reclaim.c' and 'lru_sort.c' are including are unnecessary now owing to previous cleanups and refactorings. Remove those. Link: https://lkml.kernel.org/r/20221026225943.100429-13-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton (cherry picked from commit b0d3dbd1b98660ec2154fccbd21c13916c967c05) Bug: 300502883 Change-Id: I66ffcdfe7276261f5de14d8d794a1dd6b5312caf Signed-off-by: cui yangpei --- mm/damon/lru_sort.c | 2 -- mm/damon/reclaim.c | 2 -- 2 files changed, 4 deletions(-) diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index a1896c5acfe9..5c60163e556c 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -8,9 +8,7 @@ #define pr_fmt(fmt) "damon-lru-sort: " fmt #include -#include #include -#include #include #include "modules-common.h" diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 3173f373435c..e14eb30c01f4 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -8,9 +8,7 @@ #define pr_fmt(fmt) "damon-reclaim: " fmt #include -#include #include -#include #include #include "modules-common.h" From 540e9b850d4abf29dcc30631c2a517111028abed Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 25 Oct 2022 17:36:47 +0000 Subject: [PATCH 030/139] UPSTREAM: mm/damon/reclaim: enable and disable synchronously Patch series "mm/damon/reclaim,lru_sort: enable/disable synchronously". 
Writing a value to DAMON_RECLAIM and DAMON_LRU_SORT's 'enabled' parameters turns on or off DAMON in an asynchronous way. This means the parameter cannot be used to read their current status. The 'kdamond_pid' parameter should be used instead for the purpose. The documentation can easily be read as saying it works in a synchronous way, so it is a little bit confusing. It also makes the user space tooling dirty.

There's no real reason to have the asynchronous behavior, though. Simply make the parameter work synchronously, rather than updating the document.

The first and second patches change the behavior of the 'enabled' parameter for DAMON_RECLAIM and add a selftest for the changed behavior, respectively. The following two patches make the same changes for DAMON_LRU_SORT.

This patch (of 4):

Writing a value to DAMON_RECLAIM's 'enabled' parameter turns on or off DAMON in an asynchronous way. This means the parameter cannot be used to read the current status of DAMON_RECLAIM. The 'kdamond_pid' parameter should be used instead for the purpose. The documentation can easily be read as saying it works in a synchronous way, so it is a little bit confusing. It also makes the user space tooling dirty.

There's no real reason to have the asynchronous behavior, though. Simply make the parameter work synchronously, rather than updating the document.

Link: https://lkml.kernel.org/r/20221025173650.90624-1-sj@kernel.org
Link: https://lkml.kernel.org/r/20221025173650.90624-2-sj@kernel.org
Signed-off-by: SeongJae Park
Cc: Shuah Khan
Signed-off-by: Andrew Morton
(cherry picked from commit 04e98764befa371836a78b2b489e8b931a3a9e9a)
Bug: 300502883
Change-Id: If3ea45d1a07d57a3be47317886b17b61f62d5bcf
Signed-off-by: cui yangpei
---
 mm/damon/reclaim.c | 53 ++++++++++++++++++++--------------------
 1 file changed, 23 insertions(+), 30 deletions(-)

diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c
index e14eb30c01f4..e57604bec06d 100644
--- a/mm/damon/reclaim.c
+++ b/mm/damon/reclaim.c
@@ -9,7 +9,6 @@
 #include
 #include
-#include
 #include "modules-common.h"
@@ -181,38 +180,31 @@ static int damon_reclaim_turn(bool on)
 return 0;
 }
-static struct delayed_work damon_reclaim_timer;
-static void damon_reclaim_timer_fn(struct work_struct *work)
-{
- static bool last_enabled;
- bool now_enabled;
-
- now_enabled = enabled;
- if (last_enabled != now_enabled) {
- if (!damon_reclaim_turn(now_enabled))
- last_enabled = now_enabled;
- else
- enabled = last_enabled;
- }
-}
-static DECLARE_DELAYED_WORK(damon_reclaim_timer, damon_reclaim_timer_fn);
-
-static bool damon_reclaim_initialized;
-
 static int damon_reclaim_enabled_store(const char *val,
 const struct kernel_param *kp)
 {
- int rc = param_set_bool(val, kp);
+ bool is_enabled = enabled;
+ bool enable;
+ int err;
- if (rc < 0)
- return rc;
+ err = strtobool(val, &enable);
+ if (err)
+ return err;
- /* system_wq might not initialized yet */
- if (!damon_reclaim_initialized)
- return rc;
+ if (is_enabled == enable)
+ return 0;
- schedule_delayed_work(&damon_reclaim_timer, 0);
- return 0;
+ /* Called before init function. The function will handle this.
*/
+ if (!ctx)
+ goto set_param_out;
+
+ err = damon_reclaim_turn(enable);
+ if (err)
+ return err;
+
+set_param_out:
+ enabled = enable;
+ return err;
 }
 static const struct kernel_param_ops enabled_param_ops = {
@@ -262,10 +254,11 @@ static int __init damon_reclaim_init(void)
 ctx->callback.after_wmarks_check = damon_reclaim_after_wmarks_check;
 ctx->callback.after_aggregation = damon_reclaim_after_aggregation;
- schedule_delayed_work(&damon_reclaim_timer, 0);
+ /* 'enabled' has set before this function, probably via command line */
+ if (enabled)
+ err = damon_reclaim_turn(true);
- damon_reclaim_initialized = true;
- return 0;
+ return err;
 }
 module_init(damon_reclaim_init);

From 6547a97f321cb1b08e8b030f720f736adc12b6fa Mon Sep 17 00:00:00 2001
From: SeongJae Park
Date: Tue, 25 Oct 2022 17:36:49 +0000
Subject: [PATCH 031/139] UPSTREAM: mm/damon/lru_sort: enable and disable synchronously

Writing a value to DAMON_RECLAIM's 'enabled' parameter turns on or off DAMON in an asynchronous way. This means the parameter cannot be used to read the current status of DAMON_RECLAIM. The 'kdamond_pid' parameter should be used instead for the purpose. The documentation can easily be read as saying it works in a synchronous way, so it is a little bit confusing. It also makes the user space tooling dirty.

There's no real reason to have the asynchronous behavior, though. Simply make the parameter work synchronously, rather than updating the document.

Link: https://lkml.kernel.org/r/20221025173650.90624-4-sj@kernel.org
Signed-off-by: SeongJae Park
Cc: Shuah Khan
Signed-off-by: Andrew Morton
(cherry picked from commit 7a034fbba3361e94956431d17660d7c5674d13c3)
Bug: 300502883
Change-Id: Iaabcbb45e4fe5dfe6781407b23d82657988c53d8
Signed-off-by: cui yangpei
---
 mm/damon/lru_sort.c | 51 +++++++++++++++++++--------------------
 1 file changed, 22 insertions(+), 29 deletions(-)

diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c
index 5c60163e556c..2a532e3983df 100644
--- a/mm/damon/lru_sort.c
+++ b/mm/damon/lru_sort.c
@@ -9,7 +9,6 @@
 #include
 #include
-#include
 #include "modules-common.h"
@@ -235,38 +234,31 @@ static int damon_lru_sort_turn(bool on)
 return 0;
 }
-static struct delayed_work damon_lru_sort_timer;
-static void damon_lru_sort_timer_fn(struct work_struct *work)
-{
- static bool last_enabled;
- bool now_enabled;
-
- now_enabled = enabled;
- if (last_enabled != now_enabled) {
- if (!damon_lru_sort_turn(now_enabled))
- last_enabled = now_enabled;
- else
- enabled = last_enabled;
- }
-}
-static DECLARE_DELAYED_WORK(damon_lru_sort_timer, damon_lru_sort_timer_fn);
-
-static bool damon_lru_sort_initialized;
-
 static int damon_lru_sort_enabled_store(const char *val,
 const struct kernel_param *kp)
 {
- int rc = param_set_bool(val, kp);
+ bool is_enabled = enabled;
+ bool enable;
+ int err;
- if (rc < 0)
- return rc;
+ err = strtobool(val, &enable);
+ if (err)
+ return err;
- if (!damon_lru_sort_initialized)
- return rc;
+ if (is_enabled == enable)
+ return 0;
- schedule_delayed_work(&damon_lru_sort_timer, 0);
+ /* Called before init function. The function will handle this.
*/
+ if (!ctx)
+ goto set_param_out;
- return 0;
+
+ err = damon_lru_sort_turn(enable);
+ if (err)
+ return err;
+
+set_param_out:
+ enabled = enable;
+ return err;
 }
 static const struct kernel_param_ops enabled_param_ops = {
@@ -320,10 +312,11 @@ static int __init damon_lru_sort_init(void)
 ctx->callback.after_wmarks_check = damon_lru_sort_after_wmarks_check;
 ctx->callback.after_aggregation = damon_lru_sort_after_aggregation;
- schedule_delayed_work(&damon_lru_sort_timer, 0);
+ /* 'enabled' has set before this function, probably via command line */
+ if (enabled)
+ err = damon_lru_sort_turn(true);
- damon_lru_sort_initialized = true;
- return 0;
+ return err;
 }
 module_init(damon_lru_sort_init);

From b5d1f3576b71eda9569c41a3dae8d9538ab27444 Mon Sep 17 00:00:00 2001
From: SeongJae Park
Date: Tue, 1 Nov 2022 22:03:21 +0000
Subject: [PATCH 032/139] UPSTREAM: mm/damon/core: add a callback for scheme target regions check

Patch series "efficiently expose damos action tried regions information".

DAMON users can retrieve the monitoring results via 'after_aggregation' callbacks if the user is using the kernel API, or the 'damon_aggregated' tracepoint if the user is in the user space. Those are useful if full monitoring results are necessary. However, if the user has interest in only a snapshot of the results for some regions having a specific access pattern, the interfaces could be inefficient. For example, some users only want to know which memory regions are not accessed for more than a specific time at the moment.

Also, some DAMOS users would want to know exactly to which memory regions the schemes' actions were tried to be applied, for debugging or tuning. As DAMOS has its internal mechanism for quota and regions prioritization, the users would need to simulate DAMOS' mechanism against the monitoring results. That's unnecessarily complex.

This patchset implements DAMON kernel API callbacks and a sysfs directory for efficient exposure of the information for the use cases. The new callback will be called for each region when a DAMOS action is about to be applied to it. The sysfs directory will be called 'tried_regions' and placed under each scheme sysfs directory. Users can write a special keyword, 'update_schemes_regions', to the 'state' file of a kdamond sysfs directory. Then, the DAMON sysfs interface will fill the directory with the information of the regions that the corresponding scheme's action was tried to be applied to, for the next aggregation interval.

Patches Sequence
----------------

The first one (patch 1) implements the callback for the kernel space users. The following two patches (patches 2 and 3) implement sysfs directories for the information and its sub directories. Two patches (patches 4 and 5) implementing the special keywords for filling the data into, and cleaning up, the directories follow. Patch 6 adds a selftest for the new sysfs directory. Finally, two patches (patches 7 and 8) document the new feature in the administrator guide and the ABI document.

This patch (of 8):

Getting DAMON monitoring results of only a specific access pattern (e.g., getting address ranges of memory that is not accessed at all for two minutes) can be useful for efficient monitoring of the system. The information can also be helpful for deep level investigation of DAMON-based operation schemes. For that, users need to record (in case of the user space users) or iterate (in case of the kernel space users) the full monitoring results and filter them for the specific access pattern.
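For kernel space users, that iteration amounts to something like the sketch below ('LONG_ENOUGH_AGGRS' is an assumed, user-defined age threshold in aggregation intervals, not part of this patch):

	static int my_after_aggregation(struct damon_ctx *c)
	{
		struct damon_target *t;
		struct damon_region *r;

		damon_for_each_target(t, c) {
			damon_for_each_region(r, t) {
				/* keep only regions not accessed for long enough */
				if (r->nr_accesses == 0 &&
						r->age >= LONG_ENOUGH_AGGRS)
					pr_info("%lu-%lu\n",
							r->ar.start, r->ar.end);
			}
		}
		return 0;
	}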
In the case of DAMOS investigation, users will even need to simulate DAMOS' quota and prioritization mechanisms. That is inefficient and complex. Add a new DAMON callback that will be called before each scheme is applied to each region. Using it, DAMON kernel API users will be able to do query-like monitoring results collection, or DAMOS investigation, in an efficient and simple way. Commits for providing the capability to user space users will follow. Link: https://lkml.kernel.org/r/20221101220328.95765-1-sj@kernel.org Link: https://lkml.kernel.org/r/20221101220328.95765-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton (cherry picked from commit 44467bbb7e81ebcef2a5bfc9d6546bf7cd015374) Bug: 300502883 Change-Id: I21ff3c9cf6c30e113f78883e5063bcb898506b41 Signed-off-by: cui yangpei --- include/linux/damon.h | 5 +++++ mm/damon/core.c | 6 +++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 620ada094c3b..35630634d790 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -357,6 +357,7 @@ struct damon_operations { * @after_wmarks_check: Called after each schemes' watermarks check. * @after_sampling: Called after each sampling. * @after_aggregation: Called after each aggregation. + * @before_damos_apply: Called before applying DAMOS action. * @before_terminate: Called before terminating the monitoring. * @private: User private data. * @@ -385,6 +386,10 @@ struct damon_callback { int (*after_wmarks_check)(struct damon_ctx *context); int (*after_sampling)(struct damon_ctx *context); int (*after_aggregation)(struct damon_ctx *context); + int (*before_damos_apply)(struct damon_ctx *context, + struct damon_target *target, + struct damon_region *region, + struct damos *scheme); void (*before_terminate)(struct damon_ctx *context); }; diff --git a/mm/damon/core.c b/mm/damon/core.c index 80d5937fe337..ceec75b88ef9 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -772,6 +772,7 @@ static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t, unsigned long sz = damon_sz_region(r); struct timespec64 begin, end; unsigned long sz_applied = 0; + int err = 0; if (c->ops.apply_scheme) { if (quota->esz && quota->charged_sz + sz > quota->esz) { @@ -782,7 +783,10 @@ static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t, damon_split_region_at(t, r, sz); } ktime_get_coarse_ts64(&begin); - sz_applied = c->ops.apply_scheme(c, t, r, s); + if (c->callback.before_damos_apply) + err = c->callback.before_damos_apply(c, t, r, s); + if (!err) + sz_applied = c->ops.apply_scheme(c, t, r, s); ktime_get_coarse_ts64(&end); quota->total_charged_ns += timespec64_to_ns(&end) - timespec64_to_ns(&begin); From b4c34cc1689700c49e0ce371b13c695981f66d5c Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 1 Nov 2022 22:03:22 +0000 Subject: [PATCH 033/139] UPSTREAM: mm/damon/sysfs-schemes: implement schemes/tried_regions directory For efficient and simple query-like DAMON monitoring results reading and deep level investigations of DAMOS, DAMON kernel API (include/linux/damon.h) users can use the 'before_damos_apply' DAMON callback. However, DAMON sysfs interface users don't have such an option. Add a directory, namely 'tried_regions', under each scheme directory, to use as the interface for the purpose. Note that this commit implements only the directory, not the data filling.
After the data filling change is made, users will be able to signal DAMON to fill the directory with the regions that corresponding scheme has tried to be applied. By setting the access pattern of the scheme, users could do the efficient query-like monitoring. Link: https://lkml.kernel.org/r/20221101220328.95765-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton (cherry picked from commit 5181b75f438d2e5b7f27bf48c6ea88a87c2882b7) Bug: 300502883 Change-Id: Idc7a1fca201b90f8fea62899f1e6b500bb8e14e1 Signed-off-by: cui yangpei --- mm/damon/sysfs-schemes.c | 57 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 9509d5c1e7fc..500759d8b20c 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -9,6 +9,36 @@ #include "sysfs-common.h" +/* + * scheme regions directory + */ + +struct damon_sysfs_scheme_regions { + struct kobject kobj; +}; + +static struct damon_sysfs_scheme_regions * +damon_sysfs_scheme_regions_alloc(void) +{ + return kzalloc(sizeof(struct damon_sysfs_scheme_regions), GFP_KERNEL); +} + +static void damon_sysfs_scheme_regions_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_scheme_regions, kobj)); +} + +static struct attribute *damon_sysfs_scheme_regions_attrs[] = { + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_scheme_regions); + +static struct kobj_type damon_sysfs_scheme_regions_ktype = { + .release = damon_sysfs_scheme_regions_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_scheme_regions_groups, +}; + /* * schemes/stats directory */ @@ -635,6 +665,7 @@ struct damon_sysfs_scheme { struct damon_sysfs_quotas *quotas; struct damon_sysfs_watermarks *watermarks; struct damon_sysfs_stats *stats; + struct damon_sysfs_scheme_regions *tried_regions; }; /* This should match with enum damos_action */ @@ -743,6 +774,25 @@ static int damon_sysfs_scheme_set_stats(struct damon_sysfs_scheme *scheme) return err; } +static int damon_sysfs_scheme_set_tried_regions( + struct damon_sysfs_scheme *scheme) +{ + struct damon_sysfs_scheme_regions *tried_regions = + damon_sysfs_scheme_regions_alloc(); + int err; + + if (!tried_regions) + return -ENOMEM; + err = kobject_init_and_add(&tried_regions->kobj, + &damon_sysfs_scheme_regions_ktype, &scheme->kobj, + "tried_regions"); + if (err) + kobject_put(&tried_regions->kobj); + else + scheme->tried_regions = tried_regions; + return err; +} + static int damon_sysfs_scheme_add_dirs(struct damon_sysfs_scheme *scheme) { int err; @@ -759,8 +809,14 @@ static int damon_sysfs_scheme_add_dirs(struct damon_sysfs_scheme *scheme) err = damon_sysfs_scheme_set_stats(scheme); if (err) goto put_watermarks_quotas_access_pattern_out; + err = damon_sysfs_scheme_set_tried_regions(scheme); + if (err) + goto put_tried_regions_out; return 0; +put_tried_regions_out: + kobject_put(&scheme->tried_regions->kobj); + scheme->tried_regions = NULL; put_watermarks_quotas_access_pattern_out: kobject_put(&scheme->watermarks->kobj); scheme->watermarks = NULL; @@ -781,6 +837,7 @@ static void damon_sysfs_scheme_rm_dirs(struct damon_sysfs_scheme *scheme) kobject_put(&scheme->quotas->kobj); kobject_put(&scheme->watermarks->kobj); kobject_put(&scheme->stats->kobj); + kobject_put(&scheme->tried_regions->kobj); } static ssize_t action_show(struct kobject *kobj, struct kobj_attribute *attr, From 3421250b35221bad4e65440ac2be0831767c5ba6 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: 
Tue, 1 Nov 2022 22:03:23 +0000 Subject: [PATCH 034/139] UPSTREAM: mm/damon/sysfs-schemes: implement scheme region directory Implement region directories under 'tried_regions' directory of each scheme DAMON sysfs directory. This directory will provide the address range, the monitored access frequency ('nr_accesses'), and the age of each DAMON region that corresponding DAMON-based operation scheme has tried to be applied. Note that this commit doesn't implement the code for filling the data but only the sysfs directory. Link: https://lkml.kernel.org/r/20221101220328.95765-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton (cherry picked from commit 9277d0367ba18ef4bb98bafb1209e715844cdf7e) Bug: 300502883 Change-Id: I69c9010a8fce2fa61a1d27f2964ac7bc7b85dd44 Signed-off-by: cui yangpei --- mm/damon/sysfs-schemes.c | 123 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 122 insertions(+), 1 deletion(-) diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 500759d8b20c..f0b5ad7e721d 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -9,18 +9,138 @@ #include "sysfs-common.h" +/* + * scheme region directory + */ + +struct damon_sysfs_scheme_region { + struct kobject kobj; + struct damon_addr_range ar; + unsigned int nr_accesses; + unsigned int age; + struct list_head list; +}; + +static struct damon_sysfs_scheme_region *damon_sysfs_scheme_region_alloc( + struct damon_region *region) +{ + struct damon_sysfs_scheme_region *sysfs_region = kmalloc( + sizeof(*sysfs_region), GFP_KERNEL); + + if (!sysfs_region) + return NULL; + sysfs_region->kobj = (struct kobject){}; + sysfs_region->ar = region->ar; + sysfs_region->nr_accesses = region->nr_accesses; + sysfs_region->age = region->age; + INIT_LIST_HEAD(&sysfs_region->list); + return sysfs_region; +} + +static ssize_t start_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_scheme_region *region = container_of(kobj, + struct damon_sysfs_scheme_region, kobj); + + return sysfs_emit(buf, "%lu\n", region->ar.start); +} + +static ssize_t end_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_scheme_region *region = container_of(kobj, + struct damon_sysfs_scheme_region, kobj); + + return sysfs_emit(buf, "%lu\n", region->ar.end); +} + +static ssize_t nr_accesses_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_scheme_region *region = container_of(kobj, + struct damon_sysfs_scheme_region, kobj); + + return sysfs_emit(buf, "%u\n", region->nr_accesses); +} + +static ssize_t age_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_scheme_region *region = container_of(kobj, + struct damon_sysfs_scheme_region, kobj); + + return sysfs_emit(buf, "%u\n", region->age); +} + +static void damon_sysfs_scheme_region_release(struct kobject *kobj) +{ + struct damon_sysfs_scheme_region *region = container_of(kobj, + struct damon_sysfs_scheme_region, kobj); + + list_del(®ion->list); + kfree(region); +} + +static struct kobj_attribute damon_sysfs_scheme_region_start_attr = + __ATTR_RO_MODE(start, 0400); + +static struct kobj_attribute damon_sysfs_scheme_region_end_attr = + __ATTR_RO_MODE(end, 0400); + +static struct kobj_attribute damon_sysfs_scheme_region_nr_accesses_attr = + __ATTR_RO_MODE(nr_accesses, 0400); + +static struct kobj_attribute damon_sysfs_scheme_region_age_attr = + __ATTR_RO_MODE(age, 0400); + +static 
struct attribute *damon_sysfs_scheme_region_attrs[] = { + &damon_sysfs_scheme_region_start_attr.attr, + &damon_sysfs_scheme_region_end_attr.attr, + &damon_sysfs_scheme_region_nr_accesses_attr.attr, + &damon_sysfs_scheme_region_age_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_scheme_region); + +static struct kobj_type damon_sysfs_scheme_region_ktype = { + .release = damon_sysfs_scheme_region_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_scheme_region_groups, +}; + /* * scheme regions directory */ struct damon_sysfs_scheme_regions { struct kobject kobj; + struct list_head regions_list; + int nr_regions; }; static struct damon_sysfs_scheme_regions * damon_sysfs_scheme_regions_alloc(void) { - return kzalloc(sizeof(struct damon_sysfs_scheme_regions), GFP_KERNEL); + struct damon_sysfs_scheme_regions *regions = kmalloc(sizeof(*regions), + GFP_KERNEL); + + regions->kobj = (struct kobject){}; + INIT_LIST_HEAD(®ions->regions_list); + regions->nr_regions = 0; + return regions; +} + +static void damon_sysfs_scheme_regions_rm_dirs( + struct damon_sysfs_scheme_regions *regions) +{ + struct damon_sysfs_scheme_region *r, *next; + + list_for_each_entry_safe(r, next, ®ions->regions_list, list) { + /* release function deletes it from the list */ + kobject_put(&r->kobj); + regions->nr_regions--; + } } static void damon_sysfs_scheme_regions_release(struct kobject *kobj) @@ -837,6 +957,7 @@ static void damon_sysfs_scheme_rm_dirs(struct damon_sysfs_scheme *scheme) kobject_put(&scheme->quotas->kobj); kobject_put(&scheme->watermarks->kobj); kobject_put(&scheme->stats->kobj); + damon_sysfs_scheme_regions_rm_dirs(scheme->tried_regions); kobject_put(&scheme->tried_regions->kobj); } From 80ccab9b0e4e326132976d42b33ef57bb85d486e Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 1 Nov 2022 22:03:24 +0000 Subject: [PATCH 035/139] UPSTREAM: mm/damon/sysfs: implement DAMOS tried regions update command Implement the code for filling the data of 'tried_regions' DAMON sysfs directory. With this commit, DAMON sysfs interface users can write a special keyword, 'update_schemes_tried_regions' to the corresponding 'state' file of the kdamond. Then, DAMON sysfs interface will collect the tried regions information using the 'before_damos_apply()' callback for one aggregation interval and populate scheme region directories with the values. 
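As a rough sketch of how such a keyword write is dispatched (an assumed shape based on the command string table added by this series; the real handler additionally records the request and waits for the kdamond callback to service it):

/* sketch: map a string written to 'state' to a DAMON sysfs command */
static enum damon_sysfs_cmd example_parse_cmd(const char *buf)
{
	enum damon_sysfs_cmd cmd;

	for (cmd = 0; cmd < NR_DAMON_SYSFS_CMDS; cmd++) {
		if (sysfs_streq(buf, damon_sysfs_cmd_strs[cmd]))
			return cmd;
	}
	return NR_DAMON_SYSFS_CMDS;	/* no such command */
}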
[sj@kernel.org: skip tried regions update if the scheme directory was removed] Link: https://lkml.kernel.org/r/20221114182954.4745-2-sj@kernel.org Link: https://lkml.kernel.org/r/20221101220328.95765-5-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton (cherry picked from commit f1d13cacabe140305844879e495ca67837e059cc) Bug: 300502883 Change-Id: I6749b8dc75023a9a3f3dc64902196b07fa523267 Signed-off-by: cui yangpei --- mm/damon/sysfs-common.h | 6 +++ mm/damon/sysfs-schemes.c | 80 ++++++++++++++++++++++++++++++++++++++++ mm/damon/sysfs.c | 57 +++++++++++++++++++++++++++- 3 files changed, 141 insertions(+), 2 deletions(-) diff --git a/mm/damon/sysfs-common.h b/mm/damon/sysfs-common.h index 4626b2784404..634a6e7fca78 100644 --- a/mm/damon/sysfs-common.h +++ b/mm/damon/sysfs-common.h @@ -44,3 +44,9 @@ int damon_sysfs_set_schemes(struct damon_ctx *ctx, void damon_sysfs_schemes_update_stats( struct damon_sysfs_schemes *sysfs_schemes, struct damon_ctx *ctx); + +int damon_sysfs_schemes_update_regions_start( + struct damon_sysfs_schemes *sysfs_schemes, + struct damon_ctx *ctx); + +int damon_sysfs_schemes_update_regions_stop(struct damon_ctx *ctx); diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index f0b5ad7e721d..5f14f18bcc49 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -1244,3 +1244,83 @@ void damon_sysfs_schemes_update_stats( sysfs_stats->qt_exceeds = scheme->stat.qt_exceeds; } } + +/* + * damon_sysfs_schemes that need to update its schemes regions dir. Protected + * by damon_sysfs_lock + */ +static struct damon_sysfs_schemes *damon_sysfs_schemes_for_damos_callback; +static int damon_sysfs_schemes_region_idx; + +/* + * DAMON callback that called before damos apply. While this callback is + * registered, damon_sysfs_lock should be held to ensure the regions + * directories exist. 
+ */ +static int damon_sysfs_before_damos_apply(struct damon_ctx *ctx, + struct damon_target *t, struct damon_region *r, + struct damos *s) +{ + struct damos *scheme; + struct damon_sysfs_scheme_regions *sysfs_regions; + struct damon_sysfs_scheme_region *region; + struct damon_sysfs_schemes *sysfs_schemes = + damon_sysfs_schemes_for_damos_callback; + int schemes_idx = 0; + + damon_for_each_scheme(scheme, ctx) { + if (scheme == s) + break; + schemes_idx++; + } + + /* user could have removed the scheme sysfs dir */ + if (schemes_idx >= sysfs_schemes->nr) + return 0; + + sysfs_regions = sysfs_schemes->schemes_arr[schemes_idx]->tried_regions; + region = damon_sysfs_scheme_region_alloc(r); + list_add_tail(®ion->list, &sysfs_regions->regions_list); + sysfs_regions->nr_regions++; + if (kobject_init_and_add(®ion->kobj, + &damon_sysfs_scheme_region_ktype, + &sysfs_regions->kobj, "%d", + damon_sysfs_schemes_region_idx++)) { + kobject_put(®ion->kobj); + } + return 0; +} + +/* Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock */ +int damon_sysfs_schemes_update_regions_start( + struct damon_sysfs_schemes *sysfs_schemes, + struct damon_ctx *ctx) +{ + struct damos *scheme; + int schemes_idx = 0; + + damon_for_each_scheme(scheme, ctx) { + struct damon_sysfs_scheme *sysfs_scheme; + + sysfs_scheme = sysfs_schemes->schemes_arr[schemes_idx++]; + damon_sysfs_scheme_regions_rm_dirs( + sysfs_scheme->tried_regions); + } + + damon_sysfs_schemes_for_damos_callback = sysfs_schemes; + ctx->callback.before_damos_apply = damon_sysfs_before_damos_apply; + return 0; +} + +/* + * Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock. Caller + * should unlock damon_sysfs_lock which held before + * damon_sysfs_schemes_update_regions_start() + */ +int damon_sysfs_schemes_update_regions_stop(struct damon_ctx *ctx) +{ + damon_sysfs_schemes_for_damos_callback = NULL; + ctx->callback.before_damos_apply = NULL; + damon_sysfs_schemes_region_idx = 0; + return 0; +} diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 284daf274b3e..ffb5a84059d7 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -999,6 +999,11 @@ enum damon_sysfs_cmd { * files. */ DAMON_SYSFS_CMD_UPDATE_SCHEMES_STATS, + /* + * @DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS: Update schemes tried + * regions + */ + DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS, /* * @NR_DAMON_SYSFS_CMDS: Total number of DAMON sysfs commands. 
*/ @@ -1011,6 +1016,7 @@ static const char * const damon_sysfs_cmd_strs[] = { "off", "commit", "update_schemes_stats", + "update_schemes_tried_regions", }; /* @@ -1193,6 +1199,16 @@ static int damon_sysfs_set_targets(struct damon_ctx *ctx, static void damon_sysfs_before_terminate(struct damon_ctx *ctx) { struct damon_target *t, *next; + struct damon_sysfs_kdamond *kdamond; + + /* damon_sysfs_schemes_update_regions_stop() might not yet called */ + kdamond = damon_sysfs_cmd_request.kdamond; + if (kdamond && damon_sysfs_cmd_request.cmd == + DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS && + ctx == kdamond->damon_ctx) { + damon_sysfs_schemes_update_regions_stop(ctx); + mutex_unlock(&damon_sysfs_lock); + } if (!damon_target_has_pid(ctx)) return; @@ -1225,6 +1241,27 @@ static int damon_sysfs_upd_schemes_stats(struct damon_sysfs_kdamond *kdamond) return 0; } +static int damon_sysfs_upd_schemes_regions_start( + struct damon_sysfs_kdamond *kdamond) +{ + struct damon_ctx *ctx = kdamond->damon_ctx; + + if (!ctx) + return -EINVAL; + return damon_sysfs_schemes_update_regions_start( + kdamond->contexts->contexts_arr[0]->schemes, ctx); +} + +static int damon_sysfs_upd_schemes_regions_stop( + struct damon_sysfs_kdamond *kdamond) +{ + struct damon_ctx *ctx = kdamond->damon_ctx; + + if (!ctx) + return -EINVAL; + return damon_sysfs_schemes_update_regions_stop(ctx); +} + static inline bool damon_sysfs_kdamond_running( struct damon_sysfs_kdamond *kdamond) { @@ -1277,10 +1314,12 @@ static int damon_sysfs_commit_input(struct damon_sysfs_kdamond *kdamond) static int damon_sysfs_cmd_request_callback(struct damon_ctx *c) { struct damon_sysfs_kdamond *kdamond; + static bool damon_sysfs_schemes_regions_updating; int err = 0; /* avoid deadlock due to concurrent state_store('off') */ - if (!mutex_trylock(&damon_sysfs_lock)) + if (!damon_sysfs_schemes_regions_updating && + !mutex_trylock(&damon_sysfs_lock)) return 0; kdamond = damon_sysfs_cmd_request.kdamond; if (!kdamond || kdamond->damon_ctx != c) @@ -1292,13 +1331,27 @@ static int damon_sysfs_cmd_request_callback(struct damon_ctx *c) case DAMON_SYSFS_CMD_COMMIT: err = damon_sysfs_commit_input(kdamond); break; + case DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS: + if (!damon_sysfs_schemes_regions_updating) { + err = damon_sysfs_upd_schemes_regions_start(kdamond); + if (!err) { + damon_sysfs_schemes_regions_updating = true; + goto keep_lock_out; + } + } else { + err = damon_sysfs_upd_schemes_regions_stop(kdamond); + damon_sysfs_schemes_regions_updating = false; + } + break; default: break; } /* Mark the request as invalid now. */ damon_sysfs_cmd_request.kdamond = NULL; out: - mutex_unlock(&damon_sysfs_lock); + if (!damon_sysfs_schemes_regions_updating) + mutex_unlock(&damon_sysfs_lock); +keep_lock_out: return err; } From 5bf7b568603c3fc8e41280bd591a2ef291ca1618 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 1 Nov 2022 22:03:25 +0000 Subject: [PATCH 036/139] UPSTREAM: mm/damon/sysfs-schemes: implement DAMOS-tried regions clear command When there are huge number of DAMON regions that specific scheme actions are tried to be applied, directories and files under 'tried_regions' scheme directory could waste some memory. Add another special input keyword ('clear_schemes_tried_regions') for 'state' file of each kdamond sysfs directory that can be used for cleanup of the 'tried_regions' sub-directories. 
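For example, a user space tool that requests a snapshot with 'update_schemes_tried_regions', reads the per-region files it needs, and will not revisit them can immediately write 'clear_schemes_tried_regions' to the same 'state' file, so the kernel frees the region directories right away instead of holding them until the next update request or kdamond termination.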
[sj@kernel.org: skip regions clearing if the scheme directory was removed] Link: https://lkml.kernel.org/r/20221114182954.4745-3-sj@kernel.org Link: https://lkml.kernel.org/r/20221101220328.95765-6-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton (cherry picked from commit 772c15e5adcb32a42dbbcdb905ec49f662312976) Bug: 300502883 Change-Id: I969e05ce1fa4599bae50454633b61b5320eaa67d Signed-off-by: cui yangpei --- mm/damon/sysfs-common.h | 4 ++++ mm/damon/sysfs-schemes.c | 14 +++++++++++++- mm/damon/sysfs.c | 20 ++++++++++++++++++++ 3 files changed, 37 insertions(+), 1 deletion(-) diff --git a/mm/damon/sysfs-common.h b/mm/damon/sysfs-common.h index 634a6e7fca78..604a6cbc3ede 100644 --- a/mm/damon/sysfs-common.h +++ b/mm/damon/sysfs-common.h @@ -50,3 +50,7 @@ int damon_sysfs_schemes_update_regions_start( struct damon_ctx *ctx); int damon_sysfs_schemes_update_regions_stop(struct damon_ctx *ctx); + +int damon_sysfs_schemes_clear_regions( + struct damon_sysfs_schemes *sysfs_schemes, + struct damon_ctx *ctx); diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 5f14f18bcc49..81fc4d27f4e4 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -1292,7 +1292,7 @@ static int damon_sysfs_before_damos_apply(struct damon_ctx *ctx, } /* Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock */ -int damon_sysfs_schemes_update_regions_start( +int damon_sysfs_schemes_clear_regions( struct damon_sysfs_schemes *sysfs_schemes, struct damon_ctx *ctx) { @@ -1302,11 +1302,23 @@ int damon_sysfs_schemes_update_regions_start( damon_for_each_scheme(scheme, ctx) { struct damon_sysfs_scheme *sysfs_scheme; + /* user could have removed the scheme sysfs dir */ + if (schemes_idx >= sysfs_schemes->nr) + break; + sysfs_scheme = sysfs_schemes->schemes_arr[schemes_idx++]; damon_sysfs_scheme_regions_rm_dirs( sysfs_scheme->tried_regions); } + return 0; +} +/* Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock */ +int damon_sysfs_schemes_update_regions_start( + struct damon_sysfs_schemes *sysfs_schemes, + struct damon_ctx *ctx) +{ + damon_sysfs_schemes_clear_regions(sysfs_schemes, ctx); damon_sysfs_schemes_for_damos_callback = sysfs_schemes; ctx->callback.before_damos_apply = damon_sysfs_before_damos_apply; return 0; diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index ffb5a84059d7..aeb0beb1da91 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1004,6 +1004,11 @@ enum damon_sysfs_cmd { * regions */ DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS, + /* + * @DAMON_SYSFS_CMD_CLEAR_SCHEMES_TRIED_REGIONS: Clear schemes tried + * regions + */ + DAMON_SYSFS_CMD_CLEAR_SCHEMES_TRIED_REGIONS, /* * @NR_DAMON_SYSFS_CMDS: Total number of DAMON sysfs commands. 
*/ @@ -1017,6 +1022,7 @@ static const char * const damon_sysfs_cmd_strs[] = { "commit", "update_schemes_stats", "update_schemes_tried_regions", + "clear_schemes_tried_regions", }; /* @@ -1262,6 +1268,17 @@ static int damon_sysfs_upd_schemes_regions_stop( return damon_sysfs_schemes_update_regions_stop(ctx); } +static int damon_sysfs_clear_schemes_regions( + struct damon_sysfs_kdamond *kdamond) +{ + struct damon_ctx *ctx = kdamond->damon_ctx; + + if (!ctx) + return -EINVAL; + return damon_sysfs_schemes_clear_regions( + kdamond->contexts->contexts_arr[0]->schemes, ctx); +} + static inline bool damon_sysfs_kdamond_running( struct damon_sysfs_kdamond *kdamond) { @@ -1343,6 +1360,9 @@ static int damon_sysfs_cmd_request_callback(struct damon_ctx *c) damon_sysfs_schemes_regions_updating = false; } break; + case DAMON_SYSFS_CMD_CLEAR_SCHEMES_TRIED_REGIONS: + err = damon_sysfs_clear_schemes_regions(kdamond); + break; default: break; } From 3ca21ef5fa5a146c51584a3c28884dc078a194b4 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Tue, 1 Nov 2022 22:14:08 +0100 Subject: [PATCH 037/139] UPSTREAM: mm/damon: use kstrtobool() instead of strtobool() strtobool() is the same as kstrtobool(). However, the latter is more widely used within the kernel. In order to remove strtobool() and slightly simplify kstrtox.h, switch to the other function name. While at it, include the corresponding header file (<linux/kstrtox.h>). Link: https://lkml.kernel.org/r/ed2b46489a513988688decb53850339cc228940c.1667336095.git.christophe.jaillet@wanadoo.fr Signed-off-by: Christophe JAILLET Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton (cherry picked from commit e6aff38b2e25e934e95471351c96d1410bb17561) Bug: 300502883 Change-Id: I21df914f9ba754921bdc00d8e9a33e77b2606360 Signed-off-by: cui yangpei --- mm/damon/lru_sort.c | 3 ++- mm/damon/reclaim.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 2a532e3983df..7b8fce2f67a8 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -8,6 +8,7 @@ #define pr_fmt(fmt) "damon-lru-sort: " fmt #include +#include <linux/kstrtox.h> #include #include "modules-common.h" @@ -241,7 +242,7 @@ static int damon_lru_sort_enabled_store(const char *val, bool enable; int err; - err = strtobool(val, &enable); + err = kstrtobool(val, &enable); if (err) return err; diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index e57604bec06d..e82631f39481 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -8,6 +8,7 @@ #define pr_fmt(fmt) "damon-reclaim: " fmt #include +#include <linux/kstrtox.h> #include #include "modules-common.h" @@ -187,7 +188,7 @@ static int damon_reclaim_enabled_store(const char *val, bool enable; int err; - err = strtobool(val, &enable); + err = kstrtobool(val, &enable); if (err) return err; From ea215c9a1040c12446c3b32887255680fb719fe4 Mon Sep 17 00:00:00 2001 From: Huaisheng Ye Date: Mon, 16 Jan 2023 14:23:47 +0800 Subject: [PATCH 038/139] UPSTREAM: mm/damon/core: skip apply schemes if empty Sometimes there is no scheme in DAMON's context; for example, one may just use 'damo record' to monitor a workload's data access pattern. If the current DAMON context doesn't have any scheme in the list, kdamond need not iterate over the list of all targets and regions only to do nothing. So, skip applying schemes when ctx->schemes is empty; a sketch of the walk being skipped follows.
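The sketch below shows the traversal the new check avoids, assuming the usual shape of kdamond_apply_schemes(); it is illustrative only, and names other than the damon_for_each_* helpers are not from this patch:

/* sketch: the walk that previously ran even with an empty scheme list */
static void example_walk(struct damon_ctx *ctx)
{
	struct damon_target *t;
	struct damon_region *r;
	struct damos *s;

	damon_for_each_target(t, ctx) {
		damon_for_each_region(r, t) {
			damon_for_each_scheme(s, ctx) {
				/* with no schemes this body never runs, but
				 * the target/region traversal above was
				 * still paid for
				 */
			}
		}
	}
}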
Link: https://lkml.kernel.org/r/20230116062347.1148553-1-huaisheng.ye@intel.com Signed-off-by: Huaisheng Ye Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton (cherry picked from commit 64517d6e1291b5e942b00c53674ecf33f918313f) Bug: 300502883 Change-Id: Ic76ca90c85dbb24205b17dd914f91a8dd4cf7345 Signed-off-by: cui yangpei --- mm/damon/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index ceec75b88ef9..f338691e4591 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1230,7 +1230,8 @@ static int kdamond_fn(void *data) if (ctx->callback.after_aggregation && ctx->callback.after_aggregation(ctx)) break; - kdamond_apply_schemes(ctx); + if (!list_empty(&ctx->schemes)) + kdamond_apply_schemes(ctx); kdamond_reset_aggregated(ctx); kdamond_split_regions(ctx); if (ctx->ops.reset_aggregated) From a548d90994eb62ebdad8405df72576dd29d50e1a Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Fri, 2 Jun 2023 10:29:48 +0100 Subject: [PATCH 039/139] UPSTREAM: mm/damon/ops-common: refactor to use {pte|pmd}p_clear_young_notify() With the fix in place to atomically test and clear young on ptes and pmds, simplify the code to handle the clearing for both the primary mmu and the mmu notifier with a single API call. Link: https://lkml.kernel.org/r/20230602092949.545577-4-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Acked-by: Yu Zhao Reviewed-by: SeongJae Park Cc: Christoph Hellwig Cc: Kirill A. Shutemov Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Mike Rapoport (IBM) Cc: Uladzislau Rezki (Sony) Cc: Zi Yan Signed-off-by: Andrew Morton (cherry picked from commit fa8c919dac3f5f325b17f9fcf8ac7dd899992598) Bug: 300502883 Change-Id: I4414604788996e338ac638c3eb3ec1ef7959223e Signed-off-by: cui yangpei --- mm/damon/ops-common.c | 22 ++-------------------- 1 file changed, 2 insertions(+), 20 deletions(-) diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c index 13b99975cbc2..073481023bea 100644 --- a/mm/damon/ops-common.c +++ b/mm/damon/ops-common.c @@ -35,21 +35,12 @@ struct page *damon_get_page(unsigned long pfn) void damon_ptep_mkold(pte_t *pte, struct vm_area_struct *vma, unsigned long addr) { - bool referenced = false; struct page *page = damon_get_page(pte_pfn(*pte)); if (!page) return; - if (ptep_test_and_clear_young(vma, addr, pte)) - referenced = true; - -#ifdef CONFIG_MMU_NOTIFIER - if (mmu_notifier_clear_young(vma->vm_mm, addr, addr + PAGE_SIZE)) - referenced = true; -#endif /* CONFIG_MMU_NOTIFIER */ - - if (referenced) + if (ptep_clear_young_notify(vma, addr, pte)) set_page_young(page); set_page_idle(page); @@ -59,21 +50,12 @@ void damon_ptep_mkold(pte_t *pte, struct vm_area_struct *vma, unsigned long addr void damon_pmdp_mkold(pmd_t *pmd, struct vm_area_struct *vma, unsigned long addr) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE - bool referenced = false; struct page *page = damon_get_page(pmd_pfn(*pmd)); if (!page) return; - if (pmdp_test_and_clear_young(vma, addr, pmd)) - referenced = true; - -#ifdef CONFIG_MMU_NOTIFIER - if (mmu_notifier_clear_young(vma->vm_mm, addr, addr + HPAGE_PMD_SIZE)) - referenced = true; -#endif /* CONFIG_MMU_NOTIFIER */ - - if (referenced) + if (pmdp_clear_young_notify(vma, addr, pmd)) set_page_young(page); set_page_idle(page); From 7d48e19f74a3d74eb234f04e4565a3cebbd7c750 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 2 Aug 2023 21:32:17 +0000 Subject: [PATCH 040/139] UPSTREAM: mm/damon/sysfs-schemes: implement DAMOS tried total bytes file Patch series "mm/damon/sysfs-schemes: implement DAMOS tried 
total bytes file". The tried_regions directory of DAMON sysfs interface is useful for retrieving monitoring results snapshot or DAMOS debugging. However, for common use case that need to monitor only the total size of the scheme tried regions (e.g., monitoring working set size), the kernel overhead for directory construction and user overhead for reading the content could be high if the number of monitoring region is not small. This patchset implements DAMON sysfs files for efficient support of the use case. The first patch implements the sysfs file to reduce the user space overhead, and the second patch implements a command for reducing the kernel space overhead. The third patch adds a selftest for the new file, and following two patches update documents. [1] https://lore.kernel.org/damon/20230728201817.70602-1-sj@kernel.org/ This patch (of 5): The tried_regions directory can be used for retrieving the monitoring results snapshot for regions of specific access pattern, by setting the scheme's action as 'stat' and the access pattern as required. While the interface provides every detail of the monitoring results, some use cases including working set size monitoring requires only the total size of the regions. For such cases, users should read all the information and calculate the total size of the regions. However, it could incur high overhead if the number of regions is high. Add a file for retrieving only the information, namely 'total_bytes' file. It allows users to get the total size by reading only the file. Link: https://lkml.kernel.org/r/20230802213222.109841-1-sj@kernel.org Link: https://lkml.kernel.org/r/20230802213222.109841-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton (cherry picked from commit b69f92a741405336fb17a8a3d67fc144192fe8e2) Bug: 300502883 Change-Id: I49c225d15ba09a9b896341da14cc9f2b45578da7 Signed-off-by: cui yangpei --- mm/damon/sysfs-schemes.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 81fc4d27f4e4..1e0270490d50 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -117,6 +117,7 @@ struct damon_sysfs_scheme_regions { struct kobject kobj; struct list_head regions_list; int nr_regions; + unsigned long total_bytes; }; static struct damon_sysfs_scheme_regions * @@ -128,9 +129,19 @@ damon_sysfs_scheme_regions_alloc(void) regions->kobj = (struct kobject){}; INIT_LIST_HEAD(®ions->regions_list); regions->nr_regions = 0; + regions->total_bytes = 0; return regions; } +static ssize_t total_bytes_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_scheme_regions *regions = container_of(kobj, + struct damon_sysfs_scheme_regions, kobj); + + return sysfs_emit(buf, "%lu\n", regions->total_bytes); +} + static void damon_sysfs_scheme_regions_rm_dirs( struct damon_sysfs_scheme_regions *regions) { @@ -148,7 +159,11 @@ static void damon_sysfs_scheme_regions_release(struct kobject *kobj) kfree(container_of(kobj, struct damon_sysfs_scheme_regions, kobj)); } +static struct kobj_attribute damon_sysfs_scheme_regions_total_bytes_attr = + __ATTR_RO_MODE(total_bytes, 0400); + static struct attribute *damon_sysfs_scheme_regions_attrs[] = { + &damon_sysfs_scheme_regions_total_bytes_attr.attr, NULL, }; ATTRIBUTE_GROUPS(damon_sysfs_scheme_regions); @@ -1279,6 +1294,7 @@ static int damon_sysfs_before_damos_apply(struct damon_ctx *ctx, return 0; sysfs_regions = 
sysfs_schemes->schemes_arr[schemes_idx]->tried_regions; + sysfs_regions->total_bytes += r->ar.end - r->ar.start; region = damon_sysfs_scheme_region_alloc(r); list_add_tail(®ion->list, &sysfs_regions->regions_list); sysfs_regions->nr_regions++; @@ -1309,6 +1325,7 @@ int damon_sysfs_schemes_clear_regions( sysfs_scheme = sysfs_schemes->schemes_arr[schemes_idx++]; damon_sysfs_scheme_regions_rm_dirs( sysfs_scheme->tried_regions); + sysfs_scheme->tried_regions->total_bytes = 0; } return 0; } From b46391e09202c37cba18a2eb9a0049f3482c8cde Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 2 Aug 2023 21:32:18 +0000 Subject: [PATCH 041/139] UPSTREAM: mm/damon/sysfs: implement a command for updating only schemes tried total bytes Using tried_regions/total_bytes file, users can efficiently retrieve the total size of memory regions having specific access pattern. However, DAMON sysfs interface in kernel still populates all the infomration on the tried_regions subdirectories. That means the kernel part overhead for the construction of tried regions directories still exists. To remove the overhead, implement yet another command input for 'state' DAMON sysfs file. Writing the input to the file makes DAMON sysfs interface to update only the total_bytes file. Link: https://lkml.kernel.org/r/20230802213222.109841-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton (cherry picked from commit 6ad243b83b5094026fdb3171711ddb25246b3d8a) Bug: 300502883 Change-Id: Id0bdf13858d6a92de0eeef22f59a65ee884e9d20 Signed-off-by: cui yangpei --- mm/damon/sysfs-common.h | 2 +- mm/damon/sysfs-schemes.c | 7 ++++++- mm/damon/sysfs.c | 26 ++++++++++++++++++++------ 3 files changed, 27 insertions(+), 8 deletions(-) diff --git a/mm/damon/sysfs-common.h b/mm/damon/sysfs-common.h index 604a6cbc3ede..90b1bef41db8 100644 --- a/mm/damon/sysfs-common.h +++ b/mm/damon/sysfs-common.h @@ -47,7 +47,7 @@ void damon_sysfs_schemes_update_stats( int damon_sysfs_schemes_update_regions_start( struct damon_sysfs_schemes *sysfs_schemes, - struct damon_ctx *ctx); + struct damon_ctx *ctx, bool total_bytes_only); int damon_sysfs_schemes_update_regions_stop(struct damon_ctx *ctx); diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 1e0270490d50..3ca99586f45b 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -1266,6 +1266,7 @@ void damon_sysfs_schemes_update_stats( */ static struct damon_sysfs_schemes *damon_sysfs_schemes_for_damos_callback; static int damon_sysfs_schemes_region_idx; +static bool damos_regions_upd_total_bytes_only; /* * DAMON callback that called before damos apply. 
While this callback is @@ -1295,6 +1296,9 @@ static int damon_sysfs_before_damos_apply(struct damon_ctx *ctx, sysfs_regions = sysfs_schemes->schemes_arr[schemes_idx]->tried_regions; sysfs_regions->total_bytes += r->ar.end - r->ar.start; + if (damos_regions_upd_total_bytes_only) + return 0; + region = damon_sysfs_scheme_region_alloc(r); list_add_tail(®ion->list, &sysfs_regions->regions_list); sysfs_regions->nr_regions++; @@ -1333,10 +1337,11 @@ int damon_sysfs_schemes_clear_regions( /* Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock */ int damon_sysfs_schemes_update_regions_start( struct damon_sysfs_schemes *sysfs_schemes, - struct damon_ctx *ctx) + struct damon_ctx *ctx, bool total_bytes_only) { damon_sysfs_schemes_clear_regions(sysfs_schemes, ctx); damon_sysfs_schemes_for_damos_callback = sysfs_schemes; + damos_regions_upd_total_bytes_only = total_bytes_only; ctx->callback.before_damos_apply = damon_sysfs_before_damos_apply; return 0; } diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index aeb0beb1da91..fb9cf961b79c 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -999,6 +999,11 @@ enum damon_sysfs_cmd { * files. */ DAMON_SYSFS_CMD_UPDATE_SCHEMES_STATS, + /* + * @DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_BYTES: Update + * tried_regions/total_bytes sysfs files for each scheme. + */ + DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_BYTES, /* * @DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS: Update schemes tried * regions @@ -1021,6 +1026,7 @@ static const char * const damon_sysfs_cmd_strs[] = { "off", "commit", "update_schemes_stats", + "update_schemes_tried_bytes", "update_schemes_tried_regions", "clear_schemes_tried_regions", }; @@ -1206,12 +1212,14 @@ static void damon_sysfs_before_terminate(struct damon_ctx *ctx) { struct damon_target *t, *next; struct damon_sysfs_kdamond *kdamond; + enum damon_sysfs_cmd cmd; /* damon_sysfs_schemes_update_regions_stop() might not yet called */ kdamond = damon_sysfs_cmd_request.kdamond; - if (kdamond && damon_sysfs_cmd_request.cmd == - DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS && - ctx == kdamond->damon_ctx) { + cmd = damon_sysfs_cmd_request.cmd; + if (kdamond && ctx == kdamond->damon_ctx && + (cmd == DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS || + cmd == DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_BYTES)) { damon_sysfs_schemes_update_regions_stop(ctx); mutex_unlock(&damon_sysfs_lock); } @@ -1248,14 +1256,15 @@ static int damon_sysfs_upd_schemes_stats(struct damon_sysfs_kdamond *kdamond) } static int damon_sysfs_upd_schemes_regions_start( - struct damon_sysfs_kdamond *kdamond) + struct damon_sysfs_kdamond *kdamond, bool total_bytes_only) { struct damon_ctx *ctx = kdamond->damon_ctx; if (!ctx) return -EINVAL; return damon_sysfs_schemes_update_regions_start( - kdamond->contexts->contexts_arr[0]->schemes, ctx); + kdamond->contexts->contexts_arr[0]->schemes, ctx, + total_bytes_only); } static int damon_sysfs_upd_schemes_regions_stop( @@ -1332,6 +1341,7 @@ static int damon_sysfs_cmd_request_callback(struct damon_ctx *c) { struct damon_sysfs_kdamond *kdamond; static bool damon_sysfs_schemes_regions_updating; + bool total_bytes_only = false; int err = 0; /* avoid deadlock due to concurrent state_store('off') */ @@ -1348,9 +1358,13 @@ static int damon_sysfs_cmd_request_callback(struct damon_ctx *c) case DAMON_SYSFS_CMD_COMMIT: err = damon_sysfs_commit_input(kdamond); break; + case DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_BYTES: + total_bytes_only = true; + fallthrough; case DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS: if 
(!damon_sysfs_schemes_regions_updating) { - err = damon_sysfs_upd_schemes_regions_start(kdamond); + err = damon_sysfs_upd_schemes_regions_start(kdamond, + total_bytes_only); if (!err) { damon_sysfs_schemes_regions_updating = true; goto keep_lock_out; From f5a0a8bc43e23a5423c65ff554d5f28cce1933e5 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sat, 7 Oct 2023 20:04:32 +0000 Subject: [PATCH 042/139] UPSTREAM: mm/damon/sysfs: check DAMOS regions update progress from before_terminate() DAMON_SYSFS can receive DAMOS tried regions update request while kdamond is already out of the main loop and before_terminate callback (damon_sysfs_before_terminate() in this case) is not yet called. And damon_sysfs_handle_cmd() can further be finished before the callback is invoked. Then, damon_sysfs_before_terminate() unlocks damon_sysfs_lock, which is not locked by anyone. This happens because the callback function assumes damon_sysfs_cmd_request_callback() should be called before it. Check if the assumption was true before doing the unlock, to avoid this problem. Link: https://lkml.kernel.org/r/20231007200432.3110-1-sj@kernel.org Fixes: f1d13cacabe1 ("mm/damon/sysfs: implement DAMOS tried regions update command") Signed-off-by: SeongJae Park Cc: [6.2.x] Signed-off-by: Andrew Morton (cherry picked from commit 76b7069bcc89dec33f03eb08abee165d0306b754) Bug: 300502883 Change-Id: I7cd5e00c0d0226dc8d7856d103f88a26307cafce Signed-off-by: cui yangpei --- mm/damon/sysfs.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index fb9cf961b79c..a3a1c94dc60d 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1208,6 +1208,8 @@ static int damon_sysfs_set_targets(struct damon_ctx *ctx, return 0; } +static bool damon_sysfs_schemes_regions_updating; + static void damon_sysfs_before_terminate(struct damon_ctx *ctx) { struct damon_target *t, *next; @@ -1219,8 +1221,10 @@ static void damon_sysfs_before_terminate(struct damon_ctx *ctx) cmd = damon_sysfs_cmd_request.cmd; if (kdamond && ctx == kdamond->damon_ctx && (cmd == DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS || - cmd == DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_BYTES)) { + cmd == DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_BYTES) && + damon_sysfs_schemes_regions_updating) { damon_sysfs_schemes_update_regions_stop(ctx); + damon_sysfs_schemes_regions_updating = false; mutex_unlock(&damon_sysfs_lock); } @@ -1340,7 +1344,6 @@ static int damon_sysfs_commit_input(struct damon_sysfs_kdamond *kdamond) static int damon_sysfs_cmd_request_callback(struct damon_ctx *c) { struct damon_sysfs_kdamond *kdamond; - static bool damon_sysfs_schemes_regions_updating; bool total_bytes_only = false; int err = 0; From c194e597cb3ced4c779d6ca35c9fe879f772a7c2 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 12 Oct 2023 19:22:53 +0000 Subject: [PATCH 043/139] UPSTREAM: mm/damon/sysfs-schemes: do not update tried regions more than one DAMON snapshot Patch series "mm/damon/sysfs-schemes: Do DAMOS tried regions update for only one apply interval". DAMOS tried regions update feature of DAMON sysfs interface is doing the update for one aggregation interval after the request is made. Since the per-scheme apply interval is supported, that behavior makes no much sense. That is, the tried regions directory will have regions from multiple DAMON monitoring results snapshots, or no region for apply intervals that much shorter than, or longer than the aggregation interval, respectively. 
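For instance, assuming the default 100ms aggregation interval, a scheme with a 10ms apply interval would contribute regions from roughly ten different snapshots to a single update, while a scheme with a 1s apply interval could contribute no region at all for a given update; the numbers are only illustrative.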
Update the behavior to update the regions for each scheme for only its apply interval, and update the document. Since DAMOS apply interval is the aggregation by default, this change makes no visible behavioral difference to old users who don't explicitly set the apply intervals. Patches Sequence ---------------- The first two patches makes schemes of apply intervals that much shorter or longer than the aggregation interval to keep the maximum and minimum times for continuing the update. After the two patches, the update aligns with the each scheme's apply interval. Finally, the third patch updates the document to reflect the behavior. This patch (of 3): DAMON_SYSFS exposes every DAMON-found region that eligible for applying the scheme action for one aggregation interval. However, each DAMON-based operation scheme has its own apply interval. Hence, for a scheme that having its apply interval much smaller than the aggregation interval, DAMON_SYSFS will expose the scheme regions that applied to more than one DAMON monitoring results snapshots. Since the purpose of DAMON tried regions is exposing single snapshot, this makes no much sense. Track progress of each scheme's tried regions update and avoid the case. Link: https://lkml.kernel.org/r/20231012192256.33556-1-sj@kernel.org Link: https://lkml.kernel.org/r/20231012192256.33556-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton (cherry picked from commit 4d4e41b682990b1dc5bba2bc313800340bf5c2d4) Bug: 300502883 Change-Id: I78602a6810a9b4d8d131c3ace69f255ac1349d13 Signed-off-by: cui yangpei --- mm/damon/sysfs-schemes.c | 77 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 3ca99586f45b..4ee188714ca2 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -113,11 +113,47 @@ static struct kobj_type damon_sysfs_scheme_region_ktype = { * scheme regions directory */ +/* + * enum damos_sysfs_regions_upd_status - Represent DAMOS tried regions update + * status + * @DAMOS_TRIED_REGIONS_UPD_IDLE: Waiting for next request. + * @DAMOS_TRIED_REGIONS_UPD_STARTED: Update started. + * @DAMOS_TRIED_REGIONS_UPD_FINISHED: Update finished. + * + * Each DAMON-based operation scheme (&struct damos) has its own apply + * interval, and we need to expose the scheme tried regions based on only + * single snapshot. For this, we keep the tried regions update status for each + * scheme. The status becomes 'idle' at the beginning. + * + * Once the tried regions update request is received, the request handling + * start function (damon_sysfs_scheme_update_regions_start()) sets the status + * of all schemes as 'idle' again, and register ->before_damos_apply() and + * ->after_sampling() callbacks. + * + * Then, the first followup ->before_damos_apply() callback + * (damon_sysfs_before_damos_apply()) sets the status 'started'. The first + * ->after_sampling() callback (damon_sysfs_after_sampling()) after the call + * is called only after the scheme is completely applied + * to the given snapshot. Hence the callback knows the situation by showing + * 'started' status, and sets the status as 'finished'. Then, + * damon_sysfs_before_damos_apply() understands the situation by showing the + * 'finished' status and do nothing. + * + * Finally, the tried regions request handling finisher function + * (damon_sysfs_schemes_update_regions_stop()) unregisters the callbacks. 
+ */ +enum damos_sysfs_regions_upd_status { + DAMOS_TRIED_REGIONS_UPD_IDLE, + DAMOS_TRIED_REGIONS_UPD_STARTED, + DAMOS_TRIED_REGIONS_UPD_FINISHED, +}; + struct damon_sysfs_scheme_regions { struct kobject kobj; struct list_head regions_list; int nr_regions; unsigned long total_bytes; + enum damos_sysfs_regions_upd_status upd_status; }; static struct damon_sysfs_scheme_regions * @@ -130,6 +166,7 @@ damon_sysfs_scheme_regions_alloc(void) INIT_LIST_HEAD(®ions->regions_list); regions->nr_regions = 0; regions->total_bytes = 0; + regions->upd_status = DAMOS_TRIED_REGIONS_UPD_IDLE; return regions; } @@ -1295,6 +1332,10 @@ static int damon_sysfs_before_damos_apply(struct damon_ctx *ctx, return 0; sysfs_regions = sysfs_schemes->schemes_arr[schemes_idx]->tried_regions; + if (sysfs_regions->upd_status == DAMOS_TRIED_REGIONS_UPD_FINISHED) + return 0; + if (sysfs_regions->upd_status == DAMOS_TRIED_REGIONS_UPD_IDLE) + sysfs_regions->upd_status = DAMOS_TRIED_REGIONS_UPD_STARTED; sysfs_regions->total_bytes += r->ar.end - r->ar.start; if (damos_regions_upd_total_bytes_only) return 0; @@ -1311,6 +1352,29 @@ static int damon_sysfs_before_damos_apply(struct damon_ctx *ctx, return 0; } +/* + * DAMON callback that called after each accesses sampling. While this + * callback is registered, damon_sysfs_lock should be held to ensure the + * regions directories exist. + */ +static int damon_sysfs_after_sampling(struct damon_ctx *ctx) +{ + struct damon_sysfs_schemes *sysfs_schemes = + damon_sysfs_schemes_for_damos_callback; + struct damon_sysfs_scheme_regions *sysfs_regions; + int i; + + for (i = 0; i < sysfs_schemes->nr; i++) { + sysfs_regions = sysfs_schemes->schemes_arr[i]->tried_regions; + if (sysfs_regions->upd_status == + DAMOS_TRIED_REGIONS_UPD_STARTED) + sysfs_regions->upd_status = + DAMOS_TRIED_REGIONS_UPD_FINISHED; + } + + return 0; +} + /* Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock */ int damon_sysfs_schemes_clear_regions( struct damon_sysfs_schemes *sysfs_schemes, @@ -1334,6 +1398,16 @@ int damon_sysfs_schemes_clear_regions( return 0; } +static void damos_tried_regions_init_upd_status( + struct damon_sysfs_schemes *sysfs_schemes) +{ + int i; + + for (i = 0; i < sysfs_schemes->nr; i++) + sysfs_schemes->schemes_arr[i]->tried_regions->upd_status = + DAMOS_TRIED_REGIONS_UPD_IDLE; +} + /* Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock */ int damon_sysfs_schemes_update_regions_start( struct damon_sysfs_schemes *sysfs_schemes, @@ -1341,8 +1415,10 @@ int damon_sysfs_schemes_update_regions_start( { damon_sysfs_schemes_clear_regions(sysfs_schemes, ctx); damon_sysfs_schemes_for_damos_callback = sysfs_schemes; + damos_tried_regions_init_upd_status(sysfs_schemes); damos_regions_upd_total_bytes_only = total_bytes_only; ctx->callback.before_damos_apply = damon_sysfs_before_damos_apply; + ctx->callback.after_sampling = damon_sysfs_after_sampling; return 0; } @@ -1355,6 +1431,7 @@ int damon_sysfs_schemes_update_regions_stop(struct damon_ctx *ctx) { damon_sysfs_schemes_for_damos_callback = NULL; ctx->callback.before_damos_apply = NULL; + ctx->callback.after_sampling = NULL; damon_sysfs_schemes_region_idx = 0; return 0; } From 1e19db10e7ac53a11ca4ae40a60ae232699393ee Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 12 Oct 2023 19:22:54 +0000 Subject: [PATCH 044/139] UPSTREAM: mm/damon/sysfs: avoid empty scheme tried regions for large apply interval DAMON_SYSFS assumes all schemes will be applied for at least one DAMON monitoring results snapshot within one aggregation 
interval, or makes no sense to wait for it while DAMON is deactivated by the watermarks. That for deactivated status still makes sense, but the aggregation interval based assumption is invalid now because each scheme can has its own apply interval. For schemes having larger than the aggregation or watermarks check interval, DAMOS tried regions update request can be finished without the update. Avoid the case by explicitly checking the status of the schemes tried regions update and watermarks based DAMON deactivation. Link: https://lkml.kernel.org/r/20231012192256.33556-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton (cherry picked from commit 76126332c7606ba25a4ae5db37145fd526985b45) Bug: 300502883 Change-Id: I8283709a023123d7a89fd37a1d4a834888c15c7e Signed-off-by: cui yangpei --- mm/damon/sysfs-common.h | 2 ++ mm/damon/sysfs-schemes.c | 16 ++++++++++++++++ mm/damon/sysfs.c | 34 ++++++++++++++++++++++++++++++---- 3 files changed, 48 insertions(+), 4 deletions(-) diff --git a/mm/damon/sysfs-common.h b/mm/damon/sysfs-common.h index 90b1bef41db8..3db199c84ed3 100644 --- a/mm/damon/sysfs-common.h +++ b/mm/damon/sysfs-common.h @@ -49,6 +49,8 @@ int damon_sysfs_schemes_update_regions_start( struct damon_sysfs_schemes *sysfs_schemes, struct damon_ctx *ctx, bool total_bytes_only); +bool damos_sysfs_regions_upd_done(void); + int damon_sysfs_schemes_update_regions_stop(struct damon_ctx *ctx); int damon_sysfs_schemes_clear_regions( diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 4ee188714ca2..6df715f113ae 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -1422,6 +1422,22 @@ int damon_sysfs_schemes_update_regions_start( return 0; } +bool damos_sysfs_regions_upd_done(void) +{ + struct damon_sysfs_schemes *sysfs_schemes = + damon_sysfs_schemes_for_damos_callback; + struct damon_sysfs_scheme_regions *sysfs_regions; + int i; + + for (i = 0; i < sysfs_schemes->nr; i++) { + sysfs_regions = sysfs_schemes->schemes_arr[i]->tried_regions; + if (sysfs_regions->upd_status != + DAMOS_TRIED_REGIONS_UPD_FINISHED) + return false; + } + return true; +} + /* * Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock. Caller * should unlock damon_sysfs_lock which held before diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index a3a1c94dc60d..ca7bd9dfb7d7 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1336,12 +1336,13 @@ static int damon_sysfs_commit_input(struct damon_sysfs_kdamond *kdamond) /* * damon_sysfs_cmd_request_callback() - DAMON callback for handling requests. - * @c: The DAMON context of the callback. + * @c: The DAMON context of the callback. + * @active: Whether @c is not deactivated due to watermarks. * * This function is periodically called back from the kdamond thread for @c. * Then, it checks if there is a waiting DAMON sysfs request and handles it. */ -static int damon_sysfs_cmd_request_callback(struct damon_ctx *c) +static int damon_sysfs_cmd_request_callback(struct damon_ctx *c, bool active) { struct damon_sysfs_kdamond *kdamond; bool total_bytes_only = false; @@ -1373,6 +1374,13 @@ static int damon_sysfs_cmd_request_callback(struct damon_ctx *c) goto keep_lock_out; } } else { + /* + * Continue regions updating if DAMON is till + * active and the update for all schemes is not + * finished. 
+ */ + if (active && !damos_sysfs_regions_upd_done()) + goto keep_lock_out; err = damon_sysfs_upd_schemes_regions_stop(kdamond); damon_sysfs_schemes_regions_updating = false; } @@ -1392,6 +1400,24 @@ keep_lock_out: return err; } +static int damon_sysfs_after_wmarks_check(struct damon_ctx *c) +{ + /* + * after_wmarks_check() is called back while the context is deactivated + * by watermarks. + */ + return damon_sysfs_cmd_request_callback(c, false); +} + +static int damon_sysfs_after_aggregation(struct damon_ctx *c) +{ + /* + * after_aggregation() is called back only while the context is not + * deactivated by watermarks. + */ + return damon_sysfs_cmd_request_callback(c, true); +} + static struct damon_ctx *damon_sysfs_build_ctx( struct damon_sysfs_context *sys_ctx) { @@ -1407,8 +1433,8 @@ static struct damon_ctx *damon_sysfs_build_ctx( return ERR_PTR(err); } - ctx->callback.after_wmarks_check = damon_sysfs_cmd_request_callback; - ctx->callback.after_aggregation = damon_sysfs_cmd_request_callback; + ctx->callback.after_wmarks_check = damon_sysfs_after_wmarks_check; + ctx->callback.after_aggregation = damon_sysfs_after_aggregation; ctx->callback.before_terminate = damon_sysfs_before_terminate; return ctx; } From 6b7c4cc2624da6285a97a16d6fad00a6d3f5df46 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sun, 22 Oct 2023 21:07:33 +0000 Subject: [PATCH 045/139] UPSTREAM: mm/damon/sysfs: remove requested targets when online-commit inputs damon_sysfs_set_targets(), which updates the targets of the context for online commitment, do not remove targets that removed from the corresponding sysfs files. As a result, more than intended targets of the context can exist and hence consume memory and monitoring CPU resource more than expected. Fix it by removing all targets of the context and fill up again using the user input. This could cause unnecessary memory dealloc and realloc operations, but this is not a hot code path. Also, note that damon_target is stateless, and hence no data is lost. [sj@kernel.org: fix unnecessary monitoring results removal] Link: https://lkml.kernel.org/r/20231028213353.45397-1-sj@kernel.org Link: https://lkml.kernel.org/r/20231022210735.46409-2-sj@kernel.org Fixes: da87878010e5 ("mm/damon/sysfs: support online inputs update") Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: [5.19.x] Signed-off-by: Andrew Morton (cherry picked from commit 19467a950b49432a84bf6dbadbbb17bdf89418b7) Bug: 300502883 Change-Id: Icf094f138e6810182d23d2c412fbabe3ecd960fe Signed-off-by: cui yangpei --- mm/damon/sysfs.c | 70 +++++++++++++++++++++++++----------------------- 1 file changed, 36 insertions(+), 34 deletions(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index ca7bd9dfb7d7..4b3f05fc34a2 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1150,58 +1150,60 @@ destroy_targets_out: return err; } -/* - * Search a target in a context that corresponds to the sysfs target input. - * - * Return: pointer to the target if found, NULL if not found, or negative - * error code if the search failed. 
- */ -static struct damon_target *damon_sysfs_existing_target( - struct damon_sysfs_target *sys_target, struct damon_ctx *ctx) +static int damon_sysfs_update_target(struct damon_target *target, + struct damon_ctx *ctx, + struct damon_sysfs_target *sys_target) { struct pid *pid; - struct damon_target *t; + struct damon_region *r, *next; - if (!damon_target_has_pid(ctx)) { - /* Up to only one target for paddr could exist */ - damon_for_each_target(t, ctx) - return t; - return NULL; - } + if (!damon_target_has_pid(ctx)) + return 0; - /* ops.id should be DAMON_OPS_VADDR or DAMON_OPS_FVADDR */ pid = find_get_pid(sys_target->pid); if (!pid) - return ERR_PTR(-EINVAL); - damon_for_each_target(t, ctx) { - if (t->pid == pid) { - put_pid(pid); - return t; - } + return -EINVAL; + + /* no change to the target */ + if (pid == target->pid) { + put_pid(pid); + return 0; } - put_pid(pid); - return NULL; + + /* remove old monitoring results and update the target's pid */ + damon_for_each_region_safe(r, next, target) + damon_destroy_region(r, target); + put_pid(target->pid); + target->pid = pid; + return 0; } static int damon_sysfs_set_targets(struct damon_ctx *ctx, struct damon_sysfs_targets *sysfs_targets) { - int i, err; + struct damon_target *t, *next; + int i = 0, err; /* Multiple physical address space monitoring targets makes no sense */ if (ctx->ops.id == DAMON_OPS_PADDR && sysfs_targets->nr > 1) return -EINVAL; - for (i = 0; i < sysfs_targets->nr; i++) { - struct damon_sysfs_target *st = sysfs_targets->targets_arr[i]; - struct damon_target *t = damon_sysfs_existing_target(st, ctx); + damon_for_each_target_safe(t, next, ctx) { + if (i < sysfs_targets->nr) { + damon_sysfs_update_target(t, ctx, + sysfs_targets->targets_arr[i]); + } else { + if (damon_target_has_pid(ctx)) + put_pid(t->pid); + damon_destroy_target(t); + } + i++; + } - if (IS_ERR(t)) - return PTR_ERR(t); - if (!t) - err = damon_sysfs_add_target(st, ctx); - else - err = damon_sysfs_set_regions(t, st->regions); + for (; i < sysfs_targets->nr; i++) { + struct damon_sysfs_target *st = sysfs_targets->targets_arr[i]; + + err = damon_sysfs_add_target(st, ctx); if (err) return err; } From c132d077ebc6073b0dfc70b82897cfd690e3d3ca Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 31 Oct 2023 17:01:31 +0000 Subject: [PATCH 046/139] UPSTREAM: mm/damon/sysfs: update monitoring target regions for online input commit When user input is committed online, DAMON sysfs interface is ignoring the user input for the monitoring target regions. Such request is valid and useful for fixed monitoring target regions-based monitoring ops like 'paddr' or 'fvaddr'. Update the region boundaries as user specified, too. Note that the monitoring results of the regions that overlap between the latest monitoring target regions and the new target regions are preserved. Treat empty monitoring target regions user request as a request to just make no change to the monitoring target regions. Otherwise, users should set the monitoring target regions same to current one for every online input commit, and it could be challenging for dynamic monitoring target regions update DAMON ops like 'vaddr'. If the user really need to remove all monitoring target regions, they can simply remove the target and then create the target again with empty target regions. 
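In other words, the per-target commit logic reduces to the following minimal sketch (mirroring the diff below; damon_sysfs_set_regions() hands the new ranges to damon_set_regions(), which is what preserves the results of overlapping regions):

	/* sketch: committing one target's region boundaries online */
	if (!sys_target->regions->nr)
		return 0;	/* empty input: keep the current regions */
	/* non-empty input: update boundaries, preserving overlaps */
	return damon_sysfs_set_regions(target, sys_target->regions);
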
Link: https://lkml.kernel.org/r/20231031170131.46972-1-sj@kernel.org Fixes: da87878010e5 ("mm/damon/sysfs: support online inputs update") Signed-off-by: SeongJae Park Cc: [5.19+] Signed-off-by: Andrew Morton (cherry picked from commit 9732336006764e2ee61225387e3c70eae9139035) Bug: 300502883 Change-Id: I6857482470951382c9be36f2099da76e9b71d502 Signed-off-by: cui yangpei --- mm/damon/sysfs.c | 51 ++++++++++++++++++++++++++++++------------------ 1 file changed, 32 insertions(+), 19 deletions(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 4b3f05fc34a2..507ca2a95a87 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1150,32 +1150,45 @@ destroy_targets_out: return err; } +static int damon_sysfs_update_target_pid(struct damon_target *target, int pid) +{ + struct pid *pid_new; + + pid_new = find_get_pid(pid); + if (!pid_new) + return -EINVAL; + + if (pid_new == target->pid) { + put_pid(pid_new); + return 0; + } + + put_pid(target->pid); + target->pid = pid_new; + return 0; +} + static int damon_sysfs_update_target(struct damon_target *target, struct damon_ctx *ctx, struct damon_sysfs_target *sys_target) { - struct pid *pid; - struct damon_region *r, *next; + int err; - if (!damon_target_has_pid(ctx)) - return 0; - - pid = find_get_pid(sys_target->pid); - if (!pid) - return -EINVAL; - - /* no change to the target */ - if (pid == target->pid) { - put_pid(pid); - return 0; + if (damon_target_has_pid(ctx)) { + err = damon_sysfs_update_target_pid(target, sys_target->pid); + if (err) + return err; } - /* remove old monitoring results and update the target's pid */ - damon_for_each_region_safe(r, next, target) - damon_destroy_region(r, target); - put_pid(target->pid); - target->pid = pid; - return 0; + /* + * Do monitoring target region boundary update only if one or more + * regions are set by the user. This is for keeping current monitoring + * target results and range easier, especially for dynamic monitoring + * target regions update ops like 'vaddr'. + */ + if (sys_target->regions->nr) + err = damon_sysfs_set_regions(target, sys_target->regions); + return err; } static int damon_sysfs_set_targets(struct damon_ctx *ctx, From 606444fd06a65bc6ecd3b582052adc84c96bcdb0 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 6 Nov 2023 17:07:40 +0300 Subject: [PATCH 047/139] UPSTREAM: mm/damon/sysfs: eliminate potential uninitialized variable warning The "err" variable is not initialized if damon_target_has_pid(ctx) is false and sys_target->regions->nr is zero. 
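A minimal sketch of the problematic path, with both branches skipped (names taken from the code in question):

	int err;	/* not initialized */

	if (damon_target_has_pid(ctx))		/* false, e.g. for 'paddr' */
		err = damon_sysfs_update_target_pid(target, sys_target->pid);
	if (sys_target->regions->nr)		/* zero: no region input */
		err = damon_sysfs_set_regions(target, sys_target->regions);
	return err;	/* an indeterminate value reaches the caller */
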
Link: https://lkml.kernel.org/r/739e6aaf-a634-4e33-98a8-16546379ec9f@moroto.mountain Fixes: 0bcd216c4741 ("mm/damon/sysfs: update monitoring target regions for online input commit") Signed-off-by: Dan Carpenter Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton (cherry picked from commit 85c2ceaafbd306814a3a4740bf4d95ac26a8b36a) Bug: 300502883 Change-Id: I235ea1bfc9d8bf0fef426dbc21881d755e3a5d67 Signed-off-by: cui yangpei --- mm/damon/sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 507ca2a95a87..6545a46da606 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1172,7 +1172,7 @@ static int damon_sysfs_update_target(struct damon_target *target, struct damon_ctx *ctx, struct damon_sysfs_target *sys_target) { - int err; + int err = 0; if (damon_target_has_pid(ctx)) { err = damon_sysfs_update_target_pid(target, sys_target->pid); From 7fbeab3c65e91069096a7a5811abf91262d67ec0 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 6 Nov 2023 23:34:06 +0000 Subject: [PATCH 048/139] UPSTREAM: mm/damon/sysfs: check error from damon_sysfs_update_target() Patch series "mm/damon/sysfs: fix unhandled return values". Some of the DAMON sysfs interface code is not handling return values from some functions. As a result, confusing user input handling or a NULL dereference is possible. Check those properly. This patch (of 3): damon_sysfs_update_target() returns an error code for failures, but its caller, damon_sysfs_set_targets(), is ignoring that. The update function seems to make no critical change in case of such failures, but the behavior will look like DAMON sysfs is silently ignoring or only partially accepting the user input. Fix it. Link: https://lkml.kernel.org/r/20231106233408.51159-1-sj@kernel.org Link: https://lkml.kernel.org/r/20231106233408.51159-2-sj@kernel.org Fixes: 19467a950b49 ("mm/damon/sysfs: remove requested targets when online-commit inputs") Signed-off-by: SeongJae Park Cc: [5.19+] Signed-off-by: Andrew Morton (cherry picked from commit b4936b544b08ed44949055b92bd25f77759ebafc) Bug: 300502883 Change-Id: I9bfea66f76ad094ed73defee5ff3fdb3794e8162 Signed-off-by: cui yangpei --- mm/damon/sysfs.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 6545a46da606..0a6b4625de9f 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1203,8 +1203,10 @@ static int damon_sysfs_set_targets(struct damon_ctx *ctx, damon_for_each_target_safe(t, next, ctx) { if (i < sysfs_targets->nr) { - damon_sysfs_update_target(t, ctx, + err = damon_sysfs_update_target(t, ctx, sysfs_targets->targets_arr[i]); + if (err) + return err; } else { if (damon_target_has_pid(ctx)) put_pid(t->pid); From 1cedfc05e9c1fa234a52b833b06722eb430e15ac Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 6 Nov 2023 23:34:07 +0000 Subject: [PATCH 049/139] UPSTREAM: mm/damon/sysfs-schemes: handle tried regions sysfs directory allocation failure DAMOS tried regions sysfs directory allocation function (damon_sysfs_scheme_regions_alloc()) is not handling the memory allocation failure. In that case, the code will dereference a NULL pointer. Handle the failure to avoid such invalid access.
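Without the new check, the failed allocation is dereferenced immediately; a minimal sketch of the fixed allocator (mirroring the diff below):

	regions = kmalloc(sizeof(*regions), GFP_KERNEL);
	if (!regions)
		return NULL;	/* let the caller handle the failure */
	regions->kobj = (struct kobject){};	/* NULL dereference without the check */
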
Link: https://lkml.kernel.org/r/20231106233408.51159-3-sj@kernel.org Fixes: 9277d0367ba1 ("mm/damon/sysfs-schemes: implement scheme region directory") Signed-off-by: SeongJae Park Cc: [6.2+] Signed-off-by: Andrew Morton (cherry picked from commit 84055688b6bc075c92a88e2d6c3ad26ab93919f9) Bug: 300502883 Change-Id: I86ecb2f3cf1604199b5567576b1fa583914f7f36 Signed-off-by: cui yangpei --- mm/damon/sysfs-schemes.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 6df715f113ae..b79b7c0eb9d0 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -162,6 +162,9 @@ damon_sysfs_scheme_regions_alloc(void) struct damon_sysfs_scheme_regions *regions = kmalloc(sizeof(*regions), GFP_KERNEL); + if (!regions) + return NULL; + regions->kobj = (struct kobject){}; INIT_LIST_HEAD(&regions->regions_list); regions->nr_regions = 0; From 31c59d59c7a7963e0ceec6c35f166d19ebabf8fd Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 6 Nov 2023 23:34:08 +0000 Subject: [PATCH 050/139] UPSTREAM: mm/damon/sysfs-schemes: handle tried region directory allocation failure DAMON sysfs interface's before_damos_apply callback (damon_sysfs_before_damos_apply()), which creates the DAMOS tried regions for each DAMOS action applied region, is not handling the allocation failure for the sysfs directory data. As a result, a NULL pointer dereference is possible. Fix it by handling the case. Link: https://lkml.kernel.org/r/20231106233408.51159-4-sj@kernel.org Fixes: f1d13cacabe1 ("mm/damon/sysfs: implement DAMOS tried regions update command") Signed-off-by: SeongJae Park Cc: [6.2+] Signed-off-by: Andrew Morton (cherry picked from commit ae636ae2bbfd9279f5681dbf320d1da817e52b68) Bug: 300502883 Change-Id: I98568f4b0cee9fea82f4fe6d3e7a505370c3c304 Signed-off-by: cui yangpei --- mm/damon/sysfs-schemes.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index b79b7c0eb9d0..12b2c903b0a0 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -1344,6 +1344,8 @@ static int damon_sysfs_before_damos_apply(struct damon_ctx *ctx, return 0; region = damon_sysfs_scheme_region_alloc(r); + if (!region) + return 0; list_add_tail(&region->list, &sysfs_regions->regions_list); sysfs_regions->nr_regions++; if (kobject_init_and_add(&region->kobj, From cee8ebf7c50810d851430ab046eb25f3a56bbac1 Mon Sep 17 00:00:00 2001 From: cuiyangpei Date: Wed, 6 Dec 2023 21:03:17 +0800 Subject: [PATCH 051/139] ANDROID: GKI: build damon for monitoring virtual address spaces Enable DAMON related configs in gki_defconfig.
Bug: 300502883 Change-Id: Ie00a923464d2f1fff8f12a8804cbac040f0cacdf Signed-off-by: cuiyangpei --- arch/arm64/configs/gki_defconfig | 3 +++ arch/x86/configs/gki_defconfig | 3 +++ 2 files changed, 6 insertions(+) diff --git a/arch/arm64/configs/gki_defconfig b/arch/arm64/configs/gki_defconfig index abf7b41cdc85..38c90d04264e 100644 --- a/arch/arm64/configs/gki_defconfig +++ b/arch/arm64/configs/gki_defconfig @@ -123,6 +123,9 @@ CONFIG_ANON_VMA_NAME=y CONFIG_USERFAULTFD=y CONFIG_LRU_GEN=y CONFIG_LRU_GEN_ENABLED=y +CONFIG_DAMON=y +CONFIG_DAMON_VADDR=y +CONFIG_DAMON_SYSFS=y CONFIG_NET=y CONFIG_PACKET=y CONFIG_UNIX=y diff --git a/arch/x86/configs/gki_defconfig b/arch/x86/configs/gki_defconfig index 34a85587c4f4..22797e912979 100644 --- a/arch/x86/configs/gki_defconfig +++ b/arch/x86/configs/gki_defconfig @@ -118,6 +118,9 @@ CONFIG_ANON_VMA_NAME=y CONFIG_USERFAULTFD=y CONFIG_LRU_GEN=y CONFIG_LRU_GEN_ENABLED=y +CONFIG_DAMON=y +CONFIG_DAMON_VADDR=y +CONFIG_DAMON_SYSFS=y CONFIG_NET=y CONFIG_PACKET=y CONFIG_UNIX=y From efa8f34b5aa8a380bd397baba5a88e0a21592795 Mon Sep 17 00:00:00 2001 From: John Scheible Date: Wed, 13 Dec 2023 15:29:57 -0800 Subject: [PATCH 052/139] ANDROID: Update the ABI symbol list Adding the following symbols: - dma_fence_enable_sw_signaling - dma_fence_unwrap_first - __dma_fence_unwrap_merge - dma_fence_unwrap_next 3 function symbol(s) added 'struct dma_fence* __dma_fence_unwrap_merge(unsigned int, struct dma_fence**, struct dma_fence_unwrap*)' 'struct dma_fence* dma_fence_unwrap_first(struct dma_fence*, struct dma_fence_unwrap*)' 'struct dma_fence* dma_fence_unwrap_next(struct dma_fence_unwrap*)' Bug: 316212868 Change-Id: I41a4d906e98c983c4b612f65127bd7ef7ac5cb85 Signed-off-by: John Scheible --- android/abi_gki_aarch64.stg | 75 +++++++++++++++++++++++++++++++++++ android/abi_gki_aarch64_pixel | 4 ++ 2 files changed, 79 insertions(+) diff --git a/android/abi_gki_aarch64.stg b/android/abi_gki_aarch64.stg index 0ceacc54c6de..68cf985124c9 100644 --- a/android/abi_gki_aarch64.stg +++ b/android/abi_gki_aarch64.stg @@ -3098,6 +3098,11 @@ pointer_reference { kind: POINTER pointee_type_id: 0x07b1db01 } +pointer_reference { + id: 0x0b7d7682 + kind: POINTER + pointee_type_id: 0x07b53c95 +} pointer_reference { id: 0x0b85846c kind: POINTER @@ -49124,6 +49129,12 @@ member { name: "array" type_id: 0x38d23361 } +member { + id: 0xdfa4a7f6 + name: "array" + type_id: 0x030b9acf + offset: 64 +} member { id: 0xdfcad7c4 name: "array" @@ -59752,6 +59763,11 @@ member { type_id: 0x1a8b04e5 offset: 384 } +member { + id: 0x15741053 + name: "chain" + type_id: 0x030b9acf +} member { id: 0x15798222 name: "chain" @@ -224625,6 +224641,17 @@ struct_union { member_id: 0x14d0dfac } } +struct_union { + id: 0x07b53c95 + kind: STRUCT + name: "dma_fence_unwrap" + definition { + bytesize: 24 + member_id: 0x15741053 + member_id: 0xdfa4a7f6 + member_id: 0xad7c841b + } +} struct_union { id: 0x7f49bdff kind: STRUCT @@ -328631,6 +328658,13 @@ function { parameter_id: 0xc9082b19 parameter_id: 0xc9082b19 } +function { + id: 0xce0d9e89 + return_type_id: 0x030b9acf + parameter_id: 0x4585663f + parameter_id: 0x0a52df14 + parameter_id: 0x0b7d7682 +} function { id: 0xce0dc24b return_type_id: 0x4585663f @@ -329053,6 +329087,11 @@ function { parameter_id: 0x1e9745d3 parameter_id: 0x1e9745d3 } +function { + id: 0xdd980e87 + return_type_id: 0x030b9acf + parameter_id: 0x0b7d7682 +} function { id: 0xddb49ff7 return_type_id: 0x3ae3ff84 @@ -329212,6 +329251,12 @@ function { return_type_id: 0x030b9acf parameter_id: 0x030b9acf } +function 
{ + id: 0xdfa8404e + return_type_id: 0x030b9acf + parameter_id: 0x030b9acf + parameter_id: 0x0b7d7682 +} function { id: 0xdfba2774 return_type_id: 0x4585663f @@ -332379,6 +332424,15 @@ elf_symbol { type_id: 0x4058e56a full_name: "__devres_alloc_node" } +elf_symbol { + id: 0xfa3b077f + name: "__dma_fence_unwrap_merge" + is_defined: true + symbol_type: FUNCTION + crc: 0xd88defca + type_id: 0xce0d9e89 + full_name: "__dma_fence_unwrap_merge" +} elf_symbol { id: 0x0a6e3e89 name: "__dma_request_channel" @@ -354070,6 +354124,24 @@ elf_symbol { type_id: 0x9d05158e full_name: "dma_fence_signal_timestamp_locked" } +elf_symbol { + id: 0x2012ba51 + name: "dma_fence_unwrap_first" + is_defined: true + symbol_type: FUNCTION + crc: 0xc3cd6929 + type_id: 0xdfa8404e + full_name: "dma_fence_unwrap_first" +} +elf_symbol { + id: 0xf5f318e9 + name: "dma_fence_unwrap_next" + is_defined: true + symbol_type: FUNCTION + crc: 0xd13e4af8 + type_id: 0xdd980e87 + full_name: "dma_fence_unwrap_next" +} elf_symbol { id: 0xf18ac584 name: "dma_fence_wait_any_timeout" @@ -398114,6 +398186,7 @@ interface { symbol_id: 0x279e51a3 symbol_id: 0xe78c29b1 symbol_id: 0x95c24824 + symbol_id: 0xfa3b077f symbol_id: 0x0a6e3e89 symbol_id: 0x347a699c symbol_id: 0x27ce6aa1 @@ -400523,6 +400596,8 @@ interface { symbol_id: 0xe2a2feec symbol_id: 0x904cad71 symbol_id: 0x2b7d2f8e + symbol_id: 0x2012ba51 + symbol_id: 0xf5f318e9 symbol_id: 0xf18ac584 symbol_id: 0x7ffe50b7 symbol_id: 0x3b69b427 diff --git a/android/abi_gki_aarch64_pixel b/android/abi_gki_aarch64_pixel index 7153beba5ba6..b7d8f1baa215 100644 --- a/android/abi_gki_aarch64_pixel +++ b/android/abi_gki_aarch64_pixel @@ -544,12 +544,16 @@ dma_fence_array_create dma_fence_context_alloc dma_fence_default_wait + dma_fence_enable_sw_signaling dma_fence_get_status dma_fence_init dma_fence_release dma_fence_remove_callback dma_fence_signal dma_fence_signal_locked + dma_fence_unwrap_first + __dma_fence_unwrap_merge + dma_fence_unwrap_next dma_fence_wait_timeout dma_free_attrs dma_free_pages From f44d373b32b144822b89eb24e38f4e66f2663281 Mon Sep 17 00:00:00 2001 From: Rick Yiu Date: Fri, 5 May 2023 14:31:36 +0000 Subject: [PATCH 053/139] ANDROID: sched: Add trace_android_rvh_setscheduler Sync to android13-5.10. This vendor hook is declared already. 
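For illustration, a vendor module would attach to the restricted hook roughly as in this minimal sketch (the handler name is hypothetical; the registration helper is generated by the DECLARE_RESTRICTED_HOOK() declaration):

	static void oem_setscheduler_handler(void *data, struct task_struct *p)
	{
		/* vendor-specific bookkeeping for the task whose policy changed */
	}

	/* module init; restricted vendor hooks cannot be unregistered later */
	register_trace_android_rvh_setscheduler(oem_setscheduler_handler, NULL);
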
Bug: 245675204 Change-Id: Ib081b52542380d22317f225a50b553cda5f2634c Signed-off-by: Rick Yiu (cherry picked from commit f9688670ca92bf9f46f9a6134bc69623c30bbb23) --- kernel/sched/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8a1564c2268f..fc3f3dad20c3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7837,6 +7837,7 @@ change: if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) { __setscheduler_params(p, attr); __setscheduler_prio(p, newprio); + trace_android_rvh_setscheduler(p); } __setscheduler_uclamp(p, attr); From 07775f9683d486f456e051dafa5871af18609ada Mon Sep 17 00:00:00 2001 From: Kever Yang Date: Thu, 14 Dec 2023 18:42:50 +0800 Subject: [PATCH 054/139] ANDROID: GKI: Add symbols for rockchip sata INFO: 24 function symbol(s) added 'size_t __scsi_format_command(char*, size_t, const unsigned char*, size_t)' 'int attribute_container_register(struct attribute_container*)' 'int attribute_container_unregister(struct attribute_container*)' 'void pci_intx(struct pci_dev*, int)' 'int pcim_iomap_regions_request_all(struct pci_dev*, int, const char*)' 'void pcim_pin_device(struct pci_dev*)' 'int reset_control_rearm(struct reset_control*)' 'enum scsi_disposition scsi_check_sense(struct scsi_cmnd*)' 'int scsi_device_set_state(struct scsi_device*, enum scsi_device_state)' 'void scsi_eh_finish_cmd(struct scsi_cmnd*, struct list_head*)' 'void scsi_eh_flush_done_q(struct list_head*)' 'int scsi_rescan_device(struct scsi_device*)' 'void scsi_schedule_eh(struct Scsi_Host*)' 'const u8* scsi_sense_desc_find(const u8*, int, int)' 'int scsi_set_sense_field_pointer(u8*, int, u16, u8, bool)' 'void sdev_evt_send_simple(struct scsi_device*, enum scsi_device_event, gfp_t)' 'bool system_entering_hibernation()' 'int transport_add_device(struct device*)' 'int transport_class_register(struct transport_class*)' 'void transport_class_unregister(struct transport_class*)' 'void transport_configure_device(struct device*)' 'void transport_destroy_device(struct device*)' 'void transport_remove_device(struct device*)' 'void transport_setup_device(struct device*)' Bug: 300024866 Change-Id: I6a505d48d0d199a710b0d93b6a8df189735a7b89 Signed-off-by: Kever Yang --- android/abi_gki_aarch64.stg | 411 +++++++++++++++++++++++++++++++ android/abi_gki_aarch64_rockchip | 88 ++++++- 2 files changed, 497 insertions(+), 2 deletions(-) diff --git a/android/abi_gki_aarch64.stg b/android/abi_gki_aarch64.stg index 68cf985124c9..e570ab134873 100644 --- a/android/abi_gki_aarch64.stg +++ b/android/abi_gki_aarch64.stg @@ -8268,6 +8268,11 @@ pointer_reference { kind: POINTER pointee_type_id: 0x6255e5da } +pointer_reference { + id: 0x120b1632 + kind: POINTER + pointee_type_id: 0x626cbe56 +} pointer_reference { id: 0x12191e2a kind: POINTER @@ -10833,6 +10838,11 @@ pointer_reference { kind: POINTER pointee_type_id: 0x52606d54 } +pointer_reference { + id: 0x1e0dbd15 + kind: POINTER + pointee_type_id: 0x527612cb +} pointer_reference { id: 0x1e20e7eb kind: POINTER @@ -15948,6 +15958,11 @@ pointer_reference { kind: POINTER pointee_type_id: 0x990a74b4 } +pointer_reference { + id: 0x2cd2cd79 + kind: POINTER + pointee_type_id: 0x990bd378 +} pointer_reference { id: 0x2cd31328 kind: POINTER @@ -61382,6 +61397,11 @@ member { type_id: 0xe62ebf07 offset: 128 } +member { + id: 0x86c22df0 + name: "class" + type_id: 0x83714889 +} member { id: 0x86f2bb02 name: "class" @@ -64796,6 +64816,12 @@ member { type_id: 0x2d7be27a offset: 448 } +member { + id: 0x73c32817 + name: "configure" + type_id: 
0x2cd2cd79 + offset: 1280 +} member { id: 0x73c333e5 name: "configure" @@ -162517,6 +162543,12 @@ member { type_id: 0x2da2fbac offset: 704 } +member { + id: 0xb48f08f4 + name: "remove" + type_id: 0x2cd2cd79 + offset: 1344 +} member { id: 0xb48fbf27 name: "remove" @@ -175459,6 +175491,12 @@ member { type_id: 0x2cdc0ac8 offset: 9088 } +member { + id: 0x84e59dd8 + name: "setup" + type_id: 0x2cd2cd79 + offset: 1216 +} member { id: 0x84e68e26 name: "setup" @@ -260350,6 +260388,18 @@ struct_union { member_id: 0xba11b0ec } } +struct_union { + id: 0x527612cb + kind: STRUCT + name: "transport_class" + definition { + bytesize: 176 + member_id: 0x86c22df0 + member_id: 0x84e59dd8 + member_id: 0x73c32817 + member_id: 0xb48f08f4 + } +} struct_union { id: 0x626cbe56 kind: STRUCT @@ -282999,6 +283049,57 @@ enumeration { } } } +enumeration { + id: 0xd7ffc9ea + name: "scsi_device_event" + definition { + underlying_type_id: 0x4585663f + enumerator { + name: "SDEV_EVT_MEDIA_CHANGE" + value: 1 + } + enumerator { + name: "SDEV_EVT_INQUIRY_CHANGE_REPORTED" + value: 2 + } + enumerator { + name: "SDEV_EVT_CAPACITY_CHANGE_REPORTED" + value: 3 + } + enumerator { + name: "SDEV_EVT_SOFT_THRESHOLD_REACHED_REPORTED" + value: 4 + } + enumerator { + name: "SDEV_EVT_MODE_PARAMETER_CHANGE_REPORTED" + value: 5 + } + enumerator { + name: "SDEV_EVT_LUN_CHANGE_REPORTED" + value: 6 + } + enumerator { + name: "SDEV_EVT_ALUA_STATE_CHANGE_REPORTED" + value: 7 + } + enumerator { + name: "SDEV_EVT_POWER_ON_RESET_OCCURRED" + value: 8 + } + enumerator { + name: "SDEV_EVT_FIRST" + value: 1 + } + enumerator { + name: "SDEV_EVT_LAST" + value: 8 + } + enumerator { + name: "SDEV_EVT_MAXBITS" + value: 9 + } + } +} enumeration { id: 0xdf9e95f6 name: "scsi_device_state" @@ -292974,6 +293075,11 @@ function { parameter_id: 0x1d44326e parameter_id: 0x34d3469d } +function { + id: 0x171b3ed6 + return_type_id: 0x48b5725f + parameter_id: 0x1e0dbd15 +} function { id: 0x171c8621 return_type_id: 0xd5cc9c9a @@ -294775,6 +294881,13 @@ function { parameter_id: 0x1259e377 parameter_id: 0xe276adef } +function { + id: 0x1aa483a8 + return_type_id: 0x48b5725f + parameter_id: 0x257d12af + parameter_id: 0xd7ffc9ea + parameter_id: 0xf1a6dfed +} function { id: 0x1aa56a0d return_type_id: 0x48b5725f @@ -298446,6 +298559,12 @@ function { parameter_id: 0x3e10b518 parameter_id: 0xa52a0930 } +function { + id: 0x1f84fe6e + return_type_id: 0x48b5725f + parameter_id: 0x3f949c69 + parameter_id: 0x3e6239e1 +} function { id: 0x1f85d3ef return_type_id: 0x48b5725f @@ -302490,6 +302609,11 @@ function { parameter_id: 0xe276adef parameter_id: 0xc93e017b } +function { + id: 0x62985582 + return_type_id: 0x34cf6c51 + parameter_id: 0x3f949c69 +} function { id: 0x62b8d7ec return_type_id: 0x09427c40 @@ -311320,6 +311444,14 @@ function { return_type_id: 0x6720d32f parameter_id: 0x21069feb } +function { + id: 0x95c2268d + return_type_id: 0xf435685e + parameter_id: 0x0483e6f8 + parameter_id: 0xf435685e + parameter_id: 0x384c5795 + parameter_id: 0xf435685e +} function { id: 0x95c3652e return_type_id: 0x6720d32f @@ -312535,6 +312667,12 @@ function { parameter_id: 0xf1a6dfed parameter_id: 0x0292b875 } +function { + id: 0x97a1ddd3 + return_type_id: 0x6720d32f + parameter_id: 0x257d12af + parameter_id: 0xdf9e95f6 +} function { id: 0x97a3c07a return_type_id: 0x6720d32f @@ -314085,6 +314223,13 @@ function { parameter_id: 0x6720d32f parameter_id: 0x064d6086 } +function { + id: 0x990bd378 + return_type_id: 0x6720d32f + parameter_id: 0x120b1632 + parameter_id: 0x0258f96e + parameter_id: 0x0258f96e +} 
function { id: 0x99132caa return_type_id: 0x6720d32f @@ -315160,6 +315305,11 @@ function { parameter_id: 0x1d19a9d5 parameter_id: 0x310ec01d } +function { + id: 0x9a038c6a + return_type_id: 0x6720d32f + parameter_id: 0x1e0dbd15 +} function { id: 0x9a03c4d6 return_type_id: 0x6720d32f @@ -320536,6 +320686,15 @@ function { parameter_id: 0x6720d32f parameter_id: 0x92233392 } +function { + id: 0x9c09446b + return_type_id: 0x6720d32f + parameter_id: 0x00c72527 + parameter_id: 0x6720d32f + parameter_id: 0x914dbfdc + parameter_id: 0x295c7202 + parameter_id: 0x6d7f5ff6 +} function { id: 0x9c09d6aa return_type_id: 0x6720d32f @@ -325996,6 +326155,11 @@ function { parameter_id: 0x0cf3d8fe parameter_id: 0x4585663f } +function { + id: 0x9faad4c6 + return_type_id: 0x6720d32f + parameter_id: 0x08a8dfa4 +} function { id: 0x9fab680a return_type_id: 0x6720d32f @@ -329000,6 +329164,13 @@ function { return_type_id: 0x02eb105a parameter_id: 0x3e10b518 } +function { + id: 0xd981a35c + return_type_id: 0x3f0185ef + parameter_id: 0x3f0185ef + parameter_id: 0x6720d32f + parameter_id: 0x6720d32f +} function { id: 0xd9bb2b92 return_type_id: 0x4585663f @@ -334324,6 +334495,15 @@ elf_symbol { type_id: 0xa017504e full_name: "__scsi_device_lookup_by_target" } +elf_symbol { + id: 0xe18b6ee8 + name: "__scsi_format_command" + is_defined: true + symbol_type: FUNCTION + crc: 0x93022ba6 + type_id: 0x95c2268d + full_name: "__scsi_format_command" +} elf_symbol { id: 0x0166be18 name: "__scsi_iterate_devices" @@ -344204,6 +344384,24 @@ elf_symbol { type_id: 0x9048c0ea full_name: "atomic_notifier_chain_unregister" } +elf_symbol { + id: 0x41765c03 + name: "attribute_container_register" + is_defined: true + symbol_type: FUNCTION + crc: 0x167c84c3 + type_id: 0x9faad4c6 + full_name: "attribute_container_register" +} +elf_symbol { + id: 0xcd05507b + name: "attribute_container_unregister" + is_defined: true + symbol_type: FUNCTION + crc: 0x15baabca + type_id: 0x9faad4c6 + full_name: "attribute_container_unregister" +} elf_symbol { id: 0x82786c66 name: "autoremove_wake_function" @@ -375088,6 +375286,15 @@ elf_symbol { type_id: 0x93acae9b full_name: "pci_host_probe" } +elf_symbol { + id: 0xec0d5441 + name: "pci_intx" + is_defined: true + symbol_type: FUNCTION + crc: 0x4e899f5e + type_id: 0x157d734c + full_name: "pci_intx" +} elf_symbol { id: 0x9c6c58ea name: "pci_iomap" @@ -375628,6 +375835,15 @@ elf_symbol { type_id: 0x986a45dd full_name: "pcim_iomap_regions" } +elf_symbol { + id: 0xae61b91f + name: "pcim_iomap_regions_request_all" + is_defined: true + symbol_type: FUNCTION + crc: 0xd111489f + type_id: 0x986a45dd + full_name: "pcim_iomap_regions_request_all" +} elf_symbol { id: 0xc37c9a74 name: "pcim_iomap_table" @@ -375646,6 +375862,15 @@ elf_symbol { type_id: 0x157d734c full_name: "pcim_iounmap_regions" } +elf_symbol { + id: 0xfa9dbeca + name: "pcim_pin_device" + is_defined: true + symbol_type: FUNCTION + crc: 0xfdffeca6 + type_id: 0x14e1f000 + full_name: "pcim_pin_device" +} elf_symbol { id: 0x123cd197 name: "pcpu_nr_pages" @@ -380920,6 +381145,15 @@ elf_symbol { type_id: 0x1a9c8a01 full_name: "reset_control_put" } +elf_symbol { + id: 0x642147cd + name: "reset_control_rearm" + is_defined: true + symbol_type: FUNCTION + crc: 0x5d2bc42a + type_id: 0x978438bd + full_name: "reset_control_rearm" +} elf_symbol { id: 0x9c7a2d6c name: "reset_control_release" @@ -382369,6 +382603,15 @@ elf_symbol { type_id: 0x954324c8 full_name: "scsi_change_queue_depth" } +elf_symbol { + id: 0xebec291e + name: "scsi_check_sense" + is_defined: true + 
symbol_type: FUNCTION + crc: 0x05404117 + type_id: 0x62985582 + full_name: "scsi_check_sense" +} elf_symbol { id: 0xe4036f2e name: "scsi_cmd_allowed" @@ -382432,6 +382675,15 @@ elf_symbol { type_id: 0x19c71538 full_name: "scsi_device_resume" } +elf_symbol { + id: 0x55968d64 + name: "scsi_device_set_state" + is_defined: true + symbol_type: FUNCTION + crc: 0x3e16b971 + type_id: 0x97a1ddd3 + full_name: "scsi_device_set_state" +} elf_symbol { id: 0xf10245da name: "scsi_dma_map" @@ -382459,6 +382711,24 @@ elf_symbol { type_id: 0x1f7d7689 full_name: "scsi_done" } +elf_symbol { + id: 0xb77321e1 + name: "scsi_eh_finish_cmd" + is_defined: true + symbol_type: FUNCTION + crc: 0x8e5f7b03 + type_id: 0x1f84fe6e + full_name: "scsi_eh_finish_cmd" +} +elf_symbol { + id: 0xe584e576 + name: "scsi_eh_flush_done_q" + is_defined: true + symbol_type: FUNCTION + crc: 0xf811e69d + type_id: 0x1f00dfeb + full_name: "scsi_eh_flush_done_q" +} elf_symbol { id: 0x8ef5c221 name: "scsi_execute_cmd" @@ -382585,6 +382855,15 @@ elf_symbol { type_id: 0x14f27dac full_name: "scsi_report_bus_reset" } +elf_symbol { + id: 0x24093af7 + name: "scsi_rescan_device" + is_defined: true + symbol_type: FUNCTION + crc: 0x83fa9f1b + type_id: 0x94dfa784 + full_name: "scsi_rescan_device" +} elf_symbol { id: 0xc9021692 name: "scsi_scan_host" @@ -382594,6 +382873,33 @@ elf_symbol { type_id: 0x156efee0 full_name: "scsi_scan_host" } +elf_symbol { + id: 0x51e78cea + name: "scsi_schedule_eh" + is_defined: true + symbol_type: FUNCTION + crc: 0xd78a6752 + type_id: 0x156efee0 + full_name: "scsi_schedule_eh" +} +elf_symbol { + id: 0x9489f8a9 + name: "scsi_sense_desc_find" + is_defined: true + symbol_type: FUNCTION + crc: 0x10d9f885 + type_id: 0xd981a35c + full_name: "scsi_sense_desc_find" +} +elf_symbol { + id: 0x494ae459 + name: "scsi_set_sense_field_pointer" + is_defined: true + symbol_type: FUNCTION + crc: 0x3ab7b1cc + type_id: 0x9c09446b + full_name: "scsi_set_sense_field_pointer" +} elf_symbol { id: 0xcf17c9a6 name: "scsi_set_sense_information" @@ -382612,6 +382918,15 @@ elf_symbol { type_id: 0x156efee0 full_name: "scsi_unblock_requests" } +elf_symbol { + id: 0xe6808261 + name: "sdev_evt_send_simple" + is_defined: true + symbol_type: FUNCTION + crc: 0x1727f774 + type_id: 0x1aa483a8 + full_name: "sdev_evt_send_simple" +} elf_symbol { id: 0x771aea1d name: "sdev_prefix_printk" @@ -388511,6 +388826,15 @@ elf_symbol { type_id: 0x599826a1 full_name: "system_32bit_el0_cpumask" } +elf_symbol { + id: 0x991b4bfd + name: "system_entering_hibernation" + is_defined: true + symbol_type: FUNCTION + crc: 0x13f42152 + type_id: 0xfea45b04 + full_name: "system_entering_hibernation" +} elf_symbol { id: 0xb5701f35 name: "system_freezable_power_efficient_wq" @@ -389546,6 +389870,69 @@ elf_symbol { type_id: 0x10985193 full_name: "tracing_off" } +elf_symbol { + id: 0x8f8403dc + name: "transport_add_device" + is_defined: true + symbol_type: FUNCTION + crc: 0x9d7e8343 + type_id: 0x9d16dd74 + full_name: "transport_add_device" +} +elf_symbol { + id: 0x5911125b + name: "transport_class_register" + is_defined: true + symbol_type: FUNCTION + crc: 0x071cb3f2 + type_id: 0x9a038c6a + full_name: "transport_class_register" +} +elf_symbol { + id: 0x113cbc59 + name: "transport_class_unregister" + is_defined: true + symbol_type: FUNCTION + crc: 0xce941924 + type_id: 0x171b3ed6 + full_name: "transport_class_unregister" +} +elf_symbol { + id: 0x7640c32b + name: "transport_configure_device" + is_defined: true + symbol_type: FUNCTION + crc: 0x106dd54f + type_id: 0x100e6fc8 + full_name: 
"transport_configure_device" +} +elf_symbol { + id: 0xc0be90d8 + name: "transport_destroy_device" + is_defined: true + symbol_type: FUNCTION + crc: 0x1870a351 + type_id: 0x100e6fc8 + full_name: "transport_destroy_device" +} +elf_symbol { + id: 0x09f20ac9 + name: "transport_remove_device" + is_defined: true + symbol_type: FUNCTION + crc: 0xcd97ee1a + type_id: 0x100e6fc8 + full_name: "transport_remove_device" +} +elf_symbol { + id: 0xd75a472d + name: "transport_setup_device" + is_defined: true + symbol_type: FUNCTION + crc: 0x66ba89d2 + type_id: 0x100e6fc8 + full_name: "transport_setup_device" +} elf_symbol { id: 0x3f07269b name: "truncate_inode_pages" @@ -398397,6 +398784,7 @@ interface { symbol_id: 0x99aa632e symbol_id: 0xe68925b8 symbol_id: 0x6e3bb1cf + symbol_id: 0xe18b6ee8 symbol_id: 0x0166be18 symbol_id: 0xc5953732 symbol_id: 0x1d4d84d0 @@ -399495,6 +399883,8 @@ interface { symbol_id: 0x5f6a1554 symbol_id: 0x3beebbde symbol_id: 0x24064426 + symbol_id: 0x41765c03 + symbol_id: 0xcd05507b symbol_id: 0x82786c66 symbol_id: 0xd772fde3 symbol_id: 0x1abdc14f @@ -402925,6 +403315,7 @@ interface { symbol_id: 0x9ac8ef20 symbol_id: 0x35c96922 symbol_id: 0xbe6406c3 + symbol_id: 0xec0d5441 symbol_id: 0x9c6c58ea symbol_id: 0x2fefe933 symbol_id: 0x1c994923 @@ -402985,8 +403376,10 @@ interface { symbol_id: 0xffa3ecd1 symbol_id: 0x42595f98 symbol_id: 0xd085753f + symbol_id: 0xae61b91f symbol_id: 0xc37c9a74 symbol_id: 0xd03f3f09 + symbol_id: 0xfa9dbeca symbol_id: 0x123cd197 symbol_id: 0xe57e5e73 symbol_id: 0x8ba9d028 @@ -403573,6 +403966,7 @@ interface { symbol_id: 0x57ee69c1 symbol_id: 0xd76b82b2 symbol_id: 0x30c7b7f4 + symbol_id: 0x642147cd symbol_id: 0x9c7a2d6c symbol_id: 0x48fc2cb6 symbol_id: 0xd41c441b @@ -403734,6 +404128,7 @@ interface { symbol_id: 0xd3148537 symbol_id: 0xb5b25b58 symbol_id: 0x278a6b59 + symbol_id: 0xebec291e symbol_id: 0xe4036f2e symbol_id: 0xd49d7abc symbol_id: 0x76dea2aa @@ -403741,9 +404136,12 @@ interface { symbol_id: 0x474e9bcc symbol_id: 0x61df84bc symbol_id: 0x054c0bba + symbol_id: 0x55968d64 symbol_id: 0xf10245da symbol_id: 0x18cbd7f9 symbol_id: 0x30f6b9b1 + symbol_id: 0xb77321e1 + symbol_id: 0xe584e576 symbol_id: 0x8ef5c221 symbol_id: 0x32b196e0 symbol_id: 0x022517f0 @@ -403758,9 +404156,14 @@ interface { symbol_id: 0x42390c70 symbol_id: 0x8deacb1d symbol_id: 0x2e407415 + symbol_id: 0x24093af7 symbol_id: 0xc9021692 + symbol_id: 0x51e78cea + symbol_id: 0x9489f8a9 + symbol_id: 0x494ae459 symbol_id: 0xcf17c9a6 symbol_id: 0x9c54c873 + symbol_id: 0xe6808261 symbol_id: 0x771aea1d symbol_id: 0x8d3c4841 symbol_id: 0xf399cd48 @@ -404417,6 +404820,7 @@ interface { symbol_id: 0xda44819e symbol_id: 0x46cd3193 symbol_id: 0xb6c44fb1 + symbol_id: 0x991b4bfd symbol_id: 0xb5701f35 symbol_id: 0xeeb4dc4c symbol_id: 0x314b4b2e @@ -404532,6 +404936,13 @@ interface { symbol_id: 0x3df2f359 symbol_id: 0x33172d21 symbol_id: 0x54bbaa46 + symbol_id: 0x8f8403dc + symbol_id: 0x5911125b + symbol_id: 0x113cbc59 + symbol_id: 0x7640c32b + symbol_id: 0xc0be90d8 + symbol_id: 0x09f20ac9 + symbol_id: 0xd75a472d symbol_id: 0x3f07269b symbol_id: 0x3c7c6ce9 symbol_id: 0x7a43283c diff --git a/android/abi_gki_aarch64_rockchip b/android/abi_gki_aarch64_rockchip index 8fdda5ad35fb..0010cf2300b6 100644 --- a/android/abi_gki_aarch64_rockchip +++ b/android/abi_gki_aarch64_rockchip @@ -2,6 +2,7 @@ # commonly used symbols add_timer alloc_chrdev_region + alloc_etherdev_mqs alloc_iova_fast __alloc_pages __alloc_skb @@ -827,9 +828,25 @@ param_ops_int param_ops_string param_ops_uint + param_ops_ulong + pci_disable_device + 
pci_disable_link_state pcie_capability_clear_and_set_word + pci_find_capability + pcim_enable_device + pcim_iomap_table + pcim_pin_device + pci_read_config_byte pci_read_config_dword + pci_read_config_word + __pci_register_driver + pci_restore_state + pci_save_state + pci_set_master + pci_set_power_state + pci_unregister_driver pci_write_config_dword + pci_write_config_word __per_cpu_offset perf_trace_buf_alloc perf_trace_run_bpf_submit @@ -1023,7 +1040,11 @@ sched_set_fifo schedule schedule_timeout + schedule_timeout_uninterruptible scnprintf + scsi_command_size_tbl + scsi_device_get + scsi_device_put __sdhci_add_host sdhci_cleanup_host sdhci_enable_clk @@ -1325,6 +1346,7 @@ vunmap vzalloc wait_for_completion + wait_for_completion_interruptible wait_for_completion_timeout __wake_up wake_up_process @@ -1346,15 +1368,23 @@ skcipher_walk_aead_decrypt skcipher_walk_aead_encrypt +# required by ahci.ko + pci_alloc_irq_vectors_affinity + pci_free_irq_vectors + pci_intx + pci_irq_vector + pci_match_id + pcim_iomap_regions_request_all + sysfs_add_file_to_group + sysfs_remove_file_from_group + # required by analogix_dp.ko drm_atomic_get_old_connector_for_encoder # required by aspm_ext.ko - pci_find_capability pci_find_ext_capability # required by bcmdhd.ko - alloc_etherdev_mqs cpu_bit_bitmap down_interruptible down_timeout @@ -1873,6 +1903,60 @@ # required by ledtrig-heartbeat.ko avenrun +# required by libahci.ko + __printk_ratelimit + +# required by libahci_platform.ko + reset_control_rearm + +# required by libata.ko + async_schedule_node + async_synchronize_cookie + attribute_container_register + attribute_container_unregister + autoremove_wake_function + blk_abort_request + blk_queue_max_hw_sectors + blk_queue_max_segments + blk_queue_update_dma_alignment + blk_queue_update_dma_pad + glob_match + pci_bus_type + pcim_iomap_regions + prepare_to_wait + __scsi_add_device + scsi_add_host_with_dma + scsi_build_sense + scsi_change_queue_depth + scsi_check_sense + scsi_device_set_state + scsi_done + scsi_eh_finish_cmd + scsi_eh_flush_done_q + scsi_execute_cmd + __scsi_format_command + scsi_host_alloc + scsi_host_put + scsi_remove_device + scsi_remove_host + scsi_rescan_device + scsi_schedule_eh + scsi_sense_desc_find + scsi_set_sense_field_pointer + scsi_set_sense_information + sdev_evt_send_simple + system_entering_hibernation + trace_seq_printf + trace_seq_putc + transport_add_device + transport_class_register + transport_class_unregister + transport_configure_device + transport_destroy_device + transport_remove_device + transport_setup_device + vscnprintf + # required by mac80211.ko alloc_netdev_mqs __alloc_percpu_gfp From 613d8368e39f4cd67f2ad629ebb870d3e3b4bc6e Mon Sep 17 00:00:00 2001 From: Paul Lawrence Date: Wed, 2 Aug 2023 12:23:44 -0700 Subject: [PATCH 055/139] ANDROID: fuse-bpf: Follow mounts in lookups Bug: 292925770 Test: fuse_test run. 
The following steps on Android also now pass: Create /data/123 and /data/media/0/Android/data/45 directories Mount /data/123 directory to /data/media/0/Android/data/45 directory Create 1.txt under the /data/123 directory File 1.txt should appear in /storage/emulated/0/Android/data/45 Change-Id: I1fe27d743ca2981e624a9aa87d9ab6deb313aadc Signed-off-by: Paul Lawrence --- fs/fuse/backing.c | 15 ++++--- .../selftests/filesystems/fuse/bpf_loader.c | 28 +++++++++++- .../selftests/filesystems/fuse/fuse_test.c | 45 +++++++++++++++++++ .../selftests/filesystems/fuse/test_fuse.h | 3 ++ 4 files changed, 85 insertions(+), 6 deletions(-) diff --git a/fs/fuse/backing.c b/fs/fuse/backing.c index e16457c75944..6ca74987f7da 100644 --- a/fs/fuse/backing.c +++ b/fs/fuse/backing.c @@ -1117,7 +1117,6 @@ int fuse_lookup_backing(struct fuse_bpf_args *fa, struct inode *dir, struct kstat stat; int err; - /* TODO this will not handle lookups over mount points */ inode_lock_nested(dir_backing_inode, I_MUTEX_PARENT); backing_entry = lookup_one_len(entry->d_name.name, dir_backing_entry, strlen(entry->d_name.name)); @@ -1136,16 +1135,22 @@ int fuse_lookup_backing(struct fuse_bpf_args *fa, struct inode *dir, return 0; } + err = follow_down(&fuse_entry->backing_path); + if (err) + goto err_out; + err = vfs_getattr(&fuse_entry->backing_path, &stat, STATX_BASIC_STATS, 0); - if (err) { - path_put_init(&fuse_entry->backing_path); - return err; - } + if (err) + goto err_out; fuse_stat_to_attr(get_fuse_conn(dir), backing_entry->d_inode, &stat, &feo->attr); return 0; + +err_out: + path_put_init(&fuse_entry->backing_path); + return err; } int fuse_handle_backing(struct fuse_entry_bpf *feb, struct inode **backing_inode, diff --git a/tools/testing/selftests/filesystems/fuse/bpf_loader.c b/tools/testing/selftests/filesystems/fuse/bpf_loader.c index 5bf26eadd421..94f884c64d29 100644 --- a/tools/testing/selftests/filesystems/fuse/bpf_loader.c +++ b/tools/testing/selftests/filesystems/fuse/bpf_loader.c @@ -394,6 +394,29 @@ int s_rename(struct s oldpathname, struct s newpathname) return res; } +int s_mount(struct s source, struct s target, struct s filesystem, + unsigned long mountflags, struct s data) +{ + int res; + + res = mount(source.s, target.s, filesystem.s, mountflags, data.s); + free(source.s); + free(target.s); + free(filesystem.s); + free(data.s); + + return res; +} + +int s_umount(struct s target) +{ + int res; + + res = umount(target.s); + free(target.s); + return res; +} + int s_fuse_attr(struct s pathname, struct fuse_attr *fuse_attr_out) { @@ -574,7 +597,10 @@ static int mount_fuse_maybe_init(const char *mount_dir, int bpf_fd, int dir_fd, })); } - *fuse_dev_ptr = fuse_dev; + if (fuse_dev_ptr) + *fuse_dev_ptr = fuse_dev; + else + TESTSYSCALL(close(fuse_dev)); fuse_dev = -1; result = TEST_SUCCESS; out: diff --git a/tools/testing/selftests/filesystems/fuse/fuse_test.c b/tools/testing/selftests/filesystems/fuse/fuse_test.c index 528595a8e82f..ad24ed48853e 100644 --- a/tools/testing/selftests/filesystems/fuse/fuse_test.c +++ b/tools/testing/selftests/filesystems/fuse/fuse_test.c @@ -2114,6 +2114,50 @@ out: return result; } +/** + * Test that fuse passthrough correctly traverses a mount point on the lower fs + */ +static int bpf_test_follow_mounts(const char *mount_dir) +{ + const char *bind_src = "bind_src"; + const char *bind_dst = "bind_dst"; + const char *file = "file"; + int fd = -1; + int src_fd = -1; + int result = TEST_FAILURE; + + TESTSYSCALL(s_mkdir(s_path(s(ft_src), s(bind_src)), 0777)); + 
TESTSYSCALL(s_mkdir(s_path(s(ft_src), s(bind_dst)), 0777)); + TEST(fd = s_creat(s_pathn(3, s(ft_src), s(bind_src), s(file)), 0777), + fd != -1); + TESTSYSCALL(close(fd)); + fd = -1; + TESTSYSCALL(s_mount(s_path(s(ft_src), s(bind_src)), + s_path(s(ft_src), s(bind_dst)), + s(NULL), MS_BIND, s(NULL))); + TEST(src_fd = open(ft_src, O_DIRECTORY | O_RDONLY | O_CLOEXEC), + src_fd != -1); + TESTEQUAL(mount_fuse_no_init(mount_dir, -1, src_fd, NULL), 0); + TEST(fd = s_open(s_pathn(3, s(mount_dir), s(bind_src), s(file)), + O_RDONLY), + fd != -1); + TESTSYSCALL(close(fd)); + fd = -1; + TEST(fd = s_open(s_pathn(3, s(mount_dir), s(bind_dst), s(file)), + O_RDONLY), + fd != -1); + TESTSYSCALL(close(fd)); + fd = -1; + + result = TEST_SUCCESS; +out: + umount(mount_dir); + close(src_fd); + s_umount(s_path(s(ft_src), s(bind_dst))); + close(fd); + return result; +} + static void parse_range(const char *ranges, bool *run_test, size_t tests) { size_t i; @@ -2244,6 +2288,7 @@ int main(int argc, char *argv[]) MAKE_TEST(bpf_test_create_and_remove_bpf), MAKE_TEST(bpf_test_mkdir_and_remove_bpf), MAKE_TEST(bpf_test_readahead), + MAKE_TEST(bpf_test_follow_mounts), }; #undef MAKE_TEST diff --git a/tools/testing/selftests/filesystems/fuse/test_fuse.h b/tools/testing/selftests/filesystems/fuse/test_fuse.h index 69dadc9c7e45..e62e2ee07713 100644 --- a/tools/testing/selftests/filesystems/fuse/test_fuse.h +++ b/tools/testing/selftests/filesystems/fuse/test_fuse.h @@ -64,6 +64,9 @@ int s_setxattr(struct s pathname, const char name[], const void *value, size_t size, int flags); int s_removexattr(struct s pathname, const char name[]); int s_rename(struct s oldpathname, struct s newpathname); +int s_mount(struct s source, struct s target, struct s filesystem, + unsigned long mountflags, struct s data); +int s_umount(struct s target); struct s tracing_folder(void); int tracing_on(void); From 44702d8fa1b046b52595a03ac7fedc7abc414c42 Mon Sep 17 00:00:00 2001 From: Charan Teja Kalla Date: Thu, 14 Dec 2023 04:58:41 +0000 Subject: [PATCH 056/139] FROMLIST: mm: migrate high-order folios in swap cache correctly Large folios occupy N consecutive entries in the swap cache instead of using multi-index entries like the page cache. However, if a large folio is re-added to the LRU list, it can be migrated. The migration code was not aware of the difference between the swap cache and the page cache and assumed that a single xas_store() would be sufficient. This leaves potentially many stale pointers to the now-migrated folio in the swap cache, which can lead to almost arbitrary data corruption in the future. This can also manifest as infinite loops with the RCU read lock held. Bug: 315281107 Change-Id: I455f964a9f21c13089890073777388236b6669d7 [willy@infradead.org: modifications to the changelog & tweaked the fix] Fixes: 3417013e0d18 ("mm/migrate: Add folio_migrate_mapping()") Link: https://lkml.kernel.org/r/20231214045841.961776-1-willy@infradead.org Link: https://lore.kernel.org/linux-mm/20231214045841.961776-1-willy@infradead.org/ Signed-off-by: Charan Teja Kalla Signed-off-by: Matthew Wilcox (Oracle) Reported-by: Charan Teja Kalla Closes: https://lkml.kernel.org/r/1700569840-17327-1-git-send-email-quic_charante@quicinc.com Cc: David Hildenbrand Cc: Johannes Weiner Cc: Kirill A. 
Shutemov Cc: Naoya Horiguchi Cc: Shakeel Butt Cc: Signed-off-by: Andrew Morton Signed-off-by: Charan Teja Kalla --- mm/migrate.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/mm/migrate.c b/mm/migrate.c index ef490976c98e..9f5f52deed0c 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -393,6 +393,7 @@ int folio_migrate_mapping(struct address_space *mapping, int dirty; int expected_count = folio_expected_refs(mapping, folio) + extra_count; long nr = folio_nr_pages(folio); + long entries, i; if (!mapping) { /* Anonymous page without mapping */ @@ -430,8 +431,10 @@ int folio_migrate_mapping(struct address_space *mapping, folio_set_swapcache(newfolio); newfolio->private = folio_get_private(folio); } + entries = nr; } else { VM_BUG_ON_FOLIO(folio_test_swapcache(folio), folio); + entries = 1; } /* Move dirty while page refs frozen and newpage not yet exposed */ @@ -441,7 +444,11 @@ int folio_migrate_mapping(struct address_space *mapping, folio_set_dirty(newfolio); } - xas_store(&xas, newfolio); + /* Swap cache still stores N entries instead of a high-order entry */ + for (i = 0; i < entries; i++) { + xas_store(&xas, newfolio); + xas_next(&xas); + } /* * Drop cache reference from old page by unfreezing From 30bca9e2785b3c7cce113308b16b40132293ca34 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 1 Dec 2023 15:47:13 +0100 Subject: [PATCH 057/139] UPSTREAM: netfilter: nft_set_pipapo: skip inactive elements during set walk commit 317eb9685095678f2c9f5a8189de698c5354316a upstream. Otherwise set elements can be deactivated twice which will cause a crash. Bug: 316310313 Reported-by: Xingyuan Mo Fixes: 3c4287f62044 ("nf_tables: Add set type for arbitrary concatenation of ranges") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso Signed-off-by: Greg Kroah-Hartman (cherry picked from commit 189c2a82933c67ad360c421258d5449f6647544a) Signed-off-by: Lee Jones Change-Id: I27fb6ee806642e23ca02700763a387341dd463e6 --- net/netfilter/nft_set_pipapo.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index deea6196d992..4e1cc31729b8 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -2042,6 +2042,9 @@ static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set, e = f->mt[r].e; + if (!nft_set_elem_active(&e->ext, iter->genmask)) + goto cont; + elem.priv = e; iter->err = iter->fn(ctx, set, iter, &elem); From 401a2769d99066952f3bfb73a8ce6b0269bc04d7 Mon Sep 17 00:00:00 2001 From: Wu Bo Date: Tue, 21 Nov 2023 20:51:50 -0700 Subject: [PATCH 058/139] UPSTREAM: dm verity: don't perform FEC for failed readahead IO We found an issue under Android OTA scenario that many BIOs have to do FEC where the data under dm-verity is 100% complete and no corruption. Android OTA has many dm-block layers, from upper to lower: dm-verity dm-snapshot dm-origin & dm-cow dm-linear ufs DM tables have to change 2 times during Android OTA merging process. When doing table change, the dm-snapshot will be suspended for a while. During this interval, many readahead IOs are submitted to dm_verity from filesystem. Then the kverity works are busy doing FEC process which cost too much time to finish dm-verity IO. This causes needless delay which feels like system is hung. After adding debugging it was found that each readahead IO needed around 10s to finish when this situation occurred. 
This is due to IO amplification:

	dm-snapshot suspend
	erofs_readahead			// 300+ io is submitted
	  dm_submit_bio (dm_verity)
	    dm_submit_bio (dm_snapshot)
	      bio return EIO
	      bio got nothing, it's empty
	    verity_end_io
	    verity_verify_io
	    forloop range(0, io->n_blocks)	// each io->nblocks ~= 20
	      verity_fec_decode
	      fec_decode_rsb
	      fec_read_bufs
	      forloop range(0, v->fec->rsn)	// v->fec->rsn = 253
	        new_read
	        submit_bio (dm_snapshot)
	      end loop
	    end loop
	dm-snapshot resume

Readahead BIOs get nothing while dm-snapshot is suspended, so all of them will cause verity's FEC. Each readahead BIO needs to verify ~20 (io->nblocks) blocks. Each block needs to do FEC, and every block needs to do 253 (v->fec->rsn) reads. So during the suspend interval (~200ms), 300 readahead BIOs trigger ~1518000 (300*20*253) IOs to dm-snapshot. As readahead IO is not required by userspace, the best fix is to pass readahead errors to the upper layer to handle. Cc: stable@vger.kernel.org Fixes: a739ff3f543a ("dm verity: add support for forward error correction") Bug: 316972624 Link: https://lore.kernel.org/dm-devel/b84fb49-bf63-3442-8c99-d565e134f2@redhat.com Signed-off-by: Wu Bo Reviewed-by: Mikulas Patocka Signed-off-by: Mike Snitzer Signed-off-by: Akilesh Kailash (cherry picked from commit 0193e3966ceeeef69e235975918b287ab093082b) Change-Id: I73560e5660cebdc1997e1f9926cbb8888789eb46 --- drivers/md/dm-verity-target.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index fd68500ae2ad..1112a0a694e0 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -656,7 +656,9 @@ static void verity_end_io(struct bio *bio) struct dm_verity_io *io = bio->bi_private; if (bio->bi_status && - (!verity_fec_is_enabled(io->v) || verity_is_system_shutting_down())) { + (!verity_fec_is_enabled(io->v) || + verity_is_system_shutting_down() || + (bio->bi_opf & REQ_RAHEAD))) { verity_finish_io(io, bio->bi_status); return; } From bc97d5019afdfa2a26cd39d6556ace0e55269919 Mon Sep 17 00:00:00 2001 From: xieliujie Date: Mon, 25 Dec 2023 14:28:51 +0800 Subject: [PATCH 059/139] ANDROID: vendor_hooks: Add hooks for rt_mutex steal Add hooks at the rt_mutex_steal() function so that OEMs can decide whether tasks with the same priority may steal the rt_mutex or not. We did experiments and found that rt_mutex throughput can benefit a lot when threads with the same priority can steal the rt_mutex lock.
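Given the hook prototype added in the diff below, an OEM handler opting in to lateral stealing could look like this minimal sketch (the handler name is hypothetical):

	static void oem_rt_mutex_steal(void *data, int waiter_prio,
				       int top_waiter_prio, bool *ret)
	{
		/* allow tasks of equal priority to steal the rt_mutex */
		if (waiter_prio == top_waiter_prio)
			*ret = true;
	}

	/* module init */
	register_trace_android_vh_rt_mutex_steal(oem_rt_mutex_steal, NULL);
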
Bug: 317670024 Change-Id: Id60a7a41c6c77a67808982d3667946cabe4acc8f Signed-off-by: xeiliujie --- drivers/android/vendor_hooks.c | 1 + include/trace/hooks/dtask.h | 3 +++ kernel/locking/rtmutex.c | 6 ++++++ 3 files changed, 10 insertions(+) diff --git a/drivers/android/vendor_hooks.c b/drivers/android/vendor_hooks.c index 43eb748c1103..ee68032b1918 100644 --- a/drivers/android/vendor_hooks.c +++ b/drivers/android/vendor_hooks.c @@ -95,6 +95,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_task_blocks_on_rtmutex); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rtmutex_waiter_prio); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rtmutex_wait_start); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rtmutex_wait_finish); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rt_mutex_steal); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mutex_opt_spin_start); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mutex_opt_spin_finish); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mutex_can_spin_on_owner); diff --git a/include/trace/hooks/dtask.h b/include/trace/hooks/dtask.h index a63a2868e626..b51147089b2d 100644 --- a/include/trace/hooks/dtask.h +++ b/include/trace/hooks/dtask.h @@ -42,6 +42,9 @@ DECLARE_HOOK(android_vh_rtmutex_wait_start, DECLARE_HOOK(android_vh_rtmutex_wait_finish, TP_PROTO(struct rt_mutex_base *lock), TP_ARGS(lock)); +DECLARE_HOOK(android_vh_rt_mutex_steal, + TP_PROTO(int waiter_prio, int top_waiter_prio, bool *ret), + TP_ARGS(waiter_prio, top_waiter_prio, ret)); DECLARE_HOOK(android_vh_rwsem_read_wait_start, TP_PROTO(struct rw_semaphore *sem), diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 351716fe9138..8207fdade4a8 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -391,9 +391,15 @@ static __always_inline int rt_mutex_waiter_equal(struct rt_mutex_waiter *left, static inline bool rt_mutex_steal(struct rt_mutex_waiter *waiter, struct rt_mutex_waiter *top_waiter) { + bool ret = false; + if (rt_mutex_waiter_less(waiter, top_waiter)) return true; + trace_android_vh_rt_mutex_steal(waiter->prio, top_waiter->prio, &ret); + if (ret) + return true; + #ifdef RT_MUTEX_BUILD_SPINLOCKS /* * Note that RT tasks are excluded from same priority (lateral) From d3006fb9449d46b3c7733d5baa30c7a0e944cc5e Mon Sep 17 00:00:00 2001 From: xieliujie Date: Mon, 25 Dec 2023 15:06:38 +0800 Subject: [PATCH 060/139] ANDROID: ABI: Update oplus symbol list 1 function symbol(s) added 'int __traceiter_android_vh_rt_mutex_steal(void*, int, int, bool*)' 1 variable symbol(s) added 'struct tracepoint __tracepoint_android_vh_rt_mutex_steal' Bug: 317670024 Change-Id: I28f0379adaec041400e49cbd1e497b2f8c5c893d Signed-off-by: xeiliujie --- android/abi_gki_aarch64.stg | 28 ++++++++++++++++++++++++++++ android/abi_gki_aarch64_oplus | 2 ++ 2 files changed, 30 insertions(+) diff --git a/android/abi_gki_aarch64.stg b/android/abi_gki_aarch64.stg index e570ab134873..f2b7c07a7716 100644 --- a/android/abi_gki_aarch64.stg +++ b/android/abi_gki_aarch64.stg @@ -315510,6 +315510,14 @@ function { parameter_id: 0x6720d32f parameter_id: 0x3c2755a3 } +function { + id: 0x9a2ab624 + return_type_id: 0x6720d32f + parameter_id: 0x18bd6530 + parameter_id: 0x6720d32f + parameter_id: 0x6720d32f + parameter_id: 0x11cfee5a +} function { id: 0x9a2abc7b return_type_id: 0x6720d32f @@ -337636,6 +337644,15 @@ elf_symbol { type_id: 0x9b08a261 full_name: "__traceiter_android_vh_rproc_recovery_set" } +elf_symbol { + id: 0xd56fbf76 + name: "__traceiter_android_vh_rt_mutex_steal" + is_defined: true + symbol_type: FUNCTION + crc: 0xf0a6d2df + type_id: 0x9a2ab624 + 
full_name: "__traceiter_android_vh_rt_mutex_steal" +} elf_symbol { id: 0x3ef508a2 name: "__traceiter_android_vh_rtmutex_wait_finish" @@ -341632,6 +341649,15 @@ elf_symbol { type_id: 0x18ccbd2c full_name: "__tracepoint_android_vh_rproc_recovery_set" } +elf_symbol { + id: 0xed43b088 + name: "__tracepoint_android_vh_rt_mutex_steal" + is_defined: true + symbol_type: OBJECT + crc: 0xdc6b8d43 + type_id: 0x18ccbd2c + full_name: "__tracepoint_android_vh_rt_mutex_steal" +} elf_symbol { id: 0xa3915d70 name: "__tracepoint_android_vh_rtmutex_wait_finish" @@ -399133,6 +399159,7 @@ interface { symbol_id: 0x8d62858f symbol_id: 0xcef5d79f symbol_id: 0x91384eff + symbol_id: 0xd56fbf76 symbol_id: 0x3ef508a2 symbol_id: 0xfb1b8d64 symbol_id: 0xc56d7179 @@ -399577,6 +399604,7 @@ interface { symbol_id: 0x04365139 symbol_id: 0xd94bc301 symbol_id: 0x3fc5ffc9 + symbol_id: 0xed43b088 symbol_id: 0xa3915d70 symbol_id: 0xf01f02ea symbol_id: 0xeaebbadf diff --git a/android/abi_gki_aarch64_oplus b/android/abi_gki_aarch64_oplus index a374dcf65636..8c9846b3a27d 100644 --- a/android/abi_gki_aarch64_oplus +++ b/android/abi_gki_aarch64_oplus @@ -158,6 +158,7 @@ __traceiter_android_vh_dm_bufio_shrink_scan_bypass __traceiter_android_vh_mutex_unlock_slowpath __traceiter_android_vh_rtmutex_waiter_prio + __traceiter_android_vh_rt_mutex_steal __traceiter_android_vh_rwsem_can_spin_on_owner __traceiter_android_vh_rwsem_opt_spin_finish __traceiter_android_vh_rwsem_opt_spin_start @@ -258,6 +259,7 @@ __tracepoint_android_vh_record_rtmutex_lock_starttime __tracepoint_android_vh_record_rwsem_lock_starttime __tracepoint_android_vh_rtmutex_waiter_prio + __tracepoint_android_vh_rt_mutex_steal __tracepoint_android_vh_rwsem_can_spin_on_owner __tracepoint_android_vh_rwsem_opt_spin_finish __tracepoint_android_vh_rwsem_opt_spin_start From d16a15fde575b740e248e5bbd6ffd11a90c40baa Mon Sep 17 00:00:00 2001 From: Roy Luo Date: Tue, 28 Nov 2023 22:17:56 +0000 Subject: [PATCH 061/139] UPSTREAM: USB: gadget: core: adjust uevent timing on gadget unbind The KOBJ_CHANGE uevent is sent before gadget unbind is actually executed, resulting in inaccurate uevent emitted at incorrect timing (the uevent would have USB_UDC_DRIVER variable set while it would soon be removed). Move the KOBJ_CHANGE uevent to the end of the unbind function so that uevent is sent only after the change has been made. 
Fixes: 2ccea03a8f7e ("usb: gadget: introduce UDC Class") Cc: stable@vger.kernel.org Signed-off-by: Roy Luo Link: https://lore.kernel.org/r/20231128221756.2591158-1-royluo@google.com Signed-off-by: Greg Kroah-Hartman Bug: 312543856 Change-Id: Ida7fa7e1cfae3d1b3f3348512a67fe91065f25af (cherry picked from commit 73ea73affe8622bdf292de898da869d441da6a9d) Signed-off-by: Roy Luo --- drivers/usb/gadget/udc/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/usb/gadget/udc/core.c b/drivers/usb/gadget/udc/core.c index d7e992ee7743..1f56b770465e 100644 --- a/drivers/usb/gadget/udc/core.c +++ b/drivers/usb/gadget/udc/core.c @@ -1619,8 +1619,6 @@ static void gadget_unbind_driver(struct device *dev) dev_dbg(&udc->dev, "unbinding gadget driver [%s]\n", driver->function); - kobject_uevent(&udc->dev.kobj, KOBJ_CHANGE); - udc->allow_connect = false; cancel_work_sync(&udc->vbus_work); mutex_lock(&udc->connect_lock); @@ -1640,6 +1638,8 @@ static void gadget_unbind_driver(struct device *dev) driver->is_bound = false; udc->driver = NULL; mutex_unlock(&udc_lock); + + kobject_uevent(&udc->dev.kobj, KOBJ_CHANGE); } /* ------------------------------------------------------------------------- */ From 2c085909e7a7d2bb39881bc33ab6a45713c7f379 Mon Sep 17 00:00:00 2001 From: leonardian Date: Wed, 13 Dec 2023 07:00:25 +0000 Subject: [PATCH 062/139] ANDROID: Update the ABI symbol list Adding the following symbols: - _dev_alert Bug: 311337219 Change-Id: Iaf6710842c45921ccfbacd1361e0b57401cf65d9 Signed-off-by: leonardian --- android/abi_gki_aarch64_pixel | 1 + 1 file changed, 1 insertion(+) diff --git a/android/abi_gki_aarch64_pixel b/android/abi_gki_aarch64_pixel index b7d8f1baa215..af22721e20b8 100644 --- a/android/abi_gki_aarch64_pixel +++ b/android/abi_gki_aarch64_pixel @@ -340,6 +340,7 @@ desc_to_gpio destroy_workqueue dev_addr_mod + _dev_alert dev_alloc_name __dev_change_net_namespace dev_close From 88a19395047b78359c6c352491c2028dc730878c Mon Sep 17 00:00:00 2001 From: Jingbo Xu Date: Wed, 30 Nov 2022 14:04:55 +0800 Subject: [PATCH 063/139] UPSTREAM: erofs: enable large folios for iomap mode Enable large folios for iomap mode. Then the readahead routine will pass down large folios containing multiple pages. Let's enable this for non-compressed format for now, until the compression part supports large folios later. When large folios supported, the iomap routine will allocate iomap_page for each large folio and thus we need iomap_release_folio() and iomap_invalidate_folio() to free iomap_page when these folios get reclaimed or invalidated. 
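Opting a mapping in thus takes two pieces, sketched here to mirror the diff below: the mapping flag at inode setup, and the iomap teardown hooks in the address_space operations so per-folio iomap state is freed on reclaim or invalidation:

	/* at inode initialization, for uncompressed (iomap) inodes only */
	inode->i_mapping->a_ops = &erofs_raw_access_aops;
	mapping_set_large_folios(inode->i_mapping);

	/* in erofs_raw_access_aops */
	.release_folio = iomap_release_folio,
	.invalidate_folio = iomap_invalidate_folio,
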
Signed-off-by: Jingbo Xu Reviewed-by: Gao Xiang Reviewed-by: Chao Yu Link: https://lore.kernel.org/r/20221130060455.44532-1-jefflexu@linux.alibaba.com Signed-off-by: Gao Xiang Bug: 318378021 Change-Id: Iedbb9a2daf132399b7a1b5ea6905977ba123ba3c (cherry picked from commit ce529cc25b184e93397b94a8a322128fc0095cbb) Signed-off-by: Sandeep Dhavale --- fs/erofs/data.c | 2 ++ fs/erofs/inode.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 83532525282e..bbfbaf25ee59 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -404,6 +404,8 @@ const struct address_space_operations erofs_raw_access_aops = { .readahead = erofs_readahead, .bmap = erofs_bmap, .direct_IO = noop_direct_IO, + .release_folio = iomap_release_folio, + .invalidate_folio = iomap_invalidate_folio, }; #ifdef CONFIG_FS_DAX diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index 8fc41fd1620c..187ca02bbc2d 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -299,6 +299,8 @@ static int erofs_fill_inode(struct inode *inode) goto out_unlock; } inode->i_mapping->a_ops = &erofs_raw_access_aops; + if (!erofs_is_fscache_mode(inode->i_sb)) + mapping_set_large_folios(inode->i_mapping); #ifdef CONFIG_EROFS_FS_ONDEMAND if (erofs_is_fscache_mode(inode->i_sb)) inode->i_mapping->a_ops = &erofs_fscache_access_aops; From 66595bb17c147d49721669fe822817dea377b9ce Mon Sep 17 00:00:00 2001 From: Yue Hu Date: Wed, 26 Apr 2023 16:44:49 +0800 Subject: [PATCH 064/139] UPSTREAM: erofs: fold in z_erofs_decompress() There is no need for this helper since it's just a simple wrapper for the decompress method with only one caller. So, let's fold it in directly instead. Signed-off-by: Yue Hu Reviewed-by: Gao Xiang Link: https://lore.kernel.org/r/20230426084449.12781-1-zbestahu@gmail.com Signed-off-by: Gao Xiang Bug: 318378021 (cherry picked from commit 597e2953ae9b4a391e883c1f1a4cda5878e2dbed) Change-Id: I849360f088016cf97542858e8a5a9cee671a2f61 Signed-off-by: Sandeep Dhavale --- fs/erofs/compress.h | 3 +-- fs/erofs/decompressor.c | 8 +------- fs/erofs/zdata.c | 4 +++- 3 files changed, 5 insertions(+), 10 deletions(-) diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h index 26fa170090b8..b1b846504027 100644 --- a/fs/erofs/compress.h +++ b/fs/erofs/compress.h @@ -89,8 +89,7 @@ static inline bool erofs_page_is_managed(const struct erofs_sb_info *sbi, int z_erofs_fixup_insize(struct z_erofs_decompress_req *rq, const char *padbuf, unsigned int padbufsize); -int z_erofs_decompress(struct z_erofs_decompress_req *rq, - struct page **pagepool); +extern const struct z_erofs_decompressor erofs_decompressors[]; /* prototypes for specific algorithms */ int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq, diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index 7021e2cf6146..2a29943fa5cc 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -363,7 +363,7 @@ static int z_erofs_transform_plain(struct z_erofs_decompress_req *rq, return 0; } -static struct z_erofs_decompressor decompressors[] = { +const struct z_erofs_decompressor erofs_decompressors[] = { [Z_EROFS_COMPRESSION_SHIFTED] = { .decompress = z_erofs_transform_plain, .name = "shifted" @@ -383,9 +383,3 @@ static struct z_erofs_decompressor decompressors[] = { }, #endif }; - -int z_erofs_decompress(struct z_erofs_decompress_req *rq, - struct page **pagepool) -{ - return decompressors[rq->alg].decompress(rq, pagepool); -} diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 451b9a6cba68..f63c2422c9f9 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@
-1256,6 +1256,8 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, struct erofs_sb_info *const sbi = EROFS_SB(be->sb); struct z_erofs_pcluster *pcl = be->pcl; unsigned int pclusterpages = z_erofs_pclusterpages(pcl); + const struct z_erofs_decompressor *decompressor = + &erofs_decompressors[pcl->algorithmformat]; unsigned int i, inputsize; int err2; struct page *page; @@ -1299,7 +1301,7 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, else inputsize = pclusterpages * PAGE_SIZE; - err = z_erofs_decompress(&(struct z_erofs_decompress_req) { + err = decompressor->decompress(&(struct z_erofs_decompress_req) { .sb = be->sb, .in = be->compressed_pages, .out = be->decompressed_pages, From 5e861fa97e24f31aeda455870177da40c9eccd76 Mon Sep 17 00:00:00 2001 From: Yue Hu Date: Wed, 24 May 2023 14:39:44 +0800 Subject: [PATCH 065/139] UPSTREAM: erofs: remove the member readahead from struct z_erofs_decompress_frontend The struct member is only used to add REQ_RAHEAD during I/O submission. So it is cleaner to pass it as a parameter than keep it in the struct. Also, rename function z_erofs_get_sync_decompress_policy() to z_erofs_is_sync_decompress() for better clarity and conciseness. Signed-off-by: Yue Hu Reviewed-by: Gao Xiang Link: https://lore.kernel.org/r/20230524063944.1655-1-zbestahu@gmail.com Signed-off-by: Gao Xiang Bug: 318378021 (cherry picked from commit ef4b4b46c6aaf8edeea9a79320627fe10993f153) Change-Id: I59cc13e7499968a1e93e13df1cb43a5123d510d9 Signed-off-by: Sandeep Dhavale --- fs/erofs/zdata.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index f63c2422c9f9..50f803a48f96 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -534,7 +534,6 @@ struct z_erofs_decompress_frontend { z_erofs_next_pcluster_t owned_head; enum z_erofs_pclustermode mode; - bool readahead; /* used for applying cache strategy on the fly */ bool backmost; erofs_off_t headoffset; @@ -1076,7 +1075,7 @@ out: return err; } -static bool z_erofs_get_sync_decompress_policy(struct erofs_sb_info *sbi, +static bool z_erofs_is_sync_decompress(struct erofs_sb_info *sbi, unsigned int readahead_pages) { /* auto: enable for read_folio, disable for readahead */ @@ -1637,7 +1636,7 @@ static void z_erofs_decompressqueue_endio(struct bio *bio) static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, struct page **pagepool, struct z_erofs_decompressqueue *fgq, - bool *force_fg) + bool *force_fg, bool readahead) { struct super_block *sb = f->inode->i_sb; struct address_space *mc = MNGD_MAPPING(EROFS_SB(sb)); @@ -1723,7 +1722,7 @@ submit_bio_retry: bio->bi_iter.bi_sector = (sector_t)cur << (sb->s_blocksize_bits - 9); bio->bi_private = q[JQ_SUBMIT]; - if (f->readahead) + if (readahead) bio->bi_opf |= REQ_RAHEAD; ++nr_bios; } @@ -1759,13 +1758,13 @@ submit_bio_retry: } static void z_erofs_runqueue(struct z_erofs_decompress_frontend *f, - struct page **pagepool, bool force_fg) + struct page **pagepool, bool force_fg, bool ra) { struct z_erofs_decompressqueue io[NR_JOBQUEUES]; if (f->owned_head == Z_EROFS_PCLUSTER_TAIL) return; - z_erofs_submit_queue(f, pagepool, io, &force_fg); + z_erofs_submit_queue(f, pagepool, io, &force_fg, ra); /* handle bypass queue (no i/o pclusters) immediately */ z_erofs_decompress_queue(&io[JQ_BYPASS], pagepool); @@ -1863,8 +1862,8 @@ static int z_erofs_read_folio(struct file *file, struct folio *folio) (void)z_erofs_collector_end(&f); /* if some compressed cluster 
ready, need submit them anyway */ - z_erofs_runqueue(&f, &pagepool, - z_erofs_get_sync_decompress_policy(sbi, 0)); + z_erofs_runqueue(&f, &pagepool, z_erofs_is_sync_decompress(sbi, 0), + false); if (err) erofs_err(inode->i_sb, "failed to read, err [%d]", err); @@ -1882,7 +1881,6 @@ static void z_erofs_readahead(struct readahead_control *rac) struct page *pagepool = NULL, *head = NULL, *page; unsigned int nr_pages; - f.readahead = true; f.headoffset = readahead_pos(rac); z_erofs_pcluster_readmore(&f, rac, f.headoffset + @@ -1913,7 +1911,7 @@ static void z_erofs_readahead(struct readahead_control *rac) (void)z_erofs_collector_end(&f); z_erofs_runqueue(&f, &pagepool, - z_erofs_get_sync_decompress_policy(sbi, nr_pages)); + z_erofs_is_sync_decompress(sbi, nr_pages), true); erofs_put_metabuf(&f.map.buf); erofs_release_pages(&pagepool); } From bed20ed1d348355f99a40e7e61729b25e21356f9 Mon Sep 17 00:00:00 2001 From: Yue Hu Date: Thu, 25 May 2023 15:26:05 +0800 Subject: [PATCH 066/139] UPSTREAM: erofs: clean up z_erofs_pcluster_readmore() The `end` parameter is not needed since it's pointless for !backmost; we can handle it with backmost internally. And we only expand the trailing edge, so the newstart can be replaced with ->headoffset. Also, remove linux/prefetch.h inclusion since that is not used anymore after commit 386292919c25 ("erofs: introduce readmore decompression strategy"). Signed-off-by: Yue Hu Reviewed-by: Gao Xiang Link: https://lore.kernel.org/r/20230525072605.17857-1-zbestahu@gmail.com [ Gao Xiang: update commit description. ] Signed-off-by: Gao Xiang Bug: 318378021 (cherry picked from commit 796e9149a2fcdba5543e247abd8d911a399bb9a6) Change-Id: I9412c4111800077c876a43c4256ce9760a7d902e Signed-off-by: Sandeep Dhavale --- fs/erofs/zdata.c | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 50f803a48f96..b7bf435eea82 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -5,7 +5,6 @@ * Copyright (C) 2022 Alibaba Cloud */ #include "compress.h" -#include #include #include #include @@ -1785,28 +1784,28 @@ static void z_erofs_runqueue(struct z_erofs_decompress_frontend *f, */ static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f, struct readahead_control *rac, - erofs_off_t end, - struct page **pagepool, - bool backmost) + struct page **pagepool, bool backmost) { struct inode *inode = f->inode; struct erofs_map_blocks *map = &f->map; - erofs_off_t cur; + erofs_off_t cur, end, headoffset = f->headoffset; int err; if (backmost) { + if (rac) + end = headoffset + readahead_length(rac) - 1; + else + end = headoffset + PAGE_SIZE - 1; map->m_la = end; err = z_erofs_map_blocks_iter(inode, map, EROFS_GET_BLOCKS_READMORE); if (err) return; - /* expend ra for the trailing edge if readahead */ + /* expand ra for the trailing edge if readahead */ if (rac) { - loff_t newstart = readahead_pos(rac); - cur = round_up(map->m_la + map->m_llen, PAGE_SIZE); - readahead_expand(rac, newstart, cur - newstart); + readahead_expand(rac, headoffset, cur - headoffset); return; } end = round_up(end, PAGE_SIZE); @@ -1854,10 +1853,9 @@ static int z_erofs_read_folio(struct file *file, struct folio *folio) trace_erofs_readpage(page, false); f.headoffset = (erofs_off_t)page->index << PAGE_SHIFT; - z_erofs_pcluster_readmore(&f, NULL, f.headoffset + PAGE_SIZE - 1, - &pagepool, true); + z_erofs_pcluster_readmore(&f, NULL, &pagepool, true); err = z_erofs_do_read_page(&f, page, &pagepool); - z_erofs_pcluster_readmore(&f, NULL,
0, &pagepool, false); + z_erofs_pcluster_readmore(&f, NULL, &pagepool, false); (void)z_erofs_collector_end(&f); @@ -1883,8 +1881,7 @@ static void z_erofs_readahead(struct readahead_control *rac) f.headoffset = readahead_pos(rac); - z_erofs_pcluster_readmore(&f, rac, f.headoffset + - readahead_length(rac) - 1, &pagepool, true); + z_erofs_pcluster_readmore(&f, rac, &pagepool, true); nr_pages = readahead_count(rac); trace_erofs_readpages(inode, readahead_index(rac), nr_pages, false); @@ -1907,7 +1904,7 @@ static void z_erofs_readahead(struct readahead_control *rac) page->index, EROFS_I(inode)->nid); put_page(page); } - z_erofs_pcluster_readmore(&f, rac, 0, &pagepool, false); + z_erofs_pcluster_readmore(&f, rac, &pagepool, false); (void)z_erofs_collector_end(&f); z_erofs_runqueue(&f, &pagepool, From 5c1827383ac1df6e796ae44533a913d39425a78e Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Sat, 27 May 2023 04:14:54 +0800 Subject: [PATCH 067/139] UPSTREAM: erofs: allocate extra bvec pages directly instead of retrying If non-bootstrap bvecs cannot be kept in place (very rarely), an extra short-lived page is allocated. Let's just allocate it immediately rather than do unnecessary -EAGAIN return first and retry as a cleanup. Also it's unnecessary to use __GFP_NOFAIL here since we could gracefully fail out this case instead. Signed-off-by: Gao Xiang Reviewed-by: Yue Hu Link: https://lore.kernel.org/r/20230526201459.128169-2-hsiangkao@linux.alibaba.com Bug: 318378021 (cherry picked from commit 05b63d2beb8b0f752d1f5cdd051c8bdbf532cedd) Change-Id: I2ac45a943060406bcbb741c5f7aa1094f783f906 Signed-off-by: Sandeep Dhavale --- fs/erofs/zdata.c | 30 +++++++++++------------------- 1 file changed, 11 insertions(+), 19 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index b7bf435eea82..2e945803c457 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -240,12 +240,17 @@ static int z_erofs_bvec_enqueue(struct z_erofs_bvec_iter *iter, struct z_erofs_bvec *bvec, struct page **candidate_bvpage) { - if (iter->cur == iter->nr) { - if (!*candidate_bvpage) - return -EAGAIN; + if (iter->cur >= iter->nr) { + struct page *nextpage = *candidate_bvpage; + if (!nextpage) { + nextpage = alloc_page(GFP_NOFS); + if (!nextpage) + return -ENOMEM; + set_page_private(nextpage, Z_EROFS_SHORTLIVED_PAGE); + } DBG_BUGON(iter->bvset->nextpage); - iter->bvset->nextpage = *candidate_bvpage; + iter->bvset->nextpage = nextpage; z_erofs_bvset_flip(iter); iter->bvset->nextpage = NULL; @@ -872,10 +877,8 @@ static bool z_erofs_collector_end(struct z_erofs_decompress_frontend *fe) z_erofs_bvec_iter_end(&fe->biter); mutex_unlock(&pcl->lock); - if (fe->candidate_bvpage) { - DBG_BUGON(z_erofs_is_shortlived_page(fe->candidate_bvpage)); + if (fe->candidate_bvpage) fe->candidate_bvpage = NULL; - } /* * if all pending pages are added, don't hold its reference @@ -1023,24 +1026,13 @@ hitted: if (cur) tight &= (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED); -retry: err = z_erofs_attach_page(fe, &((struct z_erofs_bvec) { .page = page, .offset = offset - map->m_la, .end = end, }), exclusive); - /* should allocate an additional short-lived page for bvset */ - if (err == -EAGAIN && !fe->candidate_bvpage) { - fe->candidate_bvpage = alloc_page(GFP_NOFS | __GFP_NOFAIL); - set_page_private(fe->candidate_bvpage, - Z_EROFS_SHORTLIVED_PAGE); - goto retry; - } - - if (err) { - DBG_BUGON(err == -EAGAIN && fe->candidate_bvpage); + if (err) goto out; - } z_erofs_onlinepage_split(page); /* bump up the number of spiltted parts of a page */ From 
3d9318266165302aae965416e8ebfed980088b8c Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Sat, 27 May 2023 04:14:55 +0800 Subject: [PATCH 068/139] UPSTREAM: erofs: avoid on-stack pagepool directly passed by arguments On-stack pagepool is used so that short-lived temporary pages could be shared within a single I/O request (e.g. among multiple pclusters). Moving the remaining frontend-related uses into z_erofs_decompress_frontend to avoid too many arguments. Signed-off-by: Gao Xiang Reviewed-by: Yue Hu Link: https://lore.kernel.org/r/20230526201459.128169-3-hsiangkao@linux.alibaba.com Bug: 318378021 (cherry picked from commit 6ab5eed6002edc5a29b683285e90459a7df6ce2b) Change-Id: I57d3ba6087904bb40c55b780aca50c16bfba2c0f Signed-off-by: Sandeep Dhavale --- fs/erofs/zdata.c | 64 +++++++++++++++++++++++------------------------- 1 file changed, 30 insertions(+), 34 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 2e945803c457..2abc2398708a 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -238,13 +238,14 @@ static void z_erofs_bvec_iter_begin(struct z_erofs_bvec_iter *iter, static int z_erofs_bvec_enqueue(struct z_erofs_bvec_iter *iter, struct z_erofs_bvec *bvec, - struct page **candidate_bvpage) + struct page **candidate_bvpage, + struct page **pagepool) { if (iter->cur >= iter->nr) { struct page *nextpage = *candidate_bvpage; if (!nextpage) { - nextpage = alloc_page(GFP_NOFS); + nextpage = erofs_allocpage(pagepool, GFP_NOFS); if (!nextpage) return -ENOMEM; set_page_private(nextpage, Z_EROFS_SHORTLIVED_PAGE); @@ -533,6 +534,7 @@ struct z_erofs_decompress_frontend { struct erofs_map_blocks map; struct z_erofs_bvec_iter biter; + struct page *pagepool; struct page *candidate_bvpage; struct z_erofs_pcluster *pcl; z_erofs_next_pcluster_t owned_head; @@ -567,8 +569,7 @@ static bool z_erofs_should_alloc_cache(struct z_erofs_decompress_frontend *fe) return false; } -static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, - struct page **pagepool) +static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe) { struct address_space *mc = MNGD_MAPPING(EROFS_I_SB(fe->inode)); struct z_erofs_pcluster *pcl = fe->pcl; @@ -609,7 +610,7 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, * succeeds or fallback to in-place I/O instead * to avoid any direct reclaim. 
*/ - newpage = erofs_allocpage(pagepool, gfp); + newpage = erofs_allocpage(&fe->pagepool, gfp); if (!newpage) continue; set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE); @@ -622,7 +623,7 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, if (page) put_page(page); else if (newpage) - erofs_pagepool_add(pagepool, newpage); + erofs_pagepool_add(&fe->pagepool, newpage); } /* @@ -720,7 +721,8 @@ static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe, !fe->candidate_bvpage) fe->candidate_bvpage = bvec->page; } - ret = z_erofs_bvec_enqueue(&fe->biter, bvec, &fe->candidate_bvpage); + ret = z_erofs_bvec_enqueue(&fe->biter, bvec, &fe->candidate_bvpage, + &fe->pagepool); fe->pcl->vcnt += (ret >= 0); return ret; } @@ -925,7 +927,7 @@ static int z_erofs_read_fragment(struct inode *inode, erofs_off_t pos, } static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, - struct page *page, struct page **pagepool) + struct page *page) { struct inode *const inode = fe->inode; struct erofs_map_blocks *const map = &fe->map; @@ -985,7 +987,7 @@ repeat: fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE; } else { /* bind cache first when cached decompression is preferred */ - z_erofs_bind_cache(fe, pagepool); + z_erofs_bind_cache(fe); } hitted: /* @@ -1625,7 +1627,6 @@ static void z_erofs_decompressqueue_endio(struct bio *bio) } static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, - struct page **pagepool, struct z_erofs_decompressqueue *fgq, bool *force_fg, bool readahead) { @@ -1683,8 +1684,8 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, do { struct page *page; - page = pickup_page_for_submission(pcl, i++, pagepool, - mc); + page = pickup_page_for_submission(pcl, i++, + &f->pagepool, mc); if (!page) continue; @@ -1749,16 +1750,16 @@ submit_bio_retry: } static void z_erofs_runqueue(struct z_erofs_decompress_frontend *f, - struct page **pagepool, bool force_fg, bool ra) + bool force_fg, bool ra) { struct z_erofs_decompressqueue io[NR_JOBQUEUES]; if (f->owned_head == Z_EROFS_PCLUSTER_TAIL) return; - z_erofs_submit_queue(f, pagepool, io, &force_fg, ra); + z_erofs_submit_queue(f, io, &force_fg, ra); /* handle bypass queue (no i/o pclusters) immediately */ - z_erofs_decompress_queue(&io[JQ_BYPASS], pagepool); + z_erofs_decompress_queue(&io[JQ_BYPASS], &f->pagepool); if (!force_fg) return; @@ -1767,7 +1768,7 @@ static void z_erofs_runqueue(struct z_erofs_decompress_frontend *f, wait_for_completion_io(&io[JQ_SUBMIT].u.done); /* handle synchronous decompress queue in the caller context */ - z_erofs_decompress_queue(&io[JQ_SUBMIT], pagepool); + z_erofs_decompress_queue(&io[JQ_SUBMIT], &f->pagepool); } /* @@ -1775,8 +1776,7 @@ static void z_erofs_runqueue(struct z_erofs_decompress_frontend *f, * approximate readmore strategies as a start. 
*/ static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f, - struct readahead_control *rac, - struct page **pagepool, bool backmost) + struct readahead_control *rac, bool backmost) { struct inode *inode = f->inode; struct erofs_map_blocks *map = &f->map; @@ -1818,7 +1818,7 @@ static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f, if (PageUptodate(page)) { unlock_page(page); } else { - err = z_erofs_do_read_page(f, page, pagepool); + err = z_erofs_do_read_page(f, page); if (err) erofs_err(inode->i_sb, "readmore error at page %lu @ nid %llu", @@ -1839,27 +1839,24 @@ static int z_erofs_read_folio(struct file *file, struct folio *folio) struct inode *const inode = page->mapping->host; struct erofs_sb_info *const sbi = EROFS_I_SB(inode); struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); - struct page *pagepool = NULL; int err; trace_erofs_readpage(page, false); f.headoffset = (erofs_off_t)page->index << PAGE_SHIFT; - z_erofs_pcluster_readmore(&f, NULL, &pagepool, true); - err = z_erofs_do_read_page(&f, page, &pagepool); - z_erofs_pcluster_readmore(&f, NULL, &pagepool, false); - + z_erofs_pcluster_readmore(&f, NULL, true); + err = z_erofs_do_read_page(&f, page); + z_erofs_pcluster_readmore(&f, NULL, false); (void)z_erofs_collector_end(&f); /* if some compressed cluster ready, need submit them anyway */ - z_erofs_runqueue(&f, &pagepool, z_erofs_is_sync_decompress(sbi, 0), - false); + z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, 0), false); if (err) erofs_err(inode->i_sb, "failed to read, err [%d]", err); erofs_put_metabuf(&f.map.buf); - erofs_release_pages(&pagepool); + erofs_release_pages(&f.pagepool); return err; } @@ -1868,12 +1865,12 @@ static void z_erofs_readahead(struct readahead_control *rac) struct inode *const inode = rac->mapping->host; struct erofs_sb_info *const sbi = EROFS_I_SB(inode); struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); - struct page *pagepool = NULL, *head = NULL, *page; + struct page *head = NULL, *page; unsigned int nr_pages; f.headoffset = readahead_pos(rac); - z_erofs_pcluster_readmore(&f, rac, &pagepool, true); + z_erofs_pcluster_readmore(&f, rac, true); nr_pages = readahead_count(rac); trace_erofs_readpages(inode, readahead_index(rac), nr_pages, false); @@ -1889,20 +1886,19 @@ static void z_erofs_readahead(struct readahead_control *rac) /* traversal in reverse order */ head = (void *)page_private(page); - err = z_erofs_do_read_page(&f, page, &pagepool); + err = z_erofs_do_read_page(&f, page); if (err) erofs_err(inode->i_sb, "readahead error at page %lu @ nid %llu", page->index, EROFS_I(inode)->nid); put_page(page); } - z_erofs_pcluster_readmore(&f, rac, &pagepool, false); + z_erofs_pcluster_readmore(&f, rac, false); (void)z_erofs_collector_end(&f); - z_erofs_runqueue(&f, &pagepool, - z_erofs_is_sync_decompress(sbi, nr_pages), true); + z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, nr_pages), true); erofs_put_metabuf(&f.map.buf); - erofs_release_pages(&pagepool); + erofs_release_pages(&f.pagepool); } const struct address_space_operations z_erofs_aops = { From 187d034575bcaf95e145627406a5b26108fba447 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Sat, 27 May 2023 04:14:57 +0800 Subject: [PATCH 069/139] BACKPORT: erofs: adapt managed inode operations into folios This patch gets rid of erofs_try_to_free_cached_page() and fold it into .release_folio(). It also moves managed inode operations into zdata.c, which simplifies the code a bit. No logic changes. 
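As a quick reference, the page helpers being replaced map one-to-one onto their folio counterparts; all of them appear in the hunks below:

	page_private(page)         ->  folio_get_private(folio)
	PagePrivate(page)          ->  folio_test_private(folio)
	detach_page_private(page)  ->  folio_detach_private(folio)

and the managed-cache teardown ends up driven entirely through folio aops in zdata.c, roughly (condensed from the hunks below):

	static const struct address_space_operations z_erofs_cache_aops = {
		.release_folio = z_erofs_cache_release_folio,
		.invalidate_folio = z_erofs_cache_invalidate_folio,
	};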
Signed-off-by: Gao Xiang Reviewed-by: Yue Hu Link: https://lore.kernel.org/r/20230526201459.128169-5-hsiangkao@linux.alibaba.com Bug: 318378021 Change-Id: I5cb1e44769f68edce788cb4f8084bb3d45b594b3 (cherry picked from commit 7b4e372c36fcd33c74ba3cbd65fa534b9c558184) [dhavale: changes to internal.h applied manually] Signed-off-by: Sandeep Dhavale --- fs/erofs/internal.h | 3 ++- fs/erofs/super.c | 62 --------------------------------------------- fs/erofs/zdata.c | 59 ++++++++++++++++++++++++++++++++++++------ 3 files changed, 53 insertions(+), 71 deletions(-) diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 1c03daf83a68..23151da13a23 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -544,7 +544,7 @@ int __init z_erofs_init_zip_subsystem(void); void z_erofs_exit_zip_subsystem(void); int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, struct erofs_workgroup *egrp); -int erofs_try_to_free_cached_page(struct page *page); +int erofs_init_managed_cache(struct super_block *sb); int z_erofs_load_lz4_config(struct super_block *sb, struct erofs_super_block *dsb, struct z_erofs_lz4_cfgs *lz4, int len); @@ -565,6 +565,7 @@ static inline int z_erofs_load_lz4_config(struct super_block *sb, } return 0; } +static inline int erofs_init_managed_cache(struct super_block *sb) { return 0; } #endif /* !CONFIG_EROFS_FS_ZIP */ #ifdef CONFIG_EROFS_FS_ZIP_LZMA diff --git a/fs/erofs/super.c b/fs/erofs/super.c index b073b38c1c77..19af9bbcb8f1 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -597,68 +597,6 @@ static int erofs_fc_parse_param(struct fs_context *fc, return 0; } -#ifdef CONFIG_EROFS_FS_ZIP -static const struct address_space_operations managed_cache_aops; - -static bool erofs_managed_cache_release_folio(struct folio *folio, gfp_t gfp) -{ - bool ret = true; - struct address_space *const mapping = folio->mapping; - - DBG_BUGON(!folio_test_locked(folio)); - DBG_BUGON(mapping->a_ops != &managed_cache_aops); - - if (folio_test_private(folio)) - ret = erofs_try_to_free_cached_page(&folio->page); - - return ret; -} - -/* - * It will be called only on inode eviction. In case that there are still some - * decompression requests in progress, wait with rescheduling for a bit here. - * We could introduce an extra locking instead but it seems unnecessary. 
- */ -static void erofs_managed_cache_invalidate_folio(struct folio *folio, - size_t offset, size_t length) -{ - const size_t stop = length + offset; - - DBG_BUGON(!folio_test_locked(folio)); - - /* Check for potential overflow in debug mode */ - DBG_BUGON(stop > folio_size(folio) || stop < length); - - if (offset == 0 && stop == folio_size(folio)) - while (!erofs_managed_cache_release_folio(folio, GFP_NOFS)) - cond_resched(); -} - -static const struct address_space_operations managed_cache_aops = { - .release_folio = erofs_managed_cache_release_folio, - .invalidate_folio = erofs_managed_cache_invalidate_folio, -}; - -static int erofs_init_managed_cache(struct super_block *sb) -{ - struct erofs_sb_info *const sbi = EROFS_SB(sb); - struct inode *const inode = new_inode(sb); - - if (!inode) - return -ENOMEM; - - set_nlink(inode, 1); - inode->i_size = OFFSET_MAX; - - inode->i_mapping->a_ops = &managed_cache_aops; - mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); - sbi->managed_cache = inode; - return 0; -} -#else -static int erofs_init_managed_cache(struct super_block *sb) { return 0; } -#endif - static struct inode *erofs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation) { diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 2abc2398708a..8085e712314e 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -668,29 +668,72 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, return 0; } -int erofs_try_to_free_cached_page(struct page *page) +static bool z_erofs_cache_release_folio(struct folio *folio, gfp_t gfp) { - struct z_erofs_pcluster *const pcl = (void *)page_private(page); - int ret, i; + struct z_erofs_pcluster *pcl = folio_get_private(folio); + bool ret; + int i; + + if (!folio_test_private(folio)) + return true; if (!erofs_workgroup_try_to_freeze(&pcl->obj, 1)) - return 0; + return false; - ret = 0; + ret = false; DBG_BUGON(z_erofs_is_inline_pcluster(pcl)); for (i = 0; i < pcl->pclusterpages; ++i) { - if (pcl->compressed_bvecs[i].page == page) { + if (pcl->compressed_bvecs[i].page == &folio->page) { WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL); - ret = 1; + ret = true; break; } } erofs_workgroup_unfreeze(&pcl->obj, 1); + if (ret) - detach_page_private(page); + folio_detach_private(folio); return ret; } +/* + * It will be called only on inode eviction. In case that there are still some + * decompression requests in progress, wait with rescheduling for a bit here. + * An extra lock could be introduced instead but it seems unnecessary. 
+ */ +static void z_erofs_cache_invalidate_folio(struct folio *folio, + size_t offset, size_t length) +{ + const size_t stop = length + offset; + + /* Check for potential overflow in debug mode */ + DBG_BUGON(stop > folio_size(folio) || stop < length); + + if (offset == 0 && stop == folio_size(folio)) + while (!z_erofs_cache_release_folio(folio, GFP_NOFS)) + cond_resched(); +} + +static const struct address_space_operations z_erofs_cache_aops = { + .release_folio = z_erofs_cache_release_folio, + .invalidate_folio = z_erofs_cache_invalidate_folio, +}; + +int erofs_init_managed_cache(struct super_block *sb) +{ + struct inode *const inode = new_inode(sb); + + if (!inode) + return -ENOMEM; + + set_nlink(inode, 1); + inode->i_size = OFFSET_MAX; + inode->i_mapping->a_ops = &z_erofs_cache_aops; + mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); + EROFS_SB(sb)->managed_cache = inode; + return 0; +} + static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe, struct z_erofs_bvec *bvec) { From 365ca16da250c80d27a7f7b3499acfa2db9718a7 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Wed, 28 Jun 2023 00:12:40 +0800 Subject: [PATCH 070/139] UPSTREAM: erofs: simplify z_erofs_transform_plain() Use memcpy_to_page() instead of open-coding them. In addition, add a missing flush_dcache_page() even though almost all modern architectures clear `PG_dcache_clean` flag for new file cache pages so that it doesn't change anything in practice. Signed-off-by: Gao Xiang Reviewed-by: Yue Hu Reviewed-by: Chao Yu Link: https://lore.kernel.org/r/20230627161240.331-2-hsiangkao@linux.alibaba.com Signed-off-by: Gao Xiang Bug: 318378021 (cherry picked from commit c5539762f32e97c5e16215fa1336e32095b8b0fd) Change-Id: I4cb665b592936502ca95e2aee20e1c3a56103ff5 Signed-off-by: Sandeep Dhavale --- fs/erofs/decompressor.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index 2a29943fa5cc..b9dd685a16fc 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -328,7 +328,7 @@ static int z_erofs_transform_plain(struct z_erofs_decompress_req *rq, const unsigned int lefthalf = rq->outputsize - righthalf; const unsigned int interlaced_offset = rq->alg == Z_EROFS_COMPRESSION_SHIFTED ? 0 : rq->pageofs_out; - unsigned char *src, *dst; + u8 *src; if (outpages > 2 && rq->alg == Z_EROFS_COMPRESSION_SHIFTED) { DBG_BUGON(1); @@ -341,22 +341,19 @@ static int z_erofs_transform_plain(struct z_erofs_decompress_req *rq, } src = kmap_local_page(rq->in[inpages - 1]) + rq->pageofs_in; - if (rq->out[0]) { - dst = kmap_local_page(rq->out[0]); - memcpy(dst + rq->pageofs_out, src + interlaced_offset, - righthalf); - kunmap_local(dst); - } + if (rq->out[0]) + memcpy_to_page(rq->out[0], rq->pageofs_out, + src + interlaced_offset, righthalf); if (outpages > inpages) { DBG_BUGON(!rq->out[outpages - 1]); if (rq->out[outpages - 1] != rq->in[inpages - 1]) { - dst = kmap_local_page(rq->out[outpages - 1]); - memcpy(dst, interlaced_offset ? src : - (src + righthalf), lefthalf); - kunmap_local(dst); + memcpy_to_page(rq->out[outpages - 1], 0, src + + (interlaced_offset ? 
0 : righthalf), + lefthalf); } else if (!interlaced_offset) { memmove(src, src + righthalf, lefthalf); + flush_dcache_page(rq->in[inpages - 1]); } } kunmap_local(src); From 4067dd99694a3722116ed26bfcb361201ea97a9b Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Wed, 28 Jun 2023 00:12:39 +0800 Subject: [PATCH 071/139] UPSTREAM: erofs: get rid of the remaining kmap_atomic() It's unnecessary to use kmap_atomic() compared with kmap_local_page(). In addition, kmap_atomic() is deprecated now. Signed-off-by: Gao Xiang Reviewed-by: Yue Hu Reviewed-by: Chao Yu Link: https://lore.kernel.org/r/20230627161240.331-1-hsiangkao@linux.alibaba.com Signed-off-by: Gao Xiang Bug: 318378021 (cherry picked from commit 123ec246ebe323d468c5ca996700ea4739d20ddf) Change-Id: I7efee861bb4f079fe6b79123d554be2e1867d13b Signed-off-by: Sandeep Dhavale --- fs/erofs/decompressor.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index b9dd685a16fc..cfad1eac7fd9 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -148,7 +148,7 @@ static void *z_erofs_lz4_handle_overlap(struct z_erofs_lz4_decompress_ctx *ctx, *maptype = 0; return inpage; } - kunmap_atomic(inpage); + kunmap_local(inpage); might_sleep(); src = erofs_vm_map_ram(rq->in, ctx->inpages); if (!src) @@ -162,7 +162,7 @@ docopy: src = erofs_get_pcpubuf(ctx->inpages); if (!src) { DBG_BUGON(1); - kunmap_atomic(inpage); + kunmap_local(inpage); return ERR_PTR(-EFAULT); } @@ -173,9 +173,9 @@ docopy: min_t(unsigned int, total, PAGE_SIZE - *inputmargin); if (!inpage) - inpage = kmap_atomic(*in); + inpage = kmap_local_page(*in); memcpy(tmp, inpage + *inputmargin, page_copycnt); - kunmap_atomic(inpage); + kunmap_local(inpage); inpage = NULL; tmp += page_copycnt; total -= page_copycnt; @@ -214,7 +214,7 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx, int ret, maptype; DBG_BUGON(*rq->in == NULL); - headpage = kmap_atomic(*rq->in); + headpage = kmap_local_page(*rq->in); /* LZ4 decompression inplace is only safe if zero_padding is enabled */ if (erofs_sb_has_zero_padding(EROFS_SB(rq->sb))) { @@ -223,7 +223,7 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx, min_t(unsigned int, rq->inputsize, rq->sb->s_blocksize - rq->pageofs_in)); if (ret) { - kunmap_atomic(headpage); + kunmap_local(headpage); return ret; } may_inplace = !((rq->pageofs_in + rq->inputsize) & @@ -261,7 +261,7 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx, } if (maptype == 0) { - kunmap_atomic(headpage); + kunmap_local(headpage); } else if (maptype == 1) { vm_unmap_ram(src, ctx->inpages); } else if (maptype == 2) { @@ -289,7 +289,7 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, /* one optimized fast path only for non bigpcluster cases yet */ if (ctx.inpages == 1 && ctx.outpages == 1 && !rq->inplace_io) { DBG_BUGON(!*rq->out); - dst = kmap_atomic(*rq->out); + dst = kmap_local_page(*rq->out); dst_maptype = 0; goto dstmap_out; } @@ -311,7 +311,7 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, dstmap_out: ret = z_erofs_lz4_decompress_mem(&ctx, dst + rq->pageofs_out); if (!dst_maptype) - kunmap_atomic(dst); + kunmap_local(dst); else if (dst_maptype == 2) vm_unmap_ram(dst, ctx.outpages); return ret; From d0dbf747924547655f733242ade325bd2426a7e0 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Thu, 17 Aug 2023 16:28:06 +0800 Subject: [PATCH 072/139] BACKPORT: erofs: simplify 
z_erofs_read_fragment() A trivial cleanup to make the fragment handling logic more clear. Reviewed-by: Yue Hu Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20230817082813.81180-1-hsiangkao@linux.alibaba.com Bug: 318378021 Change-Id: I50c09c65b7d3da5022cfc2ede27aa31a1b331d29 (cherry picked from commit 8b00be163f7b57cbf957b3d27b5a7ca1e2495cfa) [dhavale: resolved conflict around erofs_bread() in zdata.c] Signed-off-by: Sandeep Dhavale --- fs/erofs/zdata.c | 39 +++++++++++++-------------------------- 1 file changed, 13 insertions(+), 26 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 8085e712314e..f7d0d71359e8 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -936,22 +936,19 @@ static bool z_erofs_collector_end(struct z_erofs_decompress_frontend *fe) return true; } -static int z_erofs_read_fragment(struct inode *inode, erofs_off_t pos, - struct page *page, unsigned int pageofs, - unsigned int len) +static int z_erofs_read_fragment(struct super_block *sb, struct page *page, + unsigned int cur, unsigned int end, erofs_off_t pos) { - struct super_block *sb = inode->i_sb; - struct inode *packed_inode = EROFS_I_SB(inode)->packed_inode; + struct inode *packed_inode = EROFS_SB(sb)->packed_inode; struct erofs_buf buf = __EROFS_BUF_INITIALIZER; - u8 *src, *dst; - unsigned int i, cnt; + unsigned int cnt; + u8 *src; if (!packed_inode) return -EFSCORRUPTED; - pos += EROFS_I(inode)->z_fragmentoff; - for (i = 0; i < len; i += cnt) { - cnt = min_t(unsigned int, len - i, + for (; cur < end; cur += cnt, pos += cnt) { + cnt = min_t(unsigned int, end - cur, sb->s_blocksize - erofs_blkoff(sb, pos)); src = erofs_bread(&buf, packed_inode, erofs_blknr(sb, pos), EROFS_KMAP); @@ -959,11 +956,7 @@ static int z_erofs_read_fragment(struct inode *inode, erofs_off_t pos, erofs_put_metabuf(&buf); return PTR_ERR(src); } - - dst = kmap_local_page(page); - memcpy(dst + pageofs + i, src + erofs_blkoff(sb, pos), cnt); - kunmap_local(dst); - pos += cnt; + memcpy_to_page(page, cur, src + erofs_blkoff(sb, pos), cnt); } erofs_put_metabuf(&buf); return 0; @@ -976,7 +969,7 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, struct erofs_map_blocks *const map = &fe->map; const loff_t offset = page_offset(page); bool tight = true, exclusive; - unsigned int cur, end, spiltted; + unsigned int cur, end, len, spiltted; int err = 0; /* register locked file pages as online pages in pack */ @@ -1049,17 +1042,11 @@ hitted: goto next_part; } if (map->m_flags & EROFS_MAP_FRAGMENT) { - unsigned int pageofs, skip, len; + erofs_off_t fpos = offset + cur - map->m_la; - if (offset > map->m_la) { - pageofs = 0; - skip = offset - map->m_la; - } else { - pageofs = map->m_la & ~PAGE_MASK; - skip = 0; - } - len = min_t(unsigned int, map->m_llen - skip, end - cur); - err = z_erofs_read_fragment(inode, skip, page, pageofs, len); + len = min_t(unsigned int, map->m_llen - fpos, end - cur); + err = z_erofs_read_fragment(inode->i_sb, page, cur, cur + len, + EROFS_I(inode)->z_fragmentoff + fpos); if (err) goto out; ++spiltted; From 7751567a719dc3cd78125ac4b200596bab4c501a Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Thu, 17 Aug 2023 16:28:07 +0800 Subject: [PATCH 073/139] BACKPORT: erofs: avoid obsolete {collector,collection} terms {collector,collection} were once reserved in order to indicate different runtime logical extent instance of multi-reference pclusters. 
However, de-duplicated decompression has been landed in a more flexible way, thus `struct z_erofs_collection` was formally removed in commit 87ca34a7065d ("erofs: get rid of `struct z_erofs_collection'"). Let's handle the remaining leftovers, for example: `z_erofs_collector_begin` => `z_erofs_pcluster_begin` `z_erofs_collector_end` => `z_erofs_pcluster_end` as well as some comments. No logic changes. Reviewed-by: Yue Hu Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20230817082813.81180-2-hsiangkao@linux.alibaba.com Bug: 318378021 Change-Id: I61b812b5ae3dd564e52012d082415b1fc198383d (cherry picked from commit dcba1b232e26ebadbd215728199455d38a59253e) [dhavale: fixed minor conflict zdata.c in z_erofs_do_read_page()] Signed-off-by: Sandeep Dhavale --- fs/erofs/zdata.c | 39 ++++++++++++++++----------------------- 1 file changed, 18 insertions(+), 21 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index f7d0d71359e8..ad921985883f 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -512,19 +512,17 @@ enum z_erofs_pclustermode { */ Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE, /* - * The current collection has been linked with the owned chain, and - * could also be linked with the remaining collections, which means - * if the processing page is the tail page of the collection, thus - * the current collection can safely use the whole page (since - * the previous collection is under control) for in-place I/O, as - * illustrated below: - * ________________________________________________________________ - * | tail (partial) page | head (partial) page | - * | (of the current cl) | (of the previous collection) | - * | | | - * |__PCLUSTER_FOLLOWED___|___________PCLUSTER_FOLLOWED____________| + * The pcluster was just linked to a decompression chain by us. It can + * also be linked with the remaining pclusters, which means if the + * processing page is the tail page of a pcluster, this pcluster can + * safely use the whole page (since the previous pcluster is within the + * same chain) for in-place I/O, as illustrated below: + * ___________________________________________________ + * | tail (partial) page | head (partial) page | + * | (of the current pcl) | (of the previous pcl) | + * |___PCLUSTER_FOLLOWED___|_____PCLUSTER_FOLLOWED_____| * - * [ (*) the above page can be used as inplace I/O.
] */ Z_EROFS_PCLUSTER_FOLLOWED, }; @@ -855,7 +853,7 @@ err_out: return err; } -static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe) +static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe) { struct erofs_map_blocks *map = &fe->map; struct erofs_workgroup *grp = NULL; @@ -912,12 +910,12 @@ void erofs_workgroup_free_rcu(struct erofs_workgroup *grp) call_rcu(&pcl->rcu, z_erofs_rcu_callback); } -static bool z_erofs_collector_end(struct z_erofs_decompress_frontend *fe) +static void z_erofs_pcluster_end(struct z_erofs_decompress_frontend *fe) { struct z_erofs_pcluster *pcl = fe->pcl; if (!pcl) - return false; + return; z_erofs_bvec_iter_end(&fe->biter); mutex_unlock(&pcl->lock); @@ -933,7 +931,7 @@ static bool z_erofs_collector_end(struct z_erofs_decompress_frontend *fe) erofs_workgroup_put(&pcl->obj); fe->pcl = NULL; - return true; + fe->backmost = false; } static int z_erofs_read_fragment(struct super_block *sb, struct page *page, @@ -984,8 +982,7 @@ repeat: offset + cur >= map->m_la + map->m_llen) { erofs_dbg("out-of-range map @ pos %llu", offset + cur); - if (z_erofs_collector_end(fe)) - fe->backmost = false; + z_erofs_pcluster_end(fe); map->m_la = offset + cur; map->m_llen = 0; err = z_erofs_map_blocks_iter(inode, map, 0); @@ -1001,7 +998,7 @@ repeat: map->m_flags & EROFS_MAP_FRAGMENT) goto hitted; - err = z_erofs_collector_begin(fe); + err = z_erofs_pcluster_begin(fe); if (err) goto out; @@ -1877,7 +1874,7 @@ static int z_erofs_read_folio(struct file *file, struct folio *folio) z_erofs_pcluster_readmore(&f, NULL, true); err = z_erofs_do_read_page(&f, page); z_erofs_pcluster_readmore(&f, NULL, false); - (void)z_erofs_collector_end(&f); + z_erofs_pcluster_end(&f); /* if some compressed cluster ready, need submit them anyway */ z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, 0), false); @@ -1924,7 +1921,7 @@ static void z_erofs_readahead(struct readahead_control *rac) put_page(page); } z_erofs_pcluster_readmore(&f, rac, false); - (void)z_erofs_collector_end(&f); + z_erofs_pcluster_end(&f); z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, nr_pages), true); erofs_put_metabuf(&f.map.buf); From dc94c3cc6b3fae027bc45b89ecb826bd4b6e9a2f Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Thu, 17 Aug 2023 16:28:08 +0800 Subject: [PATCH 074/139] UPSTREAM: erofs: move preparation logic into z_erofs_pcluster_begin() Some preparation logic should be part of z_erofs_pcluster_begin() instead of z_erofs_do_read_page(). Let's move now. 
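Concretely, the tail of z_erofs_pcluster_begin() takes over both cases; below is a trimmed sketch of what the hunk in this patch adds (the error message and some details are elided, so treat it as an outline rather than the literal code):

	if (!z_erofs_is_inline_pcluster(fe->pcl)) {
		/* bind cache first when cached decompression is preferred */
		z_erofs_bind_cache(fe);
	} else {	/* inline pcluster: read the metadata page directly */
		void *mptr;

		mptr = erofs_read_metabuf(&map->buf, sb, blknr, EROFS_NO_KMAP);
		if (IS_ERR(mptr))
			return PTR_ERR(mptr);
		get_page(map->buf.page);
		WRITE_ONCE(fe->pcl->compressed_bvecs[0].page, map->buf.page);
		fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE;
	}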
Reviewed-by: Yue Hu Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20230817082813.81180-3-hsiangkao@linux.alibaba.com Bug: 318378021 (cherry picked from commit aeebae9d77217709f8ae3edb0cd7858ec8c7a9d6) Change-Id: I4bf438d719742a18a6f3065a78bf027de5dae293 Signed-off-by: Sandeep Dhavale --- fs/erofs/zdata.c | 60 ++++++++++++++++++++++-------------------------- 1 file changed, 27 insertions(+), 33 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index ad921985883f..87e7d9f843bd 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -856,6 +856,8 @@ err_out: static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe) { struct erofs_map_blocks *map = &fe->map; + struct super_block *sb = fe->inode->i_sb; + erofs_blk_t blknr = erofs_blknr(sb, map->m_pa); struct erofs_workgroup *grp = NULL; int ret; @@ -865,8 +867,7 @@ static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe) DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_NIL); if (!(map->m_flags & EROFS_MAP_META)) { - grp = erofs_find_workgroup(fe->inode->i_sb, - map->m_pa >> PAGE_SHIFT); + grp = erofs_find_workgroup(sb, blknr); } else if ((map->m_pa & ~PAGE_MASK) + map->m_plen > PAGE_SIZE) { DBG_BUGON(1); return -EFSCORRUPTED; @@ -885,9 +886,26 @@ static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe) } else if (ret) { return ret; } + z_erofs_bvec_iter_begin(&fe->biter, &fe->pcl->bvset, Z_EROFS_INLINE_BVECS, fe->pcl->vcnt); - /* since file-backed online pages are traversed in reverse order */ + if (!z_erofs_is_inline_pcluster(fe->pcl)) { + /* bind cache first when cached decompression is preferred */ + z_erofs_bind_cache(fe); + } else { + void *mptr; + + mptr = erofs_read_metabuf(&map->buf, sb, blknr, EROFS_NO_KMAP); + if (IS_ERR(mptr)) { + ret = PTR_ERR(mptr); + erofs_err(sb, "failed to get inline data %d", ret); + return ret; + } + get_page(map->buf.page); + WRITE_ONCE(fe->pcl->compressed_bvecs[0].page, map->buf.page); + fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE; + } + /* file-backed inplace I/O pages are traversed in reverse order */ fe->icur = z_erofs_pclusterpages(fe->pcl); return 0; } @@ -988,39 +1006,15 @@ repeat: err = z_erofs_map_blocks_iter(inode, map, 0); if (err) goto out; - } else { - if (fe->pcl) - goto hitted; - /* didn't get a valid pcluster previously (very rare) */ + } else if (fe->pcl) { + goto hitted; } - if (!(map->m_flags & EROFS_MAP_MAPPED) || - map->m_flags & EROFS_MAP_FRAGMENT) - goto hitted; - - err = z_erofs_pcluster_begin(fe); - if (err) - goto out; - - if (z_erofs_is_inline_pcluster(fe->pcl)) { - void *mp; - - mp = erofs_read_metabuf(&fe->map.buf, inode->i_sb, - erofs_blknr(inode->i_sb, map->m_pa), - EROFS_NO_KMAP); - if (IS_ERR(mp)) { - err = PTR_ERR(mp); - erofs_err(inode->i_sb, - "failed to get inline page, err %d", err); + if ((map->m_flags & EROFS_MAP_MAPPED) && + !(map->m_flags & EROFS_MAP_FRAGMENT)) { + err = z_erofs_pcluster_begin(fe); + if (err) goto out; - } - get_page(fe->map.buf.page); - WRITE_ONCE(fe->pcl->compressed_bvecs[0].page, - fe->map.buf.page); - fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE; - } else { - /* bind cache first when cached decompression is preferred */ - z_erofs_bind_cache(fe); } hitted: /* From 0d329bbe5cac1bea2bf637e00846eee6982cba53 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Thu, 17 Aug 2023 16:28:09 +0800 Subject: [PATCH 075/139] BACKPORT: erofs: tidy up z_erofs_do_read_page() - Fix a typo: spiltted => split; - Move !EROFS_MAP_MAPPED and EROFS_MAP_FRAGMENT upwards; - Increase 
`split` in advance to avoid unnecessary repeats. Reviewed-by: Yue Hu Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20230817082813.81180-4-hsiangkao@linux.alibaba.com Bug: 318378021 Change-Id: I465fd33c7cbbe91d5da4b4ee2343a7b319534148 (cherry picked from commit e4c1cf523d820730a86cae2c6d55924833b6f7ac) [dhavale: resolved small conflict in zdata.c in z_erofs_do_read_page()] Signed-off-by: Sandeep Dhavale --- fs/erofs/zdata.c | 60 +++++++++++++++++++++--------------------------- 1 file changed, 26 insertions(+), 34 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 87e7d9f843bd..18fb6556cfd2 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -985,53 +985,35 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, struct erofs_map_blocks *const map = &fe->map; const loff_t offset = page_offset(page); bool tight = true, exclusive; - unsigned int cur, end, len, spiltted; + unsigned int cur, end, len, split; int err = 0; - /* register locked file pages as online pages in pack */ z_erofs_onlinepage_init(page); - spiltted = 0; + split = 0; end = PAGE_SIZE; repeat: - cur = end - 1; - - if (offset + cur < map->m_la || - offset + cur >= map->m_la + map->m_llen) { - erofs_dbg("out-of-range map @ pos %llu", offset + cur); - + if (offset + end - 1 < map->m_la || + offset + end - 1 >= map->m_la + map->m_llen) { + erofs_dbg("out-of-range map @ pos %llu", offset + end - 1); z_erofs_pcluster_end(fe); - map->m_la = offset + cur; + map->m_la = offset + end - 1; map->m_llen = 0; err = z_erofs_map_blocks_iter(inode, map, 0); if (err) goto out; - } else if (fe->pcl) { - goto hitted; } - if ((map->m_flags & EROFS_MAP_MAPPED) && - !(map->m_flags & EROFS_MAP_FRAGMENT)) { - err = z_erofs_pcluster_begin(fe); - if (err) - goto out; - } -hitted: - /* - * Ensure the current partial page belongs to this submit chain rather - * than other concurrent submit chains or the noio(bypass) chain since - * those chains are handled asynchronously thus the page cannot be used - * for inplace I/O or bvpage (should be processed in a strict order.) - */ - tight &= (fe->mode > Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE); + cur = offset > map->m_la ? 0 : map->m_la - offset; + /* bump split parts first to avoid several separate cases */ + ++split; - cur = end - min_t(erofs_off_t, offset + end - map->m_la, end); if (!(map->m_flags & EROFS_MAP_MAPPED)) { zero_user_segment(page, cur, end); - ++spiltted; tight = false; goto next_part; } + if (map->m_flags & EROFS_MAP_FRAGMENT) { erofs_off_t fpos = offset + cur - map->m_la; @@ -1040,12 +1022,24 @@ hitted: EROFS_I(inode)->z_fragmentoff + fpos); if (err) goto out; - ++spiltted; tight = false; goto next_part; } - exclusive = (!cur && (!spiltted || tight)); + if (!fe->pcl) { + err = z_erofs_pcluster_begin(fe); + if (err) + goto out; + } + + /* + * Ensure the current partial page belongs to this submit chain rather + * than other concurrent submit chains or the noio(bypass) chain since + * those chains are handled asynchronously thus the page cannot be used + * for inplace I/O or bvpage (should be processed in a strict order.) 
+ */ + tight &= (fe->mode > Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE); + exclusive = (!cur && ((split <= 1) || tight)); if (cur) tight &= (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED); @@ -1058,8 +1052,6 @@ hitted: goto out; z_erofs_onlinepage_split(page); - /* bump up the number of spiltted parts of a page */ - ++spiltted; if (fe->pcl->pageofs_out != (map->m_la & ~PAGE_MASK)) fe->pcl->multibases = true; if (fe->pcl->length < offset + end - map->m_la) { @@ -1084,8 +1076,8 @@ out: z_erofs_page_mark_eio(page); z_erofs_onlinepage_endio(page); - erofs_dbg("%s, finish page: %pK spiltted: %u map->m_llen %llu", - __func__, page, spiltted, map->m_llen); + erofs_dbg("%s, finish page: %pK split: %u map->m_llen %llu", + __func__, page, split, map->m_llen); return err; } From bdc5d268ba5ebf809f44e12bb31cce23703bb659 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Wed, 29 Nov 2023 02:04:31 +0800 Subject: [PATCH 076/139] FROMGIT: erofs: fix memory leak on short-lived bounced pages Both MicroLZMA and DEFLATE algorithms can use short-lived pages on demand for the overlapped inplace I/O decompression. However, those short-lived pages are actually added to `be->compressed_pages`. Thus, it should be checked instead of `pcl->compressed_bvecs`. The LZ4 algorithm doesn't work like this, so it won't be impacted. Fixes: 67139e36d970 ("erofs: introduce `z_erofs_parse_in_bvecs'") Reviewed-by: Yue Hu Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20231128180431.4116991-1-hsiangkao@linux.alibaba.com Bug: 318378021 Change-Id: Ia1f602e9944b884022a3e20db12af568304fd80c (cherry picked from commit 93d6fda7f926451a0fa1121b9558d75ca47e861e https://git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs.git dev) Signed-off-by: Sandeep Dhavale --- fs/erofs/zdata.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 18fb6556cfd2..51ae6de18cba 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -1328,12 +1328,11 @@ out: put_page(page); } else { for (i = 0; i < pclusterpages; ++i) { - page = pcl->compressed_bvecs[i].page; + /* consider shortlived pages added when decompressing */ + page = be->compressed_pages[i]; if (erofs_page_is_managed(sbi, page)) continue; - - /* recycle all individual short-lived pages */ (void)z_erofs_put_shortlivedpage(be->pagepool, page); WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL); } From 8a49ea94416049bab4c6074c6c533181eb419167 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Wed, 6 Dec 2023 12:55:34 +0800 Subject: [PATCH 077/139] FROMGIT: erofs: fix lz4 inplace decompression Currently EROFS can map another compressed buffer for inplace decompression, that was used to handle the cases that some pages of compressed data are actually not in-place I/O. However, like most simple LZ77 algorithms, LZ4 expects the compressed data is arranged at the end of the decompressed buffer and it explicitly uses memmove() to handle overlapping: __________________________________________________________ |_ direction of decompression --> ____ |_ compressed data _| Although EROFS arranges compressed data like this, it typically maps two individual virtual buffers so the relative order is uncertain. Previously, it was hardly observed since LZ4 only uses memmove() for short overlapped literals and x86/arm64 memmove implementations seem to completely cover it up and they don't have this issue. Juhyung reported that EROFS data corruption can be found on a new Intel x86 processor. 
After some analysis, it seems that recent x86 processors with the new FSRM feature expose this issue with "rep movsb". Let's strictly use the decompressed buffer for lz4 inplace decompression for now. Later, as a useful improvement, we could try to tie up these two buffers together in the correct order. Reported-and-tested-by: Juhyung Park Closes: https://lore.kernel.org/r/CAD14+f2AVKf8Fa2OO1aAUdDNTDsVzzR6ctU_oJSmTyd6zSYR2Q@mail.gmail.com Fixes: 0ffd71bcc3a0 ("staging: erofs: introduce LZ4 decompression inplace") Fixes: 598162d05080 ("erofs: support decompress big pcluster for lz4 backend") Cc: stable # 5.4+ Tested-by: Yifan Zhao Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20231206045534.3920847-1-hsiangkao@linux.alibaba.com Bug: 318378021 Change-Id: Ifd2981320f9f79b27bc7484d8906501a2fa05359 (cherry picked from commit 3c12466b6b7bf1e56f9b32c366a3d83d87afb4de https://git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs.git dev) Signed-off-by: Sandeep Dhavale --- fs/erofs/decompressor.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index cfad1eac7fd9..024e0b4733e8 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -122,11 +122,11 @@ static int z_erofs_lz4_prepare_dstpages(struct z_erofs_lz4_decompress_ctx *ctx, } static void *z_erofs_lz4_handle_overlap(struct z_erofs_lz4_decompress_ctx *ctx, - void *inpage, unsigned int *inputmargin, int *maptype, - bool may_inplace) + void *inpage, void *out, unsigned int *inputmargin, + int *maptype, bool may_inplace) { struct z_erofs_decompress_req *rq = ctx->rq; - unsigned int omargin, total, i, j; + unsigned int omargin, total, i; struct page **in; void *src, *tmp; @@ -136,12 +136,13 @@ static void *z_erofs_lz4_handle_overlap(struct z_erofs_lz4_decompress_ctx *ctx, omargin < LZ4_DECOMPRESS_INPLACE_MARGIN(rq->inputsize)) goto docopy; - for (i = 0; i < ctx->inpages; ++i) { - DBG_BUGON(rq->in[i] == NULL); - for (j = 0; j < ctx->outpages - ctx->inpages + i; ++j) - if (rq->out[j] == rq->in[i]) - goto docopy; - } + for (i = 0; i < ctx->inpages; ++i) + if (rq->out[ctx->outpages - ctx->inpages + i] != + rq->in[i]) + goto docopy; + kunmap_local(inpage); + *maptype = 3; + return out + ((ctx->outpages - ctx->inpages) << PAGE_SHIFT); } if (ctx->inpages <= 1) { @@ -149,7 +150,6 @@ static void *z_erofs_lz4_handle_overlap(struct z_erofs_lz4_decompress_ctx *ctx, return inpage; } kunmap_local(inpage); - might_sleep(); src = erofs_vm_map_ram(rq->in, ctx->inpages); if (!src) return ERR_PTR(-ENOMEM); @@ -205,12 +205,12 @@ int z_erofs_fixup_insize(struct z_erofs_decompress_req *rq, const char *padbuf, } static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx, - u8 *out) + u8 *dst) { struct z_erofs_decompress_req *rq = ctx->rq; bool support_0padding = false, may_inplace = false; unsigned int inputmargin; - u8 *headpage, *src; + u8 *out, *headpage, *src; int ret, maptype; DBG_BUGON(*rq->in == NULL); @@ -231,11 +231,12 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx, } inputmargin = rq->pageofs_in; - src = z_erofs_lz4_handle_overlap(ctx, headpage, &inputmargin, + src = z_erofs_lz4_handle_overlap(ctx, headpage, dst, &inputmargin, &maptype, may_inplace); if (IS_ERR(src)) return PTR_ERR(src); + out = dst + rq->pageofs_out; /* legacy format could compress extra data in a pcluster.
*/ if (rq->partial_decoding || !support_0padding) ret = LZ4_decompress_safe_partial(src + inputmargin, out, @@ -266,7 +267,7 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx, vm_unmap_ram(src, ctx->inpages); } else if (maptype == 2) { erofs_put_pcpubuf(src); - } else { + } else if (maptype != 3) { DBG_BUGON(1); return -EFAULT; } @@ -309,7 +310,7 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, } dstmap_out: - ret = z_erofs_lz4_decompress_mem(&ctx, dst + rq->pageofs_out); + ret = z_erofs_lz4_decompress_mem(&ctx, dst); if (!dst_maptype) kunmap_local(dst); else if (dst_maptype == 2) From 9d259220acdf19d29e0970740d10d2ec973c5d78 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Wed, 6 Dec 2023 17:10:53 +0800 Subject: [PATCH 078/139] FROMGIT: erofs: support I/O submission for sub-page compressed blocks Add a basic I/O submission path first to support sub-page blocks: - Temporary short-lived pages will be used entirely; - In-place I/O pages can be used partially, but compressed pages need to be able to be mapped in contiguous virtual memory. As a start, currently cache decompression is explicitly disabled for sub-page blocks, which will be supported in the future. Reviewed-by: Yue Hu Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20231206091057.87027-2-hsiangkao@linux.alibaba.com Bug: 318378021 Change-Id: Ib2cb6120805ab479a450580fc8774af131271791 (cherry picked from commit 192351616a9dde686492bcb9d1e4895a1411a527 https: //git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs.git dev) Signed-off-by: Sandeep Dhavale --- fs/erofs/zdata.c | 156 ++++++++++++++++++++++------------------------- 1 file changed, 74 insertions(+), 82 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 51ae6de18cba..bc8e6cdf3ae3 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -1452,86 +1452,85 @@ static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io, z_erofs_decompressqueue_work(&io->u.work); } -static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl, - unsigned int nr, - struct page **pagepool, - struct address_space *mc) +static void z_erofs_fill_bio_vec(struct bio_vec *bvec, + struct z_erofs_decompress_frontend *f, + struct z_erofs_pcluster *pcl, + unsigned int nr, + struct address_space *mc) { - const pgoff_t index = pcl->obj.index; gfp_t gfp = mapping_gfp_mask(mc); bool tocache = false; - + struct z_erofs_bvec *zbv = pcl->compressed_bvecs + nr; struct address_space *mapping; - struct page *oldpage, *page; - int justfound; + struct page *page, *oldpage; + int justfound, bs = i_blocksize(f->inode); + /* Except for inplace pages, the entire page can be used for I/Os */ + bvec->bv_offset = 0; + bvec->bv_len = PAGE_SIZE; repeat: - page = READ_ONCE(pcl->compressed_bvecs[nr].page); - oldpage = page; - - if (!page) + oldpage = READ_ONCE(zbv->page); + if (!oldpage) goto out_allocpage; - justfound = (unsigned long)page & 1UL; - page = (struct page *)((unsigned long)page & ~1UL); + justfound = (unsigned long)oldpage & 1UL; + page = (struct page *)((unsigned long)oldpage & ~1UL); + bvec->bv_page = page; + DBG_BUGON(z_erofs_is_shortlived_page(page)); /* - * preallocated cached pages, which is used to avoid direct reclaim - * otherwise, it will go inplace I/O path instead. + * Handle preallocated cached pages. We tried to allocate such pages + * without triggering direct reclaim. If allocation failed, inplace + * file-backed pages will be used instead. 
*/ if (page->private == Z_EROFS_PREALLOCATED_PAGE) { - WRITE_ONCE(pcl->compressed_bvecs[nr].page, page); set_page_private(page, 0); + WRITE_ONCE(zbv->page, page); tocache = true; goto out_tocache; } + mapping = READ_ONCE(page->mapping); - /* - * file-backed online pages in plcuster are all locked steady, - * therefore it is impossible for `mapping' to be NULL. + * File-backed pages for inplace I/Os are all locked steady, + * therefore it is impossible for `mapping` to be NULL. */ - if (mapping && mapping != mc) - /* ought to be unmanaged pages */ - goto out; - - /* directly return for shortlived page as well */ - if (z_erofs_is_shortlived_page(page)) - goto out; + if (mapping && mapping != mc) { + if (zbv->offset < 0) + bvec->bv_offset = round_up(-zbv->offset, bs); + bvec->bv_len = round_up(zbv->end, bs) - bvec->bv_offset; + return; + } lock_page(page); - /* only true if page reclaim goes wrong, should never happen */ DBG_BUGON(justfound && PagePrivate(page)); - /* the page is still in manage cache */ + /* the cached page is still in managed cache */ if (page->mapping == mc) { - WRITE_ONCE(pcl->compressed_bvecs[nr].page, page); - + WRITE_ONCE(zbv->page, page); + /* + * The cached page is still available but without a valid + * `->private` pcluster hint. Let's reconnect them. + */ if (!PagePrivate(page)) { - /* - * impossible to be !PagePrivate(page) for - * the current restriction as well if - * the page is already in compressed_bvecs[]. - */ DBG_BUGON(!justfound); - - justfound = 0; - set_page_private(page, (unsigned long)pcl); - SetPagePrivate(page); + /* compressed_bvecs[] already takes a ref */ + attach_page_private(page, pcl); + put_page(page); } - /* no need to submit io if it is already up-to-date */ + /* no need to submit if it is already up-to-date */ if (PageUptodate(page)) { unlock_page(page); - page = NULL; + bvec->bv_page = NULL; } - goto out; + return; } /* - * the managed page has been truncated, it's unsafe to - * reuse this one, let's allocate a new cache-managed page. + * It has been truncated, so it's unsafe to reuse this one. Let's + * allocate a new page for compressed data. 
*/ DBG_BUGON(page->mapping); DBG_BUGON(!justfound); @@ -1540,25 +1539,23 @@ repeat: unlock_page(page); put_page(page); out_allocpage: - page = erofs_allocpage(pagepool, gfp | __GFP_NOFAIL); - if (oldpage != cmpxchg(&pcl->compressed_bvecs[nr].page, - oldpage, page)) { - erofs_pagepool_add(pagepool, page); + page = erofs_allocpage(&f->pagepool, gfp | __GFP_NOFAIL); + if (oldpage != cmpxchg(&zbv->page, oldpage, page)) { + erofs_pagepool_add(&f->pagepool, page); cond_resched(); goto repeat; } + bvec->bv_page = page; out_tocache: - if (!tocache || add_to_page_cache_lru(page, mc, index + nr, gfp)) { - /* turn into temporary page if fails (1 ref) */ + if (!tocache || bs != PAGE_SIZE || + add_to_page_cache_lru(page, mc, pcl->obj.index + nr, gfp)) { + /* turn into a temporary shortlived page (1 ref) */ set_page_private(page, Z_EROFS_SHORTLIVED_PAGE); - goto out; + return; } attach_page_private(page, pcl); - /* drop a refcount added by allocpage (then we have 2 refs here) */ + /* drop a refcount added by allocpage (then 2 refs in total here) */ put_page(page); - -out: /* the only exit (for tracing and debugging) */ - return page; } static struct z_erofs_decompressqueue *jobqueue_init(struct super_block *sb, @@ -1613,7 +1610,7 @@ static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl, qtail[JQ_BYPASS] = &pcl->next; } -static void z_erofs_decompressqueue_endio(struct bio *bio) +static void z_erofs_submissionqueue_endio(struct bio *bio) { struct z_erofs_decompressqueue *q = bio->bi_private; blk_status_t err = bio->bi_status; @@ -1625,7 +1622,6 @@ static void z_erofs_decompressqueue_endio(struct bio *bio) DBG_BUGON(PageUptodate(page)); DBG_BUGON(z_erofs_page_is_invalidated(page)); - if (erofs_page_is_managed(EROFS_SB(q->sb), page)) { if (!err) SetPageUptodate(page); @@ -1648,17 +1644,14 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, struct z_erofs_decompressqueue *q[NR_JOBQUEUES]; z_erofs_next_pcluster_t owned_head = f->owned_head; /* bio is NULL initially, so no need to initialize last_{index,bdev} */ - pgoff_t last_index; + erofs_off_t last_pa; struct block_device *last_bdev; unsigned int nr_bios = 0; struct bio *bio = NULL; unsigned long pflags; int memstall = 0; - /* - * if managed cache is enabled, bypass jobqueue is needed, - * no need to read from device for all pclusters in this queue. - */ + /* No need to read from device for pclusters in the bypass queue. 
*/ q[JQ_BYPASS] = jobqueue_init(sb, fgq + JQ_BYPASS, NULL); q[JQ_SUBMIT] = jobqueue_init(sb, fgq + JQ_SUBMIT, force_fg); @@ -1671,7 +1664,8 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, do { struct erofs_map_dev mdev; struct z_erofs_pcluster *pcl; - pgoff_t cur, end; + erofs_off_t cur, end; + struct bio_vec bvec; unsigned int i = 0; bool bypass = true; @@ -1690,18 +1684,14 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, }; (void)erofs_map_dev(sb, &mdev); - cur = erofs_blknr(sb, mdev.m_pa); - end = cur + pcl->pclusterpages; - + cur = mdev.m_pa; + end = cur + (pcl->pclusterpages << PAGE_SHIFT); do { - struct page *page; - - page = pickup_page_for_submission(pcl, i++, - &f->pagepool, mc); - if (!page) + z_erofs_fill_bio_vec(&bvec, f, pcl, i++, mc); + if (!bvec.bv_page) continue; - if (bio && (cur != last_index + 1 || + if (bio && (cur != last_pa || last_bdev != mdev.m_bdev)) { submit_bio_retry: submit_bio(bio); @@ -1712,7 +1702,8 @@ submit_bio_retry: bio = NULL; } - if (unlikely(PageWorkingset(page)) && !memstall) { + if (unlikely(PageWorkingset(bvec.bv_page)) && + !memstall) { psi_memstall_enter(&pflags); memstall = 1; } @@ -1720,23 +1711,24 @@ submit_bio_retry: if (!bio) { bio = bio_alloc(mdev.m_bdev, BIO_MAX_VECS, REQ_OP_READ, GFP_NOIO); - bio->bi_end_io = z_erofs_decompressqueue_endio; - - last_bdev = mdev.m_bdev; - bio->bi_iter.bi_sector = (sector_t)cur << - (sb->s_blocksize_bits - 9); + bio->bi_end_io = z_erofs_submissionqueue_endio; + bio->bi_iter.bi_sector = cur >> 9; bio->bi_private = q[JQ_SUBMIT]; if (readahead) bio->bi_opf |= REQ_RAHEAD; ++nr_bios; + last_bdev = mdev.m_bdev; } - if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) + if (cur + bvec.bv_len > end) + bvec.bv_len = end - cur; + if (!bio_add_page(bio, bvec.bv_page, bvec.bv_len, + bvec.bv_offset)) goto submit_bio_retry; - last_index = cur; + last_pa = cur + bvec.bv_len; bypass = false; - } while (++cur < end); + } while ((cur += bvec.bv_len) < end); if (!bypass) qtail[JQ_SUBMIT] = &pcl->next; From 9d259220acdf19d29e0970740d10d2ec973c5d78 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Wed, 6 Dec 2023 17:10:54 +0800 Subject: [PATCH 079/139] FROMGIT: erofs: record `pclustersize` in bytes instead of pages Currently, compressed sizes are recorded in pages using `pclusterpages`. However, for tailpacking pclusters, `tailpacking_size` is used instead. This approach doesn't work when dealing with sub-page blocks. To address this, let's switch them to the unified `pclustersize` in bytes.
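Once sizes are tracked in bytes, the number of backing pages is derived on demand. The converted helper from the hunk below, shown assembled for context (the example numbers afterwards are illustrative only):

	/* bytes -> backing pages: round the byte size up to a page boundary */
	static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl)
	{
		return PAGE_ALIGN(pcl->pclustersize) >> PAGE_SHIFT;
	}

For example, with 4KiB pages a `pclustersize` of 1024 (two 512-byte blocks) still occupies one page, while 8192 occupies two; a page count alone cannot represent the former, which is why bytes are the right unit for sub-page blocks.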
Reviewed-by: Yue Hu Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20231206091057.87027-3-hsiangkao@linux.alibaba.com Bug: 318378021 Change-Id: Ia8c50a7b4adcf6cd161b1d6f8bfc5a7fd3371079 (cherry picked from commit 54ed3fdd66055d073cb1cd2c6c65bbc0683c40cf https: //git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs.git dev) Signed-off-by: Sandeep Dhavale --- fs/erofs/zdata.c | 64 ++++++++++++++++++++---------------------------- 1 file changed, 26 insertions(+), 38 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index bc8e6cdf3ae3..471129a41335 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -57,6 +57,9 @@ struct z_erofs_pcluster { /* L: total number of bvecs */ unsigned int vcnt; + /* I: pcluster size (compressed size) in bytes */ + unsigned int pclustersize; + /* I: page offset of start position of decompression */ unsigned short pageofs_out; @@ -71,14 +74,6 @@ struct z_erofs_pcluster { struct rcu_head rcu; }; - union { - /* I: physical cluster size in pages */ - unsigned short pclusterpages; - - /* I: tailpacking inline compressed size */ - unsigned short tailpacking_size; - }; - /* I: compression algorithm format */ unsigned char algorithmformat; @@ -118,9 +113,7 @@ static inline bool z_erofs_is_inline_pcluster(struct z_erofs_pcluster *pcl) static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl) { - if (z_erofs_is_inline_pcluster(pcl)) - return 1; - return pcl->pclusterpages; + return PAGE_ALIGN(pcl->pclustersize) >> PAGE_SHIFT; } /* @@ -306,12 +299,12 @@ static int z_erofs_create_pcluster_pool(void) return 0; } -static struct z_erofs_pcluster *z_erofs_alloc_pcluster(unsigned int nrpages) +static struct z_erofs_pcluster *z_erofs_alloc_pcluster(unsigned int size) { - int i; + unsigned int nrpages = PAGE_ALIGN(size) >> PAGE_SHIFT; + struct z_erofs_pcluster_slab *pcs = pcluster_pool; - for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) { - struct z_erofs_pcluster_slab *pcs = pcluster_pool + i; + for (; pcs < pcluster_pool + ARRAY_SIZE(pcluster_pool); ++pcs) { struct z_erofs_pcluster *pcl; if (nrpages > pcs->maxpages) @@ -320,7 +313,7 @@ static struct z_erofs_pcluster *z_erofs_alloc_pcluster(unsigned int nrpages) pcl = kmem_cache_zalloc(pcs->slab, GFP_NOFS); if (!pcl) return ERR_PTR(-ENOMEM); - pcl->pclusterpages = nrpages; + pcl->pclustersize = size; return pcl; } return ERR_PTR(-EINVAL); @@ -571,6 +564,7 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe) { struct address_space *mc = MNGD_MAPPING(EROFS_I_SB(fe->inode)); struct z_erofs_pcluster *pcl = fe->pcl; + unsigned int pclusterpages = z_erofs_pclusterpages(pcl); bool shouldalloc = z_erofs_should_alloc_cache(fe); bool standalone = true; /* @@ -584,10 +578,9 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe) if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED) return; - for (i = 0; i < pcl->pclusterpages; ++i) { - struct page *page; + for (i = 0; i < pclusterpages; ++i) { + struct page *page, *newpage; void *t; /* mark pages just found for debugging */ - struct page *newpage = NULL; /* the compressed page was loaded before */ if (READ_ONCE(pcl->compressed_bvecs[i].page)) @@ -597,6 +590,7 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe) if (page) { t = (void *)((unsigned long)page | 1); + newpage = NULL; } else { /* I/O is needed, no possible to decompress directly */ standalone = false; @@ -604,9 +598,8 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe) continue; /* - * try to use 
cached I/O if page allocation - * succeeds or fallback to in-place I/O instead - * to avoid any direct reclaim. + * Try cached I/O if allocation succeeds or fallback to + * in-place I/O instead to avoid any direct reclaim. */ newpage = erofs_allocpage(&fe->pagepool, gfp); if (!newpage) @@ -638,6 +631,7 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, { struct z_erofs_pcluster *const pcl = container_of(grp, struct z_erofs_pcluster, obj); + unsigned int pclusterpages = z_erofs_pclusterpages(pcl); int i; DBG_BUGON(z_erofs_is_inline_pcluster(pcl)); @@ -645,7 +639,7 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, * refcount of workgroup is now freezed as 1, * therefore no need to worry about available decompression users. */ - for (i = 0; i < pcl->pclusterpages; ++i) { + for (i = 0; i < pclusterpages; ++i) { struct page *page = pcl->compressed_bvecs[i].page; if (!page) @@ -669,6 +663,7 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, static bool z_erofs_cache_release_folio(struct folio *folio, gfp_t gfp) { struct z_erofs_pcluster *pcl = folio_get_private(folio); + unsigned int pclusterpages = z_erofs_pclusterpages(pcl); bool ret; int i; @@ -680,7 +675,7 @@ static bool z_erofs_cache_release_folio(struct folio *folio, gfp_t gfp) ret = false; DBG_BUGON(z_erofs_is_inline_pcluster(pcl)); - for (i = 0; i < pcl->pclusterpages; ++i) { + for (i = 0; i < pclusterpages; ++i) { if (pcl->compressed_bvecs[i].page == &folio->page) { WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL); ret = true; @@ -789,20 +784,20 @@ static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f) static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe) { struct erofs_map_blocks *map = &fe->map; + struct super_block *sb = fe->inode->i_sb; bool ztailpacking = map->m_flags & EROFS_MAP_META; struct z_erofs_pcluster *pcl; struct erofs_workgroup *grp; int err; if (!(map->m_flags & EROFS_MAP_ENCODED) || - (!ztailpacking && !(map->m_pa >> PAGE_SHIFT))) { + (!ztailpacking && !erofs_blknr(sb, map->m_pa))) { DBG_BUGON(1); return -EFSCORRUPTED; } /* no available pcluster, let's allocate one */ - pcl = z_erofs_alloc_pcluster(ztailpacking ? 
1 : - map->m_plen >> PAGE_SHIFT); + pcl = z_erofs_alloc_pcluster(map->m_plen); if (IS_ERR(pcl)) return PTR_ERR(pcl); @@ -826,9 +821,8 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe) if (ztailpacking) { pcl->obj.index = 0; /* which indicates ztailpacking */ pcl->pageofs_in = erofs_blkoff(fe->inode->i_sb, map->m_pa); - pcl->tailpacking_size = map->m_plen; } else { - pcl->obj.index = map->m_pa >> PAGE_SHIFT; + pcl->obj.index = erofs_blknr(sb, map->m_pa); grp = erofs_insert_workgroup(fe->inode->i_sb, &pcl->obj); if (IS_ERR(grp)) { @@ -1263,8 +1257,7 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, unsigned int pclusterpages = z_erofs_pclusterpages(pcl); const struct z_erofs_decompressor *decompressor = &erofs_decompressors[pcl->algorithmformat]; - unsigned int i, inputsize; - int err2; + int i, err2; struct page *page; bool overlapped; @@ -1301,18 +1294,13 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, if (err) goto out; - if (z_erofs_is_inline_pcluster(pcl)) - inputsize = pcl->tailpacking_size; - else - inputsize = pclusterpages * PAGE_SIZE; - err = decompressor->decompress(&(struct z_erofs_decompress_req) { .sb = be->sb, .in = be->compressed_pages, .out = be->decompressed_pages, .pageofs_in = pcl->pageofs_in, .pageofs_out = pcl->pageofs_out, - .inputsize = inputsize, + .inputsize = pcl->pclustersize, .outputsize = pcl->length, .alg = pcl->algorithmformat, .inplace_io = overlapped, @@ -1685,7 +1673,7 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, (void)erofs_map_dev(sb, &mdev); cur = mdev.m_pa; - end = cur + (pcl->pclusterpages << PAGE_SHIFT); + end = cur + pcl->pclustersize; do { z_erofs_fill_bio_vec(&bvec, f, pcl, i++, mc); if (!bvec.bv_page) From 0c6a18c75b54e9045df5284d1cd5d65afcdfe34d Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Wed, 6 Dec 2023 17:10:55 +0800 Subject: [PATCH 080/139] BACKPORT: FROMGIT: erofs: fix up compacted indexes for block size < 4096 Previously, the block size always equaled PAGE_SIZE, therefore `lclusterbits` couldn't be less than 12. Since sub-page compressed blocks are now considered, `lobits` for an lcluster in each pack cannot always be `lclusterbits` as before. Otherwise, there is not enough room for the special value `Z_EROFS_VLE_DI_D0_CBLKCNT`.
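A worked example, assuming Z_EROFS_VLE_DI_D0_CBLKCNT is (1 << 11) as defined in erofs_fs.h, so the marker occupies bit 11 and the lo field must be at least 12 bits wide (the formula itself is given in the next paragraph):

	lobits = max(lclusterbits, ilog2(Z_EROFS_VLE_DI_D0_CBLKCNT) + 1)
	       = max(12, 11 + 1) = 12   for 4KiB blocks (lclusterbits = 12, as before)
	       = max(9,  11 + 1) = 12   for 512-byte blocks (lclusterbits = 9)

So the special value always fits, whereas reusing `lclusterbits` directly would leave only 9 usable bits in the second case.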
To support smaller block sizes, `lobits` for each compacted lcluster is now calculated as: lobits = max(lclusterbits, ilog2(Z_EROFS_VLE_DI_D0_CBLKCNT) + 1) Reviewed-by: Yue Hu Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20231206091057.87027-4-hsiangkao@linux.alibaba.com Bug: 318378021 Change-Id: Iacd89e2b33ddf39ea40b90e88a2bf99bb5a83b31 (cherry picked from commit 8d2517aaeea3ab8651bb517bca8f3c8664d318ea https: //git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs.git dev) [dhavale: resolved conflicts in zmap.c due to older naming of constants and updated commit message also to use the older names] Signed-off-by: Sandeep Dhavale --- fs/erofs/zmap.c | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 8973ccad707d..f5d3ba39dd42 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -101,29 +101,26 @@ static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m, } static unsigned int decode_compactedbits(unsigned int lobits, - unsigned int lomask, u8 *in, unsigned int pos, u8 *type) { const unsigned int v = get_unaligned_le32(in + pos / 8) >> (pos & 7); - const unsigned int lo = v & lomask; + const unsigned int lo = v & ((1 << lobits) - 1); *type = (v >> lobits) & 3; return lo; } -static int get_compacted_la_distance(unsigned int lclusterbits, +static int get_compacted_la_distance(unsigned int lobits, unsigned int encodebits, unsigned int vcnt, u8 *in, int i) { - const unsigned int lomask = (1 << lclusterbits) - 1; unsigned int lo, d1 = 0; u8 type; DBG_BUGON(i >= vcnt); do { - lo = decode_compactedbits(lclusterbits, lomask, - in, encodebits * i, &type); + lo = decode_compactedbits(lobits, in, encodebits * i, &type); if (type != Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) return d1; @@ -142,15 +139,14 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, { struct erofs_inode *const vi = EROFS_I(m->inode); const unsigned int lclusterbits = vi->z_logical_clusterbits; - const unsigned int lomask = (1 << lclusterbits) - 1; - unsigned int vcnt, base, lo, encodebits, nblk, eofs; + unsigned int vcnt, base, lo, lobits, encodebits, nblk, eofs; int i; u8 *in, type; bool big_pcluster; if (1 << amortizedshift == 4 && lclusterbits <= 14) vcnt = 2; - else if (1 << amortizedshift == 2 && lclusterbits == 12) + else if (1 << amortizedshift == 2 && lclusterbits <= 12) vcnt = 16; else return -EOPNOTSUPP; @@ -159,6 +155,7 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, m->nextpackoff = round_down(pos, vcnt << amortizedshift) + (vcnt << amortizedshift); big_pcluster = vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1; + lobits = max(lclusterbits, ilog2(Z_EROFS_VLE_DI_D0_CBLKCNT) + 1U); encodebits = ((vcnt << amortizedshift) - sizeof(__le32)) * 8 / vcnt; eofs = erofs_blkoff(m->inode->i_sb, pos); base = round_down(eofs, vcnt << amortizedshift); @@ -166,15 +163,14 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, i = (eofs - base) >> amortizedshift; - lo = decode_compactedbits(lclusterbits, lomask, - in, encodebits * i, &type); + lo = decode_compactedbits(lobits, in, encodebits * i, &type); m->type = type; if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) { m->clusterofs = 1 << lclusterbits; /* figure out lookahead_distance: delta[1] if needed */ if (lookahead) - m->delta[1] = get_compacted_la_distance(lclusterbits, + m->delta[1] = get_compacted_la_distance(lobits, encodebits, vcnt, in, i); if (lo & Z_EROFS_VLE_DI_D0_CBLKCNT) { if (!big_pcluster) { @@ 
-193,8 +189,8 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, * of which lo saves delta[1] rather than delta[0]. * Hence, get delta[0] by the previous lcluster indirectly. */ - lo = decode_compactedbits(lclusterbits, lomask, - in, encodebits * (i - 1), &type); + lo = decode_compactedbits(lobits, in, + encodebits * (i - 1), &type); if (type != Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) lo = 0; else if (lo & Z_EROFS_VLE_DI_D0_CBLKCNT) @@ -209,8 +205,8 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, nblk = 1; while (i > 0) { --i; - lo = decode_compactedbits(lclusterbits, lomask, - in, encodebits * i, &type); + lo = decode_compactedbits(lobits, in, + encodebits * i, &type); if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) i -= lo; @@ -221,8 +217,8 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, nblk = 0; while (i > 0) { --i; - lo = decode_compactedbits(lclusterbits, lomask, - in, encodebits * i, &type); + lo = decode_compactedbits(lobits, in, + encodebits * i, &type); if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) { if (lo & Z_EROFS_VLE_DI_D0_CBLKCNT) { --i; From a18efa4e4aa95a7348469db998411322ac8193be Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Fri, 15 Dec 2023 00:13:37 +0800 Subject: [PATCH 081/139] FROMGIT: erofs: fix ztailpacking for subpage compressed blocks `pageofs_in` should be the compressed data offset of the page rather than of the block. Acked-by: Chao Yu Reviewed-by: Yue Hu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20231214161337.753049-1-hsiangkao@linux.alibaba.com Bug: 318378021 Change-Id: I0997a69b22b0f42c327c810359f55f5fa6a76275 (cherry picked from commit e5aba911dee5e20fa82efbe13e0af8f38ea459e7 https://git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs.git dev) Signed-off-by: Sandeep Dhavale --- fs/erofs/zdata.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 471129a41335..ffb5ed1b07b0 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -820,7 +820,6 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe) if (ztailpacking) { pcl->obj.index = 0; /* which indicates ztailpacking */ - pcl->pageofs_in = erofs_blkoff(fe->inode->i_sb, map->m_pa); } else { pcl->obj.index = erofs_blknr(sb, map->m_pa); @@ -897,6 +896,7 @@ static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe) } get_page(map->buf.page); WRITE_ONCE(fe->pcl->compressed_bvecs[0].page, map->buf.page); + fe->pcl->pageofs_in = map->m_pa & ~PAGE_MASK; fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE; } /* file-backed inplace I/O pages are traversed in reverse order */ From f466d5216404d611d6f9559bc97547edb7afa714 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Wed, 6 Dec 2023 17:10:56 +0800 Subject: [PATCH 082/139] FROMGIT: erofs: refine z_erofs_transform_plain() for sub-page block support Sub-page block support is still unusable even with previous commits if interlaced PLAIN pclusters exist. Such pclusters can be found if the fragment feature is enabled. This commit tries to handle "the head part" of interlaced PLAIN pclusters first: it was once explained in commit fdffc091e6f9 ("erofs: support interlaced uncompressed data for compressed files"). It uses a unified way for both shifted and interlaced PLAIN pclusters. As an added bonus, PLAIN pclusters larger than the block size are also supported now for the upcoming large lclusters.
Reviewed-by: Yue Hu Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20231206091057.87027-5-hsiangkao@linux.alibaba.com Bug: 318378021 Change-Id: I3d50132664f8754f56d62744420060108ed0da4f (cherry picked from commit 192351616a9dde686492bcb9d1e4895a1411a527 https: //git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs.git dev) Signed-off-by: Sandeep Dhavale --- fs/erofs/decompressor.c | 82 ++++++++++++++++++++++++----------------- 1 file changed, 49 insertions(+), 33 deletions(-) diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index 024e0b4733e8..38c7f9c96c68 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -321,43 +321,59 @@ dstmap_out: static int z_erofs_transform_plain(struct z_erofs_decompress_req *rq, struct page **pagepool) { - const unsigned int inpages = PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT; - const unsigned int outpages = + const unsigned int nrpages_in = + PAGE_ALIGN(rq->pageofs_in + rq->inputsize) >> PAGE_SHIFT; + const unsigned int nrpages_out = PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; - const unsigned int righthalf = min_t(unsigned int, rq->outputsize, - PAGE_SIZE - rq->pageofs_out); - const unsigned int lefthalf = rq->outputsize - righthalf; - const unsigned int interlaced_offset = - rq->alg == Z_EROFS_COMPRESSION_SHIFTED ? 0 : rq->pageofs_out; - u8 *src; + const unsigned int bs = rq->sb->s_blocksize; + unsigned int cur = 0, ni = 0, no, pi, po, insz, cnt; + u8 *kin; - if (outpages > 2 && rq->alg == Z_EROFS_COMPRESSION_SHIFTED) { - DBG_BUGON(1); - return -EFSCORRUPTED; - } - - if (rq->out[0] == *rq->in) { - DBG_BUGON(rq->pageofs_out); - return 0; - } - - src = kmap_local_page(rq->in[inpages - 1]) + rq->pageofs_in; - if (rq->out[0]) - memcpy_to_page(rq->out[0], rq->pageofs_out, - src + interlaced_offset, righthalf); - - if (outpages > inpages) { - DBG_BUGON(!rq->out[outpages - 1]); - if (rq->out[outpages - 1] != rq->in[inpages - 1]) { - memcpy_to_page(rq->out[outpages - 1], 0, src + - (interlaced_offset ? 
0 : righthalf), - lefthalf); - } else if (!interlaced_offset) { - memmove(src, src + righthalf, lefthalf); - flush_dcache_page(rq->in[inpages - 1]); + DBG_BUGON(rq->outputsize > rq->inputsize); + if (rq->alg == Z_EROFS_COMPRESSION_INTERLACED) { + cur = bs - (rq->pageofs_out & (bs - 1)); + pi = (rq->pageofs_in + rq->inputsize - cur) & ~PAGE_MASK; + cur = min(cur, rq->outputsize); + if (cur && rq->out[0]) { + kin = kmap_local_page(rq->in[nrpages_in - 1]); + if (rq->out[0] == rq->in[nrpages_in - 1]) { + memmove(kin + rq->pageofs_out, kin + pi, cur); + flush_dcache_page(rq->out[0]); + } else { + memcpy_to_page(rq->out[0], rq->pageofs_out, + kin + pi, cur); + } + kunmap_local(kin); } + rq->outputsize -= cur; } - kunmap_local(src); + + for (; rq->outputsize; rq->pageofs_in = 0, cur += PAGE_SIZE, ni++) { + insz = min_t(unsigned int, PAGE_SIZE - rq->pageofs_in, + rq->outputsize); + rq->outputsize -= insz; + if (!rq->in[ni]) + continue; + kin = kmap_local_page(rq->in[ni]); + pi = 0; + do { + no = (rq->pageofs_out + cur + pi) >> PAGE_SHIFT; + po = (rq->pageofs_out + cur + pi) & ~PAGE_MASK; + DBG_BUGON(no >= nrpages_out); + cnt = min_t(unsigned int, insz - pi, PAGE_SIZE - po); + if (rq->out[no] == rq->in[ni]) { + memmove(kin + po, + kin + rq->pageofs_in + pi, cnt); + flush_dcache_page(rq->out[no]); + } else if (rq->out[no]) { + memcpy_to_page(rq->out[no], po, + kin + rq->pageofs_in + pi, cnt); + } + pi += cnt; + } while (pi < insz); + kunmap_local(kin); + } + DBG_BUGON(ni > nrpages_in); return 0; } From 37e0a5b868093b50e698e43ac2412cd9a883f395 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Wed, 6 Dec 2023 17:10:57 +0800 Subject: [PATCH 083/139] BACKPORT: FROMGIT: erofs: enable sub-page compressed block support Let's just disable cached decompression and inplace I/Os for partial pages as the first step in order to enable sub-page block initial support. In other words, currently it works primarily based on temporary short-lived pages. Don't expect too much in terms of performance. Reviewed-by: Yue Hu Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20231206091057.87027-6-hsiangkao@linux.alibaba.com Bug: 318378021 Change-Id: I00238aa437f20c46d015bbe5ab7b706b80b8cfd7 (cherry picked from commit 0ee3a0d59e007320167a2e9f4b8bf1304ada7771 https://git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs.git dev) [dhavale: resolved conflicts in inode.c in erofs_fill_inode()] Signed-off-by: Sandeep Dhavale --- fs/erofs/inode.c | 7 +++++-- fs/erofs/zdata.c | 6 ++++-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index 187ca02bbc2d..190bc3ad5622 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -291,9 +291,12 @@ static int erofs_fill_inode(struct inode *inode) } if (erofs_inode_is_data_compressed(vi->datalayout)) { - if (!erofs_is_fscache_mode(inode->i_sb) && - inode->i_sb->s_blocksize_bits == PAGE_SHIFT) + if (!erofs_is_fscache_mode(inode->i_sb)) { + DO_ONCE_LITE_IF(inode->i_sb->s_blocksize != PAGE_SIZE, + erofs_info, inode->i_sb, + "EXPERIMENTAL EROFS subpage compressed block support in use. 
Use at your own risk!"); err = z_erofs_fill_inode(inode); + } else err = -EOPNOTSUPP; goto out_unlock; diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index ffb5ed1b07b0..0b1b6ca804b3 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -575,6 +575,8 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe) __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; unsigned int i; + if (i_blocksize(fe->inode) != PAGE_SIZE) + return; if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED) return; @@ -978,12 +980,12 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, struct inode *const inode = fe->inode; struct erofs_map_blocks *const map = &fe->map; const loff_t offset = page_offset(page); + const unsigned int bs = i_blocksize(inode); bool tight = true, exclusive; unsigned int cur, end, len, split; int err = 0; z_erofs_onlinepage_init(page); - split = 0; end = PAGE_SIZE; repeat: @@ -1033,7 +1035,7 @@ repeat: * for inplace I/O or bvpage (should be processed in a strict order.) */ tight &= (fe->mode > Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE); - exclusive = (!cur && ((split <= 1) || tight)); + exclusive = (!cur && ((split <= 1) || (tight && bs == PAGE_SIZE))); if (cur) tight &= (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED); From 7d50253c2790f5c8a2f43c4c737241ab2458a77f Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Mon, 16 Mar 2020 16:39:31 -0700 Subject: [PATCH 084/139] ANDROID: Export functions to be used with dma_map_ops in modules For modules to reuse default dma_map_ops implementations they need to be exported. Export the following functions: dma_direct_alloc dma_direct_free dma_common_mmap dma_common_get_sgtable dma_direct_get_required_mask Bug: 151050914 Signed-off-by: Suren Baghdasaryan Change-Id: Ia77b797fcd909fce01da7431bfbde282dc70b3b3 (cherry picked from commit fd31496dae939c7bf2ef874e08d4bf8c6ab738b3) Signed-off-by: Qian-Hao Huang (cherry picked from commit cdc9f6ef946f3c17b048b3ee9e36aa0bbc12d712) --- kernel/dma/direct.c | 3 +++ kernel/dma/ops_helpers.c | 2 ++ 2 files changed, 5 insertions(+) diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 71bb2e3440e2..09ca202ac480 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -43,6 +43,7 @@ u64 dma_direct_get_required_mask(struct device *dev) return (1ULL << (fls64(max_dma) - 1)) * 2 - 1; } +EXPORT_SYMBOL_GPL(dma_direct_get_required_mask); static gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, u64 *phys_limit) @@ -320,6 +321,7 @@ out_free_pages: __dma_direct_free_pages(dev, page, size); return NULL; } +EXPORT_SYMBOL_GPL(dma_direct_alloc); void dma_direct_free(struct device *dev, size_t size, void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs) @@ -365,6 +367,7 @@ void dma_direct_free(struct device *dev, size_t size, __dma_direct_free_pages(dev, dma_direct_to_page(dev, dma_addr), size); } +EXPORT_SYMBOL_GPL(dma_direct_free); struct page *dma_direct_alloc_pages(struct device *dev, size_t size, dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp) diff --git a/kernel/dma/ops_helpers.c b/kernel/dma/ops_helpers.c index af4a6ef48ce0..e28e1e17eaf5 100644 --- a/kernel/dma/ops_helpers.c +++ b/kernel/dma/ops_helpers.c @@ -27,6 +27,7 @@ int dma_common_get_sgtable(struct device *dev, struct sg_table *sgt, sg_set_page(sgt->sgl, page, PAGE_ALIGN(size), 0); return ret; } +EXPORT_SYMBOL_GPL(dma_common_get_sgtable); /* * Create userspace mapping for the DMA-coherent memory. 
@@ -57,6 +58,7 @@ int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, return -ENXIO; #endif /* CONFIG_MMU */ } +EXPORT_SYMBOL_GPL(dma_common_mmap); struct page *dma_common_alloc_pages(struct device *dev, size_t size, dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp) From 9c4bc457ab31d2a121da0b5884a6b7e927b1e1f9 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Sat, 12 Aug 2023 16:56:25 +0100 Subject: [PATCH 085/139] UPSTREAM: mm/memory.c: fix mismerge Fix a build issue. Link: https://lkml.kernel.org/r/ZNerqcNS4EBJA/2v@casper.infradead.org Fixes: 4aaa60dad4d1 ("mm: allow per-VMA locks on file-backed VMAs") Signed-off-by: Matthew Wilcox Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202308121909.XNYBtqNI-lkp@intel.com/ Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton (cherry picked from commit 08dff2810e8feb3096bf5c8242ab1649d1e8b1a4) Bug: 293665307 Change-Id: I07ce19f29c44831cdcf709fe1ce122d1963f0be2 Signed-off-by: Suren Baghdasaryan --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memory.c b/mm/memory.c index 56057f97afaf..24463fd7de64 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5514,7 +5514,7 @@ retry: * concurrent mremap() with MREMAP_DONTUNMAP could dissociate the VMA * from its anon_vma. */ - if (unlikely(!vma->anon_vma)) + if (vma_is_anonymous(vma) && !vma->anon_vma) goto inval_end_read; /* Check since vm_start/vm_end might change before we lock the VMA */ From b43b26b4cd1ff080ac1577a558de2962c8f03d07 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 6 Oct 2023 20:53:13 +0100 Subject: [PATCH 086/139] UPSTREAM: mm: make lock_folio_maybe_drop_mmap() VMA lock aware Patch series "Handle more faults under the VMA lock", v2. At this point, we're handling the majority of file-backed page faults under the VMA lock, using the ->map_pages entry point. This patch set attempts to expand that for the following situations: - We have to do a read. This could be because we've hit the point in the readahead window where we need to kick off the next readahead, or because the page is simply not present in cache. - We're handling a write fault. Most applications don't do I/O by writes to shared mmaps for very good reasons, but some do, and it'd be nice to not make that slow unnecessarily. - We're doing a COW of a private mapping (both PTE already present and PTE not-present). These are two different codepaths and I handle both of them in this patch set. There is no support in this patch set for drivers to mark themselves as being VMA lock friendly; they could implement the ->map_pages vm_operation, but if they do, they would be the first. This is probably something we want to change at some point in the future, and I've marked where to make that change in the code. There is very little performance change in the benchmarks we've run; mostly because the vast majority of page faults are handled through the other paths. I still think this patch series is useful for workloads that may take these paths more often, and just for cleaning up the fault path in general (it's now clearer why we have to retry in these cases). This patch (of 6): Drop the VMA lock instead of the mmap_lock if that's the one which is held.
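For reference, the release_fault_lock() helper used in the hunk below simply picks the unlock primitive matching whichever lock the fault path actually holds; a minimal sketch, assuming the FAULT_FLAG_VMA_LOCK flag introduced by the per-VMA lock series:

	static inline void release_fault_lock(struct vm_fault *vmf)
	{
		if (vmf->flags & FAULT_FLAG_VMA_LOCK)
			vma_end_read(vmf->vma);		/* drop the per-VMA read lock */
		else
			mmap_read_unlock(vmf->vma->vm_mm);	/* drop the mmap_lock */
	}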
Link: https://lkml.kernel.org/r/20231006195318.4087158-1-willy@infradead.org Link: https://lkml.kernel.org/r/20231006195318.4087158-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Suren Baghdasaryan Signed-off-by: Andrew Morton (cherry picked from commit 5d74b2ab2c15d596c470bae6626f345d5575a9d0) Bug: 293665307 Change-Id: Ife2d11ab12fb428868cd44751784cf731fbffe62 Signed-off-by: Suren Baghdasaryan --- mm/filemap.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 63a846a1c1a3..c38dc43bfc8c 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2967,7 +2967,7 @@ static int lock_folio_maybe_drop_mmap(struct vm_fault *vmf, struct folio *folio, /* * NOTE! This will make us return with VM_FAULT_RETRY, but with - * the mmap_lock still held. That's how FAULT_FLAG_RETRY_NOWAIT + * the fault lock still held. That's how FAULT_FLAG_RETRY_NOWAIT * is supposed to work. We have way too many special cases.. */ if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT) @@ -2977,13 +2977,14 @@ static int lock_folio_maybe_drop_mmap(struct vm_fault *vmf, struct folio *folio, if (vmf->flags & FAULT_FLAG_KILLABLE) { if (__folio_lock_killable(folio)) { /* - * We didn't have the right flags to drop the mmap_lock, - * but all fault_handlers only check for fatal signals - * if we return VM_FAULT_RETRY, so we need to drop the - * mmap_lock here and return 0 if we don't have a fpin. + * We didn't have the right flags to drop the + * fault lock, but all fault_handlers only check + * for fatal signals if we return VM_FAULT_RETRY, + * so we need to drop the fault lock here and + * return 0 if we don't have a fpin. */ if (*fpin == NULL) - mmap_read_unlock(vmf->vma->vm_mm); + release_fault_lock(vmf); return 0; } } else From 95af8a80bb7ba4ad46de6f743ec91a882f4cd624 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 6 Oct 2023 20:53:14 +0100 Subject: [PATCH 087/139] BACKPORT: mm: call wp_page_copy() under the VMA lock It is usually safe to call wp_page_copy() under the VMA lock. The only unsafe situation is when no anon_vma has been allocated for this VMA, and we have to look at adjacent VMAs to determine if their anon_vma can be shared. Since this happens only for the first COW of a page in this VMA, the majority of calls to wp_page_copy() do not need to fall back to the mmap_sem. Add vmf_anon_prepare() as an alternative to anon_vma_prepare() which will return RETRY if we currently hold the VMA lock and need to allocate an anon_vma. This lets us drop the check in do_wp_page(). 
Link: https://lkml.kernel.org/r/20231006195318.4087158-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Suren Baghdasaryan Signed-off-by: Andrew Morton (cherry picked from commit 164b06f238b986317131e6b61b2f22aabcbc2cc0) [surenb: resolved merge conflicts due to folio/page differences] Bug: 293665307 Change-Id: I39bdc247b375bd3dae8078b52c60fd4ce12e1850 Signed-off-by: Suren Baghdasaryan --- mm/memory.c | 39 ++++++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 13 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 24463fd7de64..3cccb73b0a13 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3099,6 +3099,21 @@ static inline void wp_page_reuse(struct vm_fault *vmf) count_vm_event(PGREUSE); } +static vm_fault_t vmf_anon_prepare(struct vm_fault *vmf) +{ + struct vm_area_struct *vma = vmf->vma; + + if (likely(vma->anon_vma)) + return 0; + if (vmf->flags & FAULT_FLAG_VMA_LOCK) { + vma_end_read(vma); + return VM_FAULT_RETRY; + } + if (__anon_vma_prepare(vma)) + return VM_FAULT_OOM; + return 0; +} + /* * Handle the case of a page which we actually need to copy to a new page, * either due to COW or unsharing. @@ -3126,12 +3141,13 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) pte_t entry; int page_copied = 0; struct mmu_notifier_range range; - int ret; + vm_fault_t ret; delayacct_wpcopy_start(); - if (unlikely(anon_vma_prepare(vma))) - goto oom; + ret = vmf_anon_prepare(vmf); + if (unlikely(ret)) + goto out; if (is_zero_pfn(pte_pfn(vmf->orig_pte))) { new_page = alloc_zeroed_user_highpage_movable(vma, @@ -3139,13 +3155,14 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) if (!new_page) goto oom; } else { + int err; new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address); if (!new_page) goto oom; - ret = __wp_page_copy_user(new_page, old_page, vmf); - if (ret) { + err = __wp_page_copy_user(new_page, old_page, vmf); + if (err) { /* * COW failed, if the fault was solved by other, * it's fine. If not, userspace would re-fault on @@ -3158,7 +3175,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) put_page(old_page); delayacct_wpcopy_end(); - return ret == -EHWPOISON ? VM_FAULT_HWPOISON : 0; + return err == -EHWPOISON ? VM_FAULT_HWPOISON : 0; } kmsan_copy_page_meta(new_page, old_page); } @@ -3271,11 +3288,13 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) oom_free_new: put_page(new_page); oom: + ret = VM_FAULT_OOM; +out: if (old_page) put_page(old_page); delayacct_wpcopy_end(); - return VM_FAULT_OOM; + return ret; } /** @@ -3510,12 +3529,6 @@ reuse: return wp_page_shared(vmf); } copy: - if ((vmf->flags & FAULT_FLAG_VMA_LOCK) && !vma->anon_vma) { - pte_unmap_unlock(vmf->pte, vmf->ptl); - vma_end_read(vmf->vma); - return VM_FAULT_RETRY; - } - /* * Ok, we need to copy. Oh, well.. */ From c7fa581a793af13559143914227a42dcc83fa3e7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 6 Oct 2023 20:53:15 +0100 Subject: [PATCH 088/139] UPSTREAM: mm: handle shared faults under the VMA lock There are many implementations of ->fault and some of them depend on mmap_lock being held. All vm_ops that implement ->map_pages() end up calling filemap_fault(), which I have audited to be sure it does not rely on mmap_lock. So (for now) key off ->map_pages existing as a flag to indicate that it's safe to call ->fault while only holding the vma lock. 
Link: https://lkml.kernel.org/r/20231006195318.4087158-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Suren Baghdasaryan Signed-off-by: Andrew Morton (cherry picked from commit 4ed4379881aa62588aba6442a9f362a8cf7624e6) Bug: 293665307 Change-Id: Ifb5ab3df5d05fb182d0cb52820fa24e28e2d6496 Signed-off-by: Suren Baghdasaryan --- mm/memory.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 3cccb73b0a13..0db26276bed7 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3099,6 +3099,21 @@ static inline void wp_page_reuse(struct vm_fault *vmf) count_vm_event(PGREUSE); } +/* + * We could add a bitflag somewhere, but for now, we know that all + * vm_ops that have a ->map_pages have been audited and don't need + * the mmap_lock to be held. + */ +static inline vm_fault_t vmf_can_call_fault(const struct vm_fault *vmf) +{ + struct vm_area_struct *vma = vmf->vma; + + if (vma->vm_ops->map_pages || !(vmf->flags & FAULT_FLAG_VMA_LOCK)) + return 0; + vma_end_read(vma); + return VM_FAULT_RETRY; +} + static vm_fault_t vmf_anon_prepare(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; @@ -4701,10 +4716,9 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf) struct vm_area_struct *vma = vmf->vma; vm_fault_t ret, tmp; - if (vmf->flags & FAULT_FLAG_VMA_LOCK) { - vma_end_read(vma); - return VM_FAULT_RETRY; - } + ret = vmf_can_call_fault(vmf); + if (ret) + return ret; ret = __do_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) From 6541fffd92e5584e6297b14b0ef9f5e09cf79e5d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 6 Oct 2023 20:53:16 +0100 Subject: [PATCH 089/139] UPSTREAM: mm: handle COW faults under the VMA lock If the page is not currently present in the page tables, we need to call the page fault handler to find out which page we're supposed to COW, so we need to both check that there is already an anon_vma and that the fault handler doesn't need the mmap_lock. Link: https://lkml.kernel.org/r/20231006195318.4087158-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Suren Baghdasaryan Signed-off-by: Andrew Morton (cherry picked from commit 4de8c93a4751e10737b6af65db42c743228c67a6) Bug: 293665307 Change-Id: If749a6f8fcf69d83bbf872c1d45865d1b1b77ea0 Signed-off-by: Suren Baghdasaryan --- mm/memory.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 0db26276bed7..cc0a95fc14b2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4672,13 +4672,11 @@ static vm_fault_t do_cow_fault(struct vm_fault *vmf) struct vm_area_struct *vma = vmf->vma; vm_fault_t ret; - if (vmf->flags & FAULT_FLAG_VMA_LOCK) { - vma_end_read(vma); - return VM_FAULT_RETRY; - } - - if (unlikely(anon_vma_prepare(vma))) - return VM_FAULT_OOM; + ret = vmf_can_call_fault(vmf); + if (!ret) + ret = vmf_anon_prepare(vmf); + if (ret) + return ret; vmf->cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address); if (!vmf->cow_page) From c1da94fa44e60c93e3896b087680e905116d69b3 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 6 Oct 2023 20:53:17 +0100 Subject: [PATCH 090/139] UPSTREAM: mm: handle read faults under the VMA lock Most file-backed faults are already handled through ->map_pages(), but if we need to do I/O we'll come this way. Since filemap_fault() is now safe to be called under the VMA lock, we can handle these faults under the VMA lock now. 
Link: https://lkml.kernel.org/r/20231006195318.4087158-6-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Suren Baghdasaryan Signed-off-by: Andrew Morton (cherry picked from commit 12214eba1992642eee5813a9cc9f626e5b2d1815) Bug: 293665307 Change-Id: Iee48af98b866d88d88ec01143eb26389ab373b6b Signed-off-by: Suren Baghdasaryan --- mm/memory.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index cc0a95fc14b2..ddebfa1457f4 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4651,10 +4651,9 @@ static vm_fault_t do_read_fault(struct vm_fault *vmf) return ret; } - if (vmf->flags & FAULT_FLAG_VMA_LOCK) { - vma_end_read(vmf->vma); - return VM_FAULT_RETRY; - } + ret = vmf_can_call_fault(vmf); + if (ret) + return ret; ret = __do_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) From 4a518d86339bb74f1edb918b0808bc755273258a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 6 Oct 2023 20:53:18 +0100 Subject: [PATCH 091/139] UPSTREAM: mm: handle write faults to RO pages under the VMA lock I think this is a pretty rare occurrence, but for consistency handle faults with the VMA lock held the same way that we handle other faults with the VMA lock held. Link: https://lkml.kernel.org/r/20231006195318.4087158-7-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Suren Baghdasaryan Signed-off-by: Andrew Morton (cherry picked from commit 4a68fef16df9d88d528094116f8bbd2dbfa62089) Bug: 293665307 Change-Id: I69cec218c8a1fe14df3268722e6b1be6dffe7978 Signed-off-by: Suren Baghdasaryan --- mm/memory.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index ddebfa1457f4..7f29d9394bdf 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3358,10 +3358,9 @@ static vm_fault_t wp_pfn_shared(struct vm_fault *vmf) vm_fault_t ret; pte_unmap_unlock(vmf->pte, vmf->ptl); - if (vmf->flags & FAULT_FLAG_VMA_LOCK) { - vma_end_read(vmf->vma); - return VM_FAULT_RETRY; - } + ret = vmf_can_call_fault(vmf); + if (ret) + return ret; vmf->flags |= FAULT_FLAG_MKWRITE; ret = vma->vm_ops->pfn_mkwrite(vmf); @@ -3385,10 +3384,10 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf) vm_fault_t tmp; pte_unmap_unlock(vmf->pte, vmf->ptl); - if (vmf->flags & FAULT_FLAG_VMA_LOCK) { + tmp = vmf_can_call_fault(vmf); + if (tmp) { put_page(vmf->page); - vma_end_read(vmf->vma); - return VM_FAULT_RETRY; + return tmp; } tmp = do_page_mkwrite(vmf); From 6c8f7108578a1f4ed013c3bec63784e7f25cd6bf Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Tue, 26 Dec 2023 13:46:10 -0800 Subject: [PATCH 092/139] FROMGIT: arch/mm/fault: fix major fault accounting when retrying under per-VMA lock A test [1] in Android test suite started failing after [2] was merged. It turns out that after handling a major fault under per-VMA lock, the process major fault counter does not register that fault as major. Before [2] read faults would be done under mmap_lock, in which case FAULT_FLAG_TRIED flag is set before retrying. That in turn causes mm_account_fault() to account the fault as major once retry completes. With per-VMA locks we often retry because a fault can't be handled without locking the whole mm using mmap_lock. Therefore such retries do not set FAULT_FLAG_TRIED flag. This logic does not work after [2] because we can now handle read major faults under per-VMA lock and upon retry the fact there was a major fault gets lost. 
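The accounting side shows why this flag matters: a fault is counted as major either when the handler reports VM_FAULT_MAJOR directly or when a retried fault carries FAULT_FLAG_TRIED. A sketch of that check, paraphrasing the upstream mm_account_fault() logic:

	/* cf. mm_account_fault(): a retried fault that was major stays major */
	bool major = (ret & VM_FAULT_MAJOR) || (flags & FAULT_FLAG_TRIED);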
Fix this by setting FAULT_FLAG_TRIED after retrying under per-VMA lock if VM_FAULT_MAJOR was returned. Ideally we would use an additional VM_FAULT bit to indicate the reason for the retry (could not handle under per-VMA lock vs other reason) but this simpler solution seems to work, so keeping it simple. [1] https://cs.android.com/android/platform/superproject/+/master:test/vts-testcase/kernel/api/drop_caches_prop/drop_caches_test.cpp [2] https://lore.kernel.org/all/20231006195318.4087158-6-willy@infradead.org/ Link: https://lkml.kernel.org/r/20231226214610.109282-1-surenb@google.com Fixes: 12214eba1992 ("mm: handle read faults under the VMA lock") Signed-off-by: Suren Baghdasaryan Cc: Matthew Wilcox Cc: Alexander Gordeev Cc: Andy Lutomirski Cc: Catalin Marinas Cc: Christophe Leroy Cc: Dave Hansen Cc: Gerald Schaefer Cc: Michael Ellerman Cc: Palmer Dabbelt Cc: Peter Zijlstra Cc: Will Deacon Signed-off-by: Andrew Morton (cherry picked from commit 46e714c729c8d1d8110bc0545d7ffe8a759c9dc0 https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm mm-hotfixes-stable) Bug: 317385399 Change-Id: Ic7e97bf610dcabb7d3ac2306b2f1213be0ddd269 Signed-off-by: Suren Baghdasaryan --- arch/arm64/mm/fault.c | 2 ++ arch/powerpc/mm/fault.c | 2 ++ arch/riscv/mm/fault.c | 2 ++ arch/s390/mm/fault.c | 3 +++ arch/x86/mm/fault.c | 2 ++ 5 files changed, 11 insertions(+) diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 9970a4785819..7d522b037d9a 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -619,6 +619,8 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr, goto done; } count_vm_vma_lock_event(VMA_LOCK_RETRY); + if (fault & VM_FAULT_MAJOR) + mm_flags |= FAULT_FLAG_TRIED; /* Quick path to respond to signals */ if (fault_signal_pending(fault, regs)) { diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index b1723094d464..ec23164ad768 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -496,6 +496,8 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address, goto done; } count_vm_vma_lock_event(VMA_LOCK_RETRY); + if (fault & VM_FAULT_MAJOR) + flags |= FAULT_FLAG_TRIED; if (fault_signal_pending(fault, regs)) return user_mode(regs) ? 0 : SIGBUS; diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c index 34a44febae86..d710bb834a2a 100644 --- a/arch/riscv/mm/fault.c +++ b/arch/riscv/mm/fault.c @@ -310,6 +310,8 @@ asmlinkage void do_page_fault(struct pt_regs *regs) goto done; } count_vm_vma_lock_event(VMA_LOCK_RETRY); + if (fault & VM_FAULT_MAJOR) + flags |= FAULT_FLAG_TRIED; if (fault_signal_pending(fault, regs)) { if (!user_mode(regs)) diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 0843adb266d1..60fed3c88332 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -420,6 +420,9 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access) goto out; } count_vm_vma_lock_event(VMA_LOCK_RETRY); + if (fault & VM_FAULT_MAJOR) + flags |= FAULT_FLAG_TRIED; + /* Quick path to respond to signals */ if (fault_signal_pending(fault, regs)) { fault = VM_FAULT_SIGNAL; diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 97599581ec6b..bcb5678b5b91 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1369,6 +1369,8 @@ void do_user_addr_fault(struct pt_regs *regs, goto done; } count_vm_vma_lock_event(VMA_LOCK_RETRY); + if (fault & VM_FAULT_MAJOR) + flags |= FAULT_FLAG_TRIED; /* Quick path to respond to signals */ if (fault_signal_pending(fault, regs)) { From 4d99e41ce174d65ef1d710922acb7a2c67bbaa0b Mon Sep 17 00:00:00 2001 From: Mukesh Ojha Date: Sat, 25 Nov 2023 02:41:58 +0530 Subject: [PATCH 093/139] FROMGIT: PM / devfreq: Synchronize devfreq_monitor_[start/stop] Switching the governor frequently in a loop can result in timer list corruption, where the delayed work's timer is cancelled from two places: once by cancel_delayed_work_sync() and again by expire_timers(), as can be seen from the traces[1]. while true do echo "simple_ondemand" > /sys/class/devfreq/1d84000.ufshc/governor echo "performance" > /sys/class/devfreq/1d84000.ufshc/governor done The issue is in the devfreq driver: devfreq_monitor_[start/stop] need to be synchronized so that the delayed work cannot be corrupted while it is being queued, running, or cancelled. Let's use the stop_polling flag and the devfreq lock to keep the timer instance from being queued twice and the work data from being corrupted. [1] ... ..
-0 [003] 9436.209662: timer_cancel timer=0xffffff80444f0428 -0 [003] 9436.209664: timer_expire_entry timer=0xffffff80444f0428 now=0x10022da1c function=__typeid__ZTSFvP10timer_listE_global_addr baseclk=0x10022da1c -0 [003] 9436.209718: timer_expire_exit timer=0xffffff80444f0428 kworker/u16:6-14217 [003] 9436.209863: timer_start timer=0xffffff80444f0428 function=__typeid__ZTSFvP10timer_listE_global_addr expires=0x10022da2b now=0x10022da1c flags=182452227 vendor.xxxyyy.ha-1593 [004] 9436.209888: timer_cancel timer=0xffffff80444f0428 vendor.xxxyyy.ha-1593 [004] 9436.216390: timer_init timer=0xffffff80444f0428 vendor.xxxyyy.ha-1593 [004] 9436.216392: timer_start timer=0xffffff80444f0428 function=__typeid__ZTSFvP10timer_listE_global_addr expires=0x10022da2c now=0x10022da1d flags=186646532 vendor.xxxyyy.ha-1593 [005] 9436.220992: timer_cancel timer=0xffffff80444f0428 xxxyyyTraceManag-7795 [004] 9436.261641: timer_cancel timer=0xffffff80444f0428 [2] 9436.261653][ C4] Unable to handle kernel paging request at virtual address dead00000000012a [ 9436.261664][ C4] Mem abort info: [ 9436.261666][ C4] ESR = 0x96000044 [ 9436.261669][ C4] EC = 0x25: DABT (current EL), IL = 32 bits [ 9436.261671][ C4] SET = 0, FnV = 0 [ 9436.261673][ C4] EA = 0, S1PTW = 0 [ 9436.261675][ C4] Data abort info: [ 9436.261677][ C4] ISV = 0, ISS = 0x00000044 [ 9436.261680][ C4] CM = 0, WnR = 1 [ 9436.261682][ C4] [dead00000000012a] address between user and kernel address ranges [ 9436.261685][ C4] Internal error: Oops: 96000044 [#1] PREEMPT SMP [ 9436.261701][ C4] Skip md ftrace buffer dump for: 0x3a982d0 ... [ 9436.262138][ C4] CPU: 4 PID: 7795 Comm: TraceManag Tainted: G S W O 5.10.149-android12-9-o-g17f915d29d0c #1 [ 9436.262141][ C4] Hardware name: Qualcomm Technologies, Inc. (DT) [ 9436.262144][ C4] pstate: 22400085 (nzCv daIf +PAN -UAO +TCO BTYPE=--) [ 9436.262161][ C4] pc : expire_timers+0x9c/0x438 [ 9436.262164][ C4] lr : expire_timers+0x2a4/0x438 [ 9436.262168][ C4] sp : ffffffc010023dd0 [ 9436.262171][ C4] x29: ffffffc010023df0 x28: ffffffd0636fdc18 [ 9436.262178][ C4] x27: ffffffd063569dd0 x26: ffffffd063536008 [ 9436.262182][ C4] x25: 0000000000000001 x24: ffffff88f7c69280 [ 9436.262185][ C4] x23: 00000000000000e0 x22: dead000000000122 [ 9436.262188][ C4] x21: 000000010022da29 x20: ffffff8af72b4e80 [ 9436.262191][ C4] x19: ffffffc010023e50 x18: ffffffc010025038 [ 9436.262195][ C4] x17: 0000000000000240 x16: 0000000000000201 [ 9436.262199][ C4] x15: ffffffffffffffff x14: ffffff889f3c3100 [ 9436.262203][ C4] x13: ffffff889f3c3100 x12: 00000000049f56b8 [ 9436.262207][ C4] x11: 00000000049f56b8 x10: 00000000ffffffff [ 9436.262212][ C4] x9 : ffffffc010023e50 x8 : dead000000000122 [ 9436.262216][ C4] x7 : ffffffffffffffff x6 : ffffffc0100239d8 [ 9436.262220][ C4] x5 : 0000000000000000 x4 : 0000000000000101 [ 9436.262223][ C4] x3 : 0000000000000080 x2 : ffffff889edc155c [ 9436.262227][ C4] x1 : ffffff8001005200 x0 : ffffff80444f0428 [ 9436.262232][ C4] Call trace: [ 9436.262236][ C4] expire_timers+0x9c/0x438 [ 9436.262240][ C4] __run_timers+0x1f0/0x330 [ 9436.262245][ C4] run_timer_softirq+0x28/0x58 [ 9436.262255][ C4] efi_header_end+0x168/0x5ec [ 9436.262265][ C4] __irq_exit_rcu+0x108/0x124 [ 9436.262274][ C4] __handle_domain_irq+0x118/0x1e4 [ 9436.262282][ C4] gic_handle_irq.30369+0x6c/0x2bc [ 9436.262286][ C4] el0_irq_naked+0x60/0x6c Bug: 317188938 Change-Id: I9a22325f6abbf28217c8f37b093cf77509b0139a Link: https://lore.kernel.org/all/1700860318-4025-1-git-send-email-quic_mojha@quicinc.com/ Reported-by: Joyyoung Huang 
Acked-by: MyungJoo Ham Signed-off-by: Mukesh Ojha Signed-off-by: Chanwoo Choi (cherry picked from commit aed5ed595960c6d301dcd4ed31aeaa7a8054c0c6 https://git.kernel.org/pub/scm/linux/kernel/git/chanwoo/linux.git devfreq-next) Signed-off-by: Srinivasarao Pathipati --- drivers/devfreq/devfreq.c | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/drivers/devfreq/devfreq.c b/drivers/devfreq/devfreq.c index fe6644f99887..8e9ba701a643 100644 --- a/drivers/devfreq/devfreq.c +++ b/drivers/devfreq/devfreq.c @@ -461,10 +461,14 @@ static void devfreq_monitor(struct work_struct *work) if (err) dev_err(&devfreq->dev, "dvfs failed with (%d) error\n", err); + if (devfreq->stop_polling) + goto out; + queue_delayed_work(devfreq_wq, &devfreq->work, msecs_to_jiffies(devfreq->profile->polling_ms)); - mutex_unlock(&devfreq->lock); +out: + mutex_unlock(&devfreq->lock); trace_devfreq_monitor(devfreq); } @@ -482,6 +486,10 @@ void devfreq_monitor_start(struct devfreq *devfreq) if (IS_SUPPORTED_FLAG(devfreq->governor->flags, IRQ_DRIVEN)) return; + mutex_lock(&devfreq->lock); + if (delayed_work_pending(&devfreq->work)) + goto out; + switch (devfreq->profile->timer) { case DEVFREQ_TIMER_DEFERRABLE: INIT_DEFERRABLE_WORK(&devfreq->work, devfreq_monitor); @@ -490,12 +498,16 @@ void devfreq_monitor_start(struct devfreq *devfreq) INIT_DELAYED_WORK(&devfreq->work, devfreq_monitor); break; default: - return; + goto out; } if (devfreq->profile->polling_ms) queue_delayed_work(devfreq_wq, &devfreq->work, msecs_to_jiffies(devfreq->profile->polling_ms)); + +out: + devfreq->stop_polling = false; + mutex_unlock(&devfreq->lock); } EXPORT_SYMBOL(devfreq_monitor_start); @@ -512,6 +524,14 @@ void devfreq_monitor_stop(struct devfreq *devfreq) if (IS_SUPPORTED_FLAG(devfreq->governor->flags, IRQ_DRIVEN)) return; + mutex_lock(&devfreq->lock); + if (devfreq->stop_polling) { + mutex_unlock(&devfreq->lock); + return; + } + + devfreq->stop_polling = true; + mutex_unlock(&devfreq->lock); cancel_delayed_work_sync(&devfreq->work); } EXPORT_SYMBOL(devfreq_monitor_stop); From 99288e911ad553ffee55b3da942f6e722fc0573b Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Wed, 20 Dec 2023 13:35:00 +0000 Subject: [PATCH 094/139] ANDROID: KVM: arm64: Document module_change_host_prot_range When this pKVM module op was introduced, its documentation was omitted. Bug: 308373293 Change-Id: I9e471414e72a1ee04c132de4ed95d77e815ae8c9 Signed-off-by: Vincent Donnefort --- arch/arm64/include/asm/kvm_pkvm_module.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/arm64/include/asm/kvm_pkvm_module.h b/arch/arm64/include/asm/kvm_pkvm_module.h index bf68d862b7d8..b8e5f8064358 100644 --- a/arch/arm64/include/asm/kvm_pkvm_module.h +++ b/arch/arm64/include/asm/kvm_pkvm_module.h @@ -72,6 +72,11 @@ enum pkvm_psci_notification { * @register_host_perm_fault_handler), otherwise * pKVM will be unable to handle this fault and the * CPU will be stuck in an infinite loop. + * @host_stage2_mod_prot_range: Similar to @host_stage2_mod_prot, but takes a + * range as an argument (@nr_pages). This + * considerably speeds up the process for a + * contiguous memory region, compared to the + * per-page @host_stage2_mod_prot. * @host_stage2_get_leaf: Query the host's stage2 page-table entry for * the page @phys.
* @register_host_smc_handler: @cb is called whenever the host issues an SMC From 8fc25d7862c9c669025dd534d3eefd20d071ea0d Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 19 Oct 2023 15:51:08 -0700 Subject: [PATCH 095/139] FROMGIT: f2fs: do not return EFSCORRUPTED, but try to run online repair If we return the error, there's no way to recover the status as of now, since fsck does not fix the xattr boundary issue. Bug: 305658663 Cc: stable@vger.kernel.org Signed-off-by: Jaegeuk Kim (cherry picked from commit 50a472bbc79ff9d5a88be8019a60e936cadf9f13 https://git.kernel.org/pub/scm/linux/kernel/git/jaegeuk/f2fs.git dev) Change-Id: I55060a4eede3f5f85066aba22a6ab7155517e5c4 (cherry picked from commit 70113b9d489050d3e7a6f28e0cd6e43f104fc132) (cherry picked from commit 2c1f3789d609bd549f14c019b6c7b311bfd2fa64) --- fs/f2fs/node.c | 4 +++- fs/f2fs/xattr.c | 20 +++++++++++++------- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 5010a33acb8a..60708d47aaa8 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2734,7 +2734,9 @@ recover_xnid: f2fs_update_inode_page(inode); /* 3: update and set xattr node page dirty */ - memcpy(F2FS_NODE(xpage), F2FS_NODE(page), VALID_XATTR_BLOCK_SIZE); + if (page) + memcpy(F2FS_NODE(xpage), F2FS_NODE(page), + VALID_XATTR_BLOCK_SIZE); set_page_dirty(xpage); f2fs_put_page(xpage, 1); diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index db3b641f2158..adaad16468d8 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -363,10 +363,10 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage, *xe = __find_xattr(cur_addr, last_txattr_addr, NULL, index, len, name); if (!*xe) { - f2fs_err(F2FS_I_SB(inode), "inode (%lu) has corrupted xattr", + f2fs_err(F2FS_I_SB(inode), "lookup inode (%lu) has corrupted xattr", inode->i_ino); set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); - err = -EFSCORRUPTED; + err = -ENODATA; f2fs_handle_error(F2FS_I_SB(inode), ERROR_CORRUPTED_XATTR); goto out; @@ -583,13 +583,12 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) if ((void *)(entry) + sizeof(__u32) > last_base_addr || (void *)XATTR_NEXT_ENTRY(entry) > last_base_addr) { - f2fs_err(F2FS_I_SB(inode), "inode (%lu) has corrupted xattr", + f2fs_err(F2FS_I_SB(inode), "list inode (%lu) has corrupted xattr", inode->i_ino); set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); - error = -EFSCORRUPTED; f2fs_handle_error(F2FS_I_SB(inode), ERROR_CORRUPTED_XATTR); - goto cleanup; + break; } if (!handler || (handler->list && !handler->list(dentry))) @@ -650,7 +649,7 @@ static int __f2fs_setxattr(struct inode *inode, int index, if (size > MAX_VALUE_LEN(inode)) return -E2BIG; - +retry: error = read_all_xattrs(inode, ipage, &base_addr); if (error) return error; @@ -660,7 +659,14 @@ static int __f2fs_setxattr(struct inode *inode, int index, /* find entry with wanted name. 
*/ here = __find_xattr(base_addr, last_base_addr, NULL, index, len, name); if (!here) { - f2fs_err(F2FS_I_SB(inode), "inode (%lu) has corrupted xattr", + if (!F2FS_I(inode)->i_xattr_nid) { + f2fs_notice(F2FS_I_SB(inode), + "recover xattr in inode (%lu)", inode->i_ino); + f2fs_recover_xattr_data(inode, NULL); + kfree(base_addr); + goto retry; + } + f2fs_err(F2FS_I_SB(inode), "set inode (%lu) has corrupted xattr", inode->i_ino); set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); error = -EFSCORRUPTED; From 717d1f8f91abc780615bd85ac778f702aff6fde4 Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Tue, 19 Dec 2023 16:40:19 +0000 Subject: [PATCH 096/139] ANDROID: KVM: arm64: Fix host_smc print typo From pKVM point of view, unknown SMCs are simply forwarded, we can't consider them invalid or not. This was probably a typo following a copy of the host_hcall event. Bug: 299430621 Change-Id: Ieb53f985a5187a8b5a9feb4a95982b15cdc1b04a Signed-off-by: Vincent Donnefort --- arch/arm64/include/asm/kvm_hypevents.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/kvm_hypevents.h b/arch/arm64/include/asm/kvm_hypevents.h index 8a2dd41b8569..c507d8978444 100644 --- a/arch/arm64/include/asm/kvm_hypevents.h +++ b/arch/arm64/include/asm/kvm_hypevents.h @@ -53,7 +53,7 @@ HYP_EVENT(host_smc, __entry->id = id; __entry->forwarded = forwarded; ), - HE_PRINTK("id=%llu invalid=%u", + HE_PRINTK("id=%llu forwarded=%u", __entry->id, __entry->forwarded) ); From 15a93de4641d882e26cf1de50af69679c1a20aee Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Wed, 20 Dec 2023 16:18:05 +0000 Subject: [PATCH 097/139] ANDROID: KVM: arm64: Fix hyp event alignment The structures that define hyp events must be packed so they match their format definitions in the tracefs file hyp/events/hyp//format. Bug: 299430621 Change-Id: Ia7e1a686744d5c9c3f8a21881f03228c8acecade Signed-off-by: Vincent Donnefort --- arch/arm64/include/asm/kvm_hypevents_defs.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/arm64/include/asm/kvm_hypevents_defs.h b/arch/arm64/include/asm/kvm_hypevents_defs.h index e228d894a898..606f3477ecd3 100644 --- a/arch/arm64/include/asm/kvm_hypevents_defs.h +++ b/arch/arm64/include/asm/kvm_hypevents_defs.h @@ -15,10 +15,10 @@ struct hyp_entry_hdr { /* * Hyp events definitions common to the hyp and the host */ -#define HYP_EVENT_FORMAT(__name, __struct) \ - struct trace_hyp_format_##__name { \ - struct hyp_entry_hdr hdr; \ - __struct \ +#define HYP_EVENT_FORMAT(__name, __struct) \ + struct __packed trace_hyp_format_##__name { \ + struct hyp_entry_hdr hdr; \ + __struct \ } #define HE_PROTO(args...) 
args From e74417834ea0485b4a1e9ba9d8d7f2fed143eddd Mon Sep 17 00:00:00 2001 From: Ken Huang Date: Thu, 4 Jan 2024 23:18:14 +0800 Subject: [PATCH 098/139] ANDROID: Update the ABI symbol list Adding the following symbols: - __drmm_crtc_alloc_with_planes Bug: 275278929 Change-Id: I41b6069612d44214f474ed82ee2a4b07ca739302 Signed-off-by: Ken Huang --- android/abi_gki_aarch64_pixel | 1 + 1 file changed, 1 insertion(+) diff --git a/android/abi_gki_aarch64_pixel b/android/abi_gki_aarch64_pixel index af22721e20b8..a01a49721a1a 100644 --- a/android/abi_gki_aarch64_pixel +++ b/android/abi_gki_aarch64_pixel @@ -739,6 +739,7 @@ drm_kms_helper_poll_fini drm_kms_helper_poll_init drm_match_cea_mode + __drmm_crtc_alloc_with_planes drmm_kmalloc drmm_mode_config_init drm_mode_config_reset From 066d57de875d52d8fd4fbb9248f42e3c8a1066cc Mon Sep 17 00:00:00 2001 From: Kever Yang Date: Wed, 3 Jan 2024 22:02:54 +0800 Subject: [PATCH 099/139] ANDROID: GKI: Enable symbols for v4l2 in async and fwnode INFO: 14 function symbol(s) added 'struct v4l2_async_subdev* __v4l2_async_nf_add_fwnode(struct v4l2_async_notifier*, struct fwnode_handle*, unsigned int)' 'struct v4l2_async_subdev* __v4l2_async_nf_add_fwnode_remote(struct v4l2_async_notifier*, struct fwnode_handle*, unsigned int)' 'void v4l2_async_nf_cleanup(struct v4l2_async_notifier*)' 'void v4l2_async_nf_init(struct v4l2_async_notifier*)' 'int v4l2_async_nf_parse_fwnode_endpoints(struct device*, struct v4l2_async_notifier*, size_t, parse_endpoint_func)' 'int v4l2_async_nf_register(struct v4l2_device*, struct v4l2_async_notifier*)' 'void v4l2_async_nf_unregister(struct v4l2_async_notifier*)' 'int v4l2_async_register_subdev(struct v4l2_subdev*)' 'int v4l2_async_register_subdev_sensor(struct v4l2_subdev*)' 'int v4l2_async_subdev_nf_register(struct v4l2_subdev*, struct v4l2_async_notifier*)' 'void v4l2_async_unregister_subdev(struct v4l2_subdev*)' 'int v4l2_fwnode_endpoint_alloc_parse(struct fwnode_handle*, struct v4l2_fwnode_endpoint*)' 'void v4l2_fwnode_endpoint_free(struct v4l2_fwnode_endpoint*)' 'int v4l2_fwnode_endpoint_parse(struct fwnode_handle*, struct v4l2_fwnode_endpoint*)' Bug: 300024866 Change-Id: I7e4c2faac5c8341a19ea3fed694190d38679dc5b Signed-off-by: Kever Yang --- android/abi_gki_aarch64.stg | 269 +++++++++++++++++++++++++++++++ android/abi_gki_aarch64_rockchip | 14 ++ 2 files changed, 283 insertions(+) diff --git a/android/abi_gki_aarch64.stg b/android/abi_gki_aarch64.stg index f2b7c07a7716..4a190c90a757 100644 --- a/android/abi_gki_aarch64.stg +++ b/android/abi_gki_aarch64.stg @@ -9018,6 +9018,11 @@ pointer_reference { kind: POINTER pointee_type_id: 0x72d62916 } +pointer_reference { + id: 0x1625e208 + kind: POINTER + pointee_type_id: 0x72d76ebd +} pointer_reference { id: 0x162c7a70 kind: POINTER @@ -18183,6 +18188,11 @@ pointer_reference { kind: POINTER pointee_type_id: 0x9d41cc1a } +pointer_reference { + id: 0x2dc069c5 + kind: POINTER + pointee_type_id: 0x9d414188 +} pointer_reference { id: 0x2dc1540f kind: POINTER @@ -30713,6 +30723,11 @@ typedef { name: "p4d_t" referred_type_id: 0x148546d4 } +typedef { + id: 0xbad82a2c + name: "parse_endpoint_func" + referred_type_id: 0x2dc069c5 +} typedef { id: 0x8ef19fe7 name: "pci_bus_flags_t" @@ -52189,6 +52204,11 @@ member { name: "base" type_id: 0x180f82e8 } +member { + id: 0x85d2e2e4 + name: "base" + type_id: 0x080c6fc2 +} member { id: 0x85d6188a name: "base" @@ -56419,6 +56439,12 @@ member { name: "bus" type_id: 0x2309ad3e } +member { + id: 0xdaf846cc + name: "bus" + type_id: 0x286a95aa + offset: 160 +} 
member { id: 0x1639ef00 name: "bus_cleanup" @@ -56648,6 +56674,12 @@ member { type_id: 0x945e7ef6 offset: 448 } +member { + id: 0x2c928e64 + name: "bus_type" + type_id: 0x3c57148f + offset: 128 +} member { id: 0xb43c45b4 name: "bus_width" @@ -116596,6 +116628,12 @@ member { name: "link_fd" type_id: 0xe62ebf07 } +member { + id: 0x6075ccdc + name: "link_frequencies" + type_id: 0x2e18f543 + offset: 512 +} member { id: 0x178cf8a4 name: "link_gen" @@ -126808,6 +126846,18 @@ member { name: "mipi_csi1" type_id: 0xe49bfc8b } +member { + id: 0xa7e5d7c1 + name: "mipi_csi1" + type_id: 0xe49bfc8b + offset: 64 +} +member { + id: 0xeda56411 + name: "mipi_csi2" + type_id: 0xe72f0de6 + offset: 128 +} member { id: 0xeda56dd3 name: "mipi_csi2" @@ -136120,6 +136170,12 @@ member { type_id: 0xe62ebf07 offset: 672 } +member { + id: 0x4519d21b + name: "nr_of_link_frequencies" + type_id: 0x4585663f + offset: 576 +} member { id: 0x9c6b34f7 name: "nr_off" @@ -211032,6 +211088,16 @@ struct_union { member_id: 0x9683f73d } } +struct_union { + id: 0x286a95aa + kind: STRUCT + definition { + bytesize: 40 + member_id: 0xc0bc4db7 + member_id: 0xa7e5d7c1 + member_id: 0xeda56411 + } +} struct_union { id: 0x2880e524 kind: STRUCT @@ -265808,6 +265874,19 @@ struct_union { member_id: 0x465224ed } } +struct_union { + id: 0x72d76ebd + kind: STRUCT + name: "v4l2_fwnode_endpoint" + definition { + bytesize: 80 + member_id: 0x85d2e2e4 + member_id: 0x2c928e64 + member_id: 0xdaf846cc + member_id: 0x6075ccdc + member_id: 0x4519d21b + } +} struct_union { id: 0xccd4dc1a kind: STRUCT @@ -287051,6 +287130,13 @@ enumeration { } } } +function { + id: 0x003279c7 + return_type_id: 0x3c2dd1ca + parameter_id: 0x3cfe7778 + parameter_id: 0x0490bb4a + parameter_id: 0x4585663f +} function { id: 0x004cf563 return_type_id: 0x48b5725f @@ -291664,6 +291750,11 @@ function { parameter_id: 0x14528516 parameter_id: 0x2712b6f9 } +function { + id: 0x15112911 + return_type_id: 0x48b5725f + parameter_id: 0x1625e208 +} function { id: 0x151457b1 return_type_id: 0xd5cc9c9a @@ -298655,6 +298746,11 @@ function { parameter_id: 0x3c2755a3 parameter_id: 0x0cbf60eb } +function { + id: 0x1fa7cc4d + return_type_id: 0x48b5725f + parameter_id: 0x3cfe7778 +} function { id: 0x1fa8b2bc return_type_id: 0x48b5725f @@ -321574,6 +321670,12 @@ function { parameter_id: 0x04b193cc parameter_id: 0x0335a07f } +function { + id: 0x9ca0dc77 + return_type_id: 0x6720d32f + parameter_id: 0x074f1a14 + parameter_id: 0x3cfe7778 +} function { id: 0x9ca1921c return_type_id: 0x6720d32f @@ -322038,6 +322140,12 @@ function { parameter_id: 0x054f691a parameter_id: 0x0aa1f0ee } +function { + id: 0x9cfc5a75 + return_type_id: 0x6720d32f + parameter_id: 0x0490bb4a + parameter_id: 0x1625e208 +} function { id: 0x9cfd713b return_type_id: 0x6720d32f @@ -322060,6 +322168,12 @@ function { parameter_id: 0x02ed0755 parameter_id: 0x0e68dab6 } +function { + id: 0x9d027320 + return_type_id: 0x6720d32f + parameter_id: 0x01c5a749 + parameter_id: 0x3cfe7778 +} function { id: 0x9d038726 return_type_id: 0x6720d32f @@ -322608,6 +322722,13 @@ function { parameter_id: 0x0258f96e parameter_id: 0x15f20052 } +function { + id: 0x9d414188 + return_type_id: 0x6720d32f + parameter_id: 0x0258f96e + parameter_id: 0x1625e208 + parameter_id: 0x3c2dd1ca +} function { id: 0x9d419277 return_type_id: 0x6720d32f @@ -323728,6 +323849,14 @@ function { parameter_id: 0x33756485 parameter_id: 0x064d6086 } +function { + id: 0x9ddac293 + return_type_id: 0x6720d32f + parameter_id: 0x0258f96e + parameter_id: 0x3cfe7778 + parameter_id: 0xf435685e + 
parameter_id: 0xbad82a2c +} function { id: 0x9ddaf106 return_type_id: 0x6720d32f @@ -343026,6 +343155,24 @@ elf_symbol { type_id: 0x20cd94dc full_name: "__usecs_to_jiffies" } +elf_symbol { + id: 0xf51d746f + name: "__v4l2_async_nf_add_fwnode" + is_defined: true + symbol_type: FUNCTION + crc: 0x03599cac + type_id: 0x003279c7 + full_name: "__v4l2_async_nf_add_fwnode" +} +elf_symbol { + id: 0xe13e16ca + name: "__v4l2_async_nf_add_fwnode_remote" + is_defined: true + symbol_type: FUNCTION + crc: 0x82966749 + type_id: 0x003279c7 + full_name: "__v4l2_async_nf_add_fwnode_remote" +} elf_symbol { id: 0x4c0a941a name: "__v4l2_ctrl_handler_setup" @@ -394585,6 +394732,87 @@ elf_symbol { type_id: 0x927d452a full_name: "uuid_parse" } +elf_symbol { + id: 0x4e2f55da + name: "v4l2_async_nf_cleanup" + is_defined: true + symbol_type: FUNCTION + crc: 0xdad12cba + type_id: 0x1fa7cc4d + full_name: "v4l2_async_nf_cleanup" +} +elf_symbol { + id: 0x04aadf7f + name: "v4l2_async_nf_init" + is_defined: true + symbol_type: FUNCTION + crc: 0xc88abf32 + type_id: 0x1fa7cc4d + full_name: "v4l2_async_nf_init" +} +elf_symbol { + id: 0x7920fabe + name: "v4l2_async_nf_parse_fwnode_endpoints" + is_defined: true + symbol_type: FUNCTION + crc: 0xde590e4b + type_id: 0x9ddac293 + full_name: "v4l2_async_nf_parse_fwnode_endpoints" +} +elf_symbol { + id: 0x48e55006 + name: "v4l2_async_nf_register" + is_defined: true + symbol_type: FUNCTION + crc: 0x8be566ca + type_id: 0x9ca0dc77 + full_name: "v4l2_async_nf_register" +} +elf_symbol { + id: 0x65ffd1d0 + name: "v4l2_async_nf_unregister" + is_defined: true + symbol_type: FUNCTION + crc: 0xc74894f9 + type_id: 0x1fa7cc4d + full_name: "v4l2_async_nf_unregister" +} +elf_symbol { + id: 0x507a9ef5 + name: "v4l2_async_register_subdev" + is_defined: true + symbol_type: FUNCTION + crc: 0x64ab86bc + type_id: 0x9df18afd + full_name: "v4l2_async_register_subdev" +} +elf_symbol { + id: 0x050dd932 + name: "v4l2_async_register_subdev_sensor" + is_defined: true + symbol_type: FUNCTION + crc: 0x61c8f608 + type_id: 0x9df18afd + full_name: "v4l2_async_register_subdev_sensor" +} +elf_symbol { + id: 0x0664687c + name: "v4l2_async_subdev_nf_register" + is_defined: true + symbol_type: FUNCTION + crc: 0x4d890f4b + type_id: 0x9d027320 + full_name: "v4l2_async_subdev_nf_register" +} +elf_symbol { + id: 0xf440f7f1 + name: "v4l2_async_unregister_subdev" + is_defined: true + symbol_type: FUNCTION + crc: 0x2592ea78 + type_id: 0x10e93841 + full_name: "v4l2_async_unregister_subdev" +} elf_symbol { id: 0xf39bae65 name: "v4l2_compat_ioctl32" @@ -394990,6 +395218,33 @@ elf_symbol { type_id: 0x209ae488 full_name: "v4l2_format_info" } +elf_symbol { + id: 0x7ba36329 + name: "v4l2_fwnode_endpoint_alloc_parse" + is_defined: true + symbol_type: FUNCTION + crc: 0x05930b06 + type_id: 0x9cfc5a75 + full_name: "v4l2_fwnode_endpoint_alloc_parse" +} +elf_symbol { + id: 0x2643c2c9 + name: "v4l2_fwnode_endpoint_free" + is_defined: true + symbol_type: FUNCTION + crc: 0xf01d6f06 + type_id: 0x15112911 + full_name: "v4l2_fwnode_endpoint_free" +} +elf_symbol { + id: 0xcb8b4f14 + name: "v4l2_fwnode_endpoint_parse" + is_defined: true + symbol_type: FUNCTION + crc: 0x9dcd6cfe + type_id: 0x9cfc5a75 + full_name: "v4l2_fwnode_endpoint_parse" +} elf_symbol { id: 0x58330374 name: "v4l2_g_parm_cap" @@ -399757,6 +400012,8 @@ interface { symbol_id: 0x7c261545 symbol_id: 0xf497de36 symbol_id: 0xf44f6a18 + symbol_id: 0xf51d746f + symbol_id: 0xe13e16ca symbol_id: 0x4c0a941a symbol_id: 0xfc85c168 symbol_id: 0xb6af2644 @@ -405485,6 +405742,15 @@ interface { 
symbol_id: 0xb0c1eaf9 symbol_id: 0xe7b3f166 symbol_id: 0xb21b47da + symbol_id: 0x4e2f55da + symbol_id: 0x04aadf7f + symbol_id: 0x7920fabe + symbol_id: 0x48e55006 + symbol_id: 0x65ffd1d0 + symbol_id: 0x507a9ef5 + symbol_id: 0x050dd932 + symbol_id: 0x0664687c + symbol_id: 0xf440f7f1 symbol_id: 0xf39bae65 symbol_id: 0xfd78bf45 symbol_id: 0x218d39b6 @@ -405530,6 +405796,9 @@ interface { symbol_id: 0xe66642fe symbol_id: 0x538ad5cc symbol_id: 0x2244c8f0 + symbol_id: 0x7ba36329 + symbol_id: 0x2643c2c9 + symbol_id: 0xcb8b4f14 symbol_id: 0x58330374 symbol_id: 0xdb18c924 symbol_id: 0x5e36dba6 diff --git a/android/abi_gki_aarch64_rockchip b/android/abi_gki_aarch64_rockchip index 0010cf2300b6..a051b3843047 100644 --- a/android/abi_gki_aarch64_rockchip +++ b/android/abi_gki_aarch64_rockchip @@ -1268,6 +1268,15 @@ usb_submit_urb __usecs_to_jiffies usleep_range_state + __v4l2_async_nf_add_fwnode_remote + v4l2_async_nf_cleanup + v4l2_async_nf_init + v4l2_async_nf_parse_fwnode_endpoints + v4l2_async_nf_register + v4l2_async_register_subdev + v4l2_async_register_subdev_sensor + v4l2_async_subdev_nf_register + v4l2_async_unregister_subdev v4l2_ctrl_find v4l2_ctrl_g_ctrl v4l2_ctrl_g_ctrl_int64 @@ -1295,6 +1304,9 @@ v4l2_event_subscribe v4l2_event_unsubscribe v4l2_fh_open + v4l2_fwnode_endpoint_alloc_parse + v4l2_fwnode_endpoint_free + v4l2_fwnode_endpoint_parse v4l2_i2c_subdev_init v4l2_match_dv_timings v4l2_pipeline_link_notify @@ -2871,9 +2883,11 @@ # required by video_rkcif.ko media_entity_setup_link + __v4l2_async_nf_add_fwnode # required by video_rkisp.ko param_ops_ullong + v4l2_async_nf_unregister v4l2_ctrl_poll # required by videobuf2-cma-sg.ko From c52d48818b621678f7ffbeddf29ee1a3437a1363 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Mon, 24 Jul 2023 14:31:47 -0400 Subject: [PATCH 100/139] UPSTREAM: maple_tree: introduce __mas_set_range() mas_set_range() resets the node to MAS_START, which will cause a re-walk of the tree to the range. This is unnecessary when the maple state is already at the correct location of the write. Add a function that only sets the range to avoid unnecessary re-walking of the tree. Link: https://lkml.kernel.org/r/20230724183157.3939892-6-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Peng Zhang Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton (cherry picked from commit c1297987cc2ada57a7faea7985c2334548d110f9) Bug: 308042511 Change-Id: I9e026d0f103e3aa24b47998be6b83e28e7928540 Signed-off-by: Suren Baghdasaryan --- include/linux/maple_tree.h | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index 0ff8ce8cd06a..6053986cbdf6 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -525,6 +525,22 @@ static inline void mas_reset(struct ma_state *mas) */ #define mas_for_each(__mas, __entry, __max) \ while (((__entry) = mas_find((__mas), (__max))) != NULL) +/** + * __mas_set_range() - Set up Maple Tree operation state to a sub-range of the + * current location. + * @mas: Maple Tree operation state. + * @start: New start of range in the Maple Tree. + * @last: New end of range in the Maple Tree. + * + * set the internal maple state values to a sub-range. + * Please use mas_set_range() if you do not know where you are in the tree. 
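+ *
+ * (Editor's illustration, not part of the original patch: a caller that
+ * has already walked to the target location, e.g. via mas_walk(&mas),
+ * can follow up with
+ *
+ *	__mas_set_range(&mas, start, last);
+ *	mas_store(&mas, entry);
+ *
+ * and the store reuses the existing walk instead of re-walking from
+ * MAS_START.)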
+ */
+static inline void __mas_set_range(struct ma_state *mas, unsigned long start,
+		unsigned long last)
+{
+	mas->index = start;
+	mas->last = last;
+}
 
 /**
  * mas_set_range() - Set up Maple Tree operation state for a different index.
@@ -539,9 +555,8 @@ static inline void mas_reset(struct ma_state *mas)
 static inline void mas_set_range(struct ma_state *mas, unsigned long start,
 		unsigned long last)
 {
-	mas->index = start;
-	mas->last = last;
-	mas->node = MAS_START;
+	__mas_set_range(mas, start, last);
+	mas->node = MAS_START;
 }
 
 /**

From 4ddcdc519b4ae94f56dfa81769d5ce59f1cc41a8 Mon Sep 17 00:00:00 2001
From: Peng Zhang
Date: Fri, 27 Oct 2023 11:38:36 +0800
Subject: [PATCH 101/139] FROMGIT: maple_tree: add mt_free_one() and mt_attr()
 helpers

Patch series "Introduce __mt_dup() to improve the performance of fork()", v7.

This series introduces __mt_dup() to improve the performance of fork(). During the duplication process of mmap, all VMAs are traversed and inserted one by one into the new maple tree, causing the maple tree to be rebalanced multiple times. Balancing the maple tree is a costly operation. To duplicate VMAs more efficiently, mtree_dup() and __mt_dup() are introduced for the maple tree. They can efficiently duplicate a maple tree.

Here are some algorithmic details about {mtree,__mt}_dup(). We perform a DFS pre-order traversal of all nodes in the source maple tree. During this process, we fully copy the nodes from the source tree to the new tree. This involves memory allocation, and when encountering a new node, if it is a non-leaf node, all its child nodes are allocated at once. This idea was originally from Liam R. Howlett's Maple Tree Work email, and I added some of my own ideas to implement it. Some previous discussions can be found in [1]. For a more detailed analysis of the algorithm, please refer to the logs for patch [3/10] and patch [10/10].

There is a "spawn" in byte-unixbench[2], which can be used to test the performance of fork(). I modified it slightly to make it work with different numbers of VMAs. Below are the test results. The first row shows the number of VMAs. The second and third rows show the number of fork() calls per ten seconds, corresponding to next-20231006 and this patchset, respectively; the last row shows the relative improvement. The test results were obtained with CPU binding to avoid scheduler load balancing that could cause unstable results. There are still some fluctuations in the test results, but at least they are better than the original performance.

VMAs           21     121    221    421    821    1621   3221   6421   12821  25621  51221
next-20231006  112100 76261  54227  34035  20195  11112  6017   3161   1606   802    393
this patchset  114558 83067  65008  45824  28751  16072  8922   4747   2436   1233   599
improvement    2.19%  8.92%  19.88% 34.64% 42.37% 44.64% 48.28% 50.17% 51.68% 53.74% 52.42%

Thanks to Liam and Matthew for the review.

This patch (of 10):

Add two helpers:
1. mt_free_one(), used to free a maple node.
2. mt_attr(), used to obtain the attributes of a maple tree.

Link: https://lkml.kernel.org/r/20231027033845.90608-1-zhangpeng.00@bytedance.com
Link: https://lkml.kernel.org/r/20231027033845.90608-2-zhangpeng.00@bytedance.com
Signed-off-by: Peng Zhang
Reviewed-by: Liam R. Howlett
Cc: Christian Brauner
Cc: Jonathan Corbet
Cc: Mateusz Guzik
Cc: Mathieu Desnoyers
Cc: Matthew Wilcox
Cc: Michael S.
Tsirkin Cc: Mike Christie Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton (cherry picked from commit 4f2267b58a22d972be98edef8e6b3c7a67c9fb91 https://git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm mm-unstable) Bug: 308042511 Change-Id: Ib9b13dee357ac4c85668901c20a3c370fbdd08da Signed-off-by: Suren Baghdasaryan --- lib/maple_tree.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 826f7b8d5e05..d932bf27fa0b 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -158,6 +158,11 @@ static inline int mt_alloc_bulk(gfp_t gfp, size_t size, void **nodes) return kmem_cache_alloc_bulk(maple_node_cache, gfp, size, nodes); } +static inline void mt_free_one(struct maple_node *node) +{ + kmem_cache_free(maple_node_cache, node); +} + static inline void mt_free_bulk(size_t size, void __rcu **nodes) { kmem_cache_free_bulk(maple_node_cache, size, (void **)nodes); @@ -199,6 +204,11 @@ static unsigned int mas_mt_height(struct ma_state *mas) return mt_height(mas->tree); } +static inline unsigned int mt_attr(struct maple_tree *mt) +{ + return mt->ma_flags & ~MT_FLAGS_HEIGHT_MASK; +} + static inline enum maple_type mte_node_type(const struct maple_enode *entry) { return ((unsigned long)entry >> MAPLE_NODE_TYPE_SHIFT) & @@ -5702,7 +5712,7 @@ void mas_destroy(struct ma_state *mas) mt_free_bulk(count, (void __rcu **)&node->slot[1]); total -= count; } - kmem_cache_free(maple_node_cache, node); + mt_free_one(ma_mnode_ptr(node)); total--; } From dc9323545b03f3d1ebeeac46c5ceb366eced9314 Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Fri, 27 Oct 2023 11:38:37 +0800 Subject: [PATCH 102/139] FROMGIT: maple_tree: introduce {mtree,mas}_lock_nested() In some cases, nested locks may be needed, so {mtree,mas}_lock_nested is introduced. For example, when duplicating maple tree, we need to hold the locks of two trees, in which case nested locks are needed. At the same time, add the definition of spin_lock_nested() in tools for testing. Link: https://lkml.kernel.org/r/20231027033845.90608-3-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Reviewed-by: Liam R. Howlett Cc: Christian Brauner Cc: Jonathan Corbet Cc: Mateusz Guzik Cc: Mathieu Desnoyers Cc: Matthew Wilcox Cc: Michael S. 
Tsirkin Cc: Mike Christie Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Suren Baghdasaryan
Signed-off-by: Andrew Morton
(cherry picked from commit b2472efe4316b2687c153919c1513a098bd82c17
 https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm mm-unstable)
Bug: 308042511
Change-Id: I06f0eb0a32a2f39b7842de08a0e5ce59895345c5
Signed-off-by: Suren Baghdasaryan
---
 include/linux/maple_tree.h     | 4 ++++
 tools/include/linux/spinlock.h | 1 +
 2 files changed, 5 insertions(+)

diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h
index 6053986cbdf6..73d9f342eac4 100644
--- a/include/linux/maple_tree.h
+++ b/include/linux/maple_tree.h
@@ -249,6 +249,8 @@ struct maple_tree {
 	struct maple_tree name = MTREE_INIT(name, 0)
 
 #define mtree_lock(mt)		spin_lock((&(mt)->ma_lock))
+#define mtree_lock_nested(mt, subclass) \
+		spin_lock_nested((&(mt)->ma_lock), subclass)
 #define mtree_unlock(mt)	spin_unlock((&(mt)->ma_lock))
 
 /*
@@ -399,6 +401,8 @@ struct ma_wr_state {
 };
 
 #define mas_lock(mas)		spin_lock(&((mas)->tree->ma_lock))
+#define mas_lock_nested(mas, subclass) \
+		spin_lock_nested(&((mas)->tree->ma_lock), subclass)
 #define mas_unlock(mas)		spin_unlock(&((mas)->tree->ma_lock))
 
diff --git a/tools/include/linux/spinlock.h b/tools/include/linux/spinlock.h
index 622266b197d0..a6cdf25b6b9d 100644
--- a/tools/include/linux/spinlock.h
+++ b/tools/include/linux/spinlock.h
@@ -11,6 +11,7 @@
 #define spin_lock_init(x)	pthread_mutex_init(x, NULL)
 #define spin_lock(x)		pthread_mutex_lock(x)
+#define spin_lock_nested(x, subclass)	pthread_mutex_lock(x)
 #define spin_unlock(x)		pthread_mutex_unlock(x)
 #define spin_lock_bh(x)		pthread_mutex_lock(x)
 #define spin_unlock_bh(x)	pthread_mutex_unlock(x)

From eb5048ea90b2f2935edfce9e3182a8aee3f2c8fd Mon Sep 17 00:00:00 2001
From: Peng Zhang
Date: Fri, 27 Oct 2023 11:38:38 +0800
Subject: [PATCH 103/139] FROMGIT: maple_tree: introduce interfaces __mt_dup()
 and mtree_dup()

Introduce interfaces __mt_dup() and mtree_dup(), which are used to duplicate a maple tree. They duplicate a maple tree in Depth-First Search (DFS) pre-order traversal, using memcpy() to copy nodes in the source tree and allocating new child nodes in non-leaf nodes. The new node is exactly the same as the source node except for all the addresses stored in it. This is faster than traversing all elements in the source tree and inserting them one by one into the new tree. The time complexity of these two functions is O(n).

The difference between __mt_dup() and mtree_dup() is that mtree_dup() handles locks internally.

Analysis of the average time complexity of this algorithm:

For simplicity, let's assume that the maximum branching factor of all non-leaf nodes is 16 (in allocation mode, it is 10), and the tree is a full tree.

Under the given conditions, if there is a maple tree with n elements, the number of its leaves is n/16. From bottom to top, the number of nodes in each level is 1/16 of the number of nodes in the level below. So the total number of nodes in the entire tree is given by the sum of n/16 + n/16^2 + n/16^3 + ... + 1. This is a geometric series, and it has log(n) terms with base 16. According to the formula for the sum of a geometric series, the sum of this series can be calculated as (n-1)/15. Each node has only one parent node pointer, which can be considered as an edge. In total, there are (n-1)/15-1 edges.

This algorithm consists of two operations:

1. Traversing all nodes in DFS order.
2. For each node, making a copy and performing necessary modifications to create a new node.
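(Editor's aside, not part of the original commit message: as a quick sanity check of the node-count formula above, take a full tree of height 3, so n = 16^3 = 4096. The levels then hold 256 + 16 + 1 = 273 nodes, and (n-1)/15 = 4095/15 = 273, matching the closed form.)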
For the first part, DFS traversal will visit each edge twice. Let T(ascend) represent the cost of taking one step upwards, and T(descend) represent the cost of taking one step downwards. Both of them are constants (although mas_ascend() may not be, as it contains a loop, but here we ignore it and treat it as a constant). So the time spent on the first part can be represented as ((n-1)/15-1) * (T(ascend) + T(descend)).

For the second part, each node will be copied, and the cost of copying a node is denoted as T(copy_node). For each non-leaf node, it is necessary to reallocate all child nodes, and the cost of this operation is denoted as T(dup_alloc). The behavior behind memory allocation is complex and not specific to the maple tree operation. Here, we assume that the time required for a single allocation is constant. Since the size of a node is fixed, both of these symbols are also constants. We can calculate that the time spent on the second part is ((n-1)/15) * T(copy_node) + ((n-1)/15 - n/16) * T(dup_alloc).

Adding both parts together, the total time spent by the algorithm can be represented as:

((n-1)/15) * (T(ascend) + T(descend) + T(copy_node) + T(dup_alloc)) - n/16 * T(dup_alloc) - (T(ascend) + T(descend))

Let C1 = T(ascend) + T(descend) + T(copy_node) + T(dup_alloc)
Let C2 = T(dup_alloc)
Let C3 = T(ascend) + T(descend)

Finally, the expression can be simplified as:

((16 * C1 - 15 * C2) / (15 * 16)) * n - (C1 / 15 + C3)

This is a linear function, so the average time complexity is O(n).

Link: https://lkml.kernel.org/r/20231027033845.90608-4-zhangpeng.00@bytedance.com
Signed-off-by: Peng Zhang
Suggested-by: Liam R. Howlett
Cc: Christian Brauner
Cc: Jonathan Corbet
Cc: Mateusz Guzik
Cc: Mathieu Desnoyers
Cc: Matthew Wilcox
Cc: Michael S. Tsirkin
Cc: Mike Christie
Cc: Nicholas Piggin
Cc: Peter Zijlstra
Cc: Suren Baghdasaryan
Signed-off-by: Andrew Morton
(cherry picked from commit fd32e4e9b7646510ee9010e0d5f8b8857d48a6f7
 https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm mm-unstable)
Bug: 308042511
Change-Id: I385759a1184a202498e086458b572c203616b9b4
Signed-off-by: Suren Baghdasaryan
---
 include/linux/maple_tree.h |   3 +
 lib/maple_tree.c           | 274 +++++++++++++++++++++++++++++++
 2 files changed, 277 insertions(+)

diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h
index 73d9f342eac4..7ccb05ba08ce 100644
--- a/include/linux/maple_tree.h
+++ b/include/linux/maple_tree.h
@@ -322,6 +322,9 @@ int mtree_store(struct maple_tree *mt, unsigned long index,
 		void *entry, gfp_t gfp);
 void *mtree_erase(struct maple_tree *mt, unsigned long index);
 
+int mtree_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp);
+int __mt_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp);
+
 void mtree_destroy(struct maple_tree *mt);
 void __mt_destroy(struct maple_tree *mt);
 
diff --git a/lib/maple_tree.c b/lib/maple_tree.c
index d932bf27fa0b..3bc5414f78ab 100644
--- a/lib/maple_tree.c
+++ b/lib/maple_tree.c
@@ -4,6 +4,8 @@
  * Copyright (c) 2018-2022 Oracle Corporation
  * Authors: Liam R. Howlett
  *          Matthew Wilcox
+ * Copyright (c) 2023 ByteDance
+ * Author: Peng Zhang
  */
 
 /*
@@ -6537,6 +6539,278 @@ void *mtree_erase(struct maple_tree *mt, unsigned long index)
 }
 EXPORT_SYMBOL(mtree_erase);
 
+/*
+ * mas_dup_free() - Free an incomplete duplication of a tree.
+ * @mas: The maple state of an incomplete tree.
+ *
+ * The parameter @mas->node passed in indicates that the allocation failed on
+ * this node.
This function frees all nodes starting from @mas->node in the + * reverse order of mas_dup_build(). There is no need to hold the source tree + * lock at this time. + */ +static void mas_dup_free(struct ma_state *mas) +{ + struct maple_node *node; + enum maple_type type; + void __rcu **slots; + unsigned char count, i; + + /* Maybe the first node allocation failed. */ + if (mas_is_none(mas)) + return; + + while (!mte_is_root(mas->node)) { + mas_ascend(mas); + if (mas->offset) { + mas->offset--; + do { + mas_descend(mas); + mas->offset = mas_data_end(mas); + } while (!mte_is_leaf(mas->node)); + + mas_ascend(mas); + } + + node = mte_to_node(mas->node); + type = mte_node_type(mas->node); + slots = ma_slots(node, type); + count = mas_data_end(mas) + 1; + for (i = 0; i < count; i++) + ((unsigned long *)slots)[i] &= ~MAPLE_NODE_MASK; + mt_free_bulk(count, slots); + } + + node = mte_to_node(mas->node); + mt_free_one(node); +} + +/* + * mas_copy_node() - Copy a maple node and replace the parent. + * @mas: The maple state of source tree. + * @new_mas: The maple state of new tree. + * @parent: The parent of the new node. + * + * Copy @mas->node to @new_mas->node, set @parent to be the parent of + * @new_mas->node. If memory allocation fails, @mas is set to -ENOMEM. + */ +static inline void mas_copy_node(struct ma_state *mas, struct ma_state *new_mas, + struct maple_pnode *parent) +{ + struct maple_node *node = mte_to_node(mas->node); + struct maple_node *new_node = mte_to_node(new_mas->node); + unsigned long val; + + /* Copy the node completely. */ + memcpy(new_node, node, sizeof(struct maple_node)); + /* Update the parent node pointer. */ + val = (unsigned long)node->parent & MAPLE_NODE_MASK; + new_node->parent = ma_parent_ptr(val | (unsigned long)parent); +} + +/* + * mas_dup_alloc() - Allocate child nodes for a maple node. + * @mas: The maple state of source tree. + * @new_mas: The maple state of new tree. + * @gfp: The GFP_FLAGS to use for allocations. + * + * This function allocates child nodes for @new_mas->node during the duplication + * process. If memory allocation fails, @mas is set to -ENOMEM. + */ +static inline void mas_dup_alloc(struct ma_state *mas, struct ma_state *new_mas, + gfp_t gfp) +{ + struct maple_node *node = mte_to_node(mas->node); + struct maple_node *new_node = mte_to_node(new_mas->node); + enum maple_type type; + unsigned char request, count, i; + void __rcu **slots; + void __rcu **new_slots; + unsigned long val; + + /* Allocate memory for child nodes. */ + type = mte_node_type(mas->node); + new_slots = ma_slots(new_node, type); + request = mas_data_end(mas) + 1; + count = mt_alloc_bulk(gfp, request, (void **)new_slots); + if (unlikely(count < request)) { + memset(new_slots, 0, request * sizeof(void *)); + mas_set_err(mas, -ENOMEM); + return; + } + + /* Restore node type information in slots. */ + slots = ma_slots(node, type); + for (i = 0; i < count; i++) { + val = (unsigned long)mt_slot_locked(mas->tree, slots, i); + val &= MAPLE_NODE_MASK; + ((unsigned long *)new_slots)[i] |= val; + } +} + +/* + * mas_dup_build() - Build a new maple tree from a source tree + * @mas: The maple state of source tree, need to be in MAS_START state. + * @new_mas: The maple state of new tree, need to be in MAS_START state. + * @gfp: The GFP_FLAGS to use for allocations. + * + * This function builds a new tree in DFS preorder. If the memory allocation + * fails, the error code -ENOMEM will be set in @mas, and @new_mas points to the + * last node. 
mas_dup_free() will free the incomplete duplication of a tree. + * + * Note that the attributes of the two trees need to be exactly the same, and the + * new tree needs to be empty, otherwise -EINVAL will be set in @mas. + */ +static inline void mas_dup_build(struct ma_state *mas, struct ma_state *new_mas, + gfp_t gfp) +{ + struct maple_node *node; + struct maple_pnode *parent = NULL; + struct maple_enode *root; + enum maple_type type; + + if (unlikely(mt_attr(mas->tree) != mt_attr(new_mas->tree)) || + unlikely(!mtree_empty(new_mas->tree))) { + mas_set_err(mas, -EINVAL); + return; + } + + root = mas_start(mas); + if (mas_is_ptr(mas) || mas_is_none(mas)) + goto set_new_tree; + + node = mt_alloc_one(gfp); + if (!node) { + new_mas->node = MAS_NONE; + mas_set_err(mas, -ENOMEM); + return; + } + + type = mte_node_type(mas->node); + root = mt_mk_node(node, type); + new_mas->node = root; + new_mas->min = 0; + new_mas->max = ULONG_MAX; + root = mte_mk_root(root); + while (1) { + mas_copy_node(mas, new_mas, parent); + if (!mte_is_leaf(mas->node)) { + /* Only allocate child nodes for non-leaf nodes. */ + mas_dup_alloc(mas, new_mas, gfp); + if (unlikely(mas_is_err(mas))) + return; + } else { + /* + * This is the last leaf node and duplication is + * completed. + */ + if (mas->max == ULONG_MAX) + goto done; + + /* This is not the last leaf node and needs to go up. */ + do { + mas_ascend(mas); + mas_ascend(new_mas); + } while (mas->offset == mas_data_end(mas)); + + /* Move to the next subtree. */ + mas->offset++; + new_mas->offset++; + } + + mas_descend(mas); + parent = ma_parent_ptr(mte_to_node(new_mas->node)); + mas_descend(new_mas); + mas->offset = 0; + new_mas->offset = 0; + } +done: + /* Specially handle the parent of the root node. */ + mte_to_node(root)->parent = ma_parent_ptr(mas_tree_parent(new_mas)); +set_new_tree: + /* Make them the same height */ + new_mas->tree->ma_flags = mas->tree->ma_flags; + rcu_assign_pointer(new_mas->tree->ma_root, root); +} + +/** + * __mt_dup(): Duplicate an entire maple tree + * @mt: The source maple tree + * @new: The new maple tree + * @gfp: The GFP_FLAGS to use for allocations + * + * This function duplicates a maple tree in Depth-First Search (DFS) pre-order + * traversal. It uses memcpy() to copy nodes in the source tree and allocate + * new child nodes in non-leaf nodes. The new node is exactly the same as the + * source node except for all the addresses stored in it. It will be faster than + * traversing all elements in the source tree and inserting them one by one into + * the new tree. + * The user needs to ensure that the attributes of the source tree and the new + * tree are the same, and the new tree needs to be an empty tree, otherwise + * -EINVAL will be returned. + * Note that the user needs to manually lock the source tree and the new tree. + * + * Return: 0 on success, -ENOMEM if memory could not be allocated, -EINVAL If + * the attributes of the two trees are different or the new tree is not an empty + * tree. 
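+ *
+ * (Editor's illustration, not part of the original patch: callers take
+ * both locks themselves; the forking test updated later in this series
+ * does roughly
+ *
+ *	down_write(&mt_lock);
+ *	down_write_nested(&newmt_lock, SINGLE_DEPTH_NESTING);
+ *	ret = __mt_dup(&mt, &newmt, GFP_KERNEL);
+ *
+ * before storing entries into the new tree.)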
+ */ +int __mt_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp) +{ + int ret = 0; + MA_STATE(mas, mt, 0, 0); + MA_STATE(new_mas, new, 0, 0); + + mas_dup_build(&mas, &new_mas, gfp); + if (unlikely(mas_is_err(&mas))) { + ret = xa_err(mas.node); + if (ret == -ENOMEM) + mas_dup_free(&new_mas); + } + + return ret; +} +EXPORT_SYMBOL(__mt_dup); + +/** + * mtree_dup(): Duplicate an entire maple tree + * @mt: The source maple tree + * @new: The new maple tree + * @gfp: The GFP_FLAGS to use for allocations + * + * This function duplicates a maple tree in Depth-First Search (DFS) pre-order + * traversal. It uses memcpy() to copy nodes in the source tree and allocate + * new child nodes in non-leaf nodes. The new node is exactly the same as the + * source node except for all the addresses stored in it. It will be faster than + * traversing all elements in the source tree and inserting them one by one into + * the new tree. + * The user needs to ensure that the attributes of the source tree and the new + * tree are the same, and the new tree needs to be an empty tree, otherwise + * -EINVAL will be returned. + * + * Return: 0 on success, -ENOMEM if memory could not be allocated, -EINVAL If + * the attributes of the two trees are different or the new tree is not an empty + * tree. + */ +int mtree_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp) +{ + int ret = 0; + MA_STATE(mas, mt, 0, 0); + MA_STATE(new_mas, new, 0, 0); + + mas_lock(&new_mas); + mas_lock_nested(&mas, SINGLE_DEPTH_NESTING); + mas_dup_build(&mas, &new_mas, gfp); + mas_unlock(&mas); + if (unlikely(mas_is_err(&mas))) { + ret = xa_err(mas.node); + if (ret == -ENOMEM) + mas_dup_free(&new_mas); + } + + mas_unlock(&new_mas); + return ret; +} +EXPORT_SYMBOL(mtree_dup); + /** * __mt_destroy() - Walk and free all nodes of a locked maple tree. * @mt: The maple tree From f73f881af49ecfcc16be7e460ed8046a91d259ad Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Fri, 27 Oct 2023 11:38:39 +0800 Subject: [PATCH 104/139] FROMGIT: radix tree test suite: align kmem_cache_alloc_bulk() with kernel behavior. When kmem_cache_alloc_bulk() fails to allocate, leave the freed pointers in the array. This enables a more accurate simulation of the kernel's behavior and allows for testing potential double-free scenarios. Link: https://lkml.kernel.org/r/20231027033845.90608-5-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Reviewed-by: Liam R. Howlett Cc: Christian Brauner Cc: Jonathan Corbet Cc: Mateusz Guzik Cc: Mathieu Desnoyers Cc: Matthew Wilcox Cc: Michael S. 
Tsirkin Cc: Mike Christie Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton (cherry picked from commit 46c99e26f2f86260fed226cab217d0b3ca8dca56 https://git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm mm-unstable) Bug: 308042511 Change-Id: If822e9d219066e1573b7c044ef9a7344f652e365 Signed-off-by: Suren Baghdasaryan --- tools/testing/radix-tree/linux.c | 45 +++++++++++++++++++++++--------- 1 file changed, 33 insertions(+), 12 deletions(-) diff --git a/tools/testing/radix-tree/linux.c b/tools/testing/radix-tree/linux.c index d587a558997f..64a1645ff94c 100644 --- a/tools/testing/radix-tree/linux.c +++ b/tools/testing/radix-tree/linux.c @@ -93,13 +93,9 @@ void *kmem_cache_alloc_lru(struct kmem_cache *cachep, struct list_lru *lru, return p; } -void kmem_cache_free_locked(struct kmem_cache *cachep, void *objp) +void __kmem_cache_free_locked(struct kmem_cache *cachep, void *objp) { assert(objp); - uatomic_dec(&nr_allocated); - uatomic_dec(&cachep->nr_allocated); - if (kmalloc_verbose) - printf("Freeing %p to slab\n", objp); if (cachep->nr_objs > 10 || cachep->align) { memset(objp, POISON_FREE, cachep->size); free(objp); @@ -111,6 +107,15 @@ void kmem_cache_free_locked(struct kmem_cache *cachep, void *objp) } } +void kmem_cache_free_locked(struct kmem_cache *cachep, void *objp) +{ + uatomic_dec(&nr_allocated); + uatomic_dec(&cachep->nr_allocated); + if (kmalloc_verbose) + printf("Freeing %p to slab\n", objp); + __kmem_cache_free_locked(cachep, objp); +} + void kmem_cache_free(struct kmem_cache *cachep, void *objp) { pthread_mutex_lock(&cachep->lock); @@ -141,18 +146,17 @@ int kmem_cache_alloc_bulk(struct kmem_cache *cachep, gfp_t gfp, size_t size, if (kmalloc_verbose) pr_debug("Bulk alloc %lu\n", size); - if (!(gfp & __GFP_DIRECT_RECLAIM)) { - if (cachep->non_kernel < size) - return 0; - - cachep->non_kernel -= size; - } - pthread_mutex_lock(&cachep->lock); if (cachep->nr_objs >= size) { struct radix_tree_node *node; for (i = 0; i < size; i++) { + if (!(gfp & __GFP_DIRECT_RECLAIM)) { + if (!cachep->non_kernel) + break; + cachep->non_kernel--; + } + node = cachep->objs; cachep->nr_objs--; cachep->objs = node->parent; @@ -163,11 +167,19 @@ int kmem_cache_alloc_bulk(struct kmem_cache *cachep, gfp_t gfp, size_t size, } else { pthread_mutex_unlock(&cachep->lock); for (i = 0; i < size; i++) { + if (!(gfp & __GFP_DIRECT_RECLAIM)) { + if (!cachep->non_kernel) + break; + cachep->non_kernel--; + } + if (cachep->align) { posix_memalign(&p[i], cachep->align, cachep->size * size); } else { p[i] = malloc(cachep->size * size); + if (!p[i]) + break; } if (cachep->ctor) cachep->ctor(p[i]); @@ -176,6 +188,15 @@ int kmem_cache_alloc_bulk(struct kmem_cache *cachep, gfp_t gfp, size_t size, } } + if (i < size) { + size = i; + pthread_mutex_lock(&cachep->lock); + for (i = 0; i < size; i++) + __kmem_cache_free_locked(cachep, p[i]); + pthread_mutex_unlock(&cachep->lock); + return 0; + } + for (i = 0; i < size; i++) { uatomic_inc(&nr_allocated); uatomic_inc(&cachep->nr_allocated); From 7befa7bbc90ae65849a08dce9bf5ce1845f19494 Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Fri, 27 Oct 2023 11:38:40 +0800 Subject: [PATCH 105/139] FROMGIT: maple_tree: add test for mtree_dup() Add test for mtree_dup(). Test by duplicating different maple trees and then comparing the two trees. Includes tests for duplicating full trees and memory allocation failures on different nodes. 
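As an editor's sketch (not part of the original patch), the duplicate-and-compare pattern that the new tests follow looks roughly like this; compare_tree() is the test helper added in the diff below:

	struct maple_tree mt;
	DEFINE_MTREE(new);
	int ret;

	mt_init_flags(&mt, MT_FLAGS_ALLOC_RANGE);
	mtree_store_range(&mt, 0, 5, xa_mk_value(0), GFP_KERNEL);

	ret = mtree_dup(&mt, &new, GFP_KERNEL);	/* full DFS copy */
	MT_BUG_ON(&new, ret);			/* duplication must succeed */
	mt_validate(&new);			/* check the tree invariants */
	if (compare_tree(&mt, &new))		/* node-by-node comparison */
		MT_BUG_ON(&new, 1);

	mtree_destroy(&mt);
	mtree_destroy(&new);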
Link: https://lkml.kernel.org/r/20231027033845.90608-6-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Reviewed-by: Liam R. Howlett Cc: Christian Brauner Cc: Jonathan Corbet Cc: Mateusz Guzik Cc: Mathieu Desnoyers Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Mike Christie Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton (cherry picked from commit a2587a7e8d37885dc063255f5400a66299b42e48 https://git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm mm-unstable) Bug: 308042511 Change-Id: I7501db5735b1dfd15240ef2946b26d63ffe1d8e0 Signed-off-by: Suren Baghdasaryan --- tools/testing/radix-tree/maple.c | 361 +++++++++++++++++++++++++++++++ 1 file changed, 361 insertions(+) diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c index b598b7fe4419..916cf8b45ddf 100644 --- a/tools/testing/radix-tree/maple.c +++ b/tools/testing/radix-tree/maple.c @@ -35753,6 +35753,363 @@ static noinline void __init check_locky(struct maple_tree *mt) mt_clear_in_rcu(mt); } +/* + * Compares two nodes except for the addresses stored in the nodes. + * Returns zero if they are the same, otherwise returns non-zero. + */ +static int __init compare_node(struct maple_enode *enode_a, + struct maple_enode *enode_b) +{ + struct maple_node *node_a, *node_b; + struct maple_node a, b; + void **slots_a, **slots_b; /* Do not use the rcu tag. */ + enum maple_type type; + int i; + + if (((unsigned long)enode_a & MAPLE_NODE_MASK) != + ((unsigned long)enode_b & MAPLE_NODE_MASK)) { + pr_err("The lower 8 bits of enode are different.\n"); + return -1; + } + + type = mte_node_type(enode_a); + node_a = mte_to_node(enode_a); + node_b = mte_to_node(enode_b); + a = *node_a; + b = *node_b; + + /* Do not compare addresses. */ + if (ma_is_root(node_a) || ma_is_root(node_b)) { + a.parent = (struct maple_pnode *)((unsigned long)a.parent & + MA_ROOT_PARENT); + b.parent = (struct maple_pnode *)((unsigned long)b.parent & + MA_ROOT_PARENT); + } else { + a.parent = (struct maple_pnode *)((unsigned long)a.parent & + MAPLE_NODE_MASK); + b.parent = (struct maple_pnode *)((unsigned long)b.parent & + MAPLE_NODE_MASK); + } + + if (a.parent != b.parent) { + pr_err("The lower 8 bits of parents are different. %p %p\n", + a.parent, b.parent); + return -1; + } + + /* + * If it is a leaf node, the slots do not contain the node address, and + * no special processing of slots is required. + */ + if (ma_is_leaf(type)) + goto cmp; + + slots_a = ma_slots(&a, type); + slots_b = ma_slots(&b, type); + + for (i = 0; i < mt_slots[type]; i++) { + if (!slots_a[i] && !slots_b[i]) + break; + + if (!slots_a[i] || !slots_b[i]) { + pr_err("The number of slots is different.\n"); + return -1; + } + + /* Do not compare addresses in slots. */ + ((unsigned long *)slots_a)[i] &= MAPLE_NODE_MASK; + ((unsigned long *)slots_b)[i] &= MAPLE_NODE_MASK; + } + +cmp: + /* + * Compare all contents of two nodes, including parent (except address), + * slots (except address), pivots, gaps and metadata. + */ + return memcmp(&a, &b, sizeof(struct maple_node)); +} + +/* + * Compare two trees and return 0 if they are the same, non-zero otherwise. 
+ */ +static int __init compare_tree(struct maple_tree *mt_a, struct maple_tree *mt_b) +{ + MA_STATE(mas_a, mt_a, 0, 0); + MA_STATE(mas_b, mt_b, 0, 0); + + if (mt_a->ma_flags != mt_b->ma_flags) { + pr_err("The flags of the two trees are different.\n"); + return -1; + } + + mas_dfs_preorder(&mas_a); + mas_dfs_preorder(&mas_b); + + if (mas_is_ptr(&mas_a) || mas_is_ptr(&mas_b)) { + if (!(mas_is_ptr(&mas_a) && mas_is_ptr(&mas_b))) { + pr_err("One is MAS_ROOT and the other is not.\n"); + return -1; + } + return 0; + } + + while (!mas_is_none(&mas_a) || !mas_is_none(&mas_b)) { + + if (mas_is_none(&mas_a) || mas_is_none(&mas_b)) { + pr_err("One is MAS_NONE and the other is not.\n"); + return -1; + } + + if (mas_a.min != mas_b.min || + mas_a.max != mas_b.max) { + pr_err("mas->min, mas->max do not match.\n"); + return -1; + } + + if (compare_node(mas_a.node, mas_b.node)) { + pr_err("The contents of nodes %p and %p are different.\n", + mas_a.node, mas_b.node); + mt_dump(mt_a, mt_dump_dec); + mt_dump(mt_b, mt_dump_dec); + return -1; + } + + mas_dfs_preorder(&mas_a); + mas_dfs_preorder(&mas_b); + } + + return 0; +} + +static __init void mas_subtree_max_range(struct ma_state *mas) +{ + unsigned long limit = mas->max; + MA_STATE(newmas, mas->tree, 0, 0); + void *entry; + + mas_for_each(mas, entry, limit) { + if (mas->last - mas->index >= + newmas.last - newmas.index) { + newmas = *mas; + } + } + + *mas = newmas; +} + +/* + * build_full_tree() - Build a full tree. + * @mt: The tree to build. + * @flags: Use @flags to build the tree. + * @height: The height of the tree to build. + * + * Build a tree with full leaf nodes and internal nodes. Note that the height + * should not exceed 3, otherwise it will take a long time to build. + * Return: zero if the build is successful, non-zero if it fails. + */ +static __init int build_full_tree(struct maple_tree *mt, unsigned int flags, + int height) +{ + MA_STATE(mas, mt, 0, 0); + unsigned long step; + int ret = 0, cnt = 1; + enum maple_type type; + + mt_init_flags(mt, flags); + mtree_insert_range(mt, 0, ULONG_MAX, xa_mk_value(5), GFP_KERNEL); + + mtree_lock(mt); + + while (1) { + mas_set(&mas, 0); + if (mt_height(mt) < height) { + mas.max = ULONG_MAX; + goto store; + } + + while (1) { + mas_dfs_preorder(&mas); + if (mas_is_none(&mas)) + goto unlock; + + type = mte_node_type(mas.node); + if (mas_data_end(&mas) + 1 < mt_slots[type]) { + mas_set(&mas, mas.min); + goto store; + } + } +store: + mas_subtree_max_range(&mas); + step = mas.last - mas.index; + if (step < 1) { + ret = -1; + goto unlock; + } + + step /= 2; + mas.last = mas.index + step; + mas_store_gfp(&mas, xa_mk_value(5), + GFP_KERNEL); + ++cnt; + } +unlock: + mtree_unlock(mt); + + MT_BUG_ON(mt, mt_height(mt) != height); + /* pr_info("height:%u number of elements:%d\n", mt_height(mt), cnt); */ + return ret; +} + +static noinline void __init check_mtree_dup(struct maple_tree *mt) +{ + DEFINE_MTREE(new); + int i, j, ret, count = 0; + unsigned int rand_seed = 17, rand; + + /* store a value at [0, 0] */ + mt_init_flags(mt, 0); + mtree_store_range(mt, 0, 0, xa_mk_value(0), GFP_KERNEL); + ret = mtree_dup(mt, &new, GFP_KERNEL); + MT_BUG_ON(&new, ret); + mt_validate(&new); + if (compare_tree(mt, &new)) + MT_BUG_ON(&new, 1); + + mtree_destroy(mt); + mtree_destroy(&new); + + /* The two trees have different attributes. 
*/ + mt_init_flags(mt, 0); + mt_init_flags(&new, MT_FLAGS_ALLOC_RANGE); + ret = mtree_dup(mt, &new, GFP_KERNEL); + MT_BUG_ON(&new, ret != -EINVAL); + mtree_destroy(mt); + mtree_destroy(&new); + + /* The new tree is not empty */ + mt_init_flags(mt, 0); + mt_init_flags(&new, 0); + mtree_store(&new, 5, xa_mk_value(5), GFP_KERNEL); + ret = mtree_dup(mt, &new, GFP_KERNEL); + MT_BUG_ON(&new, ret != -EINVAL); + mtree_destroy(mt); + mtree_destroy(&new); + + /* Test for duplicating full trees. */ + for (i = 1; i <= 3; i++) { + ret = build_full_tree(mt, 0, i); + MT_BUG_ON(mt, ret); + mt_init_flags(&new, 0); + + ret = mtree_dup(mt, &new, GFP_KERNEL); + MT_BUG_ON(&new, ret); + mt_validate(&new); + if (compare_tree(mt, &new)) + MT_BUG_ON(&new, 1); + + mtree_destroy(mt); + mtree_destroy(&new); + } + + for (i = 1; i <= 3; i++) { + ret = build_full_tree(mt, MT_FLAGS_ALLOC_RANGE, i); + MT_BUG_ON(mt, ret); + mt_init_flags(&new, MT_FLAGS_ALLOC_RANGE); + + ret = mtree_dup(mt, &new, GFP_KERNEL); + MT_BUG_ON(&new, ret); + mt_validate(&new); + if (compare_tree(mt, &new)) + MT_BUG_ON(&new, 1); + + mtree_destroy(mt); + mtree_destroy(&new); + } + + /* Test for normal duplicating. */ + for (i = 0; i < 1000; i += 3) { + if (i & 1) { + mt_init_flags(mt, 0); + mt_init_flags(&new, 0); + } else { + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + mt_init_flags(&new, MT_FLAGS_ALLOC_RANGE); + } + + for (j = 0; j < i; j++) { + mtree_store_range(mt, j * 10, j * 10 + 5, + xa_mk_value(j), GFP_KERNEL); + } + + ret = mtree_dup(mt, &new, GFP_KERNEL); + MT_BUG_ON(&new, ret); + mt_validate(&new); + if (compare_tree(mt, &new)) + MT_BUG_ON(&new, 1); + + mtree_destroy(mt); + mtree_destroy(&new); + } + + /* Test memory allocation failed. */ + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + for (i = 0; i < 30; i += 3) { + mtree_store_range(mt, j * 10, j * 10 + 5, + xa_mk_value(j), GFP_KERNEL); + } + + /* Failed at the first node. */ + mt_init_flags(&new, MT_FLAGS_ALLOC_RANGE); + mt_set_non_kernel(0); + ret = mtree_dup(mt, &new, GFP_NOWAIT); + mt_set_non_kernel(0); + MT_BUG_ON(&new, ret != -ENOMEM); + mtree_destroy(mt); + mtree_destroy(&new); + + /* Random maple tree fails at a random node. */ + for (i = 0; i < 1000; i += 3) { + if (i & 1) { + mt_init_flags(mt, 0); + mt_init_flags(&new, 0); + } else { + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + mt_init_flags(&new, MT_FLAGS_ALLOC_RANGE); + } + + for (j = 0; j < i; j++) { + mtree_store_range(mt, j * 10, j * 10 + 5, + xa_mk_value(j), GFP_KERNEL); + } + /* + * The rand() library function is not used, so we can generate + * the same random numbers on any platform. 
+ */ + rand_seed = rand_seed * 1103515245 + 12345; + rand = rand_seed / 65536 % 128; + mt_set_non_kernel(rand); + + ret = mtree_dup(mt, &new, GFP_NOWAIT); + mt_set_non_kernel(0); + if (ret != 0) { + MT_BUG_ON(&new, ret != -ENOMEM); + count++; + mtree_destroy(mt); + continue; + } + + mt_validate(&new); + if (compare_tree(mt, &new)) + MT_BUG_ON(&new, 1); + + mtree_destroy(mt); + mtree_destroy(&new); + } + + /* pr_info("mtree_dup() fail %d times\n", count); */ + BUG_ON(!count); +} + extern void test_kmem_cache_bulk(void); void farmer_tests(void) @@ -35800,6 +36157,10 @@ void farmer_tests(void) check_null_expand(&tree); mtree_destroy(&tree); + mt_init_flags(&tree, 0); + check_mtree_dup(&tree); + mtree_destroy(&tree); + /* RCU testing */ mt_init_flags(&tree, 0); check_erase_testset(&tree); From c79ca61edc7b3d27a6fd3acbeb81d4638e1acdec Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Fri, 27 Oct 2023 11:38:41 +0800 Subject: [PATCH 106/139] FROMGIT: maple_tree: update the documentation of maple tree Introduce the new interface mtree_dup() in the documentation. Link: https://lkml.kernel.org/r/20231027033845.90608-7-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Reviewed-by: Liam R. Howlett Cc: Christian Brauner Cc: Jonathan Corbet Cc: Mateusz Guzik Cc: Mathieu Desnoyers Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Mike Christie Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton (cherry picked from commit 9bc1d3cdb904170214456bca96c4924f28522ab8 https://git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm mm-unstable) Bug: 308042511 Change-Id: I3eb330f0be49f7e8d8b37ecb64de3b7ef349c05b Signed-off-by: Suren Baghdasaryan --- Documentation/core-api/maple_tree.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Documentation/core-api/maple_tree.rst b/Documentation/core-api/maple_tree.rst index 45defcf15da7..285e2d2b21ae 100644 --- a/Documentation/core-api/maple_tree.rst +++ b/Documentation/core-api/maple_tree.rst @@ -81,6 +81,9 @@ section. Sometimes it is necessary to ensure the next call to store to a maple tree does not allocate memory, please see :ref:`maple-tree-advanced-api` for this use case. +You can use mtree_dup() to duplicate an entire maple tree. It is a more +efficient way than inserting all elements one by one into a new tree. + Finally, you can remove all entries from a maple tree by calling mtree_destroy(). If the maple tree entries are pointers, you may wish to free the entries first. @@ -112,6 +115,7 @@ Takes ma_lock internally: * mtree_insert() * mtree_insert_range() * mtree_erase() + * mtree_dup() * mtree_destroy() * mt_set_in_rcu() * mt_clear_in_rcu() From e57d333531ad6cfac38e42826d0229c03d6b6b41 Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Fri, 27 Oct 2023 11:38:42 +0800 Subject: [PATCH 107/139] FROMGIT: maple_tree: skip other tests when BENCH is enabled Skip other tests when BENCH is enabled so that performance can be measured in user space. Link: https://lkml.kernel.org/r/20231027033845.90608-8-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Reviewed-by: Liam R. Howlett Cc: Christian Brauner Cc: Jonathan Corbet Cc: Mateusz Guzik Cc: Mathieu Desnoyers Cc: Matthew Wilcox Cc: Michael S. 
Tsirkin Cc: Mike Christie Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton (cherry picked from commit f670fa1caadb4ea532a89012c5451e4c6789bfcc https://git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm mm-unstable) Bug: 308042511 Change-Id: I0a761a4b6211b19ec80c97d5aef80f3979523bcb Signed-off-by: Suren Baghdasaryan --- lib/test_maple_tree.c | 8 ++++---- tools/testing/radix-tree/maple.c | 2 ++ 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/lib/test_maple_tree.c b/lib/test_maple_tree.c index ab9d4461abc9..b80e28cdaa96 100644 --- a/lib/test_maple_tree.c +++ b/lib/test_maple_tree.c @@ -2741,10 +2741,6 @@ static int __init maple_tree_seed(void) pr_info("\nTEST STARTING\n\n"); - mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); - check_root_expand(&tree); - mtree_destroy(&tree); - #if defined(BENCH_SLOT_STORE) #define BENCH mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); @@ -2788,6 +2784,10 @@ static int __init maple_tree_seed(void) goto skip; #endif + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); + check_root_expand(&tree); + mtree_destroy(&tree); + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); check_iteration(&tree); mtree_destroy(&tree); diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c index 916cf8b45ddf..a2626f02f385 100644 --- a/tools/testing/radix-tree/maple.c +++ b/tools/testing/radix-tree/maple.c @@ -36195,7 +36195,9 @@ void farmer_tests(void) void maple_tree_tests(void) { +#if !defined(BENCH) farmer_tests(); +#endif maple_tree_seed(); maple_tree_harvest(); } From 1bec2dd52eabfd5c2908353ab7f3c2f324131da9 Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Fri, 27 Oct 2023 11:38:43 +0800 Subject: [PATCH 108/139] FROMGIT: maple_tree: update check_forking() and bench_forking() Updated check_forking() and bench_forking() to use __mt_dup() to duplicate maple tree. Link: https://lkml.kernel.org/r/20231027033845.90608-9-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Reviewed-by: Liam R. Howlett Cc: Christian Brauner Cc: Jonathan Corbet Cc: Mateusz Guzik Cc: Mathieu Desnoyers Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Mike Christie Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton (cherry picked from commit 446e1867e6df3cbdd19af6be8f8f4ed56176adb4 https://git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm mm-unstable) Bug: 308042511 Change-Id: I3b64ad1cb5ae40b10dc86ee55501f12522c7e2f5 Signed-off-by: Suren Baghdasaryan --- lib/test_maple_tree.c | 117 ++++++++++++++++++------------------ tools/include/linux/rwsem.h | 4 ++ 2 files changed, 62 insertions(+), 59 deletions(-) diff --git a/lib/test_maple_tree.c b/lib/test_maple_tree.c index b80e28cdaa96..68b2c387fddb 100644 --- a/lib/test_maple_tree.c +++ b/lib/test_maple_tree.c @@ -1671,47 +1671,48 @@ static noinline void __init bench_mt_for_each(struct maple_tree *mt) #endif /* check_forking - simulate the kernel forking sequence with the tree. 
*/ -static noinline void __init check_forking(struct maple_tree *mt) +static noinline void __init check_forking(void) { - - struct maple_tree newmt; - int i, nr_entries = 134; + struct maple_tree mt, newmt; + int i, nr_entries = 134, ret; void *val; - MA_STATE(mas, mt, 0, 0); - MA_STATE(newmas, mt, 0, 0); - struct rw_semaphore newmt_lock; + MA_STATE(mas, &mt, 0, 0); + MA_STATE(newmas, &newmt, 0, 0); + struct rw_semaphore mt_lock, newmt_lock; + init_rwsem(&mt_lock); init_rwsem(&newmt_lock); - for (i = 0; i <= nr_entries; i++) - mtree_store_range(mt, i*10, i*10 + 5, - xa_mk_value(i), GFP_KERNEL); + mt_init_flags(&mt, MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN); + mt_set_external_lock(&mt, &mt_lock); - mt_set_non_kernel(99999); mt_init_flags(&newmt, MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN); mt_set_external_lock(&newmt, &newmt_lock); - newmas.tree = &newmt; - mas_reset(&newmas); - mas_reset(&mas); - down_write(&newmt_lock); - mas.index = 0; - mas.last = 0; - if (mas_expected_entries(&newmas, nr_entries)) { + + down_write(&mt_lock); + for (i = 0; i <= nr_entries; i++) { + mas_set_range(&mas, i*10, i*10 + 5); + mas_store_gfp(&mas, xa_mk_value(i), GFP_KERNEL); + } + + down_write_nested(&newmt_lock, SINGLE_DEPTH_NESTING); + ret = __mt_dup(&mt, &newmt, GFP_KERNEL); + if (ret) { pr_err("OOM!"); BUG_ON(1); } - rcu_read_lock(); - mas_for_each(&mas, val, ULONG_MAX) { - newmas.index = mas.index; - newmas.last = mas.last; + + mas_set(&newmas, 0); + mas_for_each(&newmas, val, ULONG_MAX) mas_store(&newmas, val); - } - rcu_read_unlock(); + mas_destroy(&newmas); + mas_destroy(&mas); mt_validate(&newmt); - mt_set_non_kernel(0); __mt_destroy(&newmt); + __mt_destroy(&mt); up_write(&newmt_lock); + up_write(&mt_lock); } static noinline void __init check_iteration(struct maple_tree *mt) @@ -1815,49 +1816,51 @@ static noinline void __init check_mas_store_gfp(struct maple_tree *mt) } #if defined(BENCH_FORK) -static noinline void __init bench_forking(struct maple_tree *mt) +static noinline void __init bench_forking(void) { - - struct maple_tree newmt; - int i, nr_entries = 134, nr_fork = 80000; + struct maple_tree mt, newmt; + int i, nr_entries = 134, nr_fork = 80000, ret; void *val; - MA_STATE(mas, mt, 0, 0); - MA_STATE(newmas, mt, 0, 0); - struct rw_semaphore newmt_lock; + MA_STATE(mas, &mt, 0, 0); + MA_STATE(newmas, &newmt, 0, 0); + struct rw_semaphore mt_lock, newmt_lock; + init_rwsem(&mt_lock); init_rwsem(&newmt_lock); - mt_set_external_lock(&newmt, &newmt_lock); - for (i = 0; i <= nr_entries; i++) - mtree_store_range(mt, i*10, i*10 + 5, - xa_mk_value(i), GFP_KERNEL); + mt_init_flags(&mt, MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN); + mt_set_external_lock(&mt, &mt_lock); + + down_write(&mt_lock); + for (i = 0; i <= nr_entries; i++) { + mas_set_range(&mas, i*10, i*10 + 5); + mas_store_gfp(&mas, xa_mk_value(i), GFP_KERNEL); + } for (i = 0; i < nr_fork; i++) { - mt_set_non_kernel(99999); - mt_init_flags(&newmt, MT_FLAGS_ALLOC_RANGE); - newmas.tree = &newmt; - mas_reset(&newmas); - mas_reset(&mas); - mas.index = 0; - mas.last = 0; - rcu_read_lock(); - down_write(&newmt_lock); - if (mas_expected_entries(&newmas, nr_entries)) { - printk("OOM!"); + mt_init_flags(&newmt, + MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN); + mt_set_external_lock(&newmt, &newmt_lock); + + down_write_nested(&newmt_lock, SINGLE_DEPTH_NESTING); + ret = __mt_dup(&mt, &newmt, GFP_KERNEL); + if (ret) { + pr_err("OOM!"); BUG_ON(1); } - mas_for_each(&mas, val, ULONG_MAX) { - newmas.index = mas.index; - newmas.last = mas.last; + + 
mas_set(&newmas, 0); + mas_for_each(&newmas, val, ULONG_MAX) mas_store(&newmas, val); - } + mas_destroy(&newmas); - rcu_read_unlock(); mt_validate(&newmt); - mt_set_non_kernel(0); __mt_destroy(&newmt); up_write(&newmt_lock); } + mas_destroy(&mas); + __mt_destroy(&mt); + up_write(&mt_lock); } #endif @@ -2771,9 +2774,7 @@ static int __init maple_tree_seed(void) #endif #if defined(BENCH_FORK) #define BENCH - mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); - bench_forking(&tree); - mtree_destroy(&tree); + bench_forking(); goto skip; #endif #if defined(BENCH_MT_FOR_EACH) @@ -2792,9 +2793,7 @@ static int __init maple_tree_seed(void) check_iteration(&tree); mtree_destroy(&tree); - mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); - check_forking(&tree); - mtree_destroy(&tree); + check_forking(); mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); check_mas_store_gfp(&tree); diff --git a/tools/include/linux/rwsem.h b/tools/include/linux/rwsem.h index 83971b3cbfce..f8bffd4a987c 100644 --- a/tools/include/linux/rwsem.h +++ b/tools/include/linux/rwsem.h @@ -37,4 +37,8 @@ static inline int up_write(struct rw_semaphore *sem) { return pthread_rwlock_unlock(&sem->lock); } + +#define down_read_nested(sem, subclass) down_read(sem) +#define down_write_nested(sem, subclass) down_write(sem) + #endif /* _TOOLS_RWSEM_H */ From 3743b40f655249489c00b52a1481f789eba2f4bb Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Fri, 27 Oct 2023 11:38:44 +0800 Subject: [PATCH 109/139] FROMGIT: maple_tree: preserve the tree attributes when destroying maple tree When destroying maple tree, preserve its attributes and then turn it into an empty tree. This allows it to be reused without needing to be reinitialized. Link: https://lkml.kernel.org/r/20231027033845.90608-10-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Reviewed-by: Liam R. Howlett Cc: Christian Brauner Cc: Jonathan Corbet Cc: Mateusz Guzik Cc: Mathieu Desnoyers Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Mike Christie Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton (cherry picked from commit 8e50d32c7a89bde896945e4e572ef28ccd87bbf8 https://git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm mm-unstable) Bug: 308042511 Change-Id: If1725d5a37dcd26bec23e6bffe95d877903dfab1 Signed-off-by: Suren Baghdasaryan --- lib/maple_tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 3bc5414f78ab..1200ff73c1b0 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -6825,7 +6825,7 @@ void __mt_destroy(struct maple_tree *mt) if (xa_is_node(root)) mte_destroy_walk(root, mt); - mt->ma_flags = 0; + mt->ma_flags = mt_attr(mt); } EXPORT_SYMBOL_GPL(__mt_destroy); From ed9b660cd1ad6746e8357e9717981f0c7ceb73ce Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Fri, 27 Oct 2023 11:38:45 +0800 Subject: [PATCH 110/139] BACKPORT: FROMGIT fork: use __mt_dup() to duplicate maple tree in dup_mmap() In dup_mmap(), using __mt_dup() to duplicate the old maple tree and then directly replacing the entries of VMAs in the new maple tree can result in better performance. __mt_dup() uses DFS pre-order to duplicate the maple tree, so it is efficient. The average time complexity of __mt_dup() is O(n), where n is the number of VMAs. The proof of the time complexity is provided in the commit log that introduces __mt_dup(). After duplicating the maple tree, each element is traversed and replaced (ignoring the cases of deletion, which are rare). 
Since it is only a replacement operation for each element, this process is also O(n). Analyzing the exact time complexity of the previous algorithm is challenging because each insertion can involve appending to a node, pushing data to adjacent nodes, or even splitting nodes. The frequency of each action is difficult to calculate. The worst-case scenario for a single insertion is when the tree undergoes splitting at every level. If we consider each insertion as the worst-case scenario, we can determine that the upper bound of the time complexity is O(n*log(n)), although this is a loose upper bound. However, based on the test data, it appears that the actual time complexity is likely to be O(n). As the entire maple tree is duplicated using __mt_dup(), if dup_mmap() fails, there will be a portion of VMAs that have not been duplicated in the maple tree. To handle this, we mark the failure point with XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered, stop releasing VMAs that have not been duplicated after this point. There is a "spawn" in byte-unixbench[1], which can be used to test the performance of fork(). I modified it slightly to make it work with different number of VMAs. Below are the test results. The first row shows the number of VMAs. The second and third rows show the number of fork() calls per ten seconds, corresponding to next-20231006 and the this patchset, respectively. The test results were obtained with CPU binding to avoid scheduler load balancing that could cause unstable results. There are still some fluctuations in the test results, but at least they are better than the original performance. 21 121 221 421 821 1621 3221 6421 12821 25621 51221 112100 76261 54227 34035 20195 11112 6017 3161 1606 802 393 114558 83067 65008 45824 28751 16072 8922 4747 2436 1233 599 2.19% 8.92% 19.88% 34.64% 42.37% 44.64% 48.28% 50.17% 51.68% 53.74% 52.42% [1] https://github.com/kdlucas/byte-unixbench/tree/master Link: https://lkml.kernel.org/r/20231027033845.90608-11-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Suggested-by: Liam R. Howlett Reviewed-by: Liam R. Howlett Cc: Christian Brauner Cc: Jonathan Corbet Cc: Mateusz Guzik Cc: Mathieu Desnoyers Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Mike Christie Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton (cherry picked from commit d2406291483775ecddaee929231a39c70c08fda2 https://git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm mm-unstable) [surenb: open-coded vma_iter_clear_gfp(), vma_iter_bulk_store(); replaced vma_next() with mas_find()] Bug: 308042511 Change-Id: I42d6620e8ce6a0b16211c231a9b72ba16ba9c0d2 Signed-off-by: Suren Baghdasaryan --- kernel/fork.c | 40 +++++++++++++++++++++++++++++----------- mm/memory.c | 7 ++++++- mm/mmap.c | 9 ++++++--- 3 files changed, 41 insertions(+), 15 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index 9ef103c05891..1109a10c5ccd 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -659,7 +659,6 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, int retval; unsigned long charge = 0; LIST_HEAD(uf); - MA_STATE(old_mas, &oldmm->mm_mt, 0, 0); MA_STATE(mas, &mm->mm_mt, 0, 0); uprobe_start_dup_mmap(); @@ -687,16 +686,23 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, goto out; khugepaged_fork(mm, oldmm); - retval = mas_expected_entries(&mas, oldmm->map_count); - if (retval) + /* Use __mt_dup() to efficiently build an identical maple tree. 
*/ + retval = __mt_dup(&oldmm->mm_mt, &mm->mm_mt, GFP_KERNEL); + if (unlikely(retval)) goto out; mt_clear_in_rcu(mas.tree); - mas_for_each(&old_mas, mpnt, ULONG_MAX) { + mas_for_each(&mas, mpnt, ULONG_MAX) { struct file *file; vma_start_write(mpnt); if (mpnt->vm_flags & VM_DONTCOPY) { + __mas_set_range(&mas, mpnt->vm_start, mpnt->vm_end - 1); + mas_store_gfp(&mas, NULL, GFP_KERNEL); + if (unlikely(mas_is_err(&mas))) { + retval = -ENOMEM; + goto loop_out; + } vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt)); continue; } @@ -758,12 +764,13 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, if (is_vm_hugetlb_page(tmp)) hugetlb_dup_vma_private(tmp); - /* Link the vma into the MT */ + /* + * Link the vma into the MT. After using __mt_dup(), memory + * allocation is not necessary here, so it cannot fail. + */ mas.index = tmp->vm_start; mas.last = tmp->vm_end - 1; mas_store(&mas, tmp); - if (mas_is_err(&mas)) - goto fail_nomem_mas_store; mm->map_count++; if (!(tmp->vm_flags & VM_WIPEONFORK)) @@ -772,15 +779,28 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, if (tmp->vm_ops && tmp->vm_ops->open) tmp->vm_ops->open(tmp); - if (retval) + if (retval) { + mpnt = mas_find(&mas, ULONG_MAX); goto loop_out; + } } /* a new mm has just been created */ retval = arch_dup_mmap(oldmm, mm); loop_out: mas_destroy(&mas); - if (!retval) + if (!retval) { mt_set_in_rcu(mas.tree); + } else if (mpnt) { + /* + * The entire maple tree has already been duplicated. If the + * mmap duplication fails, mark the failure point with + * XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered, + * stop releasing VMAs that have not been duplicated after this + * point. + */ + mas_set_range(&mas, mpnt->vm_start, mpnt->vm_end - 1); + mas_store(&mas, XA_ZERO_ENTRY); + } out: mmap_write_unlock(mm); flush_tlb_mm(oldmm); @@ -790,8 +810,6 @@ fail_uprobe_end: uprobe_end_dup_mmap(); return retval; -fail_nomem_mas_store: - unlink_anon_vmas(tmp); fail_nomem_anon_vma_fork: mpol_put(vma_policy(tmp)); fail_nomem_policy: diff --git a/mm/memory.c b/mm/memory.c index 7f29d9394bdf..5e161551a5d1 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -411,6 +411,8 @@ void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt, * be 0. This will underflow and is okay. 
*/ next = mas_find(&mas, ceiling - 1); + if (unlikely(xa_is_zero(next))) + next = NULL; /* * Hide vma from rmap and truncate_pagecache before freeing @@ -432,6 +434,8 @@ void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt, && !is_vm_hugetlb_page(next)) { vma = next; next = mas_find(&mas, ceiling - 1); + if (unlikely(xa_is_zero(next))) + next = NULL; if (mm_wr_locked) vma_start_write(vma); unlink_anon_vmas(vma); @@ -1736,7 +1740,8 @@ void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt, do { unmap_single_vma(tlb, vma, start_addr, end_addr, &details, mm_wr_locked); - } while ((vma = mas_find(&mas, end_t - 1)) != NULL); + vma = mas_find(&mas, end_t - 1); + } while (vma && likely(!xa_is_zero(vma))); mmu_notifier_invalidate_range_end(&range); } diff --git a/mm/mmap.c b/mm/mmap.c index bd2140cfcf36..d5c48b243869 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3303,10 +3303,11 @@ void exit_mmap(struct mm_struct *mm) arch_exit_mmap(mm); vma = mas_find(&mas, ULONG_MAX); - if (!vma) { + if (!vma || unlikely(xa_is_zero(vma))) { /* Can happen if dup_mmap() received an OOM */ mmap_read_unlock(mm); - return; + mmap_write_lock(mm); + goto destroy; } lru_add_drain(); @@ -3339,11 +3340,13 @@ void exit_mmap(struct mm_struct *mm) remove_vma(vma, true); count++; cond_resched(); - } while ((vma = mas_find(&mas, ULONG_MAX)) != NULL); + vma = mas_find(&mas, ULONG_MAX); + } while (vma && likely(!xa_is_zero(vma))); BUG_ON(count != mm->map_count); trace_exit_mmap(mm); +destroy: __mt_destroy(&mm->mm_mt); mmap_write_unlock(mm); vm_unacct_memory(nr_accounted); From 8a597e7a2d06d699fb7b6c7787291e3916a502e2 Mon Sep 17 00:00:00 2001 From: Sebastian Ene Date: Tue, 31 Oct 2023 12:15:46 +0000 Subject: [PATCH 111/139] ANDROID: KVM: arm64: Don't prepopulate MMIO regions for host stage-2 As we reserve only 1GB of memory for the MMIO region don't prepopulate the entire remaining address space with MMIO as this is prone to failure. Instead, let the MMIO regions to be created lazily on the fault path and keep only the RAM regions prepopulated. Bug: 307805059 Test: Boot pKVM with CONFIG_ARM64_16K_PAGES Change-Id: I6327f42eb17c6588335a1e04736393c9032114ab Signed-off-by: Sebastian Ene --- arch/arm64/kvm/hyp/nvhe/mem_protect.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 2c1032a59826..b3920a37f334 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -149,22 +149,16 @@ static void prepare_host_vtcr(void) static int prepopulate_host_stage2(void) { struct memblock_region *reg; - u64 addr = 0; - int i, ret; + int i, ret = 0; for (i = 0; i < hyp_memblock_nr; i++) { reg = &hyp_memory[i]; - ret = host_stage2_idmap_locked(addr, reg->base - addr, PKVM_HOST_MMIO_PROT, false); - if (ret) - return ret; ret = host_stage2_idmap_locked(reg->base, reg->size, PKVM_HOST_MEM_PROT, false); if (ret) return ret; - addr = reg->base + reg->size; } - return host_stage2_idmap_locked(addr, BIT(host_mmu.pgt.ia_bits) - addr, PKVM_HOST_MMIO_PROT, - false); + return ret; } int kvm_host_prepare_stage2(void *pgt_pool_base) From 934a40576ee2c3f07dc59e38a339767d51a84238 Mon Sep 17 00:00:00 2001 From: Stanley Chang Date: Wed, 19 Apr 2023 10:00:42 +0800 Subject: [PATCH 112/139] UPSTREAM: usb: dwc3: core: add support for disabling High-speed park mode Setting the PARKMODE_DISABLE_HS bit in the DWC3_USB3_GUCTL1. 
When this bit is set to '1', all HS bus instances in park mode are disabled. For some USB wifi devices, enabling this feature can reduce performance. Therefore, add an option for disabling HS park mode via device tree. In Synopsys's dwc3 data book: In a few high speed devices, when an IN request is sent within 900ns of the ACK of the previous packet, these devices send a NAK. When connected to these devices, if required, the software can disable the park mode if you see performance drop in your system. When park mode is disabled, pipelining of multiple packets is disabled and instead one packet at a time is requested by the scheduler. This allows up to 12 NAKs in a micro-frame and improves performance of these slow devices. Bug: 300024866 Acked-by: Thinh Nguyen Signed-off-by: Stanley Chang Link: https://lore.kernel.org/r/20230419020044.15475-1-stanley_chang@realtek.com Signed-off-by: Greg Kroah-Hartman Signed-off-by: William Wu (cherry picked from commit d21a797a3eeb2b001e07ff943e5611eab67a71a3) Change-Id: I43ee416e54779a073a0ba4057edf4be8bd7886de Signed-off-by: Kever Yang --- drivers/usb/dwc3/core.c | 5 +++++ drivers/usb/dwc3/core.h | 4 ++++ 2 files changed, 9 insertions(+) diff --git a/drivers/usb/dwc3/core.c b/drivers/usb/dwc3/core.c index 3ee70ffaf003..2b01c3e05ebe 100644 --- a/drivers/usb/dwc3/core.c +++ b/drivers/usb/dwc3/core.c @@ -1233,6 +1233,9 @@ static int dwc3_core_init(struct dwc3 *dwc) if (dwc->parkmode_disable_ss_quirk) reg |= DWC3_GUCTL1_PARKMODE_DISABLE_SS; + if (dwc->parkmode_disable_hs_quirk) + reg |= DWC3_GUCTL1_PARKMODE_DISABLE_HS; + if (DWC3_VER_IS_WITHIN(DWC3, 290A, ANY) && (dwc->maximum_speed == USB_SPEED_HIGH || dwc->maximum_speed == USB_SPEED_FULL)) @@ -1539,6 +1542,8 @@ static void dwc3_get_properties(struct dwc3 *dwc) "snps,resume-hs-terminations"); dwc->parkmode_disable_ss_quirk = device_property_read_bool(dev, "snps,parkmode-disable-ss-quirk"); + dwc->parkmode_disable_hs_quirk = device_property_read_bool(dev, + "snps,parkmode-disable-hs-quirk"); dwc->gfladj_refclk_lpm_sel = device_property_read_bool(dev, "snps,gfladj-refclk-lpm-sel-quirk"); diff --git a/drivers/usb/dwc3/core.h b/drivers/usb/dwc3/core.h index 89219a14efb0..d21888658806 100644 --- a/drivers/usb/dwc3/core.h +++ b/drivers/usb/dwc3/core.h @@ -263,6 +263,7 @@ #define DWC3_GUCTL1_DEV_FORCE_20_CLK_FOR_30_CLK BIT(26) #define DWC3_GUCTL1_DEV_L1_EXIT_BY_HW BIT(24) #define DWC3_GUCTL1_PARKMODE_DISABLE_SS BIT(17) +#define DWC3_GUCTL1_PARKMODE_DISABLE_HS BIT(16) #define DWC3_GUCTL1_RESUME_OPMODE_HS_HOST BIT(10) /* Global Status Register */ @@ -1113,6 +1114,8 @@ struct dwc3_scratchpad_array { * generation after resume from suspend. * @parkmode_disable_ss_quirk: set if we need to disable all SuperSpeed * instances in park mode. + * @parkmode_disable_hs_quirk: set if we need to disable all HighSpeed + * instances in park mode. * @tx_de_emphasis_quirk: set if we enable Tx de-emphasis quirk * @tx_de_emphasis: Tx de-emphasis value * 0 - -6dB de-emphasis @@ -1330,6 +1333,7 @@ struct dwc3 { unsigned dis_tx_ipgap_linecheck_quirk:1; unsigned resume_hs_terminations:1; unsigned parkmode_disable_ss_quirk:1; + unsigned parkmode_disable_hs_quirk:1; unsigned gfladj_refclk_lpm_sel:1; unsigned tx_de_emphasis_quirk:1; From ef67750d99e2a666ee18a46aeb0615516845c089 Mon Sep 17 00:00:00 2001 From: Rick Yiu Date: Wed, 3 Jan 2024 06:56:23 +0000 Subject: [PATCH 113/139] ANDROID: sched: Export symbols for vendor modules Export sysctl_sched_min_granularity and sysctl_sched_idle_min_granularity.
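As an illustrative sketch of the consumer this enables (the module and function names below are hypothetical, not part of the patch), a vendor module reads the newly exported tunables through ordinary extern declarations:

/* Hypothetical vendor-module sketch: read-only use of the exported tunables. */
#include <linux/module.h>
#include <linux/printk.h>

extern unsigned int sysctl_sched_min_granularity;
extern unsigned int sysctl_sched_idle_min_granularity;

static int __init vendor_sched_example_init(void)
{
        /* Mirrors how sched_slice() consumes the values: read, never write. */
        pr_info("min_granularity=%u idle_min_granularity=%u\n",
                sysctl_sched_min_granularity,
                sysctl_sched_idle_min_granularity);
        return 0;
}
module_init(vendor_sched_example_init);

MODULE_LICENSE("GPL");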
The vendor module uses several functions that are static in GKI. Since we do not want to export those static functions, which would require making them non-static, we copied them into the vendor module; as a result, we need to export the symbols used inside those copied functions. For example, sysctl_sched_min_granularity and sysctl_sched_idle_min_granularity are referenced in sched_slice(), and they are only used read-only. Bug: 316276520 Change-Id: I976d0a1f3a70e8e60099e55fdd3cc99a90053fbb Signed-off-by: Rick Yiu --- kernel/sched/fair.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 507d1fc4e163..ac870e416e2a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -96,6 +96,7 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) */ unsigned int sysctl_sched_min_granularity = 750000ULL; +EXPORT_SYMBOL_GPL(sysctl_sched_min_granularity); static unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; /* @@ -105,6 +106,7 @@ static unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; * (default: 0.75 msec) */ unsigned int sysctl_sched_idle_min_granularity = 750000ULL; +EXPORT_SYMBOL_GPL(sysctl_sched_idle_min_granularity); /* * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity From ac90f0829243d0c6d2f9b219b519c0d0be696ab6 Mon Sep 17 00:00:00 2001 From: Rick Yiu Date: Wed, 3 Jan 2024 07:22:49 +0000 Subject: [PATCH 114/139] ANDROID: Update the ABI symbol list Adding the following symbols: - sysctl_sched_idle_min_granularity - sysctl_sched_min_granularity Bug: 316276520 Change-Id: I8e33c3105a3ca62d168a6289ceafc31404757453 Signed-off-by: Rick Yiu --- android/abi_gki_aarch64.stg | 20 ++++++++++++++++++++ android/abi_gki_aarch64_pixel | 2 ++ 2 files changed, 22 insertions(+) diff --git a/android/abi_gki_aarch64.stg b/android/abi_gki_aarch64.stg index 4a190c90a757..113fdec98f04 100644 --- a/android/abi_gki_aarch64.stg +++ b/android/abi_gki_aarch64.stg @@ -388738,6 +388738,15 @@ elf_symbol { type_id: 0x4585663f full_name: "sysctl_sched_features" } +elf_symbol { + id: 0xe6ea21b1 + name: "sysctl_sched_idle_min_granularity" + is_defined: true + symbol_type: OBJECT + crc: 0x69545cfa + type_id: 0x4585663f + full_name: "sysctl_sched_idle_min_granularity" +} elf_symbol { id: 0x87812861 name: "sysctl_sched_latency" @@ -388747,6 +388756,15 @@ elf_symbol { type_id: 0x4585663f full_name: "sysctl_sched_latency" } +elf_symbol { + id: 0x34555a8a + name: "sysctl_sched_min_granularity" + is_defined: true + symbol_type: OBJECT + crc: 0x04390257 + type_id: 0x4585663f + full_name: "sysctl_sched_min_granularity" +} elf_symbol { id: 0x18d0dd21 name: "sysctl_vals" @@ -405076,7 +405094,9 @@ interface { symbol_id: 0x2f857527 symbol_id: 0x3e5f4f82 symbol_id: 0xbf1515af + symbol_id: 0xe6ea21b1 symbol_id: 0x87812861 + symbol_id: 0x34555a8a symbol_id: 0x18d0dd21 symbol_id: 0x92705587 symbol_id: 0xdbe66171 diff --git a/android/abi_gki_aarch64_pixel b/android/abi_gki_aarch64_pixel index a01a49721a1a..1acd4b663615 100644 --- a/android/abi_gki_aarch64_pixel +++ b/android/abi_gki_aarch64_pixel @@ -2091,7 +2091,9 @@ synchronize_rcu syscon_regmap_lookup_by_phandle sysctl_sched_features + sysctl_sched_idle_min_granularity sysctl_sched_latency + sysctl_sched_min_granularity sysfs_add_file_to_group sysfs_add_link_to_group sysfs_create_file_ns From 98b0e4cf0968083bfeaf1b63e5b5873acf534566 Mon Sep 17 00:00:00 2001 From: Mathias Nyman Date: Fri, 15 Sep 2023 17:31:06 +0300
Subject: [PATCH 115/139] BACKPORT: xhci: track port suspend state correctly in unsuccessful resume cases xhci-hub.c tracks suspended ports in a suspended_port bitfield. This is checked when responding to a Get_Status(PORT) request to see if a port in running U0 state was recently resumed, and adds the required USB_PORT_STAT_C_SUSPEND change bit in those cases. The suspended_port bit was left uncleared if a device is disconnected during suspend. The bit remained set even when a new device was connected and enumerated. The set bit resulted in a incorrect Get_Status(PORT) response with a bogus USB_PORT_STAT_C_SUSPEND change bit set once the new device reached U0 link state. USB_PORT_STAT_C_SUSPEND change bit is only used for USB2 ports, but xhci-hub keeps track of both USB2 and USB3 suspended ports. Cc: stable@vger.kernel.org Reported-by: Wesley Cheng Closes: https://lore.kernel.org/linux-usb/d68aa806-b26a-0e43-42fb-b8067325e967@quicinc.com/ Fixes: 1d5810b6923c ("xhci: Rework port suspend structures for limited ports.") Tested-by: Wesley Cheng Signed-off-by: Mathias Nyman Link: https://lore.kernel.org/r/20230915143108.1532163-3-mathias.nyman@linux.intel.com Signed-off-by: Greg Kroah-Hartman Bug: 200589374 (cherry picked from commit d7cdfc319b2bcf6899ab0a05eec0958bc802a9a1 https://git.kernel.org/pub/scm/linux/kernel/git/gregkh/usb.git usb-next) [wcheng: modified change to remove dependency on updated resume timestamp tracking] Change-Id: Icccc1778a1f193b4b4c03532d291db88772bd454 Signed-off-by: Wesley Cheng --- drivers/usb/host/xhci-hub.c | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/drivers/usb/host/xhci-hub.c b/drivers/usb/host/xhci-hub.c index 544028862de0..10eb2175e749 100644 --- a/drivers/usb/host/xhci-hub.c +++ b/drivers/usb/host/xhci-hub.c @@ -1053,19 +1053,19 @@ static void xhci_get_usb3_port_status(struct xhci_port *port, u32 *status, *status |= USB_PORT_STAT_C_CONFIG_ERROR << 16; /* USB3 specific wPortStatus bits */ - if (portsc & PORT_POWER) { + if (portsc & PORT_POWER) *status |= USB_SS_PORT_STAT_POWER; - /* link state handling */ - if (link_state == XDEV_U0) - bus_state->suspended_ports &= ~(1 << portnum); - } - /* remote wake resume signaling complete */ - if (bus_state->port_remote_wakeup & (1 << portnum) && + /* no longer suspended or resuming */ + if (link_state != XDEV_U3 && link_state != XDEV_RESUME && link_state != XDEV_RECOVERY) { - bus_state->port_remote_wakeup &= ~(1 << portnum); - usb_hcd_end_port_resume(&hcd->self, portnum); + /* remote wake resume signaling complete */ + if (bus_state->port_remote_wakeup & (1 << portnum)) { + bus_state->port_remote_wakeup &= ~(1 << portnum); + usb_hcd_end_port_resume(&hcd->self, portnum); + } + bus_state->suspended_ports &= ~(1 << portnum); } xhci_hub_report_usb3_link_state(xhci, status, portsc); @@ -1111,6 +1111,21 @@ static void xhci_get_usb2_port_status(struct xhci_port *port, u32 *status, return; } } + + /* + * Clear usb2 resume signalling variables if port is no longer suspended + * or resuming. Port either resumed to U0/U1/U2, disconnected, or in a + * error state. Resume related variables should be cleared in all those cases. 
+ */ + if (link_state != XDEV_U3 && link_state != XDEV_RESUME) { + if (bus_state->resume_done[portnum] || + test_bit(portnum, &bus_state->resuming_ports)) { + bus_state->resume_done[portnum] = 0; + clear_bit(portnum, &bus_state->resuming_ports); + usb_hcd_end_port_resume(&port->rhub->hcd->self, portnum); + } + bus_state->suspended_ports &= ~(1 << portnum); + } } /* From ec46fe0ac7cb11d1f9b6b4709745af317a28c489 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 6 Dec 2023 09:30:40 +0100 Subject: [PATCH 116/139] UPSTREAM: bpf: Fix prog_array_map_poke_run map poke update commit 4b7de801606e504e69689df71475d27e35336fb3 upstream. Lee pointed out issue found by syscaller [0] hitting BUG in prog array map poke update in prog_array_map_poke_run function due to error value returned from bpf_arch_text_poke function. There's race window where bpf_arch_text_poke can fail due to missing bpf program kallsym symbols, which is accounted for with check for -EINVAL in that BUG_ON call. The problem is that in such case we won't update the tail call jump and cause imbalance for the next tail call update check which will fail with -EBUSY in bpf_arch_text_poke. I'm hitting following race during the program load: CPU 0 CPU 1 bpf_prog_load bpf_check do_misc_fixups prog_array_map_poke_track map_update_elem bpf_fd_array_map_update_elem prog_array_map_poke_run bpf_arch_text_poke returns -EINVAL bpf_prog_kallsyms_add After bpf_arch_text_poke (CPU 1) fails to update the tail call jump, the next poke update fails on expected jump instruction check in bpf_arch_text_poke with -EBUSY and triggers the BUG_ON in prog_array_map_poke_run. Similar race exists on the program unload. Fixing this by moving the update to bpf_arch_poke_desc_update function which makes sure we call __bpf_arch_text_poke that skips the bpf address check. Each architecture has slightly different approach wrt looking up bpf address in bpf_arch_text_poke, so instead of splitting the function or adding new 'checkip' argument in previous version, it seems best to move the whole map_poke_run update as arch specific code. [0] https://syzkaller.appspot.com/bug?extid=97a4fe20470e9bc30810 Bug: 309551558 Fixes: ebf7d1f508a7 ("bpf, x64: rework pro/epilogue and tailcall handling in JIT") Reported-by: syzbot+97a4fe20470e9bc30810@syzkaller.appspotmail.com Signed-off-by: Jiri Olsa Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Cc: Lee Jones Cc: Maciej Fijalkowski Link: https://lore.kernel.org/bpf/20231206083041.1306660-2-jolsa@kernel.org Signed-off-by: Greg Kroah-Hartman (cherry picked from commit 57a6b0a464eb322bd62a78469d251f1d428c5ebb) Signed-off-by: Lee Jones Change-Id: I251c3da579e5d48cd7de4043913fd42d0671d6b5 --- arch/x86/net/bpf_jit_comp.c | 46 +++++++++++++++++++++++++++++ include/linux/bpf.h | 3 ++ kernel/bpf/arraymap.c | 58 +++++++------------------------------ 3 files changed, 59 insertions(+), 48 deletions(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 5e680e039d0e..4686c1d9d0cf 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -2553,3 +2553,49 @@ void bpf_jit_free(struct bpf_prog *prog) bpf_prog_unlock_free(prog); } + +void bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke, + struct bpf_prog *new, struct bpf_prog *old) +{ + u8 *old_addr, *new_addr, *old_bypass_addr; + int ret; + + old_bypass_addr = old ? NULL : poke->bypass_addr; + old_addr = old ? (u8 *)old->bpf_func + poke->adj_off : NULL; + new_addr = new ? 
(u8 *)new->bpf_func + poke->adj_off : NULL; + + /* + * On program loading or teardown, the program's kallsym entry + * might not be in place, so we use __bpf_arch_text_poke to skip + * the kallsyms check. + */ + if (new) { + ret = __bpf_arch_text_poke(poke->tailcall_target, + BPF_MOD_JUMP, + old_addr, new_addr); + BUG_ON(ret < 0); + if (!old) { + ret = __bpf_arch_text_poke(poke->tailcall_bypass, + BPF_MOD_JUMP, + poke->bypass_addr, + NULL); + BUG_ON(ret < 0); + } + } else { + ret = __bpf_arch_text_poke(poke->tailcall_bypass, + BPF_MOD_JUMP, + old_bypass_addr, + poke->bypass_addr); + BUG_ON(ret < 0); + /* let other CPUs finish the execution of program + * so that it will not possible to expose them + * to invalid nop, stack unwind, nop state + */ + if (!ret) + synchronize_rcu(); + ret = __bpf_arch_text_poke(poke->tailcall_target, + BPF_MOD_JUMP, + old_addr, NULL); + BUG_ON(ret < 0); + } +} diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 320d3b287ed0..6a6ff277501d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2697,6 +2697,9 @@ enum bpf_text_poke_type { int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, void *addr1, void *addr2); +void bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke, + struct bpf_prog *new, struct bpf_prog *old); + void *bpf_arch_text_copy(void *dst, void *src, size_t len); int bpf_arch_text_invalidate(void *dst, size_t len); diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 832b2659e96e..00f23febb9a7 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -997,11 +997,16 @@ static void prog_array_map_poke_untrack(struct bpf_map *map, mutex_unlock(&aux->poke_mutex); } +void __weak bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke, + struct bpf_prog *new, struct bpf_prog *old) +{ + WARN_ON_ONCE(1); +} + static void prog_array_map_poke_run(struct bpf_map *map, u32 key, struct bpf_prog *old, struct bpf_prog *new) { - u8 *old_addr, *new_addr, *old_bypass_addr; struct prog_poke_elem *elem; struct bpf_array_aux *aux; @@ -1010,7 +1015,7 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key, list_for_each_entry(elem, &aux->poke_progs, list) { struct bpf_jit_poke_descriptor *poke; - int i, ret; + int i; for (i = 0; i < elem->aux->size_poke_tab; i++) { poke = &elem->aux->poke_tab[i]; @@ -1029,21 +1034,10 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key, * activated, so tail call updates can arrive from here * while JIT is still finishing its final fixup for * non-activated poke entries. - * 3) On program teardown, the program's kallsym entry gets - * removed out of RCU callback, but we can only untrack - * from sleepable context, therefore bpf_arch_text_poke() - * might not see that this is in BPF text section and - * bails out with -EINVAL. As these are unreachable since - * RCU grace period already passed, we simply skip them. - * 4) Also programs reaching refcount of zero while patching + * 3) Also programs reaching refcount of zero while patching * is in progress is okay since we're protected under * poke_mutex and untrack the programs before the JIT - * buffer is freed. When we're still in the middle of - * patching and suddenly kallsyms entry of the program - * gets evicted, we just skip the rest which is fine due - * to point 3). - * 5) Any other error happening below from bpf_arch_text_poke() - * is a unexpected bug. + * buffer is freed. 
*/ if (!READ_ONCE(poke->tailcall_target_stable)) continue; @@ -1053,39 +1047,7 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key, poke->tail_call.key != key) continue; - old_bypass_addr = old ? NULL : poke->bypass_addr; - old_addr = old ? (u8 *)old->bpf_func + poke->adj_off : NULL; - new_addr = new ? (u8 *)new->bpf_func + poke->adj_off : NULL; - - if (new) { - ret = bpf_arch_text_poke(poke->tailcall_target, - BPF_MOD_JUMP, - old_addr, new_addr); - BUG_ON(ret < 0 && ret != -EINVAL); - if (!old) { - ret = bpf_arch_text_poke(poke->tailcall_bypass, - BPF_MOD_JUMP, - poke->bypass_addr, - NULL); - BUG_ON(ret < 0 && ret != -EINVAL); - } - } else { - ret = bpf_arch_text_poke(poke->tailcall_bypass, - BPF_MOD_JUMP, - old_bypass_addr, - poke->bypass_addr); - BUG_ON(ret < 0 && ret != -EINVAL); - /* let other CPUs finish the execution of program - * so that it will not possible to expose them - * to invalid nop, stack unwind, nop state - */ - if (!ret) - synchronize_rcu(); - ret = bpf_arch_text_poke(poke->tailcall_target, - BPF_MOD_JUMP, - old_addr, NULL); - BUG_ON(ret < 0 && ret != -EINVAL); - } + bpf_arch_poke_desc_update(poke, new, old); } } } From 02f444ba07d22d35f4f2ddbf7d7da0643811af15 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 28 Oct 2023 07:30:27 -0600 Subject: [PATCH 117/139] UPSTREAM: io_uring/fdinfo: lock SQ thread while retrieving thread cpu/pid commit 7644b1a1c9a7ae8ab99175989bfc8676055edb46 upstream. We could race with SQ thread exit, and if we do, we'll hit a NULL pointer dereference when the thread is cleared. Grab the SQPOLL data lock before attempting to get the task cpu and pid for fdinfo, this ensures we have a stable view of it. Bug: 309790656 Cc: stable@vger.kernel.org Link: https://bugzilla.kernel.org/show_bug.cgi?id=218032 Reviewed-by: Gabriel Krisman Bertazi Signed-off-by: Jens Axboe Signed-off-by: Sasha Levin (cherry picked from commit 9236d2ea6465b37c0a73d994c1ad31753d31e5f5) Signed-off-by: Lee Jones Change-Id: I044e0285d4535440606ff593230b873e3145db91 --- io_uring/fdinfo.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c index 882bd56b01ed..ea2c2ded4e41 100644 --- a/io_uring/fdinfo.c +++ b/io_uring/fdinfo.c @@ -51,7 +51,6 @@ static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id, static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m) { - struct io_sq_data *sq = NULL; struct io_overflow_cqe *ocqe; struct io_rings *r = ctx->rings; unsigned int sq_mask = ctx->sq_entries - 1, cq_mask = ctx->cq_entries - 1; @@ -62,6 +61,7 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, unsigned int cq_shift = 0; unsigned int sq_shift = 0; unsigned int sq_entries, cq_entries; + int sq_pid = -1, sq_cpu = -1; bool has_lock; unsigned int i; @@ -139,13 +139,19 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, has_lock = mutex_trylock(&ctx->uring_lock); if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) { - sq = ctx->sq_data; - if (!sq->thread) - sq = NULL; + struct io_sq_data *sq = ctx->sq_data; + + if (mutex_trylock(&sq->lock)) { + if (sq->thread) { + sq_pid = task_pid_nr(sq->thread); + sq_cpu = task_cpu(sq->thread); + } + mutex_unlock(&sq->lock); + } } - seq_printf(m, "SqThread:\t%d\n", sq ? task_pid_nr(sq->thread) : -1); - seq_printf(m, "SqThreadCpu:\t%d\n", sq ? 
task_cpu(sq->thread) : -1); + seq_printf(m, "SqThread:\t%d\n", sq_pid); + seq_printf(m, "SqThreadCpu:\t%d\n", sq_cpu); seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files); for (i = 0; has_lock && i < ctx->nr_user_files; i++) { struct file *f = io_file_from_index(&ctx->file_table, i); From 29544d41573d0085debd0416803ec5287948fccd Mon Sep 17 00:00:00 2001 From: Zhipeng Wang Date: Fri, 5 Jan 2024 19:17:45 +0900 Subject: [PATCH 118/139] ANDROID: ABI: Update symbol list for imx 1 function symbol(s) added 'bool iio_trigger_using_own(struct iio_dev*)' Bug: 318788290 Change-Id: I5b17b2380f7087dabf51f4ed207e9ea4cab1ba38 Signed-off-by: Zhipeng Wang --- android/abi_gki_aarch64.stg | 10 ++++++++++ android/abi_gki_aarch64_imx | 1 + 2 files changed, 11 insertions(+) diff --git a/android/abi_gki_aarch64.stg b/android/abi_gki_aarch64.stg index 113fdec98f04..dc2644ea5502 100644 --- a/android/abi_gki_aarch64.stg +++ b/android/abi_gki_aarch64.stg @@ -365298,6 +365298,15 @@ elf_symbol { type_id: 0x16dc304e full_name: "iio_trigger_unregister" } +elf_symbol { + id: 0xfb09b362 + name: "iio_trigger_using_own" + is_defined: true + symbol_type: FUNCTION + crc: 0xe2c1359e + type_id: 0xf886bca4 + full_name: "iio_trigger_using_own" +} elf_symbol { id: 0xdf3e8655 name: "iio_update_buffers" @@ -402489,6 +402498,7 @@ interface { symbol_id: 0x7551a60b symbol_id: 0x08fd4b84 symbol_id: 0xc6d8f246 + symbol_id: 0xfb09b362 symbol_id: 0xdf3e8655 symbol_id: 0x6f2f4bd1 symbol_id: 0xf87ecda4 diff --git a/android/abi_gki_aarch64_imx b/android/abi_gki_aarch64_imx index c3797c477a4c..1b556c8705f6 100644 --- a/android/abi_gki_aarch64_imx +++ b/android/abi_gki_aarch64_imx @@ -1025,6 +1025,7 @@ iio_trigger_poll_chained iio_trigger_register iio_trigger_unregister + iio_trigger_using_own import_iovec in4_pton inet_csk_get_port From d6554d1262c4f39c7b652e02e54bb692e7f20747 Mon Sep 17 00:00:00 2001 From: Wesley Cheng Date: Tue, 21 Nov 2023 14:52:53 -0800 Subject: [PATCH 119/139] FROMGIT: usb: dwc3: gadget: Handle EP0 request dequeuing properly Current EP0 dequeue path will share the same as other EPs. However, there are some special considerations that need to be made for EP0 transfers: - EP0 transfers never transition into the started_list - EP0 only has one active request at a time In case there is a vendor specific control message for a function over USB FFS, then there is no guarantee on the timeline which the DATA/STATUS stage is responded to. While this occurs, any attempt to end transfers on non-control EPs will end up having the DWC3_EP_DELAY_STOP flag set, and defer issuing of the end transfer command. If the USB FFS application decides to timeout the control transfer, or if USB FFS AIO path exits, the USB FFS driver will issue a call to usb_ep_dequeue() for the ep0 request. In case of the AIO exit path, the AIO FS blocks until all pending USB requests utilizing the AIO path is completed. However, since the dequeue of ep0 req does not happen properly, all non-control EPs with the DWC3_EP_DELAY_STOP flag set will not be handled, and the AIO exit path will be stuck waiting for the USB FFS data endpoints to receive a completion callback. Fix is to utilize dwc3_ep0_reset_state() in the dequeue API to ensure EP0 is brought back to the SETUP state, and ensures that any deferred end transfer commands are handled. This also will end any active transfers on EP0, compared to the previous implementation which directly called giveback only. 
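As an illustrative sketch of the kind of caller this change fixes (the function driver type, timer field, and request field below are hypothetical; usb_ep_dequeue() and the composite ep0 layout are real), a function driver timing out an unanswered vendor control transfer simply dequeues its ep0 request, which now resets EP0 to the SETUP state and flushes any deferred end transfers:

/*
 * Hypothetical sketch: a gadget function driver abandoning a stalled
 * vendor-specific control transfer. After this patch, dequeuing the
 * ep0 request ends the active transfer via dwc3_ep0_reset_state()
 * instead of only giving the request back.
 */
#include <linux/timer.h>
#include <linux/usb/composite.h>

struct f_vendor {                       /* hypothetical function driver state */
        struct usb_composite_dev *cdev;
        struct usb_request *ctrl_req;
        struct timer_list ctrl_timer;
};

static void f_vendor_ctrl_timeout(struct timer_list *t)
{
        struct f_vendor *vf = from_timer(vf, t, ctrl_timer);

        /* ep0 is the control endpoint shared by all composite functions. */
        usb_ep_dequeue(vf->cdev->gadget->ep0, vf->ctrl_req);
}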
Fixes: fcd2def66392 ("usb: dwc3: gadget: Refactor dwc3_gadget_ep_dequeue") Acked-by: Thinh Nguyen Signed-off-by: Wesley Cheng Bug: 318577849 Change-Id: Ic00684db4b502f1aab128f7e49f22510dda24f60 (cherry picked from commit 730e12fbec53ab59dd807d981a204258a4cfb29a https://git.kernel.org/pub/scm/linux/kernel/git/gregkh/usb.git usb-testing) Signed-off-by: Wesley Cheng --- drivers/usb/dwc3/gadget.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c index 6fdac0fae461..121092e35ec6 100644 --- a/drivers/usb/dwc3/gadget.c +++ b/drivers/usb/dwc3/gadget.c @@ -2093,7 +2093,17 @@ static int dwc3_gadget_ep_dequeue(struct usb_ep *ep, list_for_each_entry(r, &dep->pending_list, list) { if (r == req) { - dwc3_gadget_giveback(dep, req, -ECONNRESET); + /* + * Explicitly check for EP0/1 as dequeue for those + * EPs need to be handled differently. Control EP + * only deals with one USB req, and giveback will + * occur during dwc3_ep0_stall_and_restart(). EP0 + * requests are never added to started_list. + */ + if (dep->number > 1) + dwc3_gadget_giveback(dep, req, -ECONNRESET); + else + dwc3_ep0_reset_state(dwc); goto out; } } From 02aa72665c85f3398ada5ba37bacd56732799e11 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 2 Oct 2023 13:54:28 +0300 Subject: [PATCH 120/139] UPSTREAM: nvmet-tcp: Fix a possible UAF in queue initialization setup commit d920abd1e7c4884f9ecd0749d1921b7ab19ddfbd upstream. From Alon: "Due to a logical bug in the NVMe-oF/TCP subsystem in the Linux kernel, a malicious user can cause a UAF and a double free, which may lead to RCE (may also lead to an LPE in case the attacker already has local privileges)." Hence, when a queue initialization fails after the ahash requests are allocated, it is guaranteed that the queue removal async work will be called, so leave the deallocation to the queue removal. Also, be extra careful not to continue processing the socket, so set queue rcv_state to NVMET_TCP_RECV_ERR upon a socket error.
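To make the ownership rule concrete, here is a minimal sketch of the fixed pattern (every name below is a stand-in, not a real nvmet-tcp symbol): once the ahash requests exist, the icreq error path only propagates the error, and the queue-removal work remains the single free site, so no double free is possible:

/* Simplified sketch of the single-owner cleanup rule the fix enforces. */
struct demo_queue {
        int state;
        bool crypto_allocated;
};

static int demo_handle_icreq(struct demo_queue *queue)
{
        int ret = demo_send_icresp(queue);      /* hypothetical send helper */

        if (ret < 0)
                return ret;     /* no free here; removal work owns cleanup */

        queue->state = DEMO_Q_LIVE;             /* hypothetical state value */
        return 0;
}

/* Called exactly once from the queue-removal work: the only free site. */
static void demo_release_queue(struct demo_queue *queue)
{
        if (queue->crypto_allocated)
                demo_free_crypto(queue);        /* hypothetical */
}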
Bug: 310114968 Cc: stable@vger.kernel.org Reported-by: Alon Zahavi Tested-by: Alon Zahavi Signed-off-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Signed-off-by: Keith Busch Signed-off-by: Greg Kroah-Hartman (cherry picked from commit e985d78bdcf37f7ef73666a43b0d2407715f00d3) Signed-off-by: Lee Jones Change-Id: Ifd7ec8294182a6bf6d8c261aeda5d989e909f7ff --- drivers/nvme/target/tcp.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index 5e29da94f72d..355d80323b83 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -345,6 +345,7 @@ static void nvmet_tcp_fatal_error(struct nvmet_tcp_queue *queue) static void nvmet_tcp_socket_error(struct nvmet_tcp_queue *queue, int status) { + queue->rcv_state = NVMET_TCP_RECV_ERR; if (status == -EPIPE || status == -ECONNRESET) kernel_sock_shutdown(queue->sock, SHUT_RDWR); else @@ -871,15 +872,11 @@ static int nvmet_tcp_handle_icreq(struct nvmet_tcp_queue *queue) iov.iov_len = sizeof(*icresp); ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len); if (ret < 0) - goto free_crypto; + return ret; /* queue removal will cleanup */ queue->state = NVMET_TCP_Q_LIVE; nvmet_prepare_receive_pdu(queue); return 0; -free_crypto: - if (queue->hdr_digest || queue->data_digest) - nvmet_tcp_free_crypto(queue); - return ret; } static void nvmet_tcp_handle_req_failure(struct nvmet_tcp_queue *queue, From 5070b3b594e321d5abf9613b15f662c6638ea959 Mon Sep 17 00:00:00 2001 From: Zhengchao Shao Date: Thu, 23 Nov 2023 15:13:14 +0800 Subject: [PATCH 121/139] UPSTREAM: ipv4: igmp: fix refcnt uaf issue when receiving igmp query packet [ Upstream commit e2b706c691905fe78468c361aaabc719d0a496f1 ] When I perform the following test operations: 1.ip link add br0 type bridge 2.brctl addif br0 eth0 3.ip addr add 239.0.0.1/32 dev eth0 4.ip addr add 239.0.0.1/32 dev br0 5.ip addr add 224.0.0.1/32 dev br0 6.while ((1)) do ifconfig br0 up ifconfig br0 down done 7.send IGMPv2 query packets to port eth0 continuously. For example, ./mausezahn ethX -c 0 "01 00 5e 00 00 01 00 72 19 88 aa 02 08 00 45 00 00 1c 00 01 00 00 01 02 0e 7f c0 a8 0a b7 e0 00 00 01 11 64 ee 9b 00 00 00 00" The preceding tests may trigger the refcnt uaf issue of the mc list. The stack is as follows: refcount_t: addition on 0; use-after-free. 
WARNING: CPU: 21 PID: 144 at lib/refcount.c:25 refcount_warn_saturate (lib/refcount.c:25) CPU: 21 PID: 144 Comm: ksoftirqd/21 Kdump: loaded Not tainted 6.7.0-rc1-next-20231117-dirty #80 Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011 RIP: 0010:refcount_warn_saturate (lib/refcount.c:25) RSP: 0018:ffffb68f00657910 EFLAGS: 00010286 RAX: 0000000000000000 RBX: ffff8a00c3bf96c0 RCX: ffff8a07b6160908 RDX: 00000000ffffffd8 RSI: 0000000000000027 RDI: ffff8a07b6160900 RBP: ffff8a00cba36862 R08: 0000000000000000 R09: 00000000ffff7fff R10: ffffb68f006577c0 R11: ffffffffb0fdcdc8 R12: ffff8a00c3bf9680 R13: ffff8a00c3bf96f0 R14: 0000000000000000 R15: ffff8a00d8766e00 FS: 0000000000000000(0000) GS:ffff8a07b6140000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000055f10b520b28 CR3: 000000039741a000 CR4: 00000000000006f0 Call Trace: igmp_heard_query (net/ipv4/igmp.c:1068) igmp_rcv (net/ipv4/igmp.c:1132) ip_protocol_deliver_rcu (net/ipv4/ip_input.c:205) ip_local_deliver_finish (net/ipv4/ip_input.c:234) __netif_receive_skb_one_core (net/core/dev.c:5529) netif_receive_skb_internal (net/core/dev.c:5729) netif_receive_skb (net/core/dev.c:5788) br_handle_frame_finish (net/bridge/br_input.c:216) nf_hook_bridge_pre (net/bridge/br_input.c:294) __netif_receive_skb_core (net/core/dev.c:5423) __netif_receive_skb_list_core (net/core/dev.c:5606) __netif_receive_skb_list (net/core/dev.c:5674) netif_receive_skb_list_internal (net/core/dev.c:5764) napi_gro_receive (net/core/gro.c:609) e1000_clean_rx_irq (drivers/net/ethernet/intel/e1000/e1000_main.c:4467) e1000_clean (drivers/net/ethernet/intel/e1000/e1000_main.c:3805) __napi_poll (net/core/dev.c:6533) net_rx_action (net/core/dev.c:6735) __do_softirq (kernel/softirq.c:554) run_ksoftirqd (kernel/softirq.c:913) smpboot_thread_fn (kernel/smpboot.c:164) kthread (kernel/kthread.c:388) ret_from_fork (arch/x86/kernel/process.c:153) ret_from_fork_asm (arch/x86/entry/entry_64.S:250) The root causes are as follows: Thread A Thread B ... netif_receive_skb br_dev_stop ... br_multicast_leave_snoopers ... __ip_mc_dec_group ... __igmp_group_dropped igmp_rcv igmp_stop_timer igmp_heard_query //ref = 1 ip_ma_put igmp_mod_timer refcount_dec_and_test igmp_start_timer //ref = 0 ... refcount_inc //ref increases from 0 When the device receives an IGMPv2 Query message, it starts the timer immediately, regardless of whether the device is running. If the device is down and has left the multicast group, it will cause the mc list refcount uaf issue. Bug: 316932391 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Zhengchao Shao Reviewed-by: Eric Dumazet Reviewed-by: Hangbin Liu Signed-off-by: David S. 
Miller Signed-off-by: Sasha Levin (cherry picked from commit 94445d9583079e0ccc5dde1370076ff24800d86e) Signed-off-by: Lee Jones Change-Id: I277be2304e564994e05b981ccd6cd8cbb9dc85be --- net/ipv4/igmp.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index cbc4816ed7d8..ac53ef7eec91 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -216,8 +216,10 @@ static void igmp_start_timer(struct ip_mc_list *im, int max_delay) int tv = prandom_u32_max(max_delay); im->tm_running = 1; - if (!mod_timer(&im->timer, jiffies+tv+2)) - refcount_inc(&im->refcnt); + if (refcount_inc_not_zero(&im->refcnt)) { + if (mod_timer(&im->timer, jiffies + tv + 2)) + ip_ma_put(im); + } } static void igmp_gq_start_timer(struct in_device *in_dev) From c5dc4b4b3d11ade04153efa22dbaa77159d73be5 Mon Sep 17 00:00:00 2001 From: Jia-Shiuan Chen Date: Tue, 9 Jan 2024 07:12:12 +0000 Subject: [PATCH 122/139] ANDROID: Update the ABI symbol list Adding the following symbols: - dma_fence_array_ops Bug: 319196045 Change-Id: Id65c62e0aedd65c9c72d71c8e39f7fae1e1de740 Signed-off-by: Jia-Shiuan Chen --- android/abi_gki_aarch64_pixel | 1 + 1 file changed, 1 insertion(+) diff --git a/android/abi_gki_aarch64_pixel b/android/abi_gki_aarch64_pixel index 1acd4b663615..353946f63fd3 100644 --- a/android/abi_gki_aarch64_pixel +++ b/android/abi_gki_aarch64_pixel @@ -543,6 +543,7 @@ dmaengine_unmap_put dma_fence_add_callback dma_fence_array_create + dma_fence_array_ops dma_fence_context_alloc dma_fence_default_wait dma_fence_enable_sw_signaling From 031f804149b99bcec0f52fdfcf89b85a8141a5da Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Mon, 27 Nov 2023 09:31:46 +0000 Subject: [PATCH 123/139] ANDROID: KVM: arm64: Avoid BUG-ing from the host abort path Under certain circumstances __get_fault_info() may resolve the faulting address using the AT instruction. Given that this is being done outside of the host lock critical section, it is racy and the resolution via AT may fail. We currently BUG() in this situation, which is obviously less than ideal. Moving the address resolution to the critical section may have a performance impact, so let's keep it where it is, but bail out and return to the host to try a second time. Bug: 311830307 Change-Id: I26d61b04a4ccf040bd31802abb3c6b998ff4a48b Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/nvhe/mem_protect.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index b3920a37f334..3a5193ca0fb3 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -875,7 +875,14 @@ void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt) int ret = -EPERM; esr = read_sysreg_el2(SYS_ESR); - BUG_ON(!__get_fault_info(esr, &fault)); + if (!__get_fault_info(esr, &fault)) { + addr = (u64)-1; + /* + * We've presumably raced with a page-table change which caused + * AT to fail, try again. 
+ */ + goto return_to_host; + } fault.esr_el2 = esr; addr = (fault.hpfar_el2 & HPFAR_MASK) << 8; @@ -902,6 +909,7 @@ void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt) else BUG_ON(ret && ret != -EAGAIN); +return_to_host: trace_host_mem_abort(esr, addr); } From 928b3b5dde28ee2c7e85888e13ee2593ed864532 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 19 Dec 2023 19:44:49 +0100 Subject: [PATCH 124/139] UPSTREAM: netfilter: nf_tables: skip set commit for deleted/destroyed sets commit 7315dc1e122c85ffdfc8defffbb8f8b616c2eb1a upstream. NFT_MSG_DELSET deactivates all elements in the set, skip set->ops->commit() to avoid the unnecessary clone (for the pipapo case) as well as the sync GC cycle, which could deactivate again expired elements in such set. Bug: 318548348 Fixes: 5f68718b34a5 ("netfilter: nf_tables: GC transaction API to avoid race with control plane") Reported-by: Kevin Rich Signed-off-by: Pablo Neira Ayuso Signed-off-by: Greg Kroah-Hartman (cherry picked from commit 0105571f80edb96f81bb4bbdd5233a9130dc345b) Signed-off-by: Lee Jones Change-Id: Ie733688e27d9568d797fc1bc477261883b7dc8c1 --- net/netfilter/nf_tables_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 6b5f22dc1d94..30802f7f2114 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -9474,7 +9474,7 @@ static void nft_set_commit_update(struct list_head *set_update_list) list_for_each_entry_safe(set, next, set_update_list, pending_update) { list_del_init(&set->pending_update); - if (!set->ops->commit) + if (!set->ops->commit || set->dead) continue; set->ops->commit(set); From c1b1201d39dcb59bf20e45f40459ea25b37392b7 Mon Sep 17 00:00:00 2001 From: Dezhi Huang Date: Mon, 25 Dec 2023 15:06:46 +0800 Subject: [PATCH 125/139] BACKPORT: FROMLIST: dma-buf: Move sysfs work out of DMA-BUF export path We have identified an animation lag issue on our Android 14-6.1 product which seems to be caused by contention in the rwsem lock during the dmabuf request process. It appears that other processes are holding sysfs read locks, resulting in the blocking of dmabuf sysfs node creation. We encountered an issue in android14-6.1 that is similar to the problem described in [1]. So we cherry-pick this commit to android14-6.1. [1] https://android-review.googlesource.com/c/kernel/common/+/2111974 Bug: 311282169 Bug: 206979019 Link: https://lore.kernel.org/lkml/CABdmKX2dNYhgOYdrrJU6-jt6F=LjCidbKhR6t4F7yaa0SPr+-A@mail.gmail.com/T/ Signed-off-by: Dezhi Huang Conflicts: include/linux/dma-buf.h 1. The android14-6.1 KMI is frozen, and the modification to struct dma_buf_sysfs_entry in the original patch triggers ABI check failures. Instead of an anonymous union, use the existing struct kobject directly as a work_struct with type punning. Signed-off-by: T.J. 
Mercier Change-Id: Ic0386849b6b248b0a72215633fc1a50782455bac --- drivers/dma-buf/dma-buf-sysfs-stats.c | 66 +++++++++++++++++++++------ drivers/dma-buf/dma-buf.c | 16 +++++-- 2 files changed, 63 insertions(+), 19 deletions(-) diff --git a/drivers/dma-buf/dma-buf-sysfs-stats.c b/drivers/dma-buf/dma-buf-sysfs-stats.c index 4b680e10c15a..46520c4d8ec9 100644 --- a/drivers/dma-buf/dma-buf-sysfs-stats.c +++ b/drivers/dma-buf/dma-buf-sysfs-stats.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "dma-buf-sysfs-stats.h" @@ -168,35 +169,72 @@ void dma_buf_uninit_sysfs_statistics(void) kset_unregister(dma_buf_stats_kset); } +static void sysfs_add_workfn(struct work_struct *work) +{ + /* The ABI would have to change for this to be false, but let's be paranoid. */ + _Static_assert(sizeof(struct kobject) >= sizeof(struct work_struct), + "kobject is smaller than work_struct"); + + struct dma_buf_sysfs_entry *sysfs_entry = + container_of((struct kobject *)work, struct dma_buf_sysfs_entry, kobj); + struct dma_buf *dmabuf = sysfs_entry->dmabuf; + + /* + * A dmabuf is ref-counted via its file member. If this handler holds the only + * reference to the dmabuf, there is no need for sysfs kobject creation. This is an + * optimization and a race; when the reference count drops to 1 immediately after + * this check it is not harmful as the sysfs entry will still get cleaned up in + * dma_buf_stats_teardown, which won't get called until the final dmabuf reference + * is released, and that can't happen until the end of this function. + */ + if (file_count(dmabuf->file) > 1) { + /* + * kobject_init_and_add expects kobject to be zero-filled, but we have populated it + * (the sysfs_add_work union member) to trigger this work function. + */ + memset(&dmabuf->sysfs_entry->kobj, 0, sizeof(dmabuf->sysfs_entry->kobj)); + dmabuf->sysfs_entry->kobj.kset = dma_buf_per_buffer_stats_kset; + if (kobject_init_and_add(&dmabuf->sysfs_entry->kobj, &dma_buf_ktype, NULL, + "%lu", file_inode(dmabuf->file)->i_ino)) { + kobject_put(&dmabuf->sysfs_entry->kobj); + dmabuf->sysfs_entry = NULL; + } + } else { + /* + * Free the sysfs_entry and reset the pointer so dma_buf_stats_teardown doesn't + * attempt to operate on it. + */ + kfree(dmabuf->sysfs_entry); + dmabuf->sysfs_entry = NULL; + } + dma_buf_put(dmabuf); +} + int dma_buf_stats_setup(struct dma_buf *dmabuf, struct file *file) { struct dma_buf_sysfs_entry *sysfs_entry; - int ret; + struct work_struct *work; if (!dmabuf->exp_name) { pr_err("exporter name must not be empty if stats needed\n"); return -EINVAL; } - sysfs_entry = kzalloc(sizeof(struct dma_buf_sysfs_entry), GFP_KERNEL); + sysfs_entry = kmalloc(sizeof(struct dma_buf_sysfs_entry), GFP_KERNEL); if (!sysfs_entry) return -ENOMEM; - sysfs_entry->kobj.kset = dma_buf_per_buffer_stats_kset; sysfs_entry->dmabuf = dmabuf; - dmabuf->sysfs_entry = sysfs_entry; - /* create the directory for buffer stats */ - ret = kobject_init_and_add(&sysfs_entry->kobj, &dma_buf_ktype, NULL, - "%lu", file_inode(file)->i_ino); - if (ret) - goto err_sysfs_dmabuf; + /* + * The use of kobj as a work_struct is an ugly hack + * to avoid an ABI break in this frozen kernel. + */ + work = (struct work_struct *)&dmabuf->sysfs_entry->kobj; + INIT_WORK(work, sysfs_add_workfn); + get_dma_buf(dmabuf); /* This reference will be dropped in sysfs_add_workfn. 
*/ + schedule_work(work); return 0; - -err_sysfs_dmabuf: - kobject_put(&sysfs_entry->kobj); - dmabuf->sysfs_entry = NULL; - return ret; } diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c index e0d42ee76b43..fbe8a07552ef 100644 --- a/drivers/dma-buf/dma-buf.c +++ b/drivers/dma-buf/dma-buf.c @@ -727,10 +727,6 @@ struct dma_buf *dma_buf_export(const struct dma_buf_export_info *exp_info) dmabuf->resv = resv; } - ret = dma_buf_stats_setup(dmabuf, file); - if (ret) - goto err_dmabuf; - file->private_data = dmabuf; file->f_path.dentry->d_fsdata = dmabuf; dmabuf->file = file; @@ -739,9 +735,19 @@ struct dma_buf *dma_buf_export(const struct dma_buf_export_info *exp_info) list_add(&dmabuf->list_node, &db_list.head); mutex_unlock(&db_list.lock); + ret = dma_buf_stats_setup(dmabuf, file); + if (ret) + goto err_sysfs; + return dmabuf; -err_dmabuf: +err_sysfs: + mutex_lock(&db_list.lock); + list_del(&dmabuf->list_node); + mutex_unlock(&db_list.lock); + dmabuf->file = NULL; + file->f_path.dentry->d_fsdata = NULL; + file->private_data = NULL; if (!resv) dma_resv_fini(dmabuf->resv); kfree(dmabuf); From 227b55a7a3ccacd83510b176f4d879de1887b2fa Mon Sep 17 00:00:00 2001 From: Pavankumar Kondeti Date: Thu, 8 Dec 2022 16:16:37 +0530 Subject: [PATCH 126/139] ANDROID: dma-buf: don't re-purpose kobject as work_struct The commit 5aec776ef8c9 ("BACKPORT: ANDROID: dma-buf: Move sysfs work out of DMA-BUF export path) re-purposed kobject as work_struct temporarily to create the sysfs entries asynchronously. The author knows what he is doing and rightly added a build assert if kobject struct size is smaller than the work_struct size. We are hitting this build assert on a non-GKI platform where CONFIG_ANDROID_KABI_RESERVE is not set. Fix this problem by allocating a new union with dma_buf_sysfs_entry structure and temporary structure as members. We only end up allocating more memory (because of union) only when kobject size is smaller than work_struct which the original patch any way assumed would never be true. Bug: 261818147 Bug: 262666413 Change-Id: Ifb089bf80d8a3a44ece9f05fc0b99ee76cb11645 Signed-off-by: Pavankumar Kondeti (cherry picked from commit ce18af9b5d7d0baad2ac3eea4c732d2bf128d690) Signed-off-by: T.J. Mercier --- drivers/dma-buf/dma-buf-sysfs-stats.c | 44 +++++++++++++++------------ 1 file changed, 24 insertions(+), 20 deletions(-) diff --git a/drivers/dma-buf/dma-buf-sysfs-stats.c b/drivers/dma-buf/dma-buf-sysfs-stats.c index 46520c4d8ec9..4f3ee92dbe1b 100644 --- a/drivers/dma-buf/dma-buf-sysfs-stats.c +++ b/drivers/dma-buf/dma-buf-sysfs-stats.c @@ -169,15 +169,21 @@ void dma_buf_uninit_sysfs_statistics(void) kset_unregister(dma_buf_stats_kset); } +struct dma_buf_create_sysfs_entry { + struct dma_buf *dmabuf; + struct work_struct work; +}; + +union dma_buf_create_sysfs_work_entry { + struct dma_buf_create_sysfs_entry create_entry; + struct dma_buf_sysfs_entry sysfs_entry; +}; + static void sysfs_add_workfn(struct work_struct *work) { - /* The ABI would have to change for this to be false, but let's be paranoid. 
*/ - _Static_assert(sizeof(struct kobject) >= sizeof(struct work_struct), - "kobject is smaller than work_struct"); - - struct dma_buf_sysfs_entry *sysfs_entry = - container_of((struct kobject *)work, struct dma_buf_sysfs_entry, kobj); - struct dma_buf *dmabuf = sysfs_entry->dmabuf; + struct dma_buf_create_sysfs_entry *create_entry = + container_of(work, struct dma_buf_create_sysfs_entry, work); + struct dma_buf *dmabuf = create_entry->dmabuf; /* * A dmabuf is ref-counted via its file member. If this handler holds the only @@ -188,6 +194,7 @@ static void sysfs_add_workfn(struct work_struct *work) * is released, and that can't happen until the end of this function. */ if (file_count(dmabuf->file) > 1) { + dmabuf->sysfs_entry->dmabuf = dmabuf; /* * kobject_init_and_add expects kobject to be zero-filled, but we have populated it * (the sysfs_add_work union member) to trigger this work function. @@ -212,29 +219,26 @@ static void sysfs_add_workfn(struct work_struct *work) int dma_buf_stats_setup(struct dma_buf *dmabuf, struct file *file) { - struct dma_buf_sysfs_entry *sysfs_entry; - struct work_struct *work; + struct dma_buf_create_sysfs_entry *create_entry; + union dma_buf_create_sysfs_work_entry *work_entry; if (!dmabuf->exp_name) { pr_err("exporter name must not be empty if stats needed\n"); return -EINVAL; } - sysfs_entry = kmalloc(sizeof(struct dma_buf_sysfs_entry), GFP_KERNEL); - if (!sysfs_entry) + work_entry = kmalloc(sizeof(union dma_buf_create_sysfs_work_entry), GFP_KERNEL); + if (!work_entry) return -ENOMEM; - sysfs_entry->dmabuf = dmabuf; - dmabuf->sysfs_entry = sysfs_entry; + dmabuf->sysfs_entry = &work_entry->sysfs_entry; - /* - * The use of kobj as a work_struct is an ugly hack - * to avoid an ABI break in this frozen kernel. - */ - work = (struct work_struct *)&dmabuf->sysfs_entry->kobj; - INIT_WORK(work, sysfs_add_workfn); + create_entry = &work_entry->create_entry; + create_entry->dmabuf = dmabuf; + + INIT_WORK(&create_entry->work, sysfs_add_workfn); get_dma_buf(dmabuf); /* This reference will be dropped in sysfs_add_workfn. */ - schedule_work(work); + schedule_work(&create_entry->work); return 0; } From bc4d82ee40515f0c770b28d0dc4fa532a2b1850e Mon Sep 17 00:00:00 2001 From: Norihiko Hama Date: Fri, 15 Dec 2023 12:04:47 +0900 Subject: [PATCH 127/139] ANDROID: KMI workaround for CONFIG_NETFILTER_FAMILY_BRIDGE Enabling CONFIG_NETFILTER_FAMILY_BRIDGE causes the new element, hooks_bridge[] to be added to netns_nf. Since the KMI is frozen this could not be added. The only instantiation of struct netns_nf is as an embedded field of struct net. So instead of adding the field to struct netns_nf, a new "struct ext_net" is added that contains struct net and the new hooks_bridge[] field. An accessor function, get_nf_hooks_bridge() is added to get a pointer to the new field. There is a global init_net of type struct net which must be special cased since it is not a member of a struct ext_net. All other instances of struct net are allocated via net_alloc() which now allocates a struct ext_net. Since CONFIG_NETFILTER_FAMILY_BRIDGE is a hidden config that is needed for vendor modules, it is enabled via init/Kconfig.gki. 
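As an illustration of the kind of vendor module this unblocks, here is a minimal sketch that registers a bridge-family netfilter hook; the module name, handler, and priority are hypothetical, and only the registration API and constants come from the kernel headers touched here.

#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/netfilter.h>
#include <linux/netfilter_bridge.h>
#include <net/net_namespace.h>

/* Pass every bridged frame through unchanged; a real module would
 * inspect or filter skb here. */
static unsigned int vendor_br_hook(void *priv, struct sk_buff *skb,
				   const struct nf_hook_state *state)
{
	return NF_ACCEPT;
}

static struct nf_hook_ops vendor_br_ops = {
	.hook     = vendor_br_hook,
	.pf       = NFPROTO_BRIDGE,
	.hooknum  = NF_BR_PRE_ROUTING,
	.priority = NF_BR_PRI_FIRST,
};

static int __init vendor_br_init(void)
{
	/* With NETFILTER_FAMILY_BRIDGE enabled, this registration lands in
	 * the hook array reached through get_nf_hooks_bridge(). */
	return nf_register_net_hook(&init_net, &vendor_br_ops);
}

static void __exit vendor_br_exit(void)
{
	nf_unregister_net_hook(&init_net, &vendor_br_ops);
}

module_init(vendor_br_init);
module_exit(vendor_br_exit);
MODULE_LICENSE("GPL");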
Bug: 316040984 Fixes: 0145780bfc78 ("fix KASAN-related kernel crash by KMI W/A for NETFILTER_FAMILY_BRIDGE") Change-Id: I2c7384e3df9b88f12464dc0138986fed12ca626a Signed-off-by: Norihiko Hama --- include/linux/netfilter.h | 2 +- include/net/net_namespace.h | 30 ++++++++++++++++++++++++++++++ include/net/netns/netfilter.h | 3 --- init/Kconfig.gki | 1 + net/bridge/br_input.c | 2 +- net/bridge/br_netfilter_hooks.c | 2 +- net/core/net_namespace.c | 10 +++++++--- net/netfilter/core.c | 12 +++++++++--- net/netfilter/nf_queue.c | 2 +- net/netfilter/nfnetlink_hook.c | 4 ++-- 10 files changed, 53 insertions(+), 15 deletions(-) diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 445494c502ba..49dc95d21f01 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -243,7 +243,7 @@ static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net, break; case NFPROTO_BRIDGE: #ifdef CONFIG_NETFILTER_FAMILY_BRIDGE - hook_head = rcu_dereference(net->nf.hooks_bridge[hook]); + hook_head = rcu_dereference(get_nf_hooks_bridge(net)[hook]); #endif break; default: diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 8c3587d5c308..6641c4543d18 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -188,6 +188,36 @@ struct net { #endif } __randomize_layout; +/* + * To work around a KMI issue, hooks_bridge[] could not be + * added to struct netns_nf. Since the only use of netns_nf + * is embedded in struct net, struct ext_net is added to + * contain struct net plus the new field. Users of the new + * field must use get_nf_hooks_bridge() to access the field. + */ +struct ext_net { + struct net net; +#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE + struct nf_hook_entries __rcu *hooks_bridge[NF_INET_NUMHOOKS]; +#endif + ANDROID_VENDOR_DATA(1); +}; + +#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE +extern struct net init_net; +extern struct nf_hook_entries **init_nf_hooks_bridgep; + +static inline struct nf_hook_entries __rcu **get_nf_hooks_bridge(const struct net *net) +{ + struct ext_net *ext_net; + + if (net == &init_net) + return init_nf_hooks_bridgep; + ext_net = container_of(net, struct ext_net, net); + return ext_net->hooks_bridge; +} +#endif + #include /* Init's network namespace */ diff --git a/include/net/netns/netfilter.h b/include/net/netns/netfilter.h index 3b7eb0cb1201..56c72117b5b3 100644 --- a/include/net/netns/netfilter.h +++ b/include/net/netns/netfilter.h @@ -22,9 +22,6 @@ struct netns_nf { #ifdef CONFIG_NETFILTER_FAMILY_ARP struct nf_hook_entries __rcu *hooks_arp[NF_ARP_NUMHOOKS]; #endif -#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE - struct nf_hook_entries __rcu *hooks_bridge[NF_INET_NUMHOOKS]; -#endif #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4) unsigned int defrag_ipv4_users; #endif diff --git a/init/Kconfig.gki b/init/Kconfig.gki index 081b1cdc9c7e..1a17a3d6e27b 100644 --- a/init/Kconfig.gki +++ b/init/Kconfig.gki @@ -202,6 +202,7 @@ config GKI_HIDDEN_NET_CONFIGS select PAGE_POOL select NET_PTP_CLASSIFY select NET_DEVLINK + select NETFILTER_FAMILY_BRIDGE help Dummy config option used to enable the networking hidden config, required by various SoC platforms. 
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 6bb272894c96..0da15e1f3f72 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -243,7 +243,7 @@ static int nf_hook_bridge_pre(struct sk_buff *skb, struct sk_buff **pskb) goto frame_finish; #endif - e = rcu_dereference(net->nf.hooks_bridge[NF_BR_PRE_ROUTING]); + e = rcu_dereference(get_nf_hooks_bridge(net)[NF_BR_PRE_ROUTING]); if (!e) goto frame_finish; diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 01d690d9fe5f..c7f0aedf2244 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -1016,7 +1016,7 @@ int br_nf_hook_thresh(unsigned int hook, struct net *net, unsigned int i; int ret; - e = rcu_dereference(net->nf.hooks_bridge[hook]); + e = rcu_dereference(get_nf_hooks_bridge(net)[hook]); if (!e) return okfn(net, sk, skb); diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 4c1707d0eb9b..1d0110152862 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -1093,9 +1093,13 @@ void __init net_ns_init(void) struct net_generic *ng; #ifdef CONFIG_NET_NS - net_cachep = kmem_cache_create("net_namespace", sizeof(struct net), - SMP_CACHE_BYTES, - SLAB_PANIC|SLAB_ACCOUNT, NULL); + /* Allocate size for struct ext_net instead of struct net + * to fix a KMI issue when CONFIG_NETFILTER_FAMILY_BRIDGE + * is enabled + */ + net_cachep = kmem_cache_create("net_namespace", sizeof(struct ext_net), + SMP_CACHE_BYTES, + SLAB_PANIC | SLAB_ACCOUNT, NULL); /* Create workqueue for cleanup */ netns_wq = create_singlethread_workqueue("netns"); diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 55a7f72d547c..6c7a44f84b93 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -39,6 +39,12 @@ struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; EXPORT_SYMBOL(nf_hooks_needed); #endif +#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE +struct nf_hook_entries __rcu *init_nf_hooks_bridge[NF_INET_NUMHOOKS]; +struct nf_hook_entries __rcu **init_nf_hooks_bridgep = &init_nf_hooks_bridge[0]; +EXPORT_SYMBOL_GPL(init_nf_hooks_bridgep); +#endif + static DEFINE_MUTEX(nf_hook_mutex); /* max hooks per family/hooknum */ @@ -278,9 +284,9 @@ nf_hook_entry_head(struct net *net, int pf, unsigned int hooknum, #endif #ifdef CONFIG_NETFILTER_FAMILY_BRIDGE case NFPROTO_BRIDGE: - if (WARN_ON_ONCE(ARRAY_SIZE(net->nf.hooks_bridge) <= hooknum)) + if (WARN_ON_ONCE(hooknum >= NF_INET_NUMHOOKS)) return NULL; - return net->nf.hooks_bridge + hooknum; + return get_nf_hooks_bridge(net) + hooknum; #endif #ifdef CONFIG_NETFILTER_INGRESS case NFPROTO_INET: @@ -747,7 +753,7 @@ static int __net_init netfilter_net_init(struct net *net) __netfilter_net_init(net->nf.hooks_arp, ARRAY_SIZE(net->nf.hooks_arp)); #endif #ifdef CONFIG_NETFILTER_FAMILY_BRIDGE - __netfilter_net_init(net->nf.hooks_bridge, ARRAY_SIZE(net->nf.hooks_bridge)); + __netfilter_net_init(get_nf_hooks_bridge(net), NF_INET_NUMHOOKS); #endif #ifdef CONFIG_PROC_FS net->nf.proc_netfilter = proc_net_mkdir(net, "netfilter", diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index 63d1516816b1..566f7794bf58 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -281,7 +281,7 @@ static struct nf_hook_entries *nf_hook_entries_head(const struct net *net, u8 pf switch (pf) { #ifdef CONFIG_NETFILTER_FAMILY_BRIDGE case NFPROTO_BRIDGE: - return rcu_dereference(net->nf.hooks_bridge[hooknum]); + return rcu_dereference(get_nf_hooks_bridge(net)[hooknum]); #endif case NFPROTO_IPV4: return 
rcu_dereference(net->nf.hooks_ipv4[hooknum]); diff --git a/net/netfilter/nfnetlink_hook.c b/net/netfilter/nfnetlink_hook.c index 8120aadf6a0f..3ca3c3a3ba01 100644 --- a/net/netfilter/nfnetlink_hook.c +++ b/net/netfilter/nfnetlink_hook.c @@ -210,9 +210,9 @@ nfnl_hook_entries_head(u8 pf, unsigned int hook, struct net *net, const char *de break; case NFPROTO_BRIDGE: #ifdef CONFIG_NETFILTER_FAMILY_BRIDGE - if (hook >= ARRAY_SIZE(net->nf.hooks_bridge)) + if (hook >= NF_INET_NUMHOOKS) return ERR_PTR(-EINVAL); - hook_head = rcu_dereference(net->nf.hooks_bridge[hook]); + hook_head = rcu_dereference(get_nf_hooks_bridge(net)[hook]); #endif break; #if defined(CONFIG_NETFILTER_INGRESS) || defined(CONFIG_NETFILTER_EGRESS) From febcf1429fa1f4cc476fa9616fe81d3b29059818 Mon Sep 17 00:00:00 2001 From: Aran Dalton Date: Fri, 5 Jan 2024 19:07:02 +0800 Subject: [PATCH 128/139] ANDROID: gki_defconfig: Set CONFIG_IDLE_INJECT and CONFIG_CPU_IDLE_THERMAL into y Under certain circumstances a SoC can reach a critical temperature limit and is unable to stabilize the temperature around a temperature control. The system may ask for a specific power budget, but because of the OPP density, we can only choose an OPP with a power budget lower than the requested one and under-utilize the CPU, thus losing performance. In other words, one OPP under-utilizes the CPU with a power less than the requested power budget and the next OPP exceeds the power budget. CPU idle cooling can solve this problem. Bug: 299411923 Signed-off-by: Aran Dalton Change-Id: I1c17b340617e88be075097dc47f30ce94be2a4d7 --- arch/arm64/configs/gki_defconfig | 2 ++ arch/x86/configs/gki_defconfig | 2 ++ 2 files changed, 4 insertions(+) diff --git a/arch/arm64/configs/gki_defconfig b/arch/arm64/configs/gki_defconfig index 38c90d04264e..009675466150 100644 --- a/arch/arm64/configs/gki_defconfig +++ b/arch/arm64/configs/gki_defconfig @@ -431,6 +431,7 @@ CONFIG_THERMAL_WRITABLE_TRIPS=y CONFIG_THERMAL_GOV_USER_SPACE=y CONFIG_THERMAL_GOV_POWER_ALLOCATOR=y CONFIG_CPU_THERMAL=y +CONFIG_CPU_IDLE_THERMAL=y CONFIG_DEVFREQ_THERMAL=y CONFIG_THERMAL_EMULATION=y CONFIG_WATCHDOG=y @@ -580,6 +581,7 @@ CONFIG_IIO_TRIGGER=y CONFIG_PWM=y CONFIG_GENERIC_PHY=y CONFIG_POWERCAP=y +CONFIG_IDLE_INJECT=y CONFIG_ANDROID_BINDER_IPC=y CONFIG_ANDROID_BINDERFS=y CONFIG_ANDROID_DEBUG_SYMBOLS=y diff --git a/arch/x86/configs/gki_defconfig b/arch/x86/configs/gki_defconfig index 22797e912979..7e2df44033bc 100644 --- a/arch/x86/configs/gki_defconfig +++ b/arch/x86/configs/gki_defconfig @@ -396,6 +396,7 @@ CONFIG_THERMAL_EMERGENCY_POWEROFF_DELAY_MS=100 CONFIG_THERMAL_WRITABLE_TRIPS=y CONFIG_THERMAL_GOV_USER_SPACE=y CONFIG_CPU_THERMAL=y +CONFIG_CPU_IDLE_THERMAL=y CONFIG_DEVFREQ_THERMAL=y CONFIG_THERMAL_EMULATION=y # CONFIG_X86_PKG_TEMP_THERMAL is not set @@ -523,6 +524,7 @@ CONFIG_IIO=y CONFIG_IIO_BUFFER=y CONFIG_IIO_TRIGGER=y CONFIG_POWERCAP=y +CONFIG_IDLE_INJECT=y CONFIG_ANDROID_BINDER_IPC=y CONFIG_ANDROID_BINDERFS=y CONFIG_ANDROID_DEBUG_SYMBOLS=y From 28154afe74965b64bfb305f609cbc4cda7cf7004 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 18 Dec 2023 14:52:14 -0800 Subject: [PATCH 129/139] FROMLIST: scsi: ufs: Simplify power management during async scan ufshcd_init() calls pm_runtime_get_sync() before it calls async_schedule(). ufshcd_async_scan() calls pm_runtime_put_sync() directly or indirectly from ufshcd_add_lus(). Simplify ufshcd_async_scan() by always calling pm_runtime_put_sync() from ufshcd_async_scan().
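To make the reference pairing concrete, here is a minimal sketch of the discipline this patch establishes; the helper names are hypothetical stand-ins for the real ufshcd functions, not the driver code itself.

#include <linux/async.h>
#include <linux/device.h>
#include <linux/pm_runtime.h>

/* Stand-in for ufshcd_add_lus(): probe logical units, may fail. */
static int example_probe_luns(struct device *dev)
{
	return 0;
}

static void example_async_scan(void *data, async_cookie_t cookie)
{
	struct device *dev = data;
	int ret;

	ret = example_probe_luns(dev);
	/* Drop the reference taken before async_schedule(), exactly once,
	 * on both the success and the failure path. */
	pm_runtime_put_sync(dev);
	if (ret)
		dev_err(dev, "%s failed: %d\n", __func__, ret);
}

static int example_init(struct device *dev)
{
	/* Keep the device powered until the async scan has finished. */
	pm_runtime_get_sync(dev);
	async_schedule(example_async_scan, dev);
	return 0;
}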
Cc: stable@vger.kernel.org Change-Id: I4b6ede95360c665594963fff0962742728064fb0 Signed-off-by: Bart Van Assche Bug: 310401362 Link: https://lore.kernel.org/linux-scsi/20231218225229.2542156-2-bvanassche@acm.org/ Signed-off-by: Bart Van Assche --- drivers/ufs/core/ufshcd.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index 94db7033989a..0a86de0feb79 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -8683,7 +8683,6 @@ static int ufshcd_add_lus(struct ufs_hba *hba) ufs_bsg_probe(hba); ufshpb_init(hba); scsi_scan_host(hba->host); - pm_runtime_put_sync(hba->dev); out: return ret; @@ -8916,15 +8915,15 @@ static void ufshcd_async_scan(void *data, async_cookie_t cookie) /* Probe and add UFS logical units */ ret = ufshcd_add_lus(hba); + out: + pm_runtime_put_sync(hba->dev); /* * If we failed to initialize the device or the device is not * present, turn off the power/clocks etc. */ - if (ret) { - pm_runtime_put_sync(hba->dev); + if (ret) ufshcd_hba_exit(hba); - } } static enum scsi_timeout_action ufshcd_eh_timed_out(struct scsi_cmnd *scmd) From 7c91752f5dd9d5c187e9c02282f726a0206e590b Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 18 Dec 2023 14:52:15 -0800 Subject: [PATCH 130/139] FROMLIST: scsi: ufs: Remove the ufshcd_hba_exit() call from ufshcd_async_scan() Calling ufshcd_hba_exit() from a function that is called asynchronously from ufshcd_init() is wrong because this triggers multiple race conditions. Instead of calling ufshcd_hba_exit(), log an error message. Reported-by: Daniel Mentz Closes: https://b.corp.google.com/issues/310401362 Fixes: 1d337ec2f35e ("ufs: improve init sequence") Change-Id: I1c056c2e42889301f69107468f2b3eb38bf3d734 Signed-off-by: Bart Van Assche Bug: 310401362 Link: https://lore.kernel.org/linux-scsi/20231218225229.2542156-3-bvanassche@acm.org/ Signed-off-by: Bart Van Assche --- drivers/ufs/core/ufshcd.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index 0a86de0feb79..0f0cfea31cbf 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -8918,12 +8918,9 @@ static void ufshcd_async_scan(void *data, async_cookie_t cookie) out: pm_runtime_put_sync(hba->dev); - /* - * If we failed to initialize the device or the device is not - * present, turn off the power/clocks etc. - */ + if (ret) - ufshcd_hba_exit(hba); + dev_err(hba->dev, "%s failed: %d\n", __func__, ret); } static enum scsi_timeout_action ufshcd_eh_timed_out(struct scsi_cmnd *scmd) From 0801d8a89de16b3371ac091b123081565d58f53a Mon Sep 17 00:00:00 2001 From: liangjlee Date: Tue, 9 Jan 2024 03:17:54 +0000 Subject: [PATCH 131/139] ANDROID: mm: export dump_tasks symbol. Export dump_tasks to dump per-task memory status when generating a ramdump. Bug: 316372318 Change-Id: Ie0dd1a4c7ada280dc0c7696781b4b9a5e2a100ab Signed-off-by: liangjlee --- mm/oom_kill.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 2c5b854f767b..76a1954071e1 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -420,7 +420,7 @@ static int dump_task(struct task_struct *p, void *arg) * State information includes task's pid, uid, tgid, vm size, rss, * pgtables_bytes, swapents, oom_score_adj value, and name.
*/ -static void dump_tasks(struct oom_control *oc) +void dump_tasks(struct oom_control *oc) { pr_info("Tasks state (memory values in pages):\n"); pr_info("[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name\n"); @@ -436,6 +436,7 @@ static void dump_tasks(struct oom_control *oc) rcu_read_unlock(); } } +EXPORT_SYMBOL_GPL(dump_tasks); static void dump_oom_summary(struct oom_control *oc, struct task_struct *victim) { From a41a4ee3704a02c39e67207df9ecc09a0e018e53 Mon Sep 17 00:00:00 2001 From: liangjlee Date: Wed, 10 Jan 2024 09:00:21 +0000 Subject: [PATCH 132/139] ANDROID: Update the ABI symbol list Adding the following symbols: - dump_tasks Bug: 316372318 Change-Id: Iddaed980a227d8beb966cf0fae24947f5bf8b473 Signed-off-by: liangjlee --- android/abi_gki_aarch64.stg | 15 +++++++++++++++ android/abi_gki_aarch64_pixel | 1 + 2 files changed, 16 insertions(+) diff --git a/android/abi_gki_aarch64.stg b/android/abi_gki_aarch64.stg index dc2644ea5502..573e477d70d3 100644 --- a/android/abi_gki_aarch64.stg +++ b/android/abi_gki_aarch64.stg @@ -298635,6 +298635,11 @@ function { parameter_id: 0x3e10b518 parameter_id: 0x0bb0c019 } +function { + id: 0x1f821b4c + return_type_id: 0x48b5725f + parameter_id: 0x3c692b7e +} function { id: 0x1f835b6f return_type_id: 0x48b5725f @@ -359364,6 +359369,15 @@ elf_symbol { type_id: 0x10985193 full_name: "dump_stack" } +elf_symbol { + id: 0x652fbf96 + name: "dump_tasks" + is_defined: true + symbol_type: FUNCTION + crc: 0x6fe3e49b + type_id: 0x1f821b4c + full_name: "dump_tasks" +} elf_symbol { id: 0xda364c85 name: "dw_handle_msi_irq" @@ -401839,6 +401853,7 @@ interface { symbol_id: 0xe09fd784 symbol_id: 0xded28924 symbol_id: 0xe3421d56 + symbol_id: 0x652fbf96 symbol_id: 0xda364c85 symbol_id: 0x68e0756b symbol_id: 0x12cb063e diff --git a/android/abi_gki_aarch64_pixel b/android/abi_gki_aarch64_pixel index 353946f63fd3..d0f6d7be74ff 100644 --- a/android/abi_gki_aarch64_pixel +++ b/android/abi_gki_aarch64_pixel @@ -813,6 +813,7 @@ drm_writeback_signal_completion dump_backtrace dump_stack + dump_tasks dw_handle_msi_irq dw_pcie_find_capability dw_pcie_host_init From 800cac4b33b120783548aba4f04f6fce05453e27 Mon Sep 17 00:00:00 2001 From: Vinayak Yadawad Date: Wed, 29 Nov 2023 18:20:43 +0530 Subject: [PATCH 133/139] FROMGIT: wifi: nl80211: Extend del pmksa support for SAE and OWE security Current handling of del pmksa with SSID is limited to FILS security. In the current change the del pmksa support is extended to SAE/OWE security offloads as well. For OWE/SAE offloads, the PMK is generated and cached at driver/FW, so user app needs the capability to request cache deletion based on SSID for drivers supporting SAE/OWE offload. 
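For illustration, here is a minimal userspace sketch of the request this enables: deleting a PMKSA entry keyed on the SSID alone, for drivers advertising SAE or OWE offload. It assumes libnl-genl and omits error handling and the ACK wait.

#include <netlink/genl/genl.h>
#include <netlink/genl/ctrl.h>
#include <linux/nl80211.h>
#include <net/if.h>
#include <string.h>

int del_pmksa_by_ssid(const char *ifname, const char *ssid)
{
	struct nl_sock *sk = nl_socket_alloc();
	struct nl_msg *msg = nlmsg_alloc();
	int family;

	genl_connect(sk);
	family = genl_ctrl_resolve(sk, "nl80211");

	genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, family, 0, 0,
		    NL80211_CMD_DEL_PMKSA, 0);
	nla_put_u32(msg, NL80211_ATTR_IFINDEX, if_nametoindex(ifname));
	/* No NL80211_ATTR_MAC and no FILS cache id: the entry is
	 * identified by the SSID, the case added by this patch. */
	nla_put(msg, NL80211_ATTR_SSID, strlen(ssid), ssid);

	nl_send_auto(sk, msg);
	nlmsg_free(msg);
	nl_socket_free(sk);
	return 0;
}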
Signed-off-by: Vinayak Yadawad Link: https://msgid.link/ecdae726459e0944c377a6a6f6cb2c34d2e057d0.1701262123.git.vinayak.yadawad@broadcom.com [drop whitespace-damaged rdev_ops pointer completely, enabling tracing] Signed-off-by: Johannes Berg Bug: 301410304 (cherry picked from commit aa0887c4f18e280f8c2aa6964af602bd16c37f54 https://git.kernel.org/pub/scm/linux/kernel/git/wireless/wireless-next.git main) Change-Id: Ia665b9760279eb77347e79c97d177cba3beaa107 Signed-off-by: Paul Chen --- include/uapi/linux/nl80211.h | 3 +- net/wireless/nl80211.c | 94 +++++++++++++++++++++++++----------- 2 files changed, 69 insertions(+), 28 deletions(-) diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index f3af16ce1f64..50a59769828a 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -567,7 +567,8 @@ * @NL80211_CMD_DEL_PMKSA: Delete a PMKSA cache entry, using %NL80211_ATTR_MAC * (for the BSSID) and %NL80211_ATTR_PMKID or using %NL80211_ATTR_SSID, * %NL80211_ATTR_FILS_CACHE_ID, and %NL80211_ATTR_PMKID in case of FILS - * authentication. + * authentication. Additionally in case of SAE offload and OWE offloads + * PMKSA entry can be deleted using %NL80211_ATTR_SSID. * @NL80211_CMD_FLUSH_PMKSA: Flush all PMKSA cache entries. * * @NL80211_CMD_REG_CHANGE: indicates to userspace the regulatory domain diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index d919eff62ebe..0fdf95420bec 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -12017,16 +12017,18 @@ static int nl80211_wiphy_netns(struct sk_buff *skb, struct genl_info *info) return err; } -static int nl80211_setdel_pmksa(struct sk_buff *skb, struct genl_info *info) +static int nl80211_set_pmksa(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; - int (*rdev_ops)(struct wiphy *wiphy, struct net_device *dev, - struct cfg80211_pmksa *pmksa) = NULL; struct net_device *dev = info->user_ptr[1]; struct cfg80211_pmksa pmksa; + bool ap_pmksa_caching_support = false; memset(&pmksa, 0, sizeof(struct cfg80211_pmksa)); + ap_pmksa_caching_support = wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_AP_PMKSA_CACHING); + if (!info->attrs[NL80211_ATTR_PMKID]) return -EINVAL; @@ -12035,16 +12037,15 @@ static int nl80211_setdel_pmksa(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NL80211_ATTR_MAC]) { pmksa.bssid = nla_data(info->attrs[NL80211_ATTR_MAC]); } else if (info->attrs[NL80211_ATTR_SSID] && - info->attrs[NL80211_ATTR_FILS_CACHE_ID] && - (info->genlhdr->cmd == NL80211_CMD_DEL_PMKSA || - info->attrs[NL80211_ATTR_PMK])) { + info->attrs[NL80211_ATTR_FILS_CACHE_ID] && + info->attrs[NL80211_ATTR_PMK]) { pmksa.ssid = nla_data(info->attrs[NL80211_ATTR_SSID]); pmksa.ssid_len = nla_len(info->attrs[NL80211_ATTR_SSID]); - pmksa.cache_id = - nla_data(info->attrs[NL80211_ATTR_FILS_CACHE_ID]); + pmksa.cache_id = nla_data(info->attrs[NL80211_ATTR_FILS_CACHE_ID]); } else { return -EINVAL; } + if (info->attrs[NL80211_ATTR_PMK]) { pmksa.pmk = nla_data(info->attrs[NL80211_ATTR_PMK]); pmksa.pmk_len = nla_len(info->attrs[NL80211_ATTR_PMK]); @@ -12056,32 +12057,71 @@ static int nl80211_setdel_pmksa(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NL80211_ATTR_PMK_REAUTH_THRESHOLD]) pmksa.pmk_reauth_threshold = - nla_get_u8( - info->attrs[NL80211_ATTR_PMK_REAUTH_THRESHOLD]); + nla_get_u8(info->attrs[NL80211_ATTR_PMK_REAUTH_THRESHOLD]); if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION && dev->ieee80211_ptr->iftype != 
NL80211_IFTYPE_P2P_CLIENT && - !(dev->ieee80211_ptr->iftype == NL80211_IFTYPE_AP && - wiphy_ext_feature_isset(&rdev->wiphy, - NL80211_EXT_FEATURE_AP_PMKSA_CACHING))) + !((dev->ieee80211_ptr->iftype == NL80211_IFTYPE_AP || + dev->ieee80211_ptr->iftype == NL80211_IFTYPE_P2P_GO) && + ap_pmksa_caching_support)) return -EOPNOTSUPP; - switch (info->genlhdr->cmd) { - case NL80211_CMD_SET_PMKSA: - rdev_ops = rdev->ops->set_pmksa; - break; - case NL80211_CMD_DEL_PMKSA: - rdev_ops = rdev->ops->del_pmksa; - break; - default: - WARN_ON(1); - break; + if (!rdev->ops->set_pmksa) + return -EOPNOTSUPP; + + return rdev_set_pmksa(rdev, dev, &pmksa); +} + +static int nl80211_del_pmksa(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct cfg80211_pmksa pmksa; + bool sae_offload_support = false; + bool owe_offload_support = false; + bool ap_pmksa_caching_support = false; + + memset(&pmksa, 0, sizeof(struct cfg80211_pmksa)); + + sae_offload_support = wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_SAE_OFFLOAD); + owe_offload_support = wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_OWE_OFFLOAD); + ap_pmksa_caching_support = wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_AP_PMKSA_CACHING); + + if (info->attrs[NL80211_ATTR_PMKID]) + pmksa.pmkid = nla_data(info->attrs[NL80211_ATTR_PMKID]); + + if (info->attrs[NL80211_ATTR_MAC]) { + pmksa.bssid = nla_data(info->attrs[NL80211_ATTR_MAC]); + } else if (info->attrs[NL80211_ATTR_SSID]) { + /* SSID based pmksa flush suppported only for FILS, + * OWE/SAE OFFLOAD cases + */ + if (info->attrs[NL80211_ATTR_FILS_CACHE_ID] && + info->attrs[NL80211_ATTR_PMK]) { + pmksa.cache_id = nla_data(info->attrs[NL80211_ATTR_FILS_CACHE_ID]); + } else if (!sae_offload_support && !owe_offload_support) { + return -EINVAL; + } + pmksa.ssid = nla_data(info->attrs[NL80211_ATTR_SSID]); + pmksa.ssid_len = nla_len(info->attrs[NL80211_ATTR_SSID]); + } else { + return -EINVAL; } - if (!rdev_ops) + if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION && + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT && + !((dev->ieee80211_ptr->iftype == NL80211_IFTYPE_AP || + dev->ieee80211_ptr->iftype == NL80211_IFTYPE_P2P_GO) && + ap_pmksa_caching_support)) return -EOPNOTSUPP; - return rdev_ops(&rdev->wiphy, dev, &pmksa); + if (!rdev->ops->del_pmksa) + return -EOPNOTSUPP; + + return rdev_del_pmksa(rdev, dev, &pmksa); } static int nl80211_flush_pmksa(struct sk_buff *skb, struct genl_info *info) @@ -16817,7 +16857,7 @@ static const struct genl_small_ops nl80211_small_ops[] = { { .cmd = NL80211_CMD_SET_PMKSA, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = nl80211_setdel_pmksa, + .doit = nl80211_set_pmksa, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_CLEAR_SKB), @@ -16825,7 +16865,7 @@ static const struct genl_small_ops nl80211_small_ops[] = { { .cmd = NL80211_CMD_DEL_PMKSA, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = nl80211_setdel_pmksa, + .doit = nl80211_del_pmksa, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP), }, From 82bf9e7625bebc1aef7271ea8f0c9b256b2a89ce Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Tue, 9 Jan 2024 17:22:33 -0800 Subject: [PATCH 134/139] FROMGIT: BACKPORT: mm/cma: fix placement of trace_cma_alloc_start/finish The current placement of trace_cma_alloc_start/finish misses the fail 
cases: !cma || !cma->count || !cma->bitmap. trace_cma_alloc_finish is also not emitted for the failure case where bitmap_count > bitmap_maxno. Fix these missed cases by moving the start event before the failure checks and moving the finish event to the out label. Link: https://lkml.kernel.org/r/20240110012234.3793639-1-kaleshsingh@google.com Fixes: 7bc1aec5e287 ("mm: cma: add trace events for CMA alloc perf testing") Change-Id: I61153fe078da4f9f3338147f1fbb7697a5554078 Signed-off-by: Kalesh Singh Cc: Minchan Kim Cc: Liam Mark Signed-off-by: Andrew Morton (cherry picked from commit 3b08ab9a811caebe1327f25f51557f95200d94bf https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-unstable) Bug: 315897033 [ Remove ret arg from trace_cma_alloc_finish - Kalesh Singh ] Signed-off-by: Kalesh Singh --- mm/cma.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/cma.c b/mm/cma.c index b64768625d82..6d466c77630f 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -438,6 +438,9 @@ struct page *__cma_alloc(struct cma *cma, unsigned long count, int ret = -ENOMEM; int num_attempts = 0; int max_retries = 5; + const char *name = cma ? cma->name : NULL; + + trace_cma_alloc_start(name, count, align); if (WARN_ON_ONCE((gfp_mask & GFP_KERNEL) == 0 || (gfp_mask & ~(GFP_KERNEL|__GFP_NOWARN|__GFP_NORETRY)) != 0)) @@ -452,8 +455,6 @@ struct page *__cma_alloc(struct cma *cma, unsigned long count, if (!count) goto out; - trace_cma_alloc_start(cma->name, count, align); - mask = cma_bitmap_aligned_mask(cma, align); offset = cma_bitmap_aligned_offset(cma, align); bitmap_maxno = cma_bitmap_maxno(cma); @@ -522,8 +523,6 @@ struct page *__cma_alloc(struct cma *cma, unsigned long count, start = bitmap_no + mask + 1; } - trace_cma_alloc_finish(cma->name, pfn, page, count, align); - /* * CMA can allocate multiple page blocks, which results in different * blocks being marked with different tags. Reset the tags to ignore @@ -542,6 +541,7 @@ struct page *__cma_alloc(struct cma *cma, unsigned long count, pr_debug("%s(): returned %p\n", __func__, page); out: + trace_cma_alloc_finish(name, pfn, page, count, align); if (page) { count_vm_event(CMA_ALLOC_SUCCESS); cma_sysfs_account_success_pages(cma, count); From fd40c1d9017aae068ebbd569f1319588e3f78a34 Mon Sep 17 00:00:00 2001 From: Nikita Ioffe Date: Thu, 11 Jan 2024 10:49:01 +0000 Subject: [PATCH 135/139] ANDROID: add 16k targets for Microdroid kernel Bug: 317201718 Test: tools/bazel run //common:kernel_aarch64_microdroid_16k_dist Change-Id: I542f07d1d0b4f2b6a3c4c58185eee16b2b7f1667 Signed-off-by: Nikita Ioffe --- BUILD.bazel | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/BUILD.bazel b/BUILD.bazel index 6f4d747cdc94..1c1dc8627744 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -198,6 +198,34 @@ copy_to_dist_dir( log = "info", ) +kernel_build( + name = "kernel_aarch64_microdroid_16k", + srcs = ["//common:kernel_aarch64_sources"], + outs = [ + "Image", + "System.map", + "modules.builtin", + "modules.builtin.modinfo", + "vmlinux", + "vmlinux.symvers", + ], + build_config = "build.config.microdroid.aarch64", + make_goals = [ + "Image", + ], + page_size = "16k", +) + +copy_to_dist_dir( + name = "kernel_aarch64_microdroid_16k_dist", + data = [ + ":kernel_aarch64_microdroid_16k", + ], + dist_dir = "out/kernel_aarch64_microdroid_16k/dist", + flat = True, + log = "info", +) + # Microdroid is not a real device. The kernel image is built with special # configs to reduce the size. Hence, not using mixed build. 
kernel_build( From d4db0d5d081dfa2e41a2895c076c33aeb463f21f Mon Sep 17 00:00:00 2001 From: Lianjun Huang Date: Tue, 12 Dec 2023 15:51:36 +0800 Subject: [PATCH 136/139] ANDROID: GKI: add vendor hooks for swapping in ahead Add vendor hooks to capture demand paging during app launch, so we can do it in advance on the next launch. Bug: 315913896 Signed-off-by: Lianjun Huang Signed-off-by: Lianjun Huang Change-Id: I2698fefd347745fb4ff84b111caedbb3bb365ce3 --- drivers/android/vendor_hooks.c | 1 + include/trace/hooks/mm.h | 3 +++ mm/readahead.c | 1 + 3 files changed, 5 insertions(+) diff --git a/drivers/android/vendor_hooks.c b/drivers/android/vendor_hooks.c index ee68032b1918..14bb47fcf8b0 100644 --- a/drivers/android/vendor_hooks.c +++ b/drivers/android/vendor_hooks.c @@ -312,6 +312,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_thermal_unregister); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_get_thermal_zone_device); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_thermal_power_cap); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_enable_thermal_power_throttle); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_read_pages); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_alloc_pages_reclaim_bypass); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_alloc_pages_failure_bypass); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_madvise_pageout_swap_entry); diff --git a/include/trace/hooks/mm.h b/include/trace/hooks/mm.h index 0bd0c34e17b9..50addc57dc10 100644 --- a/include/trace/hooks/mm.h +++ b/include/trace/hooks/mm.h @@ -76,6 +76,9 @@ struct slabinfo; DECLARE_HOOK(android_vh_cache_show, TP_PROTO(struct seq_file *m, struct slabinfo *sinfo, struct kmem_cache *s), TP_ARGS(m, sinfo, s)); +DECLARE_HOOK(android_vh_read_pages, + TP_PROTO(struct readahead_control *ractl), + TP_ARGS(ractl)); DECLARE_HOOK(android_vh_alloc_pages_reclaim_bypass, TP_PROTO(gfp_t gfp_mask, int order, int alloc_flags, int migratetype, struct page **page), diff --git a/mm/readahead.c b/mm/readahead.c index a8620cac2d83..dc5cc73775e7 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -167,6 +167,7 @@ static void read_pages(struct readahead_control *rac) psi_memstall_enter(&rac->_pflags); blk_start_plug(&plug); + trace_android_vh_read_pages(rac); if (aops->readahead) { aops->readahead(rac); /* From a70d3b7bdd7822665db5248f9b556fae267b116c Mon Sep 17 00:00:00 2001 From: Lianjun Huang Date: Mon, 8 Jan 2024 10:59:49 +0800 Subject: [PATCH 137/139] ANDROID: GKI: add symbols of vendor hooks to ABI for swapping in ahead Add symbols of vendor hooks to capture demand paging during app launch, so we can do it in advance on the next launch.
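For context, here is a minimal sketch of how a consumer such as the mi_asap.ko module named in the symbol list below might attach to the new hook; the handler body and naming are hypothetical.

#include <linux/module.h>
#include <linux/pagemap.h>
#include <trace/hooks/mm.h>

/* Record which pages the app demand-faults so they can be prefetched
 * ahead of time on the next launch. */
static void asap_read_pages(void *data, struct readahead_control *ractl)
{
	/* e.g. log (inode, readahead_index(ractl), readahead_count(ractl)) */
}

static int __init asap_init(void)
{
	return register_trace_android_vh_read_pages(asap_read_pages, NULL);
}

static void __exit asap_exit(void)
{
	unregister_trace_android_vh_read_pages(asap_read_pages, NULL);
}

module_init(asap_init);
module_exit(asap_exit);
MODULE_LICENSE("GPL");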
INFO: 1 function symbol(s) added 'int __traceiter_android_vh_read_pages(void*, struct readahead_control*)' 1 variable symbol(s) added 'struct tracepoint __tracepoint_android_vh_read_pages' Bug: 315913896 Signed-off-by: Lianjun Huang Signed-off-by: Lianjun Huang Change-Id: Ibb1e31b6912f7b6b92b76727f7e5043897434def --- android/abi_gki_aarch64.stg | 26 ++++++++++++++++++++++++++ android/abi_gki_aarch64_xiaomi | 4 ++++ 2 files changed, 30 insertions(+) diff --git a/android/abi_gki_aarch64.stg b/android/abi_gki_aarch64.stg index 573e477d70d3..8da5d4a06705 100644 --- a/android/abi_gki_aarch64.stg +++ b/android/abi_gki_aarch64.stg @@ -317298,6 +317298,12 @@ function { parameter_id: 0x064d6086 parameter_id: 0x064d6086 } +function { + id: 0x9b32d0a3 + return_type_id: 0x6720d32f + parameter_id: 0x18bd6530 + parameter_id: 0x275ab027 +} function { id: 0x9b32f2ad return_type_id: 0x6720d32f @@ -337670,6 +337676,15 @@ elf_symbol { type_id: 0x9b3343fb full_name: "__traceiter_android_vh_ra_tuning_max_page" } +elf_symbol { + id: 0xb35da0ec + name: "__traceiter_android_vh_read_pages" + is_defined: true + symbol_type: FUNCTION + crc: 0x4cb21384 + type_id: 0x9b32d0a3 + full_name: "__traceiter_android_vh_read_pages" +} elf_symbol { id: 0x7d069e91 name: "__traceiter_android_vh_record_mutex_lock_starttime" @@ -341675,6 +341690,15 @@ elf_symbol { type_id: 0x18ccbd2c full_name: "__tracepoint_android_vh_ra_tuning_max_page" } +elf_symbol { + id: 0x9fc2933e + name: "__tracepoint_android_vh_read_pages" + is_defined: true + symbol_type: OBJECT + crc: 0xb3878023 + type_id: 0x18ccbd2c + full_name: "__tracepoint_android_vh_read_pages" +} elf_symbol { id: 0x761f292f name: "__tracepoint_android_vh_record_mutex_lock_starttime" @@ -399443,6 +399467,7 @@ interface { symbol_id: 0xf2c39651 symbol_id: 0x93303c51 symbol_id: 0x3a545b61 + symbol_id: 0xb35da0ec symbol_id: 0x7d069e91 symbol_id: 0x0fa39b81 symbol_id: 0x1a91ec8c @@ -399888,6 +399913,7 @@ interface { symbol_id: 0x0e92ee53 symbol_id: 0xb0c197a3 symbol_id: 0x811d5fab + symbol_id: 0x9fc2933e symbol_id: 0x761f292f symbol_id: 0xef7ad117 symbol_id: 0x158c4cfa diff --git a/android/abi_gki_aarch64_xiaomi b/android/abi_gki_aarch64_xiaomi index 1ca73267f242..d502877c9b2c 100644 --- a/android/abi_gki_aarch64_xiaomi +++ b/android/abi_gki_aarch64_xiaomi @@ -341,3 +341,7 @@ #required by zram.ko bioset_init bioset_exit + +#required by mi_asap.ko + __traceiter_android_vh_read_pages + __tracepoint_android_vh_read_pages From 66cd99ccdbea66677398e65065a275d0bdc43482 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Thu, 2 Feb 2023 13:53:29 -0800 Subject: [PATCH 138/139] BACKPORT: UPSTREAM: phy: qcom-qmp: Introduce Kconfig symbols for discrete drivers Introduce a config option for each QMP PHY driver now that the QMP PHY mega-driver has been split up into different modules. This allows kernel configurators to limit the binary size of the kernel by only compiling in the QMP PHY driver that they need. Leave the old config QCOM_QMP in place and make it into a menuconfig so that 'make olddefconfig' continues to work. Furthermore, set the default of the new Kconfig symbols to be QCOM_QMP so that the transition is smooth. 
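For example, a product that only needs the UFS PHY could now trim its configuration to a fragment like the following (hypothetical; the symbols are the ones introduced by this patch):

CONFIG_PHY_QCOM_QMP=y
CONFIG_PHY_QCOM_QMP_UFS=y
# CONFIG_PHY_QCOM_QMP_COMBO is not set
# CONFIG_PHY_QCOM_QMP_PCIE is not set
# CONFIG_PHY_QCOM_QMP_PCIE_8996 is not set
# CONFIG_PHY_QCOM_QMP_USB is not set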
Reviewed-by: Dmitry Baryshkov Reviewed-by: Johan Hovold Signed-off-by: Stephen Boyd Link: https://lore.kernel.org/all/20230202215330.2152726-1-swboyd@chromium.org/ Bug: 319064658 Change-Id: I633e6e1bbc3e79292bfde927e46f84219f0178ae (cherry picked from commit d1abd69534bec16c43c633313e8e937af1354a7a) [quic_kuruva: Resolved minor conflict in drivers/phy/qualcomm/Kconfig ] Signed-off-by: Rajashekar kuruva --- drivers/phy/qualcomm/Kconfig | 50 ++++++++++++++++++++++++++++++++--- drivers/phy/qualcomm/Makefile | 12 ++++----- 2 files changed, 52 insertions(+), 10 deletions(-) diff --git a/drivers/phy/qualcomm/Kconfig b/drivers/phy/qualcomm/Kconfig index 5c98850f5a36..1d3a8062e651 100644 --- a/drivers/phy/qualcomm/Kconfig +++ b/drivers/phy/qualcomm/Kconfig @@ -50,13 +50,55 @@ config PHY_QCOM_PCIE2 Enable this to support the Qualcomm PCIe PHY, used with the Synopsys based PCIe controller. -config PHY_QCOM_QMP - tristate "Qualcomm QMP PHY Driver" +menuconfig PHY_QCOM_QMP + tristate "Qualcomm QMP PHY Drivers" depends on OF && COMMON_CLK && (ARCH_QCOM || COMPILE_TEST) + +if PHY_QCOM_QMP + +config PHY_QCOM_QMP_COMBO + tristate "Qualcomm QMP Combo PHY Driver" + default PHY_QCOM_QMP select GENERIC_PHY help - Enable this to support the QMP PHY transceiver that is used - with controllers such as PCIe, UFS, and USB on Qualcomm chips. + Enable this to support the QMP Combo PHY transceiver that is used + with USB3 and DisplayPort controllers on Qualcomm chips. + +config PHY_QCOM_QMP_PCIE + tristate "Qualcomm QMP PCIe PHY Driver" + depends on PCI || COMPILE_TEST + select GENERIC_PHY + default PHY_QCOM_QMP + help + Enable this to support the QMP PCIe PHY transceiver that is used + with PCIe controllers on Qualcomm chips. + +config PHY_QCOM_QMP_PCIE_8996 + tristate "Qualcomm QMP PCIe 8996 PHY Driver" + depends on PCI || COMPILE_TEST + select GENERIC_PHY + default PHY_QCOM_QMP + help + Enable this to support the QMP PCIe PHY transceiver that is used + with PCIe controllers on Qualcomm msm8996 chips. + +config PHY_QCOM_QMP_UFS + tristate "Qualcomm QMP UFS PHY Driver" + select GENERIC_PHY + default PHY_QCOM_QMP + help + Enable this to support the QMP UFS PHY transceiver that is used + with UFS controllers on Qualcomm chips. + +config PHY_QCOM_QMP_USB + tristate "Qualcomm QMP USB PHY Driver" + select GENERIC_PHY + default PHY_QCOM_QMP + help + Enable this to support the QMP USB PHY transceiver that is used + with USB3 controllers on Qualcomm chips. 
+ +endif # PHY_QCOM_QMP config PHY_QCOM_QUSB2 tristate "Qualcomm QUSB2 PHY Driver" diff --git a/drivers/phy/qualcomm/Makefile b/drivers/phy/qualcomm/Makefile index 65f6c30a3e93..79dd4e507961 100644 --- a/drivers/phy/qualcomm/Makefile +++ b/drivers/phy/qualcomm/Makefile @@ -5,12 +5,12 @@ obj-$(CONFIG_PHY_QCOM_EDP) += phy-qcom-edp.o obj-$(CONFIG_PHY_QCOM_IPQ4019_USB) += phy-qcom-ipq4019-usb.o obj-$(CONFIG_PHY_QCOM_IPQ806X_SATA) += phy-qcom-ipq806x-sata.o obj-$(CONFIG_PHY_QCOM_PCIE2) += phy-qcom-pcie2.o -obj-$(CONFIG_PHY_QCOM_QMP) += \ - phy-qcom-qmp-combo.o \ - phy-qcom-qmp-pcie.o \ - phy-qcom-qmp-pcie-msm8996.o \ - phy-qcom-qmp-ufs.o \ - phy-qcom-qmp-usb.o + +obj-$(CONFIG_PHY_QCOM_QMP_COMBO) += phy-qcom-qmp-combo.o +obj-$(CONFIG_PHY_QCOM_QMP_PCIE) += phy-qcom-qmp-pcie.o +obj-$(CONFIG_PHY_QCOM_QMP_PCIE_8996) += phy-qcom-qmp-pcie-msm8996.o +obj-$(CONFIG_PHY_QCOM_QMP_UFS) += phy-qcom-qmp-ufs.o +obj-$(CONFIG_PHY_QCOM_QMP_USB) += phy-qcom-qmp-usb.o obj-$(CONFIG_PHY_QCOM_QUSB2) += phy-qcom-qusb2.o obj-$(CONFIG_PHY_QCOM_USB_HS) += phy-qcom-usb-hs.o From df1cdb0a703318d8804872fa93e30208ad14398c Mon Sep 17 00:00:00 2001 From: Qian-Hao Huang Date: Mon, 15 Jan 2024 19:58:07 +0800 Subject: [PATCH 139/139] ANDROID: Update the pixel symbol list These symbols are needed as part of an upgrade to v6.1: - add_uevent_var - aes_encrypt - aes_expandkey - alloc_skb_with_frags - cpufreq_quick_get_max - cpuidle_governor_latency_req - cpu_topology - crypto_shash_final - datagram_poll - debugfs_create_blob - dev_pm_qos_add_notifier - dev_pm_qos_add_request - dev_pm_qos_remove_notifier - dev_pm_qos_remove_request - dma_direct_alloc - dma_direct_free - dma_get_sgtable_attrs - firmware_request_nowarn - idr_alloc_cyclic - in_egroup_p - init_user_ns - iov_iter_revert - __ipv6_addr_type - kernel_bind - kernel_connect - kernel_getsockname - kernel_recvmsg - kernel_sendmsg - kmem_cache_create_usercopy - ksize - lock_sock_nested - mempool_alloc - mempool_alloc_slab - mempool_create - mempool_destroy - mempool_free - mempool_free_slab - napi_gro_flush - netif_tx_lock - netif_tx_unlock - ns_capable_noaudit - param_get_string - param_set_copystring - param_set_int - pci_disable_msi - pcie_capability_read_word - pci_iomap - pci_iounmap - pci_irq_vector - pci_release_region - pci_request_region - pm_system_wakeup - proto_register - proto_unregister - radix_tree_iter_delete - radix_tree_next_chunk - _raw_read_lock_irq - _raw_read_unlock_irq - _raw_write_lock_bh - _raw_write_unlock_bh - refcount_dec_not_one - register_netevent_notifier - regulator_set_load - release_sock - seq_vprintf - sk_alloc - skb_coalesce_rx_frag - skb_copy_datagram_iter - skb_free_datagram - __skb_pad - skb_recv_datagram - skb_set_owner_w - skb_store_bits - sk_free - sock_alloc_send_pskb - sock_create_kern - sock_gettstamp - sock_init_data - sock_no_accept - sock_no_listen - sock_no_mmap - sock_no_sendpage - sock_no_shutdown - sock_no_socketpair - sock_queue_rcv_skb_reason - sock_register - sock_setsockopt - sock_unregister - strchrnul - unregister_netevent_notifier - vscnprintf - wait_for_completion_killable - wireless_send_event - __xa_insert - xa_store Bug: 303533633 Bug: 308924989 Change-Id: Ifbc09d5025f1bd3416f136fabd344ef2452390a8 Signed-off-by: Qian-Hao Huang --- android/abi_gki_aarch64.stg | 30 +++++++++++ android/abi_gki_aarch64_pixel | 95 +++++++++++++++++++++++++++++++++++ 2 files changed, 125 insertions(+) diff --git a/android/abi_gki_aarch64.stg b/android/abi_gki_aarch64.stg index 8da5d4a06705..a7a7e77c8c92 100644 --- 
a/android/abi_gki_aarch64.stg +++ b/android/abi_gki_aarch64.stg @@ -354353,6 +354353,24 @@ elf_symbol { type_id: 0x1023f4f6 full_name: "dma_contiguous_default_area" } +elf_symbol { + id: 0x279bd3a7 + name: "dma_direct_alloc" + is_defined: true + symbol_type: FUNCTION + crc: 0xb02b3af0 + type_id: 0x55df36a2 + full_name: "dma_direct_alloc" +} +elf_symbol { + id: 0x0e847130 + name: "dma_direct_free" + is_defined: true + symbol_type: FUNCTION + crc: 0x051debf6 + type_id: 0x13db1955 + full_name: "dma_direct_free" +} elf_symbol { id: 0xd13969dd name: "dma_fence_add_callback" @@ -372911,6 +372929,15 @@ elf_symbol { type_id: 0xfcd23386 full_name: "ns_capable" } +elf_symbol { + id: 0x27a870d1 + name: "ns_capable_noaudit" + is_defined: true + symbol_type: FUNCTION + crc: 0x3c75b0e0 + type_id: 0xfcd23386 + full_name: "ns_capable_noaudit" +} elf_symbol { id: 0xf68f8b33 name: "ns_to_kernel_old_timeval" @@ -401319,6 +401346,8 @@ interface { symbol_id: 0x710f1fc2 symbol_id: 0xaa54a71f symbol_id: 0x5f554bc7 + symbol_id: 0x279bd3a7 + symbol_id: 0x0e847130 symbol_id: 0xd13969dd symbol_id: 0xe2ee283f symbol_id: 0xf5808a3e @@ -403381,6 +403410,7 @@ interface { symbol_id: 0xea37502b symbol_id: 0x0bb7f730 symbol_id: 0xb65e3baf + symbol_id: 0x27a870d1 symbol_id: 0xf68f8b33 symbol_id: 0xfab02ca8 symbol_id: 0xd7668767 diff --git a/android/abi_gki_aarch64_pixel b/android/abi_gki_aarch64_pixel index d0f6d7be74ff..fac865d11e03 100644 --- a/android/abi_gki_aarch64_pixel +++ b/android/abi_gki_aarch64_pixel @@ -3,8 +3,11 @@ add_cpu add_timer add_timer_on + add_uevent_var add_wait_queue adjust_managed_page_count + aes_encrypt + aes_expandkey alarm_cancel alarm_init alarm_start_relative @@ -19,6 +22,7 @@ __alloc_percpu __alloc_percpu_gfp __alloc_skb + alloc_skb_with_frags alloc_workqueue alt_cb_patch_nops amba_bustype @@ -243,6 +247,7 @@ cpufreq_get_policy cpufreq_policy_transition_delay_us cpufreq_quick_get + cpufreq_quick_get_max cpufreq_register_driver cpufreq_register_governor cpufreq_register_notifier @@ -261,6 +266,7 @@ cpu_hwcaps cpuidle_driver_state_disabled cpuidle_get_driver + cpuidle_governor_latency_req cpu_latency_qos_add_request cpu_latency_qos_remove_request cpu_latency_qos_update_request @@ -276,6 +282,7 @@ cpus_read_lock cpus_read_unlock cpu_subsys + cpu_topology crc32_be crc32_le crc8 @@ -298,6 +305,7 @@ crypto_register_shash crypto_req_done crypto_shash_digest + crypto_shash_final crypto_shash_finup crypto_shash_setkey crypto_shash_update @@ -311,10 +319,12 @@ csum_partial csum_tcpudp_nofold _ctype + datagram_poll deactivate_task debugfs_attr_read debugfs_attr_write debugfs_create_atomic_t + debugfs_create_blob debugfs_create_bool debugfs_create_devm_seqfile debugfs_create_dir @@ -499,7 +509,11 @@ dev_pm_opp_of_remove_table dev_pm_opp_put dev_pm_opp_set_config + dev_pm_qos_add_notifier + dev_pm_qos_add_request dev_pm_qos_read_value + dev_pm_qos_remove_notifier + dev_pm_qos_remove_request dev_pm_qos_update_request _dev_printk dev_printk_emit @@ -540,6 +554,8 @@ dma_buf_unmap_attachment dma_buf_vmap dma_buf_vunmap + dma_direct_alloc + dma_direct_free dmaengine_unmap_put dma_fence_add_callback dma_fence_array_create @@ -559,6 +575,7 @@ dma_fence_wait_timeout dma_free_attrs dma_free_pages + dma_get_sgtable_attrs dma_get_slave_caps dma_get_slave_channel dma_heap_add @@ -865,6 +882,7 @@ find_task_by_vpid find_vma_intersection finish_wait + firmware_request_nowarn flush_dcache_page flush_delayed_work flush_work @@ -1037,6 +1055,7 @@ ida_destroy ida_free idr_alloc + idr_alloc_cyclic idr_destroy idr_find 
idr_for_each @@ -1056,6 +1075,7 @@ in6_pton in_aton inc_zone_page_state + in_egroup_p inet_csk_get_port init_dummy_netdev init_iova_domain @@ -1065,6 +1085,7 @@ __init_swait_queue_head init_task init_timer_key + init_user_ns init_uts_ns init_wait_entry __init_waitqueue_head @@ -1132,8 +1153,10 @@ io_schedule_timeout iounmap iova_domain_init_rcaches + iov_iter_revert ip_compute_csum ip_send_check + __ipv6_addr_type __irq_alloc_descs __irq_apply_affinity_hint irq_create_mapping_affinity @@ -1165,11 +1188,16 @@ jiffies_to_usecs kasan_flag_enabled kasprintf + kernel_bind + kernel_connect kernel_cpustat + kernel_getsockname kernel_kobj kernel_param_lock kernel_param_unlock + kernel_recvmsg kernel_restart + kernel_sendmsg kernfs_find_and_get_ns kernfs_notify kernfs_path_from_node @@ -1196,6 +1224,7 @@ kmalloc_trace kmem_cache_alloc kmem_cache_create + kmem_cache_create_usercopy kmem_cache_destroy kmem_cache_free kmemdup @@ -1210,6 +1239,7 @@ kobject_uevent_env kobj_sysfs_ops krealloc + ksize kstat kstrdup kstrndup @@ -1274,6 +1304,7 @@ __list_del_entry_valid list_sort __local_bh_enable_ip + lock_sock_nested log_abnormal_wakeup_reason log_post_read_mmio log_post_write_mmio @@ -1299,6 +1330,12 @@ memdup_user_nul memmove memparse + mempool_alloc + mempool_alloc_slab + mempool_create + mempool_destroy + mempool_free + mempool_free_slab memremap mem_section memset @@ -1350,6 +1387,7 @@ napi_complete_done napi_disable napi_enable + napi_gro_flush napi_gro_receive __napi_schedule napi_schedule_prep @@ -1366,7 +1404,9 @@ netif_receive_skb netif_receive_skb_list netif_rx + netif_tx_lock netif_tx_stop_all_queues + netif_tx_unlock netif_tx_wake_queue netlink_broadcast __netlink_kernel_create @@ -1393,6 +1433,7 @@ nr_cpu_ids nr_irqs ns_capable + ns_capable_noaudit nsec_to_clock_t ns_to_timespec64 __num_online_cpus @@ -1494,6 +1535,7 @@ panic_notifier_list param_array_ops param_get_int + param_get_string param_ops_bool param_ops_byte param_ops_charp @@ -1502,10 +1544,14 @@ param_ops_string param_ops_uint param_ops_ulong + param_set_copystring + param_set_int pci_alloc_irq_vectors_affinity pci_assign_resource pci_clear_master pci_disable_device + pci_disable_msi + pcie_capability_read_word pci_enable_device pci_enable_wake pci_find_bus @@ -1513,6 +1559,9 @@ pci_find_ext_capability pci_free_irq_vectors pci_get_device + pci_iomap + pci_iounmap + pci_irq_vector pci_load_and_free_saved_state pci_load_saved_state pci_msi_mask_irq @@ -1520,7 +1569,9 @@ pci_read_config_dword pci_read_config_word __pci_register_driver + pci_release_region pci_release_regions + pci_request_region pci_rescan_bus pci_restore_msi_state pci_restore_state @@ -1618,6 +1669,7 @@ __pm_runtime_use_autosuspend __pm_stay_awake pm_stay_awake + pm_system_wakeup pm_wakeup_dev_event pm_wakeup_ws_event power_supply_changed @@ -1652,6 +1704,8 @@ proc_remove proc_set_size proc_symlink + proto_register + proto_unregister pskb_expand_head __pskb_pull_tail ___pskb_trim @@ -1672,7 +1726,9 @@ radix_tree_delete_item radix_tree_gang_lookup radix_tree_insert + radix_tree_iter_delete radix_tree_lookup + radix_tree_next_chunk radix_tree_preload ___ratelimit raw_notifier_call_chain @@ -1680,9 +1736,11 @@ raw_notifier_chain_unregister _raw_read_lock _raw_read_lock_bh + _raw_read_lock_irq _raw_read_lock_irqsave _raw_read_unlock _raw_read_unlock_bh + _raw_read_unlock_irq _raw_read_unlock_irqrestore _raw_spin_lock _raw_spin_lock_bh @@ -1696,9 +1754,11 @@ _raw_spin_unlock_irq _raw_spin_unlock_irqrestore _raw_write_lock + _raw_write_lock_bh _raw_write_lock_irq 
_raw_write_lock_irqsave _raw_write_unlock + _raw_write_unlock_bh _raw_write_unlock_irq _raw_write_unlock_irqrestore rb_erase @@ -1713,6 +1773,7 @@ rdev_get_drvdata rdev_get_id reboot_mode + refcount_dec_not_one refcount_warn_saturate __refrigerator regcache_cache_only @@ -1730,6 +1791,7 @@ register_netdev register_netdevice register_netdevice_notifier + register_netevent_notifier register_oom_notifier register_pernet_device register_pernet_subsys @@ -1772,11 +1834,13 @@ regulator_notifier_call_chain regulator_put regulator_set_active_discharge_regmap + regulator_set_load regulator_set_voltage regulator_set_voltage_sel_regmap regulator_unregister release_firmware __release_region + release_sock remap_pfn_range remap_vmalloc_range remove_cpu @@ -1877,6 +1941,7 @@ seq_read seq_release seq_release_private + seq_vprintf seq_write set_capacity set_capacity_and_notify @@ -1917,20 +1982,25 @@ single_open single_open_size single_release + sk_alloc skb_add_rx_frag skb_checksum skb_checksum_help skb_clone skb_clone_sk + skb_coalesce_rx_frag skb_complete_wifi_ack skb_copy skb_copy_bits + skb_copy_datagram_iter skb_copy_expand skb_dequeue skb_dequeue_tail skb_ensure_writable + skb_free_datagram __skb_get_hash __skb_gso_segment + __skb_pad skb_pull skb_push skb_put @@ -1938,7 +2008,11 @@ skb_queue_purge skb_queue_tail skb_realloc_headroom + skb_recv_datagram + skb_set_owner_w + skb_store_bits skb_trim + sk_free skip_spaces smp_call_function smp_call_function_single @@ -2015,8 +2089,22 @@ snd_soc_unregister_component snprintf soc_device_register + sock_alloc_send_pskb __sock_create + sock_create_kern + sock_gettstamp + sock_init_data + sock_no_accept + sock_no_listen + sock_no_mmap + sock_no_sendpage + sock_no_shutdown + sock_no_socketpair + sock_queue_rcv_skb_reason + sock_register sock_release + sock_setsockopt + sock_unregister sock_wfree softnet_data sort @@ -2054,6 +2142,7 @@ strcasecmp strcat strchr + strchrnul strcmp strcpy strcspn @@ -2456,6 +2545,7 @@ unregister_netdevice_many unregister_netdevice_notifier unregister_netdevice_queue + unregister_netevent_notifier unregister_oom_notifier unregister_pernet_device unregister_pernet_subsys @@ -2604,6 +2694,7 @@ vring_del_virtqueue vring_interrupt vring_new_virtqueue + vscnprintf vsnprintf vunmap vzalloc @@ -2611,6 +2702,7 @@ wait_for_completion wait_for_completion_interruptible wait_for_completion_interruptible_timeout + wait_for_completion_killable wait_for_completion_timeout wait_woken __wake_up @@ -2628,6 +2720,7 @@ watchdog_set_restart_priority watchdog_unregister_device wireless_nlevent_flush + wireless_send_event woken_wake_function work_busy __write_overflow_field @@ -2639,11 +2732,13 @@ xa_find xa_find_after xa_get_mark + __xa_insert xa_load xa_set_mark xas_find xas_pause __xa_store + xa_store __xfrm_state_destroy xfrm_state_lookup_byspi xfrm_stateonly_find