From ddf142e5a817e3a260e5dedce0cd29db6fbaa010 Mon Sep 17 00:00:00 2001
From: Jong eon Park
Date: Wed, 29 Nov 2023 16:50:44 +0900
Subject: [PATCH 001/139] ANDROID: netlink: add netlink poll and hooks

On systems that generate huge numbers of uevents, especially for user
apps with a small rcvbuf socket size, it has been reported that netlink
overruns happen quite frequently. Moreover, if a user app has no handler
for the POLLERR caused by such a netlink overrun, the system can become
almost stuck calling 'poll' repeatedly.

Regarding this issue, I sent a kernel netlink patch to the Linux
maintainers and was told that this is strictly a user-app problem and
must not be addressed in the kernel. Until the Android team looks into
this issue and a fix comes out, we need a temporary kernel patch.

To minimize the effect of this patch on others who have never hit this
issue, I would like to just add netlink's dedicated poll and its hooks.

Please refer to the v1/v2 patch links below for the history.
v1: https://lore.kernel.org/netdev/20231110110002.7279f895@kernel.org/T/#t
v2: https://lore.kernel.org/netdev/d599922fd89b3e61c7cf531a03ea8b81cbcb003e.camel@redhat.com/T/#t

Bug: 300009377
Link: https://lore.kernel.org/netdev/d599922fd89b3e61c7cf531a03ea8b81cbcb003e.camel@redhat.com/T/#t
Change-Id: I4f11399d61c10332ba05bac64cfa1e92bb111565
Signed-off-by: Jong eon Park
---
 drivers/android/vendor_hooks.c |  1 +
 include/trace/hooks/net.h      |  7 +++++++
 net/netlink/af_netlink.c       | 14 ++++++++++++--
 3 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/drivers/android/vendor_hooks.c b/drivers/android/vendor_hooks.c
index 8425e8709b41..789fa7beea83 100644
--- a/drivers/android/vendor_hooks.c
+++ b/drivers/android/vendor_hooks.c
@@ -364,3 +364,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mmc_blk_mq_rw_recovery);
 EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_sd_update_bus_speed_mode);
 EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_slab_folio_alloced);
 EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_kmalloc_large_alloced);
+EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_netlink_poll);
diff --git a/include/trace/hooks/net.h b/include/trace/hooks/net.h
index 50988f672216..835943c31f3d 100644
--- a/include/trace/hooks/net.h
+++ b/include/trace/hooks/net.h
@@ -25,6 +25,13 @@ DECLARE_RESTRICTED_HOOK(android_rvh_sk_alloc,
 DECLARE_RESTRICTED_HOOK(android_rvh_sk_free,
 	TP_PROTO(struct sock *sock),
 	TP_ARGS(sock), 1);
+struct poll_table_struct;
+typedef struct poll_table_struct poll_table;
+DECLARE_HOOK(android_vh_netlink_poll,
+	TP_PROTO(struct file *file, struct socket *sock, poll_table *wait,
+		__poll_t *mask),
+	TP_ARGS(file, sock, wait, mask));
+
 /* macro versions of hooks are no longer required */
 
 #endif /* _TRACE_HOOK_NET_VH_H */
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index cb833302270a..5b328a82ea70 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -71,7 +71,8 @@
 #include
 #define CREATE_TRACE_POINTS
 #include
-
+#undef CREATE_TRACE_POINTS
+#include
 #include "af_netlink.h"
 
 struct listeners {
@@ -1966,6 +1967,15 @@ out:
 	return err ? 
: copied; } +static __poll_t netlink_poll(struct file *file, struct socket *sock, + poll_table *wait) +{ + __poll_t mask = datagram_poll(file, sock, wait); + + trace_android_vh_netlink_poll(file, sock, wait, &mask); + return mask; +} + static void netlink_data_ready(struct sock *sk) { BUG(); @@ -2766,7 +2776,7 @@ static const struct proto_ops netlink_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = netlink_getname, - .poll = datagram_poll, + .poll = netlink_poll, .ioctl = netlink_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, From 2d8a5ddebb22fd1dab4c2fe3a7ba978be62423a3 Mon Sep 17 00:00:00 2001 From: Richard Chang Date: Thu, 7 Dec 2023 07:31:31 +0000 Subject: [PATCH 002/139] ANDROID: Update the ABI symbol list Adding the following symbol: - __cma_alloc Bug: 308881290 Change-Id: I5b3ffb0c804dc636355c1462aaa6e96b1189446b Signed-off-by: Richard Chang --- android/abi_gki_aarch64.stg | 18 ++++++++++++++++++ android/abi_gki_aarch64_pixel | 1 + 2 files changed, 19 insertions(+) diff --git a/android/abi_gki_aarch64.stg b/android/abi_gki_aarch64.stg index 2a708ffe2af0..b6bcc62d2c90 100644 --- a/android/abi_gki_aarch64.stg +++ b/android/abi_gki_aarch64.stg @@ -327602,6 +327602,14 @@ function { parameter_id: 0x4585663f parameter_id: 0x6d7f5ff6 } +function { + id: 0xb94f7fed + return_type_id: 0x06835e9c + parameter_id: 0x1023f4f6 + parameter_id: 0x33756485 + parameter_id: 0x4585663f + parameter_id: 0xf1a6dfed +} function { id: 0xb957d705 return_type_id: 0x6720d32f @@ -331936,6 +331944,15 @@ elf_symbol { type_id: 0x9b8e2bf2 full_name: "__clocksource_register_scale" } +elf_symbol { + id: 0xc7d06fb9 + name: "__cma_alloc" + is_defined: true + symbol_type: FUNCTION + crc: 0x5dbf1ea9 + type_id: 0xb94f7fed + full_name: "__cma_alloc" +} elf_symbol { id: 0xac1ff1ce name: "__const_udelay" @@ -397960,6 +397977,7 @@ interface { symbol_id: 0x6a30419a symbol_id: 0x021741b4 symbol_id: 0x9339caba + symbol_id: 0xc7d06fb9 symbol_id: 0xac1ff1ce symbol_id: 0xba429af2 symbol_id: 0xe495eb53 diff --git a/android/abi_gki_aarch64_pixel b/android/abi_gki_aarch64_pixel index 8da36c314c22..d19d3f7e39aa 100644 --- a/android/abi_gki_aarch64_pixel +++ b/android/abi_gki_aarch64_pixel @@ -188,6 +188,7 @@ clockevents_config_and_register clocks_calc_mult_shift __clocksource_register_scale + __cma_alloc cma_alloc cma_for_each_area cma_get_name From a9567a35d0b87f17387ee2a86f6092aa6c1c85d0 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Mon, 11 Dec 2023 19:53:17 +0000 Subject: [PATCH 003/139] ANDROID: arm64: Disable workaround for CPU errata 2441007 and 2441009 CPU errata 2441007 (Cortex-A55) and 2441009 (Cortex-A510) are categorised as "rare" by Arm and consequently the workaround is not intended to be deployed in practice as the issue is not expected to occur in real-world environments. Given that the cost of the workaround, which issues additional broadcast TLB invalidation requests, has been shown to impact kswapd significantly on Pixel devices, disable the workaround following Arm's recommendation. 
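Note that only the "default y" lines are removed below; both options remain
selectable, so a platform that still wants the mitigation can opt back in
explicitly. An illustrative defconfig fragment (assumed usage, not part of
this patch):

    CONFIG_ARM64_ERRATUM_2441007=y
    CONFIG_ARM64_ERRATUM_2441009=y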
Bug: 306231846
Signed-off-by: Will Deacon
Change-Id: I39b6d9736cfa79827321151b45774f62c8d1a747
(cherry picked from commit 4ba6c3197cb6f0e11cb8af10bb0924ba9d73c110)
---
 arch/arm64/Kconfig | 2 --
 1 file changed, 2 deletions(-)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 7dafeacab872..ce95f67faa64 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -642,7 +642,6 @@ config ARM64_WORKAROUND_REPEAT_TLBI
 
 config ARM64_ERRATUM_2441007
 	bool "Cortex-A55: Completion of affected memory accesses might not be guaranteed by completion of a TLBI"
-	default y
 	select ARM64_WORKAROUND_REPEAT_TLBI
 	help
 	  This option adds a workaround for ARM Cortex-A55 erratum #2441007.
@@ -881,7 +880,6 @@ config ARM64_ERRATUM_2224489
 
 config ARM64_ERRATUM_2441009
 	bool "Cortex-A510: Completion of affected memory accesses might not be guaranteed by completion of a TLBI"
-	default y
 	select ARM64_WORKAROUND_REPEAT_TLBI
 	help
 	  This option adds a workaround for ARM Cortex-A510 erratum #2441009.

From 071c14698cf57357fba712330849a1515a993960 Mon Sep 17 00:00:00 2001
From: Guan-Yu Lin
Date: Thu, 16 Nov 2023 16:32:16 +0800
Subject: [PATCH 004/139] FROMGIT: usb: typec: tcpm: skip checking
 port->send_discover in PD3.0

The original Collision Avoidance mechanism, port->send_discover, avoids
the conflict when port partners start AMS at almost the same time.
However, this mechanism is replaced by SINK_TX_OK and SINK_TX_NG. Skip
the check in PD3.0 to avoid the deadlock when the source is requesting
DR_SWAP while the sink is requesting DISCOVER_IDENTITY.

Signed-off-by: Guan-Yu Lin
Reviewed-by: Heikki Krogerus
Reviewed-by: Guenter Roeck
Link: https://lore.kernel.org/r/20231116083221.1201892-1-guanyulin@google.com
Signed-off-by: Greg Kroah-Hartman
Bug: 292178486
(cherry picked from commit e0cc05d52ad310cced029449bcda0f9fc847097c
 https://git.kernel.org/pub/scm/linux/kernel/git/gregkh/usb.git/ usb-next)
Change-Id: I4691628d8085dfa7be9189b2bd598896664c38b5
Signed-off-by: Guan-Yu Lin
---
 drivers/usb/typec/tcpm/tcpm.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/usb/typec/tcpm/tcpm.c b/drivers/usb/typec/tcpm/tcpm.c
index f00e69bf4c64..c3dd132f02fe 100644
--- a/drivers/usb/typec/tcpm/tcpm.c
+++ b/drivers/usb/typec/tcpm/tcpm.c
@@ -2855,7 +2855,7 @@ static void tcpm_pd_ctrl_request(struct tcpm_port *port,
 					   PD_MSG_CTRL_NOT_SUPP,
 					   NONE_AMS);
 		} else {
-			if (port->send_discover) {
+			if (port->send_discover && port->negotiated_rev < PD_REV30) {
 				tcpm_queue_message(port, PD_MSG_CTRL_WAIT);
 				break;
 			}
@@ -2871,7 +2871,7 @@ static void tcpm_pd_ctrl_request(struct tcpm_port *port,
 					   PD_MSG_CTRL_NOT_SUPP,
 					   NONE_AMS);
 		} else {
-			if (port->send_discover) {
+			if (port->send_discover && port->negotiated_rev < PD_REV30) {
 				tcpm_queue_message(port, PD_MSG_CTRL_WAIT);
 				break;
 			}
@@ -2880,7 +2880,7 @@ static void tcpm_pd_ctrl_request(struct tcpm_port *port,
 		}
 		break;
 	case PD_CTRL_VCONN_SWAP:
-		if (port->send_discover) {
+		if (port->send_discover && port->negotiated_rev < PD_REV30) {
 			tcpm_queue_message(port, PD_MSG_CTRL_WAIT);
 			break;
 		}

From b2b3a1e6d1bb749bda9eb920ac70a92ff04c41b9 Mon Sep 17 00:00:00 2001
From: Joerg Roedel
Date: Wed, 21 Jun 2023 17:42:42 +0200
Subject: [PATCH 005/139] UPSTREAM: x86/sev: Check IOBM for IOIO exceptions
 from user-space

Upstream commit: b9cb9c45583b911e0db71d09caa6b56469eb2bdf

Check the IO permission bitmap (if present) before emulating IOIO #VC
exceptions for user-space. 
These permissions are checked by hardware already before the #VC is raised, but due to the VC-handler decoding race it needs to be checked again in software. Bug: 309733863 Fixes: 25189d08e516 ("x86/sev-es: Add support for handling IOIO exceptions") Reported-by: Tom Dohrmann Signed-off-by: Joerg Roedel Signed-off-by: Borislav Petkov (AMD) Tested-by: Tom Dohrmann Cc: Signed-off-by: Greg Kroah-Hartman (cherry picked from commit def94eb9a804acdcdba5b959ad72cf9119f03f3b) Signed-off-by: Lee Jones Change-Id: Ia520acc67da21353148fd07a3a8e48ee8a97d364 --- arch/x86/boot/compressed/sev.c | 5 +++++ arch/x86/kernel/sev-shared.c | 22 +++++++++++++++------- arch/x86/kernel/sev.c | 27 +++++++++++++++++++++++++++ 3 files changed, 47 insertions(+), 7 deletions(-) diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index e65f0968e0d9..b9b8ff3fe8e9 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -103,6 +103,11 @@ static enum es_result vc_read_mem(struct es_em_ctxt *ctxt, return ES_OK; } +static enum es_result vc_ioio_check(struct es_em_ctxt *ctxt, u16 port, size_t size) +{ + return ES_OK; +} + #undef __init #undef __pa #define __init diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c index 7dce812ce253..abbe7af14d92 100644 --- a/arch/x86/kernel/sev-shared.c +++ b/arch/x86/kernel/sev-shared.c @@ -693,6 +693,9 @@ static enum es_result vc_insn_string_write(struct es_em_ctxt *ctxt, static enum es_result vc_ioio_exitinfo(struct es_em_ctxt *ctxt, u64 *exitinfo) { struct insn *insn = &ctxt->insn; + size_t size; + u64 port; + *exitinfo = 0; switch (insn->opcode.bytes[0]) { @@ -701,7 +704,7 @@ static enum es_result vc_ioio_exitinfo(struct es_em_ctxt *ctxt, u64 *exitinfo) case 0x6d: *exitinfo |= IOIO_TYPE_INS; *exitinfo |= IOIO_SEG_ES; - *exitinfo |= (ctxt->regs->dx & 0xffff) << 16; + port = ctxt->regs->dx & 0xffff; break; /* OUTS opcodes */ @@ -709,41 +712,43 @@ static enum es_result vc_ioio_exitinfo(struct es_em_ctxt *ctxt, u64 *exitinfo) case 0x6f: *exitinfo |= IOIO_TYPE_OUTS; *exitinfo |= IOIO_SEG_DS; - *exitinfo |= (ctxt->regs->dx & 0xffff) << 16; + port = ctxt->regs->dx & 0xffff; break; /* IN immediate opcodes */ case 0xe4: case 0xe5: *exitinfo |= IOIO_TYPE_IN; - *exitinfo |= (u8)insn->immediate.value << 16; + port = (u8)insn->immediate.value & 0xffff; break; /* OUT immediate opcodes */ case 0xe6: case 0xe7: *exitinfo |= IOIO_TYPE_OUT; - *exitinfo |= (u8)insn->immediate.value << 16; + port = (u8)insn->immediate.value & 0xffff; break; /* IN register opcodes */ case 0xec: case 0xed: *exitinfo |= IOIO_TYPE_IN; - *exitinfo |= (ctxt->regs->dx & 0xffff) << 16; + port = ctxt->regs->dx & 0xffff; break; /* OUT register opcodes */ case 0xee: case 0xef: *exitinfo |= IOIO_TYPE_OUT; - *exitinfo |= (ctxt->regs->dx & 0xffff) << 16; + port = ctxt->regs->dx & 0xffff; break; default: return ES_DECODE_FAILED; } + *exitinfo |= port << 16; + switch (insn->opcode.bytes[0]) { case 0x6c: case 0x6e: @@ -753,12 +758,15 @@ static enum es_result vc_ioio_exitinfo(struct es_em_ctxt *ctxt, u64 *exitinfo) case 0xee: /* Single byte opcodes */ *exitinfo |= IOIO_DATA_8; + size = 1; break; default: /* Length determined by instruction parsing */ *exitinfo |= (insn->opnd_bytes == 2) ? IOIO_DATA_16 : IOIO_DATA_32; + size = (insn->opnd_bytes == 2) ? 
2 : 4; } + switch (insn->addr_bytes) { case 2: *exitinfo |= IOIO_ADDR_16; @@ -774,7 +782,7 @@ static enum es_result vc_ioio_exitinfo(struct es_em_ctxt *ctxt, u64 *exitinfo) if (insn_has_rep_prefix(insn)) *exitinfo |= IOIO_REP; - return ES_OK; + return vc_ioio_check(ctxt, (u16)port, size); } static enum es_result vc_handle_ioio(struct ghcb *ghcb, struct es_em_ctxt *ctxt) diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index afda719dd725..392097f7c241 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -512,6 +512,33 @@ static enum es_result vc_slow_virt_to_phys(struct ghcb *ghcb, struct es_em_ctxt return ES_OK; } +static enum es_result vc_ioio_check(struct es_em_ctxt *ctxt, u16 port, size_t size) +{ + BUG_ON(size > 4); + + if (user_mode(ctxt->regs)) { + struct thread_struct *t = ¤t->thread; + struct io_bitmap *iobm = t->io_bitmap; + size_t idx; + + if (!iobm) + goto fault; + + for (idx = port; idx < port + size; ++idx) { + if (test_bit(idx, iobm->bitmap)) + goto fault; + } + } + + return ES_OK; + +fault: + ctxt->fi.vector = X86_TRAP_GP; + ctxt->fi.error_code = 0; + + return ES_EXCEPTION; +} + /* Include code shared with pre-decompression boot stage */ #include "sev-shared.c" From b2b3a1e6d1bb749bda9eb920ac70a92ff04c41b9 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 16 Oct 2023 14:42:50 +0200 Subject: [PATCH 006/139] UPSTREAM: x86/sev: Check for user-space IOIO pointing to kernel space Upstream commit: 63e44bc52047f182601e7817da969a105aa1f721 Check the memory operand of INS/OUTS before emulating the instruction. The #VC exception can get raised from user-space, but the memory operand can be manipulated to access kernel memory before the emulation actually begins and after the exception handler has run. [ bp: Massage commit message. 
] Bug: 309733863 Fixes: 597cfe48212a ("x86/boot/compressed/64: Setup a GHCB-based VC Exception handler") Reported-by: Tom Dohrmann Signed-off-by: Joerg Roedel Signed-off-by: Borislav Petkov (AMD) Cc: Signed-off-by: Greg Kroah-Hartman (cherry picked from commit 57d0639f60f1ff04cbe7fd52823b94b894d7f812) Signed-off-by: Lee Jones Change-Id: Iac1c2f15cc922ab215d57654b004d020a0b65e53 --- arch/x86/boot/compressed/sev.c | 5 +++++ arch/x86/kernel/sev-shared.c | 31 +++++++++++++++++++++++++++++-- 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index b9b8ff3fe8e9..9c91cc40f456 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -108,6 +108,11 @@ static enum es_result vc_ioio_check(struct es_em_ctxt *ctxt, u16 port, size_t si return ES_OK; } +static bool fault_in_kernel_space(unsigned long address) +{ + return false; +} + #undef __init #undef __pa #define __init diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c index abbe7af14d92..71d8698702ce 100644 --- a/arch/x86/kernel/sev-shared.c +++ b/arch/x86/kernel/sev-shared.c @@ -629,6 +629,23 @@ fail: sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); } +static enum es_result vc_insn_string_check(struct es_em_ctxt *ctxt, + unsigned long address, + bool write) +{ + if (user_mode(ctxt->regs) && fault_in_kernel_space(address)) { + ctxt->fi.vector = X86_TRAP_PF; + ctxt->fi.error_code = X86_PF_USER; + ctxt->fi.cr2 = address; + if (write) + ctxt->fi.error_code |= X86_PF_WRITE; + + return ES_EXCEPTION; + } + + return ES_OK; +} + static enum es_result vc_insn_string_read(struct es_em_ctxt *ctxt, void *src, char *buf, unsigned int data_size, @@ -636,7 +653,12 @@ static enum es_result vc_insn_string_read(struct es_em_ctxt *ctxt, bool backwards) { int i, b = backwards ? -1 : 1; - enum es_result ret = ES_OK; + unsigned long address = (unsigned long)src; + enum es_result ret; + + ret = vc_insn_string_check(ctxt, address, false); + if (ret != ES_OK) + return ret; for (i = 0; i < count; i++) { void *s = src + (i * data_size * b); @@ -657,7 +679,12 @@ static enum es_result vc_insn_string_write(struct es_em_ctxt *ctxt, bool backwards) { int i, s = backwards ? -1 : 1; - enum es_result ret = ES_OK; + unsigned long address = (unsigned long)dst; + enum es_result ret; + + ret = vc_insn_string_check(ctxt, address, true); + if (ret != ES_OK) + return ret; for (i = 0; i < count; i++) { void *d = dst + (i * data_size * s); From bcc758eed789329e391a9a0180262ba51a9e14f9 Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Wed, 13 Dec 2023 09:40:49 +0000 Subject: [PATCH 007/139] Reapply "binder: fix UAF caused by faulty buffer cleanup" This reverts commit 9f67f4f5007d5a8ecc26486f23c1f1d5093e3e9e. Vanir complained that this fix was missing, but only from this branch. Let's bring it back and see how the ABI checker behaves. 
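A quick worked example of the cleanup boundary computed by the helper this
change restores (the numbers are illustrative, not taken from a real
transaction):

    /* With buffer->data_size = 52 and buffer->offsets_size = 16 on a
     * 64-bit kernel:
     *   off_end_offset = ALIGN(52, sizeof(void *)) + 16 = 56 + 16 = 72
     * so binder_release_entire_buffer() walks the offsets section
     * [56, 72) of the buffer, instead of relying on a caller-supplied
     * failed_at of 0. */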
Bug: 275041864 Bug: 308350116 Signed-off-by: Lee Jones Change-Id: I1fc248582347a295d9168bbd8e55dbd6880e34ed --- drivers/android/binder.c | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 766b9d5dffb1..b0188e8ee00b 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -2127,24 +2127,23 @@ static void binder_deferred_fd_close(int fd) static void binder_transaction_buffer_release(struct binder_proc *proc, struct binder_thread *thread, struct binder_buffer *buffer, - binder_size_t failed_at, + binder_size_t off_end_offset, bool is_failure) { int debug_id = buffer->debug_id; - binder_size_t off_start_offset, buffer_offset, off_end_offset; + binder_size_t off_start_offset, buffer_offset; binder_debug(BINDER_DEBUG_TRANSACTION, "%d buffer release %d, size %zd-%zd, failed at %llx\n", proc->pid, buffer->debug_id, buffer->data_size, buffer->offsets_size, - (unsigned long long)failed_at); + (unsigned long long)off_end_offset); if (buffer->target_node) binder_dec_node(buffer->target_node, 1, 0); off_start_offset = ALIGN(buffer->data_size, sizeof(void *)); - off_end_offset = is_failure && failed_at ? failed_at : - off_start_offset + buffer->offsets_size; + for (buffer_offset = off_start_offset; buffer_offset < off_end_offset; buffer_offset += sizeof(binder_size_t)) { struct binder_object_header *hdr; @@ -2304,6 +2303,21 @@ static void binder_transaction_buffer_release(struct binder_proc *proc, } } +/* Clean up all the objects in the buffer */ +static inline void binder_release_entire_buffer(struct binder_proc *proc, + struct binder_thread *thread, + struct binder_buffer *buffer, + bool is_failure) +{ + binder_size_t off_end_offset; + + off_end_offset = ALIGN(buffer->data_size, sizeof(void *)); + off_end_offset += buffer->offsets_size; + + binder_transaction_buffer_release(proc, thread, buffer, + off_end_offset, is_failure); +} + static int binder_translate_binder(struct flat_binder_object *fp, struct binder_transaction *t, struct binder_thread *thread) @@ -3013,7 +3027,7 @@ static int binder_proc_transaction(struct binder_transaction *t, t_outdated->buffer = NULL; buffer->transaction = NULL; trace_binder_transaction_update_buffer_release(buffer); - binder_transaction_buffer_release(proc, NULL, buffer, 0, 0); + binder_release_entire_buffer(proc, NULL, buffer, false); binder_alloc_free_buf(&proc->alloc, buffer); kfree(t_outdated); binder_stats_deleted(BINDER_STAT_TRANSACTION); @@ -4004,7 +4018,7 @@ binder_free_buf(struct binder_proc *proc, binder_node_inner_unlock(buf_node); } trace_binder_transaction_buffer_release(buffer); - binder_transaction_buffer_release(proc, thread, buffer, 0, is_failure); + binder_release_entire_buffer(proc, thread, buffer, is_failure); binder_alloc_free_buf(&proc->alloc, buffer); } From d8d2b95fd0b415ea700d02348ac852f463908950 Mon Sep 17 00:00:00 2001 From: DooHyun Hwang Date: Wed, 13 Dec 2023 15:56:40 +0900 Subject: [PATCH 008/139] ANDROID: ABI: update symbol list for galaxy 2 function symbol(s) added 'int scsi_device_quiesce(struct scsi_device*)' 'void scsi_device_resume(struct scsi_device*)' Bug: 316076675 Change-Id: I301b9445f41736ae485c3779b7164962c17117b2 Signed-off-by: DooHyun Hwang --- android/abi_gki_aarch64.stg | 20 ++++++++++++++++++++ android/abi_gki_aarch64_galaxy | 2 ++ 2 files changed, 22 insertions(+) diff --git a/android/abi_gki_aarch64.stg b/android/abi_gki_aarch64.stg index b6bcc62d2c90..541a5f2beb27 100644 --- a/android/abi_gki_aarch64.stg 
+++ b/android/abi_gki_aarch64.stg
@@ -382267,6 +382267,24 @@ elf_symbol {
   type_id: 0x19c71538
   full_name: "scsi_device_put"
 }
+elf_symbol {
+  id: 0x61df84bc
+  name: "scsi_device_quiesce"
+  is_defined: true
+  symbol_type: FUNCTION
+  crc: 0x0daef571
+  type_id: 0x94dfa784
+  full_name: "scsi_device_quiesce"
+}
+elf_symbol {
+  id: 0x054c0bba
+  name: "scsi_device_resume"
+  is_defined: true
+  symbol_type: FUNCTION
+  crc: 0x069ea5a4
+  type_id: 0x19c71538
+  full_name: "scsi_device_resume"
+}
 elf_symbol {
   id: 0xf10245da
   name: "scsi_dma_map"
@@ -403567,6 +403585,8 @@ interface {
   symbol_id: 0x76dea2aa
   symbol_id: 0x14eb95fa
   symbol_id: 0x474e9bcc
+  symbol_id: 0x61df84bc
+  symbol_id: 0x054c0bba
   symbol_id: 0xf10245da
   symbol_id: 0x18cbd7f9
   symbol_id: 0x30f6b9b1
diff --git a/android/abi_gki_aarch64_galaxy b/android/abi_gki_aarch64_galaxy
index 01f6927b0592..4dfd51b49f67 100644
--- a/android/abi_gki_aarch64_galaxy
+++ b/android/abi_gki_aarch64_galaxy
@@ -274,6 +274,8 @@
   sched_clock
   sched_show_task
   scnprintf
+  scsi_device_quiesce
+  scsi_device_resume
   seq_hex_dump
   seq_lseek
   seq_printf

From 956a0d3998cdc9bfc3608bc358d0fab974bf4905 Mon Sep 17 00:00:00 2001
From: Manish Varma
Date: Mon, 15 Mar 2021 21:40:24 -0700
Subject: [PATCH 009/139] ANDROID: fs: Add vendor hooks for
 ep_create_wakeup_source & timerfd_create

timerfd doesn't create any wakelocks, but eventpoll can. When it does,
it names them after the underlying file descriptor, and since all
timerfd file descriptors are named "[timerfd]" (which saves memory on
systems like desktops with potentially many timerfd instances), all
wakesources created as a result of using the eventpoll-on-timerfd idiom
are called... "[timerfd]". As a result, it becomes impossible to tell
which "[timerfd]" wakesource is affiliated with which process, and hence
troubleshooting is difficult.

Add vendor hooks to allow vendors to assign appropriate names to
timerfd descriptors and eventpoll wakeup sources. 
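A minimal sketch of how a vendor might consume the new hook (illustrative
module code, not part of this patch; the android_vh_netlink_poll hook added
earlier in this series is consumed the same way). The probe appends the
caller's pid/comm so "[timerfd]"-style wakeup sources become
distinguishable:

    #include <linux/kernel.h>
    #include <linux/module.h>
    #include <linux/sched.h>
    #include <linux/string.h>
    #include <trace/hooks/fs.h>

    /* DECLARE_HOOK(android_vh_ep_create_wakeup_source,
     * TP_PROTO(char *name, int len), ...) generates a registration
     * function whose probe takes (void *data, char *name, int len). */
    static void example_ep_ws_name(void *data, char *name, int len)
    {
    	int used = strlen(name);

    	/* Best effort: append ":<pid>:<comm>" only if there is room. */
    	if (used + 1 < len)
    		scnprintf(name + used, len - used, ":%d:%s",
    			  current->pid, current->comm);
    }

    static int __init example_init(void)
    {
    	return register_trace_android_vh_ep_create_wakeup_source(
    			example_ep_ws_name, NULL);
    }
    module_init(example_init);
    MODULE_LICENSE("GPL");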
Bug: 155142106 Signed-off-by: Manish Varma Change-Id: I330a42ab48bed4b26d5eb2f636925c66061165ec (cherry picked from commit 0ff110fbb309be385126a42ac9f7004ba9b0644e) --- drivers/android/vendor_hooks.c | 3 +++ fs/eventpoll.c | 11 +++++++++-- fs/timerfd.c | 9 +++++++-- include/trace/hooks/fs.h | 23 +++++++++++++++++++++++ 4 files changed, 42 insertions(+), 4 deletions(-) create mode 100644 include/trace/hooks/fs.h diff --git a/drivers/android/vendor_hooks.c b/drivers/android/vendor_hooks.c index 789fa7beea83..43eb748c1103 100644 --- a/drivers/android/vendor_hooks.c +++ b/drivers/android/vendor_hooks.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -365,3 +366,5 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_sd_update_bus_speed_mode); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_slab_folio_alloced); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_kmalloc_large_alloced); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_netlink_poll); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_ep_create_wakeup_source); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_timerfd_create); diff --git a/fs/eventpoll.c b/fs/eventpoll.c index eccecd3fac90..30217f0fed81 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -39,6 +39,8 @@ #include #include +#include + /* * LOCKING: * There are three level of locking required by epoll : @@ -1373,15 +1375,20 @@ static int ep_create_wakeup_source(struct epitem *epi) { struct name_snapshot n; struct wakeup_source *ws; + char ws_name[64]; + strlcpy(ws_name, "eventpoll", sizeof(ws_name)); + trace_android_vh_ep_create_wakeup_source(ws_name, sizeof(ws_name)); if (!epi->ep->ws) { - epi->ep->ws = wakeup_source_register(NULL, "eventpoll"); + epi->ep->ws = wakeup_source_register(NULL, ws_name); if (!epi->ep->ws) return -ENOMEM; } take_dentry_name_snapshot(&n, epi->ffd.file->f_path.dentry); - ws = wakeup_source_register(NULL, n.name.name); + strlcpy(ws_name, n.name.name, sizeof(ws_name)); + trace_android_vh_ep_create_wakeup_source(ws_name, sizeof(ws_name)); + ws = wakeup_source_register(NULL, ws_name); release_dentry_name_snapshot(&n); if (!ws) diff --git a/fs/timerfd.c b/fs/timerfd.c index e9c96a0c79f1..de8e736bbf7b 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -28,6 +28,8 @@ #include #include +#include + struct timerfd_ctx { union { struct hrtimer tmr; @@ -407,6 +409,7 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags) { int ufd; struct timerfd_ctx *ctx; + char file_name_buf[32]; /* Check the TFD_* constants for consistency. 
*/ BUILD_BUG_ON(TFD_CLOEXEC != O_CLOEXEC); @@ -443,7 +446,9 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags) ctx->moffs = ktime_mono_to_real(0); - ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx, + strlcpy(file_name_buf, "[timerfd]", sizeof(file_name_buf)); + trace_android_vh_timerfd_create(file_name_buf, sizeof(file_name_buf)); + ufd = anon_inode_getfd(file_name_buf, &timerfd_fops, ctx, O_RDWR | (flags & TFD_SHARED_FCNTL_FLAGS)); if (ufd < 0) kfree(ctx); @@ -451,7 +456,7 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags) return ufd; } -static int do_timerfd_settime(int ufd, int flags, +static int do_timerfd_settime(int ufd, int flags, const struct itimerspec64 *new, struct itimerspec64 *old) { diff --git a/include/trace/hooks/fs.h b/include/trace/hooks/fs.h new file mode 100644 index 000000000000..bb8f177db5c1 --- /dev/null +++ b/include/trace/hooks/fs.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM fs + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH trace/hooks + +#if !defined(_TRACE_HOOK_FS_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HOOK_FS_H + +#include + +DECLARE_HOOK(android_vh_ep_create_wakeup_source, + TP_PROTO(char *name, int len), + TP_ARGS(name, len)); + +DECLARE_HOOK(android_vh_timerfd_create, + TP_PROTO(char *name, int len), + TP_ARGS(name, len)); +#endif /* _TRACE_HOOK_FS_H */ + +/* This part must be outside protection */ +#include \ No newline at end of file From cc294d9503f8aa03d45318c0bf2a7870cea9c930 Mon Sep 17 00:00:00 2001 From: Benjamin Schwartz Date: Wed, 13 Dec 2023 16:12:48 -0800 Subject: [PATCH 010/139] ANDROID: Update the ABI symbol list Adding the following symbols: - __traceiter_android_vh_ep_create_wakeup_source - __traceiter_android_vh_timerfd_create - __tracepoint_android_vh_ep_create_wakeup_source - __tracepoint_android_vh_timerfd_create Bug: 155142106 Change-Id: Ie895faefacd62674ac58783ba6a3cd5c3bc46637 Signed-off-by: Benjamin Schwartz --- android/abi_gki_aarch64.stg | 47 +++++++++++++++++++++++++++++++++++ android/abi_gki_aarch64_pixel | 8 ++++++ 2 files changed, 55 insertions(+) diff --git a/android/abi_gki_aarch64.stg b/android/abi_gki_aarch64.stg index 541a5f2beb27..60b8a56bcba4 100644 --- a/android/abi_gki_aarch64.stg +++ b/android/abi_gki_aarch64.stg @@ -318689,6 +318689,13 @@ function { parameter_id: 0x3e10b518 parameter_id: 0x33756485 } +function { + id: 0x9ba47dcc + return_type_id: 0x6720d32f + parameter_id: 0x18bd6530 + parameter_id: 0x0483e6f8 + parameter_id: 0x6720d32f +} function { id: 0x9ba4eebd return_type_id: 0x6720d32f @@ -336481,6 +336488,15 @@ elf_symbol { type_id: 0x9bcd4ff7 full_name: "__traceiter_android_vh_encrypt_page" } +elf_symbol { + id: 0x7f1591a1 + name: "__traceiter_android_vh_ep_create_wakeup_source" + is_defined: true + symbol_type: FUNCTION + crc: 0x1e8ed582 + type_id: 0x9ba47dcc + full_name: "__traceiter_android_vh_ep_create_wakeup_source" +} elf_symbol { id: 0x1921d10d name: "__traceiter_android_vh_exit_check" @@ -337777,6 +337793,15 @@ elf_symbol { type_id: 0x9ab83ca3 full_name: "__traceiter_android_vh_timer_calc_index" } +elf_symbol { + id: 0x641d703d + name: "__traceiter_android_vh_timerfd_create" + is_defined: true + symbol_type: FUNCTION + crc: 0x8c68d59c + type_id: 0x9ba47dcc + full_name: "__traceiter_android_vh_timerfd_create" +} elf_symbol { id: 0x2bc25325 name: "__traceiter_android_vh_try_to_freeze_todo" @@ -340459,6 +340484,15 @@ elf_symbol { type_id: 0x18ccbd2c full_name: "__tracepoint_android_vh_encrypt_page" 
} +elf_symbol { + id: 0xdef7c547 + name: "__tracepoint_android_vh_ep_create_wakeup_source" + is_defined: true + symbol_type: OBJECT + crc: 0x7db48833 + type_id: 0x18ccbd2c + full_name: "__tracepoint_android_vh_ep_create_wakeup_source" +} elf_symbol { id: 0x684e5f4f name: "__tracepoint_android_vh_exit_check" @@ -341755,6 +341789,15 @@ elf_symbol { type_id: 0x18ccbd2c full_name: "__tracepoint_android_vh_timer_calc_index" } +elf_symbol { + id: 0x2df766e3 + name: "__tracepoint_android_vh_timerfd_create" + is_defined: true + symbol_type: OBJECT + crc: 0x181a4352 + type_id: 0x18ccbd2c + full_name: "__tracepoint_android_vh_timerfd_create" +} elf_symbol { id: 0xd9d2bcff name: "__tracepoint_android_vh_try_to_freeze_todo" @@ -398499,6 +398542,7 @@ interface { symbol_id: 0xdcaa59a3 symbol_id: 0x7ebac47a symbol_id: 0xf586d5b6 + symbol_id: 0x7f1591a1 symbol_id: 0x1921d10d symbol_id: 0x1f554c2a symbol_id: 0x343adff1 @@ -398643,6 +398687,7 @@ interface { symbol_id: 0x226cc38b symbol_id: 0xeecc1529 symbol_id: 0xfeff2e7f + symbol_id: 0x641d703d symbol_id: 0x2bc25325 symbol_id: 0x0119fc41 symbol_id: 0xd9f43028 @@ -398941,6 +398986,7 @@ interface { symbol_id: 0x54b2cd01 symbol_id: 0x188eab44 symbol_id: 0xe7584e1c + symbol_id: 0xdef7c547 symbol_id: 0x684e5f4f symbol_id: 0x0d418d38 symbol_id: 0x2121385f @@ -399085,6 +399131,7 @@ interface { symbol_id: 0xa5c71571 symbol_id: 0xfa3284c7 symbol_id: 0x69721329 + symbol_id: 0x2df766e3 symbol_id: 0xd9d2bcff symbol_id: 0x09ba106b symbol_id: 0xf9580976 diff --git a/android/abi_gki_aarch64_pixel b/android/abi_gki_aarch64_pixel index d19d3f7e39aa..7153beba5ba6 100644 --- a/android/abi_gki_aarch64_pixel +++ b/android/abi_gki_aarch64_pixel @@ -1162,7 +1162,10 @@ kernel_param_lock kernel_param_unlock kernel_restart + kernfs_find_and_get_ns + kernfs_notify kernfs_path_from_node + kernfs_put key_create_or_update key_put keyring_alloc @@ -2150,6 +2153,7 @@ thermal_zone_get_temp thermal_zone_get_zone_by_name thread_group_cputime_adjusted + tick_nohz_get_idle_calls_cpu time64_to_tm topology_update_thermal_pressure _totalram_pages @@ -2209,6 +2213,7 @@ __traceiter_android_vh_dup_task_struct __traceiter_android_vh_early_resume_begin __traceiter_android_vh_enable_thermal_genl_check + __traceiter_android_vh_ep_create_wakeup_source __traceiter_android_vh_filemap_get_folio __traceiter_android_vh_ipi_stop __traceiter_android_vh_meminfo_proc_show @@ -2222,6 +2227,7 @@ __traceiter_android_vh_setscheduler_uclamp __traceiter_android_vh_si_meminfo_adjust __traceiter_android_vh_sysrq_crash + __traceiter_android_vh_timerfd_create __traceiter_android_vh_typec_store_partner_src_caps __traceiter_android_vh_typec_tcpci_override_toggling __traceiter_android_vh_typec_tcpm_get_timer @@ -2316,6 +2322,7 @@ __tracepoint_android_vh_dup_task_struct __tracepoint_android_vh_early_resume_begin __tracepoint_android_vh_enable_thermal_genl_check + __tracepoint_android_vh_ep_create_wakeup_source __tracepoint_android_vh_filemap_get_folio __tracepoint_android_vh_ipi_stop __tracepoint_android_vh_meminfo_proc_show @@ -2329,6 +2336,7 @@ __tracepoint_android_vh_setscheduler_uclamp __tracepoint_android_vh_si_meminfo_adjust __tracepoint_android_vh_sysrq_crash + __tracepoint_android_vh_timerfd_create __tracepoint_android_vh_typec_store_partner_src_caps __tracepoint_android_vh_typec_tcpci_override_toggling __tracepoint_android_vh_typec_tcpm_get_timer From cc653d701f72a92103ae30b7573129258d34e5e9 Mon Sep 17 00:00:00 2001 From: Elliot Berman Date: Thu, 14 Dec 2023 15:16:26 -0800 Subject: [PATCH 011/139] ANDROID: virt: 
gunyah: Zero state_data after vcpu_run

Do not re-use stale state_data on subsequent vcpu runs, as the stale
data could be interpreted by Gunyah and rejected.

Bug: 268234781
Change-Id: I3d4bf7a922da1e0e85006ffa58b64a74e320d3c9
Signed-off-by: Elliot Berman
---
 drivers/virt/gunyah/gunyah_vcpu.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/virt/gunyah/gunyah_vcpu.c b/drivers/virt/gunyah/gunyah_vcpu.c
index 82a0cbf55caf..bb13a1aed2e4 100644
--- a/drivers/virt/gunyah/gunyah_vcpu.c
+++ b/drivers/virt/gunyah/gunyah_vcpu.c
@@ -196,6 +196,7 @@ static int gh_vcpu_run(struct gh_vcpu *vcpu)
 	}
 	gh_error = gh_hypercall_vcpu_run(vcpu->rsc->capid, state_data,
 					 &vcpu_run_resp);
+	memset(state_data, 0, sizeof(state_data));
 	if (gh_error == GH_ERROR_OK) {
 		switch (vcpu_run_resp.state) {
 		case GH_VCPU_STATE_READY:

From 01dd8c280b9cfa4b3bbd4a2ffbaa0e07567a5163 Mon Sep 17 00:00:00 2001
From: Will Deacon
Date: Wed, 25 Oct 2023 13:52:57 +0100
Subject: [PATCH 012/139] ANDROID: KVM: arm64: Prefault entries when splitting
 a block mapping

When splitting a block mapping, we install a table entry pointing to an
empty page and recreate the new entries lazily as we fault them in. For
page-tables with the KVM_PGTABLE_S2_IDMAP flag, this can result in
unnecessary translation faults.

When splitting a block for a page-table with KVM_PGTABLE_S2_IDMAP set,
pre-populate the newly allocated page-table page with contiguous ptes
based on the attributes of the block.

Bug: 308373293
Change-Id: I0c53d048de913e193830caef93d75755270db709
Signed-off-by: Will Deacon
Signed-off-by: Keir Fraser
---
 arch/arm64/kvm/hyp/pgtable.c | 22 ++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index b7e8faa894c9..2d11455aabe8 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -775,6 +775,22 @@ static int stage2_map_walk_table_pre(u64 addr, u64 end, u32 level,
 	return 0;
 }
 
+static void stage2_map_prefault_idmap(u64 addr, u32 level, kvm_pte_t *ptep,
+				      kvm_pte_t attr)
+{
+	u64 granule = kvm_granule_size(level);
+	int i;
+
+	if (!kvm_pte_valid(attr))
+		return;
+
+	for (i = 0; i < PTRS_PER_PTE; ++i, ++ptep, addr += granule) {
+		kvm_pte_t pte = kvm_init_valid_leaf_pte(addr, attr, level);
+		/* We can write non-atomically: ptep isn't yet live. */
+		*ptep = pte;
+	}
+}
+
 static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
 				struct stage2_map_data *data)
 {
@@ -805,6 +821,12 @@ static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
 	if (!childp)
 		return -ENOMEM;
 
+	if (pgt->flags & KVM_PGTABLE_S2_IDMAP) {
+		WARN_ON(pte_ops->pte_is_counted_cb(pte, level));
+		addr = ALIGN_DOWN(addr, kvm_granule_size(level));
+		stage2_map_prefault_idmap(addr, level + 1, childp, pte);
+	}
+
 	/*
 	 * If we've run into an existing block mapping then replace it with
 	 * a table. Accesses beyond 'end' that fall within the new table

From f082d22541bf871efd1b7d56d0bf51ffa0e01bfd Mon Sep 17 00:00:00 2001
From: Keir Fraser
Date: Tue, 31 Oct 2023 17:09:30 +0000
Subject: [PATCH 013/139] ANDROID: KVM: arm64: Optimise
 module_change_host_page_prot

Merge the relaxation and restriction paths so that both only need to
adjust permissions. This avoids an unmap + remap on the restriction
path, and avoids installing an annotated entry on the relaxation path
(which will cause a translation fault on first access by the host). 
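To illustrate the effect (a sketch only, not code from this series;
KVM_PGTABLE_PROT_R is assumed from the standard kvm_pgtable prot flags):

    /* Both directions are now in-place permission updates on the
     * existing mapping, rather than an unmap followed by a remap: */
    module_change_host_page_prot(pfn, KVM_PGTABLE_PROT_R);    /* restrict */
    module_change_host_page_prot(pfn, KVM_PGTABLE_PROT_RWX);  /* relax */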
Bug: 308373293 Change-Id: I9c7a6ac149aad64b19a5ce7808334188475b27cc Signed-off-by: Keir Fraser --- arch/arm64/kvm/hyp/nvhe/mem_protect.c | 25 ++++++------------------- arch/arm64/kvm/hyp/pgtable.c | 20 ++++++++++++++------ 2 files changed, 20 insertions(+), 25 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 86cd64130328..4ba504f5f4bd 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -2008,19 +2008,6 @@ int __pkvm_hyp_donate_host(u64 pfn, u64 nr_pages) return ret; } -static int restrict_host_page_perms(u64 addr, kvm_pte_t pte, u32 level, enum kvm_pgtable_prot prot) -{ - int ret = 0; - - /* XXX: optimize ... */ - if (kvm_pte_valid(pte) && (level == KVM_PGTABLE_MAX_LEVELS - 1)) - ret = kvm_pgtable_stage2_unmap(&host_mmu.pgt, addr, PAGE_SIZE); - if (!ret) - ret = host_stage2_idmap_locked(addr, PAGE_SIZE, prot, false); - - return ret; -} - #define MODULE_PROT_ALLOWLIST (KVM_PGTABLE_PROT_RWX | \ KVM_PGTABLE_PROT_DEVICE |\ KVM_PGTABLE_PROT_NC | \ @@ -2065,12 +2052,12 @@ int module_change_host_page_prot(u64 pfn, enum kvm_pgtable_prot prot) } update: - if (prot == default_host_prot(!!page)) - ret = host_stage2_set_owner_locked(addr, PAGE_SIZE, PKVM_ID_HOST); - else if (!prot) - ret = host_stage2_set_owner_locked(addr, PAGE_SIZE, PKVM_ID_PROTECTED); - else - ret = restrict_host_page_perms(addr, pte, level, prot); + if (!prot) { + ret = host_stage2_set_owner_locked(addr, PAGE_SIZE, + PKVM_ID_PROTECTED); + } else { + ret = host_stage2_idmap_locked(addr, PAGE_SIZE, prot, false); + } if (ret || !page) goto unlock; diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 2d11455aabe8..b9140293da7d 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -645,8 +645,13 @@ enum kvm_pgtable_prot kvm_pgtable_stage2_pte_prot(kvm_pte_t pte) return prot; } -static bool stage2_pte_needs_update(kvm_pte_t old, kvm_pte_t new) +static bool stage2_pte_needs_update(struct kvm_pgtable *pgt, + kvm_pte_t old, kvm_pte_t new) { + /* Following filter logic applies only to guest stage-2 entries. */ + if (pgt->flags & KVM_PGTABLE_S2_IDMAP) + return true; + if (!kvm_pte_valid(old) || !kvm_pte_valid(new)) return true; @@ -715,12 +720,15 @@ static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level, new = data->annotation; /* - * Skip updating the PTE if we are trying to recreate the exact - * same mapping or only change the access permissions. Instead, - * the vCPU will exit one more time from guest if still needed - * and then go through the path of relaxing permissions. + * Skip updating a guest PTE if we are trying to recreate the exact + * same mapping or change only the access permissions. Instead, + * the vCPU will exit one more time from the guest if still needed + * and then go through the path of relaxing permissions. This applies + * only to guest PTEs; Host PTEs are unconditionally updated. The + * host cannot livelock because the abort handler has done prior + * checks before calling here. */ - if (!stage2_pte_needs_update(old, new)) + if (!stage2_pte_needs_update(pgt, old, new)) return -EAGAIN; if (pte_ops->pte_is_counted_cb(old, level)) From fd720ebc6a4049162e3bec2f16b95fd57931164d Mon Sep 17 00:00:00 2001 From: Keir Fraser Date: Wed, 1 Nov 2023 15:54:56 +0000 Subject: [PATCH 014/139] ANDROID: KVM: arm64: Relax checks in module_change_host_page_prot Modules can only relax permissions to RWX. This seems rather arbitrary. 
Instead, allow any valid permissions to be set, as long as the page is a pristine host page, or already module owned. Bug: 308373293 Change-Id: I905786fad6543f47a00bd9b9f07e17dd660d457c Signed-off-by: Keir Fraser --- arch/arm64/kvm/hyp/nvhe/mem_protect.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 4ba504f5f4bd..f7f1c184b4fd 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -2041,15 +2041,12 @@ int module_change_host_page_prot(u64 pfn, enum kvm_pgtable_prot prot) page = hyp_phys_to_page(addr); /* - * Modules can only relax permissions of pages they own, and restrict - * permissions of pristine pages. + * Modules can only modify pages they already own, and pristine host + * pages. */ - if (prot == KVM_PGTABLE_PROT_RWX) { - if (!(page->flags & MODULE_OWNED_PAGE)) - goto unlock; - } else if (host_get_page_state(pte, addr) != PKVM_PAGE_OWNED) { + if (!(page->flags & MODULE_OWNED_PAGE) && + (host_get_page_state(pte, addr) != PKVM_PAGE_OWNED)) goto unlock; - } update: if (!prot) { From fbc707442cb65d1368b82291c5975cdea1cc6c74 Mon Sep 17 00:00:00 2001 From: Keir Fraser Date: Thu, 2 Nov 2023 16:26:11 +0000 Subject: [PATCH 015/139] ANDROID: KVM: arm64: Introduce module_change_host_prot_range This allows protection attributes to be changed for a range of pages via a single module API call. The original API call modifying a single page is now implemented as a shim on top of the new range-based call. The ABI STG is also fixed up: type 'struct pkvm_module_ops' changed member 'union { int(* host_stage2_mod_prot_range)(u64, enum kvm_pgtable_prot, u64); struct { u64 android_kabi_reserved1; }; union { }; }' was added member 'u64 android_kabi_reserved1' was removed Bug: 308373293 Change-Id: I6fbb2e0b325aa972148f48746565dcc10d74edaf Signed-off-by: Keir Fraser --- android/abi_gki_aarch64.stg | 34 +++++++++- arch/arm64/include/asm/kvm_pkvm_module.h | 3 +- arch/arm64/kvm/hyp/include/nvhe/mem_protect.h | 1 + arch/arm64/kvm/hyp/nvhe/mem_protect.c | 66 +++++++++++++------ arch/arm64/kvm/hyp/nvhe/modules.c | 1 + 5 files changed, 82 insertions(+), 23 deletions(-) diff --git a/android/abi_gki_aarch64.stg b/android/abi_gki_aarch64.stg index 60b8a56bcba4..0ceacc54c6de 100644 --- a/android/abi_gki_aarch64.stg +++ b/android/abi_gki_aarch64.stg @@ -12283,6 +12283,11 @@ pointer_reference { kind: POINTER pointee_type_id: 0xb94739b9 } +pointer_reference { + id: 0x24c218d7 + kind: POINTER + pointee_type_id: 0xb94885c2 +} pointer_reference { id: 0x24c6c7eb kind: POINTER @@ -40287,6 +40292,11 @@ member { type_id: 0x797868f8 offset: 32 } +member { + id: 0x3dbb0f88 + type_id: 0x79c25039 + offset: 2048 +} member { id: 0x3dbd80ff type_id: 0x79d85976 @@ -100848,6 +100858,11 @@ member { type_id: 0x24cb3ae4 offset: 896 } +member { + id: 0xbcc50199 + name: "host_stage2_mod_prot_range" + type_id: 0x24c218d7 +} member { id: 0xedc7b540 name: "host_status" @@ -215152,6 +215167,16 @@ struct_union { member_id: 0x3bfa35f3 } } +struct_union { + id: 0x79c25039 + kind: UNION + definition { + bytesize: 8 + member_id: 0xbcc50199 + member_id: 0x27000c61 + member_id: 0x36752b74 + } +} struct_union { id: 0x79d85976 kind: UNION @@ -247063,7 +247088,7 @@ struct_union { member_id: 0x636da10f member_id: 0x6f066e7f member_id: 0x3afd0925 - member_id: 0x2d0812b0 + member_id: 0x3dbb0f88 member_id: 0x637607e0 member_id: 0xac894cc9 member_id: 0xe0f63db8 @@ -327601,6 +327626,13 @@ function { 
parameter_id: 0x18bd6530 parameter_id: 0x310ec01d } +function { + id: 0xb94885c2 + return_type_id: 0x6720d32f + parameter_id: 0x92233392 + parameter_id: 0x1908b154 + parameter_id: 0x92233392 +} function { id: 0xb94d0c8b return_type_id: 0x06835e9c diff --git a/arch/arm64/include/asm/kvm_pkvm_module.h b/arch/arm64/include/asm/kvm_pkvm_module.h index 5752e1d11abd..bf68d862b7d8 100644 --- a/arch/arm64/include/asm/kvm_pkvm_module.h +++ b/arch/arm64/include/asm/kvm_pkvm_module.h @@ -153,7 +153,8 @@ struct pkvm_module_ops { void* (*hyp_va)(phys_addr_t phys); unsigned long (*kern_hyp_va)(unsigned long x); - ANDROID_KABI_RESERVE(1); + ANDROID_KABI_USE(1, int (*host_stage2_mod_prot_range)(u64 pfn, enum kvm_pgtable_prot prot, u64 nr_pages)); + ANDROID_KABI_RESERVE(2); ANDROID_KABI_RESERVE(3); ANDROID_KABI_RESERVE(4); diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h index b0eabed053d2..f80d15d52be6 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h @@ -104,6 +104,7 @@ int refill_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages, struct kvm_hyp_memcache *host_mc); int module_change_host_page_prot(u64 pfn, enum kvm_pgtable_prot prot); +int module_change_host_page_prot_range(u64 pfn, enum kvm_pgtable_prot prot, u64 nr_pages); void destroy_hyp_vm_pgt(struct pkvm_hyp_vm *vm); void drain_hyp_pool(struct pkvm_hyp_vm *vm, struct kvm_hyp_memcache *mc); diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index f7f1c184b4fd..2c1032a59826 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -2013,56 +2013,75 @@ int __pkvm_hyp_donate_host(u64 pfn, u64 nr_pages) KVM_PGTABLE_PROT_NC | \ KVM_PGTABLE_PROT_PXN | \ KVM_PGTABLE_PROT_UXN) -int module_change_host_page_prot(u64 pfn, enum kvm_pgtable_prot prot) + +int module_change_host_page_prot_range(u64 pfn, enum kvm_pgtable_prot prot, u64 nr_pages) { - u64 addr = hyp_pfn_to_phys(pfn); + u64 i, addr = hyp_pfn_to_phys(pfn); + u64 end = addr + nr_pages * PAGE_SIZE; struct hyp_page *page = NULL; - kvm_pte_t pte; - u32 level; + struct kvm_mem_range range; + bool is_mmio; int ret; if ((prot & MODULE_PROT_ALLOWLIST) != prot) return -EINVAL; + is_mmio = !find_mem_range(addr, &range); + if (end > range.end) { + /* Specified range not in a single mmio or memory block. */ + return -EPERM; + } + host_lock_component(); - ret = kvm_pgtable_get_leaf(&host_mmu.pgt, addr, &pte, &level); - if (ret) - goto unlock; /* * There is no hyp_vmemmap covering MMIO regions, which makes tracking * of module-owned MMIO regions hard, so we trust the modules not to * mess things up. */ - if (!addr_is_memory(addr)) + if (is_mmio) goto update; - ret = -EPERM; + /* Range is memory: we can track module ownership. */ page = hyp_phys_to_page(addr); /* * Modules can only modify pages they already own, and pristine host - * pages. + * pages. The entire range must be consistently one or the other. */ - if (!(page->flags & MODULE_OWNED_PAGE) && - (host_get_page_state(pte, addr) != PKVM_PAGE_OWNED)) - goto unlock; + if (page->flags & MODULE_OWNED_PAGE) { + /* The entire range must be module-owned. */ + ret = -EPERM; + for (i = 1; i < nr_pages; i++) { + if (!(page[i].flags & MODULE_OWNED_PAGE)) + goto unlock; + } + } else { + /* The entire range must be pristine. 
*/ + ret = __host_check_page_state_range( + addr, nr_pages << PAGE_SHIFT, PKVM_PAGE_OWNED); + if (ret) + goto unlock; + } update: if (!prot) { - ret = host_stage2_set_owner_locked(addr, PAGE_SIZE, - PKVM_ID_PROTECTED); + ret = host_stage2_set_owner_locked( + addr, nr_pages << PAGE_SHIFT, PKVM_ID_PROTECTED); } else { - ret = host_stage2_idmap_locked(addr, PAGE_SIZE, prot, false); + ret = host_stage2_idmap_locked( + addr, nr_pages << PAGE_SHIFT, prot, false); } - if (ret || !page) + if (WARN_ON(ret) || !page) goto unlock; - if (prot != KVM_PGTABLE_PROT_RWX) - hyp_phys_to_page(addr)->flags |= MODULE_OWNED_PAGE; - else - hyp_phys_to_page(addr)->flags &= ~MODULE_OWNED_PAGE; + for (i = 0; i < nr_pages; i++) { + if (prot != KVM_PGTABLE_PROT_RWX) + page[i].flags |= MODULE_OWNED_PAGE; + else + page[i].flags &= ~MODULE_OWNED_PAGE; + } unlock: host_unlock_component(); @@ -2070,6 +2089,11 @@ unlock: return ret; } +int module_change_host_page_prot(u64 pfn, enum kvm_pgtable_prot prot) +{ + return module_change_host_page_prot_range(pfn, prot, 1); +} + int hyp_pin_shared_mem(void *from, void *to) { u64 cur, start = ALIGN_DOWN((u64)from, PAGE_SIZE); diff --git a/arch/arm64/kvm/hyp/nvhe/modules.c b/arch/arm64/kvm/hyp/nvhe/modules.c index 49e6c2c2e2ae..862e7b7a75ff 100644 --- a/arch/arm64/kvm/hyp/nvhe/modules.c +++ b/arch/arm64/kvm/hyp/nvhe/modules.c @@ -115,6 +115,7 @@ const struct pkvm_module_ops module_ops = { .hyp_pa = hyp_virt_to_phys, .hyp_va = hyp_phys_to_virt, .kern_hyp_va = __kern_hyp_va, + .host_stage2_mod_prot_range = module_change_host_page_prot_range, }; int __pkvm_init_module(void *module_init) From 4fa87d4d8fce9c45afcbcb03258b7be84052f7c0 Mon Sep 17 00:00:00 2001 From: Keir Fraser Date: Tue, 7 Nov 2023 15:40:41 +0000 Subject: [PATCH 016/139] ANDROID: KVM: arm64: Skip prefaulting ptes which will be modified later Block mappings can be split as part of a page table update. When prefaulting entries during the split, it is pointless to install valid ptes which will later be modified by the same walk. At the same time, push the check for pte_is_counted into the prefault handler, where it logically belongs. Bug: 308373293 Change-Id: If4599b2860aa62d82ce8db019a8410c2d883de71 Signed-off-by: Keir Fraser --- arch/arm64/kvm/hyp/pgtable.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index b9140293da7d..64387388584c 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -783,19 +783,27 @@ static int stage2_map_walk_table_pre(u64 addr, u64 end, u32 level, return 0; } -static void stage2_map_prefault_idmap(u64 addr, u32 level, kvm_pte_t *ptep, - kvm_pte_t attr) +static void stage2_map_prefault_idmap(struct kvm_pgtable_pte_ops *pte_ops, + u64 addr, u64 end, u32 level, + kvm_pte_t *ptep, kvm_pte_t block_pte) { - u64 granule = kvm_granule_size(level); + u64 pa, granule; int i; - if (!kvm_pte_valid(attr)) + WARN_ON(pte_ops->pte_is_counted_cb(block_pte, level-1)); + + if (!kvm_pte_valid(block_pte)) return; - for (i = 0; i < PTRS_PER_PTE; ++i, ++ptep, addr += granule) { - kvm_pte_t pte = kvm_init_valid_leaf_pte(addr, attr, level); - /* We can write non-atomically: ptep isn't yet live. */ - *ptep = pte; + pa = ALIGN_DOWN(addr, kvm_granule_size(level-1)); + granule = kvm_granule_size(level); + for (i = 0; i < PTRS_PER_PTE; ++i, ++ptep, pa += granule) { + kvm_pte_t pte = kvm_init_valid_leaf_pte(pa, block_pte, level); + /* Skip ptes in the range being modified by the caller. 
*/ + if ((pa < addr) || (pa >= end)) { + /* We can write non-atomically: ptep isn't yet live. */ + *ptep = pte; + } } } @@ -830,9 +838,8 @@ static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, return -ENOMEM; if (pgt->flags & KVM_PGTABLE_S2_IDMAP) { - WARN_ON(pte_ops->pte_is_counted_cb(pte, level)); - addr = ALIGN_DOWN(addr, kvm_granule_size(level)); - stage2_map_prefault_idmap(addr, level + 1, childp, pte); + stage2_map_prefault_idmap(pte_ops, addr, end, level + 1, + childp, pte); } /* From ac1031618a7d1cfb0a804439212de64f115a7773 Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Fri, 15 Dec 2023 09:36:46 +0000 Subject: [PATCH 017/139] ANDROID: Snapshot Mainline's version of checkpatch.pl Nothing fancy here. Keeping full history is not required. `git checkout mainline/master -- scripts/checkpatch.pl` This may need to be done periodically. Bug: 316492624 Signed-off-by: Lee Jones Change-Id: I4c90b50197ca7277c59e96bf332ecf795c4f3d12 --- scripts/checkpatch.pl | 170 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 157 insertions(+), 13 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 1e5e66ae5a52..25fdb7fda112 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -74,6 +74,8 @@ my $git_command ='export LANGUAGE=en_US.UTF-8; git'; my $tabsize = 8; my ${CONFIG_} = "CONFIG_"; +my %maybe_linker_symbol; # for externs in c exceptions, when seen in *vmlinux.lds.h + sub help { my ($exitcode) = @_; @@ -620,6 +622,22 @@ our $signature_tags = qr{(?xi: Cc: )}; +our @link_tags = qw(Link Closes); + +#Create a search and print patterns for all these strings to be used directly below +our $link_tags_search = ""; +our $link_tags_print = ""; +foreach my $entry (@link_tags) { + if ($link_tags_search ne "") { + $link_tags_search .= '|'; + $link_tags_print .= ' or '; + } + $entry .= ':'; + $link_tags_search .= $entry; + $link_tags_print .= "'$entry'"; +} +$link_tags_search = "(?:${link_tags_search})"; + our $tracing_logging_tags = qr{(?xi: [=-]*> | <[=-]* | @@ -702,6 +720,17 @@ sub find_standard_signature { return ""; } +our $obsolete_archives = qr{(?xi: + \Qfreedesktop.org/archives/dri-devel\E | + \Qlists.infradead.org\E | + \Qlkml.org\E | + \Qmail-archive.com\E | + \Qmailman.alsa-project.org/pipermail\E | + \Qmarc.info\E | + \Qozlabs.org/pipermail\E | + \Qspinics.net\E +)}; + our @typeListMisordered = ( qr{char\s+(?:un)?signed}, qr{int\s+(?:(?:un)?signed\s+)?short\s}, @@ -812,7 +841,9 @@ our %deprecated_apis = ( "get_state_synchronize_sched" => "get_state_synchronize_rcu", "cond_synchronize_sched" => "cond_synchronize_rcu", "kmap" => "kmap_local_page", + "kunmap" => "kunmap_local", "kmap_atomic" => "kmap_local_page", + "kunmap_atomic" => "kunmap_local", ); #Create a search pattern for all these strings to speed up a loop below @@ -3131,21 +3162,33 @@ sub process { if ($sign_off =~ /^co-developed-by:$/i) { if ($email eq $author) { WARN("BAD_SIGN_OFF", - "Co-developed-by: should not be used to attribute nominal patch author '$author'\n" . "$here\n" . $rawline); + "Co-developed-by: should not be used to attribute nominal patch author '$author'\n" . $herecurr); } if (!defined $lines[$linenr]) { WARN("BAD_SIGN_OFF", - "Co-developed-by: must be immediately followed by Signed-off-by:\n" . "$here\n" . $rawline); - } elsif ($rawlines[$linenr] !~ /^\s*signed-off-by:\s*(.*)/i) { + "Co-developed-by: must be immediately followed by Signed-off-by:\n" . 
$herecurr); + } elsif ($rawlines[$linenr] !~ /^signed-off-by:\s*(.*)/i) { WARN("BAD_SIGN_OFF", - "Co-developed-by: must be immediately followed by Signed-off-by:\n" . "$here\n" . $rawline . "\n" .$rawlines[$linenr]); + "Co-developed-by: must be immediately followed by Signed-off-by:\n" . $herecurr . $rawlines[$linenr] . "\n"); } elsif ($1 ne $email) { WARN("BAD_SIGN_OFF", - "Co-developed-by and Signed-off-by: name/email do not match \n" . "$here\n" . $rawline . "\n" .$rawlines[$linenr]); + "Co-developed-by and Signed-off-by: name/email do not match\n" . $herecurr . $rawlines[$linenr] . "\n"); + } + } + +# check if Reported-by: is followed by a Closes: tag + if ($sign_off =~ /^reported(?:|-and-tested)-by:$/i) { + if (!defined $lines[$linenr]) { + WARN("BAD_REPORTED_BY_LINK", + "Reported-by: should be immediately followed by Closes: with a URL to the report\n" . $herecurr . "\n"); + } elsif ($rawlines[$linenr] !~ /^closes:\s*/i) { + WARN("BAD_REPORTED_BY_LINK", + "Reported-by: should be immediately followed by Closes: with a URL to the report\n" . $herecurr . $rawlines[$linenr] . "\n"); } } } + # Check Fixes: styles is correct if (!$in_header_lines && $line =~ /^\s*fixes:?\s*(?:commit\s*)?[0-9a-f]{5,}\b/i) { @@ -3225,11 +3268,11 @@ sub process { # file delta changes $line =~ /^\s*(?:[\w\.\-\+]*\/)++[\w\.\-\+]+:/ || # filename then : - $line =~ /^\s*(?:Fixes:|Link:|$signature_tags)/i || - # A Fixes: or Link: line or signature tag line + $line =~ /^\s*(?:Fixes:|$link_tags_search|$signature_tags)/i || + # A Fixes:, link or signature tag line $commit_log_possible_stack_dump)) { WARN("COMMIT_LOG_LONG_LINE", - "Possible unwrapped commit description (prefer a maximum 75 chars per line)\n" . $herecurr); + "Prefer a maximum 75 chars per line (possible unwrapped commit description?)\n" . $herecurr); $commit_log_long_line = 1; } @@ -3239,6 +3282,29 @@ sub process { $commit_log_possible_stack_dump = 0; } +# Check for odd tags before a URI/URL + if ($in_commit_log && + $line =~ /^\s*(\w+:)\s*http/ && $1 !~ /^$link_tags_search$/) { + if ($1 =~ /^v(?:ersion)?\d+/i) { + WARN("COMMIT_LOG_VERSIONING", + "Patch version information should be after the --- line\n" . $herecurr); + } else { + WARN("COMMIT_LOG_USE_LINK", + "Unknown link reference '$1', use $link_tags_print instead\n" . $herecurr); + } + } + +# Check for misuse of the link tags + if ($in_commit_log && + $line =~ /^\s*(\w+:)\s*(\S+)/) { + my $tag = $1; + my $value = $2; + if ($tag =~ /^$link_tags_search$/ && $value !~ m{^https?://}) { + WARN("COMMIT_LOG_WRONG_LINK", + "'$tag' should be followed by a public http(s) link\n" . $herecurr); + } + } + # Check for lines starting with a # if ($in_commit_log && $line =~ /^#/) { if (WARN("COMMIT_COMMENT_SYMBOL", @@ -3324,6 +3390,12 @@ sub process { $last_git_commit_id_linenr = $linenr if ($line =~ /\bcommit\s*$/i); } +# Check for mailing list archives other than lore.kernel.org + if ($rawline =~ m{http.*\b$obsolete_archives}) { + WARN("PREFER_LORE_ARCHIVE", + "Use lore.kernel.org archive links when possible - see https://lore.kernel.org/lists.html\n" . $herecurr); + } + # Check for added, moved or deleted files if (!$reported_maintainer_file && !$in_commit_log && ($line =~ /^(?:new|deleted) file mode\s*\d+\s*$/ || @@ -3693,7 +3765,7 @@ sub process { "'$spdx_license' is not supported in LICENSES/...\n" . $herecurr); } if ($realfile =~ m@^Documentation/devicetree/bindings/@ && - not $spdx_license =~ /GPL-2\.0.*BSD-2-Clause/) { + $spdx_license !~ /GPL-2\.0(?:-only)? 
OR BSD-2-Clause/) { my $msg_level = \&WARN; $msg_level = \&CHK if ($file); if (&{$msg_level}("SPDX_LICENSE_TAG", @@ -3703,12 +3775,17 @@ sub process { $fixed[$fixlinenr] =~ s/SPDX-License-Identifier: .*/SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)/; } } + if ($realfile =~ m@^include/dt-bindings/@ && + $spdx_license !~ /GPL-2\.0(?:-only)? OR \S+/) { + WARN("SPDX_LICENSE_TAG", + "DT binding headers should be licensed (GPL-2.0-only OR .*)\n" . $herecurr); + } } } } # check for embedded filenames - if ($rawline =~ /^\+.*\Q$realfile\E/) { + if ($rawline =~ /^\+.*\b\Q$realfile\E\b/) { WARN("EMBEDDED_FILENAME", "It's generally not useful to have the filename in the file\n" . $herecurr); } @@ -4971,7 +5048,7 @@ sub process { if|for|while|switch|return|case| volatile|__volatile__| __attribute__|format|__extension__| - asm|__asm__)$/x) + asm|__asm__|scoped_guard)$/x) { # cpp #define statements have non-optional spaces, ie # if there is a space between the name and the open @@ -5766,6 +5843,8 @@ sub process { $var !~ /^(?:[A-Z]+_){1,5}[A-Z]{1,3}[a-z]/ && #Ignore Page variants $var !~ /^(?:Clear|Set|TestClear|TestSet|)Page[A-Z]/ && +#Ignore ETHTOOL_LINK_MODE_ variants + $var !~ /^ETHTOOL_LINK_MODE_/ && #Ignore SI style variants like nS, mV and dB #(ie: max_uV, regulator_min_uA_show, RANGE_mA_VALUE) $var !~ /^(?:[a-z0-9_]*|[A-Z0-9_]*)?_?[a-z][A-Z](?:_[a-z0-9_]+|_[A-Z0-9_]+)?$/ && @@ -5901,6 +5980,7 @@ sub process { $dstat !~ /$exceptions/ && $dstat !~ /^\.$Ident\s*=/ && # .foo = $dstat !~ /^(?:\#\s*$Ident|\#\s*$Constant)\s*$/ && # stringification #foo + $dstat !~ /^case\b/ && # case ... $dstat !~ /^do\s*$Constant\s*while\s*$Constant;?$/ && # do {...} while (...); // do {...} while (...) $dstat !~ /^while\s*$Constant\s*$Constant\s*$/ && # while (...) {...} $dstat !~ /^for\s*$Constant$/ && # for (...) @@ -5973,6 +6053,9 @@ sub process { # check for line continuations outside of #defines, preprocessor #, and asm + } elsif ($realfile =~ m@/vmlinux.lds.h$@) { + $line =~ s/(\w+)/$maybe_linker_symbol{$1}++/ge; + #print "REAL: $realfile\nln: $line\nkeys:", sort keys %maybe_linker_symbol; } else { if ($prevline !~ /^..*\\$/ && $line !~ /^\+\s*\#.*\\$/ && # preprocessor @@ -6910,10 +6993,22 @@ sub process { # } # } +# strcpy uses that should likely be strscpy + if ($line =~ /\bstrcpy\s*\(/) { + WARN("STRCPY", + "Prefer strscpy over strcpy - see: https://github.com/KSPP/linux/issues/88\n" . $herecurr); + } + # strlcpy uses that should likely be strscpy if ($line =~ /\bstrlcpy\s*\(/) { WARN("STRLCPY", - "Prefer strscpy over strlcpy - see: https://lore.kernel.org/r/CAHk-=wgfRnXz0W3D37d01q3JFkr_i_uTL=V6A6G1oUZcprmknw\@mail.gmail.com/\n" . $herecurr); + "Prefer strscpy over strlcpy - see: https://github.com/KSPP/linux/issues/89\n" . $herecurr); + } + +# strncpy uses that should likely be strscpy or strscpy_pad + if ($line =~ /\bstrncpy\s*\(/) { + WARN("STRNCPY", + "Prefer strscpy, strscpy_pad, or __nonstring over strncpy - see: https://github.com/KSPP/linux/issues/90\n" . $herecurr); } # typecasts on min/max could be min_t/max_t @@ -7020,6 +7115,21 @@ sub process { "arguments for function declarations should follow identifier\n" . $herecurr); } + } elsif ($realfile =~ /\.c$/ && defined $stat && + $stat =~ /^\+extern struct\s+(\w+)\s+(\w+)\[\];/) + { + my ($st_type, $st_name) = ($1, $2); + + for my $s (keys %maybe_linker_symbol) { + #print "Linker symbol? 
$st_name : $s\n";
+			goto LIKELY_LINKER_SYMBOL
+				if $st_name =~ /$s/;
+		}
+		WARN("AVOID_EXTERNS",
+		     "found a file-scoped extern type:$st_type name:$st_name in .c file\n"
+		     . "is this a linker symbol ?\n" . $herecurr);
+	  LIKELY_LINKER_SYMBOL:
+
 	} elsif ($realfile =~ /\.c$/ && defined $stat &&
 	    $stat =~ /^.\s*extern\s+/)
 	{
@@ -7128,7 +7238,7 @@ sub process {
 	}
 
 # check for alloc argument mismatch
-		if ($line =~ /\b((?:devm_)?(?:kcalloc|kmalloc_array))\s*\(\s*sizeof\b/) {
+		if ($line =~ /\b((?:devm_)?((?:k|kv)?(calloc|malloc_array)(?:_node)?))\s*\(\s*sizeof\b/) {
 			WARN("ALLOC_ARRAY_ARGS",
 				"$1 uses number as first arg, sizeof is generally wrong\n" . $herecurr);
 		}
@@ -7331,6 +7441,16 @@ sub process {
 		}
 	}
 
+# check for array definition/declarations that should use flexible arrays instead
+	if ($sline =~ /^[\+ ]\s*\}(?:\s*__packed)?\s*;\s*$/ &&
+	    $prevline =~ /^\+\s*(?:\}(?:\s*__packed\s*)?|$Type)\s*$Ident\s*\[\s*(0|1)\s*\]\s*;\s*$/) {
+		if (ERROR("FLEXIBLE_ARRAY",
+			  "Use C99 flexible arrays - see https://docs.kernel.org/process/deprecated.html#zero-length-and-one-element-arrays\n" . $hereprev) &&
+		    $1 == '0' && $fix) {
+			$fixed[$fixlinenr - 1] =~ s/\[\s*0\s*\]/[]/;
+		}
+	}
+
 # nested likely/unlikely calls
 	if ($line =~ /\b(?:(?:un)?likely)\s*\(\s*!?\s*(IS_ERR(?:_OR_NULL|_VALUE)?|WARN)/) {
 		WARN("LIKELY_MISUSE",
@@ -7348,6 +7468,30 @@ sub process {
 		}
 	}
 
+# Complain about RCU Tasks Trace used outside of BPF (and of course, RCU).
+	our $rcu_trace_funcs = qr{(?x:
+		rcu_read_lock_trace |
+		rcu_read_lock_trace_held |
+		rcu_read_unlock_trace |
+		call_rcu_tasks_trace |
+		synchronize_rcu_tasks_trace |
+		rcu_barrier_tasks_trace |
+		rcu_request_urgent_qs_task
+	)};
+	our $rcu_trace_paths = qr{(?x:
+		kernel/bpf/ |
+		include/linux/bpf |
+		net/bpf/ |
+		kernel/rcu/ |
+		include/linux/rcu
+	)};
+	if ($line =~ /\b($rcu_trace_funcs)\s*\(/) {
+		if ($realfile !~ m{^$rcu_trace_paths}) {
+			WARN("RCU_TASKS_TRACE",
+			     "use of RCU tasks trace is incorrect outside BPF or core RCU code\n" . $herecurr);
+		}
+	}
+
 # check for lockdep_set_novalidate_class
 	if ($line =~ /^.\s*lockdep_set_novalidate_class\s*\(/ ||
 	    $line =~ /__lockdep_no_validate__\s*\)/ ) {

From 6c7495f04a3c2fc58077131ca46e399c0aa83564 Mon Sep 17 00:00:00 2001
From: SeongJae Park
Date: Wed, 26 Oct 2022 22:59:32 +0000
Subject: [PATCH 018/139] UPSTREAM: mm/damon/core: split out DAMOS-charged
 region skip logic into a new function

Patch series "mm/damon: cleanup and refactoring code", v2.

This patchset cleans up and refactors a range of DAMON code including
the core, DAMON sysfs interface, and DAMON modules, for better
readability and convenient future feature implementations.

In detail, this patchset splits unnecessarily long and complex functions
in core into smaller functions (patches 1-4).  Then, it cleans up the
DAMON sysfs interface by using more type-safe code (patch 5) and
removing unnecessary function parameters (patch 6).  Further, it
refactors the code by distributing it into multiple files (patches
7-10).  The last two patches (11 and 12) deduplicate and remove
unnecessary header inclusions in the DAMON modules (reclaim and
lru_sort).

This patch (of 12):

The DAMOS action applying function, 'damon_do_apply_schemes()', is quite
long and not so simple.  Split out the already quota-charged region skip
code, which is not a small amount of simple code, into a new function
with some comments for better readability.
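Concretely, the per-scheme loop in damon_do_apply_schemes() ends up with
roughly the following shape (a condensed sketch only; the exact code is
in the diff below):

	damon_for_each_scheme(s, c) {
		struct damos_quota *quota = &s->quota;

		if (!s->wmarks.activated)
			continue;
		/* skip if the quota of this charge window is used up */
		if (quota->esz && quota->charged_sz >= quota->esz)
			continue;
		/* new helper: all charge-window skip/split logic */
		if (damos_skip_charged_region(t, &r, s))
			continue;
		if (!damos_valid_target(c, t, r, s))
			continue;
		/* apply the scheme action to r */
	}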
Link: https://lkml.kernel.org/r/20221026225943.100429-1-sj@kernel.org
Link: https://lkml.kernel.org/r/20221026225943.100429-2-sj@kernel.org
Signed-off-by: SeongJae Park
Signed-off-by: Andrew Morton
(cherry picked from commit 2ea3498980f5e6f3001f2984b0b92736bf1b78cb)
Bug: 300502883
Change-Id: I416f3c24d8a6e41df5512a8cfee57a9de26ae185
Signed-off-by: cui yangpei
---
 mm/damon/core.c | 96 +++++++++++++++++++++++++++++++++----------------
 1 file changed, 65 insertions(+), 31 deletions(-)

diff --git a/mm/damon/core.c b/mm/damon/core.c
index 36d098d06c55..06b50ede9cc6 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -694,6 +694,67 @@ static bool damos_valid_target(struct damon_ctx *c, struct damon_target *t,
 	return c->ops.get_scheme_score(c, t, r, s) >= s->quota.min_score;
 }
 
+/*
+ * damos_skip_charged_region() - Check if the given region or starting part of
+ * it is already charged for the DAMOS quota.
+ * @t:	The target of the region.
+ * @rp:	The pointer to the region.
+ * @s:	The scheme to be applied.
+ *
+ * If a quota of a scheme has been exceeded in a quota charge window, the
+ * scheme's action would be applied to only a part of the target access
+ * pattern fulfilling regions.  To avoid applying the scheme action to only
+ * already applied regions, DAMON skips applying the scheme action to the
+ * regions that were charged in the previous charge window.
+ *
+ * This function checks if a given region should be skipped or not for that
+ * reason.  If only the starting part of the region has previously been
+ * charged, this function splits the region into two so that the second one
+ * covers the area that was not charged in the previous charge window, saves
+ * the second region in *rp, and returns false, so that the caller can apply
+ * the DAMON action to the second one.
+ *
+ * Return: true if the region should be entirely skipped, false otherwise.
+ */ +static bool damos_skip_charged_region(struct damon_target *t, + struct damon_region **rp, struct damos *s) +{ + struct damon_region *r = *rp; + struct damos_quota *quota = &s->quota; + unsigned long sz_to_skip; + + /* Skip previously charged regions */ + if (quota->charge_target_from) { + if (t != quota->charge_target_from) + return true; + if (r == damon_last_region(t)) { + quota->charge_target_from = NULL; + quota->charge_addr_from = 0; + return true; + } + if (quota->charge_addr_from && + r->ar.end <= quota->charge_addr_from) + return true; + + if (quota->charge_addr_from && r->ar.start < + quota->charge_addr_from) { + sz_to_skip = ALIGN_DOWN(quota->charge_addr_from - + r->ar.start, DAMON_MIN_REGION); + if (!sz_to_skip) { + if (damon_sz_region(r) <= DAMON_MIN_REGION) + return true; + sz_to_skip = DAMON_MIN_REGION; + } + damon_split_region_at(t, r, sz_to_skip); + r = damon_next_region(r); + *rp = r; + } + quota->charge_target_from = NULL; + quota->charge_addr_from = 0; + } + return false; +} + static void damon_do_apply_schemes(struct damon_ctx *c, struct damon_target *t, struct damon_region *r) @@ -702,7 +763,7 @@ static void damon_do_apply_schemes(struct damon_ctx *c, damon_for_each_scheme(s, c) { struct damos_quota *quota = &s->quota; - unsigned long sz = damon_sz_region(r); + unsigned long sz; struct timespec64 begin, end; unsigned long sz_applied = 0; @@ -713,41 +774,14 @@ static void damon_do_apply_schemes(struct damon_ctx *c, if (quota->esz && quota->charged_sz >= quota->esz) continue; - /* Skip previously charged regions */ - if (quota->charge_target_from) { - if (t != quota->charge_target_from) - continue; - if (r == damon_last_region(t)) { - quota->charge_target_from = NULL; - quota->charge_addr_from = 0; - continue; - } - if (quota->charge_addr_from && - r->ar.end <= quota->charge_addr_from) - continue; - - if (quota->charge_addr_from && r->ar.start < - quota->charge_addr_from) { - sz = ALIGN_DOWN(quota->charge_addr_from - - r->ar.start, DAMON_MIN_REGION); - if (!sz) { - if (damon_sz_region(r) <= - DAMON_MIN_REGION) - continue; - sz = DAMON_MIN_REGION; - } - damon_split_region_at(t, r, sz); - r = damon_next_region(r); - sz = damon_sz_region(r); - } - quota->charge_target_from = NULL; - quota->charge_addr_from = 0; - } + if (damos_skip_charged_region(t, &r, s)) + continue; if (!damos_valid_target(c, t, r, s)) continue; /* Apply the scheme */ + sz = damon_sz_region(r); if (c->ops.apply_scheme) { if (quota->esz && quota->charged_sz + sz > quota->esz) { From 0b0a43029ed4941a2f9faca14375e69062fe2e27 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 26 Oct 2022 22:59:33 +0000 Subject: [PATCH 019/139] UPSTREAM: mm/damon/core: split damos application logic into a new function The DAMOS action applying function, 'damon_do_apply_schemes()', is still long and not easy to read. Split out the code for applying a single action to a single region into a new function for better readability. 
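Condensed, the new helper takes over everything the loop body used to do
inline; its shape is roughly the following (a sketch only; the exact code
is in the diff below):

	static void damos_apply_scheme(struct damon_ctx *c,
			struct damon_target *t, struct damon_region *r,
			struct damos *s)
	{
		unsigned long sz = damon_sz_region(r);

		/* 1. clamp sz to the remaining quota, splitting r if needed */
		/* 2. time c->ops.apply_scheme() and charge the consumed quota */
		/* 3. reset r->age unless the action is DAMOS_STAT */
		/* 4. update the scheme's tried/applied statistics */
	}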
Link: https://lkml.kernel.org/r/20221026225943.100429-3-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton (cherry picked from commit e63a30c51f8400915db401c05d3c4db6743857e8) Bug: 300502883 Change-Id: Iea228c7ed452fffb72f3d9a94078ad00dabf3269 Signed-off-by: cui yangpei --- mm/damon/core.c | 73 ++++++++++++++++++++++++++----------------------- 1 file changed, 39 insertions(+), 34 deletions(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index 06b50ede9cc6..c1a912bc46ae 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -755,6 +755,44 @@ static bool damos_skip_charged_region(struct damon_target *t, return false; } +static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t, + struct damon_region *r, struct damos *s) +{ + struct damos_quota *quota = &s->quota; + unsigned long sz = damon_sz_region(r); + struct timespec64 begin, end; + unsigned long sz_applied = 0; + + if (c->ops.apply_scheme) { + if (quota->esz && quota->charged_sz + sz > quota->esz) { + sz = ALIGN_DOWN(quota->esz - quota->charged_sz, + DAMON_MIN_REGION); + if (!sz) + goto update_stat; + damon_split_region_at(t, r, sz); + } + ktime_get_coarse_ts64(&begin); + sz_applied = c->ops.apply_scheme(c, t, r, s); + ktime_get_coarse_ts64(&end); + quota->total_charged_ns += timespec64_to_ns(&end) - + timespec64_to_ns(&begin); + quota->charged_sz += sz; + if (quota->esz && quota->charged_sz >= quota->esz) { + quota->charge_target_from = t; + quota->charge_addr_from = r->ar.end + 1; + } + } + if (s->action != DAMOS_STAT) + r->age = 0; + +update_stat: + s->stat.nr_tried++; + s->stat.sz_tried += sz; + if (sz_applied) + s->stat.nr_applied++; + s->stat.sz_applied += sz_applied; +} + static void damon_do_apply_schemes(struct damon_ctx *c, struct damon_target *t, struct damon_region *r) @@ -763,9 +801,6 @@ static void damon_do_apply_schemes(struct damon_ctx *c, damon_for_each_scheme(s, c) { struct damos_quota *quota = &s->quota; - unsigned long sz; - struct timespec64 begin, end; - unsigned long sz_applied = 0; if (!s->wmarks.activated) continue; @@ -780,37 +815,7 @@ static void damon_do_apply_schemes(struct damon_ctx *c, if (!damos_valid_target(c, t, r, s)) continue; - /* Apply the scheme */ - sz = damon_sz_region(r); - if (c->ops.apply_scheme) { - if (quota->esz && - quota->charged_sz + sz > quota->esz) { - sz = ALIGN_DOWN(quota->esz - quota->charged_sz, - DAMON_MIN_REGION); - if (!sz) - goto update_stat; - damon_split_region_at(t, r, sz); - } - ktime_get_coarse_ts64(&begin); - sz_applied = c->ops.apply_scheme(c, t, r, s); - ktime_get_coarse_ts64(&end); - quota->total_charged_ns += timespec64_to_ns(&end) - - timespec64_to_ns(&begin); - quota->charged_sz += sz; - if (quota->esz && quota->charged_sz >= quota->esz) { - quota->charge_target_from = t; - quota->charge_addr_from = r->ar.end + 1; - } - } - if (s->action != DAMOS_STAT) - r->age = 0; - -update_stat: - s->stat.nr_tried++; - s->stat.sz_tried += sz; - if (sz_applied) - s->stat.nr_applied++; - s->stat.sz_applied += sz_applied; + damos_apply_scheme(c, t, r, s); } } From 43475d970841cd755d36285a5b9a249c83bf1df6 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 26 Oct 2022 22:59:34 +0000 Subject: [PATCH 020/139] UPSTREAM: mm/damon/core: split out scheme stat update logic into a new function The function for applying a given DAMON scheme action to a given DAMON region, 'damos_apply_scheme()' is not quite short. Make it better to read by splitting out the stat update logic into a new function. 
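With the helper, the stat update at the tail of damos_apply_scheme()
reduces to a single call:

	update_stat:
		damos_update_stat(s, sz, sz_applied);

The helper unconditionally accounts the tried region (nr_tried and
sz_tried), and bumps nr_applied/sz_applied only when the operation
actually applied something.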
Link: https://lkml.kernel.org/r/20221026225943.100429-4-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton (cherry picked from commit d1cbbf621fc25950938be74a228ef518d05d93a1) Bug: 300502883 Change-Id: I1502a102cdd6959494c249b3633ff97af6ccb94c Signed-off-by: cui yangpei --- mm/damon/core.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index c1a912bc46ae..3a810c6e26bc 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -755,6 +755,16 @@ static bool damos_skip_charged_region(struct damon_target *t, return false; } +static void damos_update_stat(struct damos *s, + unsigned long sz_tried, unsigned long sz_applied) +{ + s->stat.nr_tried++; + s->stat.sz_tried += sz_tried; + if (sz_applied) + s->stat.nr_applied++; + s->stat.sz_applied += sz_applied; +} + static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t, struct damon_region *r, struct damos *s) { @@ -786,11 +796,7 @@ static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t, r->age = 0; update_stat: - s->stat.nr_tried++; - s->stat.sz_tried += sz; - if (sz_applied) - s->stat.nr_applied++; - s->stat.sz_applied += sz_applied; + damos_update_stat(s, sz, sz_applied); } static void damon_do_apply_schemes(struct damon_ctx *c, From b6e6b1dbf82480e44d2fdf0649edc853befdd45b Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 26 Oct 2022 22:59:35 +0000 Subject: [PATCH 021/139] UPSTREAM: mm/damon/core: split out scheme quota adjustment logic into a new function DAMOS quota adjustment logic in 'kdamond_apply_schemes()', has some amount of code, and the logic is not so straightforward. Split it out to a new function for better readability. Link: https://lkml.kernel.org/r/20221026225943.100429-5-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton (cherry picked from commit 898810e5ca54691f4e173f5ffc92bbce0335bc69) Bug: 300502883 Change-Id: I2d13cf290774d36884b533fe703eb01dfc47094c Signed-off-by: cui yangpei --- mm/damon/core.c | 91 ++++++++++++++++++++++++++----------------------- 1 file changed, 48 insertions(+), 43 deletions(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index 3a810c6e26bc..80d5937fe337 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -848,6 +848,53 @@ static void damos_set_effective_quota(struct damos_quota *quota) quota->esz = esz; } +static void damos_adjust_quota(struct damon_ctx *c, struct damos *s) +{ + struct damos_quota *quota = &s->quota; + struct damon_target *t; + struct damon_region *r; + unsigned long cumulated_sz; + unsigned int score, max_score = 0; + + if (!quota->ms && !quota->sz) + return; + + /* New charge window starts */ + if (time_after_eq(jiffies, quota->charged_from + + msecs_to_jiffies(quota->reset_interval))) { + if (quota->esz && quota->charged_sz >= quota->esz) + s->stat.qt_exceeds++; + quota->total_charged_sz += quota->charged_sz; + quota->charged_from = jiffies; + quota->charged_sz = 0; + damos_set_effective_quota(quota); + } + + if (!c->ops.get_scheme_score) + return; + + /* Fill up the score histogram */ + memset(quota->histogram, 0, sizeof(quota->histogram)); + damon_for_each_target(t, c) { + damon_for_each_region(r, t) { + if (!__damos_valid_target(r, s)) + continue; + score = c->ops.get_scheme_score(c, t, r, s); + quota->histogram[score] += damon_sz_region(r); + if (score > max_score) + max_score = score; + } + } + + /* Set the min score limit */ + for (cumulated_sz = 0, score = max_score; ; score--) { + cumulated_sz += 
quota->histogram[score]; + if (cumulated_sz >= quota->esz || !score) + break; + } + quota->min_score = score; +} + static void kdamond_apply_schemes(struct damon_ctx *c) { struct damon_target *t; @@ -855,52 +902,10 @@ static void kdamond_apply_schemes(struct damon_ctx *c) struct damos *s; damon_for_each_scheme(s, c) { - struct damos_quota *quota = &s->quota; - unsigned long cumulated_sz; - unsigned int score, max_score = 0; - if (!s->wmarks.activated) continue; - if (!quota->ms && !quota->sz) - continue; - - /* New charge window starts */ - if (time_after_eq(jiffies, quota->charged_from + - msecs_to_jiffies( - quota->reset_interval))) { - if (quota->esz && quota->charged_sz >= quota->esz) - s->stat.qt_exceeds++; - quota->total_charged_sz += quota->charged_sz; - quota->charged_from = jiffies; - quota->charged_sz = 0; - damos_set_effective_quota(quota); - } - - if (!c->ops.get_scheme_score) - continue; - - /* Fill up the score histogram */ - memset(quota->histogram, 0, sizeof(quota->histogram)); - damon_for_each_target(t, c) { - damon_for_each_region(r, t) { - if (!__damos_valid_target(r, s)) - continue; - score = c->ops.get_scheme_score( - c, t, r, s); - quota->histogram[score] += damon_sz_region(r); - if (score > max_score) - max_score = score; - } - } - - /* Set the min score limit */ - for (cumulated_sz = 0, score = max_score; ; score--) { - cumulated_sz += quota->histogram[score]; - if (cumulated_sz >= quota->esz || !score) - break; - } - quota->min_score = score; + damos_adjust_quota(c, s); } damon_for_each_target(t, c) { From 19364f11a4db80547dfd7bad92cea4ebd133505c Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 26 Oct 2022 22:59:36 +0000 Subject: [PATCH 022/139] UPSTREAM: mm/damon/sysfs: use damon_addr_range for region's start and end values DAMON has a struct for each address range but DAMON sysfs interface is using the low type (unsigned long) for storing the start and end addresses of regions. Use the dedicated struct for better type safety. 
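For reference, the dedicated struct is the one DAMON core already uses
(from include/linux/damon.h):

	struct damon_addr_range {
		unsigned long start;
		unsigned long end;
	};

so the change below is mostly mechanical: accesses like region->start and
region->end become region->ar.start and region->ar.end.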
Link: https://lkml.kernel.org/r/20221026225943.100429-6-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton (cherry picked from commit 789a230613c8dd14bdd41653de0c22783726276f) Bug: 300502883 Change-Id: I3bcd00e51144728c7daace32bd1f5283a1ff9e3f Signed-off-by: cui yangpei --- mm/damon/sysfs.c | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 07e5f1bdf025..a5ef503d8444 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1062,13 +1062,11 @@ static struct kobj_type damon_sysfs_schemes_ktype = { struct damon_sysfs_region { struct kobject kobj; - unsigned long start; - unsigned long end; + struct damon_addr_range ar; }; static struct damon_sysfs_region *damon_sysfs_region_alloc( - unsigned long start, - unsigned long end) + struct damon_addr_range ar) { struct damon_sysfs_region *region = kmalloc(sizeof(*region), GFP_KERNEL); @@ -1076,8 +1074,7 @@ static struct damon_sysfs_region *damon_sysfs_region_alloc( if (!region) return NULL; region->kobj = (struct kobject){}; - region->start = start; - region->end = end; + region->ar = ar; return region; } @@ -1087,7 +1084,7 @@ static ssize_t start_show(struct kobject *kobj, struct kobj_attribute *attr, struct damon_sysfs_region *region = container_of(kobj, struct damon_sysfs_region, kobj); - return sysfs_emit(buf, "%lu\n", region->start); + return sysfs_emit(buf, "%lu\n", region->ar.start); } static ssize_t start_store(struct kobject *kobj, struct kobj_attribute *attr, @@ -1095,7 +1092,7 @@ static ssize_t start_store(struct kobject *kobj, struct kobj_attribute *attr, { struct damon_sysfs_region *region = container_of(kobj, struct damon_sysfs_region, kobj); - int err = kstrtoul(buf, 0, ®ion->start); + int err = kstrtoul(buf, 0, ®ion->ar.start); return err ? err : count; } @@ -1106,7 +1103,7 @@ static ssize_t end_show(struct kobject *kobj, struct kobj_attribute *attr, struct damon_sysfs_region *region = container_of(kobj, struct damon_sysfs_region, kobj); - return sysfs_emit(buf, "%lu\n", region->end); + return sysfs_emit(buf, "%lu\n", region->ar.end); } static ssize_t end_store(struct kobject *kobj, struct kobj_attribute *attr, @@ -1114,7 +1111,7 @@ static ssize_t end_store(struct kobject *kobj, struct kobj_attribute *attr, { struct damon_sysfs_region *region = container_of(kobj, struct damon_sysfs_region, kobj); - int err = kstrtoul(buf, 0, ®ion->end); + int err = kstrtoul(buf, 0, ®ion->ar.end); return err ? 
err : count;
}

@@ -1187,7 +1184,7 @@ static int damon_sysfs_regions_add_dirs(struct damon_sysfs_regions *regions,
 	regions->regions_arr = regions_arr;
 
 	for (i = 0; i < nr_regions; i++) {
-		region = damon_sysfs_region_alloc(0, 0);
+		region = damon_sysfs_region_alloc((struct damon_addr_range){});
 		if (!region) {
 			damon_sysfs_regions_rm_dirs(regions);
 			return -ENOMEM;
@@ -2147,11 +2144,11 @@ static int damon_sysfs_set_regions(struct damon_target *t,
 		struct damon_sysfs_region *sys_region =
 			sysfs_regions->regions_arr[i];
 
-		if (sys_region->start > sys_region->end)
+		if (sys_region->ar.start > sys_region->ar.end)
 			goto out;
 
-		ranges[i].start = sys_region->start;
-		ranges[i].end = sys_region->end;
+		ranges[i].start = sys_region->ar.start;
+		ranges[i].end = sys_region->ar.end;
 		if (i == 0)
 			continue;
 		if (ranges[i - 1].end > ranges[i].start)

From b7fc8d59a55859c15122aa8f63543f865d3819b7 Mon Sep 17 00:00:00 2001
From: SeongJae Park
Date: Wed, 26 Oct 2022 22:59:37 +0000
Subject: [PATCH 023/139] UPSTREAM: mm/damon/sysfs: remove parameters of
 damon_sysfs_region_alloc()

'damon_sysfs_region_alloc()' is always called with a zero-filled
'struct damon_addr_range', because the start and end addresses should
be set by users.  Remove the unnecessary parameters of the function and
simplify the body by using 'kzalloc()'.

Link: https://lkml.kernel.org/r/20221026225943.100429-7-sj@kernel.org
Signed-off-by: SeongJae Park
Signed-off-by: Andrew Morton
(cherry picked from commit 1f71981408ef5696ad8544f282d336d4fc60a807)
Bug: 300502883
Change-Id: I6fa54a7474851598fda50a07f5b6b2465bf4cd1f
Signed-off-by: cui yangpei
---
 mm/damon/sysfs.c | 14 +++-----------
 1 file changed, 3 insertions(+), 11 deletions(-)

diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c
index a5ef503d8444..f3d7b34ea0ab 100644
--- a/mm/damon/sysfs.c
+++ b/mm/damon/sysfs.c
@@ -1065,17 +1065,9 @@ struct damon_sysfs_region {
 	struct damon_addr_range ar;
 };
 
-static struct damon_sysfs_region *damon_sysfs_region_alloc(
-		struct damon_addr_range ar)
+static struct damon_sysfs_region *damon_sysfs_region_alloc(void)
 {
-	struct damon_sysfs_region *region = kmalloc(sizeof(*region),
-			GFP_KERNEL);
-
-	if (!region)
-		return NULL;
-	region->kobj = (struct kobject){};
-	region->ar = ar;
-	return region;
+	return kzalloc(sizeof(struct damon_sysfs_region), GFP_KERNEL);
 }
 
 static ssize_t start_show(struct kobject *kobj, struct kobj_attribute *attr,
@@ -1184,7 +1176,7 @@ static int damon_sysfs_regions_add_dirs(struct damon_sysfs_regions *regions,
 	regions->regions_arr = regions_arr;
 
 	for (i = 0; i < nr_regions; i++) {
-		region = damon_sysfs_region_alloc((struct damon_addr_range){});
+		region = damon_sysfs_region_alloc();
 		if (!region) {
 			damon_sysfs_regions_rm_dirs(regions);
 			return -ENOMEM;

From c5038d80cebc558ec97786181cd90ab5ce3fd823 Mon Sep 17 00:00:00 2001
From: SeongJae Park
Date: Wed, 26 Oct 2022 22:59:38 +0000
Subject: [PATCH 024/139] UPSTREAM: mm/damon/sysfs: move sysfs_lock to common
 module

The DAMON sysfs interface is implemented in a single file, sysfs.c,
which has about 2,800 lines of code.  As the interface is hierarchical
and some of the code can be reused by different hierarchies, it would
make more sense to split the implementation into common parts and
hierarchy-specific parts across multiple files.

As the beginning of that work, create files for the common code and
move the global mutex that protects directory modifications into the
new file.
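The locking convention at the call sites does not change, only the
lock's home does.  A typical store handler of this interface takes the
now-shared lock in the following pattern (a sketch based on handlers
appearing later in this series, e.g. nr_schemes_store()):

	if (!mutex_trylock(&damon_sysfs_lock))
		return -EBUSY;	/* bail out instead of sleeping in sysfs */
	/* ... add or remove kobject directories ... */
	mutex_unlock(&damon_sysfs_lock);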
Link: https://lkml.kernel.org/r/20221026225943.100429-8-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton (cherry picked from commit 39240595917ec0c4f71d7b9dd7909790715968b5) Bug: 300502883 Change-Id: I8e0eb32f2b90a907d917f684c35f2a321d944c0e Signed-off-by: cui yangpei --- mm/damon/Makefile | 2 +- mm/damon/sysfs-common.c | 11 +++++++++++ mm/damon/sysfs-common.h | 11 +++++++++++ mm/damon/sysfs.c | 4 +--- 4 files changed, 24 insertions(+), 4 deletions(-) create mode 100644 mm/damon/sysfs-common.c create mode 100644 mm/damon/sysfs-common.h diff --git a/mm/damon/Makefile b/mm/damon/Makefile index 3e6b8ad73858..f8d535a6253b 100644 --- a/mm/damon/Makefile +++ b/mm/damon/Makefile @@ -3,7 +3,7 @@ obj-y := core.o obj-$(CONFIG_DAMON_VADDR) += ops-common.o vaddr.o obj-$(CONFIG_DAMON_PADDR) += ops-common.o paddr.o -obj-$(CONFIG_DAMON_SYSFS) += sysfs.o +obj-$(CONFIG_DAMON_SYSFS) += sysfs-common.o sysfs.o obj-$(CONFIG_DAMON_DBGFS) += dbgfs.o obj-$(CONFIG_DAMON_RECLAIM) += reclaim.o obj-$(CONFIG_DAMON_LRU_SORT) += lru_sort.o diff --git a/mm/damon/sysfs-common.c b/mm/damon/sysfs-common.c new file mode 100644 index 000000000000..9dc743868d5b --- /dev/null +++ b/mm/damon/sysfs-common.c @@ -0,0 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Common Primitives for DAMON Sysfs Interface + * + * Author: SeongJae Park + */ + +#include "sysfs-common.h" + +DEFINE_MUTEX(damon_sysfs_lock); + diff --git a/mm/damon/sysfs-common.h b/mm/damon/sysfs-common.h new file mode 100644 index 000000000000..745a918b94f5 --- /dev/null +++ b/mm/damon/sysfs-common.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Common Primitives for DAMON Sysfs Interface + * + * Author: SeongJae Park + */ + +#include +#include + +extern struct mutex damon_sysfs_lock; diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index f3d7b34ea0ab..a847b9159718 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -5,13 +5,11 @@ * Copyright (c) 2022 SeongJae Park */ -#include -#include #include #include #include -static DEFINE_MUTEX(damon_sysfs_lock); +#include "sysfs-common.h" /* * unsigned long range directory From a45dff567c923884760f51e6b111697c6b92a52c Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 26 Oct 2022 22:59:39 +0000 Subject: [PATCH 025/139] UPSTREAM: mm/damon/sysfs: move unsigned long range directory to common module The implementation of unsigned long type range directories can be reused by multiple DAMON sysfs directories including those for DAMON-based Operation Schemes and the range of number of monitoring regions. Move the code into the files for DAMON sysfs common logics. 
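A client of the now-common helpers creates one min/max directory in the
pattern below; parent_kobj and the "sz" name are placeholders here, but
this is the same pattern damon_sysfs_access_pattern_add_range_dir() in a
later patch of this series follows:

	struct damon_sysfs_ul_range *range = damon_sysfs_ul_range_alloc(0, 0);
	int err;

	if (!range)
		return -ENOMEM;
	/* creates the sz/ directory with its min and max files */
	err = kobject_init_and_add(&range->kobj, &damon_sysfs_ul_range_ktype,
			parent_kobj, "sz");
	if (err)
		kobject_put(&range->kobj);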
Link: https://lkml.kernel.org/r/20221026225943.100429-9-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton (cherry picked from commit d332fe11debe69fee3de4c2d84fa0b6649678ad2) Bug: 300502883 Change-Id: I35f23cb5070fb3fb62a52fd2652e7f054d019201 Signed-off-by: cui yangpei --- mm/damon/sysfs-common.c | 96 ++++++++++++++++++++++++++++++++++++++ mm/damon/sysfs-common.h | 13 ++++++ mm/damon/sysfs.c | 100 ---------------------------------------- 3 files changed, 109 insertions(+), 100 deletions(-) diff --git a/mm/damon/sysfs-common.c b/mm/damon/sysfs-common.c index 9dc743868d5b..52bebf242f74 100644 --- a/mm/damon/sysfs-common.c +++ b/mm/damon/sysfs-common.c @@ -5,7 +5,103 @@ * Author: SeongJae Park */ +#include + #include "sysfs-common.h" DEFINE_MUTEX(damon_sysfs_lock); +/* + * unsigned long range directory + */ + +struct damon_sysfs_ul_range *damon_sysfs_ul_range_alloc( + unsigned long min, + unsigned long max) +{ + struct damon_sysfs_ul_range *range = kmalloc(sizeof(*range), + GFP_KERNEL); + + if (!range) + return NULL; + range->kobj = (struct kobject){}; + range->min = min; + range->max = max; + + return range; +} + +static ssize_t min_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_ul_range *range = container_of(kobj, + struct damon_sysfs_ul_range, kobj); + + return sysfs_emit(buf, "%lu\n", range->min); +} + +static ssize_t min_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct damon_sysfs_ul_range *range = container_of(kobj, + struct damon_sysfs_ul_range, kobj); + unsigned long min; + int err; + + err = kstrtoul(buf, 0, &min); + if (err) + return err; + + range->min = min; + return count; +} + +static ssize_t max_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_ul_range *range = container_of(kobj, + struct damon_sysfs_ul_range, kobj); + + return sysfs_emit(buf, "%lu\n", range->max); +} + +static ssize_t max_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct damon_sysfs_ul_range *range = container_of(kobj, + struct damon_sysfs_ul_range, kobj); + unsigned long max; + int err; + + err = kstrtoul(buf, 0, &max); + if (err) + return err; + + range->max = max; + return count; +} + +void damon_sysfs_ul_range_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_ul_range, kobj)); +} + +static struct kobj_attribute damon_sysfs_ul_range_min_attr = + __ATTR_RW_MODE(min, 0600); + +static struct kobj_attribute damon_sysfs_ul_range_max_attr = + __ATTR_RW_MODE(max, 0600); + +static struct attribute *damon_sysfs_ul_range_attrs[] = { + &damon_sysfs_ul_range_min_attr.attr, + &damon_sysfs_ul_range_max_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_ul_range); + +struct kobj_type damon_sysfs_ul_range_ktype = { + .release = damon_sysfs_ul_range_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_ul_range_groups, +}; + diff --git a/mm/damon/sysfs-common.h b/mm/damon/sysfs-common.h index 745a918b94f5..56e6a99e353b 100644 --- a/mm/damon/sysfs-common.h +++ b/mm/damon/sysfs-common.h @@ -9,3 +9,16 @@ #include extern struct mutex damon_sysfs_lock; + +struct damon_sysfs_ul_range { + struct kobject kobj; + unsigned long min; + unsigned long max; +}; + +struct damon_sysfs_ul_range *damon_sysfs_ul_range_alloc( + unsigned long min, + unsigned long max); +void damon_sysfs_ul_range_release(struct kobject *kobj); + +extern struct kobj_type damon_sysfs_ul_range_ktype; 
diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index a847b9159718..6774a669962e 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -11,106 +11,6 @@ #include "sysfs-common.h" -/* - * unsigned long range directory - */ - -struct damon_sysfs_ul_range { - struct kobject kobj; - unsigned long min; - unsigned long max; -}; - -static struct damon_sysfs_ul_range *damon_sysfs_ul_range_alloc( - unsigned long min, - unsigned long max) -{ - struct damon_sysfs_ul_range *range = kmalloc(sizeof(*range), - GFP_KERNEL); - - if (!range) - return NULL; - range->kobj = (struct kobject){}; - range->min = min; - range->max = max; - - return range; -} - -static ssize_t min_show(struct kobject *kobj, struct kobj_attribute *attr, - char *buf) -{ - struct damon_sysfs_ul_range *range = container_of(kobj, - struct damon_sysfs_ul_range, kobj); - - return sysfs_emit(buf, "%lu\n", range->min); -} - -static ssize_t min_store(struct kobject *kobj, struct kobj_attribute *attr, - const char *buf, size_t count) -{ - struct damon_sysfs_ul_range *range = container_of(kobj, - struct damon_sysfs_ul_range, kobj); - unsigned long min; - int err; - - err = kstrtoul(buf, 0, &min); - if (err) - return err; - - range->min = min; - return count; -} - -static ssize_t max_show(struct kobject *kobj, struct kobj_attribute *attr, - char *buf) -{ - struct damon_sysfs_ul_range *range = container_of(kobj, - struct damon_sysfs_ul_range, kobj); - - return sysfs_emit(buf, "%lu\n", range->max); -} - -static ssize_t max_store(struct kobject *kobj, struct kobj_attribute *attr, - const char *buf, size_t count) -{ - struct damon_sysfs_ul_range *range = container_of(kobj, - struct damon_sysfs_ul_range, kobj); - unsigned long max; - int err; - - err = kstrtoul(buf, 0, &max); - if (err) - return err; - - range->max = max; - return count; -} - -static void damon_sysfs_ul_range_release(struct kobject *kobj) -{ - kfree(container_of(kobj, struct damon_sysfs_ul_range, kobj)); -} - -static struct kobj_attribute damon_sysfs_ul_range_min_attr = - __ATTR_RW_MODE(min, 0600); - -static struct kobj_attribute damon_sysfs_ul_range_max_attr = - __ATTR_RW_MODE(max, 0600); - -static struct attribute *damon_sysfs_ul_range_attrs[] = { - &damon_sysfs_ul_range_min_attr.attr, - &damon_sysfs_ul_range_max_attr.attr, - NULL, -}; -ATTRIBUTE_GROUPS(damon_sysfs_ul_range); - -static struct kobj_type damon_sysfs_ul_range_ktype = { - .release = damon_sysfs_ul_range_release, - .sysfs_ops = &kobj_sysfs_ops, - .default_groups = damon_sysfs_ul_range_groups, -}; - /* * schemes/stats directory */ From 0b17df8a4fee5ed733973b5afc72240362a385ca Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 26 Oct 2022 22:59:40 +0000 Subject: [PATCH 026/139] UPSTREAM: mm/damon/sysfs: split out kdamond-independent schemes stats update logic into a new function 'damon_sysfs_schemes_update_stats()' is coupled with both damon_sysfs_kdamond and damon_sysfs_schemes. It's a wide range of types dependency. It makes splitting the logics a little bit distracting. Split the function so that each function is coupled with smaller range of types. 
Link: https://lkml.kernel.org/r/20221026225943.100429-10-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton (cherry picked from commit 4acd715ff57fd05a481c64d074db68f2cf5711aa) Bug: 300502883 Change-Id: I3b0ab08f44a5cd0e6ab4dc65f8db8024a26a461e Signed-off-by: cui yangpei --- mm/damon/sysfs.c | 37 ++++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 6774a669962e..836df19a7d86 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -2246,25 +2246,13 @@ static void damon_sysfs_before_terminate(struct damon_ctx *ctx) mutex_unlock(&ctx->kdamond_lock); } -/* - * damon_sysfs_upd_schemes_stats() - Update schemes stats sysfs files. - * @kdamond: The kobject wrapper that associated to the kdamond thread. - * - * This function reads the schemes stats of specific kdamond and update the - * related values for sysfs files. This function should be called from DAMON - * callbacks while holding ``damon_syfs_lock``, to safely access the DAMON - * contexts-internal data and DAMON sysfs variables. - */ -static int damon_sysfs_upd_schemes_stats(struct damon_sysfs_kdamond *kdamond) +static void damon_sysfs_schemes_update_stats( + struct damon_sysfs_schemes *sysfs_schemes, + struct damon_ctx *ctx) { - struct damon_ctx *ctx = kdamond->damon_ctx; - struct damon_sysfs_schemes *sysfs_schemes; struct damos *scheme; int schemes_idx = 0; - if (!ctx) - return -EINVAL; - sysfs_schemes = kdamond->contexts->contexts_arr[0]->schemes; damon_for_each_scheme(scheme, ctx) { struct damon_sysfs_stats *sysfs_stats; @@ -2279,6 +2267,25 @@ static int damon_sysfs_upd_schemes_stats(struct damon_sysfs_kdamond *kdamond) sysfs_stats->sz_applied = scheme->stat.sz_applied; sysfs_stats->qt_exceeds = scheme->stat.qt_exceeds; } +} + +/* + * damon_sysfs_upd_schemes_stats() - Update schemes stats sysfs files. + * @kdamond: The kobject wrapper that associated to the kdamond thread. + * + * This function reads the schemes stats of specific kdamond and update the + * related values for sysfs files. This function should be called from DAMON + * callbacks while holding ``damon_syfs_lock``, to safely access the DAMON + * contexts-internal data and DAMON sysfs variables. + */ +static int damon_sysfs_upd_schemes_stats(struct damon_sysfs_kdamond *kdamond) +{ + struct damon_ctx *ctx = kdamond->damon_ctx; + + if (!ctx) + return -EINVAL; + damon_sysfs_schemes_update_stats( + kdamond->contexts->contexts_arr[0]->schemes, ctx); return 0; } From 67ef7b0f42a143e2cc471777397c2f6571e5dddd Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 26 Oct 2022 22:59:41 +0000 Subject: [PATCH 027/139] UPSTREAM: mm/damon/sysfs: split out schemes directory implementation to separate file DAMON sysfs interface for 'schemes' directory is implemented using about one thousand lines of code. It has no strong dependency with other parts of its file, so split it out to another file for better code management. 
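After this patch, the DAMON sysfs implementation is spread over the
following files (per the Makefile change below):

	mm/damon/sysfs-common.c		- shared lock and the unsigned long
					  range directory
	mm/damon/sysfs-common.h		- declarations shared among the files
	mm/damon/sysfs-schemes.c	- the schemes/ directory (new in this
					  patch)
	mm/damon/sysfs.c		- the rest of the DAMON sysfs interface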
Link: https://lkml.kernel.org/r/20221026225943.100429-11-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton (cherry picked from commit c8e7b4d0ba348a8ef14956a80c780f152f433764) Bug: 300502883 Change-Id: I3a8f63dc4f6de6f4662b6f1a7d12afe1e25b36d2 Signed-off-by: cui yangpei --- mm/damon/Makefile | 2 +- mm/damon/sysfs-common.h | 22 + mm/damon/sysfs-schemes.c | 1068 ++++++++++++++++++++++++++++++++++++++ mm/damon/sysfs.c | 1064 ------------------------------------- 4 files changed, 1091 insertions(+), 1065 deletions(-) create mode 100644 mm/damon/sysfs-schemes.c diff --git a/mm/damon/Makefile b/mm/damon/Makefile index f8d535a6253b..1e86f5253d7f 100644 --- a/mm/damon/Makefile +++ b/mm/damon/Makefile @@ -3,7 +3,7 @@ obj-y := core.o obj-$(CONFIG_DAMON_VADDR) += ops-common.o vaddr.o obj-$(CONFIG_DAMON_PADDR) += ops-common.o paddr.o -obj-$(CONFIG_DAMON_SYSFS) += sysfs-common.o sysfs.o +obj-$(CONFIG_DAMON_SYSFS) += sysfs-common.o sysfs-schemes.o sysfs.o obj-$(CONFIG_DAMON_DBGFS) += dbgfs.o obj-$(CONFIG_DAMON_RECLAIM) += reclaim.o obj-$(CONFIG_DAMON_LRU_SORT) += lru_sort.o diff --git a/mm/damon/sysfs-common.h b/mm/damon/sysfs-common.h index 56e6a99e353b..4626b2784404 100644 --- a/mm/damon/sysfs-common.h +++ b/mm/damon/sysfs-common.h @@ -22,3 +22,25 @@ struct damon_sysfs_ul_range *damon_sysfs_ul_range_alloc( void damon_sysfs_ul_range_release(struct kobject *kobj); extern struct kobj_type damon_sysfs_ul_range_ktype; + +/* + * schemes directory + */ + +struct damon_sysfs_schemes { + struct kobject kobj; + struct damon_sysfs_scheme **schemes_arr; + int nr; +}; + +struct damon_sysfs_schemes *damon_sysfs_schemes_alloc(void); +void damon_sysfs_schemes_rm_dirs(struct damon_sysfs_schemes *schemes); + +extern struct kobj_type damon_sysfs_schemes_ktype; + +int damon_sysfs_set_schemes(struct damon_ctx *ctx, + struct damon_sysfs_schemes *sysfs_schemes); + +void damon_sysfs_schemes_update_stats( + struct damon_sysfs_schemes *sysfs_schemes, + struct damon_ctx *ctx); diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c new file mode 100644 index 000000000000..9509d5c1e7fc --- /dev/null +++ b/mm/damon/sysfs-schemes.c @@ -0,0 +1,1068 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * DAMON sysfs Interface + * + * Copyright (c) 2022 SeongJae Park + */ + +#include + +#include "sysfs-common.h" + +/* + * schemes/stats directory + */ + +struct damon_sysfs_stats { + struct kobject kobj; + unsigned long nr_tried; + unsigned long sz_tried; + unsigned long nr_applied; + unsigned long sz_applied; + unsigned long qt_exceeds; +}; + +static struct damon_sysfs_stats *damon_sysfs_stats_alloc(void) +{ + return kzalloc(sizeof(struct damon_sysfs_stats), GFP_KERNEL); +} + +static ssize_t nr_tried_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_stats *stats = container_of(kobj, + struct damon_sysfs_stats, kobj); + + return sysfs_emit(buf, "%lu\n", stats->nr_tried); +} + +static ssize_t sz_tried_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_stats *stats = container_of(kobj, + struct damon_sysfs_stats, kobj); + + return sysfs_emit(buf, "%lu\n", stats->sz_tried); +} + +static ssize_t nr_applied_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_stats *stats = container_of(kobj, + struct damon_sysfs_stats, kobj); + + return sysfs_emit(buf, "%lu\n", stats->nr_applied); +} + +static ssize_t sz_applied_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct 
damon_sysfs_stats *stats = container_of(kobj, + struct damon_sysfs_stats, kobj); + + return sysfs_emit(buf, "%lu\n", stats->sz_applied); +} + +static ssize_t qt_exceeds_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_stats *stats = container_of(kobj, + struct damon_sysfs_stats, kobj); + + return sysfs_emit(buf, "%lu\n", stats->qt_exceeds); +} + +static void damon_sysfs_stats_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_stats, kobj)); +} + +static struct kobj_attribute damon_sysfs_stats_nr_tried_attr = + __ATTR_RO_MODE(nr_tried, 0400); + +static struct kobj_attribute damon_sysfs_stats_sz_tried_attr = + __ATTR_RO_MODE(sz_tried, 0400); + +static struct kobj_attribute damon_sysfs_stats_nr_applied_attr = + __ATTR_RO_MODE(nr_applied, 0400); + +static struct kobj_attribute damon_sysfs_stats_sz_applied_attr = + __ATTR_RO_MODE(sz_applied, 0400); + +static struct kobj_attribute damon_sysfs_stats_qt_exceeds_attr = + __ATTR_RO_MODE(qt_exceeds, 0400); + +static struct attribute *damon_sysfs_stats_attrs[] = { + &damon_sysfs_stats_nr_tried_attr.attr, + &damon_sysfs_stats_sz_tried_attr.attr, + &damon_sysfs_stats_nr_applied_attr.attr, + &damon_sysfs_stats_sz_applied_attr.attr, + &damon_sysfs_stats_qt_exceeds_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_stats); + +static struct kobj_type damon_sysfs_stats_ktype = { + .release = damon_sysfs_stats_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_stats_groups, +}; + +/* + * watermarks directory + */ + +struct damon_sysfs_watermarks { + struct kobject kobj; + enum damos_wmark_metric metric; + unsigned long interval_us; + unsigned long high; + unsigned long mid; + unsigned long low; +}; + +static struct damon_sysfs_watermarks *damon_sysfs_watermarks_alloc( + enum damos_wmark_metric metric, unsigned long interval_us, + unsigned long high, unsigned long mid, unsigned long low) +{ + struct damon_sysfs_watermarks *watermarks = kmalloc( + sizeof(*watermarks), GFP_KERNEL); + + if (!watermarks) + return NULL; + watermarks->kobj = (struct kobject){}; + watermarks->metric = metric; + watermarks->interval_us = interval_us; + watermarks->high = high; + watermarks->mid = mid; + watermarks->low = low; + return watermarks; +} + +/* Should match with enum damos_wmark_metric */ +static const char * const damon_sysfs_wmark_metric_strs[] = { + "none", + "free_mem_rate", +}; + +static ssize_t metric_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + + return sysfs_emit(buf, "%s\n", + damon_sysfs_wmark_metric_strs[watermarks->metric]); +} + +static ssize_t metric_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + enum damos_wmark_metric metric; + + for (metric = 0; metric < NR_DAMOS_WMARK_METRICS; metric++) { + if (sysfs_streq(buf, damon_sysfs_wmark_metric_strs[metric])) { + watermarks->metric = metric; + return count; + } + } + return -EINVAL; +} + +static ssize_t interval_us_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + + return sysfs_emit(buf, "%lu\n", watermarks->interval_us); +} + +static ssize_t interval_us_store(struct kobject *kobj, + struct kobj_attribute *attr, 
const char *buf, size_t count) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + int err = kstrtoul(buf, 0, &watermarks->interval_us); + + return err ? err : count; +} + +static ssize_t high_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + + return sysfs_emit(buf, "%lu\n", watermarks->high); +} + +static ssize_t high_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + int err = kstrtoul(buf, 0, &watermarks->high); + + return err ? err : count; +} + +static ssize_t mid_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + + return sysfs_emit(buf, "%lu\n", watermarks->mid); +} + +static ssize_t mid_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + int err = kstrtoul(buf, 0, &watermarks->mid); + + return err ? err : count; +} + +static ssize_t low_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + + return sysfs_emit(buf, "%lu\n", watermarks->low); +} + +static ssize_t low_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + int err = kstrtoul(buf, 0, &watermarks->low); + + return err ? 
err : count; +} + +static void damon_sysfs_watermarks_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_watermarks, kobj)); +} + +static struct kobj_attribute damon_sysfs_watermarks_metric_attr = + __ATTR_RW_MODE(metric, 0600); + +static struct kobj_attribute damon_sysfs_watermarks_interval_us_attr = + __ATTR_RW_MODE(interval_us, 0600); + +static struct kobj_attribute damon_sysfs_watermarks_high_attr = + __ATTR_RW_MODE(high, 0600); + +static struct kobj_attribute damon_sysfs_watermarks_mid_attr = + __ATTR_RW_MODE(mid, 0600); + +static struct kobj_attribute damon_sysfs_watermarks_low_attr = + __ATTR_RW_MODE(low, 0600); + +static struct attribute *damon_sysfs_watermarks_attrs[] = { + &damon_sysfs_watermarks_metric_attr.attr, + &damon_sysfs_watermarks_interval_us_attr.attr, + &damon_sysfs_watermarks_high_attr.attr, + &damon_sysfs_watermarks_mid_attr.attr, + &damon_sysfs_watermarks_low_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_watermarks); + +static struct kobj_type damon_sysfs_watermarks_ktype = { + .release = damon_sysfs_watermarks_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_watermarks_groups, +}; + +/* + * scheme/weights directory + */ + +struct damon_sysfs_weights { + struct kobject kobj; + unsigned int sz; + unsigned int nr_accesses; + unsigned int age; +}; + +static struct damon_sysfs_weights *damon_sysfs_weights_alloc(unsigned int sz, + unsigned int nr_accesses, unsigned int age) +{ + struct damon_sysfs_weights *weights = kmalloc(sizeof(*weights), + GFP_KERNEL); + + if (!weights) + return NULL; + weights->kobj = (struct kobject){}; + weights->sz = sz; + weights->nr_accesses = nr_accesses; + weights->age = age; + return weights; +} + +static ssize_t sz_permil_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_weights *weights = container_of(kobj, + struct damon_sysfs_weights, kobj); + + return sysfs_emit(buf, "%u\n", weights->sz); +} + +static ssize_t sz_permil_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_weights *weights = container_of(kobj, + struct damon_sysfs_weights, kobj); + int err = kstrtouint(buf, 0, &weights->sz); + + return err ? err : count; +} + +static ssize_t nr_accesses_permil_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_weights *weights = container_of(kobj, + struct damon_sysfs_weights, kobj); + + return sysfs_emit(buf, "%u\n", weights->nr_accesses); +} + +static ssize_t nr_accesses_permil_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_weights *weights = container_of(kobj, + struct damon_sysfs_weights, kobj); + int err = kstrtouint(buf, 0, &weights->nr_accesses); + + return err ? err : count; +} + +static ssize_t age_permil_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_weights *weights = container_of(kobj, + struct damon_sysfs_weights, kobj); + + return sysfs_emit(buf, "%u\n", weights->age); +} + +static ssize_t age_permil_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_weights *weights = container_of(kobj, + struct damon_sysfs_weights, kobj); + int err = kstrtouint(buf, 0, &weights->age); + + return err ? 
err : count; +} + +static void damon_sysfs_weights_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_weights, kobj)); +} + +static struct kobj_attribute damon_sysfs_weights_sz_attr = + __ATTR_RW_MODE(sz_permil, 0600); + +static struct kobj_attribute damon_sysfs_weights_nr_accesses_attr = + __ATTR_RW_MODE(nr_accesses_permil, 0600); + +static struct kobj_attribute damon_sysfs_weights_age_attr = + __ATTR_RW_MODE(age_permil, 0600); + +static struct attribute *damon_sysfs_weights_attrs[] = { + &damon_sysfs_weights_sz_attr.attr, + &damon_sysfs_weights_nr_accesses_attr.attr, + &damon_sysfs_weights_age_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_weights); + +static struct kobj_type damon_sysfs_weights_ktype = { + .release = damon_sysfs_weights_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_weights_groups, +}; + +/* + * quotas directory + */ + +struct damon_sysfs_quotas { + struct kobject kobj; + struct damon_sysfs_weights *weights; + unsigned long ms; + unsigned long sz; + unsigned long reset_interval_ms; +}; + +static struct damon_sysfs_quotas *damon_sysfs_quotas_alloc(void) +{ + return kzalloc(sizeof(struct damon_sysfs_quotas), GFP_KERNEL); +} + +static int damon_sysfs_quotas_add_dirs(struct damon_sysfs_quotas *quotas) +{ + struct damon_sysfs_weights *weights; + int err; + + weights = damon_sysfs_weights_alloc(0, 0, 0); + if (!weights) + return -ENOMEM; + + err = kobject_init_and_add(&weights->kobj, &damon_sysfs_weights_ktype, + "as->kobj, "weights"); + if (err) + kobject_put(&weights->kobj); + else + quotas->weights = weights; + return err; +} + +static void damon_sysfs_quotas_rm_dirs(struct damon_sysfs_quotas *quotas) +{ + kobject_put("as->weights->kobj); +} + +static ssize_t ms_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_quotas *quotas = container_of(kobj, + struct damon_sysfs_quotas, kobj); + + return sysfs_emit(buf, "%lu\n", quotas->ms); +} + +static ssize_t ms_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct damon_sysfs_quotas *quotas = container_of(kobj, + struct damon_sysfs_quotas, kobj); + int err = kstrtoul(buf, 0, "as->ms); + + if (err) + return -EINVAL; + return count; +} + +static ssize_t bytes_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_quotas *quotas = container_of(kobj, + struct damon_sysfs_quotas, kobj); + + return sysfs_emit(buf, "%lu\n", quotas->sz); +} + +static ssize_t bytes_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_quotas *quotas = container_of(kobj, + struct damon_sysfs_quotas, kobj); + int err = kstrtoul(buf, 0, "as->sz); + + if (err) + return -EINVAL; + return count; +} + +static ssize_t reset_interval_ms_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_quotas *quotas = container_of(kobj, + struct damon_sysfs_quotas, kobj); + + return sysfs_emit(buf, "%lu\n", quotas->reset_interval_ms); +} + +static ssize_t reset_interval_ms_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_quotas *quotas = container_of(kobj, + struct damon_sysfs_quotas, kobj); + int err = kstrtoul(buf, 0, "as->reset_interval_ms); + + if (err) + return -EINVAL; + return count; +} + +static void damon_sysfs_quotas_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_quotas, kobj)); +} + 
+static struct kobj_attribute damon_sysfs_quotas_ms_attr = + __ATTR_RW_MODE(ms, 0600); + +static struct kobj_attribute damon_sysfs_quotas_sz_attr = + __ATTR_RW_MODE(bytes, 0600); + +static struct kobj_attribute damon_sysfs_quotas_reset_interval_ms_attr = + __ATTR_RW_MODE(reset_interval_ms, 0600); + +static struct attribute *damon_sysfs_quotas_attrs[] = { + &damon_sysfs_quotas_ms_attr.attr, + &damon_sysfs_quotas_sz_attr.attr, + &damon_sysfs_quotas_reset_interval_ms_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_quotas); + +static struct kobj_type damon_sysfs_quotas_ktype = { + .release = damon_sysfs_quotas_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_quotas_groups, +}; + +/* + * access_pattern directory + */ + +struct damon_sysfs_access_pattern { + struct kobject kobj; + struct damon_sysfs_ul_range *sz; + struct damon_sysfs_ul_range *nr_accesses; + struct damon_sysfs_ul_range *age; +}; + +static +struct damon_sysfs_access_pattern *damon_sysfs_access_pattern_alloc(void) +{ + struct damon_sysfs_access_pattern *access_pattern = + kmalloc(sizeof(*access_pattern), GFP_KERNEL); + + if (!access_pattern) + return NULL; + access_pattern->kobj = (struct kobject){}; + return access_pattern; +} + +static int damon_sysfs_access_pattern_add_range_dir( + struct damon_sysfs_access_pattern *access_pattern, + struct damon_sysfs_ul_range **range_dir_ptr, + char *name) +{ + struct damon_sysfs_ul_range *range = damon_sysfs_ul_range_alloc(0, 0); + int err; + + if (!range) + return -ENOMEM; + err = kobject_init_and_add(&range->kobj, &damon_sysfs_ul_range_ktype, + &access_pattern->kobj, name); + if (err) + kobject_put(&range->kobj); + else + *range_dir_ptr = range; + return err; +} + +static int damon_sysfs_access_pattern_add_dirs( + struct damon_sysfs_access_pattern *access_pattern) +{ + int err; + + err = damon_sysfs_access_pattern_add_range_dir(access_pattern, + &access_pattern->sz, "sz"); + if (err) + goto put_sz_out; + + err = damon_sysfs_access_pattern_add_range_dir(access_pattern, + &access_pattern->nr_accesses, "nr_accesses"); + if (err) + goto put_nr_accesses_sz_out; + + err = damon_sysfs_access_pattern_add_range_dir(access_pattern, + &access_pattern->age, "age"); + if (err) + goto put_age_nr_accesses_sz_out; + return 0; + +put_age_nr_accesses_sz_out: + kobject_put(&access_pattern->age->kobj); + access_pattern->age = NULL; +put_nr_accesses_sz_out: + kobject_put(&access_pattern->nr_accesses->kobj); + access_pattern->nr_accesses = NULL; +put_sz_out: + kobject_put(&access_pattern->sz->kobj); + access_pattern->sz = NULL; + return err; +} + +static void damon_sysfs_access_pattern_rm_dirs( + struct damon_sysfs_access_pattern *access_pattern) +{ + kobject_put(&access_pattern->sz->kobj); + kobject_put(&access_pattern->nr_accesses->kobj); + kobject_put(&access_pattern->age->kobj); +} + +static void damon_sysfs_access_pattern_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_access_pattern, kobj)); +} + +static struct attribute *damon_sysfs_access_pattern_attrs[] = { + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_access_pattern); + +static struct kobj_type damon_sysfs_access_pattern_ktype = { + .release = damon_sysfs_access_pattern_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_access_pattern_groups, +}; + +/* + * scheme directory + */ + +struct damon_sysfs_scheme { + struct kobject kobj; + enum damos_action action; + struct damon_sysfs_access_pattern *access_pattern; + struct damon_sysfs_quotas *quotas; + struct 
damon_sysfs_watermarks *watermarks;
+ struct damon_sysfs_stats *stats;
+};
+
+/* This should match with enum damos_action */
+static const char * const damon_sysfs_damos_action_strs[] = {
+ "willneed",
+ "cold",
+ "pageout",
+ "hugepage",
+ "nohugepage",
+ "lru_prio",
+ "lru_deprio",
+ "stat",
+};
+
+static struct damon_sysfs_scheme *damon_sysfs_scheme_alloc(
+ enum damos_action action)
+{
+ struct damon_sysfs_scheme *scheme = kmalloc(sizeof(*scheme),
+ GFP_KERNEL);
+
+ if (!scheme)
+ return NULL;
+ scheme->kobj = (struct kobject){};
+ scheme->action = action;
+ return scheme;
+}
+
+static int damon_sysfs_scheme_set_access_pattern(
+ struct damon_sysfs_scheme *scheme)
+{
+ struct damon_sysfs_access_pattern *access_pattern;
+ int err;
+
+ access_pattern = damon_sysfs_access_pattern_alloc();
+ if (!access_pattern)
+ return -ENOMEM;
+ err = kobject_init_and_add(&access_pattern->kobj,
+ &damon_sysfs_access_pattern_ktype, &scheme->kobj,
+ "access_pattern");
+ if (err)
+ goto out;
+ err = damon_sysfs_access_pattern_add_dirs(access_pattern);
+ if (err)
+ goto out;
+ scheme->access_pattern = access_pattern;
+ return 0;
+
+out:
+ kobject_put(&access_pattern->kobj);
+ return err;
+}
+
+static int damon_sysfs_scheme_set_quotas(struct damon_sysfs_scheme *scheme)
+{
+ struct damon_sysfs_quotas *quotas = damon_sysfs_quotas_alloc();
+ int err;
+
+ if (!quotas)
+ return -ENOMEM;
+ err = kobject_init_and_add(&quotas->kobj, &damon_sysfs_quotas_ktype,
+ &scheme->kobj, "quotas");
+ if (err)
+ goto out;
+ err = damon_sysfs_quotas_add_dirs(quotas);
+ if (err)
+ goto out;
+ scheme->quotas = quotas;
+ return 0;
+
+out:
+ kobject_put(&quotas->kobj);
+ return err;
+}
+
+static int damon_sysfs_scheme_set_watermarks(struct damon_sysfs_scheme *scheme)
+{
+ struct damon_sysfs_watermarks *watermarks =
+ damon_sysfs_watermarks_alloc(DAMOS_WMARK_NONE, 0, 0, 0, 0);
+ int err;
+
+ if (!watermarks)
+ return -ENOMEM;
+ err = kobject_init_and_add(&watermarks->kobj,
+ &damon_sysfs_watermarks_ktype, &scheme->kobj,
+ "watermarks");
+ if (err)
+ kobject_put(&watermarks->kobj);
+ else
+ scheme->watermarks = watermarks;
+ return err;
+}
+
+static int damon_sysfs_scheme_set_stats(struct damon_sysfs_scheme *scheme)
+{
+ struct damon_sysfs_stats *stats = damon_sysfs_stats_alloc();
+ int err;
+
+ if (!stats)
+ return -ENOMEM;
+ err = kobject_init_and_add(&stats->kobj, &damon_sysfs_stats_ktype,
+ &scheme->kobj, "stats");
+ if (err)
+ kobject_put(&stats->kobj);
+ else
+ scheme->stats = stats;
+ return err;
+}
+
+static int damon_sysfs_scheme_add_dirs(struct damon_sysfs_scheme *scheme)
+{
+ int err;
+
+ err = damon_sysfs_scheme_set_access_pattern(scheme);
+ if (err)
+ return err;
+ err = damon_sysfs_scheme_set_quotas(scheme);
+ if (err)
+ goto put_access_pattern_out;
+ err = damon_sysfs_scheme_set_watermarks(scheme);
+ if (err)
+ goto put_quotas_access_pattern_out;
+ err = damon_sysfs_scheme_set_stats(scheme);
+ if (err)
+ goto put_watermarks_quotas_access_pattern_out;
+ return 0;
+
+put_watermarks_quotas_access_pattern_out:
+ kobject_put(&scheme->watermarks->kobj);
+ scheme->watermarks = NULL;
+put_quotas_access_pattern_out:
+ kobject_put(&scheme->quotas->kobj);
+ scheme->quotas = NULL;
+put_access_pattern_out:
+ kobject_put(&scheme->access_pattern->kobj);
+ scheme->access_pattern = NULL;
+ return err;
+}
+
+static void damon_sysfs_scheme_rm_dirs(struct damon_sysfs_scheme *scheme)
+{
+ damon_sysfs_access_pattern_rm_dirs(scheme->access_pattern);
+ kobject_put(&scheme->access_pattern->kobj);
+ damon_sysfs_quotas_rm_dirs(scheme->quotas);
+
kobject_put(&scheme->quotas->kobj); + kobject_put(&scheme->watermarks->kobj); + kobject_put(&scheme->stats->kobj); +} + +static ssize_t action_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_scheme *scheme = container_of(kobj, + struct damon_sysfs_scheme, kobj); + + return sysfs_emit(buf, "%s\n", + damon_sysfs_damos_action_strs[scheme->action]); +} + +static ssize_t action_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct damon_sysfs_scheme *scheme = container_of(kobj, + struct damon_sysfs_scheme, kobj); + enum damos_action action; + + for (action = 0; action < NR_DAMOS_ACTIONS; action++) { + if (sysfs_streq(buf, damon_sysfs_damos_action_strs[action])) { + scheme->action = action; + return count; + } + } + return -EINVAL; +} + +static void damon_sysfs_scheme_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_scheme, kobj)); +} + +static struct kobj_attribute damon_sysfs_scheme_action_attr = + __ATTR_RW_MODE(action, 0600); + +static struct attribute *damon_sysfs_scheme_attrs[] = { + &damon_sysfs_scheme_action_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_scheme); + +static struct kobj_type damon_sysfs_scheme_ktype = { + .release = damon_sysfs_scheme_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_scheme_groups, +}; + +/* + * schemes directory + */ + +struct damon_sysfs_schemes *damon_sysfs_schemes_alloc(void) +{ + return kzalloc(sizeof(struct damon_sysfs_schemes), GFP_KERNEL); +} + +void damon_sysfs_schemes_rm_dirs(struct damon_sysfs_schemes *schemes) +{ + struct damon_sysfs_scheme **schemes_arr = schemes->schemes_arr; + int i; + + for (i = 0; i < schemes->nr; i++) { + damon_sysfs_scheme_rm_dirs(schemes_arr[i]); + kobject_put(&schemes_arr[i]->kobj); + } + schemes->nr = 0; + kfree(schemes_arr); + schemes->schemes_arr = NULL; +} + +static int damon_sysfs_schemes_add_dirs(struct damon_sysfs_schemes *schemes, + int nr_schemes) +{ + struct damon_sysfs_scheme **schemes_arr, *scheme; + int err, i; + + damon_sysfs_schemes_rm_dirs(schemes); + if (!nr_schemes) + return 0; + + schemes_arr = kmalloc_array(nr_schemes, sizeof(*schemes_arr), + GFP_KERNEL | __GFP_NOWARN); + if (!schemes_arr) + return -ENOMEM; + schemes->schemes_arr = schemes_arr; + + for (i = 0; i < nr_schemes; i++) { + scheme = damon_sysfs_scheme_alloc(DAMOS_STAT); + if (!scheme) { + damon_sysfs_schemes_rm_dirs(schemes); + return -ENOMEM; + } + + err = kobject_init_and_add(&scheme->kobj, + &damon_sysfs_scheme_ktype, &schemes->kobj, + "%d", i); + if (err) + goto out; + err = damon_sysfs_scheme_add_dirs(scheme); + if (err) + goto out; + + schemes_arr[i] = scheme; + schemes->nr++; + } + return 0; + +out: + damon_sysfs_schemes_rm_dirs(schemes); + kobject_put(&scheme->kobj); + return err; +} + +static ssize_t nr_schemes_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_schemes *schemes = container_of(kobj, + struct damon_sysfs_schemes, kobj); + + return sysfs_emit(buf, "%d\n", schemes->nr); +} + +static ssize_t nr_schemes_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_schemes *schemes; + int nr, err = kstrtoint(buf, 0, &nr); + + if (err) + return err; + if (nr < 0) + return -EINVAL; + + schemes = container_of(kobj, struct damon_sysfs_schemes, kobj); + + if (!mutex_trylock(&damon_sysfs_lock)) + return -EBUSY; + err = damon_sysfs_schemes_add_dirs(schemes, nr); + 
mutex_unlock(&damon_sysfs_lock);
+ if (err)
+ return err;
+ return count;
+}
+
+static void damon_sysfs_schemes_release(struct kobject *kobj)
+{
+ kfree(container_of(kobj, struct damon_sysfs_schemes, kobj));
+}
+
+static struct kobj_attribute damon_sysfs_schemes_nr_attr =
+ __ATTR_RW_MODE(nr_schemes, 0600);
+
+static struct attribute *damon_sysfs_schemes_attrs[] = {
+ &damon_sysfs_schemes_nr_attr.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(damon_sysfs_schemes);
+
+struct kobj_type damon_sysfs_schemes_ktype = {
+ .release = damon_sysfs_schemes_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = damon_sysfs_schemes_groups,
+};
+
+static struct damos *damon_sysfs_mk_scheme(
+ struct damon_sysfs_scheme *sysfs_scheme)
+{
+ struct damon_sysfs_access_pattern *access_pattern =
+ sysfs_scheme->access_pattern;
+ struct damon_sysfs_quotas *sysfs_quotas = sysfs_scheme->quotas;
+ struct damon_sysfs_weights *sysfs_weights = sysfs_quotas->weights;
+ struct damon_sysfs_watermarks *sysfs_wmarks = sysfs_scheme->watermarks;
+
+ struct damos_access_pattern pattern = {
+ .min_sz_region = access_pattern->sz->min,
+ .max_sz_region = access_pattern->sz->max,
+ .min_nr_accesses = access_pattern->nr_accesses->min,
+ .max_nr_accesses = access_pattern->nr_accesses->max,
+ .min_age_region = access_pattern->age->min,
+ .max_age_region = access_pattern->age->max,
+ };
+ struct damos_quota quota = {
+ .ms = sysfs_quotas->ms,
+ .sz = sysfs_quotas->sz,
+ .reset_interval = sysfs_quotas->reset_interval_ms,
+ .weight_sz = sysfs_weights->sz,
+ .weight_nr_accesses = sysfs_weights->nr_accesses,
+ .weight_age = sysfs_weights->age,
+ };
+ struct damos_watermarks wmarks = {
+ .metric = sysfs_wmarks->metric,
+ .interval = sysfs_wmarks->interval_us,
+ .high = sysfs_wmarks->high,
+ .mid = sysfs_wmarks->mid,
+ .low = sysfs_wmarks->low,
+ };
+
+ return damon_new_scheme(&pattern, sysfs_scheme->action, &quota,
+ &wmarks);
+}
+
+static void damon_sysfs_update_scheme(struct damos *scheme,
+ struct damon_sysfs_scheme *sysfs_scheme)
+{
+ struct damon_sysfs_access_pattern *access_pattern =
+ sysfs_scheme->access_pattern;
+ struct damon_sysfs_quotas *sysfs_quotas = sysfs_scheme->quotas;
+ struct damon_sysfs_weights *sysfs_weights = sysfs_quotas->weights;
+ struct damon_sysfs_watermarks *sysfs_wmarks = sysfs_scheme->watermarks;
+
+ scheme->pattern.min_sz_region = access_pattern->sz->min;
+ scheme->pattern.max_sz_region = access_pattern->sz->max;
+ scheme->pattern.min_nr_accesses = access_pattern->nr_accesses->min;
+ scheme->pattern.max_nr_accesses = access_pattern->nr_accesses->max;
+ scheme->pattern.min_age_region = access_pattern->age->min;
+ scheme->pattern.max_age_region = access_pattern->age->max;
+
+ scheme->action = sysfs_scheme->action;
+
+ scheme->quota.ms = sysfs_quotas->ms;
+ scheme->quota.sz = sysfs_quotas->sz;
+ scheme->quota.reset_interval = sysfs_quotas->reset_interval_ms;
+ scheme->quota.weight_sz = sysfs_weights->sz;
+ scheme->quota.weight_nr_accesses = sysfs_weights->nr_accesses;
+ scheme->quota.weight_age = sysfs_weights->age;
+
+ scheme->wmarks.metric = sysfs_wmarks->metric;
+ scheme->wmarks.interval = sysfs_wmarks->interval_us;
+ scheme->wmarks.high = sysfs_wmarks->high;
+ scheme->wmarks.mid = sysfs_wmarks->mid;
+ scheme->wmarks.low = sysfs_wmarks->low;
+}
+
+int damon_sysfs_set_schemes(struct damon_ctx *ctx,
+ struct damon_sysfs_schemes *sysfs_schemes)
+{
+ struct damos *scheme, *next;
+ int i = 0;
+
+ damon_for_each_scheme_safe(scheme, next, ctx) {
+ if (i < sysfs_schemes->nr)
+ damon_sysfs_update_scheme(scheme,
+
sysfs_schemes->schemes_arr[i]); + else + damon_destroy_scheme(scheme); + i++; + } + + for (; i < sysfs_schemes->nr; i++) { + struct damos *scheme, *next; + + scheme = damon_sysfs_mk_scheme(sysfs_schemes->schemes_arr[i]); + if (!scheme) { + damon_for_each_scheme_safe(scheme, next, ctx) + damon_destroy_scheme(scheme); + return -ENOMEM; + } + damon_add_scheme(ctx, scheme); + } + return 0; +} + +void damon_sysfs_schemes_update_stats( + struct damon_sysfs_schemes *sysfs_schemes, + struct damon_ctx *ctx) +{ + struct damos *scheme; + int schemes_idx = 0; + + damon_for_each_scheme(scheme, ctx) { + struct damon_sysfs_stats *sysfs_stats; + + /* user could have removed the scheme sysfs dir */ + if (schemes_idx >= sysfs_schemes->nr) + break; + + sysfs_stats = sysfs_schemes->schemes_arr[schemes_idx++]->stats; + sysfs_stats->nr_tried = scheme->stat.nr_tried; + sysfs_stats->sz_tried = scheme->stat.sz_tried; + sysfs_stats->nr_applied = scheme->stat.nr_applied; + sysfs_stats->sz_applied = scheme->stat.sz_applied; + sysfs_stats->qt_exceeds = scheme->stat.qt_exceeds; + } +} diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 836df19a7d86..284daf274b3e 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -11,949 +11,6 @@ #include "sysfs-common.h" -/* - * schemes/stats directory - */ - -struct damon_sysfs_stats { - struct kobject kobj; - unsigned long nr_tried; - unsigned long sz_tried; - unsigned long nr_applied; - unsigned long sz_applied; - unsigned long qt_exceeds; -}; - -static struct damon_sysfs_stats *damon_sysfs_stats_alloc(void) -{ - return kzalloc(sizeof(struct damon_sysfs_stats), GFP_KERNEL); -} - -static ssize_t nr_tried_show(struct kobject *kobj, struct kobj_attribute *attr, - char *buf) -{ - struct damon_sysfs_stats *stats = container_of(kobj, - struct damon_sysfs_stats, kobj); - - return sysfs_emit(buf, "%lu\n", stats->nr_tried); -} - -static ssize_t sz_tried_show(struct kobject *kobj, struct kobj_attribute *attr, - char *buf) -{ - struct damon_sysfs_stats *stats = container_of(kobj, - struct damon_sysfs_stats, kobj); - - return sysfs_emit(buf, "%lu\n", stats->sz_tried); -} - -static ssize_t nr_applied_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct damon_sysfs_stats *stats = container_of(kobj, - struct damon_sysfs_stats, kobj); - - return sysfs_emit(buf, "%lu\n", stats->nr_applied); -} - -static ssize_t sz_applied_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct damon_sysfs_stats *stats = container_of(kobj, - struct damon_sysfs_stats, kobj); - - return sysfs_emit(buf, "%lu\n", stats->sz_applied); -} - -static ssize_t qt_exceeds_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct damon_sysfs_stats *stats = container_of(kobj, - struct damon_sysfs_stats, kobj); - - return sysfs_emit(buf, "%lu\n", stats->qt_exceeds); -} - -static void damon_sysfs_stats_release(struct kobject *kobj) -{ - kfree(container_of(kobj, struct damon_sysfs_stats, kobj)); -} - -static struct kobj_attribute damon_sysfs_stats_nr_tried_attr = - __ATTR_RO_MODE(nr_tried, 0400); - -static struct kobj_attribute damon_sysfs_stats_sz_tried_attr = - __ATTR_RO_MODE(sz_tried, 0400); - -static struct kobj_attribute damon_sysfs_stats_nr_applied_attr = - __ATTR_RO_MODE(nr_applied, 0400); - -static struct kobj_attribute damon_sysfs_stats_sz_applied_attr = - __ATTR_RO_MODE(sz_applied, 0400); - -static struct kobj_attribute damon_sysfs_stats_qt_exceeds_attr = - __ATTR_RO_MODE(qt_exceeds, 0400); - -static struct attribute 
*damon_sysfs_stats_attrs[] = { - &damon_sysfs_stats_nr_tried_attr.attr, - &damon_sysfs_stats_sz_tried_attr.attr, - &damon_sysfs_stats_nr_applied_attr.attr, - &damon_sysfs_stats_sz_applied_attr.attr, - &damon_sysfs_stats_qt_exceeds_attr.attr, - NULL, -}; -ATTRIBUTE_GROUPS(damon_sysfs_stats); - -static struct kobj_type damon_sysfs_stats_ktype = { - .release = damon_sysfs_stats_release, - .sysfs_ops = &kobj_sysfs_ops, - .default_groups = damon_sysfs_stats_groups, -}; - -/* - * watermarks directory - */ - -struct damon_sysfs_watermarks { - struct kobject kobj; - enum damos_wmark_metric metric; - unsigned long interval_us; - unsigned long high; - unsigned long mid; - unsigned long low; -}; - -static struct damon_sysfs_watermarks *damon_sysfs_watermarks_alloc( - enum damos_wmark_metric metric, unsigned long interval_us, - unsigned long high, unsigned long mid, unsigned long low) -{ - struct damon_sysfs_watermarks *watermarks = kmalloc( - sizeof(*watermarks), GFP_KERNEL); - - if (!watermarks) - return NULL; - watermarks->kobj = (struct kobject){}; - watermarks->metric = metric; - watermarks->interval_us = interval_us; - watermarks->high = high; - watermarks->mid = mid; - watermarks->low = low; - return watermarks; -} - -/* Should match with enum damos_wmark_metric */ -static const char * const damon_sysfs_wmark_metric_strs[] = { - "none", - "free_mem_rate", -}; - -static ssize_t metric_show(struct kobject *kobj, struct kobj_attribute *attr, - char *buf) -{ - struct damon_sysfs_watermarks *watermarks = container_of(kobj, - struct damon_sysfs_watermarks, kobj); - - return sysfs_emit(buf, "%s\n", - damon_sysfs_wmark_metric_strs[watermarks->metric]); -} - -static ssize_t metric_store(struct kobject *kobj, struct kobj_attribute *attr, - const char *buf, size_t count) -{ - struct damon_sysfs_watermarks *watermarks = container_of(kobj, - struct damon_sysfs_watermarks, kobj); - enum damos_wmark_metric metric; - - for (metric = 0; metric < NR_DAMOS_WMARK_METRICS; metric++) { - if (sysfs_streq(buf, damon_sysfs_wmark_metric_strs[metric])) { - watermarks->metric = metric; - return count; - } - } - return -EINVAL; -} - -static ssize_t interval_us_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct damon_sysfs_watermarks *watermarks = container_of(kobj, - struct damon_sysfs_watermarks, kobj); - - return sysfs_emit(buf, "%lu\n", watermarks->interval_us); -} - -static ssize_t interval_us_store(struct kobject *kobj, - struct kobj_attribute *attr, const char *buf, size_t count) -{ - struct damon_sysfs_watermarks *watermarks = container_of(kobj, - struct damon_sysfs_watermarks, kobj); - int err = kstrtoul(buf, 0, &watermarks->interval_us); - - return err ? err : count; -} - -static ssize_t high_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct damon_sysfs_watermarks *watermarks = container_of(kobj, - struct damon_sysfs_watermarks, kobj); - - return sysfs_emit(buf, "%lu\n", watermarks->high); -} - -static ssize_t high_store(struct kobject *kobj, - struct kobj_attribute *attr, const char *buf, size_t count) -{ - struct damon_sysfs_watermarks *watermarks = container_of(kobj, - struct damon_sysfs_watermarks, kobj); - int err = kstrtoul(buf, 0, &watermarks->high); - - return err ? 
err : count; -} - -static ssize_t mid_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct damon_sysfs_watermarks *watermarks = container_of(kobj, - struct damon_sysfs_watermarks, kobj); - - return sysfs_emit(buf, "%lu\n", watermarks->mid); -} - -static ssize_t mid_store(struct kobject *kobj, - struct kobj_attribute *attr, const char *buf, size_t count) -{ - struct damon_sysfs_watermarks *watermarks = container_of(kobj, - struct damon_sysfs_watermarks, kobj); - int err = kstrtoul(buf, 0, &watermarks->mid); - - return err ? err : count; -} - -static ssize_t low_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct damon_sysfs_watermarks *watermarks = container_of(kobj, - struct damon_sysfs_watermarks, kobj); - - return sysfs_emit(buf, "%lu\n", watermarks->low); -} - -static ssize_t low_store(struct kobject *kobj, - struct kobj_attribute *attr, const char *buf, size_t count) -{ - struct damon_sysfs_watermarks *watermarks = container_of(kobj, - struct damon_sysfs_watermarks, kobj); - int err = kstrtoul(buf, 0, &watermarks->low); - - return err ? err : count; -} - -static void damon_sysfs_watermarks_release(struct kobject *kobj) -{ - kfree(container_of(kobj, struct damon_sysfs_watermarks, kobj)); -} - -static struct kobj_attribute damon_sysfs_watermarks_metric_attr = - __ATTR_RW_MODE(metric, 0600); - -static struct kobj_attribute damon_sysfs_watermarks_interval_us_attr = - __ATTR_RW_MODE(interval_us, 0600); - -static struct kobj_attribute damon_sysfs_watermarks_high_attr = - __ATTR_RW_MODE(high, 0600); - -static struct kobj_attribute damon_sysfs_watermarks_mid_attr = - __ATTR_RW_MODE(mid, 0600); - -static struct kobj_attribute damon_sysfs_watermarks_low_attr = - __ATTR_RW_MODE(low, 0600); - -static struct attribute *damon_sysfs_watermarks_attrs[] = { - &damon_sysfs_watermarks_metric_attr.attr, - &damon_sysfs_watermarks_interval_us_attr.attr, - &damon_sysfs_watermarks_high_attr.attr, - &damon_sysfs_watermarks_mid_attr.attr, - &damon_sysfs_watermarks_low_attr.attr, - NULL, -}; -ATTRIBUTE_GROUPS(damon_sysfs_watermarks); - -static struct kobj_type damon_sysfs_watermarks_ktype = { - .release = damon_sysfs_watermarks_release, - .sysfs_ops = &kobj_sysfs_ops, - .default_groups = damon_sysfs_watermarks_groups, -}; - -/* - * scheme/weights directory - */ - -struct damon_sysfs_weights { - struct kobject kobj; - unsigned int sz; - unsigned int nr_accesses; - unsigned int age; -}; - -static struct damon_sysfs_weights *damon_sysfs_weights_alloc(unsigned int sz, - unsigned int nr_accesses, unsigned int age) -{ - struct damon_sysfs_weights *weights = kmalloc(sizeof(*weights), - GFP_KERNEL); - - if (!weights) - return NULL; - weights->kobj = (struct kobject){}; - weights->sz = sz; - weights->nr_accesses = nr_accesses; - weights->age = age; - return weights; -} - -static ssize_t sz_permil_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct damon_sysfs_weights *weights = container_of(kobj, - struct damon_sysfs_weights, kobj); - - return sysfs_emit(buf, "%u\n", weights->sz); -} - -static ssize_t sz_permil_store(struct kobject *kobj, - struct kobj_attribute *attr, const char *buf, size_t count) -{ - struct damon_sysfs_weights *weights = container_of(kobj, - struct damon_sysfs_weights, kobj); - int err = kstrtouint(buf, 0, &weights->sz); - - return err ? 
err : count;
-}
-
-static ssize_t nr_accesses_permil_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
-{
- struct damon_sysfs_weights *weights = container_of(kobj,
- struct damon_sysfs_weights, kobj);
-
- return sysfs_emit(buf, "%u\n", weights->nr_accesses);
-}
-
-static ssize_t nr_accesses_permil_store(struct kobject *kobj,
- struct kobj_attribute *attr, const char *buf, size_t count)
-{
- struct damon_sysfs_weights *weights = container_of(kobj,
- struct damon_sysfs_weights, kobj);
- int err = kstrtouint(buf, 0, &weights->nr_accesses);
-
- return err ? err : count;
-}
-
-static ssize_t age_permil_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
-{
- struct damon_sysfs_weights *weights = container_of(kobj,
- struct damon_sysfs_weights, kobj);
-
- return sysfs_emit(buf, "%u\n", weights->age);
-}
-
-static ssize_t age_permil_store(struct kobject *kobj,
- struct kobj_attribute *attr, const char *buf, size_t count)
-{
- struct damon_sysfs_weights *weights = container_of(kobj,
- struct damon_sysfs_weights, kobj);
- int err = kstrtouint(buf, 0, &weights->age);
-
- return err ? err : count;
-}
-
-static void damon_sysfs_weights_release(struct kobject *kobj)
-{
- kfree(container_of(kobj, struct damon_sysfs_weights, kobj));
-}
-
-static struct kobj_attribute damon_sysfs_weights_sz_attr =
- __ATTR_RW_MODE(sz_permil, 0600);
-
-static struct kobj_attribute damon_sysfs_weights_nr_accesses_attr =
- __ATTR_RW_MODE(nr_accesses_permil, 0600);
-
-static struct kobj_attribute damon_sysfs_weights_age_attr =
- __ATTR_RW_MODE(age_permil, 0600);
-
-static struct attribute *damon_sysfs_weights_attrs[] = {
- &damon_sysfs_weights_sz_attr.attr,
- &damon_sysfs_weights_nr_accesses_attr.attr,
- &damon_sysfs_weights_age_attr.attr,
- NULL,
-};
-ATTRIBUTE_GROUPS(damon_sysfs_weights);
-
-static struct kobj_type damon_sysfs_weights_ktype = {
- .release = damon_sysfs_weights_release,
- .sysfs_ops = &kobj_sysfs_ops,
- .default_groups = damon_sysfs_weights_groups,
-};
-
-/*
- * quotas directory
- */
-
-struct damon_sysfs_quotas {
- struct kobject kobj;
- struct damon_sysfs_weights *weights;
- unsigned long ms;
- unsigned long sz;
- unsigned long reset_interval_ms;
-};
-
-static struct damon_sysfs_quotas *damon_sysfs_quotas_alloc(void)
-{
- return kzalloc(sizeof(struct damon_sysfs_quotas), GFP_KERNEL);
-}
-
-static int damon_sysfs_quotas_add_dirs(struct damon_sysfs_quotas *quotas)
-{
- struct damon_sysfs_weights *weights;
- int err;
-
- weights = damon_sysfs_weights_alloc(0, 0, 0);
- if (!weights)
- return -ENOMEM;
-
- err = kobject_init_and_add(&weights->kobj, &damon_sysfs_weights_ktype,
- &quotas->kobj, "weights");
- if (err)
- kobject_put(&weights->kobj);
- else
- quotas->weights = weights;
- return err;
-}
-
-static void damon_sysfs_quotas_rm_dirs(struct damon_sysfs_quotas *quotas)
-{
- kobject_put(&quotas->weights->kobj);
-}
-
-static ssize_t ms_show(struct kobject *kobj, struct kobj_attribute *attr,
- char *buf)
-{
- struct damon_sysfs_quotas *quotas = container_of(kobj,
- struct damon_sysfs_quotas, kobj);
-
- return sysfs_emit(buf, "%lu\n", quotas->ms);
-}
-
-static ssize_t ms_store(struct kobject *kobj, struct kobj_attribute *attr,
- const char *buf, size_t count)
-{
- struct damon_sysfs_quotas *quotas = container_of(kobj,
- struct damon_sysfs_quotas, kobj);
- int err = kstrtoul(buf, 0, &quotas->ms);
-
- if (err)
- return -EINVAL;
- return count;
-}
-
-static ssize_t bytes_show(struct kobject *kobj, struct kobj_attribute *attr,
- char *buf)
-{
- struct damon_sysfs_quotas *quotas =
container_of(kobj,
- struct damon_sysfs_quotas, kobj);
-
- return sysfs_emit(buf, "%lu\n", quotas->sz);
-}
-
-static ssize_t bytes_store(struct kobject *kobj,
- struct kobj_attribute *attr, const char *buf, size_t count)
-{
- struct damon_sysfs_quotas *quotas = container_of(kobj,
- struct damon_sysfs_quotas, kobj);
- int err = kstrtoul(buf, 0, &quotas->sz);
-
- if (err)
- return -EINVAL;
- return count;
-}
-
-static ssize_t reset_interval_ms_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
-{
- struct damon_sysfs_quotas *quotas = container_of(kobj,
- struct damon_sysfs_quotas, kobj);
-
- return sysfs_emit(buf, "%lu\n", quotas->reset_interval_ms);
-}
-
-static ssize_t reset_interval_ms_store(struct kobject *kobj,
- struct kobj_attribute *attr, const char *buf, size_t count)
-{
- struct damon_sysfs_quotas *quotas = container_of(kobj,
- struct damon_sysfs_quotas, kobj);
- int err = kstrtoul(buf, 0, &quotas->reset_interval_ms);
-
- if (err)
- return -EINVAL;
- return count;
-}
-
-static void damon_sysfs_quotas_release(struct kobject *kobj)
-{
- kfree(container_of(kobj, struct damon_sysfs_quotas, kobj));
-}
-
-static struct kobj_attribute damon_sysfs_quotas_ms_attr =
- __ATTR_RW_MODE(ms, 0600);
-
-static struct kobj_attribute damon_sysfs_quotas_sz_attr =
- __ATTR_RW_MODE(bytes, 0600);
-
-static struct kobj_attribute damon_sysfs_quotas_reset_interval_ms_attr =
- __ATTR_RW_MODE(reset_interval_ms, 0600);
-
-static struct attribute *damon_sysfs_quotas_attrs[] = {
- &damon_sysfs_quotas_ms_attr.attr,
- &damon_sysfs_quotas_sz_attr.attr,
- &damon_sysfs_quotas_reset_interval_ms_attr.attr,
- NULL,
-};
-ATTRIBUTE_GROUPS(damon_sysfs_quotas);
-
-static struct kobj_type damon_sysfs_quotas_ktype = {
- .release = damon_sysfs_quotas_release,
- .sysfs_ops = &kobj_sysfs_ops,
- .default_groups = damon_sysfs_quotas_groups,
-};
-
-/*
- * access_pattern directory
- */
-
-struct damon_sysfs_access_pattern {
- struct kobject kobj;
- struct damon_sysfs_ul_range *sz;
- struct damon_sysfs_ul_range *nr_accesses;
- struct damon_sysfs_ul_range *age;
-};
-
-static
-struct damon_sysfs_access_pattern *damon_sysfs_access_pattern_alloc(void)
-{
- struct damon_sysfs_access_pattern *access_pattern =
- kmalloc(sizeof(*access_pattern), GFP_KERNEL);
-
- if (!access_pattern)
- return NULL;
- access_pattern->kobj = (struct kobject){};
- return access_pattern;
-}
-
-static int damon_sysfs_access_pattern_add_range_dir(
- struct damon_sysfs_access_pattern *access_pattern,
- struct damon_sysfs_ul_range **range_dir_ptr,
- char *name)
-{
- struct damon_sysfs_ul_range *range = damon_sysfs_ul_range_alloc(0, 0);
- int err;
-
- if (!range)
- return -ENOMEM;
- err = kobject_init_and_add(&range->kobj, &damon_sysfs_ul_range_ktype,
- &access_pattern->kobj, name);
- if (err)
- kobject_put(&range->kobj);
- else
- *range_dir_ptr = range;
- return err;
-}
-
-static int damon_sysfs_access_pattern_add_dirs(
- struct damon_sysfs_access_pattern *access_pattern)
-{
- int err;
-
- err = damon_sysfs_access_pattern_add_range_dir(access_pattern,
- &access_pattern->sz, "sz");
- if (err)
- goto put_sz_out;
-
- err = damon_sysfs_access_pattern_add_range_dir(access_pattern,
- &access_pattern->nr_accesses, "nr_accesses");
- if (err)
- goto put_nr_accesses_sz_out;
-
- err = damon_sysfs_access_pattern_add_range_dir(access_pattern,
- &access_pattern->age, "age");
- if (err)
- goto put_age_nr_accesses_sz_out;
- return 0;
-
-put_age_nr_accesses_sz_out:
- kobject_put(&access_pattern->age->kobj);
- access_pattern->age = NULL;
-put_nr_accesses_sz_out:
-
kobject_put(&access_pattern->nr_accesses->kobj);
- access_pattern->nr_accesses = NULL;
-put_sz_out:
- kobject_put(&access_pattern->sz->kobj);
- access_pattern->sz = NULL;
- return err;
-}
-
-static void damon_sysfs_access_pattern_rm_dirs(
- struct damon_sysfs_access_pattern *access_pattern)
-{
- kobject_put(&access_pattern->sz->kobj);
- kobject_put(&access_pattern->nr_accesses->kobj);
- kobject_put(&access_pattern->age->kobj);
-}
-
-static void damon_sysfs_access_pattern_release(struct kobject *kobj)
-{
- kfree(container_of(kobj, struct damon_sysfs_access_pattern, kobj));
-}
-
-static struct attribute *damon_sysfs_access_pattern_attrs[] = {
- NULL,
-};
-ATTRIBUTE_GROUPS(damon_sysfs_access_pattern);
-
-static struct kobj_type damon_sysfs_access_pattern_ktype = {
- .release = damon_sysfs_access_pattern_release,
- .sysfs_ops = &kobj_sysfs_ops,
- .default_groups = damon_sysfs_access_pattern_groups,
-};
-
-/*
- * scheme directory
- */
-
-struct damon_sysfs_scheme {
- struct kobject kobj;
- enum damos_action action;
- struct damon_sysfs_access_pattern *access_pattern;
- struct damon_sysfs_quotas *quotas;
- struct damon_sysfs_watermarks *watermarks;
- struct damon_sysfs_stats *stats;
-};
-
-/* This should match with enum damos_action */
-static const char * const damon_sysfs_damos_action_strs[] = {
- "willneed",
- "cold",
- "pageout",
- "hugepage",
- "nohugepage",
- "lru_prio",
- "lru_deprio",
- "stat",
-};
-
-static struct damon_sysfs_scheme *damon_sysfs_scheme_alloc(
- enum damos_action action)
-{
- struct damon_sysfs_scheme *scheme = kmalloc(sizeof(*scheme),
- GFP_KERNEL);
-
- if (!scheme)
- return NULL;
- scheme->kobj = (struct kobject){};
- scheme->action = action;
- return scheme;
-}
-
-static int damon_sysfs_scheme_set_access_pattern(
- struct damon_sysfs_scheme *scheme)
-{
- struct damon_sysfs_access_pattern *access_pattern;
- int err;
-
- access_pattern = damon_sysfs_access_pattern_alloc();
- if (!access_pattern)
- return -ENOMEM;
- err = kobject_init_and_add(&access_pattern->kobj,
- &damon_sysfs_access_pattern_ktype, &scheme->kobj,
- "access_pattern");
- if (err)
- goto out;
- err = damon_sysfs_access_pattern_add_dirs(access_pattern);
- if (err)
- goto out;
- scheme->access_pattern = access_pattern;
- return 0;
-
-out:
- kobject_put(&access_pattern->kobj);
- return err;
-}
-
-static int damon_sysfs_scheme_set_quotas(struct damon_sysfs_scheme *scheme)
-{
- struct damon_sysfs_quotas *quotas = damon_sysfs_quotas_alloc();
- int err;
-
- if (!quotas)
- return -ENOMEM;
- err = kobject_init_and_add(&quotas->kobj, &damon_sysfs_quotas_ktype,
- &scheme->kobj, "quotas");
- if (err)
- goto out;
- err = damon_sysfs_quotas_add_dirs(quotas);
- if (err)
- goto out;
- scheme->quotas = quotas;
- return 0;
-
-out:
- kobject_put(&quotas->kobj);
- return err;
-}
-
-static int damon_sysfs_scheme_set_watermarks(struct damon_sysfs_scheme *scheme)
-{
- struct damon_sysfs_watermarks *watermarks =
- damon_sysfs_watermarks_alloc(DAMOS_WMARK_NONE, 0, 0, 0, 0);
- int err;
-
- if (!watermarks)
- return -ENOMEM;
- err = kobject_init_and_add(&watermarks->kobj,
- &damon_sysfs_watermarks_ktype, &scheme->kobj,
- "watermarks");
- if (err)
- kobject_put(&watermarks->kobj);
- else
- scheme->watermarks = watermarks;
- return err;
-}
-
-static int damon_sysfs_scheme_set_stats(struct damon_sysfs_scheme *scheme)
-{
- struct damon_sysfs_stats *stats = damon_sysfs_stats_alloc();
- int err;
-
- if (!stats)
- return -ENOMEM;
- err = kobject_init_and_add(&stats->kobj, &damon_sysfs_stats_ktype,
- &scheme->kobj, "stats");
- if (err)
-
kobject_put(&stats->kobj); - else - scheme->stats = stats; - return err; -} - -static int damon_sysfs_scheme_add_dirs(struct damon_sysfs_scheme *scheme) -{ - int err; - - err = damon_sysfs_scheme_set_access_pattern(scheme); - if (err) - return err; - err = damon_sysfs_scheme_set_quotas(scheme); - if (err) - goto put_access_pattern_out; - err = damon_sysfs_scheme_set_watermarks(scheme); - if (err) - goto put_quotas_access_pattern_out; - err = damon_sysfs_scheme_set_stats(scheme); - if (err) - goto put_watermarks_quotas_access_pattern_out; - return 0; - -put_watermarks_quotas_access_pattern_out: - kobject_put(&scheme->watermarks->kobj); - scheme->watermarks = NULL; -put_quotas_access_pattern_out: - kobject_put(&scheme->quotas->kobj); - scheme->quotas = NULL; -put_access_pattern_out: - kobject_put(&scheme->access_pattern->kobj); - scheme->access_pattern = NULL; - return err; -} - -static void damon_sysfs_scheme_rm_dirs(struct damon_sysfs_scheme *scheme) -{ - damon_sysfs_access_pattern_rm_dirs(scheme->access_pattern); - kobject_put(&scheme->access_pattern->kobj); - damon_sysfs_quotas_rm_dirs(scheme->quotas); - kobject_put(&scheme->quotas->kobj); - kobject_put(&scheme->watermarks->kobj); - kobject_put(&scheme->stats->kobj); -} - -static ssize_t action_show(struct kobject *kobj, struct kobj_attribute *attr, - char *buf) -{ - struct damon_sysfs_scheme *scheme = container_of(kobj, - struct damon_sysfs_scheme, kobj); - - return sysfs_emit(buf, "%s\n", - damon_sysfs_damos_action_strs[scheme->action]); -} - -static ssize_t action_store(struct kobject *kobj, struct kobj_attribute *attr, - const char *buf, size_t count) -{ - struct damon_sysfs_scheme *scheme = container_of(kobj, - struct damon_sysfs_scheme, kobj); - enum damos_action action; - - for (action = 0; action < NR_DAMOS_ACTIONS; action++) { - if (sysfs_streq(buf, damon_sysfs_damos_action_strs[action])) { - scheme->action = action; - return count; - } - } - return -EINVAL; -} - -static void damon_sysfs_scheme_release(struct kobject *kobj) -{ - kfree(container_of(kobj, struct damon_sysfs_scheme, kobj)); -} - -static struct kobj_attribute damon_sysfs_scheme_action_attr = - __ATTR_RW_MODE(action, 0600); - -static struct attribute *damon_sysfs_scheme_attrs[] = { - &damon_sysfs_scheme_action_attr.attr, - NULL, -}; -ATTRIBUTE_GROUPS(damon_sysfs_scheme); - -static struct kobj_type damon_sysfs_scheme_ktype = { - .release = damon_sysfs_scheme_release, - .sysfs_ops = &kobj_sysfs_ops, - .default_groups = damon_sysfs_scheme_groups, -}; - -/* - * schemes directory - */ - -struct damon_sysfs_schemes { - struct kobject kobj; - struct damon_sysfs_scheme **schemes_arr; - int nr; -}; - -static struct damon_sysfs_schemes *damon_sysfs_schemes_alloc(void) -{ - return kzalloc(sizeof(struct damon_sysfs_schemes), GFP_KERNEL); -} - -static void damon_sysfs_schemes_rm_dirs(struct damon_sysfs_schemes *schemes) -{ - struct damon_sysfs_scheme **schemes_arr = schemes->schemes_arr; - int i; - - for (i = 0; i < schemes->nr; i++) { - damon_sysfs_scheme_rm_dirs(schemes_arr[i]); - kobject_put(&schemes_arr[i]->kobj); - } - schemes->nr = 0; - kfree(schemes_arr); - schemes->schemes_arr = NULL; -} - -static int damon_sysfs_schemes_add_dirs(struct damon_sysfs_schemes *schemes, - int nr_schemes) -{ - struct damon_sysfs_scheme **schemes_arr, *scheme; - int err, i; - - damon_sysfs_schemes_rm_dirs(schemes); - if (!nr_schemes) - return 0; - - schemes_arr = kmalloc_array(nr_schemes, sizeof(*schemes_arr), - GFP_KERNEL | __GFP_NOWARN); - if (!schemes_arr) - return -ENOMEM; - 
schemes->schemes_arr = schemes_arr;
-
- for (i = 0; i < nr_schemes; i++) {
- scheme = damon_sysfs_scheme_alloc(DAMOS_STAT);
- if (!scheme) {
- damon_sysfs_schemes_rm_dirs(schemes);
- return -ENOMEM;
- }
-
- err = kobject_init_and_add(&scheme->kobj,
- &damon_sysfs_scheme_ktype, &schemes->kobj,
- "%d", i);
- if (err)
- goto out;
- err = damon_sysfs_scheme_add_dirs(scheme);
- if (err)
- goto out;
-
- schemes_arr[i] = scheme;
- schemes->nr++;
- }
- return 0;
-
-out:
- damon_sysfs_schemes_rm_dirs(schemes);
- kobject_put(&scheme->kobj);
- return err;
-}
-
-static ssize_t nr_schemes_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
-{
- struct damon_sysfs_schemes *schemes = container_of(kobj,
- struct damon_sysfs_schemes, kobj);
-
- return sysfs_emit(buf, "%d\n", schemes->nr);
-}
-
-static ssize_t nr_schemes_store(struct kobject *kobj,
- struct kobj_attribute *attr, const char *buf, size_t count)
-{
- struct damon_sysfs_schemes *schemes;
- int nr, err = kstrtoint(buf, 0, &nr);
-
- if (err)
- return err;
- if (nr < 0)
- return -EINVAL;
-
- schemes = container_of(kobj, struct damon_sysfs_schemes, kobj);
-
- if (!mutex_trylock(&damon_sysfs_lock))
- return -EBUSY;
- err = damon_sysfs_schemes_add_dirs(schemes, nr);
- mutex_unlock(&damon_sysfs_lock);
- if (err)
- return err;
- return count;
-}
-
-static void damon_sysfs_schemes_release(struct kobject *kobj)
-{
- kfree(container_of(kobj, struct damon_sysfs_schemes, kobj));
-}
-
-static struct kobj_attribute damon_sysfs_schemes_nr_attr =
- __ATTR_RW_MODE(nr_schemes, 0600);
-
-static struct attribute *damon_sysfs_schemes_attrs[] = {
- &damon_sysfs_schemes_nr_attr.attr,
- NULL,
-};
-ATTRIBUTE_GROUPS(damon_sysfs_schemes);
-
-static struct kobj_type damon_sysfs_schemes_ktype = {
- .release = damon_sysfs_schemes_release,
- .sysfs_ops = &kobj_sysfs_ops,
- .default_groups = damon_sysfs_schemes_groups,
-};
-
 /*
 * init region directory
 */
@@ -2133,104 +1190,6 @@ static int damon_sysfs_set_targets(struct damon_ctx *ctx,
 return 0;
 }
-static struct damos *damon_sysfs_mk_scheme(
- struct damon_sysfs_scheme *sysfs_scheme)
-{
- struct damon_sysfs_access_pattern *access_pattern =
- sysfs_scheme->access_pattern;
- struct damon_sysfs_quotas *sysfs_quotas = sysfs_scheme->quotas;
- struct damon_sysfs_weights *sysfs_weights = sysfs_quotas->weights;
- struct damon_sysfs_watermarks *sysfs_wmarks = sysfs_scheme->watermarks;
-
- struct damos_access_pattern pattern = {
- .min_sz_region = access_pattern->sz->min,
- .max_sz_region = access_pattern->sz->max,
- .min_nr_accesses = access_pattern->nr_accesses->min,
- .max_nr_accesses = access_pattern->nr_accesses->max,
- .min_age_region = access_pattern->age->min,
- .max_age_region = access_pattern->age->max,
- };
- struct damos_quota quota = {
- .ms = sysfs_quotas->ms,
- .sz = sysfs_quotas->sz,
- .reset_interval = sysfs_quotas->reset_interval_ms,
- .weight_sz = sysfs_weights->sz,
- .weight_nr_accesses = sysfs_weights->nr_accesses,
- .weight_age = sysfs_weights->age,
- };
- struct damos_watermarks wmarks = {
- .metric = sysfs_wmarks->metric,
- .interval = sysfs_wmarks->interval_us,
- .high = sysfs_wmarks->high,
- .mid = sysfs_wmarks->mid,
- .low = sysfs_wmarks->low,
- };
-
- return damon_new_scheme(&pattern, sysfs_scheme->action, &quota,
- &wmarks);
-}
-
-static void damon_sysfs_update_scheme(struct damos *scheme,
- struct damon_sysfs_scheme *sysfs_scheme)
-{
- struct damon_sysfs_access_pattern *access_pattern =
- sysfs_scheme->access_pattern;
- struct damon_sysfs_quotas *sysfs_quotas = sysfs_scheme->quotas;
- struct
damon_sysfs_weights *sysfs_weights = sysfs_quotas->weights;
- struct damon_sysfs_watermarks *sysfs_wmarks = sysfs_scheme->watermarks;
-
- scheme->pattern.min_sz_region = access_pattern->sz->min;
- scheme->pattern.max_sz_region = access_pattern->sz->max;
- scheme->pattern.min_nr_accesses = access_pattern->nr_accesses->min;
- scheme->pattern.max_nr_accesses = access_pattern->nr_accesses->max;
- scheme->pattern.min_age_region = access_pattern->age->min;
- scheme->pattern.max_age_region = access_pattern->age->max;
-
- scheme->action = sysfs_scheme->action;
-
- scheme->quota.ms = sysfs_quotas->ms;
- scheme->quota.sz = sysfs_quotas->sz;
- scheme->quota.reset_interval = sysfs_quotas->reset_interval_ms;
- scheme->quota.weight_sz = sysfs_weights->sz;
- scheme->quota.weight_nr_accesses = sysfs_weights->nr_accesses;
- scheme->quota.weight_age = sysfs_weights->age;
-
- scheme->wmarks.metric = sysfs_wmarks->metric;
- scheme->wmarks.interval = sysfs_wmarks->interval_us;
- scheme->wmarks.high = sysfs_wmarks->high;
- scheme->wmarks.mid = sysfs_wmarks->mid;
- scheme->wmarks.low = sysfs_wmarks->low;
-}
-
-static int damon_sysfs_set_schemes(struct damon_ctx *ctx,
- struct damon_sysfs_schemes *sysfs_schemes)
-{
- struct damos *scheme, *next;
- int i = 0;
-
- damon_for_each_scheme_safe(scheme, next, ctx) {
- if (i < sysfs_schemes->nr)
- damon_sysfs_update_scheme(scheme,
- sysfs_schemes->schemes_arr[i]);
- else
- damon_destroy_scheme(scheme);
- i++;
- }
-
- for (; i < sysfs_schemes->nr; i++) {
- struct damos *scheme, *next;
-
- scheme = damon_sysfs_mk_scheme(sysfs_schemes->schemes_arr[i]);
- if (!scheme) {
- damon_for_each_scheme_safe(scheme, next, ctx)
- damon_destroy_scheme(scheme);
- return -ENOMEM;
- }
- damon_add_scheme(ctx, scheme);
- }
- return 0;
-}
-
 static void damon_sysfs_before_terminate(struct damon_ctx *ctx)
 {
 struct damon_target *t, *next;
@@ -2246,29 +1205,6 @@ static void damon_sysfs_before_terminate(struct damon_ctx *ctx)
 mutex_unlock(&ctx->kdamond_lock);
 }
-static void damon_sysfs_schemes_update_stats(
- struct damon_sysfs_schemes *sysfs_schemes,
- struct damon_ctx *ctx)
-{
- struct damos *scheme;
- int schemes_idx = 0;
-
- damon_for_each_scheme(scheme, ctx) {
- struct damon_sysfs_stats *sysfs_stats;
-
- /* user could have removed the scheme sysfs dir */
- if (schemes_idx >= sysfs_schemes->nr)
- break;
-
- sysfs_stats = sysfs_schemes->schemes_arr[schemes_idx++]->stats;
- sysfs_stats->nr_tried = scheme->stat.nr_tried;
- sysfs_stats->sz_tried = scheme->stat.sz_tried;
- sysfs_stats->nr_applied = scheme->stat.nr_applied;
- sysfs_stats->sz_applied = scheme->stat.sz_applied;
- sysfs_stats->qt_exceeds = scheme->stat.qt_exceeds;
- }
-}
-
 /*
 * damon_sysfs_upd_schemes_stats() - Update schemes stats sysfs files.
 * @kdamond: The kobject wrapper that associated to the kdamond thread.

From 3c0bc73f6e2d8f625fa571decabb0cc1706a7485 Mon Sep 17 00:00:00 2001
From: SeongJae Park
Date: Wed, 26 Oct 2022 22:59:42 +0000
Subject: [PATCH 028/139] UPSTREAM: mm/damon/modules: deduplicate init steps for DAMON context setup

DAMON_RECLAIM and DAMON_LRU_SORT have duplicated code for DAMON context and target initializations. Deduplicate the part by implementing a function for the initialization in 'modules-common.c' and using it.
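As a rough sketch of the resulting pattern (not part of this patch; 'my_module_init' and 'my_after_aggregation' are hypothetical stand-ins for each module's own init function and callback), a module built on the new helper boils down to:

	static struct damon_ctx *ctx;
	static struct damon_target *target;

	static int __init my_module_init(void)
	{
		/* allocate a physical address space context with one target */
		int err = damon_modules_new_paddr_ctx_target(&ctx, &target);

		if (err)
			return err;
		ctx->callback.after_aggregation = my_after_aggregation;
		return 0;
	}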
Link: https://lkml.kernel.org/r/20221026225943.100429-12-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton (cherry picked from commit 7ae2c17f53d5054d1fe5c1a103ad46068034617d) Bug: 300502883 Change-Id: I3679fce4da1d68f3ba49bb623d1b1b69b0cb6bab Signed-off-by: cui yangpei --- mm/damon/Makefile | 4 ++-- mm/damon/lru_sort.c | 17 +++------------- mm/damon/modules-common.c | 42 +++++++++++++++++++++++++++++++++++++++ mm/damon/modules-common.h | 3 +++ mm/damon/reclaim.c | 17 +++------------- 5 files changed, 53 insertions(+), 30 deletions(-) create mode 100644 mm/damon/modules-common.c diff --git a/mm/damon/Makefile b/mm/damon/Makefile index 1e86f5253d7f..f7add3f4aa79 100644 --- a/mm/damon/Makefile +++ b/mm/damon/Makefile @@ -5,5 +5,5 @@ obj-$(CONFIG_DAMON_VADDR) += ops-common.o vaddr.o obj-$(CONFIG_DAMON_PADDR) += ops-common.o paddr.o obj-$(CONFIG_DAMON_SYSFS) += sysfs-common.o sysfs-schemes.o sysfs.o obj-$(CONFIG_DAMON_DBGFS) += dbgfs.o -obj-$(CONFIG_DAMON_RECLAIM) += reclaim.o -obj-$(CONFIG_DAMON_LRU_SORT) += lru_sort.o +obj-$(CONFIG_DAMON_RECLAIM) += modules-common.o reclaim.o +obj-$(CONFIG_DAMON_LRU_SORT) += modules-common.o lru_sort.o diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index efbc2bda8b9c..a1896c5acfe9 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -314,25 +314,14 @@ static int damon_lru_sort_after_wmarks_check(struct damon_ctx *c) static int __init damon_lru_sort_init(void) { - ctx = damon_new_ctx(); - if (!ctx) - return -ENOMEM; + int err = damon_modules_new_paddr_ctx_target(&ctx, &target); - if (damon_select_ops(ctx, DAMON_OPS_PADDR)) { - damon_destroy_ctx(ctx); - return -EINVAL; - } + if (err) + return err; ctx->callback.after_wmarks_check = damon_lru_sort_after_wmarks_check; ctx->callback.after_aggregation = damon_lru_sort_after_aggregation; - target = damon_new_target(); - if (!target) { - damon_destroy_ctx(ctx); - return -ENOMEM; - } - damon_add_target(ctx, target); - schedule_delayed_work(&damon_lru_sort_timer, 0); damon_lru_sort_initialized = true; diff --git a/mm/damon/modules-common.c b/mm/damon/modules-common.c new file mode 100644 index 000000000000..b2381a8466ec --- /dev/null +++ b/mm/damon/modules-common.c @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Common Primitives for DAMON Modules + * + * Author: SeongJae Park + */ + +#include + +#include "modules-common.h" + +/* + * Allocate, set, and return a DAMON context for the physical address space. 
+ * @ctxp: Pointer to save the point to the newly created context + * @targetp: Pointer to save the point to the newly created target + */ +int damon_modules_new_paddr_ctx_target(struct damon_ctx **ctxp, + struct damon_target **targetp) +{ + struct damon_ctx *ctx; + struct damon_target *target; + + ctx = damon_new_ctx(); + if (!ctx) + return -ENOMEM; + + if (damon_select_ops(ctx, DAMON_OPS_PADDR)) { + damon_destroy_ctx(ctx); + return -EINVAL; + } + + target = damon_new_target(); + if (!target) { + damon_destroy_ctx(ctx); + return -ENOMEM; + } + damon_add_target(ctx, target); + + *ctxp = ctx; + *targetp = target; + return 0; +} diff --git a/mm/damon/modules-common.h b/mm/damon/modules-common.h index 5a4921851d32..f49cdb417005 100644 --- a/mm/damon/modules-common.h +++ b/mm/damon/modules-common.h @@ -44,3 +44,6 @@ 0400); \ module_param_named(nr_##qt_exceed_name, stat.qt_exceeds, ulong, \ 0400); + +int damon_modules_new_paddr_ctx_target(struct damon_ctx **ctxp, + struct damon_target **targetp); diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 162c9b1ca00f..3173f373435c 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -256,25 +256,14 @@ static int damon_reclaim_after_wmarks_check(struct damon_ctx *c) static int __init damon_reclaim_init(void) { - ctx = damon_new_ctx(); - if (!ctx) - return -ENOMEM; + int err = damon_modules_new_paddr_ctx_target(&ctx, &target); - if (damon_select_ops(ctx, DAMON_OPS_PADDR)) { - damon_destroy_ctx(ctx); - return -EINVAL; - } + if (err) + return err; ctx->callback.after_wmarks_check = damon_reclaim_after_wmarks_check; ctx->callback.after_aggregation = damon_reclaim_after_aggregation; - target = damon_new_target(); - if (!target) { - damon_destroy_ctx(ctx); - return -ENOMEM; - } - damon_add_target(ctx, target); - schedule_delayed_work(&damon_reclaim_timer, 0); damon_reclaim_initialized = true; From 4e2d3f8e31d925e3edb1e1d05df2137c1d9e6725 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 26 Oct 2022 22:59:43 +0000 Subject: [PATCH 029/139] UPSTREAM: mm/damon/{reclaim,lru_sort}: remove unnecessarily included headers Some headers that 'reclaim.c' and 'lru_sort.c' are including are unnecessary now owing to previous cleanups and refactorings. Remove those. Link: https://lkml.kernel.org/r/20221026225943.100429-13-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton (cherry picked from commit b0d3dbd1b98660ec2154fccbd21c13916c967c05) Bug: 300502883 Change-Id: I66ffcdfe7276261f5de14d8d794a1dd6b5312caf Signed-off-by: cui yangpei --- mm/damon/lru_sort.c | 2 -- mm/damon/reclaim.c | 2 -- 2 files changed, 4 deletions(-) diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index a1896c5acfe9..5c60163e556c 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -8,9 +8,7 @@ #define pr_fmt(fmt) "damon-lru-sort: " fmt #include -#include #include -#include #include #include "modules-common.h" diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 3173f373435c..e14eb30c01f4 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -8,9 +8,7 @@ #define pr_fmt(fmt) "damon-reclaim: " fmt #include -#include #include -#include #include #include "modules-common.h" From 540e9b850d4abf29dcc30631c2a517111028abed Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 25 Oct 2022 17:36:47 +0000 Subject: [PATCH 030/139] UPSTREAM: mm/damon/reclaim: enable and disable synchronously Patch series "mm/damon/reclaim,lru_sort: enable/disable synchronously". 
Writing a value to DAMON_RECLAIM and DAMON_LRU_SORT's 'enabled' parameters turns on or off DAMON in an asynchronous way. This means the parameter cannot be used to read their current status. The 'kdamond_pid' parameter should be used instead for the purpose. The documentation can easily be read as saying it works in a synchronous way, so it is a little bit confusing. It also makes the user space tooling dirty.

There's no real reason to have the asynchronous behavior, though. Simply make the parameter work synchronously, rather than updating the document.

The first and second patches change the behavior of the 'enabled' parameter for DAMON_RECLAIM and add a selftest for the changed behavior, respectively. The following two patches make the same changes for DAMON_LRU_SORT.

This patch (of 4):

Writing a value to DAMON_RECLAIM's 'enabled' parameter turns on or off DAMON in an asynchronous way. This means the parameter cannot be used to read the current status of DAMON_RECLAIM. The 'kdamond_pid' parameter should be used instead for the purpose. The documentation can easily be read as saying it works in a synchronous way, so it is a little bit confusing. It also makes the user space tooling dirty.

There's no real reason to have the asynchronous behavior, though. Simply make the parameter work synchronously, rather than updating the document.

Link: https://lkml.kernel.org/r/20221025173650.90624-1-sj@kernel.org
Link: https://lkml.kernel.org/r/20221025173650.90624-2-sj@kernel.org
Signed-off-by: SeongJae Park
Cc: Shuah Khan
Signed-off-by: Andrew Morton
(cherry picked from commit 04e98764befa371836a78b2b489e8b931a3a9e9a)
Bug: 300502883
Change-Id: If3ea45d1a07d57a3be47317886b17b61f62d5bcf
Signed-off-by: cui yangpei
---
 mm/damon/reclaim.c | 53 ++++++++++++++++++++--------------------
 1 file changed, 23 insertions(+), 30 deletions(-)

diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c
index e14eb30c01f4..e57604bec06d 100644
--- a/mm/damon/reclaim.c
+++ b/mm/damon/reclaim.c
@@ -9,7 +9,6 @@
 #include
 #include
-#include
 #include "modules-common.h"
@@ -181,38 +180,31 @@ static int damon_reclaim_turn(bool on)
 return 0;
 }
-static struct delayed_work damon_reclaim_timer;
-static void damon_reclaim_timer_fn(struct work_struct *work)
-{
- static bool last_enabled;
- bool now_enabled;
-
- now_enabled = enabled;
- if (last_enabled != now_enabled) {
- if (!damon_reclaim_turn(now_enabled))
- last_enabled = now_enabled;
- else
- enabled = last_enabled;
- }
-}
-static DECLARE_DELAYED_WORK(damon_reclaim_timer, damon_reclaim_timer_fn);
-
-static bool damon_reclaim_initialized;
-
 static int damon_reclaim_enabled_store(const char *val,
 const struct kernel_param *kp)
 {
- int rc = param_set_bool(val, kp);
+ bool is_enabled = enabled;
+ bool enable;
+ int err;
- if (rc < 0)
- return rc;
+ err = strtobool(val, &enable);
+ if (err)
+ return err;
- /* system_wq might not initialized yet */
- if (!damon_reclaim_initialized)
- return rc;
+ if (is_enabled == enable)
+ return 0;
- schedule_delayed_work(&damon_reclaim_timer, 0);
- return 0;
+ /* Called before init function. The function will handle this.
*/
+ if (!ctx)
+ goto set_param_out;
+
+ err = damon_reclaim_turn(enable);
+ if (err)
+ return err;
+
+set_param_out:
+ enabled = enable;
+ return err;
 }
 static const struct kernel_param_ops enabled_param_ops = {
@@ -262,10 +254,11 @@ static int __init damon_reclaim_init(void)
 ctx->callback.after_wmarks_check = damon_reclaim_after_wmarks_check;
 ctx->callback.after_aggregation = damon_reclaim_after_aggregation;
- schedule_delayed_work(&damon_reclaim_timer, 0);
+ /* 'enabled' has set before this function, probably via command line */
+ if (enabled)
+ err = damon_reclaim_turn(true);
- damon_reclaim_initialized = true;
- return 0;
+ return err;
 }
 module_init(damon_reclaim_init);

From 6547a97f321cb1b08e8b030f720f736adc12b6fa Mon Sep 17 00:00:00 2001
From: SeongJae Park
Date: Tue, 25 Oct 2022 17:36:49 +0000
Subject: [PATCH 031/139] UPSTREAM: mm/damon/lru_sort: enable and disable synchronously

Writing a value to DAMON_RECLAIM's 'enabled' parameter turns on or off DAMON in an asynchronous way. This means the parameter cannot be used to read the current status of DAMON_RECLAIM. The 'kdamond_pid' parameter should be used instead for the purpose. The documentation can easily be read as saying it works in a synchronous way, so it is a little bit confusing. It also makes the user space tooling dirty.

There's no real reason to have the asynchronous behavior, though. Simply make the parameter work synchronously, rather than updating the document.

Link: https://lkml.kernel.org/r/20221025173650.90624-4-sj@kernel.org
Signed-off-by: SeongJae Park
Cc: Shuah Khan
Signed-off-by: Andrew Morton
(cherry picked from commit 7a034fbba3361e94956431d17660d7c5674d13c3)
Bug: 300502883
Change-Id: Iaabcbb45e4fe5dfe6781407b23d82657988c53d8
Signed-off-by: cui yangpei
---
 mm/damon/lru_sort.c | 51 +++++++++++++++++++--------------------
 1 file changed, 22 insertions(+), 29 deletions(-)

diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c
index 5c60163e556c..2a532e3983df 100644
--- a/mm/damon/lru_sort.c
+++ b/mm/damon/lru_sort.c
@@ -9,7 +9,6 @@
 #include
 #include
-#include
 #include "modules-common.h"
@@ -235,38 +234,31 @@ static int damon_lru_sort_turn(bool on)
 return 0;
 }
-static struct delayed_work damon_lru_sort_timer;
-static void damon_lru_sort_timer_fn(struct work_struct *work)
-{
- static bool last_enabled;
- bool now_enabled;
-
- now_enabled = enabled;
- if (last_enabled != now_enabled) {
- if (!damon_lru_sort_turn(now_enabled))
- last_enabled = now_enabled;
- else
- enabled = last_enabled;
- }
-}
-static DECLARE_DELAYED_WORK(damon_lru_sort_timer, damon_lru_sort_timer_fn);
-
-static bool damon_lru_sort_initialized;
-
 static int damon_lru_sort_enabled_store(const char *val,
 const struct kernel_param *kp)
 {
- int rc = param_set_bool(val, kp);
+ bool is_enabled = enabled;
+ bool enable;
+ int err;
- if (rc < 0)
- return rc;
+ err = strtobool(val, &enable);
+ if (err)
+ return err;
- if (!damon_lru_sort_initialized)
- return rc;
+ if (is_enabled == enable)
+ return 0;
- schedule_delayed_work(&damon_lru_sort_timer, 0);
+ /* Called before init function. The function will handle this.
*/
+ if (!ctx)
+ goto set_param_out;
- return 0;
+
+ err = damon_lru_sort_turn(enable);
+ if (err)
+ return err;
+
+set_param_out:
+ enabled = enable;
+ return err;
 }
 static const struct kernel_param_ops enabled_param_ops = {
@@ -320,10 +312,11 @@ static int __init damon_lru_sort_init(void)
 ctx->callback.after_wmarks_check = damon_lru_sort_after_wmarks_check;
 ctx->callback.after_aggregation = damon_lru_sort_after_aggregation;
- schedule_delayed_work(&damon_lru_sort_timer, 0);
+ /* 'enabled' has set before this function, probably via command line */
+ if (enabled)
+ err = damon_lru_sort_turn(true);
- damon_lru_sort_initialized = true;
- return 0;
+ return err;
 }
 module_init(damon_lru_sort_init);

From b5d1f3576b71eda9569c41a3dae8d9538ab27444 Mon Sep 17 00:00:00 2001
From: SeongJae Park
Date: Tue, 1 Nov 2022 22:03:21 +0000
Subject: [PATCH 032/139] UPSTREAM: mm/damon/core: add a callback for scheme target regions check

Patch series "efficiently expose damos action tried regions information".

DAMON users can retrieve the monitoring results via 'after_aggregation' callbacks if the user is using the kernel API, or the 'damon_aggregated' tracepoint if the user is in the user space. Those are useful if full monitoring results are necessary. However, if the user has interest in only a snapshot of the results for some regions having a specific access pattern, the interfaces could be inefficient. For example, some users only want to know which memory regions are not accessed for more than a specific time at the moment.

Also, some DAMOS users would want to know exactly to which memory regions the schemes' actions were tried to be applied, for debugging or tuning. As DAMOS has its internal mechanism for quota and regions prioritization, the users would need to simulate DAMOS' mechanism against the monitoring results. That's unnecessarily complex.

This patchset implements DAMON kernel API callbacks and a sysfs directory for efficient exposure of the information for the use cases. The new callback will be called for each region when a DAMOS action is about to be applied to it. The sysfs directory will be called 'tried_regions' and placed under each scheme sysfs directory. Users can write a special keyword, 'update_schemes_regions', to the 'state' file of a kdamond sysfs directory. Then, the DAMON sysfs interface will fill the directory with the information of the regions that the corresponding scheme's action was tried to be applied to, for the next aggregation interval.

Patches Sequence
----------------

The first one (patch 1) implements the callback for the kernel space users. The following two patches (patches 2 and 3) implement sysfs directories for the information and its sub directories. Two patches (patches 4 and 5) implementing the special keywords for filling the data into, and cleaning up, the directories follow. Patch 6 adds a selftest for the new sysfs directory. Finally, two patches (patches 7 and 8) document the new feature in the administrator guide and the ABI document.

This patch (of 8):

Getting DAMON monitoring results of only a specific access pattern (e.g., getting address ranges of memory that is not accessed at all for two minutes) can be useful for efficient monitoring of the system. The information can also be helpful for deep level investigation of DAMON-based operation schemes. For that, users need to record (in case of the user space users) or iterate (in case of the kernel space users) the full monitoring results and filter them for the specific access pattern.
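For kernel space users, that iteration amounts to something like the sketch below ('LONG_ENOUGH_AGGRS' is an assumed, user-defined age threshold in aggregation intervals, not part of this patch):

	static int my_after_aggregation(struct damon_ctx *c)
	{
		struct damon_target *t;
		struct damon_region *r;

		damon_for_each_target(t, c) {
			damon_for_each_region(r, t) {
				/* keep only regions not accessed for long enough */
				if (r->nr_accesses == 0 &&
						r->age >= LONG_ENOUGH_AGGRS)
					pr_info("%lu-%lu\n",
							r->ar.start, r->ar.end);
			}
		}
		return 0;
	}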
In the case of DAMOS investigation, users will even need to simulate DAMOS' quota and prioritization mechanisms. That is inefficient and complex. Add a new DAMON callback that will be called before each scheme is applied to each region. Using it, DAMON kernel API users will be able to do query-like monitoring results collection, or DAMOS investigation, in an efficient and simple way. Commits for providing the capability to user space users will follow. Link: https://lkml.kernel.org/r/20221101220328.95765-1-sj@kernel.org Link: https://lkml.kernel.org/r/20221101220328.95765-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton (cherry picked from commit 44467bbb7e81ebcef2a5bfc9d6546bf7cd015374) Bug: 300502883 Change-Id: I21ff3c9cf6c30e113f78883e5063bcb898506b41 Signed-off-by: cui yangpei --- include/linux/damon.h | 5 +++++ mm/damon/core.c | 6 +++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 620ada094c3b..35630634d790 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -357,6 +357,7 @@ struct damon_operations { * @after_wmarks_check: Called after each schemes' watermarks check. * @after_sampling: Called after each sampling. * @after_aggregation: Called after each aggregation. + * @before_damos_apply: Called before applying DAMOS action. * @before_terminate: Called before terminating the monitoring. * @private: User private data. * @@ -385,6 +386,10 @@ struct damon_callback { int (*after_wmarks_check)(struct damon_ctx *context); int (*after_sampling)(struct damon_ctx *context); int (*after_aggregation)(struct damon_ctx *context); + int (*before_damos_apply)(struct damon_ctx *context, + struct damon_target *target, + struct damon_region *region, + struct damos *scheme); void (*before_terminate)(struct damon_ctx *context); }; diff --git a/mm/damon/core.c b/mm/damon/core.c index 80d5937fe337..ceec75b88ef9 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -772,6 +772,7 @@ static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t, unsigned long sz = damon_sz_region(r); struct timespec64 begin, end; unsigned long sz_applied = 0; + int err = 0; if (c->ops.apply_scheme) { if (quota->esz && quota->charged_sz + sz > quota->esz) { @@ -782,7 +783,10 @@ static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t, damon_split_region_at(t, r, sz); } ktime_get_coarse_ts64(&begin); - sz_applied = c->ops.apply_scheme(c, t, r, s); + if (c->callback.before_damos_apply) + err = c->callback.before_damos_apply(c, t, r, s); + if (!err) + sz_applied = c->ops.apply_scheme(c, t, r, s); ktime_get_coarse_ts64(&end); quota->total_charged_ns += timespec64_to_ns(&end) - timespec64_to_ns(&begin); From b4c34cc1689700c49e0ce371b13c695981f66d5c Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 1 Nov 2022 22:03:22 +0000 Subject: [PATCH 033/139] UPSTREAM: mm/damon/sysfs-schemes: implement schemes/tried_regions directory For efficient and simple query-like DAMON monitoring results reading and deep level investigations of DAMOS, DAMON kernel API (include/linux/damon.h) users can use the 'before_damos_apply' DAMON callback. However, DAMON sysfs interface users don't have such an option. Add a directory, namely 'tried_regions', under each scheme directory, to use as the interface for the purpose. Note that this commit implements only the directory, not the data filling.
After the data filling change is made, users will be able to signal DAMON to fill the directory with the regions that corresponding scheme has tried to be applied. By setting the access pattern of the scheme, users could do the efficient query-like monitoring. Link: https://lkml.kernel.org/r/20221101220328.95765-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton (cherry picked from commit 5181b75f438d2e5b7f27bf48c6ea88a87c2882b7) Bug: 300502883 Change-Id: Idc7a1fca201b90f8fea62899f1e6b500bb8e14e1 Signed-off-by: cui yangpei --- mm/damon/sysfs-schemes.c | 57 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 9509d5c1e7fc..500759d8b20c 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -9,6 +9,36 @@ #include "sysfs-common.h" +/* + * scheme regions directory + */ + +struct damon_sysfs_scheme_regions { + struct kobject kobj; +}; + +static struct damon_sysfs_scheme_regions * +damon_sysfs_scheme_regions_alloc(void) +{ + return kzalloc(sizeof(struct damon_sysfs_scheme_regions), GFP_KERNEL); +} + +static void damon_sysfs_scheme_regions_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_scheme_regions, kobj)); +} + +static struct attribute *damon_sysfs_scheme_regions_attrs[] = { + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_scheme_regions); + +static struct kobj_type damon_sysfs_scheme_regions_ktype = { + .release = damon_sysfs_scheme_regions_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_scheme_regions_groups, +}; + /* * schemes/stats directory */ @@ -635,6 +665,7 @@ struct damon_sysfs_scheme { struct damon_sysfs_quotas *quotas; struct damon_sysfs_watermarks *watermarks; struct damon_sysfs_stats *stats; + struct damon_sysfs_scheme_regions *tried_regions; }; /* This should match with enum damos_action */ @@ -743,6 +774,25 @@ static int damon_sysfs_scheme_set_stats(struct damon_sysfs_scheme *scheme) return err; } +static int damon_sysfs_scheme_set_tried_regions( + struct damon_sysfs_scheme *scheme) +{ + struct damon_sysfs_scheme_regions *tried_regions = + damon_sysfs_scheme_regions_alloc(); + int err; + + if (!tried_regions) + return -ENOMEM; + err = kobject_init_and_add(&tried_regions->kobj, + &damon_sysfs_scheme_regions_ktype, &scheme->kobj, + "tried_regions"); + if (err) + kobject_put(&tried_regions->kobj); + else + scheme->tried_regions = tried_regions; + return err; +} + static int damon_sysfs_scheme_add_dirs(struct damon_sysfs_scheme *scheme) { int err; @@ -759,8 +809,14 @@ static int damon_sysfs_scheme_add_dirs(struct damon_sysfs_scheme *scheme) err = damon_sysfs_scheme_set_stats(scheme); if (err) goto put_watermarks_quotas_access_pattern_out; + err = damon_sysfs_scheme_set_tried_regions(scheme); + if (err) + goto put_tried_regions_out; return 0; +put_tried_regions_out: + kobject_put(&scheme->tried_regions->kobj); + scheme->tried_regions = NULL; put_watermarks_quotas_access_pattern_out: kobject_put(&scheme->watermarks->kobj); scheme->watermarks = NULL; @@ -781,6 +837,7 @@ static void damon_sysfs_scheme_rm_dirs(struct damon_sysfs_scheme *scheme) kobject_put(&scheme->quotas->kobj); kobject_put(&scheme->watermarks->kobj); kobject_put(&scheme->stats->kobj); + kobject_put(&scheme->tried_regions->kobj); } static ssize_t action_show(struct kobject *kobj, struct kobj_attribute *attr, From 3421250b35221bad4e65440ac2be0831767c5ba6 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: 
Tue, 1 Nov 2022 22:03:23 +0000 Subject: [PATCH 034/139] UPSTREAM: mm/damon/sysfs-schemes: implement scheme region directory Implement region directories under 'tried_regions' directory of each scheme DAMON sysfs directory. This directory will provide the address range, the monitored access frequency ('nr_accesses'), and the age of each DAMON region that corresponding DAMON-based operation scheme has tried to be applied. Note that this commit doesn't implement the code for filling the data but only the sysfs directory. Link: https://lkml.kernel.org/r/20221101220328.95765-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton (cherry picked from commit 9277d0367ba18ef4bb98bafb1209e715844cdf7e) Bug: 300502883 Change-Id: I69c9010a8fce2fa61a1d27f2964ac7bc7b85dd44 Signed-off-by: cui yangpei --- mm/damon/sysfs-schemes.c | 123 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 122 insertions(+), 1 deletion(-) diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 500759d8b20c..f0b5ad7e721d 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -9,18 +9,138 @@ #include "sysfs-common.h" +/* + * scheme region directory + */ + +struct damon_sysfs_scheme_region { + struct kobject kobj; + struct damon_addr_range ar; + unsigned int nr_accesses; + unsigned int age; + struct list_head list; +}; + +static struct damon_sysfs_scheme_region *damon_sysfs_scheme_region_alloc( + struct damon_region *region) +{ + struct damon_sysfs_scheme_region *sysfs_region = kmalloc( + sizeof(*sysfs_region), GFP_KERNEL); + + if (!sysfs_region) + return NULL; + sysfs_region->kobj = (struct kobject){}; + sysfs_region->ar = region->ar; + sysfs_region->nr_accesses = region->nr_accesses; + sysfs_region->age = region->age; + INIT_LIST_HEAD(&sysfs_region->list); + return sysfs_region; +} + +static ssize_t start_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_scheme_region *region = container_of(kobj, + struct damon_sysfs_scheme_region, kobj); + + return sysfs_emit(buf, "%lu\n", region->ar.start); +} + +static ssize_t end_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_scheme_region *region = container_of(kobj, + struct damon_sysfs_scheme_region, kobj); + + return sysfs_emit(buf, "%lu\n", region->ar.end); +} + +static ssize_t nr_accesses_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_scheme_region *region = container_of(kobj, + struct damon_sysfs_scheme_region, kobj); + + return sysfs_emit(buf, "%u\n", region->nr_accesses); +} + +static ssize_t age_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_scheme_region *region = container_of(kobj, + struct damon_sysfs_scheme_region, kobj); + + return sysfs_emit(buf, "%u\n", region->age); +} + +static void damon_sysfs_scheme_region_release(struct kobject *kobj) +{ + struct damon_sysfs_scheme_region *region = container_of(kobj, + struct damon_sysfs_scheme_region, kobj); + + list_del(®ion->list); + kfree(region); +} + +static struct kobj_attribute damon_sysfs_scheme_region_start_attr = + __ATTR_RO_MODE(start, 0400); + +static struct kobj_attribute damon_sysfs_scheme_region_end_attr = + __ATTR_RO_MODE(end, 0400); + +static struct kobj_attribute damon_sysfs_scheme_region_nr_accesses_attr = + __ATTR_RO_MODE(nr_accesses, 0400); + +static struct kobj_attribute damon_sysfs_scheme_region_age_attr = + __ATTR_RO_MODE(age, 0400); + +static 
struct attribute *damon_sysfs_scheme_region_attrs[] = { + &damon_sysfs_scheme_region_start_attr.attr, + &damon_sysfs_scheme_region_end_attr.attr, + &damon_sysfs_scheme_region_nr_accesses_attr.attr, + &damon_sysfs_scheme_region_age_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_scheme_region); + +static struct kobj_type damon_sysfs_scheme_region_ktype = { + .release = damon_sysfs_scheme_region_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_scheme_region_groups, +}; + /* * scheme regions directory */ struct damon_sysfs_scheme_regions { struct kobject kobj; + struct list_head regions_list; + int nr_regions; }; static struct damon_sysfs_scheme_regions * damon_sysfs_scheme_regions_alloc(void) { - return kzalloc(sizeof(struct damon_sysfs_scheme_regions), GFP_KERNEL); + struct damon_sysfs_scheme_regions *regions = kmalloc(sizeof(*regions), + GFP_KERNEL); + + regions->kobj = (struct kobject){}; + INIT_LIST_HEAD(®ions->regions_list); + regions->nr_regions = 0; + return regions; +} + +static void damon_sysfs_scheme_regions_rm_dirs( + struct damon_sysfs_scheme_regions *regions) +{ + struct damon_sysfs_scheme_region *r, *next; + + list_for_each_entry_safe(r, next, ®ions->regions_list, list) { + /* release function deletes it from the list */ + kobject_put(&r->kobj); + regions->nr_regions--; + } } static void damon_sysfs_scheme_regions_release(struct kobject *kobj) @@ -837,6 +957,7 @@ static void damon_sysfs_scheme_rm_dirs(struct damon_sysfs_scheme *scheme) kobject_put(&scheme->quotas->kobj); kobject_put(&scheme->watermarks->kobj); kobject_put(&scheme->stats->kobj); + damon_sysfs_scheme_regions_rm_dirs(scheme->tried_regions); kobject_put(&scheme->tried_regions->kobj); } From 80ccab9b0e4e326132976d42b33ef57bb85d486e Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 1 Nov 2022 22:03:24 +0000 Subject: [PATCH 035/139] UPSTREAM: mm/damon/sysfs: implement DAMOS tried regions update command Implement the code for filling the data of 'tried_regions' DAMON sysfs directory. With this commit, DAMON sysfs interface users can write a special keyword, 'update_schemes_tried_regions' to the corresponding 'state' file of the kdamond. Then, DAMON sysfs interface will collect the tried regions information using the 'before_damos_apply()' callback for one aggregation interval and populate scheme region directories with the values. 
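As a rough sketch of how such a keyword write is dispatched (an assumed shape based on the command string table added by this series; the real handler additionally records the request and waits for the kdamond callback to service it):

/* sketch: map a string written to 'state' to a DAMON sysfs command */
static enum damon_sysfs_cmd example_parse_cmd(const char *buf)
{
	enum damon_sysfs_cmd cmd;

	for (cmd = 0; cmd < NR_DAMON_SYSFS_CMDS; cmd++) {
		if (sysfs_streq(buf, damon_sysfs_cmd_strs[cmd]))
			return cmd;
	}
	return NR_DAMON_SYSFS_CMDS;	/* no such command */
}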
[sj@kernel.org: skip tried regions update if the scheme directory was removed] Link: https://lkml.kernel.org/r/20221114182954.4745-2-sj@kernel.org Link: https://lkml.kernel.org/r/20221101220328.95765-5-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton (cherry picked from commit f1d13cacabe140305844879e495ca67837e059cc) Bug: 300502883 Change-Id: I6749b8dc75023a9a3f3dc64902196b07fa523267 Signed-off-by: cui yangpei --- mm/damon/sysfs-common.h | 6 +++ mm/damon/sysfs-schemes.c | 80 ++++++++++++++++++++++++++++++++++++++++ mm/damon/sysfs.c | 57 +++++++++++++++++++++++++++- 3 files changed, 141 insertions(+), 2 deletions(-) diff --git a/mm/damon/sysfs-common.h b/mm/damon/sysfs-common.h index 4626b2784404..634a6e7fca78 100644 --- a/mm/damon/sysfs-common.h +++ b/mm/damon/sysfs-common.h @@ -44,3 +44,9 @@ int damon_sysfs_set_schemes(struct damon_ctx *ctx, void damon_sysfs_schemes_update_stats( struct damon_sysfs_schemes *sysfs_schemes, struct damon_ctx *ctx); + +int damon_sysfs_schemes_update_regions_start( + struct damon_sysfs_schemes *sysfs_schemes, + struct damon_ctx *ctx); + +int damon_sysfs_schemes_update_regions_stop(struct damon_ctx *ctx); diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index f0b5ad7e721d..5f14f18bcc49 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -1244,3 +1244,83 @@ void damon_sysfs_schemes_update_stats( sysfs_stats->qt_exceeds = scheme->stat.qt_exceeds; } } + +/* + * damon_sysfs_schemes that need to update its schemes regions dir. Protected + * by damon_sysfs_lock + */ +static struct damon_sysfs_schemes *damon_sysfs_schemes_for_damos_callback; +static int damon_sysfs_schemes_region_idx; + +/* + * DAMON callback that called before damos apply. While this callback is + * registered, damon_sysfs_lock should be held to ensure the regions + * directories exist. 
+ */ +static int damon_sysfs_before_damos_apply(struct damon_ctx *ctx, + struct damon_target *t, struct damon_region *r, + struct damos *s) +{ + struct damos *scheme; + struct damon_sysfs_scheme_regions *sysfs_regions; + struct damon_sysfs_scheme_region *region; + struct damon_sysfs_schemes *sysfs_schemes = + damon_sysfs_schemes_for_damos_callback; + int schemes_idx = 0; + + damon_for_each_scheme(scheme, ctx) { + if (scheme == s) + break; + schemes_idx++; + } + + /* user could have removed the scheme sysfs dir */ + if (schemes_idx >= sysfs_schemes->nr) + return 0; + + sysfs_regions = sysfs_schemes->schemes_arr[schemes_idx]->tried_regions; + region = damon_sysfs_scheme_region_alloc(r); + list_add_tail(®ion->list, &sysfs_regions->regions_list); + sysfs_regions->nr_regions++; + if (kobject_init_and_add(®ion->kobj, + &damon_sysfs_scheme_region_ktype, + &sysfs_regions->kobj, "%d", + damon_sysfs_schemes_region_idx++)) { + kobject_put(®ion->kobj); + } + return 0; +} + +/* Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock */ +int damon_sysfs_schemes_update_regions_start( + struct damon_sysfs_schemes *sysfs_schemes, + struct damon_ctx *ctx) +{ + struct damos *scheme; + int schemes_idx = 0; + + damon_for_each_scheme(scheme, ctx) { + struct damon_sysfs_scheme *sysfs_scheme; + + sysfs_scheme = sysfs_schemes->schemes_arr[schemes_idx++]; + damon_sysfs_scheme_regions_rm_dirs( + sysfs_scheme->tried_regions); + } + + damon_sysfs_schemes_for_damos_callback = sysfs_schemes; + ctx->callback.before_damos_apply = damon_sysfs_before_damos_apply; + return 0; +} + +/* + * Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock. Caller + * should unlock damon_sysfs_lock which held before + * damon_sysfs_schemes_update_regions_start() + */ +int damon_sysfs_schemes_update_regions_stop(struct damon_ctx *ctx) +{ + damon_sysfs_schemes_for_damos_callback = NULL; + ctx->callback.before_damos_apply = NULL; + damon_sysfs_schemes_region_idx = 0; + return 0; +} diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 284daf274b3e..ffb5a84059d7 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -999,6 +999,11 @@ enum damon_sysfs_cmd { * files. */ DAMON_SYSFS_CMD_UPDATE_SCHEMES_STATS, + /* + * @DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS: Update schemes tried + * regions + */ + DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS, /* * @NR_DAMON_SYSFS_CMDS: Total number of DAMON sysfs commands. 
*/ @@ -1011,6 +1016,7 @@ static const char * const damon_sysfs_cmd_strs[] = { "off", "commit", "update_schemes_stats", + "update_schemes_tried_regions", }; /* @@ -1193,6 +1199,16 @@ static int damon_sysfs_set_targets(struct damon_ctx *ctx, static void damon_sysfs_before_terminate(struct damon_ctx *ctx) { struct damon_target *t, *next; + struct damon_sysfs_kdamond *kdamond; + + /* damon_sysfs_schemes_update_regions_stop() might not yet called */ + kdamond = damon_sysfs_cmd_request.kdamond; + if (kdamond && damon_sysfs_cmd_request.cmd == + DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS && + ctx == kdamond->damon_ctx) { + damon_sysfs_schemes_update_regions_stop(ctx); + mutex_unlock(&damon_sysfs_lock); + } if (!damon_target_has_pid(ctx)) return; @@ -1225,6 +1241,27 @@ static int damon_sysfs_upd_schemes_stats(struct damon_sysfs_kdamond *kdamond) return 0; } +static int damon_sysfs_upd_schemes_regions_start( + struct damon_sysfs_kdamond *kdamond) +{ + struct damon_ctx *ctx = kdamond->damon_ctx; + + if (!ctx) + return -EINVAL; + return damon_sysfs_schemes_update_regions_start( + kdamond->contexts->contexts_arr[0]->schemes, ctx); +} + +static int damon_sysfs_upd_schemes_regions_stop( + struct damon_sysfs_kdamond *kdamond) +{ + struct damon_ctx *ctx = kdamond->damon_ctx; + + if (!ctx) + return -EINVAL; + return damon_sysfs_schemes_update_regions_stop(ctx); +} + static inline bool damon_sysfs_kdamond_running( struct damon_sysfs_kdamond *kdamond) { @@ -1277,10 +1314,12 @@ static int damon_sysfs_commit_input(struct damon_sysfs_kdamond *kdamond) static int damon_sysfs_cmd_request_callback(struct damon_ctx *c) { struct damon_sysfs_kdamond *kdamond; + static bool damon_sysfs_schemes_regions_updating; int err = 0; /* avoid deadlock due to concurrent state_store('off') */ - if (!mutex_trylock(&damon_sysfs_lock)) + if (!damon_sysfs_schemes_regions_updating && + !mutex_trylock(&damon_sysfs_lock)) return 0; kdamond = damon_sysfs_cmd_request.kdamond; if (!kdamond || kdamond->damon_ctx != c) @@ -1292,13 +1331,27 @@ static int damon_sysfs_cmd_request_callback(struct damon_ctx *c) case DAMON_SYSFS_CMD_COMMIT: err = damon_sysfs_commit_input(kdamond); break; + case DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS: + if (!damon_sysfs_schemes_regions_updating) { + err = damon_sysfs_upd_schemes_regions_start(kdamond); + if (!err) { + damon_sysfs_schemes_regions_updating = true; + goto keep_lock_out; + } + } else { + err = damon_sysfs_upd_schemes_regions_stop(kdamond); + damon_sysfs_schemes_regions_updating = false; + } + break; default: break; } /* Mark the request as invalid now. */ damon_sysfs_cmd_request.kdamond = NULL; out: - mutex_unlock(&damon_sysfs_lock); + if (!damon_sysfs_schemes_regions_updating) + mutex_unlock(&damon_sysfs_lock); +keep_lock_out: return err; } From 5bf7b568603c3fc8e41280bd591a2ef291ca1618 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 1 Nov 2022 22:03:25 +0000 Subject: [PATCH 036/139] UPSTREAM: mm/damon/sysfs-schemes: implement DAMOS-tried regions clear command When there are huge number of DAMON regions that specific scheme actions are tried to be applied, directories and files under 'tried_regions' scheme directory could waste some memory. Add another special input keyword ('clear_schemes_tried_regions') for 'state' file of each kdamond sysfs directory that can be used for cleanup of the 'tried_regions' sub-directories. 
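For example, a user space tool that requests a snapshot with 'update_schemes_tried_regions', reads the per-region files it needs, and will not revisit them can immediately write 'clear_schemes_tried_regions' to the same 'state' file, so the kernel frees the region directories right away instead of holding them until the next update request or kdamond termination.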
[sj@kernel.org: skip regions clearing if the scheme directory was removed] Link: https://lkml.kernel.org/r/20221114182954.4745-3-sj@kernel.org Link: https://lkml.kernel.org/r/20221101220328.95765-6-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton (cherry picked from commit 772c15e5adcb32a42dbbcdb905ec49f662312976) Bug: 300502883 Change-Id: I969e05ce1fa4599bae50454633b61b5320eaa67d Signed-off-by: cui yangpei --- mm/damon/sysfs-common.h | 4 ++++ mm/damon/sysfs-schemes.c | 14 +++++++++++++- mm/damon/sysfs.c | 20 ++++++++++++++++++++ 3 files changed, 37 insertions(+), 1 deletion(-) diff --git a/mm/damon/sysfs-common.h b/mm/damon/sysfs-common.h index 634a6e7fca78..604a6cbc3ede 100644 --- a/mm/damon/sysfs-common.h +++ b/mm/damon/sysfs-common.h @@ -50,3 +50,7 @@ int damon_sysfs_schemes_update_regions_start( struct damon_ctx *ctx); int damon_sysfs_schemes_update_regions_stop(struct damon_ctx *ctx); + +int damon_sysfs_schemes_clear_regions( + struct damon_sysfs_schemes *sysfs_schemes, + struct damon_ctx *ctx); diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 5f14f18bcc49..81fc4d27f4e4 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -1292,7 +1292,7 @@ static int damon_sysfs_before_damos_apply(struct damon_ctx *ctx, } /* Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock */ -int damon_sysfs_schemes_update_regions_start( +int damon_sysfs_schemes_clear_regions( struct damon_sysfs_schemes *sysfs_schemes, struct damon_ctx *ctx) { @@ -1302,11 +1302,23 @@ int damon_sysfs_schemes_update_regions_start( damon_for_each_scheme(scheme, ctx) { struct damon_sysfs_scheme *sysfs_scheme; + /* user could have removed the scheme sysfs dir */ + if (schemes_idx >= sysfs_schemes->nr) + break; + sysfs_scheme = sysfs_schemes->schemes_arr[schemes_idx++]; damon_sysfs_scheme_regions_rm_dirs( sysfs_scheme->tried_regions); } + return 0; +} +/* Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock */ +int damon_sysfs_schemes_update_regions_start( + struct damon_sysfs_schemes *sysfs_schemes, + struct damon_ctx *ctx) +{ + damon_sysfs_schemes_clear_regions(sysfs_schemes, ctx); damon_sysfs_schemes_for_damos_callback = sysfs_schemes; ctx->callback.before_damos_apply = damon_sysfs_before_damos_apply; return 0; diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index ffb5a84059d7..aeb0beb1da91 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1004,6 +1004,11 @@ enum damon_sysfs_cmd { * regions */ DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS, + /* + * @DAMON_SYSFS_CMD_CLEAR_SCHEMES_TRIED_REGIONS: Clear schemes tried + * regions + */ + DAMON_SYSFS_CMD_CLEAR_SCHEMES_TRIED_REGIONS, /* * @NR_DAMON_SYSFS_CMDS: Total number of DAMON sysfs commands. 
*/ @@ -1017,6 +1022,7 @@ static const char * const damon_sysfs_cmd_strs[] = { "commit", "update_schemes_stats", "update_schemes_tried_regions", + "clear_schemes_tried_regions", }; /* @@ -1262,6 +1268,17 @@ static int damon_sysfs_upd_schemes_regions_stop( return damon_sysfs_schemes_update_regions_stop(ctx); } +static int damon_sysfs_clear_schemes_regions( + struct damon_sysfs_kdamond *kdamond) +{ + struct damon_ctx *ctx = kdamond->damon_ctx; + + if (!ctx) + return -EINVAL; + return damon_sysfs_schemes_clear_regions( + kdamond->contexts->contexts_arr[0]->schemes, ctx); +} + static inline bool damon_sysfs_kdamond_running( struct damon_sysfs_kdamond *kdamond) { @@ -1343,6 +1360,9 @@ static int damon_sysfs_cmd_request_callback(struct damon_ctx *c) damon_sysfs_schemes_regions_updating = false; } break; + case DAMON_SYSFS_CMD_CLEAR_SCHEMES_TRIED_REGIONS: + err = damon_sysfs_clear_schemes_regions(kdamond); + break; default: break; } From 3ca21ef5fa5a146c51584a3c28884dc078a194b4 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Tue, 1 Nov 2022 22:14:08 +0100 Subject: [PATCH 037/139] UPSTREAM: mm/damon: use kstrtobool() instead of strtobool() strtobool() is the same as kstrtobool(). However, the latter is more widely used within the kernel. In order to remove strtobool() and slightly simplify kstrtox.h, switch to the other function name. While at it, include the corresponding header file (<linux/kstrtox.h>). Link: https://lkml.kernel.org/r/ed2b46489a513988688decb53850339cc228940c.1667336095.git.christophe.jaillet@wanadoo.fr Signed-off-by: Christophe JAILLET Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton (cherry picked from commit e6aff38b2e25e934e95471351c96d1410bb17561) Bug: 300502883 Change-Id: I21df914f9ba754921bdc00d8e9a33e77b2606360 Signed-off-by: cui yangpei --- mm/damon/lru_sort.c | 3 ++- mm/damon/reclaim.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 2a532e3983df..7b8fce2f67a8 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -8,6 +8,7 @@ #define pr_fmt(fmt) "damon-lru-sort: " fmt #include +#include <linux/kstrtox.h> #include #include "modules-common.h" @@ -241,7 +242,7 @@ static int damon_lru_sort_enabled_store(const char *val, bool enable; int err; - err = strtobool(val, &enable); + err = kstrtobool(val, &enable); if (err) return err; diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index e57604bec06d..e82631f39481 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -8,6 +8,7 @@ #define pr_fmt(fmt) "damon-reclaim: " fmt #include +#include <linux/kstrtox.h> #include #include "modules-common.h" @@ -187,7 +188,7 @@ static int damon_reclaim_enabled_store(const char *val, bool enable; int err; - err = strtobool(val, &enable); + err = kstrtobool(val, &enable); if (err) return err; From ea215c9a1040c12446c3b32887255680fb719fe4 Mon Sep 17 00:00:00 2001 From: Huaisheng Ye Date: Mon, 16 Jan 2023 14:23:47 +0800 Subject: [PATCH 038/139] UPSTREAM: mm/damon/core: skip apply schemes if empty Sometimes there is no scheme in DAMON's context; for example, one may just use 'damo record' to monitor a workload's data access pattern. If the current DAMON context doesn't have any scheme in the list, kdamond need not iterate over the list of all targets and regions only to do nothing. So, skip applying schemes when ctx->schemes is empty; a sketch of the walk being skipped follows.
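The sketch below shows the traversal the new check avoids, assuming the usual shape of kdamond_apply_schemes(); it is illustrative only, and names other than the damon_for_each_* helpers are not from this patch:

/* sketch: the walk that previously ran even with an empty scheme list */
static void example_walk(struct damon_ctx *ctx)
{
	struct damon_target *t;
	struct damon_region *r;
	struct damos *s;

	damon_for_each_target(t, ctx) {
		damon_for_each_region(r, t) {
			damon_for_each_scheme(s, ctx) {
				/* with no schemes this body never runs, but
				 * the target/region traversal above was
				 * still paid for
				 */
			}
		}
	}
}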
Link: https://lkml.kernel.org/r/20230116062347.1148553-1-huaisheng.ye@intel.com Signed-off-by: Huaisheng Ye Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton (cherry picked from commit 64517d6e1291b5e942b00c53674ecf33f918313f) Bug: 300502883 Change-Id: Ic76ca90c85dbb24205b17dd914f91a8dd4cf7345 Signed-off-by: cui yangpei --- mm/damon/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index ceec75b88ef9..f338691e4591 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1230,7 +1230,8 @@ static int kdamond_fn(void *data) if (ctx->callback.after_aggregation && ctx->callback.after_aggregation(ctx)) break; - kdamond_apply_schemes(ctx); + if (!list_empty(&ctx->schemes)) + kdamond_apply_schemes(ctx); kdamond_reset_aggregated(ctx); kdamond_split_regions(ctx); if (ctx->ops.reset_aggregated) From a548d90994eb62ebdad8405df72576dd29d50e1a Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Fri, 2 Jun 2023 10:29:48 +0100 Subject: [PATCH 039/139] UPSTREAM: mm/damon/ops-common: refactor to use {pte|pmd}p_clear_young_notify() With the fix in place to atomically test and clear young on ptes and pmds, simplify the code to handle the clearing for both the primary mmu and the mmu notifier with a single API call. Link: https://lkml.kernel.org/r/20230602092949.545577-4-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Acked-by: Yu Zhao Reviewed-by: SeongJae Park Cc: Christoph Hellwig Cc: Kirill A. Shutemov Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Mike Rapoport (IBM) Cc: Uladzislau Rezki (Sony) Cc: Zi Yan Signed-off-by: Andrew Morton (cherry picked from commit fa8c919dac3f5f325b17f9fcf8ac7dd899992598) Bug: 300502883 Change-Id: I4414604788996e338ac638c3eb3ec1ef7959223e Signed-off-by: cui yangpei --- mm/damon/ops-common.c | 22 ++-------------------- 1 file changed, 2 insertions(+), 20 deletions(-) diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c index 13b99975cbc2..073481023bea 100644 --- a/mm/damon/ops-common.c +++ b/mm/damon/ops-common.c @@ -35,21 +35,12 @@ struct page *damon_get_page(unsigned long pfn) void damon_ptep_mkold(pte_t *pte, struct vm_area_struct *vma, unsigned long addr) { - bool referenced = false; struct page *page = damon_get_page(pte_pfn(*pte)); if (!page) return; - if (ptep_test_and_clear_young(vma, addr, pte)) - referenced = true; - -#ifdef CONFIG_MMU_NOTIFIER - if (mmu_notifier_clear_young(vma->vm_mm, addr, addr + PAGE_SIZE)) - referenced = true; -#endif /* CONFIG_MMU_NOTIFIER */ - - if (referenced) + if (ptep_clear_young_notify(vma, addr, pte)) set_page_young(page); set_page_idle(page); @@ -59,21 +50,12 @@ void damon_ptep_mkold(pte_t *pte, struct vm_area_struct *vma, unsigned long addr void damon_pmdp_mkold(pmd_t *pmd, struct vm_area_struct *vma, unsigned long addr) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE - bool referenced = false; struct page *page = damon_get_page(pmd_pfn(*pmd)); if (!page) return; - if (pmdp_test_and_clear_young(vma, addr, pmd)) - referenced = true; - -#ifdef CONFIG_MMU_NOTIFIER - if (mmu_notifier_clear_young(vma->vm_mm, addr, addr + HPAGE_PMD_SIZE)) - referenced = true; -#endif /* CONFIG_MMU_NOTIFIER */ - - if (referenced) + if (pmdp_clear_young_notify(vma, addr, pmd)) set_page_young(page); set_page_idle(page); From 7d48e19f74a3d74eb234f04e4565a3cebbd7c750 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 2 Aug 2023 21:32:17 +0000 Subject: [PATCH 040/139] UPSTREAM: mm/damon/sysfs-schemes: implement DAMOS tried total bytes file Patch series "mm/damon/sysfs-schemes: implement DAMOS tried 
total bytes file". The tried_regions directory of DAMON sysfs interface is useful for retrieving monitoring results snapshot or DAMOS debugging. However, for common use case that need to monitor only the total size of the scheme tried regions (e.g., monitoring working set size), the kernel overhead for directory construction and user overhead for reading the content could be high if the number of monitoring region is not small. This patchset implements DAMON sysfs files for efficient support of the use case. The first patch implements the sysfs file to reduce the user space overhead, and the second patch implements a command for reducing the kernel space overhead. The third patch adds a selftest for the new file, and following two patches update documents. [1] https://lore.kernel.org/damon/20230728201817.70602-1-sj@kernel.org/ This patch (of 5): The tried_regions directory can be used for retrieving the monitoring results snapshot for regions of specific access pattern, by setting the scheme's action as 'stat' and the access pattern as required. While the interface provides every detail of the monitoring results, some use cases including working set size monitoring requires only the total size of the regions. For such cases, users should read all the information and calculate the total size of the regions. However, it could incur high overhead if the number of regions is high. Add a file for retrieving only the information, namely 'total_bytes' file. It allows users to get the total size by reading only the file. Link: https://lkml.kernel.org/r/20230802213222.109841-1-sj@kernel.org Link: https://lkml.kernel.org/r/20230802213222.109841-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton (cherry picked from commit b69f92a741405336fb17a8a3d67fc144192fe8e2) Bug: 300502883 Change-Id: I49c225d15ba09a9b896341da14cc9f2b45578da7 Signed-off-by: cui yangpei --- mm/damon/sysfs-schemes.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 81fc4d27f4e4..1e0270490d50 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -117,6 +117,7 @@ struct damon_sysfs_scheme_regions { struct kobject kobj; struct list_head regions_list; int nr_regions; + unsigned long total_bytes; }; static struct damon_sysfs_scheme_regions * @@ -128,9 +129,19 @@ damon_sysfs_scheme_regions_alloc(void) regions->kobj = (struct kobject){}; INIT_LIST_HEAD(®ions->regions_list); regions->nr_regions = 0; + regions->total_bytes = 0; return regions; } +static ssize_t total_bytes_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_scheme_regions *regions = container_of(kobj, + struct damon_sysfs_scheme_regions, kobj); + + return sysfs_emit(buf, "%lu\n", regions->total_bytes); +} + static void damon_sysfs_scheme_regions_rm_dirs( struct damon_sysfs_scheme_regions *regions) { @@ -148,7 +159,11 @@ static void damon_sysfs_scheme_regions_release(struct kobject *kobj) kfree(container_of(kobj, struct damon_sysfs_scheme_regions, kobj)); } +static struct kobj_attribute damon_sysfs_scheme_regions_total_bytes_attr = + __ATTR_RO_MODE(total_bytes, 0400); + static struct attribute *damon_sysfs_scheme_regions_attrs[] = { + &damon_sysfs_scheme_regions_total_bytes_attr.attr, NULL, }; ATTRIBUTE_GROUPS(damon_sysfs_scheme_regions); @@ -1279,6 +1294,7 @@ static int damon_sysfs_before_damos_apply(struct damon_ctx *ctx, return 0; sysfs_regions = 
sysfs_schemes->schemes_arr[schemes_idx]->tried_regions; + sysfs_regions->total_bytes += r->ar.end - r->ar.start; region = damon_sysfs_scheme_region_alloc(r); list_add_tail(®ion->list, &sysfs_regions->regions_list); sysfs_regions->nr_regions++; @@ -1309,6 +1325,7 @@ int damon_sysfs_schemes_clear_regions( sysfs_scheme = sysfs_schemes->schemes_arr[schemes_idx++]; damon_sysfs_scheme_regions_rm_dirs( sysfs_scheme->tried_regions); + sysfs_scheme->tried_regions->total_bytes = 0; } return 0; } From b46391e09202c37cba18a2eb9a0049f3482c8cde Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 2 Aug 2023 21:32:18 +0000 Subject: [PATCH 041/139] UPSTREAM: mm/damon/sysfs: implement a command for updating only schemes tried total bytes Using tried_regions/total_bytes file, users can efficiently retrieve the total size of memory regions having specific access pattern. However, DAMON sysfs interface in kernel still populates all the infomration on the tried_regions subdirectories. That means the kernel part overhead for the construction of tried regions directories still exists. To remove the overhead, implement yet another command input for 'state' DAMON sysfs file. Writing the input to the file makes DAMON sysfs interface to update only the total_bytes file. Link: https://lkml.kernel.org/r/20230802213222.109841-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton (cherry picked from commit 6ad243b83b5094026fdb3171711ddb25246b3d8a) Bug: 300502883 Change-Id: Id0bdf13858d6a92de0eeef22f59a65ee884e9d20 Signed-off-by: cui yangpei --- mm/damon/sysfs-common.h | 2 +- mm/damon/sysfs-schemes.c | 7 ++++++- mm/damon/sysfs.c | 26 ++++++++++++++++++++------ 3 files changed, 27 insertions(+), 8 deletions(-) diff --git a/mm/damon/sysfs-common.h b/mm/damon/sysfs-common.h index 604a6cbc3ede..90b1bef41db8 100644 --- a/mm/damon/sysfs-common.h +++ b/mm/damon/sysfs-common.h @@ -47,7 +47,7 @@ void damon_sysfs_schemes_update_stats( int damon_sysfs_schemes_update_regions_start( struct damon_sysfs_schemes *sysfs_schemes, - struct damon_ctx *ctx); + struct damon_ctx *ctx, bool total_bytes_only); int damon_sysfs_schemes_update_regions_stop(struct damon_ctx *ctx); diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 1e0270490d50..3ca99586f45b 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -1266,6 +1266,7 @@ void damon_sysfs_schemes_update_stats( */ static struct damon_sysfs_schemes *damon_sysfs_schemes_for_damos_callback; static int damon_sysfs_schemes_region_idx; +static bool damos_regions_upd_total_bytes_only; /* * DAMON callback that called before damos apply. 
While this callback is @@ -1295,6 +1296,9 @@ static int damon_sysfs_before_damos_apply(struct damon_ctx *ctx, sysfs_regions = sysfs_schemes->schemes_arr[schemes_idx]->tried_regions; sysfs_regions->total_bytes += r->ar.end - r->ar.start; + if (damos_regions_upd_total_bytes_only) + return 0; + region = damon_sysfs_scheme_region_alloc(r); list_add_tail(®ion->list, &sysfs_regions->regions_list); sysfs_regions->nr_regions++; @@ -1333,10 +1337,11 @@ int damon_sysfs_schemes_clear_regions( /* Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock */ int damon_sysfs_schemes_update_regions_start( struct damon_sysfs_schemes *sysfs_schemes, - struct damon_ctx *ctx) + struct damon_ctx *ctx, bool total_bytes_only) { damon_sysfs_schemes_clear_regions(sysfs_schemes, ctx); damon_sysfs_schemes_for_damos_callback = sysfs_schemes; + damos_regions_upd_total_bytes_only = total_bytes_only; ctx->callback.before_damos_apply = damon_sysfs_before_damos_apply; return 0; } diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index aeb0beb1da91..fb9cf961b79c 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -999,6 +999,11 @@ enum damon_sysfs_cmd { * files. */ DAMON_SYSFS_CMD_UPDATE_SCHEMES_STATS, + /* + * @DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_BYTES: Update + * tried_regions/total_bytes sysfs files for each scheme. + */ + DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_BYTES, /* * @DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS: Update schemes tried * regions @@ -1021,6 +1026,7 @@ static const char * const damon_sysfs_cmd_strs[] = { "off", "commit", "update_schemes_stats", + "update_schemes_tried_bytes", "update_schemes_tried_regions", "clear_schemes_tried_regions", }; @@ -1206,12 +1212,14 @@ static void damon_sysfs_before_terminate(struct damon_ctx *ctx) { struct damon_target *t, *next; struct damon_sysfs_kdamond *kdamond; + enum damon_sysfs_cmd cmd; /* damon_sysfs_schemes_update_regions_stop() might not yet called */ kdamond = damon_sysfs_cmd_request.kdamond; - if (kdamond && damon_sysfs_cmd_request.cmd == - DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS && - ctx == kdamond->damon_ctx) { + cmd = damon_sysfs_cmd_request.cmd; + if (kdamond && ctx == kdamond->damon_ctx && + (cmd == DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS || + cmd == DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_BYTES)) { damon_sysfs_schemes_update_regions_stop(ctx); mutex_unlock(&damon_sysfs_lock); } @@ -1248,14 +1256,15 @@ static int damon_sysfs_upd_schemes_stats(struct damon_sysfs_kdamond *kdamond) } static int damon_sysfs_upd_schemes_regions_start( - struct damon_sysfs_kdamond *kdamond) + struct damon_sysfs_kdamond *kdamond, bool total_bytes_only) { struct damon_ctx *ctx = kdamond->damon_ctx; if (!ctx) return -EINVAL; return damon_sysfs_schemes_update_regions_start( - kdamond->contexts->contexts_arr[0]->schemes, ctx); + kdamond->contexts->contexts_arr[0]->schemes, ctx, + total_bytes_only); } static int damon_sysfs_upd_schemes_regions_stop( @@ -1332,6 +1341,7 @@ static int damon_sysfs_cmd_request_callback(struct damon_ctx *c) { struct damon_sysfs_kdamond *kdamond; static bool damon_sysfs_schemes_regions_updating; + bool total_bytes_only = false; int err = 0; /* avoid deadlock due to concurrent state_store('off') */ @@ -1348,9 +1358,13 @@ static int damon_sysfs_cmd_request_callback(struct damon_ctx *c) case DAMON_SYSFS_CMD_COMMIT: err = damon_sysfs_commit_input(kdamond); break; + case DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_BYTES: + total_bytes_only = true; + fallthrough; case DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS: if 
(!damon_sysfs_schemes_regions_updating) { - err = damon_sysfs_upd_schemes_regions_start(kdamond); + err = damon_sysfs_upd_schemes_regions_start(kdamond, + total_bytes_only); if (!err) { damon_sysfs_schemes_regions_updating = true; goto keep_lock_out; From f5a0a8bc43e23a5423c65ff554d5f28cce1933e5 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sat, 7 Oct 2023 20:04:32 +0000 Subject: [PATCH 042/139] UPSTREAM: mm/damon/sysfs: check DAMOS regions update progress from before_terminate() DAMON_SYSFS can receive DAMOS tried regions update request while kdamond is already out of the main loop and before_terminate callback (damon_sysfs_before_terminate() in this case) is not yet called. And damon_sysfs_handle_cmd() can further be finished before the callback is invoked. Then, damon_sysfs_before_terminate() unlocks damon_sysfs_lock, which is not locked by anyone. This happens because the callback function assumes damon_sysfs_cmd_request_callback() should be called before it. Check if the assumption was true before doing the unlock, to avoid this problem. Link: https://lkml.kernel.org/r/20231007200432.3110-1-sj@kernel.org Fixes: f1d13cacabe1 ("mm/damon/sysfs: implement DAMOS tried regions update command") Signed-off-by: SeongJae Park Cc: [6.2.x] Signed-off-by: Andrew Morton (cherry picked from commit 76b7069bcc89dec33f03eb08abee165d0306b754) Bug: 300502883 Change-Id: I7cd5e00c0d0226dc8d7856d103f88a26307cafce Signed-off-by: cui yangpei --- mm/damon/sysfs.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index fb9cf961b79c..a3a1c94dc60d 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1208,6 +1208,8 @@ static int damon_sysfs_set_targets(struct damon_ctx *ctx, return 0; } +static bool damon_sysfs_schemes_regions_updating; + static void damon_sysfs_before_terminate(struct damon_ctx *ctx) { struct damon_target *t, *next; @@ -1219,8 +1221,10 @@ static void damon_sysfs_before_terminate(struct damon_ctx *ctx) cmd = damon_sysfs_cmd_request.cmd; if (kdamond && ctx == kdamond->damon_ctx && (cmd == DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS || - cmd == DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_BYTES)) { + cmd == DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_BYTES) && + damon_sysfs_schemes_regions_updating) { damon_sysfs_schemes_update_regions_stop(ctx); + damon_sysfs_schemes_regions_updating = false; mutex_unlock(&damon_sysfs_lock); } @@ -1340,7 +1344,6 @@ static int damon_sysfs_commit_input(struct damon_sysfs_kdamond *kdamond) static int damon_sysfs_cmd_request_callback(struct damon_ctx *c) { struct damon_sysfs_kdamond *kdamond; - static bool damon_sysfs_schemes_regions_updating; bool total_bytes_only = false; int err = 0; From c194e597cb3ced4c779d6ca35c9fe879f772a7c2 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 12 Oct 2023 19:22:53 +0000 Subject: [PATCH 043/139] UPSTREAM: mm/damon/sysfs-schemes: do not update tried regions more than one DAMON snapshot Patch series "mm/damon/sysfs-schemes: Do DAMOS tried regions update for only one apply interval". DAMOS tried regions update feature of DAMON sysfs interface is doing the update for one aggregation interval after the request is made. Since the per-scheme apply interval is supported, that behavior makes no much sense. That is, the tried regions directory will have regions from multiple DAMON monitoring results snapshots, or no region for apply intervals that much shorter than, or longer than the aggregation interval, respectively. 
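For instance, assuming the default 100ms aggregation interval, a scheme with a 10ms apply interval would contribute regions from roughly ten different snapshots to a single update, while a scheme with a 1s apply interval could contribute no region at all for a given update; the numbers are only illustrative.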
Update the behavior to update the regions for each scheme for only its apply interval, and update the document. Since DAMOS apply interval is the aggregation by default, this change makes no visible behavioral difference to old users who don't explicitly set the apply intervals. Patches Sequence ---------------- The first two patches makes schemes of apply intervals that much shorter or longer than the aggregation interval to keep the maximum and minimum times for continuing the update. After the two patches, the update aligns with the each scheme's apply interval. Finally, the third patch updates the document to reflect the behavior. This patch (of 3): DAMON_SYSFS exposes every DAMON-found region that eligible for applying the scheme action for one aggregation interval. However, each DAMON-based operation scheme has its own apply interval. Hence, for a scheme that having its apply interval much smaller than the aggregation interval, DAMON_SYSFS will expose the scheme regions that applied to more than one DAMON monitoring results snapshots. Since the purpose of DAMON tried regions is exposing single snapshot, this makes no much sense. Track progress of each scheme's tried regions update and avoid the case. Link: https://lkml.kernel.org/r/20231012192256.33556-1-sj@kernel.org Link: https://lkml.kernel.org/r/20231012192256.33556-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton (cherry picked from commit 4d4e41b682990b1dc5bba2bc313800340bf5c2d4) Bug: 300502883 Change-Id: I78602a6810a9b4d8d131c3ace69f255ac1349d13 Signed-off-by: cui yangpei --- mm/damon/sysfs-schemes.c | 77 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 3ca99586f45b..4ee188714ca2 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -113,11 +113,47 @@ static struct kobj_type damon_sysfs_scheme_region_ktype = { * scheme regions directory */ +/* + * enum damos_sysfs_regions_upd_status - Represent DAMOS tried regions update + * status + * @DAMOS_TRIED_REGIONS_UPD_IDLE: Waiting for next request. + * @DAMOS_TRIED_REGIONS_UPD_STARTED: Update started. + * @DAMOS_TRIED_REGIONS_UPD_FINISHED: Update finished. + * + * Each DAMON-based operation scheme (&struct damos) has its own apply + * interval, and we need to expose the scheme tried regions based on only + * single snapshot. For this, we keep the tried regions update status for each + * scheme. The status becomes 'idle' at the beginning. + * + * Once the tried regions update request is received, the request handling + * start function (damon_sysfs_scheme_update_regions_start()) sets the status + * of all schemes as 'idle' again, and register ->before_damos_apply() and + * ->after_sampling() callbacks. + * + * Then, the first followup ->before_damos_apply() callback + * (damon_sysfs_before_damos_apply()) sets the status 'started'. The first + * ->after_sampling() callback (damon_sysfs_after_sampling()) after the call + * is called only after the scheme is completely applied + * to the given snapshot. Hence the callback knows the situation by showing + * 'started' status, and sets the status as 'finished'. Then, + * damon_sysfs_before_damos_apply() understands the situation by showing the + * 'finished' status and do nothing. + * + * Finally, the tried regions request handling finisher function + * (damon_sysfs_schemes_update_regions_stop()) unregisters the callbacks. 
+ */ +enum damos_sysfs_regions_upd_status { + DAMOS_TRIED_REGIONS_UPD_IDLE, + DAMOS_TRIED_REGIONS_UPD_STARTED, + DAMOS_TRIED_REGIONS_UPD_FINISHED, +}; + struct damon_sysfs_scheme_regions { struct kobject kobj; struct list_head regions_list; int nr_regions; unsigned long total_bytes; + enum damos_sysfs_regions_upd_status upd_status; }; static struct damon_sysfs_scheme_regions * @@ -130,6 +166,7 @@ damon_sysfs_scheme_regions_alloc(void) INIT_LIST_HEAD(®ions->regions_list); regions->nr_regions = 0; regions->total_bytes = 0; + regions->upd_status = DAMOS_TRIED_REGIONS_UPD_IDLE; return regions; } @@ -1295,6 +1332,10 @@ static int damon_sysfs_before_damos_apply(struct damon_ctx *ctx, return 0; sysfs_regions = sysfs_schemes->schemes_arr[schemes_idx]->tried_regions; + if (sysfs_regions->upd_status == DAMOS_TRIED_REGIONS_UPD_FINISHED) + return 0; + if (sysfs_regions->upd_status == DAMOS_TRIED_REGIONS_UPD_IDLE) + sysfs_regions->upd_status = DAMOS_TRIED_REGIONS_UPD_STARTED; sysfs_regions->total_bytes += r->ar.end - r->ar.start; if (damos_regions_upd_total_bytes_only) return 0; @@ -1311,6 +1352,29 @@ static int damon_sysfs_before_damos_apply(struct damon_ctx *ctx, return 0; } +/* + * DAMON callback that called after each accesses sampling. While this + * callback is registered, damon_sysfs_lock should be held to ensure the + * regions directories exist. + */ +static int damon_sysfs_after_sampling(struct damon_ctx *ctx) +{ + struct damon_sysfs_schemes *sysfs_schemes = + damon_sysfs_schemes_for_damos_callback; + struct damon_sysfs_scheme_regions *sysfs_regions; + int i; + + for (i = 0; i < sysfs_schemes->nr; i++) { + sysfs_regions = sysfs_schemes->schemes_arr[i]->tried_regions; + if (sysfs_regions->upd_status == + DAMOS_TRIED_REGIONS_UPD_STARTED) + sysfs_regions->upd_status = + DAMOS_TRIED_REGIONS_UPD_FINISHED; + } + + return 0; +} + /* Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock */ int damon_sysfs_schemes_clear_regions( struct damon_sysfs_schemes *sysfs_schemes, @@ -1334,6 +1398,16 @@ int damon_sysfs_schemes_clear_regions( return 0; } +static void damos_tried_regions_init_upd_status( + struct damon_sysfs_schemes *sysfs_schemes) +{ + int i; + + for (i = 0; i < sysfs_schemes->nr; i++) + sysfs_schemes->schemes_arr[i]->tried_regions->upd_status = + DAMOS_TRIED_REGIONS_UPD_IDLE; +} + /* Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock */ int damon_sysfs_schemes_update_regions_start( struct damon_sysfs_schemes *sysfs_schemes, @@ -1341,8 +1415,10 @@ int damon_sysfs_schemes_update_regions_start( { damon_sysfs_schemes_clear_regions(sysfs_schemes, ctx); damon_sysfs_schemes_for_damos_callback = sysfs_schemes; + damos_tried_regions_init_upd_status(sysfs_schemes); damos_regions_upd_total_bytes_only = total_bytes_only; ctx->callback.before_damos_apply = damon_sysfs_before_damos_apply; + ctx->callback.after_sampling = damon_sysfs_after_sampling; return 0; } @@ -1355,6 +1431,7 @@ int damon_sysfs_schemes_update_regions_stop(struct damon_ctx *ctx) { damon_sysfs_schemes_for_damos_callback = NULL; ctx->callback.before_damos_apply = NULL; + ctx->callback.after_sampling = NULL; damon_sysfs_schemes_region_idx = 0; return 0; } From 1e19db10e7ac53a11ca4ae40a60ae232699393ee Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 12 Oct 2023 19:22:54 +0000 Subject: [PATCH 044/139] UPSTREAM: mm/damon/sysfs: avoid empty scheme tried regions for large apply interval DAMON_SYSFS assumes all schemes will be applied for at least one DAMON monitoring results snapshot within one aggregation 
interval, or makes no sense to wait for it while DAMON is deactivated by the watermarks. That for deactivated status still makes sense, but the aggregation interval based assumption is invalid now because each scheme can has its own apply interval. For schemes having larger than the aggregation or watermarks check interval, DAMOS tried regions update request can be finished without the update. Avoid the case by explicitly checking the status of the schemes tried regions update and watermarks based DAMON deactivation. Link: https://lkml.kernel.org/r/20231012192256.33556-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton (cherry picked from commit 76126332c7606ba25a4ae5db37145fd526985b45) Bug: 300502883 Change-Id: I8283709a023123d7a89fd37a1d4a834888c15c7e Signed-off-by: cui yangpei --- mm/damon/sysfs-common.h | 2 ++ mm/damon/sysfs-schemes.c | 16 ++++++++++++++++ mm/damon/sysfs.c | 34 ++++++++++++++++++++++++++++++---- 3 files changed, 48 insertions(+), 4 deletions(-) diff --git a/mm/damon/sysfs-common.h b/mm/damon/sysfs-common.h index 90b1bef41db8..3db199c84ed3 100644 --- a/mm/damon/sysfs-common.h +++ b/mm/damon/sysfs-common.h @@ -49,6 +49,8 @@ int damon_sysfs_schemes_update_regions_start( struct damon_sysfs_schemes *sysfs_schemes, struct damon_ctx *ctx, bool total_bytes_only); +bool damos_sysfs_regions_upd_done(void); + int damon_sysfs_schemes_update_regions_stop(struct damon_ctx *ctx); int damon_sysfs_schemes_clear_regions( diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 4ee188714ca2..6df715f113ae 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -1422,6 +1422,22 @@ int damon_sysfs_schemes_update_regions_start( return 0; } +bool damos_sysfs_regions_upd_done(void) +{ + struct damon_sysfs_schemes *sysfs_schemes = + damon_sysfs_schemes_for_damos_callback; + struct damon_sysfs_scheme_regions *sysfs_regions; + int i; + + for (i = 0; i < sysfs_schemes->nr; i++) { + sysfs_regions = sysfs_schemes->schemes_arr[i]->tried_regions; + if (sysfs_regions->upd_status != + DAMOS_TRIED_REGIONS_UPD_FINISHED) + return false; + } + return true; +} + /* * Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock. Caller * should unlock damon_sysfs_lock which held before diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index a3a1c94dc60d..ca7bd9dfb7d7 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1336,12 +1336,13 @@ static int damon_sysfs_commit_input(struct damon_sysfs_kdamond *kdamond) /* * damon_sysfs_cmd_request_callback() - DAMON callback for handling requests. - * @c: The DAMON context of the callback. + * @c: The DAMON context of the callback. + * @active: Whether @c is not deactivated due to watermarks. * * This function is periodically called back from the kdamond thread for @c. * Then, it checks if there is a waiting DAMON sysfs request and handles it. */ -static int damon_sysfs_cmd_request_callback(struct damon_ctx *c) +static int damon_sysfs_cmd_request_callback(struct damon_ctx *c, bool active) { struct damon_sysfs_kdamond *kdamond; bool total_bytes_only = false; @@ -1373,6 +1374,13 @@ static int damon_sysfs_cmd_request_callback(struct damon_ctx *c) goto keep_lock_out; } } else { + /* + * Continue regions updating if DAMON is till + * active and the update for all schemes is not + * finished. 
+ */ + if (active && !damos_sysfs_regions_upd_done()) + goto keep_lock_out; err = damon_sysfs_upd_schemes_regions_stop(kdamond); damon_sysfs_schemes_regions_updating = false; } @@ -1392,6 +1400,24 @@ keep_lock_out: return err; } +static int damon_sysfs_after_wmarks_check(struct damon_ctx *c) +{ + /* + * after_wmarks_check() is called back while the context is deactivated + * by watermarks. + */ + return damon_sysfs_cmd_request_callback(c, false); +} + +static int damon_sysfs_after_aggregation(struct damon_ctx *c) +{ + /* + * after_aggregation() is called back only while the context is not + * deactivated by watermarks. + */ + return damon_sysfs_cmd_request_callback(c, true); +} + static struct damon_ctx *damon_sysfs_build_ctx( struct damon_sysfs_context *sys_ctx) { @@ -1407,8 +1433,8 @@ static struct damon_ctx *damon_sysfs_build_ctx( return ERR_PTR(err); } - ctx->callback.after_wmarks_check = damon_sysfs_cmd_request_callback; - ctx->callback.after_aggregation = damon_sysfs_cmd_request_callback; + ctx->callback.after_wmarks_check = damon_sysfs_after_wmarks_check; + ctx->callback.after_aggregation = damon_sysfs_after_aggregation; ctx->callback.before_terminate = damon_sysfs_before_terminate; return ctx; } From 6b7c4cc2624da6285a97a16d6fad00a6d3f5df46 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sun, 22 Oct 2023 21:07:33 +0000 Subject: [PATCH 045/139] UPSTREAM: mm/damon/sysfs: remove requested targets when online-commit inputs damon_sysfs_set_targets(), which updates the targets of the context for online commitment, do not remove targets that removed from the corresponding sysfs files. As a result, more than intended targets of the context can exist and hence consume memory and monitoring CPU resource more than expected. Fix it by removing all targets of the context and fill up again using the user input. This could cause unnecessary memory dealloc and realloc operations, but this is not a hot code path. Also, note that damon_target is stateless, and hence no data is lost. [sj@kernel.org: fix unnecessary monitoring results removal] Link: https://lkml.kernel.org/r/20231028213353.45397-1-sj@kernel.org Link: https://lkml.kernel.org/r/20231022210735.46409-2-sj@kernel.org Fixes: da87878010e5 ("mm/damon/sysfs: support online inputs update") Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: [5.19.x] Signed-off-by: Andrew Morton (cherry picked from commit 19467a950b49432a84bf6dbadbbb17bdf89418b7) Bug: 300502883 Change-Id: Icf094f138e6810182d23d2c412fbabe3ecd960fe Signed-off-by: cui yangpei --- mm/damon/sysfs.c | 70 +++++++++++++++++++++++++----------------------- 1 file changed, 36 insertions(+), 34 deletions(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index ca7bd9dfb7d7..4b3f05fc34a2 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1150,58 +1150,60 @@ destroy_targets_out: return err; } -/* - * Search a target in a context that corresponds to the sysfs target input. - * - * Return: pointer to the target if found, NULL if not found, or negative - * error code if the search failed. 
- */ -static struct damon_target *damon_sysfs_existing_target( - struct damon_sysfs_target *sys_target, struct damon_ctx *ctx) +static int damon_sysfs_update_target(struct damon_target *target, + struct damon_ctx *ctx, + struct damon_sysfs_target *sys_target) { struct pid *pid; - struct damon_target *t; + struct damon_region *r, *next; - if (!damon_target_has_pid(ctx)) { - /* Up to only one target for paddr could exist */ - damon_for_each_target(t, ctx) - return t; - return NULL; - } + if (!damon_target_has_pid(ctx)) + return 0; - /* ops.id should be DAMON_OPS_VADDR or DAMON_OPS_FVADDR */ pid = find_get_pid(sys_target->pid); if (!pid) - return ERR_PTR(-EINVAL); - damon_for_each_target(t, ctx) { - if (t->pid == pid) { - put_pid(pid); - return t; - } + return -EINVAL; + + /* no change to the target */ + if (pid == target->pid) { + put_pid(pid); + return 0; } - put_pid(pid); - return NULL; + + /* remove old monitoring results and update the target's pid */ + damon_for_each_region_safe(r, next, target) + damon_destroy_region(r, target); + put_pid(target->pid); + target->pid = pid; + return 0; } static int damon_sysfs_set_targets(struct damon_ctx *ctx, struct damon_sysfs_targets *sysfs_targets) { - int i, err; + struct damon_target *t, *next; + int i = 0, err; /* Multiple physical address space monitoring targets makes no sense */ if (ctx->ops.id == DAMON_OPS_PADDR && sysfs_targets->nr > 1) return -EINVAL; - for (i = 0; i < sysfs_targets->nr; i++) { - struct damon_sysfs_target *st = sysfs_targets->targets_arr[i]; - struct damon_target *t = damon_sysfs_existing_target(st, ctx); + damon_for_each_target_safe(t, next, ctx) { + if (i < sysfs_targets->nr) { + damon_sysfs_update_target(t, ctx, + sysfs_targets->targets_arr[i]); + } else { + if (damon_target_has_pid(ctx)) + put_pid(t->pid); + damon_destroy_target(t); + } + i++; + } - if (IS_ERR(t)) - return PTR_ERR(t); - if (!t) - err = damon_sysfs_add_target(st, ctx); - else - err = damon_sysfs_set_regions(t, st->regions); + for (; i < sysfs_targets->nr; i++) { + struct damon_sysfs_target *st = sysfs_targets->targets_arr[i]; + + err = damon_sysfs_add_target(st, ctx); if (err) return err; } From c132d077ebc6073b0dfc70b82897cfd690e3d3ca Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 31 Oct 2023 17:01:31 +0000 Subject: [PATCH 046/139] UPSTREAM: mm/damon/sysfs: update monitoring target regions for online input commit When user input is committed online, DAMON sysfs interface is ignoring the user input for the monitoring target regions. Such request is valid and useful for fixed monitoring target regions-based monitoring ops like 'paddr' or 'fvaddr'. Update the region boundaries as user specified, too. Note that the monitoring results of the regions that overlap between the latest monitoring target regions and the new target regions are preserved. Treat empty monitoring target regions user request as a request to just make no change to the monitoring target regions. Otherwise, users should set the monitoring target regions same to current one for every online input commit, and it could be challenging for dynamic monitoring target regions update DAMON ops like 'vaddr'. If the user really need to remove all monitoring target regions, they can simply remove the target and then create the target again with empty target regions. 
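In other words, the per-target commit logic reduces to the following minimal sketch (mirroring the diff below; damon_sysfs_set_regions() hands the new ranges to damon_set_regions(), which is what preserves the results of overlapping regions):

	/* sketch: committing one target's region boundaries online */
	if (!sys_target->regions->nr)
		return 0;	/* empty input: keep the current regions */
	/* non-empty input: update boundaries, preserving overlaps */
	return damon_sysfs_set_regions(target, sys_target->regions);
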
Link: https://lkml.kernel.org/r/20231031170131.46972-1-sj@kernel.org Fixes: da87878010e5 ("mm/damon/sysfs: support online inputs update") Signed-off-by: SeongJae Park Cc: [5.19+] Signed-off-by: Andrew Morton (cherry picked from commit 9732336006764e2ee61225387e3c70eae9139035) Bug: 300502883 Change-Id: I6857482470951382c9be36f2099da76e9b71d502 Signed-off-by: cui yangpei --- mm/damon/sysfs.c | 51 ++++++++++++++++++++++++++++++------------------ 1 file changed, 32 insertions(+), 19 deletions(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 4b3f05fc34a2..507ca2a95a87 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1150,32 +1150,45 @@ destroy_targets_out: return err; } +static int damon_sysfs_update_target_pid(struct damon_target *target, int pid) +{ + struct pid *pid_new; + + pid_new = find_get_pid(pid); + if (!pid_new) + return -EINVAL; + + if (pid_new == target->pid) { + put_pid(pid_new); + return 0; + } + + put_pid(target->pid); + target->pid = pid_new; + return 0; +} + static int damon_sysfs_update_target(struct damon_target *target, struct damon_ctx *ctx, struct damon_sysfs_target *sys_target) { - struct pid *pid; - struct damon_region *r, *next; + int err; - if (!damon_target_has_pid(ctx)) - return 0; - - pid = find_get_pid(sys_target->pid); - if (!pid) - return -EINVAL; - - /* no change to the target */ - if (pid == target->pid) { - put_pid(pid); - return 0; + if (damon_target_has_pid(ctx)) { + err = damon_sysfs_update_target_pid(target, sys_target->pid); + if (err) + return err; } - /* remove old monitoring results and update the target's pid */ - damon_for_each_region_safe(r, next, target) - damon_destroy_region(r, target); - put_pid(target->pid); - target->pid = pid; - return 0; + /* + * Do monitoring target region boundary update only if one or more + * regions are set by the user. This is for keeping current monitoring + * target results and range easier, especially for dynamic monitoring + * target regions update ops like 'vaddr'. + */ + if (sys_target->regions->nr) + err = damon_sysfs_set_regions(target, sys_target->regions); + return err; } static int damon_sysfs_set_targets(struct damon_ctx *ctx, From 606444fd06a65bc6ecd3b582052adc84c96bcdb0 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 6 Nov 2023 17:07:40 +0300 Subject: [PATCH 047/139] UPSTREAM: mm/damon/sysfs: eliminate potential uninitialized variable warning The "err" variable is not initialized if damon_target_has_pid(ctx) is false and sys_target->regions->nr is zero. 
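A minimal sketch of the problematic path, with both branches skipped (names taken from the code in question):

	int err;	/* not initialized */

	if (damon_target_has_pid(ctx))		/* false, e.g. for 'paddr' */
		err = damon_sysfs_update_target_pid(target, sys_target->pid);
	if (sys_target->regions->nr)		/* zero: no region input */
		err = damon_sysfs_set_regions(target, sys_target->regions);
	return err;	/* an indeterminate value reaches the caller */
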
Link: https://lkml.kernel.org/r/739e6aaf-a634-4e33-98a8-16546379ec9f@moroto.mountain Fixes: 0bcd216c4741 ("mm/damon/sysfs: update monitoring target regions for online input commit") Signed-off-by: Dan Carpenter Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton (cherry picked from commit 85c2ceaafbd306814a3a4740bf4d95ac26a8b36a) Bug: 300502883 Change-Id: I235ea1bfc9d8bf0fef426dbc21881d755e3a5d67 Signed-off-by: cui yangpei --- mm/damon/sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 507ca2a95a87..6545a46da606 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1172,7 +1172,7 @@ static int damon_sysfs_update_target(struct damon_target *target, struct damon_ctx *ctx, struct damon_sysfs_target *sys_target) { - int err; + int err = 0; if (damon_target_has_pid(ctx)) { err = damon_sysfs_update_target_pid(target, sys_target->pid); From 7fbeab3c65e91069096a7a5811abf91262d67ec0 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 6 Nov 2023 23:34:06 +0000 Subject: [PATCH 048/139] UPSTREAM: mm/damon/sysfs: check error from damon_sysfs_update_target() Patch series "mm/damon/sysfs: fix unhandled return values". Some of the DAMON sysfs interface code is not handling return values from some functions. As a result, confusing user input handling or a NULL dereference is possible. Check those properly. This patch (of 3): damon_sysfs_update_target() returns an error code for failures, but its caller, damon_sysfs_set_targets(), is ignoring that. The update function seems to make no critical change in case of such failures, but the behavior will look like DAMON sysfs is silently ignoring or only partially accepting the user input. Fix it. Link: https://lkml.kernel.org/r/20231106233408.51159-1-sj@kernel.org Link: https://lkml.kernel.org/r/20231106233408.51159-2-sj@kernel.org Fixes: 19467a950b49 ("mm/damon/sysfs: remove requested targets when online-commit inputs") Signed-off-by: SeongJae Park Cc: [5.19+] Signed-off-by: Andrew Morton (cherry picked from commit b4936b544b08ed44949055b92bd25f77759ebafc) Bug: 300502883 Change-Id: I9bfea66f76ad094ed73defee5ff3fdb3794e8162 Signed-off-by: cui yangpei --- mm/damon/sysfs.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 6545a46da606..0a6b4625de9f 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1203,8 +1203,10 @@ static int damon_sysfs_set_targets(struct damon_ctx *ctx, damon_for_each_target_safe(t, next, ctx) { if (i < sysfs_targets->nr) { - damon_sysfs_update_target(t, ctx, + err = damon_sysfs_update_target(t, ctx, sysfs_targets->targets_arr[i]); + if (err) + return err; } else { if (damon_target_has_pid(ctx)) put_pid(t->pid); From 1cedfc05e9c1fa234a52b833b06722eb430e15ac Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 6 Nov 2023 23:34:07 +0000 Subject: [PATCH 049/139] UPSTREAM: mm/damon/sysfs-schemes: handle tried regions sysfs directory allocation failure DAMOS tried regions sysfs directory allocation function (damon_sysfs_scheme_regions_alloc()) is not handling the memory allocation failure. In that case, the code will dereference a NULL pointer. Handle the failure to avoid such invalid access.
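Without the new check, the failed allocation is dereferenced immediately; a minimal sketch of the fixed allocator (mirroring the diff below):

	regions = kmalloc(sizeof(*regions), GFP_KERNEL);
	if (!regions)
		return NULL;	/* let the caller handle the failure */
	regions->kobj = (struct kobject){};	/* NULL dereference without the check */
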
Link: https://lkml.kernel.org/r/20231106233408.51159-3-sj@kernel.org Fixes: 9277d0367ba1 ("mm/damon/sysfs-schemes: implement scheme region directory") Signed-off-by: SeongJae Park Cc: [6.2+] Signed-off-by: Andrew Morton (cherry picked from commit 84055688b6bc075c92a88e2d6c3ad26ab93919f9) Bug: 300502883 Change-Id: I86ecb2f3cf1604199b5567576b1fa583914f7f36 Signed-off-by: cui yangpei --- mm/damon/sysfs-schemes.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 6df715f113ae..b79b7c0eb9d0 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -162,6 +162,9 @@ damon_sysfs_scheme_regions_alloc(void) struct damon_sysfs_scheme_regions *regions = kmalloc(sizeof(*regions), GFP_KERNEL); + if (!regions) + return NULL; + regions->kobj = (struct kobject){}; INIT_LIST_HEAD(&regions->regions_list); regions->nr_regions = 0; From 31c59d59c7a7963e0ceec6c35f166d19ebabf8fd Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 6 Nov 2023 23:34:08 +0000 Subject: [PATCH 050/139] UPSTREAM: mm/damon/sysfs-schemes: handle tried region directory allocation failure DAMON sysfs interface's before_damos_apply callback (damon_sysfs_before_damos_apply()), which creates the DAMOS tried regions for each DAMOS action applied region, is not handling the allocation failure for the sysfs directory data. As a result, a NULL pointer dereference is possible. Fix it by handling the case. Link: https://lkml.kernel.org/r/20231106233408.51159-4-sj@kernel.org Fixes: f1d13cacabe1 ("mm/damon/sysfs: implement DAMOS tried regions update command") Signed-off-by: SeongJae Park Cc: [6.2+] Signed-off-by: Andrew Morton (cherry picked from commit ae636ae2bbfd9279f5681dbf320d1da817e52b68) Bug: 300502883 Change-Id: I98568f4b0cee9fea82f4fe6d3e7a505370c3c304 Signed-off-by: cui yangpei --- mm/damon/sysfs-schemes.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index b79b7c0eb9d0..12b2c903b0a0 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -1344,6 +1344,8 @@ static int damon_sysfs_before_damos_apply(struct damon_ctx *ctx, return 0; region = damon_sysfs_scheme_region_alloc(r); + if (!region) + return 0; list_add_tail(&region->list, &sysfs_regions->regions_list); sysfs_regions->nr_regions++; if (kobject_init_and_add(&region->kobj, From cee8ebf7c50810d851430ab046eb25f3a56bbac1 Mon Sep 17 00:00:00 2001 From: cuiyangpei Date: Wed, 6 Dec 2023 21:03:17 +0800 Subject: [PATCH 051/139] ANDROID: GKI: build damon for monitoring virtual address spaces Enable DAMON related configs in gki_defconfig.
Bug: 300502883 Change-Id: Ie00a923464d2f1fff8f12a8804cbac040f0cacdf Signed-off-by: cuiyangpei --- arch/arm64/configs/gki_defconfig | 3 +++ arch/x86/configs/gki_defconfig | 3 +++ 2 files changed, 6 insertions(+) diff --git a/arch/arm64/configs/gki_defconfig b/arch/arm64/configs/gki_defconfig index abf7b41cdc85..38c90d04264e 100644 --- a/arch/arm64/configs/gki_defconfig +++ b/arch/arm64/configs/gki_defconfig @@ -123,6 +123,9 @@ CONFIG_ANON_VMA_NAME=y CONFIG_USERFAULTFD=y CONFIG_LRU_GEN=y CONFIG_LRU_GEN_ENABLED=y +CONFIG_DAMON=y +CONFIG_DAMON_VADDR=y +CONFIG_DAMON_SYSFS=y CONFIG_NET=y CONFIG_PACKET=y CONFIG_UNIX=y diff --git a/arch/x86/configs/gki_defconfig b/arch/x86/configs/gki_defconfig index 34a85587c4f4..22797e912979 100644 --- a/arch/x86/configs/gki_defconfig +++ b/arch/x86/configs/gki_defconfig @@ -118,6 +118,9 @@ CONFIG_ANON_VMA_NAME=y CONFIG_USERFAULTFD=y CONFIG_LRU_GEN=y CONFIG_LRU_GEN_ENABLED=y +CONFIG_DAMON=y +CONFIG_DAMON_VADDR=y +CONFIG_DAMON_SYSFS=y CONFIG_NET=y CONFIG_PACKET=y CONFIG_UNIX=y From efa8f34b5aa8a380bd397baba5a88e0a21592795 Mon Sep 17 00:00:00 2001 From: John Scheible Date: Wed, 13 Dec 2023 15:29:57 -0800 Subject: [PATCH 052/139] ANDROID: Update the ABI symbol list Adding the following symbols: - dma_fence_enable_sw_signaling - dma_fence_unwrap_first - __dma_fence_unwrap_merge - dma_fence_unwrap_next 3 function symbol(s) added 'struct dma_fence* __dma_fence_unwrap_merge(unsigned int, struct dma_fence**, struct dma_fence_unwrap*)' 'struct dma_fence* dma_fence_unwrap_first(struct dma_fence*, struct dma_fence_unwrap*)' 'struct dma_fence* dma_fence_unwrap_next(struct dma_fence_unwrap*)' Bug: 316212868 Change-Id: I41a4d906e98c983c4b612f65127bd7ef7ac5cb85 Signed-off-by: John Scheible --- android/abi_gki_aarch64.stg | 75 +++++++++++++++++++++++++++++++++++ android/abi_gki_aarch64_pixel | 4 ++ 2 files changed, 79 insertions(+) diff --git a/android/abi_gki_aarch64.stg b/android/abi_gki_aarch64.stg index 0ceacc54c6de..68cf985124c9 100644 --- a/android/abi_gki_aarch64.stg +++ b/android/abi_gki_aarch64.stg @@ -3098,6 +3098,11 @@ pointer_reference { kind: POINTER pointee_type_id: 0x07b1db01 } +pointer_reference { + id: 0x0b7d7682 + kind: POINTER + pointee_type_id: 0x07b53c95 +} pointer_reference { id: 0x0b85846c kind: POINTER @@ -49124,6 +49129,12 @@ member { name: "array" type_id: 0x38d23361 } +member { + id: 0xdfa4a7f6 + name: "array" + type_id: 0x030b9acf + offset: 64 +} member { id: 0xdfcad7c4 name: "array" @@ -59752,6 +59763,11 @@ member { type_id: 0x1a8b04e5 offset: 384 } +member { + id: 0x15741053 + name: "chain" + type_id: 0x030b9acf +} member { id: 0x15798222 name: "chain" @@ -224625,6 +224641,17 @@ struct_union { member_id: 0x14d0dfac } } +struct_union { + id: 0x07b53c95 + kind: STRUCT + name: "dma_fence_unwrap" + definition { + bytesize: 24 + member_id: 0x15741053 + member_id: 0xdfa4a7f6 + member_id: 0xad7c841b + } +} struct_union { id: 0x7f49bdff kind: STRUCT @@ -328631,6 +328658,13 @@ function { parameter_id: 0xc9082b19 parameter_id: 0xc9082b19 } +function { + id: 0xce0d9e89 + return_type_id: 0x030b9acf + parameter_id: 0x4585663f + parameter_id: 0x0a52df14 + parameter_id: 0x0b7d7682 +} function { id: 0xce0dc24b return_type_id: 0x4585663f @@ -329053,6 +329087,11 @@ function { parameter_id: 0x1e9745d3 parameter_id: 0x1e9745d3 } +function { + id: 0xdd980e87 + return_type_id: 0x030b9acf + parameter_id: 0x0b7d7682 +} function { id: 0xddb49ff7 return_type_id: 0x3ae3ff84 @@ -329212,6 +329251,12 @@ function { return_type_id: 0x030b9acf parameter_id: 0x030b9acf } +function 
{ + id: 0xdfa8404e + return_type_id: 0x030b9acf + parameter_id: 0x030b9acf + parameter_id: 0x0b7d7682 +} function { id: 0xdfba2774 return_type_id: 0x4585663f @@ -332379,6 +332424,15 @@ elf_symbol { type_id: 0x4058e56a full_name: "__devres_alloc_node" } +elf_symbol { + id: 0xfa3b077f + name: "__dma_fence_unwrap_merge" + is_defined: true + symbol_type: FUNCTION + crc: 0xd88defca + type_id: 0xce0d9e89 + full_name: "__dma_fence_unwrap_merge" +} elf_symbol { id: 0x0a6e3e89 name: "__dma_request_channel" @@ -354070,6 +354124,24 @@ elf_symbol { type_id: 0x9d05158e full_name: "dma_fence_signal_timestamp_locked" } +elf_symbol { + id: 0x2012ba51 + name: "dma_fence_unwrap_first" + is_defined: true + symbol_type: FUNCTION + crc: 0xc3cd6929 + type_id: 0xdfa8404e + full_name: "dma_fence_unwrap_first" +} +elf_symbol { + id: 0xf5f318e9 + name: "dma_fence_unwrap_next" + is_defined: true + symbol_type: FUNCTION + crc: 0xd13e4af8 + type_id: 0xdd980e87 + full_name: "dma_fence_unwrap_next" +} elf_symbol { id: 0xf18ac584 name: "dma_fence_wait_any_timeout" @@ -398114,6 +398186,7 @@ interface { symbol_id: 0x279e51a3 symbol_id: 0xe78c29b1 symbol_id: 0x95c24824 + symbol_id: 0xfa3b077f symbol_id: 0x0a6e3e89 symbol_id: 0x347a699c symbol_id: 0x27ce6aa1 @@ -400523,6 +400596,8 @@ interface { symbol_id: 0xe2a2feec symbol_id: 0x904cad71 symbol_id: 0x2b7d2f8e + symbol_id: 0x2012ba51 + symbol_id: 0xf5f318e9 symbol_id: 0xf18ac584 symbol_id: 0x7ffe50b7 symbol_id: 0x3b69b427 diff --git a/android/abi_gki_aarch64_pixel b/android/abi_gki_aarch64_pixel index 7153beba5ba6..b7d8f1baa215 100644 --- a/android/abi_gki_aarch64_pixel +++ b/android/abi_gki_aarch64_pixel @@ -544,12 +544,16 @@ dma_fence_array_create dma_fence_context_alloc dma_fence_default_wait + dma_fence_enable_sw_signaling dma_fence_get_status dma_fence_init dma_fence_release dma_fence_remove_callback dma_fence_signal dma_fence_signal_locked + dma_fence_unwrap_first + __dma_fence_unwrap_merge + dma_fence_unwrap_next dma_fence_wait_timeout dma_free_attrs dma_free_pages From f44d373b32b144822b89eb24e38f4e66f2663281 Mon Sep 17 00:00:00 2001 From: Rick Yiu Date: Fri, 5 May 2023 14:31:36 +0000 Subject: [PATCH 053/139] ANDROID: sched: Add trace_android_rvh_setscheduler Sync to android13-5.10. This vendor hook is declared already. 
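For illustration, a vendor module would attach to the restricted hook roughly as in this minimal sketch (the handler name is hypothetical; the registration helper is generated by the DECLARE_RESTRICTED_HOOK() declaration):

	static void oem_setscheduler_handler(void *data, struct task_struct *p)
	{
		/* vendor-specific bookkeeping for the task whose policy changed */
	}

	/* module init; restricted vendor hooks cannot be unregistered later */
	register_trace_android_rvh_setscheduler(oem_setscheduler_handler, NULL);
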
Bug: 245675204 Change-Id: Ib081b52542380d22317f225a50b553cda5f2634c Signed-off-by: Rick Yiu (cherry picked from commit f9688670ca92bf9f46f9a6134bc69623c30bbb23) --- kernel/sched/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8a1564c2268f..fc3f3dad20c3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7837,6 +7837,7 @@ change: if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) { __setscheduler_params(p, attr); __setscheduler_prio(p, newprio); + trace_android_rvh_setscheduler(p); } __setscheduler_uclamp(p, attr); From 07775f9683d486f456e051dafa5871af18609ada Mon Sep 17 00:00:00 2001 From: Kever Yang Date: Thu, 14 Dec 2023 18:42:50 +0800 Subject: [PATCH 054/139] ANDROID: GKI: Add symbols for rockchip sata INFO: 24 function symbol(s) added 'size_t __scsi_format_command(char*, size_t, const unsigned char*, size_t)' 'int attribute_container_register(struct attribute_container*)' 'int attribute_container_unregister(struct attribute_container*)' 'void pci_intx(struct pci_dev*, int)' 'int pcim_iomap_regions_request_all(struct pci_dev*, int, const char*)' 'void pcim_pin_device(struct pci_dev*)' 'int reset_control_rearm(struct reset_control*)' 'enum scsi_disposition scsi_check_sense(struct scsi_cmnd*)' 'int scsi_device_set_state(struct scsi_device*, enum scsi_device_state)' 'void scsi_eh_finish_cmd(struct scsi_cmnd*, struct list_head*)' 'void scsi_eh_flush_done_q(struct list_head*)' 'int scsi_rescan_device(struct scsi_device*)' 'void scsi_schedule_eh(struct Scsi_Host*)' 'const u8* scsi_sense_desc_find(const u8*, int, int)' 'int scsi_set_sense_field_pointer(u8*, int, u16, u8, bool)' 'void sdev_evt_send_simple(struct scsi_device*, enum scsi_device_event, gfp_t)' 'bool system_entering_hibernation()' 'int transport_add_device(struct device*)' 'int transport_class_register(struct transport_class*)' 'void transport_class_unregister(struct transport_class*)' 'void transport_configure_device(struct device*)' 'void transport_destroy_device(struct device*)' 'void transport_remove_device(struct device*)' 'void transport_setup_device(struct device*)' Bug: 300024866 Change-Id: I6a505d48d0d199a710b0d93b6a8df189735a7b89 Signed-off-by: Kever Yang --- android/abi_gki_aarch64.stg | 411 +++++++++++++++++++++++++++++++ android/abi_gki_aarch64_rockchip | 88 ++++++- 2 files changed, 497 insertions(+), 2 deletions(-) diff --git a/android/abi_gki_aarch64.stg b/android/abi_gki_aarch64.stg index 68cf985124c9..e570ab134873 100644 --- a/android/abi_gki_aarch64.stg +++ b/android/abi_gki_aarch64.stg @@ -8268,6 +8268,11 @@ pointer_reference { kind: POINTER pointee_type_id: 0x6255e5da } +pointer_reference { + id: 0x120b1632 + kind: POINTER + pointee_type_id: 0x626cbe56 +} pointer_reference { id: 0x12191e2a kind: POINTER @@ -10833,6 +10838,11 @@ pointer_reference { kind: POINTER pointee_type_id: 0x52606d54 } +pointer_reference { + id: 0x1e0dbd15 + kind: POINTER + pointee_type_id: 0x527612cb +} pointer_reference { id: 0x1e20e7eb kind: POINTER @@ -15948,6 +15958,11 @@ pointer_reference { kind: POINTER pointee_type_id: 0x990a74b4 } +pointer_reference { + id: 0x2cd2cd79 + kind: POINTER + pointee_type_id: 0x990bd378 +} pointer_reference { id: 0x2cd31328 kind: POINTER @@ -61382,6 +61397,11 @@ member { type_id: 0xe62ebf07 offset: 128 } +member { + id: 0x86c22df0 + name: "class" + type_id: 0x83714889 +} member { id: 0x86f2bb02 name: "class" @@ -64796,6 +64816,12 @@ member { type_id: 0x2d7be27a offset: 448 } +member { + id: 0x73c32817 + name: "configure" + type_id: 
0x2cd2cd79 + offset: 1280 +} member { id: 0x73c333e5 name: "configure" @@ -162517,6 +162543,12 @@ member { type_id: 0x2da2fbac offset: 704 } +member { + id: 0xb48f08f4 + name: "remove" + type_id: 0x2cd2cd79 + offset: 1344 +} member { id: 0xb48fbf27 name: "remove" @@ -175459,6 +175491,12 @@ member { type_id: 0x2cdc0ac8 offset: 9088 } +member { + id: 0x84e59dd8 + name: "setup" + type_id: 0x2cd2cd79 + offset: 1216 +} member { id: 0x84e68e26 name: "setup" @@ -260350,6 +260388,18 @@ struct_union { member_id: 0xba11b0ec } } +struct_union { + id: 0x527612cb + kind: STRUCT + name: "transport_class" + definition { + bytesize: 176 + member_id: 0x86c22df0 + member_id: 0x84e59dd8 + member_id: 0x73c32817 + member_id: 0xb48f08f4 + } +} struct_union { id: 0x626cbe56 kind: STRUCT @@ -282999,6 +283049,57 @@ enumeration { } } } +enumeration { + id: 0xd7ffc9ea + name: "scsi_device_event" + definition { + underlying_type_id: 0x4585663f + enumerator { + name: "SDEV_EVT_MEDIA_CHANGE" + value: 1 + } + enumerator { + name: "SDEV_EVT_INQUIRY_CHANGE_REPORTED" + value: 2 + } + enumerator { + name: "SDEV_EVT_CAPACITY_CHANGE_REPORTED" + value: 3 + } + enumerator { + name: "SDEV_EVT_SOFT_THRESHOLD_REACHED_REPORTED" + value: 4 + } + enumerator { + name: "SDEV_EVT_MODE_PARAMETER_CHANGE_REPORTED" + value: 5 + } + enumerator { + name: "SDEV_EVT_LUN_CHANGE_REPORTED" + value: 6 + } + enumerator { + name: "SDEV_EVT_ALUA_STATE_CHANGE_REPORTED" + value: 7 + } + enumerator { + name: "SDEV_EVT_POWER_ON_RESET_OCCURRED" + value: 8 + } + enumerator { + name: "SDEV_EVT_FIRST" + value: 1 + } + enumerator { + name: "SDEV_EVT_LAST" + value: 8 + } + enumerator { + name: "SDEV_EVT_MAXBITS" + value: 9 + } + } +} enumeration { id: 0xdf9e95f6 name: "scsi_device_state" @@ -292974,6 +293075,11 @@ function { parameter_id: 0x1d44326e parameter_id: 0x34d3469d } +function { + id: 0x171b3ed6 + return_type_id: 0x48b5725f + parameter_id: 0x1e0dbd15 +} function { id: 0x171c8621 return_type_id: 0xd5cc9c9a @@ -294775,6 +294881,13 @@ function { parameter_id: 0x1259e377 parameter_id: 0xe276adef } +function { + id: 0x1aa483a8 + return_type_id: 0x48b5725f + parameter_id: 0x257d12af + parameter_id: 0xd7ffc9ea + parameter_id: 0xf1a6dfed +} function { id: 0x1aa56a0d return_type_id: 0x48b5725f @@ -298446,6 +298559,12 @@ function { parameter_id: 0x3e10b518 parameter_id: 0xa52a0930 } +function { + id: 0x1f84fe6e + return_type_id: 0x48b5725f + parameter_id: 0x3f949c69 + parameter_id: 0x3e6239e1 +} function { id: 0x1f85d3ef return_type_id: 0x48b5725f @@ -302490,6 +302609,11 @@ function { parameter_id: 0xe276adef parameter_id: 0xc93e017b } +function { + id: 0x62985582 + return_type_id: 0x34cf6c51 + parameter_id: 0x3f949c69 +} function { id: 0x62b8d7ec return_type_id: 0x09427c40 @@ -311320,6 +311444,14 @@ function { return_type_id: 0x6720d32f parameter_id: 0x21069feb } +function { + id: 0x95c2268d + return_type_id: 0xf435685e + parameter_id: 0x0483e6f8 + parameter_id: 0xf435685e + parameter_id: 0x384c5795 + parameter_id: 0xf435685e +} function { id: 0x95c3652e return_type_id: 0x6720d32f @@ -312535,6 +312667,12 @@ function { parameter_id: 0xf1a6dfed parameter_id: 0x0292b875 } +function { + id: 0x97a1ddd3 + return_type_id: 0x6720d32f + parameter_id: 0x257d12af + parameter_id: 0xdf9e95f6 +} function { id: 0x97a3c07a return_type_id: 0x6720d32f @@ -314085,6 +314223,13 @@ function { parameter_id: 0x6720d32f parameter_id: 0x064d6086 } +function { + id: 0x990bd378 + return_type_id: 0x6720d32f + parameter_id: 0x120b1632 + parameter_id: 0x0258f96e + parameter_id: 0x0258f96e +} 
function { id: 0x99132caa return_type_id: 0x6720d32f @@ -315160,6 +315305,11 @@ function { parameter_id: 0x1d19a9d5 parameter_id: 0x310ec01d } +function { + id: 0x9a038c6a + return_type_id: 0x6720d32f + parameter_id: 0x1e0dbd15 +} function { id: 0x9a03c4d6 return_type_id: 0x6720d32f @@ -320536,6 +320686,15 @@ function { parameter_id: 0x6720d32f parameter_id: 0x92233392 } +function { + id: 0x9c09446b + return_type_id: 0x6720d32f + parameter_id: 0x00c72527 + parameter_id: 0x6720d32f + parameter_id: 0x914dbfdc + parameter_id: 0x295c7202 + parameter_id: 0x6d7f5ff6 +} function { id: 0x9c09d6aa return_type_id: 0x6720d32f @@ -325996,6 +326155,11 @@ function { parameter_id: 0x0cf3d8fe parameter_id: 0x4585663f } +function { + id: 0x9faad4c6 + return_type_id: 0x6720d32f + parameter_id: 0x08a8dfa4 +} function { id: 0x9fab680a return_type_id: 0x6720d32f @@ -329000,6 +329164,13 @@ function { return_type_id: 0x02eb105a parameter_id: 0x3e10b518 } +function { + id: 0xd981a35c + return_type_id: 0x3f0185ef + parameter_id: 0x3f0185ef + parameter_id: 0x6720d32f + parameter_id: 0x6720d32f +} function { id: 0xd9bb2b92 return_type_id: 0x4585663f @@ -334324,6 +334495,15 @@ elf_symbol { type_id: 0xa017504e full_name: "__scsi_device_lookup_by_target" } +elf_symbol { + id: 0xe18b6ee8 + name: "__scsi_format_command" + is_defined: true + symbol_type: FUNCTION + crc: 0x93022ba6 + type_id: 0x95c2268d + full_name: "__scsi_format_command" +} elf_symbol { id: 0x0166be18 name: "__scsi_iterate_devices" @@ -344204,6 +344384,24 @@ elf_symbol { type_id: 0x9048c0ea full_name: "atomic_notifier_chain_unregister" } +elf_symbol { + id: 0x41765c03 + name: "attribute_container_register" + is_defined: true + symbol_type: FUNCTION + crc: 0x167c84c3 + type_id: 0x9faad4c6 + full_name: "attribute_container_register" +} +elf_symbol { + id: 0xcd05507b + name: "attribute_container_unregister" + is_defined: true + symbol_type: FUNCTION + crc: 0x15baabca + type_id: 0x9faad4c6 + full_name: "attribute_container_unregister" +} elf_symbol { id: 0x82786c66 name: "autoremove_wake_function" @@ -375088,6 +375286,15 @@ elf_symbol { type_id: 0x93acae9b full_name: "pci_host_probe" } +elf_symbol { + id: 0xec0d5441 + name: "pci_intx" + is_defined: true + symbol_type: FUNCTION + crc: 0x4e899f5e + type_id: 0x157d734c + full_name: "pci_intx" +} elf_symbol { id: 0x9c6c58ea name: "pci_iomap" @@ -375628,6 +375835,15 @@ elf_symbol { type_id: 0x986a45dd full_name: "pcim_iomap_regions" } +elf_symbol { + id: 0xae61b91f + name: "pcim_iomap_regions_request_all" + is_defined: true + symbol_type: FUNCTION + crc: 0xd111489f + type_id: 0x986a45dd + full_name: "pcim_iomap_regions_request_all" +} elf_symbol { id: 0xc37c9a74 name: "pcim_iomap_table" @@ -375646,6 +375862,15 @@ elf_symbol { type_id: 0x157d734c full_name: "pcim_iounmap_regions" } +elf_symbol { + id: 0xfa9dbeca + name: "pcim_pin_device" + is_defined: true + symbol_type: FUNCTION + crc: 0xfdffeca6 + type_id: 0x14e1f000 + full_name: "pcim_pin_device" +} elf_symbol { id: 0x123cd197 name: "pcpu_nr_pages" @@ -380920,6 +381145,15 @@ elf_symbol { type_id: 0x1a9c8a01 full_name: "reset_control_put" } +elf_symbol { + id: 0x642147cd + name: "reset_control_rearm" + is_defined: true + symbol_type: FUNCTION + crc: 0x5d2bc42a + type_id: 0x978438bd + full_name: "reset_control_rearm" +} elf_symbol { id: 0x9c7a2d6c name: "reset_control_release" @@ -382369,6 +382603,15 @@ elf_symbol { type_id: 0x954324c8 full_name: "scsi_change_queue_depth" } +elf_symbol { + id: 0xebec291e + name: "scsi_check_sense" + is_defined: true + 
symbol_type: FUNCTION + crc: 0x05404117 + type_id: 0x62985582 + full_name: "scsi_check_sense" +} elf_symbol { id: 0xe4036f2e name: "scsi_cmd_allowed" @@ -382432,6 +382675,15 @@ elf_symbol { type_id: 0x19c71538 full_name: "scsi_device_resume" } +elf_symbol { + id: 0x55968d64 + name: "scsi_device_set_state" + is_defined: true + symbol_type: FUNCTION + crc: 0x3e16b971 + type_id: 0x97a1ddd3 + full_name: "scsi_device_set_state" +} elf_symbol { id: 0xf10245da name: "scsi_dma_map" @@ -382459,6 +382711,24 @@ elf_symbol { type_id: 0x1f7d7689 full_name: "scsi_done" } +elf_symbol { + id: 0xb77321e1 + name: "scsi_eh_finish_cmd" + is_defined: true + symbol_type: FUNCTION + crc: 0x8e5f7b03 + type_id: 0x1f84fe6e + full_name: "scsi_eh_finish_cmd" +} +elf_symbol { + id: 0xe584e576 + name: "scsi_eh_flush_done_q" + is_defined: true + symbol_type: FUNCTION + crc: 0xf811e69d + type_id: 0x1f00dfeb + full_name: "scsi_eh_flush_done_q" +} elf_symbol { id: 0x8ef5c221 name: "scsi_execute_cmd" @@ -382585,6 +382855,15 @@ elf_symbol { type_id: 0x14f27dac full_name: "scsi_report_bus_reset" } +elf_symbol { + id: 0x24093af7 + name: "scsi_rescan_device" + is_defined: true + symbol_type: FUNCTION + crc: 0x83fa9f1b + type_id: 0x94dfa784 + full_name: "scsi_rescan_device" +} elf_symbol { id: 0xc9021692 name: "scsi_scan_host" @@ -382594,6 +382873,33 @@ elf_symbol { type_id: 0x156efee0 full_name: "scsi_scan_host" } +elf_symbol { + id: 0x51e78cea + name: "scsi_schedule_eh" + is_defined: true + symbol_type: FUNCTION + crc: 0xd78a6752 + type_id: 0x156efee0 + full_name: "scsi_schedule_eh" +} +elf_symbol { + id: 0x9489f8a9 + name: "scsi_sense_desc_find" + is_defined: true + symbol_type: FUNCTION + crc: 0x10d9f885 + type_id: 0xd981a35c + full_name: "scsi_sense_desc_find" +} +elf_symbol { + id: 0x494ae459 + name: "scsi_set_sense_field_pointer" + is_defined: true + symbol_type: FUNCTION + crc: 0x3ab7b1cc + type_id: 0x9c09446b + full_name: "scsi_set_sense_field_pointer" +} elf_symbol { id: 0xcf17c9a6 name: "scsi_set_sense_information" @@ -382612,6 +382918,15 @@ elf_symbol { type_id: 0x156efee0 full_name: "scsi_unblock_requests" } +elf_symbol { + id: 0xe6808261 + name: "sdev_evt_send_simple" + is_defined: true + symbol_type: FUNCTION + crc: 0x1727f774 + type_id: 0x1aa483a8 + full_name: "sdev_evt_send_simple" +} elf_symbol { id: 0x771aea1d name: "sdev_prefix_printk" @@ -388511,6 +388826,15 @@ elf_symbol { type_id: 0x599826a1 full_name: "system_32bit_el0_cpumask" } +elf_symbol { + id: 0x991b4bfd + name: "system_entering_hibernation" + is_defined: true + symbol_type: FUNCTION + crc: 0x13f42152 + type_id: 0xfea45b04 + full_name: "system_entering_hibernation" +} elf_symbol { id: 0xb5701f35 name: "system_freezable_power_efficient_wq" @@ -389546,6 +389870,69 @@ elf_symbol { type_id: 0x10985193 full_name: "tracing_off" } +elf_symbol { + id: 0x8f8403dc + name: "transport_add_device" + is_defined: true + symbol_type: FUNCTION + crc: 0x9d7e8343 + type_id: 0x9d16dd74 + full_name: "transport_add_device" +} +elf_symbol { + id: 0x5911125b + name: "transport_class_register" + is_defined: true + symbol_type: FUNCTION + crc: 0x071cb3f2 + type_id: 0x9a038c6a + full_name: "transport_class_register" +} +elf_symbol { + id: 0x113cbc59 + name: "transport_class_unregister" + is_defined: true + symbol_type: FUNCTION + crc: 0xce941924 + type_id: 0x171b3ed6 + full_name: "transport_class_unregister" +} +elf_symbol { + id: 0x7640c32b + name: "transport_configure_device" + is_defined: true + symbol_type: FUNCTION + crc: 0x106dd54f + type_id: 0x100e6fc8 + full_name: 
"transport_configure_device" +} +elf_symbol { + id: 0xc0be90d8 + name: "transport_destroy_device" + is_defined: true + symbol_type: FUNCTION + crc: 0x1870a351 + type_id: 0x100e6fc8 + full_name: "transport_destroy_device" +} +elf_symbol { + id: 0x09f20ac9 + name: "transport_remove_device" + is_defined: true + symbol_type: FUNCTION + crc: 0xcd97ee1a + type_id: 0x100e6fc8 + full_name: "transport_remove_device" +} +elf_symbol { + id: 0xd75a472d + name: "transport_setup_device" + is_defined: true + symbol_type: FUNCTION + crc: 0x66ba89d2 + type_id: 0x100e6fc8 + full_name: "transport_setup_device" +} elf_symbol { id: 0x3f07269b name: "truncate_inode_pages" @@ -398397,6 +398784,7 @@ interface { symbol_id: 0x99aa632e symbol_id: 0xe68925b8 symbol_id: 0x6e3bb1cf + symbol_id: 0xe18b6ee8 symbol_id: 0x0166be18 symbol_id: 0xc5953732 symbol_id: 0x1d4d84d0 @@ -399495,6 +399883,8 @@ interface { symbol_id: 0x5f6a1554 symbol_id: 0x3beebbde symbol_id: 0x24064426 + symbol_id: 0x41765c03 + symbol_id: 0xcd05507b symbol_id: 0x82786c66 symbol_id: 0xd772fde3 symbol_id: 0x1abdc14f @@ -402925,6 +403315,7 @@ interface { symbol_id: 0x9ac8ef20 symbol_id: 0x35c96922 symbol_id: 0xbe6406c3 + symbol_id: 0xec0d5441 symbol_id: 0x9c6c58ea symbol_id: 0x2fefe933 symbol_id: 0x1c994923 @@ -402985,8 +403376,10 @@ interface { symbol_id: 0xffa3ecd1 symbol_id: 0x42595f98 symbol_id: 0xd085753f + symbol_id: 0xae61b91f symbol_id: 0xc37c9a74 symbol_id: 0xd03f3f09 + symbol_id: 0xfa9dbeca symbol_id: 0x123cd197 symbol_id: 0xe57e5e73 symbol_id: 0x8ba9d028 @@ -403573,6 +403966,7 @@ interface { symbol_id: 0x57ee69c1 symbol_id: 0xd76b82b2 symbol_id: 0x30c7b7f4 + symbol_id: 0x642147cd symbol_id: 0x9c7a2d6c symbol_id: 0x48fc2cb6 symbol_id: 0xd41c441b @@ -403734,6 +404128,7 @@ interface { symbol_id: 0xd3148537 symbol_id: 0xb5b25b58 symbol_id: 0x278a6b59 + symbol_id: 0xebec291e symbol_id: 0xe4036f2e symbol_id: 0xd49d7abc symbol_id: 0x76dea2aa @@ -403741,9 +404136,12 @@ interface { symbol_id: 0x474e9bcc symbol_id: 0x61df84bc symbol_id: 0x054c0bba + symbol_id: 0x55968d64 symbol_id: 0xf10245da symbol_id: 0x18cbd7f9 symbol_id: 0x30f6b9b1 + symbol_id: 0xb77321e1 + symbol_id: 0xe584e576 symbol_id: 0x8ef5c221 symbol_id: 0x32b196e0 symbol_id: 0x022517f0 @@ -403758,9 +404156,14 @@ interface { symbol_id: 0x42390c70 symbol_id: 0x8deacb1d symbol_id: 0x2e407415 + symbol_id: 0x24093af7 symbol_id: 0xc9021692 + symbol_id: 0x51e78cea + symbol_id: 0x9489f8a9 + symbol_id: 0x494ae459 symbol_id: 0xcf17c9a6 symbol_id: 0x9c54c873 + symbol_id: 0xe6808261 symbol_id: 0x771aea1d symbol_id: 0x8d3c4841 symbol_id: 0xf399cd48 @@ -404417,6 +404820,7 @@ interface { symbol_id: 0xda44819e symbol_id: 0x46cd3193 symbol_id: 0xb6c44fb1 + symbol_id: 0x991b4bfd symbol_id: 0xb5701f35 symbol_id: 0xeeb4dc4c symbol_id: 0x314b4b2e @@ -404532,6 +404936,13 @@ interface { symbol_id: 0x3df2f359 symbol_id: 0x33172d21 symbol_id: 0x54bbaa46 + symbol_id: 0x8f8403dc + symbol_id: 0x5911125b + symbol_id: 0x113cbc59 + symbol_id: 0x7640c32b + symbol_id: 0xc0be90d8 + symbol_id: 0x09f20ac9 + symbol_id: 0xd75a472d symbol_id: 0x3f07269b symbol_id: 0x3c7c6ce9 symbol_id: 0x7a43283c diff --git a/android/abi_gki_aarch64_rockchip b/android/abi_gki_aarch64_rockchip index 8fdda5ad35fb..0010cf2300b6 100644 --- a/android/abi_gki_aarch64_rockchip +++ b/android/abi_gki_aarch64_rockchip @@ -2,6 +2,7 @@ # commonly used symbols add_timer alloc_chrdev_region + alloc_etherdev_mqs alloc_iova_fast __alloc_pages __alloc_skb @@ -827,9 +828,25 @@ param_ops_int param_ops_string param_ops_uint + param_ops_ulong + pci_disable_device + 
pci_disable_link_state pcie_capability_clear_and_set_word + pci_find_capability + pcim_enable_device + pcim_iomap_table + pcim_pin_device + pci_read_config_byte pci_read_config_dword + pci_read_config_word + __pci_register_driver + pci_restore_state + pci_save_state + pci_set_master + pci_set_power_state + pci_unregister_driver pci_write_config_dword + pci_write_config_word __per_cpu_offset perf_trace_buf_alloc perf_trace_run_bpf_submit @@ -1023,7 +1040,11 @@ sched_set_fifo schedule schedule_timeout + schedule_timeout_uninterruptible scnprintf + scsi_command_size_tbl + scsi_device_get + scsi_device_put __sdhci_add_host sdhci_cleanup_host sdhci_enable_clk @@ -1325,6 +1346,7 @@ vunmap vzalloc wait_for_completion + wait_for_completion_interruptible wait_for_completion_timeout __wake_up wake_up_process @@ -1346,15 +1368,23 @@ skcipher_walk_aead_decrypt skcipher_walk_aead_encrypt +# required by ahci.ko + pci_alloc_irq_vectors_affinity + pci_free_irq_vectors + pci_intx + pci_irq_vector + pci_match_id + pcim_iomap_regions_request_all + sysfs_add_file_to_group + sysfs_remove_file_from_group + # required by analogix_dp.ko drm_atomic_get_old_connector_for_encoder # required by aspm_ext.ko - pci_find_capability pci_find_ext_capability # required by bcmdhd.ko - alloc_etherdev_mqs cpu_bit_bitmap down_interruptible down_timeout @@ -1873,6 +1903,60 @@ # required by ledtrig-heartbeat.ko avenrun +# required by libahci.ko + __printk_ratelimit + +# required by libahci_platform.ko + reset_control_rearm + +# required by libata.ko + async_schedule_node + async_synchronize_cookie + attribute_container_register + attribute_container_unregister + autoremove_wake_function + blk_abort_request + blk_queue_max_hw_sectors + blk_queue_max_segments + blk_queue_update_dma_alignment + blk_queue_update_dma_pad + glob_match + pci_bus_type + pcim_iomap_regions + prepare_to_wait + __scsi_add_device + scsi_add_host_with_dma + scsi_build_sense + scsi_change_queue_depth + scsi_check_sense + scsi_device_set_state + scsi_done + scsi_eh_finish_cmd + scsi_eh_flush_done_q + scsi_execute_cmd + __scsi_format_command + scsi_host_alloc + scsi_host_put + scsi_remove_device + scsi_remove_host + scsi_rescan_device + scsi_schedule_eh + scsi_sense_desc_find + scsi_set_sense_field_pointer + scsi_set_sense_information + sdev_evt_send_simple + system_entering_hibernation + trace_seq_printf + trace_seq_putc + transport_add_device + transport_class_register + transport_class_unregister + transport_configure_device + transport_destroy_device + transport_remove_device + transport_setup_device + vscnprintf + # required by mac80211.ko alloc_netdev_mqs __alloc_percpu_gfp From 613d8368e39f4cd67f2ad629ebb870d3e3b4bc6e Mon Sep 17 00:00:00 2001 From: Paul Lawrence Date: Wed, 2 Aug 2023 12:23:44 -0700 Subject: [PATCH 055/139] ANDROID: fuse-bpf: Follow mounts in lookups Bug: 292925770 Test: fuse_test run. 
The following steps on Android also now pass: Create /data/123 and /data/media/0/Android/data/45 directories Mount /data/123 directory to /data/media/0/Android/data/45 directory Create 1.txt under the /data/123 directory File 1.txt should appear in /storage/emulated/0/Android/data/45 Change-Id: I1fe27d743ca2981e624a9aa87d9ab6deb313aadc Signed-off-by: Paul Lawrence --- fs/fuse/backing.c | 15 ++++--- .../selftests/filesystems/fuse/bpf_loader.c | 28 +++++++++++- .../selftests/filesystems/fuse/fuse_test.c | 45 +++++++++++++++++++ .../selftests/filesystems/fuse/test_fuse.h | 3 ++ 4 files changed, 85 insertions(+), 6 deletions(-) diff --git a/fs/fuse/backing.c b/fs/fuse/backing.c index e16457c75944..6ca74987f7da 100644 --- a/fs/fuse/backing.c +++ b/fs/fuse/backing.c @@ -1117,7 +1117,6 @@ int fuse_lookup_backing(struct fuse_bpf_args *fa, struct inode *dir, struct kstat stat; int err; - /* TODO this will not handle lookups over mount points */ inode_lock_nested(dir_backing_inode, I_MUTEX_PARENT); backing_entry = lookup_one_len(entry->d_name.name, dir_backing_entry, strlen(entry->d_name.name)); @@ -1136,16 +1135,22 @@ int fuse_lookup_backing(struct fuse_bpf_args *fa, struct inode *dir, return 0; } + err = follow_down(&fuse_entry->backing_path); + if (err) + goto err_out; + err = vfs_getattr(&fuse_entry->backing_path, &stat, STATX_BASIC_STATS, 0); - if (err) { - path_put_init(&fuse_entry->backing_path); - return err; - } + if (err) + goto err_out; fuse_stat_to_attr(get_fuse_conn(dir), backing_entry->d_inode, &stat, &feo->attr); return 0; + +err_out: + path_put_init(&fuse_entry->backing_path); + return err; } int fuse_handle_backing(struct fuse_entry_bpf *feb, struct inode **backing_inode, diff --git a/tools/testing/selftests/filesystems/fuse/bpf_loader.c b/tools/testing/selftests/filesystems/fuse/bpf_loader.c index 5bf26eadd421..94f884c64d29 100644 --- a/tools/testing/selftests/filesystems/fuse/bpf_loader.c +++ b/tools/testing/selftests/filesystems/fuse/bpf_loader.c @@ -394,6 +394,29 @@ int s_rename(struct s oldpathname, struct s newpathname) return res; } +int s_mount(struct s source, struct s target, struct s filesystem, + unsigned long mountflags, struct s data) +{ + int res; + + res = mount(source.s, target.s, filesystem.s, mountflags, data.s); + free(source.s); + free(target.s); + free(filesystem.s); + free(data.s); + + return res; +} + +int s_umount(struct s target) +{ + int res; + + res = umount(target.s); + free(target.s); + return res; +} + int s_fuse_attr(struct s pathname, struct fuse_attr *fuse_attr_out) { @@ -574,7 +597,10 @@ static int mount_fuse_maybe_init(const char *mount_dir, int bpf_fd, int dir_fd, })); } - *fuse_dev_ptr = fuse_dev; + if (fuse_dev_ptr) + *fuse_dev_ptr = fuse_dev; + else + TESTSYSCALL(close(fuse_dev)); fuse_dev = -1; result = TEST_SUCCESS; out: diff --git a/tools/testing/selftests/filesystems/fuse/fuse_test.c b/tools/testing/selftests/filesystems/fuse/fuse_test.c index 528595a8e82f..ad24ed48853e 100644 --- a/tools/testing/selftests/filesystems/fuse/fuse_test.c +++ b/tools/testing/selftests/filesystems/fuse/fuse_test.c @@ -2114,6 +2114,50 @@ out: return result; } +/** + * Test that fuse passthrough correctly traverses a mount point on the lower fs + */ +static int bpf_test_follow_mounts(const char *mount_dir) +{ + const char *bind_src = "bind_src"; + const char *bind_dst = "bind_dst"; + const char *file = "file"; + int fd = -1; + int src_fd = -1; + int result = TEST_FAILURE; + + TESTSYSCALL(s_mkdir(s_path(s(ft_src), s(bind_src)), 0777)); + 
TESTSYSCALL(s_mkdir(s_path(s(ft_src), s(bind_dst)), 0777)); + TEST(fd = s_creat(s_pathn(3, s(ft_src), s(bind_src), s(file)), 0777), + fd != -1); + TESTSYSCALL(close(fd)); + fd = -1; + TESTSYSCALL(s_mount(s_path(s(ft_src), s(bind_src)), + s_path(s(ft_src), s(bind_dst)), + s(NULL), MS_BIND, s(NULL))); + TEST(src_fd = open(ft_src, O_DIRECTORY | O_RDONLY | O_CLOEXEC), + src_fd != -1); + TESTEQUAL(mount_fuse_no_init(mount_dir, -1, src_fd, NULL), 0); + TEST(fd = s_open(s_pathn(3, s(mount_dir), s(bind_src), s(file)), + O_RDONLY), + fd != -1); + TESTSYSCALL(close(fd)); + fd = -1; + TEST(fd = s_open(s_pathn(3, s(mount_dir), s(bind_dst), s(file)), + O_RDONLY), + fd != -1); + TESTSYSCALL(close(fd)); + fd = -1; + + result = TEST_SUCCESS; +out: + umount(mount_dir); + close(src_fd); + s_umount(s_path(s(ft_src), s(bind_dst))); + close(fd); + return result; +} + static void parse_range(const char *ranges, bool *run_test, size_t tests) { size_t i; @@ -2244,6 +2288,7 @@ int main(int argc, char *argv[]) MAKE_TEST(bpf_test_create_and_remove_bpf), MAKE_TEST(bpf_test_mkdir_and_remove_bpf), MAKE_TEST(bpf_test_readahead), + MAKE_TEST(bpf_test_follow_mounts), }; #undef MAKE_TEST diff --git a/tools/testing/selftests/filesystems/fuse/test_fuse.h b/tools/testing/selftests/filesystems/fuse/test_fuse.h index 69dadc9c7e45..e62e2ee07713 100644 --- a/tools/testing/selftests/filesystems/fuse/test_fuse.h +++ b/tools/testing/selftests/filesystems/fuse/test_fuse.h @@ -64,6 +64,9 @@ int s_setxattr(struct s pathname, const char name[], const void *value, size_t size, int flags); int s_removexattr(struct s pathname, const char name[]); int s_rename(struct s oldpathname, struct s newpathname); +int s_mount(struct s source, struct s target, struct s filesystem, + unsigned long mountflags, struct s data); +int s_umount(struct s target); struct s tracing_folder(void); int tracing_on(void); From 44702d8fa1b046b52595a03ac7fedc7abc414c42 Mon Sep 17 00:00:00 2001 From: Charan Teja Kalla Date: Thu, 14 Dec 2023 04:58:41 +0000 Subject: [PATCH 056/139] FROMLIST: mm: migrate high-order folios in swap cache correctly Large folios occupy N consecutive entries in the swap cache instead of using multi-index entries like the page cache. However, if a large folio is re-added to the LRU list, it can be migrated. The migration code was not aware of the difference between the swap cache and the page cache and assumed that a single xas_store() would be sufficient. This leaves potentially many stale pointers to the now-migrated folio in the swap cache, which can lead to almost arbitrary data corruption in the future. This can also manifest as infinite loops with the RCU read lock held. Bug: 315281107 Change-Id: I455f964a9f21c13089890073777388236b6669d7 [willy@infradead.org: modifications to the changelog & tweaked the fix] Fixes: 3417013e0d18 ("mm/migrate: Add folio_migrate_mapping()") Link: https://lkml.kernel.org/r/20231214045841.961776-1-willy@infradead.org Link: https://lore.kernel.org/linux-mm/20231214045841.961776-1-willy@infradead.org/ Signed-off-by: Charan Teja Kalla Signed-off-by: Matthew Wilcox (Oracle) Reported-by: Charan Teja Kalla Closes: https://lkml.kernel.org/r/1700569840-17327-1-git-send-email-quic_charante@quicinc.com Cc: David Hildenbrand Cc: Johannes Weiner Cc: Kirill A. 
Shutemov Cc: Naoya Horiguchi Cc: Shakeel Butt Cc: Signed-off-by: Andrew Morton Signed-off-by: Charan Teja Kalla --- mm/migrate.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/mm/migrate.c b/mm/migrate.c index ef490976c98e..9f5f52deed0c 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -393,6 +393,7 @@ int folio_migrate_mapping(struct address_space *mapping, int dirty; int expected_count = folio_expected_refs(mapping, folio) + extra_count; long nr = folio_nr_pages(folio); + long entries, i; if (!mapping) { /* Anonymous page without mapping */ @@ -430,8 +431,10 @@ int folio_migrate_mapping(struct address_space *mapping, folio_set_swapcache(newfolio); newfolio->private = folio_get_private(folio); } + entries = nr; } else { VM_BUG_ON_FOLIO(folio_test_swapcache(folio), folio); + entries = 1; } /* Move dirty while page refs frozen and newpage not yet exposed */ @@ -441,7 +444,11 @@ int folio_migrate_mapping(struct address_space *mapping, folio_set_dirty(newfolio); } - xas_store(&xas, newfolio); + /* Swap cache still stores N entries instead of a high-order entry */ + for (i = 0; i < entries; i++) { + xas_store(&xas, newfolio); + xas_next(&xas); + } /* * Drop cache reference from old page by unfreezing From 30bca9e2785b3c7cce113308b16b40132293ca34 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 1 Dec 2023 15:47:13 +0100 Subject: [PATCH 057/139] UPSTREAM: netfilter: nft_set_pipapo: skip inactive elements during set walk commit 317eb9685095678f2c9f5a8189de698c5354316a upstream. Otherwise set elements can be deactivated twice which will cause a crash. Bug: 316310313 Reported-by: Xingyuan Mo Fixes: 3c4287f62044 ("nf_tables: Add set type for arbitrary concatenation of ranges") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso Signed-off-by: Greg Kroah-Hartman (cherry picked from commit 189c2a82933c67ad360c421258d5449f6647544a) Signed-off-by: Lee Jones Change-Id: I27fb6ee806642e23ca02700763a387341dd463e6 --- net/netfilter/nft_set_pipapo.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index deea6196d992..4e1cc31729b8 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -2042,6 +2042,9 @@ static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set, e = f->mt[r].e; + if (!nft_set_elem_active(&e->ext, iter->genmask)) + goto cont; + elem.priv = e; iter->err = iter->fn(ctx, set, iter, &elem); From 401a2769d99066952f3bfb73a8ce6b0269bc04d7 Mon Sep 17 00:00:00 2001 From: Wu Bo Date: Tue, 21 Nov 2023 20:51:50 -0700 Subject: [PATCH 058/139] UPSTREAM: dm verity: don't perform FEC for failed readahead IO We found an issue under Android OTA scenario that many BIOs have to do FEC where the data under dm-verity is 100% complete and no corruption. Android OTA has many dm-block layers, from upper to lower: dm-verity dm-snapshot dm-origin & dm-cow dm-linear ufs DM tables have to change 2 times during Android OTA merging process. When doing table change, the dm-snapshot will be suspended for a while. During this interval, many readahead IOs are submitted to dm_verity from filesystem. Then the kverity works are busy doing FEC process which cost too much time to finish dm-verity IO. This causes needless delay which feels like system is hung. After adding debugging it was found that each readahead IO needed around 10s to finish when this situation occurred. 
This is due to IO amplification:

	dm-snapshot suspend
	erofs_readahead			// 300+ io is submitted
	  dm_submit_bio (dm_verity)
	    dm_submit_bio (dm_snapshot)
	      bio return EIO
	      bio got nothing, it's empty
	    verity_end_io
	    verity_verify_io
	    forloop range(0, io->n_blocks)	// each io->nblocks ~= 20
	      verity_fec_decode
	      fec_decode_rsb
	      fec_read_bufs
	      forloop range(0, v->fec->rsn)	// v->fec->rsn = 253
	        new_read
	        submit_bio (dm_snapshot)
	      end loop
	    end loop
	dm-snapshot resume

Readahead BIOs get nothing while dm-snapshot is suspended, so all of them will cause verity's FEC. Each readahead BIO needs to verify ~20 (io->nblocks) blocks. Each block needs to do FEC, and every block needs to do 253 (v->fec->rsn) reads. So during the suspend interval (~200ms), 300 readahead BIOs trigger ~1518000 (300*20*253) IOs to dm-snapshot. As readahead IO is not required by userspace, the best fix is to pass readahead errors to the upper layer to handle. Cc: stable@vger.kernel.org Fixes: a739ff3f543a ("dm verity: add support for forward error correction") Bug: 316972624 Link: https://lore.kernel.org/dm-devel/b84fb49-bf63-3442-8c99-d565e134f2@redhat.com Signed-off-by: Wu Bo Reviewed-by: Mikulas Patocka Signed-off-by: Mike Snitzer Signed-off-by: Akilesh Kailash (cherry picked from commit 0193e3966ceeeef69e235975918b287ab093082b) Change-Id: I73560e5660cebdc1997e1f9926cbb8888789eb46 --- drivers/md/dm-verity-target.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index fd68500ae2ad..1112a0a694e0 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -656,7 +656,9 @@ static void verity_end_io(struct bio *bio) struct dm_verity_io *io = bio->bi_private; if (bio->bi_status && - (!verity_fec_is_enabled(io->v) || verity_is_system_shutting_down())) { + (!verity_fec_is_enabled(io->v) || + verity_is_system_shutting_down() || + (bio->bi_opf & REQ_RAHEAD))) { verity_finish_io(io, bio->bi_status); return; } From bc97d5019afdfa2a26cd39d6556ace0e55269919 Mon Sep 17 00:00:00 2001 From: xieliujie Date: Mon, 25 Dec 2023 14:28:51 +0800 Subject: [PATCH 059/139] ANDROID: vendor_hooks: Add hooks for rt_mutex steal Add hooks at the rt_mutex_steal() function so that OEMs can decide whether tasks with the same priority may steal the rt_mutex or not. We did experiments and found that rt_mutex throughput can benefit a lot when threads with the same priority can steal the rt_mutex lock.
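Given the hook prototype added in the diff below, an OEM handler opting in to lateral stealing could look like this minimal sketch (the handler name is hypothetical):

	static void oem_rt_mutex_steal(void *data, int waiter_prio,
				       int top_waiter_prio, bool *ret)
	{
		/* allow tasks of equal priority to steal the rt_mutex */
		if (waiter_prio == top_waiter_prio)
			*ret = true;
	}

	/* module init */
	register_trace_android_vh_rt_mutex_steal(oem_rt_mutex_steal, NULL);
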
Bug: 317670024 Change-Id: Id60a7a41c6c77a67808982d3667946cabe4acc8f Signed-off-by: xeiliujie --- drivers/android/vendor_hooks.c | 1 + include/trace/hooks/dtask.h | 3 +++ kernel/locking/rtmutex.c | 6 ++++++ 3 files changed, 10 insertions(+) diff --git a/drivers/android/vendor_hooks.c b/drivers/android/vendor_hooks.c index 43eb748c1103..ee68032b1918 100644 --- a/drivers/android/vendor_hooks.c +++ b/drivers/android/vendor_hooks.c @@ -95,6 +95,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_task_blocks_on_rtmutex); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rtmutex_waiter_prio); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rtmutex_wait_start); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rtmutex_wait_finish); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rt_mutex_steal); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mutex_opt_spin_start); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mutex_opt_spin_finish); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mutex_can_spin_on_owner); diff --git a/include/trace/hooks/dtask.h b/include/trace/hooks/dtask.h index a63a2868e626..b51147089b2d 100644 --- a/include/trace/hooks/dtask.h +++ b/include/trace/hooks/dtask.h @@ -42,6 +42,9 @@ DECLARE_HOOK(android_vh_rtmutex_wait_start, DECLARE_HOOK(android_vh_rtmutex_wait_finish, TP_PROTO(struct rt_mutex_base *lock), TP_ARGS(lock)); +DECLARE_HOOK(android_vh_rt_mutex_steal, + TP_PROTO(int waiter_prio, int top_waiter_prio, bool *ret), + TP_ARGS(waiter_prio, top_waiter_prio, ret)); DECLARE_HOOK(android_vh_rwsem_read_wait_start, TP_PROTO(struct rw_semaphore *sem), diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 351716fe9138..8207fdade4a8 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -391,9 +391,15 @@ static __always_inline int rt_mutex_waiter_equal(struct rt_mutex_waiter *left, static inline bool rt_mutex_steal(struct rt_mutex_waiter *waiter, struct rt_mutex_waiter *top_waiter) { + bool ret = false; + if (rt_mutex_waiter_less(waiter, top_waiter)) return true; + trace_android_vh_rt_mutex_steal(waiter->prio, top_waiter->prio, &ret); + if (ret) + return true; + #ifdef RT_MUTEX_BUILD_SPINLOCKS /* * Note that RT tasks are excluded from same priority (lateral) From d3006fb9449d46b3c7733d5baa30c7a0e944cc5e Mon Sep 17 00:00:00 2001 From: xieliujie Date: Mon, 25 Dec 2023 15:06:38 +0800 Subject: [PATCH 060/139] ANDROID: ABI: Update oplus symbol list 1 function symbol(s) added 'int __traceiter_android_vh_rt_mutex_steal(void*, int, int, bool*)' 1 variable symbol(s) added 'struct tracepoint __tracepoint_android_vh_rt_mutex_steal' Bug: 317670024 Change-Id: I28f0379adaec041400e49cbd1e497b2f8c5c893d Signed-off-by: xeiliujie --- android/abi_gki_aarch64.stg | 28 ++++++++++++++++++++++++++++ android/abi_gki_aarch64_oplus | 2 ++ 2 files changed, 30 insertions(+) diff --git a/android/abi_gki_aarch64.stg b/android/abi_gki_aarch64.stg index e570ab134873..f2b7c07a7716 100644 --- a/android/abi_gki_aarch64.stg +++ b/android/abi_gki_aarch64.stg @@ -315510,6 +315510,14 @@ function { parameter_id: 0x6720d32f parameter_id: 0x3c2755a3 } +function { + id: 0x9a2ab624 + return_type_id: 0x6720d32f + parameter_id: 0x18bd6530 + parameter_id: 0x6720d32f + parameter_id: 0x6720d32f + parameter_id: 0x11cfee5a +} function { id: 0x9a2abc7b return_type_id: 0x6720d32f @@ -337636,6 +337644,15 @@ elf_symbol { type_id: 0x9b08a261 full_name: "__traceiter_android_vh_rproc_recovery_set" } +elf_symbol { + id: 0xd56fbf76 + name: "__traceiter_android_vh_rt_mutex_steal" + is_defined: true + symbol_type: FUNCTION + crc: 0xf0a6d2df + type_id: 0x9a2ab624 + 
full_name: "__traceiter_android_vh_rt_mutex_steal" +} elf_symbol { id: 0x3ef508a2 name: "__traceiter_android_vh_rtmutex_wait_finish" @@ -341632,6 +341649,15 @@ elf_symbol { type_id: 0x18ccbd2c full_name: "__tracepoint_android_vh_rproc_recovery_set" } +elf_symbol { + id: 0xed43b088 + name: "__tracepoint_android_vh_rt_mutex_steal" + is_defined: true + symbol_type: OBJECT + crc: 0xdc6b8d43 + type_id: 0x18ccbd2c + full_name: "__tracepoint_android_vh_rt_mutex_steal" +} elf_symbol { id: 0xa3915d70 name: "__tracepoint_android_vh_rtmutex_wait_finish" @@ -399133,6 +399159,7 @@ interface { symbol_id: 0x8d62858f symbol_id: 0xcef5d79f symbol_id: 0x91384eff + symbol_id: 0xd56fbf76 symbol_id: 0x3ef508a2 symbol_id: 0xfb1b8d64 symbol_id: 0xc56d7179 @@ -399577,6 +399604,7 @@ interface { symbol_id: 0x04365139 symbol_id: 0xd94bc301 symbol_id: 0x3fc5ffc9 + symbol_id: 0xed43b088 symbol_id: 0xa3915d70 symbol_id: 0xf01f02ea symbol_id: 0xeaebbadf diff --git a/android/abi_gki_aarch64_oplus b/android/abi_gki_aarch64_oplus index a374dcf65636..8c9846b3a27d 100644 --- a/android/abi_gki_aarch64_oplus +++ b/android/abi_gki_aarch64_oplus @@ -158,6 +158,7 @@ __traceiter_android_vh_dm_bufio_shrink_scan_bypass __traceiter_android_vh_mutex_unlock_slowpath __traceiter_android_vh_rtmutex_waiter_prio + __traceiter_android_vh_rt_mutex_steal __traceiter_android_vh_rwsem_can_spin_on_owner __traceiter_android_vh_rwsem_opt_spin_finish __traceiter_android_vh_rwsem_opt_spin_start @@ -258,6 +259,7 @@ __tracepoint_android_vh_record_rtmutex_lock_starttime __tracepoint_android_vh_record_rwsem_lock_starttime __tracepoint_android_vh_rtmutex_waiter_prio + __tracepoint_android_vh_rt_mutex_steal __tracepoint_android_vh_rwsem_can_spin_on_owner __tracepoint_android_vh_rwsem_opt_spin_finish __tracepoint_android_vh_rwsem_opt_spin_start From d16a15fde575b740e248e5bbd6ffd11a90c40baa Mon Sep 17 00:00:00 2001 From: Roy Luo Date: Tue, 28 Nov 2023 22:17:56 +0000 Subject: [PATCH 061/139] UPSTREAM: USB: gadget: core: adjust uevent timing on gadget unbind The KOBJ_CHANGE uevent is sent before gadget unbind is actually executed, resulting in inaccurate uevent emitted at incorrect timing (the uevent would have USB_UDC_DRIVER variable set while it would soon be removed). Move the KOBJ_CHANGE uevent to the end of the unbind function so that uevent is sent only after the change has been made. 
Fixes: 2ccea03a8f7e ("usb: gadget: introduce UDC Class") Cc: stable@vger.kernel.org Signed-off-by: Roy Luo Link: https://lore.kernel.org/r/20231128221756.2591158-1-royluo@google.com Signed-off-by: Greg Kroah-Hartman Bug: 312543856 Change-Id: Ida7fa7e1cfae3d1b3f3348512a67fe91065f25af (cherry picked from commit 73ea73affe8622bdf292de898da869d441da6a9d) Signed-off-by: Roy Luo --- drivers/usb/gadget/udc/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/usb/gadget/udc/core.c b/drivers/usb/gadget/udc/core.c index d7e992ee7743..1f56b770465e 100644 --- a/drivers/usb/gadget/udc/core.c +++ b/drivers/usb/gadget/udc/core.c @@ -1619,8 +1619,6 @@ static void gadget_unbind_driver(struct device *dev) dev_dbg(&udc->dev, "unbinding gadget driver [%s]\n", driver->function); - kobject_uevent(&udc->dev.kobj, KOBJ_CHANGE); - udc->allow_connect = false; cancel_work_sync(&udc->vbus_work); mutex_lock(&udc->connect_lock); @@ -1640,6 +1638,8 @@ static void gadget_unbind_driver(struct device *dev) driver->is_bound = false; udc->driver = NULL; mutex_unlock(&udc_lock); + + kobject_uevent(&udc->dev.kobj, KOBJ_CHANGE); } /* ------------------------------------------------------------------------- */ From 2c085909e7a7d2bb39881bc33ab6a45713c7f379 Mon Sep 17 00:00:00 2001 From: leonardian Date: Wed, 13 Dec 2023 07:00:25 +0000 Subject: [PATCH 062/139] ANDROID: Update the ABI symbol list Adding the following symbols: - _dev_alert Bug: 311337219 Change-Id: Iaf6710842c45921ccfbacd1361e0b57401cf65d9 Signed-off-by: leonardian --- android/abi_gki_aarch64_pixel | 1 + 1 file changed, 1 insertion(+) diff --git a/android/abi_gki_aarch64_pixel b/android/abi_gki_aarch64_pixel index b7d8f1baa215..af22721e20b8 100644 --- a/android/abi_gki_aarch64_pixel +++ b/android/abi_gki_aarch64_pixel @@ -340,6 +340,7 @@ desc_to_gpio destroy_workqueue dev_addr_mod + _dev_alert dev_alloc_name __dev_change_net_namespace dev_close From 88a19395047b78359c6c352491c2028dc730878c Mon Sep 17 00:00:00 2001 From: Jingbo Xu Date: Wed, 30 Nov 2022 14:04:55 +0800 Subject: [PATCH 063/139] UPSTREAM: erofs: enable large folios for iomap mode Enable large folios for iomap mode. Then the readahead routine will pass down large folios containing multiple pages. Let's enable this for non-compressed format for now, until the compression part supports large folios later. When large folios supported, the iomap routine will allocate iomap_page for each large folio and thus we need iomap_release_folio() and iomap_invalidate_folio() to free iomap_page when these folios get reclaimed or invalidated. 
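Opting a mapping in thus takes two pieces, sketched here to mirror the diff below: the mapping flag at inode setup, and the iomap teardown hooks in the address_space operations so per-folio iomap state is freed on reclaim or invalidation:

	/* at inode initialization, for uncompressed (iomap) inodes only */
	inode->i_mapping->a_ops = &erofs_raw_access_aops;
	mapping_set_large_folios(inode->i_mapping);

	/* in erofs_raw_access_aops */
	.release_folio = iomap_release_folio,
	.invalidate_folio = iomap_invalidate_folio,
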
Signed-off-by: Jingbo Xu Reviewed-by: Gao Xiang Reviewed-by: Chao Yu Link: https://lore.kernel.org/r/20221130060455.44532-1-jefflexu@linux.alibaba.com Signed-off-by: Gao Xiang Bug: 318378021 Change-Id: Iedbb9a2daf132399b7a1b5ea6905977ba123ba3c (cherry picked from commit ce529cc25b184e93397b94a8a322128fc0095cbb) Signed-off-by: Sandeep Dhavale --- fs/erofs/data.c | 2 ++ fs/erofs/inode.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 83532525282e..bbfbaf25ee59 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -404,6 +404,8 @@ const struct address_space_operations erofs_raw_access_aops = { .readahead = erofs_readahead, .bmap = erofs_bmap, .direct_IO = noop_direct_IO, + .release_folio = iomap_release_folio, + .invalidate_folio = iomap_invalidate_folio, }; #ifdef CONFIG_FS_DAX diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index 8fc41fd1620c..187ca02bbc2d 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -299,6 +299,8 @@ static int erofs_fill_inode(struct inode *inode) goto out_unlock; } inode->i_mapping->a_ops = &erofs_raw_access_aops; + if (!erofs_is_fscache_mode(inode->i_sb)) + mapping_set_large_folios(inode->i_mapping); #ifdef CONFIG_EROFS_FS_ONDEMAND if (erofs_is_fscache_mode(inode->i_sb)) inode->i_mapping->a_ops = &erofs_fscache_access_aops; From 66595bb17c147d49721669fe822817dea377b9ce Mon Sep 17 00:00:00 2001 From: Yue Hu Date: Wed, 26 Apr 2023 16:44:49 +0800 Subject: [PATCH 064/139] UPSTREAM: erofs: fold in z_erofs_decompress() There is no need for this helper since it's just a simple wrapper for the decompress method with only one caller. So, let's fold it in directly instead. Signed-off-by: Yue Hu Reviewed-by: Gao Xiang Link: https://lore.kernel.org/r/20230426084449.12781-1-zbestahu@gmail.com Signed-off-by: Gao Xiang Bug: 318378021 (cherry picked from commit 597e2953ae9b4a391e883c1f1a4cda5878e2dbed) Change-Id: I849360f088016cf97542858e8a5a9cee671a2f61 Signed-off-by: Sandeep Dhavale --- fs/erofs/compress.h | 3 +-- fs/erofs/decompressor.c | 8 +------- fs/erofs/zdata.c | 4 +++- 3 files changed, 5 insertions(+), 10 deletions(-) diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h index 26fa170090b8..b1b846504027 100644 --- a/fs/erofs/compress.h +++ b/fs/erofs/compress.h @@ -89,8 +89,7 @@ static inline bool erofs_page_is_managed(const struct erofs_sb_info *sbi, int z_erofs_fixup_insize(struct z_erofs_decompress_req *rq, const char *padbuf, unsigned int padbufsize); -int z_erofs_decompress(struct z_erofs_decompress_req *rq, - struct page **pagepool); +extern const struct z_erofs_decompressor erofs_decompressors[]; /* prototypes for specific algorithms */ int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq, diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index 7021e2cf6146..2a29943fa5cc 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -363,7 +363,7 @@ static int z_erofs_transform_plain(struct z_erofs_decompress_req *rq, return 0; } -static struct z_erofs_decompressor decompressors[] = { +const struct z_erofs_decompressor erofs_decompressors[] = { [Z_EROFS_COMPRESSION_SHIFTED] = { .decompress = z_erofs_transform_plain, .name = "shifted" @@ -383,9 +383,3 @@ static struct z_erofs_decompressor decompressors[] = { }, #endif }; - -int z_erofs_decompress(struct z_erofs_decompress_req *rq, - struct page **pagepool) -{ - return decompressors[rq->alg].decompress(rq, pagepool); -} diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 451b9a6cba68..f63c2422c9f9 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@
-1256,6 +1256,8 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, struct erofs_sb_info *const sbi = EROFS_SB(be->sb); struct z_erofs_pcluster *pcl = be->pcl; unsigned int pclusterpages = z_erofs_pclusterpages(pcl); + const struct z_erofs_decompressor *decompressor = + &erofs_decompressors[pcl->algorithmformat]; unsigned int i, inputsize; int err2; struct page *page; @@ -1299,7 +1301,7 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, else inputsize = pclusterpages * PAGE_SIZE; - err = z_erofs_decompress(&(struct z_erofs_decompress_req) { + err = decompressor->decompress(&(struct z_erofs_decompress_req) { .sb = be->sb, .in = be->compressed_pages, .out = be->decompressed_pages, From 5e861fa97e24f31aeda455870177da40c9eccd76 Mon Sep 17 00:00:00 2001 From: Yue Hu Date: Wed, 24 May 2023 14:39:44 +0800 Subject: [PATCH 065/139] UPSTREAM: erofs: remove the member readahead from struct z_erofs_decompress_frontend The struct member is only used to add REQ_RAHEAD during I/O submission. So it is cleaner to pass it as a parameter than keep it in the struct. Also, rename function z_erofs_get_sync_decompress_policy() to z_erofs_is_sync_decompress() for better clarity and conciseness. Signed-off-by: Yue Hu Reviewed-by: Gao Xiang Link: https://lore.kernel.org/r/20230524063944.1655-1-zbestahu@gmail.com Signed-off-by: Gao Xiang Bug: 318378021 (cherry picked from commit ef4b4b46c6aaf8edeea9a79320627fe10993f153) Change-Id: I59cc13e7499968a1e93e13df1cb43a5123d510d9 Signed-off-by: Sandeep Dhavale --- fs/erofs/zdata.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index f63c2422c9f9..50f803a48f96 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -534,7 +534,6 @@ struct z_erofs_decompress_frontend { z_erofs_next_pcluster_t owned_head; enum z_erofs_pclustermode mode; - bool readahead; /* used for applying cache strategy on the fly */ bool backmost; erofs_off_t headoffset; @@ -1076,7 +1075,7 @@ out: return err; } -static bool z_erofs_get_sync_decompress_policy(struct erofs_sb_info *sbi, +static bool z_erofs_is_sync_decompress(struct erofs_sb_info *sbi, unsigned int readahead_pages) { /* auto: enable for read_folio, disable for readahead */ @@ -1637,7 +1636,7 @@ static void z_erofs_decompressqueue_endio(struct bio *bio) static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, struct page **pagepool, struct z_erofs_decompressqueue *fgq, - bool *force_fg) + bool *force_fg, bool readahead) { struct super_block *sb = f->inode->i_sb; struct address_space *mc = MNGD_MAPPING(EROFS_SB(sb)); @@ -1723,7 +1722,7 @@ submit_bio_retry: bio->bi_iter.bi_sector = (sector_t)cur << (sb->s_blocksize_bits - 9); bio->bi_private = q[JQ_SUBMIT]; - if (f->readahead) + if (readahead) bio->bi_opf |= REQ_RAHEAD; ++nr_bios; } @@ -1759,13 +1758,13 @@ submit_bio_retry: } static void z_erofs_runqueue(struct z_erofs_decompress_frontend *f, - struct page **pagepool, bool force_fg) + struct page **pagepool, bool force_fg, bool ra) { struct z_erofs_decompressqueue io[NR_JOBQUEUES]; if (f->owned_head == Z_EROFS_PCLUSTER_TAIL) return; - z_erofs_submit_queue(f, pagepool, io, &force_fg); + z_erofs_submit_queue(f, pagepool, io, &force_fg, ra); /* handle bypass queue (no i/o pclusters) immediately */ z_erofs_decompress_queue(&io[JQ_BYPASS], pagepool); @@ -1863,8 +1862,8 @@ static int z_erofs_read_folio(struct file *file, struct folio *folio) (void)z_erofs_collector_end(&f); /* if some compressed cluster 
ready, need submit them anyway */ - z_erofs_runqueue(&f, &pagepool, - z_erofs_get_sync_decompress_policy(sbi, 0)); + z_erofs_runqueue(&f, &pagepool, z_erofs_is_sync_decompress(sbi, 0), + false); if (err) erofs_err(inode->i_sb, "failed to read, err [%d]", err); @@ -1882,7 +1881,6 @@ static void z_erofs_readahead(struct readahead_control *rac) struct page *pagepool = NULL, *head = NULL, *page; unsigned int nr_pages; - f.readahead = true; f.headoffset = readahead_pos(rac); z_erofs_pcluster_readmore(&f, rac, f.headoffset + @@ -1913,7 +1911,7 @@ static void z_erofs_readahead(struct readahead_control *rac) (void)z_erofs_collector_end(&f); z_erofs_runqueue(&f, &pagepool, - z_erofs_get_sync_decompress_policy(sbi, nr_pages)); + z_erofs_is_sync_decompress(sbi, nr_pages), true); erofs_put_metabuf(&f.map.buf); erofs_release_pages(&pagepool); } From bed20ed1d348355f99a40e7e61729b25e21356f9 Mon Sep 17 00:00:00 2001 From: Yue Hu Date: Thu, 25 May 2023 15:26:05 +0800 Subject: [PATCH 066/139] UPSTREAM: erofs: clean up z_erofs_pcluster_readmore() The `end` parameter is not needed since it's pointless for !backmost; we can handle it with backmost internally. And we only expand the trailing edge, so the newstart can be replaced with ->headoffset. Also, remove linux/prefetch.h inclusion since that is not used anymore after commit 386292919c25 ("erofs: introduce readmore decompression strategy"). Signed-off-by: Yue Hu Reviewed-by: Gao Xiang Link: https://lore.kernel.org/r/20230525072605.17857-1-zbestahu@gmail.com [ Gao Xiang: update commit description. ] Signed-off-by: Gao Xiang Bug: 318378021 (cherry picked from commit 796e9149a2fcdba5543e247abd8d911a399bb9a6) Change-Id: I9412c4111800077c876a43c4256ce9760a7d902e Signed-off-by: Sandeep Dhavale --- fs/erofs/zdata.c | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 50f803a48f96..b7bf435eea82 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -5,7 +5,6 @@ * Copyright (C) 2022 Alibaba Cloud */ #include "compress.h" -#include #include #include #include @@ -1785,28 +1784,28 @@ static void z_erofs_runqueue(struct z_erofs_decompress_frontend *f, */ static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f, struct readahead_control *rac, - erofs_off_t end, - struct page **pagepool, - bool backmost) + struct page **pagepool, bool backmost) { struct inode *inode = f->inode; struct erofs_map_blocks *map = &f->map; - erofs_off_t cur; + erofs_off_t cur, end, headoffset = f->headoffset; int err; if (backmost) { + if (rac) + end = headoffset + readahead_length(rac) - 1; + else + end = headoffset + PAGE_SIZE - 1; map->m_la = end; err = z_erofs_map_blocks_iter(inode, map, EROFS_GET_BLOCKS_READMORE); if (err) return; - /* expend ra for the trailing edge if readahead */ + /* expand ra for the trailing edge if readahead */ if (rac) { - loff_t newstart = readahead_pos(rac); - cur = round_up(map->m_la + map->m_llen, PAGE_SIZE); - readahead_expand(rac, newstart, cur - newstart); + readahead_expand(rac, headoffset, cur - headoffset); return; } end = round_up(end, PAGE_SIZE); @@ -1854,10 +1853,9 @@ static int z_erofs_read_folio(struct file *file, struct folio *folio) trace_erofs_readpage(page, false); f.headoffset = (erofs_off_t)page->index << PAGE_SHIFT; - z_erofs_pcluster_readmore(&f, NULL, f.headoffset + PAGE_SIZE - 1, - &pagepool, true); + z_erofs_pcluster_readmore(&f, NULL, &pagepool, true); err = z_erofs_do_read_page(&f, page, &pagepool); - z_erofs_pcluster_readmore(&f, NULL,
0, &pagepool, false); + z_erofs_pcluster_readmore(&f, NULL, &pagepool, false); (void)z_erofs_collector_end(&f); @@ -1883,8 +1881,7 @@ static void z_erofs_readahead(struct readahead_control *rac) f.headoffset = readahead_pos(rac); - z_erofs_pcluster_readmore(&f, rac, f.headoffset + - readahead_length(rac) - 1, &pagepool, true); + z_erofs_pcluster_readmore(&f, rac, &pagepool, true); nr_pages = readahead_count(rac); trace_erofs_readpages(inode, readahead_index(rac), nr_pages, false); @@ -1907,7 +1904,7 @@ static void z_erofs_readahead(struct readahead_control *rac) page->index, EROFS_I(inode)->nid); put_page(page); } - z_erofs_pcluster_readmore(&f, rac, 0, &pagepool, false); + z_erofs_pcluster_readmore(&f, rac, &pagepool, false); (void)z_erofs_collector_end(&f); z_erofs_runqueue(&f, &pagepool, From 5c1827383ac1df6e796ae44533a913d39425a78e Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Sat, 27 May 2023 04:14:54 +0800 Subject: [PATCH 067/139] UPSTREAM: erofs: allocate extra bvec pages directly instead of retrying If non-bootstrap bvecs cannot be kept in place (very rarely), an extra short-lived page is allocated. Let's just allocate it immediately rather than do unnecessary -EAGAIN return first and retry as a cleanup. Also it's unnecessary to use __GFP_NOFAIL here since we could gracefully fail out this case instead. Signed-off-by: Gao Xiang Reviewed-by: Yue Hu Link: https://lore.kernel.org/r/20230526201459.128169-2-hsiangkao@linux.alibaba.com Bug: 318378021 (cherry picked from commit 05b63d2beb8b0f752d1f5cdd051c8bdbf532cedd) Change-Id: I2ac45a943060406bcbb741c5f7aa1094f783f906 Signed-off-by: Sandeep Dhavale --- fs/erofs/zdata.c | 30 +++++++++++------------------- 1 file changed, 11 insertions(+), 19 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index b7bf435eea82..2e945803c457 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -240,12 +240,17 @@ static int z_erofs_bvec_enqueue(struct z_erofs_bvec_iter *iter, struct z_erofs_bvec *bvec, struct page **candidate_bvpage) { - if (iter->cur == iter->nr) { - if (!*candidate_bvpage) - return -EAGAIN; + if (iter->cur >= iter->nr) { + struct page *nextpage = *candidate_bvpage; + if (!nextpage) { + nextpage = alloc_page(GFP_NOFS); + if (!nextpage) + return -ENOMEM; + set_page_private(nextpage, Z_EROFS_SHORTLIVED_PAGE); + } DBG_BUGON(iter->bvset->nextpage); - iter->bvset->nextpage = *candidate_bvpage; + iter->bvset->nextpage = nextpage; z_erofs_bvset_flip(iter); iter->bvset->nextpage = NULL; @@ -872,10 +877,8 @@ static bool z_erofs_collector_end(struct z_erofs_decompress_frontend *fe) z_erofs_bvec_iter_end(&fe->biter); mutex_unlock(&pcl->lock); - if (fe->candidate_bvpage) { - DBG_BUGON(z_erofs_is_shortlived_page(fe->candidate_bvpage)); + if (fe->candidate_bvpage) fe->candidate_bvpage = NULL; - } /* * if all pending pages are added, don't hold its reference @@ -1023,24 +1026,13 @@ hitted: if (cur) tight &= (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED); -retry: err = z_erofs_attach_page(fe, &((struct z_erofs_bvec) { .page = page, .offset = offset - map->m_la, .end = end, }), exclusive); - /* should allocate an additional short-lived page for bvset */ - if (err == -EAGAIN && !fe->candidate_bvpage) { - fe->candidate_bvpage = alloc_page(GFP_NOFS | __GFP_NOFAIL); - set_page_private(fe->candidate_bvpage, - Z_EROFS_SHORTLIVED_PAGE); - goto retry; - } - - if (err) { - DBG_BUGON(err == -EAGAIN && fe->candidate_bvpage); + if (err) goto out; - } z_erofs_onlinepage_split(page); /* bump up the number of spiltted parts of a page */ From 
3d9318266165302aae965416e8ebfed980088b8c Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Sat, 27 May 2023 04:14:55 +0800 Subject: [PATCH 068/139] UPSTREAM: erofs: avoid on-stack pagepool directly passed by arguments On-stack pagepool is used so that short-lived temporary pages could be shared within a single I/O request (e.g. among multiple pclusters). Moving the remaining frontend-related uses into z_erofs_decompress_frontend to avoid too many arguments. Signed-off-by: Gao Xiang Reviewed-by: Yue Hu Link: https://lore.kernel.org/r/20230526201459.128169-3-hsiangkao@linux.alibaba.com Bug: 318378021 (cherry picked from commit 6ab5eed6002edc5a29b683285e90459a7df6ce2b) Change-Id: I57d3ba6087904bb40c55b780aca50c16bfba2c0f Signed-off-by: Sandeep Dhavale --- fs/erofs/zdata.c | 64 +++++++++++++++++++++++------------------------- 1 file changed, 30 insertions(+), 34 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 2e945803c457..2abc2398708a 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -238,13 +238,14 @@ static void z_erofs_bvec_iter_begin(struct z_erofs_bvec_iter *iter, static int z_erofs_bvec_enqueue(struct z_erofs_bvec_iter *iter, struct z_erofs_bvec *bvec, - struct page **candidate_bvpage) + struct page **candidate_bvpage, + struct page **pagepool) { if (iter->cur >= iter->nr) { struct page *nextpage = *candidate_bvpage; if (!nextpage) { - nextpage = alloc_page(GFP_NOFS); + nextpage = erofs_allocpage(pagepool, GFP_NOFS); if (!nextpage) return -ENOMEM; set_page_private(nextpage, Z_EROFS_SHORTLIVED_PAGE); @@ -533,6 +534,7 @@ struct z_erofs_decompress_frontend { struct erofs_map_blocks map; struct z_erofs_bvec_iter biter; + struct page *pagepool; struct page *candidate_bvpage; struct z_erofs_pcluster *pcl; z_erofs_next_pcluster_t owned_head; @@ -567,8 +569,7 @@ static bool z_erofs_should_alloc_cache(struct z_erofs_decompress_frontend *fe) return false; } -static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, - struct page **pagepool) +static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe) { struct address_space *mc = MNGD_MAPPING(EROFS_I_SB(fe->inode)); struct z_erofs_pcluster *pcl = fe->pcl; @@ -609,7 +610,7 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, * succeeds or fallback to in-place I/O instead * to avoid any direct reclaim. 
*/ - newpage = erofs_allocpage(pagepool, gfp); + newpage = erofs_allocpage(&fe->pagepool, gfp); if (!newpage) continue; set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE); @@ -622,7 +623,7 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, if (page) put_page(page); else if (newpage) - erofs_pagepool_add(pagepool, newpage); + erofs_pagepool_add(&fe->pagepool, newpage); } /* @@ -720,7 +721,8 @@ static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe, !fe->candidate_bvpage) fe->candidate_bvpage = bvec->page; } - ret = z_erofs_bvec_enqueue(&fe->biter, bvec, &fe->candidate_bvpage); + ret = z_erofs_bvec_enqueue(&fe->biter, bvec, &fe->candidate_bvpage, + &fe->pagepool); fe->pcl->vcnt += (ret >= 0); return ret; } @@ -925,7 +927,7 @@ static int z_erofs_read_fragment(struct inode *inode, erofs_off_t pos, } static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, - struct page *page, struct page **pagepool) + struct page *page) { struct inode *const inode = fe->inode; struct erofs_map_blocks *const map = &fe->map; @@ -985,7 +987,7 @@ repeat: fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE; } else { /* bind cache first when cached decompression is preferred */ - z_erofs_bind_cache(fe, pagepool); + z_erofs_bind_cache(fe); } hitted: /* @@ -1625,7 +1627,6 @@ static void z_erofs_decompressqueue_endio(struct bio *bio) } static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, - struct page **pagepool, struct z_erofs_decompressqueue *fgq, bool *force_fg, bool readahead) { @@ -1683,8 +1684,8 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, do { struct page *page; - page = pickup_page_for_submission(pcl, i++, pagepool, - mc); + page = pickup_page_for_submission(pcl, i++, + &f->pagepool, mc); if (!page) continue; @@ -1749,16 +1750,16 @@ submit_bio_retry: } static void z_erofs_runqueue(struct z_erofs_decompress_frontend *f, - struct page **pagepool, bool force_fg, bool ra) + bool force_fg, bool ra) { struct z_erofs_decompressqueue io[NR_JOBQUEUES]; if (f->owned_head == Z_EROFS_PCLUSTER_TAIL) return; - z_erofs_submit_queue(f, pagepool, io, &force_fg, ra); + z_erofs_submit_queue(f, io, &force_fg, ra); /* handle bypass queue (no i/o pclusters) immediately */ - z_erofs_decompress_queue(&io[JQ_BYPASS], pagepool); + z_erofs_decompress_queue(&io[JQ_BYPASS], &f->pagepool); if (!force_fg) return; @@ -1767,7 +1768,7 @@ static void z_erofs_runqueue(struct z_erofs_decompress_frontend *f, wait_for_completion_io(&io[JQ_SUBMIT].u.done); /* handle synchronous decompress queue in the caller context */ - z_erofs_decompress_queue(&io[JQ_SUBMIT], pagepool); + z_erofs_decompress_queue(&io[JQ_SUBMIT], &f->pagepool); } /* @@ -1775,8 +1776,7 @@ static void z_erofs_runqueue(struct z_erofs_decompress_frontend *f, * approximate readmore strategies as a start. 
*/ static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f, - struct readahead_control *rac, - struct page **pagepool, bool backmost) + struct readahead_control *rac, bool backmost) { struct inode *inode = f->inode; struct erofs_map_blocks *map = &f->map; @@ -1818,7 +1818,7 @@ static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f, if (PageUptodate(page)) { unlock_page(page); } else { - err = z_erofs_do_read_page(f, page, pagepool); + err = z_erofs_do_read_page(f, page); if (err) erofs_err(inode->i_sb, "readmore error at page %lu @ nid %llu", @@ -1839,27 +1839,24 @@ static int z_erofs_read_folio(struct file *file, struct folio *folio) struct inode *const inode = page->mapping->host; struct erofs_sb_info *const sbi = EROFS_I_SB(inode); struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); - struct page *pagepool = NULL; int err; trace_erofs_readpage(page, false); f.headoffset = (erofs_off_t)page->index << PAGE_SHIFT; - z_erofs_pcluster_readmore(&f, NULL, &pagepool, true); - err = z_erofs_do_read_page(&f, page, &pagepool); - z_erofs_pcluster_readmore(&f, NULL, &pagepool, false); - + z_erofs_pcluster_readmore(&f, NULL, true); + err = z_erofs_do_read_page(&f, page); + z_erofs_pcluster_readmore(&f, NULL, false); (void)z_erofs_collector_end(&f); /* if some compressed cluster ready, need submit them anyway */ - z_erofs_runqueue(&f, &pagepool, z_erofs_is_sync_decompress(sbi, 0), - false); + z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, 0), false); if (err) erofs_err(inode->i_sb, "failed to read, err [%d]", err); erofs_put_metabuf(&f.map.buf); - erofs_release_pages(&pagepool); + erofs_release_pages(&f.pagepool); return err; } @@ -1868,12 +1865,12 @@ static void z_erofs_readahead(struct readahead_control *rac) struct inode *const inode = rac->mapping->host; struct erofs_sb_info *const sbi = EROFS_I_SB(inode); struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); - struct page *pagepool = NULL, *head = NULL, *page; + struct page *head = NULL, *page; unsigned int nr_pages; f.headoffset = readahead_pos(rac); - z_erofs_pcluster_readmore(&f, rac, &pagepool, true); + z_erofs_pcluster_readmore(&f, rac, true); nr_pages = readahead_count(rac); trace_erofs_readpages(inode, readahead_index(rac), nr_pages, false); @@ -1889,20 +1886,19 @@ static void z_erofs_readahead(struct readahead_control *rac) /* traversal in reverse order */ head = (void *)page_private(page); - err = z_erofs_do_read_page(&f, page, &pagepool); + err = z_erofs_do_read_page(&f, page); if (err) erofs_err(inode->i_sb, "readahead error at page %lu @ nid %llu", page->index, EROFS_I(inode)->nid); put_page(page); } - z_erofs_pcluster_readmore(&f, rac, &pagepool, false); + z_erofs_pcluster_readmore(&f, rac, false); (void)z_erofs_collector_end(&f); - z_erofs_runqueue(&f, &pagepool, - z_erofs_is_sync_decompress(sbi, nr_pages), true); + z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, nr_pages), true); erofs_put_metabuf(&f.map.buf); - erofs_release_pages(&pagepool); + erofs_release_pages(&f.pagepool); } const struct address_space_operations z_erofs_aops = { From 187d034575bcaf95e145627406a5b26108fba447 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Sat, 27 May 2023 04:14:57 +0800 Subject: [PATCH 069/139] BACKPORT: erofs: adapt managed inode operations into folios This patch gets rid of erofs_try_to_free_cached_page() and fold it into .release_folio(). It also moves managed inode operations into zdata.c, which simplifies the code a bit. No logic changes. 
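As a quick reference, the page helpers being replaced map one-to-one onto their folio counterparts; all of them appear in the hunks below:

	page_private(page)         ->  folio_get_private(folio)
	PagePrivate(page)          ->  folio_test_private(folio)
	detach_page_private(page)  ->  folio_detach_private(folio)

and the managed-cache teardown ends up driven entirely through folio aops in zdata.c, roughly (condensed from the hunks below):

	static const struct address_space_operations z_erofs_cache_aops = {
		.release_folio = z_erofs_cache_release_folio,
		.invalidate_folio = z_erofs_cache_invalidate_folio,
	};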
Signed-off-by: Gao Xiang Reviewed-by: Yue Hu Link: https://lore.kernel.org/r/20230526201459.128169-5-hsiangkao@linux.alibaba.com Bug: 318378021 Change-Id: I5cb1e44769f68edce788cb4f8084bb3d45b594b3 (cherry picked from commit 7b4e372c36fcd33c74ba3cbd65fa534b9c558184) [dhavale: changes to internal.h applied manually] Signed-off-by: Sandeep Dhavale --- fs/erofs/internal.h | 3 ++- fs/erofs/super.c | 62 --------------------------------------------- fs/erofs/zdata.c | 59 ++++++++++++++++++++++++++++++++++++------ 3 files changed, 53 insertions(+), 71 deletions(-) diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 1c03daf83a68..23151da13a23 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -544,7 +544,7 @@ int __init z_erofs_init_zip_subsystem(void); void z_erofs_exit_zip_subsystem(void); int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, struct erofs_workgroup *egrp); -int erofs_try_to_free_cached_page(struct page *page); +int erofs_init_managed_cache(struct super_block *sb); int z_erofs_load_lz4_config(struct super_block *sb, struct erofs_super_block *dsb, struct z_erofs_lz4_cfgs *lz4, int len); @@ -565,6 +565,7 @@ static inline int z_erofs_load_lz4_config(struct super_block *sb, } return 0; } +static inline int erofs_init_managed_cache(struct super_block *sb) { return 0; } #endif /* !CONFIG_EROFS_FS_ZIP */ #ifdef CONFIG_EROFS_FS_ZIP_LZMA diff --git a/fs/erofs/super.c b/fs/erofs/super.c index b073b38c1c77..19af9bbcb8f1 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -597,68 +597,6 @@ static int erofs_fc_parse_param(struct fs_context *fc, return 0; } -#ifdef CONFIG_EROFS_FS_ZIP -static const struct address_space_operations managed_cache_aops; - -static bool erofs_managed_cache_release_folio(struct folio *folio, gfp_t gfp) -{ - bool ret = true; - struct address_space *const mapping = folio->mapping; - - DBG_BUGON(!folio_test_locked(folio)); - DBG_BUGON(mapping->a_ops != &managed_cache_aops); - - if (folio_test_private(folio)) - ret = erofs_try_to_free_cached_page(&folio->page); - - return ret; -} - -/* - * It will be called only on inode eviction. In case that there are still some - * decompression requests in progress, wait with rescheduling for a bit here. - * We could introduce an extra locking instead but it seems unnecessary. 
- */ -static void erofs_managed_cache_invalidate_folio(struct folio *folio, - size_t offset, size_t length) -{ - const size_t stop = length + offset; - - DBG_BUGON(!folio_test_locked(folio)); - - /* Check for potential overflow in debug mode */ - DBG_BUGON(stop > folio_size(folio) || stop < length); - - if (offset == 0 && stop == folio_size(folio)) - while (!erofs_managed_cache_release_folio(folio, GFP_NOFS)) - cond_resched(); -} - -static const struct address_space_operations managed_cache_aops = { - .release_folio = erofs_managed_cache_release_folio, - .invalidate_folio = erofs_managed_cache_invalidate_folio, -}; - -static int erofs_init_managed_cache(struct super_block *sb) -{ - struct erofs_sb_info *const sbi = EROFS_SB(sb); - struct inode *const inode = new_inode(sb); - - if (!inode) - return -ENOMEM; - - set_nlink(inode, 1); - inode->i_size = OFFSET_MAX; - - inode->i_mapping->a_ops = &managed_cache_aops; - mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); - sbi->managed_cache = inode; - return 0; -} -#else -static int erofs_init_managed_cache(struct super_block *sb) { return 0; } -#endif - static struct inode *erofs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation) { diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 2abc2398708a..8085e712314e 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -668,29 +668,72 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, return 0; } -int erofs_try_to_free_cached_page(struct page *page) +static bool z_erofs_cache_release_folio(struct folio *folio, gfp_t gfp) { - struct z_erofs_pcluster *const pcl = (void *)page_private(page); - int ret, i; + struct z_erofs_pcluster *pcl = folio_get_private(folio); + bool ret; + int i; + + if (!folio_test_private(folio)) + return true; if (!erofs_workgroup_try_to_freeze(&pcl->obj, 1)) - return 0; + return false; - ret = 0; + ret = false; DBG_BUGON(z_erofs_is_inline_pcluster(pcl)); for (i = 0; i < pcl->pclusterpages; ++i) { - if (pcl->compressed_bvecs[i].page == page) { + if (pcl->compressed_bvecs[i].page == &folio->page) { WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL); - ret = 1; + ret = true; break; } } erofs_workgroup_unfreeze(&pcl->obj, 1); + if (ret) - detach_page_private(page); + folio_detach_private(folio); return ret; } +/* + * It will be called only on inode eviction. In case that there are still some + * decompression requests in progress, wait with rescheduling for a bit here. + * An extra lock could be introduced instead but it seems unnecessary. 
+ */ +static void z_erofs_cache_invalidate_folio(struct folio *folio, + size_t offset, size_t length) +{ + const size_t stop = length + offset; + + /* Check for potential overflow in debug mode */ + DBG_BUGON(stop > folio_size(folio) || stop < length); + + if (offset == 0 && stop == folio_size(folio)) + while (!z_erofs_cache_release_folio(folio, GFP_NOFS)) + cond_resched(); +} + +static const struct address_space_operations z_erofs_cache_aops = { + .release_folio = z_erofs_cache_release_folio, + .invalidate_folio = z_erofs_cache_invalidate_folio, +}; + +int erofs_init_managed_cache(struct super_block *sb) +{ + struct inode *const inode = new_inode(sb); + + if (!inode) + return -ENOMEM; + + set_nlink(inode, 1); + inode->i_size = OFFSET_MAX; + inode->i_mapping->a_ops = &z_erofs_cache_aops; + mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); + EROFS_SB(sb)->managed_cache = inode; + return 0; +} + static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe, struct z_erofs_bvec *bvec) { From 365ca16da250c80d27a7f7b3499acfa2db9718a7 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Wed, 28 Jun 2023 00:12:40 +0800 Subject: [PATCH 070/139] UPSTREAM: erofs: simplify z_erofs_transform_plain() Use memcpy_to_page() instead of open-coding them. In addition, add a missing flush_dcache_page() even though almost all modern architectures clear `PG_dcache_clean` flag for new file cache pages so that it doesn't change anything in practice. Signed-off-by: Gao Xiang Reviewed-by: Yue Hu Reviewed-by: Chao Yu Link: https://lore.kernel.org/r/20230627161240.331-2-hsiangkao@linux.alibaba.com Signed-off-by: Gao Xiang Bug: 318378021 (cherry picked from commit c5539762f32e97c5e16215fa1336e32095b8b0fd) Change-Id: I4cb665b592936502ca95e2aee20e1c3a56103ff5 Signed-off-by: Sandeep Dhavale --- fs/erofs/decompressor.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index 2a29943fa5cc..b9dd685a16fc 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -328,7 +328,7 @@ static int z_erofs_transform_plain(struct z_erofs_decompress_req *rq, const unsigned int lefthalf = rq->outputsize - righthalf; const unsigned int interlaced_offset = rq->alg == Z_EROFS_COMPRESSION_SHIFTED ? 0 : rq->pageofs_out; - unsigned char *src, *dst; + u8 *src; if (outpages > 2 && rq->alg == Z_EROFS_COMPRESSION_SHIFTED) { DBG_BUGON(1); @@ -341,22 +341,19 @@ static int z_erofs_transform_plain(struct z_erofs_decompress_req *rq, } src = kmap_local_page(rq->in[inpages - 1]) + rq->pageofs_in; - if (rq->out[0]) { - dst = kmap_local_page(rq->out[0]); - memcpy(dst + rq->pageofs_out, src + interlaced_offset, - righthalf); - kunmap_local(dst); - } + if (rq->out[0]) + memcpy_to_page(rq->out[0], rq->pageofs_out, + src + interlaced_offset, righthalf); if (outpages > inpages) { DBG_BUGON(!rq->out[outpages - 1]); if (rq->out[outpages - 1] != rq->in[inpages - 1]) { - dst = kmap_local_page(rq->out[outpages - 1]); - memcpy(dst, interlaced_offset ? src : - (src + righthalf), lefthalf); - kunmap_local(dst); + memcpy_to_page(rq->out[outpages - 1], 0, src + + (interlaced_offset ? 
0 : righthalf), + lefthalf); } else if (!interlaced_offset) { memmove(src, src + righthalf, lefthalf); + flush_dcache_page(rq->in[inpages - 1]); } } kunmap_local(src); From 4067dd99694a3722116ed26bfcb361201ea97a9b Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Wed, 28 Jun 2023 00:12:39 +0800 Subject: [PATCH 071/139] UPSTREAM: erofs: get rid of the remaining kmap_atomic() It's unnecessary to use kmap_atomic() compared with kmap_local_page(). In addition, kmap_atomic() is deprecated now. Signed-off-by: Gao Xiang Reviewed-by: Yue Hu Reviewed-by: Chao Yu Link: https://lore.kernel.org/r/20230627161240.331-1-hsiangkao@linux.alibaba.com Signed-off-by: Gao Xiang Bug: 318378021 (cherry picked from commit 123ec246ebe323d468c5ca996700ea4739d20ddf) Change-Id: I7efee861bb4f079fe6b79123d554be2e1867d13b Signed-off-by: Sandeep Dhavale --- fs/erofs/decompressor.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index b9dd685a16fc..cfad1eac7fd9 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -148,7 +148,7 @@ static void *z_erofs_lz4_handle_overlap(struct z_erofs_lz4_decompress_ctx *ctx, *maptype = 0; return inpage; } - kunmap_atomic(inpage); + kunmap_local(inpage); might_sleep(); src = erofs_vm_map_ram(rq->in, ctx->inpages); if (!src) @@ -162,7 +162,7 @@ docopy: src = erofs_get_pcpubuf(ctx->inpages); if (!src) { DBG_BUGON(1); - kunmap_atomic(inpage); + kunmap_local(inpage); return ERR_PTR(-EFAULT); } @@ -173,9 +173,9 @@ docopy: min_t(unsigned int, total, PAGE_SIZE - *inputmargin); if (!inpage) - inpage = kmap_atomic(*in); + inpage = kmap_local_page(*in); memcpy(tmp, inpage + *inputmargin, page_copycnt); - kunmap_atomic(inpage); + kunmap_local(inpage); inpage = NULL; tmp += page_copycnt; total -= page_copycnt; @@ -214,7 +214,7 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx, int ret, maptype; DBG_BUGON(*rq->in == NULL); - headpage = kmap_atomic(*rq->in); + headpage = kmap_local_page(*rq->in); /* LZ4 decompression inplace is only safe if zero_padding is enabled */ if (erofs_sb_has_zero_padding(EROFS_SB(rq->sb))) { @@ -223,7 +223,7 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx, min_t(unsigned int, rq->inputsize, rq->sb->s_blocksize - rq->pageofs_in)); if (ret) { - kunmap_atomic(headpage); + kunmap_local(headpage); return ret; } may_inplace = !((rq->pageofs_in + rq->inputsize) & @@ -261,7 +261,7 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx, } if (maptype == 0) { - kunmap_atomic(headpage); + kunmap_local(headpage); } else if (maptype == 1) { vm_unmap_ram(src, ctx->inpages); } else if (maptype == 2) { @@ -289,7 +289,7 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, /* one optimized fast path only for non bigpcluster cases yet */ if (ctx.inpages == 1 && ctx.outpages == 1 && !rq->inplace_io) { DBG_BUGON(!*rq->out); - dst = kmap_atomic(*rq->out); + dst = kmap_local_page(*rq->out); dst_maptype = 0; goto dstmap_out; } @@ -311,7 +311,7 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, dstmap_out: ret = z_erofs_lz4_decompress_mem(&ctx, dst + rq->pageofs_out); if (!dst_maptype) - kunmap_atomic(dst); + kunmap_local(dst); else if (dst_maptype == 2) vm_unmap_ram(dst, ctx.outpages); return ret; From d0dbf747924547655f733242ade325bd2426a7e0 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Thu, 17 Aug 2023 16:28:06 +0800 Subject: [PATCH 072/139] BACKPORT: erofs: simplify 
z_erofs_read_fragment() A trivial cleanup to make the fragment handling logic more clear. Reviewed-by: Yue Hu Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20230817082813.81180-1-hsiangkao@linux.alibaba.com Bug: 318378021 Change-Id: I50c09c65b7d3da5022cfc2ede27aa31a1b331d29 (cherry picked from commit 8b00be163f7b57cbf957b3d27b5a7ca1e2495cfa) [dhavale: resolved conflict around erofs_bread() in zdata.c] Signed-off-by: Sandeep Dhavale --- fs/erofs/zdata.c | 39 +++++++++++++-------------------------- 1 file changed, 13 insertions(+), 26 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 8085e712314e..f7d0d71359e8 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -936,22 +936,19 @@ static bool z_erofs_collector_end(struct z_erofs_decompress_frontend *fe) return true; } -static int z_erofs_read_fragment(struct inode *inode, erofs_off_t pos, - struct page *page, unsigned int pageofs, - unsigned int len) +static int z_erofs_read_fragment(struct super_block *sb, struct page *page, + unsigned int cur, unsigned int end, erofs_off_t pos) { - struct super_block *sb = inode->i_sb; - struct inode *packed_inode = EROFS_I_SB(inode)->packed_inode; + struct inode *packed_inode = EROFS_SB(sb)->packed_inode; struct erofs_buf buf = __EROFS_BUF_INITIALIZER; - u8 *src, *dst; - unsigned int i, cnt; + unsigned int cnt; + u8 *src; if (!packed_inode) return -EFSCORRUPTED; - pos += EROFS_I(inode)->z_fragmentoff; - for (i = 0; i < len; i += cnt) { - cnt = min_t(unsigned int, len - i, + for (; cur < end; cur += cnt, pos += cnt) { + cnt = min_t(unsigned int, end - cur, sb->s_blocksize - erofs_blkoff(sb, pos)); src = erofs_bread(&buf, packed_inode, erofs_blknr(sb, pos), EROFS_KMAP); @@ -959,11 +956,7 @@ static int z_erofs_read_fragment(struct inode *inode, erofs_off_t pos, erofs_put_metabuf(&buf); return PTR_ERR(src); } - - dst = kmap_local_page(page); - memcpy(dst + pageofs + i, src + erofs_blkoff(sb, pos), cnt); - kunmap_local(dst); - pos += cnt; + memcpy_to_page(page, cur, src + erofs_blkoff(sb, pos), cnt); } erofs_put_metabuf(&buf); return 0; @@ -976,7 +969,7 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, struct erofs_map_blocks *const map = &fe->map; const loff_t offset = page_offset(page); bool tight = true, exclusive; - unsigned int cur, end, spiltted; + unsigned int cur, end, len, spiltted; int err = 0; /* register locked file pages as online pages in pack */ @@ -1049,17 +1042,11 @@ hitted: goto next_part; } if (map->m_flags & EROFS_MAP_FRAGMENT) { - unsigned int pageofs, skip, len; + erofs_off_t fpos = offset + cur - map->m_la; - if (offset > map->m_la) { - pageofs = 0; - skip = offset - map->m_la; - } else { - pageofs = map->m_la & ~PAGE_MASK; - skip = 0; - } - len = min_t(unsigned int, map->m_llen - skip, end - cur); - err = z_erofs_read_fragment(inode, skip, page, pageofs, len); + len = min_t(unsigned int, map->m_llen - fpos, end - cur); + err = z_erofs_read_fragment(inode->i_sb, page, cur, cur + len, + EROFS_I(inode)->z_fragmentoff + fpos); if (err) goto out; ++spiltted; From 7751567a719dc3cd78125ac4b200596bab4c501a Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Thu, 17 Aug 2023 16:28:07 +0800 Subject: [PATCH 073/139] BACKPORT: erofs: avoid obsolete {collector,collection} terms {collector,collection} were once reserved in order to indicate different runtime logical extent instance of multi-reference pclusters. 
However, de-duplicated decompression has been landed in a more flexible way, thus `struct z_erofs_collection` was formally removed in commit 87ca34a7065d ("erofs: get rid of `struct z_erofs_collection'"). Let's handle the remaining leftovers, for example: `z_erofs_collector_begin` => `z_erofs_pcluster_begin` `z_erofs_collector_end` => `z_erofs_pcluster_end` as well as some comments. No logic changes. Reviewed-by: Yue Hu Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20230817082813.81180-2-hsiangkao@linux.alibaba.com Bug: 318378021 Change-Id: I61b812b5ae3dd564e52012d082415b1fc198383d (cherry picked from commit dcba1b232e26ebadbd215728199455d38a59253e) [dhavale: fixed minor conflict zdata.c in z_erofs_do_read_page()] Signed-off-by: Sandeep Dhavale --- fs/erofs/zdata.c | 39 ++++++++++++++++----------------------- 1 file changed, 18 insertions(+), 21 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index f7d0d71359e8..ad921985883f 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -512,19 +512,17 @@ enum z_erofs_pclustermode { */ Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE, /* - * The current collection has been linked with the owned chain, and - * could also be linked with the remaining collections, which means - * if the processing page is the tail page of the collection, thus - * the current collection can safely use the whole page (since - * the previous collection is under control) for in-place I/O, as - * illustrated below: - * ________________________________________________________________ - * | tail (partial) page | head (partial) page | - * | (of the current cl) | (of the previous collection) | - * | | | - * |__PCLUSTER_FOLLOWED___|___________PCLUSTER_FOLLOWED____________| + * The pcluster was just linked to a decompression chain by us. It can + * also be linked with the remaining pclusters, which means if the + * processing page is the tail page of a pcluster, this pcluster can + * safely use the whole page (since the previous pcluster is within the + * same chain) for in-place I/O, as illustrated below: + * ___________________________________________________ + * | tail (partial) page | head (partial) page | + * | (of the current pcl) | (of the previous pcl) | + * |___PCLUSTER_FOLLOWED___|_____PCLUSTER_FOLLOWED_____| * - * [ (*) the above page can be used as inplace I/O.
] */ Z_EROFS_PCLUSTER_FOLLOWED, }; @@ -855,7 +853,7 @@ err_out: return err; } -static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe) +static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe) { struct erofs_map_blocks *map = &fe->map; struct erofs_workgroup *grp = NULL; @@ -912,12 +910,12 @@ void erofs_workgroup_free_rcu(struct erofs_workgroup *grp) call_rcu(&pcl->rcu, z_erofs_rcu_callback); } -static bool z_erofs_collector_end(struct z_erofs_decompress_frontend *fe) +static void z_erofs_pcluster_end(struct z_erofs_decompress_frontend *fe) { struct z_erofs_pcluster *pcl = fe->pcl; if (!pcl) - return false; + return; z_erofs_bvec_iter_end(&fe->biter); mutex_unlock(&pcl->lock); @@ -933,7 +931,7 @@ static bool z_erofs_collector_end(struct z_erofs_decompress_frontend *fe) erofs_workgroup_put(&pcl->obj); fe->pcl = NULL; - return true; + fe->backmost = false; } static int z_erofs_read_fragment(struct super_block *sb, struct page *page, @@ -984,8 +982,7 @@ repeat: offset + cur >= map->m_la + map->m_llen) { erofs_dbg("out-of-range map @ pos %llu", offset + cur); - if (z_erofs_collector_end(fe)) - fe->backmost = false; + z_erofs_pcluster_end(fe); map->m_la = offset + cur; map->m_llen = 0; err = z_erofs_map_blocks_iter(inode, map, 0); @@ -1001,7 +998,7 @@ repeat: map->m_flags & EROFS_MAP_FRAGMENT) goto hitted; - err = z_erofs_collector_begin(fe); + err = z_erofs_pcluster_begin(fe); if (err) goto out; @@ -1877,7 +1874,7 @@ static int z_erofs_read_folio(struct file *file, struct folio *folio) z_erofs_pcluster_readmore(&f, NULL, true); err = z_erofs_do_read_page(&f, page); z_erofs_pcluster_readmore(&f, NULL, false); - (void)z_erofs_collector_end(&f); + z_erofs_pcluster_end(&f); /* if some compressed cluster ready, need submit them anyway */ z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, 0), false); @@ -1924,7 +1921,7 @@ static void z_erofs_readahead(struct readahead_control *rac) put_page(page); } z_erofs_pcluster_readmore(&f, rac, false); - (void)z_erofs_collector_end(&f); + z_erofs_pcluster_end(&f); z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, nr_pages), true); erofs_put_metabuf(&f.map.buf); From dc94c3cc6b3fae027bc45b89ecb826bd4b6e9a2f Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Thu, 17 Aug 2023 16:28:08 +0800 Subject: [PATCH 074/139] UPSTREAM: erofs: move preparation logic into z_erofs_pcluster_begin() Some preparation logic should be part of z_erofs_pcluster_begin() instead of z_erofs_do_read_page(). Let's move now. 
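Concretely, the tail of z_erofs_pcluster_begin() takes over both cases; below is a trimmed sketch of what the hunk in this patch adds (the error message and some details are elided, so treat it as an outline rather than the literal code):

	if (!z_erofs_is_inline_pcluster(fe->pcl)) {
		/* bind cache first when cached decompression is preferred */
		z_erofs_bind_cache(fe);
	} else {	/* inline pcluster: read the metadata page directly */
		void *mptr;

		mptr = erofs_read_metabuf(&map->buf, sb, blknr, EROFS_NO_KMAP);
		if (IS_ERR(mptr))
			return PTR_ERR(mptr);
		get_page(map->buf.page);
		WRITE_ONCE(fe->pcl->compressed_bvecs[0].page, map->buf.page);
		fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE;
	}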
Reviewed-by: Yue Hu Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20230817082813.81180-3-hsiangkao@linux.alibaba.com Bug: 318378021 (cherry picked from commit aeebae9d77217709f8ae3edb0cd7858ec8c7a9d6) Change-Id: I4bf438d719742a18a6f3065a78bf027de5dae293 Signed-off-by: Sandeep Dhavale --- fs/erofs/zdata.c | 60 ++++++++++++++++++++++-------------------------- 1 file changed, 27 insertions(+), 33 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index ad921985883f..87e7d9f843bd 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -856,6 +856,8 @@ err_out: static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe) { struct erofs_map_blocks *map = &fe->map; + struct super_block *sb = fe->inode->i_sb; + erofs_blk_t blknr = erofs_blknr(sb, map->m_pa); struct erofs_workgroup *grp = NULL; int ret; @@ -865,8 +867,7 @@ static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe) DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_NIL); if (!(map->m_flags & EROFS_MAP_META)) { - grp = erofs_find_workgroup(fe->inode->i_sb, - map->m_pa >> PAGE_SHIFT); + grp = erofs_find_workgroup(sb, blknr); } else if ((map->m_pa & ~PAGE_MASK) + map->m_plen > PAGE_SIZE) { DBG_BUGON(1); return -EFSCORRUPTED; @@ -885,9 +886,26 @@ static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe) } else if (ret) { return ret; } + z_erofs_bvec_iter_begin(&fe->biter, &fe->pcl->bvset, Z_EROFS_INLINE_BVECS, fe->pcl->vcnt); - /* since file-backed online pages are traversed in reverse order */ + if (!z_erofs_is_inline_pcluster(fe->pcl)) { + /* bind cache first when cached decompression is preferred */ + z_erofs_bind_cache(fe); + } else { + void *mptr; + + mptr = erofs_read_metabuf(&map->buf, sb, blknr, EROFS_NO_KMAP); + if (IS_ERR(mptr)) { + ret = PTR_ERR(mptr); + erofs_err(sb, "failed to get inline data %d", ret); + return ret; + } + get_page(map->buf.page); + WRITE_ONCE(fe->pcl->compressed_bvecs[0].page, map->buf.page); + fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE; + } + /* file-backed inplace I/O pages are traversed in reverse order */ fe->icur = z_erofs_pclusterpages(fe->pcl); return 0; } @@ -988,39 +1006,15 @@ repeat: err = z_erofs_map_blocks_iter(inode, map, 0); if (err) goto out; - } else { - if (fe->pcl) - goto hitted; - /* didn't get a valid pcluster previously (very rare) */ + } else if (fe->pcl) { + goto hitted; } - if (!(map->m_flags & EROFS_MAP_MAPPED) || - map->m_flags & EROFS_MAP_FRAGMENT) - goto hitted; - - err = z_erofs_pcluster_begin(fe); - if (err) - goto out; - - if (z_erofs_is_inline_pcluster(fe->pcl)) { - void *mp; - - mp = erofs_read_metabuf(&fe->map.buf, inode->i_sb, - erofs_blknr(inode->i_sb, map->m_pa), - EROFS_NO_KMAP); - if (IS_ERR(mp)) { - err = PTR_ERR(mp); - erofs_err(inode->i_sb, - "failed to get inline page, err %d", err); + if ((map->m_flags & EROFS_MAP_MAPPED) && + !(map->m_flags & EROFS_MAP_FRAGMENT)) { + err = z_erofs_pcluster_begin(fe); + if (err) goto out; - } - get_page(fe->map.buf.page); - WRITE_ONCE(fe->pcl->compressed_bvecs[0].page, - fe->map.buf.page); - fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE; - } else { - /* bind cache first when cached decompression is preferred */ - z_erofs_bind_cache(fe); } hitted: /* From 0d329bbe5cac1bea2bf637e00846eee6982cba53 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Thu, 17 Aug 2023 16:28:09 +0800 Subject: [PATCH 075/139] BACKPORT: erofs: tidy up z_erofs_do_read_page() - Fix a typo: spiltted => split; - Move !EROFS_MAP_MAPPED and EROFS_MAP_FRAGMENT upwards; - Increase 
`split` in advance to avoid unnecessary repeats. Reviewed-by: Yue Hu Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20230817082813.81180-4-hsiangkao@linux.alibaba.com Bug: 318378021 Change-Id: I465fd33c7cbbe91d5da4b4ee2343a7b319534148 (cherry picked from commit e4c1cf523d820730a86cae2c6d55924833b6f7ac) [dhavale: resolved small conflict in zdata.c in z_erofs_do_read_page()] Signed-off-by: Sandeep Dhavale --- fs/erofs/zdata.c | 60 +++++++++++++++++++++--------------------------- 1 file changed, 26 insertions(+), 34 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 87e7d9f843bd..18fb6556cfd2 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -985,53 +985,35 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, struct erofs_map_blocks *const map = &fe->map; const loff_t offset = page_offset(page); bool tight = true, exclusive; - unsigned int cur, end, len, spiltted; + unsigned int cur, end, len, split; int err = 0; - /* register locked file pages as online pages in pack */ z_erofs_onlinepage_init(page); - spiltted = 0; + split = 0; end = PAGE_SIZE; repeat: - cur = end - 1; - - if (offset + cur < map->m_la || - offset + cur >= map->m_la + map->m_llen) { - erofs_dbg("out-of-range map @ pos %llu", offset + cur); - + if (offset + end - 1 < map->m_la || + offset + end - 1 >= map->m_la + map->m_llen) { + erofs_dbg("out-of-range map @ pos %llu", offset + end - 1); z_erofs_pcluster_end(fe); - map->m_la = offset + cur; + map->m_la = offset + end - 1; map->m_llen = 0; err = z_erofs_map_blocks_iter(inode, map, 0); if (err) goto out; - } else if (fe->pcl) { - goto hitted; } - if ((map->m_flags & EROFS_MAP_MAPPED) && - !(map->m_flags & EROFS_MAP_FRAGMENT)) { - err = z_erofs_pcluster_begin(fe); - if (err) - goto out; - } -hitted: - /* - * Ensure the current partial page belongs to this submit chain rather - * than other concurrent submit chains or the noio(bypass) chain since - * those chains are handled asynchronously thus the page cannot be used - * for inplace I/O or bvpage (should be processed in a strict order.) - */ - tight &= (fe->mode > Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE); + cur = offset > map->m_la ? 0 : map->m_la - offset; + /* bump split parts first to avoid several separate cases */ + ++split; - cur = end - min_t(erofs_off_t, offset + end - map->m_la, end); if (!(map->m_flags & EROFS_MAP_MAPPED)) { zero_user_segment(page, cur, end); - ++spiltted; tight = false; goto next_part; } + if (map->m_flags & EROFS_MAP_FRAGMENT) { erofs_off_t fpos = offset + cur - map->m_la; @@ -1040,12 +1022,24 @@ hitted: EROFS_I(inode)->z_fragmentoff + fpos); if (err) goto out; - ++spiltted; tight = false; goto next_part; } - exclusive = (!cur && (!spiltted || tight)); + if (!fe->pcl) { + err = z_erofs_pcluster_begin(fe); + if (err) + goto out; + } + + /* + * Ensure the current partial page belongs to this submit chain rather + * than other concurrent submit chains or the noio(bypass) chain since + * those chains are handled asynchronously thus the page cannot be used + * for inplace I/O or bvpage (should be processed in a strict order.) 
+ */ + tight &= (fe->mode > Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE); + exclusive = (!cur && ((split <= 1) || tight)); if (cur) tight &= (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED); @@ -1058,8 +1052,6 @@ hitted: goto out; z_erofs_onlinepage_split(page); - /* bump up the number of spiltted parts of a page */ - ++spiltted; if (fe->pcl->pageofs_out != (map->m_la & ~PAGE_MASK)) fe->pcl->multibases = true; if (fe->pcl->length < offset + end - map->m_la) { @@ -1084,8 +1076,8 @@ out: z_erofs_page_mark_eio(page); z_erofs_onlinepage_endio(page); - erofs_dbg("%s, finish page: %pK spiltted: %u map->m_llen %llu", - __func__, page, spiltted, map->m_llen); + erofs_dbg("%s, finish page: %pK split: %u map->m_llen %llu", + __func__, page, split, map->m_llen); return err; } From bdc5d268ba5ebf809f44e12bb31cce23703bb659 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Wed, 29 Nov 2023 02:04:31 +0800 Subject: [PATCH 076/139] FROMGIT: erofs: fix memory leak on short-lived bounced pages Both MicroLZMA and DEFLATE algorithms can use short-lived pages on demand for the overlapped inplace I/O decompression. However, those short-lived pages are actually added to `be->compressed_pages`. Thus, it should be checked instead of `pcl->compressed_bvecs`. The LZ4 algorithm doesn't work like this, so it won't be impacted. Fixes: 67139e36d970 ("erofs: introduce `z_erofs_parse_in_bvecs'") Reviewed-by: Yue Hu Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20231128180431.4116991-1-hsiangkao@linux.alibaba.com Bug: 318378021 Change-Id: Ia1f602e9944b884022a3e20db12af568304fd80c (cherry picked from commit 93d6fda7f926451a0fa1121b9558d75ca47e861e https://git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs.git dev) Signed-off-by: Sandeep Dhavale --- fs/erofs/zdata.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 18fb6556cfd2..51ae6de18cba 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -1328,12 +1328,11 @@ out: put_page(page); } else { for (i = 0; i < pclusterpages; ++i) { - page = pcl->compressed_bvecs[i].page; + /* consider shortlived pages added when decompressing */ + page = be->compressed_pages[i]; if (erofs_page_is_managed(sbi, page)) continue; - - /* recycle all individual short-lived pages */ (void)z_erofs_put_shortlivedpage(be->pagepool, page); WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL); } From 8a49ea94416049bab4c6074c6c533181eb419167 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Wed, 6 Dec 2023 12:55:34 +0800 Subject: [PATCH 077/139] FROMGIT: erofs: fix lz4 inplace decompression Currently EROFS can map another compressed buffer for inplace decompression, that was used to handle the cases that some pages of compressed data are actually not in-place I/O. However, like most simple LZ77 algorithms, LZ4 expects the compressed data is arranged at the end of the decompressed buffer and it explicitly uses memmove() to handle overlapping: __________________________________________________________ |_ direction of decompression --> ____ |_ compressed data _| Although EROFS arranges compressed data like this, it typically maps two individual virtual buffers so the relative order is uncertain. Previously, it was hardly observed since LZ4 only uses memmove() for short overlapped literals and x86/arm64 memmove implementations seem to completely cover it up and they don't have this issue. Juhyung reported that EROFS data corruption can be found on a new Intel x86 processor. 
After some analysis, it seems that recent x86 processors with the new FSRM feature expose this issue with "rep movsb". Let's strictly use the decompressed buffer for lz4 inplace decompression for now. Later, as a useful improvement, we could try to tie up these two buffers together in the correct order. Reported-and-tested-by: Juhyung Park Closes: https://lore.kernel.org/r/CAD14+f2AVKf8Fa2OO1aAUdDNTDsVzzR6ctU_oJSmTyd6zSYR2Q@mail.gmail.com Fixes: 0ffd71bcc3a0 ("staging: erofs: introduce LZ4 decompression inplace") Fixes: 598162d05080 ("erofs: support decompress big pcluster for lz4 backend") Cc: stable # 5.4+ Tested-by: Yifan Zhao Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20231206045534.3920847-1-hsiangkao@linux.alibaba.com Bug: 318378021 Change-Id: Ifd2981320f9f79b27bc7484d8906501a2fa05359 (cherry picked from commit 3c12466b6b7bf1e56f9b32c366a3d83d87afb4de https://git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs.git dev) Signed-off-by: Sandeep Dhavale --- fs/erofs/decompressor.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index cfad1eac7fd9..024e0b4733e8 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -122,11 +122,11 @@ static int z_erofs_lz4_prepare_dstpages(struct z_erofs_lz4_decompress_ctx *ctx, } static void *z_erofs_lz4_handle_overlap(struct z_erofs_lz4_decompress_ctx *ctx, - void *inpage, unsigned int *inputmargin, int *maptype, - bool may_inplace) + void *inpage, void *out, unsigned int *inputmargin, + int *maptype, bool may_inplace) { struct z_erofs_decompress_req *rq = ctx->rq; - unsigned int omargin, total, i, j; + unsigned int omargin, total, i; struct page **in; void *src, *tmp; @@ -136,12 +136,13 @@ static void *z_erofs_lz4_handle_overlap(struct z_erofs_lz4_decompress_ctx *ctx, omargin < LZ4_DECOMPRESS_INPLACE_MARGIN(rq->inputsize)) goto docopy; - for (i = 0; i < ctx->inpages; ++i) { - DBG_BUGON(rq->in[i] == NULL); - for (j = 0; j < ctx->outpages - ctx->inpages + i; ++j) - if (rq->out[j] == rq->in[i]) - goto docopy; - } + for (i = 0; i < ctx->inpages; ++i) + if (rq->out[ctx->outpages - ctx->inpages + i] != + rq->in[i]) + goto docopy; + kunmap_local(inpage); + *maptype = 3; + return out + ((ctx->outpages - ctx->inpages) << PAGE_SHIFT); } if (ctx->inpages <= 1) { @@ -149,7 +150,6 @@ static void *z_erofs_lz4_handle_overlap(struct z_erofs_lz4_decompress_ctx *ctx, return inpage; } kunmap_local(inpage); - might_sleep(); src = erofs_vm_map_ram(rq->in, ctx->inpages); if (!src) return ERR_PTR(-ENOMEM); @@ -205,12 +205,12 @@ int z_erofs_fixup_insize(struct z_erofs_decompress_req *rq, const char *padbuf, } static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx, - u8 *out) + u8 *dst) { struct z_erofs_decompress_req *rq = ctx->rq; bool support_0padding = false, may_inplace = false; unsigned int inputmargin; - u8 *headpage, *src; + u8 *out, *headpage, *src; int ret, maptype; DBG_BUGON(*rq->in == NULL); @@ -231,11 +231,12 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx, } inputmargin = rq->pageofs_in; - src = z_erofs_lz4_handle_overlap(ctx, headpage, &inputmargin, + src = z_erofs_lz4_handle_overlap(ctx, headpage, dst, &inputmargin, &maptype, may_inplace); if (IS_ERR(src)) return PTR_ERR(src); + out = dst + rq->pageofs_out; /* legacy format could compress extra data in a pcluster.
*/ if (rq->partial_decoding || !support_0padding) ret = LZ4_decompress_safe_partial(src + inputmargin, out, @@ -266,7 +267,7 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx, vm_unmap_ram(src, ctx->inpages); } else if (maptype == 2) { erofs_put_pcpubuf(src); - } else { + } else if (maptype != 3) { DBG_BUGON(1); return -EFAULT; } @@ -309,7 +310,7 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, } dstmap_out: - ret = z_erofs_lz4_decompress_mem(&ctx, dst + rq->pageofs_out); + ret = z_erofs_lz4_decompress_mem(&ctx, dst); if (!dst_maptype) kunmap_local(dst); else if (dst_maptype == 2) From 9d259220acdf19d29e0970740d10d2ec973c5d78 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Wed, 6 Dec 2023 17:10:53 +0800 Subject: [PATCH 078/139] FROMGIT: erofs: support I/O submission for sub-page compressed blocks Add a basic I/O submission path first to support sub-page blocks: - Temporary short-lived pages will be used entirely; - In-place I/O pages can be used partially, but compressed pages need to be able to be mapped in contiguous virtual memory. As a start, currently cache decompression is explicitly disabled for sub-page blocks, which will be supported in the future. Reviewed-by: Yue Hu Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20231206091057.87027-2-hsiangkao@linux.alibaba.com Bug: 318378021 Change-Id: Ib2cb6120805ab479a450580fc8774af131271791 (cherry picked from commit 192351616a9dde686492bcb9d1e4895a1411a527 https: //git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs.git dev) Signed-off-by: Sandeep Dhavale --- fs/erofs/zdata.c | 156 ++++++++++++++++++++++------------------------- 1 file changed, 74 insertions(+), 82 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 51ae6de18cba..bc8e6cdf3ae3 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -1452,86 +1452,85 @@ static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io, z_erofs_decompressqueue_work(&io->u.work); } -static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl, - unsigned int nr, - struct page **pagepool, - struct address_space *mc) +static void z_erofs_fill_bio_vec(struct bio_vec *bvec, + struct z_erofs_decompress_frontend *f, + struct z_erofs_pcluster *pcl, + unsigned int nr, + struct address_space *mc) { - const pgoff_t index = pcl->obj.index; gfp_t gfp = mapping_gfp_mask(mc); bool tocache = false; - + struct z_erofs_bvec *zbv = pcl->compressed_bvecs + nr; struct address_space *mapping; - struct page *oldpage, *page; - int justfound; + struct page *page, *oldpage; + int justfound, bs = i_blocksize(f->inode); + /* Except for inplace pages, the entire page can be used for I/Os */ + bvec->bv_offset = 0; + bvec->bv_len = PAGE_SIZE; repeat: - page = READ_ONCE(pcl->compressed_bvecs[nr].page); - oldpage = page; - - if (!page) + oldpage = READ_ONCE(zbv->page); + if (!oldpage) goto out_allocpage; - justfound = (unsigned long)page & 1UL; - page = (struct page *)((unsigned long)page & ~1UL); + justfound = (unsigned long)oldpage & 1UL; + page = (struct page *)((unsigned long)oldpage & ~1UL); + bvec->bv_page = page; + DBG_BUGON(z_erofs_is_shortlived_page(page)); /* - * preallocated cached pages, which is used to avoid direct reclaim - * otherwise, it will go inplace I/O path instead. + * Handle preallocated cached pages. We tried to allocate such pages + * without triggering direct reclaim. If allocation failed, inplace + * file-backed pages will be used instead. 
*/ if (page->private == Z_EROFS_PREALLOCATED_PAGE) { - WRITE_ONCE(pcl->compressed_bvecs[nr].page, page); set_page_private(page, 0); + WRITE_ONCE(zbv->page, page); tocache = true; goto out_tocache; } + mapping = READ_ONCE(page->mapping); - /* - * file-backed online pages in plcuster are all locked steady, - * therefore it is impossible for `mapping' to be NULL. + * File-backed pages for inplace I/Os are all locked steady, + * therefore it is impossible for `mapping` to be NULL. */ - if (mapping && mapping != mc) - /* ought to be unmanaged pages */ - goto out; - - /* directly return for shortlived page as well */ - if (z_erofs_is_shortlived_page(page)) - goto out; + if (mapping && mapping != mc) { + if (zbv->offset < 0) + bvec->bv_offset = round_up(-zbv->offset, bs); + bvec->bv_len = round_up(zbv->end, bs) - bvec->bv_offset; + return; + } lock_page(page); - /* only true if page reclaim goes wrong, should never happen */ DBG_BUGON(justfound && PagePrivate(page)); - /* the page is still in manage cache */ + /* the cached page is still in managed cache */ if (page->mapping == mc) { - WRITE_ONCE(pcl->compressed_bvecs[nr].page, page); - + WRITE_ONCE(zbv->page, page); + /* + * The cached page is still available but without a valid + * `->private` pcluster hint. Let's reconnect them. + */ if (!PagePrivate(page)) { - /* - * impossible to be !PagePrivate(page) for - * the current restriction as well if - * the page is already in compressed_bvecs[]. - */ DBG_BUGON(!justfound); - - justfound = 0; - set_page_private(page, (unsigned long)pcl); - SetPagePrivate(page); + /* compressed_bvecs[] already takes a ref */ + attach_page_private(page, pcl); + put_page(page); } - /* no need to submit io if it is already up-to-date */ + /* no need to submit if it is already up-to-date */ if (PageUptodate(page)) { unlock_page(page); - page = NULL; + bvec->bv_page = NULL; } - goto out; + return; } /* - * the managed page has been truncated, it's unsafe to - * reuse this one, let's allocate a new cache-managed page. + * It has been truncated, so it's unsafe to reuse this one. Let's + * allocate a new page for compressed data. 
*/ DBG_BUGON(page->mapping); DBG_BUGON(!justfound); @@ -1540,25 +1539,23 @@ repeat: unlock_page(page); put_page(page); out_allocpage: - page = erofs_allocpage(pagepool, gfp | __GFP_NOFAIL); - if (oldpage != cmpxchg(&pcl->compressed_bvecs[nr].page, - oldpage, page)) { - erofs_pagepool_add(pagepool, page); + page = erofs_allocpage(&f->pagepool, gfp | __GFP_NOFAIL); + if (oldpage != cmpxchg(&zbv->page, oldpage, page)) { + erofs_pagepool_add(&f->pagepool, page); cond_resched(); goto repeat; } + bvec->bv_page = page; out_tocache: - if (!tocache || add_to_page_cache_lru(page, mc, index + nr, gfp)) { - /* turn into temporary page if fails (1 ref) */ + if (!tocache || bs != PAGE_SIZE || + add_to_page_cache_lru(page, mc, pcl->obj.index + nr, gfp)) { + /* turn into a temporary shortlived page (1 ref) */ set_page_private(page, Z_EROFS_SHORTLIVED_PAGE); - goto out; + return; } attach_page_private(page, pcl); - /* drop a refcount added by allocpage (then we have 2 refs here) */ + /* drop a refcount added by allocpage (then 2 refs in total here) */ put_page(page); - -out: /* the only exit (for tracing and debugging) */ - return page; } static struct z_erofs_decompressqueue *jobqueue_init(struct super_block *sb, @@ -1613,7 +1610,7 @@ static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl, qtail[JQ_BYPASS] = &pcl->next; } -static void z_erofs_decompressqueue_endio(struct bio *bio) +static void z_erofs_submissionqueue_endio(struct bio *bio) { struct z_erofs_decompressqueue *q = bio->bi_private; blk_status_t err = bio->bi_status; @@ -1625,7 +1622,6 @@ static void z_erofs_decompressqueue_endio(struct bio *bio) DBG_BUGON(PageUptodate(page)); DBG_BUGON(z_erofs_page_is_invalidated(page)); - if (erofs_page_is_managed(EROFS_SB(q->sb), page)) { if (!err) SetPageUptodate(page); @@ -1648,17 +1644,14 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, struct z_erofs_decompressqueue *q[NR_JOBQUEUES]; z_erofs_next_pcluster_t owned_head = f->owned_head; /* bio is NULL initially, so no need to initialize last_{index,bdev} */ - pgoff_t last_index; + erofs_off_t last_pa; struct block_device *last_bdev; unsigned int nr_bios = 0; struct bio *bio = NULL; unsigned long pflags; int memstall = 0; - /* - * if managed cache is enabled, bypass jobqueue is needed, - * no need to read from device for all pclusters in this queue. - */ + /* No need to read from device for pclusters in the bypass queue. 
*/ q[JQ_BYPASS] = jobqueue_init(sb, fgq + JQ_BYPASS, NULL); q[JQ_SUBMIT] = jobqueue_init(sb, fgq + JQ_SUBMIT, force_fg); @@ -1671,7 +1664,8 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, do { struct erofs_map_dev mdev; struct z_erofs_pcluster *pcl; - pgoff_t cur, end; + erofs_off_t cur, end; + struct bio_vec bvec; unsigned int i = 0; bool bypass = true; @@ -1690,18 +1684,14 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, }; (void)erofs_map_dev(sb, &mdev); - cur = erofs_blknr(sb, mdev.m_pa); - end = cur + pcl->pclusterpages; - + cur = mdev.m_pa; + end = cur + (pcl->pclusterpages << PAGE_SHIFT); do { - struct page *page; - - page = pickup_page_for_submission(pcl, i++, - &f->pagepool, mc); - if (!page) + z_erofs_fill_bio_vec(&bvec, f, pcl, i++, mc); + if (!bvec.bv_page) continue; - if (bio && (cur != last_index + 1 || + if (bio && (cur != last_pa || last_bdev != mdev.m_bdev)) { submit_bio_retry: submit_bio(bio); @@ -1712,7 +1702,8 @@ submit_bio_retry: bio = NULL; } - if (unlikely(PageWorkingset(page)) && !memstall) { + if (unlikely(PageWorkingset(bvec.bv_page)) && + !memstall) { psi_memstall_enter(&pflags); memstall = 1; } @@ -1720,23 +1711,24 @@ submit_bio_retry: if (!bio) { bio = bio_alloc(mdev.m_bdev, BIO_MAX_VECS, REQ_OP_READ, GFP_NOIO); - bio->bi_end_io = z_erofs_decompressqueue_endio; - - last_bdev = mdev.m_bdev; - bio->bi_iter.bi_sector = (sector_t)cur << - (sb->s_blocksize_bits - 9); + bio->bi_end_io = z_erofs_submissionqueue_endio; + bio->bi_iter.bi_sector = cur >> 9; bio->bi_private = q[JQ_SUBMIT]; if (readahead) bio->bi_opf |= REQ_RAHEAD; ++nr_bios; + last_bdev = mdev.m_bdev; } - if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) + if (cur + bvec.bv_len > end) + bvec.bv_len = end - cur; + if (!bio_add_page(bio, bvec.bv_page, bvec.bv_len, + bvec.bv_offset)) goto submit_bio_retry; - last_index = cur; + last_pa = cur + bvec.bv_len; bypass = false; - } while (++cur < end); + } while ((cur += bvec.bv_len) < end); if (!bypass) qtail[JQ_SUBMIT] = &pcl->next; From 9d259220acdf19d29e0970740d10d2ec973c5d78 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Wed, 6 Dec 2023 17:10:54 +0800 Subject: [PATCH 079/139] FROMGIT: erofs: record `pclustersize` in bytes instead of pages Currently, compressed sizes are recorded in pages using `pclusterpages`. However, for tailpacking pclusters, `tailpacking_size` is used instead. This approach doesn't work when dealing with sub-page blocks. To address this, let's switch them to the unified `pclustersize` in bytes.
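Once sizes are tracked in bytes, the number of backing pages is derived on demand. The converted helper from the hunk below, shown assembled for context (the example numbers afterwards are illustrative only):

	/* bytes -> backing pages: round the byte size up to a page boundary */
	static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl)
	{
		return PAGE_ALIGN(pcl->pclustersize) >> PAGE_SHIFT;
	}

For example, with 4KiB pages a `pclustersize` of 1024 (two 512-byte blocks) still occupies one page, while 8192 occupies two; a page count alone cannot represent the former, which is why bytes are the right unit for sub-page blocks.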
Reviewed-by: Yue Hu Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20231206091057.87027-3-hsiangkao@linux.alibaba.com Bug: 318378021 Change-Id: Ia8c50a7b4adcf6cd161b1d6f8bfc5a7fd3371079 (cherry picked from commit 54ed3fdd66055d073cb1cd2c6c65bbc0683c40cf https: //git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs.git dev) Signed-off-by: Sandeep Dhavale --- fs/erofs/zdata.c | 64 ++++++++++++++++++++---------------------------- 1 file changed, 26 insertions(+), 38 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index bc8e6cdf3ae3..471129a41335 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -57,6 +57,9 @@ struct z_erofs_pcluster { /* L: total number of bvecs */ unsigned int vcnt; + /* I: pcluster size (compressed size) in bytes */ + unsigned int pclustersize; + /* I: page offset of start position of decompression */ unsigned short pageofs_out; @@ -71,14 +74,6 @@ struct z_erofs_pcluster { struct rcu_head rcu; }; - union { - /* I: physical cluster size in pages */ - unsigned short pclusterpages; - - /* I: tailpacking inline compressed size */ - unsigned short tailpacking_size; - }; - /* I: compression algorithm format */ unsigned char algorithmformat; @@ -118,9 +113,7 @@ static inline bool z_erofs_is_inline_pcluster(struct z_erofs_pcluster *pcl) static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl) { - if (z_erofs_is_inline_pcluster(pcl)) - return 1; - return pcl->pclusterpages; + return PAGE_ALIGN(pcl->pclustersize) >> PAGE_SHIFT; } /* @@ -306,12 +299,12 @@ static int z_erofs_create_pcluster_pool(void) return 0; } -static struct z_erofs_pcluster *z_erofs_alloc_pcluster(unsigned int nrpages) +static struct z_erofs_pcluster *z_erofs_alloc_pcluster(unsigned int size) { - int i; + unsigned int nrpages = PAGE_ALIGN(size) >> PAGE_SHIFT; + struct z_erofs_pcluster_slab *pcs = pcluster_pool; - for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) { - struct z_erofs_pcluster_slab *pcs = pcluster_pool + i; + for (; pcs < pcluster_pool + ARRAY_SIZE(pcluster_pool); ++pcs) { struct z_erofs_pcluster *pcl; if (nrpages > pcs->maxpages) @@ -320,7 +313,7 @@ static struct z_erofs_pcluster *z_erofs_alloc_pcluster(unsigned int nrpages) pcl = kmem_cache_zalloc(pcs->slab, GFP_NOFS); if (!pcl) return ERR_PTR(-ENOMEM); - pcl->pclusterpages = nrpages; + pcl->pclustersize = size; return pcl; } return ERR_PTR(-EINVAL); @@ -571,6 +564,7 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe) { struct address_space *mc = MNGD_MAPPING(EROFS_I_SB(fe->inode)); struct z_erofs_pcluster *pcl = fe->pcl; + unsigned int pclusterpages = z_erofs_pclusterpages(pcl); bool shouldalloc = z_erofs_should_alloc_cache(fe); bool standalone = true; /* @@ -584,10 +578,9 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe) if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED) return; - for (i = 0; i < pcl->pclusterpages; ++i) { - struct page *page; + for (i = 0; i < pclusterpages; ++i) { + struct page *page, *newpage; void *t; /* mark pages just found for debugging */ - struct page *newpage = NULL; /* the compressed page was loaded before */ if (READ_ONCE(pcl->compressed_bvecs[i].page)) @@ -597,6 +590,7 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe) if (page) { t = (void *)((unsigned long)page | 1); + newpage = NULL; } else { /* I/O is needed, no possible to decompress directly */ standalone = false; @@ -604,9 +598,8 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe) continue; /* - * try to use 
cached I/O if page allocation - * succeeds or fallback to in-place I/O instead - * to avoid any direct reclaim. + * Try cached I/O if allocation succeeds or fallback to + * in-place I/O instead to avoid any direct reclaim. */ newpage = erofs_allocpage(&fe->pagepool, gfp); if (!newpage) @@ -638,6 +631,7 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, { struct z_erofs_pcluster *const pcl = container_of(grp, struct z_erofs_pcluster, obj); + unsigned int pclusterpages = z_erofs_pclusterpages(pcl); int i; DBG_BUGON(z_erofs_is_inline_pcluster(pcl)); @@ -645,7 +639,7 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, * refcount of workgroup is now freezed as 1, * therefore no need to worry about available decompression users. */ - for (i = 0; i < pcl->pclusterpages; ++i) { + for (i = 0; i < pclusterpages; ++i) { struct page *page = pcl->compressed_bvecs[i].page; if (!page) @@ -669,6 +663,7 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, static bool z_erofs_cache_release_folio(struct folio *folio, gfp_t gfp) { struct z_erofs_pcluster *pcl = folio_get_private(folio); + unsigned int pclusterpages = z_erofs_pclusterpages(pcl); bool ret; int i; @@ -680,7 +675,7 @@ static bool z_erofs_cache_release_folio(struct folio *folio, gfp_t gfp) ret = false; DBG_BUGON(z_erofs_is_inline_pcluster(pcl)); - for (i = 0; i < pcl->pclusterpages; ++i) { + for (i = 0; i < pclusterpages; ++i) { if (pcl->compressed_bvecs[i].page == &folio->page) { WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL); ret = true; @@ -789,20 +784,20 @@ static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f) static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe) { struct erofs_map_blocks *map = &fe->map; + struct super_block *sb = fe->inode->i_sb; bool ztailpacking = map->m_flags & EROFS_MAP_META; struct z_erofs_pcluster *pcl; struct erofs_workgroup *grp; int err; if (!(map->m_flags & EROFS_MAP_ENCODED) || - (!ztailpacking && !(map->m_pa >> PAGE_SHIFT))) { + (!ztailpacking && !erofs_blknr(sb, map->m_pa))) { DBG_BUGON(1); return -EFSCORRUPTED; } /* no available pcluster, let's allocate one */ - pcl = z_erofs_alloc_pcluster(ztailpacking ? 
1 : - map->m_plen >> PAGE_SHIFT); + pcl = z_erofs_alloc_pcluster(map->m_plen); if (IS_ERR(pcl)) return PTR_ERR(pcl); @@ -826,9 +821,8 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe) if (ztailpacking) { pcl->obj.index = 0; /* which indicates ztailpacking */ pcl->pageofs_in = erofs_blkoff(fe->inode->i_sb, map->m_pa); - pcl->tailpacking_size = map->m_plen; } else { - pcl->obj.index = map->m_pa >> PAGE_SHIFT; + pcl->obj.index = erofs_blknr(sb, map->m_pa); grp = erofs_insert_workgroup(fe->inode->i_sb, &pcl->obj); if (IS_ERR(grp)) { @@ -1263,8 +1257,7 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, unsigned int pclusterpages = z_erofs_pclusterpages(pcl); const struct z_erofs_decompressor *decompressor = &erofs_decompressors[pcl->algorithmformat]; - unsigned int i, inputsize; - int err2; + int i, err2; struct page *page; bool overlapped; @@ -1301,18 +1294,13 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, if (err) goto out; - if (z_erofs_is_inline_pcluster(pcl)) - inputsize = pcl->tailpacking_size; - else - inputsize = pclusterpages * PAGE_SIZE; - err = decompressor->decompress(&(struct z_erofs_decompress_req) { .sb = be->sb, .in = be->compressed_pages, .out = be->decompressed_pages, .pageofs_in = pcl->pageofs_in, .pageofs_out = pcl->pageofs_out, - .inputsize = inputsize, + .inputsize = pcl->pclustersize, .outputsize = pcl->length, .alg = pcl->algorithmformat, .inplace_io = overlapped, @@ -1685,7 +1673,7 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, (void)erofs_map_dev(sb, &mdev); cur = mdev.m_pa; - end = cur + (pcl->pclusterpages << PAGE_SHIFT); + end = cur + pcl->pclustersize; do { z_erofs_fill_bio_vec(&bvec, f, pcl, i++, mc); if (!bvec.bv_page) From 0c6a18c75b54e9045df5284d1cd5d65afcdfe34d Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Wed, 6 Dec 2023 17:10:55 +0800 Subject: [PATCH 080/139] BACKPORT: FROMGIT: erofs: fix up compacted indexes for block size < 4096 Previously, the block size always equaled PAGE_SIZE, therefore `lclusterbits` couldn't be less than 12. Since sub-page compressed blocks are now considered, `lobits` for an lcluster in each pack cannot always be `lclusterbits` as before. Otherwise, there is not enough room for the special value `Z_EROFS_VLE_DI_D0_CBLKCNT`.
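A worked example, assuming Z_EROFS_VLE_DI_D0_CBLKCNT is (1 << 11) as defined in erofs_fs.h, so the marker occupies bit 11 and the lo field must be at least 12 bits wide (the formula itself is given in the next paragraph):

	lobits = max(lclusterbits, ilog2(Z_EROFS_VLE_DI_D0_CBLKCNT) + 1)
	       = max(12, 11 + 1) = 12   for 4KiB blocks (lclusterbits = 12, as before)
	       = max(9,  11 + 1) = 12   for 512-byte blocks (lclusterbits = 9)

So the special value always fits, whereas reusing `lclusterbits` directly would leave only 9 usable bits in the second case.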
To support smaller block sizes, `lobits` for each compacted lcluster is now calculated as: lobits = max(lclusterbits, ilog2(Z_EROFS_VLE_DI_D0_CBLKCNT) + 1) Reviewed-by: Yue Hu Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20231206091057.87027-4-hsiangkao@linux.alibaba.com Bug: 318378021 Change-Id: Iacd89e2b33ddf39ea40b90e88a2bf99bb5a83b31 (cherry picked from commit 8d2517aaeea3ab8651bb517bca8f3c8664d318ea https: //git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs.git dev) [dhavale: resolved conflicts in zmap.c due to older naming of constants and updated commit message also to use the older names] Signed-off-by: Sandeep Dhavale --- fs/erofs/zmap.c | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 8973ccad707d..f5d3ba39dd42 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -101,29 +101,26 @@ static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m, } static unsigned int decode_compactedbits(unsigned int lobits, - unsigned int lomask, u8 *in, unsigned int pos, u8 *type) { const unsigned int v = get_unaligned_le32(in + pos / 8) >> (pos & 7); - const unsigned int lo = v & lomask; + const unsigned int lo = v & ((1 << lobits) - 1); *type = (v >> lobits) & 3; return lo; } -static int get_compacted_la_distance(unsigned int lclusterbits, +static int get_compacted_la_distance(unsigned int lobits, unsigned int encodebits, unsigned int vcnt, u8 *in, int i) { - const unsigned int lomask = (1 << lclusterbits) - 1; unsigned int lo, d1 = 0; u8 type; DBG_BUGON(i >= vcnt); do { - lo = decode_compactedbits(lclusterbits, lomask, - in, encodebits * i, &type); + lo = decode_compactedbits(lobits, in, encodebits * i, &type); if (type != Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) return d1; @@ -142,15 +139,14 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, { struct erofs_inode *const vi = EROFS_I(m->inode); const unsigned int lclusterbits = vi->z_logical_clusterbits; - const unsigned int lomask = (1 << lclusterbits) - 1; - unsigned int vcnt, base, lo, encodebits, nblk, eofs; + unsigned int vcnt, base, lo, lobits, encodebits, nblk, eofs; int i; u8 *in, type; bool big_pcluster; if (1 << amortizedshift == 4 && lclusterbits <= 14) vcnt = 2; - else if (1 << amortizedshift == 2 && lclusterbits == 12) + else if (1 << amortizedshift == 2 && lclusterbits <= 12) vcnt = 16; else return -EOPNOTSUPP; @@ -159,6 +155,7 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, m->nextpackoff = round_down(pos, vcnt << amortizedshift) + (vcnt << amortizedshift); big_pcluster = vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1; + lobits = max(lclusterbits, ilog2(Z_EROFS_VLE_DI_D0_CBLKCNT) + 1U); encodebits = ((vcnt << amortizedshift) - sizeof(__le32)) * 8 / vcnt; eofs = erofs_blkoff(m->inode->i_sb, pos); base = round_down(eofs, vcnt << amortizedshift); @@ -166,15 +163,14 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, i = (eofs - base) >> amortizedshift; - lo = decode_compactedbits(lclusterbits, lomask, - in, encodebits * i, &type); + lo = decode_compactedbits(lobits, in, encodebits * i, &type); m->type = type; if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) { m->clusterofs = 1 << lclusterbits; /* figure out lookahead_distance: delta[1] if needed */ if (lookahead) - m->delta[1] = get_compacted_la_distance(lclusterbits, + m->delta[1] = get_compacted_la_distance(lobits, encodebits, vcnt, in, i); if (lo & Z_EROFS_VLE_DI_D0_CBLKCNT) { if (!big_pcluster) { @@ 
-193,8 +189,8 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, * of which lo saves delta[1] rather than delta[0]. * Hence, get delta[0] by the previous lcluster indirectly. */ - lo = decode_compactedbits(lclusterbits, lomask, - in, encodebits * (i - 1), &type); + lo = decode_compactedbits(lobits, in, + encodebits * (i - 1), &type); if (type != Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) lo = 0; else if (lo & Z_EROFS_VLE_DI_D0_CBLKCNT) @@ -209,8 +205,8 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, nblk = 1; while (i > 0) { --i; - lo = decode_compactedbits(lclusterbits, lomask, - in, encodebits * i, &type); + lo = decode_compactedbits(lobits, in, + encodebits * i, &type); if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) i -= lo; @@ -221,8 +217,8 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, nblk = 0; while (i > 0) { --i; - lo = decode_compactedbits(lclusterbits, lomask, - in, encodebits * i, &type); + lo = decode_compactedbits(lobits, in, + encodebits * i, &type); if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) { if (lo & Z_EROFS_VLE_DI_D0_CBLKCNT) { --i; From a18efa4e4aa95a7348469db998411322ac8193be Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Fri, 15 Dec 2023 00:13:37 +0800 Subject: [PATCH 081/139] FROMGIT: erofs: fix ztailpacking for subpage compressed blocks `pageofs_in` should be the compressed data offset of the page rather than of the block. Acked-by: Chao Yu Reviewed-by: Yue Hu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20231214161337.753049-1-hsiangkao@linux.alibaba.com Bug: 318378021 Change-Id: I0997a69b22b0f42c327c810359f55f5fa6a76275 (cherry picked from commit e5aba911dee5e20fa82efbe13e0af8f38ea459e7 https://git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs.git dev) Signed-off-by: Sandeep Dhavale --- fs/erofs/zdata.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 471129a41335..ffb5ed1b07b0 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -820,7 +820,6 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe) if (ztailpacking) { pcl->obj.index = 0; /* which indicates ztailpacking */ - pcl->pageofs_in = erofs_blkoff(fe->inode->i_sb, map->m_pa); } else { pcl->obj.index = erofs_blknr(sb, map->m_pa); @@ -897,6 +896,7 @@ static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe) } get_page(map->buf.page); WRITE_ONCE(fe->pcl->compressed_bvecs[0].page, map->buf.page); + fe->pcl->pageofs_in = map->m_pa & ~PAGE_MASK; fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE; } /* file-backed inplace I/O pages are traversed in reverse order */ From f466d5216404d611d6f9559bc97547edb7afa714 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Wed, 6 Dec 2023 17:10:56 +0800 Subject: [PATCH 082/139] FROMGIT: erofs: refine z_erofs_transform_plain() for sub-page block support Sub-page block support is still unusable even with previous commits if interlaced PLAIN pclusters exist. Such pclusters can be found if the fragment feature is enabled. This commit tries to handle "the head part" of interlaced PLAIN pclusters first: it was once explained in commit fdffc091e6f9 ("erofs: support interlaced uncompressed data for compressed files"). It uses a unified way for both shifted and interlaced PLAIN pclusters. As an added bonus, PLAIN pclusters larger than the block size are also supported now for the upcoming large lclusters.
Reviewed-by: Yue Hu Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20231206091057.87027-5-hsiangkao@linux.alibaba.com Bug: 318378021 Change-Id: I3d50132664f8754f56d62744420060108ed0da4f (cherry picked from commit 192351616a9dde686492bcb9d1e4895a1411a527 https: //git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs.git dev) Signed-off-by: Sandeep Dhavale --- fs/erofs/decompressor.c | 82 ++++++++++++++++++++++++----------------- 1 file changed, 49 insertions(+), 33 deletions(-) diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index 024e0b4733e8..38c7f9c96c68 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -321,43 +321,59 @@ dstmap_out: static int z_erofs_transform_plain(struct z_erofs_decompress_req *rq, struct page **pagepool) { - const unsigned int inpages = PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT; - const unsigned int outpages = + const unsigned int nrpages_in = + PAGE_ALIGN(rq->pageofs_in + rq->inputsize) >> PAGE_SHIFT; + const unsigned int nrpages_out = PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; - const unsigned int righthalf = min_t(unsigned int, rq->outputsize, - PAGE_SIZE - rq->pageofs_out); - const unsigned int lefthalf = rq->outputsize - righthalf; - const unsigned int interlaced_offset = - rq->alg == Z_EROFS_COMPRESSION_SHIFTED ? 0 : rq->pageofs_out; - u8 *src; + const unsigned int bs = rq->sb->s_blocksize; + unsigned int cur = 0, ni = 0, no, pi, po, insz, cnt; + u8 *kin; - if (outpages > 2 && rq->alg == Z_EROFS_COMPRESSION_SHIFTED) { - DBG_BUGON(1); - return -EFSCORRUPTED; - } - - if (rq->out[0] == *rq->in) { - DBG_BUGON(rq->pageofs_out); - return 0; - } - - src = kmap_local_page(rq->in[inpages - 1]) + rq->pageofs_in; - if (rq->out[0]) - memcpy_to_page(rq->out[0], rq->pageofs_out, - src + interlaced_offset, righthalf); - - if (outpages > inpages) { - DBG_BUGON(!rq->out[outpages - 1]); - if (rq->out[outpages - 1] != rq->in[inpages - 1]) { - memcpy_to_page(rq->out[outpages - 1], 0, src + - (interlaced_offset ? 
0 : righthalf), - lefthalf); - } else if (!interlaced_offset) { - memmove(src, src + righthalf, lefthalf); - flush_dcache_page(rq->in[inpages - 1]); + DBG_BUGON(rq->outputsize > rq->inputsize); + if (rq->alg == Z_EROFS_COMPRESSION_INTERLACED) { + cur = bs - (rq->pageofs_out & (bs - 1)); + pi = (rq->pageofs_in + rq->inputsize - cur) & ~PAGE_MASK; + cur = min(cur, rq->outputsize); + if (cur && rq->out[0]) { + kin = kmap_local_page(rq->in[nrpages_in - 1]); + if (rq->out[0] == rq->in[nrpages_in - 1]) { + memmove(kin + rq->pageofs_out, kin + pi, cur); + flush_dcache_page(rq->out[0]); + } else { + memcpy_to_page(rq->out[0], rq->pageofs_out, + kin + pi, cur); + } + kunmap_local(kin); } + rq->outputsize -= cur; } - kunmap_local(src); + + for (; rq->outputsize; rq->pageofs_in = 0, cur += PAGE_SIZE, ni++) { + insz = min_t(unsigned int, PAGE_SIZE - rq->pageofs_in, + rq->outputsize); + rq->outputsize -= insz; + if (!rq->in[ni]) + continue; + kin = kmap_local_page(rq->in[ni]); + pi = 0; + do { + no = (rq->pageofs_out + cur + pi) >> PAGE_SHIFT; + po = (rq->pageofs_out + cur + pi) & ~PAGE_MASK; + DBG_BUGON(no >= nrpages_out); + cnt = min_t(unsigned int, insz - pi, PAGE_SIZE - po); + if (rq->out[no] == rq->in[ni]) { + memmove(kin + po, + kin + rq->pageofs_in + pi, cnt); + flush_dcache_page(rq->out[no]); + } else if (rq->out[no]) { + memcpy_to_page(rq->out[no], po, + kin + rq->pageofs_in + pi, cnt); + } + pi += cnt; + } while (pi < insz); + kunmap_local(kin); + } + DBG_BUGON(ni > nrpages_in); return 0; } From 37e0a5b868093b50e698e43ac2412cd9a883f395 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Wed, 6 Dec 2023 17:10:57 +0800 Subject: [PATCH 083/139] BACKPORT: FROMGIT: erofs: enable sub-page compressed block support Let's just disable cached decompression and inplace I/Os for partial pages as the first step in order to enable sub-page block initial support. In other words, currently it works primarily based on temporary short-lived pages. Don't expect too much in terms of performance. Reviewed-by: Yue Hu Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20231206091057.87027-6-hsiangkao@linux.alibaba.com Bug: 318378021 Change-Id: I00238aa437f20c46d015bbe5ab7b706b80b8cfd7 (cherry picked from commit 0ee3a0d59e007320167a2e9f4b8bf1304ada7771 https://git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs.git dev) [dhavale: resolved conflicts in inode.c in erofs_fill_inode()] Signed-off-by: Sandeep Dhavale --- fs/erofs/inode.c | 7 +++++-- fs/erofs/zdata.c | 6 ++++-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index 187ca02bbc2d..190bc3ad5622 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -291,9 +291,12 @@ static int erofs_fill_inode(struct inode *inode) } if (erofs_inode_is_data_compressed(vi->datalayout)) { - if (!erofs_is_fscache_mode(inode->i_sb) && - inode->i_sb->s_blocksize_bits == PAGE_SHIFT) + if (!erofs_is_fscache_mode(inode->i_sb)) { + DO_ONCE_LITE_IF(inode->i_sb->s_blocksize != PAGE_SIZE, + erofs_info, inode->i_sb, + "EXPERIMENTAL EROFS subpage compressed block support in use. 
Use at your own risk!"); err = z_erofs_fill_inode(inode); + } else err = -EOPNOTSUPP; goto out_unlock; diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index ffb5ed1b07b0..0b1b6ca804b3 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -575,6 +575,8 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe) __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; unsigned int i; + if (i_blocksize(fe->inode) != PAGE_SIZE) + return; if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED) return; @@ -978,12 +980,12 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, struct inode *const inode = fe->inode; struct erofs_map_blocks *const map = &fe->map; const loff_t offset = page_offset(page); + const unsigned int bs = i_blocksize(inode); bool tight = true, exclusive; unsigned int cur, end, len, split; int err = 0; z_erofs_onlinepage_init(page); - split = 0; end = PAGE_SIZE; repeat: @@ -1033,7 +1035,7 @@ repeat: * for inplace I/O or bvpage (should be processed in a strict order.) */ tight &= (fe->mode > Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE); - exclusive = (!cur && ((split <= 1) || tight)); + exclusive = (!cur && ((split <= 1) || (tight && bs == PAGE_SIZE))); if (cur) tight &= (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED); From 7d50253c2790f5c8a2f43c4c737241ab2458a77f Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Mon, 16 Mar 2020 16:39:31 -0700 Subject: [PATCH 084/139] ANDROID: Export functions to be used with dma_map_ops in modules For modules to reuse default dma_map_ops implementations they need to be exported. Export the following functions: dma_direct_alloc dma_direct_free dma_common_mmap dma_common_get_sgtable dma_direct_get_required_mask Bug: 151050914 Signed-off-by: Suren Baghdasaryan Change-Id: Ia77b797fcd909fce01da7431bfbde282dc70b3b3 (cherry picked from commit fd31496dae939c7bf2ef874e08d4bf8c6ab738b3) Signed-off-by: Qian-Hao Huang (cherry picked from commit cdc9f6ef946f3c17b048b3ee9e36aa0bbc12d712) --- kernel/dma/direct.c | 3 +++ kernel/dma/ops_helpers.c | 2 ++ 2 files changed, 5 insertions(+) diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 71bb2e3440e2..09ca202ac480 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -43,6 +43,7 @@ u64 dma_direct_get_required_mask(struct device *dev) return (1ULL << (fls64(max_dma) - 1)) * 2 - 1; } +EXPORT_SYMBOL_GPL(dma_direct_get_required_mask); static gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, u64 *phys_limit) @@ -320,6 +321,7 @@ out_free_pages: __dma_direct_free_pages(dev, page, size); return NULL; } +EXPORT_SYMBOL_GPL(dma_direct_alloc); void dma_direct_free(struct device *dev, size_t size, void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs) @@ -365,6 +367,7 @@ void dma_direct_free(struct device *dev, size_t size, __dma_direct_free_pages(dev, dma_direct_to_page(dev, dma_addr), size); } +EXPORT_SYMBOL_GPL(dma_direct_free); struct page *dma_direct_alloc_pages(struct device *dev, size_t size, dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp) diff --git a/kernel/dma/ops_helpers.c b/kernel/dma/ops_helpers.c index af4a6ef48ce0..e28e1e17eaf5 100644 --- a/kernel/dma/ops_helpers.c +++ b/kernel/dma/ops_helpers.c @@ -27,6 +27,7 @@ int dma_common_get_sgtable(struct device *dev, struct sg_table *sgt, sg_set_page(sgt->sgl, page, PAGE_ALIGN(size), 0); return ret; } +EXPORT_SYMBOL_GPL(dma_common_get_sgtable); /* * Create userspace mapping for the DMA-coherent memory. 
@@ -57,6 +58,7 @@ int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, return -ENXIO; #endif /* CONFIG_MMU */ } +EXPORT_SYMBOL_GPL(dma_common_mmap); struct page *dma_common_alloc_pages(struct device *dev, size_t size, dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp) From 9c4bc457ab31d2a121da0b5884a6b7e927b1e1f9 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Sat, 12 Aug 2023 16:56:25 +0100 Subject: [PATCH 085/139] UPSTREAM: mm/memory.c: fix mismerge Fix a build issue. Link: https://lkml.kernel.org/r/ZNerqcNS4EBJA/2v@casper.infradead.org Fixes: 4aaa60dad4d1 ("mm: allow per-VMA locks on file-backed VMAs") Signed-off-by: Matthew Wilcox Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202308121909.XNYBtqNI-lkp@intel.com/ Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton (cherry picked from commit 08dff2810e8feb3096bf5c8242ab1649d1e8b1a4) Bug: 293665307 Change-Id: I07ce19f29c44831cdcf709fe1ce122d1963f0be2 Signed-off-by: Suren Baghdasaryan --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memory.c b/mm/memory.c index 56057f97afaf..24463fd7de64 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5514,7 +5514,7 @@ retry: * concurrent mremap() with MREMAP_DONTUNMAP could dissociate the VMA * from its anon_vma. */ - if (unlikely(!vma->anon_vma)) + if (vma_is_anonymous(vma) && !vma->anon_vma) goto inval_end_read; /* Check since vm_start/vm_end might change before we lock the VMA */ From b43b26b4cd1ff080ac1577a558de2962c8f03d07 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 6 Oct 2023 20:53:13 +0100 Subject: [PATCH 086/139] UPSTREAM: mm: make lock_folio_maybe_drop_mmap() VMA lock aware Patch series "Handle more faults under the VMA lock", v2. At this point, we're handling the majority of file-backed page faults under the VMA lock, using the ->map_pages entry point. This patch set attempts to expand that for the following situations: - We have to do a read. This could be because we've hit the point in the readahead window where we need to kick off the next readahead, or because the page is simply not present in cache. - We're handling a write fault. Most applications don't do I/O by writes to shared mmaps for very good reasons, but some do, and it'd be nice to not make that slow unnecessarily. - We're doing a COW of a private mapping (both PTE already present and PTE not-present). These are two different codepaths and I handle both of them in this patch set. There is no support in this patch set for drivers to mark themselves as being VMA lock friendly; they could implement the ->map_pages vm_operation, but if they do, they would be the first. This is probably something we want to change at some point in the future, and I've marked where to make that change in the code. There is very little performance change in the benchmarks we've run; mostly because the vast majority of page faults are handled through the other paths. I still think this patch series is useful for workloads that may take these paths more often, and just for cleaning up the fault path in general (it's now clearer why we have to retry in these cases). This patch (of 6): Drop the VMA lock instead of the mmap_lock if that's the one which is held.
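For reference, the release_fault_lock() helper used in the hunk below simply picks the unlock primitive matching whichever lock the fault path actually holds; a minimal sketch, assuming the FAULT_FLAG_VMA_LOCK flag introduced by the per-VMA lock series:

	static inline void release_fault_lock(struct vm_fault *vmf)
	{
		if (vmf->flags & FAULT_FLAG_VMA_LOCK)
			vma_end_read(vmf->vma);		/* drop the per-VMA read lock */
		else
			mmap_read_unlock(vmf->vma->vm_mm);	/* drop the mmap_lock */
	}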
Link: https://lkml.kernel.org/r/20231006195318.4087158-1-willy@infradead.org Link: https://lkml.kernel.org/r/20231006195318.4087158-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Suren Baghdasaryan Signed-off-by: Andrew Morton (cherry picked from commit 5d74b2ab2c15d596c470bae6626f345d5575a9d0) Bug: 293665307 Change-Id: Ife2d11ab12fb428868cd44751784cf731fbffe62 Signed-off-by: Suren Baghdasaryan --- mm/filemap.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 63a846a1c1a3..c38dc43bfc8c 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2967,7 +2967,7 @@ static int lock_folio_maybe_drop_mmap(struct vm_fault *vmf, struct folio *folio, /* * NOTE! This will make us return with VM_FAULT_RETRY, but with - * the mmap_lock still held. That's how FAULT_FLAG_RETRY_NOWAIT + * the fault lock still held. That's how FAULT_FLAG_RETRY_NOWAIT * is supposed to work. We have way too many special cases.. */ if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT) @@ -2977,13 +2977,14 @@ static int lock_folio_maybe_drop_mmap(struct vm_fault *vmf, struct folio *folio, if (vmf->flags & FAULT_FLAG_KILLABLE) { if (__folio_lock_killable(folio)) { /* - * We didn't have the right flags to drop the mmap_lock, - * but all fault_handlers only check for fatal signals - * if we return VM_FAULT_RETRY, so we need to drop the - * mmap_lock here and return 0 if we don't have a fpin. + * We didn't have the right flags to drop the + * fault lock, but all fault_handlers only check + * for fatal signals if we return VM_FAULT_RETRY, + * so we need to drop the fault lock here and + * return 0 if we don't have a fpin. */ if (*fpin == NULL) - mmap_read_unlock(vmf->vma->vm_mm); + release_fault_lock(vmf); return 0; } } else From 95af8a80bb7ba4ad46de6f743ec91a882f4cd624 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 6 Oct 2023 20:53:14 +0100 Subject: [PATCH 087/139] BACKPORT: mm: call wp_page_copy() under the VMA lock It is usually safe to call wp_page_copy() under the VMA lock. The only unsafe situation is when no anon_vma has been allocated for this VMA, and we have to look at adjacent VMAs to determine if their anon_vma can be shared. Since this happens only for the first COW of a page in this VMA, the majority of calls to wp_page_copy() do not need to fall back to the mmap_sem. Add vmf_anon_prepare() as an alternative to anon_vma_prepare() which will return RETRY if we currently hold the VMA lock and need to allocate an anon_vma. This lets us drop the check in do_wp_page(). 
Link: https://lkml.kernel.org/r/20231006195318.4087158-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Suren Baghdasaryan Signed-off-by: Andrew Morton (cherry picked from commit 164b06f238b986317131e6b61b2f22aabcbc2cc0) [surenb: resolved merge conflicts due to folio/page differences] Bug: 293665307 Change-Id: I39bdc247b375bd3dae8078b52c60fd4ce12e1850 Signed-off-by: Suren Baghdasaryan --- mm/memory.c | 39 ++++++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 13 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 24463fd7de64..3cccb73b0a13 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3099,6 +3099,21 @@ static inline void wp_page_reuse(struct vm_fault *vmf) count_vm_event(PGREUSE); } +static vm_fault_t vmf_anon_prepare(struct vm_fault *vmf) +{ + struct vm_area_struct *vma = vmf->vma; + + if (likely(vma->anon_vma)) + return 0; + if (vmf->flags & FAULT_FLAG_VMA_LOCK) { + vma_end_read(vma); + return VM_FAULT_RETRY; + } + if (__anon_vma_prepare(vma)) + return VM_FAULT_OOM; + return 0; +} + /* * Handle the case of a page which we actually need to copy to a new page, * either due to COW or unsharing. @@ -3126,12 +3141,13 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) pte_t entry; int page_copied = 0; struct mmu_notifier_range range; - int ret; + vm_fault_t ret; delayacct_wpcopy_start(); - if (unlikely(anon_vma_prepare(vma))) - goto oom; + ret = vmf_anon_prepare(vmf); + if (unlikely(ret)) + goto out; if (is_zero_pfn(pte_pfn(vmf->orig_pte))) { new_page = alloc_zeroed_user_highpage_movable(vma, @@ -3139,13 +3155,14 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) if (!new_page) goto oom; } else { + int err; new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address); if (!new_page) goto oom; - ret = __wp_page_copy_user(new_page, old_page, vmf); - if (ret) { + err = __wp_page_copy_user(new_page, old_page, vmf); + if (err) { /* * COW failed, if the fault was solved by other, * it's fine. If not, userspace would re-fault on @@ -3158,7 +3175,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) put_page(old_page); delayacct_wpcopy_end(); - return ret == -EHWPOISON ? VM_FAULT_HWPOISON : 0; + return err == -EHWPOISON ? VM_FAULT_HWPOISON : 0; } kmsan_copy_page_meta(new_page, old_page); } @@ -3271,11 +3288,13 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) oom_free_new: put_page(new_page); oom: + ret = VM_FAULT_OOM; +out: if (old_page) put_page(old_page); delayacct_wpcopy_end(); - return VM_FAULT_OOM; + return ret; } /** @@ -3510,12 +3529,6 @@ reuse: return wp_page_shared(vmf); } copy: - if ((vmf->flags & FAULT_FLAG_VMA_LOCK) && !vma->anon_vma) { - pte_unmap_unlock(vmf->pte, vmf->ptl); - vma_end_read(vmf->vma); - return VM_FAULT_RETRY; - } - /* * Ok, we need to copy. Oh, well.. */ From c7fa581a793af13559143914227a42dcc83fa3e7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 6 Oct 2023 20:53:15 +0100 Subject: [PATCH 088/139] UPSTREAM: mm: handle shared faults under the VMA lock There are many implementations of ->fault and some of them depend on mmap_lock being held. All vm_ops that implement ->map_pages() end up calling filemap_fault(), which I have audited to be sure it does not rely on mmap_lock. So (for now) key off ->map_pages existing as a flag to indicate that it's safe to call ->fault while only holding the vma lock. 
Link: https://lkml.kernel.org/r/20231006195318.4087158-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Suren Baghdasaryan Signed-off-by: Andrew Morton (cherry picked from commit 4ed4379881aa62588aba6442a9f362a8cf7624e6) Bug: 293665307 Change-Id: Ifb5ab3df5d05fb182d0cb52820fa24e28e2d6496 Signed-off-by: Suren Baghdasaryan --- mm/memory.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 3cccb73b0a13..0db26276bed7 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3099,6 +3099,21 @@ static inline void wp_page_reuse(struct vm_fault *vmf) count_vm_event(PGREUSE); } +/* + * We could add a bitflag somewhere, but for now, we know that all + * vm_ops that have a ->map_pages have been audited and don't need + * the mmap_lock to be held. + */ +static inline vm_fault_t vmf_can_call_fault(const struct vm_fault *vmf) +{ + struct vm_area_struct *vma = vmf->vma; + + if (vma->vm_ops->map_pages || !(vmf->flags & FAULT_FLAG_VMA_LOCK)) + return 0; + vma_end_read(vma); + return VM_FAULT_RETRY; +} + static vm_fault_t vmf_anon_prepare(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; @@ -4701,10 +4716,9 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf) struct vm_area_struct *vma = vmf->vma; vm_fault_t ret, tmp; - if (vmf->flags & FAULT_FLAG_VMA_LOCK) { - vma_end_read(vma); - return VM_FAULT_RETRY; - } + ret = vmf_can_call_fault(vmf); + if (ret) + return ret; ret = __do_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) From 6541fffd92e5584e6297b14b0ef9f5e09cf79e5d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 6 Oct 2023 20:53:16 +0100 Subject: [PATCH 089/139] UPSTREAM: mm: handle COW faults under the VMA lock If the page is not currently present in the page tables, we need to call the page fault handler to find out which page we're supposed to COW, so we need to both check that there is already an anon_vma and that the fault handler doesn't need the mmap_lock. Link: https://lkml.kernel.org/r/20231006195318.4087158-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Suren Baghdasaryan Signed-off-by: Andrew Morton (cherry picked from commit 4de8c93a4751e10737b6af65db42c743228c67a6) Bug: 293665307 Change-Id: If749a6f8fcf69d83bbf872c1d45865d1b1b77ea0 Signed-off-by: Suren Baghdasaryan --- mm/memory.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 0db26276bed7..cc0a95fc14b2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4672,13 +4672,11 @@ static vm_fault_t do_cow_fault(struct vm_fault *vmf) struct vm_area_struct *vma = vmf->vma; vm_fault_t ret; - if (vmf->flags & FAULT_FLAG_VMA_LOCK) { - vma_end_read(vma); - return VM_FAULT_RETRY; - } - - if (unlikely(anon_vma_prepare(vma))) - return VM_FAULT_OOM; + ret = vmf_can_call_fault(vmf); + if (!ret) + ret = vmf_anon_prepare(vmf); + if (ret) + return ret; vmf->cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address); if (!vmf->cow_page) From c1da94fa44e60c93e3896b087680e905116d69b3 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 6 Oct 2023 20:53:17 +0100 Subject: [PATCH 090/139] UPSTREAM: mm: handle read faults under the VMA lock Most file-backed faults are already handled through ->map_pages(), but if we need to do I/O we'll come this way. Since filemap_fault() is now safe to be called under the VMA lock, we can handle these faults under the VMA lock now. 
Link: https://lkml.kernel.org/r/20231006195318.4087158-6-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Suren Baghdasaryan Signed-off-by: Andrew Morton (cherry picked from commit 12214eba1992642eee5813a9cc9f626e5b2d1815) Bug: 293665307 Change-Id: Iee48af98b866d88d88ec01143eb26389ab373b6b Signed-off-by: Suren Baghdasaryan --- mm/memory.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index cc0a95fc14b2..ddebfa1457f4 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4651,10 +4651,9 @@ static vm_fault_t do_read_fault(struct vm_fault *vmf) return ret; } - if (vmf->flags & FAULT_FLAG_VMA_LOCK) { - vma_end_read(vmf->vma); - return VM_FAULT_RETRY; - } + ret = vmf_can_call_fault(vmf); + if (ret) + return ret; ret = __do_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) From 4a518d86339bb74f1edb918b0808bc755273258a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 6 Oct 2023 20:53:18 +0100 Subject: [PATCH 091/139] UPSTREAM: mm: handle write faults to RO pages under the VMA lock I think this is a pretty rare occurrence, but for consistency handle faults with the VMA lock held the same way that we handle other faults with the VMA lock held. Link: https://lkml.kernel.org/r/20231006195318.4087158-7-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Suren Baghdasaryan Signed-off-by: Andrew Morton (cherry picked from commit 4a68fef16df9d88d528094116f8bbd2dbfa62089) Bug: 293665307 Change-Id: I69cec218c8a1fe14df3268722e6b1be6dffe7978 Signed-off-by: Suren Baghdasaryan --- mm/memory.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index ddebfa1457f4..7f29d9394bdf 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3358,10 +3358,9 @@ static vm_fault_t wp_pfn_shared(struct vm_fault *vmf) vm_fault_t ret; pte_unmap_unlock(vmf->pte, vmf->ptl); - if (vmf->flags & FAULT_FLAG_VMA_LOCK) { - vma_end_read(vmf->vma); - return VM_FAULT_RETRY; - } + ret = vmf_can_call_fault(vmf); + if (ret) + return ret; vmf->flags |= FAULT_FLAG_MKWRITE; ret = vma->vm_ops->pfn_mkwrite(vmf); @@ -3385,10 +3384,10 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf) vm_fault_t tmp; pte_unmap_unlock(vmf->pte, vmf->ptl); - if (vmf->flags & FAULT_FLAG_VMA_LOCK) { + tmp = vmf_can_call_fault(vmf); + if (tmp) { put_page(vmf->page); - vma_end_read(vmf->vma); - return VM_FAULT_RETRY; + return tmp; } tmp = do_page_mkwrite(vmf); From 6c8f7108578a1f4ed013c3bec63784e7f25cd6bf Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Tue, 26 Dec 2023 13:46:10 -0800 Subject: [PATCH 092/139] FROMGIT: arch/mm/fault: fix major fault accounting when retrying under per-VMA lock A test [1] in Android test suite started failing after [2] was merged. It turns out that after handling a major fault under per-VMA lock, the process major fault counter does not register that fault as major. Before [2] read faults would be done under mmap_lock, in which case FAULT_FLAG_TRIED flag is set before retrying. That in turn causes mm_account_fault() to account the fault as major once retry completes. With per-VMA locks we often retry because a fault can't be handled without locking the whole mm using mmap_lock. Therefore such retries do not set FAULT_FLAG_TRIED flag. This logic does not work after [2] because we can now handle read major faults under per-VMA lock and upon retry the fact there was a major fault gets lost. 
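The accounting side shows why this flag matters: a fault is counted as major either when the handler reports VM_FAULT_MAJOR directly or when a retried fault carries FAULT_FLAG_TRIED. A sketch of that check, paraphrasing the upstream mm_account_fault() logic:

	/* cf. mm_account_fault(): a retried fault that was major stays major */
	bool major = (ret & VM_FAULT_MAJOR) || (flags & FAULT_FLAG_TRIED);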
Fix this by setting FAULT_FLAG_TRIED after retrying under per-VMA lock if VM_FAULT_MAJOR was returned. Ideally we would use an additional VM_FAULT bit to indicate the reason for the retry (could not handle under per-VMA lock vs other reason) but this simpler solution seems to work, so keeping it simple. [1] https://cs.android.com/android/platform/superproject/+/master:test/vts-testcase/kernel/api/drop_caches_prop/drop_caches_test.cpp [2] https://lore.kernel.org/all/20231006195318.4087158-6-willy@infradead.org/ Link: https://lkml.kernel.org/r/20231226214610.109282-1-surenb@google.com Fixes: 12214eba1992 ("mm: handle read faults under the VMA lock") Signed-off-by: Suren Baghdasaryan Cc: Matthew Wilcox Cc: Alexander Gordeev Cc: Andy Lutomirski Cc: Catalin Marinas Cc: Christophe Leroy Cc: Dave Hansen Cc: Gerald Schaefer Cc: Michael Ellerman Cc: Palmer Dabbelt Cc: Peter Zijlstra Cc: Will Deacon Signed-off-by: Andrew Morton (cherry picked from commit 46e714c729c8d1d8110bc0545d7ffe8a759c9dc0 https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm mm-hotfixes-stable) Bug: 317385399 Change-Id: Ic7e97bf610dcabb7d3ac2306b2f1213be0ddd269 Signed-off-by: Suren Baghdasaryan --- arch/arm64/mm/fault.c | 2 ++ arch/powerpc/mm/fault.c | 2 ++ arch/riscv/mm/fault.c | 2 ++ arch/s390/mm/fault.c | 3 +++ arch/x86/mm/fault.c | 2 ++ 5 files changed, 11 insertions(+) diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 9970a4785819..7d522b037d9a 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -619,6 +619,8 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr, goto done; } count_vm_vma_lock_event(VMA_LOCK_RETRY); + if (fault & VM_FAULT_MAJOR) + mm_flags |= FAULT_FLAG_TRIED; /* Quick path to respond to signals */ if (fault_signal_pending(fault, regs)) { diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index b1723094d464..ec23164ad768 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -496,6 +496,8 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address, goto done; } count_vm_vma_lock_event(VMA_LOCK_RETRY); + if (fault & VM_FAULT_MAJOR) + flags |= FAULT_FLAG_TRIED; if (fault_signal_pending(fault, regs)) return user_mode(regs) ? 0 : SIGBUS; diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c index 34a44febae86..d710bb834a2a 100644 --- a/arch/riscv/mm/fault.c +++ b/arch/riscv/mm/fault.c @@ -310,6 +310,8 @@ asmlinkage void do_page_fault(struct pt_regs *regs) goto done; } count_vm_vma_lock_event(VMA_LOCK_RETRY); + if (fault & VM_FAULT_MAJOR) + flags |= FAULT_FLAG_TRIED; if (fault_signal_pending(fault, regs)) { if (!user_mode(regs)) diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 0843adb266d1..60fed3c88332 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -420,6 +420,9 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access) goto out; } count_vm_vma_lock_event(VMA_LOCK_RETRY); + if (fault & VM_FAULT_MAJOR) + flags |= FAULT_FLAG_TRIED; + /* Quick path to respond to signals */ if (fault_signal_pending(fault, regs)) { fault = VM_FAULT_SIGNAL; diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 97599581ec6b..bcb5678b5b91 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1369,6 +1369,8 @@ void do_user_addr_fault(struct pt_regs *regs, goto done; } count_vm_vma_lock_event(VMA_LOCK_RETRY); + if (fault & VM_FAULT_MAJOR) + flags |= FAULT_FLAG_TRIED; /* Quick path to respond to signals */ if (fault_signal_pending(fault, regs)) { From 4d99e41ce174d65ef1d710922acb7a2c67bbaa0b Mon Sep 17 00:00:00 2001 From: Mukesh Ojha Date: Sat, 25 Nov 2023 02:41:58 +0530 Subject: [PATCH 093/139] FROMGIT: PM / devfreq: Synchronize devfreq_monitor_[start/stop] Switching the governor frequently in a loop can result in timer list corruption, where the delayed work's timer is cancelled from two places: once by cancel_delayed_work_sync() and again by expire_timers(), as can be seen from the traces[1]. while true do echo "simple_ondemand" > /sys/class/devfreq/1d84000.ufshc/governor echo "performance" > /sys/class/devfreq/1d84000.ufshc/governor done The issue is in the devfreq driver: devfreq_monitor_[start/stop] need to be synchronized so that the delayed work cannot be corrupted while it is being queued, running, or cancelled. Let's use the stop_polling flag and the devfreq lock to keep the timer instance from being queued twice and the work data from being corrupted. [1] ... ..
-0 [003] 9436.209662: timer_cancel timer=0xffffff80444f0428 -0 [003] 9436.209664: timer_expire_entry timer=0xffffff80444f0428 now=0x10022da1c function=__typeid__ZTSFvP10timer_listE_global_addr baseclk=0x10022da1c -0 [003] 9436.209718: timer_expire_exit timer=0xffffff80444f0428 kworker/u16:6-14217 [003] 9436.209863: timer_start timer=0xffffff80444f0428 function=__typeid__ZTSFvP10timer_listE_global_addr expires=0x10022da2b now=0x10022da1c flags=182452227 vendor.xxxyyy.ha-1593 [004] 9436.209888: timer_cancel timer=0xffffff80444f0428 vendor.xxxyyy.ha-1593 [004] 9436.216390: timer_init timer=0xffffff80444f0428 vendor.xxxyyy.ha-1593 [004] 9436.216392: timer_start timer=0xffffff80444f0428 function=__typeid__ZTSFvP10timer_listE_global_addr expires=0x10022da2c now=0x10022da1d flags=186646532 vendor.xxxyyy.ha-1593 [005] 9436.220992: timer_cancel timer=0xffffff80444f0428 xxxyyyTraceManag-7795 [004] 9436.261641: timer_cancel timer=0xffffff80444f0428 [2] 9436.261653][ C4] Unable to handle kernel paging request at virtual address dead00000000012a [ 9436.261664][ C4] Mem abort info: [ 9436.261666][ C4] ESR = 0x96000044 [ 9436.261669][ C4] EC = 0x25: DABT (current EL), IL = 32 bits [ 9436.261671][ C4] SET = 0, FnV = 0 [ 9436.261673][ C4] EA = 0, S1PTW = 0 [ 9436.261675][ C4] Data abort info: [ 9436.261677][ C4] ISV = 0, ISS = 0x00000044 [ 9436.261680][ C4] CM = 0, WnR = 1 [ 9436.261682][ C4] [dead00000000012a] address between user and kernel address ranges [ 9436.261685][ C4] Internal error: Oops: 96000044 [#1] PREEMPT SMP [ 9436.261701][ C4] Skip md ftrace buffer dump for: 0x3a982d0 ... [ 9436.262138][ C4] CPU: 4 PID: 7795 Comm: TraceManag Tainted: G S W O 5.10.149-android12-9-o-g17f915d29d0c #1 [ 9436.262141][ C4] Hardware name: Qualcomm Technologies, Inc. (DT) [ 9436.262144][ C4] pstate: 22400085 (nzCv daIf +PAN -UAO +TCO BTYPE=--) [ 9436.262161][ C4] pc : expire_timers+0x9c/0x438 [ 9436.262164][ C4] lr : expire_timers+0x2a4/0x438 [ 9436.262168][ C4] sp : ffffffc010023dd0 [ 9436.262171][ C4] x29: ffffffc010023df0 x28: ffffffd0636fdc18 [ 9436.262178][ C4] x27: ffffffd063569dd0 x26: ffffffd063536008 [ 9436.262182][ C4] x25: 0000000000000001 x24: ffffff88f7c69280 [ 9436.262185][ C4] x23: 00000000000000e0 x22: dead000000000122 [ 9436.262188][ C4] x21: 000000010022da29 x20: ffffff8af72b4e80 [ 9436.262191][ C4] x19: ffffffc010023e50 x18: ffffffc010025038 [ 9436.262195][ C4] x17: 0000000000000240 x16: 0000000000000201 [ 9436.262199][ C4] x15: ffffffffffffffff x14: ffffff889f3c3100 [ 9436.262203][ C4] x13: ffffff889f3c3100 x12: 00000000049f56b8 [ 9436.262207][ C4] x11: 00000000049f56b8 x10: 00000000ffffffff [ 9436.262212][ C4] x9 : ffffffc010023e50 x8 : dead000000000122 [ 9436.262216][ C4] x7 : ffffffffffffffff x6 : ffffffc0100239d8 [ 9436.262220][ C4] x5 : 0000000000000000 x4 : 0000000000000101 [ 9436.262223][ C4] x3 : 0000000000000080 x2 : ffffff889edc155c [ 9436.262227][ C4] x1 : ffffff8001005200 x0 : ffffff80444f0428 [ 9436.262232][ C4] Call trace: [ 9436.262236][ C4] expire_timers+0x9c/0x438 [ 9436.262240][ C4] __run_timers+0x1f0/0x330 [ 9436.262245][ C4] run_timer_softirq+0x28/0x58 [ 9436.262255][ C4] efi_header_end+0x168/0x5ec [ 9436.262265][ C4] __irq_exit_rcu+0x108/0x124 [ 9436.262274][ C4] __handle_domain_irq+0x118/0x1e4 [ 9436.262282][ C4] gic_handle_irq.30369+0x6c/0x2bc [ 9436.262286][ C4] el0_irq_naked+0x60/0x6c Bug: 317188938 Change-Id: I9a22325f6abbf28217c8f37b093cf77509b0139a Link: https://lore.kernel.org/all/1700860318-4025-1-git-send-email-quic_mojha@quicinc.com/ Reported-by: Joyyoung Huang 
Acked-by: MyungJoo Ham Signed-off-by: Mukesh Ojha Signed-off-by: Chanwoo Choi (cherry picked from commit aed5ed595960c6d301dcd4ed31aeaa7a8054c0c6 https://git.kernel.org/pub/scm/linux/kernel/git/chanwoo/linux.git devfreq-next) Signed-off-by: Srinivasarao Pathipati --- drivers/devfreq/devfreq.c | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/drivers/devfreq/devfreq.c b/drivers/devfreq/devfreq.c index fe6644f99887..8e9ba701a643 100644 --- a/drivers/devfreq/devfreq.c +++ b/drivers/devfreq/devfreq.c @@ -461,10 +461,14 @@ static void devfreq_monitor(struct work_struct *work) if (err) dev_err(&devfreq->dev, "dvfs failed with (%d) error\n", err); + if (devfreq->stop_polling) + goto out; + queue_delayed_work(devfreq_wq, &devfreq->work, msecs_to_jiffies(devfreq->profile->polling_ms)); - mutex_unlock(&devfreq->lock); +out: + mutex_unlock(&devfreq->lock); trace_devfreq_monitor(devfreq); } @@ -482,6 +486,10 @@ void devfreq_monitor_start(struct devfreq *devfreq) if (IS_SUPPORTED_FLAG(devfreq->governor->flags, IRQ_DRIVEN)) return; + mutex_lock(&devfreq->lock); + if (delayed_work_pending(&devfreq->work)) + goto out; + switch (devfreq->profile->timer) { case DEVFREQ_TIMER_DEFERRABLE: INIT_DEFERRABLE_WORK(&devfreq->work, devfreq_monitor); @@ -490,12 +498,16 @@ void devfreq_monitor_start(struct devfreq *devfreq) INIT_DELAYED_WORK(&devfreq->work, devfreq_monitor); break; default: - return; + goto out; } if (devfreq->profile->polling_ms) queue_delayed_work(devfreq_wq, &devfreq->work, msecs_to_jiffies(devfreq->profile->polling_ms)); + +out: + devfreq->stop_polling = false; + mutex_unlock(&devfreq->lock); } EXPORT_SYMBOL(devfreq_monitor_start); @@ -512,6 +524,14 @@ void devfreq_monitor_stop(struct devfreq *devfreq) if (IS_SUPPORTED_FLAG(devfreq->governor->flags, IRQ_DRIVEN)) return; + mutex_lock(&devfreq->lock); + if (devfreq->stop_polling) { + mutex_unlock(&devfreq->lock); + return; + } + + devfreq->stop_polling = true; + mutex_unlock(&devfreq->lock); cancel_delayed_work_sync(&devfreq->work); } EXPORT_SYMBOL(devfreq_monitor_stop); From 99288e911ad553ffee55b3da942f6e722fc0573b Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Wed, 20 Dec 2023 13:35:00 +0000 Subject: [PATCH 094/139] ANDROID: KVM: arm64: Document module_change_host_prot_range When this pKVM module op was introduced, its documentation was omitted. Bug: 308373293 Change-Id: I9e471414e72a1ee04c132de4ed95d77e815ae8c9 Signed-off-by: Vincent Donnefort --- arch/arm64/include/asm/kvm_pkvm_module.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/arm64/include/asm/kvm_pkvm_module.h b/arch/arm64/include/asm/kvm_pkvm_module.h index bf68d862b7d8..b8e5f8064358 100644 --- a/arch/arm64/include/asm/kvm_pkvm_module.h +++ b/arch/arm64/include/asm/kvm_pkvm_module.h @@ -72,6 +72,11 @@ enum pkvm_psci_notification { * @register_host_perm_fault_handler), otherwise * pKVM will be unable to handle this fault and the * CPU will be stuck in an infinite loop. + * @host_stage2_mod_prot_range: Similar to @host_stage2_mod_prot, but takes a + * range as an argument (@nr_pages). This + * considerably speeds up the process for a + * contiguous memory region, compared to the + * per-page @host_stage2_mod_prot. * @host_stage2_get_leaf: Query the host's stage2 page-table entry for * the page @phys.
* @register_host_smc_handler: @cb is called whenever the host issues an SMC From 8fc25d7862c9c669025dd534d3eefd20d071ea0d Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 19 Oct 2023 15:51:08 -0700 Subject: [PATCH 095/139] FROMGIT: f2fs: do not return EFSCORRUPTED, but try to run online repair If we return the error, there's no way to recover the status as of now, since fsck does not fix the xattr boundary issue. Bug: 305658663 Cc: stable@vger.kernel.org Signed-off-by: Jaegeuk Kim (cherry picked from commit 50a472bbc79ff9d5a88be8019a60e936cadf9f13 https://git.kernel.org/pub/scm/linux/kernel/git/jaegeuk/f2fs.git dev) Change-Id: I55060a4eede3f5f85066aba22a6ab7155517e5c4 (cherry picked from commit 70113b9d489050d3e7a6f28e0cd6e43f104fc132) (cherry picked from commit 2c1f3789d609bd549f14c019b6c7b311bfd2fa64) --- fs/f2fs/node.c | 4 +++- fs/f2fs/xattr.c | 20 +++++++++++++------- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 5010a33acb8a..60708d47aaa8 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2734,7 +2734,9 @@ recover_xnid: f2fs_update_inode_page(inode); /* 3: update and set xattr node page dirty */ - memcpy(F2FS_NODE(xpage), F2FS_NODE(page), VALID_XATTR_BLOCK_SIZE); + if (page) + memcpy(F2FS_NODE(xpage), F2FS_NODE(page), + VALID_XATTR_BLOCK_SIZE); set_page_dirty(xpage); f2fs_put_page(xpage, 1); diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index db3b641f2158..adaad16468d8 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -363,10 +363,10 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage, *xe = __find_xattr(cur_addr, last_txattr_addr, NULL, index, len, name); if (!*xe) { - f2fs_err(F2FS_I_SB(inode), "inode (%lu) has corrupted xattr", + f2fs_err(F2FS_I_SB(inode), "lookup inode (%lu) has corrupted xattr", inode->i_ino); set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); - err = -EFSCORRUPTED; + err = -ENODATA; f2fs_handle_error(F2FS_I_SB(inode), ERROR_CORRUPTED_XATTR); goto out; @@ -583,13 +583,12 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) if ((void *)(entry) + sizeof(__u32) > last_base_addr || (void *)XATTR_NEXT_ENTRY(entry) > last_base_addr) { - f2fs_err(F2FS_I_SB(inode), "inode (%lu) has corrupted xattr", + f2fs_err(F2FS_I_SB(inode), "list inode (%lu) has corrupted xattr", inode->i_ino); set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); - error = -EFSCORRUPTED; f2fs_handle_error(F2FS_I_SB(inode), ERROR_CORRUPTED_XATTR); - goto cleanup; + break; } if (!handler || (handler->list && !handler->list(dentry))) @@ -650,7 +649,7 @@ static int __f2fs_setxattr(struct inode *inode, int index, if (size > MAX_VALUE_LEN(inode)) return -E2BIG; - +retry: error = read_all_xattrs(inode, ipage, &base_addr); if (error) return error; @@ -660,7 +659,14 @@ static int __f2fs_setxattr(struct inode *inode, int index, /* find entry with wanted name. 
*/ here = __find_xattr(base_addr, last_base_addr, NULL, index, len, name); if (!here) { - f2fs_err(F2FS_I_SB(inode), "inode (%lu) has corrupted xattr", + if (!F2FS_I(inode)->i_xattr_nid) { + f2fs_notice(F2FS_I_SB(inode), + "recover xattr in inode (%lu)", inode->i_ino); + f2fs_recover_xattr_data(inode, NULL); + kfree(base_addr); + goto retry; + } + f2fs_err(F2FS_I_SB(inode), "set inode (%lu) has corrupted xattr", inode->i_ino); set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); error = -EFSCORRUPTED; From 717d1f8f91abc780615bd85ac778f702aff6fde4 Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Tue, 19 Dec 2023 16:40:19 +0000 Subject: [PATCH 096/139] ANDROID: KVM: arm64: Fix host_smc print typo From pKVM point of view, unknown SMCs are simply forwarded, we can't consider them invalid or not. This was probably a typo following a copy of the host_hcall event. Bug: 299430621 Change-Id: Ieb53f985a5187a8b5a9feb4a95982b15cdc1b04a Signed-off-by: Vincent Donnefort --- arch/arm64/include/asm/kvm_hypevents.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/kvm_hypevents.h b/arch/arm64/include/asm/kvm_hypevents.h index 8a2dd41b8569..c507d8978444 100644 --- a/arch/arm64/include/asm/kvm_hypevents.h +++ b/arch/arm64/include/asm/kvm_hypevents.h @@ -53,7 +53,7 @@ HYP_EVENT(host_smc, __entry->id = id; __entry->forwarded = forwarded; ), - HE_PRINTK("id=%llu invalid=%u", + HE_PRINTK("id=%llu forwarded=%u", __entry->id, __entry->forwarded) ); From 15a93de4641d882e26cf1de50af69679c1a20aee Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Wed, 20 Dec 2023 16:18:05 +0000 Subject: [PATCH 097/139] ANDROID: KVM: arm64: Fix hyp event alignment The structures that define hyp events must be packed so they match their format definitions in the tracefs file hyp/events/hyp//format. Bug: 299430621 Change-Id: Ia7e1a686744d5c9c3f8a21881f03228c8acecade Signed-off-by: Vincent Donnefort --- arch/arm64/include/asm/kvm_hypevents_defs.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/arm64/include/asm/kvm_hypevents_defs.h b/arch/arm64/include/asm/kvm_hypevents_defs.h index e228d894a898..606f3477ecd3 100644 --- a/arch/arm64/include/asm/kvm_hypevents_defs.h +++ b/arch/arm64/include/asm/kvm_hypevents_defs.h @@ -15,10 +15,10 @@ struct hyp_entry_hdr { /* * Hyp events definitions common to the hyp and the host */ -#define HYP_EVENT_FORMAT(__name, __struct) \ - struct trace_hyp_format_##__name { \ - struct hyp_entry_hdr hdr; \ - __struct \ +#define HYP_EVENT_FORMAT(__name, __struct) \ + struct __packed trace_hyp_format_##__name { \ + struct hyp_entry_hdr hdr; \ + __struct \ } #define HE_PROTO(args...) 
args From e74417834ea0485b4a1e9ba9d8d7f2fed143eddd Mon Sep 17 00:00:00 2001 From: Ken Huang Date: Thu, 4 Jan 2024 23:18:14 +0800 Subject: [PATCH 098/139] ANDROID: Update the ABI symbol list Adding the following symbols: - __drmm_crtc_alloc_with_planes Bug: 275278929 Change-Id: I41b6069612d44214f474ed82ee2a4b07ca739302 Signed-off-by: Ken Huang --- android/abi_gki_aarch64_pixel | 1 + 1 file changed, 1 insertion(+) diff --git a/android/abi_gki_aarch64_pixel b/android/abi_gki_aarch64_pixel index af22721e20b8..a01a49721a1a 100644 --- a/android/abi_gki_aarch64_pixel +++ b/android/abi_gki_aarch64_pixel @@ -739,6 +739,7 @@ drm_kms_helper_poll_fini drm_kms_helper_poll_init drm_match_cea_mode + __drmm_crtc_alloc_with_planes drmm_kmalloc drmm_mode_config_init drm_mode_config_reset From 066d57de875d52d8fd4fbb9248f42e3c8a1066cc Mon Sep 17 00:00:00 2001 From: Kever Yang Date: Wed, 3 Jan 2024 22:02:54 +0800 Subject: [PATCH 099/139] ANDROID: GKI: Enable symbols for v4l2 in async and fwnode INFO: 14 function symbol(s) added 'struct v4l2_async_subdev* __v4l2_async_nf_add_fwnode(struct v4l2_async_notifier*, struct fwnode_handle*, unsigned int)' 'struct v4l2_async_subdev* __v4l2_async_nf_add_fwnode_remote(struct v4l2_async_notifier*, struct fwnode_handle*, unsigned int)' 'void v4l2_async_nf_cleanup(struct v4l2_async_notifier*)' 'void v4l2_async_nf_init(struct v4l2_async_notifier*)' 'int v4l2_async_nf_parse_fwnode_endpoints(struct device*, struct v4l2_async_notifier*, size_t, parse_endpoint_func)' 'int v4l2_async_nf_register(struct v4l2_device*, struct v4l2_async_notifier*)' 'void v4l2_async_nf_unregister(struct v4l2_async_notifier*)' 'int v4l2_async_register_subdev(struct v4l2_subdev*)' 'int v4l2_async_register_subdev_sensor(struct v4l2_subdev*)' 'int v4l2_async_subdev_nf_register(struct v4l2_subdev*, struct v4l2_async_notifier*)' 'void v4l2_async_unregister_subdev(struct v4l2_subdev*)' 'int v4l2_fwnode_endpoint_alloc_parse(struct fwnode_handle*, struct v4l2_fwnode_endpoint*)' 'void v4l2_fwnode_endpoint_free(struct v4l2_fwnode_endpoint*)' 'int v4l2_fwnode_endpoint_parse(struct fwnode_handle*, struct v4l2_fwnode_endpoint*)' Bug: 300024866 Change-Id: I7e4c2faac5c8341a19ea3fed694190d38679dc5b Signed-off-by: Kever Yang --- android/abi_gki_aarch64.stg | 269 +++++++++++++++++++++++++++++++ android/abi_gki_aarch64_rockchip | 14 ++ 2 files changed, 283 insertions(+) diff --git a/android/abi_gki_aarch64.stg b/android/abi_gki_aarch64.stg index f2b7c07a7716..4a190c90a757 100644 --- a/android/abi_gki_aarch64.stg +++ b/android/abi_gki_aarch64.stg @@ -9018,6 +9018,11 @@ pointer_reference { kind: POINTER pointee_type_id: 0x72d62916 } +pointer_reference { + id: 0x1625e208 + kind: POINTER + pointee_type_id: 0x72d76ebd +} pointer_reference { id: 0x162c7a70 kind: POINTER @@ -18183,6 +18188,11 @@ pointer_reference { kind: POINTER pointee_type_id: 0x9d41cc1a } +pointer_reference { + id: 0x2dc069c5 + kind: POINTER + pointee_type_id: 0x9d414188 +} pointer_reference { id: 0x2dc1540f kind: POINTER @@ -30713,6 +30723,11 @@ typedef { name: "p4d_t" referred_type_id: 0x148546d4 } +typedef { + id: 0xbad82a2c + name: "parse_endpoint_func" + referred_type_id: 0x2dc069c5 +} typedef { id: 0x8ef19fe7 name: "pci_bus_flags_t" @@ -52189,6 +52204,11 @@ member { name: "base" type_id: 0x180f82e8 } +member { + id: 0x85d2e2e4 + name: "base" + type_id: 0x080c6fc2 +} member { id: 0x85d6188a name: "base" @@ -56419,6 +56439,12 @@ member { name: "bus" type_id: 0x2309ad3e } +member { + id: 0xdaf846cc + name: "bus" + type_id: 0x286a95aa + offset: 160 +} 
member { id: 0x1639ef00 name: "bus_cleanup" @@ -56648,6 +56674,12 @@ member { type_id: 0x945e7ef6 offset: 448 } +member { + id: 0x2c928e64 + name: "bus_type" + type_id: 0x3c57148f + offset: 128 +} member { id: 0xb43c45b4 name: "bus_width" @@ -116596,6 +116628,12 @@ member { name: "link_fd" type_id: 0xe62ebf07 } +member { + id: 0x6075ccdc + name: "link_frequencies" + type_id: 0x2e18f543 + offset: 512 +} member { id: 0x178cf8a4 name: "link_gen" @@ -126808,6 +126846,18 @@ member { name: "mipi_csi1" type_id: 0xe49bfc8b } +member { + id: 0xa7e5d7c1 + name: "mipi_csi1" + type_id: 0xe49bfc8b + offset: 64 +} +member { + id: 0xeda56411 + name: "mipi_csi2" + type_id: 0xe72f0de6 + offset: 128 +} member { id: 0xeda56dd3 name: "mipi_csi2" @@ -136120,6 +136170,12 @@ member { type_id: 0xe62ebf07 offset: 672 } +member { + id: 0x4519d21b + name: "nr_of_link_frequencies" + type_id: 0x4585663f + offset: 576 +} member { id: 0x9c6b34f7 name: "nr_off" @@ -211032,6 +211088,16 @@ struct_union { member_id: 0x9683f73d } } +struct_union { + id: 0x286a95aa + kind: STRUCT + definition { + bytesize: 40 + member_id: 0xc0bc4db7 + member_id: 0xa7e5d7c1 + member_id: 0xeda56411 + } +} struct_union { id: 0x2880e524 kind: STRUCT @@ -265808,6 +265874,19 @@ struct_union { member_id: 0x465224ed } } +struct_union { + id: 0x72d76ebd + kind: STRUCT + name: "v4l2_fwnode_endpoint" + definition { + bytesize: 80 + member_id: 0x85d2e2e4 + member_id: 0x2c928e64 + member_id: 0xdaf846cc + member_id: 0x6075ccdc + member_id: 0x4519d21b + } +} struct_union { id: 0xccd4dc1a kind: STRUCT @@ -287051,6 +287130,13 @@ enumeration { } } } +function { + id: 0x003279c7 + return_type_id: 0x3c2dd1ca + parameter_id: 0x3cfe7778 + parameter_id: 0x0490bb4a + parameter_id: 0x4585663f +} function { id: 0x004cf563 return_type_id: 0x48b5725f @@ -291664,6 +291750,11 @@ function { parameter_id: 0x14528516 parameter_id: 0x2712b6f9 } +function { + id: 0x15112911 + return_type_id: 0x48b5725f + parameter_id: 0x1625e208 +} function { id: 0x151457b1 return_type_id: 0xd5cc9c9a @@ -298655,6 +298746,11 @@ function { parameter_id: 0x3c2755a3 parameter_id: 0x0cbf60eb } +function { + id: 0x1fa7cc4d + return_type_id: 0x48b5725f + parameter_id: 0x3cfe7778 +} function { id: 0x1fa8b2bc return_type_id: 0x48b5725f @@ -321574,6 +321670,12 @@ function { parameter_id: 0x04b193cc parameter_id: 0x0335a07f } +function { + id: 0x9ca0dc77 + return_type_id: 0x6720d32f + parameter_id: 0x074f1a14 + parameter_id: 0x3cfe7778 +} function { id: 0x9ca1921c return_type_id: 0x6720d32f @@ -322038,6 +322140,12 @@ function { parameter_id: 0x054f691a parameter_id: 0x0aa1f0ee } +function { + id: 0x9cfc5a75 + return_type_id: 0x6720d32f + parameter_id: 0x0490bb4a + parameter_id: 0x1625e208 +} function { id: 0x9cfd713b return_type_id: 0x6720d32f @@ -322060,6 +322168,12 @@ function { parameter_id: 0x02ed0755 parameter_id: 0x0e68dab6 } +function { + id: 0x9d027320 + return_type_id: 0x6720d32f + parameter_id: 0x01c5a749 + parameter_id: 0x3cfe7778 +} function { id: 0x9d038726 return_type_id: 0x6720d32f @@ -322608,6 +322722,13 @@ function { parameter_id: 0x0258f96e parameter_id: 0x15f20052 } +function { + id: 0x9d414188 + return_type_id: 0x6720d32f + parameter_id: 0x0258f96e + parameter_id: 0x1625e208 + parameter_id: 0x3c2dd1ca +} function { id: 0x9d419277 return_type_id: 0x6720d32f @@ -323728,6 +323849,14 @@ function { parameter_id: 0x33756485 parameter_id: 0x064d6086 } +function { + id: 0x9ddac293 + return_type_id: 0x6720d32f + parameter_id: 0x0258f96e + parameter_id: 0x3cfe7778 + parameter_id: 0xf435685e + 
parameter_id: 0xbad82a2c +} function { id: 0x9ddaf106 return_type_id: 0x6720d32f @@ -343026,6 +343155,24 @@ elf_symbol { type_id: 0x20cd94dc full_name: "__usecs_to_jiffies" } +elf_symbol { + id: 0xf51d746f + name: "__v4l2_async_nf_add_fwnode" + is_defined: true + symbol_type: FUNCTION + crc: 0x03599cac + type_id: 0x003279c7 + full_name: "__v4l2_async_nf_add_fwnode" +} +elf_symbol { + id: 0xe13e16ca + name: "__v4l2_async_nf_add_fwnode_remote" + is_defined: true + symbol_type: FUNCTION + crc: 0x82966749 + type_id: 0x003279c7 + full_name: "__v4l2_async_nf_add_fwnode_remote" +} elf_symbol { id: 0x4c0a941a name: "__v4l2_ctrl_handler_setup" @@ -394585,6 +394732,87 @@ elf_symbol { type_id: 0x927d452a full_name: "uuid_parse" } +elf_symbol { + id: 0x4e2f55da + name: "v4l2_async_nf_cleanup" + is_defined: true + symbol_type: FUNCTION + crc: 0xdad12cba + type_id: 0x1fa7cc4d + full_name: "v4l2_async_nf_cleanup" +} +elf_symbol { + id: 0x04aadf7f + name: "v4l2_async_nf_init" + is_defined: true + symbol_type: FUNCTION + crc: 0xc88abf32 + type_id: 0x1fa7cc4d + full_name: "v4l2_async_nf_init" +} +elf_symbol { + id: 0x7920fabe + name: "v4l2_async_nf_parse_fwnode_endpoints" + is_defined: true + symbol_type: FUNCTION + crc: 0xde590e4b + type_id: 0x9ddac293 + full_name: "v4l2_async_nf_parse_fwnode_endpoints" +} +elf_symbol { + id: 0x48e55006 + name: "v4l2_async_nf_register" + is_defined: true + symbol_type: FUNCTION + crc: 0x8be566ca + type_id: 0x9ca0dc77 + full_name: "v4l2_async_nf_register" +} +elf_symbol { + id: 0x65ffd1d0 + name: "v4l2_async_nf_unregister" + is_defined: true + symbol_type: FUNCTION + crc: 0xc74894f9 + type_id: 0x1fa7cc4d + full_name: "v4l2_async_nf_unregister" +} +elf_symbol { + id: 0x507a9ef5 + name: "v4l2_async_register_subdev" + is_defined: true + symbol_type: FUNCTION + crc: 0x64ab86bc + type_id: 0x9df18afd + full_name: "v4l2_async_register_subdev" +} +elf_symbol { + id: 0x050dd932 + name: "v4l2_async_register_subdev_sensor" + is_defined: true + symbol_type: FUNCTION + crc: 0x61c8f608 + type_id: 0x9df18afd + full_name: "v4l2_async_register_subdev_sensor" +} +elf_symbol { + id: 0x0664687c + name: "v4l2_async_subdev_nf_register" + is_defined: true + symbol_type: FUNCTION + crc: 0x4d890f4b + type_id: 0x9d027320 + full_name: "v4l2_async_subdev_nf_register" +} +elf_symbol { + id: 0xf440f7f1 + name: "v4l2_async_unregister_subdev" + is_defined: true + symbol_type: FUNCTION + crc: 0x2592ea78 + type_id: 0x10e93841 + full_name: "v4l2_async_unregister_subdev" +} elf_symbol { id: 0xf39bae65 name: "v4l2_compat_ioctl32" @@ -394990,6 +395218,33 @@ elf_symbol { type_id: 0x209ae488 full_name: "v4l2_format_info" } +elf_symbol { + id: 0x7ba36329 + name: "v4l2_fwnode_endpoint_alloc_parse" + is_defined: true + symbol_type: FUNCTION + crc: 0x05930b06 + type_id: 0x9cfc5a75 + full_name: "v4l2_fwnode_endpoint_alloc_parse" +} +elf_symbol { + id: 0x2643c2c9 + name: "v4l2_fwnode_endpoint_free" + is_defined: true + symbol_type: FUNCTION + crc: 0xf01d6f06 + type_id: 0x15112911 + full_name: "v4l2_fwnode_endpoint_free" +} +elf_symbol { + id: 0xcb8b4f14 + name: "v4l2_fwnode_endpoint_parse" + is_defined: true + symbol_type: FUNCTION + crc: 0x9dcd6cfe + type_id: 0x9cfc5a75 + full_name: "v4l2_fwnode_endpoint_parse" +} elf_symbol { id: 0x58330374 name: "v4l2_g_parm_cap" @@ -399757,6 +400012,8 @@ interface { symbol_id: 0x7c261545 symbol_id: 0xf497de36 symbol_id: 0xf44f6a18 + symbol_id: 0xf51d746f + symbol_id: 0xe13e16ca symbol_id: 0x4c0a941a symbol_id: 0xfc85c168 symbol_id: 0xb6af2644 @@ -405485,6 +405742,15 @@ interface { 
symbol_id: 0xb0c1eaf9 symbol_id: 0xe7b3f166 symbol_id: 0xb21b47da + symbol_id: 0x4e2f55da + symbol_id: 0x04aadf7f + symbol_id: 0x7920fabe + symbol_id: 0x48e55006 + symbol_id: 0x65ffd1d0 + symbol_id: 0x507a9ef5 + symbol_id: 0x050dd932 + symbol_id: 0x0664687c + symbol_id: 0xf440f7f1 symbol_id: 0xf39bae65 symbol_id: 0xfd78bf45 symbol_id: 0x218d39b6 @@ -405530,6 +405796,9 @@ interface { symbol_id: 0xe66642fe symbol_id: 0x538ad5cc symbol_id: 0x2244c8f0 + symbol_id: 0x7ba36329 + symbol_id: 0x2643c2c9 + symbol_id: 0xcb8b4f14 symbol_id: 0x58330374 symbol_id: 0xdb18c924 symbol_id: 0x5e36dba6 diff --git a/android/abi_gki_aarch64_rockchip b/android/abi_gki_aarch64_rockchip index 0010cf2300b6..a051b3843047 100644 --- a/android/abi_gki_aarch64_rockchip +++ b/android/abi_gki_aarch64_rockchip @@ -1268,6 +1268,15 @@ usb_submit_urb __usecs_to_jiffies usleep_range_state + __v4l2_async_nf_add_fwnode_remote + v4l2_async_nf_cleanup + v4l2_async_nf_init + v4l2_async_nf_parse_fwnode_endpoints + v4l2_async_nf_register + v4l2_async_register_subdev + v4l2_async_register_subdev_sensor + v4l2_async_subdev_nf_register + v4l2_async_unregister_subdev v4l2_ctrl_find v4l2_ctrl_g_ctrl v4l2_ctrl_g_ctrl_int64 @@ -1295,6 +1304,9 @@ v4l2_event_subscribe v4l2_event_unsubscribe v4l2_fh_open + v4l2_fwnode_endpoint_alloc_parse + v4l2_fwnode_endpoint_free + v4l2_fwnode_endpoint_parse v4l2_i2c_subdev_init v4l2_match_dv_timings v4l2_pipeline_link_notify @@ -2871,9 +2883,11 @@ # required by video_rkcif.ko media_entity_setup_link + __v4l2_async_nf_add_fwnode # required by video_rkisp.ko param_ops_ullong + v4l2_async_nf_unregister v4l2_ctrl_poll # required by videobuf2-cma-sg.ko From c52d48818b621678f7ffbeddf29ee1a3437a1363 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Mon, 24 Jul 2023 14:31:47 -0400 Subject: [PATCH 100/139] UPSTREAM: maple_tree: introduce __mas_set_range() mas_set_range() resets the node to MAS_START, which will cause a re-walk of the tree to the range. This is unnecessary when the maple state is already at the correct location of the write. Add a function that only sets the range to avoid unnecessary re-walking of the tree. Link: https://lkml.kernel.org/r/20230724183157.3939892-6-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Peng Zhang Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton (cherry picked from commit c1297987cc2ada57a7faea7985c2334548d110f9) Bug: 308042511 Change-Id: I9e026d0f103e3aa24b47998be6b83e28e7928540 Signed-off-by: Suren Baghdasaryan --- include/linux/maple_tree.h | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index 0ff8ce8cd06a..6053986cbdf6 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -525,6 +525,22 @@ static inline void mas_reset(struct ma_state *mas) */ #define mas_for_each(__mas, __entry, __max) \ while (((__entry) = mas_find((__mas), (__max))) != NULL) +/** + * __mas_set_range() - Set up Maple Tree operation state to a sub-range of the + * current location. + * @mas: Maple Tree operation state. + * @start: New start of range in the Maple Tree. + * @last: New end of range in the Maple Tree. + * + * set the internal maple state values to a sub-range. + * Please use mas_set_range() if you do not know where you are in the tree. 
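+ *
+ * (Editor's illustration, not part of the original patch: a caller that
+ * has already walked to the target location, e.g. via mas_walk(&mas),
+ * can follow up with
+ *
+ *	__mas_set_range(&mas, start, last);
+ *	mas_store(&mas, entry);
+ *
+ * and the store reuses the existing walk instead of re-walking from
+ * MAS_START.)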
+ */
+static inline void __mas_set_range(struct ma_state *mas, unsigned long start,
+		unsigned long last)
+{
+	mas->index = start;
+	mas->last = last;
+}
 
 /**
  * mas_set_range() - Set up Maple Tree operation state for a different index.
@@ -539,9 +555,8 @@ static inline void mas_reset(struct ma_state *mas)
 static inline void mas_set_range(struct ma_state *mas, unsigned long start,
 		unsigned long last)
 {
-	mas->index = start;
-	mas->last = last;
-	mas->node = MAS_START;
+	__mas_set_range(mas, start, last);
+	mas->node = MAS_START;
 }
 
 /**

From 4ddcdc519b4ae94f56dfa81769d5ce59f1cc41a8 Mon Sep 17 00:00:00 2001
From: Peng Zhang
Date: Fri, 27 Oct 2023 11:38:36 +0800
Subject: [PATCH 101/139] FROMGIT: maple_tree: add mt_free_one() and mt_attr()
 helpers

Patch series "Introduce __mt_dup() to improve the performance of fork()", v7.

This series introduces __mt_dup() to improve the performance of fork(). During the duplication process of mmap, all VMAs are traversed and inserted one by one into the new maple tree, causing the maple tree to be rebalanced multiple times. Balancing the maple tree is a costly operation. To duplicate VMAs more efficiently, mtree_dup() and __mt_dup() are introduced for the maple tree. They can efficiently duplicate a maple tree.

Here are some algorithmic details about {mtree,__mt}_dup(). We perform a DFS pre-order traversal of all nodes in the source maple tree. During this process, we fully copy the nodes from the source tree to the new tree. This involves memory allocation, and when encountering a new node, if it is a non-leaf node, all its child nodes are allocated at once. This idea was originally from Liam R. Howlett's Maple Tree Work email, and I added some of my own ideas to implement it. Some previous discussions can be found in [1]. For a more detailed analysis of the algorithm, please refer to the logs for patch [3/10] and patch [10/10].

There is a "spawn" in byte-unixbench[2], which can be used to test the performance of fork(). I modified it slightly to make it work with different numbers of VMAs. Below are the test results. The first row shows the number of VMAs. The second and third rows show the number of fork() calls per ten seconds, corresponding to next-20231006 and this patchset, respectively; the last row shows the relative improvement. The test results were obtained with CPU binding to avoid scheduler load balancing that could cause unstable results. There are still some fluctuations in the test results, but at least they are better than the original performance.

VMAs           21     121    221    421    821    1621   3221   6421   12821  25621  51221
next-20231006  112100 76261  54227  34035  20195  11112  6017   3161   1606   802    393
this patchset  114558 83067  65008  45824  28751  16072  8922   4747   2436   1233   599
improvement    2.19%  8.92%  19.88% 34.64% 42.37% 44.64% 48.28% 50.17% 51.68% 53.74% 52.42%

Thanks to Liam and Matthew for the review.

This patch (of 10):

Add two helpers:
1. mt_free_one(), used to free a maple node.
2. mt_attr(), used to obtain the attributes of a maple tree.

Link: https://lkml.kernel.org/r/20231027033845.90608-1-zhangpeng.00@bytedance.com
Link: https://lkml.kernel.org/r/20231027033845.90608-2-zhangpeng.00@bytedance.com
Signed-off-by: Peng Zhang
Reviewed-by: Liam R. Howlett
Cc: Christian Brauner
Cc: Jonathan Corbet
Cc: Mateusz Guzik
Cc: Mathieu Desnoyers
Cc: Matthew Wilcox
Cc: Michael S.
Tsirkin Cc: Mike Christie Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton (cherry picked from commit 4f2267b58a22d972be98edef8e6b3c7a67c9fb91 https://git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm mm-unstable) Bug: 308042511 Change-Id: Ib9b13dee357ac4c85668901c20a3c370fbdd08da Signed-off-by: Suren Baghdasaryan --- lib/maple_tree.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 826f7b8d5e05..d932bf27fa0b 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -158,6 +158,11 @@ static inline int mt_alloc_bulk(gfp_t gfp, size_t size, void **nodes) return kmem_cache_alloc_bulk(maple_node_cache, gfp, size, nodes); } +static inline void mt_free_one(struct maple_node *node) +{ + kmem_cache_free(maple_node_cache, node); +} + static inline void mt_free_bulk(size_t size, void __rcu **nodes) { kmem_cache_free_bulk(maple_node_cache, size, (void **)nodes); @@ -199,6 +204,11 @@ static unsigned int mas_mt_height(struct ma_state *mas) return mt_height(mas->tree); } +static inline unsigned int mt_attr(struct maple_tree *mt) +{ + return mt->ma_flags & ~MT_FLAGS_HEIGHT_MASK; +} + static inline enum maple_type mte_node_type(const struct maple_enode *entry) { return ((unsigned long)entry >> MAPLE_NODE_TYPE_SHIFT) & @@ -5702,7 +5712,7 @@ void mas_destroy(struct ma_state *mas) mt_free_bulk(count, (void __rcu **)&node->slot[1]); total -= count; } - kmem_cache_free(maple_node_cache, node); + mt_free_one(ma_mnode_ptr(node)); total--; } From dc9323545b03f3d1ebeeac46c5ceb366eced9314 Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Fri, 27 Oct 2023 11:38:37 +0800 Subject: [PATCH 102/139] FROMGIT: maple_tree: introduce {mtree,mas}_lock_nested() In some cases, nested locks may be needed, so {mtree,mas}_lock_nested is introduced. For example, when duplicating maple tree, we need to hold the locks of two trees, in which case nested locks are needed. At the same time, add the definition of spin_lock_nested() in tools for testing. Link: https://lkml.kernel.org/r/20231027033845.90608-3-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Reviewed-by: Liam R. Howlett Cc: Christian Brauner Cc: Jonathan Corbet Cc: Mateusz Guzik Cc: Mathieu Desnoyers Cc: Matthew Wilcox Cc: Michael S. 
Tsirkin Cc: Mike Christie Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Suren Baghdasaryan
Signed-off-by: Andrew Morton
(cherry picked from commit b2472efe4316b2687c153919c1513a098bd82c17
 https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm mm-unstable)
Bug: 308042511
Change-Id: I06f0eb0a32a2f39b7842de08a0e5ce59895345c5
Signed-off-by: Suren Baghdasaryan
---
 include/linux/maple_tree.h     | 4 ++++
 tools/include/linux/spinlock.h | 1 +
 2 files changed, 5 insertions(+)

diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h
index 6053986cbdf6..73d9f342eac4 100644
--- a/include/linux/maple_tree.h
+++ b/include/linux/maple_tree.h
@@ -249,6 +249,8 @@ struct maple_tree {
 	struct maple_tree name = MTREE_INIT(name, 0)
 
 #define mtree_lock(mt)		spin_lock((&(mt)->ma_lock))
+#define mtree_lock_nested(mt, subclass) \
+		spin_lock_nested((&(mt)->ma_lock), subclass)
 #define mtree_unlock(mt)	spin_unlock((&(mt)->ma_lock))
 
 /*
@@ -399,6 +401,8 @@ struct ma_wr_state {
 };
 
 #define mas_lock(mas)		spin_lock(&((mas)->tree->ma_lock))
+#define mas_lock_nested(mas, subclass) \
+		spin_lock_nested(&((mas)->tree->ma_lock), subclass)
 #define mas_unlock(mas)		spin_unlock(&((mas)->tree->ma_lock))
 
diff --git a/tools/include/linux/spinlock.h b/tools/include/linux/spinlock.h
index 622266b197d0..a6cdf25b6b9d 100644
--- a/tools/include/linux/spinlock.h
+++ b/tools/include/linux/spinlock.h
@@ -11,6 +11,7 @@
 #define spin_lock_init(x)	pthread_mutex_init(x, NULL)
 #define spin_lock(x)		pthread_mutex_lock(x)
+#define spin_lock_nested(x, subclass)	pthread_mutex_lock(x)
 #define spin_unlock(x)		pthread_mutex_unlock(x)
 #define spin_lock_bh(x)		pthread_mutex_lock(x)
 #define spin_unlock_bh(x)	pthread_mutex_unlock(x)

From eb5048ea90b2f2935edfce9e3182a8aee3f2c8fd Mon Sep 17 00:00:00 2001
From: Peng Zhang
Date: Fri, 27 Oct 2023 11:38:38 +0800
Subject: [PATCH 103/139] FROMGIT: maple_tree: introduce interfaces __mt_dup()
 and mtree_dup()

Introduce interfaces __mt_dup() and mtree_dup(), which are used to duplicate a maple tree. They duplicate a maple tree in Depth-First Search (DFS) pre-order traversal, using memcpy() to copy nodes in the source tree and allocating new child nodes in non-leaf nodes. The new node is exactly the same as the source node except for all the addresses stored in it. This is faster than traversing all elements in the source tree and inserting them one by one into the new tree. The time complexity of these two functions is O(n).

The difference between __mt_dup() and mtree_dup() is that mtree_dup() handles locks internally.

Analysis of the average time complexity of this algorithm:

For simplicity, let's assume that the maximum branching factor of all non-leaf nodes is 16 (in allocation mode, it is 10), and the tree is a full tree.

Under the given conditions, if there is a maple tree with n elements, the number of its leaves is n/16. From bottom to top, the number of nodes in each level is 1/16 of the number of nodes in the level below. So the total number of nodes in the entire tree is given by the sum of n/16 + n/16^2 + n/16^3 + ... + 1. This is a geometric series, and it has log(n) terms with base 16. According to the formula for the sum of a geometric series, the sum of this series can be calculated as (n-1)/15. Each node has only one parent node pointer, which can be considered as an edge. In total, there are (n-1)/15-1 edges.

This algorithm consists of two operations:

1. Traversing all nodes in DFS order.
2. For each node, making a copy and performing necessary modifications to create a new node.
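(Editor's aside, not part of the original commit message: as a quick sanity check of the node-count formula above, take a full tree of height 3, so n = 16^3 = 4096. The levels then hold 256 + 16 + 1 = 273 nodes, and (n-1)/15 = 4095/15 = 273, matching the closed form.)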
For the first part, DFS traversal will visit each edge twice. Let T(ascend) represent the cost of taking one step upwards, and T(descend) represent the cost of taking one step downwards. Both of them are constants (although mas_ascend() may not be, as it contains a loop, but here we ignore it and treat it as a constant). So the time spent on the first part can be represented as ((n-1)/15-1) * (T(ascend) + T(descend)).

For the second part, each node will be copied, and the cost of copying a node is denoted as T(copy_node). For each non-leaf node, it is necessary to reallocate all child nodes, and the cost of this operation is denoted as T(dup_alloc). The behavior behind memory allocation is complex and not specific to the maple tree operation. Here, we assume that the time required for a single allocation is constant. Since the size of a node is fixed, both of these symbols are also constants. We can calculate that the time spent on the second part is ((n-1)/15) * T(copy_node) + ((n-1)/15 - n/16) * T(dup_alloc).

Adding both parts together, the total time spent by the algorithm can be represented as:

((n-1)/15) * (T(ascend) + T(descend) + T(copy_node) + T(dup_alloc)) - n/16 * T(dup_alloc) - (T(ascend) + T(descend))

Let C1 = T(ascend) + T(descend) + T(copy_node) + T(dup_alloc)
Let C2 = T(dup_alloc)
Let C3 = T(ascend) + T(descend)

Finally, the expression can be simplified as:

((16 * C1 - 15 * C2) / (15 * 16)) * n - (C1 / 15 + C3)

This is a linear function, so the average time complexity is O(n).

Link: https://lkml.kernel.org/r/20231027033845.90608-4-zhangpeng.00@bytedance.com
Signed-off-by: Peng Zhang
Suggested-by: Liam R. Howlett
Cc: Christian Brauner
Cc: Jonathan Corbet
Cc: Mateusz Guzik
Cc: Mathieu Desnoyers
Cc: Matthew Wilcox
Cc: Michael S. Tsirkin
Cc: Mike Christie
Cc: Nicholas Piggin
Cc: Peter Zijlstra
Cc: Suren Baghdasaryan
Signed-off-by: Andrew Morton
(cherry picked from commit fd32e4e9b7646510ee9010e0d5f8b8857d48a6f7
 https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm mm-unstable)
Bug: 308042511
Change-Id: I385759a1184a202498e086458b572c203616b9b4
Signed-off-by: Suren Baghdasaryan
---
 include/linux/maple_tree.h |   3 +
 lib/maple_tree.c           | 274 +++++++++++++++++++++++++++++++
 2 files changed, 277 insertions(+)

diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h
index 73d9f342eac4..7ccb05ba08ce 100644
--- a/include/linux/maple_tree.h
+++ b/include/linux/maple_tree.h
@@ -322,6 +322,9 @@ int mtree_store(struct maple_tree *mt, unsigned long index,
 		void *entry, gfp_t gfp);
 void *mtree_erase(struct maple_tree *mt, unsigned long index);
 
+int mtree_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp);
+int __mt_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp);
+
 void mtree_destroy(struct maple_tree *mt);
 void __mt_destroy(struct maple_tree *mt);
 
diff --git a/lib/maple_tree.c b/lib/maple_tree.c
index d932bf27fa0b..3bc5414f78ab 100644
--- a/lib/maple_tree.c
+++ b/lib/maple_tree.c
@@ -4,6 +4,8 @@
  * Copyright (c) 2018-2022 Oracle Corporation
  * Authors: Liam R. Howlett
  *          Matthew Wilcox
+ * Copyright (c) 2023 ByteDance
+ * Author: Peng Zhang
  */
 
 /*
@@ -6537,6 +6539,278 @@ void *mtree_erase(struct maple_tree *mt, unsigned long index)
 }
 EXPORT_SYMBOL(mtree_erase);
 
+/*
+ * mas_dup_free() - Free an incomplete duplication of a tree.
+ * @mas: The maple state of an incomplete tree.
+ *
+ * The parameter @mas->node passed in indicates that the allocation failed on
+ * this node.
This function frees all nodes starting from @mas->node in the + * reverse order of mas_dup_build(). There is no need to hold the source tree + * lock at this time. + */ +static void mas_dup_free(struct ma_state *mas) +{ + struct maple_node *node; + enum maple_type type; + void __rcu **slots; + unsigned char count, i; + + /* Maybe the first node allocation failed. */ + if (mas_is_none(mas)) + return; + + while (!mte_is_root(mas->node)) { + mas_ascend(mas); + if (mas->offset) { + mas->offset--; + do { + mas_descend(mas); + mas->offset = mas_data_end(mas); + } while (!mte_is_leaf(mas->node)); + + mas_ascend(mas); + } + + node = mte_to_node(mas->node); + type = mte_node_type(mas->node); + slots = ma_slots(node, type); + count = mas_data_end(mas) + 1; + for (i = 0; i < count; i++) + ((unsigned long *)slots)[i] &= ~MAPLE_NODE_MASK; + mt_free_bulk(count, slots); + } + + node = mte_to_node(mas->node); + mt_free_one(node); +} + +/* + * mas_copy_node() - Copy a maple node and replace the parent. + * @mas: The maple state of source tree. + * @new_mas: The maple state of new tree. + * @parent: The parent of the new node. + * + * Copy @mas->node to @new_mas->node, set @parent to be the parent of + * @new_mas->node. If memory allocation fails, @mas is set to -ENOMEM. + */ +static inline void mas_copy_node(struct ma_state *mas, struct ma_state *new_mas, + struct maple_pnode *parent) +{ + struct maple_node *node = mte_to_node(mas->node); + struct maple_node *new_node = mte_to_node(new_mas->node); + unsigned long val; + + /* Copy the node completely. */ + memcpy(new_node, node, sizeof(struct maple_node)); + /* Update the parent node pointer. */ + val = (unsigned long)node->parent & MAPLE_NODE_MASK; + new_node->parent = ma_parent_ptr(val | (unsigned long)parent); +} + +/* + * mas_dup_alloc() - Allocate child nodes for a maple node. + * @mas: The maple state of source tree. + * @new_mas: The maple state of new tree. + * @gfp: The GFP_FLAGS to use for allocations. + * + * This function allocates child nodes for @new_mas->node during the duplication + * process. If memory allocation fails, @mas is set to -ENOMEM. + */ +static inline void mas_dup_alloc(struct ma_state *mas, struct ma_state *new_mas, + gfp_t gfp) +{ + struct maple_node *node = mte_to_node(mas->node); + struct maple_node *new_node = mte_to_node(new_mas->node); + enum maple_type type; + unsigned char request, count, i; + void __rcu **slots; + void __rcu **new_slots; + unsigned long val; + + /* Allocate memory for child nodes. */ + type = mte_node_type(mas->node); + new_slots = ma_slots(new_node, type); + request = mas_data_end(mas) + 1; + count = mt_alloc_bulk(gfp, request, (void **)new_slots); + if (unlikely(count < request)) { + memset(new_slots, 0, request * sizeof(void *)); + mas_set_err(mas, -ENOMEM); + return; + } + + /* Restore node type information in slots. */ + slots = ma_slots(node, type); + for (i = 0; i < count; i++) { + val = (unsigned long)mt_slot_locked(mas->tree, slots, i); + val &= MAPLE_NODE_MASK; + ((unsigned long *)new_slots)[i] |= val; + } +} + +/* + * mas_dup_build() - Build a new maple tree from a source tree + * @mas: The maple state of source tree, need to be in MAS_START state. + * @new_mas: The maple state of new tree, need to be in MAS_START state. + * @gfp: The GFP_FLAGS to use for allocations. + * + * This function builds a new tree in DFS preorder. If the memory allocation + * fails, the error code -ENOMEM will be set in @mas, and @new_mas points to the + * last node. 
mas_dup_free() will free the incomplete duplication of a tree. + * + * Note that the attributes of the two trees need to be exactly the same, and the + * new tree needs to be empty, otherwise -EINVAL will be set in @mas. + */ +static inline void mas_dup_build(struct ma_state *mas, struct ma_state *new_mas, + gfp_t gfp) +{ + struct maple_node *node; + struct maple_pnode *parent = NULL; + struct maple_enode *root; + enum maple_type type; + + if (unlikely(mt_attr(mas->tree) != mt_attr(new_mas->tree)) || + unlikely(!mtree_empty(new_mas->tree))) { + mas_set_err(mas, -EINVAL); + return; + } + + root = mas_start(mas); + if (mas_is_ptr(mas) || mas_is_none(mas)) + goto set_new_tree; + + node = mt_alloc_one(gfp); + if (!node) { + new_mas->node = MAS_NONE; + mas_set_err(mas, -ENOMEM); + return; + } + + type = mte_node_type(mas->node); + root = mt_mk_node(node, type); + new_mas->node = root; + new_mas->min = 0; + new_mas->max = ULONG_MAX; + root = mte_mk_root(root); + while (1) { + mas_copy_node(mas, new_mas, parent); + if (!mte_is_leaf(mas->node)) { + /* Only allocate child nodes for non-leaf nodes. */ + mas_dup_alloc(mas, new_mas, gfp); + if (unlikely(mas_is_err(mas))) + return; + } else { + /* + * This is the last leaf node and duplication is + * completed. + */ + if (mas->max == ULONG_MAX) + goto done; + + /* This is not the last leaf node and needs to go up. */ + do { + mas_ascend(mas); + mas_ascend(new_mas); + } while (mas->offset == mas_data_end(mas)); + + /* Move to the next subtree. */ + mas->offset++; + new_mas->offset++; + } + + mas_descend(mas); + parent = ma_parent_ptr(mte_to_node(new_mas->node)); + mas_descend(new_mas); + mas->offset = 0; + new_mas->offset = 0; + } +done: + /* Specially handle the parent of the root node. */ + mte_to_node(root)->parent = ma_parent_ptr(mas_tree_parent(new_mas)); +set_new_tree: + /* Make them the same height */ + new_mas->tree->ma_flags = mas->tree->ma_flags; + rcu_assign_pointer(new_mas->tree->ma_root, root); +} + +/** + * __mt_dup(): Duplicate an entire maple tree + * @mt: The source maple tree + * @new: The new maple tree + * @gfp: The GFP_FLAGS to use for allocations + * + * This function duplicates a maple tree in Depth-First Search (DFS) pre-order + * traversal. It uses memcpy() to copy nodes in the source tree and allocate + * new child nodes in non-leaf nodes. The new node is exactly the same as the + * source node except for all the addresses stored in it. It will be faster than + * traversing all elements in the source tree and inserting them one by one into + * the new tree. + * The user needs to ensure that the attributes of the source tree and the new + * tree are the same, and the new tree needs to be an empty tree, otherwise + * -EINVAL will be returned. + * Note that the user needs to manually lock the source tree and the new tree. + * + * Return: 0 on success, -ENOMEM if memory could not be allocated, -EINVAL If + * the attributes of the two trees are different or the new tree is not an empty + * tree. 
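+ *
+ * (Editor's illustration, not part of the original patch: callers take
+ * both locks themselves; the forking test updated later in this series
+ * does roughly
+ *
+ *	down_write(&mt_lock);
+ *	down_write_nested(&newmt_lock, SINGLE_DEPTH_NESTING);
+ *	ret = __mt_dup(&mt, &newmt, GFP_KERNEL);
+ *
+ * before storing entries into the new tree.)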
+ */ +int __mt_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp) +{ + int ret = 0; + MA_STATE(mas, mt, 0, 0); + MA_STATE(new_mas, new, 0, 0); + + mas_dup_build(&mas, &new_mas, gfp); + if (unlikely(mas_is_err(&mas))) { + ret = xa_err(mas.node); + if (ret == -ENOMEM) + mas_dup_free(&new_mas); + } + + return ret; +} +EXPORT_SYMBOL(__mt_dup); + +/** + * mtree_dup(): Duplicate an entire maple tree + * @mt: The source maple tree + * @new: The new maple tree + * @gfp: The GFP_FLAGS to use for allocations + * + * This function duplicates a maple tree in Depth-First Search (DFS) pre-order + * traversal. It uses memcpy() to copy nodes in the source tree and allocate + * new child nodes in non-leaf nodes. The new node is exactly the same as the + * source node except for all the addresses stored in it. It will be faster than + * traversing all elements in the source tree and inserting them one by one into + * the new tree. + * The user needs to ensure that the attributes of the source tree and the new + * tree are the same, and the new tree needs to be an empty tree, otherwise + * -EINVAL will be returned. + * + * Return: 0 on success, -ENOMEM if memory could not be allocated, -EINVAL If + * the attributes of the two trees are different or the new tree is not an empty + * tree. + */ +int mtree_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp) +{ + int ret = 0; + MA_STATE(mas, mt, 0, 0); + MA_STATE(new_mas, new, 0, 0); + + mas_lock(&new_mas); + mas_lock_nested(&mas, SINGLE_DEPTH_NESTING); + mas_dup_build(&mas, &new_mas, gfp); + mas_unlock(&mas); + if (unlikely(mas_is_err(&mas))) { + ret = xa_err(mas.node); + if (ret == -ENOMEM) + mas_dup_free(&new_mas); + } + + mas_unlock(&new_mas); + return ret; +} +EXPORT_SYMBOL(mtree_dup); + /** * __mt_destroy() - Walk and free all nodes of a locked maple tree. * @mt: The maple tree From f73f881af49ecfcc16be7e460ed8046a91d259ad Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Fri, 27 Oct 2023 11:38:39 +0800 Subject: [PATCH 104/139] FROMGIT: radix tree test suite: align kmem_cache_alloc_bulk() with kernel behavior. When kmem_cache_alloc_bulk() fails to allocate, leave the freed pointers in the array. This enables a more accurate simulation of the kernel's behavior and allows for testing potential double-free scenarios. Link: https://lkml.kernel.org/r/20231027033845.90608-5-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Reviewed-by: Liam R. Howlett Cc: Christian Brauner Cc: Jonathan Corbet Cc: Mateusz Guzik Cc: Mathieu Desnoyers Cc: Matthew Wilcox Cc: Michael S. 
Tsirkin Cc: Mike Christie Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton (cherry picked from commit 46c99e26f2f86260fed226cab217d0b3ca8dca56 https://git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm mm-unstable) Bug: 308042511 Change-Id: If822e9d219066e1573b7c044ef9a7344f652e365 Signed-off-by: Suren Baghdasaryan --- tools/testing/radix-tree/linux.c | 45 +++++++++++++++++++++++--------- 1 file changed, 33 insertions(+), 12 deletions(-) diff --git a/tools/testing/radix-tree/linux.c b/tools/testing/radix-tree/linux.c index d587a558997f..64a1645ff94c 100644 --- a/tools/testing/radix-tree/linux.c +++ b/tools/testing/radix-tree/linux.c @@ -93,13 +93,9 @@ void *kmem_cache_alloc_lru(struct kmem_cache *cachep, struct list_lru *lru, return p; } -void kmem_cache_free_locked(struct kmem_cache *cachep, void *objp) +void __kmem_cache_free_locked(struct kmem_cache *cachep, void *objp) { assert(objp); - uatomic_dec(&nr_allocated); - uatomic_dec(&cachep->nr_allocated); - if (kmalloc_verbose) - printf("Freeing %p to slab\n", objp); if (cachep->nr_objs > 10 || cachep->align) { memset(objp, POISON_FREE, cachep->size); free(objp); @@ -111,6 +107,15 @@ void kmem_cache_free_locked(struct kmem_cache *cachep, void *objp) } } +void kmem_cache_free_locked(struct kmem_cache *cachep, void *objp) +{ + uatomic_dec(&nr_allocated); + uatomic_dec(&cachep->nr_allocated); + if (kmalloc_verbose) + printf("Freeing %p to slab\n", objp); + __kmem_cache_free_locked(cachep, objp); +} + void kmem_cache_free(struct kmem_cache *cachep, void *objp) { pthread_mutex_lock(&cachep->lock); @@ -141,18 +146,17 @@ int kmem_cache_alloc_bulk(struct kmem_cache *cachep, gfp_t gfp, size_t size, if (kmalloc_verbose) pr_debug("Bulk alloc %lu\n", size); - if (!(gfp & __GFP_DIRECT_RECLAIM)) { - if (cachep->non_kernel < size) - return 0; - - cachep->non_kernel -= size; - } - pthread_mutex_lock(&cachep->lock); if (cachep->nr_objs >= size) { struct radix_tree_node *node; for (i = 0; i < size; i++) { + if (!(gfp & __GFP_DIRECT_RECLAIM)) { + if (!cachep->non_kernel) + break; + cachep->non_kernel--; + } + node = cachep->objs; cachep->nr_objs--; cachep->objs = node->parent; @@ -163,11 +167,19 @@ int kmem_cache_alloc_bulk(struct kmem_cache *cachep, gfp_t gfp, size_t size, } else { pthread_mutex_unlock(&cachep->lock); for (i = 0; i < size; i++) { + if (!(gfp & __GFP_DIRECT_RECLAIM)) { + if (!cachep->non_kernel) + break; + cachep->non_kernel--; + } + if (cachep->align) { posix_memalign(&p[i], cachep->align, cachep->size * size); } else { p[i] = malloc(cachep->size * size); + if (!p[i]) + break; } if (cachep->ctor) cachep->ctor(p[i]); @@ -176,6 +188,15 @@ int kmem_cache_alloc_bulk(struct kmem_cache *cachep, gfp_t gfp, size_t size, } } + if (i < size) { + size = i; + pthread_mutex_lock(&cachep->lock); + for (i = 0; i < size; i++) + __kmem_cache_free_locked(cachep, p[i]); + pthread_mutex_unlock(&cachep->lock); + return 0; + } + for (i = 0; i < size; i++) { uatomic_inc(&nr_allocated); uatomic_inc(&cachep->nr_allocated); From 7befa7bbc90ae65849a08dce9bf5ce1845f19494 Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Fri, 27 Oct 2023 11:38:40 +0800 Subject: [PATCH 105/139] FROMGIT: maple_tree: add test for mtree_dup() Add test for mtree_dup(). Test by duplicating different maple trees and then comparing the two trees. Includes tests for duplicating full trees and memory allocation failures on different nodes. 
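As an editor's sketch (not part of the original patch), the duplicate-and-compare pattern that the new tests follow looks roughly like this; compare_tree() is the test helper added in the diff below:

	struct maple_tree mt;
	DEFINE_MTREE(new);
	int ret;

	mt_init_flags(&mt, MT_FLAGS_ALLOC_RANGE);
	mtree_store_range(&mt, 0, 5, xa_mk_value(0), GFP_KERNEL);

	ret = mtree_dup(&mt, &new, GFP_KERNEL);	/* full DFS copy */
	MT_BUG_ON(&new, ret);			/* duplication must succeed */
	mt_validate(&new);			/* check the tree invariants */
	if (compare_tree(&mt, &new))		/* node-by-node comparison */
		MT_BUG_ON(&new, 1);

	mtree_destroy(&mt);
	mtree_destroy(&new);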
Link: https://lkml.kernel.org/r/20231027033845.90608-6-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Reviewed-by: Liam R. Howlett Cc: Christian Brauner Cc: Jonathan Corbet Cc: Mateusz Guzik Cc: Mathieu Desnoyers Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Mike Christie Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton (cherry picked from commit a2587a7e8d37885dc063255f5400a66299b42e48 https://git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm mm-unstable) Bug: 308042511 Change-Id: I7501db5735b1dfd15240ef2946b26d63ffe1d8e0 Signed-off-by: Suren Baghdasaryan --- tools/testing/radix-tree/maple.c | 361 +++++++++++++++++++++++++++++++ 1 file changed, 361 insertions(+) diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c index b598b7fe4419..916cf8b45ddf 100644 --- a/tools/testing/radix-tree/maple.c +++ b/tools/testing/radix-tree/maple.c @@ -35753,6 +35753,363 @@ static noinline void __init check_locky(struct maple_tree *mt) mt_clear_in_rcu(mt); } +/* + * Compares two nodes except for the addresses stored in the nodes. + * Returns zero if they are the same, otherwise returns non-zero. + */ +static int __init compare_node(struct maple_enode *enode_a, + struct maple_enode *enode_b) +{ + struct maple_node *node_a, *node_b; + struct maple_node a, b; + void **slots_a, **slots_b; /* Do not use the rcu tag. */ + enum maple_type type; + int i; + + if (((unsigned long)enode_a & MAPLE_NODE_MASK) != + ((unsigned long)enode_b & MAPLE_NODE_MASK)) { + pr_err("The lower 8 bits of enode are different.\n"); + return -1; + } + + type = mte_node_type(enode_a); + node_a = mte_to_node(enode_a); + node_b = mte_to_node(enode_b); + a = *node_a; + b = *node_b; + + /* Do not compare addresses. */ + if (ma_is_root(node_a) || ma_is_root(node_b)) { + a.parent = (struct maple_pnode *)((unsigned long)a.parent & + MA_ROOT_PARENT); + b.parent = (struct maple_pnode *)((unsigned long)b.parent & + MA_ROOT_PARENT); + } else { + a.parent = (struct maple_pnode *)((unsigned long)a.parent & + MAPLE_NODE_MASK); + b.parent = (struct maple_pnode *)((unsigned long)b.parent & + MAPLE_NODE_MASK); + } + + if (a.parent != b.parent) { + pr_err("The lower 8 bits of parents are different. %p %p\n", + a.parent, b.parent); + return -1; + } + + /* + * If it is a leaf node, the slots do not contain the node address, and + * no special processing of slots is required. + */ + if (ma_is_leaf(type)) + goto cmp; + + slots_a = ma_slots(&a, type); + slots_b = ma_slots(&b, type); + + for (i = 0; i < mt_slots[type]; i++) { + if (!slots_a[i] && !slots_b[i]) + break; + + if (!slots_a[i] || !slots_b[i]) { + pr_err("The number of slots is different.\n"); + return -1; + } + + /* Do not compare addresses in slots. */ + ((unsigned long *)slots_a)[i] &= MAPLE_NODE_MASK; + ((unsigned long *)slots_b)[i] &= MAPLE_NODE_MASK; + } + +cmp: + /* + * Compare all contents of two nodes, including parent (except address), + * slots (except address), pivots, gaps and metadata. + */ + return memcmp(&a, &b, sizeof(struct maple_node)); +} + +/* + * Compare two trees and return 0 if they are the same, non-zero otherwise. 
+ */ +static int __init compare_tree(struct maple_tree *mt_a, struct maple_tree *mt_b) +{ + MA_STATE(mas_a, mt_a, 0, 0); + MA_STATE(mas_b, mt_b, 0, 0); + + if (mt_a->ma_flags != mt_b->ma_flags) { + pr_err("The flags of the two trees are different.\n"); + return -1; + } + + mas_dfs_preorder(&mas_a); + mas_dfs_preorder(&mas_b); + + if (mas_is_ptr(&mas_a) || mas_is_ptr(&mas_b)) { + if (!(mas_is_ptr(&mas_a) && mas_is_ptr(&mas_b))) { + pr_err("One is MAS_ROOT and the other is not.\n"); + return -1; + } + return 0; + } + + while (!mas_is_none(&mas_a) || !mas_is_none(&mas_b)) { + + if (mas_is_none(&mas_a) || mas_is_none(&mas_b)) { + pr_err("One is MAS_NONE and the other is not.\n"); + return -1; + } + + if (mas_a.min != mas_b.min || + mas_a.max != mas_b.max) { + pr_err("mas->min, mas->max do not match.\n"); + return -1; + } + + if (compare_node(mas_a.node, mas_b.node)) { + pr_err("The contents of nodes %p and %p are different.\n", + mas_a.node, mas_b.node); + mt_dump(mt_a, mt_dump_dec); + mt_dump(mt_b, mt_dump_dec); + return -1; + } + + mas_dfs_preorder(&mas_a); + mas_dfs_preorder(&mas_b); + } + + return 0; +} + +static __init void mas_subtree_max_range(struct ma_state *mas) +{ + unsigned long limit = mas->max; + MA_STATE(newmas, mas->tree, 0, 0); + void *entry; + + mas_for_each(mas, entry, limit) { + if (mas->last - mas->index >= + newmas.last - newmas.index) { + newmas = *mas; + } + } + + *mas = newmas; +} + +/* + * build_full_tree() - Build a full tree. + * @mt: The tree to build. + * @flags: Use @flags to build the tree. + * @height: The height of the tree to build. + * + * Build a tree with full leaf nodes and internal nodes. Note that the height + * should not exceed 3, otherwise it will take a long time to build. + * Return: zero if the build is successful, non-zero if it fails. + */ +static __init int build_full_tree(struct maple_tree *mt, unsigned int flags, + int height) +{ + MA_STATE(mas, mt, 0, 0); + unsigned long step; + int ret = 0, cnt = 1; + enum maple_type type; + + mt_init_flags(mt, flags); + mtree_insert_range(mt, 0, ULONG_MAX, xa_mk_value(5), GFP_KERNEL); + + mtree_lock(mt); + + while (1) { + mas_set(&mas, 0); + if (mt_height(mt) < height) { + mas.max = ULONG_MAX; + goto store; + } + + while (1) { + mas_dfs_preorder(&mas); + if (mas_is_none(&mas)) + goto unlock; + + type = mte_node_type(mas.node); + if (mas_data_end(&mas) + 1 < mt_slots[type]) { + mas_set(&mas, mas.min); + goto store; + } + } +store: + mas_subtree_max_range(&mas); + step = mas.last - mas.index; + if (step < 1) { + ret = -1; + goto unlock; + } + + step /= 2; + mas.last = mas.index + step; + mas_store_gfp(&mas, xa_mk_value(5), + GFP_KERNEL); + ++cnt; + } +unlock: + mtree_unlock(mt); + + MT_BUG_ON(mt, mt_height(mt) != height); + /* pr_info("height:%u number of elements:%d\n", mt_height(mt), cnt); */ + return ret; +} + +static noinline void __init check_mtree_dup(struct maple_tree *mt) +{ + DEFINE_MTREE(new); + int i, j, ret, count = 0; + unsigned int rand_seed = 17, rand; + + /* store a value at [0, 0] */ + mt_init_flags(mt, 0); + mtree_store_range(mt, 0, 0, xa_mk_value(0), GFP_KERNEL); + ret = mtree_dup(mt, &new, GFP_KERNEL); + MT_BUG_ON(&new, ret); + mt_validate(&new); + if (compare_tree(mt, &new)) + MT_BUG_ON(&new, 1); + + mtree_destroy(mt); + mtree_destroy(&new); + + /* The two trees have different attributes. 
*/ + mt_init_flags(mt, 0); + mt_init_flags(&new, MT_FLAGS_ALLOC_RANGE); + ret = mtree_dup(mt, &new, GFP_KERNEL); + MT_BUG_ON(&new, ret != -EINVAL); + mtree_destroy(mt); + mtree_destroy(&new); + + /* The new tree is not empty */ + mt_init_flags(mt, 0); + mt_init_flags(&new, 0); + mtree_store(&new, 5, xa_mk_value(5), GFP_KERNEL); + ret = mtree_dup(mt, &new, GFP_KERNEL); + MT_BUG_ON(&new, ret != -EINVAL); + mtree_destroy(mt); + mtree_destroy(&new); + + /* Test for duplicating full trees. */ + for (i = 1; i <= 3; i++) { + ret = build_full_tree(mt, 0, i); + MT_BUG_ON(mt, ret); + mt_init_flags(&new, 0); + + ret = mtree_dup(mt, &new, GFP_KERNEL); + MT_BUG_ON(&new, ret); + mt_validate(&new); + if (compare_tree(mt, &new)) + MT_BUG_ON(&new, 1); + + mtree_destroy(mt); + mtree_destroy(&new); + } + + for (i = 1; i <= 3; i++) { + ret = build_full_tree(mt, MT_FLAGS_ALLOC_RANGE, i); + MT_BUG_ON(mt, ret); + mt_init_flags(&new, MT_FLAGS_ALLOC_RANGE); + + ret = mtree_dup(mt, &new, GFP_KERNEL); + MT_BUG_ON(&new, ret); + mt_validate(&new); + if (compare_tree(mt, &new)) + MT_BUG_ON(&new, 1); + + mtree_destroy(mt); + mtree_destroy(&new); + } + + /* Test for normal duplicating. */ + for (i = 0; i < 1000; i += 3) { + if (i & 1) { + mt_init_flags(mt, 0); + mt_init_flags(&new, 0); + } else { + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + mt_init_flags(&new, MT_FLAGS_ALLOC_RANGE); + } + + for (j = 0; j < i; j++) { + mtree_store_range(mt, j * 10, j * 10 + 5, + xa_mk_value(j), GFP_KERNEL); + } + + ret = mtree_dup(mt, &new, GFP_KERNEL); + MT_BUG_ON(&new, ret); + mt_validate(&new); + if (compare_tree(mt, &new)) + MT_BUG_ON(&new, 1); + + mtree_destroy(mt); + mtree_destroy(&new); + } + + /* Test memory allocation failed. */ + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + for (i = 0; i < 30; i += 3) { + mtree_store_range(mt, j * 10, j * 10 + 5, + xa_mk_value(j), GFP_KERNEL); + } + + /* Failed at the first node. */ + mt_init_flags(&new, MT_FLAGS_ALLOC_RANGE); + mt_set_non_kernel(0); + ret = mtree_dup(mt, &new, GFP_NOWAIT); + mt_set_non_kernel(0); + MT_BUG_ON(&new, ret != -ENOMEM); + mtree_destroy(mt); + mtree_destroy(&new); + + /* Random maple tree fails at a random node. */ + for (i = 0; i < 1000; i += 3) { + if (i & 1) { + mt_init_flags(mt, 0); + mt_init_flags(&new, 0); + } else { + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + mt_init_flags(&new, MT_FLAGS_ALLOC_RANGE); + } + + for (j = 0; j < i; j++) { + mtree_store_range(mt, j * 10, j * 10 + 5, + xa_mk_value(j), GFP_KERNEL); + } + /* + * The rand() library function is not used, so we can generate + * the same random numbers on any platform. 
+ */ + rand_seed = rand_seed * 1103515245 + 12345; + rand = rand_seed / 65536 % 128; + mt_set_non_kernel(rand); + + ret = mtree_dup(mt, &new, GFP_NOWAIT); + mt_set_non_kernel(0); + if (ret != 0) { + MT_BUG_ON(&new, ret != -ENOMEM); + count++; + mtree_destroy(mt); + continue; + } + + mt_validate(&new); + if (compare_tree(mt, &new)) + MT_BUG_ON(&new, 1); + + mtree_destroy(mt); + mtree_destroy(&new); + } + + /* pr_info("mtree_dup() fail %d times\n", count); */ + BUG_ON(!count); +} + extern void test_kmem_cache_bulk(void); void farmer_tests(void) @@ -35800,6 +36157,10 @@ void farmer_tests(void) check_null_expand(&tree); mtree_destroy(&tree); + mt_init_flags(&tree, 0); + check_mtree_dup(&tree); + mtree_destroy(&tree); + /* RCU testing */ mt_init_flags(&tree, 0); check_erase_testset(&tree); From c79ca61edc7b3d27a6fd3acbeb81d4638e1acdec Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Fri, 27 Oct 2023 11:38:41 +0800 Subject: [PATCH 106/139] FROMGIT: maple_tree: update the documentation of maple tree Introduce the new interface mtree_dup() in the documentation. Link: https://lkml.kernel.org/r/20231027033845.90608-7-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Reviewed-by: Liam R. Howlett Cc: Christian Brauner Cc: Jonathan Corbet Cc: Mateusz Guzik Cc: Mathieu Desnoyers Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Mike Christie Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton (cherry picked from commit 9bc1d3cdb904170214456bca96c4924f28522ab8 https://git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm mm-unstable) Bug: 308042511 Change-Id: I3eb330f0be49f7e8d8b37ecb64de3b7ef349c05b Signed-off-by: Suren Baghdasaryan --- Documentation/core-api/maple_tree.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Documentation/core-api/maple_tree.rst b/Documentation/core-api/maple_tree.rst index 45defcf15da7..285e2d2b21ae 100644 --- a/Documentation/core-api/maple_tree.rst +++ b/Documentation/core-api/maple_tree.rst @@ -81,6 +81,9 @@ section. Sometimes it is necessary to ensure the next call to store to a maple tree does not allocate memory, please see :ref:`maple-tree-advanced-api` for this use case. +You can use mtree_dup() to duplicate an entire maple tree. It is a more +efficient way than inserting all elements one by one into a new tree. + Finally, you can remove all entries from a maple tree by calling mtree_destroy(). If the maple tree entries are pointers, you may wish to free the entries first. @@ -112,6 +115,7 @@ Takes ma_lock internally: * mtree_insert() * mtree_insert_range() * mtree_erase() + * mtree_dup() * mtree_destroy() * mt_set_in_rcu() * mt_clear_in_rcu() From e57d333531ad6cfac38e42826d0229c03d6b6b41 Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Fri, 27 Oct 2023 11:38:42 +0800 Subject: [PATCH 107/139] FROMGIT: maple_tree: skip other tests when BENCH is enabled Skip other tests when BENCH is enabled so that performance can be measured in user space. Link: https://lkml.kernel.org/r/20231027033845.90608-8-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Reviewed-by: Liam R. Howlett Cc: Christian Brauner Cc: Jonathan Corbet Cc: Mateusz Guzik Cc: Mathieu Desnoyers Cc: Matthew Wilcox Cc: Michael S. 
Tsirkin Cc: Mike Christie Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton (cherry picked from commit f670fa1caadb4ea532a89012c5451e4c6789bfcc https://git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm mm-unstable) Bug: 308042511 Change-Id: I0a761a4b6211b19ec80c97d5aef80f3979523bcb Signed-off-by: Suren Baghdasaryan --- lib/test_maple_tree.c | 8 ++++---- tools/testing/radix-tree/maple.c | 2 ++ 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/lib/test_maple_tree.c b/lib/test_maple_tree.c index ab9d4461abc9..b80e28cdaa96 100644 --- a/lib/test_maple_tree.c +++ b/lib/test_maple_tree.c @@ -2741,10 +2741,6 @@ static int __init maple_tree_seed(void) pr_info("\nTEST STARTING\n\n"); - mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); - check_root_expand(&tree); - mtree_destroy(&tree); - #if defined(BENCH_SLOT_STORE) #define BENCH mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); @@ -2788,6 +2784,10 @@ static int __init maple_tree_seed(void) goto skip; #endif + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); + check_root_expand(&tree); + mtree_destroy(&tree); + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); check_iteration(&tree); mtree_destroy(&tree); diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c index 916cf8b45ddf..a2626f02f385 100644 --- a/tools/testing/radix-tree/maple.c +++ b/tools/testing/radix-tree/maple.c @@ -36195,7 +36195,9 @@ void farmer_tests(void) void maple_tree_tests(void) { +#if !defined(BENCH) farmer_tests(); +#endif maple_tree_seed(); maple_tree_harvest(); } From 1bec2dd52eabfd5c2908353ab7f3c2f324131da9 Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Fri, 27 Oct 2023 11:38:43 +0800 Subject: [PATCH 108/139] FROMGIT: maple_tree: update check_forking() and bench_forking() Updated check_forking() and bench_forking() to use __mt_dup() to duplicate maple tree. Link: https://lkml.kernel.org/r/20231027033845.90608-9-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Reviewed-by: Liam R. Howlett Cc: Christian Brauner Cc: Jonathan Corbet Cc: Mateusz Guzik Cc: Mathieu Desnoyers Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Mike Christie Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton (cherry picked from commit 446e1867e6df3cbdd19af6be8f8f4ed56176adb4 https://git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm mm-unstable) Bug: 308042511 Change-Id: I3b64ad1cb5ae40b10dc86ee55501f12522c7e2f5 Signed-off-by: Suren Baghdasaryan --- lib/test_maple_tree.c | 117 ++++++++++++++++++------------------ tools/include/linux/rwsem.h | 4 ++ 2 files changed, 62 insertions(+), 59 deletions(-) diff --git a/lib/test_maple_tree.c b/lib/test_maple_tree.c index b80e28cdaa96..68b2c387fddb 100644 --- a/lib/test_maple_tree.c +++ b/lib/test_maple_tree.c @@ -1671,47 +1671,48 @@ static noinline void __init bench_mt_for_each(struct maple_tree *mt) #endif /* check_forking - simulate the kernel forking sequence with the tree. 
*/ -static noinline void __init check_forking(struct maple_tree *mt) +static noinline void __init check_forking(void) { - - struct maple_tree newmt; - int i, nr_entries = 134; + struct maple_tree mt, newmt; + int i, nr_entries = 134, ret; void *val; - MA_STATE(mas, mt, 0, 0); - MA_STATE(newmas, mt, 0, 0); - struct rw_semaphore newmt_lock; + MA_STATE(mas, &mt, 0, 0); + MA_STATE(newmas, &newmt, 0, 0); + struct rw_semaphore mt_lock, newmt_lock; + init_rwsem(&mt_lock); init_rwsem(&newmt_lock); - for (i = 0; i <= nr_entries; i++) - mtree_store_range(mt, i*10, i*10 + 5, - xa_mk_value(i), GFP_KERNEL); + mt_init_flags(&mt, MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN); + mt_set_external_lock(&mt, &mt_lock); - mt_set_non_kernel(99999); mt_init_flags(&newmt, MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN); mt_set_external_lock(&newmt, &newmt_lock); - newmas.tree = &newmt; - mas_reset(&newmas); - mas_reset(&mas); - down_write(&newmt_lock); - mas.index = 0; - mas.last = 0; - if (mas_expected_entries(&newmas, nr_entries)) { + + down_write(&mt_lock); + for (i = 0; i <= nr_entries; i++) { + mas_set_range(&mas, i*10, i*10 + 5); + mas_store_gfp(&mas, xa_mk_value(i), GFP_KERNEL); + } + + down_write_nested(&newmt_lock, SINGLE_DEPTH_NESTING); + ret = __mt_dup(&mt, &newmt, GFP_KERNEL); + if (ret) { pr_err("OOM!"); BUG_ON(1); } - rcu_read_lock(); - mas_for_each(&mas, val, ULONG_MAX) { - newmas.index = mas.index; - newmas.last = mas.last; + + mas_set(&newmas, 0); + mas_for_each(&newmas, val, ULONG_MAX) mas_store(&newmas, val); - } - rcu_read_unlock(); + mas_destroy(&newmas); + mas_destroy(&mas); mt_validate(&newmt); - mt_set_non_kernel(0); __mt_destroy(&newmt); + __mt_destroy(&mt); up_write(&newmt_lock); + up_write(&mt_lock); } static noinline void __init check_iteration(struct maple_tree *mt) @@ -1815,49 +1816,51 @@ static noinline void __init check_mas_store_gfp(struct maple_tree *mt) } #if defined(BENCH_FORK) -static noinline void __init bench_forking(struct maple_tree *mt) +static noinline void __init bench_forking(void) { - - struct maple_tree newmt; - int i, nr_entries = 134, nr_fork = 80000; + struct maple_tree mt, newmt; + int i, nr_entries = 134, nr_fork = 80000, ret; void *val; - MA_STATE(mas, mt, 0, 0); - MA_STATE(newmas, mt, 0, 0); - struct rw_semaphore newmt_lock; + MA_STATE(mas, &mt, 0, 0); + MA_STATE(newmas, &newmt, 0, 0); + struct rw_semaphore mt_lock, newmt_lock; + init_rwsem(&mt_lock); init_rwsem(&newmt_lock); - mt_set_external_lock(&newmt, &newmt_lock); - for (i = 0; i <= nr_entries; i++) - mtree_store_range(mt, i*10, i*10 + 5, - xa_mk_value(i), GFP_KERNEL); + mt_init_flags(&mt, MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN); + mt_set_external_lock(&mt, &mt_lock); + + down_write(&mt_lock); + for (i = 0; i <= nr_entries; i++) { + mas_set_range(&mas, i*10, i*10 + 5); + mas_store_gfp(&mas, xa_mk_value(i), GFP_KERNEL); + } for (i = 0; i < nr_fork; i++) { - mt_set_non_kernel(99999); - mt_init_flags(&newmt, MT_FLAGS_ALLOC_RANGE); - newmas.tree = &newmt; - mas_reset(&newmas); - mas_reset(&mas); - mas.index = 0; - mas.last = 0; - rcu_read_lock(); - down_write(&newmt_lock); - if (mas_expected_entries(&newmas, nr_entries)) { - printk("OOM!"); + mt_init_flags(&newmt, + MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN); + mt_set_external_lock(&newmt, &newmt_lock); + + down_write_nested(&newmt_lock, SINGLE_DEPTH_NESTING); + ret = __mt_dup(&mt, &newmt, GFP_KERNEL); + if (ret) { + pr_err("OOM!"); BUG_ON(1); } - mas_for_each(&mas, val, ULONG_MAX) { - newmas.index = mas.index; - newmas.last = mas.last; + + 
mas_set(&newmas, 0); + mas_for_each(&newmas, val, ULONG_MAX) mas_store(&newmas, val); - } + mas_destroy(&newmas); - rcu_read_unlock(); mt_validate(&newmt); - mt_set_non_kernel(0); __mt_destroy(&newmt); up_write(&newmt_lock); } + mas_destroy(&mas); + __mt_destroy(&mt); + up_write(&mt_lock); } #endif @@ -2771,9 +2774,7 @@ static int __init maple_tree_seed(void) #endif #if defined(BENCH_FORK) #define BENCH - mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); - bench_forking(&tree); - mtree_destroy(&tree); + bench_forking(); goto skip; #endif #if defined(BENCH_MT_FOR_EACH) @@ -2792,9 +2793,7 @@ static int __init maple_tree_seed(void) check_iteration(&tree); mtree_destroy(&tree); - mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); - check_forking(&tree); - mtree_destroy(&tree); + check_forking(); mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); check_mas_store_gfp(&tree); diff --git a/tools/include/linux/rwsem.h b/tools/include/linux/rwsem.h index 83971b3cbfce..f8bffd4a987c 100644 --- a/tools/include/linux/rwsem.h +++ b/tools/include/linux/rwsem.h @@ -37,4 +37,8 @@ static inline int up_write(struct rw_semaphore *sem) { return pthread_rwlock_unlock(&sem->lock); } + +#define down_read_nested(sem, subclass) down_read(sem) +#define down_write_nested(sem, subclass) down_write(sem) + #endif /* _TOOLS_RWSEM_H */ From 3743b40f655249489c00b52a1481f789eba2f4bb Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Fri, 27 Oct 2023 11:38:44 +0800 Subject: [PATCH 109/139] FROMGIT: maple_tree: preserve the tree attributes when destroying maple tree When destroying maple tree, preserve its attributes and then turn it into an empty tree. This allows it to be reused without needing to be reinitialized. Link: https://lkml.kernel.org/r/20231027033845.90608-10-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Reviewed-by: Liam R. Howlett Cc: Christian Brauner Cc: Jonathan Corbet Cc: Mateusz Guzik Cc: Mathieu Desnoyers Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Mike Christie Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton (cherry picked from commit 8e50d32c7a89bde896945e4e572ef28ccd87bbf8 https://git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm mm-unstable) Bug: 308042511 Change-Id: If1725d5a37dcd26bec23e6bffe95d877903dfab1 Signed-off-by: Suren Baghdasaryan --- lib/maple_tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 3bc5414f78ab..1200ff73c1b0 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -6825,7 +6825,7 @@ void __mt_destroy(struct maple_tree *mt) if (xa_is_node(root)) mte_destroy_walk(root, mt); - mt->ma_flags = 0; + mt->ma_flags = mt_attr(mt); } EXPORT_SYMBOL_GPL(__mt_destroy); From ed9b660cd1ad6746e8357e9717981f0c7ceb73ce Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Fri, 27 Oct 2023 11:38:45 +0800 Subject: [PATCH 110/139] BACKPORT: FROMGIT fork: use __mt_dup() to duplicate maple tree in dup_mmap() In dup_mmap(), using __mt_dup() to duplicate the old maple tree and then directly replacing the entries of VMAs in the new maple tree can result in better performance. __mt_dup() uses DFS pre-order to duplicate the maple tree, so it is efficient. The average time complexity of __mt_dup() is O(n), where n is the number of VMAs. The proof of the time complexity is provided in the commit log that introduces __mt_dup(). After duplicating the maple tree, each element is traversed and replaced (ignoring the cases of deletion, which are rare). 
Since it is only a replacement operation for each element, this process is also O(n). Analyzing the exact time complexity of the previous algorithm is challenging because each insertion can involve appending to a node, pushing data to adjacent nodes, or even splitting nodes. The frequency of each action is difficult to calculate. The worst-case scenario for a single insertion is when the tree undergoes splitting at every level. If we consider each insertion as the worst-case scenario, we can determine that the upper bound of the time complexity is O(n*log(n)), although this is a loose upper bound. However, based on the test data, it appears that the actual time complexity is likely to be O(n). As the entire maple tree is duplicated using __mt_dup(), if dup_mmap() fails, there will be a portion of VMAs that have not been duplicated in the maple tree. To handle this, we mark the failure point with XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered, stop releasing VMAs that have not been duplicated after this point. There is a "spawn" in byte-unixbench[1], which can be used to test the performance of fork(). I modified it slightly to make it work with different number of VMAs. Below are the test results. The first row shows the number of VMAs. The second and third rows show the number of fork() calls per ten seconds, corresponding to next-20231006 and the this patchset, respectively. The test results were obtained with CPU binding to avoid scheduler load balancing that could cause unstable results. There are still some fluctuations in the test results, but at least they are better than the original performance. 21 121 221 421 821 1621 3221 6421 12821 25621 51221 112100 76261 54227 34035 20195 11112 6017 3161 1606 802 393 114558 83067 65008 45824 28751 16072 8922 4747 2436 1233 599 2.19% 8.92% 19.88% 34.64% 42.37% 44.64% 48.28% 50.17% 51.68% 53.74% 52.42% [1] https://github.com/kdlucas/byte-unixbench/tree/master Link: https://lkml.kernel.org/r/20231027033845.90608-11-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Suggested-by: Liam R. Howlett Reviewed-by: Liam R. Howlett Cc: Christian Brauner Cc: Jonathan Corbet Cc: Mateusz Guzik Cc: Mathieu Desnoyers Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Mike Christie Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton (cherry picked from commit d2406291483775ecddaee929231a39c70c08fda2 https://git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm mm-unstable) [surenb: open-coded vma_iter_clear_gfp(), vma_iter_bulk_store(); replaced vma_next() with mas_find()] Bug: 308042511 Change-Id: I42d6620e8ce6a0b16211c231a9b72ba16ba9c0d2 Signed-off-by: Suren Baghdasaryan --- kernel/fork.c | 40 +++++++++++++++++++++++++++++----------- mm/memory.c | 7 ++++++- mm/mmap.c | 9 ++++++--- 3 files changed, 41 insertions(+), 15 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index 9ef103c05891..1109a10c5ccd 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -659,7 +659,6 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, int retval; unsigned long charge = 0; LIST_HEAD(uf); - MA_STATE(old_mas, &oldmm->mm_mt, 0, 0); MA_STATE(mas, &mm->mm_mt, 0, 0); uprobe_start_dup_mmap(); @@ -687,16 +686,23 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, goto out; khugepaged_fork(mm, oldmm); - retval = mas_expected_entries(&mas, oldmm->map_count); - if (retval) + /* Use __mt_dup() to efficiently build an identical maple tree. 
*/ + retval = __mt_dup(&oldmm->mm_mt, &mm->mm_mt, GFP_KERNEL); + if (unlikely(retval)) goto out; mt_clear_in_rcu(mas.tree); - mas_for_each(&old_mas, mpnt, ULONG_MAX) { + mas_for_each(&mas, mpnt, ULONG_MAX) { struct file *file; vma_start_write(mpnt); if (mpnt->vm_flags & VM_DONTCOPY) { + __mas_set_range(&mas, mpnt->vm_start, mpnt->vm_end - 1); + mas_store_gfp(&mas, NULL, GFP_KERNEL); + if (unlikely(mas_is_err(&mas))) { + retval = -ENOMEM; + goto loop_out; + } vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt)); continue; } @@ -758,12 +764,13 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, if (is_vm_hugetlb_page(tmp)) hugetlb_dup_vma_private(tmp); - /* Link the vma into the MT */ + /* + * Link the vma into the MT. After using __mt_dup(), memory + * allocation is not necessary here, so it cannot fail. + */ mas.index = tmp->vm_start; mas.last = tmp->vm_end - 1; mas_store(&mas, tmp); - if (mas_is_err(&mas)) - goto fail_nomem_mas_store; mm->map_count++; if (!(tmp->vm_flags & VM_WIPEONFORK)) @@ -772,15 +779,28 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, if (tmp->vm_ops && tmp->vm_ops->open) tmp->vm_ops->open(tmp); - if (retval) + if (retval) { + mpnt = mas_find(&mas, ULONG_MAX); goto loop_out; + } } /* a new mm has just been created */ retval = arch_dup_mmap(oldmm, mm); loop_out: mas_destroy(&mas); - if (!retval) + if (!retval) { mt_set_in_rcu(mas.tree); + } else if (mpnt) { + /* + * The entire maple tree has already been duplicated. If the + * mmap duplication fails, mark the failure point with + * XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered, + * stop releasing VMAs that have not been duplicated after this + * point. + */ + mas_set_range(&mas, mpnt->vm_start, mpnt->vm_end - 1); + mas_store(&mas, XA_ZERO_ENTRY); + } out: mmap_write_unlock(mm); flush_tlb_mm(oldmm); @@ -790,8 +810,6 @@ fail_uprobe_end: uprobe_end_dup_mmap(); return retval; -fail_nomem_mas_store: - unlink_anon_vmas(tmp); fail_nomem_anon_vma_fork: mpol_put(vma_policy(tmp)); fail_nomem_policy: diff --git a/mm/memory.c b/mm/memory.c index 7f29d9394bdf..5e161551a5d1 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -411,6 +411,8 @@ void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt, * be 0. This will underflow and is okay. 
*/ next = mas_find(&mas, ceiling - 1); + if (unlikely(xa_is_zero(next))) + next = NULL; /* * Hide vma from rmap and truncate_pagecache before freeing @@ -432,6 +434,8 @@ void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt, && !is_vm_hugetlb_page(next)) { vma = next; next = mas_find(&mas, ceiling - 1); + if (unlikely(xa_is_zero(next))) + next = NULL; if (mm_wr_locked) vma_start_write(vma); unlink_anon_vmas(vma); @@ -1736,7 +1740,8 @@ void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt, do { unmap_single_vma(tlb, vma, start_addr, end_addr, &details, mm_wr_locked); - } while ((vma = mas_find(&mas, end_t - 1)) != NULL); + vma = mas_find(&mas, end_t - 1); + } while (vma && likely(!xa_is_zero(vma))); mmu_notifier_invalidate_range_end(&range); } diff --git a/mm/mmap.c b/mm/mmap.c index bd2140cfcf36..d5c48b243869 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3303,10 +3303,11 @@ void exit_mmap(struct mm_struct *mm) arch_exit_mmap(mm); vma = mas_find(&mas, ULONG_MAX); - if (!vma) { + if (!vma || unlikely(xa_is_zero(vma))) { /* Can happen if dup_mmap() received an OOM */ mmap_read_unlock(mm); - return; + mmap_write_lock(mm); + goto destroy; } lru_add_drain(); @@ -3339,11 +3340,13 @@ void exit_mmap(struct mm_struct *mm) remove_vma(vma, true); count++; cond_resched(); - } while ((vma = mas_find(&mas, ULONG_MAX)) != NULL); + vma = mas_find(&mas, ULONG_MAX); + } while (vma && likely(!xa_is_zero(vma))); BUG_ON(count != mm->map_count); trace_exit_mmap(mm); +destroy: __mt_destroy(&mm->mm_mt); mmap_write_unlock(mm); vm_unacct_memory(nr_accounted); From 8a597e7a2d06d699fb7b6c7787291e3916a502e2 Mon Sep 17 00:00:00 2001 From: Sebastian Ene Date: Tue, 31 Oct 2023 12:15:46 +0000 Subject: [PATCH 111/139] ANDROID: KVM: arm64: Don't prepopulate MMIO regions for host stage-2 As we reserve only 1GB of memory for the MMIO region don't prepopulate the entire remaining address space with MMIO as this is prone to failure. Instead, let the MMIO regions to be created lazily on the fault path and keep only the RAM regions prepopulated. Bug: 307805059 Test: Boot pKVM with CONFIG_ARM64_16K_PAGES Change-Id: I6327f42eb17c6588335a1e04736393c9032114ab Signed-off-by: Sebastian Ene --- arch/arm64/kvm/hyp/nvhe/mem_protect.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 2c1032a59826..b3920a37f334 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -149,22 +149,16 @@ static void prepare_host_vtcr(void) static int prepopulate_host_stage2(void) { struct memblock_region *reg; - u64 addr = 0; - int i, ret; + int i, ret = 0; for (i = 0; i < hyp_memblock_nr; i++) { reg = &hyp_memory[i]; - ret = host_stage2_idmap_locked(addr, reg->base - addr, PKVM_HOST_MMIO_PROT, false); - if (ret) - return ret; ret = host_stage2_idmap_locked(reg->base, reg->size, PKVM_HOST_MEM_PROT, false); if (ret) return ret; - addr = reg->base + reg->size; } - return host_stage2_idmap_locked(addr, BIT(host_mmu.pgt.ia_bits) - addr, PKVM_HOST_MMIO_PROT, - false); + return ret; } int kvm_host_prepare_stage2(void *pgt_pool_base) From 934a40576ee2c3f07dc59e38a339767d51a84238 Mon Sep 17 00:00:00 2001 From: Stanley Chang Date: Wed, 19 Apr 2023 10:00:42 +0800 Subject: [PATCH 112/139] UPSTREAM: usb: dwc3: core: add support for disabling High-speed park mode Setting the PARKMODE_DISABLE_HS bit in the DWC3_USB3_GUCTL1. 
When this bit is set to '1', all HS bus instances in park mode are disabled. For some USB wifi devices, enabling this feature can reduce performance. Therefore, add an option for disabling HS park mode via device tree. In Synopsys's dwc3 data book: In a few high speed devices, when an IN request is sent within 900ns of the ACK of the previous packet, these devices send a NAK. When connected to these devices, if required, the software can disable the park mode if you see performance drop in your system. When park mode is disabled, pipelining of multiple packets is disabled and instead one packet at a time is requested by the scheduler. This allows up to 12 NAKs in a micro-frame and improves performance of these slow devices. Bug: 300024866 Acked-by: Thinh Nguyen Signed-off-by: Stanley Chang Link: https://lore.kernel.org/r/20230419020044.15475-1-stanley_chang@realtek.com Signed-off-by: Greg Kroah-Hartman Signed-off-by: William Wu (cherry picked from commit d21a797a3eeb2b001e07ff943e5611eab67a71a3) Change-Id: I43ee416e54779a073a0ba4057edf4be8bd7886de Signed-off-by: Kever Yang --- drivers/usb/dwc3/core.c | 5 +++++ drivers/usb/dwc3/core.h | 4 ++++ 2 files changed, 9 insertions(+) diff --git a/drivers/usb/dwc3/core.c b/drivers/usb/dwc3/core.c index 3ee70ffaf003..2b01c3e05ebe 100644 --- a/drivers/usb/dwc3/core.c +++ b/drivers/usb/dwc3/core.c @@ -1233,6 +1233,9 @@ static int dwc3_core_init(struct dwc3 *dwc) if (dwc->parkmode_disable_ss_quirk) reg |= DWC3_GUCTL1_PARKMODE_DISABLE_SS; + if (dwc->parkmode_disable_hs_quirk) + reg |= DWC3_GUCTL1_PARKMODE_DISABLE_HS; + if (DWC3_VER_IS_WITHIN(DWC3, 290A, ANY) && (dwc->maximum_speed == USB_SPEED_HIGH || dwc->maximum_speed == USB_SPEED_FULL)) @@ -1539,6 +1542,8 @@ static void dwc3_get_properties(struct dwc3 *dwc) "snps,resume-hs-terminations"); dwc->parkmode_disable_ss_quirk = device_property_read_bool(dev, "snps,parkmode-disable-ss-quirk"); + dwc->parkmode_disable_hs_quirk = device_property_read_bool(dev, + "snps,parkmode-disable-hs-quirk"); dwc->gfladj_refclk_lpm_sel = device_property_read_bool(dev, "snps,gfladj-refclk-lpm-sel-quirk"); diff --git a/drivers/usb/dwc3/core.h b/drivers/usb/dwc3/core.h index 89219a14efb0..d21888658806 100644 --- a/drivers/usb/dwc3/core.h +++ b/drivers/usb/dwc3/core.h @@ -263,6 +263,7 @@ #define DWC3_GUCTL1_DEV_FORCE_20_CLK_FOR_30_CLK BIT(26) #define DWC3_GUCTL1_DEV_L1_EXIT_BY_HW BIT(24) #define DWC3_GUCTL1_PARKMODE_DISABLE_SS BIT(17) +#define DWC3_GUCTL1_PARKMODE_DISABLE_HS BIT(16) #define DWC3_GUCTL1_RESUME_OPMODE_HS_HOST BIT(10) /* Global Status Register */ @@ -1113,6 +1114,8 @@ struct dwc3_scratchpad_array { * generation after resume from suspend. * @parkmode_disable_ss_quirk: set if we need to disable all SuperSpeed * instances in park mode. + * @parkmode_disable_hs_quirk: set if we need to disable all HighSpeed + * instances in park mode. * @tx_de_emphasis_quirk: set if we enable Tx de-emphasis quirk * @tx_de_emphasis: Tx de-emphasis value * 0 - -6dB de-emphasis @@ -1330,6 +1333,7 @@ struct dwc3 { unsigned dis_tx_ipgap_linecheck_quirk:1; unsigned resume_hs_terminations:1; unsigned parkmode_disable_ss_quirk:1; + unsigned parkmode_disable_hs_quirk:1; unsigned gfladj_refclk_lpm_sel:1; unsigned tx_de_emphasis_quirk:1; From ef67750d99e2a666ee18a46aeb0615516845c089 Mon Sep 17 00:00:00 2001 From: Rick Yiu Date: Wed, 3 Jan 2024 06:56:23 +0000 Subject: [PATCH 113/139] ANDROID: sched: Export symbols for vendor modules Export sysctl_sched_min_granularity and sysctl_sched_idle_min_granularity.
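As an illustrative sketch of the consumer this enables (the module and function names below are hypothetical, not part of the patch), a vendor module reads the newly exported tunables through ordinary extern declarations:

/* Hypothetical vendor-module sketch: read-only use of the exported tunables. */
#include <linux/module.h>
#include <linux/printk.h>

extern unsigned int sysctl_sched_min_granularity;
extern unsigned int sysctl_sched_idle_min_granularity;

static int __init vendor_sched_example_init(void)
{
        /* Mirrors how sched_slice() consumes the values: read, never write. */
        pr_info("min_granularity=%u idle_min_granularity=%u\n",
                sysctl_sched_min_granularity,
                sysctl_sched_idle_min_granularity);
        return 0;
}
module_init(vendor_sched_example_init);

MODULE_LICENSE("GPL");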
The vendor module uses several functions that are static in GKI. Since we do not want to export those static functions, which would require making them non-static, we copied them into the vendor module; as a result, we need to export the symbols used inside those copied functions. For example, sysctl_sched_min_granularity and sysctl_sched_idle_min_granularity are referenced in sched_slice(), and they are only used read-only. Bug: 316276520 Change-Id: I976d0a1f3a70e8e60099e55fdd3cc99a90053fbb Signed-off-by: Rick Yiu --- kernel/sched/fair.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 507d1fc4e163..ac870e416e2a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -96,6 +96,7 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) */ unsigned int sysctl_sched_min_granularity = 750000ULL; +EXPORT_SYMBOL_GPL(sysctl_sched_min_granularity); static unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; /* @@ -105,6 +106,7 @@ static unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; * (default: 0.75 msec) */ unsigned int sysctl_sched_idle_min_granularity = 750000ULL; +EXPORT_SYMBOL_GPL(sysctl_sched_idle_min_granularity); /* * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity From ac90f0829243d0c6d2f9b219b519c0d0be696ab6 Mon Sep 17 00:00:00 2001 From: Rick Yiu Date: Wed, 3 Jan 2024 07:22:49 +0000 Subject: [PATCH 114/139] ANDROID: Update the ABI symbol list Adding the following symbols: - sysctl_sched_idle_min_granularity - sysctl_sched_min_granularity Bug: 316276520 Change-Id: I8e33c3105a3ca62d168a6289ceafc31404757453 Signed-off-by: Rick Yiu --- android/abi_gki_aarch64.stg | 20 ++++++++++++++++++++ android/abi_gki_aarch64_pixel | 2 ++ 2 files changed, 22 insertions(+) diff --git a/android/abi_gki_aarch64.stg b/android/abi_gki_aarch64.stg index 4a190c90a757..113fdec98f04 100644 --- a/android/abi_gki_aarch64.stg +++ b/android/abi_gki_aarch64.stg @@ -388738,6 +388738,15 @@ elf_symbol { type_id: 0x4585663f full_name: "sysctl_sched_features" } +elf_symbol { + id: 0xe6ea21b1 + name: "sysctl_sched_idle_min_granularity" + is_defined: true + symbol_type: OBJECT + crc: 0x69545cfa + type_id: 0x4585663f + full_name: "sysctl_sched_idle_min_granularity" +} elf_symbol { id: 0x87812861 name: "sysctl_sched_latency" @@ -388747,6 +388756,15 @@ elf_symbol { type_id: 0x4585663f full_name: "sysctl_sched_latency" } +elf_symbol { + id: 0x34555a8a + name: "sysctl_sched_min_granularity" + is_defined: true + symbol_type: OBJECT + crc: 0x04390257 + type_id: 0x4585663f + full_name: "sysctl_sched_min_granularity" +} elf_symbol { id: 0x18d0dd21 name: "sysctl_vals" @@ -405076,7 +405094,9 @@ interface { symbol_id: 0x2f857527 symbol_id: 0x3e5f4f82 symbol_id: 0xbf1515af + symbol_id: 0xe6ea21b1 symbol_id: 0x87812861 + symbol_id: 0x34555a8a symbol_id: 0x18d0dd21 symbol_id: 0x92705587 symbol_id: 0xdbe66171 diff --git a/android/abi_gki_aarch64_pixel b/android/abi_gki_aarch64_pixel index a01a49721a1a..1acd4b663615 100644 --- a/android/abi_gki_aarch64_pixel +++ b/android/abi_gki_aarch64_pixel @@ -2091,7 +2091,9 @@ synchronize_rcu syscon_regmap_lookup_by_phandle sysctl_sched_features + sysctl_sched_idle_min_granularity sysctl_sched_latency + sysctl_sched_min_granularity sysfs_add_file_to_group sysfs_add_link_to_group sysfs_create_file_ns From 98b0e4cf0968083bfeaf1b63e5b5873acf534566 Mon Sep 17 00:00:00 2001 From: Mathias Nyman Date: Fri, 15 Sep 2023 17:31:06 +0300
Subject: [PATCH 115/139] BACKPORT: xhci: track port suspend state correctly in unsuccessful resume cases xhci-hub.c tracks suspended ports in a suspended_port bitfield. This is checked when responding to a Get_Status(PORT) request to see if a port in running U0 state was recently resumed, and adds the required USB_PORT_STAT_C_SUSPEND change bit in those cases. The suspended_port bit was left uncleared if a device is disconnected during suspend. The bit remained set even when a new device was connected and enumerated. The set bit resulted in a incorrect Get_Status(PORT) response with a bogus USB_PORT_STAT_C_SUSPEND change bit set once the new device reached U0 link state. USB_PORT_STAT_C_SUSPEND change bit is only used for USB2 ports, but xhci-hub keeps track of both USB2 and USB3 suspended ports. Cc: stable@vger.kernel.org Reported-by: Wesley Cheng Closes: https://lore.kernel.org/linux-usb/d68aa806-b26a-0e43-42fb-b8067325e967@quicinc.com/ Fixes: 1d5810b6923c ("xhci: Rework port suspend structures for limited ports.") Tested-by: Wesley Cheng Signed-off-by: Mathias Nyman Link: https://lore.kernel.org/r/20230915143108.1532163-3-mathias.nyman@linux.intel.com Signed-off-by: Greg Kroah-Hartman Bug: 200589374 (cherry picked from commit d7cdfc319b2bcf6899ab0a05eec0958bc802a9a1 https://git.kernel.org/pub/scm/linux/kernel/git/gregkh/usb.git usb-next) [wcheng: modified change to remove dependency on updated resume timestamp tracking] Change-Id: Icccc1778a1f193b4b4c03532d291db88772bd454 Signed-off-by: Wesley Cheng --- drivers/usb/host/xhci-hub.c | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/drivers/usb/host/xhci-hub.c b/drivers/usb/host/xhci-hub.c index 544028862de0..10eb2175e749 100644 --- a/drivers/usb/host/xhci-hub.c +++ b/drivers/usb/host/xhci-hub.c @@ -1053,19 +1053,19 @@ static void xhci_get_usb3_port_status(struct xhci_port *port, u32 *status, *status |= USB_PORT_STAT_C_CONFIG_ERROR << 16; /* USB3 specific wPortStatus bits */ - if (portsc & PORT_POWER) { + if (portsc & PORT_POWER) *status |= USB_SS_PORT_STAT_POWER; - /* link state handling */ - if (link_state == XDEV_U0) - bus_state->suspended_ports &= ~(1 << portnum); - } - /* remote wake resume signaling complete */ - if (bus_state->port_remote_wakeup & (1 << portnum) && + /* no longer suspended or resuming */ + if (link_state != XDEV_U3 && link_state != XDEV_RESUME && link_state != XDEV_RECOVERY) { - bus_state->port_remote_wakeup &= ~(1 << portnum); - usb_hcd_end_port_resume(&hcd->self, portnum); + /* remote wake resume signaling complete */ + if (bus_state->port_remote_wakeup & (1 << portnum)) { + bus_state->port_remote_wakeup &= ~(1 << portnum); + usb_hcd_end_port_resume(&hcd->self, portnum); + } + bus_state->suspended_ports &= ~(1 << portnum); } xhci_hub_report_usb3_link_state(xhci, status, portsc); @@ -1111,6 +1111,21 @@ static void xhci_get_usb2_port_status(struct xhci_port *port, u32 *status, return; } } + + /* + * Clear usb2 resume signalling variables if port is no longer suspended + * or resuming. Port either resumed to U0/U1/U2, disconnected, or in a + * error state. Resume related variables should be cleared in all those cases. 
+ */ + if (link_state != XDEV_U3 && link_state != XDEV_RESUME) { + if (bus_state->resume_done[portnum] || + test_bit(portnum, &bus_state->resuming_ports)) { + bus_state->resume_done[portnum] = 0; + clear_bit(portnum, &bus_state->resuming_ports); + usb_hcd_end_port_resume(&port->rhub->hcd->self, portnum); + } + bus_state->suspended_ports &= ~(1 << portnum); + } } /* From ec46fe0ac7cb11d1f9b6b4709745af317a28c489 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 6 Dec 2023 09:30:40 +0100 Subject: [PATCH 116/139] UPSTREAM: bpf: Fix prog_array_map_poke_run map poke update commit 4b7de801606e504e69689df71475d27e35336fb3 upstream. Lee pointed out issue found by syscaller [0] hitting BUG in prog array map poke update in prog_array_map_poke_run function due to error value returned from bpf_arch_text_poke function. There's race window where bpf_arch_text_poke can fail due to missing bpf program kallsym symbols, which is accounted for with check for -EINVAL in that BUG_ON call. The problem is that in such case we won't update the tail call jump and cause imbalance for the next tail call update check which will fail with -EBUSY in bpf_arch_text_poke. I'm hitting following race during the program load: CPU 0 CPU 1 bpf_prog_load bpf_check do_misc_fixups prog_array_map_poke_track map_update_elem bpf_fd_array_map_update_elem prog_array_map_poke_run bpf_arch_text_poke returns -EINVAL bpf_prog_kallsyms_add After bpf_arch_text_poke (CPU 1) fails to update the tail call jump, the next poke update fails on expected jump instruction check in bpf_arch_text_poke with -EBUSY and triggers the BUG_ON in prog_array_map_poke_run. Similar race exists on the program unload. Fixing this by moving the update to bpf_arch_poke_desc_update function which makes sure we call __bpf_arch_text_poke that skips the bpf address check. Each architecture has slightly different approach wrt looking up bpf address in bpf_arch_text_poke, so instead of splitting the function or adding new 'checkip' argument in previous version, it seems best to move the whole map_poke_run update as arch specific code. [0] https://syzkaller.appspot.com/bug?extid=97a4fe20470e9bc30810 Bug: 309551558 Fixes: ebf7d1f508a7 ("bpf, x64: rework pro/epilogue and tailcall handling in JIT") Reported-by: syzbot+97a4fe20470e9bc30810@syzkaller.appspotmail.com Signed-off-by: Jiri Olsa Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Cc: Lee Jones Cc: Maciej Fijalkowski Link: https://lore.kernel.org/bpf/20231206083041.1306660-2-jolsa@kernel.org Signed-off-by: Greg Kroah-Hartman (cherry picked from commit 57a6b0a464eb322bd62a78469d251f1d428c5ebb) Signed-off-by: Lee Jones Change-Id: I251c3da579e5d48cd7de4043913fd42d0671d6b5 --- arch/x86/net/bpf_jit_comp.c | 46 +++++++++++++++++++++++++++++ include/linux/bpf.h | 3 ++ kernel/bpf/arraymap.c | 58 +++++++------------------------------ 3 files changed, 59 insertions(+), 48 deletions(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 5e680e039d0e..4686c1d9d0cf 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -2553,3 +2553,49 @@ void bpf_jit_free(struct bpf_prog *prog) bpf_prog_unlock_free(prog); } + +void bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke, + struct bpf_prog *new, struct bpf_prog *old) +{ + u8 *old_addr, *new_addr, *old_bypass_addr; + int ret; + + old_bypass_addr = old ? NULL : poke->bypass_addr; + old_addr = old ? (u8 *)old->bpf_func + poke->adj_off : NULL; + new_addr = new ? 
(u8 *)new->bpf_func + poke->adj_off : NULL; + + /* + * On program loading or teardown, the program's kallsym entry + * might not be in place, so we use __bpf_arch_text_poke to skip + * the kallsyms check. + */ + if (new) { + ret = __bpf_arch_text_poke(poke->tailcall_target, + BPF_MOD_JUMP, + old_addr, new_addr); + BUG_ON(ret < 0); + if (!old) { + ret = __bpf_arch_text_poke(poke->tailcall_bypass, + BPF_MOD_JUMP, + poke->bypass_addr, + NULL); + BUG_ON(ret < 0); + } + } else { + ret = __bpf_arch_text_poke(poke->tailcall_bypass, + BPF_MOD_JUMP, + old_bypass_addr, + poke->bypass_addr); + BUG_ON(ret < 0); + /* let other CPUs finish the execution of program + * so that it will not possible to expose them + * to invalid nop, stack unwind, nop state + */ + if (!ret) + synchronize_rcu(); + ret = __bpf_arch_text_poke(poke->tailcall_target, + BPF_MOD_JUMP, + old_addr, NULL); + BUG_ON(ret < 0); + } +} diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 320d3b287ed0..6a6ff277501d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2697,6 +2697,9 @@ enum bpf_text_poke_type { int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, void *addr1, void *addr2); +void bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke, + struct bpf_prog *new, struct bpf_prog *old); + void *bpf_arch_text_copy(void *dst, void *src, size_t len); int bpf_arch_text_invalidate(void *dst, size_t len); diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 832b2659e96e..00f23febb9a7 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -997,11 +997,16 @@ static void prog_array_map_poke_untrack(struct bpf_map *map, mutex_unlock(&aux->poke_mutex); } +void __weak bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke, + struct bpf_prog *new, struct bpf_prog *old) +{ + WARN_ON_ONCE(1); +} + static void prog_array_map_poke_run(struct bpf_map *map, u32 key, struct bpf_prog *old, struct bpf_prog *new) { - u8 *old_addr, *new_addr, *old_bypass_addr; struct prog_poke_elem *elem; struct bpf_array_aux *aux; @@ -1010,7 +1015,7 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key, list_for_each_entry(elem, &aux->poke_progs, list) { struct bpf_jit_poke_descriptor *poke; - int i, ret; + int i; for (i = 0; i < elem->aux->size_poke_tab; i++) { poke = &elem->aux->poke_tab[i]; @@ -1029,21 +1034,10 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key, * activated, so tail call updates can arrive from here * while JIT is still finishing its final fixup for * non-activated poke entries. - * 3) On program teardown, the program's kallsym entry gets - * removed out of RCU callback, but we can only untrack - * from sleepable context, therefore bpf_arch_text_poke() - * might not see that this is in BPF text section and - * bails out with -EINVAL. As these are unreachable since - * RCU grace period already passed, we simply skip them. - * 4) Also programs reaching refcount of zero while patching + * 3) Also programs reaching refcount of zero while patching * is in progress is okay since we're protected under * poke_mutex and untrack the programs before the JIT - * buffer is freed. When we're still in the middle of - * patching and suddenly kallsyms entry of the program - * gets evicted, we just skip the rest which is fine due - * to point 3). - * 5) Any other error happening below from bpf_arch_text_poke() - * is a unexpected bug. + * buffer is freed. 
*/ if (!READ_ONCE(poke->tailcall_target_stable)) continue; @@ -1053,39 +1047,7 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key, poke->tail_call.key != key) continue; - old_bypass_addr = old ? NULL : poke->bypass_addr; - old_addr = old ? (u8 *)old->bpf_func + poke->adj_off : NULL; - new_addr = new ? (u8 *)new->bpf_func + poke->adj_off : NULL; - - if (new) { - ret = bpf_arch_text_poke(poke->tailcall_target, - BPF_MOD_JUMP, - old_addr, new_addr); - BUG_ON(ret < 0 && ret != -EINVAL); - if (!old) { - ret = bpf_arch_text_poke(poke->tailcall_bypass, - BPF_MOD_JUMP, - poke->bypass_addr, - NULL); - BUG_ON(ret < 0 && ret != -EINVAL); - } - } else { - ret = bpf_arch_text_poke(poke->tailcall_bypass, - BPF_MOD_JUMP, - old_bypass_addr, - poke->bypass_addr); - BUG_ON(ret < 0 && ret != -EINVAL); - /* let other CPUs finish the execution of program - * so that it will not possible to expose them - * to invalid nop, stack unwind, nop state - */ - if (!ret) - synchronize_rcu(); - ret = bpf_arch_text_poke(poke->tailcall_target, - BPF_MOD_JUMP, - old_addr, NULL); - BUG_ON(ret < 0 && ret != -EINVAL); - } + bpf_arch_poke_desc_update(poke, new, old); } } } From 02f444ba07d22d35f4f2ddbf7d7da0643811af15 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 28 Oct 2023 07:30:27 -0600 Subject: [PATCH 117/139] UPSTREAM: io_uring/fdinfo: lock SQ thread while retrieving thread cpu/pid commit 7644b1a1c9a7ae8ab99175989bfc8676055edb46 upstream. We could race with SQ thread exit, and if we do, we'll hit a NULL pointer dereference when the thread is cleared. Grab the SQPOLL data lock before attempting to get the task cpu and pid for fdinfo, this ensures we have a stable view of it. Bug: 309790656 Cc: stable@vger.kernel.org Link: https://bugzilla.kernel.org/show_bug.cgi?id=218032 Reviewed-by: Gabriel Krisman Bertazi Signed-off-by: Jens Axboe Signed-off-by: Sasha Levin (cherry picked from commit 9236d2ea6465b37c0a73d994c1ad31753d31e5f5) Signed-off-by: Lee Jones Change-Id: I044e0285d4535440606ff593230b873e3145db91 --- io_uring/fdinfo.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c index 882bd56b01ed..ea2c2ded4e41 100644 --- a/io_uring/fdinfo.c +++ b/io_uring/fdinfo.c @@ -51,7 +51,6 @@ static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id, static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m) { - struct io_sq_data *sq = NULL; struct io_overflow_cqe *ocqe; struct io_rings *r = ctx->rings; unsigned int sq_mask = ctx->sq_entries - 1, cq_mask = ctx->cq_entries - 1; @@ -62,6 +61,7 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, unsigned int cq_shift = 0; unsigned int sq_shift = 0; unsigned int sq_entries, cq_entries; + int sq_pid = -1, sq_cpu = -1; bool has_lock; unsigned int i; @@ -139,13 +139,19 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, has_lock = mutex_trylock(&ctx->uring_lock); if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) { - sq = ctx->sq_data; - if (!sq->thread) - sq = NULL; + struct io_sq_data *sq = ctx->sq_data; + + if (mutex_trylock(&sq->lock)) { + if (sq->thread) { + sq_pid = task_pid_nr(sq->thread); + sq_cpu = task_cpu(sq->thread); + } + mutex_unlock(&sq->lock); + } } - seq_printf(m, "SqThread:\t%d\n", sq ? task_pid_nr(sq->thread) : -1); - seq_printf(m, "SqThreadCpu:\t%d\n", sq ? 
task_cpu(sq->thread) : -1); + seq_printf(m, "SqThread:\t%d\n", sq_pid); + seq_printf(m, "SqThreadCpu:\t%d\n", sq_cpu); seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files); for (i = 0; has_lock && i < ctx->nr_user_files; i++) { struct file *f = io_file_from_index(&ctx->file_table, i); From 29544d41573d0085debd0416803ec5287948fccd Mon Sep 17 00:00:00 2001 From: Zhipeng Wang Date: Fri, 5 Jan 2024 19:17:45 +0900 Subject: [PATCH 118/139] ANDROID: ABI: Update symbol list for imx 1 function symbol(s) added 'bool iio_trigger_using_own(struct iio_dev*)' Bug: 318788290 Change-Id: I5b17b2380f7087dabf51f4ed207e9ea4cab1ba38 Signed-off-by: Zhipeng Wang --- android/abi_gki_aarch64.stg | 10 ++++++++++ android/abi_gki_aarch64_imx | 1 + 2 files changed, 11 insertions(+) diff --git a/android/abi_gki_aarch64.stg b/android/abi_gki_aarch64.stg index 113fdec98f04..dc2644ea5502 100644 --- a/android/abi_gki_aarch64.stg +++ b/android/abi_gki_aarch64.stg @@ -365298,6 +365298,15 @@ elf_symbol { type_id: 0x16dc304e full_name: "iio_trigger_unregister" } +elf_symbol { + id: 0xfb09b362 + name: "iio_trigger_using_own" + is_defined: true + symbol_type: FUNCTION + crc: 0xe2c1359e + type_id: 0xf886bca4 + full_name: "iio_trigger_using_own" +} elf_symbol { id: 0xdf3e8655 name: "iio_update_buffers" @@ -402489,6 +402498,7 @@ interface { symbol_id: 0x7551a60b symbol_id: 0x08fd4b84 symbol_id: 0xc6d8f246 + symbol_id: 0xfb09b362 symbol_id: 0xdf3e8655 symbol_id: 0x6f2f4bd1 symbol_id: 0xf87ecda4 diff --git a/android/abi_gki_aarch64_imx b/android/abi_gki_aarch64_imx index c3797c477a4c..1b556c8705f6 100644 --- a/android/abi_gki_aarch64_imx +++ b/android/abi_gki_aarch64_imx @@ -1025,6 +1025,7 @@ iio_trigger_poll_chained iio_trigger_register iio_trigger_unregister + iio_trigger_using_own import_iovec in4_pton inet_csk_get_port From d6554d1262c4f39c7b652e02e54bb692e7f20747 Mon Sep 17 00:00:00 2001 From: Wesley Cheng Date: Tue, 21 Nov 2023 14:52:53 -0800 Subject: [PATCH 119/139] FROMGIT: usb: dwc3: gadget: Handle EP0 request dequeuing properly Current EP0 dequeue path will share the same as other EPs. However, there are some special considerations that need to be made for EP0 transfers: - EP0 transfers never transition into the started_list - EP0 only has one active request at a time In case there is a vendor specific control message for a function over USB FFS, then there is no guarantee on the timeline which the DATA/STATUS stage is responded to. While this occurs, any attempt to end transfers on non-control EPs will end up having the DWC3_EP_DELAY_STOP flag set, and defer issuing of the end transfer command. If the USB FFS application decides to timeout the control transfer, or if USB FFS AIO path exits, the USB FFS driver will issue a call to usb_ep_dequeue() for the ep0 request. In case of the AIO exit path, the AIO FS blocks until all pending USB requests utilizing the AIO path is completed. However, since the dequeue of ep0 req does not happen properly, all non-control EPs with the DWC3_EP_DELAY_STOP flag set will not be handled, and the AIO exit path will be stuck waiting for the USB FFS data endpoints to receive a completion callback. Fix is to utilize dwc3_ep0_reset_state() in the dequeue API to ensure EP0 is brought back to the SETUP state, and ensures that any deferred end transfer commands are handled. This also will end any active transfers on EP0, compared to the previous implementation which directly called giveback only. 
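As an illustrative sketch of the kind of caller this change fixes (the function driver type, timer field, and request field below are hypothetical; usb_ep_dequeue() and the composite ep0 layout are real), a function driver timing out an unanswered vendor control transfer simply dequeues its ep0 request, which now resets EP0 to the SETUP state and flushes any deferred end transfers:

/*
 * Hypothetical sketch: a gadget function driver abandoning a stalled
 * vendor-specific control transfer. After this patch, dequeuing the
 * ep0 request ends the active transfer via dwc3_ep0_reset_state()
 * instead of only giving the request back.
 */
#include <linux/timer.h>
#include <linux/usb/composite.h>

struct f_vendor {                       /* hypothetical function driver state */
        struct usb_composite_dev *cdev;
        struct usb_request *ctrl_req;
        struct timer_list ctrl_timer;
};

static void f_vendor_ctrl_timeout(struct timer_list *t)
{
        struct f_vendor *vf = from_timer(vf, t, ctrl_timer);

        /* ep0 is the control endpoint shared by all composite functions. */
        usb_ep_dequeue(vf->cdev->gadget->ep0, vf->ctrl_req);
}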
Fixes: fcd2def66392 ("usb: dwc3: gadget: Refactor dwc3_gadget_ep_dequeue") Acked-by: Thinh Nguyen Signed-off-by: Wesley Cheng Bug: 318577849 Change-Id: Ic00684db4b502f1aab128f7e49f22510dda24f60 (cherry picked from commit 730e12fbec53ab59dd807d981a204258a4cfb29a https://git.kernel.org/pub/scm/linux/kernel/git/gregkh/usb.git usb-testing) Signed-off-by: Wesley Cheng --- drivers/usb/dwc3/gadget.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c index 6fdac0fae461..121092e35ec6 100644 --- a/drivers/usb/dwc3/gadget.c +++ b/drivers/usb/dwc3/gadget.c @@ -2093,7 +2093,17 @@ static int dwc3_gadget_ep_dequeue(struct usb_ep *ep, list_for_each_entry(r, &dep->pending_list, list) { if (r == req) { - dwc3_gadget_giveback(dep, req, -ECONNRESET); + /* + * Explicitly check for EP0/1 as dequeue for those + * EPs need to be handled differently. Control EP + * only deals with one USB req, and giveback will + * occur during dwc3_ep0_stall_and_restart(). EP0 + * requests are never added to started_list. + */ + if (dep->number > 1) + dwc3_gadget_giveback(dep, req, -ECONNRESET); + else + dwc3_ep0_reset_state(dwc); goto out; } } From 02aa72665c85f3398ada5ba37bacd56732799e11 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 2 Oct 2023 13:54:28 +0300 Subject: [PATCH 120/139] UPSTREAM: nvmet-tcp: Fix a possible UAF in queue initialization setup commit d920abd1e7c4884f9ecd0749d1921b7ab19ddfbd upstream. From Alon: "Due to a logical bug in the NVMe-oF/TCP subsystem in the Linux kernel, a malicious user can cause a UAF and a double free, which may lead to RCE (may also lead to an LPE in case the attacker already has local privileges)." Hence, when a queue initialization fails after the ahash requests are allocated, it is guaranteed that the queue removal async work will be called, so leave the deallocation to the queue removal. Also, be extra careful not to continue processing the socket, so set queue rcv_state to NVMET_TCP_RECV_ERR upon a socket error.
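To make the ownership rule concrete, here is a minimal sketch of the fixed pattern (every name below is a stand-in, not a real nvmet-tcp symbol): once the ahash requests exist, the icreq error path only propagates the error, and the queue-removal work remains the single free site, so no double free is possible:

/* Simplified sketch of the single-owner cleanup rule the fix enforces. */
struct demo_queue {
        int state;
        bool crypto_allocated;
};

static int demo_handle_icreq(struct demo_queue *queue)
{
        int ret = demo_send_icresp(queue);      /* hypothetical send helper */

        if (ret < 0)
                return ret;     /* no free here; removal work owns cleanup */

        queue->state = DEMO_Q_LIVE;             /* hypothetical state value */
        return 0;
}

/* Called exactly once from the queue-removal work: the only free site. */
static void demo_release_queue(struct demo_queue *queue)
{
        if (queue->crypto_allocated)
                demo_free_crypto(queue);        /* hypothetical */
}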
Bug: 310114968 Cc: stable@vger.kernel.org Reported-by: Alon Zahavi Tested-by: Alon Zahavi Signed-off-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Signed-off-by: Keith Busch Signed-off-by: Greg Kroah-Hartman (cherry picked from commit e985d78bdcf37f7ef73666a43b0d2407715f00d3) Signed-off-by: Lee Jones Change-Id: Ifd7ec8294182a6bf6d8c261aeda5d989e909f7ff --- drivers/nvme/target/tcp.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index 5e29da94f72d..355d80323b83 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -345,6 +345,7 @@ static void nvmet_tcp_fatal_error(struct nvmet_tcp_queue *queue) static void nvmet_tcp_socket_error(struct nvmet_tcp_queue *queue, int status) { + queue->rcv_state = NVMET_TCP_RECV_ERR; if (status == -EPIPE || status == -ECONNRESET) kernel_sock_shutdown(queue->sock, SHUT_RDWR); else @@ -871,15 +872,11 @@ static int nvmet_tcp_handle_icreq(struct nvmet_tcp_queue *queue) iov.iov_len = sizeof(*icresp); ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len); if (ret < 0) - goto free_crypto; + return ret; /* queue removal will cleanup */ queue->state = NVMET_TCP_Q_LIVE; nvmet_prepare_receive_pdu(queue); return 0; -free_crypto: - if (queue->hdr_digest || queue->data_digest) - nvmet_tcp_free_crypto(queue); - return ret; } static void nvmet_tcp_handle_req_failure(struct nvmet_tcp_queue *queue, From 5070b3b594e321d5abf9613b15f662c6638ea959 Mon Sep 17 00:00:00 2001 From: Zhengchao Shao Date: Thu, 23 Nov 2023 15:13:14 +0800 Subject: [PATCH 121/139] UPSTREAM: ipv4: igmp: fix refcnt uaf issue when receiving igmp query packet [ Upstream commit e2b706c691905fe78468c361aaabc719d0a496f1 ] When I perform the following test operations: 1.ip link add br0 type bridge 2.brctl addif br0 eth0 3.ip addr add 239.0.0.1/32 dev eth0 4.ip addr add 239.0.0.1/32 dev br0 5.ip addr add 224.0.0.1/32 dev br0 6.while ((1)) do ifconfig br0 up ifconfig br0 down done 7.send IGMPv2 query packets to port eth0 continuously. For example, ./mausezahn ethX -c 0 "01 00 5e 00 00 01 00 72 19 88 aa 02 08 00 45 00 00 1c 00 01 00 00 01 02 0e 7f c0 a8 0a b7 e0 00 00 01 11 64 ee 9b 00 00 00 00" The preceding tests may trigger the refcnt uaf issue of the mc list. The stack is as follows: refcount_t: addition on 0; use-after-free. 
WARNING: CPU: 21 PID: 144 at lib/refcount.c:25 refcount_warn_saturate (lib/refcount.c:25) CPU: 21 PID: 144 Comm: ksoftirqd/21 Kdump: loaded Not tainted 6.7.0-rc1-next-20231117-dirty #80 Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011 RIP: 0010:refcount_warn_saturate (lib/refcount.c:25) RSP: 0018:ffffb68f00657910 EFLAGS: 00010286 RAX: 0000000000000000 RBX: ffff8a00c3bf96c0 RCX: ffff8a07b6160908 RDX: 00000000ffffffd8 RSI: 0000000000000027 RDI: ffff8a07b6160900 RBP: ffff8a00cba36862 R08: 0000000000000000 R09: 00000000ffff7fff R10: ffffb68f006577c0 R11: ffffffffb0fdcdc8 R12: ffff8a00c3bf9680 R13: ffff8a00c3bf96f0 R14: 0000000000000000 R15: ffff8a00d8766e00 FS: 0000000000000000(0000) GS:ffff8a07b6140000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000055f10b520b28 CR3: 000000039741a000 CR4: 00000000000006f0 Call Trace: igmp_heard_query (net/ipv4/igmp.c:1068) igmp_rcv (net/ipv4/igmp.c:1132) ip_protocol_deliver_rcu (net/ipv4/ip_input.c:205) ip_local_deliver_finish (net/ipv4/ip_input.c:234) __netif_receive_skb_one_core (net/core/dev.c:5529) netif_receive_skb_internal (net/core/dev.c:5729) netif_receive_skb (net/core/dev.c:5788) br_handle_frame_finish (net/bridge/br_input.c:216) nf_hook_bridge_pre (net/bridge/br_input.c:294) __netif_receive_skb_core (net/core/dev.c:5423) __netif_receive_skb_list_core (net/core/dev.c:5606) __netif_receive_skb_list (net/core/dev.c:5674) netif_receive_skb_list_internal (net/core/dev.c:5764) napi_gro_receive (net/core/gro.c:609) e1000_clean_rx_irq (drivers/net/ethernet/intel/e1000/e1000_main.c:4467) e1000_clean (drivers/net/ethernet/intel/e1000/e1000_main.c:3805) __napi_poll (net/core/dev.c:6533) net_rx_action (net/core/dev.c:6735) __do_softirq (kernel/softirq.c:554) run_ksoftirqd (kernel/softirq.c:913) smpboot_thread_fn (kernel/smpboot.c:164) kthread (kernel/kthread.c:388) ret_from_fork (arch/x86/kernel/process.c:153) ret_from_fork_asm (arch/x86/entry/entry_64.S:250) The root causes are as follows: Thread A Thread B ... netif_receive_skb br_dev_stop ... br_multicast_leave_snoopers ... __ip_mc_dec_group ... __igmp_group_dropped igmp_rcv igmp_stop_timer igmp_heard_query //ref = 1 ip_ma_put igmp_mod_timer refcount_dec_and_test igmp_start_timer //ref = 0 ... refcount_inc //ref increases from 0 When the device receives an IGMPv2 Query message, it starts the timer immediately, regardless of whether the device is running. If the device is down and has left the multicast group, it will cause the mc list refcount uaf issue. Bug: 316932391 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Zhengchao Shao Reviewed-by: Eric Dumazet Reviewed-by: Hangbin Liu Signed-off-by: David S. 
Miller Signed-off-by: Sasha Levin (cherry picked from commit 94445d9583079e0ccc5dde1370076ff24800d86e) Signed-off-by: Lee Jones Change-Id: I277be2304e564994e05b981ccd6cd8cbb9dc85be --- net/ipv4/igmp.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index cbc4816ed7d8..ac53ef7eec91 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -216,8 +216,10 @@ static void igmp_start_timer(struct ip_mc_list *im, int max_delay) int tv = prandom_u32_max(max_delay); im->tm_running = 1; - if (!mod_timer(&im->timer, jiffies+tv+2)) - refcount_inc(&im->refcnt); + if (refcount_inc_not_zero(&im->refcnt)) { + if (mod_timer(&im->timer, jiffies + tv + 2)) + ip_ma_put(im); + } } static void igmp_gq_start_timer(struct in_device *in_dev) From c5dc4b4b3d11ade04153efa22dbaa77159d73be5 Mon Sep 17 00:00:00 2001 From: Jia-Shiuan Chen Date: Tue, 9 Jan 2024 07:12:12 +0000 Subject: [PATCH 122/139] ANDROID: Update the ABI symbol list Adding the following symbols: - dma_fence_array_ops Bug: 319196045 Change-Id: Id65c62e0aedd65c9c72d71c8e39f7fae1e1de740 Signed-off-by: Jia-Shiuan Chen --- android/abi_gki_aarch64_pixel | 1 + 1 file changed, 1 insertion(+) diff --git a/android/abi_gki_aarch64_pixel b/android/abi_gki_aarch64_pixel index 1acd4b663615..353946f63fd3 100644 --- a/android/abi_gki_aarch64_pixel +++ b/android/abi_gki_aarch64_pixel @@ -543,6 +543,7 @@ dmaengine_unmap_put dma_fence_add_callback dma_fence_array_create + dma_fence_array_ops dma_fence_context_alloc dma_fence_default_wait dma_fence_enable_sw_signaling From 031f804149b99bcec0f52fdfcf89b85a8141a5da Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Mon, 27 Nov 2023 09:31:46 +0000 Subject: [PATCH 123/139] ANDROID: KVM: arm64: Avoid BUG-ing from the host abort path Under certain circumstances __get_fault_info() may resolve the faulting address using the AT instruction. Given that this is being done outside of the host lock critical section, it is racy and the resolution via AT may fail. We currently BUG() in this situation, which is obviously less than ideal. Moving the address resolution to the critical section may have a performance impact, so let's keep it where it is, but bail out and return to the host to try a second time. Bug: 311830307 Change-Id: I26d61b04a4ccf040bd31802abb3c6b998ff4a48b Signed-off-by: Quentin Perret --- arch/arm64/kvm/hyp/nvhe/mem_protect.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index b3920a37f334..3a5193ca0fb3 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -875,7 +875,14 @@ void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt) int ret = -EPERM; esr = read_sysreg_el2(SYS_ESR); - BUG_ON(!__get_fault_info(esr, &fault)); + if (!__get_fault_info(esr, &fault)) { + addr = (u64)-1; + /* + * We've presumably raced with a page-table change which caused + * AT to fail, try again. 
+ */ + goto return_to_host; + } fault.esr_el2 = esr; addr = (fault.hpfar_el2 & HPFAR_MASK) << 8; @@ -902,6 +909,7 @@ void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt) else BUG_ON(ret && ret != -EAGAIN); +return_to_host: trace_host_mem_abort(esr, addr); } From 928b3b5dde28ee2c7e85888e13ee2593ed864532 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 19 Dec 2023 19:44:49 +0100 Subject: [PATCH 124/139] UPSTREAM: netfilter: nf_tables: skip set commit for deleted/destroyed sets commit 7315dc1e122c85ffdfc8defffbb8f8b616c2eb1a upstream. NFT_MSG_DELSET deactivates all elements in the set, skip set->ops->commit() to avoid the unnecessary clone (for the pipapo case) as well as the sync GC cycle, which could deactivate again expired elements in such set. Bug: 318548348 Fixes: 5f68718b34a5 ("netfilter: nf_tables: GC transaction API to avoid race with control plane") Reported-by: Kevin Rich Signed-off-by: Pablo Neira Ayuso Signed-off-by: Greg Kroah-Hartman (cherry picked from commit 0105571f80edb96f81bb4bbdd5233a9130dc345b) Signed-off-by: Lee Jones Change-Id: Ie733688e27d9568d797fc1bc477261883b7dc8c1 --- net/netfilter/nf_tables_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 6b5f22dc1d94..30802f7f2114 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -9474,7 +9474,7 @@ static void nft_set_commit_update(struct list_head *set_update_list) list_for_each_entry_safe(set, next, set_update_list, pending_update) { list_del_init(&set->pending_update); - if (!set->ops->commit) + if (!set->ops->commit || set->dead) continue; set->ops->commit(set); From c1b1201d39dcb59bf20e45f40459ea25b37392b7 Mon Sep 17 00:00:00 2001 From: Dezhi Huang Date: Mon, 25 Dec 2023 15:06:46 +0800 Subject: [PATCH 125/139] BACKPORT: FROMLIST: dma-buf: Move sysfs work out of DMA-BUF export path We have identified an animation lag issue on our Android 14-6.1 product which seems to be caused by contention in the rwsem lock during the dmabuf request process. It appears that other processes are holding sysfs read locks, resulting in the blocking of dmabuf sysfs node creation. We encountered an issue in android14-6.1 that is similar to the problem described in [1]. So we cherry-pick this commit to android14-6.1. [1] https://android-review.googlesource.com/c/kernel/common/+/2111974 Bug: 311282169 Bug: 206979019 Link: https://lore.kernel.org/lkml/CABdmKX2dNYhgOYdrrJU6-jt6F=LjCidbKhR6t4F7yaa0SPr+-A@mail.gmail.com/T/ Signed-off-by: Dezhi Huang Conflicts: include/linux/dma-buf.h 1. The android14-6.1 KMI is frozen, and the modification to struct dma_buf_sysfs_entry in the original patch triggers ABI check failures. Instead of an anonymous union, use the existing struct kobject directly as a work_struct with type punning. Signed-off-by: T.J. 
Mercier Change-Id: Ic0386849b6b248b0a72215633fc1a50782455bac --- drivers/dma-buf/dma-buf-sysfs-stats.c | 66 +++++++++++++++++++++------ drivers/dma-buf/dma-buf.c | 16 +++++-- 2 files changed, 63 insertions(+), 19 deletions(-) diff --git a/drivers/dma-buf/dma-buf-sysfs-stats.c b/drivers/dma-buf/dma-buf-sysfs-stats.c index 4b680e10c15a..46520c4d8ec9 100644 --- a/drivers/dma-buf/dma-buf-sysfs-stats.c +++ b/drivers/dma-buf/dma-buf-sysfs-stats.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "dma-buf-sysfs-stats.h" @@ -168,35 +169,72 @@ void dma_buf_uninit_sysfs_statistics(void) kset_unregister(dma_buf_stats_kset); } +static void sysfs_add_workfn(struct work_struct *work) +{ + /* The ABI would have to change for this to be false, but let's be paranoid. */ + _Static_assert(sizeof(struct kobject) >= sizeof(struct work_struct), + "kobject is smaller than work_struct"); + + struct dma_buf_sysfs_entry *sysfs_entry = + container_of((struct kobject *)work, struct dma_buf_sysfs_entry, kobj); + struct dma_buf *dmabuf = sysfs_entry->dmabuf; + + /* + * A dmabuf is ref-counted via its file member. If this handler holds the only + * reference to the dmabuf, there is no need for sysfs kobject creation. This is an + * optimization and a race; when the reference count drops to 1 immediately after + * this check it is not harmful as the sysfs entry will still get cleaned up in + * dma_buf_stats_teardown, which won't get called until the final dmabuf reference + * is released, and that can't happen until the end of this function. + */ + if (file_count(dmabuf->file) > 1) { + /* + * kobject_init_and_add expects kobject to be zero-filled, but we have populated it + * (the sysfs_add_work union member) to trigger this work function. + */ + memset(&dmabuf->sysfs_entry->kobj, 0, sizeof(dmabuf->sysfs_entry->kobj)); + dmabuf->sysfs_entry->kobj.kset = dma_buf_per_buffer_stats_kset; + if (kobject_init_and_add(&dmabuf->sysfs_entry->kobj, &dma_buf_ktype, NULL, + "%lu", file_inode(dmabuf->file)->i_ino)) { + kobject_put(&dmabuf->sysfs_entry->kobj); + dmabuf->sysfs_entry = NULL; + } + } else { + /* + * Free the sysfs_entry and reset the pointer so dma_buf_stats_teardown doesn't + * attempt to operate on it. + */ + kfree(dmabuf->sysfs_entry); + dmabuf->sysfs_entry = NULL; + } + dma_buf_put(dmabuf); +} + int dma_buf_stats_setup(struct dma_buf *dmabuf, struct file *file) { struct dma_buf_sysfs_entry *sysfs_entry; - int ret; + struct work_struct *work; if (!dmabuf->exp_name) { pr_err("exporter name must not be empty if stats needed\n"); return -EINVAL; } - sysfs_entry = kzalloc(sizeof(struct dma_buf_sysfs_entry), GFP_KERNEL); + sysfs_entry = kmalloc(sizeof(struct dma_buf_sysfs_entry), GFP_KERNEL); if (!sysfs_entry) return -ENOMEM; - sysfs_entry->kobj.kset = dma_buf_per_buffer_stats_kset; sysfs_entry->dmabuf = dmabuf; - dmabuf->sysfs_entry = sysfs_entry; - /* create the directory for buffer stats */ - ret = kobject_init_and_add(&sysfs_entry->kobj, &dma_buf_ktype, NULL, - "%lu", file_inode(file)->i_ino); - if (ret) - goto err_sysfs_dmabuf; + /* + * The use of kobj as a work_struct is an ugly hack + * to avoid an ABI break in this frozen kernel. + */ + work = (struct work_struct *)&dmabuf->sysfs_entry->kobj; + INIT_WORK(work, sysfs_add_workfn); + get_dma_buf(dmabuf); /* This reference will be dropped in sysfs_add_workfn. 
*/ + schedule_work(work); return 0; - -err_sysfs_dmabuf: - kobject_put(&sysfs_entry->kobj); - dmabuf->sysfs_entry = NULL; - return ret; } diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c index e0d42ee76b43..fbe8a07552ef 100644 --- a/drivers/dma-buf/dma-buf.c +++ b/drivers/dma-buf/dma-buf.c @@ -727,10 +727,6 @@ struct dma_buf *dma_buf_export(const struct dma_buf_export_info *exp_info) dmabuf->resv = resv; } - ret = dma_buf_stats_setup(dmabuf, file); - if (ret) - goto err_dmabuf; - file->private_data = dmabuf; file->f_path.dentry->d_fsdata = dmabuf; dmabuf->file = file; @@ -739,9 +735,19 @@ struct dma_buf *dma_buf_export(const struct dma_buf_export_info *exp_info) list_add(&dmabuf->list_node, &db_list.head); mutex_unlock(&db_list.lock); + ret = dma_buf_stats_setup(dmabuf, file); + if (ret) + goto err_sysfs; + return dmabuf; -err_dmabuf: +err_sysfs: + mutex_lock(&db_list.lock); + list_del(&dmabuf->list_node); + mutex_unlock(&db_list.lock); + dmabuf->file = NULL; + file->f_path.dentry->d_fsdata = NULL; + file->private_data = NULL; if (!resv) dma_resv_fini(dmabuf->resv); kfree(dmabuf); From 227b55a7a3ccacd83510b176f4d879de1887b2fa Mon Sep 17 00:00:00 2001 From: Pavankumar Kondeti Date: Thu, 8 Dec 2022 16:16:37 +0530 Subject: [PATCH 126/139] ANDROID: dma-buf: don't re-purpose kobject as work_struct The commit 5aec776ef8c9 ("BACKPORT: ANDROID: dma-buf: Move sysfs work out of DMA-BUF export path) re-purposed kobject as work_struct temporarily to create the sysfs entries asynchronously. The author knows what he is doing and rightly added a build assert if kobject struct size is smaller than the work_struct size. We are hitting this build assert on a non-GKI platform where CONFIG_ANDROID_KABI_RESERVE is not set. Fix this problem by allocating a new union with dma_buf_sysfs_entry structure and temporary structure as members. We only end up allocating more memory (because of union) only when kobject size is smaller than work_struct which the original patch any way assumed would never be true. Bug: 261818147 Bug: 262666413 Change-Id: Ifb089bf80d8a3a44ece9f05fc0b99ee76cb11645 Signed-off-by: Pavankumar Kondeti (cherry picked from commit ce18af9b5d7d0baad2ac3eea4c732d2bf128d690) Signed-off-by: T.J. Mercier --- drivers/dma-buf/dma-buf-sysfs-stats.c | 44 +++++++++++++++------------ 1 file changed, 24 insertions(+), 20 deletions(-) diff --git a/drivers/dma-buf/dma-buf-sysfs-stats.c b/drivers/dma-buf/dma-buf-sysfs-stats.c index 46520c4d8ec9..4f3ee92dbe1b 100644 --- a/drivers/dma-buf/dma-buf-sysfs-stats.c +++ b/drivers/dma-buf/dma-buf-sysfs-stats.c @@ -169,15 +169,21 @@ void dma_buf_uninit_sysfs_statistics(void) kset_unregister(dma_buf_stats_kset); } +struct dma_buf_create_sysfs_entry { + struct dma_buf *dmabuf; + struct work_struct work; +}; + +union dma_buf_create_sysfs_work_entry { + struct dma_buf_create_sysfs_entry create_entry; + struct dma_buf_sysfs_entry sysfs_entry; +}; + static void sysfs_add_workfn(struct work_struct *work) { - /* The ABI would have to change for this to be false, but let's be paranoid. 
*/ - _Static_assert(sizeof(struct kobject) >= sizeof(struct work_struct), - "kobject is smaller than work_struct"); - - struct dma_buf_sysfs_entry *sysfs_entry = - container_of((struct kobject *)work, struct dma_buf_sysfs_entry, kobj); - struct dma_buf *dmabuf = sysfs_entry->dmabuf; + struct dma_buf_create_sysfs_entry *create_entry = + container_of(work, struct dma_buf_create_sysfs_entry, work); + struct dma_buf *dmabuf = create_entry->dmabuf; /* * A dmabuf is ref-counted via its file member. If this handler holds the only @@ -188,6 +194,7 @@ static void sysfs_add_workfn(struct work_struct *work) * is released, and that can't happen until the end of this function. */ if (file_count(dmabuf->file) > 1) { + dmabuf->sysfs_entry->dmabuf = dmabuf; /* * kobject_init_and_add expects kobject to be zero-filled, but we have populated it * (the sysfs_add_work union member) to trigger this work function. @@ -212,29 +219,26 @@ static void sysfs_add_workfn(struct work_struct *work) int dma_buf_stats_setup(struct dma_buf *dmabuf, struct file *file) { - struct dma_buf_sysfs_entry *sysfs_entry; - struct work_struct *work; + struct dma_buf_create_sysfs_entry *create_entry; + union dma_buf_create_sysfs_work_entry *work_entry; if (!dmabuf->exp_name) { pr_err("exporter name must not be empty if stats needed\n"); return -EINVAL; } - sysfs_entry = kmalloc(sizeof(struct dma_buf_sysfs_entry), GFP_KERNEL); - if (!sysfs_entry) + work_entry = kmalloc(sizeof(union dma_buf_create_sysfs_work_entry), GFP_KERNEL); + if (!work_entry) return -ENOMEM; - sysfs_entry->dmabuf = dmabuf; - dmabuf->sysfs_entry = sysfs_entry; + dmabuf->sysfs_entry = &work_entry->sysfs_entry; - /* - * The use of kobj as a work_struct is an ugly hack - * to avoid an ABI break in this frozen kernel. - */ - work = (struct work_struct *)&dmabuf->sysfs_entry->kobj; - INIT_WORK(work, sysfs_add_workfn); + create_entry = &work_entry->create_entry; + create_entry->dmabuf = dmabuf; + + INIT_WORK(&create_entry->work, sysfs_add_workfn); get_dma_buf(dmabuf); /* This reference will be dropped in sysfs_add_workfn. */ - schedule_work(work); + schedule_work(&create_entry->work); return 0; } From bc4d82ee40515f0c770b28d0dc4fa532a2b1850e Mon Sep 17 00:00:00 2001 From: Norihiko Hama Date: Fri, 15 Dec 2023 12:04:47 +0900 Subject: [PATCH 127/139] ANDROID: KMI workaround for CONFIG_NETFILTER_FAMILY_BRIDGE Enabling CONFIG_NETFILTER_FAMILY_BRIDGE causes the new element, hooks_bridge[] to be added to netns_nf. Since the KMI is frozen this could not be added. The only instantiation of struct netns_nf is as an embedded field of struct net. So instead of adding the field to struct netns_nf, a new "struct ext_net" is added that contains struct net and the new hooks_bridge[] field. An accessor function, get_nf_hooks_bridge() is added to get a pointer to the new field. There is a global init_net of type struct net which must be special cased since it is not a member of a struct ext_net. All other instances of struct net are allocated via net_alloc() which now allocates a struct ext_net. Since CONFIG_NETFILTER_FAMILY_BRIDGE is a hidden config that is needed for vendor modules, it is enabled via init/Kconfig.gki. 
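As an illustration of the kind of vendor module this unblocks, here is a minimal sketch that registers a bridge-family netfilter hook; the module name, handler, and priority are hypothetical, and only the registration API and constants come from the kernel headers touched here.

#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/netfilter.h>
#include <linux/netfilter_bridge.h>
#include <net/net_namespace.h>

/* Pass every bridged frame through unchanged; a real module would
 * inspect or filter skb here. */
static unsigned int vendor_br_hook(void *priv, struct sk_buff *skb,
				   const struct nf_hook_state *state)
{
	return NF_ACCEPT;
}

static struct nf_hook_ops vendor_br_ops = {
	.hook     = vendor_br_hook,
	.pf       = NFPROTO_BRIDGE,
	.hooknum  = NF_BR_PRE_ROUTING,
	.priority = NF_BR_PRI_FIRST,
};

static int __init vendor_br_init(void)
{
	/* With NETFILTER_FAMILY_BRIDGE enabled, this registration lands in
	 * the hook array reached through get_nf_hooks_bridge(). */
	return nf_register_net_hook(&init_net, &vendor_br_ops);
}

static void __exit vendor_br_exit(void)
{
	nf_unregister_net_hook(&init_net, &vendor_br_ops);
}

module_init(vendor_br_init);
module_exit(vendor_br_exit);
MODULE_LICENSE("GPL");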
Bug: 316040984 Fixes: 0145780bfc78 ("fix KASAN-related kernel crash by KMI W/A for NETFILTER_FAMILY_BRIDGE") Change-Id: I2c7384e3df9b88f12464dc0138986fed12ca626a Signed-off-by: Norihiko Hama --- include/linux/netfilter.h | 2 +- include/net/net_namespace.h | 30 ++++++++++++++++++++++++++++++ include/net/netns/netfilter.h | 3 --- init/Kconfig.gki | 1 + net/bridge/br_input.c | 2 +- net/bridge/br_netfilter_hooks.c | 2 +- net/core/net_namespace.c | 10 +++++++--- net/netfilter/core.c | 12 +++++++++--- net/netfilter/nf_queue.c | 2 +- net/netfilter/nfnetlink_hook.c | 4 ++-- 10 files changed, 53 insertions(+), 15 deletions(-) diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 445494c502ba..49dc95d21f01 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -243,7 +243,7 @@ static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net, break; case NFPROTO_BRIDGE: #ifdef CONFIG_NETFILTER_FAMILY_BRIDGE - hook_head = rcu_dereference(net->nf.hooks_bridge[hook]); + hook_head = rcu_dereference(get_nf_hooks_bridge(net)[hook]); #endif break; default: diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 8c3587d5c308..6641c4543d18 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -188,6 +188,36 @@ struct net { #endif } __randomize_layout; +/* + * To work around a KMI issue, hooks_bridge[] could not be + * added to struct netns_nf. Since the only use of netns_nf + * is embedded in struct net, struct ext_net is added to + * contain struct net plus the new field. Users of the new + * field must use get_nf_hooks_bridge() to access the field. + */ +struct ext_net { + struct net net; +#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE + struct nf_hook_entries __rcu *hooks_bridge[NF_INET_NUMHOOKS]; +#endif + ANDROID_VENDOR_DATA(1); +}; + +#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE +extern struct net init_net; +extern struct nf_hook_entries **init_nf_hooks_bridgep; + +static inline struct nf_hook_entries __rcu **get_nf_hooks_bridge(const struct net *net) +{ + struct ext_net *ext_net; + + if (net == &init_net) + return init_nf_hooks_bridgep; + ext_net = container_of(net, struct ext_net, net); + return ext_net->hooks_bridge; +} +#endif + #include /* Init's network namespace */ diff --git a/include/net/netns/netfilter.h b/include/net/netns/netfilter.h index 3b7eb0cb1201..56c72117b5b3 100644 --- a/include/net/netns/netfilter.h +++ b/include/net/netns/netfilter.h @@ -22,9 +22,6 @@ struct netns_nf { #ifdef CONFIG_NETFILTER_FAMILY_ARP struct nf_hook_entries __rcu *hooks_arp[NF_ARP_NUMHOOKS]; #endif -#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE - struct nf_hook_entries __rcu *hooks_bridge[NF_INET_NUMHOOKS]; -#endif #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4) unsigned int defrag_ipv4_users; #endif diff --git a/init/Kconfig.gki b/init/Kconfig.gki index 081b1cdc9c7e..1a17a3d6e27b 100644 --- a/init/Kconfig.gki +++ b/init/Kconfig.gki @@ -202,6 +202,7 @@ config GKI_HIDDEN_NET_CONFIGS select PAGE_POOL select NET_PTP_CLASSIFY select NET_DEVLINK + select NETFILTER_FAMILY_BRIDGE help Dummy config option used to enable the networking hidden config, required by various SoC platforms. 
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 6bb272894c96..0da15e1f3f72 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -243,7 +243,7 @@ static int nf_hook_bridge_pre(struct sk_buff *skb, struct sk_buff **pskb) goto frame_finish; #endif - e = rcu_dereference(net->nf.hooks_bridge[NF_BR_PRE_ROUTING]); + e = rcu_dereference(get_nf_hooks_bridge(net)[NF_BR_PRE_ROUTING]); if (!e) goto frame_finish; diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 01d690d9fe5f..c7f0aedf2244 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -1016,7 +1016,7 @@ int br_nf_hook_thresh(unsigned int hook, struct net *net, unsigned int i; int ret; - e = rcu_dereference(net->nf.hooks_bridge[hook]); + e = rcu_dereference(get_nf_hooks_bridge(net)[hook]); if (!e) return okfn(net, sk, skb); diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 4c1707d0eb9b..1d0110152862 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -1093,9 +1093,13 @@ void __init net_ns_init(void) struct net_generic *ng; #ifdef CONFIG_NET_NS - net_cachep = kmem_cache_create("net_namespace", sizeof(struct net), - SMP_CACHE_BYTES, - SLAB_PANIC|SLAB_ACCOUNT, NULL); + /* Allocate size for struct ext_net instead of struct net + * to fix a KMI issue when CONFIG_NETFILTER_FAMILY_BRIDGE + * is enabled + */ + net_cachep = kmem_cache_create("net_namespace", sizeof(struct ext_net), + SMP_CACHE_BYTES, + SLAB_PANIC | SLAB_ACCOUNT, NULL); /* Create workqueue for cleanup */ netns_wq = create_singlethread_workqueue("netns"); diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 55a7f72d547c..6c7a44f84b93 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -39,6 +39,12 @@ struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; EXPORT_SYMBOL(nf_hooks_needed); #endif +#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE +struct nf_hook_entries __rcu *init_nf_hooks_bridge[NF_INET_NUMHOOKS]; +struct nf_hook_entries __rcu **init_nf_hooks_bridgep = &init_nf_hooks_bridge[0]; +EXPORT_SYMBOL_GPL(init_nf_hooks_bridgep); +#endif + static DEFINE_MUTEX(nf_hook_mutex); /* max hooks per family/hooknum */ @@ -278,9 +284,9 @@ nf_hook_entry_head(struct net *net, int pf, unsigned int hooknum, #endif #ifdef CONFIG_NETFILTER_FAMILY_BRIDGE case NFPROTO_BRIDGE: - if (WARN_ON_ONCE(ARRAY_SIZE(net->nf.hooks_bridge) <= hooknum)) + if (WARN_ON_ONCE(hooknum >= NF_INET_NUMHOOKS)) return NULL; - return net->nf.hooks_bridge + hooknum; + return get_nf_hooks_bridge(net) + hooknum; #endif #ifdef CONFIG_NETFILTER_INGRESS case NFPROTO_INET: @@ -747,7 +753,7 @@ static int __net_init netfilter_net_init(struct net *net) __netfilter_net_init(net->nf.hooks_arp, ARRAY_SIZE(net->nf.hooks_arp)); #endif #ifdef CONFIG_NETFILTER_FAMILY_BRIDGE - __netfilter_net_init(net->nf.hooks_bridge, ARRAY_SIZE(net->nf.hooks_bridge)); + __netfilter_net_init(get_nf_hooks_bridge(net), NF_INET_NUMHOOKS); #endif #ifdef CONFIG_PROC_FS net->nf.proc_netfilter = proc_net_mkdir(net, "netfilter", diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index 63d1516816b1..566f7794bf58 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -281,7 +281,7 @@ static struct nf_hook_entries *nf_hook_entries_head(const struct net *net, u8 pf switch (pf) { #ifdef CONFIG_NETFILTER_FAMILY_BRIDGE case NFPROTO_BRIDGE: - return rcu_dereference(net->nf.hooks_bridge[hooknum]); + return rcu_dereference(get_nf_hooks_bridge(net)[hooknum]); #endif case NFPROTO_IPV4: return 
rcu_dereference(net->nf.hooks_ipv4[hooknum]); diff --git a/net/netfilter/nfnetlink_hook.c b/net/netfilter/nfnetlink_hook.c index 8120aadf6a0f..3ca3c3a3ba01 100644 --- a/net/netfilter/nfnetlink_hook.c +++ b/net/netfilter/nfnetlink_hook.c @@ -210,9 +210,9 @@ nfnl_hook_entries_head(u8 pf, unsigned int hook, struct net *net, const char *de break; case NFPROTO_BRIDGE: #ifdef CONFIG_NETFILTER_FAMILY_BRIDGE - if (hook >= ARRAY_SIZE(net->nf.hooks_bridge)) + if (hook >= NF_INET_NUMHOOKS) return ERR_PTR(-EINVAL); - hook_head = rcu_dereference(net->nf.hooks_bridge[hook]); + hook_head = rcu_dereference(get_nf_hooks_bridge(net)[hook]); #endif break; #if defined(CONFIG_NETFILTER_INGRESS) || defined(CONFIG_NETFILTER_EGRESS) From febcf1429fa1f4cc476fa9616fe81d3b29059818 Mon Sep 17 00:00:00 2001 From: Aran Dalton Date: Fri, 5 Jan 2024 19:07:02 +0800 Subject: [PATCH 128/139] ANDROID: gki_defconfig: Set CONFIG_IDLE_INJECT and CONFIG_CPU_IDLE_THERMAL into y Under certain circumstances a SoC can reach a critical temperature limit and is unable to stabilize the temperature around a temperature control. The system may ask for a specific power budget, but because of the OPP density, we can only choose an OPP with a power budget lower than the requested one and under-utilize the CPU, thus losing performance. In other words, one OPP under-utilizes the CPU with a power less than the requested power budget and the next OPP exceeds the power budget. CPU idle cooling can solve this problem. Bug: 299411923 Signed-off-by: Aran Dalton Change-Id: I1c17b340617e88be075097dc47f30ce94be2a4d7 --- arch/arm64/configs/gki_defconfig | 2 ++ arch/x86/configs/gki_defconfig | 2 ++ 2 files changed, 4 insertions(+) diff --git a/arch/arm64/configs/gki_defconfig b/arch/arm64/configs/gki_defconfig index 38c90d04264e..009675466150 100644 --- a/arch/arm64/configs/gki_defconfig +++ b/arch/arm64/configs/gki_defconfig @@ -431,6 +431,7 @@ CONFIG_THERMAL_WRITABLE_TRIPS=y CONFIG_THERMAL_GOV_USER_SPACE=y CONFIG_THERMAL_GOV_POWER_ALLOCATOR=y CONFIG_CPU_THERMAL=y +CONFIG_CPU_IDLE_THERMAL=y CONFIG_DEVFREQ_THERMAL=y CONFIG_THERMAL_EMULATION=y CONFIG_WATCHDOG=y @@ -580,6 +581,7 @@ CONFIG_IIO_TRIGGER=y CONFIG_PWM=y CONFIG_GENERIC_PHY=y CONFIG_POWERCAP=y +CONFIG_IDLE_INJECT=y CONFIG_ANDROID_BINDER_IPC=y CONFIG_ANDROID_BINDERFS=y CONFIG_ANDROID_DEBUG_SYMBOLS=y diff --git a/arch/x86/configs/gki_defconfig b/arch/x86/configs/gki_defconfig index 22797e912979..7e2df44033bc 100644 --- a/arch/x86/configs/gki_defconfig +++ b/arch/x86/configs/gki_defconfig @@ -396,6 +396,7 @@ CONFIG_THERMAL_EMERGENCY_POWEROFF_DELAY_MS=100 CONFIG_THERMAL_WRITABLE_TRIPS=y CONFIG_THERMAL_GOV_USER_SPACE=y CONFIG_CPU_THERMAL=y +CONFIG_CPU_IDLE_THERMAL=y CONFIG_DEVFREQ_THERMAL=y CONFIG_THERMAL_EMULATION=y # CONFIG_X86_PKG_TEMP_THERMAL is not set @@ -523,6 +524,7 @@ CONFIG_IIO=y CONFIG_IIO_BUFFER=y CONFIG_IIO_TRIGGER=y CONFIG_POWERCAP=y +CONFIG_IDLE_INJECT=y CONFIG_ANDROID_BINDER_IPC=y CONFIG_ANDROID_BINDERFS=y CONFIG_ANDROID_DEBUG_SYMBOLS=y From 28154afe74965b64bfb305f609cbc4cda7cf7004 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 18 Dec 2023 14:52:14 -0800 Subject: [PATCH 129/139] FROMLIST: scsi: ufs: Simplify power management during async scan ufshcd_init() calls pm_runtime_get_sync() before it calls async_schedule(). ufshcd_async_scan() calls pm_runtime_put_sync() directly or indirectly from ufshcd_add_lus(). Simplify ufshcd_async_scan() by always calling pm_runtime_put_sync() from ufshcd_async_scan().
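To make the reference pairing concrete, here is a minimal sketch of the discipline this patch establishes; the helper names are hypothetical stand-ins for the real ufshcd functions, not the driver code itself.

#include <linux/async.h>
#include <linux/device.h>
#include <linux/pm_runtime.h>

/* Stand-in for ufshcd_add_lus(): probe logical units, may fail. */
static int example_probe_luns(struct device *dev)
{
	return 0;
}

static void example_async_scan(void *data, async_cookie_t cookie)
{
	struct device *dev = data;
	int ret;

	ret = example_probe_luns(dev);
	/* Drop the reference taken before async_schedule(), exactly once,
	 * on both the success and the failure path. */
	pm_runtime_put_sync(dev);
	if (ret)
		dev_err(dev, "%s failed: %d\n", __func__, ret);
}

static int example_init(struct device *dev)
{
	/* Keep the device powered until the async scan has finished. */
	pm_runtime_get_sync(dev);
	async_schedule(example_async_scan, dev);
	return 0;
}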
Cc: stable@vger.kernel.org Change-Id: I4b6ede95360c665594963fff0962742728064fb0 Signed-off-by: Bart Van Assche Bug: 310401362 Link: https://lore.kernel.org/linux-scsi/20231218225229.2542156-2-bvanassche@acm.org/ Signed-off-by: Bart Van Assche --- drivers/ufs/core/ufshcd.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index 94db7033989a..0a86de0feb79 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -8683,7 +8683,6 @@ static int ufshcd_add_lus(struct ufs_hba *hba) ufs_bsg_probe(hba); ufshpb_init(hba); scsi_scan_host(hba->host); - pm_runtime_put_sync(hba->dev); out: return ret; @@ -8916,15 +8915,15 @@ static void ufshcd_async_scan(void *data, async_cookie_t cookie) /* Probe and add UFS logical units */ ret = ufshcd_add_lus(hba); + out: + pm_runtime_put_sync(hba->dev); /* * If we failed to initialize the device or the device is not * present, turn off the power/clocks etc. */ - if (ret) { - pm_runtime_put_sync(hba->dev); + if (ret) ufshcd_hba_exit(hba); - } } static enum scsi_timeout_action ufshcd_eh_timed_out(struct scsi_cmnd *scmd) From 7c91752f5dd9d5c187e9c02282f726a0206e590b Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 18 Dec 2023 14:52:15 -0800 Subject: [PATCH 130/139] FROMLIST: scsi: ufs: Remove the ufshcd_hba_exit() call from ufshcd_async_scan() Calling ufshcd_hba_exit() from a function that is called asynchronously from ufshcd_init() is wrong because this triggers multiple race conditions. Instead of calling ufshcd_hba_exit(), log an error message. Reported-by: Daniel Mentz Closes: https://b.corp.google.com/issues/310401362 Fixes: 1d337ec2f35e ("ufs: improve init sequence") Change-Id: I1c056c2e42889301f69107468f2b3eb38bf3d734 Signed-off-by: Bart Van Assche Bug: 310401362 Link: https://lore.kernel.org/linux-scsi/20231218225229.2542156-3-bvanassche@acm.org/ Signed-off-by: Bart Van Assche --- drivers/ufs/core/ufshcd.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index 0a86de0feb79..0f0cfea31cbf 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -8918,12 +8918,9 @@ static void ufshcd_async_scan(void *data, async_cookie_t cookie) out: pm_runtime_put_sync(hba->dev); - /* - * If we failed to initialize the device or the device is not - * present, turn off the power/clocks etc. - */ + if (ret) - ufshcd_hba_exit(hba); + dev_err(hba->dev, "%s failed: %d\n", __func__, ret); } static enum scsi_timeout_action ufshcd_eh_timed_out(struct scsi_cmnd *scmd) From 0801d8a89de16b3371ac091b123081565d58f53a Mon Sep 17 00:00:00 2001 From: liangjlee Date: Tue, 9 Jan 2024 03:17:54 +0000 Subject: [PATCH 131/139] ANDROID: mm: export dump_tasks symbol. Export dump_tasks to dump per-task memory status when generating a ramdump. Bug: 316372318 Change-Id: Ie0dd1a4c7ada280dc0c7696781b4b9a5e2a100ab Signed-off-by: liangjlee --- mm/oom_kill.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 2c5b854f767b..76a1954071e1 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -420,7 +420,7 @@ static int dump_task(struct task_struct *p, void *arg) * State information includes task's pid, uid, tgid, vm size, rss, * pgtables_bytes, swapents, oom_score_adj value, and name.
*/ -static void dump_tasks(struct oom_control *oc) +void dump_tasks(struct oom_control *oc) { pr_info("Tasks state (memory values in pages):\n"); pr_info("[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name\n"); @@ -436,6 +436,7 @@ static void dump_tasks(struct oom_control *oc) rcu_read_unlock(); } } +EXPORT_SYMBOL_GPL(dump_tasks); static void dump_oom_summary(struct oom_control *oc, struct task_struct *victim) { From a41a4ee3704a02c39e67207df9ecc09a0e018e53 Mon Sep 17 00:00:00 2001 From: liangjlee Date: Wed, 10 Jan 2024 09:00:21 +0000 Subject: [PATCH 132/139] ANDROID: Update the ABI symbol list Adding the following symbols: - dump_tasks Bug: 316372318 Change-Id: Iddaed980a227d8beb966cf0fae24947f5bf8b473 Signed-off-by: liangjlee --- android/abi_gki_aarch64.stg | 15 +++++++++++++++ android/abi_gki_aarch64_pixel | 1 + 2 files changed, 16 insertions(+) diff --git a/android/abi_gki_aarch64.stg b/android/abi_gki_aarch64.stg index dc2644ea5502..573e477d70d3 100644 --- a/android/abi_gki_aarch64.stg +++ b/android/abi_gki_aarch64.stg @@ -298635,6 +298635,11 @@ function { parameter_id: 0x3e10b518 parameter_id: 0x0bb0c019 } +function { + id: 0x1f821b4c + return_type_id: 0x48b5725f + parameter_id: 0x3c692b7e +} function { id: 0x1f835b6f return_type_id: 0x48b5725f @@ -359364,6 +359369,15 @@ elf_symbol { type_id: 0x10985193 full_name: "dump_stack" } +elf_symbol { + id: 0x652fbf96 + name: "dump_tasks" + is_defined: true + symbol_type: FUNCTION + crc: 0x6fe3e49b + type_id: 0x1f821b4c + full_name: "dump_tasks" +} elf_symbol { id: 0xda364c85 name: "dw_handle_msi_irq" @@ -401839,6 +401853,7 @@ interface { symbol_id: 0xe09fd784 symbol_id: 0xded28924 symbol_id: 0xe3421d56 + symbol_id: 0x652fbf96 symbol_id: 0xda364c85 symbol_id: 0x68e0756b symbol_id: 0x12cb063e diff --git a/android/abi_gki_aarch64_pixel b/android/abi_gki_aarch64_pixel index 353946f63fd3..d0f6d7be74ff 100644 --- a/android/abi_gki_aarch64_pixel +++ b/android/abi_gki_aarch64_pixel @@ -813,6 +813,7 @@ drm_writeback_signal_completion dump_backtrace dump_stack + dump_tasks dw_handle_msi_irq dw_pcie_find_capability dw_pcie_host_init From 800cac4b33b120783548aba4f04f6fce05453e27 Mon Sep 17 00:00:00 2001 From: Vinayak Yadawad Date: Wed, 29 Nov 2023 18:20:43 +0530 Subject: [PATCH 133/139] FROMGIT: wifi: nl80211: Extend del pmksa support for SAE and OWE security Current handling of del pmksa with SSID is limited to FILS security. In the current change the del pmksa support is extended to SAE/OWE security offloads as well. For OWE/SAE offloads, the PMK is generated and cached at driver/FW, so user app needs the capability to request cache deletion based on SSID for drivers supporting SAE/OWE offload. 
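For illustration, here is a minimal userspace sketch of the request this enables: deleting a PMKSA entry keyed on the SSID alone, for drivers advertising SAE or OWE offload. It assumes libnl-genl and omits error handling and the ACK wait.

#include <netlink/genl/genl.h>
#include <netlink/genl/ctrl.h>
#include <linux/nl80211.h>
#include <net/if.h>
#include <string.h>

int del_pmksa_by_ssid(const char *ifname, const char *ssid)
{
	struct nl_sock *sk = nl_socket_alloc();
	struct nl_msg *msg = nlmsg_alloc();
	int family;

	genl_connect(sk);
	family = genl_ctrl_resolve(sk, "nl80211");

	genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, family, 0, 0,
		    NL80211_CMD_DEL_PMKSA, 0);
	nla_put_u32(msg, NL80211_ATTR_IFINDEX, if_nametoindex(ifname));
	/* No NL80211_ATTR_MAC and no FILS cache id: the entry is
	 * identified by the SSID, the case added by this patch. */
	nla_put(msg, NL80211_ATTR_SSID, strlen(ssid), ssid);

	nl_send_auto(sk, msg);
	nlmsg_free(msg);
	nl_socket_free(sk);
	return 0;
}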
Signed-off-by: Vinayak Yadawad Link: https://msgid.link/ecdae726459e0944c377a6a6f6cb2c34d2e057d0.1701262123.git.vinayak.yadawad@broadcom.com [drop whitespace-damaged rdev_ops pointer completely, enabling tracing] Signed-off-by: Johannes Berg Bug: 301410304 (cherry picked from commit aa0887c4f18e280f8c2aa6964af602bd16c37f54 https://git.kernel.org/pub/scm/linux/kernel/git/wireless/wireless-next.git main) Change-Id: Ia665b9760279eb77347e79c97d177cba3beaa107 Signed-off-by: Paul Chen --- include/uapi/linux/nl80211.h | 3 +- net/wireless/nl80211.c | 94 +++++++++++++++++++++++++----------- 2 files changed, 69 insertions(+), 28 deletions(-) diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index f3af16ce1f64..50a59769828a 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -567,7 +567,8 @@ * @NL80211_CMD_DEL_PMKSA: Delete a PMKSA cache entry, using %NL80211_ATTR_MAC * (for the BSSID) and %NL80211_ATTR_PMKID or using %NL80211_ATTR_SSID, * %NL80211_ATTR_FILS_CACHE_ID, and %NL80211_ATTR_PMKID in case of FILS - * authentication. + * authentication. Additionally in case of SAE offload and OWE offloads + * PMKSA entry can be deleted using %NL80211_ATTR_SSID. * @NL80211_CMD_FLUSH_PMKSA: Flush all PMKSA cache entries. * * @NL80211_CMD_REG_CHANGE: indicates to userspace the regulatory domain diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index d919eff62ebe..0fdf95420bec 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -12017,16 +12017,18 @@ static int nl80211_wiphy_netns(struct sk_buff *skb, struct genl_info *info) return err; } -static int nl80211_setdel_pmksa(struct sk_buff *skb, struct genl_info *info) +static int nl80211_set_pmksa(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; - int (*rdev_ops)(struct wiphy *wiphy, struct net_device *dev, - struct cfg80211_pmksa *pmksa) = NULL; struct net_device *dev = info->user_ptr[1]; struct cfg80211_pmksa pmksa; + bool ap_pmksa_caching_support = false; memset(&pmksa, 0, sizeof(struct cfg80211_pmksa)); + ap_pmksa_caching_support = wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_AP_PMKSA_CACHING); + if (!info->attrs[NL80211_ATTR_PMKID]) return -EINVAL; @@ -12035,16 +12037,15 @@ static int nl80211_setdel_pmksa(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NL80211_ATTR_MAC]) { pmksa.bssid = nla_data(info->attrs[NL80211_ATTR_MAC]); } else if (info->attrs[NL80211_ATTR_SSID] && - info->attrs[NL80211_ATTR_FILS_CACHE_ID] && - (info->genlhdr->cmd == NL80211_CMD_DEL_PMKSA || - info->attrs[NL80211_ATTR_PMK])) { + info->attrs[NL80211_ATTR_FILS_CACHE_ID] && + info->attrs[NL80211_ATTR_PMK]) { pmksa.ssid = nla_data(info->attrs[NL80211_ATTR_SSID]); pmksa.ssid_len = nla_len(info->attrs[NL80211_ATTR_SSID]); - pmksa.cache_id = - nla_data(info->attrs[NL80211_ATTR_FILS_CACHE_ID]); + pmksa.cache_id = nla_data(info->attrs[NL80211_ATTR_FILS_CACHE_ID]); } else { return -EINVAL; } + if (info->attrs[NL80211_ATTR_PMK]) { pmksa.pmk = nla_data(info->attrs[NL80211_ATTR_PMK]); pmksa.pmk_len = nla_len(info->attrs[NL80211_ATTR_PMK]); @@ -12056,32 +12057,71 @@ static int nl80211_setdel_pmksa(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NL80211_ATTR_PMK_REAUTH_THRESHOLD]) pmksa.pmk_reauth_threshold = - nla_get_u8( - info->attrs[NL80211_ATTR_PMK_REAUTH_THRESHOLD]); + nla_get_u8(info->attrs[NL80211_ATTR_PMK_REAUTH_THRESHOLD]); if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION && dev->ieee80211_ptr->iftype != 
NL80211_IFTYPE_P2P_CLIENT && - !(dev->ieee80211_ptr->iftype == NL80211_IFTYPE_AP && - wiphy_ext_feature_isset(&rdev->wiphy, - NL80211_EXT_FEATURE_AP_PMKSA_CACHING))) + !((dev->ieee80211_ptr->iftype == NL80211_IFTYPE_AP || + dev->ieee80211_ptr->iftype == NL80211_IFTYPE_P2P_GO) && + ap_pmksa_caching_support)) return -EOPNOTSUPP; - switch (info->genlhdr->cmd) { - case NL80211_CMD_SET_PMKSA: - rdev_ops = rdev->ops->set_pmksa; - break; - case NL80211_CMD_DEL_PMKSA: - rdev_ops = rdev->ops->del_pmksa; - break; - default: - WARN_ON(1); - break; + if (!rdev->ops->set_pmksa) + return -EOPNOTSUPP; + + return rdev_set_pmksa(rdev, dev, &pmksa); +} + +static int nl80211_del_pmksa(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct cfg80211_pmksa pmksa; + bool sae_offload_support = false; + bool owe_offload_support = false; + bool ap_pmksa_caching_support = false; + + memset(&pmksa, 0, sizeof(struct cfg80211_pmksa)); + + sae_offload_support = wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_SAE_OFFLOAD); + owe_offload_support = wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_OWE_OFFLOAD); + ap_pmksa_caching_support = wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_AP_PMKSA_CACHING); + + if (info->attrs[NL80211_ATTR_PMKID]) + pmksa.pmkid = nla_data(info->attrs[NL80211_ATTR_PMKID]); + + if (info->attrs[NL80211_ATTR_MAC]) { + pmksa.bssid = nla_data(info->attrs[NL80211_ATTR_MAC]); + } else if (info->attrs[NL80211_ATTR_SSID]) { + /* SSID based pmksa flush suppported only for FILS, + * OWE/SAE OFFLOAD cases + */ + if (info->attrs[NL80211_ATTR_FILS_CACHE_ID] && + info->attrs[NL80211_ATTR_PMK]) { + pmksa.cache_id = nla_data(info->attrs[NL80211_ATTR_FILS_CACHE_ID]); + } else if (!sae_offload_support && !owe_offload_support) { + return -EINVAL; + } + pmksa.ssid = nla_data(info->attrs[NL80211_ATTR_SSID]); + pmksa.ssid_len = nla_len(info->attrs[NL80211_ATTR_SSID]); + } else { + return -EINVAL; } - if (!rdev_ops) + if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION && + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT && + !((dev->ieee80211_ptr->iftype == NL80211_IFTYPE_AP || + dev->ieee80211_ptr->iftype == NL80211_IFTYPE_P2P_GO) && + ap_pmksa_caching_support)) return -EOPNOTSUPP; - return rdev_ops(&rdev->wiphy, dev, &pmksa); + if (!rdev->ops->del_pmksa) + return -EOPNOTSUPP; + + return rdev_del_pmksa(rdev, dev, &pmksa); } static int nl80211_flush_pmksa(struct sk_buff *skb, struct genl_info *info) @@ -16817,7 +16857,7 @@ static const struct genl_small_ops nl80211_small_ops[] = { { .cmd = NL80211_CMD_SET_PMKSA, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = nl80211_setdel_pmksa, + .doit = nl80211_set_pmksa, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_CLEAR_SKB), @@ -16825,7 +16865,7 @@ static const struct genl_small_ops nl80211_small_ops[] = { { .cmd = NL80211_CMD_DEL_PMKSA, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .doit = nl80211_setdel_pmksa, + .doit = nl80211_del_pmksa, .flags = GENL_UNS_ADMIN_PERM, .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP), }, From 82bf9e7625bebc1aef7271ea8f0c9b256b2a89ce Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Tue, 9 Jan 2024 17:22:33 -0800 Subject: [PATCH 134/139] FROMGIT: BACKPORT: mm/cma: fix placement of trace_cma_alloc_start/finish The current placement of trace_cma_alloc_start/finish misses the fail 
cases: !cma || !cma->count || !cma->bitmap. trace_cma_alloc_finish is also not emitted for the failure case where bitmap_count > bitmap_maxno. Fix these missed cases by moving the start event before the failure checks and moving the finish event to the out label. Link: https://lkml.kernel.org/r/20240110012234.3793639-1-kaleshsingh@google.com Fixes: 7bc1aec5e287 ("mm: cma: add trace events for CMA alloc perf testing") Change-Id: I61153fe078da4f9f3338147f1fbb7697a5554078 Signed-off-by: Kalesh Singh Cc: Minchan Kim Cc: Liam Mark Signed-off-by: Andrew Morton (cherry picked from commit 3b08ab9a811caebe1327f25f51557f95200d94bf https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-unstable) Bug: 315897033 [ Remove ret arg from trace_cma_alloc_finish - Kalesh Singh ] Signed-off-by: Kalesh Singh --- mm/cma.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/cma.c b/mm/cma.c index b64768625d82..6d466c77630f 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -438,6 +438,9 @@ struct page *__cma_alloc(struct cma *cma, unsigned long count, int ret = -ENOMEM; int num_attempts = 0; int max_retries = 5; + const char *name = cma ? cma->name : NULL; + + trace_cma_alloc_start(name, count, align); if (WARN_ON_ONCE((gfp_mask & GFP_KERNEL) == 0 || (gfp_mask & ~(GFP_KERNEL|__GFP_NOWARN|__GFP_NORETRY)) != 0)) @@ -452,8 +455,6 @@ struct page *__cma_alloc(struct cma *cma, unsigned long count, if (!count) goto out; - trace_cma_alloc_start(cma->name, count, align); - mask = cma_bitmap_aligned_mask(cma, align); offset = cma_bitmap_aligned_offset(cma, align); bitmap_maxno = cma_bitmap_maxno(cma); @@ -522,8 +523,6 @@ struct page *__cma_alloc(struct cma *cma, unsigned long count, start = bitmap_no + mask + 1; } - trace_cma_alloc_finish(cma->name, pfn, page, count, align); - /* * CMA can allocate multiple page blocks, which results in different * blocks being marked with different tags. Reset the tags to ignore @@ -542,6 +541,7 @@ struct page *__cma_alloc(struct cma *cma, unsigned long count, pr_debug("%s(): returned %p\n", __func__, page); out: + trace_cma_alloc_finish(name, pfn, page, count, align); if (page) { count_vm_event(CMA_ALLOC_SUCCESS); cma_sysfs_account_success_pages(cma, count); From fd40c1d9017aae068ebbd569f1319588e3f78a34 Mon Sep 17 00:00:00 2001 From: Nikita Ioffe Date: Thu, 11 Jan 2024 10:49:01 +0000 Subject: [PATCH 135/139] ANDROID: add 16k targets for Microdroid kernel Bug: 317201718 Test: tools/bazel run //common:kernel_aarch64_microdroid_16k_dist Change-Id: I542f07d1d0b4f2b6a3c4c58185eee16b2b7f1667 Signed-off-by: Nikita Ioffe --- BUILD.bazel | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/BUILD.bazel b/BUILD.bazel index 6f4d747cdc94..1c1dc8627744 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -198,6 +198,34 @@ copy_to_dist_dir( log = "info", ) +kernel_build( + name = "kernel_aarch64_microdroid_16k", + srcs = ["//common:kernel_aarch64_sources"], + outs = [ + "Image", + "System.map", + "modules.builtin", + "modules.builtin.modinfo", + "vmlinux", + "vmlinux.symvers", + ], + build_config = "build.config.microdroid.aarch64", + make_goals = [ + "Image", + ], + page_size = "16k", +) + +copy_to_dist_dir( + name = "kernel_aarch64_microdroid_16k_dist", + data = [ + ":kernel_aarch64_microdroid_16k", + ], + dist_dir = "out/kernel_aarch64_microdroid_16k/dist", + flat = True, + log = "info", +) + # Microdroid is not a real device. The kernel image is built with special # configs to reduce the size. Hence, not using mixed build. 
kernel_build( From d4db0d5d081dfa2e41a2895c076c33aeb463f21f Mon Sep 17 00:00:00 2001 From: Lianjun Huang Date: Tue, 12 Dec 2023 15:51:36 +0800 Subject: [PATCH 136/139] ANDROID: GKI: add vendor hooks for swapping in ahead Add vendor hooks to capture demand paging during app launch, so we can do it in advance on the next launch. Bug: 315913896 Signed-off-by: Lianjun Huang Signed-off-by: Lianjun Huang Change-Id: I2698fefd347745fb4ff84b111caedbb3bb365ce3 --- drivers/android/vendor_hooks.c | 1 + include/trace/hooks/mm.h | 3 +++ mm/readahead.c | 1 + 3 files changed, 5 insertions(+) diff --git a/drivers/android/vendor_hooks.c b/drivers/android/vendor_hooks.c index ee68032b1918..14bb47fcf8b0 100644 --- a/drivers/android/vendor_hooks.c +++ b/drivers/android/vendor_hooks.c @@ -312,6 +312,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_thermal_unregister); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_get_thermal_zone_device); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_thermal_power_cap); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_enable_thermal_power_throttle); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_read_pages); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_alloc_pages_reclaim_bypass); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_alloc_pages_failure_bypass); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_madvise_pageout_swap_entry); diff --git a/include/trace/hooks/mm.h b/include/trace/hooks/mm.h index 0bd0c34e17b9..50addc57dc10 100644 --- a/include/trace/hooks/mm.h +++ b/include/trace/hooks/mm.h @@ -76,6 +76,9 @@ struct slabinfo; DECLARE_HOOK(android_vh_cache_show, TP_PROTO(struct seq_file *m, struct slabinfo *sinfo, struct kmem_cache *s), TP_ARGS(m, sinfo, s)); +DECLARE_HOOK(android_vh_read_pages, + TP_PROTO(struct readahead_control *ractl), + TP_ARGS(ractl)); DECLARE_HOOK(android_vh_alloc_pages_reclaim_bypass, TP_PROTO(gfp_t gfp_mask, int order, int alloc_flags, int migratetype, struct page **page), diff --git a/mm/readahead.c b/mm/readahead.c index a8620cac2d83..dc5cc73775e7 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -167,6 +167,7 @@ static void read_pages(struct readahead_control *rac) psi_memstall_enter(&rac->_pflags); blk_start_plug(&plug); + trace_android_vh_read_pages(rac); if (aops->readahead) { aops->readahead(rac); /* From a70d3b7bdd7822665db5248f9b556fae267b116c Mon Sep 17 00:00:00 2001 From: Lianjun Huang Date: Mon, 8 Jan 2024 10:59:49 +0800 Subject: [PATCH 137/139] ANDROID: GKI: add symbols of vendor hooks to ABI for swapping in ahead Add symbols of vendor hooks to capture demand paging during app launch, so we can do it in advance on the next launch.
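For context, here is a minimal sketch of how a consumer such as the mi_asap.ko module named in the symbol list below might attach to the new hook; the handler body and naming are hypothetical.

#include <linux/module.h>
#include <linux/pagemap.h>
#include <trace/hooks/mm.h>

/* Record which pages the app demand-faults so they can be prefetched
 * ahead of time on the next launch. */
static void asap_read_pages(void *data, struct readahead_control *ractl)
{
	/* e.g. log (inode, readahead_index(ractl), readahead_count(ractl)) */
}

static int __init asap_init(void)
{
	return register_trace_android_vh_read_pages(asap_read_pages, NULL);
}

static void __exit asap_exit(void)
{
	unregister_trace_android_vh_read_pages(asap_read_pages, NULL);
}

module_init(asap_init);
module_exit(asap_exit);
MODULE_LICENSE("GPL");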
INFO: 1 function symbol(s) added 'int __traceiter_android_vh_read_pages(void*, struct readahead_control*)' 1 variable symbol(s) added 'struct tracepoint __tracepoint_android_vh_read_pages' Bug: 315913896 Signed-off-by: Lianjun Huang Signed-off-by: Lianjun Huang Change-Id: Ibb1e31b6912f7b6b92b76727f7e5043897434def --- android/abi_gki_aarch64.stg | 26 ++++++++++++++++++++++++++ android/abi_gki_aarch64_xiaomi | 4 ++++ 2 files changed, 30 insertions(+) diff --git a/android/abi_gki_aarch64.stg b/android/abi_gki_aarch64.stg index 573e477d70d3..8da5d4a06705 100644 --- a/android/abi_gki_aarch64.stg +++ b/android/abi_gki_aarch64.stg @@ -317298,6 +317298,12 @@ function { parameter_id: 0x064d6086 parameter_id: 0x064d6086 } +function { + id: 0x9b32d0a3 + return_type_id: 0x6720d32f + parameter_id: 0x18bd6530 + parameter_id: 0x275ab027 +} function { id: 0x9b32f2ad return_type_id: 0x6720d32f @@ -337670,6 +337676,15 @@ elf_symbol { type_id: 0x9b3343fb full_name: "__traceiter_android_vh_ra_tuning_max_page" } +elf_symbol { + id: 0xb35da0ec + name: "__traceiter_android_vh_read_pages" + is_defined: true + symbol_type: FUNCTION + crc: 0x4cb21384 + type_id: 0x9b32d0a3 + full_name: "__traceiter_android_vh_read_pages" +} elf_symbol { id: 0x7d069e91 name: "__traceiter_android_vh_record_mutex_lock_starttime" @@ -341675,6 +341690,15 @@ elf_symbol { type_id: 0x18ccbd2c full_name: "__tracepoint_android_vh_ra_tuning_max_page" } +elf_symbol { + id: 0x9fc2933e + name: "__tracepoint_android_vh_read_pages" + is_defined: true + symbol_type: OBJECT + crc: 0xb3878023 + type_id: 0x18ccbd2c + full_name: "__tracepoint_android_vh_read_pages" +} elf_symbol { id: 0x761f292f name: "__tracepoint_android_vh_record_mutex_lock_starttime" @@ -399443,6 +399467,7 @@ interface { symbol_id: 0xf2c39651 symbol_id: 0x93303c51 symbol_id: 0x3a545b61 + symbol_id: 0xb35da0ec symbol_id: 0x7d069e91 symbol_id: 0x0fa39b81 symbol_id: 0x1a91ec8c @@ -399888,6 +399913,7 @@ interface { symbol_id: 0x0e92ee53 symbol_id: 0xb0c197a3 symbol_id: 0x811d5fab + symbol_id: 0x9fc2933e symbol_id: 0x761f292f symbol_id: 0xef7ad117 symbol_id: 0x158c4cfa diff --git a/android/abi_gki_aarch64_xiaomi b/android/abi_gki_aarch64_xiaomi index 1ca73267f242..d502877c9b2c 100644 --- a/android/abi_gki_aarch64_xiaomi +++ b/android/abi_gki_aarch64_xiaomi @@ -341,3 +341,7 @@ #required by zram.ko bioset_init bioset_exit + +#required by mi_asap.ko + __traceiter_android_vh_read_pages + __tracepoint_android_vh_read_pages From 66cd99ccdbea66677398e65065a275d0bdc43482 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Thu, 2 Feb 2023 13:53:29 -0800 Subject: [PATCH 138/139] BACKPORT: UPSTREAM: phy: qcom-qmp: Introduce Kconfig symbols for discrete drivers Introduce a config option for each QMP PHY driver now that the QMP PHY mega-driver has been split up into different modules. This allows kernel configurators to limit the binary size of the kernel by only compiling in the QMP PHY driver that they need. Leave the old config QCOM_QMP in place and make it into a menuconfig so that 'make olddefconfig' continues to work. Furthermore, set the default of the new Kconfig symbols to be QCOM_QMP so that the transition is smooth. 
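For example, a product that only needs the UFS PHY could now trim its configuration to a fragment like the following (hypothetical; the symbols are the ones introduced by this patch):

CONFIG_PHY_QCOM_QMP=y
CONFIG_PHY_QCOM_QMP_UFS=y
# CONFIG_PHY_QCOM_QMP_COMBO is not set
# CONFIG_PHY_QCOM_QMP_PCIE is not set
# CONFIG_PHY_QCOM_QMP_PCIE_8996 is not set
# CONFIG_PHY_QCOM_QMP_USB is not set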
Reviewed-by: Dmitry Baryshkov Reviewed-by: Johan Hovold Signed-off-by: Stephen Boyd Link: https://lore.kernel.org/all/20230202215330.2152726-1-swboyd@chromium.org/ Bug: 319064658 Change-Id: I633e6e1bbc3e79292bfde927e46f84219f0178ae (cherry picked from commit d1abd69534bec16c43c633313e8e937af1354a7a) [quic_kuruva: Resolved minor conflict in drivers/phy/qualcomm/Kconfig ] Signed-off-by: Rajashekar kuruva --- drivers/phy/qualcomm/Kconfig | 50 ++++++++++++++++++++++++++++++++--- drivers/phy/qualcomm/Makefile | 12 ++++----- 2 files changed, 52 insertions(+), 10 deletions(-) diff --git a/drivers/phy/qualcomm/Kconfig b/drivers/phy/qualcomm/Kconfig index 5c98850f5a36..1d3a8062e651 100644 --- a/drivers/phy/qualcomm/Kconfig +++ b/drivers/phy/qualcomm/Kconfig @@ -50,13 +50,55 @@ config PHY_QCOM_PCIE2 Enable this to support the Qualcomm PCIe PHY, used with the Synopsys based PCIe controller. -config PHY_QCOM_QMP - tristate "Qualcomm QMP PHY Driver" +menuconfig PHY_QCOM_QMP + tristate "Qualcomm QMP PHY Drivers" depends on OF && COMMON_CLK && (ARCH_QCOM || COMPILE_TEST) + +if PHY_QCOM_QMP + +config PHY_QCOM_QMP_COMBO + tristate "Qualcomm QMP Combo PHY Driver" + default PHY_QCOM_QMP select GENERIC_PHY help - Enable this to support the QMP PHY transceiver that is used - with controllers such as PCIe, UFS, and USB on Qualcomm chips. + Enable this to support the QMP Combo PHY transceiver that is used + with USB3 and DisplayPort controllers on Qualcomm chips. + +config PHY_QCOM_QMP_PCIE + tristate "Qualcomm QMP PCIe PHY Driver" + depends on PCI || COMPILE_TEST + select GENERIC_PHY + default PHY_QCOM_QMP + help + Enable this to support the QMP PCIe PHY transceiver that is used + with PCIe controllers on Qualcomm chips. + +config PHY_QCOM_QMP_PCIE_8996 + tristate "Qualcomm QMP PCIe 8996 PHY Driver" + depends on PCI || COMPILE_TEST + select GENERIC_PHY + default PHY_QCOM_QMP + help + Enable this to support the QMP PCIe PHY transceiver that is used + with PCIe controllers on Qualcomm msm8996 chips. + +config PHY_QCOM_QMP_UFS + tristate "Qualcomm QMP UFS PHY Driver" + select GENERIC_PHY + default PHY_QCOM_QMP + help + Enable this to support the QMP UFS PHY transceiver that is used + with UFS controllers on Qualcomm chips. + +config PHY_QCOM_QMP_USB + tristate "Qualcomm QMP USB PHY Driver" + select GENERIC_PHY + default PHY_QCOM_QMP + help + Enable this to support the QMP USB PHY transceiver that is used + with USB3 controllers on Qualcomm chips. 
+ +endif # PHY_QCOM_QMP config PHY_QCOM_QUSB2 tristate "Qualcomm QUSB2 PHY Driver" diff --git a/drivers/phy/qualcomm/Makefile b/drivers/phy/qualcomm/Makefile index 65f6c30a3e93..79dd4e507961 100644 --- a/drivers/phy/qualcomm/Makefile +++ b/drivers/phy/qualcomm/Makefile @@ -5,12 +5,12 @@ obj-$(CONFIG_PHY_QCOM_EDP) += phy-qcom-edp.o obj-$(CONFIG_PHY_QCOM_IPQ4019_USB) += phy-qcom-ipq4019-usb.o obj-$(CONFIG_PHY_QCOM_IPQ806X_SATA) += phy-qcom-ipq806x-sata.o obj-$(CONFIG_PHY_QCOM_PCIE2) += phy-qcom-pcie2.o -obj-$(CONFIG_PHY_QCOM_QMP) += \ - phy-qcom-qmp-combo.o \ - phy-qcom-qmp-pcie.o \ - phy-qcom-qmp-pcie-msm8996.o \ - phy-qcom-qmp-ufs.o \ - phy-qcom-qmp-usb.o + +obj-$(CONFIG_PHY_QCOM_QMP_COMBO) += phy-qcom-qmp-combo.o +obj-$(CONFIG_PHY_QCOM_QMP_PCIE) += phy-qcom-qmp-pcie.o +obj-$(CONFIG_PHY_QCOM_QMP_PCIE_8996) += phy-qcom-qmp-pcie-msm8996.o +obj-$(CONFIG_PHY_QCOM_QMP_UFS) += phy-qcom-qmp-ufs.o +obj-$(CONFIG_PHY_QCOM_QMP_USB) += phy-qcom-qmp-usb.o obj-$(CONFIG_PHY_QCOM_QUSB2) += phy-qcom-qusb2.o obj-$(CONFIG_PHY_QCOM_USB_HS) += phy-qcom-usb-hs.o From df1cdb0a703318d8804872fa93e30208ad14398c Mon Sep 17 00:00:00 2001 From: Qian-Hao Huang Date: Mon, 15 Jan 2024 19:58:07 +0800 Subject: [PATCH 139/139] ANDROID: Update the pixel symbol list These symbols are needed as part of an upgrade to v6.1: - add_uevent_var - aes_encrypt - aes_expandkey - alloc_skb_with_frags - cpufreq_quick_get_max - cpuidle_governor_latency_req - cpu_topology - crypto_shash_final - datagram_poll - debugfs_create_blob - dev_pm_qos_add_notifier - dev_pm_qos_add_request - dev_pm_qos_remove_notifier - dev_pm_qos_remove_request - dma_direct_alloc - dma_direct_free - dma_get_sgtable_attrs - firmware_request_nowarn - idr_alloc_cyclic - in_egroup_p - init_user_ns - iov_iter_revert - __ipv6_addr_type - kernel_bind - kernel_connect - kernel_getsockname - kernel_recvmsg - kernel_sendmsg - kmem_cache_create_usercopy - ksize - lock_sock_nested - mempool_alloc - mempool_alloc_slab - mempool_create - mempool_destroy - mempool_free - mempool_free_slab - napi_gro_flush - netif_tx_lock - netif_tx_unlock - ns_capable_noaudit - param_get_string - param_set_copystring - param_set_int - pci_disable_msi - pcie_capability_read_word - pci_iomap - pci_iounmap - pci_irq_vector - pci_release_region - pci_request_region - pm_system_wakeup - proto_register - proto_unregister - radix_tree_iter_delete - radix_tree_next_chunk - _raw_read_lock_irq - _raw_read_unlock_irq - _raw_write_lock_bh - _raw_write_unlock_bh - refcount_dec_not_one - register_netevent_notifier - regulator_set_load - release_sock - seq_vprintf - sk_alloc - skb_coalesce_rx_frag - skb_copy_datagram_iter - skb_free_datagram - __skb_pad - skb_recv_datagram - skb_set_owner_w - skb_store_bits - sk_free - sock_alloc_send_pskb - sock_create_kern - sock_gettstamp - sock_init_data - sock_no_accept - sock_no_listen - sock_no_mmap - sock_no_sendpage - sock_no_shutdown - sock_no_socketpair - sock_queue_rcv_skb_reason - sock_register - sock_setsockopt - sock_unregister - strchrnul - unregister_netevent_notifier - vscnprintf - wait_for_completion_killable - wireless_send_event - __xa_insert - xa_store Bug: 303533633 Bug: 308924989 Change-Id: Ifbc09d5025f1bd3416f136fabd344ef2452390a8 Signed-off-by: Qian-Hao Huang --- android/abi_gki_aarch64.stg | 30 +++++++++++ android/abi_gki_aarch64_pixel | 95 +++++++++++++++++++++++++++++++++++ 2 files changed, 125 insertions(+) diff --git a/android/abi_gki_aarch64.stg b/android/abi_gki_aarch64.stg index 8da5d4a06705..a7a7e77c8c92 100644 --- 
a/android/abi_gki_aarch64.stg +++ b/android/abi_gki_aarch64.stg @@ -354353,6 +354353,24 @@ elf_symbol { type_id: 0x1023f4f6 full_name: "dma_contiguous_default_area" } +elf_symbol { + id: 0x279bd3a7 + name: "dma_direct_alloc" + is_defined: true + symbol_type: FUNCTION + crc: 0xb02b3af0 + type_id: 0x55df36a2 + full_name: "dma_direct_alloc" +} +elf_symbol { + id: 0x0e847130 + name: "dma_direct_free" + is_defined: true + symbol_type: FUNCTION + crc: 0x051debf6 + type_id: 0x13db1955 + full_name: "dma_direct_free" +} elf_symbol { id: 0xd13969dd name: "dma_fence_add_callback" @@ -372911,6 +372929,15 @@ elf_symbol { type_id: 0xfcd23386 full_name: "ns_capable" } +elf_symbol { + id: 0x27a870d1 + name: "ns_capable_noaudit" + is_defined: true + symbol_type: FUNCTION + crc: 0x3c75b0e0 + type_id: 0xfcd23386 + full_name: "ns_capable_noaudit" +} elf_symbol { id: 0xf68f8b33 name: "ns_to_kernel_old_timeval" @@ -401319,6 +401346,8 @@ interface { symbol_id: 0x710f1fc2 symbol_id: 0xaa54a71f symbol_id: 0x5f554bc7 + symbol_id: 0x279bd3a7 + symbol_id: 0x0e847130 symbol_id: 0xd13969dd symbol_id: 0xe2ee283f symbol_id: 0xf5808a3e @@ -403381,6 +403410,7 @@ interface { symbol_id: 0xea37502b symbol_id: 0x0bb7f730 symbol_id: 0xb65e3baf + symbol_id: 0x27a870d1 symbol_id: 0xf68f8b33 symbol_id: 0xfab02ca8 symbol_id: 0xd7668767 diff --git a/android/abi_gki_aarch64_pixel b/android/abi_gki_aarch64_pixel index d0f6d7be74ff..fac865d11e03 100644 --- a/android/abi_gki_aarch64_pixel +++ b/android/abi_gki_aarch64_pixel @@ -3,8 +3,11 @@ add_cpu add_timer add_timer_on + add_uevent_var add_wait_queue adjust_managed_page_count + aes_encrypt + aes_expandkey alarm_cancel alarm_init alarm_start_relative @@ -19,6 +22,7 @@ __alloc_percpu __alloc_percpu_gfp __alloc_skb + alloc_skb_with_frags alloc_workqueue alt_cb_patch_nops amba_bustype @@ -243,6 +247,7 @@ cpufreq_get_policy cpufreq_policy_transition_delay_us cpufreq_quick_get + cpufreq_quick_get_max cpufreq_register_driver cpufreq_register_governor cpufreq_register_notifier @@ -261,6 +266,7 @@ cpu_hwcaps cpuidle_driver_state_disabled cpuidle_get_driver + cpuidle_governor_latency_req cpu_latency_qos_add_request cpu_latency_qos_remove_request cpu_latency_qos_update_request @@ -276,6 +282,7 @@ cpus_read_lock cpus_read_unlock cpu_subsys + cpu_topology crc32_be crc32_le crc8 @@ -298,6 +305,7 @@ crypto_register_shash crypto_req_done crypto_shash_digest + crypto_shash_final crypto_shash_finup crypto_shash_setkey crypto_shash_update @@ -311,10 +319,12 @@ csum_partial csum_tcpudp_nofold _ctype + datagram_poll deactivate_task debugfs_attr_read debugfs_attr_write debugfs_create_atomic_t + debugfs_create_blob debugfs_create_bool debugfs_create_devm_seqfile debugfs_create_dir @@ -499,7 +509,11 @@ dev_pm_opp_of_remove_table dev_pm_opp_put dev_pm_opp_set_config + dev_pm_qos_add_notifier + dev_pm_qos_add_request dev_pm_qos_read_value + dev_pm_qos_remove_notifier + dev_pm_qos_remove_request dev_pm_qos_update_request _dev_printk dev_printk_emit @@ -540,6 +554,8 @@ dma_buf_unmap_attachment dma_buf_vmap dma_buf_vunmap + dma_direct_alloc + dma_direct_free dmaengine_unmap_put dma_fence_add_callback dma_fence_array_create @@ -559,6 +575,7 @@ dma_fence_wait_timeout dma_free_attrs dma_free_pages + dma_get_sgtable_attrs dma_get_slave_caps dma_get_slave_channel dma_heap_add @@ -865,6 +882,7 @@ find_task_by_vpid find_vma_intersection finish_wait + firmware_request_nowarn flush_dcache_page flush_delayed_work flush_work @@ -1037,6 +1055,7 @@ ida_destroy ida_free idr_alloc + idr_alloc_cyclic idr_destroy idr_find 
idr_for_each @@ -1056,6 +1075,7 @@ in6_pton in_aton inc_zone_page_state + in_egroup_p inet_csk_get_port init_dummy_netdev init_iova_domain @@ -1065,6 +1085,7 @@ __init_swait_queue_head init_task init_timer_key + init_user_ns init_uts_ns init_wait_entry __init_waitqueue_head @@ -1132,8 +1153,10 @@ io_schedule_timeout iounmap iova_domain_init_rcaches + iov_iter_revert ip_compute_csum ip_send_check + __ipv6_addr_type __irq_alloc_descs __irq_apply_affinity_hint irq_create_mapping_affinity @@ -1165,11 +1188,16 @@ jiffies_to_usecs kasan_flag_enabled kasprintf + kernel_bind + kernel_connect kernel_cpustat + kernel_getsockname kernel_kobj kernel_param_lock kernel_param_unlock + kernel_recvmsg kernel_restart + kernel_sendmsg kernfs_find_and_get_ns kernfs_notify kernfs_path_from_node @@ -1196,6 +1224,7 @@ kmalloc_trace kmem_cache_alloc kmem_cache_create + kmem_cache_create_usercopy kmem_cache_destroy kmem_cache_free kmemdup @@ -1210,6 +1239,7 @@ kobject_uevent_env kobj_sysfs_ops krealloc + ksize kstat kstrdup kstrndup @@ -1274,6 +1304,7 @@ __list_del_entry_valid list_sort __local_bh_enable_ip + lock_sock_nested log_abnormal_wakeup_reason log_post_read_mmio log_post_write_mmio @@ -1299,6 +1330,12 @@ memdup_user_nul memmove memparse + mempool_alloc + mempool_alloc_slab + mempool_create + mempool_destroy + mempool_free + mempool_free_slab memremap mem_section memset @@ -1350,6 +1387,7 @@ napi_complete_done napi_disable napi_enable + napi_gro_flush napi_gro_receive __napi_schedule napi_schedule_prep @@ -1366,7 +1404,9 @@ netif_receive_skb netif_receive_skb_list netif_rx + netif_tx_lock netif_tx_stop_all_queues + netif_tx_unlock netif_tx_wake_queue netlink_broadcast __netlink_kernel_create @@ -1393,6 +1433,7 @@ nr_cpu_ids nr_irqs ns_capable + ns_capable_noaudit nsec_to_clock_t ns_to_timespec64 __num_online_cpus @@ -1494,6 +1535,7 @@ panic_notifier_list param_array_ops param_get_int + param_get_string param_ops_bool param_ops_byte param_ops_charp @@ -1502,10 +1544,14 @@ param_ops_string param_ops_uint param_ops_ulong + param_set_copystring + param_set_int pci_alloc_irq_vectors_affinity pci_assign_resource pci_clear_master pci_disable_device + pci_disable_msi + pcie_capability_read_word pci_enable_device pci_enable_wake pci_find_bus @@ -1513,6 +1559,9 @@ pci_find_ext_capability pci_free_irq_vectors pci_get_device + pci_iomap + pci_iounmap + pci_irq_vector pci_load_and_free_saved_state pci_load_saved_state pci_msi_mask_irq @@ -1520,7 +1569,9 @@ pci_read_config_dword pci_read_config_word __pci_register_driver + pci_release_region pci_release_regions + pci_request_region pci_rescan_bus pci_restore_msi_state pci_restore_state @@ -1618,6 +1669,7 @@ __pm_runtime_use_autosuspend __pm_stay_awake pm_stay_awake + pm_system_wakeup pm_wakeup_dev_event pm_wakeup_ws_event power_supply_changed @@ -1652,6 +1704,8 @@ proc_remove proc_set_size proc_symlink + proto_register + proto_unregister pskb_expand_head __pskb_pull_tail ___pskb_trim @@ -1672,7 +1726,9 @@ radix_tree_delete_item radix_tree_gang_lookup radix_tree_insert + radix_tree_iter_delete radix_tree_lookup + radix_tree_next_chunk radix_tree_preload ___ratelimit raw_notifier_call_chain @@ -1680,9 +1736,11 @@ raw_notifier_chain_unregister _raw_read_lock _raw_read_lock_bh + _raw_read_lock_irq _raw_read_lock_irqsave _raw_read_unlock _raw_read_unlock_bh + _raw_read_unlock_irq _raw_read_unlock_irqrestore _raw_spin_lock _raw_spin_lock_bh @@ -1696,9 +1754,11 @@ _raw_spin_unlock_irq _raw_spin_unlock_irqrestore _raw_write_lock + _raw_write_lock_bh _raw_write_lock_irq 
_raw_write_lock_irqsave _raw_write_unlock + _raw_write_unlock_bh _raw_write_unlock_irq _raw_write_unlock_irqrestore rb_erase @@ -1713,6 +1773,7 @@ rdev_get_drvdata rdev_get_id reboot_mode + refcount_dec_not_one refcount_warn_saturate __refrigerator regcache_cache_only @@ -1730,6 +1791,7 @@ register_netdev register_netdevice register_netdevice_notifier + register_netevent_notifier register_oom_notifier register_pernet_device register_pernet_subsys @@ -1772,11 +1834,13 @@ regulator_notifier_call_chain regulator_put regulator_set_active_discharge_regmap + regulator_set_load regulator_set_voltage regulator_set_voltage_sel_regmap regulator_unregister release_firmware __release_region + release_sock remap_pfn_range remap_vmalloc_range remove_cpu @@ -1877,6 +1941,7 @@ seq_read seq_release seq_release_private + seq_vprintf seq_write set_capacity set_capacity_and_notify @@ -1917,20 +1982,25 @@ single_open single_open_size single_release + sk_alloc skb_add_rx_frag skb_checksum skb_checksum_help skb_clone skb_clone_sk + skb_coalesce_rx_frag skb_complete_wifi_ack skb_copy skb_copy_bits + skb_copy_datagram_iter skb_copy_expand skb_dequeue skb_dequeue_tail skb_ensure_writable + skb_free_datagram __skb_get_hash __skb_gso_segment + __skb_pad skb_pull skb_push skb_put @@ -1938,7 +2008,11 @@ skb_queue_purge skb_queue_tail skb_realloc_headroom + skb_recv_datagram + skb_set_owner_w + skb_store_bits skb_trim + sk_free skip_spaces smp_call_function smp_call_function_single @@ -2015,8 +2089,22 @@ snd_soc_unregister_component snprintf soc_device_register + sock_alloc_send_pskb __sock_create + sock_create_kern + sock_gettstamp + sock_init_data + sock_no_accept + sock_no_listen + sock_no_mmap + sock_no_sendpage + sock_no_shutdown + sock_no_socketpair + sock_queue_rcv_skb_reason + sock_register sock_release + sock_setsockopt + sock_unregister sock_wfree softnet_data sort @@ -2054,6 +2142,7 @@ strcasecmp strcat strchr + strchrnul strcmp strcpy strcspn @@ -2456,6 +2545,7 @@ unregister_netdevice_many unregister_netdevice_notifier unregister_netdevice_queue + unregister_netevent_notifier unregister_oom_notifier unregister_pernet_device unregister_pernet_subsys @@ -2604,6 +2694,7 @@ vring_del_virtqueue vring_interrupt vring_new_virtqueue + vscnprintf vsnprintf vunmap vzalloc @@ -2611,6 +2702,7 @@ wait_for_completion wait_for_completion_interruptible wait_for_completion_interruptible_timeout + wait_for_completion_killable wait_for_completion_timeout wait_woken __wake_up @@ -2628,6 +2720,7 @@ watchdog_set_restart_priority watchdog_unregister_device wireless_nlevent_flush + wireless_send_event woken_wake_function work_busy __write_overflow_field @@ -2639,11 +2732,13 @@ xa_find xa_find_after xa_get_mark + __xa_insert xa_load xa_set_mark xas_find xas_pause __xa_store + xa_store __xfrm_state_destroy xfrm_state_lookup_byspi xfrm_stateonly_find