From 6c00ef0d3c680c1d7a852c7a6a656df1851d2951 Mon Sep 17 00:00:00 2001 From: tangmeng Date: Thu, 27 Jan 2022 15:15:09 +0800 Subject: [PATCH 001/334] drivers/pcmcia: Fix typo in comment Replace unavaibale with unavailable. Signed-off-by: tangmeng Signed-off-by: Dominik Brodowski --- drivers/pcmcia/rsrc_nonstatic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pcmcia/rsrc_nonstatic.c b/drivers/pcmcia/rsrc_nonstatic.c index 6b6c578b5f92..ad1141fddb4c 100644 --- a/drivers/pcmcia/rsrc_nonstatic.c +++ b/drivers/pcmcia/rsrc_nonstatic.c @@ -394,7 +394,7 @@ static int do_validate_mem(struct pcmcia_socket *s, * do_mem_probe() checks a memory region for use by the PCMCIA subsystem. * To do so, the area is split up into sensible parts, and then passed * into the @validate() function. Only if @validate() and @fallback() fail, - * the area is marked as unavaibale for use by the PCMCIA subsystem. The + * the area is marked as unavailable for use by the PCMCIA subsystem. The * function returns the size of the usable memory area. */ static int do_mem_probe(struct pcmcia_socket *s, u_long base, u_long num, From 3928cf08334ed895a31458cbebd8d4ec6d84c080 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 23 Jan 2022 09:40:31 -0800 Subject: [PATCH 002/334] pcmcia: db1xxx_ss: restrict to MIPS_DB1XXX boards When the MIPS_ALCHEMY board selection is MIPS_XXS1500 instead of MIPS_DB1XXX, the PCMCIA driver 'db1xxx_ss' has build errors due to missing DB1XXX symbols. The PCMCIA driver should be restricted to MIPS_DB1XXX instead of MIPS_ALCHEMY to fix this build error. ERROR: modpost: "bcsr_read" [drivers/pcmcia/db1xxx_ss.ko] undefined! ERROR: modpost: "bcsr_mod" [drivers/pcmcia/db1xxx_ss.ko] undefined! Fixes: 42a4f17dc356 ("MIPS: Alchemy: remove SOC_AU1X00 in favor of MIPS_ALCHEMY") Signed-off-by: Randy Dunlap Reported-by: kernel test robot Cc: Arnd Bergmann Cc: Daniel Vetter Cc: Kees Cook Cc: Thomas Bogendoerfer Cc: linux-mips@vger.kernel.org Acked-by: Manuel Lauss Signed-off-by: Dominik Brodowski --- drivers/pcmcia/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pcmcia/Kconfig b/drivers/pcmcia/Kconfig index ab53eab635f6..1740a63b814d 100644 --- a/drivers/pcmcia/Kconfig +++ b/drivers/pcmcia/Kconfig @@ -151,7 +151,7 @@ config TCIC config PCMCIA_ALCHEMY_DEVBOARD tristate "Alchemy Db/Pb1xxx PCMCIA socket services" - depends on MIPS_ALCHEMY && PCMCIA + depends on MIPS_DB1XXX && PCMCIA help Enable this driver of you want PCMCIA support on your Alchemy Db1000, Db/Pb1100, Db/Pb1500, Db/Pb1550, Db/Pb1200, DB1300 From f20e232d74ee0ace386be0b7db1ff993ea69b4c4 Mon Sep 17 00:00:00 2001 From: Tinghan Shen Date: Mon, 21 Mar 2022 14:03:40 +0800 Subject: [PATCH 003/334] remoteproc: mediatek: Fix side effect of mt8195 sram power on The definition of L1TCM_SRAM_PDN bits on mt8195 is different to mt8192. L1TCM_SRAM_PDN bits[3:0] control the power of mt8195 L1TCM SRAM. L1TCM_SRAM_PDN bits[7:4] control the access path to EMI for SCP. These bits have to be powered on to allow EMI access for SCP. Bits[7:4] also affect audio DSP because audio DSP and SCP are placed on the same hardware bus. If SCP cannot access EMI, audio DSP is blocked too. L1TCM_SRAM_PDN bits[31:8] are not used. This fix removes modification of bits[7:4] when power on/off mt8195 SCP L1TCM. It's because the modification introduces a short period of time blocking audio DSP to access EMI. This was not a problem until we have to load both SCP module and audio DSP module. 
audio DSP needs to access EMI because it has source/data on DRAM. Audio DSP will have unexpected behavior when it accesses EMI and the SCP driver blocks the EMI path at the same time. Fixes: 79111df414fc ("remoteproc: mediatek: Support mt8195 scp") Signed-off-by: Tinghan Shen Reviewed-by: AngeloGioacchino Del Regno Reviewed-by: Matthias Brugger Link: https://lore.kernel.org/r/20220321060340.10975-1-tinghan.shen@mediatek.com Signed-off-by: Mathieu Poirier --- drivers/remoteproc/mtk_common.h | 2 + drivers/remoteproc/mtk_scp.c | 69 +++++++++++++++++++++++++-------- 2 files changed, 54 insertions(+), 17 deletions(-) diff --git a/drivers/remoteproc/mtk_common.h b/drivers/remoteproc/mtk_common.h index 71ce4977cb0b..ea6fa1100a00 100644 --- a/drivers/remoteproc/mtk_common.h +++ b/drivers/remoteproc/mtk_common.h @@ -54,6 +54,8 @@ #define MT8192_CORE0_WDT_IRQ 0x10030 #define MT8192_CORE0_WDT_CFG 0x10034 +#define MT8195_L1TCM_SRAM_PDN_RESERVED_RSI_BITS GENMASK(7, 4) + #define SCP_FW_VER_LEN 32 #define SCP_SHARE_BUFFER_SIZE 288 diff --git a/drivers/remoteproc/mtk_scp.c b/drivers/remoteproc/mtk_scp.c index 38609153bf64..ee6c4009586e 100644 --- a/drivers/remoteproc/mtk_scp.c +++ b/drivers/remoteproc/mtk_scp.c @@ -365,22 +365,22 @@ static int mt8183_scp_before_load(struct mtk_scp *scp) return 0; } -static void mt8192_power_on_sram(void __iomem *addr) +static void scp_sram_power_on(void __iomem *addr, u32 reserved_mask) { int i; for (i = 31; i >= 0; i--) - writel(GENMASK(i, 0), addr); + writel(GENMASK(i, 0) & ~reserved_mask, addr); writel(0, addr); } -static void mt8192_power_off_sram(void __iomem *addr) +static void scp_sram_power_off(void __iomem *addr, u32 reserved_mask) { int i; writel(0, addr); for (i = 0; i < 32; i++) - writel(GENMASK(i, 0), addr); + writel(GENMASK(i, 0) & ~reserved_mask, addr); } static int mt8186_scp_before_load(struct mtk_scp *scp) @@ -393,7 +393,7 @@ static int mt8186_scp_before_load(struct mtk_scp *scp) writel(0x0, scp->reg_base + MT8183_SCP_CLK_DIV_SEL); /* Turn on the power of SCP's SRAM before using it. Enable 1 block per time*/ - mt8192_power_on_sram(scp->reg_base + MT8183_SCP_SRAM_PDN); + scp_sram_power_on(scp->reg_base + MT8183_SCP_SRAM_PDN, 0); /* Initialize TCM before loading FW. 
*/ writel(0x0, scp->reg_base + MT8183_SCP_L1_SRAM_PD); @@ -412,11 +412,32 @@ static int mt8192_scp_before_load(struct mtk_scp *scp) writel(1, scp->reg_base + MT8192_CORE0_SW_RSTN_SET); /* enable SRAM clock */ - mt8192_power_on_sram(scp->reg_base + MT8192_L2TCM_SRAM_PD_0); - mt8192_power_on_sram(scp->reg_base + MT8192_L2TCM_SRAM_PD_1); - mt8192_power_on_sram(scp->reg_base + MT8192_L2TCM_SRAM_PD_2); - mt8192_power_on_sram(scp->reg_base + MT8192_L1TCM_SRAM_PDN); - mt8192_power_on_sram(scp->reg_base + MT8192_CPU0_SRAM_PD); + scp_sram_power_on(scp->reg_base + MT8192_L2TCM_SRAM_PD_0, 0); + scp_sram_power_on(scp->reg_base + MT8192_L2TCM_SRAM_PD_1, 0); + scp_sram_power_on(scp->reg_base + MT8192_L2TCM_SRAM_PD_2, 0); + scp_sram_power_on(scp->reg_base + MT8192_L1TCM_SRAM_PDN, 0); + scp_sram_power_on(scp->reg_base + MT8192_CPU0_SRAM_PD, 0); + + /* enable MPU for all memory regions */ + writel(0xff, scp->reg_base + MT8192_CORE0_MEM_ATT_PREDEF); + + return 0; +} + +static int mt8195_scp_before_load(struct mtk_scp *scp) +{ + /* clear SPM interrupt, SCP2SPM_IPC_CLR */ + writel(0xff, scp->reg_base + MT8192_SCP2SPM_IPC_CLR); + + writel(1, scp->reg_base + MT8192_CORE0_SW_RSTN_SET); + + /* enable SRAM clock */ + scp_sram_power_on(scp->reg_base + MT8192_L2TCM_SRAM_PD_0, 0); + scp_sram_power_on(scp->reg_base + MT8192_L2TCM_SRAM_PD_1, 0); + scp_sram_power_on(scp->reg_base + MT8192_L2TCM_SRAM_PD_2, 0); + scp_sram_power_on(scp->reg_base + MT8192_L1TCM_SRAM_PDN, + MT8195_L1TCM_SRAM_PDN_RESERVED_RSI_BITS); + scp_sram_power_on(scp->reg_base + MT8192_CPU0_SRAM_PD, 0); /* enable MPU for all memory regions */ writel(0xff, scp->reg_base + MT8192_CORE0_MEM_ATT_PREDEF); @@ -572,11 +593,25 @@ static void mt8183_scp_stop(struct mtk_scp *scp) static void mt8192_scp_stop(struct mtk_scp *scp) { /* Disable SRAM clock */ - mt8192_power_off_sram(scp->reg_base + MT8192_L2TCM_SRAM_PD_0); - mt8192_power_off_sram(scp->reg_base + MT8192_L2TCM_SRAM_PD_1); - mt8192_power_off_sram(scp->reg_base + MT8192_L2TCM_SRAM_PD_2); - mt8192_power_off_sram(scp->reg_base + MT8192_L1TCM_SRAM_PDN); - mt8192_power_off_sram(scp->reg_base + MT8192_CPU0_SRAM_PD); + scp_sram_power_off(scp->reg_base + MT8192_L2TCM_SRAM_PD_0, 0); + scp_sram_power_off(scp->reg_base + MT8192_L2TCM_SRAM_PD_1, 0); + scp_sram_power_off(scp->reg_base + MT8192_L2TCM_SRAM_PD_2, 0); + scp_sram_power_off(scp->reg_base + MT8192_L1TCM_SRAM_PDN, 0); + scp_sram_power_off(scp->reg_base + MT8192_CPU0_SRAM_PD, 0); + + /* Disable SCP watchdog */ + writel(0, scp->reg_base + MT8192_CORE0_WDT_CFG); +} + +static void mt8195_scp_stop(struct mtk_scp *scp) +{ + /* Disable SRAM clock */ + scp_sram_power_off(scp->reg_base + MT8192_L2TCM_SRAM_PD_0, 0); + scp_sram_power_off(scp->reg_base + MT8192_L2TCM_SRAM_PD_1, 0); + scp_sram_power_off(scp->reg_base + MT8192_L2TCM_SRAM_PD_2, 0); + scp_sram_power_off(scp->reg_base + MT8192_L1TCM_SRAM_PDN, + MT8195_L1TCM_SRAM_PDN_RESERVED_RSI_BITS); + scp_sram_power_off(scp->reg_base + MT8192_CPU0_SRAM_PD, 0); /* Disable SCP watchdog */ writel(0, scp->reg_base + MT8192_CORE0_WDT_CFG); @@ -922,11 +957,11 @@ static const struct mtk_scp_of_data mt8192_of_data = { static const struct mtk_scp_of_data mt8195_of_data = { .scp_clk_get = mt8195_scp_clk_get, - .scp_before_load = mt8192_scp_before_load, + .scp_before_load = mt8195_scp_before_load, .scp_irq_handler = mt8192_scp_irq_handler, .scp_reset_assert = mt8192_scp_reset_assert, .scp_reset_deassert = mt8192_scp_reset_deassert, - .scp_stop = mt8192_scp_stop, + .scp_stop = mt8195_scp_stop, .scp_da_to_va = mt8192_scp_da_to_va, 
.host_to_scp_reg = MT8192_GIPC_IN_SET, .host_to_scp_int_bit = MT8192_HOST_IPC_INT_BIT, From 68d9787bdd5cbfa76e795c4015f8d58b84955a01 Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Tue, 29 Mar 2022 21:16:16 +0530 Subject: [PATCH 004/334] remoteproc: Don't bother checking the return value of debugfs_create* DebugFS APIs are designed to return only the error pointers and not NULL in the case of failure. So these return pointers are safe to be passed on to the successive debugfs_create* APIs. Therefore, let's just get rid of the checks. Signed-off-by: Manivannan Sadhasivam Link: https://lore.kernel.org/r/20220329154616.58902-1-manivannan.sadhasivam@linaro.org Signed-off-by: Mathieu Poirier --- drivers/remoteproc/remoteproc_core.c | 4 ---- drivers/remoteproc/remoteproc_debugfs.c | 17 ++--------------- 2 files changed, 2 insertions(+), 19 deletions(-) diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c index c510125769b9..3c8382f66ce2 100644 --- a/drivers/remoteproc/remoteproc_core.c +++ b/drivers/remoteproc/remoteproc_core.c @@ -684,10 +684,6 @@ static int rproc_handle_trace(struct rproc *rproc, void *ptr, /* create the debugfs entry */ trace->tfile = rproc_create_trace_file(name, rproc, trace); - if (!trace->tfile) { - kfree(trace); - return -EINVAL; - } list_add_tail(&trace->node, &rproc->traces); diff --git a/drivers/remoteproc/remoteproc_debugfs.c b/drivers/remoteproc/remoteproc_debugfs.c index 581930483ef8..b86c1d09c70c 100644 --- a/drivers/remoteproc/remoteproc_debugfs.c +++ b/drivers/remoteproc/remoteproc_debugfs.c @@ -386,16 +386,8 @@ void rproc_remove_trace_file(struct dentry *tfile) struct dentry *rproc_create_trace_file(const char *name, struct rproc *rproc, struct rproc_debug_trace *trace) { - struct dentry *tfile; - - tfile = debugfs_create_file(name, 0400, rproc->dbg_dir, trace, + return debugfs_create_file(name, 0400, rproc->dbg_dir, trace, &trace_rproc_ops); - if (!tfile) { - dev_err(&rproc->dev, "failed to create debugfs trace entry\n"); - return NULL; - } - - return tfile; } void rproc_delete_debug_dir(struct rproc *rproc) @@ -411,8 +403,6 @@ void rproc_create_debug_dir(struct rproc *rproc) return; rproc->dbg_dir = debugfs_create_dir(dev_name(dev), rproc_dbg); - if (!rproc->dbg_dir) - return; debugfs_create_file("name", 0400, rproc->dbg_dir, rproc, &rproc_name_ops); @@ -430,11 +420,8 @@ void rproc_create_debug_dir(struct rproc *rproc) void __init rproc_init_debugfs(void) { - if (debugfs_initialized()) { + if (debugfs_initialized()) rproc_dbg = debugfs_create_dir(KBUILD_MODNAME, NULL); - if (!rproc_dbg) - pr_err("can't create debugfs dir\n"); - } } void __exit rproc_exit_debugfs(void) From eac3e5b1c12f85732e60f5f8b985444d273866bb Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 9 Apr 2022 08:27:54 +0200 Subject: [PATCH 005/334] remoteproc: mtk_scp: Fix a potential double free 'scp->rproc' is allocated using devm_rproc_alloc(), so there is no need to free it explicitly in the remove function. 
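For illustration, a minimal sketch of the devres ownership this relies on (the "foo" driver, foo_rproc_ops and struct foo_priv below are hypothetical, not part of mtk_scp): memory returned by devm_rproc_alloc() is released by the driver core when the device is unbound, so an explicit rproc_free() in the remove path would free the same object a second time.

#include <linux/platform_device.h>
#include <linux/remoteproc.h>

struct foo_priv { int dummy; };			/* hypothetical private data */
static const struct rproc_ops foo_rproc_ops;	/* hypothetical, empty for the sketch */

static int foo_probe(struct platform_device *pdev)
{
	struct rproc *rproc;

	/* Owned by devres: freed automatically when &pdev->dev is unbound. */
	rproc = devm_rproc_alloc(&pdev->dev, dev_name(&pdev->dev),
				 &foo_rproc_ops, "foo_fw.elf",
				 sizeof(struct foo_priv));
	if (!rproc)
		return -ENOMEM;

	platform_set_drvdata(pdev, rproc);

	return rproc_add(rproc);
}

static int foo_remove(struct platform_device *pdev)
{
	struct rproc *rproc = platform_get_drvdata(pdev);

	rproc_del(rproc);
	/* No rproc_free() here: devres still owns the allocation. */

	return 0;
}
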
Fixes: c1407ac1099a ("remoteproc: mtk_scp: Use devm variant of rproc_alloc()") Signed-off-by: Christophe JAILLET Reviewed-by: AngeloGioacchino Del Regno Link: https://lore.kernel.org/r/1d15023b4afb94591435c48482fe1276411b9a07.1648981531.git.christophe.jaillet@wanadoo.fr Signed-off-by: Mathieu Poirier --- drivers/remoteproc/mtk_scp.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/remoteproc/mtk_scp.c b/drivers/remoteproc/mtk_scp.c index 38609153bf64..4a0e6c4bc6f4 100644 --- a/drivers/remoteproc/mtk_scp.c +++ b/drivers/remoteproc/mtk_scp.c @@ -877,7 +877,6 @@ static int scp_remove(struct platform_device *pdev) for (i = 0; i < SCP_IPI_MAX; i++) mutex_destroy(&scp->ipi_desc[i].lock); mutex_destroy(&scp->send_lock); - rproc_free(scp->rproc); return 0; } From f340d5a19dc76c6b29eeb23537fd7345611e9117 Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Wed, 13 Apr 2022 11:30:37 +0800 Subject: [PATCH 006/334] remoteproc: elf_loader: skip segment with memsz as zero Per elf specification, p_filesz: This member gives the number of bytes in the file image of the segment; it may be zero. p_memsz: This member gives the number of bytes in the memory image of the segment; it may be zero. There is a case that i.MX DSP firmware has segment with PT_LOAD and p_memsz/p_filesz set to zero. Such segment needs to be ignored, otherwize rproc_da_to_va would report error. Signed-off-by: Peng Fan Link: https://lore.kernel.org/r/20220413033038.1715945-2-peng.fan@oss.nxp.com Signed-off-by: Mathieu Poirier --- drivers/remoteproc/remoteproc_elf_loader.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/remoteproc/remoteproc_elf_loader.c b/drivers/remoteproc/remoteproc_elf_loader.c index d635d19a5aa8..5a412d7b6e0b 100644 --- a/drivers/remoteproc/remoteproc_elf_loader.c +++ b/drivers/remoteproc/remoteproc_elf_loader.c @@ -181,7 +181,7 @@ int rproc_elf_load_segments(struct rproc *rproc, const struct firmware *fw) bool is_iomem = false; void *ptr; - if (type != PT_LOAD) + if (type != PT_LOAD || !memsz) continue; dev_dbg(dev, "phdr: type %d da 0x%llx memsz 0x%llx filesz 0x%llx\n", From c7457143668a98e5ddb0ab92d1c61d8899f7ac74 Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Wed, 13 Apr 2022 11:30:38 +0800 Subject: [PATCH 007/334] remoteproc: imx_dsp_rproc: use common rproc_elf_load_segments remoteproc elf loader supports the specific case that segments have PT_LOAD and memsz/filesz set to zero, so no duplicate code. Acked-by: Daniel Baluta Signed-off-by: Peng Fan Link: https://lore.kernel.org/r/20220413033038.1715945-3-peng.fan@oss.nxp.com Signed-off-by: Mathieu Poirier --- drivers/remoteproc/imx_dsp_rproc.c | 95 +----------------------------- 1 file changed, 1 insertion(+), 94 deletions(-) diff --git a/drivers/remoteproc/imx_dsp_rproc.c b/drivers/remoteproc/imx_dsp_rproc.c index 2abee78df96e..eee3c44c2146 100644 --- a/drivers/remoteproc/imx_dsp_rproc.c +++ b/drivers/remoteproc/imx_dsp_rproc.c @@ -649,99 +649,6 @@ static int imx_dsp_rproc_add_carveout(struct imx_dsp_rproc *priv) return 0; } -/** - * imx_dsp_rproc_elf_load_segments() - load firmware segments to memory - * @rproc: remote processor which will be booted using these fw segments - * @fw: the ELF firmware image - * - * This function specially checks if memsz is zero or not, otherwise it - * is mostly same as rproc_elf_load_segments(). 
- */ -static int imx_dsp_rproc_elf_load_segments(struct rproc *rproc, - const struct firmware *fw) -{ - struct device *dev = &rproc->dev; - u8 class = fw_elf_get_class(fw); - u32 elf_phdr_get_size = elf_size_of_phdr(class); - const u8 *elf_data = fw->data; - const void *ehdr, *phdr; - int i, ret = 0; - u16 phnum; - - ehdr = elf_data; - phnum = elf_hdr_get_e_phnum(class, ehdr); - phdr = elf_data + elf_hdr_get_e_phoff(class, ehdr); - - /* go through the available ELF segments */ - for (i = 0; i < phnum; i++, phdr += elf_phdr_get_size) { - u64 da = elf_phdr_get_p_paddr(class, phdr); - u64 memsz = elf_phdr_get_p_memsz(class, phdr); - u64 filesz = elf_phdr_get_p_filesz(class, phdr); - u64 offset = elf_phdr_get_p_offset(class, phdr); - u32 type = elf_phdr_get_p_type(class, phdr); - void *ptr; - - /* - * There is a case that with PT_LOAD type, the - * filesz = memsz = 0. If memsz = 0, rproc_da_to_va - * should return NULL ptr, then error is returned. - * So this case should be skipped from the loop. - * Add !memsz checking here. - */ - if (type != PT_LOAD || !memsz) - continue; - - dev_dbg(dev, "phdr: type %d da 0x%llx memsz 0x%llx filesz 0x%llx\n", - type, da, memsz, filesz); - - if (filesz > memsz) { - dev_err(dev, "bad phdr filesz 0x%llx memsz 0x%llx\n", - filesz, memsz); - ret = -EINVAL; - break; - } - - if (offset + filesz > fw->size) { - dev_err(dev, "truncated fw: need 0x%llx avail 0x%zx\n", - offset + filesz, fw->size); - ret = -EINVAL; - break; - } - - if (!rproc_u64_fit_in_size_t(memsz)) { - dev_err(dev, "size (%llx) does not fit in size_t type\n", - memsz); - ret = -EOVERFLOW; - break; - } - - /* grab the kernel address for this device address */ - ptr = rproc_da_to_va(rproc, da, memsz, NULL); - if (!ptr) { - dev_err(dev, "bad phdr da 0x%llx mem 0x%llx\n", da, - memsz); - ret = -EINVAL; - break; - } - - /* put the segment where the remote processor expects it */ - if (filesz) - memcpy(ptr, elf_data + offset, filesz); - - /* - * Zero out remaining memory for this segment. - * - * This isn't strictly required since dma_alloc_coherent already - * did this for us. albeit harmless, we may consider removing - * this. - */ - if (memsz > filesz) - memset(ptr + filesz, 0, memsz - filesz); - } - - return ret; -} - /* Prepare function for rproc_ops */ static int imx_dsp_rproc_prepare(struct rproc *rproc) { @@ -808,7 +715,7 @@ static const struct rproc_ops imx_dsp_rproc_ops = { .start = imx_dsp_rproc_start, .stop = imx_dsp_rproc_stop, .kick = imx_dsp_rproc_kick, - .load = imx_dsp_rproc_elf_load_segments, + .load = rproc_elf_load_segments, .parse_fw = rproc_elf_load_rsc_table, .sanity_check = rproc_elf_sanity_check, .get_boot_addr = rproc_elf_get_boot_addr, From 79a43db93399cfc7825a908a03320677c52ef919 Mon Sep 17 00:00:00 2001 From: Daniel Baluta Date: Thu, 31 Mar 2022 13:32:37 +0300 Subject: [PATCH 008/334] remoteproc: imx_dsp_rproc: Make rsc_table optional There are cases when we want to test a simple "hello world" app on the DSP and we do not need a resource table. remoteproc core allows us having an optional rsc_table. 
Signed-off-by: Daniel Baluta Acked-by: Shengjiu Wang Link: https://lore.kernel.org/r/20220331103237.340796-1-daniel.baluta@oss.nxp.com Signed-off-by: Mathieu Poirier --- drivers/remoteproc/imx_dsp_rproc.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/remoteproc/imx_dsp_rproc.c b/drivers/remoteproc/imx_dsp_rproc.c index eee3c44c2146..ca0817f8e41e 100644 --- a/drivers/remoteproc/imx_dsp_rproc.c +++ b/drivers/remoteproc/imx_dsp_rproc.c @@ -709,6 +709,14 @@ static void imx_dsp_rproc_kick(struct rproc *rproc, int vqid) dev_err(dev, "%s: failed (%d, err:%d)\n", __func__, vqid, err); } +static int imx_dsp_rproc_parse_fw(struct rproc *rproc, const struct firmware *fw) +{ + if (rproc_elf_load_rsc_table(rproc, fw)) + dev_warn(&rproc->dev, "no resource table found for this firmware\n"); + + return 0; +} + static const struct rproc_ops imx_dsp_rproc_ops = { .prepare = imx_dsp_rproc_prepare, .unprepare = imx_dsp_rproc_unprepare, @@ -716,7 +724,7 @@ static const struct rproc_ops imx_dsp_rproc_ops = { .stop = imx_dsp_rproc_stop, .kick = imx_dsp_rproc_kick, .load = rproc_elf_load_segments, - .parse_fw = rproc_elf_load_rsc_table, + .parse_fw = imx_dsp_rproc_parse_fw, .sanity_check = rproc_elf_sanity_check, .get_boot_addr = rproc_elf_get_boot_addr, }; From 8f454f950dbb663180f596db18c3dc7ec26497f0 Mon Sep 17 00:00:00 2001 From: Shengjiu Wang Date: Mon, 28 Mar 2022 10:20:11 +0800 Subject: [PATCH 009/334] remoteproc: core: Remove state checking before calling rproc_boot() There is no mutex protection of the state checking before rproc_boot(), which can't guarantee there is no another instance is trying to do same operation. Consider two instances case: Instance1: echo start > /sys/class/remoteproc/remoteproc0/state Instance2: echo start > /sys/class/remoteproc/remoteproc0/state ... Instance2: echo stop > /sys/class/remoteproc/remoteproc0/state ... Instance1: echo stop > /sys/class/remoteproc/remoteproc0/state The one issue is that the instance2 case may success when 'start' happens at same time as instance1, then rproc->power = 2; Or it may fail with -BUSY, then rproc->power = 1; which is uncertain. The another issue is for 'stop' operation, if the rproc->power = 1, when instance2 'stop' the remoteproc the instance1 will be impacted for it still needs the service at that time. The reference counter rproc->power is used to manage state changing and there is mutex protection in each operation function for multi instance case. So remove this state checking in rproc_cdev_write() and state_store() for 'start' operation, just let reference counter rproc->power to manage the behaviors. 
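As an illustrative sketch (simplified, not the exact remoteproc_core.c implementation; power_up_hw() and power_down_hw() are hypothetical helpers), this is the rproc->power pattern referred to above: under rproc->lock, only the 0 -> 1 transition boots the remote processor and only the last user shuts it down, so an unprotected state check before rproc_boot() adds nothing.

#include <linux/remoteproc.h>

static int power_up_hw(struct rproc *rproc);	/* hypothetical */
static int power_down_hw(struct rproc *rproc);	/* hypothetical */

static int start_request(struct rproc *rproc)
{
	int ret = 0;

	mutex_lock(&rproc->lock);
	/* Only the first user actually boots the remote processor. */
	if (atomic_inc_return(&rproc->power) == 1)
		ret = power_up_hw(rproc);
	mutex_unlock(&rproc->lock);

	return ret;
}

static int stop_request(struct rproc *rproc)
{
	int ret = 0;

	mutex_lock(&rproc->lock);
	/* If the remote processor is still needed by another user, bail out. */
	if (atomic_dec_and_test(&rproc->power))
		ret = power_down_hw(rproc);
	mutex_unlock(&rproc->lock);

	return ret;
}
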
Signed-off-by: Shengjiu Wang Link: https://lore.kernel.org/r/1648434012-16655-2-git-send-email-shengjiu.wang@nxp.com Signed-off-by: Mathieu Poirier --- drivers/remoteproc/remoteproc_cdev.c | 4 ---- drivers/remoteproc/remoteproc_sysfs.c | 4 ---- 2 files changed, 8 deletions(-) diff --git a/drivers/remoteproc/remoteproc_cdev.c b/drivers/remoteproc/remoteproc_cdev.c index 906ff3c4dfdd..62001eda780c 100644 --- a/drivers/remoteproc/remoteproc_cdev.c +++ b/drivers/remoteproc/remoteproc_cdev.c @@ -32,10 +32,6 @@ static ssize_t rproc_cdev_write(struct file *filp, const char __user *buf, size_ return -EFAULT; if (!strncmp(cmd, "start", len)) { - if (rproc->state == RPROC_RUNNING || - rproc->state == RPROC_ATTACHED) - return -EBUSY; - ret = rproc_boot(rproc); } else if (!strncmp(cmd, "stop", len)) { if (rproc->state != RPROC_RUNNING && diff --git a/drivers/remoteproc/remoteproc_sysfs.c b/drivers/remoteproc/remoteproc_sysfs.c index 51a04bc6ba7a..ac64d69085ab 100644 --- a/drivers/remoteproc/remoteproc_sysfs.c +++ b/drivers/remoteproc/remoteproc_sysfs.c @@ -194,10 +194,6 @@ static ssize_t state_store(struct device *dev, int ret = 0; if (sysfs_streq(buf, "start")) { - if (rproc->state == RPROC_RUNNING || - rproc->state == RPROC_ATTACHED) - return -EBUSY; - ret = rproc_boot(rproc); if (ret) dev_err(&rproc->dev, "Boot failed: %d\n", ret); From 5e6a0e05270e3a4bb9289a0415d062966c27d192 Mon Sep 17 00:00:00 2001 From: Shengjiu Wang Date: Mon, 28 Mar 2022 10:20:12 +0800 Subject: [PATCH 010/334] remoteproc: core: Move state checking to remoteproc_core There is no mutex protection of these state checking for 'stop' and 'detach' which can't guarantee there is no another instance is trying to do same operation. Consider two instances case: Instance1: echo stop > /sys/class/remoteproc/remoteproc0/state Instance2: echo stop > /sys/class/remoteproc/remoteproc0/state The issue is that the instance2 case may success, Or it may fail with -EINVAL, which is uncertain. So move this state checking in rproc_cdev_write() and state_store() for 'stop', 'detach' operation to 'rproc_shutdown' , 'rproc_detach' function under the mutex protection. 
Signed-off-by: Shengjiu Wang Link: https://lore.kernel.org/r/1648434012-16655-3-git-send-email-shengjiu.wang@nxp.com Signed-off-by: Mathieu Poirier --- drivers/remoteproc/remoteproc_cdev.c | 7 ------- drivers/remoteproc/remoteproc_core.c | 11 +++++++++++ drivers/remoteproc/remoteproc_sysfs.c | 7 ------- 3 files changed, 11 insertions(+), 14 deletions(-) diff --git a/drivers/remoteproc/remoteproc_cdev.c b/drivers/remoteproc/remoteproc_cdev.c index 62001eda780c..687f205fd70a 100644 --- a/drivers/remoteproc/remoteproc_cdev.c +++ b/drivers/remoteproc/remoteproc_cdev.c @@ -34,15 +34,8 @@ static ssize_t rproc_cdev_write(struct file *filp, const char __user *buf, size_ if (!strncmp(cmd, "start", len)) { ret = rproc_boot(rproc); } else if (!strncmp(cmd, "stop", len)) { - if (rproc->state != RPROC_RUNNING && - rproc->state != RPROC_ATTACHED) - return -EINVAL; - ret = rproc_shutdown(rproc); } else if (!strncmp(cmd, "detach", len)) { - if (rproc->state != RPROC_ATTACHED) - return -EINVAL; - ret = rproc_detach(rproc); } else { dev_err(&rproc->dev, "Unrecognized option\n"); diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c index 3c8382f66ce2..02a04ab34a23 100644 --- a/drivers/remoteproc/remoteproc_core.c +++ b/drivers/remoteproc/remoteproc_core.c @@ -2071,6 +2071,12 @@ int rproc_shutdown(struct rproc *rproc) return ret; } + if (rproc->state != RPROC_RUNNING && + rproc->state != RPROC_ATTACHED) { + ret = -EINVAL; + goto out; + } + /* if the remote proc is still needed, bail out */ if (!atomic_dec_and_test(&rproc->power)) goto out; @@ -2130,6 +2136,11 @@ int rproc_detach(struct rproc *rproc) return ret; } + if (rproc->state != RPROC_ATTACHED) { + ret = -EINVAL; + goto out; + } + /* if the remote proc is still needed, bail out */ if (!atomic_dec_and_test(&rproc->power)) { ret = 0; diff --git a/drivers/remoteproc/remoteproc_sysfs.c b/drivers/remoteproc/remoteproc_sysfs.c index ac64d69085ab..8c7ea8922638 100644 --- a/drivers/remoteproc/remoteproc_sysfs.c +++ b/drivers/remoteproc/remoteproc_sysfs.c @@ -198,15 +198,8 @@ static ssize_t state_store(struct device *dev, if (ret) dev_err(&rproc->dev, "Boot failed: %d\n", ret); } else if (sysfs_streq(buf, "stop")) { - if (rproc->state != RPROC_RUNNING && - rproc->state != RPROC_ATTACHED) - return -EINVAL; - ret = rproc_shutdown(rproc); } else if (sysfs_streq(buf, "detach")) { - if (rproc->state != RPROC_ATTACHED) - return -EINVAL; - ret = rproc_detach(rproc); } else { dev_err(&rproc->dev, "Unrecognised option: %s\n", buf); From 58b7c856519fe946620ee68dd0c37bd3c695484a Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Fri, 15 Apr 2022 10:57:37 +0800 Subject: [PATCH 011/334] remoteproc: imx_rproc: Ignore create mem entry for resource table Resource table is used by Linux to get information published by remote processor. It should be not be used for memory allocation, so not create rproc mem entry. 
Fixes: b29b4249f8f0 ("remoteproc: imx_rproc: add i.MX specific parse fw hook") Signed-off-by: Peng Fan Link: https://lore.kernel.org/r/20220415025737.1561976-1-peng.fan@oss.nxp.com Signed-off-by: Mathieu Poirier --- drivers/remoteproc/imx_rproc.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/remoteproc/imx_rproc.c b/drivers/remoteproc/imx_rproc.c index 7a096f1891e6..91eb037089ef 100644 --- a/drivers/remoteproc/imx_rproc.c +++ b/drivers/remoteproc/imx_rproc.c @@ -423,6 +423,9 @@ static int imx_rproc_prepare(struct rproc *rproc) if (!strcmp(it.node->name, "vdev0buffer")) continue; + if (!strcmp(it.node->name, "rsc-table")) + continue; + rmem = of_reserved_mem_lookup(it.node); if (!rmem) { dev_err(priv->dev, "unable to acquire memory-region\n"); From b7da6f517214c307efece604ac9dc58dc6123c07 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=ADcolas=20F=2E=20R=2E=20A=2E=20Prado?= Date: Fri, 25 Feb 2022 17:58:52 -0500 Subject: [PATCH 012/334] dt-bindings: remoteproc: mediatek: Add interrupts property to mtk,scp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The SCP node can have an associated interrupt. Add a property for it. Signed-off-by: Nícolas F. R. A. Prado Acked-by: Rob Herring Signed-off-by: Bjorn Andersson Link: https://lore.kernel.org/r/20220225225854.81038-2-nfraprado@collabora.com --- Documentation/devicetree/bindings/remoteproc/mtk,scp.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Documentation/devicetree/bindings/remoteproc/mtk,scp.yaml b/Documentation/devicetree/bindings/remoteproc/mtk,scp.yaml index 5b693a2d049c..823a236242de 100644 --- a/Documentation/devicetree/bindings/remoteproc/mtk,scp.yaml +++ b/Documentation/devicetree/bindings/remoteproc/mtk,scp.yaml @@ -42,6 +42,9 @@ properties: clock-names: const: main + interrupts: + maxItems: 1 + required: - compatible - reg From c7078972038a04549cfb1ab1f3f6554db9c31446 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Tue, 25 Jan 2022 13:34:27 +0100 Subject: [PATCH 013/334] pwm-sun4i: Convert "next_period" to local variable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Its value is calculated in sun4i_pwm_apply() and is used only there. 
Signed-off-by: Max Kellermann Reviewed-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- drivers/pwm/pwm-sun4i.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/pwm/pwm-sun4i.c b/drivers/pwm/pwm-sun4i.c index 16d75f9aa36a..24db8c9b4bc6 100644 --- a/drivers/pwm/pwm-sun4i.c +++ b/drivers/pwm/pwm-sun4i.c @@ -89,7 +89,6 @@ struct sun4i_pwm_chip { void __iomem *base; spinlock_t ctrl_lock; const struct sun4i_pwm_data *data; - unsigned long next_period[2]; }; static inline struct sun4i_pwm_chip *to_sun4i_pwm_chip(struct pwm_chip *chip) @@ -237,6 +236,7 @@ static int sun4i_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, int ret; unsigned int delay_us, prescaler = 0; unsigned long now; + unsigned long next_period; bool bypass; pwm_get_state(pwm, &cstate); @@ -284,7 +284,7 @@ static int sun4i_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, val = (duty & PWM_DTY_MASK) | PWM_PRD(period); sun4i_pwm_writel(sun4i_pwm, val, PWM_CH_PRD(pwm->hwpwm)); - sun4i_pwm->next_period[pwm->hwpwm] = jiffies + + next_period = jiffies + nsecs_to_jiffies(cstate.period + 1000); if (state->polarity != PWM_POLARITY_NORMAL) @@ -306,9 +306,8 @@ static int sun4i_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, /* We need a full period to elapse before disabling the channel. */ now = jiffies; - if (time_before(now, sun4i_pwm->next_period[pwm->hwpwm])) { - delay_us = jiffies_to_usecs(sun4i_pwm->next_period[pwm->hwpwm] - - now); + if (time_before(now, next_period)) { + delay_us = jiffies_to_usecs(next_period - now); if ((delay_us / 500) > MAX_UDELAY_MS) msleep(delay_us / 1000 + 1); else From ba3e5037ceeb920e98970e2342edd7ab390217c0 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Tue, 25 Jan 2022 13:34:28 +0100 Subject: [PATCH 014/334] pwm-sun4i: Calculate "delay_jiffies" directly, eliminate absolute time MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Basically this code did "jiffies + period - jiffies", and we can simply eliminate the "jiffies" time stamp here. Signed-off-by: Max Kellermann Acked-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- drivers/pwm/pwm-sun4i.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/drivers/pwm/pwm-sun4i.c b/drivers/pwm/pwm-sun4i.c index 24db8c9b4bc6..9134db4f52a9 100644 --- a/drivers/pwm/pwm-sun4i.c +++ b/drivers/pwm/pwm-sun4i.c @@ -234,9 +234,8 @@ static int sun4i_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, struct pwm_state cstate; u32 ctrl, duty = 0, period = 0, val; int ret; + unsigned int delay_jiffies; unsigned int delay_us, prescaler = 0; - unsigned long now; - unsigned long next_period; bool bypass; pwm_get_state(pwm, &cstate); @@ -284,8 +283,6 @@ static int sun4i_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, val = (duty & PWM_DTY_MASK) | PWM_PRD(period); sun4i_pwm_writel(sun4i_pwm, val, PWM_CH_PRD(pwm->hwpwm)); - next_period = jiffies + - nsecs_to_jiffies(cstate.period + 1000); if (state->polarity != PWM_POLARITY_NORMAL) ctrl &= ~BIT_CH(PWM_ACT_STATE, pwm->hwpwm); @@ -305,14 +302,12 @@ static int sun4i_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, return 0; /* We need a full period to elapse before disabling the channel. 
*/ - now = jiffies; - if (time_before(now, next_period)) { - delay_us = jiffies_to_usecs(next_period - now); - if ((delay_us / 500) > MAX_UDELAY_MS) - msleep(delay_us / 1000 + 1); - else - usleep_range(delay_us, delay_us * 2); - } + delay_jiffies = nsecs_to_jiffies(cstate.period + 1000); + delay_us = jiffies_to_usecs(delay_jiffies); + if ((delay_us / 500) > MAX_UDELAY_MS) + msleep(delay_us / 1000 + 1); + else + usleep_range(delay_us, delay_us * 2); spin_lock(&sun4i_pwm->ctrl_lock); ctrl = sun4i_pwm_readl(sun4i_pwm, PWM_CTRL_REG); From 8246b478a23aafdef0e1deb2ac2f21eeee97e877 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Tue, 25 Jan 2022 13:34:29 +0100 Subject: [PATCH 015/334] pwm-sun4i: Calculate the delay without rounding down to jiffies MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This fixes a problem that was supposed to be addressed by commit 6eefb79d6f5bc ("pwm: sun4i: Remove erroneous else branch") - backlight could not be switched off on some Allwinner A20. The commit was correct, but was not a reliable fix for the problem, which was timing related. The real problem for the backlight switching problem was that sleeping for a full period did not work, because delay_us is always zero. It is zero because the period (plus 1 microsecond) is rounded down to the next "jiffies", but the period is less than one jiffy. On my Cubieboard 2, the period is 5ms, and 1 jiffy (at the default HZ=100) is 10ms, so nsecs_to_jiffies(10ms+1us)=0. The roundtrip from nanoseconds to jiffies and back to microseconds is an unnecessary loss of precision; always rounding down (via nsecs_to_jiffies()) then causes the breakage. This patch eliminates this roundtrip, and directly converts from nanoseconds to microseconds (for usleep_range()), using DIV_ROUND_UP_ULL() to force rounding up. This way, the sleep time is never zero, and after the sleep, we are guaranteed to be in a different period, and the device is ready for another control command for sure. Signed-off-by: Max Kellermann Acked-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- drivers/pwm/pwm-sun4i.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/pwm/pwm-sun4i.c b/drivers/pwm/pwm-sun4i.c index 9134db4f52a9..c8445b0a3339 100644 --- a/drivers/pwm/pwm-sun4i.c +++ b/drivers/pwm/pwm-sun4i.c @@ -234,7 +234,6 @@ static int sun4i_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, struct pwm_state cstate; u32 ctrl, duty = 0, period = 0, val; int ret; - unsigned int delay_jiffies; unsigned int delay_us, prescaler = 0; bool bypass; @@ -302,8 +301,7 @@ static int sun4i_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, return 0; /* We need a full period to elapse before disabling the channel. */ - delay_jiffies = nsecs_to_jiffies(cstate.period + 1000); - delay_us = jiffies_to_usecs(delay_jiffies); + delay_us = DIV_ROUND_UP_ULL(cstate.period, NSEC_PER_USEC); if ((delay_us / 500) > MAX_UDELAY_MS) msleep(delay_us / 1000 + 1); else From fdaa6efce9aa83fa3c6033cb890548c584e1fded Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 2 Mar 2022 19:00:00 +0100 Subject: [PATCH 016/334] pwm: atmel-tcb: Drop duplicated tracking of per-channel data MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per-channel data is tracked using struct pwm_device::chip_data and struct atmel_tcb_pwm_chip::pwms[]. Simplify by using the latter consistently. 
Signed-off-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- drivers/pwm/pwm-atmel-tcb.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/pwm/pwm-atmel-tcb.c b/drivers/pwm/pwm-atmel-tcb.c index 36f7ea381838..bea622f53ba8 100644 --- a/drivers/pwm/pwm-atmel-tcb.c +++ b/drivers/pwm/pwm-atmel-tcb.c @@ -72,7 +72,8 @@ static int atmel_tcb_pwm_set_polarity(struct pwm_chip *chip, struct pwm_device *pwm, enum pwm_polarity polarity) { - struct atmel_tcb_pwm_device *tcbpwm = pwm_get_chip_data(pwm); + struct atmel_tcb_pwm_chip *tcbpwmc = to_tcb_chip(chip); + struct atmel_tcb_pwm_device *tcbpwm = tcbpwmc->pwms[pwm->hwpwm]; tcbpwm->polarity = polarity; @@ -97,7 +98,6 @@ static int atmel_tcb_pwm_request(struct pwm_chip *chip, return ret; } - pwm_set_chip_data(pwm, tcbpwm); tcbpwm->polarity = PWM_POLARITY_NORMAL; tcbpwm->duty = 0; tcbpwm->period = 0; @@ -139,7 +139,7 @@ static int atmel_tcb_pwm_request(struct pwm_chip *chip, static void atmel_tcb_pwm_free(struct pwm_chip *chip, struct pwm_device *pwm) { struct atmel_tcb_pwm_chip *tcbpwmc = to_tcb_chip(chip); - struct atmel_tcb_pwm_device *tcbpwm = pwm_get_chip_data(pwm); + struct atmel_tcb_pwm_device *tcbpwm = tcbpwmc->pwms[pwm->hwpwm]; clk_disable_unprepare(tcbpwmc->clk); tcbpwmc->pwms[pwm->hwpwm] = NULL; @@ -149,7 +149,7 @@ static void atmel_tcb_pwm_free(struct pwm_chip *chip, struct pwm_device *pwm) static void atmel_tcb_pwm_disable(struct pwm_chip *chip, struct pwm_device *pwm) { struct atmel_tcb_pwm_chip *tcbpwmc = to_tcb_chip(chip); - struct atmel_tcb_pwm_device *tcbpwm = pwm_get_chip_data(pwm); + struct atmel_tcb_pwm_device *tcbpwm = tcbpwmc->pwms[pwm->hwpwm]; unsigned cmr; enum pwm_polarity polarity = tcbpwm->polarity; @@ -206,7 +206,7 @@ static void atmel_tcb_pwm_disable(struct pwm_chip *chip, struct pwm_device *pwm) static int atmel_tcb_pwm_enable(struct pwm_chip *chip, struct pwm_device *pwm) { struct atmel_tcb_pwm_chip *tcbpwmc = to_tcb_chip(chip); - struct atmel_tcb_pwm_device *tcbpwm = pwm_get_chip_data(pwm); + struct atmel_tcb_pwm_device *tcbpwm = tcbpwmc->pwms[pwm->hwpwm]; u32 cmr; enum pwm_polarity polarity = tcbpwm->polarity; @@ -291,7 +291,7 @@ static int atmel_tcb_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, int duty_ns, int period_ns) { struct atmel_tcb_pwm_chip *tcbpwmc = to_tcb_chip(chip); - struct atmel_tcb_pwm_device *tcbpwm = pwm_get_chip_data(pwm); + struct atmel_tcb_pwm_device *tcbpwm = tcbpwmc->pwms[pwm->hwpwm]; struct atmel_tcb_pwm_device *atcbpwm = NULL; int i = 0; int slowclk = 0; From f643490e1bf941600f6105e4d27c49054fb6d562 Mon Sep 17 00:00:00 2001 From: Sean Anderson Date: Thu, 3 Mar 2022 17:35:42 -0500 Subject: [PATCH 017/334] dt-bindings: pwm: Add Xilinx AXI Timer This adds a binding for the Xilinx LogiCORE IP AXI Timer. This device is a "soft" block, so it has some parameters which would not be configurable in most hardware. This binding is usually automatically generated by Xilinx's tools, so the names and values of some properties should be kept as they are, if possible. In addition, this binding is already in the kernel at arch/microblaze/boot/dts/system.dts, and in user software such as QEMU. The existing driver uses the clock-frequency property, or alternatively the /cpus/timebase-frequency property as its frequency input. Because these properties are deprecated, they have not been included with this schema. All new bindings should use the clocks/clock-names properties to specify the parent clock. 
Because we need to init timer devices so early in boot, we determine if we should use the PWM driver or the clocksource/clockevent driver by the presence/absence, respectively, of #pwm-cells. Because both counters are used by the PWM, there is no need for a separate property specifying which counters are to be used for the PWM. Signed-off-by: Sean Anderson Reviewed-by: Rob Herring Signed-off-by: Thierry Reding --- .../bindings/timer/xlnx,xps-timer.yaml | 92 +++++++++++++++++++ 1 file changed, 92 insertions(+) create mode 100644 Documentation/devicetree/bindings/timer/xlnx,xps-timer.yaml diff --git a/Documentation/devicetree/bindings/timer/xlnx,xps-timer.yaml b/Documentation/devicetree/bindings/timer/xlnx,xps-timer.yaml new file mode 100644 index 000000000000..dd168d41d2e0 --- /dev/null +++ b/Documentation/devicetree/bindings/timer/xlnx,xps-timer.yaml @@ -0,0 +1,92 @@ +# SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/timer/xlnx,xps-timer.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Xilinx LogiCORE IP AXI Timer Device Tree Binding + +maintainers: + - Sean Anderson + +properties: + compatible: + contains: + const: xlnx,xps-timer-1.00.a + + clocks: + maxItems: 1 + + clock-names: + const: s_axi_aclk + + interrupts: + maxItems: 1 + + reg: + maxItems: 1 + + '#pwm-cells': true + + xlnx,count-width: + $ref: /schemas/types.yaml#/definitions/uint32 + enum: [8, 16, 32] + default: 32 + description: + The width of the counter(s), in bits. + + xlnx,one-timer-only: + $ref: /schemas/types.yaml#/definitions/uint32 + enum: [ 0, 1 ] + description: + Whether only one timer is present in this block. + +required: + - compatible + - reg + - xlnx,one-timer-only + +allOf: + - if: + required: + - '#pwm-cells' + then: + allOf: + - required: + - clocks + - properties: + xlnx,one-timer-only: + const: 0 + else: + required: + - interrupts + - if: + required: + - clocks + then: + required: + - clock-names + +additionalProperties: false + +examples: + - | + timer@800e0000 { + clock-names = "s_axi_aclk"; + clocks = <&zynqmp_clk 71>; + compatible = "xlnx,xps-timer-1.00.a"; + reg = <0x800e0000 0x10000>; + interrupts = <0 39 2>; + xlnx,count-width = <16>; + xlnx,one-timer-only = <0x0>; + }; + + timer@800f0000 { + #pwm-cells = <0>; + clock-names = "s_axi_aclk"; + clocks = <&zynqmp_clk 71>; + compatible = "xlnx,xps-timer-1.00.a"; + reg = <0x800e0000 0x10000>; + xlnx,count-width = <32>; + xlnx,one-timer-only = <0x0>; + }; From bc1ce713a0843ba14a1e00d5275ad42a8873a5ce Mon Sep 17 00:00:00 2001 From: Sean Anderson Date: Thu, 3 Mar 2022 17:35:43 -0500 Subject: [PATCH 018/334] pwm: Add support for Xilinx AXI Timer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This adds PWM support for Xilinx LogiCORE IP AXI soft timers commonly found on Xilinx FPGAs. At the moment clock control is very basic: we just enable the clock during probe and pin the frequency. In the future, someone could add support for disabling the clock when not in use. Some common code has been specially demarcated. While currently only used by the PWM driver, it is anticipated that it may be split off in the future to be used by the timer driver as well. This driver was written with reference to Xilinx DS764 for v1.03.a [1]. 
[1] https://www.xilinx.com/support/documentation/ip_documentation/axi_timer/v1_03_a/axi_timer_ds764.pdf Signed-off-by: Sean Anderson Acked-by: Michal Simek Reviewed-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- MAINTAINERS | 6 + arch/microblaze/kernel/timer.c | 4 + drivers/pwm/Kconfig | 14 ++ drivers/pwm/Makefile | 1 + drivers/pwm/pwm-xilinx.c | 321 +++++++++++++++++++++++++++++ include/clocksource/timer-xilinx.h | 73 +++++++ 6 files changed, 419 insertions(+) create mode 100644 drivers/pwm/pwm-xilinx.c create mode 100644 include/clocksource/timer-xilinx.h diff --git a/MAINTAINERS b/MAINTAINERS index 40fa1955ca3f..009ecbc44a73 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -21650,6 +21650,12 @@ F: drivers/misc/Makefile F: drivers/misc/xilinx_sdfec.c F: include/uapi/misc/xilinx_sdfec.h +XILINX PWM DRIVER +M: Sean Anderson +S: Maintained +F: drivers/pwm/pwm-xilinx.c +F: include/clocksource/timer-xilinx.h + XILINX UARTLITE SERIAL DRIVER M: Peter Korsgaard L: linux-serial@vger.kernel.org diff --git a/arch/microblaze/kernel/timer.c b/arch/microblaze/kernel/timer.c index f8832cf49384..26c385582c3b 100644 --- a/arch/microblaze/kernel/timer.c +++ b/arch/microblaze/kernel/timer.c @@ -251,6 +251,10 @@ static int __init xilinx_timer_init(struct device_node *timer) u32 timer_num = 1; int ret; + /* If this property is present, the device is a PWM and not a timer */ + if (of_property_read_bool(timer, "#pwm-cells")) + return 0; + if (initialized) return -EINVAL; diff --git a/drivers/pwm/Kconfig b/drivers/pwm/Kconfig index 21e3b05a5153..cefbf00b4c7e 100644 --- a/drivers/pwm/Kconfig +++ b/drivers/pwm/Kconfig @@ -640,4 +640,18 @@ config PWM_VT8500 To compile this driver as a module, choose M here: the module will be called pwm-vt8500. +config PWM_XILINX + tristate "Xilinx AXI Timer PWM support" + depends on OF_ADDRESS + depends on COMMON_CLK + select REGMAP_MMIO + help + PWM driver for Xilinx LogiCORE IP AXI timers. This timer is + typically a soft core which may be present in Xilinx FPGAs. + This device may also be present in Microblaze soft processors. + If you don't have this IP in your design, choose N. + + To compile this driver as a module, choose M here: the module + will be called pwm-xilinx. + endif diff --git a/drivers/pwm/Makefile b/drivers/pwm/Makefile index 708840b7fba8..ea785480359b 100644 --- a/drivers/pwm/Makefile +++ b/drivers/pwm/Makefile @@ -60,3 +60,4 @@ obj-$(CONFIG_PWM_TWL) += pwm-twl.o obj-$(CONFIG_PWM_TWL_LED) += pwm-twl-led.o obj-$(CONFIG_PWM_VISCONTI) += pwm-visconti.o obj-$(CONFIG_PWM_VT8500) += pwm-vt8500.o +obj-$(CONFIG_PWM_XILINX) += pwm-xilinx.o diff --git a/drivers/pwm/pwm-xilinx.c b/drivers/pwm/pwm-xilinx.c new file mode 100644 index 000000000000..4dab2b86c427 --- /dev/null +++ b/drivers/pwm/pwm-xilinx.c @@ -0,0 +1,321 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Copyright (C) 2021 Sean Anderson + * + * Limitations: + * - When changing both duty cycle and period, we may end up with one cycle + * with the old duty cycle and the new period. This is because the counters + * may only be reloaded by first stopping them, or by letting them be + * automatically reloaded at the end of a cycle. If this automatic reload + * happens after we set TLR0 but before we set TLR1 then we will have a + * bad cycle. This could probably be fixed by reading TCR0 just before + * reprogramming, but I think it would add complexity for little gain. + * - Cannot produce 100% duty cycle by configuring the TLRs. 
This might be + * possible by stopping the counters at an appropriate point in the cycle, + * but this is not (yet) implemented. + * - Only produces "normal" output. + * - Always produces low output if disabled. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * The following functions are "common" to drivers for this device, and may be + * exported at a future date. + */ +u32 xilinx_timer_tlr_cycles(struct xilinx_timer_priv *priv, u32 tcsr, + u64 cycles) +{ + WARN_ON(cycles < 2 || cycles - 2 > priv->max); + + if (tcsr & TCSR_UDT) + return cycles - 2; + return priv->max - cycles + 2; +} + +unsigned int xilinx_timer_get_period(struct xilinx_timer_priv *priv, + u32 tlr, u32 tcsr) +{ + u64 cycles; + + if (tcsr & TCSR_UDT) + cycles = tlr + 2; + else + cycles = (u64)priv->max - tlr + 2; + + /* cycles has a max of 2^32 + 2, so we can't overflow */ + return DIV64_U64_ROUND_UP(cycles * NSEC_PER_SEC, + clk_get_rate(priv->clk)); +} + +/* + * The idea here is to capture whether the PWM is actually running (e.g. + * because we or the bootloader set it up) and we need to be careful to ensure + * we don't cause a glitch. According to the data sheet, to enable the PWM we + * need to + * + * - Set both timers to generate mode (MDT=1) + * - Set both timers to PWM mode (PWMA=1) + * - Enable the generate out signals (GENT=1) + * + * In addition, + * + * - The timer must be running (ENT=1) + * - The timer must auto-reload TLR into TCR (ARHT=1) + * - We must not be in the process of loading TLR into TCR (LOAD=0) + * - Cascade mode must be disabled (CASC=0) + * + * If any of these differ from usual, then the PWM is either disabled, or is + * running in a mode that this driver does not support. + */ +#define TCSR_PWM_SET (TCSR_GENT | TCSR_ARHT | TCSR_ENT | TCSR_PWMA) +#define TCSR_PWM_CLEAR (TCSR_MDT | TCSR_LOAD) +#define TCSR_PWM_MASK (TCSR_PWM_SET | TCSR_PWM_CLEAR) + +struct xilinx_pwm_device { + struct pwm_chip chip; + struct xilinx_timer_priv priv; +}; + +static inline struct xilinx_timer_priv +*xilinx_pwm_chip_to_priv(struct pwm_chip *chip) +{ + return &container_of(chip, struct xilinx_pwm_device, chip)->priv; +} + +static bool xilinx_timer_pwm_enabled(u32 tcsr0, u32 tcsr1) +{ + return ((TCSR_PWM_MASK | TCSR_CASC) & tcsr0) == TCSR_PWM_SET && + (TCSR_PWM_MASK & tcsr1) == TCSR_PWM_SET; +} + +static int xilinx_pwm_apply(struct pwm_chip *chip, struct pwm_device *unused, + const struct pwm_state *state) +{ + struct xilinx_timer_priv *priv = xilinx_pwm_chip_to_priv(chip); + u32 tlr0, tlr1, tcsr0, tcsr1; + u64 period_cycles, duty_cycles; + unsigned long rate; + + if (state->polarity != PWM_POLARITY_NORMAL) + return -EINVAL; + + /* + * To be representable by TLR, cycles must be between 2 and + * priv->max + 2. To enforce this we can reduce the cycles, but we may + * not increase them. Caveat emptor: while this does result in more + * predictable rounding, it may also result in a completely different + * duty cycle (% high time) than what was requested. 
+ */ + rate = clk_get_rate(priv->clk); + /* Avoid overflow */ + period_cycles = min_t(u64, state->period, U32_MAX * NSEC_PER_SEC); + period_cycles = mul_u64_u32_div(period_cycles, rate, NSEC_PER_SEC); + period_cycles = min_t(u64, period_cycles, priv->max + 2); + if (period_cycles < 2) + return -ERANGE; + + /* Same thing for duty cycles */ + duty_cycles = min_t(u64, state->duty_cycle, U32_MAX * NSEC_PER_SEC); + duty_cycles = mul_u64_u32_div(duty_cycles, rate, NSEC_PER_SEC); + duty_cycles = min_t(u64, duty_cycles, priv->max + 2); + + /* + * If we specify 100% duty cycle, we will get 0% instead, so decrease + * the duty cycle count by one. + */ + if (duty_cycles >= period_cycles) + duty_cycles = period_cycles - 1; + + /* Round down to 0% duty cycle for unrepresentable duty cycles */ + if (duty_cycles < 2) + duty_cycles = period_cycles; + + regmap_read(priv->map, TCSR0, &tcsr0); + regmap_read(priv->map, TCSR1, &tcsr1); + tlr0 = xilinx_timer_tlr_cycles(priv, tcsr0, period_cycles); + tlr1 = xilinx_timer_tlr_cycles(priv, tcsr1, duty_cycles); + regmap_write(priv->map, TLR0, tlr0); + regmap_write(priv->map, TLR1, tlr1); + + if (state->enabled) { + /* + * If the PWM is already running, then the counters will be + * reloaded at the end of the current cycle. + */ + if (!xilinx_timer_pwm_enabled(tcsr0, tcsr1)) { + /* Load TLR into TCR */ + regmap_write(priv->map, TCSR0, tcsr0 | TCSR_LOAD); + regmap_write(priv->map, TCSR1, tcsr1 | TCSR_LOAD); + /* Enable timers all at once with ENALL */ + tcsr0 = (TCSR_PWM_SET & ~TCSR_ENT) | (tcsr0 & TCSR_UDT); + tcsr1 = TCSR_PWM_SET | TCSR_ENALL | (tcsr1 & TCSR_UDT); + regmap_write(priv->map, TCSR0, tcsr0); + regmap_write(priv->map, TCSR1, tcsr1); + } + } else { + regmap_write(priv->map, TCSR0, 0); + regmap_write(priv->map, TCSR1, 0); + } + + return 0; +} + +static void xilinx_pwm_get_state(struct pwm_chip *chip, + struct pwm_device *unused, + struct pwm_state *state) +{ + struct xilinx_timer_priv *priv = xilinx_pwm_chip_to_priv(chip); + u32 tlr0, tlr1, tcsr0, tcsr1; + + regmap_read(priv->map, TLR0, &tlr0); + regmap_read(priv->map, TLR1, &tlr1); + regmap_read(priv->map, TCSR0, &tcsr0); + regmap_read(priv->map, TCSR1, &tcsr1); + state->period = xilinx_timer_get_period(priv, tlr0, tcsr0); + state->duty_cycle = xilinx_timer_get_period(priv, tlr1, tcsr1); + state->enabled = xilinx_timer_pwm_enabled(tcsr0, tcsr1); + state->polarity = PWM_POLARITY_NORMAL; + + /* + * 100% duty cycle results in constant low output. 
This may be (very) + * wrong if rate > 1 GHz, so fix this if you have such hardware :) + */ + if (state->period == state->duty_cycle) + state->duty_cycle = 0; +} + +static const struct pwm_ops xilinx_pwm_ops = { + .apply = xilinx_pwm_apply, + .get_state = xilinx_pwm_get_state, + .owner = THIS_MODULE, +}; + +static const struct regmap_config xilinx_pwm_regmap_config = { + .reg_bits = 32, + .reg_stride = 4, + .val_bits = 32, + .val_format_endian = REGMAP_ENDIAN_LITTLE, + .max_register = TCR1, +}; + +static int xilinx_pwm_probe(struct platform_device *pdev) +{ + int ret; + struct device *dev = &pdev->dev; + struct device_node *np = dev->of_node; + struct xilinx_timer_priv *priv; + struct xilinx_pwm_device *xilinx_pwm; + u32 pwm_cells, one_timer, width; + void __iomem *regs; + + /* If there are no PWM cells, this binding is for a timer */ + ret = of_property_read_u32(np, "#pwm-cells", &pwm_cells); + if (ret == -EINVAL) + return -ENODEV; + if (ret) + return dev_err_probe(dev, ret, "could not read #pwm-cells\n"); + + xilinx_pwm = devm_kzalloc(dev, sizeof(*xilinx_pwm), GFP_KERNEL); + if (!xilinx_pwm) + return -ENOMEM; + platform_set_drvdata(pdev, xilinx_pwm); + priv = &xilinx_pwm->priv; + + regs = devm_platform_ioremap_resource(pdev, 0); + if (IS_ERR(regs)) + return PTR_ERR(regs); + + priv->map = devm_regmap_init_mmio(dev, regs, + &xilinx_pwm_regmap_config); + if (IS_ERR(priv->map)) + return dev_err_probe(dev, PTR_ERR(priv->map), + "Could not create regmap\n"); + + ret = of_property_read_u32(np, "xlnx,one-timer-only", &one_timer); + if (ret) + return dev_err_probe(dev, ret, + "Could not read xlnx,one-timer-only\n"); + + if (one_timer) + return dev_err_probe(dev, -EINVAL, + "Two timers required for PWM mode\n"); + + ret = of_property_read_u32(np, "xlnx,count-width", &width); + if (ret == -EINVAL) + width = 32; + else if (ret) + return dev_err_probe(dev, ret, + "Could not read xlnx,count-width\n"); + + if (width != 8 && width != 16 && width != 32) + return dev_err_probe(dev, -EINVAL, + "Invalid counter width %d\n", width); + priv->max = BIT_ULL(width) - 1; + + /* + * The polarity of the Generate Out signals must be active high for PWM + * mode to work. We could determine this from the device tree, but + * alas, such properties are not allowed to be used. 
+ */ + + priv->clk = devm_clk_get(dev, "s_axi_aclk"); + if (IS_ERR(priv->clk)) + return dev_err_probe(dev, PTR_ERR(priv->clk), + "Could not get clock\n"); + + ret = clk_prepare_enable(priv->clk); + if (ret) + return dev_err_probe(dev, ret, "Clock enable failed\n"); + clk_rate_exclusive_get(priv->clk); + + xilinx_pwm->chip.dev = dev; + xilinx_pwm->chip.ops = &xilinx_pwm_ops; + xilinx_pwm->chip.npwm = 1; + ret = pwmchip_add(&xilinx_pwm->chip); + if (ret) { + clk_rate_exclusive_put(priv->clk); + clk_disable_unprepare(priv->clk); + return dev_err_probe(dev, ret, "Could not register PWM chip\n"); + } + + return 0; +} + +static int xilinx_pwm_remove(struct platform_device *pdev) +{ + struct xilinx_pwm_device *xilinx_pwm = platform_get_drvdata(pdev); + + pwmchip_remove(&xilinx_pwm->chip); + clk_rate_exclusive_put(xilinx_pwm->priv.clk); + clk_disable_unprepare(xilinx_pwm->priv.clk); + return 0; +} + +static const struct of_device_id xilinx_pwm_of_match[] = { + { .compatible = "xlnx,xps-timer-1.00.a", }, + {}, +}; +MODULE_DEVICE_TABLE(of, xilinx_pwm_of_match); + +static struct platform_driver xilinx_pwm_driver = { + .probe = xilinx_pwm_probe, + .remove = xilinx_pwm_remove, + .driver = { + .name = "xilinx-pwm", + .of_match_table = of_match_ptr(xilinx_pwm_of_match), + }, +}; +module_platform_driver(xilinx_pwm_driver); + +MODULE_ALIAS("platform:xilinx-pwm"); +MODULE_DESCRIPTION("PWM driver for Xilinx LogiCORE IP AXI Timer"); +MODULE_LICENSE("GPL"); diff --git a/include/clocksource/timer-xilinx.h b/include/clocksource/timer-xilinx.h new file mode 100644 index 000000000000..c0f56fe6d22a --- /dev/null +++ b/include/clocksource/timer-xilinx.h @@ -0,0 +1,73 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * Copyright (C) 2021 Sean Anderson + */ + +#ifndef XILINX_TIMER_H +#define XILINX_TIMER_H + +#include + +#define TCSR0 0x00 +#define TLR0 0x04 +#define TCR0 0x08 +#define TCSR1 0x10 +#define TLR1 0x14 +#define TCR1 0x18 + +#define TCSR_MDT BIT(0) +#define TCSR_UDT BIT(1) +#define TCSR_GENT BIT(2) +#define TCSR_CAPT BIT(3) +#define TCSR_ARHT BIT(4) +#define TCSR_LOAD BIT(5) +#define TCSR_ENIT BIT(6) +#define TCSR_ENT BIT(7) +#define TCSR_TINT BIT(8) +#define TCSR_PWMA BIT(9) +#define TCSR_ENALL BIT(10) +#define TCSR_CASC BIT(11) + +struct clk; +struct device_node; +struct regmap; + +/** + * struct xilinx_timer_priv - Private data for Xilinx AXI timer drivers + * @map: Regmap of the device, possibly with an offset + * @clk: Parent clock + * @max: Maximum value of the counters + */ +struct xilinx_timer_priv { + struct regmap *map; + struct clk *clk; + u32 max; +}; + +/** + * xilinx_timer_tlr_cycles() - Calculate the TLR for a period specified + * in clock cycles + * @priv: The timer's private data + * @tcsr: The value of the TCSR register for this counter + * @cycles: The number of cycles in this period + * + * Callers of this function MUST ensure that @cycles is representable as + * a TLR. 
+ * + * Return: The calculated value for TLR + */ +u32 xilinx_timer_tlr_cycles(struct xilinx_timer_priv *priv, u32 tcsr, + u64 cycles); + +/** + * xilinx_timer_get_period() - Get the current period of a counter + * @priv: The timer's private data + * @tlr: The value of TLR for this counter + * @tcsr: The value of TCSR for this counter + * + * Return: The period, in ns + */ +unsigned int xilinx_timer_get_period(struct xilinx_timer_priv *priv, + u32 tlr, u32 tcsr); + +#endif /* XILINX_TIMER_H */ From 1a406a38bc1d66f68bac30cf8d83fcff621ff79c Mon Sep 17 00:00:00 2001 From: Sergiu Moga Date: Mon, 7 Mar 2022 17:36:55 +0200 Subject: [PATCH 019/334] dt-bindings: pwm: Convert atmel pwm to json-schema Convert PWM binding for Atmel/Microchip SoCs to Device Tree Schema format. Signed-off-by: Sergiu Moga Reviewed-by: Krzysztof Kozlowski Signed-off-by: Thierry Reding --- .../bindings/pwm/atmel,at91sam-pwm.yaml | 42 +++++++++++++++++++ .../devicetree/bindings/pwm/atmel-pwm.txt | 35 ---------------- MAINTAINERS | 2 +- 3 files changed, 43 insertions(+), 36 deletions(-) create mode 100644 Documentation/devicetree/bindings/pwm/atmel,at91sam-pwm.yaml delete mode 100644 Documentation/devicetree/bindings/pwm/atmel-pwm.txt diff --git a/Documentation/devicetree/bindings/pwm/atmel,at91sam-pwm.yaml b/Documentation/devicetree/bindings/pwm/atmel,at91sam-pwm.yaml new file mode 100644 index 000000000000..5e8bb5a8095d --- /dev/null +++ b/Documentation/devicetree/bindings/pwm/atmel,at91sam-pwm.yaml @@ -0,0 +1,42 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +# Copyright (C) 2022 Microchip Technology, Inc. and its subsidiaries +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/pwm/atmel,at91sam-pwm.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Atmel/Microchip PWM controller + +maintainers: + - Claudiu Beznea + +allOf: + - $ref: "pwm.yaml#" + +properties: + compatible: + enum: + - atmel,at91sam9rl-pwm + - atmel,sama5d3-pwm + - atmel,sama5d2-pwm + - microchip,sam9x60-pwm + + reg: + maxItems: 1 + + "#pwm-cells": + const: 3 + +required: + - compatible + - reg + +unevaluatedProperties: false + +examples: + - | + pwm0: pwm@f8034000 { + compatible = "atmel,at91sam9rl-pwm"; + reg = <0xf8034000 0x400>; + #pwm-cells = <3>; + }; diff --git a/Documentation/devicetree/bindings/pwm/atmel-pwm.txt b/Documentation/devicetree/bindings/pwm/atmel-pwm.txt deleted file mode 100644 index fbb5325be1f0..000000000000 --- a/Documentation/devicetree/bindings/pwm/atmel-pwm.txt +++ /dev/null @@ -1,35 +0,0 @@ -Atmel PWM controller - -Required properties: - - compatible: should be one of: - - "atmel,at91sam9rl-pwm" - - "atmel,sama5d3-pwm" - - "atmel,sama5d2-pwm" - - "microchip,sam9x60-pwm" - - reg: physical base address and length of the controller's registers - - #pwm-cells: Should be 3. See pwm.yaml in this directory for a - description of the cells format. 
- -Example: - - pwm0: pwm@f8034000 { - compatible = "atmel,at91sam9rl-pwm"; - reg = <0xf8034000 0x400>; - #pwm-cells = <3>; - }; - - pwmleds { - compatible = "pwm-leds"; - - d1 { - label = "d1"; - pwms = <&pwm0 3 5000 0> - max-brightness = <255>; - }; - - d2 { - label = "d2"; - pwms = <&pwm0 1 5000 1> - max-brightness = <255>; - }; - }; diff --git a/MAINTAINERS b/MAINTAINERS index 009ecbc44a73..6fa5010e8c2d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12959,7 +12959,7 @@ M: Claudiu Beznea L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) L: linux-pwm@vger.kernel.org S: Supported -F: Documentation/devicetree/bindings/pwm/atmel-pwm.txt +F: Documentation/devicetree/bindings/pwm/atmel,at91sam-pwm.yaml F: drivers/pwm/pwm-atmel.c MICROCHIP SAMA5D2-COMPATIBLE ADC DRIVER From f30673a9df34931cbfb9b51da00dd185a3cd4566 Mon Sep 17 00:00:00 2001 From: Sergiu Moga Date: Mon, 7 Mar 2022 17:36:56 +0200 Subject: [PATCH 020/334] dt-bindings: pwm: at91: Add SAMA7G5 compatible strings list Add compatible strings list for SAMA7G5. Signed-off-by: Sergiu Moga Reviewed-by: Krzysztof Kozlowski Signed-off-by: Thierry Reding --- .../bindings/pwm/atmel,at91sam-pwm.yaml | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/Documentation/devicetree/bindings/pwm/atmel,at91sam-pwm.yaml b/Documentation/devicetree/bindings/pwm/atmel,at91sam-pwm.yaml index 5e8bb5a8095d..ab45df80345d 100644 --- a/Documentation/devicetree/bindings/pwm/atmel,at91sam-pwm.yaml +++ b/Documentation/devicetree/bindings/pwm/atmel,at91sam-pwm.yaml @@ -15,11 +15,16 @@ allOf: properties: compatible: - enum: - - atmel,at91sam9rl-pwm - - atmel,sama5d3-pwm - - atmel,sama5d2-pwm - - microchip,sam9x60-pwm + oneOf: + - items: + - enum: + - atmel,at91sam9rl-pwm + - atmel,sama5d3-pwm + - atmel,sama5d2-pwm + - microchip,sam9x60-pwm + - items: + - const: microchip,sama7g5-pwm + - const: atmel,sama5d2-pwm reg: maxItems: 1 From 15452ce3c9467bdd15002f5bc56cd2c13e7b1aa0 Mon Sep 17 00:00:00 2001 From: Hammer Hsieh Date: Tue, 22 Mar 2022 18:43:00 +0800 Subject: [PATCH 021/334] dt-bindings: pwm: Add bindings doc for Sunplus SoC SP7021 PWM Driver Add bindings doc for Sunplus SoC SP7021 PWM Driver Reviewed-by: Krzysztof Kozlowski Signed-off-by: Hammer Hsieh Acked-by: Rob Herring Signed-off-by: Thierry Reding --- .../bindings/pwm/sunplus,sp7021-pwm.yaml | 42 +++++++++++++++++++ MAINTAINERS | 5 +++ 2 files changed, 47 insertions(+) create mode 100644 Documentation/devicetree/bindings/pwm/sunplus,sp7021-pwm.yaml diff --git a/Documentation/devicetree/bindings/pwm/sunplus,sp7021-pwm.yaml b/Documentation/devicetree/bindings/pwm/sunplus,sp7021-pwm.yaml new file mode 100644 index 000000000000..d4fc9e8db1d1 --- /dev/null +++ b/Documentation/devicetree/bindings/pwm/sunplus,sp7021-pwm.yaml @@ -0,0 +1,42 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +# Copyright (C) Sunplus Co., Ltd. 
2021 +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/pwm/sunplus,sp7021-pwm.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Sunplus SoC SP7021 PWM Controller + +maintainers: + - Hammer Hsieh + +allOf: + - $ref: pwm.yaml# + +properties: + compatible: + const: sunplus,sp7021-pwm + + reg: + maxItems: 1 + + clocks: + maxItems: 1 + + '#pwm-cells': + const: 2 + +unevaluatedProperties: false + +required: + - reg + - clocks + +examples: + - | + pwm: pwm@9c007a00 { + compatible = "sunplus,sp7021-pwm"; + reg = <0x9c007a00 0x80>; + clocks = <&clkc 0xa2>; + #pwm-cells = <2>; + }; diff --git a/MAINTAINERS b/MAINTAINERS index 6fa5010e8c2d..3c994dbd643f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -18873,6 +18873,11 @@ S: Maintained F: Documentation/devicetree/bindings/nvmem/sunplus,sp7021-ocotp.yaml F: drivers/nvmem/sunplus-ocotp.c +SUNPLUS PWM DRIVER +M: Hammer Hsieh +S: Maintained +F: Documentation/devicetree/bindings/pwm/sunplus,sp7021-pwm.yaml + SUNPLUS RTC DRIVER M: Vincent Shih L: linux-rtc@vger.kernel.org From b3c4af8507a052c231084d7e2c1cc5fa5a8c245a Mon Sep 17 00:00:00 2001 From: Hammer Hsieh Date: Tue, 22 Mar 2022 18:43:01 +0800 Subject: [PATCH 022/334] pwm: sunplus-pwm: Add Sunplus SoC SP7021 PWM Driver Add Sunplus SoC SP7021 PWM Driver Signed-off-by: Hammer Hsieh Signed-off-by: Thierry Reding --- MAINTAINERS | 1 + drivers/pwm/Kconfig | 11 ++ drivers/pwm/Makefile | 1 + drivers/pwm/pwm-sunplus.c | 232 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 245 insertions(+) create mode 100644 drivers/pwm/pwm-sunplus.c diff --git a/MAINTAINERS b/MAINTAINERS index 3c994dbd643f..0c8e160e7742 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -18877,6 +18877,7 @@ SUNPLUS PWM DRIVER M: Hammer Hsieh S: Maintained F: Documentation/devicetree/bindings/pwm/sunplus,sp7021-pwm.yaml +F: drivers/pwm/pwm-sunplus.c SUNPLUS RTC DRIVER M: Vincent Shih diff --git a/drivers/pwm/Kconfig b/drivers/pwm/Kconfig index cefbf00b4c7e..904de8d61828 100644 --- a/drivers/pwm/Kconfig +++ b/drivers/pwm/Kconfig @@ -572,6 +572,17 @@ config PWM_SUN4I To compile this driver as a module, choose M here: the module will be called pwm-sun4i. +config PWM_SUNPLUS + tristate "Sunplus PWM support" + depends on ARCH_SUNPLUS || COMPILE_TEST + depends on HAS_IOMEM && OF + help + Generic PWM framework driver for the PWM controller on + Sunplus SoCs. + + To compile this driver as a module, choose M here: the module + will be called pwm-sunplus. 
+ config PWM_TEGRA tristate "NVIDIA Tegra PWM support" depends on ARCH_TEGRA || COMPILE_TEST diff --git a/drivers/pwm/Makefile b/drivers/pwm/Makefile index ea785480359b..5c08bdb817b4 100644 --- a/drivers/pwm/Makefile +++ b/drivers/pwm/Makefile @@ -53,6 +53,7 @@ obj-$(CONFIG_PWM_STM32) += pwm-stm32.o obj-$(CONFIG_PWM_STM32_LP) += pwm-stm32-lp.o obj-$(CONFIG_PWM_STMPE) += pwm-stmpe.o obj-$(CONFIG_PWM_SUN4I) += pwm-sun4i.o +obj-$(CONFIG_PWM_SUNPLUS) += pwm-sunplus.o obj-$(CONFIG_PWM_TEGRA) += pwm-tegra.o obj-$(CONFIG_PWM_TIECAP) += pwm-tiecap.o obj-$(CONFIG_PWM_TIEHRPWM) += pwm-tiehrpwm.o diff --git a/drivers/pwm/pwm-sunplus.c b/drivers/pwm/pwm-sunplus.c new file mode 100644 index 000000000000..e776fd16512d --- /dev/null +++ b/drivers/pwm/pwm-sunplus.c @@ -0,0 +1,232 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * PWM device driver for SUNPLUS SP7021 SoC + * + * Links: + * Reference Manual: + * https://sunplus-tibbo.atlassian.net/wiki/spaces/doc/overview + * + * Reference Manual(PWM module): + * https://sunplus.atlassian.net/wiki/spaces/doc/pages/461144198/12.+Pulse+Width+Modulation+PWM + * + * Limitations: + * - Only supports normal polarity. + * - It output low when PWM channel disabled. + * - When the parameters change, current running period will not be completed + * and run new settings immediately. + * - In .apply() PWM output need to write register FREQ and DUTY. When first write FREQ + * done and not yet write DUTY, it has short timing gap use new FREQ and old DUTY. + * + * Author: Hammer Hsieh + */ +#include +#include +#include +#include +#include +#include +#include + +#define SP7021_PWM_MODE0 0x000 +#define SP7021_PWM_MODE0_PWMEN(ch) BIT(ch) +#define SP7021_PWM_MODE0_BYPASS(ch) BIT(8 + (ch)) +#define SP7021_PWM_MODE1 0x004 +#define SP7021_PWM_MODE1_CNT_EN(ch) BIT(ch) +#define SP7021_PWM_FREQ(ch) (0x008 + 4 * (ch)) +#define SP7021_PWM_FREQ_MAX GENMASK(15, 0) +#define SP7021_PWM_DUTY(ch) (0x018 + 4 * (ch)) +#define SP7021_PWM_DUTY_DD_SEL(ch) FIELD_PREP(GENMASK(9, 8), ch) +#define SP7021_PWM_DUTY_MAX GENMASK(7, 0) +#define SP7021_PWM_DUTY_MASK SP7021_PWM_DUTY_MAX +#define SP7021_PWM_FREQ_SCALER 256 +#define SP7021_PWM_NUM 4 + +struct sunplus_pwm { + struct pwm_chip chip; + void __iomem *base; + struct clk *clk; +}; + +static inline struct sunplus_pwm *to_sunplus_pwm(struct pwm_chip *chip) +{ + return container_of(chip, struct sunplus_pwm, chip); +} + +static int sunplus_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, + const struct pwm_state *state) +{ + struct sunplus_pwm *priv = to_sunplus_pwm(chip); + u32 dd_freq, duty, mode0, mode1; + u64 clk_rate; + + if (state->polarity != pwm->state.polarity) + return -EINVAL; + + if (!state->enabled) { + /* disable pwm channel output */ + mode0 = readl(priv->base + SP7021_PWM_MODE0); + mode0 &= ~SP7021_PWM_MODE0_PWMEN(pwm->hwpwm); + writel(mode0, priv->base + SP7021_PWM_MODE0); + /* disable pwm channel clk source */ + mode1 = readl(priv->base + SP7021_PWM_MODE1); + mode1 &= ~SP7021_PWM_MODE1_CNT_EN(pwm->hwpwm); + writel(mode1, priv->base + SP7021_PWM_MODE1); + return 0; + } + + clk_rate = clk_get_rate(priv->clk); + + /* + * The following calculations might overflow if clk is bigger + * than 256 GHz. In practise it's 202.5MHz, so this limitation + * is only theoretic. + */ + if (clk_rate > (u64)SP7021_PWM_FREQ_SCALER * NSEC_PER_SEC) + return -EINVAL; + + /* + * With clk_rate limited above we have dd_freq <= state->period, + * so this cannot overflow. 
+ */ + dd_freq = mul_u64_u64_div_u64(clk_rate, state->period, (u64)SP7021_PWM_FREQ_SCALER + * NSEC_PER_SEC); + + if (dd_freq == 0) + return -EINVAL; + + if (dd_freq > SP7021_PWM_FREQ_MAX) + dd_freq = SP7021_PWM_FREQ_MAX; + + writel(dd_freq, priv->base + SP7021_PWM_FREQ(pwm->hwpwm)); + + /* cal and set pwm duty */ + mode0 = readl(priv->base + SP7021_PWM_MODE0); + mode0 |= SP7021_PWM_MODE0_PWMEN(pwm->hwpwm); + mode1 = readl(priv->base + SP7021_PWM_MODE1); + mode1 |= SP7021_PWM_MODE1_CNT_EN(pwm->hwpwm); + if (state->duty_cycle == state->period) { + /* PWM channel output = high */ + mode0 |= SP7021_PWM_MODE0_BYPASS(pwm->hwpwm); + duty = SP7021_PWM_DUTY_DD_SEL(pwm->hwpwm) | SP7021_PWM_DUTY_MAX; + } else { + mode0 &= ~SP7021_PWM_MODE0_BYPASS(pwm->hwpwm); + /* + * duty_ns <= period_ns 27 bits, clk_rate 28 bits, won't overflow. + */ + duty = mul_u64_u64_div_u64(state->duty_cycle, clk_rate, + (u64)dd_freq * NSEC_PER_SEC); + duty = SP7021_PWM_DUTY_DD_SEL(pwm->hwpwm) | duty; + } + writel(duty, priv->base + SP7021_PWM_DUTY(pwm->hwpwm)); + writel(mode1, priv->base + SP7021_PWM_MODE1); + writel(mode0, priv->base + SP7021_PWM_MODE0); + + return 0; +} + +static void sunplus_pwm_get_state(struct pwm_chip *chip, struct pwm_device *pwm, + struct pwm_state *state) +{ + struct sunplus_pwm *priv = to_sunplus_pwm(chip); + u32 mode0, dd_freq, duty; + u64 clk_rate; + + mode0 = readl(priv->base + SP7021_PWM_MODE0); + + if (mode0 & BIT(pwm->hwpwm)) { + clk_rate = clk_get_rate(priv->clk); + dd_freq = readl(priv->base + SP7021_PWM_FREQ(pwm->hwpwm)); + duty = readl(priv->base + SP7021_PWM_DUTY(pwm->hwpwm)); + duty = FIELD_GET(SP7021_PWM_DUTY_MASK, duty); + /* + * dd_freq 16 bits, SP7021_PWM_FREQ_SCALER 8 bits + * NSEC_PER_SEC 30 bits, won't overflow. + */ + state->period = DIV64_U64_ROUND_UP((u64)dd_freq * (u64)SP7021_PWM_FREQ_SCALER + * NSEC_PER_SEC, clk_rate); + /* + * dd_freq 16 bits, duty 8 bits, NSEC_PER_SEC 30 bits, won't overflow. 
+ */ + state->duty_cycle = DIV64_U64_ROUND_UP((u64)dd_freq * (u64)duty * NSEC_PER_SEC, + clk_rate); + state->enabled = true; + } else { + state->enabled = false; + } + + state->polarity = PWM_POLARITY_NORMAL; +} + +static const struct pwm_ops sunplus_pwm_ops = { + .apply = sunplus_pwm_apply, + .get_state = sunplus_pwm_get_state, + .owner = THIS_MODULE, +}; + +static void sunplus_pwm_clk_release(void *data) +{ + struct clk *clk = data; + + clk_disable_unprepare(clk); +} + +static int sunplus_pwm_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct sunplus_pwm *priv; + int ret; + + priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + priv->base = devm_platform_ioremap_resource(pdev, 0); + if (IS_ERR(priv->base)) + return PTR_ERR(priv->base); + + priv->clk = devm_clk_get(dev, NULL); + if (IS_ERR(priv->clk)) + return dev_err_probe(dev, PTR_ERR(priv->clk), + "get pwm clock failed\n"); + + ret = clk_prepare_enable(priv->clk); + if (ret < 0) { + dev_err(dev, "failed to enable clock: %d\n", ret); + return ret; + } + + ret = devm_add_action_or_reset(dev, sunplus_pwm_clk_release, priv->clk); + if (ret < 0) { + dev_err(dev, "failed to release clock: %d\n", ret); + return ret; + } + + priv->chip.dev = dev; + priv->chip.ops = &sunplus_pwm_ops; + priv->chip.npwm = SP7021_PWM_NUM; + + ret = devm_pwmchip_add(dev, &priv->chip); + if (ret < 0) + return dev_err_probe(dev, ret, "Cannot register sunplus PWM\n"); + + return 0; +} + +static const struct of_device_id sunplus_pwm_of_match[] = { + { .compatible = "sunplus,sp7021-pwm", }, + {} +}; +MODULE_DEVICE_TABLE(of, sunplus_pwm_of_match); + +static struct platform_driver sunplus_pwm_driver = { + .probe = sunplus_pwm_probe, + .driver = { + .name = "sunplus-pwm", + .of_match_table = sunplus_pwm_of_match, + }, +}; +module_platform_driver(sunplus_pwm_driver); + +MODULE_DESCRIPTION("Sunplus SoC PWM Driver"); +MODULE_AUTHOR("Hammer Hsieh "); +MODULE_LICENSE("GPL"); From 5e3b07ca5cc78cd4a987e78446849e41288d87cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 8 Apr 2022 17:22:38 +0200 Subject: [PATCH 023/334] pwm: lp3943: Fix duty calculation in case period was clamped MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The hardware only supports periods <= 1.6 ms and if a bigger period is requested it is clamped to 1.6 ms. In this case duty_cycle might be bigger than 1.6 ms and then the duty cycle register is written with a value bigger than LP3943_MAX_DUTY. So clamp duty_cycle accordingly. 
Fixes: af66b3c0934e ("pwm: Add LP3943 PWM driver") Signed-off-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- drivers/pwm/pwm-lp3943.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/pwm/pwm-lp3943.c b/drivers/pwm/pwm-lp3943.c index ea17d446a627..2bd04ecb508c 100644 --- a/drivers/pwm/pwm-lp3943.c +++ b/drivers/pwm/pwm-lp3943.c @@ -125,6 +125,7 @@ static int lp3943_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, if (err) return err; + duty_ns = min(duty_ns, period_ns); val = (u8)(duty_ns * LP3943_MAX_DUTY / period_ns); return lp3943_write_byte(lp3943, reg_duty, val); From 1d24cc89203081919e339b734728149642786f51 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 8 Apr 2022 17:22:39 +0200 Subject: [PATCH 024/334] pwm: lp3943: Implement .apply() callback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To eventually get rid of all legacy drivers convert this driver to the modern world implementing .apply(). Signed-off-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- drivers/pwm/pwm-lp3943.c | 41 +++++++++++++++++++++++++++++++++------- 1 file changed, 34 insertions(+), 7 deletions(-) diff --git a/drivers/pwm/pwm-lp3943.c b/drivers/pwm/pwm-lp3943.c index 2bd04ecb508c..215ef9069114 100644 --- a/drivers/pwm/pwm-lp3943.c +++ b/drivers/pwm/pwm-lp3943.c @@ -93,7 +93,7 @@ static void lp3943_pwm_free(struct pwm_chip *chip, struct pwm_device *pwm) } static int lp3943_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, - int duty_ns, int period_ns) + u64 duty_ns, u64 period_ns) { struct lp3943_pwm *lp3943_pwm = to_lp3943_pwm(chip); struct lp3943 *lp3943 = lp3943_pwm->lp3943; @@ -118,15 +118,20 @@ static int lp3943_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, reg_duty = LP3943_REG_PWM1; } - period_ns = clamp(period_ns, LP3943_MIN_PERIOD, LP3943_MAX_PERIOD); - val = (u8)(period_ns / LP3943_MIN_PERIOD - 1); + /* + * Note that after this clamping, period_ns fits into an int. This is + * helpful because we can resort to integer division below instead of + * the (more expensive) 64 bit division. 
+ */ + period_ns = clamp(period_ns, (u64)LP3943_MIN_PERIOD, (u64)LP3943_MAX_PERIOD); + val = (u8)((int)period_ns / LP3943_MIN_PERIOD - 1); err = lp3943_write_byte(lp3943, reg_prescale, val); if (err) return err; duty_ns = min(duty_ns, period_ns); - val = (u8)(duty_ns * LP3943_MAX_DUTY / period_ns); + val = (u8)((int)duty_ns * LP3943_MAX_DUTY / (int)period_ns); return lp3943_write_byte(lp3943, reg_duty, val); } @@ -183,12 +188,34 @@ static void lp3943_pwm_disable(struct pwm_chip *chip, struct pwm_device *pwm) lp3943_pwm_set_mode(lp3943_pwm, pwm_map, LP3943_GPIO_OUT_HIGH); } +static int lp3943_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, + const struct pwm_state *state) +{ + int err; + + if (state->polarity != PWM_POLARITY_NORMAL) + return -EINVAL; + + if (!state->enabled) { + if (pwm->state.enabled) + lp3943_pwm_disable(chip, pwm); + return 0; + } + + err = lp3943_pwm_config(chip, pwm, state->duty_cycle, state->period); + if (err) + return err; + + if (!pwm->state.enabled) + err = lp3943_pwm_enable(chip, pwm); + + return err; +} + static const struct pwm_ops lp3943_pwm_ops = { .request = lp3943_pwm_request, .free = lp3943_pwm_free, - .config = lp3943_pwm_config, - .enable = lp3943_pwm_enable, - .disable = lp3943_pwm_disable, + .apply = lp3943_pwm_apply, .owner = THIS_MODULE, }; From eaaad16a9b3f16b411c33404e17d4dc61cf69b16 Mon Sep 17 00:00:00 2001 From: Baruch Siach Date: Mon, 11 Apr 2022 09:23:40 +0300 Subject: [PATCH 025/334] gpio: mvebu: Drop PWM base assignment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pwmchip_add() unconditionally assigns the base ID dynamically. Commit f9a8ee8c8bcd1 ("pwm: Always allocate PWM chip base ID dynamically") dropped all base assignment from drivers under drivers/pwm/. It missed this driver. Fix that. Fixes: f9a8ee8c8bcd1 ("pwm: Always allocate PWM chip base ID dynamically") Signed-off-by: Baruch Siach Reviewed-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- drivers/gpio/gpio-mvebu.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/drivers/gpio/gpio-mvebu.c b/drivers/gpio/gpio-mvebu.c index 4c1f9e1091b7..a2c8dd329b31 100644 --- a/drivers/gpio/gpio-mvebu.c +++ b/drivers/gpio/gpio-mvebu.c @@ -871,13 +871,6 @@ static int mvebu_pwm_probe(struct platform_device *pdev, mvpwm->chip.dev = dev; mvpwm->chip.ops = &mvebu_pwm_ops; mvpwm->chip.npwm = mvchip->chip.ngpio; - /* - * There may already be some PWM allocated, so we can't force - * mvpwm->chip.base to a fixed point like mvchip->chip.base. - * So, we let pwmchip_add() do the numbering and take the next free - * region. - */ - mvpwm->chip.base = -1; spin_lock_init(&mvpwm->lock); From d7b4408374b69dbc246c59c29e3077920b9f03eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 8 Apr 2022 17:29:10 +0200 Subject: [PATCH 026/334] pwm: atmel-tcb: Make atmel_tcb_divisors static MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The array atmel_tcb_divisors is not supposed to be used outside of the driver, so make it static. This fixes a sparse warning: drivers/pwm/pwm-atmel-tcb.c:64:10: warning: symbol 'atmel_tcb_divisors' was not declared. Should it be static? 
Signed-off-by: Uwe Kleine-König Reviewed-by: Claudiu Beznea Signed-off-by: Thierry Reding --- drivers/pwm/pwm-atmel-tcb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pwm/pwm-atmel-tcb.c b/drivers/pwm/pwm-atmel-tcb.c index bea622f53ba8..3977a0f9d132 100644 --- a/drivers/pwm/pwm-atmel-tcb.c +++ b/drivers/pwm/pwm-atmel-tcb.c @@ -61,7 +61,7 @@ struct atmel_tcb_pwm_chip { struct atmel_tcb_channel bkup; }; -const u8 atmel_tcb_divisors[] = { 2, 8, 32, 128, 0, }; +static const u8 atmel_tcb_divisors[] = { 2, 8, 32, 128, 0, }; static inline struct atmel_tcb_pwm_chip *to_tcb_chip(struct pwm_chip *chip) { From 09f688f0718f57f9cf68ee1aa94490f641e759ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 8 Apr 2022 17:38:46 +0200 Subject: [PATCH 027/334] pwm: raspberrypi-poe: Fix endianness in firmware struct MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The reg member of struct raspberrypi_pwm_prop is a little endian 32 bit quantity. Explicitly convert the (native endian) value to little endian on assignment as is already done in raspberrypi_pwm_set_property(). This fixes the following sparse warning: drivers/pwm/pwm-raspberrypi-poe.c:69:24: warning: incorrect type in initializer (different base types) drivers/pwm/pwm-raspberrypi-poe.c:69:24: expected restricted __le32 [usertype] reg drivers/pwm/pwm-raspberrypi-poe.c:69:24: got unsigned int [usertype] reg Fixes: 79caa362eab6 ("pwm: Add Raspberry Pi Firmware based PWM bus") Signed-off-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- drivers/pwm/pwm-raspberrypi-poe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pwm/pwm-raspberrypi-poe.c b/drivers/pwm/pwm-raspberrypi-poe.c index e52e29fc8231..6ff73029f367 100644 --- a/drivers/pwm/pwm-raspberrypi-poe.c +++ b/drivers/pwm/pwm-raspberrypi-poe.c @@ -66,7 +66,7 @@ static int raspberrypi_pwm_get_property(struct rpi_firmware *firmware, u32 reg, u32 *val) { struct raspberrypi_pwm_prop msg = { - .reg = reg + .reg = cpu_to_le32(reg), }; int ret; From 4225cd01d30f66b6f35e8f54ba6929db11a9a7af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 8 Apr 2022 17:45:20 +0200 Subject: [PATCH 028/334] pwm: clps71xx: Implement .apply() callback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To eventually get rid of all legacy drivers convert this driver to the modern world implementing .apply(). This fixes a small issue in clps711x_get_duty() en passant: the multiplication v * 0xf might have overflown. 
Signed-off-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- drivers/pwm/pwm-clps711x.c | 74 +++++++++++++------------------------- 1 file changed, 24 insertions(+), 50 deletions(-) diff --git a/drivers/pwm/pwm-clps711x.c b/drivers/pwm/pwm-clps711x.c index d7ad88685830..b0d91142da8d 100644 --- a/drivers/pwm/pwm-clps711x.c +++ b/drivers/pwm/pwm-clps711x.c @@ -23,29 +23,6 @@ static inline struct clps711x_chip *to_clps711x_chip(struct pwm_chip *chip) return container_of(chip, struct clps711x_chip, chip); } -static void clps711x_pwm_update_val(struct clps711x_chip *priv, u32 n, u32 v) -{ - /* PWM0 - bits 4..7, PWM1 - bits 8..11 */ - u32 shift = (n + 1) * 4; - unsigned long flags; - u32 tmp; - - spin_lock_irqsave(&priv->lock, flags); - - tmp = readl(priv->pmpcon); - tmp &= ~(0xf << shift); - tmp |= v << shift; - writel(tmp, priv->pmpcon); - - spin_unlock_irqrestore(&priv->lock, flags); -} - -static unsigned int clps711x_get_duty(struct pwm_device *pwm, unsigned int v) -{ - /* Duty cycle 0..15 max */ - return DIV64_U64_ROUND_CLOSEST(v * 0xf, pwm->args.period); -} - static int clps711x_pwm_request(struct pwm_chip *chip, struct pwm_device *pwm) { struct clps711x_chip *priv = to_clps711x_chip(chip); @@ -60,44 +37,41 @@ static int clps711x_pwm_request(struct pwm_chip *chip, struct pwm_device *pwm) return 0; } -static int clps711x_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, - int duty_ns, int period_ns) +static int clps711x_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, + const struct pwm_state *state) { struct clps711x_chip *priv = to_clps711x_chip(chip); - unsigned int duty; + /* PWM0 - bits 4..7, PWM1 - bits 8..11 */ + u32 shift = (pwm->hwpwm + 1) * 4; + unsigned long flags; + u32 pmpcon, val; - if (period_ns != pwm->args.period) + if (state->polarity != PWM_POLARITY_NORMAL) return -EINVAL; - duty = clps711x_get_duty(pwm, duty_ns); - clps711x_pwm_update_val(priv, pwm->hwpwm, duty); + if (state->period != pwm->args.period) + return -EINVAL; + + if (state->enabled) + val = mul_u64_u64_div_u64(state->duty_cycle, 0xf, state->period); + else + val = 0; + + spin_lock_irqsave(&priv->lock, flags); + + pmpcon = readl(priv->pmpcon); + pmpcon &= ~(0xf << shift); + pmpcon |= val << shift; + writel(pmpcon, priv->pmpcon); + + spin_unlock_irqrestore(&priv->lock, flags); return 0; } -static int clps711x_pwm_enable(struct pwm_chip *chip, struct pwm_device *pwm) -{ - struct clps711x_chip *priv = to_clps711x_chip(chip); - unsigned int duty; - - duty = clps711x_get_duty(pwm, pwm_get_duty_cycle(pwm)); - clps711x_pwm_update_val(priv, pwm->hwpwm, duty); - - return 0; -} - -static void clps711x_pwm_disable(struct pwm_chip *chip, struct pwm_device *pwm) -{ - struct clps711x_chip *priv = to_clps711x_chip(chip); - - clps711x_pwm_update_val(priv, pwm->hwpwm, 0); -} - static const struct pwm_ops clps711x_pwm_ops = { .request = clps711x_pwm_request, - .config = clps711x_pwm_config, - .enable = clps711x_pwm_enable, - .disable = clps711x_pwm_disable, + .apply = clps711x_pwm_apply, .owner = THIS_MODULE, }; From b09b179bac0aa3d58d79729260a7492df8a1e312 Mon Sep 17 00:00:00 2001 From: Xinlei Lee Date: Mon, 18 Apr 2022 20:20:52 +0800 Subject: [PATCH 029/334] dt-bindings: pwm: Convert pwm-mtk-disp.txt to mediatek,pwm-disp.yaml format Convert pwm-mtk-disp.txt to mediatek,pwm-disp.yaml format as suggested by maintainer. 
Signed-off-by: Xinlei Lee Reviewed-by: AngeloGioacchino Del Regno Reviewed-by: Miles Chen Reviewed-by: Rob Herring Signed-off-by: Thierry Reding --- .../bindings/pwm/mediatek,pwm-disp.yaml | 66 +++++++++++++++++++ .../devicetree/bindings/pwm/pwm-mtk-disp.txt | 45 ------------- 2 files changed, 66 insertions(+), 45 deletions(-) create mode 100644 Documentation/devicetree/bindings/pwm/mediatek,pwm-disp.yaml delete mode 100644 Documentation/devicetree/bindings/pwm/pwm-mtk-disp.txt diff --git a/Documentation/devicetree/bindings/pwm/mediatek,pwm-disp.yaml b/Documentation/devicetree/bindings/pwm/mediatek,pwm-disp.yaml new file mode 100644 index 000000000000..36f877f819fa --- /dev/null +++ b/Documentation/devicetree/bindings/pwm/mediatek,pwm-disp.yaml @@ -0,0 +1,66 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/pwm/mediatek,pwm-disp.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: MediaTek DISP_PWM Controller Device Tree Bindings + +maintainers: + - Jitao Shi + - Xinlei Lee + +allOf: + - $ref: pwm.yaml# + +properties: + compatible: + oneOf: + - enum: + - mediatek,mt2701-disp-pwm + - mediatek,mt6595-disp-pwm + - mediatek,mt8173-disp-pwm + - mediatek,mt8183-disp-pwm + - items: + - const: mediatek,mt8167-disp-pwm + - const: mediatek,mt8173-disp-pwm + + reg: + maxItems: 1 + + "#pwm-cells": + const: 2 + + clocks: + items: + - description: Main Clock + - description: Mm Clock + + clock-names: + items: + - const: main + - const: mm + +required: + - compatible + - reg + - "#pwm-cells" + - clocks + - clock-names + +additionalProperties: false + +examples: + - | + #include + #include + #include + + pwm0: pwm@1401e000 { + compatible = "mediatek,mt8173-disp-pwm"; + reg = <0x1401e000 0x1000>; + #pwm-cells = <2>; + clocks = <&mmsys CLK_MM_DISP_PWM026M>, + <&mmsys CLK_MM_DISP_PWM0MM>; + clock-names = "main", "mm"; + }; diff --git a/Documentation/devicetree/bindings/pwm/pwm-mtk-disp.txt b/Documentation/devicetree/bindings/pwm/pwm-mtk-disp.txt deleted file mode 100644 index 691e58b6c223..000000000000 --- a/Documentation/devicetree/bindings/pwm/pwm-mtk-disp.txt +++ /dev/null @@ -1,45 +0,0 @@ -MediaTek display PWM controller - -Required properties: - - compatible: should be "mediatek,-disp-pwm": - - "mediatek,mt2701-disp-pwm": found on mt2701 SoC. - - "mediatek,mt6595-disp-pwm": found on mt6595 SoC. - - "mediatek,mt8167-disp-pwm", "mediatek,mt8173-disp-pwm": found on mt8167 SoC. - - "mediatek,mt8173-disp-pwm": found on mt8173 SoC. - - "mediatek,mt8183-disp-pwm": found on mt8183 SoC.$ - - reg: physical base address and length of the controller's registers. - - #pwm-cells: must be 2. See pwm.yaml in this directory for a description of - the cell format. - - clocks: phandle and clock specifier of the PWM reference clock. - - clock-names: must contain the following: - - "main": clock used to generate PWM signals. - - "mm": sync signals from the modules of mmsys. - - pinctrl-names: Must contain a "default" entry. - - pinctrl-0: One property must exist for each entry in pinctrl-names. - See pinctrl/pinctrl-bindings.txt for details of the property values. 
- -Example: - pwm0: pwm@1401e000 { - compatible = "mediatek,mt8173-disp-pwm", - "mediatek,mt6595-disp-pwm"; - reg = <0 0x1401e000 0 0x1000>; - #pwm-cells = <2>; - clocks = <&mmsys CLK_MM_DISP_PWM026M>, - <&mmsys CLK_MM_DISP_PWM0MM>; - clock-names = "main", "mm"; - pinctrl-names = "default"; - pinctrl-0 = <&disp_pwm0_pins>; - }; - - backlight_lcd: backlight_lcd { - compatible = "pwm-backlight"; - pwms = <&pwm0 0 1000000>; - brightness-levels = < - 0 16 32 48 64 80 96 112 - 128 144 160 176 192 208 224 240 - 255 - >; - default-brightness-level = <9>; - power-supply = <&mt6397_vio18_reg>; - enable-gpios = <&pio 95 GPIO_ACTIVE_HIGH>; - }; From b8ba2b42b0e9cf6d2788e423f8fe2ba4fcc2539d Mon Sep 17 00:00:00 2001 From: Xinlei Lee Date: Mon, 18 Apr 2022 20:20:53 +0800 Subject: [PATCH 030/334] dt-bindings: pwm: Add compatible for MediaTek MT8192 Add dt-binding documentation of pwm for MediaTek MT8192 SoC. Signed-off-by: Xinlei Lee Reviewed-by: AngeloGioacchino Del Regno Reviewed-by: Krzysztof Kozlowski Signed-off-by: Thierry Reding --- Documentation/devicetree/bindings/pwm/mediatek,pwm-disp.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Documentation/devicetree/bindings/pwm/mediatek,pwm-disp.yaml b/Documentation/devicetree/bindings/pwm/mediatek,pwm-disp.yaml index 36f877f819fa..4bfa8eb42186 100644 --- a/Documentation/devicetree/bindings/pwm/mediatek,pwm-disp.yaml +++ b/Documentation/devicetree/bindings/pwm/mediatek,pwm-disp.yaml @@ -24,6 +24,10 @@ properties: - items: - const: mediatek,mt8167-disp-pwm - const: mediatek,mt8173-disp-pwm + - items: + - enum: + - mediatek,mt8192-disp-pwm + - const: mediatek,mt8183-disp-pwm reg: maxItems: 1 From 7eafddce08617b4fdbd9485fa88bb749c8c7125a Mon Sep 17 00:00:00 2001 From: Xinlei Lee Date: Mon, 18 Apr 2022 20:20:54 +0800 Subject: [PATCH 031/334] dt-bindings: pwm: Add compatible for MediaTek MT8195 Add dt-binding documentation of pwm for MediaTek MT8195 SoC. Signed-off-by: Xinlei Lee Reviewed-by: AngeloGioacchino Del Regno Reviewed-by: Krzysztof Kozlowski Signed-off-by: Thierry Reding --- Documentation/devicetree/bindings/pwm/mediatek,pwm-disp.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/pwm/mediatek,pwm-disp.yaml b/Documentation/devicetree/bindings/pwm/mediatek,pwm-disp.yaml index 4bfa8eb42186..e5da918eaa33 100644 --- a/Documentation/devicetree/bindings/pwm/mediatek,pwm-disp.yaml +++ b/Documentation/devicetree/bindings/pwm/mediatek,pwm-disp.yaml @@ -27,6 +27,7 @@ properties: - items: - enum: - mediatek,mt8192-disp-pwm + - mediatek,mt8195-disp-pwm - const: mediatek,mt8183-disp-pwm reg: From 6ddb156ba4749a64293b5c4079406de178e2e5cd Mon Sep 17 00:00:00 2001 From: Xinlei Lee Date: Mon, 18 Apr 2022 20:20:55 +0800 Subject: [PATCH 032/334] dt-bindings: pwm: Add compatible for MediaTek MT8186 Add dt-binding documentation of pwm for MediaTek MT8186 SoC. 
Signed-off-by: Xinlei Lee Reviewed-by: AngeloGioacchino Del Regno Reviewed-by: Krzysztof Kozlowski Signed-off-by: Thierry Reding --- Documentation/devicetree/bindings/pwm/mediatek,pwm-disp.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/pwm/mediatek,pwm-disp.yaml b/Documentation/devicetree/bindings/pwm/mediatek,pwm-disp.yaml index e5da918eaa33..4b71bd668d33 100644 --- a/Documentation/devicetree/bindings/pwm/mediatek,pwm-disp.yaml +++ b/Documentation/devicetree/bindings/pwm/mediatek,pwm-disp.yaml @@ -26,6 +26,7 @@ properties: - const: mediatek,mt8173-disp-pwm - items: - enum: + - mediatek,mt8186-disp-pwm - mediatek,mt8192-disp-pwm - mediatek,mt8195-disp-pwm - const: mediatek,mt8183-disp-pwm From 2bf8ee0faa988b5cec3503ebf2f970a0e84d24ee Mon Sep 17 00:00:00 2001 From: Xinlei Lee Date: Mon, 18 Apr 2022 20:20:56 +0800 Subject: [PATCH 033/334] dt-bindings: pwm: Add interrupts property for MediaTek MT8192 Add interrupts property of pwm for MediaTek MT8192 SoC. Signed-off-by: Xinlei Lee Signed-off-by: Thierry Reding --- Documentation/devicetree/bindings/pwm/mediatek,pwm-disp.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Documentation/devicetree/bindings/pwm/mediatek,pwm-disp.yaml b/Documentation/devicetree/bindings/pwm/mediatek,pwm-disp.yaml index 4b71bd668d33..e4fe2d1bfef5 100644 --- a/Documentation/devicetree/bindings/pwm/mediatek,pwm-disp.yaml +++ b/Documentation/devicetree/bindings/pwm/mediatek,pwm-disp.yaml @@ -37,6 +37,9 @@ properties: "#pwm-cells": const: 2 + interrupts: + maxItems: 1 + clocks: items: - description: Main Clock From ee651cd1e944df7d1553bb2c5593e887f12d6cda Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Fri, 8 Apr 2022 15:05:38 -0700 Subject: [PATCH 034/334] dt-bindings: remoteproc: qcom: pas: Add sc8280xp adsp and nsp pair Add the Qualcomm sc8280xp ADSP and NSP pairs to the binding. 
Signed-off-by: Bjorn Andersson Reviewed-by: Krzysztof Kozlowski Reviewed-by: Vinod Koul Link: https://lore.kernel.org/r/20220408220539.625301-1-bjorn.andersson@linaro.org --- .../bindings/remoteproc/qcom,adsp.yaml | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/Documentation/devicetree/bindings/remoteproc/qcom,adsp.yaml b/Documentation/devicetree/bindings/remoteproc/qcom,adsp.yaml index a4409c398193..df8286296c37 100644 --- a/Documentation/devicetree/bindings/remoteproc/qcom,adsp.yaml +++ b/Documentation/devicetree/bindings/remoteproc/qcom,adsp.yaml @@ -29,6 +29,9 @@ properties: - qcom,sc8180x-adsp-pas - qcom,sc8180x-cdsp-pas - qcom,sc8180x-mpss-pas + - qcom,sc8280xp-adsp-pas + - qcom,sc8280xp-nsp0-pas + - qcom,sc8280xp-nsp1-pas - qcom,sdm660-adsp-pas - qcom,sdm845-adsp-pas - qcom,sdm845-cdsp-pas @@ -169,6 +172,9 @@ allOf: - qcom,sc8180x-adsp-pas - qcom,sc8180x-cdsp-pas - qcom,sc8180x-mpss-pas + - qcom,sc8280xp-adsp-pas + - qcom,sc8280xp-nsp0-pas + - qcom,sc8280xp-nsp1-pas - qcom,sdm845-adsp-pas - qcom,sdm845-cdsp-pas - qcom,sm6350-adsp-pas @@ -284,6 +290,9 @@ allOf: - qcom,qcs404-wcss-pas - qcom,sc8180x-adsp-pas - qcom,sc8180x-cdsp-pas + - qcom,sc8280xp-adsp-pas + - qcom,sc8280xp-nsp0-pas + - qcom,sc8280xp-nsp1-pas - qcom,sdm845-adsp-pas - qcom,sdm845-cdsp-pas - qcom,sm6350-adsp-pas @@ -471,6 +480,7 @@ allOf: enum: - qcom,sc8180x-adsp-pas - qcom,sc8180x-cdsp-pas + - qcom,sc8280xp-adsp-pas - qcom,sm6350-adsp-pas - qcom,sm8150-slpi-pas - qcom,sm8250-adsp-pas @@ -508,6 +518,22 @@ allOf: - const: cx - const: mxc + - if: + properties: + compatible: + contains: + enum: + - qcom,sc8280xp-nsp0-pas + - qcom,sc8280xp-nsp1-pas + then: + properties: + power-domains: + items: + - description: NSP power domain + power-domain-names: + items: + - const: nsp + - if: properties: compatible: From 4e55a6cf48119243ca05c16bcb3bd3887a3c68b5 Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Fri, 8 Apr 2022 15:05:39 -0700 Subject: [PATCH 035/334] remoteproc: qcom: pas: Add sc8280xp remoteprocs Among the subsystems in the Qualcomm sc8280xp platform we find an audio and two compute DSPs. Add support for controlling these using the peripheral authentication service (PAS) remoteproc driver. 
Signed-off-by: Bjorn Andersson Reviewed-by: Vinod Koul Link: https://lore.kernel.org/r/20220408220539.625301-2-bjorn.andersson@linaro.org --- drivers/remoteproc/qcom_q6v5_pas.c | 33 ++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/drivers/remoteproc/qcom_q6v5_pas.c b/drivers/remoteproc/qcom_q6v5_pas.c index 1ae47cc153e5..06c6dc34f2e0 100644 --- a/drivers/remoteproc/qcom_q6v5_pas.c +++ b/drivers/remoteproc/qcom_q6v5_pas.c @@ -704,6 +704,36 @@ static const struct adsp_data sm8250_cdsp_resource = { .ssctl_id = 0x17, }; +static const struct adsp_data sc8280xp_nsp0_resource = { + .crash_reason_smem = 601, + .firmware_name = "cdsp.mdt", + .pas_id = 18, + .has_aggre2_clk = false, + .auto_boot = true, + .proxy_pd_names = (char*[]){ + "nsp", + NULL + }, + .ssr_name = "cdsp0", + .sysmon_name = "cdsp", + .ssctl_id = 0x17, +}; + +static const struct adsp_data sc8280xp_nsp1_resource = { + .crash_reason_smem = 633, + .firmware_name = "cdsp.mdt", + .pas_id = 30, + .has_aggre2_clk = false, + .auto_boot = true, + .proxy_pd_names = (char*[]){ + "nsp", + NULL + }, + .ssr_name = "cdsp1", + .sysmon_name = "cdsp1", + .ssctl_id = 0x20, +}; + static const struct adsp_data sm8350_cdsp_resource = { .crash_reason_smem = 601, .firmware_name = "cdsp.mdt", @@ -861,6 +891,9 @@ static const struct of_device_id adsp_of_match[] = { { .compatible = "qcom,sc8180x-adsp-pas", .data = &sm8150_adsp_resource}, { .compatible = "qcom,sc8180x-cdsp-pas", .data = &sm8150_cdsp_resource}, { .compatible = "qcom,sc8180x-mpss-pas", .data = &sc8180x_mpss_resource}, + { .compatible = "qcom,sc8280xp-adsp-pas", .data = &sm8250_adsp_resource}, + { .compatible = "qcom,sc8280xp-nsp0-pas", .data = &sc8280xp_nsp0_resource}, + { .compatible = "qcom,sc8280xp-nsp1-pas", .data = &sc8280xp_nsp1_resource}, { .compatible = "qcom,sdm660-adsp-pas", .data = &adsp_resource_init}, { .compatible = "qcom,sdm845-adsp-pas", .data = &sdm845_adsp_resource_init}, { .compatible = "qcom,sdm845-cdsp-pas", .data = &sdm845_cdsp_resource_init}, From 1a358d35066487d228a68303d808bc4721c6b1b9 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Fri, 22 Apr 2022 12:53:26 +0200 Subject: [PATCH 036/334] rpmsg: qcom_smd: Fix irq_of_parse_and_map() return value The irq_of_parse_and_map() returns 0 on failure, not a negative ERRNO. Fixes: 53e2822e56c7 ("rpmsg: Introduce Qualcomm SMD backend") Signed-off-by: Krzysztof Kozlowski Signed-off-by: Bjorn Andersson Link: https://lore.kernel.org/r/20220422105326.78713-1-krzysztof.kozlowski@linaro.org --- drivers/rpmsg/qcom_smd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/rpmsg/qcom_smd.c b/drivers/rpmsg/qcom_smd.c index 764c980507be..6ccfa12abd10 100644 --- a/drivers/rpmsg/qcom_smd.c +++ b/drivers/rpmsg/qcom_smd.c @@ -1407,7 +1407,7 @@ static int qcom_smd_parse_edge(struct device *dev, edge->name = node->name; irq = irq_of_parse_and_map(node, 0); - if (irq < 0) { + if (!irq) { dev_err(dev, "required smd interrupt missing\n"); ret = irq; goto put_node; From a22bb5526d7dd627b94a7ee22e5a98c36e39fceb Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 20 Mar 2022 23:11:17 +0800 Subject: [PATCH 037/334] f2fs: check pinfile in gc_data_segment() in advance In order to skip migrating section which contains data of pinned file in advance. 
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index ea5b93b689cd..e83c07144d8f 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1480,6 +1480,13 @@ next_step: special_file(inode->i_mode)) continue; + if (is_inode_flag_set(inode, FI_PIN_FILE) && + gc_type == FG_GC) { + f2fs_pin_file_control(inode, true); + iput(inode); + return submitted; + } + if (!f2fs_down_write_trylock( &F2FS_I(inode)->i_gc_rwsem[WRITE])) { iput(inode); From 642c0969916eaa4878cb74f36752108e590b0389 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 20 Mar 2022 23:11:18 +0800 Subject: [PATCH 038/334] f2fs: don't set GC_FAILURE_PIN for background GC This reduces the possibility that a file is forcibly unpinned by foreground GC because .i_gc_failures[GC_FAILURE_PIN] exceeds its threshold. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index e83c07144d8f..6a7e4148ff9d 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1202,7 +1202,8 @@ static int move_data_block(struct inode *inode, block_t bidx, } if (f2fs_is_pinned_file(inode)) { - f2fs_pin_file_control(inode, true); + if (gc_type == FG_GC) + f2fs_pin_file_control(inode, true); err = -EAGAIN; goto out; } From c2ca36e82f704cb449020ccd29f1802c7497cc49 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 23 Mar 2022 13:30:04 -0700 Subject: [PATCH 039/334] f2fs: remove unnecessary f2fs_lock_op in f2fs_new_inode This can be removed, since f2fs_alloc_nid() doesn't actually require blocking the checkpoint and __f2fs_build_free_nids() is covered by nm_i->nat_tree_lock. Suggested-by: Linus Torvalds Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 5ed79b29999f..816285414564 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -37,13 +37,10 @@ static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns, if (!inode) return ERR_PTR(-ENOMEM); - f2fs_lock_op(sbi); if (!f2fs_alloc_nid(sbi, &ino)) { - f2fs_unlock_op(sbi); err = -ENOSPC; goto fail; } - f2fs_unlock_op(sbi); nid_free = true; From c277f1411d7bc2831d48bde5c18d3b7f87c61646 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 21 Mar 2022 15:13:06 -0700 Subject: [PATCH 040/334] f2fs: introduce data read/write showing path info This has been used in Android for a long time. Let's upstream it.
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 58 ++++++++++++++++++++--- include/trace/events/f2fs.h | 94 +++++++++++++++++++++++++++++++++++++ 2 files changed, 145 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 5b89af0f27f0..f08e6208e183 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -4329,17 +4329,39 @@ out: static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct inode *inode = file_inode(iocb->ki_filp); + const loff_t pos = iocb->ki_pos; ssize_t ret; if (!f2fs_is_compress_backend_ready(inode)) return -EOPNOTSUPP; - if (f2fs_should_use_dio(inode, iocb, to)) - return f2fs_dio_read_iter(iocb, to); + if (trace_f2fs_dataread_start_enabled()) { + char *p = f2fs_kmalloc(F2FS_I_SB(inode), PATH_MAX, GFP_KERNEL); + char *path; - ret = filemap_read(iocb, to, 0); - if (ret > 0) - f2fs_update_iostat(F2FS_I_SB(inode), APP_BUFFERED_READ_IO, ret); + if (!p) + goto skip_read_trace; + + path = dentry_path_raw(file_dentry(iocb->ki_filp), p, PATH_MAX); + if (IS_ERR(path)) { + kfree(p); + goto skip_read_trace; + } + + trace_f2fs_dataread_start(inode, pos, iov_iter_count(to), + current->pid, path, current->comm); + kfree(p); + } +skip_read_trace: + if (f2fs_should_use_dio(inode, iocb, to)) { + ret = f2fs_dio_read_iter(iocb, to); + } else { + ret = filemap_read(iocb, to, 0); + if (ret > 0) + f2fs_update_iostat(F2FS_I_SB(inode), APP_BUFFERED_READ_IO, ret); + } + if (trace_f2fs_dataread_end_enabled()) + trace_f2fs_dataread_end(inode, pos, ret); return ret; } @@ -4631,14 +4653,36 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) /* Possibly preallocate the blocks for the write. */ target_size = iocb->ki_pos + iov_iter_count(from); preallocated = f2fs_preallocate_blocks(iocb, from, dio); - if (preallocated < 0) + if (preallocated < 0) { ret = preallocated; - else + } else { + if (trace_f2fs_datawrite_start_enabled()) { + char *p = f2fs_kmalloc(F2FS_I_SB(inode), + PATH_MAX, GFP_KERNEL); + char *path; + + if (!p) + goto skip_write_trace; + path = dentry_path_raw(file_dentry(iocb->ki_filp), + p, PATH_MAX); + if (IS_ERR(path)) { + kfree(p); + goto skip_write_trace; + } + trace_f2fs_datawrite_start(inode, orig_pos, orig_count, + current->pid, path, current->comm); + kfree(p); + } +skip_write_trace: /* Do the actual write. */ ret = dio ? f2fs_dio_write_iter(iocb, from, &may_need_sync): f2fs_buffered_write_iter(iocb, from); + if (trace_f2fs_datawrite_end_enabled()) + trace_f2fs_datawrite_end(inode, orig_pos, ret); + } + /* Don't leave any preallocated blocks around past i_size. */ if (preallocated && i_size_read(inode) < target_size) { f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 1779e133cea0..4d1ad64d4cab 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -2067,6 +2067,100 @@ TRACE_EVENT(f2fs_fiemap, __entry->ret) ); +DECLARE_EVENT_CLASS(f2fs__rw_start, + + TP_PROTO(struct inode *inode, loff_t offset, int bytes, + pid_t pid, char *pathname, char *command), + + TP_ARGS(inode, offset, bytes, pid, pathname, command), + + TP_STRUCT__entry( + __string(pathbuf, pathname) + __field(loff_t, offset) + __field(int, bytes) + __field(loff_t, i_size) + __string(cmdline, command) + __field(pid_t, pid) + __field(ino_t, ino) + ), + + TP_fast_assign( + /* + * Replace the spaces in filenames and cmdlines + * because this screws up the tooling that parses + * the traces. 
+ */ + __assign_str(pathbuf, pathname); + (void)strreplace(__get_str(pathbuf), ' ', '_'); + __entry->offset = offset; + __entry->bytes = bytes; + __entry->i_size = i_size_read(inode); + __assign_str(cmdline, command); + (void)strreplace(__get_str(cmdline), ' ', '_'); + __entry->pid = pid; + __entry->ino = inode->i_ino; + ), + + TP_printk("entry_name %s, offset %llu, bytes %d, cmdline %s," + " pid %d, i_size %llu, ino %lu", + __get_str(pathbuf), __entry->offset, __entry->bytes, + __get_str(cmdline), __entry->pid, __entry->i_size, + (unsigned long) __entry->ino) +); + +DECLARE_EVENT_CLASS(f2fs__rw_end, + + TP_PROTO(struct inode *inode, loff_t offset, int bytes), + + TP_ARGS(inode, offset, bytes), + + TP_STRUCT__entry( + __field(ino_t, ino) + __field(loff_t, offset) + __field(int, bytes) + ), + + TP_fast_assign( + __entry->ino = inode->i_ino; + __entry->offset = offset; + __entry->bytes = bytes; + ), + + TP_printk("ino %lu, offset %llu, bytes %d", + (unsigned long) __entry->ino, + __entry->offset, __entry->bytes) +); + +DEFINE_EVENT(f2fs__rw_start, f2fs_dataread_start, + + TP_PROTO(struct inode *inode, loff_t offset, int bytes, + pid_t pid, char *pathname, char *command), + + TP_ARGS(inode, offset, bytes, pid, pathname, command) +); + +DEFINE_EVENT(f2fs__rw_end, f2fs_dataread_end, + + TP_PROTO(struct inode *inode, loff_t offset, int bytes), + + TP_ARGS(inode, offset, bytes) +); + +DEFINE_EVENT(f2fs__rw_start, f2fs_datawrite_start, + + TP_PROTO(struct inode *inode, loff_t offset, int bytes, + pid_t pid, char *pathname, char *command), + + TP_ARGS(inode, offset, bytes, pid, pathname, command) +); + +DEFINE_EVENT(f2fs__rw_end, f2fs_datawrite_end, + + TP_PROTO(struct inode *inode, loff_t offset, int bytes), + + TP_ARGS(inode, offset, bytes) +); + #endif /* _TRACE_F2FS_H */ /* This part must be outside protection */ From 12662d19467b391b5b509ac5e9ab4f583c6dde16 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 29 Mar 2022 00:02:53 +0800 Subject: [PATCH 041/334] f2fs: fix to do sanity check on inline_dots inode As Wenqing reported in bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=215765 It will cause a kernel panic with steps: - mkdir mnt - mount tmp40.img mnt - ls mnt folio_mark_dirty+0x33/0x50 f2fs_add_regular_entry+0x541/0xad0 [f2fs] f2fs_add_dentry+0x6c/0xb0 [f2fs] f2fs_do_add_link+0x182/0x230 [f2fs] __recover_dot_dentries+0x2d6/0x470 [f2fs] f2fs_lookup+0x5af/0x6a0 [f2fs] __lookup_slow+0xac/0x200 lookup_slow+0x45/0x70 walk_component+0x16c/0x250 path_lookupat+0x8b/0x1f0 filename_lookup+0xef/0x250 user_path_at_empty+0x46/0x70 vfs_statx+0x98/0x190 __do_sys_newlstat+0x41/0x90 __x64_sys_newlstat+0x1a/0x30 do_syscall_64+0x37/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xae The root cause is for special file: e.g. character, block, fifo or socket file, f2fs doesn't assign address space operations pointer array for mapping->a_ops field, so, in a fuzzed image, if inline_dots flag was tagged in special file, during lookup(), when f2fs runs into __recover_dot_dentries(), it will cause NULL pointer access once f2fs_add_regular_entry() calls a_ops->set_dirty_page(). 
Fixes: 510022a85839 ("f2fs: add F2FS_INLINE_DOTS to recover missing dot dentries") Reported-by: Wenqing Liu Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 816285414564..37bdda931e0c 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -458,6 +458,13 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino) return 0; } + if (!S_ISDIR(dir->i_mode)) { + f2fs_err(sbi, "inconsistent inode status, skip recovering inline_dots inode (ino:%lu, i_mode:%u, pino:%u)", + dir->i_ino, dir->i_mode, pino); + set_sbi_flag(sbi, SBI_NEED_FSCK); + return -ENOTDIR; + } + err = f2fs_dquot_initialize(dir); if (err) return err; From 2aaf51dd39afb6d01d13f1e6fe20b684733b37d5 Mon Sep 17 00:00:00 2001 From: Jakob Koschel Date: Fri, 1 Apr 2022 00:34:14 +0200 Subject: [PATCH 042/334] f2fs: fix dereference of stale list iterator after loop body The list iterator variable will be a bogus pointer if no break was hit. Dereferencing it (cur->page in this case) could load an out-of-bounds/undefined value making it unsafe to use that in the comparison to determine if the specific element was found. Since 'cur->page' *can* be out-of-bounds it cannot be guaranteed that by chance (or intention of an attacker) it matches the value of 'page' even though the correct element was not found. This is fixed by using a separate list iterator variable for the loop and only setting the original variable if a suitable element was found. Then determining if the element was found is simply checking if the variable is set. Fixes: 8c242db9b8c0 ("f2fs: fix stale ATOMIC_WRITTEN_PAGE private pointer") Signed-off-by: Jakob Koschel Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index bd9731cdec56..9dd9f88b75e9 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -355,16 +355,19 @@ void f2fs_drop_inmem_page(struct inode *inode, struct page *page) struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct list_head *head = &fi->inmem_pages; struct inmem_pages *cur = NULL; + struct inmem_pages *tmp; f2fs_bug_on(sbi, !page_private_atomic(page)); mutex_lock(&fi->inmem_lock); - list_for_each_entry(cur, head, list) { - if (cur->page == page) + list_for_each_entry(tmp, head, list) { + if (tmp->page == page) { + cur = tmp; break; + } } - f2fs_bug_on(sbi, list_empty(head) || cur->page != page); + f2fs_bug_on(sbi, !cur); list_del(&cur->list); mutex_unlock(&fi->inmem_lock); From df35435d4144ae3a8afeedf45ea43c5cc63a70eb Mon Sep 17 00:00:00 2001 From: Jakob Koschel Date: Tue, 12 Apr 2022 14:20:39 +0200 Subject: [PATCH 043/334] f2fs: Remove usage of list iterator past the loop for list_move_tail() In preparation to limit the scope of a list iterator to the list traversal loop, the usage of the list iterator variable 'next' should be avoided past the loop body [1]. Instead of calling list_move_tail() on 'next' after the loop, it is called within the loop if the correct location was found. After the loop it covers the case if no location was found and it should be inserted based on the 'head' of the list.
Link: https://lore.kernel.org/all/CAHk-=wgRr_D8CB-D9Kg-c=EHreAsk5SqXPwr9Y7k9sA6cWXJ6w@mail.gmail.com/ Signed-off-by: Jakob Koschel Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 9dd9f88b75e9..f344c1d65d2c 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -4089,10 +4089,12 @@ static void adjust_sit_entry_set(struct sit_entry_set *ses, return; list_for_each_entry_continue(next, head, set_list) - if (ses->entry_cnt <= next->entry_cnt) - break; + if (ses->entry_cnt <= next->entry_cnt) { + list_move_tail(&ses->set_list, &next->set_list); + return; + } - list_move_tail(&ses->set_list, &next->set_list); + list_move_tail(&ses->set_list, head); } static void add_sit_entry(unsigned int segno, struct list_head *head) From 9e3a845df9ea56387d2a6011299d44ddf21b3322 Mon Sep 17 00:00:00 2001 From: Jakob Koschel Date: Tue, 12 Apr 2022 14:20:40 +0200 Subject: [PATCH 044/334] f2fs: replace usage of found with dedicated list iterator variable To move the list iterator variable into the list_for_each_entry_*() macro in the future it should be avoided to use the list iterator variable after the loop body. To *never* use the list iterator variable after the loop it was concluded to use a separate iterator variable instead of a found boolean [1]. This removes the need to use a found variable and simply checking if the variable was set, can determine if the break/goto was hit. Link: https://lore.kernel.org/all/CAHk-=wgRr_D8CB-D9Kg-c=EHreAsk5SqXPwr9Y7k9sA6cWXJ6w@mail.gmail.com/ Signed-off-by: Jakob Koschel Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index f344c1d65d2c..c9b3224ef936 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1669,33 +1669,32 @@ static unsigned int __wait_discard_cmd_range(struct f2fs_sb_info *sbi, struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct list_head *wait_list = (dpolicy->type == DPOLICY_FSTRIM) ? &(dcc->fstrim_list) : &(dcc->wait_list); - struct discard_cmd *dc, *tmp; - bool need_wait; + struct discard_cmd *dc = NULL, *iter, *tmp; unsigned int trimmed = 0; next: - need_wait = false; + dc = NULL; mutex_lock(&dcc->cmd_lock); - list_for_each_entry_safe(dc, tmp, wait_list, list) { - if (dc->lstart + dc->len <= start || end <= dc->lstart) + list_for_each_entry_safe(iter, tmp, wait_list, list) { + if (iter->lstart + iter->len <= start || end <= iter->lstart) continue; - if (dc->len < dpolicy->granularity) + if (iter->len < dpolicy->granularity) continue; - if (dc->state == D_DONE && !dc->ref) { - wait_for_completion_io(&dc->wait); - if (!dc->error) - trimmed += dc->len; - __remove_discard_cmd(sbi, dc); + if (iter->state == D_DONE && !iter->ref) { + wait_for_completion_io(&iter->wait); + if (!iter->error) + trimmed += iter->len; + __remove_discard_cmd(sbi, iter); } else { - dc->ref++; - need_wait = true; + iter->ref++; + dc = iter; break; } } mutex_unlock(&dcc->cmd_lock); - if (need_wait) { + if (dc) { trimmed += __wait_one_discard_bio(sbi, dc); goto next; } From c2eecefec5df1306eafce28ccdf1ca159a552ecc Mon Sep 17 00:00:00 2001 From: Hangyu Hua Date: Tue, 26 Apr 2022 14:05:34 +0800 Subject: [PATCH 045/334] rpmsg: virtio: Fix possible double free in rpmsg_probe() vch will be free in virtio_rpmsg_release_device() when rpmsg_ns_register_device() fails. There is no need to call kfree() again. 
Fix this by changing the error path from free_vch to free_ctrldev. Fixes: c486682ae1e2 ("rpmsg: virtio: Register the rpmsg_char device") Signed-off-by: Hangyu Hua Tested-by: Arnaud Pouliquen Link: https://lore.kernel.org/r/20220426060536.15594-2-hbh25y@gmail.com Signed-off-by: Mathieu Poirier --- drivers/rpmsg/virtio_rpmsg_bus.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/rpmsg/virtio_rpmsg_bus.c b/drivers/rpmsg/virtio_rpmsg_bus.c index 3ede25b1f2e4..d4e453062062 100644 --- a/drivers/rpmsg/virtio_rpmsg_bus.c +++ b/drivers/rpmsg/virtio_rpmsg_bus.c @@ -973,7 +973,8 @@ static int rpmsg_probe(struct virtio_device *vdev) err = rpmsg_ns_register_device(rpdev_ns); if (err) - goto free_vch; + /* vch will be free in virtio_rpmsg_release_device() */ + goto free_ctrldev; } /* @@ -997,8 +998,6 @@ static int rpmsg_probe(struct virtio_device *vdev) return 0; -free_vch: - kfree(vch); free_ctrldev: rpmsg_virtio_del_ctrl_dev(rpdev_ctrl); free_coherent: From 1680939e9ecf7764fba8689cfb3429c2fe2bb23c Mon Sep 17 00:00:00 2001 From: Hangyu Hua Date: Tue, 26 Apr 2022 14:05:35 +0800 Subject: [PATCH 046/334] rpmsg: virtio: Fix possible double free in rpmsg_virtio_add_ctrl_dev() vch will be freed in virtio_rpmsg_release_device() when rpmsg_ctrldev_register_device() fails. There is no need to call kfree() again. Fixes: c486682ae1e2 ("rpmsg: virtio: Register the rpmsg_char device") Signed-off-by: Hangyu Hua Tested-by: Arnaud Pouliquen Link: https://lore.kernel.org/r/20220426060536.15594-3-hbh25y@gmail.com Signed-off-by: Mathieu Poirier --- drivers/rpmsg/virtio_rpmsg_bus.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/rpmsg/virtio_rpmsg_bus.c b/drivers/rpmsg/virtio_rpmsg_bus.c index d4e453062062..9948a7335b83 100644 --- a/drivers/rpmsg/virtio_rpmsg_bus.c +++ b/drivers/rpmsg/virtio_rpmsg_bus.c @@ -851,7 +851,7 @@ static struct rpmsg_device *rpmsg_virtio_add_ctrl_dev(struct virtio_device *vdev err = rpmsg_ctrldev_register_device(rpdev_ctrl); if (err) { - kfree(vch); + /* vch will be free in virtio_rpmsg_release_device() */ return ERR_PTR(err); } From df191796985922488e4e6b64f7bd79c3934412f2 Mon Sep 17 00:00:00 2001 From: Arnaud Pouliquen Date: Tue, 26 Apr 2022 14:05:36 +0800 Subject: [PATCH 047/334] rpmsg: virtio: Fix the unregistration of the device rpmsg_ctrl Unregister the rpmsg_ctrl device instead of just freeing the virtio_rpmsg_channel structure. This will properly unregister the device and call virtio_rpmsg_release_device() that frees the structure.
Fixes: c486682ae1e2 ("rpmsg: virtio: Register the rpmsg_char device") Signed-off-by: Arnaud Pouliquen Reviewed-by: Hangyu Hua Link: https://lore.kernel.org/r/20220426060536.15594-4-hbh25y@gmail.com Signed-off-by: Mathieu Poirier --- drivers/rpmsg/virtio_rpmsg_bus.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/rpmsg/virtio_rpmsg_bus.c b/drivers/rpmsg/virtio_rpmsg_bus.c index 9948a7335b83..905ac7910c98 100644 --- a/drivers/rpmsg/virtio_rpmsg_bus.c +++ b/drivers/rpmsg/virtio_rpmsg_bus.c @@ -862,7 +862,7 @@ static void rpmsg_virtio_del_ctrl_dev(struct rpmsg_device *rpdev_ctrl) { if (!rpdev_ctrl) return; - kfree(to_virtio_rpmsg_channel(rpdev_ctrl)); + device_unregister(&rpdev_ctrl->dev); } static int rpmsg_probe(struct virtio_device *vdev) From 31976eb180a199bc8d3e19b3e493158b1337c579 Mon Sep 17 00:00:00 2001 From: Allen-KH Cheng Date: Tue, 19 Apr 2022 20:33:30 +0800 Subject: [PATCH 048/334] dt-bindings: remoteproc: mediatek: Add firmware-name property The SCP needs firmware which differs between other platforms and SoCs. Add a new property "firmware-name" to allow the DT to specify the platform/board specific path to this firmware file. Signed-off-by: Allen-KH Cheng Reviewed-by: AngeloGioacchino Del Regno Acked-by: Rob Herring Link: https://lore.kernel.org/r/20220419123331.14377-2-allen-kh.cheng@mediatek.com Signed-off-by: Mathieu Poirier --- Documentation/devicetree/bindings/remoteproc/mtk,scp.yaml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Documentation/devicetree/bindings/remoteproc/mtk,scp.yaml b/Documentation/devicetree/bindings/remoteproc/mtk,scp.yaml index 823a236242de..abc11ac92818 100644 --- a/Documentation/devicetree/bindings/remoteproc/mtk,scp.yaml +++ b/Documentation/devicetree/bindings/remoteproc/mtk,scp.yaml @@ -45,6 +45,13 @@ properties: interrupts: maxItems: 1 + firmware-name: + $ref: /schemas/types.yaml#/definitions/string + description: + If present, name (or relative path) of the file within the + firmware search path containing the firmware image used when + initializing SCP. + required: - compatible - reg From 1552de67fbf0a51eab127fc15d55e9e1befdb156 Mon Sep 17 00:00:00 2001 From: Allen-KH Cheng Date: Tue, 19 Apr 2022 20:33:31 +0800 Subject: [PATCH 049/334] remoteproc: mediatek: Allow reading firmware-name from DT The SCP firmware blob differs between platforms and SoCs. We add support in the SCP driver for reading the path of firmware file from DT in order to allow these files to live in a generic file system (or linux-firmware). The firmware-name property is optional and the code falls back to the old filename if the property isn't present. 
Signed-off-by: Allen-KH Cheng Reviewed-by: Rex-BC Chen Reviewed-by: AngeloGioacchino Del Regno Link: https://lore.kernel.org/r/20220419123331.14377-3-allen-kh.cheng@mediatek.com Signed-off-by: Mathieu Poirier --- drivers/remoteproc/mtk_scp.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/remoteproc/mtk_scp.c b/drivers/remoteproc/mtk_scp.c index 621174ea7fd6..47b2a40e1b4a 100644 --- a/drivers/remoteproc/mtk_scp.c +++ b/drivers/remoteproc/mtk_scp.c @@ -809,9 +809,13 @@ static int scp_probe(struct platform_device *pdev) struct mtk_scp *scp; struct rproc *rproc; struct resource *res; - char *fw_name = "scp.img"; + const char *fw_name = "scp.img"; int ret, i; + ret = rproc_of_parse_firmware(dev, 0, &fw_name); + if (ret < 0 && ret != -EINVAL) + return ret; + rproc = devm_rproc_alloc(dev, np->name, &scp_ops, fw_name, sizeof(*scp)); if (!rproc) return dev_err_probe(dev, -ENOMEM, "unable to allocate remoteproc\n"); From 59d6f72f6f9c92fec8757d9e29527da828e9281f Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Sat, 23 Apr 2022 11:39:32 +0200 Subject: [PATCH 050/334] rpmsg: qcom_smd: Fix returning 0 if irq_of_parse_and_map() fails irq_of_parse_and_map() returns 0 on failure, so this should not be passed further as error return code. Fixes: 1a358d350664 ("rpmsg: qcom_smd: Fix irq_of_parse_and_map() return value") Signed-off-by: Krzysztof Kozlowski Signed-off-by: Bjorn Andersson Link: https://lore.kernel.org/r/20220423093932.32136-1-krzysztof.kozlowski@linaro.org --- drivers/rpmsg/qcom_smd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/rpmsg/qcom_smd.c b/drivers/rpmsg/qcom_smd.c index 6ccfa12abd10..1957b27c4cf3 100644 --- a/drivers/rpmsg/qcom_smd.c +++ b/drivers/rpmsg/qcom_smd.c @@ -1409,7 +1409,7 @@ static int qcom_smd_parse_edge(struct device *dev, irq = irq_of_parse_and_map(node, 0); if (!irq) { dev_err(dev, "required smd interrupt missing\n"); - ret = irq; + ret = -EINVAL; goto put_node; } From e56a4be2843c95c08cf8421dc1f8e880cafbaf91 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Mon, 25 Apr 2022 00:13:01 +0200 Subject: [PATCH 051/334] power: supply: core: Initialize struct to zero As we rely on pointers in the battery info to be zero-initialized such as in the helper function power_supply_supports_vbat2ri() we certainly need to allocate the struct power_supply_battery_info with kzalloc() as well. Else this happens: Unable to handle kernel paging request at virtual address 00280000 (...) 
PC is at power_supply_vbat2ri+0x50/0x12c LR is at ab8500_fg_battery_resistance+0x34/0x108 Fixes: e9e7d165b4b0 ("power: supply: Support VBAT-to-Ri lookup tables") Signed-off-by: Linus Walleij Signed-off-by: Sebastian Reichel --- drivers/power/supply/power_supply_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/power/supply/power_supply_core.c b/drivers/power/supply/power_supply_core.c index d925cb137e12..fad5890c899e 100644 --- a/drivers/power/supply/power_supply_core.c +++ b/drivers/power/supply/power_supply_core.c @@ -616,7 +616,7 @@ int power_supply_get_battery_info(struct power_supply *psy, goto out_put_node; } - info = devm_kmalloc(&psy->dev, sizeof(*info), GFP_KERNEL); + info = devm_kzalloc(&psy->dev, sizeof(*info), GFP_KERNEL); if (!info) { err = -ENOMEM; goto out_put_node; From 34f243e9fb5ace1ca760c72e366247eaeff430c0 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Mon, 2 May 2022 13:12:34 +0200 Subject: [PATCH 052/334] power: supply: axp288_fuel_gauge: Fix battery reporting on the One Mix 1 Commit 3a06b912a5ce ("power: supply: axp288_fuel_gauge: Make "T3 MRD" no_battery_list DMI entry more generic") added a generic no-battery DMI match for many mini-PCs / HDMI-sticks which use "T3 MRD" as their DMI board-name. It turns out that the One Mix 1 mini laptop also uses "T3 MRD" for its DMI boardname and it also has its chassis-type wrongly set to a value of "3" (desktop). This was causing the axp288_fuel_gauge driver to disable battery reporting because this matches the no-battery DMI list entry for generic "T3 MRD" mini-PCs. Change the no-battery DMI list into a quirks DMI list and add a specific match for the One Mix 1 mini laptop before the generic "T3 MRD" no-battery quirk entry to fix this. Fixes: 3a06b912a5ce ("power: supply: axp288_fuel_gauge: Make "T3 MRD" no_battery_list DMI entry more generic") Signed-off-by: Hans de Goede Signed-off-by: Sebastian Reichel --- drivers/power/supply/axp288_fuel_gauge.c | 40 +++++++++++++++++++++--- 1 file changed, 36 insertions(+), 4 deletions(-) diff --git a/drivers/power/supply/axp288_fuel_gauge.c b/drivers/power/supply/axp288_fuel_gauge.c index e9f285dae489..5b8aa4a980cd 100644 --- a/drivers/power/supply/axp288_fuel_gauge.c +++ b/drivers/power/supply/axp288_fuel_gauge.c @@ -90,6 +90,8 @@ #define AXP288_REG_UPDATE_INTERVAL (60 * HZ) #define AXP288_FG_INTR_NUM 6 +#define AXP288_QUIRK_NO_BATTERY BIT(0) + static bool no_current_sense_res; module_param(no_current_sense_res, bool, 0444); MODULE_PARM_DESC(no_current_sense_res, "No (or broken) current sense resistor"); @@ -524,7 +526,7 @@ static struct power_supply_desc fuel_gauge_desc = { * detection reports one despite it not being there. * Please keep this listed sorted alphabetically. 
*/ -static const struct dmi_system_id axp288_no_battery_list[] = { +static const struct dmi_system_id axp288_quirks[] = { { /* ACEPC T8 Cherry Trail Z8350 mini PC */ .matches = { @@ -534,6 +536,7 @@ static const struct dmi_system_id axp288_no_battery_list[] = { /* also match on somewhat unique bios-version */ DMI_EXACT_MATCH(DMI_BIOS_VERSION, "1.000"), }, + .driver_data = (void *)AXP288_QUIRK_NO_BATTERY, }, { /* ACEPC T11 Cherry Trail Z8350 mini PC */ @@ -544,6 +547,7 @@ static const struct dmi_system_id axp288_no_battery_list[] = { /* also match on somewhat unique bios-version */ DMI_EXACT_MATCH(DMI_BIOS_VERSION, "1.000"), }, + .driver_data = (void *)AXP288_QUIRK_NO_BATTERY, }, { /* Intel Cherry Trail Compute Stick, Windows version */ @@ -551,6 +555,7 @@ static const struct dmi_system_id axp288_no_battery_list[] = { DMI_MATCH(DMI_SYS_VENDOR, "Intel"), DMI_MATCH(DMI_PRODUCT_NAME, "STK1AW32SC"), }, + .driver_data = (void *)AXP288_QUIRK_NO_BATTERY, }, { /* Intel Cherry Trail Compute Stick, version without an OS */ @@ -558,34 +563,55 @@ static const struct dmi_system_id axp288_no_battery_list[] = { DMI_MATCH(DMI_SYS_VENDOR, "Intel"), DMI_MATCH(DMI_PRODUCT_NAME, "STK1A32SC"), }, + .driver_data = (void *)AXP288_QUIRK_NO_BATTERY, }, { /* Meegopad T02 */ .matches = { DMI_MATCH(DMI_PRODUCT_NAME, "MEEGOPAD T02"), }, + .driver_data = (void *)AXP288_QUIRK_NO_BATTERY, }, { /* Mele PCG03 Mini PC */ .matches = { DMI_EXACT_MATCH(DMI_BOARD_VENDOR, "Mini PC"), DMI_EXACT_MATCH(DMI_BOARD_NAME, "Mini PC"), }, + .driver_data = (void *)AXP288_QUIRK_NO_BATTERY, }, { /* Minix Neo Z83-4 mini PC */ .matches = { DMI_MATCH(DMI_SYS_VENDOR, "MINIX"), DMI_MATCH(DMI_PRODUCT_NAME, "Z83-4"), - } + }, + .driver_data = (void *)AXP288_QUIRK_NO_BATTERY, }, { - /* Various Ace PC/Meegopad/MinisForum/Wintel Mini-PCs/HDMI-sticks */ + /* + * One Mix 1, this uses the "T3 MRD" boardname used by + * generic mini PCs, but it is a mini laptop so it does + * actually have a battery! + */ + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "T3 MRD"), + DMI_MATCH(DMI_BIOS_DATE, "06/14/2018"), + }, + .driver_data = NULL, + }, + { + /* + * Various Ace PC/Meegopad/MinisForum/Wintel Mini-PCs/HDMI-sticks + * This entry must be last because it is generic, this allows + * adding more specifuc quirks overriding this generic entry. 
+ */ .matches = { DMI_MATCH(DMI_BOARD_NAME, "T3 MRD"), DMI_MATCH(DMI_CHASSIS_TYPE, "3"), DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."), DMI_MATCH(DMI_BIOS_VERSION, "5.11"), }, + .driver_data = (void *)AXP288_QUIRK_NO_BATTERY, }, {} }; @@ -665,7 +691,9 @@ static int axp288_fuel_gauge_probe(struct platform_device *pdev) [BAT_D_CURR] = "axp288-chrg-d-curr", [BAT_VOLT] = "axp288-batt-volt", }; + const struct dmi_system_id *dmi_id; struct device *dev = &pdev->dev; + unsigned long quirks = 0; int i, pirq, ret; /* @@ -675,7 +703,11 @@ static int axp288_fuel_gauge_probe(struct platform_device *pdev) if (!acpi_quirk_skip_acpi_ac_and_battery()) return -ENODEV; - if (dmi_check_system(axp288_no_battery_list)) + dmi_id = dmi_first_match(axp288_quirks); + if (dmi_id) + quirks = (unsigned long)dmi_id->driver_data; + + if (quirks & AXP288_QUIRK_NO_BATTERY) return -ENODEV; info = devm_kzalloc(dev, sizeof(*info), GFP_KERNEL); From f61509a6f0b70f5bedea34efaf8065621689bd7a Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Mon, 2 May 2022 13:12:35 +0200 Subject: [PATCH 053/334] power: supply: axp288_fuel_gauge: Drop BIOS version check from "T3 MRD" DMI quirk Some "T3 MRD" mini-PCs / HDMI-sticks without a battery use a different value then "5.11" for their DMI BIOS version field. Drop the BIOS version check so that the no-battery "T3 MRD" DMI quirk applies to these too. Fixes: 3a06b912a5ce ("power: supply: axp288_fuel_gauge: Make "T3 MRD" no_battery_list DMI entry more generic") Signed-off-by: Hans de Goede Signed-off-by: Sebastian Reichel --- drivers/power/supply/axp288_fuel_gauge.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/power/supply/axp288_fuel_gauge.c b/drivers/power/supply/axp288_fuel_gauge.c index 5b8aa4a980cd..8e6f8a655079 100644 --- a/drivers/power/supply/axp288_fuel_gauge.c +++ b/drivers/power/supply/axp288_fuel_gauge.c @@ -609,7 +609,6 @@ static const struct dmi_system_id axp288_quirks[] = { DMI_MATCH(DMI_BOARD_NAME, "T3 MRD"), DMI_MATCH(DMI_CHASSIS_TYPE, "3"), DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."), - DMI_MATCH(DMI_BIOS_VERSION, "5.11"), }, .driver_data = (void *)AXP288_QUIRK_NO_BATTERY, }, From 010ddb813f3554cbbf8bd13b731452236a2c8017 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Sat, 23 Apr 2022 19:27:27 +0200 Subject: [PATCH 054/334] power: supply: ab8500_fg: Allocate wq in probe The workqueue is allocated in bind() but all interrupts are registered in probe(). Some interrupts put work on the workqueue, which can have bad side effects. Allocate the workqueue in probe() instead, destroy it in .remove() and make unbind() simply flush the workqueue. 
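The hazard described above applies to any driver that splits setup between probe() and a component bind() callback; the sketch below uses hypothetical names, not the ab8500 code. Anything an interrupt handler can queue work on must already exist when the interrupt is requested, so the workqueue belongs in probe(), with the matching destroy_workqueue() in remove() and only a flush in unbind().

#include <linux/interrupt.h>
#include <linux/platform_device.h>
#include <linux/workqueue.h>

struct example_priv {
        struct workqueue_struct *wq;
        struct work_struct work;
};

static void example_work_fn(struct work_struct *work)
{
        /* runs on the ordered workqueue */
}

static irqreturn_t example_irq_thread(int irq, void *data)
{
        struct example_priv *p = data;

        queue_work(p->wq, &p->work);            /* p->wq must already exist */
        return IRQ_HANDLED;
}

static int example_probe(struct platform_device *pdev)
{
        struct example_priv *p;
        int irq, ret;

        p = devm_kzalloc(&pdev->dev, sizeof(*p), GFP_KERNEL);
        if (!p)
                return -ENOMEM;

        irq = platform_get_irq(pdev, 0);
        if (irq < 0)
                return irq;

        /* Create the workqueue before any IRQ that may use it goes live. */
        p->wq = alloc_ordered_workqueue("example_wq", WQ_MEM_RECLAIM);
        if (!p->wq)
                return -ENOMEM;
        INIT_WORK(&p->work, example_work_fn);

        ret = devm_request_threaded_irq(&pdev->dev, irq, NULL,
                                        example_irq_thread, IRQF_ONESHOT,
                                        "example", p);
        if (ret)
                destroy_workqueue(p->wq);       /* error path; remove() would destroy it otherwise */
        return ret;
}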
Fixes: 1c1f13a006ed ("power: supply: ab8500: Move to componentized binding") Signed-off-by: Linus Walleij Signed-off-by: Sebastian Reichel --- drivers/power/supply/ab8500_fg.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/drivers/power/supply/ab8500_fg.c b/drivers/power/supply/ab8500_fg.c index 97ac588a9e9c..ec8a404d71b4 100644 --- a/drivers/power/supply/ab8500_fg.c +++ b/drivers/power/supply/ab8500_fg.c @@ -3037,13 +3037,6 @@ static int ab8500_fg_bind(struct device *dev, struct device *master, { struct ab8500_fg *di = dev_get_drvdata(dev); - /* Create a work queue for running the FG algorithm */ - di->fg_wq = alloc_ordered_workqueue("ab8500_fg_wq", WQ_MEM_RECLAIM); - if (di->fg_wq == NULL) { - dev_err(dev, "failed to create work queue\n"); - return -ENOMEM; - } - di->bat_cap.max_mah_design = di->bm->bi->charge_full_design_uah; di->bat_cap.max_mah = di->bat_cap.max_mah_design; di->vbat_nom_uv = di->bm->bi->voltage_max_design_uv; @@ -3067,8 +3060,7 @@ static void ab8500_fg_unbind(struct device *dev, struct device *master, if (ret) dev_err(dev, "failed to disable coulomb counter\n"); - destroy_workqueue(di->fg_wq); - flush_scheduled_work(); + flush_workqueue(di->fg_wq); } static const struct component_ops ab8500_fg_component_ops = { @@ -3117,6 +3109,13 @@ static int ab8500_fg_probe(struct platform_device *pdev) ab8500_fg_charge_state_to(di, AB8500_FG_CHARGE_INIT); ab8500_fg_discharge_state_to(di, AB8500_FG_DISCHARGE_INIT); + /* Create a work queue for running the FG algorithm */ + di->fg_wq = alloc_ordered_workqueue("ab8500_fg_wq", WQ_MEM_RECLAIM); + if (di->fg_wq == NULL) { + dev_err(dev, "failed to create work queue\n"); + return -ENOMEM; + } + /* Init work for running the fg algorithm instantly */ INIT_WORK(&di->fg_work, ab8500_fg_instant_work); @@ -3227,6 +3226,8 @@ static int ab8500_fg_remove(struct platform_device *pdev) { struct ab8500_fg *di = platform_get_drvdata(pdev); + destroy_workqueue(di->fg_wq); + flush_scheduled_work(); component_del(&pdev->dev, &ab8500_fg_component_ops); list_del(&di->node); ab8500_fg_sysfs_exit(di); From 4bd69ecfa672e94334618ef9ec653b2a237cf2d9 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Fri, 22 Apr 2022 14:22:23 -0500 Subject: [PATCH 055/334] dt-bindings: i3c: Convert cdns,i3c-master to DT schema Convert the Cadence I3C master to DT schema format. This fixes a warning as it is used in the i3c.yaml example. The "nintendo,nunchuk" is not documented by a schema, so change the example child device to something which is documented. 
Signed-off-by: Rob Herring Reviewed-by: Krzysztof Kozlowski Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20220422192224.2594098-1-robh@kernel.org --- .../bindings/i3c/cdns,i3c-master.txt | 43 ------------- .../bindings/i3c/cdns,i3c-master.yaml | 60 +++++++++++++++++++ 2 files changed, 60 insertions(+), 43 deletions(-) delete mode 100644 Documentation/devicetree/bindings/i3c/cdns,i3c-master.txt create mode 100644 Documentation/devicetree/bindings/i3c/cdns,i3c-master.yaml diff --git a/Documentation/devicetree/bindings/i3c/cdns,i3c-master.txt b/Documentation/devicetree/bindings/i3c/cdns,i3c-master.txt deleted file mode 100644 index 3716589d6999..000000000000 --- a/Documentation/devicetree/bindings/i3c/cdns,i3c-master.txt +++ /dev/null @@ -1,43 +0,0 @@ -Bindings for cadence I3C master block -===================================== - -Required properties: --------------------- -- compatible: shall be "cdns,i3c-master" -- clocks: shall reference the pclk and sysclk -- clock-names: shall contain "pclk" and "sysclk" -- interrupts: the interrupt line connected to this I3C master -- reg: I3C master registers - -Mandatory properties defined by the generic binding (see -Documentation/devicetree/bindings/i3c/i3c.yaml for more details): - -- #address-cells: shall be set to 1 -- #size-cells: shall be set to 0 - -Optional properties defined by the generic binding (see -Documentation/devicetree/bindings/i3c/i3c.yaml for more details): - -- i2c-scl-hz -- i3c-scl-hz - -I3C device connected on the bus follow the generic description (see -Documentation/devicetree/bindings/i3c/i3c.yaml for more details). - -Example: - - i3c-master@0d040000 { - compatible = "cdns,i3c-master"; - clocks = <&coreclock>, <&i3csysclock>; - clock-names = "pclk", "sysclk"; - interrupts = <3 0>; - reg = <0x0d040000 0x1000>; - #address-cells = <1>; - #size-cells = <0>; - i2c-scl-hz = <100000>; - - nunchuk: nunchuk@52 { - compatible = "nintendo,nunchuk"; - reg = <0x52 0x0 0x10>; - }; - }; diff --git a/Documentation/devicetree/bindings/i3c/cdns,i3c-master.yaml b/Documentation/devicetree/bindings/i3c/cdns,i3c-master.yaml new file mode 100644 index 000000000000..cc40d25358ec --- /dev/null +++ b/Documentation/devicetree/bindings/i3c/cdns,i3c-master.yaml @@ -0,0 +1,60 @@ +# SPDX-License-Identifier: GPL-2.0 +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/i3c/cdns,i3c-master.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Cadence I3C master block + +maintainers: + - Boris Brezillon + +allOf: + - $ref: i3c.yaml# + +properties: + compatible: + const: cdns,i3c-master + + reg: + maxItems: 1 + + clocks: + maxItems: 2 + + clock-names: + items: + - const: pclk + - const: sysclk + + interrupts: + maxItems: 1 + +required: + - compatible + - reg + - clocks + - clock-names + - interrupts + +unevaluatedProperties: false + +examples: + - | + i3c-master@d040000 { + compatible = "cdns,i3c-master"; + clocks = <&coreclock>, <&i3csysclock>; + clock-names = "pclk", "sysclk"; + interrupts = <3 0>; + reg = <0x0d040000 0x1000>; + #address-cells = <3>; + #size-cells = <0>; + i2c-scl-hz = <100000>; + + eeprom@57{ + compatible = "atmel,24c01"; + reg = <0x57 0x0 0x10>; + pagesize = <0x8>; + }; + }; +... From 6742ca620bd929ee96bd6724692132056203fb95 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Fri, 22 Apr 2022 14:22:36 -0500 Subject: [PATCH 056/334] dt-bindings: i3c: Convert snps,dw-i3c-master to DT schema Convert the Synopsys Designware I3C master to DT schema format. 
Signed-off-by: Rob Herring Reviewed-by: Krzysztof Kozlowski Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20220422192236.2594577-1-robh@kernel.org --- .../bindings/i3c/snps,dw-i3c-master.txt | 41 --------------- .../bindings/i3c/snps,dw-i3c-master.yaml | 52 +++++++++++++++++++ 2 files changed, 52 insertions(+), 41 deletions(-) delete mode 100644 Documentation/devicetree/bindings/i3c/snps,dw-i3c-master.txt create mode 100644 Documentation/devicetree/bindings/i3c/snps,dw-i3c-master.yaml diff --git a/Documentation/devicetree/bindings/i3c/snps,dw-i3c-master.txt b/Documentation/devicetree/bindings/i3c/snps,dw-i3c-master.txt deleted file mode 100644 index 07f35f36085d..000000000000 --- a/Documentation/devicetree/bindings/i3c/snps,dw-i3c-master.txt +++ /dev/null @@ -1,41 +0,0 @@ -Bindings for Synopsys DesignWare I3C master block -================================================= - -Required properties: --------------------- -- compatible: shall be "snps,dw-i3c-master-1.00a" -- clocks: shall reference the core_clk -- interrupts: the interrupt line connected to this I3C master -- reg: Offset and length of I3C master registers - -Mandatory properties defined by the generic binding (see -Documentation/devicetree/bindings/i3c/i3c.yaml for more details): - -- #address-cells: shall be set to 3 -- #size-cells: shall be set to 0 - -Optional properties defined by the generic binding (see -Documentation/devicetree/bindings/i3c/i3c.yaml for more details): - -- i2c-scl-hz -- i3c-scl-hz - -I3C device connected on the bus follow the generic description (see -Documentation/devicetree/bindings/i3c/i3c.yaml for more details). - -Example: - - i3c-master@2000 { - compatible = "snps,dw-i3c-master-1.00a"; - #address-cells = <3>; - #size-cells = <0>; - reg = <0x02000 0x1000>; - interrupts = <0>; - clocks = <&i3cclk>; - - eeprom@57{ - compatible = "atmel,24c01"; - reg = <0x57 0x0 0x10>; - pagesize = <0x8>; - }; - }; diff --git a/Documentation/devicetree/bindings/i3c/snps,dw-i3c-master.yaml b/Documentation/devicetree/bindings/i3c/snps,dw-i3c-master.yaml new file mode 100644 index 000000000000..7a76fd32962a --- /dev/null +++ b/Documentation/devicetree/bindings/i3c/snps,dw-i3c-master.yaml @@ -0,0 +1,52 @@ +# SPDX-License-Identifier: GPL-2.0 +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/i3c/snps,dw-i3c-master.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Synopsys DesignWare I3C master block + +maintainers: + - Alexandre Belloni + +allOf: + - $ref: i3c.yaml# + +properties: + compatible: + const: snps,dw-i3c-master-1.00a + + reg: + maxItems: 1 + + clocks: + maxItems: 1 + + interrupts: + maxItems: 1 + +required: + - compatible + - reg + - clocks + - interrupts + +unevaluatedProperties: false + +examples: + - | + i3c-master@2000 { + compatible = "snps,dw-i3c-master-1.00a"; + #address-cells = <3>; + #size-cells = <0>; + reg = <0x02000 0x1000>; + interrupts = <0>; + clocks = <&i3cclk>; + + eeprom@57{ + compatible = "atmel,24c01"; + reg = <0x57 0x0 0x10>; + pagesize = <0x8>; + }; + }; +... From 68fdbe090c362e8be23890a7333d156e18c27781 Mon Sep 17 00:00:00 2001 From: "Sicelo A. Mhlongo" Date: Wed, 20 Apr 2022 14:30:59 +0200 Subject: [PATCH 057/334] power: supply: bq27xxx: expose battery data when CI=1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the Capacity Inaccurate flag is set, the chip still provides data about the battery, albeit inaccurate. 
Instead of discarding capacity values for CI=1, expose the stale data and use the POWER_SUPPLY_HEALTH_CALIBRATION_REQUIRED property to indicate that the values should be used with care. Reviewed-by: Pali Rohár Signed-off-by: Sicelo A. Mhlongo Signed-off-by: Sebastian Reichel --- drivers/power/supply/bq27xxx_battery.c | 58 ++++++++++++-------------- 1 file changed, 26 insertions(+), 32 deletions(-) diff --git a/drivers/power/supply/bq27xxx_battery.c b/drivers/power/supply/bq27xxx_battery.c index 72e727cd31e8..35e6a394c0df 100644 --- a/drivers/power/supply/bq27xxx_battery.c +++ b/drivers/power/supply/bq27xxx_battery.c @@ -1572,14 +1572,6 @@ static int bq27xxx_battery_read_charge(struct bq27xxx_device_info *di, u8 reg) */ static inline int bq27xxx_battery_read_nac(struct bq27xxx_device_info *di) { - int flags; - - if (di->opts & BQ27XXX_O_ZERO) { - flags = bq27xxx_read(di, BQ27XXX_REG_FLAGS, true); - if (flags >= 0 && (flags & BQ27000_FLAG_CI)) - return -ENODATA; - } - return bq27xxx_battery_read_charge(di, BQ27XXX_REG_NAC); } @@ -1742,6 +1734,18 @@ static bool bq27xxx_battery_dead(struct bq27xxx_device_info *di, u16 flags) return flags & (BQ27XXX_FLAG_SOC1 | BQ27XXX_FLAG_SOCF); } +/* + * Returns true if reported battery capacity is inaccurate + */ +static bool bq27xxx_battery_capacity_inaccurate(struct bq27xxx_device_info *di, + u16 flags) +{ + if (di->opts & BQ27XXX_O_HAS_CI) + return (flags & BQ27000_FLAG_CI); + else + return false; +} + static int bq27xxx_battery_read_health(struct bq27xxx_device_info *di) { /* Unlikely but important to return first */ @@ -1751,6 +1755,8 @@ static int bq27xxx_battery_read_health(struct bq27xxx_device_info *di) return POWER_SUPPLY_HEALTH_COLD; if (unlikely(bq27xxx_battery_dead(di, di->cache.flags))) return POWER_SUPPLY_HEALTH_DEAD; + if (unlikely(bq27xxx_battery_capacity_inaccurate(di, di->cache.flags))) + return POWER_SUPPLY_HEALTH_CALIBRATION_REQUIRED; return POWER_SUPPLY_HEALTH_GOOD; } @@ -1758,7 +1764,6 @@ static int bq27xxx_battery_read_health(struct bq27xxx_device_info *di) void bq27xxx_battery_update(struct bq27xxx_device_info *di) { struct bq27xxx_reg_cache cache = {0, }; - bool has_ci_flag = di->opts & BQ27XXX_O_HAS_CI; bool has_singe_flag = di->opts & BQ27XXX_O_ZERO; cache.flags = bq27xxx_read(di, BQ27XXX_REG_FLAGS, has_singe_flag); @@ -1766,30 +1771,19 @@ void bq27xxx_battery_update(struct bq27xxx_device_info *di) cache.flags = -1; /* read error */ if (cache.flags >= 0) { cache.temperature = bq27xxx_battery_read_temperature(di); - if (has_ci_flag && (cache.flags & BQ27000_FLAG_CI)) { - dev_info_once(di->dev, "battery is not calibrated! 
ignoring capacity values\n"); - cache.capacity = -ENODATA; - cache.energy = -ENODATA; - cache.time_to_empty = -ENODATA; - cache.time_to_empty_avg = -ENODATA; - cache.time_to_full = -ENODATA; - cache.charge_full = -ENODATA; - cache.health = -ENODATA; - } else { - if (di->regs[BQ27XXX_REG_TTE] != INVALID_REG_ADDR) - cache.time_to_empty = bq27xxx_battery_read_time(di, BQ27XXX_REG_TTE); - if (di->regs[BQ27XXX_REG_TTECP] != INVALID_REG_ADDR) - cache.time_to_empty_avg = bq27xxx_battery_read_time(di, BQ27XXX_REG_TTECP); - if (di->regs[BQ27XXX_REG_TTF] != INVALID_REG_ADDR) - cache.time_to_full = bq27xxx_battery_read_time(di, BQ27XXX_REG_TTF); + if (di->regs[BQ27XXX_REG_TTE] != INVALID_REG_ADDR) + cache.time_to_empty = bq27xxx_battery_read_time(di, BQ27XXX_REG_TTE); + if (di->regs[BQ27XXX_REG_TTECP] != INVALID_REG_ADDR) + cache.time_to_empty_avg = bq27xxx_battery_read_time(di, BQ27XXX_REG_TTECP); + if (di->regs[BQ27XXX_REG_TTF] != INVALID_REG_ADDR) + cache.time_to_full = bq27xxx_battery_read_time(di, BQ27XXX_REG_TTF); - cache.charge_full = bq27xxx_battery_read_fcc(di); - cache.capacity = bq27xxx_battery_read_soc(di); - if (di->regs[BQ27XXX_REG_AE] != INVALID_REG_ADDR) - cache.energy = bq27xxx_battery_read_energy(di); - di->cache.flags = cache.flags; - cache.health = bq27xxx_battery_read_health(di); - } + cache.charge_full = bq27xxx_battery_read_fcc(di); + cache.capacity = bq27xxx_battery_read_soc(di); + if (di->regs[BQ27XXX_REG_AE] != INVALID_REG_ADDR) + cache.energy = bq27xxx_battery_read_energy(di); + di->cache.flags = cache.flags; + cache.health = bq27xxx_battery_read_health(di); if (di->regs[BQ27XXX_REG_CYCT] != INVALID_REG_ADDR) cache.cycle_count = bq27xxx_battery_read_cyct(di); From d96a89407e5f682d1cb22569d91784506c784863 Mon Sep 17 00:00:00 2001 From: Minghao Chi Date: Tue, 12 Apr 2022 08:30:44 +0000 Subject: [PATCH 058/334] power: supply: bq24190_charger: using pm_runtime_resume_and_get instead of pm_runtime_get_sync Using pm_runtime_resume_and_get is more appropriate for simplifing code Reported-by: Zeal Robot Signed-off-by: Minghao Chi Signed-off-by: Sebastian Reichel --- drivers/power/supply/bq24190_charger.c | 63 +++++++++----------------- 1 file changed, 21 insertions(+), 42 deletions(-) diff --git a/drivers/power/supply/bq24190_charger.c b/drivers/power/supply/bq24190_charger.c index aa1a589eb9f2..27f5c7648617 100644 --- a/drivers/power/supply/bq24190_charger.c +++ b/drivers/power/supply/bq24190_charger.c @@ -455,11 +455,9 @@ static ssize_t bq24190_sysfs_show(struct device *dev, if (!info) return -EINVAL; - ret = pm_runtime_get_sync(bdi->dev); - if (ret < 0) { - pm_runtime_put_noidle(bdi->dev); + ret = pm_runtime_resume_and_get(bdi->dev); + if (ret < 0) return ret; - } ret = bq24190_read_mask(bdi, info->reg, info->mask, info->shift, &v); if (ret) @@ -490,11 +488,9 @@ static ssize_t bq24190_sysfs_store(struct device *dev, if (ret < 0) return ret; - ret = pm_runtime_get_sync(bdi->dev); - if (ret < 0) { - pm_runtime_put_noidle(bdi->dev); + ret = pm_runtime_resume_and_get(bdi->dev); + if (ret < 0) return ret; - } ret = bq24190_write_mask(bdi, info->reg, info->mask, info->shift, v); if (ret) @@ -512,10 +508,9 @@ static int bq24190_set_otg_vbus(struct bq24190_dev_info *bdi, bool enable) union power_supply_propval val = { .intval = bdi->charge_type }; int ret; - ret = pm_runtime_get_sync(bdi->dev); + ret = pm_runtime_resume_and_get(bdi->dev); if (ret < 0) { dev_warn(bdi->dev, "pm_runtime_get failed: %i\n", ret); - pm_runtime_put_noidle(bdi->dev); return ret; } @@ -551,10 +546,9 @@ static 
int bq24190_vbus_is_enabled(struct regulator_dev *dev) int ret; u8 val; - ret = pm_runtime_get_sync(bdi->dev); + ret = pm_runtime_resume_and_get(bdi->dev); if (ret < 0) { dev_warn(bdi->dev, "pm_runtime_get failed: %i\n", ret); - pm_runtime_put_noidle(bdi->dev); return ret; } @@ -1128,11 +1122,9 @@ static int bq24190_charger_get_property(struct power_supply *psy, dev_dbg(bdi->dev, "prop: %d\n", psp); - ret = pm_runtime_get_sync(bdi->dev); - if (ret < 0) { - pm_runtime_put_noidle(bdi->dev); + ret = pm_runtime_resume_and_get(bdi->dev); + if (ret < 0) return ret; - } switch (psp) { case POWER_SUPPLY_PROP_CHARGE_TYPE: @@ -1204,11 +1196,9 @@ static int bq24190_charger_set_property(struct power_supply *psy, dev_dbg(bdi->dev, "prop: %d\n", psp); - ret = pm_runtime_get_sync(bdi->dev); - if (ret < 0) { - pm_runtime_put_noidle(bdi->dev); + ret = pm_runtime_resume_and_get(bdi->dev); + if (ret < 0) return ret; - } switch (psp) { case POWER_SUPPLY_PROP_ONLINE: @@ -1477,11 +1467,9 @@ static int bq24190_battery_get_property(struct power_supply *psy, dev_warn(bdi->dev, "warning: /sys/class/power_supply/bq24190-battery is deprecated\n"); dev_dbg(bdi->dev, "prop: %d\n", psp); - ret = pm_runtime_get_sync(bdi->dev); - if (ret < 0) { - pm_runtime_put_noidle(bdi->dev); + ret = pm_runtime_resume_and_get(bdi->dev); + if (ret < 0) return ret; - } switch (psp) { case POWER_SUPPLY_PROP_STATUS: @@ -1525,11 +1513,9 @@ static int bq24190_battery_set_property(struct power_supply *psy, dev_warn(bdi->dev, "warning: /sys/class/power_supply/bq24190-battery is deprecated\n"); dev_dbg(bdi->dev, "prop: %d\n", psp); - ret = pm_runtime_get_sync(bdi->dev); - if (ret < 0) { - pm_runtime_put_noidle(bdi->dev); + ret = pm_runtime_resume_and_get(bdi->dev); + if (ret < 0) return ret; - } switch (psp) { case POWER_SUPPLY_PROP_ONLINE: @@ -1683,10 +1669,9 @@ static irqreturn_t bq24190_irq_handler_thread(int irq, void *data) int error; bdi->irq_event = true; - error = pm_runtime_get_sync(bdi->dev); + error = pm_runtime_resume_and_get(bdi->dev); if (error < 0) { dev_warn(bdi->dev, "pm_runtime_get failed: %i\n", error); - pm_runtime_put_noidle(bdi->dev); return IRQ_NONE; } bq24190_check_status(bdi); @@ -1921,11 +1906,9 @@ static int bq24190_remove(struct i2c_client *client) struct bq24190_dev_info *bdi = i2c_get_clientdata(client); int error; - error = pm_runtime_get_sync(bdi->dev); - if (error < 0) { + error = pm_runtime_resume_and_get(bdi->dev); + if (error < 0) dev_warn(bdi->dev, "pm_runtime_get failed: %i\n", error); - pm_runtime_put_noidle(bdi->dev); - } bq24190_register_reset(bdi); if (bdi->battery) @@ -1982,11 +1965,9 @@ static __maybe_unused int bq24190_pm_suspend(struct device *dev) struct bq24190_dev_info *bdi = i2c_get_clientdata(client); int error; - error = pm_runtime_get_sync(bdi->dev); - if (error < 0) { + error = pm_runtime_resume_and_get(bdi->dev); + if (error < 0) dev_warn(bdi->dev, "pm_runtime_get failed: %i\n", error); - pm_runtime_put_noidle(bdi->dev); - } bq24190_register_reset(bdi); @@ -2007,11 +1988,9 @@ static __maybe_unused int bq24190_pm_resume(struct device *dev) bdi->f_reg = 0; bdi->ss_reg = BQ24190_REG_SS_VBUS_STAT_MASK; /* impossible state */ - error = pm_runtime_get_sync(bdi->dev); - if (error < 0) { + error = pm_runtime_resume_and_get(bdi->dev); + if (error < 0) dev_warn(bdi->dev, "pm_runtime_get failed: %i\n", error); - pm_runtime_put_noidle(bdi->dev); - } bq24190_register_reset(bdi); bq24190_set_config(bdi); From b51431850f5bf456a1887b4bdfd3813c75641c56 Mon Sep 17 00:00:00 2001 From: Luca Weiss Date: Sat, 23 
Apr 2022 17:50:55 +0200 Subject: [PATCH 059/334] dt-bindings: remoteproc: qcom: pas: Add MSM8226 adsp Add the compatible for the adsp found in MSM8226. Signed-off-by: Luca Weiss Acked-by: Krzysztof Kozlowski Signed-off-by: Bjorn Andersson Link: https://lore.kernel.org/r/20220423155059.660387-1-luca@z3ntu.xyz --- Documentation/devicetree/bindings/remoteproc/qcom,adsp.yaml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Documentation/devicetree/bindings/remoteproc/qcom,adsp.yaml b/Documentation/devicetree/bindings/remoteproc/qcom,adsp.yaml index df8286296c37..947f94548d0e 100644 --- a/Documentation/devicetree/bindings/remoteproc/qcom,adsp.yaml +++ b/Documentation/devicetree/bindings/remoteproc/qcom,adsp.yaml @@ -16,6 +16,7 @@ description: properties: compatible: enum: + - qcom,msm8226-adsp-pil - qcom,msm8974-adsp-pil - qcom,msm8996-adsp-pil - qcom,msm8996-slpi-pil @@ -162,6 +163,7 @@ allOf: compatible: contains: enum: + - qcom,msm8226-adsp-pil - qcom,msm8974-adsp-pil - qcom,msm8996-adsp-pil - qcom,msm8996-slpi-pil @@ -280,6 +282,7 @@ allOf: compatible: contains: enum: + - qcom,msm8226-adsp-pil - qcom,msm8974-adsp-pil - qcom,msm8996-adsp-pil - qcom,msm8996-slpi-pil @@ -373,6 +376,7 @@ allOf: compatible: contains: enum: + - qcom,msm8226-adsp-pil - qcom,msm8996-adsp-pil - qcom,msm8998-adsp-pas then: @@ -572,6 +576,7 @@ allOf: compatible: contains: enum: + - qcom,msm8226-adsp-pil - qcom,msm8974-adsp-pil - qcom,msm8996-adsp-pil - qcom,msm8996-slpi-pil From fb4f07cc93995ebad7725512c0edc903c693037f Mon Sep 17 00:00:00 2001 From: Luca Weiss Date: Sat, 23 Apr 2022 17:50:56 +0200 Subject: [PATCH 060/334] remoteproc: qcom: pas: Add MSM8226 ADSP support Add a config for the ADSP present on MSM8226. Signed-off-by: Luca Weiss Signed-off-by: Bjorn Andersson Link: https://lore.kernel.org/r/20220423155059.660387-2-luca@z3ntu.xyz --- drivers/remoteproc/qcom_q6v5_pas.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/remoteproc/qcom_q6v5_pas.c b/drivers/remoteproc/qcom_q6v5_pas.c index 06c6dc34f2e0..6ae39c5653b1 100644 --- a/drivers/remoteproc/qcom_q6v5_pas.c +++ b/drivers/remoteproc/qcom_q6v5_pas.c @@ -878,6 +878,7 @@ static const struct adsp_data sdx55_mpss_resource = { }; static const struct of_device_id adsp_of_match[] = { + { .compatible = "qcom,msm8226-adsp-pil", .data = &adsp_resource_init}, { .compatible = "qcom,msm8974-adsp-pil", .data = &adsp_resource_init}, { .compatible = "qcom,msm8996-adsp-pil", .data = &msm8996_adsp_resource}, { .compatible = "qcom,msm8996-slpi-pil", .data = &slpi_resource_init}, From dc2f78e2d4cc844a1458653d57ce1b54d4a29f21 Mon Sep 17 00:00:00 2001 From: Dongliang Mu Date: Fri, 15 Apr 2022 21:19:02 +0800 Subject: [PATCH 061/334] f2fs: remove WARN_ON in f2fs_is_valid_blkaddr Syzbot triggers two WARNs in f2fs_is_valid_blkaddr and __is_bitmap_valid. For example, in f2fs_is_valid_blkaddr, if type is DATA_GENERIC_ENHANCE or DATA_GENERIC_ENHANCE_READ, it invokes WARN_ON if blkaddr is not in the right range. The call trace is as follows: f2fs_get_node_info+0x45f/0x1070 read_node_page+0x577/0x1190 __get_node_page.part.0+0x9e/0x10e0 __get_node_page f2fs_get_node_page+0x109/0x180 do_read_inode f2fs_iget+0x2a5/0x58b0 f2fs_fill_super+0x3b39/0x7ca0 Fix these two WARNs by replacing WARN_ON with dump_stack. 
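For context on why dump_stack() is the safer primitive here, a sketch reusing the f2fs helpers visible in the hunks below (the wrapper function itself is illustrative): WARN_ON() emits a full WARNING, which becomes a panic on kernels booted with panic_on_warn, as syzbot's are, whereas dump_stack() only logs a backtrace, so a fuzzed image gets reported and flagged for fsck without crashing the machine.

/*
 * Pattern mirrored by the change below: record the inconsistency, ask for
 * an fsck run, and log a backtrace, but never assert on untrusted metadata.
 */
static bool example_report_invalid_blkaddr(struct f2fs_sb_info *sbi,
                                           block_t blkaddr)
{
        f2fs_warn(sbi, "access invalid blkaddr:%u", blkaddr);
        set_sbi_flag(sbi, SBI_NEED_FSCK);       /* repaired offline by fsck */
        dump_stack();                           /* backtrace only, no WARNING */
        return false;
}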
Reported-by: syzbot+763ae12a2ede1d99d4dc@syzkaller.appspotmail.com Signed-off-by: Dongliang Mu Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 909085a78f9c..71b1e93cbe0c 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -158,7 +158,7 @@ static bool __is_bitmap_valid(struct f2fs_sb_info *sbi, block_t blkaddr, f2fs_err(sbi, "Inconsistent error blkaddr:%u, sit bitmap:%d", blkaddr, exist); set_sbi_flag(sbi, SBI_NEED_FSCK); - WARN_ON(1); + dump_stack(); } return exist; } @@ -196,7 +196,7 @@ bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, f2fs_warn(sbi, "access invalid blkaddr:%u", blkaddr); set_sbi_flag(sbi, SBI_NEED_FSCK); - WARN_ON(1); + dump_stack(); return false; } else { return __is_bitmap_valid(sbi, blkaddr, type); From c550e25bca660ed2554cbb48d32b82d0bb98e4b1 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 18 Apr 2022 16:57:44 -0700 Subject: [PATCH 062/334] f2fs: use flush command instead of FUA for zoned device The block layer for zoned disk can reorder the FUA'ed IOs. Let's use flush command to keep the write order. Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 3 ++- fs/f2fs/node.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index f08e6208e183..eae2e7908072 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -372,7 +372,8 @@ sync_nodes: f2fs_remove_ino_entry(sbi, ino, APPEND_INO); clear_inode_flag(inode, FI_APPEND_WRITE); flush_out: - if (!atomic && F2FS_OPTION(sbi).fsync_mode != FSYNC_MODE_NOBARRIER) + if ((!atomic && F2FS_OPTION(sbi).fsync_mode != FSYNC_MODE_NOBARRIER) || + (atomic && !test_opt(sbi, NOBARRIER) && f2fs_sb_has_blkzoned(sbi))) ret = f2fs_issue_flush(sbi, inode->i_ino); if (!ret) { f2fs_remove_ino_entry(sbi, ino, UPDATE_INO); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index c45d341dcf6e..144f9f966690 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1631,7 +1631,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, goto redirty_out; } - if (atomic && !test_opt(sbi, NOBARRIER)) + if (atomic && !test_opt(sbi, NOBARRIER) && !f2fs_sb_has_blkzoned(sbi)) fio.op_flags |= REQ_PREFLUSH | REQ_FUA; /* should add to global list before clearing PAGECACHE status */ From a7b8618aa2f0f926ce85f2486ac835a85c753ca7 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 29 Mar 2022 16:25:54 -0700 Subject: [PATCH 063/334] f2fs: avoid infinite loop to flush node pages xfstests/generic/475 can give EIO all the time which give an infinite loop to flush node page like below. Let's avoid it. [16418.518551] Call Trace: [16418.518553] ? dm_submit_bio+0x48/0x400 [16418.518574] ? submit_bio_checks+0x1ac/0x5a0 [16418.525207] __submit_bio+0x1a9/0x230 [16418.525210] ? kmem_cache_alloc+0x29e/0x3c0 [16418.525223] submit_bio_noacct+0xa8/0x2b0 [16418.525226] submit_bio+0x4d/0x130 [16418.525238] __submit_bio+0x49/0x310 [f2fs] [16418.525339] ? bio_add_page+0x6a/0x90 [16418.525344] f2fs_submit_page_bio+0x134/0x1f0 [f2fs] [16418.525365] read_node_page+0x125/0x1b0 [f2fs] [16418.525388] __get_node_page.part.0+0x58/0x3f0 [f2fs] [16418.525409] __get_node_page+0x2f/0x60 [f2fs] [16418.525431] f2fs_get_dnode_of_data+0x423/0x860 [f2fs] [16418.525452] ? asm_sysvec_apic_timer_interrupt+0x12/0x20 [16418.525458] ? __mod_memcg_state.part.0+0x2a/0x30 [16418.525465] ? __mod_memcg_lruvec_state+0x27/0x40 [16418.525467] ? 
__xa_set_mark+0x57/0x70 [16418.525472] f2fs_do_write_data_page+0x10e/0x7b0 [f2fs] [16418.525493] f2fs_write_single_data_page+0x555/0x830 [f2fs] [16418.525514] ? sysvec_apic_timer_interrupt+0x4e/0x90 [16418.525518] ? asm_sysvec_apic_timer_interrupt+0x12/0x20 [16418.525523] f2fs_write_cache_pages+0x303/0x880 [f2fs] [16418.525545] ? blk_flush_plug_list+0x47/0x100 [16418.525548] f2fs_write_data_pages+0xfd/0x320 [f2fs] [16418.525569] do_writepages+0xd5/0x210 [16418.525648] filemap_fdatawrite_wbc+0x7d/0xc0 [16418.525655] filemap_fdatawrite+0x50/0x70 [16418.525658] f2fs_sync_dirty_inodes+0xa4/0x230 [f2fs] [16418.525679] f2fs_write_checkpoint+0x16d/0x1720 [f2fs] [16418.525699] ? ttwu_do_wakeup+0x1c/0x160 [16418.525709] ? ttwu_do_activate+0x6d/0xd0 [16418.525711] ? __wait_for_common+0x11d/0x150 [16418.525715] kill_f2fs_super+0xca/0x100 [f2fs] [16418.525733] deactivate_locked_super+0x3b/0xb0 [16418.525739] deactivate_super+0x40/0x50 [16418.525741] cleanup_mnt+0x139/0x190 [16418.525747] __cleanup_mnt+0x12/0x20 [16418.525749] task_work_run+0x6d/0xa0 [16418.525765] exit_to_user_mode_prepare+0x1ad/0x1b0 [16418.525771] syscall_exit_to_user_mode+0x27/0x50 [16418.525774] do_syscall_64+0x48/0xc0 [16418.525776] entry_SYSCALL_64_after_hwframe+0x44/0xae Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 8 +------- fs/f2fs/f2fs.h | 23 +++++++++++++++++++---- fs/f2fs/node.c | 23 ++++++++++++----------- 3 files changed, 32 insertions(+), 22 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 71b1e93cbe0c..beceac9885c3 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -98,13 +98,7 @@ repeat: } if (unlikely(!PageUptodate(page))) { - if (page->index == sbi->metapage_eio_ofs) { - if (sbi->metapage_eio_cnt++ == MAX_RETRY_META_PAGE_EIO) - set_ckpt_flags(sbi, CP_ERROR_FLAG); - } else { - sbi->metapage_eio_ofs = page->index; - sbi->metapage_eio_cnt = 0; - } + f2fs_handle_page_eio(sbi, page->index, META); f2fs_put_page(page, 1); return ERR_PTR(-EIO); } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 8c570de21ed5..f3eda4f13646 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -578,8 +578,8 @@ enum { /* maximum retry quota flush count */ #define DEFAULT_RETRY_QUOTA_FLUSH_COUNT 8 -/* maximum retry of EIO'ed meta page */ -#define MAX_RETRY_META_PAGE_EIO 100 +/* maximum retry of EIO'ed page */ +#define MAX_RETRY_PAGE_EIO 100 #define F2FS_LINK_MAX 0xffffffff /* maximum link count per file */ @@ -1614,8 +1614,8 @@ struct f2fs_sb_info { /* keep migration IO order for LFS mode */ struct f2fs_rwsem io_order_lock; mempool_t *write_io_dummy; /* Dummy pages */ - pgoff_t metapage_eio_ofs; /* EIO page offset */ - int metapage_eio_cnt; /* EIO count */ + pgoff_t page_eio_ofs[NR_PAGE_TYPE]; /* EIO page offset */ + int page_eio_cnt[NR_PAGE_TYPE]; /* EIO count */ /* for checkpoint */ struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */ @@ -4534,6 +4534,21 @@ static inline void f2fs_io_schedule_timeout(long timeout) io_schedule_timeout(timeout); } +static inline void f2fs_handle_page_eio(struct f2fs_sb_info *sbi, pgoff_t ofs, + enum page_type type) +{ + if (unlikely(f2fs_cp_error(sbi))) + return; + + if (ofs == sbi->page_eio_ofs[type]) { + if (sbi->page_eio_cnt[type]++ == MAX_RETRY_PAGE_EIO) + set_ckpt_flags(sbi, CP_ERROR_FLAG); + } else { + sbi->page_eio_ofs[type] = ofs; + sbi->page_eio_cnt[type] = 0; + } +} + #define EFSBADCRC EBADMSG /* Bad CRC detected */ #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 
144f9f966690..51230cba841b 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1416,8 +1416,7 @@ repeat: err = read_node_page(page, 0); if (err < 0) { - f2fs_put_page(page, 1); - return ERR_PTR(err); + goto out_put_err; } else if (err == LOCKED_PAGE) { err = 0; goto page_hit; @@ -1443,19 +1442,21 @@ repeat: goto out_err; } page_hit: - if (unlikely(nid != nid_of_node(page))) { - f2fs_warn(sbi, "inconsistent node block, nid:%lu, node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]", + if (likely(nid == nid_of_node(page))) + return page; + + f2fs_warn(sbi, "inconsistent node block, nid:%lu, node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]", nid, nid_of_node(page), ino_of_node(page), ofs_of_node(page), cpver_of_node(page), next_blkaddr_of_node(page)); - set_sbi_flag(sbi, SBI_NEED_FSCK); - err = -EINVAL; + set_sbi_flag(sbi, SBI_NEED_FSCK); + err = -EINVAL; out_err: - ClearPageUptodate(page); - f2fs_put_page(page, 1); - return ERR_PTR(err); - } - return page; + ClearPageUptodate(page); +out_put_err: + f2fs_handle_page_eio(sbi, page->index, NODE); + f2fs_put_page(page, 1); + return ERR_PTR(err); } struct page *f2fs_get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid) From 4de851459ea65ac8cf4313af648ec9cf16b72562 Mon Sep 17 00:00:00 2001 From: Niels Dossche Date: Fri, 22 Apr 2022 20:05:04 +0200 Subject: [PATCH 064/334] f2fs: extend stat_lock to avoid potential race in statfs There are multiple calculations and reads of fields of sbi that should be protected by stat_lock. As stat_lock is not used to read these values in statfs, this can lead to inconsistent results. Extend the locking to prevent this issue. Commit c9c8ed50d94c ("f2fs: fix to avoid potential race on sbi->unusable_block_count access/update") already added the use of sbi->stat_lock in statfs in order to make the calculation of multiple, different fields atomic so that results are consistent. This is similar to that patch regarding the change in statfs. 
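A minimal sketch of the snapshot rule the fix applies, reusing the f2fs field and helper names shown in the diff below (the wrapper function is illustrative): counters that are updated together under stat_lock must also be read together under stat_lock, otherwise statfs can pair a pre-update value of one counter with a post-update value of another and report inconsistent free space.

/* Take one consistent snapshot of the related counters for statfs. */
static void example_snapshot_counts(struct f2fs_sb_info *sbi,
                                    block_t *user_blocks,
                                    block_t *valid_blocks)
{
        spin_lock(&sbi->stat_lock);
        *user_blocks = sbi->user_block_count;
        *valid_blocks = valid_user_blocks(sbi);
        spin_unlock(&sbi->stat_lock);

        /*
         * Both values describe the same instant, so a derived quantity such
         * as user_blocks - valid_blocks cannot underflow because of a racing
         * update between the two reads.
         */
}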
Signed-off-by: Niels Dossche Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 4368f90571bd..cb50f8b2e41a 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1707,18 +1707,23 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) u64 id = huge_encode_dev(sb->s_bdev->bd_dev); block_t total_count, user_block_count, start_count; u64 avail_node_count; + unsigned int total_valid_node_count; total_count = le64_to_cpu(sbi->raw_super->block_count); - user_block_count = sbi->user_block_count; start_count = le32_to_cpu(sbi->raw_super->segment0_blkaddr); buf->f_type = F2FS_SUPER_MAGIC; buf->f_bsize = sbi->blocksize; buf->f_blocks = total_count - start_count; + + spin_lock(&sbi->stat_lock); + + user_block_count = sbi->user_block_count; + total_valid_node_count = valid_node_count(sbi); + avail_node_count = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; buf->f_bfree = user_block_count - valid_user_blocks(sbi) - sbi->current_reserved_blocks; - spin_lock(&sbi->stat_lock); if (unlikely(buf->f_bfree <= sbi->unusable_block_count)) buf->f_bfree = 0; else @@ -1731,14 +1736,12 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) else buf->f_bavail = 0; - avail_node_count = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; - if (avail_node_count > user_block_count) { buf->f_files = user_block_count; buf->f_ffree = buf->f_bavail; } else { buf->f_files = avail_node_count; - buf->f_ffree = min(avail_node_count - valid_node_count(sbi), + buf->f_ffree = min(avail_node_count - total_valid_node_count, buf->f_bavail); } From d46db4595be680bf9facb783fb95cf03b85f1d05 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Wed, 27 Apr 2022 18:02:53 +0200 Subject: [PATCH 065/334] f2fs: call bdev_zone_sectors() only once on init_blkz_info() Instead of calling bdev_zone_sectors() multiple times, call it once and cache the value locally. This will make the subsequent change easier to read. 
Signed-off-by: Luis Chamberlain Signed-off-by: Pankaj Raghav Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index cb50f8b2e41a..df232ec85871 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3651,22 +3651,25 @@ static int init_blkz_info(struct f2fs_sb_info *sbi, int devi) struct block_device *bdev = FDEV(devi).bdev; sector_t nr_sectors = bdev_nr_sectors(bdev); struct f2fs_report_zones_args rep_zone_arg; + u64 zone_sectors; int ret; if (!f2fs_sb_has_blkzoned(sbi)) return 0; + zone_sectors = bdev_zone_sectors(bdev); + if (sbi->blocks_per_blkz && sbi->blocks_per_blkz != - SECTOR_TO_BLOCK(bdev_zone_sectors(bdev))) + SECTOR_TO_BLOCK(zone_sectors)) return -EINVAL; - sbi->blocks_per_blkz = SECTOR_TO_BLOCK(bdev_zone_sectors(bdev)); + sbi->blocks_per_blkz = SECTOR_TO_BLOCK(zone_sectors); if (sbi->log_blocks_per_blkz && sbi->log_blocks_per_blkz != __ilog2_u32(sbi->blocks_per_blkz)) return -EINVAL; sbi->log_blocks_per_blkz = __ilog2_u32(sbi->blocks_per_blkz); FDEV(devi).nr_blkz = SECTOR_TO_BLOCK(nr_sectors) >> sbi->log_blocks_per_blkz; - if (nr_sectors & (bdev_zone_sectors(bdev) - 1)) + if (nr_sectors & (zone_sectors - 1)) FDEV(devi).nr_blkz++; FDEV(devi).blkz_seq = f2fs_kvzalloc(sbi, From 7f262f737502689ddc77117ffdefaea0a4a25b03 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Wed, 27 Apr 2022 18:02:54 +0200 Subject: [PATCH 066/334] f2fs: ensure only power of 2 zone sizes are allowed F2FS zoned support has power of 2 zone size assumption in many places such as in __f2fs_issue_discard_zone, init_blkz_info. As the power of 2 requirement has been removed from the block layer, explicitly add a condition in f2fs to allow only power of 2 zone size devices. This condition will be relaxed once those calculation based on power of 2 is made generic. Signed-off-by: Luis Chamberlain Signed-off-by: Pankaj Raghav Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index df232ec85871..d06a577a1208 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3658,6 +3658,10 @@ static int init_blkz_info(struct f2fs_sb_info *sbi, int devi) return 0; zone_sectors = bdev_zone_sectors(bdev); + if (!is_power_of_2(zone_sectors)) { + f2fs_err(sbi, "F2FS does not support non power of 2 zone sizes\n"); + return -EINVAL; + } if (sbi->blocks_per_blkz && sbi->blocks_per_blkz != SECTOR_TO_BLOCK(zone_sectors)) From f2db71053dc0409fae785096ad19cce4c8a95af7 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 30 Apr 2022 21:19:24 +0800 Subject: [PATCH 067/334] f2fs: fix to clear dirty inode in f2fs_evict_inode() As Yanming reported in bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=215904 The kernel message is shown below: kernel BUG at fs/f2fs/inode.c:825! 
Call Trace: evict+0x282/0x4e0 __dentry_kill+0x2b2/0x4d0 shrink_dentry_list+0x17c/0x4f0 shrink_dcache_parent+0x143/0x1e0 do_one_tree+0x9/0x30 shrink_dcache_for_umount+0x51/0x120 generic_shutdown_super+0x5c/0x3a0 kill_block_super+0x90/0xd0 kill_f2fs_super+0x225/0x310 deactivate_locked_super+0x78/0xc0 cleanup_mnt+0x2b7/0x480 task_work_run+0xc8/0x150 exit_to_user_mode_prepare+0x14a/0x150 syscall_exit_to_user_mode+0x1d/0x40 do_syscall_64+0x48/0x90 The root cause is: inode node and dnode node share the same nid, so during f2fs_evict_inode(), dnode node truncation will invalidate its NAT entry, so when truncating inode node, it fails due to invalid NAT entry, result in inode is still marked as dirty, fix this issue by clearing dirty for inode and setting SBI_NEED_FSCK flag in filesystem. output from dump.f2fs: [print_node_info: 354] Node ID [0xf:15] is inode i_nid[0] [0x f : 15] Cc: stable@vger.kernel.org Reported-by: Ming Yan Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 83639238a1fe..02630c17da93 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -796,8 +796,22 @@ retry: f2fs_lock_op(sbi); err = f2fs_remove_inode_page(inode); f2fs_unlock_op(sbi); - if (err == -ENOENT) + if (err == -ENOENT) { err = 0; + + /* + * in fuzzed image, another node may has the same + * block address as inode's, if it was truncated + * previously, truncation of inode node will fail. + */ + if (is_inode_flag_set(inode, FI_DIRTY_INODE)) { + f2fs_warn(F2FS_I_SB(inode), + "f2fs_evict_inode: inconsistent node id, ino:%lu", + inode->i_ino); + f2fs_inode_synced(inode); + set_sbi_flag(sbi, SBI_NEED_FSCK); + } + } } /* give more chances, if ENOMEM case */ From a9163b947ae8f7af7cb8d63606cd87b9facbfe74 Mon Sep 17 00:00:00 2001 From: Byungki Lee Date: Fri, 29 Apr 2022 13:29:53 -0700 Subject: [PATCH 068/334] f2fs: write checkpoint during FG_GC If there's not enough free sections each of which consistis of large segments, we can hit no free section for upcoming section allocation. Let's reclaim some prefree segments by writing checkpoints. 
Signed-off-by: Byungki Lee Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 6a7e4148ff9d..a193862ad8a5 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1790,23 +1790,31 @@ gc_more: if (sync) goto stop; - if (has_not_enough_free_secs(sbi, sec_freed, 0)) { - if (skipped_round <= MAX_SKIP_GC_COUNT || - skipped_round * 2 < round) { - segno = NULL_SEGNO; - goto gc_more; - } + if (!has_not_enough_free_secs(sbi, sec_freed, 0)) + goto stop; - if (first_skipped < last_skipped && - (last_skipped - first_skipped) > - sbi->skipped_gc_rwsem) { - f2fs_drop_inmem_pages_all(sbi, true); - segno = NULL_SEGNO; - goto gc_more; - } - if (gc_type == FG_GC && !is_sbi_flag_set(sbi, SBI_CP_DISABLED)) + if (skipped_round <= MAX_SKIP_GC_COUNT || skipped_round * 2 < round) { + + /* Write checkpoint to reclaim prefree segments */ + if (free_sections(sbi) < NR_CURSEG_PERSIST_TYPE && + prefree_segments(sbi) && + !is_sbi_flag_set(sbi, SBI_CP_DISABLED)) { ret = f2fs_write_checkpoint(sbi, &cpc); + if (ret) + goto stop; + } + segno = NULL_SEGNO; + goto gc_more; } + if (first_skipped < last_skipped && + (last_skipped - first_skipped) > + sbi->skipped_gc_rwsem) { + f2fs_drop_inmem_pages_all(sbi, true); + segno = NULL_SEGNO; + goto gc_more; + } + if (gc_type == FG_GC && !is_sbi_flag_set(sbi, SBI_CP_DISABLED)) + ret = f2fs_write_checkpoint(sbi, &cpc); stop: SIT_I(sbi)->last_victim[ALLOC_NEXT] = 0; SIT_I(sbi)->last_victim[FLUSH_DEVICE] = init_segno; From 4d17e6fe9293d57081ffdc11e1cf313e25e8fd9e Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 27 Apr 2022 01:06:02 +0800 Subject: [PATCH 069/334] f2fs: fix to avoid f2fs_bug_on() in dec_valid_node_count() As Yanming reported in bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=215897 I have encountered a bug in F2FS file system in kernel v5.17. The kernel should enable CONFIG_KASAN=y and CONFIG_KASAN_INLINE=y. You can reproduce the bug by running the following commands: The kernel message is shown below: kernel BUG at fs/f2fs/f2fs.h:2511! Call Trace: f2fs_remove_inode_page+0x2a2/0x830 f2fs_evict_inode+0x9b7/0x1510 evict+0x282/0x4e0 do_unlinkat+0x33a/0x540 __x64_sys_unlinkat+0x8e/0xd0 do_syscall_64+0x3b/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae The root cause is: .total_valid_block_count or .total_valid_node_count could fuzzed to zero, then once dec_valid_node_count() was called, it will cause BUG_ON(), this patch fixes to print warning info and set SBI_NEED_FSCK into CP instead of panic. 
Cc: stable@vger.kernel.org Reported-by: Ming Yan Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index f3eda4f13646..56adc3b68e14 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2605,11 +2605,17 @@ static inline void dec_valid_node_count(struct f2fs_sb_info *sbi, { spin_lock(&sbi->stat_lock); - f2fs_bug_on(sbi, !sbi->total_valid_block_count); - f2fs_bug_on(sbi, !sbi->total_valid_node_count); + if (unlikely(!sbi->total_valid_block_count || + !sbi->total_valid_node_count)) { + f2fs_warn(sbi, "dec_valid_node_count: inconsistent block counts, total_valid_block:%u, total_valid_node:%u", + sbi->total_valid_block_count, + sbi->total_valid_node_count); + set_sbi_flag(sbi, SBI_NEED_FSCK); + } else { + sbi->total_valid_block_count--; + sbi->total_valid_node_count--; + } - sbi->total_valid_node_count--; - sbi->total_valid_block_count--; if (sbi->reserved_blocks && sbi->current_reserved_blocks < sbi->reserved_blocks) sbi->current_reserved_blocks++; From 25f8236213a91efdf708b9d77e9e51b6fc3e141c Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 27 Apr 2022 17:51:40 +0800 Subject: [PATCH 070/334] f2fs: fix to do sanity check on block address in f2fs_do_zero_range() As Yanming reported in bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=215894 I have encountered a bug in F2FS file system in kernel v5.17. I have uploaded the system call sequence as case.c, and a fuzzed image can be found in google net disk The kernel should enable CONFIG_KASAN=y and CONFIG_KASAN_INLINE=y. You can reproduce the bug by running the following commands: kernel BUG at fs/f2fs/segment.c:2291! Call Trace: f2fs_invalidate_blocks+0x193/0x2d0 f2fs_fallocate+0x2593/0x4a70 vfs_fallocate+0x2a5/0xac0 ksys_fallocate+0x35/0x70 __x64_sys_fallocate+0x8e/0xf0 do_syscall_64+0x3b/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae The root cause is, after image was fuzzed, block mapping info in inode will be inconsistent with SIT table, so in f2fs_fallocate(), it will cause panic when updating SIT with invalid blkaddr. Let's fix the issue by adding sanity check on block address before updating SIT table with it. 
Cc: stable@vger.kernel.org Reported-by: Ming Yan Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index eae2e7908072..e4cf8b7b23aa 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1438,11 +1438,19 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start, ret = -ENOSPC; break; } - if (dn->data_blkaddr != NEW_ADDR) { - f2fs_invalidate_blocks(sbi, dn->data_blkaddr); - dn->data_blkaddr = NEW_ADDR; - f2fs_set_data_blkaddr(dn); + + if (dn->data_blkaddr == NEW_ADDR) + continue; + + if (!f2fs_is_valid_blkaddr(sbi, dn->data_blkaddr, + DATA_GENERIC_ENHANCE)) { + ret = -EFSCORRUPTED; + break; } + + f2fs_invalidate_blocks(sbi, dn->data_blkaddr); + dn->data_blkaddr = NEW_ADDR; + f2fs_set_data_blkaddr(dn); } f2fs_update_extent_cache_range(dn, start, 0, index - start); From cfd66bb715fd11fde3338d0660cffa1396adc27d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 4 May 2022 14:09:22 +0800 Subject: [PATCH 071/334] f2fs: fix deadloop in foreground GC As Yanming reported in bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=215914 The root cause is: in a very small sized image, it's very easy to exceed threshold of foreground GC, if we calculate free space and dirty data based on section granularity, in corner case, has_not_enough_free_secs() will always return true, result in deadloop in f2fs_gc(). So this patch refactors has_not_enough_free_secs() as below to fix this issue: 1. calculate needed space based on block granularity, and separate all blocks to two parts, section part, and block part, comparing section part to free section, and comparing block part to free space in openned log. 2. account F2FS_DIRTY_NODES, F2FS_DIRTY_IMETA and F2FS_DIRTY_DENTS as node block consumer; 3. 
account F2FS_DIRTY_DENTS as data block consumer; Cc: stable@vger.kernel.org Reported-by: Ming Yan Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.h | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 5c94caf0c0a1..66153c632a5d 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -572,11 +572,10 @@ static inline int reserved_sections(struct f2fs_sb_info *sbi) return GET_SEC_FROM_SEG(sbi, reserved_segments(sbi)); } -static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi) +static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi, + unsigned int node_blocks, unsigned int dent_blocks) { - unsigned int node_blocks = get_pages(sbi, F2FS_DIRTY_NODES) + - get_pages(sbi, F2FS_DIRTY_DENTS); - unsigned int dent_blocks = get_pages(sbi, F2FS_DIRTY_DENTS); + unsigned int segno, left_blocks; int i; @@ -602,19 +601,28 @@ static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi) static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed, int needed) { - int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); - int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); - int imeta_secs = get_blocktype_secs(sbi, F2FS_DIRTY_IMETA); + unsigned int total_node_blocks = get_pages(sbi, F2FS_DIRTY_NODES) + + get_pages(sbi, F2FS_DIRTY_DENTS) + + get_pages(sbi, F2FS_DIRTY_IMETA); + unsigned int total_dent_blocks = get_pages(sbi, F2FS_DIRTY_DENTS); + unsigned int node_secs = total_node_blocks / BLKS_PER_SEC(sbi); + unsigned int dent_secs = total_dent_blocks / BLKS_PER_SEC(sbi); + unsigned int node_blocks = total_node_blocks % BLKS_PER_SEC(sbi); + unsigned int dent_blocks = total_dent_blocks % BLKS_PER_SEC(sbi); + unsigned int free, need_lower, need_upper; if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) return false; - if (free_sections(sbi) + freed == reserved_sections(sbi) + needed && - has_curseg_enough_space(sbi)) + free = free_sections(sbi) + freed; + need_lower = node_secs + dent_secs + reserved_sections(sbi) + needed; + need_upper = need_lower + (node_blocks ? 1 : 0) + (dent_blocks ? 1 : 0); + + if (free > need_upper) return false; - return (free_sections(sbi) + freed) <= - (node_secs + 2 * dent_secs + imeta_secs + - reserved_sections(sbi) + needed); + else if (free <= need_lower) + return true; + return !has_curseg_enough_space(sbi, node_blocks, dent_blocks); } static inline bool f2fs_is_checkpoint_ready(struct f2fs_sb_info *sbi) From 6b8beca0edd32075a769bfe4178ca00c0dcd22a9 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 6 May 2022 09:33:06 +0800 Subject: [PATCH 072/334] f2fs: fix to do sanity check on total_data_blocks As Yanming reported in bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=215916 The kernel message is shown below: kernel BUG at fs/f2fs/segment.c:2560! 
Call Trace: allocate_segment_by_default+0x228/0x440 f2fs_allocate_data_block+0x13d1/0x31f0 do_write_page+0x18d/0x710 f2fs_outplace_write_data+0x151/0x250 f2fs_do_write_data_page+0xef9/0x1980 move_data_page+0x6af/0xbc0 do_garbage_collect+0x312f/0x46f0 f2fs_gc+0x6b0/0x3bc0 f2fs_balance_fs+0x921/0x2260 f2fs_write_single_data_page+0x16be/0x2370 f2fs_write_cache_pages+0x428/0xd00 f2fs_write_data_pages+0x96e/0xd50 do_writepages+0x168/0x550 __writeback_single_inode+0x9f/0x870 writeback_sb_inodes+0x47d/0xb20 __writeback_inodes_wb+0xb2/0x200 wb_writeback+0x4bd/0x660 wb_workfn+0x5f3/0xab0 process_one_work+0x79f/0x13e0 worker_thread+0x89/0xf60 kthread+0x26a/0x300 ret_from_fork+0x22/0x30 RIP: 0010:new_curseg+0xe8d/0x15f0 The root cause is: ckpt.valid_block_count is inconsistent with SIT table, stat info indicates filesystem has free blocks, but SIT table indicates filesystem has no free segment. So that during garbage colloection, it triggers panic when LFS allocator fails to find free segment. This patch tries to fix this issue by checking consistency in between ckpt.valid_block_count and block accounted from SIT. Cc: stable@vger.kernel.org Reported-by: Ming Yan Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 4 ++-- fs/f2fs/segment.c | 33 ++++++++++++++++++++++----------- fs/f2fs/segment.h | 1 + 3 files changed, 25 insertions(+), 13 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 56adc3b68e14..efe5e80163a8 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1117,8 +1117,8 @@ enum count_type { */ #define PAGE_TYPE_OF_BIO(type) ((type) > META ? META : (type)) enum page_type { - DATA, - NODE, + DATA = 0, + NODE = 1, /* should not change this */ META, NR_PAGE_TYPE, META_FLUSH, diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index c9b3224ef936..388bedc9b5da 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -4461,7 +4461,7 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) unsigned int i, start, end; unsigned int readed, start_blk = 0; int err = 0; - block_t total_node_blocks = 0; + block_t sit_valid_blocks[2] = {0, 0}; do { readed = f2fs_ra_meta_pages(sbi, start_blk, BIO_MAX_VECS, @@ -4486,8 +4486,8 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) if (err) return err; seg_info_from_raw_sit(se, &sit); - if (IS_NODESEG(se->type)) - total_node_blocks += se->valid_blocks; + + sit_valid_blocks[SE_PAGETYPE(se)] += se->valid_blocks; if (f2fs_block_unit_discard(sbi)) { /* build discard map only one time */ @@ -4527,15 +4527,15 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) sit = sit_in_journal(journal, i); old_valid_blocks = se->valid_blocks; - if (IS_NODESEG(se->type)) - total_node_blocks -= old_valid_blocks; + + sit_valid_blocks[SE_PAGETYPE(se)] -= old_valid_blocks; err = check_block_count(sbi, start, &sit); if (err) break; seg_info_from_raw_sit(se, &sit); - if (IS_NODESEG(se->type)) - total_node_blocks += se->valid_blocks; + + sit_valid_blocks[SE_PAGETYPE(se)] += se->valid_blocks; if (f2fs_block_unit_discard(sbi)) { if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) { @@ -4557,13 +4557,24 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) } up_read(&curseg->journal_rwsem); - if (!err && total_node_blocks != valid_node_count(sbi)) { + if (err) + return err; + + if (sit_valid_blocks[NODE] != valid_node_count(sbi)) { f2fs_err(sbi, "SIT is corrupted node# %u vs %u", - total_node_blocks, valid_node_count(sbi)); - err = -EFSCORRUPTED; + sit_valid_blocks[NODE], valid_node_count(sbi)); + return -EFSCORRUPTED; } - return err; + if 
(sit_valid_blocks[DATA] + sit_valid_blocks[NODE] > + valid_user_blocks(sbi)) { + f2fs_err(sbi, "SIT is corrupted data# %u %u vs %u", + sit_valid_blocks[DATA], sit_valid_blocks[NODE], + valid_user_blocks(sbi)); + return -EFSCORRUPTED; + } + + return 0; } static void init_free_segmap(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 66153c632a5d..1fa26a9603cb 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -24,6 +24,7 @@ #define IS_DATASEG(t) ((t) <= CURSEG_COLD_DATA) #define IS_NODESEG(t) ((t) >= CURSEG_HOT_NODE && (t) <= CURSEG_COLD_NODE) +#define SE_PAGETYPE(se) ((IS_NODESEG((se)->type) ? NODE : DATA)) static inline void sanity_check_seg_type(struct f2fs_sb_info *sbi, unsigned short seg_type) From 71419129625a50cfb5e3c5cc215948a3f98c806d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 6 May 2022 18:30:31 +0800 Subject: [PATCH 073/334] f2fs: give priority to select unpinned section for foreground GC Previously, during foreground GC, if victims contain data of pinned file, it will fail migration of the data, and meanwhile i_gc_failures of that pinned file may increase, and when it exceeds threshold, GC will unpin the file, result in breaking pinfile's semantics. In order to mitigate such condition, let's record and skip section which has pinned file's data and give priority to select unpinned one. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 85 +++++++++++++++++++++++++++++++++++++++-------- fs/f2fs/segment.c | 8 +++++ fs/f2fs/segment.h | 3 ++ 3 files changed, 82 insertions(+), 14 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index a193862ad8a5..3009c0a97ab4 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -646,6 +646,54 @@ static void release_victim_entry(struct f2fs_sb_info *sbi) f2fs_bug_on(sbi, !list_empty(&am->victim_list)); } +static bool f2fs_pin_section(struct f2fs_sb_info *sbi, unsigned int segno) +{ + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + + if (!dirty_i->enable_pin_section) + return false; + if (!test_and_set_bit(secno, dirty_i->pinned_secmap)) + dirty_i->pinned_secmap_cnt++; + return true; +} + +static bool f2fs_pinned_section_exists(struct dirty_seglist_info *dirty_i) +{ + return dirty_i->pinned_secmap_cnt; +} + +static bool f2fs_section_is_pinned(struct dirty_seglist_info *dirty_i, + unsigned int secno) +{ + return dirty_i->enable_pin_section && + f2fs_pinned_section_exists(dirty_i) && + test_bit(secno, dirty_i->pinned_secmap); +} + +static void f2fs_unpin_all_sections(struct f2fs_sb_info *sbi, bool enable) +{ + unsigned int bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi)); + + if (f2fs_pinned_section_exists(DIRTY_I(sbi))) { + memset(DIRTY_I(sbi)->pinned_secmap, 0, bitmap_size); + DIRTY_I(sbi)->pinned_secmap_cnt = 0; + } + DIRTY_I(sbi)->enable_pin_section = enable; +} + +static int f2fs_gc_pinned_control(struct inode *inode, int gc_type, + unsigned int segno) +{ + if (!f2fs_is_pinned_file(inode)) + return 0; + if (gc_type != FG_GC) + return -EBUSY; + if (!f2fs_pin_section(F2FS_I_SB(inode), segno)) + f2fs_pin_file_control(inode, true); + return -EAGAIN; +} + /* * This function is called from two paths. * One is garbage collection and the other is SSR segment selection. 
@@ -787,6 +835,9 @@ retry: if (gc_type == BG_GC && test_bit(secno, dirty_i->victim_secmap)) goto next; + if (gc_type == FG_GC && f2fs_section_is_pinned(dirty_i, secno)) + goto next; + if (is_atgc) { add_victim_entry(sbi, &p, segno); goto next; @@ -1201,12 +1252,9 @@ static int move_data_block(struct inode *inode, block_t bidx, goto out; } - if (f2fs_is_pinned_file(inode)) { - if (gc_type == FG_GC) - f2fs_pin_file_control(inode, true); - err = -EAGAIN; + err = f2fs_gc_pinned_control(inode, gc_type, segno); + if (err) goto out; - } set_new_dnode(&dn, inode, NULL, NULL, 0); err = f2fs_get_dnode_of_data(&dn, bidx, LOOKUP_NODE); @@ -1351,12 +1399,9 @@ static int move_data_page(struct inode *inode, block_t bidx, int gc_type, err = -EAGAIN; goto out; } - if (f2fs_is_pinned_file(inode)) { - if (gc_type == FG_GC) - f2fs_pin_file_control(inode, true); - err = -EAGAIN; + err = f2fs_gc_pinned_control(inode, gc_type, segno); + if (err) goto out; - } if (gc_type == BG_GC) { if (PageWriteback(page)) { @@ -1476,14 +1521,15 @@ next_step: ofs_in_node = le16_to_cpu(entry->ofs_in_node); if (phase == 3) { + int err; + inode = f2fs_iget(sb, dni.ino); if (IS_ERR(inode) || is_bad_inode(inode) || special_file(inode->i_mode)) continue; - if (is_inode_flag_set(inode, FI_PIN_FILE) && - gc_type == FG_GC) { - f2fs_pin_file_control(inode, true); + err = f2fs_gc_pinned_control(inode, gc_type, segno); + if (err == -EAGAIN) { iput(inode); return submitted; } @@ -1766,9 +1812,17 @@ gc_more: ret = -EINVAL; goto stop; } +retry: ret = __get_victim(sbi, &segno, gc_type); - if (ret) + if (ret) { + /* allow to search victim from sections has pinned data */ + if (ret == -ENODATA && gc_type == FG_GC && + f2fs_pinned_section_exists(DIRTY_I(sbi))) { + f2fs_unpin_all_sections(sbi, false); + goto retry; + } goto stop; + } seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type, force); if (gc_type == FG_GC && @@ -1819,6 +1873,9 @@ stop: SIT_I(sbi)->last_victim[ALLOC_NEXT] = 0; SIT_I(sbi)->last_victim[FLUSH_DEVICE] = init_segno; + if (gc_type == FG_GC) + f2fs_unpin_all_sections(sbi, true); + trace_f2fs_gc_end(sbi->sb, ret, total_freed, sec_freed, get_pages(sbi, F2FS_DIRTY_NODES), get_pages(sbi, F2FS_DIRTY_DENTS), diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 388bedc9b5da..87ff2b3cdf94 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -4654,6 +4654,13 @@ static int init_victim_secmap(struct f2fs_sb_info *sbi) dirty_i->victim_secmap = f2fs_kvzalloc(sbi, bitmap_size, GFP_KERNEL); if (!dirty_i->victim_secmap) return -ENOMEM; + + dirty_i->pinned_secmap = f2fs_kvzalloc(sbi, bitmap_size, GFP_KERNEL); + if (!dirty_i->pinned_secmap) + return -ENOMEM; + + dirty_i->pinned_secmap_cnt = 0; + dirty_i->enable_pin_section = true; return 0; } @@ -5242,6 +5249,7 @@ static void destroy_victim_secmap(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + kvfree(dirty_i->pinned_secmap); kvfree(dirty_i->victim_secmap); } diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 1fa26a9603cb..8fbc9f6afa55 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -295,6 +295,9 @@ struct dirty_seglist_info { struct mutex seglist_lock; /* lock for segment bitmaps */ int nr_dirty[NR_DIRTY_TYPE]; /* # of dirty segments */ unsigned long *victim_secmap; /* background GC victims */ + unsigned long *pinned_secmap; /* pinned victims from foreground GC */ + unsigned int pinned_secmap_cnt; /* count of victims which has pinned data */ + bool enable_pin_section; /* enable pinning section */ }; /* victim selection function 
for cleaning and SSR */ From a45b408a020ba53e0a2b2214e1371ef6d42ad5c4 Mon Sep 17 00:00:00 2001 From: Rex-BC Chen Date: Tue, 1 Mar 2022 13:37:31 +0800 Subject: [PATCH 074/334] dt-bindings: watchdog: Add compatible for MediaTek MT8186 Add dt-binding documentation of watchdog for MediaTek MT8186 SoC. Signed-off-by: Rex-BC Chen Acked-by: Rob Herring Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20220301054405.25021-2-rex-bc.chen@mediatek.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- Documentation/devicetree/bindings/watchdog/mtk-wdt.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/watchdog/mtk-wdt.txt b/Documentation/devicetree/bindings/watchdog/mtk-wdt.txt index a97418c74f6b..762c62e428ef 100644 --- a/Documentation/devicetree/bindings/watchdog/mtk-wdt.txt +++ b/Documentation/devicetree/bindings/watchdog/mtk-wdt.txt @@ -16,6 +16,7 @@ Required properties: "mediatek,mt7629-wdt", "mediatek,mt6589-wdt": for MT7629 "mediatek,mt7986-wdt", "mediatek,mt6589-wdt": for MT7986 "mediatek,mt8183-wdt": for MT8183 + "mediatek,mt8186-wdt", "mediatek,mt6589-wdt": for MT8186 "mediatek,mt8516-wdt", "mediatek,mt6589-wdt": for MT8516 "mediatek,mt8192-wdt": for MT8192 "mediatek,mt8195-wdt", "mediatek,mt6589-wdt": for MT8195 From 1d6866e8f15c978e82ef16ce6d21ee96b2eca7ee Mon Sep 17 00:00:00 2001 From: Runyang Chen Date: Tue, 1 Mar 2022 13:37:32 +0800 Subject: [PATCH 075/334] dt-bindings: reset: mt8186: add reset-controller header file 1. Add toprgu reset-controller header file for MT8186. 2. Add DSI software reset bit which is controlled in MMSYS for MT8186. Signed-off-by: Runyang Chen Signed-off-by: Rex-BC Chen Acked-by: Rob Herring Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20220301054405.25021-3-rex-bc.chen@mediatek.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- include/dt-bindings/reset/mt8186-resets.h | 36 +++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 include/dt-bindings/reset/mt8186-resets.h diff --git a/include/dt-bindings/reset/mt8186-resets.h b/include/dt-bindings/reset/mt8186-resets.h new file mode 100644 index 000000000000..5f850370c42c --- /dev/null +++ b/include/dt-bindings/reset/mt8186-resets.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause) */ +/* + * Copyright (c) 2022 MediaTek Inc. 
+ * Author: Runyang Chen + */ + +#ifndef _DT_BINDINGS_RESET_CONTROLLER_MT8186 +#define _DT_BINDINGS_RESET_CONTROLLER_MT8186 + +#define MT8186_TOPRGU_INFRA_SW_RST 0 +#define MT8186_TOPRGU_MM_SW_RST 1 +#define MT8186_TOPRGU_MFG_SW_RST 2 +#define MT8186_TOPRGU_VENC_SW_RST 3 +#define MT8186_TOPRGU_VDEC_SW_RST 4 +#define MT8186_TOPRGU_IMG_SW_RST 5 +#define MT8186_TOPRGU_DDR_SW_RST 6 +#define MT8186_TOPRGU_INFRA_AO_SW_RST 8 +#define MT8186_TOPRGU_CONNSYS_SW_RST 9 +#define MT8186_TOPRGU_APMIXED_SW_RST 10 +#define MT8186_TOPRGU_PWRAP_SW_RST 11 +#define MT8186_TOPRGU_CONN_MCU_SW_RST 12 +#define MT8186_TOPRGU_IPNNA_SW_RST 13 +#define MT8186_TOPRGU_WPE_SW_RST 14 +#define MT8186_TOPRGU_ADSP_SW_RST 15 +#define MT8186_TOPRGU_AUDIO_SW_RST 17 +#define MT8186_TOPRGU_CAM_MAIN_SW_RST 18 +#define MT8186_TOPRGU_CAM_RAWA_SW_RST 19 +#define MT8186_TOPRGU_CAM_RAWB_SW_RST 20 +#define MT8186_TOPRGU_IPE_SW_RST 21 +#define MT8186_TOPRGU_IMG2_SW_RST 22 +#define MT8186_TOPRGU_SW_RST_NUM 23 + +/* MMSYS resets */ +#define MT8186_MMSYS_SW0_RST_B_DISP_DSI0 19 + +#endif /* _DT_BINDINGS_RESET_CONTROLLER_MT8186 */ From 4dbabc4d9e8cc6d8267a9883d6694c8925235e1a Mon Sep 17 00:00:00 2001 From: Runyang Chen Date: Tue, 1 Mar 2022 13:37:33 +0800 Subject: [PATCH 076/334] watchdog: mediatek: mt8186: add wdt support Support MT8186 watchdog device. Signed-off-by: Runyang Chen Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20220301054405.25021-4-rex-bc.chen@mediatek.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/mtk_wdt.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/watchdog/mtk_wdt.c b/drivers/watchdog/mtk_wdt.c index 4577a76dd464..fe5a2ecba97a 100644 --- a/drivers/watchdog/mtk_wdt.c +++ b/drivers/watchdog/mtk_wdt.c @@ -11,6 +11,7 @@ #include #include +#include #include #include #include @@ -80,6 +81,10 @@ static const struct mtk_wdt_data mt8183_data = { .toprgu_sw_rst_num = MT8183_TOPRGU_SW_RST_NUM, }; +static const struct mtk_wdt_data mt8186_data = { + .toprgu_sw_rst_num = MT8186_TOPRGU_SW_RST_NUM, +}; + static const struct mtk_wdt_data mt8192_data = { .toprgu_sw_rst_num = MT8192_TOPRGU_SW_RST_NUM, }; @@ -419,6 +424,7 @@ static const struct of_device_id mtk_wdt_dt_ids[] = { { .compatible = "mediatek,mt2712-wdt", .data = &mt2712_data }, { .compatible = "mediatek,mt6589-wdt" }, { .compatible = "mediatek,mt8183-wdt", .data = &mt8183_data }, + { .compatible = "mediatek,mt8186-wdt", .data = &mt8186_data }, { .compatible = "mediatek,mt8192-wdt", .data = &mt8192_data }, { .compatible = "mediatek,mt8195-wdt", .data = &mt8195_data }, { /* sentinel */ } From 100ad27e9537aa1187b86c249b1b48e451ded4cb Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Tue, 15 Feb 2022 16:12:02 +0800 Subject: [PATCH 077/334] dt-bindings: watchdog: imx7ulp-wdt: Add imx93 compatible string The wdog on i.MX93 is modified from i.MX8ULP with some changes, it uses one compatible string, so add i.MX93 support. 
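Binding-only additions like this typically work because of the usual devicetree fallback-compatible mechanism: the OF core matches a driver against every string in a node's compatible list, so a node that names the new SoC first and the already-supported SoC as fallback still binds without any driver change. A hypothetical sketch of the driver side (invented names, not the actual i.MX watchdog driver):

#include <linux/mod_devicetable.h>
#include <linux/module.h>
#include <linux/platform_device.h>

/* A node with
 *     compatible = "fsl,imx93-wdt", "fsl,imx7ulp-wdt";
 * matches this table through its fallback entry alone. */
static const struct of_device_id example_wdt_of_match[] = {
	{ .compatible = "fsl,imx7ulp-wdt" },	/* fallback only */
	{ /* sentinel */ }
};
MODULE_DEVICE_TABLE(of, example_wdt_of_match);

static int example_wdt_probe(struct platform_device *pdev)
{
	dev_info(&pdev->dev, "bound via fallback compatible\n");
	return 0;
}

static struct platform_driver example_wdt_driver = {
	.probe = example_wdt_probe,
	.driver = {
		.name = "example-wdt",
		.of_match_table = example_wdt_of_match,
	},
};
module_platform_driver(example_wdt_driver);

MODULE_DESCRIPTION("Fallback-compatible illustration");
MODULE_LICENSE("GPL");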
Signed-off-by: Peng Fan Acked-by: Rob Herring Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20220215081202.787853-1-peng.fan@oss.nxp.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- Documentation/devicetree/bindings/watchdog/fsl-imx7ulp-wdt.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/watchdog/fsl-imx7ulp-wdt.yaml b/Documentation/devicetree/bindings/watchdog/fsl-imx7ulp-wdt.yaml index 4ca8a31359a5..8562978aa0c8 100644 --- a/Documentation/devicetree/bindings/watchdog/fsl-imx7ulp-wdt.yaml +++ b/Documentation/devicetree/bindings/watchdog/fsl-imx7ulp-wdt.yaml @@ -19,6 +19,7 @@ properties: - items: - const: fsl,imx8ulp-wdt - const: fsl,imx7ulp-wdt + - const: fsl,imx93-wdt reg: maxItems: 1 From 2dd441f16d6ad6104d85c4e5dfeb6dde4df26869 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= Date: Wed, 16 Feb 2022 07:34:08 +0100 Subject: [PATCH 078/334] watchdog: bcm7038_wdt: Support BCM6345 compatible string MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A new "compatible" value has been added in the commit 17fffe91ba36 ("dt-bindings: watchdog: Add BCM6345 compatible to BCM7038 binding"). It's meant to be used for BCM63xx SoCs family but hardware block can be programmed just like the 7038 one. Cc: Florian Fainelli Signed-off-by: Rafał Miłecki Acked-by: Florian Fainelli Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20220216063408.23168-1-zajec5@gmail.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/bcm7038_wdt.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/watchdog/bcm7038_wdt.c b/drivers/watchdog/bcm7038_wdt.c index 8656a137e9a4..1ffcf6aca6ae 100644 --- a/drivers/watchdog/bcm7038_wdt.c +++ b/drivers/watchdog/bcm7038_wdt.c @@ -218,6 +218,7 @@ static SIMPLE_DEV_PM_OPS(bcm7038_wdt_pm_ops, bcm7038_wdt_suspend, bcm7038_wdt_resume); static const struct of_device_id bcm7038_wdt_match[] = { + { .compatible = "brcm,bcm6345-wdt" }, { .compatible = "brcm,bcm7038-wdt" }, {}, }; From 711a5b25bac95dcd1111521ed71693330e74a926 Mon Sep 17 00:00:00 2001 From: Sam Shih Date: Wed, 5 Jan 2022 18:04:56 +0800 Subject: [PATCH 079/334] watchdog: mtk_wdt: mt7986: Add toprgu reset controller support Besides watchdog, the mt7986 toprgu module also provides software reset functionality for various peripheral subsystems (eg, ethernet, pcie, and connectivity) Signed-off-by: Sam Shih Reviewed-by: Matthias Brugger Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20220105100456.7126-3-sam.shih@mediatek.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/mtk_wdt.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/watchdog/mtk_wdt.c b/drivers/watchdog/mtk_wdt.c index fe5a2ecba97a..f0d4e3cc7459 100644 --- a/drivers/watchdog/mtk_wdt.c +++ b/drivers/watchdog/mtk_wdt.c @@ -10,6 +10,7 @@ */ #include +#include #include #include #include @@ -77,6 +78,10 @@ static const struct mtk_wdt_data mt2712_data = { .toprgu_sw_rst_num = MT2712_TOPRGU_SW_RST_NUM, }; +static const struct mtk_wdt_data mt7986_data = { + .toprgu_sw_rst_num = MT7986_TOPRGU_SW_RST_NUM, +}; + static const struct mtk_wdt_data mt8183_data = { .toprgu_sw_rst_num = MT8183_TOPRGU_SW_RST_NUM, }; @@ -423,6 +428,7 @@ static int mtk_wdt_resume(struct device *dev) static const struct of_device_id mtk_wdt_dt_ids[] = { { .compatible = "mediatek,mt2712-wdt", .data = &mt2712_data }, { .compatible = "mediatek,mt6589-wdt" }, + { .compatible 
= "mediatek,mt7986-wdt", .data = &mt7986_data }, { .compatible = "mediatek,mt8183-wdt", .data = &mt8183_data }, { .compatible = "mediatek,mt8186-wdt", .data = &mt8186_data }, { .compatible = "mediatek,mt8192-wdt", .data = &mt8192_data }, From a03f70cfb283c48a5f0925fe868b479bc9b66fd4 Mon Sep 17 00:00:00 2001 From: David Heidelberg Date: Tue, 11 Jan 2022 22:23:10 +0100 Subject: [PATCH 080/334] dt-bindings: watchdog: improve QCOM compatible parsing for modern chips Parse compatible as expected for modern QCOMs. Fixes warnings as: arch/arm64/boot/dts/qcom/sdm845-oneplus-fajita.dt.yaml: watchdog@17980000: compatible: ['qcom,apss-wdt-sdm845', 'qcom,kpss-wdt'] is too long From schema: Documentation/devicetree/bindings/watchdog/qcom-wdt.yaml arch/arm64/boot/dts/qcom/sdm845-oneplus-fajita.dt.yaml: watchdog@17980000: compatible: Additional items are not allowed ('qcom,kpss-wdt' was unexpected) From schema: Documentation/devicetree/bindings/watchdog/qcom-wdt.yaml Signed-off-by: David Heidelberg Reviewed-by: Bjorn Andersson Reviewed-by: Rob Herring Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20220111212310.97566-1-david@ixit.cz Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- .../bindings/watchdog/qcom-wdt.yaml | 37 +++++++++++-------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/Documentation/devicetree/bindings/watchdog/qcom-wdt.yaml b/Documentation/devicetree/bindings/watchdog/qcom-wdt.yaml index 16c6f82a13ca..4ff8c59c59ab 100644 --- a/Documentation/devicetree/bindings/watchdog/qcom-wdt.yaml +++ b/Documentation/devicetree/bindings/watchdog/qcom-wdt.yaml @@ -14,22 +14,27 @@ allOf: properties: compatible: - enum: - - qcom,apss-wdt-qcs404 - - qcom,apss-wdt-sc7180 - - qcom,apss-wdt-sc7280 - - qcom,apss-wdt-sdm845 - - qcom,apss-wdt-sdx55 - - qcom,apss-wdt-sm6350 - - qcom,apss-wdt-sm8150 - - qcom,apss-wdt-sm8250 - - qcom,kpss-timer - - qcom,kpss-wdt - - qcom,kpss-wdt-apq8064 - - qcom,kpss-wdt-ipq4019 - - qcom,kpss-wdt-ipq8064 - - qcom,kpss-wdt-msm8960 - - qcom,scss-timer + oneOf: + - items: + - enum: + - qcom,apss-wdt-qcs404 + - qcom,apss-wdt-sc7180 + - qcom,apss-wdt-sc7280 + - qcom,apss-wdt-sdm845 + - qcom,apss-wdt-sdx55 + - qcom,apss-wdt-sm6350 + - qcom,apss-wdt-sm8150 + - qcom,apss-wdt-sm8250 + - const: qcom,kpss-wdt + - items: + - enum: + - qcom,kpss-wdt + - qcom,kpss-timer + - qcom,kpss-wdt-apq8064 + - qcom,kpss-wdt-ipq4019 + - qcom,kpss-wdt-ipq8064 + - qcom,kpss-wdt-msm8960 + - qcom,scss-timer reg: maxItems: 1 From 289660a4af0e1e38873b7a5cc772114d3b911427 Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Fri, 11 Feb 2022 11:55:28 +0000 Subject: [PATCH 081/334] dt-bindings: watchdog: convert faraday,ftwdt010 to yaml Converts watchdog/faraday,ftwdt010.txt to yaml. This permits to detect missing properties like clocks and resets or compatible like moxa,moxart-watchdog. 
Signed-off-by: Corentin Labbe Reviewed-by: Linus Walleij Reviewed-by: Krzysztof Kozlowski Acked-by: Rob Herring Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20220211115528.3382374-1-clabbe@baylibre.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- .../bindings/watchdog/faraday,ftwdt010.txt | 22 ------ .../bindings/watchdog/faraday,ftwdt010.yaml | 67 +++++++++++++++++++ 2 files changed, 67 insertions(+), 22 deletions(-) delete mode 100644 Documentation/devicetree/bindings/watchdog/faraday,ftwdt010.txt create mode 100644 Documentation/devicetree/bindings/watchdog/faraday,ftwdt010.yaml diff --git a/Documentation/devicetree/bindings/watchdog/faraday,ftwdt010.txt b/Documentation/devicetree/bindings/watchdog/faraday,ftwdt010.txt deleted file mode 100644 index 9ecdb502e605..000000000000 --- a/Documentation/devicetree/bindings/watchdog/faraday,ftwdt010.txt +++ /dev/null @@ -1,22 +0,0 @@ -Faraday Technology FTWDT010 watchdog - -This is an IP part from Faraday Technology found in the Gemini -SoCs and others. - -Required properties: -- compatible : must be one of - "faraday,ftwdt010" - "cortina,gemini-watchdog", "faraday,ftwdt010" -- reg : shall contain base register location and length -- interrupts : shall contain the interrupt for the watchdog - -Optional properties: -- timeout-sec : the default watchdog timeout in seconds. - -Example: - -watchdog@41000000 { - compatible = "faraday,ftwdt010"; - reg = <0x41000000 0x1000>; - interrupts = <3 IRQ_TYPE_LEVEL_HIGH>; -}; diff --git a/Documentation/devicetree/bindings/watchdog/faraday,ftwdt010.yaml b/Documentation/devicetree/bindings/watchdog/faraday,ftwdt010.yaml new file mode 100644 index 000000000000..ca9e1beff76b --- /dev/null +++ b/Documentation/devicetree/bindings/watchdog/faraday,ftwdt010.yaml @@ -0,0 +1,67 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/watchdog/faraday,ftwdt010.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Faraday Technology FTWDT010 watchdog + +maintainers: + - Linus Walleij + - Corentin Labbe + +description: | + This is an IP part from Faraday Technology found in the Gemini + SoCs and others. + +allOf: + - $ref: "watchdog.yaml#" + +properties: + compatible: + oneOf: + - const: faraday,ftwdt010 + - items: + - enum: + - cortina,gemini-watchdog + - moxa,moxart-watchdog + - const: faraday,ftwdt010 + + reg: + maxItems: 1 + + resets: + maxItems: 1 + + clocks: + maxItems: 1 + + clock-names: + const: PCLK + + interrupts: + maxItems: 1 + +required: + - compatible + - reg + +unevaluatedProperties: false + +examples: + - | + #include + watchdog@41000000 { + compatible = "faraday,ftwdt010"; + reg = <0x41000000 0x1000>; + interrupts = <3 IRQ_TYPE_LEVEL_HIGH>; + timeout-secs = <5>; + }; + - | + watchdog: watchdog@98500000 { + compatible = "moxa,moxart-watchdog", "faraday,ftwdt010"; + reg = <0x98500000 0x10>; + clocks = <&clk_apb>; + clock-names = "PCLK"; + }; +... From ea2949df22a533cdf75e4583c00b1ce94cd5a83b Mon Sep 17 00:00:00 2001 From: Biju Das Date: Fri, 25 Feb 2022 17:53:14 +0000 Subject: [PATCH 082/334] watchdog: rzg2l_wdt: Fix 32bit overflow issue The value of timer_cycle_us can be 0 due to 32bit overflow. For eg:- If we assign the counter value "0xfff" for computing maxval. This patch fixes this issue by appending ULL to 1024, so that it is promoted to 64bit. This patch also fixes the warning message, 'watchdog: Invalid min and max timeout values, resetting to 0!'. 
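To make the overflow concrete, here is a small stand-alone sketch (plain userspace C, not kernel code; MICRO is assumed to be 1000000UL as in <linux/units.h>). With the counter value 0xfff, the 32-bit product 1024 * 1024 * (wdttime + 1) equals 2^32 and wraps to 0 before the multiplication by MICRO ever happens, which is the zero timer_cycle_us described above; the ULL suffix keeps the whole expression in 64-bit arithmetic.

#include <stdio.h>
#include <stdint.h>

#define MICRO 1000000UL			/* assumed, as in include/linux/units.h */

int main(void)
{
	uint32_t wdttime = 0xfff;	/* largest WDTTIME field, used for maxval */

	/* Old expression: 1024 * 1024 * (wdttime + 1) is evaluated in
	 * 32-bit arithmetic; 1048576 * 4096 == 2^32 wraps to 0, so the
	 * final 64-bit value is 0 as well. */
	uint64_t before = 1024 * 1024 * (wdttime + 1) * MICRO;

	/* Fixed expression: 1024ULL promotes the product to 64 bits. */
	uint64_t after = 1024 * 1024ULL * (wdttime + 1) * MICRO;

	printf("before the fix: %llu us\n", (unsigned long long)before);
	printf("after the fix:  %llu us\n", (unsigned long long)after);
	return 0;
}

Expected output is 0 for the first line and 4294967296000000 for the second, which is why the computed maximum timeout collapsed to 0 and triggered the warning quoted above.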
Fixes: 2cbc5cd0b55fa2 ("watchdog: Add Watchdog Timer driver for RZ/G2L") Signed-off-by: Biju Das Reviewed-by: Guenter Roeck Reviewed-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20220225175320.11041-2-biju.das.jz@bp.renesas.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/rzg2l_wdt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/watchdog/rzg2l_wdt.c b/drivers/watchdog/rzg2l_wdt.c index 6b426df34fd6..96f2a018ab62 100644 --- a/drivers/watchdog/rzg2l_wdt.c +++ b/drivers/watchdog/rzg2l_wdt.c @@ -53,7 +53,7 @@ static void rzg2l_wdt_wait_delay(struct rzg2l_wdt_priv *priv) static u32 rzg2l_wdt_get_cycle_usec(unsigned long cycle, u32 wdttime) { - u64 timer_cycle_us = 1024 * 1024 * (wdttime + 1) * MICRO; + u64 timer_cycle_us = 1024 * 1024ULL * (wdttime + 1) * MICRO; return div64_ul(timer_cycle_us, cycle); } From 95abafe76297fa057de6c3486ef844bd446bdf18 Mon Sep 17 00:00:00 2001 From: Biju Das Date: Fri, 25 Feb 2022 17:53:15 +0000 Subject: [PATCH 083/334] watchdog: rzg2l_wdt: Fix Runtime PM usage Both rzg2l_wdt_probe() and rzg2l_wdt_start() calls pm_runtime_get() which results in a usage counter imbalance. This patch fixes this issue by removing pm_runtime_get() call from probe. Fixes: 2cbc5cd0b55fa2 ("watchdog: Add Watchdog Timer driver for RZ/G2L") Signed-off-by: Biju Das Reviewed-by: Geert Uytterhoeven Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20220225175320.11041-3-biju.das.jz@bp.renesas.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/rzg2l_wdt.c | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/drivers/watchdog/rzg2l_wdt.c b/drivers/watchdog/rzg2l_wdt.c index 96f2a018ab62..0fc73b8a9567 100644 --- a/drivers/watchdog/rzg2l_wdt.c +++ b/drivers/watchdog/rzg2l_wdt.c @@ -151,12 +151,11 @@ static const struct watchdog_ops rzg2l_wdt_ops = { .restart = rzg2l_wdt_restart, }; -static void rzg2l_wdt_reset_assert_pm_disable_put(void *data) +static void rzg2l_wdt_reset_assert_pm_disable(void *data) { struct watchdog_device *wdev = data; struct rzg2l_wdt_priv *priv = watchdog_get_drvdata(wdev); - pm_runtime_put(wdev->parent); pm_runtime_disable(wdev->parent); reset_control_assert(priv->rstc); } @@ -206,11 +205,6 @@ static int rzg2l_wdt_probe(struct platform_device *pdev) reset_control_deassert(priv->rstc); pm_runtime_enable(&pdev->dev); - ret = pm_runtime_resume_and_get(&pdev->dev); - if (ret < 0) { - dev_err(dev, "pm_runtime_resume_and_get failed ret=%pe", ERR_PTR(ret)); - goto out_pm_get; - } priv->wdev.info = &rzg2l_wdt_ident; priv->wdev.ops = &rzg2l_wdt_ops; @@ -222,7 +216,7 @@ static int rzg2l_wdt_probe(struct platform_device *pdev) watchdog_set_drvdata(&priv->wdev, priv); ret = devm_add_action_or_reset(&pdev->dev, - rzg2l_wdt_reset_assert_pm_disable_put, + rzg2l_wdt_reset_assert_pm_disable, &priv->wdev); if (ret < 0) return ret; @@ -235,12 +229,6 @@ static int rzg2l_wdt_probe(struct platform_device *pdev) dev_warn(dev, "Specified timeout invalid, using default"); return devm_watchdog_register_device(&pdev->dev, &priv->wdev); - -out_pm_get: - pm_runtime_disable(dev); - reset_control_assert(priv->rstc); - - return ret; } static const struct of_device_id rzg2l_wdt_ids[] = { From e4cf89596c1f1e33309556699f910ced4abbaf44 Mon Sep 17 00:00:00 2001 From: Biju Das Date: Fri, 25 Feb 2022 17:53:16 +0000 Subject: [PATCH 084/334] watchdog: rzg2l_wdt: Fix 'BUG: Invalid wait context' This patch fixes the issue 'BUG: Invalid wait context' during restart() 
callback by using clk_prepare_enable() instead of pm_runtime_get_sync() for turning on the clocks during restart. This issue is noticed when testing with renesas_defconfig. [ 42.213802] reboot: Restarting system [ 42.217860] [ 42.219364] ============================= [ 42.223368] [ BUG: Invalid wait context ] [ 42.227372] 5.17.0-rc5-arm64-renesas-00002-g10393723e35e #522 Not tainted [ 42.234153] ----------------------------- [ 42.238155] systemd-shutdow/1 is trying to lock: [ 42.242766] ffff00000a650828 (&genpd->mlock){+.+.}-{3:3}, at: genpd_lock_mtx+0x14/0x20 [ 42.250709] other info that might help us debug this: [ 42.255753] context-{4:4} [ 42.258368] 2 locks held by systemd-shutdow/1: [ 42.262806] #0: ffff80000944e1c8 (system_transition_mutex#2){+.+.}-{3:3}, at: __do_sys_reboot+0xd0/0x250 [ 42.272388] #1: ffff8000094c4e40 (rcu_read_lock){....}-{1:2}, at: atomic_notifier_call_chain+0x0/0x150 [ 42.281795] stack backtrace: [ 42.284672] CPU: 0 PID: 1 Comm: systemd-shutdow Not tainted 5.17.0-rc5-arm64-renesas-00002-g10393723e35e #522 [ 42.294577] Hardware name: Renesas SMARC EVK based on r9a07g044c2 (DT) [ 42.301096] Call trace: [ 42.303538] dump_backtrace+0xcc/0xd8 [ 42.307203] show_stack+0x14/0x30 [ 42.310517] dump_stack_lvl+0x88/0xb0 [ 42.314180] dump_stack+0x14/0x2c [ 42.317492] __lock_acquire+0x1b24/0x1b50 [ 42.321502] lock_acquire+0x120/0x3a8 [ 42.325162] __mutex_lock+0x84/0x8f8 [ 42.328737] mutex_lock_nested+0x30/0x58 [ 42.332658] genpd_lock_mtx+0x14/0x20 [ 42.336319] genpd_runtime_resume+0xc4/0x228 [ 42.340587] __rpm_callback+0x44/0x170 [ 42.344337] rpm_callback+0x64/0x70 [ 42.347824] rpm_resume+0x4e0/0x6b8 [ 42.351310] __pm_runtime_resume+0x50/0x78 [ 42.355404] rzg2l_wdt_restart+0x28/0x68 [ 42.359329] watchdog_restart_notifier+0x1c/0x30 [ 42.363943] atomic_notifier_call_chain+0x94/0x150 [ 42.368732] do_kernel_restart+0x24/0x30 [ 42.372652] machine_restart+0x44/0x70 [ 42.376399] kernel_restart+0x3c/0x60 [ 42.380058] __do_sys_reboot+0x228/0x250 [ 42.383977] __arm64_sys_reboot+0x20/0x28 [ 42.387983] invoke_syscall+0x40/0xf8 Fixes: 2cbc5cd0b55fa2 ("watchdog: Add Watchdog Timer driver for RZ/G2L") Signed-off-by: Biju Das Reviewed-by: Geert Uytterhoeven Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20220225175320.11041-4-biju.das.jz@bp.renesas.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/rzg2l_wdt.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/drivers/watchdog/rzg2l_wdt.c b/drivers/watchdog/rzg2l_wdt.c index 0fc73b8a9567..48dfe6e5e64f 100644 --- a/drivers/watchdog/rzg2l_wdt.c +++ b/drivers/watchdog/rzg2l_wdt.c @@ -43,6 +43,8 @@ struct rzg2l_wdt_priv { struct reset_control *rstc; unsigned long osc_clk_rate; unsigned long delay; + struct clk *pclk; + struct clk *osc_clk; }; static void rzg2l_wdt_wait_delay(struct rzg2l_wdt_priv *priv) @@ -118,7 +120,9 @@ static int rzg2l_wdt_restart(struct watchdog_device *wdev, /* Reset the module before we modify any register */ reset_control_reset(priv->rstc); - pm_runtime_get_sync(wdev->parent); + + clk_prepare_enable(priv->pclk); + clk_prepare_enable(priv->osc_clk); /* smallest counter value to reboot soon */ rzg2l_wdt_write(priv, WDTSET_COUNTER_VAL(1), WDTSET); @@ -165,7 +169,6 @@ static int rzg2l_wdt_probe(struct platform_device *pdev) struct device *dev = &pdev->dev; struct rzg2l_wdt_priv *priv; unsigned long pclk_rate; - struct clk *wdt_clk; int ret; priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL); @@ -177,22 +180,20 @@ static int 
rzg2l_wdt_probe(struct platform_device *pdev) return PTR_ERR(priv->base); /* Get watchdog main clock */ - wdt_clk = clk_get(&pdev->dev, "oscclk"); - if (IS_ERR(wdt_clk)) - return dev_err_probe(&pdev->dev, PTR_ERR(wdt_clk), "no oscclk"); + priv->osc_clk = devm_clk_get(&pdev->dev, "oscclk"); + if (IS_ERR(priv->osc_clk)) + return dev_err_probe(&pdev->dev, PTR_ERR(priv->osc_clk), "no oscclk"); - priv->osc_clk_rate = clk_get_rate(wdt_clk); - clk_put(wdt_clk); + priv->osc_clk_rate = clk_get_rate(priv->osc_clk); if (!priv->osc_clk_rate) return dev_err_probe(&pdev->dev, -EINVAL, "oscclk rate is 0"); /* Get Peripheral clock */ - wdt_clk = clk_get(&pdev->dev, "pclk"); - if (IS_ERR(wdt_clk)) - return dev_err_probe(&pdev->dev, PTR_ERR(wdt_clk), "no pclk"); + priv->pclk = devm_clk_get(&pdev->dev, "pclk"); + if (IS_ERR(priv->pclk)) + return dev_err_probe(&pdev->dev, PTR_ERR(priv->pclk), "no pclk"); - pclk_rate = clk_get_rate(wdt_clk); - clk_put(wdt_clk); + pclk_rate = clk_get_rate(priv->pclk); if (!pclk_rate) return dev_err_probe(&pdev->dev, -EINVAL, "pclk rate is 0"); From 33d04d0fdba9fae18c7d58364643d2c606a43dba Mon Sep 17 00:00:00 2001 From: Biju Das Date: Fri, 25 Feb 2022 17:53:17 +0000 Subject: [PATCH 085/334] watchdog: rzg2l_wdt: Fix reset control imbalance Both rzg2l_wdt_probe() and rzg2l_wdt_start() calls reset_control_ deassert() which results in a reset control imbalance. This patch fixes reset control imbalance by removing reset_control_ deassert() from rzg2l_wdt_start() and replaces reset_control_assert with reset_control_reset in rzg2l_wdt_stop() as watchdog module can be stopped only by a module reset. This change will allow us to restart WDT after stop() by configuring WDT timeout and enable registers. Fixes: 2cbc5cd0b55fa2 ("watchdog: Add Watchdog Timer driver for RZ/G2L") Signed-off-by: Biju Das Reviewed-by: Geert Uytterhoeven Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20220225175320.11041-5-biju.das.jz@bp.renesas.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/rzg2l_wdt.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/watchdog/rzg2l_wdt.c b/drivers/watchdog/rzg2l_wdt.c index 48dfe6e5e64f..88274704b260 100644 --- a/drivers/watchdog/rzg2l_wdt.c +++ b/drivers/watchdog/rzg2l_wdt.c @@ -88,7 +88,6 @@ static int rzg2l_wdt_start(struct watchdog_device *wdev) { struct rzg2l_wdt_priv *priv = watchdog_get_drvdata(wdev); - reset_control_deassert(priv->rstc); pm_runtime_get_sync(wdev->parent); /* Initialize time out */ @@ -108,7 +107,7 @@ static int rzg2l_wdt_stop(struct watchdog_device *wdev) struct rzg2l_wdt_priv *priv = watchdog_get_drvdata(wdev); pm_runtime_put(wdev->parent); - reset_control_assert(priv->rstc); + reset_control_reset(priv->rstc); return 0; } From baf1aace9ad15401f08e048a7f1fdec79821bc61 Mon Sep 17 00:00:00 2001 From: Biju Das Date: Fri, 25 Feb 2022 17:53:18 +0000 Subject: [PATCH 086/334] watchdog: rzg2l_wdt: Add error check for reset_control_deassert If reset_control_deassert() fails, then we won't be able to access the device registers. Therefore check the return code of reset_control_deassert() and bailout in case of error. 
Signed-off-by: Biju Das Reviewed-by: Geert Uytterhoeven Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20220225175320.11041-6-biju.das.jz@bp.renesas.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/rzg2l_wdt.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/watchdog/rzg2l_wdt.c b/drivers/watchdog/rzg2l_wdt.c index 88274704b260..73b667ed3e99 100644 --- a/drivers/watchdog/rzg2l_wdt.c +++ b/drivers/watchdog/rzg2l_wdt.c @@ -203,7 +203,10 @@ static int rzg2l_wdt_probe(struct platform_device *pdev) return dev_err_probe(&pdev->dev, PTR_ERR(priv->rstc), "failed to get cpg reset"); - reset_control_deassert(priv->rstc); + ret = reset_control_deassert(priv->rstc); + if (ret) + return dev_err_probe(dev, ret, "failed to deassert"); + pm_runtime_enable(&pdev->dev); priv->wdev.info = &rzg2l_wdt_ident; From f43e6ddbd7d7b63b9e71927a1f50860f8d55f9cc Mon Sep 17 00:00:00 2001 From: Biju Das Date: Fri, 25 Feb 2022 17:53:19 +0000 Subject: [PATCH 087/334] watchdog: rzg2l_wdt: Use force reset for WDT reset This patch uses the force reset(WDTRSTB) for triggering WDT reset for restart callback. This method(ie, Generate Reset (WDTRSTB) Signal on parity error)is faster compared to the overflow method for triggering watchdog reset. Overflow method: reboot: Restarting system Reboot failed -- System halted NOTICE: BL2: v2.5(release):v2.5/rzg2l-1.00-27-gf48f1440c Parity error method: reboot: Restarting system NOTICE: BL2: v2.5(release):v2.5/rzg2l-1.00-27-gf48f1440c Signed-off-by: Biju Das Reviewed-by: Geert Uytterhoeven Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20220225175320.11041-7-biju.das.jz@bp.renesas.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/rzg2l_wdt.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/watchdog/rzg2l_wdt.c b/drivers/watchdog/rzg2l_wdt.c index 73b667ed3e99..4e7107655cc2 100644 --- a/drivers/watchdog/rzg2l_wdt.c +++ b/drivers/watchdog/rzg2l_wdt.c @@ -21,8 +21,11 @@ #define WDTSET 0x04 #define WDTTIM 0x08 #define WDTINT 0x0C +#define PECR 0x10 +#define PEEN 0x14 #define WDTCNT_WDTEN BIT(0) #define WDTINT_INTDISP BIT(0) +#define PEEN_FORCE BIT(0) #define WDT_DEFAULT_TIMEOUT 60U @@ -117,17 +120,14 @@ static int rzg2l_wdt_restart(struct watchdog_device *wdev, { struct rzg2l_wdt_priv *priv = watchdog_get_drvdata(wdev); - /* Reset the module before we modify any register */ - reset_control_reset(priv->rstc); - clk_prepare_enable(priv->pclk); clk_prepare_enable(priv->osc_clk); - /* smallest counter value to reboot soon */ - rzg2l_wdt_write(priv, WDTSET_COUNTER_VAL(1), WDTSET); + /* Generate Reset (WDTRSTB) Signal on parity error */ + rzg2l_wdt_write(priv, 0, PECR); - /* Enable watchdog timer*/ - rzg2l_wdt_write(priv, WDTCNT_WDTEN, WDTCNT); + /* Force parity error */ + rzg2l_wdt_write(priv, PEEN_FORCE, PEEN); return 0; } From 4055ee81009e606e830af1acd9e2e35a36249713 Mon Sep 17 00:00:00 2001 From: Biju Das Date: Fri, 25 Feb 2022 17:53:20 +0000 Subject: [PATCH 088/334] watchdog: rzg2l_wdt: Add set_timeout callback This patch adds support for set_timeout callback. Once WDT is started, the WDT cycle setting register(WDTSET) can be updated only after issuing a module reset. Otherwise, it will ignore the writes and will hold the previous value. This patch updates the WDTSET register if it is active. 
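For context, ops->set_timeout is what the watchdog core calls when user space changes the timeout at run time, so the new callback can be exercised with a minimal test program like the hypothetical one below; the device path and the 30-second value are illustrative, and the ioctl only reaches the driver if the usual WDIOF_SETTIMEOUT option is advertised.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/watchdog.h>

int main(void)
{
	int fd, timeout = 30, cur = 0;

	fd = open("/dev/watchdog0", O_WRONLY);	/* opening starts the timer */
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* WDIOC_SETTIMEOUT is routed by the watchdog core to the driver's
	 * set_timeout callback while the watchdog is already running. */
	if (ioctl(fd, WDIOC_SETTIMEOUT, &timeout) == 0 &&
	    ioctl(fd, WDIOC_GETTIMEOUT, &cur) == 0)
		printf("timeout is now %d s\n", cur);

	ioctl(fd, WDIOC_KEEPALIVE, 0);		/* ping once */
	if (write(fd, "V", 1) < 0)		/* magic close, unless nowayout */
		perror("write");
	close(fd);
	return 0;
}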
Signed-off-by: Biju Das Reviewed-by: Geert Uytterhoeven Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20220225175320.11041-8-biju.das.jz@bp.renesas.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/rzg2l_wdt.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/drivers/watchdog/rzg2l_wdt.c b/drivers/watchdog/rzg2l_wdt.c index 4e7107655cc2..6eea0ee4af49 100644 --- a/drivers/watchdog/rzg2l_wdt.c +++ b/drivers/watchdog/rzg2l_wdt.c @@ -115,6 +115,25 @@ static int rzg2l_wdt_stop(struct watchdog_device *wdev) return 0; } +static int rzg2l_wdt_set_timeout(struct watchdog_device *wdev, unsigned int timeout) +{ + struct rzg2l_wdt_priv *priv = watchdog_get_drvdata(wdev); + + wdev->timeout = timeout; + + /* + * If the watchdog is active, reset the module for updating the WDTSET + * register so that it is updated with new timeout values. + */ + if (watchdog_active(wdev)) { + pm_runtime_put(wdev->parent); + reset_control_reset(priv->rstc); + rzg2l_wdt_start(wdev); + } + + return 0; +} + static int rzg2l_wdt_restart(struct watchdog_device *wdev, unsigned long action, void *data) { @@ -151,6 +170,7 @@ static const struct watchdog_ops rzg2l_wdt_ops = { .start = rzg2l_wdt_start, .stop = rzg2l_wdt_stop, .ping = rzg2l_wdt_ping, + .set_timeout = rzg2l_wdt_set_timeout, .restart = rzg2l_wdt_restart, }; From 83999b61d583cd07492600ea7b5cde3fbfd863fb Mon Sep 17 00:00:00 2001 From: Lad Prabhakar Date: Tue, 1 Mar 2022 12:23:32 +0000 Subject: [PATCH 089/334] dt-bindings: watchdog: renesas,wdt: Document RZ/V2L SoC Document RZ/V2L WDT bindings. RZ/V2L WDT is identical to one found on the RZ/G2L SoC. No driver changes are required as generic compatible string "renesas,rzg2l-wdt" will be used as a fallback. While at it, drop the comment "# RZ/G2L" for "renesas,rzg2l-wdt" compatible string as this will avoid changing the line for every new SoC addition. Signed-off-by: Lad Prabhakar Reviewed-by: Biju Das Acked-by: Krzysztof Kozlowski Reviewed-by: Geert Uytterhoeven Reviewed-by: Wolfram Sang Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20220301122332.14796-1-prabhakar.mahadev-lad.rj@bp.renesas.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- Documentation/devicetree/bindings/watchdog/renesas,wdt.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/watchdog/renesas,wdt.yaml b/Documentation/devicetree/bindings/watchdog/renesas,wdt.yaml index d060438e1402..59a48387e72d 100644 --- a/Documentation/devicetree/bindings/watchdog/renesas,wdt.yaml +++ b/Documentation/devicetree/bindings/watchdog/renesas,wdt.yaml @@ -22,7 +22,8 @@ properties: - items: - enum: - renesas,r9a07g044-wdt # RZ/G2{L,LC} - - const: renesas,rzg2l-wdt # RZ/G2L + - renesas,r9a07g054-wdt # RZ/V2L + - const: renesas,rzg2l-wdt - items: - enum: From 5794dda109fc870fce48a8cdf580750b40bf9d15 Mon Sep 17 00:00:00 2001 From: Sam Shih Date: Wed, 5 Jan 2022 18:04:55 +0800 Subject: [PATCH 090/334] dt-bindings: reset: mt7986: Add reset-controller header file Add infracfg, toprgu, and ethsys reset-controller header file for MT7986 platform. 
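These constants are consumed from device trees rather than from C code, but for orientation here is a hypothetical consumer sketch (invented names, not from the kernel tree): a peripheral node that references the reset controller with one of the indices above, for example resets = <&watchdog MT7986_TOPRGU_ETHDMA_SW_RST>;, can then pulse that line through the generic reset API.

#include <linux/device.h>
#include <linux/err.h>
#include <linux/reset.h>

/* Hypothetical helper: cycle the reset line that the consumer's DT node
 * selected via an index from mt7986-resets.h. */
static int example_cycle_block_reset(struct device *dev)
{
	struct reset_control *rstc;

	rstc = devm_reset_control_get_exclusive(dev, NULL);
	if (IS_ERR(rstc))
		return PTR_ERR(rstc);

	/* Assert and deassert in one go; reset_control_assert() and
	 * reset_control_deassert() could be used for explicit sequencing. */
	return reset_control_reset(rstc);
}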
Signed-off-by: Sam Shih Acked-by: Rob Herring Acked-by: Guenter Roeck Link: https://lore.kernel.org/r/20220105100456.7126-2-sam.shih@mediatek.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- include/dt-bindings/reset/mt7986-resets.h | 55 +++++++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 include/dt-bindings/reset/mt7986-resets.h diff --git a/include/dt-bindings/reset/mt7986-resets.h b/include/dt-bindings/reset/mt7986-resets.h new file mode 100644 index 000000000000..af3d16c81192 --- /dev/null +++ b/include/dt-bindings/reset/mt7986-resets.h @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause) */ +/* + * Copyright (c) 2022 MediaTek Inc. + * Author: Sam Shih + */ + +#ifndef _DT_BINDINGS_RESET_CONTROLLER_MT7986 +#define _DT_BINDINGS_RESET_CONTROLLER_MT7986 + +/* INFRACFG resets */ +#define MT7986_INFRACFG_PEXTP_MAC_SW_RST 6 +#define MT7986_INFRACFG_SSUSB_SW_RST 7 +#define MT7986_INFRACFG_EIP97_SW_RST 8 +#define MT7986_INFRACFG_AUDIO_SW_RST 13 +#define MT7986_INFRACFG_CQ_DMA_SW_RST 14 + +#define MT7986_INFRACFG_TRNG_SW_RST 17 +#define MT7986_INFRACFG_AP_DMA_SW_RST 32 +#define MT7986_INFRACFG_I2C_SW_RST 33 +#define MT7986_INFRACFG_NFI_SW_RST 34 +#define MT7986_INFRACFG_SPI0_SW_RST 35 +#define MT7986_INFRACFG_SPI1_SW_RST 36 +#define MT7986_INFRACFG_UART0_SW_RST 37 +#define MT7986_INFRACFG_UART1_SW_RST 38 +#define MT7986_INFRACFG_UART2_SW_RST 39 +#define MT7986_INFRACFG_AUXADC_SW_RST 43 + +#define MT7986_INFRACFG_APXGPT_SW_RST 66 +#define MT7986_INFRACFG_PWM_SW_RST 68 + +#define MT7986_INFRACFG_SW_RST_NUM 69 + +/* TOPRGU resets */ +#define MT7986_TOPRGU_APMIXEDSYS_SW_RST 0 +#define MT7986_TOPRGU_SGMII0_SW_RST 1 +#define MT7986_TOPRGU_SGMII1_SW_RST 2 +#define MT7986_TOPRGU_INFRA_SW_RST 3 +#define MT7986_TOPRGU_U2PHY_SW_RST 5 +#define MT7986_TOPRGU_PCIE_SW_RST 6 +#define MT7986_TOPRGU_SSUSB_SW_RST 7 +#define MT7986_TOPRGU_ETHDMA_SW_RST 20 +#define MT7986_TOPRGU_CONSYS_SW_RST 23 + +#define MT7986_TOPRGU_SW_RST_NUM 24 + +/* ETHSYS Subsystem resets */ +#define MT7986_ETHSYS_FE_SW_RST 6 +#define MT7986_ETHSYS_PMTR_SW_RST 8 +#define MT7986_ETHSYS_GMAC_SW_RST 23 +#define MT7986_ETHSYS_PPE0_SW_RST 30 +#define MT7986_ETHSYS_PPE1_SW_RST 31 + +#define MT7986_ETHSYS_SW_RST_NUM 32 + +#endif /* _DT_BINDINGS_RESET_CONTROLLER_MT7986 */ From 94e4a7d5f87668e43f6939c6418d64f884e0bdab Mon Sep 17 00:00:00 2001 From: Xiantao Hu Date: Thu, 24 Mar 2022 11:18:04 +0800 Subject: [PATCH 091/334] dt-bindings: watchdog: Add watchdog yaml file for Sunplus SP7021 This adds the documentation for the devicetree bindings of the Sunplus SP7021 watchdog driver, found from SP7021 SoCs and newer. Signed-off-by: Xiantao Hu Reviewed-by: Krzysztof Kozlowski Reviewed-by: Rob Herring Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20220324031805.61316-2-xt.hu@cqplus1.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- .../bindings/watchdog/sunplus,sp7021-wdt.yaml | 47 +++++++++++++++++++ MAINTAINERS | 6 +++ 2 files changed, 53 insertions(+) create mode 100644 Documentation/devicetree/bindings/watchdog/sunplus,sp7021-wdt.yaml diff --git a/Documentation/devicetree/bindings/watchdog/sunplus,sp7021-wdt.yaml b/Documentation/devicetree/bindings/watchdog/sunplus,sp7021-wdt.yaml new file mode 100644 index 000000000000..d90271013191 --- /dev/null +++ b/Documentation/devicetree/bindings/watchdog/sunplus,sp7021-wdt.yaml @@ -0,0 +1,47 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +# Copyright (C) Sunplus Co., Ltd. 
2021 +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/watchdog/sunplus,sp7021-wdt.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Sunplus SoCs Watchdog + +maintainers: + - XianTao Hu + +allOf: + - $ref: watchdog.yaml# + +properties: + compatible: + const: sunplus,sp7021-wdt + + reg: + items: + - description: watchdog registers regions + - description: miscellaneous control registers regions + + clocks: + maxItems: 1 + + resets: + maxItems: 1 + +required: + - compatible + - reg + - clocks + - resets + +additionalProperties: false + +examples: + - | + watchdog: watchdog@9c000630 { + compatible = "sunplus,sp7021-wdt"; + reg = <0x9c000630 0x08>, <0x9c000274 0x04>; + clocks = <&clkc 0x24>; + resets = <&rstc 0x14>; + }; +... diff --git a/MAINTAINERS b/MAINTAINERS index edc96cdb85e8..fade1e499add 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -18902,6 +18902,12 @@ S: Maintained F: Documentation/devicetree/bindings/serial/sunplus,sp7021-uart.yaml F: drivers/tty/serial/sunplus-uart.c +SUNPLUS WATCHDOG DRIVER +M: Xiantao Hu +L: linux-watchdog@vger.kernel.org +S: Maintained +F: Documentation/devicetree/bindings/watchdog/sunplus,sp7021-wdt.yaml + SUPERH M: Yoshinori Sato M: Rich Felker From daf42866960ce69d48c94c4fc1aa997638b4c2e9 Mon Sep 17 00:00:00 2001 From: Xiantao Hu Date: Thu, 24 Mar 2022 11:18:05 +0800 Subject: [PATCH 092/334] watchdog: Add watchdog driver for Sunplus SP7021 Sunplus SP7021 requires watchdog timer support. Add watchdog driver to enable this. Signed-off-by: Xiantao Hu Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20220324031805.61316-3-xt.hu@cqplus1.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- MAINTAINERS | 1 + drivers/watchdog/Kconfig | 11 ++ drivers/watchdog/Makefile | 1 + drivers/watchdog/sunplus_wdt.c | 220 +++++++++++++++++++++++++++++++++ 4 files changed, 233 insertions(+) create mode 100644 drivers/watchdog/sunplus_wdt.c diff --git a/MAINTAINERS b/MAINTAINERS index fade1e499add..d02286f5084c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -18907,6 +18907,7 @@ M: Xiantao Hu L: linux-watchdog@vger.kernel.org S: Maintained F: Documentation/devicetree/bindings/watchdog/sunplus,sp7021-wdt.yaml +F: drivers/watchdog/sunplus_wdt.c SUPERH M: Yoshinori Sato diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig index c4e82a8d863f..a7cd3ef5b3d8 100644 --- a/drivers/watchdog/Kconfig +++ b/drivers/watchdog/Kconfig @@ -1011,6 +1011,17 @@ config APPLE_WATCHDOG To compile this driver as a module, choose M here: the module will be called apple_wdt. +config SUNPLUS_WATCHDOG + tristate "Sunplus watchdog support" + depends on ARCH_SUNPLUS || COMPILE_TEST + select WATCHDOG_CORE + help + Say Y here to include support for the watchdog timer + in Sunplus SoCs. + + To compile this driver as a module, choose M here: the + module will be called sunplus_wdt. 
+ # X86 (i386 + ia64 + x86_64) Architecture config ACQUIRE_WDT diff --git a/drivers/watchdog/Makefile b/drivers/watchdog/Makefile index f7da867e8782..71c933a5964a 100644 --- a/drivers/watchdog/Makefile +++ b/drivers/watchdog/Makefile @@ -95,6 +95,7 @@ obj-$(CONFIG_ARM_SMC_WATCHDOG) += arm_smc_wdt.o obj-$(CONFIG_VISCONTI_WATCHDOG) += visconti_wdt.o obj-$(CONFIG_MSC313E_WATCHDOG) += msc313e_wdt.o obj-$(CONFIG_APPLE_WATCHDOG) += apple_wdt.o +obj-$(CONFIG_SUNPLUS_WATCHDOG) += sunplus_wdt.o # X86 (i386 + ia64 + x86_64) Architecture obj-$(CONFIG_ACQUIRE_WDT) += acquirewdt.o diff --git a/drivers/watchdog/sunplus_wdt.c b/drivers/watchdog/sunplus_wdt.c new file mode 100644 index 000000000000..e2d8c532bcb1 --- /dev/null +++ b/drivers/watchdog/sunplus_wdt.c @@ -0,0 +1,220 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * sunplus Watchdog Driver + * + * Copyright (C) 2021 Sunplus Technology Co., Ltd. + * + */ + +#include +#include +#include +#include +#include +#include +#include + +#define WDT_CTRL 0x00 +#define WDT_CNT 0x04 + +#define WDT_STOP 0x3877 +#define WDT_RESUME 0x4A4B +#define WDT_CLRIRQ 0x7482 +#define WDT_UNLOCK 0xAB00 +#define WDT_LOCK 0xAB01 +#define WDT_CONMAX 0xDEAF + +/* TIMEOUT_MAX = ffff0/90kHz =11.65, so longer than 11 seconds will time out. */ +#define SP_WDT_MAX_TIMEOUT 11U +#define SP_WDT_DEFAULT_TIMEOUT 10 + +#define STC_CLK 90000 + +#define DEVICE_NAME "sunplus-wdt" + +static unsigned int timeout; +module_param(timeout, int, 0); +MODULE_PARM_DESC(timeout, "Watchdog timeout in seconds"); + +static bool nowayout = WATCHDOG_NOWAYOUT; +module_param(nowayout, bool, 0); +MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started (default=" + __MODULE_STRING(WATCHDOG_NOWAYOUT) ")"); + +struct sp_wdt_priv { + struct watchdog_device wdev; + void __iomem *base; + struct clk *clk; + struct reset_control *rstc; +}; + +static int sp_wdt_restart(struct watchdog_device *wdev, + unsigned long action, void *data) +{ + struct sp_wdt_priv *priv = watchdog_get_drvdata(wdev); + void __iomem *base = priv->base; + + writel(WDT_STOP, base + WDT_CTRL); + writel(WDT_UNLOCK, base + WDT_CTRL); + writel(0x0001, base + WDT_CNT); + writel(WDT_LOCK, base + WDT_CTRL); + writel(WDT_RESUME, base + WDT_CTRL); + + return 0; +} + +static int sp_wdt_ping(struct watchdog_device *wdev) +{ + struct sp_wdt_priv *priv = watchdog_get_drvdata(wdev); + void __iomem *base = priv->base; + u32 count; + + if (wdev->timeout > SP_WDT_MAX_TIMEOUT) { + /* WDT_CONMAX sets the count to the maximum (down-counting). */ + writel(WDT_CONMAX, base + WDT_CTRL); + } else { + writel(WDT_UNLOCK, base + WDT_CTRL); + /* + * Watchdog timer is a 20-bit down-counting based on STC_CLK. + * This register bits[16:0] is from bit[19:4] of the watchdog + * timer counter. 
+ */ + count = (wdev->timeout * STC_CLK) >> 4; + writel(count, base + WDT_CNT); + writel(WDT_LOCK, base + WDT_CTRL); + } + + return 0; +} + +static int sp_wdt_stop(struct watchdog_device *wdev) +{ + struct sp_wdt_priv *priv = watchdog_get_drvdata(wdev); + void __iomem *base = priv->base; + + writel(WDT_STOP, base + WDT_CTRL); + + return 0; +} + +static int sp_wdt_start(struct watchdog_device *wdev) +{ + struct sp_wdt_priv *priv = watchdog_get_drvdata(wdev); + void __iomem *base = priv->base; + + writel(WDT_RESUME, base + WDT_CTRL); + + return 0; +} + +static unsigned int sp_wdt_get_timeleft(struct watchdog_device *wdev) +{ + struct sp_wdt_priv *priv = watchdog_get_drvdata(wdev); + void __iomem *base = priv->base; + u32 val; + + val = readl(base + WDT_CNT); + val &= 0xffff; + val = val << 4; + + return val; +} + +static const struct watchdog_info sp_wdt_info = { + .identity = DEVICE_NAME, + .options = WDIOF_SETTIMEOUT | + WDIOF_MAGICCLOSE | + WDIOF_KEEPALIVEPING, +}; + +static const struct watchdog_ops sp_wdt_ops = { + .owner = THIS_MODULE, + .start = sp_wdt_start, + .stop = sp_wdt_stop, + .ping = sp_wdt_ping, + .get_timeleft = sp_wdt_get_timeleft, + .restart = sp_wdt_restart, +}; + +static void sp_clk_disable_unprepare(void *data) +{ + clk_disable_unprepare(data); +} + +static void sp_reset_control_assert(void *data) +{ + reset_control_assert(data); +} + +static int sp_wdt_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct sp_wdt_priv *priv; + int ret; + + priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + priv->clk = devm_clk_get(dev, NULL); + if (IS_ERR(priv->clk)) + return dev_err_probe(dev, PTR_ERR(priv->clk), "Failed to get clock\n"); + + ret = clk_prepare_enable(priv->clk); + if (ret) + return dev_err_probe(dev, ret, "Failed to enable clock\n"); + + ret = devm_add_action_or_reset(dev, sp_clk_disable_unprepare, priv->clk); + if (ret) + return ret; + + /* The timer and watchdog shared the STC reset */ + priv->rstc = devm_reset_control_get_shared(dev, NULL); + if (IS_ERR(priv->rstc)) + return dev_err_probe(dev, PTR_ERR(priv->rstc), "Failed to get reset\n"); + + reset_control_deassert(priv->rstc); + + ret = devm_add_action_or_reset(dev, sp_reset_control_assert, priv->rstc); + if (ret) + return ret; + + priv->base = devm_platform_ioremap_resource(pdev, 0); + if (IS_ERR(priv->base)) + return PTR_ERR(priv->base); + + priv->wdev.info = &sp_wdt_info; + priv->wdev.ops = &sp_wdt_ops; + priv->wdev.timeout = SP_WDT_DEFAULT_TIMEOUT; + priv->wdev.max_hw_heartbeat_ms = SP_WDT_MAX_TIMEOUT * 1000; + priv->wdev.min_timeout = 1; + priv->wdev.parent = dev; + + watchdog_set_drvdata(&priv->wdev, priv); + watchdog_init_timeout(&priv->wdev, timeout, dev); + watchdog_set_nowayout(&priv->wdev, nowayout); + watchdog_stop_on_reboot(&priv->wdev); + watchdog_set_restart_priority(&priv->wdev, 128); + + return devm_watchdog_register_device(dev, &priv->wdev); +} + +static const struct of_device_id sp_wdt_of_match[] = { + {.compatible = "sunplus,sp7021-wdt", }, + { /* sentinel */ } +}; +MODULE_DEVICE_TABLE(of, sp_wdt_of_match); + +static struct platform_driver sp_wdt_driver = { + .probe = sp_wdt_probe, + .driver = { + .name = DEVICE_NAME, + .of_match_table = sp_wdt_of_match, + }, +}; + +module_platform_driver(sp_wdt_driver); + +MODULE_AUTHOR("Xiantao Hu "); +MODULE_DESCRIPTION("Sunplus Watchdog Timer Driver"); +MODULE_LICENSE("GPL"); From 70fabe207135194a48313bf56305f1ead32cea1f Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Thu, 17 Mar 2022 
16:23:38 +0000 Subject: [PATCH 093/334] dt-bindings: watchdog: sunxi: fix F1C100s compatible The F1C100 series actually features a newer generation watchdog IP, so the compatible string was wrong. Signed-off-by: Andre Przywara Acked-by: Rob Herring Reviewed-by: Samuel Holland Link: https://lore.kernel.org/r/20220317162349.739636-2-andre.przywara@arm.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- .../devicetree/bindings/watchdog/allwinner,sun4i-a10-wdt.yaml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/Documentation/devicetree/bindings/watchdog/allwinner,sun4i-a10-wdt.yaml b/Documentation/devicetree/bindings/watchdog/allwinner,sun4i-a10-wdt.yaml index 43afa24513b9..7a26cde0afdd 100644 --- a/Documentation/devicetree/bindings/watchdog/allwinner,sun4i-a10-wdt.yaml +++ b/Documentation/devicetree/bindings/watchdog/allwinner,sun4i-a10-wdt.yaml @@ -26,10 +26,8 @@ properties: - allwinner,sun50i-h616-wdt - allwinner,sun50i-r329-wdt - allwinner,sun50i-r329-wdt-reset + - allwinner,suniv-f1c100s-wdt - const: allwinner,sun6i-a31-wdt - - items: - - const: allwinner,suniv-f1c100s-wdt - - const: allwinner,sun4i-a10-wdt - const: allwinner,sun20i-d1-wdt - items: - const: allwinner,sun20i-d1-wdt-reset From 5b38db0ed51e17ab1135f8e354951aeb46aa57d3 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Thu, 17 Mar 2022 16:23:39 +0000 Subject: [PATCH 094/334] dt-bindings: watchdog: sunxi: clarify clock support Most Allwinner SoCs have just one input clock to drive the watchdog peripheral. So far this is the 24 MHz "HOSC" oscillator, divided down internally to 32 KHz. The F1C100 series watchdog however uses the unchanged 32 KHz "LOSC" as its only clock input, which has the same effect, but let's the binding description mismatch. Change the binding description to name the clocks more loosely, so both the LOSC and divided HOSC match the description. As the fixed clock names now make less sense, drop them from SoCs supporting just one clock input, they were not used by any DT anyway. For the newer SoCs, supporting a choice of two input clocks, we keep both the description and clock-names requirement. 
Signed-off-by: Andre Przywara Reviewed-by: Rob Herring Reviewed-by: Samuel Holland Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20220317162349.739636-3-andre.przywara@arm.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- .../watchdog/allwinner,sun4i-a10-wdt.yaml | 20 ++++++++----------- 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/Documentation/devicetree/bindings/watchdog/allwinner,sun4i-a10-wdt.yaml b/Documentation/devicetree/bindings/watchdog/allwinner,sun4i-a10-wdt.yaml index 7a26cde0afdd..cbcf19f51411 100644 --- a/Documentation/devicetree/bindings/watchdog/allwinner,sun4i-a10-wdt.yaml +++ b/Documentation/devicetree/bindings/watchdog/allwinner,sun4i-a10-wdt.yaml @@ -39,14 +39,8 @@ properties: clocks: minItems: 1 items: - - description: High-frequency oscillator input, divided internally - - description: Low-frequency oscillator input, only found on some variants - - clock-names: - minItems: 1 - items: - - const: hosc - - const: losc + - description: 32 KHz input clock + - description: secondary clock source interrupts: maxItems: 1 @@ -71,9 +65,14 @@ then: properties: clocks: minItems: 2 + items: + - description: High-frequency oscillator input, divided internally + - description: Low-frequency oscillator input clock-names: - minItems: 2 + items: + - const: hosc + - const: losc required: - clock-names @@ -83,9 +82,6 @@ else: clocks: maxItems: 1 - clock-names: - maxItems: 1 - unevaluatedProperties: false examples: From 23b5c7961f7597ba063969dfd79da3f4e19c792f Mon Sep 17 00:00:00 2001 From: Rebecca Mckeever Date: Sat, 30 Apr 2022 17:49:36 -0500 Subject: [PATCH 095/334] memblock tests: update style of comments for memblock_add_*() functions Update comments in memblock_add_*() functions to match the style used in tests/alloc_*.c by rewording to make the expected outcome more apparent and, if more than one memblock is involved, adding a visual of the memory blocks. Reviewed-by: David Hildenbrand Signed-off-by: Rebecca Mckeever Signed-off-by: Mike Rapoport --- tools/testing/memblock/tests/basic_api.c | 85 ++++++++++++++++-------- 1 file changed, 59 insertions(+), 26 deletions(-) diff --git a/tools/testing/memblock/tests/basic_api.c b/tools/testing/memblock/tests/basic_api.c index fbc1ce160303..ab8d71b1e24b 100644 --- a/tools/testing/memblock/tests/basic_api.c +++ b/tools/testing/memblock/tests/basic_api.c @@ -26,8 +26,8 @@ static int memblock_initialization_check(void) /* * A simple test that adds a memory block of a specified base address * and size to the collection of available memory regions (memblock.memory). - * It checks if a new entry was created and if region counter and total memory - * were correctly updated. + * Expect to create a new entry. The region counter and total memory get + * updated. */ static int memblock_add_simple_check(void) { @@ -53,10 +53,10 @@ static int memblock_add_simple_check(void) } /* - * A simple test that adds a memory block of a specified base address, size + * A simple test that adds a memory block of a specified base address, size, * NUMA node and memory flags to the collection of available memory regions. - * It checks if the new entry, region counter and total memory size have - * expected values. + * Expect to create a new entry. The region counter and total memory get + * updated. */ static int memblock_add_node_simple_check(void) { @@ -87,9 +87,15 @@ static int memblock_add_node_simple_check(void) /* * A test that tries to add two memory blocks that don't overlap with one - * another. 
It checks if two correctly initialized entries were added to the - * collection of available memory regions (memblock.memory) and if this - * change was reflected in memblock.memory's total size and region counter. + * another: + * + * | +--------+ +--------+ | + * | | r1 | | r2 | | + * +--------+--------+--------+--------+--+ + * + * Expect to add two correctly initialized entries to the collection of + * available memory regions (memblock.memory). The total size and + * region counter fields get updated. */ static int memblock_add_disjoint_check(void) { @@ -124,11 +130,21 @@ static int memblock_add_disjoint_check(void) } /* - * A test that tries to add two memory blocks, where the second one overlaps - * with the beginning of the first entry (that is r1.base < r2.base + r2.size). - * After this, it checks if two entries are merged into one region that starts - * at r2.base and has size of two regions minus their intersection. It also - * verifies the reported total size of the available memory and region counter. + * A test that tries to add two memory blocks r1 and r2, where r2 overlaps + * with the beginning of r1 (that is r1.base < r2.base + r2.size): + * + * | +----+----+------------+ | + * | | |r2 | r1 | | + * +----+----+----+------------+----------+ + * ^ ^ + * | | + * | r1.base + * | + * r2.base + * + * Expect to merge the two entries into one region that starts at r2.base + * and has size of two regions minus their intersection. The total size of + * the available memory is updated, and the region counter stays the same. */ static int memblock_add_overlap_top_check(void) { @@ -162,12 +178,21 @@ static int memblock_add_overlap_top_check(void) } /* - * A test that tries to add two memory blocks, where the second one overlaps - * with the end of the first entry (that is r2.base < r1.base + r1.size). - * After this, it checks if two entries are merged into one region that starts - * at r1.base and has size of two regions minus their intersection. It verifies - * that memblock can still see only one entry and has a correct total size of - * the available memory. + * A test that tries to add two memory blocks r1 and r2, where r2 overlaps + * with the end of r1 (that is r2.base < r1.base + r1.size): + * + * | +--+------+----------+ | + * | | | r1 | r2 | | + * +--+--+------+----------+--------------+ + * ^ ^ + * | | + * | r2.base + * | + * r1.base + * + * Expect to merge the two entries into one region that starts at r1.base + * and has size of two regions minus their intersection. The total size of + * the available memory is updated, and the region counter stays the same. */ static int memblock_add_overlap_bottom_check(void) { @@ -201,11 +226,19 @@ static int memblock_add_overlap_bottom_check(void) } /* - * A test that tries to add two memory blocks, where the second one is - * within the range of the first entry (that is r1.base < r2.base && - * r2.base + r2.size < r1.base + r1.size). It checks if two entries are merged - * into one region that stays the same. The counter and total size of available - * memory are expected to not be updated. + * A test that tries to add two memory blocks r1 and r2, where r2 is + * within the range of r1 (that is r1.base < r2.base && + * r2.base + r2.size < r1.base + r1.size): + * + * | +-------+--+-----------------------+ + * | | |r2| r1 | + * +---+-------+--+-----------------------+ + * ^ + * | + * r1.base + * + * Expect to merge two entries into one region that stays the same. + * The counter and total size of available memory are not updated. 
*/ static int memblock_add_within_check(void) { @@ -236,8 +269,8 @@ static int memblock_add_within_check(void) } /* - * A simple test that tries to add the same memory block twice. The counter - * and total size of available memory are expected to not be updated. + * A simple test that tries to add the same memory block twice. Expect + * the counter and total size of available memory to not be updated. */ static int memblock_add_twice_check(void) { From e4f76c8d217e6b8c689ab31e0546888c2df016e4 Mon Sep 17 00:00:00 2001 From: Rebecca Mckeever Date: Sat, 30 Apr 2022 17:49:37 -0500 Subject: [PATCH 096/334] memblock tests: update style of comments for memblock_reserve_*() functions Update comments in memblock_reserve_*() functions to match the style used in tests/alloc_*.c by rewording to make the expected outcome more apparent and, if more than one memblock is involved, adding a visual of the memory blocks. If the comment has an extra column of spaces, remove the extra space at the beginning of each line for consistency and to conform to Linux kernel coding style. Reviewed-by: David Hildenbrand Signed-off-by: Rebecca Mckeever Signed-off-by: Mike Rapoport --- tools/testing/memblock/tests/basic_api.c | 94 ++++++++++++++++-------- 1 file changed, 63 insertions(+), 31 deletions(-) diff --git a/tools/testing/memblock/tests/basic_api.c b/tools/testing/memblock/tests/basic_api.c index ab8d71b1e24b..6054b83962ca 100644 --- a/tools/testing/memblock/tests/basic_api.c +++ b/tools/testing/memblock/tests/basic_api.c @@ -303,12 +303,12 @@ static int memblock_add_checks(void) return 0; } - /* - * A simple test that marks a memory block of a specified base address - * and size as reserved and to the collection of reserved memory regions - * (memblock.reserved). It checks if a new entry was created and if region - * counter and total memory size were correctly updated. - */ +/* + * A simple test that marks a memory block of a specified base address + * and size as reserved and to the collection of reserved memory regions + * (memblock.reserved). Expect to create a new entry. The region counter + * and total memory size are updated. + */ static int memblock_reserve_simple_check(void) { struct memblock_region *rgn; @@ -330,10 +330,15 @@ static int memblock_reserve_simple_check(void) } /* - * A test that tries to mark two memory blocks that don't overlap as reserved - * and checks if two entries were correctly added to the collection of reserved - * memory regions (memblock.reserved) and if this change was reflected in - * memblock.reserved's total size and region counter. + * A test that tries to mark two memory blocks that don't overlap as reserved: + * + * | +--+ +----------------+ | + * | |r1| | r2 | | + * +--------+--+------+----------------+--+ + * + * Expect to add two entries to the collection of reserved memory regions + * (memblock.reserved). The total size and region counter for + * memblock.reserved are updated. */ static int memblock_reserve_disjoint_check(void) { @@ -368,13 +373,22 @@ static int memblock_reserve_disjoint_check(void) } /* - * A test that tries to mark two memory blocks as reserved, where the - * second one overlaps with the beginning of the first (that is - * r1.base < r2.base + r2.size). - * It checks if two entries are merged into one region that starts at r2.base - * and has size of two regions minus their intersection. The test also verifies - * that memblock can still see only one entry and has a correct total size of - * the reserved memory. 
+ * A test that tries to mark two memory blocks r1 and r2 as reserved, + * where r2 overlaps with the beginning of r1 (that is + * r1.base < r2.base + r2.size): + * + * | +--------------+--+--------------+ | + * | | r2 | | r1 | | + * +--+--------------+--+--------------+--+ + * ^ ^ + * | | + * | r1.base + * | + * r2.base + * + * Expect to merge two entries into one region that starts at r2.base and + * has size of two regions minus their intersection. The total size of the + * reserved memory is updated, and the region counter is not updated. */ static int memblock_reserve_overlap_top_check(void) { @@ -408,13 +422,22 @@ static int memblock_reserve_overlap_top_check(void) } /* - * A test that tries to mark two memory blocks as reserved, where the - * second one overlaps with the end of the first entry (that is - * r2.base < r1.base + r1.size). - * It checks if two entries are merged into one region that starts at r1.base - * and has size of two regions minus their intersection. It verifies that - * memblock can still see only one entry and has a correct total size of the - * reserved memory. + * A test that tries to mark two memory blocks r1 and r2 as reserved, + * where r2 overlaps with the end of r1 (that is + * r2.base < r1.base + r1.size): + * + * | +--------------+--+--------------+ | + * | | r1 | | r2 | | + * +--+--------------+--+--------------+--+ + * ^ ^ + * | | + * | r2.base + * | + * r1.base + * + * Expect to merge two entries into one region that starts at r1.base and + * has size of two regions minus their intersection. The total size of the + * reserved memory is updated, and the region counter is not updated. */ static int memblock_reserve_overlap_bottom_check(void) { @@ -448,12 +471,21 @@ static int memblock_reserve_overlap_bottom_check(void) } /* - * A test that tries to mark two memory blocks as reserved, where the second - * one is within the range of the first entry (that is - * (r1.base < r2.base) && (r2.base + r2.size < r1.base + r1.size)). - * It checks if two entries are merged into one region that stays the - * same. The counter and total size of available memory are expected to not be - * updated. + * A test that tries to mark two memory blocks r1 and r2 as reserved, + * where r2 is within the range of r1 (that is + * (r1.base < r2.base) && (r2.base + r2.size < r1.base + r1.size)): + * + * | +-----+--+---------------------------| + * | | |r2| r1 | + * +-+-----+--+---------------------------+ + * ^ ^ + * | | + * | r2.base + * | + * r1.base + * + * Expect to merge two entries into one region that stays the same. The + * counter and total size of available memory are not updated. */ static int memblock_reserve_within_check(void) { @@ -485,7 +517,7 @@ static int memblock_reserve_within_check(void) /* * A simple test that tries to reserve the same memory block twice. - * The region counter and total size of reserved memory are expected to not + * Expect the region counter and total size of reserved memory to not * be updated. */ static int memblock_reserve_twice_check(void) From 60bba7b193cc8241b464b4616cfece9353086eeb Mon Sep 17 00:00:00 2001 From: Rebecca Mckeever Date: Sat, 30 Apr 2022 17:49:38 -0500 Subject: [PATCH 097/334] memblock tests: update style of comments for memblock_remove_*() functions Update comments in memblock_remove_*() functions to match the style used in tests/alloc_*.c by rewording to make the expected outcome more apparent and, if more than one memblock is involved, adding a visual of the memory blocks. 
If the comment has an extra column of spaces, remove the extra space at the beginning of each line for consistency and to conform to Linux kernel coding style. Signed-off-by: Rebecca Mckeever Reviewed-by: David Hildenbrand Signed-off-by: Mike Rapoport --- tools/testing/memblock/tests/basic_api.c | 111 +++++++++++++++++------ 1 file changed, 81 insertions(+), 30 deletions(-) diff --git a/tools/testing/memblock/tests/basic_api.c b/tools/testing/memblock/tests/basic_api.c index 6054b83962ca..37b7cc075b0f 100644 --- a/tools/testing/memblock/tests/basic_api.c +++ b/tools/testing/memblock/tests/basic_api.c @@ -550,14 +550,22 @@ static int memblock_reserve_checks(void) return 0; } - /* - * A simple test that tries to remove the first entry of the array of - * available memory regions. By "removing" a region we mean overwriting it - * with the next region in memblock.memory. To check this is the case, the - * test adds two memory blocks and verifies that the value of the latter - * was used to erase r1 region. It also checks if the region counter and - * total size were updated to expected values. - */ +/* + * A simple test that tries to remove a region r1 from the array of + * available memory regions. By "removing" a region we mean overwriting it + * with the next region r2 in memblock.memory: + * + * | ...... +----------------+ | + * | : r1 : | r2 | | + * +--+----+----------+----------------+--+ + * ^ + * | + * rgn.base + * + * Expect to add two memory blocks r1 and r2 and then remove r1 so that + * r2 is the first available region. The region counter and total size + * are updated. + */ static int memblock_remove_simple_check(void) { struct memblock_region *rgn; @@ -587,11 +595,22 @@ static int memblock_remove_simple_check(void) return 0; } - /* - * A test that tries to remove a region that was not registered as available - * memory (i.e. has no corresponding entry in memblock.memory). It verifies - * that array, regions counter and total size were not modified. - */ +/* + * A test that tries to remove a region r2 that was not registered as + * available memory (i.e. has no corresponding entry in memblock.memory): + * + * +----------------+ + * | r2 | + * +----------------+ + * | +----+ | + * | | r1 | | + * +--+----+------------------------------+ + * ^ + * | + * rgn.base + * + * Expect the array, regions counter and total size to not be modified. + */ static int memblock_remove_absent_check(void) { struct memblock_region *rgn; @@ -621,11 +640,23 @@ static int memblock_remove_absent_check(void) } /* - * A test that tries to remove a region which overlaps with the beginning of - * the already existing entry r1 (that is r1.base < r2.base + r2.size). It - * checks if only the intersection of both regions is removed from the available - * memory pool. The test also checks if the regions counter and total size are - * updated to expected values. + * A test that tries to remove a region r2 that overlaps with the + * beginning of the already existing entry r1 + * (that is r1.base < r2.base + r2.size): + * + * +-----------------+ + * | r2 | + * +-----------------+ + * | .........+--------+ | + * | : r1 | rgn | | + * +-----------------+--------+--------+--+ + * ^ ^ + * | | + * | rgn.base + * r1.base + * + * Expect that only the intersection of both regions is removed from the + * available memory pool. The regions counter and total size are updated. 
*/ static int memblock_remove_overlap_top_check(void) { @@ -661,11 +692,21 @@ static int memblock_remove_overlap_top_check(void) } /* - * A test that tries to remove a region which overlaps with the end of the - * first entry (that is r2.base < r1.base + r1.size). It checks if only the - * intersection of both regions is removed from the available memory pool. - * The test also checks if the regions counter and total size are updated to - * expected values. + * A test that tries to remove a region r2 that overlaps with the end of + * the already existing region r1 (that is r2.base < r1.base + r1.size): + * + * +--------------------------------+ + * | r2 | + * +--------------------------------+ + * | +---+..... | + * | |rgn| r1 : | + * +-+---+----+---------------------------+ + * ^ + * | + * r1.base + * + * Expect that only the intersection of both regions is removed from the + * available memory pool. The regions counter and total size are updated. */ static int memblock_remove_overlap_bottom_check(void) { @@ -698,13 +739,23 @@ static int memblock_remove_overlap_bottom_check(void) } /* - * A test that tries to remove a region which is within the range of the - * already existing entry (that is - * (r1.base < r2.base) && (r2.base + r2.size < r1.base + r1.size)). - * It checks if the region is split into two - one that ends at r2.base and - * second that starts at r2.base + size, with appropriate sizes. The test - * also checks if the region counter and total size were updated to - * expected values. + * A test that tries to remove a region r2 that is within the range of + * the already existing entry r1 (that is + * (r1.base < r2.base) && (r2.base + r2.size < r1.base + r1.size)): + * + * +----+ + * | r2 | + * +----+ + * | +-------------+....+---------------+ | + * | | rgn1 | r1 | rgn2 | | + * +-+-------------+----+---------------+-+ + * ^ + * | + * r1.base + * + * Expect that the region is split into two - one that ends at r2.base and + * another that starts at r2.base + r2.size, with appropriate sizes. The + * region counter and total size are updated. */ static int memblock_remove_within_check(void) { From a5550c053f6cf9993119f5c82ffc25ea80364b64 Mon Sep 17 00:00:00 2001 From: Rebecca Mckeever Date: Sat, 30 Apr 2022 17:49:39 -0500 Subject: [PATCH 098/334] memblock tests: update style of comments for memblock_free_*() functions Update comments in memblock_free_*() functions to match the style used in tests/alloc_*.c by rewording to make the expected outcome more apparent and, if more than one memblock is involved, adding a visual of the memory blocks. If the comment has an extra column of spaces, remove the extra space at the beginning of each line for consistency and to conform to Linux kernel coding style. Signed-off-by: Rebecca Mckeever Reviewed-by: David Hildenbrand Signed-off-by: Mike Rapoport --- tools/testing/memblock/tests/basic_api.c | 102 +++++++++++++++++------ 1 file changed, 75 insertions(+), 27 deletions(-) diff --git a/tools/testing/memblock/tests/basic_api.c b/tools/testing/memblock/tests/basic_api.c index 37b7cc075b0f..a7bc180316d6 100644 --- a/tools/testing/memblock/tests/basic_api.c +++ b/tools/testing/memblock/tests/basic_api.c @@ -806,12 +806,19 @@ static int memblock_remove_checks(void) } /* - * A simple test that tries to free a memory block that was marked earlier - * as reserved. By "freeing" a region we mean overwriting it with the next - * entry in memblock.reserved. 
To check this is the case, the test reserves - * two memory regions and verifies that the value of the latter was used to - * erase r1 region. - * The test also checks if the region counter and total size were updated. + * A simple test that tries to free a memory block r1 that was marked + * earlier as reserved. By "freeing" a region we mean overwriting it with + * the next entry r2 in memblock.reserved: + * + * | ...... +----+ | + * | : r1 : | r2 | | + * +--------------+----+-----------+----+-+ + * ^ + * | + * rgn.base + * + * Expect to reserve two memory regions and then erase r1 region with the + * value of r2. The region counter and total size are updated. */ static int memblock_free_simple_check(void) { @@ -842,11 +849,22 @@ static int memblock_free_simple_check(void) return 0; } - /* - * A test that tries to free a region that was not marked as reserved - * (i.e. has no corresponding entry in memblock.reserved). It verifies - * that array, regions counter and total size were not modified. - */ +/* + * A test that tries to free a region r2 that was not marked as reserved + * (i.e. has no corresponding entry in memblock.reserved): + * + * +----------------+ + * | r2 | + * +----------------+ + * | +----+ | + * | | r1 | | + * +--+----+------------------------------+ + * ^ + * | + * rgn.base + * + * The array, regions counter and total size are not modified. + */ static int memblock_free_absent_check(void) { struct memblock_region *rgn; @@ -876,11 +894,23 @@ static int memblock_free_absent_check(void) } /* - * A test that tries to free a region which overlaps with the beginning of - * the already existing entry r1 (that is r1.base < r2.base + r2.size). It - * checks if only the intersection of both regions is freed. The test also - * checks if the regions counter and total size are updated to expected - * values. + * A test that tries to free a region r2 that overlaps with the beginning + * of the already existing entry r1 (that is r1.base < r2.base + r2.size): + * + * +----+ + * | r2 | + * +----+ + * | ...+--------------+ | + * | : | r1 | | + * +----+--+--------------+---------------+ + * ^ ^ + * | | + * | rgn.base + * | + * r1.base + * + * Expect that only the intersection of both regions is freed. The + * regions counter and total size are updated. */ static int memblock_free_overlap_top_check(void) { @@ -914,10 +944,18 @@ static int memblock_free_overlap_top_check(void) } /* - * A test that tries to free a region which overlaps with the end of the - * first entry (that is r2.base < r1.base + r1.size). It checks if only the - * intersection of both regions is freed. The test also checks if the - * regions counter and total size are updated to expected values. + * A test that tries to free a region r2 that overlaps with the end of + * the already existing entry r1 (that is r2.base < r1.base + r1.size): + * + * +----------------+ + * | r2 | + * +----------------+ + * | +-----------+..... | + * | | r1 | : | + * +----+-----------+----+----------------+ + * + * Expect that only the intersection of both regions is freed. The + * regions counter and total size are updated. */ static int memblock_free_overlap_bottom_check(void) { @@ -951,13 +989,23 @@ static int memblock_free_overlap_bottom_check(void) } /* - * A test that tries to free a region which is within the range of the - * already existing entry (that is - * (r1.base < r2.base) && (r2.base + r2.size < r1.base + r1.size)). 
- * It checks if the region is split into two - one that ends at r2.base and - * second that starts at r2.base + size, with appropriate sizes. It is - * expected that the region counter and total size fields were updated t - * reflect that change. + * A test that tries to free a region r2 that is within the range of the + * already existing entry r1 (that is + * (r1.base < r2.base) && (r2.base + r2.size < r1.base + r1.size)): + * + * +----+ + * | r2 | + * +----+ + * | +------------+....+---------------+ + * | | rgn1 | r1 | rgn2 | + * +----+------------+----+---------------+ + * ^ + * | + * r1.base + * + * Expect that the region is split into two - one that ends at r2.base and + * another that starts at r2.base + r2.size, with appropriate sizes. The + * region counter and total size fields are updated. */ static int memblock_free_within_check(void) { From 000605cd1b14f0970465a44bfe89da93cca66348 Mon Sep 17 00:00:00 2001 From: Rebecca Mckeever Date: Sat, 30 Apr 2022 17:49:40 -0500 Subject: [PATCH 099/334] memblock tests: remove completed TODO item Remove completed item from TODO list. Signed-off-by: Rebecca Mckeever Reviewed-by: David Hildenbrand Signed-off-by: Mike Rapoport --- tools/testing/memblock/TODO | 3 --- 1 file changed, 3 deletions(-) diff --git a/tools/testing/memblock/TODO b/tools/testing/memblock/TODO index c25b2fdec45e..cd1a30d5acc9 100644 --- a/tools/testing/memblock/TODO +++ b/tools/testing/memblock/TODO @@ -23,6 +23,3 @@ TODO 5. Add tests for memblock_alloc_node() to check if the correct NUMA node is set for the new region - -6. Update comments in tests/basic_api.c to match the style used in - tests/alloc_*.c From fcb24583509f843b1b4025761552dd64aa9f8b96 Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Fri, 29 Apr 2022 08:53:45 +0800 Subject: [PATCH 100/334] dt-bindings: remoteproc: imx_rproc: Support i.MX93 Add i.MX93 remote processor(Cortex-M33) compatible string, and reorder the strings in alphabetical order. Signed-off-by: Peng Fan Acked-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20220429005346.2108279-2-peng.fan@oss.nxp.com Signed-off-by: Mathieu Poirier --- .../devicetree/bindings/remoteproc/fsl,imx-rproc.yaml | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/Documentation/devicetree/bindings/remoteproc/fsl,imx-rproc.yaml b/Documentation/devicetree/bindings/remoteproc/fsl,imx-rproc.yaml index fc16d903353e..3a1f59ad79e2 100644 --- a/Documentation/devicetree/bindings/remoteproc/fsl,imx-rproc.yaml +++ b/Documentation/devicetree/bindings/remoteproc/fsl,imx-rproc.yaml @@ -15,14 +15,15 @@ maintainers: properties: compatible: enum: - - fsl,imx8mq-cm4 + - fsl,imx6sx-cm4 + - fsl,imx7d-cm4 + - fsl,imx7ulp-cm4 - fsl,imx8mm-cm4 - fsl,imx8mn-cm7 - fsl,imx8mp-cm7 + - fsl,imx8mq-cm4 - fsl,imx8ulp-cm33 - - fsl,imx7d-cm4 - - fsl,imx7ulp-cm4 - - fsl,imx6sx-cm4 + - fsl,imx93-cm33 clocks: maxItems: 1 From 9222fabf0e39d281de65e908b94f4331eab556a2 Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Fri, 29 Apr 2022 08:53:46 +0800 Subject: [PATCH 101/334] remoteproc: imx_rproc: Support i.MX93 i.MX93 features a Cortex-M33 core which could be kicked by ROM/Bootloader /Linux. Similar with i.MX8MN/P, we use SMC to trap into Arm Trusted Firmware to start/stop the M33 core. 
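To make that control path a little more concrete: the fragment below is only a rough sketch of what such an SMC-based start could look like. Only arm_smccc_smc() is a real kernel helper here; the SIP function identifiers (IMX_SIP_RPROC, IMX_SIP_RPROC_START, IMX_SIP_RPROC_STOP) and their values are assumptions made for illustration and are not taken from this patch.

/* Hedged sketch: the SIP IDs below are assumed values, for illustration only. */
#include <linux/arm-smccc.h>
#include <linux/errno.h>

#define IMX_SIP_RPROC		0xC2000005	/* assumed SIP service ID */
#define IMX_SIP_RPROC_START	0x00		/* assumed "start core" sub-id */
#define IMX_SIP_RPROC_STOP	0x02		/* assumed "stop core" sub-id */

static int imx93_m33_start_sketch(void)
{
	struct arm_smccc_res res;

	/* Trap into Arm Trusted Firmware, which kicks the Cortex-M33 core. */
	arm_smccc_smc(IMX_SIP_RPROC, IMX_SIP_RPROC_START, 0, 0, 0, 0, 0, 0, &res);

	return res.a0 ? -EACCES : 0;
}

Stopping the core would be the same call with the stop sub-id; the point is simply that start/stop never pokes the M33 directly but always goes through the secure firmware.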
Signed-off-by: Peng Fan Link: https://lore.kernel.org/r/20220429005346.2108279-3-peng.fan@oss.nxp.com Signed-off-by: Mathieu Poirier --- drivers/remoteproc/imx_rproc.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/drivers/remoteproc/imx_rproc.c b/drivers/remoteproc/imx_rproc.c index 91eb037089ef..4a3352821b1d 100644 --- a/drivers/remoteproc/imx_rproc.c +++ b/drivers/remoteproc/imx_rproc.c @@ -91,6 +91,32 @@ struct imx_rproc { void __iomem *rsc_table; }; +static const struct imx_rproc_att imx_rproc_att_imx93[] = { + /* dev addr , sys addr , size , flags */ + /* TCM CODE NON-SECURE */ + { 0x0FFC0000, 0x201C0000, 0x00020000, ATT_OWN | ATT_IOMEM }, + { 0x0FFE0000, 0x201E0000, 0x00020000, ATT_OWN | ATT_IOMEM }, + + /* TCM CODE SECURE */ + { 0x1FFC0000, 0x201C0000, 0x00020000, ATT_OWN | ATT_IOMEM }, + { 0x1FFE0000, 0x201E0000, 0x00020000, ATT_OWN | ATT_IOMEM }, + + /* TCM SYS NON-SECURE*/ + { 0x20000000, 0x20200000, 0x00020000, ATT_OWN | ATT_IOMEM }, + { 0x20020000, 0x20220000, 0x00020000, ATT_OWN | ATT_IOMEM }, + + /* TCM SYS SECURE*/ + { 0x30000000, 0x20200000, 0x00020000, ATT_OWN | ATT_IOMEM }, + { 0x30020000, 0x20220000, 0x00020000, ATT_OWN | ATT_IOMEM }, + + /* DDR */ + { 0x80000000, 0x80000000, 0x10000000, 0 }, + { 0x90000000, 0x80000000, 0x10000000, 0 }, + + { 0xC0000000, 0xa0000000, 0x10000000, 0 }, + { 0xD0000000, 0xa0000000, 0x10000000, 0 }, +}; + static const struct imx_rproc_att imx_rproc_att_imx8mn[] = { /* dev addr , sys addr , size , flags */ /* ITCM */ @@ -261,6 +287,12 @@ static const struct imx_rproc_dcfg imx_rproc_cfg_imx6sx = { .method = IMX_RPROC_MMIO, }; +static const struct imx_rproc_dcfg imx_rproc_cfg_imx93 = { + .att = imx_rproc_att_imx93, + .att_size = ARRAY_SIZE(imx_rproc_att_imx93), + .method = IMX_RPROC_SMC, +}; + static int imx_rproc_start(struct rproc *rproc) { struct imx_rproc *priv = rproc->priv; @@ -824,6 +856,7 @@ static const struct of_device_id imx_rproc_of_match[] = { { .compatible = "fsl,imx8mn-cm7", .data = &imx_rproc_cfg_imx8mn }, { .compatible = "fsl,imx8mp-cm7", .data = &imx_rproc_cfg_imx8mn }, { .compatible = "fsl,imx8ulp-cm33", .data = &imx_rproc_cfg_imx8ulp }, + { .compatible = "fsl,imx93-cm33", .data = &imx_rproc_cfg_imx93 }, {}, }; MODULE_DEVICE_TABLE(of, imx_rproc_of_match); From be1de12cb6738929b711094e228da439492a4b91 Mon Sep 17 00:00:00 2001 From: Arnaud Pouliquen Date: Thu, 5 May 2022 13:36:39 +0200 Subject: [PATCH 102/334] dt-bindings: remoteproc: st,stm32-rproc: Fix phandle-array parameters description Replace the FIXME by appropriate description. 
Fixes: 39bd2b6a3783 ("dt-bindings: Improve phandle-array schemas") Signed-off-by: Arnaud Pouliquen Acked-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20220505113639.1344281-1-arnaud.pouliquen@foss.st.com Signed-off-by: Mathieu Poirier --- .../bindings/remoteproc/st,stm32-rproc.yaml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/Documentation/devicetree/bindings/remoteproc/st,stm32-rproc.yaml b/Documentation/devicetree/bindings/remoteproc/st,stm32-rproc.yaml index be3d9b0e876b..da50f0e99fe2 100644 --- a/Documentation/devicetree/bindings/remoteproc/st,stm32-rproc.yaml +++ b/Documentation/devicetree/bindings/remoteproc/st,stm32-rproc.yaml @@ -43,8 +43,8 @@ properties: items: - items: - description: Phandle of syscon block - - description: FIXME - - description: FIXME + - description: The offset of the trust zone setting register + - description: The field mask of the trust zone state interrupts: description: Should contain the WWDG1 watchdog reset interrupt @@ -101,8 +101,8 @@ properties: items: - items: - description: Phandle of syscon block - - description: FIXME - - description: FIXME + - description: The offset of the power setting register + - description: The field mask of the PDDS selection st,syscfg-m4-state: $ref: "/schemas/types.yaml#/definitions/phandle-array" @@ -111,8 +111,8 @@ properties: items: - items: - description: Phandle of syscon block with the tamp register - - description: FIXME - - description: FIXME + - description: The offset of the tamp register + - description: The field mask of the Cortex-M4 state st,syscfg-rsc-tbl: $ref: "/schemas/types.yaml#/definitions/phandle-array" @@ -122,8 +122,8 @@ properties: items: - items: - description: Phandle of syscon block with the tamp register - - description: FIXME - - description: FIXME + - description: The offset of the tamp register + - description: The field mask of the Cortex-M4 resource table address st,auto-boot: $ref: /schemas/types.yaml#/definitions/flag From 2880f47b949f1f49e2d861ffbba91d57416be7d9 Mon Sep 17 00:00:00 2001 From: Weichao Guo Date: Sat, 7 May 2022 00:28:14 +0800 Subject: [PATCH 103/334] f2fs: skip GC if possible when checkpoint disabling If the number of unusable blocks is not larger than unusable capacity, we can skip GC when checkpoint disabling. 
Signed-off-by: Weichao Guo Signed-off-by: Chao Yu [Jaegeuk Kim: Fix missing gc_mode assignment] Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index d06a577a1208..f0cd454d8f88 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2058,7 +2058,7 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) { unsigned int s_flags = sbi->sb->s_flags; struct cp_control cpc; - unsigned int gc_mode; + unsigned int gc_mode = sbi->gc_mode; int err = 0; int ret; block_t unusable; @@ -2069,9 +2069,13 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) } sbi->sb->s_flags |= SB_ACTIVE; + /* check if we need more GC first */ + unusable = f2fs_get_unusable_blocks(sbi); + if (!f2fs_disable_cp_again(sbi, unusable)) + goto skip_gc; + f2fs_update_time(sbi, DISABLE_TIME); - gc_mode = sbi->gc_mode; sbi->gc_mode = GC_URGENT_HIGH; while (!f2fs_time_over(sbi, DISABLE_TIME)) { @@ -2097,6 +2101,7 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) goto restore_flag; } +skip_gc: f2fs_down_write(&sbi->gc_lock); cpc.reason = CP_PAUSE; set_sbi_flag(sbi, SBI_CP_DISABLED); From 2e42b7f817acd6e8d78226445eb6fe44fe79c12a Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 6 May 2022 14:17:32 -0700 Subject: [PATCH 104/334] f2fs: stop allocating pinned sections if EAGAIN happens EAGAIN doesn't guarantee to have a free section. Let's report it. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index e4cf8b7b23aa..b307d96a0a7c 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1685,7 +1685,7 @@ next_alloc: GET_SEC_FROM_SEG(sbi, overprovision_segments(sbi)))) { f2fs_down_write(&sbi->gc_lock); err = f2fs_gc(sbi, true, false, false, NULL_SEGNO); - if (err && err != -ENODATA && err != -EAGAIN) + if (err && err != -ENODATA) goto out_err; } From d21a580dafc69aa04f46e6099616146a536b0724 Mon Sep 17 00:00:00 2001 From: Xin Xiong Date: Fri, 29 Apr 2022 16:11:22 +0800 Subject: [PATCH 105/334] ksmbd: fix reference count leak in smb_check_perm_dacl() The issue happens in a specific path in smb_check_perm_dacl(). When "id" and "uid" have the same value, the function simply jumps out of the loop without decrementing the reference count of the object "posix_acls", which is increased by get_acl() earlier. This may result in memory leaks. Fix it by decreasing the reference count of "posix_acls" before jumping to label "check_access_bits". Fixes: 777cad1604d6 ("ksmbd: remove select FS_POSIX_ACL in Kconfig") Signed-off-by: Xin Xiong Signed-off-by: Xin Tan Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/ksmbd/smbacl.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ksmbd/smbacl.c b/fs/ksmbd/smbacl.c index 6ecf55ea1fed..38f23bf981ac 100644 --- a/fs/ksmbd/smbacl.c +++ b/fs/ksmbd/smbacl.c @@ -1261,6 +1261,7 @@ int smb_check_perm_dacl(struct ksmbd_conn *conn, struct path *path, if (!access_bits) access_bits = SET_MINIMUM_RIGHTS; + posix_acl_release(posix_acls); goto check_access_bits; } } From 158a66b245739e15858de42c0ba60fcf3de9b8e6 Mon Sep 17 00:00:00 2001 From: Marios Makassikis Date: Wed, 4 May 2022 15:40:10 +0200 Subject: [PATCH 106/334] ksmbd: validate length in smb2_write() The SMB2 Write packet contains data that is to be written to a file or to a pipe. Depending on the client, there may be padding between the header and the data field. 
Currently, the length is validated only in the case padding is present. Since the DataOffset field always points to the beginning of the data, there is no need to have a special case for padding. By removing this, the length is validated in both cases. Signed-off-by: Marios Makassikis Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/ksmbd/smb2pdu.c | 49 ++++++++++++++++++---------------------------- 1 file changed, 19 insertions(+), 30 deletions(-) diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index 16c803a9d996..2bdfe449b2da 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -6328,23 +6328,18 @@ static noinline int smb2_write_pipe(struct ksmbd_work *work) length = le32_to_cpu(req->Length); id = req->VolatileFileId; - if (le16_to_cpu(req->DataOffset) == - offsetof(struct smb2_write_req, Buffer)) { - data_buf = (char *)&req->Buffer[0]; - } else { - if ((u64)le16_to_cpu(req->DataOffset) + length > - get_rfc1002_len(work->request_buf)) { - pr_err("invalid write data offset %u, smb_len %u\n", - le16_to_cpu(req->DataOffset), - get_rfc1002_len(work->request_buf)); - err = -EINVAL; - goto out; - } - - data_buf = (char *)(((char *)&req->hdr.ProtocolId) + - le16_to_cpu(req->DataOffset)); + if ((u64)le16_to_cpu(req->DataOffset) + length > + get_rfc1002_len(work->request_buf)) { + pr_err("invalid write data offset %u, smb_len %u\n", + le16_to_cpu(req->DataOffset), + get_rfc1002_len(work->request_buf)); + err = -EINVAL; + goto out; } + data_buf = (char *)(((char *)&req->hdr.ProtocolId) + + le16_to_cpu(req->DataOffset)); + rpc_resp = ksmbd_rpc_write(work->sess, id, data_buf, length); if (rpc_resp) { if (rpc_resp->flags == KSMBD_RPC_ENOTIMPLEMENTED) { @@ -6489,22 +6484,16 @@ int smb2_write(struct ksmbd_work *work) if (req->Channel != SMB2_CHANNEL_RDMA_V1 && req->Channel != SMB2_CHANNEL_RDMA_V1_INVALIDATE) { - if (le16_to_cpu(req->DataOffset) == - offsetof(struct smb2_write_req, Buffer)) { - data_buf = (char *)&req->Buffer[0]; - } else { - if ((u64)le16_to_cpu(req->DataOffset) + length > - get_rfc1002_len(work->request_buf)) { - pr_err("invalid write data offset %u, smb_len %u\n", - le16_to_cpu(req->DataOffset), - get_rfc1002_len(work->request_buf)); - err = -EINVAL; - goto out; - } - - data_buf = (char *)(((char *)&req->hdr.ProtocolId) + - le16_to_cpu(req->DataOffset)); + if ((u64)le16_to_cpu(req->DataOffset) + length > + get_rfc1002_len(work->request_buf)) { + pr_err("invalid write data offset %u, smb_len %u\n", + le16_to_cpu(req->DataOffset), + get_rfc1002_len(work->request_buf)); + err = -EINVAL; + goto out; } + data_buf = (char *)(((char *)&req->hdr.ProtocolId) + + le16_to_cpu(req->DataOffset)); ksmbd_debug(SMB, "flags %u\n", le32_to_cpu(req->Flags)); if (le32_to_cpu(req->Flags) & SMB2_WRITEFLAG_WRITE_THROUGH) From 846e437387e74c44ddc9f3eeec472fd37ca3cdb9 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 10 May 2022 12:02:03 +0300 Subject: [PATCH 107/334] net/mlx5: Expose mlx5_sriov_blocking_notifier_register / unregister APIs Expose mlx5_sriov_blocking_notifier_register / unregister APIs to let a VF register to be notified for its enablement / disablement by the PF. Upon VF probe it will call mlx5_sriov_blocking_notifier_register() with its notifier block and upon VF remove it will call mlx5_sriov_blocking_notifier_unregister() to drop its registration. This can give a VF the ability to clean some resources upon disable before that the command interface goes down and on the other hand sets some stuff before that it's enabled. 
This may be used by a VF which is migration capable in few cases.(e.g. PF load/unload upon an health recovery). Link: https://lore.kernel.org/r/20220510090206.90374-2-yishaih@nvidia.com Signed-off-by: Yishai Hadas Signed-off-by: Saeed Mahameed Signed-off-by: Leon Romanovsky --- .../net/ethernet/mellanox/mlx5/core/sriov.c | 65 ++++++++++++++++++- include/linux/mlx5/driver.h | 12 ++++ 2 files changed, 76 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sriov.c b/drivers/net/ethernet/mellanox/mlx5/core/sriov.c index 887ee0f729d1..2935614f6fa9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/sriov.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/sriov.c @@ -87,6 +87,11 @@ static int mlx5_device_enable_sriov(struct mlx5_core_dev *dev, int num_vfs) enable_vfs_hca: num_msix_count = mlx5_get_default_msix_vec_count(dev, num_vfs); for (vf = 0; vf < num_vfs; vf++) { + /* Notify the VF before its enablement to let it set + * some stuff. + */ + blocking_notifier_call_chain(&sriov->vfs_ctx[vf].notifier, + MLX5_PF_NOTIFY_ENABLE_VF, dev); err = mlx5_core_enable_hca(dev, vf + 1); if (err) { mlx5_core_warn(dev, "failed to enable VF %d (%d)\n", vf, err); @@ -127,6 +132,11 @@ mlx5_device_disable_sriov(struct mlx5_core_dev *dev, int num_vfs, bool clear_vf) for (vf = num_vfs - 1; vf >= 0; vf--) { if (!sriov->vfs_ctx[vf].enabled) continue; + /* Notify the VF before its disablement to let it clean + * some resources. + */ + blocking_notifier_call_chain(&sriov->vfs_ctx[vf].notifier, + MLX5_PF_NOTIFY_DISABLE_VF, dev); err = mlx5_core_disable_hca(dev, vf + 1); if (err) { mlx5_core_warn(dev, "failed to disable VF %d\n", vf); @@ -257,7 +267,7 @@ int mlx5_sriov_init(struct mlx5_core_dev *dev) { struct mlx5_core_sriov *sriov = &dev->priv.sriov; struct pci_dev *pdev = dev->pdev; - int total_vfs; + int total_vfs, i; if (!mlx5_core_is_pf(dev)) return 0; @@ -269,6 +279,9 @@ int mlx5_sriov_init(struct mlx5_core_dev *dev) if (!sriov->vfs_ctx) return -ENOMEM; + for (i = 0; i < total_vfs; i++) + BLOCKING_INIT_NOTIFIER_HEAD(&sriov->vfs_ctx[i].notifier); + return 0; } @@ -281,3 +294,53 @@ void mlx5_sriov_cleanup(struct mlx5_core_dev *dev) kfree(sriov->vfs_ctx); } + +/** + * mlx5_sriov_blocking_notifier_unregister - Unregister a VF from + * a notification block chain. + * + * @mdev: The mlx5 core device. + * @vf_id: The VF id. + * @nb: The notifier block to be unregistered. + */ +void mlx5_sriov_blocking_notifier_unregister(struct mlx5_core_dev *mdev, + int vf_id, + struct notifier_block *nb) +{ + struct mlx5_vf_context *vfs_ctx; + struct mlx5_core_sriov *sriov; + + sriov = &mdev->priv.sriov; + if (WARN_ON(vf_id < 0 || vf_id >= sriov->num_vfs)) + return; + + vfs_ctx = &sriov->vfs_ctx[vf_id]; + blocking_notifier_chain_unregister(&vfs_ctx->notifier, nb); +} +EXPORT_SYMBOL(mlx5_sriov_blocking_notifier_unregister); + +/** + * mlx5_sriov_blocking_notifier_register - Register a VF notification + * block chain. + * + * @mdev: The mlx5 core device. + * @vf_id: The VF id. + * @nb: The notifier block to be called upon the VF events. + * + * Returns 0 on success or an error code. 
+ */ +int mlx5_sriov_blocking_notifier_register(struct mlx5_core_dev *mdev, + int vf_id, + struct notifier_block *nb) +{ + struct mlx5_vf_context *vfs_ctx; + struct mlx5_core_sriov *sriov; + + sriov = &mdev->priv.sriov; + if (vf_id < 0 || vf_id >= sriov->num_vfs) + return -EINVAL; + + vfs_ctx = &sriov->vfs_ctx[vf_id]; + return blocking_notifier_chain_register(&vfs_ctx->notifier, nb); +} +EXPORT_SYMBOL(mlx5_sriov_blocking_notifier_register); diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index ff47d49d8be4..6fac5427d899 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -445,6 +445,11 @@ struct mlx5_qp_table { struct radix_tree_root tree; }; +enum { + MLX5_PF_NOTIFY_DISABLE_VF, + MLX5_PF_NOTIFY_ENABLE_VF, +}; + struct mlx5_vf_context { int enabled; u64 port_guid; @@ -455,6 +460,7 @@ struct mlx5_vf_context { u8 port_guid_valid:1; u8 node_guid_valid:1; enum port_state_policy policy; + struct blocking_notifier_head notifier; }; struct mlx5_core_sriov { @@ -1152,6 +1158,12 @@ int mlx5_dm_sw_icm_dealloc(struct mlx5_core_dev *dev, enum mlx5_sw_icm_type type struct mlx5_core_dev *mlx5_vf_get_core_dev(struct pci_dev *pdev); void mlx5_vf_put_core_dev(struct mlx5_core_dev *mdev); +int mlx5_sriov_blocking_notifier_register(struct mlx5_core_dev *mdev, + int vf_id, + struct notifier_block *nb); +void mlx5_sriov_blocking_notifier_unregister(struct mlx5_core_dev *mdev, + int vf_id, + struct notifier_block *nb); #ifdef CONFIG_MLX5_CORE_IPOIB struct net_device *mlx5_rdma_netdev_alloc(struct mlx5_core_dev *mdev, struct ib_device *ibdev, From 61a2f1460fd03285ea34c1a235f2f50f71e13a1f Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 10 May 2022 12:02:04 +0300 Subject: [PATCH 108/334] vfio/mlx5: Manage the VF attach/detach callback from the PF Manage the VF attach/detach callback from the PF. This lets the driver enable parallel VF migration, as will be introduced in the next patch. As part of this, reorganize the VF-is-migratable code into a separate function and rename it to set_migratable() to match its functionality. 
Link: https://lore.kernel.org/r/20220510090206.90374-3-yishaih@nvidia.com Signed-off-by: Yishai Hadas Reviewed-by: Alex Williamson Signed-off-by: Leon Romanovsky --- drivers/vfio/pci/mlx5/cmd.c | 66 ++++++++++++++++++++++++++++++++++++ drivers/vfio/pci/mlx5/cmd.h | 22 ++++++++++++ drivers/vfio/pci/mlx5/main.c | 38 +++------------------ 3 files changed, 92 insertions(+), 34 deletions(-) diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index 5c9f9218cc1d..eddb7dedc047 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -71,6 +71,72 @@ end: return ret; } +static int mlx5fv_vf_event(struct notifier_block *nb, + unsigned long event, void *data) +{ + struct mlx5vf_pci_core_device *mvdev = + container_of(nb, struct mlx5vf_pci_core_device, nb); + + mutex_lock(&mvdev->state_mutex); + switch (event) { + case MLX5_PF_NOTIFY_ENABLE_VF: + mvdev->mdev_detach = false; + break; + case MLX5_PF_NOTIFY_DISABLE_VF: + mvdev->mdev_detach = true; + break; + default: + break; + } + mlx5vf_state_mutex_unlock(mvdev); + return 0; +} + +void mlx5vf_cmd_remove_migratable(struct mlx5vf_pci_core_device *mvdev) +{ + if (!mvdev->migrate_cap) + return; + + mlx5_sriov_blocking_notifier_unregister(mvdev->mdev, mvdev->vf_id, + &mvdev->nb); +} + +void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev) +{ + struct pci_dev *pdev = mvdev->core_device.pdev; + int ret; + + if (!pdev->is_virtfn) + return; + + mvdev->mdev = mlx5_vf_get_core_dev(pdev); + if (!mvdev->mdev) + return; + + if (!MLX5_CAP_GEN(mvdev->mdev, migration)) + goto end; + + mvdev->vf_id = pci_iov_vf_id(pdev); + if (mvdev->vf_id < 0) + goto end; + + mutex_init(&mvdev->state_mutex); + spin_lock_init(&mvdev->reset_lock); + mvdev->nb.notifier_call = mlx5fv_vf_event; + ret = mlx5_sriov_blocking_notifier_register(mvdev->mdev, mvdev->vf_id, + &mvdev->nb); + if (ret) + goto end; + + mvdev->migrate_cap = 1; + mvdev->core_device.vdev.migration_flags = + VFIO_MIGRATION_STOP_COPY | + VFIO_MIGRATION_P2P; + +end: + mlx5_vf_put_core_dev(mvdev->mdev); +} + int mlx5vf_cmd_get_vhca_id(struct pci_dev *pdev, u16 function_id, u16 *vhca_id) { struct mlx5_core_dev *mdev = mlx5_vf_get_core_dev(pdev); diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h index 1392a11a9cc0..dd4257b27d22 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -7,6 +7,7 @@ #define MLX5_VFIO_CMD_H #include +#include #include struct mlx5_vf_migration_file { @@ -24,13 +25,34 @@ struct mlx5_vf_migration_file { unsigned long last_offset; }; +struct mlx5vf_pci_core_device { + struct vfio_pci_core_device core_device; + int vf_id; + u16 vhca_id; + u8 migrate_cap:1; + u8 deferred_reset:1; + u8 mdev_detach:1; + /* protect migration state */ + struct mutex state_mutex; + enum vfio_device_mig_state mig_state; + /* protect the reset_done flow */ + spinlock_t reset_lock; + struct mlx5_vf_migration_file *resuming_migf; + struct mlx5_vf_migration_file *saving_migf; + struct notifier_block nb; + struct mlx5_core_dev *mdev; +}; + int mlx5vf_cmd_suspend_vhca(struct pci_dev *pdev, u16 vhca_id, u16 op_mod); int mlx5vf_cmd_resume_vhca(struct pci_dev *pdev, u16 vhca_id, u16 op_mod); int mlx5vf_cmd_query_vhca_migration_state(struct pci_dev *pdev, u16 vhca_id, size_t *state_size); int mlx5vf_cmd_get_vhca_id(struct pci_dev *pdev, u16 function_id, u16 *vhca_id); +void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev); +void mlx5vf_cmd_remove_migratable(struct mlx5vf_pci_core_device *mvdev); int 
mlx5vf_cmd_save_vhca_state(struct pci_dev *pdev, u16 vhca_id, struct mlx5_vf_migration_file *migf); int mlx5vf_cmd_load_vhca_state(struct pci_dev *pdev, u16 vhca_id, struct mlx5_vf_migration_file *migf); +void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev); #endif /* MLX5_VFIO_CMD_H */ diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index bbec5d288fee..26065487881f 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -17,7 +17,6 @@ #include #include #include -#include #include #include "cmd.h" @@ -25,20 +24,6 @@ /* Arbitrary to prevent userspace from consuming endless memory */ #define MAX_MIGRATION_SIZE (512*1024*1024) -struct mlx5vf_pci_core_device { - struct vfio_pci_core_device core_device; - u16 vhca_id; - u8 migrate_cap:1; - u8 deferred_reset:1; - /* protect migration state */ - struct mutex state_mutex; - enum vfio_device_mig_state mig_state; - /* protect the reset_done flow */ - spinlock_t reset_lock; - struct mlx5_vf_migration_file *resuming_migf; - struct mlx5_vf_migration_file *saving_migf; -}; - static struct page * mlx5vf_get_migration_page(struct mlx5_vf_migration_file *migf, unsigned long offset) @@ -444,7 +429,7 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev, * This function is called in all state_mutex unlock cases to * handle a 'deferred_reset' if exists. */ -static void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev) +void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev) { again: spin_lock(&mvdev->reset_lock); @@ -596,24 +581,7 @@ static int mlx5vf_pci_probe(struct pci_dev *pdev, if (!mvdev) return -ENOMEM; vfio_pci_core_init_device(&mvdev->core_device, pdev, &mlx5vf_pci_ops); - - if (pdev->is_virtfn) { - struct mlx5_core_dev *mdev = - mlx5_vf_get_core_dev(pdev); - - if (mdev) { - if (MLX5_CAP_GEN(mdev, migration)) { - mvdev->migrate_cap = 1; - mvdev->core_device.vdev.migration_flags = - VFIO_MIGRATION_STOP_COPY | - VFIO_MIGRATION_P2P; - mutex_init(&mvdev->state_mutex); - spin_lock_init(&mvdev->reset_lock); - } - mlx5_vf_put_core_dev(mdev); - } - } - + mlx5vf_cmd_set_migratable(mvdev); ret = vfio_pci_core_register_device(&mvdev->core_device); if (ret) goto out_free; @@ -622,6 +590,7 @@ static int mlx5vf_pci_probe(struct pci_dev *pdev, return 0; out_free: + mlx5vf_cmd_remove_migratable(mvdev); vfio_pci_core_uninit_device(&mvdev->core_device); kfree(mvdev); return ret; @@ -632,6 +601,7 @@ static void mlx5vf_pci_remove(struct pci_dev *pdev) struct mlx5vf_pci_core_device *mvdev = dev_get_drvdata(&pdev->dev); vfio_pci_core_unregister_device(&mvdev->core_device); + mlx5vf_cmd_remove_migratable(mvdev); vfio_pci_core_uninit_device(&mvdev->core_device); kfree(mvdev); } From 8580ad14f9396b03c4b9645b4cf0dd0085664562 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 10 May 2022 12:02:05 +0300 Subject: [PATCH 109/334] vfio/mlx5: Refactor to enable VFs migration in parallel Refactor to enable different VFs to run their commands over the PF command interface in parallel and to not block one each other. This is done by not using the global PF lock that was used before but relying on the VF attach/detach mechanism to sync. 
Link: https://lore.kernel.org/r/20220510090206.90374-4-yishaih@nvidia.com Signed-off-by: Yishai Hadas Reviewed-by: Alex Williamson Signed-off-by: Leon Romanovsky --- drivers/vfio/pci/mlx5/cmd.c | 103 +++++++++++++++-------------------- drivers/vfio/pci/mlx5/cmd.h | 11 ++-- drivers/vfio/pci/mlx5/main.c | 44 ++++----------- 3 files changed, 59 insertions(+), 99 deletions(-) diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index eddb7dedc047..2d6e323de268 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -5,70 +5,65 @@ #include "cmd.h" -int mlx5vf_cmd_suspend_vhca(struct pci_dev *pdev, u16 vhca_id, u16 op_mod) +static int mlx5vf_cmd_get_vhca_id(struct mlx5_core_dev *mdev, u16 function_id, + u16 *vhca_id); + +int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod) { - struct mlx5_core_dev *mdev = mlx5_vf_get_core_dev(pdev); u32 out[MLX5_ST_SZ_DW(suspend_vhca_out)] = {}; u32 in[MLX5_ST_SZ_DW(suspend_vhca_in)] = {}; - int ret; - if (!mdev) + lockdep_assert_held(&mvdev->state_mutex); + if (mvdev->mdev_detach) return -ENOTCONN; MLX5_SET(suspend_vhca_in, in, opcode, MLX5_CMD_OP_SUSPEND_VHCA); - MLX5_SET(suspend_vhca_in, in, vhca_id, vhca_id); + MLX5_SET(suspend_vhca_in, in, vhca_id, mvdev->vhca_id); MLX5_SET(suspend_vhca_in, in, op_mod, op_mod); - ret = mlx5_cmd_exec_inout(mdev, suspend_vhca, in, out); - mlx5_vf_put_core_dev(mdev); - return ret; + return mlx5_cmd_exec_inout(mvdev->mdev, suspend_vhca, in, out); } -int mlx5vf_cmd_resume_vhca(struct pci_dev *pdev, u16 vhca_id, u16 op_mod) +int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod) { - struct mlx5_core_dev *mdev = mlx5_vf_get_core_dev(pdev); u32 out[MLX5_ST_SZ_DW(resume_vhca_out)] = {}; u32 in[MLX5_ST_SZ_DW(resume_vhca_in)] = {}; - int ret; - if (!mdev) + lockdep_assert_held(&mvdev->state_mutex); + if (mvdev->mdev_detach) return -ENOTCONN; MLX5_SET(resume_vhca_in, in, opcode, MLX5_CMD_OP_RESUME_VHCA); - MLX5_SET(resume_vhca_in, in, vhca_id, vhca_id); + MLX5_SET(resume_vhca_in, in, vhca_id, mvdev->vhca_id); MLX5_SET(resume_vhca_in, in, op_mod, op_mod); - ret = mlx5_cmd_exec_inout(mdev, resume_vhca, in, out); - mlx5_vf_put_core_dev(mdev); - return ret; + return mlx5_cmd_exec_inout(mvdev->mdev, resume_vhca, in, out); } -int mlx5vf_cmd_query_vhca_migration_state(struct pci_dev *pdev, u16 vhca_id, +int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev, size_t *state_size) { - struct mlx5_core_dev *mdev = mlx5_vf_get_core_dev(pdev); u32 out[MLX5_ST_SZ_DW(query_vhca_migration_state_out)] = {}; u32 in[MLX5_ST_SZ_DW(query_vhca_migration_state_in)] = {}; int ret; - if (!mdev) + lockdep_assert_held(&mvdev->state_mutex); + if (mvdev->mdev_detach) return -ENOTCONN; MLX5_SET(query_vhca_migration_state_in, in, opcode, MLX5_CMD_OP_QUERY_VHCA_MIGRATION_STATE); - MLX5_SET(query_vhca_migration_state_in, in, vhca_id, vhca_id); + MLX5_SET(query_vhca_migration_state_in, in, vhca_id, mvdev->vhca_id); MLX5_SET(query_vhca_migration_state_in, in, op_mod, 0); - ret = mlx5_cmd_exec_inout(mdev, query_vhca_migration_state, in, out); + ret = mlx5_cmd_exec_inout(mvdev->mdev, query_vhca_migration_state, in, + out); if (ret) - goto end; + return ret; *state_size = MLX5_GET(query_vhca_migration_state_out, out, required_umem_size); - -end: - mlx5_vf_put_core_dev(mdev); - return ret; + return 0; } static int mlx5fv_vf_event(struct notifier_block *nb, @@ -120,6 +115,10 @@ void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev) if (mvdev->vf_id < 
0) goto end; + if (mlx5vf_cmd_get_vhca_id(mvdev->mdev, mvdev->vf_id + 1, + &mvdev->vhca_id)) + goto end; + mutex_init(&mvdev->state_mutex); spin_lock_init(&mvdev->reset_lock); mvdev->nb.notifier_call = mlx5fv_vf_event; @@ -137,23 +136,18 @@ end: mlx5_vf_put_core_dev(mvdev->mdev); } -int mlx5vf_cmd_get_vhca_id(struct pci_dev *pdev, u16 function_id, u16 *vhca_id) +static int mlx5vf_cmd_get_vhca_id(struct mlx5_core_dev *mdev, u16 function_id, + u16 *vhca_id) { - struct mlx5_core_dev *mdev = mlx5_vf_get_core_dev(pdev); u32 in[MLX5_ST_SZ_DW(query_hca_cap_in)] = {}; int out_size; void *out; int ret; - if (!mdev) - return -ENOTCONN; - out_size = MLX5_ST_SZ_BYTES(query_hca_cap_out); out = kzalloc(out_size, GFP_KERNEL); - if (!out) { - ret = -ENOMEM; - goto end; - } + if (!out) + return -ENOMEM; MLX5_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP); MLX5_SET(query_hca_cap_in, in, other_function, 1); @@ -171,8 +165,6 @@ int mlx5vf_cmd_get_vhca_id(struct pci_dev *pdev, u16 function_id, u16 *vhca_id) err_exec: kfree(out); -end: - mlx5_vf_put_core_dev(mdev); return ret; } @@ -217,21 +209,23 @@ static int _create_state_mkey(struct mlx5_core_dev *mdev, u32 pdn, return err; } -int mlx5vf_cmd_save_vhca_state(struct pci_dev *pdev, u16 vhca_id, +int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, struct mlx5_vf_migration_file *migf) { - struct mlx5_core_dev *mdev = mlx5_vf_get_core_dev(pdev); u32 out[MLX5_ST_SZ_DW(save_vhca_state_out)] = {}; u32 in[MLX5_ST_SZ_DW(save_vhca_state_in)] = {}; + struct mlx5_core_dev *mdev; u32 pdn, mkey; int err; - if (!mdev) + lockdep_assert_held(&mvdev->state_mutex); + if (mvdev->mdev_detach) return -ENOTCONN; + mdev = mvdev->mdev; err = mlx5_core_alloc_pd(mdev, &pdn); if (err) - goto end; + return err; err = dma_map_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE, 0); @@ -245,7 +239,7 @@ int mlx5vf_cmd_save_vhca_state(struct pci_dev *pdev, u16 vhca_id, MLX5_SET(save_vhca_state_in, in, opcode, MLX5_CMD_OP_SAVE_VHCA_STATE); MLX5_SET(save_vhca_state_in, in, op_mod, 0); - MLX5_SET(save_vhca_state_in, in, vhca_id, vhca_id); + MLX5_SET(save_vhca_state_in, in, vhca_id, mvdev->vhca_id); MLX5_SET(save_vhca_state_in, in, mkey, mkey); MLX5_SET(save_vhca_state_in, in, size, migf->total_length); @@ -253,37 +247,28 @@ int mlx5vf_cmd_save_vhca_state(struct pci_dev *pdev, u16 vhca_id, if (err) goto err_exec; - migf->total_length = - MLX5_GET(save_vhca_state_out, out, actual_image_size); - - mlx5_core_destroy_mkey(mdev, mkey); - mlx5_core_dealloc_pd(mdev, pdn); - dma_unmap_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE, 0); - mlx5_vf_put_core_dev(mdev); - - return 0; - + migf->total_length = MLX5_GET(save_vhca_state_out, out, + actual_image_size); err_exec: mlx5_core_destroy_mkey(mdev, mkey); err_create_mkey: dma_unmap_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE, 0); err_dma_map: mlx5_core_dealloc_pd(mdev, pdn); -end: - mlx5_vf_put_core_dev(mdev); return err; } -int mlx5vf_cmd_load_vhca_state(struct pci_dev *pdev, u16 vhca_id, +int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev, struct mlx5_vf_migration_file *migf) { - struct mlx5_core_dev *mdev = mlx5_vf_get_core_dev(pdev); + struct mlx5_core_dev *mdev; u32 out[MLX5_ST_SZ_DW(save_vhca_state_out)] = {}; u32 in[MLX5_ST_SZ_DW(save_vhca_state_in)] = {}; u32 pdn, mkey; int err; - if (!mdev) + lockdep_assert_held(&mvdev->state_mutex); + if (mvdev->mdev_detach) return -ENOTCONN; mutex_lock(&migf->lock); @@ -292,6 +277,7 @@ int mlx5vf_cmd_load_vhca_state(struct pci_dev *pdev, 
u16 vhca_id, goto end; } + mdev = mvdev->mdev; err = mlx5_core_alloc_pd(mdev, &pdn); if (err) goto end; @@ -307,7 +293,7 @@ int mlx5vf_cmd_load_vhca_state(struct pci_dev *pdev, u16 vhca_id, MLX5_SET(load_vhca_state_in, in, opcode, MLX5_CMD_OP_LOAD_VHCA_STATE); MLX5_SET(load_vhca_state_in, in, op_mod, 0); - MLX5_SET(load_vhca_state_in, in, vhca_id, vhca_id); + MLX5_SET(load_vhca_state_in, in, vhca_id, mvdev->vhca_id); MLX5_SET(load_vhca_state_in, in, mkey, mkey); MLX5_SET(load_vhca_state_in, in, size, migf->total_length); @@ -319,7 +305,6 @@ err_mkey: err_reg: mlx5_core_dealloc_pd(mdev, pdn); end: - mlx5_vf_put_core_dev(mdev); mutex_unlock(&migf->lock); return err; } diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h index dd4257b27d22..94928055c005 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -43,16 +43,15 @@ struct mlx5vf_pci_core_device { struct mlx5_core_dev *mdev; }; -int mlx5vf_cmd_suspend_vhca(struct pci_dev *pdev, u16 vhca_id, u16 op_mod); -int mlx5vf_cmd_resume_vhca(struct pci_dev *pdev, u16 vhca_id, u16 op_mod); -int mlx5vf_cmd_query_vhca_migration_state(struct pci_dev *pdev, u16 vhca_id, +int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod); +int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod); +int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev, size_t *state_size); -int mlx5vf_cmd_get_vhca_id(struct pci_dev *pdev, u16 function_id, u16 *vhca_id); void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev); void mlx5vf_cmd_remove_migratable(struct mlx5vf_pci_core_device *mvdev); -int mlx5vf_cmd_save_vhca_state(struct pci_dev *pdev, u16 vhca_id, +int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, struct mlx5_vf_migration_file *migf); -int mlx5vf_cmd_load_vhca_state(struct pci_dev *pdev, u16 vhca_id, +int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev, struct mlx5_vf_migration_file *migf); void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev); #endif /* MLX5_VFIO_CMD_H */ diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index 26065487881f..2db6b816f003 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -208,8 +208,8 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev) stream_open(migf->filp->f_inode, migf->filp); mutex_init(&migf->lock); - ret = mlx5vf_cmd_query_vhca_migration_state( - mvdev->core_device.pdev, mvdev->vhca_id, &migf->total_length); + ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, + &migf->total_length); if (ret) goto out_free; @@ -218,8 +218,7 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev) if (ret) goto out_free; - ret = mlx5vf_cmd_save_vhca_state(mvdev->core_device.pdev, - mvdev->vhca_id, migf); + ret = mlx5vf_cmd_save_vhca_state(mvdev, migf); if (ret) goto out_free; return migf; @@ -346,8 +345,7 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev, int ret; if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_STOP) { - ret = mlx5vf_cmd_suspend_vhca( - mvdev->core_device.pdev, mvdev->vhca_id, + ret = mlx5vf_cmd_suspend_vhca(mvdev, MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER); if (ret) return ERR_PTR(ret); @@ -355,8 +353,7 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev, } if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RUNNING_P2P) { - ret = mlx5vf_cmd_resume_vhca( - mvdev->core_device.pdev, 
mvdev->vhca_id, + ret = mlx5vf_cmd_resume_vhca(mvdev, MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_RESPONDER); if (ret) return ERR_PTR(ret); @@ -364,8 +361,7 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev, } if (cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) { - ret = mlx5vf_cmd_suspend_vhca( - mvdev->core_device.pdev, mvdev->vhca_id, + ret = mlx5vf_cmd_suspend_vhca(mvdev, MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_INITIATOR); if (ret) return ERR_PTR(ret); @@ -373,8 +369,7 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev, } if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) { - ret = mlx5vf_cmd_resume_vhca( - mvdev->core_device.pdev, mvdev->vhca_id, + ret = mlx5vf_cmd_resume_vhca(mvdev, MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_INITIATOR); if (ret) return ERR_PTR(ret); @@ -409,8 +404,7 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev, } if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) { - ret = mlx5vf_cmd_load_vhca_state(mvdev->core_device.pdev, - mvdev->vhca_id, + ret = mlx5vf_cmd_load_vhca_state(mvdev, mvdev->resuming_migf); if (ret) return ERR_PTR(ret); @@ -517,34 +511,16 @@ static int mlx5vf_pci_open_device(struct vfio_device *core_vdev) struct mlx5vf_pci_core_device *mvdev = container_of( core_vdev, struct mlx5vf_pci_core_device, core_device.vdev); struct vfio_pci_core_device *vdev = &mvdev->core_device; - int vf_id; int ret; ret = vfio_pci_core_enable(vdev); if (ret) return ret; - if (!mvdev->migrate_cap) { - vfio_pci_core_finish_enable(vdev); - return 0; - } - - vf_id = pci_iov_vf_id(vdev->pdev); - if (vf_id < 0) { - ret = vf_id; - goto out_disable; - } - - ret = mlx5vf_cmd_get_vhca_id(vdev->pdev, vf_id + 1, &mvdev->vhca_id); - if (ret) - goto out_disable; - - mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING; + if (mvdev->migrate_cap) + mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING; vfio_pci_core_finish_enable(vdev); return 0; -out_disable: - vfio_pci_core_disable(vdev); - return ret; } static void mlx5vf_pci_close_device(struct vfio_device *core_vdev) From 85c205db605b2067dec5c72ff7efbbad899a608e Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 10 May 2022 12:02:06 +0300 Subject: [PATCH 110/334] vfio/mlx5: Run the SAVE state command in an async mode Use the PF asynchronous command mode for the SAVE state command. This enables returning earlier to user space upon issuing successfully the command and improve latency by let things run in parallel. 
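For illustration, a minimal sketch of the asynchronous command pattern used below (not code from this driver; the demo_* names and structure layout are placeholders) looks like this: mlx5_cmd_exec_cb() returns as soon as the command is posted, the completion callback may run in interrupt context, and any sleeping cleanup is deferred to a workqueue.

    #include <linux/mlx5/driver.h>
    #include <linux/workqueue.h>

    struct demo_async {
            struct mlx5_async_work cb_work;  /* handed to mlx5_cmd_exec_cb() */
            struct work_struct work;         /* sleeping cleanup runs here */
            struct workqueue_struct *wq;     /* ordered workqueue owned by the driver */
            int status;                      /* command status, read by the worker */
    };

    /* May be called in interrupt context: record the status and defer the rest. */
    static void demo_cmd_callback(int status, struct mlx5_async_work *context)
    {
            struct demo_async *da = container_of(context, struct demo_async, cb_work);

            da->status = status;
            queue_work(da->wq, &da->work);
    }

    /* Post the command and return without waiting for the device. */
    static int demo_cmd_exec_async(struct mlx5_async_ctx *ctx, struct demo_async *da,
                                   void *in, int in_size, void *out, int out_size)
    {
            return mlx5_cmd_exec_cb(ctx, in, in_size, out, out_size,
                                    demo_cmd_callback, &da->cb_work);
    }

The patch follows the same shape: mlx5vf_save_callback() records the status and queues mlx5vf_mig_file_cleanup_cb() on the ordered cb_wq allocated when the device is marked migratable.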
Link: https://lore.kernel.org/r/20220510090206.90374-5-yishaih@nvidia.com Signed-off-by: Yishai Hadas Reviewed-by: Alex Williamson Signed-off-by: Leon Romanovsky --- drivers/vfio/pci/mlx5/cmd.c | 81 +++++++++++++++++++++++++++++++++--- drivers/vfio/pci/mlx5/cmd.h | 19 ++++++++- drivers/vfio/pci/mlx5/main.c | 40 ++++++++++++++++-- 3 files changed, 131 insertions(+), 9 deletions(-) diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index 2d6e323de268..9b9f33ca270a 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -78,6 +78,7 @@ static int mlx5fv_vf_event(struct notifier_block *nb, mvdev->mdev_detach = false; break; case MLX5_PF_NOTIFY_DISABLE_VF: + mlx5vf_disable_fds(mvdev); mvdev->mdev_detach = true; break; default: @@ -94,6 +95,7 @@ void mlx5vf_cmd_remove_migratable(struct mlx5vf_pci_core_device *mvdev) mlx5_sriov_blocking_notifier_unregister(mvdev->mdev, mvdev->vf_id, &mvdev->nb); + destroy_workqueue(mvdev->cb_wq); } void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev) @@ -119,13 +121,19 @@ void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev) &mvdev->vhca_id)) goto end; + mvdev->cb_wq = alloc_ordered_workqueue("mlx5vf_wq", 0); + if (!mvdev->cb_wq) + goto end; + mutex_init(&mvdev->state_mutex); spin_lock_init(&mvdev->reset_lock); mvdev->nb.notifier_call = mlx5fv_vf_event; ret = mlx5_sriov_blocking_notifier_register(mvdev->mdev, mvdev->vf_id, &mvdev->nb); - if (ret) + if (ret) { + destroy_workqueue(mvdev->cb_wq); goto end; + } mvdev->migrate_cap = 1; mvdev->core_device.vdev.migration_flags = @@ -209,11 +217,56 @@ static int _create_state_mkey(struct mlx5_core_dev *mdev, u32 pdn, return err; } +void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work) +{ + struct mlx5vf_async_data *async_data = container_of(_work, + struct mlx5vf_async_data, work); + struct mlx5_vf_migration_file *migf = container_of(async_data, + struct mlx5_vf_migration_file, async_data); + struct mlx5_core_dev *mdev = migf->mvdev->mdev; + + mutex_lock(&migf->lock); + if (async_data->status) { + migf->is_err = true; + wake_up_interruptible(&migf->poll_wait); + } + mutex_unlock(&migf->lock); + + mlx5_core_destroy_mkey(mdev, async_data->mkey); + dma_unmap_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE, 0); + mlx5_core_dealloc_pd(mdev, async_data->pdn); + kvfree(async_data->out); + fput(migf->filp); +} + +static void mlx5vf_save_callback(int status, struct mlx5_async_work *context) +{ + struct mlx5vf_async_data *async_data = container_of(context, + struct mlx5vf_async_data, cb_work); + struct mlx5_vf_migration_file *migf = container_of(async_data, + struct mlx5_vf_migration_file, async_data); + + if (!status) { + WRITE_ONCE(migf->total_length, + MLX5_GET(save_vhca_state_out, async_data->out, + actual_image_size)); + wake_up_interruptible(&migf->poll_wait); + } + + /* + * The error and the cleanup flows can't run from an + * interrupt context + */ + async_data->status = status; + queue_work(migf->mvdev->cb_wq, &async_data->work); +} + int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, struct mlx5_vf_migration_file *migf) { - u32 out[MLX5_ST_SZ_DW(save_vhca_state_out)] = {}; + u32 out_size = MLX5_ST_SZ_BYTES(save_vhca_state_out); u32 in[MLX5_ST_SZ_DW(save_vhca_state_in)] = {}; + struct mlx5vf_async_data *async_data; struct mlx5_core_dev *mdev; u32 pdn, mkey; int err; @@ -243,13 +296,31 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, MLX5_SET(save_vhca_state_in, in, mkey, mkey); 
MLX5_SET(save_vhca_state_in, in, size, migf->total_length); - err = mlx5_cmd_exec_inout(mdev, save_vhca_state, in, out); + async_data = &migf->async_data; + async_data->out = kvzalloc(out_size, GFP_KERNEL); + if (!async_data->out) { + err = -ENOMEM; + goto err_out; + } + + /* no data exists till the callback comes back */ + migf->total_length = 0; + get_file(migf->filp); + async_data->mkey = mkey; + async_data->pdn = pdn; + err = mlx5_cmd_exec_cb(&migf->async_ctx, in, sizeof(in), + async_data->out, + out_size, mlx5vf_save_callback, + &async_data->cb_work); if (err) goto err_exec; - migf->total_length = MLX5_GET(save_vhca_state_out, out, - actual_image_size); + return 0; + err_exec: + fput(migf->filp); + kvfree(async_data->out); +err_out: mlx5_core_destroy_mkey(mdev, mkey); err_create_mkey: dma_unmap_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE, 0); diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h index 94928055c005..6c3112fdd8b1 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -10,10 +10,20 @@ #include #include +struct mlx5vf_async_data { + struct mlx5_async_work cb_work; + struct work_struct work; + int status; + u32 pdn; + u32 mkey; + void *out; +}; + struct mlx5_vf_migration_file { struct file *filp; struct mutex lock; - bool disabled; + u8 disabled:1; + u8 is_err:1; struct sg_append_table table; size_t total_length; @@ -23,6 +33,10 @@ struct mlx5_vf_migration_file { struct scatterlist *last_offset_sg; unsigned int sg_last_entry; unsigned long last_offset; + struct mlx5vf_pci_core_device *mvdev; + wait_queue_head_t poll_wait; + struct mlx5_async_ctx async_ctx; + struct mlx5vf_async_data async_data; }; struct mlx5vf_pci_core_device { @@ -39,6 +53,7 @@ struct mlx5vf_pci_core_device { spinlock_t reset_lock; struct mlx5_vf_migration_file *resuming_migf; struct mlx5_vf_migration_file *saving_migf; + struct workqueue_struct *cb_wq; struct notifier_block nb; struct mlx5_core_dev *mdev; }; @@ -54,4 +69,6 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev, struct mlx5_vf_migration_file *migf); void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev); +void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev); +void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work); #endif /* MLX5_VFIO_CMD_H */ diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index 2db6b816f003..df8b572977da 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -134,12 +134,22 @@ static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len, return -ESPIPE; pos = &filp->f_pos; + if (!(filp->f_flags & O_NONBLOCK)) { + if (wait_event_interruptible(migf->poll_wait, + READ_ONCE(migf->total_length) || migf->is_err)) + return -ERESTARTSYS; + } + mutex_lock(&migf->lock); + if ((filp->f_flags & O_NONBLOCK) && !READ_ONCE(migf->total_length)) { + done = -EAGAIN; + goto out_unlock; + } if (*pos > migf->total_length) { done = -EINVAL; goto out_unlock; } - if (migf->disabled) { + if (migf->disabled || migf->is_err) { done = -ENODEV; goto out_unlock; } @@ -179,9 +189,28 @@ out_unlock: return done; } +static __poll_t mlx5vf_save_poll(struct file *filp, + struct poll_table_struct *wait) +{ + struct mlx5_vf_migration_file *migf = filp->private_data; + __poll_t pollflags = 0; + + poll_wait(filp, &migf->poll_wait, wait); + + mutex_lock(&migf->lock); + if (migf->disabled || migf->is_err) + pollflags = EPOLLIN | 
EPOLLRDNORM | EPOLLRDHUP; + else if (READ_ONCE(migf->total_length)) + pollflags = EPOLLIN | EPOLLRDNORM; + mutex_unlock(&migf->lock); + + return pollflags; +} + static const struct file_operations mlx5vf_save_fops = { .owner = THIS_MODULE, .read = mlx5vf_save_read, + .poll = mlx5vf_save_poll, .release = mlx5vf_release_file, .llseek = no_llseek, }; @@ -207,7 +236,9 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev) stream_open(migf->filp->f_inode, migf->filp); mutex_init(&migf->lock); - + init_waitqueue_head(&migf->poll_wait); + mlx5_cmd_init_async_ctx(mvdev->mdev, &migf->async_ctx); + INIT_WORK(&migf->async_data.work, mlx5vf_mig_file_cleanup_cb); ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &migf->total_length); if (ret) @@ -218,6 +249,7 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev) if (ret) goto out_free; + migf->mvdev = mvdev; ret = mlx5vf_cmd_save_vhca_state(mvdev, migf); if (ret) goto out_free; @@ -323,7 +355,7 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev) return migf; } -static void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev) +void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev) { if (mvdev->resuming_migf) { mlx5vf_disable_fd(mvdev->resuming_migf); @@ -331,6 +363,8 @@ static void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev) mvdev->resuming_migf = NULL; } if (mvdev->saving_migf) { + mlx5_cmd_cleanup_async_ctx(&mvdev->saving_migf->async_ctx); + cancel_work_sync(&mvdev->saving_migf->async_data.work); mlx5vf_disable_fd(mvdev->saving_migf); fput(mvdev->saving_migf->filp); mvdev->saving_migf = NULL; From a77109ffca339097833f26a1fea55ff71e2b608a Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Wed, 11 May 2022 13:12:58 -0600 Subject: [PATCH 111/334] vfio: Stop using iommu_present() IOMMU groups have been mandatory for some time now, so a device without one is necessarily a device without any usable IOMMU, therefore the iommu_present() check is redundant (or at best unhelpful). Signed-off-by: Robin Murphy Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/537103bbd7246574f37f2c88704d7824a3a889f2.1649160714.git.robin.murphy@arm.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index a4555014bd1e..7b0a7b85e77e 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -745,11 +745,11 @@ static struct vfio_group *vfio_group_find_or_alloc(struct device *dev) iommu_group = iommu_group_get(dev); #ifdef CONFIG_VFIO_NOIOMMU - if (!iommu_group && noiommu && !iommu_present(dev->bus)) { + if (!iommu_group && noiommu) { /* * With noiommu enabled, create an IOMMU group for devices that - * don't already have one and don't have an iommu_ops on their - * bus. Taint the kernel because we're about to give a DMA + * don't already have one, implying no IOMMU hardware/driver + * exists. Taint the kernel because we're about to give a DMA * capable device to a user without IOMMU protection. */ group = vfio_noiommu_group_alloc(dev, VFIO_NO_IOMMU); From 09ea48efffa3156218980e20aaf23dcc7d6000fc Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 11 May 2022 13:12:58 -0600 Subject: [PATCH 112/334] vfio: Make vfio_(un)register_notifier accept a vfio_device All callers have a struct vfio_device trivially available, pass it in directly and avoid calling the expensive vfio_group_get_from_dev(). Acked-by: Eric Farman Reviewed-by: Jason J. 
Herne Reviewed-by: Tony Krowiak Reviewed-by: Kevin Tian Reviewed-by: Christoph Hellwig Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/1-v4-8045e76bf00b+13d-vfio_mdev_no_group_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/gpu/drm/i915/gvt/kvmgt.c | 24 ++++++++++++------------ drivers/s390/cio/vfio_ccw_ops.c | 7 +++---- drivers/s390/crypto/vfio_ap_ops.c | 14 +++++++------- drivers/vfio/vfio.c | 28 +++++++++------------------- include/linux/vfio.h | 4 ++-- 5 files changed, 33 insertions(+), 44 deletions(-) diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c index 0787ba5c301f..1cec4f1fdfac 100644 --- a/drivers/gpu/drm/i915/gvt/kvmgt.c +++ b/drivers/gpu/drm/i915/gvt/kvmgt.c @@ -810,8 +810,8 @@ static int intel_vgpu_open_device(struct vfio_device *vfio_dev) vgpu->group_notifier.notifier_call = intel_vgpu_group_notifier; events = VFIO_IOMMU_NOTIFY_DMA_UNMAP; - ret = vfio_register_notifier(vfio_dev->dev, VFIO_IOMMU_NOTIFY, &events, - &vgpu->iommu_notifier); + ret = vfio_register_notifier(vfio_dev, VFIO_IOMMU_NOTIFY, &events, + &vgpu->iommu_notifier); if (ret != 0) { gvt_vgpu_err("vfio_register_notifier for iommu failed: %d\n", ret); @@ -819,8 +819,8 @@ static int intel_vgpu_open_device(struct vfio_device *vfio_dev) } events = VFIO_GROUP_NOTIFY_SET_KVM; - ret = vfio_register_notifier(vfio_dev->dev, VFIO_GROUP_NOTIFY, &events, - &vgpu->group_notifier); + ret = vfio_register_notifier(vfio_dev, VFIO_GROUP_NOTIFY, &events, + &vgpu->group_notifier); if (ret != 0) { gvt_vgpu_err("vfio_register_notifier for group failed: %d\n", ret); @@ -873,12 +873,12 @@ undo_group: vgpu->vfio_group = NULL; undo_register: - vfio_unregister_notifier(vfio_dev->dev, VFIO_GROUP_NOTIFY, - &vgpu->group_notifier); + vfio_unregister_notifier(vfio_dev, VFIO_GROUP_NOTIFY, + &vgpu->group_notifier); undo_iommu: - vfio_unregister_notifier(vfio_dev->dev, VFIO_IOMMU_NOTIFY, - &vgpu->iommu_notifier); + vfio_unregister_notifier(vfio_dev, VFIO_IOMMU_NOTIFY, + &vgpu->iommu_notifier); out: return ret; } @@ -907,13 +907,13 @@ static void __intel_vgpu_release(struct intel_vgpu *vgpu) intel_gvt_release_vgpu(vgpu); - ret = vfio_unregister_notifier(vgpu->vfio_device.dev, VFIO_IOMMU_NOTIFY, - &vgpu->iommu_notifier); + ret = vfio_unregister_notifier(&vgpu->vfio_device, VFIO_IOMMU_NOTIFY, + &vgpu->iommu_notifier); drm_WARN(&i915->drm, ret, "vfio_unregister_notifier for iommu failed: %d\n", ret); - ret = vfio_unregister_notifier(vgpu->vfio_device.dev, VFIO_GROUP_NOTIFY, - &vgpu->group_notifier); + ret = vfio_unregister_notifier(&vgpu->vfio_device, VFIO_GROUP_NOTIFY, + &vgpu->group_notifier); drm_WARN(&i915->drm, ret, "vfio_unregister_notifier for group failed: %d\n", ret); diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c index c4d60cdbf247..b49e2e9db2dc 100644 --- a/drivers/s390/cio/vfio_ccw_ops.c +++ b/drivers/s390/cio/vfio_ccw_ops.c @@ -183,7 +183,7 @@ static int vfio_ccw_mdev_open_device(struct vfio_device *vdev) private->nb.notifier_call = vfio_ccw_mdev_notifier; - ret = vfio_register_notifier(vdev->dev, VFIO_IOMMU_NOTIFY, + ret = vfio_register_notifier(vdev, VFIO_IOMMU_NOTIFY, &events, &private->nb); if (ret) return ret; @@ -204,8 +204,7 @@ static int vfio_ccw_mdev_open_device(struct vfio_device *vdev) out_unregister: vfio_ccw_unregister_dev_regions(private); - vfio_unregister_notifier(vdev->dev, VFIO_IOMMU_NOTIFY, - &private->nb); + vfio_unregister_notifier(vdev, VFIO_IOMMU_NOTIFY, &private->nb); return ret; } @@ -223,7 +222,7 @@ static void 
vfio_ccw_mdev_close_device(struct vfio_device *vdev) cp_free(&private->cp); vfio_ccw_unregister_dev_regions(private); - vfio_unregister_notifier(vdev->dev, VFIO_IOMMU_NOTIFY, &private->nb); + vfio_unregister_notifier(vdev, VFIO_IOMMU_NOTIFY, &private->nb); } static ssize_t vfio_ccw_mdev_read_io_region(struct vfio_ccw_private *private, diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c index ee0a3bf8f476..bfa7ee6ef532 100644 --- a/drivers/s390/crypto/vfio_ap_ops.c +++ b/drivers/s390/crypto/vfio_ap_ops.c @@ -1406,21 +1406,21 @@ static int vfio_ap_mdev_open_device(struct vfio_device *vdev) matrix_mdev->group_notifier.notifier_call = vfio_ap_mdev_group_notifier; events = VFIO_GROUP_NOTIFY_SET_KVM; - ret = vfio_register_notifier(vdev->dev, VFIO_GROUP_NOTIFY, - &events, &matrix_mdev->group_notifier); + ret = vfio_register_notifier(vdev, VFIO_GROUP_NOTIFY, &events, + &matrix_mdev->group_notifier); if (ret) return ret; matrix_mdev->iommu_notifier.notifier_call = vfio_ap_mdev_iommu_notifier; events = VFIO_IOMMU_NOTIFY_DMA_UNMAP; - ret = vfio_register_notifier(vdev->dev, VFIO_IOMMU_NOTIFY, - &events, &matrix_mdev->iommu_notifier); + ret = vfio_register_notifier(vdev, VFIO_IOMMU_NOTIFY, &events, + &matrix_mdev->iommu_notifier); if (ret) goto out_unregister_group; return 0; out_unregister_group: - vfio_unregister_notifier(vdev->dev, VFIO_GROUP_NOTIFY, + vfio_unregister_notifier(vdev, VFIO_GROUP_NOTIFY, &matrix_mdev->group_notifier); return ret; } @@ -1430,9 +1430,9 @@ static void vfio_ap_mdev_close_device(struct vfio_device *vdev) struct ap_matrix_mdev *matrix_mdev = container_of(vdev, struct ap_matrix_mdev, vdev); - vfio_unregister_notifier(vdev->dev, VFIO_IOMMU_NOTIFY, + vfio_unregister_notifier(vdev, VFIO_IOMMU_NOTIFY, &matrix_mdev->iommu_notifier); - vfio_unregister_notifier(vdev->dev, VFIO_GROUP_NOTIFY, + vfio_unregister_notifier(vdev, VFIO_GROUP_NOTIFY, &matrix_mdev->group_notifier); vfio_ap_mdev_unset_kvm(matrix_mdev); } diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index 7b0a7b85e77e..cb4730756510 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -2484,19 +2484,16 @@ static int vfio_unregister_group_notifier(struct vfio_group *group, return ret; } -int vfio_register_notifier(struct device *dev, enum vfio_notify_type type, - unsigned long *events, struct notifier_block *nb) +int vfio_register_notifier(struct vfio_device *device, + enum vfio_notify_type type, unsigned long *events, + struct notifier_block *nb) { - struct vfio_group *group; + struct vfio_group *group = device->group; int ret; - if (!dev || !nb || !events || (*events == 0)) + if (!nb || !events || (*events == 0)) return -EINVAL; - group = vfio_group_get_from_dev(dev); - if (!group) - return -ENODEV; - switch (type) { case VFIO_IOMMU_NOTIFY: ret = vfio_register_iommu_notifier(group, events, nb); @@ -2507,25 +2504,20 @@ int vfio_register_notifier(struct device *dev, enum vfio_notify_type type, default: ret = -EINVAL; } - - vfio_group_put(group); return ret; } EXPORT_SYMBOL(vfio_register_notifier); -int vfio_unregister_notifier(struct device *dev, enum vfio_notify_type type, +int vfio_unregister_notifier(struct vfio_device *device, + enum vfio_notify_type type, struct notifier_block *nb) { - struct vfio_group *group; + struct vfio_group *group = device->group; int ret; - if (!dev || !nb) + if (!nb) return -EINVAL; - group = vfio_group_get_from_dev(dev); - if (!group) - return -ENODEV; - switch (type) { case VFIO_IOMMU_NOTIFY: ret = vfio_unregister_iommu_notifier(group, nb); @@ 
-2536,8 +2528,6 @@ int vfio_unregister_notifier(struct device *dev, enum vfio_notify_type type, default: ret = -EINVAL; } - - vfio_group_put(group); return ret; } EXPORT_SYMBOL(vfio_unregister_notifier); diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 66dda06ec42d..a00fd722f044 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -178,11 +178,11 @@ enum vfio_notify_type { /* events for VFIO_GROUP_NOTIFY */ #define VFIO_GROUP_NOTIFY_SET_KVM BIT(0) -extern int vfio_register_notifier(struct device *dev, +extern int vfio_register_notifier(struct vfio_device *device, enum vfio_notify_type type, unsigned long *required_events, struct notifier_block *nb); -extern int vfio_unregister_notifier(struct device *dev, +extern int vfio_unregister_notifier(struct vfio_device *device, enum vfio_notify_type type, struct notifier_block *nb); From 0a58795647cd4300470788ffdbff6b29b5f00632 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 11 May 2022 13:12:59 -0600 Subject: [PATCH 113/334] vfio/ccw: Remove mdev from struct channel_program The next patch wants the vfio_device instead. There is no reason to store a pointer here since we can container_of back to the vfio_device. Reviewed-by: Eric Farman Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/2-v4-8045e76bf00b+13d-vfio_mdev_no_group_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/s390/cio/vfio_ccw_cp.c | 47 ++++++++++++++++++++------------- drivers/s390/cio/vfio_ccw_cp.h | 4 +-- drivers/s390/cio/vfio_ccw_fsm.c | 3 +-- 3 files changed, 30 insertions(+), 24 deletions(-) diff --git a/drivers/s390/cio/vfio_ccw_cp.c b/drivers/s390/cio/vfio_ccw_cp.c index 8d1b2771c1aa..7a1cf3091cd6 100644 --- a/drivers/s390/cio/vfio_ccw_cp.c +++ b/drivers/s390/cio/vfio_ccw_cp.c @@ -16,6 +16,7 @@ #include #include "vfio_ccw_cp.h" +#include "vfio_ccw_private.h" struct pfn_array { /* Starting guest physical I/O address. */ @@ -98,17 +99,17 @@ static int pfn_array_alloc(struct pfn_array *pa, u64 iova, unsigned int len) * If the pin request partially succeeds, or fails completely, * all pages are left unpinned and a negative error value is returned. */ -static int pfn_array_pin(struct pfn_array *pa, struct device *mdev) +static int pfn_array_pin(struct pfn_array *pa, struct vfio_device *vdev) { int ret = 0; - ret = vfio_pin_pages(mdev, pa->pa_iova_pfn, pa->pa_nr, + ret = vfio_pin_pages(vdev->dev, pa->pa_iova_pfn, pa->pa_nr, IOMMU_READ | IOMMU_WRITE, pa->pa_pfn); if (ret < 0) { goto err_out; } else if (ret > 0 && ret != pa->pa_nr) { - vfio_unpin_pages(mdev, pa->pa_iova_pfn, ret); + vfio_unpin_pages(vdev->dev, pa->pa_iova_pfn, ret); ret = -EINVAL; goto err_out; } @@ -122,11 +123,11 @@ err_out: } /* Unpin the pages before releasing the memory. */ -static void pfn_array_unpin_free(struct pfn_array *pa, struct device *mdev) +static void pfn_array_unpin_free(struct pfn_array *pa, struct vfio_device *vdev) { /* Only unpin if any pages were pinned to begin with */ if (pa->pa_nr) - vfio_unpin_pages(mdev, pa->pa_iova_pfn, pa->pa_nr); + vfio_unpin_pages(vdev->dev, pa->pa_iova_pfn, pa->pa_nr); pa->pa_nr = 0; kfree(pa->pa_iova_pfn); } @@ -190,8 +191,7 @@ static void convert_ccw0_to_ccw1(struct ccw1 *source, unsigned long len) * Within the domain (@mdev), copy @n bytes from a guest physical * address (@iova) to a host physical address (@to). 
*/ -static long copy_from_iova(struct device *mdev, - void *to, u64 iova, +static long copy_from_iova(struct vfio_device *vdev, void *to, u64 iova, unsigned long n) { struct pfn_array pa = {0}; @@ -203,9 +203,9 @@ static long copy_from_iova(struct device *mdev, if (ret < 0) return ret; - ret = pfn_array_pin(&pa, mdev); + ret = pfn_array_pin(&pa, vdev); if (ret < 0) { - pfn_array_unpin_free(&pa, mdev); + pfn_array_unpin_free(&pa, vdev); return ret; } @@ -226,7 +226,7 @@ static long copy_from_iova(struct device *mdev, break; } - pfn_array_unpin_free(&pa, mdev); + pfn_array_unpin_free(&pa, vdev); return l; } @@ -423,11 +423,13 @@ static int ccwchain_loop_tic(struct ccwchain *chain, static int ccwchain_handle_ccw(u32 cda, struct channel_program *cp) { + struct vfio_device *vdev = + &container_of(cp, struct vfio_ccw_private, cp)->vdev; struct ccwchain *chain; int len, ret; /* Copy 2K (the most we support today) of possible CCWs */ - len = copy_from_iova(cp->mdev, cp->guest_cp, cda, + len = copy_from_iova(vdev, cp->guest_cp, cda, CCWCHAIN_LEN_MAX * sizeof(struct ccw1)); if (len) return len; @@ -508,6 +510,8 @@ static int ccwchain_fetch_direct(struct ccwchain *chain, int idx, struct channel_program *cp) { + struct vfio_device *vdev = + &container_of(cp, struct vfio_ccw_private, cp)->vdev; struct ccw1 *ccw; struct pfn_array *pa; u64 iova; @@ -526,7 +530,7 @@ static int ccwchain_fetch_direct(struct ccwchain *chain, if (ccw_is_idal(ccw)) { /* Read first IDAW to see if it's 4K-aligned or not. */ /* All subsequent IDAws will be 4K-aligned. */ - ret = copy_from_iova(cp->mdev, &iova, ccw->cda, sizeof(iova)); + ret = copy_from_iova(vdev, &iova, ccw->cda, sizeof(iova)); if (ret) return ret; } else { @@ -555,7 +559,7 @@ static int ccwchain_fetch_direct(struct ccwchain *chain, if (ccw_is_idal(ccw)) { /* Copy guest IDAL into host IDAL */ - ret = copy_from_iova(cp->mdev, idaws, ccw->cda, idal_len); + ret = copy_from_iova(vdev, idaws, ccw->cda, idal_len); if (ret) goto out_unpin; @@ -574,7 +578,7 @@ static int ccwchain_fetch_direct(struct ccwchain *chain, } if (ccw_does_data_transfer(ccw)) { - ret = pfn_array_pin(pa, cp->mdev); + ret = pfn_array_pin(pa, vdev); if (ret < 0) goto out_unpin; } else { @@ -590,7 +594,7 @@ static int ccwchain_fetch_direct(struct ccwchain *chain, return 0; out_unpin: - pfn_array_unpin_free(pa, cp->mdev); + pfn_array_unpin_free(pa, vdev); out_free_idaws: kfree(idaws); out_init: @@ -632,8 +636,10 @@ static int ccwchain_fetch_one(struct ccwchain *chain, * Returns: * %0 on success and a negative error value on failure. */ -int cp_init(struct channel_program *cp, struct device *mdev, union orb *orb) +int cp_init(struct channel_program *cp, union orb *orb) { + struct vfio_device *vdev = + &container_of(cp, struct vfio_ccw_private, cp)->vdev; /* custom ratelimit used to avoid flood during guest IPL */ static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 1); int ret; @@ -650,11 +656,12 @@ int cp_init(struct channel_program *cp, struct device *mdev, union orb *orb) * the problem if something does break. 
*/ if (!orb->cmd.pfch && __ratelimit(&ratelimit_state)) - dev_warn(mdev, "Prefetching channel program even though prefetch not specified in ORB"); + dev_warn( + vdev->dev, + "Prefetching channel program even though prefetch not specified in ORB"); INIT_LIST_HEAD(&cp->ccwchain_list); memcpy(&cp->orb, orb, sizeof(*orb)); - cp->mdev = mdev; /* Build a ccwchain for the first CCW segment */ ret = ccwchain_handle_ccw(orb->cmd.cpa, cp); @@ -682,6 +689,8 @@ int cp_init(struct channel_program *cp, struct device *mdev, union orb *orb) */ void cp_free(struct channel_program *cp) { + struct vfio_device *vdev = + &container_of(cp, struct vfio_ccw_private, cp)->vdev; struct ccwchain *chain, *temp; int i; @@ -691,7 +700,7 @@ void cp_free(struct channel_program *cp) cp->initialized = false; list_for_each_entry_safe(chain, temp, &cp->ccwchain_list, next) { for (i = 0; i < chain->ch_len; i++) { - pfn_array_unpin_free(chain->ch_pa + i, cp->mdev); + pfn_array_unpin_free(chain->ch_pa + i, vdev); ccwchain_cda_free(chain, i); } ccwchain_free(chain); diff --git a/drivers/s390/cio/vfio_ccw_cp.h b/drivers/s390/cio/vfio_ccw_cp.h index ba31240ce965..e4c436199b4c 100644 --- a/drivers/s390/cio/vfio_ccw_cp.h +++ b/drivers/s390/cio/vfio_ccw_cp.h @@ -37,13 +37,11 @@ struct channel_program { struct list_head ccwchain_list; union orb orb; - struct device *mdev; bool initialized; struct ccw1 *guest_cp; }; -extern int cp_init(struct channel_program *cp, struct device *mdev, - union orb *orb); +extern int cp_init(struct channel_program *cp, union orb *orb); extern void cp_free(struct channel_program *cp); extern int cp_prefetch(struct channel_program *cp); extern union orb *cp_get_orb(struct channel_program *cp, u32 intparm, u8 lpm); diff --git a/drivers/s390/cio/vfio_ccw_fsm.c b/drivers/s390/cio/vfio_ccw_fsm.c index e435a9cd92da..8483a266051c 100644 --- a/drivers/s390/cio/vfio_ccw_fsm.c +++ b/drivers/s390/cio/vfio_ccw_fsm.c @@ -262,8 +262,7 @@ static void fsm_io_request(struct vfio_ccw_private *private, errstr = "transport mode"; goto err_out; } - io_region->ret_code = cp_init(&private->cp, mdev_dev(mdev), - orb); + io_region->ret_code = cp_init(&private->cp, orb); if (io_region->ret_code) { VFIO_CCW_MSG_EVENT(2, "%pUl (%x.%x.%04x): cp_init=%d\n", From 8e432bb015b6c327d016b1dca509964e189c4770 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 11 May 2022 13:12:59 -0600 Subject: [PATCH 114/334] vfio/mdev: Pass in a struct vfio_device * to vfio_pin/unpin_pages() Every caller has a readily available vfio_device pointer, use that instead of passing in a generic struct device. The struct vfio_device already contains the group we need so this avoids complexity, extra refcountings, and a confusing lifecycle model. Reviewed-by: Christoph Hellwig Acked-by: Eric Farman Reviewed-by: Jason J. 
Herne Reviewed-by: Tony Krowiak Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/3-v4-8045e76bf00b+13d-vfio_mdev_no_group_jgg@nvidia.com Signed-off-by: Alex Williamson --- .../driver-api/vfio-mediated-device.rst | 4 +- drivers/s390/cio/vfio_ccw_cp.c | 6 +-- drivers/s390/crypto/vfio_ap_ops.c | 9 ++-- drivers/vfio/vfio.c | 46 +++++++------------ include/linux/vfio.h | 4 +- 5 files changed, 27 insertions(+), 42 deletions(-) diff --git a/Documentation/driver-api/vfio-mediated-device.rst b/Documentation/driver-api/vfio-mediated-device.rst index 784bbeb22adc..eded8719180f 100644 --- a/Documentation/driver-api/vfio-mediated-device.rst +++ b/Documentation/driver-api/vfio-mediated-device.rst @@ -262,10 +262,10 @@ Translation APIs for Mediated Devices The following APIs are provided for translating user pfn to host pfn in a VFIO driver:: - extern int vfio_pin_pages(struct device *dev, unsigned long *user_pfn, + int vfio_pin_pages(struct vfio_device *device, unsigned long *user_pfn, int npage, int prot, unsigned long *phys_pfn); - extern int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn, + int vfio_unpin_pages(struct vfio_device *device, unsigned long *user_pfn, int npage); These functions call back into the back-end IOMMU module by using the pin_pages diff --git a/drivers/s390/cio/vfio_ccw_cp.c b/drivers/s390/cio/vfio_ccw_cp.c index 7a1cf3091cd6..0c2be9421ab7 100644 --- a/drivers/s390/cio/vfio_ccw_cp.c +++ b/drivers/s390/cio/vfio_ccw_cp.c @@ -103,13 +103,13 @@ static int pfn_array_pin(struct pfn_array *pa, struct vfio_device *vdev) { int ret = 0; - ret = vfio_pin_pages(vdev->dev, pa->pa_iova_pfn, pa->pa_nr, + ret = vfio_pin_pages(vdev, pa->pa_iova_pfn, pa->pa_nr, IOMMU_READ | IOMMU_WRITE, pa->pa_pfn); if (ret < 0) { goto err_out; } else if (ret > 0 && ret != pa->pa_nr) { - vfio_unpin_pages(vdev->dev, pa->pa_iova_pfn, ret); + vfio_unpin_pages(vdev, pa->pa_iova_pfn, ret); ret = -EINVAL; goto err_out; } @@ -127,7 +127,7 @@ static void pfn_array_unpin_free(struct pfn_array *pa, struct vfio_device *vdev) { /* Only unpin if any pages were pinned to begin with */ if (pa->pa_nr) - vfio_unpin_pages(vdev->dev, pa->pa_iova_pfn, pa->pa_nr); + vfio_unpin_pages(vdev, pa->pa_iova_pfn, pa->pa_nr); pa->pa_nr = 0; kfree(pa->pa_iova_pfn); } diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c index bfa7ee6ef532..e8914024f5b1 100644 --- a/drivers/s390/crypto/vfio_ap_ops.c +++ b/drivers/s390/crypto/vfio_ap_ops.c @@ -124,8 +124,7 @@ static void vfio_ap_free_aqic_resources(struct vfio_ap_queue *q) q->saved_isc = VFIO_AP_ISC_INVALID; } if (q->saved_pfn && !WARN_ON(!q->matrix_mdev)) { - vfio_unpin_pages(mdev_dev(q->matrix_mdev->mdev), - &q->saved_pfn, 1); + vfio_unpin_pages(&q->matrix_mdev->vdev, &q->saved_pfn, 1); q->saved_pfn = 0; } } @@ -258,7 +257,7 @@ static struct ap_queue_status vfio_ap_irq_enable(struct vfio_ap_queue *q, return status; } - ret = vfio_pin_pages(mdev_dev(q->matrix_mdev->mdev), &g_pfn, 1, + ret = vfio_pin_pages(&q->matrix_mdev->vdev, &g_pfn, 1, IOMMU_READ | IOMMU_WRITE, &h_pfn); switch (ret) { case 1: @@ -301,7 +300,7 @@ static struct ap_queue_status vfio_ap_irq_enable(struct vfio_ap_queue *q, break; case AP_RESPONSE_OTHERWISE_CHANGED: /* We could not modify IRQ setings: clear new configuration */ - vfio_unpin_pages(mdev_dev(q->matrix_mdev->mdev), &g_pfn, 1); + vfio_unpin_pages(&q->matrix_mdev->vdev, &g_pfn, 1); kvm_s390_gisc_unregister(kvm, isc); break; default: @@ -1250,7 +1249,7 @@ static int 
vfio_ap_mdev_iommu_notifier(struct notifier_block *nb, struct vfio_iommu_type1_dma_unmap *unmap = data; unsigned long g_pfn = unmap->iova >> PAGE_SHIFT; - vfio_unpin_pages(mdev_dev(matrix_mdev->mdev), &g_pfn, 1); + vfio_unpin_pages(&matrix_mdev->vdev, &g_pfn, 1); return NOTIFY_OK; } diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index cb4730756510..0f85e789fbd3 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -2134,7 +2134,7 @@ EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare); /* * Pin a set of guest PFNs and return their associated host PFNs for local * domain only. - * @dev [in] : device + * @device [in] : device * @user_pfn [in]: array of user/guest PFNs to be pinned. * @npage [in] : count of elements in user_pfn array. This count should not * be greater VFIO_PIN_PAGES_MAX_ENTRIES. @@ -2142,32 +2142,26 @@ EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare); * @phys_pfn[out]: array of host PFNs * Return error or number of pages pinned. */ -int vfio_pin_pages(struct device *dev, unsigned long *user_pfn, int npage, - int prot, unsigned long *phys_pfn) +int vfio_pin_pages(struct vfio_device *device, unsigned long *user_pfn, + int npage, int prot, unsigned long *phys_pfn) { struct vfio_container *container; - struct vfio_group *group; + struct vfio_group *group = device->group; struct vfio_iommu_driver *driver; int ret; - if (!dev || !user_pfn || !phys_pfn || !npage) + if (!user_pfn || !phys_pfn || !npage) return -EINVAL; if (npage > VFIO_PIN_PAGES_MAX_ENTRIES) return -E2BIG; - group = vfio_group_get_from_dev(dev); - if (!group) - return -ENODEV; - - if (group->dev_counter > 1) { - ret = -EINVAL; - goto err_pin_pages; - } + if (group->dev_counter > 1) + return -EINVAL; ret = vfio_group_add_container_user(group); if (ret) - goto err_pin_pages; + return ret; container = group->container; driver = container->iommu_driver; @@ -2180,43 +2174,37 @@ int vfio_pin_pages(struct device *dev, unsigned long *user_pfn, int npage, vfio_group_try_dissolve_container(group); -err_pin_pages: - vfio_group_put(group); return ret; } EXPORT_SYMBOL(vfio_pin_pages); /* * Unpin set of host PFNs for local domain only. - * @dev [in] : device + * @device [in] : device * @user_pfn [in]: array of user/guest PFNs to be unpinned. Number of user/guest * PFNs should not be greater than VFIO_PIN_PAGES_MAX_ENTRIES. * @npage [in] : count of elements in user_pfn array. This count should not * be greater than VFIO_PIN_PAGES_MAX_ENTRIES. * Return error or number of pages unpinned. 
*/ -int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn, int npage) +int vfio_unpin_pages(struct vfio_device *device, unsigned long *user_pfn, + int npage) { struct vfio_container *container; - struct vfio_group *group; struct vfio_iommu_driver *driver; int ret; - if (!dev || !user_pfn || !npage) + if (!user_pfn || !npage) return -EINVAL; if (npage > VFIO_PIN_PAGES_MAX_ENTRIES) return -E2BIG; - group = vfio_group_get_from_dev(dev); - if (!group) - return -ENODEV; - - ret = vfio_group_add_container_user(group); + ret = vfio_group_add_container_user(device->group); if (ret) - goto err_unpin_pages; + return ret; - container = group->container; + container = device->group->container; driver = container->iommu_driver; if (likely(driver && driver->ops->unpin_pages)) ret = driver->ops->unpin_pages(container->iommu_data, user_pfn, @@ -2224,10 +2212,8 @@ int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn, int npage) else ret = -ENOTTY; - vfio_group_try_dissolve_container(group); + vfio_group_try_dissolve_container(device->group); -err_unpin_pages: - vfio_group_put(group); return ret; } EXPORT_SYMBOL(vfio_unpin_pages); diff --git a/include/linux/vfio.h b/include/linux/vfio.h index a00fd722f044..bddc70f88899 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -150,9 +150,9 @@ extern long vfio_external_check_extension(struct vfio_group *group, #define VFIO_PIN_PAGES_MAX_ENTRIES (PAGE_SIZE/sizeof(unsigned long)) -extern int vfio_pin_pages(struct device *dev, unsigned long *user_pfn, +extern int vfio_pin_pages(struct vfio_device *device, unsigned long *user_pfn, int npage, int prot, unsigned long *phys_pfn); -extern int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn, +extern int vfio_unpin_pages(struct vfio_device *device, unsigned long *user_pfn, int npage); extern int vfio_group_pin_pages(struct vfio_group *group, From c6250ffbacc5989a5db3b9acce34b93570938f60 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 11 May 2022 13:12:59 -0600 Subject: [PATCH 115/334] vfio/mdev: Pass in a struct vfio_device * to vfio_dma_rw() Every caller has a readily available vfio_device pointer, use that instead of passing in a generic struct device. Change vfio_dma_rw() to take in the struct vfio_device and move the container users that would have been held by vfio_group_get_external_user_from_dev() to vfio_dma_rw() directly, like vfio_pin/unpin_pages(). 
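For illustration, a hypothetical caller under the new signature (demo_read_guest_u32() is a placeholder, not a function added by this patch) reduces to a single call on the vfio_device it already owns, with no group lookup or external-user reference:

    #include <linux/types.h>
    #include <linux/vfio.h>

    /* Copy a 32-bit value from a guest IOVA into a kernel buffer. */
    static int demo_read_guest_u32(struct vfio_device *vdev,
                                   dma_addr_t guest_iova, u32 *val)
    {
            /* write == false: read from the guest IOVA into @val */
            return vfio_dma_rw(vdev, guest_iova, val, sizeof(*val), false);
    }

As the gvt.h hunks below show, callers only do this while the device is attached/open.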
Reviewed-by: Christoph Hellwig Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/4-v4-8045e76bf00b+13d-vfio_mdev_no_group_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/gpu/drm/i915/gvt/gvt.h | 4 ++-- drivers/vfio/vfio.c | 24 +++++++++++------------- include/linux/vfio.h | 2 +- 3 files changed, 14 insertions(+), 16 deletions(-) diff --git a/drivers/gpu/drm/i915/gvt/gvt.h b/drivers/gpu/drm/i915/gvt/gvt.h index 03ecffc2ba56..5a28ee965b7f 100644 --- a/drivers/gpu/drm/i915/gvt/gvt.h +++ b/drivers/gpu/drm/i915/gvt/gvt.h @@ -732,7 +732,7 @@ static inline int intel_gvt_read_gpa(struct intel_vgpu *vgpu, unsigned long gpa, { if (!vgpu->attached) return -ESRCH; - return vfio_dma_rw(vgpu->vfio_group, gpa, buf, len, false); + return vfio_dma_rw(&vgpu->vfio_device, gpa, buf, len, false); } /** @@ -750,7 +750,7 @@ static inline int intel_gvt_write_gpa(struct intel_vgpu *vgpu, { if (!vgpu->attached) return -ESRCH; - return vfio_dma_rw(vgpu->vfio_group, gpa, buf, len, true); + return vfio_dma_rw(&vgpu->vfio_device, gpa, buf, len, true); } void intel_gvt_debugfs_remove_vgpu(struct intel_vgpu *vgpu); diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index 0f85e789fbd3..1dfd019aaaae 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -2323,32 +2323,28 @@ EXPORT_SYMBOL(vfio_group_unpin_pages); * As the read/write of user space memory is conducted via the CPUs and is * not a real device DMA, it is not necessary to pin the user space memory. * - * The caller needs to call vfio_group_get_external_user() or - * vfio_group_get_external_user_from_dev() prior to calling this interface, - * so as to prevent the VFIO group from disposal in the middle of the call. - * But it can keep the reference to the VFIO group for several calls into - * this interface. - * After finishing using of the VFIO group, the caller needs to release the - * VFIO group by calling vfio_group_put_external_user(). - * - * @group [in] : VFIO group + * @device [in] : VFIO device * @user_iova [in] : base IOVA of a user space buffer * @data [in] : pointer to kernel buffer * @len [in] : kernel buffer length * @write : indicate read or write * Return error code on failure or 0 on success. 
*/ -int vfio_dma_rw(struct vfio_group *group, dma_addr_t user_iova, - void *data, size_t len, bool write) +int vfio_dma_rw(struct vfio_device *device, dma_addr_t user_iova, void *data, + size_t len, bool write) { struct vfio_container *container; struct vfio_iommu_driver *driver; int ret = 0; - if (!group || !data || len <= 0) + if (!data || len <= 0) return -EINVAL; - container = group->container; + ret = vfio_group_add_container_user(device->group); + if (ret) + return ret; + + container = device->group->container; driver = container->iommu_driver; if (likely(driver && driver->ops->dma_rw)) @@ -2357,6 +2353,8 @@ int vfio_dma_rw(struct vfio_group *group, dma_addr_t user_iova, else ret = -ENOTTY; + vfio_group_try_dissolve_container(device->group); + return ret; } EXPORT_SYMBOL(vfio_dma_rw); diff --git a/include/linux/vfio.h b/include/linux/vfio.h index bddc70f88899..8a1510258717 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -161,7 +161,7 @@ extern int vfio_group_pin_pages(struct vfio_group *group, extern int vfio_group_unpin_pages(struct vfio_group *group, unsigned long *user_iova_pfn, int npage); -extern int vfio_dma_rw(struct vfio_group *group, dma_addr_t user_iova, +extern int vfio_dma_rw(struct vfio_device *device, dma_addr_t user_iova, void *data, size_t len, bool write); extern struct iommu_domain *vfio_group_iommu_domain(struct vfio_group *group); From 5eb20a78c032da9c5d00090953c1bed6c4e3f143 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 11 May 2022 13:12:59 -0600 Subject: [PATCH 116/334] drm/i915/gvt: Change from vfio_group_(un)pin_pages to vfio_(un)pin_pages Use the existing vfio_device versions of vfio_(un)pin_pages(). There is no reason to use a group interface here, kvmgt has easy access to a vfio_device. Delete kvmgt_vdev::vfio_group since these calls were the last users. 
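For illustration, a pin/use/unpin round trip against the vfio_device-based interface (a placeholder sketch, not kvmgt code; demo_with_guest_page() and its callback are invented names) looks like:

    #include <linux/vfio.h>
    #include <linux/iommu.h>

    /* Pin one guest pfn, hand the resulting host pfn to @fn, then unpin it. */
    static int demo_with_guest_page(struct vfio_device *vdev, unsigned long gfn,
                                    int (*fn)(unsigned long hpfn, void *data),
                                    void *data)
    {
            unsigned long hpfn;
            int ret;

            ret = vfio_pin_pages(vdev, &gfn, 1, IOMMU_READ | IOMMU_WRITE, &hpfn);
            if (ret != 1)
                    return ret < 0 ? ret : -EFAULT;

            ret = fn(hpfn, data);

            vfio_unpin_pages(vdev, &gfn, 1);
            return ret;
    }

This mirrors what gvt_pin_guest_page() and gvt_unpin_guest_page() do per page in the hunks below.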
Reviewed-by: Kevin Tian Reviewed-by: Christoph Hellwig Signed-off-by: Jason Gunthorpe Acked-by: Zhi Wang Link: https://lore.kernel.org/r/5-v4-8045e76bf00b+13d-vfio_mdev_no_group_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/gpu/drm/i915/gvt/gvt.h | 1 - drivers/gpu/drm/i915/gvt/kvmgt.c | 27 ++++++--------------------- 2 files changed, 6 insertions(+), 22 deletions(-) diff --git a/drivers/gpu/drm/i915/gvt/gvt.h b/drivers/gpu/drm/i915/gvt/gvt.h index 5a28ee965b7f..2af4c83e733c 100644 --- a/drivers/gpu/drm/i915/gvt/gvt.h +++ b/drivers/gpu/drm/i915/gvt/gvt.h @@ -231,7 +231,6 @@ struct intel_vgpu { struct kvm *kvm; struct work_struct release_work; atomic_t released; - struct vfio_group *vfio_group; struct kvm_page_track_notifier_node track_node; #define NR_BKT (1 << 18) diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c index 1cec4f1fdfac..7655ffa97d51 100644 --- a/drivers/gpu/drm/i915/gvt/kvmgt.c +++ b/drivers/gpu/drm/i915/gvt/kvmgt.c @@ -243,7 +243,7 @@ static void gvt_unpin_guest_page(struct intel_vgpu *vgpu, unsigned long gfn, for (npage = 0; npage < total_pages; npage++) { unsigned long cur_gfn = gfn + npage; - ret = vfio_group_unpin_pages(vgpu->vfio_group, &cur_gfn, 1); + ret = vfio_unpin_pages(&vgpu->vfio_device, &cur_gfn, 1); drm_WARN_ON(&i915->drm, ret != 1); } } @@ -266,8 +266,8 @@ static int gvt_pin_guest_page(struct intel_vgpu *vgpu, unsigned long gfn, unsigned long cur_gfn = gfn + npage; unsigned long pfn; - ret = vfio_group_pin_pages(vgpu->vfio_group, &cur_gfn, 1, - IOMMU_READ | IOMMU_WRITE, &pfn); + ret = vfio_pin_pages(&vgpu->vfio_device, &cur_gfn, 1, + IOMMU_READ | IOMMU_WRITE, &pfn); if (ret != 1) { gvt_vgpu_err("vfio_pin_pages failed for gfn 0x%lx, ret %d\n", cur_gfn, ret); @@ -804,7 +804,6 @@ static int intel_vgpu_open_device(struct vfio_device *vfio_dev) struct intel_vgpu *vgpu = vfio_dev_to_vgpu(vfio_dev); unsigned long events; int ret; - struct vfio_group *vfio_group; vgpu->iommu_notifier.notifier_call = intel_vgpu_iommu_notifier; vgpu->group_notifier.notifier_call = intel_vgpu_group_notifier; @@ -827,28 +826,19 @@ static int intel_vgpu_open_device(struct vfio_device *vfio_dev) goto undo_iommu; } - vfio_group = - vfio_group_get_external_user_from_dev(vgpu->vfio_device.dev); - if (IS_ERR_OR_NULL(vfio_group)) { - ret = !vfio_group ? 
-EFAULT : PTR_ERR(vfio_group); - gvt_vgpu_err("vfio_group_get_external_user_from_dev failed\n"); - goto undo_register; - } - vgpu->vfio_group = vfio_group; - ret = -EEXIST; if (vgpu->attached) - goto undo_group; + goto undo_register; ret = -ESRCH; if (!vgpu->kvm || vgpu->kvm->mm != current->mm) { gvt_vgpu_err("KVM is required to use Intel vGPU\n"); - goto undo_group; + goto undo_register; } ret = -EEXIST; if (__kvmgt_vgpu_exist(vgpu)) - goto undo_group; + goto undo_register; vgpu->attached = true; kvm_get_kvm(vgpu->kvm); @@ -868,10 +858,6 @@ static int intel_vgpu_open_device(struct vfio_device *vfio_dev) atomic_set(&vgpu->released, 0); return 0; -undo_group: - vfio_group_put_external_user(vgpu->vfio_group); - vgpu->vfio_group = NULL; - undo_register: vfio_unregister_notifier(vfio_dev, VFIO_GROUP_NOTIFY, &vgpu->group_notifier); @@ -925,7 +911,6 @@ static void __intel_vgpu_release(struct intel_vgpu *vgpu) gvt_cache_destroy(vgpu); intel_vgpu_release_msi_eventfd_ctx(vgpu); - vfio_group_put_external_user(vgpu->vfio_group); vgpu->kvm = NULL; vgpu->attached = false; From 231657b345046552b35670b45b892cfce2610e12 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 11 May 2022 13:13:00 -0600 Subject: [PATCH 117/334] vfio: Remove dead code Now that callers have been updated to use the vfio_device APIs the driver facing group interface is no longer used, delete it: - vfio_group_get_external_user_from_dev() - vfio_group_pin_pages() - vfio_group_unpin_pages() - vfio_group_iommu_domain() -- Reviewed-by: Christoph Hellwig Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/6-v4-8045e76bf00b+13d-vfio_mdev_no_group_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio.c | 151 ------------------------------------------- include/linux/vfio.h | 11 ---- 2 files changed, 162 deletions(-) diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index 1dfd019aaaae..b9396424a9f9 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -1947,44 +1947,6 @@ struct vfio_group *vfio_group_get_external_user(struct file *filep) } EXPORT_SYMBOL_GPL(vfio_group_get_external_user); -/* - * External user API, exported by symbols to be linked dynamically. - * The external user passes in a device pointer - * to verify that: - * - A VFIO group is assiciated with the device; - * - IOMMU is set for the group. - * If both checks passed, vfio_group_get_external_user_from_dev() - * increments the container user counter to prevent the VFIO group - * from disposal before external user exits and returns the pointer - * to the VFIO group. - * - * When the external user finishes using the VFIO group, it calls - * vfio_group_put_external_user() to release the VFIO group and - * decrement the container user counter. - * - * @dev [in] : device - * Return error PTR or pointer to VFIO group. 
- */ - -struct vfio_group *vfio_group_get_external_user_from_dev(struct device *dev) -{ - struct vfio_group *group; - int ret; - - group = vfio_group_get_from_dev(dev); - if (!group) - return ERR_PTR(-ENODEV); - - ret = vfio_group_add_container_user(group); - if (ret) { - vfio_group_put(group); - return ERR_PTR(ret); - } - - return group; -} -EXPORT_SYMBOL_GPL(vfio_group_get_external_user_from_dev); - void vfio_group_put_external_user(struct vfio_group *group) { vfio_group_try_dissolve_container(group); @@ -2218,101 +2180,6 @@ int vfio_unpin_pages(struct vfio_device *device, unsigned long *user_pfn, } EXPORT_SYMBOL(vfio_unpin_pages); -/* - * Pin a set of guest IOVA PFNs and return their associated host PFNs for a - * VFIO group. - * - * The caller needs to call vfio_group_get_external_user() or - * vfio_group_get_external_user_from_dev() prior to calling this interface, - * so as to prevent the VFIO group from disposal in the middle of the call. - * But it can keep the reference to the VFIO group for several calls into - * this interface. - * After finishing using of the VFIO group, the caller needs to release the - * VFIO group by calling vfio_group_put_external_user(). - * - * @group [in] : VFIO group - * @user_iova_pfn [in] : array of user/guest IOVA PFNs to be pinned. - * @npage [in] : count of elements in user_iova_pfn array. - * This count should not be greater - * VFIO_PIN_PAGES_MAX_ENTRIES. - * @prot [in] : protection flags - * @phys_pfn [out] : array of host PFNs - * Return error or number of pages pinned. - */ -int vfio_group_pin_pages(struct vfio_group *group, - unsigned long *user_iova_pfn, int npage, - int prot, unsigned long *phys_pfn) -{ - struct vfio_container *container; - struct vfio_iommu_driver *driver; - int ret; - - if (!group || !user_iova_pfn || !phys_pfn || !npage) - return -EINVAL; - - if (group->dev_counter > 1) - return -EINVAL; - - if (npage > VFIO_PIN_PAGES_MAX_ENTRIES) - return -E2BIG; - - container = group->container; - driver = container->iommu_driver; - if (likely(driver && driver->ops->pin_pages)) - ret = driver->ops->pin_pages(container->iommu_data, - group->iommu_group, user_iova_pfn, - npage, prot, phys_pfn); - else - ret = -ENOTTY; - - return ret; -} -EXPORT_SYMBOL(vfio_group_pin_pages); - -/* - * Unpin a set of guest IOVA PFNs for a VFIO group. - * - * The caller needs to call vfio_group_get_external_user() or - * vfio_group_get_external_user_from_dev() prior to calling this interface, - * so as to prevent the VFIO group from disposal in the middle of the call. - * But it can keep the reference to the VFIO group for several calls into - * this interface. - * After finishing using of the VFIO group, the caller needs to release the - * VFIO group by calling vfio_group_put_external_user(). - * - * @group [in] : vfio group - * @user_iova_pfn [in] : array of user/guest IOVA PFNs to be unpinned. - * @npage [in] : count of elements in user_iova_pfn array. - * This count should not be greater than - * VFIO_PIN_PAGES_MAX_ENTRIES. - * Return error or number of pages unpinned. 
- */ -int vfio_group_unpin_pages(struct vfio_group *group, - unsigned long *user_iova_pfn, int npage) -{ - struct vfio_container *container; - struct vfio_iommu_driver *driver; - int ret; - - if (!group || !user_iova_pfn || !npage) - return -EINVAL; - - if (npage > VFIO_PIN_PAGES_MAX_ENTRIES) - return -E2BIG; - - container = group->container; - driver = container->iommu_driver; - if (likely(driver && driver->ops->unpin_pages)) - ret = driver->ops->unpin_pages(container->iommu_data, - user_iova_pfn, npage); - else - ret = -ENOTTY; - - return ret; -} -EXPORT_SYMBOL(vfio_group_unpin_pages); - - /* * This interface allows the CPUs to perform some sort of virtual DMA on * behalf of the device. @@ -2516,24 +2383,6 @@ int vfio_unregister_notifier(struct vfio_device *device, } EXPORT_SYMBOL(vfio_unregister_notifier); -struct iommu_domain *vfio_group_iommu_domain(struct vfio_group *group) -{ - struct vfio_container *container; - struct vfio_iommu_driver *driver; - - if (!group) - return ERR_PTR(-EINVAL); - - container = group->container; - driver = container->iommu_driver; - if (likely(driver && driver->ops->group_iommu_domain)) - return driver->ops->group_iommu_domain(container->iommu_data, - group->iommu_group); - - return ERR_PTR(-ENOTTY); -} -EXPORT_SYMBOL_GPL(vfio_group_iommu_domain); - /* * Module/class support */ diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 8a1510258717..6195edd2edcd 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -140,8 +140,6 @@ int vfio_mig_get_next_state(struct vfio_device *device, */ extern struct vfio_group *vfio_group_get_external_user(struct file *filep); extern void vfio_group_put_external_user(struct vfio_group *group); -extern struct vfio_group *vfio_group_get_external_user_from_dev(struct device - *dev); extern bool vfio_external_group_match_file(struct vfio_group *group, struct file *filep); extern int vfio_external_user_iommu_id(struct vfio_group *group); @@ -154,18 +152,9 @@ extern int vfio_pin_pages(struct vfio_device *device, unsigned long *user_pfn, int npage, int prot, unsigned long *phys_pfn); extern int vfio_unpin_pages(struct vfio_device *device, unsigned long *user_pfn, int npage); - -extern int vfio_group_pin_pages(struct vfio_group *group, - unsigned long *user_iova_pfn, int npage, - int prot, unsigned long *phys_pfn); -extern int vfio_group_unpin_pages(struct vfio_group *group, - unsigned long *user_iova_pfn, int npage); - extern int vfio_dma_rw(struct vfio_device *device, dma_addr_t user_iova, void *data, size_t len, bool write); -extern struct iommu_domain *vfio_group_iommu_domain(struct vfio_group *group); - /* each type has independent events */ enum vfio_notify_type { VFIO_IOMMU_NOTIFY = 0, From eadd86f835c63769febbd056dfcb70dafef0d4b3 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 11 May 2022 13:13:00 -0600 Subject: [PATCH 118/334] vfio: Remove calls to vfio_group_add_container_user() When the open_device() op is called the container_users is incremented and held incremented until close_device(). Thus, so long as drivers call functions within their open_device()/close_device() region they do not need to worry about the container_users. These functions can all only be called between open_device() and close_device(): vfio_pin_pages() vfio_unpin_pages() vfio_dma_rw() vfio_register_notifier() vfio_unregister_notifier() Eliminate the calls to vfio_group_add_container_user() and add vfio_assert_device_open() to detect driver mis-use. 
This causes the close_device() op to check device->open_count so always leave it elevated while calling the op. Reviewed-by: Christoph Hellwig Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/7-v4-8045e76bf00b+13d-vfio_mdev_no_group_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio.c | 80 ++++++++++----------------------------------- 1 file changed, 17 insertions(+), 63 deletions(-) diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index b9396424a9f9..0339adeee16b 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -1330,6 +1330,12 @@ static int vfio_group_add_container_user(struct vfio_group *group) static const struct file_operations vfio_device_fops; +/* true if the vfio_device has open_device() called but not close_device() */ +static bool vfio_assert_device_open(struct vfio_device *device) +{ + return !WARN_ON_ONCE(!READ_ONCE(device->open_count)); +} + static int vfio_group_get_device_fd(struct vfio_group *group, char *buf) { struct vfio_device *device; @@ -1544,8 +1550,10 @@ static int vfio_device_fops_release(struct inode *inode, struct file *filep) struct vfio_device *device = filep->private_data; mutex_lock(&device->dev_set->lock); - if (!--device->open_count && device->ops->close_device) + vfio_assert_device_open(device); + if (device->open_count == 1 && device->ops->close_device) device->ops->close_device(device); + device->open_count--; mutex_unlock(&device->dev_set->lock); module_put(device->dev->driver->owner); @@ -2112,7 +2120,8 @@ int vfio_pin_pages(struct vfio_device *device, unsigned long *user_pfn, struct vfio_iommu_driver *driver; int ret; - if (!user_pfn || !phys_pfn || !npage) + if (!user_pfn || !phys_pfn || !npage || + !vfio_assert_device_open(device)) return -EINVAL; if (npage > VFIO_PIN_PAGES_MAX_ENTRIES) @@ -2121,10 +2130,6 @@ int vfio_pin_pages(struct vfio_device *device, unsigned long *user_pfn, if (group->dev_counter > 1) return -EINVAL; - ret = vfio_group_add_container_user(group); - if (ret) - return ret; - container = group->container; driver = container->iommu_driver; if (likely(driver && driver->ops->pin_pages)) @@ -2134,8 +2139,6 @@ int vfio_pin_pages(struct vfio_device *device, unsigned long *user_pfn, else ret = -ENOTTY; - vfio_group_try_dissolve_container(group); - return ret; } EXPORT_SYMBOL(vfio_pin_pages); @@ -2156,16 +2159,12 @@ int vfio_unpin_pages(struct vfio_device *device, unsigned long *user_pfn, struct vfio_iommu_driver *driver; int ret; - if (!user_pfn || !npage) + if (!user_pfn || !npage || !vfio_assert_device_open(device)) return -EINVAL; if (npage > VFIO_PIN_PAGES_MAX_ENTRIES) return -E2BIG; - ret = vfio_group_add_container_user(device->group); - if (ret) - return ret; - container = device->group->container; driver = container->iommu_driver; if (likely(driver && driver->ops->unpin_pages)) @@ -2174,8 +2173,6 @@ int vfio_unpin_pages(struct vfio_device *device, unsigned long *user_pfn, else ret = -ENOTTY; - vfio_group_try_dissolve_container(device->group); - return ret; } EXPORT_SYMBOL(vfio_unpin_pages); @@ -2204,13 +2201,9 @@ int vfio_dma_rw(struct vfio_device *device, dma_addr_t user_iova, void *data, struct vfio_iommu_driver *driver; int ret = 0; - if (!data || len <= 0) + if (!data || len <= 0 || !vfio_assert_device_open(device)) return -EINVAL; - ret = vfio_group_add_container_user(device->group); - if (ret) - return ret; - container = device->group->container; driver = container->iommu_driver; @@ -2219,9 +2212,6 @@ int vfio_dma_rw(struct vfio_device *device, 
dma_addr_t user_iova, void *data, user_iova, data, len, write); else ret = -ENOTTY; - - vfio_group_try_dissolve_container(device->group); - return ret; } EXPORT_SYMBOL(vfio_dma_rw); @@ -2234,10 +2224,6 @@ static int vfio_register_iommu_notifier(struct vfio_group *group, struct vfio_iommu_driver *driver; int ret; - ret = vfio_group_add_container_user(group); - if (ret) - return -EINVAL; - container = group->container; driver = container->iommu_driver; if (likely(driver && driver->ops->register_notifier)) @@ -2245,9 +2231,6 @@ static int vfio_register_iommu_notifier(struct vfio_group *group, events, nb); else ret = -ENOTTY; - - vfio_group_try_dissolve_container(group); - return ret; } @@ -2258,10 +2241,6 @@ static int vfio_unregister_iommu_notifier(struct vfio_group *group, struct vfio_iommu_driver *driver; int ret; - ret = vfio_group_add_container_user(group); - if (ret) - return -EINVAL; - container = group->container; driver = container->iommu_driver; if (likely(driver && driver->ops->unregister_notifier)) @@ -2269,9 +2248,6 @@ static int vfio_unregister_iommu_notifier(struct vfio_group *group, nb); else ret = -ENOTTY; - - vfio_group_try_dissolve_container(group); - return ret; } @@ -2300,10 +2276,6 @@ static int vfio_register_group_notifier(struct vfio_group *group, if (*events) return -EINVAL; - ret = vfio_group_add_container_user(group); - if (ret) - return -EINVAL; - ret = blocking_notifier_chain_register(&group->notifier, nb); /* @@ -2313,25 +2285,6 @@ static int vfio_register_group_notifier(struct vfio_group *group, if (!ret && set_kvm && group->kvm) blocking_notifier_call_chain(&group->notifier, VFIO_GROUP_NOTIFY_SET_KVM, group->kvm); - - vfio_group_try_dissolve_container(group); - - return ret; -} - -static int vfio_unregister_group_notifier(struct vfio_group *group, - struct notifier_block *nb) -{ - int ret; - - ret = vfio_group_add_container_user(group); - if (ret) - return -EINVAL; - - ret = blocking_notifier_chain_unregister(&group->notifier, nb); - - vfio_group_try_dissolve_container(group); - return ret; } @@ -2342,7 +2295,8 @@ int vfio_register_notifier(struct vfio_device *device, struct vfio_group *group = device->group; int ret; - if (!nb || !events || (*events == 0)) + if (!nb || !events || (*events == 0) || + !vfio_assert_device_open(device)) return -EINVAL; switch (type) { @@ -2366,7 +2320,7 @@ int vfio_unregister_notifier(struct vfio_device *device, struct vfio_group *group = device->group; int ret; - if (!nb) + if (!nb || !vfio_assert_device_open(device)) return -EINVAL; switch (type) { @@ -2374,7 +2328,7 @@ int vfio_unregister_notifier(struct vfio_device *device, ret = vfio_unregister_iommu_notifier(group, nb); break; case VFIO_GROUP_NOTIFY: - ret = vfio_unregister_group_notifier(group, nb); + ret = blocking_notifier_chain_unregister(&group->notifier, nb); break; default: ret = -EINVAL; From 91be0bd6c6cf21328017e990d3ceeb00f03821fd Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 11 May 2022 13:19:07 -0600 Subject: [PATCH 119/334] vfio/pci: Have all VFIO PCI drivers store the vfio_pci_core_device in drvdata Having a consistent pointer in the drvdata will allow the next patch to make use of the drvdata from some of the core code helpers. Use a WARN_ON inside vfio_pci_core_register_device() to detect drivers that miss this. 
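For illustration only, a variant driver following this convention would look roughly like the sketch below; the my_vf_* names and the empty ops table are placeholders and not part of this series:

#include <linux/pci.h>
#include <linux/slab.h>
#include <linux/vfio_pci_core.h>

/* Placeholder ops table; a real variant driver fills this in. */
static const struct vfio_device_ops my_vf_ops;

/* Hypothetical variant-driver state embedding the core device. */
struct my_vf_core_device {
	struct vfio_pci_core_device core_device;
	/* driver-private state follows */
};

static struct my_vf_core_device *my_vf_drvdata(struct pci_dev *pdev)
{
	/* drvdata always points at the embedded vfio_pci_core_device */
	struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev);

	return container_of(core_device, struct my_vf_core_device, core_device);
}

static int my_vf_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
	struct my_vf_core_device *vdev;
	int ret;

	vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);
	if (!vdev)
		return -ENOMEM;

	vfio_pci_core_init_device(&vdev->core_device, pdev, &my_vf_ops);

	/* Set drvdata before registering; the core now WARN_ONs if it is missing. */
	dev_set_drvdata(&pdev->dev, &vdev->core_device);
	ret = vfio_pci_core_register_device(&vdev->core_device);
	if (ret) {
		vfio_pci_core_uninit_device(&vdev->core_device);
		kfree(vdev);
	}
	return ret;
}

The remove() and PCI error callbacks can then recover the driver structure with my_vf_drvdata(pdev) instead of searching from the device, which is what the hisi_acc and mlx5 hunks below switch to.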
Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/1-v4-c841817a0349+8f-vfio_get_from_dev_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 15 +++++++++++---- drivers/vfio/pci/mlx5/main.c | 15 +++++++++++---- drivers/vfio/pci/vfio_pci.c | 2 +- drivers/vfio/pci/vfio_pci_core.c | 4 ++++ 4 files changed, 27 insertions(+), 9 deletions(-) diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c index 767b5d47631a..e92376837b29 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c @@ -337,6 +337,14 @@ static int vf_qm_cache_wb(struct hisi_qm *qm) return 0; } +static struct hisi_acc_vf_core_device *hssi_acc_drvdata(struct pci_dev *pdev) +{ + struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev); + + return container_of(core_device, struct hisi_acc_vf_core_device, + core_device); +} + static void vf_qm_fun_reset(struct hisi_acc_vf_core_device *hisi_acc_vdev, struct hisi_qm *qm) { @@ -962,7 +970,7 @@ hisi_acc_vfio_pci_get_device_state(struct vfio_device *vdev, static void hisi_acc_vf_pci_aer_reset_done(struct pci_dev *pdev) { - struct hisi_acc_vf_core_device *hisi_acc_vdev = dev_get_drvdata(&pdev->dev); + struct hisi_acc_vf_core_device *hisi_acc_vdev = hssi_acc_drvdata(pdev); if (hisi_acc_vdev->core_device.vdev.migration_flags != VFIO_MIGRATION_STOP_COPY) @@ -1274,11 +1282,10 @@ static int hisi_acc_vfio_pci_probe(struct pci_dev *pdev, const struct pci_device &hisi_acc_vfio_pci_ops); } + dev_set_drvdata(&pdev->dev, &hisi_acc_vdev->core_device); ret = vfio_pci_core_register_device(&hisi_acc_vdev->core_device); if (ret) goto out_free; - - dev_set_drvdata(&pdev->dev, hisi_acc_vdev); return 0; out_free: @@ -1289,7 +1296,7 @@ out_free: static void hisi_acc_vfio_pci_remove(struct pci_dev *pdev) { - struct hisi_acc_vf_core_device *hisi_acc_vdev = dev_get_drvdata(&pdev->dev); + struct hisi_acc_vf_core_device *hisi_acc_vdev = hssi_acc_drvdata(pdev); vfio_pci_core_unregister_device(&hisi_acc_vdev->core_device); vfio_pci_core_uninit_device(&hisi_acc_vdev->core_device); diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index df8b572977da..dd1009b5ff9c 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -24,6 +24,14 @@ /* Arbitrary to prevent userspace from consuming endless memory */ #define MAX_MIGRATION_SIZE (512*1024*1024) +static struct mlx5vf_pci_core_device *mlx5vf_drvdata(struct pci_dev *pdev) +{ + struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev); + + return container_of(core_device, struct mlx5vf_pci_core_device, + core_device); +} + static struct page * mlx5vf_get_migration_page(struct mlx5_vf_migration_file *migf, unsigned long offset) @@ -518,7 +526,7 @@ static int mlx5vf_pci_get_device_state(struct vfio_device *vdev, static void mlx5vf_pci_aer_reset_done(struct pci_dev *pdev) { - struct mlx5vf_pci_core_device *mvdev = dev_get_drvdata(&pdev->dev); + struct mlx5vf_pci_core_device *mvdev = mlx5vf_drvdata(pdev); if (!mvdev->migrate_cap) return; @@ -592,11 +600,10 @@ static int mlx5vf_pci_probe(struct pci_dev *pdev, return -ENOMEM; vfio_pci_core_init_device(&mvdev->core_device, pdev, &mlx5vf_pci_ops); mlx5vf_cmd_set_migratable(mvdev); + dev_set_drvdata(&pdev->dev, &mvdev->core_device); ret = vfio_pci_core_register_device(&mvdev->core_device); if (ret) goto out_free; - - dev_set_drvdata(&pdev->dev, mvdev); return 0; out_free: @@ -608,7 +615,7 @@ 
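As a sketch of the locking argument above (again with a placeholder name), a PCI driver callback can use the drvdata directly; this mirrors what the vfio_pci.c and vfio_pci_core.c hunks below end up doing:

#include <linux/device.h>
#include <linux/pci.h>
#include <linux/vfio_pci_core.h>

static int my_vf_sriov_configure(struct pci_dev *pdev, int nr_virtfn)
{
	/*
	 * sriov_numvfs_store() calls this with device_lock() held, so
	 * remove() cannot run concurrently and the drvdata (which holds a
	 * reference on the vfio_device) stays valid for the whole call.
	 */
	struct vfio_pci_core_device *vdev = dev_get_drvdata(&pdev->dev);

	return vfio_pci_core_sriov_configure(vdev, nr_virtfn);
}

vfio_pci_core_sriov_configure() keeps its device_lock_assert(&pdev->dev), so a caller that does not hold the lock is flagged by the assertion rather than turning into a subtle use-after-free.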
out_free: static void mlx5vf_pci_remove(struct pci_dev *pdev) { - struct mlx5vf_pci_core_device *mvdev = dev_get_drvdata(&pdev->dev); + struct mlx5vf_pci_core_device *mvdev = mlx5vf_drvdata(pdev); vfio_pci_core_unregister_device(&mvdev->core_device); mlx5vf_cmd_remove_migratable(mvdev); diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 2b047469e02f..8c990a1a7def 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -151,10 +151,10 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) return -ENOMEM; vfio_pci_core_init_device(vdev, pdev, &vfio_pci_ops); + dev_set_drvdata(&pdev->dev, vdev); ret = vfio_pci_core_register_device(vdev); if (ret) goto out_free; - dev_set_drvdata(&pdev->dev, vdev); return 0; out_free: diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 06b6f3594a13..65587fd5c021 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -1821,6 +1821,10 @@ int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev) struct pci_dev *pdev = vdev->pdev; int ret; + /* Drivers must set the vfio_pci_core_device to their drvdata */ + if (WARN_ON(vdev != dev_get_drvdata(&vdev->pdev->dev))) + return -EINVAL; + if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL) return -EINVAL; From ff806cbd90bd2cc3d08a026e26157e5b5a6b5e03 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 11 May 2022 13:19:07 -0600 Subject: [PATCH 120/334] vfio/pci: Remove vfio_device_get_from_dev() The last user of this function is in PCI callbacks that want to convert their struct pci_dev to a vfio_device. Instead of searching use the vfio_device available trivially through the drvdata. When a callback in the device_driver is called, the caller must hold the device_lock() on dev. The purpose of the device_lock is to prevent remove() from being called (see __device_release_driver), and allow the driver to safely interact with its drvdata without races. The PCI core correctly follows this and holds the device_lock() when calling error_detected (see report_error_detected) and sriov_configure (see sriov_numvfs_store). Further, since the drvdata holds a positive refcount on the vfio_device any access of the drvdata, under the device_lock(), from a driver callback needs no further protection or refcounting. Thus the remark in the vfio_device_get_from_dev() comment does not apply here, VFIO PCI drivers all call vfio_unregister_group_dev() from their remove callbacks under the device_lock() and cannot race with the remaining callers. 
Reviewed-by: Kevin Tian Reviewed-by: Shameer Kolothum Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/2-v4-c841817a0349+8f-vfio_get_from_dev_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci.c | 4 +++- drivers/vfio/pci/vfio_pci_core.c | 32 +++++-------------------- drivers/vfio/vfio.c | 41 +------------------------------- include/linux/vfio.h | 2 -- include/linux/vfio_pci_core.h | 3 ++- 5 files changed, 12 insertions(+), 70 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 8c990a1a7def..cdee5f81f49d 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -174,10 +174,12 @@ static void vfio_pci_remove(struct pci_dev *pdev) static int vfio_pci_sriov_configure(struct pci_dev *pdev, int nr_virtfn) { + struct vfio_pci_core_device *vdev = dev_get_drvdata(&pdev->dev); + if (!enable_sriov) return -ENOENT; - return vfio_pci_core_sriov_configure(pdev, nr_virtfn); + return vfio_pci_core_sriov_configure(vdev, nr_virtfn); } static const struct pci_device_id vfio_pci_table[] = { diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 65587fd5c021..100ab98c7ff0 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -1894,9 +1894,7 @@ EXPORT_SYMBOL_GPL(vfio_pci_core_register_device); void vfio_pci_core_unregister_device(struct vfio_pci_core_device *vdev) { - struct pci_dev *pdev = vdev->pdev; - - vfio_pci_core_sriov_configure(pdev, 0); + vfio_pci_core_sriov_configure(vdev, 0); vfio_unregister_group_dev(&vdev->vdev); @@ -1911,14 +1909,7 @@ EXPORT_SYMBOL_GPL(vfio_pci_core_unregister_device); pci_ers_result_t vfio_pci_core_aer_err_detected(struct pci_dev *pdev, pci_channel_state_t state) { - struct vfio_pci_core_device *vdev; - struct vfio_device *device; - - device = vfio_device_get_from_dev(&pdev->dev); - if (device == NULL) - return PCI_ERS_RESULT_DISCONNECT; - - vdev = container_of(device, struct vfio_pci_core_device, vdev); + struct vfio_pci_core_device *vdev = dev_get_drvdata(&pdev->dev); mutex_lock(&vdev->igate); @@ -1927,26 +1918,18 @@ pci_ers_result_t vfio_pci_core_aer_err_detected(struct pci_dev *pdev, mutex_unlock(&vdev->igate); - vfio_device_put(device); - return PCI_ERS_RESULT_CAN_RECOVER; } EXPORT_SYMBOL_GPL(vfio_pci_core_aer_err_detected); -int vfio_pci_core_sriov_configure(struct pci_dev *pdev, int nr_virtfn) +int vfio_pci_core_sriov_configure(struct vfio_pci_core_device *vdev, + int nr_virtfn) { - struct vfio_pci_core_device *vdev; - struct vfio_device *device; + struct pci_dev *pdev = vdev->pdev; int ret = 0; device_lock_assert(&pdev->dev); - device = vfio_device_get_from_dev(&pdev->dev); - if (!device) - return -ENODEV; - - vdev = container_of(device, struct vfio_pci_core_device, vdev); - if (nr_virtfn) { mutex_lock(&vfio_pci_sriov_pfs_mutex); /* @@ -1964,8 +1947,7 @@ int vfio_pci_core_sriov_configure(struct pci_dev *pdev, int nr_virtfn) ret = pci_enable_sriov(pdev, nr_virtfn); if (ret) goto out_del; - ret = nr_virtfn; - goto out_put; + return nr_virtfn; } pci_disable_sriov(pdev); @@ -1975,8 +1957,6 @@ out_del: list_del_init(&vdev->sriov_pfs_item); out_unlock: mutex_unlock(&vfio_pci_sriov_pfs_mutex); -out_put: - vfio_device_put(device); return ret; } EXPORT_SYMBOL_GPL(vfio_pci_core_sriov_configure); diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index 0339adeee16b..56b80c39a3c0 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -473,31 +473,15 @@ static void vfio_group_get(struct vfio_group *group) 
refcount_inc(&group->users); } -static struct vfio_group *vfio_group_get_from_dev(struct device *dev) -{ - struct iommu_group *iommu_group; - struct vfio_group *group; - - iommu_group = iommu_group_get(dev); - if (!iommu_group) - return NULL; - - group = vfio_group_get_from_iommu(iommu_group); - iommu_group_put(iommu_group); - - return group; -} - /* * Device objects - create, release, get, put, search */ /* Device reference always implies a group reference */ -void vfio_device_put(struct vfio_device *device) +static void vfio_device_put(struct vfio_device *device) { if (refcount_dec_and_test(&device->refcount)) complete(&device->comp); } -EXPORT_SYMBOL_GPL(vfio_device_put); static bool vfio_device_try_get(struct vfio_device *device) { @@ -831,29 +815,6 @@ int vfio_register_emulated_iommu_dev(struct vfio_device *device) } EXPORT_SYMBOL_GPL(vfio_register_emulated_iommu_dev); -/* - * Get a reference to the vfio_device for a device. Even if the - * caller thinks they own the device, they could be racing with a - * release call path, so we can't trust drvdata for the shortcut. - * Go the long way around, from the iommu_group to the vfio_group - * to the vfio_device. - */ -struct vfio_device *vfio_device_get_from_dev(struct device *dev) -{ - struct vfio_group *group; - struct vfio_device *device; - - group = vfio_group_get_from_dev(dev); - if (!group) - return NULL; - - device = vfio_group_get_device(group, dev); - vfio_group_put(group); - - return device; -} -EXPORT_SYMBOL_GPL(vfio_device_get_from_dev); - static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group, char *buf) { diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 6195edd2edcd..be1e7db4a314 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -125,8 +125,6 @@ void vfio_uninit_group_dev(struct vfio_device *device); int vfio_register_group_dev(struct vfio_device *device); int vfio_register_emulated_iommu_dev(struct vfio_device *device); void vfio_unregister_group_dev(struct vfio_device *device); -extern struct vfio_device *vfio_device_get_from_dev(struct device *dev); -extern void vfio_device_put(struct vfio_device *device); int vfio_assign_device_set(struct vfio_device *device, void *set_id); diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index 48f2dd3c568c..23c176d4b073 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -227,8 +227,9 @@ void vfio_pci_core_init_device(struct vfio_pci_core_device *vdev, int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev); void vfio_pci_core_uninit_device(struct vfio_pci_core_device *vdev); void vfio_pci_core_unregister_device(struct vfio_pci_core_device *vdev); -int vfio_pci_core_sriov_configure(struct pci_dev *pdev, int nr_virtfn); extern const struct pci_error_handlers vfio_pci_core_err_handlers; +int vfio_pci_core_sriov_configure(struct vfio_pci_core_device *vdev, + int nr_virtfn); long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, unsigned long arg); int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags, From 6213f5d4d23c50d393a31dc8e351e63a1fd10dbe Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 5 May 2022 17:40:25 -0700 Subject: [PATCH 121/334] f2fs: don't need inode lock for system hidden quota Let's avoid false-alarmed lockdep warning. 
[ 58.914674] [T1501146] -> #2 (&sb->s_type->i_mutex_key#20){+.+.}-{3:3}: [ 58.915975] [T1501146] system_server: down_write+0x7c/0xe0 [ 58.916738] [T1501146] system_server: f2fs_quota_sync+0x60/0x1a8 [ 58.917563] [T1501146] system_server: block_operations+0x16c/0x43c [ 58.918410] [T1501146] system_server: f2fs_write_checkpoint+0x114/0x318 [ 58.919312] [T1501146] system_server: f2fs_issue_checkpoint+0x178/0x21c [ 58.920214] [T1501146] system_server: f2fs_sync_fs+0x48/0x6c [ 58.920999] [T1501146] system_server: f2fs_do_sync_file+0x334/0x738 [ 58.921862] [T1501146] system_server: f2fs_sync_file+0x30/0x48 [ 58.922667] [T1501146] system_server: __arm64_sys_fsync+0x84/0xf8 [ 58.923506] [T1501146] system_server: el0_svc_common.llvm.12821150825140585682+0xd8/0x20c [ 58.924604] [T1501146] system_server: do_el0_svc+0x28/0xa0 [ 58.925366] [T1501146] system_server: el0_svc+0x24/0x38 [ 58.926094] [T1501146] system_server: el0_sync_handler+0x88/0xec [ 58.926920] [T1501146] system_server: el0_sync+0x1b4/0x1c0 [ 58.927681] [T1501146] -> #1 (&sbi->cp_global_sem){+.+.}-{3:3}: [ 58.928889] [T1501146] system_server: down_write+0x7c/0xe0 [ 58.929650] [T1501146] system_server: f2fs_write_checkpoint+0xbc/0x318 [ 58.930541] [T1501146] system_server: f2fs_issue_checkpoint+0x178/0x21c [ 58.931443] [T1501146] system_server: f2fs_sync_fs+0x48/0x6c [ 58.932226] [T1501146] system_server: sync_filesystem+0xac/0x130 [ 58.933053] [T1501146] system_server: generic_shutdown_super+0x38/0x150 [ 58.933958] [T1501146] system_server: kill_block_super+0x24/0x58 [ 58.934791] [T1501146] system_server: kill_f2fs_super+0xcc/0x124 [ 58.935618] [T1501146] system_server: deactivate_locked_super+0x90/0x120 [ 58.936529] [T1501146] system_server: deactivate_super+0x74/0xac [ 58.937356] [T1501146] system_server: cleanup_mnt+0x128/0x168 [ 58.938150] [T1501146] system_server: __cleanup_mnt+0x18/0x28 [ 58.938944] [T1501146] system_server: task_work_run+0xb8/0x14c [ 58.939749] [T1501146] system_server: do_notify_resume+0x114/0x1e8 [ 58.940595] [T1501146] system_server: work_pending+0xc/0x5f0 [ 58.941375] [T1501146] -> #0 (&sbi->gc_lock){+.+.}-{3:3}: [ 58.942519] [T1501146] system_server: __lock_acquire+0x1270/0x2868 [ 58.943366] [T1501146] system_server: lock_acquire+0x114/0x294 [ 58.944169] [T1501146] system_server: down_write+0x7c/0xe0 [ 58.944930] [T1501146] system_server: f2fs_issue_checkpoint+0x13c/0x21c [ 58.945831] [T1501146] system_server: f2fs_sync_fs+0x48/0x6c [ 58.946614] [T1501146] system_server: f2fs_do_sync_file+0x334/0x738 [ 58.947472] [T1501146] system_server: f2fs_ioc_commit_atomic_write+0xc8/0x14c [ 58.948439] [T1501146] system_server: __f2fs_ioctl+0x674/0x154c [ 58.949253] [T1501146] system_server: f2fs_ioctl+0x54/0x88 [ 58.950018] [T1501146] system_server: __arm64_sys_ioctl+0xa8/0x110 [ 58.950865] [T1501146] system_server: el0_svc_common.llvm.12821150825140585682+0xd8/0x20c [ 58.951965] [T1501146] system_server: do_el0_svc+0x28/0xa0 [ 58.952727] [T1501146] system_server: el0_svc+0x24/0x38 [ 58.953454] [T1501146] system_server: el0_sync_handler+0x88/0xec [ 58.954279] [T1501146] system_server: el0_sync+0x1b4/0x1c0 Cc: stable@vger.kernel.org Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index f0cd454d8f88..aa51c30333d3 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2692,7 +2692,8 @@ int f2fs_quota_sync(struct super_block *sb, int type) if (!sb_has_quota_active(sb, cnt)) continue; - inode_lock(dqopt->files[cnt]); + 
if (!f2fs_sb_has_quota_ino(sbi)) + inode_lock(dqopt->files[cnt]); /* * do_quotactl @@ -2711,7 +2712,8 @@ int f2fs_quota_sync(struct super_block *sb, int type) f2fs_up_read(&sbi->quota_sem); f2fs_unlock_op(sbi); - inode_unlock(dqopt->files[cnt]); + if (!f2fs_sb_has_quota_ino(sbi)) + inode_unlock(dqopt->files[cnt]); if (ret) break; From 3db1de0e582c358dd013f3703cd55b5fe4076436 Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Thu, 28 Apr 2022 11:18:09 -0700 Subject: [PATCH 122/334] f2fs: change the current atomic write way Current atomic write has three major issues like below. - keeps the updates in non-reclaimable memory space and they are even hard to be migrated, which is not good for contiguous memory allocation. - disk spaces used for atomic files cannot be garbage collected, so this makes it difficult for the filesystem to be defragmented. - If atomic write operations hit the threshold of either memory usage or garbage collection failure count, All the atomic write operations will fail immediately. To resolve the issues, I will keep a COW inode internally for all the updates to be flushed from memory, when we need to flush them out in a situation like high memory pressure. These COW inodes will be tagged as orphan inodes to be reclaimed in case of sudden power-cut or system failure during atomic writes. Signed-off-by: Daeho Jeong Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 180 ++++++++++------ fs/f2fs/debug.c | 12 +- fs/f2fs/f2fs.h | 33 +-- fs/f2fs/file.c | 49 +++-- fs/f2fs/gc.c | 27 +-- fs/f2fs/inode.c | 3 +- fs/f2fs/namei.c | 28 ++- fs/f2fs/node.c | 4 - fs/f2fs/node.h | 1 - fs/f2fs/segment.c | 420 +++++++++++++----------------------- fs/f2fs/segment.h | 4 +- fs/f2fs/super.c | 6 +- include/trace/events/f2fs.h | 22 -- 13 files changed, 323 insertions(+), 466 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 9a1a526f2092..8763a4690aaf 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -69,8 +69,7 @@ static bool __is_cp_guaranteed(struct page *page) if (f2fs_is_compressed_page(page)) return false; - if ((S_ISREG(inode->i_mode) && - (f2fs_is_atomic_file(inode) || IS_NOQUOTA(inode))) || + if ((S_ISREG(inode->i_mode) && IS_NOQUOTA(inode)) || page_private_gcing(page)) return true; return false; @@ -2563,7 +2562,12 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio) bool ipu_force = false; int err = 0; - set_new_dnode(&dn, inode, NULL, NULL, 0); + /* Use COW inode to make dnode_of_data for atomic write */ + if (f2fs_is_atomic_file(inode)) + set_new_dnode(&dn, F2FS_I(inode)->cow_inode, NULL, NULL, 0); + else + set_new_dnode(&dn, inode, NULL, NULL, 0); + if (need_inplace_update(fio) && f2fs_lookup_extent_cache(inode, page->index, &ei)) { fio->old_blkaddr = ei.blk + page->index - ei.fofs; @@ -2600,6 +2604,7 @@ got_it: err = -EFSCORRUPTED; goto out_writepage; } + /* * If current allocation needs SSR, * it had better in-place writes for updated data. 
@@ -3313,6 +3318,100 @@ unlock_out: return err; } +static int __find_data_block(struct inode *inode, pgoff_t index, + block_t *blk_addr) +{ + struct dnode_of_data dn; + struct page *ipage; + struct extent_info ei = {0, }; + int err = 0; + + ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino); + if (IS_ERR(ipage)) + return PTR_ERR(ipage); + + set_new_dnode(&dn, inode, ipage, ipage, 0); + + if (f2fs_lookup_extent_cache(inode, index, &ei)) { + dn.data_blkaddr = ei.blk + index - ei.fofs; + } else { + /* hole case */ + err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE); + if (err) { + dn.data_blkaddr = NULL_ADDR; + err = 0; + } + } + *blk_addr = dn.data_blkaddr; + f2fs_put_dnode(&dn); + return err; +} + +static int __reserve_data_block(struct inode *inode, pgoff_t index, + block_t *blk_addr, bool *node_changed) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct dnode_of_data dn; + struct page *ipage; + int err = 0; + + f2fs_do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true); + + ipage = f2fs_get_node_page(sbi, inode->i_ino); + if (IS_ERR(ipage)) { + err = PTR_ERR(ipage); + goto unlock_out; + } + set_new_dnode(&dn, inode, ipage, ipage, 0); + + err = f2fs_get_block(&dn, index); + + *blk_addr = dn.data_blkaddr; + *node_changed = dn.node_changed; + f2fs_put_dnode(&dn); + +unlock_out: + f2fs_do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false); + return err; +} + +static int prepare_atomic_write_begin(struct f2fs_sb_info *sbi, + struct page *page, loff_t pos, unsigned int len, + block_t *blk_addr, bool *node_changed) +{ + struct inode *inode = page->mapping->host; + struct inode *cow_inode = F2FS_I(inode)->cow_inode; + pgoff_t index = page->index; + int err = 0; + block_t ori_blk_addr; + + /* If pos is beyond the end of file, reserve a new block in COW inode */ + if ((pos & PAGE_MASK) >= i_size_read(inode)) + return __reserve_data_block(cow_inode, index, blk_addr, + node_changed); + + /* Look for the block in COW inode first */ + err = __find_data_block(cow_inode, index, blk_addr); + if (err) + return err; + else if (*blk_addr != NULL_ADDR) + return 0; + + /* Look for the block in the original inode */ + err = __find_data_block(inode, index, &ori_blk_addr); + if (err) + return err; + + /* Finally, we should reserve a new block in COW inode for the update */ + err = __reserve_data_block(cow_inode, index, blk_addr, node_changed); + if (err) + return err; + + if (ori_blk_addr != NULL_ADDR) + *blk_addr = ori_blk_addr; + return 0; +} + static int f2fs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) @@ -3321,7 +3420,7 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct page *page = NULL; pgoff_t index = ((unsigned long long) pos) >> PAGE_SHIFT; - bool need_balance = false, drop_atomic = false; + bool need_balance = false; block_t blkaddr = NULL_ADDR; int err = 0; @@ -3332,14 +3431,6 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, goto fail; } - if ((f2fs_is_atomic_file(inode) && - !f2fs_available_free_memory(sbi, INMEM_PAGES)) || - is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST)) { - err = -ENOMEM; - drop_atomic = true; - goto fail; - } - /* * We should check this at this moment to avoid deadlock on inode page * and #0 page. 
The locking rule for inline_data conversion should be: @@ -3387,7 +3478,11 @@ repeat: *pagep = page; - err = prepare_write_begin(sbi, page, pos, len, + if (f2fs_is_atomic_file(inode)) + err = prepare_atomic_write_begin(sbi, page, pos, len, + &blkaddr, &need_balance); + else + err = prepare_write_begin(sbi, page, pos, len, &blkaddr, &need_balance); if (err) goto fail; @@ -3443,8 +3538,6 @@ repeat: fail: f2fs_put_page(page, 1); f2fs_write_failed(inode, pos + len); - if (drop_atomic) - f2fs_drop_inmem_pages_all(sbi, false); return err; } @@ -3488,8 +3581,12 @@ static int f2fs_write_end(struct file *file, set_page_dirty(page); if (pos + copied > i_size_read(inode) && - !f2fs_verity_in_progress(inode)) + !f2fs_verity_in_progress(inode)) { f2fs_i_size_write(inode, pos + copied); + if (f2fs_is_atomic_file(inode)) + f2fs_i_size_write(F2FS_I(inode)->cow_inode, + pos + copied); + } unlock_out: f2fs_put_page(page, 1); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); @@ -3522,9 +3619,6 @@ void f2fs_invalidate_folio(struct folio *folio, size_t offset, size_t length) inode->i_ino == F2FS_COMPRESS_INO(sbi)) clear_page_private_data(&folio->page); - if (page_private_atomic(&folio->page)) - return f2fs_drop_inmem_page(inode, &folio->page); - folio_detach_private(folio); } @@ -3534,10 +3628,6 @@ int f2fs_release_page(struct page *page, gfp_t wait) if (PageDirty(page)) return 0; - /* This is atomic written page, keep Private */ - if (page_private_atomic(page)) - return 0; - if (test_opt(F2FS_P_SB(page), COMPRESS_CACHE)) { struct inode *inode = page->mapping->host; @@ -3563,18 +3653,6 @@ static bool f2fs_dirty_data_folio(struct address_space *mapping, folio_mark_uptodate(folio); BUG_ON(folio_test_swapcache(folio)); - if (f2fs_is_atomic_file(inode) && !f2fs_is_commit_atomic_write(inode)) { - if (!page_private_atomic(&folio->page)) { - f2fs_register_inmem_page(inode, &folio->page); - return true; - } - /* - * Previously, this page has been registered, we just - * return here. - */ - return false; - } - if (!folio_test_dirty(folio)) { filemap_dirty_folio(mapping, folio); f2fs_update_dirty_folio(inode, folio); @@ -3654,42 +3732,14 @@ out: int f2fs_migrate_page(struct address_space *mapping, struct page *newpage, struct page *page, enum migrate_mode mode) { - int rc, extra_count; - struct f2fs_inode_info *fi = F2FS_I(mapping->host); - bool atomic_written = page_private_atomic(page); + int rc, extra_count = 0; BUG_ON(PageWriteback(page)); - /* migrating an atomic written page is safe with the inmem_lock hold */ - if (atomic_written) { - if (mode != MIGRATE_SYNC) - return -EBUSY; - if (!mutex_trylock(&fi->inmem_lock)) - return -EAGAIN; - } - - /* one extra reference was held for atomic_write page */ - extra_count = atomic_written ? 
1 : 0; rc = migrate_page_move_mapping(mapping, newpage, page, extra_count); - if (rc != MIGRATEPAGE_SUCCESS) { - if (atomic_written) - mutex_unlock(&fi->inmem_lock); + if (rc != MIGRATEPAGE_SUCCESS) return rc; - } - - if (atomic_written) { - struct inmem_pages *cur; - - list_for_each_entry(cur, &fi->inmem_pages, list) - if (cur->page == page) { - cur->page = newpage; - break; - } - mutex_unlock(&fi->inmem_lock); - put_page(page); - get_page(newpage); - } /* guarantee to start from no stale private field */ set_page_private(newpage, 0); diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index fcdf253cd211..65f0bcf498bb 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -91,7 +91,6 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->ndirty_files = sbi->ndirty_inode[FILE_INODE]; si->nquota_files = sbi->nquota_files; si->ndirty_all = sbi->ndirty_inode[DIRTY_META]; - si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES); si->aw_cnt = sbi->atomic_files; si->vw_cnt = atomic_read(&sbi->vw_cnt); si->max_aw_cnt = atomic_read(&sbi->max_aw_cnt); @@ -167,8 +166,6 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->alloc_nids = NM_I(sbi)->nid_cnt[PREALLOC_NID]; si->io_skip_bggc = sbi->io_skip_bggc; si->other_skip_bggc = sbi->other_skip_bggc; - si->skipped_atomic_files[BG_GC] = sbi->skipped_atomic_files[BG_GC]; - si->skipped_atomic_files[FG_GC] = sbi->skipped_atomic_files[FG_GC]; si->util_free = (int)(free_user_blocks(sbi) >> sbi->log_blocks_per_seg) * 100 / (int)(sbi->user_block_count >> sbi->log_blocks_per_seg) / 2; @@ -296,7 +293,6 @@ get_cache: sizeof(struct nat_entry); si->cache_mem += NM_I(sbi)->nat_cnt[DIRTY_NAT] * sizeof(struct nat_entry_set); - si->cache_mem += si->inmem_pages * sizeof(struct inmem_pages); for (i = 0; i < MAX_INO_ENTRY; i++) si->cache_mem += sbi->im[i].ino_num * sizeof(struct ino_entry); si->cache_mem += atomic_read(&sbi->total_ext_tree) * @@ -491,10 +487,6 @@ static int stat_show(struct seq_file *s, void *v) si->bg_data_blks); seq_printf(s, " - node blocks : %d (%d)\n", si->node_blks, si->bg_node_blks); - seq_printf(s, "Skipped : atomic write %llu (%llu)\n", - si->skipped_atomic_files[BG_GC] + - si->skipped_atomic_files[FG_GC], - si->skipped_atomic_files[BG_GC]); seq_printf(s, "BG skip : IO: %u, Other: %u\n", si->io_skip_bggc, si->other_skip_bggc); seq_puts(s, "\nExtent Cache:\n"); @@ -519,9 +511,9 @@ static int stat_show(struct seq_file *s, void *v) si->flush_list_empty, si->nr_discarding, si->nr_discarded, si->nr_discard_cmd, si->undiscard_blks); - seq_printf(s, " - inmem: %4d, atomic IO: %4d (Max. %4d), " + seq_printf(s, " - atomic IO: %4d (Max. %4d), " "volatile IO: %4d (Max. 
%4d)\n", - si->inmem_pages, si->aw_cnt, si->max_aw_cnt, + si->aw_cnt, si->max_aw_cnt, si->vw_cnt, si->max_vw_cnt); seq_printf(s, " - compress: %4d, hit:%8d\n", si->compress_pages, si->compress_page_hit); seq_printf(s, " - nodes: %4d in %4d\n", diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index efe5e80163a8..68d299b54820 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -716,7 +716,6 @@ enum { enum { GC_FAILURE_PIN, - GC_FAILURE_ATOMIC, MAX_GC_FAILURE }; @@ -738,7 +737,6 @@ enum { FI_UPDATE_WRITE, /* inode has in-place-update data */ FI_NEED_IPU, /* used for ipu per file */ FI_ATOMIC_FILE, /* indicate atomic file */ - FI_ATOMIC_COMMIT, /* indicate the state of atomical committing */ FI_VOLATILE_FILE, /* indicate volatile file */ FI_FIRST_BLOCK_WRITTEN, /* indicate #0 data block was written */ FI_DROP_CACHE, /* drop dirty page cache */ @@ -752,7 +750,6 @@ enum { FI_EXTRA_ATTR, /* indicate file has extra attribute */ FI_PROJ_INHERIT, /* indicate file inherits projectid */ FI_PIN_FILE, /* indicate file should not be gced */ - FI_ATOMIC_REVOKE_REQUEST, /* request to drop atomic data */ FI_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */ FI_COMPRESSED_FILE, /* indicate file's data can be compressed */ FI_COMPRESS_CORRUPT, /* indicate compressed cluster is corrupted */ @@ -794,11 +791,9 @@ struct f2fs_inode_info { #endif struct list_head dirty_list; /* dirty list for dirs and files */ struct list_head gdirty_list; /* linked in global dirty list */ - struct list_head inmem_ilist; /* list for inmem inodes */ - struct list_head inmem_pages; /* inmemory pages managed by f2fs */ - struct task_struct *inmem_task; /* store inmemory task */ - struct mutex inmem_lock; /* lock for inmemory pages */ + struct task_struct *atomic_write_task; /* store atomic write task */ struct extent_tree *extent_tree; /* cached extent_tree entry */ + struct inode *cow_inode; /* copy-on-write inode for atomic write */ /* avoid racing between foreground op and gc */ struct f2fs_rwsem i_gc_rwsem[2]; @@ -1092,7 +1087,6 @@ enum count_type { F2FS_DIRTY_QDATA, F2FS_DIRTY_NODES, F2FS_DIRTY_META, - F2FS_INMEM_PAGES, F2FS_DIRTY_IMETA, F2FS_WB_CP_DATA, F2FS_WB_DATA, @@ -1122,11 +1116,7 @@ enum page_type { META, NR_PAGE_TYPE, META_FLUSH, - INMEM, /* the below types are used by tracepoints only. */ - INMEM_DROP, - INMEM_INVALIDATE, - INMEM_REVOKE, - IPU, + IPU, /* the below types are used by tracepoints only. 
*/ OPU, }; @@ -1718,7 +1708,6 @@ struct f2fs_sb_info { /* for skip statistic */ unsigned int atomic_files; /* # of opened atomic file */ - unsigned long long skipped_atomic_files[2]; /* FG_GC and BG_GC */ unsigned long long skipped_gc_rwsem; /* FG_GC only */ /* threshold for gc trials on pinned files */ @@ -3202,11 +3191,6 @@ static inline bool f2fs_is_atomic_file(struct inode *inode) return is_inode_flag_set(inode, FI_ATOMIC_FILE); } -static inline bool f2fs_is_commit_atomic_write(struct inode *inode) -{ - return is_inode_flag_set(inode, FI_ATOMIC_COMMIT); -} - static inline bool f2fs_is_volatile_file(struct inode *inode) { return is_inode_flag_set(inode, FI_VOLATILE_FILE); @@ -3444,6 +3428,8 @@ void f2fs_handle_failed_inode(struct inode *inode); int f2fs_update_extension_list(struct f2fs_sb_info *sbi, const char *name, bool hot, bool set); struct dentry *f2fs_get_parent(struct dentry *child); +int f2fs_get_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, + struct inode **new_inode); /* * dir.c @@ -3579,11 +3565,8 @@ void f2fs_destroy_node_manager_caches(void); * segment.c */ bool f2fs_need_SSR(struct f2fs_sb_info *sbi); -void f2fs_register_inmem_page(struct inode *inode, struct page *page); -void f2fs_drop_inmem_pages_all(struct f2fs_sb_info *sbi, bool gc_failure); -void f2fs_drop_inmem_pages(struct inode *inode); -void f2fs_drop_inmem_page(struct inode *inode, struct page *page); -int f2fs_commit_inmem_pages(struct inode *inode); +int f2fs_commit_atomic_write(struct inode *inode); +void f2fs_abort_atomic_write(struct inode *inode, bool clean); void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need); void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg); int f2fs_issue_flush(struct f2fs_sb_info *sbi, nid_t ino); @@ -3815,7 +3798,6 @@ struct f2fs_stat_info { int ext_tree, zombie_tree, ext_node; int ndirty_node, ndirty_dent, ndirty_meta, ndirty_imeta; int ndirty_data, ndirty_qdata; - int inmem_pages; unsigned int ndirty_dirs, ndirty_files, nquota_files, ndirty_all; int nats, dirty_nats, sits, dirty_sits; int free_nids, avail_nids, alloc_nids; @@ -3845,7 +3827,6 @@ struct f2fs_stat_info { int bg_node_segs, bg_data_segs; int tot_blks, data_blks, node_blks; int bg_data_blks, bg_node_blks; - unsigned long long skipped_atomic_files[2]; int curseg[NR_CURSEG_TYPE]; int cursec[NR_CURSEG_TYPE]; int curzone[NR_CURSEG_TYPE]; diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index b307d96a0a7c..7e5ec0c48b2a 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1813,9 +1813,8 @@ static int f2fs_release_file(struct inode *inode, struct file *filp) atomic_read(&inode->i_writecount) != 1) return 0; - /* some remained atomic pages should discarded */ if (f2fs_is_atomic_file(inode)) - f2fs_drop_inmem_pages(inode); + f2fs_abort_atomic_write(inode, true); if (f2fs_is_volatile_file(inode)) { set_inode_flag(inode, FI_DROP_CACHE); filemap_fdatawrite(inode->i_mapping); @@ -1837,8 +1836,8 @@ static int f2fs_file_flush(struct file *file, fl_owner_t id) * before dropping file lock, it needs to do in ->flush. 
*/ if (f2fs_is_atomic_file(inode) && - F2FS_I(inode)->inmem_task == current) - f2fs_drop_inmem_pages(inode); + F2FS_I(inode)->atomic_write_task == current) + f2fs_abort_atomic_write(inode, true); return 0; } @@ -2001,6 +2000,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) struct user_namespace *mnt_userns = file_mnt_user_ns(filp); struct f2fs_inode_info *fi = F2FS_I(inode); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct inode *pinode; int ret; if (!inode_owner_or_capable(mnt_userns, inode)) @@ -2023,11 +2023,8 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) goto out; } - if (f2fs_is_atomic_file(inode)) { - if (is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST)) - ret = -EINVAL; + if (f2fs_is_atomic_file(inode)) goto out; - } ret = f2fs_convert_inline_inode(inode); if (ret) @@ -2048,19 +2045,33 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) goto out; } + /* Create a COW inode for atomic write */ + pinode = f2fs_iget(inode->i_sb, fi->i_pino); + if (IS_ERR(pinode)) { + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + ret = PTR_ERR(pinode); + goto out; + } + + ret = f2fs_get_tmpfile(mnt_userns, pinode, &fi->cow_inode); + iput(pinode); + if (ret) { + f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + goto out; + } + f2fs_i_size_write(fi->cow_inode, i_size_read(inode)); + spin_lock(&sbi->inode_lock[ATOMIC_FILE]); - if (list_empty(&fi->inmem_ilist)) - list_add_tail(&fi->inmem_ilist, &sbi->inode_list[ATOMIC_FILE]); sbi->atomic_files++; spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); - /* add inode in inmem_list first and set atomic_file */ set_inode_flag(inode, FI_ATOMIC_FILE); - clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); + set_inode_flag(fi->cow_inode, FI_ATOMIC_FILE); + clear_inode_flag(fi->cow_inode, FI_INLINE_DATA); f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); - F2FS_I(inode)->inmem_task = current; + F2FS_I(inode)->atomic_write_task = current; stat_update_max_atomic_write(inode); out: inode_unlock(inode); @@ -2091,21 +2102,17 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) } if (f2fs_is_atomic_file(inode)) { - ret = f2fs_commit_inmem_pages(inode); + ret = f2fs_commit_atomic_write(inode); if (ret) goto err_out; ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); if (!ret) - f2fs_drop_inmem_pages(inode); + f2fs_abort_atomic_write(inode, false); } else { ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 1, false); } err_out: - if (is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST)) { - clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); - ret = -EINVAL; - } inode_unlock(inode); mnt_drop_write_file(filp); return ret; @@ -2193,15 +2200,13 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp) inode_lock(inode); if (f2fs_is_atomic_file(inode)) - f2fs_drop_inmem_pages(inode); + f2fs_abort_atomic_write(inode, true); if (f2fs_is_volatile_file(inode)) { clear_inode_flag(inode, FI_VOLATILE_FILE); stat_dec_volatile_write(inode); ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); } - clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); - inode_unlock(inode); mnt_drop_write_file(filp); diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 3009c0a97ab4..ba8e93e517be 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1245,13 +1245,6 @@ static int move_data_block(struct inode *inode, block_t bidx, goto out; } - if (f2fs_is_atomic_file(inode)) { - F2FS_I(inode)->i_gc_failures[GC_FAILURE_ATOMIC]++; - F2FS_I_SB(inode)->skipped_atomic_files[gc_type]++; - err = -EAGAIN; - goto out; - } - err = 
f2fs_gc_pinned_control(inode, gc_type, segno); if (err) goto out; @@ -1393,12 +1386,6 @@ static int move_data_page(struct inode *inode, block_t bidx, int gc_type, goto out; } - if (f2fs_is_atomic_file(inode)) { - F2FS_I(inode)->i_gc_failures[GC_FAILURE_ATOMIC]++; - F2FS_I_SB(inode)->skipped_atomic_files[gc_type]++; - err = -EAGAIN; - goto out; - } err = f2fs_gc_pinned_control(inode, gc_type, segno); if (err) goto out; @@ -1765,8 +1752,6 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, .ilist = LIST_HEAD_INIT(gc_list.ilist), .iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS), }; - unsigned long long last_skipped = sbi->skipped_atomic_files[FG_GC]; - unsigned long long first_skipped; unsigned int skipped_round = 0, round = 0; trace_f2fs_gc_begin(sbi->sb, sync, background, @@ -1780,7 +1765,6 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, cpc.reason = __get_cp_reason(sbi); sbi->skipped_gc_rwsem = 0; - first_skipped = last_skipped; gc_more: if (unlikely(!(sbi->sb->s_flags & SB_ACTIVE))) { ret = -EINVAL; @@ -1831,10 +1815,8 @@ retry: total_freed += seg_freed; if (gc_type == FG_GC) { - if (sbi->skipped_atomic_files[FG_GC] > last_skipped || - sbi->skipped_gc_rwsem) + if (sbi->skipped_gc_rwsem) skipped_round++; - last_skipped = sbi->skipped_atomic_files[FG_GC]; round++; } @@ -1860,13 +1842,6 @@ retry: segno = NULL_SEGNO; goto gc_more; } - if (first_skipped < last_skipped && - (last_skipped - first_skipped) > - sbi->skipped_gc_rwsem) { - f2fs_drop_inmem_pages_all(sbi, true); - segno = NULL_SEGNO; - goto gc_more; - } if (gc_type == FG_GC && !is_sbi_flag_set(sbi, SBI_CP_DISABLED)) ret = f2fs_write_checkpoint(sbi, &cpc); stop: diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 02630c17da93..2fce8fa0dac8 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -745,9 +745,8 @@ void f2fs_evict_inode(struct inode *inode) nid_t xnid = F2FS_I(inode)->i_xattr_nid; int err = 0; - /* some remained atomic pages should discarded */ if (f2fs_is_atomic_file(inode)) - f2fs_drop_inmem_pages(inode); + f2fs_abort_atomic_write(inode, true); trace_f2fs_evict_inode(inode); truncate_inode_pages_final(&inode->i_data); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 37bdda931e0c..c549acb52ac4 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -840,8 +840,8 @@ out: } static int __f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, - struct dentry *dentry, umode_t mode, - struct inode **whiteout) + struct dentry *dentry, umode_t mode, bool is_whiteout, + struct inode **new_inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct inode *inode; @@ -855,7 +855,7 @@ static int __f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, if (IS_ERR(inode)) return PTR_ERR(inode); - if (whiteout) { + if (is_whiteout) { init_special_inode(inode, inode->i_mode, WHITEOUT_DEV); inode->i_op = &f2fs_special_inode_operations; } else { @@ -880,21 +880,25 @@ static int __f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, f2fs_add_orphan_inode(inode); f2fs_alloc_nid_done(sbi, inode->i_ino); - if (whiteout) { + if (is_whiteout) { f2fs_i_links_write(inode, false); spin_lock(&inode->i_lock); inode->i_state |= I_LINKABLE; spin_unlock(&inode->i_lock); - - *whiteout = inode; } else { - d_tmpfile(dentry, inode); + if (dentry) + d_tmpfile(dentry, inode); + else + f2fs_i_links_write(inode, false); } /* link_count was changed by d_tmpfile as well. 
*/ f2fs_unlock_op(sbi); unlock_new_inode(inode); + if (new_inode) + *new_inode = inode; + f2fs_balance_fs(sbi, true); return 0; @@ -915,7 +919,7 @@ static int f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, if (!f2fs_is_checkpoint_ready(sbi)) return -ENOSPC; - return __f2fs_tmpfile(mnt_userns, dir, dentry, mode, NULL); + return __f2fs_tmpfile(mnt_userns, dir, dentry, mode, false, NULL); } static int f2fs_create_whiteout(struct user_namespace *mnt_userns, @@ -925,7 +929,13 @@ static int f2fs_create_whiteout(struct user_namespace *mnt_userns, return -EIO; return __f2fs_tmpfile(mnt_userns, dir, NULL, - S_IFCHR | WHITEOUT_MODE, whiteout); + S_IFCHR | WHITEOUT_MODE, true, whiteout); +} + +int f2fs_get_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, + struct inode **new_inode) +{ + return __f2fs_tmpfile(mnt_userns, dir, NULL, S_IFREG, false, new_inode); } static int f2fs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 51230cba841b..beda8cbb791d 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -90,10 +90,6 @@ bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type) atomic_read(&sbi->total_ext_node) * sizeof(struct extent_node)) >> PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); - } else if (type == INMEM_PAGES) { - /* it allows 20% / total_ram for inmemory pages */ - mem_size = get_pages(sbi, F2FS_INMEM_PAGES); - res = mem_size < (val.totalram / 5); } else if (type == DISCARD_CACHE) { mem_size = (atomic_read(&dcc->discard_cmd_cnt) * sizeof(struct discard_cmd)) >> PAGE_SHIFT; diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 4c1d34bfea78..3c09cae058b0 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -147,7 +147,6 @@ enum mem_type { DIRTY_DENTS, /* indicates dirty dentry pages */ INO_ENTRIES, /* indicates inode entries */ EXTENT_CACHE, /* indicates extent cache */ - INMEM_PAGES, /* indicates inmemory pages */ DISCARD_CACHE, /* indicates memory of cached discard cmds */ COMPRESS_PAGE, /* indicates memory of cached compressed pages */ BASE_CHECK, /* check kernel status */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 87ff2b3cdf94..c0d7118fe171 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -30,7 +30,7 @@ static struct kmem_cache *discard_entry_slab; static struct kmem_cache *discard_cmd_slab; static struct kmem_cache *sit_entry_set_slab; -static struct kmem_cache *inmem_entry_slab; +static struct kmem_cache *revoke_entry_slab; static unsigned long __reverse_ulong(unsigned char *str) { @@ -185,304 +185,180 @@ bool f2fs_need_SSR(struct f2fs_sb_info *sbi) SM_I(sbi)->min_ssr_sections + reserved_sections(sbi)); } -void f2fs_register_inmem_page(struct inode *inode, struct page *page) -{ - struct inmem_pages *new; - - set_page_private_atomic(page); - - new = f2fs_kmem_cache_alloc(inmem_entry_slab, - GFP_NOFS, true, NULL); - - /* add atomic page indices to the list */ - new->page = page; - INIT_LIST_HEAD(&new->list); - - /* increase reference count with clean state */ - get_page(page); - mutex_lock(&F2FS_I(inode)->inmem_lock); - list_add_tail(&new->list, &F2FS_I(inode)->inmem_pages); - inc_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES); - mutex_unlock(&F2FS_I(inode)->inmem_lock); - - trace_f2fs_register_inmem_page(page, INMEM); -} - -static int __revoke_inmem_pages(struct inode *inode, - struct list_head *head, bool drop, bool recover, - bool trylock) +void f2fs_abort_atomic_write(struct inode *inode, bool clean) { struct f2fs_sb_info *sbi = 
F2FS_I_SB(inode); - struct inmem_pages *cur, *tmp; - int err = 0; + struct f2fs_inode_info *fi = F2FS_I(inode); + + if (f2fs_is_atomic_file(inode)) { + if (clean) + truncate_inode_pages_final(inode->i_mapping); + clear_inode_flag(fi->cow_inode, FI_ATOMIC_FILE); + iput(fi->cow_inode); + fi->cow_inode = NULL; + clear_inode_flag(inode, FI_ATOMIC_FILE); + + spin_lock(&sbi->inode_lock[ATOMIC_FILE]); + sbi->atomic_files--; + spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); + } +} + +static int __replace_atomic_write_block(struct inode *inode, pgoff_t index, + block_t new_addr, block_t *old_addr, bool recover) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct dnode_of_data dn; + struct node_info ni; + int err; + +retry: + set_new_dnode(&dn, inode, NULL, NULL, 0); + err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE_RA); + if (err) { + if (err == -ENOMEM) { + f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); + goto retry; + } + return err; + } + + err = f2fs_get_node_info(sbi, dn.nid, &ni, false); + if (err) { + f2fs_put_dnode(&dn); + return err; + } + + if (recover) { + /* dn.data_blkaddr is always valid */ + if (!__is_valid_data_blkaddr(new_addr)) { + if (new_addr == NULL_ADDR) + dec_valid_block_count(sbi, inode, 1); + f2fs_invalidate_blocks(sbi, dn.data_blkaddr); + f2fs_update_data_blkaddr(&dn, new_addr); + } else { + f2fs_replace_block(sbi, &dn, dn.data_blkaddr, + new_addr, ni.version, true, true); + } + } else { + blkcnt_t count = 1; + + *old_addr = dn.data_blkaddr; + f2fs_truncate_data_blocks_range(&dn, 1); + dec_valid_block_count(sbi, F2FS_I(inode)->cow_inode, count); + inc_valid_block_count(sbi, inode, &count); + f2fs_replace_block(sbi, &dn, dn.data_blkaddr, new_addr, + ni.version, true, false); + } + + f2fs_put_dnode(&dn); + return 0; +} + +static void __complete_revoke_list(struct inode *inode, struct list_head *head, + bool revoke) +{ + struct revoke_entry *cur, *tmp; list_for_each_entry_safe(cur, tmp, head, list) { - struct page *page = cur->page; - - if (drop) - trace_f2fs_commit_inmem_page(page, INMEM_DROP); - - if (trylock) { - /* - * to avoid deadlock in between page lock and - * inmem_lock. 
- */ - if (!trylock_page(page)) - continue; - } else { - lock_page(page); - } - - f2fs_wait_on_page_writeback(page, DATA, true, true); - - if (recover) { - struct dnode_of_data dn; - struct node_info ni; - - trace_f2fs_commit_inmem_page(page, INMEM_REVOKE); -retry: - set_new_dnode(&dn, inode, NULL, NULL, 0); - err = f2fs_get_dnode_of_data(&dn, page->index, - LOOKUP_NODE); - if (err) { - if (err == -ENOMEM) { - memalloc_retry_wait(GFP_NOFS); - goto retry; - } - err = -EAGAIN; - goto next; - } - - err = f2fs_get_node_info(sbi, dn.nid, &ni, false); - if (err) { - f2fs_put_dnode(&dn); - return err; - } - - if (cur->old_addr == NEW_ADDR) { - f2fs_invalidate_blocks(sbi, dn.data_blkaddr); - f2fs_update_data_blkaddr(&dn, NEW_ADDR); - } else - f2fs_replace_block(sbi, &dn, dn.data_blkaddr, - cur->old_addr, ni.version, true, true); - f2fs_put_dnode(&dn); - } -next: - /* we don't need to invalidate this in the sccessful status */ - if (drop || recover) { - ClearPageUptodate(page); - clear_page_private_gcing(page); - } - detach_page_private(page); - set_page_private(page, 0); - f2fs_put_page(page, 1); - + if (revoke) + __replace_atomic_write_block(inode, cur->index, + cur->old_addr, NULL, true); list_del(&cur->list); - kmem_cache_free(inmem_entry_slab, cur); - dec_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES); + kmem_cache_free(revoke_entry_slab, cur); } - return err; } -void f2fs_drop_inmem_pages_all(struct f2fs_sb_info *sbi, bool gc_failure) -{ - struct list_head *head = &sbi->inode_list[ATOMIC_FILE]; - struct inode *inode; - struct f2fs_inode_info *fi; - unsigned int count = sbi->atomic_files; - unsigned int looped = 0; -next: - spin_lock(&sbi->inode_lock[ATOMIC_FILE]); - if (list_empty(head)) { - spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); - return; - } - fi = list_first_entry(head, struct f2fs_inode_info, inmem_ilist); - inode = igrab(&fi->vfs_inode); - if (inode) - list_move_tail(&fi->inmem_ilist, head); - spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); - - if (inode) { - if (gc_failure) { - if (!fi->i_gc_failures[GC_FAILURE_ATOMIC]) - goto skip; - } - set_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); - f2fs_drop_inmem_pages(inode); -skip: - iput(inode); - } - f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); - if (gc_failure) { - if (++looped >= count) - return; - } - goto next; -} - -void f2fs_drop_inmem_pages(struct inode *inode) +static int __f2fs_commit_atomic_write(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); - - do { - mutex_lock(&fi->inmem_lock); - if (list_empty(&fi->inmem_pages)) { - fi->i_gc_failures[GC_FAILURE_ATOMIC] = 0; - - spin_lock(&sbi->inode_lock[ATOMIC_FILE]); - if (!list_empty(&fi->inmem_ilist)) - list_del_init(&fi->inmem_ilist); - if (f2fs_is_atomic_file(inode)) { - clear_inode_flag(inode, FI_ATOMIC_FILE); - sbi->atomic_files--; - } - spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); - - mutex_unlock(&fi->inmem_lock); - break; - } - __revoke_inmem_pages(inode, &fi->inmem_pages, - true, false, true); - mutex_unlock(&fi->inmem_lock); - } while (1); -} - -void f2fs_drop_inmem_page(struct inode *inode, struct page *page) -{ - struct f2fs_inode_info *fi = F2FS_I(inode); - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct list_head *head = &fi->inmem_pages; - struct inmem_pages *cur = NULL; - struct inmem_pages *tmp; - - f2fs_bug_on(sbi, !page_private_atomic(page)); - - mutex_lock(&fi->inmem_lock); - list_for_each_entry(tmp, head, list) { - if (tmp->page == page) { - cur = tmp; - break; - } - } - - f2fs_bug_on(sbi, !cur); - 
list_del(&cur->list); - mutex_unlock(&fi->inmem_lock); - - dec_page_count(sbi, F2FS_INMEM_PAGES); - kmem_cache_free(inmem_entry_slab, cur); - - ClearPageUptodate(page); - clear_page_private_atomic(page); - f2fs_put_page(page, 0); - - detach_page_private(page); - set_page_private(page, 0); - - trace_f2fs_commit_inmem_page(page, INMEM_INVALIDATE); -} - -static int __f2fs_commit_inmem_pages(struct inode *inode) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct f2fs_inode_info *fi = F2FS_I(inode); - struct inmem_pages *cur, *tmp; - struct f2fs_io_info fio = { - .sbi = sbi, - .ino = inode->i_ino, - .type = DATA, - .op = REQ_OP_WRITE, - .op_flags = REQ_SYNC | REQ_PRIO, - .io_type = FS_DATA_IO, - }; + struct inode *cow_inode = fi->cow_inode; + struct revoke_entry *new; struct list_head revoke_list; - bool submit_bio = false; - int err = 0; + block_t blkaddr; + struct dnode_of_data dn; + pgoff_t len = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); + pgoff_t off = 0, blen, index; + int ret = 0, i; INIT_LIST_HEAD(&revoke_list); - list_for_each_entry_safe(cur, tmp, &fi->inmem_pages, list) { - struct page *page = cur->page; + while (len) { + blen = min_t(pgoff_t, ADDRS_PER_BLOCK(cow_inode), len); - lock_page(page); - if (page->mapping == inode->i_mapping) { - trace_f2fs_commit_inmem_page(page, INMEM); - - f2fs_wait_on_page_writeback(page, DATA, true, true); - - set_page_dirty(page); - if (clear_page_dirty_for_io(page)) { - inode_dec_dirty_pages(inode); - f2fs_remove_dirty_inode(inode); - } -retry: - fio.page = page; - fio.old_blkaddr = NULL_ADDR; - fio.encrypted_page = NULL; - fio.need_lock = LOCK_DONE; - err = f2fs_do_write_data_page(&fio); - if (err) { - if (err == -ENOMEM) { - memalloc_retry_wait(GFP_NOFS); - goto retry; - } - unlock_page(page); - break; - } - /* record old blkaddr for revoking */ - cur->old_addr = fio.old_blkaddr; - submit_bio = true; + set_new_dnode(&dn, cow_inode, NULL, NULL, 0); + ret = f2fs_get_dnode_of_data(&dn, off, LOOKUP_NODE_RA); + if (ret && ret != -ENOENT) { + goto out; + } else if (ret == -ENOENT) { + ret = 0; + if (dn.max_level == 0) + goto out; + goto next; } - unlock_page(page); - list_move_tail(&cur->list, &revoke_list); + + blen = min((pgoff_t)ADDRS_PER_PAGE(dn.node_page, cow_inode), + len); + index = off; + for (i = 0; i < blen; i++, dn.ofs_in_node++, index++) { + blkaddr = f2fs_data_blkaddr(&dn); + + if (!__is_valid_data_blkaddr(blkaddr)) { + continue; + } else if (!f2fs_is_valid_blkaddr(sbi, blkaddr, + DATA_GENERIC_ENHANCE)) { + f2fs_put_dnode(&dn); + ret = -EFSCORRUPTED; + goto out; + } + + new = f2fs_kmem_cache_alloc(revoke_entry_slab, GFP_NOFS, + true, NULL); + if (!new) { + f2fs_put_dnode(&dn); + ret = -ENOMEM; + goto out; + } + + ret = __replace_atomic_write_block(inode, index, blkaddr, + &new->old_addr, false); + if (ret) { + f2fs_put_dnode(&dn); + kmem_cache_free(revoke_entry_slab, new); + goto out; + } + + f2fs_update_data_blkaddr(&dn, NULL_ADDR); + new->index = index; + list_add_tail(&new->list, &revoke_list); + } + f2fs_put_dnode(&dn); +next: + off += blen; + len -= blen; } - if (submit_bio) - f2fs_submit_merged_write_cond(sbi, inode, NULL, 0, DATA); +out: + __complete_revoke_list(inode, &revoke_list, ret ? true : false); - if (err) { - /* - * try to revoke all committed pages, but still we could fail - * due to no memory or other reason, if that happened, EAGAIN - * will be returned, which means in such case, transaction is - * already not integrity, caller should use journal to do the - * recovery or rewrite & commit last transaction. 
For other - * error number, revoking was done by filesystem itself. - */ - err = __revoke_inmem_pages(inode, &revoke_list, - false, true, false); - - /* drop all uncommitted pages */ - __revoke_inmem_pages(inode, &fi->inmem_pages, - true, false, false); - } else { - __revoke_inmem_pages(inode, &revoke_list, - false, false, false); - } - - return err; + return ret; } -int f2fs_commit_inmem_pages(struct inode *inode) +int f2fs_commit_atomic_write(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); int err; - f2fs_balance_fs(sbi, true); + err = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); + if (err) + return err; f2fs_down_write(&fi->i_gc_rwsem[WRITE]); - f2fs_lock_op(sbi); - set_inode_flag(inode, FI_ATOMIC_COMMIT); - mutex_lock(&fi->inmem_lock); - err = __f2fs_commit_inmem_pages(inode); - mutex_unlock(&fi->inmem_lock); - - clear_inode_flag(inode, FI_ATOMIC_COMMIT); + err = __f2fs_commit_atomic_write(inode); f2fs_unlock_op(sbi); f2fs_up_write(&fi->i_gc_rwsem[WRITE]); @@ -5360,9 +5236,9 @@ int __init f2fs_create_segment_manager_caches(void) if (!sit_entry_set_slab) goto destroy_discard_cmd; - inmem_entry_slab = f2fs_kmem_cache_create("f2fs_inmem_page_entry", - sizeof(struct inmem_pages)); - if (!inmem_entry_slab) + revoke_entry_slab = f2fs_kmem_cache_create("f2fs_revoke_entry", + sizeof(struct revoke_entry)); + if (!revoke_entry_slab) goto destroy_sit_entry_set; return 0; @@ -5381,5 +5257,5 @@ void f2fs_destroy_segment_manager_caches(void) kmem_cache_destroy(sit_entry_set_slab); kmem_cache_destroy(discard_cmd_slab); kmem_cache_destroy(discard_entry_slab); - kmem_cache_destroy(inmem_entry_slab); + kmem_cache_destroy(revoke_entry_slab); } diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 8fbc9f6afa55..3f277dfcb131 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -225,10 +225,10 @@ struct segment_allocation { #define MAX_SKIP_GC_COUNT 16 -struct inmem_pages { +struct revoke_entry { struct list_head list; - struct page *page; block_t old_addr; /* for revoking when fail to commit */ + pgoff_t index; }; struct sit_info { diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index aa51c30333d3..0900c552a16c 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1339,9 +1339,6 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) spin_lock_init(&fi->i_size_lock); INIT_LIST_HEAD(&fi->dirty_list); INIT_LIST_HEAD(&fi->gdirty_list); - INIT_LIST_HEAD(&fi->inmem_ilist); - INIT_LIST_HEAD(&fi->inmem_pages); - mutex_init(&fi->inmem_lock); init_f2fs_rwsem(&fi->i_gc_rwsem[READ]); init_f2fs_rwsem(&fi->i_gc_rwsem[WRITE]); init_f2fs_rwsem(&fi->i_xattr_sem); @@ -1382,9 +1379,8 @@ static int f2fs_drop_inode(struct inode *inode) atomic_inc(&inode->i_count); spin_unlock(&inode->i_lock); - /* some remained atomic pages should discarded */ if (f2fs_is_atomic_file(inode)) - f2fs_drop_inmem_pages(inode); + f2fs_abort_atomic_write(inode, true); /* should remain fi->extent_tree for writepage */ f2fs_destroy_extent_node(inode); diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 4d1ad64d4cab..7e915dbf3674 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -15,10 +15,6 @@ TRACE_DEFINE_ENUM(NODE); TRACE_DEFINE_ENUM(DATA); TRACE_DEFINE_ENUM(META); TRACE_DEFINE_ENUM(META_FLUSH); -TRACE_DEFINE_ENUM(INMEM); -TRACE_DEFINE_ENUM(INMEM_DROP); -TRACE_DEFINE_ENUM(INMEM_INVALIDATE); -TRACE_DEFINE_ENUM(INMEM_REVOKE); TRACE_DEFINE_ENUM(IPU); TRACE_DEFINE_ENUM(OPU); TRACE_DEFINE_ENUM(HOT); @@ 
-59,10 +55,6 @@ TRACE_DEFINE_ENUM(CP_RESIZE); { DATA, "DATA" }, \ { META, "META" }, \ { META_FLUSH, "META_FLUSH" }, \ - { INMEM, "INMEM" }, \ - { INMEM_DROP, "INMEM_DROP" }, \ - { INMEM_INVALIDATE, "INMEM_INVALIDATE" }, \ - { INMEM_REVOKE, "INMEM_REVOKE" }, \ { IPU, "IN-PLACE" }, \ { OPU, "OUT-OF-PLACE" }) @@ -1289,20 +1281,6 @@ DEFINE_EVENT(f2fs__page, f2fs_vm_page_mkwrite, TP_ARGS(page, type) ); -DEFINE_EVENT(f2fs__page, f2fs_register_inmem_page, - - TP_PROTO(struct page *page, int type), - - TP_ARGS(page, type) -); - -DEFINE_EVENT(f2fs__page, f2fs_commit_inmem_page, - - TP_PROTO(struct page *page, int type), - - TP_ARGS(page, type) -); - TRACE_EVENT(f2fs_filemap_fault, TP_PROTO(struct inode *inode, pgoff_t index, unsigned long ret), From 7bc155fec5b371dbb57256e84a49c78692a09060 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 5 May 2022 17:49:18 -0700 Subject: [PATCH 123/334] f2fs: kill volatile write support There's no user, since all can use atomic writes simply. Let's kill it. Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 4 +- fs/f2fs/data.c | 5 -- fs/f2fs/debug.c | 10 +--- fs/f2fs/f2fs.h | 27 +--------- fs/f2fs/file.c | 116 ++----------------------------------------- fs/f2fs/segment.c | 3 +- fs/f2fs/verity.c | 2 +- 7 files changed, 10 insertions(+), 157 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index beceac9885c3..63ab8c9d674e 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1004,9 +1004,7 @@ static void __add_dirty_inode(struct inode *inode, enum inode_type type) return; set_inode_flag(inode, flag); - if (!f2fs_is_volatile_file(inode)) - list_add_tail(&F2FS_I(inode)->dirty_list, - &sbi->inode_list[type]); + list_add_tail(&F2FS_I(inode)->dirty_list, &sbi->inode_list[type]); stat_inc_dirty_inode(sbi, type); } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 8763a4690aaf..54a7a8ad994d 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2741,11 +2741,6 @@ int f2fs_write_single_data_page(struct page *page, int *submitted, write: if (f2fs_is_drop_cache(inode)) goto out; - /* we should not write 0'th page having journal header */ - if (f2fs_is_volatile_file(inode) && (!page->index || - (!wbc->for_reclaim && - f2fs_available_free_memory(sbi, BASE_CHECK)))) - goto redirty_out; /* Dentry/quota blocks are controlled by checkpoint */ if (S_ISDIR(inode->i_mode) || IS_NOQUOTA(inode)) { diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 65f0bcf498bb..c92625ef16d0 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -92,9 +92,7 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->nquota_files = sbi->nquota_files; si->ndirty_all = sbi->ndirty_inode[DIRTY_META]; si->aw_cnt = sbi->atomic_files; - si->vw_cnt = atomic_read(&sbi->vw_cnt); si->max_aw_cnt = atomic_read(&sbi->max_aw_cnt); - si->max_vw_cnt = atomic_read(&sbi->max_vw_cnt); si->nr_dio_read = get_pages(sbi, F2FS_DIO_READ); si->nr_dio_write = get_pages(sbi, F2FS_DIO_WRITE); si->nr_wb_cp_data = get_pages(sbi, F2FS_WB_CP_DATA); @@ -511,10 +509,8 @@ static int stat_show(struct seq_file *s, void *v) si->flush_list_empty, si->nr_discarding, si->nr_discarded, si->nr_discard_cmd, si->undiscard_blks); - seq_printf(s, " - atomic IO: %4d (Max. %4d), " - "volatile IO: %4d (Max. %4d)\n", - si->aw_cnt, si->max_aw_cnt, - si->vw_cnt, si->max_vw_cnt); + seq_printf(s, " - atomic IO: %4d (Max. 
%4d)\n", + si->aw_cnt, si->max_aw_cnt); seq_printf(s, " - compress: %4d, hit:%8d\n", si->compress_pages, si->compress_page_hit); seq_printf(s, " - nodes: %4d in %4d\n", si->ndirty_node, si->node_pages); @@ -615,9 +611,7 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) for (i = META_CP; i < META_MAX; i++) atomic_set(&sbi->meta_count[i], 0); - atomic_set(&sbi->vw_cnt, 0); atomic_set(&sbi->max_aw_cnt, 0); - atomic_set(&sbi->max_vw_cnt, 0); raw_spin_lock_irqsave(&f2fs_stat_lock, flags); list_add_tail(&si->stat_list, &f2fs_stat_list); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 68d299b54820..0699d7460d5d 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -737,7 +737,6 @@ enum { FI_UPDATE_WRITE, /* inode has in-place-update data */ FI_NEED_IPU, /* used for ipu per file */ FI_ATOMIC_FILE, /* indicate atomic file */ - FI_VOLATILE_FILE, /* indicate volatile file */ FI_FIRST_BLOCK_WRITTEN, /* indicate #0 data block was written */ FI_DROP_CACHE, /* drop dirty page cache */ FI_DATA_EXIST, /* indicate data exists */ @@ -1738,9 +1737,7 @@ struct f2fs_sb_info { atomic_t inline_dir; /* # of inline_dentry inodes */ atomic_t compr_inode; /* # of compressed inodes */ atomic64_t compr_blocks; /* # of compressed blocks */ - atomic_t vw_cnt; /* # of volatile writes */ atomic_t max_aw_cnt; /* max # of atomic writes */ - atomic_t max_vw_cnt; /* max # of volatile writes */ unsigned int io_skip_bggc; /* skip background gc for in-flight IO */ unsigned int other_skip_bggc; /* skip background gc for other reasons */ unsigned int ndirty_inode[NR_INODE_TYPE]; /* # of dirty inodes */ @@ -3191,11 +3188,6 @@ static inline bool f2fs_is_atomic_file(struct inode *inode) return is_inode_flag_set(inode, FI_ATOMIC_FILE); } -static inline bool f2fs_is_volatile_file(struct inode *inode) -{ - return is_inode_flag_set(inode, FI_VOLATILE_FILE); -} - static inline bool f2fs_is_first_block_written(struct inode *inode) { return is_inode_flag_set(inode, FI_FIRST_BLOCK_WRITTEN); @@ -3815,7 +3807,7 @@ struct f2fs_stat_info { int inline_xattr, inline_inode, inline_dir, append, update, orphans; int compr_inode; unsigned long long compr_blocks; - int aw_cnt, max_aw_cnt, vw_cnt, max_vw_cnt; + int aw_cnt, max_aw_cnt; unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks; unsigned int bimodal, avg_vblocks; int util_free, util_valid, util_invalid; @@ -3926,17 +3918,6 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) if (cur > max) \ atomic_set(&F2FS_I_SB(inode)->max_aw_cnt, cur); \ } while (0) -#define stat_inc_volatile_write(inode) \ - (atomic_inc(&F2FS_I_SB(inode)->vw_cnt)) -#define stat_dec_volatile_write(inode) \ - (atomic_dec(&F2FS_I_SB(inode)->vw_cnt)) -#define stat_update_max_volatile_write(inode) \ - do { \ - int cur = atomic_read(&F2FS_I_SB(inode)->vw_cnt); \ - int max = atomic_read(&F2FS_I_SB(inode)->max_vw_cnt); \ - if (cur > max) \ - atomic_set(&F2FS_I_SB(inode)->max_vw_cnt, cur); \ - } while (0) #define stat_inc_seg_count(sbi, type, gc_type) \ do { \ struct f2fs_stat_info *si = F2FS_STAT(sbi); \ @@ -3998,9 +3979,6 @@ void f2fs_update_sit_info(struct f2fs_sb_info *sbi); #define stat_add_compr_blocks(inode, blocks) do { } while (0) #define stat_sub_compr_blocks(inode, blocks) do { } while (0) #define stat_update_max_atomic_write(inode) do { } while (0) -#define stat_inc_volatile_write(inode) do { } while (0) -#define stat_dec_volatile_write(inode) do { } while (0) -#define stat_update_max_volatile_write(inode) do { } while (0) #define stat_inc_meta_count(sbi, blkaddr) do { } 
while (0) #define stat_inc_seg_type(sbi, curseg) do { } while (0) #define stat_inc_block_count(sbi, curseg) do { } while (0) @@ -4403,8 +4381,7 @@ static inline bool f2fs_lfs_mode(struct f2fs_sb_info *sbi) static inline bool f2fs_may_compress(struct inode *inode) { if (IS_SWAPFILE(inode) || f2fs_is_pinned_file(inode) || - f2fs_is_atomic_file(inode) || - f2fs_is_volatile_file(inode)) + f2fs_is_atomic_file(inode)) return false; return S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode); } diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 7e5ec0c48b2a..6da8a663de7b 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1815,13 +1815,6 @@ static int f2fs_release_file(struct inode *inode, struct file *filp) if (f2fs_is_atomic_file(inode)) f2fs_abort_atomic_write(inode, true); - if (f2fs_is_volatile_file(inode)) { - set_inode_flag(inode, FI_DROP_CACHE); - filemap_fdatawrite(inode->i_mapping); - clear_inode_flag(inode, FI_DROP_CACHE); - clear_inode_flag(inode, FI_VOLATILE_FILE); - stat_dec_volatile_write(inode); - } return 0; } @@ -2096,15 +2089,10 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) inode_lock(inode); - if (f2fs_is_volatile_file(inode)) { - ret = -EINVAL; - goto err_out; - } - if (f2fs_is_atomic_file(inode)) { ret = f2fs_commit_atomic_write(inode); if (ret) - goto err_out; + goto unlock_out; ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); if (!ret) @@ -2112,108 +2100,12 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) } else { ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 1, false); } -err_out: +unlock_out: inode_unlock(inode); mnt_drop_write_file(filp); return ret; } -static int f2fs_ioc_start_volatile_write(struct file *filp) -{ - struct inode *inode = file_inode(filp); - struct user_namespace *mnt_userns = file_mnt_user_ns(filp); - int ret; - - if (!inode_owner_or_capable(mnt_userns, inode)) - return -EACCES; - - if (!S_ISREG(inode->i_mode)) - return -EINVAL; - - ret = mnt_want_write_file(filp); - if (ret) - return ret; - - inode_lock(inode); - - if (f2fs_is_volatile_file(inode)) - goto out; - - ret = f2fs_convert_inline_inode(inode); - if (ret) - goto out; - - stat_inc_volatile_write(inode); - stat_update_max_volatile_write(inode); - - set_inode_flag(inode, FI_VOLATILE_FILE); - f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); -out: - inode_unlock(inode); - mnt_drop_write_file(filp); - return ret; -} - -static int f2fs_ioc_release_volatile_write(struct file *filp) -{ - struct inode *inode = file_inode(filp); - struct user_namespace *mnt_userns = file_mnt_user_ns(filp); - int ret; - - if (!inode_owner_or_capable(mnt_userns, inode)) - return -EACCES; - - ret = mnt_want_write_file(filp); - if (ret) - return ret; - - inode_lock(inode); - - if (!f2fs_is_volatile_file(inode)) - goto out; - - if (!f2fs_is_first_block_written(inode)) { - ret = truncate_partial_data_page(inode, 0, true); - goto out; - } - - ret = punch_hole(inode, 0, F2FS_BLKSIZE); -out: - inode_unlock(inode); - mnt_drop_write_file(filp); - return ret; -} - -static int f2fs_ioc_abort_volatile_write(struct file *filp) -{ - struct inode *inode = file_inode(filp); - struct user_namespace *mnt_userns = file_mnt_user_ns(filp); - int ret; - - if (!inode_owner_or_capable(mnt_userns, inode)) - return -EACCES; - - ret = mnt_want_write_file(filp); - if (ret) - return ret; - - inode_lock(inode); - - if (f2fs_is_atomic_file(inode)) - f2fs_abort_atomic_write(inode, true); - if (f2fs_is_volatile_file(inode)) { - clear_inode_flag(inode, FI_VOLATILE_FILE); - stat_dec_volatile_write(inode); - ret = 
f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); - } - - inode_unlock(inode); - - mnt_drop_write_file(filp); - f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); - return ret; -} - static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -4151,11 +4043,9 @@ static long __f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case F2FS_IOC_COMMIT_ATOMIC_WRITE: return f2fs_ioc_commit_atomic_write(filp); case F2FS_IOC_START_VOLATILE_WRITE: - return f2fs_ioc_start_volatile_write(filp); case F2FS_IOC_RELEASE_VOLATILE_WRITE: - return f2fs_ioc_release_volatile_write(filp); case F2FS_IOC_ABORT_VOLATILE_WRITE: - return f2fs_ioc_abort_volatile_write(filp); + return -EOPNOTSUPP; case F2FS_IOC_SHUTDOWN: return f2fs_ioc_shutdown(filp, arg); case FITRIM: diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index c0d7118fe171..47934995e2ca 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3166,8 +3166,7 @@ static int __get_segment_type_6(struct f2fs_io_info *fio) return CURSEG_COLD_DATA; if (file_is_hot(inode) || is_inode_flag_set(inode, FI_HOT_DATA) || - f2fs_is_atomic_file(inode) || - f2fs_is_volatile_file(inode)) + f2fs_is_atomic_file(inode)) return CURSEG_HOT_DATA; return f2fs_rw_hint_to_seg_type(inode->i_write_hint); } else { diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c index 3d793202cc9f..5ac7e756a1bb 100644 --- a/fs/f2fs/verity.c +++ b/fs/f2fs/verity.c @@ -128,7 +128,7 @@ static int f2fs_begin_enable_verity(struct file *filp) if (f2fs_verity_in_progress(inode)) return -EBUSY; - if (f2fs_is_atomic_file(inode) || f2fs_is_volatile_file(inode)) + if (f2fs_is_atomic_file(inode)) return -EOPNOTSUPP; /* From 64e3ed0b8ea019597d0b5448fdf05a29eb65ae95 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 30 Apr 2022 22:08:52 -0700 Subject: [PATCH 124/334] f2fs: reject test_dummy_encryption when !CONFIG_FS_ENCRYPTION There is no good reason to allow this mount option when the kernel isn't configured with encryption support. Since this option is only for testing, we can just fix this; we don't really need to worry about breaking anyone who might be counting on this option being ignored. Signed-off-by: Eric Biggers Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 0900c552a16c..8c81dd324297 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -525,10 +525,11 @@ static int f2fs_set_test_dummy_encryption(struct super_block *sb, return -EINVAL; } f2fs_warn(sbi, "Test dummy encryption mode enabled"); -#else - f2fs_warn(sbi, "Test dummy encryption mount option ignored"); -#endif return 0; +#else + f2fs_warn(sbi, "test_dummy_encryption option not supported"); + return -EINVAL; +#endif } #ifdef CONFIG_F2FS_FS_COMPRESSION From d147ea4adb969b03f7f1c7613cc6f44b70760eb3 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 6 May 2022 11:40:33 -0700 Subject: [PATCH 125/334] f2fs: introduce f2fs_gc_control to consolidate f2fs_gc parameters No functional change. 
- remove checkpoint=disable check for f2fs_write_checkpoint - get sec_freed all the time Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 11 +++++- fs/f2fs/file.c | 30 ++++++++++++--- fs/f2fs/gc.c | 74 ++++++++++++++++++++----------------- fs/f2fs/segment.c | 8 +++- fs/f2fs/super.c | 8 +++- include/trace/events/f2fs.h | 18 ++++----- 6 files changed, 98 insertions(+), 51 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 0699d7460d5d..9920b2d6af8f 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1265,6 +1265,14 @@ struct atgc_management { unsigned long long age_threshold; /* age threshold */ }; +struct f2fs_gc_control { + unsigned int victim_segno; /* target victim segment number */ + int init_gc_type; /* FG_GC or BG_GC */ + bool no_bg_gc; /* check the space and stop bg_gc */ + bool should_migrate_blocks; /* should migrate blocks */ + bool err_gc_skipped; /* return EAGAIN if GC skipped */ +}; + /* For s_flag in struct f2fs_sb_info */ enum { SBI_IS_DIRTY, /* dirty flag for checkpoint */ @@ -3761,8 +3769,7 @@ extern const struct iomap_ops f2fs_iomap_ops; int f2fs_start_gc_thread(struct f2fs_sb_info *sbi); void f2fs_stop_gc_thread(struct f2fs_sb_info *sbi); block_t f2fs_start_bidx_of_node(unsigned int node_ofs, struct inode *inode); -int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background, bool force, - unsigned int segno); +int f2fs_gc(struct f2fs_sb_info *sbi, struct f2fs_gc_control *gc_control); void f2fs_build_gc_manager(struct f2fs_sb_info *sbi); int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count); int __init f2fs_create_garbage_collection_cache(void); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 6da8a663de7b..d0547bef0851 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1647,6 +1647,10 @@ static int expand_inode_data(struct inode *inode, loff_t offset, struct f2fs_map_blocks map = { .m_next_pgofs = NULL, .m_next_extent = NULL, .m_seg_type = NO_CHECK_TYPE, .m_may_create = true }; + struct f2fs_gc_control gc_control = { .victim_segno = NULL_SEGNO, + .init_gc_type = FG_GC, + .should_migrate_blocks = false, + .err_gc_skipped = true }; pgoff_t pg_start, pg_end; loff_t new_size = i_size_read(inode); loff_t off_end; @@ -1684,7 +1688,7 @@ next_alloc: if (has_not_enough_free_secs(sbi, 0, GET_SEC_FROM_SEG(sbi, overprovision_segments(sbi)))) { f2fs_down_write(&sbi->gc_lock); - err = f2fs_gc(sbi, true, false, false, NULL_SEGNO); + err = f2fs_gc(sbi, &gc_control); if (err && err != -ENODATA) goto out_err; } @@ -2344,6 +2348,9 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_gc_control gc_control = { .victim_segno = NULL_SEGNO, + .no_bg_gc = false, + .should_migrate_blocks = false }; __u32 sync; int ret; @@ -2369,7 +2376,9 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg) f2fs_down_write(&sbi->gc_lock); } - ret = f2fs_gc(sbi, sync, true, false, NULL_SEGNO); + gc_control.init_gc_type = sync ? FG_GC : BG_GC; + gc_control.err_gc_skipped = sync; + ret = f2fs_gc(sbi, &gc_control); out: mnt_drop_write_file(filp); return ret; @@ -2378,6 +2387,11 @@ out: static int __f2fs_ioc_gc_range(struct file *filp, struct f2fs_gc_range *range) { struct f2fs_sb_info *sbi = F2FS_I_SB(file_inode(filp)); + struct f2fs_gc_control gc_control = { + .init_gc_type = range->sync ? 
FG_GC : BG_GC, + .no_bg_gc = false, + .should_migrate_blocks = false, + .err_gc_skipped = range->sync }; u64 end; int ret; @@ -2405,8 +2419,8 @@ do_more: f2fs_down_write(&sbi->gc_lock); } - ret = f2fs_gc(sbi, range->sync, true, false, - GET_SEGNO(sbi, range->start)); + gc_control.victim_segno = GET_SEGNO(sbi, range->start); + ret = f2fs_gc(sbi, &gc_control); if (ret) { if (ret == -EBUSY) ret = -EAGAIN; @@ -2820,6 +2834,10 @@ static int f2fs_ioc_flush_device(struct file *filp, unsigned long arg) unsigned int start_segno = 0, end_segno = 0; unsigned int dev_start_segno = 0, dev_end_segno = 0; struct f2fs_flush_device range; + struct f2fs_gc_control gc_control = { + .init_gc_type = FG_GC, + .should_migrate_blocks = true, + .err_gc_skipped = true }; int ret; if (!capable(CAP_SYS_ADMIN)) @@ -2863,7 +2881,9 @@ static int f2fs_ioc_flush_device(struct file *filp, unsigned long arg) sm->last_victim[GC_CB] = end_segno + 1; sm->last_victim[GC_GREEDY] = end_segno + 1; sm->last_victim[ALLOC_NEXT] = end_segno + 1; - ret = f2fs_gc(sbi, true, true, true, start_segno); + + gc_control.victim_segno = start_segno; + ret = f2fs_gc(sbi, &gc_control); if (ret == -EAGAIN) ret = 0; else if (ret < 0) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index ba8e93e517be..f3d58d154240 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -35,6 +35,9 @@ static int gc_thread_func(void *data) wait_queue_head_t *wq = &sbi->gc_thread->gc_wait_queue_head; wait_queue_head_t *fggc_wq = &sbi->gc_thread->fggc_wq; unsigned int wait_ms; + struct f2fs_gc_control gc_control = { + .victim_segno = NULL_SEGNO, + .should_migrate_blocks = false }; wait_ms = gc_th->min_sleep_time; @@ -141,8 +144,12 @@ do_gc: if (foreground) sync_mode = false; + gc_control.init_gc_type = sync_mode ? FG_GC : BG_GC; + gc_control.no_bg_gc = foreground; + gc_control.err_gc_skipped = sync_mode; + /* if return value is not zero, no victim was selected */ - if (f2fs_gc(sbi, sync_mode, !foreground, false, NULL_SEGNO)) + if (f2fs_gc(sbi, &gc_control)) wait_ms = gc_th->no_gc_sleep_time; if (foreground) @@ -1740,21 +1747,20 @@ skip: return seg_freed; } -int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, - bool background, bool force, unsigned int segno) +int f2fs_gc(struct f2fs_sb_info *sbi, struct f2fs_gc_control *gc_control) { - int gc_type = sync ? FG_GC : BG_GC; + int gc_type = gc_control->init_gc_type; + unsigned int segno = gc_control->victim_segno; int sec_freed = 0, seg_freed = 0, total_freed = 0; int ret = 0; struct cp_control cpc; - unsigned int init_segno = segno; struct gc_inode_list gc_list = { .ilist = LIST_HEAD_INIT(gc_list.ilist), .iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS), }; unsigned int skipped_round = 0, round = 0; - trace_f2fs_gc_begin(sbi->sb, sync, background, + trace_f2fs_gc_begin(sbi->sb, gc_type, gc_control->no_bg_gc, get_pages(sbi, F2FS_DIRTY_NODES), get_pages(sbi, F2FS_DIRTY_DENTS), get_pages(sbi, F2FS_DIRTY_IMETA), @@ -1781,8 +1787,7 @@ gc_more: * threshold, we can make them free by checkpoint. Then, we * secure free segments which doesn't need fggc any more. */ - if (prefree_segments(sbi) && - !is_sbi_flag_set(sbi, SBI_CP_DISABLED)) { + if (prefree_segments(sbi)) { ret = f2fs_write_checkpoint(sbi, &cpc); if (ret) goto stop; @@ -1792,7 +1797,7 @@ gc_more: } /* f2fs_balance_fs doesn't need to do BG_GC in critical path. 
*/ - if (gc_type == BG_GC && !background) { + if (gc_type == BG_GC && gc_control->no_bg_gc) { ret = -EINVAL; goto stop; } @@ -1808,45 +1813,48 @@ retry: goto stop; } - seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type, force); - if (gc_type == FG_GC && - seg_freed == f2fs_usable_segs_in_sec(sbi, segno)) - sec_freed++; + seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type, + gc_control->should_migrate_blocks); total_freed += seg_freed; - if (gc_type == FG_GC) { - if (sbi->skipped_gc_rwsem) - skipped_round++; - round++; - } + if (seg_freed == f2fs_usable_segs_in_sec(sbi, segno)) + sec_freed++; if (gc_type == FG_GC) sbi->cur_victim_sec = NULL_SEGNO; - if (sync) + if (gc_control->init_gc_type == FG_GC) goto stop; - if (!has_not_enough_free_secs(sbi, sec_freed, 0)) + if (!has_not_enough_free_secs(sbi, + (gc_type == FG_GC) ? sec_freed : 0, 0)) goto stop; - if (skipped_round <= MAX_SKIP_GC_COUNT || skipped_round * 2 < round) { - - /* Write checkpoint to reclaim prefree segments */ - if (free_sections(sbi) < NR_CURSEG_PERSIST_TYPE && - prefree_segments(sbi) && - !is_sbi_flag_set(sbi, SBI_CP_DISABLED)) { + /* FG_GC stops GC by skip_count */ + if (gc_type == FG_GC) { + if (sbi->skipped_gc_rwsem) + skipped_round++; + round++; + if (skipped_round > MAX_SKIP_GC_COUNT && + skipped_round * 2 >= round) { ret = f2fs_write_checkpoint(sbi, &cpc); - if (ret) - goto stop; + goto stop; } - segno = NULL_SEGNO; - goto gc_more; } - if (gc_type == FG_GC && !is_sbi_flag_set(sbi, SBI_CP_DISABLED)) + + /* Write checkpoint to reclaim prefree segments */ + if (free_sections(sbi) < NR_CURSEG_PERSIST_TYPE && + prefree_segments(sbi)) { ret = f2fs_write_checkpoint(sbi, &cpc); + if (ret) + goto stop; + } + segno = NULL_SEGNO; + goto gc_more; + stop: SIT_I(sbi)->last_victim[ALLOC_NEXT] = 0; - SIT_I(sbi)->last_victim[FLUSH_DEVICE] = init_segno; + SIT_I(sbi)->last_victim[FLUSH_DEVICE] = gc_control->victim_segno; if (gc_type == FG_GC) f2fs_unpin_all_sections(sbi, true); @@ -1864,7 +1872,7 @@ stop: put_gc_inode(&gc_list); - if (sync && !ret) + if (gc_control->err_gc_skipped && !ret) ret = sec_freed ? 
0 : -EAGAIN; return ret; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 47934995e2ca..8b4f2b1d2cca 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -399,8 +399,14 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) io_schedule(); finish_wait(&sbi->gc_thread->fggc_wq, &wait); } else { + struct f2fs_gc_control gc_control = { + .victim_segno = NULL_SEGNO, + .init_gc_type = BG_GC, + .no_bg_gc = true, + .should_migrate_blocks = false, + .err_gc_skipped = false }; f2fs_down_write(&sbi->gc_lock); - f2fs_gc(sbi, false, false, false, NULL_SEGNO); + f2fs_gc(sbi, &gc_control); } } } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 8c81dd324297..a28c27eed6d0 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2076,8 +2076,14 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) sbi->gc_mode = GC_URGENT_HIGH; while (!f2fs_time_over(sbi, DISABLE_TIME)) { + struct f2fs_gc_control gc_control = { + .victim_segno = NULL_SEGNO, + .init_gc_type = FG_GC, + .should_migrate_blocks = false, + .err_gc_skipped = true }; + f2fs_down_write(&sbi->gc_lock); - err = f2fs_gc(sbi, true, false, false, NULL_SEGNO); + err = f2fs_gc(sbi, &gc_control); if (err == -ENODATA) { err = 0; break; diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 7e915dbf3674..54ec9e543f09 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -644,19 +644,19 @@ TRACE_EVENT(f2fs_background_gc, TRACE_EVENT(f2fs_gc_begin, - TP_PROTO(struct super_block *sb, bool sync, bool background, + TP_PROTO(struct super_block *sb, int gc_type, bool no_bg_gc, long long dirty_nodes, long long dirty_dents, long long dirty_imeta, unsigned int free_sec, unsigned int free_seg, int reserved_seg, unsigned int prefree_seg), - TP_ARGS(sb, sync, background, dirty_nodes, dirty_dents, dirty_imeta, + TP_ARGS(sb, gc_type, no_bg_gc, dirty_nodes, dirty_dents, dirty_imeta, free_sec, free_seg, reserved_seg, prefree_seg), TP_STRUCT__entry( __field(dev_t, dev) - __field(bool, sync) - __field(bool, background) + __field(int, gc_type) + __field(bool, no_bg_gc) __field(long long, dirty_nodes) __field(long long, dirty_dents) __field(long long, dirty_imeta) @@ -668,8 +668,8 @@ TRACE_EVENT(f2fs_gc_begin, TP_fast_assign( __entry->dev = sb->s_dev; - __entry->sync = sync; - __entry->background = background; + __entry->gc_type = gc_type; + __entry->no_bg_gc = no_bg_gc; __entry->dirty_nodes = dirty_nodes; __entry->dirty_dents = dirty_dents; __entry->dirty_imeta = dirty_imeta; @@ -679,12 +679,12 @@ TRACE_EVENT(f2fs_gc_begin, __entry->prefree_seg = prefree_seg; ), - TP_printk("dev = (%d,%d), sync = %d, background = %d, nodes = %lld, " + TP_printk("dev = (%d,%d), gc_type = %s, no_background_GC = %d, nodes = %lld, " "dents = %lld, imeta = %lld, free_sec:%u, free_seg:%u, " "rsv_seg:%d, prefree_seg:%u", show_dev(__entry->dev), - __entry->sync, - __entry->background, + show_gc_type(__entry->gc_type), + (__entry->gc_type == BG_GC) ? __entry->no_bg_gc : -1, __entry->dirty_nodes, __entry->dirty_dents, __entry->dirty_imeta, From c58d7c55de8bf7afd25d13d6eb8ef68782a51be9 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 6 May 2022 14:23:27 -0700 Subject: [PATCH 126/334] f2fs: keep wait_ms if EAGAIN happens In f2fs_gc thread, let's keep wait_ms when sec_freed was zero. 
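For illustration only, here is a sketch of the resulting gc_thread_func()
behaviour; it restates the hunks below rather than adding anything new.
With err_gc_skipped left false, a pass that frees no section no longer
returns -EAGAIN, so the thread keeps its current wait_ms instead of
resetting it to no_gc_sleep_time:

	struct f2fs_gc_control gc_control = {
		.victim_segno = NULL_SEGNO,
		.should_migrate_blocks = false,
		.err_gc_skipped = false };

	/* in the do_gc loop: only a non-zero return (no victim selected)
	 * resets the sleep time */
	if (f2fs_gc(sbi, &gc_control))
		wait_ms = gc_th->no_gc_sleep_time;
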
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index f3d58d154240..e275b72bc65f 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -37,7 +37,8 @@ static int gc_thread_func(void *data) unsigned int wait_ms; struct f2fs_gc_control gc_control = { .victim_segno = NULL_SEGNO, - .should_migrate_blocks = false }; + .should_migrate_blocks = false, + .err_gc_skipped = false }; wait_ms = gc_th->min_sleep_time; @@ -146,7 +147,6 @@ do_gc: gc_control.init_gc_type = sync_mode ? FG_GC : BG_GC; gc_control.no_bg_gc = foreground; - gc_control.err_gc_skipped = sync_mode; /* if return value is not zero, no victim was selected */ if (f2fs_gc(sbi, &gc_control)) From dc15f82f5329ab5daefa692bb80fb085a09ebd86 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 29 Apr 2022 15:46:17 -0300 Subject: [PATCH 127/334] vfio: Delete container_q Now that the iommu core takes care of isolation there is no race between driver attach and container unset. Once iommu_group_release_dma_owner() returns the device can immediately be re-used. Remove this mechanism. Signed-off-by: Jason Gunthorpe Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/0-v1-a1e8791d795b+6b-vfio_container_q_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio.c | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index cc0d337f7b13..8cb7ba03aa16 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -74,7 +74,6 @@ struct vfio_group { struct list_head vfio_next; struct list_head container_next; atomic_t opened; - wait_queue_head_t container_q; enum vfio_group_type type; unsigned int dev_counter; struct kvm *kvm; @@ -363,7 +362,6 @@ static struct vfio_group *vfio_group_alloc(struct iommu_group *iommu_group, refcount_set(&group->users, 1); INIT_LIST_HEAD(&group->device_list); mutex_init(&group->device_lock); - init_waitqueue_head(&group->container_q); group->iommu_group = iommu_group; /* put in vfio_group_release() */ iommu_group_ref_get(iommu_group); @@ -684,23 +682,6 @@ void vfio_unregister_group_dev(struct vfio_device *device) group->dev_counter--; mutex_unlock(&group->device_lock); - /* - * In order to support multiple devices per group, devices can be - * plucked from the group while other devices in the group are still - * in use. The container persists with this group and those remaining - * devices still attached. If the user creates an isolation violation - * by binding this device to another driver while the group is still in - * use, that's their fault. However, in the case of removing the last, - * or potentially the only, device in the group there can be no other - * in-use devices in the group. The user has done their due diligence - * and we should lay no claims to those devices. In order to do that, - * we need to make sure the group is detached from the container. - * Without this stall, we're potentially racing with a user process - * that may attempt to immediately bind this device to another driver. 
- */ - if (list_empty(&group->device_list)) - wait_event(group->container_q, !group->container); - if (group->type == VFIO_NO_IOMMU || group->type == VFIO_EMULATED_IOMMU) iommu_group_remove_device(device->dev); @@ -945,7 +926,6 @@ static void __vfio_group_unset_container(struct vfio_group *group) iommu_group_release_dma_owner(group->iommu_group); group->container = NULL; - wake_up(&group->container_q); list_del(&group->container_next); /* Detaching the last group deprivileges a container, remove iommu */ From 73b0565f19a8fbc18dcf4b9b5c26d1a47a69ab24 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 4 May 2022 16:14:39 -0300 Subject: [PATCH 128/334] kvm/vfio: Move KVM_DEV_VFIO_GROUP_* ioctls into functions To make it easier to read and change in following patches. Reviewed-by: Kevin Tian Reviewed-by: Christoph Hellwig Reviewed-by: Cornelia Huck Reviewed-by: Yi Liu Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/1-v3-f7729924a7ea+25e33-vfio_kvm_no_group_jgg@nvidia.com Signed-off-by: Alex Williamson --- virt/kvm/vfio.c | 275 ++++++++++++++++++++++++++---------------------- 1 file changed, 149 insertions(+), 126 deletions(-) diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c index 8fcbc50221c2..512b3ca00f3f 100644 --- a/virt/kvm/vfio.c +++ b/virt/kvm/vfio.c @@ -181,149 +181,171 @@ static void kvm_vfio_update_coherency(struct kvm_device *dev) mutex_unlock(&kv->lock); } -static int kvm_vfio_set_group(struct kvm_device *dev, long attr, u64 arg) +static int kvm_vfio_group_add(struct kvm_device *dev, unsigned int fd) { struct kvm_vfio *kv = dev->private; struct vfio_group *vfio_group; struct kvm_vfio_group *kvg; - int32_t __user *argp = (int32_t __user *)(unsigned long)arg; struct fd f; - int32_t fd; int ret; + f = fdget(fd); + if (!f.file) + return -EBADF; + + vfio_group = kvm_vfio_group_get_external_user(f.file); + fdput(f); + + if (IS_ERR(vfio_group)) + return PTR_ERR(vfio_group); + + mutex_lock(&kv->lock); + + list_for_each_entry(kvg, &kv->group_list, node) { + if (kvg->vfio_group == vfio_group) { + ret = -EEXIST; + goto err_unlock; + } + } + + kvg = kzalloc(sizeof(*kvg), GFP_KERNEL_ACCOUNT); + if (!kvg) { + ret = -ENOMEM; + goto err_unlock; + } + + list_add_tail(&kvg->node, &kv->group_list); + kvg->vfio_group = vfio_group; + + kvm_arch_start_assignment(dev->kvm); + + mutex_unlock(&kv->lock); + + kvm_vfio_group_set_kvm(vfio_group, dev->kvm); + kvm_vfio_update_coherency(dev); + + return 0; +err_unlock: + mutex_unlock(&kv->lock); + kvm_vfio_group_put_external_user(vfio_group); + return ret; +} + +static int kvm_vfio_group_del(struct kvm_device *dev, unsigned int fd) +{ + struct kvm_vfio *kv = dev->private; + struct kvm_vfio_group *kvg; + struct fd f; + int ret; + + f = fdget(fd); + if (!f.file) + return -EBADF; + + ret = -ENOENT; + + mutex_lock(&kv->lock); + + list_for_each_entry(kvg, &kv->group_list, node) { + if (!kvm_vfio_external_group_match_file(kvg->vfio_group, + f.file)) + continue; + + list_del(&kvg->node); + kvm_arch_end_assignment(dev->kvm); +#ifdef CONFIG_SPAPR_TCE_IOMMU + kvm_spapr_tce_release_vfio_group(dev->kvm, kvg->vfio_group); +#endif + kvm_vfio_group_set_kvm(kvg->vfio_group, NULL); + kvm_vfio_group_put_external_user(kvg->vfio_group); + kfree(kvg); + ret = 0; + break; + } + + mutex_unlock(&kv->lock); + + fdput(f); + + kvm_vfio_update_coherency(dev); + + return ret; +} + +#ifdef CONFIG_SPAPR_TCE_IOMMU +static int kvm_vfio_group_set_spapr_tce(struct kvm_device *dev, + void __user *arg) +{ + struct kvm_vfio_spapr_tce param; + struct kvm_vfio *kv = dev->private; + 
struct vfio_group *vfio_group; + struct kvm_vfio_group *kvg; + struct fd f; + struct iommu_group *grp; + int ret; + + if (copy_from_user(¶m, arg, sizeof(struct kvm_vfio_spapr_tce))) + return -EFAULT; + + f = fdget(param.groupfd); + if (!f.file) + return -EBADF; + + vfio_group = kvm_vfio_group_get_external_user(f.file); + fdput(f); + + if (IS_ERR(vfio_group)) + return PTR_ERR(vfio_group); + + grp = kvm_vfio_group_get_iommu_group(vfio_group); + if (WARN_ON_ONCE(!grp)) { + ret = -EIO; + goto err_put_external; + } + + ret = -ENOENT; + + mutex_lock(&kv->lock); + + list_for_each_entry(kvg, &kv->group_list, node) { + if (kvg->vfio_group != vfio_group) + continue; + + ret = kvm_spapr_tce_attach_iommu_group(dev->kvm, param.tablefd, + grp); + break; + } + + mutex_unlock(&kv->lock); + + iommu_group_put(grp); +err_put_external: + kvm_vfio_group_put_external_user(vfio_group); + return ret; +} +#endif + +static int kvm_vfio_set_group(struct kvm_device *dev, long attr, + void __user *arg) +{ + int32_t __user *argp = arg; + int32_t fd; + switch (attr) { case KVM_DEV_VFIO_GROUP_ADD: if (get_user(fd, argp)) return -EFAULT; - - f = fdget(fd); - if (!f.file) - return -EBADF; - - vfio_group = kvm_vfio_group_get_external_user(f.file); - fdput(f); - - if (IS_ERR(vfio_group)) - return PTR_ERR(vfio_group); - - mutex_lock(&kv->lock); - - list_for_each_entry(kvg, &kv->group_list, node) { - if (kvg->vfio_group == vfio_group) { - mutex_unlock(&kv->lock); - kvm_vfio_group_put_external_user(vfio_group); - return -EEXIST; - } - } - - kvg = kzalloc(sizeof(*kvg), GFP_KERNEL_ACCOUNT); - if (!kvg) { - mutex_unlock(&kv->lock); - kvm_vfio_group_put_external_user(vfio_group); - return -ENOMEM; - } - - list_add_tail(&kvg->node, &kv->group_list); - kvg->vfio_group = vfio_group; - - kvm_arch_start_assignment(dev->kvm); - - mutex_unlock(&kv->lock); - - kvm_vfio_group_set_kvm(vfio_group, dev->kvm); - - kvm_vfio_update_coherency(dev); - - return 0; + return kvm_vfio_group_add(dev, fd); case KVM_DEV_VFIO_GROUP_DEL: if (get_user(fd, argp)) return -EFAULT; + return kvm_vfio_group_del(dev, fd); - f = fdget(fd); - if (!f.file) - return -EBADF; - - ret = -ENOENT; - - mutex_lock(&kv->lock); - - list_for_each_entry(kvg, &kv->group_list, node) { - if (!kvm_vfio_external_group_match_file(kvg->vfio_group, - f.file)) - continue; - - list_del(&kvg->node); - kvm_arch_end_assignment(dev->kvm); #ifdef CONFIG_SPAPR_TCE_IOMMU - kvm_spapr_tce_release_vfio_group(dev->kvm, - kvg->vfio_group); + case KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE: + return kvm_vfio_group_set_spapr_tce(dev, arg); #endif - kvm_vfio_group_set_kvm(kvg->vfio_group, NULL); - kvm_vfio_group_put_external_user(kvg->vfio_group); - kfree(kvg); - ret = 0; - break; - } - - mutex_unlock(&kv->lock); - - fdput(f); - - kvm_vfio_update_coherency(dev); - - return ret; - -#ifdef CONFIG_SPAPR_TCE_IOMMU - case KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE: { - struct kvm_vfio_spapr_tce param; - struct kvm_vfio *kv = dev->private; - struct vfio_group *vfio_group; - struct kvm_vfio_group *kvg; - struct fd f; - struct iommu_group *grp; - - if (copy_from_user(¶m, (void __user *)arg, - sizeof(struct kvm_vfio_spapr_tce))) - return -EFAULT; - - f = fdget(param.groupfd); - if (!f.file) - return -EBADF; - - vfio_group = kvm_vfio_group_get_external_user(f.file); - fdput(f); - - if (IS_ERR(vfio_group)) - return PTR_ERR(vfio_group); - - grp = kvm_vfio_group_get_iommu_group(vfio_group); - if (WARN_ON_ONCE(!grp)) { - kvm_vfio_group_put_external_user(vfio_group); - return -EIO; - } - - ret = -ENOENT; - - mutex_lock(&kv->lock); - - 
list_for_each_entry(kvg, &kv->group_list, node) { - if (kvg->vfio_group != vfio_group) - continue; - - ret = kvm_spapr_tce_attach_iommu_group(dev->kvm, - param.tablefd, grp); - break; - } - - mutex_unlock(&kv->lock); - - iommu_group_put(grp); - kvm_vfio_group_put_external_user(vfio_group); - - return ret; - } -#endif /* CONFIG_SPAPR_TCE_IOMMU */ } return -ENXIO; @@ -334,7 +356,8 @@ static int kvm_vfio_set_attr(struct kvm_device *dev, { switch (attr->group) { case KVM_DEV_VFIO_GROUP: - return kvm_vfio_set_group(dev, attr->attr, attr->addr); + return kvm_vfio_set_group(dev, attr->attr, + u64_to_user_ptr(attr->addr)); } return -ENXIO; From d55d9e7a4572182701ed0b62313b4f22e544e226 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 4 May 2022 16:14:40 -0300 Subject: [PATCH 129/334] kvm/vfio: Store the struct file in the kvm_vfio_group Following patches will change the APIs to use the struct file as the handle instead of the vfio_group, so hang on to a reference to it with the same duration of as the vfio_group. Reviewed-by: Kevin Tian Reviewed-by: Christoph Hellwig Reviewed-by: Yi Liu Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/2-v3-f7729924a7ea+25e33-vfio_kvm_no_group_jgg@nvidia.com Signed-off-by: Alex Williamson --- virt/kvm/vfio.c | 59 ++++++++++++++++++++++++------------------------- 1 file changed, 29 insertions(+), 30 deletions(-) diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c index 512b3ca00f3f..3bd2615154d0 100644 --- a/virt/kvm/vfio.c +++ b/virt/kvm/vfio.c @@ -23,6 +23,7 @@ struct kvm_vfio_group { struct list_head node; + struct file *file; struct vfio_group *vfio_group; }; @@ -186,23 +187,17 @@ static int kvm_vfio_group_add(struct kvm_device *dev, unsigned int fd) struct kvm_vfio *kv = dev->private; struct vfio_group *vfio_group; struct kvm_vfio_group *kvg; - struct fd f; + struct file *filp; int ret; - f = fdget(fd); - if (!f.file) + filp = fget(fd); + if (!filp) return -EBADF; - vfio_group = kvm_vfio_group_get_external_user(f.file); - fdput(f); - - if (IS_ERR(vfio_group)) - return PTR_ERR(vfio_group); - mutex_lock(&kv->lock); list_for_each_entry(kvg, &kv->group_list, node) { - if (kvg->vfio_group == vfio_group) { + if (kvg->file == filp) { ret = -EEXIST; goto err_unlock; } @@ -214,6 +209,13 @@ static int kvm_vfio_group_add(struct kvm_device *dev, unsigned int fd) goto err_unlock; } + vfio_group = kvm_vfio_group_get_external_user(filp); + if (IS_ERR(vfio_group)) { + ret = PTR_ERR(vfio_group); + goto err_free; + } + + kvg->file = filp; list_add_tail(&kvg->node, &kv->group_list); kvg->vfio_group = vfio_group; @@ -225,9 +227,11 @@ static int kvm_vfio_group_add(struct kvm_device *dev, unsigned int fd) kvm_vfio_update_coherency(dev); return 0; +err_free: + kfree(kvg); err_unlock: mutex_unlock(&kv->lock); - kvm_vfio_group_put_external_user(vfio_group); + fput(filp); return ret; } @@ -258,6 +262,7 @@ static int kvm_vfio_group_del(struct kvm_device *dev, unsigned int fd) #endif kvm_vfio_group_set_kvm(kvg->vfio_group, NULL); kvm_vfio_group_put_external_user(kvg->vfio_group); + fput(kvg->file); kfree(kvg); ret = 0; break; @@ -278,10 +283,8 @@ static int kvm_vfio_group_set_spapr_tce(struct kvm_device *dev, { struct kvm_vfio_spapr_tce param; struct kvm_vfio *kv = dev->private; - struct vfio_group *vfio_group; struct kvm_vfio_group *kvg; struct fd f; - struct iommu_group *grp; int ret; if (copy_from_user(¶m, arg, sizeof(struct kvm_vfio_spapr_tce))) @@ -291,36 +294,31 @@ static int kvm_vfio_group_set_spapr_tce(struct kvm_device *dev, if (!f.file) return -EBADF; - 
vfio_group = kvm_vfio_group_get_external_user(f.file); - fdput(f); - - if (IS_ERR(vfio_group)) - return PTR_ERR(vfio_group); - - grp = kvm_vfio_group_get_iommu_group(vfio_group); - if (WARN_ON_ONCE(!grp)) { - ret = -EIO; - goto err_put_external; - } - ret = -ENOENT; mutex_lock(&kv->lock); list_for_each_entry(kvg, &kv->group_list, node) { - if (kvg->vfio_group != vfio_group) + struct iommu_group *grp; + + if (kvg->file != f.file) continue; + grp = kvm_vfio_group_get_iommu_group(kvg->vfio_group); + if (WARN_ON_ONCE(!grp)) { + ret = -EIO; + goto err_fdput; + } + ret = kvm_spapr_tce_attach_iommu_group(dev->kvm, param.tablefd, grp); + iommu_group_put(grp); break; } mutex_unlock(&kv->lock); - - iommu_group_put(grp); -err_put_external: - kvm_vfio_group_put_external_user(vfio_group); +err_fdput: + fdput(f); return ret; } #endif @@ -394,6 +392,7 @@ static void kvm_vfio_destroy(struct kvm_device *dev) #endif kvm_vfio_group_set_kvm(kvg->vfio_group, NULL); kvm_vfio_group_put_external_user(kvg->vfio_group); + fput(kvg->file); list_del(&kvg->node); kfree(kvg); kvm_arch_end_assignment(dev->kvm); From 50d63b5bbfd12262069ad062611cd5e69c5e9e05 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 4 May 2022 16:14:41 -0300 Subject: [PATCH 130/334] vfio: Change vfio_external_user_iommu_id() to vfio_file_iommu_group() The only caller wants to get a pointer to the struct iommu_group associated with the VFIO group file. Instead of returning the group ID then searching sysfs for that string to get the struct iommu_group just directly return the iommu_group pointer already held by the vfio_group struct. It already has a safe lifetime due to the struct file kref, the vfio_group and thus the iommu_group cannot be destroyed while the group file is open. Reviewed-by: Kevin Tian Reviewed-by: Christoph Hellwig Reviewed-by: Yi Liu Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/3-v3-f7729924a7ea+25e33-vfio_kvm_no_group_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio.c | 21 ++++++++++++++------- include/linux/vfio.h | 2 +- virt/kvm/vfio.c | 37 ++++++++++++------------------------- 3 files changed, 27 insertions(+), 33 deletions(-) diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index 8cb7ba03aa16..25f90f3773de 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -1653,10 +1653,7 @@ static const struct file_operations vfio_device_fops = { * increments the container user counter to prevent * the VFIO group from disposal before KVM exits. * - * 3. The external user calls vfio_external_user_iommu_id() - * to know an IOMMU ID. - * - * 4. When the external KVM finishes, it calls + * 3. When the external KVM finishes, it calls * vfio_group_put_external_user() to release the VFIO group. * This call decrements the container user counter. */ @@ -1697,11 +1694,21 @@ bool vfio_external_group_match_file(struct vfio_group *test_group, } EXPORT_SYMBOL_GPL(vfio_external_group_match_file); -int vfio_external_user_iommu_id(struct vfio_group *group) +/** + * vfio_file_iommu_group - Return the struct iommu_group for the vfio group file + * @file: VFIO group file + * + * The returned iommu_group is valid as long as a ref is held on the file. 
+ */ +struct iommu_group *vfio_file_iommu_group(struct file *file) { - return iommu_group_id(group->iommu_group); + struct vfio_group *group = file->private_data; + + if (file->f_op != &vfio_group_fops) + return NULL; + return group->iommu_group; } -EXPORT_SYMBOL_GPL(vfio_external_user_iommu_id); +EXPORT_SYMBOL_GPL(vfio_file_iommu_group); long vfio_external_check_extension(struct vfio_group *group, unsigned long arg) { diff --git a/include/linux/vfio.h b/include/linux/vfio.h index be1e7db4a314..d8c34c8490cb 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -140,7 +140,7 @@ extern struct vfio_group *vfio_group_get_external_user(struct file *filep); extern void vfio_group_put_external_user(struct vfio_group *group); extern bool vfio_external_group_match_file(struct vfio_group *group, struct file *filep); -extern int vfio_external_user_iommu_id(struct vfio_group *group); +extern struct iommu_group *vfio_file_iommu_group(struct file *file); extern long vfio_external_check_extension(struct vfio_group *group, unsigned long arg); diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c index 3bd2615154d0..9b7384dde158 100644 --- a/virt/kvm/vfio.c +++ b/virt/kvm/vfio.c @@ -108,43 +108,31 @@ static bool kvm_vfio_group_is_coherent(struct vfio_group *vfio_group) } #ifdef CONFIG_SPAPR_TCE_IOMMU -static int kvm_vfio_external_user_iommu_id(struct vfio_group *vfio_group) +static struct iommu_group *kvm_vfio_file_iommu_group(struct file *file) { - int (*fn)(struct vfio_group *); - int ret = -EINVAL; + struct iommu_group *(*fn)(struct file *file); + struct iommu_group *ret; - fn = symbol_get(vfio_external_user_iommu_id); + fn = symbol_get(vfio_file_iommu_group); if (!fn) - return ret; + return NULL; - ret = fn(vfio_group); + ret = fn(file); - symbol_put(vfio_external_user_iommu_id); + symbol_put(vfio_file_iommu_group); return ret; } -static struct iommu_group *kvm_vfio_group_get_iommu_group( - struct vfio_group *group) -{ - int group_id = kvm_vfio_external_user_iommu_id(group); - - if (group_id < 0) - return NULL; - - return iommu_group_get_by_id(group_id); -} - static void kvm_spapr_tce_release_vfio_group(struct kvm *kvm, - struct vfio_group *vfio_group) + struct kvm_vfio_group *kvg) { - struct iommu_group *grp = kvm_vfio_group_get_iommu_group(vfio_group); + struct iommu_group *grp = kvm_vfio_file_iommu_group(kvg->file); if (WARN_ON_ONCE(!grp)) return; kvm_spapr_tce_release_iommu_group(kvm, grp); - iommu_group_put(grp); } #endif @@ -258,7 +246,7 @@ static int kvm_vfio_group_del(struct kvm_device *dev, unsigned int fd) list_del(&kvg->node); kvm_arch_end_assignment(dev->kvm); #ifdef CONFIG_SPAPR_TCE_IOMMU - kvm_spapr_tce_release_vfio_group(dev->kvm, kvg->vfio_group); + kvm_spapr_tce_release_vfio_group(dev->kvm, kvg); #endif kvm_vfio_group_set_kvm(kvg->vfio_group, NULL); kvm_vfio_group_put_external_user(kvg->vfio_group); @@ -304,7 +292,7 @@ static int kvm_vfio_group_set_spapr_tce(struct kvm_device *dev, if (kvg->file != f.file) continue; - grp = kvm_vfio_group_get_iommu_group(kvg->vfio_group); + grp = kvm_vfio_file_iommu_group(kvg->file); if (WARN_ON_ONCE(!grp)) { ret = -EIO; goto err_fdput; @@ -312,7 +300,6 @@ static int kvm_vfio_group_set_spapr_tce(struct kvm_device *dev, ret = kvm_spapr_tce_attach_iommu_group(dev->kvm, param.tablefd, grp); - iommu_group_put(grp); break; } @@ -388,7 +375,7 @@ static void kvm_vfio_destroy(struct kvm_device *dev) list_for_each_entry_safe(kvg, tmp, &kv->group_list, node) { #ifdef CONFIG_SPAPR_TCE_IOMMU - kvm_spapr_tce_release_vfio_group(dev->kvm, kvg->vfio_group); + 
kvm_spapr_tce_release_vfio_group(dev->kvm, kvg); #endif kvm_vfio_group_set_kvm(kvg->vfio_group, NULL); kvm_vfio_group_put_external_user(kvg->vfio_group); From c38ff5b0c373fbbd6a249eb461ffd4ae0f9dbfa0 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 4 May 2022 16:14:42 -0300 Subject: [PATCH 131/334] vfio: Remove vfio_external_group_match_file() vfio_group_fops_open() ensures there is only ever one struct file open for any struct vfio_group at any time: /* Do we need multiple instances of the group open? Seems not. */ opened = atomic_cmpxchg(&group->opened, 0, 1); if (opened) { vfio_group_put(group); return -EBUSY; Therefor the struct file * can be used directly to search the list of VFIO groups that KVM keeps instead of using the vfio_external_group_match_file() callback to try to figure out if the passed in FD matches the list or not. Delete vfio_external_group_match_file(). Reviewed-by: Kevin Tian Reviewed-by: Christoph Hellwig Reviewed-by: Yi Liu Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/4-v3-f7729924a7ea+25e33-vfio_kvm_no_group_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio.c | 9 --------- include/linux/vfio.h | 2 -- virt/kvm/vfio.c | 19 +------------------ 3 files changed, 1 insertion(+), 29 deletions(-) diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index 25f90f3773de..4805e18f2272 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -1685,15 +1685,6 @@ void vfio_group_put_external_user(struct vfio_group *group) } EXPORT_SYMBOL_GPL(vfio_group_put_external_user); -bool vfio_external_group_match_file(struct vfio_group *test_group, - struct file *filep) -{ - struct vfio_group *group = filep->private_data; - - return (filep->f_op == &vfio_group_fops) && (group == test_group); -} -EXPORT_SYMBOL_GPL(vfio_external_group_match_file); - /** * vfio_file_iommu_group - Return the struct iommu_group for the vfio group file * @file: VFIO group file diff --git a/include/linux/vfio.h b/include/linux/vfio.h index d8c34c8490cb..df0adecdf1ea 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -138,8 +138,6 @@ int vfio_mig_get_next_state(struct vfio_device *device, */ extern struct vfio_group *vfio_group_get_external_user(struct file *filep); extern void vfio_group_put_external_user(struct vfio_group *group); -extern bool vfio_external_group_match_file(struct vfio_group *group, - struct file *filep); extern struct iommu_group *vfio_file_iommu_group(struct file *file); extern long vfio_external_check_extension(struct vfio_group *group, unsigned long arg); diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c index 9b7384dde158..0b84916c3f71 100644 --- a/virt/kvm/vfio.c +++ b/virt/kvm/vfio.c @@ -49,22 +49,6 @@ static struct vfio_group *kvm_vfio_group_get_external_user(struct file *filep) return vfio_group; } -static bool kvm_vfio_external_group_match_file(struct vfio_group *group, - struct file *filep) -{ - bool ret, (*fn)(struct vfio_group *, struct file *); - - fn = symbol_get(vfio_external_group_match_file); - if (!fn) - return false; - - ret = fn(group, filep); - - symbol_put(vfio_external_group_match_file); - - return ret; -} - static void kvm_vfio_group_put_external_user(struct vfio_group *vfio_group) { void (*fn)(struct vfio_group *); @@ -239,8 +223,7 @@ static int kvm_vfio_group_del(struct kvm_device *dev, unsigned int fd) mutex_lock(&kv->lock); list_for_each_entry(kvg, &kv->group_list, node) { - if (!kvm_vfio_external_group_match_file(kvg->vfio_group, - f.file)) + if (kvg->file != f.file) continue; list_del(&kvg->node); From 
a905ad043f32bbb0c35d4325036397f20f30c8a9 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 4 May 2022 16:14:43 -0300 Subject: [PATCH 132/334] vfio: Change vfio_external_check_extension() to vfio_file_enforced_coherent() Instead of a general extension check change the function into a limited test if the iommu_domain has enforced coherency, which is the only thing kvm needs to query. Make the new op self contained by properly refcounting the container before touching it. Reviewed-by: Christoph Hellwig Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/5-v3-f7729924a7ea+25e33-vfio_kvm_no_group_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio.c | 30 +++++++++++++++++++++++++++--- include/linux/vfio.h | 3 +-- virt/kvm/vfio.c | 16 ++++++++-------- 3 files changed, 36 insertions(+), 13 deletions(-) diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index 4805e18f2272..c2a360437e6c 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -1701,11 +1701,35 @@ struct iommu_group *vfio_file_iommu_group(struct file *file) } EXPORT_SYMBOL_GPL(vfio_file_iommu_group); -long vfio_external_check_extension(struct vfio_group *group, unsigned long arg) +/** + * vfio_file_enforced_coherent - True if the DMA associated with the VFIO file + * is always CPU cache coherent + * @file: VFIO group file + * + * Enforced coherency means that the IOMMU ignores things like the PCIe no-snoop + * bit in DMA transactions. A return of false indicates that the user has + * rights to access additional instructions such as wbinvd on x86. + */ +bool vfio_file_enforced_coherent(struct file *file) { - return vfio_ioctl_check_extension(group->container, arg); + struct vfio_group *group = file->private_data; + bool ret; + + if (file->f_op != &vfio_group_fops) + return true; + + /* + * Since the coherency state is determined only once a container is + * attached the user must do so before they can prove they have + * permission. 
+ */ + if (vfio_group_add_container_user(group)) + return true; + ret = vfio_ioctl_check_extension(group->container, VFIO_DMA_CC_IOMMU); + vfio_group_try_dissolve_container(group); + return ret; } -EXPORT_SYMBOL_GPL(vfio_external_check_extension); +EXPORT_SYMBOL_GPL(vfio_file_enforced_coherent); /* * Sub-module support diff --git a/include/linux/vfio.h b/include/linux/vfio.h index df0adecdf1ea..7cc374da4839 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -139,8 +139,7 @@ int vfio_mig_get_next_state(struct vfio_device *device, extern struct vfio_group *vfio_group_get_external_user(struct file *filep); extern void vfio_group_put_external_user(struct vfio_group *group); extern struct iommu_group *vfio_file_iommu_group(struct file *file); -extern long vfio_external_check_extension(struct vfio_group *group, - unsigned long arg); +extern bool vfio_file_enforced_coherent(struct file *file); #define VFIO_PIN_PAGES_MAX_ENTRIES (PAGE_SIZE/sizeof(unsigned long)) diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c index 0b84916c3f71..d44cb3efb0b9 100644 --- a/virt/kvm/vfio.c +++ b/virt/kvm/vfio.c @@ -75,20 +75,20 @@ static void kvm_vfio_group_set_kvm(struct vfio_group *group, struct kvm *kvm) symbol_put(vfio_group_set_kvm); } -static bool kvm_vfio_group_is_coherent(struct vfio_group *vfio_group) +static bool kvm_vfio_file_enforced_coherent(struct file *file) { - long (*fn)(struct vfio_group *, unsigned long); - long ret; + bool (*fn)(struct file *file); + bool ret; - fn = symbol_get(vfio_external_check_extension); + fn = symbol_get(vfio_file_enforced_coherent); if (!fn) return false; - ret = fn(vfio_group, VFIO_DMA_CC_IOMMU); + ret = fn(file); - symbol_put(vfio_external_check_extension); + symbol_put(vfio_file_enforced_coherent); - return ret > 0; + return ret; } #ifdef CONFIG_SPAPR_TCE_IOMMU @@ -136,7 +136,7 @@ static void kvm_vfio_update_coherency(struct kvm_device *dev) mutex_lock(&kv->lock); list_for_each_entry(kvg, &kv->group_list, node) { - if (!kvm_vfio_group_is_coherent(kvg->vfio_group)) { + if (!kvm_vfio_file_enforced_coherent(kvg->file)) { noncoherent = true; break; } From ba70a89f3c2a8279809ea0fc7684857c91938b8a Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 4 May 2022 16:14:44 -0300 Subject: [PATCH 133/334] vfio: Change vfio_group_set_kvm() to vfio_file_set_kvm() Just change the argument from struct vfio_group to struct file *. Reviewed-by: Kevin Tian Reviewed-by: Christoph Hellwig Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/6-v3-f7729924a7ea+25e33-vfio_kvm_no_group_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio.c | 29 +++++++++++++++++++++-------- include/linux/vfio.h | 5 +++-- virt/kvm/vfio.c | 16 ++++++++-------- 3 files changed, 32 insertions(+), 18 deletions(-) diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index c2a360437e6c..a0f73bd8e53f 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -1731,6 +1731,27 @@ bool vfio_file_enforced_coherent(struct file *file) } EXPORT_SYMBOL_GPL(vfio_file_enforced_coherent); +/** + * vfio_file_set_kvm - Link a kvm with VFIO drivers + * @file: VFIO group file + * @kvm: KVM to link + * + * The kvm pointer will be forwarded to all the vfio_device's attached to the + * VFIO file via the VFIO_GROUP_NOTIFY_SET_KVM notifier. 
+ */ +void vfio_file_set_kvm(struct file *file, struct kvm *kvm) +{ + struct vfio_group *group = file->private_data; + + if (file->f_op != &vfio_group_fops) + return; + + group->kvm = kvm; + blocking_notifier_call_chain(&group->notifier, + VFIO_GROUP_NOTIFY_SET_KVM, kvm); +} +EXPORT_SYMBOL_GPL(vfio_file_set_kvm); + /* * Sub-module support */ @@ -1999,14 +2020,6 @@ static int vfio_unregister_iommu_notifier(struct vfio_group *group, return ret; } -void vfio_group_set_kvm(struct vfio_group *group, struct kvm *kvm) -{ - group->kvm = kvm; - blocking_notifier_call_chain(&group->notifier, - VFIO_GROUP_NOTIFY_SET_KVM, kvm); -} -EXPORT_SYMBOL_GPL(vfio_group_set_kvm); - static int vfio_register_group_notifier(struct vfio_group *group, unsigned long *events, struct notifier_block *nb) diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 7cc374da4839..b298a26c4f07 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -15,6 +15,8 @@ #include #include +struct kvm; + /* * VFIO devices can be placed in a set, this allows all devices to share this * structure and the VFIO core will provide a lock that is held around @@ -140,6 +142,7 @@ extern struct vfio_group *vfio_group_get_external_user(struct file *filep); extern void vfio_group_put_external_user(struct vfio_group *group); extern struct iommu_group *vfio_file_iommu_group(struct file *file); extern bool vfio_file_enforced_coherent(struct file *file); +extern void vfio_file_set_kvm(struct file *file, struct kvm *kvm); #define VFIO_PIN_PAGES_MAX_ENTRIES (PAGE_SIZE/sizeof(unsigned long)) @@ -170,8 +173,6 @@ extern int vfio_unregister_notifier(struct vfio_device *device, enum vfio_notify_type type, struct notifier_block *nb); -struct kvm; -extern void vfio_group_set_kvm(struct vfio_group *group, struct kvm *kvm); /* * Sub-module helpers diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c index d44cb3efb0b9..b4e1bc22b7c5 100644 --- a/virt/kvm/vfio.c +++ b/virt/kvm/vfio.c @@ -62,17 +62,17 @@ static void kvm_vfio_group_put_external_user(struct vfio_group *vfio_group) symbol_put(vfio_group_put_external_user); } -static void kvm_vfio_group_set_kvm(struct vfio_group *group, struct kvm *kvm) +static void kvm_vfio_file_set_kvm(struct file *file, struct kvm *kvm) { - void (*fn)(struct vfio_group *, struct kvm *); + void (*fn)(struct file *file, struct kvm *kvm); - fn = symbol_get(vfio_group_set_kvm); + fn = symbol_get(vfio_file_set_kvm); if (!fn) return; - fn(group, kvm); + fn(file, kvm); - symbol_put(vfio_group_set_kvm); + symbol_put(vfio_file_set_kvm); } static bool kvm_vfio_file_enforced_coherent(struct file *file) @@ -195,7 +195,7 @@ static int kvm_vfio_group_add(struct kvm_device *dev, unsigned int fd) mutex_unlock(&kv->lock); - kvm_vfio_group_set_kvm(vfio_group, dev->kvm); + kvm_vfio_file_set_kvm(kvg->file, dev->kvm); kvm_vfio_update_coherency(dev); return 0; @@ -231,7 +231,7 @@ static int kvm_vfio_group_del(struct kvm_device *dev, unsigned int fd) #ifdef CONFIG_SPAPR_TCE_IOMMU kvm_spapr_tce_release_vfio_group(dev->kvm, kvg); #endif - kvm_vfio_group_set_kvm(kvg->vfio_group, NULL); + kvm_vfio_file_set_kvm(kvg->file, NULL); kvm_vfio_group_put_external_user(kvg->vfio_group); fput(kvg->file); kfree(kvg); @@ -360,7 +360,7 @@ static void kvm_vfio_destroy(struct kvm_device *dev) #ifdef CONFIG_SPAPR_TCE_IOMMU kvm_spapr_tce_release_vfio_group(dev->kvm, kvg); #endif - kvm_vfio_group_set_kvm(kvg->vfio_group, NULL); + kvm_vfio_file_set_kvm(kvg->file, NULL); kvm_vfio_group_put_external_user(kvg->vfio_group); fput(kvg->file); list_del(&kvg->node); From 
3e5449d5f954f537522906dfcb6a76e2b035521f Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 4 May 2022 16:14:45 -0300 Subject: [PATCH 134/334] kvm/vfio: Remove vfio_group from kvm None of the VFIO APIs take in the vfio_group anymore, so we can remove it completely. This has a subtle side effect on the enforced coherency tracking. The vfio_group_get_external_user() was holding on to the container_users which would prevent the iommu_domain and thus the enforced coherency value from changing while the group is registered with kvm. It changes the security proof slightly into 'user must hold a group FD that has a device that cannot enforce DMA coherence'. As opening the group FD, not attaching the container, is the privileged operation this doesn't change the security properties much. On the flip side it paves the way to changing the iommu_domain/container attached to a group at runtime which is something that will be required to support nested translation. Reviewed-by: Kevin Tian Reviewed-by: Christoph Hellwig Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/7-v3-f7729924a7ea+25e33-vfio_kvm_no_group_jgg@nvidia.com Signed-off-by: Alex Williamson --- virt/kvm/vfio.c | 51 ++++++++----------------------------------------- 1 file changed, 8 insertions(+), 43 deletions(-) diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c index b4e1bc22b7c5..8f9f7fffb96a 100644 --- a/virt/kvm/vfio.c +++ b/virt/kvm/vfio.c @@ -24,7 +24,6 @@ struct kvm_vfio_group { struct list_head node; struct file *file; - struct vfio_group *vfio_group; }; struct kvm_vfio { @@ -33,35 +32,6 @@ struct kvm_vfio { bool noncoherent; }; -static struct vfio_group *kvm_vfio_group_get_external_user(struct file *filep) -{ - struct vfio_group *vfio_group; - struct vfio_group *(*fn)(struct file *); - - fn = symbol_get(vfio_group_get_external_user); - if (!fn) - return ERR_PTR(-EINVAL); - - vfio_group = fn(filep); - - symbol_put(vfio_group_get_external_user); - - return vfio_group; -} - -static void kvm_vfio_group_put_external_user(struct vfio_group *vfio_group) -{ - void (*fn)(struct vfio_group *); - - fn = symbol_get(vfio_group_put_external_user); - if (!fn) - return; - - fn(vfio_group); - - symbol_put(vfio_group_put_external_user); -} - static void kvm_vfio_file_set_kvm(struct file *file, struct kvm *kvm) { void (*fn)(struct file *file, struct kvm *kvm); @@ -91,7 +61,6 @@ static bool kvm_vfio_file_enforced_coherent(struct file *file) return ret; } -#ifdef CONFIG_SPAPR_TCE_IOMMU static struct iommu_group *kvm_vfio_file_iommu_group(struct file *file) { struct iommu_group *(*fn)(struct file *file); @@ -108,6 +77,7 @@ static struct iommu_group *kvm_vfio_file_iommu_group(struct file *file) return ret; } +#ifdef CONFIG_SPAPR_TCE_IOMMU static void kvm_spapr_tce_release_vfio_group(struct kvm *kvm, struct kvm_vfio_group *kvg) { @@ -157,7 +127,6 @@ static void kvm_vfio_update_coherency(struct kvm_device *dev) static int kvm_vfio_group_add(struct kvm_device *dev, unsigned int fd) { struct kvm_vfio *kv = dev->private; - struct vfio_group *vfio_group; struct kvm_vfio_group *kvg; struct file *filp; int ret; @@ -166,6 +135,12 @@ static int kvm_vfio_group_add(struct kvm_device *dev, unsigned int fd) if (!filp) return -EBADF; + /* Ensure the FD is a vfio group FD.*/ + if (!kvm_vfio_file_iommu_group(filp)) { + ret = -EINVAL; + goto err_fput; + } + mutex_lock(&kv->lock); list_for_each_entry(kvg, &kv->group_list, node) { @@ -181,15 +156,8 @@ static int kvm_vfio_group_add(struct kvm_device *dev, unsigned int fd) goto err_unlock; } - vfio_group =
kvm_vfio_group_get_external_user(filp); - if (IS_ERR(vfio_group)) { - ret = PTR_ERR(vfio_group); - goto err_free; - } - kvg->file = filp; list_add_tail(&kvg->node, &kv->group_list); - kvg->vfio_group = vfio_group; kvm_arch_start_assignment(dev->kvm); @@ -199,10 +167,9 @@ static int kvm_vfio_group_add(struct kvm_device *dev, unsigned int fd) kvm_vfio_update_coherency(dev); return 0; -err_free: - kfree(kvg); err_unlock: mutex_unlock(&kv->lock); +err_fput: fput(filp); return ret; } @@ -232,7 +199,6 @@ static int kvm_vfio_group_del(struct kvm_device *dev, unsigned int fd) kvm_spapr_tce_release_vfio_group(dev->kvm, kvg); #endif kvm_vfio_file_set_kvm(kvg->file, NULL); - kvm_vfio_group_put_external_user(kvg->vfio_group); fput(kvg->file); kfree(kvg); ret = 0; @@ -361,7 +327,6 @@ static void kvm_vfio_destroy(struct kvm_device *dev) kvm_spapr_tce_release_vfio_group(dev->kvm, kvg); #endif kvm_vfio_file_set_kvm(kvg->file, NULL); - kvm_vfio_group_put_external_user(kvg->vfio_group); fput(kvg->file); list_del(&kvg->node); kfree(kvg); From 6a985ae80befcf2c00e7c889336bfe9e9739e2ef Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 4 May 2022 16:14:46 -0300 Subject: [PATCH 135/334] vfio/pci: Use the struct file as the handle not the vfio_group VFIO PCI does a security check as part of hot reset to prove that the user has permission to manipulate all the devices that will be impacted by the reset. Use a new API vfio_file_has_dev() to perform this security check against the struct file directly and remove the vfio_group from VFIO PCI. Since VFIO PCI was the last user of vfio_group_get_external_user() and vfio_group_put_external_user() remove it as well. Reviewed-by: Kevin Tian Reviewed-by: Christoph Hellwig Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/8-v3-f7729924a7ea+25e33-vfio_kvm_no_group_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_core.c | 42 +++++++++---------- drivers/vfio/vfio.c | 70 ++++++++------------------------ include/linux/vfio.h | 3 +- 3 files changed, 40 insertions(+), 75 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 100ab98c7ff0..05a3aa95ba52 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -556,7 +556,7 @@ static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data) struct vfio_pci_group_info { int count; - struct vfio_group **groups; + struct file **files; }; static bool vfio_pci_dev_below_slot(struct pci_dev *pdev, struct pci_slot *slot) @@ -1018,10 +1018,10 @@ reset_info_exit: } else if (cmd == VFIO_DEVICE_PCI_HOT_RESET) { struct vfio_pci_hot_reset hdr; int32_t *group_fds; - struct vfio_group **groups; + struct file **files; struct vfio_pci_group_info info; bool slot = false; - int group_idx, count = 0, ret = 0; + int file_idx, count = 0, ret = 0; minsz = offsetofend(struct vfio_pci_hot_reset, count); @@ -1054,17 +1054,17 @@ reset_info_exit: return -EINVAL; group_fds = kcalloc(hdr.count, sizeof(*group_fds), GFP_KERNEL); - groups = kcalloc(hdr.count, sizeof(*groups), GFP_KERNEL); - if (!group_fds || !groups) { + files = kcalloc(hdr.count, sizeof(*files), GFP_KERNEL); + if (!group_fds || !files) { kfree(group_fds); - kfree(groups); + kfree(files); return -ENOMEM; } if (copy_from_user(group_fds, (void __user *)(arg + minsz), hdr.count * sizeof(*group_fds))) { kfree(group_fds); - kfree(groups); + kfree(files); return -EFAULT; } @@ -1073,22 +1073,22 @@ reset_info_exit: * user interface and store the group and iommu ID. 
This * ensures the group is held across the reset. */ - for (group_idx = 0; group_idx < hdr.count; group_idx++) { - struct vfio_group *group; - struct fd f = fdget(group_fds[group_idx]); - if (!f.file) { + for (file_idx = 0; file_idx < hdr.count; file_idx++) { + struct file *file = fget(group_fds[file_idx]); + + if (!file) { ret = -EBADF; break; } - group = vfio_group_get_external_user(f.file); - fdput(f); - if (IS_ERR(group)) { - ret = PTR_ERR(group); + /* Ensure the FD is a vfio group FD.*/ + if (!vfio_file_iommu_group(file)) { + fput(file); + ret = -EINVAL; break; } - groups[group_idx] = group; + files[file_idx] = file; } kfree(group_fds); @@ -1098,15 +1098,15 @@ reset_info_exit: goto hot_reset_release; info.count = hdr.count; - info.groups = groups; + info.files = files; ret = vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, &info); hot_reset_release: - for (group_idx--; group_idx >= 0; group_idx--) - vfio_group_put_external_user(groups[group_idx]); + for (file_idx--; file_idx >= 0; file_idx--) + fput(files[file_idx]); - kfree(groups); + kfree(files); return ret; } else if (cmd == VFIO_DEVICE_IOEVENTFD) { struct vfio_device_ioeventfd ioeventfd; @@ -1972,7 +1972,7 @@ static bool vfio_dev_in_groups(struct vfio_pci_core_device *vdev, unsigned int i; for (i = 0; i < groups->count; i++) - if (groups->groups[i] == vdev->vdev.group) + if (vfio_file_has_dev(groups->files[i], &vdev->vdev)) return true; return false; } diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index a0f73bd8e53f..1758d96f43f4 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -1633,58 +1633,6 @@ static const struct file_operations vfio_device_fops = { .mmap = vfio_device_fops_mmap, }; -/* - * External user API, exported by symbols to be linked dynamically. - * - * The protocol includes: - * 1. do normal VFIO init operation: - * - opening a new container; - * - attaching group(s) to it; - * - setting an IOMMU driver for a container. - * When IOMMU is set for a container, all groups in it are - * considered ready to use by an external user. - * - * 2. User space passes a group fd to an external user. - * The external user calls vfio_group_get_external_user() - * to verify that: - * - the group is initialized; - * - IOMMU is set for it. - * If both checks passed, vfio_group_get_external_user() - * increments the container user counter to prevent - * the VFIO group from disposal before KVM exits. - * - * 3. When the external KVM finishes, it calls - * vfio_group_put_external_user() to release the VFIO group. - * This call decrements the container user counter. 
- */ -struct vfio_group *vfio_group_get_external_user(struct file *filep) -{ - struct vfio_group *group = filep->private_data; - int ret; - - if (filep->f_op != &vfio_group_fops) - return ERR_PTR(-EINVAL); - - ret = vfio_group_add_container_user(group); - if (ret) - return ERR_PTR(ret); - - /* - * Since the caller holds the fget on the file group->users must be >= 1 - */ - vfio_group_get(group); - - return group; -} -EXPORT_SYMBOL_GPL(vfio_group_get_external_user); - -void vfio_group_put_external_user(struct vfio_group *group) -{ - vfio_group_try_dissolve_container(group); - vfio_group_put(group); -} -EXPORT_SYMBOL_GPL(vfio_group_put_external_user); - /** * vfio_file_iommu_group - Return the struct iommu_group for the vfio group file * @file: VFIO group file @@ -1752,6 +1700,24 @@ void vfio_file_set_kvm(struct file *file, struct kvm *kvm) } EXPORT_SYMBOL_GPL(vfio_file_set_kvm); +/** + * vfio_file_has_dev - True if the VFIO file is a handle for device + * @file: VFIO file to check + * @device: Device that must be part of the file + * + * Returns true if given file has permission to manipulate the given device. + */ +bool vfio_file_has_dev(struct file *file, struct vfio_device *device) +{ + struct vfio_group *group = file->private_data; + + if (file->f_op != &vfio_group_fops) + return false; + + return group == device->group; +} +EXPORT_SYMBOL_GPL(vfio_file_has_dev); + /* * Sub-module support */ diff --git a/include/linux/vfio.h b/include/linux/vfio.h index b298a26c4f07..45b287826ce6 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -138,11 +138,10 @@ int vfio_mig_get_next_state(struct vfio_device *device, /* * External user API */ -extern struct vfio_group *vfio_group_get_external_user(struct file *filep); -extern void vfio_group_put_external_user(struct vfio_group *group); extern struct iommu_group *vfio_file_iommu_group(struct file *file); extern bool vfio_file_enforced_coherent(struct file *file); extern void vfio_file_set_kvm(struct file *file, struct kvm *kvm); +extern bool vfio_file_has_dev(struct file *file, struct vfio_device *device); #define VFIO_PIN_PAGES_MAX_ENTRIES (PAGE_SIZE/sizeof(unsigned long)) From 1c05bb947f6464756174830b778aabf8f9d6ed0e Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Mon, 16 May 2022 12:12:02 +0200 Subject: [PATCH 136/334] include/uapi/linux/vfio.h: Fix trivial typo - _IORW should be _IOWR instead There is no macro called _IORW, so use _IOWR in the comment instead. Signed-off-by: Thomas Huth Reviewed-by: Cornelia Huck Link: https://lore.kernel.org/r/20220516101202.88373-1-thuth@redhat.com Signed-off-by: Alex Williamson --- include/uapi/linux/vfio.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index fea86061b44e..733a1cddde30 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -643,7 +643,7 @@ enum { }; /** - * VFIO_DEVICE_GET_PCI_HOT_RESET_INFO - _IORW(VFIO_TYPE, VFIO_BASE + 12, + * VFIO_DEVICE_GET_PCI_HOT_RESET_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 12, * struct vfio_pci_hot_reset_info) * * Return: 0 on success, -errno on failure: @@ -770,7 +770,7 @@ struct vfio_device_ioeventfd { #define VFIO_DEVICE_IOEVENTFD _IO(VFIO_TYPE, VFIO_BASE + 16) /** - * VFIO_DEVICE_FEATURE - _IORW(VFIO_TYPE, VFIO_BASE + 17, + * VFIO_DEVICE_FEATURE - _IOWR(VFIO_TYPE, VFIO_BASE + 17, * struct vfio_device_feature) * * Get, set, or probe feature data of the device. 
The feature is selected From 6bbe1065121b8cd3b3e734ef8cd99f142bdab241 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=ADcolas=20F=2E=20R=2E=20A=2E=20Prado?= Date: Wed, 11 May 2022 15:54:51 -0400 Subject: [PATCH 137/334] dt-bindings: remoteproc: mediatek: Make l1tcm reg exclusive to mt819x MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit ca23ecfdbd44 ("remoteproc/mediatek: support L1TCM") added support for the l1tcm memory region on the MT8192 SCP, adding a new da_to_va callback that handles l1tcm while keeping the old one for back-compatibility with MT8183. However, since the mt8192 compatible was missing from the dt-binding, the accompanying dt-binding commit 503c64cc42f1 ("dt-bindings: remoteproc: mediatek: add L1TCM memory region") mistakenly added this reg as if it were for mt8183. And later it became common to all platforms as their compatibles were added. Fix the dt-binding so that the l1tcm reg can be present only on the supported platforms: mt8192 and mt8195. Fixes: 503c64cc42f1 ("dt-bindings: remoteproc: mediatek: add L1TCM memory region") Signed-off-by: Nícolas F. R. A. Prado Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20220511195452.871897-2-nfraprado@collabora.com Signed-off-by: Mathieu Poirier --- .../bindings/remoteproc/mtk,scp.yaml | 44 +++++++++++++------ 1 file changed, 30 insertions(+), 14 deletions(-) diff --git a/Documentation/devicetree/bindings/remoteproc/mtk,scp.yaml b/Documentation/devicetree/bindings/remoteproc/mtk,scp.yaml index abc11ac92818..832064d635b3 100644 --- a/Documentation/devicetree/bindings/remoteproc/mtk,scp.yaml +++ b/Documentation/devicetree/bindings/remoteproc/mtk,scp.yaml @@ -23,11 +23,13 @@ properties: reg: description: - Should contain the address ranges for memory regions SRAM, CFG, and - L1TCM. + Should contain the address ranges for memory regions SRAM, CFG, and, + on some platforms, L1TCM. + minItems: 2 maxItems: 3 reg-names: + minItems: 2 items: - const: sram - const: cfg @@ -57,16 +59,30 @@ required: - reg - reg-names -if: - properties: - compatible: - enum: - - mediatek,mt8183-scp - - mediatek,mt8192-scp -then: - required: - - clocks - - clock-names +allOf: + - if: + properties: + compatible: + enum: + - mediatek,mt8183-scp + - mediatek,mt8192-scp + then: + required: + - clocks + - clock-names + + - if: + properties: + compatible: + enum: + - mediatek,mt8183-scp + - mediatek,mt8186-scp + then: + properties: + reg: + maxItems: 2 + reg-names: + maxItems: 2 additionalProperties: type: object @@ -86,10 +102,10 @@ additionalProperties: examples: - | - #include + #include scp@10500000 { - compatible = "mediatek,mt8183-scp"; + compatible = "mediatek,mt8192-scp"; reg = <0x10500000 0x80000>, <0x10700000 0x8000>, <0x10720000 0xe0000>; From bb489b96406104070c1fbe364c441cffae8a2ae4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=ADcolas=20F=2E=20R=2E=20A=2E=20Prado?= Date: Wed, 11 May 2022 15:54:52 -0400 Subject: [PATCH 138/334] dt-bindings: remoteproc: mediatek: Add optional memory-region to mtk,scp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The SCP co-processor can optionally be passed a reserved memory region to use. Add this property in the dt-binding. Signed-off-by: Nícolas F. R. A. 
Prado Reviewed-by: AngeloGioacchino Del Regno Acked-by: Rob Herring Link: https://lore.kernel.org/r/20220511195452.871897-3-nfraprado@collabora.com Signed-off-by: Mathieu Poirier --- Documentation/devicetree/bindings/remoteproc/mtk,scp.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Documentation/devicetree/bindings/remoteproc/mtk,scp.yaml b/Documentation/devicetree/bindings/remoteproc/mtk,scp.yaml index 832064d635b3..eec3b9c4c713 100644 --- a/Documentation/devicetree/bindings/remoteproc/mtk,scp.yaml +++ b/Documentation/devicetree/bindings/remoteproc/mtk,scp.yaml @@ -54,6 +54,9 @@ properties: firmware search path containing the firmware image used when initializing SCP. + memory-region: + maxItems: 1 + required: - compatible - reg From cea9ba7239dcc84175041174304c6cdeae3226e5 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 14 May 2022 10:27:00 -0400 Subject: [PATCH 139/334] NFS: Do not report EINTR/ERESTARTSYS as mapping errors If the attempt to flush data was interrupted due to a local signal, then just requeue the writes back for I/O. Fixes: 6fbda89b257f ("NFS: Replace custom error reporting mechanism with generic one") Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/write.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/write.c b/fs/nfs/write.c index f00d45cf80ef..e437db1791ba 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1444,7 +1444,7 @@ static void nfs_async_write_error(struct list_head *head, int error) while (!list_empty(head)) { req = nfs_list_entry(head->next); nfs_list_remove_request(req); - if (nfs_error_is_fatal(error)) + if (nfs_error_is_fatal_on_server(error)) nfs_write_error(req, error); else nfs_redirty_request(req); From 9641d9bc9b75f11f70646f5c6ee9f5f519a1012e Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 14 May 2022 10:27:01 -0400 Subject: [PATCH 140/334] NFS: fsync() should report filesystem errors over EINTR/ERESTARTSYS If the commit to disk is interrupted, we should still first check for filesystem errors so that we can report them in preference to the error due to the signal. Fixes: 2197e9b06c22 ("NFS: Fix up fsync() when the server rebooted") Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/file.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 150b7fa8f0a7..7c380e555224 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -204,15 +204,16 @@ static int nfs_file_fsync_commit(struct file *file, int datasync) { struct inode *inode = file_inode(file); - int ret; + int ret, ret2; dprintk("NFS: fsync file(%pD2) datasync %d\n", file, datasync); nfs_inc_stats(inode, NFSIOS_VFSFSYNC); ret = nfs_commit_inode(inode, FLUSH_SYNC); - if (ret < 0) - return ret; - return file_check_and_advance_wb_err(file); + ret2 = file_check_and_advance_wb_err(file); + if (ret2 < 0) + return ret2; + return ret; } int From e6005436f6cc9ed13288f936903f0151e5543485 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 14 May 2022 10:27:02 -0400 Subject: [PATCH 141/334] NFS: Don't report ENOSPC write errors twice Any errors reported by the write() system call need to be cleared from the file descriptor's error tracking. The current call to nfs_wb_all() causes the error to be reported, but since it doesn't call file_check_and_advance_wb_err(), we can end up reporting the same error a second time when the application calls fsync(). 
Note that since Linux 4.13, the rule is that EIO may be reported for write(), but it must be reported by a subsequent fsync(), so let's just drop reporting it in write. The check for nfs_ctx_key_to_expire() is just a duplicate to the one already in nfs_write_end(), so let's drop that too. Reported-by: ChenXiaoSong Fixes: ce368536dd61 ("nfs: nfs_file_write() should check for writeback errors") Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/file.c | 34 ++++++++++++++-------------------- 1 file changed, 14 insertions(+), 20 deletions(-) diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 7c380e555224..87e4cd5e8fe2 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -598,18 +598,6 @@ static const struct vm_operations_struct nfs_file_vm_ops = { .page_mkwrite = nfs_vm_page_mkwrite, }; -static int nfs_need_check_write(struct file *filp, struct inode *inode, - int error) -{ - struct nfs_open_context *ctx; - - ctx = nfs_file_open_context(filp); - if (nfs_error_is_fatal_on_server(error) || - nfs_ctx_key_to_expire(ctx, inode)) - return 1; - return 0; -} - ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; @@ -637,7 +625,7 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) if (iocb->ki_flags & IOCB_APPEND || iocb->ki_pos > i_size_read(inode)) { result = nfs_revalidate_file_size(inode, file); if (result) - goto out; + return result; } nfs_clear_invalid_mapping(file->f_mapping); @@ -656,6 +644,7 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) written = result; iocb->ki_pos += written; + nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written); if (mntflags & NFS_MOUNT_WRITE_EAGER) { result = filemap_fdatawrite_range(file->f_mapping, @@ -673,17 +662,22 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) } result = generic_write_sync(iocb, written); if (result < 0) - goto out; + return result; +out: /* Return error values */ error = filemap_check_wb_err(file->f_mapping, since); - if (nfs_need_check_write(file, inode, error)) { - int err = nfs_wb_all(inode); - if (err < 0) - result = err; + switch (error) { + default: + break; + case -EDQUOT: + case -EFBIG: + case -ENOSPC: + nfs_wb_all(inode); + error = file_check_and_advance_wb_err(file); + if (error < 0) + result = error; } - nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written); -out: return result; out_swapfile: From d95b26650e86175e4a97698d89bc1626cd1df0c6 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 14 May 2022 10:27:03 -0400 Subject: [PATCH 142/334] NFS: Do not report flush errors in nfs_write_end() If we do flush cached writebacks in nfs_write_end() due to the imminent expiration of an RPCSEC_GSS session, then we should defer reporting any resulting errors until the calls to file_check_and_advance_wb_err() in nfs_file_write() and nfs_file_fsync(). 
Fixes: 6fbda89b257f ("NFS: Replace custom error reporting mechanism with generic one") Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/file.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 87e4cd5e8fe2..3f17748eaf29 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -386,11 +386,8 @@ static int nfs_write_end(struct file *file, struct address_space *mapping, return status; NFS_I(mapping->host)->write_io += copied; - if (nfs_ctx_key_to_expire(ctx, mapping->host)) { - status = nfs_wb_all(mapping->host); - if (status < 0) - return status; - } + if (nfs_ctx_key_to_expire(ctx, mapping->host)) + nfs_wb_all(mapping->host); return copied; } From c5e483b77cc2edb318da152abe07e33006b975fd Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 14 May 2022 10:27:04 -0400 Subject: [PATCH 143/334] NFS: Don't report errors from nfs_pageio_complete() more than once Since errors from nfs_pageio_complete() are already being reported through nfs_async_write_error(), we should not be returning them to the callers of do_writepages() as well. They will end up being reported through the generic mechanism instead. Fixes: 6fbda89b257f ("NFS: Replace custom error reporting mechanism with generic one") Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/write.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/fs/nfs/write.c b/fs/nfs/write.c index e437db1791ba..4925d11849cd 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -681,11 +681,7 @@ static int nfs_writepage_locked(struct page *page, err = nfs_do_writepage(page, wbc, &pgio); pgio.pg_error = 0; nfs_pageio_complete(&pgio); - if (err < 0) - return err; - if (nfs_error_is_fatal(pgio.pg_error)) - return pgio.pg_error; - return 0; + return err; } int nfs_writepage(struct page *page, struct writeback_control *wbc) @@ -747,9 +743,6 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) if (err < 0) goto out_err; - err = pgio.pg_error; - if (nfs_error_is_fatal(err)) - goto out_err; return 0; out_err: return err; From 452284407c18d8a522c3039339b1860afa0025a8 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 14 May 2022 10:08:10 -0400 Subject: [PATCH 144/334] NFS: Memory allocation failures are not server fatal errors We need to filter out ENOMEM in nfs_error_is_fatal_on_server(), because running out of memory on our client is not a server error. Reported-by: Olga Kornievskaia Fixes: 2dc23afffbca ("NFS: ENOMEM should also be a fatal error.") Cc: stable@vger.kernel.org Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/internal.h | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 7eefa16ed381..8f8cd6e2d4db 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -841,6 +841,7 @@ static inline bool nfs_error_is_fatal_on_server(int err) case 0: case -ERESTARTSYS: case -EINTR: + case -ENOMEM: return false; } return nfs_error_is_fatal(err); From 3764a17e31d579cf9b4bd0a69894b577e8d75702 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 14 May 2022 10:08:11 -0400 Subject: [PATCH 145/334] NFSv4/pNFS: Do not fail I/O when we fail to allocate the pNFS layout Commit 587f03deb69b caused pnfs_update_layout() to stop returning ENOMEM when the memory allocation fails, and hence causes it to fall back to trying to do I/O through the MDS. There is no guarantee that this will fare any better. 
If we're failing the pNFS layout allocation, then we should just redirty the page and retry later. Reported-by: Olga Kornievskaia Fixes: 587f03deb69b ("pnfs: refactor send_layoutget") Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/pnfs.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 856c962273c7..68a87be3e6f9 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -2000,6 +2000,7 @@ lookup_again: lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags); if (lo == NULL) { spin_unlock(&ino->i_lock); + lseg = ERR_PTR(-ENOMEM); trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, PNFS_UPDATE_LAYOUT_NOMEM); goto out; @@ -2128,6 +2129,7 @@ lookup_again: lgp = pnfs_alloc_init_layoutget_args(ino, ctx, &stateid, &arg, gfp_flags); if (!lgp) { + lseg = ERR_PTR(-ENOMEM); trace_pnfs_update_layout(ino, pos, count, iomode, lo, NULL, PNFS_UPDATE_LAYOUT_NOMEM); nfs_layoutget_end(lo); From c6fd3511c3397dd9cbc6dc5d105bbedb69bf4061 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 14 May 2022 10:08:12 -0400 Subject: [PATCH 146/334] NFS: Further fixes to the writeback error handling When we handle an error by redirtying the page, we're not corrupting the mapping, so we don't want the error to be recorded in the mapping. If the caller has specified a sync_mode of WB_SYNC_NONE, we can just return AOP_WRITEPAGE_ACTIVATE. However if we're dealing with WB_SYNC_ALL, we need to ensure that retries happen when the errors are non-fatal. Reported-by: Olga Kornievskaia Fixes: 8fc75bed96bb ("NFS: Fix up return value on fatal errors in nfs_page_async_flush()") Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/write.c | 39 ++++++++++++++++++--------------------- 1 file changed, 18 insertions(+), 21 deletions(-) diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 4925d11849cd..2f41659e232e 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -603,8 +603,9 @@ static void nfs_write_error(struct nfs_page *req, int error) * Find an associated nfs write request, and prepare to flush it out * May return an error if the user signalled nfs_wait_on_request(). 
*/ -static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, - struct page *page) +static int nfs_page_async_flush(struct page *page, + struct writeback_control *wbc, + struct nfs_pageio_descriptor *pgio) { struct nfs_page *req; int ret = 0; @@ -630,11 +631,11 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, /* * Remove the problematic req upon fatal errors on the server */ - if (nfs_error_is_fatal(ret)) { - if (nfs_error_is_fatal_on_server(ret)) - goto out_launder; - } else - ret = -EAGAIN; + if (nfs_error_is_fatal_on_server(ret)) + goto out_launder; + if (wbc->sync_mode == WB_SYNC_NONE) + ret = AOP_WRITEPAGE_ACTIVATE; + redirty_page_for_writepage(wbc, page); nfs_redirty_request(req); pgio->pg_error = 0; } else @@ -650,15 +651,8 @@ out_launder: static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio) { - int ret; - nfs_pageio_cond_complete(pgio, page_index(page)); - ret = nfs_page_async_flush(pgio, page); - if (ret == -EAGAIN) { - redirty_page_for_writepage(wbc, page); - ret = AOP_WRITEPAGE_ACTIVATE; - } - return ret; + return nfs_page_async_flush(page, wbc, pgio); } /* @@ -733,12 +727,15 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) priority = wb_priority(wbc); } - nfs_pageio_init_write(&pgio, inode, priority, false, - &nfs_async_write_completion_ops); - pgio.pg_io_completion = ioc; - err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio); - pgio.pg_error = 0; - nfs_pageio_complete(&pgio); + do { + nfs_pageio_init_write(&pgio, inode, priority, false, + &nfs_async_write_completion_ops); + pgio.pg_io_completion = ioc; + err = write_cache_pages(mapping, wbc, nfs_writepages_callback, + &pgio); + pgio.pg_error = 0; + nfs_pageio_complete(&pgio); + } while (err < 0 && !nfs_error_is_fatal(err)); nfs_io_completion_put(ioc); if (err < 0) From 126966ddedb60bb7bf9f3b341e26ca8ef1019efc Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 14 May 2022 10:08:13 -0400 Subject: [PATCH 147/334] pNFS/files: Fall back to I/O through the MDS on non-fatal layout errors Only report the error when the server is returning a fatal error, such as ESTALE, EIO, etc... Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/filelayout/filelayout.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index 76deddab0a8f..2b2661582bbe 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -839,7 +839,12 @@ fl_pnfs_update_layout(struct inode *ino, lseg = pnfs_update_layout(ino, ctx, pos, count, iomode, strict_iomode, gfp_flags); - if (IS_ERR_OR_NULL(lseg)) + if (IS_ERR(lseg)) { + /* Fall back to MDS on recoverable errors */ + if (!nfs_error_is_fatal_on_server(PTR_ERR(lseg))) + lseg = NULL; + goto out; + } else if (!lseg) goto out; lo = NFS_I(ino)->layout; From 6949493884fe88500de4af182588e071cf1544ee Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 14 May 2022 10:08:14 -0400 Subject: [PATCH 148/334] NFSv4: Don't hold the layoutget locks across multiple RPC calls When doing layoutget as part of the open() compound, we have to be careful to release the layout locks before we can call any further RPC calls, such as setattr(). The reason is that those calls could trigger a recall, which could deadlock. 
Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index a79f66432bd3..bf3ba541b9fb 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3098,6 +3098,10 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, } out: + if (opendata->lgp) { + nfs4_lgopen_release(opendata->lgp); + opendata->lgp = NULL; + } if (!opendata->cancelled) nfs4_sequence_free_slot(&opendata->o_res.seq_res); return ret; From 7b8b44eb7710e34a1ea5278392417993a549fabf Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 14 May 2022 10:36:58 -0400 Subject: [PATCH 149/334] NFSv4: Specify the type of ACL to cache When caching a NFSv4 ACL, we want to specify whether we are caching an NFSv4.0 type acl, the NFSv4.1 dacl or the NFSv4.1 sacl. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 59 ++++++++++++++++++++++++++++------------- include/linux/nfs4.h | 2 ++ include/linux/nfs_xdr.h | 7 +++++ 3 files changed, 49 insertions(+), 19 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index bf3ba541b9fb..e6c830d4db0b 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -5772,9 +5772,17 @@ static int nfs4_proc_renew(struct nfs_client *clp, const struct cred *cred) return 0; } -static inline int nfs4_server_supports_acls(struct nfs_server *server) +static bool nfs4_server_supports_acls(const struct nfs_server *server, + enum nfs4_acl_type type) { - return server->caps & NFS_CAP_ACLS; + switch (type) { + default: + return server->attr_bitmask[0] & FATTR4_WORD0_ACL; + case NFS4ACL_DACL: + return server->attr_bitmask[1] & FATTR4_WORD1_DACL; + case NFS4ACL_SACL: + return server->attr_bitmask[1] & FATTR4_WORD1_SACL; + } } /* Assuming that XATTR_SIZE_MAX is a multiple of PAGE_SIZE, and that @@ -5813,6 +5821,7 @@ unwind: } struct nfs4_cached_acl { + enum nfs4_acl_type type; int cached; size_t len; char data[]; @@ -5833,7 +5842,8 @@ static void nfs4_zap_acl_attr(struct inode *inode) nfs4_set_cached_acl(inode, NULL); } -static inline ssize_t nfs4_read_cached_acl(struct inode *inode, char *buf, size_t buflen) +static ssize_t nfs4_read_cached_acl(struct inode *inode, char *buf, + size_t buflen, enum nfs4_acl_type type) { struct nfs_inode *nfsi = NFS_I(inode); struct nfs4_cached_acl *acl; @@ -5843,6 +5853,8 @@ static inline ssize_t nfs4_read_cached_acl(struct inode *inode, char *buf, size_ acl = nfsi->nfs4_acl; if (acl == NULL) goto out; + if (acl->type != type) + goto out; if (buf == NULL) /* user is just asking for length */ goto out_len; if (acl->cached == 0) @@ -5858,7 +5870,9 @@ out: return ret; } -static void nfs4_write_cached_acl(struct inode *inode, struct page **pages, size_t pgbase, size_t acl_len) +static void nfs4_write_cached_acl(struct inode *inode, struct page **pages, + size_t pgbase, size_t acl_len, + enum nfs4_acl_type type) { struct nfs4_cached_acl *acl; size_t buflen = sizeof(*acl) + acl_len; @@ -5875,6 +5889,7 @@ static void nfs4_write_cached_acl(struct inode *inode, struct page **pages, size goto out; acl->cached = 0; } + acl->type = type; acl->len = acl_len; out: nfs4_set_cached_acl(inode, acl); @@ -5890,7 +5905,8 @@ out: * length. The next getxattr call will then produce another round trip to * the server, this time with the input buf of the required size. 
*/ -static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t buflen) +static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, + size_t buflen, enum nfs4_acl_type type) { struct page **pages; struct nfs_getaclargs args = { @@ -5947,7 +5963,8 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu ret = -ERANGE; goto out_free; } - nfs4_write_cached_acl(inode, pages, res.acl_data_offset, res.acl_len); + nfs4_write_cached_acl(inode, pages, res.acl_data_offset, res.acl_len, + type); if (buf) { if (res.acl_len > buflen) { ret = -ERANGE; @@ -5967,14 +5984,15 @@ out_free: return ret; } -static ssize_t nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t buflen) +static ssize_t nfs4_get_acl_uncached(struct inode *inode, void *buf, + size_t buflen, enum nfs4_acl_type type) { struct nfs4_exception exception = { .interruptible = true, }; ssize_t ret; do { - ret = __nfs4_get_acl_uncached(inode, buf, buflen); + ret = __nfs4_get_acl_uncached(inode, buf, buflen, type); trace_nfs4_get_acl(inode, ret); if (ret >= 0) break; @@ -5983,27 +6001,29 @@ static ssize_t nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bufl return ret; } -static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen) +static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen, + enum nfs4_acl_type type) { struct nfs_server *server = NFS_SERVER(inode); int ret; - if (!nfs4_server_supports_acls(server)) + if (!nfs4_server_supports_acls(server, type)) return -EOPNOTSUPP; ret = nfs_revalidate_inode(inode, NFS_INO_INVALID_CHANGE); if (ret < 0) return ret; if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_ACL) nfs_zap_acl_cache(inode); - ret = nfs4_read_cached_acl(inode, buf, buflen); + ret = nfs4_read_cached_acl(inode, buf, buflen, type); if (ret != -ENOENT) /* -ENOENT is returned if there is no ACL or if there is an ACL * but no cached acl data, just the acl length */ return ret; - return nfs4_get_acl_uncached(inode, buf, buflen); + return nfs4_get_acl_uncached(inode, buf, buflen, type); } -static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen) +static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, + size_t buflen, enum nfs4_acl_type type) { struct nfs_server *server = NFS_SERVER(inode); struct page *pages[NFS4ACL_MAXPAGES]; @@ -6024,7 +6044,7 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl /* You can't remove system.nfs4_acl: */ if (buflen == 0) return -EINVAL; - if (!nfs4_server_supports_acls(server)) + if (!nfs4_server_supports_acls(server, type)) return -EOPNOTSUPP; if (npages > ARRAY_SIZE(pages)) return -ERANGE; @@ -6055,12 +6075,13 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl return ret; } -static int nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen) +static int nfs4_proc_set_acl(struct inode *inode, const void *buf, + size_t buflen, enum nfs4_acl_type type) { struct nfs4_exception exception = { }; int err; do { - err = __nfs4_proc_set_acl(inode, buf, buflen); + err = __nfs4_proc_set_acl(inode, buf, buflen, type); trace_nfs4_set_acl(inode, err); if (err == -NFS4ERR_BADOWNER || err == -NFS4ERR_BADNAME) { /* @@ -7659,19 +7680,19 @@ static int nfs4_xattr_set_nfs4_acl(const struct xattr_handler *handler, const char *key, const void *buf, size_t buflen, int flags) { - return nfs4_proc_set_acl(inode, buf, buflen); + return nfs4_proc_set_acl(inode, buf, buflen, NFS4ACL_ACL); } 
static int nfs4_xattr_get_nfs4_acl(const struct xattr_handler *handler, struct dentry *unused, struct inode *inode, const char *key, void *buf, size_t buflen) { - return nfs4_proc_get_acl(inode, buf, buflen); + return nfs4_proc_get_acl(inode, buf, buflen, NFS4ACL_ACL); } static bool nfs4_xattr_list_nfs4_acl(struct dentry *dentry) { - return nfs4_server_supports_acls(NFS_SERVER(d_inode(dentry))); + return nfs4_server_supports_acls(NFS_SB(dentry->d_sb), NFS4ACL_ACL); } #ifdef CONFIG_NFS_V4_SECURITY_LABEL diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index 5662d8be04eb..8d04b6a5964c 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -451,6 +451,8 @@ enum lock_type4 { #define FATTR4_WORD1_TIME_MODIFY (1UL << 21) #define FATTR4_WORD1_TIME_MODIFY_SET (1UL << 22) #define FATTR4_WORD1_MOUNTED_ON_FILEID (1UL << 23) +#define FATTR4_WORD1_DACL (1UL << 26) +#define FATTR4_WORD1_SACL (1UL << 27) #define FATTR4_WORD1_FS_LAYOUT_TYPES (1UL << 30) #define FATTR4_WORD2_LAYOUT_TYPES (1UL << 0) #define FATTR4_WORD2_LAYOUT_BLKSIZE (1UL << 1) diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 2863e5a69c6a..13d068c57d8d 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -800,6 +800,13 @@ struct nfs_setattrargs { const struct nfs4_label *label; }; +enum nfs4_acl_type { + NFS4ACL_NONE = 0, + NFS4ACL_ACL, + NFS4ACL_DACL, + NFS4ACL_SACL, +}; + struct nfs_setaclargs { struct nfs4_sequence_args seq_args; struct nfs_fh * fh; From db145db021abf75aa1a5810e631425055cfbed47 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 14 May 2022 10:36:59 -0400 Subject: [PATCH 150/334] NFSv4: Add encoders/decoders for the NFSv4.1 dacl and sacl attributes Add the ability to set or retrieve the acl using the NFSv4.1 'dacl' and 'sacl' attributes to the NFSv4 xdr encoders/decoders. 
Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 9 ++-- fs/nfs/nfs4xdr.c | 93 +++++++++++++++++++++++++++-------------- include/linux/nfs_xdr.h | 3 ++ 3 files changed, 70 insertions(+), 35 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index e6c830d4db0b..b2ddbaf32a95 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -5911,9 +5911,11 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, struct page **pages; struct nfs_getaclargs args = { .fh = NFS_FH(inode), + .acl_type = type, .acl_len = buflen, }; struct nfs_getaclres res = { + .acl_type = type, .acl_len = buflen, }; struct rpc_message msg = { @@ -6028,9 +6030,10 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, struct nfs_server *server = NFS_SERVER(inode); struct page *pages[NFS4ACL_MAXPAGES]; struct nfs_setaclargs arg = { - .fh = NFS_FH(inode), - .acl_pages = pages, - .acl_len = buflen, + .fh = NFS_FH(inode), + .acl_type = type, + .acl_len = buflen, + .acl_pages = pages, }; struct nfs_setaclres res; struct rpc_message msg = { diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 86a5f6516928..0b41e00e754f 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1680,19 +1680,35 @@ encode_restorefh(struct xdr_stream *xdr, struct compound_hdr *hdr) encode_op_hdr(xdr, OP_RESTOREFH, decode_restorefh_maxsz, hdr); } -static void -encode_setacl(struct xdr_stream *xdr, const struct nfs_setaclargs *arg, - struct compound_hdr *hdr) +static void nfs4_acltype_to_bitmap(enum nfs4_acl_type type, __u32 bitmap[2]) { - __be32 *p; + switch (type) { + default: + bitmap[0] = FATTR4_WORD0_ACL; + bitmap[1] = 0; + break; + case NFS4ACL_DACL: + bitmap[0] = 0; + bitmap[1] = FATTR4_WORD1_DACL; + break; + case NFS4ACL_SACL: + bitmap[0] = 0; + bitmap[1] = FATTR4_WORD1_SACL; + } +} + +static void encode_setacl(struct xdr_stream *xdr, + const struct nfs_setaclargs *arg, + struct compound_hdr *hdr) +{ + __u32 bitmap[2]; + + nfs4_acltype_to_bitmap(arg->acl_type, bitmap); encode_op_hdr(xdr, OP_SETATTR, decode_setacl_maxsz, hdr); encode_nfs4_stateid(xdr, &zero_stateid); - p = reserve_space(xdr, 2*4); - *p++ = cpu_to_be32(1); - *p = cpu_to_be32(FATTR4_WORD0_ACL); - p = reserve_space(xdr, 4); - *p = cpu_to_be32(arg->acl_len); + xdr_encode_bitmap4(xdr, bitmap, ARRAY_SIZE(bitmap)); + encode_uint32(xdr, arg->acl_len); xdr_write_pages(xdr, arg->acl_pages, 0, arg->acl_len); } @@ -2587,11 +2603,11 @@ static void nfs4_xdr_enc_getacl(struct rpc_rqst *req, struct xdr_stream *xdr, struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - const __u32 nfs4_acl_bitmap[1] = { - [0] = FATTR4_WORD0_ACL, - }; + __u32 nfs4_acl_bitmap[2]; uint32_t replen; + nfs4_acltype_to_bitmap(args->acl_type, nfs4_acl_bitmap); + encode_compound_hdr(xdr, req, &hdr); encode_sequence(xdr, &args->seq_args, &hdr); encode_putfh(xdr, args->fh, &hdr); @@ -5386,7 +5402,7 @@ decode_restorefh(struct xdr_stream *xdr) } static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req, - struct nfs_getaclres *res) + struct nfs_getaclres *res, enum nfs4_acl_type type) { unsigned int savep; uint32_t attrlen, @@ -5404,26 +5420,39 @@ static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req, if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0) goto out; - if (unlikely(bitmap[0] & (FATTR4_WORD0_ACL - 1U))) - return -EIO; - if (likely(bitmap[0] & FATTR4_WORD0_ACL)) { + switch (type) { + default: + if (unlikely(bitmap[0] & (FATTR4_WORD0_ACL - 1U))) + return -EIO; 
+ if (!(bitmap[0] & FATTR4_WORD0_ACL)) + return -EOPNOTSUPP; + break; + case NFS4ACL_DACL: + if (unlikely(bitmap[0] || bitmap[1] & (FATTR4_WORD1_DACL - 1U))) + return -EIO; + if (!(bitmap[1] & FATTR4_WORD1_DACL)) + return -EOPNOTSUPP; + break; + case NFS4ACL_SACL: + if (unlikely(bitmap[0] || bitmap[1] & (FATTR4_WORD1_SACL - 1U))) + return -EIO; + if (!(bitmap[1] & FATTR4_WORD1_SACL)) + return -EOPNOTSUPP; + } - /* The bitmap (xdr len + bitmaps) and the attr xdr len words - * are stored with the acl data to handle the problem of - * variable length bitmaps.*/ - res->acl_data_offset = xdr_page_pos(xdr); - res->acl_len = attrlen; - - /* Check for receive buffer overflow */ - if (res->acl_len > xdr_stream_remaining(xdr) || - res->acl_len + res->acl_data_offset > xdr->buf->page_len) { - res->acl_flags |= NFS4_ACL_TRUNC; - dprintk("NFS: acl reply: attrlen %u > page_len %zu\n", - attrlen, xdr_stream_remaining(xdr)); - } - } else - status = -EOPNOTSUPP; + /* The bitmap (xdr len + bitmaps) and the attr xdr len words + * are stored with the acl data to handle the problem of + * variable length bitmaps.*/ + res->acl_data_offset = xdr_page_pos(xdr); + res->acl_len = attrlen; + /* Check for receive buffer overflow */ + if (res->acl_len > xdr_stream_remaining(xdr) || + res->acl_len + res->acl_data_offset > xdr->buf->page_len) { + res->acl_flags |= NFS4_ACL_TRUNC; + dprintk("NFS: acl reply: attrlen %u > page_len %zu\n", + attrlen, xdr_stream_remaining(xdr)); + } out: return status; } @@ -6486,7 +6515,7 @@ nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, struct xdr_stream *xdr, status = decode_putfh(xdr); if (status) goto out; - status = decode_getacl(xdr, rqstp, res); + status = decode_getacl(xdr, rqstp, res, res->acl_type); out: return status; diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 13d068c57d8d..4a8ba84f848e 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -810,6 +810,7 @@ enum nfs4_acl_type { struct nfs_setaclargs { struct nfs4_sequence_args seq_args; struct nfs_fh * fh; + enum nfs4_acl_type acl_type; size_t acl_len; struct page ** acl_pages; }; @@ -821,6 +822,7 @@ struct nfs_setaclres { struct nfs_getaclargs { struct nfs4_sequence_args seq_args; struct nfs_fh * fh; + enum nfs4_acl_type acl_type; size_t acl_len; struct page ** acl_pages; }; @@ -829,6 +831,7 @@ struct nfs_getaclargs { #define NFS4_ACL_TRUNC 0x0001 /* ACL was truncated */ struct nfs_getaclres { struct nfs4_sequence_res seq_res; + enum nfs4_acl_type acl_type; size_t acl_len; size_t acl_data_offset; int acl_flags; From 71342db05722907f30505afa4d845cae0f276d23 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 14 May 2022 10:37:00 -0400 Subject: [PATCH 151/334] NFSv4.1: Enable access to the NFSv4.1 'dacl' and 'sacl' attributes Enable access to the NFSv4 acl via the NFSv4.1 'dacl' and 'sacl' attributes. This allows the server to authenticate the DACL and the SACL operations separately, since reading and/or editing the SACL is usually considered to be a privileged operation. It also allows the propagation of automatic inheritance information that was not supported by the NFSv4.0 'acl' attribute. 
Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 69 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index b2ddbaf32a95..0dfdbb406f96 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -7698,6 +7698,55 @@ static bool nfs4_xattr_list_nfs4_acl(struct dentry *dentry) return nfs4_server_supports_acls(NFS_SB(dentry->d_sb), NFS4ACL_ACL); } +#if defined(CONFIG_NFS_V4_1) +#define XATTR_NAME_NFSV4_DACL "system.nfs4_dacl" + +static int nfs4_xattr_set_nfs4_dacl(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, + struct dentry *unused, struct inode *inode, + const char *key, const void *buf, + size_t buflen, int flags) +{ + return nfs4_proc_set_acl(inode, buf, buflen, NFS4ACL_DACL); +} + +static int nfs4_xattr_get_nfs4_dacl(const struct xattr_handler *handler, + struct dentry *unused, struct inode *inode, + const char *key, void *buf, size_t buflen) +{ + return nfs4_proc_get_acl(inode, buf, buflen, NFS4ACL_DACL); +} + +static bool nfs4_xattr_list_nfs4_dacl(struct dentry *dentry) +{ + return nfs4_server_supports_acls(NFS_SB(dentry->d_sb), NFS4ACL_DACL); +} + +#define XATTR_NAME_NFSV4_SACL "system.nfs4_sacl" + +static int nfs4_xattr_set_nfs4_sacl(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, + struct dentry *unused, struct inode *inode, + const char *key, const void *buf, + size_t buflen, int flags) +{ + return nfs4_proc_set_acl(inode, buf, buflen, NFS4ACL_SACL); +} + +static int nfs4_xattr_get_nfs4_sacl(const struct xattr_handler *handler, + struct dentry *unused, struct inode *inode, + const char *key, void *buf, size_t buflen) +{ + return nfs4_proc_get_acl(inode, buf, buflen, NFS4ACL_SACL); +} + +static bool nfs4_xattr_list_nfs4_sacl(struct dentry *dentry) +{ + return nfs4_server_supports_acls(NFS_SB(dentry->d_sb), NFS4ACL_SACL); +} + +#endif + #ifdef CONFIG_NFS_V4_SECURITY_LABEL static int nfs4_xattr_set_nfs4_label(const struct xattr_handler *handler, @@ -10615,6 +10664,22 @@ static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = { .set = nfs4_xattr_set_nfs4_acl, }; +#if defined(CONFIG_NFS_V4_1) +static const struct xattr_handler nfs4_xattr_nfs4_dacl_handler = { + .name = XATTR_NAME_NFSV4_DACL, + .list = nfs4_xattr_list_nfs4_dacl, + .get = nfs4_xattr_get_nfs4_dacl, + .set = nfs4_xattr_set_nfs4_dacl, +}; + +static const struct xattr_handler nfs4_xattr_nfs4_sacl_handler = { + .name = XATTR_NAME_NFSV4_SACL, + .list = nfs4_xattr_list_nfs4_sacl, + .get = nfs4_xattr_get_nfs4_sacl, + .set = nfs4_xattr_set_nfs4_sacl, +}; +#endif + #ifdef CONFIG_NFS_V4_2 static const struct xattr_handler nfs4_xattr_nfs4_user_handler = { .prefix = XATTR_USER_PREFIX, @@ -10625,6 +10690,10 @@ static const struct xattr_handler nfs4_xattr_nfs4_user_handler = { const struct xattr_handler *nfs4_xattr_handlers[] = { &nfs4_xattr_nfs4_acl_handler, +#if defined(CONFIG_NFS_V4_1) + &nfs4_xattr_nfs4_dacl_handler, + &nfs4_xattr_nfs4_sacl_handler, +#endif #ifdef CONFIG_NFS_V4_SECURITY_LABEL &nfs4_xattr_nfs4_label_handler, #endif From c81d5bae404abc6b257667e84d39b9b50c7063d4 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 6 May 2022 13:34:41 -0700 Subject: [PATCH 152/334] f2fs: do not stop GC when requiring a free section The f2fs_gc uses a bitmap to indicate pinned sections, but when disabling chckpoint, we call f2fs_gc() with NULL_SEGNO which selects the same dirty segment as a victim all the time, resulting in checkpoint=disable failure, for 
example. Let's pick another one, if we fail to collect it. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 + fs/f2fs/file.c | 12 ++++++++---- fs/f2fs/gc.c | 14 +++++++++----- fs/f2fs/segment.c | 3 ++- fs/f2fs/super.c | 3 ++- include/trace/events/f2fs.h | 11 ++++++++--- 6 files changed, 30 insertions(+), 14 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 9920b2d6af8f..492af5b96de1 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1271,6 +1271,7 @@ struct f2fs_gc_control { bool no_bg_gc; /* check the space and stop bg_gc */ bool should_migrate_blocks; /* should migrate blocks */ bool err_gc_skipped; /* return EAGAIN if GC skipped */ + unsigned int nr_free_secs; /* # of free sections to do GC */ }; /* For s_flag in struct f2fs_sb_info */ diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index d0547bef0851..216081ea8c81 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1650,7 +1650,8 @@ static int expand_inode_data(struct inode *inode, loff_t offset, struct f2fs_gc_control gc_control = { .victim_segno = NULL_SEGNO, .init_gc_type = FG_GC, .should_migrate_blocks = false, - .err_gc_skipped = true }; + .err_gc_skipped = true, + .nr_free_secs = 0 }; pgoff_t pg_start, pg_end; loff_t new_size = i_size_read(inode); loff_t off_end; @@ -2350,7 +2351,8 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg) struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_gc_control gc_control = { .victim_segno = NULL_SEGNO, .no_bg_gc = false, - .should_migrate_blocks = false }; + .should_migrate_blocks = false, + .nr_free_secs = 0 }; __u32 sync; int ret; @@ -2391,7 +2393,8 @@ static int __f2fs_ioc_gc_range(struct file *filp, struct f2fs_gc_range *range) .init_gc_type = range->sync ? FG_GC : BG_GC, .no_bg_gc = false, .should_migrate_blocks = false, - .err_gc_skipped = range->sync }; + .err_gc_skipped = range->sync, + .nr_free_secs = 0 }; u64 end; int ret; @@ -2837,7 +2840,8 @@ static int f2fs_ioc_flush_device(struct file *filp, unsigned long arg) struct f2fs_gc_control gc_control = { .init_gc_type = FG_GC, .should_migrate_blocks = true, - .err_gc_skipped = true }; + .err_gc_skipped = true, + .nr_free_secs = 0 }; int ret; if (!capable(CAP_SYS_ADMIN)) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index e275b72bc65f..d5fb426e0747 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -147,6 +147,7 @@ do_gc: gc_control.init_gc_type = sync_mode ? FG_GC : BG_GC; gc_control.no_bg_gc = foreground; + gc_control.nr_free_secs = foreground ? 1 : 0; /* if return value is not zero, no victim was selected */ if (f2fs_gc(sbi, &gc_control)) @@ -1761,6 +1762,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi, struct f2fs_gc_control *gc_control) unsigned int skipped_round = 0, round = 0; trace_f2fs_gc_begin(sbi->sb, gc_type, gc_control->no_bg_gc, + gc_control->nr_free_secs, get_pages(sbi, F2FS_DIRTY_NODES), get_pages(sbi, F2FS_DIRTY_DENTS), get_pages(sbi, F2FS_DIRTY_IMETA), @@ -1823,12 +1825,13 @@ retry: if (gc_type == FG_GC) sbi->cur_victim_sec = NULL_SEGNO; - if (gc_control->init_gc_type == FG_GC) - goto stop; - - if (!has_not_enough_free_secs(sbi, - (gc_type == FG_GC) ? sec_freed : 0, 0)) + if (gc_control->init_gc_type == FG_GC || + !has_not_enough_free_secs(sbi, + (gc_type == FG_GC) ? 
sec_freed : 0, 0)) { + if (gc_type == FG_GC && sec_freed < gc_control->nr_free_secs) + goto go_gc_more; goto stop; + } /* FG_GC stops GC by skip_count */ if (gc_type == FG_GC) { @@ -1849,6 +1852,7 @@ retry: if (ret) goto stop; } +go_gc_more: segno = NULL_SEGNO; goto gc_more; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 8b4f2b1d2cca..0a4180f64291 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -404,7 +404,8 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) .init_gc_type = BG_GC, .no_bg_gc = true, .should_migrate_blocks = false, - .err_gc_skipped = false }; + .err_gc_skipped = false, + .nr_free_secs = 1 }; f2fs_down_write(&sbi->gc_lock); f2fs_gc(sbi, &gc_control); } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index a28c27eed6d0..63daae67a9d9 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2080,7 +2080,8 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) .victim_segno = NULL_SEGNO, .init_gc_type = FG_GC, .should_migrate_blocks = false, - .err_gc_skipped = true }; + .err_gc_skipped = true, + .nr_free_secs = 1 }; f2fs_down_write(&sbi->gc_lock); err = f2fs_gc(sbi, &gc_control); diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 54ec9e543f09..16c67ede85b6 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -645,18 +645,21 @@ TRACE_EVENT(f2fs_background_gc, TRACE_EVENT(f2fs_gc_begin, TP_PROTO(struct super_block *sb, int gc_type, bool no_bg_gc, + unsigned int nr_free_secs, long long dirty_nodes, long long dirty_dents, long long dirty_imeta, unsigned int free_sec, unsigned int free_seg, int reserved_seg, unsigned int prefree_seg), - TP_ARGS(sb, gc_type, no_bg_gc, dirty_nodes, dirty_dents, dirty_imeta, + TP_ARGS(sb, gc_type, no_bg_gc, nr_free_secs, dirty_nodes, + dirty_dents, dirty_imeta, free_sec, free_seg, reserved_seg, prefree_seg), TP_STRUCT__entry( __field(dev_t, dev) __field(int, gc_type) __field(bool, no_bg_gc) + __field(unsigned int, nr_free_secs) __field(long long, dirty_nodes) __field(long long, dirty_dents) __field(long long, dirty_imeta) @@ -670,6 +673,7 @@ TRACE_EVENT(f2fs_gc_begin, __entry->dev = sb->s_dev; __entry->gc_type = gc_type; __entry->no_bg_gc = no_bg_gc; + __entry->nr_free_secs = nr_free_secs; __entry->dirty_nodes = dirty_nodes; __entry->dirty_dents = dirty_dents; __entry->dirty_imeta = dirty_imeta; @@ -679,12 +683,13 @@ TRACE_EVENT(f2fs_gc_begin, __entry->prefree_seg = prefree_seg; ), - TP_printk("dev = (%d,%d), gc_type = %s, no_background_GC = %d, nodes = %lld, " - "dents = %lld, imeta = %lld, free_sec:%u, free_seg:%u, " + TP_printk("dev = (%d,%d), gc_type = %s, no_background_GC = %d, nr_free_secs = %u, " + "nodes = %lld, dents = %lld, imeta = %lld, free_sec:%u, free_seg:%u, " "rsv_seg:%d, prefree_seg:%u", show_dev(__entry->dev), show_gc_type(__entry->gc_type), (__entry->gc_type == BG_GC) ? __entry->no_bg_gc : -1, + __entry->nr_free_secs, __entry->dirty_nodes, __entry->dirty_dents, __entry->dirty_imeta, From b5639bb4313b9d455fc9fc4768d23a5e4ca8cb9d Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 14 May 2022 10:59:29 -0700 Subject: [PATCH 153/334] f2fs: don't use casefolded comparison for "." and ".." 
Trying to rename a directory that has all of the following properties fails with EINVAL and triggers the 'WARN_ON_ONCE(!fscrypt_has_encryption_key(dir))' in f2fs_match_ci_name(): - The directory is casefolded - The directory is encrypted - The directory's encryption key is not yet set up - The parent directory is *not* encrypted The problem is incorrect handling of the lookup of ".." to get the parent reference to update. fscrypt_setup_filename() treats ".." (and ".") specially, as it's never encrypted. It's passed through as-is, and setting up the directory's key is not attempted. As the name isn't a no-key name, f2fs treats it as a "normal" name and attempts a casefolded comparison. That breaks the assumption of the WARN_ON_ONCE() in f2fs_match_ci_name() which assumes that for encrypted directories, casefolded comparisons only happen when the directory's key is set up. We could just remove this WARN_ON_ONCE(). However, since casefolding is always a no-op on "." and ".." anyway, let's instead just not casefold these names. This results in the standard bytewise comparison. Fixes: 7ad08a58bf67 ("f2fs: Handle casefolding with Encryption") Cc: # v5.11+ Signed-off-by: Eric Biggers Reviewed-by: Gabriel Krisman Bertazi Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 3 ++- fs/f2fs/f2fs.h | 10 +++++----- fs/f2fs/hash.c | 11 ++++++----- 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index a0e51937d92e..d5bd7932fb64 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -82,7 +82,8 @@ int f2fs_init_casefolded_name(const struct inode *dir, #if IS_ENABLED(CONFIG_UNICODE) struct super_block *sb = dir->i_sb; - if (IS_CASEFOLDED(dir)) { + if (IS_CASEFOLDED(dir) && + !is_dot_dotdot(fname->usr_fname->name, fname->usr_fname->len)) { fname->cf_name.name = f2fs_kmem_cache_alloc(f2fs_cf_name_slab, GFP_NOFS, false, F2FS_SB(sb)); if (!fname->cf_name.name) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 492af5b96de1..e9e32bc814df 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -508,11 +508,11 @@ struct f2fs_filename { #if IS_ENABLED(CONFIG_UNICODE) /* * For casefolded directories: the casefolded name, but it's left NULL - * if the original name is not valid Unicode, if the directory is both - * casefolded and encrypted and its encryption key is unavailable, or if - * the filesystem is doing an internal operation where usr_fname is also - * NULL. In all these cases we fall back to treating the name as an - * opaque byte sequence. + * if the original name is not valid Unicode, if the original name is + * "." or "..", if the directory is both casefolded and encrypted and + * its encryption key is unavailable, or if the filesystem is doing an + * internal operation where usr_fname is also NULL. In all these cases + * we fall back to treating the name as an opaque byte sequence. */ struct fscrypt_str cf_name; #endif diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c index 3cb1e7a24740..049ce50cec9b 100644 --- a/fs/f2fs/hash.c +++ b/fs/f2fs/hash.c @@ -91,7 +91,7 @@ static u32 TEA_hash_name(const u8 *p, size_t len) /* * Compute @fname->hash. For all directories, @fname->disk_name must be set. * For casefolded directories, @fname->usr_fname must be set, and also - * @fname->cf_name if the filename is valid Unicode. + * @fname->cf_name if the filename is valid Unicode and is not "." or "..".
*/ void f2fs_hash_filename(const struct inode *dir, struct f2fs_filename *fname) { @@ -110,10 +110,11 @@ void f2fs_hash_filename(const struct inode *dir, struct f2fs_filename *fname) /* * If the casefolded name is provided, hash it instead of the * on-disk name. If the casefolded name is *not* provided, that - * should only be because the name wasn't valid Unicode, so fall - * back to treating the name as an opaque byte sequence. Note - * that to handle encrypted directories, the fallback must use - * usr_fname (plaintext) rather than disk_name (ciphertext). + * should only be because the name wasn't valid Unicode or was + * "." or "..", so fall back to treating the name as an opaque + * byte sequence. Note that to handle encrypted directories, + * the fallback must use usr_fname (plaintext) rather than + * disk_name (ciphertext). */ WARN_ON_ONCE(!fname->usr_fname->name); if (fname->cf_name.name) { From 6b17ca8e5e7a7b10689867dff5e22d7da368ba76 Mon Sep 17 00:00:00 2001 From: Wan Jiabing Date: Tue, 17 May 2022 10:34:41 +0800 Subject: [PATCH 154/334] kvm/vfio: Fix potential deadlock problem in vfio Fix the following coccicheck warning: ./virt/kvm/vfio.c:258:1-7: preceding lock on line 236 If kvm_vfio_file_iommu_group() failed, code would goto err_fdput with mutex_lock acquired and then return ret. It might cause a potential deadlock. Move mutex_unlock below the err_fdput tag to fix it. Fixes: d55d9e7a45721 ("kvm/vfio: Store the struct file in the kvm_vfio_group") Signed-off-by: Wan Jiabing Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20220517023441.4258-1-wanjiabing@vivo.com Signed-off-by: Alex Williamson --- virt/kvm/vfio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c index 8f9f7fffb96a..ce1b01d02c51 100644 --- a/virt/kvm/vfio.c +++ b/virt/kvm/vfio.c @@ -252,8 +252,8 @@ static int kvm_vfio_group_set_spapr_tce(struct kvm_device *dev, break; } - mutex_unlock(&kv->lock); err_fdput: + mutex_unlock(&kv->lock); fdput(f); return ret; } From be8d3adae65cd44b6c299b796a5e1a0c24c54454 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 16 May 2022 20:41:17 -0300 Subject: [PATCH 155/334] vfio: Add missing locking for struct vfio_group::kvm Without locking, userspace can trigger a UAF by racing KVM_DEV_VFIO_GROUP_DEL with VFIO_GROUP_GET_DEVICE_FD: CPU1 CPU2 ioctl(KVM_DEV_VFIO_GROUP_DEL) ioctl(VFIO_GROUP_GET_DEVICE_FD) vfio_group_get_device_fd open_device() intel_vgpu_open_device() vfio_register_notifier() vfio_register_group_notifier() blocking_notifier_call_chain(&group->notifier, VFIO_GROUP_NOTIFY_SET_KVM, group->kvm); set_kvm() group->kvm = NULL close() kfree(kvm) intel_vgpu_group_notifier() vdev->kvm = data [..] kvm_get_kvm(vgpu->kvm); // UAF! Add a simple rwsem in the group to protect the kvm while the notifier is using it. Note this doesn't fix the race internal to i915 where userspace can trigger two VFIO_GROUP_NOTIFY_SET_KVM's before we reach a consumer of vgpu->kvm and trigger this same UAF, it just makes the notifier self-consistent.
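For readers unfamiliar with the locking pattern adopted here, a minimal sketch follows. It is illustrative only, not the vfio code from the diff below: the demo_* struct and function names are invented for this note, and only the kernel primitives used (init_rwsem(), down_read()/down_write(), BLOCKING_INIT_NOTIFIER_HEAD(), blocking_notifier_chain_register(), blocking_notifier_call_chain(), and the VFIO_GROUP_NOTIFY_SET_KVM action) are real APIs from this kernel era.

/*
 * Sketch: a pointer that is consumed from a blocking notifier chain is
 * published, cleared and replayed only under a per-group rw_semaphore,
 * so a concurrent reader can never observe a half-torn-down value.
 */
#include <linux/notifier.h>
#include <linux/rwsem.h>
#include <linux/vfio.h>

struct kvm;

struct demo_group {
	struct rw_semaphore group_rwsem;
	struct kvm *kvm;
	struct blocking_notifier_head notifier;
};

static void demo_group_init(struct demo_group *group)
{
	init_rwsem(&group->group_rwsem);
	BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);
	group->kvm = NULL;
}

/* Writer side: publish (or clear) the pointer and notify listeners. */
static void demo_set_kvm(struct demo_group *group, struct kvm *kvm)
{
	down_write(&group->group_rwsem);
	group->kvm = kvm;
	blocking_notifier_call_chain(&group->notifier,
				     VFIO_GROUP_NOTIFY_SET_KVM, kvm);
	up_write(&group->group_rwsem);
}

/* Reader side: replay the current value to a late registrant. */
static int demo_register_notifier(struct demo_group *group,
				  struct notifier_block *nb)
{
	int ret;

	ret = blocking_notifier_chain_register(&group->notifier, nb);
	if (ret)
		return ret;

	down_read(&group->group_rwsem);
	if (group->kvm)
		blocking_notifier_call_chain(&group->notifier,
					     VFIO_GROUP_NOTIFY_SET_KVM,
					     group->kvm);
	up_read(&group->group_rwsem);
	return 0;
}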
Fixes: ccd46dbae77d ("vfio: support notifier chain in vfio_group") Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Tested-by: Nicolin Chen Tested-by: Matthew Rosato Link: https://lore.kernel.org/r/1-v2-d035a1842d81+1bf-vfio_group_locking_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index 1758d96f43f4..4261eeec9e73 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -76,6 +76,7 @@ struct vfio_group { atomic_t opened; enum vfio_group_type type; unsigned int dev_counter; + struct rw_semaphore group_rwsem; struct kvm *kvm; struct blocking_notifier_head notifier; }; @@ -360,6 +361,7 @@ static struct vfio_group *vfio_group_alloc(struct iommu_group *iommu_group, group->cdev.owner = THIS_MODULE; refcount_set(&group->users, 1); + init_rwsem(&group->group_rwsem); INIT_LIST_HEAD(&group->device_list); mutex_init(&group->device_lock); group->iommu_group = iommu_group; @@ -1694,9 +1696,11 @@ void vfio_file_set_kvm(struct file *file, struct kvm *kvm) if (file->f_op != &vfio_group_fops) return; + down_write(&group->group_rwsem); group->kvm = kvm; blocking_notifier_call_chain(&group->notifier, VFIO_GROUP_NOTIFY_SET_KVM, kvm); + up_write(&group->group_rwsem); } EXPORT_SYMBOL_GPL(vfio_file_set_kvm); @@ -2004,15 +2008,22 @@ static int vfio_register_group_notifier(struct vfio_group *group, return -EINVAL; ret = blocking_notifier_chain_register(&group->notifier, nb); + if (ret) + return ret; /* * The attaching of kvm and vfio_group might already happen, so * here we replay once upon registration. */ - if (!ret && set_kvm && group->kvm) - blocking_notifier_call_chain(&group->notifier, - VFIO_GROUP_NOTIFY_SET_KVM, group->kvm); - return ret; + if (set_kvm) { + down_read(&group->group_rwsem); + if (group->kvm) + blocking_notifier_call_chain(&group->notifier, + VFIO_GROUP_NOTIFY_SET_KVM, + group->kvm); + up_read(&group->group_rwsem); + } + return 0; } int vfio_register_notifier(struct vfio_device *device, From c6f4860ef938606117961fac11d8d67497ab299b Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 16 May 2022 20:41:18 -0300 Subject: [PATCH 156/334] vfio: Change struct vfio_group::opened from an atomic to bool This is not a performance path, just use the group_rwsem to protect the value. 
Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Tested-by: Nicolin Chen Tested-by: Matthew Rosato Link: https://lore.kernel.org/r/2-v2-d035a1842d81+1bf-vfio_group_locking_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio.c | 46 ++++++++++++++++++++++++++------------------- 1 file changed, 27 insertions(+), 19 deletions(-) diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index 4261eeec9e73..12d4b3efd463 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -73,7 +73,7 @@ struct vfio_group { struct mutex device_lock; struct list_head vfio_next; struct list_head container_next; - atomic_t opened; + bool opened; enum vfio_group_type type; unsigned int dev_counter; struct rw_semaphore group_rwsem; @@ -1213,30 +1213,30 @@ static int vfio_group_fops_open(struct inode *inode, struct file *filep) { struct vfio_group *group = container_of(inode->i_cdev, struct vfio_group, cdev); - int opened; + int ret; + + down_write(&group->group_rwsem); /* users can be zero if this races with vfio_group_put() */ - if (!refcount_inc_not_zero(&group->users)) - return -ENODEV; + if (!refcount_inc_not_zero(&group->users)) { + ret = -ENODEV; + goto err_unlock; + } if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO)) { - vfio_group_put(group); - return -EPERM; + ret = -EPERM; + goto err_put; } - /* Do we need multiple instances of the group open? Seems not. */ - opened = atomic_cmpxchg(&group->opened, 0, 1); - if (opened) { - vfio_group_put(group); - return -EBUSY; - } - - /* Is something still in use from a previous open? */ - if (group->container) { - atomic_dec(&group->opened); - vfio_group_put(group); - return -EBUSY; + /* + * Do we need multiple instances of the group open? Seems not. + * Is something still in use from a previous open? + */ + if (group->opened || group->container) { + ret = -EBUSY; + goto err_put; } + group->opened = true; /* Warn if previous user didn't cleanup and re-init to drop them */ if (WARN_ON(group->notifier.head)) @@ -1244,7 +1244,13 @@ static int vfio_group_fops_open(struct inode *inode, struct file *filep) filep->private_data = group; + up_write(&group->group_rwsem); return 0; +err_put: + vfio_group_put(group); +err_unlock: + up_write(&group->group_rwsem); + return ret; } static int vfio_group_fops_release(struct inode *inode, struct file *filep) @@ -1255,7 +1261,9 @@ static int vfio_group_fops_release(struct inode *inode, struct file *filep) vfio_group_try_dissolve_container(group); - atomic_dec(&group->opened); + down_write(&group->group_rwsem); + group->opened = false; + up_write(&group->group_rwsem); vfio_group_put(group); From 805bb6c1bd9009e389f884fa30ec5f5e5079376d Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 16 May 2022 20:41:19 -0300 Subject: [PATCH 157/334] vfio: Split up vfio_group_get_device_fd() The split follows the pairing with the destroy functions: - vfio_group_get_device_fd() destroyed by close() - vfio_device_open() destroyed by vfio_device_fops_release() - vfio_device_assign_container() destroyed by vfio_group_try_dissolve_container() The next patch will put a lock around vfio_device_assign_container(). 
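As background for the create/destroy pairing described above, here is a minimal sketch of the fd-creation ordering the split arranges. The demo_* names are invented for illustration and do not come from the patch; only get_unused_fd_flags(), anon_inode_getfile(), fd_install(), put_unused_fd(), IS_ERR() and PTR_ERR() are real kernel APIs.

/*
 * Sketch: reserve the fd number first, build the file second, and make
 * fd_install() the final, unfailable step.  Each error label undoes
 * exactly one prior construction step, in reverse order.
 */
#include <linux/anon_inodes.h>
#include <linux/err.h>
#include <linux/fcntl.h>
#include <linux/file.h>
#include <linux/fs.h>

static const struct file_operations demo_device_fops;

static int demo_get_device_fd(void *device_priv)
{
	struct file *filep;
	int fdno;
	int ret;

	fdno = get_unused_fd_flags(O_CLOEXEC);
	if (fdno < 0)
		return fdno;

	filep = anon_inode_getfile("[demo-device]", &demo_device_fops,
				   device_priv, O_RDWR);
	if (IS_ERR(filep)) {
		ret = PTR_ERR(filep);
		goto err_put_fdno;
	}

	/* Point of no return: the fd becomes visible to userspace. */
	fd_install(fdno, filep);
	return fdno;

err_put_fdno:
	put_unused_fd(fdno);
	return ret;
}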
Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Tested-by: Nicolin Chen Tested-by: Matthew Rosato Link: https://lore.kernel.org/r/3-v2-d035a1842d81+1bf-vfio_group_locking_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio.c | 79 ++++++++++++++++++++++++++++++++------------- 1 file changed, 56 insertions(+), 23 deletions(-) diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index 12d4b3efd463..21db0e8d0d40 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -1064,12 +1064,9 @@ static bool vfio_assert_device_open(struct vfio_device *device) return !WARN_ON_ONCE(!READ_ONCE(device->open_count)); } -static int vfio_group_get_device_fd(struct vfio_group *group, char *buf) +static int vfio_device_assign_container(struct vfio_device *device) { - struct vfio_device *device; - struct file *filep; - int fdno; - int ret = 0; + struct vfio_group *group = device->group; if (0 == atomic_read(&group->container_users) || !group->container->iommu_driver) @@ -1078,13 +1075,22 @@ static int vfio_group_get_device_fd(struct vfio_group *group, char *buf) if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO)) return -EPERM; - device = vfio_device_get_from_name(group, buf); - if (IS_ERR(device)) - return PTR_ERR(device); + atomic_inc(&group->container_users); + return 0; +} + +static struct file *vfio_device_open(struct vfio_device *device) +{ + struct file *filep; + int ret; + + ret = vfio_device_assign_container(device); + if (ret) + return ERR_PTR(ret); if (!try_module_get(device->dev->driver->owner)) { ret = -ENODEV; - goto err_device_put; + goto err_unassign_container; } mutex_lock(&device->dev_set->lock); @@ -1100,15 +1106,11 @@ static int vfio_group_get_device_fd(struct vfio_group *group, char *buf) * We can't use anon_inode_getfd() because we need to modify * the f_mode flags directly to allow more than just ioctls */ - fdno = ret = get_unused_fd_flags(O_CLOEXEC); - if (ret < 0) - goto err_close_device; - filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops, device, O_RDWR); if (IS_ERR(filep)) { ret = PTR_ERR(filep); - goto err_fd; + goto err_close_device; } /* @@ -1118,17 +1120,15 @@ static int vfio_group_get_device_fd(struct vfio_group *group, char *buf) */ filep->f_mode |= (FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE); - atomic_inc(&group->container_users); - - fd_install(fdno, filep); - - if (group->type == VFIO_NO_IOMMU) + if (device->group->type == VFIO_NO_IOMMU) dev_warn(device->dev, "vfio-noiommu device opened by user " "(%s:%d)\n", current->comm, task_pid_nr(current)); - return fdno; + /* + * On success the ref of device is moved to the file and + * put in vfio_device_fops_release() + */ + return filep; -err_fd: - put_unused_fd(fdno); err_close_device: mutex_lock(&device->dev_set->lock); if (device->open_count == 1 && device->ops->close_device) @@ -1137,7 +1137,40 @@ err_undo_count: device->open_count--; mutex_unlock(&device->dev_set->lock); module_put(device->dev->driver->owner); -err_device_put: +err_unassign_container: + vfio_group_try_dissolve_container(device->group); + return ERR_PTR(ret); +} + +static int vfio_group_get_device_fd(struct vfio_group *group, char *buf) +{ + struct vfio_device *device; + struct file *filep; + int fdno; + int ret; + + device = vfio_device_get_from_name(group, buf); + if (IS_ERR(device)) + return PTR_ERR(device); + + fdno = get_unused_fd_flags(O_CLOEXEC); + if (fdno < 0) { + ret = fdno; + goto err_put_device; + } + + filep = vfio_device_open(device); + if (IS_ERR(filep)) { + ret = PTR_ERR(filep); + goto 
err_put_fdno; + } + + fd_install(fdno, filep); + return fdno; + +err_put_fdno: + put_unused_fd(fdno); +err_put_device: vfio_device_put(device); return ret; } From e0e29bdb594adf472eeff475539ee39708b2b07b Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 16 May 2022 20:41:20 -0300 Subject: [PATCH 158/334] vfio: Fully lock struct vfio_group::container This is necessary to avoid various user triggerable races, for instance racing SET_CONTAINER/UNSET_CONTAINER: ioctl(VFIO_GROUP_SET_CONTAINER) ioctl(VFIO_GROUP_UNSET_CONTAINER) vfio_group_unset_container int users = atomic_cmpxchg(&group->container_users, 1, 0); // users == 1 container_users == 0 __vfio_group_unset_container(group); container = group->container; vfio_group_set_container() if (!atomic_read(&group->container_users)) down_write(&container->group_lock); group->container = container; up_write(&container->group_lock); down_write(&container->group_lock); group->container = NULL; up_write(&container->group_lock); vfio_container_put(container); /* woops we lost/leaked the new container */ This can then go on to NULL pointer deref since container == 0 and container_users == 1. Wrap all touches of container, except those on a performance path with a known open device, with the group_rwsem. The only user of vfio_group_add_container_user() holds the user count for a simple operation, change it to just hold the group_lock over the operation and delete vfio_group_add_container_user(). Containers now only gain a user when a device FD is opened. Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Tested-by: Nicolin Chen Tested-by: Matthew Rosato Link: https://lore.kernel.org/r/4-v2-d035a1842d81+1bf-vfio_group_locking_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio.c | 66 +++++++++++++++++++++++++++------------------ 1 file changed, 40 insertions(+), 26 deletions(-) diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index 21db0e8d0d40..81330c8ca7fe 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -918,6 +918,8 @@ static void __vfio_group_unset_container(struct vfio_group *group) struct vfio_container *container = group->container; struct vfio_iommu_driver *driver; + lockdep_assert_held_write(&group->group_rwsem); + down_write(&container->group_lock); driver = container->iommu_driver; @@ -953,6 +955,8 @@ static int vfio_group_unset_container(struct vfio_group *group) { int users = atomic_cmpxchg(&group->container_users, 1, 0); + lockdep_assert_held_write(&group->group_rwsem); + if (!users) return -EINVAL; if (users != 1) @@ -971,8 +975,10 @@ static int vfio_group_unset_container(struct vfio_group *group) */ static void vfio_group_try_dissolve_container(struct vfio_group *group) { + down_write(&group->group_rwsem); if (0 == atomic_dec_if_positive(&group->container_users)) __vfio_group_unset_container(group); + up_write(&group->group_rwsem); } static int vfio_group_set_container(struct vfio_group *group, int container_fd) @@ -982,6 +988,8 @@ static int vfio_group_set_container(struct vfio_group *group, int container_fd) struct vfio_iommu_driver *driver; int ret = 0; + lockdep_assert_held_write(&group->group_rwsem); + if (atomic_read(&group->container_users)) return -EINVAL; @@ -1039,23 +1047,6 @@ unlock_out: return ret; } -static int vfio_group_add_container_user(struct vfio_group *group) -{ - if (!atomic_inc_not_zero(&group->container_users)) - return -EINVAL; - - if (group->type == VFIO_NO_IOMMU) { - atomic_dec(&group->container_users); - return -EPERM; - } - if (!group->container->iommu_driver) { - 
atomic_dec(&group->container_users); - return -EINVAL; - } - - return 0; -} - static const struct file_operations vfio_device_fops; /* true if the vfio_device has open_device() called but not close_device() */ @@ -1068,6 +1059,8 @@ static int vfio_device_assign_container(struct vfio_device *device) { struct vfio_group *group = device->group; + lockdep_assert_held_write(&group->group_rwsem); + if (0 == atomic_read(&group->container_users) || !group->container->iommu_driver) return -EINVAL; @@ -1084,7 +1077,9 @@ static struct file *vfio_device_open(struct vfio_device *device) struct file *filep; int ret; + down_write(&device->group->group_rwsem); ret = vfio_device_assign_container(device); + up_write(&device->group->group_rwsem); if (ret) return ERR_PTR(ret); @@ -1197,11 +1192,13 @@ static long vfio_group_fops_unl_ioctl(struct file *filep, status.flags = 0; + down_read(&group->group_rwsem); if (group->container) status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET | VFIO_GROUP_FLAGS_VIABLE; else if (!iommu_group_dma_owner_claimed(group->iommu_group)) status.flags |= VFIO_GROUP_FLAGS_VIABLE; + up_read(&group->group_rwsem); if (copy_to_user((void __user *)arg, &status, minsz)) return -EFAULT; @@ -1219,11 +1216,15 @@ static long vfio_group_fops_unl_ioctl(struct file *filep, if (fd < 0) return -EINVAL; + down_write(&group->group_rwsem); ret = vfio_group_set_container(group, fd); + up_write(&group->group_rwsem); break; } case VFIO_GROUP_UNSET_CONTAINER: + down_write(&group->group_rwsem); ret = vfio_group_unset_container(group); + up_write(&group->group_rwsem); break; case VFIO_GROUP_GET_DEVICE_FD: { @@ -1709,15 +1710,19 @@ bool vfio_file_enforced_coherent(struct file *file) if (file->f_op != &vfio_group_fops) return true; - /* - * Since the coherency state is determined only once a container is - * attached the user must do so before they can prove they have - * permission. - */ - if (vfio_group_add_container_user(group)) - return true; - ret = vfio_ioctl_check_extension(group->container, VFIO_DMA_CC_IOMMU); - vfio_group_try_dissolve_container(group); + down_read(&group->group_rwsem); + if (group->container) { + ret = vfio_ioctl_check_extension(group->container, + VFIO_DMA_CC_IOMMU); + } else { + /* + * Since the coherency state is determined only once a container + * is attached the user must do so before they can prove they + * have permission. 
+ */ + ret = true; + } + up_read(&group->group_rwsem); return ret; } EXPORT_SYMBOL_GPL(vfio_file_enforced_coherent); @@ -1910,6 +1915,7 @@ int vfio_pin_pages(struct vfio_device *device, unsigned long *user_pfn, if (group->dev_counter > 1) return -EINVAL; + /* group->container cannot change while a vfio device is open */ container = group->container; driver = container->iommu_driver; if (likely(driver && driver->ops->pin_pages)) @@ -1945,6 +1951,7 @@ int vfio_unpin_pages(struct vfio_device *device, unsigned long *user_pfn, if (npage > VFIO_PIN_PAGES_MAX_ENTRIES) return -E2BIG; + /* group->container cannot change while a vfio device is open */ container = device->group->container; driver = container->iommu_driver; if (likely(driver && driver->ops->unpin_pages)) @@ -1984,6 +1991,7 @@ int vfio_dma_rw(struct vfio_device *device, dma_addr_t user_iova, void *data, if (!data || len <= 0 || !vfio_assert_device_open(device)) return -EINVAL; + /* group->container cannot change while a vfio device is open */ container = device->group->container; driver = container->iommu_driver; @@ -2004,6 +2012,7 @@ static int vfio_register_iommu_notifier(struct vfio_group *group, struct vfio_iommu_driver *driver; int ret; + down_read(&group->group_rwsem); container = group->container; driver = container->iommu_driver; if (likely(driver && driver->ops->register_notifier)) @@ -2011,6 +2020,8 @@ static int vfio_register_iommu_notifier(struct vfio_group *group, events, nb); else ret = -ENOTTY; + up_read(&group->group_rwsem); + return ret; } @@ -2021,6 +2032,7 @@ static int vfio_unregister_iommu_notifier(struct vfio_group *group, struct vfio_iommu_driver *driver; int ret; + down_read(&group->group_rwsem); container = group->container; driver = container->iommu_driver; if (likely(driver && driver->ops->unregister_notifier)) @@ -2028,6 +2040,8 @@ static int vfio_unregister_iommu_notifier(struct vfio_group *group, nb); else ret = -ENOTTY; + up_read(&group->group_rwsem); + return ret; } From b76c0eed748605afe8cddcc45c574f7f8536551a Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 16 May 2022 20:41:21 -0300 Subject: [PATCH 159/334] vfio: Simplify the life cycle of the group FD Once userspace opens a group FD it is prevented from opening another instance of that same group FD until all the prior group FDs and users of the container are done. The first is done trivially by checking the group->opened during group FD open. However, things get a little weird if userspace creates a device FD and then closes the group FD. The group FD still cannot be re-opened, but this time it is because the group->container is still set and container_users is elevated by the device FD. Due to this mismatched lifecycle we have the vfio_group_try_dissolve_container() which tries to auto-free a container after the group FD is closed but the device FD remains open. Instead have the device FD hold onto a reference to the single group FD. This directly prevents vfio_group_fops_release() from being called when any device FD exists and makes the lifecycle model more understandable. vfio_group_try_dissolve_container() is removed as the only place a container is auto-deleted is during vfio_group_fops_release(). At this point the container_users is either 1 or 0 since all device FDs must be closed. Change group->opened to group->opened_file which points to the single struct file * that is open for the group. If the group->open_file is NULL then group->container == NULL. If all device FDs have closed then the group's notifier list must be empty. 
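To make the new lifetime model concrete, a minimal sketch of the reference pattern follows. It is illustrative only and uses invented demo_* names rather than the actual vfio structures; get_file(), fput(), the rwsem calls and lockdep_assert_held_write() are the real kernel primitives.

/*
 * Sketch: each device FD pins the single open group file with
 * get_file(), so the group FD's release handler cannot run until every
 * device FD has dropped its reference with fput().
 */
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/lockdep.h>
#include <linux/rwsem.h>

struct demo_group {
	struct rw_semaphore group_rwsem;
	struct file *opened_file;	/* the one open group FD, or NULL */
	unsigned int container_users;
};

static int demo_device_assign_container(struct demo_group *group)
{
	lockdep_assert_held_write(&group->group_rwsem);

	if (!group->opened_file)
		return -EINVAL;

	get_file(group->opened_file);	/* device FD pins the group FD */
	group->container_users++;
	return 0;
}

static void demo_device_unassign_container(struct demo_group *group)
{
	down_write(&group->group_rwsem);
	group->container_users--;
	fput(group->opened_file);	/* may now let the group release run */
	up_write(&group->group_rwsem);
}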
Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Tested-by: Nicolin Chen Tested-by: Matthew Rosato Link: https://lore.kernel.org/r/5-v2-d035a1842d81+1bf-vfio_group_locking_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio.c | 52 +++++++++++++++++++++------------------------ 1 file changed, 24 insertions(+), 28 deletions(-) diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index 81330c8ca7fe..149c25840130 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -73,11 +73,11 @@ struct vfio_group { struct mutex device_lock; struct list_head vfio_next; struct list_head container_next; - bool opened; enum vfio_group_type type; unsigned int dev_counter; struct rw_semaphore group_rwsem; struct kvm *kvm; + struct file *opened_file; struct blocking_notifier_head notifier; }; @@ -967,20 +967,6 @@ static int vfio_group_unset_container(struct vfio_group *group) return 0; } -/* - * When removing container users, anything that removes the last user - * implicitly removes the group from the container. That is, if the - * group file descriptor is closed, as well as any device file descriptors, - * the group is free. - */ -static void vfio_group_try_dissolve_container(struct vfio_group *group) -{ - down_write(&group->group_rwsem); - if (0 == atomic_dec_if_positive(&group->container_users)) - __vfio_group_unset_container(group); - up_write(&group->group_rwsem); -} - static int vfio_group_set_container(struct vfio_group *group, int container_fd) { struct fd f; @@ -1068,10 +1054,19 @@ static int vfio_device_assign_container(struct vfio_device *device) if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO)) return -EPERM; + get_file(group->opened_file); atomic_inc(&group->container_users); return 0; } +static void vfio_device_unassign_container(struct vfio_device *device) +{ + down_write(&device->group->group_rwsem); + atomic_dec(&device->group->container_users); + fput(device->group->opened_file); + up_write(&device->group->group_rwsem); +} + static struct file *vfio_device_open(struct vfio_device *device) { struct file *filep; @@ -1133,7 +1128,7 @@ err_undo_count: mutex_unlock(&device->dev_set->lock); module_put(device->dev->driver->owner); err_unassign_container: - vfio_group_try_dissolve_container(device->group); + vfio_device_unassign_container(device); return ERR_PTR(ret); } @@ -1264,18 +1259,12 @@ static int vfio_group_fops_open(struct inode *inode, struct file *filep) /* * Do we need multiple instances of the group open? Seems not. - * Is something still in use from a previous open? */ - if (group->opened || group->container) { + if (group->opened_file) { ret = -EBUSY; goto err_put; } - group->opened = true; - - /* Warn if previous user didn't cleanup and re-init to drop them */ - if (WARN_ON(group->notifier.head)) - BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier); - + group->opened_file = filep; filep->private_data = group; up_write(&group->group_rwsem); @@ -1293,10 +1282,17 @@ static int vfio_group_fops_release(struct inode *inode, struct file *filep) filep->private_data = NULL; - vfio_group_try_dissolve_container(group); - down_write(&group->group_rwsem); - group->opened = false; + /* + * Device FDs hold a group file reference, therefore the group release + * is only called when there are no open devices. 
+ */ + WARN_ON(group->notifier.head); + if (group->container) { + WARN_ON(atomic_read(&group->container_users) != 1); + __vfio_group_unset_container(group); + } + group->opened_file = NULL; up_write(&group->group_rwsem); vfio_group_put(group); @@ -1328,7 +1324,7 @@ static int vfio_device_fops_release(struct inode *inode, struct file *filep) module_put(device->dev->driver->owner); - vfio_group_try_dissolve_container(device->group); + vfio_device_unassign_container(device); vfio_device_put(device); From 3ca5470878ebe9e31d3292391ae5fd63ab625a0b Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 16 May 2022 20:41:22 -0300 Subject: [PATCH 160/334] vfio: Change struct vfio_group::container_users to a non-atomic int Now that everything is fully locked there is no need for container_users to remain as an atomic, change it to an unsigned int. Use 'if (group->container)' as the test to determine if the container is present or not instead of using container_users. Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Tested-by: Nicolin Chen Tested-by: Matthew Rosato Link: https://lore.kernel.org/r/6-v2-d035a1842d81+1bf-vfio_group_locking_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index 149c25840130..cfcff7764403 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -66,7 +66,7 @@ struct vfio_group { struct device dev; struct cdev cdev; refcount_t users; - atomic_t container_users; + unsigned int container_users; struct iommu_group *iommu_group; struct vfio_container *container; struct list_head device_list; @@ -429,7 +429,7 @@ static void vfio_group_put(struct vfio_group *group) * properly hold the group reference. 
*/ WARN_ON(!list_empty(&group->device_list)); - WARN_ON(atomic_read(&group->container_users)); + WARN_ON(group->container || group->container_users); WARN_ON(group->notifier.head); list_del(&group->vfio_next); @@ -930,6 +930,7 @@ static void __vfio_group_unset_container(struct vfio_group *group) iommu_group_release_dma_owner(group->iommu_group); group->container = NULL; + group->container_users = 0; list_del(&group->container_next); /* Detaching the last group deprivileges a container, remove iommu */ @@ -953,17 +954,13 @@ static void __vfio_group_unset_container(struct vfio_group *group) */ static int vfio_group_unset_container(struct vfio_group *group) { - int users = atomic_cmpxchg(&group->container_users, 1, 0); - lockdep_assert_held_write(&group->group_rwsem); - if (!users) + if (!group->container) return -EINVAL; - if (users != 1) + if (group->container_users != 1) return -EBUSY; - __vfio_group_unset_container(group); - return 0; } @@ -976,7 +973,7 @@ static int vfio_group_set_container(struct vfio_group *group, int container_fd) lockdep_assert_held_write(&group->group_rwsem); - if (atomic_read(&group->container_users)) + if (group->container || WARN_ON(group->container_users)) return -EINVAL; if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO)) @@ -1020,12 +1017,12 @@ static int vfio_group_set_container(struct vfio_group *group, int container_fd) } group->container = container; + group->container_users = 1; container->noiommu = (group->type == VFIO_NO_IOMMU); list_add(&group->container_next, &container->group_list); /* Get a reference on the container and mark a user within the group */ vfio_container_get(container); - atomic_inc(&group->container_users); unlock_out: up_write(&container->group_lock); @@ -1047,22 +1044,23 @@ static int vfio_device_assign_container(struct vfio_device *device) lockdep_assert_held_write(&group->group_rwsem); - if (0 == atomic_read(&group->container_users) || - !group->container->iommu_driver) + if (!group->container || !group->container->iommu_driver || + WARN_ON(!group->container_users)) return -EINVAL; if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO)) return -EPERM; get_file(group->opened_file); - atomic_inc(&group->container_users); + group->container_users++; return 0; } static void vfio_device_unassign_container(struct vfio_device *device) { down_write(&device->group->group_rwsem); - atomic_dec(&device->group->container_users); + WARN_ON(device->group->container_users <= 1); + device->group->container_users--; fput(device->group->opened_file); up_write(&device->group->group_rwsem); } @@ -1289,7 +1287,7 @@ static int vfio_group_fops_release(struct inode *inode, struct file *filep) */ WARN_ON(group->notifier.head); if (group->container) { - WARN_ON(atomic_read(&group->container_users) != 1); + WARN_ON(group->container_users != 1); __vfio_group_unset_container(group); } group->opened_file = NULL; From 3e2910c7e23b55f6452b7130f97217ee0551a71a Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 21 Mar 2022 10:59:51 +1100 Subject: [PATCH 161/334] NFS: Improve warning message when locks are lost. NFSv4 can lose locks if, for example, there is a network partition for longer than the lease period. When this happens, a warning message NFS: __nfs4_reclaim_open_state: Lock reclaim failed! is generated, possibly once for each lock (though rate limited). This is potentially misleading as it can be read as suggesting that lock reclaim was attempted. However the default behaviour is to not attempt to recover locks (except due to server report).
This patch changes the reporting to produce at most one message for each attempt to recover all state from a given server. The message reports the server name and the number of locks lost if that number is non-zero. It reports that locks were lost and gives no suggestion as to whether there was an attempt or not. Signed-off-by: NeilBrown Signed-off-by: Anna Schumaker --- fs/nfs/nfs4state.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 9e1c987c81e7..775c58def32c 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1602,7 +1602,8 @@ static inline void nfs42_complete_copies(struct nfs4_state_owner *sp, #endif /* CONFIG_NFS_V4_2 */ static int __nfs4_reclaim_open_state(struct nfs4_state_owner *sp, struct nfs4_state *state, - const struct nfs4_state_recovery_ops *ops) + const struct nfs4_state_recovery_ops *ops, + int *lost_locks) { struct nfs4_lock_state *lock; int status; @@ -1620,7 +1621,7 @@ static int __nfs4_reclaim_open_state(struct nfs4_state_owner *sp, struct nfs4_st list_for_each_entry(lock, &state->lock_states, ls_locks) { trace_nfs4_state_lock_reclaim(state, lock); if (!test_bit(NFS_LOCK_INITIALIZED, &lock->ls_flags)) - pr_warn_ratelimited("NFS: %s: Lock reclaim failed!\n", __func__); + *lost_locks += 1; } spin_unlock(&state->state_lock); } @@ -1630,7 +1631,9 @@ static int __nfs4_reclaim_open_state(struct nfs4_state_owner *sp, struct nfs4_st return status; } -static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp, const struct nfs4_state_recovery_ops *ops) +static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp, + const struct nfs4_state_recovery_ops *ops, + int *lost_locks) { struct nfs4_state *state; unsigned int loop = 0; @@ -1666,7 +1669,7 @@ restart: #endif /* CONFIG_NFS_V4_2 */ refcount_inc(&state->count); spin_unlock(&sp->so_lock); - status = __nfs4_reclaim_open_state(sp, state, ops); + status = __nfs4_reclaim_open_state(sp, state, ops, lost_locks); switch (status) { default: @@ -1909,6 +1912,7 @@ static int nfs4_do_reclaim(struct nfs_client *clp, const struct nfs4_state_recov struct rb_node *pos; LIST_HEAD(freeme); int status = 0; + int lost_locks = 0; restart: rcu_read_lock(); @@ -1928,8 +1932,11 @@ restart: spin_unlock(&clp->cl_lock); rcu_read_unlock(); - status = nfs4_reclaim_open_state(sp, ops); + status = nfs4_reclaim_open_state(sp, ops, &lost_locks); if (status < 0) { + if (lost_locks) + pr_warn("NFS: %s: lost %d locks\n", + clp->cl_hostname, lost_locks); set_bit(ops->owner_flag_bit, &sp->so_flags); nfs4_put_state_owner(sp); status = nfs4_recovery_handle_error(clp, status); @@ -1943,6 +1950,9 @@ restart: } rcu_read_unlock(); nfs4_free_state_owners(&freeme); + if (lost_locks) + pr_warn("NFS: %s: lost %d locks\n", + clp->cl_hostname, lost_locks); return 0; } From 5e12f172db9536df4720a50e8aa70c5e212c2037 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 21 Mar 2022 16:20:43 +1100 Subject: [PATCH 162/334] NFS: update documentation for the nfs4_unique_id parameter The documentation for nfs4_unique_id is out-of-date. In particular it claims that when nfs4_unique_id is set, the host name is not used. Since commit 55b592933b7d ("NFSv4: Fix nfs4_init_uniform_client_string for net namespaces") both the unique_id AND the host name are used. Update the documentation to match the code.
Signed-off-by: NeilBrown Signed-off-by: Anna Schumaker --- Documentation/admin-guide/nfs/nfs-client.rst | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/Documentation/admin-guide/nfs/nfs-client.rst b/Documentation/admin-guide/nfs/nfs-client.rst index 6adb6457bc69..36760685dd34 100644 --- a/Documentation/admin-guide/nfs/nfs-client.rst +++ b/Documentation/admin-guide/nfs/nfs-client.rst @@ -36,10 +36,9 @@ administrative requirements that require particular behavior that does not work well as part of an nfs_client_id4 string. The nfs.nfs4_unique_id boot parameter specifies a unique string that can be -used instead of a system's node name when an NFS client identifies itself to -a server. Thus, if the system's node name is not unique, or it changes, its -nfs.nfs4_unique_id stays the same, preventing collision with other clients -or loss of state during NFS reboot recovery or transparent state migration. +used together with a system's node name when an NFS client identifies itself to +a server. Thus, if the system's node name is not unique, its +nfs.nfs4_unique_id can help prevent collisions with other clients. The nfs.nfs4_unique_id string is typically a UUID, though it can contain anything that is believed to be unique across all NFS clients. An @@ -53,8 +52,12 @@ outstanding NFSv4 state has expired, to prevent loss of NFSv4 state. This string can be stored in an NFS client's grub.conf, or it can be provided via a net boot facility such as PXE. It may also be specified as an nfs.ko -module parameter. Specifying a uniquifier string is not support for NFS -clients running in containers. +module parameter. + +This uniquifier string will be the same for all NFS clients running in +containers unless it is overridden by a value written to +/sys/fs/nfs/net/nfs_client/identifier which will be local to the network +namespace of the process which writes. The DNS resolver From a28faaddb2be19659701602fc7c9934ff5fc3fa5 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 18 Apr 2022 11:58:50 -0400 Subject: [PATCH 163/334] Documentation: Add an explanation of NFSv4 client identifiers To enable NFSv4 to work correctly, NFSv4 client identifiers have to be globally unique and persistent over client reboots. We believe that in many cases, a good default identifier can be chosen and set when a client system is imaged. Because there are many different ways a system can be imaged, provide an explanation of how NFSv4 client identifiers and principals can be set by install scripts and imaging tools. Additional cases, such as NFSv4 clients running in containers, also need unique and persistent identifiers. The Linux NFS community sets forth this explanation to aid those who create and manage container environments. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- .../filesystems/nfs/client-identifier.rst | 216 ++++++++++++++++++ Documentation/filesystems/nfs/index.rst | 2 + 2 files changed, 218 insertions(+) create mode 100644 Documentation/filesystems/nfs/client-identifier.rst diff --git a/Documentation/filesystems/nfs/client-identifier.rst b/Documentation/filesystems/nfs/client-identifier.rst new file mode 100644 index 000000000000..5147e15815a1 --- /dev/null +++ b/Documentation/filesystems/nfs/client-identifier.rst @@ -0,0 +1,216 @@ +.. 
SPDX-License-Identifier: GPL-2.0 + +======================= +NFSv4 client identifier +======================= + +This document explains how the NFSv4 protocol identifies client +instances in order to maintain file open and lock state during +system restarts. A special identifier and principal are maintained +on each client. These can be set by administrators, scripts +provided by site administrators, or tools provided by Linux +distributors. + +There are risks if a client's NFSv4 identifier and its principal +are not chosen carefully. + + +Introduction +------------ + +The NFSv4 protocol uses "lease-based file locking". Leases help +NFSv4 servers provide file lock guarantees and manage their +resources. + +Simply put, an NFSv4 server creates a lease for each NFSv4 client. +The server collects each client's file open and lock state under +the lease for that client. + +The client is responsible for periodically renewing its leases. +While a lease remains valid, the server holding that lease +guarantees the file locks the client has created remain in place. + +If a client stops renewing its lease (for example, if it crashes), +the NFSv4 protocol allows the server to remove the client's open +and lock state after a certain period of time. When a client +restarts, it indicates to servers that open and lock state +associated with its previous leases is no longer valid and can be +destroyed immediately. + +In addition, each NFSv4 server manages a persistent list of client +leases. When the server restarts and clients attempt to recover +their state, the server uses this list to distinguish amongst +clients that held state before the server restarted and clients +sending fresh OPEN and LOCK requests. This enables file locks to +persist safely across server restarts. + +NFSv4 client identifiers +------------------------ + +Each NFSv4 client presents an identifier to NFSv4 servers so that +they can associate the client with its lease. Each client's +identifier consists of two elements: + + - co_ownerid: An arbitrary but fixed string. + + - boot verifier: A 64-bit incarnation verifier that enables a + server to distinguish successive boot epochs of the same client. + +The NFSv4.0 specification refers to these two items as an +"nfs_client_id4". The NFSv4.1 specification refers to these two +items as a "client_owner4". + +NFSv4 servers tie this identifier to the principal and security +flavor that the client used when presenting it. Servers use this +principal to authorize subsequent lease modification operations +sent by the client. Effectively this principal is a third element of +the identifier. + +As part of the identity presented to servers, a good +"co_ownerid" string has several important properties: + + - The "co_ownerid" string identifies the client during reboot + recovery, therefore the string is persistent across client + reboots. + - The "co_ownerid" string helps servers distinguish the client + from others, therefore the string is globally unique. Note + that there is no central authority that assigns "co_ownerid" + strings. + - Because it often appears on the network in the clear, the + "co_ownerid" string does not reveal private information about + the client itself. + - The content of the "co_ownerid" string is set and unchanging + before the client attempts NFSv4 mounts after a restart. + - The NFSv4 protocol places a 1024-byte limit on the size of the + "co_ownerid" string. 
+ +Protecting NFSv4 lease state +---------------------------- + +NFSv4 servers utilize the "client_owner4" as described above to +assign a unique lease to each client. Under this scheme, there are +circumstances where clients can interfere with each other. This is +referred to as "lease stealing". + +If distinct clients present the same "co_ownerid" string and use +the same principal (for example, AUTH_SYS and UID 0), a server is +unable to tell that the clients are not the same. Each distinct +client presents a different boot verifier, so it appears to the +server as if there is one client that is rebooting frequently. +Neither client can maintain open or lock state in this scenario. + +If distinct clients present the same "co_ownerid" string and use +distinct principals, the server is likely to allow the first client +to operate normally but reject subsequent clients with the same +"co_ownerid" string. + +If a client's "co_ownerid" string or principal are not stable, +state recovery after a server or client reboot is not guaranteed. +If a client unexpectedly restarts but presents a different +"co_ownerid" string or principal to the server, the server orphans +the client's previous open and lock state. This blocks access to +locked files until the server removes the orphaned state. + +If the server restarts and a client presents a changed "co_ownerid" +string or principal to the server, the server will not allow the +client to reclaim its open and lock state, and may give those locks +to other clients in the meantime. This is referred to as "lock +stealing". + +Lease stealing and lock stealing increase the potential for denial +of service and in rare cases even data corruption. + +Selecting an appropriate client identifier +------------------------------------------ + +By default, the Linux NFSv4 client implementation constructs its +"co_ownerid" string starting with the words "Linux NFS" followed by +the client's UTS node name (the same node name, incidentally, that +is used as the "machine name" in an AUTH_SYS credential). In small +deployments, this construction is usually adequate. Often, however, +the node name by itself is not adequately unique, and can change +unexpectedly. Problematic situations include: + + - NFS-root (diskless) clients, where the local DCHP server (or + equivalent) does not provide a unique host name. + + - "Containers" within a single Linux host. If each container has + a separate network namespace, but does not use the UTS namespace + to provide a unique host name, then there can be multiple NFS + client instances with the same host name. + + - Clients across multiple administrative domains that access a + common NFS server. If hostnames are not assigned centrally + then uniqueness cannot be guaranteed unless a domain name is + included in the hostname. + +Linux provides two mechanisms to add uniqueness to its "co_ownerid" +string: + + nfs.nfs4_unique_id + This module parameter can set an arbitrary uniquifier string + via the kernel command line, or when the "nfs" module is + loaded. + + /sys/fs/nfs/client/net/identifier + This virtual file, available since Linux 5.3, is local to the + network namespace in which it is accessed and so can provide + distinction between network namespaces (containers) when the + hostname remains uniform. + +Note that this file is empty on name-space creation. If the +container system has access to some sort of per-container identity +then that uniquifier can be used. 
For example, a uniquifier might +be formed at boot using the container's internal identifier: + + sha256sum /etc/machine-id | awk '{print $1}' \\ + > /sys/fs/nfs/client/net/identifier + +Security considerations +----------------------- + +The use of cryptographic security for lease management operations +is strongly encouraged. + +If NFS with Kerberos is not configured, a Linux NFSv4 client uses +AUTH_SYS and UID 0 as the principal part of its client identity. +This configuration is not only insecure, it increases the risk of +lease and lock stealing. However, it might be the only choice for +client configurations that have no local persistent storage. +"co_ownerid" string uniqueness and persistence is critical in this +case. + +When a Kerberos keytab is present on a Linux NFS client, the client +attempts to use one of the principals in that keytab when +identifying itself to servers. The "sec=" mount option does not +control this behavior. Alternately, a single-user client with a +Kerberos principal can use that principal in place of the client's +host principal. + +Using Kerberos for this purpose enables the client and server to +use the same lease for operations covered by all "sec=" settings. +Additionally, the Linux NFS client uses the RPCSEC_GSS security +flavor with Kerberos and the integrity QOS to prevent in-transit +modification of lease modification requests. + +Additional notes +---------------- +The Linux NFSv4 client establishes a single lease on each NFSv4 +server it accesses. NFSv4 mounts from a Linux NFSv4 client of a +particular server then share that lease. + +Once a client establishes open and lock state, the NFSv4 protocol +enables lease state to transition to other servers, following data +that has been migrated. This hides data migration completely from +running applications. The Linux NFSv4 client facilitates state +migration by presenting the same "client_owner4" to all servers it +encounters. + +======== +See Also +======== + + - nfs(5) + - kerberos(7) + - RFC 7530 for the NFSv4.0 specification + - RFC 8881 for the NFSv4.1 specification. diff --git a/Documentation/filesystems/nfs/index.rst b/Documentation/filesystems/nfs/index.rst index 288d8ddb2bc6..8536134f31fd 100644 --- a/Documentation/filesystems/nfs/index.rst +++ b/Documentation/filesystems/nfs/index.rst @@ -6,6 +6,8 @@ NFS .. toctree:: :maxdepth: 1 + client-identifier + exporting pnfs rpc-cache rpc-server-gss From 9c4a5c75a62e83963083efd4eea5d5bd1583193c Mon Sep 17 00:00:00 2001 From: Dave Wysochanski Date: Wed, 4 May 2022 09:21:06 -0400 Subject: [PATCH 164/334] NFS: Pass i_size to fscache_unuse_cookie() when a file is released Pass updated i_size in fscache_unuse_cookie() when called from nfs_fscache_release_file(), which ensures the size of an fscache object gets written to the cache storage. 
Failing to do so results in unnecessary reads from the NFS server, even when the data is cached, due to a cachefiles object coherency check failing with a trace similar to the following: cachefiles_coherency: o=0000000e BAD osiz B=afbb3 c=0 This problem can be reproduced as follows: #!/bin/bash v=4.2; NFS_SERVER=127.0.0.1 set -e; trap cleanup EXIT; rc=1 function cleanup { umount /mnt/nfs > /dev/null 2>&1 RC_STR="TEST PASS" [ $rc -eq 1 ] && RC_STR="TEST FAIL" echo "$RC_STR on $(uname -r) with NFSv$v and server $NFS_SERVER" } mount -o vers=$v,fsc $NFS_SERVER:/export /mnt/nfs rm -f /mnt/nfs/file1.bin > /dev/null 2>&1 dd if=/dev/zero of=/mnt/nfs/file1.bin bs=4096 count=1 > /dev/null 2>&1 echo 3 > /proc/sys/vm/drop_caches echo Read file 1st time from NFS server into fscache dd if=/mnt/nfs/file1.bin of=/dev/null > /dev/null 2>&1 umount /mnt/nfs && mount -o vers=$v,fsc $NFS_SERVER:/export /mnt/nfs echo 3 > /proc/sys/vm/drop_caches echo Read file 2nd time from fscache dd if=/mnt/nfs/file1.bin of=/dev/null > /dev/null 2>&1 echo Check mountstats for NFS read grep -q "READ: 0" /proc/self/mountstats # (1st number) == 0 [ $? -eq 0 ] && rc=0 Fixes: a6b5a28eb56c "nfs: Convert to new fscache volume/cookie API" Signed-off-by: Dave Wysochanski Tested-by: Daire Byrne Signed-off-by: Anna Schumaker --- fs/nfs/fscache.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index f73c09a9cf0a..e861d7bae305 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -231,11 +231,10 @@ void nfs_fscache_release_file(struct inode *inode, struct file *filp) { struct nfs_fscache_inode_auxdata auxdata; struct fscache_cookie *cookie = nfs_i_fscache(inode); + loff_t i_size = i_size_read(inode); - if (fscache_cookie_valid(cookie)) { - nfs_fscache_update_auxdata(&auxdata, inode); - fscache_unuse_cookie(cookie, &auxdata, NULL); - } + nfs_fscache_update_auxdata(&auxdata, inode); + fscache_unuse_cookie(cookie, &auxdata, &i_size); } /* From c157a606e7525409208a51bd6663d6da3757d69e Mon Sep 17 00:00:00 2001 From: Minghao Chi Date: Thu, 5 May 2022 02:19:54 +0000 Subject: [PATCH 165/334] i3c/master: simplify the return expression of i3c_hci_remove() Simplify the return expression. Reported-by: Zeal Robot Signed-off-by: Minghao Chi Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20220505021954.54524-1-chi.minghao@zte.com.cn --- drivers/i3c/master/mipi-i3c-hci/core.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/drivers/i3c/master/mipi-i3c-hci/core.c b/drivers/i3c/master/mipi-i3c-hci/core.c index 8c01123dc4ed..6aef5ce43cc1 100644 --- a/drivers/i3c/master/mipi-i3c-hci/core.c +++ b/drivers/i3c/master/mipi-i3c-hci/core.c @@ -768,13 +768,8 @@ static int i3c_hci_probe(struct platform_device *pdev) static int i3c_hci_remove(struct platform_device *pdev) { struct i3c_hci *hci = platform_get_drvdata(pdev); - int ret; - ret = i3c_master_unregister(&hci->master); - if (ret) - return ret; - - return 0; + return i3c_master_unregister(&hci->master); } static const __maybe_unused struct of_device_id i3c_hci_of_match[] = { From 227fab1ee7ca70c9b4b0915898e81327aeb70414 Mon Sep 17 00:00:00 2001 From: Guo Zhengkui Date: Thu, 5 May 2022 00:49:01 +0800 Subject: [PATCH 166/334] i3c: master: svc: fix returnvar.cocci warning Fix the following coccicheck warning: drivers/i3c/master/svc-i3c-master.c:1600:5-8: Unneeded variable: "ret". Return "0" on line 1605.
Signed-off-by: Guo Zhengkui Reviewed-by: Miquel Raynal Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20220504164901.9622-1-guozhengkui@vivo.com --- drivers/i3c/master/svc-i3c-master.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/i3c/master/svc-i3c-master.c b/drivers/i3c/master/svc-i3c-master.c index 7550dad64ecf..d6e9ed74cdcf 100644 --- a/drivers/i3c/master/svc-i3c-master.c +++ b/drivers/i3c/master/svc-i3c-master.c @@ -1597,12 +1597,11 @@ static int __maybe_unused svc_i3c_runtime_suspend(struct device *dev) static int __maybe_unused svc_i3c_runtime_resume(struct device *dev) { struct svc_i3c_master *master = dev_get_drvdata(dev); - int ret = 0; pinctrl_pm_select_default_state(dev); svc_i3c_master_prepare_clks(master); - return ret; + return 0; } static const struct dev_pm_ops svc_i3c_pm_ops = { From 581d6d8f483696ff164f52a71beb43a87b718592 Mon Sep 17 00:00:00 2001 From: Samuel Holland Date: Wed, 13 Apr 2022 18:17:30 -0500 Subject: [PATCH 167/334] rtc: sun6i: Add NVMEM provider The sun6i RTC provides 32 bytes of general-purpose data registers. They can be used to save data in the always-on RTC power domain. The registers are writable via 32-bit MMIO accesses only. Expose them with a NVMEM provider so they can be used by other drivers. Signed-off-by: Samuel Holland Acked-by: Jernej Skrabec Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20220413231731.56709-1-samuel@sholland.org --- drivers/rtc/rtc-sun6i.c | 42 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/drivers/rtc/rtc-sun6i.c b/drivers/rtc/rtc-sun6i.c index 5b3e4da63406..755aeb82a285 100644 --- a/drivers/rtc/rtc-sun6i.c +++ b/drivers/rtc/rtc-sun6i.c @@ -71,6 +71,10 @@ #define SUN6I_LOSC_OUT_GATING 0x0060 #define SUN6I_LOSC_OUT_GATING_EN_OFFSET 0 +/* General-purpose data */ +#define SUN6I_GP_DATA 0x0100 +#define SUN6I_GP_DATA_SIZE 0x20 + /* * Get date values */ @@ -662,6 +666,39 @@ static const struct rtc_class_ops sun6i_rtc_ops = { .alarm_irq_enable = sun6i_rtc_alarm_irq_enable }; +static int sun6i_rtc_nvmem_read(void *priv, unsigned int offset, void *_val, size_t bytes) +{ + struct sun6i_rtc_dev *chip = priv; + u32 *val = _val; + int i; + + for (i = 0; i < bytes / 4; ++i) + val[i] = readl(chip->base + SUN6I_GP_DATA + offset + 4 * i); + + return 0; +} + +static int sun6i_rtc_nvmem_write(void *priv, unsigned int offset, void *_val, size_t bytes) +{ + struct sun6i_rtc_dev *chip = priv; + u32 *val = _val; + int i; + + for (i = 0; i < bytes / 4; ++i) + writel(val[i], chip->base + SUN6I_GP_DATA + offset + 4 * i); + + return 0; +} + +static struct nvmem_config sun6i_rtc_nvmem_cfg = { + .type = NVMEM_TYPE_BATTERY_BACKED, + .reg_read = sun6i_rtc_nvmem_read, + .reg_write = sun6i_rtc_nvmem_write, + .size = SUN6I_GP_DATA_SIZE, + .word_size = 4, + .stride = 4, +}; + #ifdef CONFIG_PM_SLEEP /* Enable IRQ wake on suspend, to wake up from RTC. */ static int sun6i_rtc_suspend(struct device *dev) @@ -795,6 +832,11 @@ static int sun6i_rtc_probe(struct platform_device *pdev) if (ret) return ret; + sun6i_rtc_nvmem_cfg.priv = chip; + ret = devm_rtc_nvmem_register(chip->rtc, &sun6i_rtc_nvmem_cfg); + if (ret) + return ret; + dev_info(&pdev->dev, "RTC enabled\n"); return 0; From 4c4d145a65e5a7faac440081bc1eac860930cd24 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Mon, 16 May 2022 10:25:00 +0200 Subject: [PATCH 168/334] dt-bindings: rtc: rzn1: Describe the RZN1 RTC Add new binding file for this RTC. 
Signed-off-by: Miquel Raynal Reviewed-by: Krzysztof Kozlowski Reviewed-by: Geert Uytterhoeven Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20220516082504.33913-2-miquel.raynal@bootlin.com --- .../bindings/rtc/renesas,rzn1-rtc.yaml | 70 +++++++++++++++++++ 1 file changed, 70 insertions(+) create mode 100644 Documentation/devicetree/bindings/rtc/renesas,rzn1-rtc.yaml diff --git a/Documentation/devicetree/bindings/rtc/renesas,rzn1-rtc.yaml b/Documentation/devicetree/bindings/rtc/renesas,rzn1-rtc.yaml new file mode 100644 index 000000000000..2d4741f51663 --- /dev/null +++ b/Documentation/devicetree/bindings/rtc/renesas,rzn1-rtc.yaml @@ -0,0 +1,70 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/rtc/renesas,rzn1-rtc.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Renesas RZ/N1 SoCs Real-Time Clock DT bindings + +maintainers: + - Miquel Raynal + +allOf: + - $ref: rtc.yaml# + +properties: + compatible: + items: + - enum: + - renesas,r9a06g032-rtc + - const: renesas,rzn1-rtc + + reg: + maxItems: 1 + + interrupts: + minItems: 3 + maxItems: 3 + + interrupt-names: + items: + - const: alarm + - const: timer + - const: pps + + clocks: + maxItems: 1 + + clock-names: + const: hclk + + power-domains: + maxItems: 1 + +required: + - compatible + - reg + - interrupts + - interrupt-names + - clocks + - clock-names + - power-domains + +unevaluatedProperties: false + +examples: + - | + #include + #include + rtc@40006000 { + compatible = "renesas,r9a06g032-rtc", "renesas,rzn1-rtc"; + reg = <0x40006000 0x1000>; + interrupts = , + , + ; + interrupt-names = "alarm", "timer", "pps"; + clocks = <&sysctrl R9A06G032_HCLK_RTC>; + clock-names = "hclk"; + power-domains = <&sysctrl>; + start-year = <2000>; + }; From deeb4b5393e106b990607df06261fba0ebb7ebde Mon Sep 17 00:00:00 2001 From: Michel Pollet Date: Mon, 16 May 2022 10:25:01 +0200 Subject: [PATCH 169/334] rtc: rzn1: Add new RTC driver Add a basic RTC driver for the RZ/N1. Signed-off-by: Michel Pollet Co-developed-by: Miquel Raynal Signed-off-by: Miquel Raynal Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20220516082504.33913-3-miquel.raynal@bootlin.com --- drivers/rtc/Kconfig | 7 ++ drivers/rtc/Makefile | 1 + drivers/rtc/rtc-rzn1.c | 243 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 251 insertions(+) create mode 100644 drivers/rtc/rtc-rzn1.c diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig index 41c65b4d2baf..a00f901b5c1d 100644 --- a/drivers/rtc/Kconfig +++ b/drivers/rtc/Kconfig @@ -1548,6 +1548,13 @@ config RTC_DRV_RS5C313 help If you say yes here you get support for the Ricoh RS5C313 RTC chips. +config RTC_DRV_RZN1 + tristate "Renesas RZ/N1 RTC" + depends on ARCH_RZN1 || COMPILE_TEST + depends on OF && HAS_IOMEM + help + If you say yes here you get support for the Renesas RZ/N1 RTC. 
+ config RTC_DRV_GENERIC tristate "Generic RTC support" # Please consider writing a new RTC driver instead of using the generic diff --git a/drivers/rtc/Makefile b/drivers/rtc/Makefile index 2d827d8261d5..fb04467b652d 100644 --- a/drivers/rtc/Makefile +++ b/drivers/rtc/Makefile @@ -151,6 +151,7 @@ obj-$(CONFIG_RTC_DRV_RX6110) += rtc-rx6110.o obj-$(CONFIG_RTC_DRV_RX8010) += rtc-rx8010.o obj-$(CONFIG_RTC_DRV_RX8025) += rtc-rx8025.o obj-$(CONFIG_RTC_DRV_RX8581) += rtc-rx8581.o +obj-$(CONFIG_RTC_DRV_RZN1) += rtc-rzn1.o obj-$(CONFIG_RTC_DRV_S35390A) += rtc-s35390a.o obj-$(CONFIG_RTC_DRV_S3C) += rtc-s3c.o obj-$(CONFIG_RTC_DRV_S5M) += rtc-s5m.o diff --git a/drivers/rtc/rtc-rzn1.c b/drivers/rtc/rtc-rzn1.c new file mode 100644 index 000000000000..6b2139a09ae2 --- /dev/null +++ b/drivers/rtc/rtc-rzn1.c @@ -0,0 +1,243 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Renesas RZ/N1 Real Time Clock interface for Linux + * + * Copyright: + * - 2014 Renesas Electronics Europe Limited + * - 2022 Schneider Electric + * + * Authors: + * - Michel Pollet , + * - Miquel Raynal + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#define RZN1_RTC_CTL0 0x00 +#define RZN1_RTC_CTL0_SLSB_SUBU 0 +#define RZN1_RTC_CTL0_SLSB_SCMP BIT(4) +#define RZN1_RTC_CTL0_AMPM BIT(5) +#define RZN1_RTC_CTL0_CE BIT(7) + +#define RZN1_RTC_CTL1 0x04 +#define RZN1_RTC_CTL1_ALME BIT(4) + +#define RZN1_RTC_CTL2 0x08 +#define RZN1_RTC_CTL2_WAIT BIT(0) +#define RZN1_RTC_CTL2_WST BIT(1) +#define RZN1_RTC_CTL2_WUST BIT(5) +#define RZN1_RTC_CTL2_STOPPED (RZN1_RTC_CTL2_WAIT | RZN1_RTC_CTL2_WST) + +#define RZN1_RTC_SEC 0x14 +#define RZN1_RTC_MIN 0x18 +#define RZN1_RTC_HOUR 0x1c +#define RZN1_RTC_WEEK 0x20 +#define RZN1_RTC_DAY 0x24 +#define RZN1_RTC_MONTH 0x28 +#define RZN1_RTC_YEAR 0x2c + +#define RZN1_RTC_SUBU 0x38 +#define RZN1_RTC_SUBU_DEV BIT(7) +#define RZN1_RTC_SUBU_DECR BIT(6) + +#define RZN1_RTC_ALM 0x40 +#define RZN1_RTC_ALH 0x44 +#define RZN1_RTC_ALW 0x48 + +#define RZN1_RTC_SECC 0x4c +#define RZN1_RTC_MINC 0x50 +#define RZN1_RTC_HOURC 0x54 +#define RZN1_RTC_WEEKC 0x58 +#define RZN1_RTC_DAYC 0x5c +#define RZN1_RTC_MONTHC 0x60 +#define RZN1_RTC_YEARC 0x64 + +struct rzn1_rtc { + struct rtc_device *rtcdev; + void __iomem *base; +}; + +static void rzn1_rtc_get_time_snapshot(struct rzn1_rtc *rtc, struct rtc_time *tm) +{ + tm->tm_sec = readl(rtc->base + RZN1_RTC_SECC); + tm->tm_min = readl(rtc->base + RZN1_RTC_MINC); + tm->tm_hour = readl(rtc->base + RZN1_RTC_HOURC); + tm->tm_wday = readl(rtc->base + RZN1_RTC_WEEKC); + tm->tm_mday = readl(rtc->base + RZN1_RTC_DAYC); + tm->tm_mon = readl(rtc->base + RZN1_RTC_MONTHC); + tm->tm_year = readl(rtc->base + RZN1_RTC_YEARC); +} + +static unsigned int rzn1_rtc_tm_to_wday(struct rtc_time *tm) +{ + time64_t time; + unsigned int days; + u32 secs; + + time = rtc_tm_to_time64(tm); + days = div_s64_rem(time, 86400, &secs); + + /* day of the week, 1970-01-01 was a Thursday */ + return (days + 4) % 7; +} + +static int rzn1_rtc_read_time(struct device *dev, struct rtc_time *tm) +{ + struct rzn1_rtc *rtc = dev_get_drvdata(dev); + u32 val, secs; + + /* + * The RTC was not started or is stopped and thus does not carry the + * proper time/date. 
+ */ + val = readl(rtc->base + RZN1_RTC_CTL2); + if (val & RZN1_RTC_CTL2_STOPPED) + return -EINVAL; + + rzn1_rtc_get_time_snapshot(rtc, tm); + secs = readl(rtc->base + RZN1_RTC_SECC); + if (tm->tm_sec != secs) + rzn1_rtc_get_time_snapshot(rtc, tm); + + tm->tm_sec = bcd2bin(tm->tm_sec); + tm->tm_min = bcd2bin(tm->tm_min); + tm->tm_hour = bcd2bin(tm->tm_hour); + tm->tm_wday = bcd2bin(tm->tm_wday); + tm->tm_mday = bcd2bin(tm->tm_mday); + tm->tm_mon = bcd2bin(tm->tm_mon); + tm->tm_year = bcd2bin(tm->tm_year); + + return 0; +} + +static int rzn1_rtc_set_time(struct device *dev, struct rtc_time *tm) +{ + struct rzn1_rtc *rtc = dev_get_drvdata(dev); + u32 val; + int ret; + + tm->tm_sec = bin2bcd(tm->tm_sec); + tm->tm_min = bin2bcd(tm->tm_min); + tm->tm_hour = bin2bcd(tm->tm_hour); + tm->tm_wday = bin2bcd(rzn1_rtc_tm_to_wday(tm)); + tm->tm_mday = bin2bcd(tm->tm_mday); + tm->tm_mon = bin2bcd(tm->tm_mon); + tm->tm_year = bin2bcd(tm->tm_year); + + val = readl(rtc->base + RZN1_RTC_CTL2); + if (!(val & RZN1_RTC_CTL2_STOPPED)) { + /* Hold the counter if it was counting up */ + writel(RZN1_RTC_CTL2_WAIT, rtc->base + RZN1_RTC_CTL2); + + /* Wait for the counter to stop: two 32k clock cycles */ + usleep_range(61, 100); + ret = readl_poll_timeout(rtc->base + RZN1_RTC_CTL2, val, + val & RZN1_RTC_CTL2_WST, 0, 100); + if (ret) + return ret; + } + + writel(tm->tm_sec, rtc->base + RZN1_RTC_SEC); + writel(tm->tm_min, rtc->base + RZN1_RTC_MIN); + writel(tm->tm_hour, rtc->base + RZN1_RTC_HOUR); + writel(tm->tm_wday, rtc->base + RZN1_RTC_WEEK); + writel(tm->tm_mday, rtc->base + RZN1_RTC_DAY); + writel(tm->tm_mon, rtc->base + RZN1_RTC_MONTH); + writel(tm->tm_year, rtc->base + RZN1_RTC_YEAR); + writel(0, rtc->base + RZN1_RTC_CTL2); + + return 0; +} + +static const struct rtc_class_ops rzn1_rtc_ops = { + .read_time = rzn1_rtc_read_time, + .set_time = rzn1_rtc_set_time, +}; + +static int rzn1_rtc_probe(struct platform_device *pdev) +{ + struct rzn1_rtc *rtc; + int ret; + + rtc = devm_kzalloc(&pdev->dev, sizeof(*rtc), GFP_KERNEL); + if (!rtc) + return -ENOMEM; + + platform_set_drvdata(pdev, rtc); + + rtc->base = devm_platform_ioremap_resource(pdev, 0); + if (IS_ERR(rtc->base)) + return dev_err_probe(&pdev->dev, PTR_ERR(rtc->base), "Missing reg\n"); + + rtc->rtcdev = devm_rtc_allocate_device(&pdev->dev); + if (IS_ERR(rtc->rtcdev)) + return PTR_ERR(rtc); + + rtc->rtcdev->range_min = RTC_TIMESTAMP_BEGIN_2000; + rtc->rtcdev->range_max = RTC_TIMESTAMP_END_2099; + rtc->rtcdev->ops = &rzn1_rtc_ops; + clear_bit(RTC_FEATURE_ALARM, rtc->rtcdev->features); + clear_bit(RTC_FEATURE_UPDATE_INTERRUPT, rtc->rtcdev->features); + + devm_pm_runtime_enable(&pdev->dev); + ret = pm_runtime_resume_and_get(&pdev->dev); + if (ret < 0) + return ret; + + /* + * Ensure the clock counter is enabled. + * Set 24-hour mode and possible oscillator offset compensation in SUBU mode. 
+ */ + writel(RZN1_RTC_CTL0_CE | RZN1_RTC_CTL0_AMPM | RZN1_RTC_CTL0_SLSB_SUBU, + rtc->base + RZN1_RTC_CTL0); + + /* Disable all interrupts */ + writel(0, rtc->base + RZN1_RTC_CTL1); + + ret = devm_rtc_register_device(rtc->rtcdev); + if (ret) + goto dis_runtime_pm; + + return 0; + +dis_runtime_pm: + pm_runtime_put(&pdev->dev); + + return ret; +} + +static int rzn1_rtc_remove(struct platform_device *pdev) +{ + pm_runtime_put(&pdev->dev); + + return 0; +} + +static const struct of_device_id rzn1_rtc_of_match[] = { + { .compatible = "renesas,rzn1-rtc" }, + {}, +}; +MODULE_DEVICE_TABLE(of, rzn1_rtc_of_match); + +static struct platform_driver rzn1_rtc_driver = { + .probe = rzn1_rtc_probe, + .remove = rzn1_rtc_remove, + .driver = { + .name = "rzn1-rtc", + .owner = THIS_MODULE, + .of_match_table = rzn1_rtc_of_match, + }, +}; +module_platform_driver(rzn1_rtc_driver); + +MODULE_AUTHOR("Michel Pollet Date: Mon, 16 May 2022 10:25:02 +0200 Subject: [PATCH 170/334] rtc: rzn1: Add alarm support The RZN1 RTC can trigger an interrupt when reaching a particular date up to 7 days ahead. Bring support for this alarm. One drawback though, the granularity is about a minute. Signed-off-by: Miquel Raynal Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20220516082504.33913-4-miquel.raynal@bootlin.com --- drivers/rtc/rtc-rzn1.c | 106 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 105 insertions(+), 1 deletion(-) diff --git a/drivers/rtc/rtc-rzn1.c b/drivers/rtc/rtc-rzn1.c index 6b2139a09ae2..c1b082e3c8a7 100644 --- a/drivers/rtc/rtc-rzn1.c +++ b/drivers/rtc/rtc-rzn1.c @@ -156,14 +156,107 @@ static int rzn1_rtc_set_time(struct device *dev, struct rtc_time *tm) return 0; } +static irqreturn_t rzn1_rtc_alarm_irq(int irq, void *dev_id) +{ + struct rzn1_rtc *rtc = dev_id; + + rtc_update_irq(rtc->rtcdev, 1, RTC_AF | RTC_IRQF); + + return IRQ_HANDLED; +} + +static int rzn1_rtc_alarm_irq_enable(struct device *dev, unsigned int enable) +{ + struct rzn1_rtc *rtc = dev_get_drvdata(dev); + u32 ctl1 = readl(rtc->base + RZN1_RTC_CTL1); + + if (enable) + ctl1 |= RZN1_RTC_CTL1_ALME; + else + ctl1 &= ~RZN1_RTC_CTL1_ALME; + + writel(ctl1, rtc->base + RZN1_RTC_CTL1); + + return 0; +} + +static int rzn1_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alrm) +{ + struct rzn1_rtc *rtc = dev_get_drvdata(dev); + struct rtc_time *tm = &alrm->time; + unsigned int min, hour, wday, delta_days; + time64_t alarm; + u32 ctl1; + int ret; + + ret = rzn1_rtc_read_time(dev, tm); + if (ret) + return ret; + + min = readl(rtc->base + RZN1_RTC_ALM); + hour = readl(rtc->base + RZN1_RTC_ALH); + wday = readl(rtc->base + RZN1_RTC_ALW); + + tm->tm_sec = 0; + tm->tm_min = bcd2bin(min); + tm->tm_hour = bcd2bin(hour); + delta_days = ((fls(wday) - 1) - tm->tm_wday + 7) % 7; + tm->tm_wday = fls(wday) - 1; + + if (delta_days) { + alarm = rtc_tm_to_time64(tm) + (delta_days * 86400); + rtc_time64_to_tm(alarm, tm); + } + + ctl1 = readl(rtc->base + RZN1_RTC_CTL1); + alrm->enabled = !!(ctl1 & RZN1_RTC_CTL1_ALME); + + return 0; +} + +static int rzn1_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm) +{ + struct rzn1_rtc *rtc = dev_get_drvdata(dev); + struct rtc_time *tm = &alrm->time, tm_now; + unsigned long alarm, farest; + unsigned int days_ahead, wday; + int ret; + + ret = rzn1_rtc_read_time(dev, &tm_now); + if (ret) + return ret; + + /* We cannot set alarms more than one week ahead */ + farest = rtc_tm_to_time64(&tm_now) + (7 * 86400); + alarm = rtc_tm_to_time64(tm); + if (time_after(alarm, farest)) + return -ERANGE; + + /* 
Convert alarm day into week day */ + days_ahead = tm->tm_mday - tm_now.tm_mday; + wday = (tm_now.tm_wday + days_ahead) % 7; + + writel(bin2bcd(tm->tm_min), rtc->base + RZN1_RTC_ALM); + writel(bin2bcd(tm->tm_hour), rtc->base + RZN1_RTC_ALH); + writel(BIT(wday), rtc->base + RZN1_RTC_ALW); + + rzn1_rtc_alarm_irq_enable(dev, alrm->enabled); + + return 0; +} + static const struct rtc_class_ops rzn1_rtc_ops = { .read_time = rzn1_rtc_read_time, .set_time = rzn1_rtc_set_time, + .read_alarm = rzn1_rtc_read_alarm, + .set_alarm = rzn1_rtc_set_alarm, + .alarm_irq_enable = rzn1_rtc_alarm_irq_enable, }; static int rzn1_rtc_probe(struct platform_device *pdev) { struct rzn1_rtc *rtc; + int alarm_irq; int ret; rtc = devm_kzalloc(&pdev->dev, sizeof(*rtc), GFP_KERNEL); @@ -176,6 +269,10 @@ static int rzn1_rtc_probe(struct platform_device *pdev) if (IS_ERR(rtc->base)) return dev_err_probe(&pdev->dev, PTR_ERR(rtc->base), "Missing reg\n"); + alarm_irq = platform_get_irq(pdev, 0); + if (alarm_irq < 0) + return alarm_irq; + rtc->rtcdev = devm_rtc_allocate_device(&pdev->dev); if (IS_ERR(rtc->rtcdev)) return PTR_ERR(rtc); @@ -183,7 +280,7 @@ static int rzn1_rtc_probe(struct platform_device *pdev) rtc->rtcdev->range_min = RTC_TIMESTAMP_BEGIN_2000; rtc->rtcdev->range_max = RTC_TIMESTAMP_END_2099; rtc->rtcdev->ops = &rzn1_rtc_ops; - clear_bit(RTC_FEATURE_ALARM, rtc->rtcdev->features); + set_bit(RTC_FEATURE_ALARM_RES_MINUTE, rtc->rtcdev->features); clear_bit(RTC_FEATURE_UPDATE_INTERRUPT, rtc->rtcdev->features); devm_pm_runtime_enable(&pdev->dev); @@ -201,6 +298,13 @@ static int rzn1_rtc_probe(struct platform_device *pdev) /* Disable all interrupts */ writel(0, rtc->base + RZN1_RTC_CTL1); + ret = devm_request_irq(&pdev->dev, alarm_irq, rzn1_rtc_alarm_irq, 0, + dev_name(&pdev->dev), rtc); + if (ret) { + dev_err(&pdev->dev, "RTC timer interrupt not available\n"); + goto dis_runtime_pm; + } + ret = devm_rtc_register_device(rtc->rtcdev); if (ret) goto dis_runtime_pm; From be4a11cf98aff5d456eae947a49b6163393d9420 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Mon, 16 May 2022 10:25:03 +0200 Subject: [PATCH 171/334] rtc: rzn1: Add oscillator offset support The RZN1 RTC can compensate the imprecision of the oscillator up to approximately 190ppm. Seconds can last slightly shorter or longer depending on the configuration. Below ~65ppm of correction, we can change the time spent in a second every minute, which is the most accurate compensation that the RTC can offer. Above, the compensation will be active every 20s. Signed-off-by: Miquel Raynal Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20220516082504.33913-5-miquel.raynal@bootlin.com --- drivers/rtc/rtc-rzn1.c | 73 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) diff --git a/drivers/rtc/rtc-rzn1.c b/drivers/rtc/rtc-rzn1.c index c1b082e3c8a7..980ade8c9601 100644 --- a/drivers/rtc/rtc-rzn1.c +++ b/drivers/rtc/rtc-rzn1.c @@ -245,12 +245,85 @@ static int rzn1_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm) return 0; } +static int rzn1_rtc_read_offset(struct device *dev, long *offset) +{ + struct rzn1_rtc *rtc = dev_get_drvdata(dev); + unsigned int ppb_per_step; + bool subtract; + u32 val; + + val = readl(rtc->base + RZN1_RTC_SUBU); + ppb_per_step = val & RZN1_RTC_SUBU_DEV ? 
1017 : 3051; + subtract = val & RZN1_RTC_SUBU_DECR; + val &= 0x3F; + + if (!val) + *offset = 0; + else if (subtract) + *offset = -(((~val) & 0x3F) + 1) * ppb_per_step; + else + *offset = (val - 1) * ppb_per_step; + + return 0; +} + +static int rzn1_rtc_set_offset(struct device *dev, long offset) +{ + struct rzn1_rtc *rtc = dev_get_drvdata(dev); + unsigned int steps; + int stepsh, stepsl; + u32 val; + int ret; + + /* + * Check which resolution mode (every 20 or 60s) can be used. + * Between 2 and 124 clock pulses can be added or substracted. + * + * In 20s mode, the minimum resolution is 2 / (32768 * 20) which is + * close to 3051 ppb. In 60s mode, the resolution is closer to 1017. + */ + stepsh = DIV_ROUND_CLOSEST(offset, 1017); + stepsl = DIV_ROUND_CLOSEST(offset, 3051); + + if (stepsh >= -0x3E && stepsh <= 0x3E) { + /* 1017 ppb per step */ + steps = stepsh; + val |= RZN1_RTC_SUBU_DEV; + } else if (stepsl >= -0x3E && stepsl <= 0x3E) { + /* 3051 ppb per step */ + steps = stepsl; + } else { + return -ERANGE; + } + + if (!steps) + return 0; + + if (steps > 0) { + val |= steps + 1; + } else { + val |= RZN1_RTC_SUBU_DECR; + val |= (~(-steps - 1)) & 0x3F; + } + + ret = readl_poll_timeout(rtc->base + RZN1_RTC_CTL2, val, + !(val & RZN1_RTC_CTL2_WUST), 100, 2000000); + if (ret) + return ret; + + writel(val, rtc->base + RZN1_RTC_SUBU); + + return 0; +} + static const struct rtc_class_ops rzn1_rtc_ops = { .read_time = rzn1_rtc_read_time, .set_time = rzn1_rtc_set_time, .read_alarm = rzn1_rtc_read_alarm, .set_alarm = rzn1_rtc_set_alarm, .alarm_irq_enable = rzn1_rtc_alarm_irq_enable, + .read_offset = rzn1_rtc_read_offset, + .set_offset = rzn1_rtc_set_offset, }; static int rzn1_rtc_probe(struct platform_device *pdev) From 060eceb739e5b30db684666592c2a33d09426651 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Mon, 16 May 2022 10:25:04 +0200 Subject: [PATCH 172/334] MAINTAINERS: Add myself as maintainer of the RZN1 RTC driver After contributing it, I'll volunteer to maintain it. Signed-off-by: Miquel Raynal Acked-by: Geert Uytterhoeven Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20220516082504.33913-6-miquel.raynal@bootlin.com --- MAINTAINERS | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index fd768d43e048..0fc279f89f1f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16849,6 +16849,14 @@ S: Supported F: Documentation/devicetree/bindings/iio/adc/renesas,rzg2l-adc.yaml F: drivers/iio/adc/rzg2l_adc.c +RENESAS RZ/N1 RTC CONTROLLER DRIVER +M: Miquel Raynal +L: linux-rtc@vger.kernel.org +L: linux-renesas-soc@vger.kernel.org +S: Maintained +F: Documentation/devicetree/bindings/rtc/renesas,rzn1-rtc.yaml +F: drivers/rtc/rtc-rzn1.c + RENESAS R-CAR GEN3 & RZ/N1 NAND CONTROLLER DRIVER M: Miquel Raynal L: linux-mtd@lists.infradead.org From bb9b98071eacc5f195fe3e7cb0c7664df4cff60f Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Mon, 18 Apr 2022 22:44:44 -0300 Subject: [PATCH 173/334] dt-binding: pcf85063: Add an entry for pca85073a The PCA85073A RTC has the same programming model as the PCF85063A. Add a compatible entry for it. 
Signed-off-by: Fabio Estevam Acked-by: Rob Herring Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20220419014445.341444-1-festevam@gmail.com --- Documentation/devicetree/bindings/rtc/nxp,pcf85063.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/rtc/nxp,pcf85063.txt b/Documentation/devicetree/bindings/rtc/nxp,pcf85063.txt index 6439682c9319..217b7cd06c11 100644 --- a/Documentation/devicetree/bindings/rtc/nxp,pcf85063.txt +++ b/Documentation/devicetree/bindings/rtc/nxp,pcf85063.txt @@ -2,6 +2,7 @@ Required properties: - compatible: Should one of contain: + "nxp,pca85073a", "nxp,pcf85063", "nxp,pcf85063a", "nxp,pcf85063tp", From aabfe05a824585f64a0620f131841f12ee259a20 Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Mon, 18 Apr 2022 22:44:45 -0300 Subject: [PATCH 174/334] rtc: pcf85063: Add a compatible entry for pca85073a The PCA85073A RTC has the same programming model as the PCF85063A. Add a compatible entry for it. Tested on a custom i.MX6SX based board. Signed-off-by: Fabio Estevam Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20220419014445.341444-2-festevam@gmail.com --- drivers/rtc/rtc-pcf85063.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/rtc/rtc-pcf85063.c b/drivers/rtc/rtc-pcf85063.c index 9760824ec199..095891999da1 100644 --- a/drivers/rtc/rtc-pcf85063.c +++ b/drivers/rtc/rtc-pcf85063.c @@ -650,6 +650,7 @@ static int pcf85063_probe(struct i2c_client *client) } static const struct i2c_device_id pcf85063_ids[] = { + { "pca85073a", PCF85063A }, { "pcf85063", PCF85063 }, { "pcf85063tp", PCF85063TP }, { "pcf85063a", PCF85063A }, @@ -660,6 +661,7 @@ MODULE_DEVICE_TABLE(i2c, pcf85063_ids); #ifdef CONFIG_OF static const struct of_device_id pcf85063_of_match[] = { + { .compatible = "nxp,pca85073a", .data = &pcf85063_cfg[PCF85063A] }, { .compatible = "nxp,pcf85063", .data = &pcf85063_cfg[PCF85063] }, { .compatible = "nxp,pcf85063tp", .data = &pcf85063_cfg[PCF85063TP] }, { .compatible = "nxp,pcf85063a", .data = &pcf85063_cfg[PCF85063A] }, From bce7a01ada6456d00c69cd816357ded268ad780c Mon Sep 17 00:00:00 2001 From: Minghao Chi Date: Thu, 5 May 2022 02:23:14 +0000 Subject: [PATCH 175/334] rtc: simplify the return expression of rx8025_set_offset() Simplify the return expression. Reported-by: Zeal Robot Signed-off-by: Minghao Chi Reviewed-by: Nobuhiro Iwamatsu Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20220505022314.59822-1-chi.minghao@zte.com.cn --- drivers/rtc/rtc-rx8025.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/drivers/rtc/rtc-rx8025.c b/drivers/rtc/rtc-rx8025.c index 5bfdd34a72ff..b32117ccd74b 100644 --- a/drivers/rtc/rtc-rx8025.c +++ b/drivers/rtc/rtc-rx8025.c @@ -436,7 +436,6 @@ static int rx8025_set_offset(struct device *dev, long offset) { struct i2c_client *client = to_i2c_client(dev); u8 digoff; - int err; offset /= RX8025_ADJ_RESOLUTION; if (offset > RX8025_ADJ_DATA_MAX) @@ -449,11 +448,7 @@ static int rx8025_set_offset(struct device *dev, long offset) offset += 128; digoff = offset; - err = rx8025_write_reg(client, RX8025_REG_DIGOFF, digoff); - if (err) - return err; - - return 0; + return rx8025_write_reg(client, RX8025_REG_DIGOFF, digoff); } static const struct rtc_class_ops rx8025_rtc_ops = { From a37bdde620c2eb89505ee42ff60ed7030b50f71a Mon Sep 17 00:00:00 2001 From: Nobuhiro Iwamatsu Date: Sat, 7 May 2022 08:28:50 +0900 Subject: [PATCH 176/334] rtc: meson: Fix email address in MODULE_AUTHOR Ben Dooks's email address is . 
Fix Ben Dooks's email address in MODULE_AUTHOR. Signed-off-by: Nobuhiro Iwamatsu Reviewed-by: Martin Blumenstingl Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20220506232850.220582-1-nobuhiro1.iwamatsu@toshiba.co.jp --- drivers/rtc/rtc-meson.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/rtc/rtc-meson.c b/drivers/rtc/rtc-meson.c index 44bdc8b4a90d..db1d626edca5 100644 --- a/drivers/rtc/rtc-meson.c +++ b/drivers/rtc/rtc-meson.c @@ -399,7 +399,7 @@ static struct platform_driver meson_rtc_driver = { module_platform_driver(meson_rtc_driver); MODULE_DESCRIPTION("Amlogic Meson RTC Driver"); -MODULE_AUTHOR("Ben Dooks "); +MODULE_AUTHOR("Ben Dooks "); MODULE_AUTHOR("Martin Blumenstingl "); MODULE_LICENSE("GPL v2"); MODULE_ALIAS("platform:meson-rtc"); From 6ddabcb106280db0f7344850adfce3dd6b171cbd Mon Sep 17 00:00:00 2001 From: Yuan Can Date: Wed, 11 May 2022 07:13:54 +0000 Subject: [PATCH 177/334] rtc: gamecube: Add missing iounmap in gamecube_rtc_read_offset_from_sram The hw_srnprot needs to be unmapped when gamecube_rtc_read_offset_from_sram returns. Fixs: 86559400b3ef9d (rtc: gamecube: Add a RTC driver for the GameCube, Wii and Wii U) Signed-off-by: Yuan Can Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20220511071354.46202-1-yuancan@huawei.com --- drivers/rtc/rtc-gamecube.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/rtc/rtc-gamecube.c b/drivers/rtc/rtc-gamecube.c index 18ca3b38b2d0..c2717bb52b2b 100644 --- a/drivers/rtc/rtc-gamecube.c +++ b/drivers/rtc/rtc-gamecube.c @@ -267,6 +267,7 @@ static int gamecube_rtc_read_offset_from_sram(struct priv *d) ret = regmap_read(d->regmap, RTC_SRAM_BIAS, &d->rtc_bias); if (ret) { pr_err("failed to get the RTC bias\n"); + iounmap(hw_srnprot); return -1; } From 2b2c651baf1c24414be4f76b152de80fae7c7786 Mon Sep 17 00:00:00 2001 From: Abhishek Sahu Date: Wed, 18 May 2022 16:46:09 +0530 Subject: [PATCH 178/334] vfio/pci: Invalidate mmaps and block the access in D3hot power state According to [PCIe v5 5.3.1.4.1] for D3hot state "Configuration and Message requests are the only TLPs accepted by a Function in the D3Hot state. All other received Requests must be handled as Unsupported Requests, and all received Completions may optionally be handled as Unexpected Completions." Currently, if the vfio PCI device has been put into D3hot state and if user makes non-config related read/write request in D3hot state, these requests will be forwarded to the host and this access may cause issues on a few systems. This patch leverages the memory-disable support added in commit 'abafbc551fdd ("vfio-pci: Invalidate mmaps and block MMIO access on disabled memory")' to generate page fault on mmap access and return error for the direct read/write. If the device is D3hot state, then the error will be returned for MMIO access. The IO access generally does not make the system unresponsive so the IO access can still happen in D3hot state. The default value should be returned in this case without bringing down the complete system. Also, the power related structure fields need to be protected so we can use the same 'memory_lock' to protect these fields also. This protection is mainly needed when user changes the PCI power state by writing into PCI_PM_CTRL register. vfio_lock_and_set_power_state() wrapper function will take the required locks and then it will invoke the vfio_pci_set_power_state(). 
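For context, the user-initiated path the commit message refers to (a PCI_PM_CTRL write through the emulated config space) looks roughly like the userspace sketch below. It is not part of the patch: vfio_set_d3hot() is a hypothetical helper, the device fd is assumed to have been obtained via VFIO_GROUP_GET_DEVICE_FD, and a little-endian host is assumed to keep it short.

#include <stdint.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/pci_regs.h>
#include <linux/vfio.h>

/* Request D3hot through vfio-pci's emulated config space (sketch). */
static int vfio_set_d3hot(int device_fd)
{
	struct vfio_region_info info = {
		.argsz = sizeof(info),
		.index = VFIO_PCI_CONFIG_REGION_INDEX,
	};
	uint8_t cap, id;
	uint16_t ctrl;

	if (ioctl(device_fd, VFIO_DEVICE_GET_REGION_INFO, &info))
		return -1;

	/* Walk the capability list to find the Power Management capability. */
	if (pread(device_fd, &cap, 1, info.offset + PCI_CAPABILITY_LIST) != 1)
		return -1;
	while (cap) {
		if (pread(device_fd, &id, 1, info.offset + cap + PCI_CAP_LIST_ID) != 1)
			return -1;
		if (id == PCI_CAP_ID_PM)
			break;
		if (pread(device_fd, &cap, 1, info.offset + cap + PCI_CAP_LIST_NEXT) != 1)
			return -1;
	}
	if (!cap)
		return -1;

	/* Set the power state field to 3 (D3hot); vfio-pci traps this write. */
	if (pread(device_fd, &ctrl, 2, info.offset + cap + PCI_PM_CTRL) != 2)
		return -1;
	ctrl = (ctrl & ~PCI_PM_CTRL_STATE_MASK) | 3;
	if (pwrite(device_fd, &ctrl, 2, info.offset + cap + PCI_PM_CTRL) != 2)
		return -1;

	return 0;
}

Once such a write lands, this patch makes mmap'ed BAR access fault and direct MMIO read/write return an error until the device is moved back to D0.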
Signed-off-by: Abhishek Sahu Link: https://lore.kernel.org/r/20220518111612.16985-2-abhsahu@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_config.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c index 6e58b4bf7a60..ea7d2306ba9d 100644 --- a/drivers/vfio/pci/vfio_pci_config.c +++ b/drivers/vfio/pci/vfio_pci_config.c @@ -402,11 +402,14 @@ bool __vfio_pci_memory_enabled(struct vfio_pci_core_device *vdev) u16 cmd = le16_to_cpu(*(__le16 *)&vdev->vconfig[PCI_COMMAND]); /* + * Memory region cannot be accessed if device power state is D3. + * * SR-IOV VF memory enable is handled by the MSE bit in the * PF SR-IOV capability, there's therefore no need to trigger * faults based on the virtual value. */ - return pdev->no_command_memory || (cmd & PCI_COMMAND_MEMORY); + return pdev->current_state < PCI_D3hot && + (pdev->no_command_memory || (cmd & PCI_COMMAND_MEMORY)); } /* @@ -692,6 +695,22 @@ static int __init init_pci_cap_basic_perm(struct perm_bits *perm) return 0; } +/* + * It takes all the required locks to protect the access of power related + * variables and then invokes vfio_pci_set_power_state(). + */ +static void vfio_lock_and_set_power_state(struct vfio_pci_core_device *vdev, + pci_power_t state) +{ + if (state >= PCI_D3hot) + vfio_pci_zap_and_down_write_memory_lock(vdev); + else + down_write(&vdev->memory_lock); + + vfio_pci_set_power_state(vdev, state); + up_write(&vdev->memory_lock); +} + static int vfio_pm_config_write(struct vfio_pci_core_device *vdev, int pos, int count, struct perm_bits *perm, int offset, __le32 val) @@ -718,7 +737,7 @@ static int vfio_pm_config_write(struct vfio_pci_core_device *vdev, int pos, break; } - vfio_pci_set_power_state(vdev, state); + vfio_lock_and_set_power_state(vdev, state); } return count; From f4162eb1e2fc9b423dfb8f3a7b2b55a337efcc60 Mon Sep 17 00:00:00 2001 From: Abhishek Sahu Date: Wed, 18 May 2022 16:46:10 +0530 Subject: [PATCH 179/334] vfio/pci: Change the PF power state to D0 before enabling VFs According to [PCIe v5 9.6.2] for PF Device Power Management States "The PF's power management state (D-state) has global impact on its associated VFs. If a VF does not implement the Power Management Capability, then it behaves as if it is in an equivalent power state of its associated PF. If a VF implements the Power Management Capability, the Device behavior is undefined if the PF is placed in a lower power state than the VF. Software should avoid this situation by placing all VFs in lower power state before lowering their associated PF's power state." From the vfio driver side, user can enable SR-IOV when the PF is in D3hot state. If VF does not implement the Power Management Capability, then the VF will be actually in D3hot state and then the VF BAR access will fail. If VF implements the Power Management Capability, then VF will assume that its current power state is D0 when the PF is D3hot and in this case, the behavior is undefined. To support PF power management, we need to create power management dependency between PF and its VF's. The runtime power management support may help with this where power management dependencies are supported through device links. But till we have such support in place, we can disallow the PF to go into low power state, if PF has VF enabled. There can be a case, where user first enables the VF's and then disables the VF's. 
If there is no user of PF, then the PF can put into D3hot state again. But with this patch, the PF will still be in D0 state after disabling VF's since detecting this case inside vfio_pci_core_sriov_configure() requires access to struct vfio_device::open_count along with its locks. But the subsequent patches related to runtime PM will handle this case since runtime PM maintains its own usage count. Also, vfio_pci_core_sriov_configure() can be called at any time (with and without vfio pci device user), so the power state change and SR-IOV enablement need to be protected with the required locks. Signed-off-by: Abhishek Sahu Link: https://lore.kernel.org/r/20220518111612.16985-3-abhsahu@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_core.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 05a3aa95ba52..9489ceea8875 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -217,6 +217,10 @@ int vfio_pci_set_power_state(struct vfio_pci_core_device *vdev, pci_power_t stat bool needs_restore = false, needs_save = false; int ret; + /* Prevent changing power state for PFs with VFs enabled */ + if (pci_num_vf(pdev) && state > PCI_D0) + return -EBUSY; + if (vdev->needs_pm_restore) { if (pdev->current_state < PCI_D3hot && state >= PCI_D3hot) { pci_save_state(pdev); @@ -1944,7 +1948,19 @@ int vfio_pci_core_sriov_configure(struct vfio_pci_core_device *vdev, } list_add_tail(&vdev->sriov_pfs_item, &vfio_pci_sriov_pfs); mutex_unlock(&vfio_pci_sriov_pfs_mutex); + + /* + * The PF power state should always be higher than the VF power + * state. If PF is in the low power state, then change the + * power state to D0 first before enabling SR-IOV. + * Also, this function can be called at any time, and userspace + * PCI_PM_CTRL write can race against this code path, + * so protect the same with 'memory_lock'. + */ + down_write(&vdev->memory_lock); + vfio_pci_set_power_state(vdev, PCI_D0); ret = pci_enable_sriov(pdev, nr_virtfn); + up_write(&vdev->memory_lock); if (ret) goto out_del; return nr_virtfn; From 54918c28740109e4ef4feca22d38e5fe41712b1a Mon Sep 17 00:00:00 2001 From: Abhishek Sahu Date: Wed, 18 May 2022 16:46:11 +0530 Subject: [PATCH 180/334] vfio/pci: Virtualize PME related registers bits and initialize to zero If any PME event will be generated by PCI, then it will be mostly handled in the host by the root port PME code. For example, in the case of PCIe, the PME event will be sent to the root port and then the PME interrupt will be generated. This will be handled in drivers/pci/pcie/pme.c at the host side. Inside this, the pci_check_pme_status() will be called where PME_Status and PME_En bits will be cleared. So, the guest OS which is using vfio-pci device will not come to know about this PME event. To handle these PME events inside guests, we need some framework so that if any PME events will happen, then it needs to be forwarded to virtual machine monitor. We can virtualize PME related registers bits and initialize these bits to zero so vfio-pci device user will assume that it is not capable of asserting the PME# signal from any power state. 
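Concretely, the user-visible invariant can be summarised by the short sketch below. It is illustrative only; pmc and ctrl stand for the PM Capabilities and PM Control/Status words as read back through vfio-pci's virtualized config space (e.g. with pread() on the config region).

#include <assert.h>
#include <stdint.h>
#include <linux/pci_regs.h>

/* After this change, the PM capability never advertises PME# support and
 * the PME_En / PME_Status bits always read back as zero. */
static void check_pme_hidden(uint16_t pmc, uint16_t ctrl)
{
	assert((pmc & PCI_PM_CAP_PME_MASK) == 0);
	assert((ctrl & (PCI_PM_CTRL_PME_ENABLE | PCI_PM_CTRL_PME_STATUS)) == 0);
}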
Signed-off-by: Abhishek Sahu Link: https://lore.kernel.org/r/20220518111612.16985-4-abhsahu@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_config.c | 33 +++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c index ea7d2306ba9d..9343f597182d 100644 --- a/drivers/vfio/pci/vfio_pci_config.c +++ b/drivers/vfio/pci/vfio_pci_config.c @@ -757,12 +757,29 @@ static int __init init_pci_cap_pm_perm(struct perm_bits *perm) */ p_setb(perm, PCI_CAP_LIST_NEXT, (u8)ALL_VIRT, NO_WRITE); + /* + * The guests can't process PME events. If any PME event will be + * generated, then it will be mostly handled in the host and the + * host will clear the PME_STATUS. So virtualize PME_Support bits. + * The vconfig bits will be cleared during device capability + * initialization. + */ + p_setw(perm, PCI_PM_PMC, PCI_PM_CAP_PME_MASK, NO_WRITE); + /* * Power management is defined *per function*, so we can let * the user change power state, but we trap and initiate the * change ourselves, so the state bits are read-only. + * + * The guest can't process PME from D3cold so virtualize PME_Status + * and PME_En bits. The vconfig bits will be cleared during device + * capability initialization. */ - p_setd(perm, PCI_PM_CTRL, NO_VIRT, ~PCI_PM_CTRL_STATE_MASK); + p_setd(perm, PCI_PM_CTRL, + PCI_PM_CTRL_PME_ENABLE | PCI_PM_CTRL_PME_STATUS, + ~(PCI_PM_CTRL_PME_ENABLE | PCI_PM_CTRL_PME_STATUS | + PCI_PM_CTRL_STATE_MASK)); + return 0; } @@ -1431,6 +1448,17 @@ static int vfio_ext_cap_len(struct vfio_pci_core_device *vdev, u16 ecap, u16 epo return 0; } +static void vfio_update_pm_vconfig_bytes(struct vfio_pci_core_device *vdev, + int offset) +{ + __le16 *pmc = (__le16 *)&vdev->vconfig[offset + PCI_PM_PMC]; + __le16 *ctrl = (__le16 *)&vdev->vconfig[offset + PCI_PM_CTRL]; + + /* Clear vconfig PME_Support, PME_Status, and PME_En bits */ + *pmc &= ~cpu_to_le16(PCI_PM_CAP_PME_MASK); + *ctrl &= ~cpu_to_le16(PCI_PM_CTRL_PME_ENABLE | PCI_PM_CTRL_PME_STATUS); +} + static int vfio_fill_vconfig_bytes(struct vfio_pci_core_device *vdev, int offset, int size) { @@ -1554,6 +1582,9 @@ static int vfio_cap_init(struct vfio_pci_core_device *vdev) if (ret) return ret; + if (cap == PCI_CAP_ID_PM) + vfio_update_pm_vconfig_bytes(vdev, pos); + prev = &vdev->vconfig[pos + PCI_CAP_LIST_NEXT]; pos = next; caps++; From 7ab5e10eda02da1d9562ffde562c51055d368e9c Mon Sep 17 00:00:00 2001 From: Abhishek Sahu Date: Wed, 18 May 2022 16:46:12 +0530 Subject: [PATCH 181/334] vfio/pci: Move the unused device into low power state with runtime PM Currently, there is very limited power management support available in the upstream vfio_pci_core based drivers. If there are no users of the device, then the PCI device will be moved into D3hot state by writing directly into PCI PM registers. This D3hot state help in saving power but we can achieve zero power consumption if we go into the D3cold state. The D3cold state cannot be possible with native PCI PM. It requires interaction with platform firmware which is system-specific. To go into low power states (including D3cold), the runtime PM framework can be used which internally interacts with PCI and platform firmware and puts the device into the lowest possible D-States. This patch registers vfio_pci_core based drivers with the runtime PM framework. 1. The PCI core framework takes care of most of the runtime PM related things. 
For enabling the runtime PM, the PCI driver needs to decrement the usage count and needs to provide 'struct dev_pm_ops' at least. The runtime suspend/resume callbacks are optional and needed only if we need to do any extra handling. Now there are multiple vfio_pci_core based drivers. Instead of assigning the 'struct dev_pm_ops' in individual parent driver, the vfio_pci_core itself assigns the 'struct dev_pm_ops'. There are other drivers where the 'struct dev_pm_ops' is being assigned inside core layer (For example, wlcore_probe() and some sound based driver, etc.). 2. This patch provides the stub implementation of 'struct dev_pm_ops'. The subsequent patch will provide the runtime suspend/resume callbacks. All the config state saving, and PCI power management related things will be done by PCI core framework itself inside its runtime suspend/resume callbacks (pci_pm_runtime_suspend() and pci_pm_runtime_resume()). 3. Inside pci_reset_bus(), all the devices in dev_set needs to be runtime resumed. vfio_pci_dev_set_pm_runtime_get() will take care of the runtime resume and its error handling. 4. Inside vfio_pci_core_disable(), the device usage count always needs to be decremented which was incremented in vfio_pci_core_enable(). 5. Since the runtime PM framework will provide the same functionality, so directly writing into PCI PM config register can be replaced with the use of runtime PM routines. Also, the use of runtime PM can help us in more power saving. In the systems which do not support D3cold, With the existing implementation: // PCI device # cat /sys/bus/pci/devices/0000\:01\:00.0/power_state D3hot // upstream bridge # cat /sys/bus/pci/devices/0000\:00\:01.0/power_state D0 With runtime PM: // PCI device # cat /sys/bus/pci/devices/0000\:01\:00.0/power_state D3hot // upstream bridge # cat /sys/bus/pci/devices/0000\:00\:01.0/power_state D3hot So, with runtime PM, the upstream bridge or root port will also go into lower power state which is not possible with existing implementation. In the systems which support D3cold, // PCI device # cat /sys/bus/pci/devices/0000\:01\:00.0/power_state D3hot // upstream bridge # cat /sys/bus/pci/devices/0000\:00\:01.0/power_state D0 With runtime PM: // PCI device # cat /sys/bus/pci/devices/0000\:01\:00.0/power_state D3cold // upstream bridge # cat /sys/bus/pci/devices/0000\:00\:01.0/power_state D3cold So, with runtime PM, both the PCI device and upstream bridge will go into D3cold state. 6. If 'disable_idle_d3' module parameter is set, then also the runtime PM will be enabled, but in this case, the usage count should not be decremented. 7. vfio_pci_dev_set_try_reset() return value is unused now, so this function return type can be changed to void. 8. Use the runtime PM API's in vfio_pci_core_sriov_configure(). The device can be in low power state either with runtime power management (when there is no user) or PCI_PM_CTRL register write by the user. In both the cases, the PF should be moved to D0 state. For preventing any runtime usage mismatch, pci_num_vf() has been called explicitly during disable. 
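For context, the usage-count contract that the reset and SR-IOV paths above rely on can be sketched as follows (illustrative only, not code from this series): every successful pm_runtime_resume_and_get() must be balanced by a pm_runtime_put(), otherwise the device can never runtime-suspend into D3hot/D3cold again.

#include <linux/pci.h>
#include <linux/pm_runtime.h>

static int example_access_device(struct pci_dev *pdev)
{
	int ret;

	/* Bump the usage count and resume the device if it was suspended. */
	ret = pm_runtime_resume_and_get(&pdev->dev);
	if (ret < 0)
		return ret;

	/* ... config space / MMIO access is safe here ... */

	/* Drop the count; the device may go back to a low power state. */
	pm_runtime_put(&pdev->dev);

	return 0;
}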
Signed-off-by: Abhishek Sahu Link: https://lore.kernel.org/r/20220518111612.16985-5-abhsahu@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_core.c | 170 ++++++++++++++++++++----------- 1 file changed, 113 insertions(+), 57 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 9489ceea8875..a0d69ddaf90d 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -156,7 +156,7 @@ no_mmap: } struct vfio_pci_group_info; -static bool vfio_pci_dev_set_try_reset(struct vfio_device_set *dev_set); +static void vfio_pci_dev_set_try_reset(struct vfio_device_set *dev_set); static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set, struct vfio_pci_group_info *groups); @@ -259,6 +259,17 @@ int vfio_pci_set_power_state(struct vfio_pci_core_device *vdev, pci_power_t stat return ret; } +/* + * The dev_pm_ops needs to be provided to make pci-driver runtime PM working, + * so use structure without any callbacks. + * + * The pci-driver core runtime PM routines always save the device state + * before going into suspended state. If the device is going into low power + * state with only with runtime PM ops, then no explicit handling is needed + * for the devices which have NoSoftRst-. + */ +static const struct dev_pm_ops vfio_pci_core_pm_ops = { }; + int vfio_pci_core_enable(struct vfio_pci_core_device *vdev) { struct pci_dev *pdev = vdev->pdev; @@ -266,21 +277,23 @@ int vfio_pci_core_enable(struct vfio_pci_core_device *vdev) u16 cmd; u8 msix_pos; - vfio_pci_set_power_state(vdev, PCI_D0); + if (!disable_idle_d3) { + ret = pm_runtime_resume_and_get(&pdev->dev); + if (ret < 0) + return ret; + } /* Don't allow our initial saved state to include busmaster */ pci_clear_master(pdev); ret = pci_enable_device(pdev); if (ret) - return ret; + goto out_power; /* If reset fails because of the device lock, fail this path entirely */ ret = pci_try_reset_function(pdev); - if (ret == -EAGAIN) { - pci_disable_device(pdev); - return ret; - } + if (ret == -EAGAIN) + goto out_disable_device; vdev->reset_works = !ret; pci_save_state(pdev); @@ -304,12 +317,8 @@ int vfio_pci_core_enable(struct vfio_pci_core_device *vdev) } ret = vfio_config_init(vdev); - if (ret) { - kfree(vdev->pci_saved_state); - vdev->pci_saved_state = NULL; - pci_disable_device(pdev); - return ret; - } + if (ret) + goto out_free_state; msix_pos = pdev->msix_cap; if (msix_pos) { @@ -330,6 +339,16 @@ int vfio_pci_core_enable(struct vfio_pci_core_device *vdev) return 0; + +out_free_state: + kfree(vdev->pci_saved_state); + vdev->pci_saved_state = NULL; +out_disable_device: + pci_disable_device(pdev); +out_power: + if (!disable_idle_d3) + pm_runtime_put(&pdev->dev); + return ret; } EXPORT_SYMBOL_GPL(vfio_pci_core_enable); @@ -437,8 +456,11 @@ void vfio_pci_core_disable(struct vfio_pci_core_device *vdev) out: pci_disable_device(pdev); - if (!vfio_pci_dev_set_try_reset(vdev->vdev.dev_set) && !disable_idle_d3) - vfio_pci_set_power_state(vdev, PCI_D3hot); + vfio_pci_dev_set_try_reset(vdev->vdev.dev_set); + + /* Put the pm-runtime usage counter acquired during enable */ + if (!disable_idle_d3) + pm_runtime_put(&pdev->dev); } EXPORT_SYMBOL_GPL(vfio_pci_core_disable); @@ -1823,10 +1845,11 @@ EXPORT_SYMBOL_GPL(vfio_pci_core_uninit_device); int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev) { struct pci_dev *pdev = vdev->pdev; + struct device *dev = &pdev->dev; int ret; /* Drivers must set the vfio_pci_core_device to their drvdata */ - if (WARN_ON(vdev 
!= dev_get_drvdata(&vdev->pdev->dev))) + if (WARN_ON(vdev != dev_get_drvdata(dev))) return -EINVAL; if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL) @@ -1868,19 +1891,21 @@ int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev) vfio_pci_probe_power_state(vdev); - if (!disable_idle_d3) { - /* - * pci-core sets the device power state to an unknown value at - * bootup and after being removed from a driver. The only - * transition it allows from this unknown state is to D0, which - * typically happens when a driver calls pci_enable_device(). - * We're not ready to enable the device yet, but we do want to - * be able to get to D3. Therefore first do a D0 transition - * before going to D3. - */ - vfio_pci_set_power_state(vdev, PCI_D0); - vfio_pci_set_power_state(vdev, PCI_D3hot); - } + /* + * pci-core sets the device power state to an unknown value at + * bootup and after being removed from a driver. The only + * transition it allows from this unknown state is to D0, which + * typically happens when a driver calls pci_enable_device(). + * We're not ready to enable the device yet, but we do want to + * be able to get to D3. Therefore first do a D0 transition + * before enabling runtime PM. + */ + vfio_pci_set_power_state(vdev, PCI_D0); + + dev->driver->pm = &vfio_pci_core_pm_ops; + pm_runtime_allow(dev); + if (!disable_idle_d3) + pm_runtime_put(dev); ret = vfio_register_group_dev(&vdev->vdev); if (ret) @@ -1889,7 +1914,9 @@ int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev) out_power: if (!disable_idle_d3) - vfio_pci_set_power_state(vdev, PCI_D0); + pm_runtime_get_noresume(dev); + + pm_runtime_forbid(dev); out_vf: vfio_pci_vf_uninit(vdev); return ret; @@ -1906,7 +1933,9 @@ void vfio_pci_core_unregister_device(struct vfio_pci_core_device *vdev) vfio_pci_vga_uninit(vdev); if (!disable_idle_d3) - vfio_pci_set_power_state(vdev, PCI_D0); + pm_runtime_get_noresume(&vdev->pdev->dev); + + pm_runtime_forbid(&vdev->pdev->dev); } EXPORT_SYMBOL_GPL(vfio_pci_core_unregister_device); @@ -1951,22 +1980,33 @@ int vfio_pci_core_sriov_configure(struct vfio_pci_core_device *vdev, /* * The PF power state should always be higher than the VF power - * state. If PF is in the low power state, then change the - * power state to D0 first before enabling SR-IOV. - * Also, this function can be called at any time, and userspace - * PCI_PM_CTRL write can race against this code path, + * state. The PF can be in low power state either with runtime + * power management (when there is no user) or PCI_PM_CTRL + * register write by the user. If PF is in the low power state, + * then change the power state to D0 first before enabling + * SR-IOV. Also, this function can be called at any time, and + * userspace PCI_PM_CTRL write can race against this code path, * so protect the same with 'memory_lock'. 
*/ + ret = pm_runtime_resume_and_get(&pdev->dev); + if (ret) + goto out_del; + down_write(&vdev->memory_lock); vfio_pci_set_power_state(vdev, PCI_D0); ret = pci_enable_sriov(pdev, nr_virtfn); up_write(&vdev->memory_lock); - if (ret) + if (ret) { + pm_runtime_put(&pdev->dev); goto out_del; + } return nr_virtfn; } - pci_disable_sriov(pdev); + if (pci_num_vf(pdev)) { + pci_disable_sriov(pdev); + pm_runtime_put(&pdev->dev); + } out_del: mutex_lock(&vfio_pci_sriov_pfs_mutex); @@ -2041,6 +2081,27 @@ vfio_pci_dev_set_resettable(struct vfio_device_set *dev_set) return pdev; } +static int vfio_pci_dev_set_pm_runtime_get(struct vfio_device_set *dev_set) +{ + struct vfio_pci_core_device *cur; + int ret; + + list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) { + ret = pm_runtime_resume_and_get(&cur->pdev->dev); + if (ret) + goto unwind; + } + + return 0; + +unwind: + list_for_each_entry_continue_reverse(cur, &dev_set->device_list, + vdev.dev_set_list) + pm_runtime_put(&cur->pdev->dev); + + return ret; +} + /* * We need to get memory_lock for each device, but devices can share mmap_lock, * therefore we need to zap and hold the vma_lock for each device, and only then @@ -2147,43 +2208,38 @@ static bool vfio_pci_dev_set_needs_reset(struct vfio_device_set *dev_set) * - At least one of the affected devices is marked dirty via * needs_reset (such as by lack of FLR support) * Then attempt to perform that bus or slot reset. - * Returns true if the dev_set was reset. */ -static bool vfio_pci_dev_set_try_reset(struct vfio_device_set *dev_set) +static void vfio_pci_dev_set_try_reset(struct vfio_device_set *dev_set) { struct vfio_pci_core_device *cur; struct pci_dev *pdev; - int ret; + bool reset_done = false; if (!vfio_pci_dev_set_needs_reset(dev_set)) - return false; + return; pdev = vfio_pci_dev_set_resettable(dev_set); if (!pdev) - return false; + return; /* - * The pci_reset_bus() will reset all the devices in the bus. - * The power state can be non-D0 for some of the devices in the bus. - * For these devices, the pci_reset_bus() will internally set - * the power state to D0 without vfio driver involvement. - * For the devices which have NoSoftRst-, the reset function can - * cause the PCI config space reset without restoring the original - * state (saved locally in 'vdev->pm_save'). + * Some of the devices in the bus can be in the runtime suspended + * state. Increment the usage count for all the devices in the dev_set + * before reset and decrement the same after reset. */ - list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) - vfio_pci_set_power_state(cur, PCI_D0); + if (!disable_idle_d3 && vfio_pci_dev_set_pm_runtime_get(dev_set)) + return; - ret = pci_reset_bus(pdev); - if (ret) - return false; + if (!pci_reset_bus(pdev)) + reset_done = true; list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) { - cur->needs_reset = false; + if (reset_done) + cur->needs_reset = false; + if (!disable_idle_d3) - vfio_pci_set_power_state(cur, PCI_D3hot); + pm_runtime_put(&cur->pdev->dev); } - return true; } void vfio_pci_core_set_params(bool is_nointxmask, bool is_disable_vga, From 958ed92922028ec67f504dcdc72bfdfd0f43936a Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 17 May 2022 11:37:23 +0800 Subject: [PATCH 182/334] f2fs: fix fallocate to use file_modified to update permissions consistently This patch tries to fix permission consistency issue as all other mainline filesystems. 
Since the initial introduction of (posix) fallocate back at the turn of the century, it has been possible to use this syscall to change the user-visible contents of files. This can happen by extending the file size during a preallocation, or through any of the newer modes (punch, zero, collapse, insert range). Because the call can be used to change file contents, we should treat it like we do any other modification to a file -- update the mtime, and drop set[ug]id privileges/capabilities. The VFS function file_modified() does all this for us if pass it a locked inode, so let's make fallocate drop permissions correctly. Cc: stable@kernel.org Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 216081ea8c81..7aac53ac5acf 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1780,6 +1780,10 @@ static long f2fs_fallocate(struct file *file, int mode, inode_lock(inode); + ret = file_modified(file); + if (ret) + goto out; + if (mode & FALLOC_FL_PUNCH_HOLE) { if (offset >= inode->i_size) goto out; From 677a82b44ebf263d4f9a0cfbd576a6ade797a07b Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 18 May 2022 20:28:41 +0800 Subject: [PATCH 183/334] f2fs: fix to do sanity check for inline inode Yanming reported a kernel bug in Bugzilla kernel [1], which can be reproduced. The bug message is: The kernel message is shown below: kernel BUG at fs/inode.c:611! Call Trace: evict+0x282/0x4e0 __dentry_kill+0x2b2/0x4d0 dput+0x2dd/0x720 do_renameat2+0x596/0x970 __x64_sys_rename+0x78/0x90 do_syscall_64+0x3b/0x90 [1] https://bugzilla.kernel.org/show_bug.cgi?id=215895 The bug is due to fuzzed inode has both inline_data and encrypted flags. During f2fs_evict_inode(), as the inode was deleted by rename(), it will cause inline data conversion due to conflicting flags. The page cache will be polluted and the panic will be triggered in clear_inode(). Try fixing the bug by doing more sanity checks for inline data inode in sanity_check_inode(). 
Cc: stable@vger.kernel.org Reported-by: Ming Yan Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 + fs/f2fs/inline.c | 29 ++++++++++++++++++++++++----- fs/f2fs/inode.c | 3 +-- 3 files changed, 26 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index e9e32bc814df..000468bf06ca 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -4019,6 +4019,7 @@ extern struct kmem_cache *f2fs_inode_entry_slab; * inline.c */ bool f2fs_may_inline_data(struct inode *inode); +bool f2fs_sanity_check_inline_data(struct inode *inode); bool f2fs_may_inline_dentry(struct inode *inode); void f2fs_do_read_inline_data(struct page *page, struct page *ipage); void f2fs_truncate_inline_inode(struct inode *inode, diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index a578bf83b803..bf46a7dfbea2 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -14,21 +14,40 @@ #include "node.h" #include -bool f2fs_may_inline_data(struct inode *inode) +static bool support_inline_data(struct inode *inode) { if (f2fs_is_atomic_file(inode)) return false; - if (!S_ISREG(inode->i_mode) && !S_ISLNK(inode->i_mode)) return false; - if (i_size_read(inode) > MAX_INLINE_DATA(inode)) return false; + return true; +} - if (f2fs_post_read_required(inode)) +bool f2fs_may_inline_data(struct inode *inode) +{ + if (!support_inline_data(inode)) return false; - return true; + return !f2fs_post_read_required(inode); +} + +bool f2fs_sanity_check_inline_data(struct inode *inode) +{ + if (!f2fs_has_inline_data(inode)) + return false; + + if (!support_inline_data(inode)) + return true; + + /* + * used by sanity_check_inode(), when disk layout fields has not + * been synchronized to inmem fields. + */ + return (S_ISREG(inode->i_mode) && + (file_is_encrypt(inode) || file_is_verity(inode) || + (F2FS_I(inode)->i_flags & F2FS_COMPR_FL))); } bool f2fs_may_inline_dentry(struct inode *inode) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 2fce8fa0dac8..938961a9084e 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -276,8 +276,7 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) } } - if (f2fs_has_inline_data(inode) && - (!S_ISREG(inode->i_mode) && !S_ISLNK(inode->i_mode))) { + if (f2fs_sanity_check_inline_data(inode)) { set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "%s: inode (ino=%lx, mode=%u) should not have inline_data, run fsck to fix", __func__, inode->i_ino, inode->i_mode); From e60e8a73235ce5d42a2891c6989e8df1c8888c4a Mon Sep 17 00:00:00 2001 From: Yang Li Date: Thu, 19 May 2022 07:24:45 +0800 Subject: [PATCH 184/334] rtc: rzn1: fix platform_no_drv_owner.cocci warning Remove .owner field if calls are used which set it automatically. ./drivers/rtc/rtc-rzn1.c:411:3-8: No need to set .owner here. The core will do it. 
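For reference, the core can fill in .owner because platform_driver_register() is a macro wrapper that passes THIS_MODULE down for the driver (abridged below from include/linux/platform_device.h and drivers/base/platform.c):

#define platform_driver_register(drv) \
	__platform_driver_register(drv, THIS_MODULE)

/* __platform_driver_register() then does, among other things:
 *
 *	drv->driver.owner = owner;
 *	drv->driver.bus = &platform_bus_type;
 *
 * so an explicit .owner = THIS_MODULE in the driver structure is redundant. */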
Reported-by: Abaci Robot Signed-off-by: Yang Li Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20220518232445.79156-1-yang.lee@linux.alibaba.com --- drivers/rtc/rtc-rzn1.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/rtc/rtc-rzn1.c b/drivers/rtc/rtc-rzn1.c index 980ade8c9601..f92d1398b0f1 100644 --- a/drivers/rtc/rtc-rzn1.c +++ b/drivers/rtc/rtc-rzn1.c @@ -408,7 +408,6 @@ static struct platform_driver rzn1_rtc_driver = { .remove = rzn1_rtc_remove, .driver = { .name = "rzn1-rtc", - .owner = THIS_MODULE, .of_match_table = rzn1_rtc_of_match, }, }; From d3b43eb505bffb8e4cdf6800c15660c001553fe6 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Thu, 5 May 2022 20:50:43 +0800 Subject: [PATCH 185/334] rtc: mt6397: check return value after calling platform_get_resource() It will cause null-ptr-deref if platform_get_resource() returns NULL, we need check the return value. Fixes: fc2979118f3f ("rtc: mediatek: Add MT6397 RTC driver") Signed-off-by: Yang Yingliang Reviewed-by: AngeloGioacchino Del Regno Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20220505125043.1594771-1-yangyingliang@huawei.com --- drivers/rtc/rtc-mt6397.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/rtc/rtc-mt6397.c b/drivers/rtc/rtc-mt6397.c index 80dc479a6ff0..1d297af80f87 100644 --- a/drivers/rtc/rtc-mt6397.c +++ b/drivers/rtc/rtc-mt6397.c @@ -269,6 +269,8 @@ static int mtk_rtc_probe(struct platform_device *pdev) return -ENOMEM; res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!res) + return -EINVAL; rtc->addr_base = res->start; rtc->data = of_device_get_match_data(&pdev->dev); From b520cbe5be37b1b9b401c0b6ecbdae32575273db Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Sun, 3 Apr 2022 05:49:12 +0000 Subject: [PATCH 186/334] rtc: ftrtc010: Fix error handling in ftrtc010_rtc_probe In the error handling path, the clk_prepare_enable() function call should be balanced by a corresponding 'clk_disable_unprepare()' call , as already done in the remove function. clk_disable_unprepare calls clk_disable() and clk_unprepare(). They will use IS_ERR_OR_NULL to check the argument. 
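For reference, clk_disable_unprepare() is simply the two calls in sequence (abridged from include/linux/clk.h); since clk_disable() and clk_unprepare() bail out on IS_ERR_OR_NULL arguments, the unwind labels added below are safe to reach even when an optional clock was never obtained.

static inline void clk_disable_unprepare(struct clk *clk)
{
	clk_disable(clk);
	clk_unprepare(clk);
}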
Fixes: ac05fba39cc5 ("rtc: gemini: Add optional clock handling") Signed-off-by: Miaoqian Lin Reviewed-by: Linus Walleij Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20220403054912.31739-1-linmq006@gmail.com --- drivers/rtc/rtc-ftrtc010.c | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/drivers/rtc/rtc-ftrtc010.c b/drivers/rtc/rtc-ftrtc010.c index 53bb08fe1cd4..25c6e7d9570f 100644 --- a/drivers/rtc/rtc-ftrtc010.c +++ b/drivers/rtc/rtc-ftrtc010.c @@ -137,26 +137,34 @@ static int ftrtc010_rtc_probe(struct platform_device *pdev) ret = clk_prepare_enable(rtc->extclk); if (ret) { dev_err(dev, "failed to enable EXTCLK\n"); - return ret; + goto err_disable_pclk; } } rtc->rtc_irq = platform_get_irq(pdev, 0); - if (rtc->rtc_irq < 0) - return rtc->rtc_irq; + if (rtc->rtc_irq < 0) { + ret = rtc->rtc_irq; + goto err_disable_extclk; + } res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (!res) - return -ENODEV; + if (!res) { + ret = -ENODEV; + goto err_disable_extclk; + } rtc->rtc_base = devm_ioremap(dev, res->start, resource_size(res)); - if (!rtc->rtc_base) - return -ENOMEM; + if (!rtc->rtc_base) { + ret = -ENOMEM; + goto err_disable_extclk; + } rtc->rtc_dev = devm_rtc_allocate_device(dev); - if (IS_ERR(rtc->rtc_dev)) - return PTR_ERR(rtc->rtc_dev); + if (IS_ERR(rtc->rtc_dev)) { + ret = PTR_ERR(rtc->rtc_dev); + goto err_disable_extclk; + } rtc->rtc_dev->ops = &ftrtc010_rtc_ops; @@ -172,9 +180,15 @@ static int ftrtc010_rtc_probe(struct platform_device *pdev) ret = devm_request_irq(dev, rtc->rtc_irq, ftrtc010_rtc_interrupt, IRQF_SHARED, pdev->name, dev); if (unlikely(ret)) - return ret; + goto err_disable_extclk; return devm_rtc_register_device(rtc->rtc_dev); + +err_disable_extclk: + clk_disable_unprepare(rtc->extclk); +err_disable_pclk: + clk_disable_unprepare(rtc->pclk); + return ret; } static int ftrtc010_rtc_remove(struct platform_device *pdev) From d9c454ab2293f6b143d3d1be2bf54d766ed8bfc5 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Thu, 19 May 2022 18:40:10 +0800 Subject: [PATCH 187/334] f2fs: make f2fs_read_inline_data() more readable In f2fs_read_inline_data(), it is confused with checking of inline_data flag, as we checked it before calling. So this patch add some comments for f2fs_has_inline_data(). Signed-off-by: Chao Liu Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 000468bf06ca..52c34651f469 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3167,6 +3167,10 @@ static inline int inline_xattr_size(struct inode *inode) return 0; } +/* + * Notice: check inline_data flag without inode page lock is unsafe. + * It could change at any time by f2fs_convert_inline_page(). + */ static inline int f2fs_has_inline_data(struct inode *inode) { return is_inode_flag_set(inode, FI_INLINE_DATA); From 309001c22cdd75c62e6c3a217bf6967e178f929a Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 20 May 2022 14:41:34 +1000 Subject: [PATCH 188/334] xfs: don't leak da state when freeing the attr intent item kmemleak reported that we lost an xfs_da_state while removing xattrs in generic/020: unreferenced object 0xffff88801c0e4b40 (size 480): comm "attr", pid 30515, jiffies 4294931061 (age 5.960s) hex dump (first 32 bytes): 78 bc 65 07 00 c9 ff ff 00 30 60 1c 80 88 ff ff x.e......0`..... 02 00 00 00 00 00 00 00 80 18 83 4e 80 88 ff ff ...........N.... 
backtrace: [] xfs_da_state_alloc+0x1a/0x30 [xfs] [] xfs_attr_node_hasname+0x23/0x90 [xfs] [] xfs_attr_set_iter+0x441/0xa30 [xfs] [] xfs_xattri_finish_update+0x44/0x80 [xfs] [] xfs_attr_finish_item+0x1e/0x40 [xfs] [] xfs_defer_finish_noroll+0x184/0x740 [xfs] [] __xfs_trans_commit+0x153/0x3e0 [xfs] [] xfs_attr_set+0x469/0x7e0 [xfs] [] xfs_xattr_set+0x89/0xd0 [xfs] [] __vfs_removexattr+0x52/0x70 [] __vfs_removexattr_locked+0xb8/0x150 [] vfs_removexattr+0x56/0x100 [] removexattr+0x58/0x90 [] path_removexattr+0x9e/0xc0 [] __x64_sys_lremovexattr+0x14/0x20 [] do_syscall_64+0x35/0x80 I think this is a consequence of xfs_attr_node_removename_setup attaching a new da(btree) state to xfs_attr_item and never freeing it. I /think/ it's the case that the remove paths could detach the da state earlier in the remove state machine since nothing else accesses the state. However, let's future-proof the new xattr code by adding a catch-all when we free the xfs_attr_item to make sure we never leak the da state. Signed-off-by: Darrick J. Wong Reviewed-by: Allison Henderson Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_attr.c | 22 ++++++++++++++-------- fs/xfs/xfs_attr_item.c | 15 ++++++++++++--- 2 files changed, 26 insertions(+), 11 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 14ae0826bc15..d0418b79056f 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -604,26 +604,29 @@ int xfs_attr_node_removename_setup( struct xfs_attr_item *attr) { struct xfs_da_args *args = attr->xattri_da_args; - struct xfs_da_state **state = &attr->xattri_da_state; + struct xfs_da_state *state; int error; - error = xfs_attr_node_hasname(args, state); + error = xfs_attr_node_hasname(args, &attr->xattri_da_state); if (error != -EEXIST) goto out; error = 0; - ASSERT((*state)->path.blk[(*state)->path.active - 1].bp != NULL); - ASSERT((*state)->path.blk[(*state)->path.active - 1].magic == + state = attr->xattri_da_state; + ASSERT(state->path.blk[state->path.active - 1].bp != NULL); + ASSERT(state->path.blk[state->path.active - 1].magic == XFS_ATTR_LEAF_MAGIC); - error = xfs_attr_leaf_mark_incomplete(args, *state); + error = xfs_attr_leaf_mark_incomplete(args, state); if (error) goto out; if (args->rmtblkno > 0) error = xfs_attr_rmtval_invalidate(args); out: - if (error) - xfs_da_state_free(*state); + if (error) { + xfs_da_state_free(attr->xattri_da_state); + attr->xattri_da_state = NULL; + } return error; } @@ -1456,8 +1459,10 @@ xfs_attr_node_addname_find_attr( return 0; error: - if (attr->xattri_da_state) + if (attr->xattri_da_state) { xfs_da_state_free(attr->xattri_da_state); + attr->xattri_da_state = NULL; + } return error; } @@ -1511,6 +1516,7 @@ xfs_attr_node_try_addname( out: xfs_da_state_free(state); + attr->xattri_da_state = NULL; return error; } diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index e8ac88d9fd14..687cf517841a 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -396,6 +396,15 @@ xfs_attr_create_intent( return &attrip->attri_item; } +static inline void +xfs_attr_free_item( + struct xfs_attr_item *attr) +{ + if (attr->xattri_da_state) + xfs_da_state_free(attr->xattri_da_state); + kmem_free(attr); +} + /* Process an attr. 
*/ STATIC int xfs_attr_finish_item( @@ -420,7 +429,7 @@ xfs_attr_finish_item( error = xfs_xattri_finish_update(attr, done_item); if (error != -EAGAIN) - kmem_free(attr); + xfs_attr_free_item(attr); return error; } @@ -441,7 +450,7 @@ xfs_attr_cancel_item( struct xfs_attr_item *attr; attr = container_of(item, struct xfs_attr_item, xattri_list); - kmem_free(attr); + xfs_attr_free_item(attr); } STATIC xfs_lsn_t @@ -613,7 +622,7 @@ out_unlock: xfs_irele(ip); out: if (ret != -EAGAIN) - kmem_free(attr); + xfs_attr_free_item(attr); return error; } From a618acab136b1b01a4c10957ce8bae70cc9f7ca4 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 20 May 2022 14:41:42 +1000 Subject: [PATCH 189/334] xfs: don't leak the retained da state when doing a leaf to node conversion If a setxattr operation finds an xattr structure in leaf format, adding the attr can fail due to lack of space and hence requires an upgrade to node format. After this happens, we'll roll the transaction and re-enter the state machine, at which time we need to perform a second lookup of the attribute name to find its new location. This lookup attaches a new da state structure to the xfs_attr_item but doesn't free the old one (from the leaf lookup) and leaks it. Fix that. Signed-off-by: Darrick J. Wong Reviewed-by: Allison Henderson Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_attr.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index d0418b79056f..576de34cfca0 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -1401,8 +1401,10 @@ xfs_attr_node_hasname( int retval, error; state = xfs_da_state_alloc(args); - if (statep != NULL) + if (statep != NULL) { + ASSERT(*statep == NULL); *statep = state; + } /* * Search to see if name exists, and get back a pointer to it. @@ -1428,6 +1430,10 @@ xfs_attr_node_addname_find_attr( struct xfs_da_args *args = attr->xattri_da_args; int error; + if (attr->xattri_da_state) + xfs_da_state_free(attr->xattri_da_state); + attr->xattri_da_state = NULL; + /* * Search to see if name already exists, and get back a pointer * to where it should go. @@ -1593,7 +1599,7 @@ STATIC int xfs_attr_node_get( struct xfs_da_args *args) { - struct xfs_da_state *state; + struct xfs_da_state *state = NULL; struct xfs_da_state_blk *blk; int i; int error; From 356cb708ea1886d1f7734cc1a8c4708b35ce5d77 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 20 May 2022 14:41:47 +1000 Subject: [PATCH 190/334] xfs: reject unknown xattri log item operation flags during recovery Make sure we screen the op flags field of recovered xattr intent log items to reject flag bits that we don't know about. Signed-off-by: Darrick J. 
Wong Reviewed-by: Allison Henderson Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_attr_item.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index 687cf517841a..ae227a56bbed 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -349,6 +349,7 @@ xfs_attr_log_item( */ attrp = &attrip->attri_format; attrp->alfi_ino = attr->xattri_da_args->dp->i_ino; + ASSERT(!(attr->xattri_op_flags & ~XFS_ATTR_OP_FLAGS_TYPE_MASK)); attrp->alfi_op_flags = attr->xattri_op_flags; attrp->alfi_value_len = attr->xattri_da_args->valuelen; attrp->alfi_name_len = attr->xattri_da_args->namelen; @@ -496,6 +497,9 @@ xfs_attri_validate( if (attrp->__pad != 0) return false; + if (attrp->alfi_op_flags & ~XFS_ATTR_OP_FLAGS_TYPE_MASK) + return false; + /* alfi_op_flags should be either a set or remove */ switch (op) { case XFS_ATTR_OP_FLAGS_SET: @@ -556,7 +560,8 @@ xfs_attri_item_recover( args = (struct xfs_da_args *)(attr + 1); attr->xattri_da_args = args; - attr->xattri_op_flags = attrp->alfi_op_flags; + attr->xattri_op_flags = attrp->alfi_op_flags & + XFS_ATTR_OP_FLAGS_TYPE_MASK; args->dp = ip; args->geo = mp->m_attr_geo; @@ -567,7 +572,7 @@ xfs_attri_item_recover( args->attr_filter = attrp->alfi_attr_flags; args->op_flags = XFS_DA_OP_RECOVERY | XFS_DA_OP_OKNOENT; - switch (attrp->alfi_op_flags & XFS_ATTR_OP_FLAGS_TYPE_MASK) { + switch (attr->xattri_op_flags) { case XFS_ATTR_OP_FLAGS_SET: case XFS_ATTR_OP_FLAGS_REPLACE: args->value = attrip->attri_value; From 85d76aec6bbb3dd0131511e73d48b2816e7ab8de Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 20 May 2022 14:42:15 +1000 Subject: [PATCH 191/334] xfs: reject unknown xattri log item filter flags during recovery Make sure we screen the "attr flags" field of recovered xattr intent log items to reject flag bits that we don't know about. This is really the attr *filter* field from xfs_da_args, so rename the field and create a mask to make checking for invalid bits easier. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Reviewed-by: Allison Henderson Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_log_format.h | 10 +++++++++- fs/xfs/xfs_attr_item.c | 10 +++++++--- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index f7edd1ecf6d9..a9d08f3d4682 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -911,6 +911,14 @@ struct xfs_icreate_log { #define XFS_ATTR_OP_FLAGS_REPLACE 3 /* Replace the attribute */ #define XFS_ATTR_OP_FLAGS_TYPE_MASK 0xFF /* Flags type mask */ +/* + * alfi_attr_filter captures the state of xfs_da_args.attr_filter, so it should + * never have any other bits set. + */ +#define XFS_ATTRI_FILTER_MASK (XFS_ATTR_ROOT | \ + XFS_ATTR_SECURE | \ + XFS_ATTR_INCOMPLETE) + /* * This is the structure used to lay out an attr log item in the * log. 
@@ -924,7 +932,7 @@ struct xfs_attri_log_format { uint32_t alfi_op_flags; /* marks the op as a set or remove */ uint32_t alfi_name_len; /* attr name length */ uint32_t alfi_value_len; /* attr value length */ - uint32_t alfi_attr_flags;/* attr flags */ + uint32_t alfi_attr_filter;/* attr filter flags */ }; struct xfs_attrd_log_format { diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index ae227a56bbed..fd0a74f3ef45 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -353,7 +353,8 @@ xfs_attr_log_item( attrp->alfi_op_flags = attr->xattri_op_flags; attrp->alfi_value_len = attr->xattri_da_args->valuelen; attrp->alfi_name_len = attr->xattri_da_args->namelen; - attrp->alfi_attr_flags = attr->xattri_da_args->attr_filter; + ASSERT(!(attr->xattri_da_args->attr_filter & ~XFS_ATTRI_FILTER_MASK)); + attrp->alfi_attr_filter = attr->xattri_da_args->attr_filter; memcpy(attrip->attri_name, attr->xattri_da_args->name, attr->xattri_da_args->namelen); @@ -500,6 +501,9 @@ xfs_attri_validate( if (attrp->alfi_op_flags & ~XFS_ATTR_OP_FLAGS_TYPE_MASK) return false; + if (attrp->alfi_attr_filter & ~XFS_ATTRI_FILTER_MASK) + return false; + /* alfi_op_flags should be either a set or remove */ switch (op) { case XFS_ATTR_OP_FLAGS_SET: @@ -569,7 +573,7 @@ xfs_attri_item_recover( args->name = attrip->attri_name; args->namelen = attrp->alfi_name_len; args->hashval = xfs_da_hashname(args->name, args->namelen); - args->attr_filter = attrp->alfi_attr_flags; + args->attr_filter = attrp->alfi_attr_filter & XFS_ATTRI_FILTER_MASK; args->op_flags = XFS_DA_OP_RECOVERY | XFS_DA_OP_OKNOENT; switch (attr->xattri_op_flags) { @@ -658,7 +662,7 @@ xfs_attri_item_relog( new_attrp->alfi_op_flags = old_attrp->alfi_op_flags; new_attrp->alfi_value_len = old_attrp->alfi_value_len; new_attrp->alfi_name_len = old_attrp->alfi_name_len; - new_attrp->alfi_attr_flags = old_attrp->alfi_attr_flags; + new_attrp->alfi_attr_filter = old_attrp->alfi_attr_filter; memcpy(new_attrip->attri_name, old_attrip->attri_name, new_attrip->attri_name_len); From 25b1e9dc32299b98257835910f8e532d805b824f Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 20 May 2022 14:42:36 +1000 Subject: [PATCH 192/334] xfs: validate xattr name earlier in recovery When we're validating a recovered xattr log item during log recovery, we should check the name before starting to allocate resources. This isn't strictly necessary on its own, but it means that we won't bother with huge memory allocations during recovery if the attr name is garbage, which will simplify the changes in the next patch. Signed-off-by: Darrick J. 
Wong Reviewed-by: Dave Chinner Reviewed-by: Allison Henderson Signed-off-by: Dave Chinner --- fs/xfs/xfs_attr_item.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index fd0a74f3ef45..4976b1ddc09f 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -688,16 +688,23 @@ xlog_recover_attri_commit_pass2( struct xfs_mount *mp = log->l_mp; struct xfs_attri_log_item *attrip; struct xfs_attri_log_format *attri_formatp; + const void *attr_name; int region = 0; attri_formatp = item->ri_buf[region].i_addr; + attr_name = item->ri_buf[1].i_addr; - /* Validate xfs_attri_log_format */ + /* Validate xfs_attri_log_format before the large memory allocation */ if (!xfs_attri_validate(mp, attri_formatp)) { XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); return -EFSCORRUPTED; } + if (!xfs_attr_namecheck(attr_name, attri_formatp->alfi_name_len)) { + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); + return -EFSCORRUPTED; + } + /* memory alloc failure will cause replay to abort */ attrip = xfs_attri_init(mp, attri_formatp->alfi_name_len, attri_formatp->alfi_value_len); @@ -713,12 +720,6 @@ xlog_recover_attri_commit_pass2( memcpy(attrip->attri_name, item->ri_buf[region].i_addr, attrip->attri_name_len); - if (!xfs_attr_namecheck(attrip->attri_name, attrip->attri_name_len)) { - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); - error = -EFSCORRUPTED; - goto out; - } - if (attrip->attri_value_len > 0) { region++; memcpy(attrip->attri_value, item->ri_buf[region].i_addr, From 2fe3ffcf5592f24a9f9131a91727673d19f2b2d9 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 20 May 2022 14:42:49 +1000 Subject: [PATCH 193/334] xfs: free xfs_attrd_log_items correctly Technically speaking, objects allocated out of a specific slab cache are supposed to be freed to that slab cache. The popular slab backends will take care of this for us, but SLOB famously doesn't. Fix this, even if slob + xfs are not that common of a combination. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Reviewed-by: Allison Henderson Signed-off-by: Dave Chinner --- fs/xfs/xfs_attr_item.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index 4976b1ddc09f..4f1ee8b91a17 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -233,7 +233,7 @@ STATIC void xfs_attrd_item_free(struct xfs_attrd_log_item *attrdp) { kmem_free(attrdp->attrd_item.li_lv_shadow); - kmem_free(attrdp); + kmem_cache_free(xfs_attrd_cache, attrdp); } STATIC void From 762c4e7fce551fbd617ae79e55fc8d9850627b8f Mon Sep 17 00:00:00 2001 From: Wan Jiabing Date: Fri, 8 Apr 2022 16:30:07 +0800 Subject: [PATCH 194/334] pwm: sifive: Simplify if-if to if-else Use if and else instead of if(A) and if (!A). 
Signed-off-by: Wan Jiabing Signed-off-by: Thierry Reding --- drivers/pwm/pwm-sifive.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/pwm/pwm-sifive.c b/drivers/pwm/pwm-sifive.c index 253c4a17d255..e6d05a329002 100644 --- a/drivers/pwm/pwm-sifive.c +++ b/drivers/pwm/pwm-sifive.c @@ -138,10 +138,9 @@ static int pwm_sifive_enable(struct pwm_chip *chip, bool enable) dev_err(ddata->chip.dev, "Enable clk failed\n"); return ret; } - } - - if (!enable) + } else { clk_disable(ddata->clk); + } return 0; } From daa986d5f8d8871e6691b0fe54375f263c754d04 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Mon, 28 Mar 2022 09:34:34 +0200 Subject: [PATCH 195/334] pwm: samsung: Implement .apply() callback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To eventually get rid of all legacy drivers convert this driver to the modern world implementing .apply(). The size check for state->period is moved to .apply() to make sure that the values of state->duty_cycle and state->period are passed to pwm_samsung_config without change while they are discarded to int. Signed-off-by: Uwe Kleine-König Reviewed-by: Krzysztof Kozlowski Signed-off-by: Thierry Reding --- drivers/pwm/pwm-samsung.c | 54 ++++++++++++++++++++++++++++++--------- 1 file changed, 42 insertions(+), 12 deletions(-) diff --git a/drivers/pwm/pwm-samsung.c b/drivers/pwm/pwm-samsung.c index 0a4ff55fad04..9c5b4f515641 100644 --- a/drivers/pwm/pwm-samsung.c +++ b/drivers/pwm/pwm-samsung.c @@ -321,14 +321,6 @@ static int __pwm_samsung_config(struct pwm_chip *chip, struct pwm_device *pwm, struct samsung_pwm_channel *chan = pwm_get_chip_data(pwm); u32 tin_ns = chan->tin_ns, tcnt, tcmp, oldtcmp; - /* - * We currently avoid using 64bit arithmetic by using the - * fact that anything faster than 1Hz is easily representable - * by 32bits. - */ - if (period_ns > NSEC_PER_SEC) - return -ERANGE; - tcnt = readl(our_chip->base + REG_TCNTB(pwm->hwpwm)); oldtcmp = readl(our_chip->base + REG_TCMPB(pwm->hwpwm)); @@ -438,13 +430,51 @@ static int pwm_samsung_set_polarity(struct pwm_chip *chip, return 0; } +static int pwm_samsung_apply(struct pwm_chip *chip, struct pwm_device *pwm, + const struct pwm_state *state) +{ + int err, enabled = pwm->state.enabled; + + if (state->polarity != pwm->state.polarity) { + if (enabled) { + pwm_samsung_disable(chip, pwm); + enabled = false; + } + + err = pwm_samsung_set_polarity(chip, pwm, state->polarity); + if (err) + return err; + } + + if (!state->enabled) { + if (enabled) + pwm_samsung_disable(chip, pwm); + + return 0; + } + + /* + * We currently avoid using 64bit arithmetic by using the + * fact that anything faster than 1Hz is easily representable + * by 32bits. 
+ */ + if (state->period > NSEC_PER_SEC) + return -ERANGE; + + err = pwm_samsung_config(chip, pwm, state->duty_cycle, state->period); + if (err) + return err; + + if (!pwm->state.enabled) + err = pwm_samsung_enable(chip, pwm); + + return err; +} + static const struct pwm_ops pwm_samsung_ops = { .request = pwm_samsung_request, .free = pwm_samsung_free, - .enable = pwm_samsung_enable, - .disable = pwm_samsung_disable, - .config = pwm_samsung_config, - .set_polarity = pwm_samsung_set_polarity, + .apply = pwm_samsung_apply, .owner = THIS_MODULE, }; From 6eb3af76ade388329a6c87ba41d38000184ae4e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 20 Apr 2022 14:12:35 +0200 Subject: [PATCH 196/334] pwm: renesas-tpu: Make use of dev_err_probe() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The added benefit is that the error code is mentioned in the error message and its usage is a bit more compact than open coding it. This also improves behaviour in case devm_clk_get() returns -EPROBE_DEFER. While touching this code, consistently start error messages with upper case. Reviewed-by: Geert Uytterhoeven Signed-off-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- drivers/pwm/pwm-renesas-tpu.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/pwm/pwm-renesas-tpu.c b/drivers/pwm/pwm-renesas-tpu.c index 4381df90a527..c3ae78e37789 100644 --- a/drivers/pwm/pwm-renesas-tpu.c +++ b/drivers/pwm/pwm-renesas-tpu.c @@ -398,10 +398,8 @@ static int tpu_probe(struct platform_device *pdev) return PTR_ERR(tpu->base); tpu->clk = devm_clk_get(&pdev->dev, NULL); - if (IS_ERR(tpu->clk)) { - dev_err(&pdev->dev, "cannot get clock\n"); - return PTR_ERR(tpu->clk); - } + if (IS_ERR(tpu->clk)) + return dev_err_probe(&pdev->dev, PTR_ERR(tpu->clk), "Failed to get clock\n"); /* Initialize and register the device. */ platform_set_drvdata(pdev, tpu); @@ -414,9 +412,8 @@ static int tpu_probe(struct platform_device *pdev) ret = pwmchip_add(&tpu->chip); if (ret < 0) { - dev_err(&pdev->dev, "failed to register PWM chip\n"); pm_runtime_disable(&pdev->dev); - return ret; + return dev_err_probe(&pdev->dev, ret, "Failed to register PWM chip\n"); } return 0; From ff4bcd56c0495dd37f9e843a6aa0682abd7fae37 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 20 Apr 2022 14:12:36 +0200 Subject: [PATCH 197/334] pwm: renesas-tpu: Make use of devm functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This simplifies an error path in .probe() and allows to drop the .remove() function. 
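The reason .remove() can go away is that devm-managed resources are released automatically, in reverse order, when the device is unbound. As a rough sketch of the mechanism, a helper like devm_pm_runtime_enable() can be thought of as the following (simplified and with made-up foo_* names; the real helper lives in the PM core):

static void foo_runtime_disable(void *data)
{
	pm_runtime_disable(data);
}

static int foo_devm_runtime_enable(struct device *dev)
{
	pm_runtime_enable(dev);

	/* The undo action is queued on the device and runs on unbind. */
	return devm_add_action_or_reset(dev, foo_runtime_disable, dev);
}

devm_pwmchip_add() works analogously for pwmchip_remove(), so the explicit cleanup previously done in tpu_remove() becomes redundant.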
Reviewed-by: Geert Uytterhoeven Signed-off-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- drivers/pwm/pwm-renesas-tpu.c | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) diff --git a/drivers/pwm/pwm-renesas-tpu.c b/drivers/pwm/pwm-renesas-tpu.c index c3ae78e37789..12376988c70e 100644 --- a/drivers/pwm/pwm-renesas-tpu.c +++ b/drivers/pwm/pwm-renesas-tpu.c @@ -408,24 +408,13 @@ static int tpu_probe(struct platform_device *pdev) tpu->chip.ops = &tpu_pwm_ops; tpu->chip.npwm = TPU_CHANNEL_MAX; - pm_runtime_enable(&pdev->dev); + ret = devm_pm_runtime_enable(&pdev->dev); + if (ret < 0) + return dev_err_probe(&pdev->dev, ret, "Failed to enable runtime PM\n"); - ret = pwmchip_add(&tpu->chip); - if (ret < 0) { - pm_runtime_disable(&pdev->dev); + ret = devm_pwmchip_add(&pdev->dev, &tpu->chip); + if (ret < 0) return dev_err_probe(&pdev->dev, ret, "Failed to register PWM chip\n"); - } - - return 0; -} - -static int tpu_remove(struct platform_device *pdev) -{ - struct tpu_device *tpu = platform_get_drvdata(pdev); - - pwmchip_remove(&tpu->chip); - - pm_runtime_disable(&pdev->dev); return 0; } @@ -444,7 +433,6 @@ MODULE_DEVICE_TABLE(of, tpu_of_table); static struct platform_driver tpu_driver = { .probe = tpu_probe, - .remove = tpu_remove, .driver = { .name = "renesas-tpu-pwm", .of_match_table = of_match_ptr(tpu_of_table), From ec00cd5e63f05461ab48128775c73c851c3c2b18 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 20 Apr 2022 14:12:37 +0200 Subject: [PATCH 198/334] pwm: renesas-tpu: Implement .apply() callback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To eventually get rid of all legacy drivers convert this driver to the modern world implementing .apply(). As pwm->state might not be updated in tpu_pwm_apply() before calling tpu_pwm_config(), an additional parameter is needed for tpu_pwm_config() to not change the implemented logic. Signed-off-by: Uwe Kleine-König Reviewed-by: Geert Uytterhoeven Signed-off-by: Thierry Reding --- drivers/pwm/pwm-renesas-tpu.c | 44 ++++++++++++++++++++++++++++++----- 1 file changed, 38 insertions(+), 6 deletions(-) diff --git a/drivers/pwm/pwm-renesas-tpu.c b/drivers/pwm/pwm-renesas-tpu.c index 12376988c70e..bebae65a6ed7 100644 --- a/drivers/pwm/pwm-renesas-tpu.c +++ b/drivers/pwm/pwm-renesas-tpu.c @@ -242,7 +242,7 @@ static void tpu_pwm_free(struct pwm_chip *chip, struct pwm_device *_pwm) } static int tpu_pwm_config(struct pwm_chip *chip, struct pwm_device *_pwm, - int duty_ns, int period_ns) + int duty_ns, int period_ns, bool enabled) { static const unsigned int prescalers[] = { 1, 4, 16, 64 }; struct tpu_pwm_device *pwm = pwm_get_chip_data(_pwm); @@ -293,7 +293,7 @@ static int tpu_pwm_config(struct pwm_chip *chip, struct pwm_device *_pwm, pwm->duty = duty; /* If the channel is disabled we're done. 
*/ - if (!pwm_is_enabled(_pwm)) + if (!enabled) return 0; if (duty_only && pwm->timer_on) { @@ -366,13 +366,45 @@ static void tpu_pwm_disable(struct pwm_chip *chip, struct pwm_device *_pwm) tpu_pwm_timer_stop(pwm); } +static int tpu_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, + const struct pwm_state *state) +{ + int err; + bool enabled = pwm->state.enabled; + + if (state->polarity != pwm->state.polarity) { + if (enabled) { + tpu_pwm_disable(chip, pwm); + enabled = false; + } + + err = tpu_pwm_set_polarity(chip, pwm, state->polarity); + if (err) + return err; + } + + if (!state->enabled) { + if (enabled) + tpu_pwm_disable(chip, pwm); + + return 0; + } + + err = tpu_pwm_config(pwm->chip, pwm, + state->duty_cycle, state->period, enabled); + if (err) + return err; + + if (!enabled) + err = tpu_pwm_enable(chip, pwm); + + return err; +} + static const struct pwm_ops tpu_pwm_ops = { .request = tpu_pwm_request, .free = tpu_pwm_free, - .config = tpu_pwm_config, - .set_polarity = tpu_pwm_set_polarity, - .enable = tpu_pwm_enable, - .disable = tpu_pwm_disable, + .apply = tpu_pwm_apply, .owner = THIS_MODULE, }; From 208ab8676b9c787b4285364fcb52a159f94b861d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 20 Apr 2022 14:12:38 +0200 Subject: [PATCH 199/334] pwm: renesas-tpu: Rename variables to match the usual naming MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The driver used "pwm" for struct tpu_pwm_device pointers. This name is usually only used for struct pwm_device pointers which this driver calls "_pwm". So rename to the driver data pointers to "tpd" which then allows to drop the underscore from "_pwm". Signed-off-by: Uwe Kleine-König Reviewed-by: Geert Uytterhoeven Signed-off-by: Thierry Reding --- drivers/pwm/pwm-renesas-tpu.c | 172 +++++++++++++++++----------------- 1 file changed, 86 insertions(+), 86 deletions(-) diff --git a/drivers/pwm/pwm-renesas-tpu.c b/drivers/pwm/pwm-renesas-tpu.c index bebae65a6ed7..5bced2050ece 100644 --- a/drivers/pwm/pwm-renesas-tpu.c +++ b/drivers/pwm/pwm-renesas-tpu.c @@ -89,71 +89,71 @@ struct tpu_device { #define to_tpu_device(c) container_of(c, struct tpu_device, chip) -static void tpu_pwm_write(struct tpu_pwm_device *pwm, int reg_nr, u16 value) +static void tpu_pwm_write(struct tpu_pwm_device *tpd, int reg_nr, u16 value) { - void __iomem *base = pwm->tpu->base + TPU_CHANNEL_OFFSET - + pwm->channel * TPU_CHANNEL_SIZE; + void __iomem *base = tpd->tpu->base + TPU_CHANNEL_OFFSET + + tpd->channel * TPU_CHANNEL_SIZE; iowrite16(value, base + reg_nr); } -static void tpu_pwm_set_pin(struct tpu_pwm_device *pwm, +static void tpu_pwm_set_pin(struct tpu_pwm_device *tpd, enum tpu_pin_state state) { static const char * const states[] = { "inactive", "PWM", "active" }; - dev_dbg(&pwm->tpu->pdev->dev, "%u: configuring pin as %s\n", - pwm->channel, states[state]); + dev_dbg(&tpd->tpu->pdev->dev, "%u: configuring pin as %s\n", + tpd->channel, states[state]); switch (state) { case TPU_PIN_INACTIVE: - tpu_pwm_write(pwm, TPU_TIORn, - pwm->polarity == PWM_POLARITY_INVERSED ? + tpu_pwm_write(tpd, TPU_TIORn, + tpd->polarity == PWM_POLARITY_INVERSED ? TPU_TIOR_IOA_1 : TPU_TIOR_IOA_0); break; case TPU_PIN_PWM: - tpu_pwm_write(pwm, TPU_TIORn, - pwm->polarity == PWM_POLARITY_INVERSED ? + tpu_pwm_write(tpd, TPU_TIORn, + tpd->polarity == PWM_POLARITY_INVERSED ? TPU_TIOR_IOA_0_SET : TPU_TIOR_IOA_1_CLR); break; case TPU_PIN_ACTIVE: - tpu_pwm_write(pwm, TPU_TIORn, - pwm->polarity == PWM_POLARITY_INVERSED ? 
+ tpu_pwm_write(tpd, TPU_TIORn, + tpd->polarity == PWM_POLARITY_INVERSED ? TPU_TIOR_IOA_0 : TPU_TIOR_IOA_1); break; } } -static void tpu_pwm_start_stop(struct tpu_pwm_device *pwm, int start) +static void tpu_pwm_start_stop(struct tpu_pwm_device *tpd, int start) { unsigned long flags; u16 value; - spin_lock_irqsave(&pwm->tpu->lock, flags); - value = ioread16(pwm->tpu->base + TPU_TSTR); + spin_lock_irqsave(&tpd->tpu->lock, flags); + value = ioread16(tpd->tpu->base + TPU_TSTR); if (start) - value |= 1 << pwm->channel; + value |= 1 << tpd->channel; else - value &= ~(1 << pwm->channel); + value &= ~(1 << tpd->channel); - iowrite16(value, pwm->tpu->base + TPU_TSTR); - spin_unlock_irqrestore(&pwm->tpu->lock, flags); + iowrite16(value, tpd->tpu->base + TPU_TSTR); + spin_unlock_irqrestore(&tpd->tpu->lock, flags); } -static int tpu_pwm_timer_start(struct tpu_pwm_device *pwm) +static int tpu_pwm_timer_start(struct tpu_pwm_device *tpd) { int ret; - if (!pwm->timer_on) { + if (!tpd->timer_on) { /* Wake up device and enable clock. */ - pm_runtime_get_sync(&pwm->tpu->pdev->dev); - ret = clk_prepare_enable(pwm->tpu->clk); + pm_runtime_get_sync(&tpd->tpu->pdev->dev); + ret = clk_prepare_enable(tpd->tpu->clk); if (ret) { - dev_err(&pwm->tpu->pdev->dev, "cannot enable clock\n"); + dev_err(&tpd->tpu->pdev->dev, "cannot enable clock\n"); return ret; } - pwm->timer_on = true; + tpd->timer_on = true; } /* @@ -161,8 +161,8 @@ static int tpu_pwm_timer_start(struct tpu_pwm_device *pwm) * completely. First drive the pin to the inactive state to avoid * glitches. */ - tpu_pwm_set_pin(pwm, TPU_PIN_INACTIVE); - tpu_pwm_start_stop(pwm, false); + tpu_pwm_set_pin(tpd, TPU_PIN_INACTIVE); + tpu_pwm_start_stop(tpd, false); /* * - Clear TCNT on TGRB match @@ -172,80 +172,80 @@ static int tpu_pwm_timer_start(struct tpu_pwm_device *pwm) * - Output 1 until TGRA, output 0 until TGRB (active high polarity * - PWM mode */ - tpu_pwm_write(pwm, TPU_TCRn, TPU_TCR_CCLR_TGRB | TPU_TCR_CKEG_RISING | - pwm->prescaler); - tpu_pwm_write(pwm, TPU_TMDRn, TPU_TMDR_MD_PWM); - tpu_pwm_set_pin(pwm, TPU_PIN_PWM); - tpu_pwm_write(pwm, TPU_TGRAn, pwm->duty); - tpu_pwm_write(pwm, TPU_TGRBn, pwm->period); + tpu_pwm_write(tpd, TPU_TCRn, TPU_TCR_CCLR_TGRB | TPU_TCR_CKEG_RISING | + tpd->prescaler); + tpu_pwm_write(tpd, TPU_TMDRn, TPU_TMDR_MD_PWM); + tpu_pwm_set_pin(tpd, TPU_PIN_PWM); + tpu_pwm_write(tpd, TPU_TGRAn, tpd->duty); + tpu_pwm_write(tpd, TPU_TGRBn, tpd->period); - dev_dbg(&pwm->tpu->pdev->dev, "%u: TGRA 0x%04x TGRB 0x%04x\n", - pwm->channel, pwm->duty, pwm->period); + dev_dbg(&tpd->tpu->pdev->dev, "%u: TGRA 0x%04x TGRB 0x%04x\n", + tpd->channel, tpd->duty, tpd->period); /* Start the channel. */ - tpu_pwm_start_stop(pwm, true); + tpu_pwm_start_stop(tpd, true); return 0; } -static void tpu_pwm_timer_stop(struct tpu_pwm_device *pwm) +static void tpu_pwm_timer_stop(struct tpu_pwm_device *tpd) { - if (!pwm->timer_on) + if (!tpd->timer_on) return; /* Disable channel. */ - tpu_pwm_start_stop(pwm, false); + tpu_pwm_start_stop(tpd, false); /* Stop clock and mark device as idle. 
*/ - clk_disable_unprepare(pwm->tpu->clk); - pm_runtime_put(&pwm->tpu->pdev->dev); + clk_disable_unprepare(tpd->tpu->clk); + pm_runtime_put(&tpd->tpu->pdev->dev); - pwm->timer_on = false; + tpd->timer_on = false; } /* ----------------------------------------------------------------------------- * PWM API */ -static int tpu_pwm_request(struct pwm_chip *chip, struct pwm_device *_pwm) +static int tpu_pwm_request(struct pwm_chip *chip, struct pwm_device *pwm) { struct tpu_device *tpu = to_tpu_device(chip); - struct tpu_pwm_device *pwm; + struct tpu_pwm_device *tpd; - if (_pwm->hwpwm >= TPU_CHANNEL_MAX) + if (pwm->hwpwm >= TPU_CHANNEL_MAX) return -EINVAL; - pwm = kzalloc(sizeof(*pwm), GFP_KERNEL); - if (pwm == NULL) + tpd = kzalloc(sizeof(*tpd), GFP_KERNEL); + if (tpd == NULL) return -ENOMEM; - pwm->tpu = tpu; - pwm->channel = _pwm->hwpwm; - pwm->polarity = PWM_POLARITY_NORMAL; - pwm->prescaler = 0; - pwm->period = 0; - pwm->duty = 0; + tpd->tpu = tpu; + tpd->channel = pwm->hwpwm; + tpd->polarity = PWM_POLARITY_NORMAL; + tpd->prescaler = 0; + tpd->period = 0; + tpd->duty = 0; - pwm->timer_on = false; + tpd->timer_on = false; - pwm_set_chip_data(_pwm, pwm); + pwm_set_chip_data(pwm, tpd); return 0; } -static void tpu_pwm_free(struct pwm_chip *chip, struct pwm_device *_pwm) +static void tpu_pwm_free(struct pwm_chip *chip, struct pwm_device *pwm) { - struct tpu_pwm_device *pwm = pwm_get_chip_data(_pwm); + struct tpu_pwm_device *tpd = pwm_get_chip_data(pwm); - tpu_pwm_timer_stop(pwm); - kfree(pwm); + tpu_pwm_timer_stop(tpd); + kfree(tpd); } -static int tpu_pwm_config(struct pwm_chip *chip, struct pwm_device *_pwm, +static int tpu_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, int duty_ns, int period_ns, bool enabled) { static const unsigned int prescalers[] = { 1, 4, 16, 64 }; - struct tpu_pwm_device *pwm = pwm_get_chip_data(_pwm); + struct tpu_pwm_device *tpd = pwm_get_chip_data(pwm); struct tpu_device *tpu = to_tpu_device(chip); unsigned int prescaler; bool duty_only = false; @@ -285,29 +285,29 @@ static int tpu_pwm_config(struct pwm_chip *chip, struct pwm_device *_pwm, "rate %u, prescaler %u, period %u, duty %u\n", clk_rate, prescalers[prescaler], period, duty); - if (pwm->prescaler == prescaler && pwm->period == period) + if (tpd->prescaler == prescaler && tpd->period == period) duty_only = true; - pwm->prescaler = prescaler; - pwm->period = period; - pwm->duty = duty; + tpd->prescaler = prescaler; + tpd->period = period; + tpd->duty = duty; /* If the channel is disabled we're done. */ if (!enabled) return 0; - if (duty_only && pwm->timer_on) { + if (duty_only && tpd->timer_on) { /* * If only the duty cycle changed and the timer is already * running, there's no need to reconfigure it completely, Just * modify the duty cycle. */ - tpu_pwm_write(pwm, TPU_TGRAn, pwm->duty); - dev_dbg(&tpu->pdev->dev, "%u: TGRA 0x%04x\n", pwm->channel, - pwm->duty); + tpu_pwm_write(tpd, TPU_TGRAn, tpd->duty); + dev_dbg(&tpu->pdev->dev, "%u: TGRA 0x%04x\n", tpd->channel, + tpd->duty); } else { /* Otherwise perform a full reconfiguration. */ - ret = tpu_pwm_timer_start(pwm); + ret = tpu_pwm_timer_start(tpd); if (ret < 0) return ret; } @@ -317,29 +317,29 @@ static int tpu_pwm_config(struct pwm_chip *chip, struct pwm_device *_pwm, * To avoid running the timer when not strictly required, handle * 0% and 100% duty cycles as fixed levels and stop the timer. */ - tpu_pwm_set_pin(pwm, duty ? TPU_PIN_ACTIVE : TPU_PIN_INACTIVE); - tpu_pwm_timer_stop(pwm); + tpu_pwm_set_pin(tpd, duty ? 
TPU_PIN_ACTIVE : TPU_PIN_INACTIVE); + tpu_pwm_timer_stop(tpd); } return 0; } -static int tpu_pwm_set_polarity(struct pwm_chip *chip, struct pwm_device *_pwm, +static int tpu_pwm_set_polarity(struct pwm_chip *chip, struct pwm_device *pwm, enum pwm_polarity polarity) { - struct tpu_pwm_device *pwm = pwm_get_chip_data(_pwm); + struct tpu_pwm_device *tpd = pwm_get_chip_data(pwm); - pwm->polarity = polarity; + tpd->polarity = polarity; return 0; } -static int tpu_pwm_enable(struct pwm_chip *chip, struct pwm_device *_pwm) +static int tpu_pwm_enable(struct pwm_chip *chip, struct pwm_device *pwm) { - struct tpu_pwm_device *pwm = pwm_get_chip_data(_pwm); + struct tpu_pwm_device *tpd = pwm_get_chip_data(pwm); int ret; - ret = tpu_pwm_timer_start(pwm); + ret = tpu_pwm_timer_start(tpd); if (ret < 0) return ret; @@ -347,23 +347,23 @@ static int tpu_pwm_enable(struct pwm_chip *chip, struct pwm_device *_pwm) * To avoid running the timer when not strictly required, handle 0% and * 100% duty cycles as fixed levels and stop the timer. */ - if (pwm->duty == 0 || pwm->duty == pwm->period) { - tpu_pwm_set_pin(pwm, pwm->duty ? + if (tpd->duty == 0 || tpd->duty == tpd->period) { + tpu_pwm_set_pin(tpd, tpd->duty ? TPU_PIN_ACTIVE : TPU_PIN_INACTIVE); - tpu_pwm_timer_stop(pwm); + tpu_pwm_timer_stop(tpd); } return 0; } -static void tpu_pwm_disable(struct pwm_chip *chip, struct pwm_device *_pwm) +static void tpu_pwm_disable(struct pwm_chip *chip, struct pwm_device *pwm) { - struct tpu_pwm_device *pwm = pwm_get_chip_data(_pwm); + struct tpu_pwm_device *tpd = pwm_get_chip_data(pwm); /* The timer must be running to modify the pin output configuration. */ - tpu_pwm_timer_start(pwm); - tpu_pwm_set_pin(pwm, TPU_PIN_INACTIVE); - tpu_pwm_timer_stop(pwm); + tpu_pwm_timer_start(tpd); + tpu_pwm_set_pin(tpd, TPU_PIN_INACTIVE); + tpu_pwm_timer_stop(tpd); } static int tpu_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, From 3c173376efc461dc670b02ba2846c2a533491104 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 20 Apr 2022 14:12:39 +0200 Subject: [PATCH 200/334] pwm: renesas-tpu: Improve maths to compute register settings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The newly computed register values are intended to exactly match the previously computed values. The main improvement is that the prescaler is computed without a loop that involves two divisions in each step. This uses the fact, that prescalers[i] = 1 << (2 * i). Assuming a moderately smart compiler, the needed number of divisions for the case where the requested period is too big, is reduced from 5 to 2. 
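The equivalence of the two calculations is easy to check outside the kernel. The sketch below is throwaway user-space C, not part of the patch; it compares the old table-based search against the range check used in the diff, relying on prescalers[i] == 1 << (2 * i):

#include <assert.h>
#include <stdint.h>

/* Old approach: walk the prescaler table until the period fits 16 bits. */
static unsigned int prescaler_by_loop(uint32_t period)
{
	static const unsigned int prescalers[] = { 1, 4, 16, 64 };
	unsigned int i;

	for (i = 0; i < 4; i++)
		if (period / prescalers[i] <= 0xffff)
			return i;
	return 4;	/* no prescaler fits */
}

/* New approach: pick the prescaler from the range the period falls into. */
static unsigned int prescaler_by_range(uint32_t period)
{
	if (period <= 0xffff)
		return 0;
	if (period <= 0x3ffff)
		return 1;
	if (period <= 0xfffff)
		return 2;
	if (period <= 0x3fffff)
		return 3;
	return 4;	/* the driver returns -EINVAL here */
}

int main(void)
{
	uint32_t p;

	for (p = 1; p <= 0x400000; p++)
		assert(prescaler_by_loop(p) == prescaler_by_range(p));
	return 0;
}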
Signed-off-by: Uwe Kleine-König Reviewed-by: Geert Uytterhoeven Signed-off-by: Thierry Reding --- drivers/pwm/pwm-renesas-tpu.c | 52 ++++++++++++++++++++++++----------- 1 file changed, 36 insertions(+), 16 deletions(-) diff --git a/drivers/pwm/pwm-renesas-tpu.c b/drivers/pwm/pwm-renesas-tpu.c index 5bced2050ece..4aff3870010c 100644 --- a/drivers/pwm/pwm-renesas-tpu.c +++ b/drivers/pwm/pwm-renesas-tpu.c @@ -244,7 +244,6 @@ static void tpu_pwm_free(struct pwm_chip *chip, struct pwm_device *pwm) static int tpu_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, int duty_ns, int period_ns, bool enabled) { - static const unsigned int prescalers[] = { 1, 4, 16, 64 }; struct tpu_pwm_device *tpd = pwm_get_chip_data(pwm); struct tpu_device *tpu = to_tpu_device(chip); unsigned int prescaler; @@ -254,26 +253,47 @@ static int tpu_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, u32 duty; int ret; - /* - * Pick a prescaler to avoid overflowing the counter. - * TODO: Pick the highest acceptable prescaler. - */ clk_rate = clk_get_rate(tpu->clk); - for (prescaler = 0; prescaler < ARRAY_SIZE(prescalers); ++prescaler) { - period = clk_rate / prescalers[prescaler] - / (NSEC_PER_SEC / period_ns); - if (period <= 0xffff) - break; + period = clk_rate / (NSEC_PER_SEC / period_ns); + + /* + * Find the minimal prescaler in [0..3] such that + * + * period >> (2 * prescaler) < 0x10000 + * + * This could be calculated using something like: + * + * prescaler = max(ilog2(period) / 2, 7) - 7; + * + * but given there are only four allowed results and that ilog2 isn't + * cheap on all platforms using a switch statement is more effective. + */ + switch (period) { + case 1 ... 0xffff: + prescaler = 0; + break; + + case 0x10000 ... 0x3ffff: + prescaler = 1; + break; + + case 0x40000 ... 0xfffff: + prescaler = 2; + break; + + case 0x100000 ... 0x3fffff: + prescaler = 3; + break; + + default: + return -EINVAL; } - if (prescaler == ARRAY_SIZE(prescalers) || period == 0) { - dev_err(&tpu->pdev->dev, "clock rate mismatch\n"); - return -ENOTSUPP; - } + period >>= 2 * prescaler; if (duty_ns) { - duty = clk_rate / prescalers[prescaler] + duty = (clk_rate >> 2 * prescaler) / (NSEC_PER_SEC / duty_ns); if (duty > period) return -EINVAL; @@ -283,7 +303,7 @@ static int tpu_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, dev_dbg(&tpu->pdev->dev, "rate %u, prescaler %u, period %u, duty %u\n", - clk_rate, prescalers[prescaler], period, duty); + clk_rate, 1 << (2 * prescaler), period, duty); if (tpd->prescaler == prescaler && tpd->period == period) duty_only = true; From 615f4e84461b71e5fed01d9f6d9d98ef3dd1d452 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 20 Apr 2022 14:12:40 +0200 Subject: [PATCH 201/334] pwm: renesas-tpu: Improve precision of period and duty_cycle calculation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Dividing by the result of a division looses precision. Consider for example clk_rate = 33000000 and period_ns = 500001. Then clk_rate / (NSEC_PER_SEC / period_ns) has the exact value 16500.033, but in C this evaluates to 16508. It gets worse for even bigger values of period_ns, so with period_ns = 500000001, the exact result is 16500000.033 while in C we get 33000000. For that reason use clk_rate * period_ns / NSEC_PER_SEC instead which doesn't suffer from this problem. To ensure this doesn't overflow add a safeguard check for clk_rate. Note that duty > period can never happen, so the respective check can be dropped. 
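The rounding error described above can be reproduced with a few lines of user-space C (a throwaway check, not part of the patch; in the kernel the single division is done via mul_u64_u64_div_u64() so the 64x64-bit product cannot overflow):

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

#define NSEC_PER_SEC 1000000000ULL

int main(void)
{
	uint64_t clk_rate = 33000000, period_ns = 500001;

	/* rounded twice: divide by the result of a division */
	uint64_t twice = clk_rate / (NSEC_PER_SEC / period_ns);
	/* rounded once: a single combined division */
	uint64_t once = clk_rate * period_ns / NSEC_PER_SEC;

	/* prints 16508 vs 16500 (the exact value is 16500.033) */
	printf("%" PRIu64 " vs %" PRIu64 "\n", twice, once);
	return 0;
}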
Incidentally this fixes a division by zero if period_ns > NSEC_PER_SEC. Another side effect is that values bigger than INT_MAX for period and duty_cyle are not wrongly discarded any more. Fixes: 99b82abb0a35 ("pwm: Add Renesas TPU PWM driver") Signed-off-by: Uwe Kleine-König Reviewed-by: Geert Uytterhoeven Signed-off-by: Thierry Reding --- drivers/pwm/pwm-renesas-tpu.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/drivers/pwm/pwm-renesas-tpu.c b/drivers/pwm/pwm-renesas-tpu.c index 4aff3870010c..d7311614c846 100644 --- a/drivers/pwm/pwm-renesas-tpu.c +++ b/drivers/pwm/pwm-renesas-tpu.c @@ -242,20 +242,29 @@ static void tpu_pwm_free(struct pwm_chip *chip, struct pwm_device *pwm) } static int tpu_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, - int duty_ns, int period_ns, bool enabled) + u64 duty_ns, u64 period_ns, bool enabled) { struct tpu_pwm_device *tpd = pwm_get_chip_data(pwm); struct tpu_device *tpu = to_tpu_device(chip); unsigned int prescaler; bool duty_only = false; u32 clk_rate; - u32 period; + u64 period; u32 duty; int ret; clk_rate = clk_get_rate(tpu->clk); + if (unlikely(clk_rate > NSEC_PER_SEC)) { + /* + * This won't happen in the nearer future, so this is only a + * safeguard to prevent the following calculation from + * overflowing. With this clk_rate * period_ns / NSEC_PER_SEC is + * not greater than period_ns and so fits into an u64. + */ + return -EINVAL; + } - period = clk_rate / (NSEC_PER_SEC / period_ns); + period = mul_u64_u64_div_u64(clk_rate, period_ns, NSEC_PER_SEC); /* * Find the minimal prescaler in [0..3] such that @@ -292,18 +301,15 @@ static int tpu_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, period >>= 2 * prescaler; - if (duty_ns) { - duty = (clk_rate >> 2 * prescaler) - / (NSEC_PER_SEC / duty_ns); - if (duty > period) - return -EINVAL; - } else { + if (duty_ns) + duty = mul_u64_u64_div_u64(clk_rate, duty_ns, + (u64)NSEC_PER_SEC << (2 * prescaler)); + else duty = 0; - } dev_dbg(&tpu->pdev->dev, "rate %u, prescaler %u, period %u, duty %u\n", - clk_rate, 1 << (2 * prescaler), period, duty); + clk_rate, 1 << (2 * prescaler), (u32)period, duty); if (tpd->prescaler == prescaler && tpd->period == period) duty_only = true; From 8c193f4714df136a6747fb66f4218134771092be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Mon, 25 Apr 2022 15:22:44 +0200 Subject: [PATCH 202/334] pwm: tegra: Optimize period calculation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Dividing by the result of a division looses precision because the result is rounded twice. E.g. with clk_rate = 48000000 and period = 32760033 the following numbers result: rate = pc->clk_rate >> PWM_DUTY_WIDTH = 187500 hz = DIV_ROUND_CLOSEST_ULL(100ULL * NSEC_PER_SEC, period_ns) = 3052 rate = DIV_ROUND_CLOSEST_ULL(100ULL * rate, hz) = 6144 The exact result would be 6142.5061875 and (apart from rounding) this is found by using a single division. As a side effect is also a tad cheaper to calculate. Also using clk_rate >> PWM_DUTY_WIDTH looses precision. Consider for example clk_rate = 47999999 and period = 106667: mul_u64_u64_div_u64(pc->clk_rate >> PWM_DUTY_WIDTH, period_ns, NSEC_PER_SEC) = 19 mul_u64_u64_div_u64(pc->clk_rate, period_ns, NSEC_PER_SEC << PWM_DUTY_WIDTH) = 20 (The exact result is 20.000062083332033.) With this optimizations also switch from round-closest to round-down for the period calculation. 
Given that the calculations were non-optimal for quite some time now with variations in both directions which nobody reported as a problem, this is the opportunity to align the driver's behavior to the requirements of new drivers. This has several upsides: - Implementation is easier as there are no round-nearest variants of mul_u64_u64_div_u64(). - Requests for too small periods are now consistently refused. This was kind of arbitrary before, where period_ns < min_period_ns was refused, but in some cases min_period_ns isn't actually implementable and then values between min_period_ns and the actual minimum were rounded up to the actual minimum. Note that the duty_cycle calculation isn't using the usual round-down approach yet. Signed-off-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- drivers/pwm/pwm-tegra.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/pwm/pwm-tegra.c b/drivers/pwm/pwm-tegra.c index e5a9ffef4a71..7fc03a9ec154 100644 --- a/drivers/pwm/pwm-tegra.c +++ b/drivers/pwm/pwm-tegra.c @@ -99,7 +99,7 @@ static int tegra_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, int duty_ns, int period_ns) { struct tegra_pwm_chip *pc = to_tegra_pwm_chip(chip); - unsigned long long c = duty_ns, hz; + unsigned long long c = duty_ns; unsigned long rate, required_clk_rate; u32 val = 0; int err; @@ -156,11 +156,9 @@ static int tegra_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, pc->clk_rate = clk_get_rate(pc->clk); } - rate = pc->clk_rate >> PWM_DUTY_WIDTH; - /* Consider precision in PWM_SCALE_WIDTH rate calculation */ - hz = DIV_ROUND_CLOSEST_ULL(100ULL * NSEC_PER_SEC, period_ns); - rate = DIV_ROUND_CLOSEST_ULL(100ULL * rate, hz); + rate = mul_u64_u64_div_u64(pc->clk_rate, period_ns, + (u64)NSEC_PER_SEC << PWM_DUTY_WIDTH); /* * Since the actual PWM divider is the register's frequency divider @@ -169,6 +167,8 @@ static int tegra_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, */ if (rate > 0) rate--; + else + return -EINVAL; /* * Make sure that the rate will fit in the register's frequency From b76160954cb0fc8b4a389e54daf22a87d4f44fbe Mon Sep 17 00:00:00 2001 From: AngeloGioacchino Del Regno Date: Tue, 3 May 2022 12:54:05 +0200 Subject: [PATCH 203/334] dt-bindings: pwm: pwm-mediatek: Add documentation for MT6795 SoC Add binding documentation for the MT6795 Helio X10 SoC. Signed-off-by: AngeloGioacchino Del Regno Acked-by: Krzysztof Kozlowski Signed-off-by: Thierry Reding --- Documentation/devicetree/bindings/pwm/pwm-mediatek.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/pwm/pwm-mediatek.txt b/Documentation/devicetree/bindings/pwm/pwm-mediatek.txt index 25ed214473d7..033d1fc0f405 100644 --- a/Documentation/devicetree/bindings/pwm/pwm-mediatek.txt +++ b/Documentation/devicetree/bindings/pwm/pwm-mediatek.txt @@ -3,6 +3,7 @@ MediaTek PWM controller Required properties: - compatible: should be "mediatek,-pwm": - "mediatek,mt2712-pwm": found on mt2712 SoC. + - "mediatek,mt6795-pwm": found on mt6795 SoC. - "mediatek,mt7622-pwm": found on mt7622 SoC. - "mediatek,mt7623-pwm": found on mt7623 SoC. - "mediatek,mt7628-pwm": found on mt7628 SoC. 
From cb696e74892b266bd7e03137b5e5c1c4e92bf2ea Mon Sep 17 00:00:00 2001 From: AngeloGioacchino Del Regno Date: Tue, 3 May 2022 12:54:04 +0200 Subject: [PATCH 204/334] pwm: pwm-mediatek: Add support for MediaTek Helio X10 MT6795 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The MediaTek Helio X10 MT6795 SoC has 7 PWMs: add a compatible string to use the right match data. Signed-off-by: AngeloGioacchino Del Regno Acked-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- drivers/pwm/pwm-mediatek.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/pwm/pwm-mediatek.c b/drivers/pwm/pwm-mediatek.c index 568b13a48717..28c2d0e5a8ac 100644 --- a/drivers/pwm/pwm-mediatek.c +++ b/drivers/pwm/pwm-mediatek.c @@ -264,6 +264,12 @@ static const struct pwm_mediatek_of_data mt2712_pwm_data = { .has_ck_26m_sel = false, }; +static const struct pwm_mediatek_of_data mt6795_pwm_data = { + .num_pwms = 7, + .pwm45_fixup = false, + .has_ck_26m_sel = false, +}; + static const struct pwm_mediatek_of_data mt7622_pwm_data = { .num_pwms = 6, .pwm45_fixup = false, @@ -302,6 +308,7 @@ static const struct pwm_mediatek_of_data mt8516_pwm_data = { static const struct of_device_id pwm_mediatek_of_match[] = { { .compatible = "mediatek,mt2712-pwm", .data = &mt2712_pwm_data }, + { .compatible = "mediatek,mt6795-pwm", .data = &mt6795_pwm_data }, { .compatible = "mediatek,mt7622-pwm", .data = &mt7622_pwm_data }, { .compatible = "mediatek,mt7623-pwm", .data = &mt7623_pwm_data }, { .compatible = "mediatek,mt7628-pwm", .data = &mt7628_pwm_data }, From b2e60b32b4fe5da4cf7fedd976416b5f47f62332 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 6 May 2022 16:09:52 +0200 Subject: [PATCH 205/334] pwm: sti: Implement .apply() callback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To eventually get rid of all legacy drivers convert this driver to the modern world implementing .apply(). This just pushed a variant of pwm_apply_legacy() into the driver that was slightly simplified because the driver doesn't provide a .set_polarity() callback. 
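This and the following conversions share the same shape, which is essentially pwm_apply_legacy() pushed down into each driver. A generic sketch of that skeleton, with foo_* as placeholder names rather than any real driver:

static int foo_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm,
			 const struct pwm_state *state)
{
	int err;

	/* No .set_polarity() callback, so only normal polarity is accepted. */
	if (state->polarity != PWM_POLARITY_NORMAL)
		return -EINVAL;

	if (!state->enabled) {
		/* Only touch the hardware if it is currently running. */
		if (pwm->state.enabled)
			foo_pwm_disable(chip, pwm);
		return 0;
	}

	/*
	 * Always reprogram duty_cycle/period: pwm->state may not reflect
	 * the hardware if an earlier apply returned early on !enabled.
	 */
	err = foo_pwm_config(chip, pwm, state->duty_cycle, state->period);
	if (err)
		return err;

	if (!pwm->state.enabled)
		err = foo_pwm_enable(chip, pwm);

	return err;
}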
Signed-off-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- drivers/pwm/pwm-sti.c | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/drivers/pwm/pwm-sti.c b/drivers/pwm/pwm-sti.c index f491d56254d7..44b1f93256b3 100644 --- a/drivers/pwm/pwm-sti.c +++ b/drivers/pwm/pwm-sti.c @@ -391,11 +391,34 @@ out: return ret; } +static int sti_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, + const struct pwm_state *state) +{ + int err; + + if (state->polarity != PWM_POLARITY_NORMAL) + return -EINVAL; + + if (!state->enabled) { + if (pwm->state.enabled) + sti_pwm_disable(chip, pwm); + + return 0; + } + + err = sti_pwm_config(pwm->chip, pwm, state->duty_cycle, state->period); + if (err) + return err; + + if (!pwm->state.enabled) + err = sti_pwm_enable(chip, pwm); + + return err; +} + static const struct pwm_ops sti_pwm_ops = { .capture = sti_pwm_capture, - .config = sti_pwm_config, - .enable = sti_pwm_enable, - .disable = sti_pwm_disable, + .apply = sti_pwm_apply, .free = sti_pwm_free, .owner = THIS_MODULE, }; From 57c95faabf094192049bffa5599ae6ab5ff88edf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 6 May 2022 16:15:35 +0200 Subject: [PATCH 206/334] pwm: stmpe: Implement .apply() callback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To eventually get rid of all legacy drivers convert this driver to the modern world implementing .apply(). This just pushed a variant of pwm_apply_legacy() into the driver that was slightly simplified because the driver doesn't provide a .set_polarity() callback. Signed-off-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- drivers/pwm/pwm-stmpe.c | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/drivers/pwm/pwm-stmpe.c b/drivers/pwm/pwm-stmpe.c index c4336d3bace3..5d4a4762ce0c 100644 --- a/drivers/pwm/pwm-stmpe.c +++ b/drivers/pwm/pwm-stmpe.c @@ -259,10 +259,33 @@ static int stmpe_24xx_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, return 0; } +static int stmpe_24xx_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, + const struct pwm_state *state) +{ + int err; + + if (state->polarity != PWM_POLARITY_NORMAL) + return -EINVAL; + + if (!state->enabled) { + if (pwm->state.enabled) + stmpe_24xx_pwm_disable(chip, pwm); + + return 0; + } + + err = stmpe_24xx_pwm_config(pwm->chip, pwm, state->duty_cycle, state->period); + if (err) + return err; + + if (!pwm->state.enabled) + err = stmpe_24xx_pwm_enable(chip, pwm); + + return err; +} + static const struct pwm_ops stmpe_24xx_pwm_ops = { - .config = stmpe_24xx_pwm_config, - .enable = stmpe_24xx_pwm_enable, - .disable = stmpe_24xx_pwm_disable, + .apply = stmpe_24xx_pwm_apply, .owner = THIS_MODULE, }; From fd3ddd4355c0e5388b40d48321e2e834393a4912 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 6 May 2022 18:48:06 +0200 Subject: [PATCH 207/334] pwm: tegra: Implement .apply() callback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To eventually get rid of all legacy drivers convert this driver to the modern world implementing .apply(). This just pushed a variant of pwm_apply_legacy() into the driver that was slightly simplified because the driver doesn't provide a .set_polarity() callback. 
Signed-off-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- drivers/pwm/pwm-tegra.c | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/drivers/pwm/pwm-tegra.c b/drivers/pwm/pwm-tegra.c index 7fc03a9ec154..dad9978c9186 100644 --- a/drivers/pwm/pwm-tegra.c +++ b/drivers/pwm/pwm-tegra.c @@ -230,10 +230,34 @@ static void tegra_pwm_disable(struct pwm_chip *chip, struct pwm_device *pwm) pm_runtime_put_sync(pc->dev); } +static int tegra_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, + const struct pwm_state *state) +{ + int err; + bool enabled = pwm->state.enabled; + + if (state->polarity != PWM_POLARITY_NORMAL) + return -EINVAL; + + if (!state->enabled) { + if (enabled) + tegra_pwm_disable(chip, pwm); + + return 0; + } + + err = tegra_pwm_config(pwm->chip, pwm, state->duty_cycle, state->period); + if (err) + return err; + + if (!enabled) + err = tegra_pwm_enable(chip, pwm); + + return err; +} + static const struct pwm_ops tegra_pwm_ops = { - .config = tegra_pwm_config, - .enable = tegra_pwm_enable, - .disable = tegra_pwm_disable, + .apply = tegra_pwm_apply, .owner = THIS_MODULE, }; From 5fa3b87fe8fb1ce941c6418539f7b8062c0dea9c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Sat, 7 May 2022 10:29:01 +0200 Subject: [PATCH 208/334] pwm: lpc32xx: Implement .apply() callback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To eventually get rid of all legacy drivers convert this driver to the modern world implementing .apply(). This just pushed a variant of pwm_apply_legacy() into the driver that was slightly simplified because the driver doesn't provide a .set_polarity() callback. Signed-off-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- drivers/pwm/pwm-lpc32xx.c | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/drivers/pwm/pwm-lpc32xx.c b/drivers/pwm/pwm-lpc32xx.c index ddeab5687cb8..86a0ea0f6955 100644 --- a/drivers/pwm/pwm-lpc32xx.c +++ b/drivers/pwm/pwm-lpc32xx.c @@ -88,10 +88,33 @@ static void lpc32xx_pwm_disable(struct pwm_chip *chip, struct pwm_device *pwm) clk_disable_unprepare(lpc32xx->clk); } +static int lpc32xx_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, + const struct pwm_state *state) +{ + int err; + + if (state->polarity != PWM_POLARITY_NORMAL) + return -EINVAL; + + if (!state->enabled) { + if (pwm->state.enabled) + lpc32xx_pwm_disable(chip, pwm); + + return 0; + } + + err = lpc32xx_pwm_config(pwm->chip, pwm, state->duty_cycle, state->period); + if (err) + return err; + + if (!pwm->state.enabled) + err = lpc32xx_pwm_enable(chip, pwm); + + return err; +} + static const struct pwm_ops lpc32xx_pwm_ops = { - .config = lpc32xx_pwm_config, - .enable = lpc32xx_pwm_enable, - .disable = lpc32xx_pwm_disable, + .apply = lpc32xx_pwm_apply, .owner = THIS_MODULE, }; From 758de66f4bd2cac2b1d71db917c65c3d611d4e74 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Sat, 7 May 2022 10:43:37 +0200 Subject: [PATCH 209/334] pwm: mediatek: Implement .apply() callback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To eventually get rid of all legacy drivers convert this driver to the modern world implementing .apply(). This just pushed a variant of pwm_apply_legacy() into the driver that was slightly simplified because the driver doesn't provide a .set_polarity() callback. 
Signed-off-by: Uwe Kleine-König Reviewed-by: AngeloGioacchino Del Regno Signed-off-by: Thierry Reding --- drivers/pwm/pwm-mediatek.c | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/drivers/pwm/pwm-mediatek.c b/drivers/pwm/pwm-mediatek.c index 28c2d0e5a8ac..d28c0874c7f2 100644 --- a/drivers/pwm/pwm-mediatek.c +++ b/drivers/pwm/pwm-mediatek.c @@ -198,10 +198,33 @@ static void pwm_mediatek_disable(struct pwm_chip *chip, struct pwm_device *pwm) pwm_mediatek_clk_disable(chip, pwm); } +static int pwm_mediatek_apply(struct pwm_chip *chip, struct pwm_device *pwm, + const struct pwm_state *state) +{ + int err; + + if (state->polarity != PWM_POLARITY_NORMAL) + return -EINVAL; + + if (!state->enabled) { + if (pwm->state.enabled) + pwm_mediatek_disable(chip, pwm); + + return 0; + } + + err = pwm_mediatek_config(pwm->chip, pwm, state->duty_cycle, state->period); + if (err) + return err; + + if (!pwm->state.enabled) + err = pwm_mediatek_enable(chip, pwm); + + return err; +} + static const struct pwm_ops pwm_mediatek_ops = { - .config = pwm_mediatek_config, - .enable = pwm_mediatek_enable, - .disable = pwm_mediatek_disable, + .apply = pwm_mediatek_apply, .owner = THIS_MODULE, }; From c449a8ca5ea4da2465e7b399ce18e5c5bc824c4f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Sun, 8 May 2022 11:40:32 +0200 Subject: [PATCH 210/334] pwm: lpc18xx: Implement .apply() callback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To eventually get rid of all legacy drivers convert this driver to the modern world implementing .apply(). This pushes a variant of pwm_apply_legacy into the driver that was slightly simplified because the .set_polarity callback was a noop. There is no change in behavior. 
Signed-off-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- drivers/pwm/pwm-lpc18xx-sct.c | 43 ++++++++++++++++++++++++----------- 1 file changed, 30 insertions(+), 13 deletions(-) diff --git a/drivers/pwm/pwm-lpc18xx-sct.c b/drivers/pwm/pwm-lpc18xx-sct.c index b909096dba2f..272e0b5d01b8 100644 --- a/drivers/pwm/pwm-lpc18xx-sct.c +++ b/drivers/pwm/pwm-lpc18xx-sct.c @@ -226,14 +226,7 @@ static int lpc18xx_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, return 0; } -static int lpc18xx_pwm_set_polarity(struct pwm_chip *chip, - struct pwm_device *pwm, - enum pwm_polarity polarity) -{ - return 0; -} - -static int lpc18xx_pwm_enable(struct pwm_chip *chip, struct pwm_device *pwm) +static int lpc18xx_pwm_enable(struct pwm_chip *chip, struct pwm_device *pwm, enum pwm_polarity polarity) { struct lpc18xx_pwm_chip *lpc18xx_pwm = to_lpc18xx_pwm_chip(chip); struct lpc18xx_pwm_data *lpc18xx_data = &lpc18xx_pwm->channeldata[pwm->hwpwm]; @@ -249,7 +242,7 @@ static int lpc18xx_pwm_enable(struct pwm_chip *chip, struct pwm_device *pwm) LPC18XX_PWM_EVSTATEMSK(lpc18xx_data->duty_event), LPC18XX_PWM_EVSTATEMSK_ALL); - if (pwm_get_polarity(pwm) == PWM_POLARITY_NORMAL) { + if (polarity == PWM_POLARITY_NORMAL) { set_event = lpc18xx_pwm->period_event; clear_event = lpc18xx_data->duty_event; res_action = LPC18XX_PWM_RES_SET; @@ -308,11 +301,35 @@ static void lpc18xx_pwm_free(struct pwm_chip *chip, struct pwm_device *pwm) clear_bit(lpc18xx_data->duty_event, &lpc18xx_pwm->event_map); } +static int lpc18xx_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, + const struct pwm_state *state) +{ + int err; + bool enabled = pwm->state.enabled; + + if (state->polarity != pwm->state.polarity && pwm->state.enabled) { + lpc18xx_pwm_disable(chip, pwm); + enabled = false; + } + + if (!state->enabled) { + if (enabled) + lpc18xx_pwm_disable(chip, pwm); + + return 0; + } + + err = lpc18xx_pwm_config(pwm->chip, pwm, state->duty_cycle, state->period); + if (err) + return err; + + if (!enabled) + err = lpc18xx_pwm_enable(chip, pwm, state->polarity); + + return err; +} static const struct pwm_ops lpc18xx_pwm_ops = { - .config = lpc18xx_pwm_config, - .set_polarity = lpc18xx_pwm_set_polarity, - .enable = lpc18xx_pwm_enable, - .disable = lpc18xx_pwm_disable, + .apply = lpc18xx_pwm_apply, .request = lpc18xx_pwm_request, .free = lpc18xx_pwm_free, .owner = THIS_MODULE, From a1bbf823e5e9178ec2f70dd52345fd1a57efe145 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Tue, 17 May 2022 22:07:09 +0200 Subject: [PATCH 211/334] pwm: twl-led: Implement .apply() callback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To eventually get rid of all legacy drivers convert this driver to the modern world implementing .apply(). This just pushed a variant of pwm_apply_legacy() into the driver that was slightly simplified because the driver doesn't provide a .set_polarity() callback. 
Signed-off-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- drivers/pwm/pwm-twl-led.c | 76 +++++++++++++++++++++++++++++++++------ 1 file changed, 66 insertions(+), 10 deletions(-) diff --git a/drivers/pwm/pwm-twl-led.c b/drivers/pwm/pwm-twl-led.c index 49d9f7a78012..ed0b63dd38f1 100644 --- a/drivers/pwm/pwm-twl-led.c +++ b/drivers/pwm/pwm-twl-led.c @@ -137,6 +137,45 @@ out: mutex_unlock(&twl->mutex); } +static int twl4030_pwmled_apply(struct pwm_chip *chip, struct pwm_device *pwm, + const struct pwm_state *state) +{ + int ret; + + if (state->polarity != PWM_POLARITY_NORMAL) + return -EINVAL; + + if (!state->enabled) { + if (pwm->state.enabled) + twl4030_pwmled_disable(chip, pwm); + + return 0; + } + + /* + * We cannot skip calling ->config even if state->period == + * pwm->state.period && state->duty_cycle == pwm->state.duty_cycle + * because we might have exited early in the last call to + * pwm_apply_state because of !state->enabled and so the two values in + * pwm->state might not be configured in hardware. + */ + ret = twl4030_pwmled_config(pwm->chip, pwm, + state->duty_cycle, state->period); + if (ret) + return ret; + + if (!pwm->state.enabled) + ret = twl4030_pwmled_enable(chip, pwm); + + return ret; +} + + +static const struct pwm_ops twl4030_pwmled_ops = { + .apply = twl4030_pwmled_apply, + .owner = THIS_MODULE, +}; + static int twl6030_pwmled_config(struct pwm_chip *chip, struct pwm_device *pwm, int duty_ns, int period_ns) { @@ -206,6 +245,32 @@ out: mutex_unlock(&twl->mutex); } +static int twl6030_pwmled_apply(struct pwm_chip *chip, struct pwm_device *pwm, + const struct pwm_state *state) +{ + int err; + + if (state->polarity != pwm->state.polarity) + return -EINVAL; + + if (!state->enabled) { + if (pwm->state.enabled) + twl6030_pwmled_disable(chip, pwm); + + return 0; + } + + err = twl6030_pwmled_config(pwm->chip, pwm, + state->duty_cycle, state->period); + if (err) + return err; + + if (!pwm->state.enabled) + err = twl6030_pwmled_enable(chip, pwm); + + return err; +} + static int twl6030_pwmled_request(struct pwm_chip *chip, struct pwm_device *pwm) { struct twl_pwmled_chip *twl = to_twl(chip); @@ -257,17 +322,8 @@ out: mutex_unlock(&twl->mutex); } -static const struct pwm_ops twl4030_pwmled_ops = { - .enable = twl4030_pwmled_enable, - .disable = twl4030_pwmled_disable, - .config = twl4030_pwmled_config, - .owner = THIS_MODULE, -}; - static const struct pwm_ops twl6030_pwmled_ops = { - .enable = twl6030_pwmled_enable, - .disable = twl6030_pwmled_disable, - .config = twl6030_pwmled_config, + .apply = twl6030_pwmled_apply, .request = twl6030_pwmled_request, .free = twl6030_pwmled_free, .owner = THIS_MODULE, From 80a22fde803af6f390be49ee5ced6ee75595ba05 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Tue, 17 May 2022 17:05:55 +0200 Subject: [PATCH 212/334] pwm: Document that the pinstate of a disabled PWM isn't reliable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some (most?) drivers emit the inactive state when the PWM is disabled. However there are exceptions, so document that a consumer better doesn't depend on this behaviour. Some known exceptions: - imx27 emits 0 independent of the configured polarity - mxs just drives the output to the last emitted state. - iqs620a makes the output tristated on disable, so an external pull-down would be required. 
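For consumers this means: if the idle level of the pin matters, don't disable the PWM and assume the line sits at the inactive level; keep it enabled with a zero duty cycle instead, as the documentation hunk below suggests. A minimal consumer-side sketch (error handling omitted):

	struct pwm_state state;

	pwm_init_state(pwm, &state);
	state.duty_cycle = 0;	/* constant inactive level */
	state.enabled = true;	/* keep the PWM running instead of disabling it */
	pwm_apply_state(pwm, &state);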
Signed-off-by: Uwe Kleine-König Acked-by: Alexander Stein Signed-off-by: Thierry Reding --- Documentation/driver-api/pwm.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Documentation/driver-api/pwm.rst b/Documentation/driver-api/pwm.rst index ccb06e485756..fd26c3d895b6 100644 --- a/Documentation/driver-api/pwm.rst +++ b/Documentation/driver-api/pwm.rst @@ -49,6 +49,12 @@ After being requested, a PWM has to be configured using:: This API controls both the PWM period/duty_cycle config and the enable/disable state. + +As a consumer, don't rely on the output's state for a disabled PWM. If it's +easily possible, drivers are supposed to emit the inactive state, but some +drivers cannot. If you rely on getting the inactive state, use .duty_cycle=0, +.enabled=true. + There is also a usage_power setting: If set, the PWM driver is only required to maintain the power output but has more freedom regarding signal form. If supported by the driver, the signal can be optimized, for example to improve From 84d0940454a3bca6c7c59cf16b6de2f173e81a11 Mon Sep 17 00:00:00 2001 From: Fabio Baltieri Date: Thu, 28 Apr 2022 10:04:18 +0000 Subject: [PATCH 213/334] dt-bindings: Add mfd/cros_ec definitions Add a dt-bindings include file for cros_ec devicetree definition, define a pair of special purpose PWM channels in it. Signed-off-by: Fabio Baltieri Acked-by: Rob Herring Signed-off-by: Thierry Reding --- include/dt-bindings/mfd/cros_ec.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 include/dt-bindings/mfd/cros_ec.h diff --git a/include/dt-bindings/mfd/cros_ec.h b/include/dt-bindings/mfd/cros_ec.h new file mode 100644 index 000000000000..3b29cd049578 --- /dev/null +++ b/include/dt-bindings/mfd/cros_ec.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* + * DTS binding definitions used for the Chromium OS Embedded Controller. + * + * Copyright (c) 2022 The Chromium OS Authors. All rights reserved. + */ + +#ifndef _DT_BINDINGS_MFD_CROS_EC_H +#define _DT_BINDINGS_MFD_CROS_EC_H + +/* Typed channel for keyboard backlight. */ +#define CROS_EC_PWM_DT_KB_LIGHT 0 +/* Typed channel for display backlight. */ +#define CROS_EC_PWM_DT_DISPLAY_LIGHT 1 +/* Number of typed channels. */ +#define CROS_EC_PWM_DT_COUNT 2 + +#endif From a48d66d87274b7ec538cdf82abc8d3ebf4bf0363 Mon Sep 17 00:00:00 2001 From: Fabio Baltieri Date: Thu, 28 Apr 2022 10:04:20 +0000 Subject: [PATCH 214/334] dt-bindings: google,cros-ec-pwm: Add the new -type compatible Update google,cros-ec-pwm node documentation to mention the google,cros-ec-pwm-type compatible as a valid alternative. Signed-off-by: Fabio Baltieri Reviewed-by: Rob Herring Signed-off-by: Thierry Reding --- .../devicetree/bindings/pwm/google,cros-ec-pwm.yaml | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/pwm/google,cros-ec-pwm.yaml b/Documentation/devicetree/bindings/pwm/google,cros-ec-pwm.yaml index 7ab6912a845f..c8577bdf6c94 100644 --- a/Documentation/devicetree/bindings/pwm/google,cros-ec-pwm.yaml +++ b/Documentation/devicetree/bindings/pwm/google,cros-ec-pwm.yaml @@ -21,7 +21,14 @@ allOf: properties: compatible: - const: google,cros-ec-pwm + oneOf: + - description: PWM controlled using EC_PWM_TYPE_GENERIC channels. + items: + - const: google,cros-ec-pwm + - description: PWM controlled using CROS_EC_PWM_DT_<...> types. + items: + - const: google,cros-ec-pwm-type + "#pwm-cells": description: The cell specifies the PWM index. 
const: 1 From 3d593b6e80ad2c911b5645af28d83eabb96e7c1b Mon Sep 17 00:00:00 2001 From: Fabio Baltieri Date: Thu, 28 Apr 2022 10:04:19 +0000 Subject: [PATCH 215/334] pwm: pwm-cros-ec: Add channel type support Add support for EC_PWM_TYPE_DISPLAY_LIGHT and EC_PWM_TYPE_KB_LIGHT pwm types to the PWM cros_ec_pwm driver. This allows specifying one of these PWM channel by functionality, and let the EC firmware pick the correct channel, thus abstracting the hardware implementation from the kernel driver. To use it, define the node with the "google,cros-ec-pwm-type" compatible. Signed-off-by: Fabio Baltieri Reviewed-by: Tzung-Bi Shih Signed-off-by: Thierry Reding --- drivers/pwm/pwm-cros-ec.c | 82 ++++++++++++++++++++++++++++++++------- 1 file changed, 67 insertions(+), 15 deletions(-) diff --git a/drivers/pwm/pwm-cros-ec.c b/drivers/pwm/pwm-cros-ec.c index 5e29d9c682c3..7f10f56c3eb6 100644 --- a/drivers/pwm/pwm-cros-ec.c +++ b/drivers/pwm/pwm-cros-ec.c @@ -12,17 +12,21 @@ #include #include +#include + /** * struct cros_ec_pwm_device - Driver data for EC PWM * * @dev: Device node * @ec: Pointer to EC device * @chip: PWM controller chip + * @use_pwm_type: Use PWM types instead of generic channels */ struct cros_ec_pwm_device { struct device *dev; struct cros_ec_device *ec; struct pwm_chip chip; + bool use_pwm_type; }; /** @@ -58,14 +62,31 @@ static void cros_ec_pwm_free(struct pwm_chip *chip, struct pwm_device *pwm) kfree(channel); } -static int cros_ec_pwm_set_duty(struct cros_ec_device *ec, u8 index, u16 duty) +static int cros_ec_dt_type_to_pwm_type(u8 dt_index, u8 *pwm_type) { + switch (dt_index) { + case CROS_EC_PWM_DT_KB_LIGHT: + *pwm_type = EC_PWM_TYPE_KB_LIGHT; + return 0; + case CROS_EC_PWM_DT_DISPLAY_LIGHT: + *pwm_type = EC_PWM_TYPE_DISPLAY_LIGHT; + return 0; + default: + return -EINVAL; + } +} + +static int cros_ec_pwm_set_duty(struct cros_ec_pwm_device *ec_pwm, u8 index, + u16 duty) +{ + struct cros_ec_device *ec = ec_pwm->ec; struct { struct cros_ec_command msg; struct ec_params_pwm_set_duty params; } __packed buf; struct ec_params_pwm_set_duty *params = &buf.params; struct cros_ec_command *msg = &buf.msg; + int ret; memset(&buf, 0, sizeof(buf)); @@ -75,14 +96,25 @@ static int cros_ec_pwm_set_duty(struct cros_ec_device *ec, u8 index, u16 duty) msg->outsize = sizeof(*params); params->duty = duty; - params->pwm_type = EC_PWM_TYPE_GENERIC; - params->index = index; + + if (ec_pwm->use_pwm_type) { + ret = cros_ec_dt_type_to_pwm_type(index, ¶ms->pwm_type); + if (ret) { + dev_err(ec->dev, "Invalid PWM type index: %d\n", index); + return ret; + } + params->index = 0; + } else { + params->pwm_type = EC_PWM_TYPE_GENERIC; + params->index = index; + } return cros_ec_cmd_xfer_status(ec, msg); } -static int cros_ec_pwm_get_duty(struct cros_ec_device *ec, u8 index) +static int cros_ec_pwm_get_duty(struct cros_ec_pwm_device *ec_pwm, u8 index) { + struct cros_ec_device *ec = ec_pwm->ec; struct { struct cros_ec_command msg; union { @@ -102,8 +134,17 @@ static int cros_ec_pwm_get_duty(struct cros_ec_device *ec, u8 index) msg->insize = sizeof(*resp); msg->outsize = sizeof(*params); - params->pwm_type = EC_PWM_TYPE_GENERIC; - params->index = index; + if (ec_pwm->use_pwm_type) { + ret = cros_ec_dt_type_to_pwm_type(index, ¶ms->pwm_type); + if (ret) { + dev_err(ec->dev, "Invalid PWM type index: %d\n", index); + return ret; + } + params->index = 0; + } else { + params->pwm_type = EC_PWM_TYPE_GENERIC; + params->index = index; + } ret = cros_ec_cmd_xfer_status(ec, msg); if (ret < 0) @@ -133,7 +174,7 @@ static int 
cros_ec_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, */ duty_cycle = state->enabled ? state->duty_cycle : 0; - ret = cros_ec_pwm_set_duty(ec_pwm->ec, pwm->hwpwm, duty_cycle); + ret = cros_ec_pwm_set_duty(ec_pwm, pwm->hwpwm, duty_cycle); if (ret < 0) return ret; @@ -149,7 +190,7 @@ static void cros_ec_pwm_get_state(struct pwm_chip *chip, struct pwm_device *pwm, struct cros_ec_pwm *channel = pwm_get_chip_data(pwm); int ret; - ret = cros_ec_pwm_get_duty(ec_pwm->ec, pwm->hwpwm); + ret = cros_ec_pwm_get_duty(ec_pwm, pwm->hwpwm); if (ret < 0) { dev_err(chip->dev, "error getting initial duty: %d\n", ret); return; @@ -204,13 +245,13 @@ static const struct pwm_ops cros_ec_pwm_ops = { * of PWMs it supports directly, so we have to read the pwm duty cycle for * subsequent channels until we get an error. */ -static int cros_ec_num_pwms(struct cros_ec_device *ec) +static int cros_ec_num_pwms(struct cros_ec_pwm_device *ec_pwm) { int i, ret; /* The index field is only 8 bits */ for (i = 0; i <= U8_MAX; i++) { - ret = cros_ec_pwm_get_duty(ec, i); + ret = cros_ec_pwm_get_duty(ec_pwm, i); /* * We look for SUCCESS, INVALID_COMMAND, or INVALID_PARAM * responses; everything else is treated as an error. @@ -236,6 +277,7 @@ static int cros_ec_pwm_probe(struct platform_device *pdev) { struct cros_ec_device *ec = dev_get_drvdata(pdev->dev.parent); struct device *dev = &pdev->dev; + struct device_node *np = pdev->dev.of_node; struct cros_ec_pwm_device *ec_pwm; struct pwm_chip *chip; int ret; @@ -251,17 +293,26 @@ static int cros_ec_pwm_probe(struct platform_device *pdev) chip = &ec_pwm->chip; ec_pwm->ec = ec; + if (of_device_is_compatible(np, "google,cros-ec-pwm-type")) + ec_pwm->use_pwm_type = true; + /* PWM chip */ chip->dev = dev; chip->ops = &cros_ec_pwm_ops; chip->of_xlate = cros_ec_pwm_xlate; chip->of_pwm_n_cells = 1; - ret = cros_ec_num_pwms(ec); - if (ret < 0) { - dev_err(dev, "Couldn't find PWMs: %d\n", ret); - return ret; + + if (ec_pwm->use_pwm_type) { + chip->npwm = CROS_EC_PWM_DT_COUNT; + } else { + ret = cros_ec_num_pwms(ec_pwm); + if (ret < 0) { + dev_err(dev, "Couldn't find PWMs: %d\n", ret); + return ret; + } + chip->npwm = ret; } - chip->npwm = ret; + dev_dbg(dev, "Probed %u PWMs\n", chip->npwm); ret = pwmchip_add(chip); @@ -288,6 +339,7 @@ static int cros_ec_pwm_remove(struct platform_device *dev) #ifdef CONFIG_OF static const struct of_device_id cros_ec_pwm_of_match[] = { { .compatible = "google,cros-ec-pwm" }, + { .compatible = "google,cros-ec-pwm-type" }, {}, }; MODULE_DEVICE_TABLE(of, cros_ec_pwm_of_match); From b1912875a548cf32eebc77eec85034052909bba8 Mon Sep 17 00:00:00 2001 From: Primoz Fiser Date: Fri, 22 Apr 2022 09:27:11 +0200 Subject: [PATCH 216/334] dt-bindings: mfd: da9063: watchdog: add suspend disable option Document the watchdog disable option which can be used if the hardware automatic suspend option is broken. Based on commit c514430c51ee8 ("dt-bindings: watchdog: da9062: add suspend disable option"). 
Signed-off-by: Primoz Fiser Reviewed-by: Rob Herring Reviewed-by: Adam Thomson Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20220422072713.3172345-1-primoz.fiser@norik.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- Documentation/devicetree/bindings/mfd/da9063.txt | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/Documentation/devicetree/bindings/mfd/da9063.txt b/Documentation/devicetree/bindings/mfd/da9063.txt index 91b79a21d403..aa8b800cc4ad 100644 --- a/Documentation/devicetree/bindings/mfd/da9063.txt +++ b/Documentation/devicetree/bindings/mfd/da9063.txt @@ -64,10 +64,13 @@ Sub-nodes: and KEY_SLEEP. - watchdog : This node defines settings for the Watchdog timer associated - with the DA9063 and DA9063L. There are currently no entries in this - binding, however compatible = "dlg,da9063-watchdog" should be added - if a node is created. + with the DA9063 and DA9063L. The node should contain the compatible property + with the value "dlg,da9063-watchdog". + Optional watchdog properties: + - dlg,use-sw-pm: Add this property to disable the watchdog during suspend. + Only use this option if you can't use the watchdog automatic suspend + function during a suspend (see register CONTROL_B). Example: From a7ceca4398bc835c6ad29d10e1106de4cf0332e1 Mon Sep 17 00:00:00 2001 From: Primoz Fiser Date: Fri, 22 Apr 2022 09:27:12 +0200 Subject: [PATCH 217/334] watchdog: da9063: optionally disable watchdog during suspend Optionally disable watchdog during suspend (if enabled) and re-enable it upon resume. This enables boards to sleep without being interrupted by the watchdog. This patch is based on commit f6c98b08381c ("watchdog: da9062: add power management ops") and commit 8541673d2a5f ("watchdog: da9062: fix power management ops") and brings the same functionality to DA9063. Signed-off-by: Primoz Fiser Reviewed-by: Adam Thomson Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20220422072713.3172345-2-primoz.fiser@norik.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/da9063_wdt.c | 36 +++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/drivers/watchdog/da9063_wdt.c b/drivers/watchdog/da9063_wdt.c index 9adad1862bbd..09a4af4c58fc 100644 --- a/drivers/watchdog/da9063_wdt.c +++ b/drivers/watchdog/da9063_wdt.c @@ -18,6 +18,7 @@ #include #include #include +#include #include /* @@ -26,6 +27,8 @@ * others: timeout = 2048 ms * 2^(TWDSCALE-1). 
*/ static const unsigned int wdt_timeout[] = { 0, 2, 4, 8, 16, 32, 65, 131 }; +static bool use_sw_pm; + #define DA9063_TWDSCALE_DISABLE 0 #define DA9063_TWDSCALE_MIN 1 #define DA9063_TWDSCALE_MAX (ARRAY_SIZE(wdt_timeout) - 1) @@ -218,6 +221,8 @@ static int da9063_wdt_probe(struct platform_device *pdev) if (!wdd) return -ENOMEM; + use_sw_pm = device_property_present(dev, "dlg,use-sw-pm"); + wdd->info = &da9063_watchdog_info; wdd->ops = &da9063_watchdog_ops; wdd->min_timeout = DA9063_WDT_MIN_TIMEOUT; @@ -228,6 +233,7 @@ static int da9063_wdt_probe(struct platform_device *pdev) watchdog_set_restart_priority(wdd, 128); watchdog_set_drvdata(wdd, da9063); + dev_set_drvdata(dev, wdd); wdd->timeout = DA9063_WDG_TIMEOUT; @@ -249,10 +255,40 @@ static int da9063_wdt_probe(struct platform_device *pdev) return devm_watchdog_register_device(dev, wdd); } +static int __maybe_unused da9063_wdt_suspend(struct device *dev) +{ + struct watchdog_device *wdd = dev_get_drvdata(dev); + + if (!use_sw_pm) + return 0; + + if (watchdog_active(wdd)) + return da9063_wdt_stop(wdd); + + return 0; +} + +static int __maybe_unused da9063_wdt_resume(struct device *dev) +{ + struct watchdog_device *wdd = dev_get_drvdata(dev); + + if (!use_sw_pm) + return 0; + + if (watchdog_active(wdd)) + return da9063_wdt_start(wdd); + + return 0; +} + +static SIMPLE_DEV_PM_OPS(da9063_wdt_pm_ops, + da9063_wdt_suspend, da9063_wdt_resume); + static struct platform_driver da9063_wdt_driver = { .probe = da9063_wdt_probe, .driver = { .name = DA9063_DRVNAME_WATCHDOG, + .pm = &da9063_wdt_pm_ops, }, }; module_platform_driver(da9063_wdt_driver); From ac97c9374d3cdfa42611a32a653ac75db423cac4 Mon Sep 17 00:00:00 2001 From: Eliav Farber Date: Thu, 14 Apr 2022 05:42:33 +0000 Subject: [PATCH 218/334] watchdog: sp805: disable watchdog on remove Disable the watchdog if it is active while removing the module. It is necessary in order to prevent a reset in case watchdog hw was running before the removal. Signed-off-by: Eliav Farber Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20220414054233.1357-2-farbere@amazon.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/sp805_wdt.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/watchdog/sp805_wdt.c b/drivers/watchdog/sp805_wdt.c index dbeb2146c968..f9479a3fe2a6 100644 --- a/drivers/watchdog/sp805_wdt.c +++ b/drivers/watchdog/sp805_wdt.c @@ -272,6 +272,7 @@ sp805_wdt_probe(struct amba_device *adev, const struct amba_id *id) watchdog_set_nowayout(&wdt->wdd, nowayout); watchdog_set_drvdata(&wdt->wdd, wdt); watchdog_set_restart_priority(&wdt->wdd, 128); + watchdog_stop_on_unregister(&wdt->wdd); /* * If 'timeout-sec' devicetree property is specified, use that. From 95d0eee9718a21ed35fb80a68d3a71759b136f77 Mon Sep 17 00:00:00 2001 From: Kunihiko Hayashi Date: Thu, 7 Apr 2022 16:46:46 +0900 Subject: [PATCH 219/334] dt-bindings: watchdog: uniphier: Use unevaluatedProperties This refers common bindings, so this is preferred for unevaluatedProperties instead of additionalProperties. 
Signed-off-by: Kunihiko Hayashi Reviewed-by: Rob Herring Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/1649317606-21267-1-git-send-email-hayashi.kunihiko@socionext.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- .../devicetree/bindings/watchdog/socionext,uniphier-wdt.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/watchdog/socionext,uniphier-wdt.yaml b/Documentation/devicetree/bindings/watchdog/socionext,uniphier-wdt.yaml index a059d16cb4f2..90698cfa8f94 100644 --- a/Documentation/devicetree/bindings/watchdog/socionext,uniphier-wdt.yaml +++ b/Documentation/devicetree/bindings/watchdog/socionext,uniphier-wdt.yaml @@ -19,7 +19,7 @@ properties: required: - compatible -additionalProperties: false +unevaluatedProperties: false examples: - | From c83f643878388d468e0a674ac9502d4080499646 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 21 Feb 2022 17:22:38 +0100 Subject: [PATCH 220/334] watchdog: rti_wdt: Fix calculation and evaluation of preset heartbeat This ensures that the same value is read back as was eventually programmed when using seconds as accuracy. Even then, comparing the more precise heartbeat_ms against heartbeat in seconds will almost never provide a match and will needlessly raise a warning. Fix by comparing apples to apples. Tested in combination with U-Boot as watchdog starter. Signed-off-by: Jan Kiszka Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/6a4b54ac-9588-e172-c4c7-b91d524a851e@siemens.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/rti_wdt.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/watchdog/rti_wdt.c b/drivers/watchdog/rti_wdt.c index db843f825860..35f3c396a2ff 100644 --- a/drivers/watchdog/rti_wdt.c +++ b/drivers/watchdog/rti_wdt.c @@ -253,6 +253,7 @@ static int rti_wdt_probe(struct platform_device *pdev) } if (readl(wdt->base + RTIDWDCTRL) == WDENABLE_KEY) { + int preset_heartbeat; u32 time_left_ms; u64 heartbeat_ms; u32 wsize; @@ -263,11 +264,12 @@ static int rti_wdt_probe(struct platform_device *pdev) heartbeat_ms <<= WDT_PRELOAD_SHIFT; heartbeat_ms *= 1000; do_div(heartbeat_ms, wdt->freq); - if (heartbeat_ms != heartbeat * 1000) + preset_heartbeat = heartbeat_ms + 500; + preset_heartbeat /= 1000; + if (preset_heartbeat != heartbeat) dev_warn(dev, "watchdog already running, ignoring heartbeat config!\n"); - heartbeat = heartbeat_ms; - heartbeat /= 1000; + heartbeat = preset_heartbeat; wsize = readl(wdt->base + RTIWWDSIZECTRL); ret = rti_wdt_setup_hw_hb(wdd, wsize); From 26d14b9fc341893e862f17ab7db4543d5a530327 Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Fri, 8 Apr 2022 14:28:54 -0700 Subject: [PATCH 221/334] dt-bindings: watchdog: Add SC8180X and SC8280XP compatibles Add compatibles for the SC8180X and SC8280XP platforms to the Qualcomm watchdog binding. 
Signed-off-by: Bjorn Andersson Acked-by: Krzysztof Kozlowski Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20220408212854.581481-1-bjorn.andersson@linaro.org [groeck: Rebased and resolved conflicts] Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- Documentation/devicetree/bindings/watchdog/qcom-wdt.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/devicetree/bindings/watchdog/qcom-wdt.yaml b/Documentation/devicetree/bindings/watchdog/qcom-wdt.yaml index 4ff8c59c59ab..2bd6b4a52637 100644 --- a/Documentation/devicetree/bindings/watchdog/qcom-wdt.yaml +++ b/Documentation/devicetree/bindings/watchdog/qcom-wdt.yaml @@ -20,6 +20,8 @@ properties: - qcom,apss-wdt-qcs404 - qcom,apss-wdt-sc7180 - qcom,apss-wdt-sc7280 + - qcom,apss-wdt-sc8180x + - qcom,apss-wdt-sc8280xp - qcom,apss-wdt-sdm845 - qcom,apss-wdt-sdx55 - qcom,apss-wdt-sm6350 From b3ac0c58fa8934926360268f3d89ec7680644d7b Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Tue, 12 Apr 2022 07:08:23 +0000 Subject: [PATCH 222/334] watchdog: rti-wdt: Fix pm_runtime_get_sync() error checking If the device is already in a runtime PM enabled state pm_runtime_get_sync() will return 1, so a test for negative value should be used to check for errors. Fixes: 2d63908bdbfb ("watchdog: Add K3 RTI watchdog support") Signed-off-by: Miaoqian Lin Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20220412070824.23708-1-linmq006@gmail.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/rti_wdt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/watchdog/rti_wdt.c b/drivers/watchdog/rti_wdt.c index 35f3c396a2ff..053ef3bde12d 100644 --- a/drivers/watchdog/rti_wdt.c +++ b/drivers/watchdog/rti_wdt.c @@ -226,7 +226,7 @@ static int rti_wdt_probe(struct platform_device *pdev) pm_runtime_enable(dev); ret = pm_runtime_get_sync(dev); - if (ret) { + if (ret < 0) { pm_runtime_put_noidle(dev); pm_runtime_disable(&pdev->dev); return dev_err_probe(dev, ret, "runtime pm failed\n"); From 9ef958929fc288a85146d18368f4d2aaac80cdbc Mon Sep 17 00:00:00 2001 From: Liu Xinpeng Date: Tue, 26 Apr 2022 18:11:45 +0800 Subject: [PATCH 223/334] watchdog: iTCO_wdt: Using existing macro define covers more scenarios For power management, SET_NOIRQ_SYSTEM_SLEEP_PM_OPS defined for CONFIG_PM_SLEEP, will point ->suspend_noirq, ->freeze_noirq and ->poweroff_noirq to the same function. Vice versa happens for ->resume_noirq, ->thaw_noirq and ->restore_noirq. Signed-off-by: Liu Xinpeng Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/1650967905-3199-1-git-send-email-liuxp11@chinatelecom.cn Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/iTCO_wdt.c | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/drivers/watchdog/iTCO_wdt.c b/drivers/watchdog/iTCO_wdt.c index 3f2f4343644f..34693f11385f 100644 --- a/drivers/watchdog/iTCO_wdt.c +++ b/drivers/watchdog/iTCO_wdt.c @@ -596,7 +596,6 @@ static int iTCO_wdt_probe(struct platform_device *pdev) return 0; } -#ifdef CONFIG_PM_SLEEP /* * Suspend-to-idle requires this, because it stops the ticks and timekeeping, so * the watchdog cannot be pinged while in that state. 
In ACPI sleep states the @@ -604,15 +603,15 @@ static int iTCO_wdt_probe(struct platform_device *pdev) */ #ifdef CONFIG_ACPI -static inline bool need_suspend(void) +static inline bool __maybe_unused need_suspend(void) { return acpi_target_system_state() == ACPI_STATE_S0; } #else -static inline bool need_suspend(void) { return true; } +static inline bool __maybe_unused need_suspend(void) { return true; } #endif -static int iTCO_wdt_suspend_noirq(struct device *dev) +static int __maybe_unused iTCO_wdt_suspend_noirq(struct device *dev) { struct iTCO_wdt_private *p = dev_get_drvdata(dev); int ret = 0; @@ -626,7 +625,7 @@ static int iTCO_wdt_suspend_noirq(struct device *dev) return ret; } -static int iTCO_wdt_resume_noirq(struct device *dev) +static int __maybe_unused iTCO_wdt_resume_noirq(struct device *dev) { struct iTCO_wdt_private *p = dev_get_drvdata(dev); @@ -637,20 +636,15 @@ static int iTCO_wdt_resume_noirq(struct device *dev) } static const struct dev_pm_ops iTCO_wdt_pm = { - .suspend_noirq = iTCO_wdt_suspend_noirq, - .resume_noirq = iTCO_wdt_resume_noirq, + SET_NOIRQ_SYSTEM_SLEEP_PM_OPS(iTCO_wdt_suspend_noirq, + iTCO_wdt_resume_noirq) }; -#define ITCO_WDT_PM_OPS (&iTCO_wdt_pm) -#else -#define ITCO_WDT_PM_OPS NULL -#endif /* CONFIG_PM_SLEEP */ - static struct platform_driver iTCO_wdt_driver = { .probe = iTCO_wdt_probe, .driver = { .name = DRV_NAME, - .pm = ITCO_WDT_PM_OPS, + .pm = &iTCO_wdt_pm, }, }; From af84a5a75344871c44ddbf4ab901c3427909f893 Mon Sep 17 00:00:00 2001 From: Biju Das Date: Sun, 24 Apr 2022 08:13:23 +0100 Subject: [PATCH 224/334] dt-bindings: watchdog: renesas,wdt: Document RZ/G2UL SoC Document RZ/G2UL WDT bindings. RZ/G2UL WDT is similar to one found on the RZ/G2L SoC. No driver changes are required as generic compatible string "renesas,rzg2l-wdt" will be used as a fallback. Signed-off-by: Biju Das Acked-by: Krzysztof Kozlowski Reviewed-by: Geert Uytterhoeven Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20220424071323.151757-1-biju.das.jz@bp.renesas.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- Documentation/devicetree/bindings/watchdog/renesas,wdt.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/watchdog/renesas,wdt.yaml b/Documentation/devicetree/bindings/watchdog/renesas,wdt.yaml index 59a48387e72d..84c008d956e4 100644 --- a/Documentation/devicetree/bindings/watchdog/renesas,wdt.yaml +++ b/Documentation/devicetree/bindings/watchdog/renesas,wdt.yaml @@ -21,6 +21,7 @@ properties: - items: - enum: + - renesas,r9a07g043-wdt # RZ/G2UL - renesas,r9a07g044-wdt # RZ/G2{L,LC} - renesas,r9a07g054-wdt # RZ/V2L - const: renesas,rzg2l-wdt From 382256b219fbc901a69481e5b60812556f53152e Mon Sep 17 00:00:00 2001 From: Andrej Picej Date: Mon, 6 Dec 2021 07:47:31 +0100 Subject: [PATCH 225/334] dt-bindings: watchdog: da9062: add watchdog timeout mode Document the watchdog timeout mode property. If this property is used the user can select what happens on watchdog timeout. Set this property to 1 to enable SHUTDOWN (the device resets), set it to 0 and the device will go to POWERDOWN on watchdog timeout. If this property is not set, don't touch the WATCHDOG_SD bit and leave the configuration to OTP. This way backward compatibility is not broken. 
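The driver side of this property is not part of this excerpt; roughly, a driver would be expected to read the property if present and update the watchdog shutdown configuration bit accordingly, leaving the OTP default untouched when the property is absent. A sketch under that assumption, with placeholder register/mask names and assuming dev and the chip's regmap are at hand:

	u32 wdt_sd;

	/* DA9062_WDT_SD_REG/MASK are placeholders for the real CONFIG register bit */
	if (!device_property_read_u32(dev, "dlg,wdt-sd", &wdt_sd))
		regmap_update_bits(regmap, DA9062_WDT_SD_REG, DA9062_WDT_SD_MASK,
				   wdt_sd ? DA9062_WDT_SD_MASK : 0);
	/* property absent: keep the chip's OTP setting */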
Signed-off-by: Andrej Picej Reviewed-by: Adam Thomson Acked-by: Rob Herring Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20211206064732.280375-4-andrej.picej@norik.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- Documentation/devicetree/bindings/watchdog/da9062-wdt.txt | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Documentation/devicetree/bindings/watchdog/da9062-wdt.txt b/Documentation/devicetree/bindings/watchdog/da9062-wdt.txt index 950e4fba8dbc..354314d854ef 100644 --- a/Documentation/devicetree/bindings/watchdog/da9062-wdt.txt +++ b/Documentation/devicetree/bindings/watchdog/da9062-wdt.txt @@ -10,6 +10,12 @@ Optional properties: - dlg,use-sw-pm: Add this property to disable the watchdog during suspend. Only use this option if you can't use the watchdog automatic suspend function during a suspend (see register CONTROL_B). +- dlg,wdt-sd: Set what happens on watchdog timeout. If this bit is set the + watchdog timeout triggers SHUTDOWN, if cleared the watchdog triggers + POWERDOWN. Can be 0 or 1. Only use this option if you want to change the + default chip's OTP setting for WATCHDOG_SD bit. If this property is NOT + set the WATCHDOG_SD bit and on timeout watchdog behavior will match the + chip's OTP settings. Example: DA9062 From 6d72c7ac9fbe26a77800676507da980436b40b2f Mon Sep 17 00:00:00 2001 From: Liu Xinpeng Date: Tue, 26 Apr 2022 22:53:28 +0800 Subject: [PATCH 226/334] watchdog: wdat_wdt: Using the existing function to check parameter timeout If max_hw_heartbeat_ms is provided, the configured maximum timeout is not limited by it. The limit check in this driver therefore doesn't make much sense. Similar, the watchdog core ensures that minimum timeout limits are met if min_hw_heartbeat_ms is set. Using watchdog_timeout_invalid() makes more sense because it takes this into account. Signed-off-by: Liu Xinpeng Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/1650984810-6247-2-git-send-email-liuxp11@chinatelecom.cn Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/wdat_wdt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/watchdog/wdat_wdt.c b/drivers/watchdog/wdat_wdt.c index 195c8c004b69..df0865a61a70 100644 --- a/drivers/watchdog/wdat_wdt.c +++ b/drivers/watchdog/wdat_wdt.c @@ -344,6 +344,7 @@ static int wdat_wdt_probe(struct platform_device *pdev) wdat->period = tbl->timer_period; wdat->wdd.min_hw_heartbeat_ms = wdat->period * tbl->min_count; wdat->wdd.max_hw_heartbeat_ms = wdat->period * tbl->max_count; + wdat->wdd.min_timeout = 1; wdat->stopped_in_sleep = tbl->flags & ACPI_WDAT_STOPPED; wdat->wdd.info = &wdat_wdt_info; wdat->wdd.ops = &wdat_wdt_ops; @@ -450,8 +451,7 @@ static int wdat_wdt_probe(struct platform_device *pdev) * watchdog properly after it has opened the device. In some cases * the BIOS default is too short and causes immediate reboot. 
*/ - if (timeout * 1000 < wdat->wdd.min_hw_heartbeat_ms || - timeout * 1000 > wdat->wdd.max_hw_heartbeat_ms) { + if (watchdog_timeout_invalid(&wdat->wdd, timeout)) { dev_warn(dev, "Invalid timeout %d given, using %d\n", timeout, WDAT_DEFAULT_TIMEOUT); timeout = WDAT_DEFAULT_TIMEOUT;
From 27fdf84510a1374748904db43f6755f912736d92 Mon Sep 17 00:00:00 2001 From: Liu Xinpeng Date: Tue, 26 Apr 2022 22:53:29 +0800 Subject: [PATCH 227/334] watchdog: wdat_wdt: Stop watchdog when rebooting the system After executing the reboot command several times on a "Dell PowerEdge R740" machine, the UEFI security detection stopped the machine with the following prompt: UEFI0082: The system was reset due to a timeout from the watchdog timer. Check the System Event Log (SEL) or crash dumps from the Operating System to identify the source that triggered the watchdog timer reset. Update the firmware or driver for the identified device. iDRAC logs a warning event: "The watchdog timer reset the system". This patch fixes the issue by stopping the watchdog on reboot via a reboot notifier (watchdog_stop_on_reboot()). Signed-off-by: Liu Xinpeng Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/1650984810-6247-3-git-send-email-liuxp11@chinatelecom.cn Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/wdat_wdt.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/watchdog/wdat_wdt.c b/drivers/watchdog/wdat_wdt.c index df0865a61a70..6f36a653767b 100644 --- a/drivers/watchdog/wdat_wdt.c +++ b/drivers/watchdog/wdat_wdt.c @@ -462,6 +462,7 @@ static int wdat_wdt_probe(struct platform_device *pdev) return ret; watchdog_set_nowayout(&wdat->wdd, nowayout); + watchdog_stop_on_reboot(&wdat->wdd); return devm_watchdog_register_device(dev, &wdat->wdd); }
From 330415ebea81b65842e4cc6d2fd985c1b369e650 Mon Sep 17 00:00:00 2001 From: Liu Xinpeng Date: Tue, 26 Apr 2022 22:53:30 +0800 Subject: [PATCH 228/334] watchdog: wdat_wdt: Stop watchdog when uninstalling module Testing shows that the watchdog still reboots the machine after the module is removed. Use watchdog_stop_on_unregister() to stop the watchdog when the device is unregistered on module removal. Signed-off-by: Liu Xinpeng Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/1650984810-6247-4-git-send-email-liuxp11@chinatelecom.cn Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/wdat_wdt.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/watchdog/wdat_wdt.c b/drivers/watchdog/wdat_wdt.c index 6f36a653767b..e6f95e99156d 100644 --- a/drivers/watchdog/wdat_wdt.c +++ b/drivers/watchdog/wdat_wdt.c @@ -463,6 +463,7 @@ static int wdat_wdt_probe(struct platform_device *pdev) watchdog_set_nowayout(&wdat->wdd, nowayout); watchdog_stop_on_reboot(&wdat->wdd); + watchdog_stop_on_unregister(&wdat->wdd); return devm_watchdog_register_device(dev, &wdat->wdd); }
From 9dc731bbb455eeaff6e90abef7a8d05dfa6ccc9a Mon Sep 17 00:00:00 2001 From: Jean-Jacques Hiblot Date: Wed, 27 Apr 2022 15:55:30 +0200 Subject: [PATCH 229/334] dt-bindings: watchdog: renesas,wdt: Add support for RZ/N1 Describe the WDT hardware in the RZ/N1 series.
Signed-off-by: Jean-Jacques Hiblot Reviewed-by: Geert Uytterhoeven Acked-by: Rob Herring Link: https://lore.kernel.org/r/20220427135531.708279-2-jjhiblot@traphandler.com [groeck: Rebased, fixed conflicts] Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- Documentation/devicetree/bindings/watchdog/renesas,wdt.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Documentation/devicetree/bindings/watchdog/renesas,wdt.yaml b/Documentation/devicetree/bindings/watchdog/renesas,wdt.yaml index 84c008d956e4..514d400c7a36 100644 --- a/Documentation/devicetree/bindings/watchdog/renesas,wdt.yaml +++ b/Documentation/devicetree/bindings/watchdog/renesas,wdt.yaml @@ -19,6 +19,11 @@ properties: - renesas,r7s9210-wdt # RZ/A2 - const: renesas,rza-wdt # RZ/A + - items: + - enum: + - renesas,r9a06g032-wdt # RZ/N1D + - const: renesas,rzn1-wdt # RZ/N1 + - items: - enum: - renesas,r9a07g043-wdt # RZ/G2UL @@ -96,6 +101,7 @@ allOf: contains: enum: - renesas,rza-wdt + - renesas,rzn1-wdt then: required: - power-domains From d65112f58464fd0ab62b8e4fb6c66cbf29b24b43 Mon Sep 17 00:00:00 2001 From: Phil Edworthy Date: Wed, 27 Apr 2022 15:55:31 +0200 Subject: [PATCH 230/334] watchdog: Add Renesas RZ/N1 Watchdog driver This is a driver for the standard WDT on the RZ/N1 devices. This WDT has very limited timeout capabilities. However, it can reset the device. To do so, the corresponding bits in the SysCtrl RSTEN register need to be enabled. This is not done by this driver. Signed-off-by: Phil Edworthy Signed-off-by: Jean-Jacques Hiblot Reviewed-by: Tzung-Bi Shih Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20220427135531.708279-3-jjhiblot@traphandler.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/Kconfig | 8 ++ drivers/watchdog/Makefile | 1 + drivers/watchdog/rzn1_wdt.c | 203 ++++++++++++++++++++++++++++++++++++ 3 files changed, 212 insertions(+) create mode 100644 drivers/watchdog/rzn1_wdt.c diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig index a7cd3ef5b3d8..001ae1be9b61 100644 --- a/drivers/watchdog/Kconfig +++ b/drivers/watchdog/Kconfig @@ -883,6 +883,14 @@ config RENESAS_RZAWDT This driver adds watchdog support for the integrated watchdogs in the Renesas RZ/A SoCs. These watchdogs can be used to reset a system. +config RENESAS_RZN1WDT + tristate "Renesas RZ/N1 watchdog" + depends on ARCH_RENESAS || COMPILE_TEST + select WATCHDOG_CORE + help + This driver adds watchdog support for the integrated watchdogs in the + Renesas RZ/N1 SoCs. These watchdogs can be used to reset a system. + config RENESAS_RZG2LWDT tristate "Renesas RZ/G2L WDT Watchdog" depends on ARCH_RENESAS || COMPILE_TEST diff --git a/drivers/watchdog/Makefile b/drivers/watchdog/Makefile index 71c933a5964a..5f88a6237f7c 100644 --- a/drivers/watchdog/Makefile +++ b/drivers/watchdog/Makefile @@ -84,6 +84,7 @@ obj-$(CONFIG_LPC18XX_WATCHDOG) += lpc18xx_wdt.o obj-$(CONFIG_BCM7038_WDT) += bcm7038_wdt.o obj-$(CONFIG_RENESAS_WDT) += renesas_wdt.o obj-$(CONFIG_RENESAS_RZAWDT) += rza_wdt.o +obj-$(CONFIG_RENESAS_RZN1WDT) += rzn1_wdt.o obj-$(CONFIG_RENESAS_RZG2LWDT) += rzg2l_wdt.o obj-$(CONFIG_ASPEED_WATCHDOG) += aspeed_wdt.o obj-$(CONFIG_STM32_WATCHDOG) += stm32_iwdg.o diff --git a/drivers/watchdog/rzn1_wdt.c b/drivers/watchdog/rzn1_wdt.c new file mode 100644 index 000000000000..55ab384b9965 --- /dev/null +++ b/drivers/watchdog/rzn1_wdt.c @@ -0,0 +1,203 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Renesas RZ/N1 Watchdog timer. 
+ * This is a 12-bit timer driver from a (62.5/16384) MHz clock. It can't even + * cope with 2 seconds. + * + * Copyright 2018 Renesas Electronics Europe Ltd. + * + * Derived from Ralink RT288x watchdog timer. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DEFAULT_TIMEOUT 60 + +#define RZN1_WDT_RETRIGGER 0x0 +#define RZN1_WDT_RETRIGGER_RELOAD_VAL 0 +#define RZN1_WDT_RETRIGGER_RELOAD_VAL_MASK 0xfff +#define RZN1_WDT_RETRIGGER_PRESCALE BIT(12) +#define RZN1_WDT_RETRIGGER_ENABLE BIT(13) +#define RZN1_WDT_RETRIGGER_WDSI (0x2 << 14) + +#define RZN1_WDT_PRESCALER 16384 +#define RZN1_WDT_MAX 4095 + +struct rzn1_watchdog { + struct watchdog_device wdtdev; + void __iomem *base; + unsigned long clk_rate_khz; +}; + +static inline uint32_t max_heart_beat_ms(unsigned long clk_rate_khz) +{ + return (RZN1_WDT_MAX * RZN1_WDT_PRESCALER) / clk_rate_khz; +} + +static inline uint32_t compute_reload_value(uint32_t tick_ms, + unsigned long clk_rate_khz) +{ + return (tick_ms * clk_rate_khz) / RZN1_WDT_PRESCALER; +} + +static int rzn1_wdt_ping(struct watchdog_device *w) +{ + struct rzn1_watchdog *wdt = watchdog_get_drvdata(w); + + /* Any value retrigggers the watchdog */ + writel(0, wdt->base + RZN1_WDT_RETRIGGER); + + return 0; +} + +static int rzn1_wdt_start(struct watchdog_device *w) +{ + struct rzn1_watchdog *wdt = watchdog_get_drvdata(w); + u32 val; + + /* + * The hardware allows you to write to this reg only once. + * Since this includes the reload value, there is no way to change the + * timeout once started. Also note that the WDT clock is half the bus + * fabric clock rate, so if the bus fabric clock rate is changed after + * the WDT is started, the WDT interval will be wrong. + */ + val = RZN1_WDT_RETRIGGER_WDSI; + val |= RZN1_WDT_RETRIGGER_ENABLE; + val |= RZN1_WDT_RETRIGGER_PRESCALE; + val |= compute_reload_value(w->max_hw_heartbeat_ms, wdt->clk_rate_khz); + writel(val, wdt->base + RZN1_WDT_RETRIGGER); + + return 0; +} + +static irqreturn_t rzn1_wdt_irq(int irq, void *_wdt) +{ + pr_crit("RZN1 Watchdog. 
Initiating system reboot\n"); + emergency_restart(); + + return IRQ_HANDLED; +} + +static struct watchdog_info rzn1_wdt_info = { + .identity = "RZ/N1 Watchdog", + .options = WDIOF_MAGICCLOSE | WDIOF_SETTIMEOUT | WDIOF_KEEPALIVEPING, +}; + +static const struct watchdog_ops rzn1_wdt_ops = { + .owner = THIS_MODULE, + .start = rzn1_wdt_start, + .ping = rzn1_wdt_ping, +}; + +static void rzn1_wdt_clk_disable_unprepare(void *data) +{ + clk_disable_unprepare(data); +} + +static int rzn1_wdt_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct rzn1_watchdog *wdt; + struct device_node *np = dev->of_node; + struct clk *clk; + unsigned long clk_rate; + int ret; + int irq; + + wdt = devm_kzalloc(dev, sizeof(*wdt), GFP_KERNEL); + if (!wdt) + return -ENOMEM; + + wdt->base = devm_platform_ioremap_resource(pdev, 0); + if (IS_ERR(wdt->base)) + return PTR_ERR(wdt->base); + + irq = platform_get_irq(pdev, 0); + if (irq < 0) + return irq; + + ret = devm_request_irq(dev, irq, rzn1_wdt_irq, 0, + np->name, wdt); + if (ret) { + dev_err(dev, "failed to request irq %d\n", irq); + return ret; + } + + clk = devm_clk_get(dev, NULL); + if (IS_ERR(clk)) { + dev_err(dev, "failed to get the clock\n"); + return PTR_ERR(clk); + } + + ret = clk_prepare_enable(clk); + if (ret) { + dev_err(dev, "failed to prepare/enable the clock\n"); + return ret; + } + + ret = devm_add_action_or_reset(dev, rzn1_wdt_clk_disable_unprepare, + clk); + if (ret) + return ret; + + clk_rate = clk_get_rate(clk); + if (!clk_rate) { + dev_err(dev, "failed to get the clock rate\n"); + return -EINVAL; + } + + wdt->clk_rate_khz = clk_rate / 1000; + wdt->wdtdev.info = &rzn1_wdt_info, + wdt->wdtdev.ops = &rzn1_wdt_ops, + wdt->wdtdev.status = WATCHDOG_NOWAYOUT_INIT_STATUS, + wdt->wdtdev.parent = dev; + /* + * The period of the watchdog cannot be changed once set + * and is limited to a very short period. + * Configure it for a 1s period once and for all, and + * rely on the heart-beat provided by the watchdog core + * to make this usable by the user-space. + */ + wdt->wdtdev.max_hw_heartbeat_ms = max_heart_beat_ms(wdt->clk_rate_khz); + if (wdt->wdtdev.max_hw_heartbeat_ms > 1000) + wdt->wdtdev.max_hw_heartbeat_ms = 1000; + + wdt->wdtdev.timeout = DEFAULT_TIMEOUT; + ret = watchdog_init_timeout(&wdt->wdtdev, 0, dev); + if (ret) + return ret; + + watchdog_set_drvdata(&wdt->wdtdev, wdt); + + return devm_watchdog_register_device(dev, &wdt->wdtdev); +} + + +static const struct of_device_id rzn1_wdt_match[] = { + { .compatible = "renesas,rzn1-wdt" }, + {}, +}; +MODULE_DEVICE_TABLE(of, rzn1_wdt_match); + +static struct platform_driver rzn1_wdt_driver = { + .probe = rzn1_wdt_probe, + .driver = { + .name = KBUILD_MODNAME, + .of_match_table = rzn1_wdt_match, + }, +}; + +module_platform_driver(rzn1_wdt_driver); + +MODULE_DESCRIPTION("Renesas RZ/N1 hardware watchdog"); +MODULE_AUTHOR("Phil Edworthy "); +MODULE_LICENSE("GPL"); From 9215a90dd56e008999dbaa228c8eb2c7df98a418 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 2 May 2022 15:34:59 +0200 Subject: [PATCH 231/334] dt-bindings: watchdog: renesas,wdt: R-Car V3U is R-Car Gen4 Despite the name, R-Car V3U is the first member of the R-Car Gen4 family. Hence move its compatible value to the R-Car Gen4 section. 
Signed-off-by: Geert Uytterhoeven Acked-by: Krzysztof Kozlowski Reviewed-by: Wolfram Sang Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/2882a6de3905a57ae62d91060d27521af43c4068.1651497024.git.geert+renesas@glider.be Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- Documentation/devicetree/bindings/watchdog/renesas,wdt.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/watchdog/renesas,wdt.yaml b/Documentation/devicetree/bindings/watchdog/renesas,wdt.yaml index 514d400c7a36..a8d7dde5271b 100644 --- a/Documentation/devicetree/bindings/watchdog/renesas,wdt.yaml +++ b/Documentation/devicetree/bindings/watchdog/renesas,wdt.yaml @@ -59,11 +59,11 @@ properties: - renesas,r8a77980-wdt # R-Car V3H - renesas,r8a77990-wdt # R-Car E3 - renesas,r8a77995-wdt # R-Car D3 - - renesas,r8a779a0-wdt # R-Car V3U - const: renesas,rcar-gen3-wdt # R-Car Gen3 and RZ/G2 - items: - enum: + - renesas,r8a779a0-wdt # R-Car V3U - renesas,r8a779f0-wdt # R-Car S4-8 - const: renesas,rcar-gen4-wdt # R-Car Gen4 From 5d24df3d690809952528e7a19a43d84bc5b99d44 Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Wed, 11 May 2022 15:42:03 +0400 Subject: [PATCH 232/334] watchdog: ts4800_wdt: Fix refcount leak in ts4800_wdt_probe of_parse_phandle() returns a node pointer with refcount incremented, we should use of_node_put() on it when done. Add missing of_node_put() in some error paths. Fixes: bf9006399939 ("watchdog: ts4800: add driver for TS-4800 watchdog") Signed-off-by: Miaoqian Lin Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20220511114203.47420-1-linmq006@gmail.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/ts4800_wdt.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/watchdog/ts4800_wdt.c b/drivers/watchdog/ts4800_wdt.c index c137ad2bd5c3..0ea554c7cda5 100644 --- a/drivers/watchdog/ts4800_wdt.c +++ b/drivers/watchdog/ts4800_wdt.c @@ -125,13 +125,16 @@ static int ts4800_wdt_probe(struct platform_device *pdev) ret = of_property_read_u32_index(np, "syscon", 1, ®); if (ret < 0) { dev_err(dev, "no offset in syscon\n"); + of_node_put(syscon_np); return ret; } /* allocate memory for watchdog struct */ wdt = devm_kzalloc(dev, sizeof(*wdt), GFP_KERNEL); - if (!wdt) + if (!wdt) { + of_node_put(syscon_np); return -ENOMEM; + } /* set regmap and offset to know where to write */ wdt->feed_offset = reg; From 1807abcf8778bcbbf584fe54da9ccbe9029c49bb Mon Sep 17 00:00:00 2001 From: Hyunchul Lee Date: Sat, 30 Apr 2022 08:30:25 +0900 Subject: [PATCH 233/334] ksmbd: smbd: change prototypes of RDMA read/write related functions Change the prototypes of RDMA read/write operations to accept a pointer and length of buffer descriptors. 
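With the new prototypes, desc points at the first smb2_buffer_desc_v1 of the request's channel-info blob and desc_len is the blob size in bytes, so a transport can in principle walk several descriptors even though the transport code in this patch still only consumes desc[0]. A rough sketch of such a walk (illustrative only):

	unsigned int i, nr_desc = desc_len / sizeof(struct smb2_buffer_desc_v1);

	for (i = 0; i < nr_desc; i++) {
		u32 rkey = le32_to_cpu(desc[i].token);
		u64 raddr = le64_to_cpu(desc[i].offset);
		u32 rlen = le32_to_cpu(desc[i].length);

		/* set up one RDMA read/write for this remote region ... */
	}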
Signed-off-by: Hyunchul Lee Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/ksmbd/connection.c | 20 ++++++++++---------- fs/ksmbd/connection.h | 27 ++++++++++++++++----------- fs/ksmbd/smb2pdu.c | 23 ++++++++--------------- fs/ksmbd/transport_rdma.c | 30 +++++++++++++++++------------- 4 files changed, 51 insertions(+), 49 deletions(-) diff --git a/fs/ksmbd/connection.c b/fs/ksmbd/connection.c index 208d2cff7bd3..7db87771884a 100644 --- a/fs/ksmbd/connection.c +++ b/fs/ksmbd/connection.c @@ -205,31 +205,31 @@ int ksmbd_conn_write(struct ksmbd_work *work) return 0; } -int ksmbd_conn_rdma_read(struct ksmbd_conn *conn, void *buf, - unsigned int buflen, u32 remote_key, u64 remote_offset, - u32 remote_len) +int ksmbd_conn_rdma_read(struct ksmbd_conn *conn, + void *buf, unsigned int buflen, + struct smb2_buffer_desc_v1 *desc, + unsigned int desc_len) { int ret = -EINVAL; if (conn->transport->ops->rdma_read) ret = conn->transport->ops->rdma_read(conn->transport, buf, buflen, - remote_key, remote_offset, - remote_len); + desc, desc_len); return ret; } -int ksmbd_conn_rdma_write(struct ksmbd_conn *conn, void *buf, - unsigned int buflen, u32 remote_key, - u64 remote_offset, u32 remote_len) +int ksmbd_conn_rdma_write(struct ksmbd_conn *conn, + void *buf, unsigned int buflen, + struct smb2_buffer_desc_v1 *desc, + unsigned int desc_len) { int ret = -EINVAL; if (conn->transport->ops->rdma_write) ret = conn->transport->ops->rdma_write(conn->transport, buf, buflen, - remote_key, remote_offset, - remote_len); + desc, desc_len); return ret; } diff --git a/fs/ksmbd/connection.h b/fs/ksmbd/connection.h index 7a59aacb5daa..98c1cbe45ec9 100644 --- a/fs/ksmbd/connection.h +++ b/fs/ksmbd/connection.h @@ -122,11 +122,14 @@ struct ksmbd_transport_ops { int (*writev)(struct ksmbd_transport *t, struct kvec *iovs, int niov, int size, bool need_invalidate_rkey, unsigned int remote_key); - int (*rdma_read)(struct ksmbd_transport *t, void *buf, unsigned int len, - u32 remote_key, u64 remote_offset, u32 remote_len); - int (*rdma_write)(struct ksmbd_transport *t, void *buf, - unsigned int len, u32 remote_key, u64 remote_offset, - u32 remote_len); + int (*rdma_read)(struct ksmbd_transport *t, + void *buf, unsigned int len, + struct smb2_buffer_desc_v1 *desc, + unsigned int desc_len); + int (*rdma_write)(struct ksmbd_transport *t, + void *buf, unsigned int len, + struct smb2_buffer_desc_v1 *desc, + unsigned int desc_len); }; struct ksmbd_transport { @@ -148,12 +151,14 @@ struct ksmbd_conn *ksmbd_conn_alloc(void); void ksmbd_conn_free(struct ksmbd_conn *conn); bool ksmbd_conn_lookup_dialect(struct ksmbd_conn *c); int ksmbd_conn_write(struct ksmbd_work *work); -int ksmbd_conn_rdma_read(struct ksmbd_conn *conn, void *buf, - unsigned int buflen, u32 remote_key, u64 remote_offset, - u32 remote_len); -int ksmbd_conn_rdma_write(struct ksmbd_conn *conn, void *buf, - unsigned int buflen, u32 remote_key, u64 remote_offset, - u32 remote_len); +int ksmbd_conn_rdma_read(struct ksmbd_conn *conn, + void *buf, unsigned int buflen, + struct smb2_buffer_desc_v1 *desc, + unsigned int desc_len); +int ksmbd_conn_rdma_write(struct ksmbd_conn *conn, + void *buf, unsigned int buflen, + struct smb2_buffer_desc_v1 *desc, + unsigned int desc_len); void ksmbd_conn_enqueue_request(struct ksmbd_work *work); int ksmbd_conn_try_dequeue_request(struct ksmbd_work *work); void ksmbd_conn_init_server_callbacks(struct ksmbd_conn_ops *ops); diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index 2bdfe449b2da..07d8c3644fe3 100644 --- 
a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -6116,7 +6116,6 @@ out: static int smb2_set_remote_key_for_rdma(struct ksmbd_work *work, struct smb2_buffer_desc_v1 *desc, __le32 Channel, - __le16 ChannelInfoOffset, __le16 ChannelInfoLength) { unsigned int i, ch_count; @@ -6142,7 +6141,8 @@ static int smb2_set_remote_key_for_rdma(struct ksmbd_work *work, work->need_invalidate_rkey = (Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE); - work->remote_key = le32_to_cpu(desc->token); + if (Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE) + work->remote_key = le32_to_cpu(desc->token); return 0; } @@ -6150,14 +6150,12 @@ static ssize_t smb2_read_rdma_channel(struct ksmbd_work *work, struct smb2_read_req *req, void *data_buf, size_t length) { - struct smb2_buffer_desc_v1 *desc = - (struct smb2_buffer_desc_v1 *)&req->Buffer[0]; int err; err = ksmbd_conn_rdma_write(work->conn, data_buf, length, - le32_to_cpu(desc->token), - le64_to_cpu(desc->offset), - le32_to_cpu(desc->length)); + (struct smb2_buffer_desc_v1 *) + ((char *)req + le16_to_cpu(req->ReadChannelInfoOffset)), + le16_to_cpu(req->ReadChannelInfoLength)); if (err) return err; @@ -6201,7 +6199,6 @@ int smb2_read(struct ksmbd_work *work) (struct smb2_buffer_desc_v1 *) ((char *)req + ch_offset), req->Channel, - req->ReadChannelInfoOffset, req->ReadChannelInfoLength); if (err) goto out; @@ -6379,21 +6376,18 @@ static ssize_t smb2_write_rdma_channel(struct ksmbd_work *work, struct ksmbd_file *fp, loff_t offset, size_t length, bool sync) { - struct smb2_buffer_desc_v1 *desc; char *data_buf; int ret; ssize_t nbytes; - desc = (struct smb2_buffer_desc_v1 *)&req->Buffer[0]; - data_buf = kvmalloc(length, GFP_KERNEL | __GFP_ZERO); if (!data_buf) return -ENOMEM; ret = ksmbd_conn_rdma_read(work->conn, data_buf, length, - le32_to_cpu(desc->token), - le64_to_cpu(desc->offset), - le32_to_cpu(desc->length)); + (struct smb2_buffer_desc_v1 *) + ((char *)req + le16_to_cpu(req->WriteChannelInfoOffset)), + le16_to_cpu(req->WriteChannelInfoLength)); if (ret < 0) { kvfree(data_buf); return ret; @@ -6445,7 +6439,6 @@ int smb2_write(struct ksmbd_work *work) (struct smb2_buffer_desc_v1 *) ((char *)req + ch_offset), req->Channel, - req->WriteChannelInfoOffset, req->WriteChannelInfoLength); if (err) goto out; diff --git a/fs/ksmbd/transport_rdma.c b/fs/ksmbd/transport_rdma.c index e646d79554b8..5e34625b5faf 100644 --- a/fs/ksmbd/transport_rdma.c +++ b/fs/ksmbd/transport_rdma.c @@ -1351,14 +1351,18 @@ static void write_done(struct ib_cq *cq, struct ib_wc *wc) read_write_done(cq, wc, DMA_TO_DEVICE); } -static int smb_direct_rdma_xmit(struct smb_direct_transport *t, void *buf, - int buf_len, u32 remote_key, u64 remote_offset, - u32 remote_len, bool is_read) +static int smb_direct_rdma_xmit(struct smb_direct_transport *t, + void *buf, int buf_len, + struct smb2_buffer_desc_v1 *desc, + unsigned int desc_len, + bool is_read) { struct smb_direct_rdma_rw_msg *msg; int ret; DECLARE_COMPLETION_ONSTACK(completion); struct ib_send_wr *first_wr = NULL; + u32 remote_key = le32_to_cpu(desc[0].token); + u64 remote_offset = le64_to_cpu(desc[0].offset); ret = wait_for_credits(t, &t->wait_rw_avail_ops, &t->rw_avail_ops); if (ret < 0) @@ -1423,22 +1427,22 @@ err: return ret; } -static int smb_direct_rdma_write(struct ksmbd_transport *t, void *buf, - unsigned int buflen, u32 remote_key, - u64 remote_offset, u32 remote_len) +static int smb_direct_rdma_write(struct ksmbd_transport *t, + void *buf, unsigned int buflen, + struct smb2_buffer_desc_v1 *desc, + unsigned int desc_len) { return 
smb_direct_rdma_xmit(smb_trans_direct_transfort(t), buf, buflen, - remote_key, remote_offset, - remote_len, false); + desc, desc_len, false); } -static int smb_direct_rdma_read(struct ksmbd_transport *t, void *buf, - unsigned int buflen, u32 remote_key, - u64 remote_offset, u32 remote_len) +static int smb_direct_rdma_read(struct ksmbd_transport *t, + void *buf, unsigned int buflen, + struct smb2_buffer_desc_v1 *desc, + unsigned int desc_len) { return smb_direct_rdma_xmit(smb_trans_direct_transfort(t), buf, buflen, - remote_key, remote_offset, - remote_len, true); + desc, desc_len, true); } static void smb_direct_disconnect(struct ksmbd_transport *t) From ddbdc861e37c168cf2fb8a7b7477f5d18b4daf76 Mon Sep 17 00:00:00 2001 From: Hyunchul Lee Date: Sat, 30 Apr 2022 08:30:26 +0900 Subject: [PATCH 234/334] ksmbd: smbd: introduce read/write credits for RDMA read/write SMB2_READ/SMB2_WRITE request has to be granted the number of rw credits, the pages the request wants to transfer / the maximum pages which can be registered with one MR to read and write a file. And allocate enough RDMA resources for the maximum number of rw credits allowed by ksmbd. Signed-off-by: Hyunchul Lee Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/ksmbd/transport_rdma.c | 120 ++++++++++++++++++++++---------------- 1 file changed, 71 insertions(+), 49 deletions(-) diff --git a/fs/ksmbd/transport_rdma.c b/fs/ksmbd/transport_rdma.c index 5e34625b5faf..2edb5acfb1f6 100644 --- a/fs/ksmbd/transport_rdma.c +++ b/fs/ksmbd/transport_rdma.c @@ -80,9 +80,7 @@ static int smb_direct_max_fragmented_recv_size = 1024 * 1024; /* The maximum single-message size which can be received */ static int smb_direct_max_receive_size = 8192; -static int smb_direct_max_read_write_size = 524224; - -static int smb_direct_max_outstanding_rw_ops = 8; +static int smb_direct_max_read_write_size = 8 * 1024 * 1024; static LIST_HEAD(smb_direct_device_list); static DEFINE_RWLOCK(smb_direct_device_lock); @@ -147,10 +145,12 @@ struct smb_direct_transport { atomic_t send_credits; spinlock_t lock_new_recv_credits; int new_recv_credits; - atomic_t rw_avail_ops; + int max_rw_credits; + int pages_per_rw_credit; + atomic_t rw_credits; wait_queue_head_t wait_send_credits; - wait_queue_head_t wait_rw_avail_ops; + wait_queue_head_t wait_rw_credits; mempool_t *sendmsg_mempool; struct kmem_cache *sendmsg_cache; @@ -377,7 +377,7 @@ static struct smb_direct_transport *alloc_transport(struct rdma_cm_id *cm_id) t->reassembly_queue_length = 0; init_waitqueue_head(&t->wait_reassembly_queue); init_waitqueue_head(&t->wait_send_credits); - init_waitqueue_head(&t->wait_rw_avail_ops); + init_waitqueue_head(&t->wait_rw_credits); spin_lock_init(&t->receive_credit_lock); spin_lock_init(&t->recvmsg_queue_lock); @@ -983,18 +983,19 @@ static int smb_direct_flush_send_list(struct smb_direct_transport *t, } static int wait_for_credits(struct smb_direct_transport *t, - wait_queue_head_t *waitq, atomic_t *credits) + wait_queue_head_t *waitq, atomic_t *total_credits, + int needed) { int ret; do { - if (atomic_dec_return(credits) >= 0) + if (atomic_sub_return(needed, total_credits) >= 0) return 0; - atomic_inc(credits); + atomic_add(needed, total_credits); ret = wait_event_interruptible(*waitq, - atomic_read(credits) > 0 || - t->status != SMB_DIRECT_CS_CONNECTED); + atomic_read(total_credits) >= needed || + t->status != SMB_DIRECT_CS_CONNECTED); if (t->status != SMB_DIRECT_CS_CONNECTED) return -ENOTCONN; @@ -1015,7 +1016,19 @@ static int wait_for_send_credits(struct smb_direct_transport 
*t, return ret; } - return wait_for_credits(t, &t->wait_send_credits, &t->send_credits); + return wait_for_credits(t, &t->wait_send_credits, &t->send_credits, 1); +} + +static int wait_for_rw_credits(struct smb_direct_transport *t, int credits) +{ + return wait_for_credits(t, &t->wait_rw_credits, &t->rw_credits, credits); +} + +static int calc_rw_credits(struct smb_direct_transport *t, + char *buf, unsigned int len) +{ + return DIV_ROUND_UP(get_buf_page_count(buf, len), + t->pages_per_rw_credit); } static int smb_direct_create_header(struct smb_direct_transport *t, @@ -1331,8 +1344,8 @@ static void read_write_done(struct ib_cq *cq, struct ib_wc *wc, smb_direct_disconnect_rdma_connection(t); } - if (atomic_inc_return(&t->rw_avail_ops) > 0) - wake_up(&t->wait_rw_avail_ops); + if (atomic_inc_return(&t->rw_credits) > 0) + wake_up(&t->wait_rw_credits); rdma_rw_ctx_destroy(&msg->rw_ctx, t->qp, t->qp->port, msg->sg_list, msg->sgt.nents, dir); @@ -1363,8 +1376,10 @@ static int smb_direct_rdma_xmit(struct smb_direct_transport *t, struct ib_send_wr *first_wr = NULL; u32 remote_key = le32_to_cpu(desc[0].token); u64 remote_offset = le64_to_cpu(desc[0].offset); + int credits_needed; - ret = wait_for_credits(t, &t->wait_rw_avail_ops, &t->rw_avail_ops); + credits_needed = calc_rw_credits(t, buf, buf_len); + ret = wait_for_rw_credits(t, credits_needed); if (ret < 0) return ret; @@ -1372,7 +1387,7 @@ static int smb_direct_rdma_xmit(struct smb_direct_transport *t, msg = kmalloc(offsetof(struct smb_direct_rdma_rw_msg, sg_list) + sizeof(struct scatterlist) * SG_CHUNK_SIZE, GFP_KERNEL); if (!msg) { - atomic_inc(&t->rw_avail_ops); + atomic_add(credits_needed, &t->rw_credits); return -ENOMEM; } @@ -1381,7 +1396,7 @@ static int smb_direct_rdma_xmit(struct smb_direct_transport *t, get_buf_page_count(buf, buf_len), msg->sg_list, SG_CHUNK_SIZE); if (ret) { - atomic_inc(&t->rw_avail_ops); + atomic_add(credits_needed, &t->rw_credits); kfree(msg); return -ENOMEM; } @@ -1417,7 +1432,7 @@ static int smb_direct_rdma_xmit(struct smb_direct_transport *t, return 0; err: - atomic_inc(&t->rw_avail_ops); + atomic_add(credits_needed, &t->rw_credits); if (first_wr) rdma_rw_ctx_destroy(&msg->rw_ctx, t->qp, t->qp->port, msg->sg_list, msg->sgt.nents, @@ -1642,11 +1657,19 @@ out_err: return ret; } +static unsigned int smb_direct_get_max_fr_pages(struct smb_direct_transport *t) +{ + return min_t(unsigned int, + t->cm_id->device->attrs.max_fast_reg_page_list_len, + 256); +} + static int smb_direct_init_params(struct smb_direct_transport *t, struct ib_qp_cap *cap) { struct ib_device *device = t->cm_id->device; - int max_send_sges, max_pages, max_rw_wrs, max_send_wrs; + int max_send_sges, max_rw_wrs, max_send_wrs; + unsigned int max_sge_per_wr, wrs_per_credit; /* need 2 more sge. because a SMB_DIRECT header will be mapped, * and maybe a send buffer could be not page aligned. @@ -1658,25 +1681,31 @@ static int smb_direct_init_params(struct smb_direct_transport *t, return -EINVAL; } - /* - * allow smb_direct_max_outstanding_rw_ops of in-flight RDMA - * read/writes. HCA guarantees at least max_send_sge of sges for - * a RDMA read/write work request, and if memory registration is used, - * we need reg_mr, local_inv wrs for each read/write. + /* Calculate the number of work requests for RDMA R/W. + * The maximum number of pages which can be registered + * with one Memory region can be transferred with one + * R/W credit. 
And at least 4 work requests for each credit + * are needed for MR registration, RDMA R/W, local & remote + * MR invalidation. */ t->max_rdma_rw_size = smb_direct_max_read_write_size; - max_pages = DIV_ROUND_UP(t->max_rdma_rw_size, PAGE_SIZE) + 1; - max_rw_wrs = DIV_ROUND_UP(max_pages, SMB_DIRECT_MAX_SEND_SGES); - max_rw_wrs += rdma_rw_mr_factor(device, t->cm_id->port_num, - max_pages) * 2; - max_rw_wrs *= smb_direct_max_outstanding_rw_ops; + t->pages_per_rw_credit = smb_direct_get_max_fr_pages(t); + t->max_rw_credits = DIV_ROUND_UP(t->max_rdma_rw_size, + (t->pages_per_rw_credit - 1) * + PAGE_SIZE); + + max_sge_per_wr = min_t(unsigned int, device->attrs.max_send_sge, + device->attrs.max_sge_rd); + wrs_per_credit = max_t(unsigned int, 4, + DIV_ROUND_UP(t->pages_per_rw_credit, + max_sge_per_wr) + 1); + max_rw_wrs = t->max_rw_credits * wrs_per_credit; max_send_wrs = smb_direct_send_credit_target + max_rw_wrs; if (max_send_wrs > device->attrs.max_cqe || max_send_wrs > device->attrs.max_qp_wr) { - pr_err("consider lowering send_credit_target = %d, or max_outstanding_rw_ops = %d\n", - smb_direct_send_credit_target, - smb_direct_max_outstanding_rw_ops); + pr_err("consider lowering send_credit_target = %d\n", + smb_direct_send_credit_target); pr_err("Possible CQE overrun, device reporting max_cqe %d max_qp_wr %d\n", device->attrs.max_cqe, device->attrs.max_qp_wr); return -EINVAL; @@ -1711,7 +1740,7 @@ static int smb_direct_init_params(struct smb_direct_transport *t, t->send_credit_target = smb_direct_send_credit_target; atomic_set(&t->send_credits, 0); - atomic_set(&t->rw_avail_ops, smb_direct_max_outstanding_rw_ops); + atomic_set(&t->rw_credits, t->max_rw_credits); t->max_send_size = smb_direct_max_send_size; t->max_recv_size = smb_direct_max_receive_size; @@ -1719,12 +1748,10 @@ static int smb_direct_init_params(struct smb_direct_transport *t, cap->max_send_wr = max_send_wrs; cap->max_recv_wr = t->recv_credit_max; - cap->max_send_sge = SMB_DIRECT_MAX_SEND_SGES; + cap->max_send_sge = max_sge_per_wr; cap->max_recv_sge = SMB_DIRECT_MAX_RECV_SGES; cap->max_inline_data = 0; - cap->max_rdma_ctxs = - rdma_rw_mr_factor(device, t->cm_id->port_num, max_pages) * - smb_direct_max_outstanding_rw_ops; + cap->max_rdma_ctxs = t->max_rw_credits; return 0; } @@ -1817,7 +1844,8 @@ static int smb_direct_create_qpair(struct smb_direct_transport *t, } t->send_cq = ib_alloc_cq(t->cm_id->device, t, - t->send_credit_target, 0, IB_POLL_WORKQUEUE); + smb_direct_send_credit_target + cap->max_rdma_ctxs, + 0, IB_POLL_WORKQUEUE); if (IS_ERR(t->send_cq)) { pr_err("Can't create RDMA send CQ\n"); ret = PTR_ERR(t->send_cq); @@ -1826,8 +1854,7 @@ static int smb_direct_create_qpair(struct smb_direct_transport *t, } t->recv_cq = ib_alloc_cq(t->cm_id->device, t, - cap->max_send_wr + cap->max_rdma_ctxs, - 0, IB_POLL_WORKQUEUE); + t->recv_credit_max, 0, IB_POLL_WORKQUEUE); if (IS_ERR(t->recv_cq)) { pr_err("Can't create RDMA recv CQ\n"); ret = PTR_ERR(t->recv_cq); @@ -1856,17 +1883,12 @@ static int smb_direct_create_qpair(struct smb_direct_transport *t, pages_per_rw = DIV_ROUND_UP(t->max_rdma_rw_size, PAGE_SIZE) + 1; if (pages_per_rw > t->cm_id->device->attrs.max_sgl_rd) { - int pages_per_mr, mr_count; - - pages_per_mr = min_t(int, pages_per_rw, - t->cm_id->device->attrs.max_fast_reg_page_list_len); - mr_count = DIV_ROUND_UP(pages_per_rw, pages_per_mr) * - atomic_read(&t->rw_avail_ops); - ret = ib_mr_pool_init(t->qp, &t->qp->rdma_mrs, mr_count, - IB_MR_TYPE_MEM_REG, pages_per_mr, 0); + ret = ib_mr_pool_init(t->qp, &t->qp->rdma_mrs, + 
t->max_rw_credits, IB_MR_TYPE_MEM_REG, + t->pages_per_rw_credit, 0); if (ret) { pr_err("failed to init mr pool count %d pages %d\n", - mr_count, pages_per_mr); + t->max_rw_credits, t->pages_per_rw_credit); goto err; } } From 11659a8ddbd9c4c1ab6f3b8f52837178ef121b20 Mon Sep 17 00:00:00 2001 From: Hyunchul Lee Date: Sat, 30 Apr 2022 08:30:27 +0900 Subject: [PATCH 235/334] ksmbd: smbd: simplify tracking pending packets Because we don't have to tracking pending packets by dividing these into packets with payload and packets without payload, merge the tracking code. Signed-off-by: Hyunchul Lee Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/ksmbd/transport_rdma.c | 34 +++++++--------------------------- 1 file changed, 7 insertions(+), 27 deletions(-) diff --git a/fs/ksmbd/transport_rdma.c b/fs/ksmbd/transport_rdma.c index 2edb5acfb1f6..4372d631735e 100644 --- a/fs/ksmbd/transport_rdma.c +++ b/fs/ksmbd/transport_rdma.c @@ -157,8 +157,6 @@ struct smb_direct_transport { mempool_t *recvmsg_mempool; struct kmem_cache *recvmsg_cache; - wait_queue_head_t wait_send_payload_pending; - atomic_t send_payload_pending; wait_queue_head_t wait_send_pending; atomic_t send_pending; @@ -386,8 +384,6 @@ static struct smb_direct_transport *alloc_transport(struct rdma_cm_id *cm_id) spin_lock_init(&t->empty_recvmsg_queue_lock); INIT_LIST_HEAD(&t->empty_recvmsg_queue); - init_waitqueue_head(&t->wait_send_payload_pending); - atomic_set(&t->send_payload_pending, 0); init_waitqueue_head(&t->wait_send_pending); atomic_set(&t->send_pending, 0); @@ -417,8 +413,6 @@ static void free_transport(struct smb_direct_transport *t) wake_up_interruptible(&t->wait_send_credits); ksmbd_debug(RDMA, "wait for all send posted to IB to finish\n"); - wait_event(t->wait_send_payload_pending, - atomic_read(&t->send_payload_pending) == 0); wait_event(t->wait_send_pending, atomic_read(&t->send_pending) == 0); @@ -873,13 +867,8 @@ static void send_done(struct ib_cq *cq, struct ib_wc *wc) smb_direct_disconnect_rdma_connection(t); } - if (sendmsg->num_sge > 1) { - if (atomic_dec_and_test(&t->send_payload_pending)) - wake_up(&t->wait_send_payload_pending); - } else { - if (atomic_dec_and_test(&t->send_pending)) - wake_up(&t->wait_send_pending); - } + if (atomic_dec_and_test(&t->send_pending)) + wake_up(&t->wait_send_pending); /* iterate and free the list of messages in reverse. the list's head * is invalid. 
@@ -911,21 +900,12 @@ static int smb_direct_post_send(struct smb_direct_transport *t, { int ret; - if (wr->num_sge > 1) - atomic_inc(&t->send_payload_pending); - else - atomic_inc(&t->send_pending); - + atomic_inc(&t->send_pending); ret = ib_post_send(t->qp, wr, NULL); if (ret) { pr_err("failed to post send: %d\n", ret); - if (wr->num_sge > 1) { - if (atomic_dec_and_test(&t->send_payload_pending)) - wake_up(&t->wait_send_payload_pending); - } else { - if (atomic_dec_and_test(&t->send_pending)) - wake_up(&t->wait_send_pending); - } + if (atomic_dec_and_test(&t->send_pending)) + wake_up(&t->wait_send_pending); smb_direct_disconnect_rdma_connection(t); } return ret; @@ -1326,8 +1306,8 @@ done: * that means all the I/Os have been out and we are good to return */ - wait_event(st->wait_send_payload_pending, - atomic_read(&st->send_payload_pending) == 0); + wait_event(st->wait_send_pending, + atomic_read(&st->send_pending) == 0); return ret; } From 4e3edd0092704b25626a0fe60a974f6f382ff93d Mon Sep 17 00:00:00 2001 From: Hyunchul Lee Date: Sat, 30 Apr 2022 08:30:28 +0900 Subject: [PATCH 236/334] ksmbd: smbd: change the return value of get_sg_list Make get_sg_list return EINVAL if there aren't mapped scatterlists. Signed-off-by: Hyunchul Lee Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/ksmbd/transport_rdma.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ksmbd/transport_rdma.c b/fs/ksmbd/transport_rdma.c index 4372d631735e..696ffd2ae661 100644 --- a/fs/ksmbd/transport_rdma.c +++ b/fs/ksmbd/transport_rdma.c @@ -1079,7 +1079,7 @@ static int get_sg_list(void *buf, int size, struct scatterlist *sg_list, int nen int offset, len; int i = 0; - if (nentries < get_buf_page_count(buf, size)) + if (size <= 0 || nentries < get_buf_page_count(buf, size)) return -EINVAL; offset = offset_in_page(buf); @@ -1111,7 +1111,7 @@ static int get_mapped_sg_list(struct ib_device *device, void *buf, int size, int npages; npages = get_sg_list(buf, size, sg_list, nentries); - if (npages <= 0) + if (npages < 0) return -EINVAL; return ib_dma_map_sg(device, sg_list, npages, dir); } From ee1b0558965909872775183dc237cdf9f8eddaba Mon Sep 17 00:00:00 2001 From: Hyunchul Lee Date: Sat, 30 Apr 2022 08:30:29 +0900 Subject: [PATCH 237/334] ksmbd: smbd: handle multiple Buffer descriptors Make ksmbd handle multiple buffer descriptors when reading and writing files using SMB direct: Post the work requests of rdma_rw_ctx for RDMA read/write in smb_direct_rdma_xmit(), and the work request for the READ/WRITE response with a remote invalidation in smb_direct_writev(). 
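For illustration, the per-descriptor credit accounting this change introduces can be modelled in a few lines of stand-alone C. The struct below is a simplified stand-in for smb2_buffer_desc_v1 and PAGES_PER_CREDIT is an assumed cap (the driver derives the real value from the device's fast-registration limit); only the arithmetic mirrors the patch.

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE        4096u
#define PAGES_PER_CREDIT 256u	/* assumed cap, cf. smb_direct_get_max_fr_pages() */

struct buf_desc {		/* simplified stand-in for smb2_buffer_desc_v1 */
	uint64_t offset;
	uint32_t token;
	uint32_t length;
};

/* number of pages touched by a buffer at 'addr' spanning 'len' bytes */
static unsigned int buf_page_count(uintptr_t addr, uint32_t len)
{
	return (unsigned int)((addr + len + PAGE_SIZE - 1) / PAGE_SIZE -
			      addr / PAGE_SIZE);
}

static unsigned int calc_rw_credits(uintptr_t addr, uint32_t len)
{
	unsigned int pages = buf_page_count(addr, len);

	return (pages + PAGES_PER_CREDIT - 1) / PAGES_PER_CREDIT;
}

int main(void)
{
	struct buf_desc desc[2] = {
		{ .offset = 0,        .token = 1, .length = 1024 * 1024 },
		{ .offset = 1u << 20, .token = 2, .length =  512 * 1024 },
	};
	uintptr_t buf = 0x1000;	/* arbitrary example address */
	unsigned int i, credits = 0;

	/* one rdma_rw_ctx -- and one contribution to the credit sum -- per descriptor */
	for (i = 0; i < 2; i++) {
		credits += calc_rw_credits(buf, desc[i].length);
		buf += desc[i].length;
	}
	printf("credits needed: %u\n", credits);
	return 0;
}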
Signed-off-by: Hyunchul Lee Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/ksmbd/smb2pdu.c | 5 +- fs/ksmbd/transport_rdma.c | 168 ++++++++++++++++++++++++-------------- 2 files changed, 109 insertions(+), 64 deletions(-) diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index 07d8c3644fe3..06ed71fd3e1a 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -6133,11 +6133,8 @@ static int smb2_set_remote_key_for_rdma(struct ksmbd_work *work, le32_to_cpu(desc[i].length)); } } - if (ch_count != 1) { - ksmbd_debug(RDMA, "RDMA multiple buffer descriptors %d are not supported yet\n", - ch_count); + if (!ch_count) return -EINVAL; - } work->need_invalidate_rkey = (Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE); diff --git a/fs/ksmbd/transport_rdma.c b/fs/ksmbd/transport_rdma.c index 696ffd2ae661..19a605fd46ff 100644 --- a/fs/ksmbd/transport_rdma.c +++ b/fs/ksmbd/transport_rdma.c @@ -206,7 +206,9 @@ struct smb_direct_recvmsg { struct smb_direct_rdma_rw_msg { struct smb_direct_transport *t; struct ib_cqe cqe; + int status; struct completion *completion; + struct list_head list; struct rdma_rw_ctx rw_ctx; struct sg_table sgt; struct scatterlist sg_list[]; @@ -1311,6 +1313,16 @@ done: return ret; } +static void smb_direct_free_rdma_rw_msg(struct smb_direct_transport *t, + struct smb_direct_rdma_rw_msg *msg, + enum dma_data_direction dir) +{ + rdma_rw_ctx_destroy(&msg->rw_ctx, t->qp, t->qp->port, + msg->sgt.sgl, msg->sgt.nents, dir); + sg_free_table_chained(&msg->sgt, SG_CHUNK_SIZE); + kfree(msg); +} + static void read_write_done(struct ib_cq *cq, struct ib_wc *wc, enum dma_data_direction dir) { @@ -1319,19 +1331,14 @@ static void read_write_done(struct ib_cq *cq, struct ib_wc *wc, struct smb_direct_transport *t = msg->t; if (wc->status != IB_WC_SUCCESS) { + msg->status = -EIO; pr_err("read/write error. opcode = %d, status = %s(%d)\n", wc->opcode, ib_wc_status_msg(wc->status), wc->status); - smb_direct_disconnect_rdma_connection(t); + if (wc->status != IB_WC_WR_FLUSH_ERR) + smb_direct_disconnect_rdma_connection(t); } - if (atomic_inc_return(&t->rw_credits) > 0) - wake_up(&t->wait_rw_credits); - - rdma_rw_ctx_destroy(&msg->rw_ctx, t->qp, t->qp->port, - msg->sg_list, msg->sgt.nents, dir); - sg_free_table_chained(&msg->sgt, SG_CHUNK_SIZE); complete(msg->completion); - kfree(msg); } static void read_done(struct ib_cq *cq, struct ib_wc *wc) @@ -1350,75 +1357,116 @@ static int smb_direct_rdma_xmit(struct smb_direct_transport *t, unsigned int desc_len, bool is_read) { - struct smb_direct_rdma_rw_msg *msg; - int ret; + struct smb_direct_rdma_rw_msg *msg, *next_msg; + int i, ret; DECLARE_COMPLETION_ONSTACK(completion); - struct ib_send_wr *first_wr = NULL; - u32 remote_key = le32_to_cpu(desc[0].token); - u64 remote_offset = le64_to_cpu(desc[0].offset); + struct ib_send_wr *first_wr; + LIST_HEAD(msg_list); + char *desc_buf; int credits_needed; + unsigned int desc_buf_len; + size_t total_length = 0; + + if (t->status != SMB_DIRECT_CS_CONNECTED) + return -ENOTCONN; + + /* calculate needed credits */ + credits_needed = 0; + desc_buf = buf; + for (i = 0; i < desc_len / sizeof(*desc); i++) { + desc_buf_len = le32_to_cpu(desc[i].length); + + credits_needed += calc_rw_credits(t, desc_buf, desc_buf_len); + desc_buf += desc_buf_len; + total_length += desc_buf_len; + if (desc_buf_len == 0 || total_length > buf_len || + total_length > t->max_rdma_rw_size) + return -EINVAL; + } + + ksmbd_debug(RDMA, "RDMA %s, len %#x, needed credits %#x\n", + is_read ? 
"read" : "write", buf_len, credits_needed); - credits_needed = calc_rw_credits(t, buf, buf_len); ret = wait_for_rw_credits(t, credits_needed); if (ret < 0) return ret; - /* TODO: mempool */ - msg = kmalloc(offsetof(struct smb_direct_rdma_rw_msg, sg_list) + - sizeof(struct scatterlist) * SG_CHUNK_SIZE, GFP_KERNEL); - if (!msg) { - atomic_add(credits_needed, &t->rw_credits); - return -ENOMEM; + /* build rdma_rw_ctx for each descriptor */ + desc_buf = buf; + for (i = 0; i < desc_len / sizeof(*desc); i++) { + msg = kzalloc(offsetof(struct smb_direct_rdma_rw_msg, sg_list) + + sizeof(struct scatterlist) * SG_CHUNK_SIZE, GFP_KERNEL); + if (!msg) { + ret = -ENOMEM; + goto out; + } + + desc_buf_len = le32_to_cpu(desc[i].length); + + msg->t = t; + msg->cqe.done = is_read ? read_done : write_done; + msg->completion = &completion; + + msg->sgt.sgl = &msg->sg_list[0]; + ret = sg_alloc_table_chained(&msg->sgt, + get_buf_page_count(desc_buf, desc_buf_len), + msg->sg_list, SG_CHUNK_SIZE); + if (ret) { + kfree(msg); + ret = -ENOMEM; + goto out; + } + + ret = get_sg_list(desc_buf, desc_buf_len, + msg->sgt.sgl, msg->sgt.orig_nents); + if (ret < 0) { + sg_free_table_chained(&msg->sgt, SG_CHUNK_SIZE); + kfree(msg); + goto out; + } + + ret = rdma_rw_ctx_init(&msg->rw_ctx, t->qp, t->qp->port, + msg->sgt.sgl, + get_buf_page_count(desc_buf, desc_buf_len), + 0, + le64_to_cpu(desc[i].offset), + le32_to_cpu(desc[i].token), + is_read ? DMA_FROM_DEVICE : DMA_TO_DEVICE); + if (ret < 0) { + pr_err("failed to init rdma_rw_ctx: %d\n", ret); + sg_free_table_chained(&msg->sgt, SG_CHUNK_SIZE); + kfree(msg); + goto out; + } + + list_add_tail(&msg->list, &msg_list); + desc_buf += desc_buf_len; } - msg->sgt.sgl = &msg->sg_list[0]; - ret = sg_alloc_table_chained(&msg->sgt, - get_buf_page_count(buf, buf_len), - msg->sg_list, SG_CHUNK_SIZE); - if (ret) { - atomic_add(credits_needed, &t->rw_credits); - kfree(msg); - return -ENOMEM; + /* concatenate work requests of rdma_rw_ctxs */ + first_wr = NULL; + list_for_each_entry_reverse(msg, &msg_list, list) { + first_wr = rdma_rw_ctx_wrs(&msg->rw_ctx, t->qp, t->qp->port, + &msg->cqe, first_wr); } - ret = get_sg_list(buf, buf_len, msg->sgt.sgl, msg->sgt.orig_nents); - if (ret <= 0) { - pr_err("failed to get pages\n"); - goto err; - } - - ret = rdma_rw_ctx_init(&msg->rw_ctx, t->qp, t->qp->port, - msg->sg_list, get_buf_page_count(buf, buf_len), - 0, remote_offset, remote_key, - is_read ? DMA_FROM_DEVICE : DMA_TO_DEVICE); - if (ret < 0) { - pr_err("failed to init rdma_rw_ctx: %d\n", ret); - goto err; - } - - msg->t = t; - msg->cqe.done = is_read ? read_done : write_done; - msg->completion = &completion; - first_wr = rdma_rw_ctx_wrs(&msg->rw_ctx, t->qp, t->qp->port, - &msg->cqe, NULL); - ret = ib_post_send(t->qp, first_wr, NULL); if (ret) { - pr_err("failed to post send wr: %d\n", ret); - goto err; + pr_err("failed to post send wr for RDMA R/W: %d\n", ret); + goto out; } + msg = list_last_entry(&msg_list, struct smb_direct_rdma_rw_msg, list); wait_for_completion(&completion); - return 0; - -err: + ret = msg->status; +out: + list_for_each_entry_safe(msg, next_msg, &msg_list, list) { + list_del(&msg->list); + smb_direct_free_rdma_rw_msg(t, msg, + is_read ? DMA_FROM_DEVICE : DMA_TO_DEVICE); + } atomic_add(credits_needed, &t->rw_credits); - if (first_wr) - rdma_rw_ctx_destroy(&msg->rw_ctx, t->qp, t->qp->port, - msg->sg_list, msg->sgt.nents, - is_read ? 
DMA_FROM_DEVICE : DMA_TO_DEVICE); - sg_free_table_chained(&msg->sgt, SG_CHUNK_SIZE); - kfree(msg); + wake_up(&t->wait_rw_credits); return ret; } From 65ca7a3ffff811d6c0d4342d467c381257d566d4 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Mon, 16 May 2022 16:22:09 +0900 Subject: [PATCH 238/334] ksmbd: handle smb2 query dir request for OutputBufferLength that is too small We found the issue that ksmbd return STATUS_NO_MORE_FILES response even though there are still dentries that needs to be read while file read/write test using framtest utils. windows client send smb2 query dir request included OutputBufferLength(128) that is too small to contain even one entry. This patch make ksmbd immediately returns OutputBufferLength of response as zero to client. Signed-off-by: Namjae Jeon Reviewed-by: Hyunchul Lee Signed-off-by: Steve French --- fs/ksmbd/smb2pdu.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index 06ed71fd3e1a..6bc30dd34999 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -3938,6 +3938,12 @@ int smb2_query_dir(struct ksmbd_work *work) set_ctx_actor(&dir_fp->readdir_data.ctx, __query_dir); rc = iterate_dir(dir_fp->filp, &dir_fp->readdir_data.ctx); + /* + * req->OutputBufferLength is too small to contain even one entry. + * In this case, it immediately returns OutputBufferLength 0 to client. + */ + if (!d_info.out_buf_len && !d_info.num_entry) + goto no_buf_len; if (rc == 0) restart_ctx(&dir_fp->readdir_data.ctx); if (rc == -ENOSPC) @@ -3964,10 +3970,12 @@ int smb2_query_dir(struct ksmbd_work *work) rsp->Buffer[0] = 0; inc_rfc1001_len(work->response_buf, 9); } else { +no_buf_len: ((struct file_directory_info *) ((char *)rsp->Buffer + d_info.last_entry_offset)) ->NextEntryOffset = 0; - d_info.data_count -= d_info.last_entry_off_align; + if (d_info.data_count >= d_info.last_entry_off_align) + d_info.data_count -= d_info.last_entry_off_align; rsp->StructureSize = cpu_to_le16(9); rsp->OutputBufferOffset = cpu_to_le16(72); From 65bb45b97b578c8eed1ffa80caec84708df49729 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Mon, 16 May 2022 16:22:43 +0900 Subject: [PATCH 239/334] ksmbd: add smbd max io size parameter Add 'smbd max io size' parameter to adjust smbd-direct max read/write size. 
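The new value reaches the kernel through the ksmbd_startup_request field added below and is clamped before use. A stand-alone sketch of that clamp follows; the SMBD_* limits are copied from the hunk further down, the rest is a user-space rewrite for illustration only.

#include <stdio.h>

#define SMBD_MIN_IOSIZE     (512 * 1024)
#define SMBD_DEFAULT_IOSIZE (8 * 1024 * 1024)
#define SMBD_MAX_IOSIZE     (16 * 1024 * 1024)

static unsigned int smb_direct_max_read_write_size = SMBD_DEFAULT_IOSIZE;

/* mirrors init_smbd_max_io_size(): out-of-range values are clamped, not rejected */
static void init_smbd_max_io_size(unsigned int sz)
{
	if (sz < SMBD_MIN_IOSIZE)
		sz = SMBD_MIN_IOSIZE;
	else if (sz > SMBD_MAX_IOSIZE)
		sz = SMBD_MAX_IOSIZE;
	smb_direct_max_read_write_size = sz;
}

int main(void)
{
	init_smbd_max_io_size(64 * 1024 * 1024);	/* too large: clamped to 16 MiB */
	printf("smbd max io size: %u bytes\n", smb_direct_max_read_write_size);
	return 0;
}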
Signed-off-by: Namjae Jeon Reviewed-by: Hyunchul Lee Signed-off-by: Steve French --- fs/ksmbd/ksmbd_netlink.h | 3 ++- fs/ksmbd/transport_ipc.c | 3 +++ fs/ksmbd/transport_rdma.c | 8 +++++++- fs/ksmbd/transport_rdma.h | 6 ++++++ 4 files changed, 18 insertions(+), 2 deletions(-) diff --git a/fs/ksmbd/ksmbd_netlink.h b/fs/ksmbd/ksmbd_netlink.h index ebe6ca08467a..52aa0adeb951 100644 --- a/fs/ksmbd/ksmbd_netlink.h +++ b/fs/ksmbd/ksmbd_netlink.h @@ -104,7 +104,8 @@ struct ksmbd_startup_request { */ __u32 sub_auth[3]; /* Subauth value for Security ID */ __u32 smb2_max_credits; /* MAX credits */ - __u32 reserved[128]; /* Reserved room */ + __u32 smbd_max_io_size; /* smbd read write size */ + __u32 reserved[127]; /* Reserved room */ __u32 ifc_list_sz; /* interfaces list size */ __s8 ____payload[]; }; diff --git a/fs/ksmbd/transport_ipc.c b/fs/ksmbd/transport_ipc.c index 3ad6881e0f7e..7cb0eeb07c80 100644 --- a/fs/ksmbd/transport_ipc.c +++ b/fs/ksmbd/transport_ipc.c @@ -26,6 +26,7 @@ #include "mgmt/ksmbd_ida.h" #include "connection.h" #include "transport_tcp.h" +#include "transport_rdma.h" #define IPC_WAIT_TIMEOUT (2 * HZ) @@ -303,6 +304,8 @@ static int ipc_server_config_on_startup(struct ksmbd_startup_request *req) init_smb2_max_trans_size(req->smb2_max_trans); if (req->smb2_max_credits) init_smb2_max_credits(req->smb2_max_credits); + if (req->smbd_max_io_size) + init_smbd_max_io_size(req->smbd_max_io_size); ret = ksmbd_set_netbios_name(req->netbios_name); ret |= ksmbd_set_server_string(req->server_string); diff --git a/fs/ksmbd/transport_rdma.c b/fs/ksmbd/transport_rdma.c index 19a605fd46ff..6d652ff38b82 100644 --- a/fs/ksmbd/transport_rdma.c +++ b/fs/ksmbd/transport_rdma.c @@ -80,7 +80,7 @@ static int smb_direct_max_fragmented_recv_size = 1024 * 1024; /* The maximum single-message size which can be received */ static int smb_direct_max_receive_size = 8192; -static int smb_direct_max_read_write_size = 8 * 1024 * 1024; +static int smb_direct_max_read_write_size = SMBD_DEFAULT_IOSIZE; static LIST_HEAD(smb_direct_device_list); static DEFINE_RWLOCK(smb_direct_device_lock); @@ -214,6 +214,12 @@ struct smb_direct_rdma_rw_msg { struct scatterlist sg_list[]; }; +void init_smbd_max_io_size(unsigned int sz) +{ + sz = clamp_val(sz, SMBD_MIN_IOSIZE, SMBD_MAX_IOSIZE); + smb_direct_max_read_write_size = sz; +} + static inline int get_buf_page_count(void *buf, int size) { return DIV_ROUND_UP((uintptr_t)buf + size, PAGE_SIZE) - diff --git a/fs/ksmbd/transport_rdma.h b/fs/ksmbd/transport_rdma.h index 5567d93a6f96..e7b4e6790fab 100644 --- a/fs/ksmbd/transport_rdma.h +++ b/fs/ksmbd/transport_rdma.h @@ -7,6 +7,10 @@ #ifndef __KSMBD_TRANSPORT_RDMA_H__ #define __KSMBD_TRANSPORT_RDMA_H__ +#define SMBD_DEFAULT_IOSIZE (8 * 1024 * 1024) +#define SMBD_MIN_IOSIZE (512 * 1024) +#define SMBD_MAX_IOSIZE (16 * 1024 * 1024) + /* SMB DIRECT negotiation request packet [MS-SMBD] 2.2.1 */ struct smb_direct_negotiate_req { __le16 min_version; @@ -52,10 +56,12 @@ struct smb_direct_data_transfer { int ksmbd_rdma_init(void); void ksmbd_rdma_destroy(void); bool ksmbd_rdma_capable_netdev(struct net_device *netdev); +void init_smbd_max_io_size(unsigned int sz); #else static inline int ksmbd_rdma_init(void) { return 0; } static inline int ksmbd_rdma_destroy(void) { return 0; } static inline bool ksmbd_rdma_capable_netdev(struct net_device *netdev) { return false; } +static inline void init_smbd_max_io_size(unsigned int sz) { } #endif #endif /* __KSMBD_TRANSPORT_RDMA_H__ */ From 7a84399e1ce3f5f2fbec3e7dd93459ba25badc2f Mon Sep 17 00:00:00 2001 
From: Namjae Jeon Date: Mon, 16 May 2022 16:23:28 +0900 Subject: [PATCH 240/334] ksmbd: fix wrong smbd max read/write size check smb-direct max read/write size can be different with smb2 max read/write size. So smb2_read() can return error by wrong max read/write size check. This patch use smb_direct_max_read_write_size for this check in smb-direct read/write(). Signed-off-by: Namjae Jeon Reviewed-by: Hyunchul Lee Signed-off-by: Steve French --- fs/ksmbd/smb2pdu.c | 39 +++++++++++++++++++++++++-------------- fs/ksmbd/transport_rdma.c | 5 +++++ fs/ksmbd/transport_rdma.h | 2 ++ 3 files changed, 32 insertions(+), 14 deletions(-) diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index 6bc30dd34999..e6f4ccc12f49 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -6183,6 +6183,8 @@ int smb2_read(struct ksmbd_work *work) size_t length, mincount; ssize_t nbytes = 0, remain_bytes = 0; int err = 0; + bool is_rdma_channel = false; + unsigned int max_read_size = conn->vals->max_read_size; WORK_BUFFERS(work, req, rsp); @@ -6194,6 +6196,11 @@ int smb2_read(struct ksmbd_work *work) if (req->Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE || req->Channel == SMB2_CHANNEL_RDMA_V1) { + is_rdma_channel = true; + max_read_size = get_smbd_max_read_write_size(); + } + + if (is_rdma_channel == true) { unsigned int ch_offset = le16_to_cpu(req->ReadChannelInfoOffset); if (ch_offset < offsetof(struct smb2_read_req, Buffer)) { @@ -6225,9 +6232,9 @@ int smb2_read(struct ksmbd_work *work) length = le32_to_cpu(req->Length); mincount = le32_to_cpu(req->MinimumCount); - if (length > conn->vals->max_read_size) { + if (length > max_read_size) { ksmbd_debug(SMB, "limiting read size to max size(%u)\n", - conn->vals->max_read_size); + max_read_size); err = -EINVAL; goto out; } @@ -6259,8 +6266,7 @@ int smb2_read(struct ksmbd_work *work) ksmbd_debug(SMB, "nbytes %zu, offset %lld mincount %zu\n", nbytes, offset, mincount); - if (req->Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE || - req->Channel == SMB2_CHANNEL_RDMA_V1) { + if (is_rdma_channel == true) { /* write data to the client using rdma channel */ remain_bytes = smb2_read_rdma_channel(work, req, work->aux_payload_buf, @@ -6421,8 +6427,9 @@ int smb2_write(struct ksmbd_work *work) size_t length; ssize_t nbytes; char *data_buf; - bool writethrough = false; + bool writethrough = false, is_rdma_channel = false; int err = 0; + unsigned int max_write_size = work->conn->vals->max_write_size; WORK_BUFFERS(work, req, rsp); @@ -6431,8 +6438,17 @@ int smb2_write(struct ksmbd_work *work) return smb2_write_pipe(work); } + offset = le64_to_cpu(req->Offset); + length = le32_to_cpu(req->Length); + if (req->Channel == SMB2_CHANNEL_RDMA_V1 || req->Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE) { + is_rdma_channel = true; + max_write_size = get_smbd_max_read_write_size(); + length = le32_to_cpu(req->RemainingBytes); + } + + if (is_rdma_channel == true) { unsigned int ch_offset = le16_to_cpu(req->WriteChannelInfoOffset); if (req->Length != 0 || req->DataOffset != 0 || @@ -6467,12 +6483,9 @@ int smb2_write(struct ksmbd_work *work) goto out; } - offset = le64_to_cpu(req->Offset); - length = le32_to_cpu(req->Length); - - if (length > work->conn->vals->max_write_size) { + if (length > max_write_size) { ksmbd_debug(SMB, "limiting write size to max size(%u)\n", - work->conn->vals->max_write_size); + max_write_size); err = -EINVAL; goto out; } @@ -6480,8 +6493,7 @@ int smb2_write(struct ksmbd_work *work) if (le32_to_cpu(req->Flags) & SMB2_WRITEFLAG_WRITE_THROUGH) writethrough = true; - if 
(req->Channel != SMB2_CHANNEL_RDMA_V1 && - req->Channel != SMB2_CHANNEL_RDMA_V1_INVALIDATE) { + if (is_rdma_channel == false) { if ((u64)le16_to_cpu(req->DataOffset) + length > get_rfc1002_len(work->request_buf)) { pr_err("invalid write data offset %u, smb_len %u\n", @@ -6507,8 +6519,7 @@ int smb2_write(struct ksmbd_work *work) /* read data from the client using rdma channel, and * write the data. */ - nbytes = smb2_write_rdma_channel(work, req, fp, offset, - le32_to_cpu(req->RemainingBytes), + nbytes = smb2_write_rdma_channel(work, req, fp, offset, length, writethrough); if (nbytes < 0) { err = (int)nbytes; diff --git a/fs/ksmbd/transport_rdma.c b/fs/ksmbd/transport_rdma.c index 6d652ff38b82..0741fd129d16 100644 --- a/fs/ksmbd/transport_rdma.c +++ b/fs/ksmbd/transport_rdma.c @@ -220,6 +220,11 @@ void init_smbd_max_io_size(unsigned int sz) smb_direct_max_read_write_size = sz; } +unsigned int get_smbd_max_read_write_size(void) +{ + return smb_direct_max_read_write_size; +} + static inline int get_buf_page_count(void *buf, int size) { return DIV_ROUND_UP((uintptr_t)buf + size, PAGE_SIZE) - diff --git a/fs/ksmbd/transport_rdma.h b/fs/ksmbd/transport_rdma.h index e7b4e6790fab..77aee4e5c9dc 100644 --- a/fs/ksmbd/transport_rdma.h +++ b/fs/ksmbd/transport_rdma.h @@ -57,11 +57,13 @@ int ksmbd_rdma_init(void); void ksmbd_rdma_destroy(void); bool ksmbd_rdma_capable_netdev(struct net_device *netdev); void init_smbd_max_io_size(unsigned int sz); +unsigned int get_smbd_max_read_write_size(void); #else static inline int ksmbd_rdma_init(void) { return 0; } static inline int ksmbd_rdma_destroy(void) { return 0; } static inline bool ksmbd_rdma_capable_netdev(struct net_device *netdev) { return false; } static inline void init_smbd_max_io_size(unsigned int sz) { } +static inline unsigned int get_smbd_max_read_write_size(void) { return 0; } #endif #endif /* __KSMBD_TRANSPORT_RDMA_H__ */ From 7820c6ee029548290b318e522eb2578516d05393 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Thu, 12 May 2022 15:56:05 +0800 Subject: [PATCH 241/334] ksmbd: Fix some kernel-doc comments Remove some warnings found by running scripts/kernel-doc, which is caused by using 'make W=1'. 
fs/ksmbd/misc.c:30: warning: Function parameter or member 'str' not described in 'match_pattern' fs/ksmbd/misc.c:30: warning: Excess function parameter 'string' description in 'match_pattern' fs/ksmbd/misc.c:163: warning: Function parameter or member 'share' not described in 'convert_to_nt_pathname' fs/ksmbd/misc.c:163: warning: Function parameter or member 'path' not described in 'convert_to_nt_pathname' fs/ksmbd/misc.c:163: warning: Excess function parameter 'filename' description in 'convert_to_nt_pathname' fs/ksmbd/misc.c:163: warning: Excess function parameter 'sharepath' description in 'convert_to_nt_pathname' fs/ksmbd/misc.c:259: warning: Function parameter or member 'share' not described in 'convert_to_unix_name' fs/ksmbd/misc.c:259: warning: Function parameter or member 'name' not described in 'convert_to_unix_name' fs/ksmbd/misc.c:259: warning: Excess function parameter 'path' description in 'convert_to_unix_name' fs/ksmbd/misc.c:259: warning: Excess function parameter 'tid' description in 'convert_to_unix_name' Reported-by: Abaci Robot Signed-off-by: Yang Li Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/ksmbd/misc.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/ksmbd/misc.c b/fs/ksmbd/misc.c index 1e2076a53bed..df991107ad2c 100644 --- a/fs/ksmbd/misc.c +++ b/fs/ksmbd/misc.c @@ -20,7 +20,7 @@ * wildcard '*' and '?' * TODO : implement consideration about DOS_DOT, DOS_QM and DOS_STAR * - * @string: string to compare with a pattern + * @str: string to compare with a pattern * @len: string length * @pattern: pattern string which might include wildcard '*' and '?' * @@ -152,8 +152,8 @@ out: /** * convert_to_nt_pathname() - extract and return windows path string * whose share directory prefix was removed from file path - * @filename : unix filename - * @sharepath: share path string + * @share: ksmbd_share_config pointer + * @path: path to report * * Return : windows path string or error */ @@ -250,8 +250,8 @@ char *ksmbd_extract_sharename(char *treename) /** * convert_to_unix_name() - convert windows name to unix format - * @path: name to be converted - * @tid: tree id of mathing share + * @share: ksmbd_share_config pointer + * @name: file name that is relative to share * * Return: converted name on success, otherwise NULL */ From 5366afc4065075a4456941fbd51c33604d631ee5 Mon Sep 17 00:00:00 2001 From: Hyunchul Lee Date: Wed, 18 May 2022 06:46:08 +0900 Subject: [PATCH 242/334] ksmbd: smbd: fix connection dropped issue When there are bursty connection requests, RDMA connection event handler is deferred and Negotiation requests are received even if connection status is NEW. To handle it, set the status to CONNECTED if Negotiation requests are received. 
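A toy model of the ordering problem, in stand-alone C: the enum and struct below are simplified stand-ins for the driver's SMB_DIRECT_CS_* states and struct smb_direct_transport, not the real definitions. The point is only that the receive path no longer waits for the deferred connection-manager event before leaving the NEW state.

#include <stdio.h>

enum conn_status { CS_NEW, CS_CONNECTED, CS_DISCONNECTED };

struct transport {
	enum conn_status status;
	int negotiation_requested;
};

/* receive path: a negotiate request implies the RDMA connection is live */
static void handle_negotiate_req(struct transport *t)
{
	t->negotiation_requested = 1;
	t->status = CS_CONNECTED;	/* the added line; don't wait for the CM event */
}

int main(void)
{
	/* the CM "established" event is still queued, but the request arrived first */
	struct transport t = { .status = CS_NEW };

	handle_negotiate_req(&t);
	printf("status connected: %s\n", t.status == CS_CONNECTED ? "yes" : "no");
	return 0;
}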
Reported-by: Yufan Chen Signed-off-by: Hyunchul Lee Tested-by: Yufan Chen Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/ksmbd/transport_rdma.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ksmbd/transport_rdma.c b/fs/ksmbd/transport_rdma.c index 0741fd129d16..e91acc2746bc 100644 --- a/fs/ksmbd/transport_rdma.c +++ b/fs/ksmbd/transport_rdma.c @@ -576,6 +576,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) } t->negotiation_requested = true; t->full_packet_received = true; + t->status = SMB_DIRECT_CS_CONNECTED; enqueue_reassembly(t, recvmsg, 0); wake_up_interruptible(&t->wait_status); break; From 376b9133826865568167b4091ef92a68c4622b87 Mon Sep 17 00:00:00 2001 From: Hyunchul Lee Date: Fri, 20 May 2022 14:35:47 +0900 Subject: [PATCH 243/334] ksmbd: fix outstanding credits related bugs outstanding credits must be initialized to 0, because it means the sum of credits consumed by in-flight requests. And outstanding credits must be compared with total credits in smb2_validate_credit_charge(), because total credits are the sum of credits granted by ksmbd. This patch fix the following error, while frametest with Windows clients: Limits exceeding the maximum allowable outstanding requests, given : 128, pending : 8065 Fixes: b589f5db6d4a ("ksmbd: limits exceeding the maximum allowable outstanding requests") Cc: stable@vger.kernel.org Signed-off-by: Hyunchul Lee Reported-by: Yufan Chen Tested-by: Yufan Chen Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/ksmbd/connection.c | 2 +- fs/ksmbd/smb2misc.c | 2 +- fs/ksmbd/smb_common.c | 4 +++- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/ksmbd/connection.c b/fs/ksmbd/connection.c index 7db87771884a..e8f476c5f189 100644 --- a/fs/ksmbd/connection.c +++ b/fs/ksmbd/connection.c @@ -62,7 +62,7 @@ struct ksmbd_conn *ksmbd_conn_alloc(void) atomic_set(&conn->req_running, 0); atomic_set(&conn->r_count, 0); conn->total_credits = 1; - conn->outstanding_credits = 1; + conn->outstanding_credits = 0; init_waitqueue_head(&conn->req_running_q); INIT_LIST_HEAD(&conn->conns_list); diff --git a/fs/ksmbd/smb2misc.c b/fs/ksmbd/smb2misc.c index 4a9460153b59..f8f456377a51 100644 --- a/fs/ksmbd/smb2misc.c +++ b/fs/ksmbd/smb2misc.c @@ -338,7 +338,7 @@ static int smb2_validate_credit_charge(struct ksmbd_conn *conn, ret = 1; } - if ((u64)conn->outstanding_credits + credit_charge > conn->vals->max_credits) { + if ((u64)conn->outstanding_credits + credit_charge > conn->total_credits) { ksmbd_debug(SMB, "Limits exceeding the maximum allowable outstanding requests, given : %u, pending : %u\n", credit_charge, conn->outstanding_credits); ret = 1; diff --git a/fs/ksmbd/smb_common.c b/fs/ksmbd/smb_common.c index 9a7e211dbf4f..7f8ab14fb8ec 100644 --- a/fs/ksmbd/smb_common.c +++ b/fs/ksmbd/smb_common.c @@ -140,8 +140,10 @@ int ksmbd_verify_smb_message(struct ksmbd_work *work) hdr = work->request_buf; if (*(__le32 *)hdr->Protocol == SMB1_PROTO_NUMBER && - hdr->Command == SMB_COM_NEGOTIATE) + hdr->Command == SMB_COM_NEGOTIATE) { + work->conn->outstanding_credits++; return 0; + } return -EINVAL; } From 4d0cdd2bb8f0bf1a30d361f14bafc428794551e0 Mon Sep 17 00:00:00 2001 From: "Darrick J. 
Wong" Date: Sun, 22 May 2022 15:59:34 +1000 Subject: [PATCH 244/334] xfs: clean up xfs_attr_node_hasname The calling conventions of this function are a mess -- callers /can/ provide a pointer to a pointer to a state structure, but it's not required, and as evidenced by the last two patches, the callers that do weren't be careful enough about how to deal with an existing da state. Push the allocation and freeing responsibilty to the callers, which means that callers from the xattr node state machine steps now have the visibility to allocate or free the da state structure as they please. As a bonus, the node remove/add paths for larp-mode replaces can reset the da state structure instead of freeing and immediately reallocating it. Signed-off-by: Darrick J. Wong Reviewed-by: Allison Henderson Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_attr.c | 63 ++++++++++++++++++------------------ fs/xfs/libxfs/xfs_da_btree.c | 11 +++++++ fs/xfs/libxfs/xfs_da_btree.h | 1 + 3 files changed, 44 insertions(+), 31 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 576de34cfca0..940f1e3a480b 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -61,8 +61,8 @@ STATIC void xfs_attr_restore_rmt_blk(struct xfs_da_args *args); static int xfs_attr_node_try_addname(struct xfs_attr_item *attr); STATIC int xfs_attr_node_addname_find_attr(struct xfs_attr_item *attr); STATIC int xfs_attr_node_remove_attr(struct xfs_attr_item *attr); -STATIC int xfs_attr_node_hasname(xfs_da_args_t *args, - struct xfs_da_state **state); +STATIC int xfs_attr_node_lookup(struct xfs_da_args *args, + struct xfs_da_state *state); int xfs_inode_hasattr( @@ -594,6 +594,19 @@ xfs_attr_leaf_mark_incomplete( return xfs_attr3_leaf_setflag(args); } +/* Ensure the da state of an xattr deferred work item is ready to go. */ +static inline void +xfs_attr_item_init_da_state( + struct xfs_attr_item *attr) +{ + struct xfs_da_args *args = attr->xattri_da_args; + + if (!attr->xattri_da_state) + attr->xattri_da_state = xfs_da_state_alloc(args); + else + xfs_da_state_reset(attr->xattri_da_state, args); +} + /* * Initial setup for xfs_attr_node_removename. Make sure the attr is there and * the blocks are valid. Attr keys with remote blocks will be marked @@ -607,7 +620,8 @@ int xfs_attr_node_removename_setup( struct xfs_da_state *state; int error; - error = xfs_attr_node_hasname(args, &attr->xattri_da_state); + xfs_attr_item_init_da_state(attr); + error = xfs_attr_node_lookup(args, attr->xattri_da_state); if (error != -EEXIST) goto out; error = 0; @@ -855,6 +869,7 @@ xfs_attr_lookup( { struct xfs_inode *dp = args->dp; struct xfs_buf *bp = NULL; + struct xfs_da_state *state; int error; if (!xfs_inode_hasattr(dp)) @@ -872,7 +887,10 @@ xfs_attr_lookup( return error; } - return xfs_attr_node_hasname(args, NULL); + state = xfs_da_state_alloc(args); + error = xfs_attr_node_lookup(args, state); + xfs_da_state_free(state); + return error; } static int @@ -1387,34 +1405,20 @@ xfs_attr_leaf_get(xfs_da_args_t *args) return error; } -/* - * Return EEXIST if attr is found, or ENOATTR if not - * statep: If not null is set to point at the found state. Caller will - * be responsible for freeing the state in this case. - */ +/* Return EEXIST if attr is found, or ENOATTR if not. 
*/ STATIC int -xfs_attr_node_hasname( +xfs_attr_node_lookup( struct xfs_da_args *args, - struct xfs_da_state **statep) + struct xfs_da_state *state) { - struct xfs_da_state *state; int retval, error; - state = xfs_da_state_alloc(args); - if (statep != NULL) { - ASSERT(*statep == NULL); - *statep = state; - } - /* * Search to see if name exists, and get back a pointer to it. */ error = xfs_da3_node_lookup_int(state, &retval); if (error) - retval = error; - - if (!statep) - xfs_da_state_free(state); + return error; return retval; } @@ -1430,15 +1434,12 @@ xfs_attr_node_addname_find_attr( struct xfs_da_args *args = attr->xattri_da_args; int error; - if (attr->xattri_da_state) - xfs_da_state_free(attr->xattri_da_state); - attr->xattri_da_state = NULL; - /* * Search to see if name already exists, and get back a pointer * to where it should go. */ - error = xfs_attr_node_hasname(args, &attr->xattri_da_state); + xfs_attr_item_init_da_state(attr); + error = xfs_attr_node_lookup(args, attr->xattri_da_state); switch (error) { case -ENOATTR: if (args->op_flags & XFS_DA_OP_REPLACE) @@ -1599,7 +1600,7 @@ STATIC int xfs_attr_node_get( struct xfs_da_args *args) { - struct xfs_da_state *state = NULL; + struct xfs_da_state *state; struct xfs_da_state_blk *blk; int i; int error; @@ -1609,7 +1610,8 @@ xfs_attr_node_get( /* * Search to see if name exists, and get back a pointer to it. */ - error = xfs_attr_node_hasname(args, &state); + state = xfs_da_state_alloc(args); + error = xfs_attr_node_lookup(args, state); if (error != -EEXIST) goto out_release; @@ -1628,8 +1630,7 @@ out_release: state->path.blk[i].bp = NULL; } - if (state) - xfs_da_state_free(state); + xfs_da_state_free(state); return error; } diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index aa74f3fdb571..e7201dc68f43 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -117,6 +117,17 @@ xfs_da_state_free(xfs_da_state_t *state) kmem_cache_free(xfs_da_state_cache, state); } +void +xfs_da_state_reset( + struct xfs_da_state *state, + struct xfs_da_args *args) +{ + xfs_da_state_kill_altpath(state); + memset(state, 0, sizeof(struct xfs_da_state)); + state->args = args; + state->mp = state->args->dp->i_mount; +} + static inline int xfs_dabuf_nfsb(struct xfs_mount *mp, int whichfork) { if (whichfork == XFS_DATA_FORK) diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h index ed2303e4d46a..d33b7686a0b3 100644 --- a/fs/xfs/libxfs/xfs_da_btree.h +++ b/fs/xfs/libxfs/xfs_da_btree.h @@ -225,6 +225,7 @@ enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args, struct xfs_da_state *xfs_da_state_alloc(struct xfs_da_args *args); void xfs_da_state_free(xfs_da_state_t *state); +void xfs_da_state_reset(struct xfs_da_state *state, struct xfs_da_args *args); void xfs_da3_node_hdr_from_disk(struct xfs_mount *mp, struct xfs_da3_icnode_hdr *to, struct xfs_da_intnode *from); From b53d212b4b5c29ab16edb7eca2b0e05927b7aa34 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 22 May 2022 15:59:48 +1000 Subject: [PATCH 245/334] xfs: put the xattr intent item op flags in their own namespace The flags that are stored in the extended attr intent log item really should have a separate namespace from the rest of the XFS_ATTR_* flags. Give them one to make it a little more obvious that they're intent item flags. Signed-off-by: Darrick J. 
Wong Reviewed-by: Dave Chinner Reviewed-by: Allison Henderson Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_attr.c | 6 +++--- fs/xfs/libxfs/xfs_attr.h | 2 +- fs/xfs/libxfs/xfs_log_format.h | 8 ++++---- fs/xfs/xfs_attr_item.c | 20 ++++++++++---------- 4 files changed, 18 insertions(+), 18 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 940f1e3a480b..b49e7dbf9a5b 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -918,7 +918,7 @@ xfs_attr_defer_add( struct xfs_attr_item *new; int error = 0; - error = xfs_attr_item_init(args, XFS_ATTR_OP_FLAGS_SET, &new); + error = xfs_attr_item_init(args, XFS_ATTRI_OP_FLAGS_SET, &new); if (error) return error; @@ -937,7 +937,7 @@ xfs_attr_defer_replace( struct xfs_attr_item *new; int error = 0; - error = xfs_attr_item_init(args, XFS_ATTR_OP_FLAGS_REPLACE, &new); + error = xfs_attr_item_init(args, XFS_ATTRI_OP_FLAGS_REPLACE, &new); if (error) return error; @@ -957,7 +957,7 @@ xfs_attr_defer_remove( struct xfs_attr_item *new; int error; - error = xfs_attr_item_init(args, XFS_ATTR_OP_FLAGS_REMOVE, &new); + error = xfs_attr_item_init(args, XFS_ATTRI_OP_FLAGS_REMOVE, &new); if (error) return error; diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h index 1af7abe29eef..c739caa11a4b 100644 --- a/fs/xfs/libxfs/xfs_attr.h +++ b/fs/xfs/libxfs/xfs_attr.h @@ -530,7 +530,7 @@ struct xfs_attr_item { enum xfs_delattr_state xattri_dela_state; /* - * Attr operation being performed - XFS_ATTR_OP_FLAGS_* + * Attr operation being performed - XFS_ATTRI_OP_FLAGS_* */ unsigned int xattri_op_flags; diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index a9d08f3d4682..b351b9dc6561 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -906,10 +906,10 @@ struct xfs_icreate_log { * Flags for deferred attribute operations. 
* Upper bits are flags, lower byte is type code */ -#define XFS_ATTR_OP_FLAGS_SET 1 /* Set the attribute */ -#define XFS_ATTR_OP_FLAGS_REMOVE 2 /* Remove the attribute */ -#define XFS_ATTR_OP_FLAGS_REPLACE 3 /* Replace the attribute */ -#define XFS_ATTR_OP_FLAGS_TYPE_MASK 0xFF /* Flags type mask */ +#define XFS_ATTRI_OP_FLAGS_SET 1 /* Set the attribute */ +#define XFS_ATTRI_OP_FLAGS_REMOVE 2 /* Remove the attribute */ +#define XFS_ATTRI_OP_FLAGS_REPLACE 3 /* Replace the attribute */ +#define XFS_ATTRI_OP_FLAGS_TYPE_MASK 0xFF /* Flags type mask */ /* * alfi_attr_filter captures the state of xfs_da_args.attr_filter, so it should diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index 4f1ee8b91a17..5397b7894da0 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -349,7 +349,7 @@ xfs_attr_log_item( */ attrp = &attrip->attri_format; attrp->alfi_ino = attr->xattri_da_args->dp->i_ino; - ASSERT(!(attr->xattri_op_flags & ~XFS_ATTR_OP_FLAGS_TYPE_MASK)); + ASSERT(!(attr->xattri_op_flags & ~XFS_ATTRI_OP_FLAGS_TYPE_MASK)); attrp->alfi_op_flags = attr->xattri_op_flags; attrp->alfi_value_len = attr->xattri_da_args->valuelen; attrp->alfi_name_len = attr->xattri_da_args->namelen; @@ -493,12 +493,12 @@ xfs_attri_validate( struct xfs_attri_log_format *attrp) { unsigned int op = attrp->alfi_op_flags & - XFS_ATTR_OP_FLAGS_TYPE_MASK; + XFS_ATTRI_OP_FLAGS_TYPE_MASK; if (attrp->__pad != 0) return false; - if (attrp->alfi_op_flags & ~XFS_ATTR_OP_FLAGS_TYPE_MASK) + if (attrp->alfi_op_flags & ~XFS_ATTRI_OP_FLAGS_TYPE_MASK) return false; if (attrp->alfi_attr_filter & ~XFS_ATTRI_FILTER_MASK) @@ -506,9 +506,9 @@ xfs_attri_validate( /* alfi_op_flags should be either a set or remove */ switch (op) { - case XFS_ATTR_OP_FLAGS_SET: - case XFS_ATTR_OP_FLAGS_REPLACE: - case XFS_ATTR_OP_FLAGS_REMOVE: + case XFS_ATTRI_OP_FLAGS_SET: + case XFS_ATTRI_OP_FLAGS_REPLACE: + case XFS_ATTRI_OP_FLAGS_REMOVE: break; default: return false; @@ -565,7 +565,7 @@ xfs_attri_item_recover( attr->xattri_da_args = args; attr->xattri_op_flags = attrp->alfi_op_flags & - XFS_ATTR_OP_FLAGS_TYPE_MASK; + XFS_ATTRI_OP_FLAGS_TYPE_MASK; args->dp = ip; args->geo = mp->m_attr_geo; @@ -577,8 +577,8 @@ xfs_attri_item_recover( args->op_flags = XFS_DA_OP_RECOVERY | XFS_DA_OP_OKNOENT; switch (attr->xattri_op_flags) { - case XFS_ATTR_OP_FLAGS_SET: - case XFS_ATTR_OP_FLAGS_REPLACE: + case XFS_ATTRI_OP_FLAGS_SET: + case XFS_ATTRI_OP_FLAGS_REPLACE: args->value = attrip->attri_value; args->valuelen = attrp->alfi_value_len; args->total = xfs_attr_calc_size(args, &local); @@ -587,7 +587,7 @@ xfs_attri_item_recover( else attr->xattri_dela_state = xfs_attr_init_add_state(args); break; - case XFS_ATTR_OP_FLAGS_REMOVE: + case XFS_ATTRI_OP_FLAGS_REMOVE: if (!xfs_inode_hasattr(args->dp)) goto out; attr->xattri_dela_state = xfs_attr_init_remove_state(args); From e2c78949b641d34cb4051b32c2fa5e03a4bfef35 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 22 May 2022 15:59:48 +1000 Subject: [PATCH 246/334] xfs: use a separate slab cache for deferred xattr work state Create a separate slab cache for struct xfs_attr_item objects, since we can pack the (104-byte) intent items more tightly than we can with the general slab cache objects. On x86, this means 39 intents per memory page instead of 32. Signed-off-by: Darrick J. 
Wong Reviewed-by: Allison Henderson Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_attr.c | 20 +++++++++++++++++++- fs/xfs/libxfs/xfs_attr.h | 4 ++++ fs/xfs/libxfs/xfs_defer.c | 4 ++++ fs/xfs/xfs_attr_item.c | 5 ++++- 4 files changed, 31 insertions(+), 2 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index b49e7dbf9a5b..93aa892ed0e3 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -29,6 +29,7 @@ struct kmem_cache *xfs_attri_cache; struct kmem_cache *xfs_attrd_cache; +struct kmem_cache *xfs_attr_intent_cache; /* * xfs_attr.c @@ -902,7 +903,7 @@ xfs_attr_item_init( struct xfs_attr_item *new; - new = kmem_zalloc(sizeof(struct xfs_attr_item), KM_NOFS); + new = kmem_cache_zalloc(xfs_attr_intent_cache, GFP_NOFS | __GFP_NOFAIL); new->xattri_op_flags = op_flags; new->xattri_da_args = args; @@ -1650,3 +1651,20 @@ xfs_attr_namecheck( /* There shouldn't be any nulls here */ return !memchr(name, 0, length); } + +int __init +xfs_attr_intent_init_cache(void) +{ + xfs_attr_intent_cache = kmem_cache_create("xfs_attr_item", + sizeof(struct xfs_attr_item), + 0, 0, NULL); + + return xfs_attr_intent_cache != NULL ? 0 : -ENOMEM; +} + +void +xfs_attr_intent_destroy_cache(void) +{ + kmem_cache_destroy(xfs_attr_intent_cache); + xfs_attr_intent_cache = NULL; +} diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h index c739caa11a4b..cb3b3d270569 100644 --- a/fs/xfs/libxfs/xfs_attr.h +++ b/fs/xfs/libxfs/xfs_attr.h @@ -634,4 +634,8 @@ xfs_attr_init_replace_state(struct xfs_da_args *args) return xfs_attr_init_add_state(args); } +extern struct kmem_cache *xfs_attr_intent_cache; +int __init xfs_attr_intent_init_cache(void); +void xfs_attr_intent_destroy_cache(void); + #endif /* __XFS_ATTR_H__ */ diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index ceb222b4f261..ed65f7e5a9c7 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -877,6 +877,9 @@ xfs_defer_init_item_caches(void) if (error) goto err; error = xfs_attrd_init_cache(); + if (error) + goto err; + error = xfs_attr_intent_init_cache(); if (error) goto err; return 0; @@ -889,6 +892,7 @@ err: void xfs_defer_destroy_item_caches(void) { + xfs_attr_intent_destroy_cache(); xfs_attri_destroy_cache(); xfs_attrd_destroy_cache(); xfs_extfree_intent_destroy_cache(); diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index 5397b7894da0..f85bcbfc2478 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -404,7 +404,10 @@ xfs_attr_free_item( { if (attr->xattri_da_state) xfs_da_state_free(attr->xattri_da_state); - kmem_free(attr); + if (attr->xattri_da_args->op_flags & XFS_DA_OP_RECOVERY) + kmem_free(attr); + else + kmem_cache_free(xfs_attr_intent_cache, attr); } /* Process an attr. */ From 500a512c60d132dbffc0233ba22ad067756c0ccd Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 22 May 2022 15:59:48 +1000 Subject: [PATCH 247/334] xfs: remove struct xfs_attr_item.xattri_flags Nobody uses this field, so get rid of it and the unused flag definition. Rearrange the structure layout to reduce its size from 104 to 96 bytes. This gets us from 39 to 42 objects per page. Signed-off-by: Darrick J. 
Wong Reviewed-by: Allison Henderson Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_attr.h | 32 +++++++++++++------------------- 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h index cb3b3d270569..f0b93515c1e8 100644 --- a/fs/xfs/libxfs/xfs_attr.h +++ b/fs/xfs/libxfs/xfs_attr.h @@ -501,15 +501,19 @@ enum xfs_delattr_state { { XFS_DAS_NODE_REMOVE_ATTR, "XFS_DAS_NODE_REMOVE_ATTR" }, \ { XFS_DAS_DONE, "XFS_DAS_DONE" } -/* - * Defines for xfs_attr_item.xattri_flags - */ -#define XFS_DAC_LEAF_ADDNAME_INIT 0x01 /* xfs_attr_leaf_addname init*/ - /* * Context used for keeping track of delayed attribute operations */ struct xfs_attr_item { + /* + * used to log this item to an intent containing a list of attrs to + * commit later + */ + struct list_head xattri_list; + + /* Used in xfs_attr_node_removename to roll through removing blocks */ + struct xfs_da_state *xattri_da_state; + struct xfs_da_args *xattri_da_args; /* @@ -517,16 +521,7 @@ struct xfs_attr_item { */ struct xfs_buf *xattri_leaf_bp; - /* Used in xfs_attr_rmtval_set_blk to roll through allocating blocks */ - struct xfs_bmbt_irec xattri_map; - xfs_dablk_t xattri_lblkno; - int xattri_blkcnt; - - /* Used in xfs_attr_node_removename to roll through removing blocks */ - struct xfs_da_state *xattri_da_state; - /* Used to keep track of current state of delayed operation */ - unsigned int xattri_flags; enum xfs_delattr_state xattri_dela_state; /* @@ -534,11 +529,10 @@ struct xfs_attr_item { */ unsigned int xattri_op_flags; - /* - * used to log this item to an intent containing a list of attrs to - * commit later - */ - struct list_head xattri_list; + /* Used in xfs_attr_rmtval_set_blk to roll through allocating blocks */ + xfs_dablk_t xattri_lblkno; + int xattri_blkcnt; + struct xfs_bmbt_irec xattri_map; }; From 4136e38af728eddcab2e51aecde28e94d0782b9b Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 22 May 2022 15:59:48 +1000 Subject: [PATCH 248/334] xfs: put attr[id] log item cache init with the others Initialize and destroy the xattr log item caches in the same places that we do all the other log item caches. Signed-off-by: Darrick J. Wong Reviewed-by: Allison Henderson Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_attr.c | 36 ------------------------------------ fs/xfs/libxfs/xfs_attr.h | 8 -------- fs/xfs/libxfs/xfs_defer.c | 8 -------- fs/xfs/xfs_attr_item.c | 3 +++ fs/xfs/xfs_attr_item.h | 3 +++ fs/xfs/xfs_super.c | 19 +++++++++++++++++++ 6 files changed, 25 insertions(+), 52 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 93aa892ed0e3..a711976e4faf 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -27,8 +27,6 @@ #include "xfs_attr_item.h" #include "xfs_log.h" -struct kmem_cache *xfs_attri_cache; -struct kmem_cache *xfs_attrd_cache; struct kmem_cache *xfs_attr_intent_cache; /* @@ -1113,40 +1111,6 @@ out_trans_cancel: goto out_unlock; } -int __init -xfs_attri_init_cache(void) -{ - xfs_attri_cache = kmem_cache_create("xfs_attri", - sizeof(struct xfs_attri_log_item), - 0, 0, NULL); - - return xfs_attri_cache != NULL ? 0 : -ENOMEM; -} - -void -xfs_attri_destroy_cache(void) -{ - kmem_cache_destroy(xfs_attri_cache); - xfs_attri_cache = NULL; -} - -int __init -xfs_attrd_init_cache(void) -{ - xfs_attrd_cache = kmem_cache_create("xfs_attrd", - sizeof(struct xfs_attrd_log_item), - 0, 0, NULL); - - return xfs_attrd_cache != NULL ? 
0 : -ENOMEM; -} - -void -xfs_attrd_destroy_cache(void) -{ - kmem_cache_destroy(xfs_attrd_cache); - xfs_attrd_cache = NULL; -} - /*======================================================================== * External routines when attribute list is inside the inode *========================================================================*/ diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h index f0b93515c1e8..22a2f288c1c0 100644 --- a/fs/xfs/libxfs/xfs_attr.h +++ b/fs/xfs/libxfs/xfs_attr.h @@ -558,14 +558,6 @@ int xfs_attr_calc_size(struct xfs_da_args *args, int *local); void xfs_init_attr_trans(struct xfs_da_args *args, struct xfs_trans_res *tres, unsigned int *total); -extern struct kmem_cache *xfs_attri_cache; -extern struct kmem_cache *xfs_attrd_cache; - -int __init xfs_attri_init_cache(void); -void xfs_attri_destroy_cache(void); -int __init xfs_attrd_init_cache(void); -void xfs_attrd_destroy_cache(void); - /* * Check to see if the attr should be upgraded from non-existent or shortform to * single-leaf-block attribute list. diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index ed65f7e5a9c7..ace229c1d251 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -871,12 +871,6 @@ xfs_defer_init_item_caches(void) if (error) goto err; error = xfs_extfree_intent_init_cache(); - if (error) - goto err; - error = xfs_attri_init_cache(); - if (error) - goto err; - error = xfs_attrd_init_cache(); if (error) goto err; error = xfs_attr_intent_init_cache(); @@ -893,8 +887,6 @@ void xfs_defer_destroy_item_caches(void) { xfs_attr_intent_destroy_cache(); - xfs_attri_destroy_cache(); - xfs_attrd_destroy_cache(); xfs_extfree_intent_destroy_cache(); xfs_bmap_intent_destroy_cache(); xfs_refcount_intent_destroy_cache(); diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index f85bcbfc2478..ca0dd46c0ae5 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -29,6 +29,9 @@ #include "xfs_log_priv.h" #include "xfs_log_recover.h" +struct kmem_cache *xfs_attri_cache; +struct kmem_cache *xfs_attrd_cache; + static const struct xfs_item_ops xfs_attri_item_ops; static const struct xfs_item_ops xfs_attrd_item_ops; static struct xfs_attrd_log_item *xfs_trans_get_attrd(struct xfs_trans *tp, diff --git a/fs/xfs/xfs_attr_item.h b/fs/xfs/xfs_attr_item.h index c3b779f82adb..cc2fbc9d58a7 100644 --- a/fs/xfs/xfs_attr_item.h +++ b/fs/xfs/xfs_attr_item.h @@ -43,4 +43,7 @@ struct xfs_attrd_log_item { struct xfs_attrd_log_format attrd_format; }; +extern struct kmem_cache *xfs_attri_cache; +extern struct kmem_cache *xfs_attrd_cache; + #endif /* __XFS_ATTR_ITEM_H__ */ diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 93e43e1a2863..51ce127a0cc6 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -38,6 +38,7 @@ #include "xfs_pwork.h" #include "xfs_ag.h" #include "xfs_defer.h" +#include "xfs_attr_item.h" #include #include @@ -2083,8 +2084,24 @@ xfs_init_caches(void) if (!xfs_bui_cache) goto out_destroy_bud_cache; + xfs_attrd_cache = kmem_cache_create("xfs_attrd_item", + sizeof(struct xfs_attrd_log_item), + 0, 0, NULL); + if (!xfs_attrd_cache) + goto out_destroy_bui_cache; + + xfs_attri_cache = kmem_cache_create("xfs_attri_item", + sizeof(struct xfs_attri_log_item), + 0, 0, NULL); + if (!xfs_attri_cache) + goto out_destroy_attrd_cache; + return 0; + out_destroy_attrd_cache: + kmem_cache_destroy(xfs_attrd_cache); + out_destroy_bui_cache: + kmem_cache_destroy(xfs_bui_cache); out_destroy_bud_cache: kmem_cache_destroy(xfs_bud_cache); 
out_destroy_cui_cache: @@ -2131,6 +2148,8 @@ xfs_destroy_caches(void) * destroy caches. */ rcu_barrier(); + kmem_cache_destroy(xfs_attri_cache); + kmem_cache_destroy(xfs_attrd_cache); kmem_cache_destroy(xfs_bui_cache); kmem_cache_destroy(xfs_bud_cache); kmem_cache_destroy(xfs_cui_cache); From 3768f6985700106e90eca87d814dc08e609a9969 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 22 May 2022 15:59:48 +1000 Subject: [PATCH 249/334] xfs: clean up state variable usage in xfs_attr_node_remove_attr The state variable is now a local variable pointing to a heap allocation, so we don't need to zero-initialize it, nor do we need the conditional to decide if we should free it. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Reviewed-by: Allison Henderson Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_attr.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index a711976e4faf..130eed8f5185 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -1516,7 +1516,7 @@ xfs_attr_node_remove_attr( struct xfs_attr_item *attr) { struct xfs_da_args *args = attr->xattri_da_args; - struct xfs_da_state *state = NULL; + struct xfs_da_state *state = xfs_da_state_alloc(args); int retval = 0; int error = 0; @@ -1526,8 +1526,6 @@ xfs_attr_node_remove_attr( * attribute entry after any split ops. */ args->attr_filter |= XFS_ATTR_INCOMPLETE; - state = xfs_da_state_alloc(args); - state->inleaf = 0; error = xfs_da3_node_lookup_int(state, &retval); if (error) goto out; @@ -1545,8 +1543,7 @@ xfs_attr_node_remove_attr( retval = error = 0; out: - if (state) - xfs_da_state_free(state); + xfs_da_state_free(state); if (error) return error; return retval; From e3c5de22026fda01674c400b97f96857dfb58aab Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 22 May 2022 16:00:26 +1000 Subject: [PATCH 250/334] xfs: rename struct xfs_attr_item to xfs_attr_intent Everywhere else in XFS, structures that capture the state of an ongoing deferred work item all have names that end with "_intent". The new extended attribute deferred work items are not named as such, so fix it to follow the naming convention used elsewhere. Signed-off-by: Darrick J. 
Wong Reviewed-by: Allison Henderson Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_attr.c | 52 ++++++++++++++++----------------- fs/xfs/libxfs/xfs_attr.h | 8 ++--- fs/xfs/libxfs/xfs_attr_remote.c | 6 ++-- fs/xfs/libxfs/xfs_attr_remote.h | 6 ++-- fs/xfs/xfs_attr_item.c | 24 +++++++-------- fs/xfs/xfs_attr_item.h | 6 ++-- 6 files changed, 51 insertions(+), 51 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 130eed8f5185..79615727f8c2 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -57,9 +57,9 @@ STATIC int xfs_attr_leaf_try_add(struct xfs_da_args *args, struct xfs_buf *bp); */ STATIC int xfs_attr_node_get(xfs_da_args_t *args); STATIC void xfs_attr_restore_rmt_blk(struct xfs_da_args *args); -static int xfs_attr_node_try_addname(struct xfs_attr_item *attr); -STATIC int xfs_attr_node_addname_find_attr(struct xfs_attr_item *attr); -STATIC int xfs_attr_node_remove_attr(struct xfs_attr_item *attr); +static int xfs_attr_node_try_addname(struct xfs_attr_intent *attr); +STATIC int xfs_attr_node_addname_find_attr(struct xfs_attr_intent *attr); +STATIC int xfs_attr_node_remove_attr(struct xfs_attr_intent *attr); STATIC int xfs_attr_node_lookup(struct xfs_da_args *args, struct xfs_da_state *state); @@ -376,7 +376,7 @@ xfs_attr_try_sf_addname( static int xfs_attr_sf_addname( - struct xfs_attr_item *attr) + struct xfs_attr_intent *attr) { struct xfs_da_args *args = attr->xattri_da_args; struct xfs_inode *dp = args->dp; @@ -422,7 +422,7 @@ out: */ static enum xfs_delattr_state xfs_attr_complete_op( - struct xfs_attr_item *attr, + struct xfs_attr_intent *attr, enum xfs_delattr_state replace_state) { struct xfs_da_args *args = attr->xattri_da_args; @@ -438,7 +438,7 @@ xfs_attr_complete_op( static int xfs_attr_leaf_addname( - struct xfs_attr_item *attr) + struct xfs_attr_intent *attr) { struct xfs_da_args *args = attr->xattri_da_args; int error; @@ -492,7 +492,7 @@ out: */ static int xfs_attr_node_addname( - struct xfs_attr_item *attr) + struct xfs_attr_intent *attr) { struct xfs_da_args *args = attr->xattri_da_args; int error; @@ -529,7 +529,7 @@ out: static int xfs_attr_rmtval_alloc( - struct xfs_attr_item *attr) + struct xfs_attr_intent *attr) { struct xfs_da_args *args = attr->xattri_da_args; int error = 0; @@ -596,7 +596,7 @@ xfs_attr_leaf_mark_incomplete( /* Ensure the da state of an xattr deferred work item is ready to go. 
*/ static inline void xfs_attr_item_init_da_state( - struct xfs_attr_item *attr) + struct xfs_attr_intent *attr) { struct xfs_da_args *args = attr->xattri_da_args; @@ -613,7 +613,7 @@ xfs_attr_item_init_da_state( */ static int xfs_attr_node_removename_setup( - struct xfs_attr_item *attr) + struct xfs_attr_intent *attr) { struct xfs_da_args *args = attr->xattri_da_args; struct xfs_da_state *state; @@ -651,7 +651,7 @@ out: */ static int xfs_attr_leaf_remove_attr( - struct xfs_attr_item *attr) + struct xfs_attr_intent *attr) { struct xfs_da_args *args = attr->xattri_da_args; struct xfs_inode *dp = args->dp; @@ -716,7 +716,7 @@ xfs_attr_leaf_shrink( */ int xfs_attr_set_iter( - struct xfs_attr_item *attr) + struct xfs_attr_intent *attr) { struct xfs_da_args *args = attr->xattri_da_args; int error = 0; @@ -893,13 +893,13 @@ xfs_attr_lookup( } static int -xfs_attr_item_init( +xfs_attr_intent_init( struct xfs_da_args *args, unsigned int op_flags, /* op flag (set or remove) */ - struct xfs_attr_item **attr) /* new xfs_attr_item */ + struct xfs_attr_intent **attr) /* new xfs_attr_intent */ { - struct xfs_attr_item *new; + struct xfs_attr_intent *new; new = kmem_cache_zalloc(xfs_attr_intent_cache, GFP_NOFS | __GFP_NOFAIL); new->xattri_op_flags = op_flags; @@ -914,10 +914,10 @@ static int xfs_attr_defer_add( struct xfs_da_args *args) { - struct xfs_attr_item *new; + struct xfs_attr_intent *new; int error = 0; - error = xfs_attr_item_init(args, XFS_ATTRI_OP_FLAGS_SET, &new); + error = xfs_attr_intent_init(args, XFS_ATTRI_OP_FLAGS_SET, &new); if (error) return error; @@ -933,10 +933,10 @@ static int xfs_attr_defer_replace( struct xfs_da_args *args) { - struct xfs_attr_item *new; + struct xfs_attr_intent *new; int error = 0; - error = xfs_attr_item_init(args, XFS_ATTRI_OP_FLAGS_REPLACE, &new); + error = xfs_attr_intent_init(args, XFS_ATTRI_OP_FLAGS_REPLACE, &new); if (error) return error; @@ -953,10 +953,10 @@ xfs_attr_defer_remove( struct xfs_da_args *args) { - struct xfs_attr_item *new; + struct xfs_attr_intent *new; int error; - error = xfs_attr_item_init(args, XFS_ATTRI_OP_FLAGS_REMOVE, &new); + error = xfs_attr_intent_init(args, XFS_ATTRI_OP_FLAGS_REMOVE, &new); if (error) return error; @@ -1394,7 +1394,7 @@ xfs_attr_node_lookup( STATIC int xfs_attr_node_addname_find_attr( - struct xfs_attr_item *attr) + struct xfs_attr_intent *attr) { struct xfs_da_args *args = attr->xattri_da_args; int error; @@ -1447,7 +1447,7 @@ error: */ static int xfs_attr_node_try_addname( - struct xfs_attr_item *attr) + struct xfs_attr_intent *attr) { struct xfs_da_args *args = attr->xattri_da_args; struct xfs_da_state *state = attr->xattri_da_state; @@ -1513,7 +1513,7 @@ xfs_attr_node_removename( static int xfs_attr_node_remove_attr( - struct xfs_attr_item *attr) + struct xfs_attr_intent *attr) { struct xfs_da_args *args = attr->xattri_da_args; struct xfs_da_state *state = xfs_da_state_alloc(args); @@ -1616,8 +1616,8 @@ xfs_attr_namecheck( int __init xfs_attr_intent_init_cache(void) { - xfs_attr_intent_cache = kmem_cache_create("xfs_attr_item", - sizeof(struct xfs_attr_item), + xfs_attr_intent_cache = kmem_cache_create("xfs_attr_intent", + sizeof(struct xfs_attr_intent), 0, 0, NULL); return xfs_attr_intent_cache != NULL ? 
0 : -ENOMEM; diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h index 22a2f288c1c0..b88b6d74e4fc 100644 --- a/fs/xfs/libxfs/xfs_attr.h +++ b/fs/xfs/libxfs/xfs_attr.h @@ -434,7 +434,7 @@ struct xfs_attr_list_context { */ /* - * Enum values for xfs_attr_item.xattri_da_state + * Enum values for xfs_attr_intent.xattri_da_state * * These values are used by delayed attribute operations to keep track of where * they were before they returned -EAGAIN. A return code of -EAGAIN signals the @@ -504,7 +504,7 @@ enum xfs_delattr_state { /* * Context used for keeping track of delayed attribute operations */ -struct xfs_attr_item { +struct xfs_attr_intent { /* * used to log this item to an intent containing a list of attrs to * commit later @@ -551,8 +551,8 @@ bool xfs_attr_is_leaf(struct xfs_inode *ip); int xfs_attr_get_ilocked(struct xfs_da_args *args); int xfs_attr_get(struct xfs_da_args *args); int xfs_attr_set(struct xfs_da_args *args); -int xfs_attr_set_iter(struct xfs_attr_item *attr); -int xfs_attr_remove_iter(struct xfs_attr_item *attr); +int xfs_attr_set_iter(struct xfs_attr_intent *attr); +int xfs_attr_remove_iter(struct xfs_attr_intent *attr); bool xfs_attr_namecheck(const void *name, size_t length); int xfs_attr_calc_size(struct xfs_da_args *args, int *local); void xfs_init_attr_trans(struct xfs_da_args *args, struct xfs_trans_res *tres, diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index 4250159ecced..7298c148f848 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -568,7 +568,7 @@ xfs_attr_rmtval_stale( */ int xfs_attr_rmtval_find_space( - struct xfs_attr_item *attr) + struct xfs_attr_intent *attr) { struct xfs_da_args *args = attr->xattri_da_args; struct xfs_bmbt_irec *map = &attr->xattri_map; @@ -598,7 +598,7 @@ xfs_attr_rmtval_find_space( */ int xfs_attr_rmtval_set_blk( - struct xfs_attr_item *attr) + struct xfs_attr_intent *attr) { struct xfs_da_args *args = attr->xattri_da_args; struct xfs_inode *dp = args->dp; @@ -674,7 +674,7 @@ xfs_attr_rmtval_invalidate( */ int xfs_attr_rmtval_remove( - struct xfs_attr_item *attr) + struct xfs_attr_intent *attr) { struct xfs_da_args *args = attr->xattri_da_args; int error, done; diff --git a/fs/xfs/libxfs/xfs_attr_remote.h b/fs/xfs/libxfs/xfs_attr_remote.h index 62b398edec3f..d097ec6c4dc3 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.h +++ b/fs/xfs/libxfs/xfs_attr_remote.h @@ -12,9 +12,9 @@ int xfs_attr_rmtval_get(struct xfs_da_args *args); int xfs_attr_rmtval_stale(struct xfs_inode *ip, struct xfs_bmbt_irec *map, xfs_buf_flags_t incore_flags); int xfs_attr_rmtval_invalidate(struct xfs_da_args *args); -int xfs_attr_rmtval_remove(struct xfs_attr_item *attr); +int xfs_attr_rmtval_remove(struct xfs_attr_intent *attr); int xfs_attr_rmt_find_hole(struct xfs_da_args *args); int xfs_attr_rmtval_set_value(struct xfs_da_args *args); -int xfs_attr_rmtval_set_blk(struct xfs_attr_item *attr); -int xfs_attr_rmtval_find_space(struct xfs_attr_item *attr); +int xfs_attr_rmtval_set_blk(struct xfs_attr_intent *attr); +int xfs_attr_rmtval_find_space(struct xfs_attr_intent *attr); #endif /* __XFS_ATTR_REMOTE_H__ */ diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index ca0dd46c0ae5..d6f7b6d9402a 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -300,7 +300,7 @@ xfs_attrd_item_intent( */ STATIC int xfs_xattri_finish_update( - struct xfs_attr_item *attr, + struct xfs_attr_intent *attr, struct xfs_attrd_log_item *attrdp) { struct xfs_da_args *args = 
attr->xattri_da_args; @@ -338,7 +338,7 @@ STATIC void xfs_attr_log_item( struct xfs_trans *tp, struct xfs_attri_log_item *attrip, - struct xfs_attr_item *attr) + const struct xfs_attr_intent *attr) { struct xfs_attri_log_format *attrp; @@ -346,9 +346,9 @@ xfs_attr_log_item( set_bit(XFS_LI_DIRTY, &attrip->attri_item.li_flags); /* - * At this point the xfs_attr_item has been constructed, and we've + * At this point the xfs_attr_intent has been constructed, and we've * created the log intent. Fill in the attri log item and log format - * structure with fields from this xfs_attr_item + * structure with fields from this xfs_attr_intent */ attrp = &attrip->attri_format; attrp->alfi_ino = attr->xattri_da_args->dp->i_ino; @@ -377,7 +377,7 @@ xfs_attr_create_intent( { struct xfs_mount *mp = tp->t_mountp; struct xfs_attri_log_item *attrip; - struct xfs_attr_item *attr; + struct xfs_attr_intent *attr; ASSERT(count == 1); @@ -403,7 +403,7 @@ xfs_attr_create_intent( static inline void xfs_attr_free_item( - struct xfs_attr_item *attr) + struct xfs_attr_intent *attr) { if (attr->xattri_da_state) xfs_da_state_free(attr->xattri_da_state); @@ -421,11 +421,11 @@ xfs_attr_finish_item( struct list_head *item, struct xfs_btree_cur **state) { - struct xfs_attr_item *attr; + struct xfs_attr_intent *attr; struct xfs_attrd_log_item *done_item = NULL; int error; - attr = container_of(item, struct xfs_attr_item, xattri_list); + attr = container_of(item, struct xfs_attr_intent, xattri_list); if (done) done_item = ATTRD_ITEM(done); @@ -455,9 +455,9 @@ STATIC void xfs_attr_cancel_item( struct list_head *item) { - struct xfs_attr_item *attr; + struct xfs_attr_intent *attr; - attr = container_of(item, struct xfs_attr_item, xattri_list); + attr = container_of(item, struct xfs_attr_intent, xattri_list); xfs_attr_free_item(attr); } @@ -540,7 +540,7 @@ xfs_attri_item_recover( struct list_head *capture_list) { struct xfs_attri_log_item *attrip = ATTRI_ITEM(lip); - struct xfs_attr_item *attr; + struct xfs_attr_intent *attr; struct xfs_mount *mp = lip->li_log->l_mp; struct xfs_inode *ip; struct xfs_da_args *args; @@ -565,7 +565,7 @@ xfs_attri_item_recover( if (error) return error; - attr = kmem_zalloc(sizeof(struct xfs_attr_item) + + attr = kmem_zalloc(sizeof(struct xfs_attr_intent) + sizeof(struct xfs_da_args), KM_NOFS); args = (struct xfs_da_args *)(attr + 1); diff --git a/fs/xfs/xfs_attr_item.h b/fs/xfs/xfs_attr_item.h index cc2fbc9d58a7..a40e702e0215 100644 --- a/fs/xfs/xfs_attr_item.h +++ b/fs/xfs/xfs_attr_item.h @@ -15,13 +15,13 @@ struct kmem_zone; * This is the "attr intention" log item. It is used to log the fact that some * extended attribute operations need to be processed. An operation is * currently either a set or remove. Set or remove operations are described by - * the xfs_attr_item which may be logged to this intent. + * the xfs_attr_intent which may be logged to this intent. * * During a normal attr operation, name and value point to the name and value * fields of the caller's xfs_da_args structure. During a recovery, the name * and value buffers are copied from the log, and stored in a trailing buffer - * attached to the xfs_attr_item until they are committed. They are freed when - * the xfs_attr_item itself is freed when the work is done. + * attached to the xfs_attr_intent until they are committed. They are freed + * when the xfs_attr_intent itself is freed when the work is done. 
*/ struct xfs_attri_log_item { struct xfs_log_item attri_item; From 41bc61c02a5a3aa4f8ba5f9e9c54668468925c72 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sun, 22 May 2022 16:46:38 +1000 Subject: [PATCH 251/334] xfs: fix typo in comment Spelling mistake (triple letters) in comment. Detected with the help of Coccinelle. Signed-off-by: Julia Lawall Reviewed-by: Darrick J. Wong Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_symlink_remote.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c index f0b38f4aba80..8b9bd178a487 100644 --- a/fs/xfs/libxfs/xfs_symlink_remote.c +++ b/fs/xfs/libxfs/xfs_symlink_remote.c @@ -213,7 +213,7 @@ xfs_symlink_shortform_verify( /* * Zero length symlinks should never occur in memory as they are - * never alllowed to exist on disk. + * never allowed to exist on disk. */ if (!size) return __this_address; From e62c720817597f259b81f1ff004eb042293bf046 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Sun, 22 May 2022 16:46:57 +1000 Subject: [PATCH 252/334] xfs: Remove dead code Remove the entire xlog_recover_check_summary() function; it is dead code and has been for 12 years. Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Reviewed-by: Darrick J. Wong Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_log_recover.c | 59 ---------------------------------------- 1 file changed, 59 deletions(-) diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 97b941c07957..b1980d7cbbee 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -39,13 +39,6 @@ STATIC int xlog_clear_stale_blocks( struct xlog *, xfs_lsn_t); -#if defined(DEBUG) -STATIC void -xlog_recover_check_summary( - struct xlog *); -#else -#define xlog_recover_check_summary(log) -#endif STATIC int xlog_do_recovery_pass( struct xlog *, xfs_daddr_t, xfs_daddr_t, int, xfs_daddr_t *); @@ -3339,8 +3332,6 @@ xlog_do_recover( } mp->m_alloc_set_aside = xfs_alloc_set_aside(mp); - xlog_recover_check_summary(log); - /* Normal transactions can now occur */ clear_bit(XLOG_ACTIVE_RECOVERY, &log->l_opstate); return 0; @@ -3483,7 +3474,6 @@ xlog_recover_finish( } xlog_recover_process_iunlinks(log); - xlog_recover_check_summary(log); /* * Recover any CoW staging blocks that are still referenced by the @@ -3517,52 +3507,3 @@ xlog_recover_cancel( xlog_recover_cancel_intents(log); } -#if defined(DEBUG) -/* - * Read all of the agf and agi counters and check that they - * are consistent with the superblock counters.
- */ -STATIC void -xlog_recover_check_summary( - struct xlog *log) -{ - struct xfs_mount *mp = log->l_mp; - struct xfs_perag *pag; - struct xfs_buf *agfbp; - struct xfs_buf *agibp; - xfs_agnumber_t agno; - uint64_t freeblks; - uint64_t itotal; - uint64_t ifree; - int error; - - freeblks = 0LL; - itotal = 0LL; - ifree = 0LL; - for_each_perag(mp, agno, pag) { - error = xfs_read_agf(mp, NULL, pag->pag_agno, 0, &agfbp); - if (error) { - xfs_alert(mp, "%s agf read failed agno %d error %d", - __func__, pag->pag_agno, error); - } else { - struct xfs_agf *agfp = agfbp->b_addr; - - freeblks += be32_to_cpu(agfp->agf_freeblks) + - be32_to_cpu(agfp->agf_flcount); - xfs_buf_relse(agfbp); - } - - error = xfs_read_agi(mp, NULL, pag->pag_agno, &agibp); - if (error) { - xfs_alert(mp, "%s agi read failed agno %d error %d", - __func__, pag->pag_agno, error); - } else { - struct xfs_agi *agi = agibp->b_addr; - - itotal += be32_to_cpu(agi->agi_count); - ifree += be32_to_cpu(agi->agi_freecount); - xfs_buf_relse(agibp); - } - } -} -#endif /* DEBUG */ From 93e6aa4329d07fa01e1016446464c666c5b49b5c Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Sun, 22 May 2022 16:47:11 +1000 Subject: [PATCH 253/334] xfs: reduce IOCB_NOWAIT judgment for retry exclusive unaligned DIO Retry unaligned DIO with exclusive blocking semantics only when the IOCB_NOWAIT flag is not set. If we are doing nonblocking user I/O, propagate the error directly. Signed-off-by: Kaixu Xia Reviewed-by: Chaitanya Kulkarni Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index af954a5b71f8..e2f2a3a94634 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -576,9 +576,9 @@ xfs_file_dio_write_unaligned( * don't even bother trying the fast path in this case. */ if (iocb->ki_pos > isize || iocb->ki_pos + count >= isize) { -retry_exclusive: if (iocb->ki_flags & IOCB_NOWAIT) return -EAGAIN; +retry_exclusive: iolock = XFS_IOLOCK_EXCL; flags = IOMAP_DIO_FORCE_WAIT; } From 73c348d4ab5c34aa2186071e66eaa5700d35e104 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Sun, 22 May 2022 16:47:17 +1000 Subject: [PATCH 254/334] xfs: Remove duplicate include Clean up the following includecheck warning: ./fs/xfs/xfs_attr_item.c: xfs_inode.h is included more than once. Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Reviewed-by: Darrick J. Wong Signed-off-by: Dave Chinner --- fs/xfs/xfs_attr_item.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index e8ac88d9fd14..bf9a01b083b8 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -22,7 +22,6 @@ #include "xfs_attr.h" #include "xfs_attr_item.h" #include "xfs_trace.h" -#include "xfs_inode.h" #include "xfs_trans_space.h" #include "xfs_errortag.h" #include "xfs_error.h" From 22a68ba724232ba675166c307ddef3749ae4c37c Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 23 May 2022 08:41:03 +1000 Subject: [PATCH 255/334] xfs: do not use logged xattr updates on V4 filesystems V4 superblocks do not contain the log_incompat feature bit, which means that we cannot protect xattr log items against kernels that are too old to know how to recover them. Turn off the log items for such filesystems and adjust the "delayed" name to reflect what it's really controlling. Signed-off-by: Darrick J. 
Wong Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_attr.c | 6 +++--- fs/xfs/libxfs/xfs_attr.h | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 79615727f8c2..9f14aca29ec4 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -982,7 +982,7 @@ xfs_attr_set( int error, local; int rmt_blks = 0; unsigned int total; - int delayed = xfs_has_larp(mp); + bool use_logging = xfs_has_larp(mp); if (xfs_is_shutdown(dp->i_mount)) return -EIO; @@ -1027,7 +1027,7 @@ xfs_attr_set( rmt_blks = xfs_attr3_rmt_blocks(mp, XFS_XATTR_SIZE_MAX); } - if (delayed) { + if (use_logging) { error = xfs_attr_use_log_assist(mp); if (error) return error; @@ -1101,7 +1101,7 @@ xfs_attr_set( out_unlock: xfs_iunlock(dp, XFS_ILOCK_EXCL); drop_incompat: - if (delayed) + if (use_logging) xlog_drop_incompat_feat(mp->m_log); return error; diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h index b88b6d74e4fc..3cd9cbb68b0f 100644 --- a/fs/xfs/libxfs/xfs_attr.h +++ b/fs/xfs/libxfs/xfs_attr.h @@ -31,7 +31,8 @@ struct xfs_attr_list_context; static inline bool xfs_has_larp(struct xfs_mount *mp) { #ifdef DEBUG - return xfs_globals.larp; + /* Logged xattrs require a V5 super for log_incompat */ + return xfs_has_crc(mp) && xfs_globals.larp; #else return false; #endif From 4183e4f27f402d712bccab30588a6fe7575963c0 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 23 May 2022 08:43:46 +1000 Subject: [PATCH 256/334] xfs: share xattr name and value buffers when logging xattr updates While running xfs/297 and generic/642, I noticed a crash in xfs_attri_item_relog when it tries to copy the attr name to the new xattri log item. I think what happened here was that we called ->iop_commit on the old attri item (which nulls out the pointers) as part of a log force at the same time that a chained attr operation was ongoing. The system was busy enough that at some later point, the defer ops operation decided it was necessary to relog the attri log item, but as we've detached the name buffer from the old attri log item, we can't copy it to the new one, and kaboom. I think there's a broader refcounting problem with LARP mode -- the setxattr code can return to userspace before the CIL actually formats and commits the log item, which results in a UAF bug. Therefore, the xattr log item needs to be able to retain a reference to the name and value buffers until the log items have completely cleared the log. Furthermore, each time we create an intent log item, we allocate new memory and (re)copy the contents; sharing here would be very useful. Solve the UAF and the unnecessary memory allocations by having the log code create a single refcounted buffer to contain the name and value contents. This buffer can be passed from old to new during a relog operation, and the logging code can (optionally) attach it to the xfs_attr_item for reuse when LARP mode is enabled. This also fixes a problem where the xfs_attri_log_item objects weren't being freed back to the same cache where they came from. Signed-off-by: Darrick J. 
Wong Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_attr.h | 8 ++ fs/xfs/libxfs/xfs_defer.c | 59 +++++++-- fs/xfs/xfs_attr_item.c | 268 +++++++++++++++++++++----------------- fs/xfs/xfs_attr_item.h | 13 +- fs/xfs/xfs_log.h | 7 + 5 files changed, 222 insertions(+), 133 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h index 3cd9cbb68b0f..e329da3e7afa 100644 --- a/fs/xfs/libxfs/xfs_attr.h +++ b/fs/xfs/libxfs/xfs_attr.h @@ -502,6 +502,8 @@ enum xfs_delattr_state { { XFS_DAS_NODE_REMOVE_ATTR, "XFS_DAS_NODE_REMOVE_ATTR" }, \ { XFS_DAS_DONE, "XFS_DAS_DONE" } +struct xfs_attri_log_nameval; + /* * Context used for keeping track of delayed attribute operations */ @@ -517,6 +519,12 @@ struct xfs_attr_intent { struct xfs_da_args *xattri_da_args; + /* + * Shared buffer containing the attr name and value so that the logging + * code can share large memory buffers between log items. + */ + struct xfs_attri_log_nameval *xattri_nameval; + /* * Used by xfs_attr_set to hold a leaf buffer across a transaction roll */ diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index ace229c1d251..5a321b783398 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -191,35 +191,56 @@ static const struct xfs_defer_op_type *defer_op_types[] = { [XFS_DEFER_OPS_TYPE_ATTR] = &xfs_attr_defer_type, }; -static bool +/* + * Ensure there's a log intent item associated with this deferred work item if + * the operation must be restarted on crash. Returns 1 if there's a log item; + * 0 if there isn't; or a negative errno. + */ +static int xfs_defer_create_intent( struct xfs_trans *tp, struct xfs_defer_pending *dfp, bool sort) { const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type]; + struct xfs_log_item *lip; - if (!dfp->dfp_intent) - dfp->dfp_intent = ops->create_intent(tp, &dfp->dfp_work, - dfp->dfp_count, sort); - return dfp->dfp_intent != NULL; + if (dfp->dfp_intent) + return 1; + + lip = ops->create_intent(tp, &dfp->dfp_work, dfp->dfp_count, sort); + if (!lip) + return 0; + if (IS_ERR(lip)) + return PTR_ERR(lip); + + dfp->dfp_intent = lip; + return 1; } /* * For each pending item in the intake list, log its intent item and the * associated extents, then add the entire intake list to the end of * the pending list. + * + * Returns 1 if at least one log item was associated with the deferred work; + * 0 if there are no log items; or a negative errno. */ -static bool +static int xfs_defer_create_intents( struct xfs_trans *tp) { struct xfs_defer_pending *dfp; - bool ret = false; + int ret = 0; list_for_each_entry(dfp, &tp->t_dfops, dfp_list) { + int ret2; + trace_xfs_defer_create_intent(tp->t_mountp, dfp); - ret |= xfs_defer_create_intent(tp, dfp, true); + ret2 = xfs_defer_create_intent(tp, dfp, true); + if (ret2 < 0) + return ret2; + ret |= ret2; } return ret; } @@ -457,6 +478,8 @@ xfs_defer_finish_one( dfp->dfp_count--; error = ops->finish_item(tp, dfp->dfp_done, li, &state); if (error == -EAGAIN) { + int ret; + /* * Caller wants a fresh transaction; put the work item * back on the list and log a new log intent item to @@ -467,7 +490,9 @@ xfs_defer_finish_one( dfp->dfp_count++; dfp->dfp_done = NULL; dfp->dfp_intent = NULL; - xfs_defer_create_intent(tp, dfp, false); + ret = xfs_defer_create_intent(tp, dfp, false); + if (ret < 0) + error = ret; } if (error) @@ -514,10 +539,14 @@ xfs_defer_finish_noroll( * of time that any one intent item can stick around in memory, * pinning the log tail. 
*/ - bool has_intents = xfs_defer_create_intents(*tp); + int has_intents = xfs_defer_create_intents(*tp); list_splice_init(&(*tp)->t_dfops, &dop_pending); + if (has_intents < 0) { + error = has_intents; + goto out_shutdown; + } if (has_intents || dfp) { error = xfs_defer_trans_roll(tp); if (error) @@ -676,13 +705,15 @@ xfs_defer_ops_capture( if (list_empty(&tp->t_dfops)) return NULL; + error = xfs_defer_create_intents(tp); + if (error < 0) + return ERR_PTR(error); + /* Create an object to capture the defer ops. */ dfc = kmem_zalloc(sizeof(*dfc), KM_NOFS); INIT_LIST_HEAD(&dfc->dfc_list); INIT_LIST_HEAD(&dfc->dfc_dfops); - xfs_defer_create_intents(tp); - /* Move the dfops chain and transaction state to the capture struct. */ list_splice_init(&tp->t_dfops, &dfc->dfc_dfops); dfc->dfc_tpflags = tp->t_flags & XFS_TRANS_LOWMODE; @@ -759,6 +790,10 @@ xfs_defer_ops_capture_and_commit( /* If we don't capture anything, commit transaction and exit. */ dfc = xfs_defer_ops_capture(tp); + if (IS_ERR(dfc)) { + xfs_trans_cancel(tp); + return PTR_ERR(dfc); + } if (!dfc) return xfs_trans_commit(tp); diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index d6f7b6d9402a..028c358a7c90 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -42,12 +42,80 @@ static inline struct xfs_attri_log_item *ATTRI_ITEM(struct xfs_log_item *lip) return container_of(lip, struct xfs_attri_log_item, attri_item); } +/* + * Shared xattr name/value buffers for logged extended attribute operations + * + * When logging updates to extended attributes, we can create quite a few + * attribute log intent items for a single xattr update. To avoid cycling the + * memory allocator and memcpy overhead, the name (and value, for setxattr) + * are kept in a refcounted object that is shared across all related log items + * and the upper-level deferred work state structure. The shared buffer has + * a control structure, followed by the name, and then the value. + */ + +static inline struct xfs_attri_log_nameval * +xfs_attri_log_nameval_get( + struct xfs_attri_log_nameval *nv) +{ + if (!refcount_inc_not_zero(&nv->refcount)) + return NULL; + return nv; +} + +static inline void +xfs_attri_log_nameval_put( + struct xfs_attri_log_nameval *nv) +{ + if (!nv) + return; + if (refcount_dec_and_test(&nv->refcount)) + kvfree(nv); +} + +static inline struct xfs_attri_log_nameval * +xfs_attri_log_nameval_alloc( + const void *name, + unsigned int name_len, + const void *value, + unsigned int value_len) +{ + struct xfs_attri_log_nameval *nv; + + /* + * This could be over 64kB in length, so we have to use kvmalloc() for + * this. But kvmalloc() utterly sucks, so we use our own version. 
+ */ + nv = xlog_kvmalloc(sizeof(struct xfs_attri_log_nameval) + + name_len + value_len); + if (!nv) + return nv; + + nv->name.i_addr = nv + 1; + nv->name.i_len = name_len; + nv->name.i_type = XLOG_REG_TYPE_ATTR_NAME; + memcpy(nv->name.i_addr, name, name_len); + + if (value_len) { + nv->value.i_addr = nv->name.i_addr + name_len; + nv->value.i_len = value_len; + memcpy(nv->value.i_addr, value, value_len); + } else { + nv->value.i_addr = NULL; + nv->value.i_len = 0; + } + nv->value.i_type = XLOG_REG_TYPE_ATTR_VALUE; + + refcount_set(&nv->refcount, 1); + return nv; +} + STATIC void xfs_attri_item_free( struct xfs_attri_log_item *attrip) { kmem_free(attrip->attri_item.li_lv_shadow); - kvfree(attrip); + xfs_attri_log_nameval_put(attrip->attri_nameval); + kmem_cache_free(xfs_attri_cache, attrip); } /* @@ -76,16 +144,17 @@ xfs_attri_item_size( int *nbytes) { struct xfs_attri_log_item *attrip = ATTRI_ITEM(lip); + struct xfs_attri_log_nameval *nv = attrip->attri_nameval; *nvecs += 2; *nbytes += sizeof(struct xfs_attri_log_format) + - xlog_calc_iovec_len(attrip->attri_name_len); + xlog_calc_iovec_len(nv->name.i_len); - if (!attrip->attri_value_len) + if (!nv->value.i_len) return; *nvecs += 1; - *nbytes += xlog_calc_iovec_len(attrip->attri_value_len); + *nbytes += xlog_calc_iovec_len(nv->value.i_len); } /* @@ -100,6 +169,7 @@ xfs_attri_item_format( { struct xfs_attri_log_item *attrip = ATTRI_ITEM(lip); struct xfs_log_iovec *vecp = NULL; + struct xfs_attri_log_nameval *nv = attrip->attri_nameval; attrip->attri_format.alfi_type = XFS_LI_ATTRI; attrip->attri_format.alfi_size = 1; @@ -111,22 +181,18 @@ xfs_attri_item_format( * the log recovery. */ - ASSERT(attrip->attri_name_len > 0); + ASSERT(nv->name.i_len > 0); attrip->attri_format.alfi_size++; - if (attrip->attri_value_len > 0) + if (nv->value.i_len > 0) attrip->attri_format.alfi_size++; xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ATTRI_FORMAT, &attrip->attri_format, sizeof(struct xfs_attri_log_format)); - xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ATTR_NAME, - attrip->attri_name, - attrip->attri_name_len); - if (attrip->attri_value_len > 0) - xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ATTR_VALUE, - attrip->attri_value, - attrip->attri_value_len); + xlog_copy_from_iovec(lv, &vecp, &nv->name); + if (nv->value.i_len > 0) + xlog_copy_from_iovec(lv, &vecp, &nv->value); } /* @@ -161,41 +227,18 @@ xfs_attri_item_release( STATIC struct xfs_attri_log_item * xfs_attri_init( struct xfs_mount *mp, - uint32_t name_len, - uint32_t value_len) - + struct xfs_attri_log_nameval *nv) { struct xfs_attri_log_item *attrip; - uint32_t buffer_size = name_len + value_len; - if (buffer_size) { - /* - * This could be over 64kB in length, so we have to use - * kvmalloc() for this. But kvmalloc() utterly sucks, so we - * use own version. - */ - attrip = xlog_kvmalloc(sizeof(struct xfs_attri_log_item) + - buffer_size); - } else { - attrip = kmem_cache_alloc(xfs_attri_cache, - GFP_NOFS | __GFP_NOFAIL); - } - memset(attrip, 0, sizeof(struct xfs_attri_log_item)); + attrip = kmem_cache_zalloc(xfs_attri_cache, GFP_NOFS | __GFP_NOFAIL); - attrip->attri_name_len = name_len; - if (name_len) - attrip->attri_name = ((char *)attrip) + - sizeof(struct xfs_attri_log_item); - else - attrip->attri_name = NULL; - - attrip->attri_value_len = value_len; - if (value_len) - attrip->attri_value = ((char *)attrip) + - sizeof(struct xfs_attri_log_item) + - name_len; - else - attrip->attri_value = NULL; + /* + * Grab an extra reference to the name/value buffer for this log item. 
+ * The caller retains its own reference! + */ + attrip->attri_nameval = xfs_attri_log_nameval_get(nv); + ASSERT(attrip->attri_nameval); xfs_log_item_init(mp, &attrip->attri_item, XFS_LI_ATTRI, &xfs_attri_item_ops); @@ -354,17 +397,10 @@ xfs_attr_log_item( attrp->alfi_ino = attr->xattri_da_args->dp->i_ino; ASSERT(!(attr->xattri_op_flags & ~XFS_ATTRI_OP_FLAGS_TYPE_MASK)); attrp->alfi_op_flags = attr->xattri_op_flags; - attrp->alfi_value_len = attr->xattri_da_args->valuelen; - attrp->alfi_name_len = attr->xattri_da_args->namelen; + attrp->alfi_value_len = attr->xattri_nameval->value.i_len; + attrp->alfi_name_len = attr->xattri_nameval->name.i_len; ASSERT(!(attr->xattri_da_args->attr_filter & ~XFS_ATTRI_FILTER_MASK)); attrp->alfi_attr_filter = attr->xattri_da_args->attr_filter; - - memcpy(attrip->attri_name, attr->xattri_da_args->name, - attr->xattri_da_args->namelen); - memcpy(attrip->attri_value, attr->xattri_da_args->value, - attr->xattri_da_args->valuelen); - attrip->attri_name_len = attr->xattri_da_args->namelen; - attrip->attri_value_len = attr->xattri_da_args->valuelen; } /* Get an ATTRI. */ @@ -388,15 +424,30 @@ xfs_attr_create_intent( * Each attr item only performs one attribute operation at a time, so * this is a list of one */ - list_for_each_entry(attr, items, xattri_list) { - attrip = xfs_attri_init(mp, attr->xattri_da_args->namelen, - attr->xattri_da_args->valuelen); - if (attrip == NULL) - return NULL; + attr = list_first_entry_or_null(items, struct xfs_attr_intent, + xattri_list); - xfs_trans_add_item(tp, &attrip->attri_item); - xfs_attr_log_item(tp, attrip, attr); + /* + * Create a buffer to store the attribute name and value. This buffer + * will be shared between the higher level deferred xattr work state + * and the lower level xattr log items. + */ + if (!attr->xattri_nameval) { + struct xfs_da_args *args = attr->xattri_da_args; + + /* + * Transfer our reference to the name/value buffer to the + * deferred work state structure. + */ + attr->xattri_nameval = xfs_attri_log_nameval_alloc(args->name, + args->namelen, args->value, args->valuelen); } + if (!attr->xattri_nameval) + return ERR_PTR(-ENOMEM); + + attrip = xfs_attri_init(mp, attr->xattri_nameval); + xfs_trans_add_item(tp, &attrip->attri_item); + xfs_attr_log_item(tp, attrip, attr); return &attrip->attri_item; } @@ -407,6 +458,7 @@ xfs_attr_free_item( { if (attr->xattri_da_state) xfs_da_state_free(attr->xattri_da_state); + xfs_attri_log_nameval_put(attr->xattri_nameval); if (attr->xattri_da_args->op_flags & XFS_DA_OP_RECOVERY) kmem_free(attr); else @@ -461,29 +513,6 @@ xfs_attr_cancel_item( xfs_attr_free_item(attr); } -STATIC xfs_lsn_t -xfs_attri_item_committed( - struct xfs_log_item *lip, - xfs_lsn_t lsn) -{ - struct xfs_attri_log_item *attrip = ATTRI_ITEM(lip); - - /* - * The attrip refers to xfs_attr_item memory to log the name and value - * with the intent item. This already occurred when the intent was - * committed so these fields are no longer accessed. Clear them out of - * caution since we're about to free the xfs_attr_item. - */ - attrip->attri_name = NULL; - attrip->attri_value = NULL; - - /* - * The ATTRI is logged only once and cannot be moved in the log, so - * simply return the lsn at which it's been logged. 
- */ - return lsn; -} - STATIC bool xfs_attri_item_match( struct xfs_log_item *lip, @@ -547,6 +576,7 @@ xfs_attri_item_recover( struct xfs_trans *tp; struct xfs_trans_res tres; struct xfs_attri_log_format *attrp; + struct xfs_attri_log_nameval *nv = attrip->attri_nameval; int error, ret = 0; int total; int local; @@ -558,7 +588,7 @@ xfs_attri_item_recover( */ attrp = &attrip->attri_format; if (!xfs_attri_validate(mp, attrp) || - !xfs_attr_namecheck(attrip->attri_name, attrip->attri_name_len)) + !xfs_attr_namecheck(nv->name.i_addr, nv->name.i_len)) return -EFSCORRUPTED; error = xlog_recover_iget(mp, attrp->alfi_ino, &ip); @@ -573,11 +603,19 @@ xfs_attri_item_recover( attr->xattri_op_flags = attrp->alfi_op_flags & XFS_ATTRI_OP_FLAGS_TYPE_MASK; + /* + * We're reconstructing the deferred work state structure from the + * recovered log item. Grab a reference to the name/value buffer and + * attach it to the new work state. + */ + attr->xattri_nameval = xfs_attri_log_nameval_get(nv); + ASSERT(attr->xattri_nameval); + args->dp = ip; args->geo = mp->m_attr_geo; args->whichfork = XFS_ATTR_FORK; - args->name = attrip->attri_name; - args->namelen = attrp->alfi_name_len; + args->name = nv->name.i_addr; + args->namelen = nv->name.i_len; args->hashval = xfs_da_hashname(args->name, args->namelen); args->attr_filter = attrp->alfi_attr_filter & XFS_ATTRI_FILTER_MASK; args->op_flags = XFS_DA_OP_RECOVERY | XFS_DA_OP_OKNOENT; @@ -585,8 +623,8 @@ xfs_attri_item_recover( switch (attr->xattri_op_flags) { case XFS_ATTRI_OP_FLAGS_SET: case XFS_ATTRI_OP_FLAGS_REPLACE: - args->value = attrip->attri_value; - args->valuelen = attrp->alfi_value_len; + args->value = nv->value.i_addr; + args->valuelen = nv->value.i_len; args->total = xfs_attr_calc_size(args, &local); if (xfs_inode_hasattr(args->dp)) attr->xattri_dela_state = xfs_attr_init_replace_state(args); @@ -660,8 +698,11 @@ xfs_attri_item_relog( attrdp = xfs_trans_get_attrd(tp, old_attrip); set_bit(XFS_LI_DIRTY, &attrdp->attrd_item.li_flags); - new_attrip = xfs_attri_init(tp->t_mountp, old_attrp->alfi_name_len, - old_attrp->alfi_value_len); + /* + * Create a new log item that shares the same name/value buffer as the + * old log item. 
+ */ + new_attrip = xfs_attri_init(tp->t_mountp, old_attrip->attri_nameval); new_attrp = &new_attrip->attri_format; new_attrp->alfi_ino = old_attrp->alfi_ino; @@ -670,13 +711,6 @@ xfs_attri_item_relog( new_attrp->alfi_name_len = old_attrp->alfi_name_len; new_attrp->alfi_attr_filter = old_attrp->alfi_attr_filter; - memcpy(new_attrip->attri_name, old_attrip->attri_name, - new_attrip->attri_name_len); - - if (new_attrip->attri_value_len > 0) - memcpy(new_attrip->attri_value, old_attrip->attri_value, - new_attrip->attri_value_len); - xfs_trans_add_item(tp, &new_attrip->attri_item); set_bit(XFS_LI_DIRTY, &new_attrip->attri_item.li_flags); @@ -690,14 +724,15 @@ xlog_recover_attri_commit_pass2( struct xlog_recover_item *item, xfs_lsn_t lsn) { - int error; struct xfs_mount *mp = log->l_mp; struct xfs_attri_log_item *attrip; struct xfs_attri_log_format *attri_formatp; + struct xfs_attri_log_nameval *nv; + const void *attr_value = NULL; const void *attr_name; - int region = 0; + int error; - attri_formatp = item->ri_buf[region].i_addr; + attri_formatp = item->ri_buf[0].i_addr; attr_name = item->ri_buf[1].i_addr; /* Validate xfs_attri_log_format before the large memory allocation */ @@ -711,27 +746,25 @@ xlog_recover_attri_commit_pass2( return -EFSCORRUPTED; } - /* memory alloc failure will cause replay to abort */ - attrip = xfs_attri_init(mp, attri_formatp->alfi_name_len, - attri_formatp->alfi_value_len); - if (attrip == NULL) + if (attri_formatp->alfi_value_len) + attr_value = item->ri_buf[2].i_addr; + + /* + * Memory alloc failure will cause replay to abort. We attach the + * name/value buffer to the recovered incore log item and drop our + * reference. + */ + nv = xfs_attri_log_nameval_alloc(attr_name, + attri_formatp->alfi_name_len, attr_value, + attri_formatp->alfi_value_len); + if (!nv) return -ENOMEM; - error = xfs_attri_copy_format(&item->ri_buf[region], - &attrip->attri_format); + attrip = xfs_attri_init(mp, nv); + error = xfs_attri_copy_format(&item->ri_buf[0], &attrip->attri_format); if (error) goto out; - region++; - memcpy(attrip->attri_name, item->ri_buf[region].i_addr, - attrip->attri_name_len); - - if (attrip->attri_value_len > 0) { - region++; - memcpy(attrip->attri_value, item->ri_buf[region].i_addr, - attrip->attri_value_len); - } - /* * The ATTRI has two references. One for the ATTRD and one for ATTRI to * ensure it makes it into the AIL. Insert the ATTRI into the AIL @@ -740,9 +773,11 @@ xlog_recover_attri_commit_pass2( */ xfs_trans_ail_insert(log->l_ailp, &attrip->attri_item, lsn); xfs_attri_release(attrip); + xfs_attri_log_nameval_put(nv); return 0; out: xfs_attri_item_free(attrip); + xfs_attri_log_nameval_put(nv); return error; } @@ -822,7 +857,6 @@ static const struct xfs_item_ops xfs_attri_item_ops = { .iop_size = xfs_attri_item_size, .iop_format = xfs_attri_item_format, .iop_unpin = xfs_attri_item_unpin, - .iop_committed = xfs_attri_item_committed, .iop_release = xfs_attri_item_release, .iop_recover = xfs_attri_item_recover, .iop_match = xfs_attri_item_match, diff --git a/fs/xfs/xfs_attr_item.h b/fs/xfs/xfs_attr_item.h index a40e702e0215..3280a7930287 100644 --- a/fs/xfs/xfs_attr_item.h +++ b/fs/xfs/xfs_attr_item.h @@ -11,6 +11,14 @@ struct xfs_mount; struct kmem_zone; +struct xfs_attri_log_nameval { + struct xfs_log_iovec name; + struct xfs_log_iovec value; + refcount_t refcount; + + /* name and value follow the end of this struct */ +}; + /* * This is the "attr intention" log item. 
It is used to log the fact that some * extended attribute operations need to be processed. An operation is @@ -26,10 +34,7 @@ struct kmem_zone; struct xfs_attri_log_item { struct xfs_log_item attri_item; atomic_t attri_refcount; - int attri_name_len; - int attri_value_len; - void *attri_name; - void *attri_value; + struct xfs_attri_log_nameval *attri_nameval; struct xfs_attri_log_format attri_format; }; diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 252b098cde1f..f3ce046a7d45 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -86,6 +86,13 @@ xlog_copy_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp, return buf; } +static inline void * +xlog_copy_from_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp, + const struct xfs_log_iovec *src) +{ + return xlog_copy_iovec(lv, vecp, src->i_type, src->i_addr, src->i_len); +} + /* * By comparing each component, we don't have to worry about extra * endian issues in treating two 32 bit numbers as one 64 bit number From a3da1ab6fbea7d5cbcb796f62c8771d8ebd7282a Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 19 May 2022 14:03:48 -0300 Subject: [PATCH 257/334] vfio: Do not manipulate iommu dma_owner for fake iommu groups Since asserting dma ownership now causes the group to have its DMA blocked the iommu layer requires a working iommu. This means the dma_owner APIs cannot be used on the fake groups that VFIO creates. Test for this and avoid calling them. Otherwise asserting dma ownership will fail for VFIO mdev devices as a BLOCKING iommu_domain cannot be allocated due to the NULL iommu ops. Fixes: 0286300e6045 ("iommu: iommu_group_claim_dma_owner() must always assign a domain") Reported-by: Eric Farman Tested-by: Eric Farman Signed-off-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/0-v1-9cfc47edbcd4+13546-vfio_dma_owner_fix_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index cfcff7764403..f5ed03897210 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -927,7 +927,8 @@ static void __vfio_group_unset_container(struct vfio_group *group) driver->ops->detach_group(container->iommu_data, group->iommu_group); - iommu_group_release_dma_owner(group->iommu_group); + if (group->type == VFIO_IOMMU) + iommu_group_release_dma_owner(group->iommu_group); group->container = NULL; group->container_users = 0; @@ -1001,9 +1002,11 @@ static int vfio_group_set_container(struct vfio_group *group, int container_fd) goto unlock_out; } - ret = iommu_group_claim_dma_owner(group->iommu_group, f.file); - if (ret) - goto unlock_out; + if (group->type == VFIO_IOMMU) { + ret = iommu_group_claim_dma_owner(group->iommu_group, f.file); + if (ret) + goto unlock_out; + } driver = container->iommu_driver; if (driver) { @@ -1011,7 +1014,9 @@ static int vfio_group_set_container(struct vfio_group *group, int container_fd) group->iommu_group, group->type); if (ret) { - iommu_group_release_dma_owner(group->iommu_group); + if (group->type == VFIO_IOMMU) + iommu_group_release_dma_owner( + group->iommu_group); goto unlock_out; } } From c490513c818d1ec61aff1614f5d0e38de680665f Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 19 May 2022 20:14:01 -0300 Subject: [PATCH 258/334] vfio/pci: Add driver_managed_dma to the new vfio_pci drivers When the iommu series adding driver_managed_dma was rebased it missed that new VFIO drivers were added 
and did not update them too. Without this, vfio will claim the groups are not viable. Add driver_managed_dma to mlx5 and hisi. Fixes: 70693f470848 ("vfio: Set DMA ownership for VFIO devices") Reported-by: Yishai Hadas Signed-off-by: Jason Gunthorpe Tested-by: Shameer Kolothum Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/0-v1-f9dfa642fab0+2b3-vfio_managed_dma_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 1 + drivers/vfio/pci/mlx5/main.c | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c index e92376837b29..4def43f5f7b6 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c @@ -1323,6 +1323,7 @@ static struct pci_driver hisi_acc_vfio_pci_driver = { .probe = hisi_acc_vfio_pci_probe, .remove = hisi_acc_vfio_pci_remove, .err_handler = &hisi_acc_vf_err_handlers, + .driver_managed_dma = true, }; module_pci_driver(hisi_acc_vfio_pci_driver); diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index dd1009b5ff9c..0558d0649ddb 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -641,6 +641,7 @@ static struct pci_driver mlx5vf_pci_driver = { .probe = mlx5vf_pci_probe, .remove = mlx5vf_pci_remove, .err_handler = &mlx5vf_err_handlers, + .driver_managed_dma = true, }; static void __exit mlx5vf_pci_cleanup(void) From 759820c92a346dd18daaa8873f5cabe731d8a31c Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sat, 21 May 2022 13:11:43 +0200 Subject: [PATCH 259/334] f2fs: fix typo in comment Spelling mistake (triple letters) in comment. Detected with the help of Coccinelle. Signed-off-by: Julia Lawall Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 52c34651f469..f4d03a81ef3f 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1757,7 +1757,7 @@ struct f2fs_sb_info { unsigned int data_io_flag; unsigned int node_io_flag; - /* For sysfs suppport */ + /* For sysfs support */ struct kobject s_kobj; /* /sys/fs/f2fs/ */ struct completion s_kobj_unregister; From 5d7c854593a460706dacf8e1b16c9bdcb1c2d7bb Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 28 Mar 2022 08:26:48 +0200 Subject: [PATCH 260/334] livepatch: Remove klp_arch_set_pc() and asm/livepatch.h All three versions of klp_arch_set_pc() do exactly the same thing: they call ftrace_instruction_pointer_set(). Call ftrace_instruction_pointer_set() directly and remove klp_arch_set_pc(). As klp_arch_set_pc() was the only thing remaining in asm/livepatch.h on x86 and s390, remove asm/livepatch.h. livepatch.h remains on powerpc, but its content is exclusively used by powerpc-specific code.
Signed-off-by: Christophe Leroy Acked-by: Petr Mladek Acked-by: Peter Zijlstra (Intel) Acked-by: Miroslav Benes Signed-off-by: Petr Mladek --- MAINTAINERS | 2 -- arch/powerpc/include/asm/livepatch.h | 10 +--------- arch/powerpc/kernel/irq.c | 1 - arch/powerpc/kernel/setup_64.c | 2 +- arch/s390/include/asm/livepatch.h | 22 ---------------------- arch/x86/include/asm/livepatch.h | 20 -------------------- include/linux/livepatch.h | 2 -- kernel/livepatch/patch.c | 2 +- 8 files changed, 3 insertions(+), 58 deletions(-) delete mode 100644 arch/s390/include/asm/livepatch.h delete mode 100644 arch/x86/include/asm/livepatch.h diff --git a/MAINTAINERS b/MAINTAINERS index 741825cff43c..589c61412e3b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11338,8 +11338,6 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/livepatching/livepatching.g F: Documentation/ABI/testing/sysfs-kernel-livepatch F: Documentation/livepatch/ F: arch/powerpc/include/asm/livepatch.h -F: arch/s390/include/asm/livepatch.h -F: arch/x86/include/asm/livepatch.h F: include/linux/livepatch.h F: kernel/livepatch/ F: lib/livepatch/ diff --git a/arch/powerpc/include/asm/livepatch.h b/arch/powerpc/include/asm/livepatch.h index 1c60094ea0cd..d044a1fd4f44 100644 --- a/arch/powerpc/include/asm/livepatch.h +++ b/arch/powerpc/include/asm/livepatch.h @@ -7,17 +7,9 @@ #ifndef _ASM_POWERPC_LIVEPATCH_H #define _ASM_POWERPC_LIVEPATCH_H -#include -#include +#include #include -#ifdef CONFIG_LIVEPATCH -static inline void klp_arch_set_pc(struct ftrace_regs *fregs, unsigned long ip) -{ - ftrace_instruction_pointer_set(fregs, ip); -} -#endif /* CONFIG_LIVEPATCH */ - #ifdef CONFIG_LIVEPATCH_64 static inline void klp_init_thread_info(struct task_struct *p) { diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 752fb182eacb..fb99e3fa034d 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -63,7 +63,6 @@ #include #include #include -#include #include #include diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index e547066a06aa..2c1e9a5ac6ca 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -59,7 +59,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/s390/include/asm/livepatch.h b/arch/s390/include/asm/livepatch.h deleted file mode 100644 index 5209f223331a..000000000000 --- a/arch/s390/include/asm/livepatch.h +++ /dev/null @@ -1,22 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0+ */ -/* - * livepatch.h - s390-specific Kernel Live Patching Core - * - * Copyright (c) 2013-2015 SUSE - * Authors: Jiri Kosina - * Vojtech Pavlik - * Jiri Slaby - */ - -#ifndef ASM_LIVEPATCH_H -#define ASM_LIVEPATCH_H - -#include -#include - -static inline void klp_arch_set_pc(struct ftrace_regs *fregs, unsigned long ip) -{ - ftrace_instruction_pointer_set(fregs, ip); -} - -#endif diff --git a/arch/x86/include/asm/livepatch.h b/arch/x86/include/asm/livepatch.h deleted file mode 100644 index 7c5cc6660e4b..000000000000 --- a/arch/x86/include/asm/livepatch.h +++ /dev/null @@ -1,20 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * livepatch.h - x86-specific Kernel Live Patching Core - * - * Copyright (C) 2014 Seth Jennings - * Copyright (C) 2014 SUSE - */ - -#ifndef _ASM_X86_LIVEPATCH_H -#define _ASM_X86_LIVEPATCH_H - -#include -#include - -static inline void klp_arch_set_pc(struct ftrace_regs *fregs, unsigned long ip) -{ - ftrace_instruction_pointer_set(fregs, ip); -} - -#endif /* _ASM_X86_LIVEPATCH_H */ diff --git 
a/include/linux/livepatch.h b/include/linux/livepatch.h index 2614247a9781..293e29960c6e 100644 --- a/include/linux/livepatch.h +++ b/include/linux/livepatch.h @@ -16,8 +16,6 @@ #if IS_ENABLED(CONFIG_LIVEPATCH) -#include - /* task patch states */ #define KLP_UNDEFINED -1 #define KLP_UNPATCHED 0 diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c index c172bf92b576..4c4f5a776d80 100644 --- a/kernel/livepatch/patch.c +++ b/kernel/livepatch/patch.c @@ -118,7 +118,7 @@ static void notrace klp_ftrace_handler(unsigned long ip, if (func->nop) goto unlock; - klp_arch_set_pc(fregs, (unsigned long)func->new_func); + ftrace_instruction_pointer_set(fregs, (unsigned long)func->new_func); unlock: ftrace_test_recursion_unlock(bit); From 421cfe6596f6cb316991c02bf30a93bd81092853 Mon Sep 17 00:00:00 2001 From: Matthew Rosato Date: Thu, 19 May 2022 14:33:11 -0400 Subject: [PATCH 261/334] vfio: remove VFIO_GROUP_NOTIFY_SET_KVM Rather than relying on a notifier for associating the KVM with the group, let's assume that the association has already been made prior to device_open. The first time a device is opened associate the group KVM with the device. This fixes a user-triggerable oops in GVT. Reviewed-by: Tony Krowiak Reviewed-by: Kevin Tian Reviewed-by: Christoph Hellwig Signed-off-by: Jason Gunthorpe Signed-off-by: Matthew Rosato Reviewed-by: Jason Gunthorpe Acked-by: Zhi Wang Link: https://lore.kernel.org/r/20220519183311.582380-2-mjrosato@linux.ibm.com Signed-off-by: Alex Williamson --- drivers/gpu/drm/i915/gvt/gtt.c | 4 +- drivers/gpu/drm/i915/gvt/gvt.h | 3 - drivers/gpu/drm/i915/gvt/kvmgt.c | 88 +++++++-------------------- drivers/s390/crypto/vfio_ap_ops.c | 35 ++--------- drivers/s390/crypto/vfio_ap_private.h | 3 - drivers/vfio/vfio.c | 83 +++++++++---------------- include/linux/vfio.h | 6 +- 7 files changed, 60 insertions(+), 162 deletions(-) diff --git a/drivers/gpu/drm/i915/gvt/gtt.c b/drivers/gpu/drm/i915/gvt/gtt.c index 9c5cc2800975..b4f69364f9a1 100644 --- a/drivers/gpu/drm/i915/gvt/gtt.c +++ b/drivers/gpu/drm/i915/gvt/gtt.c @@ -51,7 +51,7 @@ static int preallocated_oos_pages = 8192; static bool intel_gvt_is_valid_gfn(struct intel_vgpu *vgpu, unsigned long gfn) { - struct kvm *kvm = vgpu->kvm; + struct kvm *kvm = vgpu->vfio_device.kvm; int idx; bool ret; @@ -1185,7 +1185,7 @@ static int is_2MB_gtt_possible(struct intel_vgpu *vgpu, if (!vgpu->attached) return -EINVAL; - pfn = gfn_to_pfn(vgpu->kvm, ops->get_pfn(entry)); + pfn = gfn_to_pfn(vgpu->vfio_device.kvm, ops->get_pfn(entry)); if (is_error_noslot_pfn(pfn)) return -EINVAL; return PageTransHuge(pfn_to_page(pfn)); diff --git a/drivers/gpu/drm/i915/gvt/gvt.h b/drivers/gpu/drm/i915/gvt/gvt.h index 2af4c83e733c..aee1a45da74b 100644 --- a/drivers/gpu/drm/i915/gvt/gvt.h +++ b/drivers/gpu/drm/i915/gvt/gvt.h @@ -227,9 +227,6 @@ struct intel_vgpu { struct mutex cache_lock; struct notifier_block iommu_notifier; - struct notifier_block group_notifier; - struct kvm *kvm; - struct work_struct release_work; atomic_t released; struct kvm_page_track_notifier_node track_node; diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c index 7655ffa97d51..e2f6c56ab342 100644 --- a/drivers/gpu/drm/i915/gvt/kvmgt.c +++ b/drivers/gpu/drm/i915/gvt/kvmgt.c @@ -228,8 +228,6 @@ static void intel_gvt_cleanup_vgpu_type_groups(struct intel_gvt *gvt) } } -static void intel_vgpu_release_work(struct work_struct *work); - static void gvt_unpin_guest_page(struct intel_vgpu *vgpu, unsigned long gfn, unsigned long size) { @@ -761,23 +759,6 @@ 
static int intel_vgpu_iommu_notifier(struct notifier_block *nb, return NOTIFY_OK; } -static int intel_vgpu_group_notifier(struct notifier_block *nb, - unsigned long action, void *data) -{ - struct intel_vgpu *vgpu = - container_of(nb, struct intel_vgpu, group_notifier); - - /* the only action we care about */ - if (action == VFIO_GROUP_NOTIFY_SET_KVM) { - vgpu->kvm = data; - - if (!data) - schedule_work(&vgpu->release_work); - } - - return NOTIFY_OK; -} - static bool __kvmgt_vgpu_exist(struct intel_vgpu *vgpu) { struct intel_vgpu *itr; @@ -789,7 +770,7 @@ static bool __kvmgt_vgpu_exist(struct intel_vgpu *vgpu) if (!itr->attached) continue; - if (vgpu->kvm == itr->kvm) { + if (vgpu->vfio_device.kvm == itr->vfio_device.kvm) { ret = true; goto out; } @@ -806,7 +787,6 @@ static int intel_vgpu_open_device(struct vfio_device *vfio_dev) int ret; vgpu->iommu_notifier.notifier_call = intel_vgpu_iommu_notifier; - vgpu->group_notifier.notifier_call = intel_vgpu_group_notifier; events = VFIO_IOMMU_NOTIFY_DMA_UNMAP; ret = vfio_register_notifier(vfio_dev, VFIO_IOMMU_NOTIFY, &events, @@ -817,38 +797,32 @@ static int intel_vgpu_open_device(struct vfio_device *vfio_dev) goto out; } - events = VFIO_GROUP_NOTIFY_SET_KVM; - ret = vfio_register_notifier(vfio_dev, VFIO_GROUP_NOTIFY, &events, - &vgpu->group_notifier); - if (ret != 0) { - gvt_vgpu_err("vfio_register_notifier for group failed: %d\n", - ret); + ret = -EEXIST; + if (vgpu->attached) + goto undo_iommu; + + ret = -ESRCH; + if (!vgpu->vfio_device.kvm || + vgpu->vfio_device.kvm->mm != current->mm) { + gvt_vgpu_err("KVM is required to use Intel vGPU\n"); goto undo_iommu; } - ret = -EEXIST; - if (vgpu->attached) - goto undo_register; - - ret = -ESRCH; - if (!vgpu->kvm || vgpu->kvm->mm != current->mm) { - gvt_vgpu_err("KVM is required to use Intel vGPU\n"); - goto undo_register; - } + kvm_get_kvm(vgpu->vfio_device.kvm); ret = -EEXIST; if (__kvmgt_vgpu_exist(vgpu)) - goto undo_register; + goto undo_iommu; vgpu->attached = true; - kvm_get_kvm(vgpu->kvm); kvmgt_protect_table_init(vgpu); gvt_cache_init(vgpu); vgpu->track_node.track_write = kvmgt_page_track_write; vgpu->track_node.track_flush_slot = kvmgt_page_track_flush_slot; - kvm_page_track_register_notifier(vgpu->kvm, &vgpu->track_node); + kvm_page_track_register_notifier(vgpu->vfio_device.kvm, + &vgpu->track_node); debugfs_create_ulong(KVMGT_DEBUGFS_FILENAME, 0444, vgpu->debugfs, &vgpu->nr_cache_entries); @@ -858,10 +832,6 @@ static int intel_vgpu_open_device(struct vfio_device *vfio_dev) atomic_set(&vgpu->released, 0); return 0; -undo_register: - vfio_unregister_notifier(vfio_dev, VFIO_GROUP_NOTIFY, - &vgpu->group_notifier); - undo_iommu: vfio_unregister_notifier(vfio_dev, VFIO_IOMMU_NOTIFY, &vgpu->iommu_notifier); @@ -880,8 +850,9 @@ static void intel_vgpu_release_msi_eventfd_ctx(struct intel_vgpu *vgpu) } } -static void __intel_vgpu_release(struct intel_vgpu *vgpu) +static void intel_vgpu_close_device(struct vfio_device *vfio_dev) { + struct intel_vgpu *vgpu = vfio_dev_to_vgpu(vfio_dev); struct drm_i915_private *i915 = vgpu->gvt->gt->i915; int ret; @@ -898,35 +869,19 @@ static void __intel_vgpu_release(struct intel_vgpu *vgpu) drm_WARN(&i915->drm, ret, "vfio_unregister_notifier for iommu failed: %d\n", ret); - ret = vfio_unregister_notifier(&vgpu->vfio_device, VFIO_GROUP_NOTIFY, - &vgpu->group_notifier); - drm_WARN(&i915->drm, ret, - "vfio_unregister_notifier for group failed: %d\n", ret); - debugfs_remove(debugfs_lookup(KVMGT_DEBUGFS_FILENAME, vgpu->debugfs)); - 
kvm_page_track_unregister_notifier(vgpu->kvm, &vgpu->track_node); - kvm_put_kvm(vgpu->kvm); + kvm_page_track_unregister_notifier(vgpu->vfio_device.kvm, + &vgpu->track_node); kvmgt_protect_table_destroy(vgpu); gvt_cache_destroy(vgpu); intel_vgpu_release_msi_eventfd_ctx(vgpu); - vgpu->kvm = NULL; vgpu->attached = false; -} -static void intel_vgpu_close_device(struct vfio_device *vfio_dev) -{ - __intel_vgpu_release(vfio_dev_to_vgpu(vfio_dev)); -} - -static void intel_vgpu_release_work(struct work_struct *work) -{ - struct intel_vgpu *vgpu = - container_of(work, struct intel_vgpu, release_work); - - __intel_vgpu_release(vgpu); + if (vgpu->vfio_device.kvm) + kvm_put_kvm(vgpu->vfio_device.kvm); } static u64 intel_vgpu_get_bar_addr(struct intel_vgpu *vgpu, int bar) @@ -1675,7 +1630,6 @@ static int intel_vgpu_probe(struct mdev_device *mdev) return PTR_ERR(vgpu); } - INIT_WORK(&vgpu->release_work, intel_vgpu_release_work); vfio_init_group_dev(&vgpu->vfio_device, &mdev->dev, &intel_vgpu_dev_ops); @@ -1713,7 +1667,7 @@ static struct mdev_driver intel_vgpu_mdev_driver = { int intel_gvt_page_track_add(struct intel_vgpu *info, u64 gfn) { - struct kvm *kvm = info->kvm; + struct kvm *kvm = info->vfio_device.kvm; struct kvm_memory_slot *slot; int idx; @@ -1743,7 +1697,7 @@ out: int intel_gvt_page_track_remove(struct intel_vgpu *info, u64 gfn) { - struct kvm *kvm = info->kvm; + struct kvm *kvm = info->vfio_device.kvm; struct kvm_memory_slot *slot; int idx; diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c index e8914024f5b1..a7d2a95796d3 100644 --- a/drivers/s390/crypto/vfio_ap_ops.c +++ b/drivers/s390/crypto/vfio_ap_ops.c @@ -1284,25 +1284,6 @@ static void vfio_ap_mdev_unset_kvm(struct ap_matrix_mdev *matrix_mdev) } } -static int vfio_ap_mdev_group_notifier(struct notifier_block *nb, - unsigned long action, void *data) -{ - int notify_rc = NOTIFY_OK; - struct ap_matrix_mdev *matrix_mdev; - - if (action != VFIO_GROUP_NOTIFY_SET_KVM) - return NOTIFY_OK; - - matrix_mdev = container_of(nb, struct ap_matrix_mdev, group_notifier); - - if (!data) - vfio_ap_mdev_unset_kvm(matrix_mdev); - else if (vfio_ap_mdev_set_kvm(matrix_mdev, data)) - notify_rc = NOTIFY_DONE; - - return notify_rc; -} - static struct vfio_ap_queue *vfio_ap_find_queue(int apqn) { struct device *dev; @@ -1402,11 +1383,10 @@ static int vfio_ap_mdev_open_device(struct vfio_device *vdev) unsigned long events; int ret; - matrix_mdev->group_notifier.notifier_call = vfio_ap_mdev_group_notifier; - events = VFIO_GROUP_NOTIFY_SET_KVM; + if (!vdev->kvm) + return -EINVAL; - ret = vfio_register_notifier(vdev, VFIO_GROUP_NOTIFY, &events, - &matrix_mdev->group_notifier); + ret = vfio_ap_mdev_set_kvm(matrix_mdev, vdev->kvm); if (ret) return ret; @@ -1415,12 +1395,11 @@ static int vfio_ap_mdev_open_device(struct vfio_device *vdev) ret = vfio_register_notifier(vdev, VFIO_IOMMU_NOTIFY, &events, &matrix_mdev->iommu_notifier); if (ret) - goto out_unregister_group; + goto err_kvm; return 0; -out_unregister_group: - vfio_unregister_notifier(vdev, VFIO_GROUP_NOTIFY, - &matrix_mdev->group_notifier); +err_kvm: + vfio_ap_mdev_unset_kvm(matrix_mdev); return ret; } @@ -1431,8 +1410,6 @@ static void vfio_ap_mdev_close_device(struct vfio_device *vdev) vfio_unregister_notifier(vdev, VFIO_IOMMU_NOTIFY, &matrix_mdev->iommu_notifier); - vfio_unregister_notifier(vdev, VFIO_GROUP_NOTIFY, - &matrix_mdev->group_notifier); vfio_ap_mdev_unset_kvm(matrix_mdev); } diff --git a/drivers/s390/crypto/vfio_ap_private.h b/drivers/s390/crypto/vfio_ap_private.h 
index 648fcaf8104a..a26efd804d0d 100644 --- a/drivers/s390/crypto/vfio_ap_private.h +++ b/drivers/s390/crypto/vfio_ap_private.h @@ -81,8 +81,6 @@ struct ap_matrix { * @node: allows the ap_matrix_mdev struct to be added to a list * @matrix: the adapters, usage domains and control domains assigned to the * mediated matrix device. - * @group_notifier: notifier block used for specifying callback function for - * handling the VFIO_GROUP_NOTIFY_SET_KVM event * @iommu_notifier: notifier block used for specifying callback function for * handling the VFIO_IOMMU_NOTIFY_DMA_UNMAP even * @kvm: the struct holding guest's state @@ -94,7 +92,6 @@ struct ap_matrix_mdev { struct vfio_device vdev; struct list_head node; struct ap_matrix matrix; - struct notifier_block group_notifier; struct notifier_block iommu_notifier; struct kvm *kvm; crypto_hook pqap_hook; diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index f5ed03897210..e22be13e6771 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -1088,10 +1088,21 @@ static struct file *vfio_device_open(struct vfio_device *device) mutex_lock(&device->dev_set->lock); device->open_count++; - if (device->open_count == 1 && device->ops->open_device) { - ret = device->ops->open_device(device); - if (ret) - goto err_undo_count; + if (device->open_count == 1) { + /* + * Here we pass the KVM pointer with the group under the read + * lock. If the device driver will use it, it must obtain a + * reference and release it during close_device. + */ + down_read(&device->group->group_rwsem); + device->kvm = device->group->kvm; + + if (device->ops->open_device) { + ret = device->ops->open_device(device); + if (ret) + goto err_undo_count; + } + up_read(&device->group->group_rwsem); } mutex_unlock(&device->dev_set->lock); @@ -1124,10 +1135,14 @@ static struct file *vfio_device_open(struct vfio_device *device) err_close_device: mutex_lock(&device->dev_set->lock); + down_read(&device->group->group_rwsem); if (device->open_count == 1 && device->ops->close_device) device->ops->close_device(device); err_undo_count: device->open_count--; + if (device->open_count == 0 && device->kvm) + device->kvm = NULL; + up_read(&device->group->group_rwsem); mutex_unlock(&device->dev_set->lock); module_put(device->dev->driver->owner); err_unassign_container: @@ -1320,9 +1335,13 @@ static int vfio_device_fops_release(struct inode *inode, struct file *filep) mutex_lock(&device->dev_set->lock); vfio_assert_device_open(device); + down_read(&device->group->group_rwsem); if (device->open_count == 1 && device->ops->close_device) device->ops->close_device(device); + up_read(&device->group->group_rwsem); device->open_count--; + if (device->open_count == 0) + device->kvm = NULL; mutex_unlock(&device->dev_set->lock); module_put(device->dev->driver->owner); @@ -1731,8 +1750,8 @@ EXPORT_SYMBOL_GPL(vfio_file_enforced_coherent); * @file: VFIO group file * @kvm: KVM to link * - * The kvm pointer will be forwarded to all the vfio_device's attached to the - * VFIO file via the VFIO_GROUP_NOTIFY_SET_KVM notifier. + * When a VFIO device is first opened the KVM will be available in + * device->kvm if one was associated with the group. 
*/ void vfio_file_set_kvm(struct file *file, struct kvm *kvm) { @@ -1743,8 +1762,6 @@ void vfio_file_set_kvm(struct file *file, struct kvm *kvm) down_write(&group->group_rwsem); group->kvm = kvm; - blocking_notifier_call_chain(&group->notifier, - VFIO_GROUP_NOTIFY_SET_KVM, kvm); up_write(&group->group_rwsem); } EXPORT_SYMBOL_GPL(vfio_file_set_kvm); @@ -2011,7 +2028,8 @@ static int vfio_register_iommu_notifier(struct vfio_group *group, struct vfio_iommu_driver *driver; int ret; - down_read(&group->group_rwsem); + lockdep_assert_held_read(&group->group_rwsem); + container = group->container; driver = container->iommu_driver; if (likely(driver && driver->ops->register_notifier)) @@ -2019,7 +2037,6 @@ static int vfio_register_iommu_notifier(struct vfio_group *group, events, nb); else ret = -ENOTTY; - up_read(&group->group_rwsem); return ret; } @@ -2031,7 +2048,8 @@ static int vfio_unregister_iommu_notifier(struct vfio_group *group, struct vfio_iommu_driver *driver; int ret; - down_read(&group->group_rwsem); + lockdep_assert_held_read(&group->group_rwsem); + container = group->container; driver = container->iommu_driver; if (likely(driver && driver->ops->unregister_notifier)) @@ -2039,47 +2057,10 @@ static int vfio_unregister_iommu_notifier(struct vfio_group *group, nb); else ret = -ENOTTY; - up_read(&group->group_rwsem); return ret; } -static int vfio_register_group_notifier(struct vfio_group *group, - unsigned long *events, - struct notifier_block *nb) -{ - int ret; - bool set_kvm = false; - - if (*events & VFIO_GROUP_NOTIFY_SET_KVM) - set_kvm = true; - - /* clear known events */ - *events &= ~VFIO_GROUP_NOTIFY_SET_KVM; - - /* refuse to continue if still events remaining */ - if (*events) - return -EINVAL; - - ret = blocking_notifier_chain_register(&group->notifier, nb); - if (ret) - return ret; - - /* - * The attaching of kvm and vfio_group might already happen, so - * here we replay once upon registration. 
- */ - if (set_kvm) { - down_read(&group->group_rwsem); - if (group->kvm) - blocking_notifier_call_chain(&group->notifier, - VFIO_GROUP_NOTIFY_SET_KVM, - group->kvm); - up_read(&group->group_rwsem); - } - return 0; -} - int vfio_register_notifier(struct vfio_device *device, enum vfio_notify_type type, unsigned long *events, struct notifier_block *nb) @@ -2095,9 +2076,6 @@ int vfio_register_notifier(struct vfio_device *device, case VFIO_IOMMU_NOTIFY: ret = vfio_register_iommu_notifier(group, events, nb); break; - case VFIO_GROUP_NOTIFY: - ret = vfio_register_group_notifier(group, events, nb); - break; default: ret = -EINVAL; } @@ -2119,9 +2097,6 @@ int vfio_unregister_notifier(struct vfio_device *device, case VFIO_IOMMU_NOTIFY: ret = vfio_unregister_iommu_notifier(group, nb); break; - case VFIO_GROUP_NOTIFY: - ret = blocking_notifier_chain_unregister(&group->notifier, nb); - break; default: ret = -EINVAL; } diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 45b287826ce6..aa888cc51757 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -36,6 +36,8 @@ struct vfio_device { struct vfio_device_set *dev_set; struct list_head dev_set_list; unsigned int migration_flags; + /* Driver must reference the kvm during open_device or never touch it */ + struct kvm *kvm; /* Members below here are private, not for driver use */ refcount_t refcount; @@ -155,15 +157,11 @@ extern int vfio_dma_rw(struct vfio_device *device, dma_addr_t user_iova, /* each type has independent events */ enum vfio_notify_type { VFIO_IOMMU_NOTIFY = 0, - VFIO_GROUP_NOTIFY = 1, }; /* events for VFIO_IOMMU_NOTIFY */ #define VFIO_IOMMU_NOTIFY_DMA_UNMAP BIT(0) -/* events for VFIO_GROUP_NOTIFY */ -#define VFIO_GROUP_NOTIFY_SET_KVM BIT(0) - extern int vfio_register_notifier(struct vfio_device *device, enum vfio_notify_type type, unsigned long *required_events, From 64d69b5daf6fe9b86236d34e57ba8ebf7d84f245 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Fri, 20 May 2022 10:22:21 +0200 Subject: [PATCH 262/334] rtc: rzn1: Avoid mixing variables In the ->set_offset() callback, the 'val' variable is used for two different purposes at the same time, which is oviously wrong: - It contains the intermediate value of the SUBU register that must be written at the end of ->set_offset(). - It is used in the middle of the above calculations to poll the CTL2 register for a specific value. Let's introduce a 'ctl2' variable just for the readl_poll_timeout() call and use it there in place of 'var'. In order to avoid mixing those two variables again, let's rename the remaining occurences of 'val' into 'subu' and initialize it to 0 to avoid the uninitialized variable situation reported by Tom Rix and Colin Ian King already. 
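As an illustration only (not the driver code; the values are made up), the clobbering pattern looks like this in a minimal stand-alone C sketch, where poll_status() plays the role of readl_poll_timeout(), which stores the last polled register value in the variable passed to it:

  #include <stdio.h>

  /* Stand-in for readl_poll_timeout(): writes the polled value into *val. */
  static int poll_status(unsigned int *val)
  {
          *val = 0x80;            /* pretend the status register read back 0x80 */
          return 0;
  }

  int main(void)
  {
          unsigned int val = 0;

          val |= 0x20 | 0x05;     /* carefully accumulated SUBU-style value */
          poll_status(&val);      /* ...silently overwritten by the poll helper */
          printf("written back: 0x%02x (expected 0x25)\n", val);
          return 0;
  }

Using a dedicated scratch variable for the poll, as this patch does, keeps the accumulated value intact.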
Fixes: be4a11cf98af ("rtc: rzn1: Add oscillator offset support") Reported-by: Tom Rix Reported-by: Colin Ian King Signed-off-by: Miquel Raynal Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20220520082221.488849-1-miquel.raynal@bootlin.com --- drivers/rtc/rtc-rzn1.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/rtc/rtc-rzn1.c b/drivers/rtc/rtc-rzn1.c index f92d1398b0f1..1108d98af86f 100644 --- a/drivers/rtc/rtc-rzn1.c +++ b/drivers/rtc/rtc-rzn1.c @@ -272,7 +272,7 @@ static int rzn1_rtc_set_offset(struct device *dev, long offset) struct rzn1_rtc *rtc = dev_get_drvdata(dev); unsigned int steps; int stepsh, stepsl; - u32 val; + u32 subu = 0, ctl2; int ret; /* @@ -288,7 +288,7 @@ static int rzn1_rtc_set_offset(struct device *dev, long offset) if (stepsh >= -0x3E && stepsh <= 0x3E) { /* 1017 ppb per step */ steps = stepsh; - val |= RZN1_RTC_SUBU_DEV; + subu |= RZN1_RTC_SUBU_DEV; } else if (stepsl >= -0x3E && stepsl <= 0x3E) { /* 3051 ppb per step */ steps = stepsl; @@ -300,18 +300,18 @@ static int rzn1_rtc_set_offset(struct device *dev, long offset) return 0; if (steps > 0) { - val |= steps + 1; + subu |= steps + 1; } else { - val |= RZN1_RTC_SUBU_DECR; - val |= (~(-steps - 1)) & 0x3F; + subu |= RZN1_RTC_SUBU_DECR; + subu |= (~(-steps - 1)) & 0x3F; } - ret = readl_poll_timeout(rtc->base + RZN1_RTC_CTL2, val, - !(val & RZN1_RTC_CTL2_WUST), 100, 2000000); + ret = readl_poll_timeout(rtc->base + RZN1_RTC_CTL2, ctl2, + !(ctl2 & RZN1_RTC_CTL2_WUST), 100, 2000000); if (ret) return ret; - writel(val, rtc->base + RZN1_RTC_SUBU); + writel(subu, rtc->base + RZN1_RTC_SUBU); return 0; } From 0b6da785130d9e8cf33d001a7bf08a979c87d019 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 19 May 2022 16:56:19 +0300 Subject: [PATCH 263/334] rtc: rzn1: Fix error code in probe There is a copy and paste error so this code returns the wrong variable. Fixes: deeb4b5393e1 ("rtc: rzn1: Add new RTC driver") Signed-off-by: Dan Carpenter Reviewed-by: Miquel Raynal Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/YoZMg1dmHHSJEfE9@kili --- drivers/rtc/rtc-rzn1.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/rtc/rtc-rzn1.c b/drivers/rtc/rtc-rzn1.c index 1108d98af86f..18f33ca62f6c 100644 --- a/drivers/rtc/rtc-rzn1.c +++ b/drivers/rtc/rtc-rzn1.c @@ -348,7 +348,7 @@ static int rzn1_rtc_probe(struct platform_device *pdev) rtc->rtcdev = devm_rtc_allocate_device(&pdev->dev); if (IS_ERR(rtc->rtcdev)) - return PTR_ERR(rtc); + return PTR_ERR(rtc->rtcdev); rtc->rtcdev->range_min = RTC_TIMESTAMP_BEGIN_2000; rtc->rtcdev->range_max = RTC_TIMESTAMP_END_2099; From 3f3489248927a53fcfec571ff603163f6b676a46 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Fri, 20 May 2022 10:25:00 +0200 Subject: [PATCH 264/334] rtc: rzn1: Fix a variable type The calculation in ->set_offset() handles both negative and positive offsets. The 'steps' variable will be checked to be in a specific [-x; +x] range, which means it must be a signed integer rather than unsigned. 
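For illustration, a small stand-alone C example (hypothetical values, not the driver code) of why the sign checks cannot work on an unsigned variable:

  #include <stdio.h>

  int main(void)
  {
          unsigned int steps_u = -3;      /* wraps to a huge positive value */
          int steps_s = -3;

          /* With an unsigned type the "negative" branch is never taken. */
          printf("unsigned: steps > 0 is %d (steps = %u)\n", steps_u > 0, steps_u);
          printf("signed:   steps > 0 is %d (steps = %d)\n", steps_s > 0, steps_s);
          return 0;
  }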
This also fixes the following smatch warning: warn: 'steps' 'true' implies 'steps > 0' is 'true' Fixes: be4a11cf98af ("rtc: rzn1: Add oscillator offset support") Reported-by: Dan Carpenter Signed-off-by: Miquel Raynal Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20220520082500.489248-1-miquel.raynal@bootlin.com --- drivers/rtc/rtc-rzn1.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/rtc/rtc-rzn1.c b/drivers/rtc/rtc-rzn1.c index 18f33ca62f6c..ac788799c8e3 100644 --- a/drivers/rtc/rtc-rzn1.c +++ b/drivers/rtc/rtc-rzn1.c @@ -270,8 +270,7 @@ static int rzn1_rtc_read_offset(struct device *dev, long *offset) static int rzn1_rtc_set_offset(struct device *dev, long offset) { struct rzn1_rtc *rtc = dev_get_drvdata(dev); - unsigned int steps; - int stepsh, stepsl; + int stepsh, stepsl, steps; u32 subu = 0, ctl2; int ret; From 66d34fcbbe63ebd8584b792e0d741f6648100894 Mon Sep 17 00:00:00 2001 From: Sungjong Seo Date: Tue, 24 May 2022 10:29:11 +0900 Subject: [PATCH 265/334] f2fs: allow compression for mmap files in compress_mode=user Since commit e3c548323d32 ("f2fs: let's allow compression for mmap files"), it has been allowed to compress mmap files. However, in compress_mode=user, it is not allowed yet. To keep the same concept in both compress_modes, f2fs_ioc_(de)compress_file() should also allow it. Let's remove checking mmap files in f2fs_ioc_(de)compress_file() so that the compression for mmap files is also allowed in compress_mode=user. Signed-off-by: Sungjong Seo Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 7aac53ac5acf..a05d842a7e72 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -3945,11 +3945,6 @@ static int f2fs_ioc_decompress_file(struct file *filp, unsigned long arg) goto out; } - if (f2fs_is_mmap_file(inode)) { - ret = -EBUSY; - goto out; - } - ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); if (ret) goto out; @@ -4017,11 +4012,6 @@ static int f2fs_ioc_compress_file(struct file *filp, unsigned long arg) goto out; } - if (f2fs_is_mmap_file(inode)) { - ret = -EBUSY; - goto out; - } - ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); if (ret) goto out; From 78901cfa44981d170fb5caae0d5421c97782b0d0 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 25 May 2022 09:56:34 +0800 Subject: [PATCH 266/334] f2fs: avoid unneeded error handling for revoke_entry_slab allocation In __f2fs_commit_atomic_write(), we will guarantee success of revoke_entry_slab allocation, so let's avoid unneeded error handling. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 0a4180f64291..51ceff797b97 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -315,11 +315,6 @@ static int __f2fs_commit_atomic_write(struct inode *inode) new = f2fs_kmem_cache_alloc(revoke_entry_slab, GFP_NOFS, true, NULL); - if (!new) { - f2fs_put_dnode(&dn); - ret = -ENOMEM; - goto out; - } ret = __replace_atomic_write_block(inode, index, blkaddr, &new->old_addr, false); From 908ea6541661d72c6852650f91adebba88b0de0b Mon Sep 17 00:00:00 2001 From: Yufen Yu Date: Wed, 25 May 2022 17:43:36 +0800 Subject: [PATCH 267/334] f2fs: add f2fs_init_write_merge_io function Almost all other initialization of variables in f2fs_fill_super are extraced to a single function. Also do it for write_io[], which can make code more clean. 
This patch just refactors the code, theres no functional change. Signed-off-by: Yufen Yu Reviewed-by: Chao Yu [Jaegeuk Kim: clean up] Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 28 ++++++++++++++++++++++++++++ fs/f2fs/f2fs.h | 1 + fs/f2fs/super.c | 27 +++------------------------ 3 files changed, 32 insertions(+), 24 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 54a7a8ad994d..f5f2b7233982 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -584,6 +584,34 @@ static bool __has_merged_page(struct bio *bio, struct inode *inode, return false; } +int f2fs_init_write_merge_io(struct f2fs_sb_info *sbi) +{ + int i; + + for (i = 0; i < NR_PAGE_TYPE; i++) { + int n = (i == META) ? 1 : NR_TEMP_TYPE; + int j; + + sbi->write_io[i] = f2fs_kmalloc(sbi, + array_size(n, sizeof(struct f2fs_bio_info)), + GFP_KERNEL); + if (!sbi->write_io[i]) + return -ENOMEM; + + for (j = HOT; j < n; j++) { + init_f2fs_rwsem(&sbi->write_io[i][j].io_rwsem); + sbi->write_io[i][j].sbi = sbi; + sbi->write_io[i][j].bio = NULL; + spin_lock_init(&sbi->write_io[i][j].io_lock); + INIT_LIST_HEAD(&sbi->write_io[i][j].io_list); + INIT_LIST_HEAD(&sbi->write_io[i][j].bio_list); + init_f2fs_rwsem(&sbi->write_io[i][j].bio_list_lock); + } + } + + return 0; +} + static void __f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type, enum temp_type temp) { diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index f4d03a81ef3f..c73014901006 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3713,6 +3713,7 @@ int f2fs_init_bio_entry_cache(void); void f2fs_destroy_bio_entry_cache(void); void f2fs_submit_bio(struct f2fs_sb_info *sbi, struct bio *bio, enum page_type type); +int f2fs_init_write_merge_io(struct f2fs_sb_info *sbi); void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type); void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi, struct inode *inode, struct page *page, diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 63daae67a9d9..5b558f69c461 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -4091,30 +4091,9 @@ try_onemore: set_sbi_flag(sbi, SBI_POR_DOING); spin_lock_init(&sbi->stat_lock); - for (i = 0; i < NR_PAGE_TYPE; i++) { - int n = (i == META) ? 1 : NR_TEMP_TYPE; - int j; - - sbi->write_io[i] = - f2fs_kmalloc(sbi, - array_size(n, - sizeof(struct f2fs_bio_info)), - GFP_KERNEL); - if (!sbi->write_io[i]) { - err = -ENOMEM; - goto free_bio_info; - } - - for (j = HOT; j < n; j++) { - init_f2fs_rwsem(&sbi->write_io[i][j].io_rwsem); - sbi->write_io[i][j].sbi = sbi; - sbi->write_io[i][j].bio = NULL; - spin_lock_init(&sbi->write_io[i][j].io_lock); - INIT_LIST_HEAD(&sbi->write_io[i][j].io_list); - INIT_LIST_HEAD(&sbi->write_io[i][j].bio_list); - init_f2fs_rwsem(&sbi->write_io[i][j].bio_list_lock); - } - } + err = f2fs_init_write_merge_io(sbi); + if (err) + goto free_bio_info; init_f2fs_rwsem(&sbi->cp_rwsem); init_f2fs_rwsem(&sbi->quota_sem); From d7a2dc523085f8b8c60548ceedc696934aefeb0e Mon Sep 17 00:00:00 2001 From: Venky Shankar Date: Thu, 10 Mar 2022 09:34:19 -0500 Subject: [PATCH 268/334] ceph: allow ceph.dir.rctime xattr to be updatable `rctime' has been a pain point in cephfs due to its buggy nature - inconsistent values reported and those sorts. Fixing rctime is non-trivial needing an overall redesign of the entire nested statistics infrastructure. As a workaround, PR http://github.com/ceph/ceph/pull/37938 allows this extended attribute to be manually set. This allows users to "fixup" inconsistent rctime values. 
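As a usage sketch only (the mount point is hypothetical and the accepted value format is assumed to be a "seconds.nanoseconds" string), an administrator's fixup could be as simple as a setxattr(2) call:

  #include <stdio.h>
  #include <string.h>
  #include <sys/xattr.h>

  int main(void)
  {
          const char *path = "/mnt/cephfs/dir";          /* hypothetical mount */
          const char *val = "1653350400.000000000";      /* assumed sec.nsec format */

          if (setxattr(path, "ceph.dir.rctime", val, strlen(val), 0)) {
                  perror("setxattr");
                  return 1;
          }
          return 0;
  }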
While this sounds messy, its probably the wisest approach allowing users/scripts to workaround buggy rctime values. The above PR enables Ceph MDS to allow manually setting rctime extended attribute with the corresponding user-land changes. We may as well allow the same to be done via kclient for parity. Signed-off-by: Venky Shankar Reviewed-by: Xiubo Li Signed-off-by: Ilya Dryomov --- fs/ceph/xattr.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index afec84088471..8c2dc2c762a4 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -366,6 +366,14 @@ static ssize_t ceph_vxattrcb_auth_mds(struct ceph_inode_info *ci, } #define XATTR_RSTAT_FIELD(_type, _name) \ XATTR_NAME_CEPH(_type, _name, VXATTR_FLAG_RSTAT) +#define XATTR_RSTAT_FIELD_UPDATABLE(_type, _name) \ + { \ + .name = CEPH_XATTR_NAME(_type, _name), \ + .name_size = sizeof (CEPH_XATTR_NAME(_type, _name)), \ + .getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \ + .exists_cb = NULL, \ + .flags = VXATTR_FLAG_RSTAT, \ + } #define XATTR_LAYOUT_FIELD(_type, _name, _field) \ { \ .name = CEPH_XATTR_NAME2(_type, _name, _field), \ @@ -404,7 +412,7 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = { XATTR_RSTAT_FIELD(dir, rsubdirs), XATTR_RSTAT_FIELD(dir, rsnaps), XATTR_RSTAT_FIELD(dir, rbytes), - XATTR_RSTAT_FIELD(dir, rctime), + XATTR_RSTAT_FIELD_UPDATABLE(dir, rctime), { .name = "ceph.dir.pin", .name_size = sizeof("ceph.dir.pin"), From 3302ffd44c3d70e77ad5764d15f5028a6ade540a Mon Sep 17 00:00:00 2001 From: Jakob Koschel Date: Thu, 24 Mar 2022 08:20:50 +0100 Subject: [PATCH 269/334] rbd: replace usage of found with dedicated list iterator variable To move the list iterator variable into the list_for_each_entry_*() macro in the future it should be avoided to use the list iterator variable after the loop body. To *never* use the list iterator variable after the loop it was concluded to use a separate iterator variable instead of a found boolean. This removes the need to use a found variable and simply checking if the variable was set, can determine if the break/goto was hit. Link: https://lore.kernel.org/all/CAHk-=wgRr_D8CB-D9Kg-c=EHreAsk5SqXPwr9Y7k9sA6cWXJ6w@mail.gmail.com/ Signed-off-by: Jakob Koschel Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index b844432bad20..5e2ee398b7dc 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -756,24 +756,23 @@ static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc) */ static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts) { - struct rbd_client *client_node; - bool found = false; + struct rbd_client *rbdc = NULL, *iter; if (ceph_opts->flags & CEPH_OPT_NOSHARE) return NULL; spin_lock(&rbd_client_list_lock); - list_for_each_entry(client_node, &rbd_client_list, node) { - if (!ceph_compare_options(ceph_opts, client_node->client)) { - __rbd_get_client(client_node); + list_for_each_entry(iter, &rbd_client_list, node) { + if (!ceph_compare_options(ceph_opts, iter->client)) { + __rbd_get_client(iter); - found = true; + rbdc = iter; break; } } spin_unlock(&rbd_client_list_lock); - return found ? 
client_node : NULL; + return rbdc; } /* From 6c1dc50284c4dab9f1e563efba6f9c4a47af894a Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Mon, 28 Mar 2022 10:25:35 +0800 Subject: [PATCH 270/334] ceph: remove unused CEPH_MDS_LEASE_RELEASE related code The ceph_mdsc_lease_release() has been removed by commit 8aa152c77890 (ceph: remove ceph_mdsc_lease_release). ceph_mdsc_lease_send_msg will never be called with CEPH_MDS_LEASE_RELEASE. Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 00c3de177dd6..7b9a16dc8353 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -4396,12 +4396,6 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session, memcpy((void *)(lease + 1) + 4, dentry->d_name.name, dentry->d_name.len); spin_unlock(&dentry->d_lock); - /* - * if this is a preemptive lease RELEASE, no need to - * flush request stream, since the actual request will - * soon follow. - */ - msg->more_to_follow = (action == CEPH_MDS_LEASE_RELEASE); ceph_con_send(&session->s_con, msg); } From 1980b1bf17a4975fee5ee42df167f50f7f67b3d9 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Tue, 29 Mar 2022 12:48:01 +0800 Subject: [PATCH 271/334] ceph: stop forwarding the request when exceeding 256 times MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The type of 'num_fwd' in ceph 'MClientRequestForward' is 'int32_t', while in 'ceph_mds_request_head' the type is '__u8'. So in case the request bounces between MDSes exceeding 256 times, the client will get stuck. In this case it's ususally a bug in MDS and continue bouncing the request makes no sense. URL: https://tracker.ceph.com/issues/55130 Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Reviewed-by: Luís Henriques Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 39 ++++++++++++++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 5 deletions(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 7b9a16dc8353..75506ac30307 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -3265,6 +3265,7 @@ static void handle_forward(struct ceph_mds_client *mdsc, int err = -EINVAL; void *p = msg->front.iov_base; void *end = p + msg->front.iov_len; + bool aborted = false; ceph_decode_need(&p, end, 2*sizeof(u32), bad); next_mds = ceph_decode_32(&p); @@ -3273,16 +3274,41 @@ static void handle_forward(struct ceph_mds_client *mdsc, mutex_lock(&mdsc->mutex); req = lookup_get_request(mdsc, tid); if (!req) { + mutex_unlock(&mdsc->mutex); dout("forward tid %llu to mds%d - req dne\n", tid, next_mds); - goto out; /* dup reply? */ + return; /* dup reply? */ } if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) { dout("forward tid %llu aborted, unregistering\n", tid); __unregister_request(mdsc, req); } else if (fwd_seq <= req->r_num_fwd) { - dout("forward tid %llu to mds%d - old seq %d <= %d\n", - tid, next_mds, req->r_num_fwd, fwd_seq); + /* + * The type of 'num_fwd' in ceph 'MClientRequestForward' + * is 'int32_t', while in 'ceph_mds_request_head' the + * type is '__u8'. So in case the request bounces between + * MDSes exceeding 256 times, the client will get stuck. + * + * In this case it's ususally a bug in MDS and continue + * bouncing the request makes no sense. + * + * In future this could be fixed in ceph code, so avoid + * using the hardcode here. 
+ */ + int max = sizeof_field(struct ceph_mds_request_head, num_fwd); + max = 1 << (max * BITS_PER_BYTE); + if (req->r_num_fwd >= max) { + mutex_lock(&req->r_fill_mutex); + req->r_err = -EMULTIHOP; + set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags); + mutex_unlock(&req->r_fill_mutex); + aborted = true; + pr_warn_ratelimited("forward tid %llu seq overflow\n", + tid); + } else { + dout("forward tid %llu to mds%d - old seq %d <= %d\n", + tid, next_mds, req->r_num_fwd, fwd_seq); + } } else { /* resend. forward race not possible; mds would drop */ dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds); @@ -3294,9 +3320,12 @@ static void handle_forward(struct ceph_mds_client *mdsc, put_request_session(req); __do_request(mdsc, req); } - ceph_mdsc_put_request(req); -out: mutex_unlock(&mdsc->mutex); + + /* kick calling process */ + if (aborted) + complete_request(mdsc, req); + ceph_mdsc_put_request(req); return; bad: From 546a5d6122faae161cb59159e8af8518130efeab Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Wed, 30 Mar 2022 14:39:33 +0800 Subject: [PATCH 272/334] ceph: stop retrying the request when exceeding 256 times The type of 'r_attempts' in kernel 'ceph_mds_request' is 'int', while in 'ceph_mds_request_head' the type of 'num_retry' is '__u8'. So in case the request retries exceeding 256 times, the MDS will receive a incorrect retry seq. In this case it's ususally a bug in MDS and continue retrying the request makes no sense. For now let's limit it to 256. In future this could be fixed in ceph code, so avoid using the hardcode here. Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 75506ac30307..b5139382cf9b 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2651,7 +2651,28 @@ static int __prepare_send_request(struct ceph_mds_session *session, struct ceph_mds_client *mdsc = session->s_mdsc; struct ceph_mds_request_head_old *rhead; struct ceph_msg *msg; - int flags = 0; + int flags = 0, max_retry; + + /* + * The type of 'r_attempts' in kernel 'ceph_mds_request' + * is 'int', while in 'ceph_mds_request_head' the type of + * 'num_retry' is '__u8'. So in case the request retries + * exceeding 256 times, the MDS will receive a incorrect + * retry seq. + * + * In this case it's ususally a bug in MDS and continue + * retrying the request makes no sense. + * + * In future this could be fixed in ceph code, so avoid + * using the hardcode here. + */ + max_retry = sizeof_field(struct ceph_mds_request_head, num_retry); + max_retry = 1 << (max_retry * BITS_PER_BYTE); + if (req->r_attempts >= max_retry) { + pr_warn_ratelimited("%s request tid %llu seq overflow\n", + __func__, req->r_tid); + return -EMULTIHOP; + } req->r_attempts++; if (req->r_inode) { @@ -2663,7 +2684,7 @@ static int __prepare_send_request(struct ceph_mds_session *session, else req->r_sent_on_mseq = -1; } - dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req, + dout("%s %p tid %lld %s (attempt %d)\n", __func__, req, req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts); if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) { From 7ffe4fcea789552fac47216188f30559c329c847 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Wed, 30 Mar 2022 12:21:12 +0800 Subject: [PATCH 273/334] ceph: update the dlease for the hashed dentry when removing The MDS will always refresh the dentry lease when removing the files or directories. 
And if the dentry is still hashed, we can update the dentry lease and no need to do the lookup from the MDS later. Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/inode.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 63113e2a4890..f9b68b2c9b12 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1466,10 +1466,12 @@ retry_lookup: } else if (have_lease) { if (d_unhashed(dn)) d_add(dn, NULL); + } + + if (!d_unhashed(dn) && have_lease) update_dentry_lease(dir, dn, rinfo->dlease, session, req->r_request_started); - } goto done; } From 57a5df0e8653a4ae4fa8f8fbfed8f1f0d734ebc0 Mon Sep 17 00:00:00 2001 From: Jakob Koschel Date: Thu, 31 Mar 2022 23:53:28 +0200 Subject: [PATCH 274/334] ceph: use dedicated list iterator variable To move the list iterator variable into the list_for_each_entry_*() macro in the future it should be avoided to use the list iterator variable after the loop body. To *never* use the list iterator variable after the loop it was concluded to use a separate iterator variable. Link: https://lore.kernel.org/all/CAHk-=wgRr_D8CB-D9Kg-c=EHreAsk5SqXPwr9Y7k9sA6cWXJ6w@mail.gmail.com/ Signed-off-by: Jakob Koschel Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 5c14ef04e474..1485a63159e3 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1577,7 +1577,7 @@ static void __ceph_flush_snaps(struct ceph_inode_info *ci, while (first_tid <= last_tid) { struct ceph_cap *cap = ci->i_auth_cap; - struct ceph_cap_flush *cf; + struct ceph_cap_flush *cf = NULL, *iter; int ret; if (!(cap && cap->session == session)) { @@ -1587,8 +1587,9 @@ static void __ceph_flush_snaps(struct ceph_inode_info *ci, } ret = -ENOENT; - list_for_each_entry(cf, &ci->i_cap_flush_list, i_list) { - if (cf->tid >= first_tid) { + list_for_each_entry(iter, &ci->i_cap_flush_list, i_list) { + if (iter->tid >= first_tid) { + cf = iter; ret = 0; break; } From 3ffa9d6f991facd370fe35fd7fcbdbf5d1063afd Mon Sep 17 00:00:00 2001 From: Jakob Koschel Date: Thu, 31 Mar 2022 23:53:29 +0200 Subject: [PATCH 275/334] ceph: replace usage of found with dedicated list iterator variable To move the list iterator variable into the list_for_each_entry_*() macro in the future it should be avoided to use the list iterator variable after the loop body. To *never* use the list iterator variable after the loop it was concluded to use a separate iterator variable instead of a found boolean. This removes the need to use a found variable and simply checking if the variable was set, can determine if the break/goto was hit. 
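The same idea in a minimal, self-contained user-space form (a plain pointer walk stands in for list_for_each_entry() and all names are illustrative):

  #include <stddef.h>
  #include <stdio.h>

  struct item {
          int id;
          struct item *next;
  };

  /* Record the match in a dedicated result variable and never touch
   * 'iter' again once the loop has ended. */
  static struct item *find_item(struct item *head, int id)
  {
          struct item *found = NULL, *iter;

          for (iter = head; iter; iter = iter->next) {
                  if (iter->id == id) {
                          found = iter;
                          break;
                  }
          }
          return found;
  }

  int main(void)
  {
          struct item c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };

          printf("found id 2: %s\n", find_item(&a, 2) ? "yes" : "no");
          return 0;
  }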
Link: https://lore.kernel.org/all/CAHk-=wgRr_D8CB-D9Kg-c=EHreAsk5SqXPwr9Y7k9sA6cWXJ6w@mail.gmail.com/ Signed-off-by: Jakob Koschel Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 1485a63159e3..b8d9404fbdb3 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -3183,10 +3183,9 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, struct ceph_snap_context *snapc) { struct inode *inode = &ci->vfs_inode; - struct ceph_cap_snap *capsnap = NULL; + struct ceph_cap_snap *capsnap = NULL, *iter; int put = 0; bool last = false; - bool found = false; bool flush_snaps = false; bool complete_capsnap = false; @@ -3213,14 +3212,14 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head, last ? " LAST" : ""); } else { - list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { - if (capsnap->context == snapc) { - found = true; + list_for_each_entry(iter, &ci->i_cap_snaps, ci_item) { + if (iter->context == snapc) { + capsnap = iter; break; } } - if (!found) { + if (!capsnap) { /* * The capsnap should already be removed when removing * auth cap in the case of a forced unmount. @@ -3770,8 +3769,7 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid, struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; u64 follows = le64_to_cpu(m->snap_follows); - struct ceph_cap_snap *capsnap; - bool flushed = false; + struct ceph_cap_snap *capsnap = NULL, *iter; bool wake_ci = false; bool wake_mdsc = false; @@ -3779,26 +3777,26 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid, inode, ci, session->s_mds, follows); spin_lock(&ci->i_ceph_lock); - list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { - if (capsnap->follows == follows) { - if (capsnap->cap_flush.tid != flush_tid) { + list_for_each_entry(iter, &ci->i_cap_snaps, ci_item) { + if (iter->follows == follows) { + if (iter->cap_flush.tid != flush_tid) { dout(" cap_snap %p follows %lld tid %lld !=" - " %lld\n", capsnap, follows, - flush_tid, capsnap->cap_flush.tid); + " %lld\n", iter, follows, + flush_tid, iter->cap_flush.tid); break; } - flushed = true; + capsnap = iter; break; } else { dout(" skipping cap_snap %p follows %lld\n", - capsnap, capsnap->follows); + iter, iter->follows); } } - if (flushed) + if (capsnap) ceph_remove_capsnap(inode, capsnap, &wake_ci, &wake_mdsc); spin_unlock(&ci->i_ceph_lock); - if (flushed) { + if (capsnap) { ceph_put_snap_context(capsnap->context); ceph_put_cap_snap(capsnap); if (wake_ci) From 68e5ec2ec94576bee4d5280d512ed47e6f876baf Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Thu, 7 Apr 2022 13:12:42 +0800 Subject: [PATCH 276/334] ceph: no need to invalidate the fscache twice Fixes: 400e1286c0ec3 ("ceph: conversion to new fscache API") Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/inode.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index f9b68b2c9b12..e6065483c36b 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1886,7 +1886,6 @@ static void ceph_do_invalidate_pages(struct inode *inode) orig_gen = ci->i_rdcache_gen; spin_unlock(&ci->i_ceph_lock); - ceph_fscache_invalidate(inode, false); if (invalidate_inode_pages2(inode->i_mapping) < 0) { pr_err("invalidate_inode_pages2 %llx.%llx failed\n", 
ceph_vinop(inode)); From 261998c30004f0e6eeddd24b1eb5e504cfacee18 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Mon, 11 Apr 2022 09:59:09 +0800 Subject: [PATCH 277/334] ceph: fix statx AT_STATX_DONT_SYNC vs AT_STATX_FORCE_SYNC check From the posix and the initial statx supporting commit comments, the AT_STATX_DONT_SYNC is a lightweight stat and the AT_STATX_FORCE_SYNC is a heaverweight one. And also checked all the other current usage about these two flags they are all doing the same, that is only when the AT_STATX_FORCE_SYNC is not set and the AT_STATX_DONT_SYNC is set will they skip sync retriving the attributes from storage. Signed-off-by: Xiubo Li Reviewed-by: David Howells Signed-off-by: Ilya Dryomov --- fs/ceph/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index e6065483c36b..88d7074d011d 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -2424,7 +2424,7 @@ int ceph_getattr(struct user_namespace *mnt_userns, const struct path *path, return -ESTALE; /* Skip the getattr altogether if we're asked not to sync */ - if (!(flags & AT_STATX_DONT_SYNC)) { + if ((flags & AT_STATX_SYNC_TYPE) != AT_STATX_DONT_SYNC) { err = ceph_do_getattr(inode, statx_to_caps(request_mask, inode->i_mode), flags & AT_STATX_FORCE_SYNC); From d9d58f0402a8bd14b980e6f41a5aa28aa0ca0e04 Mon Sep 17 00:00:00 2001 From: Guo Zhengkui Date: Tue, 12 Apr 2022 14:46:20 +0800 Subject: [PATCH 278/334] libceph: use swap() macro instead of taking tmp variable Fix the following coccicheck warning: net/ceph/crush/mapper.c:1077:8-9: WARNING opportunity for swap() by using swap() for the swapping of variable values and drop the tmp variable that is not needed any more. Signed-off-by: Guo Zhengkui Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- net/ceph/crush/mapper.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index 7057f8db4f99..1daf95e17d67 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -906,7 +906,6 @@ int crush_do_rule(const struct crush_map *map, int recurse_to_leaf; int wsize = 0; int osize; - int *tmp; const struct crush_rule *rule; __u32 step; int i, j; @@ -1073,9 +1072,7 @@ int crush_do_rule(const struct crush_map *map, memcpy(o, c, osize*sizeof(*o)); /* swap o and w arrays */ - tmp = o; - o = w; - w = tmp; + swap(o, w); wsize = osize; break; From ae0670633014d172fab3772fff2da99269d17874 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Mon, 18 Apr 2022 20:04:51 +0800 Subject: [PATCH 279/334] ceph: rename unsafe_request_wait() Rename it to flush_mdlog_and_wait_inode_unsafe_requests() to make it more descriptive. Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index b8d9404fbdb3..a051e6e4d7ca 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2219,9 +2219,9 @@ static int caps_are_flushed(struct inode *inode, u64 flush_tid) } /* - * wait for any unsafe requests to complete. + * flush the mdlog and wait for any unsafe requests to complete. 
*/ -static int unsafe_request_wait(struct inode *inode) +static int flush_mdlog_and_wait_inode_unsafe_requests(struct inode *inode) { struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_inode_info *ci = ceph_inode(inode); @@ -2337,7 +2337,7 @@ retry: kfree(sessions); } - dout("unsafe_request_wait %p wait on tid %llu %llu\n", + dout("%s %p wait on tid %llu %llu\n", __func__, inode, req1 ? req1->r_tid : 0ULL, req2 ? req2->r_tid : 0ULL); if (req1) { ret = !wait_for_completion_timeout(&req1->r_safe_completion, @@ -2381,7 +2381,7 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) dirty = try_flush_caps(inode, &flush_tid); dout("fsync dirty caps are %s\n", ceph_cap_string(dirty)); - err = unsafe_request_wait(inode); + err = flush_mdlog_and_wait_inode_unsafe_requests(inode); /* * only wait on non-file metadata writeback (the mds From 1b2ba3c5616e17ff951359e25c658a1c3f146f1e Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Tue, 19 Apr 2022 08:58:49 +0800 Subject: [PATCH 280/334] ceph: flush the mdlog for filesystem sync Before waiting for a request's safe reply, we will send the mdlog flush request to the relevant MDS. And this will also flush the mdlog for all the other unsafe requests in the same session, so we can record the last session and no need to flush mdlog again in the next loop. But there still have cases that it may send the mdlog flush requst twice or more, but that should be not often. Rename wait_unsafe_requests() to flush_mdlog_and_wait_mdsc_unsafe_requests() to make it more descriptive. [xiubli: fold in MDS request refcount leak fix from Jeff] URL: https://tracker.ceph.com/issues/55284 URL: https://tracker.ceph.com/issues/55411 Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index b5139382cf9b..23a6c11d805c 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -4740,15 +4740,17 @@ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc) } /* - * wait for all write mds requests to flush. + * flush the mdlog and wait for all write mds requests to flush. 
*/ -static void wait_unsafe_requests(struct ceph_mds_client *mdsc, u64 want_tid) +static void flush_mdlog_and_wait_mdsc_unsafe_requests(struct ceph_mds_client *mdsc, + u64 want_tid) { struct ceph_mds_request *req = NULL, *nextreq; + struct ceph_mds_session *last_session = NULL; struct rb_node *n; mutex_lock(&mdsc->mutex); - dout("wait_unsafe_requests want %lld\n", want_tid); + dout("%s want %lld\n", __func__, want_tid); restart: req = __get_oldest_req(mdsc); while (req && req->r_tid <= want_tid) { @@ -4760,14 +4762,32 @@ restart: nextreq = NULL; if (req->r_op != CEPH_MDS_OP_SETFILELOCK && (req->r_op & CEPH_MDS_OP_WRITE)) { + struct ceph_mds_session *s = req->r_session; + + if (!s) { + req = nextreq; + continue; + } + /* write op */ ceph_mdsc_get_request(req); if (nextreq) ceph_mdsc_get_request(nextreq); + s = ceph_get_mds_session(s); mutex_unlock(&mdsc->mutex); - dout("wait_unsafe_requests wait on %llu (want %llu)\n", + + /* send flush mdlog request to MDS */ + if (last_session != s) { + send_flush_mdlog(s); + ceph_put_mds_session(last_session); + last_session = s; + } else { + ceph_put_mds_session(s); + } + dout("%s wait on %llu (want %llu)\n", __func__, req->r_tid, want_tid); wait_for_completion(&req->r_safe_completion); + mutex_lock(&mdsc->mutex); ceph_mdsc_put_request(req); if (!nextreq) @@ -4782,7 +4802,8 @@ restart: req = nextreq; } mutex_unlock(&mdsc->mutex); - dout("wait_unsafe_requests done\n"); + ceph_put_mds_session(last_session); + dout("%s done\n", __func__); } void ceph_mdsc_sync(struct ceph_mds_client *mdsc) @@ -4811,7 +4832,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc) dout("sync want tid %lld flush_seq %lld\n", want_tid, want_flush); - wait_unsafe_requests(mdsc, want_tid); + flush_mdlog_and_wait_mdsc_unsafe_requests(mdsc, want_tid); wait_caps_flush(mdsc, want_flush); } From f7a2d0688a3b2bb4769402b4c962f54f7b0fc23c Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Wed, 20 Apr 2022 13:13:02 +0800 Subject: [PATCH 281/334] ceph: disable updating the atime since cephfs won't maintain it Since CephFS makes no attempt to maintain atime, we shouldn't try to update it in mmap and generic read cases and ignore updating it in direct and sync read cases. And even we update it in mmap and generic read cases we will drop it and won't sync it to MDS. And we are seeing the atime will be updated and then dropped to the floor again and again. 
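A quick stand-alone way to observe this from user space (the path is hypothetical; the program only reports whether a read moved atime on the given mount):

  #include <fcntl.h>
  #include <stdio.h>
  #include <sys/stat.h>
  #include <unistd.h>

  int main(void)
  {
          const char *path = "/mnt/cephfs/file";  /* hypothetical CephFS file */
          struct stat before, after;
          char buf[16];
          int fd;

          if (stat(path, &before) || (fd = open(path, O_RDONLY)) < 0) {
                  perror(path);
                  return 1;
          }
          if (read(fd, buf, sizeof(buf)) < 0)
                  perror("read");
          close(fd);
          if (stat(path, &after)) {
                  perror(path);
                  return 1;
          }
          printf("atime before=%lld after=%lld\n",
                 (long long)before.st_atime, (long long)after.st_atime);
          return 0;
  }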
URL: https://lists.ceph.io/hyperkitty/list/ceph-users@ceph.io/thread/VSJM7T4CS5TDRFF6XFPIYMHP75K73PZ6/ Signed-off-by: Xiubo Li Acked-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 1 - fs/ceph/super.c | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index b6edcf89a429..69578cc3749d 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1777,7 +1777,6 @@ int ceph_mmap(struct file *file, struct vm_area_struct *vma) if (!mapping->a_ops->readpage) return -ENOEXEC; - file_accessed(file); vma->vm_ops = &ceph_vmops; return 0; } diff --git a/fs/ceph/super.c b/fs/ceph/super.c index e6987d295079..b73b4f75462c 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -1119,6 +1119,7 @@ static int ceph_set_super(struct super_block *s, struct fs_context *fc) s->s_time_gran = 1; s->s_time_min = 0; s->s_time_max = U32_MAX; + s->s_flags |= SB_NODIRATIME | SB_NOATIME; ret = set_anon_super_fc(s, fc); if (ret != 0) From 5eed80fba65cd707075892450bc5d6bd464862a0 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Thu, 21 Apr 2022 11:26:40 +0800 Subject: [PATCH 282/334] ceph: try to choose the auth MDS if possible for getattr If any 'x' caps is issued we can just choose the auth MDS instead of the random replica MDSes. Because only when the Locker is in LOCK_EXEC state will the loner client could get the 'x' caps. And if we send the getattr requests to any replica MDS it must auth pin and tries to rdlock from the auth MDS, and then the auth MDS need to do the Locker state transition to LOCK_SYNC. And after that the lock state will change back. This cost much when doing the Locker state transition and usually will need to revoke caps from clients. URL: https://tracker.ceph.com/issues/55240 Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 4 +++- fs/ceph/inode.c | 26 +++++++++++++++++++++++++- fs/ceph/super.h | 1 + 3 files changed, 29 insertions(+), 2 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 69578cc3749d..c25a33dd6284 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -256,6 +256,7 @@ static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq) struct iov_iter iter; ssize_t err = 0; size_t len; + int mode; __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); __clear_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags); @@ -264,7 +265,8 @@ static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq) goto out; /* We need to fetch the inline data. */ - req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS); + mode = ceph_try_to_choose_auth_mds(inode, CEPH_STAT_CAP_INLINE_DATA); + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode); if (IS_ERR(req)) { err = PTR_ERR(req); goto out; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 88d7074d011d..2db1a21c9ae0 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -2259,6 +2259,30 @@ int ceph_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, return err; } +int ceph_try_to_choose_auth_mds(struct inode *inode, int mask) +{ + int issued = ceph_caps_issued(ceph_inode(inode)); + + /* + * If any 'x' caps is issued we can just choose the auth MDS + * instead of the random replica MDSes. Because only when the + * Locker is in LOCK_EXEC state will the loner client could + * get the 'x' caps. And if we send the getattr requests to + * any replica MDS it must auth pin and tries to rdlock from + * the auth MDS, and then the auth MDS need to do the Locker + * state transition to LOCK_SYNC. 
And after that the lock state + * will change back. + * + * This cost much when doing the Locker state transition and + * usually will need to revoke caps from clients. + */ + if (((mask & CEPH_CAP_ANY_SHARED) && (issued & CEPH_CAP_ANY_EXCL)) + || (mask & CEPH_STAT_RSTAT)) + return USE_AUTH_MDS; + else + return USE_ANY_MDS; +} + /* * Verify that we have a lease on the given mask. If not, * do a getattr against an mds. @@ -2282,7 +2306,7 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page, if (!force && ceph_caps_issued_mask_metric(ceph_inode(inode), mask, 1)) return 0; - mode = (mask & CEPH_STAT_RSTAT) ? USE_AUTH_MDS : USE_ANY_MDS; + mode = ceph_try_to_choose_auth_mds(inode, mask); req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode); if (IS_ERR(req)) return PTR_ERR(req); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 20ceab74e871..d749c96070f6 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -1022,6 +1022,7 @@ static inline void ceph_queue_flush_snaps(struct inode *inode) ceph_queue_inode_work(inode, CEPH_I_WORK_FLUSH_SNAPS); } +extern int ceph_try_to_choose_auth_mds(struct inode *inode, int mask); extern int __ceph_do_getattr(struct inode *inode, struct page *locked_page, int mask, bool force); static inline int ceph_do_getattr(struct inode *inode, int mask, bool force) From 3459bd0c55ed28f86a1fd2e01e376492cb6d4f91 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Sun, 24 Apr 2022 17:35:53 +0800 Subject: [PATCH 283/334] ceph: redirty the page for writepage on failure When run out of memories we should redirty the page before failing the writepage. Or we will hit BUG_ON(folio_get_private(folio)) in ceph_dirty_folio(). URL: https://tracker.ceph.com/issues/55421 Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index c25a33dd6284..18956c629ad4 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -606,8 +606,10 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, snapc, ceph_wbc.truncate_seq, ceph_wbc.truncate_size, true); - if (IS_ERR(req)) + if (IS_ERR(req)) { + redirty_page_for_writepage(wbc, page); return PTR_ERR(req); + } set_page_writeback(page); if (caching) From 825978fd6a0defc3c29d8a38b6cea76a0938d21e Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Mon, 25 Apr 2022 16:08:24 +0800 Subject: [PATCH 284/334] ceph: fix possible deadlock when holding Fwb to get inline_data 1, mount with wsync. 2, create a file with O_RDWR, and the request was sent to mds.0: ceph_atomic_open()--> ceph_mdsc_do_request(openc) finish_open(file, dentry, ceph_open)--> ceph_open()--> ceph_init_file()--> ceph_init_file_info()--> ceph_uninline_data()--> { ... if (inline_version == 1 || /* initial version, no data */ inline_version == CEPH_INLINE_NONE) goto out_unlock; ... } The inline_version will be 1, which is the initial version for the new create file. And here the ci->i_inline_version will keep with 1, it's buggy. 3, buffer write to the file immediately: ceph_write_iter()--> ceph_get_caps(file, need=Fw, want=Fb, ...); generic_perform_write()--> a_ops->write_begin()--> ceph_write_begin()--> netfs_write_begin()--> netfs_begin_read()--> netfs_rreq_submit_slice()--> netfs_read_from_server()--> rreq->netfs_ops->issue_read()--> ceph_netfs_issue_read()--> { ... if (ci->i_inline_version != CEPH_INLINE_NONE && ceph_netfs_issue_op_inline(subreq)) return; ... 
} ceph_put_cap_refs(ci, Fwb); The ceph_netfs_issue_op_inline() will send a getattr(Fsr) request to mds.1. 4, then the mds.1 will request the rd lock for CInode::filelock from the auth mds.0, the mds.0 will do the CInode::filelock state transation from excl --> sync, but it need to revoke the Fxwb caps back from the clients. While the kernel client has aleady held the Fwb caps and waiting for the getattr(Fsr). It's deadlock! URL: https://tracker.ceph.com/issues/55377 Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 41 +++++++++++++++++++++++------------------ 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 18956c629ad4..1a108f24e7d9 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1648,7 +1648,7 @@ int ceph_uninline_data(struct file *file) struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); - struct ceph_osd_request *req; + struct ceph_osd_request *req = NULL; struct ceph_cap_flush *prealloc_cf; struct folio *folio = NULL; u64 inline_version = CEPH_INLINE_NONE; @@ -1656,18 +1656,6 @@ int ceph_uninline_data(struct file *file) int err = 0; u64 len; - prealloc_cf = ceph_alloc_cap_flush(); - if (!prealloc_cf) - return -ENOMEM; - - folio = read_mapping_folio(inode->i_mapping, 0, file); - if (IS_ERR(folio)) { - err = PTR_ERR(folio); - goto out; - } - - folio_lock(folio); - spin_lock(&ci->i_ceph_lock); inline_version = ci->i_inline_version; spin_unlock(&ci->i_ceph_lock); @@ -1675,9 +1663,23 @@ int ceph_uninline_data(struct file *file) dout("uninline_data %p %llx.%llx inline_version %llu\n", inode, ceph_vinop(inode), inline_version); - if (inline_version == 1 || /* initial version, no data */ - inline_version == CEPH_INLINE_NONE) - goto out_unlock; + if (inline_version == CEPH_INLINE_NONE) + return 0; + + prealloc_cf = ceph_alloc_cap_flush(); + if (!prealloc_cf) + return -ENOMEM; + + if (inline_version == 1) /* initial version, no data */ + goto out_uninline; + + folio = read_mapping_folio(inode->i_mapping, 0, file); + if (IS_ERR(folio)) { + err = PTR_ERR(folio); + goto out; + } + + folio_lock(folio); len = i_size_read(inode); if (len > folio_size(folio)) @@ -1743,6 +1745,7 @@ int ceph_uninline_data(struct file *file) ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency, req->r_end_latency, len, err); +out_uninline: if (!err) { int dirty; @@ -1761,8 +1764,10 @@ out_put_req: if (err == -ECANCELED) err = 0; out_unlock: - folio_unlock(folio); - folio_put(folio); + if (folio) { + folio_unlock(folio); + folio_put(folio); + } out: ceph_free_cap_flush(prealloc_cf); dout("uninline_data %p %llx.%llx inline_version %llu = %d\n", From 55ab5520802016b13098e0ea3794480289659aab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lu=C3=ADs=20Henriques?= Date: Wed, 27 Apr 2022 16:57:04 +0100 Subject: [PATCH 285/334] ceph: fix statfs for subdir mounts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When doing a mount using as base a directory that has 'max_bytes' quotas statfs uses that value as the total; if a subdirectory is used instead, the same 'max_bytes' too in statfs, unless there is another quota set. Unfortunately, if this subdirectory only has the 'max_files' quota set, then statfs uses the filesystem total. Fix this by making sure we only lookup realms that contain the 'max_bytes' quota. 
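To poke at the scenario from user space, a minimal sketch (mount point hypothetical; ceph.quota.* are the documented CephFS quota vxattrs) that puts only a file-count quota on a subdirectory and then checks what statvfs() reports:

  #include <stdio.h>
  #include <string.h>
  #include <sys/statvfs.h>
  #include <sys/xattr.h>

  int main(void)
  {
          const char *dir = "/mnt/cephfs/sub";    /* hypothetical subdirectory */
          const char *files = "10000";
          struct statvfs st;

          if (setxattr(dir, "ceph.quota.max_files", files, strlen(files), 0)) {
                  perror("setxattr");
                  return 1;
          }
          if (statvfs(dir, &st)) {
                  perror("statvfs");
                  return 1;
          }
          /* With this fix the total keeps reflecting the nearest ancestor's
           * max_bytes quota instead of the whole cluster size. */
          printf("f_blocks=%llu f_frsize=%lu\n",
                 (unsigned long long)st.f_blocks, st.f_frsize);
          return 0;
  }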
Cc: Ryan Taylor URL: https://tracker.ceph.com/issues/55090 Signed-off-by: Luís Henriques Reviewed-by: Jeff Layton Reviewed-by: Xiubo Li Signed-off-by: Ilya Dryomov --- fs/ceph/inode.c | 2 +- fs/ceph/quota.c | 19 +++++++++++-------- fs/ceph/super.h | 28 ++++++++++++++++++++++++---- 3 files changed, 36 insertions(+), 13 deletions(-) diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 2db1a21c9ae0..b7e9cac3aeef 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -578,7 +578,7 @@ void ceph_evict_inode(struct inode *inode) __ceph_remove_caps(ci); - if (__ceph_has_any_quota(ci)) + if (__ceph_has_quota(ci, QUOTA_GET_ANY)) ceph_adjust_quota_realms_count(inode, false); /* diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c index a338a3ec0dc4..64592adfe48f 100644 --- a/fs/ceph/quota.c +++ b/fs/ceph/quota.c @@ -195,9 +195,9 @@ void ceph_cleanup_quotarealms_inodes(struct ceph_mds_client *mdsc) /* * This function walks through the snaprealm for an inode and returns the - * ceph_snap_realm for the first snaprealm that has quotas set (either max_files - * or max_bytes). If the root is reached, return the root ceph_snap_realm - * instead. + * ceph_snap_realm for the first snaprealm that has quotas set (max_files, + * max_bytes, or any, depending on the 'which_quota' argument). If the root is + * reached, return the root ceph_snap_realm instead. * * Note that the caller is responsible for calling ceph_put_snap_realm() on the * returned realm. @@ -209,7 +209,9 @@ void ceph_cleanup_quotarealms_inodes(struct ceph_mds_client *mdsc) * will be restarted. */ static struct ceph_snap_realm *get_quota_realm(struct ceph_mds_client *mdsc, - struct inode *inode, bool retry) + struct inode *inode, + enum quota_get_realm which_quota, + bool retry) { struct ceph_inode_info *ci = NULL; struct ceph_snap_realm *realm, *next; @@ -248,7 +250,7 @@ restart: } ci = ceph_inode(in); - has_quota = __ceph_has_any_quota(ci); + has_quota = __ceph_has_quota(ci, which_quota); iput(in); next = realm->parent; @@ -279,8 +281,8 @@ restart: * dropped and we can then restart the whole operation. 
*/ down_read(&mdsc->snap_rwsem); - old_realm = get_quota_realm(mdsc, old, true); - new_realm = get_quota_realm(mdsc, new, false); + old_realm = get_quota_realm(mdsc, old, QUOTA_GET_ANY, true); + new_realm = get_quota_realm(mdsc, new, QUOTA_GET_ANY, false); if (PTR_ERR(new_realm) == -EAGAIN) { up_read(&mdsc->snap_rwsem); if (old_realm) @@ -483,7 +485,8 @@ bool ceph_quota_update_statfs(struct ceph_fs_client *fsc, struct kstatfs *buf) bool is_updated = false; down_read(&mdsc->snap_rwsem); - realm = get_quota_realm(mdsc, d_inode(fsc->sb->s_root), true); + realm = get_quota_realm(mdsc, d_inode(fsc->sb->s_root), + QUOTA_GET_MAX_BYTES, true); up_read(&mdsc->snap_rwsem); if (!realm) return false; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index d749c96070f6..dd7dac0f984a 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -1279,9 +1279,29 @@ extern void ceph_fs_debugfs_init(struct ceph_fs_client *client); extern void ceph_fs_debugfs_cleanup(struct ceph_fs_client *client); /* quota.c */ -static inline bool __ceph_has_any_quota(struct ceph_inode_info *ci) + +enum quota_get_realm { + QUOTA_GET_MAX_FILES, + QUOTA_GET_MAX_BYTES, + QUOTA_GET_ANY +}; + +static inline bool __ceph_has_quota(struct ceph_inode_info *ci, + enum quota_get_realm which) { - return ci->i_max_files || ci->i_max_bytes; + bool has_quota = false; + + switch (which) { + case QUOTA_GET_MAX_BYTES: + has_quota = !!ci->i_max_bytes; + break; + case QUOTA_GET_MAX_FILES: + has_quota = !!ci->i_max_files; + break; + default: + has_quota = !!(ci->i_max_files || ci->i_max_bytes); + } + return has_quota; } extern void ceph_adjust_quota_realms_count(struct inode *inode, bool inc); @@ -1290,10 +1310,10 @@ static inline void __ceph_update_quota(struct ceph_inode_info *ci, u64 max_bytes, u64 max_files) { bool had_quota, has_quota; - had_quota = __ceph_has_any_quota(ci); + had_quota = __ceph_has_quota(ci, QUOTA_GET_ANY); ci->i_max_bytes = max_bytes; ci->i_max_files = max_files; - has_quota = __ceph_has_any_quota(ci); + has_quota = __ceph_has_quota(ci, QUOTA_GET_ANY); if (had_quota != has_quota) ceph_adjust_quota_realms_count(&ci->vfs_inode, has_quota); From a74379543d229a3fb1af8cd44cbd19844a7bb1bc Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Wed, 27 Apr 2022 14:14:41 +0800 Subject: [PATCH 286/334] ceph: try to queue a writeback if revoking fails If the pagecaches writeback just finished and the i_wrbuffer_ref reaches zero it will try to trigger ceph_check_caps(). But if just before ceph_check_caps() the i_wrbuffer_ref could be increased again by mmap/cache write, then the Fwb revoke will fail. We need to try to queue a writeback in this case instead of triggering the writeback by BDI's delayed work per 5 seconds. URL: https://tracker.ceph.com/issues/46904 URL: https://tracker.ceph.com/issues/55377 Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index a051e6e4d7ca..bf2e94005598 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1911,6 +1911,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags, struct rb_node *p; bool queue_invalidate = false; bool tried_invalidate = false; + bool queue_writeback = false; if (session) ceph_get_mds_session(session); @@ -2063,10 +2064,27 @@ retry: } /* completed revocation? going down and there are no caps? 
*/ - if (revoking && (revoking & cap_used) == 0) { - dout("completed revocation of %s\n", - ceph_cap_string(cap->implemented & ~cap->issued)); - goto ack; + if (revoking) { + if ((revoking & cap_used) == 0) { + dout("completed revocation of %s\n", + ceph_cap_string(cap->implemented & ~cap->issued)); + goto ack; + } + + /* + * If the "i_wrbuffer_ref" was increased by mmap or generic + * cache write just before the ceph_check_caps() is called, + * the Fb capability revoking will fail this time. Then we + * must wait for the BDI's delayed work to flush the dirty + * pages and to release the "i_wrbuffer_ref", which will cost + * at most 5 seconds. That means the MDS needs to wait at + * most 5 seconds to finished the Fb capability's revocation. + * + * Let's queue a writeback for it. + */ + if (S_ISREG(inode->i_mode) && ci->i_wrbuffer_ref && + (revoking & CEPH_CAP_FILE_BUFFER)) + queue_writeback = true; } /* want more caps from mds? */ @@ -2136,6 +2154,8 @@ ack: spin_unlock(&ci->i_ceph_lock); ceph_put_mds_session(session); + if (queue_writeback) + ceph_queue_writeback(inode); if (queue_invalidate) ceph_queue_invalidate(inode); } From 2ecd0edd13a8bed87c3588bcd4a048113eff18f6 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 18 May 2022 09:55:08 +0100 Subject: [PATCH 287/334] ceph: remove redundant variable ino Variable ino is being assigned a value that is never read. The variable and assignment are redundant, remove it. Cleans up clang scan build warning: warning: Although the value stored to 'ino' is used in the enclosing expression, the value is never actually read from 'ino' [deadcode.DeadStores] Signed-off-by: Colin Ian King Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 23a6c11d805c..e9eb996e0aee 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -437,7 +437,7 @@ static int ceph_parse_deleg_inos(void **p, void *end, ceph_decode_32_safe(p, end, sets, bad); dout("got %u sets of delegated inodes\n", sets); while (sets--) { - u64 start, len, ino; + u64 start, len; ceph_decode_64_safe(p, end, start, bad); ceph_decode_64_safe(p, end, len, bad); @@ -449,7 +449,7 @@ static int ceph_parse_deleg_inos(void **p, void *end, continue; } while (len--) { - int err = xa_insert(&s->s_delegated_inos, ino = start++, + int err = xa_insert(&s->s_delegated_inos, start++, DELEGATED_INO_AVAILABLE, GFP_KERNEL); if (!err) { From 5e56776d5215ab5ab886006fc749346bad8473c8 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Wed, 18 May 2022 22:49:26 +0800 Subject: [PATCH 288/334] ceph: switch TASK_INTERRUPTIBLE to TASK_KILLABLE If the task is placed in the TASK_INTERRUPTIBLE state it will sleep until either something explicitly wakes it up, or a non-masked signal is received. Switch to TASK_KILLABLE to avoid the noises. 
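For reference, the semantic difference in a nutshell, as a kernel-style sketch (illustrative only; EXAMPLE_FLAG_BIT and the flags word are placeholders, not a real subsystem API): a TASK_KILLABLE sleep is woken only by fatal signals, so routine signals no longer abort the wait, while a killed task still gets out promptly.

#include <linux/sched.h>
#include <linux/wait_bit.h>

#define EXAMPLE_FLAG_BIT	0	/* placeholder bit number */

static int wait_for_flag(unsigned long *flags)
{
	/*
	 * TASK_INTERRUPTIBLE would wake (and return non-zero) for any
	 * pending signal; TASK_KILLABLE only wakes early for fatal
	 * signals. Callers still need to handle the non-zero return.
	 */
	return wait_on_bit(flags, EXAMPLE_FLAG_BIT, TASK_KILLABLE);
}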
Cc: Matthew Wilcox Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 33497846e47e..1140aecd82ce 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -579,7 +579,7 @@ static inline int ceph_wait_on_async_create(struct inode *inode) struct ceph_inode_info *ci = ceph_inode(inode); return wait_on_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT, - TASK_INTERRUPTIBLE); + TASK_KILLABLE); } extern u64 ceph_get_deleg_ino(struct ceph_mds_session *session); From ea16567f11018e2f58e72b667b0c803ff92b8153 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lu=C3=ADs=20Henriques?= Date: Mon, 23 May 2022 17:09:51 +0100 Subject: [PATCH 289/334] ceph: fix decoding of client session messages flags MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The cephfs kernel client started to show the message: ceph: mds0 session blocklisted when mounting a filesystem. This is due to the fact that the session messages are being incorrectly decoded: the skip needs to take into account the 'len'. While there, fixed some whitespaces too. Cc: stable@vger.kernel.org Fixes: e1c9788cb397 ("ceph: don't rely on error_string to validate blocklisted session.") Signed-off-by: Luís Henriques Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index e9eb996e0aee..f5d110d90b77 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -3425,13 +3425,17 @@ static void handle_session(struct ceph_mds_session *session, } if (msg_version >= 5) { - u32 flags; - /* version >= 4, struct_v, struct_cv, len, metric_spec */ - ceph_decode_skip_n(&p, end, 2 + sizeof(u32) * 2, bad); + u32 flags, len; + + /* version >= 4 */ + ceph_decode_skip_16(&p, end, bad); /* struct_v, struct_cv */ + ceph_decode_32_safe(&p, end, len, bad); /* len */ + ceph_decode_skip_n(&p, end, len, bad); /* metric_spec */ + /* version >= 5, flags */ - ceph_decode_32_safe(&p, end, flags, bad); + ceph_decode_32_safe(&p, end, flags, bad); if (flags & CEPH_SESSION_BLOCKLISTED) { - pr_warn("mds%d session blocklisted\n", session->s_mds); + pr_warn("mds%d session blocklisted\n", session->s_mds); blocklisted = true; } } From af7dc8e5124daf017ebc85c8ea97212e1e2e62fe Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 23 May 2022 13:22:09 -0400 Subject: [PATCH 290/334] MAINTAINERS: move myself from ceph "Maintainer" to "Reviewer" Xiubo has graciously volunteered to take over for me as the Linux cephfs client maintainer. Make it official by changing myself to be a "Reviewer" for libceph and ceph. 
Signed-off-by: Jeff Layton Acked-by: Xiubo Li Signed-off-by: Ilya Dryomov --- MAINTAINERS | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index f468864fd268..68038dd3eef8 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4547,8 +4547,8 @@ F: drivers/power/supply/cw2015_battery.c CEPH COMMON CODE (LIBCEPH) M: Ilya Dryomov -M: Jeff Layton M: Xiubo Li +R: Jeff Layton L: ceph-devel@vger.kernel.org S: Supported W: http://ceph.com/ @@ -4558,9 +4558,9 @@ F: include/linux/crush/ F: net/ceph/ CEPH DISTRIBUTED FILE SYSTEM CLIENT (CEPH) -M: Jeff Layton M: Xiubo Li M: Ilya Dryomov +R: Jeff Layton L: ceph-devel@vger.kernel.org S: Supported W: http://ceph.com/ From 61bf40ef51aa73f6216b33563271b6acf7ea8d70 Mon Sep 17 00:00:00 2001 From: Eddie James Date: Wed, 25 May 2022 11:58:51 -0500 Subject: [PATCH 291/334] spi: fsi: Fix spurious timeout The driver may return a timeout error even if the status register indicates that the transfer may proceed. Fix this by restructuring the polling loop. Fixes: 89b35e3f2851 ("spi: fsi: Implement a timeout for polling status") Signed-off-by: Eddie James Link: https://lore.kernel.org/r/20220525165852.33167-2-eajames@linux.ibm.com Signed-off-by: Mark Brown --- drivers/spi/spi-fsi.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/spi/spi-fsi.c b/drivers/spi/spi-fsi.c index d403a7a3021d..72ab066ce552 100644 --- a/drivers/spi/spi-fsi.c +++ b/drivers/spi/spi-fsi.c @@ -319,12 +319,12 @@ static int fsi_spi_transfer_data(struct fsi_spi *ctx, end = jiffies + msecs_to_jiffies(SPI_FSI_STATUS_TIMEOUT_MS); do { + if (time_after(jiffies, end)) + return -ETIMEDOUT; + rc = fsi_spi_status(ctx, &status, "TX"); if (rc) return rc; - - if (time_after(jiffies, end)) - return -ETIMEDOUT; } while (status & SPI_FSI_STATUS_TDR_FULL); sent += nb; @@ -337,12 +337,12 @@ static int fsi_spi_transfer_data(struct fsi_spi *ctx, while (transfer->len > recv) { end = jiffies + msecs_to_jiffies(SPI_FSI_STATUS_TIMEOUT_MS); do { + if (time_after(jiffies, end)) + return -ETIMEDOUT; + rc = fsi_spi_status(ctx, &status, "RX"); if (rc) return rc; - - if (time_after(jiffies, end)) - return -ETIMEDOUT; } while (!(status & SPI_FSI_STATUS_RDR_FULL)); rc = fsi_spi_read_reg(ctx, SPI_FSI_DATA_RX, &in); From ebf2a3521738520e12849b221fea24928b3f61ff Mon Sep 17 00:00:00 2001 From: Eddie James Date: Wed, 25 May 2022 11:58:52 -0500 Subject: [PATCH 292/334] spi: core: Display return code when failing to transfer message All the other calls to the controller driver display the error return code. The return code is helpful to understand what went wrong, so include it when failing to transfer one message. 
Signed-off-by: Eddie James Link: https://lore.kernel.org/r/20220525165852.33167-3-eajames@linux.ibm.com Signed-off-by: Mark Brown --- drivers/spi/spi.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index 136bd0e51ada..bbc8d4448e79 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -1672,7 +1672,8 @@ static void __spi_pump_messages(struct spi_controller *ctlr, bool in_kthread) ret = ctlr->transfer_one_message(ctlr, msg); if (ret) { dev_err(&ctlr->dev, - "failed to transfer one message from queue\n"); + "failed to transfer one message from queue: %d\n", + ret); goto out; } From 5672225e8f2a872a22b0cecedba7a6644af1fb84 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 27 May 2022 10:20:45 +1000 Subject: [PATCH 293/334] xfs: avoid unnecessary runtime sibling pointer endian conversions Commit dc04db2aa7c9 has caused a small aim7 regression, showing a small increase in CPU usage in __xfs_btree_check_sblock() as a result of the extra checking. This is likely due to the endian conversion of the sibling poitners being unconditional instead of relying on the compiler to endian convert the NULL pointer at compile time and avoiding the runtime conversion for this common case. Rework the checks so that endian conversion of the sibling pointers is only done if they are not null as the original code did. .... and these need to be "inline" because the compiler completely fails to inline them automatically like it should be doing. $ size fs/xfs/libxfs/xfs_btree.o* text data bss dec hex filename 51874 240 0 52114 cb92 fs/xfs/libxfs/xfs_btree.o.orig 51562 240 0 51802 ca5a fs/xfs/libxfs/xfs_btree.o.inline Just when you think the tools have advanced sufficiently we don't have to care about stuff like this anymore, along comes a reminder that *our tools still suck*. Fixes: dc04db2aa7c9 ("xfs: detect self referencing btree sibling pointers") Reported-by: kernel test robot Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_btree.c | 47 +++++++++++++++++++++++++++------------ 1 file changed, 33 insertions(+), 14 deletions(-) diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 2aa300f7461f..786ec1cb1bba 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -51,16 +51,31 @@ xfs_btree_magic( return magic; } -static xfs_failaddr_t +/* + * These sibling pointer checks are optimised for null sibling pointers. This + * happens a lot, and we don't need to byte swap at runtime if the sibling + * pointer is NULL. + * + * These are explicitly marked at inline because the cost of calling them as + * functions instead of inlining them is about 36 bytes extra code per call site + * on x86-64. Yes, gcc-11 fails to inline them, and explicit inlining of these + * two sibling check functions reduces the compiled code size by over 300 + * bytes. 
+ */ +static inline xfs_failaddr_t xfs_btree_check_lblock_siblings( struct xfs_mount *mp, struct xfs_btree_cur *cur, int level, xfs_fsblock_t fsb, - xfs_fsblock_t sibling) + __be64 dsibling) { - if (sibling == NULLFSBLOCK) + xfs_fsblock_t sibling; + + if (dsibling == cpu_to_be64(NULLFSBLOCK)) return NULL; + + sibling = be64_to_cpu(dsibling); if (sibling == fsb) return __this_address; if (level >= 0) { @@ -74,17 +89,21 @@ xfs_btree_check_lblock_siblings( return NULL; } -static xfs_failaddr_t +static inline xfs_failaddr_t xfs_btree_check_sblock_siblings( struct xfs_mount *mp, struct xfs_btree_cur *cur, int level, xfs_agnumber_t agno, xfs_agblock_t agbno, - xfs_agblock_t sibling) + __be32 dsibling) { - if (sibling == NULLAGBLOCK) + xfs_agblock_t sibling; + + if (dsibling == cpu_to_be32(NULLAGBLOCK)) return NULL; + + sibling = be32_to_cpu(dsibling); if (sibling == agbno) return __this_address; if (level >= 0) { @@ -136,10 +155,10 @@ __xfs_btree_check_lblock( fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp)); fa = xfs_btree_check_lblock_siblings(mp, cur, level, fsb, - be64_to_cpu(block->bb_u.l.bb_leftsib)); + block->bb_u.l.bb_leftsib); if (!fa) fa = xfs_btree_check_lblock_siblings(mp, cur, level, fsb, - be64_to_cpu(block->bb_u.l.bb_rightsib)); + block->bb_u.l.bb_rightsib); return fa; } @@ -204,10 +223,10 @@ __xfs_btree_check_sblock( } fa = xfs_btree_check_sblock_siblings(mp, cur, level, agno, agbno, - be32_to_cpu(block->bb_u.s.bb_leftsib)); + block->bb_u.s.bb_leftsib); if (!fa) fa = xfs_btree_check_sblock_siblings(mp, cur, level, agno, - agbno, be32_to_cpu(block->bb_u.s.bb_rightsib)); + agbno, block->bb_u.s.bb_rightsib); return fa; } @@ -4523,10 +4542,10 @@ xfs_btree_lblock_verify( /* sibling pointer verification */ fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp)); fa = xfs_btree_check_lblock_siblings(mp, NULL, -1, fsb, - be64_to_cpu(block->bb_u.l.bb_leftsib)); + block->bb_u.l.bb_leftsib); if (!fa) fa = xfs_btree_check_lblock_siblings(mp, NULL, -1, fsb, - be64_to_cpu(block->bb_u.l.bb_rightsib)); + block->bb_u.l.bb_rightsib); return fa; } @@ -4580,10 +4599,10 @@ xfs_btree_sblock_verify( agno = xfs_daddr_to_agno(mp, xfs_buf_daddr(bp)); agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp)); fa = xfs_btree_check_sblock_siblings(mp, NULL, -1, agno, agbno, - be32_to_cpu(block->bb_u.s.bb_leftsib)); + block->bb_u.s.bb_leftsib); if (!fa) fa = xfs_btree_check_sblock_siblings(mp, NULL, -1, agno, agbno, - be32_to_cpu(block->bb_u.s.bb_rightsib)); + block->bb_u.s.bb_rightsib); return fa; } From 5b55cbc2d72632e874e50d2e36bce608e55aaaea Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 27 May 2022 10:21:04 +1000 Subject: [PATCH 294/334] xfs: don't assert fail on perag references on teardown Not fatal, the assert is there to catch developer attention. I'm seeing this occasionally during recoveryloop testing after a shutdown, and I don't want this to stop an overnight recoveryloop run as it is currently doing. Convert the ASSERT to a XFS_IS_CORRUPT() check so it will dump a corruption report into the log and cause a test failure that way, but it won't stop the machine dead. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. 
Wong Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_ag.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index 1e4ee042d52f..3e920cf1b454 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -173,7 +173,6 @@ __xfs_free_perag( struct xfs_perag *pag = container_of(head, struct xfs_perag, rcu_head); ASSERT(!delayed_work_pending(&pag->pag_blockgc_work)); - ASSERT(atomic_read(&pag->pag_ref) == 0); kmem_free(pag); } @@ -192,7 +191,7 @@ xfs_free_perag( pag = radix_tree_delete(&mp->m_perag_tree, agno); spin_unlock(&mp->m_perag_lock); ASSERT(pag); - ASSERT(atomic_read(&pag->pag_ref) == 0); + XFS_IS_CORRUPT(pag->pag_mount, atomic_read(&pag->pag_ref) != 0); cancel_delayed_work_sync(&pag->pag_blockgc_work); xfs_iunlink_destroy(pag); From 56486f307100e8fc66efa2ebd8a71941fa10bf6f Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 27 May 2022 10:21:09 +1000 Subject: [PATCH 295/334] xfs: assert in xfs_btree_del_cursor should take into account error xfs/538 on a 1kB block filesystem failed with this assert: XFS: Assertion failed: cur->bc_btnum != XFS_BTNUM_BMAP || cur->bc_ino.allocated == 0 || xfs_is_shutdown(cur->bc_mp), file: fs/xfs/libxfs/xfs_btree.c, line: 448 The problem was that an allocation failed unexpectedly in xfs_bmbt_alloc_block() after roughly 150,000 minlen allocation error injections, resulting in an EFSCORRUPTED error being returned to xfs_bmapi_write(). The error occurred on extent-to-btree format conversion allocating the new root block: RIP: 0010:xfs_bmbt_alloc_block+0x177/0x210 Call Trace: xfs_btree_new_iroot+0xdf/0x520 xfs_btree_make_block_unfull+0x10d/0x1c0 xfs_btree_insrec+0x364/0x790 xfs_btree_insert+0xaa/0x210 xfs_bmap_add_extent_hole_real+0x1fe/0x9a0 xfs_bmapi_allocate+0x34c/0x420 xfs_bmapi_write+0x53c/0x9c0 xfs_alloc_file_space+0xee/0x320 xfs_file_fallocate+0x36b/0x450 vfs_fallocate+0x148/0x340 __x64_sys_fallocate+0x3c/0x70 do_syscall_64+0x35/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xa Why the allocation failed at this point is unknown, but is likely that we ran the transaction out of reserved space and filesystem out of space with bmbt blocks because of all the minlen allocations being done causing worst case fragmentation of a large allocation. Regardless of the cause, we've then called xfs_bmapi_finish() which calls xfs_btree_del_cursor(cur, error) to tear down the cursor. So we have a failed operation, error != 0, cur->bc_ino.allocated > 0 and the filesystem is still up. The assert fails to take into account that allocation can fail with an error and the transaction teardown will shut the filesystem down if necessary. i.e. the assert needs to check "|| error != 0" as well, because at this point shutdown is pending because the current transaction is dirty.... Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_btree.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 786ec1cb1bba..32100cfb9dfc 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -445,8 +445,14 @@ xfs_btree_del_cursor( break; } + /* + * If we are doing a BMBT update, the number of unaccounted blocks + * allocated during this cursor life time should be zero. If it's not + * zero, then we should be shut down or on our way to shutdown due to + * cancelling a dirty transaction on error. 
+ */ ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP || cur->bc_ino.allocated == 0 || - xfs_is_shutdown(cur->bc_mp)); + xfs_is_shutdown(cur->bc_mp) || error != 0); if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) kmem_free(cur->bc_ops); if (!(cur->bc_flags & XFS_BTREE_LONG_PTRS) && cur->bc_ag.pag) From 86d40f1e49e9a909d25c35ba01bea80dbcd758cb Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 27 May 2022 10:21:43 +1000 Subject: [PATCH 296/334] xfs: purge dquots after inode walk fails during quotacheck xfs/434 and xfs/436 have been reporting occasional memory leaks of xfs_dquot objects. These tests themselves were the messenger, not the culprit, since they unload the xfs module, which trips the slub debugging code while tearing down all the xfs slab caches: ============================================================================= BUG xfs_dquot (Tainted: G W ): Objects remaining in xfs_dquot on __kmem_cache_shutdown() ----------------------------------------------------------------------------- Slab 0xffffea000606de00 objects=30 used=5 fp=0xffff888181b78a78 flags=0x17ff80000010200(slab|head|node=0|zone=2|lastcpupid=0xfff) CPU: 0 PID: 3953166 Comm: modprobe Tainted: G W 5.18.0-rc6-djwx #rc6 d5824be9e46a2393677bda868f9b154d917ca6a7 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20171121_152543-x86-ol7-builder-01.us.oracle.com-4.el7.1 04/01/2014 Since we don't generally rmmod the xfs module between fstests, this means that xfs/434 is really just the canary in the coal mine -- something leaked a dquot, but we don't know who. After days of pounding on fstests with kmemleak enabled, I finally got it to spit this out: unreferenced object 0xffff8880465654c0 (size 536): comm "u10:4", pid 88, jiffies 4294935810 (age 29.512s) hex dump (first 32 bytes): 60 4a 56 46 80 88 ff ff 58 ea e4 5c 80 88 ff ff `JVF....X..\.... 00 e0 52 49 80 88 ff ff 01 00 01 00 00 00 00 00 ..RI............ backtrace: [] xfs_dquot_alloc+0x2c/0x530 [xfs] [] xfs_qm_dqread+0x6f/0x330 [xfs] [] xfs_qm_dqget+0x132/0x4e0 [xfs] [] xfs_qm_quotacheck_dqadjust+0xa0/0x3e0 [xfs] [] xfs_qm_dqusage_adjust+0x35d/0x4f0 [xfs] [] xfs_iwalk_ag_recs+0x348/0x5d0 [xfs] [] xfs_iwalk_run_callbacks+0x273/0x540 [xfs] [] xfs_iwalk_ag+0x5ed/0x890 [xfs] [] xfs_iwalk_ag_work+0xff/0x170 [xfs] [] xfs_pwork_work+0x79/0x130 [xfs] [] process_one_work+0x672/0x1040 [] worker_thread+0x59b/0xec0 [] kthread+0x29e/0x340 [] ret_from_fork+0x1f/0x30 Now we know that quotacheck is at fault, but even this report was canaryish -- it was triggered by xfs/494, which doesn't actually mount any filesystems. (kmemleak can be a little slow to notice leaks, even with fstests repeatedly whacking it to look for them.) Looking at the *previous* fstest, however, showed that the test run before xfs/494 was xfs/117. The tipoff to the problem is in this excerpt from dmesg: XFS (sda4): Quotacheck needed: Please wait. XFS (sda4): Metadata corruption detected at xfs_dinode_verify.part.0+0xdb/0x7b0 [xfs], inode 0x119 dinode XFS (sda4): Unmount and run xfs_repair XFS (sda4): First 128 bytes of corrupted metadata buffer: 00000000: 49 4e 81 a4 03 02 00 00 00 00 00 00 00 00 00 00 IN.............. 00000010: 00 00 00 01 00 00 00 00 00 90 57 54 54 1a 4c 68 ..........WTT.Lh 00000020: 81 f9 7d e1 6d ee 16 00 34 bd 7d e1 6d ee 16 00 ..}.m...4.}.m... 00000030: 34 bd 7d e1 6d ee 16 00 00 00 00 00 00 00 00 00 4.}.m........... 00000040: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00000050: 00 00 00 02 00 00 00 00 00 00 00 00 96 80 f3 ab ................ 
00000060: ff ff ff ff da 57 7b 11 00 00 00 00 00 00 00 03 .....W{......... 00000070: 00 00 00 01 00 00 00 10 00 00 00 00 00 00 00 08 ................ XFS (sda4): Quotacheck: Unsuccessful (Error -117): Disabling quotas. The dinode verifier decided that the inode was corrupt, which causes iget to return with EFSCORRUPTED. Since this happened during quotacheck, it is obvious that the kernel aborted the inode walk on account of the corruption error and disabled quotas. Unfortunately, we neglect to purge the dquot cache before doing that, which is how the dquots leaked. The problems started 10 years ago in commit b84a3a, when the dquot lists were converted to a radix tree, but the error handling behavior was not correctly preserved -- in that commit, if the bulkstat failed and usrquota was enabled, the bulkstat failure code would be overwritten by the result of flushing all the dquots to disk. As long as that succeeds, we'd continue the quota mount as if everything were ok, but instead we're now operating with a corrupt inode and incorrect quota usage counts. I didn't notice this bug in 2019 when I wrote commit ebd126a, which changed quotacheck to skip the dqflush when the scan doesn't complete due to inode walk failures. Introduced-by: b84a3a96751f ("xfs: remove the per-filesystem list of dquots") Fixes: ebd126a651f8 ("xfs: convert quotacheck to use the new iwalk functions") Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_qm.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 8fc813cb6011..abf08bbf34a9 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -1308,8 +1308,15 @@ xfs_qm_quotacheck( error = xfs_iwalk_threaded(mp, 0, 0, xfs_qm_dqusage_adjust, 0, true, NULL); - if (error) + if (error) { + /* + * The inode walk may have partially populated the dquot + * caches. We must purge them before disabling quota and + * tearing down the quotainfo, or else the dquots will leak. + */ + xfs_qm_dqpurge_all(mp); goto error_return; + } /* * We've made all the changes that we need to make incore. Flush them From a54f78def73d847cb060b18c4e4a3d1d26c9ca6d Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 27 May 2022 10:22:56 +1000 Subject: [PATCH 297/334] xfs: don't leak btree cursor when insrec fails after a split The recent patch to improve btree cycle checking caused a regression when I rebased the in-memory btree branch atop the 5.19 for-next branch, because in-memory short-pointer btrees do not have AG numbers. This produced the following complaint from kmemleak: unreferenced object 0xffff88803d47dde8 (size 264): comm "xfs_io", pid 4889, jiffies 4294906764 (age 24.072s) hex dump (first 32 bytes): 90 4d 0b 0f 80 88 ff ff 00 a0 bd 05 80 88 ff ff .M.............. e0 44 3a a0 ff ff ff ff 00 df 08 06 80 88 ff ff .D:............. 
backtrace: [] xfbtree_dup_cursor+0x49/0xc0 [xfs] [] xfs_btree_dup_cursor+0x3b/0x200 [xfs] [] __xfs_btree_split+0x6ad/0x820 [xfs] [] xfs_btree_split+0x60/0x110 [xfs] [] xfs_btree_make_block_unfull+0x19a/0x1f0 [xfs] [] xfs_btree_insrec+0x3aa/0x810 [xfs] [] xfs_btree_insert+0xb3/0x240 [xfs] [] xfs_rmap_insert+0x99/0x200 [xfs] [] xfs_rmap_map_shared+0x192/0x5f0 [xfs] [] xfs_rmap_map_raw+0x6b/0x90 [xfs] [] xrep_rmap_stash+0xd5/0x1d0 [xfs] [] xrep_rmap_visit_bmbt+0xa0/0xf0 [xfs] [] xrep_rmap_scan_iext+0x56/0xa0 [xfs] [] xrep_rmap_scan_ifork+0xd8/0x160 [xfs] [] xrep_rmap_scan_inode+0x35/0x80 [xfs] [] xrep_rmap_find_rmaps+0x10e/0x270 [xfs] I noticed that xfs_btree_insrec has a bunch of debug code that return out of the function immediately, without freeing the "new" btree cursor that can be returned when _make_block_unfull calls xfs_btree_split. Fix the error return in this function to free the btree cursor. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_btree.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 32100cfb9dfc..2eecc49fc1b2 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -3272,7 +3272,7 @@ xfs_btree_insrec( struct xfs_btree_block *block; /* btree block */ struct xfs_buf *bp; /* buffer for block */ union xfs_btree_ptr nptr; /* new block ptr */ - struct xfs_btree_cur *ncur; /* new btree cursor */ + struct xfs_btree_cur *ncur = NULL; /* new btree cursor */ union xfs_btree_key nkey; /* new block key */ union xfs_btree_key *lkey; int optr; /* old key/record index */ @@ -3352,7 +3352,7 @@ xfs_btree_insrec( #ifdef DEBUG error = xfs_btree_check_block(cur, block, level, bp); if (error) - return error; + goto error0; #endif /* @@ -3372,7 +3372,7 @@ xfs_btree_insrec( for (i = numrecs - ptr; i >= 0; i--) { error = xfs_btree_debug_check_ptr(cur, pp, i, level); if (error) - return error; + goto error0; } xfs_btree_shift_keys(cur, kp, 1, numrecs - ptr + 1); @@ -3457,6 +3457,8 @@ xfs_btree_insrec( return 0; error0: + if (ncur) + xfs_btree_del_cursor(ncur, error); return error; } From 2723234923b3294dbcf6019c288c87465e927ed4 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 27 May 2022 10:26:17 +1000 Subject: [PATCH 298/334] xfs: refactor buffer cancellation table allocation Move the code that allocates and frees the buffer cancellation tables used by log recovery into the file that actually uses the tables. This is a precursor to some cleanups and a memory leak fix. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_log_recover.h | 14 +++++----- fs/xfs/xfs_buf_item_recover.c | 47 +++++++++++++++++++++++++++++++++ fs/xfs/xfs_log_priv.h | 3 --- fs/xfs/xfs_log_recover.c | 32 +++++++--------------- 4 files changed, 64 insertions(+), 32 deletions(-) diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h index 32e216255cb0..cc36ef9f5df5 100644 --- a/fs/xfs/libxfs/xfs_log_recover.h +++ b/fs/xfs/libxfs/xfs_log_recover.h @@ -110,12 +110,6 @@ struct xlog_recover { #define ITEM_TYPE(i) (*(unsigned short *)(i)->ri_buf[0].i_addr) -/* - * This is the number of entries in the l_buf_cancel_table used during - * recovery. 
- */ -#define XLOG_BC_TABLE_SIZE 64 - #define XLOG_RECOVER_CRCPASS 0 #define XLOG_RECOVER_PASS1 1 #define XLOG_RECOVER_PASS2 2 @@ -128,5 +122,13 @@ int xlog_recover_iget(struct xfs_mount *mp, xfs_ino_t ino, struct xfs_inode **ipp); void xlog_recover_release_intent(struct xlog *log, unsigned short intent_type, uint64_t intent_id); +void xlog_alloc_buf_cancel_table(struct xlog *log); +void xlog_free_buf_cancel_table(struct xlog *log); + +#ifdef DEBUG +void xlog_check_buf_cancel_table(struct xlog *log); +#else +#define xlog_check_buf_cancel_table(log) do { } while (0) +#endif #endif /* __XFS_LOG_RECOVER_H__ */ diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c index e484251dc9c8..d2e2dff01b99 100644 --- a/fs/xfs/xfs_buf_item_recover.c +++ b/fs/xfs/xfs_buf_item_recover.c @@ -23,6 +23,15 @@ #include "xfs_dir2.h" #include "xfs_quota.h" +/* + * This is the number of entries in the l_buf_cancel_table used during + * recovery. + */ +#define XLOG_BC_TABLE_SIZE 64 + +#define XLOG_BUF_CANCEL_BUCKET(log, blkno) \ + ((log)->l_buf_cancel_table + ((uint64_t)blkno % XLOG_BC_TABLE_SIZE)) + /* * This structure is used during recovery to record the buf log items which * have been canceled and should not be replayed. @@ -993,3 +1002,41 @@ const struct xlog_recover_item_ops xlog_buf_item_ops = { .commit_pass1 = xlog_recover_buf_commit_pass1, .commit_pass2 = xlog_recover_buf_commit_pass2, }; + +#ifdef DEBUG +void +xlog_check_buf_cancel_table( + struct xlog *log) +{ + int i; + + for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) + ASSERT(list_empty(&log->l_buf_cancel_table[i])); +} +#endif + +void +xlog_alloc_buf_cancel_table( + struct xlog *log) +{ + int i; + + ASSERT(log->l_buf_cancel_table == NULL); + + log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE * + sizeof(struct list_head), + 0); + for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) + INIT_LIST_HEAD(&log->l_buf_cancel_table[i]); +} + +void +xlog_free_buf_cancel_table( + struct xlog *log) +{ + if (!log->l_buf_cancel_table) + return; + + kmem_free(log->l_buf_cancel_table); + log->l_buf_cancel_table = NULL; +} diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 67fd9789e69a..686c01eb3661 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -428,9 +428,6 @@ struct xlog { struct rw_semaphore l_incompat_users; }; -#define XLOG_BUF_CANCEL_BUCKET(log, blkno) \ - ((log)->l_buf_cancel_table + ((uint64_t)blkno % XLOG_BC_TABLE_SIZE)) - /* * Bits for operational state */ diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index b1980d7cbbee..9cf59ae98b86 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -3223,7 +3223,7 @@ xlog_do_log_recovery( xfs_daddr_t head_blk, xfs_daddr_t tail_blk) { - int error, i; + int error; ASSERT(head_blk != tail_blk); @@ -3231,37 +3231,23 @@ xlog_do_log_recovery( * First do a pass to find all of the cancelled buf log items. * Store them in the buf_cancel_table for use in the second pass. */ - log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE * - sizeof(struct list_head), - 0); - for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) - INIT_LIST_HEAD(&log->l_buf_cancel_table[i]); + xlog_alloc_buf_cancel_table(log); error = xlog_do_recovery_pass(log, head_blk, tail_blk, XLOG_RECOVER_PASS1, NULL); - if (error != 0) { - kmem_free(log->l_buf_cancel_table); - log->l_buf_cancel_table = NULL; - return error; - } + if (error != 0) + goto out_cancel; + /* * Then do a second pass to actually recover the items in the log. * When it is complete free the table of buf cancel items. 
*/ error = xlog_do_recovery_pass(log, head_blk, tail_blk, XLOG_RECOVER_PASS2, NULL); -#ifdef DEBUG - if (!error) { - int i; - - for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) - ASSERT(list_empty(&log->l_buf_cancel_table[i])); - } -#endif /* DEBUG */ - - kmem_free(log->l_buf_cancel_table); - log->l_buf_cancel_table = NULL; - + if (!error) + xlog_check_buf_cancel_table(log); +out_cancel: + xlog_free_buf_cancel_table(log); return error; } From 8db074bd84df5ccc88bff3f8f900f66f4b8349fa Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 27 May 2022 10:26:38 +1000 Subject: [PATCH 299/334] xfs: don't leak xfs_buf_cancel structures when recovery fails If log recovery fails, we free the memory used by the buffer cancellation buckets, but we don't actually traverse each bucket list to free the individual xfs_buf_cancel objects. This leads to a memory leak, as reported by kmemleak in xfs/051: unreferenced object 0xffff888103629560 (size 32): comm "mount", pid 687045, jiffies 4296935916 (age 10.752s) hex dump (first 32 bytes): 08 d3 0a 01 00 00 00 00 08 00 00 00 01 00 00 00 ................ d0 f5 0b 92 81 88 ff ff 80 64 64 25 81 88 ff ff .........dd%.... backtrace: [] kmem_alloc+0x73/0x140 [xfs] [] xlog_recover_buf_commit_pass1+0x139/0x200 [xfs] [] xlog_recover_commit_trans+0x307/0x350 [xfs] [] xlog_recovery_process_trans+0xa5/0xe0 [xfs] [] xlog_recover_process_data+0x8d/0x140 [xfs] [] xlog_do_recovery_pass+0x19d/0x740 [xfs] [] xlog_do_log_recovery+0x6d/0x150 [xfs] [] xlog_do_recover+0x33/0x1d0 [xfs] [] xlog_recover+0xda/0x190 [xfs] [] xfs_log_mount+0x14c/0x360 [xfs] [] xfs_mountfs+0x50d/0xa60 [xfs] [] xfs_fs_fill_super+0x6a5/0x950 [xfs] [] get_tree_bdev+0x175/0x280 [] vfs_get_tree+0x1a/0x80 [] path_mount+0x6ff/0xaa0 [] __x64_sys_mount+0x103/0x140 Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_buf_item_recover.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c index d2e2dff01b99..f983af4de0a5 100644 --- a/fs/xfs/xfs_buf_item_recover.c +++ b/fs/xfs/xfs_buf_item_recover.c @@ -1034,9 +1034,22 @@ void xlog_free_buf_cancel_table( struct xlog *log) { + int i; + if (!log->l_buf_cancel_table) return; + for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) { + struct xfs_buf_cancel *bc; + + while ((bc = list_first_entry_or_null( + &log->l_buf_cancel_table[i], + struct xfs_buf_cancel, bc_list))) { + list_del(&bc->bc_list); + kmem_free(bc); + } + } + kmem_free(log->l_buf_cancel_table); log->l_buf_cancel_table = NULL; } From 910bbdf2f4d7df46781bc9b723048f5ebed3d0d7 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 27 May 2022 10:27:19 +1000 Subject: [PATCH 300/334] xfs: convert buf_cancel_table allocation to kmalloc_array While we're messing around with how recovery allocates and frees the buffer cancellation table, convert the allocation to use kmalloc_array instead of the old kmem_alloc APIs, and make it handle a null return, even though that's not likely. Signed-off-by: Darrick J. 
Wong Reviewed-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_log_recover.h | 2 +- fs/xfs/xfs_buf_item_recover.c | 14 ++++++++++---- fs/xfs/xfs_log_recover.c | 4 +++- 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h index cc36ef9f5df5..2420865f3007 100644 --- a/fs/xfs/libxfs/xfs_log_recover.h +++ b/fs/xfs/libxfs/xfs_log_recover.h @@ -122,7 +122,7 @@ int xlog_recover_iget(struct xfs_mount *mp, xfs_ino_t ino, struct xfs_inode **ipp); void xlog_recover_release_intent(struct xlog *log, unsigned short intent_type, uint64_t intent_id); -void xlog_alloc_buf_cancel_table(struct xlog *log); +int xlog_alloc_buf_cancel_table(struct xlog *log); void xlog_free_buf_cancel_table(struct xlog *log); #ifdef DEBUG diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c index f983af4de0a5..ffa94102094d 100644 --- a/fs/xfs/xfs_buf_item_recover.c +++ b/fs/xfs/xfs_buf_item_recover.c @@ -1015,19 +1015,25 @@ xlog_check_buf_cancel_table( } #endif -void +int xlog_alloc_buf_cancel_table( struct xlog *log) { + void *p; int i; ASSERT(log->l_buf_cancel_table == NULL); - log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE * - sizeof(struct list_head), - 0); + p = kmalloc_array(XLOG_BC_TABLE_SIZE, sizeof(struct list_head), + GFP_KERNEL); + if (!p) + return -ENOMEM; + + log->l_buf_cancel_table = p; for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) INIT_LIST_HEAD(&log->l_buf_cancel_table[i]); + + return 0; } void diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 9cf59ae98b86..5f7e4e6e33ce 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -3231,7 +3231,9 @@ xlog_do_log_recovery( * First do a pass to find all of the cancelled buf log items. * Store them in the buf_cancel_table for use in the second pass. */ - xlog_alloc_buf_cancel_table(log); + error = xlog_alloc_buf_cancel_table(log); + if (error) + return error; error = xlog_do_recovery_pass(log, head_blk, tail_blk, XLOG_RECOVER_PASS1, NULL); From 374037966d661d62139a8b6fd33f42fc2038a1e6 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 27 May 2022 10:29:51 +1000 Subject: [PATCH 301/334] xfs: don't log every time we clear the log incompat flags There's no need to spam the logs every time we clear the log incompat flags -- if someone is periodically using one of these features, they'll be cleared every time the log tries to clean itself, which can get pretty chatty: $ dmesg | grep -i clear [ 5363.894711] XFS (sdd): Clearing log incompat feature flags. [ 5365.157516] XFS (sdd): Clearing log incompat feature flags. [ 5369.388543] XFS (sdd): Clearing log incompat feature flags. [ 5371.281246] XFS (sdd): Clearing log incompat feature flags. These aren't high value messages either -- nothing's gone wrong, and nobody's trying anything tricky. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_mount.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 0c0bcbd4949d..daa8d29c46b4 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1356,7 +1356,6 @@ xfs_clear_incompat_log_features( if (xfs_sb_has_incompat_log_feature(&mp->m_sb, XFS_SB_FEAT_INCOMPAT_LOG_ALL)) { - xfs_info(mp, "Clearing log incompat feature flags."); xfs_sb_remove_incompat_log_features(&mp->m_sb); ret = true; } From df5660cf63bbafb5a1250954b91d9ec26558536f Mon Sep 17 00:00:00 2001 From: "Darrick J. 
Wong" Date: Fri, 27 May 2022 10:31:34 +1000 Subject: [PATCH 302/334] xfs: implement per-mount warnings for scrub and shrink usage Currently, we don't have a consistent story around logging when an EXPERIMENTAL feature gets turned on at runtime -- online fsck and shrink log a message once per day across all mounts, and the recently merged LARP mode only ever does it once per insmod cycle or reboot. Because EXPERIMENTAL tags are supposed to go away eventually, convert the existing daily warnings into state flags that travel with the mount, and warn once per mount. Making this an opstate flag means that we'll be able to capture the experimental usage in the ftrace output too. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/scrub/scrub.c | 17 ++--------------- fs/xfs/xfs_fsops.c | 7 +------ fs/xfs/xfs_message.h | 6 ++++++ fs/xfs/xfs_mount.h | 15 ++++++++++++++- 4 files changed, 23 insertions(+), 22 deletions(-) diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index b11870d07c56..2e8e400f10a9 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -340,20 +340,6 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { }, }; -/* This isn't a stable feature, warn once per day. */ -static inline void -xchk_experimental_warning( - struct xfs_mount *mp) -{ - static struct ratelimit_state scrub_warning = RATELIMIT_STATE_INIT( - "xchk_warning", 86400 * HZ, 1); - ratelimit_set_flags(&scrub_warning, RATELIMIT_MSG_ON_RELEASE); - - if (__ratelimit(&scrub_warning)) - xfs_alert(mp, -"EXPERIMENTAL online scrub feature in use. Use at your own risk!"); -} - static int xchk_validate_inputs( struct xfs_mount *mp, @@ -478,7 +464,8 @@ xfs_scrub_metadata( if (error) goto out; - xchk_experimental_warning(mp); + xfs_warn_mount(mp, XFS_OPSTATE_WARNED_SCRUB, + "EXPERIMENTAL online scrub feature in use. Use at your own risk!"); sc = kmem_zalloc(sizeof(struct xfs_scrub), KM_NOFS | KM_MAYFAIL); if (!sc) { diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 888839e75d11..d4a77c53f94b 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -149,12 +149,7 @@ xfs_growfs_data_private( error = xfs_resizefs_init_new_ags(tp, &id, oagcount, nagcount, delta, &lastag_extended); } else { - static struct ratelimit_state shrink_warning = \ - RATELIMIT_STATE_INIT("shrink_warning", 86400 * HZ, 1); - ratelimit_set_flags(&shrink_warning, RATELIMIT_MSG_ON_RELEASE); - - if (__ratelimit(&shrink_warning)) - xfs_alert(mp, + xfs_warn_mount(mp, XFS_OPSTATE_WARNED_SHRINK, "EXPERIMENTAL online shrink feature in use. Use at your own risk!"); error = xfs_ag_shrink_space(mp, &tp, nagcount - 1, -delta); diff --git a/fs/xfs/xfs_message.h b/fs/xfs/xfs_message.h index 55ee464ab59f..cc323775a12c 100644 --- a/fs/xfs/xfs_message.h +++ b/fs/xfs/xfs_message.h @@ -75,6 +75,12 @@ do { \ #define xfs_debug_ratelimited(dev, fmt, ...) \ xfs_printk_ratelimited(xfs_debug, dev, fmt, ##__VA_ARGS__) +#define xfs_warn_mount(mp, warntag, fmt, ...) \ +do { \ + if (xfs_should_warn((mp), (warntag))) \ + xfs_warn((mp), (fmt), ##__VA_ARGS__); \ +} while (0) + #define xfs_warn_once(dev, fmt, ...) \ xfs_printk_once(xfs_warn, dev, fmt, ##__VA_ARGS__) #define xfs_notice_once(dev, fmt, ...) \ diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 8c42786e4942..93a954271db2 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -391,6 +391,11 @@ __XFS_HAS_FEAT(nouuid, NOUUID) */ #define XFS_OPSTATE_BLOCKGC_ENABLED 6 +/* Kernel has logged a warning about online fsck being used on this fs. 
*/ +#define XFS_OPSTATE_WARNED_SCRUB 7 +/* Kernel has logged a warning about shrink being used on this fs. */ +#define XFS_OPSTATE_WARNED_SHRINK 8 + #define __XFS_IS_OPSTATE(name, NAME) \ static inline bool xfs_is_ ## name (struct xfs_mount *mp) \ { \ @@ -413,6 +418,12 @@ __XFS_IS_OPSTATE(readonly, READONLY) __XFS_IS_OPSTATE(inodegc_enabled, INODEGC_ENABLED) __XFS_IS_OPSTATE(blockgc_enabled, BLOCKGC_ENABLED) +static inline bool +xfs_should_warn(struct xfs_mount *mp, long nr) +{ + return !test_and_set_bit(nr, &mp->m_opstate); +} + #define XFS_OPSTATE_STRINGS \ { (1UL << XFS_OPSTATE_UNMOUNTING), "unmounting" }, \ { (1UL << XFS_OPSTATE_CLEAN), "clean" }, \ @@ -420,7 +431,9 @@ __XFS_IS_OPSTATE(blockgc_enabled, BLOCKGC_ENABLED) { (1UL << XFS_OPSTATE_INODE32), "inode32" }, \ { (1UL << XFS_OPSTATE_READONLY), "read_only" }, \ { (1UL << XFS_OPSTATE_INODEGC_ENABLED), "inodegc" }, \ - { (1UL << XFS_OPSTATE_BLOCKGC_ENABLED), "blockgc" } + { (1UL << XFS_OPSTATE_BLOCKGC_ENABLED), "blockgc" }, \ + { (1UL << XFS_OPSTATE_WARNED_SCRUB), "wscrub" }, \ + { (1UL << XFS_OPSTATE_WARNED_SHRINK), "wshrink" } /* * Max and min values for mount-option defined I/O From 202865cc215d135762ea99abda7d87925b1cfc7a Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 27 May 2022 10:32:07 +1000 Subject: [PATCH 303/334] xfs: warn about LARP once per mount Since LARP is an experimental debug-only feature, we should try to warn about it being in use once per mount, not once per reboot. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_log.c | 4 ++-- fs/xfs/xfs_mount.h | 5 ++++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 9dc748abdf33..a75f4ffc75f9 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -3910,8 +3910,8 @@ xfs_attr_use_log_assist( if (error) goto drop_incompat; - xfs_warn_once(mp, -"EXPERIMENTAL logged extended attributes feature added. Use at your own risk!"); + xfs_warn_mount(mp, XFS_OPSTATE_WARNED_LARP, + "EXPERIMENTAL logged extended attributes feature in use. Use at your own risk!"); return 0; drop_incompat: diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 93a954271db2..ba5d42abf66e 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -395,6 +395,8 @@ __XFS_HAS_FEAT(nouuid, NOUUID) #define XFS_OPSTATE_WARNED_SCRUB 7 /* Kernel has logged a warning about shrink being used on this fs. */ #define XFS_OPSTATE_WARNED_SHRINK 8 +/* Kernel has logged a warning about logged xattr updates being used. */ +#define XFS_OPSTATE_WARNED_LARP 9 #define __XFS_IS_OPSTATE(name, NAME) \ static inline bool xfs_is_ ## name (struct xfs_mount *mp) \ @@ -433,7 +435,8 @@ xfs_should_warn(struct xfs_mount *mp, long nr) { (1UL << XFS_OPSTATE_INODEGC_ENABLED), "inodegc" }, \ { (1UL << XFS_OPSTATE_BLOCKGC_ENABLED), "blockgc" }, \ { (1UL << XFS_OPSTATE_WARNED_SCRUB), "wscrub" }, \ - { (1UL << XFS_OPSTATE_WARNED_SHRINK), "wshrink" } + { (1UL << XFS_OPSTATE_WARNED_SHRINK), "wshrink" }, \ + { (1UL << XFS_OPSTATE_WARNED_LARP), "wlarp" } /* * Max and min values for mount-option defined I/O From d9c61ccb3b09d8f892cccbf662ce0c870f8e4ade Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 27 May 2022 10:33:29 +1000 Subject: [PATCH 304/334] xfs: move xfs_attr_use_log_assist out of xfs_log.c The LARP patchset added an awkward coupling point between libxfs and what would be libxlog, if the XFS log were actually its own library. 
Move the code that enables logged xattr updates out of "lib"xlog and into xfs_xattr.c so that it no longer has to know about xlog_* functions. While we're at it, give xfs_xattr.c its own header file. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_attr.c | 6 ++--- fs/xfs/xfs_log.c | 41 --------------------------------- fs/xfs/xfs_super.c | 1 + fs/xfs/xfs_super.h | 1 - fs/xfs/xfs_xattr.c | 49 ++++++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_xattr.h | 14 ++++++++++++ 6 files changed, 67 insertions(+), 45 deletions(-) create mode 100644 fs/xfs/xfs_xattr.h diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 9f14aca29ec4..24fa213715c1 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -25,7 +25,7 @@ #include "xfs_trans_space.h" #include "xfs_trace.h" #include "xfs_attr_item.h" -#include "xfs_log.h" +#include "xfs_xattr.h" struct kmem_cache *xfs_attr_intent_cache; @@ -1028,7 +1028,7 @@ xfs_attr_set( } if (use_logging) { - error = xfs_attr_use_log_assist(mp); + error = xfs_attr_grab_log_assist(mp); if (error) return error; } @@ -1102,7 +1102,7 @@ out_unlock: xfs_iunlock(dp, XFS_ILOCK_EXCL); drop_incompat: if (use_logging) - xlog_drop_incompat_feat(mp->m_log); + xfs_attr_rele_log_assist(mp); return error; out_trans_cancel: diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index a75f4ffc75f9..1e972f884a81 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -3877,44 +3877,3 @@ xlog_drop_incompat_feat( { up_read(&log->l_incompat_users); } - -/* - * Get permission to use log-assisted atomic exchange of file extents. - * - * Callers must not be running any transactions or hold any inode locks, and - * they must release the permission by calling xlog_drop_incompat_feat - * when they're done. - */ -int -xfs_attr_use_log_assist( - struct xfs_mount *mp) -{ - int error = 0; - - /* - * Protect ourselves from an idle log clearing the logged xattrs log - * incompat feature bit. - */ - xlog_use_incompat_feat(mp->m_log); - - /* - * If log-assisted xattrs are already enabled, the caller can use the - * log assisted swap functions with the log-incompat reference we got. - */ - if (xfs_sb_version_haslogxattrs(&mp->m_sb)) - return 0; - - /* Enable log-assisted xattrs. */ - error = xfs_add_incompat_log_feature(mp, - XFS_SB_FEAT_INCOMPAT_LOG_XATTRS); - if (error) - goto drop_incompat; - - xfs_warn_mount(mp, XFS_OPSTATE_WARNED_LARP, - "EXPERIMENTAL logged extended attributes feature in use. 
Use at your own risk!"); - - return 0; -drop_incompat: - xlog_drop_incompat_feat(mp->m_log); - return error; -} diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 51ce127a0cc6..a6e7b4176faf 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -39,6 +39,7 @@ #include "xfs_ag.h" #include "xfs_defer.h" #include "xfs_attr_item.h" +#include "xfs_xattr.h" #include #include diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h index 167d23f92ffe..3cd5a51bace1 100644 --- a/fs/xfs/xfs_super.h +++ b/fs/xfs/xfs_super.h @@ -91,7 +91,6 @@ extern xfs_agnumber_t xfs_set_inode_alloc(struct xfs_mount *, xfs_agnumber_t agcount); extern const struct export_operations xfs_export_operations; -extern const struct xattr_handler *xfs_xattr_handlers[]; extern const struct quotactl_ops xfs_quotactl_operations; extern void xfs_reinit_percpu_counters(struct xfs_mount *mp); diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index 7a044afd4c46..fc6acf7021a7 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -15,9 +15,58 @@ #include "xfs_da_btree.h" #include "xfs_attr.h" #include "xfs_acl.h" +#include "xfs_log.h" +#include "xfs_xattr.h" #include +/* + * Get permission to use log-assisted atomic exchange of file extents. + * + * Callers must not be running any transactions or hold any inode locks, and + * they must release the permission by calling xlog_drop_incompat_feat + * when they're done. + */ +int +xfs_attr_grab_log_assist( + struct xfs_mount *mp) +{ + int error = 0; + + /* + * Protect ourselves from an idle log clearing the logged xattrs log + * incompat feature bit. + */ + xlog_use_incompat_feat(mp->m_log); + + /* + * If log-assisted xattrs are already enabled, the caller can use the + * log assisted swap functions with the log-incompat reference we got. + */ + if (xfs_sb_version_haslogxattrs(&mp->m_sb)) + return 0; + + /* Enable log-assisted xattrs. */ + error = xfs_add_incompat_log_feature(mp, + XFS_SB_FEAT_INCOMPAT_LOG_XATTRS); + if (error) + goto drop_incompat; + + xfs_warn_mount(mp, XFS_OPSTATE_WARNED_LARP, + "EXPERIMENTAL logged extended attributes feature in use. Use at your own risk!"); + + return 0; +drop_incompat: + xlog_drop_incompat_feat(mp->m_log); + return error; +} + +void +xfs_attr_rele_log_assist( + struct xfs_mount *mp) +{ + xlog_drop_incompat_feat(mp->m_log); +} static int xfs_xattr_get(const struct xattr_handler *handler, struct dentry *unused, diff --git a/fs/xfs/xfs_xattr.h b/fs/xfs/xfs_xattr.h new file mode 100644 index 000000000000..d34ef1835541 --- /dev/null +++ b/fs/xfs/xfs_xattr.h @@ -0,0 +1,14 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + */ +#ifndef __XFS_XATTR_H__ +#define __XFS_XATTR_H__ + +int xfs_attr_grab_log_assist(struct xfs_mount *mp); +void xfs_attr_rele_log_assist(struct xfs_mount *mp); + +extern const struct xattr_handler *xfs_xattr_handlers[]; + +#endif /* __XFS_XATTR_H__ */ From efc2efeba169ff37bbd425631985064365c614e0 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 27 May 2022 10:34:04 +1000 Subject: [PATCH 305/334] xfs: move xfs_attr_use_log_assist usage out of libxfs The LARP patchset added an awkward coupling point between libxfs and what would be libxlog, if the XFS log were actually its own library. Move the code that sets up logged xattr updates out of libxfs and into xfs_xattr.c so that libxfs no longer has to know about xlog_* functions. Signed-off-by: Darrick J. 
Wong Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_attr.c | 12 +----------- fs/xfs/xfs_acl.c | 3 ++- fs/xfs/xfs_ioctl.c | 3 ++- fs/xfs/xfs_iops.c | 3 ++- fs/xfs/xfs_xattr.c | 34 +++++++++++++++++++++++++++++++--- fs/xfs/xfs_xattr.h | 3 +-- 6 files changed, 39 insertions(+), 19 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 24fa213715c1..836ab1b8ed7b 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -982,7 +982,6 @@ xfs_attr_set( int error, local; int rmt_blks = 0; unsigned int total; - bool use_logging = xfs_has_larp(mp); if (xfs_is_shutdown(dp->i_mount)) return -EIO; @@ -1027,12 +1026,6 @@ xfs_attr_set( rmt_blks = xfs_attr3_rmt_blocks(mp, XFS_XATTR_SIZE_MAX); } - if (use_logging) { - error = xfs_attr_grab_log_assist(mp); - if (error) - return error; - } - /* * Root fork attributes can use reserved data blocks for this * operation if necessary @@ -1040,7 +1033,7 @@ xfs_attr_set( xfs_init_attr_trans(args, &tres, &total); error = xfs_trans_alloc_inode(dp, &tres, total, 0, rsvd, &args->trans); if (error) - goto drop_incompat; + return error; if (args->value || xfs_inode_hasattr(dp)) { error = xfs_iext_count_may_overflow(dp, XFS_ATTR_FORK, @@ -1100,9 +1093,6 @@ xfs_attr_set( error = xfs_trans_commit(args->trans); out_unlock: xfs_iunlock(dp, XFS_ILOCK_EXCL); -drop_incompat: - if (use_logging) - xfs_attr_rele_log_assist(mp); return error; out_trans_cancel: diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index 3df9c1782ead..b744c62052b6 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -17,6 +17,7 @@ #include "xfs_error.h" #include "xfs_acl.h" #include "xfs_trans.h" +#include "xfs_xattr.h" #include @@ -202,7 +203,7 @@ __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) xfs_acl_to_disk(args.value, acl); } - error = xfs_attr_set(&args); + error = xfs_attr_change(&args); kmem_free(args.value); /* diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 0e5cb7936206..5a364a7d58fd 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -37,6 +37,7 @@ #include "xfs_health.h" #include "xfs_reflink.h" #include "xfs_ioctl.h" +#include "xfs_xattr.h" #include #include @@ -524,7 +525,7 @@ xfs_attrmulti_attr_set( args.valuelen = len; } - error = xfs_attr_set(&args); + error = xfs_attr_change(&args); if (!error && (flags & XFS_IOC_ATTR_ROOT)) xfs_forget_acl(inode, name); kfree(args.value); diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index e912b7fee714..29f5b8b8aca6 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -24,6 +24,7 @@ #include "xfs_iomap.h" #include "xfs_error.h" #include "xfs_ioctl.h" +#include "xfs_xattr.h" #include #include @@ -61,7 +62,7 @@ xfs_initxattrs( .value = xattr->value, .valuelen = xattr->value_len, }; - error = xfs_attr_set(&args); + error = xfs_attr_change(&args); if (error < 0) break; } diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index fc6acf7021a7..35e13e125ec6 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -27,7 +27,7 @@ * they must release the permission by calling xlog_drop_incompat_feat * when they're done. */ -int +static inline int xfs_attr_grab_log_assist( struct xfs_mount *mp) { @@ -61,13 +61,41 @@ drop_incompat: return error; } -void +static inline void xfs_attr_rele_log_assist( struct xfs_mount *mp) { xlog_drop_incompat_feat(mp->m_log); } +/* + * Set or remove an xattr, having grabbed the appropriate logging resources + * prior to calling libxfs. 
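The pattern behind this cleanup, as a small self-contained sketch (plain C; the structures and the INODE_I() accessor are illustrative stand-ins for the f2fs types): derive the private-info pointer once at the top of the function and use the local thereafter, instead of re-deriving it on every access.

#include <stddef.h>

struct inode { int i_dummy; };

struct inode_info {
	struct inode	vfs_inode;
	unsigned long	i_flags;
};

/* container_of-style accessor, analogous to F2FS_I(inode). */
static inline struct inode_info *INODE_I(struct inode *inode)
{
	return (struct inode_info *)((char *)inode -
			offsetof(struct inode_info, vfs_inode));
}

static void set_info_flag(struct inode *inode, unsigned long bit)
{
	struct inode_info *fi = INODE_I(inode);	/* derive once */

	/* ...every later use goes through 'fi', not INODE_I(inode)... */
	fi->i_flags |= bit;
}

The compiler usually folds the repeated accessor calls anyway; the win is shorter lines and one obvious place to see which inode the function operates on.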
+ */ +int +xfs_attr_change( + struct xfs_da_args *args) +{ + struct xfs_mount *mp = args->dp->i_mount; + bool use_logging = false; + int error; + + if (xfs_has_larp(mp)) { + error = xfs_attr_grab_log_assist(mp); + if (error) + return error; + + use_logging = true; + } + + error = xfs_attr_set(args); + + if (use_logging) + xfs_attr_rele_log_assist(mp); + return error; +} + + static int xfs_xattr_get(const struct xattr_handler *handler, struct dentry *unused, struct inode *inode, const char *name, void *value, size_t size) @@ -105,7 +133,7 @@ xfs_xattr_set(const struct xattr_handler *handler, }; int error; - error = xfs_attr_set(&args); + error = xfs_attr_change(&args); if (!error && (handler->flags & XFS_ATTR_ROOT)) xfs_forget_acl(inode, name); return error; diff --git a/fs/xfs/xfs_xattr.h b/fs/xfs/xfs_xattr.h index d34ef1835541..2b09133b1b9b 100644 --- a/fs/xfs/xfs_xattr.h +++ b/fs/xfs/xfs_xattr.h @@ -6,8 +6,7 @@ #ifndef __XFS_XATTR_H__ #define __XFS_XATTR_H__ -int xfs_attr_grab_log_assist(struct xfs_mount *mp); -void xfs_attr_rele_log_assist(struct xfs_mount *mp); +int xfs_attr_change(struct xfs_da_args *args); extern const struct xattr_handler *xfs_xattr_handlers[]; From 809631e2bff5aba056df75cc4e43127dbd0278c9 Mon Sep 17 00:00:00 2001 From: John Ogness Date: Thu, 26 May 2022 22:36:56 +0206 Subject: [PATCH 306/334] Revert "printk: wake up all waiters" This reverts commit 938ba4084abcf6fdd21d9078513c52f8fb9b00d0. The wait queue @log_wait never has exclusive waiters, so there is no need to use wake_up_interruptible_all(). Using wake_up_interruptible() was the correct function to wake all waiters. Since there are no exclusive waiters, erroneously changing wake_up_interruptible() to wake_up_interruptible_all() did not result in any behavior change. However, using wake_up_interruptible_all() on a wait queue without exclusive waiters is fundamentally wrong. Go back to using wake_up_interruptible() to wake all waiters. Signed-off-by: John Ogness Acked-by: Steven Rostedt (Google) Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20220526203056.81123-1-john.ogness@linutronix.de --- kernel/printk/printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index a3e1035929b0..ea3dd55709e7 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -3904,7 +3904,7 @@ static void wake_up_klogd_work_func(struct irq_work *irq_work) } if (pending & PRINTK_PENDING_WAKEUP) - wake_up_interruptible_all(&log_wait); + wake_up_interruptible(&log_wait); } static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = From 054cb2891b9cc21bf08fb6380e638c056748098f Mon Sep 17 00:00:00 2001 From: Yufen Yu Date: Thu, 26 May 2022 10:21:06 +0800 Subject: [PATCH 307/334] f2fs: replace F2FS_I(inode) and sbi by the local variable We have define 'fi' at the begin of the functions, just use it, rather than use F2FS_I(inode) again. 
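The refactor here is plain hoisting: when the same conversion helper is applied to the same inode repeatedly inside one function, resolve it once into the local that already exists and reuse it. The fragment below is not f2fs code; it is a minimal userspace model with invented types that only shows the before/after shape of the change, which is about readability rather than behaviour since the real F2FS_I() is a trivial inline.

/* Illustrative only -- invented stand-in types, not the f2fs definitions. */
#include <stdio.h>

struct f2fs_inode_info {
        int i_cluster_size;
        int i_projid;
};

struct inode {
        struct f2fs_inode_info info;
};

/* Stand-in for the real container_of()-based F2FS_I() helper. */
static struct f2fs_inode_info *F2FS_I(struct inode *inode)
{
        return &inode->info;
}

/* Before: the helper is spelled out on every use. */
static int sum_before(struct inode *inode)
{
        return F2FS_I(inode)->i_cluster_size + F2FS_I(inode)->i_projid;
}

/* After: resolve it once, as the patch does with the existing 'fi'. */
static int sum_after(struct inode *inode)
{
        struct f2fs_inode_info *fi = F2FS_I(inode);

        return fi->i_cluster_size + fi->i_projid;
}

int main(void)
{
        struct inode inode = { { 16, 1000 } };

        printf("%d %d\n", sum_before(&inode), sum_after(&inode));
        return 0;
}

Both helpers return the same value; the gain in the real diffs below is shorter lines and one obvious place where the inode-private data is obtained.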
Signed-off-by: Yufen Yu Reviewed-by: Chao Yu [Jaegeuk Kim: replace sbi] Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- fs/f2fs/file.c | 22 +++++++++++----------- fs/f2fs/inode.c | 12 ++++++------ 3 files changed, 18 insertions(+), 18 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index c73014901006..e10838879538 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -4403,8 +4403,8 @@ static inline bool f2fs_may_compress(struct inode *inode) static inline void f2fs_i_compr_blocks_update(struct inode *inode, u64 blocks, bool add) { - int diff = F2FS_I(inode)->i_cluster_size - blocks; struct f2fs_inode_info *fi = F2FS_I(inode); + int diff = fi->i_cluster_size - blocks; /* don't update i_compr_blocks if saved blocks were released */ if (!add && !atomic_read(&fi->i_compr_blocks)) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index a05d842a7e72..0297a4431540 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2032,25 +2032,25 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) if (ret) goto out; - f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_down_write(&fi->i_gc_rwsem[WRITE]); /* * Should wait end_io to count F2FS_WB_CP_DATA correctly by * f2fs_is_atomic_file. */ if (get_dirty_pages(inode)) - f2fs_warn(F2FS_I_SB(inode), "Unexpected flush for atomic writes: ino=%lu, npages=%u", + f2fs_warn(sbi, "Unexpected flush for atomic writes: ino=%lu, npages=%u", inode->i_ino, get_dirty_pages(inode)); ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); if (ret) { - f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_up_write(&fi->i_gc_rwsem[WRITE]); goto out; } /* Create a COW inode for atomic write */ pinode = f2fs_iget(inode->i_sb, fi->i_pino); if (IS_ERR(pinode)) { - f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_up_write(&fi->i_gc_rwsem[WRITE]); ret = PTR_ERR(pinode); goto out; } @@ -2058,7 +2058,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) ret = f2fs_get_tmpfile(mnt_userns, pinode, &fi->cow_inode); iput(pinode); if (ret) { - f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_up_write(&fi->i_gc_rwsem[WRITE]); goto out; } f2fs_i_size_write(fi->cow_inode, i_size_read(inode)); @@ -2070,10 +2070,10 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) set_inode_flag(inode, FI_ATOMIC_FILE); set_inode_flag(fi->cow_inode, FI_ATOMIC_FILE); clear_inode_flag(fi->cow_inode, FI_INLINE_DATA); - f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_up_write(&fi->i_gc_rwsem[WRITE]); - f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); - F2FS_I(inode)->atomic_write_task = current; + f2fs_update_time(sbi, REQ_TIME); + fi->atomic_write_task = current; stat_update_max_atomic_write(inode); out: inode_unlock(inode); @@ -2952,7 +2952,7 @@ static int f2fs_ioc_setproject(struct inode *inode, __u32 projid) kprojid = make_kprojid(&init_user_ns, (projid_t)projid); - if (projid_eq(kprojid, F2FS_I(inode)->i_projid)) + if (projid_eq(kprojid, fi->i_projid)) return 0; err = -EPERM; @@ -2972,7 +2972,7 @@ static int f2fs_ioc_setproject(struct inode *inode, __u32 projid) if (err) goto out_unlock; - F2FS_I(inode)->i_projid = kprojid; + fi->i_projid = kprojid; inode->i_ctime = current_time(inode); f2fs_mark_inode_dirty_sync(inode, true); out_unlock: @@ -3922,7 +3922,7 @@ static int f2fs_ioc_decompress_file(struct file *filp, unsigned long arg) struct f2fs_inode_info *fi = F2FS_I(inode); pgoff_t page_idx = 0, last_idx; unsigned int blk_per_seg = sbi->blocks_per_seg; - int cluster_size = F2FS_I(inode)->i_cluster_size; + int cluster_size = 
fi->i_cluster_size; int count, ret; if (!f2fs_sb_has_compression(sbi) || diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 938961a9084e..fc55f5bd1fcc 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -260,8 +260,8 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) return false; } - if (F2FS_I(inode)->extent_tree) { - struct extent_info *ei = &F2FS_I(inode)->extent_tree->largest; + if (fi->extent_tree) { + struct extent_info *ei = &fi->extent_tree->largest; if (ei->len && (!f2fs_is_valid_blkaddr(sbi, ei->blk, @@ -465,10 +465,10 @@ static int do_read_inode(struct inode *inode) } } - F2FS_I(inode)->i_disk_time[0] = inode->i_atime; - F2FS_I(inode)->i_disk_time[1] = inode->i_ctime; - F2FS_I(inode)->i_disk_time[2] = inode->i_mtime; - F2FS_I(inode)->i_disk_time[3] = F2FS_I(inode)->i_crtime; + fi->i_disk_time[0] = inode->i_atime; + fi->i_disk_time[1] = inode->i_ctime; + fi->i_disk_time[2] = inode->i_mtime; + fi->i_disk_time[3] = fi->i_crtime; f2fs_put_page(node_page, 1); stat_inc_inline_xattr(inode); From 2d1fe8a86bf5e0663866fd0da83c2af1e1b0e362 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 27 May 2022 12:13:30 +0800 Subject: [PATCH 308/334] f2fs: fix to tag gcing flag on page during file defragment In order to garantee migrated data be persisted during checkpoint, otherwise out-of-order persistency between data and node may cause data corruption after SPOR. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 0297a4431540..35e99d197adb 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2602,6 +2602,7 @@ do_map: } set_page_dirty(page); + set_page_private_gcing(page); f2fs_put_page(page, 1); idx++; From 621433b7e25d6d42e5f75bd8c4a62d6c7251511b Mon Sep 17 00:00:00 2001 From: Hyunchul Lee Date: Fri, 27 May 2022 08:50:54 +0900 Subject: [PATCH 309/334] ksmbd: smbd: relax the count of sges required Remove the condition that the count of sges must be greater than or equal to SMB_DIRECT_MAX_SEND_SGES(8). Because ksmbd needs sges only for SMB direct header, SMB2 transform header, SMB2 response, and optional payload. Signed-off-by: Hyunchul Lee Acked-by: Namjae Jeon Reviewed-by: Tom Talpey Signed-off-by: Steve French --- fs/ksmbd/transport_rdma.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/fs/ksmbd/transport_rdma.c b/fs/ksmbd/transport_rdma.c index e91acc2746bc..d035e060c2f0 100644 --- a/fs/ksmbd/transport_rdma.c +++ b/fs/ksmbd/transport_rdma.c @@ -1711,11 +1711,11 @@ static int smb_direct_init_params(struct smb_direct_transport *t, int max_send_sges, max_rw_wrs, max_send_wrs; unsigned int max_sge_per_wr, wrs_per_credit; - /* need 2 more sge. because a SMB_DIRECT header will be mapped, - * and maybe a send buffer could be not page aligned. + /* need 3 more sge. because a SMB_DIRECT header, SMB2 header, + * SMB2 response could be mapped. 
*/ t->max_send_size = smb_direct_max_send_size; - max_send_sges = DIV_ROUND_UP(t->max_send_size, PAGE_SIZE) + 2; + max_send_sges = DIV_ROUND_UP(t->max_send_size, PAGE_SIZE) + 3; if (max_send_sges > SMB_DIRECT_MAX_SEND_SGES) { pr_err("max_send_size %d is too large\n", t->max_send_size); return -EINVAL; @@ -1736,6 +1736,8 @@ static int smb_direct_init_params(struct smb_direct_transport *t, max_sge_per_wr = min_t(unsigned int, device->attrs.max_send_sge, device->attrs.max_sge_rd); + max_sge_per_wr = max_t(unsigned int, max_sge_per_wr, + max_send_sges); wrs_per_credit = max_t(unsigned int, 4, DIV_ROUND_UP(t->pages_per_rw_credit, max_sge_per_wr) + 1); @@ -1760,11 +1762,6 @@ static int smb_direct_init_params(struct smb_direct_transport *t, return -EINVAL; } - if (device->attrs.max_send_sge < SMB_DIRECT_MAX_SEND_SGES) { - pr_err("warning: device max_send_sge = %d too small\n", - device->attrs.max_send_sge); - return -EINVAL; - } if (device->attrs.max_recv_sge < SMB_DIRECT_MAX_RECV_SGES) { pr_err("warning: device max_recv_sge = %d too small\n", device->attrs.max_recv_sge); From 2ef4bb24ff39ae4af89b80bcc5d516f55368e8ae Mon Sep 17 00:00:00 2001 From: Minghao Chi Date: Wed, 9 Mar 2022 05:37:32 +0000 Subject: [PATCH 310/334] pcmcia: Use platform_get_irq() to get the interrupt It is not recommened to use platform_get_resource(pdev, IORESOURCE_IRQ) for requesting IRQ's resources any more, as they can be not ready yet in case of DT-booting. platform_get_irq() instead is a recommended way for getting IRQ even if it was not retrieved earlier. It also makes code simpler because we're getting "int" value right away and no conversion from resource to int is required. Reported-by: Zeal Robot Signed-off-by: Minghao Chi Signed-off-by: Dominik Brodowski --- drivers/pcmcia/bcm63xx_pcmcia.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/pcmcia/bcm63xx_pcmcia.c b/drivers/pcmcia/bcm63xx_pcmcia.c index 16f573173471..bb06311d0b5f 100644 --- a/drivers/pcmcia/bcm63xx_pcmcia.c +++ b/drivers/pcmcia/bcm63xx_pcmcia.c @@ -327,10 +327,11 @@ static int bcm63xx_drv_pcmcia_probe(struct platform_device *pdev) { struct bcm63xx_pcmcia_socket *skt; struct pcmcia_socket *sock; - struct resource *res, *irq_res; + struct resource *res; unsigned int regmem_size = 0, iomem_size = 0; u32 val; int ret; + int irq; skt = kzalloc(sizeof(*skt), GFP_KERNEL); if (!skt) @@ -342,9 +343,9 @@ static int bcm63xx_drv_pcmcia_probe(struct platform_device *pdev) /* make sure we have all resources we need */ skt->common_res = platform_get_resource(pdev, IORESOURCE_MEM, 1); skt->attr_res = platform_get_resource(pdev, IORESOURCE_MEM, 2); - irq_res = platform_get_resource(pdev, IORESOURCE_IRQ, 0); + irq = platform_get_irq(pdev, 0); skt->pd = pdev->dev.platform_data; - if (!skt->common_res || !skt->attr_res || !irq_res || !skt->pd) { + if (!skt->common_res || !skt->attr_res || (irq < 0) || !skt->pd) { ret = -EINVAL; goto err; } @@ -380,7 +381,7 @@ static int bcm63xx_drv_pcmcia_probe(struct platform_device *pdev) sock->dev.parent = &pdev->dev; sock->features = SS_CAP_STATIC_MAP | SS_CAP_PCCARD; sock->io_offset = (unsigned long)skt->io_base; - sock->pci_irq = irq_res->start; + sock->pci_irq = irq; #ifdef CONFIG_CARDBUS sock->cb_dev = bcm63xx_cb_dev; From 0130e4e8e49f9bd0342d3fc14102470ea9e7230e Mon Sep 17 00:00:00 2001 From: Jeffle Xu Date: Thu, 26 May 2022 09:03:44 +0800 Subject: [PATCH 311/334] erofs: leave compressed inodes unsupported in fscache mode for now erofs over fscache doesn't support the compressed layout yet. 
It will cause NULL crash if there are compressed inodes contained when working in fscache mode. So far in the erofs based container image distribution scenarios (RAFS v6), the compressed RAFS v6 images are downloaded and then decompressed on demand as an uncompressed erofs image. Then the erofs image is mounted in fscache mode for containers to use. IOWs, currently compressed data is decompressed on the userspace side instead and uncompressed erofs images will be finally cached. The fscache support for the compressed layout is still under development and it will be used for runtime decompression feature. Anyway, to avoid the potential crash, let's leave the compressed inodes unsupported in fscache mode until we support it later. Fixes: 1442b02b66ad ("erofs: implement fscache-based data read for non-inline layout") Signed-off-by: Jeffle Xu Reviewed-by: Gao Xiang Reviewed-by: Chao Yu Link: https://lore.kernel.org/r/20220526010344.118493-1-jefflexu@linux.alibaba.com Signed-off-by: Gao Xiang --- fs/erofs/inode.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index bcc8335b46b3..95a403720e8c 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -288,7 +288,10 @@ static int erofs_fill_inode(struct inode *inode, int isdir) } if (erofs_inode_is_data_compressed(vi->datalayout)) { - err = z_erofs_fill_inode(inode); + if (!erofs_is_fscache_mode(inode->i_sb)) + err = z_erofs_fill_inode(inode); + else + err = -EOPNOTSUPP; goto out_unlock; } inode->i_mapping->a_ops = &erofs_raw_access_aops; From b5cb79dcfd03e7bb8054d38eaaa557d07966a811 Mon Sep 17 00:00:00 2001 From: Xin Yin Date: Fri, 27 May 2022 18:18:00 +0800 Subject: [PATCH 312/334] erofs: fix crash when enable tracepoint cachefiles_prep_read RIP: 0010:trace_event_raw_event_cachefiles_prep_read+0x88/0xe0 [cachefiles] Call Trace: cachefiles_prepare_read+0x1d7/0x3a0 [cachefiles] erofs_fscache_read_folios+0x188/0x220 [erofs] erofs_fscache_meta_readpage+0x106/0x160 [erofs] do_read_cache_folio+0x42a/0x590 ? bdi_register_va.part.14+0x1a7/0x210 ? super_setup_bdi_name+0x76/0xe0 erofs_bread+0x5b/0x170 [erofs] erofs_fc_fill_super+0x12b/0xc50 [erofs] This tracepoint uses rreq->inode, should set it when allocating. Fixes: d435d53228dd ("erofs: change to use asynchronous io for fscache readpage/readahead") Signed-off-by: Xin Yin Reviewed-by: Jeffle Xu Reviewed-by: Chao Yu Link: https://lore.kernel.org/r/20220527101800.22360-1-yinxin.x@bytedance.com Signed-off-by: Gao Xiang --- fs/erofs/fscache.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c index 7e4417167d0b..355259252943 100644 --- a/fs/erofs/fscache.c +++ b/fs/erofs/fscache.c @@ -17,6 +17,7 @@ static struct netfs_io_request *erofs_fscache_alloc_request(struct address_space rreq->start = start; rreq->len = len; rreq->mapping = mapping; + rreq->inode = mapping->host; INIT_LIST_HEAD(&rreq->subrequests); refcount_set(&rreq->ref, 1); return rreq; From 6e95d0a01899ed176b3450db057c3c0a9609cf47 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Fri, 27 May 2022 15:01:33 +0800 Subject: [PATCH 313/334] erofs: update documentation - refine the filesystem overview for better description of recent new features like FSDAX and Fscache; - add the new `fsid' mount option; - fix some typos. 
Link: https://lore.kernel.org/r/20220527070133.77962-1-hsiangkao@linux.alibaba.com Reviewed-by: Chao Yu Signed-off-by: Gao Xiang --- Documentation/filesystems/erofs.rst | 64 +++++++++++++++++++---------- 1 file changed, 42 insertions(+), 22 deletions(-) diff --git a/Documentation/filesystems/erofs.rst b/Documentation/filesystems/erofs.rst index bef6d3040ce4..05e03d54af1a 100644 --- a/Documentation/filesystems/erofs.rst +++ b/Documentation/filesystems/erofs.rst @@ -1,63 +1,82 @@ .. SPDX-License-Identifier: GPL-2.0 ====================================== -Enhanced Read-Only File System - EROFS +EROFS - Enhanced Read-Only File System ====================================== Overview ======== -EROFS file-system stands for Enhanced Read-Only File System. Different -from other read-only file systems, it aims to be designed for flexibility, -scalability, but be kept simple and high performance. +EROFS filesystem stands for Enhanced Read-Only File System. It aims to form a +generic read-only filesystem solution for various read-only use cases instead +of just focusing on storage space saving without considering any side effects +of runtime performance. -It is designed as a better filesystem solution for the following scenarios: +It is designed to meet the needs of flexibility, feature extendability and user +payload friendly, etc. Apart from those, it is still kept as a simple +random-access friendly high-performance filesystem to get rid of unneeded I/O +amplification and memory-resident overhead compared to similar approaches. + +It is implemented to be a better choice for the following scenarios: - read-only storage media or - part of a fully trusted read-only solution, which means it needs to be immutable and bit-for-bit identical to the official golden image for - their releases due to security and other considerations and + their releases due to security or other considerations and - hope to minimize extra storage space with guaranteed end-to-end performance by using compact layout, transparent file compression and direct access, especially for those embedded devices with limited memory and high-density - hosts with numerous containers; + hosts with numerous containers. Here is the main features of EROFS: - Little endian on-disk design; - - Currently 4KB block size (nobh) and therefore maximum 16TB address space; + - 4KiB block size and 32-bit block addresses, therefore 16TiB address space + at most for now; - - Metadata & data could be mixed by design; + - Two inode layouts for different requirements: - - 2 inode versions for different requirements: - - ===================== ============ ===================================== + ===================== ============ ====================================== compact (v1) extended (v2) - ===================== ============ ===================================== + ===================== ============ ====================================== Inode metadata size 32 bytes 64 bytes - Max file size 4 GB 16 EB (also limited by max. vol size) + Max file size 4 GiB 16 EiB (also limited by max. 
vol size) Max uids/gids 65536 4294967296 Per-inode timestamp no yes (64 + 32-bit timestamp) Max hardlinks 65536 4294967296 - Metadata reserved 4 bytes 14 bytes - ===================== ============ ===================================== + Metadata reserved 8 bytes 18 bytes + ===================== ============ ====================================== + + - Metadata and data could be mixed as an option; - Support extended attributes (xattrs) as an option; - - Support xattr inline and tail-end data inline for all files; + - Support tailpacking data and xattr inline compared to byte-addressed + unaligned metadata or smaller block size alternatives; - Support POSIX.1e ACLs by using xattrs; - Support transparent data compression as an option: - LZ4 algorithm with the fixed-sized output compression for high performance; + LZ4 and MicroLZMA algorithms can be used on a per-file basis; In addition, + inplace decompression is also supported to avoid bounce compressed buffers + and page cache thrashing. - - Multiple device support for multi-layer container images. + - Support direct I/O on uncompressed files to avoid double caching for loop + devices; + + - Support FSDAX on uncompressed images for secure containers and ramdisks in + order to get rid of unnecessary page cache. + + - Support multiple devices for multi blob container images; + + - Support file-based on-demand loading with the Fscache infrastructure. The following git tree provides the file system user-space tools under -development (ex, formatting tool mkfs.erofs): +development, such as a formatting tool (mkfs.erofs), an on-disk consistency & +compatibility checking tool (fsck.erofs), and a debugging tool (dump.erofs): - git://git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs-utils.git @@ -91,6 +110,7 @@ dax={always,never} Use direct access (no page cache). See Documentation/filesystems/dax.rst. dax A legacy option which is an alias for ``dax=always``. device=%s Specify a path to an extra device to be used together. +fsid=%s Specify a filesystem image ID for Fscache back-end. =================== ========================================================= Sysfs Entries @@ -226,8 +246,8 @@ Note that apart from the offset of the first filename, nameoff0 also indicates the total number of directory entries in this block since it is no need to introduce another on-disk field at all. -Chunk-based file ----------------- +Chunk-based files +----------------- In order to support chunk-based data deduplication, a new inode data layout has been supported since Linux v5.15: Files are split in equal-sized data chunks with ``extents`` area of the inode metadata indicating how to get the chunk From 6f5097e3367a7c0751e165e4c15bc30511a4ba38 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Mon, 30 May 2022 10:56:33 +1000 Subject: [PATCH 314/334] xfs: fix xfs_ifree() error handling to not leak perag ref For some reason commit 9a5280b312e2e ("xfs: reorder iunlink remove operation in xfs_ifree") replaced a jump to the exit path in the event of an xfs_difree() error with a direct return, which skips releasing the perag reference acquired at the top of the function. Restore the original code to drop the reference on error. Fixes: 9a5280b312e2e ("xfs: reorder iunlink remove operation in xfs_ifree") Signed-off-by: Brian Foster Reviewed-by: Darrick J. 
Wong Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index b2879870a17e..52d6f2c7d58b 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2622,7 +2622,7 @@ xfs_ifree( */ error = xfs_difree(tp, pag, ip->i_ino, &xic); if (error) - return error; + goto out; error = xfs_iunlink_remove(tp, pag, ip); if (error) From 87ca34a7065db66adbbe882a2be6b04127c26a87 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Sun, 29 May 2022 13:54:23 +0800 Subject: [PATCH 315/334] erofs: get rid of `struct z_erofs_collection' It was incompletely introduced for deduplication between different logical extents backed with the same pcluster. We will have a better in-memory representation in the next release cycle for this, as well as partial memory folios support. So get rid of it instead. No logic changes. Link: https://lore.kernel.org/r/20220529055425.226363-2-xiang@kernel.org Acked-by: Chao Yu Signed-off-by: Gao Xiang --- fs/erofs/zdata.c | 111 ++++++++++++++++++----------------------------- fs/erofs/zdata.h | 50 ++++++++++----------- 2 files changed, 65 insertions(+), 96 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index e6dea6dfca16..4fd66a66c5f9 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -199,7 +199,6 @@ struct z_erofs_decompress_frontend { struct z_erofs_pagevec_ctor vector; struct z_erofs_pcluster *pcl, *tailpcl; - struct z_erofs_collection *cl; /* a pointer used to pick up inplace I/O pages */ struct page **icpage_ptr; z_erofs_next_pcluster_t owned_head; @@ -357,7 +356,7 @@ static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe, return false; } -/* callers must be with collection lock held */ +/* callers must be with pcluster lock held */ static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe, struct page *page, enum z_erofs_page_type type, bool pvec_safereuse) @@ -372,7 +371,7 @@ static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe, ret = z_erofs_pagevec_enqueue(&fe->vector, page, type, pvec_safereuse); - fe->cl->vcnt += (unsigned int)ret; + fe->pcl->vcnt += (unsigned int)ret; return ret ? 
0 : -EAGAIN; } @@ -405,12 +404,11 @@ static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f) f->mode = COLLECT_PRIMARY; } -static int z_erofs_lookup_collection(struct z_erofs_decompress_frontend *fe, - struct inode *inode, - struct erofs_map_blocks *map) +static int z_erofs_lookup_pcluster(struct z_erofs_decompress_frontend *fe, + struct inode *inode, + struct erofs_map_blocks *map) { struct z_erofs_pcluster *pcl = fe->pcl; - struct z_erofs_collection *cl; unsigned int length; /* to avoid unexpected loop formed by corrupted images */ @@ -419,8 +417,7 @@ static int z_erofs_lookup_collection(struct z_erofs_decompress_frontend *fe, return -EFSCORRUPTED; } - cl = z_erofs_primarycollection(pcl); - if (cl->pageofs != (map->m_la & ~PAGE_MASK)) { + if (pcl->pageofs_out != (map->m_la & ~PAGE_MASK)) { DBG_BUGON(1); return -EFSCORRUPTED; } @@ -443,23 +440,21 @@ static int z_erofs_lookup_collection(struct z_erofs_decompress_frontend *fe, length = READ_ONCE(pcl->length); } } - mutex_lock(&cl->lock); + mutex_lock(&pcl->lock); /* used to check tail merging loop due to corrupted images */ if (fe->owned_head == Z_EROFS_PCLUSTER_TAIL) fe->tailpcl = pcl; z_erofs_try_to_claim_pcluster(fe); - fe->cl = cl; return 0; } -static int z_erofs_register_collection(struct z_erofs_decompress_frontend *fe, - struct inode *inode, - struct erofs_map_blocks *map) +static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe, + struct inode *inode, + struct erofs_map_blocks *map) { bool ztailpacking = map->m_flags & EROFS_MAP_META; struct z_erofs_pcluster *pcl; - struct z_erofs_collection *cl; struct erofs_workgroup *grp; int err; @@ -482,17 +477,15 @@ static int z_erofs_register_collection(struct z_erofs_decompress_frontend *fe, /* new pclusters should be claimed as type 1, primary and followed */ pcl->next = fe->owned_head; + pcl->pageofs_out = map->m_la & ~PAGE_MASK; fe->mode = COLLECT_PRIMARY_FOLLOWED; - cl = z_erofs_primarycollection(pcl); - cl->pageofs = map->m_la & ~PAGE_MASK; - /* * lock all primary followed works before visible to others * and mutex_trylock *never* fails for a new pcluster. 
*/ - mutex_init(&cl->lock); - DBG_BUGON(!mutex_trylock(&cl->lock)); + mutex_init(&pcl->lock); + DBG_BUGON(!mutex_trylock(&pcl->lock)); if (ztailpacking) { pcl->obj.index = 0; /* which indicates ztailpacking */ @@ -519,11 +512,10 @@ static int z_erofs_register_collection(struct z_erofs_decompress_frontend *fe, fe->tailpcl = pcl; fe->owned_head = &pcl->next; fe->pcl = pcl; - fe->cl = cl; return 0; err_out: - mutex_unlock(&cl->lock); + mutex_unlock(&pcl->lock); z_erofs_free_pcluster(pcl); return err; } @@ -535,9 +527,9 @@ static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe, struct erofs_workgroup *grp; int ret; - DBG_BUGON(fe->cl); + DBG_BUGON(fe->pcl); - /* must be Z_EROFS_PCLUSTER_TAIL or pointed to previous collection */ + /* must be Z_EROFS_PCLUSTER_TAIL or pointed to previous pcluster */ DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_NIL); DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED); @@ -554,14 +546,14 @@ static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe, fe->pcl = container_of(grp, struct z_erofs_pcluster, obj); } else { tailpacking: - ret = z_erofs_register_collection(fe, inode, map); + ret = z_erofs_register_pcluster(fe, inode, map); if (!ret) goto out; if (ret != -EEXIST) return ret; } - ret = z_erofs_lookup_collection(fe, inode, map); + ret = z_erofs_lookup_pcluster(fe, inode, map); if (ret) { erofs_workgroup_put(&fe->pcl->obj); return ret; @@ -569,7 +561,7 @@ tailpacking: out: z_erofs_pagevec_ctor_init(&fe->vector, Z_EROFS_NR_INLINE_PAGEVECS, - fe->cl->pagevec, fe->cl->vcnt); + fe->pcl->pagevec, fe->pcl->vcnt); /* since file-backed online pages are traversed in reverse order */ fe->icpage_ptr = fe->pcl->compressed_pages + z_erofs_pclusterpages(fe->pcl); @@ -582,48 +574,36 @@ out: */ static void z_erofs_rcu_callback(struct rcu_head *head) { - struct z_erofs_collection *const cl = - container_of(head, struct z_erofs_collection, rcu); - - z_erofs_free_pcluster(container_of(cl, struct z_erofs_pcluster, - primary_collection)); + z_erofs_free_pcluster(container_of(head, + struct z_erofs_pcluster, rcu)); } void erofs_workgroup_free_rcu(struct erofs_workgroup *grp) { struct z_erofs_pcluster *const pcl = container_of(grp, struct z_erofs_pcluster, obj); - struct z_erofs_collection *const cl = z_erofs_primarycollection(pcl); - call_rcu(&cl->rcu, z_erofs_rcu_callback); -} - -static void z_erofs_collection_put(struct z_erofs_collection *cl) -{ - struct z_erofs_pcluster *const pcl = - container_of(cl, struct z_erofs_pcluster, primary_collection); - - erofs_workgroup_put(&pcl->obj); + call_rcu(&pcl->rcu, z_erofs_rcu_callback); } static bool z_erofs_collector_end(struct z_erofs_decompress_frontend *fe) { - struct z_erofs_collection *cl = fe->cl; + struct z_erofs_pcluster *pcl = fe->pcl; - if (!cl) + if (!pcl) return false; z_erofs_pagevec_ctor_exit(&fe->vector, false); - mutex_unlock(&cl->lock); + mutex_unlock(&pcl->lock); /* * if all pending pages are added, don't hold its reference * any longer if the pcluster isn't hosted by ourselves. 
*/ if (fe->mode < COLLECT_PRIMARY_FOLLOWED_NOINPLACE) - z_erofs_collection_put(cl); + erofs_workgroup_put(&pcl->obj); - fe->cl = NULL; + fe->pcl = NULL; return true; } @@ -666,8 +646,8 @@ repeat: /* lucky, within the range of the current map_blocks */ if (offset + cur >= map->m_la && offset + cur < map->m_la + map->m_llen) { - /* didn't get a valid collection previously (very rare) */ - if (!fe->cl) + /* didn't get a valid pcluster previously (very rare) */ + if (!fe->pcl) goto restart_now; goto hitted; } @@ -766,7 +746,7 @@ retry: /* bump up the number of spiltted parts of a page */ ++spiltted; /* also update nr_pages */ - fe->cl->nr_pages = max_t(pgoff_t, fe->cl->nr_pages, index + 1); + fe->pcl->nr_pages = max_t(pgoff_t, fe->pcl->nr_pages, index + 1); next_part: /* can be used for verification */ map->m_llen = offset + cur - map->m_la; @@ -821,15 +801,13 @@ static int z_erofs_decompress_pcluster(struct super_block *sb, enum z_erofs_page_type page_type; bool overlapped, partial; - struct z_erofs_collection *cl; int err; might_sleep(); - cl = z_erofs_primarycollection(pcl); - DBG_BUGON(!READ_ONCE(cl->nr_pages)); + DBG_BUGON(!READ_ONCE(pcl->nr_pages)); - mutex_lock(&cl->lock); - nr_pages = cl->nr_pages; + mutex_lock(&pcl->lock); + nr_pages = pcl->nr_pages; if (nr_pages <= Z_EROFS_VMAP_ONSTACK_PAGES) { pages = pages_onstack; @@ -857,9 +835,9 @@ static int z_erofs_decompress_pcluster(struct super_block *sb, err = 0; z_erofs_pagevec_ctor_init(&ctor, Z_EROFS_NR_INLINE_PAGEVECS, - cl->pagevec, 0); + pcl->pagevec, 0); - for (i = 0; i < cl->vcnt; ++i) { + for (i = 0; i < pcl->vcnt; ++i) { unsigned int pagenr; page = z_erofs_pagevec_dequeue(&ctor, &page_type); @@ -945,11 +923,11 @@ static int z_erofs_decompress_pcluster(struct super_block *sb, goto out; llen = pcl->length >> Z_EROFS_PCLUSTER_LENGTH_BIT; - if (nr_pages << PAGE_SHIFT >= cl->pageofs + llen) { + if (nr_pages << PAGE_SHIFT >= pcl->pageofs_out + llen) { outputsize = llen; partial = !(pcl->length & Z_EROFS_PCLUSTER_FULL_LENGTH); } else { - outputsize = (nr_pages << PAGE_SHIFT) - cl->pageofs; + outputsize = (nr_pages << PAGE_SHIFT) - pcl->pageofs_out; partial = true; } @@ -963,7 +941,7 @@ static int z_erofs_decompress_pcluster(struct super_block *sb, .in = compressed_pages, .out = pages, .pageofs_in = pcl->pageofs_in, - .pageofs_out = cl->pageofs, + .pageofs_out = pcl->pageofs_out, .inputsize = inputsize, .outputsize = outputsize, .alg = pcl->algorithmformat, @@ -1012,16 +990,12 @@ out: else if (pages != pages_onstack) kvfree(pages); - cl->nr_pages = 0; - cl->vcnt = 0; + pcl->nr_pages = 0; + pcl->vcnt = 0; - /* all cl locks MUST be taken before the following line */ + /* pcluster lock MUST be taken before the following line */ WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_NIL); - - /* all cl locks SHOULD be released right now */ - mutex_unlock(&cl->lock); - - z_erofs_collection_put(cl); + mutex_unlock(&pcl->lock); return err; } @@ -1043,6 +1017,7 @@ static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io, owned = READ_ONCE(pcl->next); z_erofs_decompress_pcluster(io->sb, pcl, pagepool); + erofs_workgroup_put(&pcl->obj); } } diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h index 800b11c53f57..58053bb5066f 100644 --- a/fs/erofs/zdata.h +++ b/fs/erofs/zdata.h @@ -12,21 +12,40 @@ #define Z_EROFS_PCLUSTER_MAX_PAGES (Z_EROFS_PCLUSTER_MAX_SIZE / PAGE_SIZE) #define Z_EROFS_NR_INLINE_PAGEVECS 3 +#define Z_EROFS_PCLUSTER_FULL_LENGTH 0x00000001 +#define Z_EROFS_PCLUSTER_LENGTH_BIT 1 + +/* + * let's leave a type here in case of 
introducing + * another tagged pointer later. + */ +typedef void *z_erofs_next_pcluster_t; + /* * Structure fields follow one of the following exclusion rules. * * I: Modifiable by initialization/destruction paths and read-only * for everyone else; * - * L: Field should be protected by pageset lock; + * L: Field should be protected by the pcluster lock; * * A: Field should be accessed / updated in atomic for parallelized code. */ -struct z_erofs_collection { +struct z_erofs_pcluster { + struct erofs_workgroup obj; struct mutex lock; + /* A: point to next chained pcluster or TAILs */ + z_erofs_next_pcluster_t next; + + /* A: lower limit of decompressed length and if full length or not */ + unsigned int length; + /* I: page offset of start position of decompression */ - unsigned short pageofs; + unsigned short pageofs_out; + + /* I: page offset of inline compressed data */ + unsigned short pageofs_in; /* L: maximum relative page index in pagevec[] */ unsigned short nr_pages; @@ -41,29 +60,6 @@ struct z_erofs_collection { /* I: can be used to free the pcluster by RCU. */ struct rcu_head rcu; }; -}; - -#define Z_EROFS_PCLUSTER_FULL_LENGTH 0x00000001 -#define Z_EROFS_PCLUSTER_LENGTH_BIT 1 - -/* - * let's leave a type here in case of introducing - * another tagged pointer later. - */ -typedef void *z_erofs_next_pcluster_t; - -struct z_erofs_pcluster { - struct erofs_workgroup obj; - struct z_erofs_collection primary_collection; - - /* A: point to next chained pcluster or TAILs */ - z_erofs_next_pcluster_t next; - - /* A: lower limit of decompressed length and if full length or not */ - unsigned int length; - - /* I: page offset of inline compressed data */ - unsigned short pageofs_in; union { /* I: physical cluster size in pages */ @@ -80,8 +76,6 @@ struct z_erofs_pcluster { struct page *compressed_pages[]; }; -#define z_erofs_primarycollection(pcluster) (&(pcluster)->primary_collection) - /* let's avoid the valid 32-bit kernel addresses */ /* the chained workgroup has't submitted io (still open) */ From 39397a46cff3d7b7d3b45b3283491af05bdfb64b Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Sun, 29 May 2022 13:54:24 +0800 Subject: [PATCH 316/334] erofs: get rid of label `restart_now' Simplify this part of code. No logic changes. 
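For readers skimming the diff, the way the label disappears is worth spelling out: the "in range" fast-path test is inverted so the out-of-range handling becomes the body of the if, and the code that used to sit after restart_now is simply reached by falling through. A small self-contained sketch of that transformation, using invented helpers rather than the erofs functions, follows; both versions return identical results.

/* Illustrative only -- invented helpers, not the erofs code. */
#include <stdbool.h>
#include <stdio.h>

static bool in_current_extent(int pos) { return pos >= 0 && pos < 10; }

/* Before: the fast path jumps ahead, the slow path falls into a label. */
static int advance_old(int pos, bool have_cluster)
{
        if (in_current_extent(pos)) {
                if (!have_cluster)
                        goto restart_now;       /* rare case */
                goto hitted;
        }
        pos += 10;                              /* remap (slow path) */
restart_now:
        pos += 1;                               /* set up the cluster */
hitted:
        return pos;
}

/* After: inverting the test removes the label; the logic is unchanged. */
static int advance_new(int pos, bool have_cluster)
{
        if (!in_current_extent(pos)) {
                pos += 10;
        } else if (have_cluster) {
                goto hitted;
        }
        pos += 1;
hitted:
        return pos;
}

int main(void)
{
        for (int pos = -5; pos < 15; pos++)
                for (int c = 0; c < 2; c++)
                        if (advance_old(pos, c) != advance_new(pos, c))
                                printf("mismatch at %d/%d\n", pos, c);
        printf("equivalent\n");
        return 0;
}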
Link: https://lore.kernel.org/r/20220529055425.226363-3-xiang@kernel.org Acked-by: Chao Yu Signed-off-by: Gao Xiang --- fs/erofs/zdata.c | 33 ++++++++++++++------------------- 1 file changed, 14 insertions(+), 19 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 4fd66a66c5f9..6dd858f94e44 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -643,28 +643,23 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, repeat: cur = end - 1; - /* lucky, within the range of the current map_blocks */ - if (offset + cur >= map->m_la && - offset + cur < map->m_la + map->m_llen) { + if (offset + cur < map->m_la || + offset + cur >= map->m_la + map->m_llen) { + erofs_dbg("out-of-range map @ pos %llu", offset + cur); + + if (z_erofs_collector_end(fe)) + fe->backmost = false; + map->m_la = offset + cur; + map->m_llen = 0; + err = z_erofs_map_blocks_iter(inode, map, 0); + if (err) + goto err_out; + } else { + if (fe->pcl) + goto hitted; /* didn't get a valid pcluster previously (very rare) */ - if (!fe->pcl) - goto restart_now; - goto hitted; } - /* go ahead the next map_blocks */ - erofs_dbg("%s: [out-of-range] pos %llu", __func__, offset + cur); - - if (z_erofs_collector_end(fe)) - fe->backmost = false; - - map->m_la = offset + cur; - map->m_llen = 0; - err = z_erofs_map_blocks_iter(inode, map, 0); - if (err) - goto err_out; - -restart_now: if (!(map->m_flags & EROFS_MAP_MAPPED)) goto hitted; From aa793b46bb9342ae3c6152fc21654b8ade8dd125 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Sun, 29 May 2022 13:54:25 +0800 Subject: [PATCH 317/334] erofs: simplify z_erofs_pcluster_readmore() Get rid of unnecessary label `skip'. No logic changes. Link: https://lore.kernel.org/r/20220529055425.226363-4-xiang@kernel.org Acked-by: Chao Yu Signed-off-by: Gao Xiang --- fs/erofs/zdata.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 6dd858f94e44..b33fb64b3393 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -1436,22 +1436,19 @@ static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f, struct page *page; page = erofs_grab_cache_page_nowait(inode->i_mapping, index); - if (!page) - goto skip; - - if (PageUptodate(page)) { - unlock_page(page); + if (page) { + if (PageUptodate(page)) { + unlock_page(page); + } else { + err = z_erofs_do_read_page(f, page, pagepool); + if (err) + erofs_err(inode->i_sb, + "readmore error at page %lu @ nid %llu", + index, EROFS_I(inode)->nid); + } put_page(page); - goto skip; } - err = z_erofs_do_read_page(f, page, pagepool); - if (err) - erofs_err(inode->i_sb, - "readmore error at page %lu @ nid %llu", - index, EROFS_I(inode)->nid); - put_page(page); -skip: if (cur < PAGE_SIZE) break; cur = (index << PAGE_SHIFT) - 1; From 4398d3c31b582db0d640b23434bf344a6c8df57c Mon Sep 17 00:00:00 2001 From: Weizhao Ouyang Date: Mon, 30 May 2022 15:51:14 +0800 Subject: [PATCH 318/334] erofs: fix 'backmost' member of z_erofs_decompress_frontend Initialize 'backmost' to true in DECOMPRESS_FRONTEND_INIT. 
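The underlying C rule is what makes this a one-line fix: in a designated initializer, every member that is not named is zero-initialized, so once the init macro stopped naming 'backmost' (apparently fallout of the refactor named in the Fixes tag) the field silently started out false. The standalone model below uses simplified stand-in types, not the real erofs definitions, to show the effect that the one-line change in the diff corrects.

/* Illustrative only -- simplified stand-ins, not the erofs structures. */
#include <stdbool.h>
#include <stdio.h>

struct frontend {
        void *inode;
        int mode;
        bool backmost;  /* member that must start out true */
};

/* Old macro shape: 'backmost' is not named, so it defaults to false. */
#define FRONTEND_INIT_OLD(i)    { .inode = (i), .mode = 1 }

/* Fixed shape, mirroring the patch: initialize it explicitly. */
#define FRONTEND_INIT_NEW(i)    { .inode = (i), .mode = 1, .backmost = true }

int main(void)
{
        struct frontend a = FRONTEND_INIT_OLD(NULL);
        struct frontend b = FRONTEND_INIT_NEW(NULL);

        printf("old backmost=%d, fixed backmost=%d\n", a.backmost, b.backmost);
        return 0;
}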
Fixes: 5c6dcc57e2e5 ("erofs: get rid of `struct z_erofs_collector'") Signed-off-by: Weizhao Ouyang Reviewed-by: Gao Xiang Reviewed-by: Yue Hu Reviewed-by: Chao Yu Link: https://lore.kernel.org/r/20220530075114.918874-1-o451686892@gmail.com Signed-off-by: Gao Xiang --- fs/erofs/zdata.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index b33fb64b3393..90c5b10c5794 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -213,7 +213,7 @@ struct z_erofs_decompress_frontend { #define DECOMPRESS_FRONTEND_INIT(__i) { \ .inode = __i, .owned_head = Z_EROFS_PCLUSTER_TAIL, \ - .mode = COLLECT_PRIMARY_FOLLOWED } + .mode = COLLECT_PRIMARY_FOLLOWED, .backmost = true } static struct page *z_pagemap_global[Z_EROFS_VMAP_GLOBAL_PAGES]; static DEFINE_MUTEX(z_pagemap_global_lock); From 9571f829f30a89b888ec4c3a72f5a04573f0e058 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Tue, 31 May 2022 12:16:49 -0400 Subject: [PATCH 319/334] dm table: fix dm_table_supports_poll to return false if no data devices It was reported that the "generic/250" test in xfstests (which uses the dm-error target) demonstrates a regression where the kernel crashes in bioset_exit(). Since commit cfc97abcbe0b ("dm: conditionally enable BIOSET_PERCPU_CACHE for dm_io bioset") the bioset_init() for the dm_io bioset will setup the bioset's per-cpu alloc cache if all devices have QUEUE_FLAG_POLL set. But there was an bug where a target that doesn't have any data devices (and that doesn't even set the .iterate_devices dm target callback) will incorrectly return true from dm_table_supports_poll(). Fix this by updating dm_table_supports_poll() to follow dm-table.c's well-worn pattern for testing that _all_ targets in a DM table do in fact have underlying devices that set QUEUE_FLAG_POLL. NOTE: An additional block fix is still needed so that bio_alloc_cache_destroy() clears the bioset's ->cache member. Otherwise, a DM device's table reload that transitions the DM device's bioset from using a per-cpu alloc cache to _not_ using one will result in bioset_exit() crashing in bio_alloc_cache_destroy() because dm's dm_io bioset ("io_bs") was left with a stale ->cache member. 
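The poll check was a vacuous-truth bug: "no device failed the predicate" is trivially true for a target such as dm-error that has no data devices at all, so the dm_io bioset was configured as if polling were supported. The sketch below models the two shapes of the check with invented types; it is not the device-mapper API, and the real fix in the diff that follows applies the same all-targets pattern with the actual dm helpers.

/* Illustrative only -- invented types, not the device-mapper API. */
#include <stdbool.h>
#include <stdio.h>

struct target {
        /* NULL for targets like dm-error that have no data devices. */
        bool (*iterate_devices)(const struct target *t,
                                bool (*fn)(const struct target *t));
        bool dev_poll_capable;
};

static bool device_not_poll_capable(const struct target *t)
{
        return !t->dev_poll_capable;
}

/* Buggy shape: "no device failed" is trivially true when there are none. */
static bool supports_poll_old(struct target *tgts, int n)
{
        for (int i = 0; i < n; i++)
                if (tgts[i].iterate_devices &&
                    tgts[i].iterate_devices(&tgts[i], device_not_poll_capable))
                        return false;
        return true;
}

/* Fixed shape: every target must have devices and all must support poll. */
static bool supports_poll_new(struct target *tgts, int n)
{
        for (int i = 0; i < n; i++)
                if (!tgts[i].iterate_devices ||
                    tgts[i].iterate_devices(&tgts[i], device_not_poll_capable))
                        return false;
        return true;
}

int main(void)
{
        struct target error_target = { .iterate_devices = NULL };

        printf("old: %d, new: %d\n",
               supports_poll_old(&error_target, 1),
               supports_poll_new(&error_target, 1));
        return 0;
}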
Fixes: cfc97abcbe0b ("dm: conditionally enable BIOSET_PERCPU_CACHE for dm_io bioset") Reported-by: Matthew Wilcox Reported-by: Dave Chinner Signed-off-by: Mike Snitzer --- drivers/md/dm-table.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index a37c7b763643..0e833a154b31 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1005,7 +1005,7 @@ bool dm_table_request_based(struct dm_table *t) return __table_type_request_based(dm_table_get_type(t)); } -static int dm_table_supports_poll(struct dm_table *t); +static bool dm_table_supports_poll(struct dm_table *t); static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device *md) { @@ -1027,7 +1027,7 @@ static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device * per_io_data_size = max(per_io_data_size, ti->per_io_data_size); min_pool_size = max(min_pool_size, ti->num_flush_bios); } - poll_supported = !!dm_table_supports_poll(t); + poll_supported = dm_table_supports_poll(t); } t->mempools = dm_alloc_md_mempools(md, type, per_io_data_size, min_pool_size, @@ -1547,9 +1547,20 @@ static int count_device(struct dm_target *ti, struct dm_dev *dev, return 0; } -static int dm_table_supports_poll(struct dm_table *t) +static bool dm_table_supports_poll(struct dm_table *t) { - return !dm_table_any_dev_attr(t, device_not_poll_capable, NULL); + struct dm_target *ti; + unsigned i = 0; + + while (i < dm_table_get_num_targets(t)) { + ti = dm_table_get_target(t, i++); + + if (!ti->type->iterate_devices || + ti->type->iterate_devices(ti, device_not_poll_capable, NULL)) + return false; + } + + return true; } /* From 7894025c783ca36394d3afe49c8cfb4c830b82fe Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Wed, 11 May 2022 15:18:53 -0500 Subject: [PATCH 320/334] Revert "PCI: brcmstb: Do not turn off WOL regulators on suspend" This reverts commit 11ed8b8624b8085f706864b4addcd304b1e4fc38. This is part of a revert of the following commits: 11ed8b8624b8 ("PCI: brcmstb: Do not turn off WOL regulators on suspend") 93e41f3fca3d ("PCI: brcmstb: Add control of subdevice voltage regulators") 67211aadcb4b ("PCI: brcmstb: Add mechanism to turn on subdev regulators") 830aa6f29f07 ("PCI: brcmstb: Split brcm_pcie_setup() into two funcs") Cyril reported that 830aa6f29f07 ("PCI: brcmstb: Split brcm_pcie_setup() into two funcs"), which appeared in v5.17-rc1, broke booting on the Raspberry Pi Compute Module 4. Apparently 830aa6f29f07 panics with an Asynchronous SError Interrupt, and after further commits here is a black screen on HDMI and no output on the serial console. This does not seem to affect the Raspberry Pi 4 B. 
Link: https://bugzilla.kernel.org/show_bug.cgi?id=215925 Link: https://lore.kernel.org/r/20220511201856.808690-2-helgaas@kernel.org Reported-by: Cyril Brulebois Signed-off-by: Bjorn Helgaas --- drivers/pci/controller/pcie-brcmstb.c | 53 +++++---------------------- 1 file changed, 9 insertions(+), 44 deletions(-) diff --git a/drivers/pci/controller/pcie-brcmstb.c b/drivers/pci/controller/pcie-brcmstb.c index 375c0c40bbf8..3edd63735948 100644 --- a/drivers/pci/controller/pcie-brcmstb.c +++ b/drivers/pci/controller/pcie-brcmstb.c @@ -333,7 +333,6 @@ struct brcm_pcie { void (*bridge_sw_init_set)(struct brcm_pcie *pcie, u32 val); bool refusal_mode; struct subdev_regulators *sr; - bool ep_wakeup_capable; }; static inline bool is_bmips(const struct brcm_pcie *pcie) @@ -1351,21 +1350,9 @@ static void brcm_pcie_turn_off(struct brcm_pcie *pcie) pcie->bridge_sw_init_set(pcie, 1); } -static int pci_dev_may_wakeup(struct pci_dev *dev, void *data) -{ - bool *ret = data; - - if (device_may_wakeup(&dev->dev)) { - *ret = true; - dev_info(&dev->dev, "disable cancelled for wake-up device\n"); - } - return (int) *ret; -} - static int brcm_pcie_suspend(struct device *dev) { struct brcm_pcie *pcie = dev_get_drvdata(dev); - struct pci_host_bridge *bridge = pci_host_bridge_from_priv(pcie); int ret; brcm_pcie_turn_off(pcie); @@ -1384,22 +1371,11 @@ static int brcm_pcie_suspend(struct device *dev) } if (pcie->sr) { - /* - * Now turn off the regulators, but if at least one - * downstream device is enabled as a wake-up source, do not - * turn off regulators. - */ - pcie->ep_wakeup_capable = false; - pci_walk_bus(bridge->bus, pci_dev_may_wakeup, - &pcie->ep_wakeup_capable); - if (!pcie->ep_wakeup_capable) { - ret = regulator_bulk_disable(pcie->sr->num_supplies, - pcie->sr->supplies); - if (ret) { - dev_err(dev, "Could not turn off regulators\n"); - reset_control_reset(pcie->rescal); - return ret; - } + ret = regulator_bulk_disable(pcie->sr->num_supplies, pcie->sr->supplies); + if (ret) { + dev_err(dev, "Could not turn off regulators\n"); + reset_control_reset(pcie->rescal); + return ret; } } clk_disable_unprepare(pcie->clk); @@ -1420,21 +1396,10 @@ static int brcm_pcie_resume(struct device *dev) return ret; if (pcie->sr) { - if (pcie->ep_wakeup_capable) { - /* - * We are resuming from a suspend. In the suspend we - * did not disable the power supplies, so there is - * no need to enable them (and falsely increase their - * usage count). - */ - pcie->ep_wakeup_capable = false; - } else { - ret = regulator_bulk_enable(pcie->sr->num_supplies, - pcie->sr->supplies); - if (ret) { - dev_err(dev, "Could not turn on regulators\n"); - goto err_disable_clk; - } + ret = regulator_bulk_enable(pcie->sr->num_supplies, pcie->sr->supplies); + if (ret) { + dev_err(dev, "Could not turn on regulators\n"); + goto err_disable_clk; } } From 212942609d83b591f5a2f2691df122d13aa3a87d Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Wed, 11 May 2022 15:18:54 -0500 Subject: [PATCH 321/334] Revert "PCI: brcmstb: Add control of subdevice voltage regulators" This reverts commit 93e41f3fca3d4a0f927b784012338c37f80a8a80. 
This is part of a revert of the following commits: 11ed8b8624b8 ("PCI: brcmstb: Do not turn off WOL regulators on suspend") 93e41f3fca3d ("PCI: brcmstb: Add control of subdevice voltage regulators") 67211aadcb4b ("PCI: brcmstb: Add mechanism to turn on subdev regulators") 830aa6f29f07 ("PCI: brcmstb: Split brcm_pcie_setup() into two funcs") Cyril reported that 830aa6f29f07 ("PCI: brcmstb: Split brcm_pcie_setup() into two funcs"), which appeared in v5.17-rc1, broke booting on the Raspberry Pi Compute Module 4. Apparently 830aa6f29f07 panics with an Asynchronous SError Interrupt, and after further commits here is a black screen on HDMI and no output on the serial console. This does not seem to affect the Raspberry Pi 4 B. Link: https://bugzilla.kernel.org/show_bug.cgi?id=215925 Link: https://lore.kernel.org/r/20220511201856.808690-3-helgaas@kernel.org Reported-by: Cyril Brulebois Signed-off-by: Bjorn Helgaas --- drivers/pci/controller/pcie-brcmstb.c | 83 ++------------------------- 1 file changed, 5 insertions(+), 78 deletions(-) diff --git a/drivers/pci/controller/pcie-brcmstb.c b/drivers/pci/controller/pcie-brcmstb.c index 3edd63735948..fd464d38fecb 100644 --- a/drivers/pci/controller/pcie-brcmstb.c +++ b/drivers/pci/controller/pcie-brcmstb.c @@ -196,8 +196,6 @@ static inline void brcm_pcie_bridge_sw_init_set_generic(struct brcm_pcie *pcie, static inline void brcm_pcie_perst_set_4908(struct brcm_pcie *pcie, u32 val); static inline void brcm_pcie_perst_set_7278(struct brcm_pcie *pcie, u32 val); static inline void brcm_pcie_perst_set_generic(struct brcm_pcie *pcie, u32 val); -static int brcm_pcie_linkup(struct brcm_pcie *pcie); -static int brcm_pcie_add_bus(struct pci_bus *bus); enum { RGR1_SW_INIT_1, @@ -331,8 +329,6 @@ struct brcm_pcie { u32 hw_rev; void (*perst_set)(struct brcm_pcie *pcie, u32 val); void (*bridge_sw_init_set)(struct brcm_pcie *pcie, u32 val); - bool refusal_mode; - struct subdev_regulators *sr; }; static inline bool is_bmips(const struct brcm_pcie *pcie) @@ -501,34 +497,6 @@ static int pci_subdev_regulators_add_bus(struct pci_bus *bus) return 0; } -static int brcm_pcie_add_bus(struct pci_bus *bus) -{ - struct device *dev = &bus->dev; - struct brcm_pcie *pcie = (struct brcm_pcie *) bus->sysdata; - int ret; - - if (!dev->of_node || !bus->parent || !pci_is_root_bus(bus->parent)) - return 0; - - ret = pci_subdev_regulators_add_bus(bus); - if (ret) - return ret; - - /* Grab the regulators for suspend/resume */ - pcie->sr = bus->dev.driver_data; - - /* - * If we have failed linkup there is no point to return an error as - * currently it will cause a WARNING() from pci_alloc_child_bus(). - * We return 0 and turn on the "refusal_mode" so that any further - * accesses to the pci_dev just get 0xffffffff - */ - if (brcm_pcie_linkup(pcie) != 0) - pcie->refusal_mode = true; - - return 0; -} - static void pci_subdev_regulators_remove_bus(struct pci_bus *bus) { struct device *dev = &bus->dev; @@ -857,18 +825,6 @@ static void __iomem *brcm_pcie_map_conf(struct pci_bus *bus, unsigned int devfn, /* Accesses to the RC go right to the RC registers if slot==0 */ if (pci_is_root_bus(bus)) return PCI_SLOT(devfn) ? NULL : base + where; - if (pcie->refusal_mode) { - /* - * At this point we do not have link. There will be a CPU - * abort -- a quirk with this controller --if Linux tries - * to read any config-space registers besides those - * targeting the host bridge. To prevent this we hijack - * the address to point to a safe access that will return - * 0xffffffff. 
- */ - writel(0xffffffff, base + PCIE_MISC_RC_BAR2_CONFIG_HI); - return base + PCIE_MISC_RC_BAR2_CONFIG_HI + (where & 0x3); - } /* For devices, write to the config space index register */ idx = PCIE_ECAM_OFFSET(bus->number, devfn, 0); @@ -897,7 +853,7 @@ static struct pci_ops brcm_pcie_ops = { .map_bus = brcm_pcie_map_conf, .read = pci_generic_config_read, .write = pci_generic_config_write, - .add_bus = brcm_pcie_add_bus, + .add_bus = pci_subdev_regulators_add_bus, .remove_bus = pci_subdev_regulators_remove_bus, }; @@ -1370,14 +1326,6 @@ static int brcm_pcie_suspend(struct device *dev) return ret; } - if (pcie->sr) { - ret = regulator_bulk_disable(pcie->sr->num_supplies, pcie->sr->supplies); - if (ret) { - dev_err(dev, "Could not turn off regulators\n"); - reset_control_reset(pcie->rescal); - return ret; - } - } clk_disable_unprepare(pcie->clk); return 0; @@ -1395,17 +1343,9 @@ static int brcm_pcie_resume(struct device *dev) if (ret) return ret; - if (pcie->sr) { - ret = regulator_bulk_enable(pcie->sr->num_supplies, pcie->sr->supplies); - if (ret) { - dev_err(dev, "Could not turn on regulators\n"); - goto err_disable_clk; - } - } - ret = reset_control_reset(pcie->rescal); if (ret) - goto err_regulator; + goto err_disable_clk; ret = brcm_phy_start(pcie); if (ret) @@ -1437,9 +1377,6 @@ static int brcm_pcie_resume(struct device *dev) err_reset: reset_control_rearm(pcie->rescal); -err_regulator: - if (pcie->sr) - regulator_bulk_disable(pcie->sr->num_supplies, pcie->sr->supplies); err_disable_clk: clk_disable_unprepare(pcie->clk); return ret; @@ -1571,17 +1508,7 @@ static int brcm_pcie_probe(struct platform_device *pdev) platform_set_drvdata(pdev, pcie); - ret = pci_host_probe(bridge); - if (!ret && !brcm_pcie_link_up(pcie)) - ret = -ENODEV; - - if (ret) { - brcm_pcie_remove(pdev); - return ret; - } - - return 0; - + return pci_host_probe(bridge); fail: __brcm_pcie_remove(pcie); return ret; @@ -1590,8 +1517,8 @@ fail: MODULE_DEVICE_TABLE(of, brcm_pcie_match); static const struct dev_pm_ops brcm_pcie_pm_ops = { - .suspend_noirq = brcm_pcie_suspend, - .resume_noirq = brcm_pcie_resume, + .suspend = brcm_pcie_suspend, + .resume = brcm_pcie_resume, }; static struct platform_driver brcm_pcie_driver = { From 420be2f7ebe60c9ba3e332f5290017cd168e2bf8 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Wed, 11 May 2022 15:18:55 -0500 Subject: [PATCH 322/334] Revert "PCI: brcmstb: Add mechanism to turn on subdev regulators" This reverts commit 67211aadcb4b968d0fdc57bc27240fa71500c2d4. This is part of a revert of the following commits: 11ed8b8624b8 ("PCI: brcmstb: Do not turn off WOL regulators on suspend") 93e41f3fca3d ("PCI: brcmstb: Add control of subdevice voltage regulators") 67211aadcb4b ("PCI: brcmstb: Add mechanism to turn on subdev regulators") 830aa6f29f07 ("PCI: brcmstb: Split brcm_pcie_setup() into two funcs") Cyril reported that 830aa6f29f07 ("PCI: brcmstb: Split brcm_pcie_setup() into two funcs"), which appeared in v5.17-rc1, broke booting on the Raspberry Pi Compute Module 4. Apparently 830aa6f29f07 panics with an Asynchronous SError Interrupt, and after further commits here is a black screen on HDMI and no output on the serial console. This does not seem to affect the Raspberry Pi 4 B. 
Link: https://bugzilla.kernel.org/show_bug.cgi?id=215925 Link: https://lore.kernel.org/r/20220511201856.808690-4-helgaas@kernel.org Reported-by: Cyril Brulebois Signed-off-by: Bjorn Helgaas --- drivers/pci/controller/pcie-brcmstb.c | 76 --------------------------- 1 file changed, 76 deletions(-) diff --git a/drivers/pci/controller/pcie-brcmstb.c b/drivers/pci/controller/pcie-brcmstb.c index fd464d38fecb..0e8346114a8d 100644 --- a/drivers/pci/controller/pcie-brcmstb.c +++ b/drivers/pci/controller/pcie-brcmstb.c @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include @@ -284,14 +283,6 @@ static const struct pcie_cfg_data bcm2711_cfg = { .bridge_sw_init_set = brcm_pcie_bridge_sw_init_set_generic, }; -struct subdev_regulators { - unsigned int num_supplies; - struct regulator_bulk_data supplies[]; -}; - -static int pci_subdev_regulators_add_bus(struct pci_bus *bus); -static void pci_subdev_regulators_remove_bus(struct pci_bus *bus); - struct brcm_msi { struct device *dev; void __iomem *base; @@ -445,71 +436,6 @@ static int brcm_pcie_set_ssc(struct brcm_pcie *pcie) return ssc && pll ? 0 : -EIO; } -static void *alloc_subdev_regulators(struct device *dev) -{ - static const char * const supplies[] = { - "vpcie3v3", - "vpcie3v3aux", - "vpcie12v", - }; - const size_t size = sizeof(struct subdev_regulators) - + sizeof(struct regulator_bulk_data) * ARRAY_SIZE(supplies); - struct subdev_regulators *sr; - int i; - - sr = devm_kzalloc(dev, size, GFP_KERNEL); - if (sr) { - sr->num_supplies = ARRAY_SIZE(supplies); - for (i = 0; i < ARRAY_SIZE(supplies); i++) - sr->supplies[i].supply = supplies[i]; - } - - return sr; -} - -static int pci_subdev_regulators_add_bus(struct pci_bus *bus) -{ - struct device *dev = &bus->dev; - struct subdev_regulators *sr; - int ret; - - if (!dev->of_node || !bus->parent || !pci_is_root_bus(bus->parent)) - return 0; - - if (dev->driver_data) - dev_err(dev, "dev.driver_data unexpectedly non-NULL\n"); - - sr = alloc_subdev_regulators(dev); - if (!sr) - return -ENOMEM; - - dev->driver_data = sr; - ret = regulator_bulk_get(dev, sr->num_supplies, sr->supplies); - if (ret) - return ret; - - ret = regulator_bulk_enable(sr->num_supplies, sr->supplies); - if (ret) { - dev_err(dev, "failed to enable regulators for downstream device\n"); - return ret; - } - - return 0; -} - -static void pci_subdev_regulators_remove_bus(struct pci_bus *bus) -{ - struct device *dev = &bus->dev; - struct subdev_regulators *sr = dev->driver_data; - - if (!sr || !bus->parent || !pci_is_root_bus(bus->parent)) - return; - - if (regulator_bulk_disable(sr->num_supplies, sr->supplies)) - dev_err(dev, "failed to disable regulators for downstream device\n"); - dev->driver_data = NULL; -} - /* Limits operation to a specific generation (1, 2, or 3) */ static void brcm_pcie_set_gen(struct brcm_pcie *pcie, int gen) { @@ -853,8 +779,6 @@ static struct pci_ops brcm_pcie_ops = { .map_bus = brcm_pcie_map_conf, .read = pci_generic_config_read, .write = pci_generic_config_write, - .add_bus = pci_subdev_regulators_add_bus, - .remove_bus = pci_subdev_regulators_remove_bus, }; static struct pci_ops brcm_pcie_ops32 = { From f4fd559de3434c44bed1d2912bd0c75cfa42898b Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Wed, 11 May 2022 15:18:56 -0500 Subject: [PATCH 323/334] Revert "PCI: brcmstb: Split brcm_pcie_setup() into two funcs" This reverts commit 830aa6f29f07a4e2f1a947dfa72b3ccddb46dd21. 
This is part of a revert of the following commits: 11ed8b8624b8 ("PCI: brcmstb: Do not turn off WOL regulators on suspend") 93e41f3fca3d ("PCI: brcmstb: Add control of subdevice voltage regulators") 67211aadcb4b ("PCI: brcmstb: Add mechanism to turn on subdev regulators") 830aa6f29f07 ("PCI: brcmstb: Split brcm_pcie_setup() into two funcs") Cyril reported that 830aa6f29f07 ("PCI: brcmstb: Split brcm_pcie_setup() into two funcs"), which appeared in v5.17-rc1, broke booting on the Raspberry Pi Compute Module 4. Apparently 830aa6f29f07 panics with an Asynchronous SError Interrupt, and after further commits here is a black screen on HDMI and no output on the serial console. This does not seem to affect the Raspberry Pi 4 B. Link: https://bugzilla.kernel.org/show_bug.cgi?id=215925 Link: https://lore.kernel.org/r/20220511201856.808690-5-helgaas@kernel.org Reported-by: Cyril Brulebois Signed-off-by: Bjorn Helgaas --- drivers/pci/controller/pcie-brcmstb.c | 65 +++++++++++---------------- 1 file changed, 26 insertions(+), 39 deletions(-) diff --git a/drivers/pci/controller/pcie-brcmstb.c b/drivers/pci/controller/pcie-brcmstb.c index 0e8346114a8d..e61058e13818 100644 --- a/drivers/pci/controller/pcie-brcmstb.c +++ b/drivers/pci/controller/pcie-brcmstb.c @@ -926,9 +926,16 @@ static inline int brcm_pcie_get_rc_bar2_size_and_offset(struct brcm_pcie *pcie, static int brcm_pcie_setup(struct brcm_pcie *pcie) { + struct pci_host_bridge *bridge = pci_host_bridge_from_priv(pcie); u64 rc_bar2_offset, rc_bar2_size; void __iomem *base = pcie->base; - int ret, memc; + struct device *dev = pcie->dev; + struct resource_entry *entry; + bool ssc_good = false; + struct resource *res; + int num_out_wins = 0; + u16 nlw, cls, lnksta; + int i, ret, memc; u32 tmp, burst, aspm_support; /* Reset the bridge */ @@ -1018,40 +1025,6 @@ static int brcm_pcie_setup(struct brcm_pcie *pcie) if (pcie->gen) brcm_pcie_set_gen(pcie, pcie->gen); - /* Don't advertise L0s capability if 'aspm-no-l0s' */ - aspm_support = PCIE_LINK_STATE_L1; - if (!of_property_read_bool(pcie->np, "aspm-no-l0s")) - aspm_support |= PCIE_LINK_STATE_L0S; - tmp = readl(base + PCIE_RC_CFG_PRIV1_LINK_CAPABILITY); - u32p_replace_bits(&tmp, aspm_support, - PCIE_RC_CFG_PRIV1_LINK_CAPABILITY_ASPM_SUPPORT_MASK); - writel(tmp, base + PCIE_RC_CFG_PRIV1_LINK_CAPABILITY); - - /* - * For config space accesses on the RC, show the right class for - * a PCIe-PCIe bridge (the default setting is to be EP mode). 
- */ - tmp = readl(base + PCIE_RC_CFG_PRIV1_ID_VAL3); - u32p_replace_bits(&tmp, 0x060400, - PCIE_RC_CFG_PRIV1_ID_VAL3_CLASS_CODE_MASK); - writel(tmp, base + PCIE_RC_CFG_PRIV1_ID_VAL3); - - return 0; -} - -static int brcm_pcie_linkup(struct brcm_pcie *pcie) -{ - struct pci_host_bridge *bridge = pci_host_bridge_from_priv(pcie); - struct device *dev = pcie->dev; - void __iomem *base = pcie->base; - struct resource_entry *entry; - struct resource *res; - int num_out_wins = 0; - u16 nlw, cls, lnksta; - bool ssc_good = false; - u32 tmp; - int ret, i; - /* Unassert the fundamental reset */ pcie->perst_set(pcie, 0); @@ -1102,6 +1075,24 @@ static int brcm_pcie_linkup(struct brcm_pcie *pcie) num_out_wins++; } + /* Don't advertise L0s capability if 'aspm-no-l0s' */ + aspm_support = PCIE_LINK_STATE_L1; + if (!of_property_read_bool(pcie->np, "aspm-no-l0s")) + aspm_support |= PCIE_LINK_STATE_L0S; + tmp = readl(base + PCIE_RC_CFG_PRIV1_LINK_CAPABILITY); + u32p_replace_bits(&tmp, aspm_support, + PCIE_RC_CFG_PRIV1_LINK_CAPABILITY_ASPM_SUPPORT_MASK); + writel(tmp, base + PCIE_RC_CFG_PRIV1_LINK_CAPABILITY); + + /* + * For config space accesses on the RC, show the right class for + * a PCIe-PCIe bridge (the default setting is to be EP mode). + */ + tmp = readl(base + PCIE_RC_CFG_PRIV1_ID_VAL3); + u32p_replace_bits(&tmp, 0x060400, + PCIE_RC_CFG_PRIV1_ID_VAL3_CLASS_CODE_MASK); + writel(tmp, base + PCIE_RC_CFG_PRIV1_ID_VAL3); + if (pcie->ssc) { ret = brcm_pcie_set_ssc(pcie); if (ret == 0) @@ -1290,10 +1281,6 @@ static int brcm_pcie_resume(struct device *dev) if (ret) goto err_reset; - ret = brcm_pcie_linkup(pcie); - if (ret) - goto err_reset; - if (pcie->msi) brcm_msi_set_regs(pcie->msi); From 12068bb346db5776d0ec9bb4cd073f8427a1ac92 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Thu, 26 May 2022 16:52:23 -0500 Subject: [PATCH 324/334] PCI/PM: Fix bridge_d3_blacklist[] Elo i2 overwrite of Gigabyte X299 92597f97a40b ("PCI/PM: Avoid putting Elo i2 PCIe Ports in D3cold") omitted braces around the new Elo i2 entry, so it overwrote the existing Gigabyte X299 entry. Add the appropriate braces. Found by: $ make W=1 drivers/pci/pci.o CC drivers/pci/pci.o drivers/pci/pci.c:2974:12: error: initialized field overwritten [-Werror=override-init] 2974 | .ident = "Elo i2", | ^~~~~~~~ Link: https://lore.kernel.org/r/20220526221258.GA409855@bhelgaas Fixes: 92597f97a40b ("PCI/PM: Avoid putting Elo i2 PCIe Ports in D3cold") Signed-off-by: Bjorn Helgaas Cc: stable@vger.kernel.org # v5.15+ --- drivers/pci/pci.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 876182873468..cfaf40a540a8 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -2967,6 +2967,8 @@ static const struct dmi_system_id bridge_d3_blacklist[] = { DMI_MATCH(DMI_BOARD_VENDOR, "Gigabyte Technology Co., Ltd."), DMI_MATCH(DMI_BOARD_NAME, "X299 DESIGNARE EX-CF"), }, + }, + { /* * Downstream device is not accessible after putting a root port * into D3cold and back into D0 on Elo i2. From 833e53a4ffe92e742e99347c68d99dc33986598b Mon Sep 17 00:00:00 2001 From: Lorenzo Pieralisi Date: Sat, 28 May 2022 16:14:11 +0100 Subject: [PATCH 325/334] MAINTAINERS: Update Lorenzo Pieralisi's email address I will soon lose my @arm.com email address, so to prevent any possible issue let's update all kernel references (inclusive of .mailmap) to my @kernel.org alias ahead of time. 
My @arm.com address is still working and will likely resume to work at some point in the future; nonetheless, it is safer to switch to the @kernel.org alias from now onwards so that continuity is guaranteed. Link: https://lore.kernel.org/r/20220528151411.29810-1-lorenzo.pieralisi@arm.com Signed-off-by: Lorenzo Pieralisi Signed-off-by: Bjorn Helgaas Cc: Will Deacon Cc: Catalin Marinas --- .mailmap | 1 + MAINTAINERS | 16 ++++++++-------- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/.mailmap b/.mailmap index 6d484937f901..9ba38a82aba4 100644 --- a/.mailmap +++ b/.mailmap @@ -236,6 +236,7 @@ Linus Lüssing Li Yang Li Yang +Lorenzo Pieralisi Lukasz Luba Maciej W. Rozycki Maciej W. Rozycki diff --git a/MAINTAINERS b/MAINTAINERS index 392467e9ab73..4fa6a8da4b83 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -382,7 +382,7 @@ F: include/acpi/ F: tools/power/acpi/ ACPI FOR ARM64 (ACPI/arm64) -M: Lorenzo Pieralisi +M: Lorenzo Pieralisi M: Hanjun Guo M: Sudeep Holla L: linux-acpi@vger.kernel.org @@ -2946,7 +2946,7 @@ N: uniphier ARM/VERSATILE EXPRESS PLATFORM M: Liviu Dudau M: Sudeep Holla -M: Lorenzo Pieralisi +M: Lorenzo Pieralisi L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained F: */*/*/vexpress* @@ -5162,7 +5162,7 @@ F: arch/x86/kernel/cpuid.c F: arch/x86/kernel/msr.c CPUIDLE DRIVER - ARM BIG LITTLE -M: Lorenzo Pieralisi +M: Lorenzo Pieralisi M: Daniel Lezcano L: linux-pm@vger.kernel.org L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) @@ -5182,7 +5182,7 @@ F: drivers/cpuidle/cpuidle-exynos.c F: include/linux/platform_data/cpuidle-exynos.h CPUIDLE DRIVER - ARM PSCI -M: Lorenzo Pieralisi +M: Lorenzo Pieralisi M: Sudeep Holla L: linux-pm@vger.kernel.org L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) @@ -15289,7 +15289,7 @@ F: drivers/pci/controller/pci-v3-semi.c PCI ENDPOINT SUBSYSTEM M: Kishon Vijay Abraham I -M: Lorenzo Pieralisi +M: Lorenzo Pieralisi R: Krzysztof Wilczyński L: linux-pci@vger.kernel.org S: Supported @@ -15352,7 +15352,7 @@ F: Documentation/devicetree/bindings/pci/xgene-pci-msi.txt F: drivers/pci/controller/pci-xgene-msi.c PCI NATIVE HOST BRIDGE AND ENDPOINT DRIVERS -M: Lorenzo Pieralisi +M: Lorenzo Pieralisi R: Rob Herring R: Krzysztof Wilczyński L: linux-pci@vger.kernel.org @@ -15905,7 +15905,7 @@ F: include/linux/dtpm.h POWER STATE COORDINATION INTERFACE (PSCI) M: Mark Rutland -M: Lorenzo Pieralisi +M: Lorenzo Pieralisi L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained F: drivers/firmware/psci/ @@ -18277,7 +18277,7 @@ F: drivers/net/ethernet/smsc/smc91x.* SECURE MONITOR CALL(SMC) CALLING CONVENTION (SMCCC) M: Mark Rutland -M: Lorenzo Pieralisi +M: Lorenzo Pieralisi M: Sudeep Holla L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained From 4caae58406f8ceb741603eee460d79bacca9b1b5 Mon Sep 17 00:00:00 2001 From: Sarthak Kukreti Date: Tue, 31 May 2022 15:56:40 -0400 Subject: [PATCH 326/334] dm verity: set DM_TARGET_IMMUTABLE feature flag The device-mapper framework provides a mechanism to mark targets as immutable (and hence fail table reloads that try to change the target type). Add the DM_TARGET_IMMUTABLE flag to the dm-verity target's feature flags to prevent switching the verity target with a different target type. 
Fixes: a4ffc152198e ("dm: add verity target") Cc: stable@vger.kernel.org Signed-off-by: Sarthak Kukreti Reviewed-by: Kees Cook Signed-off-by: Mike Snitzer --- drivers/md/dm-verity-target.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 80133aae0db3..d6dbd47492a8 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -1312,6 +1312,7 @@ bad: static struct target_type verity_target = { .name = "verity", + .features = DM_TARGET_IMMUTABLE, .version = {1, 8, 0}, .module = THIS_MODULE, .ctr = verity_ctr, From c3ed222745d9ad7b69299b349a64ba533c64a34f Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Sat, 14 May 2022 07:05:13 -0400 Subject: [PATCH 327/334] NFSv4: Fix free of uninitialized nfs4_label on referral lookup. Send along the already-allocated fattr along with nfs4_fs_locations, and drop the memcpy of fattr. We end up growing two more allocations, but this fixes up a crash as: PID: 790 TASK: ffff88811b43c000 CPU: 0 COMMAND: "ls" #0 [ffffc90000857920] panic at ffffffff81b9bfde #1 [ffffc900008579c0] do_trap at ffffffff81023a9b #2 [ffffc90000857a10] do_error_trap at ffffffff81023b78 #3 [ffffc90000857a58] exc_stack_segment at ffffffff81be1f45 #4 [ffffc90000857a80] asm_exc_stack_segment at ffffffff81c009de #5 [ffffc90000857b08] nfs_lookup at ffffffffa0302322 [nfs] #6 [ffffc90000857b70] __lookup_slow at ffffffff813a4a5f #7 [ffffc90000857c60] walk_component at ffffffff813a86c4 #8 [ffffc90000857cb8] path_lookupat at ffffffff813a9553 #9 [ffffc90000857cf0] filename_lookup at ffffffff813ab86b Suggested-by: Trond Myklebust Fixes: 9558a007dbc3 ("NFS: Remove the label from the nfs4_lookup_res struct") Signed-off-by: Benjamin Coddington Signed-off-by: Anna Schumaker --- fs/nfs/nfs4namespace.c | 9 +++++++-- fs/nfs/nfs4proc.c | 15 +++++++-------- fs/nfs/nfs4state.c | 9 ++++++++- fs/nfs/nfs4xdr.c | 4 ++-- include/linux/nfs_xdr.h | 2 +- 5 files changed, 25 insertions(+), 14 deletions(-) diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index 3680c8da510c..f2dbf904c598 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -417,6 +417,9 @@ static int nfs_do_refmount(struct fs_context *fc, struct rpc_clnt *client) fs_locations = kmalloc(sizeof(struct nfs4_fs_locations), GFP_KERNEL); if (!fs_locations) goto out_free; + fs_locations->fattr = nfs_alloc_fattr(); + if (!fs_locations->fattr) + goto out_free_2; /* Get locations */ dentry = ctx->clone_data.dentry; @@ -427,14 +430,16 @@ static int nfs_do_refmount(struct fs_context *fc, struct rpc_clnt *client) err = nfs4_proc_fs_locations(client, d_inode(parent), &dentry->d_name, fs_locations, page); dput(parent); if (err != 0) - goto out_free_2; + goto out_free_3; err = -ENOENT; if (fs_locations->nlocations <= 0 || fs_locations->fs_path.ncomponents <= 0) - goto out_free_2; + goto out_free_3; err = nfs_follow_referral(fc, fs_locations); +out_free_3: + kfree(fs_locations->fattr); out_free_2: kfree(fs_locations); out_free: diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 0dfdbb406f96..337609befd17 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4247,6 +4247,8 @@ static int nfs4_get_referral(struct rpc_clnt *client, struct inode *dir, if (locations == NULL) goto out; + locations->fattr = fattr; + status = nfs4_proc_fs_locations(client, dir, name, locations, page); if (status != 0) goto out; @@ -4256,17 +4258,14 @@ static int nfs4_get_referral(struct rpc_clnt *client, struct inode *dir, * referral. 
Cause us to drop into the exception handler, which * will kick off migration recovery. */ - if (nfs_fsid_equal(&NFS_SERVER(dir)->fsid, &locations->fattr.fsid)) { + if (nfs_fsid_equal(&NFS_SERVER(dir)->fsid, &fattr->fsid)) { dprintk("%s: server did not return a different fsid for" " a referral at %s\n", __func__, name->name); status = -NFS4ERR_MOVED; goto out; } /* Fixup attributes for the nfs_lookup() call to nfs_fhget() */ - nfs_fixup_referral_attributes(&locations->fattr); - - /* replace the lookup nfs_fattr with the locations nfs_fattr */ - memcpy(fattr, &locations->fattr, sizeof(struct nfs_fattr)); + nfs_fixup_referral_attributes(fattr); memset(fhandle, 0, sizeof(struct nfs_fh)); out: if (page) @@ -7979,7 +7978,7 @@ static int _nfs4_proc_fs_locations(struct rpc_clnt *client, struct inode *dir, else bitmask[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID; - nfs_fattr_init(&fs_locations->fattr); + nfs_fattr_init(fs_locations->fattr); fs_locations->server = server; fs_locations->nlocations = 0; status = nfs4_call_sync(client, server, &msg, &args.seq_args, &res.seq_res, 0); @@ -8044,7 +8043,7 @@ static int _nfs40_proc_get_locations(struct nfs_server *server, unsigned long now = jiffies; int status; - nfs_fattr_init(&locations->fattr); + nfs_fattr_init(locations->fattr); locations->server = server; locations->nlocations = 0; @@ -8109,7 +8108,7 @@ static int _nfs41_proc_get_locations(struct nfs_server *server, }; int status; - nfs_fattr_init(&locations->fattr); + nfs_fattr_init(locations->fattr); locations->server = server; locations->nlocations = 0; diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 775c58def32c..2540b35ec187 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -2116,6 +2116,11 @@ static int nfs4_try_migration(struct nfs_server *server, const struct cred *cred dprintk("<-- %s: no memory\n", __func__); goto out; } + locations->fattr = nfs_alloc_fattr(); + if (locations->fattr == NULL) { + dprintk("<-- %s: no memory\n", __func__); + goto out; + } inode = d_inode(server->super->s_root); result = nfs4_proc_get_locations(server, NFS_FH(inode), locations, @@ -2130,7 +2135,7 @@ static int nfs4_try_migration(struct nfs_server *server, const struct cred *cred if (!locations->nlocations) goto out; - if (!(locations->fattr.valid & NFS_ATTR_FATTR_V4_LOCATIONS)) { + if (!(locations->fattr->valid & NFS_ATTR_FATTR_V4_LOCATIONS)) { dprintk("<-- %s: No fs_locations data, migration skipped\n", __func__); goto out; @@ -2155,6 +2160,8 @@ static int nfs4_try_migration(struct nfs_server *server, const struct cred *cred out: if (page != NULL) __free_page(page); + if (locations != NULL) + kfree(locations->fattr); kfree(locations); if (result) { pr_err("NFS: migration recovery failed (server %s)\n", diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 0b41e00e754f..acfe5f4bda48 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -7080,7 +7080,7 @@ static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, if (res->migration) { xdr_enter_page(xdr, PAGE_SIZE); status = decode_getfattr_generic(xdr, - &res->fs_locations->fattr, + res->fs_locations->fattr, NULL, res->fs_locations, res->fs_locations->server); if (status) @@ -7093,7 +7093,7 @@ static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, goto out; xdr_enter_page(xdr, PAGE_SIZE); status = decode_getfattr_generic(xdr, - &res->fs_locations->fattr, + res->fs_locations->fattr, NULL, res->fs_locations, res->fs_locations->server); } diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 4a8ba84f848e..0e3aa0f5f324 100644 --- 
a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1222,7 +1222,7 @@ struct nfs4_fs_location { #define NFS4_FS_LOCATIONS_MAXENTRIES 10 struct nfs4_fs_locations { - struct nfs_fattr fattr; + struct nfs_fattr *fattr; const struct nfs_server *server; struct nfs4_pathname fs_path; int nlocations; From 11270e7ca268e8d61b5d9e5c3a54bd1550642c9c Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Sun, 22 May 2022 20:36:48 +0800 Subject: [PATCH 328/334] xprtrdma: treat all calls not a bcall when bc_serv is NULL When a rdma server returns a fault format reply, nfs v3 client may treats it as a bcall when bc service is not exist. The debug message at rpcrdma_bc_receive_call are, [56579.837169] RPC: rpcrdma_bc_receive_call: callback XID 00000001, length=20 [56579.837174] RPC: rpcrdma_bc_receive_call: 00 00 00 01 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 04 After that, rpcrdma_bc_receive_call will meets NULL pointer as, [ 226.057890] BUG: unable to handle kernel NULL pointer dereference at 00000000000000c8 ... [ 226.058704] RIP: 0010:_raw_spin_lock+0xc/0x20 ... [ 226.059732] Call Trace: [ 226.059878] rpcrdma_bc_receive_call+0x138/0x327 [rpcrdma] [ 226.060011] __ib_process_cq+0x89/0x170 [ib_core] [ 226.060092] ib_cq_poll_work+0x26/0x80 [ib_core] [ 226.060257] process_one_work+0x1a7/0x360 [ 226.060367] ? create_worker+0x1a0/0x1a0 [ 226.060440] worker_thread+0x30/0x390 [ 226.060500] ? create_worker+0x1a0/0x1a0 [ 226.060574] kthread+0x116/0x130 [ 226.060661] ? kthread_flush_work_fn+0x10/0x10 [ 226.060724] ret_from_fork+0x35/0x40 ... Signed-off-by: Kinglong Mee Reviewed-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/rpc_rdma.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 281ddb87ac8d..190a4de239c8 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -1121,6 +1121,7 @@ static bool rpcrdma_is_bcall(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep) #if defined(CONFIG_SUNRPC_BACKCHANNEL) { + struct rpc_xprt *xprt = &r_xprt->rx_xprt; struct xdr_stream *xdr = &rep->rr_stream; __be32 *p; @@ -1144,6 +1145,10 @@ rpcrdma_is_bcall(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep) if (*p != cpu_to_be32(RPC_CALL)) return false; + /* No bc service. */ + if (xprt->bc_serv == NULL) + return false; + /* Now that we are sure this is a backchannel call, * advance to the RPC header. */ From 118f09eda21d392e1eeb9f8a4bee044958cccf20 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Wed, 25 May 2022 12:12:59 -0400 Subject: [PATCH 329/334] NFSv4.1 mark qualified async operations as MOVEABLE tasks Mark async operations such as RENAME, REMOVE, COMMIT MOVEABLE for the nfsv4.1+ sessions. 
Fixes: 85e39feead948 ("NFSv4.1 identify and mark RPC tasks that can move between transports") Signed-off-by: Olga Kornievskaia Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 26 ++++++++++++++------------ fs/nfs/pagelist.c | 3 +++ fs/nfs/unlink.c | 8 ++++++++ fs/nfs/write.c | 4 ++++ include/linux/nfs_fs_sb.h | 1 + 5 files changed, 30 insertions(+), 12 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 337609befd17..c0fdcf8c0032 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1162,7 +1162,7 @@ static int nfs4_call_sync_sequence(struct rpc_clnt *clnt, { unsigned short task_flags = 0; - if (server->nfs_client->cl_minorversion) + if (server->caps & NFS_CAP_MOVEABLE) task_flags = RPC_TASK_MOVEABLE; return nfs4_do_call_sync(clnt, server, msg, args, res, task_flags); } @@ -2568,7 +2568,7 @@ static int nfs4_run_open_task(struct nfs4_opendata *data, }; int status; - if (server->nfs_client->cl_minorversion) + if (nfs_server_capable(dir, NFS_CAP_MOVEABLE)) task_setup_data.flags |= RPC_TASK_MOVEABLE; kref_get(&data->kref); @@ -3737,7 +3737,7 @@ int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait) }; int status = -ENOMEM; - if (server->nfs_client->cl_minorversion) + if (nfs_server_capable(state->inode, NFS_CAP_MOVEABLE)) task_setup_data.flags |= RPC_TASK_MOVEABLE; nfs4_state_protect(server->nfs_client, NFS_SP4_MACH_CRED_CLEANUP, @@ -4407,7 +4407,7 @@ static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, }; unsigned short task_flags = 0; - if (server->nfs_client->cl_minorversion) + if (nfs_server_capable(dir, NFS_CAP_MOVEABLE)) task_flags = RPC_TASK_MOVEABLE; /* Is this is an attribute revalidation, subject to softreval? */ @@ -6639,10 +6639,13 @@ static int _nfs4_proc_delegreturn(struct inode *inode, const struct cred *cred, .rpc_client = server->client, .rpc_message = &msg, .callback_ops = &nfs4_delegreturn_ops, - .flags = RPC_TASK_ASYNC | RPC_TASK_TIMEOUT | RPC_TASK_MOVEABLE, + .flags = RPC_TASK_ASYNC | RPC_TASK_TIMEOUT, }; int status = 0; + if (nfs_server_capable(inode, NFS_CAP_MOVEABLE)) + task_setup_data.flags |= RPC_TASK_MOVEABLE; + data = kzalloc(sizeof(*data), GFP_KERNEL); if (data == NULL) return -ENOMEM; @@ -6956,10 +6959,8 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl, .workqueue = nfsiod_workqueue, .flags = RPC_TASK_ASYNC, }; - struct nfs_client *client = - NFS_SERVER(lsp->ls_state->inode)->nfs_client; - if (client->cl_minorversion) + if (nfs_server_capable(lsp->ls_state->inode, NFS_CAP_MOVEABLE)) task_setup_data.flags |= RPC_TASK_MOVEABLE; nfs4_state_protect(NFS_SERVER(lsp->ls_state->inode)->nfs_client, @@ -7230,9 +7231,8 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF, }; int ret; - struct nfs_client *client = NFS_SERVER(state->inode)->nfs_client; - if (client->cl_minorversion) + if (nfs_server_capable(state->inode, NFS_CAP_MOVEABLE)) task_setup_data.flags |= RPC_TASK_MOVEABLE; data = nfs4_alloc_lockdata(fl, nfs_file_open_context(fl->fl_file), @@ -10467,7 +10467,8 @@ static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = { | NFS_CAP_POSIX_LOCK | NFS_CAP_STATEID_NFSV41 | NFS_CAP_ATOMIC_OPEN_V1 - | NFS_CAP_LGOPEN, + | NFS_CAP_LGOPEN + | NFS_CAP_MOVEABLE, .init_client = nfs41_init_client, .shutdown_client = nfs41_shutdown_client, .match_stateid = nfs41_match_stateid, @@ -10502,7 +10503,8 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = { | NFS_CAP_LAYOUTSTATS | NFS_CAP_CLONE | NFS_CAP_LAYOUTERROR - | 
NFS_CAP_READ_PLUS, + | NFS_CAP_READ_PLUS + | NFS_CAP_MOVEABLE, .init_client = nfs41_init_client, .shutdown_client = nfs41_shutdown_client, .match_stateid = nfs41_match_stateid, diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 9157dd19b8b4..317cedfa52bf 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -767,6 +767,9 @@ int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr, .flags = RPC_TASK_ASYNC | flags, }; + if (nfs_server_capable(hdr->inode, NFS_CAP_MOVEABLE)) + task_setup_data.flags |= RPC_TASK_MOVEABLE; + hdr->rw_ops->rw_initiate(hdr, &msg, rpc_ops, &task_setup_data, how); dprintk("NFS: initiated pgio call " diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index 6f325e10056c..9697cd5d2561 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -102,6 +102,10 @@ static void nfs_do_call_unlink(struct inode *inode, struct nfs_unlinkdata *data) }; struct rpc_task *task; struct inode *dir = d_inode(data->dentry->d_parent); + + if (nfs_server_capable(inode, NFS_CAP_MOVEABLE)) + task_setup_data.flags |= RPC_TASK_MOVEABLE; + nfs_sb_active(dir->i_sb); data->args.fh = NFS_FH(dir); nfs_fattr_init(data->res.dir_attr); @@ -344,6 +348,10 @@ nfs_async_rename(struct inode *old_dir, struct inode *new_dir, .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF, }; + if (nfs_server_capable(old_dir, NFS_CAP_MOVEABLE) && + nfs_server_capable(new_dir, NFS_CAP_MOVEABLE)) + task_setup_data.flags |= RPC_TASK_MOVEABLE; + data = kzalloc(sizeof(*data), GFP_KERNEL); if (data == NULL) return ERR_PTR(-ENOMEM); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 2f41659e232e..1c706465d090 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1709,6 +1709,10 @@ int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data, .flags = RPC_TASK_ASYNC | flags, .priority = priority, }; + + if (nfs_server_capable(data->inode, NFS_CAP_MOVEABLE)) + task_setup_data.flags |= RPC_TASK_MOVEABLE; + /* Set up the initial task struct. */ nfs_ops->commit_setup(data, &msg, &task_setup_data.rpc_client); trace_nfs_initiate_commit(data); diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 157d2bd6b241..ea2f7e6b1b0b 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -287,4 +287,5 @@ struct nfs_server { #define NFS_CAP_XATTR (1U << 28) #define NFS_CAP_READ_PLUS (1U << 29) #define NFS_CAP_FS_LOCATIONS (1U << 30) +#define NFS_CAP_MOVEABLE (1U << 31) #endif From 17eabd42560f4636648ad65ba5b20228071e2363 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 31 May 2022 09:30:40 +0100 Subject: [PATCH 330/334] afs: Fix infinite loop found by xfstest generic/676 In AFS, a directory is handled as a file that the client downloads and parses locally for the purposes of performing lookup and getdents operations. The in-kernel afs filesystem has a number of functions that do this. A directory file is arranged as a series of 2K blocks divided into 32-byte slots, where a directory entry occupies one or more slots, plus each block starts with one or more metadata blocks. When parsing a block, if the last slots are occupied by a dirent that occupies more than a single slot and the file position points at a slot that's not the initial one, the logic in afs_dir_iterate_block() that skips over it won't advance the file pointer to the end of it. This will cause an infinite loop in getdents() as it will keep retrying that block and failing to advance beyond the final entry. Fix this by advancing the file pointer if the next entry will be beyond it when we skip a block. 
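A standalone toy model of that skip logic (not the kernel code; the slot counts, the toy_dirent layout and the iterate_block() helper below are invented purely for illustration) shows how leaving the position inside a multi-slot entry stalls iteration, and how advancing it past the entry's last slot lets the caller move on:

    #include <stdio.h>

    #define SLOT_SIZE        32
    #define SLOTS_PER_BLOCK  64   /* 2KB block / 32-byte slots */

    struct toy_dirent {
            long first_slot;      /* slot where the entry starts */
            long nr_slots;        /* number of 32-byte slots it occupies */
    };

    /* Scan one block; return the position (in bytes) to resume from. */
    static long iterate_block(long blkoff, long pos,
                              const struct toy_dirent *ents, int nr_ents)
    {
            long curr = (pos - blkoff) / SLOT_SIZE;
            int i;

            for (i = 0; i < nr_ents; i++) {
                    long offset = ents[i].first_slot;
                    long next = offset + ents[i].nr_slots;

                    if (offset < curr) {
                            /* The fix: if the skipped entry extends past the
                             * current position, move the position beyond its
                             * last slot instead of leaving it stuck inside. */
                            if (next > curr)
                                    pos = blkoff + next * SLOT_SIZE;
                            continue;
                    }
                    printf("emit entry at slot %ld (%ld slots)\n",
                           offset, ents[i].nr_slots);
                    pos = blkoff + next * SLOT_SIZE;
            }
            return pos;
    }

    int main(void)
    {
            /* Last entry spans slots 62..63; start mid-entry, at slot 63. */
            const struct toy_dirent ents[] = { { 1, 2 }, { 62, 2 } };
            long pos = 63 * SLOT_SIZE;

            pos = iterate_block(0, pos, ents, 2);
            printf("resume at %ld, past the block: %s\n", pos,
                   pos >= (long)SLOTS_PER_BLOCK * SLOT_SIZE ? "yes" : "no");
            return 0;
    }

Without the inner "if (next > curr)" advance, the returned position stays inside the final entry, so the caller keeps retrying the same block, which is the infinite loop described above.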
This was found by the generic/676 xfstest but can also be triggered with something like: ~/xfstests-dev/src/t_readdir_3 /xfstest.test/z 4000 1 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: David Howells Reviewed-by: Marc Dionne Tested-by: Marc Dionne cc: linux-afs@lists.infradead.org Link: http://lore.kernel.org/r/165391973497.110268.2939296942213894166.stgit@warthog.procyon.org.uk/ Signed-off-by: Linus Torvalds --- fs/afs/dir.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 94aa7356248e..79f6b74336d2 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -463,8 +463,11 @@ static int afs_dir_iterate_block(struct afs_vnode *dvnode, } /* skip if starts before the current position */ - if (offset < curr) + if (offset < curr) { + if (next > curr) + ctx->pos = blkoff + next * sizeof(union afs_xdr_dirent); continue; + } /* found the next entry */ if (!dir_emit(ctx, dire->u.name, nlen, From 66ed42caf286a7aaeb6f1db4b2995dd9416226c2 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Wed, 1 Jun 2022 09:42:12 +0200 Subject: [PATCH 331/334] MAINTAINERS: rectify entries for some i3c drivers after dt conversion Commit 4bd69ecfa672 ("dt-bindings: i3c: Convert cdns,i3c-master to DT schema") and commit 6742ca620bd9 ("dt-bindings: i3c: Convert snps,dw-i3c-master to DT schema") convert some i3c dt-bindings to yaml, but miss to adjust its reference in MAINTAINERS. Hence, ./scripts/get_maintainer.pl --self-test=patterns complains about broken references. Repair these file references in I3C DRIVER FOR CADENCE I3C MASTER IP and I3C DRIVER FOR SYNOPSYS DESIGNWARE. Signed-off-by: Lukas Bulwahn Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20220601074212.19984-1-lukas.bulwahn@gmail.com --- MAINTAINERS | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index fd768d43e048..e88feefb903c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9264,13 +9264,13 @@ F: drivers/i2c/i2c-stub.c I3C DRIVER FOR CADENCE I3C MASTER IP M: Przemysław Gaj S: Maintained -F: Documentation/devicetree/bindings/i3c/cdns,i3c-master.txt +F: Documentation/devicetree/bindings/i3c/cdns,i3c-master.yaml F: drivers/i3c/master/i3c-master-cdns.c I3C DRIVER FOR SYNOPSYS DESIGNWARE M: Vitor Soares S: Maintained -F: Documentation/devicetree/bindings/i3c/snps,dw-i3c-master.txt +F: Documentation/devicetree/bindings/i3c/snps,dw-i3c-master.yaml F: drivers/i3c/master/dw* I3C SUBSYSTEM From f78e3d407a339ffdd2620140300f821ea41118f4 Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Wed, 25 May 2022 22:14:59 -0300 Subject: [PATCH 332/334] rtc: mxc: Silence a clang warning Change the of_device_get_match_data() cast to (uintptr_t) to silence the following clang warning: drivers/rtc/rtc-mxc.c:315:19: warning: cast to smaller integer type 'enum imx_rtc_type' from 'const void *' [-Wvoid-pointer-to-enum-cast] Reported-by: kernel test robot Fixes: ba7aa63000f2 ("rtc: mxc: use of_device_get_match_data") Signed-off-by: Fabio Estevam Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20220526011459.1167197-1-festevam@gmail.com --- drivers/rtc/rtc-mxc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/rtc/rtc-mxc.c b/drivers/rtc/rtc-mxc.c index 0f08f22df869..53d4e253e81f 100644 --- a/drivers/rtc/rtc-mxc.c +++ b/drivers/rtc/rtc-mxc.c @@ -311,7 +311,7 @@ static int mxc_rtc_probe(struct platform_device *pdev) if (!pdata) return -ENOMEM; - pdata->devtype = (enum imx_rtc_type)of_device_get_match_data(&pdev->dev); + 
pdata->devtype = (uintptr_t)of_device_get_match_data(&pdev->dev); pdata->ioaddr = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(pdata->ioaddr)) From d1dc87763f406d4e67caf16dbe438a5647692395 Mon Sep 17 00:00:00 2001 From: Stephen Brennan Date: Thu, 19 May 2022 09:50:30 +0100 Subject: [PATCH 333/334] assoc_array: Fix BUG_ON during garbage collect A rare BUG_ON triggered in assoc_array_gc: [3430308.818153] kernel BUG at lib/assoc_array.c:1609! Which corresponded to the statement currently at line 1593 upstream: BUG_ON(assoc_array_ptr_is_meta(p)); Using the data from the core dump, I was able to generate a userspace reproducer[1] and determine the cause of the bug. [1]: https://github.com/brenns10/kernel_stuff/tree/master/assoc_array_gc After running the iterator on the entire branch, an internal tree node looked like the following: NODE (nr_leaves_on_branch: 3) SLOT [0] NODE (2 leaves) SLOT [1] NODE (1 leaf) SLOT [2..f] NODE (empty) In the userspace reproducer, the pr_devel output when compressing this node was: -- compress node 0x5607cc089380 -- free=0, leaves=0 [0] retain node 2/1 [nx 0] [1] fold node 1/1 [nx 0] [2] fold node 0/1 [nx 2] [3] fold node 0/2 [nx 2] [4] fold node 0/3 [nx 2] [5] fold node 0/4 [nx 2] [6] fold node 0/5 [nx 2] [7] fold node 0/6 [nx 2] [8] fold node 0/7 [nx 2] [9] fold node 0/8 [nx 2] [10] fold node 0/9 [nx 2] [11] fold node 0/10 [nx 2] [12] fold node 0/11 [nx 2] [13] fold node 0/12 [nx 2] [14] fold node 0/13 [nx 2] [15] fold node 0/14 [nx 2] after: 3 At slot 0, an internal node with 2 leaves could not be folded into the node, because there was only one available slot (slot 0). Thus, the internal node was retained. At slot 1, the node had one leaf, and was able to be folded in successfully. The remaining nodes had no leaves, and so were removed. By the end of the compression stage, there were 14 free slots, and only 3 leaf nodes. The tree was ascended and then its parent node was compressed. When this node was seen, it could not be folded, due to the internal node it contained. The invariant for compression in this function is: whenever nr_leaves_on_branch < ASSOC_ARRAY_FAN_OUT, the node should contain all leaf nodes. The compression step currently cannot guarantee this, given the corner case shown above. To fix this issue, retry compression whenever we have retained a node, and yet nr_leaves_on_branch < ASSOC_ARRAY_FAN_OUT. This second compression will then allow the node in slot 1 to be folded in, satisfying the invariant. Below is the output of the reproducer once the fix is applied: -- compress node 0x560e9c562380 -- free=0, leaves=0 [0] retain node 2/1 [nx 0] [1] fold node 1/1 [nx 0] [2] fold node 0/1 [nx 2] [3] fold node 0/2 [nx 2] [4] fold node 0/3 [nx 2] [5] fold node 0/4 [nx 2] [6] fold node 0/5 [nx 2] [7] fold node 0/6 [nx 2] [8] fold node 0/7 [nx 2] [9] fold node 0/8 [nx 2] [10] fold node 0/9 [nx 2] [11] fold node 0/10 [nx 2] [12] fold node 0/11 [nx 2] [13] fold node 0/12 [nx 2] [14] fold node 0/13 [nx 2] [15] fold node 0/14 [nx 2] internal nodes remain despite enough space, retrying -- compress node 0x560e9c562380 -- free=14, leaves=1 [0] fold node 2/15 [nx 0] after: 3 Changes ======= DH: - Use false instead of 0. - Reorder the inserted lines in a couple of places to put retained before next_slot. 
ver #2) - Fix typo in pr_devel, correct comparison to "<=" Fixes: 3cb989501c26 ("Add a generic associative array implementation.") Cc: Signed-off-by: Stephen Brennan Signed-off-by: David Howells cc: Andrew Morton cc: keyrings@vger.kernel.org Link: https://lore.kernel.org/r/20220511225517.407935-1-stephen.s.brennan@oracle.com/ # v1 Link: https://lore.kernel.org/r/20220512215045.489140-1-stephen.s.brennan@oracle.com/ # v2 Reviewed-by: Jarkko Sakkinen Signed-off-by: Linus Torvalds --- lib/assoc_array.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/lib/assoc_array.c b/lib/assoc_array.c index 079c72e26493..ca0b4f360c1a 100644 --- a/lib/assoc_array.c +++ b/lib/assoc_array.c @@ -1461,6 +1461,7 @@ int assoc_array_gc(struct assoc_array *array, struct assoc_array_ptr *cursor, *ptr; struct assoc_array_ptr *new_root, *new_parent, **new_ptr_pp; unsigned long nr_leaves_on_tree; + bool retained; int keylen, slot, nr_free, next_slot, i; pr_devel("-->%s()\n", __func__); @@ -1536,6 +1537,7 @@ continue_node: goto descend; } +retry_compress: pr_devel("-- compress node %p --\n", new_n); /* Count up the number of empty slots in this node and work out the @@ -1553,6 +1555,7 @@ continue_node: pr_devel("free=%d, leaves=%lu\n", nr_free, new_n->nr_leaves_on_branch); /* See what we can fold in */ + retained = false; next_slot = 0; for (slot = 0; slot < ASSOC_ARRAY_FAN_OUT; slot++) { struct assoc_array_shortcut *s; @@ -1602,9 +1605,14 @@ continue_node: pr_devel("[%d] retain node %lu/%d [nx %d]\n", slot, child->nr_leaves_on_branch, nr_free + 1, next_slot); + retained = true; } } + if (retained && new_n->nr_leaves_on_branch <= ASSOC_ARRAY_FAN_OUT) { + pr_devel("internal nodes remain despite enough space, retrying\n"); + goto retry_compress; + } pr_devel("after: %lu\n", new_n->nr_leaves_on_branch); nr_leaves_on_tree = new_n->nr_leaves_on_branch; From 690b2549b19563ec5ad53e5c82f6a944d910086e Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 2 Jun 2022 14:02:18 +0300 Subject: [PATCH 334/334] i2c: ismt: prevent memory corruption in ismt_access() The "data->block[0]" variable comes from the user and is a number between 0-255. It needs to be capped to prevent writing beyond the end of dma_buffer[]. Fixes: 5e9a97b1f449 ("i2c: ismt: Adding support for I2C_SMBUS_BLOCK_PROC_CALL") Reported-and-tested-by: Zheyu Ma Signed-off-by: Dan Carpenter Reviewed-by: Mika Westerberg Signed-off-by: Linus Torvalds --- drivers/i2c/busses/i2c-ismt.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/i2c/busses/i2c-ismt.c b/drivers/i2c/busses/i2c-ismt.c index c16157ee8c52..6078fa0c0d48 100644 --- a/drivers/i2c/busses/i2c-ismt.c +++ b/drivers/i2c/busses/i2c-ismt.c @@ -528,6 +528,9 @@ static int ismt_access(struct i2c_adapter *adap, u16 addr, case I2C_SMBUS_BLOCK_PROC_CALL: dev_dbg(dev, "I2C_SMBUS_BLOCK_PROC_CALL\n"); + if (data->block[0] > I2C_SMBUS_BLOCK_MAX) + return -EINVAL; + dma_size = I2C_SMBUS_BLOCK_MAX; desc->tgtaddr_rw = ISMT_DESC_ADDR_RW(addr, 1); desc->wr_len_cmd = data->block[0] + 1;
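A standalone sketch of why the user-supplied length must be capped before the copy (not the driver code; the copy_block() helper, the local buffer sizes and the test values are invented purely for illustration, and -1 stands in for the driver's -EINVAL):

    #include <stdint.h>
    #include <string.h>
    #include <stdio.h>

    #define I2C_SMBUS_BLOCK_MAX 32

    static int copy_block(uint8_t *dma_buffer, const uint8_t *block)
    {
            size_t len = block[0];           /* user controlled, 0..255 */

            if (len > I2C_SMBUS_BLOCK_MAX)   /* the added bounds check */
                    return -1;

            memcpy(dma_buffer, &block[1], len);
            return (int)len;
    }

    int main(void)
    {
            uint8_t dma_buffer[I2C_SMBUS_BLOCK_MAX];
            uint8_t bad[256] = { 200 };      /* claims 200 bytes */
            uint8_t ok[33]   = { 32 };       /* claims 32 bytes  */

            printf("bad request: %d\n", copy_block(dma_buffer, bad)); /* rejected */
            printf("ok request:  %d\n", copy_block(dma_buffer, ok));  /* copied   */
            return 0;
    }

Without the length check, a length byte of up to 255 would drive the copy well past the end of the fixed-size DMA buffer, which is the memory corruption the patch prevents.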