From 743cdb7bd0f1cb32c03680c8b38257957db2e692 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 19 May 2022 17:45:21 +1000 Subject: [PATCH 01/79] powerpc/kasan: Mark more real-mode code as not to be instrumented This marks more files and functions that can possibly be called in real mode as not to be instrumented by KASAN. Most were found by inspection, except for get_pseries_errorlog() which was reported as causing a crash in testing. Reported-by: Nageswara R Sastry Signed-off-by: Paul Mackerras Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/YoX1kZPnmUX4RZEK@cleo --- arch/powerpc/kernel/Makefile | 2 ++ arch/powerpc/kernel/rtas.c | 4 ++-- arch/powerpc/kexec/crash.c | 2 +- arch/powerpc/platforms/powernv/Makefile | 1 + 4 files changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 2e2a2a9bcf43..f91f0f29a566 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -37,6 +37,8 @@ KASAN_SANITIZE_paca.o := n KASAN_SANITIZE_setup_64.o := n KASAN_SANITIZE_mce.o := n KASAN_SANITIZE_mce_power.o := n +KASAN_SANITIZE_udbg.o := n +KASAN_SANITIZE_udbg_16550.o := n # we have to be particularly careful in ppc64 to exclude code that # runs with translations off, as we cannot access the shadow with diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c index 9bb43aa53d43..a6fce3106e02 100644 --- a/arch/powerpc/kernel/rtas.c +++ b/arch/powerpc/kernel/rtas.c @@ -993,8 +993,8 @@ int rtas_call_reentrant(int token, int nargs, int nret, int *outputs, ...) * * Return: A pointer to the specified errorlog or NULL if not found. */ -struct pseries_errorlog *get_pseries_errorlog(struct rtas_error_log *log, - uint16_t section_id) +noinstr struct pseries_errorlog *get_pseries_errorlog(struct rtas_error_log *log, + uint16_t section_id) { struct rtas_ext_event_log_v6 *ext_log = (struct rtas_ext_event_log_v6 *)log->buffer; diff --git a/arch/powerpc/kexec/crash.c b/arch/powerpc/kexec/crash.c index d85fa9fc6f3c..80f54723cf6d 100644 --- a/arch/powerpc/kexec/crash.c +++ b/arch/powerpc/kexec/crash.c @@ -224,7 +224,7 @@ void crash_kexec_secondary(struct pt_regs *regs) /* wait for all the CPUs to hit real mode but timeout if they don't come in */ #if defined(CONFIG_SMP) && defined(CONFIG_PPC64) -static void __maybe_unused crash_kexec_wait_realmode(int cpu) +noinstr static void __maybe_unused crash_kexec_wait_realmode(int cpu) { unsigned int msecs; int i; diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile index 6488b3842199..19f0fc5c6f1b 100644 --- a/arch/powerpc/platforms/powernv/Makefile +++ b/arch/powerpc/platforms/powernv/Makefile @@ -4,6 +4,7 @@ # in particular, idle code runs a bunch of things in real mode KASAN_SANITIZE_idle.o := n KASAN_SANITIZE_pci-ioda.o := n +KASAN_SANITIZE_pci-ioda-tce.o := n # pnv_machine_check_early KASAN_SANITIZE_setup.o := n From a1b29ba2f2c171b9bea73be993bfdf0a62d37d15 Mon Sep 17 00:00:00 2001 From: He Ying Date: Thu, 20 Jan 2022 20:44:18 -0500 Subject: [PATCH 02/79] powerpc/kasan: Silence KASAN warnings in __get_wchan() The following KASAN warning was reported in our kernel. BUG: KASAN: stack-out-of-bounds in get_wchan+0x188/0x250 Read of size 4 at addr d216f958 by task ps/14437 CPU: 3 PID: 14437 Comm: ps Tainted: G O 5.10.0 #1 Call Trace: [daa63858] [c0654348] dump_stack+0x9c/0xe4 (unreliable) [daa63888] [c035cf0c] print_address_description.constprop.3+0x8c/0x570 [daa63908] [c035d6bc] kasan_report+0x1ac/0x218 [daa63948] [c00496e8] get_wchan+0x188/0x250 [daa63978] [c0461ec8] do_task_stat+0xce8/0xe60 [daa63b98] [c0455ac8] proc_single_show+0x98/0x170 [daa63bc8] [c03cab8c] seq_read_iter+0x1ec/0x900 [daa63c38] [c03cb47c] seq_read+0x1dc/0x290 [daa63d68] [c037fc94] vfs_read+0x164/0x510 [daa63ea8] [c03808e4] ksys_read+0x144/0x1d0 [daa63f38] [c005b1dc] ret_from_syscall+0x0/0x38 --- interrupt: c00 at 0x8fa8f4 LR = 0x8fa8cc The buggy address belongs to the page: page:98ebcdd2 refcount:0 mapcount:0 mapping:00000000 index:0x2 pfn:0x1216f flags: 0x0() raw: 00000000 00000000 01010122 00000000 00000002 00000000 ffffffff 00000000 raw: 00000000 page dumped because: kasan: bad access detected Memory state around the buggy address: d216f800: 00 00 00 00 00 f1 f1 f1 f1 00 00 00 00 00 00 00 d216f880: f2 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 >d216f900: 00 00 00 00 00 00 00 00 00 00 00 f1 f1 f1 f1 00 ^ d216f980: f2 f2 f2 f2 f2 f2 f2 00 00 00 00 00 00 00 00 00 d216fa00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 After looking into this issue, I find the buggy address belongs to the task stack region. It seems KASAN has something wrong. I look into the code of __get_wchan in x86 architecture and find the same issue has been resolved by the commit f7d27c35ddff ("x86/mm, kasan: Silence KASAN warnings in get_wchan()"). The solution could be applied to powerpc architecture too. As Andrey Ryabinin said, get_wchan() is racy by design, it may access volatile stack of running task, thus it may access redzone in a stack frame and cause KASAN to warn about this. Use READ_ONCE_NOCHECK() to silence these warnings. Reported-by: Wanming Hu Signed-off-by: He Ying Signed-off-by: Chen Jingwen Reviewed-by: Kees Cook Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220121014418.155675-1-heying24@huawei.com --- arch/powerpc/kernel/process.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index d00b20c65966..ca4d97688da9 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -2157,12 +2157,12 @@ static unsigned long ___get_wchan(struct task_struct *p) return 0; do { - sp = *(unsigned long *)sp; + sp = READ_ONCE_NOCHECK(*(unsigned long *)sp); if (!validate_sp(sp, p, STACK_FRAME_OVERHEAD) || task_is_running(p)) return 0; if (count > 0) { - ip = ((unsigned long *)sp)[STACK_FRAME_LR_SAVE]; + ip = READ_ONCE_NOCHECK(((unsigned long *)sp)[STACK_FRAME_LR_SAVE]); if (!in_sched_functions(ip)) return ip; } From 1346d00e1bdfd4067f92bc14e8a6131a01de4190 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 25 May 2022 13:26:39 +1000 Subject: [PATCH 03/79] powerpc: Don't select HAVE_IRQ_EXIT_ON_IRQ_STACK The HAVE_IRQ_EXIT_ON_IRQ_STACK option tells generic code that irq_exit() is called while still running on the hard irq stack (hardirq_ctx[] in the powerpc code). Selecting the option means the generic code will *not* switch to the softirq stack before running softirqs, because the code is already running on the (mostly empty) hard irq stack. But since commit 1b1b6a6f4cc0 ("powerpc: handle irq_enter/irq_exit in interrupt handler wrappers"), irq_exit() is now called on the regular task stack, not the hard irq stack. That's because previously irq_exit() was called in __do_irq() which is run on the hard irq stack, but now it is called in interrupt_async_exit_prepare() which is called from do_irq() constructed by the wrapper macro, which is after the switch back to the task stack. So drop HAVE_IRQ_EXIT_ON_IRQ_STACK from the Kconfig. This will mean an extra stack switch when processing some interrupts, but should significantly reduce the likelihood of stack overflow. It also means the softirq stack will be used for running softirqs from other interrupts that don't use the hard irq stack, eg. timer interrupts. Fixes: 1b1b6a6f4cc0 ("powerpc: handle irq_enter/irq_exit in interrupt handler wrappers") Cc: stable@vger.kernel.org # v5.12+ Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220525032639.1947280-1-mpe@ellerman.id.au --- arch/powerpc/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 3eaddb8997a9..54dbbb1d4b36 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -223,7 +223,6 @@ config PPC select HAVE_HARDLOCKUP_DETECTOR_PERF if PERF_EVENTS && HAVE_PERF_EVENTS_NMI && !HAVE_HARDLOCKUP_DETECTOR_ARCH select HAVE_HW_BREAKPOINT if PERF_EVENTS && (PPC_BOOK3S || PPC_8xx) select HAVE_IOREMAP_PROT - select HAVE_IRQ_EXIT_ON_IRQ_STACK select HAVE_IRQ_TIME_ACCOUNTING select HAVE_KERNEL_GZIP select HAVE_KERNEL_LZMA if DEFAULT_UIMAGE From 07bf9431b1590d1cd7a8d62075d0b50b073f0495 Mon Sep 17 00:00:00 2001 From: Vaibhav Jain Date: Tue, 24 May 2022 16:53:53 +0530 Subject: [PATCH 04/79] powerpc/papr_scm: don't requests stats with '0' sized stats buffer Sachin reported [1] that on a POWER-10 lpar he is seeing a kernel panic being reported with vPMEM when papr_scm probe is being called. The panic is of the form below and is observed only with following option disabled(profile) for the said LPAR 'Enable Performance Information Collection' in the HMC: Kernel attempted to write user page (1c) - exploit attempt? (uid: 0) BUG: Kernel NULL pointer dereference on write at 0x0000001c Faulting instruction address: 0xc008000001b90844 Oops: Kernel access of bad area, sig: 11 [#1] NIP [c008000001b90844] drc_pmem_query_stats+0x5c/0x270 [papr_scm] LR [c008000001b92794] papr_scm_probe+0x2ac/0x6ec [papr_scm] Call Trace: 0xc00000000941bca0 (unreliable) papr_scm_probe+0x2ac/0x6ec [papr_scm] platform_probe+0x98/0x150 really_probe+0xfc/0x510 __driver_probe_device+0x17c/0x230 ---[ end trace 0000000000000000 ]--- Kernel panic - not syncing: Fatal exception On investigation looks like this panic was caused due to a 'stat_buffer' of size==0 being provided to drc_pmem_query_stats() to fetch all performance stats-ids of an NVDIMM. However drc_pmem_query_stats() shouldn't have been called since the vPMEM NVDIMM doesn't support and performance stat-id's. This was caused due to missing check for 'p->stat_buffer_len' at the beginning of papr_scm_pmu_check_events() which indicates that the NVDIMM doesn't support performance-stats. Fix this by introducing the check for 'p->stat_buffer_len' at the beginning of papr_scm_pmu_check_events(). [1] https://lore.kernel.org/all/6B3A522A-6A5F-4CC9-B268-0C63AA6E07D3@linux.ibm.com Fixes: 0e0946e22f3665d2732 ("powerpc/papr_scm: Fix leaking nvdimm_events_map elements") Reported-by: Sachin Sant Signed-off-by: Vaibhav Jain Tested-by: Sachin Sant Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220524112353.1718454-1-vaibhav@linux.ibm.com --- arch/powerpc/platforms/pseries/papr_scm.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c index 181b855b3050..82cae08976bc 100644 --- a/arch/powerpc/platforms/pseries/papr_scm.c +++ b/arch/powerpc/platforms/pseries/papr_scm.c @@ -465,6 +465,9 @@ static int papr_scm_pmu_check_events(struct papr_scm_priv *p, struct nvdimm_pmu u32 available_events; int index, rc = 0; + if (!p->stat_buffer_len) + return -ENOENT; + available_events = (p->stat_buffer_len - sizeof(struct papr_scm_perf_stats)) / sizeof(struct papr_scm_perf_stat); if (available_events == 0) From 291e7d52d19f114cad6cbf802f3f19ef12a011f8 Mon Sep 17 00:00:00 2001 From: Ben Chuang Date: Fri, 20 May 2022 19:42:42 +0800 Subject: [PATCH 05/79] mmc: sdhci-pci-gli: Fix GL9763E runtime PM when the system resumes from suspend When the system resumes from suspend (S3 or S4), the power mode is MMC_POWER_OFF. In this status, gl9763e_runtime_resume() should not enable PLL. Add a condition to this function to enable PLL only when the power mode is MMC_POWER_ON. Fixes: d607667bb8fa (mmc: sdhci-pci-gli: Add runtime PM for GL9763E) Signed-off-by: Ben Chuang Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20220520114242.150235-1-benchuanggli@gmail.com Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-pci-gli.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/mmc/host/sdhci-pci-gli.c b/drivers/mmc/host/sdhci-pci-gli.c index 1499a64ec3aa..f13c08db3da5 100644 --- a/drivers/mmc/host/sdhci-pci-gli.c +++ b/drivers/mmc/host/sdhci-pci-gli.c @@ -982,6 +982,9 @@ static int gl9763e_runtime_resume(struct sdhci_pci_chip *chip) struct sdhci_host *host = slot->host; u16 clock; + if (host->mmc->ios.power_mode != MMC_POWER_ON) + return 0; + clock = sdhci_readw(host, SDHCI_CLOCK_CONTROL); clock |= SDHCI_CLOCK_PLL_EN; From 5f92df8ddacb4b97f6865a3bf687f240072f4f68 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 31 May 2022 08:54:02 -0700 Subject: [PATCH 06/79] Input: raspberrypi-ts - add missing HAS_IOMEM dependency Since JOYSTICK_SENSEHAT selects MFD_SIMPLE_MFD_I2C and the latter depends on HAS_IOMEM, and since 'select' does not follow any dependency chains, JOYSTICK_SENSEHAT should also depend on HAS_IOMEM to prevent a kconfig warning and a build error: WARNING: unmet direct dependencies detected for MFD_SIMPLE_MFD_I2C Depends on [n]: HAS_IOMEM [=n] && I2C [=y] Selected by [y]: - JOYSTICK_SENSEHAT [=y] && INPUT_JOYSTICK [=y] && INPUT [=y] && I2C [=y] s390-linux-ld: drivers/mfd/simple-mfd-i2c.o: in function `simple_mfd_i2c_probe': simple-mfd-i2c.c:(.text+0xc8): undefined reference to `devm_mfd_add_devices' Fixes: 41657514c796 ("Input: add Raspberry Pi Sense HAT joystick driver") Signed-off-by: Randy Dunlap Link: https://lore.kernel.org/r/20220531022942.16340-1-rdunlap@infradead.org Signed-off-by: Dmitry Torokhov --- drivers/input/joystick/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/input/joystick/Kconfig b/drivers/input/joystick/Kconfig index 505a032e2786..9dcf3f51f2dd 100644 --- a/drivers/input/joystick/Kconfig +++ b/drivers/input/joystick/Kconfig @@ -402,6 +402,7 @@ config JOYSTICK_N64 config JOYSTICK_SENSEHAT tristate "Raspberry Pi Sense HAT joystick" depends on INPUT && I2C + depends on HAS_IOMEM select MFD_SIMPLE_MFD_I2C help Say Y here if you want to enable the driver for the From a051246b786af7e4a9d9219cc7038a6e8a411531 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Tue, 31 May 2022 20:19:22 +0300 Subject: [PATCH 07/79] mmc: block: Fix CQE recovery reset success The intention of the use of mmc_blk_reset_success() in mmc_blk_cqe_recovery() was to prevent repeated resets when retrying and getting the same error. However, that may not be the case - any amount of time and I/O may pass before another recovery is needed, in which case there would be no reason to deny it the opportunity to recover via a reset if necessary. CQE recovery is expected seldom and failure to recover (if the clear tasks command fails), even more seldom, so it is better to allow the reset always, which can be done by calling mmc_blk_reset_success() always. Fixes: 1e8e55b67030c6 ("mmc: block: Add CQE support") Cc: stable@vger.kernel.org Signed-off-by: Adrian Hunter Link: https://lore.kernel.org/r/20220531171922.76080-1-adrian.hunter@intel.com Signed-off-by: Ulf Hansson --- drivers/mmc/core/block.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c index 1259ca22d625..f4a1281658db 100644 --- a/drivers/mmc/core/block.c +++ b/drivers/mmc/core/block.c @@ -1499,8 +1499,7 @@ void mmc_blk_cqe_recovery(struct mmc_queue *mq) err = mmc_cqe_recovery(host); if (err) mmc_blk_reset(mq->blkdata, host, MMC_BLK_CQE_RECOVERY); - else - mmc_blk_reset_success(mq->blkdata, MMC_BLK_CQE_RECOVERY); + mmc_blk_reset_success(mq->blkdata, MMC_BLK_CQE_RECOVERY); pr_debug("%s: CQE recovery done\n", mmc_hostname(host)); } From 282e5f8fe907dc3f2fbf9f2103b0e62ffc3a68a5 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 1 Jun 2022 10:47:35 +0200 Subject: [PATCH 08/79] netfilter: nat: really support inet nat without l3 address When no l3 address is given, priv->family is set to NFPROTO_INET and the evaluation function isn't called. Call it too so l4-only rewrite can work. Also add a test case for this. Fixes: a33f387ecd5aa ("netfilter: nft_nat: allow to specify layer 4 protocol NAT only") Reported-by: Yi Chen Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_nat.c | 3 +- tools/testing/selftests/netfilter/nft_nat.sh | 43 ++++++++++++++++++++ 2 files changed, 45 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c index 4394df4bc99b..e5fd6995e4bf 100644 --- a/net/netfilter/nft_nat.c +++ b/net/netfilter/nft_nat.c @@ -335,7 +335,8 @@ static void nft_nat_inet_eval(const struct nft_expr *expr, { const struct nft_nat *priv = nft_expr_priv(expr); - if (priv->family == nft_pf(pkt)) + if (priv->family == nft_pf(pkt) || + priv->family == NFPROTO_INET) nft_nat_eval(expr, regs, pkt); } diff --git a/tools/testing/selftests/netfilter/nft_nat.sh b/tools/testing/selftests/netfilter/nft_nat.sh index eb8543b9a5c4..924ecb3f1f73 100755 --- a/tools/testing/selftests/netfilter/nft_nat.sh +++ b/tools/testing/selftests/netfilter/nft_nat.sh @@ -374,6 +374,45 @@ EOF return $lret } +test_local_dnat_portonly() +{ + local family=$1 + local daddr=$2 + local lret=0 + local sr_s + local sr_r + +ip netns exec "$ns0" nft -f /dev/stdin < Date: Wed, 1 Jun 2022 16:00:00 +0200 Subject: [PATCH 09/79] netfilter: nf_tables: use kfree_rcu(ptr, rcu) to release hooks in clean_net path Use kfree_rcu(ptr, rcu) variant instead as described by ae089831ff28 ("netfilter: nf_tables: prefer kfree_rcu(ptr, rcu) variant"). Fixes: f9a43007d3f7 ("netfilter: nf_tables: double hook unregistration in netns path") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 746be13438ef..129d3ebd6ce5 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -7332,7 +7332,7 @@ static void __nft_unregister_flowtable_net_hooks(struct net *net, nf_unregister_net_hook(net, &hook->ops); if (release_netdev) { list_del(&hook->list); - kfree_rcu(hook); + kfree_rcu(hook, rcu); } } } From 479260419fa4cb30e3e5d935a857fbdf0ffdd854 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Wed, 25 May 2022 20:42:04 -0500 Subject: [PATCH 10/79] dt-bindings: mmc: Fix unevaluatedProperties warnings in examples The 'unevaluatedProperties' schema checks is not fully working and doesn't catch some cases where there's a $ref to another schema. A fix is pending, but results in new warnings in examples. Fix the warnings by removing spurious properties or adding a missing property to the schema. Signed-off-by: Rob Herring Link: https://lore.kernel.org/r/20220526014204.2873107-1-robh@kernel.org Signed-off-by: Ulf Hansson --- Documentation/devicetree/bindings/mmc/brcm,sdhci-brcmstb.yaml | 2 -- Documentation/devicetree/bindings/mmc/marvell,xenon-sdhci.yaml | 3 +++ 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/mmc/brcm,sdhci-brcmstb.yaml b/Documentation/devicetree/bindings/mmc/brcm,sdhci-brcmstb.yaml index b672202fff4e..5ecdac9de484 100644 --- a/Documentation/devicetree/bindings/mmc/brcm,sdhci-brcmstb.yaml +++ b/Documentation/devicetree/bindings/mmc/brcm,sdhci-brcmstb.yaml @@ -75,7 +75,6 @@ examples: sd-uhs-sdr104; sdhci,auto-cmd12; interrupts = <0x0 0x26 0x4>; - interrupt-names = "sdio0_0"; clocks = <&scmi_clk 245>; clock-names = "sw_sdio"; }; @@ -94,7 +93,6 @@ examples: non-removable; bus-width = <0x8>; interrupts = <0x0 0x27 0x4>; - interrupt-names = "sdio1_0"; clocks = <&scmi_clk 245>; clock-names = "sw_sdio"; }; diff --git a/Documentation/devicetree/bindings/mmc/marvell,xenon-sdhci.yaml b/Documentation/devicetree/bindings/mmc/marvell,xenon-sdhci.yaml index c79639e9027e..aca1a4a8daea 100644 --- a/Documentation/devicetree/bindings/mmc/marvell,xenon-sdhci.yaml +++ b/Documentation/devicetree/bindings/mmc/marvell,xenon-sdhci.yaml @@ -56,6 +56,9 @@ properties: - const: core - const: axi + interrupts: + maxItems: 1 + marvell,xenon-sdhc-id: $ref: /schemas/types.yaml#/definitions/uint32 minimum: 0 From b6d9014a3335194590abdd2a2471ef5147a67645 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 30 May 2022 18:40:06 +0200 Subject: [PATCH 11/79] netfilter: nf_tables: delete flowtable hooks via transaction list Remove inactive bool field in nft_hook object that was introduced in abadb2f865d7 ("netfilter: nf_tables: delete devices from flowtable"). Move stale flowtable hooks to transaction list instead. Deleting twice the same device does not result in ENOENT. Fixes: abadb2f865d7 ("netfilter: nf_tables: delete devices from flowtable") Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 1 - net/netfilter/nf_tables_api.c | 31 ++++++------------------------- 2 files changed, 6 insertions(+), 26 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 20af9d3557b9..279ae0fff7ad 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1090,7 +1090,6 @@ struct nft_stats { struct nft_hook { struct list_head list; - bool inactive; struct nf_hook_ops ops; struct rcu_head rcu; }; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 129d3ebd6ce5..30588349f96c 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1914,7 +1914,6 @@ static struct nft_hook *nft_netdev_hook_alloc(struct net *net, goto err_hook_dev; } hook->ops.dev = dev; - hook->inactive = false; return hook; @@ -7618,6 +7617,7 @@ static int nft_delflowtable_hook(struct nft_ctx *ctx, { const struct nlattr * const *nla = ctx->nla; struct nft_flowtable_hook flowtable_hook; + LIST_HEAD(flowtable_del_list); struct nft_hook *this, *hook; struct nft_trans *trans; int err; @@ -7633,7 +7633,7 @@ static int nft_delflowtable_hook(struct nft_ctx *ctx, err = -ENOENT; goto err_flowtable_del_hook; } - hook->inactive = true; + list_move(&hook->list, &flowtable_del_list); } trans = nft_trans_alloc(ctx, NFT_MSG_DELFLOWTABLE, @@ -7646,6 +7646,7 @@ static int nft_delflowtable_hook(struct nft_ctx *ctx, nft_trans_flowtable(trans) = flowtable; nft_trans_flowtable_update(trans) = true; INIT_LIST_HEAD(&nft_trans_flowtable_hooks(trans)); + list_splice(&flowtable_del_list, &nft_trans_flowtable_hooks(trans)); nft_flowtable_hook_release(&flowtable_hook); nft_trans_commit_list_add_tail(ctx->net, trans); @@ -7653,13 +7654,7 @@ static int nft_delflowtable_hook(struct nft_ctx *ctx, return 0; err_flowtable_del_hook: - list_for_each_entry(this, &flowtable_hook.list, list) { - hook = nft_hook_list_find(&flowtable->hook_list, this); - if (!hook) - break; - - hook->inactive = false; - } + list_splice(&flowtable_del_list, &flowtable->hook_list); nft_flowtable_hook_release(&flowtable_hook); return err; @@ -8563,17 +8558,6 @@ void nft_chain_del(struct nft_chain *chain) list_del_rcu(&chain->list); } -static void nft_flowtable_hooks_del(struct nft_flowtable *flowtable, - struct list_head *hook_list) -{ - struct nft_hook *hook, *next; - - list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) { - if (hook->inactive) - list_move(&hook->list, hook_list); - } -} - static void nf_tables_module_autoload_cleanup(struct net *net) { struct nftables_pernet *nft_net = nft_pernet(net); @@ -8918,8 +8902,6 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) break; case NFT_MSG_DELFLOWTABLE: if (nft_trans_flowtable_update(trans)) { - nft_flowtable_hooks_del(nft_trans_flowtable(trans), - &nft_trans_flowtable_hooks(trans)); nf_tables_flowtable_notify(&trans->ctx, nft_trans_flowtable(trans), &nft_trans_flowtable_hooks(trans), @@ -9000,7 +8982,6 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) struct nftables_pernet *nft_net = nft_pernet(net); struct nft_trans *trans, *next; struct nft_trans_elem *te; - struct nft_hook *hook; if (action == NFNL_ABORT_VALIDATE && nf_tables_validate(net) < 0) @@ -9131,8 +9112,8 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) break; case NFT_MSG_DELFLOWTABLE: if (nft_trans_flowtable_update(trans)) { - list_for_each_entry(hook, &nft_trans_flowtable(trans)->hook_list, list) - hook->inactive = false; + list_splice(&nft_trans_flowtable_hooks(trans), + &nft_trans_flowtable(trans)->hook_list); } else { trans->ctx.table->use++; nft_clear(trans->ctx.net, nft_trans_flowtable(trans)); From 3e8635fb2e072672cbc650989ffedf8300ad67fb Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 2 Jun 2022 00:31:14 +1000 Subject: [PATCH 12/79] powerpc/kasan: Force thread size increase with KASAN KASAN causes increased stack usage, which can lead to stack overflows. The logic in Kconfig to suggest a larger default doesn't work if a user has CONFIG_EXPERT enabled and has an existing .config with a smaller value. Follow the lead of x86 and arm64, and force the thread size to be increased when KASAN is enabled. That also has the effect of enlarging the stack for 64-bit KASAN builds, which is also desirable. Fixes: edbadaf06710 ("powerpc/kasan: Fix stack overflow by increasing THREAD_SHIFT") Reported-by: Erhard Furtner Reported-by: Christophe Leroy [mpe: Use MIN_THREAD_SHIFT as suggested by Christophe] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220601143114.133524-1-mpe@ellerman.id.au --- arch/powerpc/Kconfig | 1 - arch/powerpc/include/asm/thread_info.h | 10 ++++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 54dbbb1d4b36..b1760d615bb7 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -790,7 +790,6 @@ config THREAD_SHIFT range 13 15 default "15" if PPC_256K_PAGES default "14" if PPC64 - default "14" if KASAN default "13" help Used to define the stack size. The default is almost always what you diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h index 125328d1b980..af58f1ed3952 100644 --- a/arch/powerpc/include/asm/thread_info.h +++ b/arch/powerpc/include/asm/thread_info.h @@ -14,10 +14,16 @@ #ifdef __KERNEL__ -#if defined(CONFIG_VMAP_STACK) && CONFIG_THREAD_SHIFT < PAGE_SHIFT +#ifdef CONFIG_KASAN +#define MIN_THREAD_SHIFT (CONFIG_THREAD_SHIFT + 1) +#else +#define MIN_THREAD_SHIFT CONFIG_THREAD_SHIFT +#endif + +#if defined(CONFIG_VMAP_STACK) && MIN_THREAD_SHIFT < PAGE_SHIFT #define THREAD_SHIFT PAGE_SHIFT #else -#define THREAD_SHIFT CONFIG_THREAD_SHIFT +#define THREAD_SHIFT MIN_THREAD_SHIFT #endif #define THREAD_SIZE (1 << THREAD_SHIFT) From 2c9e4559773c261900c674a86b8e455911675d71 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 1 Jun 2022 17:49:36 +0200 Subject: [PATCH 13/79] netfilter: nf_tables: always initialize flowtable hook list in transaction The hook list is used if nft_trans_flowtable_update(trans) == true. However, initialize this list for other cases for safety reasons. Fixes: 78d9f48f7f44 ("netfilter: nf_tables: add devices to existing flowtable") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 30588349f96c..2faa77cd2fe2 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -544,6 +544,7 @@ static int nft_trans_flowtable_add(struct nft_ctx *ctx, int msg_type, if (msg_type == NFT_MSG_NEWFLOWTABLE) nft_activate_next(ctx->net, flowtable); + INIT_LIST_HEAD(&nft_trans_flowtable_hooks(trans)); nft_trans_flowtable(trans) = flowtable; nft_trans_commit_list_add_tail(ctx->net, trans); From 7ad4bd887d27c6b6ffbef216f19c19f8fe2b8f52 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sat, 4 Jun 2022 17:50:50 +0900 Subject: [PATCH 14/79] powerpc/book3e: get rid of #include You cannot include here because it is generated in init/Makefile but there is no guarantee that it happens before arch/powerpc/mm/nohash/kaslr_booke.c is compiled for parallel builds. The places where you can reliably include are: - init/ (because init/Makefile can specify the dependency) - arch/*/boot/ (because it is compiled after vmlinux) Commit f231e4333312 ("hexagon: get rid of #include ") fixed the last breakage at that time, but powerpc re-added this. was unneeded because 'build_str' is almost the same as 'linux_banner' defined in init/version.c Let's copy the solution from MIPS. (get_random_boot() in arch/mips/kernel/relocate.c) Fixes: 6a38ea1d7b94 ("powerpc/fsl_booke/32: randomize the kernel image offset") Signed-off-by: Masahiro Yamada Acked-by: Scott Wood Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220604085050.4078927-1-masahiroy@kernel.org --- arch/powerpc/mm/nohash/kaslr_booke.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/mm/nohash/kaslr_booke.c b/arch/powerpc/mm/nohash/kaslr_booke.c index 1f3f9fedf1bc..0d04f9d5da8d 100644 --- a/arch/powerpc/mm/nohash/kaslr_booke.c +++ b/arch/powerpc/mm/nohash/kaslr_booke.c @@ -19,7 +19,6 @@ #include #include #include -#include #include struct regions { @@ -37,10 +36,6 @@ struct regions { int reserved_mem_size_cells; }; -/* Simplified build-specific string for starting entropy. */ -static const char build_str[] = UTS_RELEASE " (" LINUX_COMPILE_BY "@" - LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION; - struct regions __initdata regions; static __init void kaslr_get_cmdline(void *fdt) @@ -71,7 +66,8 @@ static unsigned long __init get_boot_seed(void *fdt) { unsigned long hash = 0; - hash = rotate_xor(hash, build_str, sizeof(build_str)); + /* build-specific string for starting entropy. */ + hash = rotate_xor(hash, linux_banner, strlen(linux_banner)); hash = rotate_xor(hash, fdt, fdt_totalsize(fdt)); return hash; From 2aab03b86766a27f99a0b24f63e1730faac128d0 Mon Sep 17 00:00:00 2001 From: Xiang wangx Date: Sun, 5 Jun 2022 20:55:09 +0800 Subject: [PATCH 15/79] fs: Fix syntax errors in comments Delete the redundant word 'not'. Link: https://lore.kernel.org/r/20220605125509.14837-1-wangxiang@cdjrlc.com Signed-off-by: Xiang wangx Signed-off-by: Jan Kara --- fs/ext2/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 360ce3604a2d..e6b932219803 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -1549,7 +1549,7 @@ static int __ext2_write_inode(struct inode *inode, int do_sync) if (IS_ERR(raw_inode)) return -EIO; - /* For fields not not tracking in the in-memory inode, + /* For fields not tracking in the in-memory inode, * initialise them to zero for new inodes. */ if (ei->i_state & EXT2_STATE_NEW) memset(raw_inode, 0, EXT2_SB(sb)->s_inode_size); From 10e14073107dd0b6d97d9516a02845a8e501c2c9 Mon Sep 17 00:00:00 2001 From: Jchao Sun Date: Tue, 24 May 2022 08:05:40 -0700 Subject: [PATCH 16/79] writeback: Fix inode->i_io_list not be protected by inode->i_lock error Commit b35250c0816c ("writeback: Protect inode->i_io_list with inode->i_lock") made inode->i_io_list not only protected by wb->list_lock but also inode->i_lock, but inode_io_list_move_locked() was missed. Add lock there and also update comment describing things protected by inode->i_lock. This also fixes a race where __mark_inode_dirty() could move inode under flush worker's hands and thus sync(2) could miss writing some inodes. Fixes: b35250c0816c ("writeback: Protect inode->i_io_list with inode->i_lock") Link: https://lore.kernel.org/r/20220524150540.12552-1-sunjunchao2870@gmail.com CC: stable@vger.kernel.org Signed-off-by: Jchao Sun Signed-off-by: Jan Kara --- fs/fs-writeback.c | 37 ++++++++++++++++++++++++++++--------- fs/inode.c | 2 +- 2 files changed, 29 insertions(+), 10 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index a21d8f1a56d1..05221366a16d 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -120,6 +120,7 @@ static bool inode_io_list_move_locked(struct inode *inode, struct list_head *head) { assert_spin_locked(&wb->list_lock); + assert_spin_locked(&inode->i_lock); list_move(&inode->i_io_list, head); @@ -1365,9 +1366,9 @@ static int move_expired_inodes(struct list_head *delaying_queue, inode = wb_inode(delaying_queue->prev); if (inode_dirtied_after(inode, dirtied_before)) break; + spin_lock(&inode->i_lock); list_move(&inode->i_io_list, &tmp); moved++; - spin_lock(&inode->i_lock); inode->i_state |= I_SYNC_QUEUED; spin_unlock(&inode->i_lock); if (sb_is_blkdev_sb(inode->i_sb)) @@ -1383,7 +1384,12 @@ static int move_expired_inodes(struct list_head *delaying_queue, goto out; } - /* Move inodes from one superblock together */ + /* + * Although inode's i_io_list is moved from 'tmp' to 'dispatch_queue', + * we don't take inode->i_lock here because it is just a pointless overhead. + * Inode is already marked as I_SYNC_QUEUED so writeback list handling is + * fully under our control. + */ while (!list_empty(&tmp)) { sb = wb_inode(tmp.prev)->i_sb; list_for_each_prev_safe(pos, node, &tmp) { @@ -1826,8 +1832,8 @@ static long writeback_sb_inodes(struct super_block *sb, * We'll have another go at writing back this inode * when we completed a full scan of b_io. */ - spin_unlock(&inode->i_lock); requeue_io(inode, wb); + spin_unlock(&inode->i_lock); trace_writeback_sb_inodes_requeue(inode); continue; } @@ -2358,6 +2364,7 @@ void __mark_inode_dirty(struct inode *inode, int flags) { struct super_block *sb = inode->i_sb; int dirtytime = 0; + struct bdi_writeback *wb = NULL; trace_writeback_mark_inode_dirty(inode, flags); @@ -2409,6 +2416,17 @@ void __mark_inode_dirty(struct inode *inode, int flags) inode->i_state &= ~I_DIRTY_TIME; inode->i_state |= flags; + /* + * Grab inode's wb early because it requires dropping i_lock and we + * need to make sure following checks happen atomically with dirty + * list handling so that we don't move inodes under flush worker's + * hands. + */ + if (!was_dirty) { + wb = locked_inode_to_wb_and_lock_list(inode); + spin_lock(&inode->i_lock); + } + /* * If the inode is queued for writeback by flush worker, just * update its dirty state. Once the flush worker is done with @@ -2416,7 +2434,7 @@ void __mark_inode_dirty(struct inode *inode, int flags) * list, based upon its state. */ if (inode->i_state & I_SYNC_QUEUED) - goto out_unlock_inode; + goto out_unlock; /* * Only add valid (hashed) inodes to the superblock's @@ -2424,22 +2442,19 @@ void __mark_inode_dirty(struct inode *inode, int flags) */ if (!S_ISBLK(inode->i_mode)) { if (inode_unhashed(inode)) - goto out_unlock_inode; + goto out_unlock; } if (inode->i_state & I_FREEING) - goto out_unlock_inode; + goto out_unlock; /* * If the inode was already on b_dirty/b_io/b_more_io, don't * reposition it (that would break b_dirty time-ordering). */ if (!was_dirty) { - struct bdi_writeback *wb; struct list_head *dirty_list; bool wakeup_bdi = false; - wb = locked_inode_to_wb_and_lock_list(inode); - inode->dirtied_when = jiffies; if (dirtytime) inode->dirtied_time_when = jiffies; @@ -2453,6 +2468,7 @@ void __mark_inode_dirty(struct inode *inode, int flags) dirty_list); spin_unlock(&wb->list_lock); + spin_unlock(&inode->i_lock); trace_writeback_dirty_inode_enqueue(inode); /* @@ -2467,6 +2483,9 @@ void __mark_inode_dirty(struct inode *inode, int flags) return; } } +out_unlock: + if (wb) + spin_unlock(&wb->list_lock); out_unlock_inode: spin_unlock(&inode->i_lock); } diff --git a/fs/inode.c b/fs/inode.c index 9d9b422504d1..bd4da9c5207e 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -27,7 +27,7 @@ * Inode locking rules: * * inode->i_lock protects: - * inode->i_state, inode->i_hash, __iget() + * inode->i_state, inode->i_hash, __iget(), inode->i_io_list * Inode LRU list locks protect: * inode->i_sb->s_inode_lru, inode->i_lru * inode->i_sb->s_inode_list_lock protects: From 537e11cdc7a6b3ce94fa25ed41306193df9677b7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sun, 5 Jun 2022 15:38:13 +0100 Subject: [PATCH 17/79] quota: Prevent memory allocation recursion while holding dq_lock As described in commit 02117b8ae9c0 ("f2fs: Set GF_NOFS in read_cache_page_gfp while doing f2fs_quota_read"), we must not enter filesystem reclaim while holding the dq_lock. Prevent this more generally by using memalloc_nofs_save() while holding the lock. Link: https://lore.kernel.org/r/20220605143815.2330891-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Jan Kara --- fs/quota/dquot.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index a74aef99bd3d..09d1307959d0 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -79,6 +79,7 @@ #include #include #include +#include #include "../internal.h" /* ugh */ #include @@ -425,9 +426,11 @@ EXPORT_SYMBOL(mark_info_dirty); int dquot_acquire(struct dquot *dquot) { int ret = 0, ret2 = 0; + unsigned int memalloc; struct quota_info *dqopt = sb_dqopt(dquot->dq_sb); mutex_lock(&dquot->dq_lock); + memalloc = memalloc_nofs_save(); if (!test_bit(DQ_READ_B, &dquot->dq_flags)) { ret = dqopt->ops[dquot->dq_id.type]->read_dqblk(dquot); if (ret < 0) @@ -458,6 +461,7 @@ int dquot_acquire(struct dquot *dquot) smp_mb__before_atomic(); set_bit(DQ_ACTIVE_B, &dquot->dq_flags); out_iolock: + memalloc_nofs_restore(memalloc); mutex_unlock(&dquot->dq_lock); return ret; } @@ -469,9 +473,11 @@ EXPORT_SYMBOL(dquot_acquire); int dquot_commit(struct dquot *dquot) { int ret = 0; + unsigned int memalloc; struct quota_info *dqopt = sb_dqopt(dquot->dq_sb); mutex_lock(&dquot->dq_lock); + memalloc = memalloc_nofs_save(); if (!clear_dquot_dirty(dquot)) goto out_lock; /* Inactive dquot can be only if there was error during read/init @@ -481,6 +487,7 @@ int dquot_commit(struct dquot *dquot) else ret = -EIO; out_lock: + memalloc_nofs_restore(memalloc); mutex_unlock(&dquot->dq_lock); return ret; } @@ -492,9 +499,11 @@ EXPORT_SYMBOL(dquot_commit); int dquot_release(struct dquot *dquot) { int ret = 0, ret2 = 0; + unsigned int memalloc; struct quota_info *dqopt = sb_dqopt(dquot->dq_sb); mutex_lock(&dquot->dq_lock); + memalloc = memalloc_nofs_save(); /* Check whether we are not racing with some other dqget() */ if (dquot_is_busy(dquot)) goto out_dqlock; @@ -510,6 +519,7 @@ int dquot_release(struct dquot *dquot) } clear_bit(DQ_ACTIVE_B, &dquot->dq_flags); out_dqlock: + memalloc_nofs_restore(memalloc); mutex_unlock(&dquot->dq_lock); return ret; } From c271cc9febaaa1bcbc0842d1ee30466aa6148ea8 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 5 Jun 2022 13:40:06 +0200 Subject: [PATCH 18/79] netfilter: nf_tables: release new hooks on unsupported flowtable flags Release the list of new hooks that are pending to be registered in case that unsupported flowtable flags are provided. Fixes: 78d9f48f7f44 ("netfilter: nf_tables: add devices to existing flowtable") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 2faa77cd2fe2..252796a99f5e 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -7433,11 +7433,15 @@ static int nft_flowtable_update(struct nft_ctx *ctx, const struct nlmsghdr *nlh, if (nla[NFTA_FLOWTABLE_FLAGS]) { flags = ntohl(nla_get_be32(nla[NFTA_FLOWTABLE_FLAGS])); - if (flags & ~NFT_FLOWTABLE_MASK) - return -EOPNOTSUPP; + if (flags & ~NFT_FLOWTABLE_MASK) { + err = -EOPNOTSUPP; + goto err_flowtable_update_hook; + } if ((flowtable->data.flags & NFT_FLOWTABLE_HW_OFFLOAD) ^ - (flags & NFT_FLOWTABLE_HW_OFFLOAD)) - return -EOPNOTSUPP; + (flags & NFT_FLOWTABLE_HW_OFFLOAD)) { + err = -EOPNOTSUPP; + goto err_flowtable_update_hook; + } } else { flags = flowtable->data.flags; } From 9dd732e0bdf538b1b76dc7c157e2b5e560ff30d3 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 6 Jun 2022 17:15:57 +0200 Subject: [PATCH 19/79] netfilter: nf_tables: memleak flow rule from commit path Abort path release flow rule object, however, commit path does not. Update code to destroy these objects before releasing the transaction. Fixes: c9626a2cbdb2 ("netfilter: nf_tables: add hardware offload support") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 252796a99f5e..1a6a21bfb18d 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -8329,6 +8329,9 @@ static void nft_commit_release(struct nft_trans *trans) nf_tables_chain_destroy(&trans->ctx); break; case NFT_MSG_DELRULE: + if (trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD) + nft_flow_rule_destroy(nft_trans_flow_rule(trans)); + nf_tables_rule_destroy(&trans->ctx, nft_trans_rule(trans)); break; case NFT_MSG_DELSET: @@ -8817,6 +8820,9 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nf_tables_rule_notify(&trans->ctx, nft_trans_rule(trans), NFT_MSG_NEWRULE); + if (trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD) + nft_flow_rule_destroy(nft_trans_flow_rule(trans)); + nft_trans_destroy(trans); break; case NFT_MSG_DELRULE: From c76acfb7e19dcc3a0964e0563770b1d11b8d4540 Mon Sep 17 00:00:00 2001 From: Tan Tee Min Date: Thu, 26 May 2022 17:03:47 +0800 Subject: [PATCH 20/79] net: phy: dp83867: retrigger SGMII AN when link change There is a limitation in TI DP83867 PHY device where SGMII AN is only triggered once after the device is booted up. Even after the PHY TPI is down and up again, SGMII AN is not triggered and hence no new in-band message from PHY to MAC side SGMII. This could cause an issue during power up, when PHY is up prior to MAC. At this condition, once MAC side SGMII is up, MAC side SGMII wouldn`t receive new in-band message from TI PHY with correct link status, speed and duplex info. As suggested by TI, implemented a SW solution here to retrigger SGMII Auto-Neg whenever there is a link change. v2: Add Fixes tag in commit message. Fixes: 2a10154abcb7 ("net: phy: dp83867: Add TI dp83867 phy") Cc: # 5.4.x Signed-off-by: Sit, Michael Wei Hong Reviewed-by: Voon Weifeng Signed-off-by: Tan Tee Min Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/20220526090347.128742-1-tee.min.tan@linux.intel.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/dp83867.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/drivers/net/phy/dp83867.c b/drivers/net/phy/dp83867.c index 8561f2d4443b..13dafe7a29bd 100644 --- a/drivers/net/phy/dp83867.c +++ b/drivers/net/phy/dp83867.c @@ -137,6 +137,7 @@ #define DP83867_DOWNSHIFT_2_COUNT 2 #define DP83867_DOWNSHIFT_4_COUNT 4 #define DP83867_DOWNSHIFT_8_COUNT 8 +#define DP83867_SGMII_AUTONEG_EN BIT(7) /* CFG3 bits */ #define DP83867_CFG3_INT_OE BIT(7) @@ -855,6 +856,32 @@ static int dp83867_phy_reset(struct phy_device *phydev) DP83867_PHYCR_FORCE_LINK_GOOD, 0); } +static void dp83867_link_change_notify(struct phy_device *phydev) +{ + /* There is a limitation in DP83867 PHY device where SGMII AN is + * only triggered once after the device is booted up. Even after the + * PHY TPI is down and up again, SGMII AN is not triggered and + * hence no new in-band message from PHY to MAC side SGMII. + * This could cause an issue during power up, when PHY is up prior + * to MAC. At this condition, once MAC side SGMII is up, MAC side + * SGMII wouldn`t receive new in-band message from TI PHY with + * correct link status, speed and duplex info. + * Thus, implemented a SW solution here to retrigger SGMII Auto-Neg + * whenever there is a link change. + */ + if (phydev->interface == PHY_INTERFACE_MODE_SGMII) { + int val = 0; + + val = phy_clear_bits(phydev, DP83867_CFG2, + DP83867_SGMII_AUTONEG_EN); + if (val < 0) + return; + + phy_set_bits(phydev, DP83867_CFG2, + DP83867_SGMII_AUTONEG_EN); + } +} + static struct phy_driver dp83867_driver[] = { { .phy_id = DP83867_PHY_ID, @@ -879,6 +906,8 @@ static struct phy_driver dp83867_driver[] = { .suspend = genphy_suspend, .resume = genphy_resume, + + .link_change_notify = dp83867_link_change_notify, }, }; module_phy_driver(dp83867_driver); From 3a41c64d9c1185a2f3a184015e2a9b78bfc99c71 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 6 Jun 2022 17:31:29 +0200 Subject: [PATCH 21/79] netfilter: nf_tables: bail out early if hardware offload is not supported If user requests for NFT_CHAIN_HW_OFFLOAD, then check if either device provides the .ndo_setup_tc interface or there is an indirect flow block that has been registered. Otherwise, bail out early from the preparation phase. Moreover, validate that family == NFPROTO_NETDEV and hook is NF_NETDEV_INGRESS. Fixes: c9626a2cbdb2 ("netfilter: nf_tables: add hardware offload support") Signed-off-by: Pablo Neira Ayuso --- include/net/flow_offload.h | 1 + include/net/netfilter/nf_tables_offload.h | 2 +- net/core/flow_offload.c | 6 ++++++ net/netfilter/nf_tables_api.c | 2 +- net/netfilter/nf_tables_offload.c | 23 ++++++++++++++++++++++- 5 files changed, 31 insertions(+), 3 deletions(-) diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h index 021778a7e1af..6484095a8c01 100644 --- a/include/net/flow_offload.h +++ b/include/net/flow_offload.h @@ -612,5 +612,6 @@ int flow_indr_dev_setup_offload(struct net_device *dev, struct Qdisc *sch, enum tc_setup_type type, void *data, struct flow_block_offload *bo, void (*cleanup)(struct flow_block_cb *block_cb)); +bool flow_indr_dev_exists(void); #endif /* _NET_FLOW_OFFLOAD_H */ diff --git a/include/net/netfilter/nf_tables_offload.h b/include/net/netfilter/nf_tables_offload.h index 797147843958..3568b6a2f5f0 100644 --- a/include/net/netfilter/nf_tables_offload.h +++ b/include/net/netfilter/nf_tables_offload.h @@ -92,7 +92,7 @@ int nft_flow_rule_offload_commit(struct net *net); NFT_OFFLOAD_MATCH(__key, __base, __field, __len, __reg) \ memset(&(__reg)->mask, 0xff, (__reg)->len); -int nft_chain_offload_priority(struct nft_base_chain *basechain); +bool nft_chain_offload_support(const struct nft_base_chain *basechain); int nft_offload_init(void); void nft_offload_exit(void); diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c index 73f68d4625f3..929f6379a279 100644 --- a/net/core/flow_offload.c +++ b/net/core/flow_offload.c @@ -595,3 +595,9 @@ int flow_indr_dev_setup_offload(struct net_device *dev, struct Qdisc *sch, return (bo && list_empty(&bo->cb_list)) ? -EOPNOTSUPP : count; } EXPORT_SYMBOL(flow_indr_dev_setup_offload); + +bool flow_indr_dev_exists(void) +{ + return !list_empty(&flow_block_indr_dev_list); +} +EXPORT_SYMBOL(flow_indr_dev_exists); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 1a6a21bfb18d..51144fc66889 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2166,7 +2166,7 @@ static int nft_basechain_init(struct nft_base_chain *basechain, u8 family, chain->flags |= NFT_CHAIN_BASE | flags; basechain->policy = NF_ACCEPT; if (chain->flags & NFT_CHAIN_HW_OFFLOAD && - nft_chain_offload_priority(basechain) < 0) + !nft_chain_offload_support(basechain)) return -EOPNOTSUPP; flow_block_init(&basechain->flow_block); diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c index 2d36952b1392..910ef881c3b8 100644 --- a/net/netfilter/nf_tables_offload.c +++ b/net/netfilter/nf_tables_offload.c @@ -208,7 +208,7 @@ static int nft_setup_cb_call(enum tc_setup_type type, void *type_data, return 0; } -int nft_chain_offload_priority(struct nft_base_chain *basechain) +static int nft_chain_offload_priority(const struct nft_base_chain *basechain) { if (basechain->ops.priority <= 0 || basechain->ops.priority > USHRT_MAX) @@ -217,6 +217,27 @@ int nft_chain_offload_priority(struct nft_base_chain *basechain) return 0; } +bool nft_chain_offload_support(const struct nft_base_chain *basechain) +{ + struct net_device *dev; + struct nft_hook *hook; + + if (nft_chain_offload_priority(basechain) < 0) + return false; + + list_for_each_entry(hook, &basechain->hook_list, list) { + if (hook->ops.pf != NFPROTO_NETDEV || + hook->ops.hooknum != NF_NETDEV_INGRESS) + return false; + + dev = hook->ops.dev; + if (!dev->netdev_ops->ndo_setup_tc && !flow_indr_dev_exists()) + return false; + } + + return true; +} + static void nft_flow_cls_offload_setup(struct flow_cls_offload *cls_flow, const struct nft_base_chain *basechain, const struct nft_rule *rule, From f55a07074fdd38cab8c097ac5bd397d68eff733c Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Thu, 2 Jun 2022 14:01:06 +0000 Subject: [PATCH 22/79] amt: fix wrong usage of pskb_may_pull() It adds missing pskb_may_pull() in amt_update_handler() and amt_multicast_data_handler(). And it fixes wrong parameter of pskb_may_pull() in amt_advertisement_handler() and amt_membership_query_handler(). Reported-by: Jakub Kicinski Fixes: cbc21dc1cfe9 ("amt: add data plane of amt interface") Signed-off-by: Taehee Yoo Signed-off-by: Jakub Kicinski --- drivers/net/amt.c | 55 +++++++++++++++++++++++++++++++---------------- 1 file changed, 37 insertions(+), 18 deletions(-) diff --git a/drivers/net/amt.c b/drivers/net/amt.c index ebee5f07a208..900948e135ad 100644 --- a/drivers/net/amt.c +++ b/drivers/net/amt.c @@ -2220,8 +2220,7 @@ static bool amt_advertisement_handler(struct amt_dev *amt, struct sk_buff *skb) struct amt_header_advertisement *amta; int hdr_size; - hdr_size = sizeof(*amta) - sizeof(struct amt_header); - + hdr_size = sizeof(*amta) + sizeof(struct udphdr); if (!pskb_may_pull(skb, hdr_size)) return true; @@ -2251,19 +2250,27 @@ static bool amt_multicast_data_handler(struct amt_dev *amt, struct sk_buff *skb) struct ethhdr *eth; struct iphdr *iph; + hdr_size = sizeof(*amtmd) + sizeof(struct udphdr); + if (!pskb_may_pull(skb, hdr_size)) + return true; + amtmd = (struct amt_header_mcast_data *)(udp_hdr(skb) + 1); if (amtmd->reserved || amtmd->version) return true; - hdr_size = sizeof(*amtmd) + sizeof(struct udphdr); if (iptunnel_pull_header(skb, hdr_size, htons(ETH_P_IP), false)) return true; + skb_reset_network_header(skb); skb_push(skb, sizeof(*eth)); skb_reset_mac_header(skb); skb_pull(skb, sizeof(*eth)); eth = eth_hdr(skb); + + if (!pskb_may_pull(skb, sizeof(*iph))) + return true; iph = ip_hdr(skb); + if (iph->version == 4) { if (!ipv4_is_multicast(iph->daddr)) return true; @@ -2274,6 +2281,9 @@ static bool amt_multicast_data_handler(struct amt_dev *amt, struct sk_buff *skb) } else if (iph->version == 6) { struct ipv6hdr *ip6h; + if (!pskb_may_pull(skb, sizeof(*ip6h))) + return true; + ip6h = ipv6_hdr(skb); if (!ipv6_addr_is_multicast(&ip6h->daddr)) return true; @@ -2306,8 +2316,7 @@ static bool amt_membership_query_handler(struct amt_dev *amt, struct iphdr *iph; int hdr_size, len; - hdr_size = sizeof(*amtmq) - sizeof(struct amt_header); - + hdr_size = sizeof(*amtmq) + sizeof(struct udphdr); if (!pskb_may_pull(skb, hdr_size)) return true; @@ -2315,22 +2324,27 @@ static bool amt_membership_query_handler(struct amt_dev *amt, if (amtmq->reserved || amtmq->version) return true; - hdr_size = sizeof(*amtmq) + sizeof(struct udphdr) - sizeof(*eth); + hdr_size -= sizeof(*eth); if (iptunnel_pull_header(skb, hdr_size, htons(ETH_P_TEB), false)) return true; + oeth = eth_hdr(skb); skb_reset_mac_header(skb); skb_pull(skb, sizeof(*eth)); skb_reset_network_header(skb); eth = eth_hdr(skb); + if (!pskb_may_pull(skb, sizeof(*iph))) + return true; + iph = ip_hdr(skb); if (iph->version == 4) { - if (!ipv4_is_multicast(iph->daddr)) - return true; if (!pskb_may_pull(skb, sizeof(*iph) + AMT_IPHDR_OPTS + sizeof(*ihv3))) return true; + if (!ipv4_is_multicast(iph->daddr)) + return true; + ihv3 = skb_pull(skb, sizeof(*iph) + AMT_IPHDR_OPTS); skb_reset_transport_header(skb); skb_push(skb, sizeof(*iph) + AMT_IPHDR_OPTS); @@ -2345,15 +2359,17 @@ static bool amt_membership_query_handler(struct amt_dev *amt, ip_eth_mc_map(iph->daddr, eth->h_dest); #if IS_ENABLED(CONFIG_IPV6) } else if (iph->version == 6) { - struct ipv6hdr *ip6h = ipv6_hdr(skb); struct mld2_query *mld2q; + struct ipv6hdr *ip6h; - if (!ipv6_addr_is_multicast(&ip6h->daddr)) - return true; if (!pskb_may_pull(skb, sizeof(*ip6h) + AMT_IP6HDR_OPTS + sizeof(*mld2q))) return true; + ip6h = ipv6_hdr(skb); + if (!ipv6_addr_is_multicast(&ip6h->daddr)) + return true; + mld2q = skb_pull(skb, sizeof(*ip6h) + AMT_IP6HDR_OPTS); skb_reset_transport_header(skb); skb_push(skb, sizeof(*ip6h) + AMT_IP6HDR_OPTS); @@ -2389,23 +2405,23 @@ static bool amt_update_handler(struct amt_dev *amt, struct sk_buff *skb) { struct amt_header_membership_update *amtmu; struct amt_tunnel_list *tunnel; - struct udphdr *udph; struct ethhdr *eth; struct iphdr *iph; - int len; + int len, hdr_size; iph = ip_hdr(skb); - udph = udp_hdr(skb); - if (__iptunnel_pull_header(skb, sizeof(*udph), skb->protocol, - false, false)) + hdr_size = sizeof(*amtmu) + sizeof(struct udphdr); + if (!pskb_may_pull(skb, hdr_size)) return true; - amtmu = (struct amt_header_membership_update *)skb->data; + amtmu = (struct amt_header_membership_update *)(udp_hdr(skb) + 1); if (amtmu->reserved || amtmu->version) return true; - skb_pull(skb, sizeof(*amtmu)); + if (iptunnel_pull_header(skb, hdr_size, skb->protocol, false)) + return true; + skb_reset_network_header(skb); list_for_each_entry_rcu(tunnel, &amt->tunnel_list, list) { @@ -2426,6 +2442,9 @@ static bool amt_update_handler(struct amt_dev *amt, struct sk_buff *skb) return true; report: + if (!pskb_may_pull(skb, sizeof(*iph))) + return true; + iph = ip_hdr(skb); if (iph->version == 4) { if (ip_mc_check_igmp(skb)) { From d16207f92a4a823c48b4ea953ad51f4483456768 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Thu, 2 Jun 2022 14:01:07 +0000 Subject: [PATCH 23/79] amt: fix possible null-ptr-deref in amt_rcv() When amt interface receives amt message, it tries to obtain amt private data from sock. If there is no amt private data, it frees an skb immediately. After kfree_skb(), it increases the rx_dropped stats. But in order to use rx_dropped, amt private data is needed. So, it makes amt_rcv() to do not increase rx_dropped stats when it can not obtain amt private data. Reported-by: kernel test robot Reported-by: Dan Carpenter Fixes: 1a1a0e80e005 ("amt: fix possible memory leak in amt_rcv()") Signed-off-by: Taehee Yoo Signed-off-by: Jakub Kicinski --- drivers/net/amt.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/amt.c b/drivers/net/amt.c index 900948e135ad..ef483bf51033 100644 --- a/drivers/net/amt.c +++ b/drivers/net/amt.c @@ -2698,7 +2698,8 @@ static int amt_rcv(struct sock *sk, struct sk_buff *skb) amt = rcu_dereference_sk_user_data(sk); if (!amt) { err = true; - goto drop; + kfree_skb(skb); + goto out; } skb->dev = amt->dev; From d7970039d87c926bb648982e920cb9851c19f3e1 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Thu, 2 Jun 2022 14:01:08 +0000 Subject: [PATCH 24/79] amt: fix wrong type string definition amt message type definition starts from 1, not 0. But type_str[] starts from 0. So, it prints wrong type information. Fixes: cbc21dc1cfe9 ("amt: add data plane of amt interface") Signed-off-by: Taehee Yoo Signed-off-by: Jakub Kicinski --- drivers/net/amt.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/amt.c b/drivers/net/amt.c index ef483bf51033..be2719a3ba70 100644 --- a/drivers/net/amt.c +++ b/drivers/net/amt.c @@ -51,6 +51,7 @@ static char *status_str[] = { }; static char *type_str[] = { + "", /* Type 0 is not defined */ "AMT_MSG_DISCOVERY", "AMT_MSG_ADVERTISEMENT", "AMT_MSG_REQUEST", From b8d91399775c55162073bb2aca061ec42e3d4bc1 Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Fri, 3 Jun 2022 17:32:38 +0400 Subject: [PATCH 25/79] net: ethernet: bgmac: Fix refcount leak in bcma_mdio_mii_register of_get_child_by_name() returns a node pointer with refcount incremented, we should use of_node_put() on it when not need anymore. Add missing of_node_put() to avoid refcount leak. Fixes: 55954f3bfdac ("net: ethernet: bgmac: move BCMA MDIO Phy code into a separate file") Signed-off-by: Miaoqian Lin Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/20220603133238.44114-1-linmq006@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bgmac-bcma-mdio.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/broadcom/bgmac-bcma-mdio.c b/drivers/net/ethernet/broadcom/bgmac-bcma-mdio.c index 086739e4f40a..9b83d5361699 100644 --- a/drivers/net/ethernet/broadcom/bgmac-bcma-mdio.c +++ b/drivers/net/ethernet/broadcom/bgmac-bcma-mdio.c @@ -234,6 +234,7 @@ struct mii_bus *bcma_mdio_mii_register(struct bgmac *bgmac) np = of_get_child_by_name(core->dev.of_node, "mdio"); err = of_mdiobus_register(mii_bus, np); + of_node_put(np); if (err) { dev_err(&core->dev, "Registration of mii bus failed\n"); goto err_free_bus; From 5e74a4b3ec1816e3bbfd715d46ae29d2508079cb Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 5 Jun 2022 22:50:48 +0200 Subject: [PATCH 26/79] stmmac: intel: Fix an error handling path in intel_eth_pci_probe() When the managed API is used, there is no need to explicitly call pci_free_irq_vectors(). This looks to be a left-over from the commit in the Fixes tag. Only the .remove() function had been updated. So remove this unused function call and update goto label accordingly. Fixes: 8accc467758e ("stmmac: intel: use managed PCI function on probe and resume") Signed-off-by: Christophe JAILLET Reviewed-by: Wong Vee Khee Link: https://lore.kernel.org/r/1ac9b6787b0db83b0095711882c55c77c8ea8da0.1654462241.git.christophe.jaillet@wanadoo.fr Signed-off-by: Paolo Abeni --- drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c index f9f80933e0c9..38fe77d1035e 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c @@ -1072,13 +1072,11 @@ static int intel_eth_pci_probe(struct pci_dev *pdev, ret = stmmac_dvr_probe(&pdev->dev, plat, &res); if (ret) { - goto err_dvr_probe; + goto err_alloc_irq; } return 0; -err_dvr_probe: - pci_free_irq_vectors(pdev); err_alloc_irq: clk_disable_unprepare(plat->stmmac_clk); clk_unregister_fixed_rate(plat->stmmac_clk); From 662a80946ce13633ae90a55379f1346c10f0c432 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Sun, 5 Jun 2022 16:23:25 -0700 Subject: [PATCH 27/79] af_unix: Fix a data-race in unix_dgram_peer_wake_me(). unix_dgram_poll() calls unix_dgram_peer_wake_me() without `other`'s lock held and check if its receive queue is full. Here we need to use unix_recvq_full_lockless() instead of unix_recvq_full(), otherwise KCSAN will report a data-race. Fixes: 7d267278a9ec ("unix: avoid use-after-free in ep_remove_wait_queue") Signed-off-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20220605232325.11804-1-kuniyu@amazon.com Signed-off-by: Paolo Abeni --- net/unix/af_unix.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 654dcef7cfb3..2206e6f8902d 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -490,7 +490,7 @@ static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other) * -ECONNREFUSED. Otherwise, if we haven't queued any skbs * to other and its full, we will hang waiting for POLLOUT. */ - if (unix_recvq_full(other) && !sock_flag(other, SOCK_DEAD)) + if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD)) return 1; if (connected) From cf67838c4422eab826679b076dad99f96152b4de Mon Sep 17 00:00:00 2001 From: Lina Wang Date: Mon, 6 Jun 2022 14:45:17 +0800 Subject: [PATCH 28/79] selftests net: fix bpf build error bpf_helpers.h has been moved to tools/lib/bpf since 5.10, so add more including path. Fixes: edae34a3ed92 ("selftests net: add UDP GRO fraglist + bpf self-tests") Reported-by: kernel test robot Signed-off-by: Lina Wang Acked-by: Song Liu Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/20220606064517.8175-1-lina.wang@mediatek.com Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/bpf/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/net/bpf/Makefile b/tools/testing/selftests/net/bpf/Makefile index f91bf14bbee7..8a69c91fcca0 100644 --- a/tools/testing/selftests/net/bpf/Makefile +++ b/tools/testing/selftests/net/bpf/Makefile @@ -2,6 +2,7 @@ CLANG ?= clang CCINCLUDE += -I../../bpf +CCINCLUDE += -I../../../lib CCINCLUDE += -I../../../../../usr/include/ TEST_CUSTOM_PROGS = $(OUTPUT)/bpf/nat6to4.o @@ -10,5 +11,4 @@ all: $(TEST_CUSTOM_PROGS) $(OUTPUT)/%.o: %.c $(CLANG) -O2 -target bpf -c $< $(CCINCLUDE) -o $@ -clean: - rm -f $(TEST_CUSTOM_PROGS) +EXTRA_CLEAN := $(TEST_CUSTOM_PROGS) From e8bc2427018826e02add7b0ed0fc625a60390ae5 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Wed, 1 Jun 2022 03:43:28 +0200 Subject: [PATCH 29/79] KVM: Don't null dereference ops->destroy A KVM device cleanup happens in either of two callbacks: 1) destroy() which is called when the VM is being destroyed; 2) release() which is called when a device fd is closed. Most KVM devices use 1) but Book3s's interrupt controller KVM devices (XICS, XIVE, XIVE-native) use 2) as they need to close and reopen during the machine execution. The error handling in kvm_ioctl_create_device() assumes destroy() is always defined which leads to NULL dereference as discovered by Syzkaller. This adds a checks for destroy!=NULL and adds a missing release(). This is not changing kvm_destroy_devices() as devices with defined release() should have been removed from the KVM devices list by then. Suggested-by: Paolo Bonzini Signed-off-by: Alexey Kardashevskiy Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 342043b30125..f2922ba3b7a8 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -4300,8 +4300,11 @@ static int kvm_ioctl_create_device(struct kvm *kvm, kvm_put_kvm_no_destroy(kvm); mutex_lock(&kvm->lock); list_del(&dev->vm_node); + if (ops->release) + ops->release(dev); mutex_unlock(&kvm->lock); - ops->destroy(dev); + if (ops->destroy) + ops->destroy(dev); return ret; } From 3e684903a8574ffc9475fdf13c4780a7adb506ad Mon Sep 17 00:00:00 2001 From: Seth Forshee Date: Wed, 4 May 2022 13:08:40 -0500 Subject: [PATCH 30/79] entry/kvm: Exit to user mode when TIF_NOTIFY_SIGNAL is set A livepatch transition may stall indefinitely when a kvm vCPU is heavily loaded. To the host, the vCPU task is a user thread which is spending a very long time in the ioctl(KVM_RUN) syscall. During livepatch transition, set_notify_signal() will be called on such tasks to interrupt the syscall so that the task can be transitioned. This interrupts guest execution, but when xfer_to_guest_mode_work() sees that TIF_NOTIFY_SIGNAL is set but not TIF_SIGPENDING it concludes that an exit to user mode is unnecessary, and guest execution is resumed without transitioning the task for the livepatch. This handling of TIF_NOTIFY_SIGNAL is incorrect, as set_notify_signal() is expected to break tasks out of interruptible kernel loops and cause them to return to userspace. Change xfer_to_guest_mode_work() to handle TIF_NOTIFY_SIGNAL the same as TIF_SIGPENDING, signaling to the vCPU run loop that an exit to userpsace is needed. Any pending task_work will be run when get_signal() is called from exit_to_user_mode_loop(), so there is no longer any need to run task work from xfer_to_guest_mode_work(). Suggested-by: "Eric W. Biederman" Cc: Petr Mladek Signed-off-by: Seth Forshee Message-Id: <20220504180840.2907296-1-sforshee@digitalocean.com> Signed-off-by: Paolo Bonzini --- kernel/entry/kvm.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/kernel/entry/kvm.c b/kernel/entry/kvm.c index 9d09f489b60e..2e0f75bcb7fd 100644 --- a/kernel/entry/kvm.c +++ b/kernel/entry/kvm.c @@ -9,12 +9,6 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work) int ret; if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) { - clear_notify_signal(); - if (task_work_pending(current)) - task_work_run(); - } - - if (ti_work & _TIF_SIGPENDING) { kvm_handle_signal_exit(vcpu); return -EINTR; } From cf4a8693d97a51dccf5a1557248d12d6d8be4b9e Mon Sep 17 00:00:00 2001 From: Shaoqin Huang Date: Mon, 6 Jun 2022 18:59:05 -0600 Subject: [PATCH 31/79] KVM: x86/mmu: Check every prev_roots in __kvm_mmu_free_obsolete_roots() When freeing obsolete previous roots, check prev_roots as intended, not the current root. Signed-off-by: Shaoqin Huang Fixes: 527d5cd7eece ("KVM: x86/mmu: Zap only obsolete roots if a root shadow page is zapped") Message-Id: <20220607005905.2933378-1-shaoqin.huang@intel.com> Cc: stable@vger.kernel.org Reviewed-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index efe5a3dca1e0..e46771e95191 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5179,7 +5179,7 @@ static void __kvm_mmu_free_obsolete_roots(struct kvm *kvm, struct kvm_mmu *mmu) roots_to_free |= KVM_MMU_ROOT_CURRENT; for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { - if (is_obsolete_root(kvm, mmu->root.hpa)) + if (is_obsolete_root(kvm, mmu->prev_roots[i].hpa)) roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); } From 1df931d95f4dc1c11db1123e85d4e08156e46ef9 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 7 Jun 2022 17:00:53 +0200 Subject: [PATCH 32/79] x86: drop bogus "cc" clobber from __try_cmpxchg_user_asm() As noted (and fixed) a couple of times in the past, "=@cc" outputs and clobbering of "cc" don't work well together. The compiler appears to mean to reject such, but doesn't - in its upstream form - quite manage to yet for "cc". Furthermore two similar macros don't clobber "cc", and clobbering "cc" is pointless in asm()-s for x86 anyway - the compiler always assumes status flags to be clobbered there. Fixes: 989b5db215a2 ("x86/uaccess: Implement macros for CMPXCHG on user addresses") Signed-off-by: Jan Beulich Message-Id: <485c0c0b-a3a7-0b7c-5264-7d00c01de032@suse.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/uaccess.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 35f222aa66bf..913e593a3b45 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -439,7 +439,7 @@ do { \ [ptr] "+m" (*_ptr), \ [old] "+a" (__old) \ : [new] ltype (__new) \ - : "memory", "cc"); \ + : "memory"); \ if (unlikely(__err)) \ goto label; \ if (unlikely(!success)) \ From 5ba7c4c6d1c7af47a916f728bb5940669684a087 Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Wed, 25 May 2022 23:09:04 +0000 Subject: [PATCH 33/79] KVM: x86/MMU: Zap non-leaf SPTEs when disabling dirty logging Currently disabling dirty logging with the TDP MMU is extremely slow. On a 96 vCPU / 96G VM backed with gigabyte pages, it takes ~200 seconds to disable dirty logging with the TDP MMU, as opposed to ~4 seconds with the shadow MMU. When disabling dirty logging, zap non-leaf parent entries to allow replacement with huge pages instead of recursing and zapping all of the child, leaf entries. This reduces the number of TLB flushes required. and reduces the disable dirty log time with the TDP MMU to ~3 seconds. Opportunistically add a WARN() to catch GFNs that are mapped at a higher level than their max level. Signed-off-by: Ben Gardon Message-Id: <20220525230904.1584480-1-bgardon@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/tdp_iter.c | 9 +++++++++ arch/x86/kvm/mmu/tdp_iter.h | 1 + arch/x86/kvm/mmu/tdp_mmu.c | 38 +++++++++++++++++++++++++++++++------ 3 files changed, 42 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c index 6d3b3e5a5533..ee4802d7b36c 100644 --- a/arch/x86/kvm/mmu/tdp_iter.c +++ b/arch/x86/kvm/mmu/tdp_iter.c @@ -145,6 +145,15 @@ static bool try_step_up(struct tdp_iter *iter) return true; } +/* + * Step the iterator back up a level in the paging structure. Should only be + * used when the iterator is below the root level. + */ +void tdp_iter_step_up(struct tdp_iter *iter) +{ + WARN_ON(!try_step_up(iter)); +} + /* * Step to the next SPTE in a pre-order traversal of the paging structure. * To get to the next SPTE, the iterator either steps down towards the goal diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h index f0af385c56e0..adfca0cf94d3 100644 --- a/arch/x86/kvm/mmu/tdp_iter.h +++ b/arch/x86/kvm/mmu/tdp_iter.h @@ -114,5 +114,6 @@ void tdp_iter_start(struct tdp_iter *iter, struct kvm_mmu_page *root, int min_level, gfn_t next_last_level_gfn); void tdp_iter_next(struct tdp_iter *iter); void tdp_iter_restart(struct tdp_iter *iter); +void tdp_iter_step_up(struct tdp_iter *iter); #endif /* __KVM_X86_MMU_TDP_ITER_H */ diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 841feaa48be5..7b9265d67131 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1742,12 +1742,12 @@ static void zap_collapsible_spte_range(struct kvm *kvm, gfn_t start = slot->base_gfn; gfn_t end = start + slot->npages; struct tdp_iter iter; + int max_mapping_level; kvm_pfn_t pfn; rcu_read_lock(); tdp_root_for_each_pte(iter, root, start, end) { -retry: if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true)) continue; @@ -1755,15 +1755,41 @@ retry: !is_last_spte(iter.old_spte, iter.level)) continue; + /* + * This is a leaf SPTE. Check if the PFN it maps can + * be mapped at a higher level. + */ pfn = spte_to_pfn(iter.old_spte); - if (kvm_is_reserved_pfn(pfn) || - iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn, - pfn, PG_LEVEL_NUM)) + + if (kvm_is_reserved_pfn(pfn)) continue; + max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot, + iter.gfn, pfn, PG_LEVEL_NUM); + + WARN_ON(max_mapping_level < iter.level); + + /* + * If this page is already mapped at the highest + * viable level, there's nothing more to do. + */ + if (max_mapping_level == iter.level) + continue; + + /* + * The page can be remapped at a higher level, so step + * up to zap the parent SPTE. + */ + while (max_mapping_level > iter.level) + tdp_iter_step_up(&iter); + /* Note, a successful atomic zap also does a remote TLB flush. */ - if (tdp_mmu_zap_spte_atomic(kvm, &iter)) - goto retry; + tdp_mmu_zap_spte_atomic(kvm, &iter); + + /* + * If the atomic zap fails, the iter will recurse back into + * the same subtree to retry. + */ } rcu_read_unlock(); From eae260be3a0111a28fe95923e117a55dddec0384 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 1 Jun 2022 16:43:22 +0200 Subject: [PATCH 34/79] KVM: selftests: Make hyperv_clock selftest more stable hyperv_clock doesn't always give a stable test result, especially with AMD CPUs. The test compares Hyper-V MSR clocksource (acquired either with rdmsr() from within the guest or KVM_GET_MSRS from the host) against rdtsc(). To increase the accuracy, increase the measured delay (done with nop loop) by two orders of magnitude and take the mean rdtsc() value before and after rdmsr()/KVM_GET_MSRS. Reported-by: Maxim Levitsky Signed-off-by: Vitaly Kuznetsov Reviewed-by: Maxim Levitsky Tested-by: Maxim Levitsky Message-Id: <20220601144322.1968742-1-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/x86_64/hyperv_clock.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_clock.c b/tools/testing/selftests/kvm/x86_64/hyperv_clock.c index e0b2bb1339b1..3330fb183c68 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_clock.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_clock.c @@ -44,7 +44,7 @@ static inline void nop_loop(void) { int i; - for (i = 0; i < 1000000; i++) + for (i = 0; i < 100000000; i++) asm volatile("nop"); } @@ -56,12 +56,14 @@ static inline void check_tsc_msr_rdtsc(void) tsc_freq = rdmsr(HV_X64_MSR_TSC_FREQUENCY); GUEST_ASSERT(tsc_freq > 0); - /* First, check MSR-based clocksource */ + /* For increased accuracy, take mean rdtsc() before and afrer rdmsr() */ r1 = rdtsc(); t1 = rdmsr(HV_X64_MSR_TIME_REF_COUNT); + r1 = (r1 + rdtsc()) / 2; nop_loop(); r2 = rdtsc(); t2 = rdmsr(HV_X64_MSR_TIME_REF_COUNT); + r2 = (r2 + rdtsc()) / 2; GUEST_ASSERT(r2 > r1 && t2 > t1); @@ -181,12 +183,14 @@ static void host_check_tsc_msr_rdtsc(struct kvm_vm *vm) tsc_freq = vcpu_get_msr(vm, VCPU_ID, HV_X64_MSR_TSC_FREQUENCY); TEST_ASSERT(tsc_freq > 0, "TSC frequency must be nonzero"); - /* First, check MSR-based clocksource */ + /* For increased accuracy, take mean rdtsc() before and afrer ioctl */ r1 = rdtsc(); t1 = vcpu_get_msr(vm, VCPU_ID, HV_X64_MSR_TIME_REF_COUNT); + r1 = (r1 + rdtsc()) / 2; nop_loop(); r2 = rdtsc(); t2 = vcpu_get_msr(vm, VCPU_ID, HV_X64_MSR_TIME_REF_COUNT); + r2 = (r2 + rdtsc()) / 2; TEST_ASSERT(t2 > t1, "Time reference MSR is not monotonic (%ld <= %ld)", t1, t2); From 11d39e8cc43e1c6737af19ca9372e590061b5ad2 Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Mon, 6 Jun 2022 21:11:49 +0300 Subject: [PATCH 35/79] KVM: SVM: fix tsc scaling cache logic SVM uses a per-cpu variable to cache the current value of the tsc scaling multiplier msr on each cpu. Commit 1ab9287add5e2 ("KVM: X86: Add vendor callbacks for writing the TSC multiplier") broke this caching logic. Refactor the code so that all TSC scaling multiplier writes go through a single function which checks and updates the cache. This fixes the following scenario: 1. A CPU runs a guest with some tsc scaling ratio. 2. New guest with different tsc scaling ratio starts on this CPU and terminates almost immediately. This ensures that the short running guest had set the tsc scaling ratio just once when it was set via KVM_SET_TSC_KHZ. Due to the bug, the per-cpu cache is not updated. 3. The original guest continues to run, it doesn't restore the msr value back to its own value, because the cache matches, and thus continues to run with a wrong tsc scaling ratio. Fixes: 1ab9287add5e2 ("KVM: X86: Add vendor callbacks for writing the TSC multiplier") Signed-off-by: Maxim Levitsky Message-Id: <20220606181149.103072-1-mlevitsk@redhat.com> Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/nested.c | 4 ++-- arch/x86/kvm/svm/svm.c | 32 ++++++++++++++++++++------------ arch/x86/kvm/svm/svm.h | 2 +- 3 files changed, 23 insertions(+), 15 deletions(-) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index bed5e1692cef..3361258640a2 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -982,7 +982,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm) if (svm->tsc_ratio_msr != kvm_default_tsc_scaling_ratio) { WARN_ON(!svm->tsc_scaling_enabled); vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio; - svm_write_tsc_multiplier(vcpu, vcpu->arch.tsc_scaling_ratio); + __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio); } svm->nested.ctl.nested_cr3 = 0; @@ -1387,7 +1387,7 @@ void nested_svm_update_tsc_ratio_msr(struct kvm_vcpu *vcpu) vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier(vcpu->arch.l1_tsc_scaling_ratio, svm->tsc_ratio_msr); - svm_write_tsc_multiplier(vcpu, vcpu->arch.tsc_scaling_ratio); + __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio); } /* Inverse operation of nested_copy_vmcb_control_to_cache(). asid is copied too. */ diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 63880b33ce37..478e6ee81d88 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -465,11 +465,24 @@ static int has_svm(void) return 1; } +void __svm_write_tsc_multiplier(u64 multiplier) +{ + preempt_disable(); + + if (multiplier == __this_cpu_read(current_tsc_ratio)) + goto out; + + wrmsrl(MSR_AMD64_TSC_RATIO, multiplier); + __this_cpu_write(current_tsc_ratio, multiplier); +out: + preempt_enable(); +} + static void svm_hardware_disable(void) { /* Make sure we clean up behind us */ if (tsc_scaling) - wrmsrl(MSR_AMD64_TSC_RATIO, SVM_TSC_RATIO_DEFAULT); + __svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT); cpu_svm_disable(); @@ -515,8 +528,7 @@ static int svm_hardware_enable(void) * Set the default value, even if we don't use TSC scaling * to avoid having stale value in the msr */ - wrmsrl(MSR_AMD64_TSC_RATIO, SVM_TSC_RATIO_DEFAULT); - __this_cpu_write(current_tsc_ratio, SVM_TSC_RATIO_DEFAULT); + __svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT); } @@ -999,11 +1011,12 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS); } -void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier) +static void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier) { - wrmsrl(MSR_AMD64_TSC_RATIO, multiplier); + __svm_write_tsc_multiplier(multiplier); } + /* Evaluate instruction intercepts that depend on guest CPUID features. */ static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu, struct vcpu_svm *svm) @@ -1363,13 +1376,8 @@ static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu) sev_es_prepare_switch_to_guest(hostsa); } - if (tsc_scaling) { - u64 tsc_ratio = vcpu->arch.tsc_scaling_ratio; - if (tsc_ratio != __this_cpu_read(current_tsc_ratio)) { - __this_cpu_write(current_tsc_ratio, tsc_ratio); - wrmsrl(MSR_AMD64_TSC_RATIO, tsc_ratio); - } - } + if (tsc_scaling) + __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio); if (likely(tsc_aux_uret_slot >= 0)) kvm_set_user_return_msr(tsc_aux_uret_slot, svm->tsc_aux, -1ull); diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 45a87b2a8b3c..29d6fd205a49 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -590,7 +590,7 @@ int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, bool has_error_code, u32 error_code); int nested_svm_exit_special(struct vcpu_svm *svm); void nested_svm_update_tsc_ratio_msr(struct kvm_vcpu *vcpu); -void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier); +void __svm_write_tsc_multiplier(u64 multiplier); void nested_copy_vmcb_control_to_cache(struct vcpu_svm *svm, struct vmcb_control_area *control); void nested_copy_vmcb_save_to_cache(struct vcpu_svm *svm, From 10f3b29c65bb2fe0d47c2945cd0b4087be1c5218 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 31 May 2022 14:51:13 -0700 Subject: [PATCH 36/79] bpf, arm64: Clear prog->jited_len along prog->jited syzbot reported an illegal copy_to_user() attempt from bpf_prog_get_info_by_fd() [1] There was no repro yet on this bug, but I think that commit 0aef499f3172 ("mm/usercopy: Detect vmalloc overruns") is exposing a prior bug in bpf arm64. bpf_prog_get_info_by_fd() looks at prog->jited_len to determine if the JIT image can be copied out to user space. My theory is that syzbot managed to get a prog where prog->jited_len has been set to 43, while prog->bpf_func has ben cleared. It is not clear why copy_to_user(uinsns, NULL, ulen) is triggering this particular warning. I thought find_vma_area(NULL) would not find a vm_struct. As we do not hold vmap_area_lock spinlock, it might be possible that the found vm_struct was garbage. [1] usercopy: Kernel memory exposure attempt detected from vmalloc (offset 792633534417210172, size 43)! kernel BUG at mm/usercopy.c:101! Internal error: Oops - BUG: 0 [#1] PREEMPT SMP Modules linked in: CPU: 0 PID: 25002 Comm: syz-executor.1 Not tainted 5.18.0-syzkaller-10139-g8291eaafed36 #0 Hardware name: linux,dummy-virt (DT) pstate: 60400009 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : usercopy_abort+0x90/0x94 mm/usercopy.c:101 lr : usercopy_abort+0x90/0x94 mm/usercopy.c:89 sp : ffff80000b773a20 x29: ffff80000b773a30 x28: faff80000b745000 x27: ffff80000b773b48 x26: 0000000000000000 x25: 000000000000002b x24: 0000000000000000 x23: 00000000000000e0 x22: ffff80000b75db67 x21: 0000000000000001 x20: 000000000000002b x19: ffff80000b75db3c x18: 00000000fffffffd x17: 2820636f6c6c616d x16: 76206d6f72662064 x15: 6574636574656420 x14: 74706d6574746120 x13: 2129333420657a69 x12: 73202c3237313031 x11: 3237313434333533 x10: 3336323937207465 x9 : 657275736f707865 x8 : ffff80000a30c550 x7 : ffff80000b773830 x6 : ffff80000b773830 x5 : 0000000000000000 x4 : ffff00007fbbaa10 x3 : 0000000000000000 x2 : 0000000000000000 x1 : f7ff000028fc0000 x0 : 0000000000000064 Call trace: usercopy_abort+0x90/0x94 mm/usercopy.c:89 check_heap_object mm/usercopy.c:186 [inline] __check_object_size mm/usercopy.c:252 [inline] __check_object_size+0x198/0x36c mm/usercopy.c:214 check_object_size include/linux/thread_info.h:199 [inline] check_copy_size include/linux/thread_info.h:235 [inline] copy_to_user include/linux/uaccess.h:159 [inline] bpf_prog_get_info_by_fd.isra.0+0xf14/0xfdc kernel/bpf/syscall.c:3993 bpf_obj_get_info_by_fd+0x12c/0x510 kernel/bpf/syscall.c:4253 __sys_bpf+0x900/0x2150 kernel/bpf/syscall.c:4956 __do_sys_bpf kernel/bpf/syscall.c:5021 [inline] __se_sys_bpf kernel/bpf/syscall.c:5019 [inline] __arm64_sys_bpf+0x28/0x40 kernel/bpf/syscall.c:5019 __invoke_syscall arch/arm64/kernel/syscall.c:38 [inline] invoke_syscall+0x48/0x114 arch/arm64/kernel/syscall.c:52 el0_svc_common.constprop.0+0x44/0xec arch/arm64/kernel/syscall.c:142 do_el0_svc+0xa0/0xc0 arch/arm64/kernel/syscall.c:206 el0_svc+0x44/0xb0 arch/arm64/kernel/entry-common.c:624 el0t_64_sync_handler+0x1ac/0x1b0 arch/arm64/kernel/entry-common.c:642 el0t_64_sync+0x198/0x19c arch/arm64/kernel/entry.S:581 Code: aa0003e3 d00038c0 91248000 97fff65f (d4210000) Fixes: db496944fdaa ("bpf: arm64: add JIT support for multi-function programs") Reported-by: syzbot Signed-off-by: Eric Dumazet Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20220531215113.1100754-1-eric.dumazet@gmail.com Signed-off-by: Alexei Starovoitov --- arch/arm64/net/bpf_jit_comp.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 8ab4035dea27..42f2e9a8616c 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -1478,6 +1478,7 @@ skip_init_ctx: bpf_jit_binary_free(header); prog->bpf_func = NULL; prog->jited = 0; + prog->jited_len = 0; goto out_off; } bpf_jit_binary_lock_ro(header); From fd58f7df2415ef747782e01f94880fefad1247cf Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 26 May 2022 13:24:05 +0300 Subject: [PATCH 37/79] bpf: Use safer kvmalloc_array() where possible The kvmalloc_array() function is safer because it has a check for integer overflows. These sizes come from the user and I was not able to see any bounds checking so an integer overflow seems like a realistic concern. Fixes: 0dcac2725406 ("bpf: Add multi kprobe link") Signed-off-by: Dan Carpenter Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/Yo9VRVMeHbALyjUH@kili Signed-off-by: Alexei Starovoitov --- kernel/trace/bpf_trace.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 10b157a6d73e..7a13e6ac6327 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2263,11 +2263,11 @@ static int copy_user_syms(struct user_syms *us, unsigned long __user *usyms, u32 int err = -ENOMEM; unsigned int i; - syms = kvmalloc(cnt * sizeof(*syms), GFP_KERNEL); + syms = kvmalloc_array(cnt, sizeof(*syms), GFP_KERNEL); if (!syms) goto error; - buf = kvmalloc(cnt * KSYM_NAME_LEN, GFP_KERNEL); + buf = kvmalloc_array(cnt, KSYM_NAME_LEN, GFP_KERNEL); if (!buf) goto error; @@ -2464,7 +2464,7 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr return -EINVAL; size = cnt * sizeof(*addrs); - addrs = kvmalloc(size, GFP_KERNEL); + addrs = kvmalloc_array(cnt, sizeof(*addrs), GFP_KERNEL); if (!addrs) return -ENOMEM; @@ -2489,7 +2489,7 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr ucookies = u64_to_user_ptr(attr->link_create.kprobe_multi.cookies); if (ucookies) { - cookies = kvmalloc(size, GFP_KERNEL); + cookies = kvmalloc_array(cnt, sizeof(*addrs), GFP_KERNEL); if (!cookies) { err = -ENOMEM; goto error; From f858c2b2ca04fc7ead291821a793638ae120c11d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Mon, 6 Jun 2022 09:52:51 +0200 Subject: [PATCH 38/79] bpf: Fix calling global functions from BPF_PROG_TYPE_EXT programs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The verifier allows programs to call global functions as long as their argument types match, using BTF to check the function arguments. One of the allowed argument types to such global functions is PTR_TO_CTX; however the check for this fails on BPF_PROG_TYPE_EXT functions because the verifier uses the wrong type to fetch the vmlinux BTF ID for the program context type. This failure is seen when an XDP program is loaded using libxdp (which loads it as BPF_PROG_TYPE_EXT and attaches it to a global XDP type program). Fix the issue by passing in the target program type instead of the BPF_PROG_TYPE_EXT type to bpf_prog_get_ctx() when checking function argument compatibility. The first Fixes tag refers to the latest commit that touched the code in question, while the second one points to the code that first introduced the global function call verification. v2: - Use resolve_prog_type() Fixes: 3363bd0cfbb8 ("bpf: Extend kfunc with PTR_TO_CTX, PTR_TO_MEM argument support") Fixes: 51c39bb1d5d1 ("bpf: Introduce function-by-function verification") Reported-by: Simon Sundberg Signed-off-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/r/20220606075253.28422-1-toke@redhat.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 7bccaa4646e5..63d0ac7dfe2f 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6054,6 +6054,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, struct bpf_reg_state *regs, bool ptr_to_mem_ok) { + enum bpf_prog_type prog_type = resolve_prog_type(env->prog); struct bpf_verifier_log *log = &env->log; u32 i, nargs, ref_id, ref_obj_id = 0; bool is_kfunc = btf_is_kernel(btf); @@ -6171,7 +6172,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, return -EINVAL; } /* rest of the arguments can be anything, like normal kfunc */ - } else if (btf_get_prog_ctx_type(log, btf, t, env->prog->type, i)) { + } else if (btf_get_prog_ctx_type(log, btf, t, prog_type, i)) { /* If function expects ctx type in BTF check that caller * is passing PTR_TO_CTX. */ From 2cf7b7ffdae519b284f1406012b52e2282fa36bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Mon, 6 Jun 2022 09:52:52 +0200 Subject: [PATCH 39/79] selftests/bpf: Add selftest for calling global functions from freplace MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a selftest that calls a global function with a context object parameter from an freplace function to check that the program context type is correctly converted to the freplace target when fetching the context type from the kernel BTF. v2: - Trim includes - Get rid of global function - Use __noinline Signed-off-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/r/20220606075253.28422-2-toke@redhat.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/fexit_bpf2bpf.c | 14 ++++++++++++++ .../selftests/bpf/progs/freplace_global_func.c | 18 ++++++++++++++++++ 2 files changed, 32 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/freplace_global_func.c diff --git a/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c b/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c index d9aad15e0d24..02bb8cbf9194 100644 --- a/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c +++ b/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c @@ -395,6 +395,18 @@ static void test_func_map_prog_compatibility(void) "./test_attach_probe.o"); } +static void test_func_replace_global_func(void) +{ + const char *prog_name[] = { + "freplace/test_pkt_access", + }; + + test_fexit_bpf2bpf_common("./freplace_global_func.o", + "./test_pkt_access.o", + ARRAY_SIZE(prog_name), + prog_name, false, NULL); +} + /* NOTE: affect other tests, must run in serial mode */ void serial_test_fexit_bpf2bpf(void) { @@ -416,4 +428,6 @@ void serial_test_fexit_bpf2bpf(void) test_func_replace_multi(); if (test__start_subtest("fmod_ret_freplace")) test_fmod_ret_freplace(); + if (test__start_subtest("func_replace_global_func")) + test_func_replace_global_func(); } diff --git a/tools/testing/selftests/bpf/progs/freplace_global_func.c b/tools/testing/selftests/bpf/progs/freplace_global_func.c new file mode 100644 index 000000000000..96cb61a6ce87 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/freplace_global_func.c @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include + +__noinline +int test_ctx_global_func(struct __sk_buff *skb) +{ + volatile int retval = 1; + return retval; +} + +SEC("freplace/test_pkt_access") +int new_test_pkt_access(struct __sk_buff *skb) +{ + return test_ctx_global_func(skb); +} + +char _license[] SEC("license") = "GPL"; From 803e9895ea2b0fe80bc85980ae2d7a7e44037914 Mon Sep 17 00:00:00 2001 From: Olivier Matz Date: Wed, 6 Apr 2022 11:52:51 +0200 Subject: [PATCH 40/79] ixgbe: fix bcast packets Rx on VF after promisc removal After a VF requested to remove the promiscuous flag on an interface, the broadcast packets are not received anymore. This breaks some protocols like ARP. In ixgbe_update_vf_xcast_mode(), we should keep the IXGBE_VMOLR_BAM bit (Broadcast Accept) on promiscuous removal. This flag is already set by default in ixgbe_set_vmolr() on VF reset. Fixes: 8443c1a4b192 ("ixgbe, ixgbevf: Add new mbox API xcast mode") Cc: stable@vger.kernel.org Cc: Nicolas Dichtel Signed-off-by: Olivier Matz Tested-by: Konrad Jankowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c index 7f11c0a8e7a9..8d108a78941b 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c @@ -1184,9 +1184,9 @@ static int ixgbe_update_vf_xcast_mode(struct ixgbe_adapter *adapter, switch (xcast_mode) { case IXGBEVF_XCAST_MODE_NONE: - disable = IXGBE_VMOLR_BAM | IXGBE_VMOLR_ROMPE | + disable = IXGBE_VMOLR_ROMPE | IXGBE_VMOLR_MPE | IXGBE_VMOLR_UPE | IXGBE_VMOLR_VPE; - enable = 0; + enable = IXGBE_VMOLR_BAM; break; case IXGBEVF_XCAST_MODE_MULTI: disable = IXGBE_VMOLR_MPE | IXGBE_VMOLR_UPE | IXGBE_VMOLR_VPE; From 7bb0fb7c63df95d6027dc50d6af3bc3bbbc25483 Mon Sep 17 00:00:00 2001 From: Olivier Matz Date: Wed, 6 Apr 2022 11:52:52 +0200 Subject: [PATCH 41/79] ixgbe: fix unexpected VLAN Rx in promisc mode on VF MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the promiscuous mode is enabled on a VF, the IXGBE_VMOLR_VPE bit (VLAN Promiscuous Enable) is set. This means that the VF will receive packets whose VLAN is not the same than the VLAN of the VF. For instance, in this situation: ┌────────┐ ┌────────┐ ┌────────┐ │ │ │ │ │ │ │ │ │ │ │ │ │ VF0├────┤VF1 VF2├────┤VF3 │ │ │ │ │ │ │ └────────┘ └────────┘ └────────┘ VM1 VM2 VM3 vf 0: vlan 1000 vf 1: vlan 1000 vf 2: vlan 1001 vf 3: vlan 1001 If we tcpdump on VF3, we see all the packets, even those transmitted on vlan 1000. This behavior prevents to bridge VF1 and VF2 in VM2, because it will create a loop: packets transmitted on VF1 will be received by VF2 and vice-versa, and bridged again through the software bridge. This patch remove the activation of VLAN Promiscuous when a VF enables the promiscuous mode. However, the IXGBE_VMOLR_UPE bit (Unicast Promiscuous) is kept, so that a VF receives all packets that has the same VLAN, whatever the destination MAC address. Fixes: 8443c1a4b192 ("ixgbe, ixgbevf: Add new mbox API xcast mode") Cc: stable@vger.kernel.org Cc: Nicolas Dichtel Signed-off-by: Olivier Matz Tested-by: Konrad Jankowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c index 8d108a78941b..d4e63f0644c3 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c @@ -1208,9 +1208,9 @@ static int ixgbe_update_vf_xcast_mode(struct ixgbe_adapter *adapter, return -EPERM; } - disable = 0; + disable = IXGBE_VMOLR_VPE; enable = IXGBE_VMOLR_BAM | IXGBE_VMOLR_ROMPE | - IXGBE_VMOLR_MPE | IXGBE_VMOLR_UPE | IXGBE_VMOLR_VPE; + IXGBE_VMOLR_MPE | IXGBE_VMOLR_UPE; break; default: return -EOPNOTSUPP; From c42e65664390be7c1ef3838cd84956d3a2739d60 Mon Sep 17 00:00:00 2001 From: Mathias Nyman Date: Tue, 7 Jun 2022 12:11:33 -0700 Subject: [PATCH 42/79] Input: bcm5974 - set missing URB_NO_TRANSFER_DMA_MAP urb flag The bcm5974 driver does the allocation and dma mapping of the usb urb data buffer, but driver does not set the URB_NO_TRANSFER_DMA_MAP flag to let usb core know the buffer is already mapped. usb core tries to map the already mapped buffer, causing a warning: "xhci_hcd 0000:00:14.0: rejecting DMA map of vmalloc memory" Fix this by setting the URB_NO_TRANSFER_DMA_MAP, letting usb core know buffer is already mapped by bcm5974 driver Signed-off-by: Mathias Nyman Cc: stable@vger.kernel.org Link: https://bugzilla.kernel.org/show_bug.cgi?id=215890 Link: https://lore.kernel.org/r/20220606113636.588955-1-mathias.nyman@linux.intel.com Signed-off-by: Dmitry Torokhov --- drivers/input/mouse/bcm5974.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/input/mouse/bcm5974.c b/drivers/input/mouse/bcm5974.c index 59a14505b9cd..ca150618d32f 100644 --- a/drivers/input/mouse/bcm5974.c +++ b/drivers/input/mouse/bcm5974.c @@ -942,17 +942,22 @@ static int bcm5974_probe(struct usb_interface *iface, if (!dev->tp_data) goto err_free_bt_buffer; - if (dev->bt_urb) + if (dev->bt_urb) { usb_fill_int_urb(dev->bt_urb, udev, usb_rcvintpipe(udev, cfg->bt_ep), dev->bt_data, dev->cfg.bt_datalen, bcm5974_irq_button, dev, 1); + dev->bt_urb->transfer_flags |= URB_NO_TRANSFER_DMA_MAP; + } + usb_fill_int_urb(dev->tp_urb, udev, usb_rcvintpipe(udev, cfg->tp_ep), dev->tp_data, dev->cfg.tp_datalen, bcm5974_irq_trackpad, dev, 1); + dev->tp_urb->transfer_flags |= URB_NO_TRANSFER_DMA_MAP; + /* create bcm5974 device */ usb_make_path(udev, dev->phys, sizeof(dev->phys)); strlcat(dev->phys, "/input0", sizeof(dev->phys)); From 6ab2e51898cd4343bbdf8587af8ce8fbabddbcb5 Mon Sep 17 00:00:00 2001 From: Marius Hoch Date: Tue, 7 Jun 2022 12:10:52 -0700 Subject: [PATCH 43/79] Input: soc_button_array - also add Lenovo Yoga Tablet2 1051F to dmi_use_low_level_irq Commit 223f61b8c5ad ("Input: soc_button_array - add Lenovo Yoga Tablet2 1051L to the dmi_use_low_level_irq list") added the 1051L to this list already, but the same problem applies to the 1051F. As there are no further 1051 variants (just the F/L), we can just DMI match 1051. Tested on a Lenovo Yoga Tablet2 1051F: Without this patch the home-button stops working after a wakeup from suspend. Signed-off-by: Marius Hoch Reviewed-by: Hans de Goede Link: https://lore.kernel.org/r/20220603120246.3065-1-mail@mariushoch.de Signed-off-by: Dmitry Torokhov --- drivers/input/misc/soc_button_array.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/input/misc/soc_button_array.c b/drivers/input/misc/soc_button_array.c index cb6ec59a045d..efffcf0ebd3b 100644 --- a/drivers/input/misc/soc_button_array.c +++ b/drivers/input/misc/soc_button_array.c @@ -85,13 +85,13 @@ static const struct dmi_system_id dmi_use_low_level_irq[] = { }, { /* - * Lenovo Yoga Tab2 1051L, something messes with the home-button + * Lenovo Yoga Tab2 1051F/1051L, something messes with the home-button * IRQ settings, leading to a non working home-button. */ .matches = { DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), DMI_MATCH(DMI_PRODUCT_NAME, "60073"), - DMI_MATCH(DMI_PRODUCT_VERSION, "1051L"), + DMI_MATCH(DMI_PRODUCT_VERSION, "1051"), }, }, {} /* Terminating entry */ From 0737e018a05e2aa352828c52bdeed3b02cff2930 Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Sun, 5 Jun 2022 11:23:34 +0400 Subject: [PATCH 44/79] net: dsa: lantiq_gswip: Fix refcount leak in gswip_gphy_fw_list Every iteration of for_each_available_child_of_node() decrements the reference count of the previous node. when breaking early from a for_each_available_child_of_node() loop, we need to explicitly call of_node_put() on the gphy_fw_np. Add missing of_node_put() to avoid refcount leak. Fixes: 14fceff4771e ("net: dsa: Add Lantiq / Intel DSA driver for vrx200") Signed-off-by: Miaoqian Lin Link: https://lore.kernel.org/r/20220605072335.11257-1-linmq006@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/dsa/lantiq_gswip.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/dsa/lantiq_gswip.c b/drivers/net/dsa/lantiq_gswip.c index 8af4def38a98..e531b93f3cb2 100644 --- a/drivers/net/dsa/lantiq_gswip.c +++ b/drivers/net/dsa/lantiq_gswip.c @@ -2070,8 +2070,10 @@ static int gswip_gphy_fw_list(struct gswip_priv *priv, for_each_available_child_of_node(gphy_fw_list_np, gphy_fw_np) { err = gswip_gphy_fw_probe(priv, &priv->gphy_fw[i], gphy_fw_np, i); - if (err) + if (err) { + of_node_put(gphy_fw_np); goto remove_gphy; + } i++; } From f5826c8c9d57210a17031af5527056eefdc2b7eb Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Mon, 6 Jun 2022 14:57:18 +0300 Subject: [PATCH 45/79] net/mlx4_en: Fix wrong return value on ioctl EEPROM query failure The ioctl EEPROM query wrongly returns success on read failures, fix that by returning the appropriate error code. Fixes: 7202da8b7f71 ("ethtool, net/mlx4_en: Cable info, get_module_info/eeprom ethtool support") Signed-off-by: Gal Pressman Signed-off-by: Tariq Toukan Link: https://lore.kernel.org/r/20220606115718.14233-1-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx4/en_ethtool.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c index ed5038d98ef6..6400a827173c 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c @@ -2110,7 +2110,7 @@ static int mlx4_en_get_module_eeprom(struct net_device *dev, en_err(priv, "mlx4_get_module_info i(%d) offset(%d) bytes_to_read(%d) - FAILED (0x%x)\n", i, offset, ee->len - i, ret); - return 0; + return ret; } i += ret; From 54aa83c90198e68eee8b0850c749bc70efb548da Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 7 Jun 2022 10:07:11 -0400 Subject: [PATCH 46/79] KVM: x86: do not set st->preempted when going back to user space Similar to the Xen path, only change the vCPU's reported state if the vCPU was actually preempted. The reason for KVM's behavior is that for example optimistic spinning might not be a good idea if the guest is doing repeated exits to userspace; however, it is confusing and unlikely to make a difference, because well-tuned guests will hardly ever exit KVM_RUN in the first place. Suggested-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 26 ++++++++++++++------------ arch/x86/kvm/xen.h | 6 ++++-- 2 files changed, 18 insertions(+), 14 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b81ef4f497f4..a8bb635cb76b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4654,19 +4654,21 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { int idx; - if (vcpu->preempted && !vcpu->arch.guest_state_protected) - vcpu->arch.preempted_in_kernel = !static_call(kvm_x86_get_cpl)(vcpu); + if (vcpu->preempted) { + if (!vcpu->arch.guest_state_protected) + vcpu->arch.preempted_in_kernel = !static_call(kvm_x86_get_cpl)(vcpu); - /* - * Take the srcu lock as memslots will be accessed to check the gfn - * cache generation against the memslots generation. - */ - idx = srcu_read_lock(&vcpu->kvm->srcu); - if (kvm_xen_msr_enabled(vcpu->kvm)) - kvm_xen_runstate_set_preempted(vcpu); - else - kvm_steal_time_set_preempted(vcpu); - srcu_read_unlock(&vcpu->kvm->srcu, idx); + /* + * Take the srcu lock as memslots will be accessed to check the gfn + * cache generation against the memslots generation. + */ + idx = srcu_read_lock(&vcpu->kvm->srcu); + if (kvm_xen_msr_enabled(vcpu->kvm)) + kvm_xen_runstate_set_preempted(vcpu); + else + kvm_steal_time_set_preempted(vcpu); + srcu_read_unlock(&vcpu->kvm->srcu, idx); + } static_call(kvm_x86_vcpu_put)(vcpu); vcpu->arch.last_host_tsc = rdtsc(); diff --git a/arch/x86/kvm/xen.h b/arch/x86/kvm/xen.h index ee5c4ae0755c..532a535a9e99 100644 --- a/arch/x86/kvm/xen.h +++ b/arch/x86/kvm/xen.h @@ -159,8 +159,10 @@ static inline void kvm_xen_runstate_set_preempted(struct kvm_vcpu *vcpu) * behalf of the vCPU. Only if the VMM does actually block * does it need to enter RUNSTATE_blocked. */ - if (vcpu->preempted) - kvm_xen_update_runstate_guest(vcpu, RUNSTATE_runnable); + if (WARN_ON_ONCE(!vcpu->preempted)) + return; + + kvm_xen_update_runstate_guest(vcpu, RUNSTATE_runnable); } /* 32-bit compatibility definitions, also used natively in 32-bit build */ From 6cd88243c7e03845a450795e134b488fc2afb736 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 7 Jun 2022 10:09:03 -0400 Subject: [PATCH 47/79] KVM: x86: do not report a vCPU as preempted outside instruction boundaries If a vCPU is outside guest mode and is scheduled out, it might be in the process of making a memory access. A problem occurs if another vCPU uses the PV TLB flush feature during the period when the vCPU is scheduled out, and a virtual address has already been translated but has not yet been accessed, because this is equivalent to using a stale TLB entry. To avoid this, only report a vCPU as preempted if sure that the guest is at an instruction boundary. A rescheduling request will be delivered to the host physical CPU as an external interrupt, so for simplicity consider any vmexit *not* instruction boundary except for external interrupts. It would in principle be okay to report the vCPU as preempted also if it is sleeping in kvm_vcpu_block(): a TLB flush IPI will incur the vmentry/vmexit overhead unnecessarily, and optimistic spinning is also unlikely to succeed. However, leave it for later because right now kvm_vcpu_check_block() is doing memory accesses. Even though the TLB flush issue only applies to virtual memory address, it's very much preferrable to be conservative. Reported-by: Jann Horn Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 3 +++ arch/x86/kvm/svm/svm.c | 2 ++ arch/x86/kvm/vmx/vmx.c | 1 + arch/x86/kvm/x86.c | 22 ++++++++++++++++++++++ 4 files changed, 28 insertions(+) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 959d66b9be94..3a240a64ac68 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -653,6 +653,7 @@ struct kvm_vcpu_arch { u64 ia32_misc_enable_msr; u64 smbase; u64 smi_count; + bool at_instruction_boundary; bool tpr_access_reporting; bool xsaves_enabled; bool xfd_no_write_intercept; @@ -1300,6 +1301,8 @@ struct kvm_vcpu_stat { u64 nested_run; u64 directed_yield_attempted; u64 directed_yield_successful; + u64 preemption_reported; + u64 preemption_other; u64 guest_mode; }; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 478e6ee81d88..921fcb85a9cd 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4263,6 +4263,8 @@ out: static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu) { + if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_INTR) + vcpu->arch.at_instruction_boundary = true; } static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index f5aeade623d6..14e01178a753 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6547,6 +6547,7 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu) return; handle_interrupt_nmi_irqoff(vcpu, gate_offset(desc)); + vcpu->arch.at_instruction_boundary = true; } static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a8bb635cb76b..25a517206c4d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -296,6 +296,8 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { STATS_DESC_COUNTER(VCPU, nested_run), STATS_DESC_COUNTER(VCPU, directed_yield_attempted), STATS_DESC_COUNTER(VCPU, directed_yield_successful), + STATS_DESC_COUNTER(VCPU, preemption_reported), + STATS_DESC_COUNTER(VCPU, preemption_other), STATS_DESC_ICOUNTER(VCPU, guest_mode) }; @@ -4625,6 +4627,19 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu) struct kvm_memslots *slots; static const u8 preempted = KVM_VCPU_PREEMPTED; + /* + * The vCPU can be marked preempted if and only if the VM-Exit was on + * an instruction boundary and will not trigger guest emulation of any + * kind (see vcpu_run). Vendor specific code controls (conservatively) + * when this is true, for example allowing the vCPU to be marked + * preempted if and only if the VM-Exit was due to a host interrupt. + */ + if (!vcpu->arch.at_instruction_boundary) { + vcpu->stat.preemption_other++; + return; + } + + vcpu->stat.preemption_reported++; if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) return; @@ -10424,6 +10439,13 @@ static int vcpu_run(struct kvm_vcpu *vcpu) vcpu->arch.l1tf_flush_l1d = true; for (;;) { + /* + * If another guest vCPU requests a PV TLB flush in the middle + * of instruction emulation, the rest of the emulation could + * use a stale page translation. Assume that any code after + * this point can start executing an instruction. + */ + vcpu->arch.at_instruction_boundary = false; if (kvm_vcpu_running(vcpu)) { r = vcpu_enter_guest(vcpu); } else { From dda5384313a40ecbaafd8a9a80f47483255e4c4d Mon Sep 17 00:00:00 2001 From: David Safford Date: Tue, 7 Jun 2022 14:07:57 -0400 Subject: [PATCH 48/79] KEYS: trusted: tpm2: Fix migratable logic When creating (sealing) a new trusted key, migratable trusted keys have the FIXED_TPM and FIXED_PARENT attributes set, and non-migratable keys don't. This is backwards, and also causes creation to fail when creating a migratable key under a migratable parent. (The TPM thinks you are trying to seal a non-migratable blob under a migratable parent.) The following simple patch fixes the logic, and has been tested for all four combinations of migratable and non-migratable trusted keys and parent storage keys. With this logic, you will get a proper failure if you try to create a non-migratable trusted key under a migratable parent storage key, and all other combinations work correctly. Cc: stable@vger.kernel.org # v5.13+ Fixes: e5fb5d2c5a03 ("security: keys: trusted: Make sealed key properly interoperable") Signed-off-by: David Safford Reviewed-by: Ahmad Fatoum Reviewed-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen --- security/keys/trusted-keys/trusted_tpm2.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/security/keys/trusted-keys/trusted_tpm2.c b/security/keys/trusted-keys/trusted_tpm2.c index 0165da386289..2b2c8eb258d5 100644 --- a/security/keys/trusted-keys/trusted_tpm2.c +++ b/security/keys/trusted-keys/trusted_tpm2.c @@ -283,8 +283,8 @@ int tpm2_seal_trusted(struct tpm_chip *chip, /* key properties */ flags = 0; flags |= options->policydigest_len ? 0 : TPM2_OA_USER_WITH_AUTH; - flags |= payload->migratable ? (TPM2_OA_FIXED_TPM | - TPM2_OA_FIXED_PARENT) : 0; + flags |= payload->migratable ? 0 : (TPM2_OA_FIXED_TPM | + TPM2_OA_FIXED_PARENT); tpm_buf_append_u32(&buf, flags); /* policy */ From d678cbd2f867a564a3c5b276c454e873f43f02f8 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Tue, 7 Jun 2022 16:22:00 +0200 Subject: [PATCH 49/79] xsk: Fix handling of invalid descriptors in XSK TX batching API xdpxceiver run on a AF_XDP ZC enabled driver revealed a problem with XSK Tx batching API. There is a test that checks how invalid Tx descriptors are handled by AF_XDP. Each valid descriptor is followed by invalid one on Tx side whereas the Rx side expects only to receive a set of valid descriptors. In current xsk_tx_peek_release_desc_batch() function, the amount of available descriptors is hidden inside xskq_cons_peek_desc_batch(). This can be problematic in cases where invalid descriptors are present due to the fact that xskq_cons_peek_desc_batch() returns only a count of valid descriptors. This means that it is impossible to properly update XSK ring state when calling xskq_cons_release_n(). To address this issue, pull out the contents of xskq_cons_peek_desc_batch() so that callers (currently only xsk_tx_peek_release_desc_batch()) will always be able to update the state of ring properly, as total count of entries is now available and use this value as an argument in xskq_cons_release_n(). By doing so, xskq_cons_peek_desc_batch() can be dropped altogether. Fixes: 9349eb3a9d2a ("xsk: Introduce batched Tx descriptor interfaces") Signed-off-by: Maciej Fijalkowski Signed-off-by: Daniel Borkmann Acked-by: Magnus Karlsson Link: https://lore.kernel.org/bpf/20220607142200.576735-1-maciej.fijalkowski@intel.com --- net/xdp/xsk.c | 5 +++-- net/xdp/xsk_queue.h | 8 -------- 2 files changed, 3 insertions(+), 10 deletions(-) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index e0a4526ab66b..19ac872a6624 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -373,7 +373,8 @@ u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 max_entries) goto out; } - nb_pkts = xskq_cons_peek_desc_batch(xs->tx, pool, max_entries); + max_entries = xskq_cons_nb_entries(xs->tx, max_entries); + nb_pkts = xskq_cons_read_desc_batch(xs->tx, pool, max_entries); if (!nb_pkts) { xs->tx->queue_empty_descs++; goto out; @@ -389,7 +390,7 @@ u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 max_entries) if (!nb_pkts) goto out; - xskq_cons_release_n(xs->tx, nb_pkts); + xskq_cons_release_n(xs->tx, max_entries); __xskq_cons_release(xs->tx); xs->sk.sk_write_space(&xs->sk); diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index a794410989cc..fb20bf7207cf 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -282,14 +282,6 @@ static inline bool xskq_cons_peek_desc(struct xsk_queue *q, return xskq_cons_read_desc(q, desc, pool); } -static inline u32 xskq_cons_peek_desc_batch(struct xsk_queue *q, struct xsk_buff_pool *pool, - u32 max) -{ - u32 entries = xskq_cons_nb_entries(q, max); - - return xskq_cons_read_desc_batch(q, pool, entries); -} - /* To improve performance in the xskq_cons_release functions, only update local state here. * Reflect this to global state when we get new entries from the ring in * xskq_cons_get_entries() and whenever Rx or Tx processing are completed in the NAPI loop. From 7c217aca85dd31dd2c8f45f6a7520767c9fae766 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Wed, 8 Jun 2022 13:14:28 +0100 Subject: [PATCH 50/79] MAINTAINERS: Add a maintainer for bpftool I've been contributing and reviewing patches for bpftool for some time, and I'm taking care of its external mirror. On Alexei, KP, and Daniel's suggestion, I would like to step forwards and become a maintainer for the tool. This patch adds a dedicated entry to MAINTAINERS. Signed-off-by: Quentin Monnet Signed-off-by: Daniel Borkmann Acked-by: Jakub Kicinski Acked-by: KP Singh Acked-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220608121428.69708-1-quentin@isovalent.com --- MAINTAINERS | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 033a01b07f8f..92c8adc5471b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3731,6 +3731,13 @@ F: include/linux/bpf_lsm.h F: kernel/bpf/bpf_lsm.c F: security/bpf/ +BPFTOOL +M: Quentin Monnet +L: bpf@vger.kernel.org +S: Maintained +F: kernel/bpf/disasm.* +F: tools/bpf/bpftool/ + BROADCOM B44 10/100 ETHERNET DRIVER M: Michael Chan L: netdev@vger.kernel.org From 35b42dce619701f1300fb8498dae82c9bb1f0263 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 6 Jun 2022 13:53:53 +0900 Subject: [PATCH 51/79] net: mdio: unexport __init-annotated mdio_bus_init() EXPORT_SYMBOL and __init is a bad combination because the .init.text section is freed up after the initialization. Hence, modules cannot use symbols annotated __init. The access to a freed symbol may end up with kernel panic. modpost used to detect it, but it has been broken for a decade. Recently, I fixed modpost so it started to warn it again, then this showed up in linux-next builds. There are two ways to fix it: - Remove __init - Remove EXPORT_SYMBOL I chose the latter for this case because the only in-tree call-site, drivers/net/phy/phy_device.c is never compiled as modular. (CONFIG_PHYLIB is boolean) Fixes: 90eff9096c01 ("net: phy: Allow splitting MDIO bus/device support from PHYs") Reported-by: Stephen Rothwell Signed-off-by: Masahiro Yamada Reviewed-by: Florian Fainelli Reviewed-by: Russell King (Oracle) Signed-off-by: Jakub Kicinski --- drivers/net/phy/mdio_bus.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/phy/mdio_bus.c b/drivers/net/phy/mdio_bus.c index 58d602985877..8a2dbe849866 100644 --- a/drivers/net/phy/mdio_bus.c +++ b/drivers/net/phy/mdio_bus.c @@ -1046,7 +1046,6 @@ int __init mdio_bus_init(void) return ret; } -EXPORT_SYMBOL_GPL(mdio_bus_init); #if IS_ENABLED(CONFIG_PHYLIB) void mdio_bus_exit(void) From 4a388f08d8784af48f352193d2b72aaf167a57a1 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 6 Jun 2022 13:53:54 +0900 Subject: [PATCH 52/79] net: xfrm: unexport __init-annotated xfrm4_protocol_init() EXPORT_SYMBOL and __init is a bad combination because the .init.text section is freed up after the initialization. Hence, modules cannot use symbols annotated __init. The access to a freed symbol may end up with kernel panic. modpost used to detect it, but it has been broken for a decade. Recently, I fixed modpost so it started to warn it again, then this showed up in linux-next builds. There are two ways to fix it: - Remove __init - Remove EXPORT_SYMBOL I chose the latter for this case because the only in-tree call-site, net/ipv4/xfrm4_policy.c is never compiled as modular. (CONFIG_XFRM is boolean) Fixes: 2f32b51b609f ("xfrm: Introduce xfrm_input_afinfo to access the the callbacks properly") Reported-by: Stephen Rothwell Signed-off-by: Masahiro Yamada Acked-by: Steffen Klassert Signed-off-by: Jakub Kicinski --- net/ipv4/xfrm4_protocol.c | 1 - 1 file changed, 1 deletion(-) diff --git a/net/ipv4/xfrm4_protocol.c b/net/ipv4/xfrm4_protocol.c index 2fe5860c21d6..b146ce88c5d0 100644 --- a/net/ipv4/xfrm4_protocol.c +++ b/net/ipv4/xfrm4_protocol.c @@ -304,4 +304,3 @@ void __init xfrm4_protocol_init(void) { xfrm_input_register_afinfo(&xfrm4_input_afinfo); } -EXPORT_SYMBOL(xfrm4_protocol_init); From 5801f064e35181c71857a80ff18af4dbec3c5f5c Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 6 Jun 2022 13:53:55 +0900 Subject: [PATCH 53/79] net: ipv6: unexport __init-annotated seg6_hmac_init() EXPORT_SYMBOL and __init is a bad combination because the .init.text section is freed up after the initialization. Hence, modules cannot use symbols annotated __init. The access to a freed symbol may end up with kernel panic. modpost used to detect it, but it has been broken for a decade. Recently, I fixed modpost so it started to warn it again, then this showed up in linux-next builds. There are two ways to fix it: - Remove __init - Remove EXPORT_SYMBOL I chose the latter for this case because the caller (net/ipv6/seg6.c) and the callee (net/ipv6/seg6_hmac.c) belong to the same module. It seems an internal function call in ipv6.ko. Fixes: bf355b8d2c30 ("ipv6: sr: add core files for SR HMAC support") Reported-by: Stephen Rothwell Signed-off-by: Masahiro Yamada Signed-off-by: Jakub Kicinski --- net/ipv6/seg6_hmac.c | 1 - 1 file changed, 1 deletion(-) diff --git a/net/ipv6/seg6_hmac.c b/net/ipv6/seg6_hmac.c index 29bc4e7c3046..6de01185cc68 100644 --- a/net/ipv6/seg6_hmac.c +++ b/net/ipv6/seg6_hmac.c @@ -399,7 +399,6 @@ int __init seg6_hmac_init(void) { return seg6_hmac_init_algo(); } -EXPORT_SYMBOL(seg6_hmac_init); int __net_init seg6_hmac_net_init(struct net *net) { From 77e5fe8f176a525523ae091d6fd0fbb8834c156d Mon Sep 17 00:00:00 2001 From: Martin Faltesek Date: Mon, 6 Jun 2022 21:57:27 -0500 Subject: [PATCH 54/79] nfc: st21nfca: fix incorrect validating logic in EVT_TRANSACTION The first validation check for EVT_TRANSACTION has two different checks tied together with logical AND. One is a check for minimum packet length, and the other is for a valid aid_tag. If either condition is true (fails), then an error should be triggered. The fix is to change && to ||. Fixes: 26fc6c7f02cb ("NFC: st21nfca: Add HCI transaction event support") Cc: stable@vger.kernel.org Signed-off-by: Martin Faltesek Reviewed-by: Guenter Roeck Reviewed-by: Krzysztof Kozlowski Signed-off-by: Jakub Kicinski --- drivers/nfc/st21nfca/se.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nfc/st21nfca/se.c b/drivers/nfc/st21nfca/se.c index 7e213f8ddc98..9645777f2544 100644 --- a/drivers/nfc/st21nfca/se.c +++ b/drivers/nfc/st21nfca/se.c @@ -315,7 +315,7 @@ int st21nfca_connectivity_event_received(struct nfc_hci_dev *hdev, u8 host, * AID 81 5 to 16 * PARAMETERS 82 0 to 255 */ - if (skb->len < NFC_MIN_AID_LENGTH + 2 && + if (skb->len < NFC_MIN_AID_LENGTH + 2 || skb->data[0] != NFC_EVT_TRANSACTION_AID_TAG) return -EPROTO; From 996419e0594abb311fb958553809f24f38e7abbe Mon Sep 17 00:00:00 2001 From: Martin Faltesek Date: Mon, 6 Jun 2022 21:57:28 -0500 Subject: [PATCH 55/79] nfc: st21nfca: fix memory leaks in EVT_TRANSACTION handling Error paths do not free previously allocated memory. Add devm_kfree() to those failure paths. Fixes: 26fc6c7f02cb ("NFC: st21nfca: Add HCI transaction event support") Fixes: 4fbcc1a4cb20 ("nfc: st21nfca: Fix potential buffer overflows in EVT_TRANSACTION") Cc: stable@vger.kernel.org Signed-off-by: Martin Faltesek Reviewed-by: Guenter Roeck Reviewed-by: Krzysztof Kozlowski Signed-off-by: Jakub Kicinski --- drivers/nfc/st21nfca/se.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/nfc/st21nfca/se.c b/drivers/nfc/st21nfca/se.c index 9645777f2544..8e1113ce139b 100644 --- a/drivers/nfc/st21nfca/se.c +++ b/drivers/nfc/st21nfca/se.c @@ -326,22 +326,29 @@ int st21nfca_connectivity_event_received(struct nfc_hci_dev *hdev, u8 host, transaction->aid_len = skb->data[1]; /* Checking if the length of the AID is valid */ - if (transaction->aid_len > sizeof(transaction->aid)) + if (transaction->aid_len > sizeof(transaction->aid)) { + devm_kfree(dev, transaction); return -EINVAL; + } memcpy(transaction->aid, &skb->data[2], transaction->aid_len); /* Check next byte is PARAMETERS tag (82) */ if (skb->data[transaction->aid_len + 2] != - NFC_EVT_TRANSACTION_PARAMS_TAG) + NFC_EVT_TRANSACTION_PARAMS_TAG) { + devm_kfree(dev, transaction); return -EPROTO; + } transaction->params_len = skb->data[transaction->aid_len + 3]; /* Total size is allocated (skb->len - 2) minus fixed array members */ - if (transaction->params_len > ((skb->len - 2) - sizeof(struct nfc_evt_transaction))) + if (transaction->params_len > ((skb->len - 2) - + sizeof(struct nfc_evt_transaction))) { + devm_kfree(dev, transaction); return -EINVAL; + } memcpy(transaction->params, skb->data + transaction->aid_len + 4, transaction->params_len); From f2e19b36593caed4c977c2f55aeba7408aeb2132 Mon Sep 17 00:00:00 2001 From: Martin Faltesek Date: Mon, 6 Jun 2022 21:57:29 -0500 Subject: [PATCH 56/79] nfc: st21nfca: fix incorrect sizing calculations in EVT_TRANSACTION The transaction buffer is allocated by using the size of the packet buf, and subtracting two which seem intended to remove the two tags which are not present in the target structure. This calculation leads to under counting memory because of differences between the packet contents and the target structure. The aid_len field is a u8 in the packet, but a u32 in the structure, resulting in at least 3 bytes always being under counted. Further, the aid data is a variable length field in the packet, but fixed in the structure, so if this field is less than the max, the difference is added to the under counting. The last validation check for transaction->params_len is also incorrect since it employs the same accounting error. To fix, perform validation checks progressively to safely reach the next field, to determine the size of both buffers and verify both tags. Once all validation checks pass, allocate the buffer and copy the data. This eliminates freeing memory on the error path, as those checks are moved ahead of memory allocation. Fixes: 26fc6c7f02cb ("NFC: st21nfca: Add HCI transaction event support") Fixes: 4fbcc1a4cb20 ("nfc: st21nfca: Fix potential buffer overflows in EVT_TRANSACTION") Cc: stable@vger.kernel.org Signed-off-by: Martin Faltesek Reviewed-by: Guenter Roeck Reviewed-by: Krzysztof Kozlowski Signed-off-by: Jakub Kicinski --- drivers/nfc/st21nfca/se.c | 66 +++++++++++++++++++-------------------- 1 file changed, 33 insertions(+), 33 deletions(-) diff --git a/drivers/nfc/st21nfca/se.c b/drivers/nfc/st21nfca/se.c index 8e1113ce139b..df8d27cf2956 100644 --- a/drivers/nfc/st21nfca/se.c +++ b/drivers/nfc/st21nfca/se.c @@ -300,6 +300,8 @@ int st21nfca_connectivity_event_received(struct nfc_hci_dev *hdev, u8 host, int r = 0; struct device *dev = &hdev->ndev->dev; struct nfc_evt_transaction *transaction; + u32 aid_len; + u8 params_len; pr_debug("connectivity gate event: %x\n", event); @@ -308,50 +310,48 @@ int st21nfca_connectivity_event_received(struct nfc_hci_dev *hdev, u8 host, r = nfc_se_connectivity(hdev->ndev, host); break; case ST21NFCA_EVT_TRANSACTION: - /* - * According to specification etsi 102 622 + /* According to specification etsi 102 622 * 11.2.2.4 EVT_TRANSACTION Table 52 * Description Tag Length * AID 81 5 to 16 * PARAMETERS 82 0 to 255 + * + * The key differences are aid storage length is variably sized + * in the packet, but fixed in nfc_evt_transaction, and that the aid_len + * is u8 in the packet, but u32 in the structure, and the tags in + * the packet are not included in nfc_evt_transaction. + * + * size in bytes: 1 1 5-16 1 1 0-255 + * offset: 0 1 2 aid_len + 2 aid_len + 3 aid_len + 4 + * member name: aid_tag(M) aid_len aid params_tag(M) params_len params + * example: 0x81 5-16 X 0x82 0-255 X */ - if (skb->len < NFC_MIN_AID_LENGTH + 2 || - skb->data[0] != NFC_EVT_TRANSACTION_AID_TAG) + if (skb->len < 2 || skb->data[0] != NFC_EVT_TRANSACTION_AID_TAG) return -EPROTO; - transaction = devm_kzalloc(dev, skb->len - 2, GFP_KERNEL); + aid_len = skb->data[1]; + + if (skb->len < aid_len + 4 || aid_len > sizeof(transaction->aid)) + return -EPROTO; + + params_len = skb->data[aid_len + 3]; + + /* Verify PARAMETERS tag is (82), and final check that there is enough + * space in the packet to read everything. + */ + if ((skb->data[aid_len + 2] != NFC_EVT_TRANSACTION_PARAMS_TAG) || + (skb->len < aid_len + 4 + params_len)) + return -EPROTO; + + transaction = devm_kzalloc(dev, sizeof(*transaction) + params_len, GFP_KERNEL); if (!transaction) return -ENOMEM; - transaction->aid_len = skb->data[1]; + transaction->aid_len = aid_len; + transaction->params_len = params_len; - /* Checking if the length of the AID is valid */ - if (transaction->aid_len > sizeof(transaction->aid)) { - devm_kfree(dev, transaction); - return -EINVAL; - } - - memcpy(transaction->aid, &skb->data[2], - transaction->aid_len); - - /* Check next byte is PARAMETERS tag (82) */ - if (skb->data[transaction->aid_len + 2] != - NFC_EVT_TRANSACTION_PARAMS_TAG) { - devm_kfree(dev, transaction); - return -EPROTO; - } - - transaction->params_len = skb->data[transaction->aid_len + 3]; - - /* Total size is allocated (skb->len - 2) minus fixed array members */ - if (transaction->params_len > ((skb->len - 2) - - sizeof(struct nfc_evt_transaction))) { - devm_kfree(dev, transaction); - return -EINVAL; - } - - memcpy(transaction->params, skb->data + - transaction->aid_len + 4, transaction->params_len); + memcpy(transaction->aid, &skb->data[2], aid_len); + memcpy(transaction->params, &skb->data[aid_len + 4], params_len); r = nfc_se_transaction(hdev->ndev, host, transaction); break; From 8a4d480702b71184fabcf379b80bf7539716752e Mon Sep 17 00:00:00 2001 From: Xiaohui Zhang Date: Tue, 7 Jun 2022 16:32:30 +0800 Subject: [PATCH 57/79] nfc: nfcmrvl: Fix memory leak in nfcmrvl_play_deferred Similar to the handling of play_deferred in commit 19cfe912c37b ("Bluetooth: btusb: Fix memory leak in play_deferred"), we thought a patch might be needed here as well. Currently usb_submit_urb is called directly to submit deferred tx urbs after unanchor them. So the usb_giveback_urb_bh would failed to unref it in usb_unanchor_urb and cause memory leak. Put those urbs in tx_anchor to avoid the leak, and also fix the error handling. Signed-off-by: Xiaohui Zhang Acked-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20220607083230.6182-1-xiaohuizhang@ruc.edu.cn Signed-off-by: Jakub Kicinski --- drivers/nfc/nfcmrvl/usb.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/drivers/nfc/nfcmrvl/usb.c b/drivers/nfc/nfcmrvl/usb.c index a99aedff795d..ea7309453096 100644 --- a/drivers/nfc/nfcmrvl/usb.c +++ b/drivers/nfc/nfcmrvl/usb.c @@ -388,13 +388,25 @@ static void nfcmrvl_play_deferred(struct nfcmrvl_usb_drv_data *drv_data) int err; while ((urb = usb_get_from_anchor(&drv_data->deferred))) { + usb_anchor_urb(urb, &drv_data->tx_anchor); + err = usb_submit_urb(urb, GFP_ATOMIC); - if (err) + if (err) { + kfree(urb->setup_packet); + usb_unanchor_urb(urb); + usb_free_urb(urb); break; + } drv_data->tx_in_flight++; + usb_free_urb(urb); + } + + /* Cleanup the rest deferred urbs. */ + while ((urb = usb_get_from_anchor(&drv_data->deferred))) { + kfree(urb->setup_packet); + usb_free_urb(urb); } - usb_scuttle_anchored_urbs(&drv_data->deferred); } static int nfcmrvl_resume(struct usb_interface *intf) From f93431c86b631bbca5614c66f966bf3ddb3c2803 Mon Sep 17 00:00:00 2001 From: Wang Yufen Date: Tue, 7 Jun 2022 20:00:27 +0800 Subject: [PATCH 58/79] ipv6: Fix signed integer overflow in __ip6_append_data Resurrect ubsan overflow checks and ubsan report this warning, fix it by change the variable [length] type to size_t. UBSAN: signed-integer-overflow in net/ipv6/ip6_output.c:1489:19 2147479552 + 8567 cannot be represented in type 'int' CPU: 0 PID: 253 Comm: err Not tainted 5.16.0+ #1 Hardware name: linux,dummy-virt (DT) Call trace: dump_backtrace+0x214/0x230 show_stack+0x30/0x78 dump_stack_lvl+0xf8/0x118 dump_stack+0x18/0x30 ubsan_epilogue+0x18/0x60 handle_overflow+0xd0/0xf0 __ubsan_handle_add_overflow+0x34/0x44 __ip6_append_data.isra.48+0x1598/0x1688 ip6_append_data+0x128/0x260 udpv6_sendmsg+0x680/0xdd0 inet6_sendmsg+0x54/0x90 sock_sendmsg+0x70/0x88 ____sys_sendmsg+0xe8/0x368 ___sys_sendmsg+0x98/0xe0 __sys_sendmmsg+0xf4/0x3b8 __arm64_sys_sendmmsg+0x34/0x48 invoke_syscall+0x64/0x160 el0_svc_common.constprop.4+0x124/0x300 do_el0_svc+0x44/0xc8 el0_svc+0x3c/0x1e8 el0t_64_sync_handler+0x88/0xb0 el0t_64_sync+0x16c/0x170 Changes since v1: -Change the variable [length] type to unsigned, as Eric Dumazet suggested. Changes since v2: -Don't change exthdrlen type in ip6_make_skb, as Paolo Abeni suggested. Changes since v3: -Don't change ulen type in udpv6_sendmsg and l2tp_ip6_sendmsg, as Jakub Kicinski suggested. Reported-by: Hulk Robot Signed-off-by: Wang Yufen Link: https://lore.kernel.org/r/20220607120028.845916-1-wangyufen@huawei.com Signed-off-by: Jakub Kicinski --- include/net/ipv6.h | 4 ++-- net/ipv6/ip6_output.c | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 5b38bf1a586b..de9dcc5652c4 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -1063,7 +1063,7 @@ int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr); int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), - void *from, int length, int transhdrlen, + void *from, size_t length, int transhdrlen, struct ipcm6_cookie *ipc6, struct flowi6 *fl6, struct rt6_info *rt, unsigned int flags); @@ -1079,7 +1079,7 @@ struct sk_buff *__ip6_make_skb(struct sock *sk, struct sk_buff_head *queue, struct sk_buff *ip6_make_skb(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), - void *from, int length, int transhdrlen, + void *from, size_t length, int transhdrlen, struct ipcm6_cookie *ipc6, struct rt6_info *rt, unsigned int flags, struct inet_cork_full *cork); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 4081b12a01ff..77e3f5970ce4 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1450,7 +1450,7 @@ static int __ip6_append_data(struct sock *sk, struct page_frag *pfrag, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), - void *from, int length, int transhdrlen, + void *from, size_t length, int transhdrlen, unsigned int flags, struct ipcm6_cookie *ipc6) { struct sk_buff *skb, *skb_prev = NULL; @@ -1798,7 +1798,7 @@ error: int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), - void *from, int length, int transhdrlen, + void *from, size_t length, int transhdrlen, struct ipcm6_cookie *ipc6, struct flowi6 *fl6, struct rt6_info *rt, unsigned int flags) { @@ -1995,7 +1995,7 @@ EXPORT_SYMBOL_GPL(ip6_flush_pending_frames); struct sk_buff *ip6_make_skb(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), - void *from, int length, int transhdrlen, + void *from, size_t length, int transhdrlen, struct ipcm6_cookie *ipc6, struct rt6_info *rt, unsigned int flags, struct inet_cork_full *cork) { From f638a84afef3dfe10554c51820c16e39a278c915 Mon Sep 17 00:00:00 2001 From: Wang Yufen Date: Tue, 7 Jun 2022 20:00:28 +0800 Subject: [PATCH 59/79] ipv6: Fix signed integer overflow in l2tp_ip6_sendmsg When len >= INT_MAX - transhdrlen, ulen = len + transhdrlen will be overflow. To fix, we can follow what udpv6 does and subtract the transhdrlen from the max. Signed-off-by: Wang Yufen Link: https://lore.kernel.org/r/20220607120028.845916-2-wangyufen@huawei.com Signed-off-by: Jakub Kicinski --- net/l2tp/l2tp_ip6.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index c6ff8bf9b55f..9dbd801ddb98 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -504,14 +504,15 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) struct ipcm6_cookie ipc6; int addr_len = msg->msg_namelen; int transhdrlen = 4; /* zero session-id */ - int ulen = len + transhdrlen; + int ulen; int err; /* Rough check on arithmetic overflow, * better check is made in ip6_append_data(). */ - if (len > INT_MAX) + if (len > INT_MAX - transhdrlen) return -EMSGSIZE; + ulen = len + transhdrlen; /* Mirror BSD error message compatibility */ if (msg->msg_flags & MSG_OOB) From a6958951ebe7db60e84b2437ee53aa4843028726 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 7 Jun 2022 11:01:46 +0200 Subject: [PATCH 60/79] au1000_eth: stop using virt_to_bus() The conversion to the dma-mapping API in linux-2.6.11 was incomplete and left a virt_to_bus() call around. There have been a number of fixes for DMA mapping API abuse in this driver, but this one always slipped through. Change it to just use the existing dma_addr_t pointer, and make it use the correct types throughout the driver to make it easier to understand the virtual vs dma address spaces. Signed-off-by: Arnd Bergmann Tested-by: Manuel Lauss Link: https://lore.kernel.org/r/20220607090206.19830-1-arnd@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/amd/au1000_eth.c | 22 +++++++++++----------- drivers/net/ethernet/amd/au1000_eth.h | 4 ++-- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/drivers/net/ethernet/amd/au1000_eth.c b/drivers/net/ethernet/amd/au1000_eth.c index c6f003975621..d5f2c6989221 100644 --- a/drivers/net/ethernet/amd/au1000_eth.c +++ b/drivers/net/ethernet/amd/au1000_eth.c @@ -820,7 +820,7 @@ static int au1000_rx(struct net_device *dev) pr_cont("\n"); } } - prxd->buff_stat = (u32)(pDB->dma_addr | RX_DMA_ENABLE); + prxd->buff_stat = lower_32_bits(pDB->dma_addr) | RX_DMA_ENABLE; aup->rx_head = (aup->rx_head + 1) & (NUM_RX_DMA - 1); wmb(); /* drain writebuffer */ @@ -996,7 +996,7 @@ static netdev_tx_t au1000_tx(struct sk_buff *skb, struct net_device *dev) ps->tx_packets++; ps->tx_bytes += ptxd->len; - ptxd->buff_stat = pDB->dma_addr | TX_DMA_ENABLE; + ptxd->buff_stat = lower_32_bits(pDB->dma_addr) | TX_DMA_ENABLE; wmb(); /* drain writebuffer */ dev_kfree_skb(skb); aup->tx_head = (aup->tx_head + 1) & (NUM_TX_DMA - 1); @@ -1131,9 +1131,9 @@ static int au1000_probe(struct platform_device *pdev) /* Allocate the data buffers * Snooping works fine with eth on all au1xxx */ - aup->vaddr = (u32)dma_alloc_coherent(&pdev->dev, MAX_BUF_SIZE * - (NUM_TX_BUFFS + NUM_RX_BUFFS), - &aup->dma_addr, 0); + aup->vaddr = dma_alloc_coherent(&pdev->dev, MAX_BUF_SIZE * + (NUM_TX_BUFFS + NUM_RX_BUFFS), + &aup->dma_addr, 0); if (!aup->vaddr) { dev_err(&pdev->dev, "failed to allocate data buffers\n"); err = -ENOMEM; @@ -1234,8 +1234,8 @@ static int au1000_probe(struct platform_device *pdev) for (i = 0; i < (NUM_TX_BUFFS+NUM_RX_BUFFS); i++) { pDB->pnext = pDBfree; pDBfree = pDB; - pDB->vaddr = (u32 *)((unsigned)aup->vaddr + MAX_BUF_SIZE*i); - pDB->dma_addr = (dma_addr_t)virt_to_bus(pDB->vaddr); + pDB->vaddr = aup->vaddr + MAX_BUF_SIZE * i; + pDB->dma_addr = aup->dma_addr + MAX_BUF_SIZE * i; pDB++; } aup->pDBfree = pDBfree; @@ -1246,7 +1246,7 @@ static int au1000_probe(struct platform_device *pdev) if (!pDB) goto err_out; - aup->rx_dma_ring[i]->buff_stat = (unsigned)pDB->dma_addr; + aup->rx_dma_ring[i]->buff_stat = lower_32_bits(pDB->dma_addr); aup->rx_db_inuse[i] = pDB; } @@ -1255,7 +1255,7 @@ static int au1000_probe(struct platform_device *pdev) if (!pDB) goto err_out; - aup->tx_dma_ring[i]->buff_stat = (unsigned)pDB->dma_addr; + aup->tx_dma_ring[i]->buff_stat = lower_32_bits(pDB->dma_addr); aup->tx_dma_ring[i]->len = 0; aup->tx_db_inuse[i] = pDB; } @@ -1310,7 +1310,7 @@ err_remap2: iounmap(aup->mac); err_remap1: dma_free_coherent(&pdev->dev, MAX_BUF_SIZE * (NUM_TX_BUFFS + NUM_RX_BUFFS), - (void *)aup->vaddr, aup->dma_addr); + aup->vaddr, aup->dma_addr); err_vaddr: free_netdev(dev); err_alloc: @@ -1343,7 +1343,7 @@ static int au1000_remove(struct platform_device *pdev) au1000_ReleaseDB(aup, aup->tx_db_inuse[i]); dma_free_coherent(&pdev->dev, MAX_BUF_SIZE * (NUM_TX_BUFFS + NUM_RX_BUFFS), - (void *)aup->vaddr, aup->dma_addr); + aup->vaddr, aup->dma_addr); iounmap(aup->macdma); iounmap(aup->mac); diff --git a/drivers/net/ethernet/amd/au1000_eth.h b/drivers/net/ethernet/amd/au1000_eth.h index e3a3ed29db61..2489c2f4fd8a 100644 --- a/drivers/net/ethernet/amd/au1000_eth.h +++ b/drivers/net/ethernet/amd/au1000_eth.h @@ -106,8 +106,8 @@ struct au1000_private { struct mac_reg *mac; /* mac registers */ u32 *enable; /* address of MAC Enable Register */ void __iomem *macdma; /* base of MAC DMA port */ - u32 vaddr; /* virtual address of rx/tx buffers */ - dma_addr_t dma_addr; /* dma address of rx/tx buffers */ + void *vaddr; /* virtual address of rx/tx buffers */ + dma_addr_t dma_addr; /* dma address of rx/tx buffers */ spinlock_t lock; /* Serialise access to device */ From 6bfb56e93bcef41859c2d5ab234ffd80b691be35 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 8 Jun 2022 13:18:39 -0700 Subject: [PATCH 61/79] cert host tools: Stop complaining about deprecated OpenSSL functions OpenSSL 3.0 deprecated the OpenSSL's ENGINE API. That is as may be, but the kernel build host tools still use it. Disable the warning about deprecated declarations until somebody who cares fixes it. Signed-off-by: Linus Torvalds --- certs/extract-cert.c | 7 +++++++ scripts/sign-file.c | 7 +++++++ 2 files changed, 14 insertions(+) diff --git a/certs/extract-cert.c b/certs/extract-cert.c index f7ef7862f207..8c1fb9a70d66 100644 --- a/certs/extract-cert.c +++ b/certs/extract-cert.c @@ -23,6 +23,13 @@ #include #include +/* + * OpenSSL 3.0 deprecates the OpenSSL's ENGINE API. + * + * Remove this if/when that API is no longer used + */ +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" + #define PKEY_ID_PKCS7 2 static __attribute__((noreturn)) diff --git a/scripts/sign-file.c b/scripts/sign-file.c index fbd34b8e8f57..7434e9ea926e 100644 --- a/scripts/sign-file.c +++ b/scripts/sign-file.c @@ -29,6 +29,13 @@ #include #include +/* + * OpenSSL 3.0 deprecates the OpenSSL's ENGINE API. + * + * Remove this if/when that API is no longer used + */ +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" + /* * Use CMS if we have openssl-1.0.0 or newer available - otherwise we have to * assume that it's not available and its header file is missing and that we From 8d21e9963bec1aad2280cdd034c8993033ef2948 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 6 Jun 2022 09:21:07 -0400 Subject: [PATCH 62/79] ip_gre: test csum_start instead of transport header GRE with TUNNEL_CSUM will apply local checksum offload on CHECKSUM_PARTIAL packets. ipgre_xmit must validate csum_start after an optional skb_pull, else lco_csum may trigger an overflow. The original check was if (csum && skb_checksum_start(skb) < skb->data) return -EINVAL; This had false positives when skb_checksum_start is undefined: when ip_summed is not CHECKSUM_PARTIAL. A discussed refinement was straightforward if (csum && skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_start(skb) < skb->data) return -EINVAL; But was eventually revised more thoroughly: - restrict the check to the only branch where needed, in an uncommon GRE path that uses header_ops and calls skb_pull. - test skb_transport_header, which is set along with csum_start in skb_partial_csum_set in the normal header_ops datapath. Turns out skbs can arrive in this branch without the transport header set, e.g., through BPF redirection. Revise the check back to check csum_start directly, and only if CHECKSUM_PARTIAL. Do leave the check in the updated location. Check field regardless of whether TUNNEL_CSUM is configured. Link: https://lore.kernel.org/netdev/YS+h%2FtqCJJiQei+W@shredder/ Link: https://lore.kernel.org/all/20210902193447.94039-2-willemdebruijn.kernel@gmail.com/T/#u Fixes: 8a0ed250f911 ("ip_gre: validate csum_start only on pull") Reported-by: syzbot Signed-off-by: Willem de Bruijn Reviewed-by: Eric Dumazet Reviewed-by: Alexander Duyck Link: https://lore.kernel.org/r/20220606132107.3582565-1-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski --- net/ipv4/ip_gre.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 7e474a85deaf..3b9cd487075a 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -629,21 +629,20 @@ static netdev_tx_t ipgre_xmit(struct sk_buff *skb, } if (dev->header_ops) { - const int pull_len = tunnel->hlen + sizeof(struct iphdr); - if (skb_cow_head(skb, 0)) goto free_skb; tnl_params = (const struct iphdr *)skb->data; - if (pull_len > skb_transport_offset(skb)) - goto free_skb; - /* Pull skb since ip_tunnel_xmit() needs skb->data pointing * to gre header. */ - skb_pull(skb, pull_len); + skb_pull(skb, tunnel->hlen + sizeof(struct iphdr)); skb_reset_mac_header(skb); + + if (skb->ip_summed == CHECKSUM_PARTIAL && + skb_checksum_start(skb) < skb->data) + goto free_skb; } else { if (skb_cow_head(skb, dev->needed_headroom)) goto free_skb; From 2f2c0d2919a14002760f89f4e02960c735a316d2 Mon Sep 17 00:00:00 2001 From: Chen Lin Date: Wed, 8 Jun 2022 20:46:53 +0800 Subject: [PATCH 63/79] net: ethernet: mtk_eth_soc: fix misuse of mem alloc interface netdev[napi]_alloc_frag When rx_flag == MTK_RX_FLAGS_HWLRO, rx_data_len = MTK_MAX_LRO_RX_LENGTH(4096 * 3) > PAGE_SIZE. netdev_alloc_frag is for alloction of page fragment only. Reference to other drivers and Documentation/vm/page_frags.rst Branch to use __get_free_pages when ring->frag_size > PAGE_SIZE. Signed-off-by: Chen Lin Link: https://lore.kernel.org/r/1654692413-2598-1-git-send-email-chen45464546@163.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mediatek/mtk_eth_soc.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c b/drivers/net/ethernet/mediatek/mtk_eth_soc.c index b3b3c079a0fa..59c9a10f83ba 100644 --- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c +++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c @@ -899,6 +899,17 @@ static bool mtk_rx_get_desc(struct mtk_eth *eth, struct mtk_rx_dma_v2 *rxd, return true; } +static void *mtk_max_lro_buf_alloc(gfp_t gfp_mask) +{ + unsigned int size = mtk_max_frag_size(MTK_MAX_LRO_RX_LENGTH); + unsigned long data; + + data = __get_free_pages(gfp_mask | __GFP_COMP | __GFP_NOWARN, + get_order(size)); + + return (void *)data; +} + /* the qdma core needs scratch memory to be setup */ static int mtk_init_fq_dma(struct mtk_eth *eth) { @@ -1467,7 +1478,10 @@ static int mtk_poll_rx(struct napi_struct *napi, int budget, goto release_desc; /* alloc new buffer */ - new_data = napi_alloc_frag(ring->frag_size); + if (ring->frag_size <= PAGE_SIZE) + new_data = napi_alloc_frag(ring->frag_size); + else + new_data = mtk_max_lro_buf_alloc(GFP_ATOMIC); if (unlikely(!new_data)) { netdev->stats.rx_dropped++; goto release_desc; @@ -1914,7 +1928,10 @@ static int mtk_rx_alloc(struct mtk_eth *eth, int ring_no, int rx_flag) return -ENOMEM; for (i = 0; i < rx_dma_size; i++) { - ring->data[i] = netdev_alloc_frag(ring->frag_size); + if (ring->frag_size <= PAGE_SIZE) + ring->data[i] = netdev_alloc_frag(ring->frag_size); + else + ring->data[i] = mtk_max_lro_buf_alloc(GFP_KERNEL); if (!ring->data[i]) return -ENOMEM; } From 2061ecfdf2350994e5b61c43e50e98a7a70e95ee Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Tue, 7 Jun 2022 00:11:40 +0200 Subject: [PATCH 64/79] net: openvswitch: fix misuse of the cached connection on tuple changes If packet headers changed, the cached nfct is no longer relevant for the packet and attempt to re-use it leads to the incorrect packet classification. This issue is causing broken connectivity in OpenStack deployments with OVS/OVN due to hairpin traffic being unexpectedly dropped. The setup has datapath flows with several conntrack actions and tuple changes between them: actions:ct(commit,zone=8,mark=0/0x1,nat(src)), set(eth(src=00:00:00:00:00:01,dst=00:00:00:00:00:06)), set(ipv4(src=172.18.2.10,dst=192.168.100.6,ttl=62)), ct(zone=8),recirc(0x4) After the first ct() action the packet headers are almost fully re-written. The next ct() tries to re-use the existing nfct entry and marks the packet as invalid, so it gets dropped later in the pipeline. Clearing the cached conntrack entry whenever packet tuple is changed to avoid the issue. The flow key should not be cleared though, because we should still be able to match on the ct_state if the recirculation happens after the tuple change but before the next ct() action. Cc: stable@vger.kernel.org Fixes: 7f8a436eaa2c ("openvswitch: Add conntrack action") Reported-by: Frode Nordahl Link: https://mail.openvswitch.org/pipermail/ovs-discuss/2022-May/051829.html Link: https://bugs.launchpad.net/ubuntu/+source/ovn/+bug/1967856 Signed-off-by: Ilya Maximets Link: https://lore.kernel.org/r/20220606221140.488984-1-i.maximets@ovn.org Signed-off-by: Jakub Kicinski --- net/openvswitch/actions.c | 6 ++++++ net/openvswitch/conntrack.c | 4 +++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 1b5d73079dc9..868db4669a29 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -373,6 +373,7 @@ static void set_ip_addr(struct sk_buff *skb, struct iphdr *nh, update_ip_l4_checksum(skb, nh, *addr, new_addr); csum_replace4(&nh->check, *addr, new_addr); skb_clear_hash(skb); + ovs_ct_clear(skb, NULL); *addr = new_addr; } @@ -420,6 +421,7 @@ static void set_ipv6_addr(struct sk_buff *skb, u8 l4_proto, update_ipv6_checksum(skb, l4_proto, addr, new_addr); skb_clear_hash(skb); + ovs_ct_clear(skb, NULL); memcpy(addr, new_addr, sizeof(__be32[4])); } @@ -660,6 +662,7 @@ static int set_nsh(struct sk_buff *skb, struct sw_flow_key *flow_key, static void set_tp_port(struct sk_buff *skb, __be16 *port, __be16 new_port, __sum16 *check) { + ovs_ct_clear(skb, NULL); inet_proto_csum_replace2(check, skb, *port, new_port, false); *port = new_port; } @@ -699,6 +702,7 @@ static int set_udp(struct sk_buff *skb, struct sw_flow_key *flow_key, uh->dest = dst; flow_key->tp.src = src; flow_key->tp.dst = dst; + ovs_ct_clear(skb, NULL); } skb_clear_hash(skb); @@ -761,6 +765,8 @@ static int set_sctp(struct sk_buff *skb, struct sw_flow_key *flow_key, sh->checksum = old_csum ^ old_correct_csum ^ new_csum; skb_clear_hash(skb); + ovs_ct_clear(skb, NULL); + flow_key->tp.src = sh->source; flow_key->tp.dst = sh->dest; diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 4a947c13c813..4e70df91d0f2 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -1342,7 +1342,9 @@ int ovs_ct_clear(struct sk_buff *skb, struct sw_flow_key *key) nf_ct_put(ct); nf_ct_set(skb, NULL, IP_CT_UNTRACKED); - ovs_ct_fill_key(skb, key, false); + + if (key) + ovs_ct_fill_key(skb, key, false); return 0; } From 11ec18b1d8d92b9df307d31950dcba0b3dd7283c Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Tue, 7 Jun 2022 08:11:43 +0400 Subject: [PATCH 65/79] net: altera: Fix refcount leak in altera_tse_mdio_create Every iteration of for_each_child_of_node() decrements the reference count of the previous node. When break from a for_each_child_of_node() loop, we need to explicitly call of_node_put() on the child node when not need anymore. Add missing of_node_put() to avoid refcount leak. Fixes: bbd2190ce96d ("Altera TSE: Add main and header file for Altera Ethernet Driver") Signed-off-by: Miaoqian Lin Link: https://lore.kernel.org/r/20220607041144.7553-1-linmq006@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/altera/altera_tse_main.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/altera/altera_tse_main.c b/drivers/net/ethernet/altera/altera_tse_main.c index a3816264c35c..8c5828582c21 100644 --- a/drivers/net/ethernet/altera/altera_tse_main.c +++ b/drivers/net/ethernet/altera/altera_tse_main.c @@ -163,7 +163,8 @@ static int altera_tse_mdio_create(struct net_device *dev, unsigned int id) mdio = mdiobus_alloc(); if (mdio == NULL) { netdev_err(dev, "Error allocating MDIO bus\n"); - return -ENOMEM; + ret = -ENOMEM; + goto put_node; } mdio->name = ALTERA_TSE_RESOURCE_NAME; @@ -180,6 +181,7 @@ static int altera_tse_mdio_create(struct net_device *dev, unsigned int id) mdio->id); goto out_free_mdio; } + of_node_put(mdio_node); if (netif_msg_drv(priv)) netdev_info(dev, "MDIO bus %s: created\n", mdio->id); @@ -189,6 +191,8 @@ static int altera_tse_mdio_create(struct net_device *dev, unsigned int id) out_free_mdio: mdiobus_free(mdio); mdio = NULL; +put_node: + of_node_put(mdio_node); return ret; } From 47e96930d6e6106d5252e85b868d3c7e29296de0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Beh=C3=BAn?= Date: Tue, 7 Jun 2022 12:28:42 +0100 Subject: [PATCH 66/79] net: dsa: mv88e6xxx: use BMSR_ANEGCOMPLETE bit for filling an_complete MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit ede359d8843a ("net: dsa: mv88e6xxx: Link in pcs_get_state() if AN is bypassed") added the ability to link if AN was bypassed, and added filling of state->an_complete field, but set it to true if AN was enabled in BMCR, not when AN was reported complete in BMSR. This was done because for some reason, when I wanted to use BMSR value to infer an_complete, I was looking at BMSR_ANEGCAPABLE bit (which was always 1), instead of BMSR_ANEGCOMPLETE bit. Use BMSR_ANEGCOMPLETE for filling state->an_complete. Fixes: ede359d8843a ("net: dsa: mv88e6xxx: Link in pcs_get_state() if AN is bypassed") Signed-off-by: Marek Behún Signed-off-by: Russell King (Oracle) Signed-off-by: Jakub Kicinski --- drivers/net/dsa/mv88e6xxx/serdes.c | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) diff --git a/drivers/net/dsa/mv88e6xxx/serdes.c b/drivers/net/dsa/mv88e6xxx/serdes.c index 7b37d45bc9fb..1a19c5284f2c 100644 --- a/drivers/net/dsa/mv88e6xxx/serdes.c +++ b/drivers/net/dsa/mv88e6xxx/serdes.c @@ -50,22 +50,17 @@ static int mv88e6390_serdes_write(struct mv88e6xxx_chip *chip, } static int mv88e6xxx_serdes_pcs_get_state(struct mv88e6xxx_chip *chip, - u16 ctrl, u16 status, u16 lpa, + u16 bmsr, u16 lpa, u16 status, struct phylink_link_state *state) { state->link = !!(status & MV88E6390_SGMII_PHY_STATUS_LINK); + state->an_complete = !!(bmsr & BMSR_ANEGCOMPLETE); if (status & MV88E6390_SGMII_PHY_STATUS_SPD_DPL_VALID) { /* The Spped and Duplex Resolved register is 1 if AN is enabled * and complete, or if AN is disabled. So with disabled AN we - * still get here on link up. But we want to set an_complete - * only if AN was enabled, thus we look at BMCR_ANENABLE. - * (According to 802.3-2008 section 22.2.4.2.10, we should be - * able to get this same value from BMSR_ANEGCAPABLE, but tests - * show that these Marvell PHYs don't conform to this part of - * the specificaion - BMSR_ANEGCAPABLE is simply always 1.) + * still get here on link up. */ - state->an_complete = !!(ctrl & BMCR_ANENABLE); state->duplex = status & MV88E6390_SGMII_PHY_STATUS_DUPLEX_FULL ? DUPLEX_FULL : DUPLEX_HALF; @@ -191,12 +186,12 @@ int mv88e6352_serdes_pcs_config(struct mv88e6xxx_chip *chip, int port, int mv88e6352_serdes_pcs_get_state(struct mv88e6xxx_chip *chip, int port, int lane, struct phylink_link_state *state) { - u16 lpa, status, ctrl; + u16 bmsr, lpa, status; int err; - err = mv88e6352_serdes_read(chip, MII_BMCR, &ctrl); + err = mv88e6352_serdes_read(chip, MII_BMSR, &bmsr); if (err) { - dev_err(chip->dev, "can't read Serdes PHY control: %d\n", err); + dev_err(chip->dev, "can't read Serdes BMSR: %d\n", err); return err; } @@ -212,7 +207,7 @@ int mv88e6352_serdes_pcs_get_state(struct mv88e6xxx_chip *chip, int port, return err; } - return mv88e6xxx_serdes_pcs_get_state(chip, ctrl, status, lpa, state); + return mv88e6xxx_serdes_pcs_get_state(chip, bmsr, lpa, status, state); } int mv88e6352_serdes_pcs_an_restart(struct mv88e6xxx_chip *chip, int port, @@ -918,13 +913,13 @@ int mv88e6390_serdes_pcs_config(struct mv88e6xxx_chip *chip, int port, static int mv88e6390_serdes_pcs_get_state_sgmii(struct mv88e6xxx_chip *chip, int port, int lane, struct phylink_link_state *state) { - u16 lpa, status, ctrl; + u16 bmsr, lpa, status; int err; err = mv88e6390_serdes_read(chip, lane, MDIO_MMD_PHYXS, - MV88E6390_SGMII_BMCR, &ctrl); + MV88E6390_SGMII_BMSR, &bmsr); if (err) { - dev_err(chip->dev, "can't read Serdes PHY control: %d\n", err); + dev_err(chip->dev, "can't read Serdes PHY BMSR: %d\n", err); return err; } @@ -942,7 +937,7 @@ static int mv88e6390_serdes_pcs_get_state_sgmii(struct mv88e6xxx_chip *chip, return err; } - return mv88e6xxx_serdes_pcs_get_state(chip, ctrl, status, lpa, state); + return mv88e6xxx_serdes_pcs_get_state(chip, bmsr, lpa, status, state); } static int mv88e6390_serdes_pcs_get_state_10g(struct mv88e6xxx_chip *chip, From 2b4bb9cd9bcdbe1f791fec18a7c8728cb6989bf8 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Tue, 7 Jun 2022 12:28:47 +0100 Subject: [PATCH 67/79] net: dsa: mv88e6xxx: fix BMSR error to be consistent with others Other errors accessing the registers in mv88e6352_serdes_pcs_get_state() print "PHY " before the register name, except for the BMSR. Make this consistent with the other error messages. Signed-off-by: Russell King (Oracle) Signed-off-by: Jakub Kicinski --- drivers/net/dsa/mv88e6xxx/serdes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/dsa/mv88e6xxx/serdes.c b/drivers/net/dsa/mv88e6xxx/serdes.c index 1a19c5284f2c..47bf87d530b0 100644 --- a/drivers/net/dsa/mv88e6xxx/serdes.c +++ b/drivers/net/dsa/mv88e6xxx/serdes.c @@ -191,7 +191,7 @@ int mv88e6352_serdes_pcs_get_state(struct mv88e6xxx_chip *chip, int port, err = mv88e6352_serdes_read(chip, MII_BMSR, &bmsr); if (err) { - dev_err(chip->dev, "can't read Serdes BMSR: %d\n", err); + dev_err(chip->dev, "can't read Serdes PHY BMSR: %d\n", err); return err; } From b4d78731b34bd6bfd1bfedce26a55e3582b0bc14 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Tue, 7 Jun 2022 12:28:52 +0100 Subject: [PATCH 68/79] net: dsa: mv88e6xxx: correctly report serdes link failure Phylink wants to know if the link has dropped since the last time state was retrieved, and the BMSR gives us that. Read the BMSR and use it when deciding the link state. Fill in the an_complete member as well for the emulated PHY state. Signed-off-by: Russell King (Oracle) Signed-off-by: Jakub Kicinski --- drivers/net/dsa/mv88e6xxx/serdes.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/net/dsa/mv88e6xxx/serdes.c b/drivers/net/dsa/mv88e6xxx/serdes.c index 47bf87d530b0..d94150d8f3f4 100644 --- a/drivers/net/dsa/mv88e6xxx/serdes.c +++ b/drivers/net/dsa/mv88e6xxx/serdes.c @@ -53,6 +53,14 @@ static int mv88e6xxx_serdes_pcs_get_state(struct mv88e6xxx_chip *chip, u16 bmsr, u16 lpa, u16 status, struct phylink_link_state *state) { + state->link = false; + + /* If the BMSR reports that the link had failed, report this to + * phylink. + */ + if (!(bmsr & BMSR_LSTATUS)) + return 0; + state->link = !!(status & MV88E6390_SGMII_PHY_STATUS_LINK); state->an_complete = !!(bmsr & BMSR_ANEGCOMPLETE); From 487994ff75880569d32504d7e70da8b3328e0693 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alvin=20=C5=A0ipraga?= Date: Tue, 7 Jun 2022 20:46:24 +0200 Subject: [PATCH 69/79] net: dsa: realtek: rtl8365mb: fix GMII caps for ports with internal PHY MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since commit a18e6521a7d9 ("net: phylink: handle NA interface mode in phylink_fwnode_phy_connect()"), phylib defaults to GMII when no phy-mode or phy-connection-type property is specified in a DSA port node of the device tree. The same commit caused a regression in rtl8365mb whereby phylink would fail to connect, because the driver did not advertise support for GMII for ports with internal PHY. It should be noted that the aforementioned regression is not because the blamed commit was incorrect: on the contrary, the blamed commit is correcting the previous behaviour whereby unspecified phy-mode would cause the internal interface mode to be PHY_INTERFACE_MODE_NA. The rtl8365mb driver only worked by accident before because it _did_ advertise support for PHY_INTERFACE_MODE_NA, despite NA being reserved for internal use by phylink. With one mistake fixed, the other was exposed. Commit a5dba0f207e5 ("net: dsa: rtl8365mb: add GMII as user port mode") then introduced implicit support for GMII mode on ports with internal PHY to allow a PHY connection for device trees where the phy-mode is not explicitly set to "internal". At this point everything was working OK again. Subsequently, commit 6ff6064605e9 ("net: dsa: realtek: convert to phylink_generic_validate()") broke this behaviour again by discarding the usage of rtl8365mb_phy_mode_supported() - where this GMII support was indicated - while switching to the new .phylink_get_caps API. With the new API, rtl8365mb_phy_mode_supported() is no longer needed. Remove it altogether and add back the GMII capability - this time to rtl8365mb_phylink_get_caps() - so that the above default behaviour works for ports with internal PHY again. Fixes: 6ff6064605e9 ("net: dsa: realtek: convert to phylink_generic_validate()") Signed-off-by: Alvin Šipraga Reviewed-by: Russell King (Oracle) Link: https://lore.kernel.org/r/20220607184624.417641-1-alvin@pqrs.dk Signed-off-by: Jakub Kicinski --- drivers/net/dsa/realtek/rtl8365mb.c | 38 +++++++---------------------- 1 file changed, 9 insertions(+), 29 deletions(-) diff --git a/drivers/net/dsa/realtek/rtl8365mb.c b/drivers/net/dsa/realtek/rtl8365mb.c index 3bb42a9f236d..769f672e9128 100644 --- a/drivers/net/dsa/realtek/rtl8365mb.c +++ b/drivers/net/dsa/realtek/rtl8365mb.c @@ -955,35 +955,21 @@ static int rtl8365mb_ext_config_forcemode(struct realtek_priv *priv, int port, return 0; } -static bool rtl8365mb_phy_mode_supported(struct dsa_switch *ds, int port, - phy_interface_t interface) -{ - int ext_int; - - ext_int = rtl8365mb_extint_port_map[port]; - - if (ext_int < 0 && - (interface == PHY_INTERFACE_MODE_NA || - interface == PHY_INTERFACE_MODE_INTERNAL || - interface == PHY_INTERFACE_MODE_GMII)) - /* Internal PHY */ - return true; - else if ((ext_int >= 1) && - phy_interface_mode_is_rgmii(interface)) - /* Extension MAC */ - return true; - - return false; -} - static void rtl8365mb_phylink_get_caps(struct dsa_switch *ds, int port, struct phylink_config *config) { - if (dsa_is_user_port(ds, port)) + if (dsa_is_user_port(ds, port)) { __set_bit(PHY_INTERFACE_MODE_INTERNAL, config->supported_interfaces); - else if (dsa_is_cpu_port(ds, port)) + + /* GMII is the default interface mode for phylib, so + * we have to support it for ports with integrated PHY. + */ + __set_bit(PHY_INTERFACE_MODE_GMII, + config->supported_interfaces); + } else if (dsa_is_cpu_port(ds, port)) { phy_interface_set_rgmii(config->supported_interfaces); + } config->mac_capabilities = MAC_SYM_PAUSE | MAC_ASYM_PAUSE | MAC_10 | MAC_100 | MAC_1000FD; @@ -996,12 +982,6 @@ static void rtl8365mb_phylink_mac_config(struct dsa_switch *ds, int port, struct realtek_priv *priv = ds->priv; int ret; - if (!rtl8365mb_phy_mode_supported(ds, port, state->interface)) { - dev_err(priv->dev, "phy mode %s is unsupported on port %d\n", - phy_modes(state->interface), port); - return; - } - if (mode != MLO_AN_PHY && mode != MLO_AN_FIXED) { dev_err(priv->dev, "port %d supports only conventional PHY or fixed-link\n", From e67b72b90b7e19a4be4d9c29f3feea6f58ab43f8 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Tue, 7 Jun 2022 15:02:14 +0800 Subject: [PATCH 70/79] tcp: use alloc_large_system_hash() to allocate table_perturb In our server, there may be no high order (>= 6) memory since we reserve lots of HugeTLB pages when booting. Then the system panic. So use alloc_large_system_hash() to allocate table_perturb. Fixes: e9261476184b ("tcp: dynamically allocate the perturb table used by source ports") Signed-off-by: Muchun Song Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20220607070214.94443-1-songmuchun@bytedance.com Signed-off-by: Jakub Kicinski --- net/ipv4/inet_hashtables.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index e8de5e699b3f..545f91b6cb5e 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -1026,10 +1026,12 @@ void __init inet_hashinfo2_init(struct inet_hashinfo *h, const char *name, init_hashinfo_lhash2(h); /* this one is used for source ports of outgoing connections */ - table_perturb = kmalloc_array(INET_TABLE_PERTURB_SIZE, - sizeof(*table_perturb), GFP_KERNEL); - if (!table_perturb) - panic("TCP: failed to alloc table_perturb"); + table_perturb = alloc_large_system_hash("Table-perturb", + sizeof(*table_perturb), + INET_TABLE_PERTURB_SIZE, + 0, 0, NULL, NULL, + INET_TABLE_PERTURB_SIZE, + INET_TABLE_PERTURB_SIZE); } int inet_hashinfo2_init_mod(struct inet_hashinfo *h) From 647df0d41b6bd8f4987dde6e8d8d0aba5b082985 Mon Sep 17 00:00:00 2001 From: Justin Stitt Date: Tue, 7 Jun 2022 12:11:19 -0700 Subject: [PATCH 71/79] net: amd-xgbe: fix clang -Wformat warning see warning: | drivers/net/ethernet/amd/xgbe/xgbe-drv.c:2787:43: warning: format specifies | type 'unsigned short' but the argument has type 'int' [-Wformat] | netdev_dbg(netdev, "Protocol: %#06hx\n", ntohs(eth->h_proto)); | ~~~~~~ ^~~~~~~~~~~~~~~~~~~ Variadic functions (printf-like) undergo default argument promotion. Documentation/core-api/printk-formats.rst specifically recommends using the promoted-to-type's format flag. Also, as per C11 6.3.1.1: (https://www.open-std.org/jtc1/sc22/wg14/www/docs/n1548.pdf) `If an int can represent all values of the original type ..., the value is converted to an int; otherwise, it is converted to an unsigned int. These are called the integer promotions.` Since the argument is a u16 it will get promoted to an int and thus it is most accurate to use the %x format specifier here. It should be noted that the `#06` formatting sugar does not alter the promotion rules. Link: https://github.com/ClangBuiltLinux/linux/issues/378 Signed-off-by: Justin Stitt Reviewed-by: Nick Desaulniers Link: https://lore.kernel.org/r/20220607191119.20686-1-jstitt007@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/amd/xgbe/xgbe-drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c index a3593290886f..4d46780fad13 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c @@ -2784,7 +2784,7 @@ void xgbe_print_pkt(struct net_device *netdev, struct sk_buff *skb, bool tx_rx) netdev_dbg(netdev, "Dst MAC addr: %pM\n", eth->h_dest); netdev_dbg(netdev, "Src MAC addr: %pM\n", eth->h_source); - netdev_dbg(netdev, "Protocol: %#06hx\n", ntohs(eth->h_proto)); + netdev_dbg(netdev, "Protocol: %#06x\n", ntohs(eth->h_proto)); for (i = 0; i < skb->len; i += 32) { unsigned int len = min(skb->len - i, 32U); From 8e1278444446fc97778a5e5c99bca1ce0bbc5ec9 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 7 Jun 2022 00:34:56 +1000 Subject: [PATCH 72/79] powerpc/32: Fix overread/overwrite of thread_struct via ptrace The ptrace PEEKUSR/POKEUSR (aka PEEKUSER/POKEUSER) API allows a process to read/write registers of another process. To get/set a register, the API takes an index into an imaginary address space called the "USER area", where the registers of the process are laid out in some fashion. The kernel then maps that index to a particular register in its own data structures and gets/sets the value. The API only allows a single machine-word to be read/written at a time. So 4 bytes on 32-bit kernels and 8 bytes on 64-bit kernels. The way floating point registers (FPRs) are addressed is somewhat complicated, because double precision float values are 64-bit even on 32-bit CPUs. That means on 32-bit kernels each FPR occupies two word-sized locations in the USER area. On 64-bit kernels each FPR occupies one word-sized location in the USER area. Internally the kernel stores the FPRs in an array of u64s, or if VSX is enabled, an array of pairs of u64s where one half of each pair stores the FPR. Which half of the pair stores the FPR depends on the kernel's endianness. To handle the different layouts of the FPRs depending on VSX/no-VSX and big/little endian, the TS_FPR() macro was introduced. Unfortunately the TS_FPR() macro does not take into account the fact that the addressing of each FPR differs between 32-bit and 64-bit kernels. It just takes the index into the "USER area" passed from userspace and indexes into the fp_state.fpr array. On 32-bit there are 64 indexes that address FPRs, but only 32 entries in the fp_state.fpr array, meaning the user can read/write 256 bytes past the end of the array. Because the fp_state sits in the middle of the thread_struct there are various fields than can be overwritten, including some pointers. As such it may be exploitable. It has also been observed to cause systems to hang or otherwise misbehave when using gdbserver, and is probably the root cause of this report which could not be easily reproduced: https://lore.kernel.org/linuxppc-dev/dc38afe9-6b78-f3f5-666b-986939e40fc6@keymile.com/ Rather than trying to make the TS_FPR() macro even more complicated to fix the bug, or add more macros, instead add a special-case for 32-bit kernels. This is more obvious and hopefully avoids a similar bug happening again in future. Note that because 32-bit kernels never have VSX enabled the code doesn't need to consider TS_FPRWIDTH/OFFSET at all. Add a BUILD_BUG_ON() to ensure that 32-bit && VSX is never enabled. Fixes: 87fec0514f61 ("powerpc: PTRACE_PEEKUSR/PTRACE_POKEUSER of FPR registers in little endian builds") Cc: stable@vger.kernel.org # v3.13+ Reported-by: Ariel Miculas Tested-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220609133245.573565-1-mpe@ellerman.id.au --- arch/powerpc/kernel/ptrace/ptrace-fpu.c | 20 ++++++++++++++------ arch/powerpc/kernel/ptrace/ptrace.c | 3 +++ 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/kernel/ptrace/ptrace-fpu.c b/arch/powerpc/kernel/ptrace/ptrace-fpu.c index 5dca19361316..09c49632bfe5 100644 --- a/arch/powerpc/kernel/ptrace/ptrace-fpu.c +++ b/arch/powerpc/kernel/ptrace/ptrace-fpu.c @@ -17,9 +17,13 @@ int ptrace_get_fpr(struct task_struct *child, int index, unsigned long *data) #ifdef CONFIG_PPC_FPU_REGS flush_fp_to_thread(child); - if (fpidx < (PT_FPSCR - PT_FPR0)) - memcpy(data, &child->thread.TS_FPR(fpidx), sizeof(long)); - else + if (fpidx < (PT_FPSCR - PT_FPR0)) { + if (IS_ENABLED(CONFIG_PPC32)) + // On 32-bit the index we are passed refers to 32-bit words + *data = ((u32 *)child->thread.fp_state.fpr)[fpidx]; + else + memcpy(data, &child->thread.TS_FPR(fpidx), sizeof(long)); + } else *data = child->thread.fp_state.fpscr; #else *data = 0; @@ -39,9 +43,13 @@ int ptrace_put_fpr(struct task_struct *child, int index, unsigned long data) #ifdef CONFIG_PPC_FPU_REGS flush_fp_to_thread(child); - if (fpidx < (PT_FPSCR - PT_FPR0)) - memcpy(&child->thread.TS_FPR(fpidx), &data, sizeof(long)); - else + if (fpidx < (PT_FPSCR - PT_FPR0)) { + if (IS_ENABLED(CONFIG_PPC32)) + // On 32-bit the index we are passed refers to 32-bit words + ((u32 *)child->thread.fp_state.fpr)[fpidx] = data; + else + memcpy(&child->thread.TS_FPR(fpidx), &data, sizeof(long)); + } else child->thread.fp_state.fpscr = data; #endif diff --git a/arch/powerpc/kernel/ptrace/ptrace.c b/arch/powerpc/kernel/ptrace/ptrace.c index 4d2dc22d4a2d..5d7a72b41ae7 100644 --- a/arch/powerpc/kernel/ptrace/ptrace.c +++ b/arch/powerpc/kernel/ptrace/ptrace.c @@ -444,4 +444,7 @@ void __init pt_regs_check(void) * real registers. */ BUILD_BUG_ON(PT_DSCR < sizeof(struct user_pt_regs) / sizeof(unsigned long)); + + // ptrace_get/put_fpr() rely on PPC32 and VSX being incompatible + BUILD_BUG_ON(IS_ENABLED(CONFIG_PPC32) && IS_ENABLED(CONFIG_VSX)); } From 7aefd8b53815274f3ef398d370a3c9b27dd9f00c Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 8 Jun 2022 16:59:29 -0700 Subject: [PATCH 73/79] drm: imx: fix compiler warning with gcc-12 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Gcc-12 correctly warned about this code using a non-NULL pointer as a truth value: drivers/gpu/drm/imx/ipuv3-crtc.c: In function ‘ipu_crtc_disable_planes’: drivers/gpu/drm/imx/ipuv3-crtc.c:72:21: error: the comparison will always evaluate as ‘true’ for the address of ‘plane’ will never be NULL [-Werror=address] 72 | if (&ipu_crtc->plane[1] && plane == &ipu_crtc->plane[1]->base) | ^ due to the extraneous '&' address-of operator. Philipp Zabel points out that The mistake had no adverse effect since the following condition doesn't actually dereference the NULL pointer, but the intent of the code was obviously to check for it, not to take the address of the member. Fixes: eb8c88808c83 ("drm/imx: add deferred plane disabling") Acked-by: Philipp Zabel Signed-off-by: Linus Torvalds --- drivers/gpu/drm/imx/ipuv3-crtc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/imx/ipuv3-crtc.c b/drivers/gpu/drm/imx/ipuv3-crtc.c index 9c8829f945b2..f7863d6dea80 100644 --- a/drivers/gpu/drm/imx/ipuv3-crtc.c +++ b/drivers/gpu/drm/imx/ipuv3-crtc.c @@ -69,7 +69,7 @@ static void ipu_crtc_disable_planes(struct ipu_crtc *ipu_crtc, drm_atomic_crtc_state_for_each_plane(plane, old_crtc_state) { if (plane == &ipu_crtc->plane[0]->base) disable_full = true; - if (&ipu_crtc->plane[1] && plane == &ipu_crtc->plane[1]->base) + if (ipu_crtc->plane[1] && plane == &ipu_crtc->plane[1]->base) disable_partial = true; } From 49beadbd47c270a00754c107a837b4f29df4c822 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 9 Jun 2022 09:41:42 -0700 Subject: [PATCH 74/79] gcc-12: disable '-Wdangling-pointer' warning for now MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit While the concept of checking for dangling pointers to local variables at function exit is really interesting, the gcc-12 implementation is not compatible with reality, and results in false positives. For example, gcc sees us putting things on a local list head allocated on the stack, which involves exactly those kinds of pointers to the local stack entry: In function ‘__list_add’, inlined from ‘list_add_tail’ at include/linux/list.h:102:2, inlined from ‘rebuild_snap_realms’ at fs/ceph/snap.c:434:2: include/linux/list.h:74:19: warning: storing the address of local variable ‘realm_queue’ in ‘*&realm_27(D)->rebuild_item.prev’ [-Wdangling-pointer=] 74 | new->prev = prev; | ~~~~~~~~~~^~~~~~ But then gcc - understandably - doesn't really understand the big picture how the doubly linked list works, so doesn't see how we then end up emptying said list head in a loop and the pointer we added has been removed. Gcc also complains about us (intentionally) using this as a way to store a kind of fake stack trace, eg drivers/acpi/acpica/utdebug.c:40:38: warning: storing the address of local variable ‘current_sp’ in ‘acpi_gbl_entry_stack_pointer’ [-Wdangling-pointer=] 40 | acpi_gbl_entry_stack_pointer = ¤t_sp; | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~ which is entirely reasonable from a compiler standpoint, and we may want to change those kinds of patterns, but not not. So this is one of those "it would be lovely if the compiler were to complain about us leaving dangling pointers to the stack", but not this way. Signed-off-by: Linus Torvalds --- Makefile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Makefile b/Makefile index c43d825a3c4c..09208ffca353 100644 --- a/Makefile +++ b/Makefile @@ -805,6 +805,9 @@ endif KBUILD_CFLAGS += $(call cc-disable-warning, unused-but-set-variable) KBUILD_CFLAGS += $(call cc-disable-warning, unused-const-variable) +# These result in bogus false positives +KBUILD_CFLAGS += $(call cc-disable-warning, dangling-pointer) + ifdef CONFIG_FRAME_POINTER KBUILD_CFLAGS += -fno-omit-frame-pointer -fno-optimize-sibling-calls else From 842c3b3ddc5f4d17275edbaa09e23d712bf8b915 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 9 Jun 2022 10:03:28 -0700 Subject: [PATCH 75/79] mellanox: mlx5: avoid uninitialized variable warning with gcc-12 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit gcc-12 started warning about 'tracker' being used uninitialized: drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c: In function ‘mlx5_do_bond’: drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c:786:28: warning: ‘tracker’ is used uninitialized [-Wuninitialized] 786 | struct lag_tracker tracker; | ^~~~~~~ which seems to be because it doesn't track how the use (and initialization) is bound by the 'do_bond' flag. But admittedly that 'do_bond' usage is fairly complicated, and involves passing it around as an argument to helper functions, so it's somewhat understandable that gcc doesn't see how that all works. This function could be rewritten to make the use of that tracker variable more obviously safe, but for now I'm just adding the forced initialization of it. Signed-off-by: Linus Torvalds --- drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c index 552b6e26e701..2a8fc547eb37 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c @@ -783,7 +783,7 @@ static void mlx5_do_bond(struct mlx5_lag *ldev) { struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev; struct mlx5_core_dev *dev1 = ldev->pf[MLX5_LAG_P2].dev; - struct lag_tracker tracker; + struct lag_tracker tracker = { }; bool do_bond, roce_lag; int err; int i; From f0be87c42cbd341d436d06da4792e6b0c83c3aeb Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 9 Jun 2022 10:11:12 -0700 Subject: [PATCH 76/79] gcc-12: disable '-Warray-bounds' universally for now In commit 8b202ee21839 ("s390: disable -Warray-bounds") the s390 people disabled the '-Warray-bounds' warning for gcc-12, because the new logic in gcc would cause warnings for their use of the S390_lowcore macro, which accesses absolute pointers. It turns out gcc-12 has many other issues in this area, so this takes that s390 warning disable logic, and turns it into a kernel build config entry instead. Part of the intent is that we can make this all much more targeted, and use this conflig flag to disable it in only particular configurations that cause problems, with the s390 case as an example: select GCC12_NO_ARRAY_BOUNDS and we could do that for other configuration cases that cause issues. Or we could possibly use the CONFIG_CC_NO_ARRAY_BOUNDS thing in a more targeted way, and disable the warning only for particular uses: again the s390 case as an example: KBUILD_CFLAGS_DECOMPRESSOR += $(if $(CONFIG_CC_NO_ARRAY_BOUNDS),-Wno-array-bounds) but this ends up just doing it globally in the top-level Makefile, since the current issues are spread fairly widely all over: KBUILD_CFLAGS-$(CONFIG_CC_NO_ARRAY_BOUNDS) += -Wno-array-bounds We'll try to limit this later, since the gcc-12 problems are rare enough that *much* of the kernel can be built with it without disabling this warning. Cc: Kees Cook Cc: Nathan Chancellor Signed-off-by: Linus Torvalds --- Makefile | 1 + arch/s390/Kconfig | 1 + arch/s390/Makefile | 10 +--------- init/Kconfig | 9 +++++++++ 4 files changed, 12 insertions(+), 9 deletions(-) diff --git a/Makefile b/Makefile index 09208ffca353..b2e93c1a8021 100644 --- a/Makefile +++ b/Makefile @@ -788,6 +788,7 @@ stackp-flags-$(CONFIG_STACKPROTECTOR_STRONG) := -fstack-protector-strong KBUILD_CFLAGS += $(stackp-flags-y) KBUILD_CFLAGS-$(CONFIG_WERROR) += -Werror +KBUILD_CFLAGS-$(CONFIG_CC_NO_ARRAY_BOUNDS) += -Wno-array-bounds KBUILD_CFLAGS += $(KBUILD_CFLAGS-y) $(CONFIG_CC_IMPLICIT_FALLTHROUGH) ifdef CONFIG_CC_IS_CLANG diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index b1a88f6cc349..5126a5db41fd 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -125,6 +125,7 @@ config S390 select CLONE_BACKWARDS2 select DMA_OPS if PCI select DYNAMIC_FTRACE if FUNCTION_TRACER + select GCC12_NO_ARRAY_BOUNDS select GENERIC_ALLOCATOR select GENERIC_CPU_AUTOPROBE select GENERIC_CPU_VULNERABILITIES diff --git a/arch/s390/Makefile b/arch/s390/Makefile index d73611b35164..495c68a4df1e 100644 --- a/arch/s390/Makefile +++ b/arch/s390/Makefile @@ -32,15 +32,7 @@ KBUILD_CFLAGS_DECOMPRESSOR += -fno-stack-protector KBUILD_CFLAGS_DECOMPRESSOR += $(call cc-disable-warning, address-of-packed-member) KBUILD_CFLAGS_DECOMPRESSOR += $(if $(CONFIG_DEBUG_INFO),-g) KBUILD_CFLAGS_DECOMPRESSOR += $(if $(CONFIG_DEBUG_INFO_DWARF4), $(call cc-option, -gdwarf-4,)) - -ifdef CONFIG_CC_IS_GCC - ifeq ($(call cc-ifversion, -ge, 1200, y), y) - ifeq ($(call cc-ifversion, -lt, 1300, y), y) - KBUILD_CFLAGS += $(call cc-disable-warning, array-bounds) - KBUILD_CFLAGS_DECOMPRESSOR += $(call cc-disable-warning, array-bounds) - endif - endif -endif +KBUILD_CFLAGS_DECOMPRESSOR += $(if $(CONFIG_CC_NO_ARRAY_BOUNDS),-Wno-array-bounds) UTS_MACHINE := s390x STACK_SIZE := $(if $(CONFIG_KASAN),65536,16384) diff --git a/init/Kconfig b/init/Kconfig index c984afc489de..c7900e8975f1 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -885,6 +885,15 @@ config CC_IMPLICIT_FALLTHROUGH default "-Wimplicit-fallthrough=5" if CC_IS_GCC && $(cc-option,-Wimplicit-fallthrough=5) default "-Wimplicit-fallthrough" if CC_IS_CLANG && $(cc-option,-Wunreachable-code-fallthrough) +# Currently, disable gcc-12 array-bounds globally. +# We may want to target only particular configurations some day. +config GCC12_NO_ARRAY_BOUNDS + def_bool y + +config CC_NO_ARRAY_BOUNDS + bool + default y if CC_IS_GCC && GCC_VERSION >= 120000 && GCC_VERSION < 130000 && GCC12_NO_ARRAY_BOUNDS + # # For architectures that know their GCC __int128 support is sound # From 507160f46c55913955d272ebf559d63809a8e560 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 9 Jun 2022 11:29:36 -0700 Subject: [PATCH 77/79] netfs: gcc-12: temporarily disable '-Wattribute-warning' for now MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a pure band-aid so that I can continue merging stuff from people while some of the gcc-12 fallout gets sorted out. In particular, gcc-12 is very unhappy about the kinds of pointer arithmetic tricks that netfs does, and that makes the fortify checks trigger in afs and ceph: In function ‘fortify_memset_chk’, inlined from ‘netfs_i_context_init’ at include/linux/netfs.h:327:2, inlined from ‘afs_set_netfs_context’ at fs/afs/inode.c:61:2, inlined from ‘afs_root_iget’ at fs/afs/inode.c:543:2: include/linux/fortify-string.h:258:25: warning: call to ‘__write_overflow_field’ declared with attribute warning: detected write beyond size of field (1st parameter); maybe use struct_group()? [-Wattribute-warning] 258 | __write_overflow_field(p_size_field, size); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ and the reason is that netfs_i_context_init() is passed a 'struct inode' pointer, and then it does struct netfs_i_context *ctx = netfs_i_context(inode); memset(ctx, 0, sizeof(*ctx)); where that netfs_i_context() function just does pointer arithmetic on the inode pointer, knowing that the netfs_i_context is laid out immediately after it in memory. This is all truly disgusting, since the whole "netfs_i_context is laid out immediately after it in memory" is not actually remotely true in general, but is just made to be that way for afs and ceph. See for example fs/cifs/cifsglob.h: struct cifsInodeInfo { struct { /* These must be contiguous */ struct inode vfs_inode; /* the VFS's inode record */ struct netfs_i_context netfs_ctx; /* Netfslib context */ }; [...] and realize that this is all entirely wrong, and the pointer arithmetic that netfs_i_context() is doing is also very very wrong and wouldn't give the right answer if netfs_ctx had different alignment rules from a 'struct inode', for example). Anyway, that's just a long-winded way to say "the gcc-12 warning is actually quite reasonable, and our code happens to work but is pretty disgusting". This is getting fixed properly, but for now I made the mistake of thinking "the week right after the merge window tends to be calm for me as people take a breather" and I did a sustem upgrade. And I got gcc-12 as a result, so to continue merging fixes from people and not have the end result drown in warnings, I am fixing all these gcc-12 issues I hit. Including with these kinds of temporary fixes. Cc: Kees Cook Cc: David Howells Link: https://lore.kernel.org/all/AEEBCF5D-8402-441D-940B-105AA718C71F@chromium.org/ Signed-off-by: Linus Torvalds --- fs/afs/inode.c | 3 +++ fs/ceph/inode.c | 3 +++ 2 files changed, 6 insertions(+) diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 30b066299d39..65b439cd53d2 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -25,6 +25,9 @@ #include "internal.h" #include "afs_fs.h" +// Temporary: netfs does disgusting things with inode pointers +#pragma GCC diagnostic ignored "-Wattribute-warning" + static const struct inode_operations afs_symlink_inode_operations = { .get_link = page_get_link, }; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index b7e9cac3aeef..5f1cc2b4ed06 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -20,6 +20,9 @@ #include "cache.h" #include +// Temporary: netfs does disgusting things with inode pointers +#pragma GCC diagnostic ignored "-Wattribute-warning" + /* * Ceph inode operations * From 41e456400212803704e82691716e1d7b0865114a Mon Sep 17 00:00:00 2001 From: Yupeng Li Date: Wed, 8 Jun 2022 09:12:29 +0800 Subject: [PATCH 78/79] MIPS: Loongson-3: fix compile mips cpu_hwmon as module build error. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit set cpu_hwmon as a module build with loongson_sysconf, loongson_chiptemp undefined error,fix cpu_hwmon compile options to be bool.Some kernel compilation error information is as follows: Checking missing-syscalls for N32 CALL scripts/checksyscalls.sh Checking missing-syscalls for O32 CALL scripts/checksyscalls.sh CALL scripts/checksyscalls.sh CHK include/generated/compile.h CC [M] drivers/platform/mips/cpu_hwmon.o Building modules, stage 2. MODPOST 200 modules ERROR: "loongson_sysconf" [drivers/platform/mips/cpu_hwmon.ko] undefined! ERROR: "loongson_chiptemp" [drivers/platform/mips/cpu_hwmon.ko] undefined! make[1]: *** [scripts/Makefile.modpost:92:__modpost] 错误 1 make: *** [Makefile:1261:modules] 错误 2 Signed-off-by: Yupeng Li Reviewed-by: Guenter Roeck Reviewed-by: Huacai Chen Signed-off-by: Thomas Bogendoerfer --- drivers/platform/mips/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/platform/mips/Kconfig b/drivers/platform/mips/Kconfig index d421e1482395..6b51ad01f791 100644 --- a/drivers/platform/mips/Kconfig +++ b/drivers/platform/mips/Kconfig @@ -17,7 +17,7 @@ menuconfig MIPS_PLATFORM_DEVICES if MIPS_PLATFORM_DEVICES config CPU_HWMON - tristate "Loongson-3 CPU HWMon Driver" + bool "Loongson-3 CPU HWMon Driver" depends on MACH_LOONGSON64 select HWMON default y From 874c8ca1e60b2c564a48f7e7acc40d328d5c8733 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 9 Jun 2022 21:46:04 +0100 Subject: [PATCH 79/79] netfs: Fix gcc-12 warning by embedding vfs inode in netfs_i_context While randstruct was satisfied with using an open-coded "void *" offset cast for the netfs_i_context <-> inode casting, __builtin_object_size() as used by FORTIFY_SOURCE was not as easily fooled. This was causing the following complaint[1] from gcc v12: In file included from include/linux/string.h:253, from include/linux/ceph/ceph_debug.h:7, from fs/ceph/inode.c:2: In function 'fortify_memset_chk', inlined from 'netfs_i_context_init' at include/linux/netfs.h:326:2, inlined from 'ceph_alloc_inode' at fs/ceph/inode.c:463:2: include/linux/fortify-string.h:242:25: warning: call to '__write_overflow_field' declared with attribute warning: detected write beyond size of field (1st parameter); maybe use struct_group()? [-Wattribute-warning] 242 | __write_overflow_field(p_size_field, size); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Fix this by embedding a struct inode into struct netfs_i_context (which should perhaps be renamed to struct netfs_inode). The struct inode vfs_inode fields are then removed from the 9p, afs, ceph and cifs inode structs and vfs_inode is then simply changed to "netfs.inode" in those filesystems. Further, rename netfs_i_context to netfs_inode, get rid of the netfs_inode() function that converted a netfs_i_context pointer to an inode pointer (that can now be done with &ctx->inode) and rename the netfs_i_context() function to netfs_inode() (which is now a wrapper around container_of()). Most of the changes were done with: perl -p -i -e 's/vfs_inode/netfs.inode/'g \ `git grep -l 'vfs_inode' -- fs/{9p,afs,ceph,cifs}/*.[ch]` Kees suggested doing it with a pair structure[2] and a special declarator to insert that into the network filesystem's inode wrapper[3], but I think it's cleaner to embed it - and then it doesn't matter if struct randomisation reorders things. Dave Chinner suggested using a filesystem-specific VFS_I() function in each filesystem to convert that filesystem's own inode wrapper struct into the VFS inode struct[4]. Version #2: - Fix a couple of missed name changes due to a disabled cifs option. - Rename nfs_i_context to nfs_inode - Use "netfs" instead of "nic" as the member name in per-fs inode wrapper structs. [ This also undoes commit 507160f46c55 ("netfs: gcc-12: temporarily disable '-Wattribute-warning' for now") that is no longer needed ] Fixes: bc899ee1c898 ("netfs: Add a netfs inode context") Reported-by: Jeff Layton Signed-off-by: David Howells Reviewed-by: Jeff Layton Reviewed-by: Kees Cook Reviewed-by: Xiubo Li cc: Jonathan Corbet cc: Eric Van Hensbergen cc: Latchesar Ionkov cc: Dominique Martinet cc: Christian Schoenebeck cc: Marc Dionne cc: Ilya Dryomov cc: Steve French cc: William Kucharski cc: "Matthew Wilcox (Oracle)" cc: Dave Chinner cc: linux-doc@vger.kernel.org cc: v9fs-developer@lists.sourceforge.net cc: linux-afs@lists.infradead.org cc: ceph-devel@vger.kernel.org cc: linux-cifs@vger.kernel.org cc: samba-technical@lists.samba.org cc: linux-fsdevel@vger.kernel.org cc: linux-hardening@vger.kernel.org Link: https://lore.kernel.org/r/d2ad3a3d7bdd794c6efb562d2f2b655fb67756b9.camel@kernel.org/ [1] Link: https://lore.kernel.org/r/20220517210230.864239-1-keescook@chromium.org/ [2] Link: https://lore.kernel.org/r/20220518202212.2322058-1-keescook@chromium.org/ [3] Link: https://lore.kernel.org/r/20220524101205.GI2306852@dread.disaster.area/ [4] Link: https://lore.kernel.org/r/165296786831.3591209.12111293034669289733.stgit@warthog.procyon.org.uk/ # v1 Link: https://lore.kernel.org/r/165305805651.4094995.7763502506786714216.stgit@warthog.procyon.org.uk # v2 Signed-off-by: Linus Torvalds --- Documentation/filesystems/netfs_library.rst | 37 ++++--- fs/9p/cache.c | 4 +- fs/9p/v9fs.c | 2 +- fs/9p/v9fs.h | 10 +- fs/9p/vfs_addr.c | 2 +- fs/9p/vfs_inode.c | 4 +- fs/afs/callback.c | 2 +- fs/afs/dir.c | 32 +++--- fs/afs/dir_edit.c | 10 +- fs/afs/dir_silly.c | 4 +- fs/afs/dynroot.c | 2 +- fs/afs/file.c | 4 +- fs/afs/fs_operation.c | 6 +- fs/afs/inode.c | 41 ++++---- fs/afs/internal.h | 23 ++--- fs/afs/super.c | 6 +- fs/afs/write.c | 21 ++-- fs/ceph/addr.c | 4 +- fs/ceph/cache.c | 4 +- fs/ceph/cache.h | 2 +- fs/ceph/caps.c | 104 ++++++++++---------- fs/ceph/file.c | 2 +- fs/ceph/inode.c | 13 +-- fs/ceph/mds_client.c | 4 +- fs/ceph/snap.c | 8 +- fs/ceph/super.c | 2 +- fs/ceph/super.h | 10 +- fs/ceph/xattr.c | 14 +-- fs/cifs/cifsfs.c | 8 +- fs/cifs/cifsglob.h | 12 +-- fs/cifs/file.c | 8 +- fs/cifs/fscache.c | 8 +- fs/cifs/fscache.h | 8 +- fs/cifs/inode.c | 4 +- fs/cifs/misc.c | 4 +- fs/cifs/smb2ops.c | 8 +- fs/netfs/buffered_read.c | 6 +- fs/netfs/internal.h | 2 +- fs/netfs/objects.c | 2 +- include/linux/netfs.h | 41 +++----- 40 files changed, 227 insertions(+), 261 deletions(-) diff --git a/Documentation/filesystems/netfs_library.rst b/Documentation/filesystems/netfs_library.rst index a80a59941d2f..3276c3d55142 100644 --- a/Documentation/filesystems/netfs_library.rst +++ b/Documentation/filesystems/netfs_library.rst @@ -37,30 +37,31 @@ The network filesystem helper library needs a place to store a bit of state for its use on each netfs inode it is helping to manage. To this end, a context structure is defined:: - struct netfs_i_context { + struct netfs_inode { + struct inode inode; const struct netfs_request_ops *ops; - struct fscache_cookie *cache; + struct fscache_cookie *cache; }; -A network filesystem that wants to use netfs lib must place one of these -directly after the VFS ``struct inode`` it allocates, usually as part of its -own struct. This can be done in a way similar to the following:: +A network filesystem that wants to use netfs lib must place one of these in its +inode wrapper struct instead of the VFS ``struct inode``. This can be done in +a way similar to the following:: struct my_inode { - struct { - /* These must be contiguous */ - struct inode vfs_inode; - struct netfs_i_context netfs_ctx; - }; + struct netfs_inode netfs; /* Netfslib context and vfs inode */ ... }; -This allows netfslib to find its state by simple offset from the inode pointer, -thereby allowing the netfslib helper functions to be pointed to directly by the -VFS/VM operation tables. +This allows netfslib to find its state by using ``container_of()`` from the +inode pointer, thereby allowing the netfslib helper functions to be pointed to +directly by the VFS/VM operation tables. The structure contains the following fields: + * ``inode`` + + The VFS inode structure. + * ``ops`` The set of operations provided by the network filesystem to netfslib. @@ -78,14 +79,12 @@ To help deal with the per-inode context, a number helper functions are provided. Firstly, a function to perform basic initialisation on a context and set the operations table pointer:: - void netfs_i_context_init(struct inode *inode, - const struct netfs_request_ops *ops); + void netfs_inode_init(struct inode *inode, + const struct netfs_request_ops *ops); -then two functions to cast between the VFS inode structure and the netfs -context:: +then a function to cast from the VFS inode structure to the netfs context:: - struct netfs_i_context *netfs_i_context(struct inode *inode); - struct inode *netfs_inode(struct netfs_i_context *ctx); + struct netfs_inode *netfs_node(struct inode *inode); and finally, a function to get the cache cookie pointer from the context attached to an inode (or NULL if fscache is disabled):: diff --git a/fs/9p/cache.c b/fs/9p/cache.c index 1c8dc696d516..cebba4eaa0b5 100644 --- a/fs/9p/cache.c +++ b/fs/9p/cache.c @@ -62,12 +62,12 @@ void v9fs_cache_inode_get_cookie(struct inode *inode) version = cpu_to_le32(v9inode->qid.version); path = cpu_to_le64(v9inode->qid.path); v9ses = v9fs_inode2v9ses(inode); - v9inode->netfs_ctx.cache = + v9inode->netfs.cache = fscache_acquire_cookie(v9fs_session_cache(v9ses), 0, &path, sizeof(path), &version, sizeof(version), - i_size_read(&v9inode->vfs_inode)); + i_size_read(&v9inode->netfs.inode)); p9_debug(P9_DEBUG_FSC, "inode %p get cookie %p\n", inode, v9fs_inode_cookie(v9inode)); diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index e28ddf763b3b..0129de2ea31a 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -625,7 +625,7 @@ static void v9fs_inode_init_once(void *foo) struct v9fs_inode *v9inode = (struct v9fs_inode *)foo; memset(&v9inode->qid, 0, sizeof(v9inode->qid)); - inode_init_once(&v9inode->vfs_inode); + inode_init_once(&v9inode->netfs.inode); } /** diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h index ec0e8df3b2eb..1b219c21d15e 100644 --- a/fs/9p/v9fs.h +++ b/fs/9p/v9fs.h @@ -109,11 +109,7 @@ struct v9fs_session_info { #define V9FS_INO_INVALID_ATTR 0x01 struct v9fs_inode { - struct { - /* These must be contiguous */ - struct inode vfs_inode; /* the VFS's inode record */ - struct netfs_i_context netfs_ctx; /* Netfslib context */ - }; + struct netfs_inode netfs; /* Netfslib context and vfs inode */ struct p9_qid qid; unsigned int cache_validity; struct p9_fid *writeback_fid; @@ -122,13 +118,13 @@ struct v9fs_inode { static inline struct v9fs_inode *V9FS_I(const struct inode *inode) { - return container_of(inode, struct v9fs_inode, vfs_inode); + return container_of(inode, struct v9fs_inode, netfs.inode); } static inline struct fscache_cookie *v9fs_inode_cookie(struct v9fs_inode *v9inode) { #ifdef CONFIG_9P_FSCACHE - return netfs_i_cookie(&v9inode->vfs_inode); + return netfs_i_cookie(&v9inode->netfs.inode); #else return NULL; #endif diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index 8ce82ff1e40a..90c6c1ba03ab 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -140,7 +140,7 @@ static void v9fs_write_to_cache_done(void *priv, ssize_t transferred_or_error, transferred_or_error != -ENOBUFS) { version = cpu_to_le32(v9inode->qid.version); fscache_invalidate(v9fs_inode_cookie(v9inode), &version, - i_size_read(&v9inode->vfs_inode), 0); + i_size_read(&v9inode->netfs.inode), 0); } } diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 55367ecb9442..e660c6348b9d 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -234,7 +234,7 @@ struct inode *v9fs_alloc_inode(struct super_block *sb) v9inode->writeback_fid = NULL; v9inode->cache_validity = 0; mutex_init(&v9inode->v_mutex); - return &v9inode->vfs_inode; + return &v9inode->netfs.inode; } /** @@ -252,7 +252,7 @@ void v9fs_free_inode(struct inode *inode) */ static void v9fs_set_netfs_context(struct inode *inode) { - netfs_i_context_init(inode, &v9fs_req_ops); + netfs_inode_init(inode, &v9fs_req_ops); } int v9fs_init_inode(struct v9fs_session_info *v9ses, diff --git a/fs/afs/callback.c b/fs/afs/callback.c index 1b4d5809808d..a484fa642808 100644 --- a/fs/afs/callback.c +++ b/fs/afs/callback.c @@ -30,7 +30,7 @@ void afs_invalidate_mmap_work(struct work_struct *work) { struct afs_vnode *vnode = container_of(work, struct afs_vnode, cb_work); - unmap_mapping_pages(vnode->vfs_inode.i_mapping, 0, 0, false); + unmap_mapping_pages(vnode->netfs.inode.i_mapping, 0, 0, false); } void afs_server_init_callback_work(struct work_struct *work) diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 79f6b74336d2..56ae5cd5184f 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -109,7 +109,7 @@ struct afs_lookup_cookie { */ static void afs_dir_read_cleanup(struct afs_read *req) { - struct address_space *mapping = req->vnode->vfs_inode.i_mapping; + struct address_space *mapping = req->vnode->netfs.inode.i_mapping; struct folio *folio; pgoff_t last = req->nr_pages - 1; @@ -153,7 +153,7 @@ static bool afs_dir_check_folio(struct afs_vnode *dvnode, struct folio *folio, block = kmap_local_folio(folio, offset); if (block->hdr.magic != AFS_DIR_MAGIC) { printk("kAFS: %s(%lx): [%llx] bad magic %zx/%zx is %04hx\n", - __func__, dvnode->vfs_inode.i_ino, + __func__, dvnode->netfs.inode.i_ino, pos, offset, size, ntohs(block->hdr.magic)); trace_afs_dir_check_failed(dvnode, pos + offset, i_size); kunmap_local(block); @@ -183,7 +183,7 @@ error: static void afs_dir_dump(struct afs_vnode *dvnode, struct afs_read *req) { union afs_xdr_dir_block *block; - struct address_space *mapping = dvnode->vfs_inode.i_mapping; + struct address_space *mapping = dvnode->netfs.inode.i_mapping; struct folio *folio; pgoff_t last = req->nr_pages - 1; size_t offset, size; @@ -217,7 +217,7 @@ static void afs_dir_dump(struct afs_vnode *dvnode, struct afs_read *req) */ static int afs_dir_check(struct afs_vnode *dvnode, struct afs_read *req) { - struct address_space *mapping = dvnode->vfs_inode.i_mapping; + struct address_space *mapping = dvnode->netfs.inode.i_mapping; struct folio *folio; pgoff_t last = req->nr_pages - 1; int ret = 0; @@ -269,7 +269,7 @@ static int afs_dir_open(struct inode *inode, struct file *file) static struct afs_read *afs_read_dir(struct afs_vnode *dvnode, struct key *key) __acquires(&dvnode->validate_lock) { - struct address_space *mapping = dvnode->vfs_inode.i_mapping; + struct address_space *mapping = dvnode->netfs.inode.i_mapping; struct afs_read *req; loff_t i_size; int nr_pages, i; @@ -287,7 +287,7 @@ static struct afs_read *afs_read_dir(struct afs_vnode *dvnode, struct key *key) req->cleanup = afs_dir_read_cleanup; expand: - i_size = i_size_read(&dvnode->vfs_inode); + i_size = i_size_read(&dvnode->netfs.inode); if (i_size < 2048) { ret = afs_bad(dvnode, afs_file_error_dir_small); goto error; @@ -305,7 +305,7 @@ expand: req->actual_len = i_size; /* May change */ req->len = nr_pages * PAGE_SIZE; /* We can ask for more than there is */ req->data_version = dvnode->status.data_version; /* May change */ - iov_iter_xarray(&req->def_iter, READ, &dvnode->vfs_inode.i_mapping->i_pages, + iov_iter_xarray(&req->def_iter, READ, &dvnode->netfs.inode.i_mapping->i_pages, 0, i_size); req->iter = &req->def_iter; @@ -897,7 +897,7 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry, out_op: if (op->error == 0) { - inode = &op->file[1].vnode->vfs_inode; + inode = &op->file[1].vnode->netfs.inode; op->file[1].vnode = NULL; } @@ -1139,7 +1139,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) afs_stat_v(dir, n_reval); /* search the directory for this vnode */ - ret = afs_do_lookup_one(&dir->vfs_inode, dentry, &fid, key, &dir_version); + ret = afs_do_lookup_one(&dir->netfs.inode, dentry, &fid, key, &dir_version); switch (ret) { case 0: /* the filename maps to something */ @@ -1170,7 +1170,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) _debug("%pd: file deleted (uq %u -> %u I:%u)", dentry, fid.unique, vnode->fid.unique, - vnode->vfs_inode.i_generation); + vnode->netfs.inode.i_generation); goto not_found; } goto out_valid; @@ -1368,7 +1368,7 @@ static void afs_dir_remove_subdir(struct dentry *dentry) if (d_really_is_positive(dentry)) { struct afs_vnode *vnode = AFS_FS_I(d_inode(dentry)); - clear_nlink(&vnode->vfs_inode); + clear_nlink(&vnode->netfs.inode); set_bit(AFS_VNODE_DELETED, &vnode->flags); clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags); clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); @@ -1487,8 +1487,8 @@ static void afs_dir_remove_link(struct afs_operation *op) /* Already done */ } else if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) { write_seqlock(&vnode->cb_lock); - drop_nlink(&vnode->vfs_inode); - if (vnode->vfs_inode.i_nlink == 0) { + drop_nlink(&vnode->netfs.inode); + if (vnode->netfs.inode.i_nlink == 0) { set_bit(AFS_VNODE_DELETED, &vnode->flags); __afs_break_callback(vnode, afs_cb_break_for_unlink); } @@ -1504,7 +1504,7 @@ static void afs_dir_remove_link(struct afs_operation *op) op->error = ret; } - _debug("nlink %d [val %d]", vnode->vfs_inode.i_nlink, op->error); + _debug("nlink %d [val %d]", vnode->netfs.inode.i_nlink, op->error); } static void afs_unlink_success(struct afs_operation *op) @@ -1680,8 +1680,8 @@ static void afs_link_success(struct afs_operation *op) afs_update_dentry_version(op, dvp, op->dentry); if (op->dentry_2->d_parent == op->dentry->d_parent) afs_update_dentry_version(op, dvp, op->dentry_2); - ihold(&vp->vnode->vfs_inode); - d_instantiate(op->dentry, &vp->vnode->vfs_inode); + ihold(&vp->vnode->netfs.inode); + d_instantiate(op->dentry, &vp->vnode->netfs.inode); } static void afs_link_put(struct afs_operation *op) diff --git a/fs/afs/dir_edit.c b/fs/afs/dir_edit.c index d98e109ecee9..0ab7752d1b75 100644 --- a/fs/afs/dir_edit.c +++ b/fs/afs/dir_edit.c @@ -109,7 +109,7 @@ static void afs_clear_contig_bits(union afs_xdr_dir_block *block, */ static struct folio *afs_dir_get_folio(struct afs_vnode *vnode, pgoff_t index) { - struct address_space *mapping = vnode->vfs_inode.i_mapping; + struct address_space *mapping = vnode->netfs.inode.i_mapping; struct folio *folio; folio = __filemap_get_folio(mapping, index, @@ -216,7 +216,7 @@ void afs_edit_dir_add(struct afs_vnode *vnode, _enter(",,{%d,%s},", name->len, name->name); - i_size = i_size_read(&vnode->vfs_inode); + i_size = i_size_read(&vnode->netfs.inode); if (i_size > AFS_DIR_BLOCK_SIZE * AFS_DIR_MAX_BLOCKS || (i_size & (AFS_DIR_BLOCK_SIZE - 1))) { clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); @@ -336,7 +336,7 @@ found_space: if (b < AFS_DIR_BLOCKS_WITH_CTR) meta->meta.alloc_ctrs[b] -= need_slots; - inode_inc_iversion_raw(&vnode->vfs_inode); + inode_inc_iversion_raw(&vnode->netfs.inode); afs_stat_v(vnode, n_dir_cr); _debug("Insert %s in %u[%u]", name->name, b, slot); @@ -383,7 +383,7 @@ void afs_edit_dir_remove(struct afs_vnode *vnode, _enter(",,{%d,%s},", name->len, name->name); - i_size = i_size_read(&vnode->vfs_inode); + i_size = i_size_read(&vnode->netfs.inode); if (i_size < AFS_DIR_BLOCK_SIZE || i_size > AFS_DIR_BLOCK_SIZE * AFS_DIR_MAX_BLOCKS || (i_size & (AFS_DIR_BLOCK_SIZE - 1))) { @@ -463,7 +463,7 @@ found_dirent: if (b < AFS_DIR_BLOCKS_WITH_CTR) meta->meta.alloc_ctrs[b] += need_slots; - inode_set_iversion_raw(&vnode->vfs_inode, vnode->status.data_version); + inode_set_iversion_raw(&vnode->netfs.inode, vnode->status.data_version); afs_stat_v(vnode, n_dir_rm); _debug("Remove %s from %u[%u]", name->name, b, slot); diff --git a/fs/afs/dir_silly.c b/fs/afs/dir_silly.c index 45cfd50a9521..bb5807e87fa4 100644 --- a/fs/afs/dir_silly.c +++ b/fs/afs/dir_silly.c @@ -131,7 +131,7 @@ int afs_sillyrename(struct afs_vnode *dvnode, struct afs_vnode *vnode, goto out; } while (!d_is_negative(sdentry)); - ihold(&vnode->vfs_inode); + ihold(&vnode->netfs.inode); ret = afs_do_silly_rename(dvnode, vnode, dentry, sdentry, key); switch (ret) { @@ -148,7 +148,7 @@ int afs_sillyrename(struct afs_vnode *dvnode, struct afs_vnode *vnode, d_drop(sdentry); } - iput(&vnode->vfs_inode); + iput(&vnode->netfs.inode); dput(sdentry); out: _leave(" = %d", ret); diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c index f120bcb8bf73..3a5bbffdf053 100644 --- a/fs/afs/dynroot.c +++ b/fs/afs/dynroot.c @@ -76,7 +76,7 @@ struct inode *afs_iget_pseudo_dir(struct super_block *sb, bool root) /* there shouldn't be an existing inode */ BUG_ON(!(inode->i_state & I_NEW)); - netfs_i_context_init(inode, NULL); + netfs_inode_init(inode, NULL); inode->i_size = 0; inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; if (root) { diff --git a/fs/afs/file.c b/fs/afs/file.c index a8e8832179e4..4de7af7c2f09 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -194,7 +194,7 @@ int afs_release(struct inode *inode, struct file *file) afs_put_wb_key(af->wb); if ((file->f_mode & FMODE_WRITE)) { - i_size = i_size_read(&vnode->vfs_inode); + i_size = i_size_read(&vnode->netfs.inode); afs_set_cache_aux(vnode, &aux); fscache_unuse_cookie(afs_vnode_cache(vnode), &aux, &i_size); } else { @@ -325,7 +325,7 @@ static void afs_issue_read(struct netfs_io_subrequest *subreq) fsreq->iter = &fsreq->def_iter; iov_iter_xarray(&fsreq->def_iter, READ, - &fsreq->vnode->vfs_inode.i_mapping->i_pages, + &fsreq->vnode->netfs.inode.i_mapping->i_pages, fsreq->pos, fsreq->len); afs_fetch_data(fsreq->vnode, fsreq); diff --git a/fs/afs/fs_operation.c b/fs/afs/fs_operation.c index d222dfbe976b..7a3803ce3a22 100644 --- a/fs/afs/fs_operation.c +++ b/fs/afs/fs_operation.c @@ -232,14 +232,14 @@ int afs_put_operation(struct afs_operation *op) if (op->file[1].modification && op->file[1].vnode != op->file[0].vnode) clear_bit(AFS_VNODE_MODIFYING, &op->file[1].vnode->flags); if (op->file[0].put_vnode) - iput(&op->file[0].vnode->vfs_inode); + iput(&op->file[0].vnode->netfs.inode); if (op->file[1].put_vnode) - iput(&op->file[1].vnode->vfs_inode); + iput(&op->file[1].vnode->netfs.inode); if (op->more_files) { for (i = 0; i < op->nr_files - 2; i++) if (op->more_files[i].put_vnode) - iput(&op->more_files[i].vnode->vfs_inode); + iput(&op->more_files[i].vnode->netfs.inode); kfree(op->more_files); } diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 65b439cd53d2..22811e9eacf5 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -25,9 +25,6 @@ #include "internal.h" #include "afs_fs.h" -// Temporary: netfs does disgusting things with inode pointers -#pragma GCC diagnostic ignored "-Wattribute-warning" - static const struct inode_operations afs_symlink_inode_operations = { .get_link = page_get_link, }; @@ -61,7 +58,7 @@ static noinline void dump_vnode(struct afs_vnode *vnode, struct afs_vnode *paren */ static void afs_set_netfs_context(struct afs_vnode *vnode) { - netfs_i_context_init(&vnode->vfs_inode, &afs_req_ops); + netfs_inode_init(&vnode->netfs.inode, &afs_req_ops); } /* @@ -99,7 +96,7 @@ static int afs_inode_init_from_status(struct afs_operation *op, inode->i_flags |= S_NOATIME; inode->i_uid = make_kuid(&init_user_ns, status->owner); inode->i_gid = make_kgid(&init_user_ns, status->group); - set_nlink(&vnode->vfs_inode, status->nlink); + set_nlink(&vnode->netfs.inode, status->nlink); switch (status->type) { case AFS_FTYPE_FILE: @@ -142,7 +139,7 @@ static int afs_inode_init_from_status(struct afs_operation *op, afs_set_netfs_context(vnode); vnode->invalid_before = status->data_version; - inode_set_iversion_raw(&vnode->vfs_inode, status->data_version); + inode_set_iversion_raw(&vnode->netfs.inode, status->data_version); if (!vp->scb.have_cb) { /* it's a symlink we just created (the fileserver @@ -166,7 +163,7 @@ static void afs_apply_status(struct afs_operation *op, { struct afs_file_status *status = &vp->scb.status; struct afs_vnode *vnode = vp->vnode; - struct inode *inode = &vnode->vfs_inode; + struct inode *inode = &vnode->netfs.inode; struct timespec64 t; umode_t mode; bool data_changed = false; @@ -249,7 +246,7 @@ static void afs_apply_status(struct afs_operation *op, * idea of what the size should be that's not the same as * what's on the server. */ - vnode->netfs_ctx.remote_i_size = status->size; + vnode->netfs.remote_i_size = status->size; if (change_size) { afs_set_i_size(vnode, status->size); inode->i_ctime = t; @@ -292,7 +289,7 @@ void afs_vnode_commit_status(struct afs_operation *op, struct afs_vnode_param *v */ if (vp->scb.status.abort_code == VNOVNODE) { set_bit(AFS_VNODE_DELETED, &vnode->flags); - clear_nlink(&vnode->vfs_inode); + clear_nlink(&vnode->netfs.inode); __afs_break_callback(vnode, afs_cb_break_for_deleted); op->flags &= ~AFS_OPERATION_DIR_CONFLICT; } @@ -309,8 +306,8 @@ void afs_vnode_commit_status(struct afs_operation *op, struct afs_vnode_param *v if (vp->scb.have_cb) afs_apply_callback(op, vp); } else if (vp->op_unlinked && !(op->flags & AFS_OPERATION_DIR_CONFLICT)) { - drop_nlink(&vnode->vfs_inode); - if (vnode->vfs_inode.i_nlink == 0) { + drop_nlink(&vnode->netfs.inode); + if (vnode->netfs.inode.i_nlink == 0) { set_bit(AFS_VNODE_DELETED, &vnode->flags); __afs_break_callback(vnode, afs_cb_break_for_deleted); } @@ -329,7 +326,7 @@ static void afs_fetch_status_success(struct afs_operation *op) struct afs_vnode *vnode = vp->vnode; int ret; - if (vnode->vfs_inode.i_state & I_NEW) { + if (vnode->netfs.inode.i_state & I_NEW) { ret = afs_inode_init_from_status(op, vp, vnode); op->error = ret; if (ret == 0) @@ -433,7 +430,7 @@ static void afs_get_inode_cache(struct afs_vnode *vnode) struct afs_vnode_cache_aux aux; if (vnode->status.type != AFS_FTYPE_FILE) { - vnode->netfs_ctx.cache = NULL; + vnode->netfs.cache = NULL; return; } @@ -460,7 +457,7 @@ static void afs_get_inode_cache(struct afs_vnode *vnode) struct inode *afs_iget(struct afs_operation *op, struct afs_vnode_param *vp) { struct afs_vnode_param *dvp = &op->file[0]; - struct super_block *sb = dvp->vnode->vfs_inode.i_sb; + struct super_block *sb = dvp->vnode->netfs.inode.i_sb; struct afs_vnode *vnode; struct inode *inode; int ret; @@ -585,10 +582,10 @@ static void afs_zap_data(struct afs_vnode *vnode) /* nuke all the non-dirty pages that aren't locked, mapped or being * written back in a regular file and completely discard the pages in a * directory or symlink */ - if (S_ISREG(vnode->vfs_inode.i_mode)) - invalidate_remote_inode(&vnode->vfs_inode); + if (S_ISREG(vnode->netfs.inode.i_mode)) + invalidate_remote_inode(&vnode->netfs.inode); else - invalidate_inode_pages2(vnode->vfs_inode.i_mapping); + invalidate_inode_pages2(vnode->netfs.inode.i_mapping); } /* @@ -686,8 +683,8 @@ int afs_validate(struct afs_vnode *vnode, struct key *key) key_serial(key)); if (unlikely(test_bit(AFS_VNODE_DELETED, &vnode->flags))) { - if (vnode->vfs_inode.i_nlink) - clear_nlink(&vnode->vfs_inode); + if (vnode->netfs.inode.i_nlink) + clear_nlink(&vnode->netfs.inode); goto valid; } @@ -829,7 +826,7 @@ void afs_evict_inode(struct inode *inode) static void afs_setattr_success(struct afs_operation *op) { struct afs_vnode_param *vp = &op->file[0]; - struct inode *inode = &vp->vnode->vfs_inode; + struct inode *inode = &vp->vnode->netfs.inode; loff_t old_i_size = i_size_read(inode); op->setattr.old_i_size = old_i_size; @@ -846,7 +843,7 @@ static void afs_setattr_success(struct afs_operation *op) static void afs_setattr_edit_file(struct afs_operation *op) { struct afs_vnode_param *vp = &op->file[0]; - struct inode *inode = &vp->vnode->vfs_inode; + struct inode *inode = &vp->vnode->netfs.inode; if (op->setattr.attr->ia_valid & ATTR_SIZE) { loff_t size = op->setattr.attr->ia_size; @@ -878,7 +875,7 @@ int afs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, ATTR_MTIME | ATTR_MTIME_SET | ATTR_TIMES_SET | ATTR_TOUCH; struct afs_operation *op; struct afs_vnode *vnode = AFS_FS_I(d_inode(dentry)); - struct inode *inode = &vnode->vfs_inode; + struct inode *inode = &vnode->netfs.inode; loff_t i_size; int ret; diff --git a/fs/afs/internal.h b/fs/afs/internal.h index a30995901266..984b113a9107 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -619,12 +619,7 @@ enum afs_lock_state { * leak from one inode to another. */ struct afs_vnode { - struct { - /* These must be contiguous */ - struct inode vfs_inode; /* the VFS's inode record */ - struct netfs_i_context netfs_ctx; /* Netfslib context */ - }; - + struct netfs_inode netfs; /* Netfslib context and vfs inode */ struct afs_volume *volume; /* volume on which vnode resides */ struct afs_fid fid; /* the file identifier for this inode */ struct afs_file_status status; /* AFS status info for this file */ @@ -675,7 +670,7 @@ struct afs_vnode { static inline struct fscache_cookie *afs_vnode_cache(struct afs_vnode *vnode) { #ifdef CONFIG_AFS_FSCACHE - return netfs_i_cookie(&vnode->vfs_inode); + return netfs_i_cookie(&vnode->netfs.inode); #else return NULL; #endif @@ -685,7 +680,7 @@ static inline void afs_vnode_set_cache(struct afs_vnode *vnode, struct fscache_cookie *cookie) { #ifdef CONFIG_AFS_FSCACHE - vnode->netfs_ctx.cache = cookie; + vnode->netfs.cache = cookie; #endif } @@ -892,7 +887,7 @@ static inline void afs_invalidate_cache(struct afs_vnode *vnode, unsigned int fl afs_set_cache_aux(vnode, &aux); fscache_invalidate(afs_vnode_cache(vnode), &aux, - i_size_read(&vnode->vfs_inode), flags); + i_size_read(&vnode->netfs.inode), flags); } /* @@ -1217,7 +1212,7 @@ static inline struct afs_net *afs_i2net(struct inode *inode) static inline struct afs_net *afs_v2net(struct afs_vnode *vnode) { - return afs_i2net(&vnode->vfs_inode); + return afs_i2net(&vnode->netfs.inode); } static inline struct afs_net *afs_sock2net(struct sock *sk) @@ -1593,12 +1588,12 @@ extern void yfs_fs_store_opaque_acl2(struct afs_operation *); */ static inline struct afs_vnode *AFS_FS_I(struct inode *inode) { - return container_of(inode, struct afs_vnode, vfs_inode); + return container_of(inode, struct afs_vnode, netfs.inode); } static inline struct inode *AFS_VNODE_TO_I(struct afs_vnode *vnode) { - return &vnode->vfs_inode; + return &vnode->netfs.inode; } /* @@ -1621,8 +1616,8 @@ static inline void afs_update_dentry_version(struct afs_operation *op, */ static inline void afs_set_i_size(struct afs_vnode *vnode, u64 size) { - i_size_write(&vnode->vfs_inode, size); - vnode->vfs_inode.i_blocks = ((size + 1023) >> 10) << 1; + i_size_write(&vnode->netfs.inode, size); + vnode->netfs.inode.i_blocks = ((size + 1023) >> 10) << 1; } /* diff --git a/fs/afs/super.c b/fs/afs/super.c index 1fea195b0b27..95d713074dc8 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -659,7 +659,7 @@ static void afs_i_init_once(void *_vnode) struct afs_vnode *vnode = _vnode; memset(vnode, 0, sizeof(*vnode)); - inode_init_once(&vnode->vfs_inode); + inode_init_once(&vnode->netfs.inode); mutex_init(&vnode->io_lock); init_rwsem(&vnode->validate_lock); spin_lock_init(&vnode->wb_lock); @@ -700,8 +700,8 @@ static struct inode *afs_alloc_inode(struct super_block *sb) init_rwsem(&vnode->rmdir_lock); INIT_WORK(&vnode->cb_work, afs_invalidate_mmap_work); - _leave(" = %p", &vnode->vfs_inode); - return &vnode->vfs_inode; + _leave(" = %p", &vnode->netfs.inode); + return &vnode->netfs.inode; } static void afs_free_inode(struct inode *inode) diff --git a/fs/afs/write.c b/fs/afs/write.c index 2236b2165e37..f80a6096d91c 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -146,10 +146,10 @@ int afs_write_end(struct file *file, struct address_space *mapping, write_end_pos = pos + copied; - i_size = i_size_read(&vnode->vfs_inode); + i_size = i_size_read(&vnode->netfs.inode); if (write_end_pos > i_size) { write_seqlock(&vnode->cb_lock); - i_size = i_size_read(&vnode->vfs_inode); + i_size = i_size_read(&vnode->netfs.inode); if (write_end_pos > i_size) afs_set_i_size(vnode, write_end_pos); write_sequnlock(&vnode->cb_lock); @@ -257,7 +257,7 @@ static void afs_redirty_pages(struct writeback_control *wbc, */ static void afs_pages_written_back(struct afs_vnode *vnode, loff_t start, unsigned int len) { - struct address_space *mapping = vnode->vfs_inode.i_mapping; + struct address_space *mapping = vnode->netfs.inode.i_mapping; struct folio *folio; pgoff_t end; @@ -354,7 +354,6 @@ static const struct afs_operation_ops afs_store_data_operation = { static int afs_store_data(struct afs_vnode *vnode, struct iov_iter *iter, loff_t pos, bool laundering) { - struct netfs_i_context *ictx = &vnode->netfs_ctx; struct afs_operation *op; struct afs_wb_key *wbk = NULL; loff_t size = iov_iter_count(iter); @@ -385,9 +384,9 @@ static int afs_store_data(struct afs_vnode *vnode, struct iov_iter *iter, loff_t op->store.write_iter = iter; op->store.pos = pos; op->store.size = size; - op->store.i_size = max(pos + size, ictx->remote_i_size); + op->store.i_size = max(pos + size, vnode->netfs.remote_i_size); op->store.laundering = laundering; - op->mtime = vnode->vfs_inode.i_mtime; + op->mtime = vnode->netfs.inode.i_mtime; op->flags |= AFS_OPERATION_UNINTR; op->ops = &afs_store_data_operation; @@ -554,7 +553,7 @@ static ssize_t afs_write_back_from_locked_folio(struct address_space *mapping, struct iov_iter iter; unsigned long priv; unsigned int offset, to, len, max_len; - loff_t i_size = i_size_read(&vnode->vfs_inode); + loff_t i_size = i_size_read(&vnode->netfs.inode); bool new_content = test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags); bool caching = fscache_cookie_enabled(afs_vnode_cache(vnode)); long count = wbc->nr_to_write; @@ -845,7 +844,7 @@ ssize_t afs_file_write(struct kiocb *iocb, struct iov_iter *from) _enter("{%llx:%llu},{%zu},", vnode->fid.vid, vnode->fid.vnode, count); - if (IS_SWAPFILE(&vnode->vfs_inode)) { + if (IS_SWAPFILE(&vnode->netfs.inode)) { printk(KERN_INFO "AFS: Attempt to write to active swap file!\n"); return -EBUSY; @@ -958,8 +957,8 @@ void afs_prune_wb_keys(struct afs_vnode *vnode) /* Discard unused keys */ spin_lock(&vnode->wb_lock); - if (!mapping_tagged(&vnode->vfs_inode.i_data, PAGECACHE_TAG_WRITEBACK) && - !mapping_tagged(&vnode->vfs_inode.i_data, PAGECACHE_TAG_DIRTY)) { + if (!mapping_tagged(&vnode->netfs.inode.i_data, PAGECACHE_TAG_WRITEBACK) && + !mapping_tagged(&vnode->netfs.inode.i_data, PAGECACHE_TAG_DIRTY)) { list_for_each_entry_safe(wbk, tmp, &vnode->wb_keys, vnode_link) { if (refcount_read(&wbk->usage) == 1) list_move(&wbk->vnode_link, &graveyard); @@ -1034,6 +1033,6 @@ static void afs_write_to_cache(struct afs_vnode *vnode, bool caching) { fscache_write_to_cache(afs_vnode_cache(vnode), - vnode->vfs_inode.i_mapping, start, len, i_size, + vnode->netfs.inode.i_mapping, start, len, i_size, afs_write_to_cache_done, vnode, caching); } diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index e5221be6eb55..f5f116ed1b9e 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1798,7 +1798,7 @@ enum { static int __ceph_pool_perm_get(struct ceph_inode_info *ci, s64 pool, struct ceph_string *pool_ns) { - struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode); + struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->netfs.inode); struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_osd_request *rd_req = NULL, *wr_req = NULL; struct rb_node **p, *parent; @@ -1913,7 +1913,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, 0, false, true); err = ceph_osdc_start_request(&fsc->client->osdc, rd_req, false); - wr_req->r_mtime = ci->vfs_inode.i_mtime; + wr_req->r_mtime = ci->netfs.inode.i_mtime; err2 = ceph_osdc_start_request(&fsc->client->osdc, wr_req, false); if (!err) diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index ddea99922073..177d8e8d73fe 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -29,9 +29,9 @@ void ceph_fscache_register_inode_cookie(struct inode *inode) if (!(inode->i_state & I_NEW)) return; - WARN_ON_ONCE(ci->netfs_ctx.cache); + WARN_ON_ONCE(ci->netfs.cache); - ci->netfs_ctx.cache = + ci->netfs.cache = fscache_acquire_cookie(fsc->fscache, 0, &ci->i_vino, sizeof(ci->i_vino), &ci->i_version, sizeof(ci->i_version), diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h index 7255b790a4c1..26c6ae06e2f4 100644 --- a/fs/ceph/cache.h +++ b/fs/ceph/cache.h @@ -28,7 +28,7 @@ void ceph_fscache_invalidate(struct inode *inode, bool dio_write); static inline struct fscache_cookie *ceph_fscache_cookie(struct ceph_inode_info *ci) { - return netfs_i_cookie(&ci->vfs_inode); + return netfs_i_cookie(&ci->netfs.inode); } static inline void ceph_fscache_resize(struct inode *inode, loff_t to) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index bf2e94005598..38c930384d41 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -492,7 +492,7 @@ static void __cap_set_timeouts(struct ceph_mds_client *mdsc, struct ceph_mount_options *opt = mdsc->fsc->mount_options; ci->i_hold_caps_max = round_jiffies(jiffies + opt->caps_wanted_delay_max * HZ); - dout("__cap_set_timeouts %p %lu\n", &ci->vfs_inode, + dout("__cap_set_timeouts %p %lu\n", &ci->netfs.inode, ci->i_hold_caps_max - jiffies); } @@ -507,7 +507,7 @@ static void __cap_set_timeouts(struct ceph_mds_client *mdsc, static void __cap_delay_requeue(struct ceph_mds_client *mdsc, struct ceph_inode_info *ci) { - dout("__cap_delay_requeue %p flags 0x%lx at %lu\n", &ci->vfs_inode, + dout("__cap_delay_requeue %p flags 0x%lx at %lu\n", &ci->netfs.inode, ci->i_ceph_flags, ci->i_hold_caps_max); if (!mdsc->stopping) { spin_lock(&mdsc->cap_delay_lock); @@ -531,7 +531,7 @@ no_change: static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc, struct ceph_inode_info *ci) { - dout("__cap_delay_requeue_front %p\n", &ci->vfs_inode); + dout("__cap_delay_requeue_front %p\n", &ci->netfs.inode); spin_lock(&mdsc->cap_delay_lock); ci->i_ceph_flags |= CEPH_I_FLUSH; if (!list_empty(&ci->i_cap_delay_list)) @@ -548,7 +548,7 @@ static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc, static void __cap_delay_cancel(struct ceph_mds_client *mdsc, struct ceph_inode_info *ci) { - dout("__cap_delay_cancel %p\n", &ci->vfs_inode); + dout("__cap_delay_cancel %p\n", &ci->netfs.inode); if (list_empty(&ci->i_cap_delay_list)) return; spin_lock(&mdsc->cap_delay_lock); @@ -568,7 +568,7 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap, * Each time we receive FILE_CACHE anew, we increment * i_rdcache_gen. */ - if (S_ISREG(ci->vfs_inode.i_mode) && + if (S_ISREG(ci->netfs.inode.i_mode) && (issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && (had & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0) { ci->i_rdcache_gen++; @@ -583,14 +583,14 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap, if ((issued & CEPH_CAP_FILE_SHARED) != (had & CEPH_CAP_FILE_SHARED)) { if (issued & CEPH_CAP_FILE_SHARED) atomic_inc(&ci->i_shared_gen); - if (S_ISDIR(ci->vfs_inode.i_mode)) { - dout(" marking %p NOT complete\n", &ci->vfs_inode); + if (S_ISDIR(ci->netfs.inode.i_mode)) { + dout(" marking %p NOT complete\n", &ci->netfs.inode); __ceph_dir_clear_complete(ci); } } /* Wipe saved layout if we're losing DIR_CREATE caps */ - if (S_ISDIR(ci->vfs_inode.i_mode) && (had & CEPH_CAP_DIR_CREATE) && + if (S_ISDIR(ci->netfs.inode.i_mode) && (had & CEPH_CAP_DIR_CREATE) && !(issued & CEPH_CAP_DIR_CREATE)) { ceph_put_string(rcu_dereference_raw(ci->i_cached_layout.pool_ns)); memset(&ci->i_cached_layout, 0, sizeof(ci->i_cached_layout)); @@ -771,7 +771,7 @@ static int __cap_is_valid(struct ceph_cap *cap) if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) { dout("__cap_is_valid %p cap %p issued %s " - "but STALE (gen %u vs %u)\n", &cap->ci->vfs_inode, + "but STALE (gen %u vs %u)\n", &cap->ci->netfs.inode, cap, ceph_cap_string(cap->issued), cap->cap_gen, gen); return 0; } @@ -797,7 +797,7 @@ int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented) if (!__cap_is_valid(cap)) continue; dout("__ceph_caps_issued %p cap %p issued %s\n", - &ci->vfs_inode, cap, ceph_cap_string(cap->issued)); + &ci->netfs.inode, cap, ceph_cap_string(cap->issued)); have |= cap->issued; if (implemented) *implemented |= cap->implemented; @@ -844,12 +844,12 @@ static void __touch_cap(struct ceph_cap *cap) spin_lock(&s->s_cap_lock); if (!s->s_cap_iterator) { - dout("__touch_cap %p cap %p mds%d\n", &cap->ci->vfs_inode, cap, + dout("__touch_cap %p cap %p mds%d\n", &cap->ci->netfs.inode, cap, s->s_mds); list_move_tail(&cap->session_caps, &s->s_caps); } else { dout("__touch_cap %p cap %p mds%d NOP, iterating over caps\n", - &cap->ci->vfs_inode, cap, s->s_mds); + &cap->ci->netfs.inode, cap, s->s_mds); } spin_unlock(&s->s_cap_lock); } @@ -867,7 +867,7 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch) if ((have & mask) == mask) { dout("__ceph_caps_issued_mask ino 0x%llx snap issued %s" - " (mask %s)\n", ceph_ino(&ci->vfs_inode), + " (mask %s)\n", ceph_ino(&ci->netfs.inode), ceph_cap_string(have), ceph_cap_string(mask)); return 1; @@ -879,7 +879,7 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch) continue; if ((cap->issued & mask) == mask) { dout("__ceph_caps_issued_mask ino 0x%llx cap %p issued %s" - " (mask %s)\n", ceph_ino(&ci->vfs_inode), cap, + " (mask %s)\n", ceph_ino(&ci->netfs.inode), cap, ceph_cap_string(cap->issued), ceph_cap_string(mask)); if (touch) @@ -891,7 +891,7 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch) have |= cap->issued; if ((have & mask) == mask) { dout("__ceph_caps_issued_mask ino 0x%llx combo issued %s" - " (mask %s)\n", ceph_ino(&ci->vfs_inode), + " (mask %s)\n", ceph_ino(&ci->netfs.inode), ceph_cap_string(cap->issued), ceph_cap_string(mask)); if (touch) { @@ -919,7 +919,7 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch) int __ceph_caps_issued_mask_metric(struct ceph_inode_info *ci, int mask, int touch) { - struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb); + struct ceph_fs_client *fsc = ceph_sb_to_client(ci->netfs.inode.i_sb); int r; r = __ceph_caps_issued_mask(ci, mask, touch); @@ -950,7 +950,7 @@ int __ceph_caps_revoking_other(struct ceph_inode_info *ci, int ceph_caps_revoking(struct ceph_inode_info *ci, int mask) { - struct inode *inode = &ci->vfs_inode; + struct inode *inode = &ci->netfs.inode; int ret; spin_lock(&ci->i_ceph_lock); @@ -969,8 +969,8 @@ int __ceph_caps_used(struct ceph_inode_info *ci) if (ci->i_rd_ref) used |= CEPH_CAP_FILE_RD; if (ci->i_rdcache_ref || - (S_ISREG(ci->vfs_inode.i_mode) && - ci->vfs_inode.i_data.nrpages)) + (S_ISREG(ci->netfs.inode.i_mode) && + ci->netfs.inode.i_data.nrpages)) used |= CEPH_CAP_FILE_CACHE; if (ci->i_wr_ref) used |= CEPH_CAP_FILE_WR; @@ -993,11 +993,11 @@ int __ceph_caps_file_wanted(struct ceph_inode_info *ci) const int WR_SHIFT = ffs(CEPH_FILE_MODE_WR); const int LAZY_SHIFT = ffs(CEPH_FILE_MODE_LAZY); struct ceph_mount_options *opt = - ceph_inode_to_client(&ci->vfs_inode)->mount_options; + ceph_inode_to_client(&ci->netfs.inode)->mount_options; unsigned long used_cutoff = jiffies - opt->caps_wanted_delay_max * HZ; unsigned long idle_cutoff = jiffies - opt->caps_wanted_delay_min * HZ; - if (S_ISDIR(ci->vfs_inode.i_mode)) { + if (S_ISDIR(ci->netfs.inode.i_mode)) { int want = 0; /* use used_cutoff here, to keep dir's wanted caps longer */ @@ -1050,7 +1050,7 @@ int __ceph_caps_file_wanted(struct ceph_inode_info *ci) int __ceph_caps_wanted(struct ceph_inode_info *ci) { int w = __ceph_caps_file_wanted(ci) | __ceph_caps_used(ci); - if (S_ISDIR(ci->vfs_inode.i_mode)) { + if (S_ISDIR(ci->netfs.inode.i_mode)) { /* we want EXCL if holding caps of dir ops */ if (w & CEPH_CAP_ANY_DIR_OPS) w |= CEPH_CAP_FILE_EXCL; @@ -1116,9 +1116,9 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release) lockdep_assert_held(&ci->i_ceph_lock); - dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode); + dout("__ceph_remove_cap %p from %p\n", cap, &ci->netfs.inode); - mdsc = ceph_inode_to_client(&ci->vfs_inode)->mdsc; + mdsc = ceph_inode_to_client(&ci->netfs.inode)->mdsc; /* remove from inode's cap rbtree, and clear auth cap */ rb_erase(&cap->ci_node, &ci->i_caps); @@ -1169,7 +1169,7 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release) * keep i_snap_realm. */ if (ci->i_wr_ref == 0 && ci->i_snap_realm) - ceph_change_snap_realm(&ci->vfs_inode, NULL); + ceph_change_snap_realm(&ci->netfs.inode, NULL); __cap_delay_cancel(mdsc, ci); } @@ -1188,11 +1188,11 @@ void ceph_remove_cap(struct ceph_cap *cap, bool queue_release) lockdep_assert_held(&ci->i_ceph_lock); - fsc = ceph_inode_to_client(&ci->vfs_inode); + fsc = ceph_inode_to_client(&ci->netfs.inode); WARN_ON_ONCE(ci->i_auth_cap == cap && !list_empty(&ci->i_dirty_item) && !fsc->blocklisted && - !ceph_inode_is_shutdown(&ci->vfs_inode)); + !ceph_inode_is_shutdown(&ci->netfs.inode)); __ceph_remove_cap(cap, queue_release); } @@ -1343,7 +1343,7 @@ static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap, int flushing, u64 flush_tid, u64 oldest_flush_tid) { struct ceph_inode_info *ci = cap->ci; - struct inode *inode = &ci->vfs_inode; + struct inode *inode = &ci->netfs.inode; int held, revoking; lockdep_assert_held(&ci->i_ceph_lock); @@ -1440,7 +1440,7 @@ static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap, static void __send_cap(struct cap_msg_args *arg, struct ceph_inode_info *ci) { struct ceph_msg *msg; - struct inode *inode = &ci->vfs_inode; + struct inode *inode = &ci->netfs.inode; msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, CAP_MSG_SIZE, GFP_NOFS, false); if (!msg) { @@ -1528,7 +1528,7 @@ static void __ceph_flush_snaps(struct ceph_inode_info *ci, __releases(ci->i_ceph_lock) __acquires(ci->i_ceph_lock) { - struct inode *inode = &ci->vfs_inode; + struct inode *inode = &ci->netfs.inode; struct ceph_mds_client *mdsc = session->s_mdsc; struct ceph_cap_snap *capsnap; u64 oldest_flush_tid = 0; @@ -1622,7 +1622,7 @@ static void __ceph_flush_snaps(struct ceph_inode_info *ci, void ceph_flush_snaps(struct ceph_inode_info *ci, struct ceph_mds_session **psession) { - struct inode *inode = &ci->vfs_inode; + struct inode *inode = &ci->netfs.inode; struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; struct ceph_mds_session *session = NULL; int mds; @@ -1682,8 +1682,8 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask, struct ceph_cap_flush **pcf) { struct ceph_mds_client *mdsc = - ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; - struct inode *inode = &ci->vfs_inode; + ceph_sb_to_client(ci->netfs.inode.i_sb)->mdsc; + struct inode *inode = &ci->netfs.inode; int was = ci->i_dirty_caps; int dirty = 0; @@ -1696,7 +1696,7 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask, return 0; } - dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->vfs_inode, + dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->netfs.inode, ceph_cap_string(mask), ceph_cap_string(was), ceph_cap_string(was | mask)); ci->i_dirty_caps |= mask; @@ -1712,7 +1712,7 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask, ci->i_snap_realm->cached_context); } dout(" inode %p now dirty snapc %p auth cap %p\n", - &ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap); + &ci->netfs.inode, ci->i_head_snapc, ci->i_auth_cap); BUG_ON(!list_empty(&ci->i_dirty_item)); spin_lock(&mdsc->cap_dirty_lock); list_add(&ci->i_dirty_item, &session->s_cap_dirty); @@ -1875,7 +1875,7 @@ static int try_nonblocking_invalidate(struct inode *inode) bool __ceph_should_report_size(struct ceph_inode_info *ci) { - loff_t size = i_size_read(&ci->vfs_inode); + loff_t size = i_size_read(&ci->netfs.inode); /* mds will adjust max size according to the reported size */ if (ci->i_flushing_caps & CEPH_CAP_FILE_WR) return false; @@ -1900,7 +1900,7 @@ bool __ceph_should_report_size(struct ceph_inode_info *ci) void ceph_check_caps(struct ceph_inode_info *ci, int flags, struct ceph_mds_session *session) { - struct inode *inode = &ci->vfs_inode; + struct inode *inode = &ci->netfs.inode; struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); struct ceph_cap *cap; u64 flush_tid, oldest_flush_tid; @@ -2467,7 +2467,7 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc, __releases(ci->i_ceph_lock) __acquires(ci->i_ceph_lock) { - struct inode *inode = &ci->vfs_inode; + struct inode *inode = &ci->netfs.inode; struct ceph_cap *cap; struct ceph_cap_flush *cf; int ret; @@ -2560,7 +2560,7 @@ void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc, cap = ci->i_auth_cap; if (!(cap && cap->session == session)) { pr_err("%p auth cap %p not mds%d ???\n", - &ci->vfs_inode, cap, session->s_mds); + &ci->netfs.inode, cap, session->s_mds); spin_unlock(&ci->i_ceph_lock); continue; } @@ -2610,7 +2610,7 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, cap = ci->i_auth_cap; if (!(cap && cap->session == session)) { pr_err("%p auth cap %p not mds%d ???\n", - &ci->vfs_inode, cap, session->s_mds); + &ci->netfs.inode, cap, session->s_mds); spin_unlock(&ci->i_ceph_lock); continue; } @@ -2630,7 +2630,7 @@ void ceph_kick_flushing_inode_caps(struct ceph_mds_session *session, lockdep_assert_held(&ci->i_ceph_lock); - dout("%s %p flushing %s\n", __func__, &ci->vfs_inode, + dout("%s %p flushing %s\n", __func__, &ci->netfs.inode, ceph_cap_string(ci->i_flushing_caps)); if (!list_empty(&ci->i_cap_flush_list)) { @@ -2673,10 +2673,10 @@ void ceph_take_cap_refs(struct ceph_inode_info *ci, int got, } if (got & CEPH_CAP_FILE_BUFFER) { if (ci->i_wb_ref == 0) - ihold(&ci->vfs_inode); + ihold(&ci->netfs.inode); ci->i_wb_ref++; dout("%s %p wb %d -> %d (?)\n", __func__, - &ci->vfs_inode, ci->i_wb_ref-1, ci->i_wb_ref); + &ci->netfs.inode, ci->i_wb_ref-1, ci->i_wb_ref); } } @@ -3004,7 +3004,7 @@ int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, int *got return ret; } - if (S_ISREG(ci->vfs_inode.i_mode) && + if (S_ISREG(ci->netfs.inode.i_mode) && ci->i_inline_version != CEPH_INLINE_NONE && (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && i_size_read(inode) > 0) { @@ -3094,7 +3094,7 @@ enum put_cap_refs_mode { static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had, enum put_cap_refs_mode mode) { - struct inode *inode = &ci->vfs_inode; + struct inode *inode = &ci->netfs.inode; int last = 0, put = 0, flushsnaps = 0, wake = 0; bool check_flushsnaps = false; @@ -3202,7 +3202,7 @@ void ceph_put_cap_refs_no_check_caps(struct ceph_inode_info *ci, int had) void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, struct ceph_snap_context *snapc) { - struct inode *inode = &ci->vfs_inode; + struct inode *inode = &ci->netfs.inode; struct ceph_cap_snap *capsnap = NULL, *iter; int put = 0; bool last = false; @@ -3698,7 +3698,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, session->s_mds, &list_first_entry(&session->s_cap_flushing, struct ceph_inode_info, - i_flushing_item)->vfs_inode); + i_flushing_item)->netfs.inode); } } mdsc->num_cap_flushing--; @@ -4345,7 +4345,7 @@ unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc) break; list_del_init(&ci->i_cap_delay_list); - inode = igrab(&ci->vfs_inode); + inode = igrab(&ci->netfs.inode); if (inode) { spin_unlock(&mdsc->cap_delay_lock); dout("check_delayed_caps on %p\n", inode); @@ -4373,7 +4373,7 @@ static void flush_dirty_session_caps(struct ceph_mds_session *s) while (!list_empty(&s->s_cap_dirty)) { ci = list_first_entry(&s->s_cap_dirty, struct ceph_inode_info, i_dirty_item); - inode = &ci->vfs_inode; + inode = &ci->netfs.inode; ihold(inode); dout("flush_dirty_caps %llx.%llx\n", ceph_vinop(inode)); spin_unlock(&mdsc->cap_dirty_lock); @@ -4407,7 +4407,7 @@ void __ceph_touch_fmode(struct ceph_inode_info *ci, void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count) { - struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->vfs_inode.i_sb); + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->netfs.inode.i_sb); int bits = (fmode << 1) | 1; bool already_opened = false; int i; @@ -4441,7 +4441,7 @@ void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count) */ void ceph_put_fmode(struct ceph_inode_info *ci, int fmode, int count) { - struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->vfs_inode.i_sb); + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->netfs.inode.i_sb); int bits = (fmode << 1) | 1; bool is_closed = true; int i; @@ -4656,7 +4656,7 @@ int ceph_purge_inode_cap(struct inode *inode, struct ceph_cap *cap, bool *invali lockdep_assert_held(&ci->i_ceph_lock); dout("removing cap %p, ci is %p, inode is %p\n", - cap, ci, &ci->vfs_inode); + cap, ci, &ci->netfs.inode); is_auth = (cap == ci->i_auth_cap); __ceph_remove_cap(cap, false); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 8c8226c0feac..da59e836a06e 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -205,7 +205,7 @@ static int ceph_init_file_info(struct inode *inode, struct file *file, { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mount_options *opt = - ceph_inode_to_client(&ci->vfs_inode)->mount_options; + ceph_inode_to_client(&ci->netfs.inode)->mount_options; struct ceph_file_info *fi; int ret; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 5f1cc2b4ed06..650746b3ba99 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -20,9 +20,6 @@ #include "cache.h" #include -// Temporary: netfs does disgusting things with inode pointers -#pragma GCC diagnostic ignored "-Wattribute-warning" - /* * Ceph inode operations * @@ -179,7 +176,7 @@ static struct ceph_inode_frag *__get_or_create_frag(struct ceph_inode_info *ci, rb_insert_color(&frag->node, &ci->i_fragtree); dout("get_or_create_frag added %llx.%llx frag %x\n", - ceph_vinop(&ci->vfs_inode), f); + ceph_vinop(&ci->netfs.inode), f); return frag; } @@ -460,10 +457,10 @@ struct inode *ceph_alloc_inode(struct super_block *sb) if (!ci) return NULL; - dout("alloc_inode %p\n", &ci->vfs_inode); + dout("alloc_inode %p\n", &ci->netfs.inode); /* Set parameters for the netfs library */ - netfs_i_context_init(&ci->vfs_inode, &ceph_netfs_ops); + netfs_inode_init(&ci->netfs.inode, &ceph_netfs_ops); spin_lock_init(&ci->i_ceph_lock); @@ -550,7 +547,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb) INIT_WORK(&ci->i_work, ceph_inode_work); ci->i_work_mask = 0; memset(&ci->i_btime, '\0', sizeof(ci->i_btime)); - return &ci->vfs_inode; + return &ci->netfs.inode; } void ceph_free_inode(struct inode *inode) @@ -1981,7 +1978,7 @@ static void ceph_inode_work(struct work_struct *work) { struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info, i_work); - struct inode *inode = &ci->vfs_inode; + struct inode *inode = &ci->netfs.inode; if (test_and_clear_bit(CEPH_I_WORK_WRITEBACK, &ci->i_work_mask)) { dout("writeback %p\n", inode); diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index f5d110d90b77..33f517d549ce 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1564,7 +1564,7 @@ int ceph_iterate_session_caps(struct ceph_mds_session *session, p = session->s_caps.next; while (p != &session->s_caps) { cap = list_entry(p, struct ceph_cap, session_caps); - inode = igrab(&cap->ci->vfs_inode); + inode = igrab(&cap->ci->netfs.inode); if (!inode) { p = p->next; continue; @@ -1622,7 +1622,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, int iputs; dout("removing cap %p, ci is %p, inode is %p\n", - cap, ci, &ci->vfs_inode); + cap, ci, &ci->netfs.inode); spin_lock(&ci->i_ceph_lock); iputs = ceph_purge_inode_cap(inode, cap, &invalidate); spin_unlock(&ci->i_ceph_lock); diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 322ee5add942..864cdaa0d2bd 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -521,7 +521,7 @@ static bool has_new_snaps(struct ceph_snap_context *o, static void ceph_queue_cap_snap(struct ceph_inode_info *ci, struct ceph_cap_snap **pcapsnap) { - struct inode *inode = &ci->vfs_inode; + struct inode *inode = &ci->netfs.inode; struct ceph_snap_context *old_snapc, *new_snapc; struct ceph_cap_snap *capsnap = *pcapsnap; struct ceph_buffer *old_blob = NULL; @@ -652,7 +652,7 @@ update_snapc: int __ceph_finish_cap_snap(struct ceph_inode_info *ci, struct ceph_cap_snap *capsnap) { - struct inode *inode = &ci->vfs_inode; + struct inode *inode = &ci->netfs.inode; struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); BUG_ON(capsnap->writing); @@ -712,7 +712,7 @@ static void queue_realm_cap_snaps(struct ceph_snap_realm *realm) spin_lock(&realm->inodes_with_caps_lock); list_for_each_entry(ci, &realm->inodes_with_caps, i_snap_realm_item) { - struct inode *inode = igrab(&ci->vfs_inode); + struct inode *inode = igrab(&ci->netfs.inode); if (!inode) continue; spin_unlock(&realm->inodes_with_caps_lock); @@ -904,7 +904,7 @@ static void flush_snaps(struct ceph_mds_client *mdsc) while (!list_empty(&mdsc->snap_flush_list)) { ci = list_first_entry(&mdsc->snap_flush_list, struct ceph_inode_info, i_snap_flush_item); - inode = &ci->vfs_inode; + inode = &ci->netfs.inode; ihold(inode); spin_unlock(&mdsc->snap_flush_lock); ceph_flush_snaps(ci, &session); diff --git a/fs/ceph/super.c b/fs/ceph/super.c index b73b4f75462c..40140805bdcf 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -876,7 +876,7 @@ mempool_t *ceph_wb_pagevec_pool; static void ceph_inode_init_once(void *foo) { struct ceph_inode_info *ci = foo; - inode_init_once(&ci->vfs_inode); + inode_init_once(&ci->netfs.inode); } static int __init init_caches(void) diff --git a/fs/ceph/super.h b/fs/ceph/super.h index dd7dac0f984a..f59dac66955b 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -316,11 +316,7 @@ struct ceph_inode_xattrs_info { * Ceph inode. */ struct ceph_inode_info { - struct { - /* These must be contiguous */ - struct inode vfs_inode; - struct netfs_i_context netfs_ctx; /* Netfslib context */ - }; + struct netfs_inode netfs; /* Netfslib context and vfs inode */ struct ceph_vino i_vino; /* ceph ino + snap */ spinlock_t i_ceph_lock; @@ -436,7 +432,7 @@ struct ceph_inode_info { static inline struct ceph_inode_info * ceph_inode(const struct inode *inode) { - return container_of(inode, struct ceph_inode_info, vfs_inode); + return container_of(inode, struct ceph_inode_info, netfs.inode); } static inline struct ceph_fs_client * @@ -1316,7 +1312,7 @@ static inline void __ceph_update_quota(struct ceph_inode_info *ci, has_quota = __ceph_has_quota(ci, QUOTA_GET_ANY); if (had_quota != has_quota) - ceph_adjust_quota_realms_count(&ci->vfs_inode, has_quota); + ceph_adjust_quota_realms_count(&ci->netfs.inode, has_quota); } extern void ceph_handle_quota(struct ceph_mds_client *mdsc, diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 8c2dc2c762a4..f141f5246163 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -57,7 +57,7 @@ static bool ceph_vxattrcb_layout_exists(struct ceph_inode_info *ci) static ssize_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, size_t size) { - struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb); + struct ceph_fs_client *fsc = ceph_sb_to_client(ci->netfs.inode.i_sb); struct ceph_osd_client *osdc = &fsc->client->osdc; struct ceph_string *pool_ns; s64 pool = ci->i_layout.pool_id; @@ -69,7 +69,7 @@ static ssize_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, pool_ns = ceph_try_get_string(ci->i_layout.pool_ns); - dout("ceph_vxattrcb_layout %p\n", &ci->vfs_inode); + dout("ceph_vxattrcb_layout %p\n", &ci->netfs.inode); down_read(&osdc->lock); pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool); if (pool_name) { @@ -161,7 +161,7 @@ static ssize_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci, char *val, size_t size) { ssize_t ret; - struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb); + struct ceph_fs_client *fsc = ceph_sb_to_client(ci->netfs.inode.i_sb); struct ceph_osd_client *osdc = &fsc->client->osdc; s64 pool = ci->i_layout.pool_id; const char *pool_name; @@ -313,7 +313,7 @@ static ssize_t ceph_vxattrcb_snap_btime(struct ceph_inode_info *ci, char *val, static ssize_t ceph_vxattrcb_cluster_fsid(struct ceph_inode_info *ci, char *val, size_t size) { - struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb); + struct ceph_fs_client *fsc = ceph_sb_to_client(ci->netfs.inode.i_sb); return ceph_fmt_xattr(val, size, "%pU", &fsc->client->fsid); } @@ -321,7 +321,7 @@ static ssize_t ceph_vxattrcb_cluster_fsid(struct ceph_inode_info *ci, static ssize_t ceph_vxattrcb_client_id(struct ceph_inode_info *ci, char *val, size_t size) { - struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb); + struct ceph_fs_client *fsc = ceph_sb_to_client(ci->netfs.inode.i_sb); return ceph_fmt_xattr(val, size, "client%lld", ceph_client_gid(fsc->client)); @@ -629,7 +629,7 @@ static int __set_xattr(struct ceph_inode_info *ci, } dout("__set_xattr_val added %llx.%llx xattr %p %.*s=%.*s\n", - ceph_vinop(&ci->vfs_inode), xattr, name_len, name, val_len, val); + ceph_vinop(&ci->netfs.inode), xattr, name_len, name, val_len, val); return 0; } @@ -871,7 +871,7 @@ struct ceph_buffer *__ceph_build_xattrs_blob(struct ceph_inode_info *ci) struct ceph_buffer *old_blob = NULL; void *dest; - dout("__build_xattrs_blob %p\n", &ci->vfs_inode); + dout("__build_xattrs_blob %p\n", &ci->netfs.inode); if (ci->i_xattrs.dirty) { int need = __get_required_blob_size(ci, 0, 0); diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 12c872800326..c85d9a378325 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -377,7 +377,7 @@ cifs_alloc_inode(struct super_block *sb) cifs_inode->flags = 0; spin_lock_init(&cifs_inode->writers_lock); cifs_inode->writers = 0; - cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */ + cifs_inode->netfs.inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */ cifs_inode->server_eof = 0; cifs_inode->uniqueid = 0; cifs_inode->createtime = 0; @@ -389,12 +389,12 @@ cifs_alloc_inode(struct super_block *sb) * Can not set i_flags here - they get immediately overwritten to zero * by the VFS. */ - /* cifs_inode->vfs_inode.i_flags = S_NOATIME | S_NOCMTIME; */ + /* cifs_inode->netfs.inode.i_flags = S_NOATIME | S_NOCMTIME; */ INIT_LIST_HEAD(&cifs_inode->openFileList); INIT_LIST_HEAD(&cifs_inode->llist); INIT_LIST_HEAD(&cifs_inode->deferred_closes); spin_lock_init(&cifs_inode->deferred_lock); - return &cifs_inode->vfs_inode; + return &cifs_inode->netfs.inode; } static void @@ -1418,7 +1418,7 @@ cifs_init_once(void *inode) { struct cifsInodeInfo *cifsi = inode; - inode_init_once(&cifsi->vfs_inode); + inode_init_once(&cifsi->netfs.inode); init_rwsem(&cifsi->lock_sem); } diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index f873379066c7..e7737166e5b8 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -1479,20 +1479,16 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file); #define CIFS_CACHE_RW_FLG (CIFS_CACHE_READ_FLG | CIFS_CACHE_WRITE_FLG) #define CIFS_CACHE_RHW_FLG (CIFS_CACHE_RW_FLG | CIFS_CACHE_HANDLE_FLG) -#define CIFS_CACHE_READ(cinode) ((cinode->oplock & CIFS_CACHE_READ_FLG) || (CIFS_SB(cinode->vfs_inode.i_sb)->mnt_cifs_flags & CIFS_MOUNT_RO_CACHE)) +#define CIFS_CACHE_READ(cinode) ((cinode->oplock & CIFS_CACHE_READ_FLG) || (CIFS_SB(cinode->netfs.inode.i_sb)->mnt_cifs_flags & CIFS_MOUNT_RO_CACHE)) #define CIFS_CACHE_HANDLE(cinode) (cinode->oplock & CIFS_CACHE_HANDLE_FLG) -#define CIFS_CACHE_WRITE(cinode) ((cinode->oplock & CIFS_CACHE_WRITE_FLG) || (CIFS_SB(cinode->vfs_inode.i_sb)->mnt_cifs_flags & CIFS_MOUNT_RW_CACHE)) +#define CIFS_CACHE_WRITE(cinode) ((cinode->oplock & CIFS_CACHE_WRITE_FLG) || (CIFS_SB(cinode->netfs.inode.i_sb)->mnt_cifs_flags & CIFS_MOUNT_RW_CACHE)) /* * One of these for each file inode */ struct cifsInodeInfo { - struct { - /* These must be contiguous */ - struct inode vfs_inode; /* the VFS's inode record */ - struct netfs_i_context netfs_ctx; /* Netfslib context */ - }; + struct netfs_inode netfs; /* Netfslib context and vfs inode */ bool can_cache_brlcks; struct list_head llist; /* locks helb by this inode */ /* @@ -1531,7 +1527,7 @@ struct cifsInodeInfo { static inline struct cifsInodeInfo * CIFS_I(struct inode *inode) { - return container_of(inode, struct cifsInodeInfo, vfs_inode); + return container_of(inode, struct cifsInodeInfo, netfs.inode); } static inline struct cifs_sb_info * diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 1618e0537d58..e64cda7a7610 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2004,7 +2004,7 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode, bool fsuid_only) { struct cifsFileInfo *open_file = NULL; - struct cifs_sb_info *cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb); + struct cifs_sb_info *cifs_sb = CIFS_SB(cifs_inode->netfs.inode.i_sb); /* only filter by fsuid on multiuser mounts */ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)) @@ -2060,7 +2060,7 @@ cifs_get_writable_file(struct cifsInodeInfo *cifs_inode, int flags, return rc; } - cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb); + cifs_sb = CIFS_SB(cifs_inode->netfs.inode.i_sb); /* only filter by fsuid on multiuser mounts */ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)) @@ -4669,14 +4669,14 @@ bool is_size_safe_to_change(struct cifsInodeInfo *cifsInode, __u64 end_of_file) /* This inode is open for write at least once */ struct cifs_sb_info *cifs_sb; - cifs_sb = CIFS_SB(cifsInode->vfs_inode.i_sb); + cifs_sb = CIFS_SB(cifsInode->netfs.inode.i_sb); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO) { /* since no page cache to corrupt on directio we can change size safely */ return true; } - if (i_size_read(&cifsInode->vfs_inode) < end_of_file) + if (i_size_read(&cifsInode->netfs.inode) < end_of_file) return true; return false; diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c index a638b29e9062..23ef56f55ce5 100644 --- a/fs/cifs/fscache.c +++ b/fs/cifs/fscache.c @@ -101,13 +101,13 @@ void cifs_fscache_get_inode_cookie(struct inode *inode) struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); - cifs_fscache_fill_coherency(&cifsi->vfs_inode, &cd); + cifs_fscache_fill_coherency(&cifsi->netfs.inode, &cd); - cifsi->netfs_ctx.cache = + cifsi->netfs.cache = fscache_acquire_cookie(tcon->fscache, 0, &cifsi->uniqueid, sizeof(cifsi->uniqueid), &cd, sizeof(cd), - i_size_read(&cifsi->vfs_inode)); + i_size_read(&cifsi->netfs.inode)); } void cifs_fscache_unuse_inode_cookie(struct inode *inode, bool update) @@ -131,7 +131,7 @@ void cifs_fscache_release_inode_cookie(struct inode *inode) if (cookie) { cifs_dbg(FYI, "%s: (0x%p)\n", __func__, cookie); fscache_relinquish_cookie(cookie, false); - cifsi->netfs_ctx.cache = NULL; + cifsi->netfs.cache = NULL; } } diff --git a/fs/cifs/fscache.h b/fs/cifs/fscache.h index 52355c0912ae..ab9a51d0125c 100644 --- a/fs/cifs/fscache.h +++ b/fs/cifs/fscache.h @@ -52,10 +52,10 @@ void cifs_fscache_fill_coherency(struct inode *inode, struct cifsInodeInfo *cifsi = CIFS_I(inode); memset(cd, 0, sizeof(*cd)); - cd->last_write_time_sec = cpu_to_le64(cifsi->vfs_inode.i_mtime.tv_sec); - cd->last_write_time_nsec = cpu_to_le32(cifsi->vfs_inode.i_mtime.tv_nsec); - cd->last_change_time_sec = cpu_to_le64(cifsi->vfs_inode.i_ctime.tv_sec); - cd->last_change_time_nsec = cpu_to_le32(cifsi->vfs_inode.i_ctime.tv_nsec); + cd->last_write_time_sec = cpu_to_le64(cifsi->netfs.inode.i_mtime.tv_sec); + cd->last_write_time_nsec = cpu_to_le32(cifsi->netfs.inode.i_mtime.tv_nsec); + cd->last_change_time_sec = cpu_to_le64(cifsi->netfs.inode.i_ctime.tv_sec); + cd->last_change_time_nsec = cpu_to_le32(cifsi->netfs.inode.i_ctime.tv_nsec); } diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 2f9e7d2f81b6..81da81e18553 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -115,7 +115,7 @@ cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr) __func__, cifs_i->uniqueid); set_bit(CIFS_INO_INVALID_MAPPING, &cifs_i->flags); /* Invalidate fscache cookie */ - cifs_fscache_fill_coherency(&cifs_i->vfs_inode, &cd); + cifs_fscache_fill_coherency(&cifs_i->netfs.inode, &cd); fscache_invalidate(cifs_inode_cookie(inode), &cd, i_size_read(inode), 0); } @@ -2499,7 +2499,7 @@ int cifs_fiemap(struct inode *inode, struct fiemap_extent_info *fei, u64 start, u64 len) { struct cifsInodeInfo *cifs_i = CIFS_I(inode); - struct cifs_sb_info *cifs_sb = CIFS_SB(cifs_i->vfs_inode.i_sb); + struct cifs_sb_info *cifs_sb = CIFS_SB(cifs_i->netfs.inode.i_sb); struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); struct TCP_Server_Info *server = tcon->ses->server; struct cifsFileInfo *cfile; diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 35962a1a23b9..cbc3b433f502 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -537,11 +537,11 @@ void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock) if (oplock == OPLOCK_EXCLUSIVE) { cinode->oplock = CIFS_CACHE_WRITE_FLG | CIFS_CACHE_READ_FLG; cifs_dbg(FYI, "Exclusive Oplock granted on inode %p\n", - &cinode->vfs_inode); + &cinode->netfs.inode); } else if (oplock == OPLOCK_READ) { cinode->oplock = CIFS_CACHE_READ_FLG; cifs_dbg(FYI, "Level II Oplock granted on inode %p\n", - &cinode->vfs_inode); + &cinode->netfs.inode); } else cinode->oplock = 0; } diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 98a76fa791c0..8543cafdfd34 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -4260,15 +4260,15 @@ smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock, if (oplock == SMB2_OPLOCK_LEVEL_BATCH) { cinode->oplock = CIFS_CACHE_RHW_FLG; cifs_dbg(FYI, "Batch Oplock granted on inode %p\n", - &cinode->vfs_inode); + &cinode->netfs.inode); } else if (oplock == SMB2_OPLOCK_LEVEL_EXCLUSIVE) { cinode->oplock = CIFS_CACHE_RW_FLG; cifs_dbg(FYI, "Exclusive Oplock granted on inode %p\n", - &cinode->vfs_inode); + &cinode->netfs.inode); } else if (oplock == SMB2_OPLOCK_LEVEL_II) { cinode->oplock = CIFS_CACHE_READ_FLG; cifs_dbg(FYI, "Level II Oplock granted on inode %p\n", - &cinode->vfs_inode); + &cinode->netfs.inode); } else cinode->oplock = 0; } @@ -4307,7 +4307,7 @@ smb21_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock, cinode->oplock = new_oplock; cifs_dbg(FYI, "%s Lease granted on inode %p\n", message, - &cinode->vfs_inode); + &cinode->netfs.inode); } static void diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index 8742d22dfd2b..d37e012386f3 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -155,7 +155,7 @@ static void netfs_rreq_expand(struct netfs_io_request *rreq, void netfs_readahead(struct readahead_control *ractl) { struct netfs_io_request *rreq; - struct netfs_i_context *ctx = netfs_i_context(ractl->mapping->host); + struct netfs_inode *ctx = netfs_inode(ractl->mapping->host); int ret; _enter("%lx,%x", readahead_index(ractl), readahead_count(ractl)); @@ -215,7 +215,7 @@ int netfs_read_folio(struct file *file, struct folio *folio) { struct address_space *mapping = folio_file_mapping(folio); struct netfs_io_request *rreq; - struct netfs_i_context *ctx = netfs_i_context(mapping->host); + struct netfs_inode *ctx = netfs_inode(mapping->host); int ret; _enter("%lx", folio_index(folio)); @@ -331,7 +331,7 @@ int netfs_write_begin(struct file *file, struct address_space *mapping, void **_fsdata) { struct netfs_io_request *rreq; - struct netfs_i_context *ctx = netfs_i_context(file_inode(file )); + struct netfs_inode *ctx = netfs_inode(file_inode(file )); struct folio *folio; unsigned int fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE; pgoff_t index = pos >> PAGE_SHIFT; diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h index b7b0e3d18d9e..43fac1b14e40 100644 --- a/fs/netfs/internal.h +++ b/fs/netfs/internal.h @@ -91,7 +91,7 @@ static inline void netfs_stat_d(atomic_t *stat) /* * Miscellaneous functions. */ -static inline bool netfs_is_cache_enabled(struct netfs_i_context *ctx) +static inline bool netfs_is_cache_enabled(struct netfs_inode *ctx) { #if IS_ENABLED(CONFIG_FSCACHE) struct fscache_cookie *cookie = ctx->cache; diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c index e86107b30ba4..c6afa605b63b 100644 --- a/fs/netfs/objects.c +++ b/fs/netfs/objects.c @@ -18,7 +18,7 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping, { static atomic_t debug_ids; struct inode *inode = file ? file_inode(file) : mapping->host; - struct netfs_i_context *ctx = netfs_i_context(inode); + struct netfs_inode *ctx = netfs_inode(inode); struct netfs_io_request *rreq; int ret; diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 77fa6a61706a..6dbb4c9ce50d 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -119,9 +119,10 @@ typedef void (*netfs_io_terminated_t)(void *priv, ssize_t transferred_or_error, bool was_async); /* - * Per-inode description. This must be directly after the inode struct. + * Per-inode context. This wraps the VFS inode. */ -struct netfs_i_context { +struct netfs_inode { + struct inode inode; /* The VFS inode */ const struct netfs_request_ops *ops; #if IS_ENABLED(CONFIG_FSCACHE) struct fscache_cookie *cache; @@ -256,7 +257,7 @@ struct netfs_cache_ops { * boundary as appropriate. */ enum netfs_io_source (*prepare_read)(struct netfs_io_subrequest *subreq, - loff_t i_size); + loff_t i_size); /* Prepare a write operation, working out what part of the write we can * actually do. @@ -288,45 +289,35 @@ extern void netfs_put_subrequest(struct netfs_io_subrequest *subreq, extern void netfs_stats_show(struct seq_file *); /** - * netfs_i_context - Get the netfs inode context from the inode + * netfs_inode - Get the netfs inode context from the inode * @inode: The inode to query * * Get the netfs lib inode context from the network filesystem's inode. The * context struct is expected to directly follow on from the VFS inode struct. */ -static inline struct netfs_i_context *netfs_i_context(struct inode *inode) +static inline struct netfs_inode *netfs_inode(struct inode *inode) { - return (void *)inode + sizeof(*inode); + return container_of(inode, struct netfs_inode, inode); } /** - * netfs_inode - Get the netfs inode from the inode context - * @ctx: The context to query - * - * Get the netfs inode from the netfs library's inode context. The VFS inode - * is expected to directly precede the context struct. - */ -static inline struct inode *netfs_inode(struct netfs_i_context *ctx) -{ - return (void *)ctx - sizeof(struct inode); -} - -/** - * netfs_i_context_init - Initialise a netfs lib context + * netfs_inode_init - Initialise a netfslib inode context * @inode: The inode with which the context is associated * @ops: The netfs's operations list * * Initialise the netfs library context struct. This is expected to follow on * directly from the VFS inode struct. */ -static inline void netfs_i_context_init(struct inode *inode, - const struct netfs_request_ops *ops) +static inline void netfs_inode_init(struct inode *inode, + const struct netfs_request_ops *ops) { - struct netfs_i_context *ctx = netfs_i_context(inode); + struct netfs_inode *ctx = netfs_inode(inode); - memset(ctx, 0, sizeof(*ctx)); ctx->ops = ops; ctx->remote_i_size = i_size_read(inode); +#if IS_ENABLED(CONFIG_FSCACHE) + ctx->cache = NULL; +#endif } /** @@ -338,7 +329,7 @@ static inline void netfs_i_context_init(struct inode *inode, */ static inline void netfs_resize_file(struct inode *inode, loff_t new_i_size) { - struct netfs_i_context *ctx = netfs_i_context(inode); + struct netfs_inode *ctx = netfs_inode(inode); ctx->remote_i_size = new_i_size; } @@ -352,7 +343,7 @@ static inline void netfs_resize_file(struct inode *inode, loff_t new_i_size) static inline struct fscache_cookie *netfs_i_cookie(struct inode *inode) { #if IS_ENABLED(CONFIG_FSCACHE) - struct netfs_i_context *ctx = netfs_i_context(inode); + struct netfs_inode *ctx = netfs_inode(inode); return ctx->cache; #else return NULL;