From 079eefaecfd7bbb8fcc30eccb0dfdf50c91f1805 Mon Sep 17 00:00:00 2001
From: Edward Adam Davis
Date: Sun, 24 Dec 2023 00:02:49 +0000
Subject: [PATCH 001/151] keys, dns: Fix missing size check of V1 server-list header

commit 1997b3cb4217b09e49659b634c94da47f0340409 upstream.

The dns_resolver_preparse() function has a check on the size of the
payload for the basic header of the binary-style payload, but is missing
a check for the size of the V1 server-list payload header after
determining that's what we've been given.

Fix this by getting rid of the pointer to the basic header, just
assuming that we have a V1 server-list payload and moving the V1 server
list pointer inside the if-statement. Dealing with other types and
versions can be left for when such have been defined.

This can be tested by doing the following with KASAN enabled:

    echo -n -e '\x0\x0\x1\x2' | keyctl padd dns_resolver foo @p

and produces an oops like the following:

    BUG: KASAN: slab-out-of-bounds in dns_resolver_preparse+0xc9f/0xd60 net/dns_resolver/dns_key.c:127
    Read of size 1 at addr ffff888028894084 by task syz-executor265/5069
    ...
    Call Trace:
     dns_resolver_preparse+0xc9f/0xd60 net/dns_resolver/dns_key.c:127
     __key_create_or_update+0x453/0xdf0 security/keys/key.c:842
     key_create_or_update+0x42/0x50 security/keys/key.c:1007
     __do_sys_add_key+0x29c/0x450 security/keys/keyctl.c:134
     do_syscall_x64 arch/x86/entry/common.c:52 [inline]
     do_syscall_64+0x40/0x110 arch/x86/entry/common.c:83
     entry_SYSCALL_64_after_hwframe+0x62/0x6a

This patch was originally by Edward Adam Davis, but was modified by
Linus.

Fixes: b946001d3bb1 ("keys, dns: Allow key types (eg. DNS) to be reclaimed immediately on expiry")
Reported-and-tested-by: syzbot+94bbb75204a05da3d89f@syzkaller.appspotmail.com
Link: https://lore.kernel.org/r/0000000000009b39bc060c73e209@google.com/
Suggested-by: Linus Torvalds
Signed-off-by: Edward Adam Davis
Signed-off-by: David Howells
Tested-by: David Howells
Cc: Edward Adam Davis
Cc: Jarkko Sakkinen
Cc: Jeffrey E Altman
Cc: Wang Lei
Cc: Jeff Layton
Cc: Steve French
Cc: Marc Dionne
Cc: "David S. Miller"
Cc: Eric Dumazet
Cc: Jakub Kicinski
Cc: Paolo Abeni
Reviewed-by: Simon Horman
Signed-off-by: Linus Torvalds
Cc: Jeffrey E Altman
Signed-off-by: Greg Kroah-Hartman
---
 net/dns_resolver/dns_key.c | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/net/dns_resolver/dns_key.c b/net/dns_resolver/dns_key.c
index 03f8f33dc134..8324e9f97066 100644
--- a/net/dns_resolver/dns_key.c
+++ b/net/dns_resolver/dns_key.c
@@ -91,8 +91,6 @@ const struct cred *dns_resolver_cache;
 static int
 dns_resolver_preparse(struct key_preparsed_payload *prep)
 {
-	const struct dns_server_list_v1_header *v1;
-	const struct dns_payload_header *bin;
 	struct user_key_payload *upayload;
 	unsigned long derrno;
 	int ret;
@@ -103,27 +101,28 @@ dns_resolver_preparse(struct key_preparsed_payload *prep)
 		return -EINVAL;

 	if (data[0] == 0) {
+		const struct dns_server_list_v1_header *v1;
+
 		/* It may be a server list. */
-		if (datalen <= sizeof(*bin))
+		if (datalen <= sizeof(*v1))
 			return -EINVAL;

-		bin = (const struct dns_payload_header *)data;
-		kenter("[%u,%u],%u", bin->content, bin->version, datalen);
-		if (bin->content != DNS_PAYLOAD_IS_SERVER_LIST) {
+		v1 = (const struct dns_server_list_v1_header *)data;
+		kenter("[%u,%u],%u", v1->hdr.content, v1->hdr.version, datalen);
+		if (v1->hdr.content != DNS_PAYLOAD_IS_SERVER_LIST) {
 			pr_warn_ratelimited(
 				"dns_resolver: Unsupported content type (%u)\n",
-				bin->content);
+				v1->hdr.content);
 			return -EINVAL;
 		}

-		if (bin->version != 1) {
+		if (v1->hdr.version != 1) {
 			pr_warn_ratelimited(
 				"dns_resolver: Unsupported server list version (%u)\n",
-				bin->version);
+				v1->hdr.version);
 			return -EINVAL;
 		}

-		v1 = (const struct dns_server_list_v1_header *)bin;
 		if ((v1->status != DNS_LOOKUP_GOOD &&
 		     v1->status != DNS_LOOKUP_GOOD_WITH_BAD)) {
 			if (prep->expiry == TIME64_MAX)

From 9539e3b56e0d822de3807d56c90545ef650467ab Mon Sep 17 00:00:00 2001
From: Sarthak Kukreti
Date: Wed, 11 Oct 2023 13:12:30 -0700
Subject: [PATCH 002/151] block: Don't invalidate pagecache for invalid falloc modes

commit 1364a3c391aedfeb32aa025303ead3d7c91cdf9d upstream.

Only call truncate_bdev_range() if the fallocate mode is supported. This
fixes a bug where data in the pagecache could be invalidated if
fallocate() was called on the block device with an invalid mode.

Fixes: 25f4c41415e5 ("block: implement (some of) fallocate for block devices")
Cc: stable@vger.kernel.org
Reported-by: "Darrick J. Wong"
Signed-off-by: Sarthak Kukreti
Reviewed-by: Christoph Hellwig
Reviewed-by: "Darrick J. Wong"
Signed-off-by: Mike Snitzer
Link: https://lore.kernel.org/r/20231011201230.750105-1-sarthakkukreti@chromium.org
Signed-off-by: Jens Axboe
Signed-off-by: Sarthak Kukreti
Signed-off-by: Greg Kroah-Hartman
---
 block/fops.c | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/block/fops.c b/block/fops.c
index 6197d1c41652..01cb6260fa24 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -655,24 +655,35 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,

 	filemap_invalidate_lock(inode->i_mapping);

-	/* Invalidate the page cache, including dirty pages. */
-	error = truncate_bdev_range(bdev, file->f_mode, start, end);
-	if (error)
-		goto fail;
-
+	/*
+	 * Invalidate the page cache, including dirty pages, for valid
+	 * de-allocate mode calls to fallocate().
+ */ switch (mode) { case FALLOC_FL_ZERO_RANGE: case FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE: + error = truncate_bdev_range(bdev, file->f_mode, start, end); + if (error) + goto fail; + error = blkdev_issue_zeroout(bdev, start >> SECTOR_SHIFT, len >> SECTOR_SHIFT, GFP_KERNEL, BLKDEV_ZERO_NOUNMAP); break; case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE: + error = truncate_bdev_range(bdev, file->f_mode, start, end); + if (error) + goto fail; + error = blkdev_issue_zeroout(bdev, start >> SECTOR_SHIFT, len >> SECTOR_SHIFT, GFP_KERNEL, BLKDEV_ZERO_NOFALLBACK); break; case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE | FALLOC_FL_NO_HIDE_STALE: + error = truncate_bdev_range(bdev, file->f_mode, start, end); + if (error) + goto fail; + error = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT, len >> SECTOR_SHIFT, GFP_KERNEL); break; From beda900d3aaf0a8c900d03d827ee6cf3722f29fe Mon Sep 17 00:00:00 2001 From: Aabish Malik Date: Fri, 29 Dec 2023 22:33:54 +0530 Subject: [PATCH 003/151] ALSA: hda/realtek: enable SND_PCI_QUIRK for hp pavilion 14-ec1xxx series commit 13a5b21197587a3d9cac9e1a00de9b91526a55e4 upstream. The HP Pavilion 14 ec1xxx series uses the HP mainboard 8A0F with the ALC287 codec. The mute led can be enabled using the already existing ALC287_FIXUP_HP_GPIO_LED quirk. Tested on an HP Pavilion ec1003AU Signed-off-by: Aabish Malik Cc: Link: https://lore.kernel.org/r/20231229170352.742261-3-aabishmalik3337@gmail.com Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index a88ed60dcd96..06cd2402fdd2 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -9663,6 +9663,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x103c, 0x89c6, "Zbook Fury 17 G9", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x89ca, "HP", ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF), SND_PCI_QUIRK(0x103c, 0x89d3, "HP EliteBook 645 G9 (MB 89D2)", ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF), + SND_PCI_QUIRK(0x103c, 0x8a0f, "HP Pavilion 14-ec1xxx", ALC287_FIXUP_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8a20, "HP Laptop 15s-fq5xxx", ALC236_FIXUP_HP_MUTE_LED_COEFBIT2), SND_PCI_QUIRK(0x103c, 0x8a25, "HP Victus 16-d1xxx (MB 8A25)", ALC245_FIXUP_HP_MUTE_LED_COEFBIT), SND_PCI_QUIRK(0x103c, 0x8a78, "HP Dev One", ALC285_FIXUP_HP_LIMIT_INT_MIC_BOOST), From 0fa3cf2d151e6542e8825433b8f89d2a20bdac89 Mon Sep 17 00:00:00 2001 From: Andy Chi Date: Tue, 2 Jan 2024 10:49:15 +0800 Subject: [PATCH 004/151] ALSA: hda/realtek: fix mute/micmute LEDs for a HP ZBook commit 18a434f32fa61b3fda8ddcd9a63d5274569c6a41 upstream. There is a HP ZBook which using ALC236 codec and need the ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF quirk to make mute LED and micmute LED work. 
[ confirmed that the new entries are for new models that have no proper
  name, so the strings are left as "HP" which will be updated eventually
  later -- tiwai ]

Signed-off-by: Andy Chi
Cc:
Link: https://lore.kernel.org/r/20240102024916.19093-1-andy.chi@canonical.com
Signed-off-by: Takashi Iwai
Signed-off-by: Greg Kroah-Hartman
---
 sound/pci/hda/patch_realtek.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c
index 06cd2402fdd2..f35dc1cdd428 100644
--- a/sound/pci/hda/patch_realtek.c
+++ b/sound/pci/hda/patch_realtek.c
@@ -9708,6 +9708,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = {
 	SND_PCI_QUIRK(0x103c, 0x8c70, "HP EliteBook 835 G11", ALC287_FIXUP_CS35L41_I2C_2_HP_GPIO_LED),
 	SND_PCI_QUIRK(0x103c, 0x8c71, "HP EliteBook 845 G11", ALC287_FIXUP_CS35L41_I2C_2_HP_GPIO_LED),
 	SND_PCI_QUIRK(0x103c, 0x8c72, "HP EliteBook 865 G11", ALC287_FIXUP_CS35L41_I2C_2_HP_GPIO_LED),
+	SND_PCI_QUIRK(0x103c, 0x8c96, "HP", ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF),
 	SND_PCI_QUIRK(0x103c, 0x8ca4, "HP ZBook Fury", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED),
 	SND_PCI_QUIRK(0x103c, 0x8ca7, "HP ZBook Fury", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED),
 	SND_PCI_QUIRK(0x103c, 0x8cf5, "HP ZBook Studio 16", ALC245_FIXUP_CS35L41_SPI_4_HP_GPIO_LED),

From 105063f7f44192c794f1a1fef223a3ee9dd18678 Mon Sep 17 00:00:00 2001
From: Siddhesh Dharme
Date: Thu, 4 Jan 2024 11:37:36 +0530
Subject: [PATCH 005/151] ALSA: hda/realtek: Fix mute and mic-mute LEDs for HP ProBook 440 G6

commit b6ce6e6c79e4ec650887f1fe391a70e54972001a upstream.

LEDs in 'HP ProBook 440 G6' laptop are controlled by ALC236 codec.
Enable already existing quirk 'ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF'
to fix mute and mic-mute LEDs.

Signed-off-by: Siddhesh Dharme
Cc:
Link: https://lore.kernel.org/r/20240104060736.5149-1-siddheshdharme18@gmail.com
Signed-off-by: Takashi Iwai
Signed-off-by: Greg Kroah-Hartman
---
 sound/pci/hda/patch_realtek.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c
index f35dc1cdd428..919f3e391de1 100644
--- a/sound/pci/hda/patch_realtek.c
+++ b/sound/pci/hda/patch_realtek.c
@@ -9581,6 +9581,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = {
 	SND_PCI_QUIRK(0x103c, 0x84da, "HP OMEN dc0019-ur", ALC295_FIXUP_HP_OMEN),
 	SND_PCI_QUIRK(0x103c, 0x84e7, "HP Pavilion 15", ALC269_FIXUP_HP_MUTE_LED_MIC3),
 	SND_PCI_QUIRK(0x103c, 0x8519, "HP Spectre x360 15-df0xxx", ALC285_FIXUP_HP_SPECTRE_X360),
+	SND_PCI_QUIRK(0x103c, 0x8537, "HP ProBook 440 G6", ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF),
 	SND_PCI_QUIRK(0x103c, 0x860f, "HP ZBook 15 G6", ALC285_FIXUP_HP_GPIO_AMP_INIT),
 	SND_PCI_QUIRK(0x103c, 0x861f, "HP Elite Dragonfly G1", ALC285_FIXUP_HP_GPIO_AMP_INIT),
 	SND_PCI_QUIRK(0x103c, 0x869d, "HP", ALC236_FIXUP_HP_MUTE_LED),

From af9a5307656d150d6704b38d8155c4ada2a128b1 Mon Sep 17 00:00:00 2001
From: Paolo Abeni
Date: Tue, 26 Dec 2023 13:10:18 +0100
Subject: [PATCH 006/151] mptcp: prevent tcp diag from closing listener subflows

commit 4c0288299fd09ee7c6fbe2f57421f314d8c981db upstream.

The MPTCP protocol does not expect that any other entity could change
the first subflow status when such a socket is listening. Unfortunately
the TCP diag interface allows aborting any TCP socket, including MPTCP
listener subflows. As reported by syzbot, this triggers a WARN() and
could lead to bigger trouble later.

The MPTCP protocol needs to do some MPTCP-level cleanup actions to
properly shut down the listener.
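For context, the diag path that reaches the subflow looks roughly like
this (simplified from net/core/sock_diag.c; shown only to illustrate
where the new hook sits, not the verbatim upstream code):

	int sock_diag_destroy(struct sock *sk, int err)
	{
		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
			return -EPERM;

		if (!sk->sk_prot->diag_destroy)
			return -EOPNOTSUPP;

		/* For MPTCP subflows this now lands in the
		 * tcp_abort_override() wrapper added below, which rejects
		 * TCP_LISTEN sockets before tcp_abort() can run.
		 */
		return sk->sk_prot->diag_destroy(sk, err);
	}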
To keep the fix simple, prevent entirely the diag interface from stopping such listeners. We could refine the diag callback in a later, larger patch targeting net-next. Fixes: 57fc0f1ceaa4 ("mptcp: ensure listener is unhashed before updating the sk status") Cc: stable@vger.kernel.org Reported-by: Closes: https://lore.kernel.org/netdev/0000000000004f4579060c68431b@google.com/ Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts Link: https://lore.kernel.org/r/20231226-upstream-net-20231226-mptcp-prevent-warn-v1-2-1404dcc431ea@kernel.org Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman --- net/mptcp/subflow.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index d611783c2601..8ed7769cae83 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -1899,6 +1899,17 @@ static void tcp_release_cb_override(struct sock *ssk) tcp_release_cb(ssk); } +static int tcp_abort_override(struct sock *ssk, int err) +{ + /* closing a listener subflow requires a great deal of care. + * keep it simple and just prevent such operation + */ + if (inet_sk_state_load(ssk) == TCP_LISTEN) + return -EINVAL; + + return tcp_abort(ssk, err); +} + static struct tcp_ulp_ops subflow_ulp_ops __read_mostly = { .name = "mptcp", .owner = THIS_MODULE, @@ -1942,6 +1953,7 @@ void __init mptcp_subflow_init(void) tcp_prot_override = tcp_prot; tcp_prot_override.release_cb = tcp_release_cb_override; + tcp_prot_override.diag_destroy = tcp_abort_override; #if IS_ENABLED(CONFIG_MPTCP_IPV6) /* In struct mptcp_subflow_request_sock, we assume the TCP request sock @@ -1977,6 +1989,7 @@ void __init mptcp_subflow_init(void) tcpv6_prot_override = tcpv6_prot; tcpv6_prot_override.release_cb = tcp_release_cb_override; + tcpv6_prot_override.diag_destroy = tcp_abort_override; #endif mptcp_diag_subflow_init(&subflow_ulp_ops); From b9c370b61d735a0e5390c42771e7eb21413f7868 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Mon, 1 Jan 2024 12:08:18 -0600 Subject: [PATCH 007/151] Revert "PCI/ASPM: Remove pcie_aspm_pm_state_change()" commit f93e71aea6c60ebff8adbd8941e678302d377869 upstream. This reverts commit 08d0cc5f34265d1a1e3031f319f594bd1970976c. Michael reported that when attempting to resume from suspend to RAM on ASUS mini PC PN51-BB757MDE1 (DMI model: MINIPC PN51-E1), 08d0cc5f3426 ("PCI/ASPM: Remove pcie_aspm_pm_state_change()") caused a 12-second delay with no output, followed by a reboot. 
Workarounds include:

  - Reverting 08d0cc5f3426 ("PCI/ASPM: Remove pcie_aspm_pm_state_change()")
  - Booting with "pcie_aspm=off"
  - Booting with "pcie_aspm.policy=performance"
  - "echo 0 | sudo tee /sys/bus/pci/devices/0000:03:00.0/link/l1_aspm"
    before suspending
  - Connecting a USB flash drive

Link: https://lore.kernel.org/r/20240102232550.1751655-1-helgaas@kernel.org
Fixes: 08d0cc5f3426 ("PCI/ASPM: Remove pcie_aspm_pm_state_change()")
Reported-by: Michael Schaller
Link: https://lore.kernel.org/r/76c61361-b8b4-435f-a9f1-32b716763d62@5challer.de
Signed-off-by: Bjorn Helgaas
Cc:
Signed-off-by: Greg Kroah-Hartman
---
 drivers/pci/pci.c       |  6 ++++++
 drivers/pci/pci.h       |  2 ++
 drivers/pci/pcie/aspm.c | 19 +++++++++++++++++++
 3 files changed, 27 insertions(+)

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 8df156c28aad..5368a37154cf 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -1302,6 +1302,9 @@ static int pci_set_full_power_state(struct pci_dev *dev)
 		pci_restore_bars(dev);
 	}

+	if (dev->bus->self)
+		pcie_aspm_pm_state_change(dev->bus->self);
+
 	return 0;
 }

@@ -1396,6 +1399,9 @@ static int pci_set_low_power_state(struct pci_dev *dev, pci_power_t state)
 		pci_power_name(dev->current_state),
 		pci_power_name(state));

+	if (dev->bus->self)
+		pcie_aspm_pm_state_change(dev->bus->self);
+
 	return 0;
 }

diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index ffccb03933e2..ed6d75d138c7 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -561,10 +561,12 @@ bool pcie_wait_for_link(struct pci_dev *pdev, bool active);
 #ifdef CONFIG_PCIEASPM
 void pcie_aspm_init_link_state(struct pci_dev *pdev);
 void pcie_aspm_exit_link_state(struct pci_dev *pdev);
+void pcie_aspm_pm_state_change(struct pci_dev *pdev);
 void pcie_aspm_powersave_config_link(struct pci_dev *pdev);
 #else
 static inline void pcie_aspm_init_link_state(struct pci_dev *pdev) { }
 static inline void pcie_aspm_exit_link_state(struct pci_dev *pdev) { }
+static inline void pcie_aspm_pm_state_change(struct pci_dev *pdev) { }
 static inline void pcie_aspm_powersave_config_link(struct pci_dev *pdev) { }
 #endif

diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c
index 5d1756f53ba8..25736d408e88 100644
--- a/drivers/pci/pcie/aspm.c
+++ b/drivers/pci/pcie/aspm.c
@@ -1055,6 +1055,25 @@ void pcie_aspm_exit_link_state(struct pci_dev *pdev)
 	up_read(&pci_bus_sem);
 }

+/* @pdev: the root port or switch downstream port */
+void pcie_aspm_pm_state_change(struct pci_dev *pdev)
+{
+	struct pcie_link_state *link = pdev->link_state;
+
+	if (aspm_disabled || !link)
+		return;
+	/*
+	 * Devices changed PM state, we should recheck if latency
+	 * meets all functions' requirement
+	 */
+	down_read(&pci_bus_sem);
+	mutex_lock(&aspm_lock);
+	pcie_update_aspm_capable(link->root);
+	pcie_config_aspm_path(link);
+	mutex_unlock(&aspm_lock);
+	up_read(&pci_bus_sem);
+}
+
 void pcie_aspm_powersave_config_link(struct pci_dev *pdev)
 {
 	struct pcie_link_state *link = pdev->link_state;

From 4afcb82518b9b0a07551734662b770513bf0a9ba Mon Sep 17 00:00:00 2001
From: Jocelyn Falempe
Date: Thu, 14 Dec 2023 17:38:06 +0100
Subject: [PATCH 008/151] drm/mgag200: Fix gamma lut not initialized for G200ER, G200EV, G200SE

commit 11f9eb899ecc8c02b769cf8d2532ba12786a7af7 upstream.

When mgag200 switched from simple KMS to regular atomic helpers, the
initialization of the gamma settings was lost. This leads to a black
screen if the BIOS/UEFI doesn't use the same pixel color depth.
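To make "gamma settings" concrete: the missing initialization is an
identity lookup table in the display controller's DAC. A minimal sketch
of what such an init does (illustrative only; write_lut_entry() is a
placeholder, the driver's real helpers are the mgag200_crtc_set_gamma*()
functions exported below):

	/* Illustrative sketch: program an identity (linear) gamma ramp so
	 * pixel values pass through unchanged regardless of what the
	 * firmware left in the LUT.
	 */
	static void set_gamma_linear_sketch(struct mga_device *mdev)
	{
		int i;

		for (i = 0; i < 256; i++)
			write_lut_entry(mdev, i, i, i, i); /* index, r, g, b */
	}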
This has been fixed with commit ad81e23426a6 ("drm/mgag200: Fix gamma lut not initialized.") for most G200, but G200ER, G200EV, G200SE use their own version of crtc_helper_atomic_enable() and need to be fixed too. Fixes: 1baf9127c482 ("drm/mgag200: Replace simple-KMS with regular atomic helpers") Cc: #v6.1+ Reported-by: Roger Sewell Suggested-by: Roger Sewell Signed-off-by: Jocelyn Falempe Reviewed-by: Thomas Zimmermann Link: https://patchwork.freedesktop.org/patch/msgid/20231214163849.359691-1-jfalempe@redhat.com Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/mgag200/mgag200_drv.h | 5 +++++ drivers/gpu/drm/mgag200/mgag200_g200er.c | 5 +++++ drivers/gpu/drm/mgag200/mgag200_g200ev.c | 5 +++++ drivers/gpu/drm/mgag200/mgag200_g200se.c | 5 +++++ drivers/gpu/drm/mgag200/mgag200_mode.c | 10 +++++----- 5 files changed, 25 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/mgag200/mgag200_drv.h b/drivers/gpu/drm/mgag200/mgag200_drv.h index f0c2349404b4..aebd09e2d408 100644 --- a/drivers/gpu/drm/mgag200/mgag200_drv.h +++ b/drivers/gpu/drm/mgag200/mgag200_drv.h @@ -390,6 +390,11 @@ void mgag200_primary_plane_helper_atomic_disable(struct drm_plane *plane, .destroy = drm_plane_cleanup, \ DRM_GEM_SHADOW_PLANE_FUNCS +void mgag200_crtc_set_gamma_linear(struct mga_device *mdev, const struct drm_format_info *format); +void mgag200_crtc_set_gamma(struct mga_device *mdev, + const struct drm_format_info *format, + struct drm_color_lut *lut); + enum drm_mode_status mgag200_crtc_helper_mode_valid(struct drm_crtc *crtc, const struct drm_display_mode *mode); int mgag200_crtc_helper_atomic_check(struct drm_crtc *crtc, struct drm_atomic_state *new_state); diff --git a/drivers/gpu/drm/mgag200/mgag200_g200er.c b/drivers/gpu/drm/mgag200/mgag200_g200er.c index bce267e0f7de..8d4538b71047 100644 --- a/drivers/gpu/drm/mgag200/mgag200_g200er.c +++ b/drivers/gpu/drm/mgag200/mgag200_g200er.c @@ -202,6 +202,11 @@ static void mgag200_g200er_crtc_helper_atomic_enable(struct drm_crtc *crtc, mgag200_g200er_reset_tagfifo(mdev); + if (crtc_state->gamma_lut) + mgag200_crtc_set_gamma(mdev, format, crtc_state->gamma_lut->data); + else + mgag200_crtc_set_gamma_linear(mdev, format); + mgag200_enable_display(mdev); if (funcs->enable_vidrst) diff --git a/drivers/gpu/drm/mgag200/mgag200_g200ev.c b/drivers/gpu/drm/mgag200/mgag200_g200ev.c index ac957f42abe1..56e6f986bff3 100644 --- a/drivers/gpu/drm/mgag200/mgag200_g200ev.c +++ b/drivers/gpu/drm/mgag200/mgag200_g200ev.c @@ -203,6 +203,11 @@ static void mgag200_g200ev_crtc_helper_atomic_enable(struct drm_crtc *crtc, mgag200_g200ev_set_hiprilvl(mdev); + if (crtc_state->gamma_lut) + mgag200_crtc_set_gamma(mdev, format, crtc_state->gamma_lut->data); + else + mgag200_crtc_set_gamma_linear(mdev, format); + mgag200_enable_display(mdev); if (funcs->enable_vidrst) diff --git a/drivers/gpu/drm/mgag200/mgag200_g200se.c b/drivers/gpu/drm/mgag200/mgag200_g200se.c index bd6e573c9a1a..ff2b3c6622e7 100644 --- a/drivers/gpu/drm/mgag200/mgag200_g200se.c +++ b/drivers/gpu/drm/mgag200/mgag200_g200se.c @@ -334,6 +334,11 @@ static void mgag200_g200se_crtc_helper_atomic_enable(struct drm_crtc *crtc, mgag200_g200se_set_hiprilvl(mdev, adjusted_mode, format); + if (crtc_state->gamma_lut) + mgag200_crtc_set_gamma(mdev, format, crtc_state->gamma_lut->data); + else + mgag200_crtc_set_gamma_linear(mdev, format); + mgag200_enable_display(mdev); if (funcs->enable_vidrst) diff --git a/drivers/gpu/drm/mgag200/mgag200_mode.c b/drivers/gpu/drm/mgag200/mgag200_mode.c index ae90b260312a..554adf05e073 100644 --- 
a/drivers/gpu/drm/mgag200/mgag200_mode.c +++ b/drivers/gpu/drm/mgag200/mgag200_mode.c @@ -28,8 +28,8 @@ * This file contains setup code for the CRTC. */ -static void mgag200_crtc_set_gamma_linear(struct mga_device *mdev, - const struct drm_format_info *format) +void mgag200_crtc_set_gamma_linear(struct mga_device *mdev, + const struct drm_format_info *format) { int i; @@ -65,9 +65,9 @@ static void mgag200_crtc_set_gamma_linear(struct mga_device *mdev, } } -static void mgag200_crtc_set_gamma(struct mga_device *mdev, - const struct drm_format_info *format, - struct drm_color_lut *lut) +void mgag200_crtc_set_gamma(struct mga_device *mdev, + const struct drm_format_info *format, + struct drm_color_lut *lut) { int i; From 5982a625fc0d56ed5ad4b02f88cb3d1ae432460e Mon Sep 17 00:00:00 2001 From: Shyam Prasad N Date: Fri, 29 Dec 2023 11:16:15 +0000 Subject: [PATCH 009/151] cifs: cifs_chan_is_iface_active should be called with chan_lock held commit 7257bcf3bdc785eabc4eef1f329a59815b032508 upstream. cifs_chan_is_iface_active checks the channels of a session to see if the associated iface is active. This should always happen with chan_lock held. However, these two callers of this function were missing this locking. This change makes sure the function calls are protected with proper locking. Fixes: b54034a73baf ("cifs: during reconnect, update interface if necessary") Fixes: fa1d0508bdd4 ("cifs: account for primary channel in the interface list") Cc: stable@vger.kernel.org Signed-off-by: Shyam Prasad N Signed-off-by: Steve French Signed-off-by: Greg Kroah-Hartman --- fs/smb/client/connect.c | 9 ++++++--- fs/smb/client/smb2ops.c | 7 ++++++- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index f725a119ce31..49fdc6dfdcf8 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -258,10 +258,13 @@ cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server, spin_lock(&cifs_tcp_ses_lock); list_for_each_entry_safe(ses, nses, &pserver->smb_ses_list, smb_ses_list) { /* check if iface is still active */ - if (!cifs_chan_is_iface_active(ses, server)) - cifs_chan_update_iface(ses, server); - spin_lock(&ses->chan_lock); + if (!cifs_chan_is_iface_active(ses, server)) { + spin_unlock(&ses->chan_lock); + cifs_chan_update_iface(ses, server); + spin_lock(&ses->chan_lock); + } + if (!mark_smb_session && cifs_chan_needs_reconnect(ses, server)) { spin_unlock(&ses->chan_lock); continue; diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index df03d80ab6d5..5ac58880c286 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -778,9 +778,14 @@ SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon, bool in_ goto out; /* check if iface is still active */ + spin_lock(&ses->chan_lock); pserver = ses->chans[0].server; - if (pserver && !cifs_chan_is_iface_active(ses, pserver)) + if (pserver && !cifs_chan_is_iface_active(ses, pserver)) { + spin_unlock(&ses->chan_lock); cifs_chan_update_iface(ses, pserver); + spin_lock(&ses->chan_lock); + } + spin_unlock(&ses->chan_lock); out: kfree(out_buf); From 3152a7d361c6713a17d2156c3c4693d1c8e1fa19 Mon Sep 17 00:00:00 2001 From: Shyam Prasad N Date: Fri, 29 Dec 2023 11:16:16 +0000 Subject: [PATCH 010/151] cifs: do not depend on release_iface for maintaining iface_list commit 09eeb0723f219fbd96d8865bf9b935e03ee2ec22 upstream. parse_server_interfaces should be in complete charge of maintaining the iface_list linked list. 
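Condensed from the diff below, the new scheme is a classic mark-and-sweep
over the list, done under ses->iface_lock (the paragraphs that follow
explain why the old refcount-only approach miscounts):

	/* 1) mark: flag every currently known iface inactive */
	list_for_each_entry_safe(iface, niface, &ses->iface_list, iface_head)
		iface->is_active = 0;

	/* 2) re-mark: ifaces the server still advertises are set active
	 *    again while parsing the ioctl response
	 */

	/* 3) sweep: anything still inactive is unlinked and released */
	list_for_each_entry_safe(iface, niface, &ses->iface_list, iface_head) {
		if (!iface->is_active) {
			list_del(&iface->iface_head);
			kref_put(&iface->refcount, release_iface);
			ses->iface_count--;
		}
	}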
Today, iface entries are removed from the list only when the last refcount is dropped. i.e. in release_iface. However, this can result in undercounting of refcount if the server stops advertising interfaces (which Azure SMB server does). This change puts parse_server_interfaces in full charge of maintaining the iface_list. So if an empty list is returned by the server, the entries in the list will immediately be removed. This way, a following call to the same function will not find entries in the list. Fixes: aa45dadd34e4 ("cifs: change iface_list from array to sorted linked list") Cc: stable@vger.kernel.org Signed-off-by: Shyam Prasad N Signed-off-by: Steve French Signed-off-by: Greg Kroah-Hartman --- fs/smb/client/cifsglob.h | 1 - fs/smb/client/smb2ops.c | 27 +++++++++++++++++---------- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index 512ac9dea978..7f1aea4c11b9 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -972,7 +972,6 @@ release_iface(struct kref *ref) struct cifs_server_iface *iface = container_of(ref, struct cifs_server_iface, refcount); - list_del_init(&iface->iface_head); kfree(iface); } diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index 5ac58880c286..285d360eb59a 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -588,16 +588,12 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf, } /* - * Go through iface_list and do kref_put to remove - * any unused ifaces. ifaces in use will be removed - * when the last user calls a kref_put on it + * Go through iface_list and mark them as inactive */ list_for_each_entry_safe(iface, niface, &ses->iface_list, - iface_head) { + iface_head) iface->is_active = 0; - kref_put(&iface->refcount, release_iface); - ses->iface_count--; - } + spin_unlock(&ses->iface_lock); /* @@ -672,10 +668,7 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf, iface_head) { ret = iface_cmp(iface, &tmp_iface); if (!ret) { - /* just get a ref so that it doesn't get picked/freed */ iface->is_active = 1; - kref_get(&iface->refcount); - ses->iface_count++; spin_unlock(&ses->iface_lock); goto next_iface; } else if (ret < 0) { @@ -742,6 +735,20 @@ next_iface: } out: + /* + * Go through the list again and put the inactive entries + */ + spin_lock(&ses->iface_lock); + list_for_each_entry_safe(iface, niface, &ses->iface_list, + iface_head) { + if (!iface->is_active) { + list_del(&iface->iface_head); + kref_put(&iface->refcount, release_iface); + ses->iface_count--; + } + } + spin_unlock(&ses->iface_lock); + return rc; } From 493d556278a3253e3d136b2b1a39485b8772cc94 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 4 Jan 2024 16:15:17 +0100 Subject: [PATCH 011/151] KVM: x86/pmu: fix masking logic for MSR_CORE_PERF_GLOBAL_CTRL commit 971079464001c6856186ca137778e534d983174a upstream. When commit c59a1f106f5c ("KVM: x86/pmu: Add IA32_PEBS_ENABLE MSR emulation for extended PEBS") switched the initialization of cpuc->guest_switch_msrs to use compound literals, it screwed up the boolean logic: + u64 pebs_mask = cpuc->pebs_enabled & x86_pmu.pebs_capable; ... - arr[0].guest = intel_ctrl & ~cpuc->intel_ctrl_host_mask; - arr[0].guest &= ~(cpuc->pebs_enabled & x86_pmu.pebs_capable); + .guest = intel_ctrl & (~cpuc->intel_ctrl_host_mask | ~pebs_mask), Before the patch, the value of arr[0].guest would have been intel_ctrl & ~cpuc->intel_ctrl_host_mask & ~pebs_mask. 
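A toy example (mask values invented for illustration) makes the
difference concrete:

	/* intel_ctrl = 0b1111, host_mask = 0b0011, pebs_mask = 0b0101
	 *
	 * old (correct):  0b1111 & ~0b0011 & ~0b0101    = 0b1000
	 * new (broken):   0b1111 & (~0b0011 | ~0b0101)  = 0b1110
	 *
	 * The broken form keeps any counter bit that escapes *either* mask,
	 * so host-only PEBS counters leak into the guest's GLOBAL_CTRL.
	 */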
The intent is to always treat PEBS events as host-only because, while
the guest runs, there is no way to tell the processor about the virtual
address where to put PEBS records intended for the host.

Unfortunately, the new expression can be expanded to

	(intel_ctrl & ~cpuc->intel_ctrl_host_mask) | (intel_ctrl & ~pebs_mask)

which makes no sense; it includes any bit that isn't *both* marked as
exclude_guest and using PEBS. So, reinstate the old logic.

Another way to write it could be "intel_ctrl &
~(cpuc->intel_ctrl_host_mask | pebs_mask)", presumably the intention of
the author of the faulty code. However, I personally find the repeated
application of A AND NOT B to be a bit more readable.

This shows up as guest failures when running concurrent long-running
perf workloads on the host, and was reported to happen with rcutorture.
All guests on a given host would die simultaneously with something like
an instruction fault or a segmentation violation.

Reported-by: Paul E. McKenney
Analyzed-by: Sean Christopherson
Tested-by: Paul E. McKenney
Cc: stable@vger.kernel.org
Fixes: c59a1f106f5c ("KVM: x86/pmu: Add IA32_PEBS_ENABLE MSR emulation for extended PEBS")
Signed-off-by: Paolo Bonzini
Signed-off-by: Greg Kroah-Hartman
---
 arch/x86/events/intel/core.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 2fb5e1541efc..949129443b1c 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -4033,12 +4033,17 @@ static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr, void *data)
 	u64 pebs_mask = cpuc->pebs_enabled & x86_pmu.pebs_capable;
 	int global_ctrl, pebs_enable;

+	/*
+	 * In addition to obeying exclude_guest/exclude_host, remove bits being
+	 * used for PEBS when running a guest, because PEBS writes to virtual
+	 * addresses (not physical addresses).
+	 */
 	*nr = 0;
 	global_ctrl = (*nr)++;
 	arr[global_ctrl] = (struct perf_guest_switch_msr){
 		.msr = MSR_CORE_PERF_GLOBAL_CTRL,
 		.host = intel_ctrl & ~cpuc->intel_ctrl_guest_mask,
-		.guest = intel_ctrl & (~cpuc->intel_ctrl_host_mask | ~pebs_mask),
+		.guest = intel_ctrl & ~cpuc->intel_ctrl_host_mask & ~pebs_mask,
 	};

 	if (!x86_pmu.pebs)

From 3da4868907dad5fc92cb9eb99a972c10ef9b084d Mon Sep 17 00:00:00 2001
From: Johannes Berg
Date: Fri, 15 Dec 2023 11:13:34 +0100
Subject: [PATCH 012/151] wifi: iwlwifi: pcie: don't synchronize IRQs from IRQ

[ Upstream commit 400f6ebbc175286576c7f7fddf3c347d09d12310 ]

On older devices (before unified image!) we can end up calling
stop_device from an rfkill interrupt. However, in stop_device we attempt
to synchronize IRQs, which then of course deadlocks.

Avoid this by checking the context: if running from the IRQ thread, then
don't synchronize. This wouldn't be correct on a new device since RSS is
supported, but older devices only have a single interrupt/queue.
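Schematically, the deadlock being avoided (call chain simplified):

	rfkill IRQ thread
	  -> iwl_pcie_handle_rfkill_irq()
	    -> ... -> _iwl_trans_pcie_stop_device()
	      -> iwl_pcie_synchronize_irqs()  /* waits for the IRQ thread
	                                       * to finish, i.e. for itself */

Hence the from_irq flag threaded through the call chain below, which
skips the synchronization only when we are already running in that
context.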
Fixes: 37fb29bd1f90 ("wifi: iwlwifi: pcie: synchronize IRQs before NAPI") Reviewed-by: Miri Korenblit Reviewed-by: Emmanuel Grumbach Signed-off-by: Johannes Berg Signed-off-by: Kalle Valo Link: https://msgid.link/20231215111335.59aab00baed7.Iadfe154d6248e7f9dfd69522e5429dbbd72925d7@changeid Signed-off-by: Sasha Levin --- .../net/wireless/intel/iwlwifi/pcie/internal.h | 4 ++-- drivers/net/wireless/intel/iwlwifi/pcie/rx.c | 8 ++++---- drivers/net/wireless/intel/iwlwifi/pcie/trans.c | 17 +++++++++-------- 3 files changed, 15 insertions(+), 14 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/internal.h b/drivers/net/wireless/intel/iwlwifi/pcie/internal.h index 69b95ad5993b..2ec4ee8ab317 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/internal.h +++ b/drivers/net/wireless/intel/iwlwifi/pcie/internal.h @@ -745,7 +745,7 @@ static inline void iwl_enable_rfkill_int(struct iwl_trans *trans) } } -void iwl_pcie_handle_rfkill_irq(struct iwl_trans *trans); +void iwl_pcie_handle_rfkill_irq(struct iwl_trans *trans, bool from_irq); static inline bool iwl_is_rfkill_set(struct iwl_trans *trans) { @@ -792,7 +792,7 @@ static inline bool iwl_pcie_dbg_on(struct iwl_trans *trans) return (trans->dbg.dest_tlv || iwl_trans_dbg_ini_valid(trans)); } -void iwl_trans_pcie_rf_kill(struct iwl_trans *trans, bool state); +void iwl_trans_pcie_rf_kill(struct iwl_trans *trans, bool state, bool from_irq); void iwl_trans_pcie_dump_regs(struct iwl_trans *trans); #ifdef CONFIG_IWLWIFI_DEBUGFS diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/rx.c b/drivers/net/wireless/intel/iwlwifi/pcie/rx.c index 90a46faaaffd..57a11ee05bc3 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/rx.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/rx.c @@ -1781,7 +1781,7 @@ static u32 iwl_pcie_int_cause_ict(struct iwl_trans *trans) return inta; } -void iwl_pcie_handle_rfkill_irq(struct iwl_trans *trans) +void iwl_pcie_handle_rfkill_irq(struct iwl_trans *trans, bool from_irq) { struct iwl_trans_pcie *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans); struct isr_statistics *isr_stats = &trans_pcie->isr_stats; @@ -1805,7 +1805,7 @@ void iwl_pcie_handle_rfkill_irq(struct iwl_trans *trans) isr_stats->rfkill++; if (prev != report) - iwl_trans_pcie_rf_kill(trans, report); + iwl_trans_pcie_rf_kill(trans, report, from_irq); mutex_unlock(&trans_pcie->mutex); if (hw_rfkill) { @@ -1945,7 +1945,7 @@ irqreturn_t iwl_pcie_irq_handler(int irq, void *dev_id) /* HW RF KILL switch toggled */ if (inta & CSR_INT_BIT_RF_KILL) { - iwl_pcie_handle_rfkill_irq(trans); + iwl_pcie_handle_rfkill_irq(trans, true); handled |= CSR_INT_BIT_RF_KILL; } @@ -2362,7 +2362,7 @@ irqreturn_t iwl_pcie_irq_msix_handler(int irq, void *dev_id) /* HW RF KILL switch toggled */ if (inta_hw & MSIX_HW_INT_CAUSES_REG_RF_KILL) - iwl_pcie_handle_rfkill_irq(trans); + iwl_pcie_handle_rfkill_irq(trans, true); if (inta_hw & MSIX_HW_INT_CAUSES_REG_HW_ERR) { IWL_ERR(trans, diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/trans.c b/drivers/net/wireless/intel/iwlwifi/pcie/trans.c index 796972f22432..c7ed35b3dd8d 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/trans.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/trans.c @@ -1080,7 +1080,7 @@ bool iwl_pcie_check_hw_rf_kill(struct iwl_trans *trans) report = test_bit(STATUS_RFKILL_OPMODE, &trans->status); if (prev != report) - iwl_trans_pcie_rf_kill(trans, report); + iwl_trans_pcie_rf_kill(trans, report, false); return hw_rfkill; } @@ -1234,7 +1234,7 @@ static void iwl_pcie_init_msix(struct iwl_trans_pcie *trans_pcie) 
trans_pcie->hw_mask = trans_pcie->hw_init_mask; } -static void _iwl_trans_pcie_stop_device(struct iwl_trans *trans) +static void _iwl_trans_pcie_stop_device(struct iwl_trans *trans, bool from_irq) { struct iwl_trans_pcie *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans); @@ -1261,7 +1261,8 @@ static void _iwl_trans_pcie_stop_device(struct iwl_trans *trans) if (test_and_clear_bit(STATUS_DEVICE_ENABLED, &trans->status)) { IWL_DEBUG_INFO(trans, "DEVICE_ENABLED bit was set and is now cleared\n"); - iwl_pcie_synchronize_irqs(trans); + if (!from_irq) + iwl_pcie_synchronize_irqs(trans); iwl_pcie_rx_napi_sync(trans); iwl_pcie_tx_stop(trans); iwl_pcie_rx_stop(trans); @@ -1451,7 +1452,7 @@ void iwl_trans_pcie_handle_stop_rfkill(struct iwl_trans *trans, clear_bit(STATUS_RFKILL_OPMODE, &trans->status); } if (hw_rfkill != was_in_rfkill) - iwl_trans_pcie_rf_kill(trans, hw_rfkill); + iwl_trans_pcie_rf_kill(trans, hw_rfkill, false); } static void iwl_trans_pcie_stop_device(struct iwl_trans *trans) @@ -1466,12 +1467,12 @@ static void iwl_trans_pcie_stop_device(struct iwl_trans *trans) mutex_lock(&trans_pcie->mutex); trans_pcie->opmode_down = true; was_in_rfkill = test_bit(STATUS_RFKILL_OPMODE, &trans->status); - _iwl_trans_pcie_stop_device(trans); + _iwl_trans_pcie_stop_device(trans, false); iwl_trans_pcie_handle_stop_rfkill(trans, was_in_rfkill); mutex_unlock(&trans_pcie->mutex); } -void iwl_trans_pcie_rf_kill(struct iwl_trans *trans, bool state) +void iwl_trans_pcie_rf_kill(struct iwl_trans *trans, bool state, bool from_irq) { struct iwl_trans_pcie __maybe_unused *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans); @@ -1484,7 +1485,7 @@ void iwl_trans_pcie_rf_kill(struct iwl_trans *trans, bool state) if (trans->trans_cfg->gen2) _iwl_trans_pcie_gen2_stop_device(trans); else - _iwl_trans_pcie_stop_device(trans); + _iwl_trans_pcie_stop_device(trans, from_irq); } } @@ -2815,7 +2816,7 @@ static ssize_t iwl_dbgfs_rfkill_write(struct file *file, IWL_WARN(trans, "changing debug rfkill %d->%d\n", trans_pcie->debug_rfkill, new_value); trans_pcie->debug_rfkill = new_value; - iwl_pcie_handle_rfkill_irq(trans); + iwl_pcie_handle_rfkill_irq(trans, false); return count; } From 5f523f1beb465ff2e398946abd090c5a14e69c75 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Thu, 14 Dec 2023 12:37:52 -0800 Subject: [PATCH 013/151] drm/bridge: ti-sn65dsi86: Never store more than msg->size bytes in AUX xfer [ Upstream commit aca58eac52b88138ab98c814afb389a381725cd7 ] For aux reads, the value `msg->size` indicates the size of the buffer provided by `msg->buffer`. We should never in any circumstances write more bytes to the buffer since it may overflow the buffer. In the ti-sn65dsi86 driver there is one code path that reads the transfer length from hardware. Even though it's never been seen to be a problem, we should make extra sure that the hardware isn't increasing the length since doing so would cause us to overrun the buffer. 
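The defensive pattern, condensed from the diff below, is simply to clamp
the hardware-reported short-read length to the caller's buffer size:

	unsigned int short_len;

	/* read the HW's idea of the transfer length, but never let it
	 * exceed the caller-provided msg->size
	 */
	ret = regmap_read(pdata->regmap, SN_AUX_LENGTH_REG, &short_len);
	len = min(len, short_len);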
Fixes: 982f589bde7a ("drm/bridge: ti-sn65dsi86: Update reply on aux failures")
Reviewed-by: Stephen Boyd
Reviewed-by: Guenter Roeck
Signed-off-by: Douglas Anderson
Link: https://patchwork.freedesktop.org/patch/msgid/20231214123752.v3.2.I7b83c0f31aeedc6b1dc98c7c741d3e1f94f040f8@changeid
Signed-off-by: Sasha Levin
---
 drivers/gpu/drm/bridge/ti-sn65dsi86.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/bridge/ti-sn65dsi86.c b/drivers/gpu/drm/bridge/ti-sn65dsi86.c
index 1b5c27ed2737..ff4d0564122a 100644
--- a/drivers/gpu/drm/bridge/ti-sn65dsi86.c
+++ b/drivers/gpu/drm/bridge/ti-sn65dsi86.c
@@ -527,6 +527,7 @@ static ssize_t ti_sn_aux_transfer(struct drm_dp_aux *aux,
 	u32 request_val = AUX_CMD_REQ(msg->request);
 	u8 *buf = msg->buffer;
 	unsigned int len = msg->size;
+	unsigned int short_len;
 	unsigned int val;
 	int ret;
 	u8 addr_len[SN_AUX_LENGTH_REG + 1 - SN_AUX_ADDR_19_16_REG];
@@ -600,7 +601,8 @@ static ssize_t ti_sn_aux_transfer(struct drm_dp_aux *aux,
 	}

 	if (val & AUX_IRQ_STATUS_AUX_SHORT) {
-		ret = regmap_read(pdata->regmap, SN_AUX_LENGTH_REG, &len);
+		ret = regmap_read(pdata->regmap, SN_AUX_LENGTH_REG, &short_len);
+		len = min(len, short_len);
 		if (ret)
 			goto exit;
 	} else if (val & AUX_IRQ_STATUS_NAT_I2C_FAIL) {

From 9487cc4c90fbb0b8b34a3835be410676c6cb24eb Mon Sep 17 00:00:00 2001
From: Xin Long
Date: Sat, 28 Jan 2023 10:58:34 -0500
Subject: [PATCH 014/151] netfilter: use skb_ip_totlen and iph_totlen

[ Upstream commit a13fbf5ed5b4fc9095f12e955ca3a59b5507ff01 ]

There are also quite some places in netfilter that may process IPv4 TCP
GSO packets; we need to replace them too.

In length_mt(), we have to use u_int32_t/int to accept the
skb_ip_totlen() return value; otherwise it may overflow and mismatch.
This change will also help us add a selftest for IPv4 BIG TCP in the
following patch.

Note that we don't need to replace the one in tcpmss_tg4(), as it will
return if there is data after tcphdr in tcpmss_mangle_packet(). The same
applies to mangle_contents() in nf_nat_helper.c: it returns false when
skb->len + extra > 65535 in enlarge_skb().
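For reference, the helper pair introduced by the BIG TCP series behaves
roughly like this (a sketch of include/linux/ip.h from memory, not
verbatim): an IPv4 BIG TCP GSO packet carries tot_len == 0, so the true
length must be recovered from the skb itself.

	static inline u32 iph_totlen(const struct sk_buff *skb,
				     const struct iphdr *iph)
	{
		u32 len = ntohs(iph->tot_len);

		/* tot_len == 0 on IPv4 BIG TCP GSO packets */
		return (len || !skb_is_gso(skb) || !skb_is_gso_tcp(skb)) ?
		       len : skb->len - skb_network_offset(skb);
	}

	static inline u32 skb_ip_totlen(const struct sk_buff *skb)
	{
		return iph_totlen(skb, ip_hdr(skb));
	}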
Signed-off-by: Xin Long Reviewed-by: Eric Dumazet Signed-off-by: Jakub Kicinski Stable-dep-of: 0ae8e4cca787 ("netfilter: nf_tables: set transport offset from mac header for netdev/egress") Signed-off-by: Sasha Levin --- include/net/netfilter/nf_tables_ipv4.h | 4 ++-- net/netfilter/ipvs/ip_vs_xmit.c | 2 +- net/netfilter/nf_log_syslog.c | 2 +- net/netfilter/xt_length.c | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/include/net/netfilter/nf_tables_ipv4.h b/include/net/netfilter/nf_tables_ipv4.h index c4a6147b0ef8..d8f6cb47ebe3 100644 --- a/include/net/netfilter/nf_tables_ipv4.h +++ b/include/net/netfilter/nf_tables_ipv4.h @@ -29,7 +29,7 @@ static inline int __nft_set_pktinfo_ipv4_validate(struct nft_pktinfo *pkt) if (iph->ihl < 5 || iph->version != 4) return -1; - len = ntohs(iph->tot_len); + len = iph_totlen(pkt->skb, iph); thoff = iph->ihl * 4; if (pkt->skb->len < len) return -1; @@ -62,7 +62,7 @@ static inline int nft_set_pktinfo_ipv4_ingress(struct nft_pktinfo *pkt) if (iph->ihl < 5 || iph->version != 4) goto inhdr_error; - len = ntohs(iph->tot_len); + len = iph_totlen(pkt->skb, iph); thoff = iph->ihl * 4; if (pkt->skb->len < len) { __IP_INC_STATS(nft_net(pkt), IPSTATS_MIB_INTRUNCATEDPKTS); diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 7243079ef354..b452eb3ddcec 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -994,7 +994,7 @@ ip_vs_prepare_tunneled_skb(struct sk_buff *skb, int skb_af, old_dsfield = ipv4_get_dsfield(old_iph); *ttl = old_iph->ttl; if (payload_len) - *payload_len = ntohs(old_iph->tot_len); + *payload_len = skb_ip_totlen(skb); } /* Implement full-functionality option for ECN encapsulation */ diff --git a/net/netfilter/nf_log_syslog.c b/net/netfilter/nf_log_syslog.c index cb894f0d63e9..c66689ad2b49 100644 --- a/net/netfilter/nf_log_syslog.c +++ b/net/netfilter/nf_log_syslog.c @@ -322,7 +322,7 @@ dump_ipv4_packet(struct net *net, struct nf_log_buf *m, /* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */ nf_log_buf_add(m, "LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ", - ntohs(ih->tot_len), ih->tos & IPTOS_TOS_MASK, + iph_totlen(skb, ih), ih->tos & IPTOS_TOS_MASK, ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id)); /* Max length: 6 "CE DF MF " */ diff --git a/net/netfilter/xt_length.c b/net/netfilter/xt_length.c index 9fbfad13176f..ca730cedb5d4 100644 --- a/net/netfilter/xt_length.c +++ b/net/netfilter/xt_length.c @@ -21,7 +21,7 @@ static bool length_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_length_info *info = par->matchinfo; - u_int16_t pktlen = ntohs(ip_hdr(skb)->tot_len); + u32 pktlen = skb_ip_totlen(skb); return (pktlen >= info->min && pktlen <= info->max) ^ info->invert; } From 282e3fb61285242b5029f569039378ea7285cb73 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 14 Dec 2023 11:50:12 +0100 Subject: [PATCH 015/151] netfilter: nf_tables: set transport offset from mac header for netdev/egress [ Upstream commit 0ae8e4cca78781401b17721bfb72718fdf7b4912 ] Before this patch, transport offset (pkt->thoff) provides an offset relative to the network header. This is fine for the inet families because skb->data points to the network header in such case. However, from netdev/egress, skb->data points to the mac header (if available), thus, pkt->thoff is missing the mac header length. Add skb_network_offset() to the transport offset (pkt->thoff) for netdev, so transport header mangling works as expected. 
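A worked example (assuming a plain 14-byte Ethernet header and a 20-byte
IPv4 header): on egress, skb->data points at the MAC header, so the
transport header really lives at offset 14 + 20 = 34 from skb->data; the
old code computed 20. The updated computation in the diff below:

	/* pkt->thoff is now absolute, i.e. relative to skb->data */
	thoff = skb_network_offset(pkt->skb) + (iph->ihl * 4);
	/* egress over Ethernet: 14 + 5*4 = 34; inet families: 0 + 20 = 20 */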
Adjust payload fast eval function to use skb->data now that pkt->thoff provides an absolute offset. This explains why users report that matching on egress/netdev works but payload mangling does not. This patch implicitly fixes payload mangling for IPv4 packets in netdev/egress given skb_store_bits() requires an offset from skb->data to reach the transport header. I suspect that nft_exthdr and the trace infra were also broken from netdev/egress because they also take skb->data as start, and pkt->thoff was not correct. Note that IPv6 is fine because ipv6_find_hdr() already provides a transport offset starting from skb->data, which includes skb_network_offset(). The bridge family also uses nft_set_pktinfo_ipv4_validate(), but there skb_network_offset() is zero, so the update in this patch does not alter the existing behaviour. Fixes: 42df6e1d221d ("netfilter: Introduce egress hook") Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin --- include/net/netfilter/nf_tables_ipv4.h | 2 +- net/netfilter/nf_tables_core.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/net/netfilter/nf_tables_ipv4.h b/include/net/netfilter/nf_tables_ipv4.h index d8f6cb47ebe3..5225d2bd1a6e 100644 --- a/include/net/netfilter/nf_tables_ipv4.h +++ b/include/net/netfilter/nf_tables_ipv4.h @@ -30,7 +30,7 @@ static inline int __nft_set_pktinfo_ipv4_validate(struct nft_pktinfo *pkt) return -1; len = iph_totlen(pkt->skb, iph); - thoff = iph->ihl * 4; + thoff = skb_network_offset(pkt->skb) + (iph->ihl * 4); if (pkt->skb->len < len) return -1; else if (len < thoff) diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index cee3e4e905ec..e0c117229ee9 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -141,7 +141,7 @@ static bool nft_payload_fast_eval(const struct nft_expr *expr, else { if (!(pkt->flags & NFT_PKTINFO_L4PROTO)) return false; - ptr = skb_network_header(skb) + nft_thoff(pkt); + ptr = skb->data + nft_thoff(pkt); } ptr += priv->offset; From a4b0a9b80a963c617227629890a706de459d462b Mon Sep 17 00:00:00 2001 From: Siddh Raman Pant Date: Tue, 19 Dec 2023 23:19:43 +0530 Subject: [PATCH 016/151] nfc: llcp_core: Hold a ref to llcp_local->dev when holding a ref to llcp_local [ Upstream commit c95f919567d6f1914f13350af61a1b044ac85014 ] llcp_sock_sendmsg() calls nfc_llcp_send_ui_frame() which in turn calls nfc_alloc_send_skb(), which accesses the nfc_dev from the llcp_sock for getting the headroom and tailroom needed for skb allocation. Parallelly the nfc_dev can be freed, as the refcount is decreased via nfc_free_device(), leading to a UAF reported by Syzkaller, which can be summarized as follows: (1) llcp_sock_sendmsg() -> nfc_llcp_send_ui_frame() -> nfc_alloc_send_skb() -> Dereference *nfc_dev (2) virtual_ncidev_close() -> nci_free_device() -> nfc_free_device() -> put_device() -> nfc_release() -> Free *nfc_dev When a reference to llcp_local is acquired, we do not acquire the same for the nfc_dev. This leads to freeing even when the llcp_local is in use, and this is the case with the UAF described above too. Thus, when we acquire a reference to llcp_local, we should acquire a reference to nfc_dev, and release the references appropriately later. References for llcp_local is initialized in nfc_llcp_register_device() (which is called by nfc_register_device()). Thus, we should acquire a reference to nfc_dev there. nfc_unregister_device() calls nfc_llcp_unregister_device() which in turn calls nfc_llcp_local_put(). 
Thus, the reference to nfc_dev is appropriately released later.

Reported-and-tested-by: syzbot+bbe84a4010eeea00982d@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=bbe84a4010eeea00982d
Fixes: c7aa12252f51 ("NFC: Take a reference on the LLCP local pointer when creating a socket")
Reviewed-by: Suman Ghosh
Signed-off-by: Siddh Raman Pant
Reviewed-by: Krzysztof Kozlowski
Signed-off-by: David S. Miller
Signed-off-by: Sasha Levin
---
 net/nfc/llcp_core.c | 39 ++++++++++++++++++++++++++++++++++++---
 1 file changed, 36 insertions(+), 3 deletions(-)

diff --git a/net/nfc/llcp_core.c b/net/nfc/llcp_core.c
index 1dac28136e6a..18be13fb9b75 100644
--- a/net/nfc/llcp_core.c
+++ b/net/nfc/llcp_core.c
@@ -145,6 +145,13 @@ static void nfc_llcp_socket_release(struct nfc_llcp_local *local, bool device,

 static struct nfc_llcp_local *nfc_llcp_local_get(struct nfc_llcp_local *local)
 {
+	/* Since using nfc_llcp_local may result in usage of nfc_dev, whenever
+	 * we hold a reference to local, we also need to hold a reference to
+	 * the device to avoid UAF.
+	 */
+	if (!nfc_get_device(local->dev->idx))
+		return NULL;
+
 	kref_get(&local->ref);

 	return local;
@@ -177,10 +184,18 @@ static void local_release(struct kref *ref)

 int nfc_llcp_local_put(struct nfc_llcp_local *local)
 {
+	struct nfc_dev *dev;
+	int ret;
+
 	if (local == NULL)
 		return 0;

-	return kref_put(&local->ref, local_release);
+	dev = local->dev;
+
+	ret = kref_put(&local->ref, local_release);
+	nfc_put_device(dev);
+
+	return ret;
 }

 static struct nfc_llcp_sock *nfc_llcp_sock_get(struct nfc_llcp_local *local,
@@ -959,8 +974,17 @@ static void nfc_llcp_recv_connect(struct nfc_llcp_local *local,
 	}

 	new_sock = nfc_llcp_sock(new_sk);
-	new_sock->dev = local->dev;
+	new_sock->local = nfc_llcp_local_get(local);
+	if (!new_sock->local) {
+		reason = LLCP_DM_REJ;
+		sock_put(&new_sock->sk);
+		release_sock(&sock->sk);
+		sock_put(&sock->sk);
+		goto fail;
+	}
+
+	new_sock->dev = local->dev;
 	new_sock->rw = sock->rw;
 	new_sock->miux = sock->miux;
 	new_sock->nfc_protocol = sock->nfc_protocol;
@@ -1597,7 +1621,16 @@ int nfc_llcp_register_device(struct nfc_dev *ndev)
 	if (local == NULL)
 		return -ENOMEM;

-	local->dev = ndev;
+	/* As we are going to initialize local's refcount, we need to get the
+	 * nfc_dev to avoid UAF, otherwise there is no point in continuing.
+	 * See nfc_llcp_local_get().
+	 */
+	local->dev = nfc_get_device(ndev->idx);
+	if (!local->dev) {
+		kfree(local);
+		return -ENODEV;
+	}
+
 	INIT_LIST_HEAD(&local->list);
 	kref_init(&local->ref);
 	mutex_init(&local->sdp_lock);

From 6cf7235bc1fb65fc33eba4e9f6a9d8298eff53e9 Mon Sep 17 00:00:00 2001
From: Suman Ghosh
Date: Tue, 19 Dec 2023 19:56:33 +0530
Subject: [PATCH 017/151] octeontx2-af: Fix marking couple of structure as __packed

[ Upstream commit 0ee2384a5a0f3b4eeac8d10bb01a0609d245a4d1 ]

A couple of structures were not marked as __packed. This patch fixes
that and marks them as __packed.

Fixes: 42006910b5ea ("octeontx2-af: cleanup KPU config data")
Signed-off-by: Suman Ghosh
Reviewed-by: Jacob Keller
Signed-off-by: David S. Miller
Signed-off-by: Sasha Levin
---
 drivers/net/ethernet/marvell/octeontx2/af/npc.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/marvell/octeontx2/af/npc.h b/drivers/net/ethernet/marvell/octeontx2/af/npc.h
index d027c23b8ef8..aaff91bc7415 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/npc.h
+++ b/drivers/net/ethernet/marvell/octeontx2/af/npc.h
@@ -514,7 +514,7 @@ struct npc_lt_def {
 	u8	ltype_mask;
 	u8	ltype_match;
 	u8	lid;
-};
+} __packed;

 struct npc_lt_def_ipsec {
 	u8	ltype_mask;
@@ -522,7 +522,7 @@ struct npc_lt_def_ipsec {
 	u8	lid;
 	u8	spi_offset;
 	u8	spi_nz;
-};
+} __packed;

 struct npc_lt_def_apad {
 	u8	ltype_mask;

From f3f6a23e054c7aa75bdb3e12029f619f29788b97 Mon Sep 17 00:00:00 2001
From: Khaled Almahallawy
Date: Wed, 13 Dec 2023 13:15:42 -0800
Subject: [PATCH 018/151] drm/i915/dp: Fix passing the correct DPCD_REV for drm_dp_set_phy_test_pattern

[ Upstream commit 2bd7a06a1208aaacb4e7a2a5436c23bce8d70801 ]

Using link_status to get DPCD_REV fails when disabling/defaulting
phy pattern. Use intel_dp->dpcd to access DPCD_REV correctly.

Fixes: 8cdf72711928 ("drm/i915/dp: Program vswing, pre-emphasis, test-pattern")
Cc: Jani Nikula
Cc: Imre Deak
Cc: Lee Shawn C
Signed-off-by: Khaled Almahallawy
Signed-off-by: Jani Nikula
Link: https://patchwork.freedesktop.org/patch/msgid/20231213211542.3585105-3-khaled.almahallawy@intel.com
(cherry picked from commit 3ee302ec22d6e1d7d1e6d381b0d507ee80f2135c)
Signed-off-by: Sasha Levin
---
 drivers/gpu/drm/i915/display/intel_dp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/display/intel_dp.c b/drivers/gpu/drm/i915/display/intel_dp.c
index 5970f4149090..4699c2110226 100644
--- a/drivers/gpu/drm/i915/display/intel_dp.c
+++ b/drivers/gpu/drm/i915/display/intel_dp.c
@@ -3707,7 +3707,7 @@ static void intel_dp_process_phy_request(struct intel_dp *intel_dp,
 	intel_dp->train_set, crtc_state->lane_count);

 	drm_dp_set_phy_test_pattern(&intel_dp->aux, data,
-				    link_status[DP_DPCD_REV]);
+				    intel_dp->dpcd[DP_DPCD_REV]);
 }

 static u8 intel_dp_autotest_phy_pattern(struct intel_dp *intel_dp)

From 83b80170b7fa2eabde1a12b9e4efa04253bf4adc Mon Sep 17 00:00:00 2001
From: Katarzyna Wieczerzycka
Date: Fri, 15 Dec 2023 12:01:56 +0100
Subject: [PATCH 019/151] ice: Fix link_down_on_close message

[ Upstream commit 6a8d8bb55e7001de2d50920381cc858f3a3e9fb7 ]

The driver should not report an error message when the
link_down_on_close flag is enabled on a medialess port and the physical
link cannot be set down.
Fixes: 8ac7132704f3 ("ice: Fix interface being down after reset with link-down-on-close flag on")
Reviewed-by: Przemek Kitszel
Signed-off-by: Katarzyna Wieczerzycka
Signed-off-by: Wojciech Drewek
Tested-by: Pucha Himasekhar Reddy (A Contingent worker at Intel)
Signed-off-by: Tony Nguyen
Signed-off-by: Sasha Levin
---
 drivers/net/ethernet/intel/ice/ice_main.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c
index f0f39364819a..5eb3b80b293c 100644
--- a/drivers/net/ethernet/intel/ice/ice_main.c
+++ b/drivers/net/ethernet/intel/ice/ice_main.c
@@ -2138,7 +2138,7 @@ static int ice_configure_phy(struct ice_vsi *vsi)

 	/* Ensure we have media as we cannot configure a medialess port */
 	if (!(phy->link_info.link_info & ICE_AQ_MEDIA_AVAILABLE))
-		return -EPERM;
+		return -ENOMEDIUM;

 	ice_print_topo_conflict(vsi);

@@ -9065,8 +9065,12 @@ int ice_stop(struct net_device *netdev)
 		int link_err = ice_force_phys_link_state(vsi, false);

 		if (link_err) {
-			netdev_err(vsi->netdev, "Failed to set physical link down, VSI %d error %d\n",
-				   vsi->vsi_num, link_err);
+			if (link_err == -ENOMEDIUM)
+				netdev_info(vsi->netdev, "Skipping link reconfig - no media attached, VSI %d\n",
+					    vsi->vsi_num);
+			else
+				netdev_err(vsi->netdev, "Failed to set physical link down, VSI %d error %d\n",
+					   vsi->vsi_num, link_err);
 			return -EIO;
 		}
 	}

From 188c9970d05e70f1d9a31d0ee0c30bd7fdd4a45c Mon Sep 17 00:00:00 2001
From: Ngai-Mint Kwan
Date: Fri, 15 Dec 2023 12:01:57 +0100
Subject: [PATCH 020/151] ice: Shut down VSI with "link-down-on-close" enabled

[ Upstream commit 6d05ff55ef4f4954d28551236239f297bd52ea48 ]

Disabling a netdev with the ethtool private flag "link-down-on-close"
enabled can cause a NULL pointer dereference bug. Shut down the VSI
regardless of the "link-down-on-close" state.

Fixes: 8ac7132704f3 ("ice: Fix interface being down after reset with link-down-on-close flag on")
Reviewed-by: Przemek Kitszel
Signed-off-by: Ngai-Mint Kwan
Signed-off-by: Wojciech Drewek
Tested-by: Pucha Himasekhar Reddy (A Contingent worker at Intel)
Signed-off-by: Tony Nguyen
Signed-off-by: Sasha Levin
---
 drivers/net/ethernet/intel/ice/ice_main.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c
index 5eb3b80b293c..ab46cfca4028 100644
--- a/drivers/net/ethernet/intel/ice/ice_main.c
+++ b/drivers/net/ethernet/intel/ice/ice_main.c
@@ -9071,6 +9071,8 @@ int ice_stop(struct net_device *netdev)
 			else
 				netdev_err(vsi->netdev, "Failed to set physical link down, VSI %d error %d\n",
 					   vsi->vsi_num, link_err);
+
+			ice_vsi_close(vsi);
 			return -EIO;
 		}
 	}

From e76d1913f6a8a89459ee42cfb6dbb99d6cba1d57 Mon Sep 17 00:00:00 2001
From: Sudheer Mogilappagari
Date: Wed, 29 Nov 2023 11:23:11 +0100
Subject: [PATCH 021/151] i40e: Fix filter input checks to prevent config with invalid values

[ Upstream commit 3e48041d9820c17e0a51599d12e66c6e12a8d08d ]

Prevent the VF from configuring filters with unsupported actions or from
using the REDIRECT action with an invalid TC number. The current checks
could cause out-of-bounds access on the PF side.
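Schematically (shown to illustrate the hazard, with the surrounding i40e
data layout elided): action_meta is a VF-controlled TC index that the PF
later uses for per-TC lookups, so it must be bounded by the number of
TCs the VF actually has, not by a global maximum:

	/* VF-supplied: tc_filter->action and tc_filter->action_meta */
	if (tc_filter->action != VIRTCHNL_ACTION_TC_REDIRECT)
		goto err;		/* only TC redirect is supported */

	if (!tc_filter->action_meta ||
	    tc_filter->action_meta > vf->num_tc)  /* was: I40E_MAX_VF_VSI */
		goto err;		/* keep the TC index in bounds */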
Fixes: e284fc280473 ("i40e: Add and delete cloud filter") Reviewed-by: Andrii Staikov Signed-off-by: Sudheer Mogilappagari Signed-off-by: Aleksandr Loktionov Reviewed-by: Simon Horman Tested-by: Bharathi Sreenivas Signed-off-by: Tony Nguyen Signed-off-by: Sasha Levin --- drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c index cb925baf72ce..3c38129a5224 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c @@ -3451,16 +3451,16 @@ static int i40e_validate_cloud_filter(struct i40e_vf *vf, bool found = false; int bkt; - if (!tc_filter->action) { + if (tc_filter->action != VIRTCHNL_ACTION_TC_REDIRECT) { dev_info(&pf->pdev->dev, - "VF %d: Currently ADq doesn't support Drop Action\n", - vf->vf_id); + "VF %d: ADQ doesn't support this action (%d)\n", + vf->vf_id, tc_filter->action); goto err; } /* action_meta is TC number here to which the filter is applied */ if (!tc_filter->action_meta || - tc_filter->action_meta > I40E_MAX_VF_VSI) { + tc_filter->action_meta > vf->num_tc) { dev_info(&pf->pdev->dev, "VF %d: Invalid TC number %u\n", vf->vf_id, tc_filter->action_meta); goto err; From d27b98f4aeaeba3d1b09d39bf875c1dae3a8b7f8 Mon Sep 17 00:00:00 2001 From: Kurt Kanzenbach Date: Fri, 1 Dec 2023 08:50:42 +0100 Subject: [PATCH 022/151] igc: Report VLAN EtherType matching back to user [ Upstream commit 088464abd48cf3735aee91f9e211b32da9d81117 ] Currently the driver allows to configure matching by VLAN EtherType. However, the retrieval function does not report it back to the user. Add it. Before: |root@host:~# ethtool -N enp3s0 flow-type ether vlan-etype 0x8100 action 0 |Added rule with ID 63 |root@host:~# ethtool --show-ntuple enp3s0 |4 RX rings available |Total 1 rules | |Filter: 63 | Flow Type: Raw Ethernet | Src MAC addr: 00:00:00:00:00:00 mask: FF:FF:FF:FF:FF:FF | Dest MAC addr: 00:00:00:00:00:00 mask: FF:FF:FF:FF:FF:FF | Ethertype: 0x0 mask: 0xFFFF | Action: Direct to queue 0 After: |root@host:~# ethtool -N enp3s0 flow-type ether vlan-etype 0x8100 action 0 |Added rule with ID 63 |root@host:~# ethtool --show-ntuple enp3s0 |4 RX rings available |Total 1 rules | |Filter: 63 | Flow Type: Raw Ethernet | Src MAC addr: 00:00:00:00:00:00 mask: FF:FF:FF:FF:FF:FF | Dest MAC addr: 00:00:00:00:00:00 mask: FF:FF:FF:FF:FF:FF | Ethertype: 0x0 mask: 0xFFFF | VLAN EtherType: 0x8100 mask: 0x0 | VLAN: 0x0 mask: 0xffff | User-defined: 0x0 mask: 0xffffffffffffffff | Action: Direct to queue 0 Fixes: 2b477d057e33 ("igc: Integrate flex filter into ethtool ops") Signed-off-by: Kurt Kanzenbach Acked-by: Vinicius Costa Gomes Reviewed-by: Simon Horman Tested-by: Naama Meir Signed-off-by: Tony Nguyen Signed-off-by: Sasha Levin --- drivers/net/ethernet/intel/igc/igc_ethtool.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/ethernet/intel/igc/igc_ethtool.c b/drivers/net/ethernet/intel/igc/igc_ethtool.c index 81897f7a90a9..51ef18060dbc 100644 --- a/drivers/net/ethernet/intel/igc/igc_ethtool.c +++ b/drivers/net/ethernet/intel/igc/igc_ethtool.c @@ -979,6 +979,12 @@ static int igc_ethtool_get_nfc_rule(struct igc_adapter *adapter, fsp->m_u.ether_spec.h_proto = ETHER_TYPE_FULL_MASK; } + if (rule->filter.match_flags & IGC_FILTER_FLAG_VLAN_ETYPE) { + fsp->flow_type |= FLOW_EXT; + fsp->h_ext.vlan_etype = rule->filter.vlan_etype; + fsp->m_ext.vlan_etype = ETHER_TYPE_FULL_MASK; + 
} + if (rule->filter.match_flags & IGC_FILTER_FLAG_VLAN_TCI) { fsp->flow_type |= FLOW_EXT; fsp->h_ext.vlan_tci = htons(rule->filter.vlan_tci); From 6edff0b8381c99870548670417eb958d4b3abbb5 Mon Sep 17 00:00:00 2001 From: Kurt Kanzenbach Date: Fri, 1 Dec 2023 08:50:43 +0100 Subject: [PATCH 023/151] igc: Check VLAN TCI mask [ Upstream commit b5063cbe148b829e8eb97672c2cbccc058835476 ] Currently the driver accepts VLAN TCI steering rules regardless of the configured mask. And things might fail silently or with confusing error messages to the user. There are two ways to handle the VLAN TCI mask: 1. Match on the PCP field using a VLAN prio filter 2. Match on complete TCI field using a flex filter Therefore, add checks and code for that. For instance the following rule is invalid and will be converted into a VLAN prio rule which is not correct: |root@host:~# ethtool -N enp3s0 flow-type ether vlan 0x0001 m 0xf000 \ | action 1 |Added rule with ID 61 |root@host:~# ethtool --show-ntuple enp3s0 |4 RX rings available |Total 1 rules | |Filter: 61 | Flow Type: Raw Ethernet | Src MAC addr: 00:00:00:00:00:00 mask: FF:FF:FF:FF:FF:FF | Dest MAC addr: 00:00:00:00:00:00 mask: FF:FF:FF:FF:FF:FF | Ethertype: 0x0 mask: 0xFFFF | VLAN EtherType: 0x0 mask: 0xffff | VLAN: 0x1 mask: 0x1fff | User-defined: 0x0 mask: 0xffffffffffffffff | Action: Direct to queue 1 After: |root@host:~# ethtool -N enp3s0 flow-type ether vlan 0x0001 m 0xf000 \ | action 1 |rmgr: Cannot insert RX class rule: Operation not supported Fixes: 7991487ecb2d ("igc: Allow for Flex Filters to be installed") Signed-off-by: Kurt Kanzenbach Acked-by: Vinicius Costa Gomes Reviewed-by: Simon Horman Tested-by: Naama Meir Signed-off-by: Tony Nguyen Signed-off-by: Sasha Levin --- drivers/net/ethernet/intel/igc/igc.h | 1 + drivers/net/ethernet/intel/igc/igc_ethtool.c | 28 +++++++++++++++++--- 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc.h b/drivers/net/ethernet/intel/igc/igc.h index 43c05b41627f..2a894ca49d93 100644 --- a/drivers/net/ethernet/intel/igc/igc.h +++ b/drivers/net/ethernet/intel/igc/igc.h @@ -538,6 +538,7 @@ struct igc_nfc_filter { u16 etype; __be16 vlan_etype; u16 vlan_tci; + u16 vlan_tci_mask; u8 src_addr[ETH_ALEN]; u8 dst_addr[ETH_ALEN]; u8 user_data[8]; diff --git a/drivers/net/ethernet/intel/igc/igc_ethtool.c b/drivers/net/ethernet/intel/igc/igc_ethtool.c index 51ef18060dbc..e146357d61a8 100644 --- a/drivers/net/ethernet/intel/igc/igc_ethtool.c +++ b/drivers/net/ethernet/intel/igc/igc_ethtool.c @@ -957,6 +957,7 @@ static int igc_ethtool_set_coalesce(struct net_device *netdev, } #define ETHER_TYPE_FULL_MASK ((__force __be16)~0) +#define VLAN_TCI_FULL_MASK ((__force __be16)~0) static int igc_ethtool_get_nfc_rule(struct igc_adapter *adapter, struct ethtool_rxnfc *cmd) { @@ -988,7 +989,7 @@ static int igc_ethtool_get_nfc_rule(struct igc_adapter *adapter, if (rule->filter.match_flags & IGC_FILTER_FLAG_VLAN_TCI) { fsp->flow_type |= FLOW_EXT; fsp->h_ext.vlan_tci = htons(rule->filter.vlan_tci); - fsp->m_ext.vlan_tci = htons(VLAN_PRIO_MASK); + fsp->m_ext.vlan_tci = htons(rule->filter.vlan_tci_mask); } if (rule->filter.match_flags & IGC_FILTER_FLAG_DST_MAC_ADDR) { @@ -1223,6 +1224,7 @@ static void igc_ethtool_init_nfc_rule(struct igc_nfc_rule *rule, if ((fsp->flow_type & FLOW_EXT) && fsp->m_ext.vlan_tci) { rule->filter.vlan_tci = ntohs(fsp->h_ext.vlan_tci); + rule->filter.vlan_tci_mask = ntohs(fsp->m_ext.vlan_tci); rule->filter.match_flags |= IGC_FILTER_FLAG_VLAN_TCI; } @@ -1260,11 +1262,19 @@ static 
void igc_ethtool_init_nfc_rule(struct igc_nfc_rule *rule, memcpy(rule->filter.user_mask, fsp->m_ext.data, sizeof(fsp->m_ext.data)); } - /* When multiple filter options or user data or vlan etype is set, use a - * flex filter. + /* The i225/i226 has various different filters. Flex filters provide a + * way to match up to the first 128 bytes of a packet. Use them for: + * a) For specific user data + * b) For VLAN EtherType + * c) For full TCI match + * d) Or in case multiple filter criteria are set + * + * Otherwise, use the simple MAC, VLAN PRIO or EtherType filters. */ if ((rule->filter.match_flags & IGC_FILTER_FLAG_USER_DATA) || (rule->filter.match_flags & IGC_FILTER_FLAG_VLAN_ETYPE) || + ((rule->filter.match_flags & IGC_FILTER_FLAG_VLAN_TCI) && + rule->filter.vlan_tci_mask == ntohs(VLAN_TCI_FULL_MASK)) || (rule->filter.match_flags & (rule->filter.match_flags - 1))) rule->flex = true; else @@ -1334,6 +1344,18 @@ static int igc_ethtool_add_nfc_rule(struct igc_adapter *adapter, return -EINVAL; } + /* There are two ways to match the VLAN TCI: + * 1. Match on PCP field and use vlan prio filter for it + * 2. Match on complete TCI field and use flex filter for it + */ + if ((fsp->flow_type & FLOW_EXT) && + fsp->m_ext.vlan_tci && + fsp->m_ext.vlan_tci != htons(VLAN_PRIO_MASK) && + fsp->m_ext.vlan_tci != VLAN_TCI_FULL_MASK) { + netdev_dbg(netdev, "VLAN mask not supported\n"); + return -EOPNOTSUPP; + } + if (fsp->location >= IGC_MAX_RXNFC_RULES) { netdev_dbg(netdev, "Invalid location\n"); return -EINVAL; From c3a37dc1568581a94c1b6008369e9a9d9723eccb Mon Sep 17 00:00:00 2001 From: Kurt Kanzenbach Date: Wed, 6 Dec 2023 15:07:18 +0100 Subject: [PATCH 024/151] igc: Check VLAN EtherType mask [ Upstream commit 7afd49a38e73afd57ff62c8d1cf5af760c4d49c0 ] Currently the driver accepts VLAN EtherType steering rules regardless of the configured mask. And things might fail silently or with confusing error messages to the user. The VLAN EtherType can only be matched by full mask. Therefore, add a check for that. 
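In sketch form, the added validation amounts to the following (this mirrors
the hunk in the diff below):

	/* Reject any partial VLAN EtherType mask: the hardware can only
	 * match the full 16-bit field.
	 */
	if ((fsp->flow_type & FLOW_EXT) &&
	    fsp->m_ext.vlan_etype &&
	    fsp->m_ext.vlan_etype != ETHER_TYPE_FULL_MASK) {
		netdev_dbg(netdev, "VLAN EtherType mask not supported\n");
		return -EOPNOTSUPP;
	}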
For instance the following rule is invalid, but the driver accepts it and
ignores the user-specified mask:

|root@host:~# ethtool -N enp3s0 flow-type ether vlan-etype 0x8100 \
| m 0x00ff action 0
|Added rule with ID 63
|root@host:~# ethtool --show-ntuple enp3s0
|4 RX rings available
|Total 1 rules
|
|Filter: 63
|	Flow Type: Raw Ethernet
|	Src MAC addr: 00:00:00:00:00:00 mask: FF:FF:FF:FF:FF:FF
|	Dest MAC addr: 00:00:00:00:00:00 mask: FF:FF:FF:FF:FF:FF
|	Ethertype: 0x0 mask: 0xFFFF
|	VLAN EtherType: 0x8100 mask: 0x0
|	VLAN: 0x0 mask: 0xffff
|	User-defined: 0x0 mask: 0xffffffffffffffff
|	Action: Direct to queue 0

After:

|root@host:~# ethtool -N enp3s0 flow-type ether vlan-etype 0x8100 \
| m 0x00ff action 0
|rmgr: Cannot insert RX class rule: Operation not supported

Fixes: 2b477d057e33 ("igc: Integrate flex filter into ethtool ops")
Suggested-by: Suman Ghosh
Signed-off-by: Kurt Kanzenbach
Acked-by: Vinicius Costa Gomes
Reviewed-by: Simon Horman
Tested-by: Naama Meir
Signed-off-by: Tony Nguyen
Signed-off-by: Sasha Levin
---
 drivers/net/ethernet/intel/igc/igc_ethtool.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/drivers/net/ethernet/intel/igc/igc_ethtool.c b/drivers/net/ethernet/intel/igc/igc_ethtool.c
index e146357d61a8..2bee9cace598 100644
--- a/drivers/net/ethernet/intel/igc/igc_ethtool.c
+++ b/drivers/net/ethernet/intel/igc/igc_ethtool.c
@@ -1356,6 +1356,14 @@ static int igc_ethtool_add_nfc_rule(struct igc_adapter *adapter,
 		return -EOPNOTSUPP;
 	}
 
+	/* VLAN EtherType can only be matched by full mask. */
+	if ((fsp->flow_type & FLOW_EXT) &&
+	    fsp->m_ext.vlan_etype &&
+	    fsp->m_ext.vlan_etype != ETHER_TYPE_FULL_MASK) {
+		netdev_dbg(netdev, "VLAN EtherType mask not supported\n");
+		return -EOPNOTSUPP;
+	}
+
 	if (fsp->location >= IGC_MAX_RXNFC_RULES) {
 		netdev_dbg(netdev, "Invalid location\n");
 		return -EINVAL;

From 811604fb02c441acb41cd0ff64ce0c2dc5ec1edc Mon Sep 17 00:00:00 2001
From: Chancel Liu
Date: Mon, 25 Dec 2023 17:06:08 +0900
Subject: [PATCH 025/151] ASoC: fsl_rpmsg: Fix error handler with pm_runtime_enable

[ Upstream commit f9d378fc68c43fd41b35133edec9cd902ec334ec ]

There is an error message when a deferred probe happens:

fsl_rpmsg rpmsg_audio: Unbalanced pm_runtime_enable!

Fix the error handler with pm_runtime_enable.
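Sketch of the corrected probe flow (simplified; the full change follows
below):

	pm_runtime_enable(&pdev->dev);

	ret = devm_snd_soc_register_component(&pdev->dev, &fsl_component,
					      &fsl_rpmsg_dai, 1);
	if (ret)
		goto err_pm_disable;	/* was: return ret, leaking the enable */

	/* ... register the rpmsg card, also jumping to err_pm_disable
	 * on failure ...
	 */

	return 0;

err_pm_disable:
	pm_runtime_disable(&pdev->dev);	/* balance pm_runtime_enable() */
	return ret;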
Fixes: b73d9e6225e8 ("ASoC: fsl_rpmsg: Add CPU DAI driver for audio base on rpmsg")
Signed-off-by: Chancel Liu
Acked-by: Shengjiu Wang
Link: https://lore.kernel.org/r/20231225080608.967953-1-chancel.liu@nxp.com
Signed-off-by: Mark Brown
Signed-off-by: Sasha Levin
---
 sound/soc/fsl/fsl_rpmsg.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/sound/soc/fsl/fsl_rpmsg.c b/sound/soc/fsl/fsl_rpmsg.c
index bf94838bdbef..5c07a8ff0c9c 100644
--- a/sound/soc/fsl/fsl_rpmsg.c
+++ b/sound/soc/fsl/fsl_rpmsg.c
@@ -231,7 +231,7 @@ static int fsl_rpmsg_probe(struct platform_device *pdev)
 	ret = devm_snd_soc_register_component(&pdev->dev, &fsl_component,
 					      &fsl_rpmsg_dai, 1);
 	if (ret)
-		return ret;
+		goto err_pm_disable;
 
 	rpmsg->card_pdev = platform_device_register_data(&pdev->dev,
 							 "imx-audio-rpmsg",
@@ -241,16 +241,22 @@ static int fsl_rpmsg_probe(struct platform_device *pdev)
 	if (IS_ERR(rpmsg->card_pdev)) {
 		dev_err(&pdev->dev, "failed to register rpmsg card\n");
 		ret = PTR_ERR(rpmsg->card_pdev);
-		return ret;
+		goto err_pm_disable;
 	}
 
 	return 0;
+
+err_pm_disable:
+	pm_runtime_disable(&pdev->dev);
+	return ret;
 }
 
 static int fsl_rpmsg_remove(struct platform_device *pdev)
 {
 	struct fsl_rpmsg *rpmsg = platform_get_drvdata(pdev);
 
+	pm_runtime_disable(&pdev->dev);
+
 	if (rpmsg->card_pdev)
 		platform_device_unregister(rpmsg->card_pdev);

From 6d7f45492706b2d9f28fdeffd98d364dcd6377e5 Mon Sep 17 00:00:00 2001
From: Eugen Hristev
Date: Fri, 29 Dec 2023 13:43:42 +0200
Subject: [PATCH 026/151] ASoC: mediatek: mt8186: fix AUD_PAD_TOP register and offset

[ Upstream commit 38744c3fa00109c51076121c2deb4f02e2f09194 ]

The AUD_PAD_TOP widget's correct register is AFE_AUD_PAD_TOP, not zero.
With zero as the register, `snd_soc_dapm_new_widgets` would try to read
the register at offset zero when getting the power status of this
widget, which is incorrect.

Fixes: b65c466220b3 ("ASoC: mediatek: mt8186: support adda in platform driver")
Signed-off-by: Eugen Hristev
Link: https://lore.kernel.org/r/20231229114342.195867-1-eugen.hristev@collabora.com
Signed-off-by: Mark Brown
Signed-off-by: Sasha Levin
---
 sound/soc/mediatek/mt8186/mt8186-dai-adda.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sound/soc/mediatek/mt8186/mt8186-dai-adda.c b/sound/soc/mediatek/mt8186/mt8186-dai-adda.c
index 094402470dc2..858b95b199dc 100644
--- a/sound/soc/mediatek/mt8186/mt8186-dai-adda.c
+++ b/sound/soc/mediatek/mt8186/mt8186-dai-adda.c
@@ -499,7 +499,7 @@ static const struct snd_soc_dapm_widget mtk_dai_adda_widgets[] = {
 			      SND_SOC_DAPM_PRE_PMU | SND_SOC_DAPM_POST_PMD),
 
 	SND_SOC_DAPM_SUPPLY_S("AUD_PAD_TOP", SUPPLY_SEQ_ADDA_AUD_PAD_TOP,
-			      0, 0, 0,
+			      AFE_AUD_PAD_TOP, RG_RX_FIFO_ON_SFT, 0,
 			      mtk_adda_pad_top_event,
 			      SND_SOC_DAPM_PRE_PMU),
 	SND_SOC_DAPM_SUPPLY_S("ADDA_MTKAIF_CFG", SUPPLY_SEQ_ADDA_MTKAIF_CFG,

From ac5cbe931c4355a8bd1628e2c8070efddb53e031 Mon Sep 17 00:00:00 2001
From: David Thompson
Date: Wed, 20 Dec 2023 18:47:39 -0500
Subject: [PATCH 027/151] mlxbf_gige: fix receive packet race condition

[ Upstream commit dcea1bd45e6d111cc8fc1aaefa7e31694089bda3 ]

Under heavy traffic, the BlueField Gigabit interface can become
unresponsive. This is due to a possible race condition in the
mlxbf_gige_rx_packet function, where the function exits with producer
and consumer indices equal but there are remaining packet(s) to be
processed.
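Roughly, the problematic ordering was (a simplified sketch, not the literal
driver code):

	rx_pi++;			/* replenish one RX buffer */
	/* ... write the new producer index to hardware ... */
	rx_ci = readq(priv->base + MLXBF_GIGE_RX_CQE_PACKET_CI);
	/* A packet may already have landed in the buffer replenished
	 * above, so rx_ci can equal rx_pi here even though a completed
	 * packet is still pending, and the caller stops polling.
	 */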
In order to prevent this situation, read receive consumer index *before* the HW replenish so that the mlxbf_gige_rx_packet function returns an accurate return value even if a packet is received into just-replenished buffer prior to exiting this routine. If the just-replenished buffer is received and occupies the last RX ring entry, the interface would not recover and instead would encounter RX packet drops related to internal buffer shortages since the driver RX logic is not being triggered to drain the RX ring. This patch will address and prevent this "ring full" condition. Fixes: f92e1869d74e ("Add Mellanox BlueField Gigabit Ethernet driver") Reviewed-by: Asmaa Mnebhi Signed-off-by: David Thompson Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_rx.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_rx.c b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_rx.c index 0d5a41a2ae01..227d01cace3f 100644 --- a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_rx.c +++ b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_rx.c @@ -267,6 +267,13 @@ static bool mlxbf_gige_rx_packet(struct mlxbf_gige *priv, int *rx_pkts) priv->stats.rx_truncate_errors++; } + /* Read receive consumer index before replenish so that this routine + * returns accurate return value even if packet is received into + * just-replenished buffer prior to exiting this routine. + */ + rx_ci = readq(priv->base + MLXBF_GIGE_RX_CQE_PACKET_CI); + rx_ci_rem = rx_ci % priv->rx_q_entries; + /* Let hardware know we've replenished one buffer */ rx_pi++; @@ -279,8 +286,6 @@ static bool mlxbf_gige_rx_packet(struct mlxbf_gige *priv, int *rx_pkts) rx_pi_rem = rx_pi % priv->rx_q_entries; if (rx_pi_rem == 0) priv->valid_polarity ^= 1; - rx_ci = readq(priv->base + MLXBF_GIGE_RX_CQE_PACKET_CI); - rx_ci_rem = rx_ci % priv->rx_q_entries; if (skb) netif_receive_skb(skb); From 565460e180d9d8542513859db86d11ec7b9bfdb6 Mon Sep 17 00:00:00 2001 From: Hangyu Hua Date: Thu, 21 Dec 2023 10:25:31 +0800 Subject: [PATCH 028/151] net: sched: em_text: fix possible memory leak in em_text_destroy() [ Upstream commit 8fcb0382af6f1ef50936f1be05b8149eb2f88496 ] m->data needs to be freed when em_text_destroy is called. Fixes: d675c989ed2d ("[PKT_SCHED]: Packet classification based on textsearch (ematch)") Acked-by: Jamal Hadi Salim Signed-off-by: Hangyu Hua Reviewed-by: Simon Horman Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- net/sched/em_text.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/sched/em_text.c b/net/sched/em_text.c index 6f3c1fb2fb44..f176afb70559 100644 --- a/net/sched/em_text.c +++ b/net/sched/em_text.c @@ -97,8 +97,10 @@ retry: static void em_text_destroy(struct tcf_ematch *m) { - if (EM_TEXT_PRIV(m) && EM_TEXT_PRIV(m)->config) + if (EM_TEXT_PRIV(m) && EM_TEXT_PRIV(m)->config) { textsearch_destroy(EM_TEXT_PRIV(m)->config); + kfree(EM_TEXT_PRIV(m)); + } } static int em_text_dump(struct sk_buff *skb, struct tcf_ematch *m) From 633a49e34b32b6f9bd56df405bc81eac7a6a2f7c Mon Sep 17 00:00:00 2001 From: Kai-Heng Feng Date: Fri, 22 Dec 2023 12:34:09 +0800 Subject: [PATCH 029/151] r8169: Fix PCI error on system resume [ Upstream commit 9c476269bff2908a20930c58085bf0b05ebd569a ] Some r8168 NICs stop working upon system resume: [ 688.051096] r8169 0000:02:00.1 enp2s0f1: rtl_ep_ocp_read_cond == 0 (loop: 10, delay: 10000). 
[ 688.175131] r8169 0000:02:00.1 enp2s0f1: Link is Down
...
[ 691.534611] r8169 0000:02:00.1 enp2s0f1: PCI error (cmd = 0x0407, status_errs = 0x0000)

Not sure if it's related, but those NICs have a BMC device at function 0:
02:00.0 Unassigned class [ff00]: Realtek Semiconductor Co., Ltd. Realtek RealManage BMC [10ec:816e] (rev 1a)

Trial and error shows that increasing the loop count on
rtl_ep_ocp_read_cond to 30 can eliminate the issue, so let
rtl8168ep_driver_start() wait a bit longer.

Fixes: e6d6ca6e1204 ("r8169: Add support for another RTL8168FP")
Signed-off-by: Kai-Heng Feng
Reviewed-by: Heiner Kallweit
Signed-off-by: David S. Miller
Signed-off-by: Sasha Levin
---
 drivers/net/ethernet/realtek/r8169_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c
index d22457f2cf9c..06663c11ca96 100644
--- a/drivers/net/ethernet/realtek/r8169_main.c
+++ b/drivers/net/ethernet/realtek/r8169_main.c
@@ -1145,7 +1145,7 @@ static void rtl8168ep_driver_start(struct rtl8169_private *tp)
 {
 	r8168ep_ocp_write(tp, 0x01, 0x180, OOB_CMD_DRIVER_START);
 	r8168ep_ocp_write(tp, 0x01, 0x30, r8168ep_ocp_read(tp, 0x30) | 0x01);
-	rtl_loop_wait_high(tp, &rtl_ep_ocp_read_cond, 10000, 10);
+	rtl_loop_wait_high(tp, &rtl_ep_ocp_read_cond, 10000, 30);
 }
 
 static void rtl8168_driver_start(struct rtl8169_private *tp)

From b2130366a952ba9896dc880dc4c6984f013a4d27 Mon Sep 17 00:00:00 2001
From: Marc Kleine-Budde
Date: Fri, 9 Dec 2022 10:10:08 +0100
Subject: [PATCH 030/151] can: raw: add support for SO_MARK

[ Upstream commit 0826e82b8a32e646b7b32ba8b68ba30812028e47 ]

Add support for SO_MARK to the CAN_RAW protocol. This makes it possible
to add traffic control filters based on the fwmark.

Link: https://lore.kernel.org/all/20221210113653.170346-1-mkl@pengutronix.de
Acked-by: Oliver Hartkopp
Signed-off-by: Marc Kleine-Budde
Stable-dep-of: 7f6ca95d16b9 ("net: Implement missing getsockopt(SO_TIMESTAMPING_NEW)")
Signed-off-by: Sasha Levin
---
 net/can/raw.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/can/raw.c b/net/can/raw.c
index 8c104339d538..488320738e31 100644
--- a/net/can/raw.c
+++ b/net/can/raw.c
@@ -881,6 +881,7 @@ static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
 
 	skb->dev = dev;
 	skb->priority = sk->sk_priority;
+	skb->mark = sk->sk_mark;
 	skb->tstamp = sockc.transmit_time;
 
 	skb_setup_tx_timestamp(skb, sockc.tsflags);

From 5d586f7ca0fc81097724d09fc0997137a77ef9dd Mon Sep 17 00:00:00 2001
From: Vadim Fedorenko
Date: Mon, 6 Mar 2023 08:07:38 -0800
Subject: [PATCH 031/151] net-timestamp: extend SOF_TIMESTAMPING_OPT_ID to HW timestamps

[ Upstream commit 8ca5a5790b9a1ce147484d2a2c4e66d2553f3d6c ]

When the feature was added it was enabled for SW timestamps only, but
with current hardware the same out-of-order timestamps can be seen.
Let's expand the area for the feature to all types of timestamps.

Signed-off-by: Vadim Fedorenko
Reviewed-by: Willem de Bruijn
Signed-off-by: David S. Miller
Stable-dep-of: 7f6ca95d16b9 ("net: Implement missing getsockopt(SO_TIMESTAMPING_NEW)")
Signed-off-by: Sasha Levin
---
 net/ipv4/ip_output.c  | 2 +-
 net/ipv6/ip6_output.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 493c679ea54f..d8ec802f9752 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -990,7 +990,7 @@ static int __ip_append_data(struct sock *sk,
 	mtu = cork->gso_size ?
IP_MAX_MTU : cork->fragsize; paged = !!cork->gso_size; - if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP && + if (cork->tx_flags & SKBTX_ANY_TSTAMP && sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) tskey = atomic_inc_return(&sk->sk_tskey) - 1; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 3c2b2a85de36..04822e2cba74 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1506,7 +1506,7 @@ static int __ip6_append_data(struct sock *sk, mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize; orig_mtu = mtu; - if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP && + if (cork->tx_flags & SKBTX_ANY_TSTAMP && sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) tskey = atomic_inc_return(&sk->sk_tskey) - 1; From c48fcb4f49061b8bdda946474215ba8c4e8c27b6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 31 Aug 2023 13:52:11 +0000 Subject: [PATCH 032/151] net: annotate data-races around sk->sk_tsflags [ Upstream commit e3390b30a5dfb112e8e802a59c0f68f947b638b2 ] sk->sk_tsflags can be read locklessly, add corresponding annotations. Fixes: b9f40e21ef42 ("net-timestamp: move timestamp flags out of sk_flags") Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Signed-off-by: David S. Miller Stable-dep-of: 7f6ca95d16b9 ("net: Implement missing getsockopt(SO_TIMESTAMPING_NEW)") Signed-off-by: Sasha Levin --- include/net/ip.h | 2 +- include/net/sock.h | 17 ++++++++++------- net/can/j1939/socket.c | 10 ++++++---- net/core/skbuff.c | 10 ++++++---- net/core/sock.c | 4 ++-- net/ipv4/ip_output.c | 2 +- net/ipv4/ip_sockglue.c | 2 +- net/ipv4/tcp.c | 4 ++-- net/ipv6/ip6_output.c | 2 +- net/ipv6/ping.c | 2 +- net/ipv6/raw.c | 2 +- net/ipv6/udp.c | 2 +- net/socket.c | 13 +++++++------ 13 files changed, 40 insertions(+), 32 deletions(-) diff --git a/include/net/ip.h b/include/net/ip.h index c286344628db..c83c09c65623 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -95,7 +95,7 @@ static inline void ipcm_init_sk(struct ipcm_cookie *ipcm, ipcm_init(ipcm); ipcm->sockc.mark = READ_ONCE(inet->sk.sk_mark); - ipcm->sockc.tsflags = inet->sk.sk_tsflags; + ipcm->sockc.tsflags = READ_ONCE(inet->sk.sk_tsflags); ipcm->oif = READ_ONCE(inet->sk.sk_bound_dev_if); ipcm->addr = inet->inet_saddr; ipcm->protocol = inet->inet_num; diff --git a/include/net/sock.h b/include/net/sock.h index b6027b01c245..d8ed62a8e1a3 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1928,7 +1928,9 @@ struct sockcm_cookie { static inline void sockcm_init(struct sockcm_cookie *sockc, const struct sock *sk) { - *sockc = (struct sockcm_cookie) { .tsflags = sk->sk_tsflags }; + *sockc = (struct sockcm_cookie) { + .tsflags = READ_ONCE(sk->sk_tsflags) + }; } int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg, @@ -2741,9 +2743,9 @@ void __sock_recv_wifi_status(struct msghdr *msg, struct sock *sk, static inline void sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) { - ktime_t kt = skb->tstamp; struct skb_shared_hwtstamps *hwtstamps = skb_hwtstamps(skb); - + u32 tsflags = READ_ONCE(sk->sk_tsflags); + ktime_t kt = skb->tstamp; /* * generate control messages if * - receive time stamping in software requested @@ -2751,10 +2753,10 @@ sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) * - hardware time stamps available and wanted */ if (sock_flag(sk, SOCK_RCVTSTAMP) || - (sk->sk_tsflags & SOF_TIMESTAMPING_RX_SOFTWARE) || - (kt && sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE) || + (tsflags & SOF_TIMESTAMPING_RX_SOFTWARE) || + (kt && tsflags & SOF_TIMESTAMPING_SOFTWARE) || 
(hwtstamps->hwtstamp && - (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE))) + (tsflags & SOF_TIMESTAMPING_RAW_HARDWARE))) __sock_recv_timestamp(msg, sk, skb); else sock_write_timestamp(sk, kt); @@ -2776,7 +2778,8 @@ static inline void sock_recv_cmsgs(struct msghdr *msg, struct sock *sk, #define TSFLAGS_ANY (SOF_TIMESTAMPING_SOFTWARE | \ SOF_TIMESTAMPING_RAW_HARDWARE) - if (sk->sk_flags & FLAGS_RECV_CMSGS || sk->sk_tsflags & TSFLAGS_ANY) + if (sk->sk_flags & FLAGS_RECV_CMSGS || + READ_ONCE(sk->sk_tsflags) & TSFLAGS_ANY) __sock_recv_cmsgs(msg, sk, skb); else if (unlikely(sock_flag(sk, SOCK_TIMESTAMP))) sock_write_timestamp(sk, skb->tstamp); diff --git a/net/can/j1939/socket.c b/net/can/j1939/socket.c index 9c828067b448..b0be23559243 100644 --- a/net/can/j1939/socket.c +++ b/net/can/j1939/socket.c @@ -974,6 +974,7 @@ static void __j1939_sk_errqueue(struct j1939_session *session, struct sock *sk, struct sock_exterr_skb *serr; struct sk_buff *skb; char *state = "UNK"; + u32 tsflags; int err; jsk = j1939_sk(sk); @@ -981,13 +982,14 @@ static void __j1939_sk_errqueue(struct j1939_session *session, struct sock *sk, if (!(jsk->state & J1939_SOCK_ERRQUEUE)) return; + tsflags = READ_ONCE(sk->sk_tsflags); switch (type) { case J1939_ERRQUEUE_TX_ACK: - if (!(sk->sk_tsflags & SOF_TIMESTAMPING_TX_ACK)) + if (!(tsflags & SOF_TIMESTAMPING_TX_ACK)) return; break; case J1939_ERRQUEUE_TX_SCHED: - if (!(sk->sk_tsflags & SOF_TIMESTAMPING_TX_SCHED)) + if (!(tsflags & SOF_TIMESTAMPING_TX_SCHED)) return; break; case J1939_ERRQUEUE_TX_ABORT: @@ -997,7 +999,7 @@ static void __j1939_sk_errqueue(struct j1939_session *session, struct sock *sk, case J1939_ERRQUEUE_RX_DPO: fallthrough; case J1939_ERRQUEUE_RX_ABORT: - if (!(sk->sk_tsflags & SOF_TIMESTAMPING_RX_SOFTWARE)) + if (!(tsflags & SOF_TIMESTAMPING_RX_SOFTWARE)) return; break; default: @@ -1054,7 +1056,7 @@ static void __j1939_sk_errqueue(struct j1939_session *session, struct sock *sk, } serr->opt_stats = true; - if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) + if (tsflags & SOF_TIMESTAMPING_OPT_ID) serr->ee.ee_data = session->tskey; netdev_dbg(session->priv->ndev, "%s: 0x%p tskey: %i, state: %s\n", diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 73b1e0e53534..8a819d0a7bfb 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4913,7 +4913,7 @@ static void __skb_complete_tx_timestamp(struct sk_buff *skb, serr->ee.ee_info = tstype; serr->opt_stats = opt_stats; serr->header.h4.iif = skb->dev ? 
skb->dev->ifindex : 0; - if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) { + if (READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) { serr->ee.ee_data = skb_shinfo(skb)->tskey; if (sk_is_tcp(sk)) serr->ee.ee_data -= atomic_read(&sk->sk_tskey); @@ -4969,21 +4969,23 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb, { struct sk_buff *skb; bool tsonly, opt_stats = false; + u32 tsflags; if (!sk) return; - if (!hwtstamps && !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TX_SWHW) && + tsflags = READ_ONCE(sk->sk_tsflags); + if (!hwtstamps && !(tsflags & SOF_TIMESTAMPING_OPT_TX_SWHW) && skb_shinfo(orig_skb)->tx_flags & SKBTX_IN_PROGRESS) return; - tsonly = sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TSONLY; + tsonly = tsflags & SOF_TIMESTAMPING_OPT_TSONLY; if (!skb_may_tx_timestamp(sk, tsonly)) return; if (tsonly) { #ifdef CONFIG_INET - if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_STATS) && + if ((tsflags & SOF_TIMESTAMPING_OPT_STATS) && sk_is_tcp(sk)) { skb = tcp_get_timestamping_opt_stats(sk, orig_skb, ack_skb); diff --git a/net/core/sock.c b/net/core/sock.c index 4305e55dbfba..929055bc0cc7 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -926,7 +926,7 @@ int sock_set_timestamping(struct sock *sk, int optname, return ret; } - sk->sk_tsflags = val; + WRITE_ONCE(sk->sk_tsflags, val); sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW); if (val & SOF_TIMESTAMPING_RX_SOFTWARE) @@ -1705,7 +1705,7 @@ int sk_getsockopt(struct sock *sk, int level, int optname, case SO_TIMESTAMPING_OLD: lv = sizeof(v.timestamping); - v.timestamping.flags = sk->sk_tsflags; + v.timestamping.flags = READ_ONCE(sk->sk_tsflags); v.timestamping.bind_phc = sk->sk_bind_phc; break; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index d8ec802f9752..e19ef88ae181 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -991,7 +991,7 @@ static int __ip_append_data(struct sock *sk, paged = !!cork->gso_size; if (cork->tx_flags & SKBTX_ANY_TSTAMP && - sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) + READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) tskey = atomic_inc_return(&sk->sk_tskey) - 1; hh_len = LL_RESERVED_SPACE(rt->dst.dev); diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 63aa52becd88..c1fb7580ea58 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -509,7 +509,7 @@ static bool ipv4_datagram_support_cmsg(const struct sock *sk, * or without payload (SOF_TIMESTAMPING_OPT_TSONLY). 
*/ info = PKTINFO_SKB_CB(skb); - if (!(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_CMSG) || + if (!(READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_CMSG) || !info->ipi_ifindex) return false; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 58409ea2da0a..3935451ad061 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2359,14 +2359,14 @@ void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk, } } - if (sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE) + if (READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_SOFTWARE) has_timestamping = true; else tss->ts[0] = (struct timespec64) {0}; } if (tss->ts[2].tv_sec || tss->ts[2].tv_nsec) { - if (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) + if (READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_RAW_HARDWARE) has_timestamping = true; else tss->ts[2] = (struct timespec64) {0}; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 04822e2cba74..e9ae084d038d 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1507,7 +1507,7 @@ static int __ip6_append_data(struct sock *sk, orig_mtu = mtu; if (cork->tx_flags & SKBTX_ANY_TSTAMP && - sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) + READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) tskey = atomic_inc_return(&sk->sk_tskey) - 1; hh_len = LL_RESERVED_SPACE(rt->dst.dev); diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index 4d5a27dd9a4b..a5d7d1915ba7 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -119,7 +119,7 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) return -EINVAL; ipcm6_init_sk(&ipc6, np); - ipc6.sockc.tsflags = sk->sk_tsflags; + ipc6.sockc.tsflags = READ_ONCE(sk->sk_tsflags); ipc6.sockc.mark = READ_ONCE(sk->sk_mark); fl6.flowi6_oif = oif; diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index df3abd9e5237..dc31752a7edc 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -776,7 +776,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl6.flowi6_uid = sk->sk_uid; ipcm6_init(&ipc6); - ipc6.sockc.tsflags = sk->sk_tsflags; + ipc6.sockc.tsflags = READ_ONCE(sk->sk_tsflags); ipc6.sockc.mark = fl6.flowi6_mark; if (sin6) { diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 64b36c2ba774..7f49f69226a2 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1358,7 +1358,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) ipcm6_init(&ipc6); ipc6.gso_size = READ_ONCE(up->gso_size); - ipc6.sockc.tsflags = sk->sk_tsflags; + ipc6.sockc.tsflags = READ_ONCE(sk->sk_tsflags); ipc6.sockc.mark = READ_ONCE(sk->sk_mark); /* destination address check */ diff --git a/net/socket.c b/net/socket.c index 04cba91c7cbe..9c1fb94b1285 100644 --- a/net/socket.c +++ b/net/socket.c @@ -826,7 +826,7 @@ static bool skb_is_swtx_tstamp(const struct sk_buff *skb, int false_tstamp) static ktime_t get_timestamp(struct sock *sk, struct sk_buff *skb, int *if_index) { - bool cycles = sk->sk_tsflags & SOF_TIMESTAMPING_BIND_PHC; + bool cycles = READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_BIND_PHC; struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb); struct net_device *orig_dev; ktime_t hwtstamp; @@ -878,12 +878,12 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, int need_software_tstamp = sock_flag(sk, SOCK_RCVTSTAMP); int new_tstamp = sock_flag(sk, SOCK_TSTAMP_NEW); struct scm_timestamping_internal tss; - int empty = 1, false_tstamp = 0; struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb); int if_index; ktime_t hwtstamp; + u32 tsflags; /* Race occurred between timestamp enabling and packet receiving. 
Fill in the current time for now. */ @@ -925,11 +925,12 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, } memset(&tss, 0, sizeof(tss)); - if ((sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE) && + tsflags = READ_ONCE(sk->sk_tsflags); + if ((tsflags & SOF_TIMESTAMPING_SOFTWARE) && ktime_to_timespec64_cond(skb->tstamp, tss.ts + 0)) empty = 0; if (shhwtstamps && - (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) && + (tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) && !skb_is_swtx_tstamp(skb, false_tstamp)) { if_index = 0; if (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP_NETDEV) @@ -937,14 +938,14 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, else hwtstamp = shhwtstamps->hwtstamp; - if (sk->sk_tsflags & SOF_TIMESTAMPING_BIND_PHC) + if (tsflags & SOF_TIMESTAMPING_BIND_PHC) hwtstamp = ptp_convert_timestamp(&hwtstamp, sk->sk_bind_phc); if (ktime_to_timespec64_cond(hwtstamp, tss.ts + 2)) { empty = 0; - if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_PKTINFO) && + if ((tsflags & SOF_TIMESTAMPING_OPT_PKTINFO) && !skb_is_err_queue(skb)) put_ts_pktinfo(msg, skb, if_index); } From ac5fde92b5103d2fcb4aa72cb01e2dd714d704c9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 31 Aug 2023 13:52:12 +0000 Subject: [PATCH 033/151] net: annotate data-races around sk->sk_bind_phc [ Upstream commit 251cd405a9e6e70b92fe5afbdd17fd5caf9d3266 ] sk->sk_bind_phc is read locklessly. Add corresponding annotations. Fixes: d463126e23f1 ("net: sock: extend SO_TIMESTAMPING for PHC binding") Signed-off-by: Eric Dumazet Cc: Yangbo Lu Signed-off-by: David S. Miller Stable-dep-of: 7f6ca95d16b9 ("net: Implement missing getsockopt(SO_TIMESTAMPING_NEW)") Signed-off-by: Sasha Levin --- net/core/sock.c | 4 ++-- net/socket.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/net/core/sock.c b/net/core/sock.c index 929055bc0cc7..49b7f252ddae 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -890,7 +890,7 @@ static int sock_timestamping_bind_phc(struct sock *sk, int phc_index) if (!match) return -EINVAL; - sk->sk_bind_phc = phc_index; + WRITE_ONCE(sk->sk_bind_phc, phc_index); return 0; } @@ -1706,7 +1706,7 @@ int sk_getsockopt(struct sock *sk, int level, int optname, case SO_TIMESTAMPING_OLD: lv = sizeof(v.timestamping); v.timestamping.flags = READ_ONCE(sk->sk_tsflags); - v.timestamping.bind_phc = sk->sk_bind_phc; + v.timestamping.bind_phc = READ_ONCE(sk->sk_bind_phc); break; case SO_RCVTIMEO_OLD: diff --git a/net/socket.c b/net/socket.c index 9c1fb94b1285..07470724e735 100644 --- a/net/socket.c +++ b/net/socket.c @@ -940,7 +940,7 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, if (tsflags & SOF_TIMESTAMPING_BIND_PHC) hwtstamp = ptp_convert_timestamp(&hwtstamp, - sk->sk_bind_phc); + READ_ONCE(sk->sk_bind_phc)); if (ktime_to_timespec64_cond(hwtstamp, tss.ts + 2)) { empty = 0; From 3edd66bd4e42225298338a2238fb3938fd2174ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rn-Thorben=20Hinz?= Date: Fri, 22 Dec 2023 00:19:01 +0100 Subject: [PATCH 034/151] net: Implement missing getsockopt(SO_TIMESTAMPING_NEW) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 7f6ca95d16b96567ce4cf458a2790ff17fa620c3 ] Commit 9718475e6908 ("socket: Add SO_TIMESTAMPING_NEW") added the new socket option SO_TIMESTAMPING_NEW. Setting the option is handled in sk_setsockopt(), querying it was not handled in sk_getsockopt(), though. 
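A minimal user-space sketch of the gap (illustrative only; it assumes UAPI
headers recent enough to provide struct so_timestamping, and falls back to
the asm-generic value 65 for SO_TIMESTAMPING_NEW if the libc headers lack
it):

	#include <stdio.h>
	#include <string.h>
	#include <sys/socket.h>
	#include <linux/net_tstamp.h>

	#ifndef SO_TIMESTAMPING_NEW
	#define SO_TIMESTAMPING_NEW 65	/* include/uapi/asm-generic/socket.h */
	#endif

	int main(void)
	{
		struct so_timestamping ts = {
			.flags = SOF_TIMESTAMPING_TX_SOFTWARE |
				 SOF_TIMESTAMPING_OPT_ID,
		};
		socklen_t len = sizeof(ts);
		int fd = socket(AF_INET, SOCK_DGRAM, 0);

		if (fd < 0)
			return 1;
		setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING_NEW, &ts, sizeof(ts));
		memset(&ts, 0, sizeof(ts));
		/* Not handled before this patch; reports the flags afterwards. */
		if (getsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING_NEW, &ts, &len))
			perror("getsockopt");
		else
			printf("flags=0x%x bind_phc=%d\n", ts.flags, ts.bind_phc);
		return 0;
	}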
Following remarks on an earlier submission of this patch, keep the old behavior of getsockopt(SO_TIMESTAMPING_OLD) which returns the active flags even if they actually have been set through SO_TIMESTAMPING_NEW. The new getsockopt(SO_TIMESTAMPING_NEW) is stricter, returning flags only if they have been set through the same option. Fixes: 9718475e6908 ("socket: Add SO_TIMESTAMPING_NEW") Link: https://lore.kernel.org/lkml/20230703175048.151683-1-jthinz@mailbox.tu-berlin.de/ Link: https://lore.kernel.org/netdev/0d7cddc9-03fa-43db-a579-14f3e822615b@app.fastmail.com/ Signed-off-by: Jörn-Thorben Hinz Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- net/core/sock.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/net/core/sock.c b/net/core/sock.c index 49b7f252ddae..0d8754ec837d 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1704,9 +1704,16 @@ int sk_getsockopt(struct sock *sk, int level, int optname, break; case SO_TIMESTAMPING_OLD: + case SO_TIMESTAMPING_NEW: lv = sizeof(v.timestamping); - v.timestamping.flags = READ_ONCE(sk->sk_tsflags); - v.timestamping.bind_phc = READ_ONCE(sk->sk_bind_phc); + /* For the later-added case SO_TIMESTAMPING_NEW: Be strict about only + * returning the flags when they were set through the same option. + * Don't change the beviour for the old case SO_TIMESTAMPING_OLD. + */ + if (optname == SO_TIMESTAMPING_OLD || sock_flag(sk, SOCK_TSTAMP_NEW)) { + v.timestamping.flags = READ_ONCE(sk->sk_tsflags); + v.timestamping.bind_phc = READ_ONCE(sk->sk_bind_phc); + } break; case SO_RCVTIMEO_OLD: From 85f6fae44bba43a97449a3d22f6dabde22c2b842 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Sat, 23 Dec 2023 20:59:22 +0800 Subject: [PATCH 035/151] selftests: bonding: do not set port down when adding to bond [ Upstream commit 61fa2493ca76fd7bb74e13f0205274f4ab0aa696 ] Similar to commit be809424659c ("selftests: bonding: do not set port down before adding to bond"). The bond-arp-interval-causes-panic test failed after commit a4abfa627c38 ("net: rtnetlink: Enslave device before bringing it up") as the kernel will set the port down _after_ adding to bond if setting port down specifically. Fix it by removing the link down operation when adding to bond. Fixes: 2ffd57327ff1 ("selftests: bonding: cause oops in bond_rr_gen_slave_id") Signed-off-by: Hangbin Liu Tested-by: Benjamin Poirier Signed-off-by: David S. 
Miller
Signed-off-by: Sasha Levin
---
 .../drivers/net/bonding/bond-arp-interval-causes-panic.sh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tools/testing/selftests/drivers/net/bonding/bond-arp-interval-causes-panic.sh b/tools/testing/selftests/drivers/net/bonding/bond-arp-interval-causes-panic.sh
index 71c00bfafbc9..2ff58fed76e2 100755
--- a/tools/testing/selftests/drivers/net/bonding/bond-arp-interval-causes-panic.sh
+++ b/tools/testing/selftests/drivers/net/bonding/bond-arp-interval-causes-panic.sh
@@ -33,16 +33,16 @@ ip netns add "client"
 ip link set dev link1_1 netns client down name eth0
 ip netns exec client ip link add dev bond0 down type bond mode 1 \
 	miimon 100 all_slaves_active 1
-ip netns exec client ip link set dev eth0 down master bond0
+ip netns exec client ip link set dev eth0 master bond0
 ip netns exec client ip link set dev bond0 up
 ip netns exec client ip addr add ${client_ip4}/24 dev bond0
 ip netns exec client ping -c 5 $server_ip4 >/dev/null
-ip netns exec client ip link set dev eth0 down nomaster
+ip netns exec client ip link set dev eth0 nomaster
 ip netns exec client ip link set dev bond0 down
 ip netns exec client ip link set dev bond0 type bond mode 0 \
 	arp_interval 1000 arp_ip_target "+${server_ip4}"
-ip netns exec client ip link set dev eth0 down master bond0
+ip netns exec client ip link set dev eth0 master bond0
 ip netns exec client ip link set dev bond0 up
 ip netns exec client ping -c 5 $server_ip4 >/dev/null

From 725d44e49fb5effc82df125eae6408f289e604c9 Mon Sep 17 00:00:00 2001
From: Stefan Wahren
Date: Thu, 28 Dec 2023 20:39:02 +0100
Subject: [PATCH 036/151] ARM: sun9i: smp: Fix array-index-out-of-bounds read in sunxi_mc_smp_init

[ Upstream commit 72ad3b772b6d393701df58ba1359b0bb346a19ed ]

Running a multi-arch kernel (multi_v7_defconfig) on a Raspberry Pi 3B+
with CONFIG_UBSAN enabled triggers the following warning:

UBSAN: array-index-out-of-bounds in arch/arm/mach-sunxi/mc_smp.c:810:29
index 2 is out of range for type 'sunxi_mc_smp_data [2]'
CPU: 0 PID: 1 Comm: swapper/0 Not tainted 6.7.0-rc6-00248-g5254c0cbc92d
Hardware name: BCM2835
 unwind_backtrace from show_stack+0x10/0x14
 show_stack from dump_stack_lvl+0x40/0x4c
 dump_stack_lvl from ubsan_epilogue+0x8/0x34
 ubsan_epilogue from __ubsan_handle_out_of_bounds+0x78/0x80
 __ubsan_handle_out_of_bounds from sunxi_mc_smp_init+0xe4/0x4cc
 sunxi_mc_smp_init from do_one_initcall+0xa0/0x2fc
 do_one_initcall from kernel_init_freeable+0xf4/0x2f4
 kernel_init_freeable from kernel_init+0x18/0x158
 kernel_init from ret_from_fork+0x14/0x28

Since the enable method may not match any entry in sunxi_mc_smp_data,
the index value shouldn't be used right after the loop. So move the
access after the check of ret in order to guarantee a valid index.
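The corrected ordering, sketched (the loop body is paraphrased rather than
quoted):

	for (i = 0; i < ARRAY_SIZE(sunxi_mc_smp_data); i++) {
		/* ... try to match the enable method; ret is 0 on a match ... */
		if (!ret)
			break;
	}

	of_node_put(node);

	if (ret)
		return -ENODEV;	/* no match: i is out of range here */

	is_a83t = sunxi_mc_smp_data[i].is_a83t;	/* read i only after the check */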
Fixes: 1631090e34f5 ("ARM: sun9i: smp: Add is_a83t field")
Signed-off-by: Stefan Wahren
Link: https://lore.kernel.org/r/20231228193903.9078-1-wahrenst@gmx.net
Reviewed-by: Chen-Yu Tsai
Signed-off-by: Arnd Bergmann
Signed-off-by: Sasha Levin
---
 arch/arm/mach-sunxi/mc_smp.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/arm/mach-sunxi/mc_smp.c b/arch/arm/mach-sunxi/mc_smp.c
index 26cbce135338..b2f5f4f28705 100644
--- a/arch/arm/mach-sunxi/mc_smp.c
+++ b/arch/arm/mach-sunxi/mc_smp.c
@@ -808,12 +808,12 @@ static int __init sunxi_mc_smp_init(void)
 			break;
 	}
 
-	is_a83t = sunxi_mc_smp_data[i].is_a83t;
-
 	of_node_put(node);
 	if (ret)
 		return -ENODEV;
 
+	is_a83t = sunxi_mc_smp_data[i].is_a83t;
+
 	if (!sunxi_mc_smp_cpu_table_init())
 		return -EINVAL;

From e75715e1c2e5af328f8daf78fbf0327d2c8f051c Mon Sep 17 00:00:00 2001
From: Zhipeng Lu
Date: Mon, 25 Dec 2023 19:29:14 +0800
Subject: [PATCH 037/151] sfc: fix a double-free bug in efx_probe_filters

[ Upstream commit d5a306aedba34e640b11d7026dbbafb78ee3a5f6 ]

In efx_probe_filters, channel->rps_flow_id is freed in an
efx_for_each_channel macro when success equals 0. However, after the
following call chain:

ef100_net_open
|-> efx_probe_filters
|-> ef100_net_stop
    |-> efx_remove_filters

The channel->rps_flow_id is freed again in the efx_for_each_channel of
efx_remove_filters, triggering a double-free bug.

Fixes: a9dc3d5612ce ("sfc_ef100: RX filter table management and related gubbins")
Reviewed-by: Simon Horman
Reviewed-by: Edward Cree
Signed-off-by: Zhipeng Lu
Link: https://lore.kernel.org/r/20231225112915.3544581-1-alexious@zju.edu.cn
Signed-off-by: Jakub Kicinski
Signed-off-by: Sasha Levin
---
 drivers/net/ethernet/sfc/rx_common.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/sfc/rx_common.c b/drivers/net/ethernet/sfc/rx_common.c
index 9220afeddee8..3f290791df1c 100644
--- a/drivers/net/ethernet/sfc/rx_common.c
+++ b/drivers/net/ethernet/sfc/rx_common.c
@@ -820,8 +820,10 @@ int efx_probe_filters(struct efx_nic *efx)
 	}
 
 	if (!success) {
-		efx_for_each_channel(channel, efx)
+		efx_for_each_channel(channel, efx) {
 			kfree(channel->rps_flow_id);
+			channel->rps_flow_id = NULL;
+		}
 		efx->type->filter_table_remove(efx);
 		rc = -ENOMEM;
 		goto out_unlock;

From bb1bf97fa1877d40ad93f27e6d4b4f993565e465 Mon Sep 17 00:00:00 2001
From: Adrian Cinal
Date: Thu, 28 Dec 2023 14:56:38 +0100
Subject: [PATCH 038/151] net: bcmgenet: Fix FCS generation for fragmented skbuffs

[ Upstream commit e584f2ff1e6cc9b1d99e8a6b0f3415940d1b3eb3 ]

The flag DMA_TX_APPEND_CRC was only written to the first DMA descriptor
in the TX path, where each descriptor corresponds to a single skbuff
fragment (or the skbuff head). This led to packets with no FCS appearing
on the wire if the kernel allocated the packet in fragments, which would
always happen when using PACKET_MMAP/TPACKET (cf. tpacket_fill_skb() in
net/af_packet.c).
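Conceptually, the fix hoists the flag out of the first-descriptor branch
(a sketch mirroring the diff below):

	len_stat |= DMA_TX_APPEND_CRC;	/* now requested for every fragment */

	if (!i) {			/* first descriptor (SOP) only */
		len_stat |= DMA_SOP;
		if (skb->ip_summed == CHECKSUM_PARTIAL)
			len_stat |= DMA_TX_DO_CSUM;
	}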
Fixes: 1c1008c793fa ("net: bcmgenet: add main driver file") Signed-off-by: Adrian Cinal Acked-by: Doug Berger Acked-by: Florian Fainelli Link: https://lore.kernel.org/r/20231228135638.1339245-1-adriancinal1@gmail.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/net/ethernet/broadcom/genet/bcmgenet.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c index 1ae082eb9e90..c2a991308215 100644 --- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c +++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c @@ -2131,8 +2131,10 @@ static netdev_tx_t bcmgenet_xmit(struct sk_buff *skb, struct net_device *dev) /* Note: if we ever change from DMA_TX_APPEND_CRC below we * will need to restore software padding of "runt" packets */ + len_stat |= DMA_TX_APPEND_CRC; + if (!i) { - len_stat |= DMA_TX_APPEND_CRC | DMA_SOP; + len_stat |= DMA_SOP; if (skb->ip_summed == CHECKSUM_PARTIAL) len_stat |= DMA_TX_DO_CSUM; } From 81f8a995ebc8fa90903ff690c48a563116cb5dc4 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 1 Jan 2024 20:15:33 +0100 Subject: [PATCH 039/151] netfilter: nft_immediate: drop chain reference counter on error [ Upstream commit b29be0ca8e816119ccdf95cc7d7c7be9bde005f1 ] In the init path, nft_data_init() bumps the chain reference counter, decrement it on error by following the error path which calls nft_data_release() to restore it. Fixes: 4bedf9eee016 ("netfilter: nf_tables: fix chain binding transaction logic") Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin --- net/netfilter/nft_immediate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c index 5f59dbab3e93..55fcf0280c5c 100644 --- a/net/netfilter/nft_immediate.c +++ b/net/netfilter/nft_immediate.c @@ -78,7 +78,7 @@ static int nft_immediate_init(const struct nft_ctx *ctx, case NFT_GOTO: err = nf_tables_bind_chain(ctx, chain); if (err < 0) - return err; + goto err1; break; default: break; From 72fa66177859d9552163835d9f1fc5f6bd68d43f Mon Sep 17 00:00:00 2001 From: Marc Dionne Date: Thu, 21 Dec 2023 09:12:30 -0400 Subject: [PATCH 040/151] net: Save and restore msg_namelen in sock_sendmsg [ Upstream commit 01b2885d9415152bcb12ff1f7788f500a74ea0ed ] Commit 86a7e0b69bd5 ("net: prevent rewrite of msg_name in sock_sendmsg()") made sock_sendmsg save the incoming msg_name pointer and restore it before returning, to insulate the caller against msg_name being changed by the called code. If the address length was also changed however, we may return with an inconsistent structure where the length doesn't match the address, and attempts to reuse it may lead to lost packets. For example, a kernel that doesn't have commit 1c5950fc6fe9 ("udp6: fix potential access to stale information") will replace a v4 mapped address with its ipv4 equivalent, and shorten namelen accordingly from 28 to 16. If the caller attempts to reuse the resulting msg structure, it will have the original ipv6 (v4 mapped) address but an incorrect v4 length. Fixes: 86a7e0b69bd5 ("net: prevent rewrite of msg_name in sock_sendmsg()") Signed-off-by: Marc Dionne Reviewed-by: Willem de Bruijn Signed-off-by: David S. 
Miller
Signed-off-by: Sasha Levin
---
 net/socket.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/socket.c b/net/socket.c
index 07470724e735..0104617b440d 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -740,6 +740,7 @@ int sock_sendmsg(struct socket *sock, struct msghdr *msg)
 {
 	struct sockaddr_storage *save_addr = (struct sockaddr_storage *)msg->msg_name;
 	struct sockaddr_storage address;
+	int save_len = msg->msg_namelen;
 	int ret;
 
 	if (msg->msg_name) {
@@ -749,6 +750,7 @@ int sock_sendmsg(struct socket *sock, struct msghdr *msg)
 		ret = __sock_sendmsg(sock, msg);
 		msg->msg_name = save_addr;
+		msg->msg_namelen = save_len;
 		return ret;
 	}

From 2f3b6e8600c9a4089942c0530575c53e18600a4e Mon Sep 17 00:00:00 2001
From: Ke Xiao
Date: Mon, 18 Dec 2023 15:08:50 +0800
Subject: [PATCH 041/151] i40e: fix use-after-free in i40e_aqc_add_filters()

[ Upstream commit 6a15584e99db8918b60e507539c7446375dcf366 ]

Commit 3116f59c12bd ("i40e: fix use-after-free in
i40e_sync_filters_subtask()") avoided use-after-free issues by
increasing the refcount while updating the VSI filter list to the HW.
However, it missed the unicast situation.

When deleting a unicast FDB entry, the i40e driver will release the
mac_filter, and i40e_service_task will concurrently request firmware to
add the mac_filter, which will lead to the following use-after-free
issue. Fix again for both netdev->uc and netdev->mc.

BUG: KASAN: use-after-free in i40e_aqc_add_filters+0x55c/0x5b0 [i40e]
Read of size 2 at addr ffff888eb3452d60 by task kworker/8:7/6379

CPU: 8 PID: 6379 Comm: kworker/8:7 Kdump: loaded Tainted: G W E
Workqueue: i40e i40e_service_task [i40e]
Call Trace:
 dump_stack+0x71/0xab
 print_address_description+0x6b/0x290
 kasan_report+0x14a/0x2b0
 i40e_aqc_add_filters+0x55c/0x5b0 [i40e]
 i40e_sync_vsi_filters+0x1676/0x39c0 [i40e]
 i40e_service_task+0x1397/0x2bb0 [i40e]
 process_one_work+0x56a/0x11f0
 worker_thread+0x8f/0xf40
 kthread+0x2a0/0x390
 ret_from_fork+0x1f/0x40

Allocated by task 21948:
 kasan_kmalloc+0xa6/0xd0
 kmem_cache_alloc_trace+0xdb/0x1c0
 i40e_add_filter+0x11e/0x520 [i40e]
 i40e_addr_sync+0x37/0x60 [i40e]
 __hw_addr_sync_dev+0x1f5/0x2f0
 i40e_set_rx_mode+0x61/0x1e0 [i40e]
 dev_uc_add_excl+0x137/0x190
 i40e_ndo_fdb_add+0x161/0x260 [i40e]
 rtnl_fdb_add+0x567/0x950
 rtnetlink_rcv_msg+0x5db/0x880
 netlink_rcv_skb+0x254/0x380
 netlink_unicast+0x454/0x610
 netlink_sendmsg+0x747/0xb00
 sock_sendmsg+0xe2/0x120
 __sys_sendto+0x1ae/0x290
 __x64_sys_sendto+0xdd/0x1b0
 do_syscall_64+0xa0/0x370
 entry_SYSCALL_64_after_hwframe+0x65/0xca

Freed by task 21948:
 __kasan_slab_free+0x137/0x190
 kfree+0x8b/0x1b0
 __i40e_del_filter+0x116/0x1e0 [i40e]
 i40e_del_mac_filter+0x16c/0x300 [i40e]
 i40e_addr_unsync+0x134/0x1b0 [i40e]
 __hw_addr_sync_dev+0xff/0x2f0
 i40e_set_rx_mode+0x61/0x1e0 [i40e]
 dev_uc_del+0x77/0x90
 rtnl_fdb_del+0x6a5/0x860
 rtnetlink_rcv_msg+0x5db/0x880
 netlink_rcv_skb+0x254/0x380
 netlink_unicast+0x454/0x610
 netlink_sendmsg+0x747/0xb00
 sock_sendmsg+0xe2/0x120
 __sys_sendto+0x1ae/0x290
 __x64_sys_sendto+0xdd/0x1b0
 do_syscall_64+0xa0/0x370
 entry_SYSCALL_64_after_hwframe+0x65/0xca

Fixes: 3116f59c12bd ("i40e: fix use-after-free in i40e_sync_filters_subtask()")
Fixes: 41c445ff0f48 ("i40e: main driver core")
Signed-off-by: Ke Xiao
Signed-off-by: Ding Hui
Cc: Di Zhu
Reviewed-by: Jan Sokolowski
Reviewed-by: Simon Horman
Reviewed-by: Jacob Keller
Tested-by: Pucha Himasekhar Reddy (A Contingent worker at Intel)
Signed-off-by: Tony Nguyen
Signed-off-by: Sasha Levin
---
 drivers/net/ethernet/intel/i40e/i40e_main.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff
--git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index b4157ff370a3..cdc68b78bd9e 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -104,12 +104,18 @@ static struct workqueue_struct *i40e_wq;
 static void netdev_hw_addr_refcnt(struct i40e_mac_filter *f,
 				  struct net_device *netdev, int delta)
 {
+	struct netdev_hw_addr_list *ha_list;
 	struct netdev_hw_addr *ha;
 
 	if (!f || !netdev)
 		return;
 
-	netdev_for_each_mc_addr(ha, netdev) {
+	if (is_unicast_ether_addr(f->macaddr) || is_link_local_ether_addr(f->macaddr))
+		ha_list = &netdev->uc;
+	else
+		ha_list = &netdev->mc;
+
+	netdev_hw_addr_list_for_each(ha, ha_list) {
 		if (ether_addr_equal(ha->addr, f->macaddr)) {
 			ha->refcount += delta;
 			if (ha->refcount <= 0)

From 95b4d4093ac0c3f5b821c94b4f7eba7c4a6cf74b Mon Sep 17 00:00:00 2001
From: Mark Brown
Date: Wed, 3 Jan 2024 18:34:01 +0000
Subject: [PATCH 042/151] ASoC: meson: g12a-toacodec: Validate written enum values

[ Upstream commit 3150b70e944ead909260285dfb5707d0bedcf87b ]

When writing to an enum we need to verify that the value written is
valid for the enumeration; the helper function snd_soc_enum_item_to_val()
doesn't do it, since it needs to return an unsigned value (and in any
case we'd need to check the return value).

Fixes: af2618a2eee8 ("ASoC: meson: g12a: add internal DAC glue driver")
Signed-off-by: Mark Brown
Link: https://lore.kernel.org/r/20240103-meson-enum-val-v1-1-424af7a8fb91@kernel.org
Signed-off-by: Mark Brown
Signed-off-by: Sasha Levin
---
 sound/soc/meson/g12a-toacodec.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/sound/soc/meson/g12a-toacodec.c b/sound/soc/meson/g12a-toacodec.c
index ddc667956cf5..3b1ce9143c65 100644
--- a/sound/soc/meson/g12a-toacodec.c
+++ b/sound/soc/meson/g12a-toacodec.c
@@ -71,6 +71,9 @@ static int g12a_toacodec_mux_put_enum(struct snd_kcontrol *kcontrol,
 	struct soc_enum *e = (struct soc_enum *)kcontrol->private_value;
 	unsigned int mux, reg;
 
+	if (ucontrol->value.enumerated.item[0] >= e->items)
+		return -EINVAL;
+
 	mux = snd_soc_enum_item_to_val(e, ucontrol->value.enumerated.item[0]);
 	regmap_field_read(priv->field_dat_sel, &reg);

From 5de3c8496e770b6d4c18c02ed252c3ae239f560d Mon Sep 17 00:00:00 2001
From: Mark Brown
Date: Wed, 3 Jan 2024 18:34:02 +0000
Subject: [PATCH 043/151] ASoC: meson: g12a-tohdmitx: Validate written enum values

[ Upstream commit 1e001206804be3f3d21f4a1cf16e5d059d75643f ]

When writing to an enum we need to verify that the value written is
valid for the enumeration; the helper function snd_soc_enum_item_to_val()
doesn't do it, since it needs to return an unsigned value (and in any
case we'd need to check the return value).
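The guard needed in each put() callback is the usual bounds check, sketched
here with a hypothetical callback name:

	static int example_mux_put_enum(struct snd_kcontrol *kcontrol,
					struct snd_ctl_elem_value *ucontrol)
	{
		struct soc_enum *e = (struct soc_enum *)kcontrol->private_value;
		unsigned int mux;

		/* Reject out-of-range selections before converting them. */
		if (ucontrol->value.enumerated.item[0] >= e->items)
			return -EINVAL;

		mux = snd_soc_enum_item_to_val(e, ucontrol->value.enumerated.item[0]);
		/* ... apply the mux setting ... */
		return 0;
	}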
Fixes: c8609f3870f7 ("ASoC: meson: add g12a tohdmitx control") Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20240103-meson-enum-val-v1-2-424af7a8fb91@kernel.org Signed-off-by: Mark Brown Signed-off-by: Sasha Levin --- sound/soc/meson/g12a-tohdmitx.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sound/soc/meson/g12a-tohdmitx.c b/sound/soc/meson/g12a-tohdmitx.c index 579a04ad4d19..46d1f04e0e8a 100644 --- a/sound/soc/meson/g12a-tohdmitx.c +++ b/sound/soc/meson/g12a-tohdmitx.c @@ -45,6 +45,9 @@ static int g12a_tohdmitx_i2s_mux_put_enum(struct snd_kcontrol *kcontrol, struct soc_enum *e = (struct soc_enum *)kcontrol->private_value; unsigned int mux, changed; + if (ucontrol->value.enumerated.item[0] >= e->items) + return -EINVAL; + mux = snd_soc_enum_item_to_val(e, ucontrol->value.enumerated.item[0]); changed = snd_soc_component_test_bits(component, e->reg, CTRL0_I2S_DAT_SEL, @@ -93,6 +96,9 @@ static int g12a_tohdmitx_spdif_mux_put_enum(struct snd_kcontrol *kcontrol, struct soc_enum *e = (struct soc_enum *)kcontrol->private_value; unsigned int mux, changed; + if (ucontrol->value.enumerated.item[0] >= e->items) + return -EINVAL; + mux = snd_soc_enum_item_to_val(e, ucontrol->value.enumerated.item[0]); changed = snd_soc_component_test_bits(component, TOHDMITX_CTRL0, CTRL0_SPDIF_SEL, From 8719838c126ac82d98193d356fe6ce2b0872cdf2 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 3 Jan 2024 18:34:03 +0000 Subject: [PATCH 044/151] ASoC: meson: g12a-toacodec: Fix event generation [ Upstream commit 172c88244b5f2d3375403ebb504d407be0fded59 ] When a control changes value the return value from _put() should be 1 so we get events generated to userspace notifying applications of the change. We are checking if there has been a change and exiting early if not but we are not providing the correct return value in the latter case, fix this. Fixes: af2618a2eee8 ("ASoC: meson: g12a: add internal DAC glue driver") Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20240103-meson-enum-val-v1-3-424af7a8fb91@kernel.org Signed-off-by: Mark Brown Signed-off-by: Sasha Levin --- sound/soc/meson/g12a-toacodec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/meson/g12a-toacodec.c b/sound/soc/meson/g12a-toacodec.c index 3b1ce9143c65..8d8d848ebd58 100644 --- a/sound/soc/meson/g12a-toacodec.c +++ b/sound/soc/meson/g12a-toacodec.c @@ -104,7 +104,7 @@ static int g12a_toacodec_mux_put_enum(struct snd_kcontrol *kcontrol, snd_soc_dapm_mux_update_power(dapm, kcontrol, mux, e, NULL); - return 0; + return 1; } static SOC_ENUM_SINGLE_DECL(g12a_toacodec_mux_enum, TOACODEC_CTRL0, From 5735f529e318de4a798c50c2281f5686d05b22ab Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 3 Jan 2024 18:34:04 +0000 Subject: [PATCH 045/151] ASoC: meson: g12a-tohdmitx: Fix event generation for S/PDIF mux [ Upstream commit b036d8ef3120b996751495ce25994eea58032a98 ] When a control changes value the return value from _put() should be 1 so we get events generated to userspace notifying applications of the change. While the I2S mux gets this right the S/PDIF mux does not, fix the return value. 
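For reference, the kcontrol put() return convention this relies on (sketch):

	/* put() return convention:
	 *   1  - value changed; the core emits a notification event
	 *   0  - value unchanged; no event
	 *  <0  - error
	 */
	if (!changed)
		return 0;

	snd_soc_dapm_mux_update_power(dapm, kcontrol, mux, e, NULL);

	return 1;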
Fixes: c8609f3870f7 ("ASoC: meson: add g12a tohdmitx control") Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20240103-meson-enum-val-v1-4-424af7a8fb91@kernel.org Signed-off-by: Mark Brown Signed-off-by: Sasha Levin --- sound/soc/meson/g12a-tohdmitx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/meson/g12a-tohdmitx.c b/sound/soc/meson/g12a-tohdmitx.c index 46d1f04e0e8a..154c324fdd42 100644 --- a/sound/soc/meson/g12a-tohdmitx.c +++ b/sound/soc/meson/g12a-tohdmitx.c @@ -118,7 +118,7 @@ static int g12a_tohdmitx_spdif_mux_put_enum(struct snd_kcontrol *kcontrol, snd_soc_dapm_mux_update_power(dapm, kcontrol, mux, e, NULL); - return 0; + return 1; } static SOC_ENUM_SINGLE_DECL(g12a_tohdmitx_spdif_mux_enum, TOHDMITX_CTRL0, From 7663226274af14a55b3a95893f9ab13edec10d27 Mon Sep 17 00:00:00 2001 From: Andrii Staikov Date: Thu, 21 Dec 2023 14:27:35 +0100 Subject: [PATCH 046/151] i40e: Restore VF MSI-X state during PCI reset [ Upstream commit 371e576ff3e8580d91d49026e5d5faebf5565558 ] During a PCI FLR the MSI-X Enable flag in the VF PCI MSI-X capability register will be cleared. This can lead to issues when a VF is assigned to a VM because in these cases the VF driver receives no indication of the PF PCI error/reset and additionally it is incapable of restoring the cleared flag in the hypervisor configuration space without fully reinitializing the driver interrupt functionality. Since the VF driver is unable to easily resolve this condition on its own, restore the VF MSI-X flag during the PF PCI reset handling. Fixes: 19b7960b2da1 ("i40e: implement split PCI error reset handler") Co-developed-by: Karen Ostrowska Signed-off-by: Karen Ostrowska Co-developed-by: Mateusz Palczewski Signed-off-by: Mateusz Palczewski Reviewed-by: Wojciech Drewek Reviewed-by: Przemek Kitszel Signed-off-by: Andrii Staikov Tested-by: Rafal Romanowski Signed-off-by: Tony Nguyen Signed-off-by: Sasha Levin --- drivers/net/ethernet/intel/i40e/i40e_main.c | 3 +++ .../ethernet/intel/i40e/i40e_virtchnl_pf.c | 26 +++++++++++++++++++ .../ethernet/intel/i40e/i40e_virtchnl_pf.h | 3 +++ 3 files changed, 32 insertions(+) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index cdc68b78bd9e..63d43ef86f9b 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -16450,6 +16450,9 @@ static void i40e_pci_error_reset_done(struct pci_dev *pdev) return; i40e_reset_and_rebuild(pf, false, false); +#ifdef CONFIG_PCI_IOV + i40e_restore_all_vfs_msi_state(pdev); +#endif /* CONFIG_PCI_IOV */ } /** diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c index 3c38129a5224..c7d761426d6c 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c @@ -152,6 +152,32 @@ void i40e_vc_notify_reset(struct i40e_pf *pf) (u8 *)&pfe, sizeof(struct virtchnl_pf_event)); } +#ifdef CONFIG_PCI_IOV +void i40e_restore_all_vfs_msi_state(struct pci_dev *pdev) +{ + u16 vf_id; + u16 pos; + + /* Continue only if this is a PF */ + if (!pdev->is_physfn) + return; + + if (!pci_num_vf(pdev)) + return; + + pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_SRIOV); + if (pos) { + struct pci_dev *vf_dev = NULL; + + pci_read_config_word(pdev, pos + PCI_SRIOV_VF_DID, &vf_id); + while ((vf_dev = pci_get_device(pdev->vendor, vf_id, vf_dev))) { + if (vf_dev->is_virtfn && vf_dev->physfn == pdev) + pci_restore_msi_state(vf_dev); 
+ } + } +} +#endif /* CONFIG_PCI_IOV */ + /** * i40e_vc_notify_vf_reset * @vf: pointer to the VF structure diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h index 358bbdb58795..bd497cc5303a 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h @@ -135,6 +135,9 @@ int i40e_ndo_set_vf_spoofchk(struct net_device *netdev, int vf_id, bool enable); void i40e_vc_notify_link_state(struct i40e_pf *pf); void i40e_vc_notify_reset(struct i40e_pf *pf); +#ifdef CONFIG_PCI_IOV +void i40e_restore_all_vfs_msi_state(struct pci_dev *pdev); +#endif /* CONFIG_PCI_IOV */ int i40e_get_vf_stats(struct net_device *netdev, int vf_id, struct ifla_vf_stats *vf_stats); From 9b050429223739135467038d092d2d40139799bb Mon Sep 17 00:00:00 2001 From: Rodrigo Cataldo Date: Fri, 8 Dec 2023 15:58:16 +0100 Subject: [PATCH 047/151] igc: Fix hicredit calculation [ Upstream commit 947dfc8138dfaeb6e966e2d661de89eb203e3064 ] According to the Intel Software Manual for I225, Section 7.5.2.7, hicredit should be multiplied by the constant link-rate value, 0x7736. Currently, the old constant link-rate value, 0x7735, from the boards supported on igb are being used, most likely due to a copy'n'paste, as the rest of the logic is the same for both drivers. Update hicredit accordingly. Fixes: 1ab011b0bf07 ("igc: Add support for CBS offloading") Reviewed-by: Kurt Kanzenbach Signed-off-by: Rodrigo Cataldo Acked-by: Vinicius Costa Gomes Tested-by: Naama Meir Signed-off-by: Tony Nguyen Signed-off-by: Sasha Levin --- drivers/net/ethernet/intel/igc/igc_tsn.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/igc/igc_tsn.c b/drivers/net/ethernet/intel/igc/igc_tsn.c index 725db36e399d..31ea0781b65e 100644 --- a/drivers/net/ethernet/intel/igc/igc_tsn.c +++ b/drivers/net/ethernet/intel/igc/igc_tsn.c @@ -178,7 +178,7 @@ static int igc_tsn_enable_offload(struct igc_adapter *adapter) wr32(IGC_TQAVCC(i), tqavcc); wr32(IGC_TQAVHC(i), - 0x80000000 + ring->hicredit * 0x7735); + 0x80000000 + ring->hicredit * 0x7736); } else { /* Disable any CBS for the queue */ txqctl &= ~(IGC_TXQCTL_QAV_SEL_MASK); From 0af75845ff5e62370b602752b100740eff08ecc1 Mon Sep 17 00:00:00 2001 From: Dinghao Liu Date: Wed, 27 Dec 2023 15:02:27 +0800 Subject: [PATCH 048/151] net/qla3xxx: fix potential memleak in ql_alloc_buffer_queues [ Upstream commit 89f45c30172c80e55c887f32f1af8e184124577b ] When dma_alloc_coherent() fails, we should free qdev->lrg_buf to prevent potential memleak. 
Fixes: 1357bfcf7106 ("qla3xxx: Dynamically size the rx buffer queue based on the MTU.") Signed-off-by: Dinghao Liu Link: https://lore.kernel.org/r/20231227070227.10527-1-dinghao.liu@zju.edu.cn Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/net/ethernet/qlogic/qla3xxx.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/qlogic/qla3xxx.c b/drivers/net/ethernet/qlogic/qla3xxx.c index 0d57ffcedf0c..fc78bc959ded 100644 --- a/drivers/net/ethernet/qlogic/qla3xxx.c +++ b/drivers/net/ethernet/qlogic/qla3xxx.c @@ -2591,6 +2591,7 @@ static int ql_alloc_buffer_queues(struct ql3_adapter *qdev) if (qdev->lrg_buf_q_alloc_virt_addr == NULL) { netdev_err(qdev->ndev, "lBufQ failed\n"); + kfree(qdev->lrg_buf); return -ENOMEM; } qdev->lrg_buf_q_virt_addr = qdev->lrg_buf_q_alloc_virt_addr; @@ -2615,6 +2616,7 @@ static int ql_alloc_buffer_queues(struct ql3_adapter *qdev) qdev->lrg_buf_q_alloc_size, qdev->lrg_buf_q_alloc_virt_addr, qdev->lrg_buf_q_alloc_phy_addr); + kfree(qdev->lrg_buf); return -ENOMEM; } From 84c3833a93bb50296437569e6ba235e2b30f72d2 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Wed, 27 Dec 2023 15:40:35 +0800 Subject: [PATCH 049/151] net/smc: fix invalid link access in dumping SMC-R connections [ Upstream commit 9dbe086c69b8902c85cece394760ac212e9e4ccc ] A crash was found when dumping SMC-R connections. It can be reproduced by following steps: - environment: two RNICs on both sides. - run SMC-R between two sides, now a SMC_LGR_SYMMETRIC type link group will be created. - set the first RNIC down on either side and link group will turn to SMC_LGR_ASYMMETRIC_LOCAL then. - run 'smcss -R' and the crash will be triggered. BUG: kernel NULL pointer dereference, address: 0000000000000010 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 8000000101fdd067 P4D 8000000101fdd067 PUD 10ce46067 PMD 0 Oops: 0000 [#1] PREEMPT SMP PTI CPU: 3 PID: 1810 Comm: smcss Kdump: loaded Tainted: G W E 6.7.0-rc6+ #51 RIP: 0010:__smc_diag_dump.constprop.0+0x36e/0x620 [smc_diag] Call Trace: ? __die+0x24/0x70 ? page_fault_oops+0x66/0x150 ? exc_page_fault+0x69/0x140 ? asm_exc_page_fault+0x26/0x30 ? __smc_diag_dump.constprop.0+0x36e/0x620 [smc_diag] smc_diag_dump_proto+0xd0/0xf0 [smc_diag] smc_diag_dump+0x26/0x60 [smc_diag] netlink_dump+0x19f/0x320 __netlink_dump_start+0x1dc/0x300 smc_diag_handler_dump+0x6a/0x80 [smc_diag] ? __pfx_smc_diag_dump+0x10/0x10 [smc_diag] sock_diag_rcv_msg+0x121/0x140 ? __pfx_sock_diag_rcv_msg+0x10/0x10 netlink_rcv_skb+0x5a/0x110 sock_diag_rcv+0x28/0x40 netlink_unicast+0x22a/0x330 netlink_sendmsg+0x240/0x4a0 __sock_sendmsg+0xb0/0xc0 ____sys_sendmsg+0x24e/0x300 ? copy_msghdr_from_user+0x62/0x80 ___sys_sendmsg+0x7c/0xd0 ? __do_fault+0x34/0x1a0 ? do_read_fault+0x5f/0x100 ? do_fault+0xb0/0x110 __sys_sendmsg+0x4d/0x80 do_syscall_64+0x45/0xf0 entry_SYSCALL_64_after_hwframe+0x6e/0x76 When the first RNIC is set down, the lgr->lnk[0] will be cleared and an asymmetric link will be allocated in lgr->link[SMC_LINKS_PER_LGR_MAX - 1] by smc_llc_alloc_alt_link(). Then when we try to dump SMC-R connections in __smc_diag_dump(), the invalid lgr->lnk[0] will be accessed, resulting in this issue. So fix it by accessing the right link. 
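The shape of the fix, as a minimal sketch (struct and field names are abbreviated and illustrative; 3 stands in for SMC_LINKS_PER_LGR_MAX):

struct example_link { char ibname[64]; };

struct example_lgr {
	struct example_link lnk[3];
};

struct example_conn {
	struct example_lgr *lgr;
	struct example_link *lnk;	/* link currently used by this connection */
};

static const char *conn_ibname(const struct example_conn *conn)
{
	/*
	 * conn->lgr->lnk[0] may be a cleared slot after link failover;
	 * conn->lnk always points at the connection's valid link.
	 */
	return conn->lnk->ibname;
}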
Fixes: f16a7dd5cf27 ("smc: netlink interface for SMC sockets") Reported-by: henaumars Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=7616 Signed-off-by: Wen Gu Reviewed-by: Tony Lu Link: https://lore.kernel.org/r/1703662835-53416-1-git-send-email-guwen@linux.alibaba.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- net/smc/smc_diag.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c index 80ea7d954ece..801044e7d194 100644 --- a/net/smc/smc_diag.c +++ b/net/smc/smc_diag.c @@ -153,8 +153,7 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb, .lnk[0].link_id = link->link_id, }; - memcpy(linfo.lnk[0].ibname, - smc->conn.lgr->lnk[0].smcibdev->ibdev->name, + memcpy(linfo.lnk[0].ibname, link->smcibdev->ibdev->name, sizeof(link->smcibdev->ibdev->name)); smc_gid_be16_convert(linfo.lnk[0].gid, link->gid); smc_gid_be16_convert(linfo.lnk[0].peer_gid, link->peer_gid); From b67e7d78e48a47f2040b42b0926fa3ddd9cd5029 Mon Sep 17 00:00:00 2001 From: Naveen Mamindlapalli Date: Tue, 2 Jan 2024 15:26:43 +0530 Subject: [PATCH 050/151] octeontx2-af: Always configure NIX TX link credits based on max frame size [ Upstream commit a0d9528f6daf7fe8de217fa80a94d2989d2a57a7 ] Currently the NIX TX link credits are initialized based on the max frame size that can be transmitted on a link but when the MTU is changed, the NIX TX link credits are reprogrammed by the SW based on the new MTU value. Since SMQ max packet length is programmed to max frame size by default, there is a chance that NIX TX may stall while sending a max frame sized packet on the link with insufficient credits to send the packet all at once. This patch avoids stall issue by not changing the link credits dynamically when the MTU is changed. Fixes: 1c74b89171c3 ("octeontx2-af: Wait for TX link idle for credits change") Signed-off-by: Naveen Mamindlapalli Signed-off-by: Sunil Kovvuri Goutham Signed-off-by: Nithin Kumar Dabilpuram Signed-off-by: David S. 
Miller Signed-off-by: Sasha Levin --- .../ethernet/marvell/octeontx2/af/rvu_nix.c | 110 +----------------- 1 file changed, 3 insertions(+), 107 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c index 959f36efdc4a..15f698020ec4 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c @@ -3923,90 +3923,18 @@ static void nix_find_link_frs(struct rvu *rvu, req->minlen = minlen; } -static int -nix_config_link_credits(struct rvu *rvu, int blkaddr, int link, - u16 pcifunc, u64 tx_credits) -{ - struct rvu_hwinfo *hw = rvu->hw; - int pf = rvu_get_pf(pcifunc); - u8 cgx_id = 0, lmac_id = 0; - unsigned long poll_tmo; - bool restore_tx_en = 0; - struct nix_hw *nix_hw; - u64 cfg, sw_xoff = 0; - u32 schq = 0; - u32 credits; - int rc; - - nix_hw = get_nix_hw(rvu->hw, blkaddr); - if (!nix_hw) - return NIX_AF_ERR_INVALID_NIXBLK; - - if (tx_credits == nix_hw->tx_credits[link]) - return 0; - - /* Enable cgx tx if disabled for credits to be back */ - if (is_pf_cgxmapped(rvu, pf)) { - rvu_get_cgx_lmac_id(rvu->pf2cgxlmac_map[pf], &cgx_id, &lmac_id); - restore_tx_en = !rvu_cgx_config_tx(rvu_cgx_pdata(cgx_id, rvu), - lmac_id, true); - } - - mutex_lock(&rvu->rsrc_lock); - /* Disable new traffic to link */ - if (hw->cap.nix_shaping) { - schq = nix_get_tx_link(rvu, pcifunc); - sw_xoff = rvu_read64(rvu, blkaddr, NIX_AF_TL1X_SW_XOFF(schq)); - rvu_write64(rvu, blkaddr, - NIX_AF_TL1X_SW_XOFF(schq), BIT_ULL(0)); - } - - rc = NIX_AF_ERR_LINK_CREDITS; - poll_tmo = jiffies + usecs_to_jiffies(200000); - /* Wait for credits to return */ - do { - if (time_after(jiffies, poll_tmo)) - goto exit; - usleep_range(100, 200); - - cfg = rvu_read64(rvu, blkaddr, - NIX_AF_TX_LINKX_NORM_CREDIT(link)); - credits = (cfg >> 12) & 0xFFFFFULL; - } while (credits != nix_hw->tx_credits[link]); - - cfg &= ~(0xFFFFFULL << 12); - cfg |= (tx_credits << 12); - rvu_write64(rvu, blkaddr, NIX_AF_TX_LINKX_NORM_CREDIT(link), cfg); - rc = 0; - - nix_hw->tx_credits[link] = tx_credits; - -exit: - /* Enable traffic back */ - if (hw->cap.nix_shaping && !sw_xoff) - rvu_write64(rvu, blkaddr, NIX_AF_TL1X_SW_XOFF(schq), 0); - - /* Restore state of cgx tx */ - if (restore_tx_en) - rvu_cgx_config_tx(rvu_cgx_pdata(cgx_id, rvu), lmac_id, false); - - mutex_unlock(&rvu->rsrc_lock); - return rc; -} - int rvu_mbox_handler_nix_set_hw_frs(struct rvu *rvu, struct nix_frs_cfg *req, struct msg_rsp *rsp) { struct rvu_hwinfo *hw = rvu->hw; u16 pcifunc = req->hdr.pcifunc; int pf = rvu_get_pf(pcifunc); - int blkaddr, schq, link = -1; - struct nix_txsch *txsch; - u64 cfg, lmac_fifo_len; + int blkaddr, link = -1; struct nix_hw *nix_hw; struct rvu_pfvf *pfvf; u8 cgx = 0, lmac = 0; u16 max_mtu; + u64 cfg; blkaddr = rvu_get_blkaddr(rvu, BLKTYPE_NIX, pcifunc); if (blkaddr < 0) @@ -4027,25 +3955,6 @@ int rvu_mbox_handler_nix_set_hw_frs(struct rvu *rvu, struct nix_frs_cfg *req, if (req->update_minlen && req->minlen < NIC_HW_MIN_FRS) return NIX_AF_ERR_FRS_INVALID; - /* Check if requester wants to update SMQ's */ - if (!req->update_smq) - goto rx_frscfg; - - /* Update min/maxlen in each of the SMQ attached to this PF/VF */ - txsch = &nix_hw->txsch[NIX_TXSCH_LVL_SMQ]; - mutex_lock(&rvu->rsrc_lock); - for (schq = 0; schq < txsch->schq.max; schq++) { - if (TXSCH_MAP_FUNC(txsch->pfvf_map[schq]) != pcifunc) - continue; - cfg = rvu_read64(rvu, blkaddr, NIX_AF_SMQX_CFG(schq)); - cfg = (cfg & ~(0xFFFFULL << 8)) | ((u64)req->maxlen << 8); - if 
(req->update_minlen) - cfg = (cfg & ~0x7FULL) | ((u64)req->minlen & 0x7F); - rvu_write64(rvu, blkaddr, NIX_AF_SMQX_CFG(schq), cfg); - } - mutex_unlock(&rvu->rsrc_lock); - -rx_frscfg: /* Check if config is for SDP link */ if (req->sdp_link) { if (!hw->sdp_links) @@ -4068,7 +3977,6 @@ rx_frscfg: if (link < 0) return NIX_AF_ERR_RX_LINK_INVALID; - linkcfg: nix_find_link_frs(rvu, req, pcifunc); @@ -4078,19 +3986,7 @@ linkcfg: cfg = (cfg & ~0xFFFFULL) | req->minlen; rvu_write64(rvu, blkaddr, NIX_AF_RX_LINKX_CFG(link), cfg); - if (req->sdp_link || pf == 0) - return 0; - - /* Update transmit credits for CGX links */ - lmac_fifo_len = rvu_cgx_get_lmac_fifolen(rvu, cgx, lmac); - if (!lmac_fifo_len) { - dev_err(rvu->dev, - "%s: Failed to get CGX/RPM%d:LMAC%d FIFO size\n", - __func__, cgx, lmac); - return 0; - } - return nix_config_link_credits(rvu, blkaddr, link, pcifunc, - (lmac_fifo_len - req->maxlen) / 16); + return 0; } int rvu_mbox_handler_nix_set_rx_cfg(struct rvu *rvu, struct nix_rx_cfg *req, From db9c4a1f37ee93fdcf34c3f86037daad4cafd16e Mon Sep 17 00:00:00 2001 From: Naveen Mamindlapalli Date: Tue, 2 Jan 2024 19:44:00 +0530 Subject: [PATCH 051/151] octeontx2-af: Re-enable MAC TX in otx2_stop processing [ Upstream commit 818ed8933bd17bc91a9fa8b94a898189c546fc1a ] During QoS scheduling testing with multiple strict priority flows, the netdev tx watchdog timeout routine is invoked when a low priority QoS queue doesn't get a chance to transmit the packets because other high priority flows are completely subscribing the transmit link. The netdev tx watchdog timeout routine will stop MAC RX and TX functionality in the otx2_stop() routine before cleanup of HW TX queues, which results in SMQ flush errors because the packets belonging to low priority queues will never get flushed since MAC TX is disabled. This patch fixes the issue by re-enabling MAC TX to ensure the packets in the HW pipeline get flushed properly. Fixes: a7faa68b4e7f ("octeontx2-af: Start/Stop traffic in CGX along with NPC") Signed-off-by: Naveen Mamindlapalli Signed-off-by: Sunil Kovvuri Goutham Signed-off-by: David S.
Miller Signed-off-by: Sasha Levin --- drivers/net/ethernet/marvell/octeontx2/af/rvu.h | 1 + .../net/ethernet/marvell/octeontx2/af/rvu_cgx.c | 17 +++++++++++++++++ .../net/ethernet/marvell/octeontx2/af/rvu_nix.c | 8 +++++++- 3 files changed, 25 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h index 95a7bc396e8e..ab78e9d02075 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h @@ -850,6 +850,7 @@ u32 rvu_cgx_get_fifolen(struct rvu *rvu); void *rvu_first_cgx_pdata(struct rvu *rvu); int cgxlmac_to_pf(struct rvu *rvu, int cgx_id, int lmac_id); int rvu_cgx_config_tx(void *cgxd, int lmac_id, bool enable); +int rvu_cgx_tx_enable(struct rvu *rvu, u16 pcifunc, bool enable); int rvu_cgx_prio_flow_ctrl_cfg(struct rvu *rvu, u16 pcifunc, u8 tx_pause, u8 rx_pause, u16 pfc_en); int rvu_cgx_cfg_pause_frm(struct rvu *rvu, u16 pcifunc, u8 tx_pause, u8 rx_pause); diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c index c60b9580ca96..fa658bd4dfb3 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c @@ -456,6 +456,23 @@ int rvu_cgx_config_rxtx(struct rvu *rvu, u16 pcifunc, bool start) return mac_ops->mac_rx_tx_enable(cgxd, lmac_id, start); } +int rvu_cgx_tx_enable(struct rvu *rvu, u16 pcifunc, bool enable) +{ + int pf = rvu_get_pf(pcifunc); + struct mac_ops *mac_ops; + u8 cgx_id, lmac_id; + void *cgxd; + + if (!is_cgx_config_permitted(rvu, pcifunc)) + return LMAC_AF_ERR_PERM_DENIED; + + rvu_get_cgx_lmac_id(rvu->pf2cgxlmac_map[pf], &cgx_id, &lmac_id); + cgxd = rvu_cgx_pdata(cgx_id, rvu); + mac_ops = get_mac_ops(cgxd); + + return mac_ops->mac_tx_enable(cgxd, lmac_id, enable); +} + int rvu_cgx_config_tx(void *cgxd, int lmac_id, bool enable) { struct mac_ops *mac_ops; diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c index 15f698020ec4..7f9581ce7f1f 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c @@ -4506,7 +4506,13 @@ int rvu_mbox_handler_nix_lf_stop_rx(struct rvu *rvu, struct msg_req *req, pfvf = rvu_get_pfvf(rvu, pcifunc); clear_bit(NIXLF_INITIALIZED, &pfvf->flags); - return rvu_cgx_start_stop_io(rvu, pcifunc, false); + err = rvu_cgx_start_stop_io(rvu, pcifunc, false); + if (err) + return err; + + rvu_cgx_tx_enable(rvu, pcifunc, true); + + return 0; } #define RX_SA_BASE GENMASK_ULL(52, 7) From 8a09b0f01c404cc59a3b64a4136c4ef801fe4846 Mon Sep 17 00:00:00 2001 From: Chen Ni Date: Wed, 3 Jan 2024 03:35:34 +0000 Subject: [PATCH 052/151] asix: Add check for usbnet_get_endpoints [ Upstream commit eaac6a2d26b65511e164772bec6918fcbc61938e ] Add check for usbnet_get_endpoints() and return the error if it fails in order to transfer the error. Fixes: 16626b0cc3d5 ("asix: Add a new driver for the AX88172A") Signed-off-by: Chen Ni Signed-off-by: David S. 
Miller Signed-off-by: Sasha Levin --- drivers/net/usb/ax88172a.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/usb/ax88172a.c b/drivers/net/usb/ax88172a.c index 3777c7e2e6fc..e47bb125048d 100644 --- a/drivers/net/usb/ax88172a.c +++ b/drivers/net/usb/ax88172a.c @@ -161,7 +161,9 @@ static int ax88172a_bind(struct usbnet *dev, struct usb_interface *intf) u8 buf[ETH_ALEN]; struct ax88172a_private *priv; - usbnet_get_endpoints(dev, intf); + ret = usbnet_get_endpoints(dev, intf); + if (ret) + return ret; priv = kzalloc(sizeof(*priv), GFP_KERNEL); if (!priv) From 55fbcd83aacac2cdaeec1bf8844a5721f6daa303 Mon Sep 17 00:00:00 2001 From: Claudiu Beznea Date: Wed, 3 Jan 2024 10:13:53 +0200 Subject: [PATCH 053/151] net: ravb: Wait for operating mode to be applied [ Upstream commit 9039cd4c61635b2d541009a7cd5e2cc052402f28 ] CSR.OPS bits specify the current operating mode and (according to documentation) they are updated by HW when the operating mode change request is processed. To comply with this, check CSR.OPS before proceeding. This commit introduces ravb_set_opmode(), which does everything needed to set the operating mode (set CCC.OPC (and CCC.GAC, CCC.CSEL, if any) and wait for CSR.OPS), and calls it where needed. This should comply with all the HW manuals' requirements, as different manual variants specify that different modes need to be checked in CSR.OPS when setting CCC.OPC. If gPTP active in config mode is supported and it needs to be enabled, CCC.GAC and CCC.CSEL need to be configured along with CCC.OPC in the same write access. For this, ravb_set_opmode() allows passing GAC and CSEL as part of opmode, and the function updates the CCC register accordingly. Fixes: c156633f1353 ("Renesas Ethernet AVB driver proper") Signed-off-by: Claudiu Beznea Reviewed-by: Sergey Shtylyov Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/ethernet/renesas/ravb_main.c | 65 +++++++++++++++--------- 1 file changed, 42 insertions(+), 23 deletions(-) diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c index 68cb5616ef99..c2c56a5289ca 100644 --- a/drivers/net/ethernet/renesas/ravb_main.c +++ b/drivers/net/ethernet/renesas/ravb_main.c @@ -68,16 +68,27 @@ int ravb_wait(struct net_device *ndev, enum ravb_reg reg, u32 mask, u32 value) return -ETIMEDOUT; } -static int ravb_config(struct net_device *ndev) +static int ravb_set_opmode(struct net_device *ndev, u32 opmode) { + u32 csr_ops = 1U << (opmode & CCC_OPC); + u32 ccc_mask = CCC_OPC; int error; - /* Set config mode */ - ravb_modify(ndev, CCC, CCC_OPC, CCC_OPC_CONFIG); - /* Check if the operating mode is changed to the config mode */ - error = ravb_wait(ndev, CSR, CSR_OPS, CSR_OPS_CONFIG); - if (error) - netdev_err(ndev, "failed to switch device to config mode\n"); + /* If gPTP active in config mode is supported it needs to be configured + * along with CSEL and operating mode in the same access. This is a + * hardware limitation.
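+ * Widening ccc_mask below lets the single ravb_modify() call update + * CCC.GAC and CCC.CSEL together with CCC.OPC.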
+ */ + if (opmode & CCC_GAC) + ccc_mask |= CCC_GAC | CCC_CSEL; + + /* Set operating mode */ + ravb_modify(ndev, CCC, ccc_mask, opmode); + /* Check if the operating mode is changed to the requested one */ + error = ravb_wait(ndev, CSR, CSR_OPS, csr_ops); + if (error) { + netdev_err(ndev, "failed to switch device to requested mode (%u)\n", + opmode & CCC_OPC); + } return error; } @@ -675,7 +686,7 @@ static int ravb_dmac_init(struct net_device *ndev) int error; /* Set CONFIG mode */ - error = ravb_config(ndev); + error = ravb_set_opmode(ndev, CCC_OPC_CONFIG); if (error) return error; @@ -684,9 +695,7 @@ static int ravb_dmac_init(struct net_device *ndev) return error; /* Setting the control will start the AVB-DMAC process. */ - ravb_modify(ndev, CCC, CCC_OPC, CCC_OPC_OPERATION); - - return 0; + return ravb_set_opmode(ndev, CCC_OPC_OPERATION); } static void ravb_get_tx_tstamp(struct net_device *ndev) @@ -1048,7 +1057,7 @@ static int ravb_stop_dma(struct net_device *ndev) return error; /* Stop AVB-DMAC process */ - return ravb_config(ndev); + return ravb_set_opmode(ndev, CCC_OPC_CONFIG); } /* E-MAC interrupt handler */ @@ -2576,21 +2585,25 @@ static int ravb_set_gti(struct net_device *ndev) return 0; } -static void ravb_set_config_mode(struct net_device *ndev) +static int ravb_set_config_mode(struct net_device *ndev) { struct ravb_private *priv = netdev_priv(ndev); const struct ravb_hw_info *info = priv->info; + int error; if (info->gptp) { - ravb_modify(ndev, CCC, CCC_OPC, CCC_OPC_CONFIG); + error = ravb_set_opmode(ndev, CCC_OPC_CONFIG); + if (error) + return error; /* Set CSEL value */ ravb_modify(ndev, CCC, CCC_CSEL, CCC_CSEL_HPB); } else if (info->ccc_gac) { - ravb_modify(ndev, CCC, CCC_OPC, CCC_OPC_CONFIG | - CCC_GAC | CCC_CSEL_HPB); + error = ravb_set_opmode(ndev, CCC_OPC_CONFIG | CCC_GAC | CCC_CSEL_HPB); } else { - ravb_modify(ndev, CCC, CCC_OPC, CCC_OPC_CONFIG); + error = ravb_set_opmode(ndev, CCC_OPC_CONFIG); } + + return error; } /* Set tx and rx clock internal delay modes */ @@ -2810,7 +2823,9 @@ static int ravb_probe(struct platform_device *pdev) ndev->ethtool_ops = &ravb_ethtool_ops; /* Set AVB config mode */ - ravb_set_config_mode(ndev); + error = ravb_set_config_mode(ndev); + if (error) + goto out_disable_gptp_clk; if (info->gptp || info->ccc_gac) { /* Set GTI value */ @@ -2933,8 +2948,7 @@ static int ravb_remove(struct platform_device *pdev) dma_free_coherent(ndev->dev.parent, priv->desc_bat_size, priv->desc_bat, priv->desc_bat_dma); - /* Set reset mode */ - ravb_write(ndev, CCC_OPC_RESET, CCC); + ravb_set_opmode(ndev, CCC_OPC_RESET); clk_disable_unprepare(priv->gptp_clk); clk_disable_unprepare(priv->refclk); @@ -3018,8 +3032,11 @@ static int __maybe_unused ravb_resume(struct device *dev) int ret = 0; /* If WoL is enabled set reset mode to rearm the WoL logic */ - if (priv->wol_enabled) - ravb_write(ndev, CCC_OPC_RESET, CCC); + if (priv->wol_enabled) { + ret = ravb_set_opmode(ndev, CCC_OPC_RESET); + if (ret) + return ret; + } /* All register have been reset to default values. 
* Restore all registers which where setup at probe time and @@ -3027,7 +3044,9 @@ static int __maybe_unused ravb_resume(struct device *dev) */ /* Set AVB config mode */ - ravb_set_config_mode(ndev); + ret = ravb_set_config_mode(ndev); + if (ret) + return ret; if (info->gptp || info->ccc_gac) { /* Set GTI value */ From 14937f47a48f4c4ec2d344f878e03b96324894ad Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Wed, 3 Jan 2024 16:59:24 -0800 Subject: [PATCH 054/151] bnxt_en: Remove mis-applied code from bnxt_cfg_ntp_filters() [ Upstream commit e009b2efb7a8850498796b360043ac25c8d3d28f ] The 2 lines to check for the BNXT_HWRM_PF_UNLOAD_SP_EVENT bit was mis-applied to bnxt_cfg_ntp_filters() and should have been applied to bnxt_sp_task(). Fixes: 19241368443f ("bnxt_en: Send PF driver unload notification to all VFs.") Reviewed-by: Andy Gospodarek Signed-off-by: Michael Chan Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 623cdeb29ed9..df4d88d35701 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -12081,6 +12081,8 @@ static void bnxt_sp_task(struct work_struct *work) bnxt_cfg_ntp_filters(bp); if (test_and_clear_bit(BNXT_HWRM_EXEC_FWD_REQ_SP_EVENT, &bp->sp_event)) bnxt_hwrm_exec_fwd_req(bp); + if (test_and_clear_bit(BNXT_HWRM_PF_UNLOAD_SP_EVENT, &bp->sp_event)) + netdev_info(bp->dev, "Receive PF driver unload event!\n"); if (test_and_clear_bit(BNXT_PERIODIC_STATS_SP_EVENT, &bp->sp_event)) { bnxt_hwrm_port_qstats(bp, 0); bnxt_hwrm_port_qstats_ext(bp, 0); @@ -13059,8 +13061,6 @@ static void bnxt_cfg_ntp_filters(struct bnxt *bp) } } } - if (test_and_clear_bit(BNXT_HWRM_PF_UNLOAD_SP_EVENT, &bp->sp_event)) - netdev_info(bp->dev, "Receive PF driver unload event!\n"); } #else From c38c5cfd3ed7c59fc03134f6a3e53ed440922fdc Mon Sep 17 00:00:00 2001 From: Thomas Lange Date: Thu, 4 Jan 2024 09:57:44 +0100 Subject: [PATCH 055/151] net: Implement missing SO_TIMESTAMPING_NEW cmsg support [ Upstream commit 382a32018b74f407008615e0e831d05ed28e81cd ] Commit 9718475e6908 ("socket: Add SO_TIMESTAMPING_NEW") added the new socket option SO_TIMESTAMPING_NEW. However, it was never implemented in __sock_cmsg_send thus breaking SO_TIMESTAMPING cmsg for platforms using SO_TIMESTAMPING_NEW. 
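For reference, a minimal userspace sketch of the per-call cmsg usage this fix makes work on SO_TIMESTAMPING_NEW platforms (error handling omitted; the u32 payload size matches the CMSG_LEN(sizeof(u32)) check in __sock_cmsg_send()):

#include <string.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <linux/net_tstamp.h>

static ssize_t send_with_tstamp(int fd, const void *buf, size_t len)
{
	char control[CMSG_SPACE(sizeof(__u32))];
	struct iovec iov = { .iov_base = (void *)buf, .iov_len = len };
	struct msghdr msg = {
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = control, .msg_controllen = sizeof(control),
	};
	struct cmsghdr *cmsg;

	memset(control, 0, sizeof(control));
	cmsg = CMSG_FIRSTHDR(&msg);
	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type = SO_TIMESTAMPING;	/* _OLD or _NEW per arch/libc */
	cmsg->cmsg_len = CMSG_LEN(sizeof(__u32));
	*(__u32 *)CMSG_DATA(cmsg) = SOF_TIMESTAMPING_TX_SOFTWARE;

	return sendmsg(fd, &msg, 0);
}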
Fixes: 9718475e6908 ("socket: Add SO_TIMESTAMPING_NEW") Link: https://lore.kernel.org/netdev/6a7281bf-bc4a-4f75-bb88-7011908ae471@app.fastmail.com/ Signed-off-by: Thomas Lange Reviewed-by: Willem de Bruijn Link: https://lore.kernel.org/r/20240104085744.49164-1-thomas@corelatus.se Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- net/core/sock.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/core/sock.c b/net/core/sock.c index 0d8754ec837d..c50a14a02edd 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2771,6 +2771,7 @@ int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg, sockc->mark = *(u32 *)CMSG_DATA(cmsg); break; case SO_TIMESTAMPING_OLD: + case SO_TIMESTAMPING_NEW: if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) return -EINVAL; From a364c18553d0ffb16c7bf7c7d0c71d9564afe156 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Thu, 14 Dec 2023 15:19:30 +0500 Subject: [PATCH 056/151] selftests: secretmem: floor the memory size to the multiple of page_size [ Upstream commit 0aac13add26d546ac74c89d2883b3a5f0fbea039 ] The "locked-in-memory size" limit per process can be a non-multiple of the page_size. mmap() fails if we try to allocate locked-in memory of the same size as the allowed limit when that limit isn't a multiple of the page_size, because mmap() rounds the size to be allocated up to the next multiple of the page_size. Fix this by flooring the length to be allocated with mmap() to the previous multiple of the page_size. This was getting triggered on KernelCI regularly because of different ulimit settings, which weren't a multiple of the page_size. Find logs here: https://linux.kernelci.org/test/plan/id/657654bd8e81e654fae13532/ The bug was present from the time the test was first added. Link: https://lkml.kernel.org/r/20231214101931.1155586-1-usama.anjum@collabora.com Fixes: 76fe17ef588a ("secretmem: test: add basic selftest for memfd_secret(2)") Signed-off-by: Muhammad Usama Anjum Reported-by: "kernelci.org bot" Closes: https://linux.kernelci.org/test/plan/id/657654bd8e81e654fae13532/ Cc: "James E.J. Bottomley" Cc: Mike Rapoport (IBM) Cc: Shuah Khan Cc: Signed-off-by: Andrew Morton Signed-off-by: Sasha Levin --- tools/testing/selftests/vm/memfd_secret.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/testing/selftests/vm/memfd_secret.c b/tools/testing/selftests/vm/memfd_secret.c index 957b9e18c729..9b298f6a04b3 100644 --- a/tools/testing/selftests/vm/memfd_secret.c +++ b/tools/testing/selftests/vm/memfd_secret.c @@ -62,6 +62,9 @@ static void test_mlock_limit(int fd) char *mem; len = mlock_limit_cur; + if (len % page_size != 0) + len = (len/page_size) * page_size; + mem = mmap(NULL, len, prot, mode, fd, 0); if (mem == MAP_FAILED) { fail("unable to mmap secret memory\n"); From 482fa21635c8832db022cd2d649db26b8e6170ac Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 5 Jul 2023 16:51:39 +0200 Subject: [PATCH 057/151] cpu/SMT: Create topology_smt_thread_allowed() [ Upstream commit 38253464bc821d6de6bba81bb1412ebb36f6cbd1 ] Some architectures allow partial SMT states, i.e. when not all SMT threads are brought online. To support that, add an architecture helper which checks whether a given CPU is allowed to be brought online depending on how many SMT threads are currently enabled. Since this is only applicable to architectures supporting partial SMT, only these architectures should select the new configuration variable CONFIG_SMT_NUM_THREADS_DYNAMIC.
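For illustration, such a helper can be as small as the sketch below, loosely modeled on the powerpc implementation; thread_index_in_core() is a hypothetical stand-in for the architecture's topology accessor and cpu_smt_num_threads for its count of currently enabled threads:

static inline bool topology_smt_thread_allowed(unsigned int cpu)
{
	/* Only the first cpu_smt_num_threads threads of each core may boot. */
	return thread_index_in_core(cpu) < cpu_smt_num_threads;
}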
For the other architectures, not supporting the partial SMT states, there is no need to define topology_smt_thread_allowed(); the generic code assumes that either all the threads or only the primary ones are allowed. Call the helper from cpu_smt_enable(), and cpu_smt_allowed() when SMT is enabled, to check if the particular thread should be onlined. Notably, also call it from cpu_smt_disable() if CPU_SMT_ENABLED, to allow offlining some threads to move from a higher to a lower number of threads online. [ ldufour: Slightly reword the commit's description ] [ ldufour: Introduce CONFIG_SMT_NUM_THREADS_DYNAMIC ] Suggested-by: Thomas Gleixner Signed-off-by: Michael Ellerman Signed-off-by: Laurent Dufour Signed-off-by: Thomas Gleixner Tested-by: Zhang Rui Link: https://lore.kernel.org/r/20230705145143.40545-7-ldufour@linux.ibm.com Stable-dep-of: d91bdd96b55c ("cpu/SMT: Make SMT control more robust against enumeration failures") Signed-off-by: Sasha Levin --- arch/Kconfig | 3 +++ kernel/cpu.c | 24 +++++++++++++++++++++++- 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/arch/Kconfig b/arch/Kconfig index b60d271bf76a..14273a6203df 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -34,6 +34,9 @@ config ARCH_HAS_SUBPAGE_FAULTS config HOTPLUG_SMT bool +config SMT_NUM_THREADS_DYNAMIC + bool + config GENERIC_ENTRY bool diff --git a/kernel/cpu.c b/kernel/cpu.c index 551468d9c5a8..c37f1758a486 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -446,9 +446,23 @@ static int __init smt_cmdline_disable(char *str) } early_param("nosmt", smt_cmdline_disable); +/* + * For architectures supporting partial SMT states, check if the thread is allowed. + * Otherwise this has already been checked through cpu_smt_max_threads when + * setting the SMT level. + */ +static inline bool cpu_smt_thread_allowed(unsigned int cpu) +{ +#ifdef CONFIG_SMT_NUM_THREADS_DYNAMIC + return topology_smt_thread_allowed(cpu); +#else + return true; +#endif +} + static inline bool cpu_smt_allowed(unsigned int cpu) { - if (cpu_smt_control == CPU_SMT_ENABLED) + if (cpu_smt_control == CPU_SMT_ENABLED && cpu_smt_thread_allowed(cpu)) return true; if (topology_is_primary_thread(cpu)) @@ -2294,6 +2308,12 @@ int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) for_each_online_cpu(cpu) { if (topology_is_primary_thread(cpu)) continue; + /* + * Disable can be called with CPU_SMT_ENABLED when changing + * from a higher to lower number of SMT threads per core. + */ + if (ctrlval == CPU_SMT_ENABLED && cpu_smt_thread_allowed(cpu)) + continue; ret = cpu_down_maps_locked(cpu, CPUHP_OFFLINE); if (ret) break; @@ -2328,6 +2348,8 @@ int cpuhp_smt_enable(void) /* Skip online CPUs and CPUs on offline nodes */ if (cpu_online(cpu) || !node_online(cpu_to_node(cpu))) continue; + if (!cpu_smt_thread_allowed(cpu)) + continue; ret = _cpu_up(cpu, 0, CPUHP_ONLINE); if (ret) break; From abc3e3fb71a553c53e7009a7bef62ae31d9d0f25 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 14 Aug 2023 10:18:27 +0200 Subject: [PATCH 058/151] cpu/SMT: Make SMT control more robust against enumeration failures [ Upstream commit d91bdd96b55cc3ce98d883a60f133713821b80a6 ] The SMT control mechanism got added as speculation attack vector mitigation. The implemented logic relies on the primary thread mask to be set up properly. This turns out to be an issue with XEN/PV guests because their CPU hotplug mechanics do not enumerate APICs and therefore the mask is never correctly populated. This went unnoticed so far because by chance XEN/PV ends up with smp_num_siblings == 2.
So smt_hotplug_control stays at its default value CPU_SMT_ENABLED and the primary thread mask is never evaluated in the context of CPU hotplug. This stopped "working" with the upcoming overhaul of the topology evaluation which legitimately provides a fake topology for XEN/PV. That sets smp_num_siblings to 1, which causes the core CPU hot-plug core to refuse to bring up the APs. This happens because smt_hotplug_control is set to CPU_SMT_NOT_SUPPORTED which causes cpu_smt_allowed() to evaluate the unpopulated primary thread mask with the conclusion that all non-boot CPUs are not valid to be plugged. Make cpu_smt_allowed() more robust and take CPU_SMT_NOT_SUPPORTED and CPU_SMT_NOT_IMPLEMENTED into account. Rename it to cpu_bootable() while at it as that makes it more clear what the function is about. The primary mask issue on x86 XEN/PV needs to be addressed separately as there are users outside of the CPU hotplug code too. Fixes: 05736e4ac13c ("cpu/hotplug: Provide knobs to control SMT") Reported-by: Juergen Gross Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Tested-by: Sohil Mehta Tested-by: Michael Kelley Tested-by: Peter Zijlstra (Intel) Tested-by: Zhang Rui Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20230814085112.149440843@linutronix.de Signed-off-by: Sasha Levin --- kernel/cpu.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/kernel/cpu.c b/kernel/cpu.c index c37f1758a486..e6f0101941ed 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -460,11 +460,19 @@ static inline bool cpu_smt_thread_allowed(unsigned int cpu) #endif } -static inline bool cpu_smt_allowed(unsigned int cpu) +static inline bool cpu_bootable(unsigned int cpu) { if (cpu_smt_control == CPU_SMT_ENABLED && cpu_smt_thread_allowed(cpu)) return true; + /* All CPUs are bootable if controls are not configured */ + if (cpu_smt_control == CPU_SMT_NOT_IMPLEMENTED) + return true; + + /* All CPUs are bootable if CPU is not SMT capable */ + if (cpu_smt_control == CPU_SMT_NOT_SUPPORTED) + return true; + if (topology_is_primary_thread(cpu)) return true; @@ -485,7 +493,7 @@ bool cpu_smt_possible(void) } EXPORT_SYMBOL_GPL(cpu_smt_possible); #else -static inline bool cpu_smt_allowed(unsigned int cpu) { return true; } +static inline bool cpu_bootable(unsigned int cpu) { return true; } #endif static inline enum cpuhp_state @@ -588,10 +596,10 @@ static int bringup_wait_for_ap(unsigned int cpu) * SMT soft disabling on X86 requires to bring the CPU out of the * BIOS 'wait for SIPI' state in order to set the CR4.MCE bit. The * CPU marked itself as booted_once in notify_cpu_starting() so the - * cpu_smt_allowed() check will now return false if this is not the + * cpu_bootable() check will now return false if this is not the * primary sibling. */ - if (!cpu_smt_allowed(cpu)) + if (!cpu_bootable(cpu)) return -ECANCELED; if (st->target <= CPUHP_AP_ONLINE_IDLE) @@ -1478,7 +1486,7 @@ static int cpu_up(unsigned int cpu, enum cpuhp_state target) err = -EBUSY; goto out; } - if (!cpu_smt_allowed(cpu)) { + if (!cpu_bootable(cpu)) { err = -EPERM; goto out; } From 5573fdbc3423475aae4b0c2e3b0076d6216e9ed1 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 4 Oct 2023 01:28:59 +0200 Subject: [PATCH 059/151] srcu: Fix callbacks acceleration mishandling [ Upstream commit 4a8e65b0c348e42107c64381e692e282900be361 ] SRCU callbacks acceleration might fail if the preceding callbacks advance also fails. 
This can happen when the following steps are met: 1) The RCU_WAIT_TAIL segment has callbacks (say for gp_num 8) and the RCU_NEXT_READY_TAIL also has callbacks (say for gp_num 12). 2) The grace period for RCU_WAIT_TAIL is observed as started but not yet completed so rcu_seq_current() returns 4 + SRCU_STATE_SCAN1 = 5. 3) This value is passed to rcu_segcblist_advance() which can't move any segment forward and fails. 4) srcu_gp_start_if_needed() still proceeds with callback acceleration. But then the call to rcu_seq_snap() observes the grace period for the RCU_WAIT_TAIL segment (gp_num 8) as completed and the subsequent one for the RCU_NEXT_READY_TAIL segment as started (ie: 8 + SRCU_STATE_SCAN1 = 9) so it returns a snapshot of the next grace period, which is 16. 5) The value of 16 is passed to rcu_segcblist_accelerate() but the freshly enqueued callback in RCU_NEXT_TAIL can't move to RCU_NEXT_READY_TAIL which already has callbacks for a previous grace period (gp_num = 12). So acceleration fails. 6) Note in all these steps, srcu_invoke_callbacks() hadn't had a chance to run srcu_invoke_callbacks(). Then some very bad outcome may happen if the following happens: 7) Some other CPU races and starts the grace period number 16 before the CPU handling previous steps had a chance. Therefore srcu_gp_start() isn't called on the latter sdp to fix the acceleration leak from previous steps with a new pair of call to advance/accelerate. 8) The grace period 16 completes and srcu_invoke_callbacks() is finally called. All the callbacks from previous grace periods (8 and 12) are correctly advanced and executed but callbacks in RCU_NEXT_READY_TAIL still remain. Then rcu_segcblist_accelerate() is called with a snaphot of 20. 9) Since nothing started the grace period number 20, callbacks stay unhandled. This has been reported in real load: [3144162.608392] INFO: task kworker/136:12:252684 blocked for more than 122 seconds. [3144162.615986] Tainted: G O K 5.4.203-1-tlinux4-0011.1 #1 [3144162.623053] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [3144162.631162] kworker/136:12 D 0 252684 2 0x90004000 [3144162.631189] Workqueue: kvm-irqfd-cleanup irqfd_shutdown [kvm] [3144162.631192] Call Trace: [3144162.631202] __schedule+0x2ee/0x660 [3144162.631206] schedule+0x33/0xa0 [3144162.631209] schedule_timeout+0x1c4/0x340 [3144162.631214] ? update_load_avg+0x82/0x660 [3144162.631217] ? raw_spin_rq_lock_nested+0x1f/0x30 [3144162.631218] wait_for_completion+0x119/0x180 [3144162.631220] ? wake_up_q+0x80/0x80 [3144162.631224] __synchronize_srcu.part.19+0x81/0xb0 [3144162.631226] ? __bpf_trace_rcu_utilization+0x10/0x10 [3144162.631227] synchronize_srcu+0x5f/0xc0 [3144162.631236] irqfd_shutdown+0x3c/0xb0 [kvm] [3144162.631239] ? __schedule+0x2f6/0x660 [3144162.631243] process_one_work+0x19a/0x3a0 [3144162.631244] worker_thread+0x37/0x3a0 [3144162.631247] kthread+0x117/0x140 [3144162.631247] ? process_one_work+0x3a0/0x3a0 [3144162.631248] ? __kthread_cancel_work+0x40/0x40 [3144162.631250] ret_from_fork+0x1f/0x30 Fix this with taking the snapshot for acceleration _before_ the read of the current grace period number. The only side effect of this solution is that callbacks advancing happen then _after_ the full barrier in rcu_seq_snap(). This is not a problem because that barrier only cares about: 1) Ordering accesses of the update side before call_srcu() so they don't bleed. 
2) See all the accesses prior to the grace period of the current gp_num. The only things callbacks advancing need to be ordered against are carried by snp locking. Reported-by: Yong He Co-developed-by: Yong He Signed-off-by: Yong He Co-developed-by: Joel Fernandes (Google) Signed-off-by: Joel Fernandes (Google) Co-developed-by: Neeraj Upadhyay Signed-off-by: Neeraj Upadhyay Link: http://lore.kernel.org/CANZk6aR+CqZaqmMWrC2eRRPY12qAZnDZLwLnHZbNi=xXMB401g@mail.gmail.com Fixes: da915ad5cf25 ("srcu: Parallelize callback handling") Signed-off-by: Frederic Weisbecker Signed-off-by: Sasha Levin --- kernel/rcu/srcutree.c | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 8fdf07672038..929dcbc04d29 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -1100,10 +1100,37 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp, spin_lock_irqsave_sdp_contention(sdp, &flags); if (rhp) rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp); + /* + * The snapshot for acceleration must be taken _before_ the read of the + * current gp sequence used for advancing, otherwise advancing may fail + * and acceleration may then fail too. + * + * This could happen if: + * + * 1) The RCU_WAIT_TAIL segment has callbacks (gp_num = X + 4) and the + * RCU_NEXT_READY_TAIL also has callbacks (gp_num = X + 8). + * + * 2) The grace period for RCU_WAIT_TAIL is seen as started but not + * completed so rcu_seq_current() returns X + SRCU_STATE_SCAN1. + * + * 3) This value is passed to rcu_segcblist_advance() which can't move + * any segment forward and fails. + * + * 4) srcu_gp_start_if_needed() still proceeds with callback acceleration. + * But then the call to rcu_seq_snap() observes the grace period for the + * RCU_WAIT_TAIL segment as completed and the subsequent one for the + * RCU_NEXT_READY_TAIL segment as started (ie: X + 4 + SRCU_STATE_SCAN1) + * so it returns a snapshot of the next grace period, which is X + 12. + * + * 5) The value of X + 12 is passed to rcu_segcblist_accelerate() but the + * freshly enqueued callback in RCU_NEXT_TAIL can't move to + * RCU_NEXT_READY_TAIL which already has callbacks for a previous grace + * period (gp_num = X + 8). So acceleration fails. + */ + s = rcu_seq_snap(&ssp->srcu_gp_seq); rcu_segcblist_advance(&sdp->srcu_cblist, rcu_seq_current(&ssp->srcu_gp_seq)); - s = rcu_seq_snap(&ssp->srcu_gp_seq); - (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, s); + WARN_ON_ONCE(!rcu_segcblist_accelerate(&sdp->srcu_cblist, s) && rhp); if (ULONG_CMP_LT(sdp->srcu_gp_seq_needed, s)) { sdp->srcu_gp_seq_needed = s; needgp = true; From 605c8d8f9966fcd2f0b858fabebe416fc83f2209 Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Tue, 12 Sep 2023 23:04:41 +0800 Subject: [PATCH 060/151] bpf, x64: Fix tailcall infinite loop [ Upstream commit 2b5dcb31a19a2e0acd869b12c9db9b2d696ef544 ] From commit ebf7d1f508a73871 ("bpf, x64: rework pro/epilogue and tailcall handling in JIT"), the tailcall on x64 works better than before. From commit e411901c0b775a3a ("bpf: allow for tailcalls in BPF subprograms for x64 JIT"), tailcall is able to run in BPF subprograms on x64. From commit 5b92a28aae4dd0f8 ("bpf: Support attaching tracing BPF program to other BPF programs"), BPF program is able to trace other BPF programs. How about combining them all together? 1. FENTRY/FEXIT on a BPF subprogram. 2. A tailcall runs in the BPF subprogram. 3. The tailcall calls the subprogram's caller.
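Sketched in BPF C below (illustrative, shaped like the kernel's tailcall selftests; userspace is assumed to load slot 0 of jmp_table with the entry program):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_PROG_ARRAY);
	__uint(max_entries, 1);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u32));
} jmp_table SEC(".maps");

static __noinline int subprog_tail(struct __sk_buff *skb)
{
	/* 2) the tailcall in the subprogram ... */
	bpf_tail_call_static(skb, &jmp_table, 0);
	return 0;
}

SEC("tc")
int entry(struct __sk_buff *skb)
{
	/* 3) ... jumps back to the subprogram's caller */
	return subprog_tail(skb);
}

/* 1) an FENTRY program attached to subprog_tail completes the cycle. */

char _license[] SEC("license") = "GPL";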
As a result, a tailcall infinite loop comes up. And the loop would halt the machine. As we know, in tail call context, the tail_call_cnt propagates by stack and rax register between BPF subprograms. So do in trampolines. Fixes: ebf7d1f508a7 ("bpf, x64: rework pro/epilogue and tailcall handling in JIT") Fixes: e411901c0b77 ("bpf: allow for tailcalls in BPF subprograms for x64 JIT") Reviewed-by: Maciej Fijalkowski Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20230912150442.2009-3-hffilwlqm@gmail.com Signed-off-by: Alexei Starovoitov Signed-off-by: Sasha Levin --- arch/x86/net/bpf_jit_comp.c | 28 ++++++++++++++++++++++------ include/linux/bpf.h | 5 +++++ kernel/bpf/trampoline.c | 4 ++-- kernel/bpf/verifier.c | 3 +++ 4 files changed, 32 insertions(+), 8 deletions(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 4686c1d9d0cf..e6a031f8dd2e 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -893,6 +893,10 @@ static void emit_nops(u8 **pprog, int len) #define INSN_SZ_DIFF (((addrs[i] - addrs[i - 1]) - (prog - temp))) +/* mov rax, qword ptr [rbp - rounded_stack_depth - 8] */ +#define RESTORE_TAIL_CALL_CNT(stack) \ + EMIT3_off32(0x48, 0x8B, 0x85, -round_up(stack, 8) - 8) + static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image, int oldproglen, struct jit_context *ctx, bool jmp_padding) { @@ -1436,9 +1440,7 @@ st: if (is_imm8(insn->off)) case BPF_JMP | BPF_CALL: func = (u8 *) __bpf_call_base + imm32; if (tail_call_reachable) { - /* mov rax, qword ptr [rbp - rounded_stack_depth - 8] */ - EMIT3_off32(0x48, 0x8B, 0x85, - -round_up(bpf_prog->aux->stack_depth, 8) - 8); + RESTORE_TAIL_CALL_CNT(bpf_prog->aux->stack_depth); if (!imm32 || emit_call(&prog, func, image + addrs[i - 1] + 7)) return -EINVAL; } else { @@ -2070,6 +2072,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i * RBP - ip_off [ traced function ] BPF_TRAMP_F_IP_ARG flag * * RBP - run_ctx_off [ bpf_tramp_run_ctx ] + * RSP [ tail_call_cnt ] BPF_TRAMP_F_TAIL_CALL_CTX */ /* room for return value of orig_call or fentry prog */ @@ -2106,6 +2109,8 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i EMIT1(0x55); /* push rbp */ EMIT3(0x48, 0x89, 0xE5); /* mov rbp, rsp */ EMIT4(0x48, 0x83, 0xEC, stack_size); /* sub rsp, stack_size */ + if (flags & BPF_TRAMP_F_TAIL_CALL_CTX) + EMIT1(0x50); /* push rax */ EMIT1(0x53); /* push rbx */ /* Store number of argument registers of the traced function: @@ -2156,9 +2161,15 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i if (flags & BPF_TRAMP_F_CALL_ORIG) { restore_regs(m, &prog, nr_args, regs_off); + if (flags & BPF_TRAMP_F_TAIL_CALL_CTX) + /* Before calling the original function, restore the + * tail_call_cnt from stack to rax. + */ + RESTORE_TAIL_CALL_CNT(stack_size); + if (flags & BPF_TRAMP_F_ORIG_STACK) { - emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, 8); - EMIT2(0xff, 0xd0); /* call *rax */ + emit_ldx(&prog, BPF_DW, BPF_REG_6, BPF_REG_FP, 8); + EMIT2(0xff, 0xd3); /* call *rbx */ } else { /* call original function */ if (emit_call(&prog, orig_call, prog)) { @@ -2209,7 +2220,12 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i ret = -EINVAL; goto cleanup; } - } + } else if (flags & BPF_TRAMP_F_TAIL_CALL_CTX) + /* Before running the original function, restore the + * tail_call_cnt from stack to rax. 
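+ * rax is how the x64 JIT propagates tail_call_cnt between BPF programs.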
+ */ + RESTORE_TAIL_CALL_CNT(stack_size); + /* restore return value of orig_call or fentry prog back into RAX */ if (save_ret) emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, -8); diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 3ce9e39ecdb8..619fcba84be2 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -825,6 +825,11 @@ struct btf_func_model { */ #define BPF_TRAMP_F_SHARE_IPMODIFY BIT(6) +/* Indicate that current trampoline is in a tail call context. Then, it has to + * cache and restore tail_call_cnt to avoid infinite tail call loop. + */ +#define BPF_TRAMP_F_TAIL_CALL_CTX BIT(7) + /* Each call __bpf_prog_enter + call bpf_func + call __bpf_prog_exit is ~50 * bytes on x86. */ diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index c4381dfcd6b0..748ac8616994 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -443,8 +443,8 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mut goto out; } - /* clear all bits except SHARE_IPMODIFY */ - tr->flags &= BPF_TRAMP_F_SHARE_IPMODIFY; + /* clear all bits except SHARE_IPMODIFY and TAIL_CALL_CTX */ + tr->flags &= (BPF_TRAMP_F_SHARE_IPMODIFY | BPF_TRAMP_F_TAIL_CALL_CTX); if (tlinks[BPF_TRAMP_FEXIT].nr_links || tlinks[BPF_TRAMP_MODIFY_RETURN].nr_links) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 12d360d80c14..ee6e811b4315 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -15442,6 +15442,9 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) if (!tr) return -ENOMEM; + if (tgt_prog && tgt_prog->aux->tail_call_reachable) + tr->flags = BPF_TRAMP_F_TAIL_CALL_CTX; + prog->aux->dst_trampoline = tr; return 0; } From 4ee461c5dc99471fec206e9749b16c0bbb3dac8b Mon Sep 17 00:00:00 2001 From: Pu Lehui Date: Thu, 5 Jan 2023 11:50:26 +0800 Subject: [PATCH 061/151] bpf, x86: Simplify the parsing logic of structure parameters [ Upstream commit 7f7880495770329d095d402c2865bfa7089192f8 ] Extra_nregs of structure parameters and nr_args can be added directly at the beginning, using a flip flag to identify structure parameters. Meanwhile, rename some variables so that they make more sense. Signed-off-by: Pu Lehui Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20230105035026.3091988-1-pulehui@huaweicloud.com Signed-off-by: Martin KaFai Lau Stable-dep-of: 2b5dcb31a19a ("bpf, x64: Fix tailcall infinite loop") Signed-off-by: Sasha Levin --- arch/x86/net/bpf_jit_comp.c | 101 +++++++++++++++++------------------- 1 file changed, 48 insertions(+), 53 deletions(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index e6a031f8dd2e..87cea23f2da1 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1752,62 +1752,59 @@ emit_jmp: return proglen; } -static void save_regs(const struct btf_func_model *m, u8 **prog, int nr_args, +static void save_regs(const struct btf_func_model *m, u8 **prog, int nr_regs, int stack_size) { - int i, j, arg_size, nr_regs; + int i, j, arg_size; + bool next_same_struct = false; + /* Store function arguments to stack. * For a function that accepts two pointers the sequence will be: * mov QWORD PTR [rbp-0x10],rdi * mov QWORD PTR [rbp-0x8],rsi */ - for (i = 0, j = 0; i < min(nr_args, 6); i++) { - if (m->arg_flags[i] & BTF_FMODEL_STRUCT_ARG) { - nr_regs = (m->arg_size[i] + 7) / 8; + for (i = 0, j = 0; i < min(nr_regs, 6); i++) { + /* The arg_size is at most 16 bytes, enforced by the verifier.
*/ + arg_size = m->arg_size[j]; + if (arg_size > 8) { arg_size = 8; - } else { - nr_regs = 1; - arg_size = m->arg_size[i]; + next_same_struct = !next_same_struct; } - while (nr_regs) { - emit_stx(prog, bytes_to_bpf_size(arg_size), - BPF_REG_FP, - j == 5 ? X86_REG_R9 : BPF_REG_1 + j, - -(stack_size - j * 8)); - nr_regs--; - j++; - } + emit_stx(prog, bytes_to_bpf_size(arg_size), + BPF_REG_FP, + i == 5 ? X86_REG_R9 : BPF_REG_1 + i, + -(stack_size - i * 8)); + + j = next_same_struct ? j : j + 1; } } -static void restore_regs(const struct btf_func_model *m, u8 **prog, int nr_args, +static void restore_regs(const struct btf_func_model *m, u8 **prog, int nr_regs, int stack_size) { - int i, j, arg_size, nr_regs; + int i, j, arg_size; + bool next_same_struct = false; /* Restore function arguments from stack. * For a function that accepts two pointers the sequence will be: * EMIT4(0x48, 0x8B, 0x7D, 0xF0); mov rdi,QWORD PTR [rbp-0x10] * EMIT4(0x48, 0x8B, 0x75, 0xF8); mov rsi,QWORD PTR [rbp-0x8] */ - for (i = 0, j = 0; i < min(nr_args, 6); i++) { - if (m->arg_flags[i] & BTF_FMODEL_STRUCT_ARG) { - nr_regs = (m->arg_size[i] + 7) / 8; + for (i = 0, j = 0; i < min(nr_regs, 6); i++) { + /* The arg_size is at most 16 bytes, enforced by the verifier. */ + arg_size = m->arg_size[j]; + if (arg_size > 8) { arg_size = 8; - } else { - nr_regs = 1; - arg_size = m->arg_size[i]; + next_same_struct = !next_same_struct; } - while (nr_regs) { - emit_ldx(prog, bytes_to_bpf_size(arg_size), - j == 5 ? X86_REG_R9 : BPF_REG_1 + j, - BPF_REG_FP, - -(stack_size - j * 8)); - nr_regs--; - j++; - } + emit_ldx(prog, bytes_to_bpf_size(arg_size), + i == 5 ? X86_REG_R9 : BPF_REG_1 + i, + BPF_REG_FP, + -(stack_size - i * 8)); + + j = next_same_struct ? j : j + 1; } } @@ -2033,8 +2030,8 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i struct bpf_tramp_links *tlinks, void *func_addr) { - int ret, i, nr_args = m->nr_args, extra_nregs = 0; - int regs_off, ip_off, args_off, stack_size = nr_args * 8, run_ctx_off; + int i, ret, nr_regs = m->nr_args, stack_size = 0; + int regs_off, nregs_off, ip_off, run_ctx_off; struct bpf_tramp_links *fentry = &tlinks[BPF_TRAMP_FENTRY]; struct bpf_tramp_links *fexit = &tlinks[BPF_TRAMP_FEXIT]; struct bpf_tramp_links *fmod_ret = &tlinks[BPF_TRAMP_MODIFY_RETURN]; @@ -2043,17 +2040,14 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i u8 *prog; bool save_ret; - /* x86-64 supports up to 6 arguments. 7+ can be added in the future */ - if (nr_args > 6) - return -ENOTSUPP; - - for (i = 0; i < MAX_BPF_FUNC_ARGS; i++) { + /* extra registers for struct arguments */ + for (i = 0; i < m->nr_args; i++) if (m->arg_flags[i] & BTF_FMODEL_STRUCT_ARG) - extra_nregs += (m->arg_size[i] + 7) / 8 - 1; - } - if (nr_args + extra_nregs > 6) + nr_regs += (m->arg_size[i] + 7) / 8 - 1; + + /* x86-64 supports up to 6 arguments. 7+ can be added in the future */ + if (nr_regs > 6) return -ENOTSUPP; - stack_size += extra_nregs * 8; /* Generated trampoline stack layout: * @@ -2067,7 +2061,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i * [ ... 
] * RBP - regs_off [ reg_arg1 ] program's ctx pointer * - * RBP - args_off [ arg regs count ] always + * RBP - nregs_off [ regs count ] always * * RBP - ip_off [ traced function ] BPF_TRAMP_F_IP_ARG flag * @@ -2080,11 +2074,12 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i if (save_ret) stack_size += 8; + stack_size += nr_regs * 8; regs_off = stack_size; - /* args count */ + /* regs count */ stack_size += 8; - args_off = stack_size; + nregs_off = stack_size; if (flags & BPF_TRAMP_F_IP_ARG) stack_size += 8; /* room for IP address argument */ @@ -2114,11 +2109,11 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i EMIT1(0x53); /* push rbx */ /* Store number of argument registers of the traced function: - * mov rax, nr_args + extra_nregs - * mov QWORD PTR [rbp - args_off], rax + * mov rax, nr_regs + * mov QWORD PTR [rbp - nregs_off], rax */ - emit_mov_imm64(&prog, BPF_REG_0, 0, (u32) nr_args + extra_nregs); - emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -args_off); + emit_mov_imm64(&prog, BPF_REG_0, 0, (u32) nr_regs); + emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -nregs_off); if (flags & BPF_TRAMP_F_IP_ARG) { /* Store IP address of the traced function: @@ -2129,7 +2124,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -ip_off); } - save_regs(m, &prog, nr_args, regs_off); + save_regs(m, &prog, nr_regs, regs_off); if (flags & BPF_TRAMP_F_CALL_ORIG) { /* arg1: mov rdi, im */ @@ -2159,7 +2154,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i } if (flags & BPF_TRAMP_F_CALL_ORIG) { - restore_regs(m, &prog, nr_args, regs_off); + restore_regs(m, &prog, nr_regs, regs_off); if (flags & BPF_TRAMP_F_TAIL_CALL_CTX) /* Before calling the original function, restore the @@ -2206,7 +2201,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i } if (flags & BPF_TRAMP_F_RESTORE_REGS) - restore_regs(m, &prog, nr_args, regs_off); + restore_regs(m, &prog, nr_regs, regs_off); /* This needs to be done regardless. If there were fmod_ret programs, * the return value is only updated on the stack and still needs to be From 89b51e70e5e335ba5c00abad1bfd9d52acd8afce Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Thu, 13 Jul 2023 12:07:36 +0800 Subject: [PATCH 062/151] bpf, x86: save/restore regs with BPF_DW size [ Upstream commit 02a6dfa8ff43efb1c989f87a4d862aedf436088a ] As we already reserve 8 byte in the stack for each reg, it is ok to store/restore the regs in BPF_DW size. This will make the code in save_regs()/restore_regs() simpler. Signed-off-by: Menglong Dong Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20230713040738.1789742-2-imagedong@tencent.com Signed-off-by: Alexei Starovoitov Stable-dep-of: 2b5dcb31a19a ("bpf, x64: Fix tailcall infinite loop") Signed-off-by: Sasha Levin --- arch/x86/net/bpf_jit_comp.c | 35 ++++++----------------------------- 1 file changed, 6 insertions(+), 29 deletions(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 87cea23f2da1..84c695ae1940 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1755,57 +1755,34 @@ emit_jmp: static void save_regs(const struct btf_func_model *m, u8 **prog, int nr_regs, int stack_size) { - int i, j, arg_size; - bool next_same_struct = false; + int i; /* Store function arguments to stack. 
* For a function that accepts two pointers the sequence will be: * mov QWORD PTR [rbp-0x10],rdi * mov QWORD PTR [rbp-0x8],rsi */ - for (i = 0, j = 0; i < min(nr_regs, 6); i++) { - /* The arg_size is at most 16 bytes, enforced by the verifier. */ - arg_size = m->arg_size[j]; - if (arg_size > 8) { - arg_size = 8; - next_same_struct = !next_same_struct; - } - - emit_stx(prog, bytes_to_bpf_size(arg_size), - BPF_REG_FP, + for (i = 0; i < min(nr_regs, 6); i++) + emit_stx(prog, BPF_DW, BPF_REG_FP, i == 5 ? X86_REG_R9 : BPF_REG_1 + i, -(stack_size - i * 8)); - - j = next_same_struct ? j : j + 1; - } } static void restore_regs(const struct btf_func_model *m, u8 **prog, int nr_regs, int stack_size) { - int i, j, arg_size; - bool next_same_struct = false; + int i; /* Restore function arguments from stack. * For a function that accepts two pointers the sequence will be: * EMIT4(0x48, 0x8B, 0x7D, 0xF0); mov rdi,QWORD PTR [rbp-0x10] * EMIT4(0x48, 0x8B, 0x75, 0xF8); mov rsi,QWORD PTR [rbp-0x8] */ - for (i = 0, j = 0; i < min(nr_regs, 6); i++) { - /* The arg_size is at most 16 bytes, enforced by the verifier. */ - arg_size = m->arg_size[j]; - if (arg_size > 8) { - arg_size = 8; - next_same_struct = !next_same_struct; - } - - emit_ldx(prog, bytes_to_bpf_size(arg_size), + for (i = 0; i < min(nr_regs, 6); i++) + emit_ldx(prog, BPF_DW, i == 5 ? X86_REG_R9 : BPF_REG_1 + i, BPF_REG_FP, -(stack_size - i * 8)); - - j = next_same_struct ? j : j + 1; - } } static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, From 6bcc79a4e76072b89541a62eeb41e22b281a365f Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 22 May 2023 13:11:10 +0100 Subject: [PATCH 063/151] net: Declare MSG_SPLICE_PAGES internal sendmsg() flag [ Upstream commit b841b901c452d92610f739a36e54978453528876 ] Declare MSG_SPLICE_PAGES, an internal sendmsg() flag, that hints to a network protocol that it should splice pages from the source iterator rather than copying the data if it can. This flag is added to a list that is cleared by sendmsg syscalls on entry. This is intended as a replacement for the ->sendpage() op, allowing a way to splice in several multipage folios in one go. Signed-off-by: David Howells Reviewed-by: Willem de Bruijn cc: Jens Axboe cc: Matthew Wilcox Signed-off-by: Jakub Kicinski Stable-dep-of: a0002127cd74 ("udp: move udp->no_check6_tx to udp->udp_flags") Signed-off-by: Sasha Levin --- include/linux/socket.h | 3 +++ io_uring/net.c | 2 ++ net/socket.c | 2 ++ 3 files changed, 7 insertions(+) diff --git a/include/linux/socket.h b/include/linux/socket.h index 1db29aab8f9c..b3c58042bd25 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -324,6 +324,7 @@ struct ucred { */ #define MSG_ZEROCOPY 0x4000000 /* Use user data in kernel path */ +#define MSG_SPLICE_PAGES 0x8000000 /* Splice the pages from the iterator in sendmsg() */ #define MSG_FASTOPEN 0x20000000 /* Send data in TCP SYN */ #define MSG_CMSG_CLOEXEC 0x40000000 /* Set close_on_exec for file descriptor received through @@ -334,6 +335,8 @@ struct ucred { #define MSG_CMSG_COMPAT 0 /* We never have 32 bit fixups */ #endif +/* Flags to be cleared on entry by sendmsg and sendmmsg syscalls */ +#define MSG_INTERNAL_SENDMSG_FLAGS (MSG_SPLICE_PAGES) /* Setsockoptions(2) level. 
Thanks to BSD these must match IPPROTO_xxx */ #define SOL_IP 0 diff --git a/io_uring/net.c b/io_uring/net.c index 57c626cb4d1a..67f09a40bcb2 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -389,6 +389,7 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags) if (flags & MSG_WAITALL) min_ret = iov_iter_count(&msg.msg_iter); + flags &= ~MSG_INTERNAL_SENDMSG_FLAGS; msg.msg_flags = flags; ret = sock_sendmsg(sock, &msg); if (ret < min_ret) { @@ -1137,6 +1138,7 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags) msg_flags |= MSG_DONTWAIT; if (msg_flags & MSG_WAITALL) min_ret = iov_iter_count(&msg.msg_iter); + msg_flags &= ~MSG_INTERNAL_SENDMSG_FLAGS; msg.msg_flags = msg_flags; msg.msg_ubuf = &io_notif_to_data(zc->notif)->uarg; diff --git a/net/socket.c b/net/socket.c index 0104617b440d..6f39f7b0cc85 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2131,6 +2131,7 @@ int __sys_sendto(int fd, void __user *buff, size_t len, unsigned int flags, msg.msg_name = (struct sockaddr *)&address; msg.msg_namelen = addr_len; } + flags &= ~MSG_INTERNAL_SENDMSG_FLAGS; if (sock->file->f_flags & O_NONBLOCK) flags |= MSG_DONTWAIT; msg.msg_flags = flags; @@ -2482,6 +2483,7 @@ static int ____sys_sendmsg(struct socket *sock, struct msghdr *msg_sys, msg_sys->msg_control = ctl_buf; msg_sys->msg_control_is_user = false; } + flags &= ~MSG_INTERNAL_SENDMSG_FLAGS; msg_sys->msg_flags = flags; if (sock->file->f_flags & O_NONBLOCK) From ac8c69e448f7e43586e102395844a117b0595031 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 22 May 2023 13:11:22 +0100 Subject: [PATCH 064/151] udp: Convert udp_sendpage() to use MSG_SPLICE_PAGES [ Upstream commit 7ac7c987850c3ec617c778f7bd871804dc1c648d ] Convert udp_sendpage() to use sendmsg() with MSG_SPLICE_PAGES rather than directly splicing in the pages itself. This allows ->sendpage() to be replaced by something that can handle multiple multipage folios in a single transaction. Signed-off-by: David Howells cc: Willem de Bruijn cc: David Ahern cc: Jens Axboe cc: Matthew Wilcox Signed-off-by: Jakub Kicinski Stable-dep-of: a0002127cd74 ("udp: move udp->no_check6_tx to udp->udp_flags") Signed-off-by: Sasha Levin --- net/ipv4/udp.c | 51 ++++++-------------------------------------------- 1 file changed, 6 insertions(+), 45 deletions(-) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 65abc92a81bd..b49cb3df01bb 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1335,54 +1335,15 @@ EXPORT_SYMBOL(udp_sendmsg); int udp_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags) { - struct inet_sock *inet = inet_sk(sk); - struct udp_sock *up = udp_sk(sk); - int ret; + struct bio_vec bvec; + struct msghdr msg = { .msg_flags = flags | MSG_SPLICE_PAGES }; if (flags & MSG_SENDPAGE_NOTLAST) - flags |= MSG_MORE; + msg.msg_flags |= MSG_MORE; - if (!up->pending) { - struct msghdr msg = { .msg_flags = flags|MSG_MORE }; - - /* Call udp_sendmsg to specify destination address which - * sendpage interface can't pass. - * This will succeed only when the socket is connected. 
- */ - ret = udp_sendmsg(sk, &msg, 0); - if (ret < 0) - return ret; - } - - lock_sock(sk); - - if (unlikely(!up->pending)) { - release_sock(sk); - - net_dbg_ratelimited("cork failed\n"); - return -EINVAL; - } - - ret = ip_append_page(sk, &inet->cork.fl.u.ip4, - page, offset, size, flags); - if (ret == -EOPNOTSUPP) { - release_sock(sk); - return sock_no_sendpage(sk->sk_socket, page, offset, - size, flags); - } - if (ret < 0) { - udp_flush_pending_frames(sk); - goto out; - } - - up->len += size; - if (!(READ_ONCE(up->corkflag) || (flags&MSG_MORE))) - ret = udp_push_pending_frames(sk); - if (!ret) - ret = size; -out: - release_sock(sk); - return ret; + bvec_set_page(&bvec, page, size, offset); + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, size); + return udp_sendmsg(sk, &msg, size); } #define UDP_SKB_IS_STATELESS 0x80000000 From 4713b7c7568bac9aff4a5346695d6bd691b08a82 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 7 Jun 2023 19:19:10 +0100 Subject: [PATCH 065/151] splice, net: Add a splice_eof op to file-ops and socket-ops [ Upstream commit 2bfc66850952b6921b2033b09729ec59eabbc81d ] Add an optional method, ->splice_eof(), to allow splice to indicate the premature termination of a splice to struct file_operations and struct proto_ops. This is called if sendfile() or splice() encounters all of the following conditions inside splice_direct_to_actor(): (1) the user did not set SPLICE_F_MORE (splice only), and (2) an EOF condition occurred (->splice_read() returned 0), and (3) we haven't read enough to fulfill the request (ie. len > 0 still), and (4) we have already spliced at least one byte. A further patch will modify the behaviour of SPLICE_F_MORE to always be passed to the actor if either the user set it or we haven't yet read sufficient data to fulfill the request. Suggested-by: Linus Torvalds Link: https://lore.kernel.org/r/CAHk-=wh=V579PDYvkpnTobCLGczbgxpMgGmmhqiTyE34Cpi5Gg@mail.gmail.com/ Signed-off-by: David Howells Reviewed-by: Jakub Kicinski cc: Jens Axboe cc: Christoph Hellwig cc: Al Viro cc: Matthew Wilcox cc: Jan Kara cc: Jeff Layton cc: David Hildenbrand cc: Christian Brauner cc: Chuck Lever cc: Boris Pismenny cc: John Fastabend cc: linux-mm@kvack.org Signed-off-by: Jakub Kicinski Stable-dep-of: a0002127cd74 ("udp: move udp->no_check6_tx to udp->udp_flags") Signed-off-by: Sasha Levin --- fs/splice.c | 31 ++++++++++++++++++++++++++++++- include/linux/fs.h | 1 + include/linux/net.h | 1 + include/linux/splice.h | 1 + include/net/sock.h | 1 + net/socket.c | 10 ++++++++++ 6 files changed, 44 insertions(+), 1 deletion(-) diff --git a/fs/splice.c b/fs/splice.c index 5969b7a1d353..c4ae54deac42 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -764,6 +764,17 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, return out->f_op->splice_write(pipe, out, ppos, len, flags); } +/* + * Indicate to the caller that there was a premature EOF when reading from the + * source and the caller didn't indicate they would be sending more data after + * this. + */ +static void do_splice_eof(struct splice_desc *sd) +{ + if (sd->splice_eof) + sd->splice_eof(sd); +} + /* * Attempt to initiate a splice from a file to a pipe. 
*/ @@ -864,7 +875,7 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, ret = do_splice_to(in, &pos, pipe, len, flags); if (unlikely(ret <= 0)) - goto out_release; + goto read_failure; read_len = ret; sd->total_len = read_len; @@ -904,6 +915,15 @@ done: file_accessed(in); return bytes; +read_failure: + /* + * If the user did *not* set SPLICE_F_MORE *and* we didn't hit that + * "use all of len" case that cleared SPLICE_F_MORE, *and* we did a + * "->splice_in()" that returned EOF (ie zero) *and* we have sent at + * least 1 byte *then* we will also do the ->splice_eof() call. + */ + if (ret == 0 && !more && len > 0 && bytes) + do_splice_eof(sd); out_release: /* * If we did an incomplete transfer we must release @@ -932,6 +952,14 @@ static int direct_splice_actor(struct pipe_inode_info *pipe, sd->flags); } +static void direct_file_splice_eof(struct splice_desc *sd) +{ + struct file *file = sd->u.file; + + if (file->f_op->splice_eof) + file->f_op->splice_eof(file); +} + /** * do_splice_direct - splices data directly between two files * @in: file to splice from @@ -957,6 +985,7 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, .flags = flags, .pos = *ppos, .u.file = out, + .splice_eof = direct_file_splice_eof, .opos = opos, }; long ret; diff --git a/include/linux/fs.h b/include/linux/fs.h index b6af6abc7a77..4a1911dcf834 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2177,6 +2177,7 @@ struct file_operations { int (*flock) (struct file *, int, struct file_lock *); ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int); + void (*splice_eof)(struct file *file); int (*setlease)(struct file *, long, struct file_lock **, void **); long (*fallocate)(struct file *file, int mode, loff_t offset, loff_t len); diff --git a/include/linux/net.h b/include/linux/net.h index 18d942bbdf6e..25baca60f6cb 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -209,6 +209,7 @@ struct proto_ops { int offset, size_t size, int flags); ssize_t (*splice_read)(struct socket *sock, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); + void (*splice_eof)(struct socket *sock); int (*set_peek_off)(struct sock *sk, int val); int (*peek_len)(struct socket *sock); diff --git a/include/linux/splice.h b/include/linux/splice.h index a55179fd60fc..41a70687be85 100644 --- a/include/linux/splice.h +++ b/include/linux/splice.h @@ -38,6 +38,7 @@ struct splice_desc { struct file *file; /* file to read/write */ void *data; /* cookie */ } u; + void (*splice_eof)(struct splice_desc *sd); /* Unexpected EOF handler */ loff_t pos; /* file position */ loff_t *opos; /* sendfile: output position */ size_t num_spliced; /* number of bytes already spliced */ diff --git a/include/net/sock.h b/include/net/sock.h index d8ed62a8e1a3..9de9f070537c 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1279,6 +1279,7 @@ struct proto { size_t len, int flags, int *addr_len); int (*sendpage)(struct sock *sk, struct page *page, int offset, size_t size, int flags); + void (*splice_eof)(struct socket *sock); int (*bind)(struct sock *sk, struct sockaddr *addr, int addr_len); int (*bind_add)(struct sock *sk, diff --git a/net/socket.c b/net/socket.c index 6f39f7b0cc85..639d76f20384 100644 --- a/net/socket.c +++ b/net/socket.c @@ -130,6 +130,7 @@ static ssize_t sock_sendpage(struct file *file, struct page *page, static ssize_t 
sock_splice_read(struct file *file, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); +static void sock_splice_eof(struct file *file); #ifdef CONFIG_PROC_FS static void sock_show_fdinfo(struct seq_file *m, struct file *f) @@ -164,6 +165,7 @@ static const struct file_operations socket_file_ops = { .sendpage = sock_sendpage, .splice_write = generic_splice_sendpage, .splice_read = sock_splice_read, + .splice_eof = sock_splice_eof, .show_fdinfo = sock_show_fdinfo, }; @@ -1091,6 +1093,14 @@ static ssize_t sock_splice_read(struct file *file, loff_t *ppos, return sock->ops->splice_read(sock, ppos, pipe, len, flags); } +static void sock_splice_eof(struct file *file) +{ + struct socket *sock = file->private_data; + + if (sock->ops->splice_eof) + sock->ops->splice_eof(sock); +} + static ssize_t sock_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; From 2489502fb1f5e5cf86824dadb45a9bac02fbd3aa Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 7 Jun 2023 19:19:13 +0100 Subject: [PATCH 066/151] ipv4, ipv6: Use splice_eof() to flush [ Upstream commit 1d7e4538a5463faa0b0e26a7a7b6bd68c7dfdd78 ] Allow splice to undo the effects of MSG_MORE after prematurely ending a splice/sendfile due to getting an EOF condition (->splice_read() returned 0) after splice had called sendmsg() with MSG_MORE set when the user didn't set MSG_MORE. For UDP, a pending packet will not be emitted if the socket is closed before it is flushed; with this change, it be flushed by ->splice_eof(). For TCP, it's not clear that MSG_MORE is actually effective. Suggested-by: Linus Torvalds Link: https://lore.kernel.org/r/CAHk-=wh=V579PDYvkpnTobCLGczbgxpMgGmmhqiTyE34Cpi5Gg@mail.gmail.com/ Signed-off-by: David Howells cc: Kuniyuki Iwashima cc: Willem de Bruijn cc: David Ahern cc: Jens Axboe cc: Matthew Wilcox Signed-off-by: Jakub Kicinski Stable-dep-of: a0002127cd74 ("udp: move udp->no_check6_tx to udp->udp_flags") Signed-off-by: Sasha Levin --- include/net/inet_common.h | 1 + include/net/tcp.h | 1 + include/net/udp.h | 1 + net/ipv4/af_inet.c | 18 ++++++++++++++++++ net/ipv4/tcp.c | 16 ++++++++++++++++ net/ipv4/tcp_ipv4.c | 1 + net/ipv4/udp.c | 16 ++++++++++++++++ net/ipv6/af_inet6.c | 1 + net/ipv6/tcp_ipv6.c | 1 + net/ipv6/udp.c | 15 +++++++++++++++ 10 files changed, 71 insertions(+) diff --git a/include/net/inet_common.h b/include/net/inet_common.h index cec453c18f1d..4673bbfd2811 100644 --- a/include/net/inet_common.h +++ b/include/net/inet_common.h @@ -33,6 +33,7 @@ int inet_accept(struct socket *sock, struct socket *newsock, int flags, bool kern); int inet_send_prepare(struct sock *sk); int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size); +void inet_splice_eof(struct socket *sock); ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags); int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, diff --git a/include/net/tcp.h b/include/net/tcp.h index c3d56b337f35..4c838f7290dd 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -332,6 +332,7 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size); int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied, size_t size, struct ubuf_info *uarg); +void tcp_splice_eof(struct socket *sock); int tcp_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags); int tcp_sendpage_locked(struct sock *sk, struct page *page, int 
offset, diff --git a/include/net/udp.h b/include/net/udp.h index fee053bcd17c..fa4cdbe55552 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -269,6 +269,7 @@ int udp_get_port(struct sock *sk, unsigned short snum, int udp_err(struct sk_buff *, u32); int udp_abort(struct sock *sk, int err); int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len); +void udp_splice_eof(struct socket *sock); int udp_push_pending_frames(struct sock *sk); void udp_flush_pending_frames(struct sock *sk); int udp_cmsg_send(struct sock *sk, struct msghdr *msg, u16 *gso_size); diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 5d379df90c82..347c3768df6e 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -838,6 +838,21 @@ int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) } EXPORT_SYMBOL(inet_sendmsg); +void inet_splice_eof(struct socket *sock) +{ + const struct proto *prot; + struct sock *sk = sock->sk; + + if (unlikely(inet_send_prepare(sk))) + return; + + /* IPV6_ADDRFORM can change sk->sk_prot under us. */ + prot = READ_ONCE(sk->sk_prot); + if (prot->splice_eof) + prot->splice_eof(sock); +} +EXPORT_SYMBOL_GPL(inet_splice_eof); + ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags) { @@ -1057,6 +1072,7 @@ const struct proto_ops inet_stream_ops = { #ifdef CONFIG_MMU .mmap = tcp_mmap, #endif + .splice_eof = inet_splice_eof, .sendpage = inet_sendpage, .splice_read = tcp_splice_read, .read_sock = tcp_read_sock, @@ -1091,6 +1107,7 @@ const struct proto_ops inet_dgram_ops = { .read_skb = udp_read_skb, .recvmsg = inet_recvmsg, .mmap = sock_no_mmap, + .splice_eof = inet_splice_eof, .sendpage = inet_sendpage, .set_peek_off = sk_set_peek_off, #ifdef CONFIG_COMPAT @@ -1122,6 +1139,7 @@ static const struct proto_ops inet_sockraw_ops = { .sendmsg = inet_sendmsg, .recvmsg = inet_recvmsg, .mmap = sock_no_mmap, + .splice_eof = inet_splice_eof, .sendpage = inet_sendpage, #ifdef CONFIG_COMPAT .compat_ioctl = inet_compat_ioctl, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 3935451ad061..0b7844a8d571 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1492,6 +1492,22 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) } EXPORT_SYMBOL(tcp_sendmsg); +void tcp_splice_eof(struct socket *sock) +{ + struct sock *sk = sock->sk; + struct tcp_sock *tp = tcp_sk(sk); + int mss_now, size_goal; + + if (!tcp_write_queue_tail(sk)) + return; + + lock_sock(sk); + mss_now = tcp_send_mss(sk, &size_goal, 0); + tcp_push(sk, 0, mss_now, tp->nonagle, size_goal); + release_sock(sk); +} +EXPORT_SYMBOL_GPL(tcp_splice_eof); + /* * Handle reading urgent data. 
BSD has very simple semantics for * this, no blocking and very strange errors 8) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 7ebbbe561e40..be2c807eed15 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -3067,6 +3067,7 @@ struct proto tcp_prot = { .keepalive = tcp_set_keepalive, .recvmsg = tcp_recvmsg, .sendmsg = tcp_sendmsg, + .splice_eof = tcp_splice_eof, .sendpage = tcp_sendpage, .backlog_rcv = tcp_v4_do_rcv, .release_cb = tcp_release_cb, diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index b49cb3df01bb..e8dd2880ac9a 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1332,6 +1332,21 @@ do_confirm: } EXPORT_SYMBOL(udp_sendmsg); +void udp_splice_eof(struct socket *sock) +{ + struct sock *sk = sock->sk; + struct udp_sock *up = udp_sk(sk); + + if (!up->pending || READ_ONCE(up->corkflag)) + return; + + lock_sock(sk); + if (up->pending && !READ_ONCE(up->corkflag)) + udp_push_pending_frames(sk); + release_sock(sk); +} +EXPORT_SYMBOL_GPL(udp_splice_eof); + int udp_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags) { @@ -2907,6 +2922,7 @@ struct proto udp_prot = { .getsockopt = udp_getsockopt, .sendmsg = udp_sendmsg, .recvmsg = udp_recvmsg, + .splice_eof = udp_splice_eof, .sendpage = udp_sendpage, .release_cb = ip4_datagram_release_cb, .hash = udp_lib_hash, diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index b5309ae87fd7..a2f29ca51600 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -711,6 +711,7 @@ const struct proto_ops inet6_stream_ops = { #ifdef CONFIG_MMU .mmap = tcp_mmap, #endif + .splice_eof = inet_splice_eof, .sendpage = inet_sendpage, .sendmsg_locked = tcp_sendmsg_locked, .sendpage_locked = tcp_sendpage_locked, diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 7be89dcfd5fc..ba9a22db5805 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -2158,6 +2158,7 @@ struct proto tcpv6_prot = { .keepalive = tcp_set_keepalive, .recvmsg = tcp_recvmsg, .sendmsg = tcp_sendmsg, + .splice_eof = tcp_splice_eof, .sendpage = tcp_sendpage, .backlog_rcv = tcp_v6_do_rcv, .release_cb = tcp_release_cb, diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 7f49f69226a2..2a65136dca77 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1657,6 +1657,20 @@ do_confirm: goto out; } +static void udpv6_splice_eof(struct socket *sock) +{ + struct sock *sk = sock->sk; + struct udp_sock *up = udp_sk(sk); + + if (!up->pending || READ_ONCE(up->corkflag)) + return; + + lock_sock(sk); + if (up->pending && !READ_ONCE(up->corkflag)) + udp_v6_push_pending_frames(sk); + release_sock(sk); +} + void udpv6_destroy_sock(struct sock *sk) { struct udp_sock *up = udp_sk(sk); @@ -1768,6 +1782,7 @@ struct proto udpv6_prot = { .getsockopt = udpv6_getsockopt, .sendmsg = udpv6_sendmsg, .recvmsg = udpv6_recvmsg, + .splice_eof = udpv6_splice_eof, .release_cb = ip6_datagram_release_cb, .hash = udp_lib_hash, .unhash = udp_lib_unhash, From e2a4392b61f6ddae212dc8a04e58391eb6ce34df Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 12 Sep 2023 09:17:21 +0000 Subject: [PATCH 067/151] udp: introduce udp->udp_flags [ Upstream commit 81b36803ac139827538ac5ce4028e750a3c53f53 ] According to syzbot, it is time to use proper atomic flags for various UDP flags. Add udp_flags field, and convert udp->corkflag to first bit in it. 
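For reference, the bit being converted here is the one driven by the
UDP_CORK socket option. A minimal user-space sketch of the path this
flag guards (illustrative only, not part of the patch; error handling
omitted and the destination address is a placeholder):

    #include <netinet/in.h>
    #include <netinet/udp.h>    /* UDP_CORK */
    #include <sys/socket.h>

    static void send_one_datagram_in_parts(int fd, const struct sockaddr_in *dst)
    {
        int on = 1, off = 0;

        /* After this patch: sets UDP_FLAGS_CORK in udp->udp_flags. */
        setsockopt(fd, IPPROTO_UDP, UDP_CORK, &on, sizeof(on));
        sendto(fd, "part1", 5, 0, (const struct sockaddr *)dst, sizeof(*dst));
        sendto(fd, "part2", 5, 0, (const struct sockaddr *)dst, sizeof(*dst));
        /* Clears the bit and flushes both parts as a single datagram. */
        setsockopt(fd, IPPROTO_UDP, UDP_CORK, &off, sizeof(off));
    }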
Signed-off-by: Eric Dumazet Reviewed-by: Willem de Bruijn Signed-off-by: Paolo Abeni Stable-dep-of: a0002127cd74 ("udp: move udp->no_check6_tx to udp->udp_flags") Signed-off-by: Sasha Levin --- include/linux/udp.h | 28 +++++++++++++++++++++------- net/ipv4/udp.c | 12 ++++++------ net/ipv6/udp.c | 6 +++--- 3 files changed, 30 insertions(+), 16 deletions(-) diff --git a/include/linux/udp.h b/include/linux/udp.h index e96da4157d04..10b56b8231e3 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -30,14 +30,20 @@ static inline u32 udp_hashfn(const struct net *net, u32 num, u32 mask) return (num + net_hash_mix(net)) & mask; } +enum { + UDP_FLAGS_CORK, /* Cork is required */ +}; + struct udp_sock { /* inet_sock has to be the first member */ struct inet_sock inet; #define udp_port_hash inet.sk.__sk_common.skc_u16hashes[0] #define udp_portaddr_hash inet.sk.__sk_common.skc_u16hashes[1] #define udp_portaddr_node inet.sk.__sk_common.skc_portaddr_node + + unsigned long udp_flags; + int pending; /* Any pending frames ? */ - unsigned int corkflag; /* Cork is required */ __u8 encap_type; /* Is this an Encapsulation socket? */ unsigned char no_check6_tx:1,/* Send zero UDP6 checksums on TX? */ no_check6_rx:1,/* Allow zero UDP6 checksums on RX? */ @@ -49,6 +55,11 @@ struct udp_sock { gro_enabled:1, /* Request GRO aggregation */ accept_udp_l4:1, accept_udp_fraglist:1; +/* indicator bits used by pcflag: */ +#define UDPLITE_BIT 0x1 /* set by udplite proto init function */ +#define UDPLITE_SEND_CC 0x2 /* set via udplite setsockopt */ +#define UDPLITE_RECV_CC 0x4 /* set via udplite setsocktopt */ + __u8 pcflag; /* marks socket as UDP-Lite if > 0 */ /* * Following member retains the information to create a UDP header * when the socket is uncorked. @@ -60,12 +71,6 @@ struct udp_sock { */ __u16 pcslen; __u16 pcrlen; -/* indicator bits used by pcflag: */ -#define UDPLITE_BIT 0x1 /* set by udplite proto init function */ -#define UDPLITE_SEND_CC 0x2 /* set via udplite setsockopt */ -#define UDPLITE_RECV_CC 0x4 /* set via udplite setsocktopt */ - __u8 pcflag; /* marks socket as UDP-Lite if > 0 */ - __u8 unused[3]; /* * For encapsulation sockets. 
*/ @@ -89,6 +94,15 @@ struct udp_sock { int forward_deficit; }; +#define udp_test_bit(nr, sk) \ + test_bit(UDP_FLAGS_##nr, &udp_sk(sk)->udp_flags) +#define udp_set_bit(nr, sk) \ + set_bit(UDP_FLAGS_##nr, &udp_sk(sk)->udp_flags) +#define udp_clear_bit(nr, sk) \ + clear_bit(UDP_FLAGS_##nr, &udp_sk(sk)->udp_flags) +#define udp_assign_bit(nr, sk, val) \ + assign_bit(UDP_FLAGS_##nr, &udp_sk(sk)->udp_flags, val) + #define UDP_MAX_SEGMENTS (1 << 6UL) static inline struct udp_sock *udp_sk(const struct sock *sk) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index e8dd2880ac9a..60a754477efb 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1068,7 +1068,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) __be16 dport; u8 tos; int err, is_udplite = IS_UDPLITE(sk); - int corkreq = READ_ONCE(up->corkflag) || msg->msg_flags&MSG_MORE; + int corkreq = udp_test_bit(CORK, sk) || msg->msg_flags & MSG_MORE; int (*getfrag)(void *, char *, int, int, int, struct sk_buff *); struct sk_buff *skb; struct ip_options_data opt_copy; @@ -1337,11 +1337,11 @@ void udp_splice_eof(struct socket *sock) struct sock *sk = sock->sk; struct udp_sock *up = udp_sk(sk); - if (!up->pending || READ_ONCE(up->corkflag)) + if (!up->pending || udp_test_bit(CORK, sk)) return; lock_sock(sk); - if (up->pending && !READ_ONCE(up->corkflag)) + if (up->pending && !udp_test_bit(CORK, sk)) udp_push_pending_frames(sk); release_sock(sk); } @@ -2673,9 +2673,9 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, switch (optname) { case UDP_CORK: if (val != 0) { - WRITE_ONCE(up->corkflag, 1); + udp_set_bit(CORK, sk); } else { - WRITE_ONCE(up->corkflag, 0); + udp_clear_bit(CORK, sk); lock_sock(sk); push_pending_frames(sk); release_sock(sk); @@ -2800,7 +2800,7 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname, switch (optname) { case UDP_CORK: - val = READ_ONCE(up->corkflag); + val = udp_test_bit(CORK, sk); break; case UDP_ENCAP: diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 2a65136dca77..85653e3a04fe 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1351,7 +1351,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) int addr_len = msg->msg_namelen; bool connected = false; int ulen = len; - int corkreq = READ_ONCE(up->corkflag) || msg->msg_flags&MSG_MORE; + int corkreq = udp_test_bit(CORK, sk) || msg->msg_flags & MSG_MORE; int err; int is_udplite = IS_UDPLITE(sk); int (*getfrag)(void *, char *, int, int, int, struct sk_buff *); @@ -1662,11 +1662,11 @@ static void udpv6_splice_eof(struct socket *sock) struct sock *sk = sock->sk; struct udp_sock *up = udp_sk(sk); - if (!up->pending || READ_ONCE(up->corkflag)) + if (!up->pending || udp_test_bit(CORK, sk)) return; lock_sock(sk); - if (up->pending && !READ_ONCE(up->corkflag)) + if (up->pending && !udp_test_bit(CORK, sk)) udp_v6_push_pending_frames(sk); release_sock(sk); } From 50e41aa9ea0d54527dd3007021d6f71935f65829 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 12 Sep 2023 09:17:22 +0000 Subject: [PATCH 068/151] udp: move udp->no_check6_tx to udp->udp_flags [ Upstream commit a0002127cd746fcaa182ad3386ef6931c37f3bda ] syzbot reported that udp->no_check6_tx can be read locklessly. 
Use one atomic bit from udp->udp_flags Fixes: 1c19448c9ba6 ("net: Make enabling of zero UDP6 csums more restrictive") Reported-by: syzbot Signed-off-by: Eric Dumazet Reviewed-by: Willem de Bruijn Signed-off-by: Paolo Abeni Signed-off-by: Sasha Levin --- include/linux/udp.h | 10 +++++----- net/ipv4/udp.c | 4 ++-- net/ipv6/udp.c | 4 ++-- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/include/linux/udp.h b/include/linux/udp.h index 10b56b8231e3..b5ca5760ae34 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -32,6 +32,7 @@ static inline u32 udp_hashfn(const struct net *net, u32 num, u32 mask) enum { UDP_FLAGS_CORK, /* Cork is required */ + UDP_FLAGS_NO_CHECK6_TX, /* Send zero UDP6 checksums on TX? */ }; struct udp_sock { @@ -45,8 +46,7 @@ struct udp_sock { int pending; /* Any pending frames ? */ __u8 encap_type; /* Is this an Encapsulation socket? */ - unsigned char no_check6_tx:1,/* Send zero UDP6 checksums on TX? */ - no_check6_rx:1,/* Allow zero UDP6 checksums on RX? */ + unsigned char no_check6_rx:1,/* Allow zero UDP6 checksums on RX? */ encap_enabled:1, /* This socket enabled encap * processing; UDP tunnels and * different encapsulation layer set @@ -112,7 +112,7 @@ static inline struct udp_sock *udp_sk(const struct sock *sk) static inline void udp_set_no_check6_tx(struct sock *sk, bool val) { - udp_sk(sk)->no_check6_tx = val; + udp_assign_bit(NO_CHECK6_TX, sk, val); } static inline void udp_set_no_check6_rx(struct sock *sk, bool val) @@ -120,9 +120,9 @@ static inline void udp_set_no_check6_rx(struct sock *sk, bool val) udp_sk(sk)->no_check6_rx = val; } -static inline bool udp_get_no_check6_tx(struct sock *sk) +static inline bool udp_get_no_check6_tx(const struct sock *sk) { - return udp_sk(sk)->no_check6_tx; + return udp_test_bit(NO_CHECK6_TX, sk); } static inline bool udp_get_no_check6_rx(struct sock *sk) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 60a754477efb..513035e83a82 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2711,7 +2711,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, break; case UDP_NO_CHECK6_TX: - up->no_check6_tx = valbool; + udp_set_no_check6_tx(sk, valbool); break; case UDP_NO_CHECK6_RX: @@ -2808,7 +2808,7 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname, break; case UDP_NO_CHECK6_TX: - val = up->no_check6_tx; + val = udp_get_no_check6_tx(sk); break; case UDP_NO_CHECK6_RX: diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 85653e3a04fe..c6e20293c521 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1260,7 +1260,7 @@ static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6, kfree_skb(skb); return -EINVAL; } - if (udp_sk(sk)->no_check6_tx) { + if (udp_get_no_check6_tx(sk)) { kfree_skb(skb); return -EINVAL; } @@ -1281,7 +1281,7 @@ static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6, if (is_udplite) csum = udplite_csum(skb); - else if (udp_sk(sk)->no_check6_tx) { /* UDP csum disabled */ + else if (udp_get_no_check6_tx(sk)) { /* UDP csum disabled */ skb->ip_summed = CHECKSUM_NONE; goto send; } else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */ From a01cff15ccdc3b71b217f8776a84f3c5e136ea0b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 12 Sep 2023 09:17:23 +0000 Subject: [PATCH 069/151] udp: move udp->no_check6_rx to udp->udp_flags [ Upstream commit bcbc1b1de884647aa0318bf74eb7f293d72a1e40 ] syzbot reported that udp->no_check6_rx can be read locklessly. Use one atomic bit from udp->udp_flags. 
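Both no_check6 knobs are toggled from user space via setsockopt(). A
hedged sketch of a tunnel socket opting out of IPv6 UDP checksums in
both directions; the constants come from include/uapi/linux/udp.h and
are provided as a fallback in case the libc headers predate them:

    #include <netinet/in.h>
    #include <sys/socket.h>

    #ifndef UDP_NO_CHECK6_TX
    #define UDP_NO_CHECK6_TX 101
    #define UDP_NO_CHECK6_RX 102
    #endif

    static int allow_zero_udp6_csum(int fd)
    {
        int one = 1;

        /* After these patches each call just sets an atomic udp_flags bit. */
        if (setsockopt(fd, IPPROTO_UDP, UDP_NO_CHECK6_TX, &one, sizeof(one)))
            return -1;
        return setsockopt(fd, IPPROTO_UDP, UDP_NO_CHECK6_RX, &one, sizeof(one));
    }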
Fixes: 1c19448c9ba6 ("net: Make enabling of zero UDP6 csums more restrictive") Reported-by: syzbot Signed-off-by: Eric Dumazet Reviewed-by: Willem de Bruijn Signed-off-by: Paolo Abeni Signed-off-by: Sasha Levin --- include/linux/udp.h | 10 +++++----- net/ipv4/udp.c | 4 ++-- net/ipv6/udp.c | 6 +++--- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/include/linux/udp.h b/include/linux/udp.h index b5ca5760ae34..e6cd46e2b083 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -33,6 +33,7 @@ static inline u32 udp_hashfn(const struct net *net, u32 num, u32 mask) enum { UDP_FLAGS_CORK, /* Cork is required */ UDP_FLAGS_NO_CHECK6_TX, /* Send zero UDP6 checksums on TX? */ + UDP_FLAGS_NO_CHECK6_RX, /* Allow zero UDP6 checksums on RX? */ }; struct udp_sock { @@ -46,8 +47,7 @@ struct udp_sock { int pending; /* Any pending frames ? */ __u8 encap_type; /* Is this an Encapsulation socket? */ - unsigned char no_check6_rx:1,/* Allow zero UDP6 checksums on RX? */ - encap_enabled:1, /* This socket enabled encap + unsigned char encap_enabled:1, /* This socket enabled encap * processing; UDP tunnels and * different encapsulation layer set * this @@ -117,7 +117,7 @@ static inline void udp_set_no_check6_tx(struct sock *sk, bool val) static inline void udp_set_no_check6_rx(struct sock *sk, bool val) { - udp_sk(sk)->no_check6_rx = val; + udp_assign_bit(NO_CHECK6_RX, sk, val); } static inline bool udp_get_no_check6_tx(const struct sock *sk) @@ -125,9 +125,9 @@ static inline bool udp_get_no_check6_tx(const struct sock *sk) return udp_test_bit(NO_CHECK6_TX, sk); } -static inline bool udp_get_no_check6_rx(struct sock *sk) +static inline bool udp_get_no_check6_rx(const struct sock *sk) { - return udp_sk(sk)->no_check6_rx; + return udp_test_bit(NO_CHECK6_RX, sk); } static inline void udp_cmsg_recv(struct msghdr *msg, struct sock *sk, diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 513035e83a82..01e74919885a 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2715,7 +2715,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, break; case UDP_NO_CHECK6_RX: - up->no_check6_rx = valbool; + udp_set_no_check6_rx(sk, valbool); break; case UDP_SEGMENT: @@ -2812,7 +2812,7 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname, break; case UDP_NO_CHECK6_RX: - val = up->no_check6_rx; + val = udp_get_no_check6_rx(sk); break; case UDP_SEGMENT: diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index c6e20293c521..ae4f7f983f95 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -882,7 +882,7 @@ start_lookup: /* If zero checksum and no_check is not on for * the socket then skip it. 
*/ - if (!uh->check && !udp_sk(sk)->no_check6_rx) + if (!uh->check && !udp_get_no_check6_rx(sk)) continue; if (!first) { first = sk; @@ -1000,7 +1000,7 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, if (unlikely(rcu_dereference(sk->sk_rx_dst) != dst)) udp6_sk_rx_dst_set(sk, dst); - if (!uh->check && !udp_sk(sk)->no_check6_rx) { + if (!uh->check && !udp_get_no_check6_rx(sk)) { if (refcounted) sock_put(sk); goto report_csum_error; @@ -1022,7 +1022,7 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, /* Unicast */ sk = __udp6_lib_lookup_skb(skb, uh->source, uh->dest, udptable); if (sk) { - if (!uh->check && !udp_sk(sk)->no_check6_rx) + if (!uh->check && !udp_get_no_check6_rx(sk)) goto report_csum_error; return udp6_unicast_rcv_skb(sk, skb, uh); } From 753886c0b994f66925cc3eaa3dc9b161d4e17827 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 12 Sep 2023 09:17:24 +0000 Subject: [PATCH 070/151] udp: move udp->gro_enabled to udp->udp_flags [ Upstream commit e1dc0615c6b08ef36414f08c011965b8fb56198b ] syzbot reported that udp->gro_enabled can be read locklessly. Use one atomic bit from udp->udp_flags. Fixes: e20cf8d3f1f7 ("udp: implement GRO for plain UDP sockets.") Reported-by: syzbot Signed-off-by: Eric Dumazet Reviewed-by: Willem de Bruijn Signed-off-by: Paolo Abeni Signed-off-by: Sasha Levin --- include/linux/udp.h | 2 +- net/ipv4/udp.c | 6 +++--- net/ipv4/udp_offload.c | 4 ++-- net/ipv6/udp.c | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/include/linux/udp.h b/include/linux/udp.h index e6cd46e2b083..f87e2123fe7b 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -34,6 +34,7 @@ enum { UDP_FLAGS_CORK, /* Cork is required */ UDP_FLAGS_NO_CHECK6_TX, /* Send zero UDP6 checksums on TX? */ UDP_FLAGS_NO_CHECK6_RX, /* Allow zero UDP6 checksums on RX? */ + UDP_FLAGS_GRO_ENABLED, /* Request GRO aggregation */ }; struct udp_sock { @@ -52,7 +53,6 @@ struct udp_sock { * different encapsulation layer set * this */ - gro_enabled:1, /* Request GRO aggregation */ accept_udp_l4:1, accept_udp_fraglist:1; /* indicator bits used by pcflag: */ diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 01e74919885a..28292fcf0707 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1901,7 +1901,7 @@ try_again: (struct sockaddr *)sin); } - if (udp_sk(sk)->gro_enabled) + if (udp_test_bit(GRO_ENABLED, sk)) udp_cmsg_recv(msg, sk, skb); if (inet->cmsg_flags) @@ -2730,7 +2730,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, /* when enabling GRO, accept the related GSO packet type */ if (valbool) udp_tunnel_encap_enable(sk->sk_socket); - up->gro_enabled = valbool; + udp_assign_bit(GRO_ENABLED, sk, valbool); up->accept_udp_l4 = valbool; release_sock(sk); break; @@ -2820,7 +2820,7 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname, break; case UDP_GRO: - val = up->gro_enabled; + val = udp_test_bit(GRO_ENABLED, sk); break; /* The following two cannot be changed on UDP sockets, the return is diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 6d1a4bec2614..8096576fd9bd 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -549,10 +549,10 @@ struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb, NAPI_GRO_CB(skb)->is_flist = 0; if (!sk || !udp_sk(sk)->gro_receive) { if (skb->dev->features & NETIF_F_GRO_FRAGLIST) - NAPI_GRO_CB(skb)->is_flist = sk ? !udp_sk(sk)->gro_enabled : 1; + NAPI_GRO_CB(skb)->is_flist = sk ? 
!udp_test_bit(GRO_ENABLED, sk) : 1; if ((!sk && (skb->dev->features & NETIF_F_GRO_UDP_FWD)) || - (sk && udp_sk(sk)->gro_enabled) || NAPI_GRO_CB(skb)->is_flist) + (sk && udp_test_bit(GRO_ENABLED, sk)) || NAPI_GRO_CB(skb)->is_flist) return call_gro_receive(udp_gro_receive_segment, head, skb); /* no GRO, be sure flush the current packet */ diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index ae4f7f983f95..ddd17b5ea425 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -440,7 +440,7 @@ try_again: (struct sockaddr *)sin6); } - if (udp_sk(sk)->gro_enabled) + if (udp_test_bit(GRO_ENABLED, sk)) udp_cmsg_recv(msg, sk, skb); if (np->rxopt.all) From b680a907d17ca3d2b33c6afcf879f032b6065c9b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 12 Sep 2023 09:17:26 +0000 Subject: [PATCH 071/151] udp: move udp->accept_udp_{l4|fraglist} to udp->udp_flags [ Upstream commit f5f52f0884a595ff99ab1a608643fe4025fca2d5 ] These are read locklessly, move them to udp_flags to fix data-races. Signed-off-by: Eric Dumazet Reviewed-by: Willem de Bruijn Signed-off-by: Paolo Abeni Stable-dep-of: 70a36f571362 ("udp: annotate data-races around udp->encap_type") Signed-off-by: Sasha Levin --- include/linux/udp.h | 16 +++++++++------- net/ipv4/udp.c | 2 +- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/include/linux/udp.h b/include/linux/udp.h index f87e2123fe7b..0e6880856246 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -35,6 +35,8 @@ enum { UDP_FLAGS_NO_CHECK6_TX, /* Send zero UDP6 checksums on TX? */ UDP_FLAGS_NO_CHECK6_RX, /* Allow zero UDP6 checksums on RX? */ UDP_FLAGS_GRO_ENABLED, /* Request GRO aggregation */ + UDP_FLAGS_ACCEPT_FRAGLIST, + UDP_FLAGS_ACCEPT_L4, }; struct udp_sock { @@ -48,13 +50,11 @@ struct udp_sock { int pending; /* Any pending frames ? */ __u8 encap_type; /* Is this an Encapsulation socket? 
*/ - unsigned char encap_enabled:1, /* This socket enabled encap + unsigned char encap_enabled:1; /* This socket enabled encap * processing; UDP tunnels and * different encapsulation layer set * this */ - accept_udp_l4:1, - accept_udp_fraglist:1; /* indicator bits used by pcflag: */ #define UDPLITE_BIT 0x1 /* set by udplite proto init function */ #define UDPLITE_SEND_CC 0x2 /* set via udplite setsockopt */ @@ -146,10 +146,12 @@ static inline bool udp_unexpected_gso(struct sock *sk, struct sk_buff *skb) if (!skb_is_gso(skb)) return false; - if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4 && !udp_sk(sk)->accept_udp_l4) + if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4 && + !udp_test_bit(ACCEPT_L4, sk)) return true; - if (skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST && !udp_sk(sk)->accept_udp_fraglist) + if (skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST && + !udp_test_bit(ACCEPT_FRAGLIST, sk)) return true; return false; @@ -157,8 +159,8 @@ static inline bool udp_unexpected_gso(struct sock *sk, struct sk_buff *skb) static inline void udp_allow_gso(struct sock *sk) { - udp_sk(sk)->accept_udp_l4 = 1; - udp_sk(sk)->accept_udp_fraglist = 1; + udp_set_bit(ACCEPT_L4, sk); + udp_set_bit(ACCEPT_FRAGLIST, sk); } #define udp_portaddr_for_each_entry(__sk, list) \ diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 28292fcf0707..df0ea45b8b8f 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2731,7 +2731,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, if (valbool) udp_tunnel_encap_enable(sk->sk_socket); udp_assign_bit(GRO_ENABLED, sk, valbool); - up->accept_udp_l4 = valbool; + udp_assign_bit(ACCEPT_L4, sk, valbool); release_sock(sk); break; From 8d929b6c11141429f79b6b17ada2eeabcb988b99 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 12 Sep 2023 09:17:27 +0000 Subject: [PATCH 072/151] udp: lockless UDP_ENCAP_L2TPINUDP / UDP_GRO [ Upstream commit ac9a7f4ce5dda1472e8f44096f33066c6ec1a3b4 ] Move udp->encap_enabled to udp->udp_flags. Add udp_test_and_set_bit() helper to allow lockless udp_tunnel_encap_enable() implementation. Signed-off-by: Eric Dumazet Reviewed-by: Willem de Bruijn Signed-off-by: Paolo Abeni Stable-dep-of: 70a36f571362 ("udp: annotate data-races around udp->encap_type") Signed-off-by: Sasha Levin --- include/linux/udp.h | 9 ++++----- include/net/udp_tunnel.h | 9 +++------ net/ipv4/udp.c | 10 +++------- net/ipv4/udp_tunnel_core.c | 2 +- net/ipv6/udp.c | 2 +- 5 files changed, 12 insertions(+), 20 deletions(-) diff --git a/include/linux/udp.h b/include/linux/udp.h index 0e6880856246..efd9ab6df379 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -37,6 +37,7 @@ enum { UDP_FLAGS_GRO_ENABLED, /* Request GRO aggregation */ UDP_FLAGS_ACCEPT_FRAGLIST, UDP_FLAGS_ACCEPT_L4, + UDP_FLAGS_ENCAP_ENABLED, /* This socket enabled encap */ }; struct udp_sock { @@ -50,11 +51,7 @@ struct udp_sock { int pending; /* Any pending frames ? */ __u8 encap_type; /* Is this an Encapsulation socket? 
*/ - unsigned char encap_enabled:1; /* This socket enabled encap - * processing; UDP tunnels and - * different encapsulation layer set - * this - */ + /* indicator bits used by pcflag: */ #define UDPLITE_BIT 0x1 /* set by udplite proto init function */ #define UDPLITE_SEND_CC 0x2 /* set via udplite setsockopt */ @@ -98,6 +95,8 @@ struct udp_sock { test_bit(UDP_FLAGS_##nr, &udp_sk(sk)->udp_flags) #define udp_set_bit(nr, sk) \ set_bit(UDP_FLAGS_##nr, &udp_sk(sk)->udp_flags) +#define udp_test_and_set_bit(nr, sk) \ + test_and_set_bit(UDP_FLAGS_##nr, &udp_sk(sk)->udp_flags) #define udp_clear_bit(nr, sk) \ clear_bit(UDP_FLAGS_##nr, &udp_sk(sk)->udp_flags) #define udp_assign_bit(nr, sk, val) \ diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h index 72394f441dad..e5f81710b18f 100644 --- a/include/net/udp_tunnel.h +++ b/include/net/udp_tunnel.h @@ -174,16 +174,13 @@ static inline int udp_tunnel_handle_offloads(struct sk_buff *skb, bool udp_csum) } #endif -static inline void udp_tunnel_encap_enable(struct socket *sock) +static inline void udp_tunnel_encap_enable(struct sock *sk) { - struct udp_sock *up = udp_sk(sock->sk); - - if (up->encap_enabled) + if (udp_test_and_set_bit(ENCAP_ENABLED, sk)) return; - up->encap_enabled = 1; #if IS_ENABLED(CONFIG_IPV6) - if (sock->sk->sk_family == PF_INET6) + if (READ_ONCE(sk->sk_family) == PF_INET6) ipv6_stub->udpv6_encap_enable(); #endif udp_encap_enable(); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index df0ea45b8b8f..267f77633a8f 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2645,7 +2645,7 @@ void udp_destroy_sock(struct sock *sk) if (encap_destroy) encap_destroy(sk); } - if (up->encap_enabled) + if (udp_test_bit(ENCAP_ENABLED, sk)) static_branch_dec(&udp_encap_needed_key); } } @@ -2700,9 +2700,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, fallthrough; case UDP_ENCAP_L2TPINUDP: up->encap_type = val; - lock_sock(sk); - udp_tunnel_encap_enable(sk->sk_socket); - release_sock(sk); + udp_tunnel_encap_enable(sk); break; default: err = -ENOPROTOOPT; @@ -2725,14 +2723,12 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, break; case UDP_GRO: - lock_sock(sk); /* when enabling GRO, accept the related GSO packet type */ if (valbool) - udp_tunnel_encap_enable(sk->sk_socket); + udp_tunnel_encap_enable(sk); udp_assign_bit(GRO_ENABLED, sk, valbool); udp_assign_bit(ACCEPT_L4, sk, valbool); - release_sock(sk); break; /* diff --git a/net/ipv4/udp_tunnel_core.c b/net/ipv4/udp_tunnel_core.c index 5f8104cf082d..732e21b75ba2 100644 --- a/net/ipv4/udp_tunnel_core.c +++ b/net/ipv4/udp_tunnel_core.c @@ -78,7 +78,7 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock, udp_sk(sk)->gro_receive = cfg->gro_receive; udp_sk(sk)->gro_complete = cfg->gro_complete; - udp_tunnel_encap_enable(sock); + udp_tunnel_encap_enable(sk); } EXPORT_SYMBOL_GPL(setup_udp_tunnel_sock); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index ddd17b5ea425..5b7c4f8e2ed0 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1688,7 +1688,7 @@ void udpv6_destroy_sock(struct sock *sk) if (encap_destroy) encap_destroy(sk); } - if (up->encap_enabled) { + if (udp_test_bit(ENCAP_ENABLED, sk)) { static_branch_dec(&udpv6_encap_needed_key); udp_encap_disable(); } From 158b71f3a9fa46431cb1f745c8024c4c96ea159d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 12 Sep 2023 09:17:28 +0000 Subject: [PATCH 073/151] udp: annotate data-races around udp->encap_type [ Upstream commit 70a36f571362a8de8b8c02d21ae524fc776287f2 ] syzbot/KCSAN complained about 
UDP_ENCAP_L2TPINUDP setsockopt() racing. Add READ_ONCE()/WRITE_ONCE() to document races on this lockless field. syzbot report was: BUG: KCSAN: data-race in udp_lib_setsockopt / udp_lib_setsockopt read-write to 0xffff8881083603fa of 1 bytes by task 16557 on cpu 0: udp_lib_setsockopt+0x682/0x6c0 udp_setsockopt+0x73/0xa0 net/ipv4/udp.c:2779 sock_common_setsockopt+0x61/0x70 net/core/sock.c:3697 __sys_setsockopt+0x1c9/0x230 net/socket.c:2263 __do_sys_setsockopt net/socket.c:2274 [inline] __se_sys_setsockopt net/socket.c:2271 [inline] __x64_sys_setsockopt+0x66/0x80 net/socket.c:2271 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd read-write to 0xffff8881083603fa of 1 bytes by task 16554 on cpu 1: udp_lib_setsockopt+0x682/0x6c0 udp_setsockopt+0x73/0xa0 net/ipv4/udp.c:2779 sock_common_setsockopt+0x61/0x70 net/core/sock.c:3697 __sys_setsockopt+0x1c9/0x230 net/socket.c:2263 __do_sys_setsockopt net/socket.c:2274 [inline] __se_sys_setsockopt net/socket.c:2271 [inline] __x64_sys_setsockopt+0x66/0x80 net/socket.c:2271 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd value changed: 0x01 -> 0x05 Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 16554 Comm: syz-executor.5 Not tainted 6.5.0-rc7-syzkaller-00004-gf7757129e3de #0 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: syzbot Signed-off-by: Eric Dumazet Reviewed-by: Willem de Bruijn Signed-off-by: Paolo Abeni Signed-off-by: Sasha Levin --- drivers/net/gtp.c | 4 ++-- net/ipv4/udp.c | 9 +++++---- net/ipv4/xfrm4_input.c | 4 ++-- net/ipv6/udp.c | 5 +++-- net/ipv6/xfrm6_input.c | 4 ++-- net/l2tp/l2tp_core.c | 6 +++--- 6 files changed, 17 insertions(+), 15 deletions(-) diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c index 477b4d4f860b..bace989591f7 100644 --- a/drivers/net/gtp.c +++ b/drivers/net/gtp.c @@ -629,7 +629,7 @@ static void __gtp_encap_destroy(struct sock *sk) gtp->sk0 = NULL; else gtp->sk1u = NULL; - udp_sk(sk)->encap_type = 0; + WRITE_ONCE(udp_sk(sk)->encap_type, 0); rcu_assign_sk_user_data(sk, NULL); release_sock(sk); sock_put(sk); @@ -681,7 +681,7 @@ static int gtp_encap_recv(struct sock *sk, struct sk_buff *skb) netdev_dbg(gtp->dev, "encap_recv sk=%p\n", sk); - switch (udp_sk(sk)->encap_type) { + switch (READ_ONCE(udp_sk(sk)->encap_type)) { case UDP_ENCAP_GTP0: netdev_dbg(gtp->dev, "received GTP0 packet\n"); ret = gtp0_udp_encap_recv(gtp, skb); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 267f77633a8f..5672d9a86c5d 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -733,7 +733,7 @@ int __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) iph->saddr, uh->source, skb->dev->ifindex, inet_sdif(skb), udptable, NULL); - if (!sk || udp_sk(sk)->encap_type) { + if (!sk || READ_ONCE(udp_sk(sk)->encap_type)) { /* No socket for error: try tunnels before discarding */ if (static_branch_unlikely(&udp_encap_needed_key)) { sk = __udp4_lib_err_encap(net, iph, uh, udptable, sk, skb, @@ -2114,7 +2114,8 @@ static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb) } nf_reset_ct(skb); - if (static_branch_unlikely(&udp_encap_needed_key) && up->encap_type) { + if (static_branch_unlikely(&udp_encap_needed_key) && + READ_ONCE(up->encap_type)) { int (*encap_rcv)(struct sock *sk, struct sk_buff *skb); /* @@ -2699,7 +2700,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, #endif fallthrough; case 
UDP_ENCAP_L2TPINUDP: - up->encap_type = val; + WRITE_ONCE(up->encap_type, val); udp_tunnel_encap_enable(sk); break; default: @@ -2800,7 +2801,7 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname, break; case UDP_ENCAP: - val = up->encap_type; + val = READ_ONCE(up->encap_type); break; case UDP_NO_CHECK6_TX: diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c index eac206a290d0..183f6dc37242 100644 --- a/net/ipv4/xfrm4_input.c +++ b/net/ipv4/xfrm4_input.c @@ -85,11 +85,11 @@ int xfrm4_udp_encap_rcv(struct sock *sk, struct sk_buff *skb) struct udphdr *uh; struct iphdr *iph; int iphlen, len; - __u8 *udpdata; __be32 *udpdata32; - __u16 encap_type = up->encap_type; + u16 encap_type; + encap_type = READ_ONCE(up->encap_type); /* if this is not encapsulated socket, then just return now */ if (!encap_type) return 1; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 5b7c4f8e2ed0..961106eda69d 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -598,7 +598,7 @@ int __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, sk = __udp6_lib_lookup(net, daddr, uh->dest, saddr, uh->source, inet6_iif(skb), inet6_sdif(skb), udptable, NULL); - if (!sk || udp_sk(sk)->encap_type) { + if (!sk || READ_ONCE(udp_sk(sk)->encap_type)) { /* No socket for error: try tunnels before discarding */ if (static_branch_unlikely(&udpv6_encap_needed_key)) { sk = __udp6_lib_err_encap(net, hdr, offset, uh, @@ -712,7 +712,8 @@ static int udpv6_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb) } nf_reset_ct(skb); - if (static_branch_unlikely(&udpv6_encap_needed_key) && up->encap_type) { + if (static_branch_unlikely(&udpv6_encap_needed_key) && + READ_ONCE(up->encap_type)) { int (*encap_rcv)(struct sock *sk, struct sk_buff *skb); /* diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c index 4907ab241d6b..4156387248e4 100644 --- a/net/ipv6/xfrm6_input.c +++ b/net/ipv6/xfrm6_input.c @@ -81,14 +81,14 @@ int xfrm6_udp_encap_rcv(struct sock *sk, struct sk_buff *skb) struct ipv6hdr *ip6h; int len; int ip6hlen = sizeof(struct ipv6hdr); - __u8 *udpdata; __be32 *udpdata32; - __u16 encap_type = up->encap_type; + u16 encap_type; if (skb->protocol == htons(ETH_P_IP)) return xfrm4_udp_encap_rcv(sk, skb); + encap_type = READ_ONCE(up->encap_type); /* if this is not encapsulated socket, then just return now */ if (!encap_type) return 1; diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 03608d3ded4b..8d21ff25f160 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1139,9 +1139,9 @@ static void l2tp_tunnel_destruct(struct sock *sk) switch (tunnel->encap) { case L2TP_ENCAPTYPE_UDP: /* No longer an encapsulation socket. See net/ipv4/udp.c */ - (udp_sk(sk))->encap_type = 0; - (udp_sk(sk))->encap_rcv = NULL; - (udp_sk(sk))->encap_destroy = NULL; + WRITE_ONCE(udp_sk(sk)->encap_type, 0); + udp_sk(sk)->encap_rcv = NULL; + udp_sk(sk)->encap_destroy = NULL; break; case L2TP_ENCAPTYPE_IP: break; From 343bb27e31528a2f928f6d6ca1b835c058e1394c Mon Sep 17 00:00:00 2001 From: Rotem Saado Date: Wed, 4 Oct 2023 12:36:22 +0300 Subject: [PATCH 074/151] wifi: iwlwifi: yoyo: swap cdb and jacket bits values [ Upstream commit 65008777b9dcd2002414ddb2c2158293a6e2fd6f ] The bits are wrong, the jacket bit should be 5 and cdb bit 4. Fix it. 
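For clarity, the intended decode after the swap looks like this; only
the macro names and bit positions come from the driver, and the
register-read helper is a hypothetical placeholder:

    #define WFPM_OTP_CFG1_IS_JACKET_BIT BIT(5)
    #define WFPM_OTP_CFG1_IS_CDB_BIT    BIT(4)

    u32 cfg1 = read_wfpm_otp_cfg1(trans);   /* hypothetical accessor */
    bool is_jacket = !!(cfg1 & WFPM_OTP_CFG1_IS_JACKET_BIT);
    bool is_cdb    = !!(cfg1 & WFPM_OTP_CFG1_IS_CDB_BIT);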
Fixes: 1f171f4f1437 ("iwlwifi: Add support for getting rf id with blank otp") Signed-off-by: Rotem Saado Signed-off-by: Gregory Greenman Link: https://lore.kernel.org/r/20231004123422.356d8dacda2f.I349ab888b43a11baa2453a1d6978a6a703e422f0@changeid Signed-off-by: Johannes Berg Signed-off-by: Sasha Levin --- drivers/net/wireless/intel/iwlwifi/iwl-prph.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-prph.h b/drivers/net/wireless/intel/iwlwifi/iwl-prph.h index 157d1f31c487..c5a306b01fe2 100644 --- a/drivers/net/wireless/intel/iwlwifi/iwl-prph.h +++ b/drivers/net/wireless/intel/iwlwifi/iwl-prph.h @@ -348,8 +348,8 @@ #define RFIC_REG_RD 0xAD0470 #define WFPM_CTRL_REG 0xA03030 #define WFPM_OTP_CFG1_ADDR 0x00a03098 -#define WFPM_OTP_CFG1_IS_JACKET_BIT BIT(4) -#define WFPM_OTP_CFG1_IS_CDB_BIT BIT(5) +#define WFPM_OTP_CFG1_IS_JACKET_BIT BIT(5) +#define WFPM_OTP_CFG1_IS_CDB_BIT BIT(4) #define WFPM_GP2 0xA030B4 From 5db8b93cbe2d291ac507e7cd9accd0d578271c0e Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Fri, 27 Jan 2023 12:43:42 +0100 Subject: [PATCH 075/151] arm64: dts: qcom: sdm845: align RPMh regulator nodes with bindings [ Upstream commit 86dd19bbdea2b7d3feb69c0c39f141de30a18ec9 ] Device node names should be generic and bindings expect certain pattern for RPMh regulator nodes. Signed-off-by: Krzysztof Kozlowski Reviewed-by: Konrad Dybcio Signed-off-by: Bjorn Andersson Link: https://lore.kernel.org/r/20230127114347.235963-6-krzysztof.kozlowski@linaro.org Stable-dep-of: a5f01673d394 ("arm64: dts: qcom: sdm845: Fix PSCI power domain names") Signed-off-by: Sasha Levin --- arch/arm64/boot/dts/qcom/sdm845-cheza.dtsi | 4 ++-- arch/arm64/boot/dts/qcom/sdm845-db845c.dts | 4 ++-- arch/arm64/boot/dts/qcom/sdm845-lg-common.dtsi | 6 +++--- arch/arm64/boot/dts/qcom/sdm845-mtp.dts | 6 +++--- arch/arm64/boot/dts/qcom/sdm845-oneplus-common.dtsi | 6 +++--- arch/arm64/boot/dts/qcom/sdm845-shift-axolotl.dts | 6 +++--- arch/arm64/boot/dts/qcom/sdm845-sony-xperia-tama.dtsi | 6 +++--- arch/arm64/boot/dts/qcom/sdm845-xiaomi-beryllium.dts | 2 +- arch/arm64/boot/dts/qcom/sdm845-xiaomi-polaris.dts | 6 +++--- arch/arm64/boot/dts/qcom/sdm850-lenovo-yoga-c630.dts | 2 +- arch/arm64/boot/dts/qcom/sdm850-samsung-w737.dts | 2 +- 11 files changed, 25 insertions(+), 25 deletions(-) diff --git a/arch/arm64/boot/dts/qcom/sdm845-cheza.dtsi b/arch/arm64/boot/dts/qcom/sdm845-cheza.dtsi index a5c0c788969f..985824032c52 100644 --- a/arch/arm64/boot/dts/qcom/sdm845-cheza.dtsi +++ b/arch/arm64/boot/dts/qcom/sdm845-cheza.dtsi @@ -351,7 +351,7 @@ &apps_rsc { - pm8998-rpmh-regulators { + regulators-0 { compatible = "qcom,pm8998-rpmh-regulators"; qcom,pmic-id = "a"; @@ -633,7 +633,7 @@ }; }; - pm8005-rpmh-regulators { + regulators-1 { compatible = "qcom,pm8005-rpmh-regulators"; qcom,pmic-id = "c"; diff --git a/arch/arm64/boot/dts/qcom/sdm845-db845c.dts b/arch/arm64/boot/dts/qcom/sdm845-db845c.dts index c9efcb894a52..8c9ccf5b4ea4 100644 --- a/arch/arm64/boot/dts/qcom/sdm845-db845c.dts +++ b/arch/arm64/boot/dts/qcom/sdm845-db845c.dts @@ -271,7 +271,7 @@ }; &apps_rsc { - pm8998-rpmh-regulators { + regulators-0 { compatible = "qcom,pm8998-rpmh-regulators"; qcom,pmic-id = "a"; vdd-s1-supply = <&vph_pwr>; @@ -396,7 +396,7 @@ }; }; - pmi8998-rpmh-regulators { + regulators-1 { compatible = "qcom,pmi8998-rpmh-regulators"; qcom,pmic-id = "b"; diff --git a/arch/arm64/boot/dts/qcom/sdm845-lg-common.dtsi b/arch/arm64/boot/dts/qcom/sdm845-lg-common.dtsi index 20f275f8694d..e2921640880a 
100644 --- a/arch/arm64/boot/dts/qcom/sdm845-lg-common.dtsi +++ b/arch/arm64/boot/dts/qcom/sdm845-lg-common.dtsi @@ -166,7 +166,7 @@ }; &apps_rsc { - pm8998-rpmh-regulators { + regulators-0 { compatible = "qcom,pm8998-rpmh-regulators"; qcom,pmic-id = "a"; @@ -419,7 +419,7 @@ }; }; - pmi8998-rpmh-regulators { + regulators-1 { compatible = "qcom,pmi8998-rpmh-regulators"; qcom,pmic-id = "b"; @@ -433,7 +433,7 @@ }; }; - pm8005-rpmh-regulators { + regulators-2 { compatible = "qcom,pm8005-rpmh-regulators"; qcom,pmic-id = "c"; diff --git a/arch/arm64/boot/dts/qcom/sdm845-mtp.dts b/arch/arm64/boot/dts/qcom/sdm845-mtp.dts index 64958dee17d8..b47e333aa351 100644 --- a/arch/arm64/boot/dts/qcom/sdm845-mtp.dts +++ b/arch/arm64/boot/dts/qcom/sdm845-mtp.dts @@ -117,7 +117,7 @@ }; &apps_rsc { - pm8998-rpmh-regulators { + regulators-0 { compatible = "qcom,pm8998-rpmh-regulators"; qcom,pmic-id = "a"; @@ -382,7 +382,7 @@ }; }; - pmi8998-rpmh-regulators { + regulators-1 { compatible = "qcom,pmi8998-rpmh-regulators"; qcom,pmic-id = "b"; @@ -396,7 +396,7 @@ }; }; - pm8005-rpmh-regulators { + regulators-2 { compatible = "qcom,pm8005-rpmh-regulators"; qcom,pmic-id = "c"; diff --git a/arch/arm64/boot/dts/qcom/sdm845-oneplus-common.dtsi b/arch/arm64/boot/dts/qcom/sdm845-oneplus-common.dtsi index 392461c29e76..0713b774a97b 100644 --- a/arch/arm64/boot/dts/qcom/sdm845-oneplus-common.dtsi +++ b/arch/arm64/boot/dts/qcom/sdm845-oneplus-common.dtsi @@ -144,7 +144,7 @@ }; &apps_rsc { - pm8998-rpmh-regulators { + regulators-0 { compatible = "qcom,pm8998-rpmh-regulators"; qcom,pmic-id = "a"; @@ -280,7 +280,7 @@ }; }; - pmi8998-rpmh-regulators { + regulators-1 { compatible = "qcom,pmi8998-rpmh-regulators"; qcom,pmic-id = "b"; @@ -294,7 +294,7 @@ }; }; - pm8005-rpmh-regulators { + regulators-2 { compatible = "qcom,pm8005-rpmh-regulators"; qcom,pmic-id = "c"; diff --git a/arch/arm64/boot/dts/qcom/sdm845-shift-axolotl.dts b/arch/arm64/boot/dts/qcom/sdm845-shift-axolotl.dts index 83261c9bb4f2..b65c35865dab 100644 --- a/arch/arm64/boot/dts/qcom/sdm845-shift-axolotl.dts +++ b/arch/arm64/boot/dts/qcom/sdm845-shift-axolotl.dts @@ -110,7 +110,7 @@ }; &apps_rsc { - pm8998-rpmh-regulators { + regulators-0 { compatible = "qcom,pm8998-rpmh-regulators"; qcom,pmic-id = "a"; @@ -375,7 +375,7 @@ }; }; - pmi8998-rpmh-regulators { + regulators-1 { compatible = "qcom,pmi8998-rpmh-regulators"; qcom,pmic-id = "b"; @@ -389,7 +389,7 @@ }; }; - pm8005-rpmh-regulators { + regulators-2 { compatible = "qcom,pm8005-rpmh-regulators"; qcom,pmic-id = "c"; diff --git a/arch/arm64/boot/dts/qcom/sdm845-sony-xperia-tama.dtsi b/arch/arm64/boot/dts/qcom/sdm845-sony-xperia-tama.dtsi index d6918e6d1979..249a715d5aae 100644 --- a/arch/arm64/boot/dts/qcom/sdm845-sony-xperia-tama.dtsi +++ b/arch/arm64/boot/dts/qcom/sdm845-sony-xperia-tama.dtsi @@ -78,7 +78,7 @@ }; &apps_rsc { - pm8998-rpmh-regulators { + regulators-0 { compatible = "qcom,pm8998-rpmh-regulators"; qcom,pmic-id = "a"; @@ -308,7 +308,7 @@ }; }; - pmi8998-rpmh-regulators { + regulators-1 { compatible = "qcom,pmi8998-rpmh-regulators"; qcom,pmic-id = "b"; @@ -319,7 +319,7 @@ }; }; - pm8005-rpmh-regulators { + regulators-2 { compatible = "qcom,pm8005-rpmh-regulators"; qcom,pmic-id = "c"; diff --git a/arch/arm64/boot/dts/qcom/sdm845-xiaomi-beryllium.dts b/arch/arm64/boot/dts/qcom/sdm845-xiaomi-beryllium.dts index 0f470cf1ed1c..6d6b3dd69947 100644 --- a/arch/arm64/boot/dts/qcom/sdm845-xiaomi-beryllium.dts +++ b/arch/arm64/boot/dts/qcom/sdm845-xiaomi-beryllium.dts @@ -125,7 +125,7 @@ }; &apps_rsc { - 
pm8998-rpmh-regulators { + regulators-0 { compatible = "qcom,pm8998-rpmh-regulators"; qcom,pmic-id = "a"; diff --git a/arch/arm64/boot/dts/qcom/sdm845-xiaomi-polaris.dts b/arch/arm64/boot/dts/qcom/sdm845-xiaomi-polaris.dts index 093b04359ec3..ffbe45a99b74 100644 --- a/arch/arm64/boot/dts/qcom/sdm845-xiaomi-polaris.dts +++ b/arch/arm64/boot/dts/qcom/sdm845-xiaomi-polaris.dts @@ -143,7 +143,7 @@ }; &apps_rsc { - pm8998-rpmh-regulators { + regulators-0 { compatible = "qcom,pm8998-rpmh-regulators"; qcom,pmic-id = "a"; @@ -343,7 +343,7 @@ }; }; - pmi8998-rpmh-regulators { + regulators-1 { compatible = "qcom,pmi8998-rpmh-regulators"; qcom,pmic-id = "b"; @@ -355,7 +355,7 @@ }; }; - pm8005-rpmh-regulators { + regulators-2 { compatible = "qcom,pm8005-rpmh-regulators"; qcom,pmic-id = "c"; diff --git a/arch/arm64/boot/dts/qcom/sdm850-lenovo-yoga-c630.dts b/arch/arm64/boot/dts/qcom/sdm850-lenovo-yoga-c630.dts index 74f43da51fa5..48a41ace8fc5 100644 --- a/arch/arm64/boot/dts/qcom/sdm850-lenovo-yoga-c630.dts +++ b/arch/arm64/boot/dts/qcom/sdm850-lenovo-yoga-c630.dts @@ -99,7 +99,7 @@ }; &apps_rsc { - pm8998-rpmh-regulators { + regulators-0 { compatible = "qcom,pm8998-rpmh-regulators"; qcom,pmic-id = "a"; diff --git a/arch/arm64/boot/dts/qcom/sdm850-samsung-w737.dts b/arch/arm64/boot/dts/qcom/sdm850-samsung-w737.dts index d028a7eb364a..c169d2870bdf 100644 --- a/arch/arm64/boot/dts/qcom/sdm850-samsung-w737.dts +++ b/arch/arm64/boot/dts/qcom/sdm850-samsung-w737.dts @@ -129,7 +129,7 @@ }; &apps_rsc { - pm8998-rpmh-regulators { + regulators-0 { compatible = "qcom,pm8998-rpmh-regulators"; qcom,pmic-id = "a"; From 51a1b943022fe97d78b3f6490dce1e67bf52d060 Mon Sep 17 00:00:00 2001 From: David Heidelberg Date: Tue, 12 Sep 2023 12:42:03 +0530 Subject: [PATCH 076/151] arm64: dts: qcom: sdm845: Fix PSCI power domain names [ Upstream commit a5f01673d3946e424091e6b8ff274716f9c21454 ] The original commit hasn't been updated according to refactoring done in sdm845.dtsi. 
Fixes: a1ade6cac5a2 ("arm64: dts: qcom: sdm845: Switch PSCI cpu idle states from PC to OSI") Suggested-by: Dmitry Baryshkov Reviewed-by: Douglas Anderson Signed-off-by: David Heidelberg Reviewed-by: Stephen Boyd Reviewed-by: Abel Vesa Link: https://lore.kernel.org/r/20230912071205.11502-1-david@ixit.cz Signed-off-by: Bjorn Andersson Signed-off-by: Sasha Levin --- arch/arm64/boot/dts/qcom/sdm845-cheza.dtsi | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/arch/arm64/boot/dts/qcom/sdm845-cheza.dtsi b/arch/arm64/boot/dts/qcom/sdm845-cheza.dtsi index 985824032c52..43ee28db61aa 100644 --- a/arch/arm64/boot/dts/qcom/sdm845-cheza.dtsi +++ b/arch/arm64/boot/dts/qcom/sdm845-cheza.dtsi @@ -150,15 +150,15 @@ }; &psci { - /delete-node/ cpu0; - /delete-node/ cpu1; - /delete-node/ cpu2; - /delete-node/ cpu3; - /delete-node/ cpu4; - /delete-node/ cpu5; - /delete-node/ cpu6; - /delete-node/ cpu7; - /delete-node/ cpu-cluster0; + /delete-node/ power-domain-cpu0; + /delete-node/ power-domain-cpu1; + /delete-node/ power-domain-cpu2; + /delete-node/ power-domain-cpu3; + /delete-node/ power-domain-cpu4; + /delete-node/ power-domain-cpu5; + /delete-node/ power-domain-cpu6; + /delete-node/ power-domain-cpu7; + /delete-node/ power-domain-cluster; }; &cpus { @@ -351,6 +351,8 @@ &apps_rsc { + /delete-property/ power-domains; + regulators-0 { compatible = "qcom,pm8998-rpmh-regulators"; qcom,pmic-id = "a"; From f2a79f3651a54015507655295931bbf70f90dc8e Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Sat, 27 May 2023 11:28:36 +0200 Subject: [PATCH 077/151] fbdev: imsttfb: Release framebuffer and dealloc cmap on error path [ Upstream commit 5cf9a090a39c97f4506b7b53739d469b1c05a7e9 ] Add missing cleanups in error path. Signed-off-by: Helge Deller Stable-dep-of: e08c30efda21 ("fbdev: imsttfb: fix double free in probe()") Signed-off-by: Sasha Levin --- drivers/video/fbdev/imsttfb.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/video/fbdev/imsttfb.c b/drivers/video/fbdev/imsttfb.c index b194e71f07bf..3d1ae5267a73 100644 --- a/drivers/video/fbdev/imsttfb.c +++ b/drivers/video/fbdev/imsttfb.c @@ -1452,9 +1452,13 @@ static int init_imstt(struct fb_info *info) FBINFO_HWACCEL_FILLRECT | FBINFO_HWACCEL_YPAN; - fb_alloc_cmap(&info->cmap, 0, 0); + if (fb_alloc_cmap(&info->cmap, 0, 0)) { + framebuffer_release(info); + return -ENODEV; + } if (register_framebuffer(info) < 0) { + fb_dealloc_cmap(&info->cmap); framebuffer_release(info); return -ENODEV; } From eb4f2e17886ad8d830044916ee614abf88c56349 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 27 Oct 2023 15:04:56 +0300 Subject: [PATCH 078/151] fbdev: imsttfb: fix double free in probe() [ Upstream commit e08c30efda21ef4c0ec084a3a9581c220b442ba9 ] The init_imstt() function calls framebuffer_release() on error and then the probe() function calls it again. It should only be done in probe. 
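To make the resulting ownership rule concrete, here is a minimal sketch, assuming hypothetical demo_init()/demo_probe() helpers (only the fb_* calls correspond to the real fbdev API): the function that allocates the fb_info is the only place that releases it, and each function unwinds only what it set up itself.

#include <linux/fb.h>
#include <linux/pci.h>

static int demo_init(struct fb_info *info)
{
	/* cmap allocation can fail and must be checked... */
	if (fb_alloc_cmap(&info->cmap, 0, 0))
		return -ENODEV;		/* ...but no framebuffer_release() here */

	if (register_framebuffer(info) < 0) {
		fb_dealloc_cmap(&info->cmap);	/* undo only our own work */
		return -ENODEV;
	}
	return 0;
}

static int demo_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
{
	struct fb_info *info = framebuffer_alloc(0, &pdev->dev);

	if (!info)
		return -ENOMEM;
	if (demo_init(info)) {
		framebuffer_release(info);	/* the single release point on error */
		return -ENODEV;
	}
	return 0;
}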
Fixes: 518ecb6a209f ("fbdev: imsttfb: Fix error path of imsttfb_probe()") Signed-off-by: Dan Carpenter Signed-off-by: Helge Deller Signed-off-by: Sasha Levin --- drivers/video/fbdev/imsttfb.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/video/fbdev/imsttfb.c b/drivers/video/fbdev/imsttfb.c index 3d1ae5267a73..aa51cb72cbba 100644 --- a/drivers/video/fbdev/imsttfb.c +++ b/drivers/video/fbdev/imsttfb.c @@ -1419,7 +1419,6 @@ static int init_imstt(struct fb_info *info) if ((info->var.xres * info->var.yres) * (info->var.bits_per_pixel >> 3) > info->fix.smem_len || !(compute_imstt_regvals(par, info->var.xres, info->var.yres))) { printk("imsttfb: %ux%ux%u not supported\n", info->var.xres, info->var.yres, info->var.bits_per_pixel); - framebuffer_release(info); return -ENODEV; } @@ -1452,14 +1451,11 @@ static int init_imstt(struct fb_info *info) FBINFO_HWACCEL_FILLRECT | FBINFO_HWACCEL_YPAN; - if (fb_alloc_cmap(&info->cmap, 0, 0)) { - framebuffer_release(info); + if (fb_alloc_cmap(&info->cmap, 0, 0)) return -ENODEV; - } if (register_framebuffer(info) < 0) { fb_dealloc_cmap(&info->cmap); - framebuffer_release(info); return -ENODEV; } From 743f3548d3018d1f25c8d7ef8e22baad2d06bb9b Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 6 Dec 2022 15:33:43 -0800 Subject: [PATCH 079/151] bpf: decouple prune and jump points [ Upstream commit bffdeaa8a5af7200b0e74c9d5a41167f86626a36 ] BPF verifier marks some instructions as prune points. Currently these prune points serve two purposes. It's a point where verifier tries to find previously verified state and check current state's equivalence to short circuit verification for current code path. But also currently it's a point where jump history, used for precision backtracking, is updated. This is done so that non-linear flow of execution could be properly backtracked. Such coupling is coincidental and unnecessary. Some prune points are not part of some non-linear jump path, so don't need update of jump history. On the other hand, not all instructions which have to be recorded in jump history necessarily are good prune points. This patch splits prune and jump points into independent flags. Currently all prune points are marked as jump points to minimize amount of changes in this patch, but next patch will perform some optimization of prune vs jmp point placement. No functional changes are intended. 
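In short, the two concerns end up gated independently. A rough sketch of the resulting control flow in is_state_visited(), simplified from the hunks below (the elided body stands for the unchanged state-search logic):

static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
{
	struct bpf_verifier_state *cur = env->cur_state;

	if (!is_prune_point(env, insn_idx))
		/* No state-equivalence search here, but jump history is
		 * still recorded if this insn was marked as a jmp point
		 * (push_jmp_history() checks is_jmp_point() itself). */
		return push_jmp_history(env, cur);

	/* ... usual search for an equivalent verified state ... */
	return 0;
}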
Acked-by: John Fastabend Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20221206233345.438540-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov Stable-dep-of: 3feb263bb516 ("bpf: handle ldimm64 properly in check_cfg()") Signed-off-by: Sasha Levin --- include/linux/bpf_verifier.h | 1 + kernel/bpf/verifier.c | 57 +++++++++++++++++++++++++++--------- 2 files changed, 44 insertions(+), 14 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 1a32baa78ce2..f080ccf27d25 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -429,6 +429,7 @@ struct bpf_insn_aux_data { /* below fields are initialized once */ unsigned int orig_idx; /* original instruction index */ bool prune_point; + bool jmp_point; }; #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ee6e811b4315..ec688665aaa2 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2512,6 +2512,16 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, return 0; } +static void mark_jmp_point(struct bpf_verifier_env *env, int idx) +{ + env->insn_aux_data[idx].jmp_point = true; +} + +static bool is_jmp_point(struct bpf_verifier_env *env, int insn_idx) +{ + return env->insn_aux_data[insn_idx].jmp_point; +} + /* for any branch, call, exit record the history of jmps in the given state */ static int push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_state *cur) @@ -2520,6 +2530,9 @@ static int push_jmp_history(struct bpf_verifier_env *env, struct bpf_idx_pair *p; size_t alloc_size; + if (!is_jmp_point(env, env->insn_idx)) + return 0; + cnt++; alloc_size = kmalloc_size_roundup(size_mul(cnt, sizeof(*p))); p = krealloc(cur->jmp_history, alloc_size, GFP_USER); @@ -11000,11 +11013,16 @@ static struct bpf_verifier_state_list **explored_state( return &env->explored_states[(idx ^ state->callsite) % state_htab_size(env)]; } -static void init_explored_state(struct bpf_verifier_env *env, int idx) +static void mark_prune_point(struct bpf_verifier_env *env, int idx) { env->insn_aux_data[idx].prune_point = true; } +static bool is_prune_point(struct bpf_verifier_env *env, int insn_idx) +{ + return env->insn_aux_data[insn_idx].prune_point; +} + enum { DONE_EXPLORING = 0, KEEP_EXPLORING = 1, @@ -11033,9 +11051,11 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env, return -EINVAL; } - if (e == BRANCH) + if (e == BRANCH) { /* mark branch target for state pruning */ - init_explored_state(env, w); + mark_prune_point(env, w); + mark_jmp_point(env, w); + } if (insn_state[w] == 0) { /* tree-edge */ @@ -11073,10 +11093,13 @@ static int visit_func_call_insn(int t, int insn_cnt, if (ret) return ret; - if (t + 1 < insn_cnt) - init_explored_state(env, t + 1); + if (t + 1 < insn_cnt) { + mark_prune_point(env, t + 1); + mark_jmp_point(env, t + 1); + } if (visit_callee) { - init_explored_state(env, t); + mark_prune_point(env, t); + mark_jmp_point(env, t); ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env, /* It's ok to allow recursion from CFG point of * view. 
__check_func_call() will do the actual @@ -11110,13 +11133,15 @@ static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env) return DONE_EXPLORING; case BPF_CALL: - if (insns[t].imm == BPF_FUNC_timer_set_callback) + if (insns[t].imm == BPF_FUNC_timer_set_callback) { /* Mark this call insn to trigger is_state_visited() check * before call itself is processed by __check_func_call(). * Otherwise new async state will be pushed for further * exploration. */ - init_explored_state(env, t); + mark_prune_point(env, t); + mark_jmp_point(env, t); + } return visit_func_call_insn(t, insn_cnt, insns, env, insns[t].src_reg == BPF_PSEUDO_CALL); @@ -11134,18 +11159,22 @@ static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env) * but it's marked, since backtracking needs * to record jmp history in is_state_visited(). */ - init_explored_state(env, t + insns[t].off + 1); + mark_prune_point(env, t + insns[t].off + 1); + mark_jmp_point(env, t + insns[t].off + 1); /* tell verifier to check for equivalent states * after every call and jump */ - if (t + 1 < insn_cnt) - init_explored_state(env, t + 1); + if (t + 1 < insn_cnt) { + mark_prune_point(env, t + 1); + mark_jmp_point(env, t + 1); + } return ret; default: /* conditional jump with two edges */ - init_explored_state(env, t); + mark_prune_point(env, t); + mark_jmp_point(env, t); ret = push_insn(t, t + 1, FALLTHROUGH, env, true); if (ret) return ret; @@ -12178,11 +12207,11 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) bool add_new_state = env->test_state_freq ? true : false; cur->last_insn_idx = env->prev_insn_idx; - if (!env->insn_aux_data[insn_idx].prune_point) + if (!is_prune_point(env, insn_idx)) /* this 'insn_idx' instruction wasn't marked, so we will not * be doing state search here */ - return 0; + return push_jmp_history(env, cur); /* bpf progs typically have pruning point every 4 instructions * http://vger.kernel.org/bpfconf2019.html#session-1 From 8266c47d04b2c36260043583498dbd40d51652d0 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 6 Dec 2022 15:33:45 -0800 Subject: [PATCH 080/151] bpf: remove unnecessary prune and jump points [ Upstream commit 618945fbed501b6e5865042068a51edfb2dda948 ] Don't mark some instructions as jump points when there are actually no jumps and instructions are just processed sequentially. Such case is handled naturally by precision backtracking logic without the need to update jump history. See get_prev_insn_idx(). It goes back linearly by one instruction, unless current top of jmp_history is pointing to current instruction. In such case we use `st->jmp_history[cnt - 1].prev_idx` to find instruction from which we jumped to the current instruction non-linearly. Also remove both jump and prune point marking for instruction right after unconditional jumps, as program flow can get to the instruction right after unconditional jump instruction only if there is a jump to that instruction from somewhere else in the program. In such case we'll mark such instruction as prune/jump point because it's a destination of a jump. This change causes no difference in the number of instructions or states processed across Cilium and selftests programs.
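For reference, a sketch of the helper the message describes, as it looks at this point in the series (simplified; the real function lives in kernel/bpf/verifier.c, and patch 085 later extends it to return -ENOENT):

static int get_prev_insn_idx(struct bpf_verifier_state *st, int i,
			     u32 *history)
{
	u32 cnt = *history;

	if (cnt && st->jmp_history[cnt - 1].idx == i) {
		/* we jumped to insn i: continue from where that jump came */
		i = st->jmp_history[cnt - 1].prev_idx;
		(*history)--;
	} else {
		/* straight-line execution: previous insn is simply i - 1 */
		i--;
	}
	return i;
}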
Signed-off-by: Andrii Nakryiko Acked-by: John Fastabend Link: https://lore.kernel.org/r/20221206233345.438540-4-andrii@kernel.org Signed-off-by: Alexei Starovoitov Stable-dep-of: 3feb263bb516 ("bpf: handle ldimm64 properly in check_cfg()") Signed-off-by: Sasha Levin --- kernel/bpf/verifier.c | 34 ++++++++++------------------------ 1 file changed, 10 insertions(+), 24 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ec688665aaa2..09631797d9e0 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -11093,13 +11093,12 @@ static int visit_func_call_insn(int t, int insn_cnt, if (ret) return ret; - if (t + 1 < insn_cnt) { - mark_prune_point(env, t + 1); - mark_jmp_point(env, t + 1); - } + mark_prune_point(env, t + 1); + /* when we exit from subprog, we need to record non-linear history */ + mark_jmp_point(env, t + 1); + if (visit_callee) { mark_prune_point(env, t); - mark_jmp_point(env, t); ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env, /* It's ok to allow recursion from CFG point of * view. __check_func_call() will do the actual @@ -11133,15 +11132,13 @@ static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env) return DONE_EXPLORING; case BPF_CALL: - if (insns[t].imm == BPF_FUNC_timer_set_callback) { - /* Mark this call insn to trigger is_state_visited() check - * before call itself is processed by __check_func_call(). - * Otherwise new async state will be pushed for further - * exploration. + if (insns[t].imm == BPF_FUNC_timer_set_callback) + /* Mark this call insn as a prune point to trigger + * is_state_visited() check before call itself is + * processed by __check_func_call(). Otherwise new + * async state will be pushed for further exploration. */ mark_prune_point(env, t); - mark_jmp_point(env, t); - } return visit_func_call_insn(t, insn_cnt, insns, env, insns[t].src_reg == BPF_PSEUDO_CALL); @@ -11155,26 +11152,15 @@ static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env) if (ret) return ret; - /* unconditional jmp is not a good pruning point, - * but it's marked, since backtracking needs - * to record jmp history in is_state_visited(). - */ mark_prune_point(env, t + insns[t].off + 1); mark_jmp_point(env, t + insns[t].off + 1); - /* tell verifier to check for equivalent states - * after every call and jump - */ - if (t + 1 < insn_cnt) { - mark_prune_point(env, t + 1); - mark_jmp_point(env, t + 1); - } return ret; default: /* conditional jump with two edges */ mark_prune_point(env, t); - mark_jmp_point(env, t); + ret = push_insn(t, t + 1, FALLTHROUGH, env, true); if (ret) return ret; From 97bb6dab01728e5a5f4eca998efd91bc89403032 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 7 Dec 2022 11:55:34 -0800 Subject: [PATCH 081/151] bpf: Remove unused insn_cnt argument from visit_[func_call_]insn() [ Upstream commit dcb2288b1fd9a8cdf2f3b8c0c7b3763346ef515f ] Number of total instructions in BPF program (including subprogs) can and is accessed from env->prog->len. visit_func_call_insn() doesn't do any checks against insn_cnt anymore, relying on push_insn() to do this check internally. So remove unnecessary insn_cnt input argument from visit_func_call_insn() and visit_insn() functions. 
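The reason callers can drop their own bounds comparisons is that push_insn() already validates the destination index; roughly (a sketch trimmed from the verifier's DFS walker, with the edge bookkeeping elided):

static int push_insn(int t, int w, int e, struct bpf_verifier_env *env,
		     bool loop_ok)
{
	if (w < 0 || w >= env->prog->len) {
		verbose(env, "jump out of range from insn %d to %d\n", t, w);
		return -EINVAL;
	}
	/* ... classify the edge, detect back-edges, push w on the DFS stack ... */
	return KEEP_EXPLORING;
}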
Suggested-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20221207195534.2866030-1-andrii@kernel.org Stable-dep-of: 3feb263bb516 ("bpf: handle ldimm64 properly in check_cfg()") Signed-off-by: Sasha Levin --- kernel/bpf/verifier.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 09631797d9e0..d1393e07ab2c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -11082,8 +11082,7 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env, return DONE_EXPLORING; } -static int visit_func_call_insn(int t, int insn_cnt, - struct bpf_insn *insns, +static int visit_func_call_insn(int t, struct bpf_insn *insns, struct bpf_verifier_env *env, bool visit_callee) { @@ -11114,13 +11113,13 @@ static int visit_func_call_insn(int t, int insn_cnt, * DONE_EXPLORING - the instruction was fully explored * KEEP_EXPLORING - there is still work to be done before it is fully explored */ -static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env) +static int visit_insn(int t, struct bpf_verifier_env *env) { struct bpf_insn *insns = env->prog->insnsi; int ret; if (bpf_pseudo_func(insns + t)) - return visit_func_call_insn(t, insn_cnt, insns, env, true); + return visit_func_call_insn(t, insns, env, true); /* All non-branch instructions have a single fall-through edge. */ if (BPF_CLASS(insns[t].code) != BPF_JMP && @@ -11139,7 +11138,7 @@ static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env) * async state will be pushed for further exploration. */ mark_prune_point(env, t); - return visit_func_call_insn(t, insn_cnt, insns, env, + return visit_func_call_insn(t, insns, env, insns[t].src_reg == BPF_PSEUDO_CALL); case BPF_JA: @@ -11196,7 +11195,7 @@ static int check_cfg(struct bpf_verifier_env *env) while (env->cfg.cur_stack > 0) { int t = insn_stack[env->cfg.cur_stack - 1]; - ret = visit_insn(t, insn_cnt, env); + ret = visit_insn(t, env); switch (ret) { case DONE_EXPLORING: insn_state[t] = EXPLORED; From b1c780ed3c220b0e3dbb1426e0f716bf0afbf7a3 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 2 Mar 2023 15:50:04 -0800 Subject: [PATCH 082/151] bpf: clean up visit_insn()'s instruction processing [ Upstream commit 653ae3a874aca6764a4c1f5a8bf1b072ade0d6f4 ] Instead of referencing processed instruction repeatedly as insns[t] throughout entire visit_insn() function, take a local insn pointer and work with it in a cleaner way. It makes enhancing this function further a bit easier as well. 
Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20230302235015.2044271-7-andrii@kernel.org Signed-off-by: Alexei Starovoitov Stable-dep-of: 3feb263bb516 ("bpf: handle ldimm64 properly in check_cfg()") Signed-off-by: Sasha Levin --- kernel/bpf/verifier.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d1393e07ab2c..73d500c51bd8 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -11115,44 +11115,43 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns, */ static int visit_insn(int t, struct bpf_verifier_env *env) { - struct bpf_insn *insns = env->prog->insnsi; + struct bpf_insn *insns = env->prog->insnsi, *insn = &insns[t]; int ret; - if (bpf_pseudo_func(insns + t)) + if (bpf_pseudo_func(insn)) return visit_func_call_insn(t, insns, env, true); /* All non-branch instructions have a single fall-through edge. */ - if (BPF_CLASS(insns[t].code) != BPF_JMP && - BPF_CLASS(insns[t].code) != BPF_JMP32) + if (BPF_CLASS(insn->code) != BPF_JMP && + BPF_CLASS(insn->code) != BPF_JMP32) return push_insn(t, t + 1, FALLTHROUGH, env, false); - switch (BPF_OP(insns[t].code)) { + switch (BPF_OP(insn->code)) { case BPF_EXIT: return DONE_EXPLORING; case BPF_CALL: - if (insns[t].imm == BPF_FUNC_timer_set_callback) + if (insn->imm == BPF_FUNC_timer_set_callback) /* Mark this call insn as a prune point to trigger * is_state_visited() check before call itself is * processed by __check_func_call(). Otherwise new * async state will be pushed for further exploration. */ mark_prune_point(env, t); - return visit_func_call_insn(t, insns, env, - insns[t].src_reg == BPF_PSEUDO_CALL); + return visit_func_call_insn(t, insns, env, insn->src_reg == BPF_PSEUDO_CALL); case BPF_JA: - if (BPF_SRC(insns[t].code) != BPF_K) + if (BPF_SRC(insn->code) != BPF_K) return -EINVAL; /* unconditional jump with single edge */ - ret = push_insn(t, t + insns[t].off + 1, FALLTHROUGH, env, + ret = push_insn(t, t + insn->off + 1, FALLTHROUGH, env, true); if (ret) return ret; - mark_prune_point(env, t + insns[t].off + 1); - mark_jmp_point(env, t + insns[t].off + 1); + mark_prune_point(env, t + insn->off + 1); + mark_jmp_point(env, t + insn->off + 1); return ret; @@ -11164,7 +11163,7 @@ static int visit_insn(int t, struct bpf_verifier_env *env) if (ret) return ret; - return push_insn(t, t + insns[t].off + 1, BRANCH, env, true); + return push_insn(t, t + insn->off + 1, BRANCH, env, true); } } From 2c795ce09042c9c84f97907b5a95e09895c69686 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 27 Jul 2023 18:12:31 -0700 Subject: [PATCH 083/151] bpf: Support new 32bit offset jmp instruction [ Upstream commit 4cd58e9af8b9d9fff6b7145e742abbfcda0af4af ] Add interpreter/jit/verifier support for 32bit offset jmp instruction. If a conditional jmp instruction needs more than 16bit offset, it can be simulated with a conditional jmp + a 32bit jmp insn. 
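As an illustration (assumed example program, not taken from the patch), a far conditional branch can be rewritten as a short inverted branch over the new 32-bit unconditional jump:

/*
 *   before:  if r1 == 0 goto +70000    ; offset does not fit in the
 *                                      ; 16-bit 'off' field
 *
 *   after:   if r1 != 0 goto +1        ; BPF_JMP | BPF_JNE | BPF_K, off = 1
 *            gotol +70000              ; BPF_JMP32 | BPF_JA, 32-bit offset
 *                                      ; carried in 'imm'
 */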
Acked-by: Eduard Zingerman Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20230728011231.3716103-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov Stable-dep-of: 3feb263bb516 ("bpf: handle ldimm64 properly in check_cfg()") Signed-off-by: Sasha Levin --- arch/x86/net/bpf_jit_comp.c | 28 ++++++++++++++++++---------- kernel/bpf/core.c | 19 ++++++++++++++++--- kernel/bpf/verifier.c | 32 ++++++++++++++++++++++---------- 3 files changed, 56 insertions(+), 23 deletions(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 84c695ae1940..b69aee6245e4 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1625,16 +1625,24 @@ emit_cond_jmp: /* Convert BPF opcode to x86 */ break; case BPF_JMP | BPF_JA: - if (insn->off == -1) - /* -1 jmp instructions will always jump - * backwards two bytes. Explicitly handling - * this case avoids wasting too many passes - * when there are long sequences of replaced - * dead code. - */ - jmp_offset = -2; - else - jmp_offset = addrs[i + insn->off] - addrs[i]; + case BPF_JMP32 | BPF_JA: + if (BPF_CLASS(insn->code) == BPF_JMP) { + if (insn->off == -1) + /* -1 jmp instructions will always jump + * backwards two bytes. Explicitly handling + * this case avoids wasting too many passes + * when there are long sequences of replaced + * dead code. + */ + jmp_offset = -2; + else + jmp_offset = addrs[i + insn->off] - addrs[i]; + } else { + if (insn->imm == -1) + jmp_offset = -2; + else + jmp_offset = addrs[i + insn->imm] - addrs[i]; + } if (!jmp_offset) { /* diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 7225cb67c0d3..0b55ebf4a9b1 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -367,7 +367,12 @@ static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, s32 end_old, { const s32 off_min = S16_MIN, off_max = S16_MAX; s32 delta = end_new - end_old; - s32 off = insn->off; + s32 off; + + if (insn->code == (BPF_JMP32 | BPF_JA)) + off = insn->imm; + else + off = insn->off; if (curr < pos && curr + off + 1 >= end_old) off += delta; @@ -375,8 +380,12 @@ static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, s32 end_old, off -= delta; if (off < off_min || off > off_max) return -ERANGE; - if (!probe_pass) - insn->off = off; + if (!probe_pass) { + if (insn->code == (BPF_JMP32 | BPF_JA)) + insn->imm = off; + else + insn->off = off; + } return 0; } @@ -1586,6 +1595,7 @@ EXPORT_SYMBOL_GPL(__bpf_call_base); INSN_3(JMP, JSLE, K), \ INSN_3(JMP, JSET, K), \ INSN_2(JMP, JA), \ + INSN_2(JMP32, JA), \ /* Store instructions. */ \ /* Register based. 
*/ \ INSN_3(STX, MEM, B), \ @@ -1862,6 +1872,9 @@ out: JMP_JA: insn += insn->off; CONT; + JMP32_JA: + insn += insn->imm; + CONT; JMP_EXIT: return BPF_R0; /* JMP */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 73d500c51bd8..dd025f66efab 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2254,7 +2254,10 @@ static int check_subprogs(struct bpf_verifier_env *env) goto next; if (BPF_OP(code) == BPF_EXIT || BPF_OP(code) == BPF_CALL) goto next; - off = i + insn[i].off + 1; + if (code == (BPF_JMP32 | BPF_JA)) + off = i + insn[i].imm + 1; + else + off = i + insn[i].off + 1; if (off < subprog_start || off >= subprog_end) { verbose(env, "jump out of range from insn %d to %d\n", i, off); return -EINVAL; @@ -2266,6 +2269,7 @@ next: * or unconditional jump back */ if (code != (BPF_JMP | BPF_EXIT) && + code != (BPF_JMP32 | BPF_JA) && code != (BPF_JMP | BPF_JA)) { verbose(env, "last insn is not an exit or jmp\n"); return -EINVAL; @@ -11116,7 +11120,7 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns, static int visit_insn(int t, struct bpf_verifier_env *env) { struct bpf_insn *insns = env->prog->insnsi, *insn = &insns[t]; - int ret; + int ret, off; if (bpf_pseudo_func(insn)) return visit_func_call_insn(t, insns, env, true); @@ -11144,14 +11148,19 @@ static int visit_insn(int t, struct bpf_verifier_env *env) if (BPF_SRC(insn->code) != BPF_K) return -EINVAL; + if (BPF_CLASS(insn->code) == BPF_JMP) + off = insn->off; + else + off = insn->imm; + /* unconditional jump with single edge */ - ret = push_insn(t, t + insn->off + 1, FALLTHROUGH, env, + ret = push_insn(t, t + off + 1, FALLTHROUGH, env, true); if (ret) return ret; - mark_prune_point(env, t + insn->off + 1); - mark_jmp_point(env, t + insn->off + 1); + mark_prune_point(env, t + off + 1); + mark_jmp_point(env, t + off + 1); return ret; @@ -12687,15 +12696,18 @@ static int do_check(struct bpf_verifier_env *env) return err; } else if (opcode == BPF_JA) { if (BPF_SRC(insn->code) != BPF_K || - insn->imm != 0 || insn->src_reg != BPF_REG_0 || insn->dst_reg != BPF_REG_0 || - class == BPF_JMP32) { + (class == BPF_JMP && insn->imm != 0) || + (class == BPF_JMP32 && insn->off != 0)) { verbose(env, "BPF_JA uses reserved fields\n"); return -EINVAL; } - env->insn_idx += insn->off + 1; + if (class == BPF_JMP) + env->insn_idx += insn->off + 1; + else + env->insn_idx += insn->imm + 1; continue; } else if (opcode == BPF_EXIT) { @@ -13521,13 +13533,13 @@ static bool insn_is_cond_jump(u8 code) { u8 op; + op = BPF_OP(code); if (BPF_CLASS(code) == BPF_JMP32) - return true; + return op != BPF_JA; if (BPF_CLASS(code) != BPF_JMP) return false; - op = BPF_OP(code); return op != BPF_JA && op != BPF_EXIT && op != BPF_CALL; } From b08acd5c4602365e6443226a65c9ed2809cd85b0 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 9 Nov 2023 16:26:36 -0800 Subject: [PATCH 084/151] bpf: handle ldimm64 properly in check_cfg() [ Upstream commit 3feb263bb516ee7e1da0acd22b15afbb9a7daa19 ] ldimm64 instructions are 16-byte long, and so have to be handled appropriately in check_cfg(), just like the rest of BPF verifier does. This has implications in three places: - when determining next instruction for non-jump instructions; - when determining next instruction for callback address ldimm64 instructions (in visit_func_call_insn()); - when checking for unreachable instructions, where second half of ldimm64 is expected to be unreachable; We take this also as an opportunity to report jump into the middle of ldimm64. 
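For background, a sketch with an assumed example value (not from the patch): a 64-bit immediate load spans two struct bpf_insn slots, so its CFG successor is t + 2 and its second half must never be a branch target.

/*
 *   BPF_LD_IMM64(BPF_REG_0, 0x1234567890abcdefULL)
 *
 * expands to two consecutive 8-byte struct bpf_insn slots: the first
 * carries the low 32 bits in 'imm', the second the high 32 bits.
 * check_cfg() therefore pushes the fall-through edge t -> t + 2 and
 * reports any jump that lands on t + 1.
 */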
And adjust few test_verifier tests accordingly. Acked-by: Eduard Zingerman Reported-by: Hao Sun Fixes: 475fb78fbf48 ("bpf: verifier (add branch/goto checks)") Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231110002638.4168352-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov Signed-off-by: Sasha Levin --- include/linux/bpf.h | 8 ++++-- kernel/bpf/verifier.c | 27 ++++++++++++++----- .../testing/selftests/bpf/verifier/ld_imm64.c | 8 +++--- 3 files changed, 30 insertions(+), 13 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 619fcba84be2..ba22cf4f5fc0 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -702,10 +702,14 @@ bpf_ctx_record_field_size(struct bpf_insn_access_aux *aux, u32 size) aux->ctx_field_size = size; } +static bool bpf_is_ldimm64(const struct bpf_insn *insn) +{ + return insn->code == (BPF_LD | BPF_IMM | BPF_DW); +} + static inline bool bpf_pseudo_func(const struct bpf_insn *insn) { - return insn->code == (BPF_LD | BPF_IMM | BPF_DW) && - insn->src_reg == BPF_PSEUDO_FUNC; + return bpf_is_ldimm64(insn) && insn->src_reg == BPF_PSEUDO_FUNC; } struct bpf_prog_ops { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index dd025f66efab..95521beec66c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -11090,15 +11090,16 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns, struct bpf_verifier_env *env, bool visit_callee) { - int ret; + int ret, insn_sz; - ret = push_insn(t, t + 1, FALLTHROUGH, env, false); + insn_sz = bpf_is_ldimm64(&insns[t]) ? 2 : 1; + ret = push_insn(t, t + insn_sz, FALLTHROUGH, env, false); if (ret) return ret; - mark_prune_point(env, t + 1); + mark_prune_point(env, t + insn_sz); /* when we exit from subprog, we need to record non-linear history */ - mark_jmp_point(env, t + 1); + mark_jmp_point(env, t + insn_sz); if (visit_callee) { mark_prune_point(env, t); @@ -11120,15 +11121,17 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns, static int visit_insn(int t, struct bpf_verifier_env *env) { struct bpf_insn *insns = env->prog->insnsi, *insn = &insns[t]; - int ret, off; + int ret, off, insn_sz; if (bpf_pseudo_func(insn)) return visit_func_call_insn(t, insns, env, true); /* All non-branch instructions have a single fall-through edge. */ if (BPF_CLASS(insn->code) != BPF_JMP && - BPF_CLASS(insn->code) != BPF_JMP32) - return push_insn(t, t + 1, FALLTHROUGH, env, false); + BPF_CLASS(insn->code) != BPF_JMP32) { + insn_sz = bpf_is_ldimm64(insn) ? 
2 : 1; + return push_insn(t, t + insn_sz, FALLTHROUGH, env, false); + } switch (BPF_OP(insn->code)) { case BPF_EXIT: @@ -11227,11 +11230,21 @@ static int check_cfg(struct bpf_verifier_env *env) } for (i = 0; i < insn_cnt; i++) { + struct bpf_insn *insn = &env->prog->insnsi[i]; + if (insn_state[i] != EXPLORED) { verbose(env, "unreachable insn %d\n", i); ret = -EINVAL; goto err_free; } + if (bpf_is_ldimm64(insn)) { + if (insn_state[i + 1] != 0) { + verbose(env, "jump into the middle of ldimm64 insn %d\n", i); + ret = -EINVAL; + goto err_free; + } + i++; /* skip second half of ldimm64 */ + } } ret = 0; /* cfg looks good */ diff --git a/tools/testing/selftests/bpf/verifier/ld_imm64.c b/tools/testing/selftests/bpf/verifier/ld_imm64.c index f9297900cea6..78f19c255f20 100644 --- a/tools/testing/selftests/bpf/verifier/ld_imm64.c +++ b/tools/testing/selftests/bpf/verifier/ld_imm64.c @@ -9,8 +9,8 @@ BPF_MOV64_IMM(BPF_REG_0, 2), BPF_EXIT_INSN(), }, - .errstr = "invalid BPF_LD_IMM insn", - .errstr_unpriv = "R1 pointer comparison", + .errstr = "jump into the middle of ldimm64 insn 1", + .errstr_unpriv = "jump into the middle of ldimm64 insn 1", .result = REJECT, }, { @@ -23,8 +23,8 @@ BPF_LD_IMM64(BPF_REG_0, 1), BPF_EXIT_INSN(), }, - .errstr = "invalid BPF_LD_IMM insn", - .errstr_unpriv = "R1 pointer comparison", + .errstr = "jump into the middle of ldimm64 insn 1", + .errstr_unpriv = "jump into the middle of ldimm64 insn 1", .result = REJECT, }, { From 803fb6109fcfa939d78907ad34161a720b37848d Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 9 Nov 2023 16:26:37 -0800 Subject: [PATCH 085/151] bpf: fix precision backtracking instruction iteration [ Upstream commit 4bb7ea946a370707315ab774432963ce47291946 ] Fix an edge case in __mark_chain_precision() which prematurely stops backtracking instructions in a state if it happens that the state's first and last instruction indexes are the same. This situation doesn't necessarily mean that there were no instructions simulated in the state, but rather that we started from the instruction, jumped around a bit, and then ended up at the same instruction before checkpointing or marking precision. To distinguish between these two possible situations, we need to consult jump history. If it's empty or contains a single record "bridging" parent state and first instruction of processed state, then we indeed backtracked all instructions in this state. But if history is not empty, we are definitely not done yet. Move this logic inside get_prev_insn_idx() to contain it more nicely. Use -ENOENT return code to denote "we are out of instructions" situation. This bug was exposed by verifier_loop1.c's bounded_recursion subtest, once the next fix in this patch set is applied. Acked-by: Eduard Zingerman Fixes: b5dc0163d8fd ("bpf: precise scalar_value tracking") Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231110002638.4168352-3-andrii@kernel.org Signed-off-by: Alexei Starovoitov Signed-off-by: Sasha Levin --- kernel/bpf/verifier.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 95521beec66c..142e10d49fd8 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2551,12 +2551,29 @@ static int push_jmp_history(struct bpf_verifier_env *env, /* Backtrack one insn at a time. If idx is not at the top of recorded * history then previous instruction came from straight line execution. + * Return -ENOENT if we exhausted all instructions within given state. 
+ * + * It's legal to have a bit of a looping with the same starting and ending + * insn index within the same state, e.g.: 3->4->5->3, so just because current + * instruction index is the same as state's first_idx doesn't mean we are + * done. If there is still some jump history left, we should keep going. We + * need to take into account that we might have a jump history between given + * state's parent and itself, due to checkpointing. In this case, we'll have + * history entry recording a jump from last instruction of parent state and + * first instruction of given state. */ static int get_prev_insn_idx(struct bpf_verifier_state *st, int i, u32 *history) { u32 cnt = *history; + if (i == st->first_insn_idx) { + if (cnt == 0) + return -ENOENT; + if (cnt == 1 && st->jmp_history[0].idx == i) + return -ENOENT; + } + if (cnt && st->jmp_history[cnt - 1].idx == i) { i = st->jmp_history[cnt - 1].prev_idx; (*history)--; @@ -3052,9 +3069,9 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int frame, int r * Nothing to be tracked further in the parent state. */ return 0; - if (i == first_idx) - break; i = get_prev_insn_idx(st, i, &history); + if (i == -ENOENT) + break; if (i >= env->prog->len) { /* This can happen if backtracking reached insn 0 * and there are still reg_mask or stack_mask From b5c8e0ff76d10f6bf70a7237678f27c20cf59bc9 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 13 Nov 2023 11:52:31 +0800 Subject: [PATCH 086/151] blk-mq: make sure active queue usage is held for bio_integrity_prep() [ Upstream commit b0077e269f6c152e807fdac90b58caf012cdbaab ] blk_integrity_unregister() can be called while a bio that had its integrity data prepared is still in flight without the queue usage counter being held, so the request may complete by calling profile->complete_fn after the profile is gone, leading to a kernel panic. Another constraint is that bio_integrity_prep() needs to be called before bio merge. 
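A timeline sketch of the race being fixed (assumed interleaving, for illustration only):

/*
 *   submitter                            teardown
 *   ---------                            --------
 *   blk_mq_submit_bio()
 *     bio_integrity_prep(bio)  <-- queue usage counter not yet held
 *                                        blk_integrity_unregister(disk)
 *                                          (integrity profile torn down)
 *     ... request completes ...
 *       -> profile->complete_fn  <-- profile already gone: kernel panic
 */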
Fix the issue by: - call bio_integrity_prep() with one queue usage counter grabbed reliably - call bio_integrity_prep() before bio merge Fixes: 900e080752025f00 ("block: move queue enter logic into blk_mq_submit_bio()") Reported-by: Yi Zhang Cc: Christoph Hellwig Signed-off-by: Ming Lei Tested-by: Yi Zhang Link: https://lore.kernel.org/r/20231113035231.2708053-1-ming.lei@redhat.com Signed-off-by: Jens Axboe Signed-off-by: Sasha Levin --- block/blk-mq.c | 75 +++++++++++++++++++++++++------------------------- 1 file changed, 38 insertions(+), 37 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 100fb0c3114f..383d94615e50 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2855,11 +2855,8 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q, }; struct request *rq; - if (unlikely(bio_queue_enter(bio))) - return NULL; - if (blk_mq_attempt_bio_merge(q, bio, nsegs)) - goto queue_exit; + return NULL; rq_qos_throttle(q, bio); @@ -2875,35 +2872,23 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q, rq_qos_cleanup(q, bio); if (bio->bi_opf & REQ_NOWAIT) bio_wouldblock_error(bio); -queue_exit: - blk_queue_exit(q); return NULL; } -static inline struct request *blk_mq_get_cached_request(struct request_queue *q, - struct blk_plug *plug, struct bio **bio, unsigned int nsegs) +/* return true if this @rq can be used for @bio */ +static bool blk_mq_can_use_cached_rq(struct request *rq, struct blk_plug *plug, + struct bio *bio) { - struct request *rq; - enum hctx_type type, hctx_type; + enum hctx_type type = blk_mq_get_hctx_type(bio->bi_opf); + enum hctx_type hctx_type = rq->mq_hctx->type; - if (!plug) - return NULL; - rq = rq_list_peek(&plug->cached_rq); - if (!rq || rq->q != q) - return NULL; + WARN_ON_ONCE(rq_list_peek(&plug->cached_rq) != rq); - if (blk_mq_attempt_bio_merge(q, *bio, nsegs)) { - *bio = NULL; - return NULL; - } - - type = blk_mq_get_hctx_type((*bio)->bi_opf); - hctx_type = rq->mq_hctx->type; if (type != hctx_type && !(type == HCTX_TYPE_READ && hctx_type == HCTX_TYPE_DEFAULT)) - return NULL; - if (op_is_flush(rq->cmd_flags) != op_is_flush((*bio)->bi_opf)) - return NULL; + return false; + if (op_is_flush(rq->cmd_flags) != op_is_flush(bio->bi_opf)) + return false; /* * If any qos ->throttle() end up blocking, we will have flushed the @@ -2911,11 +2896,11 @@ static inline struct request *blk_mq_get_cached_request(struct request_queue *q, * before we throttle. 
*/ plug->cached_rq = rq_list_next(rq); - rq_qos_throttle(q, *bio); + rq_qos_throttle(rq->q, bio); - rq->cmd_flags = (*bio)->bi_opf; + rq->cmd_flags = bio->bi_opf; INIT_LIST_HEAD(&rq->queuelist); - return rq; + return true; } static void bio_set_ioprio(struct bio *bio) @@ -2944,7 +2929,7 @@ void blk_mq_submit_bio(struct bio *bio) struct request_queue *q = bdev_get_queue(bio->bi_bdev); struct blk_plug *plug = blk_mq_plug(bio); const int is_sync = op_is_sync(bio->bi_opf); - struct request *rq; + struct request *rq = NULL; unsigned int nr_segs = 1; blk_status_t ret; @@ -2955,20 +2940,36 @@ void blk_mq_submit_bio(struct bio *bio) return; } - if (!bio_integrity_prep(bio)) - return; - bio_set_ioprio(bio); - rq = blk_mq_get_cached_request(q, plug, &bio, nr_segs); - if (!rq) { - if (!bio) + if (plug) { + rq = rq_list_peek(&plug->cached_rq); + if (rq && rq->q != q) + rq = NULL; + } + if (rq) { + if (!bio_integrity_prep(bio)) return; - rq = blk_mq_get_new_requests(q, plug, bio, nr_segs); - if (unlikely(!rq)) + if (blk_mq_attempt_bio_merge(q, bio, nr_segs)) return; + if (blk_mq_can_use_cached_rq(rq, plug, bio)) + goto done; + percpu_ref_get(&q->q_usage_counter); + } else { + if (unlikely(bio_queue_enter(bio))) + return; + if (!bio_integrity_prep(bio)) + goto fail; } + rq = blk_mq_get_new_requests(q, plug, bio, nr_segs); + if (unlikely(!rq)) { +fail: + blk_queue_exit(q); + return; + } + +done: trace_block_getrq(bio); rq_qos_track(q, rq, bio); From 31051f722db23335cfbfc04911ea5eed762e872e Mon Sep 17 00:00:00 2001 From: Rahul Rameshbabu Date: Tue, 14 Nov 2023 13:58:43 -0800 Subject: [PATCH 087/151] net/mlx5: Increase size of irq name buffer [ Upstream commit 3338bebfc26a1e2cebbba82a1cf12c0159608e73 ] Without increased buffer size, will trigger -Wformat-truncation with W=1 for the snprintf operation writing to the buffer. 
drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c: In function 'mlx5_irq_alloc': drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c:296:7: error: '@pci:' directive output may be truncated writing 5 bytes into a region of size between 1 and 32 [-Werror=format-truncation=] 296 | "%s@pci:%s", name, pci_name(dev->pdev)); | ^~~~~ drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c:295:2: note: 'snprintf' output 6 or more bytes (assuming 37) into a destination of size 32 295 | snprintf(irq->name, MLX5_MAX_IRQ_NAME, | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 296 | "%s@pci:%s", name, pci_name(dev->pdev)); | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Fixes: ada9f5d00797 ("IB/mlx5: Fix eq names to display nicely in /proc/interrupts") Link: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=6d4ab2e97dcfbcd748ae71761a9d8e5e41cc732c Signed-off-by: Rahul Rameshbabu Reviewed-by: Dragos Tatulea Signed-off-by: Saeed Mahameed Link: https://lore.kernel.org/r/20231114215846.5902-13-saeed@kernel.org Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c | 6 +++--- drivers/net/ethernet/mellanox/mlx5/core/pci_irq.h | 3 +++ 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c index d136360ac6a9..a6d3fc96e168 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c @@ -25,7 +25,7 @@ struct mlx5_irq { struct atomic_notifier_head nh; cpumask_var_t mask; - char name[MLX5_MAX_IRQ_NAME]; + char name[MLX5_MAX_IRQ_FORMATTED_NAME]; struct mlx5_irq_pool *pool; int refcount; u32 index; @@ -236,8 +236,8 @@ struct mlx5_irq *mlx5_irq_alloc(struct mlx5_irq_pool *pool, int i, else irq_sf_set_name(pool, name, i); ATOMIC_INIT_NOTIFIER_HEAD(&irq->nh); - snprintf(irq->name, MLX5_MAX_IRQ_NAME, - "%s@pci:%s", name, pci_name(dev->pdev)); + snprintf(irq->name, MLX5_MAX_IRQ_FORMATTED_NAME, + MLX5_IRQ_NAME_FORMAT_STR, name, pci_name(dev->pdev)); err = request_irq(irq->irqn, irq_int_handler, 0, irq->name, &irq->nh); if (err) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.h b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.h index 5c7e68bee43a..4047179307c4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.h @@ -7,6 +7,9 @@ #include #define MLX5_MAX_IRQ_NAME (32) +#define MLX5_IRQ_NAME_FORMAT_STR ("%s@pci:%s") +#define MLX5_MAX_IRQ_FORMATTED_NAME \ + (MLX5_MAX_IRQ_NAME + sizeof(MLX5_IRQ_NAME_FORMAT_STR)) /* max irq_index is 2047, so four chars */ #define MLX5_MAX_IRQ_IDX_CHARS (4) #define MLX5_EQ_REFS_PER_IRQ (2) From a1a1e5ce88a7af9a5f6f0c36b15e7bdc08f56f8b Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 17 Oct 2023 21:07:04 +0200 Subject: [PATCH 088/151] s390/mm: add missing arch_set_page_dat() call to vmem_crst_alloc() [ Upstream commit 09cda0a400519b1541591c506e54c9c48e3101bf ] If the cmma no-dat feature is available all pages that are not used for dynamic address translation are marked as "no-dat" with the ESSA instruction. This information is visible to the hypervisor, so that the hypervisor can optimize purging of guest TLB entries. This also means that pages which are used for dynamic address translation must not be marked as "no-dat", since the hypervisor may then incorrectly not purge guest TLB entries. 
Region and segment tables allocated via vmem_crst_alloc() are incorrectly marked as "no-dat", as soon as slab_is_available() returns true. Such tables are allocated e.g. when kernel page tables are split, memory is hotplugged, or a DCSS segment is loaded. Fix this by adding the missing arch_set_page_dat() call. Cc: Reviewed-by: Claudio Imbrenda Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik Signed-off-by: Sasha Levin --- arch/s390/mm/vmem.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index 9a0ce5315f36..3cbb46182066 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -44,8 +45,11 @@ void *vmem_crst_alloc(unsigned long val) unsigned long *table; table = vmem_alloc_pages(CRST_ALLOC_ORDER); - if (table) - crst_table_init(table, val); + if (!table) + return NULL; + crst_table_init(table, val); + if (slab_is_available()) + arch_set_page_dat(virt_to_page(table), CRST_ALLOC_ORDER); return table; } From 2c14f4991610f2296d78d8e7e28d06dd2d60f707 Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Fri, 23 Dec 2022 11:03:32 +0100 Subject: [PATCH 089/151] s390/cpumf: support user space events for counting [ Upstream commit 91d5364dc673fa9cf3a5b7b30cf33c70803eb3a4 ] CPU Measurement counting facility events PROBLEM_STATE_CPU_CYCLES(32) and PROBLEM_STATE_INSTRUCTIONS(33) are valid events. However the device driver returns error -EOPNOTSUPP when these event are to be installed. Fix this and allow installation of events PROBLEM_STATE_CPU_CYCLES, PROBLEM_STATE_CPU_CYCLES:u, PROBLEM_STATE_INSTRUCTIONS and PROBLEM_STATE_INSTRUCTIONS:u. Kernel space counting only is still not supported by s390. Signed-off-by: Thomas Richter Acked-by: Sumanth Korikkar Signed-off-by: Heiko Carstens Stable-dep-of: 09cda0a40051 ("s390/mm: add missing arch_set_page_dat() call to vmem_crst_alloc()") Signed-off-by: Sasha Levin --- arch/s390/kernel/perf_cpum_cf.c | 35 ++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c index f043a7ff220b..28fa80fd69fa 100644 --- a/arch/s390/kernel/perf_cpum_cf.c +++ b/arch/s390/kernel/perf_cpum_cf.c @@ -2,7 +2,7 @@ /* * Performance event support for s390x - CPU-measurement Counter Facility * - * Copyright IBM Corp. 2012, 2021 + * Copyright IBM Corp. 
2012, 2022 * Author(s): Hendrik Brueckner * Thomas Richter */ @@ -434,6 +434,12 @@ static void cpumf_hw_inuse(void) mutex_unlock(&pmc_reserve_mutex); } +static int is_userspace_event(u64 ev) +{ + return cpumf_generic_events_user[PERF_COUNT_HW_CPU_CYCLES] == ev || + cpumf_generic_events_user[PERF_COUNT_HW_INSTRUCTIONS] == ev; +} + static int __hw_perf_event_init(struct perf_event *event, unsigned int type) { struct perf_event_attr *attr = &event->attr; @@ -456,19 +462,26 @@ static int __hw_perf_event_init(struct perf_event *event, unsigned int type) if (is_sampling_event(event)) /* No sampling support */ return -ENOENT; ev = attr->config; - /* Count user space (problem-state) only */ if (!attr->exclude_user && attr->exclude_kernel) { - if (ev >= ARRAY_SIZE(cpumf_generic_events_user)) - return -EOPNOTSUPP; - ev = cpumf_generic_events_user[ev]; - - /* No support for kernel space counters only */ + /* + * Count user space (problem-state) only + * Handle events 32 and 33 as 0:u and 1:u + */ + if (!is_userspace_event(ev)) { + if (ev >= ARRAY_SIZE(cpumf_generic_events_user)) + return -EOPNOTSUPP; + ev = cpumf_generic_events_user[ev]; + } } else if (!attr->exclude_kernel && attr->exclude_user) { + /* No support for kernel space counters only */ return -EOPNOTSUPP; - } else { /* Count user and kernel space */ - if (ev >= ARRAY_SIZE(cpumf_generic_events_basic)) - return -EOPNOTSUPP; - ev = cpumf_generic_events_basic[ev]; + } else { + /* Count user and kernel space, incl. events 32 + 33 */ + if (!is_userspace_event(ev)) { + if (ev >= ARRAY_SIZE(cpumf_generic_events_basic)) + return -EOPNOTSUPP; + ev = cpumf_generic_events_basic[ev]; + } } break; From 84a8d913fb532793122f45b59b73224b63c3307e Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 28 Jan 2023 18:30:11 +0800 Subject: [PATCH 090/151] f2fs: clean up i_compress_flag and i_compress_level usage [ Upstream commit b90e5086df6bf5ba819216d5ecf0667370bd565f ] .i_compress_level was introduced by commit 3fde13f817e2 ("f2fs: compress: support compress level"), but never be used. This patch updates as below: - load high 8-bits of on-disk .i_compress_flag to in-memory .i_compress_level - load low 8-bits of on-disk .i_compress_flag to in-memory .i_compress_flag - change type of in-memory .i_compress_flag from unsigned short to unsigned char. w/ above changes, we can avoid unneeded bit shift whenever during .init_compress_ctx(), and shrink size of struct f2fs_inode_info. 
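Concretely, the load/store split described above boils down to the following pairing (a sketch using the names from the patch below; COMPRESS_LEVEL_OFFSET is the bit position separating the two bytes):

/* reading the inode: split the on-disk 16-bit field */
unsigned short compress_flag = le16_to_cpu(ri->i_compress_flag);

fi->i_compress_level = compress_flag >> COMPRESS_LEVEL_OFFSET;
fi->i_compress_flag = compress_flag & (BIT(COMPRESS_LEVEL_OFFSET) - 1);

/* writing it back: repack the level into the high bits */
compress_flag = fi->i_compress_flag |
		fi->i_compress_level << COMPRESS_LEVEL_OFFSET;
ri->i_compress_flag = cpu_to_le16(compress_flag);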
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim Stable-dep-of: f5f3bd903a5d ("f2fs: set the default compress_level on ioctl") Signed-off-by: Sasha Levin --- fs/f2fs/compress.c | 8 +++----- fs/f2fs/f2fs.h | 7 +++---- fs/f2fs/inode.c | 16 +++++++++++++--- 3 files changed, 19 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 11d9dce994db..d509b47381d5 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -241,7 +241,7 @@ static int lz4_init_compress_ctx(struct compress_ctx *cc) unsigned int size = LZ4_MEM_COMPRESS; #ifdef CONFIG_F2FS_FS_LZ4HC - if (F2FS_I(cc->inode)->i_compress_flag >> COMPRESS_LEVEL_OFFSET) + if (F2FS_I(cc->inode)->i_compress_level) size = LZ4HC_MEM_COMPRESS; #endif @@ -267,8 +267,7 @@ static void lz4_destroy_compress_ctx(struct compress_ctx *cc) #ifdef CONFIG_F2FS_FS_LZ4HC static int lz4hc_compress_pages(struct compress_ctx *cc) { - unsigned char level = F2FS_I(cc->inode)->i_compress_flag >> - COMPRESS_LEVEL_OFFSET; + unsigned char level = F2FS_I(cc->inode)->i_compress_level; int len; if (level) @@ -340,8 +339,7 @@ static int zstd_init_compress_ctx(struct compress_ctx *cc) zstd_cstream *stream; void *workspace; unsigned int workspace_size; - unsigned char level = F2FS_I(cc->inode)->i_compress_flag >> - COMPRESS_LEVEL_OFFSET; + unsigned char level = F2FS_I(cc->inode)->i_compress_level; if (!level) level = F2FS_ZSTD_DEFAULT_CLEVEL; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index f56abb39601a..faf1a4953e84 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -840,7 +840,7 @@ struct f2fs_inode_info { unsigned char i_compress_algorithm; /* algorithm type */ unsigned char i_log_cluster_size; /* log of cluster size */ unsigned char i_compress_level; /* compress level (lz4hc,zstd) */ - unsigned short i_compress_flag; /* compress flag */ + unsigned char i_compress_flag; /* compress flag */ unsigned int i_cluster_size; /* cluster size */ unsigned int atomic_write_cnt; @@ -4339,9 +4339,8 @@ static inline int set_compress_context(struct inode *inode) if ((F2FS_I(inode)->i_compress_algorithm == COMPRESS_LZ4 || F2FS_I(inode)->i_compress_algorithm == COMPRESS_ZSTD) && F2FS_OPTION(sbi).compress_level) - F2FS_I(inode)->i_compress_flag |= - F2FS_OPTION(sbi).compress_level << - COMPRESS_LEVEL_OFFSET; + F2FS_I(inode)->i_compress_level = + F2FS_OPTION(sbi).compress_level; F2FS_I(inode)->i_flags |= F2FS_COMPR_FL; set_inode_flag(inode, FI_COMPRESSED_FILE); stat_inc_compr_inode(inode); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 1fc7760499f1..933554985d32 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -450,11 +450,17 @@ static int do_read_inode(struct inode *inode) (fi->i_flags & F2FS_COMPR_FL)) { if (F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, i_log_cluster_size)) { + unsigned short compress_flag; + atomic_set(&fi->i_compr_blocks, le64_to_cpu(ri->i_compr_blocks)); fi->i_compress_algorithm = ri->i_compress_algorithm; fi->i_log_cluster_size = ri->i_log_cluster_size; - fi->i_compress_flag = le16_to_cpu(ri->i_compress_flag); + compress_flag = le16_to_cpu(ri->i_compress_flag); + fi->i_compress_level = compress_flag >> + COMPRESS_LEVEL_OFFSET; + fi->i_compress_flag = compress_flag & + (BIT(COMPRESS_LEVEL_OFFSET) - 1); fi->i_cluster_size = 1 << fi->i_log_cluster_size; set_inode_flag(inode, FI_COMPRESSED_FILE); } @@ -675,13 +681,17 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page) if (f2fs_sb_has_compression(F2FS_I_SB(inode)) && F2FS_FITS_IN_INODE(ri, F2FS_I(inode)->i_extra_isize, i_log_cluster_size)) { + unsigned short 
compress_flag; + ri->i_compr_blocks = cpu_to_le64(atomic_read( &F2FS_I(inode)->i_compr_blocks)); ri->i_compress_algorithm = F2FS_I(inode)->i_compress_algorithm; - ri->i_compress_flag = - cpu_to_le16(F2FS_I(inode)->i_compress_flag); + compress_flag = F2FS_I(inode)->i_compress_flag | + F2FS_I(inode)->i_compress_level << + COMPRESS_LEVEL_OFFSET; + ri->i_compress_flag = cpu_to_le16(compress_flag); ri->i_log_cluster_size = F2FS_I(inode)->i_log_cluster_size; } From 55d3f41e5583937980a59c523d6254df731df70e Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Thu, 16 Feb 2023 21:53:24 +0800 Subject: [PATCH 091/151] f2fs: convert to use bitmap API [ Upstream commit 447286ebadaafa551550704ff0b42eb08b1d1cb2 ] Let's use BIT() and GENMASK() instead of open it. Signed-off-by: Yangtao Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim Stable-dep-of: f5f3bd903a5d ("f2fs: set the default compress_level on ioctl") Signed-off-by: Sasha Levin --- fs/f2fs/checkpoint.c | 2 +- fs/f2fs/compress.c | 4 ++-- fs/f2fs/data.c | 12 ++++++------ fs/f2fs/dir.c | 2 +- fs/f2fs/f2fs.h | 26 +++++++++++++------------- fs/f2fs/file.c | 2 +- fs/f2fs/inode.c | 4 ++-- fs/f2fs/node.h | 20 +++++++++----------- fs/f2fs/super.c | 16 ++++++++-------- fs/f2fs/sysfs.c | 2 +- include/linux/f2fs_fs.h | 9 ++++----- 11 files changed, 48 insertions(+), 51 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 5df04ed010ca..eb4d69f53337 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -984,7 +984,7 @@ int f2fs_get_valid_checkpoint(struct f2fs_sb_info *sbi) cp_blk_no = le32_to_cpu(fsb->cp_blkaddr); if (cur_page == cp2) - cp_blk_no += 1 << le32_to_cpu(fsb->log_blocks_per_seg); + cp_blk_no += BIT(le32_to_cpu(fsb->log_blocks_per_seg)); for (i = 1; i < cp_blks; i++) { void *sit_bitmap_ptr; diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index d509b47381d5..c3ba202a7c29 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -673,7 +673,7 @@ static int f2fs_compress_pages(struct compress_ctx *cc) cc->cbuf->clen = cpu_to_le32(cc->clen); - if (fi->i_compress_flag & 1 << COMPRESS_CHKSUM) + if (fi->i_compress_flag & BIT(COMPRESS_CHKSUM)) chksum = f2fs_crc32(F2FS_I_SB(cc->inode), cc->cbuf->cdata, cc->clen); cc->cbuf->chksum = cpu_to_le32(chksum); @@ -771,7 +771,7 @@ void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_task) ret = cops->decompress_pages(dic); - if (!ret && (fi->i_compress_flag & 1 << COMPRESS_CHKSUM)) { + if (!ret && (fi->i_compress_flag & BIT(COMPRESS_CHKSUM))) { u32 provided = le32_to_cpu(dic->cbuf->chksum); u32 calculated = f2fs_crc32(sbi, dic->cbuf->cdata, dic->clen); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index ea05710ca9bd..3666c1fd77a6 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -95,17 +95,17 @@ static enum count_type __read_io_type(struct page *page) /* postprocessing steps for read bios */ enum bio_post_read_step { #ifdef CONFIG_FS_ENCRYPTION - STEP_DECRYPT = 1 << 0, + STEP_DECRYPT = BIT(0), #else STEP_DECRYPT = 0, /* compile out the decryption-related code */ #endif #ifdef CONFIG_F2FS_FS_COMPRESSION - STEP_DECOMPRESS = 1 << 1, + STEP_DECOMPRESS = BIT(1), #else STEP_DECOMPRESS = 0, /* compile out the decompression-related code */ #endif #ifdef CONFIG_FS_VERITY - STEP_VERITY = 1 << 2, + STEP_VERITY = BIT(2), #else STEP_VERITY = 0, /* compile out the verity-related code */ #endif @@ -409,7 +409,7 @@ int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr) static blk_opf_t f2fs_io_flags(struct f2fs_io_info *fio) { - unsigned int temp_mask = (1 << 
NR_TEMP_TYPE) - 1; + unsigned int temp_mask = GENMASK(NR_TEMP_TYPE - 1, 0); unsigned int fua_flag, meta_flag, io_flag; blk_opf_t op_flags = 0; @@ -431,9 +431,9 @@ static blk_opf_t f2fs_io_flags(struct f2fs_io_info *fio) * 5 | 4 | 3 | 2 | 1 | 0 | * Cold | Warm | Hot | Cold | Warm | Hot | */ - if ((1 << fio->temp) & meta_flag) + if (BIT(fio->temp) & meta_flag) op_flags |= REQ_META; - if ((1 << fio->temp) & fua_flag) + if (BIT(fio->temp) & fua_flag) op_flags |= REQ_FUA; return op_flags; } diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 8373eba3a133..510736d2ae11 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -29,7 +29,7 @@ static unsigned long dir_blocks(struct inode *inode) static unsigned int dir_buckets(unsigned int level, int dir_level) { if (level + dir_level < MAX_DIR_HASH_DEPTH / 2) - return 1 << (level + dir_level); + return BIT(level + dir_level); else return MAX_DIR_BUCKETS; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index faf1a4953e84..6fa3ac2097b2 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -64,7 +64,7 @@ enum { }; #ifdef CONFIG_F2FS_FAULT_INJECTION -#define F2FS_ALL_FAULT_TYPE ((1 << FAULT_MAX) - 1) +#define F2FS_ALL_FAULT_TYPE (GENMASK(FAULT_MAX - 1, 0)) struct f2fs_fault_info { atomic_t inject_ops; @@ -73,7 +73,7 @@ struct f2fs_fault_info { }; extern const char *f2fs_fault_name[FAULT_MAX]; -#define IS_FAULT_SET(fi, type) ((fi)->inject_type & (1 << (type))) +#define IS_FAULT_SET(fi, type) ((fi)->inject_type & BIT(type)) #endif /* @@ -1412,7 +1412,7 @@ static inline void set_page_private_##name(struct page *page) \ static inline void clear_page_private_##name(struct page *page) \ { \ clear_bit(PAGE_PRIVATE_##flagname, &page_private(page)); \ - if (page_private(page) == 1 << PAGE_PRIVATE_NOT_POINTER) { \ + if (page_private(page) == BIT(PAGE_PRIVATE_NOT_POINTER)) { \ set_page_private(page, 0); \ if (PagePrivate(page)) { \ ClearPagePrivate(page); \ @@ -1462,8 +1462,8 @@ static inline void set_page_private_data(struct page *page, unsigned long data) static inline void clear_page_private_data(struct page *page) { - page_private(page) &= (1 << PAGE_PRIVATE_MAX) - 1; - if (page_private(page) == 1 << PAGE_PRIVATE_NOT_POINTER) { + page_private(page) &= GENMASK(PAGE_PRIVATE_MAX - 1, 0); + if (page_private(page) == BIT(PAGE_PRIVATE_NOT_POINTER)) { set_page_private(page, 0); if (PagePrivate(page)) { ClearPagePrivate(page); @@ -2882,7 +2882,7 @@ static inline int f2fs_test_bit(unsigned int nr, char *addr) int mask; addr += (nr >> 3); - mask = 1 << (7 - (nr & 0x07)); + mask = BIT(7 - (nr & 0x07)); return mask & *addr; } @@ -2891,7 +2891,7 @@ static inline void f2fs_set_bit(unsigned int nr, char *addr) int mask; addr += (nr >> 3); - mask = 1 << (7 - (nr & 0x07)); + mask = BIT(7 - (nr & 0x07)); *addr |= mask; } @@ -2900,7 +2900,7 @@ static inline void f2fs_clear_bit(unsigned int nr, char *addr) int mask; addr += (nr >> 3); - mask = 1 << (7 - (nr & 0x07)); + mask = BIT(7 - (nr & 0x07)); *addr &= ~mask; } @@ -2910,7 +2910,7 @@ static inline int f2fs_test_and_set_bit(unsigned int nr, char *addr) int ret; addr += (nr >> 3); - mask = 1 << (7 - (nr & 0x07)); + mask = BIT(7 - (nr & 0x07)); ret = mask & *addr; *addr |= mask; return ret; @@ -2922,7 +2922,7 @@ static inline int f2fs_test_and_clear_bit(unsigned int nr, char *addr) int ret; addr += (nr >> 3); - mask = 1 << (7 - (nr & 0x07)); + mask = BIT(7 - (nr & 0x07)); ret = mask & *addr; *addr &= ~mask; return ret; @@ -2933,7 +2933,7 @@ static inline void f2fs_change_bit(unsigned int nr, char *addr) int mask; addr += (nr >> 3); - mask = 
1 << (7 - (nr & 0x07)); + mask = BIT(7 - (nr & 0x07)); *addr ^= mask; } @@ -4333,9 +4333,9 @@ static inline int set_compress_context(struct inode *inode) F2FS_OPTION(sbi).compress_log_size; F2FS_I(inode)->i_compress_flag = F2FS_OPTION(sbi).compress_chksum ? - 1 << COMPRESS_CHKSUM : 0; + BIT(COMPRESS_CHKSUM) : 0; F2FS_I(inode)->i_cluster_size = - 1 << F2FS_I(inode)->i_log_cluster_size; + BIT(F2FS_I(inode)->i_log_cluster_size); if ((F2FS_I(inode)->i_compress_algorithm == COMPRESS_LZ4 || F2FS_I(inode)->i_compress_algorithm == COMPRESS_ZSTD) && F2FS_OPTION(sbi).compress_level) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index d0c17366ebf4..126c074deebd 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -3983,7 +3983,7 @@ static int f2fs_ioc_set_compress_option(struct file *filp, unsigned long arg) F2FS_I(inode)->i_compress_algorithm = option.algorithm; F2FS_I(inode)->i_log_cluster_size = option.log_cluster_size; - F2FS_I(inode)->i_cluster_size = 1 << option.log_cluster_size; + F2FS_I(inode)->i_cluster_size = BIT(option.log_cluster_size); f2fs_mark_inode_dirty_sync(inode, true); if (!f2fs_is_compress_backend_ready(inode)) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 933554985d32..0010579f1736 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -460,8 +460,8 @@ static int do_read_inode(struct inode *inode) fi->i_compress_level = compress_flag >> COMPRESS_LEVEL_OFFSET; fi->i_compress_flag = compress_flag & - (BIT(COMPRESS_LEVEL_OFFSET) - 1); - fi->i_cluster_size = 1 << fi->i_log_cluster_size; + GENMASK(COMPRESS_LEVEL_OFFSET - 1, 0); + fi->i_cluster_size = BIT(fi->i_log_cluster_size); set_inode_flag(inode, FI_COMPRESSED_FILE); } } diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 0aa48704c77a..7068f3ac036a 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -93,17 +93,15 @@ static inline void copy_node_info(struct node_info *dst, static inline void set_nat_flag(struct nat_entry *ne, unsigned int type, bool set) { - unsigned char mask = 0x01 << type; if (set) - ne->ni.flag |= mask; + ne->ni.flag |= BIT(type); else - ne->ni.flag &= ~mask; + ne->ni.flag &= ~BIT(type); } static inline bool get_nat_flag(struct nat_entry *ne, unsigned int type) { - unsigned char mask = 0x01 << type; - return ne->ni.flag & mask; + return ne->ni.flag & BIT(type); } static inline void nat_reset_flag(struct nat_entry *ne) @@ -224,7 +222,7 @@ static inline pgoff_t next_nat_addr(struct f2fs_sb_info *sbi, struct f2fs_nm_info *nm_i = NM_I(sbi); block_addr -= nm_i->nat_blkaddr; - block_addr ^= 1 << sbi->log_blocks_per_seg; + block_addr ^= BIT(sbi->log_blocks_per_seg); return block_addr + nm_i->nat_blkaddr; } @@ -394,7 +392,7 @@ static inline nid_t get_nid(struct page *p, int off, bool i) static inline int is_node(struct page *page, int type) { struct f2fs_node *rn = F2FS_NODE(page); - return le32_to_cpu(rn->footer.flag) & (1 << type); + return le32_to_cpu(rn->footer.flag) & BIT(type); } #define is_cold_node(page) is_node(page, COLD_BIT_SHIFT) @@ -407,9 +405,9 @@ static inline void set_cold_node(struct page *page, bool is_dir) unsigned int flag = le32_to_cpu(rn->footer.flag); if (is_dir) - flag &= ~(0x1 << COLD_BIT_SHIFT); + flag &= ~BIT(COLD_BIT_SHIFT); else - flag |= (0x1 << COLD_BIT_SHIFT); + flag |= BIT(COLD_BIT_SHIFT); rn->footer.flag = cpu_to_le32(flag); } @@ -418,9 +416,9 @@ static inline void set_mark(struct page *page, int mark, int type) struct f2fs_node *rn = F2FS_NODE(page); unsigned int flag = le32_to_cpu(rn->footer.flag); if (mark) - flag |= (0x1 << type); + flag |= BIT(type); else - flag &= ~(0x1 << 
type); + flag &= ~BIT(type); rn->footer.flag = cpu_to_le32(flag); #ifdef CONFIG_F2FS_CHECK_FS diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 1ba85ef97cbd..4f87e0e374c2 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -898,8 +898,8 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) if (args->from && match_int(args, &arg)) return -EINVAL; if (arg <= 0 || arg > __ilog2_u32(BIO_MAX_VECS)) { - f2fs_warn(sbi, "Not support %d, larger than %d", - 1 << arg, BIO_MAX_VECS); + f2fs_warn(sbi, "Not support %ld, larger than %d", + BIT(arg), BIO_MAX_VECS); return -EINVAL; } F2FS_OPTION(sbi).write_io_size_bits = arg; @@ -1340,7 +1340,7 @@ default_check: #endif if (F2FS_IO_SIZE_BITS(sbi) && !f2fs_lfs_mode(sbi)) { - f2fs_err(sbi, "Should set mode=lfs with %uKB-sized IO", + f2fs_err(sbi, "Should set mode=lfs with %luKB-sized IO", F2FS_IO_SIZE_KB(sbi)); return -EINVAL; } @@ -3356,7 +3356,7 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi, total_sections = le32_to_cpu(raw_super->section_count); /* blocks_per_seg should be 512, given the above check */ - blocks_per_seg = 1 << le32_to_cpu(raw_super->log_blocks_per_seg); + blocks_per_seg = BIT(le32_to_cpu(raw_super->log_blocks_per_seg)); if (segment_count > F2FS_MAX_SEGMENT || segment_count < F2FS_MIN_SEGMENTS) { @@ -3625,9 +3625,9 @@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->log_sectors_per_block = le32_to_cpu(raw_super->log_sectors_per_block); sbi->log_blocksize = le32_to_cpu(raw_super->log_blocksize); - sbi->blocksize = 1 << sbi->log_blocksize; + sbi->blocksize = BIT(sbi->log_blocksize); sbi->log_blocks_per_seg = le32_to_cpu(raw_super->log_blocks_per_seg); - sbi->blocks_per_seg = 1 << sbi->log_blocks_per_seg; + sbi->blocks_per_seg = BIT(sbi->log_blocks_per_seg); sbi->segs_per_sec = le32_to_cpu(raw_super->segs_per_sec); sbi->secs_per_zone = le32_to_cpu(raw_super->secs_per_zone); sbi->total_sections = le32_to_cpu(raw_super->section_count); @@ -3883,7 +3883,7 @@ void f2fs_handle_stop(struct f2fs_sb_info *sbi, unsigned char reason) f2fs_down_write(&sbi->sb_lock); - if (raw_super->s_stop_reason[reason] < ((1 << BITS_PER_BYTE) - 1)) + if (raw_super->s_stop_reason[reason] < GENMASK(BITS_PER_BYTE - 1, 0)) raw_super->s_stop_reason[reason]++; err = f2fs_commit_super(sbi, false); @@ -4033,7 +4033,7 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) FDEV(i).start_blk, FDEV(i).end_blk); } f2fs_info(sbi, - "IO Block Size: %8d KB", F2FS_IO_SIZE_KB(sbi)); + "IO Block Size: %8ld KB", F2FS_IO_SIZE_KB(sbi)); return 0; } diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 3d68bfa75cf2..751a108e612f 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -451,7 +451,7 @@ out: if (ret < 0) return ret; #ifdef CONFIG_F2FS_FAULT_INJECTION - if (a->struct_type == FAULT_INFO_TYPE && t >= (1 << FAULT_MAX)) + if (a->struct_type == FAULT_INFO_TYPE && t >= BIT(FAULT_MAX)) return -EINVAL; if (a->struct_type == FAULT_INFO_RATE && t >= UINT_MAX) return -EINVAL; diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index ee0d75d9a302..1e0df607e40c 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -40,9 +40,8 @@ #define F2FS_ENC_UTF8_12_1 1 -#define F2FS_IO_SIZE(sbi) (1 << F2FS_OPTION(sbi).write_io_size_bits) /* Blocks */ -#define F2FS_IO_SIZE_KB(sbi) (1 << (F2FS_OPTION(sbi).write_io_size_bits + 2)) /* KB */ -#define F2FS_IO_SIZE_BYTES(sbi) (1 << (F2FS_OPTION(sbi).write_io_size_bits + 12)) /* B */ +#define F2FS_IO_SIZE(sbi) BIT(F2FS_OPTION(sbi).write_io_size_bits) /* Blocks */ +#define 
F2FS_IO_SIZE_KB(sbi) BIT(F2FS_OPTION(sbi).write_io_size_bits + 2) /* KB */ #define F2FS_IO_SIZE_BITS(sbi) (F2FS_OPTION(sbi).write_io_size_bits) /* power of 2 */ #define F2FS_IO_SIZE_MASK(sbi) (F2FS_IO_SIZE(sbi) - 1) #define F2FS_IO_ALIGNED(sbi) (F2FS_IO_SIZE(sbi) > 1) @@ -340,7 +339,7 @@ enum { OFFSET_BIT_SHIFT }; -#define OFFSET_BIT_MASK (0x07) /* (0x01 << OFFSET_BIT_SHIFT) - 1 */ +#define OFFSET_BIT_MASK GENMASK(OFFSET_BIT_SHIFT - 1, 0) struct node_footer { __le32 nid; /* node id */ @@ -545,7 +544,7 @@ typedef __le32 f2fs_hash_t; #define MAX_DIR_HASH_DEPTH 63 /* MAX buckets in one level of dir */ -#define MAX_DIR_BUCKETS (1 << ((MAX_DIR_HASH_DEPTH / 2) - 1)) +#define MAX_DIR_BUCKETS BIT((MAX_DIR_HASH_DEPTH / 2) - 1) /* * space utilization of regular dentry and inline dentry (w/o extra reservation) From 1ff3f5ef284b31b4901a1890045bbb8f8412d12c Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 12 Jun 2023 12:58:34 -0700 Subject: [PATCH 092/151] f2fs: assign default compression level [ Upstream commit 00e120b5e4b5638cf19eee96d4332f2d100746ba ] Let's avoid any confusion from assigning compress_level=0 for LZ4HC and ZSTD. Signed-off-by: Jaegeuk Kim Stable-dep-of: f5f3bd903a5d ("f2fs: set the default compress_level on ioctl") Signed-off-by: Sasha Levin --- fs/f2fs/compress.c | 3 +-- fs/f2fs/f2fs.h | 2 ++ fs/f2fs/super.c | 12 +++++++----- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index c3ba202a7c29..4cb58e8d699e 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -331,8 +331,6 @@ static const struct f2fs_compress_ops f2fs_lz4_ops = { #endif #ifdef CONFIG_F2FS_FS_ZSTD -#define F2FS_ZSTD_DEFAULT_CLEVEL 1 - static int zstd_init_compress_ctx(struct compress_ctx *cc) { zstd_parameters params; @@ -341,6 +339,7 @@ static int zstd_init_compress_ctx(struct compress_ctx *cc) unsigned int workspace_size; unsigned char level = F2FS_I(cc->inode)->i_compress_level; + /* Need to remain this for backward compatibility */ if (!level) level = F2FS_ZSTD_DEFAULT_CLEVEL; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 6fa3ac2097b2..5c76ba764b71 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1501,6 +1501,8 @@ struct compress_data { #define F2FS_COMPRESSED_PAGE_MAGIC 0xF5F2C000 +#define F2FS_ZSTD_DEFAULT_CLEVEL 1 + #define COMPRESS_LEVEL_OFFSET 8 /* compress context */ diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 4f87e0e374c2..584fe00fdeeb 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -613,14 +613,12 @@ static int f2fs_set_lz4hc_level(struct f2fs_sb_info *sbi, const char *str) { #ifdef CONFIG_F2FS_FS_LZ4HC unsigned int level; -#endif if (strlen(str) == 3) { - F2FS_OPTION(sbi).compress_level = 0; + F2FS_OPTION(sbi).compress_level = LZ4HC_DEFAULT_CLEVEL; return 0; } -#ifdef CONFIG_F2FS_FS_LZ4HC str += 3; if (str[0] != ':') { @@ -638,6 +636,10 @@ static int f2fs_set_lz4hc_level(struct f2fs_sb_info *sbi, const char *str) F2FS_OPTION(sbi).compress_level = level; return 0; #else + if (strlen(str) == 3) { + F2FS_OPTION(sbi).compress_level = 0; + return 0; + } f2fs_info(sbi, "kernel doesn't support lz4hc compression"); return -EINVAL; #endif @@ -651,7 +653,7 @@ static int f2fs_set_zstd_level(struct f2fs_sb_info *sbi, const char *str) int len = 4; if (strlen(str) == len) { - F2FS_OPTION(sbi).compress_level = 0; + F2FS_OPTION(sbi).compress_level = F2FS_ZSTD_DEFAULT_CLEVEL; return 0; } @@ -664,7 +666,7 @@ static int f2fs_set_zstd_level(struct f2fs_sb_info *sbi, const char *str) if (kstrtouint(str + 1, 10, &level)) return -EINVAL; - 
if (!level || level > zstd_max_clevel()) { + if (level < zstd_min_clevel() || level > zstd_max_clevel()) { f2fs_info(sbi, "invalid zstd compress level: %d", level); return -EINVAL; } From 336d1ee07efb87c44ad68a99d7b0e97fedd6c9bc Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 8 Sep 2023 15:41:42 -0700 Subject: [PATCH 093/151] f2fs: set the default compress_level on ioctl [ Upstream commit f5f3bd903a5d3e3b2ba89f11e0e29db25e60c048 ] Otherwise, we'll get a broken inode. # touch $FILE # f2fs_io setflags compression $FILE # f2fs_io set_coption 2 8 $FILE [ 112.227612] F2FS-fs (dm-51): sanity_check_compress_inode: inode (ino=8d3fe) has unsupported compress level: 0, run fsck to fix Cc: stable@vger.kernel.org Signed-off-by: Jaegeuk Kim Signed-off-by: Sasha Levin --- fs/f2fs/file.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 126c074deebd..9b9fb3c57ec6 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -3984,6 +3984,15 @@ static int f2fs_ioc_set_compress_option(struct file *filp, unsigned long arg) F2FS_I(inode)->i_compress_algorithm = option.algorithm; F2FS_I(inode)->i_log_cluster_size = option.log_cluster_size; F2FS_I(inode)->i_cluster_size = BIT(option.log_cluster_size); + /* Set default level */ + if (F2FS_I(inode)->i_compress_algorithm == COMPRESS_ZSTD) + F2FS_I(inode)->i_compress_level = F2FS_ZSTD_DEFAULT_CLEVEL; + else + F2FS_I(inode)->i_compress_level = 0; + /* Adjust mount option level */ + if (option.algorithm == F2FS_OPTION(sbi).compress_algorithm && + F2FS_OPTION(sbi).compress_level) + F2FS_I(inode)->i_compress_level = F2FS_OPTION(sbi).compress_level; f2fs_mark_inode_dirty_sync(inode, true); if (!f2fs_is_compress_backend_ready(inode)) From 4b85e920afc80e36cb0c62aede033e75f71a37b9 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 14 Nov 2023 00:16:17 +0100 Subject: [PATCH 094/151] selftests: mptcp: fix fastclose with csum failure [ Upstream commit 7cefbe5e1dacc7236caa77e9d072423f21422fe2 ] Running the mp_join selftest manually with the following command line: ./mptcp_join.sh -z -C leads to some failures: 002 fastclose server test # ... rtx [fail] got 1 MP_RST[s] TX expected 0 # ... rstrx [fail] got 1 MP_RST[s] RX expected 0 The problem lies in the wrong expectations for the RST checks implied by the csum validation. Note that the same check is repeated explicitly in the same test-case, with the correct expectation, and passes successfully. Address the issue by explicitly setting the correct expectation for the failing checks.
Reported-by: Xiumei Mu Fixes: 6bf41020b72b ("selftests: mptcp: update and extend fastclose test-cases") Cc: stable@vger.kernel.org Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts Signed-off-by: Matthieu Baerts Link: https://lore.kernel.org/r/20231114-upstream-net-20231113-mptcp-misc-fixes-6-7-rc2-v1-5-7b9cd6a7b7f4@kernel.org Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index e52d513009fb..9d8dde3b5c33 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -3041,7 +3041,7 @@ fastclose_tests() if reset_check_counter "fastclose server test" "MPTcpExtMPFastcloseRx"; then run_tests $ns1 $ns2 10.0.1.1 1024 0 fastclose_server - chk_join_nr 0 0 0 + chk_join_nr 0 0 0 0 0 0 1 chk_fclose_nr 1 1 invert chk_rst_nr 1 1 fi From c96a4f936008fd58a462dd5eb5975b013fa19855 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 23 Jun 2023 10:34:09 -0700 Subject: [PATCH 095/151] selftests: mptcp: set FAILING_LINKS in run_tests [ Upstream commit be7e9786c9155c2942cd53b813e4723be67e07c4 ] Set FAILING_LINKS as an env var with a limited scope only when calling run_tests(). Reviewed-by: Matthieu Baerts Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Link: https://lore.kernel.org/r/20230623-send-net-next-20230623-v1-3-a883213c8ba9@kernel.org Signed-off-by: Jakub Kicinski Stable-dep-of: 7cefbe5e1dac ("selftests: mptcp: fix fastclose with csum failure") Signed-off-by: Sasha Levin --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 9d8dde3b5c33..2107579e2939 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -2167,9 +2167,9 @@ link_failure_tests() pm_nl_set_limits $ns1 0 2 pm_nl_add_endpoint $ns1 10.0.2.1 dev ns1eth2 flags signal pm_nl_set_limits $ns2 1 2 - FAILING_LINKS="1" pm_nl_add_endpoint $ns2 10.0.3.2 dev ns2eth3 flags subflow,backup - run_tests $ns1 $ns2 10.0.1.1 1 + FAILING_LINKS="1" \ + run_tests $ns1 $ns2 10.0.1.1 1 chk_join_nr 2 2 2 chk_add_nr 1 1 chk_link_usage $ns2 ns2eth3 $cinsent 0 @@ -2183,8 +2183,8 @@ link_failure_tests() pm_nl_add_endpoint $ns1 10.0.2.1 dev ns1eth2 flags signal pm_nl_set_limits $ns2 1 2 pm_nl_add_endpoint $ns2 10.0.3.2 dev ns2eth3 flags subflow,backup - FAILING_LINKS="1 2" - run_tests $ns1 $ns2 10.0.1.1 1 + FAILING_LINKS="1 2" \ + run_tests $ns1 $ns2 10.0.1.1 1 chk_join_nr 2 2 2 chk_add_nr 1 1 chk_stale_nr $ns2 2 4 2 @@ -2199,8 +2199,8 @@ link_failure_tests() pm_nl_add_endpoint $ns1 10.0.2.1 dev ns1eth2 flags signal pm_nl_set_limits $ns2 1 3 pm_nl_add_endpoint $ns2 10.0.3.2 dev ns2eth3 flags subflow,backup - FAILING_LINKS="1 2" - run_tests $ns1 $ns2 10.0.1.1 2 + FAILING_LINKS="1 2" \ + run_tests $ns1 $ns2 10.0.1.1 2 chk_join_nr 2 2 2 chk_add_nr 1 1 chk_stale_nr $ns2 1 -1 2 From 710f70555d5b6be7503dfa68e575f8cff9e2a8f3 Mon Sep 17 00:00:00 2001 From: Milen Mitkov Date: Fri, 9 Dec 2022 11:40:34 +0200 Subject: [PATCH 096/151] media: camss: sm8250: Virtual channels for CSID [ Upstream commit 3c4ed72a16bc6733cda9c65048af74a2e8eaa0eb ] CSID hardware on SM8250 can demux up to 4 simultaneous streams based on virtual channel (vc) or datatype (dt). 
The CSID subdevice entity now has 4 source ports that can be enabled/disabled and thus can control which virtual channels are enabled. Datatype demuxing has not been tested. In order to keep a valid internal state of the subdevice, implicit format propagation from the sink to the source pads has been preserved. However, the format on each source pad can be different and in that case it must be configured explicitly. CSID's s_stream is called when any stream is started or stopped; it calls configure_stream(), which rewrites the IRQ settings to the hardware. When multiple streams are running simultaneously, writing the IRQ settings for one stream while another is still running causes problems, so avoid re-writing the settings unless they were changed by link setup or by fully powering off the CSID hardware. Signed-off-by: Milen Mitkov Reviewed-by: Robert Foss Tested-by: Bryan O'Donoghue Acked-by: Robert Foss Signed-off-by: Hans Verkuil Stable-dep-of: e655d1ae9703 ("media: qcom: camss: Fix set CSI2_RX_CFG1_VC_MODE when VC is greater than 3") Signed-off-by: Sasha Levin --- .../platform/qcom/camss/camss-csid-gen2.c | 54 ++++++++++++------- .../media/platform/qcom/camss/camss-csid.c | 44 ++++++++++----- .../media/platform/qcom/camss/camss-csid.h | 11 +++- 3 files changed, 74 insertions(+), 35 deletions(-) diff --git a/drivers/media/platform/qcom/camss/camss-csid-gen2.c b/drivers/media/platform/qcom/camss/camss-csid-gen2.c index 904208f6f954..2e015e69a6ad 100644 --- a/drivers/media/platform/qcom/camss/camss-csid-gen2.c +++ b/drivers/media/platform/qcom/camss/camss-csid-gen2.c @@ -334,13 +334,14 @@ static const struct csid_format csid_formats[] = { }, }; -static void csid_configure_stream(struct csid_device *csid, u8 enable) +static void __csid_configure_stream(struct csid_device *csid, u8 enable, u8 vc) { struct csid_testgen_config *tg = &csid->testgen; u32 val; u32 phy_sel = 0; u8 lane_cnt = csid->phy.lane_cnt; - struct v4l2_mbus_framefmt *input_format = &csid->fmt[MSM_CSID_PAD_SRC]; + /* Source pads matching RDI channels on hardware. Pad 1 -> RDI0, Pad 2 -> RDI1, etc.
*/ + struct v4l2_mbus_framefmt *input_format = &csid->fmt[MSM_CSID_PAD_FIRST_SRC + vc]; const struct csid_format *format = csid_get_fmt_entry(csid->formats, csid->nformats, input_format->code); @@ -351,8 +352,7 @@ static void csid_configure_stream(struct csid_device *csid, u8 enable) phy_sel = csid->phy.csiphy_id; if (enable) { - u8 vc = 0; /* Virtual Channel 0 */ - u8 dt_id = vc * 4; + u8 dt_id = vc; if (tg->enabled) { /* configure one DT, infinite frames */ @@ -392,42 +392,42 @@ static void csid_configure_stream(struct csid_device *csid, u8 enable) val |= format->data_type << RDI_CFG0_DATA_TYPE; val |= vc << RDI_CFG0_VIRTUAL_CHANNEL; val |= dt_id << RDI_CFG0_DT_ID; - writel_relaxed(val, csid->base + CSID_RDI_CFG0(0)); + writel_relaxed(val, csid->base + CSID_RDI_CFG0(vc)); /* CSID_TIMESTAMP_STB_POST_IRQ */ val = 2 << RDI_CFG1_TIMESTAMP_STB_SEL; - writel_relaxed(val, csid->base + CSID_RDI_CFG1(0)); + writel_relaxed(val, csid->base + CSID_RDI_CFG1(vc)); val = 1; - writel_relaxed(val, csid->base + CSID_RDI_FRM_DROP_PERIOD(0)); + writel_relaxed(val, csid->base + CSID_RDI_FRM_DROP_PERIOD(vc)); val = 0; - writel_relaxed(val, csid->base + CSID_RDI_FRM_DROP_PATTERN(0)); + writel_relaxed(val, csid->base + CSID_RDI_FRM_DROP_PATTERN(vc)); val = 1; - writel_relaxed(val, csid->base + CSID_RDI_IRQ_SUBSAMPLE_PERIOD(0)); + writel_relaxed(val, csid->base + CSID_RDI_IRQ_SUBSAMPLE_PERIOD(vc)); val = 0; - writel_relaxed(val, csid->base + CSID_RDI_IRQ_SUBSAMPLE_PATTERN(0)); + writel_relaxed(val, csid->base + CSID_RDI_IRQ_SUBSAMPLE_PATTERN(vc)); val = 1; - writel_relaxed(val, csid->base + CSID_RDI_RPP_PIX_DROP_PERIOD(0)); + writel_relaxed(val, csid->base + CSID_RDI_RPP_PIX_DROP_PERIOD(vc)); val = 0; - writel_relaxed(val, csid->base + CSID_RDI_RPP_PIX_DROP_PATTERN(0)); + writel_relaxed(val, csid->base + CSID_RDI_RPP_PIX_DROP_PATTERN(vc)); val = 1; - writel_relaxed(val, csid->base + CSID_RDI_RPP_LINE_DROP_PERIOD(0)); + writel_relaxed(val, csid->base + CSID_RDI_RPP_LINE_DROP_PERIOD(vc)); val = 0; - writel_relaxed(val, csid->base + CSID_RDI_RPP_LINE_DROP_PATTERN(0)); + writel_relaxed(val, csid->base + CSID_RDI_RPP_LINE_DROP_PATTERN(vc)); val = 0; - writel_relaxed(val, csid->base + CSID_RDI_CTRL(0)); + writel_relaxed(val, csid->base + CSID_RDI_CTRL(vc)); - val = readl_relaxed(csid->base + CSID_RDI_CFG0(0)); + val = readl_relaxed(csid->base + CSID_RDI_CFG0(vc)); val |= 1 << RDI_CFG0_ENABLE; - writel_relaxed(val, csid->base + CSID_RDI_CFG0(0)); + writel_relaxed(val, csid->base + CSID_RDI_CFG0(vc)); } if (tg->enabled) { @@ -453,7 +453,16 @@ static void csid_configure_stream(struct csid_device *csid, u8 enable) val = HALT_CMD_RESUME_AT_FRAME_BOUNDARY << RDI_CTRL_HALT_CMD; else val = HALT_CMD_HALT_AT_FRAME_BOUNDARY << RDI_CTRL_HALT_CMD; - writel_relaxed(val, csid->base + CSID_RDI_CTRL(0)); + writel_relaxed(val, csid->base + CSID_RDI_CTRL(vc)); +} + +static void csid_configure_stream(struct csid_device *csid, u8 enable) +{ + u8 i; + /* Loop through all enabled VCs and configure stream for each */ + for (i = 0; i < MSM_CSID_MAX_SRC_STREAMS; i++) + if (csid->phy.en_vc & BIT(i)) + __csid_configure_stream(csid, enable, i); } static int csid_configure_testgen_pattern(struct csid_device *csid, s32 val) @@ -499,6 +508,7 @@ static irqreturn_t csid_isr(int irq, void *dev) struct csid_device *csid = dev; u32 val; u8 reset_done; + int i; val = readl_relaxed(csid->base + CSID_TOP_IRQ_STATUS); writel_relaxed(val, csid->base + CSID_TOP_IRQ_CLEAR); @@ -507,8 +517,12 @@ static irqreturn_t csid_isr(int irq, void *dev) val = 
readl_relaxed(csid->base + CSID_CSI2_RX_IRQ_STATUS); writel_relaxed(val, csid->base + CSID_CSI2_RX_IRQ_CLEAR); - val = readl_relaxed(csid->base + CSID_CSI2_RDIN_IRQ_STATUS(0)); - writel_relaxed(val, csid->base + CSID_CSI2_RDIN_IRQ_CLEAR(0)); + /* Read and clear IRQ status for each enabled RDI channel */ + for (i = 0; i < MSM_CSID_MAX_SRC_STREAMS; i++) + if (csid->phy.en_vc & BIT(i)) { + val = readl_relaxed(csid->base + CSID_CSI2_RDIN_IRQ_STATUS(i)); + writel_relaxed(val, csid->base + CSID_CSI2_RDIN_IRQ_CLEAR(i)); + } val = 1 << IRQ_CMD_CLEAR; writel_relaxed(val, csid->base + CSID_IRQ_CMD); diff --git a/drivers/media/platform/qcom/camss/camss-csid.c b/drivers/media/platform/qcom/camss/camss-csid.c index 88f188e0f750..6360314f04a6 100644 --- a/drivers/media/platform/qcom/camss/camss-csid.c +++ b/drivers/media/platform/qcom/camss/camss-csid.c @@ -196,6 +196,8 @@ static int csid_set_power(struct v4l2_subdev *sd, int on) return ret; } + csid->phy.need_vc_update = true; + enable_irq(csid->irq); ret = csid->ops->reset(csid); @@ -249,7 +251,10 @@ static int csid_set_stream(struct v4l2_subdev *sd, int enable) return -ENOLINK; } - csid->ops->configure_stream(csid, enable); + if (csid->phy.need_vc_update) { + csid->ops->configure_stream(csid, enable); + csid->phy.need_vc_update = false; + } return 0; } @@ -460,6 +465,7 @@ static int csid_set_format(struct v4l2_subdev *sd, { struct csid_device *csid = v4l2_get_subdevdata(sd); struct v4l2_mbus_framefmt *format; + int i; format = __csid_get_format(csid, sd_state, fmt->pad, fmt->which); if (format == NULL) @@ -468,14 +474,14 @@ static int csid_set_format(struct v4l2_subdev *sd, csid_try_format(csid, sd_state, fmt->pad, &fmt->format, fmt->which); *format = fmt->format; - /* Propagate the format from sink to source */ + /* Propagate the format from sink to source pads */ if (fmt->pad == MSM_CSID_PAD_SINK) { - format = __csid_get_format(csid, sd_state, MSM_CSID_PAD_SRC, - fmt->which); + for (i = MSM_CSID_PAD_FIRST_SRC; i < MSM_CSID_PADS_NUM; ++i) { + format = __csid_get_format(csid, sd_state, i, fmt->which); - *format = fmt->format; - csid_try_format(csid, sd_state, MSM_CSID_PAD_SRC, format, - fmt->which); + *format = fmt->format; + csid_try_format(csid, sd_state, i, format, fmt->which); + } } return 0; @@ -738,7 +744,6 @@ static int csid_link_setup(struct media_entity *entity, struct csid_device *csid; struct csiphy_device *csiphy; struct csiphy_lanes_cfg *lane_cfg; - struct v4l2_subdev_format format = { 0 }; sd = media_entity_to_v4l2_subdev(entity); csid = v4l2_get_subdevdata(sd); @@ -761,11 +766,22 @@ static int csid_link_setup(struct media_entity *entity, lane_cfg = &csiphy->cfg.csi2->lane_cfg; csid->phy.lane_cnt = lane_cfg->num_data; csid->phy.lane_assign = csid_get_lane_assign(lane_cfg); + } + /* Decide which virtual channels to enable based on which source pads are enabled */ + if (local->flags & MEDIA_PAD_FL_SOURCE) { + struct v4l2_subdev *sd = media_entity_to_v4l2_subdev(entity); + struct csid_device *csid = v4l2_get_subdevdata(sd); + struct device *dev = csid->camss->dev; - /* Reset format on source pad to sink pad format */ - format.pad = MSM_CSID_PAD_SRC; - format.which = V4L2_SUBDEV_FORMAT_ACTIVE; - csid_set_format(&csid->subdev, NULL, &format); + if (flags & MEDIA_LNK_FL_ENABLED) + csid->phy.en_vc |= BIT(local->index - 1); + else + csid->phy.en_vc &= ~BIT(local->index - 1); + + csid->phy.need_vc_update = true; + + dev_dbg(dev, "%s: Enabled CSID virtual channels mask 0x%x\n", + __func__, csid->phy.en_vc); } return 0; @@ -816,6 +832,7 @@ int 
msm_csid_register_entity(struct csid_device *csid, struct v4l2_subdev *sd = &csid->subdev; struct media_pad *pads = csid->pads; struct device *dev = csid->camss->dev; + int i; int ret; v4l2_subdev_init(sd, &csid_v4l2_ops); @@ -852,7 +869,8 @@ int msm_csid_register_entity(struct csid_device *csid, } pads[MSM_CSID_PAD_SINK].flags = MEDIA_PAD_FL_SINK; - pads[MSM_CSID_PAD_SRC].flags = MEDIA_PAD_FL_SOURCE; + for (i = MSM_CSID_PAD_FIRST_SRC; i < MSM_CSID_PADS_NUM; ++i) + pads[i].flags = MEDIA_PAD_FL_SOURCE; sd->entity.function = MEDIA_ENT_F_PROC_VIDEO_PIXEL_FORMATTER; sd->entity.ops = &csid_media_ops; diff --git a/drivers/media/platform/qcom/camss/camss-csid.h b/drivers/media/platform/qcom/camss/camss-csid.h index f06040e44c51..d4b48432a097 100644 --- a/drivers/media/platform/qcom/camss/camss-csid.h +++ b/drivers/media/platform/qcom/camss/camss-csid.h @@ -19,8 +19,13 @@ #include #define MSM_CSID_PAD_SINK 0 -#define MSM_CSID_PAD_SRC 1 -#define MSM_CSID_PADS_NUM 2 +#define MSM_CSID_PAD_FIRST_SRC 1 +#define MSM_CSID_PADS_NUM 5 + +#define MSM_CSID_PAD_SRC (MSM_CSID_PAD_FIRST_SRC) + +/* CSID hardware can demultiplex up to 4 outputs */ +#define MSM_CSID_MAX_SRC_STREAMS 4 #define DATA_TYPE_EMBEDDED_DATA_8BIT 0x12 #define DATA_TYPE_YUV420_8BIT 0x18 @@ -81,6 +86,8 @@ struct csid_phy_config { u8 csiphy_id; u8 lane_cnt; u32 lane_assign; + u32 en_vc; + u8 need_vc_update; }; struct csid_device; From b92a8f591ca8ba302d486728746a1afaeadf6b00 Mon Sep 17 00:00:00 2001 From: Bryan O'Donoghue Date: Wed, 30 Aug 2023 16:16:14 +0100 Subject: [PATCH 097/151] media: qcom: camss: Fix set CSI2_RX_CFG1_VC_MODE when VC is greater than 3 [ Upstream commit e655d1ae9703286cef7fda8675cad62f649dc183 ] VC_MODE = 0 implies a two bit VC address. VC_MODE = 1 is required for VCs with a larger address than two bits. Fixes: eebe6d00e9bf ("media: camss: Add support for CSID hardware version Titan 170") Cc: stable@vger.kernel.org Signed-off-by: Bryan O'Donoghue Reviewed-by: Laurent Pinchart Signed-off-by: Hans Verkuil Signed-off-by: Sasha Levin --- drivers/media/platform/qcom/camss/camss-csid-gen2.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/media/platform/qcom/camss/camss-csid-gen2.c b/drivers/media/platform/qcom/camss/camss-csid-gen2.c index 2e015e69a6ad..23acc387be5f 100644 --- a/drivers/media/platform/qcom/camss/camss-csid-gen2.c +++ b/drivers/media/platform/qcom/camss/camss-csid-gen2.c @@ -446,6 +446,8 @@ static void __csid_configure_stream(struct csid_device *csid, u8 enable, u8 vc) writel_relaxed(val, csid->base + CSID_CSI2_RX_CFG0); val = 1 << CSI2_RX_CFG1_PACKET_ECC_CORRECTION_EN; + if (vc > 3) + val |= 1 << CSI2_RX_CFG1_VC_MODE; val |= 1 << CSI2_RX_CFG1_MISR_EN; writel_relaxed(val, csid->base + CSID_CSI2_RX_CFG1); From 4c78612e5fbc632f1568fa237d261241061cc7c2 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Thu, 17 Nov 2022 23:30:52 -0800 Subject: [PATCH 098/151] ext4: convert move_extent_per_page() to use folios [ Upstream commit 6dd8fe86fa84729538d8bed3149faf9c5886bb5b ] Patch series "Removing the try_to_release_page() wrapper", v3. This patchset replaces the remaining calls of try_to_release_page() with the folio equivalent: filemap_release_folio(). This allows us to remove the wrapper. This patch (of 4): Convert move_extent_per_page() to use folios. This change removes 5 calls to compound_head() and is in preparation for the removal of the try_to_release_page() wrapper. 
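As a minimal sketch of the conversion pattern this patch applies (names follow the hunks below; ext4's locking and error handling are elided), each page is resolved to its folio once, after which only folio-based helpers are used:

        /* Resolve the head page a single time ... */
        struct folio *folio = page_folio(page);

        /* ... so later helpers avoid a hidden compound_head() per call. */
        if (folio_has_private(folio) &&
            !filemap_release_folio(folio, 0))
                return -EBUSY;
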
Link: https://lkml.kernel.org/r/20221118073055.55694-1-vishal.moola@gmail.com Link: https://lkml.kernel.org/r/20221118073055.55694-2-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Cc: Matthew Wilcox Cc: Naoya Horiguchi Cc: Theodore Ts'o Signed-off-by: Andrew Morton Stable-dep-of: 1898efcdbed3 ("block: update the stable_writes flag in bdev_add") Signed-off-by: Sasha Levin --- fs/ext4/move_extent.c | 52 ++++++++++++++++++++++++++----------------- 1 file changed, 31 insertions(+), 21 deletions(-) diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 044e34cd835c..8dbb87edf24c 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -253,6 +253,7 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, { struct inode *orig_inode = file_inode(o_filp); struct page *pagep[2] = {NULL, NULL}; + struct folio *folio[2] = {NULL, NULL}; handle_t *handle; ext4_lblk_t orig_blk_offset, donor_blk_offset; unsigned long blocksize = orig_inode->i_sb->s_blocksize; @@ -313,6 +314,13 @@ again: * hold page's lock, if it is still the case data copy is not * necessary, just swap data blocks between orig and donor. */ + folio[0] = page_folio(pagep[0]); + folio[1] = page_folio(pagep[1]); + + VM_BUG_ON_FOLIO(folio_test_large(folio[0]), folio[0]); + VM_BUG_ON_FOLIO(folio_test_large(folio[1]), folio[1]); + VM_BUG_ON_FOLIO(folio_nr_pages(folio[0]) != folio_nr_pages(folio[1]), folio[1]); + if (unwritten) { ext4_double_down_write_data_sem(orig_inode, donor_inode); /* If any of extents in range became initialized we have to @@ -331,10 +339,10 @@ again: ext4_double_up_write_data_sem(orig_inode, donor_inode); goto data_copy; } - if ((page_has_private(pagep[0]) && - !try_to_release_page(pagep[0], 0)) || - (page_has_private(pagep[1]) && - !try_to_release_page(pagep[1], 0))) { + if ((folio_has_private(folio[0]) && + !filemap_release_folio(folio[0], 0)) || + (folio_has_private(folio[1]) && + !filemap_release_folio(folio[1], 0))) { *err = -EBUSY; goto drop_data_sem; } @@ -344,19 +352,21 @@ again: block_len_in_page, 1, err); drop_data_sem: ext4_double_up_write_data_sem(orig_inode, donor_inode); - goto unlock_pages; + goto unlock_folios; } data_copy: - *err = mext_page_mkuptodate(pagep[0], from, from + replaced_size); + *err = mext_page_mkuptodate(&folio[0]->page, from, from + replaced_size); if (*err) - goto unlock_pages; + goto unlock_folios; /* At this point all buffers in range are uptodate, old mapping layout * is no longer required, try to drop it now. 
*/ - if ((page_has_private(pagep[0]) && !try_to_release_page(pagep[0], 0)) || - (page_has_private(pagep[1]) && !try_to_release_page(pagep[1], 0))) { + if ((folio_has_private(folio[0]) && + !filemap_release_folio(folio[0], 0)) || + (folio_has_private(folio[1]) && + !filemap_release_folio(folio[1], 0))) { *err = -EBUSY; - goto unlock_pages; + goto unlock_folios; } ext4_double_down_write_data_sem(orig_inode, donor_inode); replaced_count = ext4_swap_extents(handle, orig_inode, donor_inode, @@ -369,13 +379,13 @@ data_copy: replaced_size = block_len_in_page << orig_inode->i_blkbits; } else - goto unlock_pages; + goto unlock_folios; } /* Perform all necessary steps similar write_begin()/write_end() * but keeping in mind that i_size will not change */ - if (!page_has_buffers(pagep[0])) - create_empty_buffers(pagep[0], 1 << orig_inode->i_blkbits, 0); - bh = page_buffers(pagep[0]); + if (!folio_buffers(folio[0])) + create_empty_buffers(&folio[0]->page, 1 << orig_inode->i_blkbits, 0); + bh = folio_buffers(folio[0]); for (i = 0; i < data_offset_in_page; i++) bh = bh->b_this_page; for (i = 0; i < block_len_in_page; i++) { @@ -385,7 +395,7 @@ data_copy: bh = bh->b_this_page; } if (!*err) - *err = block_commit_write(pagep[0], from, from + replaced_size); + *err = block_commit_write(&folio[0]->page, from, from + replaced_size); if (unlikely(*err < 0)) goto repair_branches; @@ -395,11 +405,11 @@ data_copy: *err = ext4_jbd2_inode_add_write(handle, orig_inode, (loff_t)orig_page_offset << PAGE_SHIFT, replaced_size); -unlock_pages: - unlock_page(pagep[0]); - put_page(pagep[0]); - unlock_page(pagep[1]); - put_page(pagep[1]); +unlock_folios: + folio_unlock(folio[0]); + folio_put(folio[0]); + folio_unlock(folio[1]); + folio_put(folio[1]); stop_journal: ext4_journal_stop(handle); if (*err == -ENOSPC && @@ -430,7 +440,7 @@ repair_branches: *err = -EIO; } replaced_count = 0; - goto unlock_pages; + goto unlock_folios; } /** From a6f440f3b9569dd3aca7faaddb08678bcff12847 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Thu, 17 Nov 2022 23:30:53 -0800 Subject: [PATCH 099/151] khugepage: replace try_to_release_page() with filemap_release_folio() [ Upstream commit 64ab3195ea077eaeedc8b382939c3dc5ca56f369 ] Replace some calls with their folio equivalents. This change removes 4 calls to compound_head() and is in preparation for the removal of the try_to_release_page() wrapper. 
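For reference, the wrapper being phased out amounts to a one-line shim over the folio API; this is a sketch based on the series description, not a hunk from this patch:

        /* Legacy page-based entry point: resolve the folio and delegate. */
        bool try_to_release_page(struct page *page, gfp_t gfp)
        {
                return filemap_release_folio(page_folio(page), gfp);
        }

Each conversion in the series therefore removes a layer of indirection along with the compound_head() lookup hidden inside the page-based helpers.
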
Link: https://lkml.kernel.org/r/20221118073055.55694-3-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Cc: Matthew Wilcox Cc: Naoya Horiguchi Cc: Theodore Ts'o Signed-off-by: Andrew Morton Stable-dep-of: 1898efcdbed3 ("block: update the stable_writes flag in bdev_add") Signed-off-by: Sasha Levin --- mm/khugepaged.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index ef72d3df4b65..6fc7db587c45 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1818,6 +1818,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, xas_set(&xas, start); for (index = start; index < end; index++) { struct page *page = xas_next(&xas); + struct folio *folio; VM_BUG_ON(index != xas.xa_index); if (is_shmem) { @@ -1844,8 +1845,6 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, } if (xa_is_value(page) || !PageUptodate(page)) { - struct folio *folio; - xas_unlock_irq(&xas); /* swap in or instantiate fallocated page */ if (shmem_get_folio(mapping->host, index, @@ -1933,13 +1932,15 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, goto out_unlock; } - if (page_mapping(page) != mapping) { + folio = page_folio(page); + + if (folio_mapping(folio) != mapping) { result = SCAN_TRUNCATED; goto out_unlock; } - if (!is_shmem && (PageDirty(page) || - PageWriteback(page))) { + if (!is_shmem && (folio_test_dirty(folio) || + folio_test_writeback(folio))) { /* * khugepaged only works on read-only fd, so this * page is dirty because it hasn't been flushed @@ -1949,20 +1950,20 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, goto out_unlock; } - if (isolate_lru_page(page)) { + if (folio_isolate_lru(folio)) { result = SCAN_DEL_PAGE_LRU; goto out_unlock; } - if (page_has_private(page) && - !try_to_release_page(page, GFP_KERNEL)) { + if (folio_has_private(folio) && + !filemap_release_folio(folio, GFP_KERNEL)) { result = SCAN_PAGE_HAS_PRIVATE; - putback_lru_page(page); + folio_putback_lru(folio); goto out_unlock; } - if (page_mapped(page)) - try_to_unmap(page_folio(page), + if (folio_mapped(folio)) + try_to_unmap(folio, TTU_IGNORE_MLOCK | TTU_BATCH_FLUSH); xas_lock_irq(&xas); From 8b6b3ecf0c1391ced217def3798546ed2c37fd7b Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Thu, 17 Nov 2022 23:30:54 -0800 Subject: [PATCH 100/151] memory-failure: convert truncate_error_page() to use folio [ Upstream commit ac5efa782041670b63a05c36d92d02a80e50bb63 ] Replace try_to_release_page() with filemap_release_folio(). This change is in preparation for the removal of the try_to_release_page() wrapper. 
Link: https://lkml.kernel.org/r/20221118073055.55694-4-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Naoya Horiguchi Cc: Matthew Wilcox Cc: Theodore Ts'o Signed-off-by: Andrew Morton Stable-dep-of: 1898efcdbed3 ("block: update the stable_writes flag in bdev_add") Signed-off-by: Sasha Levin --- mm/memory-failure.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index ebd717157c81..6355166a6bb2 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -827,12 +827,13 @@ static int truncate_error_page(struct page *p, unsigned long pfn, int ret = MF_FAILED; if (mapping->a_ops->error_remove_page) { + struct folio *folio = page_folio(p); int err = mapping->a_ops->error_remove_page(mapping, p); if (err != 0) { pr_info("%#lx: Failed to punch page: %d\n", pfn, err); - } else if (page_has_private(p) && - !try_to_release_page(p, GFP_NOIO)) { + } else if (folio_has_private(folio) && + !filemap_release_folio(folio, GFP_NOIO)) { pr_info("%#lx: failed to release buffers\n", pfn); } else { ret = MF_RECOVERED; From bceff380f361c4369217ce869d26bfe353ebb55a Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 28 Jun 2023 11:48:51 +0100 Subject: [PATCH 101/151] mm: merge folio_has_private()/filemap_release_folio() call pairs [ Upstream commit 0201ebf274a306a6ebb95e5dc2d6a0a27c737cac ] Patch series "mm, netfs, fscache: Stop read optimisation when folio removed from pagecache", v7. This fixes an optimisation in fscache whereby we don't read from the cache for a particular file until we know that there's data there that we don't have in the pagecache. The problem is that I'm no longer using PG_fscache (aka PG_private_2) to indicate that the page is cached and so I don't get a notification when a cached page is dropped from the pagecache. The first patch merges some folio_has_private() and filemap_release_folio() pairs and introduces a helper, folio_needs_release(), to indicate if a release is required. The second patch is the actual fix. Following Willy's suggestions[1], it adds an AS_RELEASE_ALWAYS flag to an address_space that will make filemap_release_folio() always call ->release_folio(), even if PG_private/PG_private_2 aren't set. folio_needs_release() is altered to add a check for this. This patch (of 2): Make filemap_release_folio() check folio_has_private(). Then, in most cases, where a call to folio_has_private() is immediately followed by a call to filemap_release_folio(), we can get rid of the test in the pair. There are a couple of sites in mm/vmscan.c where this can't be done so easily. In shrink_folio_list(), there are actually three cases (something different is done for incompletely invalidated buffers), but filemap_release_folio() elides two of them. In shrink_active_list(), we don't have the folio lock yet, so the check allows us to avoid locking the page unnecessarily. A wrapper function to check if a folio needs release is provided for those places that still need to do it in the mm/ directory. The condition will acquire additional parts in a future patch. After this, the only remaining caller of folio_has_private() outside of mm/ is a check in fuse.
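The caller-side effect of merging the call pairs looks like this (a sketch; the folio_needs_release() helper itself is added to mm/internal.h in the diff below):

        /* Before: every caller open-coded the private-data test. */
        if (folio_has_private(folio) &&
            !filemap_release_folio(folio, gfp))
                goto cannot_release;

        /* After: filemap_release_folio() runs folio_needs_release()
         * itself and returns true when there is nothing to drop.
         */
        if (!filemap_release_folio(folio, gfp))
                goto cannot_release;
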
Link: https://lkml.kernel.org/r/20230628104852.3391651-1-dhowells@redhat.com Link: https://lkml.kernel.org/r/20230628104852.3391651-2-dhowells@redhat.com Reported-by: Rohith Surabattula Suggested-by: Matthew Wilcox Signed-off-by: David Howells Cc: Matthew Wilcox Cc: Linus Torvalds Cc: Steve French Cc: Shyam Prasad N Cc: Rohith Surabattula Cc: Dave Wysochanski Cc: Dominique Martinet Cc: Ilya Dryomov Cc: "Theodore Ts'o" Cc: Andreas Dilger Cc: Xiubo Li Cc: Jingbo Xu Signed-off-by: Andrew Morton Stable-dep-of: 1898efcdbed3 ("block: update the stable_writes flag in bdev_add") Signed-off-by: Sasha Levin --- fs/ext4/move_extent.c | 12 ++++-------- fs/splice.c | 3 +-- mm/filemap.c | 2 ++ mm/huge_memory.c | 3 +-- mm/internal.h | 8 ++++++++ mm/khugepaged.c | 3 +-- mm/memory-failure.c | 8 +++----- mm/migrate.c | 3 +-- mm/truncate.c | 6 ++---- mm/vmscan.c | 8 ++++---- 10 files changed, 27 insertions(+), 29 deletions(-) diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 8dbb87edf24c..dedc9d445f24 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -339,10 +339,8 @@ again: ext4_double_up_write_data_sem(orig_inode, donor_inode); goto data_copy; } - if ((folio_has_private(folio[0]) && - !filemap_release_folio(folio[0], 0)) || - (folio_has_private(folio[1]) && - !filemap_release_folio(folio[1], 0))) { + if (!filemap_release_folio(folio[0], 0) || + !filemap_release_folio(folio[1], 0)) { *err = -EBUSY; goto drop_data_sem; } @@ -361,10 +359,8 @@ data_copy: /* At this point all buffers in range are uptodate, old mapping layout * is no longer required, try to drop it now. */ - if ((folio_has_private(folio[0]) && - !filemap_release_folio(folio[0], 0)) || - (folio_has_private(folio[1]) && - !filemap_release_folio(folio[1], 0))) { + if (!filemap_release_folio(folio[0], 0) || + !filemap_release_folio(folio[1], 0)) { *err = -EBUSY; goto unlock_folios; } diff --git a/fs/splice.c b/fs/splice.c index c4ae54deac42..d0230cf8ec57 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -65,8 +65,7 @@ static bool page_cache_pipe_buf_try_steal(struct pipe_inode_info *pipe, */ folio_wait_writeback(folio); - if (folio_has_private(folio) && - !filemap_release_folio(folio, GFP_KERNEL)) + if (!filemap_release_folio(folio, GFP_KERNEL)) goto out_unlock; /* diff --git a/mm/filemap.c b/mm/filemap.c index 10fe6430693b..2809b1174f04 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -4005,6 +4005,8 @@ bool filemap_release_folio(struct folio *folio, gfp_t gfp) struct address_space * const mapping = folio->mapping; BUG_ON(!folio_test_locked(folio)); + if (!folio_needs_release(folio)) + return true; if (folio_test_writeback(folio)) return false; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 2753fb54cdf3..59577946735b 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2694,8 +2694,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) gfp = current_gfp_context(mapping_gfp_mask(mapping) & GFP_RECLAIM_MASK); - if (folio_test_private(folio) && - !filemap_release_folio(folio, gfp)) { + if (!filemap_release_folio(folio, gfp)) { ret = -EBUSY; goto out; } diff --git a/mm/internal.h b/mm/internal.h index 6b7ef495b56d..1fefb5181ab7 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -163,6 +163,14 @@ static inline void set_page_refcounted(struct page *page) set_page_count(page, 1); } +/* + * Return true if a folio needs ->release_folio() calling upon it. 
+ */ +static inline bool folio_needs_release(struct folio *folio) +{ + return folio_has_private(folio); +} + extern unsigned long highest_memmap_pfn; /* diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 6fc7db587c45..65bd0b105266 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1955,8 +1955,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, goto out_unlock; } - if (folio_has_private(folio) && - !filemap_release_folio(folio, GFP_KERNEL)) { + if (!filemap_release_folio(folio, GFP_KERNEL)) { result = SCAN_PAGE_HAS_PRIVATE; folio_putback_lru(folio); goto out_unlock; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 6355166a6bb2..5b846ed5dcbe 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -830,14 +830,12 @@ static int truncate_error_page(struct page *p, unsigned long pfn, struct folio *folio = page_folio(p); int err = mapping->a_ops->error_remove_page(mapping, p); - if (err != 0) { + if (err != 0) pr_info("%#lx: Failed to punch page: %d\n", pfn, err); - } else if (folio_has_private(folio) && - !filemap_release_folio(folio, GFP_NOIO)) { + else if (!filemap_release_folio(folio, GFP_NOIO)) pr_info("%#lx: failed to release buffers\n", pfn); - } else { + else ret = MF_RECOVERED; - } } else { /* * If the file system doesn't support it just invalidate diff --git a/mm/migrate.c b/mm/migrate.c index 91bd69c61148..c93dd6a31c31 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -914,8 +914,7 @@ static int fallback_migrate_folio(struct address_space *mapping, * Buffers may be managed in a filesystem specific way. * We must have no buffers or drop them. */ - if (folio_test_private(src) && - !filemap_release_folio(src, GFP_KERNEL)) + if (!filemap_release_folio(src, GFP_KERNEL)) return mode == MIGRATE_SYNC ? -EAGAIN : -EBUSY; return migrate_folio(mapping, dst, src, mode); diff --git a/mm/truncate.c b/mm/truncate.c index c0be77e5c008..0d4dd233f518 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -19,7 +19,6 @@ #include #include #include -#include /* grr. try_to_release_page */ #include #include #include "internal.h" @@ -276,7 +275,7 @@ static long mapping_evict_folio(struct address_space *mapping, if (folio_ref_count(folio) > folio_nr_pages(folio) + folio_has_private(folio) + 1) return 0; - if (folio_has_private(folio) && !filemap_release_folio(folio, 0)) + if (!filemap_release_folio(folio, 0)) return 0; return remove_mapping(mapping, folio); @@ -581,8 +580,7 @@ static int invalidate_complete_folio2(struct address_space *mapping, if (folio->mapping != mapping) return 0; - if (folio_has_private(folio) && - !filemap_release_folio(folio, GFP_KERNEL)) + if (!filemap_release_folio(folio, GFP_KERNEL)) return 0; spin_lock(&mapping->host->i_lock); diff --git a/mm/vmscan.c b/mm/vmscan.c index 3f090faa6377..9f3cfb7caa48 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1992,7 +1992,7 @@ retry: * (refcount == 1) it can be freed. Otherwise, leave * the folio on the LRU so it is swappable. 
*/ - if (folio_has_private(folio)) { + if (folio_needs_release(folio)) { if (!filemap_release_folio(folio, sc->gfp_mask)) goto activate_locked; if (!mapping && folio_ref_count(folio) == 1) { @@ -2618,9 +2618,9 @@ static void shrink_active_list(unsigned long nr_to_scan, } if (unlikely(buffer_heads_over_limit)) { - if (folio_test_private(folio) && folio_trylock(folio)) { - if (folio_test_private(folio)) - filemap_release_folio(folio, 0); + if (folio_needs_release(folio) && + folio_trylock(folio)) { + filemap_release_folio(folio, 0); folio_unlock(folio); } } From d0eafc763135508be118dac208887a26c0adb74d Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 28 Jun 2023 11:48:52 +0100 Subject: [PATCH 102/151] mm, netfs, fscache: stop read optimisation when folio removed from pagecache [ Upstream commit b4fa966f03b7401ceacd4ffd7227197afb2b8376 ] Fscache has an optimisation by which reads from the cache are skipped until we know that (a) there's data there to be read and (b) that data isn't entirely covered by pages resident in the netfs pagecache. This is done with two flags manipulated by fscache_note_page_release(): if (... test_bit(FSCACHE_COOKIE_HAVE_DATA, &cookie->flags) && test_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &cookie->flags)) clear_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &cookie->flags); where the NO_DATA_TO_READ flag causes cachefiles_prepare_read() to indicate that netfslib should download from the server or clear the page instead. The fscache_note_page_release() function is intended to be called from ->releasepage() - but that only gets called if PG_private or PG_private_2 is set - and currently the former is at the discretion of the network filesystem and the latter is only set whilst a page is being written to the cache, so sometimes we miss clearing the optimisation. Fix this by following Willy's suggestion[1] and adding an address_space flag, AS_RELEASE_ALWAYS, that causes filemap_release_folio() to always call ->release_folio() if it's set, even if PG_private or PG_private_2 aren't set. Note that this would require folio_test_private() and page_has_private() to become more complicated. To avoid that, in the places[*] where these are used to conditionalise calls to filemap_release_folio() and try_to_release_page(), the tests are removed, those functions are simply called unconditionally, and the test is performed there. [*] There are some exceptions in vmscan.c where the check guards more than just a call to the releaser. I've added a function, folio_needs_release(), to wrap all the checks for that. AS_RELEASE_ALWAYS should be set if a non-NULL cookie is obtained from fscache and cleared in ->evict_inode() before truncate_inode_pages_final() is called. Additionally, the FSCACHE_COOKIE_NO_DATA_TO_READ flag needs to be cleared and the optimisation cancelled if a cachefiles object already contains data when we open it.
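Each affected network filesystem adopts the same pattern when it attaches a cache cookie; a representative sketch (field names follow the afs hunk below):

        /* With a cookie attached, folios in this mapping may carry cache
         * state without PG_private/PG_private_2, so force
         * filemap_release_folio() to always consult ->release_folio().
         */
        vnode->netfs.cache = cookie;
        if (cookie)
                mapping_set_release_always(vnode->netfs.inode.i_mapping);
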
[dwysocha@redhat.com: call folio_mapping() inside folio_needs_release()] Link: https://github.com/DaveWysochanskiRH/kernel/commit/902c990e311120179fa5de99d68364b2947b79ec Link: https://lkml.kernel.org/r/20230628104852.3391651-3-dhowells@redhat.com Fixes: 1f67e6d0b188 ("fscache: Provide a function to note the release of a page") Fixes: 047487c947e8 ("cachefiles: Implement the I/O routines") Signed-off-by: David Howells Signed-off-by: Dave Wysochanski Reported-by: Rohith Surabattula Suggested-by: Matthew Wilcox Tested-by: SeongJae Park Cc: Daire Byrne Cc: Matthew Wilcox Cc: Linus Torvalds Cc: Steve French Cc: Shyam Prasad N Cc: Rohith Surabattula Cc: Dave Wysochanski Cc: Dominique Martinet Cc: Ilya Dryomov Cc: Andreas Dilger Cc: Jingbo Xu Cc: "Theodore Ts'o" Cc: Xiubo Li Signed-off-by: Andrew Morton Stable-dep-of: 1898efcdbed3 ("block: update the stable_writes flag in bdev_add") Signed-off-by: Sasha Levin --- fs/9p/cache.c | 2 ++ fs/afs/internal.h | 2 ++ fs/cachefiles/namei.c | 2 ++ fs/ceph/cache.c | 2 ++ fs/nfs/fscache.c | 3 +++ fs/smb/client/fscache.c | 2 ++ include/linux/pagemap.h | 16 ++++++++++++++++ mm/internal.h | 5 ++++- 8 files changed, 33 insertions(+), 1 deletion(-) diff --git a/fs/9p/cache.c b/fs/9p/cache.c index cebba4eaa0b5..12c0ae29f185 100644 --- a/fs/9p/cache.c +++ b/fs/9p/cache.c @@ -68,6 +68,8 @@ void v9fs_cache_inode_get_cookie(struct inode *inode) &path, sizeof(path), &version, sizeof(version), i_size_read(&v9inode->netfs.inode)); + if (v9inode->netfs.cache) + mapping_set_release_always(inode->i_mapping); p9_debug(P9_DEBUG_FSC, "inode %p get cookie %p\n", inode, v9fs_inode_cookie(v9inode)); diff --git a/fs/afs/internal.h b/fs/afs/internal.h index fcbb598d8c85..a25fdc3e5231 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -682,6 +682,8 @@ static inline void afs_vnode_set_cache(struct afs_vnode *vnode, { #ifdef CONFIG_AFS_FSCACHE vnode->netfs.cache = cookie; + if (cookie) + mapping_set_release_always(vnode->netfs.inode.i_mapping); #endif } diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 03ca8f2f657a..50b2ee163af6 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -584,6 +584,8 @@ static bool cachefiles_open_file(struct cachefiles_object *object, if (ret < 0) goto check_failed; + clear_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &object->cookie->flags); + object->file = file; /* Always update the atime on an object we've just looked up (this is diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index 177d8e8d73fe..de1dee46d3df 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -36,6 +36,8 @@ void ceph_fscache_register_inode_cookie(struct inode *inode) &ci->i_vino, sizeof(ci->i_vino), &ci->i_version, sizeof(ci->i_version), i_size_read(inode)); + if (ci->netfs.cache) + mapping_set_release_always(inode->i_mapping); } void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info *ci) diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index e731c00a9fcb..d3c938dd2b12 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -176,6 +176,9 @@ void nfs_fscache_init_inode(struct inode *inode) &auxdata, /* aux_data */ sizeof(auxdata), i_size_read(inode)); + + if (netfs_inode(inode)->cache) + mapping_set_release_always(inode->i_mapping); } /* diff --git a/fs/smb/client/fscache.c b/fs/smb/client/fscache.c index e73625b5d0cc..f64bad513ba6 100644 --- a/fs/smb/client/fscache.c +++ b/fs/smb/client/fscache.c @@ -108,6 +108,8 @@ void cifs_fscache_get_inode_cookie(struct inode *inode) &cifsi->uniqueid, sizeof(cifsi->uniqueid), &cd, sizeof(cd), 
i_size_read(&cifsi->netfs.inode)); + if (cifsi->netfs.cache) + mapping_set_release_always(inode->i_mapping); } void cifs_fscache_unuse_inode_cookie(struct inode *inode, bool update) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 03307b72de6c..fdbb90ae56c7 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -199,6 +199,7 @@ enum mapping_flags { /* writeback related tags are not used */ AS_NO_WRITEBACK_TAGS = 5, AS_LARGE_FOLIO_SUPPORT = 6, + AS_RELEASE_ALWAYS, /* Call ->release_folio(), even if no private data */ }; /** @@ -269,6 +270,21 @@ static inline int mapping_use_writeback_tags(struct address_space *mapping) return !test_bit(AS_NO_WRITEBACK_TAGS, &mapping->flags); } +static inline bool mapping_release_always(const struct address_space *mapping) +{ + return test_bit(AS_RELEASE_ALWAYS, &mapping->flags); +} + +static inline void mapping_set_release_always(struct address_space *mapping) +{ + set_bit(AS_RELEASE_ALWAYS, &mapping->flags); +} + +static inline void mapping_clear_release_always(struct address_space *mapping) +{ + clear_bit(AS_RELEASE_ALWAYS, &mapping->flags); +} + static inline gfp_t mapping_gfp_mask(struct address_space * mapping) { return mapping->gfp_mask; } diff --git a/mm/internal.h b/mm/internal.h index 1fefb5181ab7..d01130efce5f 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -168,7 +168,10 @@ static inline void set_page_refcounted(struct page *page) */ static inline bool folio_needs_release(struct folio *folio) { - return folio_has_private(folio); + struct address_space *mapping = folio_mapping(folio); + + return folio_has_private(folio) || + (mapping && mapping_release_always(mapping)); } extern unsigned long highest_memmap_pfn; From a8e4300ae58dae7f181d7daa9034f127a33f217a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 25 Oct 2023 16:10:17 +0200 Subject: [PATCH 103/151] filemap: add a per-mapping stable writes flag [ Upstream commit 762321dab9a72760bf9aec48362f932717c9424d ] folio_wait_stable waits for writeback to finish before modifying the contents of a folio again, e.g. to support checksumming of the data in the block integrity code. Currently this behavior is controlled by the SB_I_STABLE_WRITES flag on the super_block, which means it is uniform for the entire file system. This is wrong for the block device pseudofs, which is shared by all block devices, or file systems that can use multiple devices like XFS with the RT subvolume or btrfs (although btrfs currently reimplements folio_wait_stable anyway). Add a per-address_space AS_STABLE_WRITES flag to control the behavior in a more fine-grained way. The existing SB_I_STABLE_WRITES is kept to initialize AS_STABLE_WRITES to the existing default which covers most cases. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20231025141020.192413-2-hch@lst.de Tested-by: Ilya Dryomov Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: Darrick J.
Wong Signed-off-by: Christian Brauner Stable-dep-of: 1898efcdbed3 ("block: update the stable_writes flag in bdev_add") Signed-off-by: Sasha Levin --- fs/inode.c | 2 ++ include/linux/pagemap.h | 17 +++++++++++++++++ mm/page-writeback.c | 2 +- 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/fs/inode.c b/fs/inode.c index 73ad1b0d4775..8cfda7a6d590 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -215,6 +215,8 @@ int inode_init_always(struct super_block *sb, struct inode *inode) lockdep_set_class_and_name(&mapping->invalidate_lock, &sb->s_type->invalidate_lock_key, "mapping.invalidate_lock"); + if (sb->s_iflags & SB_I_STABLE_WRITES) + mapping_set_stable_writes(mapping); inode->i_private = NULL; inode->i_mapping = mapping; INIT_HLIST_HEAD(&inode->i_dentry); /* buggered by rcu freeing */ diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index fdbb90ae56c7..1be5a1fa6a3a 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -200,6 +200,8 @@ enum mapping_flags { AS_NO_WRITEBACK_TAGS = 5, AS_LARGE_FOLIO_SUPPORT = 6, AS_RELEASE_ALWAYS, /* Call ->release_folio(), even if no private data */ + AS_STABLE_WRITES, /* must wait for writeback before modifying + folio contents */ }; /** @@ -285,6 +287,21 @@ static inline void mapping_clear_release_always(struct address_space *mapping) clear_bit(AS_RELEASE_ALWAYS, &mapping->flags); } +static inline bool mapping_stable_writes(const struct address_space *mapping) +{ + return test_bit(AS_STABLE_WRITES, &mapping->flags); +} + +static inline void mapping_set_stable_writes(struct address_space *mapping) +{ + set_bit(AS_STABLE_WRITES, &mapping->flags); +} + +static inline void mapping_clear_stable_writes(struct address_space *mapping) +{ + clear_bit(AS_STABLE_WRITES, &mapping->flags); +} + static inline gfp_t mapping_gfp_mask(struct address_space * mapping) { return mapping->gfp_mask; } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 7e9d8d857ecc..de5f69921b94 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -3078,7 +3078,7 @@ EXPORT_SYMBOL_GPL(folio_wait_writeback_killable); */ void folio_wait_stable(struct folio *folio) { - if (folio_inode(folio)->i_sb->s_iflags & SB_I_STABLE_WRITES) + if (mapping_stable_writes(folio_mapping(folio))) folio_wait_writeback(folio); } EXPORT_SYMBOL_GPL(folio_wait_stable); From bf223fd4d914f8d7877b5f13a03477cc7542cc25 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 25 Oct 2023 16:10:18 +0200 Subject: [PATCH 104/151] block: update the stable_writes flag in bdev_add [ Upstream commit 1898efcdbed32bb1c67269c985a50bab0dbc9493 ] Propagate the per-queue stable_writes flag into each bdev inode in bdev_add. This makes sure devices that require stable writes have it set for I/O on the block device node as well. Note that this doesn't cover the case of a flag changing on a live device yet. We should handle that as well, but I plan to cover it as part of a more general rework of how changing runtime parameters on block devices works. Fixes: 1cb039f3dc16 ("bdi: replace BDI_CAP_STABLE_WRITES with a queue and a sb flag") Reported-by: Ilya Dryomov Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20231025141020.192413-3-hch@lst.de Tested-by: Ilya Dryomov Reviewed-by: Darrick J.
Wong Signed-off-by: Christian Brauner Signed-off-by: Sasha Levin --- block/bdev.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/block/bdev.c b/block/bdev.c index d699ecdb3260..b61502ec8da0 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -507,6 +507,8 @@ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno) void bdev_add(struct block_device *bdev, dev_t dev) { + if (bdev_stable_writes(bdev)) + mapping_set_stable_writes(bdev->bd_inode->i_mapping); bdev->bd_dev = dev; bdev->bd_inode->i_rdev = dev; bdev->bd_inode->i_ino = dev; From e88275ce7e7ba641d9c1c5df8ffe344e698507fd Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Sat, 25 Nov 2023 23:55:10 -0300 Subject: [PATCH 105/151] smb: client: fix missing mode bits for SMB symlinks [ Upstream commit ef22bb800d967616c7638d204bc1b425beac7f5f ] When instantiating inodes for SMB symlinks, add the mode bits from @cifs_sb->ctx->file_mode as we already do for the other special files. Cc: stable@vger.kernel.org Signed-off-by: Paulo Alcantara (SUSE) Signed-off-by: Steve French Signed-off-by: Sasha Levin --- fs/smb/client/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index 7be51f9d2fa1..5343898bac8a 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -264,7 +264,7 @@ cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, FILE_UNIX_BASIC_INFO *info, fattr->cf_dtype = DT_REG; break; case UNIX_SYMLINK: - fattr->cf_mode |= S_IFLNK; + fattr->cf_mode |= S_IFLNK | cifs_sb->ctx->file_mode; fattr->cf_dtype = DT_LNK; break; case UNIX_DIR: From 5b8938fc7d00ad4a89251a1cef96d9f0437540b0 Mon Sep 17 00:00:00 2001 From: Ioana Ciornei Date: Tue, 18 Oct 2022 17:18:51 +0300 Subject: [PATCH 106/151] net: dpaa2-eth: rearrange variable in dpaa2_eth_get_ethtool_stats [ Upstream commit 3313206827678f6f036eca601a51f6c4524b559a ] Rearrange the variables in the dpaa2_eth_get_ethtool_stats() function so that we adhere to the reverse Christmas tree rule. Also, in the next patch we are adding more variables and I didn't know where to place them with the current ordering. Signed-off-by: Ioana Ciornei Signed-off-by: David S. 
Miller Stable-dep-of: beb1930f966d ("dpaa2-eth: recycle the RX buffer only after all processing done") Signed-off-by: Sasha Levin --- .../ethernet/freescale/dpaa2/dpaa2-ethtool.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-ethtool.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-ethtool.c index eea7d7a07c00..59888826469b 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-ethtool.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-ethtool.c @@ -227,17 +227,8 @@ static void dpaa2_eth_get_ethtool_stats(struct net_device *net_dev, struct ethtool_stats *stats, u64 *data) { - int i = 0; - int j, k, err; - int num_cnt; - union dpni_statistics dpni_stats; - u32 fcnt, bcnt; - u32 fcnt_rx_total = 0, fcnt_tx_total = 0; - u32 bcnt_rx_total = 0, bcnt_tx_total = 0; - u32 buf_cnt; struct dpaa2_eth_priv *priv = netdev_priv(net_dev); - struct dpaa2_eth_drv_stats *extras; - struct dpaa2_eth_ch_stats *ch_stats; + union dpni_statistics dpni_stats; int dpni_stats_page_size[DPNI_STATISTICS_CNT] = { sizeof(dpni_stats.page_0), sizeof(dpni_stats.page_1), @@ -247,6 +238,13 @@ static void dpaa2_eth_get_ethtool_stats(struct net_device *net_dev, sizeof(dpni_stats.page_5), sizeof(dpni_stats.page_6), }; + u32 fcnt_rx_total = 0, fcnt_tx_total = 0; + u32 bcnt_rx_total = 0, bcnt_tx_total = 0; + struct dpaa2_eth_ch_stats *ch_stats; + struct dpaa2_eth_drv_stats *extras; + int j, k, err, num_cnt, i = 0; + u32 fcnt, bcnt; + u32 buf_cnt; memset(data, 0, sizeof(u64) * (DPAA2_ETH_NUM_STATS + DPAA2_ETH_NUM_EXTRA_STATS)); From e570b15087532919688979b6ead02bb5b890b082 Mon Sep 17 00:00:00 2001 From: Ioana Ciornei Date: Fri, 24 Nov 2023 12:28:05 +0200 Subject: [PATCH 107/151] dpaa2-eth: recycle the RX buffer only after all processing done [ Upstream commit beb1930f966d1517921488bd5d64147f58f79abf ] The blamed commit added support for Rx copybreak. This meant that for certain frame sizes, a new skb was allocated and the initial data buffer was recycled. Instead of waiting to recycle the Rx buffer only after all processing was done on it (like accessing the parse results or timestamp information), the code path just went ahead and re-used the buffer right away. This sometimes led to corrupted HW and SW annotation areas. Fix this by delaying the moment when the buffer is recycled.
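The shape of the fix, reduced to a minimal sketch (illustrative C only, not the dpaa2 driver itself; the recycle_rx_buf flag mirrors the one added in the diff below, while frame_copied_to_new_skb, consume_metadata() and recycle_buf() are hypothetical placeholders): defer releasing the buffer until everything that lives inside it has been read, and track the pending release with a local flag.

	bool recycle_rx_buf = false;

	if (frame_copied_to_new_skb)		/* copybreak path: buffer no longer backs the skb */
		recycle_rx_buf = true;

	consume_metadata(buf);			/* parse results/timestamps still live in the buffer */

	if (recycle_rx_buf)
		recycle_buf(buf);		/* safe only now: nothing points into it anymore */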
Fixes: 50f826999a80 ("dpaa2-eth: add rx copybreak support") Signed-off-by: Ioana Ciornei Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c index b58162ce81d8..de62eee58a00 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c @@ -509,8 +509,6 @@ static struct sk_buff *dpaa2_eth_copybreak(struct dpaa2_eth_channel *ch, memcpy(skb->data, fd_vaddr + fd_offset, fd_length); - dpaa2_eth_recycle_buf(priv, ch, dpaa2_fd_get_addr(fd)); - return skb; } @@ -528,6 +526,7 @@ static void dpaa2_eth_rx(struct dpaa2_eth_priv *priv, struct dpaa2_eth_drv_stats *percpu_extras; struct device *dev = priv->net_dev->dev.parent; struct dpaa2_fas *fas; + bool recycle_rx_buf = false; void *buf_data; u32 status = 0; u32 xdp_act; @@ -560,6 +559,8 @@ static void dpaa2_eth_rx(struct dpaa2_eth_priv *priv, dma_unmap_page(dev, addr, priv->rx_buf_size, DMA_BIDIRECTIONAL); skb = dpaa2_eth_build_linear_skb(ch, fd, vaddr); + } else { + recycle_rx_buf = true; } } else if (fd_format == dpaa2_fd_sg) { WARN_ON(priv->xdp_prog); @@ -607,6 +608,8 @@ static void dpaa2_eth_rx(struct dpaa2_eth_priv *priv, list_add_tail(&skb->list, ch->rx_list); + if (recycle_rx_buf) + dpaa2_eth_recycle_buf(priv, ch, dpaa2_fd_get_addr(fd)); return; err_build_skb: From 5ff1682fec185fe7537370370f202abc454b6b3a Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sun, 26 Nov 2023 14:58:06 -0800 Subject: [PATCH 108/151] ethtool: don't propagate EOPNOTSUPP from dumps [ Upstream commit cbeb989e41f4094f54bec2cecce993f26f547bea ] The default dump handler needs to clear ret before returning. Otherwise if the last interface returns an inconsequential error this error will propagate to user space. This may confuse user space (ethtool CLI seems to ignore it, but YNL doesn't). It will also terminate the dump early for multi-skb dumps, because netlink core treats EOPNOTSUPP as a real error. Fixes: 728480f12442 ("ethtool: default handlers for GET requests") Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20231126225806.2143528-1-kuba@kernel.org Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- net/ethtool/netlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index 1a4c11356c96..fc4ccecf9495 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -509,7 +509,7 @@ lock_and_cont: cont: idx++; } - + ret = 0; } rtnl_unlock(); From 90d1f74c3cf68e6a987c370a50d30a66ef39f5c2 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Tue, 28 Nov 2023 17:25:56 -0800 Subject: [PATCH 109/151] bpf, sockmap: af_unix stream sockets need to hold ref for pair sock [ Upstream commit 8866730aed5100f06d3d965c22f1c61f74942541 ] AF_UNIX stream sockets are paired sockets. So sending on one of the pairs will look up the paired socket as part of the send operation. It is possible, however, to put just one of the pairs in a BPF map. This currently increments the refcnt on the sock in the sockmap to ensure it is not free'd by the stack before sockmap cleans up its state and stops any skbs being sent/recv'd to that socket. But we missed a case. If the peer socket is closed it will be free'd by the stack. However, the paired socket can still be referenced from BPF sockmap side because we hold a reference there.
Then if we are sending traffic through BPF sockmap to that socket it will try to dereference the free'd pair in its send logic, creating a use after free. And the following splat: [59.900375] BUG: KASAN: slab-use-after-free in sk_wake_async+0x31/0x1b0 [59.901211] Read of size 8 at addr ffff88811acbf060 by task kworker/1:2/954 [...] [59.905468] Call Trace: [59.905787] [59.906066] dump_stack_lvl+0x130/0x1d0 [59.908877] print_report+0x16f/0x740 [59.910629] kasan_report+0x118/0x160 [59.912576] sk_wake_async+0x31/0x1b0 [59.913554] sock_def_readable+0x156/0x2a0 [59.914060] unix_stream_sendmsg+0x3f9/0x12a0 [59.916398] sock_sendmsg+0x20e/0x250 [59.916854] skb_send_sock+0x236/0xac0 [59.920527] sk_psock_backlog+0x287/0xaa0 To fix this, let BPF sockmap hold a refcnt on both the socket in the sockmap and its paired socket. It wasn't obvious how to contain the fix to bpf_unix logic. The primary problem with keeping this logic in bpf_unix was: In the sock close() we could handle the deref by having a close handler. But, when we are destroying the psock through a map delete operation we wouldn't have gotten any signal through the proto struct other than it being replaced. If we do the deref from the proto replace it's too early because we need to deref the sk_pair after the backlog worker has been stopped. Given all this it seems best to just cache it at the end of the psock and eat 8B for the af_unix and vsock users. Notice dgram sockets are OK because they handle locking already. Fixes: 94531cfcbe79 ("af_unix: Add unix_stream_proto for sockmap") Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20231129012557.95371-2-john.fastabend@gmail.com Signed-off-by: Sasha Levin --- include/linux/skmsg.h | 1 + include/net/af_unix.h | 1 + net/core/skmsg.c | 2 ++ net/unix/af_unix.c | 2 -- net/unix/unix_bpf.c | 5 +++++ 5 files changed, 9 insertions(+), 2 deletions(-) diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index c1637515a8a4..c953b8c0d2f4 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -106,6 +106,7 @@ struct sk_psock { struct mutex work_mutex; struct sk_psock_work_state work_state; struct delayed_work work; + struct sock *sk_pair; struct rcu_work rwork; }; diff --git a/include/net/af_unix.h b/include/net/af_unix.h index 480fa579787e..55ca217c626b 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -77,6 +77,7 @@ static inline struct unix_sock *unix_sk(const struct sock *sk) { return (struct unix_sock *)sk; } +#define unix_peer(sk) (unix_sk(sk)->peer) #define peer_wait peer_wq.wait diff --git a/net/core/skmsg.c b/net/core/skmsg.c index a5c1f67dc96e..3818035ea002 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -825,6 +825,8 @@ static void sk_psock_destroy(struct work_struct *work) if (psock->sk_redir) sock_put(psock->sk_redir); + if (psock->sk_pair) + sock_put(psock->sk_pair); sock_put(psock->sk); kfree(psock); } diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 6dbeb8007333..be2ed7b0fe21 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -211,8 +211,6 @@ static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb) } #endif /* CONFIG_SECURITY_NETWORK */ -#define unix_peer(sk) (unix_sk(sk)->peer) - static inline int unix_our_peer(struct sock *sk, struct sock *osk) { return unix_peer(osk) == sk; diff --git a/net/unix/unix_bpf.c b/net/unix/unix_bpf.c index 2f9d8271c6ec..7ea7c3a0d0d0 100644 --- a/net/unix/unix_bpf.c +++ b/net/unix/unix_bpf.c @@ -159,12 +159,17 @@
int unix_dgram_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool re int unix_stream_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) { + struct sock *sk_pair; + if (restore) { sk->sk_write_space = psock->saved_write_space; sock_replace_proto(sk, psock->sk_proto); return 0; } + sk_pair = unix_peer(sk); + sock_hold(sk_pair); + psock->sk_pair = sk_pair; unix_stream_bpf_check_needs_rebuild(psock->sk_proto); sock_replace_proto(sk, &unix_stream_bpf_prot); return 0; From aee609302d65ae443c73972d3882a1ca8f830a36 Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Thu, 30 Nov 2023 20:43:42 +0000 Subject: [PATCH 110/151] firmware: arm_scmi: Fix frequency truncation by promoting multiplier type [ Upstream commit 8e3c98d9187e09274fc000a7d1a77b070a42d259 ] Fix the possible frequency truncation for all values equal to or greater than 4GHz on 64-bit machines by updating the multiplier 'mult_factor' to 'unsigned long' type. It is also possible that the multiplier itself can be greater than or equal to 2^32. So we need to also fix the equation computing the value of the multiplier. For example, sustained_freq_khz = 5000000 (5 GHz) makes the 32-bit product 5000000 * 1000 wrap from 5000000000 to 705032704 before the division is even applied. Fixes: a9e3fbfaa0ff ("firmware: arm_scmi: add initial support for performance protocol") Reported-by: Sibi Sankar Closes: https://lore.kernel.org/all/20231129065748.19871-3-quic_sibis@quicinc.com/ Cc: Cristian Marussi Link: https://lore.kernel.org/r/20231130204343.503076-1-sudeep.holla@arm.com Signed-off-by: Sudeep Holla Signed-off-by: Sasha Levin --- drivers/firmware/arm_scmi/perf.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/firmware/arm_scmi/perf.c b/drivers/firmware/arm_scmi/perf.c index 431bda9165c3..2775bcafe40f 100644 --- a/drivers/firmware/arm_scmi/perf.c +++ b/drivers/firmware/arm_scmi/perf.c @@ -131,7 +131,7 @@ struct perf_dom_info { u32 opp_count; u32 sustained_freq_khz; u32 sustained_perf_level; - u32 mult_factor; + unsigned long mult_factor; char name[SCMI_MAX_STR_SIZE]; struct scmi_opp opp[MAX_OPPS]; struct scmi_fc_info *fc_info; @@ -223,8 +223,8 @@ scmi_perf_domain_attributes_get(const struct scmi_protocol_handle *ph, dom_info->mult_factor = 1000; else dom_info->mult_factor = - (dom_info->sustained_freq_khz * 1000) / - dom_info->sustained_perf_level; + (dom_info->sustained_freq_khz * 1000UL) + / dom_info->sustained_perf_level; strscpy(dom_info->name, attr->name, SCMI_SHORT_NAME_MAX_SIZE); } From f4fe76467e7bd0e628518750b170763eb25dc203 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Thu, 7 Dec 2023 19:20:35 +0100 Subject: [PATCH 111/151] ALSA: hda/realtek: Add quirk for Lenovo Yoga Pro 7 [ Upstream commit 634e5e1e06f5cdd614a1bc429ecb243a51cc009d ] Lenovo Yoga Pro 7 14APH8 (PCI SSID 17aa:3882) seems to require a workaround for the bass speaker similar to the one for the Yoga 9 model.
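For reference, a quirk of this kind is a single match-table entry keyed on the PCI subsystem vendor/device ID. A trimmed illustration of the table shape (the fixup constant is the one used in the diff below; the rest of the real table is omitted):

	static const struct snd_pci_quirk alc269_fixup_tbl[] = {
		/* 0x17aa = Lenovo, 0x3882 = Yoga Pro 7 14APH8 subsystem ID */
		SND_PCI_QUIRK(0x17aa, 0x3882, "Lenovo Yoga Pro 7 14APH8",
			      ALC287_FIXUP_YOGA9_14IAP7_BASS_SPK_PIN),
		{ } /* terminator */
	};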
Cc: Link: https://lore.kernel.org/r/CAGGk=CRRQ1L9p771HsXTN_ebZP41Qj+3gw35Gezurn+nokRewg@mail.gmail.com Link: https://lore.kernel.org/r/20231207182035.30248-1-tiwai@suse.de Signed-off-by: Takashi Iwai Signed-off-by: Sasha Levin --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 919f3e391de1..1c8ffc5cf97f 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -9907,6 +9907,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1558, 0xc019, "Clevo NH77D[BE]Q", ALC293_FIXUP_SYSTEM76_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1558, 0xc022, "Clevo NH77[DC][QW]", ALC293_FIXUP_SYSTEM76_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x17aa, 0x1036, "Lenovo P520", ALC233_FIXUP_LENOVO_MULTI_CODECS), + SND_PCI_QUIRK(0x17aa, 0x3882, "Lenovo Yoga Pro 7 14APH8", ALC287_FIXUP_YOGA9_14IAP7_BASS_SPK_PIN), SND_PCI_QUIRK(0x17aa, 0x1048, "ThinkCentre Station", ALC623_FIXUP_LENOVO_THINKSTATION_P340), SND_PCI_QUIRK(0x17aa, 0x20f2, "Thinkpad SL410/510", ALC269_FIXUP_SKU_IGNORE), SND_PCI_QUIRK(0x17aa, 0x215e, "Thinkpad L512", ALC269_FIXUP_SKU_IGNORE), From a1dcd1794730bae933b77f81a4935cfad06f58c9 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 27 Dec 2022 10:29:00 +0800 Subject: [PATCH 112/151] genirq/affinity: Remove the 'firstvec' parameter from irq_build_affinity_masks [ Upstream commit cdf07f0ea48a3b52f924714d477366ac510ee870 ] The 'firstvec' parameter is always the same as 'startvec', so use 'startvec' directly inside irq_build_affinity_masks(). Signed-off-by: Ming Lei Signed-off-by: Thomas Gleixner Reviewed-by: Christoph Hellwig Reviewed-by: John Garry Reviewed-by: Jens Axboe Link: https://lore.kernel.org/r/20221227022905.352674-2-ming.lei@redhat.com Stable-dep-of: 0263f92fadbb ("lib/group_cpus.c: avoid acquiring cpu hotplug lock in group_cpus_evenly") Signed-off-by: Sasha Levin --- kernel/irq/affinity.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index d9a5c1d65a79..3361e36ebaa1 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -337,10 +337,10 @@ static int __irq_build_affinity_masks(unsigned int startvec, * 2) spread other possible CPUs on these vectors */ static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs, - unsigned int firstvec, struct irq_affinity_desc *masks) { unsigned int curvec = startvec, nr_present = 0, nr_others = 0; + unsigned int firstvec = startvec; cpumask_var_t *node_to_cpumask; cpumask_var_t nmsk, npresmsk; int ret = -ENOMEM; @@ -463,8 +463,7 @@ irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd) unsigned int this_vecs = affd->set_size[i]; int ret; - ret = irq_build_affinity_masks(curvec, this_vecs, - curvec, masks); + ret = irq_build_affinity_masks(curvec, this_vecs, masks); if (ret) { kfree(masks); return NULL; From 9e84d7bb15053bd260138b1aae59c038f7692218 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 27 Dec 2022 10:29:01 +0800 Subject: [PATCH 113/151] genirq/affinity: Pass affinity managed mask array to irq_build_affinity_masks [ Upstream commit 1f962d91a15af54301c63febb8ac2ba07aa3654f ] Pass the affinity managed mask array to irq_build_affinity_masks() so that the index of the first affinity managed vector is always zero. This allows simplifying the implementation a bit.
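The underlying idiom is plain pointer arithmetic: handing the callee &masks[curvec] instead of (masks, curvec) lets it index its output from zero. A minimal generic sketch under that assumption (standalone C, not the kernel code; fill_range is a made-up stand-in for irq_build_affinity_masks):

	#include <stddef.h>

	/* The callee writes slots[0..n-1]; the caller's offset travels in the pointer. */
	static void fill_range(int *slots, size_t n)
	{
		for (size_t i = 0; i < n; i++)
			slots[i] = (int)i;
	}

	/* Caller side: fill_range(&masks[curvec], this_vecs) replaces the
	 * older style that passed masks plus a separate start index.
	 */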
Signed-off-by: Ming Lei Signed-off-by: Thomas Gleixner Reviewed-by: Christoph Hellwig Reviewed-by: John Garry Reviewed-by: Jens Axboe Link: https://lore.kernel.org/r/20221227022905.352674-3-ming.lei@redhat.com Stable-dep-of: 0263f92fadbb ("lib/group_cpus.c: avoid acquiring cpu hotplug lock in group_cpus_evenly") Signed-off-by: Sasha Levin --- kernel/irq/affinity.c | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index 3361e36ebaa1..da6379cd27fd 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -246,14 +246,13 @@ static void alloc_nodes_vectors(unsigned int numvecs, static int __irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs, - unsigned int firstvec, cpumask_var_t *node_to_cpumask, const struct cpumask *cpu_mask, struct cpumask *nmsk, struct irq_affinity_desc *masks) { unsigned int i, n, nodes, cpus_per_vec, extra_vecs, done = 0; - unsigned int last_affv = firstvec + numvecs; + unsigned int last_affv = numvecs; unsigned int curvec = startvec; nodemask_t nodemsk = NODE_MASK_NONE; struct node_vectors *node_vectors; @@ -273,7 +272,7 @@ static int __irq_build_affinity_masks(unsigned int startvec, cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]); cpumask_or(&masks[curvec].mask, &masks[curvec].mask, nmsk); if (++curvec == last_affv) - curvec = firstvec; + curvec = 0; } return numvecs; } @@ -321,7 +320,7 @@ static int __irq_build_affinity_masks(unsigned int startvec, * may start anywhere */ if (curvec >= last_affv) - curvec = firstvec; + curvec = 0; irq_spread_init_one(&masks[curvec].mask, nmsk, cpus_per_vec); } @@ -336,11 +335,10 @@ static int __irq_build_affinity_masks(unsigned int startvec, * 1) spread present CPU on these vectors * 2) spread other possible CPUs on these vectors */ -static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs, +static int irq_build_affinity_masks(unsigned int numvecs, struct irq_affinity_desc *masks) { - unsigned int curvec = startvec, nr_present = 0, nr_others = 0; - unsigned int firstvec = startvec; + unsigned int curvec = 0, nr_present = 0, nr_others = 0; cpumask_var_t *node_to_cpumask; cpumask_var_t nmsk, npresmsk; int ret = -ENOMEM; @@ -360,9 +358,8 @@ static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs, build_node_to_cpumask(node_to_cpumask); /* Spread on present CPUs starting from affd->pre_vectors */ - ret = __irq_build_affinity_masks(curvec, numvecs, firstvec, - node_to_cpumask, cpu_present_mask, - nmsk, masks); + ret = __irq_build_affinity_masks(curvec, numvecs, node_to_cpumask, + cpu_present_mask, nmsk, masks); if (ret < 0) goto fail_build_affinity; nr_present = ret; @@ -374,13 +371,12 @@ static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs, * out vectors. 
*/ if (nr_present >= numvecs) - curvec = firstvec; + curvec = 0; else - curvec = firstvec + nr_present; + curvec = nr_present; cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask); - ret = __irq_build_affinity_masks(curvec, numvecs, firstvec, - node_to_cpumask, npresmsk, nmsk, - masks); + ret = __irq_build_affinity_masks(curvec, numvecs, node_to_cpumask, + npresmsk, nmsk, masks); if (ret >= 0) nr_others = ret; @@ -463,7 +459,7 @@ irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd) unsigned int this_vecs = affd->set_size[i]; int ret; - ret = irq_build_affinity_masks(curvec, this_vecs, masks); + ret = irq_build_affinity_masks(this_vecs, &masks[curvec]); if (ret) { kfree(masks); return NULL; From aeeb4e4e49f8118d00cf803581555a2a2905759c Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 27 Dec 2022 10:29:02 +0800 Subject: [PATCH 114/151] genirq/affinity: Don't pass irq_affinity_desc array to irq_build_affinity_masks [ Upstream commit e7bdd7f0cbd1c001bb9b4d3313edc5ee094bc3f8 ] Prepare for abstracting irq_build_affinity_masks() into a public function for assigning all CPUs evenly into several groups. Don't pass irq_affinity_desc array to irq_build_affinity_masks, instead return a cpumask array by storing each assigned group into one element of the array. This allows to provide a generic interface for grouping all CPUs evenly from a NUMA and CPU locality viewpoint, and the cost is one extra allocation in irq_build_affinity_masks(), which should be fine since it is done via GFP_KERNEL and irq_build_affinity_masks() is a slow path anyway. Signed-off-by: Ming Lei Signed-off-by: Thomas Gleixner Reviewed-by: Christoph Hellwig Reviewed-by: John Garry Reviewed-by: Jens Axboe Link: https://lore.kernel.org/r/20221227022905.352674-4-ming.lei@redhat.com Stable-dep-of: 0263f92fadbb ("lib/group_cpus.c: avoid acquiring cpu hotplug lock in group_cpus_evenly") Signed-off-by: Sasha Levin --- kernel/irq/affinity.c | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index da6379cd27fd..00bba1020ecb 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -249,7 +249,7 @@ static int __irq_build_affinity_masks(unsigned int startvec, cpumask_var_t *node_to_cpumask, const struct cpumask *cpu_mask, struct cpumask *nmsk, - struct irq_affinity_desc *masks) + struct cpumask *masks) { unsigned int i, n, nodes, cpus_per_vec, extra_vecs, done = 0; unsigned int last_affv = numvecs; @@ -270,7 +270,7 @@ static int __irq_build_affinity_masks(unsigned int startvec, for_each_node_mask(n, nodemsk) { /* Ensure that only CPUs which are in both masks are set */ cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]); - cpumask_or(&masks[curvec].mask, &masks[curvec].mask, nmsk); + cpumask_or(&masks[curvec], &masks[curvec], nmsk); if (++curvec == last_affv) curvec = 0; } @@ -321,7 +321,7 @@ static int __irq_build_affinity_masks(unsigned int startvec, */ if (curvec >= last_affv) curvec = 0; - irq_spread_init_one(&masks[curvec].mask, nmsk, + irq_spread_init_one(&masks[curvec], nmsk, cpus_per_vec); } done += nv->nvectors; @@ -335,16 +335,16 @@ static int __irq_build_affinity_masks(unsigned int startvec, * 1) spread present CPU on these vectors * 2) spread other possible CPUs on these vectors */ -static int irq_build_affinity_masks(unsigned int numvecs, - struct irq_affinity_desc *masks) +static struct cpumask *irq_build_affinity_masks(unsigned int numvecs) { unsigned int curvec = 0, nr_present = 0, nr_others = 0; 
cpumask_var_t *node_to_cpumask; cpumask_var_t nmsk, npresmsk; int ret = -ENOMEM; + struct cpumask *masks = NULL; if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL)) - return ret; + return NULL; if (!zalloc_cpumask_var(&npresmsk, GFP_KERNEL)) goto fail_nmsk; @@ -353,6 +353,10 @@ static int irq_build_affinity_masks(unsigned int numvecs, if (!node_to_cpumask) goto fail_npresmsk; + masks = kcalloc(numvecs, sizeof(*masks), GFP_KERNEL); + if (!masks) + goto fail_node_to_cpumask; + /* Stabilize the cpumasks */ cpus_read_lock(); build_node_to_cpumask(node_to_cpumask); @@ -386,6 +390,7 @@ static int irq_build_affinity_masks(unsigned int numvecs, if (ret >= 0) WARN_ON(nr_present + nr_others < numvecs); + fail_node_to_cpumask: free_node_to_cpumask(node_to_cpumask); fail_npresmsk: @@ -393,7 +398,11 @@ static int irq_build_affinity_masks(unsigned int numvecs, fail_nmsk: free_cpumask_var(nmsk); - return ret < 0 ? ret : 0; + if (ret < 0) { + kfree(masks); + return NULL; + } + return masks; } static void default_calc_sets(struct irq_affinity *affd, unsigned int affvecs) @@ -457,13 +466,18 @@ irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd) */ for (i = 0, usedvecs = 0; i < affd->nr_sets; i++) { unsigned int this_vecs = affd->set_size[i]; - int ret; + int j; + struct cpumask *result = irq_build_affinity_masks(this_vecs); - ret = irq_build_affinity_masks(this_vecs, &masks[curvec]); - if (ret) { + if (!result) { kfree(masks); return NULL; } + + for (j = 0; j < this_vecs; j++) + cpumask_copy(&masks[curvec + j].mask, &result[j]); + kfree(result); + curvec += this_vecs; usedvecs += this_vecs; } From 617ba3735d3b49c9b2db1e631b8bc008fdc83a5d Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 27 Dec 2022 10:29:03 +0800 Subject: [PATCH 115/151] genirq/affinity: Rename irq_build_affinity_masks as group_cpus_evenly [ Upstream commit 523f1ea76aad9025f9bd5258d77f4406fa9dbe5d ] Map irq vector into group, which allows to abstract the algorithm for a generic use case outside of the interrupt core. Rename irq_build_affinity_masks as group_cpus_evenly, so the API can be reused for blk-mq to make default queue mapping even though irq vectors aren't involved. No functional change, just rename vector as group. 
Signed-off-by: Ming Lei Signed-off-by: Thomas Gleixner Reviewed-by: Christoph Hellwig Reviewed-by: Jens Axboe Link: https://lore.kernel.org/r/20221227022905.352674-5-ming.lei@redhat.com Stable-dep-of: 0263f92fadbb ("lib/group_cpus.c: avoid acquiring cpu hotplug lock in group_cpus_evenly") Signed-off-by: Sasha Levin --- kernel/irq/affinity.c | 242 +++++++++++++++++++++--------------------- 1 file changed, 121 insertions(+), 121 deletions(-) diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index 00bba1020ecb..54083331f1bc 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -9,13 +9,13 @@ #include #include -static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk, - unsigned int cpus_per_vec) +static void grp_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk, + unsigned int cpus_per_grp) { const struct cpumask *siblmsk; int cpu, sibl; - for ( ; cpus_per_vec > 0; ) { + for ( ; cpus_per_grp > 0; ) { cpu = cpumask_first(nmsk); /* Should not happen, but I'm too lazy to think about it */ @@ -24,18 +24,18 @@ static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk, cpumask_clear_cpu(cpu, nmsk); cpumask_set_cpu(cpu, irqmsk); - cpus_per_vec--; + cpus_per_grp--; /* If the cpu has siblings, use them first */ siblmsk = topology_sibling_cpumask(cpu); - for (sibl = -1; cpus_per_vec > 0; ) { + for (sibl = -1; cpus_per_grp > 0; ) { sibl = cpumask_next(sibl, siblmsk); if (sibl >= nr_cpu_ids) break; if (!cpumask_test_and_clear_cpu(sibl, nmsk)) continue; cpumask_set_cpu(sibl, irqmsk); - cpus_per_vec--; + cpus_per_grp--; } } } @@ -95,48 +95,48 @@ static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask, return nodes; } -struct node_vectors { +struct node_groups { unsigned id; union { - unsigned nvectors; + unsigned ngroups; unsigned ncpus; }; }; static int ncpus_cmp_func(const void *l, const void *r) { - const struct node_vectors *ln = l; - const struct node_vectors *rn = r; + const struct node_groups *ln = l; + const struct node_groups *rn = r; return ln->ncpus - rn->ncpus; } /* - * Allocate vector number for each node, so that for each node: + * Allocate group number for each node, so that for each node: * * 1) the allocated number is >= 1 * - * 2) the allocated numbver is <= active CPU number of this node + * 2) the allocated number is <= active CPU number of this node * - * The actual allocated total vectors may be less than @numvecs when - * active total CPU number is less than @numvecs. + * The actual allocated total groups may be less than @numgrps when + * active total CPU number is less than @numgrps. * * Active CPUs means the CPUs in '@cpu_mask AND @node_to_cpumask[]' * for each node. 
*/ -static void alloc_nodes_vectors(unsigned int numvecs, - cpumask_var_t *node_to_cpumask, - const struct cpumask *cpu_mask, - const nodemask_t nodemsk, - struct cpumask *nmsk, - struct node_vectors *node_vectors) +static void alloc_nodes_groups(unsigned int numgrps, + cpumask_var_t *node_to_cpumask, + const struct cpumask *cpu_mask, + const nodemask_t nodemsk, + struct cpumask *nmsk, + struct node_groups *node_groups) { unsigned n, remaining_ncpus = 0; for (n = 0; n < nr_node_ids; n++) { - node_vectors[n].id = n; - node_vectors[n].ncpus = UINT_MAX; + node_groups[n].id = n; + node_groups[n].ncpus = UINT_MAX; } for_each_node_mask(n, nodemsk) { @@ -148,61 +148,61 @@ static void alloc_nodes_vectors(unsigned int numvecs, if (!ncpus) continue; remaining_ncpus += ncpus; - node_vectors[n].ncpus = ncpus; + node_groups[n].ncpus = ncpus; } - numvecs = min_t(unsigned, remaining_ncpus, numvecs); + numgrps = min_t(unsigned, remaining_ncpus, numgrps); - sort(node_vectors, nr_node_ids, sizeof(node_vectors[0]), + sort(node_groups, nr_node_ids, sizeof(node_groups[0]), ncpus_cmp_func, NULL); /* - * Allocate vectors for each node according to the ratio of this - * node's nr_cpus to remaining un-assigned ncpus. 'numvecs' is + * Allocate groups for each node according to the ratio of this + * node's nr_cpus to remaining un-assigned ncpus. 'numgrps' is * bigger than number of active numa nodes. Always start the * allocation from the node with minimized nr_cpus. * * This way guarantees that each active node gets allocated at - * least one vector, and the theory is simple: over-allocation - * is only done when this node is assigned by one vector, so - * other nodes will be allocated >= 1 vector, since 'numvecs' is + * least one group, and the theory is simple: over-allocation + * is only done when this node is assigned by one group, so + * other nodes will be allocated >= 1 groups, since 'numgrps' is * bigger than number of numa nodes. 
* - * One perfect invariant is that number of allocated vectors for + * One perfect invariant is that number of allocated groups for * each node is <= CPU count of this node: * * 1) suppose there are two nodes: A and B * ncpu(X) is CPU count of node X - * vecs(X) is the vector count allocated to node X via this + * grps(X) is the group count allocated to node X via this * algorithm * * ncpu(A) <= ncpu(B) * ncpu(A) + ncpu(B) = N - * vecs(A) + vecs(B) = V + * grps(A) + grps(B) = G * - * vecs(A) = max(1, round_down(V * ncpu(A) / N)) - * vecs(B) = V - vecs(A) + * grps(A) = max(1, round_down(G * ncpu(A) / N)) + * grps(B) = G - grps(A) * - * both N and V are integer, and 2 <= V <= N, suppose - * V = N - delta, and 0 <= delta <= N - 2 + * both N and G are integer, and 2 <= G <= N, suppose + * G = N - delta, and 0 <= delta <= N - 2 * - * 2) obviously vecs(A) <= ncpu(A) because: + * 2) obviously grps(A) <= ncpu(A) because: * - * if vecs(A) is 1, then vecs(A) <= ncpu(A) given + * if grps(A) is 1, then grps(A) <= ncpu(A) given * ncpu(A) >= 1 * * otherwise, - * vecs(A) <= V * ncpu(A) / N <= ncpu(A), given V <= N + * grps(A) <= G * ncpu(A) / N <= ncpu(A), given G <= N * - * 3) prove how vecs(B) <= ncpu(B): + * 3) prove how grps(B) <= ncpu(B): * - * if round_down(V * ncpu(A) / N) == 0, vecs(B) won't be - * over-allocated, so vecs(B) <= ncpu(B), + * if round_down(G * ncpu(A) / N) == 0, vecs(B) won't be + * over-allocated, so grps(B) <= ncpu(B), * * otherwise: * - * vecs(A) = - * round_down(V * ncpu(A) / N) = + * grps(A) = + * round_down(G * ncpu(A) / N) = * round_down((N - delta) * ncpu(A) / N) = * round_down((N * ncpu(A) - delta * ncpu(A)) / N) >= * round_down((N * ncpu(A) - delta * N) / N) = @@ -210,52 +210,50 @@ static void alloc_nodes_vectors(unsigned int numvecs, * * then: * - * vecs(A) - V >= ncpu(A) - delta - V + * grps(A) - G >= ncpu(A) - delta - G * => - * V - vecs(A) <= V + delta - ncpu(A) + * G - grps(A) <= G + delta - ncpu(A) * => - * vecs(B) <= N - ncpu(A) + * grps(B) <= N - ncpu(A) * => - * vecs(B) <= cpu(B) + * grps(B) <= cpu(B) * * For nodes >= 3, it can be thought as one node and another big * node given that is exactly what this algorithm is implemented, - * and we always re-calculate 'remaining_ncpus' & 'numvecs', and - * finally for each node X: vecs(X) <= ncpu(X). + * and we always re-calculate 'remaining_ncpus' & 'numgrps', and + * finally for each node X: grps(X) <= ncpu(X). 
* */ for (n = 0; n < nr_node_ids; n++) { - unsigned nvectors, ncpus; + unsigned ngroups, ncpus; - if (node_vectors[n].ncpus == UINT_MAX) + if (node_groups[n].ncpus == UINT_MAX) continue; - WARN_ON_ONCE(numvecs == 0); + WARN_ON_ONCE(numgrps == 0); - ncpus = node_vectors[n].ncpus; - nvectors = max_t(unsigned, 1, - numvecs * ncpus / remaining_ncpus); - WARN_ON_ONCE(nvectors > ncpus); + ncpus = node_groups[n].ncpus; + ngroups = max_t(unsigned, 1, + numgrps * ncpus / remaining_ncpus); + WARN_ON_ONCE(ngroups > ncpus); - node_vectors[n].nvectors = nvectors; + node_groups[n].ngroups = ngroups; remaining_ncpus -= ncpus; - numvecs -= nvectors; + numgrps -= ngroups; } } -static int __irq_build_affinity_masks(unsigned int startvec, - unsigned int numvecs, - cpumask_var_t *node_to_cpumask, - const struct cpumask *cpu_mask, - struct cpumask *nmsk, - struct cpumask *masks) +static int __group_cpus_evenly(unsigned int startgrp, unsigned int numgrps, + cpumask_var_t *node_to_cpumask, + const struct cpumask *cpu_mask, + struct cpumask *nmsk, struct cpumask *masks) { - unsigned int i, n, nodes, cpus_per_vec, extra_vecs, done = 0; - unsigned int last_affv = numvecs; - unsigned int curvec = startvec; + unsigned int i, n, nodes, cpus_per_grp, extra_grps, done = 0; + unsigned int last_grp = numgrps; + unsigned int curgrp = startgrp; nodemask_t nodemsk = NODE_MASK_NONE; - struct node_vectors *node_vectors; + struct node_groups *node_groups; if (cpumask_empty(cpu_mask)) return 0; @@ -264,34 +262,33 @@ static int __irq_build_affinity_masks(unsigned int startvec, /* * If the number of nodes in the mask is greater than or equal the - * number of vectors we just spread the vectors across the nodes. + * number of groups we just spread the groups across the nodes. */ - if (numvecs <= nodes) { + if (numgrps <= nodes) { for_each_node_mask(n, nodemsk) { /* Ensure that only CPUs which are in both masks are set */ cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]); - cpumask_or(&masks[curvec], &masks[curvec], nmsk); - if (++curvec == last_affv) - curvec = 0; + cpumask_or(&masks[curgrp], &masks[curgrp], nmsk); + if (++curgrp == last_grp) + curgrp = 0; } - return numvecs; + return numgrps; } - node_vectors = kcalloc(nr_node_ids, - sizeof(struct node_vectors), + node_groups = kcalloc(nr_node_ids, + sizeof(struct node_groups), GFP_KERNEL); - if (!node_vectors) + if (!node_groups) return -ENOMEM; - /* allocate vector number for each node */ - alloc_nodes_vectors(numvecs, node_to_cpumask, cpu_mask, - nodemsk, nmsk, node_vectors); - + /* allocate group number for each node */ + alloc_nodes_groups(numgrps, node_to_cpumask, cpu_mask, + nodemsk, nmsk, node_groups); for (i = 0; i < nr_node_ids; i++) { unsigned int ncpus, v; - struct node_vectors *nv = &node_vectors[i]; + struct node_groups *nv = &node_groups[i]; - if (nv->nvectors == UINT_MAX) + if (nv->ngroups == UINT_MAX) continue; /* Get the cpus on this node which are in the mask */ @@ -300,44 +297,47 @@ static int __irq_build_affinity_masks(unsigned int startvec, if (!ncpus) continue; - WARN_ON_ONCE(nv->nvectors > ncpus); + WARN_ON_ONCE(nv->ngroups > ncpus); /* Account for rounding errors */ - extra_vecs = ncpus - nv->nvectors * (ncpus / nv->nvectors); + extra_grps = ncpus - nv->ngroups * (ncpus / nv->ngroups); - /* Spread allocated vectors on CPUs of the current node */ - for (v = 0; v < nv->nvectors; v++, curvec++) { - cpus_per_vec = ncpus / nv->nvectors; + /* Spread allocated groups on CPUs of the current node */ + for (v = 0; v < nv->ngroups; v++, curgrp++) { + cpus_per_grp = 
ncpus / nv->ngroups; - /* Account for extra vectors to compensate rounding errors */ - if (extra_vecs) { - cpus_per_vec++; - --extra_vecs; + /* Account for extra groups to compensate rounding errors */ + if (extra_grps) { + cpus_per_grp++; + --extra_grps; } /* - * wrapping has to be considered given 'startvec' + * wrapping has to be considered given 'startgrp' * may start anywhere */ - if (curvec >= last_affv) - curvec = 0; - irq_spread_init_one(&masks[curvec], nmsk, - cpus_per_vec); + if (curgrp >= last_grp) + curgrp = 0; + grp_spread_init_one(&masks[curgrp], nmsk, + cpus_per_grp); } - done += nv->nvectors; + done += nv->ngroups; } - kfree(node_vectors); + kfree(node_groups); return done; } /* - * build affinity in two stages: - * 1) spread present CPU on these vectors - * 2) spread other possible CPUs on these vectors + * build affinity in two stages for each group, and try to put close CPUs + * in viewpoint of CPU and NUMA locality into same group, and we run + * two-stage grouping: + * + * 1) allocate present CPUs on these groups evenly first + * 2) allocate other possible CPUs on these groups evenly */ -static struct cpumask *irq_build_affinity_masks(unsigned int numvecs) +static struct cpumask *group_cpus_evenly(unsigned int numgrps) { - unsigned int curvec = 0, nr_present = 0, nr_others = 0; + unsigned int curgrp = 0, nr_present = 0, nr_others = 0; cpumask_var_t *node_to_cpumask; cpumask_var_t nmsk, npresmsk; int ret = -ENOMEM; @@ -353,7 +353,7 @@ static struct cpumask *irq_build_affinity_masks(unsigned int numvecs) if (!node_to_cpumask) goto fail_npresmsk; - masks = kcalloc(numvecs, sizeof(*masks), GFP_KERNEL); + masks = kcalloc(numgrps, sizeof(*masks), GFP_KERNEL); if (!masks) goto fail_node_to_cpumask; @@ -361,26 +361,26 @@ static struct cpumask *irq_build_affinity_masks(unsigned int numvecs) cpus_read_lock(); build_node_to_cpumask(node_to_cpumask); - /* Spread on present CPUs starting from affd->pre_vectors */ - ret = __irq_build_affinity_masks(curvec, numvecs, node_to_cpumask, - cpu_present_mask, nmsk, masks); + /* grouping present CPUs first */ + ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask, + cpu_present_mask, nmsk, masks); if (ret < 0) goto fail_build_affinity; nr_present = ret; /* - * Spread on non present CPUs starting from the next vector to be - * handled. If the spreading of present CPUs already exhausted the - * vector space, assign the non present CPUs to the already spread - * out vectors. + * Allocate non present CPUs starting from the next group to be + * handled. If the grouping of present CPUs already exhausted the + * group space, assign the non present CPUs to the already + * allocated out groups. 
*/ - if (nr_present >= numvecs) - curvec = 0; + if (nr_present >= numgrps) + curgrp = 0; else - curvec = nr_present; + curgrp = nr_present; cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask); - ret = __irq_build_affinity_masks(curvec, numvecs, node_to_cpumask, - npresmsk, nmsk, masks); + ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask, + npresmsk, nmsk, masks); if (ret >= 0) nr_others = ret; @@ -388,7 +388,7 @@ static struct cpumask *irq_build_affinity_masks(unsigned int numvecs) cpus_read_unlock(); if (ret >= 0) - WARN_ON(nr_present + nr_others < numvecs); + WARN_ON(nr_present + nr_others < numgrps); fail_node_to_cpumask: free_node_to_cpumask(node_to_cpumask); @@ -467,7 +467,7 @@ irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd) for (i = 0, usedvecs = 0; i < affd->nr_sets; i++) { unsigned int this_vecs = affd->set_size[i]; int j; - struct cpumask *result = irq_build_affinity_masks(this_vecs); + struct cpumask *result = group_cpus_evenly(this_vecs); if (!result) { kfree(masks); From f33b27f5c3de579dc8e3af27569507c868cb0812 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 27 Dec 2022 10:29:04 +0800 Subject: [PATCH 116/151] genirq/affinity: Move group_cpus_evenly() into lib/ [ Upstream commit f7b3ea8cf72f3d6060fe08e461805181e7450a13 ] group_cpus_evenly() has become a generic function which can be used for other subsystems than the interrupt subsystem, so move it into lib/. Signed-off-by: Ming Lei Signed-off-by: Thomas Gleixner Reviewed-by: Christoph Hellwig Reviewed-by: Jens Axboe Link: https://lore.kernel.org/r/20221227022905.352674-6-ming.lei@redhat.com Stable-dep-of: 0263f92fadbb ("lib/group_cpus.c: avoid acquiring cpu hotplug lock in group_cpus_evenly") Signed-off-by: Sasha Levin --- MAINTAINERS | 2 + include/linux/group_cpus.h | 14 ++ kernel/irq/affinity.c | 398 +--------------------------------- lib/Makefile | 2 + lib/group_cpus.c | 427 +++++++++++++++++++++++++++++++++++++ 5 files changed, 446 insertions(+), 397 deletions(-) create mode 100644 include/linux/group_cpus.h create mode 100644 lib/group_cpus.c diff --git a/MAINTAINERS b/MAINTAINERS index 07a9c274c0e2..13d1078808bb 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10803,6 +10803,8 @@ L: linux-kernel@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git irq/core F: kernel/irq/ +F: include/linux/group_cpus.h +F: lib/group_cpus.c IRQCHIP DRIVERS M: Thomas Gleixner diff --git a/include/linux/group_cpus.h b/include/linux/group_cpus.h new file mode 100644 index 000000000000..e42807ec61f6 --- /dev/null +++ b/include/linux/group_cpus.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2016 Thomas Gleixner. + * Copyright (C) 2016-2017 Christoph Hellwig. 
+ */ + +#ifndef __LINUX_GROUP_CPUS_H +#define __LINUX_GROUP_CPUS_H +#include +#include + +struct cpumask *group_cpus_evenly(unsigned int numgrps); + +#endif diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index 54083331f1bc..44a4eba80315 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -7,403 +7,7 @@ #include #include #include -#include - -static void grp_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk, - unsigned int cpus_per_grp) -{ - const struct cpumask *siblmsk; - int cpu, sibl; - - for ( ; cpus_per_grp > 0; ) { - cpu = cpumask_first(nmsk); - - /* Should not happen, but I'm too lazy to think about it */ - if (cpu >= nr_cpu_ids) - return; - - cpumask_clear_cpu(cpu, nmsk); - cpumask_set_cpu(cpu, irqmsk); - cpus_per_grp--; - - /* If the cpu has siblings, use them first */ - siblmsk = topology_sibling_cpumask(cpu); - for (sibl = -1; cpus_per_grp > 0; ) { - sibl = cpumask_next(sibl, siblmsk); - if (sibl >= nr_cpu_ids) - break; - if (!cpumask_test_and_clear_cpu(sibl, nmsk)) - continue; - cpumask_set_cpu(sibl, irqmsk); - cpus_per_grp--; - } - } -} - -static cpumask_var_t *alloc_node_to_cpumask(void) -{ - cpumask_var_t *masks; - int node; - - masks = kcalloc(nr_node_ids, sizeof(cpumask_var_t), GFP_KERNEL); - if (!masks) - return NULL; - - for (node = 0; node < nr_node_ids; node++) { - if (!zalloc_cpumask_var(&masks[node], GFP_KERNEL)) - goto out_unwind; - } - - return masks; - -out_unwind: - while (--node >= 0) - free_cpumask_var(masks[node]); - kfree(masks); - return NULL; -} - -static void free_node_to_cpumask(cpumask_var_t *masks) -{ - int node; - - for (node = 0; node < nr_node_ids; node++) - free_cpumask_var(masks[node]); - kfree(masks); -} - -static void build_node_to_cpumask(cpumask_var_t *masks) -{ - int cpu; - - for_each_possible_cpu(cpu) - cpumask_set_cpu(cpu, masks[cpu_to_node(cpu)]); -} - -static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask, - const struct cpumask *mask, nodemask_t *nodemsk) -{ - int n, nodes = 0; - - /* Calculate the number of nodes in the supplied affinity mask */ - for_each_node(n) { - if (cpumask_intersects(mask, node_to_cpumask[n])) { - node_set(n, *nodemsk); - nodes++; - } - } - return nodes; -} - -struct node_groups { - unsigned id; - - union { - unsigned ngroups; - unsigned ncpus; - }; -}; - -static int ncpus_cmp_func(const void *l, const void *r) -{ - const struct node_groups *ln = l; - const struct node_groups *rn = r; - - return ln->ncpus - rn->ncpus; -} - -/* - * Allocate group number for each node, so that for each node: - * - * 1) the allocated number is >= 1 - * - * 2) the allocated number is <= active CPU number of this node - * - * The actual allocated total groups may be less than @numgrps when - * active total CPU number is less than @numgrps. - * - * Active CPUs means the CPUs in '@cpu_mask AND @node_to_cpumask[]' - * for each node. 
- */ -static void alloc_nodes_groups(unsigned int numgrps, - cpumask_var_t *node_to_cpumask, - const struct cpumask *cpu_mask, - const nodemask_t nodemsk, - struct cpumask *nmsk, - struct node_groups *node_groups) -{ - unsigned n, remaining_ncpus = 0; - - for (n = 0; n < nr_node_ids; n++) { - node_groups[n].id = n; - node_groups[n].ncpus = UINT_MAX; - } - - for_each_node_mask(n, nodemsk) { - unsigned ncpus; - - cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]); - ncpus = cpumask_weight(nmsk); - - if (!ncpus) - continue; - remaining_ncpus += ncpus; - node_groups[n].ncpus = ncpus; - } - - numgrps = min_t(unsigned, remaining_ncpus, numgrps); - - sort(node_groups, nr_node_ids, sizeof(node_groups[0]), - ncpus_cmp_func, NULL); - - /* - * Allocate groups for each node according to the ratio of this - * node's nr_cpus to remaining un-assigned ncpus. 'numgrps' is - * bigger than number of active numa nodes. Always start the - * allocation from the node with minimized nr_cpus. - * - * This way guarantees that each active node gets allocated at - * least one group, and the theory is simple: over-allocation - * is only done when this node is assigned by one group, so - * other nodes will be allocated >= 1 groups, since 'numgrps' is - * bigger than number of numa nodes. - * - * One perfect invariant is that number of allocated groups for - * each node is <= CPU count of this node: - * - * 1) suppose there are two nodes: A and B - * ncpu(X) is CPU count of node X - * grps(X) is the group count allocated to node X via this - * algorithm - * - * ncpu(A) <= ncpu(B) - * ncpu(A) + ncpu(B) = N - * grps(A) + grps(B) = G - * - * grps(A) = max(1, round_down(G * ncpu(A) / N)) - * grps(B) = G - grps(A) - * - * both N and G are integer, and 2 <= G <= N, suppose - * G = N - delta, and 0 <= delta <= N - 2 - * - * 2) obviously grps(A) <= ncpu(A) because: - * - * if grps(A) is 1, then grps(A) <= ncpu(A) given - * ncpu(A) >= 1 - * - * otherwise, - * grps(A) <= G * ncpu(A) / N <= ncpu(A), given G <= N - * - * 3) prove how grps(B) <= ncpu(B): - * - * if round_down(G * ncpu(A) / N) == 0, vecs(B) won't be - * over-allocated, so grps(B) <= ncpu(B), - * - * otherwise: - * - * grps(A) = - * round_down(G * ncpu(A) / N) = - * round_down((N - delta) * ncpu(A) / N) = - * round_down((N * ncpu(A) - delta * ncpu(A)) / N) >= - * round_down((N * ncpu(A) - delta * N) / N) = - * cpu(A) - delta - * - * then: - * - * grps(A) - G >= ncpu(A) - delta - G - * => - * G - grps(A) <= G + delta - ncpu(A) - * => - * grps(B) <= N - ncpu(A) - * => - * grps(B) <= cpu(B) - * - * For nodes >= 3, it can be thought as one node and another big - * node given that is exactly what this algorithm is implemented, - * and we always re-calculate 'remaining_ncpus' & 'numgrps', and - * finally for each node X: grps(X) <= ncpu(X). 
- * - */ - for (n = 0; n < nr_node_ids; n++) { - unsigned ngroups, ncpus; - - if (node_groups[n].ncpus == UINT_MAX) - continue; - - WARN_ON_ONCE(numgrps == 0); - - ncpus = node_groups[n].ncpus; - ngroups = max_t(unsigned, 1, - numgrps * ncpus / remaining_ncpus); - WARN_ON_ONCE(ngroups > ncpus); - - node_groups[n].ngroups = ngroups; - - remaining_ncpus -= ncpus; - numgrps -= ngroups; - } -} - -static int __group_cpus_evenly(unsigned int startgrp, unsigned int numgrps, - cpumask_var_t *node_to_cpumask, - const struct cpumask *cpu_mask, - struct cpumask *nmsk, struct cpumask *masks) -{ - unsigned int i, n, nodes, cpus_per_grp, extra_grps, done = 0; - unsigned int last_grp = numgrps; - unsigned int curgrp = startgrp; - nodemask_t nodemsk = NODE_MASK_NONE; - struct node_groups *node_groups; - - if (cpumask_empty(cpu_mask)) - return 0; - - nodes = get_nodes_in_cpumask(node_to_cpumask, cpu_mask, &nodemsk); - - /* - * If the number of nodes in the mask is greater than or equal the - * number of groups we just spread the groups across the nodes. - */ - if (numgrps <= nodes) { - for_each_node_mask(n, nodemsk) { - /* Ensure that only CPUs which are in both masks are set */ - cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]); - cpumask_or(&masks[curgrp], &masks[curgrp], nmsk); - if (++curgrp == last_grp) - curgrp = 0; - } - return numgrps; - } - - node_groups = kcalloc(nr_node_ids, - sizeof(struct node_groups), - GFP_KERNEL); - if (!node_groups) - return -ENOMEM; - - /* allocate group number for each node */ - alloc_nodes_groups(numgrps, node_to_cpumask, cpu_mask, - nodemsk, nmsk, node_groups); - for (i = 0; i < nr_node_ids; i++) { - unsigned int ncpus, v; - struct node_groups *nv = &node_groups[i]; - - if (nv->ngroups == UINT_MAX) - continue; - - /* Get the cpus on this node which are in the mask */ - cpumask_and(nmsk, cpu_mask, node_to_cpumask[nv->id]); - ncpus = cpumask_weight(nmsk); - if (!ncpus) - continue; - - WARN_ON_ONCE(nv->ngroups > ncpus); - - /* Account for rounding errors */ - extra_grps = ncpus - nv->ngroups * (ncpus / nv->ngroups); - - /* Spread allocated groups on CPUs of the current node */ - for (v = 0; v < nv->ngroups; v++, curgrp++) { - cpus_per_grp = ncpus / nv->ngroups; - - /* Account for extra groups to compensate rounding errors */ - if (extra_grps) { - cpus_per_grp++; - --extra_grps; - } - - /* - * wrapping has to be considered given 'startgrp' - * may start anywhere - */ - if (curgrp >= last_grp) - curgrp = 0; - grp_spread_init_one(&masks[curgrp], nmsk, - cpus_per_grp); - } - done += nv->ngroups; - } - kfree(node_groups); - return done; -} - -/* - * build affinity in two stages for each group, and try to put close CPUs - * in viewpoint of CPU and NUMA locality into same group, and we run - * two-stage grouping: - * - * 1) allocate present CPUs on these groups evenly first - * 2) allocate other possible CPUs on these groups evenly - */ -static struct cpumask *group_cpus_evenly(unsigned int numgrps) -{ - unsigned int curgrp = 0, nr_present = 0, nr_others = 0; - cpumask_var_t *node_to_cpumask; - cpumask_var_t nmsk, npresmsk; - int ret = -ENOMEM; - struct cpumask *masks = NULL; - - if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL)) - return NULL; - - if (!zalloc_cpumask_var(&npresmsk, GFP_KERNEL)) - goto fail_nmsk; - - node_to_cpumask = alloc_node_to_cpumask(); - if (!node_to_cpumask) - goto fail_npresmsk; - - masks = kcalloc(numgrps, sizeof(*masks), GFP_KERNEL); - if (!masks) - goto fail_node_to_cpumask; - - /* Stabilize the cpumasks */ - cpus_read_lock(); - 
build_node_to_cpumask(node_to_cpumask); - - /* grouping present CPUs first */ - ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask, - cpu_present_mask, nmsk, masks); - if (ret < 0) - goto fail_build_affinity; - nr_present = ret; - - /* - * Allocate non present CPUs starting from the next group to be - * handled. If the grouping of present CPUs already exhausted the - * group space, assign the non present CPUs to the already - * allocated out groups. - */ - if (nr_present >= numgrps) - curgrp = 0; - else - curgrp = nr_present; - cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask); - ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask, - npresmsk, nmsk, masks); - if (ret >= 0) - nr_others = ret; - - fail_build_affinity: - cpus_read_unlock(); - - if (ret >= 0) - WARN_ON(nr_present + nr_others < numgrps); - - fail_node_to_cpumask: - free_node_to_cpumask(node_to_cpumask); - - fail_npresmsk: - free_cpumask_var(npresmsk); - - fail_nmsk: - free_cpumask_var(nmsk); - if (ret < 0) { - kfree(masks); - return NULL; - } - return masks; -} +#include static void default_calc_sets(struct irq_affinity *affd, unsigned int affvecs) { diff --git a/lib/Makefile b/lib/Makefile index 5ffe72ec9979..6f1611d053e6 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -361,6 +361,8 @@ obj-$(CONFIG_SBITMAP) += sbitmap.o obj-$(CONFIG_PARMAN) += parman.o +obj-y += group_cpus.o + # GCC library routines obj-$(CONFIG_GENERIC_LIB_ASHLDI3) += ashldi3.o obj-$(CONFIG_GENERIC_LIB_ASHRDI3) += ashrdi3.o diff --git a/lib/group_cpus.c b/lib/group_cpus.c new file mode 100644 index 000000000000..99f08c6cb9d9 --- /dev/null +++ b/lib/group_cpus.c @@ -0,0 +1,427 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2016 Thomas Gleixner. + * Copyright (C) 2016-2017 Christoph Hellwig. 
+ */ +#include +#include +#include +#include +#include + +static void grp_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk, + unsigned int cpus_per_grp) +{ + const struct cpumask *siblmsk; + int cpu, sibl; + + for ( ; cpus_per_grp > 0; ) { + cpu = cpumask_first(nmsk); + + /* Should not happen, but I'm too lazy to think about it */ + if (cpu >= nr_cpu_ids) + return; + + cpumask_clear_cpu(cpu, nmsk); + cpumask_set_cpu(cpu, irqmsk); + cpus_per_grp--; + + /* If the cpu has siblings, use them first */ + siblmsk = topology_sibling_cpumask(cpu); + for (sibl = -1; cpus_per_grp > 0; ) { + sibl = cpumask_next(sibl, siblmsk); + if (sibl >= nr_cpu_ids) + break; + if (!cpumask_test_and_clear_cpu(sibl, nmsk)) + continue; + cpumask_set_cpu(sibl, irqmsk); + cpus_per_grp--; + } + } +} + +static cpumask_var_t *alloc_node_to_cpumask(void) +{ + cpumask_var_t *masks; + int node; + + masks = kcalloc(nr_node_ids, sizeof(cpumask_var_t), GFP_KERNEL); + if (!masks) + return NULL; + + for (node = 0; node < nr_node_ids; node++) { + if (!zalloc_cpumask_var(&masks[node], GFP_KERNEL)) + goto out_unwind; + } + + return masks; + +out_unwind: + while (--node >= 0) + free_cpumask_var(masks[node]); + kfree(masks); + return NULL; +} + +static void free_node_to_cpumask(cpumask_var_t *masks) +{ + int node; + + for (node = 0; node < nr_node_ids; node++) + free_cpumask_var(masks[node]); + kfree(masks); +} + +static void build_node_to_cpumask(cpumask_var_t *masks) +{ + int cpu; + + for_each_possible_cpu(cpu) + cpumask_set_cpu(cpu, masks[cpu_to_node(cpu)]); +} + +static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask, + const struct cpumask *mask, nodemask_t *nodemsk) +{ + int n, nodes = 0; + + /* Calculate the number of nodes in the supplied affinity mask */ + for_each_node(n) { + if (cpumask_intersects(mask, node_to_cpumask[n])) { + node_set(n, *nodemsk); + nodes++; + } + } + return nodes; +} + +struct node_groups { + unsigned id; + + union { + unsigned ngroups; + unsigned ncpus; + }; +}; + +static int ncpus_cmp_func(const void *l, const void *r) +{ + const struct node_groups *ln = l; + const struct node_groups *rn = r; + + return ln->ncpus - rn->ncpus; +} + +/* + * Allocate group number for each node, so that for each node: + * + * 1) the allocated number is >= 1 + * + * 2) the allocated number is <= active CPU number of this node + * + * The actual allocated total groups may be less than @numgrps when + * active total CPU number is less than @numgrps. + * + * Active CPUs means the CPUs in '@cpu_mask AND @node_to_cpumask[]' + * for each node. + */ +static void alloc_nodes_groups(unsigned int numgrps, + cpumask_var_t *node_to_cpumask, + const struct cpumask *cpu_mask, + const nodemask_t nodemsk, + struct cpumask *nmsk, + struct node_groups *node_groups) +{ + unsigned n, remaining_ncpus = 0; + + for (n = 0; n < nr_node_ids; n++) { + node_groups[n].id = n; + node_groups[n].ncpus = UINT_MAX; + } + + for_each_node_mask(n, nodemsk) { + unsigned ncpus; + + cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]); + ncpus = cpumask_weight(nmsk); + + if (!ncpus) + continue; + remaining_ncpus += ncpus; + node_groups[n].ncpus = ncpus; + } + + numgrps = min_t(unsigned, remaining_ncpus, numgrps); + + sort(node_groups, nr_node_ids, sizeof(node_groups[0]), + ncpus_cmp_func, NULL); + + /* + * Allocate groups for each node according to the ratio of this + * node's nr_cpus to remaining un-assigned ncpus. 'numgrps' is + * bigger than number of active numa nodes. Always start the + * allocation from the node with minimized nr_cpus. 
+	 *
+	 * This way guarantees that each active node gets allocated at
+	 * least one group, and the theory is simple: over-allocation
+	 * is only done when this node is assigned by one group, so
+	 * other nodes will be allocated >= 1 groups, since 'numgrps' is
+	 * bigger than number of numa nodes.
+	 *
+	 * One perfect invariant is that number of allocated groups for
+	 * each node is <= CPU count of this node:
+	 *
+	 * 1) suppose there are two nodes: A and B
+	 * 	ncpu(X) is CPU count of node X
+	 * 	grps(X) is the group count allocated to node X via this
+	 * 	algorithm
+	 *
+	 * 	ncpu(A) <= ncpu(B)
+	 * 	ncpu(A) + ncpu(B) = N
+	 * 	grps(A) + grps(B) = G
+	 *
+	 * 	grps(A) = max(1, round_down(G * ncpu(A) / N))
+	 * 	grps(B) = G - grps(A)
+	 *
+	 * 	both N and G are integers, and 2 <= G <= N, suppose
+	 * 	G = N - delta, and 0 <= delta <= N - 2
+	 *
+	 * 2) obviously grps(A) <= ncpu(A) because:
+	 *
+	 * 	if grps(A) is 1, then grps(A) <= ncpu(A) given
+	 * 	ncpu(A) >= 1
+	 *
+	 * 	otherwise,
+	 * 		grps(A) <= G * ncpu(A) / N <= ncpu(A), given G <= N
+	 *
+	 * 3) prove how grps(B) <= ncpu(B):
+	 *
+	 * 	if round_down(G * ncpu(A) / N) == 0, grps(B) won't be
+	 * 	over-allocated, so grps(B) <= ncpu(B),
+	 *
+	 * 	otherwise:
+	 *
+	 * 	grps(A) =
+	 * 		round_down(G * ncpu(A) / N) =
+	 * 		round_down((N - delta) * ncpu(A) / N) =
+	 * 		round_down((N * ncpu(A) - delta * ncpu(A)) / N) >=
+	 * 		round_down((N * ncpu(A) - delta * N) / N) =
+	 * 		ncpu(A) - delta
+	 *
+	 * 	then:
+	 *
+	 * 	grps(A) - G >= ncpu(A) - delta - G
+	 * 	=>
+	 * 	G - grps(A) <= G + delta - ncpu(A)
+	 * 	=>
+	 * 	grps(B) <= N - ncpu(A)
+	 * 	=>
+	 * 	grps(B) <= ncpu(B)
+	 *
+	 * For nodes >= 3, they can be thought of as one node and another
+	 * big node, given that is exactly how this algorithm is
+	 * implemented, and we always re-calculate 'remaining_ncpus' &
+	 * 'numgrps', and finally for each node X: grps(X) <= ncpu(X).
+	 *
+	 */
+	for (n = 0; n < nr_node_ids; n++) {
+		unsigned ngroups, ncpus;
+
+		if (node_groups[n].ncpus == UINT_MAX)
+			continue;
+
+		WARN_ON_ONCE(numgrps == 0);
+
+		ncpus = node_groups[n].ncpus;
+		ngroups = max_t(unsigned, 1,
+				 numgrps * ncpus / remaining_ncpus);
+		WARN_ON_ONCE(ngroups > ncpus);
+
+		node_groups[n].ngroups = ngroups;
+
+		remaining_ncpus -= ncpus;
+		numgrps -= ngroups;
+	}
+}
+
+static int __group_cpus_evenly(unsigned int startgrp, unsigned int numgrps,
+			       cpumask_var_t *node_to_cpumask,
+			       const struct cpumask *cpu_mask,
+			       struct cpumask *nmsk, struct cpumask *masks)
+{
+	unsigned int i, n, nodes, cpus_per_grp, extra_grps, done = 0;
+	unsigned int last_grp = numgrps;
+	unsigned int curgrp = startgrp;
+	nodemask_t nodemsk = NODE_MASK_NONE;
+	struct node_groups *node_groups;
+
+	if (cpumask_empty(cpu_mask))
+		return 0;
+
+	nodes = get_nodes_in_cpumask(node_to_cpumask, cpu_mask, &nodemsk);
+
+	/*
+	 * If the number of nodes in the mask is greater than or equal to
+	 * the number of groups we just spread the groups across the
+	 * nodes.
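+	 * (Illustration: asking for 2 groups across 4 nodes walks the
+	 * nodes round-robin, so nodes 0 and 2 land in group 0 while
+	 * nodes 1 and 3 land in group 1.)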
+	 */
+	if (numgrps <= nodes) {
+		for_each_node_mask(n, nodemsk) {
+			/* Ensure that only CPUs which are in both masks are set */
+			cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
+			cpumask_or(&masks[curgrp], &masks[curgrp], nmsk);
+			if (++curgrp == last_grp)
+				curgrp = 0;
+		}
+		return numgrps;
+	}
+
+	node_groups = kcalloc(nr_node_ids,
+			      sizeof(struct node_groups),
+			      GFP_KERNEL);
+	if (!node_groups)
+		return -ENOMEM;
+
+	/* allocate group number for each node */
+	alloc_nodes_groups(numgrps, node_to_cpumask, cpu_mask,
+			   nodemsk, nmsk, node_groups);
+	for (i = 0; i < nr_node_ids; i++) {
+		unsigned int ncpus, v;
+		struct node_groups *nv = &node_groups[i];
+
+		if (nv->ngroups == UINT_MAX)
+			continue;
+
+		/* Get the cpus on this node which are in the mask */
+		cpumask_and(nmsk, cpu_mask, node_to_cpumask[nv->id]);
+		ncpus = cpumask_weight(nmsk);
+		if (!ncpus)
+			continue;
+
+		WARN_ON_ONCE(nv->ngroups > ncpus);
+
+		/* Account for rounding errors */
+		extra_grps = ncpus - nv->ngroups * (ncpus / nv->ngroups);
+
+		/* Spread allocated groups on CPUs of the current node */
+		for (v = 0; v < nv->ngroups; v++, curgrp++) {
+			cpus_per_grp = ncpus / nv->ngroups;
+
+			/* Account for extra groups to compensate for rounding errors */
+			if (extra_grps) {
+				cpus_per_grp++;
+				--extra_grps;
+			}
+
+			/*
+			 * wrapping has to be considered given 'startgrp'
+			 * may start anywhere
+			 */
+			if (curgrp >= last_grp)
+				curgrp = 0;
+			grp_spread_init_one(&masks[curgrp], nmsk,
+					    cpus_per_grp);
+		}
+		done += nv->ngroups;
+	}
+	kfree(node_groups);
+	return done;
+}
+
+#ifdef CONFIG_SMP
+/**
+ * group_cpus_evenly - Group all CPUs evenly per NUMA/CPU locality
+ * @numgrps: number of groups
+ *
+ * Return: cpumask array if successful, NULL otherwise. Each element
+ * of the array holds the CPUs assigned to that group.
+ *
+ * Try to put close CPUs from viewpoint of CPU and NUMA locality into
+ * same group, and run two-stage grouping:
+ *	1) allocate present CPUs on these groups evenly first
+ *	2) allocate other possible CPUs on these groups evenly
+ *
+ * The resulting grouping guarantees that all CPUs are covered and
+ * that no CPU is assigned to multiple groups.
+ */
+struct cpumask *group_cpus_evenly(unsigned int numgrps)
+{
+	unsigned int curgrp = 0, nr_present = 0, nr_others = 0;
+	cpumask_var_t *node_to_cpumask;
+	cpumask_var_t nmsk, npresmsk;
+	int ret = -ENOMEM;
+	struct cpumask *masks = NULL;
+
+	if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
+		return NULL;
+
+	if (!zalloc_cpumask_var(&npresmsk, GFP_KERNEL))
+		goto fail_nmsk;
+
+	node_to_cpumask = alloc_node_to_cpumask();
+	if (!node_to_cpumask)
+		goto fail_npresmsk;
+
+	masks = kcalloc(numgrps, sizeof(*masks), GFP_KERNEL);
+	if (!masks)
+		goto fail_node_to_cpumask;
+
+	/* Stabilize the cpumasks */
+	cpus_read_lock();
+	build_node_to_cpumask(node_to_cpumask);
+
+	/* grouping present CPUs first */
+	ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask,
+				  cpu_present_mask, nmsk, masks);
+	if (ret < 0)
+		goto fail_build_affinity;
+	nr_present = ret;
+
+	/*
+	 * Allocate non present CPUs starting from the next group to be
+	 * handled. If the grouping of present CPUs already exhausted the
+	 * group space, assign the non present CPUs to the already
+	 * allocated out groups.
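+	 * (For example, if 8 groups were requested but present CPUs
+	 * only filled 4 of them, the non-present CPUs start at group 4;
+	 * had all 8 groups been used, they would wrap back to group 0.)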
+	 */
+	if (nr_present >= numgrps)
+		curgrp = 0;
+	else
+		curgrp = nr_present;
+	cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask);
+	ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask,
+				  npresmsk, nmsk, masks);
+	if (ret >= 0)
+		nr_others = ret;
+
+ fail_build_affinity:
+	cpus_read_unlock();
+
+	if (ret >= 0)
+		WARN_ON(nr_present + nr_others < numgrps);
+
+ fail_node_to_cpumask:
+	free_node_to_cpumask(node_to_cpumask);
+
+ fail_npresmsk:
+	free_cpumask_var(npresmsk);
+
+ fail_nmsk:
+	free_cpumask_var(nmsk);
+	if (ret < 0) {
+		kfree(masks);
+		return NULL;
+	}
+	return masks;
+}
+#else
+struct cpumask *group_cpus_evenly(unsigned int numgrps)
+{
+	struct cpumask *masks = kcalloc(numgrps, sizeof(*masks), GFP_KERNEL);
+
+	if (!masks)
+		return NULL;
+
+	/* assign all CPUs(cpu 0) to the 1st group only */
+	cpumask_copy(&masks[0], cpu_possible_mask);
+	return masks;
+}
+#endif

From a576780a2a66b1c6da69f5eed4cf1b307f7c97aa Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Mon, 20 Nov 2023 16:35:59 +0800
Subject: [PATCH 117/151] lib/group_cpus.c: avoid acquiring cpu hotplug lock in group_cpus_evenly

[ Upstream commit 0263f92fadbb9d294d5971ac57743f882c93b2b3 ]

group_cpus_evenly() can be called from a storage driver's error
handler, such as the nvme driver's, which may run during CPU hotplug,
when a storage queue has to drain its pending IOs because all CPUs
associated with the queue are offline and the queue is becoming
inactive. Handling that IO requires the error handler to make forward
progress.

A deadlock is then possible:

1) inside the CPU hotplug handler, the CPU hotplug lock is held, and
blk-mq's handler is waiting for inflight IO

2) the error handler is waiting for the CPU hotplug lock

3) the inflight IO can't be completed in blk-mq's CPU hotplug handler
because error handling can't make forward progress

Solve the deadlock by not holding the CPU hotplug lock in
group_cpus_evenly(), which performs a two-stage spread: 1) the first
stage covers all present CPUs; 2) the second stage covers all other
possible CPUs.

It turns out the two-stage spread only needs a consistent
'cpu_present_mask', so drop the CPU hotplug lock and store a snapshot
of the mask in a local cache instead. This doesn't change correctness,
because all CPUs are still covered.

Link: https://lkml.kernel.org/r/20231120083559.285174-1-ming.lei@redhat.com
Signed-off-by: Ming Lei
Reported-by: Yi Zhang
Reported-by: Guangwu Zhang
Tested-by: Guangwu Zhang
Reviewed-by: Chengming Zhou
Reviewed-by: Jens Axboe
Cc: Keith Busch
Cc:
Signed-off-by: Andrew Morton
Signed-off-by: Sasha Levin
---
 lib/group_cpus.c | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/lib/group_cpus.c b/lib/group_cpus.c
index 99f08c6cb9d9..156b1446d2a2 100644
--- a/lib/group_cpus.c
+++ b/lib/group_cpus.c
@@ -365,13 +365,25 @@ struct cpumask *group_cpus_evenly(unsigned int numgrps)
 	if (!masks)
 		goto fail_node_to_cpumask;
 
-	/* Stabilize the cpumasks */
-	cpus_read_lock();
 	build_node_to_cpumask(node_to_cpumask);
 
+	/*
+	 * Make a local cache of 'cpu_present_mask', so the two-stage
+	 * spread can observe a consistent 'cpu_present_mask' without
+	 * holding the cpu hotplug lock, which reduces deadlock risk
+	 * with the cpu hotplug code.
+	 *
+	 * CPU hotplug may happen while reading 'cpu_present_mask', and
+	 * we can live with that because it only affects whether a
+	 * hotplugged CPU is handled in the 1st or the 2nd stage, and
+	 * either way is correct from the API user's viewpoint, since
+	 * the two-stage spread is an optimization.
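+	 *
+	 * (The data_race() annotation below marks this unlocked read
+	 * as intentional for KCSAN.)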
+	 */
+	cpumask_copy(npresmsk, data_race(cpu_present_mask));
+
 	/* grouping present CPUs first */
 	ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask,
-				  cpu_present_mask, nmsk, masks);
+				  npresmsk, nmsk, masks);
 	if (ret < 0)
 		goto fail_build_affinity;
 	nr_present = ret;
@@ -386,15 +398,13 @@ struct cpumask *group_cpus_evenly(unsigned int numgrps)
 		curgrp = 0;
 	else
 		curgrp = nr_present;
-	cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask);
+	cpumask_andnot(npresmsk, cpu_possible_mask, npresmsk);
 	ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask,
 				  npresmsk, nmsk, masks);
 	if (ret >= 0)
 		nr_others = ret;
 
  fail_build_affinity:
-	cpus_read_unlock();
-
 	if (ret >= 0)
 		WARN_ON(nr_present + nr_others < numgrps);
 

From 4666f003afffbea8ec8421bbea5aab260d0ac7b9 Mon Sep 17 00:00:00 2001
From: Sumanth Korikkar
Date: Mon, 20 Nov 2023 15:53:52 +0100
Subject: [PATCH 118/151] mm/memory_hotplug: add missing mem_hotplug_lock

[ Upstream commit 001002e73712cdf6b8d9a103648cda3040ad7647 ]

From Documentation/core-api/memory-hotplug.rst:
When adding/removing/onlining/offlining memory or adding/removing
heterogeneous/device memory, we should always hold the mem_hotplug_lock
in write mode to serialise memory hotplug (e.g. access to global/zone
variables).

mhp_(de)init_memmap_on_memory() functions can change zone stats and
struct page content, but they are currently called w/o the
mem_hotplug_lock.

When memory block is being offlined and when kmemleak goes through each
populated zone, the following theoretical race conditions could occur:

CPU 0:                                       | CPU 1:
memory_offline()                             |
-> offline_pages()                           |
   -> mem_hotplug_begin()                    |
      ...                                    |
   -> mem_hotplug_done()                     |
                                             | kmemleak_scan()
                                             | -> get_online_mems()
                                             |    ...
-> mhp_deinit_memmap_on_memory()             |
   [not protected by                         |
    mem_hotplug_begin/done()]                |
   Marks memory section as offline,          |   Retrieves zone_start_pfn
   poisons vmemmap struct pages and updates  |   and struct page members.
   the zone related data                     |
                                             |    ...
                                             | -> put_online_mems()

Fix this by ensuring mem_hotplug_lock is taken before performing
mhp_init_memmap_on_memory(). Also ensure that
mhp_deinit_memmap_on_memory() holds the lock.

online/offline_pages() are currently only called from
memory_block_online/offline(), so it is safe to move the locking there.

Link: https://lkml.kernel.org/r/20231120145354.308999-2-sumanthk@linux.ibm.com
Fixes: a08a2ae34613 ("mm,memory_hotplug: allocate memmap from the added memory range")
Signed-off-by: Sumanth Korikkar
Reviewed-by: Gerald Schaefer
Acked-by: David Hildenbrand
Cc: Alexander Gordeev
Cc: Aneesh Kumar K.V
Cc: Anshuman Khandual
Cc: Heiko Carstens
Cc: Michal Hocko
Cc: Oscar Salvador
Cc: Vasily Gorbik
Cc: kernel test robot
Cc: [5.15+]
Signed-off-by: Andrew Morton
Signed-off-by: Sasha Levin
---
 drivers/base/memory.c | 18 +++++++++++++++---
 mm/memory_hotplug.c   | 13 ++++++-------
 2 files changed, 21 insertions(+), 10 deletions(-)

diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 9aa0da991cfb..5d39f3e374da 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -175,6 +175,9 @@ int memory_notify(unsigned long val, void *v)
 	return blocking_notifier_call_chain(&memory_chain, val, v);
 }
 
+/*
+ * Must acquire mem_hotplug_lock in write mode.
+ */
 static int memory_block_online(struct memory_block *mem)
 {
 	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
@@ -193,10 +196,11 @@ static int memory_block_online(struct memory_block *mem)
 	 * stage helps to keep accounting easier to follow - e.g vmemmaps
 	 * belong to the same zone as the memory they backed.
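+	 * Taking mem_hotplug_lock in write mode here, rather than down
+	 * in online_pages(), also covers the mhp_init_memmap_on_memory()
+	 * call below.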
*/ + mem_hotplug_begin(); if (nr_vmemmap_pages) { ret = mhp_init_memmap_on_memory(start_pfn, nr_vmemmap_pages, zone); if (ret) - return ret; + goto out; } ret = online_pages(start_pfn + nr_vmemmap_pages, @@ -204,7 +208,7 @@ static int memory_block_online(struct memory_block *mem) if (ret) { if (nr_vmemmap_pages) mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages); - return ret; + goto out; } /* @@ -216,9 +220,14 @@ static int memory_block_online(struct memory_block *mem) nr_vmemmap_pages); mem->zone = zone; +out: + mem_hotplug_done(); return ret; } +/* + * Must acquire mem_hotplug_lock in write mode. + */ static int memory_block_offline(struct memory_block *mem) { unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr); @@ -233,6 +242,7 @@ static int memory_block_offline(struct memory_block *mem) * Unaccount before offlining, such that unpopulated zone and kthreads * can properly be torn down in offline_pages(). */ + mem_hotplug_begin(); if (nr_vmemmap_pages) adjust_present_page_count(pfn_to_page(start_pfn), mem->group, -nr_vmemmap_pages); @@ -244,13 +254,15 @@ static int memory_block_offline(struct memory_block *mem) if (nr_vmemmap_pages) adjust_present_page_count(pfn_to_page(start_pfn), mem->group, nr_vmemmap_pages); - return ret; + goto out; } if (nr_vmemmap_pages) mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages); mem->zone = NULL; +out: + mem_hotplug_done(); return ret; } diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index bd2570b4f9b7..d02722bbfcf3 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1069,6 +1069,9 @@ void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages) kasan_remove_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages)); } +/* + * Must be called with mem_hotplug_lock in write mode. + */ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, struct zone *zone, struct memory_group *group) { @@ -1089,7 +1092,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, !IS_ALIGNED(pfn + nr_pages, PAGES_PER_SECTION))) return -EINVAL; - mem_hotplug_begin(); /* associate pfn range with the zone */ move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_ISOLATE); @@ -1148,7 +1150,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, writeback_set_ratelimit(); memory_notify(MEM_ONLINE, &arg); - mem_hotplug_done(); return 0; failed_addition: @@ -1157,7 +1158,6 @@ failed_addition: (((unsigned long long) pfn + nr_pages) << PAGE_SHIFT) - 1); memory_notify(MEM_CANCEL_ONLINE, &arg); remove_pfn_range_from_zone(zone, pfn, nr_pages); - mem_hotplug_done(); return ret; } @@ -1787,6 +1787,9 @@ static int count_system_ram_pages_cb(unsigned long start_pfn, return 0; } +/* + * Must be called with mem_hotplug_lock in write mode. + */ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages, struct zone *zone, struct memory_group *group) { @@ -1809,8 +1812,6 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages, !IS_ALIGNED(start_pfn + nr_pages, PAGES_PER_SECTION))) return -EINVAL; - mem_hotplug_begin(); - /* * Don't allow to offline memory blocks that contain holes. 
* Consequently, memory blocks with holes can never get onlined
@@ -1946,7 +1947,6 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
 
 	memory_notify(MEM_OFFLINE, &arg);
 	remove_pfn_range_from_zone(zone, start_pfn, nr_pages);
-	mem_hotplug_done();
 	return 0;
 
 failed_removal_isolated:
@@ -1961,7 +1961,6 @@ failed_removal:
 		 (unsigned long long) start_pfn << PAGE_SHIFT,
 		 ((unsigned long long) end_pfn << PAGE_SHIFT) - 1,
 		 reason);
-	mem_hotplug_done();
 	return ret;
 }

From d49bf9c1ceb3bde36c91a01dcc2c54cf1bbe3c7a Mon Sep 17 00:00:00 2001
From: Sumanth Korikkar
Date: Mon, 20 Nov 2023 15:53:53 +0100
Subject: [PATCH 119/151] mm/memory_hotplug: fix error handling in add_memory_resource()

[ Upstream commit f42ce5f087eb69e47294ababd2e7e6f88a82d308 ]

In add_memory_resource(), creation of memory block devices occurs after
successful call to arch_add_memory(). However, creation of memory block
devices could fail. In that case, arch_remove_memory() is called to
perform necessary cleanup.

Currently with or without altmap support, arch_remove_memory() is
always passed with altmap set to NULL during error handling. This leads
to freeing of struct pages using free_pages(), even though the
allocation might have been performed with altmap support via
altmap_alloc_block_buf().

Fix the error handling by passing altmap in arch_remove_memory(). This
ensures the following:
* When altmap is disabled, deallocation of the struct pages array
  occurs via free_pages().
* When altmap is enabled, deallocation occurs via vmem_altmap_free().

Link: https://lkml.kernel.org/r/20231120145354.308999-3-sumanthk@linux.ibm.com
Fixes: a08a2ae34613 ("mm,memory_hotplug: allocate memmap from the added memory range")
Signed-off-by: Sumanth Korikkar
Reviewed-by: Gerald Schaefer
Acked-by: David Hildenbrand
Cc: Alexander Gordeev
Cc: Aneesh Kumar K.V
Cc: Anshuman Khandual
Cc: Heiko Carstens
Cc: kernel test robot
Cc: Michal Hocko
Cc: Oscar Salvador
Cc: Vasily Gorbik
Cc: [5.15+]
Signed-off-by: Andrew Morton
Signed-off-by: Sasha Levin
---
 mm/memory_hotplug.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index d02722bbfcf3..3b9d3a4b4386 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1382,7 +1382,7 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
 	ret = create_memory_block_devices(start, size, mhp_altmap.alloc,
 					  group);
 	if (ret) {
-		arch_remove_memory(start, size, NULL);
+		arch_remove_memory(start, size, params.altmap);
 		goto error;
 	}

From e681f711e9e8ee0d70151711364cbff5394a8660 Mon Sep 17 00:00:00 2001
From: Xin Long
Date: Sun, 6 Nov 2022 15:34:16 -0500
Subject: [PATCH 120/151] net: sched: call tcf_ct_params_free to free params
 in tcf_ct_init

[ Upstream commit 1913894100ca53205f2d56091cb34b8eba1de217 ]

This patch simplifies the error path by calling tcf_ct_params_free(),
so that nothing breaks when more members that need freeing on the
error path are added to the params struct.
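In miniature, the resulting pattern is one teardown helper shared by
the RCU callback and the error path; roughly (a simplified sketch
mirroring the hunks below):

	static void tcf_ct_params_free(struct tcf_ct_params *params)
	{
		/* every member is released in exactly one place */
		if (params->ct_ft)
			tcf_ct_flow_table_put(params->ct_ft);
		if (params->tmpl)
			nf_ct_put(params->tmpl);
		kfree(params);
	}

	static void tcf_ct_params_free_rcu(struct rcu_head *head)
	{
		/* the RCU path funnels into the same helper */
		tcf_ct_params_free(container_of(head, struct tcf_ct_params, rcu));
	}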
Acked-by: Marcelo Ricardo Leitner Signed-off-by: Xin Long Signed-off-by: Paolo Abeni Stable-dep-of: 125f1c7f26ff ("net/sched: act_ct: Take per-cb reference to tcf_ct_flow_table") Signed-off-by: Sasha Levin --- net/sched/act_ct.c | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index 4c7f7861ea96..478cedc29b73 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -345,11 +345,9 @@ static void tcf_ct_flow_table_cleanup_work(struct work_struct *work) module_put(THIS_MODULE); } -static void tcf_ct_flow_table_put(struct tcf_ct_params *params) +static void tcf_ct_flow_table_put(struct tcf_ct_flow_table *ct_ft) { - struct tcf_ct_flow_table *ct_ft = params->ct_ft; - - if (refcount_dec_and_test(¶ms->ct_ft->ref)) { + if (refcount_dec_and_test(&ct_ft->ref)) { rhashtable_remove_fast(&zones_ht, &ct_ft->node, zones_params); INIT_RCU_WORK(&ct_ft->rwork, tcf_ct_flow_table_cleanup_work); queue_rcu_work(act_ct_wq, &ct_ft->rwork); @@ -832,18 +830,23 @@ out_free: return err; } -static void tcf_ct_params_free(struct rcu_head *head) +static void tcf_ct_params_free(struct tcf_ct_params *params) { - struct tcf_ct_params *params = container_of(head, - struct tcf_ct_params, rcu); - - tcf_ct_flow_table_put(params); - + if (params->ct_ft) + tcf_ct_flow_table_put(params->ct_ft); if (params->tmpl) nf_ct_put(params->tmpl); kfree(params); } +static void tcf_ct_params_free_rcu(struct rcu_head *head) +{ + struct tcf_ct_params *params; + + params = container_of(head, struct tcf_ct_params, rcu); + tcf_ct_params_free(params); +} + #if IS_ENABLED(CONFIG_NF_NAT) /* Modelled after nf_nat_ipv[46]_fn(). * range is only used for new, uninitialized NAT state. @@ -1390,7 +1393,7 @@ static int tcf_ct_init(struct net *net, struct nlattr *nla, err = tcf_ct_flow_table_get(net, params); if (err) - goto cleanup_params; + goto cleanup; spin_lock_bh(&c->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); @@ -1401,17 +1404,15 @@ static int tcf_ct_init(struct net *net, struct nlattr *nla, if (goto_ch) tcf_chain_put_by_act(goto_ch); if (params) - call_rcu(¶ms->rcu, tcf_ct_params_free); + call_rcu(¶ms->rcu, tcf_ct_params_free_rcu); return res; -cleanup_params: - if (params->tmpl) - nf_ct_put(params->tmpl); cleanup: if (goto_ch) tcf_chain_put_by_act(goto_ch); - kfree(params); + if (params) + tcf_ct_params_free(params); tcf_idr_release(*a, bind); return err; } @@ -1423,7 +1424,7 @@ static void tcf_ct_cleanup(struct tc_action *a) params = rcu_dereference_protected(c->params, 1); if (params) - call_rcu(¶ms->rcu, tcf_ct_params_free); + call_rcu(¶ms->rcu, tcf_ct_params_free_rcu); } static int tcf_ct_dump_key_val(struct sk_buff *skb, From c29a7656f8a2a2386ce495892aff3d4b26304667 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Wed, 1 Feb 2023 17:30:56 +0100 Subject: [PATCH 121/151] netfilter: flowtable: allow unidirectional rules [ Upstream commit 8f84780b84d645d6e35467f4a6f3236b20d7f4b2 ] Modify flow table offload to support unidirectional connections by extending enum nf_flow_flags with new "NF_FLOW_HW_BIDIRECTIONAL" flag. Only offload reply direction when the flag is set. This infrastructure change is necessary to support offloading UDP NEW connections in original direction in following patches in series. Signed-off-by: Vlad Buslov Signed-off-by: David S. 
Miller Stable-dep-of: 125f1c7f26ff ("net/sched: act_ct: Take per-cb reference to tcf_ct_flow_table") Signed-off-by: Sasha Levin --- include/net/netfilter/nf_flow_table.h | 1 + net/netfilter/nf_flow_table_offload.c | 12 ++++++++---- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index cd982f4a0f50..88ab98ab41d9 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -164,6 +164,7 @@ enum nf_flow_flags { NF_FLOW_HW_DYING, NF_FLOW_HW_DEAD, NF_FLOW_HW_PENDING, + NF_FLOW_HW_BIDIRECTIONAL, }; enum flow_offload_type { diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index 4d9b99abe37d..8b852f10fab4 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -895,8 +895,9 @@ static int flow_offload_rule_add(struct flow_offload_work *offload, ok_count += flow_offload_tuple_add(offload, flow_rule[0], FLOW_OFFLOAD_DIR_ORIGINAL); - ok_count += flow_offload_tuple_add(offload, flow_rule[1], - FLOW_OFFLOAD_DIR_REPLY); + if (test_bit(NF_FLOW_HW_BIDIRECTIONAL, &offload->flow->flags)) + ok_count += flow_offload_tuple_add(offload, flow_rule[1], + FLOW_OFFLOAD_DIR_REPLY); if (ok_count == 0) return -ENOENT; @@ -926,7 +927,8 @@ static void flow_offload_work_del(struct flow_offload_work *offload) { clear_bit(IPS_HW_OFFLOAD_BIT, &offload->flow->ct->status); flow_offload_tuple_del(offload, FLOW_OFFLOAD_DIR_ORIGINAL); - flow_offload_tuple_del(offload, FLOW_OFFLOAD_DIR_REPLY); + if (test_bit(NF_FLOW_HW_BIDIRECTIONAL, &offload->flow->flags)) + flow_offload_tuple_del(offload, FLOW_OFFLOAD_DIR_REPLY); set_bit(NF_FLOW_HW_DEAD, &offload->flow->flags); } @@ -946,7 +948,9 @@ static void flow_offload_work_stats(struct flow_offload_work *offload) u64 lastused; flow_offload_tuple_stats(offload, FLOW_OFFLOAD_DIR_ORIGINAL, &stats[0]); - flow_offload_tuple_stats(offload, FLOW_OFFLOAD_DIR_REPLY, &stats[1]); + if (test_bit(NF_FLOW_HW_BIDIRECTIONAL, &offload->flow->flags)) + flow_offload_tuple_stats(offload, FLOW_OFFLOAD_DIR_REPLY, + &stats[1]); lastused = max_t(u64, stats[0].lastused, stats[1].lastused); offload->flow->timeout = max_t(u64, offload->flow->timeout, From 8b160f2fba777a27b912bcd2488e95d21f6936f0 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Wed, 1 Feb 2023 17:30:57 +0100 Subject: [PATCH 122/151] netfilter: flowtable: cache info of last offload [ Upstream commit 1a441a9b8be8849957a01413a144f84932c324cb ] Modify flow table offload to cache the last ct info status that was passed to the driver offload callbacks by extending enum nf_flow_flags with new "NF_FLOW_HW_ESTABLISHED" flag. Set the flag if ctinfo was 'established' during last act_ct meta actions fill call. This infrastructure change is necessary to optimize promoting of UDP connections from 'new' to 'established' in following patches in this series. Signed-off-by: Vlad Buslov Signed-off-by: David S. 
Miller Stable-dep-of: 125f1c7f26ff ("net/sched: act_ct: Take per-cb reference to tcf_ct_flow_table") Signed-off-by: Sasha Levin --- include/net/netfilter/nf_flow_table.h | 7 ++++--- net/netfilter/nf_flow_table_inet.c | 2 +- net/netfilter/nf_flow_table_offload.c | 6 +++--- net/sched/act_ct.c | 12 +++++++----- 4 files changed, 15 insertions(+), 12 deletions(-) diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index 88ab98ab41d9..ebb28ec5b6fa 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -57,7 +57,7 @@ struct nf_flowtable_type { struct net_device *dev, enum flow_block_command cmd); int (*action)(struct net *net, - const struct flow_offload *flow, + struct flow_offload *flow, enum flow_offload_tuple_dir dir, struct nf_flow_rule *flow_rule); void (*free)(struct nf_flowtable *ft); @@ -165,6 +165,7 @@ enum nf_flow_flags { NF_FLOW_HW_DEAD, NF_FLOW_HW_PENDING, NF_FLOW_HW_BIDIRECTIONAL, + NF_FLOW_HW_ESTABLISHED, }; enum flow_offload_type { @@ -313,10 +314,10 @@ void nf_flow_table_offload_flush_cleanup(struct nf_flowtable *flowtable); int nf_flow_table_offload_setup(struct nf_flowtable *flowtable, struct net_device *dev, enum flow_block_command cmd); -int nf_flow_rule_route_ipv4(struct net *net, const struct flow_offload *flow, +int nf_flow_rule_route_ipv4(struct net *net, struct flow_offload *flow, enum flow_offload_tuple_dir dir, struct nf_flow_rule *flow_rule); -int nf_flow_rule_route_ipv6(struct net *net, const struct flow_offload *flow, +int nf_flow_rule_route_ipv6(struct net *net, struct flow_offload *flow, enum flow_offload_tuple_dir dir, struct nf_flow_rule *flow_rule); diff --git a/net/netfilter/nf_flow_table_inet.c b/net/netfilter/nf_flow_table_inet.c index 0ccabf3fa6aa..9505f9d188ff 100644 --- a/net/netfilter/nf_flow_table_inet.c +++ b/net/netfilter/nf_flow_table_inet.c @@ -39,7 +39,7 @@ nf_flow_offload_inet_hook(void *priv, struct sk_buff *skb, } static int nf_flow_rule_route_inet(struct net *net, - const struct flow_offload *flow, + struct flow_offload *flow, enum flow_offload_tuple_dir dir, struct nf_flow_rule *flow_rule) { diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index 8b852f10fab4..1c26f03fc661 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -679,7 +679,7 @@ nf_flow_rule_route_common(struct net *net, const struct flow_offload *flow, return 0; } -int nf_flow_rule_route_ipv4(struct net *net, const struct flow_offload *flow, +int nf_flow_rule_route_ipv4(struct net *net, struct flow_offload *flow, enum flow_offload_tuple_dir dir, struct nf_flow_rule *flow_rule) { @@ -704,7 +704,7 @@ int nf_flow_rule_route_ipv4(struct net *net, const struct flow_offload *flow, } EXPORT_SYMBOL_GPL(nf_flow_rule_route_ipv4); -int nf_flow_rule_route_ipv6(struct net *net, const struct flow_offload *flow, +int nf_flow_rule_route_ipv6(struct net *net, struct flow_offload *flow, enum flow_offload_tuple_dir dir, struct nf_flow_rule *flow_rule) { @@ -735,7 +735,7 @@ nf_flow_offload_rule_alloc(struct net *net, { const struct nf_flowtable *flowtable = offload->flowtable; const struct flow_offload_tuple *tuple, *other_tuple; - const struct flow_offload *flow = offload->flow; + struct flow_offload *flow = offload->flow; struct dst_entry *other_dst = NULL; struct nf_flow_rule *flow_rule; int err = -ENOMEM; diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index 478cedc29b73..86d269724485 100644 --- a/net/sched/act_ct.c +++ 
b/net/sched/act_ct.c
@@ -168,11 +168,11 @@ tcf_ct_flow_table_add_action_nat_udp(const struct nf_conntrack_tuple *tuple,
 
 static void tcf_ct_flow_table_add_action_meta(struct nf_conn *ct,
 					      enum ip_conntrack_dir dir,
+					      enum ip_conntrack_info ctinfo,
 					      struct flow_action *action)
 {
 	struct nf_conn_labels *ct_labels;
 	struct flow_action_entry *entry;
-	enum ip_conntrack_info ctinfo;
 	u32 *act_ct_labels;
 
 	entry = tcf_ct_flow_table_flow_action_get_next(action);
@@ -180,8 +180,6 @@ static void tcf_ct_flow_table_add_action_meta(struct nf_conn *ct,
 #if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)
 	entry->ct_metadata.mark = READ_ONCE(ct->mark);
 #endif
-	ctinfo = dir == IP_CT_DIR_ORIGINAL ? IP_CT_ESTABLISHED :
-					     IP_CT_ESTABLISHED_REPLY;
 	/* aligns with the CT reference on the SKB nf_ct_set */
 	entry->ct_metadata.cookie = (unsigned long)ct | ctinfo;
 	entry->ct_metadata.orig_dir = dir == IP_CT_DIR_ORIGINAL;
@@ -235,22 +233,26 @@ static int tcf_ct_flow_table_add_action_nat(struct net *net,
 }
 
 static int tcf_ct_flow_table_fill_actions(struct net *net,
-					  const struct flow_offload *flow,
+					  struct flow_offload *flow,
 					  enum flow_offload_tuple_dir tdir,
 					  struct nf_flow_rule *flow_rule)
 {
 	struct flow_action *action = &flow_rule->rule->action;
 	int num_entries = action->num_entries;
 	struct nf_conn *ct = flow->ct;
+	enum ip_conntrack_info ctinfo;
 	enum ip_conntrack_dir dir;
 	int i, err;
 
 	switch (tdir) {
 	case FLOW_OFFLOAD_DIR_ORIGINAL:
 		dir = IP_CT_DIR_ORIGINAL;
+		ctinfo = IP_CT_ESTABLISHED;
+		set_bit(NF_FLOW_HW_ESTABLISHED, &flow->flags);
 		break;
 	case FLOW_OFFLOAD_DIR_REPLY:
 		dir = IP_CT_DIR_REPLY;
+		ctinfo = IP_CT_ESTABLISHED_REPLY;
 		break;
 	default:
 		return -EOPNOTSUPP;
@@ -260,7 +262,7 @@ static int tcf_ct_flow_table_fill_actions(struct net *net,
 	if (err)
 		goto err_nat;
 
-	tcf_ct_flow_table_add_action_meta(ct, dir, action);
+	tcf_ct_flow_table_add_action_meta(ct, dir, ctinfo, action);
 	return 0;
 
 err_nat:

From 87466a374571f212caaecf60216ef213299d7fe8 Mon Sep 17 00:00:00 2001
From: Vlad Buslov
Date: Wed, 1 Feb 2023 17:30:59 +0100
Subject: [PATCH 123/151] net/sched: act_ct: offload UDP NEW connections

[ Upstream commit 6a9bad0069cf306f3df6ac53cf02438d4e15f296 ]

Modify the offload algorithm of UDP connections to the following:

- Offload NEW connections as unidirectional.

- When the connection state changes to ESTABLISHED, also update the
  hardware flow. However, in order to prevent act_ct from spamming the
  offload add workqueue for every packet coming in the reply direction
  while in this state, verify in the drivers whether the connection has
  already been updated to ESTABLISHED. If that is the case, then skip
  the flow_table and let conntrack handle such packets, which will also
  allow conntrack to potentially promote the connection to ASSURED.

- When the connection state changes to ASSURED, set the flow_table flow
  NF_FLOW_HW_BIDIRECTIONAL flag, which will cause the refresh mechanism
  to offload the reply direction.

All other protocols have their offload algorithm preserved and are
always offloaded as bidirectional.

Note that this change tries to minimize the load on the flow_table add
workqueue. First, it tracks the last ctinfo that was offloaded by using
the new 'NF_FLOW_HW_ESTABLISHED' flow flag and doesn't schedule the
refresh for reply direction packets when the offloads have already been
updated with the current ctinfo.
Second, when the 'add' task executes on the workqueue it always updates
the offload with the current flow state (by checking the
'bidirectional' flow flag and obtaining the actual ctinfo/cookie
through the meta action instead of caching any of these from the moment
of scheduling the 'add' work), preventing the need to schedule more
updates if the state changed concurrently while the 'add' work was
pending on the workqueue.

Signed-off-by: Vlad Buslov
Signed-off-by: David S. Miller
Stable-dep-of: 125f1c7f26ff ("net/sched: act_ct: Take per-cb reference to tcf_ct_flow_table")
Signed-off-by: Sasha Levin
---
 net/sched/act_ct.c | 51 +++++++++++++++++++++++++++++-----------
 1 file changed, 39 insertions(+), 12 deletions(-)

diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
index 86d269724485..3c063065f125 100644
--- a/net/sched/act_ct.c
+++ b/net/sched/act_ct.c
@@ -365,7 +365,7 @@ static void tcf_ct_flow_tc_ifidx(struct flow_offload *entry,
 
 static void tcf_ct_flow_table_add(struct tcf_ct_flow_table *ct_ft,
 				  struct nf_conn *ct,
-				  bool tcp)
+				  bool tcp, bool bidirectional)
 {
 	struct nf_conn_act_ct_ext *act_ct_ext;
 	struct flow_offload *entry;
@@ -384,6 +384,8 @@ static void tcf_ct_flow_table_add(struct tcf_ct_flow_table *ct_ft,
 		ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_BE_LIBERAL;
 		ct->proto.tcp.seen[1].flags |= IP_CT_TCP_FLAG_BE_LIBERAL;
 	}
+	if (bidirectional)
+		__set_bit(NF_FLOW_HW_BIDIRECTIONAL, &entry->flags);
 
 	act_ct_ext = nf_conn_act_ct_ext_find(ct);
 	if (act_ct_ext) {
@@ -407,26 +409,34 @@ static void tcf_ct_flow_table_process_conn(struct tcf_ct_flow_table *ct_ft,
 					   struct nf_conn *ct,
 					   enum ip_conntrack_info ctinfo)
 {
-	bool tcp = false;
-
-	if ((ctinfo != IP_CT_ESTABLISHED && ctinfo != IP_CT_ESTABLISHED_REPLY) ||
-	    !test_bit(IPS_ASSURED_BIT, &ct->status))
-		return;
+	bool tcp = false, bidirectional = true;
 
 	switch (nf_ct_protonum(ct)) {
 	case IPPROTO_TCP:
-		tcp = true;
-		if (ct->proto.tcp.state != TCP_CONNTRACK_ESTABLISHED)
+		if ((ctinfo != IP_CT_ESTABLISHED &&
+		     ctinfo != IP_CT_ESTABLISHED_REPLY) ||
+		    !test_bit(IPS_ASSURED_BIT, &ct->status) ||
+		    ct->proto.tcp.state != TCP_CONNTRACK_ESTABLISHED)
 			return;
+
+		tcp = true;
 		break;
 	case IPPROTO_UDP:
+		if (!nf_ct_is_confirmed(ct))
+			return;
+		if (!test_bit(IPS_ASSURED_BIT, &ct->status))
+			bidirectional = false;
 		break;
 #ifdef CONFIG_NF_CT_PROTO_GRE
 	case IPPROTO_GRE: {
 		struct nf_conntrack_tuple *tuple;
 
-		if (ct->status & IPS_NAT_MASK)
+		if ((ctinfo != IP_CT_ESTABLISHED &&
+		     ctinfo != IP_CT_ESTABLISHED_REPLY) ||
+		    !test_bit(IPS_ASSURED_BIT, &ct->status) ||
+		    ct->status & IPS_NAT_MASK)
 			return;
+
 		tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
 		/* No support for GRE v1 */
 		if (tuple->src.u.gre.key || tuple->dst.u.gre.key)
@@ -442,7 +452,7 @@ static void tcf_ct_flow_table_process_conn(struct tcf_ct_flow_table *ct_ft,
 	    ct->status & IPS_SEQ_ADJUST)
 		return;
 
-	tcf_ct_flow_table_add(ct_ft, ct, tcp);
+	tcf_ct_flow_table_add(ct_ft, ct, tcp, bidirectional);
 }
 
 static bool
@@ -621,13 +631,30 @@ static bool tcf_ct_flow_table_lookup(struct tcf_ct_params *p,
 	flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
 	ct = flow->ct;
 
+	if (dir == FLOW_OFFLOAD_DIR_REPLY &&
+	    !test_bit(NF_FLOW_HW_BIDIRECTIONAL, &flow->flags)) {
+		/* Only offload reply direction after connection became
+		 * assured.
+		 */
+		if (test_bit(IPS_ASSURED_BIT, &ct->status))
+			set_bit(NF_FLOW_HW_BIDIRECTIONAL, &flow->flags);
+		else if (test_bit(NF_FLOW_HW_ESTABLISHED, &flow->flags))
+			/* If flow_table flow has already been updated to the
+			 * established state, then don't refresh.
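+			 * Conntrack will then see the packet on the
+			 * classic path and may promote the connection
+			 * to ASSURED.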
+			 */
+			return false;
+	}
+
 	if (tcph && (unlikely(tcph->fin || tcph->rst))) {
 		flow_offload_teardown(flow);
 		return false;
 	}
 
-	ctinfo = dir == FLOW_OFFLOAD_DIR_ORIGINAL ? IP_CT_ESTABLISHED :
-						    IP_CT_ESTABLISHED_REPLY;
+	if (dir == FLOW_OFFLOAD_DIR_ORIGINAL)
+		ctinfo = test_bit(IPS_SEEN_REPLY_BIT, &ct->status) ?
+			IP_CT_ESTABLISHED : IP_CT_NEW;
+	else
+		ctinfo = IP_CT_ESTABLISHED_REPLY;
 
 	flow_offload_refresh(nf_ft, flow);
 	nf_conntrack_get(&ct->ct_general);

From df01de08b4118f19c87f23a72a0c4751b906d23b Mon Sep 17 00:00:00 2001
From: Paul Blakey
Date: Fri, 9 Jun 2023 15:22:59 +0300
Subject: [PATCH 124/151] net/sched: act_ct: Fix promotion of offloaded
 unreplied tuple

[ Upstream commit 41f2c7c342d3adb1c4dd5f2e3dd831adff16a669 ]

Currently UNREPLIED and UNASSURED connections are added to the nf flow
table. This causes subsequent connection packets to be processed by the
flow table, which then skips conntrack_in(), and thus such connections
will remain UNREPLIED and UNASSURED even if reply traffic is then seen.

Even so, the unoffloaded reply packets are the ones triggering the
hardware update from new to established state, and if there aren't any
to trigger an update and/or a previous update was missed, hardware can
get out of sync with software and still mark packets as new.

Fix the above by:
1) Not skipping conntrack_in() for UNASSURED packets, but still
   refreshing for hardware, as before the cited patch.
2) Trying to force a refresh by reply-direction packets that update the
   hardware rules from new to established state.
3) Removing any bidirectional flows that failed to update in hardware,
   for re-insertion as bidirectional once any new packet arrives.

Fixes: 6a9bad0069cf ("net/sched: act_ct: offload UDP NEW connections")
Co-developed-by: Vlad Buslov
Signed-off-by: Vlad Buslov
Signed-off-by: Paul Blakey
Reviewed-by: Florian Westphal
Link: https://lore.kernel.org/r/1686313379-117663-1-git-send-email-paulb@nvidia.com
Signed-off-by: Paolo Abeni
Stable-dep-of: 125f1c7f26ff ("net/sched: act_ct: Take per-cb reference to tcf_ct_flow_table")
Signed-off-by: Sasha Levin
---
 include/net/netfilter/nf_flow_table.h |  2 +-
 net/netfilter/nf_flow_table_core.c    | 13 ++++++++++---
 net/netfilter/nf_flow_table_ip.c      |  4 ++--
 net/sched/act_ct.c                    |  9 ++++++++-
 4 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h
index ebb28ec5b6fa..f37f9f34430c 100644
--- a/include/net/netfilter/nf_flow_table.h
+++ b/include/net/netfilter/nf_flow_table.h
@@ -268,7 +268,7 @@ int flow_offload_route_init(struct flow_offload *flow,
 
 int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow);
 void flow_offload_refresh(struct nf_flowtable *flow_table,
-			  struct flow_offload *flow);
+			  struct flow_offload *flow, bool force);
 
 struct flow_offload_tuple_rhash *flow_offload_lookup(struct nf_flowtable *flow_table,
 						     struct flow_offload_tuple *tuple);
diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c
index 81c26a96c30b..baddb93a5e8c 100644
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -314,12 +314,12 @@ int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow)
 EXPORT_SYMBOL_GPL(flow_offload_add);
 
 void flow_offload_refresh(struct nf_flowtable *flow_table,
-			  struct flow_offload *flow)
+			  struct flow_offload *flow, bool force)
 {
 	u32 timeout;
 
 	timeout = nf_flowtable_time_stamp + flow_offload_get_timeout(flow);
-	if (timeout - READ_ONCE(flow->timeout) > HZ)
+	if (force || timeout - READ_ONCE(flow->timeout) > HZ)
 		WRITE_ONCE(flow->timeout, timeout);
 	else
 		return;
@@ -331,6 +331,12 @@ void flow_offload_refresh(struct nf_flowtable *flow_table,
 }
 EXPORT_SYMBOL_GPL(flow_offload_refresh);
 
+static bool nf_flow_is_outdated(const struct flow_offload *flow)
+{
+	return test_bit(IPS_SEEN_REPLY_BIT, &flow->ct->status) &&
+		!test_bit(NF_FLOW_HW_ESTABLISHED, &flow->flags);
+}
+
 static inline bool nf_flow_has_expired(const struct flow_offload *flow)
 {
 	return nf_flow_timeout_delta(flow->timeout) <= 0;
@@ -420,7 +426,8 @@ static void nf_flow_offload_gc_step(struct nf_flowtable *flow_table,
 				    struct flow_offload *flow, void *data)
 {
 	if (nf_flow_has_expired(flow) ||
-	    nf_ct_is_dying(flow->ct))
+	    nf_ct_is_dying(flow->ct) ||
+	    nf_flow_is_outdated(flow))
 		flow_offload_teardown(flow);
 
 	if (test_bit(NF_FLOW_TEARDOWN, &flow->flags)) {
diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c
index b350fe9d00b0..6feaac9ab05c 100644
--- a/net/netfilter/nf_flow_table_ip.c
+++ b/net/netfilter/nf_flow_table_ip.c
@@ -384,7 +384,7 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
 	if (skb_try_make_writable(skb, thoff + hdrsize))
 		return NF_DROP;
 
-	flow_offload_refresh(flow_table, flow);
+	flow_offload_refresh(flow_table, flow, false);
 
 	nf_flow_encap_pop(skb, tuplehash);
 	thoff -= offset;
@@ -646,7 +646,7 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
 	if (skb_try_make_writable(skb, thoff + hdrsize))
 		return NF_DROP;
 
-	flow_offload_refresh(flow_table, flow);
+	flow_offload_refresh(flow_table, flow, false);
 
 	nf_flow_encap_pop(skb, tuplehash);
 
diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
index 3c063065f125..b80a58d3bf0f 100644
--- a/net/sched/act_ct.c
+++ b/net/sched/act_ct.c
@@ -606,6 +606,7 @@ static bool tcf_ct_flow_table_lookup(struct tcf_ct_params *p,
 	struct flow_offload_tuple tuple = {};
 	enum ip_conntrack_info ctinfo;
 	struct tcphdr *tcph = NULL;
+	bool force_refresh = false;
 	struct flow_offload *flow;
 	struct nf_conn *ct;
 	u8 dir;
@@ -643,6 +644,7 @@ static bool tcf_ct_flow_table_lookup(struct tcf_ct_params *p,
 		 * established state, then don't refresh.
 		 */
 			return false;
+		force_refresh = true;
 	}
 
 	if (tcph && (unlikely(tcph->fin || tcph->rst))) {
@@ -656,7 +658,12 @@ static bool tcf_ct_flow_table_lookup(struct tcf_ct_params *p,
 	else
 		ctinfo = IP_CT_ESTABLISHED_REPLY;
 
-	flow_offload_refresh(nf_ft, flow);
+	flow_offload_refresh(nf_ft, flow, force_refresh);
+	if (!test_bit(IPS_ASSURED_BIT, &ct->status)) {
+		/* Process this flow in SW to allow promoting to ASSURED */
+		return false;
+	}
+
 	nf_conntrack_get(&ct->ct_general);
 	nf_ct_set(skb, ct, ctinfo);
 	if (nf_ft->flags & NF_FLOWTABLE_COUNTER)

From 2bb4ecb3349c19a04e2219113b169646ca194608 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso
Date: Tue, 24 Oct 2023 21:09:47 +0200
Subject: [PATCH 125/151] netfilter: flowtable: GC pushes back packets to
 classic path

[ Upstream commit 735795f68b37e9bb49f642407a0d49b1631ea1c7 ]

Since 41f2c7c342d3 ("net/sched: act_ct: Fix promotion of offloaded
unreplied tuple"), flowtable GC pushes flows with IPS_SEEN_REPLY back
to the classic path in every run, ie. every second. This is because of
a new check for NF_FLOW_HW_ESTABLISHED, which is specific to
sched/act_ct.

In Netfilter's flowtable case, NF_FLOW_HW_ESTABLISHED never gets set,
and IPS_SEEN_REPLY is unreliable since users decide when to offload the
flow; such a bit might only be set at a later stage.
Fix it by adding a custom .gc handler that sched/act_ct can use to deal with its NF_FLOW_HW_ESTABLISHED bit. Fixes: 41f2c7c342d3 ("net/sched: act_ct: Fix promotion of offloaded unreplied tuple") Reported-by: Vladimir Smelhaus Reviewed-by: Paul Blakey Signed-off-by: Pablo Neira Ayuso Stable-dep-of: 125f1c7f26ff ("net/sched: act_ct: Take per-cb reference to tcf_ct_flow_table") Signed-off-by: Sasha Levin --- include/net/netfilter/nf_flow_table.h | 1 + net/netfilter/nf_flow_table_core.c | 14 +++++++------- net/sched/act_ct.c | 7 +++++++ 3 files changed, 15 insertions(+), 7 deletions(-) diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index f37f9f34430c..0b163ead95c9 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -53,6 +53,7 @@ struct nf_flowtable_type { struct list_head list; int family; int (*init)(struct nf_flowtable *ft); + bool (*gc)(const struct flow_offload *flow); int (*setup)(struct nf_flowtable *ft, struct net_device *dev, enum flow_block_command cmd); diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index baddb93a5e8c..c1d99cb370b4 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -331,12 +331,6 @@ void flow_offload_refresh(struct nf_flowtable *flow_table, } EXPORT_SYMBOL_GPL(flow_offload_refresh); -static bool nf_flow_is_outdated(const struct flow_offload *flow) -{ - return test_bit(IPS_SEEN_REPLY_BIT, &flow->ct->status) && - !test_bit(NF_FLOW_HW_ESTABLISHED, &flow->flags); -} - static inline bool nf_flow_has_expired(const struct flow_offload *flow) { return nf_flow_timeout_delta(flow->timeout) <= 0; @@ -422,12 +416,18 @@ nf_flow_table_iterate(struct nf_flowtable *flow_table, return err; } +static bool nf_flow_custom_gc(struct nf_flowtable *flow_table, + const struct flow_offload *flow) +{ + return flow_table->type->gc && flow_table->type->gc(flow); +} + static void nf_flow_offload_gc_step(struct nf_flowtable *flow_table, struct flow_offload *flow, void *data) { if (nf_flow_has_expired(flow) || nf_ct_is_dying(flow->ct) || - nf_flow_is_outdated(flow)) + nf_flow_custom_gc(flow_table, flow)) flow_offload_teardown(flow); if (test_bit(NF_FLOW_TEARDOWN, &flow->flags)) { diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index b80a58d3bf0f..4d34474f2cc0 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -274,7 +274,14 @@ err_nat: return err; } +static bool tcf_ct_flow_is_outdated(const struct flow_offload *flow) +{ + return test_bit(IPS_SEEN_REPLY_BIT, &flow->ct->status) && + !test_bit(NF_FLOW_HW_ESTABLISHED, &flow->flags); +} + static struct nf_flowtable_type flowtable_ct = { + .gc = tcf_ct_flow_is_outdated, .action = tcf_ct_flow_table_fill_actions, .owner = THIS_MODULE, }; From a29b15cc68a668abfc79e6c38766ee890b64cf59 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Tue, 5 Dec 2023 18:25:54 +0100 Subject: [PATCH 126/151] net/sched: act_ct: Take per-cb reference to tcf_ct_flow_table [ Upstream commit 125f1c7f26ffcdbf96177abe75b70c1a6ceb17bc ] The referenced change added custom cleanup code to act_ct to delete any callbacks registered on the parent block when deleting the tcf_ct_flow_table instance. 
However, the underlying issue is that the drivers don't obtain the reference to the tcf_ct_flow_table instance when registering callbacks which means that not only driver callbacks may still be on the table when deleting it but also that the driver can still have pointers to its internal nf_flowtable and can use it concurrently which results either warning in netfilter[0] or use-after-free. Fix the issue by taking a reference to the underlying struct tcf_ct_flow_table instance when registering the callback and release the reference when unregistering. Expose new API required for such reference counting by adding two new callbacks to nf_flowtable_type and implementing them for act_ct flowtable_ct type. This fixes the issue by extending the lifetime of nf_flowtable until all users have unregistered. [0]: [106170.938634] ------------[ cut here ]------------ [106170.939111] WARNING: CPU: 21 PID: 3688 at include/net/netfilter/nf_flow_table.h:262 mlx5_tc_ct_del_ft_cb+0x267/0x2b0 [mlx5_core] [106170.940108] Modules linked in: act_ct nf_flow_table act_mirred act_skbedit act_tunnel_key vxlan cls_matchall nfnetlink_cttimeout act_gact cls_flower sch_ingress mlx5_vdpa vringh vhost_iotlb vdpa bonding openvswitch nsh rpcrdma rdma_ucm ib_iser libiscsi scsi_transport_iscsi ib_umad rdma_cm ib_ipoib iw_cm ib_cm mlx5_ib ib_uverbs ib_core xt_MASQUERADE nf_conntrack_netlink nfnetlink iptable_nat xt_addrtype xt_conntrack nf_nat br_netfilter rpcsec_gss_krb5 auth_rpcgss oid_regis try overlay mlx5_core [106170.943496] CPU: 21 PID: 3688 Comm: kworker/u48:0 Not tainted 6.6.0-rc7_for_upstream_min_debug_2023_11_01_13_02 #1 [106170.944361] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 [106170.945292] Workqueue: mlx5e mlx5e_rep_neigh_update [mlx5_core] [106170.945846] RIP: 0010:mlx5_tc_ct_del_ft_cb+0x267/0x2b0 [mlx5_core] [106170.946413] Code: 89 ef 48 83 05 71 a4 14 00 01 e8 f4 06 04 e1 48 83 05 6c a4 14 00 01 48 83 c4 28 5b 5d 41 5c 41 5d c3 48 83 05 d1 8b 14 00 01 <0f> 0b 48 83 05 d7 8b 14 00 01 e9 96 fe ff ff 48 83 05 a2 90 14 00 [106170.947924] RSP: 0018:ffff88813ff0fcb8 EFLAGS: 00010202 [106170.948397] RAX: 0000000000000000 RBX: ffff88811eabac40 RCX: ffff88811eabad48 [106170.949040] RDX: ffff88811eab8000 RSI: ffffffffa02cd560 RDI: 0000000000000000 [106170.949679] RBP: ffff88811eab8000 R08: 0000000000000001 R09: ffffffffa0229700 [106170.950317] R10: ffff888103538fc0 R11: 0000000000000001 R12: ffff88811eabad58 [106170.950969] R13: ffff888110c01c00 R14: ffff888106b40000 R15: 0000000000000000 [106170.951616] FS: 0000000000000000(0000) GS:ffff88885fd40000(0000) knlGS:0000000000000000 [106170.952329] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [106170.952834] CR2: 00007f1cefd28cb0 CR3: 000000012181b006 CR4: 0000000000370ea0 [106170.953482] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [106170.954121] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [106170.954766] Call Trace: [106170.955057] [106170.955315] ? __warn+0x79/0x120 [106170.955648] ? mlx5_tc_ct_del_ft_cb+0x267/0x2b0 [mlx5_core] [106170.956172] ? report_bug+0x17c/0x190 [106170.956537] ? handle_bug+0x3c/0x60 [106170.956891] ? exc_invalid_op+0x14/0x70 [106170.957264] ? asm_exc_invalid_op+0x16/0x20 [106170.957666] ? mlx5_del_flow_rules+0x10/0x310 [mlx5_core] [106170.958172] ? mlx5_tc_ct_block_flow_offload_add+0x1240/0x1240 [mlx5_core] [106170.958788] ? mlx5_tc_ct_del_ft_cb+0x267/0x2b0 [mlx5_core] [106170.959339] ? 
mlx5_tc_ct_del_ft_cb+0xc6/0x2b0 [mlx5_core] [106170.959854] ? mapping_remove+0x154/0x1d0 [mlx5_core] [106170.960342] ? mlx5e_tc_action_miss_mapping_put+0x4f/0x80 [mlx5_core] [106170.960927] mlx5_tc_ct_delete_flow+0x76/0xc0 [mlx5_core] [106170.961441] mlx5_free_flow_attr_actions+0x13b/0x220 [mlx5_core] [106170.962001] mlx5e_tc_del_fdb_flow+0x22c/0x3b0 [mlx5_core] [106170.962524] mlx5e_tc_del_flow+0x95/0x3c0 [mlx5_core] [106170.963034] mlx5e_flow_put+0x73/0xe0 [mlx5_core] [106170.963506] mlx5e_put_flow_list+0x38/0x70 [mlx5_core] [106170.964002] mlx5e_rep_update_flows+0xec/0x290 [mlx5_core] [106170.964525] mlx5e_rep_neigh_update+0x1da/0x310 [mlx5_core] [106170.965056] process_one_work+0x13a/0x2c0 [106170.965443] worker_thread+0x2e5/0x3f0 [106170.965808] ? rescuer_thread+0x410/0x410 [106170.966192] kthread+0xc6/0xf0 [106170.966515] ? kthread_complete_and_exit+0x20/0x20 [106170.966970] ret_from_fork+0x2d/0x50 [106170.967332] ? kthread_complete_and_exit+0x20/0x20 [106170.967774] ret_from_fork_asm+0x11/0x20 [106170.970466] [106170.970726] ---[ end trace 0000000000000000 ]--- Fixes: 77ac5e40c44e ("net/sched: act_ct: remove and free nf_table callbacks") Signed-off-by: Vlad Buslov Reviewed-by: Paul Blakey Acked-by: Pablo Neira Ayuso Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- include/net/netfilter/nf_flow_table.h | 10 ++++++++ net/sched/act_ct.c | 34 ++++++++++++++++++++++----- 2 files changed, 38 insertions(+), 6 deletions(-) diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index 0b163ead95c9..dde4dd9c4012 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -62,6 +62,8 @@ struct nf_flowtable_type { enum flow_offload_tuple_dir dir, struct nf_flow_rule *flow_rule); void (*free)(struct nf_flowtable *ft); + void (*get)(struct nf_flowtable *ft); + void (*put)(struct nf_flowtable *ft); nf_hookfn *hook; struct module *owner; }; @@ -240,6 +242,11 @@ nf_flow_table_offload_add_cb(struct nf_flowtable *flow_table, } list_add_tail(&block_cb->list, &block->cb_list); + up_write(&flow_table->flow_block_lock); + + if (flow_table->type->get) + flow_table->type->get(flow_table); + return 0; unlock: up_write(&flow_table->flow_block_lock); @@ -262,6 +269,9 @@ nf_flow_table_offload_del_cb(struct nf_flowtable *flow_table, WARN_ON(true); } up_write(&flow_table->flow_block_lock); + + if (flow_table->type->put) + flow_table->type->put(flow_table); } int flow_offload_route_init(struct flow_offload *flow, diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index 4d34474f2cc0..faf798133059 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -280,9 +280,31 @@ static bool tcf_ct_flow_is_outdated(const struct flow_offload *flow) !test_bit(NF_FLOW_HW_ESTABLISHED, &flow->flags); } +static void tcf_ct_flow_table_get_ref(struct tcf_ct_flow_table *ct_ft); + +static void tcf_ct_nf_get(struct nf_flowtable *ft) +{ + struct tcf_ct_flow_table *ct_ft = + container_of(ft, struct tcf_ct_flow_table, nf_ft); + + tcf_ct_flow_table_get_ref(ct_ft); +} + +static void tcf_ct_flow_table_put(struct tcf_ct_flow_table *ct_ft); + +static void tcf_ct_nf_put(struct nf_flowtable *ft) +{ + struct tcf_ct_flow_table *ct_ft = + container_of(ft, struct tcf_ct_flow_table, nf_ft); + + tcf_ct_flow_table_put(ct_ft); +} + static struct nf_flowtable_type flowtable_ct = { .gc = tcf_ct_flow_is_outdated, .action = tcf_ct_flow_table_fill_actions, + .get = tcf_ct_nf_get, + .put = tcf_ct_nf_put, .owner = THIS_MODULE, }; @@ -331,9 +353,13 @@ err_alloc: return 
err;
 }
 
+static void tcf_ct_flow_table_get_ref(struct tcf_ct_flow_table *ct_ft)
+{
+	refcount_inc(&ct_ft->ref);
+}
+
 static void tcf_ct_flow_table_cleanup_work(struct work_struct *work)
 {
-	struct flow_block_cb *block_cb, *tmp_cb;
 	struct tcf_ct_flow_table *ct_ft;
 	struct flow_block *block;
@@ -341,13 +367,9 @@ static void tcf_ct_flow_table_cleanup_work(struct work_struct *work)
 			     rwork);
 	nf_flow_table_free(&ct_ft->nf_ft);
 
-	/* Remove any remaining callbacks before cleanup */
 	block = &ct_ft->nf_ft.flow_block;
 	down_write(&ct_ft->nf_ft.flow_block_lock);
-	list_for_each_entry_safe(block_cb, tmp_cb, &block->cb_list, list) {
-		list_del(&block_cb->list);
-		flow_block_cb_free(block_cb);
-	}
+	WARN_ON(!list_empty(&block->cb_list));
 	up_write(&ct_ft->nf_ft.flow_block_lock);
 
 	kfree(ct_ft);

From 7d3912613d5b045530e4096fbcef5b2f78e030a1 Mon Sep 17 00:00:00 2001
From: Hariprasad Kelam
Date: Fri, 8 Dec 2023 14:57:54 +0530
Subject: [PATCH 127/151] octeontx2-af: Fix pause frame configuration

[ Upstream commit e307b5a845c5951dabafc48d00b6424ee64716c4 ]

The current implementation's default Pause Forward setting is causing
unnecessary network traffic. This patch disables Pause Forward to
address this issue.

Fixes: 1121f6b02e7a ("octeontx2-af: Priority flow control configuration support")
Signed-off-by: Hariprasad Kelam
Signed-off-by: Sunil Kovvuri Goutham
Signed-off-by: David S. Miller
Signed-off-by: Sasha Levin
---
 drivers/net/ethernet/marvell/octeontx2/af/rpm.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rpm.c b/drivers/net/ethernet/marvell/octeontx2/af/rpm.c
index a70e1153fa04..6b4792a942d8 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rpm.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rpm.c
@@ -283,6 +283,11 @@ void rpm_lmac_pause_frm_config(void *rpmd, int lmac_id, bool enable)
 	cfg = FIELD_SET(RPM_PFC_CLASS_MASK, 0, cfg);
 	rpm_write(rpm, lmac_id, RPMX_CMRX_PRT_CBFC_CTL, cfg);
 
+	/* Disable forward pause to driver */
+	cfg = rpm_read(rpm, lmac_id, RPMX_MTI_MAC100X_COMMAND_CONFIG);
+	cfg &= ~RPMX_MTI_MAC100X_COMMAND_CONFIG_PAUSE_FWD;
+	rpm_write(rpm, lmac_id, RPMX_MTI_MAC100X_COMMAND_CONFIG, cfg);
+
 	/* Enable channel mask for all LMACS */
 	rpm_write(rpm, 0, RPMX_CMR_CHAN_MSK_OR, ~0ULL);
 }
@@ -451,12 +456,10 @@ int rpm_lmac_pfc_config(void *rpmd, int lmac_id, u8 tx_pause, u8 rx_pause, u16 p
 
 	if (rx_pause) {
 		cfg &= ~(RPMX_MTI_MAC100X_COMMAND_CONFIG_RX_P_DISABLE |
-				RPMX_MTI_MAC100X_COMMAND_CONFIG_PAUSE_IGNORE |
-				RPMX_MTI_MAC100X_COMMAND_CONFIG_PAUSE_FWD);
+				RPMX_MTI_MAC100X_COMMAND_CONFIG_PAUSE_IGNORE);
 	} else {
 		cfg |= (RPMX_MTI_MAC100X_COMMAND_CONFIG_RX_P_DISABLE |
-			RPMX_MTI_MAC100X_COMMAND_CONFIG_PAUSE_IGNORE |
-			RPMX_MTI_MAC100X_COMMAND_CONFIG_PAUSE_FWD);
+			RPMX_MTI_MAC100X_COMMAND_CONFIG_PAUSE_IGNORE);
 	}
 
 	if (tx_pause) {

From 0f74dde5be2c345333f5b5e903c01ae78c7b8505 Mon Sep 17 00:00:00 2001
From: Rakesh Babu Saladi
Date: Mon, 5 Dec 2022 12:35:18 +0530
Subject: [PATCH 128/151] octeontx2-af: Support variable number of lmacs

[ Upstream commit f2e664ad503d4e5ce7c42a0862ab164331a0ef37 ]

Most of the code in the CGX/RPM driver assumes that the maximum number
of lmacs per MAC is always 4 and that the number of MAC blocks is also
4. With this assumption, the maximum number of supported interfaces is
hardcoded to 16. This creates a problem, as the next-gen CN10KB silicon
MAC supports 8 lmacs per MAC block.
This patch solves the problem by using the "max lmac per MAC block"
value from the constant CSRs and by using the cgx_cnt_max value, which
is populated based on the number of MAC blocks supported by the
silicon.

Signed-off-by: Rakesh Babu Saladi
Signed-off-by: Hariprasad Kelam
Signed-off-by: Sunil Kovvuri Goutham
Signed-off-by: Paolo Abeni
Stable-dep-of: e307b5a845c5 ("octeontx2-af: Fix pause frame configuration")
Signed-off-by: Sasha Levin
---
 .../net/ethernet/marvell/octeontx2/af/cgx.c   | 35 ++++++++-----------
 .../net/ethernet/marvell/octeontx2/af/cgx.h   |  6 ++--
 .../marvell/octeontx2/af/lmac_common.h        |  5 ++-
 .../net/ethernet/marvell/octeontx2/af/rvu.h   |  2 +-
 .../ethernet/marvell/octeontx2/af/rvu_cgx.c   | 26 ++++++++------
 .../marvell/octeontx2/af/rvu_debugfs.c        |  2 +-
 .../ethernet/marvell/octeontx2/af/rvu_nix.c   |  2 +-
 .../marvell/octeontx2/af/rvu_npc_hash.c       |  4 ++-
 8 files changed, 42 insertions(+), 40 deletions(-)

diff --git a/drivers/net/ethernet/marvell/octeontx2/af/cgx.c b/drivers/net/ethernet/marvell/octeontx2/af/cgx.c
index 65c0373d34d1..90be87dc105d 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/cgx.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/cgx.c
@@ -78,7 +78,7 @@ static bool is_dev_rpm(void *cgxd)
 
 bool is_lmac_valid(struct cgx *cgx, int lmac_id)
 {
-	if (!cgx || lmac_id < 0 || lmac_id >= MAX_LMAC_PER_CGX)
+	if (!cgx || lmac_id < 0 || lmac_id >= cgx->max_lmac_per_mac)
 		return false;
 	return test_bit(lmac_id, &cgx->lmac_bmap);
 }
@@ -90,7 +90,7 @@ static int get_sequence_id_of_lmac(struct cgx *cgx, int lmac_id)
 {
 	int tmp, id = 0;
 
-	for_each_set_bit(tmp, &cgx->lmac_bmap, MAX_LMAC_PER_CGX) {
+	for_each_set_bit(tmp, &cgx->lmac_bmap, cgx->max_lmac_per_mac) {
 		if (tmp == lmac_id)
 			break;
 		id++;
@@ -121,7 +121,7 @@ u64 cgx_read(struct cgx *cgx, u64 lmac, u64 offset)
 
 struct lmac *lmac_pdata(u8 lmac_id, struct cgx *cgx)
 {
-	if (!cgx || lmac_id >= MAX_LMAC_PER_CGX)
+	if (!cgx || lmac_id >= cgx->max_lmac_per_mac)
 		return NULL;
 
 	return cgx->lmac_idmap[lmac_id];
@@ -1410,7 +1410,7 @@ int cgx_get_fwdata_base(u64 *base)
 	if (!cgx)
 		return -ENXIO;
 
-	first_lmac = find_first_bit(&cgx->lmac_bmap, MAX_LMAC_PER_CGX);
+	first_lmac = find_first_bit(&cgx->lmac_bmap, cgx->max_lmac_per_mac);
 	req = FIELD_SET(CMDREG_ID, CGX_CMD_GET_FWD_BASE, req);
 	err = cgx_fwi_cmd_generic(req, &resp, cgx, first_lmac);
 	if (!err)
@@ -1499,7 +1499,7 @@ static int cgx_fwi_link_change(struct cgx *cgx, int lmac_id, bool enable)
 
 static inline int cgx_fwi_read_version(u64 *resp, struct cgx *cgx)
 {
-	int first_lmac = find_first_bit(&cgx->lmac_bmap, MAX_LMAC_PER_CGX);
+	int first_lmac = find_first_bit(&cgx->lmac_bmap, cgx->max_lmac_per_mac);
 	u64 req = 0;
 
 	req = FIELD_SET(CMDREG_ID, CGX_CMD_GET_FW_VER, req);
@@ -1537,7 +1537,7 @@ static void cgx_lmac_linkup_work(struct work_struct *work)
 	int i, err;
 
 	/* Do Link up for all the enabled lmacs */
-	for_each_set_bit(i, &cgx->lmac_bmap, MAX_LMAC_PER_CGX) {
+	for_each_set_bit(i, &cgx->lmac_bmap, cgx->max_lmac_per_mac) {
 		err = cgx_fwi_link_change(cgx, i, true);
 		if (err)
 			dev_info(dev, "cgx port %d:%d Link up command failed\n",
@@ -1557,14 +1557,6 @@ int cgx_lmac_linkup_start(void *cgxd)
 	return 0;
 }
 
-static void cgx_lmac_get_fifolen(struct cgx *cgx)
-{
-	u64 cfg;
-
-	cfg = cgx_read(cgx, 0, CGX_CONST);
-	cgx->mac_ops->fifo_len = FIELD_GET(CGX_CONST_RXFIFO_SIZE, cfg);
-}
-
 static int cgx_configure_interrupt(struct cgx *cgx, struct lmac *lmac,
 				   int cnt, bool req_free)
 {
@@ -1619,17 +1611,14 @@ static int cgx_lmac_init(struct cgx *cgx)
 	u64 lmac_list;
 	int i, err;
 
-	cgx_lmac_get_fifolen(cgx);
-
-	cgx->lmac_count = 
cgx->mac_ops->get_nr_lmacs(cgx); /* lmac_list specifies which lmacs are enabled * when bit n is set to 1, LMAC[n] is enabled */ if (cgx->mac_ops->non_contiguous_serdes_lane) lmac_list = cgx_read(cgx, 0, CGXX_CMRX_RX_LMACS) & 0xFULL; - if (cgx->lmac_count > MAX_LMAC_PER_CGX) - cgx->lmac_count = MAX_LMAC_PER_CGX; + if (cgx->lmac_count > cgx->max_lmac_per_mac) + cgx->lmac_count = cgx->max_lmac_per_mac; for (i = 0; i < cgx->lmac_count; i++) { lmac = kzalloc(sizeof(struct lmac), GFP_KERNEL); @@ -1707,7 +1696,7 @@ static int cgx_lmac_exit(struct cgx *cgx) } /* Free all lmac related resources */ - for_each_set_bit(i, &cgx->lmac_bmap, MAX_LMAC_PER_CGX) { + for_each_set_bit(i, &cgx->lmac_bmap, cgx->max_lmac_per_mac) { lmac = cgx->lmac_idmap[i]; if (!lmac) continue; @@ -1723,6 +1712,12 @@ static int cgx_lmac_exit(struct cgx *cgx) static void cgx_populate_features(struct cgx *cgx) { + u64 cfg; + + cfg = cgx_read(cgx, 0, CGX_CONST); + cgx->mac_ops->fifo_len = FIELD_GET(CGX_CONST_RXFIFO_SIZE, cfg); + cgx->max_lmac_per_mac = FIELD_GET(CGX_CONST_MAX_LMACS, cfg); + if (is_dev_rpm(cgx)) cgx->hw_features = (RVU_LMAC_FEAT_DMACF | RVU_MAC_RPM | RVU_LMAC_FEAT_FC | RVU_LMAC_FEAT_PTP); diff --git a/drivers/net/ethernet/marvell/octeontx2/af/cgx.h b/drivers/net/ethernet/marvell/octeontx2/af/cgx.h index 04338db38671..09ddb00f63cc 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/cgx.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/cgx.h @@ -18,11 +18,8 @@ /* PCI BAR nos */ #define PCI_CFG_REG_BAR_NUM 0 -#define CGX_ID_MASK 0x7 -#define MAX_LMAC_PER_CGX 4 +#define CGX_ID_MASK 0xF #define MAX_DMAC_ENTRIES_PER_CGX 32 -#define CGX_FIFO_LEN 65536 /* 64K for both Rx & Tx */ -#define CGX_OFFSET(x) ((x) * MAX_LMAC_PER_CGX) /* Registers */ #define CGXX_CMRX_CFG 0x00 @@ -56,6 +53,7 @@ #define CGXX_SCRATCH1_REG 0x1058 #define CGX_CONST 0x2000 #define CGX_CONST_RXFIFO_SIZE GENMASK_ULL(23, 0) +#define CGX_CONST_MAX_LMACS GENMASK_ULL(31, 24) #define CGXX_SPUX_CONTROL1 0x10000 #define CGXX_SPUX_LNX_FEC_CORR_BLOCKS 0x10700 #define CGXX_SPUX_LNX_FEC_UNCORR_BLOCKS 0x10800 diff --git a/drivers/net/ethernet/marvell/octeontx2/af/lmac_common.h b/drivers/net/ethernet/marvell/octeontx2/af/lmac_common.h index 52b6016789fa..697cfec74aa1 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/lmac_common.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/lmac_common.h @@ -128,7 +128,10 @@ struct cgx { struct pci_dev *pdev; u8 cgx_id; u8 lmac_count; - struct lmac *lmac_idmap[MAX_LMAC_PER_CGX]; + /* number of LMACs per MAC could be 4 or 8 */ + u8 max_lmac_per_mac; +#define MAX_LMAC_COUNT 8 + struct lmac *lmac_idmap[MAX_LMAC_COUNT]; struct work_struct cgx_cmd_work; struct workqueue_struct *cgx_cmd_workq; struct list_head cgx_list; diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h index ab78e9d02075..0b76dfa979d4 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h @@ -480,7 +480,7 @@ struct rvu { u8 cgx_mapped_pfs; u8 cgx_cnt_max; /* CGX port count max */ u8 *pf2cgxlmac_map; /* pf to cgx_lmac map */ - u16 *cgxlmac2pf_map; /* bitmap of mapped pfs for + u64 *cgxlmac2pf_map; /* bitmap of mapped pfs for * every cgx lmac port */ unsigned long pf_notify_bmap; /* Flags for PF notification */ diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c index fa658bd4dfb3..bcb4385d0621 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c +++ 
b/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c @@ -55,8 +55,9 @@ bool is_mac_feature_supported(struct rvu *rvu, int pf, int feature) return (cgx_features_get(cgxd) & feature); } +#define CGX_OFFSET(x) ((x) * rvu->hw->lmac_per_cgx) /* Returns bitmap of mapped PFs */ -static u16 cgxlmac_to_pfmap(struct rvu *rvu, u8 cgx_id, u8 lmac_id) +static u64 cgxlmac_to_pfmap(struct rvu *rvu, u8 cgx_id, u8 lmac_id) { return rvu->cgxlmac2pf_map[CGX_OFFSET(cgx_id) + lmac_id]; } @@ -71,7 +72,8 @@ int cgxlmac_to_pf(struct rvu *rvu, int cgx_id, int lmac_id) if (!pfmap) return -ENODEV; else - return find_first_bit(&pfmap, 16); + return find_first_bit(&pfmap, + rvu->cgx_cnt_max * rvu->hw->lmac_per_cgx); } static u8 cgxlmac_id_to_bmap(u8 cgx_id, u8 lmac_id) @@ -129,14 +131,14 @@ static int rvu_map_cgx_lmac_pf(struct rvu *rvu) if (!cgx_cnt_max) return 0; - if (cgx_cnt_max > 0xF || MAX_LMAC_PER_CGX > 0xF) + if (cgx_cnt_max > 0xF || rvu->hw->lmac_per_cgx > 0xF) return -EINVAL; /* Alloc map table * An additional entry is required since PF id starts from 1 and * hence entry at offset 0 is invalid. */ - size = (cgx_cnt_max * MAX_LMAC_PER_CGX + 1) * sizeof(u8); + size = (cgx_cnt_max * rvu->hw->lmac_per_cgx + 1) * sizeof(u8); rvu->pf2cgxlmac_map = devm_kmalloc(rvu->dev, size, GFP_KERNEL); if (!rvu->pf2cgxlmac_map) return -ENOMEM; @@ -145,9 +147,10 @@ static int rvu_map_cgx_lmac_pf(struct rvu *rvu) memset(rvu->pf2cgxlmac_map, 0xFF, size); /* Reverse map table */ - rvu->cgxlmac2pf_map = devm_kzalloc(rvu->dev, - cgx_cnt_max * MAX_LMAC_PER_CGX * sizeof(u16), - GFP_KERNEL); + rvu->cgxlmac2pf_map = + devm_kzalloc(rvu->dev, + cgx_cnt_max * rvu->hw->lmac_per_cgx * sizeof(u64), + GFP_KERNEL); if (!rvu->cgxlmac2pf_map) return -ENOMEM; @@ -156,7 +159,7 @@ static int rvu_map_cgx_lmac_pf(struct rvu *rvu) if (!rvu_cgx_pdata(cgx, rvu)) continue; lmac_bmap = cgx_get_lmac_bmap(rvu_cgx_pdata(cgx, rvu)); - for_each_set_bit(iter, &lmac_bmap, MAX_LMAC_PER_CGX) { + for_each_set_bit(iter, &lmac_bmap, rvu->hw->lmac_per_cgx) { lmac = cgx_get_lmacid(rvu_cgx_pdata(cgx, rvu), iter); rvu->pf2cgxlmac_map[pf] = cgxlmac_id_to_bmap(cgx, lmac); @@ -235,7 +238,8 @@ static void cgx_notify_pfs(struct cgx_link_event *event, struct rvu *rvu) pfmap = cgxlmac_to_pfmap(rvu, event->cgx_id, event->lmac_id); do { - pfid = find_first_bit(&pfmap, 16); + pfid = find_first_bit(&pfmap, + rvu->cgx_cnt_max * rvu->hw->lmac_per_cgx); clear_bit(pfid, &pfmap); /* check if notification is enabled */ @@ -310,7 +314,7 @@ static int cgx_lmac_event_handler_init(struct rvu *rvu) if (!cgxd) continue; lmac_bmap = cgx_get_lmac_bmap(cgxd); - for_each_set_bit(lmac, &lmac_bmap, MAX_LMAC_PER_CGX) { + for_each_set_bit(lmac, &lmac_bmap, rvu->hw->lmac_per_cgx) { err = cgx_lmac_evh_register(&cb, cgxd, lmac); if (err) dev_err(rvu->dev, @@ -396,7 +400,7 @@ int rvu_cgx_exit(struct rvu *rvu) if (!cgxd) continue; lmac_bmap = cgx_get_lmac_bmap(cgxd); - for_each_set_bit(lmac, &lmac_bmap, MAX_LMAC_PER_CGX) + for_each_set_bit(lmac, &lmac_bmap, rvu->hw->lmac_per_cgx) cgx_lmac_evh_unregister(cgxd, lmac); } diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c index 5c9dc3f9262f..cc5d342e026c 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c @@ -2618,7 +2618,7 @@ static void rvu_dbg_cgx_init(struct rvu *rvu) rvu->rvu_dbg.cgx = debugfs_create_dir(dname, rvu->rvu_dbg.cgx_root); - for_each_set_bit(lmac_id, &lmac_bmap, MAX_LMAC_PER_CGX) { + 
for_each_set_bit(lmac_id, &lmac_bmap, rvu->hw->lmac_per_cgx) { /* lmac debugfs dir */ sprintf(dname, "lmac%d", lmac_id); rvu->rvu_dbg.lmac = diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c index 7f9581ce7f1f..bb99302eab67 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c @@ -4079,7 +4079,7 @@ static void nix_link_config(struct rvu *rvu, int blkaddr, /* Get LMAC id's from bitmap */ lmac_bmap = cgx_get_lmac_bmap(rvu_cgx_pdata(cgx, rvu)); - for_each_set_bit(iter, &lmac_bmap, MAX_LMAC_PER_CGX) { + for_each_set_bit(iter, &lmac_bmap, rvu->hw->lmac_per_cgx) { lmac_fifo_len = rvu_cgx_get_lmac_fifolen(rvu, cgx, iter); if (!lmac_fifo_len) { dev_err(rvu->dev, diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc_hash.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc_hash.c index 34fa59575fa9..54e0dfdc9d98 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc_hash.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc_hash.c @@ -1999,7 +1999,9 @@ int rvu_npc_exact_init(struct rvu *rvu) /* Install SDP drop rule */ drop_mcam_idx = &table->num_drop_rules; - max_lmac_cnt = rvu->cgx_cnt_max * MAX_LMAC_PER_CGX + PF_CGXMAP_BASE; + max_lmac_cnt = rvu->cgx_cnt_max * rvu->hw->lmac_per_cgx + + PF_CGXMAP_BASE; + for (i = PF_CGXMAP_BASE; i < max_lmac_cnt; i++) { if (rvu->pf2cgxlmac_map[i] == 0xFF) continue; From ab220f4f5c704a6165694efbbaa657ec17e498f4 Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Fri, 1 Dec 2023 13:00:10 -0800 Subject: [PATCH 129/151] btrfs: fix qgroup_free_reserved_data int overflow [ Upstream commit 9e65bfca24cf1d77e4a5c7a170db5867377b3fe7 ] The reserved data counter and input parameter is a u64, but we inadvertently accumulate it in an int. Overflowing that int results in freeing the wrong amount of data and breaking reserve accounting. Unfortunately, this overflow rot spreads from there, as the qgroup release/free functions rely on returning an int to take advantage of negative values for error codes. Therefore, the full fix is to return the "released" or "freed" amount by a u64 argument and to return 0 or negative error code via the return value. Most of the call sites simply ignore the return value, though some of them handle the error and count the returned bytes. Change all of them accordingly. 
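To illustrate the failure mode, a minimal userspace sketch (not btrfs
code; the function names are made up) of accumulating a u64 count in an
int versus returning it via a u64 out-parameter:

#include <stdint.h>
#include <stdio.h>

/* Buggy pattern: a u64 byte count accumulated in and returned as int. */
static int free_reserved_buggy(uint64_t len)
{
	int freed = 0;

	freed += len;		/* truncates/overflows for len >= 2^31 */
	return freed;		/* may come back negative or too small */
}

/* Fixed pattern: return 0/-errno, pass the count via a u64 pointer. */
static int free_reserved_fixed(uint64_t len, uint64_t *freed_ret)
{
	uint64_t freed = 0;

	freed += len;
	if (freed_ret)
		*freed_ret = freed;
	return 0;
}

int main(void)
{
	uint64_t freed;

	printf("buggy: %d\n", free_reserved_buggy(3ULL << 30));	/* wrong */
	free_reserved_fixed(3ULL << 30, &freed);
	printf("fixed: %llu\n", (unsigned long long)freed);	/* 3221225472 */
	return 0;
}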
CC: stable@vger.kernel.org # 6.1+ Reviewed-by: Qu Wenruo Signed-off-by: Boris Burkov Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Sasha Levin --- fs/btrfs/delalloc-space.c | 2 +- fs/btrfs/file.c | 2 +- fs/btrfs/inode.c | 16 ++++++++-------- fs/btrfs/ordered-data.c | 7 ++++--- fs/btrfs/qgroup.c | 25 +++++++++++++++---------- fs/btrfs/qgroup.h | 4 ++-- 6 files changed, 31 insertions(+), 25 deletions(-) diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index 0b62ce77053f..f2bc5563c0f9 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -197,7 +197,7 @@ void btrfs_free_reserved_data_space(struct btrfs_inode *inode, start = round_down(start, fs_info->sectorsize); btrfs_free_reserved_data_space_noquota(fs_info, len); - btrfs_qgroup_free_data(inode, reserved, start, len); + btrfs_qgroup_free_data(inode, reserved, start, len, NULL); } /** diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 0a46fff3dd06..1783a0fbf166 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -3191,7 +3191,7 @@ static long btrfs_fallocate(struct file *file, int mode, qgroup_reserved -= range->len; } else if (qgroup_reserved > 0) { btrfs_qgroup_free_data(BTRFS_I(inode), data_reserved, - range->start, range->len); + range->start, range->len, NULL); qgroup_reserved -= range->len; } list_del(&range->list); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 81eac121c6b2..9a7d77c410e2 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -466,7 +466,7 @@ out: * And at reserve time, it's always aligned to page size, so * just free one page here. */ - btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE); + btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE, NULL); btrfs_free_path(path); btrfs_end_transaction(trans); return ret; @@ -5372,7 +5372,7 @@ static void evict_inode_truncate_pages(struct inode *inode) */ if (state_flags & EXTENT_DELALLOC) btrfs_qgroup_free_data(BTRFS_I(inode), NULL, start, - end - start + 1); + end - start + 1, NULL); clear_extent_bit(io_tree, start, end, EXTENT_CLEAR_ALL_BITS | EXTENT_DO_ACCOUNTING, @@ -8440,7 +8440,7 @@ next: * reserved data space. * Since the IO will never happen for this page. 
*/ - btrfs_qgroup_free_data(inode, NULL, cur, range_end + 1 - cur); + btrfs_qgroup_free_data(inode, NULL, cur, range_end + 1 - cur, NULL); if (!inode_evicting) { clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_UPTODATE | @@ -9902,7 +9902,7 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent( struct btrfs_path *path; u64 start = ins->objectid; u64 len = ins->offset; - int qgroup_released; + u64 qgroup_released = 0; int ret; memset(&stack_fi, 0, sizeof(stack_fi)); @@ -9915,9 +9915,9 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent( btrfs_set_stack_file_extent_compression(&stack_fi, BTRFS_COMPRESS_NONE); /* Encryption and other encoding is reserved and all 0 */ - qgroup_released = btrfs_qgroup_release_data(inode, file_offset, len); - if (qgroup_released < 0) - return ERR_PTR(qgroup_released); + ret = btrfs_qgroup_release_data(inode, file_offset, len, &qgroup_released); + if (ret < 0) + return ERR_PTR(ret); if (trans) { ret = insert_reserved_file_extent(trans, inode, @@ -10903,7 +10903,7 @@ out_delalloc_release: btrfs_delalloc_release_metadata(inode, disk_num_bytes, ret < 0); out_qgroup_free_data: if (ret < 0) - btrfs_qgroup_free_data(inode, data_reserved, start, num_bytes); + btrfs_qgroup_free_data(inode, data_reserved, start, num_bytes, NULL); out_free_data_space: /* * If btrfs_reserve_extent() succeeded, then we already decremented diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 0321753c16b9..1b2af4785c0e 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -172,11 +172,12 @@ int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, struct rb_node *node; struct btrfs_ordered_extent *entry; int ret; + u64 qgroup_rsv = 0; if (flags & ((1 << BTRFS_ORDERED_NOCOW) | (1 << BTRFS_ORDERED_PREALLOC))) { /* For nocow write, we can release the qgroup rsv right now */ - ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes); + ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes, &qgroup_rsv); if (ret < 0) return ret; ret = 0; @@ -185,7 +186,7 @@ int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, * The ordered extent has reserved qgroup space, release now * and pass the reserved number for qgroup_record to free. 
*/ - ret = btrfs_qgroup_release_data(inode, file_offset, num_bytes); + ret = btrfs_qgroup_release_data(inode, file_offset, num_bytes, &qgroup_rsv); if (ret < 0) return ret; } @@ -203,7 +204,7 @@ int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, entry->inode = igrab(&inode->vfs_inode); entry->compress_type = compress_type; entry->truncated_len = (u64)-1; - entry->qgroup_rsv = ret; + entry->qgroup_rsv = qgroup_rsv; entry->physical = (u64)-1; ASSERT((flags & ~BTRFS_ORDERED_TYPE_FLAGS) == 0); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 26cabffd5971..96ec9ccc2ef6 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -3833,13 +3833,14 @@ int btrfs_qgroup_reserve_data(struct btrfs_inode *inode, /* Free ranges specified by @reserved, normally in error path */ static int qgroup_free_reserved_data(struct btrfs_inode *inode, - struct extent_changeset *reserved, u64 start, u64 len) + struct extent_changeset *reserved, + u64 start, u64 len, u64 *freed_ret) { struct btrfs_root *root = inode->root; struct ulist_node *unode; struct ulist_iterator uiter; struct extent_changeset changeset; - int freed = 0; + u64 freed = 0; int ret; extent_changeset_init(&changeset); @@ -3880,7 +3881,9 @@ static int qgroup_free_reserved_data(struct btrfs_inode *inode, } btrfs_qgroup_free_refroot(root->fs_info, root->root_key.objectid, freed, BTRFS_QGROUP_RSV_DATA); - ret = freed; + if (freed_ret) + *freed_ret = freed; + ret = 0; out: extent_changeset_release(&changeset); return ret; @@ -3888,7 +3891,7 @@ out: static int __btrfs_qgroup_release_data(struct btrfs_inode *inode, struct extent_changeset *reserved, u64 start, u64 len, - int free) + u64 *released, int free) { struct extent_changeset changeset; int trace_op = QGROUP_RELEASE; @@ -3900,7 +3903,7 @@ static int __btrfs_qgroup_release_data(struct btrfs_inode *inode, /* In release case, we shouldn't have @reserved */ WARN_ON(!free && reserved); if (free && reserved) - return qgroup_free_reserved_data(inode, reserved, start, len); + return qgroup_free_reserved_data(inode, reserved, start, len, released); extent_changeset_init(&changeset); ret = clear_record_extent_bits(&inode->io_tree, start, start + len -1, EXTENT_QGROUP_RESERVED, &changeset); @@ -3915,7 +3918,8 @@ static int __btrfs_qgroup_release_data(struct btrfs_inode *inode, btrfs_qgroup_free_refroot(inode->root->fs_info, inode->root->root_key.objectid, changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA); - ret = changeset.bytes_changed; + if (released) + *released = changeset.bytes_changed; out: extent_changeset_release(&changeset); return ret; @@ -3934,9 +3938,10 @@ out: * NOTE: This function may sleep for memory allocation. */ int btrfs_qgroup_free_data(struct btrfs_inode *inode, - struct extent_changeset *reserved, u64 start, u64 len) + struct extent_changeset *reserved, + u64 start, u64 len, u64 *freed) { - return __btrfs_qgroup_release_data(inode, reserved, start, len, 1); + return __btrfs_qgroup_release_data(inode, reserved, start, len, freed, 1); } /* @@ -3954,9 +3959,9 @@ int btrfs_qgroup_free_data(struct btrfs_inode *inode, * * NOTE: This function may sleep for memory allocation. 
*/ -int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len) +int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len, u64 *released) { - return __btrfs_qgroup_release_data(inode, NULL, start, len, 0); + return __btrfs_qgroup_release_data(inode, NULL, start, len, released, 0); } static void add_root_meta_rsv(struct btrfs_root *root, int num_bytes, diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index 578c77e94200..c382923f7628 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -360,10 +360,10 @@ int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, /* New io_tree based accurate qgroup reserve API */ int btrfs_qgroup_reserve_data(struct btrfs_inode *inode, struct extent_changeset **reserved, u64 start, u64 len); -int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len); +int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len, u64 *released); int btrfs_qgroup_free_data(struct btrfs_inode *inode, struct extent_changeset *reserved, u64 start, - u64 len); + u64 len, u64 *freed); int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, enum btrfs_qgroup_rsv_type type, bool enforce); int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, From 820a7802f25ac0cc998b329f87389610e954c476 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 24 May 2023 17:03:06 +0200 Subject: [PATCH 130/151] btrfs: mark the len field in struct btrfs_ordered_sum as unsigned [ Upstream commit 6e4b2479ab38b3f949a85964da212295d32102f0 ] len can't ever be negative, so mark it as an u32 instead of int. Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba Stable-dep-of: 9e65bfca24cf ("btrfs: fix qgroup_free_reserved_data int overflow") Signed-off-by: Sasha Levin --- fs/btrfs/file-item.c | 2 +- fs/btrfs/ordered-data.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index b14d2da9b26d..14478da87531 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -602,7 +602,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, } sums->bytenr = start; - sums->len = (int)size; + sums->len = size; offset = (start - key.offset) >> fs_info->sectorsize_bits; offset *= csum_size; diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index f59f2dbdb25e..cc3ca4bb9bd5 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -20,7 +20,7 @@ struct btrfs_ordered_sum { /* * this is the length in bytes covered by the sums array below. */ - int len; + u32 len; struct list_head list; /* last field is a variable length array of csums */ u8 sums[]; From 09a44d994bfe92b7968a19ada2ff23d2d79e7331 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Tue, 12 Dec 2023 14:30:49 -0500 Subject: [PATCH 131/151] ring-buffer: Fix 32-bit rb_time_read() race with rb_time_cmpxchg() [ Upstream commit dec890089bf79a4954b61482715ee2d084364856 ] The following race can cause rb_time_read() to observe a corrupted time stamp: rb_time_cmpxchg() [...] if (!rb_time_read_cmpxchg(&t->msb, msb, msb2)) return false; if (!rb_time_read_cmpxchg(&t->top, top, top2)) return false; __rb_time_read() [...] 
do {
	c = local_read(&t->cnt);
	top = local_read(&t->top);
	bottom = local_read(&t->bottom);
	msb = local_read(&t->msb);
} while (c != local_read(&t->cnt));

*cnt = rb_time_cnt(top);

/* If top and msb counts don't match, this interrupted a write */
if (*cnt != rb_time_cnt(msb))
	return false;
^ this check fails to catch that "bottom" is still not updated. So the
old "bottom" value is returned, which is wrong.

Fix this by checking that all three of the msb, top, and bottom 2-bit
cnt values match.

The reason to favor checking all three fields over requiring a specific
update order for both rb_time_set() and rb_time_cmpxchg() is that
checking all three fields is more robust for handling partial failures
of rb_time_cmpxchg() when interrupted by a nested rb_time_set().

Link: https://lore.kernel.org/lkml/20231211201324.652870-1-mathieu.desnoyers@efficios.com/
Link: https://lore.kernel.org/linux-trace-kernel/20231212193049.680122-1-mathieu.desnoyers@efficios.com

Fixes: f458a1453424e ("ring-buffer: Test last update in 32bit version of __rb_time_read()")
Signed-off-by: Mathieu Desnoyers
Signed-off-by: Steven Rostedt (Google)
Signed-off-by: Sasha Levin
---
 kernel/trace/ring_buffer.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 06d52525407b..71cad4f1323c 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -646,8 +646,8 @@ static inline bool __rb_time_read(rb_time_t *t, u64 *ret, unsigned long *cnt)
 
 	*cnt = rb_time_cnt(top);
 
-	/* If top and msb counts don't match, this interrupted a write */
-	if (*cnt != rb_time_cnt(msb))
+	/* If top, msb or bottom counts don't match, this interrupted a write */
+	if (*cnt != rb_time_cnt(msb) || *cnt != rb_time_cnt(bottom))
 		return false;
 
 	/* The shift to msb will lose its cnt bits */

From d1db1ef5e63301f9672024cf34a28e49713aebcf Mon Sep 17 00:00:00 2001
From: Takashi Sakamoto
Date: Tue, 2 Jan 2024 20:01:50 +0900
Subject: [PATCH 132/151] firewire: ohci: suppress unexpected system reboot in
 AMD Ryzen machines and ASM108x/VT630x PCIe cards

commit ac9184fbb8478dab4a0724b279f94956b69be827 upstream.

VIA VT6306/6307/6308 provides a PCI interface compliant with 1394 OHCI.
When the hardware is combined with an Asmedia ASM1083/1085 PCIe-to-PCI
bus bridge, it appears that accesses to its 'Isochronous Cycle Timer'
register (offset 0xf0 in PCI memory space) often cause an unexpected
system reboot on any type of AMD Ryzen machine (both 0x17 and 0x19
families). This does not appear on other types of machines (at least
AMD pre-Ryzen machines and Intel machines) or with other OHCI 1394
hardware (e.g. Texas Instruments).

The issue became explicit with commit dcadfd7f7c74 ("firewire: core:
use union for callback of transaction completion") added to the v6.5
kernel, which changed the 1394 OHCI driver to access the register every
time it dispatches a local asynchronous transaction. However, the issue
also exists in older kernel versions as long as they run on an AMD
Ryzen machine, since access to the register is required to maintain bus
time. It is not hard to imagine users experiencing the unexpected
system reboot when generating a bus reset by plugging any device in, or
when time-aware application programs (e.g. audio sample processing)
read the register.

This commit suppresses the unexpected system reboot on this combination
of hardware by avoiding the access itself.
As a result, the software stack cannot provide the hardware time to
unit drivers, userspace applications, and nodes on the same IEEE 1394
bus anymore. This brings an apparent disadvantage, since time-aware
application programs require it, while time-unaware applications (e.g.
sbp2) remain usable.

Cc: stable@vger.kernel.org
Reported-by: Jiri Slaby
Closes: https://bugzilla.suse.com/show_bug.cgi?id=1215436
Reported-by: Mario Limonciello
Closes: https://bugzilla.kernel.org/show_bug.cgi?id=217994
Reported-by: Tobias Gruetzmacher
Closes: https://sourceforge.net/p/linux1394/mailman/message/58711901/
Closes: https://bugzilla.redhat.com/show_bug.cgi?id=2240973
Closes: https://bugs.launchpad.net/linux/+bug/2043905
Link: https://lore.kernel.org/r/20240102110150.244475-1-o-takashi@sakamocchi.jp
Signed-off-by: Takashi Sakamoto
Signed-off-by: Greg Kroah-Hartman
---
 drivers/firewire/ohci.c | 51 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 51 insertions(+)

diff --git a/drivers/firewire/ohci.c b/drivers/firewire/ohci.c
index 17c9d825188b..667ff40f3935 100644
--- a/drivers/firewire/ohci.c
+++ b/drivers/firewire/ohci.c
@@ -279,6 +279,51 @@ static char ohci_driver_name[] = KBUILD_MODNAME;
 #define QUIRK_TI_SLLZ059	0x20
 #define QUIRK_IR_WAKE		0x40
 
+// On PCI Express Root Complex in any type of AMD Ryzen machine, VIA VT6306/6307/6308 with Asmedia
+// ASM1083/1085 brings an inconvenience that the read accesses to 'Isochronous Cycle Timer' register
+// (at offset 0xf0 in PCI I/O space) often causes unexpected system reboot. The mechanism is not
+// clear, since the read access to the other registers is enough safe; e.g. 'Node ID' register,
+// while it is probable due to detection of any type of PCIe error.
+#define QUIRK_REBOOT_BY_CYCLE_TIMER_READ	0x80000000
+
+#if IS_ENABLED(CONFIG_X86)
+
+static bool has_reboot_by_cycle_timer_read_quirk(const struct fw_ohci *ohci)
+{
+	return !!(ohci->quirks & QUIRK_REBOOT_BY_CYCLE_TIMER_READ);
+}
+
+#define PCI_DEVICE_ID_ASMEDIA_ASM108X	0x1080
+
+static bool detect_vt630x_with_asm1083_on_amd_ryzen_machine(const struct pci_dev *pdev)
+{
+	const struct pci_dev *pcie_to_pci_bridge;
+
+	// Detect any type of AMD Ryzen machine.
+	if (!static_cpu_has(X86_FEATURE_ZEN))
+		return false;
+
+	// Detect VIA VT6306/6307/6308.
+	if (pdev->vendor != PCI_VENDOR_ID_VIA)
+		return false;
+	if (pdev->device != PCI_DEVICE_ID_VIA_VT630X)
+		return false;
+
+	// Detect Asmedia ASM1083/1085.
+	pcie_to_pci_bridge = pdev->bus->self;
+	if (pcie_to_pci_bridge->vendor != PCI_VENDOR_ID_ASMEDIA)
+		return false;
+	if (pcie_to_pci_bridge->device != PCI_DEVICE_ID_ASMEDIA_ASM108X)
+		return false;
+
+	return true;
+}
+
+#else
+#define has_reboot_by_cycle_timer_read_quirk(ohci) false
+#define detect_vt630x_with_asm1083_on_amd_ryzen_machine(pdev) false
+#endif
+
 /* In case of multiple matches in ohci_quirks[], only the first one is used.
*/ static const struct { unsigned short vendor, device, revision, flags; @@ -1713,6 +1758,9 @@ static u32 get_cycle_time(struct fw_ohci *ohci) s32 diff01, diff12; int i; + if (has_reboot_by_cycle_timer_read_quirk(ohci)) + return 0; + c2 = reg_read(ohci, OHCI1394_IsochronousCycleTimer); if (ohci->quirks & QUIRK_CYCLE_TIMER) { @@ -3615,6 +3663,9 @@ static int pci_probe(struct pci_dev *dev, if (param_quirks) ohci->quirks = param_quirks; + if (detect_vt630x_with_asm1083_on_amd_ryzen_machine(dev)) + ohci->quirks |= QUIRK_REBOOT_BY_CYCLE_TIMER_READ; + /* * Because dma_alloc_coherent() allocates at least one page, * we save space by using a common buffer for the AR request/ From 53b42cb33fb1b9a85dd18ed964cbec6cd8be9d89 Mon Sep 17 00:00:00 2001 From: Jinghao Jia Date: Tue, 2 Jan 2024 17:33:45 -0600 Subject: [PATCH 133/151] x86/kprobes: fix incorrect return address calculation in kprobe_emulate_call_indirect commit f5d03da48d062966c94f0199d20be0b3a37a7982 upstream. kprobe_emulate_call_indirect currently uses int3_emulate_call to emulate indirect calls. However, int3_emulate_call always assumes the size of the call to be 5 bytes when calculating the return address. This is incorrect for register-based indirect calls in x86, which can be either 2 or 3 bytes depending on whether REX prefix is used. At kprobe runtime, the incorrect return address causes control flow to land onto the wrong place after return -- possibly not a valid instruction boundary. This can lead to a panic like the following: [ 7.308204][ C1] BUG: unable to handle page fault for address: 000000000002b4d8 [ 7.308883][ C1] #PF: supervisor read access in kernel mode [ 7.309168][ C1] #PF: error_code(0x0000) - not-present page [ 7.309461][ C1] PGD 0 P4D 0 [ 7.309652][ C1] Oops: 0000 [#1] SMP [ 7.309929][ C1] CPU: 1 PID: 0 Comm: swapper/1 Not tainted 6.7.0-rc5-trace-for-next #6 [ 7.310397][ C1] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.0-20220807_005459-localhost 04/01/2014 [ 7.311068][ C1] RIP: 0010:__common_interrupt+0x52/0xc0 [ 7.311349][ C1] Code: 01 00 4d 85 f6 74 39 49 81 fe 00 f0 ff ff 77 30 4c 89 f7 4d 8b 5e 68 41 ba 91 76 d8 42 45 03 53 fc 74 02 0f 0b cc ff d3 65 48 <8b> 05 30 c7 ff 7e 65 4c 89 3d 28 c7 ff 7e 5b 41 5c 41 5e 41 5f c3 [ 7.312512][ C1] RSP: 0018:ffffc900000e0fd0 EFLAGS: 00010046 [ 7.312899][ C1] RAX: 0000000000000001 RBX: 0000000000000023 RCX: 0000000000000001 [ 7.313334][ C1] RDX: 00000000000003cd RSI: 0000000000000001 RDI: ffff888100d302a4 [ 7.313702][ C1] RBP: 0000000000000001 R08: 0ef439818636191f R09: b1621ff338a3b482 [ 7.314146][ C1] R10: ffffffff81e5127b R11: ffffffff81059810 R12: 0000000000000023 [ 7.314509][ C1] R13: 0000000000000000 R14: ffff888100d30200 R15: 0000000000000000 [ 7.314951][ C1] FS: 0000000000000000(0000) GS:ffff88813bc80000(0000) knlGS:0000000000000000 [ 7.315396][ C1] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 7.315691][ C1] CR2: 000000000002b4d8 CR3: 0000000003028003 CR4: 0000000000370ef0 [ 7.316153][ C1] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 7.316508][ C1] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 7.316948][ C1] Call Trace: [ 7.317123][ C1] [ 7.317279][ C1] ? __die_body+0x64/0xb0 [ 7.317482][ C1] ? page_fault_oops+0x248/0x370 [ 7.317712][ C1] ? __wake_up+0x96/0xb0 [ 7.317964][ C1] ? exc_page_fault+0x62/0x130 [ 7.318211][ C1] ? asm_exc_page_fault+0x22/0x30 [ 7.318444][ C1] ? __cfi_native_send_call_func_single_ipi+0x10/0x10 [ 7.318860][ C1] ? default_idle+0xb/0x10 [ 7.319063][ C1] ? 
__common_interrupt+0x52/0xc0 [ 7.319330][ C1] common_interrupt+0x78/0x90 [ 7.319546][ C1] [ 7.319679][ C1] [ 7.319854][ C1] asm_common_interrupt+0x22/0x40 [ 7.320082][ C1] RIP: 0010:default_idle+0xb/0x10 [ 7.320309][ C1] Code: 4c 01 c7 4c 29 c2 e9 72 ff ff ff cc cc cc cc 90 90 90 90 90 90 90 90 90 90 90 b8 0c 67 40 a5 66 90 0f 00 2d 09 b9 3b 00 fb f4 c3 0f 1f 00 90 90 90 90 90 90 90 90 90 90 90 b8 0c 67 40 a5 e9 [ 7.321449][ C1] RSP: 0018:ffffc9000009bee8 EFLAGS: 00000256 [ 7.321808][ C1] RAX: ffff88813bca8b68 RBX: 0000000000000001 RCX: 000000000001ef0c [ 7.322227][ C1] RDX: 0000000000000000 RSI: 0000000000000001 RDI: 000000000001ef0c [ 7.322656][ C1] RBP: ffffc9000009bef8 R08: 8000000000000000 R09: 00000000000008c2 [ 7.323083][ C1] R10: 0000000000000000 R11: ffffffff81058e70 R12: 0000000000000000 [ 7.323530][ C1] R13: ffff8881002b30c0 R14: 0000000000000000 R15: 0000000000000000 [ 7.323948][ C1] ? __cfi_lapic_next_deadline+0x10/0x10 [ 7.324239][ C1] default_idle_call+0x31/0x50 [ 7.324464][ C1] do_idle+0xd3/0x240 [ 7.324690][ C1] cpu_startup_entry+0x25/0x30 [ 7.324983][ C1] start_secondary+0xb4/0xc0 [ 7.325217][ C1] secondary_startup_64_no_verify+0x179/0x17b [ 7.325498][ C1] [ 7.325641][ C1] Modules linked in: [ 7.325906][ C1] CR2: 000000000002b4d8 [ 7.326104][ C1] ---[ end trace 0000000000000000 ]--- [ 7.326354][ C1] RIP: 0010:__common_interrupt+0x52/0xc0 [ 7.326614][ C1] Code: 01 00 4d 85 f6 74 39 49 81 fe 00 f0 ff ff 77 30 4c 89 f7 4d 8b 5e 68 41 ba 91 76 d8 42 45 03 53 fc 74 02 0f 0b cc ff d3 65 48 <8b> 05 30 c7 ff 7e 65 4c 89 3d 28 c7 ff 7e 5b 41 5c 41 5e 41 5f c3 [ 7.327570][ C1] RSP: 0018:ffffc900000e0fd0 EFLAGS: 00010046 [ 7.327910][ C1] RAX: 0000000000000001 RBX: 0000000000000023 RCX: 0000000000000001 [ 7.328273][ C1] RDX: 00000000000003cd RSI: 0000000000000001 RDI: ffff888100d302a4 [ 7.328632][ C1] RBP: 0000000000000001 R08: 0ef439818636191f R09: b1621ff338a3b482 [ 7.329223][ C1] R10: ffffffff81e5127b R11: ffffffff81059810 R12: 0000000000000023 [ 7.329780][ C1] R13: 0000000000000000 R14: ffff888100d30200 R15: 0000000000000000 [ 7.330193][ C1] FS: 0000000000000000(0000) GS:ffff88813bc80000(0000) knlGS:0000000000000000 [ 7.330632][ C1] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 7.331050][ C1] CR2: 000000000002b4d8 CR3: 0000000003028003 CR4: 0000000000370ef0 [ 7.331454][ C1] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 7.331854][ C1] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 7.332236][ C1] Kernel panic - not syncing: Fatal exception in interrupt [ 7.332730][ C1] Kernel Offset: disabled [ 7.333044][ C1] ---[ end Kernel panic - not syncing: Fatal exception in interrupt ]--- The relevant assembly code is (from objdump, faulting address highlighted): ffffffff8102ed9d: 41 ff d3 call *%r11 ffffffff8102eda0: 65 48 <8b> 05 30 c7 ff mov %gs:0x7effc730(%rip),%rax The emulation incorrectly sets the return address to be ffffffff8102ed9d + 0x5 = ffffffff8102eda2, which is the 8b byte in the middle of the next mov. This in turn causes incorrect subsequent instruction decoding and eventually triggers the page fault above. Instead of invoking int3_emulate_call, perform push and jmp emulation directly in kprobe_emulate_call_indirect. At this point we can obtain the instruction size from p->ainsn.size so that we can calculate the correct return address. 
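For concreteness, the corrected arithmetic can be checked with a small
standalone sketch (userspace C; addresses taken from the objdump above,
with the 3-byte "call *%r11" standing in for p->ainsn.size):

#include <stdint.h>
#include <stdio.h>

#define INT3_INSN_SIZE 1

int main(void)
{
	uint64_t probed_addr = 0xffffffff8102ed9dULL;	/* call *%r11 */
	uint64_t ip_at_trap = probed_addr + INT3_INSN_SIZE;
	unsigned int insn_size = 3;			/* 2 without REX */

	/* Wrong: the fixed 5-byte assumption lands mid-instruction. */
	uint64_t bad_ret = probed_addr + 5;		/* ...8102eda2 */
	/* Right: regs->ip - INT3_INSN_SIZE + p->ainsn.size. */
	uint64_t good_ret = ip_at_trap - INT3_INSN_SIZE + insn_size;

	printf("bad return : %#llx\n", (unsigned long long)bad_ret);
	printf("good return: %#llx\n", (unsigned long long)good_ret);	/* ...8102eda0 */
	return 0;
}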
Link: https://lore.kernel.org/all/20240102233345.385475-1-jinghao7@illinois.edu/ Fixes: 6256e668b7af ("x86/kprobes: Use int3 instead of debug trap for single-step") Cc: stable@vger.kernel.org Signed-off-by: Jinghao Jia Signed-off-by: Masami Hiramatsu (Google) Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/kprobes/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index ea155f0cf545..6120f25b0d5c 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -549,7 +549,8 @@ static void kprobe_emulate_call_indirect(struct kprobe *p, struct pt_regs *regs) { unsigned long offs = addrmode_regoffs[p->ainsn.indirect.reg]; - int3_emulate_call(regs, regs_get_register(regs, offs)); + int3_emulate_push(regs, regs->ip - INT3_INSN_SIZE + p->ainsn.size); + int3_emulate_jmp(regs, regs_get_register(regs, offs)); } NOKPROBE_SYMBOL(kprobe_emulate_call_indirect); From 08038069c23798b4268eb341c12815fcf13c7ac0 Mon Sep 17 00:00:00 2001 From: Benjamin Bara Date: Thu, 4 Jan 2024 09:17:08 +0100 Subject: [PATCH 134/151] i2c: core: Fix atomic xfer check for non-preempt config commit a3368e1186e3ce8e38f78cbca019622095b1f331 upstream. Since commit aa49c90894d0 ("i2c: core: Run atomic i2c xfer when !preemptible"), the whole reboot/power off sequence on non-preempt kernels is using atomic i2c xfer, as !preemptible() always results to 1. During device_shutdown(), the i2c might be used a lot and not all busses have implemented an atomic xfer handler. This results in a lot of avoidable noise, like: [ 12.687169] No atomic I2C transfer handler for 'i2c-0' [ 12.692313] WARNING: CPU: 6 PID: 275 at drivers/i2c/i2c-core.h:40 i2c_smbus_xfer+0x100/0x118 ... Fix this by allowing non-atomic xfer when the interrupts are enabled, as it was before. Link: https://lore.kernel.org/r/20231222230106.73f030a5@yea Link: https://lore.kernel.org/r/20240102150350.3180741-1-mwalle@kernel.org Link: https://lore.kernel.org/linux-i2c/13271b9b-4132-46ef-abf8-2c311967bb46@mailbox.org/ Fixes: aa49c90894d0 ("i2c: core: Run atomic i2c xfer when !preemptible") Cc: stable@vger.kernel.org # v5.2+ Signed-off-by: Benjamin Bara Tested-by: Michael Walle Tested-by: Tor Vic [wsa: removed a comment which needs more work, code is ok] Signed-off-by: Wolfram Sang Signed-off-by: Greg Kroah-Hartman --- drivers/i2c/i2c-core.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/i2c/i2c-core.h b/drivers/i2c/i2c-core.h index 05b8b8dfa9bd..36587f38dff3 100644 --- a/drivers/i2c/i2c-core.h +++ b/drivers/i2c/i2c-core.h @@ -3,6 +3,7 @@ * i2c-core.h - interfaces internal to the I2C framework */ +#include #include struct i2c_devinfo { @@ -29,7 +30,8 @@ int i2c_dev_irq_from_resources(const struct resource *resources, */ static inline bool i2c_in_atomic_xfer_mode(void) { - return system_state > SYSTEM_RUNNING && !preemptible(); + return system_state > SYSTEM_RUNNING && + (IS_ENABLED(CONFIG_PREEMPT_COUNT) ? !preemptible() : irqs_disabled()); } static inline int __i2c_lock_bus_helper(struct i2c_adapter *adap) From dafdeb7b91f1597983693a418c496b6702ea34b5 Mon Sep 17 00:00:00 2001 From: Jiajun Xie Date: Wed, 20 Dec 2023 13:28:39 +0800 Subject: [PATCH 135/151] mm: fix unmap_mapping_range high bits shift bug commit 9eab0421fa94a3dde0d1f7e36ab3294fc306c99d upstream. 
The bug happens when the highest bit of holebegin is 1. Suppose
holebegin is 0x8000000111111000; after the shift, hba would be
0xfff8000000111111 (sign-extended), and vma_interval_tree_foreach would
then fail to look the range up or produce the wrong result.

An example of the erroneous call sequence:

- mmap(..., offset=0x8000000111111000)
  |- syscall(mmap, ... unsigned long, off):
     |- ksys_mmap_pgoff( ... , off >> PAGE_SHIFT);

Here pgoff is correctly shifted to 0x8000000111111, but passing
0x8000000111111000 as holebegin to unmap then causes the wrong result,
as shown below:

- unmap_mapping_range(..., loff_t const holebegin)
  |- pgoff_t hba = holebegin >> PAGE_SHIFT;
     /* hba = 0xfff8000000111111 unexpectedly */

The issue happens in heterogeneous computing, where the device (e.g.
gpu) and host share the same virtual address space. A simple workflow
pattern which hits the issue is:

 /* host */
 1. userspace first mmaps a file-backed VA range with a specified
    offset. e.g. (offset=0x800..., mmap return: va_a)
 2. write some data to the corresponding sys page
    e.g. (va_a = 0xAABB)
 /* device */
 3. gpu workload touches VA, triggers a gpu fault and notifies the host.
 /* host */
 4. received the gpu fault notification, then it will:
    4.1 unmap host pages and also take care of the cpu tlb
        (use unmap_mapping_range with offset=0x800...)
    4.2 migrate the sys page to the device
    4.3 setup the device page table and resolve the device fault.
 /* device */
 5. gpu workload continued, it accessed va_a and got 0xAABB.
 6. gpu workload continued, it wrote 0xBBCC to va_a.
 /* host */
 7. userspace accesses va_a, as expected, it will:
    7.1 trigger a cpu vm fault.
    7.2 driver handles the fault to migrate the gpu local page to the
        host.
 8. userspace can then correctly get 0xBBCC from va_a
 9. done

But in step 4.1, if we hit the bug this patch mentions, userspace would
never trigger the cpu fault and would still get the old value: 0xAABB.

Making holebegin unsigned first fixes the bug.

Link: https://lkml.kernel.org/r/20231220052839.26970-1-jiajun.xie.sh@gmail.com
Signed-off-by: Jiajun Xie
Cc:
Signed-off-by: Andrew Morton
Signed-off-by: Greg Kroah-Hartman
---
 mm/memory.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index 0d1b3ee8fcd7..fc8b264ec0ca 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3617,8 +3617,8 @@ EXPORT_SYMBOL_GPL(unmap_mapping_pages);
 void unmap_mapping_range(struct address_space *mapping,
 		loff_t const holebegin, loff_t const holelen, int even_cows)
 {
-	pgoff_t hba = holebegin >> PAGE_SHIFT;
-	pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	pgoff_t hba = (pgoff_t)(holebegin) >> PAGE_SHIFT;
+	pgoff_t hlen = ((pgoff_t)(holelen) + PAGE_SIZE - 1) >> PAGE_SHIFT;
 
 	/* Check for overflow. */
 	if (sizeof(holelen) > sizeof(hlen)) {

From 11c3510d1d4d53c029e8f4c7153725ebcbf45c27 Mon Sep 17 00:00:00 2001
From: Alex Deucher
Date: Wed, 20 Dec 2023 12:36:08 -0500
Subject: [PATCH 136/151] drm/amdgpu: skip gpu_info fw loading on navi12

commit 21f6137c64c65d6808c4a81006956197ca203383 upstream.

It's no longer required.
Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2318 Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 8a1b84aaaf71..a5352e5e2bd4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -1976,15 +1976,8 @@ static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) adev->firmware.gpu_info_fw = NULL; - if (adev->mman.discovery_bin) { - /* - * FIXME: The bounding box is still needed by Navi12, so - * temporarily read it from gpu_info firmware. Should be dropped - * when DAL no longer needs it. - */ - if (adev->asic_type != CHIP_NAVI12) - return 0; - } + if (adev->mman.discovery_bin) + return 0; switch (adev->asic_type) { default: From 48e1d426f452ccbdfae23f94c369547fd714d2f2 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Wed, 20 Dec 2023 12:33:45 -0500 Subject: [PATCH 137/151] drm/amd/display: add nv12 bounding box commit 7e725c20fea8914ef1829da777f517ce1a93d388 upstream. This was included in gpu_info firmware, move it into the driver for consistency with other nv1x parts. Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2318 Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- .../drm/amd/display/dc/dml/dcn20/dcn20_fpu.c | 110 +++++++++++++++++- 1 file changed, 109 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/dml/dcn20/dcn20_fpu.c b/drivers/gpu/drm/amd/display/dc/dml/dcn20/dcn20_fpu.c index 9d224bb2b3df..ce893fe1c69f 100644 --- a/drivers/gpu/drm/amd/display/dc/dml/dcn20/dcn20_fpu.c +++ b/drivers/gpu/drm/amd/display/dc/dml/dcn20/dcn20_fpu.c @@ -438,7 +438,115 @@ struct _vcs_dpi_soc_bounding_box_st dcn2_0_nv14_soc = { .use_urgent_burst_bw = 0 }; -struct _vcs_dpi_soc_bounding_box_st dcn2_0_nv12_soc = { 0 }; +struct _vcs_dpi_soc_bounding_box_st dcn2_0_nv12_soc = { + .clock_limits = { + { + .state = 0, + .dcfclk_mhz = 560.0, + .fabricclk_mhz = 560.0, + .dispclk_mhz = 513.0, + .dppclk_mhz = 513.0, + .phyclk_mhz = 540.0, + .socclk_mhz = 560.0, + .dscclk_mhz = 171.0, + .dram_speed_mts = 1069.0, + }, + { + .state = 1, + .dcfclk_mhz = 694.0, + .fabricclk_mhz = 694.0, + .dispclk_mhz = 642.0, + .dppclk_mhz = 642.0, + .phyclk_mhz = 600.0, + .socclk_mhz = 694.0, + .dscclk_mhz = 214.0, + .dram_speed_mts = 1324.0, + }, + { + .state = 2, + .dcfclk_mhz = 875.0, + .fabricclk_mhz = 875.0, + .dispclk_mhz = 734.0, + .dppclk_mhz = 734.0, + .phyclk_mhz = 810.0, + .socclk_mhz = 875.0, + .dscclk_mhz = 245.0, + .dram_speed_mts = 1670.0, + }, + { + .state = 3, + .dcfclk_mhz = 1000.0, + .fabricclk_mhz = 1000.0, + .dispclk_mhz = 1100.0, + .dppclk_mhz = 1100.0, + .phyclk_mhz = 810.0, + .socclk_mhz = 1000.0, + .dscclk_mhz = 367.0, + .dram_speed_mts = 2000.0, + }, + { + .state = 4, + .dcfclk_mhz = 1200.0, + .fabricclk_mhz = 1200.0, + .dispclk_mhz = 1284.0, + .dppclk_mhz = 1284.0, + .phyclk_mhz = 810.0, + .socclk_mhz = 1200.0, + .dscclk_mhz = 428.0, + .dram_speed_mts = 2000.0, + }, + { + .state = 5, + .dcfclk_mhz = 1200.0, + .fabricclk_mhz = 1200.0, + .dispclk_mhz = 1284.0, + .dppclk_mhz = 1284.0, + .phyclk_mhz = 810.0, + .socclk_mhz = 1200.0, + .dscclk_mhz = 428.0, + .dram_speed_mts = 2000.0, + }, + }, + + .num_states = 5, + .sr_exit_time_us = 1.9, + 
.sr_enter_plus_exit_time_us = 4.4,
+	.urgent_latency_us = 3.0,
+	.urgent_latency_pixel_data_only_us = 4.0,
+	.urgent_latency_pixel_mixed_with_vm_data_us = 4.0,
+	.urgent_latency_vm_data_only_us = 4.0,
+	.urgent_out_of_order_return_per_channel_pixel_only_bytes = 4096,
+	.urgent_out_of_order_return_per_channel_pixel_and_vm_bytes = 4096,
+	.urgent_out_of_order_return_per_channel_vm_only_bytes = 4096,
+	.pct_ideal_dram_sdp_bw_after_urgent_pixel_only = 40.0,
+	.pct_ideal_dram_sdp_bw_after_urgent_pixel_and_vm = 40.0,
+	.pct_ideal_dram_sdp_bw_after_urgent_vm_only = 40.0,
+	.max_avg_sdp_bw_use_normal_percent = 40.0,
+	.max_avg_dram_bw_use_normal_percent = 40.0,
+	.writeback_latency_us = 12.0,
+	.ideal_dram_bw_after_urgent_percent = 40.0,
+	.max_request_size_bytes = 256,
+	.dram_channel_width_bytes = 16,
+	.fabric_datapath_to_dcn_data_return_bytes = 64,
+	.dcn_downspread_percent = 0.5,
+	.downspread_percent = 0.5,
+	.dram_page_open_time_ns = 50.0,
+	.dram_rw_turnaround_time_ns = 17.5,
+	.dram_return_buffer_per_channel_bytes = 8192,
+	.round_trip_ping_latency_dcfclk_cycles = 131,
+	.urgent_out_of_order_return_per_channel_bytes = 4096,
+	.channel_interleave_bytes = 256,
+	.num_banks = 8,
+	.num_chans = 16,
+	.vmm_page_size_bytes = 4096,
+	.dram_clock_change_latency_us = 45.0,
+	.writeback_dram_clock_change_latency_us = 23.0,
+	.return_bus_width_bytes = 64,
+	.dispclk_dppclk_vco_speed_mhz = 3850,
+	.xfc_bus_transport_time_us = 20,
+	.xfc_xbuf_latency_tolerance_us = 50,
+	.use_urgent_burst_bw = 0,
+};
 
 struct _vcs_dpi_ip_params_st dcn2_1_ip = {
 	.odm_capable = 1,

From 9c5efaa09b31d9bbe3caa8537c10a27e2223c1d2 Mon Sep 17 00:00:00 2001
From: Ziyang Huang
Date: Wed, 11 Oct 2023 00:44:00 +0800
Subject: [PATCH 138/151] mmc: meson-mx-sdhc: Fix initialization frozen issue

commit 8c124d998ea0c9022e247b11ac51f86ec8afa0e1 upstream.

Commit 4bc31edebde5 ("mmc: core: Set HS clock speed before sending
HS CMD13") sets the HS clock (52 MHz) before switching to HS mode. For
this frequency, FCLK_DIV5 is selected and the div value is 10 (reg
value 9). We then set rx_clk_phase to 11 or 15, which is out of range
and freezes the hardware. After a command request is sent, no interrupt
is raised, and the mmc driver keeps waiting for the request to finish,
even during reboot.

So let's set it to phase 90, which should work in most cases. Then let
meson_mx_sdhc_execute_tuning() find the accurate value for data
transfer. If this doesn't work, a factor may need to be defined in the
DT.
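As a rough sanity check of the phase-90 choice (assuming the phase
field counts input-clock cycles within one divided output period, which
is what deriving it from MESON_SDHC_CLKC_CLK_DIV suggests), a quarter
of the divider lands a quarter period after the output edge and is
valid for any divider value:

#include <stdio.h>

/* One output period spans clk_div input cycles, so clk_div / 4
 * corresponds to 90 of 360 degrees.
 */
static unsigned int rx_phase_90(unsigned int clk_div)
{
	return clk_div / 4;
}

int main(void)
{
	printf("div=10 -> phase %u\n", rx_phase_90(10));	/* 52 MHz HS case: 2 */
	printf("div=40 -> phase %u\n", rx_phase_90(40));	/* 10 */
	return 0;
}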
Fixes: e4bf1b0970ef ("mmc: host: meson-mx-sdhc: new driver for the Amlogic Meson SDHC host") Signed-off-by: Ziyang Huang Tested-by: Anand Moon Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/TYZPR01MB5556A3E71554A2EC08597EA4C9CDA@TYZPR01MB5556.apcprd01.prod.exchangelabs.com Signed-off-by: Ulf Hansson Signed-off-by: Greg Kroah-Hartman --- drivers/mmc/host/meson-mx-sdhc-mmc.c | 26 +++++--------------------- 1 file changed, 5 insertions(+), 21 deletions(-) diff --git a/drivers/mmc/host/meson-mx-sdhc-mmc.c b/drivers/mmc/host/meson-mx-sdhc-mmc.c index da85c2f2acb8..c0e3b1634a88 100644 --- a/drivers/mmc/host/meson-mx-sdhc-mmc.c +++ b/drivers/mmc/host/meson-mx-sdhc-mmc.c @@ -269,7 +269,7 @@ static int meson_mx_sdhc_enable_clks(struct mmc_host *mmc) static int meson_mx_sdhc_set_clk(struct mmc_host *mmc, struct mmc_ios *ios) { struct meson_mx_sdhc_host *host = mmc_priv(mmc); - u32 rx_clk_phase; + u32 val, rx_clk_phase; int ret; meson_mx_sdhc_disable_clks(mmc); @@ -290,27 +290,11 @@ static int meson_mx_sdhc_set_clk(struct mmc_host *mmc, struct mmc_ios *ios) mmc->actual_clock = clk_get_rate(host->sd_clk); /* - * according to Amlogic the following latching points are - * selected with empirical values, there is no (known) formula - * to calculate these. + * Phase 90 should work in most cases. For data transmission, + * meson_mx_sdhc_execute_tuning() will find a accurate value */ - if (mmc->actual_clock > 100000000) { - rx_clk_phase = 1; - } else if (mmc->actual_clock > 45000000) { - if (ios->signal_voltage == MMC_SIGNAL_VOLTAGE_330) - rx_clk_phase = 15; - else - rx_clk_phase = 11; - } else if (mmc->actual_clock >= 25000000) { - rx_clk_phase = 15; - } else if (mmc->actual_clock > 5000000) { - rx_clk_phase = 23; - } else if (mmc->actual_clock > 1000000) { - rx_clk_phase = 55; - } else { - rx_clk_phase = 1061; - } - + regmap_read(host->regmap, MESON_SDHC_CLKC, &val); + rx_clk_phase = FIELD_GET(MESON_SDHC_CLKC_CLK_DIV, val) / 4; regmap_update_bits(host->regmap, MESON_SDHC_CLK2, MESON_SDHC_CLK2_RX_CLK_PHASE, FIELD_PREP(MESON_SDHC_CLK2_RX_CLK_PHASE, From 575e127041f219b6256f8d2820920c9fbb10adbe Mon Sep 17 00:00:00 2001 From: Jorge Ramirez-Ortiz Date: Fri, 1 Dec 2023 16:31:43 +0100 Subject: [PATCH 139/151] mmc: rpmb: fixes pause retune on all RPMB partitions. commit e7794c14fd73e5eb4a3e0ecaa5334d5a17377c50 upstream. When RPMB was converted to a character device, it added support for multiple RPMB partitions (Commit 97548575bef3 ("mmc: block: Convert RPMB to a character device"). One of the changes in this commit was transforming the variable target_part defined in __mmc_blk_ioctl_cmd into a bitmask. This inadvertently regressed the validation check done in mmc_blk_part_switch_pre() and mmc_blk_part_switch_post(), so let's fix it. 
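To see why the equality test regressed once target_part became a
bitmask, a small sketch (plain C; the encodings for the second and
third RPMB partitions below are hypothetical, with the partition index
assumed in higher bits -- only the low access bits follow
EXT_CSD_PART_CONFIG_ACC_RPMB):

#include <stdbool.h>
#include <stdio.h>

#define EXT_CSD_PART_CONFIG_ACC_RPMB 0x3

static bool is_rpmb_eq(unsigned int part_type)
{
	return part_type == EXT_CSD_PART_CONFIG_ACC_RPMB;	/* old check */
}

static bool is_rpmb_mask(unsigned int part_type)
{
	const unsigned int mask = EXT_CSD_PART_CONFIG_ACC_RPMB;

	return (part_type & mask) == mask;			/* fixed check */
}

int main(void)
{
	unsigned int parts[] = { 0x3, 0x103, 0x203 };	/* hypothetical */

	/* The equality test only matches the first RPMB partition. */
	for (int i = 0; i < 3; i++)
		printf("0x%x: eq=%d mask=%d\n", parts[i],
		       is_rpmb_eq(parts[i]), is_rpmb_mask(parts[i]));
	return 0;
}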
Fixes: 97548575bef3 ("mmc: block: Convert RPMB to a character device") Signed-off-by: Jorge Ramirez-Ortiz Reviewed-by: Linus Walleij Cc: Link: https://lore.kernel.org/r/20231201153143.1449753-1-jorge@foundries.io Signed-off-by: Ulf Hansson Signed-off-by: Greg Kroah-Hartman --- drivers/mmc/core/block.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c index 770490234c87..e9ce53d200bc 100644 --- a/drivers/mmc/core/block.c +++ b/drivers/mmc/core/block.c @@ -866,9 +866,10 @@ static const struct block_device_operations mmc_bdops = { static int mmc_blk_part_switch_pre(struct mmc_card *card, unsigned int part_type) { + const unsigned int mask = EXT_CSD_PART_CONFIG_ACC_RPMB; int ret = 0; - if (part_type == EXT_CSD_PART_CONFIG_ACC_RPMB) { + if ((part_type & mask) == mask) { if (card->ext_csd.cmdq_en) { ret = mmc_cmdq_disable(card); if (ret) @@ -883,9 +884,10 @@ static int mmc_blk_part_switch_pre(struct mmc_card *card, static int mmc_blk_part_switch_post(struct mmc_card *card, unsigned int part_type) { + const unsigned int mask = EXT_CSD_PART_CONFIG_ACC_RPMB; int ret = 0; - if (part_type == EXT_CSD_PART_CONFIG_ACC_RPMB) { + if ((part_type & mask) == mask) { mmc_retune_unpause(card->host); if (card->reenable_cmdq && !card->ext_csd.cmdq_en) ret = mmc_cmdq_enable(card); @@ -3180,4 +3182,3 @@ module_exit(mmc_blk_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Multimedia Card (MMC) block device driver"); - From 2813a434d461f05e82a25cff0b19368171332071 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 4 Dec 2023 12:29:53 +0100 Subject: [PATCH 140/151] mmc: core: Cancel delayed work before releasing host commit 1036f69e251380573e256568cf814506e3fb9988 upstream. On RZ/Five SMARC EVK, where probing of SDHI is deferred due to probe deferral of the vqmmc-supply regulator: ------------[ cut here ]------------ WARNING: CPU: 0 PID: 0 at kernel/time/timer.c:1738 __run_timers.part.0+0x1d0/0x1e8 Modules linked in: CPU: 0 PID: 0 Comm: swapper Not tainted 6.7.0-rc4 #101 Hardware name: Renesas SMARC EVK based on r9a07g043f01 (DT) epc : __run_timers.part.0+0x1d0/0x1e8 ra : __run_timers.part.0+0x134/0x1e8 epc : ffffffff800771a4 ra : ffffffff80077108 sp : ffffffc800003e60 gp : ffffffff814f5028 tp : ffffffff8140c5c0 t0 : ffffffc800000000 t1 : 0000000000000001 t2 : ffffffff81201300 s0 : ffffffc800003f20 s1 : ffffffd8023bc4a0 a0 : 00000000fffee6b0 a1 : 0004010000400000 a2 : ffffffffc0000016 a3 : ffffffff81488640 a4 : ffffffc800003e60 a5 : 0000000000000000 a6 : 0000000004000000 a7 : ffffffc800003e68 s2 : 0000000000000122 s3 : 0000000000200000 s4 : 0000000000000000 s5 : ffffffffffffffff s6 : ffffffff81488678 s7 : ffffffff814886c0 s8 : ffffffff814f49c0 s9 : ffffffff81488640 s10: 0000000000000000 s11: ffffffc800003e60 t3 : 0000000000000240 t4 : 0000000000000a52 t5 : ffffffd8024ae018 t6 : ffffffd8024ae038 status: 0000000200000100 badaddr: 0000000000000000 cause: 0000000000000003 [] __run_timers.part.0+0x1d0/0x1e8 [] run_timer_softirq+0x24/0x4a [] __do_softirq+0xc6/0x1fa [] irq_exit_rcu+0x66/0x84 [] handle_riscv_irq+0x40/0x4e [] call_on_irq_stack+0x1c/0x28 ---[ end trace 0000000000000000 ]--- What happens? 
renesas_sdhi_probe() { tmio_mmc_host_alloc() mmc_alloc_host() INIT_DELAYED_WORK(&host->detect, mmc_rescan); devm_request_irq(tmio_mmc_irq); /* * After this, the interrupt handler may be invoked at any time * * tmio_mmc_irq() * { * __tmio_mmc_card_detect_irq() * mmc_detect_change() * _mmc_detect_change() * mmc_schedule_delayed_work(&host->detect, delay); * } */ tmio_mmc_host_probe() tmio_mmc_init_ocr() -EPROBE_DEFER tmio_mmc_host_free() mmc_free_host() } When expire_timers() runs later, it warns because the MMC host structure containing the delayed work was freed, and now contains an invalid work function pointer. Fix this by cancelling any pending delayed work before releasing the MMC host structure. Signed-off-by: Geert Uytterhoeven Tested-by: Lad Prabhakar Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/205dc4c91b47e31b64392fe2498c7a449e717b4b.1701689330.git.geert+renesas@glider.be Signed-off-by: Ulf Hansson Signed-off-by: Greg Kroah-Hartman --- drivers/mmc/core/host.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/mmc/core/host.c b/drivers/mmc/core/host.c index b89dca1f15e9..25c152ef5d60 100644 --- a/drivers/mmc/core/host.c +++ b/drivers/mmc/core/host.c @@ -670,6 +670,7 @@ EXPORT_SYMBOL(mmc_remove_host); */ void mmc_free_host(struct mmc_host *host) { + cancel_delayed_work_sync(&host->detect); mmc_pwrseq_free(host); put_device(&host->class_dev); } From 28c9222e29e5b89923d4107eed22b6fa844d668d Mon Sep 17 00:00:00 2001 From: Wenchao Chen Date: Mon, 4 Dec 2023 14:49:34 +0800 Subject: [PATCH 141/151] mmc: sdhci-sprd: Fix eMMC init failure after hw reset commit 8abf77c88929b6d20fa4f9928b18d6448d64e293 upstream. Some eMMC devices that do not close the auto clk gate after hw reset will cause eMMC initialization to fail. Let's fix this. Signed-off-by: Wenchao Chen Fixes: ff874dbc4f86 ("mmc: sdhci-sprd: Disable CLK_AUTO when the clock is less than 400K") Reviewed-by: Baolin Wang Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20231204064934.21236-1-wenchao.chen@unisoc.com Signed-off-by: Ulf Hansson Signed-off-by: Greg Kroah-Hartman --- drivers/mmc/host/sdhci-sprd.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/mmc/host/sdhci-sprd.c b/drivers/mmc/host/sdhci-sprd.c index 2101b6e794c0..66c1782823d8 100644 --- a/drivers/mmc/host/sdhci-sprd.c +++ b/drivers/mmc/host/sdhci-sprd.c @@ -228,15 +228,19 @@ static inline void _sdhci_sprd_set_clock(struct sdhci_host *host, div = ((div & 0x300) >> 2) | ((div & 0xFF) << 8); sdhci_enable_clk(host, div); + val = sdhci_readl(host, SDHCI_SPRD_REG_32_BUSY_POSI); + mask = SDHCI_SPRD_BIT_OUTR_CLK_AUTO_EN | SDHCI_SPRD_BIT_INNR_CLK_AUTO_EN; /* Enable CLK_AUTO when the clock is greater than 400K. */ if (clk > 400000) { - val = sdhci_readl(host, SDHCI_SPRD_REG_32_BUSY_POSI); - mask = SDHCI_SPRD_BIT_OUTR_CLK_AUTO_EN | - SDHCI_SPRD_BIT_INNR_CLK_AUTO_EN; if (mask != (val & mask)) { val |= mask; sdhci_writel(host, val, SDHCI_SPRD_REG_32_BUSY_POSI); } + } else { + if (val & mask) { + val &= ~mask; + sdhci_writel(host, val, SDHCI_SPRD_REG_32_BUSY_POSI); + } } } From 397f719037c2ffd5f0fb27787b7a604d5f309c77 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 18 Jan 2023 12:14:01 +0100 Subject: [PATCH 142/151] genirq/affinity: Only build SMP-only helper functions on SMP kernels MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 188a569658584e93930ab60334c5a1079c0330d8 upstream. 
allnoconfig grew these new build warnings in lib/group_cpus.c:

 lib/group_cpus.c:247:12: warning: ‘__group_cpus_evenly’ defined but not used [-Wunused-function]
 lib/group_cpus.c:75:13: warning: ‘build_node_to_cpumask’ defined but not used [-Wunused-function]
 lib/group_cpus.c:66:13: warning: ‘free_node_to_cpumask’ defined but not used [-Wunused-function]
 lib/group_cpus.c:43:23: warning: ‘alloc_node_to_cpumask’ defined but not used [-Wunused-function]

Widen the #ifdef CONFIG_SMP block to not expose unused helpers on
non-SMP builds. Also annotate the preprocessor branches for better
readability.

Fixes: f7b3ea8cf72f ("genirq/affinity: Move group_cpus_evenly() into lib/")
Cc: Ming Lei
Cc: Thomas Gleixner
Link: https://lore.kernel.org/r/20221227022905.352674-6-ming.lei@redhat.com
Signed-off-by: Ingo Molnar
Signed-off-by: Greg Kroah-Hartman
---
 lib/group_cpus.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/lib/group_cpus.c b/lib/group_cpus.c
index 156b1446d2a2..0292611901b8 100644
--- a/lib/group_cpus.c
+++ b/lib/group_cpus.c
@@ -9,6 +9,8 @@
 #include
 #include
 
+#ifdef CONFIG_SMP
+
 static void grp_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
 				unsigned int cpus_per_grp)
 {
@@ -327,7 +329,6 @@ static int __group_cpus_evenly(unsigned int startgrp, unsigned int numgrps,
 	return done;
 }
 
-#ifdef CONFIG_SMP
 /**
  * group_cpus_evenly - Group all CPUs evenly per NUMA/CPU locality
  * @numgrps: number of groups
@@ -422,7 +423,7 @@ struct cpumask *group_cpus_evenly(unsigned int numgrps)
 	}
 	return masks;
 }
-#else
+#else /* CONFIG_SMP */
 struct cpumask *group_cpus_evenly(unsigned int numgrps)
 {
 	struct cpumask *masks = kcalloc(numgrps, sizeof(*masks), GFP_KERNEL);
@@ -434,4 +435,4 @@ struct cpumask *group_cpus_evenly(unsigned int numgrps)
 	cpumask_copy(&masks[0], cpu_possible_mask);
 	return masks;
 }
-#endif
+#endif /* CONFIG_SMP */

From 87318b7e374cbed6d4353748791faea605535d58 Mon Sep 17 00:00:00 2001
From: Chao Yu
Date: Mon, 21 Aug 2023 23:22:25 +0800
Subject: [PATCH 143/151] f2fs: compress: fix to assign compress_level for lz4
 correctly

commit 091a4dfbb1d32b06c031edbfe2a44af100c4604f upstream.

After remount, F2FS_OPTION().compress_level was incorrectly assigned
LZ4HC_DEFAULT_CLEVEL, resulting in lz4hc:9 being enabled; fix it.

1. mount /dev/vdb
/dev/vdb on /mnt/f2fs type f2fs (...,compress_algorithm=lz4,compress_log_size=2,...)
2. mount -t f2fs -o remount,compress_log_size=3 /mnt/f2fs/
3. mount|grep f2fs
/dev/vdb on /mnt/f2fs type f2fs (...,compress_algorithm=lz4:9,compress_log_size=3,...)

Fixes: 00e120b5e4b5 ("f2fs: assign default compression level")
Signed-off-by: Chao Yu
Signed-off-by: Jaegeuk Kim
Signed-off-by: Greg Kroah-Hartman
---
 fs/f2fs/super.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 584fe00fdeeb..3805162dcef2 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -615,7 +615,7 @@ static int f2fs_set_lz4hc_level(struct f2fs_sb_info *sbi, const char *str)
 	unsigned int level;
 
 	if (strlen(str) == 3) {
-		F2FS_OPTION(sbi).compress_level = LZ4HC_DEFAULT_CLEVEL;
+		F2FS_OPTION(sbi).compress_level = 0;
 		return 0;
 	}

From 2be4e8ac2d167a89858fa2faec8fd13f02626fc0 Mon Sep 17 00:00:00 2001
From: Vlad Buslov
Date: Tue, 24 Oct 2023 21:58:57 +0200
Subject: [PATCH 144/151] net/sched: act_ct: additional checks for outdated
 flows

commit a63b6622120cd03a304796dbccb80655b3a21798 upstream.
The current nf_flow_is_outdated() implementation considers for teardown
any flow table flow whose state diverged from its underlying CT
connection status, which can be problematic in the following cases:

- Flow has never been offloaded to hardware in the first place, either
  because the flow table has hardware offload disabled (flag
  NF_FLOWTABLE_HW_OFFLOAD is not set) or because it is still pending on
  the 'add' workqueue to be offloaded for the first time. The former is
  incorrect; the latter generates excessive deletions and additions of
  flows.

- Flow is already pending to be updated on the workqueue. Tearing down
  such flows will also generate excessive removals from the flow table,
  especially on a highly loaded system where the latency to re-offload
  a flow via the 'add' workqueue can be quite high.

When considering a flow for teardown as outdated, verify that it is
both offloaded to hardware and doesn't have any pending updates.

Fixes: 41f2c7c342d3 ("net/sched: act_ct: Fix promotion of offloaded unreplied tuple")
Reviewed-by: Paul Blakey
Signed-off-by: Vlad Buslov
Signed-off-by: Pablo Neira Ayuso
Signed-off-by: Greg Kroah-Hartman
---
 net/sched/act_ct.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
index faf798133059..4257b38c8b3e 100644
--- a/net/sched/act_ct.c
+++ b/net/sched/act_ct.c
@@ -277,6 +277,8 @@ err_nat:
 static bool tcf_ct_flow_is_outdated(const struct flow_offload *flow)
 {
 	return test_bit(IPS_SEEN_REPLY_BIT, &flow->ct->status) &&
+	       test_bit(IPS_HW_OFFLOAD_BIT, &flow->ct->status) &&
+	       !test_bit(NF_FLOW_HW_PENDING, &flow->flags) &&
 	       !test_bit(NF_FLOW_HW_ESTABLISHED, &flow->flags);
 }

From 7cbdf36eabf3dfca4a19c0b4932ae62bd743e416 Mon Sep 17 00:00:00 2001
From: Vlad Buslov
Date: Fri, 3 Nov 2023 16:14:10 +0100
Subject: [PATCH 145/151] net/sched: act_ct: Always fill offloading tuple iifidx

commit 9bc64bd0cd765f696fcd40fc98909b1f7c73b2ba upstream.

The referenced commit doesn't always set iifidx when offloading the
flow to hardware. Fix the following cases:

- nf_conn_act_ct_ext_fill() is called before the extension is created
  with nf_conn_act_ct_ext_add() in tcf_ct_act(). This can cause rule
  offload with unspecified iifidx when a connection is offloaded after
  only a single original-direction packet has been processed by the tc
  data path. Always fill the new nf_conn_act_ct_ext instance after
  creating it in nf_conn_act_ct_ext_add().

- Offloading of unidirectional UDP NEW connections is now supported,
  but the ct flow iifidx field is not updated when a connection is
  promoted to bidirectional, which can result in the reply-direction
  iifidx being zero when refreshing the connection. Fill in the
  extension and update the flow iifidx before calling
  flow_offload_refresh().
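For readability, the net effect on the refresh path can be sketched as
follows (an editorial sketch distilled from the act_ct.c hunks below;
only functions that appear in the diff are used):

	/* in tcf_ct_flow_table_lookup(), once ctinfo is known */
	nf_conn_act_ct_ext_fill(skb, ct, ctinfo);	/* record iifidx for this direction */
	tcf_ct_flow_ct_ext_ifidx_update(flow);		/* copy both directions into the offload tuple */
	flow_offload_refresh(nf_ft, flow, force_refresh);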
Fixes: 9795ded7f924 ("net/sched: act_ct: Fill offloading tuple iifidx")
Reviewed-by: Paul Blakey
Signed-off-by: Vlad Buslov
Reviewed-by: Simon Horman
Fixes: 6a9bad0069cf ("net/sched: act_ct: offload UDP NEW connections")
Link: https://lore.kernel.org/r/20231103151410.764271-1-vladbu@nvidia.com
Signed-off-by: Jakub Kicinski
Signed-off-by: Greg Kroah-Hartman
---
 include/net/netfilter/nf_conntrack_act_ct.h | 34 ++++++++++++---------
 net/openvswitch/conntrack.c                 |  2 +-
 net/sched/act_ct.c                          | 15 ++++++++-
 3 files changed, 34 insertions(+), 17 deletions(-)

diff --git a/include/net/netfilter/nf_conntrack_act_ct.h b/include/net/netfilter/nf_conntrack_act_ct.h
index 078d3c52c03f..e5f2f0b73a9a 100644
--- a/include/net/netfilter/nf_conntrack_act_ct.h
+++ b/include/net/netfilter/nf_conntrack_act_ct.h
@@ -20,21 +20,6 @@ static inline struct nf_conn_act_ct_ext *nf_conn_act_ct_ext_find(const struct nf
 #endif
 }
 
-static inline struct nf_conn_act_ct_ext *nf_conn_act_ct_ext_add(struct nf_conn *ct)
-{
-#if IS_ENABLED(CONFIG_NET_ACT_CT)
-	struct nf_conn_act_ct_ext *act_ct = nf_ct_ext_find(ct, NF_CT_EXT_ACT_CT);
-
-	if (act_ct)
-		return act_ct;
-
-	act_ct = nf_ct_ext_add(ct, NF_CT_EXT_ACT_CT, GFP_ATOMIC);
-	return act_ct;
-#else
-	return NULL;
-#endif
-}
-
 static inline void nf_conn_act_ct_ext_fill(struct sk_buff *skb, struct nf_conn *ct,
					    enum ip_conntrack_info ctinfo)
 {
@@ -47,4 +32,23 @@ static inline void nf_conn_act_ct_ext_fill(struct sk_buff *skb, struct nf_conn *
 #endif
 }
 
+static inline struct
+nf_conn_act_ct_ext *nf_conn_act_ct_ext_add(struct sk_buff *skb,
+					   struct nf_conn *ct,
+					   enum ip_conntrack_info ctinfo)
+{
+#if IS_ENABLED(CONFIG_NET_ACT_CT)
+	struct nf_conn_act_ct_ext *act_ct = nf_ct_ext_find(ct, NF_CT_EXT_ACT_CT);
+
+	if (act_ct)
+		return act_ct;
+
+	act_ct = nf_ct_ext_add(ct, NF_CT_EXT_ACT_CT, GFP_ATOMIC);
+	nf_conn_act_ct_ext_fill(skb, ct, ctinfo);
+	return act_ct;
+#else
+	return NULL;
+#endif
+}
+
 #endif /* _NF_CONNTRACK_ACT_CT_H */
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index c8eaf4234b2e..0591cfb289d5 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -1252,7 +1252,7 @@ static int ovs_ct_commit(struct net *net, struct sw_flow_key *key,
 		if (err)
 			return err;
 
-		nf_conn_act_ct_ext_add(ct);
+		nf_conn_act_ct_ext_add(skb, ct, ctinfo);
 	} else if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) &&
 		   labels_nonzero(&info->labels.mask)) {
 		err = ovs_ct_set_labels(ct, key, &info->labels.value,
diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
index 4257b38c8b3e..d6d33f854050 100644
--- a/net/sched/act_ct.c
+++ b/net/sched/act_ct.c
@@ -394,6 +394,17 @@ static void tcf_ct_flow_tc_ifidx(struct flow_offload *entry,
 	entry->tuplehash[dir].tuple.tc.iifidx = act_ct_ext->ifindex[dir];
 }
 
+static void tcf_ct_flow_ct_ext_ifidx_update(struct flow_offload *entry)
+{
+	struct nf_conn_act_ct_ext *act_ct_ext;
+
+	act_ct_ext = nf_conn_act_ct_ext_find(entry->ct);
+	if (act_ct_ext) {
+		tcf_ct_flow_tc_ifidx(entry, act_ct_ext, FLOW_OFFLOAD_DIR_ORIGINAL);
+		tcf_ct_flow_tc_ifidx(entry, act_ct_ext, FLOW_OFFLOAD_DIR_REPLY);
+	}
+}
+
 static void tcf_ct_flow_table_add(struct tcf_ct_flow_table *ct_ft,
 				  struct nf_conn *ct,
 				  bool tcp, bool bidirectional)
@@ -689,6 +700,8 @@ static bool tcf_ct_flow_table_lookup(struct tcf_ct_params *p,
 	else
 		ctinfo = IP_CT_ESTABLISHED_REPLY;
 
+	nf_conn_act_ct_ext_fill(skb, ct, ctinfo);
+	tcf_ct_flow_ct_ext_ifidx_update(flow);
 	flow_offload_refresh(nf_ft, flow, force_refresh);
 	if (!test_bit(IPS_ASSURED_BIT, &ct->status)) {
 		/* Process this flow in SW to allow promoting to ASSURED */
@@ -1191,7 +1204,7 @@ do_nat:
 		tcf_ct_act_set_labels(ct, p->labels, p->labels_mask);
 
 		if (!nf_ct_is_confirmed(ct))
-			nf_conn_act_ct_ext_add(ct);
+			nf_conn_act_ct_ext_add(skb, ct, ctinfo);
 
 		/* This will take care of sending queued events
 		 * even if the connection is already confirmed.

From 15db682980fc0d438a1706f20343ebdd01325356 Mon Sep 17 00:00:00 2001
From: Yonghong Song
Date: Thu, 30 Nov 2023 18:46:40 -0800
Subject: [PATCH 146/151] bpf: Fix a verifier bug due to incorrect branch offset comparison with cpu=v4

commit dfce9cb3140592b886838e06f3e0c25fea2a9cae upstream.

Bpf cpu=v4 support is introduced in [1] and commit 4cd58e9af8b9 ("bpf:
Support new 32bit offset jmp instruction") added support for the new
32bit offset jmp instruction. Unfortunately, in function
bpf_adj_delta_to_off(), for the new branch insn with a 32bit offset,
the offset (plus/minus a small delta) is compared to the 16-bit offset
bound [S16_MIN, S16_MAX], which caused the following verification
failure:

  $ ./test_progs-cpuv4 -t verif_scale_pyperf180
  ...
  insn 10 cannot be patched due to 16-bit range
  ...
  libbpf: failed to load object 'pyperf180.bpf.o'
  scale_test:FAIL:expect_success unexpected error: -12 (errno 12)
  #405     verif_scale_pyperf180:FAIL

Note that due to recent llvm18 development, the patch [2] (already
applied in bpf-next) needs to be applied to the bpf tree for testing
purposes.

The fix is rather simple. For a 32bit offset branch insn, the adjusted
offset is compared to [S32_MIN, S32_MAX], and verification then
succeeds.

[1] https://lore.kernel.org/all/20230728011143.3710005-1-yonghong.song@linux.dev
[2] https://lore.kernel.org/bpf/20231110193644.3130906-1-yonghong.song@linux.dev

Fixes: 4cd58e9af8b9 ("bpf: Support new 32bit offset jmp instruction")
Signed-off-by: Yonghong Song
Signed-off-by: Andrii Nakryiko
Link: https://lore.kernel.org/bpf/20231201024640.3417057-1-yonghong.song@linux.dev
Signed-off-by: Greg Kroah-Hartman
---
 kernel/bpf/core.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 0b55ebf4a9b1..76bf1de26115 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -365,14 +365,18 @@ static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, s32 end_old,
 static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, s32 end_old,
				s32 end_new, s32 curr, const bool probe_pass)
 {
-	const s32 off_min = S16_MIN, off_max = S16_MAX;
+	s64 off_min, off_max, off;
 	s32 delta = end_new - end_old;
-	s32 off;
 
-	if (insn->code == (BPF_JMP32 | BPF_JA))
+	if (insn->code == (BPF_JMP32 | BPF_JA)) {
 		off = insn->imm;
-	else
+		off_min = S32_MIN;
+		off_max = S32_MAX;
+	} else {
 		off = insn->off;
+		off_min = S16_MIN;
+		off_max = S16_MAX;
+	}
 
 	if (curr < pos && curr + off + 1 >= end_old)
 		off += delta;

From a5c3f2b4cee7ada7b8015129fbe52f0c2f2119ed Mon Sep 17 00:00:00 2001
From: John Fastabend
Date: Fri, 1 Dec 2023 10:01:38 -0800
Subject: [PATCH 147/151] bpf: syzkaller found null ptr deref in unix_bpf proto add

commit 8d6650646ce49e9a5b8c5c23eb94f74b1749f70f upstream.

I added logic to track the sock pair for stream_unix sockets so that we
ensure the lifetime of the sock matches the time a sockmap could
reference the sock (see fixes tag). I forgot though that we allow
af_unix unconnected sockets into a sock{map|hash} map. This is
problematic because the previous fix expected sk_pair() to exist and
did not NULL check it. Because unconnected sockets have a NULL sk_pair,
this resulted in the NULL ptr dereference found by syzkaller.
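A hypothetical userspace sketch of the trigger (map_fd is assumed to be
the fd of an existing BPF_MAP_TYPE_SOCKMAP; error handling omitted):

	#include <sys/socket.h>
	#include <bpf/bpf.h>

	int key = 0;
	/* Unconnected AF_UNIX stream socket: its peer (sk_pair) is NULL. */
	int s = socket(AF_UNIX, SOCK_STREAM, 0);

	/* Before this fix, the update reaches
	 * unix_stream_bpf_update_proto() and dereferences the NULL peer;
	 * with the fix, the update is rejected because the socket is not
	 * ESTABLISHED.
	 */
	bpf_map_update_elem(map_fd, &key, &s, BPF_ANY);

The reported oops: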
BUG: KASAN: null-ptr-deref in unix_stream_bpf_update_proto+0x72/0x430 net/unix/unix_bpf.c:171
Write of size 4 at addr 0000000000000080 by task syz-executor360/5073
Call Trace:
 ...
 sock_hold include/net/sock.h:777 [inline]
 unix_stream_bpf_update_proto+0x72/0x430 net/unix/unix_bpf.c:171
 sock_map_init_proto net/core/sock_map.c:190 [inline]
 sock_map_link+0xb87/0x1100 net/core/sock_map.c:294
 sock_map_update_common+0xf6/0x870 net/core/sock_map.c:483
 sock_map_update_elem_sys+0x5b6/0x640 net/core/sock_map.c:577
 bpf_map_update_value+0x3af/0x820 kernel/bpf/syscall.c:167

We considered just checking for the null ptr and skipping taking a ref
on the NULL peer sock. But, if the socket is then connected() after
being added to the sockmap, we can cause the original issue again. So
instead this patch blocks adding af_unix sockets that are not in the
ESTABLISHED state.

Reported-by: Eric Dumazet
Reported-by: syzbot+e8030702aefd3444fb9e@syzkaller.appspotmail.com
Fixes: 8866730aed51 ("bpf, sockmap: af_unix stream sockets need to hold ref for pair sock")
Acked-by: Jakub Sitnicki
Signed-off-by: John Fastabend
Link: https://lore.kernel.org/r/20231201180139.328529-2-john.fastabend@gmail.com
Signed-off-by: Martin KaFai Lau
Signed-off-by: Greg Kroah-Hartman
---
 include/net/sock.h  | 5 +++++
 net/core/sock_map.c | 2 ++
 2 files changed, 7 insertions(+)

diff --git a/include/net/sock.h b/include/net/sock.h
index 9de9f070537c..6b51e85ae69e 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2829,6 +2829,11 @@ static inline bool sk_is_tcp(const struct sock *sk)
 	return sk->sk_type == SOCK_STREAM && sk->sk_protocol == IPPROTO_TCP;
 }
 
+static inline bool sk_is_stream_unix(const struct sock *sk)
+{
+	return sk->sk_family == AF_UNIX && sk->sk_type == SOCK_STREAM;
+}
+
 /**
  * sk_eat_skb - Release a skb if it is no longer needed
  * @sk: socket to eat this skb from
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index 38e01f82f2ef..91140bc0541f 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -538,6 +538,8 @@ static bool sock_map_sk_state_allowed(const struct sock *sk)
 {
 	if (sk_is_tcp(sk))
 		return (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_LISTEN);
+	if (sk_is_stream_unix(sk))
+		return (1 << sk->sk_state) & TCPF_ESTABLISHED;
 	return true;
 }

From ec162546a73387c3fb0558e13e469cffa94efb5b Mon Sep 17 00:00:00 2001
From: Bryan O'Donoghue
Date: Thu, 28 Sep 2023 01:58:25 +0100
Subject: [PATCH 148/151] media: qcom: camss: Comment CSID dt_id field

commit f910d3ba78a2677c23508f225eb047d89eb4b2b6 upstream.

Digging into the documentation we find that the DT_ID bitfield is used
to map the six bit DT to a two bit ID code. This value is concatenated
to the VC bitfield to create a CID value. DT_ID is the two least
significant bits of CID and VC the most significant bits.

Originally we set dt_id = vc * 4 and then subsequently set dt_id = vc.

commit 3c4ed72a16bc ("media: camss: sm8250: Virtual channels for CSID")
silently fixed the multiplication by four, which would give a better
value for the generated CID, without mentioning what was being done or
why.

Next up I haplessly changed the value back to "dt_id = vc * 4" since
there didn't appear to be any logic behind it.

Hans asked what the change was for and I honestly couldn't remember the
provenance of it, so I dug in.

Link: https://lore.kernel.org/linux-arm-msm/edd4bf9b-0e1b-883c-1a4d-50f4102c3924@xs4all.nl/

Add a comment so the next hapless programmer doesn't make this same
mistake.
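As a worked example of the mapping described in the new comment (an
editorial sketch, not part of the patch):

	u8 vc = 1;
	u8 dt_id = vc & 0x03;			/* DT_ID: the two LSBs of the CID  */
	u8 cid = ((vc & 0x0f) << 2) | dt_id;	/* CID = VC[3:0] << 2 | DT_ID[1:0] */
	/* vc = 1 -> dt_id = 1, cid = 0b000101; the old "dt_id = vc * 4"
	 * yields 4 here, which no longer fits in the two-bit DT_ID field.
	 */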
Signed-off-by: Bryan O'Donoghue
Signed-off-by: Hans Verkuil
Signed-off-by: Greg Kroah-Hartman
---
 .../media/platform/qcom/camss/camss-csid-gen2.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/drivers/media/platform/qcom/camss/camss-csid-gen2.c b/drivers/media/platform/qcom/camss/camss-csid-gen2.c
index 23acc387be5f..0147cc062e1a 100644
--- a/drivers/media/platform/qcom/camss/camss-csid-gen2.c
+++ b/drivers/media/platform/qcom/camss/camss-csid-gen2.c
@@ -352,7 +352,19 @@ static void __csid_configure_stream(struct csid_device *csid, u8 enable, u8 vc)
 		phy_sel = csid->phy.csiphy_id;
 
 	if (enable) {
-		u8 dt_id = vc;
+		/*
+		 * DT_ID is a two bit bitfield that is concatenated with
+		 * the four least significant bits of the five bit VC
+		 * bitfield to generate an internal CID value.
+		 *
+		 * CSID_RDI_CFG0(vc)
+		 * DT_ID : 28:27
+		 * VC    : 26:22
+		 * DT    : 21:16
+		 *
+		 * CID   : VC 3:0 << 2 | DT_ID 1:0
+		 */
+		u8 dt_id = vc & 0x03;
 
 		if (tg->enabled) {
 			/* configure one DT, infinite frames */

From f73a374c1969e4627938c3f0fe8925752622a4dd Mon Sep 17 00:00:00 2001
From: Kees Cook
Date: Fri, 17 Feb 2023 16:24:40 -0800
Subject: [PATCH 149/151] smb3: Replace smb2pdu 1-element arrays with flex-arrays

commit eb3e28c1e89b4984308777231887e41aa8a0151f upstream.

The kernel is globally removing the ambiguous 0-length and 1-element
arrays in favor of flexible arrays, so that we can gain both
compile-time and run-time array bounds checking[1].

Replace the trailing 1-element array with a flexible array in the
following structures:

	struct smb2_err_rsp
	struct smb2_tree_connect_req
	struct smb2_negotiate_rsp
	struct smb2_sess_setup_req
	struct smb2_sess_setup_rsp
	struct smb2_read_req
	struct smb2_read_rsp
	struct smb2_write_req
	struct smb2_write_rsp
	struct smb2_query_directory_req
	struct smb2_query_directory_rsp
	struct smb2_set_info_req
	struct smb2_change_notify_rsp
	struct smb2_create_rsp
	struct smb2_query_info_req
	struct smb2_query_info_rsp

Replace the trailing 1-element array with a flexible array, but leave
the existing structure padding:

	struct smb2_file_all_info
	struct smb2_lock_req

Adjust all related size calculations to match the changes to sizeof().

No machine code output or .data section differences are produced after
these changes.
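The direction of each size-calculation change below follows from one
property of sizeof(); a reduced illustration with hypothetical structs
(not taken from the patch):

	/* Old style: sizeof() already counts the single trailing element,
	 * so size math has to subtract it back out.
	 */
	struct demo_old { __le32 ByteCount; __u8 Buffer[1]; } __packed;
	/* alloc = sizeof(struct demo_old) - 1 + payload_len */

	/* New style: sizeof() covers only the fixed header. */
	struct demo_new { __le32 ByteCount; __u8 Buffer[]; } __packed;
	/* alloc = sizeof(struct demo_new) + payload_len */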
[1] For lots of details, see both:
    https://docs.kernel.org/process/deprecated.html#zero-length-and-one-element-arrays
    https://people.kernel.org/kees/bounded-flexible-arrays-in-c

Cc: Steve French
Cc: Paulo Alcantara
Cc: Ronnie Sahlberg
Cc: Shyam Prasad N
Cc: Tom Talpey
Cc: Namjae Jeon
Cc: Sergey Senozhatsky
Cc: linux-cifs@vger.kernel.org
Cc: samba-technical@lists.samba.org
Reviewed-by: Namjae Jeon
Signed-off-by: Kees Cook
Signed-off-by: Steve French
Signed-off-by: Greg Kroah-Hartman
---
 fs/smb/client/smb2file.c |  2 +-
 fs/smb/client/smb2misc.c |  2 +-
 fs/smb/client/smb2ops.c  | 14 +++++++-------
 fs/smb/client/smb2pdu.c  | 16 +++++++--------
 fs/smb/client/smb2pdu.h  |  2 +-
 fs/smb/common/smb2pdu.h  | 42 +++++++++++++++++++++++-----------------
 fs/smb/server/smb2ops.c  |  8 ++++----
 fs/smb/server/smb2pdu.c  |  8 +++-----
 8 files changed, 48 insertions(+), 46 deletions(-)

diff --git a/fs/smb/client/smb2file.c b/fs/smb/client/smb2file.c
index ba6cc50af390..a7475bc05cac 100644
--- a/fs/smb/client/smb2file.c
+++ b/fs/smb/client/smb2file.c
@@ -34,7 +34,7 @@ static struct smb2_symlink_err_rsp *symlink_data(const struct kvec *iov)
 		len = (u32)err->ErrorContextCount * (offsetof(struct smb2_error_context_rsp,
							      ErrorContextData) +
						     sizeof(struct smb2_symlink_err_rsp));
-		if (le32_to_cpu(err->ByteCount) < len || iov->iov_len < len + sizeof(*err))
+		if (le32_to_cpu(err->ByteCount) < len || iov->iov_len < len + sizeof(*err) + 1)
 			return ERR_PTR(-EINVAL);
 
 		p = (struct smb2_error_context_rsp *)err->ErrorData;
diff --git a/fs/smb/client/smb2misc.c b/fs/smb/client/smb2misc.c
index 88942b1fb431..fdf7a7f188c5 100644
--- a/fs/smb/client/smb2misc.c
+++ b/fs/smb/client/smb2misc.c
@@ -113,7 +113,7 @@ static __u32 get_neg_ctxt_len(struct smb2_hdr *hdr, __u32 len,
 	} else if (nc_offset + 1 == non_ctxlen) {
 		cifs_dbg(FYI, "no SPNEGO security blob in negprot rsp\n");
 		size_of_pad_before_neg_ctxts = 0;
-	} else if (non_ctxlen == SMB311_NEGPROT_BASE_SIZE)
+	} else if (non_ctxlen == SMB311_NEGPROT_BASE_SIZE + 1)
 		/* has padding, but no SPNEGO blob */
 		size_of_pad_before_neg_ctxts = nc_offset - non_ctxlen + 1;
 	else
diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c
index 285d360eb59a..4596d2dfdec3 100644
--- a/fs/smb/client/smb2ops.c
+++ b/fs/smb/client/smb2ops.c
@@ -5764,7 +5764,7 @@ struct smb_version_values smb20_values = {
 	.header_size = sizeof(struct smb2_hdr),
 	.header_preamble_size = 0,
 	.max_header_size = MAX_SMB2_HDR_SIZE,
-	.read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
+	.read_rsp_size = sizeof(struct smb2_read_rsp),
 	.lock_cmd = SMB2_LOCK,
 	.cap_unix = 0,
 	.cap_nt_find = SMB2_NT_FIND,
@@ -5786,7 +5786,7 @@ struct smb_version_values smb21_values = {
 	.header_size = sizeof(struct smb2_hdr),
 	.header_preamble_size = 0,
 	.max_header_size = MAX_SMB2_HDR_SIZE,
-	.read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
+	.read_rsp_size = sizeof(struct smb2_read_rsp),
 	.lock_cmd = SMB2_LOCK,
 	.cap_unix = 0,
 	.cap_nt_find = SMB2_NT_FIND,
@@ -5807,7 +5807,7 @@ struct smb_version_values smb3any_values = {
 	.header_size = sizeof(struct smb2_hdr),
 	.header_preamble_size = 0,
 	.max_header_size = MAX_SMB2_HDR_SIZE,
-	.read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
+	.read_rsp_size = sizeof(struct smb2_read_rsp),
 	.lock_cmd = SMB2_LOCK,
 	.cap_unix = 0,
 	.cap_nt_find = SMB2_NT_FIND,
@@ -5828,7 +5828,7 @@ struct smb_version_values smbdefault_values = {
 	.header_size = sizeof(struct smb2_hdr),
 	.header_preamble_size = 0,
 	.max_header_size = MAX_SMB2_HDR_SIZE,
-	.read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
+	.read_rsp_size = sizeof(struct smb2_read_rsp),
 	.lock_cmd = SMB2_LOCK,
 	.cap_unix = 0,
 	.cap_nt_find = SMB2_NT_FIND,
@@ -5849,7 +5849,7 @@ struct smb_version_values smb30_values = {
 	.header_size = sizeof(struct smb2_hdr),
 	.header_preamble_size = 0,
 	.max_header_size = MAX_SMB2_HDR_SIZE,
-	.read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
+	.read_rsp_size = sizeof(struct smb2_read_rsp),
 	.lock_cmd = SMB2_LOCK,
 	.cap_unix = 0,
 	.cap_nt_find = SMB2_NT_FIND,
@@ -5870,7 +5870,7 @@ struct smb_version_values smb302_values = {
 	.header_size = sizeof(struct smb2_hdr),
 	.header_preamble_size = 0,
 	.max_header_size = MAX_SMB2_HDR_SIZE,
-	.read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
+	.read_rsp_size = sizeof(struct smb2_read_rsp),
 	.lock_cmd = SMB2_LOCK,
 	.cap_unix = 0,
 	.cap_nt_find = SMB2_NT_FIND,
@@ -5891,7 +5891,7 @@ struct smb_version_values smb311_values = {
 	.header_size = sizeof(struct smb2_hdr),
 	.header_preamble_size = 0,
 	.max_header_size = MAX_SMB2_HDR_SIZE,
-	.read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
+	.read_rsp_size = sizeof(struct smb2_read_rsp),
 	.lock_cmd = SMB2_LOCK,
 	.cap_unix = 0,
 	.cap_nt_find = SMB2_NT_FIND,
diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c
index 05ff8a457a3d..2dfbf1b23cfa 100644
--- a/fs/smb/client/smb2pdu.c
+++ b/fs/smb/client/smb2pdu.c
@@ -1386,7 +1386,7 @@ SMB2_sess_sendreceive(struct SMB2_sess_data *sess_data)
 
 	/* Testing shows that buffer offset must be at location of Buffer[0] */
 	req->SecurityBufferOffset =
-		cpu_to_le16(sizeof(struct smb2_sess_setup_req) - 1 /* pad */);
+		cpu_to_le16(sizeof(struct smb2_sess_setup_req));
 	req->SecurityBufferLength = cpu_to_le16(sess_data->iov[1].iov_len);
 
 	memset(&rqst, 0, sizeof(struct smb_rqst));
@@ -1905,8 +1905,7 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree,
 	iov[0].iov_len = total_len - 1;
 
 	/* Testing shows that buffer offset must be at location of Buffer[0] */
-	req->PathOffset = cpu_to_le16(sizeof(struct smb2_tree_connect_req)
-			  - 1 /* pad */);
+	req->PathOffset = cpu_to_le16(sizeof(struct smb2_tree_connect_req));
 	req->PathLength = cpu_to_le16(unc_path_len - 2);
 	iov[1].iov_base = unc_path;
 	iov[1].iov_len = unc_path_len;
@@ -3796,7 +3795,7 @@ SMB2_change_notify(const unsigned int xid, struct cifs_tcon *tcon,
 			ses->Suid, (u8)watch_tree, completion_filter);
 	/* validate that notify information is plausible */
 	if ((rsp_iov.iov_base == NULL) ||
-	    (rsp_iov.iov_len < sizeof(struct smb2_change_notify_rsp)))
+	    (rsp_iov.iov_len < sizeof(struct smb2_change_notify_rsp) + 1))
 		goto cnotify_exit;
 
 	smb_rsp = (struct smb2_change_notify_rsp *)rsp_iov.iov_base;
@@ -5009,7 +5008,7 @@ int SMB2_query_directory_init(const unsigned int xid,
 	memcpy(bufptr, &asteriks, len);
 
 	req->FileNameOffset =
-		cpu_to_le16(sizeof(struct smb2_query_directory_req) - 1);
+		cpu_to_le16(sizeof(struct smb2_query_directory_req));
 	req->FileNameLength = cpu_to_le16(len);
 	/*
 	 * BB could be 30 bytes or so longer if we used SMB2 specific
@@ -5205,8 +5204,7 @@ SMB2_set_info_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server,
 	req->VolatileFileId = volatile_fid;
 	req->AdditionalInformation = cpu_to_le32(additional_info);
 
-	req->BufferOffset =
-		cpu_to_le16(sizeof(struct smb2_set_info_req) - 1);
+	req->BufferOffset = cpu_to_le16(sizeof(struct smb2_set_info_req));
 	req->BufferLength = cpu_to_le32(*size);
 	memcpy(req->Buffer, *data, *size);
@@ -5440,9 +5438,9 @@ build_qfs_info_req(struct kvec *iov, struct cifs_tcon *tcon,
 	req->VolatileFileId = volatile_fid;
 	/* 1 for pad */
 	req->InputBufferOffset =
-		cpu_to_le16(sizeof(struct smb2_query_info_req) - 1);
+		cpu_to_le16(sizeof(struct smb2_query_info_req));
 	req->OutputBufferLength = cpu_to_le32(
-		outbuf_len + sizeof(struct smb2_query_info_rsp) - 1);
+		outbuf_len + sizeof(struct smb2_query_info_rsp));
 
 	iov->iov_base = (char *)req;
 	iov->iov_len = total_len;
diff --git a/fs/smb/client/smb2pdu.h b/fs/smb/client/smb2pdu.h
index 1237bb86e93a..a5773a06aba8 100644
--- a/fs/smb/client/smb2pdu.h
+++ b/fs/smb/client/smb2pdu.h
@@ -57,7 +57,7 @@ struct smb2_rdma_crypto_transform {
 #define COMPOUND_FID 0xFFFFFFFFFFFFFFFFULL
 
 #define SMB2_SYMLINK_STRUCT_SIZE \
-	(sizeof(struct smb2_err_rsp) - 1 + sizeof(struct smb2_symlink_err_rsp))
+	(sizeof(struct smb2_err_rsp) + sizeof(struct smb2_symlink_err_rsp))
 
 #define SYMLINK_ERROR_TAG 0x4c4d5953
diff --git a/fs/smb/common/smb2pdu.h b/fs/smb/common/smb2pdu.h
index 07549957b309..5593bb49954c 100644
--- a/fs/smb/common/smb2pdu.h
+++ b/fs/smb/common/smb2pdu.h
@@ -189,7 +189,7 @@ struct smb2_err_rsp {
 	__u8   ErrorContextCount;
 	__u8   Reserved;
 	__le32 ByteCount;  /* even if zero, at least one byte follows */
-	__u8   ErrorData[1];  /* variable length */
+	__u8   ErrorData[];  /* variable length */
 } __packed;
 
 #define SMB3_AES_CCM_NONCE 11
@@ -330,7 +330,7 @@ struct smb2_tree_connect_req {
 	__le16 Flags;	/* Flags in SMB3.1.1 */
 	__le16 PathOffset;
 	__le16 PathLength;
-	__u8   Buffer[1];	/* variable length */
+	__u8   Buffer[];	/* variable length */
 } __packed;
 
 /* Possible ShareType values */
@@ -617,7 +617,7 @@ struct smb2_negotiate_rsp {
 	__le16 SecurityBufferOffset;
 	__le16 SecurityBufferLength;
 	__le32 NegotiateContextOffset;	/* Pre:SMB3.1.1 was reserved/ignored */
-	__u8   Buffer[1];	/* variable length GSS security buffer */
+	__u8   Buffer[];	/* variable length GSS security buffer */
 } __packed;
 
@@ -638,7 +638,7 @@ struct smb2_sess_setup_req {
 	__le16 SecurityBufferOffset;
 	__le16 SecurityBufferLength;
 	__le64 PreviousSessionId;
-	__u8   Buffer[1];	/* variable length GSS security buffer */
+	__u8   Buffer[];	/* variable length GSS security buffer */
 } __packed;
 
 /* Currently defined SessionFlags */
@@ -655,7 +655,7 @@ struct smb2_sess_setup_rsp {
 	__le16 SessionFlags;
 	__le16 SecurityBufferOffset;
 	__le16 SecurityBufferLength;
-	__u8   Buffer[1];	/* variable length GSS security buffer */
+	__u8   Buffer[];	/* variable length GSS security buffer */
 } __packed;
 
@@ -737,7 +737,7 @@ struct smb2_read_req {
 	__le32 RemainingBytes;
 	__le16 ReadChannelInfoOffset;
 	__le16 ReadChannelInfoLength;
-	__u8   Buffer[1];
+	__u8   Buffer[];
 } __packed;
 
 /* Read flags */
@@ -752,7 +752,7 @@ struct smb2_read_rsp {
 	__le32 DataLength;
 	__le32 DataRemaining;
 	__le32 Flags;
-	__u8   Buffer[1];
+	__u8   Buffer[];
 } __packed;
 
@@ -776,7 +776,7 @@ struct smb2_write_req {
 	__le16 WriteChannelInfoOffset;
 	__le16 WriteChannelInfoLength;
 	__le32 Flags;
-	__u8   Buffer[1];
+	__u8   Buffer[];
 } __packed;
 
 struct smb2_write_rsp {
@@ -787,7 +787,7 @@ struct smb2_write_rsp {
 	__le32 DataLength;
 	__le32 DataRemaining;
 	__u32  Reserved2;
-	__u8   Buffer[1];
+	__u8   Buffer[];
 } __packed;
 
@@ -834,7 +834,10 @@ struct smb2_lock_req {
 	__u64 PersistentFileId;
 	__u64 VolatileFileId;
 	/* Followed by at least one */
-	struct smb2_lock_element locks[1];
+	union {
+		struct smb2_lock_element lock;
+		DECLARE_FLEX_ARRAY(struct smb2_lock_element, locks);
+	};
 } __packed;
 
 struct smb2_lock_rsp {
@@ -888,7 +891,7 @@ struct smb2_query_directory_req {
 	__le16 FileNameOffset;
 	__le16 FileNameLength;
 	__le32 OutputBufferLength;
-	__u8   Buffer[1];
+	__u8   Buffer[];
 } __packed;
 
 struct smb2_query_directory_rsp {
@@ -896,7 +899,7 @@
 	__le16 StructureSize; /* Must be 9 */
 	__le16 OutputBufferOffset;
 	__le32 OutputBufferLength;
-	__u8   Buffer[1];
+	__u8   Buffer[];
 } __packed;
 
 /*
@@ -919,7 +922,7 @@ struct smb2_set_info_req {
 	__le32 AdditionalInformation;
 	__u64  PersistentFileId;
 	__u64  VolatileFileId;
-	__u8   Buffer[1];
+	__u8   Buffer[];
 } __packed;
 
 struct smb2_set_info_rsp {
@@ -974,7 +977,7 @@ struct smb2_change_notify_rsp {
 	__le16 StructureSize; /* Must be 9 */
 	__le16 OutputBufferOffset;
 	__le32 OutputBufferLength;
-	__u8   Buffer[1]; /* array of file notify structs */
+	__u8   Buffer[]; /* array of file notify structs */
 } __packed;
 
@@ -1180,7 +1183,7 @@ struct smb2_create_rsp {
 	__u64  VolatileFileId;
 	__le32 CreateContextsOffset;
 	__le32 CreateContextsLength;
-	__u8   Buffer[1];
+	__u8   Buffer[];
 } __packed;
 
 struct create_posix {
@@ -1524,7 +1527,7 @@ struct smb2_query_info_req {
 	__le32 Flags;
 	__u64  PersistentFileId;
 	__u64  VolatileFileId;
-	__u8   Buffer[1];
+	__u8   Buffer[];
 } __packed;
 
 struct smb2_query_info_rsp {
@@ -1532,7 +1535,7 @@ struct smb2_query_info_rsp {
 	__le16 StructureSize; /* Must be 9 */
 	__le16 OutputBufferOffset;
 	__le32 OutputBufferLength;
-	__u8   Buffer[1];
+	__u8   Buffer[];
 } __packed;
 
 /*
@@ -1593,7 +1596,10 @@ struct smb2_file_all_info { /* data block encoding of response to level 18 */
 	__le32 Mode;
 	__le32 AlignmentRequirement;
 	__le32 FileNameLength;
-	char   FileName[1];
+	union {
+		char __pad;	/* Legacy structure padding */
+		DECLARE_FLEX_ARRAY(char, FileName);
+	};
 } __packed; /* level 18 Query */
 
 struct smb2_file_eof_info { /* encoding of request for level 10 */
diff --git a/fs/smb/server/smb2ops.c b/fs/smb/server/smb2ops.c
index 535402629655..27a9dce3e03a 100644
--- a/fs/smb/server/smb2ops.c
+++ b/fs/smb/server/smb2ops.c
@@ -26,7 +26,7 @@ static struct smb_version_values smb21_server_values = {
 	.unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
 	.header_size = sizeof(struct smb2_hdr),
 	.max_header_size = MAX_SMB2_HDR_SIZE,
-	.read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
+	.read_rsp_size = sizeof(struct smb2_read_rsp),
 	.lock_cmd = SMB2_LOCK,
 	.cap_unix = 0,
 	.cap_nt_find = SMB2_NT_FIND,
@@ -52,7 +52,7 @@ static struct smb_version_values smb30_server_values = {
 	.unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
 	.header_size = sizeof(struct smb2_hdr),
 	.max_header_size = MAX_SMB2_HDR_SIZE,
-	.read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
+	.read_rsp_size = sizeof(struct smb2_read_rsp),
 	.lock_cmd = SMB2_LOCK,
 	.cap_unix = 0,
 	.cap_nt_find = SMB2_NT_FIND,
@@ -79,7 +79,7 @@ static struct smb_version_values smb302_server_values = {
 	.unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
 	.header_size = sizeof(struct smb2_hdr),
 	.max_header_size = MAX_SMB2_HDR_SIZE,
-	.read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
+	.read_rsp_size = sizeof(struct smb2_read_rsp),
 	.lock_cmd = SMB2_LOCK,
 	.cap_unix = 0,
 	.cap_nt_find = SMB2_NT_FIND,
@@ -106,7 +106,7 @@ static struct smb_version_values smb311_server_values = {
 	.unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
 	.header_size = sizeof(struct smb2_hdr),
 	.max_header_size = MAX_SMB2_HDR_SIZE,
-	.read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
+	.read_rsp_size = sizeof(struct smb2_read_rsp),
 	.lock_cmd = SMB2_LOCK,
 	.cap_unix = 0,
 	.cap_nt_find = SMB2_NT_FIND,
diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c
index ea48dd06d4da..6e5ed0ac578a 100644
--- a/fs/smb/server/smb2pdu.c
+++ b/fs/smb/server/smb2pdu.c
@@ -294,8 +294,7 @@ int init_smb2_neg_rsp(struct ksmbd_work *work)
 	if (server_conf.signing == KSMBD_CONFIG_OPT_MANDATORY)
 		rsp->SecurityMode |= SMB2_NEGOTIATE_SIGNING_REQUIRED_LE;
 	err = ksmbd_iov_pin_rsp(work, rsp,
-				sizeof(struct smb2_negotiate_rsp) -
-				sizeof(rsp->Buffer) + AUTH_GSS_LENGTH);
+				sizeof(struct smb2_negotiate_rsp) + AUTH_GSS_LENGTH);
 	if (err)
 		return err;
 	conn->use_spnego = true;
@@ -1263,9 +1262,8 @@ err_out:
 
 	if (!rc)
 		rc = ksmbd_iov_pin_rsp(work, rsp,
-				       sizeof(struct smb2_negotiate_rsp) -
-				       sizeof(rsp->Buffer) +
-				       AUTH_GSS_LENGTH + neg_ctxt_len);
+				       sizeof(struct smb2_negotiate_rsp) +
+				       AUTH_GSS_LENGTH + neg_ctxt_len);
 	if (rc < 0)
 		smb2_set_err_rsp(work);
 
 	return rc;

From 2dbe25ae06e65db0ceae0571c45ae644a893677e Mon Sep 17 00:00:00 2001
From: Amit Pundir
Date: Sun, 7 Jan 2024 21:27:02 +0530
Subject: [PATCH 150/151] Revert "interconnect: qcom: sm8250: Enable sync_state"

This reverts commit 3637f6bdfe2ccd53c493836b6e43c9a73e4513b3 which is
commit bfc7db1cb94ad664546d70212699f8cc6c539e8c upstream.

This resulted in a boot regression on RB5 (sm8250), causing the device
to hard crash into USB crash dump mode every time.

Signed-off-by: Amit Pundir
Link: https://lkft.validation.linaro.org/scheduler/job/7151629#L4239
Signed-off-by: Greg Kroah-Hartman
---
 drivers/interconnect/qcom/sm8250.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/interconnect/qcom/sm8250.c b/drivers/interconnect/qcom/sm8250.c
index 9c2dd40d9a55..5cdb058fa095 100644
--- a/drivers/interconnect/qcom/sm8250.c
+++ b/drivers/interconnect/qcom/sm8250.c
@@ -551,7 +551,6 @@ static struct platform_driver qnoc_driver = {
 	.driver = {
 		.name = "qnoc-sm8250",
 		.of_match_table = qnoc_of_match,
-		.sync_state = icc_sync_state,
 	},
 };
 
 module_platform_driver(qnoc_driver);

From 7c58bfa711cb556ef1edc48e7dfa6d84e5fb8912 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman
Date: Wed, 10 Jan 2024 17:10:37 +0100
Subject: [PATCH 151/151] Linux 6.1.72

Link: https://lore.kernel.org/r/20240108153511.214254205@linuxfoundation.org
Tested-by: SeongJae Park
Tested-by: Florian Fainelli
Tested-by: Allen Pais
Tested-by: Shuah Khan
Tested-by: Salvatore Bonaccorso
Tested-by: Linux Kernel Functional Testing
Tested-by: Conor Dooley
Tested-by: Jon Hunter
Tested-by: Sven Joachim
Tested-by: Ron Economos
Tested-by: Kelsey Steele
Tested-by: Pavel Machek (CIP)
Tested-by: Yann Sionneau
Tested-by: kernelci.org bot
Signed-off-by: Greg Kroah-Hartman
---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 2840e36fd559..bad3387b3251 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0
 VERSION = 6
 PATCHLEVEL = 1
-SUBLEVEL = 71
+SUBLEVEL = 72
 EXTRAVERSION =
 NAME = Curry Ramen