From eddee1c229c21ad1686547d50ebd0c835a20cd30 Mon Sep 17 00:00:00 2001 From: Gil Fine Date: Tue, 31 Jan 2023 13:04:52 +0200 Subject: [PATCH 001/179] thunderbolt: Limit USB3 bandwidth of certain Intel USB4 host routers [ Upstream commit f0a57dd33b3eadf540912cd130db727ea824d174 ] Current Intel USB4 host routers have hardware limitation that the USB3 bandwidth cannot go higher than 16376 Mb/s. Work this around by adding a new quirk that limits the bandwidth for the affected host routers. Cc: stable@vger.kernel.org Signed-off-by: Gil Fine Signed-off-by: Mika Westerberg Signed-off-by: Sasha Levin --- drivers/thunderbolt/quirks.c | 31 +++++++++++++++++++++++++++++++ drivers/thunderbolt/tb.h | 3 +++ drivers/thunderbolt/usb4.c | 17 +++++++++++++++-- 3 files changed, 49 insertions(+), 2 deletions(-) diff --git a/drivers/thunderbolt/quirks.c b/drivers/thunderbolt/quirks.c index ae28a03fa890..1157b8869bcc 100644 --- a/drivers/thunderbolt/quirks.c +++ b/drivers/thunderbolt/quirks.c @@ -26,6 +26,19 @@ static void quirk_clx_disable(struct tb_switch *sw) tb_sw_dbg(sw, "disabling CL states\n"); } +static void quirk_usb3_maximum_bandwidth(struct tb_switch *sw) +{ + struct tb_port *port; + + tb_switch_for_each_port(sw, port) { + if (!tb_port_is_usb3_down(port)) + continue; + port->max_bw = 16376; + tb_port_dbg(port, "USB3 maximum bandwidth limited to %u Mb/s\n", + port->max_bw); + } +} + struct tb_quirk { u16 hw_vendor_id; u16 hw_device_id; @@ -43,6 +56,24 @@ static const struct tb_quirk tb_quirks[] = { * DP buffers. */ { 0x8087, 0x0b26, 0x0000, 0x0000, quirk_dp_credit_allocation }, + /* + * Limit the maximum USB3 bandwidth for the following Intel USB4 + * host routers due to a hardware issue. + */ + { 0x8087, PCI_DEVICE_ID_INTEL_ADL_NHI0, 0x0000, 0x0000, + quirk_usb3_maximum_bandwidth }, + { 0x8087, PCI_DEVICE_ID_INTEL_ADL_NHI1, 0x0000, 0x0000, + quirk_usb3_maximum_bandwidth }, + { 0x8087, PCI_DEVICE_ID_INTEL_RPL_NHI0, 0x0000, 0x0000, + quirk_usb3_maximum_bandwidth }, + { 0x8087, PCI_DEVICE_ID_INTEL_RPL_NHI1, 0x0000, 0x0000, + quirk_usb3_maximum_bandwidth }, + { 0x8087, PCI_DEVICE_ID_INTEL_MTL_M_NHI0, 0x0000, 0x0000, + quirk_usb3_maximum_bandwidth }, + { 0x8087, PCI_DEVICE_ID_INTEL_MTL_P_NHI0, 0x0000, 0x0000, + quirk_usb3_maximum_bandwidth }, + { 0x8087, PCI_DEVICE_ID_INTEL_MTL_P_NHI1, 0x0000, 0x0000, + quirk_usb3_maximum_bandwidth }, /* * CLx is not supported on AMD USB4 Yellow Carp and Pink Sardine platforms. */ diff --git a/drivers/thunderbolt/tb.h b/drivers/thunderbolt/tb.h index e11d973a8f9b..f034723b1b40 100644 --- a/drivers/thunderbolt/tb.h +++ b/drivers/thunderbolt/tb.h @@ -252,6 +252,8 @@ struct tb_switch { * @ctl_credits: Buffers reserved for control path * @dma_credits: Number of credits allocated for DMA tunneling for all * DMA paths through this port. + * @max_bw: Maximum possible bandwidth through this adapter if set to + * non-zero. * * In USB4 terminology this structure represents an adapter (protocol or * lane adapter). @@ -277,6 +279,7 @@ struct tb_port { unsigned int total_credits; unsigned int ctl_credits; unsigned int dma_credits; + unsigned int max_bw; }; /** diff --git a/drivers/thunderbolt/usb4.c b/drivers/thunderbolt/usb4.c index cf8d4f769579..3c821f5e4481 100644 --- a/drivers/thunderbolt/usb4.c +++ b/drivers/thunderbolt/usb4.c @@ -1865,6 +1865,15 @@ int usb4_port_retimer_nvm_read(struct tb_port *port, u8 index, usb4_port_retimer_nvm_read_block, &info); } +static inline unsigned int +usb4_usb3_port_max_bandwidth(const struct tb_port *port, unsigned int bw) +{ + /* Take the possible bandwidth limitation into account */ + if (port->max_bw) + return min(bw, port->max_bw); + return bw; +} + /** * usb4_usb3_port_max_link_rate() - Maximum support USB3 link rate * @port: USB3 adapter port @@ -1886,7 +1895,9 @@ int usb4_usb3_port_max_link_rate(struct tb_port *port) return ret; lr = (val & ADP_USB3_CS_4_MSLR_MASK) >> ADP_USB3_CS_4_MSLR_SHIFT; - return lr == ADP_USB3_CS_4_MSLR_20G ? 20000 : 10000; + ret = lr == ADP_USB3_CS_4_MSLR_20G ? 20000 : 10000; + + return usb4_usb3_port_max_bandwidth(port, ret); } /** @@ -1913,7 +1924,9 @@ int usb4_usb3_port_actual_link_rate(struct tb_port *port) return 0; lr = val & ADP_USB3_CS_4_ALR_MASK; - return lr == ADP_USB3_CS_4_ALR_20G ? 20000 : 10000; + ret = lr == ADP_USB3_CS_4_ALR_20G ? 20000 : 10000; + + return usb4_usb3_port_max_bandwidth(port, ret); } static int usb4_usb3_port_cm_request(struct tb_port *port, bool request) From f60cdd319b2c35df32934802ce2bfb6810cac048 Mon Sep 17 00:00:00 2001 From: Shyam Prasad N Date: Fri, 10 Feb 2023 17:41:17 +0000 Subject: [PATCH 002/179] cifs: update ip_addr for ses only for primary chan setup [ Upstream commit e77978de4765229e09c8fabcf4f8419ff367317f ] We update ses->ip_addr whenever we do a session setup. But this should happen only for primary channel in mchan scenario. Signed-off-by: Shyam Prasad N Reviewed-by: Paulo Alcantara (SUSE) Signed-off-by: Steve French Stable-dep-of: bc962159e8e3 ("cifs: avoid race conditions with parallel reconnects") Signed-off-by: Sasha Levin --- fs/cifs/connect.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 7aecb1646b6f..43637c128374 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -4082,16 +4082,12 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses, struct nls_table *nls_info) { int rc = -ENOSYS; - struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)&server->dstaddr; - struct sockaddr_in *addr = (struct sockaddr_in *)&server->dstaddr; + struct TCP_Server_Info *pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server; + struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)&pserver->dstaddr; + struct sockaddr_in *addr = (struct sockaddr_in *)&pserver->dstaddr; bool is_binding = false; spin_lock(&ses->ses_lock); - if (server->dstaddr.ss_family == AF_INET6) - scnprintf(ses->ip_addr, sizeof(ses->ip_addr), "%pI6", &addr6->sin6_addr); - else - scnprintf(ses->ip_addr, sizeof(ses->ip_addr), "%pI4", &addr->sin_addr); - if (ses->ses_status != SES_GOOD && ses->ses_status != SES_NEW && ses->ses_status != SES_NEED_RECON) { @@ -4115,6 +4111,14 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses, ses->ses_status = SES_IN_SETUP; spin_unlock(&ses->ses_lock); + /* update ses ip_addr only for primary chan */ + if (server == pserver) { + if (server->dstaddr.ss_family == AF_INET6) + scnprintf(ses->ip_addr, sizeof(ses->ip_addr), "%pI6", &addr6->sin6_addr); + else + scnprintf(ses->ip_addr, sizeof(ses->ip_addr), "%pI4", &addr->sin_addr); + } + if (!is_binding) { ses->capabilities = server->capabilities; if (!linuxExtEnabled) From a5698f3ebb785aeb73e38497a1f029c9a4b93966 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Tue, 28 Feb 2023 19:01:55 -0300 Subject: [PATCH 003/179] cifs: prevent data race in cifs_reconnect_tcon() [ Upstream commit 1bcd548d935a33c6fc58331405eb1b82fd6150de ] Make sure to get an up-to-date TCP_Server_Info::nr_targets value prior to waiting the server to be reconnected in cifs_reconnect_tcon(). It is set in cifs_tcp_ses_needs_reconnect() and protected by TCP_Server_Info::srv_lock. Create a new cifs_wait_for_server_reconnect() helper that can be used by both SMB2+ and CIFS reconnect code. Signed-off-by: Paulo Alcantara (SUSE) Signed-off-by: Steve French Stable-dep-of: bc962159e8e3 ("cifs: avoid race conditions with parallel reconnects") Signed-off-by: Sasha Levin --- fs/cifs/cifsproto.h | 1 + fs/cifs/cifssmb.c | 43 ++---------------------- fs/cifs/misc.c | 44 ++++++++++++++++++++++++ fs/cifs/smb2pdu.c | 82 ++++++++++++--------------------------------- 4 files changed, 69 insertions(+), 101 deletions(-) diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index bc4475f6c082..98513f5af3f9 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -691,5 +691,6 @@ static inline int cifs_create_options(struct cifs_sb_info *cifs_sb, int options) struct super_block *cifs_get_tcon_super(struct cifs_tcon *tcon); void cifs_put_tcon_super(struct super_block *sb); +int cifs_wait_for_server_reconnect(struct TCP_Server_Info *server, bool retry); #endif /* _CIFSPROTO_H */ diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 6c6a7fc47f3e..4bc6ba87baf4 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -70,7 +70,6 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command) struct cifs_ses *ses; struct TCP_Server_Info *server; struct nls_table *nls_codepage; - int retries; /* * SMBs NegProt, SessSetup, uLogoff do not have tcon yet so check for @@ -98,45 +97,9 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command) } spin_unlock(&tcon->tc_lock); - retries = server->nr_targets; - - /* - * Give demultiplex thread up to 10 seconds to each target available for - * reconnect -- should be greater than cifs socket timeout which is 7 - * seconds. - */ - while (server->tcpStatus == CifsNeedReconnect) { - rc = wait_event_interruptible_timeout(server->response_q, - (server->tcpStatus != CifsNeedReconnect), - 10 * HZ); - if (rc < 0) { - cifs_dbg(FYI, "%s: aborting reconnect due to a received signal by the process\n", - __func__); - return -ERESTARTSYS; - } - - /* are we still trying to reconnect? */ - spin_lock(&server->srv_lock); - if (server->tcpStatus != CifsNeedReconnect) { - spin_unlock(&server->srv_lock); - break; - } - spin_unlock(&server->srv_lock); - - if (retries && --retries) - continue; - - /* - * on "soft" mounts we wait once. Hard mounts keep - * retrying until process is killed or server comes - * back on-line - */ - if (!tcon->retry) { - cifs_dbg(FYI, "gave up waiting on reconnect in smb_init\n"); - return -EHOSTDOWN; - } - retries = server->nr_targets; - } + rc = cifs_wait_for_server_reconnect(server, tcon->retry); + if (rc) + return rc; spin_lock(&ses->chan_lock); if (!cifs_chan_needs_reconnect(ses, server) && !tcon->need_reconnect) { diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 4e54736a0699..832856aef4b7 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -1382,3 +1382,47 @@ int cifs_inval_name_dfs_link_error(const unsigned int xid, return 0; } #endif + +int cifs_wait_for_server_reconnect(struct TCP_Server_Info *server, bool retry) +{ + int timeout = 10; + int rc; + + spin_lock(&server->srv_lock); + if (server->tcpStatus != CifsNeedReconnect) { + spin_unlock(&server->srv_lock); + return 0; + } + timeout *= server->nr_targets; + spin_unlock(&server->srv_lock); + + /* + * Give demultiplex thread up to 10 seconds to each target available for + * reconnect -- should be greater than cifs socket timeout which is 7 + * seconds. + * + * On "soft" mounts we wait once. Hard mounts keep retrying until + * process is killed or server comes back on-line. + */ + do { + rc = wait_event_interruptible_timeout(server->response_q, + (server->tcpStatus != CifsNeedReconnect), + timeout * HZ); + if (rc < 0) { + cifs_dbg(FYI, "%s: aborting reconnect due to received signal\n", + __func__); + return -ERESTARTSYS; + } + + /* are we still trying to reconnect? */ + spin_lock(&server->srv_lock); + if (server->tcpStatus != CifsNeedReconnect) { + spin_unlock(&server->srv_lock); + return 0; + } + spin_unlock(&server->srv_lock); + } while (retry); + + cifs_dbg(FYI, "%s: gave up waiting on reconnect\n", __func__); + return -EHOSTDOWN; +} diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 6e6e44d8b4c7..83d04cd2f9df 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -139,66 +139,6 @@ out: return; } -static int wait_for_server_reconnect(struct TCP_Server_Info *server, - __le16 smb2_command, bool retry) -{ - int timeout = 10; - int rc; - - spin_lock(&server->srv_lock); - if (server->tcpStatus != CifsNeedReconnect) { - spin_unlock(&server->srv_lock); - return 0; - } - timeout *= server->nr_targets; - spin_unlock(&server->srv_lock); - - /* - * Return to caller for TREE_DISCONNECT and LOGOFF and CLOSE - * here since they are implicitly done when session drops. - */ - switch (smb2_command) { - /* - * BB Should we keep oplock break and add flush to exceptions? - */ - case SMB2_TREE_DISCONNECT: - case SMB2_CANCEL: - case SMB2_CLOSE: - case SMB2_OPLOCK_BREAK: - return -EAGAIN; - } - - /* - * Give demultiplex thread up to 10 seconds to each target available for - * reconnect -- should be greater than cifs socket timeout which is 7 - * seconds. - * - * On "soft" mounts we wait once. Hard mounts keep retrying until - * process is killed or server comes back on-line. - */ - do { - rc = wait_event_interruptible_timeout(server->response_q, - (server->tcpStatus != CifsNeedReconnect), - timeout * HZ); - if (rc < 0) { - cifs_dbg(FYI, "%s: aborting reconnect due to received signal\n", - __func__); - return -ERESTARTSYS; - } - - /* are we still trying to reconnect? */ - spin_lock(&server->srv_lock); - if (server->tcpStatus != CifsNeedReconnect) { - spin_unlock(&server->srv_lock); - return 0; - } - spin_unlock(&server->srv_lock); - } while (retry); - - cifs_dbg(FYI, "%s: gave up waiting on reconnect\n", __func__); - return -EHOSTDOWN; -} - static int smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon, struct TCP_Server_Info *server) @@ -239,7 +179,27 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon, (!tcon->ses->server) || !server) return -EIO; - rc = wait_for_server_reconnect(server, smb2_command, tcon->retry); + spin_lock(&server->srv_lock); + if (server->tcpStatus == CifsNeedReconnect) { + /* + * Return to caller for TREE_DISCONNECT and LOGOFF and CLOSE + * here since they are implicitly done when session drops. + */ + switch (smb2_command) { + /* + * BB Should we keep oplock break and add flush to exceptions? + */ + case SMB2_TREE_DISCONNECT: + case SMB2_CANCEL: + case SMB2_CLOSE: + case SMB2_OPLOCK_BREAK: + spin_unlock(&server->srv_lock); + return -EAGAIN; + } + } + spin_unlock(&server->srv_lock); + + rc = cifs_wait_for_server_reconnect(server, tcon->retry); if (rc) return rc; From 9dad2690b1269a75cb8e7de87ecdcc2d2550aed2 Mon Sep 17 00:00:00 2001 From: Shyam Prasad N Date: Mon, 20 Mar 2023 06:08:19 +0000 Subject: [PATCH 004/179] cifs: avoid race conditions with parallel reconnects [ Upstream commit bc962159e8e326af634a506508034a375bf2b858 ] When multiple processes/channels do reconnects in parallel we used to return success immediately negotiate/session-setup/tree-connect, causing race conditions between processes that enter the function in parallel. This caused several errors related to session not found to show up during parallel reconnects. Signed-off-by: Shyam Prasad N Reviewed-by: Paulo Alcantara (SUSE) Cc: stable@vger.kernel.org Signed-off-by: Steve French Signed-off-by: Sasha Levin --- fs/cifs/connect.c | 50 ++++++++++++++++++++++++++++++----------- fs/cifs/smb2pdu.c | 44 +++++++++++++++++++++--------------- fs/cifs/smb2transport.c | 17 +++++++++++--- 3 files changed, 77 insertions(+), 34 deletions(-) diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 43637c128374..077c88c49dfd 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -261,31 +261,42 @@ cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server, cifs_chan_update_iface(ses, server); spin_lock(&ses->chan_lock); - if (!mark_smb_session && cifs_chan_needs_reconnect(ses, server)) - goto next_session; + if (!mark_smb_session && cifs_chan_needs_reconnect(ses, server)) { + spin_unlock(&ses->chan_lock); + continue; + } if (mark_smb_session) CIFS_SET_ALL_CHANS_NEED_RECONNECT(ses); else cifs_chan_set_need_reconnect(ses, server); - /* If all channels need reconnect, then tcon needs reconnect */ - if (!mark_smb_session && !CIFS_ALL_CHANS_NEED_RECONNECT(ses)) - goto next_session; + cifs_dbg(FYI, "%s: channel connect bitmap: 0x%lx\n", + __func__, ses->chans_need_reconnect); + /* If all channels need reconnect, then tcon needs reconnect */ + if (!mark_smb_session && !CIFS_ALL_CHANS_NEED_RECONNECT(ses)) { + spin_unlock(&ses->chan_lock); + continue; + } + spin_unlock(&ses->chan_lock); + + spin_lock(&ses->ses_lock); ses->ses_status = SES_NEED_RECON; + spin_unlock(&ses->ses_lock); list_for_each_entry(tcon, &ses->tcon_list, tcon_list) { tcon->need_reconnect = true; + spin_lock(&tcon->tc_lock); tcon->status = TID_NEED_RECON; + spin_unlock(&tcon->tc_lock); } if (ses->tcon_ipc) { ses->tcon_ipc->need_reconnect = true; + spin_lock(&ses->tcon_ipc->tc_lock); ses->tcon_ipc->status = TID_NEED_RECON; + spin_unlock(&ses->tcon_ipc->tc_lock); } - -next_session: - spin_unlock(&ses->chan_lock); } spin_unlock(&cifs_tcp_ses_lock); } @@ -4050,11 +4061,19 @@ cifs_negotiate_protocol(const unsigned int xid, struct cifs_ses *ses, /* only send once per connect */ spin_lock(&server->srv_lock); - if (!server->ops->need_neg(server) || + if (server->tcpStatus != CifsGood && + server->tcpStatus != CifsNew && server->tcpStatus != CifsNeedNegotiate) { + spin_unlock(&server->srv_lock); + return -EHOSTDOWN; + } + + if (!server->ops->need_neg(server) && + server->tcpStatus == CifsGood) { spin_unlock(&server->srv_lock); return 0; } + server->tcpStatus = CifsInNegotiate; spin_unlock(&server->srv_lock); @@ -4088,23 +4107,28 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses, bool is_binding = false; spin_lock(&ses->ses_lock); + cifs_dbg(FYI, "%s: channel connect bitmap: 0x%lx\n", + __func__, ses->chans_need_reconnect); + if (ses->ses_status != SES_GOOD && ses->ses_status != SES_NEW && ses->ses_status != SES_NEED_RECON) { spin_unlock(&ses->ses_lock); - return 0; + return -EHOSTDOWN; } /* only send once per connect */ spin_lock(&ses->chan_lock); - if (CIFS_ALL_CHANS_GOOD(ses) || - cifs_chan_in_reconnect(ses, server)) { + if (CIFS_ALL_CHANS_GOOD(ses)) { + if (ses->ses_status == SES_NEED_RECON) + ses->ses_status = SES_GOOD; spin_unlock(&ses->chan_lock); spin_unlock(&ses->ses_lock); return 0; } - is_binding = !CIFS_ALL_CHANS_NEED_RECONNECT(ses); + cifs_chan_set_in_reconnect(ses, server); + is_binding = !CIFS_ALL_CHANS_NEED_RECONNECT(ses); spin_unlock(&ses->chan_lock); if (!is_binding) diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 83d04cd2f9df..f0b1ae0835d7 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -199,6 +199,7 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon, } spin_unlock(&server->srv_lock); +again: rc = cifs_wait_for_server_reconnect(server, tcon->retry); if (rc) return rc; @@ -217,6 +218,7 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon, nls_codepage = load_nls_default(); + mutex_lock(&ses->session_mutex); /* * Recheck after acquire mutex. If another thread is negotiating * and the server never sends an answer the socket will be closed @@ -225,6 +227,11 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon, spin_lock(&server->srv_lock); if (server->tcpStatus == CifsNeedReconnect) { spin_unlock(&server->srv_lock); + mutex_unlock(&ses->session_mutex); + + if (tcon->retry) + goto again; + rc = -EHOSTDOWN; goto out; } @@ -234,19 +241,22 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon, * need to prevent multiple threads trying to simultaneously * reconnect the same SMB session */ + spin_lock(&ses->ses_lock); spin_lock(&ses->chan_lock); - if (!cifs_chan_needs_reconnect(ses, server)) { + if (!cifs_chan_needs_reconnect(ses, server) && + ses->ses_status == SES_GOOD) { spin_unlock(&ses->chan_lock); - + spin_unlock(&ses->ses_lock); /* this means that we only need to tree connect */ if (tcon->need_reconnect) goto skip_sess_setup; + mutex_unlock(&ses->session_mutex); goto out; } spin_unlock(&ses->chan_lock); + spin_unlock(&ses->ses_lock); - mutex_lock(&ses->session_mutex); rc = cifs_negotiate_protocol(0, ses, server); if (!rc) { rc = cifs_setup_session(0, ses, server, nls_codepage); @@ -262,10 +272,8 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon, mutex_unlock(&ses->session_mutex); goto out; } - mutex_unlock(&ses->session_mutex); skip_sess_setup: - mutex_lock(&ses->session_mutex); if (!tcon->need_reconnect) { mutex_unlock(&ses->session_mutex); goto out; @@ -280,7 +288,7 @@ skip_sess_setup: cifs_dbg(FYI, "reconnect tcon rc = %d\n", rc); if (rc) { /* If sess reconnected but tcon didn't, something strange ... */ - pr_warn_once("reconnect tcon failed rc = %d\n", rc); + cifs_dbg(VFS, "reconnect tcon failed rc = %d\n", rc); goto out; } @@ -1252,9 +1260,9 @@ SMB2_sess_alloc_buffer(struct SMB2_sess_data *sess_data) if (rc) return rc; - spin_lock(&ses->chan_lock); - is_binding = !CIFS_ALL_CHANS_NEED_RECONNECT(ses); - spin_unlock(&ses->chan_lock); + spin_lock(&ses->ses_lock); + is_binding = (ses->ses_status == SES_GOOD); + spin_unlock(&ses->ses_lock); if (is_binding) { req->hdr.SessionId = cpu_to_le64(ses->Suid); @@ -1412,9 +1420,9 @@ SMB2_auth_kerberos(struct SMB2_sess_data *sess_data) goto out_put_spnego_key; } - spin_lock(&ses->chan_lock); - is_binding = !CIFS_ALL_CHANS_NEED_RECONNECT(ses); - spin_unlock(&ses->chan_lock); + spin_lock(&ses->ses_lock); + is_binding = (ses->ses_status == SES_GOOD); + spin_unlock(&ses->ses_lock); /* keep session key if binding */ if (!is_binding) { @@ -1538,9 +1546,9 @@ SMB2_sess_auth_rawntlmssp_negotiate(struct SMB2_sess_data *sess_data) cifs_dbg(FYI, "rawntlmssp session setup challenge phase\n"); - spin_lock(&ses->chan_lock); - is_binding = !CIFS_ALL_CHANS_NEED_RECONNECT(ses); - spin_unlock(&ses->chan_lock); + spin_lock(&ses->ses_lock); + is_binding = (ses->ses_status == SES_GOOD); + spin_unlock(&ses->ses_lock); /* keep existing ses id and flags if binding */ if (!is_binding) { @@ -1606,9 +1614,9 @@ SMB2_sess_auth_rawntlmssp_authenticate(struct SMB2_sess_data *sess_data) rsp = (struct smb2_sess_setup_rsp *)sess_data->iov[0].iov_base; - spin_lock(&ses->chan_lock); - is_binding = !CIFS_ALL_CHANS_NEED_RECONNECT(ses); - spin_unlock(&ses->chan_lock); + spin_lock(&ses->ses_lock); + is_binding = (ses->ses_status == SES_GOOD); + spin_unlock(&ses->ses_lock); /* keep existing ses id and flags if binding */ if (!is_binding) { diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c index d827b7547ffa..790acf65a092 100644 --- a/fs/cifs/smb2transport.c +++ b/fs/cifs/smb2transport.c @@ -81,6 +81,7 @@ int smb2_get_sign_key(__u64 ses_id, struct TCP_Server_Info *server, u8 *key) struct cifs_ses *ses = NULL; int i; int rc = 0; + bool is_binding = false; spin_lock(&cifs_tcp_ses_lock); @@ -97,9 +98,12 @@ int smb2_get_sign_key(__u64 ses_id, struct TCP_Server_Info *server, u8 *key) goto out; found: + spin_lock(&ses->ses_lock); spin_lock(&ses->chan_lock); - if (cifs_chan_needs_reconnect(ses, server) && - !CIFS_ALL_CHANS_NEED_RECONNECT(ses)) { + + is_binding = (cifs_chan_needs_reconnect(ses, server) && + ses->ses_status == SES_GOOD); + if (is_binding) { /* * If we are in the process of binding a new channel * to an existing session, use the master connection @@ -107,6 +111,7 @@ found: */ memcpy(key, ses->smb3signingkey, SMB3_SIGN_KEY_SIZE); spin_unlock(&ses->chan_lock); + spin_unlock(&ses->ses_lock); goto out; } @@ -119,10 +124,12 @@ found: if (chan->server == server) { memcpy(key, chan->signkey, SMB3_SIGN_KEY_SIZE); spin_unlock(&ses->chan_lock); + spin_unlock(&ses->ses_lock); goto out; } } spin_unlock(&ses->chan_lock); + spin_unlock(&ses->ses_lock); cifs_dbg(VFS, "%s: Could not find channel signing key for session 0x%llx\n", @@ -392,11 +399,15 @@ generate_smb3signingkey(struct cifs_ses *ses, bool is_binding = false; int chan_index = 0; + spin_lock(&ses->ses_lock); spin_lock(&ses->chan_lock); - is_binding = !CIFS_ALL_CHANS_NEED_RECONNECT(ses); + is_binding = (cifs_chan_needs_reconnect(ses, server) && + ses->ses_status == SES_GOOD); + chan_index = cifs_ses_get_chan_index(ses, server); /* TODO: introduce ref counting for channels when the can be freed */ spin_unlock(&ses->chan_lock); + spin_unlock(&ses->ses_lock); /* * All channels use the same encryption/decryption keys but From a624b4796f38b0859043249f02bcf09f43d35dd4 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 25 Nov 2022 09:39:33 +0900 Subject: [PATCH 005/179] zonefs: Reorganize code [ Upstream commit 4008e2a0b01aba982356fd15b128a47bf11bd9c7 ] Move all code related to zone file operations from super.c to the new file.c file. Inode and zone management code remains in super.c. Signed-off-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Stable-dep-of: 88b170088ad2 ("zonefs: Fix error message in zonefs_file_dio_append()") Signed-off-by: Sasha Levin --- fs/zonefs/Makefile | 2 +- fs/zonefs/file.c | 874 ++++++++++++++++++++++++++++++++++++++++ fs/zonefs/super.c | 973 +++------------------------------------------ fs/zonefs/zonefs.h | 22 + 4 files changed, 955 insertions(+), 916 deletions(-) create mode 100644 fs/zonefs/file.c diff --git a/fs/zonefs/Makefile b/fs/zonefs/Makefile index 9fe54f5319f2..645f7229de4a 100644 --- a/fs/zonefs/Makefile +++ b/fs/zonefs/Makefile @@ -3,4 +3,4 @@ ccflags-y += -I$(src) obj-$(CONFIG_ZONEFS_FS) += zonefs.o -zonefs-y := super.o sysfs.o +zonefs-y := super.o file.o sysfs.o diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c new file mode 100644 index 000000000000..ece0f3959b6d --- /dev/null +++ b/fs/zonefs/file.c @@ -0,0 +1,874 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Simple file system for zoned block devices exposing zones as files. + * + * Copyright (C) 2022 Western Digital Corporation or its affiliates. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zonefs.h" + +#include "trace.h" + +static int zonefs_read_iomap_begin(struct inode *inode, loff_t offset, + loff_t length, unsigned int flags, + struct iomap *iomap, struct iomap *srcmap) +{ + struct zonefs_inode_info *zi = ZONEFS_I(inode); + struct super_block *sb = inode->i_sb; + loff_t isize; + + /* + * All blocks are always mapped below EOF. If reading past EOF, + * act as if there is a hole up to the file maximum size. + */ + mutex_lock(&zi->i_truncate_mutex); + iomap->bdev = inode->i_sb->s_bdev; + iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize); + isize = i_size_read(inode); + if (iomap->offset >= isize) { + iomap->type = IOMAP_HOLE; + iomap->addr = IOMAP_NULL_ADDR; + iomap->length = length; + } else { + iomap->type = IOMAP_MAPPED; + iomap->addr = (zi->i_zsector << SECTOR_SHIFT) + iomap->offset; + iomap->length = isize - iomap->offset; + } + mutex_unlock(&zi->i_truncate_mutex); + + trace_zonefs_iomap_begin(inode, iomap); + + return 0; +} + +static const struct iomap_ops zonefs_read_iomap_ops = { + .iomap_begin = zonefs_read_iomap_begin, +}; + +static int zonefs_write_iomap_begin(struct inode *inode, loff_t offset, + loff_t length, unsigned int flags, + struct iomap *iomap, struct iomap *srcmap) +{ + struct zonefs_inode_info *zi = ZONEFS_I(inode); + struct super_block *sb = inode->i_sb; + loff_t isize; + + /* All write I/Os should always be within the file maximum size */ + if (WARN_ON_ONCE(offset + length > zi->i_max_size)) + return -EIO; + + /* + * Sequential zones can only accept direct writes. This is already + * checked when writes are issued, so warn if we see a page writeback + * operation. + */ + if (WARN_ON_ONCE(zi->i_ztype == ZONEFS_ZTYPE_SEQ && + !(flags & IOMAP_DIRECT))) + return -EIO; + + /* + * For conventional zones, all blocks are always mapped. For sequential + * zones, all blocks after always mapped below the inode size (zone + * write pointer) and unwriten beyond. + */ + mutex_lock(&zi->i_truncate_mutex); + iomap->bdev = inode->i_sb->s_bdev; + iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize); + iomap->addr = (zi->i_zsector << SECTOR_SHIFT) + iomap->offset; + isize = i_size_read(inode); + if (iomap->offset >= isize) { + iomap->type = IOMAP_UNWRITTEN; + iomap->length = zi->i_max_size - iomap->offset; + } else { + iomap->type = IOMAP_MAPPED; + iomap->length = isize - iomap->offset; + } + mutex_unlock(&zi->i_truncate_mutex); + + trace_zonefs_iomap_begin(inode, iomap); + + return 0; +} + +static const struct iomap_ops zonefs_write_iomap_ops = { + .iomap_begin = zonefs_write_iomap_begin, +}; + +static int zonefs_read_folio(struct file *unused, struct folio *folio) +{ + return iomap_read_folio(folio, &zonefs_read_iomap_ops); +} + +static void zonefs_readahead(struct readahead_control *rac) +{ + iomap_readahead(rac, &zonefs_read_iomap_ops); +} + +/* + * Map blocks for page writeback. This is used only on conventional zone files, + * which implies that the page range can only be within the fixed inode size. + */ +static int zonefs_write_map_blocks(struct iomap_writepage_ctx *wpc, + struct inode *inode, loff_t offset) +{ + struct zonefs_inode_info *zi = ZONEFS_I(inode); + + if (WARN_ON_ONCE(zi->i_ztype != ZONEFS_ZTYPE_CNV)) + return -EIO; + if (WARN_ON_ONCE(offset >= i_size_read(inode))) + return -EIO; + + /* If the mapping is already OK, nothing needs to be done */ + if (offset >= wpc->iomap.offset && + offset < wpc->iomap.offset + wpc->iomap.length) + return 0; + + return zonefs_write_iomap_begin(inode, offset, zi->i_max_size - offset, + IOMAP_WRITE, &wpc->iomap, NULL); +} + +static const struct iomap_writeback_ops zonefs_writeback_ops = { + .map_blocks = zonefs_write_map_blocks, +}; + +static int zonefs_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct iomap_writepage_ctx wpc = { }; + + return iomap_writepages(mapping, wbc, &wpc, &zonefs_writeback_ops); +} + +static int zonefs_swap_activate(struct swap_info_struct *sis, + struct file *swap_file, sector_t *span) +{ + struct inode *inode = file_inode(swap_file); + struct zonefs_inode_info *zi = ZONEFS_I(inode); + + if (zi->i_ztype != ZONEFS_ZTYPE_CNV) { + zonefs_err(inode->i_sb, + "swap file: not a conventional zone file\n"); + return -EINVAL; + } + + return iomap_swapfile_activate(sis, swap_file, span, + &zonefs_read_iomap_ops); +} + +const struct address_space_operations zonefs_file_aops = { + .read_folio = zonefs_read_folio, + .readahead = zonefs_readahead, + .writepages = zonefs_writepages, + .dirty_folio = filemap_dirty_folio, + .release_folio = iomap_release_folio, + .invalidate_folio = iomap_invalidate_folio, + .migrate_folio = filemap_migrate_folio, + .is_partially_uptodate = iomap_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, + .direct_IO = noop_direct_IO, + .swap_activate = zonefs_swap_activate, +}; + +int zonefs_file_truncate(struct inode *inode, loff_t isize) +{ + struct zonefs_inode_info *zi = ZONEFS_I(inode); + loff_t old_isize; + enum req_op op; + int ret = 0; + + /* + * Only sequential zone files can be truncated and truncation is allowed + * only down to a 0 size, which is equivalent to a zone reset, and to + * the maximum file size, which is equivalent to a zone finish. + */ + if (zi->i_ztype != ZONEFS_ZTYPE_SEQ) + return -EPERM; + + if (!isize) + op = REQ_OP_ZONE_RESET; + else if (isize == zi->i_max_size) + op = REQ_OP_ZONE_FINISH; + else + return -EPERM; + + inode_dio_wait(inode); + + /* Serialize against page faults */ + filemap_invalidate_lock(inode->i_mapping); + + /* Serialize against zonefs_iomap_begin() */ + mutex_lock(&zi->i_truncate_mutex); + + old_isize = i_size_read(inode); + if (isize == old_isize) + goto unlock; + + ret = zonefs_zone_mgmt(inode, op); + if (ret) + goto unlock; + + /* + * If the mount option ZONEFS_MNTOPT_EXPLICIT_OPEN is set, + * take care of open zones. + */ + if (zi->i_flags & ZONEFS_ZONE_OPEN) { + /* + * Truncating a zone to EMPTY or FULL is the equivalent of + * closing the zone. For a truncation to 0, we need to + * re-open the zone to ensure new writes can be processed. + * For a truncation to the maximum file size, the zone is + * closed and writes cannot be accepted anymore, so clear + * the open flag. + */ + if (!isize) + ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN); + else + zi->i_flags &= ~ZONEFS_ZONE_OPEN; + } + + zonefs_update_stats(inode, isize); + truncate_setsize(inode, isize); + zi->i_wpoffset = isize; + zonefs_account_active(inode); + +unlock: + mutex_unlock(&zi->i_truncate_mutex); + filemap_invalidate_unlock(inode->i_mapping); + + return ret; +} + +static int zonefs_file_fsync(struct file *file, loff_t start, loff_t end, + int datasync) +{ + struct inode *inode = file_inode(file); + int ret = 0; + + if (unlikely(IS_IMMUTABLE(inode))) + return -EPERM; + + /* + * Since only direct writes are allowed in sequential files, page cache + * flush is needed only for conventional zone files. + */ + if (ZONEFS_I(inode)->i_ztype == ZONEFS_ZTYPE_CNV) + ret = file_write_and_wait_range(file, start, end); + if (!ret) + ret = blkdev_issue_flush(inode->i_sb->s_bdev); + + if (ret) + zonefs_io_error(inode, true); + + return ret; +} + +static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf) +{ + struct inode *inode = file_inode(vmf->vma->vm_file); + struct zonefs_inode_info *zi = ZONEFS_I(inode); + vm_fault_t ret; + + if (unlikely(IS_IMMUTABLE(inode))) + return VM_FAULT_SIGBUS; + + /* + * Sanity check: only conventional zone files can have shared + * writeable mappings. + */ + if (WARN_ON_ONCE(zi->i_ztype != ZONEFS_ZTYPE_CNV)) + return VM_FAULT_NOPAGE; + + sb_start_pagefault(inode->i_sb); + file_update_time(vmf->vma->vm_file); + + /* Serialize against truncates */ + filemap_invalidate_lock_shared(inode->i_mapping); + ret = iomap_page_mkwrite(vmf, &zonefs_write_iomap_ops); + filemap_invalidate_unlock_shared(inode->i_mapping); + + sb_end_pagefault(inode->i_sb); + return ret; +} + +static const struct vm_operations_struct zonefs_file_vm_ops = { + .fault = filemap_fault, + .map_pages = filemap_map_pages, + .page_mkwrite = zonefs_filemap_page_mkwrite, +}; + +static int zonefs_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + /* + * Conventional zones accept random writes, so their files can support + * shared writable mappings. For sequential zone files, only read + * mappings are possible since there are no guarantees for write + * ordering between msync() and page cache writeback. + */ + if (ZONEFS_I(file_inode(file))->i_ztype == ZONEFS_ZTYPE_SEQ && + (vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) + return -EINVAL; + + file_accessed(file); + vma->vm_ops = &zonefs_file_vm_ops; + + return 0; +} + +static loff_t zonefs_file_llseek(struct file *file, loff_t offset, int whence) +{ + loff_t isize = i_size_read(file_inode(file)); + + /* + * Seeks are limited to below the zone size for conventional zones + * and below the zone write pointer for sequential zones. In both + * cases, this limit is the inode size. + */ + return generic_file_llseek_size(file, offset, whence, isize, isize); +} + +static int zonefs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size, + int error, unsigned int flags) +{ + struct inode *inode = file_inode(iocb->ki_filp); + struct zonefs_inode_info *zi = ZONEFS_I(inode); + + if (error) { + zonefs_io_error(inode, true); + return error; + } + + if (size && zi->i_ztype != ZONEFS_ZTYPE_CNV) { + /* + * Note that we may be seeing completions out of order, + * but that is not a problem since a write completed + * successfully necessarily means that all preceding writes + * were also successful. So we can safely increase the inode + * size to the write end location. + */ + mutex_lock(&zi->i_truncate_mutex); + if (i_size_read(inode) < iocb->ki_pos + size) { + zonefs_update_stats(inode, iocb->ki_pos + size); + zonefs_i_size_write(inode, iocb->ki_pos + size); + } + mutex_unlock(&zi->i_truncate_mutex); + } + + return 0; +} + +static const struct iomap_dio_ops zonefs_write_dio_ops = { + .end_io = zonefs_file_write_dio_end_io, +}; + +static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from) +{ + struct inode *inode = file_inode(iocb->ki_filp); + struct zonefs_inode_info *zi = ZONEFS_I(inode); + struct block_device *bdev = inode->i_sb->s_bdev; + unsigned int max = bdev_max_zone_append_sectors(bdev); + struct bio *bio; + ssize_t size; + int nr_pages; + ssize_t ret; + + max = ALIGN_DOWN(max << SECTOR_SHIFT, inode->i_sb->s_blocksize); + iov_iter_truncate(from, max); + + nr_pages = iov_iter_npages(from, BIO_MAX_VECS); + if (!nr_pages) + return 0; + + bio = bio_alloc(bdev, nr_pages, + REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE, GFP_NOFS); + bio->bi_iter.bi_sector = zi->i_zsector; + bio->bi_ioprio = iocb->ki_ioprio; + if (iocb_is_dsync(iocb)) + bio->bi_opf |= REQ_FUA; + + ret = bio_iov_iter_get_pages(bio, from); + if (unlikely(ret)) + goto out_release; + + size = bio->bi_iter.bi_size; + task_io_account_write(size); + + if (iocb->ki_flags & IOCB_HIPRI) + bio_set_polled(bio, iocb); + + ret = submit_bio_wait(bio); + + /* + * If the file zone was written underneath the file system, the zone + * write pointer may not be where we expect it to be, but the zone + * append write can still succeed. So check manually that we wrote where + * we intended to, that is, at zi->i_wpoffset. + */ + if (!ret) { + sector_t wpsector = + zi->i_zsector + (zi->i_wpoffset >> SECTOR_SHIFT); + + if (bio->bi_iter.bi_sector != wpsector) { + zonefs_warn(inode->i_sb, + "Corrupted write pointer %llu for zone at %llu\n", + wpsector, zi->i_zsector); + ret = -EIO; + } + } + + zonefs_file_write_dio_end_io(iocb, size, ret, 0); + trace_zonefs_file_dio_append(inode, size, ret); + +out_release: + bio_release_pages(bio, false); + bio_put(bio); + + if (ret >= 0) { + iocb->ki_pos += size; + return size; + } + + return ret; +} + +/* + * Do not exceed the LFS limits nor the file zone size. If pos is under the + * limit it becomes a short access. If it exceeds the limit, return -EFBIG. + */ +static loff_t zonefs_write_check_limits(struct file *file, loff_t pos, + loff_t count) +{ + struct inode *inode = file_inode(file); + struct zonefs_inode_info *zi = ZONEFS_I(inode); + loff_t limit = rlimit(RLIMIT_FSIZE); + loff_t max_size = zi->i_max_size; + + if (limit != RLIM_INFINITY) { + if (pos >= limit) { + send_sig(SIGXFSZ, current, 0); + return -EFBIG; + } + count = min(count, limit - pos); + } + + if (!(file->f_flags & O_LARGEFILE)) + max_size = min_t(loff_t, MAX_NON_LFS, max_size); + + if (unlikely(pos >= max_size)) + return -EFBIG; + + return min(count, max_size - pos); +} + +static ssize_t zonefs_write_checks(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + struct zonefs_inode_info *zi = ZONEFS_I(inode); + loff_t count; + + if (IS_SWAPFILE(inode)) + return -ETXTBSY; + + if (!iov_iter_count(from)) + return 0; + + if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT)) + return -EINVAL; + + if (iocb->ki_flags & IOCB_APPEND) { + if (zi->i_ztype != ZONEFS_ZTYPE_SEQ) + return -EINVAL; + mutex_lock(&zi->i_truncate_mutex); + iocb->ki_pos = zi->i_wpoffset; + mutex_unlock(&zi->i_truncate_mutex); + } + + count = zonefs_write_check_limits(file, iocb->ki_pos, + iov_iter_count(from)); + if (count < 0) + return count; + + iov_iter_truncate(from, count); + return iov_iter_count(from); +} + +/* + * Handle direct writes. For sequential zone files, this is the only possible + * write path. For these files, check that the user is issuing writes + * sequentially from the end of the file. This code assumes that the block layer + * delivers write requests to the device in sequential order. This is always the + * case if a block IO scheduler implementing the ELEVATOR_F_ZBD_SEQ_WRITE + * elevator feature is being used (e.g. mq-deadline). The block layer always + * automatically select such an elevator for zoned block devices during the + * device initialization. + */ +static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from) +{ + struct inode *inode = file_inode(iocb->ki_filp); + struct zonefs_inode_info *zi = ZONEFS_I(inode); + struct super_block *sb = inode->i_sb; + bool sync = is_sync_kiocb(iocb); + bool append = false; + ssize_t ret, count; + + /* + * For async direct IOs to sequential zone files, refuse IOCB_NOWAIT + * as this can cause write reordering (e.g. the first aio gets EAGAIN + * on the inode lock but the second goes through but is now unaligned). + */ + if (zi->i_ztype == ZONEFS_ZTYPE_SEQ && !sync && + (iocb->ki_flags & IOCB_NOWAIT)) + return -EOPNOTSUPP; + + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!inode_trylock(inode)) + return -EAGAIN; + } else { + inode_lock(inode); + } + + count = zonefs_write_checks(iocb, from); + if (count <= 0) { + ret = count; + goto inode_unlock; + } + + if ((iocb->ki_pos | count) & (sb->s_blocksize - 1)) { + ret = -EINVAL; + goto inode_unlock; + } + + /* Enforce sequential writes (append only) in sequential zones */ + if (zi->i_ztype == ZONEFS_ZTYPE_SEQ) { + mutex_lock(&zi->i_truncate_mutex); + if (iocb->ki_pos != zi->i_wpoffset) { + mutex_unlock(&zi->i_truncate_mutex); + ret = -EINVAL; + goto inode_unlock; + } + mutex_unlock(&zi->i_truncate_mutex); + append = sync; + } + + if (append) + ret = zonefs_file_dio_append(iocb, from); + else + ret = iomap_dio_rw(iocb, from, &zonefs_write_iomap_ops, + &zonefs_write_dio_ops, 0, NULL, 0); + if (zi->i_ztype == ZONEFS_ZTYPE_SEQ && + (ret > 0 || ret == -EIOCBQUEUED)) { + if (ret > 0) + count = ret; + + /* + * Update the zone write pointer offset assuming the write + * operation succeeded. If it did not, the error recovery path + * will correct it. Also do active seq file accounting. + */ + mutex_lock(&zi->i_truncate_mutex); + zi->i_wpoffset += count; + zonefs_account_active(inode); + mutex_unlock(&zi->i_truncate_mutex); + } + +inode_unlock: + inode_unlock(inode); + + return ret; +} + +static ssize_t zonefs_file_buffered_write(struct kiocb *iocb, + struct iov_iter *from) +{ + struct inode *inode = file_inode(iocb->ki_filp); + struct zonefs_inode_info *zi = ZONEFS_I(inode); + ssize_t ret; + + /* + * Direct IO writes are mandatory for sequential zone files so that the + * write IO issuing order is preserved. + */ + if (zi->i_ztype != ZONEFS_ZTYPE_CNV) + return -EIO; + + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!inode_trylock(inode)) + return -EAGAIN; + } else { + inode_lock(inode); + } + + ret = zonefs_write_checks(iocb, from); + if (ret <= 0) + goto inode_unlock; + + ret = iomap_file_buffered_write(iocb, from, &zonefs_write_iomap_ops); + if (ret > 0) + iocb->ki_pos += ret; + else if (ret == -EIO) + zonefs_io_error(inode, true); + +inode_unlock: + inode_unlock(inode); + if (ret > 0) + ret = generic_write_sync(iocb, ret); + + return ret; +} + +static ssize_t zonefs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct inode *inode = file_inode(iocb->ki_filp); + + if (unlikely(IS_IMMUTABLE(inode))) + return -EPERM; + + if (sb_rdonly(inode->i_sb)) + return -EROFS; + + /* Write operations beyond the zone size are not allowed */ + if (iocb->ki_pos >= ZONEFS_I(inode)->i_max_size) + return -EFBIG; + + if (iocb->ki_flags & IOCB_DIRECT) { + ssize_t ret = zonefs_file_dio_write(iocb, from); + + if (ret != -ENOTBLK) + return ret; + } + + return zonefs_file_buffered_write(iocb, from); +} + +static int zonefs_file_read_dio_end_io(struct kiocb *iocb, ssize_t size, + int error, unsigned int flags) +{ + if (error) { + zonefs_io_error(file_inode(iocb->ki_filp), false); + return error; + } + + return 0; +} + +static const struct iomap_dio_ops zonefs_read_dio_ops = { + .end_io = zonefs_file_read_dio_end_io, +}; + +static ssize_t zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct inode *inode = file_inode(iocb->ki_filp); + struct zonefs_inode_info *zi = ZONEFS_I(inode); + struct super_block *sb = inode->i_sb; + loff_t isize; + ssize_t ret; + + /* Offline zones cannot be read */ + if (unlikely(IS_IMMUTABLE(inode) && !(inode->i_mode & 0777))) + return -EPERM; + + if (iocb->ki_pos >= zi->i_max_size) + return 0; + + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!inode_trylock_shared(inode)) + return -EAGAIN; + } else { + inode_lock_shared(inode); + } + + /* Limit read operations to written data */ + mutex_lock(&zi->i_truncate_mutex); + isize = i_size_read(inode); + if (iocb->ki_pos >= isize) { + mutex_unlock(&zi->i_truncate_mutex); + ret = 0; + goto inode_unlock; + } + iov_iter_truncate(to, isize - iocb->ki_pos); + mutex_unlock(&zi->i_truncate_mutex); + + if (iocb->ki_flags & IOCB_DIRECT) { + size_t count = iov_iter_count(to); + + if ((iocb->ki_pos | count) & (sb->s_blocksize - 1)) { + ret = -EINVAL; + goto inode_unlock; + } + file_accessed(iocb->ki_filp); + ret = iomap_dio_rw(iocb, to, &zonefs_read_iomap_ops, + &zonefs_read_dio_ops, 0, NULL, 0); + } else { + ret = generic_file_read_iter(iocb, to); + if (ret == -EIO) + zonefs_io_error(inode, false); + } + +inode_unlock: + inode_unlock_shared(inode); + + return ret; +} + +/* + * Write open accounting is done only for sequential files. + */ +static inline bool zonefs_seq_file_need_wro(struct inode *inode, + struct file *file) +{ + struct zonefs_inode_info *zi = ZONEFS_I(inode); + + if (zi->i_ztype != ZONEFS_ZTYPE_SEQ) + return false; + + if (!(file->f_mode & FMODE_WRITE)) + return false; + + return true; +} + +static int zonefs_seq_file_write_open(struct inode *inode) +{ + struct zonefs_inode_info *zi = ZONEFS_I(inode); + int ret = 0; + + mutex_lock(&zi->i_truncate_mutex); + + if (!zi->i_wr_refcnt) { + struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb); + unsigned int wro = atomic_inc_return(&sbi->s_wro_seq_files); + + if (sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) { + + if (sbi->s_max_wro_seq_files + && wro > sbi->s_max_wro_seq_files) { + atomic_dec(&sbi->s_wro_seq_files); + ret = -EBUSY; + goto unlock; + } + + if (i_size_read(inode) < zi->i_max_size) { + ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN); + if (ret) { + atomic_dec(&sbi->s_wro_seq_files); + goto unlock; + } + zi->i_flags |= ZONEFS_ZONE_OPEN; + zonefs_account_active(inode); + } + } + } + + zi->i_wr_refcnt++; + +unlock: + mutex_unlock(&zi->i_truncate_mutex); + + return ret; +} + +static int zonefs_file_open(struct inode *inode, struct file *file) +{ + int ret; + + ret = generic_file_open(inode, file); + if (ret) + return ret; + + if (zonefs_seq_file_need_wro(inode, file)) + return zonefs_seq_file_write_open(inode); + + return 0; +} + +static void zonefs_seq_file_write_close(struct inode *inode) +{ + struct zonefs_inode_info *zi = ZONEFS_I(inode); + struct super_block *sb = inode->i_sb; + struct zonefs_sb_info *sbi = ZONEFS_SB(sb); + int ret = 0; + + mutex_lock(&zi->i_truncate_mutex); + + zi->i_wr_refcnt--; + if (zi->i_wr_refcnt) + goto unlock; + + /* + * The file zone may not be open anymore (e.g. the file was truncated to + * its maximum size or it was fully written). For this case, we only + * need to decrement the write open count. + */ + if (zi->i_flags & ZONEFS_ZONE_OPEN) { + ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_CLOSE); + if (ret) { + __zonefs_io_error(inode, false); + /* + * Leaving zones explicitly open may lead to a state + * where most zones cannot be written (zone resources + * exhausted). So take preventive action by remounting + * read-only. + */ + if (zi->i_flags & ZONEFS_ZONE_OPEN && + !(sb->s_flags & SB_RDONLY)) { + zonefs_warn(sb, + "closing zone at %llu failed %d\n", + zi->i_zsector, ret); + zonefs_warn(sb, + "remounting filesystem read-only\n"); + sb->s_flags |= SB_RDONLY; + } + goto unlock; + } + + zi->i_flags &= ~ZONEFS_ZONE_OPEN; + zonefs_account_active(inode); + } + + atomic_dec(&sbi->s_wro_seq_files); + +unlock: + mutex_unlock(&zi->i_truncate_mutex); +} + +static int zonefs_file_release(struct inode *inode, struct file *file) +{ + /* + * If we explicitly open a zone we must close it again as well, but the + * zone management operation can fail (either due to an IO error or as + * the zone has gone offline or read-only). Make sure we don't fail the + * close(2) for user-space. + */ + if (zonefs_seq_file_need_wro(inode, file)) + zonefs_seq_file_write_close(inode); + + return 0; +} + +const struct file_operations zonefs_file_operations = { + .open = zonefs_file_open, + .release = zonefs_file_release, + .fsync = zonefs_file_fsync, + .mmap = zonefs_file_mmap, + .llseek = zonefs_file_llseek, + .read_iter = zonefs_file_read_iter, + .write_iter = zonefs_file_write_iter, + .splice_read = generic_file_splice_read, + .splice_write = iter_file_splice_write, + .iopoll = iocb_bio_iopoll, +}; diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index a9c5c3f720ad..e808276b8801 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -30,7 +30,7 @@ /* * Manage the active zone count. Called with zi->i_truncate_mutex held. */ -static void zonefs_account_active(struct inode *inode) +void zonefs_account_active(struct inode *inode) { struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb); struct zonefs_inode_info *zi = ZONEFS_I(inode); @@ -68,7 +68,7 @@ out: } } -static inline int zonefs_zone_mgmt(struct inode *inode, enum req_op op) +int zonefs_zone_mgmt(struct inode *inode, enum req_op op) { struct zonefs_inode_info *zi = ZONEFS_I(inode); int ret; @@ -99,7 +99,7 @@ static inline int zonefs_zone_mgmt(struct inode *inode, enum req_op op) return 0; } -static inline void zonefs_i_size_write(struct inode *inode, loff_t isize) +void zonefs_i_size_write(struct inode *inode, loff_t isize) { struct zonefs_inode_info *zi = ZONEFS_I(inode); @@ -117,167 +117,7 @@ static inline void zonefs_i_size_write(struct inode *inode, loff_t isize) } } -static int zonefs_read_iomap_begin(struct inode *inode, loff_t offset, - loff_t length, unsigned int flags, - struct iomap *iomap, struct iomap *srcmap) -{ - struct zonefs_inode_info *zi = ZONEFS_I(inode); - struct super_block *sb = inode->i_sb; - loff_t isize; - - /* - * All blocks are always mapped below EOF. If reading past EOF, - * act as if there is a hole up to the file maximum size. - */ - mutex_lock(&zi->i_truncate_mutex); - iomap->bdev = inode->i_sb->s_bdev; - iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize); - isize = i_size_read(inode); - if (iomap->offset >= isize) { - iomap->type = IOMAP_HOLE; - iomap->addr = IOMAP_NULL_ADDR; - iomap->length = length; - } else { - iomap->type = IOMAP_MAPPED; - iomap->addr = (zi->i_zsector << SECTOR_SHIFT) + iomap->offset; - iomap->length = isize - iomap->offset; - } - mutex_unlock(&zi->i_truncate_mutex); - - trace_zonefs_iomap_begin(inode, iomap); - - return 0; -} - -static const struct iomap_ops zonefs_read_iomap_ops = { - .iomap_begin = zonefs_read_iomap_begin, -}; - -static int zonefs_write_iomap_begin(struct inode *inode, loff_t offset, - loff_t length, unsigned int flags, - struct iomap *iomap, struct iomap *srcmap) -{ - struct zonefs_inode_info *zi = ZONEFS_I(inode); - struct super_block *sb = inode->i_sb; - loff_t isize; - - /* All write I/Os should always be within the file maximum size */ - if (WARN_ON_ONCE(offset + length > zi->i_max_size)) - return -EIO; - - /* - * Sequential zones can only accept direct writes. This is already - * checked when writes are issued, so warn if we see a page writeback - * operation. - */ - if (WARN_ON_ONCE(zi->i_ztype == ZONEFS_ZTYPE_SEQ && - !(flags & IOMAP_DIRECT))) - return -EIO; - - /* - * For conventional zones, all blocks are always mapped. For sequential - * zones, all blocks after always mapped below the inode size (zone - * write pointer) and unwriten beyond. - */ - mutex_lock(&zi->i_truncate_mutex); - iomap->bdev = inode->i_sb->s_bdev; - iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize); - iomap->addr = (zi->i_zsector << SECTOR_SHIFT) + iomap->offset; - isize = i_size_read(inode); - if (iomap->offset >= isize) { - iomap->type = IOMAP_UNWRITTEN; - iomap->length = zi->i_max_size - iomap->offset; - } else { - iomap->type = IOMAP_MAPPED; - iomap->length = isize - iomap->offset; - } - mutex_unlock(&zi->i_truncate_mutex); - - trace_zonefs_iomap_begin(inode, iomap); - - return 0; -} - -static const struct iomap_ops zonefs_write_iomap_ops = { - .iomap_begin = zonefs_write_iomap_begin, -}; - -static int zonefs_read_folio(struct file *unused, struct folio *folio) -{ - return iomap_read_folio(folio, &zonefs_read_iomap_ops); -} - -static void zonefs_readahead(struct readahead_control *rac) -{ - iomap_readahead(rac, &zonefs_read_iomap_ops); -} - -/* - * Map blocks for page writeback. This is used only on conventional zone files, - * which implies that the page range can only be within the fixed inode size. - */ -static int zonefs_write_map_blocks(struct iomap_writepage_ctx *wpc, - struct inode *inode, loff_t offset) -{ - struct zonefs_inode_info *zi = ZONEFS_I(inode); - - if (WARN_ON_ONCE(zi->i_ztype != ZONEFS_ZTYPE_CNV)) - return -EIO; - if (WARN_ON_ONCE(offset >= i_size_read(inode))) - return -EIO; - - /* If the mapping is already OK, nothing needs to be done */ - if (offset >= wpc->iomap.offset && - offset < wpc->iomap.offset + wpc->iomap.length) - return 0; - - return zonefs_write_iomap_begin(inode, offset, zi->i_max_size - offset, - IOMAP_WRITE, &wpc->iomap, NULL); -} - -static const struct iomap_writeback_ops zonefs_writeback_ops = { - .map_blocks = zonefs_write_map_blocks, -}; - -static int zonefs_writepages(struct address_space *mapping, - struct writeback_control *wbc) -{ - struct iomap_writepage_ctx wpc = { }; - - return iomap_writepages(mapping, wbc, &wpc, &zonefs_writeback_ops); -} - -static int zonefs_swap_activate(struct swap_info_struct *sis, - struct file *swap_file, sector_t *span) -{ - struct inode *inode = file_inode(swap_file); - struct zonefs_inode_info *zi = ZONEFS_I(inode); - - if (zi->i_ztype != ZONEFS_ZTYPE_CNV) { - zonefs_err(inode->i_sb, - "swap file: not a conventional zone file\n"); - return -EINVAL; - } - - return iomap_swapfile_activate(sis, swap_file, span, - &zonefs_read_iomap_ops); -} - -static const struct address_space_operations zonefs_file_aops = { - .read_folio = zonefs_read_folio, - .readahead = zonefs_readahead, - .writepages = zonefs_writepages, - .dirty_folio = filemap_dirty_folio, - .release_folio = iomap_release_folio, - .invalidate_folio = iomap_invalidate_folio, - .migrate_folio = filemap_migrate_folio, - .is_partially_uptodate = iomap_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, - .direct_IO = noop_direct_IO, - .swap_activate = zonefs_swap_activate, -}; - -static void zonefs_update_stats(struct inode *inode, loff_t new_isize) +void zonefs_update_stats(struct inode *inode, loff_t new_isize) { struct super_block *sb = inode->i_sb; struct zonefs_sb_info *sbi = ZONEFS_SB(sb); @@ -487,7 +327,7 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, * eventually correct the file size and zonefs inode write pointer offset * (which can be out of sync with the drive due to partial write failures). */ -static void __zonefs_io_error(struct inode *inode, bool write) +void __zonefs_io_error(struct inode *inode, bool write) { struct zonefs_inode_info *zi = ZONEFS_I(inode); struct super_block *sb = inode->i_sb; @@ -526,749 +366,6 @@ static void __zonefs_io_error(struct inode *inode, bool write) memalloc_noio_restore(noio_flag); } -static void zonefs_io_error(struct inode *inode, bool write) -{ - struct zonefs_inode_info *zi = ZONEFS_I(inode); - - mutex_lock(&zi->i_truncate_mutex); - __zonefs_io_error(inode, write); - mutex_unlock(&zi->i_truncate_mutex); -} - -static int zonefs_file_truncate(struct inode *inode, loff_t isize) -{ - struct zonefs_inode_info *zi = ZONEFS_I(inode); - loff_t old_isize; - enum req_op op; - int ret = 0; - - /* - * Only sequential zone files can be truncated and truncation is allowed - * only down to a 0 size, which is equivalent to a zone reset, and to - * the maximum file size, which is equivalent to a zone finish. - */ - if (zi->i_ztype != ZONEFS_ZTYPE_SEQ) - return -EPERM; - - if (!isize) - op = REQ_OP_ZONE_RESET; - else if (isize == zi->i_max_size) - op = REQ_OP_ZONE_FINISH; - else - return -EPERM; - - inode_dio_wait(inode); - - /* Serialize against page faults */ - filemap_invalidate_lock(inode->i_mapping); - - /* Serialize against zonefs_iomap_begin() */ - mutex_lock(&zi->i_truncate_mutex); - - old_isize = i_size_read(inode); - if (isize == old_isize) - goto unlock; - - ret = zonefs_zone_mgmt(inode, op); - if (ret) - goto unlock; - - /* - * If the mount option ZONEFS_MNTOPT_EXPLICIT_OPEN is set, - * take care of open zones. - */ - if (zi->i_flags & ZONEFS_ZONE_OPEN) { - /* - * Truncating a zone to EMPTY or FULL is the equivalent of - * closing the zone. For a truncation to 0, we need to - * re-open the zone to ensure new writes can be processed. - * For a truncation to the maximum file size, the zone is - * closed and writes cannot be accepted anymore, so clear - * the open flag. - */ - if (!isize) - ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN); - else - zi->i_flags &= ~ZONEFS_ZONE_OPEN; - } - - zonefs_update_stats(inode, isize); - truncate_setsize(inode, isize); - zi->i_wpoffset = isize; - zonefs_account_active(inode); - -unlock: - mutex_unlock(&zi->i_truncate_mutex); - filemap_invalidate_unlock(inode->i_mapping); - - return ret; -} - -static int zonefs_inode_setattr(struct user_namespace *mnt_userns, - struct dentry *dentry, struct iattr *iattr) -{ - struct inode *inode = d_inode(dentry); - int ret; - - if (unlikely(IS_IMMUTABLE(inode))) - return -EPERM; - - ret = setattr_prepare(&init_user_ns, dentry, iattr); - if (ret) - return ret; - - /* - * Since files and directories cannot be created nor deleted, do not - * allow setting any write attributes on the sub-directories grouping - * files by zone type. - */ - if ((iattr->ia_valid & ATTR_MODE) && S_ISDIR(inode->i_mode) && - (iattr->ia_mode & 0222)) - return -EPERM; - - if (((iattr->ia_valid & ATTR_UID) && - !uid_eq(iattr->ia_uid, inode->i_uid)) || - ((iattr->ia_valid & ATTR_GID) && - !gid_eq(iattr->ia_gid, inode->i_gid))) { - ret = dquot_transfer(mnt_userns, inode, iattr); - if (ret) - return ret; - } - - if (iattr->ia_valid & ATTR_SIZE) { - ret = zonefs_file_truncate(inode, iattr->ia_size); - if (ret) - return ret; - } - - setattr_copy(&init_user_ns, inode, iattr); - - return 0; -} - -static const struct inode_operations zonefs_file_inode_operations = { - .setattr = zonefs_inode_setattr, -}; - -static int zonefs_file_fsync(struct file *file, loff_t start, loff_t end, - int datasync) -{ - struct inode *inode = file_inode(file); - int ret = 0; - - if (unlikely(IS_IMMUTABLE(inode))) - return -EPERM; - - /* - * Since only direct writes are allowed in sequential files, page cache - * flush is needed only for conventional zone files. - */ - if (ZONEFS_I(inode)->i_ztype == ZONEFS_ZTYPE_CNV) - ret = file_write_and_wait_range(file, start, end); - if (!ret) - ret = blkdev_issue_flush(inode->i_sb->s_bdev); - - if (ret) - zonefs_io_error(inode, true); - - return ret; -} - -static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf) -{ - struct inode *inode = file_inode(vmf->vma->vm_file); - struct zonefs_inode_info *zi = ZONEFS_I(inode); - vm_fault_t ret; - - if (unlikely(IS_IMMUTABLE(inode))) - return VM_FAULT_SIGBUS; - - /* - * Sanity check: only conventional zone files can have shared - * writeable mappings. - */ - if (WARN_ON_ONCE(zi->i_ztype != ZONEFS_ZTYPE_CNV)) - return VM_FAULT_NOPAGE; - - sb_start_pagefault(inode->i_sb); - file_update_time(vmf->vma->vm_file); - - /* Serialize against truncates */ - filemap_invalidate_lock_shared(inode->i_mapping); - ret = iomap_page_mkwrite(vmf, &zonefs_write_iomap_ops); - filemap_invalidate_unlock_shared(inode->i_mapping); - - sb_end_pagefault(inode->i_sb); - return ret; -} - -static const struct vm_operations_struct zonefs_file_vm_ops = { - .fault = filemap_fault, - .map_pages = filemap_map_pages, - .page_mkwrite = zonefs_filemap_page_mkwrite, -}; - -static int zonefs_file_mmap(struct file *file, struct vm_area_struct *vma) -{ - /* - * Conventional zones accept random writes, so their files can support - * shared writable mappings. For sequential zone files, only read - * mappings are possible since there are no guarantees for write - * ordering between msync() and page cache writeback. - */ - if (ZONEFS_I(file_inode(file))->i_ztype == ZONEFS_ZTYPE_SEQ && - (vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) - return -EINVAL; - - file_accessed(file); - vma->vm_ops = &zonefs_file_vm_ops; - - return 0; -} - -static loff_t zonefs_file_llseek(struct file *file, loff_t offset, int whence) -{ - loff_t isize = i_size_read(file_inode(file)); - - /* - * Seeks are limited to below the zone size for conventional zones - * and below the zone write pointer for sequential zones. In both - * cases, this limit is the inode size. - */ - return generic_file_llseek_size(file, offset, whence, isize, isize); -} - -static int zonefs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size, - int error, unsigned int flags) -{ - struct inode *inode = file_inode(iocb->ki_filp); - struct zonefs_inode_info *zi = ZONEFS_I(inode); - - if (error) { - zonefs_io_error(inode, true); - return error; - } - - if (size && zi->i_ztype != ZONEFS_ZTYPE_CNV) { - /* - * Note that we may be seeing completions out of order, - * but that is not a problem since a write completed - * successfully necessarily means that all preceding writes - * were also successful. So we can safely increase the inode - * size to the write end location. - */ - mutex_lock(&zi->i_truncate_mutex); - if (i_size_read(inode) < iocb->ki_pos + size) { - zonefs_update_stats(inode, iocb->ki_pos + size); - zonefs_i_size_write(inode, iocb->ki_pos + size); - } - mutex_unlock(&zi->i_truncate_mutex); - } - - return 0; -} - -static const struct iomap_dio_ops zonefs_write_dio_ops = { - .end_io = zonefs_file_write_dio_end_io, -}; - -static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from) -{ - struct inode *inode = file_inode(iocb->ki_filp); - struct zonefs_inode_info *zi = ZONEFS_I(inode); - struct block_device *bdev = inode->i_sb->s_bdev; - unsigned int max = bdev_max_zone_append_sectors(bdev); - struct bio *bio; - ssize_t size; - int nr_pages; - ssize_t ret; - - max = ALIGN_DOWN(max << SECTOR_SHIFT, inode->i_sb->s_blocksize); - iov_iter_truncate(from, max); - - nr_pages = iov_iter_npages(from, BIO_MAX_VECS); - if (!nr_pages) - return 0; - - bio = bio_alloc(bdev, nr_pages, - REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE, GFP_NOFS); - bio->bi_iter.bi_sector = zi->i_zsector; - bio->bi_ioprio = iocb->ki_ioprio; - if (iocb_is_dsync(iocb)) - bio->bi_opf |= REQ_FUA; - - ret = bio_iov_iter_get_pages(bio, from); - if (unlikely(ret)) - goto out_release; - - size = bio->bi_iter.bi_size; - task_io_account_write(size); - - if (iocb->ki_flags & IOCB_HIPRI) - bio_set_polled(bio, iocb); - - ret = submit_bio_wait(bio); - - /* - * If the file zone was written underneath the file system, the zone - * write pointer may not be where we expect it to be, but the zone - * append write can still succeed. So check manually that we wrote where - * we intended to, that is, at zi->i_wpoffset. - */ - if (!ret) { - sector_t wpsector = - zi->i_zsector + (zi->i_wpoffset >> SECTOR_SHIFT); - - if (bio->bi_iter.bi_sector != wpsector) { - zonefs_warn(inode->i_sb, - "Corrupted write pointer %llu for zone at %llu\n", - wpsector, zi->i_zsector); - ret = -EIO; - } - } - - zonefs_file_write_dio_end_io(iocb, size, ret, 0); - trace_zonefs_file_dio_append(inode, size, ret); - -out_release: - bio_release_pages(bio, false); - bio_put(bio); - - if (ret >= 0) { - iocb->ki_pos += size; - return size; - } - - return ret; -} - -/* - * Do not exceed the LFS limits nor the file zone size. If pos is under the - * limit it becomes a short access. If it exceeds the limit, return -EFBIG. - */ -static loff_t zonefs_write_check_limits(struct file *file, loff_t pos, - loff_t count) -{ - struct inode *inode = file_inode(file); - struct zonefs_inode_info *zi = ZONEFS_I(inode); - loff_t limit = rlimit(RLIMIT_FSIZE); - loff_t max_size = zi->i_max_size; - - if (limit != RLIM_INFINITY) { - if (pos >= limit) { - send_sig(SIGXFSZ, current, 0); - return -EFBIG; - } - count = min(count, limit - pos); - } - - if (!(file->f_flags & O_LARGEFILE)) - max_size = min_t(loff_t, MAX_NON_LFS, max_size); - - if (unlikely(pos >= max_size)) - return -EFBIG; - - return min(count, max_size - pos); -} - -static ssize_t zonefs_write_checks(struct kiocb *iocb, struct iov_iter *from) -{ - struct file *file = iocb->ki_filp; - struct inode *inode = file_inode(file); - struct zonefs_inode_info *zi = ZONEFS_I(inode); - loff_t count; - - if (IS_SWAPFILE(inode)) - return -ETXTBSY; - - if (!iov_iter_count(from)) - return 0; - - if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT)) - return -EINVAL; - - if (iocb->ki_flags & IOCB_APPEND) { - if (zi->i_ztype != ZONEFS_ZTYPE_SEQ) - return -EINVAL; - mutex_lock(&zi->i_truncate_mutex); - iocb->ki_pos = zi->i_wpoffset; - mutex_unlock(&zi->i_truncate_mutex); - } - - count = zonefs_write_check_limits(file, iocb->ki_pos, - iov_iter_count(from)); - if (count < 0) - return count; - - iov_iter_truncate(from, count); - return iov_iter_count(from); -} - -/* - * Handle direct writes. For sequential zone files, this is the only possible - * write path. For these files, check that the user is issuing writes - * sequentially from the end of the file. This code assumes that the block layer - * delivers write requests to the device in sequential order. This is always the - * case if a block IO scheduler implementing the ELEVATOR_F_ZBD_SEQ_WRITE - * elevator feature is being used (e.g. mq-deadline). The block layer always - * automatically select such an elevator for zoned block devices during the - * device initialization. - */ -static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from) -{ - struct inode *inode = file_inode(iocb->ki_filp); - struct zonefs_inode_info *zi = ZONEFS_I(inode); - struct super_block *sb = inode->i_sb; - bool sync = is_sync_kiocb(iocb); - bool append = false; - ssize_t ret, count; - - /* - * For async direct IOs to sequential zone files, refuse IOCB_NOWAIT - * as this can cause write reordering (e.g. the first aio gets EAGAIN - * on the inode lock but the second goes through but is now unaligned). - */ - if (zi->i_ztype == ZONEFS_ZTYPE_SEQ && !sync && - (iocb->ki_flags & IOCB_NOWAIT)) - return -EOPNOTSUPP; - - if (iocb->ki_flags & IOCB_NOWAIT) { - if (!inode_trylock(inode)) - return -EAGAIN; - } else { - inode_lock(inode); - } - - count = zonefs_write_checks(iocb, from); - if (count <= 0) { - ret = count; - goto inode_unlock; - } - - if ((iocb->ki_pos | count) & (sb->s_blocksize - 1)) { - ret = -EINVAL; - goto inode_unlock; - } - - /* Enforce sequential writes (append only) in sequential zones */ - if (zi->i_ztype == ZONEFS_ZTYPE_SEQ) { - mutex_lock(&zi->i_truncate_mutex); - if (iocb->ki_pos != zi->i_wpoffset) { - mutex_unlock(&zi->i_truncate_mutex); - ret = -EINVAL; - goto inode_unlock; - } - mutex_unlock(&zi->i_truncate_mutex); - append = sync; - } - - if (append) - ret = zonefs_file_dio_append(iocb, from); - else - ret = iomap_dio_rw(iocb, from, &zonefs_write_iomap_ops, - &zonefs_write_dio_ops, 0, NULL, 0); - if (zi->i_ztype == ZONEFS_ZTYPE_SEQ && - (ret > 0 || ret == -EIOCBQUEUED)) { - if (ret > 0) - count = ret; - - /* - * Update the zone write pointer offset assuming the write - * operation succeeded. If it did not, the error recovery path - * will correct it. Also do active seq file accounting. - */ - mutex_lock(&zi->i_truncate_mutex); - zi->i_wpoffset += count; - zonefs_account_active(inode); - mutex_unlock(&zi->i_truncate_mutex); - } - -inode_unlock: - inode_unlock(inode); - - return ret; -} - -static ssize_t zonefs_file_buffered_write(struct kiocb *iocb, - struct iov_iter *from) -{ - struct inode *inode = file_inode(iocb->ki_filp); - struct zonefs_inode_info *zi = ZONEFS_I(inode); - ssize_t ret; - - /* - * Direct IO writes are mandatory for sequential zone files so that the - * write IO issuing order is preserved. - */ - if (zi->i_ztype != ZONEFS_ZTYPE_CNV) - return -EIO; - - if (iocb->ki_flags & IOCB_NOWAIT) { - if (!inode_trylock(inode)) - return -EAGAIN; - } else { - inode_lock(inode); - } - - ret = zonefs_write_checks(iocb, from); - if (ret <= 0) - goto inode_unlock; - - ret = iomap_file_buffered_write(iocb, from, &zonefs_write_iomap_ops); - if (ret > 0) - iocb->ki_pos += ret; - else if (ret == -EIO) - zonefs_io_error(inode, true); - -inode_unlock: - inode_unlock(inode); - if (ret > 0) - ret = generic_write_sync(iocb, ret); - - return ret; -} - -static ssize_t zonefs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) -{ - struct inode *inode = file_inode(iocb->ki_filp); - - if (unlikely(IS_IMMUTABLE(inode))) - return -EPERM; - - if (sb_rdonly(inode->i_sb)) - return -EROFS; - - /* Write operations beyond the zone size are not allowed */ - if (iocb->ki_pos >= ZONEFS_I(inode)->i_max_size) - return -EFBIG; - - if (iocb->ki_flags & IOCB_DIRECT) { - ssize_t ret = zonefs_file_dio_write(iocb, from); - if (ret != -ENOTBLK) - return ret; - } - - return zonefs_file_buffered_write(iocb, from); -} - -static int zonefs_file_read_dio_end_io(struct kiocb *iocb, ssize_t size, - int error, unsigned int flags) -{ - if (error) { - zonefs_io_error(file_inode(iocb->ki_filp), false); - return error; - } - - return 0; -} - -static const struct iomap_dio_ops zonefs_read_dio_ops = { - .end_io = zonefs_file_read_dio_end_io, -}; - -static ssize_t zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) -{ - struct inode *inode = file_inode(iocb->ki_filp); - struct zonefs_inode_info *zi = ZONEFS_I(inode); - struct super_block *sb = inode->i_sb; - loff_t isize; - ssize_t ret; - - /* Offline zones cannot be read */ - if (unlikely(IS_IMMUTABLE(inode) && !(inode->i_mode & 0777))) - return -EPERM; - - if (iocb->ki_pos >= zi->i_max_size) - return 0; - - if (iocb->ki_flags & IOCB_NOWAIT) { - if (!inode_trylock_shared(inode)) - return -EAGAIN; - } else { - inode_lock_shared(inode); - } - - /* Limit read operations to written data */ - mutex_lock(&zi->i_truncate_mutex); - isize = i_size_read(inode); - if (iocb->ki_pos >= isize) { - mutex_unlock(&zi->i_truncate_mutex); - ret = 0; - goto inode_unlock; - } - iov_iter_truncate(to, isize - iocb->ki_pos); - mutex_unlock(&zi->i_truncate_mutex); - - if (iocb->ki_flags & IOCB_DIRECT) { - size_t count = iov_iter_count(to); - - if ((iocb->ki_pos | count) & (sb->s_blocksize - 1)) { - ret = -EINVAL; - goto inode_unlock; - } - file_accessed(iocb->ki_filp); - ret = iomap_dio_rw(iocb, to, &zonefs_read_iomap_ops, - &zonefs_read_dio_ops, 0, NULL, 0); - } else { - ret = generic_file_read_iter(iocb, to); - if (ret == -EIO) - zonefs_io_error(inode, false); - } - -inode_unlock: - inode_unlock_shared(inode); - - return ret; -} - -/* - * Write open accounting is done only for sequential files. - */ -static inline bool zonefs_seq_file_need_wro(struct inode *inode, - struct file *file) -{ - struct zonefs_inode_info *zi = ZONEFS_I(inode); - - if (zi->i_ztype != ZONEFS_ZTYPE_SEQ) - return false; - - if (!(file->f_mode & FMODE_WRITE)) - return false; - - return true; -} - -static int zonefs_seq_file_write_open(struct inode *inode) -{ - struct zonefs_inode_info *zi = ZONEFS_I(inode); - int ret = 0; - - mutex_lock(&zi->i_truncate_mutex); - - if (!zi->i_wr_refcnt) { - struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb); - unsigned int wro = atomic_inc_return(&sbi->s_wro_seq_files); - - if (sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) { - - if (sbi->s_max_wro_seq_files - && wro > sbi->s_max_wro_seq_files) { - atomic_dec(&sbi->s_wro_seq_files); - ret = -EBUSY; - goto unlock; - } - - if (i_size_read(inode) < zi->i_max_size) { - ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN); - if (ret) { - atomic_dec(&sbi->s_wro_seq_files); - goto unlock; - } - zi->i_flags |= ZONEFS_ZONE_OPEN; - zonefs_account_active(inode); - } - } - } - - zi->i_wr_refcnt++; - -unlock: - mutex_unlock(&zi->i_truncate_mutex); - - return ret; -} - -static int zonefs_file_open(struct inode *inode, struct file *file) -{ - int ret; - - ret = generic_file_open(inode, file); - if (ret) - return ret; - - if (zonefs_seq_file_need_wro(inode, file)) - return zonefs_seq_file_write_open(inode); - - return 0; -} - -static void zonefs_seq_file_write_close(struct inode *inode) -{ - struct zonefs_inode_info *zi = ZONEFS_I(inode); - struct super_block *sb = inode->i_sb; - struct zonefs_sb_info *sbi = ZONEFS_SB(sb); - int ret = 0; - - mutex_lock(&zi->i_truncate_mutex); - - zi->i_wr_refcnt--; - if (zi->i_wr_refcnt) - goto unlock; - - /* - * The file zone may not be open anymore (e.g. the file was truncated to - * its maximum size or it was fully written). For this case, we only - * need to decrement the write open count. - */ - if (zi->i_flags & ZONEFS_ZONE_OPEN) { - ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_CLOSE); - if (ret) { - __zonefs_io_error(inode, false); - /* - * Leaving zones explicitly open may lead to a state - * where most zones cannot be written (zone resources - * exhausted). So take preventive action by remounting - * read-only. - */ - if (zi->i_flags & ZONEFS_ZONE_OPEN && - !(sb->s_flags & SB_RDONLY)) { - zonefs_warn(sb, - "closing zone at %llu failed %d\n", - zi->i_zsector, ret); - zonefs_warn(sb, - "remounting filesystem read-only\n"); - sb->s_flags |= SB_RDONLY; - } - goto unlock; - } - - zi->i_flags &= ~ZONEFS_ZONE_OPEN; - zonefs_account_active(inode); - } - - atomic_dec(&sbi->s_wro_seq_files); - -unlock: - mutex_unlock(&zi->i_truncate_mutex); -} - -static int zonefs_file_release(struct inode *inode, struct file *file) -{ - /* - * If we explicitly open a zone we must close it again as well, but the - * zone management operation can fail (either due to an IO error or as - * the zone has gone offline or read-only). Make sure we don't fail the - * close(2) for user-space. - */ - if (zonefs_seq_file_need_wro(inode, file)) - zonefs_seq_file_write_close(inode); - - return 0; -} - -static const struct file_operations zonefs_file_operations = { - .open = zonefs_file_open, - .release = zonefs_file_release, - .fsync = zonefs_file_fsync, - .mmap = zonefs_file_mmap, - .llseek = zonefs_file_llseek, - .read_iter = zonefs_file_read_iter, - .write_iter = zonefs_file_write_iter, - .splice_read = generic_file_splice_read, - .splice_write = iter_file_splice_write, - .iopoll = iocb_bio_iopoll, -}; - static struct kmem_cache *zonefs_inode_cachep; static struct inode *zonefs_alloc_inode(struct super_block *sb) @@ -1408,13 +505,47 @@ static int zonefs_remount(struct super_block *sb, int *flags, char *data) return zonefs_parse_options(sb, data); } -static const struct super_operations zonefs_sops = { - .alloc_inode = zonefs_alloc_inode, - .free_inode = zonefs_free_inode, - .statfs = zonefs_statfs, - .remount_fs = zonefs_remount, - .show_options = zonefs_show_options, -}; +static int zonefs_inode_setattr(struct user_namespace *mnt_userns, + struct dentry *dentry, struct iattr *iattr) +{ + struct inode *inode = d_inode(dentry); + int ret; + + if (unlikely(IS_IMMUTABLE(inode))) + return -EPERM; + + ret = setattr_prepare(&init_user_ns, dentry, iattr); + if (ret) + return ret; + + /* + * Since files and directories cannot be created nor deleted, do not + * allow setting any write attributes on the sub-directories grouping + * files by zone type. + */ + if ((iattr->ia_valid & ATTR_MODE) && S_ISDIR(inode->i_mode) && + (iattr->ia_mode & 0222)) + return -EPERM; + + if (((iattr->ia_valid & ATTR_UID) && + !uid_eq(iattr->ia_uid, inode->i_uid)) || + ((iattr->ia_valid & ATTR_GID) && + !gid_eq(iattr->ia_gid, inode->i_gid))) { + ret = dquot_transfer(mnt_userns, inode, iattr); + if (ret) + return ret; + } + + if (iattr->ia_valid & ATTR_SIZE) { + ret = zonefs_file_truncate(inode, iattr->ia_size); + if (ret) + return ret; + } + + setattr_copy(&init_user_ns, inode, iattr); + + return 0; +} static const struct inode_operations zonefs_dir_inode_operations = { .lookup = simple_lookup, @@ -1434,6 +565,10 @@ static void zonefs_init_dir_inode(struct inode *parent, struct inode *inode, inc_nlink(parent); } +static const struct inode_operations zonefs_file_inode_operations = { + .setattr = zonefs_inode_setattr, +}; + static int zonefs_init_file_inode(struct inode *inode, struct blk_zone *zone, enum zonefs_ztype type) { @@ -1785,6 +920,14 @@ free_page: return ret; } +static const struct super_operations zonefs_sops = { + .alloc_inode = zonefs_alloc_inode, + .free_inode = zonefs_free_inode, + .statfs = zonefs_statfs, + .remount_fs = zonefs_remount, + .show_options = zonefs_show_options, +}; + /* * Check that the device is zoned. If it is, get the list of zones and create * sub-directories and files according to the device zone configuration and diff --git a/fs/zonefs/zonefs.h b/fs/zonefs/zonefs.h index 1dbe78119ff1..839ebe9afb6c 100644 --- a/fs/zonefs/zonefs.h +++ b/fs/zonefs/zonefs.h @@ -209,6 +209,28 @@ static inline struct zonefs_sb_info *ZONEFS_SB(struct super_block *sb) #define zonefs_warn(sb, format, args...) \ pr_warn("zonefs (%s) WARNING: " format, sb->s_id, ## args) +/* In super.c */ +void zonefs_account_active(struct inode *inode); +int zonefs_zone_mgmt(struct inode *inode, enum req_op op); +void zonefs_i_size_write(struct inode *inode, loff_t isize); +void zonefs_update_stats(struct inode *inode, loff_t new_isize); +void __zonefs_io_error(struct inode *inode, bool write); + +static inline void zonefs_io_error(struct inode *inode, bool write) +{ + struct zonefs_inode_info *zi = ZONEFS_I(inode); + + mutex_lock(&zi->i_truncate_mutex); + __zonefs_io_error(inode, write); + mutex_unlock(&zi->i_truncate_mutex); +} + +/* In file.c */ +extern const struct address_space_operations zonefs_file_aops; +extern const struct file_operations zonefs_file_operations; +int zonefs_file_truncate(struct inode *inode, loff_t isize); + +/* In sysfs.c */ int zonefs_sysfs_register(struct super_block *sb); void zonefs_sysfs_unregister(struct super_block *sb); int zonefs_sysfs_init(void); From 3741898b169476c17ece7962e5acf415c7041265 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 25 Nov 2022 11:06:20 +0900 Subject: [PATCH 006/179] zonefs: Simplify IO error handling [ Upstream commit 46a9c526eef7fb68a00321e2a9591ce5276ae92b ] Simplify zonefs_check_zone_condition() by moving the code that changes an inode access rights to the new function zonefs_inode_update_mode(). Furthermore, since on mount an inode wpoffset is always zero when zonefs_check_zone_condition() is called during an inode initialization, the "mount" boolean argument is not necessary for the readonly zone case. This argument is thus removed. zonefs_io_error_cb() is also modified to use the inode offline and zone state flags instead of checking the device zone condition. The multiple calls to zonefs_check_zone_condition() are reduced to the first call on entry, which allows removing the "warn" argument. zonefs_inode_update_mode() is also used to update an inode access rights as zonefs_io_error_cb() modifies the inode flags depending on the volume error handling mode (defined with a mount option). Since an inode mode change differs for read-only zones between mount time and IO error time, the flag ZONEFS_ZONE_INIT_MODE is used to differentiate both cases. Signed-off-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Stable-dep-of: 88b170088ad2 ("zonefs: Fix error message in zonefs_file_dio_append()") Signed-off-by: Sasha Levin --- fs/zonefs/super.c | 110 ++++++++++++++++++++++++--------------------- fs/zonefs/zonefs.h | 9 ++-- 2 files changed, 64 insertions(+), 55 deletions(-) diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index e808276b8801..6307cc95be06 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -155,48 +155,31 @@ void zonefs_update_stats(struct inode *inode, loff_t new_isize) * amount of readable data in the zone. */ static loff_t zonefs_check_zone_condition(struct inode *inode, - struct blk_zone *zone, bool warn, - bool mount) + struct blk_zone *zone) { struct zonefs_inode_info *zi = ZONEFS_I(inode); switch (zone->cond) { case BLK_ZONE_COND_OFFLINE: - /* - * Dead zone: make the inode immutable, disable all accesses - * and set the file size to 0 (zone wp set to zone start). - */ - if (warn) - zonefs_warn(inode->i_sb, "inode %lu: offline zone\n", - inode->i_ino); - inode->i_flags |= S_IMMUTABLE; - inode->i_mode &= ~0777; - zone->wp = zone->start; + zonefs_warn(inode->i_sb, "inode %lu: offline zone\n", + inode->i_ino); zi->i_flags |= ZONEFS_ZONE_OFFLINE; return 0; case BLK_ZONE_COND_READONLY: /* - * The write pointer of read-only zones is invalid. If such a - * zone is found during mount, the file size cannot be retrieved - * so we treat the zone as offline (mount == true case). - * Otherwise, keep the file size as it was when last updated - * so that the user can recover data. In both cases, writes are - * always disabled for the zone. + * The write pointer of read-only zones is invalid, so we cannot + * determine the zone wpoffset (inode size). We thus keep the + * zone wpoffset as is, which leads to an empty file + * (wpoffset == 0) on mount. For a runtime error, this keeps + * the inode size as it was when last updated so that the user + * can recover data. */ - if (warn) - zonefs_warn(inode->i_sb, "inode %lu: read-only zone\n", - inode->i_ino); - inode->i_flags |= S_IMMUTABLE; - if (mount) { - zone->cond = BLK_ZONE_COND_OFFLINE; - inode->i_mode &= ~0777; - zone->wp = zone->start; - zi->i_flags |= ZONEFS_ZONE_OFFLINE; - return 0; - } + zonefs_warn(inode->i_sb, "inode %lu: read-only zone\n", + inode->i_ino); zi->i_flags |= ZONEFS_ZONE_READONLY; - inode->i_mode &= ~0222; - return i_size_read(inode); + if (zi->i_ztype == ZONEFS_ZTYPE_CNV) + return zi->i_max_size; + return zi->i_wpoffset; case BLK_ZONE_COND_FULL: /* The write pointer of full zones is invalid. */ return zi->i_max_size; @@ -207,6 +190,30 @@ static loff_t zonefs_check_zone_condition(struct inode *inode, } } +/* + * Check a zone condition and adjust its inode access permissions for + * offline and readonly zones. + */ +static void zonefs_inode_update_mode(struct inode *inode) +{ + struct zonefs_inode_info *zi = ZONEFS_I(inode); + + if (zi->i_flags & ZONEFS_ZONE_OFFLINE) { + /* Offline zones cannot be read nor written */ + inode->i_flags |= S_IMMUTABLE; + inode->i_mode &= ~0777; + } else if (zi->i_flags & ZONEFS_ZONE_READONLY) { + /* Readonly zones cannot be written */ + inode->i_flags |= S_IMMUTABLE; + if (zi->i_flags & ZONEFS_ZONE_INIT_MODE) + inode->i_mode &= ~0777; + else + inode->i_mode &= ~0222; + } + + zi->i_flags &= ~ZONEFS_ZONE_INIT_MODE; +} + struct zonefs_ioerr_data { struct inode *inode; bool write; @@ -228,10 +235,9 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, * as there is no inconsistency between the inode size and the amount of * data writen in the zone (data_size). */ - data_size = zonefs_check_zone_condition(inode, zone, true, false); + data_size = zonefs_check_zone_condition(inode, zone); isize = i_size_read(inode); - if (zone->cond != BLK_ZONE_COND_OFFLINE && - zone->cond != BLK_ZONE_COND_READONLY && + if (!(zi->i_flags & (ZONEFS_ZONE_READONLY | ZONEFS_ZONE_OFFLINE)) && !err->write && isize == data_size) return 0; @@ -264,24 +270,22 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, * zone condition to read-only and offline respectively, as if the * condition was signaled by the hardware. */ - if (zone->cond == BLK_ZONE_COND_OFFLINE || - sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZOL) { + if ((zi->i_flags & ZONEFS_ZONE_OFFLINE) || + (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZOL)) { zonefs_warn(sb, "inode %lu: read/write access disabled\n", inode->i_ino); - if (zone->cond != BLK_ZONE_COND_OFFLINE) { - zone->cond = BLK_ZONE_COND_OFFLINE; - data_size = zonefs_check_zone_condition(inode, zone, - false, false); - } - } else if (zone->cond == BLK_ZONE_COND_READONLY || - sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZRO) { + if (!(zi->i_flags & ZONEFS_ZONE_OFFLINE)) + zi->i_flags |= ZONEFS_ZONE_OFFLINE; + zonefs_inode_update_mode(inode); + data_size = 0; + } else if ((zi->i_flags & ZONEFS_ZONE_READONLY) || + (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZRO)) { zonefs_warn(sb, "inode %lu: write access disabled\n", inode->i_ino); - if (zone->cond != BLK_ZONE_COND_READONLY) { - zone->cond = BLK_ZONE_COND_READONLY; - data_size = zonefs_check_zone_condition(inode, zone, - false, false); - } + if (!(zi->i_flags & ZONEFS_ZONE_READONLY)) + zi->i_flags |= ZONEFS_ZONE_READONLY; + zonefs_inode_update_mode(inode); + data_size = isize; } else if (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_RO && data_size > isize) { /* Do not expose garbage data */ @@ -295,8 +299,7 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, * close of the zone when the inode file is closed. */ if ((sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) && - (zone->cond == BLK_ZONE_COND_OFFLINE || - zone->cond == BLK_ZONE_COND_READONLY)) + (zi->i_flags & (ZONEFS_ZONE_READONLY | ZONEFS_ZONE_OFFLINE))) zi->i_flags &= ~ZONEFS_ZONE_OPEN; /* @@ -378,6 +381,7 @@ static struct inode *zonefs_alloc_inode(struct super_block *sb) inode_init_once(&zi->i_vnode); mutex_init(&zi->i_truncate_mutex); + zi->i_wpoffset = 0; zi->i_wr_refcnt = 0; zi->i_flags = 0; @@ -594,7 +598,7 @@ static int zonefs_init_file_inode(struct inode *inode, struct blk_zone *zone, zi->i_max_size = min_t(loff_t, MAX_LFS_FILESIZE, zone->capacity << SECTOR_SHIFT); - zi->i_wpoffset = zonefs_check_zone_condition(inode, zone, true, true); + zi->i_wpoffset = zonefs_check_zone_condition(inode, zone); inode->i_uid = sbi->s_uid; inode->i_gid = sbi->s_gid; @@ -605,6 +609,10 @@ static int zonefs_init_file_inode(struct inode *inode, struct blk_zone *zone, inode->i_fop = &zonefs_file_operations; inode->i_mapping->a_ops = &zonefs_file_aops; + /* Update the inode access rights depending on the zone condition */ + zi->i_flags |= ZONEFS_ZONE_INIT_MODE; + zonefs_inode_update_mode(inode); + sb->s_maxbytes = max(zi->i_max_size, sb->s_maxbytes); sbi->s_blocks += zi->i_max_size >> sb->s_blocksize_bits; sbi->s_used_blocks += zi->i_wpoffset >> sb->s_blocksize_bits; diff --git a/fs/zonefs/zonefs.h b/fs/zonefs/zonefs.h index 839ebe9afb6c..439096445ee5 100644 --- a/fs/zonefs/zonefs.h +++ b/fs/zonefs/zonefs.h @@ -39,10 +39,11 @@ static inline enum zonefs_ztype zonefs_zone_type(struct blk_zone *zone) return ZONEFS_ZTYPE_SEQ; } -#define ZONEFS_ZONE_OPEN (1U << 0) -#define ZONEFS_ZONE_ACTIVE (1U << 1) -#define ZONEFS_ZONE_OFFLINE (1U << 2) -#define ZONEFS_ZONE_READONLY (1U << 3) +#define ZONEFS_ZONE_INIT_MODE (1U << 0) +#define ZONEFS_ZONE_OPEN (1U << 1) +#define ZONEFS_ZONE_ACTIVE (1U << 2) +#define ZONEFS_ZONE_OFFLINE (1U << 3) +#define ZONEFS_ZONE_READONLY (1U << 4) /* * In-memory inode data. From 7558b249cb4e77b2cf58725895e05bbe0fd1a80e Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 24 Nov 2022 19:43:30 +0900 Subject: [PATCH 007/179] zonefs: Reduce struct zonefs_inode_info size [ Upstream commit 34422914dc00b291d1c47dbdabe93b154c2f2b25 ] Instead of using the i_ztype field in struct zonefs_inode_info to indicate the zone type of an inode, introduce the new inode flag ZONEFS_ZONE_CNV to be set in the i_flags field of struct zonefs_inode_info to identify conventional zones. If this flag is not set, the zone of an inode is considered to be a sequential zone. The helpers zonefs_zone_is_cnv(), zonefs_zone_is_seq(), zonefs_inode_is_cnv() and zonefs_inode_is_seq() are introduced to simplify testing the zone type of a struct zonefs_inode_info and of a struct inode. Signed-off-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Stable-dep-of: 88b170088ad2 ("zonefs: Fix error message in zonefs_file_dio_append()") Signed-off-by: Sasha Levin --- fs/zonefs/file.c | 35 ++++++++++++++--------------------- fs/zonefs/super.c | 12 +++++++----- fs/zonefs/zonefs.h | 24 +++++++++++++++++++++--- 3 files changed, 42 insertions(+), 29 deletions(-) diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c index ece0f3959b6d..64873d31d75d 100644 --- a/fs/zonefs/file.c +++ b/fs/zonefs/file.c @@ -77,8 +77,7 @@ static int zonefs_write_iomap_begin(struct inode *inode, loff_t offset, * checked when writes are issued, so warn if we see a page writeback * operation. */ - if (WARN_ON_ONCE(zi->i_ztype == ZONEFS_ZTYPE_SEQ && - !(flags & IOMAP_DIRECT))) + if (WARN_ON_ONCE(zonefs_zone_is_seq(zi) && !(flags & IOMAP_DIRECT))) return -EIO; /* @@ -128,7 +127,7 @@ static int zonefs_write_map_blocks(struct iomap_writepage_ctx *wpc, { struct zonefs_inode_info *zi = ZONEFS_I(inode); - if (WARN_ON_ONCE(zi->i_ztype != ZONEFS_ZTYPE_CNV)) + if (WARN_ON_ONCE(zonefs_zone_is_seq(zi))) return -EIO; if (WARN_ON_ONCE(offset >= i_size_read(inode))) return -EIO; @@ -158,9 +157,8 @@ static int zonefs_swap_activate(struct swap_info_struct *sis, struct file *swap_file, sector_t *span) { struct inode *inode = file_inode(swap_file); - struct zonefs_inode_info *zi = ZONEFS_I(inode); - if (zi->i_ztype != ZONEFS_ZTYPE_CNV) { + if (zonefs_inode_is_seq(inode)) { zonefs_err(inode->i_sb, "swap file: not a conventional zone file\n"); return -EINVAL; @@ -196,7 +194,7 @@ int zonefs_file_truncate(struct inode *inode, loff_t isize) * only down to a 0 size, which is equivalent to a zone reset, and to * the maximum file size, which is equivalent to a zone finish. */ - if (zi->i_ztype != ZONEFS_ZTYPE_SEQ) + if (!zonefs_zone_is_seq(zi)) return -EPERM; if (!isize) @@ -266,7 +264,7 @@ static int zonefs_file_fsync(struct file *file, loff_t start, loff_t end, * Since only direct writes are allowed in sequential files, page cache * flush is needed only for conventional zone files. */ - if (ZONEFS_I(inode)->i_ztype == ZONEFS_ZTYPE_CNV) + if (zonefs_inode_is_cnv(inode)) ret = file_write_and_wait_range(file, start, end); if (!ret) ret = blkdev_issue_flush(inode->i_sb->s_bdev); @@ -280,7 +278,6 @@ static int zonefs_file_fsync(struct file *file, loff_t start, loff_t end, static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf) { struct inode *inode = file_inode(vmf->vma->vm_file); - struct zonefs_inode_info *zi = ZONEFS_I(inode); vm_fault_t ret; if (unlikely(IS_IMMUTABLE(inode))) @@ -290,7 +287,7 @@ static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf) * Sanity check: only conventional zone files can have shared * writeable mappings. */ - if (WARN_ON_ONCE(zi->i_ztype != ZONEFS_ZTYPE_CNV)) + if (zonefs_inode_is_seq(inode)) return VM_FAULT_NOPAGE; sb_start_pagefault(inode->i_sb); @@ -319,7 +316,7 @@ static int zonefs_file_mmap(struct file *file, struct vm_area_struct *vma) * mappings are possible since there are no guarantees for write * ordering between msync() and page cache writeback. */ - if (ZONEFS_I(file_inode(file))->i_ztype == ZONEFS_ZTYPE_SEQ && + if (zonefs_inode_is_seq(file_inode(file)) && (vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) return -EINVAL; @@ -352,7 +349,7 @@ static int zonefs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size, return error; } - if (size && zi->i_ztype != ZONEFS_ZTYPE_CNV) { + if (size && zonefs_zone_is_seq(zi)) { /* * Note that we may be seeing completions out of order, * but that is not a problem since a write completed @@ -491,7 +488,7 @@ static ssize_t zonefs_write_checks(struct kiocb *iocb, struct iov_iter *from) return -EINVAL; if (iocb->ki_flags & IOCB_APPEND) { - if (zi->i_ztype != ZONEFS_ZTYPE_SEQ) + if (zonefs_zone_is_cnv(zi)) return -EINVAL; mutex_lock(&zi->i_truncate_mutex); iocb->ki_pos = zi->i_wpoffset; @@ -531,8 +528,7 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from) * as this can cause write reordering (e.g. the first aio gets EAGAIN * on the inode lock but the second goes through but is now unaligned). */ - if (zi->i_ztype == ZONEFS_ZTYPE_SEQ && !sync && - (iocb->ki_flags & IOCB_NOWAIT)) + if (zonefs_zone_is_seq(zi) && !sync && (iocb->ki_flags & IOCB_NOWAIT)) return -EOPNOTSUPP; if (iocb->ki_flags & IOCB_NOWAIT) { @@ -554,7 +550,7 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from) } /* Enforce sequential writes (append only) in sequential zones */ - if (zi->i_ztype == ZONEFS_ZTYPE_SEQ) { + if (zonefs_zone_is_seq(zi)) { mutex_lock(&zi->i_truncate_mutex); if (iocb->ki_pos != zi->i_wpoffset) { mutex_unlock(&zi->i_truncate_mutex); @@ -570,7 +566,7 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from) else ret = iomap_dio_rw(iocb, from, &zonefs_write_iomap_ops, &zonefs_write_dio_ops, 0, NULL, 0); - if (zi->i_ztype == ZONEFS_ZTYPE_SEQ && + if (zonefs_zone_is_seq(zi) && (ret > 0 || ret == -EIOCBQUEUED)) { if (ret > 0) count = ret; @@ -596,14 +592,13 @@ static ssize_t zonefs_file_buffered_write(struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); - struct zonefs_inode_info *zi = ZONEFS_I(inode); ssize_t ret; /* * Direct IO writes are mandatory for sequential zone files so that the * write IO issuing order is preserved. */ - if (zi->i_ztype != ZONEFS_ZTYPE_CNV) + if (zonefs_inode_is_seq(inode)) return -EIO; if (iocb->ki_flags & IOCB_NOWAIT) { @@ -731,9 +726,7 @@ inode_unlock: static inline bool zonefs_seq_file_need_wro(struct inode *inode, struct file *file) { - struct zonefs_inode_info *zi = ZONEFS_I(inode); - - if (zi->i_ztype != ZONEFS_ZTYPE_SEQ) + if (zonefs_inode_is_cnv(inode)) return false; if (!(file->f_mode & FMODE_WRITE)) diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index 6307cc95be06..a4af29dc32e7 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -37,7 +37,7 @@ void zonefs_account_active(struct inode *inode) lockdep_assert_held(&zi->i_truncate_mutex); - if (zi->i_ztype != ZONEFS_ZTYPE_SEQ) + if (zonefs_zone_is_cnv(zi)) return; /* @@ -177,14 +177,14 @@ static loff_t zonefs_check_zone_condition(struct inode *inode, zonefs_warn(inode->i_sb, "inode %lu: read-only zone\n", inode->i_ino); zi->i_flags |= ZONEFS_ZONE_READONLY; - if (zi->i_ztype == ZONEFS_ZTYPE_CNV) + if (zonefs_zone_is_cnv(zi)) return zi->i_max_size; return zi->i_wpoffset; case BLK_ZONE_COND_FULL: /* The write pointer of full zones is invalid. */ return zi->i_max_size; default: - if (zi->i_ztype == ZONEFS_ZTYPE_CNV) + if (zonefs_zone_is_cnv(zi)) return zi->i_max_size; return (zone->wp - zone->start) << SECTOR_SHIFT; } @@ -260,7 +260,7 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, * In all cases, warn about inode size inconsistency and handle the * IO error according to the zone condition and to the mount options. */ - if (zi->i_ztype == ZONEFS_ZTYPE_SEQ && isize != data_size) + if (zonefs_zone_is_seq(zi) && isize != data_size) zonefs_warn(sb, "inode %lu: invalid size %lld (should be %lld)\n", inode->i_ino, isize, data_size); @@ -584,7 +584,9 @@ static int zonefs_init_file_inode(struct inode *inode, struct blk_zone *zone, inode->i_ino = zone->start >> sbi->s_zone_sectors_shift; inode->i_mode = S_IFREG | sbi->s_perm; - zi->i_ztype = type; + if (type == ZONEFS_ZTYPE_CNV) + zi->i_flags |= ZONEFS_ZONE_CNV; + zi->i_zsector = zone->start; zi->i_zone_size = zone->len << SECTOR_SHIFT; if (zi->i_zone_size > bdev_zone_sectors(sb->s_bdev) << SECTOR_SHIFT && diff --git a/fs/zonefs/zonefs.h b/fs/zonefs/zonefs.h index 439096445ee5..1a225f74015a 100644 --- a/fs/zonefs/zonefs.h +++ b/fs/zonefs/zonefs.h @@ -44,6 +44,7 @@ static inline enum zonefs_ztype zonefs_zone_type(struct blk_zone *zone) #define ZONEFS_ZONE_ACTIVE (1U << 2) #define ZONEFS_ZONE_OFFLINE (1U << 3) #define ZONEFS_ZONE_READONLY (1U << 4) +#define ZONEFS_ZONE_CNV (1U << 31) /* * In-memory inode data. @@ -51,9 +52,6 @@ static inline enum zonefs_ztype zonefs_zone_type(struct blk_zone *zone) struct zonefs_inode_info { struct inode i_vnode; - /* File zone type */ - enum zonefs_ztype i_ztype; - /* File zone start sector (512B unit) */ sector_t i_zsector; @@ -91,6 +89,26 @@ static inline struct zonefs_inode_info *ZONEFS_I(struct inode *inode) return container_of(inode, struct zonefs_inode_info, i_vnode); } +static inline bool zonefs_zone_is_cnv(struct zonefs_inode_info *zi) +{ + return zi->i_flags & ZONEFS_ZONE_CNV; +} + +static inline bool zonefs_zone_is_seq(struct zonefs_inode_info *zi) +{ + return !zonefs_zone_is_cnv(zi); +} + +static inline bool zonefs_inode_is_cnv(struct inode *inode) +{ + return zonefs_zone_is_cnv(ZONEFS_I(inode)); +} + +static inline bool zonefs_inode_is_seq(struct inode *inode) +{ + return zonefs_zone_is_seq(ZONEFS_I(inode)); +} + /* * On-disk super block (block 0). */ From 81cf745f110557673c8c7f0263bd8c30165bd07b Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 16 Nov 2022 18:15:40 +0900 Subject: [PATCH 008/179] zonefs: Separate zone information from inode information [ Upstream commit aa7f243f32e1d18036ee00d71d3ccfad70ae2121 ] In preparation for adding dynamic inode allocation, separate an inode zone information from the zonefs inode structure. The new data structure zonefs_zone is introduced to store in memory information about a zone that must be kept throughout the lifetime of the device mount. Linking between a zone file inode and its zone information is done by setting the inode i_private field to point to a struct zonefs_zone. Using the i_private pointer avoids the need for adding a pointer in struct zonefs_inode_info. Beside the vfs inode, this structure is reduced to a mutex and a write open counter. One struct zonefs_zone is created per file inode on mount. These structures are organized in an array using the new struct zonefs_zone_group data structure to represent zone groups. The zonefs_zone arrays are indexed per file number (the index of a struct zonefs_zone in its array directly gives the file number/name for that zone file inode). Signed-off-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Stable-dep-of: 88b170088ad2 ("zonefs: Fix error message in zonefs_file_dio_append()") Signed-off-by: Sasha Levin --- fs/zonefs/file.c | 99 ++++---- fs/zonefs/super.c | 571 +++++++++++++++++++++++++++------------------ fs/zonefs/trace.h | 20 +- fs/zonefs/zonefs.h | 65 ++++-- 4 files changed, 450 insertions(+), 305 deletions(-) diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c index 64873d31d75d..738b0e28d74b 100644 --- a/fs/zonefs/file.c +++ b/fs/zonefs/file.c @@ -29,6 +29,7 @@ static int zonefs_read_iomap_begin(struct inode *inode, loff_t offset, struct iomap *iomap, struct iomap *srcmap) { struct zonefs_inode_info *zi = ZONEFS_I(inode); + struct zonefs_zone *z = zonefs_inode_zone(inode); struct super_block *sb = inode->i_sb; loff_t isize; @@ -46,7 +47,7 @@ static int zonefs_read_iomap_begin(struct inode *inode, loff_t offset, iomap->length = length; } else { iomap->type = IOMAP_MAPPED; - iomap->addr = (zi->i_zsector << SECTOR_SHIFT) + iomap->offset; + iomap->addr = (z->z_sector << SECTOR_SHIFT) + iomap->offset; iomap->length = isize - iomap->offset; } mutex_unlock(&zi->i_truncate_mutex); @@ -65,11 +66,12 @@ static int zonefs_write_iomap_begin(struct inode *inode, loff_t offset, struct iomap *iomap, struct iomap *srcmap) { struct zonefs_inode_info *zi = ZONEFS_I(inode); + struct zonefs_zone *z = zonefs_inode_zone(inode); struct super_block *sb = inode->i_sb; loff_t isize; /* All write I/Os should always be within the file maximum size */ - if (WARN_ON_ONCE(offset + length > zi->i_max_size)) + if (WARN_ON_ONCE(offset + length > z->z_capacity)) return -EIO; /* @@ -77,7 +79,7 @@ static int zonefs_write_iomap_begin(struct inode *inode, loff_t offset, * checked when writes are issued, so warn if we see a page writeback * operation. */ - if (WARN_ON_ONCE(zonefs_zone_is_seq(zi) && !(flags & IOMAP_DIRECT))) + if (WARN_ON_ONCE(zonefs_zone_is_seq(z) && !(flags & IOMAP_DIRECT))) return -EIO; /* @@ -88,11 +90,11 @@ static int zonefs_write_iomap_begin(struct inode *inode, loff_t offset, mutex_lock(&zi->i_truncate_mutex); iomap->bdev = inode->i_sb->s_bdev; iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize); - iomap->addr = (zi->i_zsector << SECTOR_SHIFT) + iomap->offset; + iomap->addr = (z->z_sector << SECTOR_SHIFT) + iomap->offset; isize = i_size_read(inode); if (iomap->offset >= isize) { iomap->type = IOMAP_UNWRITTEN; - iomap->length = zi->i_max_size - iomap->offset; + iomap->length = z->z_capacity - iomap->offset; } else { iomap->type = IOMAP_MAPPED; iomap->length = isize - iomap->offset; @@ -125,9 +127,9 @@ static void zonefs_readahead(struct readahead_control *rac) static int zonefs_write_map_blocks(struct iomap_writepage_ctx *wpc, struct inode *inode, loff_t offset) { - struct zonefs_inode_info *zi = ZONEFS_I(inode); + struct zonefs_zone *z = zonefs_inode_zone(inode); - if (WARN_ON_ONCE(zonefs_zone_is_seq(zi))) + if (WARN_ON_ONCE(zonefs_zone_is_seq(z))) return -EIO; if (WARN_ON_ONCE(offset >= i_size_read(inode))) return -EIO; @@ -137,7 +139,8 @@ static int zonefs_write_map_blocks(struct iomap_writepage_ctx *wpc, offset < wpc->iomap.offset + wpc->iomap.length) return 0; - return zonefs_write_iomap_begin(inode, offset, zi->i_max_size - offset, + return zonefs_write_iomap_begin(inode, offset, + z->z_capacity - offset, IOMAP_WRITE, &wpc->iomap, NULL); } @@ -185,6 +188,7 @@ const struct address_space_operations zonefs_file_aops = { int zonefs_file_truncate(struct inode *inode, loff_t isize) { struct zonefs_inode_info *zi = ZONEFS_I(inode); + struct zonefs_zone *z = zonefs_inode_zone(inode); loff_t old_isize; enum req_op op; int ret = 0; @@ -194,12 +198,12 @@ int zonefs_file_truncate(struct inode *inode, loff_t isize) * only down to a 0 size, which is equivalent to a zone reset, and to * the maximum file size, which is equivalent to a zone finish. */ - if (!zonefs_zone_is_seq(zi)) + if (!zonefs_zone_is_seq(z)) return -EPERM; if (!isize) op = REQ_OP_ZONE_RESET; - else if (isize == zi->i_max_size) + else if (isize == z->z_capacity) op = REQ_OP_ZONE_FINISH; else return -EPERM; @@ -216,7 +220,7 @@ int zonefs_file_truncate(struct inode *inode, loff_t isize) if (isize == old_isize) goto unlock; - ret = zonefs_zone_mgmt(inode, op); + ret = zonefs_inode_zone_mgmt(inode, op); if (ret) goto unlock; @@ -224,7 +228,7 @@ int zonefs_file_truncate(struct inode *inode, loff_t isize) * If the mount option ZONEFS_MNTOPT_EXPLICIT_OPEN is set, * take care of open zones. */ - if (zi->i_flags & ZONEFS_ZONE_OPEN) { + if (z->z_flags & ZONEFS_ZONE_OPEN) { /* * Truncating a zone to EMPTY or FULL is the equivalent of * closing the zone. For a truncation to 0, we need to @@ -234,15 +238,15 @@ int zonefs_file_truncate(struct inode *inode, loff_t isize) * the open flag. */ if (!isize) - ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN); + ret = zonefs_inode_zone_mgmt(inode, REQ_OP_ZONE_OPEN); else - zi->i_flags &= ~ZONEFS_ZONE_OPEN; + z->z_flags &= ~ZONEFS_ZONE_OPEN; } zonefs_update_stats(inode, isize); truncate_setsize(inode, isize); - zi->i_wpoffset = isize; - zonefs_account_active(inode); + z->z_wpoffset = isize; + zonefs_inode_account_active(inode); unlock: mutex_unlock(&zi->i_truncate_mutex); @@ -349,7 +353,7 @@ static int zonefs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size, return error; } - if (size && zonefs_zone_is_seq(zi)) { + if (size && zonefs_inode_is_seq(inode)) { /* * Note that we may be seeing completions out of order, * but that is not a problem since a write completed @@ -375,7 +379,7 @@ static const struct iomap_dio_ops zonefs_write_dio_ops = { static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); - struct zonefs_inode_info *zi = ZONEFS_I(inode); + struct zonefs_zone *z = zonefs_inode_zone(inode); struct block_device *bdev = inode->i_sb->s_bdev; unsigned int max = bdev_max_zone_append_sectors(bdev); struct bio *bio; @@ -392,7 +396,7 @@ static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from) bio = bio_alloc(bdev, nr_pages, REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE, GFP_NOFS); - bio->bi_iter.bi_sector = zi->i_zsector; + bio->bi_iter.bi_sector = z->z_sector; bio->bi_ioprio = iocb->ki_ioprio; if (iocb_is_dsync(iocb)) bio->bi_opf |= REQ_FUA; @@ -417,12 +421,12 @@ static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from) */ if (!ret) { sector_t wpsector = - zi->i_zsector + (zi->i_wpoffset >> SECTOR_SHIFT); + z->z_sector + (z->z_wpoffset >> SECTOR_SHIFT); if (bio->bi_iter.bi_sector != wpsector) { zonefs_warn(inode->i_sb, "Corrupted write pointer %llu for zone at %llu\n", - wpsector, zi->i_zsector); + wpsector, z->z_sector); ret = -EIO; } } @@ -450,9 +454,9 @@ static loff_t zonefs_write_check_limits(struct file *file, loff_t pos, loff_t count) { struct inode *inode = file_inode(file); - struct zonefs_inode_info *zi = ZONEFS_I(inode); + struct zonefs_zone *z = zonefs_inode_zone(inode); loff_t limit = rlimit(RLIMIT_FSIZE); - loff_t max_size = zi->i_max_size; + loff_t max_size = z->z_capacity; if (limit != RLIM_INFINITY) { if (pos >= limit) { @@ -476,6 +480,7 @@ static ssize_t zonefs_write_checks(struct kiocb *iocb, struct iov_iter *from) struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct zonefs_inode_info *zi = ZONEFS_I(inode); + struct zonefs_zone *z = zonefs_inode_zone(inode); loff_t count; if (IS_SWAPFILE(inode)) @@ -488,10 +493,10 @@ static ssize_t zonefs_write_checks(struct kiocb *iocb, struct iov_iter *from) return -EINVAL; if (iocb->ki_flags & IOCB_APPEND) { - if (zonefs_zone_is_cnv(zi)) + if (zonefs_zone_is_cnv(z)) return -EINVAL; mutex_lock(&zi->i_truncate_mutex); - iocb->ki_pos = zi->i_wpoffset; + iocb->ki_pos = z->z_wpoffset; mutex_unlock(&zi->i_truncate_mutex); } @@ -518,6 +523,7 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); struct zonefs_inode_info *zi = ZONEFS_I(inode); + struct zonefs_zone *z = zonefs_inode_zone(inode); struct super_block *sb = inode->i_sb; bool sync = is_sync_kiocb(iocb); bool append = false; @@ -528,7 +534,7 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from) * as this can cause write reordering (e.g. the first aio gets EAGAIN * on the inode lock but the second goes through but is now unaligned). */ - if (zonefs_zone_is_seq(zi) && !sync && (iocb->ki_flags & IOCB_NOWAIT)) + if (zonefs_zone_is_seq(z) && !sync && (iocb->ki_flags & IOCB_NOWAIT)) return -EOPNOTSUPP; if (iocb->ki_flags & IOCB_NOWAIT) { @@ -550,9 +556,9 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from) } /* Enforce sequential writes (append only) in sequential zones */ - if (zonefs_zone_is_seq(zi)) { + if (zonefs_zone_is_seq(z)) { mutex_lock(&zi->i_truncate_mutex); - if (iocb->ki_pos != zi->i_wpoffset) { + if (iocb->ki_pos != z->z_wpoffset) { mutex_unlock(&zi->i_truncate_mutex); ret = -EINVAL; goto inode_unlock; @@ -566,7 +572,7 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from) else ret = iomap_dio_rw(iocb, from, &zonefs_write_iomap_ops, &zonefs_write_dio_ops, 0, NULL, 0); - if (zonefs_zone_is_seq(zi) && + if (zonefs_zone_is_seq(z) && (ret > 0 || ret == -EIOCBQUEUED)) { if (ret > 0) count = ret; @@ -577,8 +583,8 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from) * will correct it. Also do active seq file accounting. */ mutex_lock(&zi->i_truncate_mutex); - zi->i_wpoffset += count; - zonefs_account_active(inode); + z->z_wpoffset += count; + zonefs_inode_account_active(inode); mutex_unlock(&zi->i_truncate_mutex); } @@ -629,6 +635,7 @@ inode_unlock: static ssize_t zonefs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); + struct zonefs_zone *z = zonefs_inode_zone(inode); if (unlikely(IS_IMMUTABLE(inode))) return -EPERM; @@ -636,8 +643,8 @@ static ssize_t zonefs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (sb_rdonly(inode->i_sb)) return -EROFS; - /* Write operations beyond the zone size are not allowed */ - if (iocb->ki_pos >= ZONEFS_I(inode)->i_max_size) + /* Write operations beyond the zone capacity are not allowed */ + if (iocb->ki_pos >= z->z_capacity) return -EFBIG; if (iocb->ki_flags & IOCB_DIRECT) { @@ -669,6 +676,7 @@ static ssize_t zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct inode *inode = file_inode(iocb->ki_filp); struct zonefs_inode_info *zi = ZONEFS_I(inode); + struct zonefs_zone *z = zonefs_inode_zone(inode); struct super_block *sb = inode->i_sb; loff_t isize; ssize_t ret; @@ -677,7 +685,7 @@ static ssize_t zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) if (unlikely(IS_IMMUTABLE(inode) && !(inode->i_mode & 0777))) return -EPERM; - if (iocb->ki_pos >= zi->i_max_size) + if (iocb->ki_pos >= z->z_capacity) return 0; if (iocb->ki_flags & IOCB_NOWAIT) { @@ -738,6 +746,7 @@ static inline bool zonefs_seq_file_need_wro(struct inode *inode, static int zonefs_seq_file_write_open(struct inode *inode) { struct zonefs_inode_info *zi = ZONEFS_I(inode); + struct zonefs_zone *z = zonefs_inode_zone(inode); int ret = 0; mutex_lock(&zi->i_truncate_mutex); @@ -755,14 +764,15 @@ static int zonefs_seq_file_write_open(struct inode *inode) goto unlock; } - if (i_size_read(inode) < zi->i_max_size) { - ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN); + if (i_size_read(inode) < z->z_capacity) { + ret = zonefs_inode_zone_mgmt(inode, + REQ_OP_ZONE_OPEN); if (ret) { atomic_dec(&sbi->s_wro_seq_files); goto unlock; } - zi->i_flags |= ZONEFS_ZONE_OPEN; - zonefs_account_active(inode); + z->z_flags |= ZONEFS_ZONE_OPEN; + zonefs_inode_account_active(inode); } } } @@ -792,6 +802,7 @@ static int zonefs_file_open(struct inode *inode, struct file *file) static void zonefs_seq_file_write_close(struct inode *inode) { struct zonefs_inode_info *zi = ZONEFS_I(inode); + struct zonefs_zone *z = zonefs_inode_zone(inode); struct super_block *sb = inode->i_sb; struct zonefs_sb_info *sbi = ZONEFS_SB(sb); int ret = 0; @@ -807,8 +818,8 @@ static void zonefs_seq_file_write_close(struct inode *inode) * its maximum size or it was fully written). For this case, we only * need to decrement the write open count. */ - if (zi->i_flags & ZONEFS_ZONE_OPEN) { - ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_CLOSE); + if (z->z_flags & ZONEFS_ZONE_OPEN) { + ret = zonefs_inode_zone_mgmt(inode, REQ_OP_ZONE_CLOSE); if (ret) { __zonefs_io_error(inode, false); /* @@ -817,11 +828,11 @@ static void zonefs_seq_file_write_close(struct inode *inode) * exhausted). So take preventive action by remounting * read-only. */ - if (zi->i_flags & ZONEFS_ZONE_OPEN && + if (z->z_flags & ZONEFS_ZONE_OPEN && !(sb->s_flags & SB_RDONLY)) { zonefs_warn(sb, "closing zone at %llu failed %d\n", - zi->i_zsector, ret); + z->z_sector, ret); zonefs_warn(sb, "remounting filesystem read-only\n"); sb->s_flags |= SB_RDONLY; @@ -829,8 +840,8 @@ static void zonefs_seq_file_write_close(struct inode *inode) goto unlock; } - zi->i_flags &= ~ZONEFS_ZONE_OPEN; - zonefs_account_active(inode); + z->z_flags &= ~ZONEFS_ZONE_OPEN; + zonefs_inode_account_active(inode); } atomic_dec(&sbi->s_wro_seq_files); diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index a4af29dc32e7..270ded209dde 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -28,33 +28,47 @@ #include "trace.h" /* - * Manage the active zone count. Called with zi->i_truncate_mutex held. + * Get the name of a zone group directory. */ -void zonefs_account_active(struct inode *inode) +static const char *zonefs_zgroup_name(enum zonefs_ztype ztype) { - struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb); - struct zonefs_inode_info *zi = ZONEFS_I(inode); + switch (ztype) { + case ZONEFS_ZTYPE_CNV: + return "cnv"; + case ZONEFS_ZTYPE_SEQ: + return "seq"; + default: + WARN_ON_ONCE(1); + return "???"; + } +} - lockdep_assert_held(&zi->i_truncate_mutex); +/* + * Manage the active zone count. + */ +static void zonefs_account_active(struct super_block *sb, + struct zonefs_zone *z) +{ + struct zonefs_sb_info *sbi = ZONEFS_SB(sb); - if (zonefs_zone_is_cnv(zi)) + if (zonefs_zone_is_cnv(z)) return; /* * For zones that transitioned to the offline or readonly condition, * we only need to clear the active state. */ - if (zi->i_flags & (ZONEFS_ZONE_OFFLINE | ZONEFS_ZONE_READONLY)) + if (z->z_flags & (ZONEFS_ZONE_OFFLINE | ZONEFS_ZONE_READONLY)) goto out; /* * If the zone is active, that is, if it is explicitly open or * partially written, check if it was already accounted as active. */ - if ((zi->i_flags & ZONEFS_ZONE_OPEN) || - (zi->i_wpoffset > 0 && zi->i_wpoffset < zi->i_max_size)) { - if (!(zi->i_flags & ZONEFS_ZONE_ACTIVE)) { - zi->i_flags |= ZONEFS_ZONE_ACTIVE; + if ((z->z_flags & ZONEFS_ZONE_OPEN) || + (z->z_wpoffset > 0 && z->z_wpoffset < z->z_capacity)) { + if (!(z->z_flags & ZONEFS_ZONE_ACTIVE)) { + z->z_flags |= ZONEFS_ZONE_ACTIVE; atomic_inc(&sbi->s_active_seq_files); } return; @@ -62,18 +76,29 @@ void zonefs_account_active(struct inode *inode) out: /* The zone is not active. If it was, update the active count */ - if (zi->i_flags & ZONEFS_ZONE_ACTIVE) { - zi->i_flags &= ~ZONEFS_ZONE_ACTIVE; + if (z->z_flags & ZONEFS_ZONE_ACTIVE) { + z->z_flags &= ~ZONEFS_ZONE_ACTIVE; atomic_dec(&sbi->s_active_seq_files); } } -int zonefs_zone_mgmt(struct inode *inode, enum req_op op) +/* + * Manage the active zone count. Called with zi->i_truncate_mutex held. + */ +void zonefs_inode_account_active(struct inode *inode) { - struct zonefs_inode_info *zi = ZONEFS_I(inode); - int ret; + lockdep_assert_held(&ZONEFS_I(inode)->i_truncate_mutex); - lockdep_assert_held(&zi->i_truncate_mutex); + return zonefs_account_active(inode->i_sb, zonefs_inode_zone(inode)); +} + +/* + * Execute a zone management operation. + */ +static int zonefs_zone_mgmt(struct super_block *sb, + struct zonefs_zone *z, enum req_op op) +{ + int ret; /* * With ZNS drives, closing an explicitly open zone that has not been @@ -83,37 +108,45 @@ int zonefs_zone_mgmt(struct inode *inode, enum req_op op) * are exceeded, make sure that the zone does not remain active by * resetting it. */ - if (op == REQ_OP_ZONE_CLOSE && !zi->i_wpoffset) + if (op == REQ_OP_ZONE_CLOSE && !z->z_wpoffset) op = REQ_OP_ZONE_RESET; - trace_zonefs_zone_mgmt(inode, op); - ret = blkdev_zone_mgmt(inode->i_sb->s_bdev, op, zi->i_zsector, - zi->i_zone_size >> SECTOR_SHIFT, GFP_NOFS); + trace_zonefs_zone_mgmt(sb, z, op); + ret = blkdev_zone_mgmt(sb->s_bdev, op, z->z_sector, + z->z_size >> SECTOR_SHIFT, GFP_NOFS); if (ret) { - zonefs_err(inode->i_sb, + zonefs_err(sb, "Zone management operation %s at %llu failed %d\n", - blk_op_str(op), zi->i_zsector, ret); + blk_op_str(op), z->z_sector, ret); return ret; } return 0; } +int zonefs_inode_zone_mgmt(struct inode *inode, enum req_op op) +{ + lockdep_assert_held(&ZONEFS_I(inode)->i_truncate_mutex); + + return zonefs_zone_mgmt(inode->i_sb, zonefs_inode_zone(inode), op); +} + void zonefs_i_size_write(struct inode *inode, loff_t isize) { - struct zonefs_inode_info *zi = ZONEFS_I(inode); + struct zonefs_zone *z = zonefs_inode_zone(inode); i_size_write(inode, isize); + /* * A full zone is no longer open/active and does not need * explicit closing. */ - if (isize >= zi->i_max_size) { + if (isize >= z->z_capacity) { struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb); - if (zi->i_flags & ZONEFS_ZONE_ACTIVE) + if (z->z_flags & ZONEFS_ZONE_ACTIVE) atomic_dec(&sbi->s_active_seq_files); - zi->i_flags &= ~(ZONEFS_ZONE_OPEN | ZONEFS_ZONE_ACTIVE); + z->z_flags &= ~(ZONEFS_ZONE_OPEN | ZONEFS_ZONE_ACTIVE); } } @@ -150,20 +183,18 @@ void zonefs_update_stats(struct inode *inode, loff_t new_isize) } /* - * Check a zone condition and adjust its file inode access permissions for - * offline and readonly zones. Return the inode size corresponding to the - * amount of readable data in the zone. + * Check a zone condition. Return the amount of written (and still readable) + * data in the zone. */ -static loff_t zonefs_check_zone_condition(struct inode *inode, +static loff_t zonefs_check_zone_condition(struct super_block *sb, + struct zonefs_zone *z, struct blk_zone *zone) { - struct zonefs_inode_info *zi = ZONEFS_I(inode); - switch (zone->cond) { case BLK_ZONE_COND_OFFLINE: - zonefs_warn(inode->i_sb, "inode %lu: offline zone\n", - inode->i_ino); - zi->i_flags |= ZONEFS_ZONE_OFFLINE; + zonefs_warn(sb, "Zone %llu: offline zone\n", + z->z_sector); + z->z_flags |= ZONEFS_ZONE_OFFLINE; return 0; case BLK_ZONE_COND_READONLY: /* @@ -174,18 +205,18 @@ static loff_t zonefs_check_zone_condition(struct inode *inode, * the inode size as it was when last updated so that the user * can recover data. */ - zonefs_warn(inode->i_sb, "inode %lu: read-only zone\n", - inode->i_ino); - zi->i_flags |= ZONEFS_ZONE_READONLY; - if (zonefs_zone_is_cnv(zi)) - return zi->i_max_size; - return zi->i_wpoffset; + zonefs_warn(sb, "Zone %llu: read-only zone\n", + z->z_sector); + z->z_flags |= ZONEFS_ZONE_READONLY; + if (zonefs_zone_is_cnv(z)) + return z->z_capacity; + return z->z_wpoffset; case BLK_ZONE_COND_FULL: /* The write pointer of full zones is invalid. */ - return zi->i_max_size; + return z->z_capacity; default: - if (zonefs_zone_is_cnv(zi)) - return zi->i_max_size; + if (zonefs_zone_is_cnv(z)) + return z->z_capacity; return (zone->wp - zone->start) << SECTOR_SHIFT; } } @@ -196,22 +227,22 @@ static loff_t zonefs_check_zone_condition(struct inode *inode, */ static void zonefs_inode_update_mode(struct inode *inode) { - struct zonefs_inode_info *zi = ZONEFS_I(inode); + struct zonefs_zone *z = zonefs_inode_zone(inode); - if (zi->i_flags & ZONEFS_ZONE_OFFLINE) { + if (z->z_flags & ZONEFS_ZONE_OFFLINE) { /* Offline zones cannot be read nor written */ inode->i_flags |= S_IMMUTABLE; inode->i_mode &= ~0777; - } else if (zi->i_flags & ZONEFS_ZONE_READONLY) { + } else if (z->z_flags & ZONEFS_ZONE_READONLY) { /* Readonly zones cannot be written */ inode->i_flags |= S_IMMUTABLE; - if (zi->i_flags & ZONEFS_ZONE_INIT_MODE) + if (z->z_flags & ZONEFS_ZONE_INIT_MODE) inode->i_mode &= ~0777; else inode->i_mode &= ~0222; } - zi->i_flags &= ~ZONEFS_ZONE_INIT_MODE; + z->z_flags &= ~ZONEFS_ZONE_INIT_MODE; } struct zonefs_ioerr_data { @@ -224,7 +255,7 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, { struct zonefs_ioerr_data *err = data; struct inode *inode = err->inode; - struct zonefs_inode_info *zi = ZONEFS_I(inode); + struct zonefs_zone *z = zonefs_inode_zone(inode); struct super_block *sb = inode->i_sb; struct zonefs_sb_info *sbi = ZONEFS_SB(sb); loff_t isize, data_size; @@ -235,9 +266,9 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, * as there is no inconsistency between the inode size and the amount of * data writen in the zone (data_size). */ - data_size = zonefs_check_zone_condition(inode, zone); + data_size = zonefs_check_zone_condition(sb, z, zone); isize = i_size_read(inode); - if (!(zi->i_flags & (ZONEFS_ZONE_READONLY | ZONEFS_ZONE_OFFLINE)) && + if (!(z->z_flags & (ZONEFS_ZONE_READONLY | ZONEFS_ZONE_OFFLINE)) && !err->write && isize == data_size) return 0; @@ -260,8 +291,9 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, * In all cases, warn about inode size inconsistency and handle the * IO error according to the zone condition and to the mount options. */ - if (zonefs_zone_is_seq(zi) && isize != data_size) - zonefs_warn(sb, "inode %lu: invalid size %lld (should be %lld)\n", + if (zonefs_zone_is_seq(z) && isize != data_size) + zonefs_warn(sb, + "inode %lu: invalid size %lld (should be %lld)\n", inode->i_ino, isize, data_size); /* @@ -270,20 +302,20 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, * zone condition to read-only and offline respectively, as if the * condition was signaled by the hardware. */ - if ((zi->i_flags & ZONEFS_ZONE_OFFLINE) || + if ((z->z_flags & ZONEFS_ZONE_OFFLINE) || (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZOL)) { zonefs_warn(sb, "inode %lu: read/write access disabled\n", inode->i_ino); - if (!(zi->i_flags & ZONEFS_ZONE_OFFLINE)) - zi->i_flags |= ZONEFS_ZONE_OFFLINE; + if (!(z->z_flags & ZONEFS_ZONE_OFFLINE)) + z->z_flags |= ZONEFS_ZONE_OFFLINE; zonefs_inode_update_mode(inode); data_size = 0; - } else if ((zi->i_flags & ZONEFS_ZONE_READONLY) || + } else if ((z->z_flags & ZONEFS_ZONE_READONLY) || (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZRO)) { zonefs_warn(sb, "inode %lu: write access disabled\n", inode->i_ino); - if (!(zi->i_flags & ZONEFS_ZONE_READONLY)) - zi->i_flags |= ZONEFS_ZONE_READONLY; + if (!(z->z_flags & ZONEFS_ZONE_READONLY)) + z->z_flags |= ZONEFS_ZONE_READONLY; zonefs_inode_update_mode(inode); data_size = isize; } else if (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_RO && @@ -299,8 +331,8 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, * close of the zone when the inode file is closed. */ if ((sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) && - (zi->i_flags & (ZONEFS_ZONE_READONLY | ZONEFS_ZONE_OFFLINE))) - zi->i_flags &= ~ZONEFS_ZONE_OPEN; + (z->z_flags & (ZONEFS_ZONE_READONLY | ZONEFS_ZONE_OFFLINE))) + z->z_flags &= ~ZONEFS_ZONE_OPEN; /* * If error=remount-ro was specified, any error result in remounting @@ -317,8 +349,8 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, */ zonefs_update_stats(inode, data_size); zonefs_i_size_write(inode, data_size); - zi->i_wpoffset = data_size; - zonefs_account_active(inode); + z->z_wpoffset = data_size; + zonefs_inode_account_active(inode); return 0; } @@ -332,7 +364,7 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, */ void __zonefs_io_error(struct inode *inode, bool write) { - struct zonefs_inode_info *zi = ZONEFS_I(inode); + struct zonefs_zone *z = zonefs_inode_zone(inode); struct super_block *sb = inode->i_sb; struct zonefs_sb_info *sbi = ZONEFS_SB(sb); unsigned int noio_flag; @@ -348,8 +380,8 @@ void __zonefs_io_error(struct inode *inode, bool write) * files with aggregated conventional zones, for which the inode zone * size is always larger than the device zone size. */ - if (zi->i_zone_size > bdev_zone_sectors(sb->s_bdev)) - nr_zones = zi->i_zone_size >> + if (z->z_size > bdev_zone_sectors(sb->s_bdev)) + nr_zones = z->z_size >> (sbi->s_zone_sectors_shift + SECTOR_SHIFT); /* @@ -361,7 +393,7 @@ void __zonefs_io_error(struct inode *inode, bool write) * the GFP_NOIO context avoids both problems. */ noio_flag = memalloc_noio_save(); - ret = blkdev_report_zones(sb->s_bdev, zi->i_zsector, nr_zones, + ret = blkdev_report_zones(sb->s_bdev, z->z_sector, nr_zones, zonefs_io_error_cb, &err); if (ret != nr_zones) zonefs_err(sb, "Get inode %lu zone information failed %d\n", @@ -381,9 +413,7 @@ static struct inode *zonefs_alloc_inode(struct super_block *sb) inode_init_once(&zi->i_vnode); mutex_init(&zi->i_truncate_mutex); - zi->i_wpoffset = 0; zi->i_wr_refcnt = 0; - zi->i_flags = 0; return &zi->i_vnode; } @@ -416,8 +446,8 @@ static int zonefs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bavail = buf->f_bfree; for (t = 0; t < ZONEFS_ZTYPE_MAX; t++) { - if (sbi->s_nr_files[t]) - buf->f_files += sbi->s_nr_files[t] + 1; + if (sbi->s_zgroup[t].g_nr_zones) + buf->f_files += sbi->s_zgroup[t].g_nr_zones + 1; } buf->f_ffree = 0; @@ -557,11 +587,11 @@ static const struct inode_operations zonefs_dir_inode_operations = { }; static void zonefs_init_dir_inode(struct inode *parent, struct inode *inode, - enum zonefs_ztype type) + enum zonefs_ztype ztype) { struct super_block *sb = parent->i_sb; - inode->i_ino = bdev_nr_zones(sb->s_bdev) + type + 1; + inode->i_ino = bdev_nr_zones(sb->s_bdev) + ztype + 1; inode_init_owner(&init_user_ns, inode, parent, S_IFDIR | 0555); inode->i_op = &zonefs_dir_inode_operations; inode->i_fop = &simple_dir_operations; @@ -573,79 +603,34 @@ static const struct inode_operations zonefs_file_inode_operations = { .setattr = zonefs_inode_setattr, }; -static int zonefs_init_file_inode(struct inode *inode, struct blk_zone *zone, - enum zonefs_ztype type) +static void zonefs_init_file_inode(struct inode *inode, + struct zonefs_zone *z) { struct super_block *sb = inode->i_sb; struct zonefs_sb_info *sbi = ZONEFS_SB(sb); - struct zonefs_inode_info *zi = ZONEFS_I(inode); - int ret = 0; - inode->i_ino = zone->start >> sbi->s_zone_sectors_shift; + inode->i_private = z; + + inode->i_ino = z->z_sector >> sbi->s_zone_sectors_shift; inode->i_mode = S_IFREG | sbi->s_perm; - - if (type == ZONEFS_ZTYPE_CNV) - zi->i_flags |= ZONEFS_ZONE_CNV; - - zi->i_zsector = zone->start; - zi->i_zone_size = zone->len << SECTOR_SHIFT; - if (zi->i_zone_size > bdev_zone_sectors(sb->s_bdev) << SECTOR_SHIFT && - !(sbi->s_features & ZONEFS_F_AGGRCNV)) { - zonefs_err(sb, - "zone size %llu doesn't match device's zone sectors %llu\n", - zi->i_zone_size, - bdev_zone_sectors(sb->s_bdev) << SECTOR_SHIFT); - return -EINVAL; - } - - zi->i_max_size = min_t(loff_t, MAX_LFS_FILESIZE, - zone->capacity << SECTOR_SHIFT); - zi->i_wpoffset = zonefs_check_zone_condition(inode, zone); - inode->i_uid = sbi->s_uid; inode->i_gid = sbi->s_gid; - inode->i_size = zi->i_wpoffset; - inode->i_blocks = zi->i_max_size >> SECTOR_SHIFT; + inode->i_size = z->z_wpoffset; + inode->i_blocks = z->z_capacity >> SECTOR_SHIFT; inode->i_op = &zonefs_file_inode_operations; inode->i_fop = &zonefs_file_operations; inode->i_mapping->a_ops = &zonefs_file_aops; /* Update the inode access rights depending on the zone condition */ - zi->i_flags |= ZONEFS_ZONE_INIT_MODE; + z->z_flags |= ZONEFS_ZONE_INIT_MODE; zonefs_inode_update_mode(inode); - - sb->s_maxbytes = max(zi->i_max_size, sb->s_maxbytes); - sbi->s_blocks += zi->i_max_size >> sb->s_blocksize_bits; - sbi->s_used_blocks += zi->i_wpoffset >> sb->s_blocksize_bits; - - mutex_lock(&zi->i_truncate_mutex); - - /* - * For sequential zones, make sure that any open zone is closed first - * to ensure that the initial number of open zones is 0, in sync with - * the open zone accounting done when the mount option - * ZONEFS_MNTOPT_EXPLICIT_OPEN is used. - */ - if (type == ZONEFS_ZTYPE_SEQ && - (zone->cond == BLK_ZONE_COND_IMP_OPEN || - zone->cond == BLK_ZONE_COND_EXP_OPEN)) { - ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_CLOSE); - if (ret) - goto unlock; - } - - zonefs_account_active(inode); - -unlock: - mutex_unlock(&zi->i_truncate_mutex); - - return ret; } static struct dentry *zonefs_create_inode(struct dentry *parent, - const char *name, struct blk_zone *zone, - enum zonefs_ztype type) + const char *name, + struct zonefs_zone *z, + enum zonefs_ztype ztype) { struct inode *dir = d_inode(parent); struct dentry *dentry; @@ -661,15 +646,10 @@ static struct dentry *zonefs_create_inode(struct dentry *parent, goto dput; inode->i_ctime = inode->i_mtime = inode->i_atime = dir->i_ctime; - if (zone) { - ret = zonefs_init_file_inode(inode, zone, type); - if (ret) { - iput(inode); - goto dput; - } - } else { - zonefs_init_dir_inode(dir, inode, type); - } + if (z) + zonefs_init_file_inode(inode, z); + else + zonefs_init_dir_inode(dir, inode, ztype); d_add(dentry, inode); dir->i_size++; @@ -685,100 +665,51 @@ dput: struct zonefs_zone_data { struct super_block *sb; unsigned int nr_zones[ZONEFS_ZTYPE_MAX]; + sector_t cnv_zone_start; struct blk_zone *zones; }; /* - * Create a zone group and populate it with zone files. + * Create the inodes for a zone group. */ -static int zonefs_create_zgroup(struct zonefs_zone_data *zd, - enum zonefs_ztype type) +static int zonefs_create_zgroup_inodes(struct super_block *sb, + enum zonefs_ztype ztype) { - struct super_block *sb = zd->sb; struct zonefs_sb_info *sbi = ZONEFS_SB(sb); - struct blk_zone *zone, *next, *end; - const char *zgroup_name; - char *file_name; + struct zonefs_zone_group *zgroup = &sbi->s_zgroup[ztype]; struct dentry *dir, *dent; - unsigned int n = 0; - int ret; + char *file_name; + int i, ret = 0; + + if (!zgroup) + return -ENOMEM; /* If the group is empty, there is nothing to do */ - if (!zd->nr_zones[type]) + if (!zgroup->g_nr_zones) return 0; file_name = kmalloc(ZONEFS_NAME_MAX, GFP_KERNEL); if (!file_name) return -ENOMEM; - if (type == ZONEFS_ZTYPE_CNV) - zgroup_name = "cnv"; - else - zgroup_name = "seq"; - - dir = zonefs_create_inode(sb->s_root, zgroup_name, NULL, type); + dir = zonefs_create_inode(sb->s_root, zonefs_zgroup_name(ztype), + NULL, ztype); if (IS_ERR(dir)) { ret = PTR_ERR(dir); goto free; } - /* - * The first zone contains the super block: skip it. - */ - end = zd->zones + bdev_nr_zones(sb->s_bdev); - for (zone = &zd->zones[1]; zone < end; zone = next) { - - next = zone + 1; - if (zonefs_zone_type(zone) != type) - continue; - - /* - * For conventional zones, contiguous zones can be aggregated - * together to form larger files. Note that this overwrites the - * length of the first zone of the set of contiguous zones - * aggregated together. If one offline or read-only zone is - * found, assume that all zones aggregated have the same - * condition. - */ - if (type == ZONEFS_ZTYPE_CNV && - (sbi->s_features & ZONEFS_F_AGGRCNV)) { - for (; next < end; next++) { - if (zonefs_zone_type(next) != type) - break; - zone->len += next->len; - zone->capacity += next->capacity; - if (next->cond == BLK_ZONE_COND_READONLY && - zone->cond != BLK_ZONE_COND_OFFLINE) - zone->cond = BLK_ZONE_COND_READONLY; - else if (next->cond == BLK_ZONE_COND_OFFLINE) - zone->cond = BLK_ZONE_COND_OFFLINE; - } - if (zone->capacity != zone->len) { - zonefs_err(sb, "Invalid conventional zone capacity\n"); - ret = -EINVAL; - goto free; - } - } - - /* - * Use the file number within its group as file name. - */ - snprintf(file_name, ZONEFS_NAME_MAX - 1, "%u", n); - dent = zonefs_create_inode(dir, file_name, zone, type); + for (i = 0; i < zgroup->g_nr_zones; i++) { + /* Use the zone number within its group as the file name */ + snprintf(file_name, ZONEFS_NAME_MAX - 1, "%u", i); + dent = zonefs_create_inode(dir, file_name, + &zgroup->g_zones[i], ztype); if (IS_ERR(dent)) { ret = PTR_ERR(dent); - goto free; + break; } - - n++; } - zonefs_info(sb, "Zone group \"%s\" has %u file%s\n", - zgroup_name, n, n > 1 ? "s" : ""); - - sbi->s_nr_files[type] = n; - ret = 0; - free: kfree(file_name); @@ -789,21 +720,38 @@ static int zonefs_get_zone_info_cb(struct blk_zone *zone, unsigned int idx, void *data) { struct zonefs_zone_data *zd = data; + struct super_block *sb = zd->sb; + struct zonefs_sb_info *sbi = ZONEFS_SB(sb); /* - * Count the number of usable zones: the first zone at index 0 contains - * the super block and is ignored. + * We do not care about the first zone: it contains the super block + * and not exposed as a file. + */ + if (!idx) + return 0; + + /* + * Count the number of zones that will be exposed as files. + * For sequential zones, we always have as many files as zones. + * FOr conventional zones, the number of files depends on if we have + * conventional zones aggregation enabled. */ switch (zone->type) { case BLK_ZONE_TYPE_CONVENTIONAL: - zone->wp = zone->start + zone->len; - if (idx) - zd->nr_zones[ZONEFS_ZTYPE_CNV]++; + if (sbi->s_features & ZONEFS_F_AGGRCNV) { + /* One file per set of contiguous conventional zones */ + if (!(sbi->s_zgroup[ZONEFS_ZTYPE_CNV].g_nr_zones) || + zone->start != zd->cnv_zone_start) + sbi->s_zgroup[ZONEFS_ZTYPE_CNV].g_nr_zones++; + zd->cnv_zone_start = zone->start + zone->len; + } else { + /* One file per zone */ + sbi->s_zgroup[ZONEFS_ZTYPE_CNV].g_nr_zones++; + } break; case BLK_ZONE_TYPE_SEQWRITE_REQ: case BLK_ZONE_TYPE_SEQWRITE_PREF: - if (idx) - zd->nr_zones[ZONEFS_ZTYPE_SEQ]++; + sbi->s_zgroup[ZONEFS_ZTYPE_SEQ].g_nr_zones++; break; default: zonefs_err(zd->sb, "Unsupported zone type 0x%x\n", @@ -843,11 +791,173 @@ static int zonefs_get_zone_info(struct zonefs_zone_data *zd) return 0; } -static inline void zonefs_cleanup_zone_info(struct zonefs_zone_data *zd) +static inline void zonefs_free_zone_info(struct zonefs_zone_data *zd) { kvfree(zd->zones); } +/* + * Create a zone group and populate it with zone files. + */ +static int zonefs_init_zgroup(struct super_block *sb, + struct zonefs_zone_data *zd, + enum zonefs_ztype ztype) +{ + struct zonefs_sb_info *sbi = ZONEFS_SB(sb); + struct zonefs_zone_group *zgroup = &sbi->s_zgroup[ztype]; + struct blk_zone *zone, *next, *end; + struct zonefs_zone *z; + unsigned int n = 0; + int ret; + + /* Allocate the zone group. If it is empty, we have nothing to do. */ + if (!zgroup->g_nr_zones) + return 0; + + zgroup->g_zones = kvcalloc(zgroup->g_nr_zones, + sizeof(struct zonefs_zone), GFP_KERNEL); + if (!zgroup->g_zones) + return -ENOMEM; + + /* + * Initialize the zone groups using the device zone information. + * We always skip the first zone as it contains the super block + * and is not use to back a file. + */ + end = zd->zones + bdev_nr_zones(sb->s_bdev); + for (zone = &zd->zones[1]; zone < end; zone = next) { + + next = zone + 1; + if (zonefs_zone_type(zone) != ztype) + continue; + + if (WARN_ON_ONCE(n >= zgroup->g_nr_zones)) + return -EINVAL; + + /* + * For conventional zones, contiguous zones can be aggregated + * together to form larger files. Note that this overwrites the + * length of the first zone of the set of contiguous zones + * aggregated together. If one offline or read-only zone is + * found, assume that all zones aggregated have the same + * condition. + */ + if (ztype == ZONEFS_ZTYPE_CNV && + (sbi->s_features & ZONEFS_F_AGGRCNV)) { + for (; next < end; next++) { + if (zonefs_zone_type(next) != ztype) + break; + zone->len += next->len; + zone->capacity += next->capacity; + if (next->cond == BLK_ZONE_COND_READONLY && + zone->cond != BLK_ZONE_COND_OFFLINE) + zone->cond = BLK_ZONE_COND_READONLY; + else if (next->cond == BLK_ZONE_COND_OFFLINE) + zone->cond = BLK_ZONE_COND_OFFLINE; + } + } + + z = &zgroup->g_zones[n]; + if (ztype == ZONEFS_ZTYPE_CNV) + z->z_flags |= ZONEFS_ZONE_CNV; + z->z_sector = zone->start; + z->z_size = zone->len << SECTOR_SHIFT; + if (z->z_size > bdev_zone_sectors(sb->s_bdev) << SECTOR_SHIFT && + !(sbi->s_features & ZONEFS_F_AGGRCNV)) { + zonefs_err(sb, + "Invalid zone size %llu (device zone sectors %llu)\n", + z->z_size, + bdev_zone_sectors(sb->s_bdev) << SECTOR_SHIFT); + return -EINVAL; + } + + z->z_capacity = min_t(loff_t, MAX_LFS_FILESIZE, + zone->capacity << SECTOR_SHIFT); + z->z_wpoffset = zonefs_check_zone_condition(sb, z, zone); + + sb->s_maxbytes = max(z->z_capacity, sb->s_maxbytes); + sbi->s_blocks += z->z_capacity >> sb->s_blocksize_bits; + sbi->s_used_blocks += z->z_wpoffset >> sb->s_blocksize_bits; + + /* + * For sequential zones, make sure that any open zone is closed + * first to ensure that the initial number of open zones is 0, + * in sync with the open zone accounting done when the mount + * option ZONEFS_MNTOPT_EXPLICIT_OPEN is used. + */ + if (ztype == ZONEFS_ZTYPE_SEQ && + (zone->cond == BLK_ZONE_COND_IMP_OPEN || + zone->cond == BLK_ZONE_COND_EXP_OPEN)) { + ret = zonefs_zone_mgmt(sb, z, REQ_OP_ZONE_CLOSE); + if (ret) + return ret; + } + + zonefs_account_active(sb, z); + + n++; + } + + if (WARN_ON_ONCE(n != zgroup->g_nr_zones)) + return -EINVAL; + + zonefs_info(sb, "Zone group \"%s\" has %u file%s\n", + zonefs_zgroup_name(ztype), + zgroup->g_nr_zones, + zgroup->g_nr_zones > 1 ? "s" : ""); + + return 0; +} + +static void zonefs_free_zgroups(struct super_block *sb) +{ + struct zonefs_sb_info *sbi = ZONEFS_SB(sb); + enum zonefs_ztype ztype; + + if (!sbi) + return; + + for (ztype = 0; ztype < ZONEFS_ZTYPE_MAX; ztype++) { + kvfree(sbi->s_zgroup[ztype].g_zones); + sbi->s_zgroup[ztype].g_zones = NULL; + } +} + +/* + * Create a zone group and populate it with zone files. + */ +static int zonefs_init_zgroups(struct super_block *sb) +{ + struct zonefs_zone_data zd; + enum zonefs_ztype ztype; + int ret; + + /* First get the device zone information */ + memset(&zd, 0, sizeof(struct zonefs_zone_data)); + zd.sb = sb; + ret = zonefs_get_zone_info(&zd); + if (ret) + goto cleanup; + + /* Allocate and initialize the zone groups */ + for (ztype = 0; ztype < ZONEFS_ZTYPE_MAX; ztype++) { + ret = zonefs_init_zgroup(sb, &zd, ztype); + if (ret) { + zonefs_info(sb, + "Zone group \"%s\" initialization failed\n", + zonefs_zgroup_name(ztype)); + break; + } + } + +cleanup: + zonefs_free_zone_info(&zd); + if (ret) + zonefs_free_zgroups(sb); + + return ret; +} + /* * Read super block information from the device. */ @@ -945,7 +1055,6 @@ static const struct super_operations zonefs_sops = { */ static int zonefs_fill_super(struct super_block *sb, void *data, int silent) { - struct zonefs_zone_data zd; struct zonefs_sb_info *sbi; struct inode *inode; enum zonefs_ztype t; @@ -998,16 +1107,6 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent) if (ret) return ret; - memset(&zd, 0, sizeof(struct zonefs_zone_data)); - zd.sb = sb; - ret = zonefs_get_zone_info(&zd); - if (ret) - goto cleanup; - - ret = zonefs_sysfs_register(sb); - if (ret) - goto cleanup; - zonefs_info(sb, "Mounting %u zones", bdev_nr_zones(sb->s_bdev)); if (!sbi->s_max_wro_seq_files && @@ -1018,6 +1117,11 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent) sbi->s_mount_opts &= ~ZONEFS_MNTOPT_EXPLICIT_OPEN; } + /* Initialize the zone groups */ + ret = zonefs_init_zgroups(sb); + if (ret) + goto cleanup; + /* Create root directory inode */ ret = -ENOMEM; inode = new_inode(sb); @@ -1037,13 +1141,19 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent) /* Create and populate files in zone groups directories */ for (t = 0; t < ZONEFS_ZTYPE_MAX; t++) { - ret = zonefs_create_zgroup(&zd, t); + ret = zonefs_create_zgroup_inodes(sb, t); if (ret) - break; + goto cleanup; } + ret = zonefs_sysfs_register(sb); + if (ret) + goto cleanup; + + return 0; + cleanup: - zonefs_cleanup_zone_info(&zd); + zonefs_free_zgroups(sb); return ret; } @@ -1062,6 +1172,7 @@ static void zonefs_kill_super(struct super_block *sb) d_genocide(sb->s_root); zonefs_sysfs_unregister(sb); + zonefs_free_zgroups(sb); kill_block_super(sb); kfree(sbi); } diff --git a/fs/zonefs/trace.h b/fs/zonefs/trace.h index 42edcfd393ed..9969db3a9c7d 100644 --- a/fs/zonefs/trace.h +++ b/fs/zonefs/trace.h @@ -20,8 +20,9 @@ #define show_dev(dev) MAJOR(dev), MINOR(dev) TRACE_EVENT(zonefs_zone_mgmt, - TP_PROTO(struct inode *inode, enum req_op op), - TP_ARGS(inode, op), + TP_PROTO(struct super_block *sb, struct zonefs_zone *z, + enum req_op op), + TP_ARGS(sb, z, op), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) @@ -30,12 +31,12 @@ TRACE_EVENT(zonefs_zone_mgmt, __field(sector_t, nr_sectors) ), TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; - __entry->ino = inode->i_ino; + __entry->dev = sb->s_dev; + __entry->ino = + z->z_sector >> ZONEFS_SB(sb)->s_zone_sectors_shift; __entry->op = op; - __entry->sector = ZONEFS_I(inode)->i_zsector; - __entry->nr_sectors = - ZONEFS_I(inode)->i_zone_size >> SECTOR_SHIFT; + __entry->sector = z->z_sector; + __entry->nr_sectors = z->z_size >> SECTOR_SHIFT; ), TP_printk("bdev=(%d,%d), ino=%lu op=%s, sector=%llu, nr_sectors=%llu", show_dev(__entry->dev), (unsigned long)__entry->ino, @@ -58,9 +59,10 @@ TRACE_EVENT(zonefs_file_dio_append, TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; - __entry->sector = ZONEFS_I(inode)->i_zsector; + __entry->sector = zonefs_inode_zone(inode)->z_sector; __entry->size = size; - __entry->wpoffset = ZONEFS_I(inode)->i_wpoffset; + __entry->wpoffset = + zonefs_inode_zone(inode)->z_wpoffset; __entry->ret = ret; ), TP_printk("bdev=(%d, %d), ino=%lu, sector=%llu, size=%zu, wpoffset=%llu, ret=%zu", diff --git a/fs/zonefs/zonefs.h b/fs/zonefs/zonefs.h index 1a225f74015a..2d626e18b141 100644 --- a/fs/zonefs/zonefs.h +++ b/fs/zonefs/zonefs.h @@ -46,24 +46,41 @@ static inline enum zonefs_ztype zonefs_zone_type(struct blk_zone *zone) #define ZONEFS_ZONE_READONLY (1U << 4) #define ZONEFS_ZONE_CNV (1U << 31) +/* + * In-memory per-file inode zone data. + */ +struct zonefs_zone { + /* Zone state flags */ + unsigned int z_flags; + + /* Zone start sector (512B unit) */ + sector_t z_sector; + + /* Zone size (bytes) */ + loff_t z_size; + + /* Zone capacity (file maximum size, bytes) */ + loff_t z_capacity; + + /* Write pointer offset in the zone (sequential zones only, bytes) */ + loff_t z_wpoffset; +}; + +/* + * In memory zone group information: all zones of a group are exposed + * as files, one file per zone. + */ +struct zonefs_zone_group { + unsigned int g_nr_zones; + struct zonefs_zone *g_zones; +}; + /* * In-memory inode data. */ struct zonefs_inode_info { struct inode i_vnode; - /* File zone start sector (512B unit) */ - sector_t i_zsector; - - /* File zone write pointer position (sequential zones only) */ - loff_t i_wpoffset; - - /* File maximum size */ - loff_t i_max_size; - - /* File zone size */ - loff_t i_zone_size; - /* * To serialise fully against both syscall and mmap based IO and * sequential file truncation, two locks are used. For serializing @@ -81,7 +98,6 @@ struct zonefs_inode_info { /* guarded by i_truncate_mutex */ unsigned int i_wr_refcnt; - unsigned int i_flags; }; static inline struct zonefs_inode_info *ZONEFS_I(struct inode *inode) @@ -89,24 +105,29 @@ static inline struct zonefs_inode_info *ZONEFS_I(struct inode *inode) return container_of(inode, struct zonefs_inode_info, i_vnode); } -static inline bool zonefs_zone_is_cnv(struct zonefs_inode_info *zi) +static inline bool zonefs_zone_is_cnv(struct zonefs_zone *z) { - return zi->i_flags & ZONEFS_ZONE_CNV; + return z->z_flags & ZONEFS_ZONE_CNV; } -static inline bool zonefs_zone_is_seq(struct zonefs_inode_info *zi) +static inline bool zonefs_zone_is_seq(struct zonefs_zone *z) { - return !zonefs_zone_is_cnv(zi); + return !zonefs_zone_is_cnv(z); +} + +static inline struct zonefs_zone *zonefs_inode_zone(struct inode *inode) +{ + return inode->i_private; } static inline bool zonefs_inode_is_cnv(struct inode *inode) { - return zonefs_zone_is_cnv(ZONEFS_I(inode)); + return zonefs_zone_is_cnv(zonefs_inode_zone(inode)); } static inline bool zonefs_inode_is_seq(struct inode *inode) { - return zonefs_zone_is_seq(ZONEFS_I(inode)); + return zonefs_zone_is_seq(zonefs_inode_zone(inode)); } /* @@ -200,7 +221,7 @@ struct zonefs_sb_info { uuid_t s_uuid; unsigned int s_zone_sectors_shift; - unsigned int s_nr_files[ZONEFS_ZTYPE_MAX]; + struct zonefs_zone_group s_zgroup[ZONEFS_ZTYPE_MAX]; loff_t s_blocks; loff_t s_used_blocks; @@ -229,8 +250,8 @@ static inline struct zonefs_sb_info *ZONEFS_SB(struct super_block *sb) pr_warn("zonefs (%s) WARNING: " format, sb->s_id, ## args) /* In super.c */ -void zonefs_account_active(struct inode *inode); -int zonefs_zone_mgmt(struct inode *inode, enum req_op op); +void zonefs_inode_account_active(struct inode *inode); +int zonefs_inode_zone_mgmt(struct inode *inode, enum req_op op); void zonefs_i_size_write(struct inode *inode, loff_t isize); void zonefs_update_stats(struct inode *inode, loff_t new_isize); void __zonefs_io_error(struct inode *inode, bool write); From fc426026c3a3bd939f1ebae31f880ecf2724cc74 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 20 Mar 2023 22:49:15 +0900 Subject: [PATCH 009/179] zonefs: Fix error message in zonefs_file_dio_append() [ Upstream commit 88b170088ad2c3e27086fe35769aa49f8a512564 ] Since the expected write location in a sequential file is always at the end of the file (append write), when an invalid write append location is detected in zonefs_file_dio_append(), print the invalid written location instead of the expected write location. Fixes: a608da3bd730 ("zonefs: Detect append writes at invalid locations") Cc: stable@vger.kernel.org Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Himanshu Madhani Signed-off-by: Sasha Levin --- fs/zonefs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c index 738b0e28d74b..c71cc0fcb3ec 100644 --- a/fs/zonefs/file.c +++ b/fs/zonefs/file.c @@ -426,7 +426,7 @@ static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from) if (bio->bi_iter.bi_sector != wpsector) { zonefs_warn(inode->i_sb, "Corrupted write pointer %llu for zone at %llu\n", - wpsector, z->z_sector); + bio->bi_iter.bi_sector, z->z_sector); ret = -EIO; } } From 0b0e1551ba9a8e523d9a5ef2f400cb621d997d79 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 14 Mar 2023 16:31:32 -0700 Subject: [PATCH 010/179] fsverity: don't drop pagecache at end of FS_IOC_ENABLE_VERITY [ Upstream commit a075bacde257f755bea0e53400c9f1cdd1b8e8e6 ] The full pagecache drop at the end of FS_IOC_ENABLE_VERITY is causing performance problems and is hindering adoption of fsverity. It was intended to solve a race condition where unverified pages might be left in the pagecache. But actually it doesn't solve it fully. Since the incomplete solution for this race condition has too much performance impact for it to be worth it, let's remove it for now. Fixes: 3fda4c617e84 ("fs-verity: implement FS_IOC_ENABLE_VERITY ioctl") Cc: stable@vger.kernel.org Reviewed-by: Victor Hsieh Link: https://lore.kernel.org/r/20230314235332.50270-1-ebiggers@kernel.org Signed-off-by: Eric Biggers Signed-off-by: Sasha Levin --- fs/verity/enable.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/fs/verity/enable.c b/fs/verity/enable.c index df6b499bf6a1..400c264bf893 100644 --- a/fs/verity/enable.c +++ b/fs/verity/enable.c @@ -390,25 +390,27 @@ int fsverity_ioctl_enable(struct file *filp, const void __user *uarg) goto out_drop_write; err = enable_verity(filp, &arg); - if (err) - goto out_allow_write_access; /* - * Some pages of the file may have been evicted from pagecache after - * being used in the Merkle tree construction, then read into pagecache - * again by another process reading from the file concurrently. Since - * these pages didn't undergo verification against the file digest which - * fs-verity now claims to be enforcing, we have to wipe the pagecache - * to ensure that all future reads are verified. + * We no longer drop the inode's pagecache after enabling verity. This + * used to be done to try to avoid a race condition where pages could be + * evicted after being used in the Merkle tree construction, then + * re-instantiated by a concurrent read. Such pages are unverified, and + * the backing storage could have filled them with different content, so + * they shouldn't be used to fulfill reads once verity is enabled. + * + * But, dropping the pagecache has a big performance impact, and it + * doesn't fully solve the race condition anyway. So for those reasons, + * and also because this race condition isn't very important relatively + * speaking (especially for small-ish files, where the chance of a page + * being used, evicted, *and* re-instantiated all while enabling verity + * is quite small), we no longer drop the inode's pagecache. */ - filemap_write_and_wait(inode->i_mapping); - invalidate_inode_pages2(inode->i_mapping); /* * allow_write_access() is needed to pair with deny_write_access(). * Regardless, the filesystem won't allow writing to verity files. */ -out_allow_write_access: allow_write_access(filp); out_drop_write: mnt_drop_write_file(filp); From 01f3150cc7a78cdab61796bfd76e32041ba0e629 Mon Sep 17 00:00:00 2001 From: Anders Roxell Date: Mon, 28 Nov 2022 11:43:58 +0100 Subject: [PATCH 011/179] kernel: kcsan: kcsan_test: build without structleak plugin [ Upstream commit 6fcd4267a840d0536b8e5334ad5f31e4105fce85 ] Building kcsan_test with structleak plugin enabled makes the stack frame size to grow. kernel/kcsan/kcsan_test.c:704:1: error: the frame size of 3296 bytes is larger than 2048 bytes [-Werror=frame-larger-than=] Turn off the structleak plugin checks for kcsan_test. Link: https://lkml.kernel.org/r/20221128104358.2660634-1-anders.roxell@linaro.org Signed-off-by: Anders Roxell Suggested-by: Arnd Bergmann Acked-by: Marco Elver Cc: Arnd Bergmann Cc: David Gow Cc: Jason A. Donenfeld Cc: Kees Cook Signed-off-by: Andrew Morton Stable-dep-of: 5eb39cde1e24 ("kcsan: avoid passing -g for test") Signed-off-by: Sasha Levin --- kernel/kcsan/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/kcsan/Makefile b/kernel/kcsan/Makefile index 4f35d1bced6a..8cf70f068d92 100644 --- a/kernel/kcsan/Makefile +++ b/kernel/kcsan/Makefile @@ -17,4 +17,5 @@ KCSAN_INSTRUMENT_BARRIERS_selftest.o := y obj-$(CONFIG_KCSAN_SELFTEST) += selftest.o CFLAGS_kcsan_test.o := $(CFLAGS_KCSAN) -g -fno-omit-frame-pointer +CFLAGS_kcsan_test.o += $(DISABLE_STRUCTLEAK_PLUGIN) obj-$(CONFIG_KCSAN_KUNIT_TEST) += kcsan_test.o From bae092f587592f3472f2d4ee66f1dc0ae3de3546 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 16 Mar 2023 23:47:05 +0100 Subject: [PATCH 012/179] kcsan: avoid passing -g for test [ Upstream commit 5eb39cde1e2487ba5ec1802dc5e58a77e700d99e ] Nathan reported that when building with GNU as and a version of clang that defaults to DWARF5, the assembler will complain with: Error: non-constant .uleb128 is not supported This is because `-g` defaults to the compiler debug info default. If the assembler does not support some of the directives used, the above errors occur. To fix, remove the explicit passing of `-g`. All the test wants is that stack traces print valid function names, and debug info is not required for that. (I currently cannot recall why I added the explicit `-g`.) Link: https://lkml.kernel.org/r/20230316224705.709984-2-elver@google.com Fixes: 1fe84fd4a402 ("kcsan: Add test suite") Signed-off-by: Marco Elver Reported-by: Nathan Chancellor Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Signed-off-by: Andrew Morton Signed-off-by: Sasha Levin --- kernel/kcsan/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/kcsan/Makefile b/kernel/kcsan/Makefile index 8cf70f068d92..a45f3dfc8d14 100644 --- a/kernel/kcsan/Makefile +++ b/kernel/kcsan/Makefile @@ -16,6 +16,6 @@ obj-y := core.o debugfs.o report.o KCSAN_INSTRUMENT_BARRIERS_selftest.o := y obj-$(CONFIG_KCSAN_SELFTEST) += selftest.o -CFLAGS_kcsan_test.o := $(CFLAGS_KCSAN) -g -fno-omit-frame-pointer +CFLAGS_kcsan_test.o := $(CFLAGS_KCSAN) -fno-omit-frame-pointer CFLAGS_kcsan_test.o += $(DISABLE_STRUCTLEAK_PLUGIN) obj-$(CONFIG_KCSAN_KUNIT_TEST) += kcsan_test.o From ea34b8bcc7aec261d2bb163a80c3ff9d368fa7eb Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 1 Mar 2023 16:14:42 -0500 Subject: [PATCH 013/179] btrfs: rename BTRFS_FS_NO_OVERCOMMIT to BTRFS_FS_ACTIVE_ZONE_TRACKING [ Upstream commit bf1f1fec2724a33b67ec12032402ea75f2a83622 ] This flag only gets set when we're doing active zone tracking, and we're going to need to use this flag for things related to this behavior. Rename the flag to represent what it actually means for the file system so it can be used in other ways and still make sense. Reviewed-by: Naohiro Aota Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Sasha Levin --- fs/btrfs/ctree.h | 7 ++----- fs/btrfs/space-info.c | 2 +- fs/btrfs/zoned.c | 3 +-- 3 files changed, 4 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index a3febabacec0..3bcef0c4d6fc 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -590,11 +590,8 @@ enum { /* Indicate we have to finish a zone to do next allocation. */ BTRFS_FS_NEED_ZONE_FINISH, - /* - * Indicate metadata over-commit is disabled. This is set when active - * zone tracking is needed. - */ - BTRFS_FS_NO_OVERCOMMIT, + /* This is set when active zone tracking is needed. */ + BTRFS_FS_ACTIVE_ZONE_TRACKING, #if BITS_PER_LONG == 32 /* Indicate if we have error/warn message printed on 32bit systems */ diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 65c010159fb5..c7642c00a65d 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -404,7 +404,7 @@ int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, return 0; used = btrfs_space_info_used(space_info, true); - if (test_bit(BTRFS_FS_NO_OVERCOMMIT, &fs_info->flags) && + if (test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags) && (space_info->flags & BTRFS_BLOCK_GROUP_METADATA)) avail = 0; else diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 1b72004136ef..0d88cc46ac5d 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -538,8 +538,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) } atomic_set(&zone_info->active_zones_left, max_active_zones - nactive); - /* Overcommit does not work well with active zone tacking. */ - set_bit(BTRFS_FS_NO_OVERCOMMIT, &fs_info->flags); + set_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags); } /* Validate superblock log */ From 64621e4607f48254c27b9a0caa62ee4ada243466 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Mon, 13 Mar 2023 16:06:13 +0900 Subject: [PATCH 014/179] btrfs: zoned: count fresh BG region as zone unusable [ Upstream commit fa2068d7e922b434eba5bfb0131e6d39febfdb48 ] The naming of space_info->active_total_bytes is misleading. It counts not only active block groups but also full ones which are previously active but now inactive. That confusion results in a bug not counting the full BGs into active_total_bytes on mount time. For a background, there are three kinds of block groups in terms of activation. 1. Block groups never activated 2. Block groups currently active 3. Block groups previously active and currently inactive (due to fully written or zone finish) What we really wanted to exclude from "total_bytes" is the total size of BGs #1. They seem empty and allocatable but since they are not activated, we cannot rely on them to do the space reservation. And, since BGs #1 never get activated, they should have no "used", "reserved" and "pinned" bytes. OTOH, BGs #3 can be counted in the "total", since they are already full we cannot allocate from them anyway. For them, "total_bytes == used + reserved + pinned + zone_unusable" should hold. Tracking #2 and #3 as "active_total_bytes" (current implementation) is confusing. And, tracking #1 and subtract that properly from "total_bytes" every time you need space reservation is cumbersome. Instead, we can count the whole region of a newly allocated block group as zone_unusable. Then, once that block group is activated, release [0 .. zone_capacity] from the zone_unusable counters. With this, we can eliminate the confusing ->active_total_bytes and the code will be common among regular and the zoned mode. Also, no additional counter is needed with this approach. Fixes: 6a921de58992 ("btrfs: zoned: introduce space_info->active_total_bytes") CC: stable@vger.kernel.org # 6.1+ Signed-off-by: Naohiro Aota Signed-off-by: David Sterba Signed-off-by: Sasha Levin --- fs/btrfs/free-space-cache.c | 8 +++++++- fs/btrfs/zoned.c | 24 +++++++++++++++++++----- 2 files changed, 26 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index f4023651dd68..6a8f2bd350f4 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -2684,8 +2684,13 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group, bg_reclaim_threshold = READ_ONCE(sinfo->bg_reclaim_threshold); spin_lock(&ctl->tree_lock); + /* Count initial region as zone_unusable until it gets activated. */ if (!used) to_free = size; + else if (initial && + test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &block_group->fs_info->flags) && + (block_group->flags & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM))) + to_free = 0; else if (initial) to_free = block_group->zone_capacity; else if (offset >= block_group->alloc_offset) @@ -2713,7 +2718,8 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group, reclaimable_unusable = block_group->zone_unusable - (block_group->length - block_group->zone_capacity); /* All the region is now unusable. Mark it as unused and reclaim */ - if (block_group->zone_unusable == block_group->length) { + if (block_group->zone_unusable == block_group->length && + block_group->alloc_offset) { btrfs_mark_bg_unused(block_group); } else if (bg_reclaim_threshold && reclaimable_unusable >= diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 0d88cc46ac5d..e97c5a1ac95d 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1575,9 +1575,19 @@ void btrfs_calc_zone_unusable(struct btrfs_block_group *cache) return; WARN_ON(cache->bytes_super != 0); - unusable = (cache->alloc_offset - cache->used) + - (cache->length - cache->zone_capacity); - free = cache->zone_capacity - cache->alloc_offset; + + /* Check for block groups never get activated */ + if (test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &cache->fs_info->flags) && + cache->flags & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM) && + !test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags) && + cache->alloc_offset == 0) { + unusable = cache->length; + free = 0; + } else { + unusable = (cache->alloc_offset - cache->used) + + (cache->length - cache->zone_capacity); + free = cache->zone_capacity - cache->alloc_offset; + } /* We only need ->free_space in ALLOC_SEQ block groups */ cache->cached = BTRFS_CACHE_FINISHED; @@ -1914,7 +1924,11 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) /* Successfully activated all the zones */ set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags); - space_info->active_total_bytes += block_group->length; + WARN_ON(block_group->alloc_offset != 0); + if (block_group->zone_unusable == block_group->length) { + block_group->zone_unusable = block_group->length - block_group->zone_capacity; + space_info->bytes_zone_unusable -= block_group->zone_capacity; + } spin_unlock(&block_group->lock); btrfs_try_granting_tickets(fs_info, space_info); spin_unlock(&space_info->lock); @@ -2277,7 +2291,7 @@ int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info) u64 avail; spin_lock(&block_group->lock); - if (block_group->reserved || + if (block_group->reserved || block_group->alloc_offset == 0 || (block_group->flags & BTRFS_BLOCK_GROUP_SYSTEM)) { spin_unlock(&block_group->lock); continue; From a35f863210e8025bfd51280ad63285c70ff66a1a Mon Sep 17 00:00:00 2001 From: Siddharth Vadapalli Date: Fri, 20 Jan 2023 12:37:31 +0530 Subject: [PATCH 015/179] net: ethernet: ti: am65-cpsw/cpts: Fix CPTS release action [ Upstream commit 4ad8766cd3982744e53f107f378d2c65b76ff9a8 ] The am65_cpts_release() function is registered as a devm_action in the am65_cpts_create() function in am65-cpts driver. When the am65-cpsw driver invokes am65_cpts_create(), am65_cpts_release() is added in the set of devm actions associated with the am65-cpsw driver's device. In the event of probe failure or probe deferral, the platform_drv_probe() function invokes dev_pm_domain_detach() which powers off the CPSW and the CPSW's CPTS hardware, both of which share the same power domain. Since the am65_cpts_disable() function invoked by the am65_cpts_release() function attempts to reset the CPTS hardware by writing to its registers, the CPTS hardware is assumed to be powered on at this point. However, the hardware is powered off before the devm actions are executed. Fix this by getting rid of the devm action for am65_cpts_release() and invoking it directly on the cleanup and exit paths. Fixes: f6bd59526ca5 ("net: ethernet: ti: introduce am654 common platform time sync driver") Signed-off-by: Siddharth Vadapalli Reviewed-by: Leon Romanovsky Reviewed-by: Tony Nguyen Reviewed-by: Roger Quadros Signed-off-by: Paolo Abeni Signed-off-by: Sasha Levin --- drivers/net/ethernet/ti/am65-cpsw-nuss.c | 2 ++ drivers/net/ethernet/ti/am65-cpts.c | 15 +++++---------- drivers/net/ethernet/ti/am65-cpts.h | 5 +++++ 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c index 00911e936052..8ff1c84a23ce 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c +++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c @@ -2817,6 +2817,7 @@ static int am65_cpsw_nuss_probe(struct platform_device *pdev) err_free_phylink: am65_cpsw_nuss_phylink_cleanup(common); + am65_cpts_release(common->cpts); err_of_clear: of_platform_device_destroy(common->mdio_dev, NULL); err_pm_clear: @@ -2845,6 +2846,7 @@ static int am65_cpsw_nuss_remove(struct platform_device *pdev) */ am65_cpsw_nuss_cleanup_ndev(common); am65_cpsw_nuss_phylink_cleanup(common); + am65_cpts_release(common->cpts); of_platform_device_destroy(common->mdio_dev, NULL); diff --git a/drivers/net/ethernet/ti/am65-cpts.c b/drivers/net/ethernet/ti/am65-cpts.c index e2f0fb286143..9948ac14e68d 100644 --- a/drivers/net/ethernet/ti/am65-cpts.c +++ b/drivers/net/ethernet/ti/am65-cpts.c @@ -918,14 +918,13 @@ static int am65_cpts_of_parse(struct am65_cpts *cpts, struct device_node *node) return cpts_of_mux_clk_setup(cpts, node); } -static void am65_cpts_release(void *data) +void am65_cpts_release(struct am65_cpts *cpts) { - struct am65_cpts *cpts = data; - ptp_clock_unregister(cpts->ptp_clock); am65_cpts_disable(cpts); clk_disable_unprepare(cpts->refclk); } +EXPORT_SYMBOL_GPL(am65_cpts_release); struct am65_cpts *am65_cpts_create(struct device *dev, void __iomem *regs, struct device_node *node) @@ -1003,18 +1002,12 @@ struct am65_cpts *am65_cpts_create(struct device *dev, void __iomem *regs, } cpts->phc_index = ptp_clock_index(cpts->ptp_clock); - ret = devm_add_action_or_reset(dev, am65_cpts_release, cpts); - if (ret) { - dev_err(dev, "failed to add ptpclk reset action %d", ret); - return ERR_PTR(ret); - } - ret = devm_request_threaded_irq(dev, cpts->irq, NULL, am65_cpts_interrupt, IRQF_ONESHOT, dev_name(dev), cpts); if (ret < 0) { dev_err(cpts->dev, "error attaching irq %d\n", ret); - return ERR_PTR(ret); + goto reset_ptpclk; } dev_info(dev, "CPTS ver 0x%08x, freq:%u, add_val:%u\n", @@ -1023,6 +1016,8 @@ struct am65_cpts *am65_cpts_create(struct device *dev, void __iomem *regs, return cpts; +reset_ptpclk: + am65_cpts_release(cpts); refclk_disable: clk_disable_unprepare(cpts->refclk); return ERR_PTR(ret); diff --git a/drivers/net/ethernet/ti/am65-cpts.h b/drivers/net/ethernet/ti/am65-cpts.h index cf9fbc28fd03..c0ae0117e573 100644 --- a/drivers/net/ethernet/ti/am65-cpts.h +++ b/drivers/net/ethernet/ti/am65-cpts.h @@ -18,6 +18,7 @@ struct am65_cpts_estf_cfg { }; #if IS_ENABLED(CONFIG_TI_K3_AM65_CPTS) +void am65_cpts_release(struct am65_cpts *cpts); struct am65_cpts *am65_cpts_create(struct device *dev, void __iomem *regs, struct device_node *node); int am65_cpts_phc_index(struct am65_cpts *cpts); @@ -29,6 +30,10 @@ int am65_cpts_estf_enable(struct am65_cpts *cpts, int idx, struct am65_cpts_estf_cfg *cfg); void am65_cpts_estf_disable(struct am65_cpts *cpts, int idx); #else +static inline void am65_cpts_release(struct am65_cpts *cpts) +{ +} + static inline struct am65_cpts *am65_cpts_create(struct device *dev, void __iomem *regs, struct device_node *node) From 84cfcf240f4a577733b1d98fcd2611a611612b03 Mon Sep 17 00:00:00 2001 From: Andy Chiu Date: Thu, 12 Jan 2023 04:05:57 -0500 Subject: [PATCH 016/179] riscv: ftrace: Fixup panic by disabling preemption [ Upstream commit 8547649981e6631328cd64f583667501ae385531 ] In RISCV, we must use an AUIPC + JALR pair to encode an immediate, forming a jump that jumps to an address over 4K. This may cause errors if we want to enable kernel preemption and remove dependency from patching code with stop_machine(). For example, if a task was switched out on auipc. And, if we changed the ftrace function before it was switched back, then it would jump to an address that has updated 11:0 bits mixing with previous XLEN:12 part. p: patched area performed by dynamic ftrace ftrace_prologue: p| REG_S ra, -SZREG(sp) p| auipc ra, 0x? ------------> preempted ... change ftrace function ... p| jalr -?(ra) <------------- switched back p| REG_L ra, -SZREG(sp) func: xxx ret Fixes: afc76b8b8011 ("riscv: Using PATCHABLE_FUNCTION_ENTRY instead of MCOUNT") Signed-off-by: Andy Chiu Signed-off-by: Guo Ren Link: https://lore.kernel.org/r/20230112090603.1295340-2-guoren@kernel.org Cc: stable@vger.kernel.org Signed-off-by: Palmer Dabbelt Signed-off-by: Sasha Levin --- arch/riscv/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index ae11d5647f9d..06b9b2f60b9f 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -278,7 +278,7 @@ config ARCH_RV64I select HAVE_DYNAMIC_FTRACE_WITH_REGS if HAVE_DYNAMIC_FTRACE select HAVE_FTRACE_MCOUNT_RECORD if !XIP_KERNEL select HAVE_FUNCTION_GRAPH_TRACER - select HAVE_FUNCTION_TRACER if !XIP_KERNEL + select HAVE_FUNCTION_TRACER if !XIP_KERNEL && !PREEMPTION select SWIOTLB if MMU endchoice From 942100ec928cd292afc5d5cb341236d9a10f2778 Mon Sep 17 00:00:00 2001 From: Eddie James Date: Tue, 21 Feb 2023 11:03:52 +1030 Subject: [PATCH 017/179] ARM: dts: aspeed: p10bmc: Update battery node name [ Upstream commit a8cef541dd5ef9445130660008c029205c4c5aa5 ] The ADC sensor for the battery needs to be named "iio-hwmon" for compatibility with user space applications. Signed-off-by: Eddie James Link: https://lore.kernel.org/r/20230202152759.67069-1-eajames@linux.ibm.com Fixes: bf1914e2cfed ("ARM: dts: aspeed: p10bmc: Fix ADC iio-hwmon battery node name") Signed-off-by: Joel Stanley Link: https://lore.kernel.org/r/20230221003352.1218797-1-joel@jms.id.au Signed-off-by: Arnd Bergmann Signed-off-by: Sasha Levin --- arch/arm/boot/dts/aspeed-bmc-ibm-everest.dts | 2 +- arch/arm/boot/dts/aspeed-bmc-ibm-rainier.dts | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/boot/dts/aspeed-bmc-ibm-everest.dts b/arch/arm/boot/dts/aspeed-bmc-ibm-everest.dts index fcc890e3ad73..f11feb98fde3 100644 --- a/arch/arm/boot/dts/aspeed-bmc-ibm-everest.dts +++ b/arch/arm/boot/dts/aspeed-bmc-ibm-everest.dts @@ -244,7 +244,7 @@ }; }; - iio-hwmon-battery { + iio-hwmon { compatible = "iio-hwmon"; io-channels = <&adc1 7>; }; diff --git a/arch/arm/boot/dts/aspeed-bmc-ibm-rainier.dts b/arch/arm/boot/dts/aspeed-bmc-ibm-rainier.dts index 4879da4cdbd2..77a3a27b04e2 100644 --- a/arch/arm/boot/dts/aspeed-bmc-ibm-rainier.dts +++ b/arch/arm/boot/dts/aspeed-bmc-ibm-rainier.dts @@ -220,7 +220,7 @@ }; }; - iio-hwmon-battery { + iio-hwmon { compatible = "iio-hwmon"; io-channels = <&adc1 7>; }; From 30fff49ea686d1367ee2de0c45b436ae67a24c55 Mon Sep 17 00:00:00 2001 From: Robert Foss Date: Fri, 28 Oct 2022 14:08:05 +0200 Subject: [PATCH 018/179] drm/msm/dpu: Refactor sc7280_pp location [ Upstream commit 1a5b5372e3b0a4cc65a0cbb724b1b0859f4ac63c ] The sc7280_pp declaration is not located by the other _pp declarations, but rather hidden around the _merge_3d declarations. Let's fix this to avoid confusion. Signed-off-by: Robert Foss Reviewed-by: Dmitry Baryshkov Patchwork: https://patchwork.freedesktop.org/patch/509153/ Link: https://lore.kernel.org/r/20221028120812.339100-3-robert.foss@linaro.org Signed-off-by: Dmitry Baryshkov Stable-dep-of: 03c0c3cb22a4 ("drm/msm/dpu: correct sm8250 and sm8350 scaler") Signed-off-by: Sasha Levin --- drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c index 41c93a18d5cb..bbd884c8e0cb 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c @@ -1180,6 +1180,13 @@ static const struct dpu_pingpong_cfg sm8150_pp[] = { -1), }; +static const struct dpu_pingpong_cfg sc7280_pp[] = { + PP_BLK("pingpong_0", PINGPONG_0, 0x59000, 0, sc7280_pp_sblk, -1, -1), + PP_BLK("pingpong_1", PINGPONG_1, 0x6a000, 0, sc7280_pp_sblk, -1, -1), + PP_BLK("pingpong_2", PINGPONG_2, 0x6b000, 0, sc7280_pp_sblk, -1, -1), + PP_BLK("pingpong_3", PINGPONG_3, 0x6c000, 0, sc7280_pp_sblk, -1, -1), +}; + static struct dpu_pingpong_cfg qcm2290_pp[] = { PP_BLK("pingpong_0", PINGPONG_0, 0x70000, 0, sdm845_pp_sblk, DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 8), @@ -1203,13 +1210,6 @@ static const struct dpu_merge_3d_cfg sm8150_merge_3d[] = { MERGE_3D_BLK("merge_3d_2", MERGE_3D_2, 0x83200), }; -static const struct dpu_pingpong_cfg sc7280_pp[] = { - PP_BLK("pingpong_0", PINGPONG_0, 0x59000, 0, sc7280_pp_sblk, -1, -1), - PP_BLK("pingpong_1", PINGPONG_1, 0x6a000, 0, sc7280_pp_sblk, -1, -1), - PP_BLK("pingpong_2", PINGPONG_2, 0x6b000, 0, sc7280_pp_sblk, -1, -1), - PP_BLK("pingpong_3", PINGPONG_3, 0x6c000, 0, sc7280_pp_sblk, -1, -1), -}; - /************************************************************* * DSC sub blocks config *************************************************************/ From 3f7c4839fc041a9a4839ec40e3edccda74280d34 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Sun, 12 Feb 2023 01:12:18 +0200 Subject: [PATCH 019/179] drm/msm/dpu: correct sm8250 and sm8350 scaler [ Upstream commit 03c0c3cb22a4ff29afba1b43f0330289ea80433f ] QSEED4 is a newer variant of QSEED3LITE, which should be used on sm8250 and sm8350. Fix the DPU caps structure and used feature masks. Fixes: d21fc5dfc3df ("drm/msm/dpu1: add support for qseed3lite used on sm8250") Fixes: 0e91bcbb0016 ("drm/msm/dpu: Add SM8350 to hw catalog") Signed-off-by: Dmitry Baryshkov Reviewed-by: Abhinav Kumar Patchwork: https://patchwork.freedesktop.org/patch/522229/ Link: https://lore.kernel.org/r/20230211231259.1308718-10-dmitry.baryshkov@linaro.org Signed-off-by: Abhinav Kumar Signed-off-by: Sasha Levin --- drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c index bbd884c8e0cb..b1131860ada1 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c @@ -356,7 +356,7 @@ static const struct dpu_caps sc8180x_dpu_caps = { static const struct dpu_caps sm8250_dpu_caps = { .max_mixer_width = DEFAULT_DPU_OUTPUT_LINE_WIDTH, .max_mixer_blendstages = 0xb, - .qseed_type = DPU_SSPP_SCALER_QSEED3LITE, + .qseed_type = DPU_SSPP_SCALER_QSEED4, .smart_dma_rev = DPU_SSPP_SMART_DMA_V2, /* TODO: v2.5 */ .ubwc_version = DPU_HW_UBWC_VER_40, .has_src_split = true, @@ -855,22 +855,22 @@ static const struct dpu_sspp_cfg sc7180_sspp[] = { }; static const struct dpu_sspp_sub_blks sm8250_vig_sblk_0 = - _VIG_SBLK("0", 5, DPU_SSPP_SCALER_QSEED3LITE); + _VIG_SBLK("0", 5, DPU_SSPP_SCALER_QSEED4); static const struct dpu_sspp_sub_blks sm8250_vig_sblk_1 = - _VIG_SBLK("1", 6, DPU_SSPP_SCALER_QSEED3LITE); + _VIG_SBLK("1", 6, DPU_SSPP_SCALER_QSEED4); static const struct dpu_sspp_sub_blks sm8250_vig_sblk_2 = - _VIG_SBLK("2", 7, DPU_SSPP_SCALER_QSEED3LITE); + _VIG_SBLK("2", 7, DPU_SSPP_SCALER_QSEED4); static const struct dpu_sspp_sub_blks sm8250_vig_sblk_3 = - _VIG_SBLK("3", 8, DPU_SSPP_SCALER_QSEED3LITE); + _VIG_SBLK("3", 8, DPU_SSPP_SCALER_QSEED4); static const struct dpu_sspp_cfg sm8250_sspp[] = { - SSPP_BLK("sspp_0", SSPP_VIG0, 0x4000, VIG_SM8250_MASK, + SSPP_BLK("sspp_0", SSPP_VIG0, 0x4000, VIG_SC7180_MASK, sm8250_vig_sblk_0, 0, SSPP_TYPE_VIG, DPU_CLK_CTRL_VIG0), - SSPP_BLK("sspp_1", SSPP_VIG1, 0x6000, VIG_SM8250_MASK, + SSPP_BLK("sspp_1", SSPP_VIG1, 0x6000, VIG_SC7180_MASK, sm8250_vig_sblk_1, 4, SSPP_TYPE_VIG, DPU_CLK_CTRL_VIG1), - SSPP_BLK("sspp_2", SSPP_VIG2, 0x8000, VIG_SM8250_MASK, + SSPP_BLK("sspp_2", SSPP_VIG2, 0x8000, VIG_SC7180_MASK, sm8250_vig_sblk_2, 8, SSPP_TYPE_VIG, DPU_CLK_CTRL_VIG2), - SSPP_BLK("sspp_3", SSPP_VIG3, 0xa000, VIG_SM8250_MASK, + SSPP_BLK("sspp_3", SSPP_VIG3, 0xa000, VIG_SC7180_MASK, sm8250_vig_sblk_3, 12, SSPP_TYPE_VIG, DPU_CLK_CTRL_VIG3), SSPP_BLK("sspp_8", SSPP_DMA0, 0x24000, DMA_SDM845_MASK, sdm845_dma_sblk_0, 1, SSPP_TYPE_DMA, DPU_CLK_CTRL_DMA0), From 33123ad902f312de32bc149bf0d2613ce1fbb041 Mon Sep 17 00:00:00 2001 From: Kuogee Hsieh Date: Mon, 27 Feb 2023 13:36:40 -0800 Subject: [PATCH 020/179] drm/msm/disp/dpu: fix sc7280_pp base offset [ Upstream commit ce68153edb5b36ddf87a19ed5a85131498690bbf ] At sc7280, pingpong block is used to management the dither effects to reduce distortion at panel. Currently pingpong-0 base offset is wrongly set at 0x59000. This mistake will not cause system to crash. However it will make dither not work. This patch correct sc7280 ping pong-0 block base offset. Changes in v2: -- add more details info n regrading of pingpong block at commit text Fixes: 591e34a091d1 ("drm/msm/disp/dpu1: add support for display for SC7280 target") Signed-off-by: Kuogee Hsieh Reviewed-by: Abhinav Kumar Reviewed-by: Dmitry Baryshkov Patchwork: https://patchwork.freedesktop.org/patch/524332/ Link: https://lore.kernel.org/r/1677533800-3125-1-git-send-email-quic_khsieh@quicinc.com Signed-off-by: Abhinav Kumar Signed-off-by: Sasha Levin --- drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c index b1131860ada1..32a3c42ec45b 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c @@ -1181,7 +1181,7 @@ static const struct dpu_pingpong_cfg sm8150_pp[] = { }; static const struct dpu_pingpong_cfg sc7280_pp[] = { - PP_BLK("pingpong_0", PINGPONG_0, 0x59000, 0, sc7280_pp_sblk, -1, -1), + PP_BLK("pingpong_0", PINGPONG_0, 0x69000, 0, sc7280_pp_sblk, -1, -1), PP_BLK("pingpong_1", PINGPONG_1, 0x6a000, 0, sc7280_pp_sblk, -1, -1), PP_BLK("pingpong_2", PINGPONG_2, 0x6b000, 0, sc7280_pp_sblk, -1, -1), PP_BLK("pingpong_3", PINGPONG_3, 0x6c000, 0, sc7280_pp_sblk, -1, -1), From 3a4d6f959caaf5bc24779fe5846a191482df524c Mon Sep 17 00:00:00 2001 From: Sherry Sun Date: Wed, 23 Nov 2022 10:36:19 +0800 Subject: [PATCH 021/179] tty: serial: fsl_lpuart: switch to new dmaengine_terminate_* API [ Upstream commit 8682ab0eea89c300ebb120c02ead3999ca5560a8 ] Convert dmaengine_terminate_all() calls to synchronous and asynchronous versions where appropriate. Signed-off-by: Sherry Sun Link: https://lore.kernel.org/r/20221123023619.30173-1-sherry.sun@nxp.com Signed-off-by: Greg Kroah-Hartman Stable-dep-of: 1be6f2b15f90 ("tty: serial: fsl_lpuart: fix race on RX DMA shutdown") Signed-off-by: Sasha Levin --- drivers/tty/serial/fsl_lpuart.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/tty/serial/fsl_lpuart.c b/drivers/tty/serial/fsl_lpuart.c index c51883f34ac2..86e96696ab26 100644 --- a/drivers/tty/serial/fsl_lpuart.c +++ b/drivers/tty/serial/fsl_lpuart.c @@ -582,7 +582,7 @@ static void lpuart_flush_buffer(struct uart_port *port) sport->dma_tx_nents, DMA_TO_DEVICE); sport->dma_tx_in_progress = false; } - dmaengine_terminate_all(chan); + dmaengine_terminate_async(chan); } if (lpuart_is_32(sport)) { @@ -1333,7 +1333,7 @@ static void lpuart_dma_rx_free(struct uart_port *port) struct lpuart_port, port); struct dma_chan *chan = sport->dma_rx_chan; - dmaengine_terminate_all(chan); + dmaengine_terminate_sync(chan); dma_unmap_sg(chan->device->dev, &sport->rx_sgl, 1, DMA_FROM_DEVICE); kfree(sport->rx_ring.buf); sport->rx_ring.tail = 0; @@ -1766,7 +1766,7 @@ static void lpuart_dma_shutdown(struct lpuart_port *sport) if (wait_event_interruptible_timeout(sport->dma_wait, !sport->dma_tx_in_progress, msecs_to_jiffies(300)) <= 0) { sport->dma_tx_in_progress = false; - dmaengine_terminate_all(sport->dma_tx_chan); + dmaengine_terminate_sync(sport->dma_tx_chan); } sport->lpuart_dma_tx_use = false; } @@ -2867,7 +2867,7 @@ static int __maybe_unused lpuart_suspend(struct device *dev) if (sport->lpuart_dma_tx_use) { sport->dma_tx_in_progress = false; - dmaengine_terminate_all(sport->dma_tx_chan); + dmaengine_terminate_sync(sport->dma_tx_chan); } if (sport->port.suspended && !irq_wake) From 954fc9931f0aabf272b5674cf468affdd88d3a36 Mon Sep 17 00:00:00 2001 From: Alexander Sverdlin Date: Thu, 9 Mar 2023 14:43:02 +0100 Subject: [PATCH 022/179] tty: serial: fsl_lpuart: fix race on RX DMA shutdown [ Upstream commit 1be6f2b15f902c02e055ae0b419ca789200473c9 ] From time to time DMA completion can come in the middle of DMA shutdown: : : lpuart32_shutdown() lpuart_dma_shutdown() del_timer_sync() lpuart_dma_rx_complete() lpuart_copy_rx_to_tty() mod_timer() lpuart_dma_rx_free() When the timer fires a bit later, sport->dma_rx_desc is NULL: Unable to handle kernel NULL pointer dereference at virtual address 0000000000000004 pc : lpuart_copy_rx_to_tty+0xcc/0x5bc lr : lpuart_timer_func+0x1c/0x2c Call trace: lpuart_copy_rx_to_tty lpuart_timer_func call_timer_fn __run_timers.part.0 run_timer_softirq __do_softirq __irq_exit_rcu irq_exit handle_domain_irq gic_handle_irq call_on_irq_stack do_interrupt_handler ... To fix this fold del_timer_sync() into lpuart_dma_rx_free() after dmaengine_terminate_sync() to make sure timer will not be re-started in lpuart_copy_rx_to_tty() <= lpuart_dma_rx_complete(). Fixes: 4a8588a1cf86 ("serial: fsl_lpuart: delete timer on shutdown") Cc: stable Signed-off-by: Alexander Sverdlin Link: https://lore.kernel.org/r/20230309134302.74940-2-alexander.sverdlin@siemens.com Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin --- drivers/tty/serial/fsl_lpuart.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/drivers/tty/serial/fsl_lpuart.c b/drivers/tty/serial/fsl_lpuart.c index 86e96696ab26..cd98c04de033 100644 --- a/drivers/tty/serial/fsl_lpuart.c +++ b/drivers/tty/serial/fsl_lpuart.c @@ -1334,6 +1334,7 @@ static void lpuart_dma_rx_free(struct uart_port *port) struct dma_chan *chan = sport->dma_rx_chan; dmaengine_terminate_sync(chan); + del_timer_sync(&sport->lpuart_timer); dma_unmap_sg(chan->device->dev, &sport->rx_sgl, 1, DMA_FROM_DEVICE); kfree(sport->rx_ring.buf); sport->rx_ring.tail = 0; @@ -1757,7 +1758,6 @@ static int lpuart32_startup(struct uart_port *port) static void lpuart_dma_shutdown(struct lpuart_port *sport) { if (sport->lpuart_dma_rx_use) { - del_timer_sync(&sport->lpuart_timer); lpuart_dma_rx_free(&sport->port); sport->lpuart_dma_rx_use = false; } @@ -1917,10 +1917,8 @@ lpuart_set_termios(struct uart_port *port, struct ktermios *termios, * Since timer function acqures sport->port.lock, need to stop before * acquring same lock because otherwise del_timer_sync() can deadlock. */ - if (old && sport->lpuart_dma_rx_use) { - del_timer_sync(&sport->lpuart_timer); + if (old && sport->lpuart_dma_rx_use) lpuart_dma_rx_free(&sport->port); - } spin_lock_irqsave(&sport->port.lock, flags); @@ -2154,10 +2152,8 @@ lpuart32_set_termios(struct uart_port *port, struct ktermios *termios, * Since timer function acqures sport->port.lock, need to stop before * acquring same lock because otherwise del_timer_sync() can deadlock. */ - if (old && sport->lpuart_dma_rx_use) { - del_timer_sync(&sport->lpuart_timer); + if (old && sport->lpuart_dma_rx_use) lpuart_dma_rx_free(&sport->port); - } spin_lock_irqsave(&sport->port.lock, flags); @@ -2850,7 +2846,6 @@ static int __maybe_unused lpuart_suspend(struct device *dev) * Rx DMA path before suspend and start Rx DMA path on resume. */ if (irq_wake) { - del_timer_sync(&sport->lpuart_timer); lpuart_dma_rx_free(&sport->port); } From 93454d1a306ea975294b861d32ffa1e44b20c733 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Thu, 20 Oct 2022 00:31:55 +0900 Subject: [PATCH 023/179] tracing: Add .percent suffix option to histogram values [ Upstream commit abaa5258ce5e5887a9de049f50a85dc023391a1c ] Add .percent suffix option to show the histogram values in percentage. This feature is useful when we need yo undersntand the overall trend for the histograms of large values. E.g. this shows the runtime percentage for each tasks. ------ # cd /sys/kernel/debug/tracing/ # echo hist:keys=pid:vals=hitcount,runtime.percent:sort=pid > \ events/sched/sched_stat_runtime/trigger # sleep 10 # cat events/sched/sched_stat_runtime/hist # event histogram # # trigger info: hist:keys=pid:vals=hitcount,runtime.percent:sort=pid:size=2048 [active] # { pid: 8 } hitcount: 7 runtime (%): 4.14 { pid: 14 } hitcount: 5 runtime (%): 3.69 { pid: 16 } hitcount: 11 runtime (%): 3.41 { pid: 61 } hitcount: 41 runtime (%): 19.75 { pid: 65 } hitcount: 4 runtime (%): 1.48 { pid: 70 } hitcount: 6 runtime (%): 3.60 { pid: 72 } hitcount: 2 runtime (%): 1.10 { pid: 144 } hitcount: 10 runtime (%): 32.01 { pid: 151 } hitcount: 8 runtime (%): 22.66 { pid: 152 } hitcount: 2 runtime (%): 8.10 Totals: Hits: 96 Entries: 10 Dropped: 0 ----- Link: https://lore.kernel.org/linux-trace-kernel/166610813077.56030.4238090506973562347.stgit@devnote2 Signed-off-by: Masami Hiramatsu (Google) Reviewed-by: Tom Zanussi Tested-by: Tom Zanussi Stable-dep-of: e0213434fe3e ("tracing: Do not let histogram values have some modifiers") Signed-off-by: Sasha Levin --- kernel/trace/trace.c | 3 +- kernel/trace/trace_events_hist.c | 90 +++++++++++++++++++++++++++----- 2 files changed, 78 insertions(+), 15 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 888980257340..f714ed1f1c67 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5727,7 +5727,8 @@ static const char readme_msg[] = "\t .syscall display a syscall id as a syscall name\n" "\t .log2 display log2 value rather than raw number\n" "\t .buckets=size display values in groups of size rather than raw number\n" - "\t .usecs display a common_timestamp in microseconds\n\n" + "\t .usecs display a common_timestamp in microseconds\n" + "\t .percent display a number of percentage value\n\n" "\t The 'pause' parameter can be used to pause an existing hist\n" "\t trigger or to start a hist trigger but not log any events\n" "\t until told to do so. 'continue' can be used to start or\n" diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index e3df03cdecbc..1c207fbf5634 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -506,6 +506,7 @@ enum hist_field_flags { HIST_FIELD_FL_ALIAS = 1 << 16, HIST_FIELD_FL_BUCKET = 1 << 17, HIST_FIELD_FL_CONST = 1 << 18, + HIST_FIELD_FL_PERCENT = 1 << 19, }; struct var_defs { @@ -1708,6 +1709,8 @@ static const char *get_hist_field_flags(struct hist_field *hist_field) flags_str = "buckets"; else if (hist_field->flags & HIST_FIELD_FL_TIMESTAMP_USECS) flags_str = "usecs"; + else if (hist_field->flags & HIST_FIELD_FL_PERCENT) + flags_str = "percent"; return flags_str; } @@ -2320,6 +2323,10 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, if (ret || !(*buckets)) goto error; *flags |= HIST_FIELD_FL_BUCKET; + } else if (strncmp(modifier, "percent", 7) == 0) { + if (*flags & (HIST_FIELD_FL_VAR | HIST_FIELD_FL_KEY)) + goto error; + *flags |= HIST_FIELD_FL_PERCENT; } else { error: hist_err(tr, HIST_ERR_BAD_FIELD_MODIFIER, errpos(modifier)); @@ -5297,33 +5304,69 @@ static void hist_trigger_print_key(struct seq_file *m, seq_puts(m, "}"); } +/* Get the 100 times of the percentage of @val in @total */ +static inline unsigned int __get_percentage(u64 val, u64 total) +{ + if (!total) + goto div0; + + if (val < (U64_MAX / 10000)) + return (unsigned int)div64_ul(val * 10000, total); + + total = div64_u64(total, 10000); + if (!total) + goto div0; + + return (unsigned int)div64_ul(val, total); +div0: + return val ? UINT_MAX : 0; +} + +static void hist_trigger_print_val(struct seq_file *m, unsigned int idx, + const char *field_name, unsigned long flags, + u64 *totals, struct tracing_map_elt *elt) +{ + u64 val = tracing_map_read_sum(elt, idx); + unsigned int pc; + + if (flags & HIST_FIELD_FL_PERCENT) { + pc = __get_percentage(val, totals[idx]); + if (pc == UINT_MAX) + seq_printf(m, " %s (%%):[ERROR]", field_name); + else + seq_printf(m, " %s (%%): %3u.%02u", field_name, + pc / 100, pc % 100); + } else if (flags & HIST_FIELD_FL_HEX) { + seq_printf(m, " %s: %10llx", field_name, val); + } else { + seq_printf(m, " %s: %10llu", field_name, val); + } +} + static void hist_trigger_entry_print(struct seq_file *m, struct hist_trigger_data *hist_data, + u64 *totals, void *key, struct tracing_map_elt *elt) { const char *field_name; - unsigned int i; + unsigned int i = HITCOUNT_IDX; + unsigned long flags; hist_trigger_print_key(m, hist_data, key, elt); - seq_printf(m, " hitcount: %10llu", - tracing_map_read_sum(elt, HITCOUNT_IDX)); + /* At first, show the raw hitcount always */ + hist_trigger_print_val(m, i, "hitcount", 0, totals, elt); for (i = 1; i < hist_data->n_vals; i++) { field_name = hist_field_name(hist_data->fields[i], 0); + flags = hist_data->fields[i]->flags; - if (hist_data->fields[i]->flags & HIST_FIELD_FL_VAR || - hist_data->fields[i]->flags & HIST_FIELD_FL_EXPR) + if (flags & HIST_FIELD_FL_VAR || flags & HIST_FIELD_FL_EXPR) continue; - if (hist_data->fields[i]->flags & HIST_FIELD_FL_HEX) { - seq_printf(m, " %s: %10llx", field_name, - tracing_map_read_sum(elt, i)); - } else { - seq_printf(m, " %s: %10llu", field_name, - tracing_map_read_sum(elt, i)); - } + seq_puts(m, " "); + hist_trigger_print_val(m, i, field_name, flags, totals, elt); } print_actions(m, hist_data, elt); @@ -5336,7 +5379,8 @@ static int print_entries(struct seq_file *m, { struct tracing_map_sort_entry **sort_entries = NULL; struct tracing_map *map = hist_data->map; - int i, n_entries; + int i, j, n_entries; + u64 *totals = NULL; n_entries = tracing_map_sort_entries(map, hist_data->sort_keys, hist_data->n_sort_keys, @@ -5344,11 +5388,29 @@ static int print_entries(struct seq_file *m, if (n_entries < 0) return n_entries; + for (j = 0; j < hist_data->n_vals; j++) { + if (!(hist_data->fields[j]->flags & HIST_FIELD_FL_PERCENT)) + continue; + if (!totals) { + totals = kcalloc(hist_data->n_vals, sizeof(u64), + GFP_KERNEL); + if (!totals) { + n_entries = -ENOMEM; + goto out; + } + } + for (i = 0; i < n_entries; i++) + totals[j] += tracing_map_read_sum( + sort_entries[i]->elt, j); + } + for (i = 0; i < n_entries; i++) - hist_trigger_entry_print(m, hist_data, + hist_trigger_entry_print(m, hist_data, totals, sort_entries[i]->key, sort_entries[i]->elt); + kfree(totals); +out: tracing_map_destroy_sort_entries(sort_entries, n_entries); return n_entries; From 8ebeea1052f6d02b5eb5fad32f4e3cb58246fe0b Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Thu, 20 Oct 2022 00:31:55 +0900 Subject: [PATCH 024/179] tracing: Add .graph suffix option to histogram value [ Upstream commit a2c54256dec7510477e2b4f4db187e638f7cac37 ] Add the .graph suffix which shows the bar graph of the histogram value. For example, the below example shows that the bar graph of the histogram of the runtime for each tasks. ------ # cd /sys/kernel/debug/tracing/ # echo hist:keys=pid:vals=runtime.graph:sort=pid > \ events/sched/sched_stat_runtime/trigger # sleep 10 # cat events/sched/sched_stat_runtime/hist # event histogram # # trigger info: hist:keys=pid:vals=hitcount,runtime.graph:sort=pid:size=2048 [active] # { pid: 14 } hitcount: 2 runtime: { pid: 16 } hitcount: 8 runtime: { pid: 26 } hitcount: 1 runtime: { pid: 57 } hitcount: 3 runtime: { pid: 61 } hitcount: 20 runtime: ### { pid: 66 } hitcount: 2 runtime: { pid: 70 } hitcount: 3 runtime: { pid: 72 } hitcount: 2 runtime: { pid: 145 } hitcount: 14 runtime: #################### { pid: 152 } hitcount: 5 runtime: ####### { pid: 153 } hitcount: 2 runtime: #### Totals: Hits: 62 Entries: 11 Dropped: 0 ------- Link: https://lore.kernel.org/linux-trace-kernel/166610813953.56030.10944148382315789485.stgit@devnote2 Signed-off-by: Masami Hiramatsu (Google) Reviewed-by: Tom Zanussi Tested-by: Tom Zanussi Stable-dep-of: e0213434fe3e ("tracing: Do not let histogram values have some modifiers") Signed-off-by: Sasha Levin --- kernel/trace/trace.c | 3 +- kernel/trace/trace_events_hist.c | 77 +++++++++++++++++++++++++------- 2 files changed, 63 insertions(+), 17 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f714ed1f1c67..78d69b9488e4 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5728,7 +5728,8 @@ static const char readme_msg[] = "\t .log2 display log2 value rather than raw number\n" "\t .buckets=size display values in groups of size rather than raw number\n" "\t .usecs display a common_timestamp in microseconds\n" - "\t .percent display a number of percentage value\n\n" + "\t .percent display a number of percentage value\n" + "\t .graph display a bar-graph of a value\n\n" "\t The 'pause' parameter can be used to pause an existing hist\n" "\t trigger or to start a hist trigger but not log any events\n" "\t until told to do so. 'continue' can be used to start or\n" diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 1c207fbf5634..8e0acf8009bd 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -507,6 +507,7 @@ enum hist_field_flags { HIST_FIELD_FL_BUCKET = 1 << 17, HIST_FIELD_FL_CONST = 1 << 18, HIST_FIELD_FL_PERCENT = 1 << 19, + HIST_FIELD_FL_GRAPH = 1 << 20, }; struct var_defs { @@ -1711,6 +1712,8 @@ static const char *get_hist_field_flags(struct hist_field *hist_field) flags_str = "usecs"; else if (hist_field->flags & HIST_FIELD_FL_PERCENT) flags_str = "percent"; + else if (hist_field->flags & HIST_FIELD_FL_GRAPH) + flags_str = "graph"; return flags_str; } @@ -2327,6 +2330,10 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, if (*flags & (HIST_FIELD_FL_VAR | HIST_FIELD_FL_KEY)) goto error; *flags |= HIST_FIELD_FL_PERCENT; + } else if (strncmp(modifier, "graph", 5) == 0) { + if (*flags & (HIST_FIELD_FL_VAR | HIST_FIELD_FL_KEY)) + goto error; + *flags |= HIST_FIELD_FL_GRAPH; } else { error: hist_err(tr, HIST_ERR_BAD_FIELD_MODIFIER, errpos(modifier)); @@ -5322,20 +5329,52 @@ div0: return val ? UINT_MAX : 0; } +#define BAR_CHAR '#' + +static inline const char *__fill_bar_str(char *buf, int size, u64 val, u64 max) +{ + unsigned int len = __get_percentage(val, max); + int i; + + if (len == UINT_MAX) { + snprintf(buf, size, "[ERROR]"); + return buf; + } + + len = len * size / 10000; + for (i = 0; i < len && i < size; i++) + buf[i] = BAR_CHAR; + while (i < size) + buf[i++] = ' '; + buf[size] = '\0'; + + return buf; +} + +struct hist_val_stat { + u64 max; + u64 total; +}; + static void hist_trigger_print_val(struct seq_file *m, unsigned int idx, const char *field_name, unsigned long flags, - u64 *totals, struct tracing_map_elt *elt) + struct hist_val_stat *stats, + struct tracing_map_elt *elt) { u64 val = tracing_map_read_sum(elt, idx); unsigned int pc; + char bar[21]; if (flags & HIST_FIELD_FL_PERCENT) { - pc = __get_percentage(val, totals[idx]); + pc = __get_percentage(val, stats[idx].total); if (pc == UINT_MAX) seq_printf(m, " %s (%%):[ERROR]", field_name); else seq_printf(m, " %s (%%): %3u.%02u", field_name, pc / 100, pc % 100); + } else if (flags & HIST_FIELD_FL_GRAPH) { + seq_printf(m, " %s: %20s", field_name, + __fill_bar_str(bar, 20, val, stats[idx].max)); } else if (flags & HIST_FIELD_FL_HEX) { seq_printf(m, " %s: %10llx", field_name, val); } else { @@ -5345,7 +5384,7 @@ static void hist_trigger_print_val(struct seq_file *m, unsigned int idx, static void hist_trigger_entry_print(struct seq_file *m, struct hist_trigger_data *hist_data, - u64 *totals, + struct hist_val_stat *stats, void *key, struct tracing_map_elt *elt) { @@ -5356,7 +5395,7 @@ static void hist_trigger_entry_print(struct seq_file *m, hist_trigger_print_key(m, hist_data, key, elt); /* At first, show the raw hitcount always */ - hist_trigger_print_val(m, i, "hitcount", 0, totals, elt); + hist_trigger_print_val(m, i, "hitcount", 0, stats, elt); for (i = 1; i < hist_data->n_vals; i++) { field_name = hist_field_name(hist_data->fields[i], 0); @@ -5366,7 +5405,7 @@ static void hist_trigger_entry_print(struct seq_file *m, continue; seq_puts(m, " "); - hist_trigger_print_val(m, i, field_name, flags, totals, elt); + hist_trigger_print_val(m, i, field_name, flags, stats, elt); } print_actions(m, hist_data, elt); @@ -5380,7 +5419,8 @@ static int print_entries(struct seq_file *m, struct tracing_map_sort_entry **sort_entries = NULL; struct tracing_map *map = hist_data->map; int i, j, n_entries; - u64 *totals = NULL; + struct hist_val_stat *stats = NULL; + u64 val; n_entries = tracing_map_sort_entries(map, hist_data->sort_keys, hist_data->n_sort_keys, @@ -5388,28 +5428,33 @@ static int print_entries(struct seq_file *m, if (n_entries < 0) return n_entries; + /* Calculate the max and the total for each field if needed. */ for (j = 0; j < hist_data->n_vals; j++) { - if (!(hist_data->fields[j]->flags & HIST_FIELD_FL_PERCENT)) + if (!(hist_data->fields[j]->flags & + (HIST_FIELD_FL_PERCENT | HIST_FIELD_FL_GRAPH))) continue; - if (!totals) { - totals = kcalloc(hist_data->n_vals, sizeof(u64), - GFP_KERNEL); - if (!totals) { + if (!stats) { + stats = kcalloc(hist_data->n_vals, sizeof(*stats), + GFP_KERNEL); + if (!stats) { n_entries = -ENOMEM; goto out; } } - for (i = 0; i < n_entries; i++) - totals[j] += tracing_map_read_sum( - sort_entries[i]->elt, j); + for (i = 0; i < n_entries; i++) { + val = tracing_map_read_sum(sort_entries[i]->elt, j); + stats[j].total += val; + if (stats[j].max < val) + stats[j].max = val; + } } for (i = 0; i < n_entries; i++) - hist_trigger_entry_print(m, hist_data, totals, + hist_trigger_entry_print(m, hist_data, stats, sort_entries[i]->key, sort_entries[i]->elt); - kfree(totals); + kfree(stats); out: tracing_map_destroy_sort_entries(sort_entries, n_entries); From 39cd75f2f3a43c0e2f95749eb6dd6420c553f87d Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 1 Mar 2023 20:00:52 -0500 Subject: [PATCH 025/179] tracing: Do not let histogram values have some modifiers [ Upstream commit e0213434fe3e4a0d118923dc98d31e7ff1cd9e45 ] Histogram values can not be strings, stacktraces, graphs, symbols, syscalls, or grouped in buckets or log. Give an error if a value is set to do so. Note, the histogram code was not prepared to handle these modifiers for histograms and caused a bug. Mark Rutland reported: # echo 'p:copy_to_user __arch_copy_to_user n=$arg2' >> /sys/kernel/tracing/kprobe_events # echo 'hist:keys=n:vals=hitcount.buckets=8:sort=hitcount' > /sys/kernel/tracing/events/kprobes/copy_to_user/trigger # cat /sys/kernel/tracing/events/kprobes/copy_to_user/hist [ 143.694628] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000000 [ 143.695190] Mem abort info: [ 143.695362] ESR = 0x0000000096000004 [ 143.695604] EC = 0x25: DABT (current EL), IL = 32 bits [ 143.695889] SET = 0, FnV = 0 [ 143.696077] EA = 0, S1PTW = 0 [ 143.696302] FSC = 0x04: level 0 translation fault [ 143.702381] Data abort info: [ 143.702614] ISV = 0, ISS = 0x00000004 [ 143.702832] CM = 0, WnR = 0 [ 143.703087] user pgtable: 4k pages, 48-bit VAs, pgdp=00000000448f9000 [ 143.703407] [0000000000000000] pgd=0000000000000000, p4d=0000000000000000 [ 143.704137] Internal error: Oops: 0000000096000004 [#1] PREEMPT SMP [ 143.704714] Modules linked in: [ 143.705273] CPU: 0 PID: 133 Comm: cat Not tainted 6.2.0-00003-g6fc512c10a7c #3 [ 143.706138] Hardware name: linux,dummy-virt (DT) [ 143.706723] pstate: 80000005 (Nzcv daif -PAN -UAO -TCO -DIT -SSBS BTYPE=--) [ 143.707120] pc : hist_field_name.part.0+0x14/0x140 [ 143.707504] lr : hist_field_name.part.0+0x104/0x140 [ 143.707774] sp : ffff800008333a30 [ 143.707952] x29: ffff800008333a30 x28: 0000000000000001 x27: 0000000000400cc0 [ 143.708429] x26: ffffd7a653b20260 x25: 0000000000000000 x24: ffff10d303ee5800 [ 143.708776] x23: ffffd7a6539b27b0 x22: ffff10d303fb8c00 x21: 0000000000000001 [ 143.709127] x20: ffff10d303ec2000 x19: 0000000000000000 x18: 0000000000000000 [ 143.709478] x17: 0000000000000000 x16: 0000000000000000 x15: 0000000000000000 [ 143.709824] x14: 0000000000000000 x13: 203a6f666e692072 x12: 6567676972742023 [ 143.710179] x11: 0a230a6d6172676f x10: 000000000000002c x9 : ffffd7a6521e018c [ 143.710584] x8 : 000000000000002c x7 : 7f7f7f7f7f7f7f7f x6 : 000000000000002c [ 143.710915] x5 : ffff10d303b0103e x4 : ffffd7a653b20261 x3 : 000000000000003d [ 143.711239] x2 : 0000000000020001 x1 : 0000000000000001 x0 : 0000000000000000 [ 143.711746] Call trace: [ 143.712115] hist_field_name.part.0+0x14/0x140 [ 143.712642] hist_field_name.part.0+0x104/0x140 [ 143.712925] hist_field_print+0x28/0x140 [ 143.713125] event_hist_trigger_print+0x174/0x4d0 [ 143.713348] hist_show+0xf8/0x980 [ 143.713521] seq_read_iter+0x1bc/0x4b0 [ 143.713711] seq_read+0x8c/0xc4 [ 143.713876] vfs_read+0xc8/0x2a4 [ 143.714043] ksys_read+0x70/0xfc [ 143.714218] __arm64_sys_read+0x24/0x30 [ 143.714400] invoke_syscall+0x50/0x120 [ 143.714587] el0_svc_common.constprop.0+0x4c/0x100 [ 143.714807] do_el0_svc+0x44/0xd0 [ 143.714970] el0_svc+0x2c/0x84 [ 143.715134] el0t_64_sync_handler+0xbc/0x140 [ 143.715334] el0t_64_sync+0x190/0x194 [ 143.715742] Code: a9bd7bfd 910003fd a90153f3 aa0003f3 (f9400000) [ 143.716510] ---[ end trace 0000000000000000 ]--- Segmentation fault Link: https://lkml.kernel.org/r/20230302020810.559462599@goodmis.org Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Andrew Morton Fixes: c6afad49d127f ("tracing: Add hist trigger 'sym' and 'sym-offset' modifiers") Reported-by: Mark Rutland Tested-by: Mark Rutland Signed-off-by: Steven Rostedt (Google) Signed-off-by: Sasha Levin --- kernel/trace/trace_events_hist.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 8e0acf8009bd..2b2120ed2460 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -4193,6 +4193,15 @@ static int __create_val_field(struct hist_trigger_data *hist_data, goto out; } + /* Some types cannot be a value */ + if (hist_field->flags & (HIST_FIELD_FL_GRAPH | HIST_FIELD_FL_PERCENT | + HIST_FIELD_FL_BUCKET | HIST_FIELD_FL_LOG2 | + HIST_FIELD_FL_SYM | HIST_FIELD_FL_SYM_OFFSET | + HIST_FIELD_FL_SYSCALL | HIST_FIELD_FL_STACKTRACE)) { + hist_err(file->tr, HIST_ERR_BAD_FIELD_MODIFIER, errpos(field_str)); + ret = -EINVAL; + } + hist_data->fields[val_idx] = hist_field; ++hist_data->n_vals; From ec6cd79c4e54a2a5867e0497d269ad4f008216ba Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 21 Mar 2023 03:03:23 +0200 Subject: [PATCH 026/179] net: mscc: ocelot: fix stats region batching [ Upstream commit 6acc72a43eac78a309160d0a7512bbc59bcdd757 ] The blamed commit changed struct ocelot_stat_layout :: "u32 offset" to "u32 reg". However, "u32 reg" is not quite a register address, but an enum ocelot_reg, which in itself encodes an enum ocelot_target target in the upper bits, and an index into the ocelot->map[target][] array in the lower bits. So, whereas the previous code comparison between stats_layout[i].offset and last + 1 was correct (because those "offsets" at the time were 32-bit relative addresses), the new code, comparing layout[i].reg to last + 4 is not correct, because the "reg" here is an enum/index, not an actual register address. What we want to compare are indeed register addresses, but to do that, we need to actually go through the same motions as __ocelot_bulk_read_ix() itself. With this bug, all statistics counters are deemed by ocelot_prepare_stats_regions() as constituting their own region. (Truncated) log on VSC9959 (Felix) below (prints added by me): Before: region of 1 contiguous counters starting with SYS:STAT:CNT[0x000] region of 1 contiguous counters starting with SYS:STAT:CNT[0x001] region of 1 contiguous counters starting with SYS:STAT:CNT[0x002] ... region of 1 contiguous counters starting with SYS:STAT:CNT[0x041] region of 1 contiguous counters starting with SYS:STAT:CNT[0x042] region of 1 contiguous counters starting with SYS:STAT:CNT[0x080] region of 1 contiguous counters starting with SYS:STAT:CNT[0x081] ... region of 1 contiguous counters starting with SYS:STAT:CNT[0x0ac] region of 1 contiguous counters starting with SYS:STAT:CNT[0x100] region of 1 contiguous counters starting with SYS:STAT:CNT[0x101] ... region of 1 contiguous counters starting with SYS:STAT:CNT[0x111] After: region of 67 contiguous counters starting with SYS:STAT:CNT[0x000] region of 45 contiguous counters starting with SYS:STAT:CNT[0x080] region of 18 contiguous counters starting with SYS:STAT:CNT[0x100] Since commit d87b1c08f38a ("net: mscc: ocelot: use bulk reads for stats") intended bulking as a performance improvement, and since now, with trivial-sized regions, performance is even worse than without bulking at all, this could easily qualify as a performance regression. Fixes: d4c367650704 ("net: mscc: ocelot: keep ocelot_stat_layout by reg address, not offset") Signed-off-by: Vladimir Oltean Acked-by: Colin Foster Tested-by: Colin Foster Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/net/ethernet/mscc/ocelot_stats.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mscc/ocelot_stats.c b/drivers/net/ethernet/mscc/ocelot_stats.c index dbd20b125cea..0066219bb0e8 100644 --- a/drivers/net/ethernet/mscc/ocelot_stats.c +++ b/drivers/net/ethernet/mscc/ocelot_stats.c @@ -392,7 +392,8 @@ static int ocelot_prepare_stats_regions(struct ocelot *ocelot) if (!ocelot->stats_layout[i].reg) continue; - if (region && ocelot->stats_layout[i].reg == last + 4) { + if (region && ocelot->map[SYS][ocelot->stats_layout[i].reg & REG_MASK] == + ocelot->map[SYS][last & REG_MASK] + 4) { region->count++; } else { region = devm_kzalloc(ocelot->dev, sizeof(*region), From e0169d62efce979eb7dbda3101eea180b69ca793 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 10 Mar 2023 13:30:05 +0100 Subject: [PATCH 027/179] arm64: efi: Set NX compat flag in PE/COFF header [ Upstream commit 3c66bb1918c262dd52fb4221a8d372619c5da70a ] The PE/COFF header has a NX compat flag which informs the firmware that the application does not rely on memory regions being mapped with both executable and writable permissions at the same time. This is typically used by the firmware to decide whether it can set the NX attribute on all allocations it returns, but going forward, it may be used to enforce a policy that only permits applications with the NX flag set to be loaded to begin wiht in some configurations, e.g., when Secure Boot is in effect. Even though the arm64 version of the EFI stub may relocate the kernel before executing it, it always did so after disabling the MMU, and so we were always in line with what the NX compat flag conveys, we just never bothered to set it. So let's set the flag now. Cc: Signed-off-by: Ard Biesheuvel Signed-off-by: Sasha Levin --- arch/arm64/kernel/efi-header.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kernel/efi-header.S b/arch/arm64/kernel/efi-header.S index 28d8a5dca5f1..d731b4655df8 100644 --- a/arch/arm64/kernel/efi-header.S +++ b/arch/arm64/kernel/efi-header.S @@ -66,7 +66,7 @@ .long .Lefi_header_end - .L_head // SizeOfHeaders .long 0 // CheckSum .short IMAGE_SUBSYSTEM_EFI_APPLICATION // Subsystem - .short 0 // DllCharacteristics + .short IMAGE_DLL_CHARACTERISTICS_NX_COMPAT // DllCharacteristics .quad 0 // SizeOfStackReserve .quad 0 // SizeOfStackCommit .quad 0 // SizeOfHeapReserve From 2b4830eefc41b06d4943ae860abf24ec3901d89f Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Fri, 24 Mar 2023 16:05:19 -0300 Subject: [PATCH 028/179] cifs: fix missing unload_nls() in smb2_reconnect() [ Upstream commit c24bb1a87dc3f2d77d410eaac2c6a295961bf50e ] Make sure to unload_nls() @nls_codepage if we no longer need it. Fixes: bc962159e8e3 ("cifs: avoid race conditions with parallel reconnects") Signed-off-by: Paulo Alcantara (SUSE) Cc: Shyam Prasad N Signed-off-by: Steve French Signed-off-by: Sasha Levin --- fs/cifs/smb2pdu.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index f0b1ae0835d7..b37379b62cc7 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -144,7 +144,7 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon, struct TCP_Server_Info *server) { int rc = 0; - struct nls_table *nls_codepage; + struct nls_table *nls_codepage = NULL; struct cifs_ses *ses; /* @@ -216,8 +216,6 @@ again: tcon->ses->chans_need_reconnect, tcon->need_reconnect); - nls_codepage = load_nls_default(); - mutex_lock(&ses->session_mutex); /* * Recheck after acquire mutex. If another thread is negotiating @@ -237,6 +235,8 @@ again: } spin_unlock(&server->srv_lock); + nls_codepage = load_nls_default(); + /* * need to prevent multiple threads trying to simultaneously * reconnect the same SMB session From 5218af4ad5d8948faac19f71583bcd786c3852df Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 9 Feb 2023 09:09:52 +0800 Subject: [PATCH 029/179] xfrm: Zero padding when dumping algos and encap [ Upstream commit 8222d5910dae08213b6d9d4bc9a7f8502855e624 ] When copying data to user-space we should ensure that only valid data is copied over. Padding in structures may be filled with random (possibly sensitve) data and should never be given directly to user-space. This patch fixes the copying of xfrm algorithms and the encap template in xfrm_user so that padding is zeroed. Reported-by: syzbot+fa5414772d5c445dac3c@syzkaller.appspotmail.com Reported-by: Hyunwoo Kim Signed-off-by: Herbert Xu Reviewed-by: Sabrina Dubroca Signed-off-by: Steffen Klassert Signed-off-by: Sasha Levin --- net/xfrm/xfrm_user.c | 45 ++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 41 insertions(+), 4 deletions(-) diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index e73f9efc54c1..83f35ecacf24 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -997,7 +997,9 @@ static int copy_to_user_aead(struct xfrm_algo_aead *aead, struct sk_buff *skb) return -EMSGSIZE; ap = nla_data(nla); - memcpy(ap, aead, sizeof(*aead)); + strscpy_pad(ap->alg_name, aead->alg_name, sizeof(ap->alg_name)); + ap->alg_key_len = aead->alg_key_len; + ap->alg_icv_len = aead->alg_icv_len; if (redact_secret && aead->alg_key_len) memset(ap->alg_key, 0, (aead->alg_key_len + 7) / 8); @@ -1017,7 +1019,8 @@ static int copy_to_user_ealg(struct xfrm_algo *ealg, struct sk_buff *skb) return -EMSGSIZE; ap = nla_data(nla); - memcpy(ap, ealg, sizeof(*ealg)); + strscpy_pad(ap->alg_name, ealg->alg_name, sizeof(ap->alg_name)); + ap->alg_key_len = ealg->alg_key_len; if (redact_secret && ealg->alg_key_len) memset(ap->alg_key, 0, (ealg->alg_key_len + 7) / 8); @@ -1028,6 +1031,40 @@ static int copy_to_user_ealg(struct xfrm_algo *ealg, struct sk_buff *skb) return 0; } +static int copy_to_user_calg(struct xfrm_algo *calg, struct sk_buff *skb) +{ + struct nlattr *nla = nla_reserve(skb, XFRMA_ALG_COMP, sizeof(*calg)); + struct xfrm_algo *ap; + + if (!nla) + return -EMSGSIZE; + + ap = nla_data(nla); + strscpy_pad(ap->alg_name, calg->alg_name, sizeof(ap->alg_name)); + ap->alg_key_len = 0; + + return 0; +} + +static int copy_to_user_encap(struct xfrm_encap_tmpl *ep, struct sk_buff *skb) +{ + struct nlattr *nla = nla_reserve(skb, XFRMA_ENCAP, sizeof(*ep)); + struct xfrm_encap_tmpl *uep; + + if (!nla) + return -EMSGSIZE; + + uep = nla_data(nla); + memset(uep, 0, sizeof(*uep)); + + uep->encap_type = ep->encap_type; + uep->encap_sport = ep->encap_sport; + uep->encap_dport = ep->encap_dport; + uep->encap_oa = ep->encap_oa; + + return 0; +} + static int xfrm_smark_put(struct sk_buff *skb, struct xfrm_mark *m) { int ret = 0; @@ -1083,12 +1120,12 @@ static int copy_to_user_state_extra(struct xfrm_state *x, goto out; } if (x->calg) { - ret = nla_put(skb, XFRMA_ALG_COMP, sizeof(*(x->calg)), x->calg); + ret = copy_to_user_calg(x->calg, skb); if (ret) goto out; } if (x->encap) { - ret = nla_put(skb, XFRMA_ENCAP, sizeof(*x->encap), x->encap); + ret = copy_to_user_encap(x->encap, skb); if (ret) goto out; } From 57f9a9a232bde7abfe49c3072b29a255da9ba891 Mon Sep 17 00:00:00 2001 From: Ravulapati Vishnu Vardhan Rao Date: Sat, 4 Mar 2023 13:37:02 +0530 Subject: [PATCH 030/179] ASoC: codecs: tx-macro: Fix for KASAN: slab-out-of-bounds [ Upstream commit e5e7e398f6bb7918dab0612eb6991f7bae95520d ] When we run syzkaller we get below Out of Bound. "KASAN: slab-out-of-bounds Read in regcache_flat_read" Below is the backtrace of the issue: dump_backtrace+0x0/0x4c8 show_stack+0x34/0x44 dump_stack_lvl+0xd8/0x118 print_address_description+0x30/0x2d8 kasan_report+0x158/0x198 __asan_report_load4_noabort+0x44/0x50 regcache_flat_read+0x10c/0x110 regcache_read+0xf4/0x180 _regmap_read+0xc4/0x278 _regmap_update_bits+0x130/0x290 regmap_update_bits_base+0xc0/0x15c snd_soc_component_update_bits+0xa8/0x22c snd_soc_component_write_field+0x68/0xd4 tx_macro_digital_mute+0xec/0x140 Actually There is no need to have decimator with 32 bits. By limiting the variable with short type u8 issue is resolved. Signed-off-by: Ravulapati Vishnu Vardhan Rao Link: https://lore.kernel.org/r/20230304080702.609-1-quic_visr@quicinc.com Signed-off-by: Mark Brown Signed-off-by: Sasha Levin --- sound/soc/codecs/lpass-tx-macro.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/sound/soc/codecs/lpass-tx-macro.c b/sound/soc/codecs/lpass-tx-macro.c index 5d1c58df081a..e5611f655bed 100644 --- a/sound/soc/codecs/lpass-tx-macro.c +++ b/sound/soc/codecs/lpass-tx-macro.c @@ -241,7 +241,7 @@ enum { struct tx_mute_work { struct tx_macro *tx; - u32 decimator; + u8 decimator; struct delayed_work dwork; }; @@ -634,7 +634,7 @@ exit: return 0; } -static bool is_amic_enabled(struct snd_soc_component *component, int decimator) +static bool is_amic_enabled(struct snd_soc_component *component, u8 decimator) { u16 adc_mux_reg, adc_reg, adc_n; @@ -845,7 +845,7 @@ static int tx_macro_enable_dec(struct snd_soc_dapm_widget *w, struct snd_kcontrol *kcontrol, int event) { struct snd_soc_component *component = snd_soc_dapm_to_component(w->dapm); - unsigned int decimator; + u8 decimator; u16 tx_vol_ctl_reg, dec_cfg_reg, hpf_gate_reg, tx_gain_ctl_reg; u8 hpf_cut_off_freq; int hpf_delay = TX_MACRO_DMIC_HPF_DELAY_MS; @@ -1060,7 +1060,8 @@ static int tx_macro_hw_params(struct snd_pcm_substream *substream, struct snd_soc_dai *dai) { struct snd_soc_component *component = dai->component; - u32 decimator, sample_rate; + u32 sample_rate; + u8 decimator; int tx_fs_rate; struct tx_macro *tx = snd_soc_component_get_drvdata(component); @@ -1124,7 +1125,7 @@ static int tx_macro_digital_mute(struct snd_soc_dai *dai, int mute, int stream) { struct snd_soc_component *component = dai->component; struct tx_macro *tx = snd_soc_component_get_drvdata(component); - u16 decimator; + u8 decimator; /* active decimator not set yet */ if (tx->active_decimator[dai->id] == -1) From d0652061c3b85e498d9b666e63c49070b4c1caf5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Amadeusz=20S=C5=82awi=C5=84ski?= Date: Fri, 3 Mar 2023 14:48:50 +0100 Subject: [PATCH 031/179] ASoC: Intel: avs: max98357a: Explicitly define codec format MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit d16c893425d07ada1fdd817ec06d322efcf69480 ] max98357a is speaker codec configured in 48000/2/S16_LE format regardless of front end format, so force it to be so. Reviewed-by: Cezary Rojewski Signed-off-by: Amadeusz Sławiński Link: https://lore.kernel.org/r/20230303134854.2277146-2-amadeuszx.slawinski@linux.intel.com Signed-off-by: Mark Brown Signed-off-by: Sasha Levin --- sound/soc/intel/avs/boards/max98357a.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/sound/soc/intel/avs/boards/max98357a.c b/sound/soc/intel/avs/boards/max98357a.c index 921f42caf7e0..183123d08c5a 100644 --- a/sound/soc/intel/avs/boards/max98357a.c +++ b/sound/soc/intel/avs/boards/max98357a.c @@ -8,6 +8,7 @@ #include #include +#include #include #include #include @@ -24,6 +25,26 @@ static const struct snd_soc_dapm_route card_base_routes[] = { { "Spk", NULL, "Speaker" }, }; +static int +avs_max98357a_be_fixup(struct snd_soc_pcm_runtime *runrime, struct snd_pcm_hw_params *params) +{ + struct snd_interval *rate, *channels; + struct snd_mask *fmt; + + rate = hw_param_interval(params, SNDRV_PCM_HW_PARAM_RATE); + channels = hw_param_interval(params, SNDRV_PCM_HW_PARAM_CHANNELS); + fmt = hw_param_mask(params, SNDRV_PCM_HW_PARAM_FORMAT); + + /* The ADSP will convert the FE rate to 48k, stereo */ + rate->min = rate->max = 48000; + channels->min = channels->max = 2; + + /* set SSP0 to 16 bit */ + snd_mask_none(fmt); + snd_mask_set_format(fmt, SNDRV_PCM_FORMAT_S16_LE); + return 0; +} + static int avs_create_dai_link(struct device *dev, const char *platform_name, int ssp_port, struct snd_soc_dai_link **dai_link) { @@ -55,6 +76,7 @@ static int avs_create_dai_link(struct device *dev, const char *platform_name, in dl->num_platforms = 1; dl->id = 0; dl->dai_fmt = SND_SOC_DAIFMT_I2S | SND_SOC_DAIFMT_NB_NF | SND_SOC_DAIFMT_CBS_CFS; + dl->be_hw_params_fixup = avs_max98357a_be_fixup; dl->nonatomic = 1; dl->no_pcm = 1; dl->dpcm_playback = 1; From ea7950db76506cf3d0a20898b2de82c29340bccb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Amadeusz=20S=C5=82awi=C5=84ski?= Date: Fri, 3 Mar 2023 14:48:51 +0100 Subject: [PATCH 032/179] ASoC: Intel: avs: da7219: Explicitly define codec format MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 61f368624fe4d0c25c6e9c917574b8ace51d776e ] da7219 is headset codec configured in 48000/2/S24_LE format regardless of front end format, so force it to be so. Reviewed-by: Cezary Rojewski Signed-off-by: Amadeusz Sławiński Link: https://lore.kernel.org/r/20230303134854.2277146-3-amadeuszx.slawinski@linux.intel.com Signed-off-by: Mark Brown Signed-off-by: Sasha Levin --- sound/soc/intel/avs/boards/da7219.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/sound/soc/intel/avs/boards/da7219.c b/sound/soc/intel/avs/boards/da7219.c index 02ae542ad779..a63563594b4c 100644 --- a/sound/soc/intel/avs/boards/da7219.c +++ b/sound/soc/intel/avs/boards/da7219.c @@ -111,6 +111,26 @@ static int avs_da7219_codec_init(struct snd_soc_pcm_runtime *runtime) return 0; } +static int +avs_da7219_be_fixup(struct snd_soc_pcm_runtime *runrime, struct snd_pcm_hw_params *params) +{ + struct snd_interval *rate, *channels; + struct snd_mask *fmt; + + rate = hw_param_interval(params, SNDRV_PCM_HW_PARAM_RATE); + channels = hw_param_interval(params, SNDRV_PCM_HW_PARAM_CHANNELS); + fmt = hw_param_mask(params, SNDRV_PCM_HW_PARAM_FORMAT); + + /* The ADSP will convert the FE rate to 48k, stereo */ + rate->min = rate->max = 48000; + channels->min = channels->max = 2; + + /* set SSP0 to 24 bit */ + snd_mask_none(fmt); + snd_mask_set_format(fmt, SNDRV_PCM_FORMAT_S24_LE); + return 0; +} + static int avs_create_dai_link(struct device *dev, const char *platform_name, int ssp_port, struct snd_soc_dai_link **dai_link) { @@ -142,6 +162,7 @@ static int avs_create_dai_link(struct device *dev, const char *platform_name, in dl->num_platforms = 1; dl->id = 0; dl->dai_fmt = SND_SOC_DAIFMT_I2S | SND_SOC_DAIFMT_NB_NF | SND_SOC_DAIFMT_CBS_CFS; + dl->be_hw_params_fixup = avs_da7219_be_fixup; dl->init = avs_da7219_codec_init; dl->nonatomic = 1; dl->no_pcm = 1; From 30878a7eec00d78be07cd94ea8dac98d1fb8203f Mon Sep 17 00:00:00 2001 From: Cezary Rojewski Date: Fri, 3 Mar 2023 14:48:53 +0100 Subject: [PATCH 033/179] ASoC: Intel: avs: ssm4567: Remove nau8825 bits MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 933de2d127281731166cf2880fa1e23c5a0f7faa ] Some of the nau8825 clock control got into the ssm4567, remove it. Signed-off-by: Cezary Rojewski Signed-off-by: Amadeusz Sławiński Link: https://lore.kernel.org/r/20230303134854.2277146-5-amadeuszx.slawinski@linux.intel.com Signed-off-by: Mark Brown Signed-off-by: Sasha Levin --- sound/soc/intel/avs/boards/ssm4567.c | 31 ---------------------------- 1 file changed, 31 deletions(-) diff --git a/sound/soc/intel/avs/boards/ssm4567.c b/sound/soc/intel/avs/boards/ssm4567.c index 9f84c8ab3447..51a8867326b4 100644 --- a/sound/soc/intel/avs/boards/ssm4567.c +++ b/sound/soc/intel/avs/boards/ssm4567.c @@ -15,7 +15,6 @@ #include #include "../../../codecs/nau8825.h" -#define SKL_NUVOTON_CODEC_DAI "nau8825-hifi" #define SKL_SSM_CODEC_DAI "ssm4567-hifi" static struct snd_soc_codec_conf card_codec_conf[] = { @@ -34,41 +33,11 @@ static const struct snd_kcontrol_new card_controls[] = { SOC_DAPM_PIN_SWITCH("Right Speaker"), }; -static int -platform_clock_control(struct snd_soc_dapm_widget *w, struct snd_kcontrol *control, int event) -{ - struct snd_soc_dapm_context *dapm = w->dapm; - struct snd_soc_card *card = dapm->card; - struct snd_soc_dai *codec_dai; - int ret; - - codec_dai = snd_soc_card_get_codec_dai(card, SKL_NUVOTON_CODEC_DAI); - if (!codec_dai) { - dev_err(card->dev, "Codec dai not found\n"); - return -EINVAL; - } - - if (SND_SOC_DAPM_EVENT_ON(event)) { - ret = snd_soc_dai_set_sysclk(codec_dai, NAU8825_CLK_MCLK, 24000000, - SND_SOC_CLOCK_IN); - if (ret < 0) - dev_err(card->dev, "set sysclk err = %d\n", ret); - } else { - ret = snd_soc_dai_set_sysclk(codec_dai, NAU8825_CLK_INTERNAL, 0, SND_SOC_CLOCK_IN); - if (ret < 0) - dev_err(card->dev, "set sysclk err = %d\n", ret); - } - - return ret; -} - static const struct snd_soc_dapm_widget card_widgets[] = { SND_SOC_DAPM_SPK("Left Speaker", NULL), SND_SOC_DAPM_SPK("Right Speaker", NULL), SND_SOC_DAPM_SPK("DP1", NULL), SND_SOC_DAPM_SPK("DP2", NULL), - SND_SOC_DAPM_SUPPLY("Platform Clock", SND_SOC_NOPM, 0, 0, platform_clock_control, - SND_SOC_DAPM_PRE_PMU | SND_SOC_DAPM_POST_PMD), }; static const struct snd_soc_dapm_route card_base_routes[] = { From 6beb32105eeafc399e684822d07674811e8da40c Mon Sep 17 00:00:00 2001 From: Cezary Rojewski Date: Fri, 3 Mar 2023 14:48:54 +0100 Subject: [PATCH 034/179] ASoC: Intel: avs: nau8825: Adjust clock control MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 6206b2e787da2ed567922c37bb588a44f6fb6705 ] Internal clock shall be adjusted also in cases when DAPM event other than 'ON' is triggered. Signed-off-by: Cezary Rojewski Signed-off-by: Amadeusz Sławiński Link: https://lore.kernel.org/r/20230303134854.2277146-6-amadeuszx.slawinski@linux.intel.com Signed-off-by: Mark Brown Signed-off-by: Sasha Levin --- sound/soc/intel/avs/boards/nau8825.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sound/soc/intel/avs/boards/nau8825.c b/sound/soc/intel/avs/boards/nau8825.c index f76909e9f990..8392d8fac8f9 100644 --- a/sound/soc/intel/avs/boards/nau8825.c +++ b/sound/soc/intel/avs/boards/nau8825.c @@ -33,15 +33,15 @@ avs_nau8825_clock_control(struct snd_soc_dapm_widget *w, struct snd_kcontrol *co return -EINVAL; } - if (!SND_SOC_DAPM_EVENT_ON(event)) { + if (SND_SOC_DAPM_EVENT_ON(event)) + ret = snd_soc_dai_set_sysclk(codec_dai, NAU8825_CLK_MCLK, 24000000, + SND_SOC_CLOCK_IN); + else ret = snd_soc_dai_set_sysclk(codec_dai, NAU8825_CLK_INTERNAL, 0, SND_SOC_CLOCK_IN); - if (ret < 0) { - dev_err(card->dev, "set sysclk err = %d\n", ret); - return ret; - } - } + if (ret < 0) + dev_err(card->dev, "Set sysclk failed: %d\n", ret); - return 0; + return ret; } static const struct snd_kcontrol_new card_controls[] = { From 78b342f0cda74f20cea51f0f1e1e9565e3054ffc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jonathan=20Neusch=C3=A4fer?= Date: Sun, 29 Jan 2023 14:14:36 +0100 Subject: [PATCH 035/179] zstd: Fix definition of assert() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 6906598f1ce93761716d780b6e3f171e13f0f4ce ] assert(x) should emit a warning if x is false. WARN_ON(x) emits a warning if x is true. Thus, assert(x) should be defined as WARN_ON(!x) rather than WARN_ON(x). Signed-off-by: Jonathan Neuschäfer Signed-off-by: Nick Terrell Signed-off-by: Sasha Levin --- lib/zstd/common/zstd_deps.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/zstd/common/zstd_deps.h b/lib/zstd/common/zstd_deps.h index 7a5bf44839c9..f06df065dec0 100644 --- a/lib/zstd/common/zstd_deps.h +++ b/lib/zstd/common/zstd_deps.h @@ -84,7 +84,7 @@ static uint64_t ZSTD_div64(uint64_t dividend, uint32_t divisor) { #include -#define assert(x) WARN_ON((x)) +#define assert(x) WARN_ON(!(x)) #endif /* ZSTD_DEPS_ASSERT */ #endif /* ZSTD_DEPS_NEED_ASSERT */ From ab7a700fec4053412ca7dd4ae70c31d781219544 Mon Sep 17 00:00:00 2001 From: "Chia-Lin Kao (AceLan)" Date: Thu, 2 Mar 2023 17:33:00 +0800 Subject: [PATCH 036/179] ACPI: video: Add backlight=native DMI quirk for Dell Vostro 15 3535 [ Upstream commit 89b0411481967a2e8c91190a211a359966cfcf4b ] Sometimes the system boots up with a acpi_video0 backlight interface which doesn't work. So add Dell Vostro 15 3535 into the video_detect_dmi_table to set it to native explicitly. Signed-off-by: Chia-Lin Kao (AceLan) Signed-off-by: Rafael J. Wysocki Signed-off-by: Sasha Levin --- drivers/acpi/video_detect.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/acpi/video_detect.c b/drivers/acpi/video_detect.c index 7f0ed845cd6a..f06b3d355671 100644 --- a/drivers/acpi/video_detect.c +++ b/drivers/acpi/video_detect.c @@ -714,6 +714,13 @@ static const struct dmi_system_id video_detect_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "Dell G15 5515"), }, }, + { + .callback = video_detect_force_native, + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Vostro 15 3535"), + }, + }, /* * Desktops which falsely report a backlight and which our heuristics From a56be07a8c0c5229274c0d1d9efd9fbde2d1c84e Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Tue, 7 Mar 2023 13:49:17 +0200 Subject: [PATCH 037/179] ASoC: SOF: ipc3: Check for upper size limit for the received message [ Upstream commit 989a3e4479177d0f4afab8be1960731bc0ffbbd0 ] The sof_ipc3_rx_msg() checks for minimum size of a new rx message but it is missing the check for upper limit. Corrupted or compromised firmware might be able to take advantage of this to cause out of bounds reads outside of the message area. Reported-by: Curtis Malainey Signed-off-by: Peter Ujfalusi Reviewed-by: Pierre-Louis Bossart Reviewed-by: Curtis Malainey Signed-off-by: Peter Ujfalusi Link: https://lore.kernel.org/r/20230307114917.5124-1-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown Signed-off-by: Sasha Levin --- sound/soc/sof/ipc3.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sound/soc/sof/ipc3.c b/sound/soc/sof/ipc3.c index b28af3a48b70..60b96b0c2412 100644 --- a/sound/soc/sof/ipc3.c +++ b/sound/soc/sof/ipc3.c @@ -970,8 +970,9 @@ static void sof_ipc3_rx_msg(struct snd_sof_dev *sdev) return; } - if (hdr.size < sizeof(hdr)) { - dev_err(sdev->dev, "The received message size is invalid\n"); + if (hdr.size < sizeof(hdr) || hdr.size > SOF_IPC_MSG_MAX_SIZE) { + dev_err(sdev->dev, "The received message size is invalid: %u\n", + hdr.size); return; } From 6290404aa14287f343bbffe287709e05e0cf09a0 Mon Sep 17 00:00:00 2001 From: Seppo Ingalsuo Date: Tue, 7 Mar 2023 13:07:51 +0200 Subject: [PATCH 038/179] ASoC: SOF: ipc4-topology: Fix incorrect sample rate print unit [ Upstream commit 9e269e3aa9006440de639597079ee7140ef5b5f3 ] This patch fixes the sample rate print unit from KHz to Hz. E.g. 48000KHz becomes 48000Hz. Signed-off-by: Seppo Ingalsuo Reviewed-by: Pierre-Louis Bossart Reviewed-by: Ranjani Sridharan Signed-off-by: Peter Ujfalusi Link: https://lore.kernel.org/r/20230307110751.2053-1-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown Signed-off-by: Sasha Levin --- sound/soc/sof/ipc4-topology.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/sof/ipc4-topology.c b/sound/soc/sof/ipc4-topology.c index a81af5f73a4b..41617569f50f 100644 --- a/sound/soc/sof/ipc4-topology.c +++ b/sound/soc/sof/ipc4-topology.c @@ -154,7 +154,7 @@ static void sof_ipc4_dbg_audio_format(struct device *dev, for (i = 0; i < num_format; i++, ptr = (u8 *)ptr + object_size) { fmt = ptr; dev_dbg(dev, - " #%d: %uKHz, %ubit (ch_map %#x ch_cfg %u interleaving_style %u fmt_cfg %#x)\n", + " #%d: %uHz, %ubit (ch_map %#x ch_cfg %u interleaving_style %u fmt_cfg %#x)\n", i, fmt->sampling_frequency, fmt->bit_depth, fmt->ch_map, fmt->ch_cfg, fmt->interleaving_style, fmt->fmt_cfg); } From 6d0ab3efb282191025d2583b4a5734cb54abc94f Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Tue, 7 Mar 2023 11:53:41 +0200 Subject: [PATCH 039/179] ASoC: SOF: Intel: pci-tng: revert invalid bar size setting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit ca09e2a351fbc7836ba9418304ff0c3e72addfe0 ] The logic for the ioremap is to find the resource index 3 (IRAM) and infer the BAR address by subtracting the IRAM offset. The BAR size defined in hardware specifications is 2MB. The commit 5947b2726beb6 ("ASoC: SOF: Intel: Check the bar size before remapping") tried to find the BAR size by querying the resource length instead of a pre-canned value, but by requesting the size for index 3 it only gets the size of the IRAM. That's obviously wrong and prevents the probe from proceeding. This commit attempted to fix an issue in a fuzzing/simulated environment but created another on actual devices, so the best course of action is to revert that change. Reported-by: Ferry Toth Tested-by: Ferry Toth (Intel Edison-Arduino) Link: https://github.com/thesofproject/linux/issues/3901 Signed-off-by: Pierre-Louis Bossart Reviewed-by: Péter Ujfalusi Reviewed-by: Ranjani Sridharan Signed-off-by: Peter Ujfalusi Link: https://lore.kernel.org/r/20230307095341.3222-1-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown Signed-off-by: Sasha Levin --- sound/soc/sof/intel/pci-tng.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/sound/soc/sof/intel/pci-tng.c b/sound/soc/sof/intel/pci-tng.c index f0f6d9ba8803..0b17d1bb225e 100644 --- a/sound/soc/sof/intel/pci-tng.c +++ b/sound/soc/sof/intel/pci-tng.c @@ -75,11 +75,7 @@ static int tangier_pci_probe(struct snd_sof_dev *sdev) /* LPE base */ base = pci_resource_start(pci, desc->resindex_lpe_base) - IRAM_OFFSET; - size = pci_resource_len(pci, desc->resindex_lpe_base); - if (size < PCI_BAR_SIZE) { - dev_err(sdev->dev, "error: I/O region is too small.\n"); - return -ENODEV; - } + size = PCI_BAR_SIZE; dev_dbg(sdev->dev, "LPE PHY base at 0x%x size 0x%x", base, size); sdev->bar[DSP_BAR] = devm_ioremap(sdev->dev, base, size); From 50f6507aadf81b1bef255ab7e794b8e4758ebb70 Mon Sep 17 00:00:00 2001 From: Rander Wang Date: Tue, 7 Mar 2023 13:06:56 +0200 Subject: [PATCH 040/179] ASoC: SOF: IPC4: update gain ipc msg definition to align with fw MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit e45cd86c3a78bfb9875a5eb8ab5dab459b59bbe2 ] Recent firmware changes modified the curve duration from 32 to 64 bits, which breaks volume ramps. A simple solution would be to change the definition, but unfortunately the ASoC topology framework only supports up to 32 bit tokens. This patch suggests breaking the 64 bit value in low and high parts, with only the low-part extracted from topology and high-part only zeroes. Since the curve duration is represented in hundred of nanoseconds, we can still represent a 400s ramp, which is just fine. The defacto ABI change has no effect on existing users since the IPC4 firmware has not been released just yet. Link: https://github.com/thesofproject/linux/issues/4026 Signed-off-by: Rander Wang Reviewed-by: Ranjani Sridharan Reviewed-by: Pierre-Louis Bossart Reviewed-by: Bard Liao Reviewed-by: Péter Ujfalusi Signed-off-by: Peter Ujfalusi Link: https://lore.kernel.org/r/20230307110656.1816-1-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown Signed-off-by: Sasha Levin --- sound/soc/sof/ipc4-control.c | 3 ++- sound/soc/sof/ipc4-topology.c | 4 ++-- sound/soc/sof/ipc4-topology.h | 6 ++++-- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/sound/soc/sof/ipc4-control.c b/sound/soc/sof/ipc4-control.c index 0d5a578c3496..7442ec1c5a4d 100644 --- a/sound/soc/sof/ipc4-control.c +++ b/sound/soc/sof/ipc4-control.c @@ -84,7 +84,8 @@ sof_ipc4_set_volume_data(struct snd_sof_dev *sdev, struct snd_sof_widget *swidge } /* set curve type and duration from topology */ - data.curve_duration = gain->data.curve_duration; + data.curve_duration_l = gain->data.curve_duration_l; + data.curve_duration_h = gain->data.curve_duration_h; data.curve_type = gain->data.curve_type; msg->data_ptr = &data; diff --git a/sound/soc/sof/ipc4-topology.c b/sound/soc/sof/ipc4-topology.c index 41617569f50f..49289932ba7e 100644 --- a/sound/soc/sof/ipc4-topology.c +++ b/sound/soc/sof/ipc4-topology.c @@ -106,7 +106,7 @@ static const struct sof_topology_token gain_tokens[] = { get_token_u32, offsetof(struct sof_ipc4_gain_data, curve_type)}, {SOF_TKN_GAIN_RAMP_DURATION, SND_SOC_TPLG_TUPLE_TYPE_WORD, get_token_u32, - offsetof(struct sof_ipc4_gain_data, curve_duration)}, + offsetof(struct sof_ipc4_gain_data, curve_duration_l)}, {SOF_TKN_GAIN_VAL, SND_SOC_TPLG_TUPLE_TYPE_WORD, get_token_u32, offsetof(struct sof_ipc4_gain_data, init_val)}, }; @@ -682,7 +682,7 @@ static int sof_ipc4_widget_setup_comp_pga(struct snd_sof_widget *swidget) dev_dbg(scomp->dev, "pga widget %s: ramp type: %d, ramp duration %d, initial gain value: %#x, cpc %d\n", - swidget->widget->name, gain->data.curve_type, gain->data.curve_duration, + swidget->widget->name, gain->data.curve_type, gain->data.curve_duration_l, gain->data.init_val, gain->base_config.cpc); ret = sof_ipc4_widget_setup_msg(swidget, &gain->msg); diff --git a/sound/soc/sof/ipc4-topology.h b/sound/soc/sof/ipc4-topology.h index 2363a7cc0b57..cf9d27852457 100644 --- a/sound/soc/sof/ipc4-topology.h +++ b/sound/soc/sof/ipc4-topology.h @@ -217,14 +217,16 @@ struct sof_ipc4_control_data { * @init_val: Initial value * @curve_type: Curve type * @reserved: reserved for future use - * @curve_duration: Curve duration + * @curve_duration_l: Curve duration low part + * @curve_duration_h: Curve duration high part */ struct sof_ipc4_gain_data { uint32_t channels; uint32_t init_val; uint32_t curve_type; uint32_t reserved; - uint32_t curve_duration; + uint32_t curve_duration_l; + uint32_t curve_duration_h; } __aligned(8); /** From df0833da4bfac69739b849b3dff4838e590542e5 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 6 Mar 2023 09:36:25 +1100 Subject: [PATCH 041/179] md: avoid signed overflow in slot_store() [ Upstream commit 3bc57292278a0b6ac4656cad94c14f2453344b57 ] slot_store() uses kstrtouint() to get a slot number, but stores the result in an "int" variable (by casting a pointer). This can result in a negative slot number if the unsigned int value is very large. A negative number means that the slot is empty, but setting a negative slot number this way will not remove the device from the array. I don't think this is a serious problem, but it could cause confusion and it is best to fix it. Reported-by: Dan Carpenter Signed-off-by: NeilBrown Signed-off-by: Song Liu Signed-off-by: Sasha Levin --- drivers/md/md.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/md/md.c b/drivers/md/md.c index 0368b3c51c7f..d5c362b1602b 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -3152,6 +3152,9 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len) err = kstrtouint(buf, 10, (unsigned int *)&slot); if (err < 0) return err; + if (slot < 0) + /* overflow */ + return -ENOSPC; } if (rdev->mddev->pers && slot == -1) { /* Setting 'slot' on an active array requires also From 3e48f7b53de41f5be2fe7d92b50f33d0f7ec7f91 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 13 Mar 2023 15:45:48 +0100 Subject: [PATCH 042/179] x86/PVH: obtain VGA console info in Dom0 [ Upstream commit 934ef33ee75c3846f605f18b65048acd147e3918 ] A new platform-op was added to Xen to allow obtaining the same VGA console information PV Dom0 is handed. Invoke the new function and have the output data processed by xen_init_vga(). Signed-off-by: Jan Beulich Reviewed-by: Juergen Gross Link: https://lore.kernel.org/r/8f315e92-7bda-c124-71cc-478ab9c5e610@suse.com Signed-off-by: Juergen Gross Signed-off-by: Sasha Levin --- arch/x86/xen/Makefile | 2 +- arch/x86/xen/enlighten_pv.c | 3 ++- arch/x86/xen/enlighten_pvh.c | 13 +++++++++++++ arch/x86/xen/vga.c | 5 ++--- arch/x86/xen/xen-ops.h | 7 ++++--- include/xen/interface/platform.h | 3 +++ 6 files changed, 25 insertions(+), 8 deletions(-) diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 3c5b52fbe4a7..a9ec8c9f5c5d 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -45,6 +45,6 @@ obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o -obj-$(CONFIG_XEN_PV_DOM0) += vga.o +obj-$(CONFIG_XEN_DOM0) += vga.o obj-$(CONFIG_XEN_EFI) += efi.o diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 8944726255c9..333539bdbdaa 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -1389,7 +1389,8 @@ asmlinkage __visible void __init xen_start_kernel(struct start_info *si) x86_platform.set_legacy_features = xen_dom0_set_legacy_features; - xen_init_vga(info, xen_start_info->console.dom0.info_size); + xen_init_vga(info, xen_start_info->console.dom0.info_size, + &boot_params.screen_info); xen_start_info->console.domU.mfn = 0; xen_start_info->console.domU.evtchn = 0; diff --git a/arch/x86/xen/enlighten_pvh.c b/arch/x86/xen/enlighten_pvh.c index bcae606bbc5c..1da44aca896c 100644 --- a/arch/x86/xen/enlighten_pvh.c +++ b/arch/x86/xen/enlighten_pvh.c @@ -43,6 +43,19 @@ void __init xen_pvh_init(struct boot_params *boot_params) x86_init.oem.banner = xen_banner; xen_efi_init(boot_params); + + if (xen_initial_domain()) { + struct xen_platform_op op = { + .cmd = XENPF_get_dom0_console, + }; + long ret = HYPERVISOR_platform_op(&op); + + if (ret > 0) + xen_init_vga(&op.u.dom0_console, + min(ret * sizeof(char), + sizeof(op.u.dom0_console)), + &boot_params->screen_info); + } } void __init mem_map_via_hcall(struct boot_params *boot_params_p) diff --git a/arch/x86/xen/vga.c b/arch/x86/xen/vga.c index 14ea32e734d5..d97adab8420f 100644 --- a/arch/x86/xen/vga.c +++ b/arch/x86/xen/vga.c @@ -9,10 +9,9 @@ #include "xen-ops.h" -void __init xen_init_vga(const struct dom0_vga_console_info *info, size_t size) +void __init xen_init_vga(const struct dom0_vga_console_info *info, size_t size, + struct screen_info *screen_info) { - struct screen_info *screen_info = &boot_params.screen_info; - /* This is drawn from a dump from vgacon:startup in * standard Linux. */ screen_info->orig_video_mode = 3; diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 9a8bb972193d..a10903785a33 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -108,11 +108,12 @@ static inline void xen_uninit_lock_cpu(int cpu) struct dom0_vga_console_info; -#ifdef CONFIG_XEN_PV_DOM0 -void __init xen_init_vga(const struct dom0_vga_console_info *, size_t size); +#ifdef CONFIG_XEN_DOM0 +void __init xen_init_vga(const struct dom0_vga_console_info *, size_t size, + struct screen_info *); #else static inline void __init xen_init_vga(const struct dom0_vga_console_info *info, - size_t size) + size_t size, struct screen_info *si) { } #endif diff --git a/include/xen/interface/platform.h b/include/xen/interface/platform.h index 655d92e803e1..79a443c65ea9 100644 --- a/include/xen/interface/platform.h +++ b/include/xen/interface/platform.h @@ -483,6 +483,8 @@ struct xenpf_symdata { }; DEFINE_GUEST_HANDLE_STRUCT(xenpf_symdata); +#define XENPF_get_dom0_console 64 + struct xen_platform_op { uint32_t cmd; uint32_t interface_version; /* XENPF_INTERFACE_VERSION */ @@ -506,6 +508,7 @@ struct xen_platform_op { struct xenpf_mem_hotadd mem_add; struct xenpf_core_parking core_parking; struct xenpf_symdata symdata; + struct dom0_vga_console_info dom0_console; uint8_t pad[128]; } u; }; From b61b21bcbb8a379269de7d2e3e9ceb42e5368ec4 Mon Sep 17 00:00:00 2001 From: Xiaogang Chen Date: Wed, 1 Mar 2023 10:21:06 -0600 Subject: [PATCH 043/179] drm/amdkfd: Fix BO offset for multi-VMA page migration [ Upstream commit b4ee9606378bb9520c94d8b96f0305c3696f5c29 ] svm_migrate_ram_to_vram migrates a prange from sys ram to vram. The prange may cross multiple vma. Need remember current dst vram offset in the TTM resource for each migration. v2: squash in warning fix (Alex) Signed-off-by: Xiaogang Chen Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher Signed-off-by: Sasha Levin --- drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c index 22b077ac9a19..fad500dd224d 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c @@ -295,7 +295,7 @@ static unsigned long svm_migrate_unsuccessful_pages(struct migrate_vma *migrate) static int svm_migrate_copy_to_vram(struct amdgpu_device *adev, struct svm_range *prange, struct migrate_vma *migrate, struct dma_fence **mfence, - dma_addr_t *scratch) + dma_addr_t *scratch, uint64_t ttm_res_offset) { uint64_t npages = migrate->npages; struct device *dev = adev->dev; @@ -305,8 +305,8 @@ svm_migrate_copy_to_vram(struct amdgpu_device *adev, struct svm_range *prange, uint64_t i, j; int r; - pr_debug("svms 0x%p [0x%lx 0x%lx]\n", prange->svms, prange->start, - prange->last); + pr_debug("svms 0x%p [0x%lx 0x%lx 0x%llx]\n", prange->svms, prange->start, + prange->last, ttm_res_offset); src = scratch; dst = (uint64_t *)(scratch + npages); @@ -317,7 +317,7 @@ svm_migrate_copy_to_vram(struct amdgpu_device *adev, struct svm_range *prange, goto out; } - amdgpu_res_first(prange->ttm_res, prange->offset << PAGE_SHIFT, + amdgpu_res_first(prange->ttm_res, ttm_res_offset, npages << PAGE_SHIFT, &cursor); for (i = j = 0; i < npages; i++) { struct page *spage; @@ -404,7 +404,7 @@ out: static long svm_migrate_vma_to_vram(struct amdgpu_device *adev, struct svm_range *prange, struct vm_area_struct *vma, uint64_t start, - uint64_t end, uint32_t trigger) + uint64_t end, uint32_t trigger, uint64_t ttm_res_offset) { struct kfd_process *p = container_of(prange->svms, struct kfd_process, svms); uint64_t npages = (end - start) >> PAGE_SHIFT; @@ -457,7 +457,7 @@ svm_migrate_vma_to_vram(struct amdgpu_device *adev, struct svm_range *prange, else pr_debug("0x%lx pages migrated\n", cpages); - r = svm_migrate_copy_to_vram(adev, prange, &migrate, &mfence, scratch); + r = svm_migrate_copy_to_vram(adev, prange, &migrate, &mfence, scratch, ttm_res_offset); migrate_vma_pages(&migrate); pr_debug("successful/cpages/npages 0x%lx/0x%lx/0x%lx\n", @@ -505,6 +505,7 @@ svm_migrate_ram_to_vram(struct svm_range *prange, uint32_t best_loc, unsigned long addr, start, end; struct vm_area_struct *vma; struct amdgpu_device *adev; + uint64_t ttm_res_offset; unsigned long cpages = 0; long r = 0; @@ -525,6 +526,7 @@ svm_migrate_ram_to_vram(struct svm_range *prange, uint32_t best_loc, start = prange->start << PAGE_SHIFT; end = (prange->last + 1) << PAGE_SHIFT; + ttm_res_offset = prange->offset << PAGE_SHIFT; for (addr = start; addr < end;) { unsigned long next; @@ -534,13 +536,14 @@ svm_migrate_ram_to_vram(struct svm_range *prange, uint32_t best_loc, break; next = min(vma->vm_end, end); - r = svm_migrate_vma_to_vram(adev, prange, vma, addr, next, trigger); + r = svm_migrate_vma_to_vram(adev, prange, vma, addr, next, trigger, ttm_res_offset); if (r < 0) { pr_debug("failed %ld to migrate\n", r); break; } else { cpages += r; } + ttm_res_offset += next - addr; addr = next; } From b861f0e62ae148eb5f0a48c4d6fcd5dc03b4e252 Mon Sep 17 00:00:00 2001 From: Chia-I Wu Date: Tue, 7 Mar 2023 16:19:02 -0800 Subject: [PATCH 044/179] drm/amdkfd: fix a potential double free in pqm_create_queue [ Upstream commit b2ca5c5d416b4e72d1e9d0293fc720e2d525fd42 ] Set *q to NULL on errors, otherwise pqm_create_queue would free it again. Signed-off-by: Chia-I Wu Signed-off-by: Felix Kuehling Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher Signed-off-by: Sasha Levin --- drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c index 5137476ec18e..4236539d9f93 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c @@ -218,8 +218,8 @@ static int init_user_queue(struct process_queue_manager *pqm, return 0; cleanup: - if (dev->shared_resources.enable_mes) - uninit_queue(*q); + uninit_queue(*q); + *q = NULL; return retval; } From 5045360f3bb62ccd4f87202e33489f71f8bbc3fc Mon Sep 17 00:00:00 2001 From: Chia-I Wu Date: Wed, 8 Mar 2023 13:37:24 -0800 Subject: [PATCH 045/179] drm/amdkfd: fix potential kgd_mem UAFs [ Upstream commit 9da050b0d9e04439d225a2ec3044af70cdfb3933 ] kgd_mem pointers returned by kfd_process_device_translate_handle are only guaranteed to be valid while p->mutex is held. As soon as the mutex is unlocked, another thread can free the BO. Signed-off-by: Chia-I Wu Signed-off-by: Felix Kuehling Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher Signed-off-by: Sasha Levin --- drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index f79b8e964140..e191d38f3da6 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -1298,14 +1298,14 @@ static int kfd_ioctl_map_memory_to_gpu(struct file *filep, args->n_success = i+1; } - mutex_unlock(&p->mutex); - err = amdgpu_amdkfd_gpuvm_sync_memory(dev->adev, (struct kgd_mem *) mem, true); if (err) { pr_debug("Sync memory failed, wait interrupted by user signal\n"); goto sync_memory_failed; } + mutex_unlock(&p->mutex); + /* Flush TLBs after waiting for the page table updates to complete */ for (i = 0; i < args->n_devices; i++) { peer_pdd = kfd_process_device_data_by_id(p, devices_arr[i]); @@ -1321,9 +1321,9 @@ get_process_device_data_failed: bind_process_to_device_failed: get_mem_obj_from_handle_failed: map_memory_to_gpu_failed: +sync_memory_failed: mutex_unlock(&p->mutex); copy_from_user_failed: -sync_memory_failed: kfree(devices_arr); return err; @@ -1337,6 +1337,7 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep, void *mem; long err = 0; uint32_t *devices_arr = NULL, i; + bool flush_tlb; if (!args->n_devices) { pr_debug("Device IDs array empty\n"); @@ -1389,16 +1390,19 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep, } args->n_success = i+1; } - mutex_unlock(&p->mutex); - if (kfd_flush_tlb_after_unmap(pdd->dev)) { + flush_tlb = kfd_flush_tlb_after_unmap(pdd->dev); + if (flush_tlb) { err = amdgpu_amdkfd_gpuvm_sync_memory(pdd->dev->adev, (struct kgd_mem *) mem, true); if (err) { pr_debug("Sync memory failed, wait interrupted by user signal\n"); goto sync_memory_failed; } + } + mutex_unlock(&p->mutex); + if (flush_tlb) { /* Flush TLBs after waiting for the page table updates to complete */ for (i = 0; i < args->n_devices; i++) { peer_pdd = kfd_process_device_data_by_id(p, devices_arr[i]); @@ -1414,9 +1418,9 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep, bind_process_to_device_failed: get_mem_obj_from_handle_failed: unmap_memory_from_gpu_failed: +sync_memory_failed: mutex_unlock(&p->mutex); copy_from_user_failed: -sync_memory_failed: kfree(devices_arr); return err; } From 8b4dc07eb938d682717acfad1b9d2448c5a11a83 Mon Sep 17 00:00:00 2001 From: Kristian Overskeid Date: Tue, 7 Mar 2023 14:32:29 +0100 Subject: [PATCH 046/179] net: hsr: Don't log netdev_err message on unknown prp dst node [ Upstream commit 28e8cabe80f3e6e3c98121576eda898eeb20f1b1 ] If no frames has been exchanged with a node for HSR_NODE_FORGET_TIME, the node will be deleted from the node_db list. If a frame is sent to the node after it is deleted, a netdev_err message for each slave interface is produced. This should not happen with dan nodes because of supervision frames, but can happen often with san nodes, which clutters the kernel log. Since the hsr protocol does not support sans, this is only relevant for the prp protocol. Signed-off-by: Kristian Overskeid Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- net/hsr/hsr_framereg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c index 39a6088080e9..bd0afb899117 100644 --- a/net/hsr/hsr_framereg.c +++ b/net/hsr/hsr_framereg.c @@ -422,7 +422,7 @@ void hsr_addr_subst_dest(struct hsr_node *node_src, struct sk_buff *skb, node_dst = find_node_by_addr_A(&port->hsr->node_db, eth_hdr(skb)->h_dest); if (!node_dst) { - if (net_ratelimit()) + if (net_ratelimit() && port->hsr->prot_version != PRP_V1) netdev_err(skb->dev, "%s: Unknown node\n", __func__); return; } From 69280e8e669c79700346c1aa02bbb5f895b55208 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Mon, 13 Mar 2023 00:49:24 +0000 Subject: [PATCH 047/179] ALSA: asihpi: check pao in control_message() [ Upstream commit 9026c0bf233db53b86f74f4c620715e94eb32a09 ] control_message() might be called with pao = NULL. Here indicates control_message() as sample. (B) static void control_message(struct hpi_adapter_obj *pao, ...) { ^^^ struct hpi_hw_obj *phw = pao->priv; ... ^^^ } (A) void _HPI_6205(struct hpi_adapter_obj *pao, ...) { ^^^ ... case HPI_OBJ_CONTROL: (B) control_message(pao, phm, phr); break; ^^^ ... } void HPI_6205(...) { ... (A) _HPI_6205(NULL, phm, phr); ... ^^^^ } Therefore, We will get too many warning via cppcheck, like below sound/pci/asihpi/hpi6205.c:238:27: warning: Possible null pointer dereference: pao [nullPointer] struct hpi_hw_obj *phw = pao->priv; ^ sound/pci/asihpi/hpi6205.c:433:13: note: Calling function '_HPI_6205', 1st argument 'NULL' value is 0 _HPI_6205(NULL, phm, phr); ^ sound/pci/asihpi/hpi6205.c:401:20: note: Calling function 'control_message', 1st argument 'pao' value is 0 control_message(pao, phm, phr); ^ Set phr->error like many functions doing, and don't call _HPI_6205() with NULL. Signed-off-by: Kuninori Morimoto Link: https://lore.kernel.org/r/87ttypeaqz.wl-kuninori.morimoto.gx@renesas.com Signed-off-by: Takashi Iwai Signed-off-by: Sasha Levin --- sound/pci/asihpi/hpi6205.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/pci/asihpi/hpi6205.c b/sound/pci/asihpi/hpi6205.c index 27e11b5f70b9..c7d7eff86727 100644 --- a/sound/pci/asihpi/hpi6205.c +++ b/sound/pci/asihpi/hpi6205.c @@ -430,7 +430,7 @@ void HPI_6205(struct hpi_message *phm, struct hpi_response *phr) pao = hpi_find_adapter(phm->adapter_index); } else { /* subsys messages don't address an adapter */ - _HPI_6205(NULL, phm, phr); + phr->error = HPI_ERROR_INVALID_OBJ_INDEX; return; } From 32854bc91ae7debcdefdc7ae881ed83385a04792 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Mon, 13 Mar 2023 00:50:28 +0000 Subject: [PATCH 048/179] ALSA: hda/ca0132: fixup buffer overrun at tuning_ctl_set() [ Upstream commit 98e5eb110095ec77cb6d775051d181edbf9cd3cf ] tuning_ctl_set() might have buffer overrun at (X) if it didn't break from loop by matching (A). static int tuning_ctl_set(...) { for (i = 0; i < TUNING_CTLS_COUNT; i++) (A) if (nid == ca0132_tuning_ctls[i].nid) break; snd_hda_power_up(...); (X) dspio_set_param(..., ca0132_tuning_ctls[i].mid, ...); snd_hda_power_down(...); ^ return 1; } We will get below error by cppcheck sound/pci/hda/patch_ca0132.c:4229:2: note: After for loop, i has value 12 for (i = 0; i < TUNING_CTLS_COUNT; i++) ^ sound/pci/hda/patch_ca0132.c:4234:43: note: Array index out of bounds dspio_set_param(codec, ca0132_tuning_ctls[i].mid, 0x20, ^ This patch cares non match case. Signed-off-by: Kuninori Morimoto Link: https://lore.kernel.org/r/87sfe9eap7.wl-kuninori.morimoto.gx@renesas.com Signed-off-by: Takashi Iwai Signed-off-by: Sasha Levin --- sound/pci/hda/patch_ca0132.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sound/pci/hda/patch_ca0132.c b/sound/pci/hda/patch_ca0132.c index acde4cd58785..099722ebaed8 100644 --- a/sound/pci/hda/patch_ca0132.c +++ b/sound/pci/hda/patch_ca0132.c @@ -4228,8 +4228,10 @@ static int tuning_ctl_set(struct hda_codec *codec, hda_nid_t nid, for (i = 0; i < TUNING_CTLS_COUNT; i++) if (nid == ca0132_tuning_ctls[i].nid) - break; + goto found; + return -EINVAL; +found: snd_hda_power_up(codec); dspio_set_param(codec, ca0132_tuning_ctls[i].mid, 0x20, ca0132_tuning_ctls[i].req, From b8b7d5eac5ee984e42424de776ec1e04d1cc2f89 Mon Sep 17 00:00:00 2001 From: Wei Chen Date: Tue, 7 Mar 2023 13:08:56 +0000 Subject: [PATCH 049/179] fbdev: tgafb: Fix potential divide by zero [ Upstream commit f90bd245de82c095187d8c2cabb8b488a39eaecc ] fb_set_var would by called when user invokes ioctl with cmd FBIOPUT_VSCREENINFO. User-provided data would finally reach tgafb_check_var. In case var->pixclock is assigned to zero, divide by zero would occur when checking whether reciprocal of var->pixclock is too high. Similar crashes have happened in other fbdev drivers. There is no check and modification on var->pixclock along the call chain to tgafb_check_var. We believe it could also be triggered in driver tgafb from user site. Signed-off-by: Wei Chen Signed-off-by: Helge Deller Signed-off-by: Sasha Levin --- drivers/video/fbdev/tgafb.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/video/fbdev/tgafb.c b/drivers/video/fbdev/tgafb.c index 251dbd282f5e..84d5daef9766 100644 --- a/drivers/video/fbdev/tgafb.c +++ b/drivers/video/fbdev/tgafb.c @@ -173,6 +173,9 @@ tgafb_check_var(struct fb_var_screeninfo *var, struct fb_info *info) { struct tga_par *par = (struct tga_par *)info->par; + if (!var->pixclock) + return -EINVAL; + if (par->tga_type == TGA_TYPE_8PLANE) { if (var->bits_per_pixel != 8) return -EINVAL; From 2961b331b0745bdd856a0ef18e26913ee641ff86 Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Wed, 8 Mar 2023 21:23:09 +0800 Subject: [PATCH 050/179] ACPI: tools: pfrut: Check if the input of level and type is in the right numeric range MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 0bc23d8b2237a104d7f8379d687aa4cb82e2968b ] The user provides arbitrary non-numeic value to level and type, which could bring unexpected behavior. In this case the expected behavior would be to throw an error. pfrut -h usage: pfrut [OPTIONS] code injection: -l, --load -s, --stage -a, --activate -u, --update [stage and activate] -q, --query -d, --revid update telemetry: -G, --getloginfo -T, --type(0:execution, 1:history) -L, --level(0, 1, 2, 4) -R, --read -D, --revid log pfrut -T A pfrut -G log_level:0 log_type:0 log_revid:2 max_data_size:65536 chunk1_size:0 chunk2_size:1530 rollover_cnt:0 reset_cnt:17 Fix this by restricting the input to be in the expected range. Reported-by: Hariganesh Govindarajulu Suggested-by: "Rafael J. Wysocki" Signed-off-by: Chen Yu Signed-off-by: Rafael J. Wysocki Signed-off-by: Sasha Levin --- tools/power/acpi/tools/pfrut/pfrut.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/tools/power/acpi/tools/pfrut/pfrut.c b/tools/power/acpi/tools/pfrut/pfrut.c index 52aa0351533c..388c9e3ad040 100644 --- a/tools/power/acpi/tools/pfrut/pfrut.c +++ b/tools/power/acpi/tools/pfrut/pfrut.c @@ -97,7 +97,7 @@ static struct option long_options[] = { static void parse_options(int argc, char **argv) { int option_index = 0; - char *pathname; + char *pathname, *endptr; int opt; pathname = strdup(argv[0]); @@ -125,11 +125,23 @@ static void parse_options(int argc, char **argv) log_getinfo = 1; break; case 'T': - log_type = atoi(optarg); + log_type = strtol(optarg, &endptr, 0); + if (*endptr || (log_type != 0 && log_type != 1)) { + printf("Number expected: type(0:execution, 1:history) - Quit.\n"); + exit(1); + } + set_log_type = 1; break; case 'L': - log_level = atoi(optarg); + log_level = strtol(optarg, &endptr, 0); + if (*endptr || + (log_level != 0 && log_level != 1 && + log_level != 2 && log_level != 4)) { + printf("Number expected: level(0, 1, 2, 4) - Quit.\n"); + exit(1); + } + set_log_level = 1; break; case 'R': From c8943cf3ab9b83c4cdc448279dfdbd03b7beeb93 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 14 Mar 2023 19:32:38 -0700 Subject: [PATCH 051/179] sched_getaffinity: don't assume 'cpumask_size()' is fully initialized [ Upstream commit 6015b1aca1a233379625385feb01dd014aca60b5 ] The getaffinity() system call uses 'cpumask_size()' to decide how big the CPU mask is - so far so good. It is indeed the allocation size of a cpumask. But the code also assumes that the whole allocation is initialized without actually doing so itself. That's wrong, because we might have fixed-size allocations (making copying and clearing more efficient), but not all of it is then necessarily used if 'nr_cpu_ids' is smaller. Having checked other users of 'cpumask_size()', they all seem to be ok, either using it purely for the allocation size, or explicitly zeroing the cpumask before using the size in bytes to copy it. See for example the ublk_ctrl_get_queue_affinity() function that uses the proper 'zalloc_cpumask_var()' to make sure that the whole mask is cleared, whether the storage is on the stack or if it was an external allocation. Fix this by just zeroing the allocation before using it. Do the same for the compat version of sched_getaffinity(), which had the same logic. Also, for consistency, make sched_getaffinity() use 'cpumask_bits()' to access the bits. For a cpumask_var_t, it ends up being a pointer to the same data either way, but it's just a good idea to treat it like you would a 'cpumask_t'. The compat case already did that. Reported-by: Ryan Roberts Link: https://lore.kernel.org/lkml/7d026744-6bd6-6827-0471-b5e8eae0be3f@arm.com/ Cc: Yury Norov Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin --- kernel/compat.c | 2 +- kernel/sched/core.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/compat.c b/kernel/compat.c index 55551989d9da..fb50f29d9b36 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -152,7 +152,7 @@ COMPAT_SYSCALL_DEFINE3(sched_getaffinity, compat_pid_t, pid, unsigned int, len, if (len & (sizeof(compat_ulong_t)-1)) return -EINVAL; - if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) return -ENOMEM; ret = sched_getaffinity(pid, mask); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9ebfd484189b..b23dcbeacdf3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8304,14 +8304,14 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, if (len & (sizeof(unsigned long)-1)) return -EINVAL; - if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) return -ENOMEM; ret = sched_getaffinity(pid, mask); if (ret == 0) { unsigned int retlen = min(len, cpumask_size()); - if (copy_to_user(user_mask_ptr, mask, retlen)) + if (copy_to_user(user_mask_ptr, cpumask_bits(mask), retlen)) ret = -EFAULT; else ret = retlen; From e437554b145296fef09a52d7111b137566e434ba Mon Sep 17 00:00:00 2001 From: Philipp Geulen Date: Mon, 13 Mar 2023 11:11:50 +0100 Subject: [PATCH 052/179] nvme-pci: add NVME_QUIRK_BOGUS_NID for Lexar NM620 [ Upstream commit b65d44fa0fe072c91bf41cd8756baa2b4c77eff2 ] Added a quirk to fix Lexar NM620 1TB SSD reporting duplicate NGUIDs. Signed-off-by: Philipp Geulen Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig Signed-off-by: Sasha Levin --- drivers/nvme/host/pci.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 100f774bc97f..60452f6a9f71 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -3547,6 +3547,8 @@ static const struct pci_device_id nvme_id_table[] = { .driver_data = NVME_QUIRK_BOGUS_NID, }, { PCI_DEVICE(0x1d97, 0x2263), /* Lexar NM610 */ .driver_data = NVME_QUIRK_BOGUS_NID, }, + { PCI_DEVICE(0x1d97, 0x1d97), /* Lexar NM620 */ + .driver_data = NVME_QUIRK_BOGUS_NID, }, { PCI_DEVICE(0x1d97, 0x2269), /* Lexar NM760 */ .driver_data = NVME_QUIRK_BOGUS_NID, }, { PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0x0061), From b969838c9554a0e9aab3c3cadfcd23d246bc2abe Mon Sep 17 00:00:00 2001 From: David Belanger Date: Tue, 28 Feb 2023 14:11:24 -0500 Subject: [PATCH 053/179] drm/amdkfd: Fixed kfd_process cleanup on module exit. [ Upstream commit 20bc9f76b6a2455c6b54b91ae7634f147f64987f ] Handle case when module is unloaded (kfd_exit) before a process space (mm_struct) is released. v2: Fixed potential race conditions by removing all kfd_process from the process table first, then working on releasing the resources. v3: Fixed loop element access / synchronization. Fixed extra empty lines. Signed-off-by: David Belanger Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher Signed-off-by: Sasha Levin --- drivers/gpu/drm/amd/amdkfd/kfd_module.c | 1 + drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 1 + drivers/gpu/drm/amd/amdkfd/kfd_process.c | 67 +++++++++++++++++++++--- 3 files changed, 62 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_module.c b/drivers/gpu/drm/amd/amdkfd/kfd_module.c index 09b966dc3768..aee2212e52f6 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_module.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_module.c @@ -77,6 +77,7 @@ err_ioctl: static void kfd_exit(void) { + kfd_cleanup_processes(); kfd_debugfs_fini(); kfd_process_destroy_wq(); kfd_procfs_shutdown(); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index bf610e3b683b..6d6588b9beed 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -928,6 +928,7 @@ bool kfd_dev_is_large_bar(struct kfd_dev *dev); int kfd_process_create_wq(void); void kfd_process_destroy_wq(void); +void kfd_cleanup_processes(void); struct kfd_process *kfd_create_process(struct file *filep); struct kfd_process *kfd_get_process(const struct task_struct *task); struct kfd_process *kfd_lookup_process_by_pasid(u32 pasid); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index dd351105c1bc..7f68d51541e8 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -1167,6 +1167,17 @@ static void kfd_process_free_notifier(struct mmu_notifier *mn) kfd_unref_process(container_of(mn, struct kfd_process, mmu_notifier)); } +static void kfd_process_notifier_release_internal(struct kfd_process *p) +{ + cancel_delayed_work_sync(&p->eviction_work); + cancel_delayed_work_sync(&p->restore_work); + + /* Indicate to other users that MM is no longer valid */ + p->mm = NULL; + + mmu_notifier_put(&p->mmu_notifier); +} + static void kfd_process_notifier_release(struct mmu_notifier *mn, struct mm_struct *mm) { @@ -1181,17 +1192,22 @@ static void kfd_process_notifier_release(struct mmu_notifier *mn, return; mutex_lock(&kfd_processes_mutex); + /* + * Do early return if table is empty. + * + * This could potentially happen if this function is called concurrently + * by mmu_notifier and by kfd_cleanup_pocesses. + * + */ + if (hash_empty(kfd_processes_table)) { + mutex_unlock(&kfd_processes_mutex); + return; + } hash_del_rcu(&p->kfd_processes); mutex_unlock(&kfd_processes_mutex); synchronize_srcu(&kfd_processes_srcu); - cancel_delayed_work_sync(&p->eviction_work); - cancel_delayed_work_sync(&p->restore_work); - - /* Indicate to other users that MM is no longer valid */ - p->mm = NULL; - - mmu_notifier_put(&p->mmu_notifier); + kfd_process_notifier_release_internal(p); } static const struct mmu_notifier_ops kfd_process_mmu_notifier_ops = { @@ -1200,6 +1216,43 @@ static const struct mmu_notifier_ops kfd_process_mmu_notifier_ops = { .free_notifier = kfd_process_free_notifier, }; +/* + * This code handles the case when driver is being unloaded before all + * mm_struct are released. We need to safely free the kfd_process and + * avoid race conditions with mmu_notifier that might try to free them. + * + */ +void kfd_cleanup_processes(void) +{ + struct kfd_process *p; + struct hlist_node *p_temp; + unsigned int temp; + HLIST_HEAD(cleanup_list); + + /* + * Move all remaining kfd_process from the process table to a + * temp list for processing. Once done, callback from mmu_notifier + * release will not see the kfd_process in the table and do early return, + * avoiding double free issues. + */ + mutex_lock(&kfd_processes_mutex); + hash_for_each_safe(kfd_processes_table, temp, p_temp, p, kfd_processes) { + hash_del_rcu(&p->kfd_processes); + synchronize_srcu(&kfd_processes_srcu); + hlist_add_head(&p->kfd_processes, &cleanup_list); + } + mutex_unlock(&kfd_processes_mutex); + + hlist_for_each_entry_safe(p, p_temp, &cleanup_list, kfd_processes) + kfd_process_notifier_release_internal(p); + + /* + * Ensures that all outstanding free_notifier get called, triggering + * the release of the kfd_process struct. + */ + mmu_notifier_synchronize(); +} + static int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep) { unsigned long offset; From 35a32a50dc8f36dd3a0f4c7510997ecc029be586 Mon Sep 17 00:00:00 2001 From: Adham Faris Date: Mon, 23 Jan 2023 10:09:01 +0200 Subject: [PATCH 054/179] net/mlx5e: Lower maximum allowed MTU in XSK to match XDP prerequisites [ Upstream commit 78dee7befd56987283c13877b834c0aa97ad51b9 ] XSK redirecting XDP programs require linearity, hence applies restrictions on the MTU. For PAGE_SIZE=4K, MTU shouldn't exceed 3498. Features that contradict with XDP such HW-LRO and HW-GRO are enforced by the driver in advance, during XSK params validation, except for MTU, which was not enforced before this patch. This has been spotted during test scenario described below: Attaching xdpsock program (PAGE_SIZE=4K), with MTU < 3498, detaching XDP program, changing the MTU to arbitrary value in the range [3499, 3754], attaching XDP program again, which ended up with failure since MTU is > 3498. This commit lowers the XSK MTU limitation to be aligned with XDP MTU limitation, since XSK socket is meaningless without XDP program. Signed-off-by: Adham Faris Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed Signed-off-by: Sasha Levin --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 3b5c5064cfaf..5e01de4c3203 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -4104,13 +4104,17 @@ static bool mlx5e_xsk_validate_mtu(struct net_device *netdev, struct xsk_buff_pool *xsk_pool = mlx5e_xsk_get_pool(&chs->params, chs->params.xsk, ix); struct mlx5e_xsk_param xsk; + int max_xdp_mtu; if (!xsk_pool) continue; mlx5e_build_xsk_param(xsk_pool, &xsk); + max_xdp_mtu = mlx5e_xdp_max_mtu(new_params, &xsk); - if (!mlx5e_validate_xsk_param(new_params, &xsk, mdev)) { + /* Validate XSK params and XDP MTU in advance */ + if (!mlx5e_validate_xsk_param(new_params, &xsk, mdev) || + new_params->sw_mtu > max_xdp_mtu) { u32 hr = mlx5e_get_linear_rq_headroom(new_params, &xsk); int max_mtu_frame, max_mtu_page, max_mtu; @@ -4120,9 +4124,9 @@ static bool mlx5e_xsk_validate_mtu(struct net_device *netdev, */ max_mtu_frame = MLX5E_HW2SW_MTU(new_params, xsk.chunk_size - hr); max_mtu_page = MLX5E_HW2SW_MTU(new_params, SKB_MAX_HEAD(0)); - max_mtu = min(max_mtu_frame, max_mtu_page); + max_mtu = min3(max_mtu_frame, max_mtu_page, max_xdp_mtu); - netdev_err(netdev, "MTU %d is too big for an XSK running on channel %u. Try MTU <= %d\n", + netdev_err(netdev, "MTU %d is too big for an XSK running on channel %u or its redirection XDP program. Try MTU <= %d\n", new_params->sw_mtu, ix, max_mtu); return false; } From c8d88107971e13e65575885e42ce1908b0d6799e Mon Sep 17 00:00:00 2001 From: Wei Chen Date: Wed, 15 Mar 2023 07:18:31 +0000 Subject: [PATCH 055/179] fbdev: nvidia: Fix potential divide by zero [ Upstream commit 92e2a00f2987483e1f9253625828622edd442e61 ] variable var->pixclock can be set by user. In case it equals to zero, divide by zero would occur in nvidiafb_set_par. Similar crashes have happened in other fbdev drivers. There is no check and modification on var->pixclock along the call chain to nvidia_check_var and nvidiafb_set_par. We believe it could also be triggered in driver nvidia from user site. Signed-off-by: Wei Chen Signed-off-by: Helge Deller Signed-off-by: Sasha Levin --- drivers/video/fbdev/nvidia/nvidia.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/video/fbdev/nvidia/nvidia.c b/drivers/video/fbdev/nvidia/nvidia.c index a6c3bc222246..1b8904824ad8 100644 --- a/drivers/video/fbdev/nvidia/nvidia.c +++ b/drivers/video/fbdev/nvidia/nvidia.c @@ -764,6 +764,8 @@ static int nvidiafb_check_var(struct fb_var_screeninfo *var, int pitch, err = 0; NVTRACE_ENTER(); + if (!var->pixclock) + return -EINVAL; var->transp.offset = 0; var->transp.length = 0; From 8ab9eada22d93fa501072161ceedbd6d61bc8df7 Mon Sep 17 00:00:00 2001 From: Wei Chen Date: Wed, 15 Mar 2023 08:33:47 +0000 Subject: [PATCH 056/179] fbdev: intelfb: Fix potential divide by zero [ Upstream commit d823685486a3446d061fed7c7d2f80af984f119a ] Variable var->pixclock is controlled by user and can be assigned to zero. Without proper check, divide by zero would occur in intelfbhw_validate_mode and intelfbhw_mode_to_hw. Error out if var->pixclock is zero. Signed-off-by: Wei Chen Signed-off-by: Helge Deller Signed-off-by: Sasha Levin --- drivers/video/fbdev/intelfb/intelfbdrv.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/video/fbdev/intelfb/intelfbdrv.c b/drivers/video/fbdev/intelfb/intelfbdrv.c index d4a2891a9a7a..a93dd531d00d 100644 --- a/drivers/video/fbdev/intelfb/intelfbdrv.c +++ b/drivers/video/fbdev/intelfb/intelfbdrv.c @@ -1219,6 +1219,9 @@ static int intelfb_check_var(struct fb_var_screeninfo *var, dinfo = GET_DINFO(info); + if (!var->pixclock) + return -EINVAL; + /* update the pitch */ if (intelfbhw_validate_mode(dinfo, var) != 0) return -EINVAL; From 9f2a69d5e6770a064a7ae9bdab7e9907a8d292ff Mon Sep 17 00:00:00 2001 From: Wei Chen Date: Wed, 15 Mar 2023 09:05:18 +0000 Subject: [PATCH 057/179] fbdev: lxfb: Fix potential divide by zero [ Upstream commit 61ac4b86a4c047c20d5cb423ddd87496f14d9868 ] var->pixclock can be assigned to zero by user. Without proper check, divide by zero would occur in lx_set_clock. Error out if var->pixclock is zero. Signed-off-by: Wei Chen Signed-off-by: Helge Deller Signed-off-by: Sasha Levin --- drivers/video/fbdev/geode/lxfb_core.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/video/fbdev/geode/lxfb_core.c b/drivers/video/fbdev/geode/lxfb_core.c index 9d26592dbfce..41fda498406c 100644 --- a/drivers/video/fbdev/geode/lxfb_core.c +++ b/drivers/video/fbdev/geode/lxfb_core.c @@ -235,6 +235,9 @@ static void get_modedb(struct fb_videomode **modedb, unsigned int *size) static int lxfb_check_var(struct fb_var_screeninfo *var, struct fb_info *info) { + if (!var->pixclock) + return -EINVAL; + if (var->xres > 1920 || var->yres > 1440) return -EINVAL; From 2a3562ea9d726675e3e38094a508f8dd56726bdc Mon Sep 17 00:00:00 2001 From: Wei Chen Date: Wed, 15 Mar 2023 09:22:54 +0000 Subject: [PATCH 058/179] fbdev: au1200fb: Fix potential divide by zero [ Upstream commit 44a3b36b42acfc433aaaf526191dd12fbb919fdb ] var->pixclock can be assigned to zero by user. Without proper check, divide by zero would occur when invoking macro PICOS2KHZ in au1200fb_fb_check_var. Error out if var->pixclock is zero. Signed-off-by: Wei Chen Signed-off-by: Helge Deller Signed-off-by: Sasha Levin --- drivers/video/fbdev/au1200fb.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/video/fbdev/au1200fb.c b/drivers/video/fbdev/au1200fb.c index 81c315454428..b6b22fa4a8a0 100644 --- a/drivers/video/fbdev/au1200fb.c +++ b/drivers/video/fbdev/au1200fb.c @@ -1040,6 +1040,9 @@ static int au1200fb_fb_check_var(struct fb_var_screeninfo *var, u32 pixclock; int screen_size, plane; + if (!var->pixclock) + return -EINVAL; + plane = fbdev->plane; /* Make sure that the mode respect all LCD controller and From 6ecdea91ef95989bc55bc70e1ee2e2b98e46e93f Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Thu, 15 Dec 2022 10:18:16 -0500 Subject: [PATCH 059/179] tools/power turbostat: Fix /dev/cpu_dma_latency warnings [ Upstream commit 40aafc7d58d3544f152a863a0e9863014b6d5d8c ] When running as non-root the following error is seen in turbostat: turbostat: fopen /dev/cpu_dma_latency : Permission denied turbostat and the man page have information on how to avoid other permission errors, so these can be fixed the same way. Provide better /dev/cpu_dma_latency warnings that provide instructions on how to avoid the error, and update the man page. Signed-off-by: Prarit Bhargava Cc: linux-pm@vger.kernel.org Signed-off-by: Len Brown Signed-off-by: Sasha Levin --- tools/power/x86/turbostat/turbostat.8 | 2 ++ tools/power/x86/turbostat/turbostat.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8 index c7b26a3603af..3e1a4c4be001 100644 --- a/tools/power/x86/turbostat/turbostat.8 +++ b/tools/power/x86/turbostat/turbostat.8 @@ -344,6 +344,8 @@ Alternatively, non-root users can be enabled to run turbostat this way: # chmod +r /dev/cpu/*/msr +# chmod +r /dev/cpu_dma_latency + .B "turbostat " reads hardware counters, but doesn't write them. So it will not interfere with the OS or other programs, including diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index aba460410dbd..c24054e3ef7a 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -5482,7 +5482,7 @@ void print_dev_latency(void) retval = read(fd, (void *)&value, sizeof(int)); if (retval != sizeof(int)) { - warn("read %s\n", path); + warn("read failed %s\n", path); close(fd); return; } From 88cdf1d8a522bad9b87b04e21b65d1d148232034 Mon Sep 17 00:00:00 2001 From: Antti Laakso Date: Wed, 25 Jan 2023 15:17:50 +0200 Subject: [PATCH 060/179] tools/power turbostat: fix decoding of HWP_STATUS [ Upstream commit 92c25393586ac799b9b7d9e50434f3c44a7622c4 ] The "excursion to minimum" information is in bit2 in HWP_STATUS MSR. Fix the bitmask used for decoding the register. Signed-off-by: Antti Laakso Reviewed-by: Artem Bityutskiy Signed-off-by: Len Brown Signed-off-by: Sasha Levin --- tools/power/x86/turbostat/turbostat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index c24054e3ef7a..c61c6c704fbe 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -4426,7 +4426,7 @@ int print_hwp(struct thread_data *t, struct core_data *c, struct pkg_data *p) fprintf(outf, "cpu%d: MSR_HWP_STATUS: 0x%08llx " "(%sGuaranteed_Perf_Change, %sExcursion_Min)\n", - cpu, msr, ((msr) & 0x1) ? "" : "No-", ((msr) & 0x2) ? "" : "No-"); + cpu, msr, ((msr) & 0x1) ? "" : "No-", ((msr) & 0x4) ? "" : "No-"); return 0; } From 089d656992c0f5cfdd1ea3f86860d0b67d211a20 Mon Sep 17 00:00:00 2001 From: Anton Gusev Date: Tue, 31 Jan 2023 10:58:18 +0300 Subject: [PATCH 061/179] tracing: Fix wrong return in kprobe_event_gen_test.c [ Upstream commit bc4f359b3b607daac0290d0038561237a86b38cb ] Overwriting the error code with the deletion result may cause the function to return 0 despite encountering an error. Commit b111545d26c0 ("tracing: Remove the useless value assignment in test_create_synth_event()") solves a similar issue by returning the original error code, so this patch does the same. Found by Linux Verification Center (linuxtesting.org) with SVACE. Link: https://lore.kernel.org/linux-trace-kernel/20230131075818.5322-1-aagusev@ispras.ru Signed-off-by: Anton Gusev Reviewed-by: Steven Rostedt (Google) Acked-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) Signed-off-by: Sasha Levin --- kernel/trace/kprobe_event_gen_test.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/kprobe_event_gen_test.c b/kernel/trace/kprobe_event_gen_test.c index c736487fc0e4..e0c420eb0b2b 100644 --- a/kernel/trace/kprobe_event_gen_test.c +++ b/kernel/trace/kprobe_event_gen_test.c @@ -146,7 +146,7 @@ static int __init test_gen_kprobe_cmd(void) if (trace_event_file_is_valid(gen_kprobe_test)) gen_kprobe_test = NULL; /* We got an error after creating the event, delete it */ - ret = kprobe_event_delete("gen_kprobe_test"); + kprobe_event_delete("gen_kprobe_test"); goto out; } @@ -211,7 +211,7 @@ static int __init test_gen_kretprobe_cmd(void) if (trace_event_file_is_valid(gen_kretprobe_test)) gen_kretprobe_test = NULL; /* We got an error after creating the event, delete it */ - ret = kprobe_event_delete("gen_kretprobe_test"); + kprobe_event_delete("gen_kretprobe_test"); goto out; } From bd265f2061aa9d902f605942a3cb3920729c933f Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 16 Dec 2022 15:15:54 -0500 Subject: [PATCH 062/179] btrfs: fix uninitialized variable warning in btrfs_update_block_group [ Upstream commit efbf35a102b20246cfe4409c6ae92e72ecb67ab8 ] reclaim isn't set in the alloc case, however we only care about reclaim in the !alloc case. This isn't an actual problem, however -Wmaybe-uninitialized will complain, so initialize reclaim to quiet the compiler. Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba Stable-dep-of: df384da5a49c ("btrfs: use temporary variable for space_info in btrfs_update_block_group") Signed-off-by: Sasha Levin --- fs/btrfs/block-group.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 4b69945755e4..380cb10f0d37 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -3259,7 +3259,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, spin_unlock(&info->delalloc_root_lock); while (total) { - bool reclaim; + bool reclaim = false; cache = btrfs_lookup_block_group(info, bytenr); if (!cache) { From f5527b3b4d3dde57b15209694207f5e6d5b542e7 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 1 Mar 2023 16:14:43 -0500 Subject: [PATCH 063/179] btrfs: use temporary variable for space_info in btrfs_update_block_group [ Upstream commit df384da5a49cace5c5e3100803dfd563fd982f93 ] We do cache->space_info->counter += num_bytes; everywhere in here. This is makes the lines longer than they need to be, and will be especially noticeable when we add the active tracking in, so add a temp variable for the space_info so this is cleaner. Reviewed-by: Naohiro Aota Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Sasha Levin --- fs/btrfs/block-group.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 380cb10f0d37..f33ddd5922b8 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -3259,6 +3259,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, spin_unlock(&info->delalloc_root_lock); while (total) { + struct btrfs_space_info *space_info; bool reclaim = false; cache = btrfs_lookup_block_group(info, bytenr); @@ -3266,6 +3267,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, ret = -ENOENT; break; } + space_info = cache->space_info; factor = btrfs_bg_type_to_factor(cache->flags); /* @@ -3280,7 +3282,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, byte_in_group = bytenr - cache->start; WARN_ON(byte_in_group > cache->length); - spin_lock(&cache->space_info->lock); + spin_lock(&space_info->lock); spin_lock(&cache->lock); if (btrfs_test_opt(info, SPACE_CACHE) && @@ -3293,23 +3295,23 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, old_val += num_bytes; cache->used = old_val; cache->reserved -= num_bytes; - cache->space_info->bytes_reserved -= num_bytes; - cache->space_info->bytes_used += num_bytes; - cache->space_info->disk_used += num_bytes * factor; + space_info->bytes_reserved -= num_bytes; + space_info->bytes_used += num_bytes; + space_info->disk_used += num_bytes * factor; spin_unlock(&cache->lock); - spin_unlock(&cache->space_info->lock); + spin_unlock(&space_info->lock); } else { old_val -= num_bytes; cache->used = old_val; cache->pinned += num_bytes; - btrfs_space_info_update_bytes_pinned(info, - cache->space_info, num_bytes); - cache->space_info->bytes_used -= num_bytes; - cache->space_info->disk_used -= num_bytes * factor; + btrfs_space_info_update_bytes_pinned(info, space_info, + num_bytes); + space_info->bytes_used -= num_bytes; + space_info->disk_used -= num_bytes * factor; reclaim = should_reclaim_block_group(cache, num_bytes); spin_unlock(&cache->lock); - spin_unlock(&cache->space_info->lock); + spin_unlock(&space_info->lock); set_extent_dirty(&trans->transaction->pinned_extents, bytenr, bytenr + num_bytes - 1, From c84d28b3223655f6b83c85789d1cd024b9bedf40 Mon Sep 17 00:00:00 2001 From: Arseniy Krasnov Date: Mon, 27 Feb 2023 13:24:25 +0300 Subject: [PATCH 064/179] mtd: rawnand: meson: initialize struct with zeroes [ Upstream commit 4ce341de6c02d02aba7c78a6447ccfcaa9eeb328 ] This structure must be zeroed, because it's field 'hw->core' is used as 'parent' in 'clk_core_fill_parent_index()', but it will be uninitialized. This happens, because when this struct is not zeroed, pointer 'hw' is "initialized" by garbage, which is valid pointer, but points to some garbage. So 'hw' will be dereferenced, but 'core' contains some random data which will be interpreted as a pointer. The following backtrace is result of dereference of such pointer: [ 1.081319] __clk_register+0x414/0x820 [ 1.085113] devm_clk_register+0x64/0xd0 [ 1.088995] meson_nfc_probe+0x258/0x6ec [ 1.092875] platform_probe+0x70/0xf0 [ 1.096498] really_probe+0xc8/0x3e0 [ 1.100034] __driver_probe_device+0x84/0x190 [ 1.104346] driver_probe_device+0x44/0x120 [ 1.108487] __driver_attach+0xb4/0x220 [ 1.112282] bus_for_each_dev+0x78/0xd0 [ 1.116077] driver_attach+0x2c/0x40 [ 1.119613] bus_add_driver+0x184/0x240 [ 1.123408] driver_register+0x80/0x140 [ 1.127203] __platform_driver_register+0x30/0x40 [ 1.131860] meson_nfc_driver_init+0x24/0x30 Fixes: 1e4d3ba66888 ("mtd: rawnand: meson: fix the clock") Signed-off-by: Arseniy Krasnov Acked-by: Martin Blumenstingl Reviewed-by: Neil Armstrong Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20230227102425.793841-1-AVKrasnov@sberdevices.ru Signed-off-by: Sasha Levin --- drivers/mtd/nand/raw/meson_nand.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mtd/nand/raw/meson_nand.c b/drivers/mtd/nand/raw/meson_nand.c index 5ee01231ac4c..30e326adabfc 100644 --- a/drivers/mtd/nand/raw/meson_nand.c +++ b/drivers/mtd/nand/raw/meson_nand.c @@ -991,7 +991,7 @@ static const struct mtd_ooblayout_ops meson_ooblayout_ops = { static int meson_nfc_clk_init(struct meson_nfc *nfc) { - struct clk_parent_data nfc_divider_parent_data[1]; + struct clk_parent_data nfc_divider_parent_data[1] = {0}; struct clk_init_data init = {0}; int ret; From 8b46440d1a6350fba0cda1be163e36235226e1d0 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Wed, 15 Feb 2023 12:08:45 +0100 Subject: [PATCH 065/179] mtd: nand: mxic-ecc: Fix mxic_ecc_data_xfer_wait_for_completion() when irq is used [ Upstream commit 75dce6a941e3f16c3b4878c8b2f46d5d07c619ce ] wait_for_completion_timeout() and readl_poll_timeout() don't handle their return value the same way. wait_for_completion_timeout() returns 0 on time out (and >0 in all other cases) readl_poll_timeout() returns 0 on success and -ETIMEDOUT upon a timeout. In order for the error handling path to work in both cases, the logic against wait_for_completion_timeout() needs to be inverted. Fixes: 48e6633a9fa2 ("mtd: nand: mxic-ecc: Add Macronix external ECC engine support") Signed-off-by: Christophe JAILLET Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/beddbc374557e44ceec897e68c4a5d12764ddbb9.1676459308.git.christophe.jaillet@wanadoo.fr Signed-off-by: Sasha Levin --- drivers/mtd/nand/ecc-mxic.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/mtd/nand/ecc-mxic.c b/drivers/mtd/nand/ecc-mxic.c index 8afdca731b87..6b487ffe2f2d 100644 --- a/drivers/mtd/nand/ecc-mxic.c +++ b/drivers/mtd/nand/ecc-mxic.c @@ -429,6 +429,7 @@ static int mxic_ecc_data_xfer_wait_for_completion(struct mxic_ecc_engine *mxic) mxic_ecc_enable_int(mxic); ret = wait_for_completion_timeout(&mxic->complete, msecs_to_jiffies(1000)); + ret = ret ? 0 : -ETIMEDOUT; mxic_ecc_disable_int(mxic); } else { ret = readl_poll_timeout(mxic->regs + INTRPT_STS, val, From 60b2027077ea664016cf43f77d77e843eff582c9 Mon Sep 17 00:00:00 2001 From: Harshit Mogalapalli Date: Mon, 6 Mar 2023 11:18:24 -0800 Subject: [PATCH 066/179] ca8210: Fix unsigned mac_len comparison with zero in ca8210_skb_tx() [ Upstream commit 748b2f5e82d17480404b3e2895388fc2925f7caf ] mac_len is of type unsigned, which can never be less than zero. mac_len = ieee802154_hdr_peek_addrs(skb, &header); if (mac_len < 0) return mac_len; Change this to type int as ieee802154_hdr_peek_addrs() can return negative integers, this is found by static analysis with smatch. Fixes: 6c993779ea1d ("ca8210: fix mac_len negative array access") Signed-off-by: Harshit Mogalapalli Acked-by: Alexander Aring Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20230306191824.4115839-1-harshit.m.mogalapalli@oracle.com Signed-off-by: Stefan Schmidt Signed-off-by: Sasha Levin --- drivers/net/ieee802154/ca8210.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ieee802154/ca8210.c b/drivers/net/ieee802154/ca8210.c index 0b0c6c0764fe..d0b5129439ed 100644 --- a/drivers/net/ieee802154/ca8210.c +++ b/drivers/net/ieee802154/ca8210.c @@ -1902,10 +1902,9 @@ static int ca8210_skb_tx( struct ca8210_priv *priv ) { - int status; struct ieee802154_hdr header = { }; struct secspec secspec; - unsigned int mac_len; + int mac_len, status; dev_dbg(&priv->spi->dev, "%s called\n", __func__); From a07cf4fd610ed4adab2eb86c56ddfed8a287c45b Mon Sep 17 00:00:00 2001 From: Rajnesh Kanwal Date: Fri, 10 Feb 2023 14:27:11 +0000 Subject: [PATCH 067/179] riscv/kvm: Fix VM hang in case of timer delta being zero. [ Upstream commit 6eff38048944cadc3cddcf117acfa5199ec32490 ] In case when VCPU is blocked due to WFI, we schedule the timer from `kvm_riscv_vcpu_timer_blocking()` to keep timer interrupt ticking. But in case when delta_ns comes to be zero, we never schedule the timer and VCPU keeps sleeping indefinitely until any activity is done with VM console. This is easily reproduce-able using kvmtool. ./lkvm-static run -c1 --console virtio -p "earlycon root=/dev/vda" \ -k ./Image -d rootfs.ext4 Also, just add a print in kvm_riscv_vcpu_vstimer_expired() to check the interrupt delivery and run `top` or similar auto-upating cmd from guest. Within sometime one can notice that print from timer expiry routine stops and the `top` cmd output will stop updating. This change fixes this by making sure we schedule the timer even with delta_ns being zero to bring the VCPU out of sleep immediately. Fixes: 8f5cb44b1bae ("RISC-V: KVM: Support sstc extension") Signed-off-by: Rajnesh Kanwal Reviewed-by: Atish Patra Signed-off-by: Anup Patel Signed-off-by: Sasha Levin --- arch/riscv/kvm/vcpu_timer.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/riscv/kvm/vcpu_timer.c b/arch/riscv/kvm/vcpu_timer.c index ad34519c8a13..3ac2ff6a65da 100644 --- a/arch/riscv/kvm/vcpu_timer.c +++ b/arch/riscv/kvm/vcpu_timer.c @@ -147,10 +147,8 @@ static void kvm_riscv_vcpu_timer_blocking(struct kvm_vcpu *vcpu) return; delta_ns = kvm_riscv_delta_cycles2ns(t->next_cycles, gt, t); - if (delta_ns) { - hrtimer_start(&t->hrt, ktime_set(0, delta_ns), HRTIMER_MODE_REL); - t->next_set = true; - } + hrtimer_start(&t->hrt, ktime_set(0, delta_ns), HRTIMER_MODE_REL); + t->next_set = true; } static void kvm_riscv_vcpu_timer_unblocking(struct kvm_vcpu *vcpu) From 2cdbcff99f15db86a10672fb220379a1ae46ccae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Fern=C3=A1ndez=20Rojas?= Date: Fri, 17 Mar 2023 11:20:04 +0100 Subject: [PATCH 068/179] mips: bmips: BCM6358: disable RAC flush for TP1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit ab327f8acdf8d06601fbf058859a539a9422afff ] RAC flush causes kernel panics on BCM6358 with EHCI/OHCI when booting from TP1: [ 3.881739] usb 1-1: new high-speed USB device number 2 using ehci-platform [ 3.895011] Reserved instruction in kernel code[#1]: [ 3.900113] CPU: 0 PID: 1 Comm: init Not tainted 5.10.16 #0 [ 3.905829] $ 0 : 00000000 10008700 00000000 77d94060 [ 3.911238] $ 4 : 7fd1f088 00000000 81431cac 81431ca0 [ 3.916641] $ 8 : 00000000 ffffefff 8075cd34 00000000 [ 3.922043] $12 : 806f8d40 f3e812b7 00000000 000d9aaa [ 3.927446] $16 : 7fd1f068 7fd1f080 7ff559b8 81428470 [ 3.932848] $20 : 00000000 00000000 55590000 77d70000 [ 3.938251] $24 : 00000018 00000010 [ 3.943655] $28 : 81430000 81431e60 81431f28 800157fc [ 3.949058] Hi : 00000000 [ 3.952013] Lo : 00000000 [ 3.955019] epc : 80015808 setup_sigcontext+0x54/0x24c [ 3.960464] ra : 800157fc setup_sigcontext+0x48/0x24c [ 3.965913] Status: 10008703 KERNEL EXL IE [ 3.970216] Cause : 00800028 (ExcCode 0a) [ 3.974340] PrId : 0002a010 (Broadcom BMIPS4350) [ 3.979170] Modules linked in: ohci_platform ohci_hcd fsl_mph_dr_of ehci_platform ehci_fsl ehci_hcd gpio_button_hotplug usbcore nls_base usb_common [ 3.992907] Process init (pid: 1, threadinfo=(ptrval), task=(ptrval), tls=77e22ec8) [ 4.000776] Stack : 81431ef4 7fd1f080 81431f28 81428470 7fd1f068 81431edc 7ff559b8 81428470 [ 4.009467] 81431f28 7fd1f080 55590000 77d70000 77d5498c 80015c70 806f0000 8063ae74 [ 4.018149] 08100002 81431f28 0000000a 08100002 81431f28 0000000a 77d6b418 00000003 [ 4.026831] ffffffff 80016414 80080734 81431ecc 81431ecc 00000001 00000000 04000000 [ 4.035512] 77d54874 00000000 00000000 00000000 00000000 00000012 00000002 00000000 [ 4.044196] ... [ 4.046706] Call Trace: [ 4.049238] [<80015808>] setup_sigcontext+0x54/0x24c [ 4.054356] [<80015c70>] setup_frame+0xdc/0x124 [ 4.059015] [<80016414>] do_notify_resume+0x1dc/0x288 [ 4.064207] [<80011b50>] work_notifysig+0x10/0x18 [ 4.069036] [ 4.070538] Code: 8fc300b4 00001025 26240008 ac830004 3c048063 0c0228aa 24846a00 26240010 [ 4.080686] [ 4.082517] ---[ end trace 22a8edb41f5f983b ]--- [ 4.087374] Kernel panic - not syncing: Fatal exception [ 4.092753] Rebooting in 1 seconds.. Because the bootloader (CFE) is not initializing the Read-ahead cache properly on the second thread (TP1). Since the RAC was not initialized properly, we should avoid flushing it at the risk of corrupting the instruction stream as seen in the trace above. Fixes: d59098a0e9cb ("MIPS: bmips: use generic dma noncoherent ops") Signed-off-by: Álvaro Fernández Rojas Signed-off-by: Thomas Bogendoerfer Signed-off-by: Sasha Levin --- arch/mips/bmips/dma.c | 5 +++++ arch/mips/bmips/setup.c | 8 ++++++++ 2 files changed, 13 insertions(+) diff --git a/arch/mips/bmips/dma.c b/arch/mips/bmips/dma.c index 33788668cbdb..3779e7855bd7 100644 --- a/arch/mips/bmips/dma.c +++ b/arch/mips/bmips/dma.c @@ -5,6 +5,8 @@ #include #include +bool bmips_rac_flush_disable; + void arch_sync_dma_for_cpu_all(void) { void __iomem *cbr = BMIPS_GET_CBR(); @@ -15,6 +17,9 @@ void arch_sync_dma_for_cpu_all(void) boot_cpu_type() != CPU_BMIPS4380) return; + if (unlikely(bmips_rac_flush_disable)) + return; + /* Flush stale data out of the readahead cache */ cfg = __raw_readl(cbr + BMIPS_RAC_CONFIG); __raw_writel(cfg | 0x100, cbr + BMIPS_RAC_CONFIG); diff --git a/arch/mips/bmips/setup.c b/arch/mips/bmips/setup.c index e95b3f78e7cd..549a6392a3d2 100644 --- a/arch/mips/bmips/setup.c +++ b/arch/mips/bmips/setup.c @@ -35,6 +35,8 @@ #define REG_BCM6328_OTP ((void __iomem *)CKSEG1ADDR(0x1000062c)) #define BCM6328_TP1_DISABLED BIT(9) +extern bool bmips_rac_flush_disable; + static const unsigned long kbase = VMLINUX_LOAD_ADDRESS & 0xfff00000; struct bmips_quirk { @@ -104,6 +106,12 @@ static void bcm6358_quirks(void) * disable SMP for now */ bmips_smp_enabled = 0; + + /* + * RAC flush causes kernel panics on BCM6358 when booting from TP1 + * because the bootloader is not initializing it properly. + */ + bmips_rac_flush_disable = !!(read_c0_brcm_cmt_local() & (1 << 31)); } static void bcm6368_quirks(void) From e39afd6095a59ddecfd44422241db7605c8846e7 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 20 Mar 2023 15:28:38 +0100 Subject: [PATCH 069/179] ALSA: usb-audio: Fix recursive locking at XRUN during syncing [ Upstream commit 8c721c53dda512fdd48eb24d6d99e56deee57898 ] The recent support of low latency playback in USB-audio driver made the snd_usb_queue_pending_output_urbs() function to be called via PCM ack ops. In the new code path, the function is performed already in the PCM stream lock. The problem is that, when an XRUN is detected, the function calls snd_pcm_xrun() to notify, but snd_pcm_xrun() is supposed to be called only outside the stream lock. As a result, it leads to a deadlock of PCM stream locking. For avoiding such a recursive locking, this patch adds an additional check to the code paths in PCM core that call the ack callback; now it checks the error code from the callback, and if it's -EPIPE, the XRUN is handled in the PCM core side gracefully. Along with it, the USB-audio driver code is changed to follow that, i.e. -EPIPE is returned instead of the explicit snd_pcm_xrun() call when the function is performed already in the stream lock. Fixes: d5f871f89e21 ("ALSA: usb-audio: Improved lowlatency playback support") Reported-and-tested-by: John Keeping Link: https://lore.kernel.org/r/20230317195128.3911155-1-john@metanate.com Reviewed-by: Jaroslav Kysela Reviewed-by; Takashi Sakamoto Link: https://lore.kernel.org/r/20230320142838.494-1-tiwai@suse.de Signed-off-by: Takashi Iwai Signed-off-by: Sasha Levin --- sound/core/pcm_lib.c | 2 ++ sound/usb/endpoint.c | 22 ++++++++++++++-------- sound/usb/endpoint.h | 4 ++-- sound/usb/pcm.c | 2 +- 4 files changed, 19 insertions(+), 11 deletions(-) diff --git a/sound/core/pcm_lib.c b/sound/core/pcm_lib.c index 8b6aeb8a78f7..02fd65993e7e 100644 --- a/sound/core/pcm_lib.c +++ b/sound/core/pcm_lib.c @@ -2155,6 +2155,8 @@ int pcm_lib_apply_appl_ptr(struct snd_pcm_substream *substream, ret = substream->ops->ack(substream); if (ret < 0) { runtime->control->appl_ptr = old_appl_ptr; + if (ret == -EPIPE) + __snd_pcm_xrun(substream); return ret; } } diff --git a/sound/usb/endpoint.c b/sound/usb/endpoint.c index 419302e2057e..647fa054d8b1 100644 --- a/sound/usb/endpoint.c +++ b/sound/usb/endpoint.c @@ -455,8 +455,8 @@ static void push_back_to_ready_list(struct snd_usb_endpoint *ep, * This function is used both for implicit feedback endpoints and in low- * latency playback mode. */ -void snd_usb_queue_pending_output_urbs(struct snd_usb_endpoint *ep, - bool in_stream_lock) +int snd_usb_queue_pending_output_urbs(struct snd_usb_endpoint *ep, + bool in_stream_lock) { bool implicit_fb = snd_usb_endpoint_implicit_feedback_sink(ep); @@ -480,7 +480,7 @@ void snd_usb_queue_pending_output_urbs(struct snd_usb_endpoint *ep, spin_unlock_irqrestore(&ep->lock, flags); if (ctx == NULL) - return; + break; /* copy over the length information */ if (implicit_fb) { @@ -495,11 +495,14 @@ void snd_usb_queue_pending_output_urbs(struct snd_usb_endpoint *ep, break; if (err < 0) { /* push back to ready list again for -EAGAIN */ - if (err == -EAGAIN) + if (err == -EAGAIN) { push_back_to_ready_list(ep, ctx); - else + break; + } + + if (!in_stream_lock) notify_xrun(ep); - return; + return -EPIPE; } err = usb_submit_urb(ctx->urb, GFP_ATOMIC); @@ -507,13 +510,16 @@ void snd_usb_queue_pending_output_urbs(struct snd_usb_endpoint *ep, usb_audio_err(ep->chip, "Unable to submit urb #%d: %d at %s\n", ctx->index, err, __func__); - notify_xrun(ep); - return; + if (!in_stream_lock) + notify_xrun(ep); + return -EPIPE; } set_bit(ctx->index, &ep->active_mask); atomic_inc(&ep->submitted_urbs); } + + return 0; } /* diff --git a/sound/usb/endpoint.h b/sound/usb/endpoint.h index 924f4351588c..c09f68ce08b1 100644 --- a/sound/usb/endpoint.h +++ b/sound/usb/endpoint.h @@ -52,7 +52,7 @@ int snd_usb_endpoint_implicit_feedback_sink(struct snd_usb_endpoint *ep); int snd_usb_endpoint_next_packet_size(struct snd_usb_endpoint *ep, struct snd_urb_ctx *ctx, int idx, unsigned int avail); -void snd_usb_queue_pending_output_urbs(struct snd_usb_endpoint *ep, - bool in_stream_lock); +int snd_usb_queue_pending_output_urbs(struct snd_usb_endpoint *ep, + bool in_stream_lock); #endif /* __USBAUDIO_ENDPOINT_H */ diff --git a/sound/usb/pcm.c b/sound/usb/pcm.c index 2c5765cbed2d..1e1d7458bce1 100644 --- a/sound/usb/pcm.c +++ b/sound/usb/pcm.c @@ -1595,7 +1595,7 @@ static int snd_usb_pcm_playback_ack(struct snd_pcm_substream *substream) * outputs here */ if (!ep->active_mask) - snd_usb_queue_pending_output_urbs(ep, true); + return snd_usb_queue_pending_output_urbs(ep, true); return 0; } From ba85e83f9330b73db7d3f8ea0bf6fc41e65a8633 Mon Sep 17 00:00:00 2001 From: Yoshihiro Shimoda Date: Fri, 10 Mar 2023 21:34:58 +0900 Subject: [PATCH 070/179] PCI: dwc: Fix PORT_LINK_CONTROL update when CDM check enabled [ Upstream commit cdce67099117ece371582f706c6eff7d3a65326d ] If CDM_CHECK is enabled (by the DT "snps,enable-cdm-check" property), 'val' is overwritten by PCIE_PL_CHK_REG_CONTROL_STATUS initialization. Commit ec7b952f453c ("PCI: dwc: Always enable CDM check if "snps,enable-cdm-check" exists") did not account for further usage of 'val', so we wrote improper values to PCIE_PORT_LINK_CONTROL when the CDM check is enabled. Move the PCIE_PORT_LINK_CONTROL update to be completely after the PCIE_PL_CHK_REG_CONTROL_STATUS register initialization. [bhelgaas: commit log adapted from Serge's version] Fixes: ec7b952f453c ("PCI: dwc: Always enable CDM check if "snps,enable-cdm-check" exists") Link: https://lore.kernel.org/r/20230310123510.675685-2-yoshihiro.shimoda.uh@renesas.com Signed-off-by: Yoshihiro Shimoda Signed-off-by: Bjorn Helgaas Reviewed-by: Serge Semin Signed-off-by: Sasha Levin --- drivers/pci/controller/dwc/pcie-designware.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/pci/controller/dwc/pcie-designware.c b/drivers/pci/controller/dwc/pcie-designware.c index 9e4d96e5a3f5..575834cae3b9 100644 --- a/drivers/pci/controller/dwc/pcie-designware.c +++ b/drivers/pci/controller/dwc/pcie-designware.c @@ -645,11 +645,6 @@ void dw_pcie_setup(struct dw_pcie *pci) dw_pcie_writel_dbi(pci, PCIE_LINK_WIDTH_SPEED_CONTROL, val); } - val = dw_pcie_readl_dbi(pci, PCIE_PORT_LINK_CONTROL); - val &= ~PORT_LINK_FAST_LINK_MODE; - val |= PORT_LINK_DLL_LINK_EN; - dw_pcie_writel_dbi(pci, PCIE_PORT_LINK_CONTROL, val); - if (of_property_read_bool(np, "snps,enable-cdm-check")) { val = dw_pcie_readl_dbi(pci, PCIE_PL_CHK_REG_CONTROL_STATUS); val |= PCIE_PL_CHK_REG_CHK_REG_CONTINUOUS | @@ -657,6 +652,11 @@ void dw_pcie_setup(struct dw_pcie *pci) dw_pcie_writel_dbi(pci, PCIE_PL_CHK_REG_CONTROL_STATUS, val); } + val = dw_pcie_readl_dbi(pci, PCIE_PORT_LINK_CONTROL); + val &= ~PORT_LINK_FAST_LINK_MODE; + val |= PORT_LINK_DLL_LINK_EN; + dw_pcie_writel_dbi(pci, PCIE_PORT_LINK_CONTROL, val); + of_property_read_u32(np, "num-lanes", &pci->num_lanes); if (!pci->num_lanes) { dev_dbg(pci->dev, "Using h/w default number of lanes\n"); From 6c69f1ab7bdc97b2df37a9d3f7bbdb45ddbeab17 Mon Sep 17 00:00:00 2001 From: Mark Pearson Date: Sun, 19 Mar 2023 20:32:18 -0400 Subject: [PATCH 071/179] platform/x86: think-lmi: add missing type attribute MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 583329dcf22e568a328a944f20427ccfc95dce01 ] This driver was missing the mandatory type attribute...oops. Add it in along with logic to determine whether the attribute is an enumeration type or a string by parsing the possible_values attribute. Upstream bug https://bugzilla.kernel.org/show_bug.cgi?id=216460 Fixes: a40cd7ef22fb ("platform/x86: think-lmi: Add WMI interface support on Lenovo platforms") Signed-off-by: Mark Pearson Link: https://lore.kernel.org/r/20230320003221.561750-1-mpearson-lenovo@squebb.ca Reviewed-by: Thomas Weißschuh Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede Signed-off-by: Sasha Levin --- drivers/platform/x86/think-lmi.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/drivers/platform/x86/think-lmi.c b/drivers/platform/x86/think-lmi.c index a01a92769c1a..07c9dc21eff5 100644 --- a/drivers/platform/x86/think-lmi.c +++ b/drivers/platform/x86/think-lmi.c @@ -947,6 +947,20 @@ static ssize_t possible_values_show(struct kobject *kobj, struct kobj_attribute return sysfs_emit(buf, "%s\n", setting->possible_values); } +static ssize_t type_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct tlmi_attr_setting *setting = to_tlmi_attr_setting(kobj); + + if (setting->possible_values) { + /* Figure out what setting type is as BIOS does not return this */ + if (strchr(setting->possible_values, ',')) + return sysfs_emit(buf, "enumeration\n"); + } + /* Anything else is going to be a string */ + return sysfs_emit(buf, "string\n"); +} + static ssize_t current_value_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) @@ -1036,10 +1050,13 @@ static struct kobj_attribute attr_possible_values = __ATTR_RO(possible_values); static struct kobj_attribute attr_current_val = __ATTR_RW_MODE(current_value, 0600); +static struct kobj_attribute attr_type = __ATTR_RO(type); + static struct attribute *tlmi_attrs[] = { &attr_displ_name.attr, &attr_current_val.attr, &attr_possible_values.attr, + &attr_type.attr, NULL }; From 3991efd0c1aa989a23c4b922c7d36a31b6aab83c Mon Sep 17 00:00:00 2001 From: Mark Pearson Date: Sun, 19 Mar 2023 20:32:19 -0400 Subject: [PATCH 072/179] platform/x86: think-lmi: use correct possible_values delimiters MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 45e21289bfc6e257885514790a8a8887da822d40 ] firmware-attributes class requires that possible values are delimited using ';' but the Lenovo firmware uses ',' instead. Parse string and replace where appropriate. Suggested-by: Thomas Weißschuh Fixes: a40cd7ef22fb ("platform/x86: think-lmi: Add WMI interface support on Lenovo platforms") Signed-off-by: Mark Pearson Link: https://lore.kernel.org/r/20230320003221.561750-2-mpearson-lenovo@squebb.ca Reviewed-by: Thomas Weißschuh Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede Signed-off-by: Sasha Levin --- drivers/platform/x86/think-lmi.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/platform/x86/think-lmi.c b/drivers/platform/x86/think-lmi.c index 07c9dc21eff5..62241680c8a9 100644 --- a/drivers/platform/x86/think-lmi.c +++ b/drivers/platform/x86/think-lmi.c @@ -954,7 +954,7 @@ static ssize_t type_show(struct kobject *kobj, struct kobj_attribute *attr, if (setting->possible_values) { /* Figure out what setting type is as BIOS does not return this */ - if (strchr(setting->possible_values, ',')) + if (strchr(setting->possible_values, ';')) return sysfs_emit(buf, "enumeration\n"); } /* Anything else is going to be a string */ @@ -1441,6 +1441,13 @@ static int tlmi_analyze(void) pr_info("Error retrieving possible values for %d : %s\n", i, setting->display_name); } + /* + * firmware-attributes requires that possible_values are separated by ';' but + * Lenovo FW uses ','. Replace appropriately. + */ + if (setting->possible_values) + strreplace(setting->possible_values, ',', ';'); + kobject_init(&setting->kobj, &tlmi_attr_setting_ktype); tlmi_priv.setting[i] = setting; kfree(item); From 5b2e50d837f2c55a1101e26c4250f4462eb16f75 Mon Sep 17 00:00:00 2001 From: Mark Pearson Date: Sun, 19 Mar 2023 20:32:20 -0400 Subject: [PATCH 073/179] platform/x86: think-lmi: only display possible_values if available MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit cf337f27f3bfc4aeab4954c468239fd6233c7638 ] Some attributes don't have any values available. In those cases don't make the possible_values entry visible. Fixes: a40cd7ef22fb ("platform/x86: think-lmi: Add WMI interface support on Lenovo platforms") Signed-off-by: Mark Pearson Link: https://lore.kernel.org/r/20230320003221.561750-3-mpearson-lenovo@squebb.ca Reviewed-by: Thomas Weißschuh Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede Signed-off-by: Sasha Levin --- drivers/platform/x86/think-lmi.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/drivers/platform/x86/think-lmi.c b/drivers/platform/x86/think-lmi.c index 62241680c8a9..ccd085bacf29 100644 --- a/drivers/platform/x86/think-lmi.c +++ b/drivers/platform/x86/think-lmi.c @@ -941,9 +941,6 @@ static ssize_t possible_values_show(struct kobject *kobj, struct kobj_attribute { struct tlmi_attr_setting *setting = to_tlmi_attr_setting(kobj); - if (!tlmi_priv.can_get_bios_selections) - return -EOPNOTSUPP; - return sysfs_emit(buf, "%s\n", setting->possible_values); } @@ -1052,6 +1049,18 @@ static struct kobj_attribute attr_current_val = __ATTR_RW_MODE(current_value, 06 static struct kobj_attribute attr_type = __ATTR_RO(type); +static umode_t attr_is_visible(struct kobject *kobj, + struct attribute *attr, int n) +{ + struct tlmi_attr_setting *setting = to_tlmi_attr_setting(kobj); + + /* We don't want to display possible_values attributes if not available */ + if ((attr == &attr_possible_values.attr) && (!setting->possible_values)) + return 0; + + return attr->mode; +} + static struct attribute *tlmi_attrs[] = { &attr_displ_name.attr, &attr_current_val.attr, @@ -1061,6 +1070,7 @@ static struct attribute *tlmi_attrs[] = { }; static const struct attribute_group tlmi_attr_group = { + .is_visible = attr_is_visible, .attrs = tlmi_attrs, }; From f0a67ad7dce49d93570edc795e0312bb787f19bb Mon Sep 17 00:00:00 2001 From: Mark Pearson Date: Sun, 19 Mar 2023 20:32:21 -0400 Subject: [PATCH 074/179] platform/x86: think-lmi: Add possible_values for ThinkStation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 8a02d70679fc1c434401863333c8ea7dbf201494 ] ThinkStation platforms don't support the API to return possible_values but instead embed it in the settings string. Try and extract this information and set the possible_values attribute appropriately. Fixes: a40cd7ef22fb ("platform/x86: think-lmi: Add WMI interface support on Lenovo platforms") Signed-off-by: Mark Pearson Link: https://lore.kernel.org/r/20230320003221.561750-4-mpearson-lenovo@squebb.ca Reviewed-by: Thomas Weißschuh Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede Signed-off-by: Sasha Levin --- drivers/platform/x86/think-lmi.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/drivers/platform/x86/think-lmi.c b/drivers/platform/x86/think-lmi.c index ccd085bacf29..74af3e593b2c 100644 --- a/drivers/platform/x86/think-lmi.c +++ b/drivers/platform/x86/think-lmi.c @@ -1450,6 +1450,26 @@ static int tlmi_analyze(void) if (ret || !setting->possible_values) pr_info("Error retrieving possible values for %d : %s\n", i, setting->display_name); + } else { + /* + * Older Thinkstations don't support the bios_selections API. + * Instead they store this as a [Optional:Option1,Option2] section of the + * name string. + * Try and pull that out if it's available. + */ + char *item, *optstart, *optend; + + if (!tlmi_setting(setting->index, &item, LENOVO_BIOS_SETTING_GUID)) { + optstart = strstr(item, "[Optional:"); + if (optstart) { + optstart += strlen("[Optional:"); + optend = strstr(optstart, "]"); + if (optend) + setting->possible_values = + kstrndup(optstart, optend - optstart, + GFP_KERNEL); + } + } } /* * firmware-attributes requires that possible_values are separated by ';' but From 53dc0b69fbac82ab6b2286728e100a77a862fa83 Mon Sep 17 00:00:00 2001 From: Liang He Date: Wed, 22 Mar 2023 11:30:57 +0800 Subject: [PATCH 075/179] platform/surface: aggregator: Add missing fwnode_handle_put() [ Upstream commit acd0acb802b90f88d19ad4337183e44fd0f77c50 ] In fwnode_for_each_child_node(), we should add fwnode_handle_put() when break out of the iteration fwnode_for_each_child_node() as it will automatically increase and decrease the refcounter. Fixes: fc622b3d36e6 ("platform/surface: Set up Surface Aggregator device registry") Signed-off-by: Liang He Reviewed-by: Maximilian Luz Link: https://lore.kernel.org/r/20230322033057.1855741-1-windhl@126.com Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede Signed-off-by: Sasha Levin --- drivers/platform/surface/aggregator/bus.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/platform/surface/aggregator/bus.c b/drivers/platform/surface/aggregator/bus.c index de539938896e..b501a79f2a08 100644 --- a/drivers/platform/surface/aggregator/bus.c +++ b/drivers/platform/surface/aggregator/bus.c @@ -485,8 +485,10 @@ int __ssam_register_clients(struct device *parent, struct ssam_controller *ctrl, * device, so ignore it and continue with the next one. */ status = ssam_add_client_device(parent, ctrl, child); - if (status && status != -ENODEV) + if (status && status != -ENODEV) { + fwnode_handle_put(child); goto err; + } } return 0; From 43b70c9c4c903ec400de32604f298b3147a1ab0a Mon Sep 17 00:00:00 2001 From: Arseniy Krasnov Date: Mon, 13 Mar 2023 10:32:44 +0300 Subject: [PATCH 076/179] mtd: rawnand: meson: invalidate cache on polling ECC bit [ Upstream commit e732e39ed9929c05fd219035bc9653ba4100d4fa ] 'info_buf' memory is cached and driver polls ECC bit in it. This bit is set by the NAND controller. If 'usleep_range()' returns before device sets this bit, 'info_buf' will be cached and driver won't see update of this bit and will loop forever. Fixes: 8fae856c5350 ("mtd: rawnand: meson: add support for Amlogic NAND flash controller") Signed-off-by: Arseniy Krasnov Reviewed-by: Neil Armstrong Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/d4ef0bd6-816e-f6fa-9385-f05f775f0ae2@sberdevices.ru Signed-off-by: Sasha Levin --- drivers/mtd/nand/raw/meson_nand.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/mtd/nand/raw/meson_nand.c b/drivers/mtd/nand/raw/meson_nand.c index 30e326adabfc..a28574c00900 100644 --- a/drivers/mtd/nand/raw/meson_nand.c +++ b/drivers/mtd/nand/raw/meson_nand.c @@ -176,6 +176,7 @@ struct meson_nfc { dma_addr_t daddr; dma_addr_t iaddr; + u32 info_bytes; unsigned long assigned_cs; }; @@ -503,6 +504,7 @@ static int meson_nfc_dma_buffer_setup(struct nand_chip *nand, void *databuf, nfc->daddr, datalen, dir); return ret; } + nfc->info_bytes = infolen; cmd = GENCMDIADDRL(NFC_CMD_AIL, nfc->iaddr); writel(cmd, nfc->reg_base + NFC_REG_CMD); @@ -520,8 +522,10 @@ static void meson_nfc_dma_buffer_release(struct nand_chip *nand, struct meson_nfc *nfc = nand_get_controller_data(nand); dma_unmap_single(nfc->dev, nfc->daddr, datalen, dir); - if (infolen) + if (infolen) { dma_unmap_single(nfc->dev, nfc->iaddr, infolen, dir); + nfc->info_bytes = 0; + } } static int meson_nfc_read_buf(struct nand_chip *nand, u8 *buf, int len) @@ -710,6 +714,8 @@ static void meson_nfc_check_ecc_pages_valid(struct meson_nfc *nfc, usleep_range(10, 15); /* info is updated by nfc dma engine*/ smp_rmb(); + dma_sync_single_for_cpu(nfc->dev, nfc->iaddr, nfc->info_bytes, + DMA_FROM_DEVICE); ret = *info & ECC_COMPLETE; } while (!ret); } From c5a159d5e5591bc9336048db05752d018f3cd9fe Mon Sep 17 00:00:00 2001 From: Siddharth Kawar Date: Mon, 20 Mar 2023 20:37:40 +0000 Subject: [PATCH 077/179] SUNRPC: fix shutdown of NFS TCP client socket [ Upstream commit 943d045a6d796175e5d08f9973953b1d2c07d797 ] NFS server Duplicate Request Cache (DRC) algorithms rely on NFS clients reconnecting using the same local TCP port. Unique NFS operations are identified by the per-TCP connection set of XIDs. This prevents file corruption when non-idempotent NFS operations are retried. Currently, NFS client TCP connections are using different local TCP ports when reconnecting to NFS servers. After an NFS server initiates shutdown of the TCP connection, the NFS client's TCP socket is set to NULL after the socket state has reached TCP_LAST_ACK(9). When reconnecting, the new socket attempts to reuse the same local port but fails with EADDRNOTAVAIL (99). This forces the socket to use a different local TCP port to reconnect to the remote NFS server. State Transition and Events: TCP_CLOSE_WAIT(8) TCP_LAST_ACK(9) connect(fail EADDRNOTAVAIL(99)) TCP_CLOSE(7) bind on new port connect success dmesg excerpts showing reconnect switching from TCP local port of 926 to 763 after commit 7c81e6a9d75b: [13354.947854] NFS call mkdir testW ... [13405.654781] RPC: xs_tcp_state_change client 00000000037d0f03... [13405.654813] RPC: state 8 conn 1 dead 0 zapped 1 sk_shutdown 1 [13405.654826] RPC: xs_data_ready... [13405.654892] RPC: xs_tcp_state_change client 00000000037d0f03... [13405.654895] RPC: state 9 conn 0 dead 0 zapped 1 sk_shutdown 3 [13405.654899] RPC: xs_tcp_state_change client 00000000037d0f03... [13405.654900] RPC: state 9 conn 0 dead 0 zapped 1 sk_shutdown 3 [13405.654950] RPC: xs_connect scheduled xprt 00000000037d0f03 [13405.654975] RPC: xs_bind 0.0.0.0:926: ok (0) [13405.654980] RPC: worker connecting xprt 00000000037d0f03 via tcp to 10.101.6.228 (port 2049) [13405.654991] RPC: 00000000037d0f03 connect status 99 connected 0 sock state 7 [13405.655001] RPC: xs_tcp_state_change client 00000000037d0f03... [13405.655002] RPC: state 7 conn 0 dead 0 zapped 1 sk_shutdown 3 [13405.655024] RPC: xs_connect scheduled xprt 00000000037d0f03 [13405.655038] RPC: xs_bind 0.0.0.0:763: ok (0) [13405.655041] RPC: worker connecting xprt 00000000037d0f03 via tcp to 10.101.6.228 (port 2049) [13405.655065] RPC: 00000000037d0f03 connect status 115 connected 0 sock state 2 State Transition and Events with patch applied: TCP_CLOSE_WAIT(8) TCP_LAST_ACK(9) TCP_CLOSE(7) connect(reuse of port succeeds) dmesg excerpts showing reconnect on same TCP local port of 936 with patch applied: [ 257.139935] NFS: mkdir(0:59/560857152), testQ [ 257.139937] NFS call mkdir testQ ... [ 307.822702] RPC: state 8 conn 1 dead 0 zapped 1 sk_shutdown 1 [ 307.822714] RPC: xs_data_ready... [ 307.822817] RPC: xs_tcp_state_change client 00000000ce702f14... [ 307.822821] RPC: state 9 conn 0 dead 0 zapped 1 sk_shutdown 3 [ 307.822825] RPC: xs_tcp_state_change client 00000000ce702f14... [ 307.822826] RPC: state 9 conn 0 dead 0 zapped 1 sk_shutdown 3 [ 307.823606] RPC: xs_tcp_state_change client 00000000ce702f14... [ 307.823609] RPC: state 7 conn 0 dead 0 zapped 1 sk_shutdown 3 [ 307.823629] RPC: xs_tcp_state_change client 00000000ce702f14... [ 307.823632] RPC: state 7 conn 0 dead 0 zapped 1 sk_shutdown 3 [ 307.823676] RPC: xs_connect scheduled xprt 00000000ce702f14 [ 307.823704] RPC: xs_bind 0.0.0.0:936: ok (0) [ 307.823709] RPC: worker connecting xprt 00000000ce702f14 via tcp to 10.101.1.30 (port 2049) [ 307.823748] RPC: 00000000ce702f14 connect status 115 connected 0 sock state 2 ... [ 314.916193] RPC: state 7 conn 0 dead 0 zapped 1 sk_shutdown 3 [ 314.916251] RPC: xs_connect scheduled xprt 00000000ce702f14 [ 314.916282] RPC: xs_bind 0.0.0.0:936: ok (0) [ 314.916292] RPC: worker connecting xprt 00000000ce702f14 via tcp to 10.101.1.30 (port 2049) [ 314.916342] RPC: 00000000ce702f14 connect status 115 connected 0 sock state 2 Fixes: 7c81e6a9d75b ("SUNRPC: Tweak TCP socket shutdown in the RPC client") Signed-off-by: Siddharth Rajendra Kawar Signed-off-by: Anna Schumaker Signed-off-by: Sasha Levin --- net/sunrpc/xprtsock.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index b3ab6d9d752e..05aa32696e7c 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2153,6 +2153,7 @@ static void xs_tcp_shutdown(struct rpc_xprt *xprt) switch (skst) { case TCP_FIN_WAIT1: case TCP_FIN_WAIT2: + case TCP_LAST_ACK: break; case TCP_ESTABLISHED: case TCP_CLOSE_WAIT: From 1da26860a348e898120deade0ee4813039caee7a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8D=C3=B1igo=20Huguet?= Date: Thu, 23 Mar 2023 09:34:17 +0100 Subject: [PATCH 078/179] sfc: ef10: don't overwrite offload features at NIC reset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit ca4a80e4bb7e87daf33b27d2ab9e4f5311018a89 ] At NIC reset, some offload features related to encapsulated traffic might have changed (this mainly happens if the firmware-variant is changed with the sfboot userspace tool). Because of this, features are checked and set again at reset time. However, this was not done right, and some features were improperly overwritten at NIC reset: - Tunneled IPv6 segmentation was always disabled - Features disabled with ethtool were reenabled - Features that becomes unsupported after the reset were not disabled Also, checking if the device supports IPV6_CSUM to enable TSO6 is no longer necessary because all currently supported devices support it. Additionally, move the assignment of some other features to the EF10_OFFLOAD_FEATURES macro, like it is done in ef100, leaving the selection of features in efx_pci_probe_post_io a bit cleaner. Fixes: ffffd2454a7a ("sfc: correctly advertise tunneled IPv6 segmentation") Fixes: 24b2c3751aa3 ("sfc: advertise encapsulated offloads on EF10") Reported-by: Tianhao Zhao Suggested-by: Jonathan Cooper Tested-by: Jonathan Cooper Signed-off-by: Íñigo Huguet Acked-by: Edward Cree Link: https://lore.kernel.org/r/20230323083417.7345-1-ihuguet@redhat.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/net/ethernet/sfc/ef10.c | 40 ++++++++++++++++++++++----------- drivers/net/ethernet/sfc/efx.c | 17 ++++++-------- 2 files changed, 34 insertions(+), 23 deletions(-) diff --git a/drivers/net/ethernet/sfc/ef10.c b/drivers/net/ethernet/sfc/ef10.c index 7022fb2005a2..d30459dbfe8f 100644 --- a/drivers/net/ethernet/sfc/ef10.c +++ b/drivers/net/ethernet/sfc/ef10.c @@ -1304,7 +1304,8 @@ static void efx_ef10_fini_nic(struct efx_nic *efx) static int efx_ef10_init_nic(struct efx_nic *efx) { struct efx_ef10_nic_data *nic_data = efx->nic_data; - netdev_features_t hw_enc_features = 0; + struct net_device *net_dev = efx->net_dev; + netdev_features_t tun_feats, tso_feats; int rc; if (nic_data->must_check_datapath_caps) { @@ -1349,20 +1350,30 @@ static int efx_ef10_init_nic(struct efx_nic *efx) nic_data->must_restore_piobufs = false; } - /* add encapsulated checksum offload features */ + /* encap features might change during reset if fw variant changed */ if (efx_has_cap(efx, VXLAN_NVGRE) && !efx_ef10_is_vf(efx)) - hw_enc_features |= NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM; - /* add encapsulated TSO features */ + net_dev->hw_enc_features |= NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM; + else + net_dev->hw_enc_features &= ~(NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM); + + tun_feats = NETIF_F_GSO_UDP_TUNNEL | NETIF_F_GSO_GRE | + NETIF_F_GSO_UDP_TUNNEL_CSUM | NETIF_F_GSO_GRE_CSUM; + tso_feats = NETIF_F_TSO | NETIF_F_TSO6; + if (efx_has_cap(efx, TX_TSO_V2_ENCAP)) { - netdev_features_t encap_tso_features; - - encap_tso_features = NETIF_F_GSO_UDP_TUNNEL | NETIF_F_GSO_GRE | - NETIF_F_GSO_UDP_TUNNEL_CSUM | NETIF_F_GSO_GRE_CSUM; - - hw_enc_features |= encap_tso_features | NETIF_F_TSO; - efx->net_dev->features |= encap_tso_features; + /* If this is first nic_init, or if it is a reset and a new fw + * variant has added new features, enable them by default. + * If the features are not new, maintain their current value. + */ + if (!(net_dev->hw_features & tun_feats)) + net_dev->features |= tun_feats; + net_dev->hw_enc_features |= tun_feats | tso_feats; + net_dev->hw_features |= tun_feats; + } else { + net_dev->hw_enc_features &= ~(tun_feats | tso_feats); + net_dev->hw_features &= ~tun_feats; + net_dev->features &= ~tun_feats; } - efx->net_dev->hw_enc_features = hw_enc_features; /* don't fail init if RSS setup doesn't work */ rc = efx->type->rx_push_rss_config(efx, false, @@ -4021,7 +4032,10 @@ static unsigned int efx_ef10_recycle_ring_size(const struct efx_nic *efx) NETIF_F_HW_VLAN_CTAG_FILTER | \ NETIF_F_IPV6_CSUM | \ NETIF_F_RXHASH | \ - NETIF_F_NTUPLE) + NETIF_F_NTUPLE | \ + NETIF_F_SG | \ + NETIF_F_RXCSUM | \ + NETIF_F_RXALL) const struct efx_nic_type efx_hunt_a0_vf_nic_type = { .is_vf = true, diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c index 3a86f1213a05..6a1bff54bc6c 100644 --- a/drivers/net/ethernet/sfc/efx.c +++ b/drivers/net/ethernet/sfc/efx.c @@ -1001,21 +1001,18 @@ static int efx_pci_probe_post_io(struct efx_nic *efx) } /* Determine netdevice features */ - net_dev->features |= (efx->type->offload_features | NETIF_F_SG | - NETIF_F_TSO | NETIF_F_RXCSUM | NETIF_F_RXALL); - if (efx->type->offload_features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM)) { - net_dev->features |= NETIF_F_TSO6; - if (efx_has_cap(efx, TX_TSO_V2_ENCAP)) - net_dev->hw_enc_features |= NETIF_F_TSO6; - } - /* Check whether device supports TSO */ - if (!efx->type->tso_versions || !efx->type->tso_versions(efx)) - net_dev->features &= ~NETIF_F_ALL_TSO; + net_dev->features |= efx->type->offload_features; + + /* Add TSO features */ + if (efx->type->tso_versions && efx->type->tso_versions(efx)) + net_dev->features |= NETIF_F_TSO | NETIF_F_TSO6; + /* Mask for features that also apply to VLAN devices */ net_dev->vlan_features |= (NETIF_F_HW_CSUM | NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_ALL_TSO | NETIF_F_RXCSUM); + /* Determine user configurable features */ net_dev->hw_features |= net_dev->features & ~efx->fixed_features; /* Disable receiving frames with bad FCS, by default. */ From 9526222c1a11a18a0e038a28c5597edde394dc8a Mon Sep 17 00:00:00 2001 From: Tomas Henzl Date: Fri, 24 Mar 2023 16:01:34 +0100 Subject: [PATCH 079/179] scsi: megaraid_sas: Fix crash after a double completion [ Upstream commit 2309df27111a51734cb9240b4d3c25f2f3c6ab06 ] When a physical disk is attached directly "without JBOD MAP support" (see megasas_get_tm_devhandle()) then there is no real error handling in the driver. Return FAILED instead of SUCCESS. Fixes: 18365b138508 ("megaraid_sas: Task management support") Signed-off-by: Tomas Henzl Link: https://lore.kernel.org/r/20230324150134.14696-1-thenzl@redhat.com Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin --- drivers/scsi/megaraid/megaraid_sas_fusion.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/megaraid/megaraid_sas_fusion.c b/drivers/scsi/megaraid/megaraid_sas_fusion.c index 6650f8c8e9b0..af22ffa8f6a2 100644 --- a/drivers/scsi/megaraid/megaraid_sas_fusion.c +++ b/drivers/scsi/megaraid/megaraid_sas_fusion.c @@ -4768,7 +4768,7 @@ int megasas_task_abort_fusion(struct scsi_cmnd *scmd) devhandle = megasas_get_tm_devhandle(scmd->device); if (devhandle == (u16)ULONG_MAX) { - ret = SUCCESS; + ret = FAILED; sdev_printk(KERN_INFO, scmd->device, "task abort issued for invalid devhandle\n"); mutex_unlock(&instance->reset_mutex); @@ -4838,7 +4838,7 @@ int megasas_reset_target_fusion(struct scsi_cmnd *scmd) devhandle = megasas_get_tm_devhandle(scmd->device); if (devhandle == (u16)ULONG_MAX) { - ret = SUCCESS; + ret = FAILED; sdev_printk(KERN_INFO, scmd->device, "target reset issued for invalid devhandle\n"); mutex_unlock(&instance->reset_mutex); From 089e6318e1de220ec85f55bcc66d69cecd49bae3 Mon Sep 17 00:00:00 2001 From: Jerry Snitselaar Date: Fri, 24 Mar 2023 12:32:04 -0700 Subject: [PATCH 080/179] scsi: mpt3sas: Don't print sense pool info twice [ Upstream commit d684a7a26f7d2c7122a4581ac966ed64e88fb29c ] _base_allocate_sense_dma_pool() already prints out the sense pool information, so don't print it a second time after calling it in _base_allocate_memory_pools(). In addition the version in _base_allocate_memory_pools() was using the wrong size value, sz, which was last assigned when doing some nvme calculations instead of sense_sz to determine the pool size in kilobytes. Cc: Sathya Prakash Cc: Sreekanth Reddy Cc: Suganath Prabu Subramani Cc: MPT-FusionLinux.pdl@broadcom.com Cc: "Martin K. Petersen" Cc: "James E.J. Bottomley" Fixes: 970ac2bb70e7 ("scsi: mpt3sas: Force sense buffer allocations to be within same 4 GB region") Signed-off-by: Jerry Snitselaar Link: https://lore.kernel.org/r/20230324193204.567932-1-jsnitsel@redhat.com Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin --- drivers/scsi/mpt3sas/mpt3sas_base.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/scsi/mpt3sas/mpt3sas_base.c b/drivers/scsi/mpt3sas/mpt3sas_base.c index 2ee9ea57554d..14ae0a9c5d3d 100644 --- a/drivers/scsi/mpt3sas/mpt3sas_base.c +++ b/drivers/scsi/mpt3sas/mpt3sas_base.c @@ -6616,11 +6616,6 @@ _base_allocate_memory_pools(struct MPT3SAS_ADAPTER *ioc) else if (rc == -EAGAIN) goto try_32bit_dma; total_sz += sense_sz; - ioc_info(ioc, - "sense pool(0x%p)- dma(0x%llx): depth(%d)," - "element_size(%d), pool_size(%d kB)\n", - ioc->sense, (unsigned long long)ioc->sense_dma, ioc->scsiio_depth, - SCSI_SENSE_BUFFERSIZE, sz / 1024); /* reply pool, 4 byte align */ sz = ioc->reply_free_queue_depth * ioc->reply_sz; rc = _base_allocate_reply_pool(ioc, sz); From cc0f9bb99735d2b68fac68f37b585d615728ce5b Mon Sep 17 00:00:00 2001 From: Ahmad Fatoum Date: Thu, 23 Mar 2023 11:37:35 +0100 Subject: [PATCH 081/179] net: dsa: realtek: fix out-of-bounds access MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit b93eb564869321d0dffaf23fcc5c88112ed62466 ] The probe function sets priv->chip_data to (void *)priv + sizeof(*priv) with the expectation that priv has enough trailing space. However, only realtek-smi actually allocated this chip_data space. Do likewise in realtek-mdio to fix out-of-bounds accesses. These accesses likely went unnoticed so far, because of an (unused) buf[4096] member in struct realtek_priv, which caused kmalloc to round up the allocated buffer to a big enough size, so nothing of value was overwritten. With a different allocator (like in the barebox bootloader port of the driver) or with KASAN, the memory corruption becomes quickly apparent. Fixes: aac94001067d ("net: dsa: realtek: add new mdio interface for drivers") Reviewed-by: Florian Fainelli Reviewed-by: Luiz Angelo Daros de Luca Reviewed-by: Alvin Šipraga Reviewed-by: Linus Walleij Signed-off-by: Ahmad Fatoum Link: https://lore.kernel.org/r/20230323103735.2331786-1-a.fatoum@pengutronix.de Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/net/dsa/realtek/realtek-mdio.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/dsa/realtek/realtek-mdio.c b/drivers/net/dsa/realtek/realtek-mdio.c index 3e54fac5f902..5a8fe707ca25 100644 --- a/drivers/net/dsa/realtek/realtek-mdio.c +++ b/drivers/net/dsa/realtek/realtek-mdio.c @@ -21,6 +21,7 @@ #include #include +#include #include #include "realtek.h" @@ -152,7 +153,9 @@ static int realtek_mdio_probe(struct mdio_device *mdiodev) if (!var) return -EINVAL; - priv = devm_kzalloc(&mdiodev->dev, sizeof(*priv), GFP_KERNEL); + priv = devm_kzalloc(&mdiodev->dev, + size_add(sizeof(*priv), var->chip_data_sz), + GFP_KERNEL); if (!priv) return -ENOMEM; From 43b4331ce0cd88ccba425e0702ba35c1a52daccf Mon Sep 17 00:00:00 2001 From: SongJingyi Date: Fri, 24 Mar 2023 11:14:06 +0800 Subject: [PATCH 082/179] ptp_qoriq: fix memory leak in probe() [ Upstream commit f33642224e38d7e0d59336e10e7b4e370b1c4506 ] Smatch complains that: drivers/ptp/ptp_qoriq.c ptp_qoriq_probe() warn: 'base' from ioremap() not released. Fix this by revising the parameter from 'ptp_qoriq->base' to 'base'. This is only a bug if ptp_qoriq_init() returns on the first -ENODEV error path. For other error paths ptp_qoriq->base and base are the same. And this change makes the code more readable. Fixes: 7f4399ba405b ("ptp_qoriq: fix NULL access if ptp dt node missing") Signed-off-by: SongJingyi Reviewed-by: Dan Carpenter Reviewed-by: Dongliang Mu Link: https://lore.kernel.org/r/20230324031406.1895159-1-u201912584@hust.edu.cn Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/ptp/ptp_qoriq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/ptp/ptp_qoriq.c b/drivers/ptp/ptp_qoriq.c index 08f4cf0ad9e3..8fa9772acf79 100644 --- a/drivers/ptp/ptp_qoriq.c +++ b/drivers/ptp/ptp_qoriq.c @@ -601,7 +601,7 @@ static int ptp_qoriq_probe(struct platform_device *dev) return 0; no_clock: - iounmap(ptp_qoriq->base); + iounmap(base); no_ioremap: release_resource(ptp_qoriq->rsrc); no_resource: From 9524c2ea47ccac230693135014880417102bddf7 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Fri, 24 Mar 2023 09:06:03 +0100 Subject: [PATCH 083/179] net: dsa: microchip: ksz8: fix ksz8_fdb_dump() [ Upstream commit 88e943e83827a349f70c3464b3eba7260be7461d ] Before this patch, the ksz8_fdb_dump() function had several issues, such as uninitialized variables and incorrect usage of source port as a bit mask. These problems caused inaccurate reporting of vid information and port assignment in the bridge fdb. Fixes: e587be759e6e ("net: dsa: microchip: update fdb add/del/dump in ksz_common") Signed-off-by: Oleksij Rempel Acked-by: Arun Ramadoss Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/dsa/microchip/ksz8795.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/net/dsa/microchip/ksz8795.c b/drivers/net/dsa/microchip/ksz8795.c index bd3b133e7085..22250ae222b5 100644 --- a/drivers/net/dsa/microchip/ksz8795.c +++ b/drivers/net/dsa/microchip/ksz8795.c @@ -907,15 +907,14 @@ int ksz8_fdb_dump(struct ksz_device *dev, int port, u16 entries = 0; u8 timestamp = 0; u8 fid; - u8 member; - struct alu_struct alu; + u8 src_port; + u8 mac[ETH_ALEN]; do { - alu.is_static = false; - ret = ksz8_r_dyn_mac_table(dev, i, alu.mac, &fid, &member, + ret = ksz8_r_dyn_mac_table(dev, i, mac, &fid, &src_port, ×tamp, &entries); - if (!ret && (member & BIT(port))) { - ret = cb(alu.mac, alu.fid, alu.is_static, data); + if (!ret && port == src_port) { + ret = cb(mac, fid, false, data); if (ret) break; } From 91840597869daf7faa35e6387f5b9636678b9e9c Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Fri, 24 Mar 2023 09:06:04 +0100 Subject: [PATCH 084/179] net: dsa: microchip: ksz8: fix ksz8_fdb_dump() to extract all 1024 entries [ Upstream commit 5d90492dd4ff50ad65c582c76c345d0b90001728 ] Current ksz8_fdb_dump() is able to extract only max 249 entries on the ksz8863/ksz8873 series of switches. This happened due to wrong bit mask and offset calculation. This commit corrects the issue and allows for the complete extraction of all 1024 entries. Fixes: 4b20a07e103f ("net: dsa: microchip: ksz8795: add support for ksz88xx chips") Signed-off-by: Oleksij Rempel Acked-by: Arun Ramadoss Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/dsa/microchip/ksz_common.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/dsa/microchip/ksz_common.c b/drivers/net/dsa/microchip/ksz_common.c index 07f6776bba12..b15a0b844c34 100644 --- a/drivers/net/dsa/microchip/ksz_common.c +++ b/drivers/net/dsa/microchip/ksz_common.c @@ -360,10 +360,10 @@ static const u32 ksz8863_masks[] = { [STATIC_MAC_TABLE_FID] = GENMASK(29, 26), [STATIC_MAC_TABLE_OVERRIDE] = BIT(20), [STATIC_MAC_TABLE_FWD_PORTS] = GENMASK(18, 16), - [DYNAMIC_MAC_TABLE_ENTRIES_H] = GENMASK(5, 0), + [DYNAMIC_MAC_TABLE_ENTRIES_H] = GENMASK(1, 0), [DYNAMIC_MAC_TABLE_MAC_EMPTY] = BIT(7), [DYNAMIC_MAC_TABLE_NOT_READY] = BIT(7), - [DYNAMIC_MAC_TABLE_ENTRIES] = GENMASK(31, 28), + [DYNAMIC_MAC_TABLE_ENTRIES] = GENMASK(31, 24), [DYNAMIC_MAC_TABLE_FID] = GENMASK(19, 16), [DYNAMIC_MAC_TABLE_SRC_PORT] = GENMASK(21, 20), [DYNAMIC_MAC_TABLE_TIMESTAMP] = GENMASK(23, 22), @@ -373,7 +373,7 @@ static u8 ksz8863_shifts[] = { [VLAN_TABLE_MEMBERSHIP_S] = 16, [STATIC_MAC_FWD_PORTS] = 16, [STATIC_MAC_FID] = 22, - [DYNAMIC_MAC_ENTRIES_H] = 3, + [DYNAMIC_MAC_ENTRIES_H] = 8, [DYNAMIC_MAC_ENTRIES] = 24, [DYNAMIC_MAC_FID] = 16, [DYNAMIC_MAC_TIMESTAMP] = 24, From 628f76b8ae7b8d63f41b5fa9086b2060390a6096 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Fri, 24 Mar 2023 09:06:05 +0100 Subject: [PATCH 085/179] net: dsa: microchip: ksz8: fix offset for the timestamp filed [ Upstream commit b3177aab89be540dc50d2328427b073361093e38 ] We are using wrong offset, so we will get not a timestamp. Fixes: 4b20a07e103f ("net: dsa: microchip: ksz8795: add support for ksz88xx chips") Signed-off-by: Oleksij Rempel Acked-by: Arun Ramadoss Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/dsa/microchip/ksz_common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/dsa/microchip/ksz_common.c b/drivers/net/dsa/microchip/ksz_common.c index b15a0b844c34..160d7ad26ca0 100644 --- a/drivers/net/dsa/microchip/ksz_common.c +++ b/drivers/net/dsa/microchip/ksz_common.c @@ -376,7 +376,7 @@ static u8 ksz8863_shifts[] = { [DYNAMIC_MAC_ENTRIES_H] = 8, [DYNAMIC_MAC_ENTRIES] = 24, [DYNAMIC_MAC_FID] = 16, - [DYNAMIC_MAC_TIMESTAMP] = 24, + [DYNAMIC_MAC_TIMESTAMP] = 22, [DYNAMIC_MAC_SRC_PORT] = 20, }; From 8d86ea65e0f19d978d8b359d081f45b7684301cc Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Fri, 24 Mar 2023 09:06:06 +0100 Subject: [PATCH 086/179] net: dsa: microchip: ksz8: ksz8_fdb_dump: avoid extracting ghost entry from empty dynamic MAC table. [ Upstream commit 492606cdc74804d372ab1bdb8f3ef4a6fb6f9f59 ] If the dynamic MAC table is empty, we will still extract one outdated entry. Fix it by using correct bit offset. Fixes: 4b20a07e103f ("net: dsa: microchip: ksz8795: add support for ksz88xx chips") Signed-off-by: Oleksij Rempel Acked-by: Arun Ramadoss Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/dsa/microchip/ksz_common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/dsa/microchip/ksz_common.c b/drivers/net/dsa/microchip/ksz_common.c index 160d7ad26ca0..286e081830e7 100644 --- a/drivers/net/dsa/microchip/ksz_common.c +++ b/drivers/net/dsa/microchip/ksz_common.c @@ -361,7 +361,7 @@ static const u32 ksz8863_masks[] = { [STATIC_MAC_TABLE_OVERRIDE] = BIT(20), [STATIC_MAC_TABLE_FWD_PORTS] = GENMASK(18, 16), [DYNAMIC_MAC_TABLE_ENTRIES_H] = GENMASK(1, 0), - [DYNAMIC_MAC_TABLE_MAC_EMPTY] = BIT(7), + [DYNAMIC_MAC_TABLE_MAC_EMPTY] = BIT(2), [DYNAMIC_MAC_TABLE_NOT_READY] = BIT(7), [DYNAMIC_MAC_TABLE_ENTRIES] = GENMASK(31, 24), [DYNAMIC_MAC_TABLE_FID] = GENMASK(19, 16), From adfe556652033b6f878edb760992965aba730537 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Fri, 24 Mar 2023 09:06:07 +0100 Subject: [PATCH 087/179] net: dsa: microchip: ksz8863_smi: fix bulk access [ Upstream commit 392ff7a84cbca34118ca286dfbfe8aee24605897 ] Current regmap bulk access is broken, resulting to wrong reads/writes if ksz_read64/ksz_write64 functions are used. Mostly this issue was visible by using ksz8_fdb_dump(), which returned corrupt MAC address. The reason is that regmap was configured to have max_raw_read/write, even if ksz8863_mdio_read/write functions are able to handle unlimited read/write accesses. On ksz_read64 function we are using multiple 32bit accesses by incrementing each access by 1 instead of 4. Resulting buffer had 01234567.12345678 instead of 01234567.89abcdef. We have multiple ways to fix it: - enable 4 byte alignment for 32bit accesses. Since the HW do not have this requirement. It will break driver. - disable max_raw_* limit. This patch is removing max_raw_* limit for regmap accesses in ksz8863_smi. Fixes: 60a364760002 ("net: dsa: microchip: Add Microchip KSZ8863 SMI based driver support") Signed-off-by: Oleksij Rempel Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/dsa/microchip/ksz8863_smi.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/drivers/net/dsa/microchip/ksz8863_smi.c b/drivers/net/dsa/microchip/ksz8863_smi.c index ddb40838181e..ed77ac222895 100644 --- a/drivers/net/dsa/microchip/ksz8863_smi.c +++ b/drivers/net/dsa/microchip/ksz8863_smi.c @@ -82,22 +82,16 @@ static const struct regmap_bus regmap_smi[] = { { .read = ksz8863_mdio_read, .write = ksz8863_mdio_write, - .max_raw_read = 1, - .max_raw_write = 1, }, { .read = ksz8863_mdio_read, .write = ksz8863_mdio_write, .val_format_endian_default = REGMAP_ENDIAN_BIG, - .max_raw_read = 2, - .max_raw_write = 2, }, { .read = ksz8863_mdio_read, .write = ksz8863_mdio_write, .val_format_endian_default = REGMAP_ENDIAN_BIG, - .max_raw_read = 4, - .max_raw_write = 4, } }; @@ -108,7 +102,6 @@ static const struct regmap_config ksz8863_regmap_config[] = { .pad_bits = 24, .val_bits = 8, .cache_type = REGCACHE_NONE, - .use_single_read = 1, .lock = ksz_regmap_lock, .unlock = ksz_regmap_unlock, }, @@ -118,7 +111,6 @@ static const struct regmap_config ksz8863_regmap_config[] = { .pad_bits = 24, .val_bits = 16, .cache_type = REGCACHE_NONE, - .use_single_read = 1, .lock = ksz_regmap_lock, .unlock = ksz_regmap_unlock, }, @@ -128,7 +120,6 @@ static const struct regmap_config ksz8863_regmap_config[] = { .pad_bits = 24, .val_bits = 32, .cache_type = REGCACHE_NONE, - .use_single_read = 1, .lock = ksz_regmap_lock, .unlock = ksz_regmap_unlock, } From 4ffa3fec1c3496c317eb54749b432f21cae77d32 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Fri, 24 Mar 2023 09:06:08 +0100 Subject: [PATCH 088/179] net: dsa: microchip: ksz8: fix MDB configuration with non-zero VID [ Upstream commit 9aa5757e1f71d85facdc3c98028762cbab8d15c7 ] FID is directly mapped to VID. However, configuring a MAC address with a VID != 0 resulted in incorrect configuration due to an incorrect bit mask. This kernel commit fixed the issue by correcting the bit mask and ensuring proper configuration of MAC addresses with non-zero VID. Fixes: 4b20a07e103f ("net: dsa: microchip: ksz8795: add support for ksz88xx chips") Signed-off-by: Oleksij Rempel Acked-by: Arun Ramadoss Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/dsa/microchip/ksz_common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/dsa/microchip/ksz_common.c b/drivers/net/dsa/microchip/ksz_common.c index 286e081830e7..3d59298eaa5c 100644 --- a/drivers/net/dsa/microchip/ksz_common.c +++ b/drivers/net/dsa/microchip/ksz_common.c @@ -357,7 +357,7 @@ static const u32 ksz8863_masks[] = { [VLAN_TABLE_VALID] = BIT(19), [STATIC_MAC_TABLE_VALID] = BIT(19), [STATIC_MAC_TABLE_USE_FID] = BIT(21), - [STATIC_MAC_TABLE_FID] = GENMASK(29, 26), + [STATIC_MAC_TABLE_FID] = GENMASK(25, 22), [STATIC_MAC_TABLE_OVERRIDE] = BIT(20), [STATIC_MAC_TABLE_FWD_PORTS] = GENMASK(18, 16), [DYNAMIC_MAC_TABLE_ENTRIES_H] = GENMASK(1, 0), From 1b808f5d844953a2208dbb56aa2003503edc8d7a Mon Sep 17 00:00:00 2001 From: ChunHao Lin Date: Thu, 23 Mar 2023 22:33:09 +0800 Subject: [PATCH 089/179] r8169: fix RTL8168H and RTL8107E rx crc error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 33189f0a94b9639c058781fcf82e4ea3803b1682 ] When link speed is 10 Mbps and temperature is under -20°C, RTL8168H and RTL8107E may have rx crc error. Disable phy 10 Mbps pll off to fix this issue. Fixes: 6e1d0b898818 ("r8169:add support for RTL8168H and RTL8107E") Signed-off-by: ChunHao Lin Reviewed-by: Heiner Kallweit Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/ethernet/realtek/r8169_phy_config.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/realtek/r8169_phy_config.c b/drivers/net/ethernet/realtek/r8169_phy_config.c index 930496cd34ed..b50f16786c24 100644 --- a/drivers/net/ethernet/realtek/r8169_phy_config.c +++ b/drivers/net/ethernet/realtek/r8169_phy_config.c @@ -826,6 +826,9 @@ static void rtl8168h_2_hw_phy_config(struct rtl8169_private *tp, /* disable phy pfm mode */ phy_modify_paged(phydev, 0x0a44, 0x11, BIT(7), 0); + /* disable 10m pll off */ + phy_modify_paged(phydev, 0x0a43, 0x10, BIT(0), 0); + rtl8168g_disable_aldps(phydev); rtl8168g_config_eee_phy(phydev); } From f70328a0bfb2e3dea05682638c9c0e0d7b8ce2f2 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 26 Mar 2023 10:29:33 +0200 Subject: [PATCH 090/179] regulator: Handle deferred clk [ Upstream commit 02bcba0b9f9da706d5bd1e8cbeb83493863e17b5 ] devm_clk_get() can return -EPROBE_DEFER. So it is better to return the error code from devm_clk_get(), instead of a hard coded -ENOENT. This gives more opportunities to successfully probe the driver. Fixes: 8959e5324485 ("regulator: fixed: add possibility to enable by clock") Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/18459fae3d017a66313699c7c8456b28158b2dd0.1679819354.git.christophe.jaillet@wanadoo.fr Signed-off-by: Mark Brown Signed-off-by: Sasha Levin --- drivers/regulator/fixed.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/regulator/fixed.c b/drivers/regulator/fixed.c index 2a9867abba20..e6724a229d23 100644 --- a/drivers/regulator/fixed.c +++ b/drivers/regulator/fixed.c @@ -215,7 +215,7 @@ static int reg_fixed_voltage_probe(struct platform_device *pdev) drvdata->enable_clock = devm_clk_get(dev, NULL); if (IS_ERR(drvdata->enable_clock)) { dev_err(dev, "Can't get enable-clock from devicetree\n"); - return -ENOENT; + return PTR_ERR(drvdata->enable_clock); } } else if (drvtype && drvtype->has_performance_state) { drvdata->desc.ops = &fixed_voltage_domain_ops; From c942f5cd63b7c2e73fe06744185a34b03267595b Mon Sep 17 00:00:00 2001 From: Faicker Mo Date: Fri, 24 Mar 2023 17:19:54 +0800 Subject: [PATCH 091/179] net/net_failover: fix txq exceeding warning [ Upstream commit e3cbdcb0fbb61045ef3ce0e072927cc41737f787 ] The failover txq is inited as 16 queues. when a packet is transmitted from the failover device firstly, the failover device will select the queue which is returned from the primary device if the primary device is UP and running. If the primary device txq is bigger than the default 16, it can lead to the following warning: eth0 selects TX queue 18, but real number of TX queues is 16 The warning backtrace is: [ 32.146376] CPU: 18 PID: 9134 Comm: chronyd Tainted: G E 6.2.8-1.el7.centos.x86_64 #1 [ 32.147175] Hardware name: Red Hat KVM, BIOS 1.10.2-3.el7_4.1 04/01/2014 [ 32.147730] Call Trace: [ 32.147971] [ 32.148183] dump_stack_lvl+0x48/0x70 [ 32.148514] dump_stack+0x10/0x20 [ 32.148820] netdev_core_pick_tx+0xb1/0xe0 [ 32.149180] __dev_queue_xmit+0x529/0xcf0 [ 32.149533] ? __check_object_size.part.0+0x21c/0x2c0 [ 32.149967] ip_finish_output2+0x278/0x560 [ 32.150327] __ip_finish_output+0x1fe/0x2f0 [ 32.150690] ip_finish_output+0x2a/0xd0 [ 32.151032] ip_output+0x7a/0x110 [ 32.151337] ? __pfx_ip_finish_output+0x10/0x10 [ 32.151733] ip_local_out+0x5e/0x70 [ 32.152054] ip_send_skb+0x19/0x50 [ 32.152366] udp_send_skb.isra.0+0x163/0x3a0 [ 32.152736] udp_sendmsg+0xba8/0xec0 [ 32.153060] ? __folio_memcg_unlock+0x25/0x60 [ 32.153445] ? __pfx_ip_generic_getfrag+0x10/0x10 [ 32.153854] ? sock_has_perm+0x85/0xa0 [ 32.154190] inet_sendmsg+0x6d/0x80 [ 32.154508] ? inet_sendmsg+0x6d/0x80 [ 32.154838] sock_sendmsg+0x62/0x70 [ 32.155152] ____sys_sendmsg+0x134/0x290 [ 32.155499] ___sys_sendmsg+0x81/0xc0 [ 32.155828] ? _get_random_bytes.part.0+0x79/0x1a0 [ 32.156240] ? ip4_datagram_release_cb+0x5f/0x1e0 [ 32.156649] ? get_random_u16+0x69/0xf0 [ 32.156989] ? __fget_light+0xcf/0x110 [ 32.157326] __sys_sendmmsg+0xc4/0x210 [ 32.157657] ? __sys_connect+0xb7/0xe0 [ 32.157995] ? __audit_syscall_entry+0xce/0x140 [ 32.158388] ? syscall_trace_enter.isra.0+0x12c/0x1a0 [ 32.158820] __x64_sys_sendmmsg+0x24/0x30 [ 32.159171] do_syscall_64+0x38/0x90 [ 32.159493] entry_SYSCALL_64_after_hwframe+0x72/0xdc Fix that by reducing txq number as the non-existent primary-dev does. Fixes: cfc80d9a1163 ("net: Introduce net_failover driver") Signed-off-by: Faicker Mo Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/net_failover.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/net/net_failover.c b/drivers/net/net_failover.c index 7a28e082436e..d0c916a53d7c 100644 --- a/drivers/net/net_failover.c +++ b/drivers/net/net_failover.c @@ -130,14 +130,10 @@ static u16 net_failover_select_queue(struct net_device *dev, txq = ops->ndo_select_queue(primary_dev, skb, sb_dev); else txq = netdev_pick_tx(primary_dev, skb, NULL); - - qdisc_skb_cb(skb)->slave_dev_queue_mapping = skb->queue_mapping; - - return txq; + } else { + txq = skb_rx_queue_recorded(skb) ? skb_get_rx_queue(skb) : 0; } - txq = skb_rx_queue_recorded(skb) ? skb_get_rx_queue(skb) : 0; - /* Save the original txq to restore before passing to the driver */ qdisc_skb_cb(skb)->slave_dev_queue_mapping = skb->queue_mapping; From ce1b88dd4013fa87bf80d3e675460f399daa508a Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sat, 25 Mar 2023 13:28:15 +0200 Subject: [PATCH 092/179] net: stmmac: don't reject VLANs when IFF_PROMISC is set [ Upstream commit a7602e7332b97cfbec7bacb0f1ade99a575fe104 ] The blamed commit has introduced the following tests to dwmac4_add_hw_vlan_rx_fltr(), called from stmmac_vlan_rx_add_vid(): if (hw->promisc) { netdev_err(dev, "Adding VLAN in promisc mode not supported\n"); return -EPERM; } "VLAN promiscuous" mode is keyed in this driver to IFF_PROMISC, and so, vlan_vid_add() and vlan_vid_del() calls cannot take place in IFF_PROMISC mode. I have the following 2 arguments that this restriction is.... hm, how shall I put it nicely... unproductive :) First, take the case of a Linux bridge. If the kernel is compiled with CONFIG_BRIDGE_VLAN_FILTERING=y, then this bridge shall have a VLAN database. The bridge shall try to call vlan_add_vid() on its bridge ports for each VLAN in the VLAN table. It will do this irrespectively of whether that port is *currently* VLAN-aware or not. So it will do this even when the bridge was created with vlan_filtering 0. But the Linux bridge, in VLAN-unaware mode, configures its ports in promiscuous (IFF_PROMISC) mode, so that they accept packets with any MAC DA (a switch must do this in order to forward those packets which are not directly targeted to its MAC address). As a result, the stmmac driver does not work as a bridge port, when the kernel is compiled with CONFIG_BRIDGE_VLAN_FILTERING=y. $ ip link add br0 type bridge && ip link set br0 up $ ip link set eth0 master br0 && ip link set eth0 up [ 2333.943296] br0: port 1(eth0) entered blocking state [ 2333.943381] br0: port 1(eth0) entered disabled state [ 2333.943782] device eth0 entered promiscuous mode [ 2333.944080] 4033c000.ethernet eth0: Adding VLAN in promisc mode not supported [ 2333.976509] 4033c000.ethernet eth0: failed to initialize vlan filtering on this port RTNETLINK answers: Operation not permitted Secondly, take the case of stmmac as DSA master. Some switch tagging protocols are based on 802.1Q VLANs (tag_sja1105.c), and as such, tag_8021q.c uses vlan_vid_add() to work with VLAN-filtering DSA masters. But also, when a DSA port becomes promiscuous (for example when it joins a bridge), the DSA framework also makes the DSA master promiscuous. Moreover, for every VLAN that a DSA switch sends to the CPU, DSA also programs a VLAN filter on the DSA master, because if the the DSA switch uses a tail tag, then the hardware frame parser of the DSA master will see VLAN as VLAN, and might filter them out, for being unknown. Due to the above 2 reasons, my belief is that the stmmac driver does not get to choose to not accept vlan_vid_add() calls while IFF_PROMISC is enabled, because the 2 are completely independent and there are code paths in the network stack which directly lead to this situation occurring, without the user's direct input. In fact, my belief is that "VLAN promiscuous" mode should have never been keyed on IFF_PROMISC in the first place, but rather, on the NETIF_F_HW_VLAN_CTAG_FILTER feature flag which can be toggled by the user through ethtool -k, when present in netdev->hw_features. In the stmmac driver, NETIF_F_HW_VLAN_CTAG_FILTER is only present in "features", making this feature "on [fixed]". I have this belief because I am unaware of any definition of promiscuity which implies having an effect on anything other than MAC DA (therefore not VLAN). However, I seem to be rather alone in having this opinion, looking back at the disagreements from this discussion: https://lore.kernel.org/netdev/20201110153958.ci5ekor3o2ekg3ky@ipetronik.com/ In any case, to remove the vlan_vid_add() dependency on !IFF_PROMISC, one would need to remove the check and see what fails. I guess the test was there because of the way in which dwmac4_vlan_promisc_enable() is implemented. For context, the dwmac4 supports Perfect Filtering for a limited number of VLANs - dwmac4_get_num_vlan(), priv->hw->num_vlan, with a fallback on Hash Filtering - priv->dma_cap.vlhash - see stmmac_vlan_update(), also visible in cat /sys/kernel/debug/stmmaceth/eth0/dma_cap | grep 'VLAN Hash Filtering'. The perfect filtering is based on MAC_VLAN_Tag_Filter/MAC_VLAN_Tag_Data registers, accessed in the driver through dwmac4_write_vlan_filter(). The hash filtering is based on the MAC_VLAN_Hash_Table register, named GMAC_VLAN_HASH_TABLE in the driver and accessed by dwmac4_update_vlan_hash(). The control bit for enabling hash filtering is GMAC_VLAN_VTHM (MAC_VLAN_Tag_Ctrl bit VTHM: VLAN Tag Hash Table Match Enable). Now, the description of dwmac4_vlan_promisc_enable() is that it iterates through the driver's cache of perfect filter entries (hw->vlan_filter[i], added by dwmac4_add_hw_vlan_rx_fltr()), and evicts them from hardware by unsetting their GMAC_VLAN_TAG_DATA_VEN (MAC_VLAN_Tag_Data bit VEN - VLAN Tag Enable) bit. Then it unsets the GMAC_VLAN_VTHM bit, which disables hash matching. This leaves the MAC, according to table "VLAN Match Status" from the documentation, to always enter these data paths: VID |VLAN Perfect Filter |VTHM Bit |VLAN Hash Filter |Final VLAN Match |Match Result | |Match Result |Status -------|--------------------|---------|-----------------|---------------- VID!=0 |Fail |0 |don't care |Pass So, dwmac4_vlan_promisc_enable() does its job, but by unsetting GMAC_VLAN_VTHM, it conflicts with the other code path which controls this bit: dwmac4_update_vlan_hash(), called through stmmac_update_vlan_hash() from stmmac_vlan_rx_add_vid() and from stmmac_vlan_rx_kill_vid(). This is, I guess, why dwmac4_add_hw_vlan_rx_fltr() is not allowed to run after dwmac4_vlan_promisc_enable() has unset GMAC_VLAN_VTHM: because if it did, then dwmac4_update_vlan_hash() would set GMAC_VLAN_VTHM again, breaking the "VLAN promiscuity". It turns out that dwmac4_vlan_promisc_enable() is way too complicated for what needs to be done. The MAC_Packet_Filter register also has the VTFE bit (VLAN Tag Filter Enable), which simply controls whether VLAN tagged packets which don't match the filtering tables (either perfect or hash) are dropped or not. At the moment, this driver unconditionally sets GMAC_PACKET_FILTER_VTFE if NETIF_F_HW_VLAN_CTAG_FILTER was detected through the priv->dma_cap.vlhash capability bits of the device, in stmmac_dvr_probe(). I would suggest deleting the unnecessarily complex logic from dwmac4_vlan_promisc_enable(), and simply unsetting GMAC_PACKET_FILTER_VTFE when becoming IFF_PROMISC, which has the same effect of allowing packets with any VLAN tags, but has the additional benefit of being able to run concurrently with stmmac_vlan_rx_add_vid() and stmmac_vlan_rx_kill_vid(). As much as I believe that the VTFE bit should have been exclusively controlled by NETIF_F_HW_VLAN_CTAG_FILTER through ethtool, and not by IFF_PROMISC, changing that is not a punctual fix to the problem, and it would probably break the VFFQ feature added by the later commit e0f9956a3862 ("net: stmmac: Add option for VLAN filter fail queue enable"). From the commit description, VFFQ needs IFF_PROMISC=on and VTFE=off in order to work (and this change respects that). But if VTFE was changed to be controlled through ethtool -k, then a user-visible change would have been introduced in Intel's scripts (a need to run "ethtool -k eth0 rx-vlan-filter off" which did not exist before). The patch was tested with this set of commands: ip link set eth0 up ip link add link eth0 name eth0.100 type vlan id 100 ip addr add 192.168.100.2/24 dev eth0.100 && ip link set eth0.100 up ip link set eth0 promisc on ip link add link eth0 name eth0.101 type vlan id 101 ip addr add 192.168.101.2/24 dev eth0.101 && ip link set eth0.101 up ip link set eth0 promisc off ping -c 5 192.168.100.1 ping -c 5 192.168.101.1 ip link set eth0 promisc on ping -c 5 192.168.100.1 ping -c 5 192.168.101.1 ip link del eth0.100 ip link del eth0.101 # Wait for VLAN-tagged pings from the other end... # Check with "tcpdump -i eth0 -e -n -p" and we should see them ip link set eth0 promisc off # Wait for VLAN-tagged pings from the other end... # Check with "tcpdump -i eth0 -e -n -p" and we shouldn't see them # anymore, but remove the "-p" argument from tcpdump and they're there. Fixes: c89f44ff10fd ("net: stmmac: Add support for VLAN promiscuous mode") Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/ethernet/stmicro/stmmac/common.h | 1 - .../net/ethernet/stmicro/stmmac/dwmac4_core.c | 61 +------------------ 2 files changed, 3 insertions(+), 59 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/common.h b/drivers/net/ethernet/stmicro/stmmac/common.h index ec9c130276d8..54bb072aeb2d 100644 --- a/drivers/net/ethernet/stmicro/stmmac/common.h +++ b/drivers/net/ethernet/stmicro/stmmac/common.h @@ -532,7 +532,6 @@ struct mac_device_info { unsigned int xlgmac; unsigned int num_vlan; u32 vlan_filter[32]; - unsigned int promisc; bool vlan_fail_q_en; u8 vlan_fail_q; }; diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c index e5cfde1cbd5c..188a00065f66 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c @@ -481,12 +481,6 @@ static int dwmac4_add_hw_vlan_rx_fltr(struct net_device *dev, if (vid > 4095) return -EINVAL; - if (hw->promisc) { - netdev_err(dev, - "Adding VLAN in promisc mode not supported\n"); - return -EPERM; - } - /* Single Rx VLAN Filter */ if (hw->num_vlan == 1) { /* For single VLAN filter, VID 0 means VLAN promiscuous */ @@ -536,12 +530,6 @@ static int dwmac4_del_hw_vlan_rx_fltr(struct net_device *dev, { int i, ret = 0; - if (hw->promisc) { - netdev_err(dev, - "Deleting VLAN in promisc mode not supported\n"); - return -EPERM; - } - /* Single Rx VLAN Filter */ if (hw->num_vlan == 1) { if ((hw->vlan_filter[0] & GMAC_VLAN_TAG_VID) == vid) { @@ -566,39 +554,6 @@ static int dwmac4_del_hw_vlan_rx_fltr(struct net_device *dev, return ret; } -static void dwmac4_vlan_promisc_enable(struct net_device *dev, - struct mac_device_info *hw) -{ - void __iomem *ioaddr = hw->pcsr; - u32 value; - u32 hash; - u32 val; - int i; - - /* Single Rx VLAN Filter */ - if (hw->num_vlan == 1) { - dwmac4_write_single_vlan(dev, 0); - return; - } - - /* Extended Rx VLAN Filter Enable */ - for (i = 0; i < hw->num_vlan; i++) { - if (hw->vlan_filter[i] & GMAC_VLAN_TAG_DATA_VEN) { - val = hw->vlan_filter[i] & ~GMAC_VLAN_TAG_DATA_VEN; - dwmac4_write_vlan_filter(dev, hw, i, val); - } - } - - hash = readl(ioaddr + GMAC_VLAN_HASH_TABLE); - if (hash & GMAC_VLAN_VLHT) { - value = readl(ioaddr + GMAC_VLAN_TAG); - if (value & GMAC_VLAN_VTHM) { - value &= ~GMAC_VLAN_VTHM; - writel(value, ioaddr + GMAC_VLAN_TAG); - } - } -} - static void dwmac4_restore_hw_vlan_rx_fltr(struct net_device *dev, struct mac_device_info *hw) { @@ -718,22 +673,12 @@ static void dwmac4_set_filter(struct mac_device_info *hw, } /* VLAN filtering */ - if (dev->features & NETIF_F_HW_VLAN_CTAG_FILTER) + if (dev->flags & IFF_PROMISC && !hw->vlan_fail_q_en) + value &= ~GMAC_PACKET_FILTER_VTFE; + else if (dev->features & NETIF_F_HW_VLAN_CTAG_FILTER) value |= GMAC_PACKET_FILTER_VTFE; writel(value, ioaddr + GMAC_PACKET_FILTER); - - if (dev->flags & IFF_PROMISC && !hw->vlan_fail_q_en) { - if (!hw->promisc) { - hw->promisc = 1; - dwmac4_vlan_promisc_enable(dev, hw); - } - } else { - if (hw->promisc) { - hw->promisc = 0; - dwmac4_restore_hw_vlan_rx_fltr(dev, hw); - } - } } static void dwmac4_flow_ctrl(struct mac_device_info *hw, unsigned int duplex, From 75084659969f5cd0287a86e7faae3ef0a5651d1e Mon Sep 17 00:00:00 2001 From: Imre Deak Date: Thu, 16 Mar 2023 15:17:13 +0200 Subject: [PATCH 093/179] drm/i915/tc: Fix the ICL PHY ownership check in TC-cold state MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 38c583019484f190d5b33f59b8ae810e6b1763c6 ] The commit renaming icl_tc_phy_is_in_safe_mode() to icl_tc_phy_take_ownership() didn't flip the function's return value accordingly, fix this up. This didn't cause an actual problem besides state check errors, since the function is only used during HW readout. Cc: José Roberto de Souza Fixes: f53979d68a77 ("drm/i915/display/tc: Rename safe_mode functions ownership") Reviewed-by: José Roberto de Souza Reviewed-by: Ville Syrjälä Signed-off-by: Imre Deak Link: https://patchwork.freedesktop.org/patch/msgid/20230316131724.359612-4-imre.deak@intel.com (cherry picked from commit f2c7959dda614d9b7c6a41510492de39d31705ec) Signed-off-by: Jani Nikula Signed-off-by: Sasha Levin --- drivers/gpu/drm/i915/display/intel_tc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_tc.c b/drivers/gpu/drm/i915/display/intel_tc.c index e5af955b5600..8d6dac32c896 100644 --- a/drivers/gpu/drm/i915/display/intel_tc.c +++ b/drivers/gpu/drm/i915/display/intel_tc.c @@ -440,9 +440,9 @@ static bool icl_tc_phy_is_owned(struct intel_digital_port *dig_port) PORT_TX_DFLEXDPCSSS(dig_port->tc_phy_fia)); if (val == 0xffffffff) { drm_dbg_kms(&i915->drm, - "Port %s: PHY in TCCOLD, assume safe mode\n", + "Port %s: PHY in TCCOLD, assume not owned\n", dig_port->tc_port_name); - return true; + return false; } return val & DP_PHY_MODE_STATUS_NOT_SAFE(dig_port->tc_phy_fia_idx); From 7ffdf7e6fc92d636fb0871ecbfabf710f0ba36a9 Mon Sep 17 00:00:00 2001 From: Rajvi Jingar Date: Mon, 20 Mar 2023 14:20:29 -0700 Subject: [PATCH 094/179] platform/x86/intel/pmc: Alder Lake PCH slp_s0_residency fix [ Upstream commit fb5755100a0a5aa5957bdb204fd1e249684557fc ] For platforms with Alder Lake PCH (Alder Lake S and Raptor Lake S) the slp_s0_residency attribute has been reporting the wrong value. Unlike other platforms, ADL PCH does not have a counter for the time that the SLP_S0 signal was asserted. Instead, firmware uses the aggregate of the Low Power Mode (LPM) substate counters as the S0ix value. Since the LPM counters run at a different frequency, this lead to misreporting of the S0ix time. Add a check for Alder Lake PCH and adjust the frequency accordingly when display slp_s0_residency. Fixes: bbab31101f44 ("platform/x86/intel: pmc/core: Add Alderlake support to pmc core driver") Signed-off-by: Rajvi Jingar Signed-off-by: David E. Box Reviewed-by: Rajneesh Bhardwaj Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20230320212029.3154407-1-david.e.box@linux.intel.com Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede Signed-off-by: Sasha Levin --- drivers/platform/x86/intel/pmc/core.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/drivers/platform/x86/intel/pmc/core.c b/drivers/platform/x86/intel/pmc/core.c index 17ec5825d13d..be0fb9401202 100644 --- a/drivers/platform/x86/intel/pmc/core.c +++ b/drivers/platform/x86/intel/pmc/core.c @@ -958,7 +958,18 @@ static inline void pmc_core_reg_write(struct pmc_dev *pmcdev, int reg_offset, static inline u64 pmc_core_adjust_slp_s0_step(struct pmc_dev *pmcdev, u32 value) { - return (u64)value * pmcdev->map->slp_s0_res_counter_step; + /* + * ADL PCH does not have the SLP_S0 counter and LPM Residency counters are + * used as a workaround which uses 30.5 usec tick. All other client + * programs have the legacy SLP_S0 residency counter that is using the 122 + * usec tick. + */ + const int lpm_adj_x2 = pmcdev->map->lpm_res_counter_step_x2; + + if (pmcdev->map == &adl_reg_map) + return (u64)value * GET_X2_COUNTER((u64)lpm_adj_x2); + else + return (u64)value * pmcdev->map->slp_s0_res_counter_step; } static int set_etr3(struct pmc_dev *pmcdev) From c11dbc7705b3739974ac31a13f4ab81e61a5fb07 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Tue, 14 Mar 2023 16:04:45 +0400 Subject: [PATCH 095/179] can: bcm: bcm_tx_setup(): fix KMSAN uninit-value in vfs_write [ Upstream commit 2b4c99f7d9a57ecd644eda9b1fb0a1072414959f ] Syzkaller reported the following issue: ===================================================== BUG: KMSAN: uninit-value in aio_rw_done fs/aio.c:1520 [inline] BUG: KMSAN: uninit-value in aio_write+0x899/0x950 fs/aio.c:1600 aio_rw_done fs/aio.c:1520 [inline] aio_write+0x899/0x950 fs/aio.c:1600 io_submit_one+0x1d1c/0x3bf0 fs/aio.c:2019 __do_sys_io_submit fs/aio.c:2078 [inline] __se_sys_io_submit+0x293/0x770 fs/aio.c:2048 __x64_sys_io_submit+0x92/0xd0 fs/aio.c:2048 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x3d/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd Uninit was created at: slab_post_alloc_hook mm/slab.h:766 [inline] slab_alloc_node mm/slub.c:3452 [inline] __kmem_cache_alloc_node+0x71f/0xce0 mm/slub.c:3491 __do_kmalloc_node mm/slab_common.c:967 [inline] __kmalloc+0x11d/0x3b0 mm/slab_common.c:981 kmalloc_array include/linux/slab.h:636 [inline] bcm_tx_setup+0x80e/0x29d0 net/can/bcm.c:930 bcm_sendmsg+0x3a2/0xce0 net/can/bcm.c:1351 sock_sendmsg_nosec net/socket.c:714 [inline] sock_sendmsg net/socket.c:734 [inline] sock_write_iter+0x495/0x5e0 net/socket.c:1108 call_write_iter include/linux/fs.h:2189 [inline] aio_write+0x63a/0x950 fs/aio.c:1600 io_submit_one+0x1d1c/0x3bf0 fs/aio.c:2019 __do_sys_io_submit fs/aio.c:2078 [inline] __se_sys_io_submit+0x293/0x770 fs/aio.c:2048 __x64_sys_io_submit+0x92/0xd0 fs/aio.c:2048 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x3d/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd CPU: 1 PID: 5034 Comm: syz-executor350 Not tainted 6.2.0-rc6-syzkaller-80422-geda666ff2276 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/12/2023 ===================================================== We can follow the call chain and find that 'bcm_tx_setup' function calls 'memcpy_from_msg' to copy some content to the newly allocated frame of 'op->frames'. After that the 'len' field of copied structure being compared with some constant value (64 or 8). However, if 'memcpy_from_msg' returns an error, we will compare some uninitialized memory. This triggers 'uninit-value' issue. This patch will add 'memcpy_from_msg' possible errors processing to avoid uninit-value issue. Tested via syzkaller Reported-by: syzbot+c9bfd85eca611ebf5db1@syzkaller.appspotmail.com Link: https://syzkaller.appspot.com/bug?id=47f897f8ad958bbde5790ebf389b5e7e0a345089 Signed-off-by: Ivan Orlov Fixes: 6f3b911d5f29b ("can: bcm: add support for CAN FD frames") Acked-by: Oliver Hartkopp Link: https://lore.kernel.org/all/20230314120445.12407-1-ivan.orlov0322@gmail.com Signed-off-by: Marc Kleine-Budde Signed-off-by: Sasha Levin --- net/can/bcm.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/net/can/bcm.c b/net/can/bcm.c index 27706f6ace34..a962ec2b8ba5 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -941,6 +941,8 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, cf = op->frames + op->cfsiz * i; err = memcpy_from_msg((u8 *)cf, msg, op->cfsiz); + if (err < 0) + goto free_op; if (op->flags & CAN_FD_FRAME) { if (cf->len > 64) @@ -950,12 +952,8 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, err = -EINVAL; } - if (err < 0) { - if (op->frames != &op->sframe) - kfree(op->frames); - kfree(op); - return err; - } + if (err < 0) + goto free_op; if (msg_head->flags & TX_CP_CAN_ID) { /* copy can_id into frame */ @@ -1026,6 +1024,12 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, bcm_tx_start_timer(op); return msg_head->nframes * op->cfsiz + MHSIZ; + +free_op: + if (op->frames != &op->sframe) + kfree(op->frames); + kfree(op); + return err; } /* From 6a40fda14b4be3e38f03cc42ffd4efbc64fb3e67 Mon Sep 17 00:00:00 2001 From: Tony Krowiak Date: Mon, 20 Mar 2023 11:04:47 -0400 Subject: [PATCH 096/179] s390/vfio-ap: fix memory leak in vfio_ap device driver [ Upstream commit 8f8cf767589f2131ae5d40f3758429095c701c84 ] The device release callback function invoked to release the matrix device uses the dev_get_drvdata(device *dev) function to retrieve the pointer to the vfio_matrix_dev object in order to free its storage. The problem is, this object is not stored as drvdata with the device; since the kfree function will accept a NULL pointer, the memory for the vfio_matrix_dev object is never freed. Since the device being released is contained within the vfio_matrix_dev object, the container_of macro will be used to retrieve its pointer. Fixes: 1fde573413b5 ("s390: vfio-ap: base implementation of VFIO AP device driver") Signed-off-by: Tony Krowiak Reviewed-by: Harald Freudenberger Link: https://lore.kernel.org/r/20230320150447.34557-1-akrowiak@linux.ibm.com Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik Signed-off-by: Sasha Levin --- drivers/s390/crypto/vfio_ap_drv.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/s390/crypto/vfio_ap_drv.c b/drivers/s390/crypto/vfio_ap_drv.c index f43cfeabd2cc..0afceb63ac43 100644 --- a/drivers/s390/crypto/vfio_ap_drv.c +++ b/drivers/s390/crypto/vfio_ap_drv.c @@ -54,8 +54,9 @@ static struct ap_driver vfio_ap_drv = { static void vfio_ap_matrix_dev_release(struct device *dev) { - struct ap_matrix_dev *matrix_dev = dev_get_drvdata(dev); + struct ap_matrix_dev *matrix_dev; + matrix_dev = container_of(dev, struct ap_matrix_dev, device); kfree(matrix_dev); } From b7707176f858a4361566ba5bdef2d3a7f519fb93 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 24 Mar 2023 14:33:42 +0100 Subject: [PATCH 097/179] ACPI: bus: Rework system-level device notification handling [ Upstream commit c56610a869bce03490faf4f157076370c71b8ae3 ] For ACPI drivers that provide a ->notify() callback and set ACPI_DRIVER_ALL_NOTIFY_EVENTS in their flags, that callback can be invoked while either the ->add() or the ->remove() callback is running without any synchronization at the bus type level which is counter to the common-sense expectation that notification handling should only be enabled when the driver is actually bound to the device. As a result, if the driver is not careful enough, it's ->notify() callback may crash when it is invoked too early or too late [1]. This issue has been amplified by commit d6fb6ee1820c ("ACPI: bus: Drop driver member of struct acpi_device") that made acpi_bus_notify() check for the presence of the driver and its ->notify() callback directly instead of using an extra driver pointer that was only set and cleared by the bus type code, but it was present before that commit although it was harder to reproduce then. It can be addressed by using the observation that acpi_device_install_notify_handler() can be modified to install the handler for all types of events when ACPI_DRIVER_ALL_NOTIFY_EVENTS is set in the driver flags, in which case acpi_bus_notify() will not need to invoke the driver's ->notify() callback any more and that callback will only be invoked after acpi_device_install_notify_handler() has run and before acpi_device_remove_notify_handler() runs, which implies the correct ordering with respect to the other ACPI driver callbacks. Modify the code accordingly and while at it, drop two redundant local variables from acpi_bus_notify() and turn its description comment into a proper kerneldoc one. Fixes: d6fb6ee1820c ("ACPI: bus: Drop driver member of struct acpi_device") Link: https://lore.kernel.org/linux-acpi/9f6cba7a8a57e5a687c934e8e406e28c.squirrel@mail.panix.com # [1] Reported-by: Pierre Asselin Signed-off-by: Rafael J. Wysocki Tested-by: Pierre Asselin Signed-off-by: Sasha Levin --- drivers/acpi/bus.c | 85 +++++++++++++++++++++------------------------- 1 file changed, 38 insertions(+), 47 deletions(-) diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c index d466c8195314..3b6146b1e25c 100644 --- a/drivers/acpi/bus.c +++ b/drivers/acpi/bus.c @@ -456,85 +456,67 @@ out_free: Notification Handling -------------------------------------------------------------------------- */ -/* - * acpi_bus_notify - * --------------- - * Callback for all 'system-level' device notifications (values 0x00-0x7F). +/** + * acpi_bus_notify - Global system-level (0x00-0x7F) notifications handler + * @handle: Target ACPI object. + * @type: Notification type. + * @data: Ignored. + * + * This only handles notifications related to device hotplug. */ static void acpi_bus_notify(acpi_handle handle, u32 type, void *data) { struct acpi_device *adev; - u32 ost_code = ACPI_OST_SC_NON_SPECIFIC_FAILURE; - bool hotplug_event = false; switch (type) { case ACPI_NOTIFY_BUS_CHECK: acpi_handle_debug(handle, "ACPI_NOTIFY_BUS_CHECK event\n"); - hotplug_event = true; break; case ACPI_NOTIFY_DEVICE_CHECK: acpi_handle_debug(handle, "ACPI_NOTIFY_DEVICE_CHECK event\n"); - hotplug_event = true; break; case ACPI_NOTIFY_DEVICE_WAKE: acpi_handle_debug(handle, "ACPI_NOTIFY_DEVICE_WAKE event\n"); - break; + return; case ACPI_NOTIFY_EJECT_REQUEST: acpi_handle_debug(handle, "ACPI_NOTIFY_EJECT_REQUEST event\n"); - hotplug_event = true; break; case ACPI_NOTIFY_DEVICE_CHECK_LIGHT: acpi_handle_debug(handle, "ACPI_NOTIFY_DEVICE_CHECK_LIGHT event\n"); /* TBD: Exactly what does 'light' mean? */ - break; + return; case ACPI_NOTIFY_FREQUENCY_MISMATCH: acpi_handle_err(handle, "Device cannot be configured due " "to a frequency mismatch\n"); - break; + return; case ACPI_NOTIFY_BUS_MODE_MISMATCH: acpi_handle_err(handle, "Device cannot be configured due " "to a bus mode mismatch\n"); - break; + return; case ACPI_NOTIFY_POWER_FAULT: acpi_handle_err(handle, "Device has suffered a power fault\n"); - break; + return; default: acpi_handle_debug(handle, "Unknown event type 0x%x\n", type); - break; - } - - adev = acpi_get_acpi_dev(handle); - if (!adev) - goto err; - - if (adev->dev.driver) { - struct acpi_driver *driver = to_acpi_driver(adev->dev.driver); - - if (driver && driver->ops.notify && - (driver->flags & ACPI_DRIVER_ALL_NOTIFY_EVENTS)) - driver->ops.notify(adev, type); - } - - if (!hotplug_event) { - acpi_put_acpi_dev(adev); return; } - if (ACPI_SUCCESS(acpi_hotplug_schedule(adev, type))) + adev = acpi_get_acpi_dev(handle); + + if (adev && ACPI_SUCCESS(acpi_hotplug_schedule(adev, type))) return; acpi_put_acpi_dev(adev); - err: - acpi_evaluate_ost(handle, type, ost_code, NULL); + acpi_evaluate_ost(handle, type, ACPI_OST_SC_NON_SPECIFIC_FAILURE, NULL); } static void acpi_notify_device(acpi_handle handle, u32 event, void *data) @@ -559,42 +541,51 @@ static u32 acpi_device_fixed_event(void *data) return ACPI_INTERRUPT_HANDLED; } -static int acpi_device_install_notify_handler(struct acpi_device *device) +static int acpi_device_install_notify_handler(struct acpi_device *device, + struct acpi_driver *acpi_drv) { acpi_status status; - if (device->device_type == ACPI_BUS_TYPE_POWER_BUTTON) + if (device->device_type == ACPI_BUS_TYPE_POWER_BUTTON) { status = acpi_install_fixed_event_handler(ACPI_EVENT_POWER_BUTTON, acpi_device_fixed_event, device); - else if (device->device_type == ACPI_BUS_TYPE_SLEEP_BUTTON) + } else if (device->device_type == ACPI_BUS_TYPE_SLEEP_BUTTON) { status = acpi_install_fixed_event_handler(ACPI_EVENT_SLEEP_BUTTON, acpi_device_fixed_event, device); - else - status = acpi_install_notify_handler(device->handle, - ACPI_DEVICE_NOTIFY, + } else { + u32 type = acpi_drv->flags & ACPI_DRIVER_ALL_NOTIFY_EVENTS ? + ACPI_ALL_NOTIFY : ACPI_DEVICE_NOTIFY; + + status = acpi_install_notify_handler(device->handle, type, acpi_notify_device, device); + } if (ACPI_FAILURE(status)) return -EINVAL; return 0; } -static void acpi_device_remove_notify_handler(struct acpi_device *device) +static void acpi_device_remove_notify_handler(struct acpi_device *device, + struct acpi_driver *acpi_drv) { - if (device->device_type == ACPI_BUS_TYPE_POWER_BUTTON) + if (device->device_type == ACPI_BUS_TYPE_POWER_BUTTON) { acpi_remove_fixed_event_handler(ACPI_EVENT_POWER_BUTTON, acpi_device_fixed_event); - else if (device->device_type == ACPI_BUS_TYPE_SLEEP_BUTTON) + } else if (device->device_type == ACPI_BUS_TYPE_SLEEP_BUTTON) { acpi_remove_fixed_event_handler(ACPI_EVENT_SLEEP_BUTTON, acpi_device_fixed_event); - else - acpi_remove_notify_handler(device->handle, ACPI_DEVICE_NOTIFY, + } else { + u32 type = acpi_drv->flags & ACPI_DRIVER_ALL_NOTIFY_EVENTS ? + ACPI_ALL_NOTIFY : ACPI_DEVICE_NOTIFY; + + acpi_remove_notify_handler(device->handle, type, acpi_notify_device); + } } /* Handle events targeting \_SB device (at present only graceful shutdown) */ @@ -1036,7 +1027,7 @@ static int acpi_device_probe(struct device *dev) acpi_drv->name, acpi_dev->pnp.bus_id); if (acpi_drv->ops.notify) { - ret = acpi_device_install_notify_handler(acpi_dev); + ret = acpi_device_install_notify_handler(acpi_dev, acpi_drv); if (ret) { if (acpi_drv->ops.remove) acpi_drv->ops.remove(acpi_dev); @@ -1059,7 +1050,7 @@ static void acpi_device_remove(struct device *dev) struct acpi_driver *acpi_drv = to_acpi_driver(dev->driver); if (acpi_drv->ops.notify) - acpi_device_remove_notify_handler(acpi_dev); + acpi_device_remove_notify_handler(acpi_dev, acpi_drv); if (acpi_drv->ops.remove) acpi_drv->ops.remove(acpi_dev); From bee9ca40b839e9b2ca1fdb37ffcce429e7899d79 Mon Sep 17 00:00:00 2001 From: Alyssa Ross Date: Mon, 20 Mar 2023 13:54:30 +0100 Subject: [PATCH 098/179] loop: LOOP_CONFIGURE: send uevents for partitions [ Upstream commit bb430b69422640891b0b8db762885730579a4145 ] LOOP_CONFIGURE is, as far as I understand it, supposed to be a way to combine LOOP_SET_FD and LOOP_SET_STATUS64 into a single syscall. When using LOOP_SET_FD+LOOP_SET_STATUS64, a single uevent would be sent for each partition found on the loop device after the second ioctl(), but when using LOOP_CONFIGURE, no such uevent was being sent. In the old setup, uevents are disabled for LOOP_SET_FD, but not for LOOP_SET_STATUS64. This makes sense, as it prevents uevents being sent for a partially configured device during LOOP_SET_FD - they're only sent at the end of LOOP_SET_STATUS64. But for LOOP_CONFIGURE, uevents were disabled for the entire operation, so that final notification was never issued. To fix this, reduce the critical section to exclude the loop_reread_partitions() call, which causes the uevents to be issued, to after uevents are re-enabled, matching the behaviour of the LOOP_SET_FD+LOOP_SET_STATUS64 combination. I noticed this because Busybox's losetup program recently changed from using LOOP_SET_FD+LOOP_SET_STATUS64 to LOOP_CONFIGURE, and this broke my setup, for which I want a notification from the kernel any time a new partition becomes available. Signed-off-by: Alyssa Ross [hch: reduced the critical section] Signed-off-by: Christoph Hellwig Fixes: 3448914e8cc5 ("loop: Add LOOP_CONFIGURE ioctl") Link: https://lore.kernel.org/r/20230320125430.55367-1-hch@lst.de Signed-off-by: Jens Axboe Signed-off-by: Sasha Levin --- drivers/block/loop.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 793ae876918c..426d0b42685a 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1010,9 +1010,6 @@ static int loop_configure(struct loop_device *lo, fmode_t mode, /* This is safe, since we have a reference from open(). */ __module_get(THIS_MODULE); - /* suppress uevents while reconfiguring the device */ - dev_set_uevent_suppress(disk_to_dev(lo->lo_disk), 1); - /* * If we don't hold exclusive handle for the device, upgrade to it * here to avoid changing device under exclusive owner. @@ -1067,6 +1064,9 @@ static int loop_configure(struct loop_device *lo, fmode_t mode, } } + /* suppress uevents while reconfiguring the device */ + dev_set_uevent_suppress(disk_to_dev(lo->lo_disk), 1); + disk_force_media_change(lo->lo_disk, DISK_EVENT_MEDIA_CHANGE); set_disk_ro(lo->lo_disk, (lo->lo_flags & LO_FLAGS_READ_ONLY) != 0); @@ -1109,17 +1109,17 @@ static int loop_configure(struct loop_device *lo, fmode_t mode, if (partscan) clear_bit(GD_SUPPRESS_PART_SCAN, &lo->lo_disk->state); + /* enable and uncork uevent now that we are done */ + dev_set_uevent_suppress(disk_to_dev(lo->lo_disk), 0); + loop_global_unlock(lo, is_loop); if (partscan) loop_reread_partitions(lo); + if (!(mode & FMODE_EXCL)) bd_abort_claiming(bdev, loop_configure); - error = 0; -done: - /* enable and uncork uevent now that we are done */ - dev_set_uevent_suppress(disk_to_dev(lo->lo_disk), 0); - return error; + return 0; out_unlock: loop_global_unlock(lo, is_loop); @@ -1130,7 +1130,7 @@ out_putf: fput(file); /* This is safe: open() is still holding a reference. */ module_put(THIS_MODULE); - goto done; + return error; } static void __loop_clr_fd(struct loop_device *lo, bool release) From 5718b58ca0ec3aee1badb33021c35da1883aa9e4 Mon Sep 17 00:00:00 2001 From: Sven Auhagen Date: Sat, 25 Mar 2023 17:40:29 +0100 Subject: [PATCH 099/179] net: mvpp2: classifier flow fix fragmentation flags [ Upstream commit 9a251cae51d57289908222e6c322ca61fccc25fd ] Add missing IP Fragmentation Flag. Fixes: f9358e12a0af ("net: mvpp2: split ingress traffic into multiple flows") Signed-off-by: Sven Auhagen Reviewed-by: Marcin Wojtas Signed-off-by: Paolo Abeni Signed-off-by: Sasha Levin --- .../net/ethernet/marvell/mvpp2/mvpp2_cls.c | 30 +++++++++++-------- 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_cls.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_cls.c index 41d935d1aaf6..40aeaa7bd739 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_cls.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_cls.c @@ -62,35 +62,38 @@ static const struct mvpp2_cls_flow cls_flows[MVPP2_N_PRS_FLOWS] = { MVPP2_DEF_FLOW(MVPP22_FLOW_TCP4, MVPP2_FL_IP4_TCP_FRAG_UNTAG, MVPP22_CLS_HEK_IP4_2T, MVPP2_PRS_RI_VLAN_NONE | MVPP2_PRS_RI_L3_IP4 | - MVPP2_PRS_RI_L4_TCP, + MVPP2_PRS_RI_IP_FRAG_TRUE | MVPP2_PRS_RI_L4_TCP, MVPP2_PRS_IP_MASK | MVPP2_PRS_RI_VLAN_MASK), MVPP2_DEF_FLOW(MVPP22_FLOW_TCP4, MVPP2_FL_IP4_TCP_FRAG_UNTAG, MVPP22_CLS_HEK_IP4_2T, MVPP2_PRS_RI_VLAN_NONE | MVPP2_PRS_RI_L3_IP4_OPT | - MVPP2_PRS_RI_L4_TCP, + MVPP2_PRS_RI_IP_FRAG_TRUE | MVPP2_PRS_RI_L4_TCP, MVPP2_PRS_IP_MASK | MVPP2_PRS_RI_VLAN_MASK), MVPP2_DEF_FLOW(MVPP22_FLOW_TCP4, MVPP2_FL_IP4_TCP_FRAG_UNTAG, MVPP22_CLS_HEK_IP4_2T, MVPP2_PRS_RI_VLAN_NONE | MVPP2_PRS_RI_L3_IP4_OTHER | - MVPP2_PRS_RI_L4_TCP, + MVPP2_PRS_RI_IP_FRAG_TRUE | MVPP2_PRS_RI_L4_TCP, MVPP2_PRS_IP_MASK | MVPP2_PRS_RI_VLAN_MASK), /* TCP over IPv4 flows, fragmented, with vlan tag */ MVPP2_DEF_FLOW(MVPP22_FLOW_TCP4, MVPP2_FL_IP4_TCP_FRAG_TAG, MVPP22_CLS_HEK_IP4_2T | MVPP22_CLS_HEK_TAGGED, - MVPP2_PRS_RI_L3_IP4 | MVPP2_PRS_RI_L4_TCP, + MVPP2_PRS_RI_L3_IP4 | MVPP2_PRS_RI_IP_FRAG_TRUE | + MVPP2_PRS_RI_L4_TCP, MVPP2_PRS_IP_MASK), MVPP2_DEF_FLOW(MVPP22_FLOW_TCP4, MVPP2_FL_IP4_TCP_FRAG_TAG, MVPP22_CLS_HEK_IP4_2T | MVPP22_CLS_HEK_TAGGED, - MVPP2_PRS_RI_L3_IP4_OPT | MVPP2_PRS_RI_L4_TCP, + MVPP2_PRS_RI_L3_IP4_OPT | MVPP2_PRS_RI_IP_FRAG_TRUE | + MVPP2_PRS_RI_L4_TCP, MVPP2_PRS_IP_MASK), MVPP2_DEF_FLOW(MVPP22_FLOW_TCP4, MVPP2_FL_IP4_TCP_FRAG_TAG, MVPP22_CLS_HEK_IP4_2T | MVPP22_CLS_HEK_TAGGED, - MVPP2_PRS_RI_L3_IP4_OTHER | MVPP2_PRS_RI_L4_TCP, + MVPP2_PRS_RI_L3_IP4_OTHER | MVPP2_PRS_RI_IP_FRAG_TRUE | + MVPP2_PRS_RI_L4_TCP, MVPP2_PRS_IP_MASK), /* UDP over IPv4 flows, Not fragmented, no vlan tag */ @@ -132,35 +135,38 @@ static const struct mvpp2_cls_flow cls_flows[MVPP2_N_PRS_FLOWS] = { MVPP2_DEF_FLOW(MVPP22_FLOW_UDP4, MVPP2_FL_IP4_UDP_FRAG_UNTAG, MVPP22_CLS_HEK_IP4_2T, MVPP2_PRS_RI_VLAN_NONE | MVPP2_PRS_RI_L3_IP4 | - MVPP2_PRS_RI_L4_UDP, + MVPP2_PRS_RI_IP_FRAG_TRUE | MVPP2_PRS_RI_L4_UDP, MVPP2_PRS_IP_MASK | MVPP2_PRS_RI_VLAN_MASK), MVPP2_DEF_FLOW(MVPP22_FLOW_UDP4, MVPP2_FL_IP4_UDP_FRAG_UNTAG, MVPP22_CLS_HEK_IP4_2T, MVPP2_PRS_RI_VLAN_NONE | MVPP2_PRS_RI_L3_IP4_OPT | - MVPP2_PRS_RI_L4_UDP, + MVPP2_PRS_RI_IP_FRAG_TRUE | MVPP2_PRS_RI_L4_UDP, MVPP2_PRS_IP_MASK | MVPP2_PRS_RI_VLAN_MASK), MVPP2_DEF_FLOW(MVPP22_FLOW_UDP4, MVPP2_FL_IP4_UDP_FRAG_UNTAG, MVPP22_CLS_HEK_IP4_2T, MVPP2_PRS_RI_VLAN_NONE | MVPP2_PRS_RI_L3_IP4_OTHER | - MVPP2_PRS_RI_L4_UDP, + MVPP2_PRS_RI_IP_FRAG_TRUE | MVPP2_PRS_RI_L4_UDP, MVPP2_PRS_IP_MASK | MVPP2_PRS_RI_VLAN_MASK), /* UDP over IPv4 flows, fragmented, with vlan tag */ MVPP2_DEF_FLOW(MVPP22_FLOW_UDP4, MVPP2_FL_IP4_UDP_FRAG_TAG, MVPP22_CLS_HEK_IP4_2T | MVPP22_CLS_HEK_TAGGED, - MVPP2_PRS_RI_L3_IP4 | MVPP2_PRS_RI_L4_UDP, + MVPP2_PRS_RI_L3_IP4 | MVPP2_PRS_RI_IP_FRAG_TRUE | + MVPP2_PRS_RI_L4_UDP, MVPP2_PRS_IP_MASK), MVPP2_DEF_FLOW(MVPP22_FLOW_UDP4, MVPP2_FL_IP4_UDP_FRAG_TAG, MVPP22_CLS_HEK_IP4_2T | MVPP22_CLS_HEK_TAGGED, - MVPP2_PRS_RI_L3_IP4_OPT | MVPP2_PRS_RI_L4_UDP, + MVPP2_PRS_RI_L3_IP4_OPT | MVPP2_PRS_RI_IP_FRAG_TRUE | + MVPP2_PRS_RI_L4_UDP, MVPP2_PRS_IP_MASK), MVPP2_DEF_FLOW(MVPP22_FLOW_UDP4, MVPP2_FL_IP4_UDP_FRAG_TAG, MVPP22_CLS_HEK_IP4_2T | MVPP22_CLS_HEK_TAGGED, - MVPP2_PRS_RI_L3_IP4_OTHER | MVPP2_PRS_RI_L4_UDP, + MVPP2_PRS_RI_L3_IP4_OTHER | MVPP2_PRS_RI_IP_FRAG_TRUE | + MVPP2_PRS_RI_L4_UDP, MVPP2_PRS_IP_MASK), /* TCP over IPv6 flows, not fragmented, no vlan tag */ From 2a4f74420538c53b443ee35e5ce95ffc0ecf4510 Mon Sep 17 00:00:00 2001 From: Sven Auhagen Date: Sat, 25 Mar 2023 17:40:53 +0100 Subject: [PATCH 100/179] net: mvpp2: parser fix QinQ [ Upstream commit a587a84813b90372cb0a7565e201a4075da67919 ] The mvpp2 parser entry for QinQ has the inner and outer VLAN in the wrong order. Fix the problem by swapping them. Fixes: 3f518509dedc ("ethernet: Add new driver for Marvell Armada 375 network unit") Signed-off-by: Sven Auhagen Reviewed-by: Marcin Wojtas Signed-off-by: Paolo Abeni Signed-off-by: Sasha Levin --- drivers/net/ethernet/marvell/mvpp2/mvpp2_prs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_prs.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_prs.c index 75ba57bd1d46..ed8be396428b 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_prs.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_prs.c @@ -1539,8 +1539,8 @@ static int mvpp2_prs_vlan_init(struct platform_device *pdev, struct mvpp2 *priv) if (!priv->prs_double_vlans) return -ENOMEM; - /* Double VLAN: 0x8100, 0x88A8 */ - err = mvpp2_prs_double_vlan_add(priv, ETH_P_8021Q, ETH_P_8021AD, + /* Double VLAN: 0x88A8, 0x8100 */ + err = mvpp2_prs_double_vlan_add(priv, ETH_P_8021AD, ETH_P_8021Q, MVPP2_PRS_PORT_MASK); if (err) return err; From a921dbbc04e3e4edfaa6245e905207030c1a14bd Mon Sep 17 00:00:00 2001 From: Sven Auhagen Date: Sat, 25 Mar 2023 17:41:05 +0100 Subject: [PATCH 101/179] net: mvpp2: parser fix PPPoE [ Upstream commit 031a416c2170866be5132ae42e14453d669b0cb1 ] In PPPoE add all IPv4 header option length to the parser and adjust the L3 and L4 offset accordingly. Currently the L4 match does not work with PPPoE and all packets are matched as L3 IP4 OPT. Fixes: 3f518509dedc ("ethernet: Add new driver for Marvell Armada 375 network unit") Signed-off-by: Sven Auhagen Signed-off-by: Paolo Abeni Signed-off-by: Sasha Levin --- .../net/ethernet/marvell/mvpp2/mvpp2_prs.c | 82 ++++++++----------- 1 file changed, 34 insertions(+), 48 deletions(-) diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_prs.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_prs.c index ed8be396428b..9af22f497a40 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_prs.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_prs.c @@ -1607,59 +1607,45 @@ static int mvpp2_prs_vlan_init(struct platform_device *pdev, struct mvpp2 *priv) static int mvpp2_prs_pppoe_init(struct mvpp2 *priv) { struct mvpp2_prs_entry pe; - int tid; + int tid, ihl; - /* IPv4 over PPPoE with options */ - tid = mvpp2_prs_tcam_first_free(priv, MVPP2_PE_FIRST_FREE_TID, - MVPP2_PE_LAST_FREE_TID); - if (tid < 0) - return tid; + /* IPv4 over PPPoE with header length >= 5 */ + for (ihl = MVPP2_PRS_IPV4_IHL_MIN; ihl <= MVPP2_PRS_IPV4_IHL_MAX; ihl++) { + tid = mvpp2_prs_tcam_first_free(priv, MVPP2_PE_FIRST_FREE_TID, + MVPP2_PE_LAST_FREE_TID); + if (tid < 0) + return tid; - memset(&pe, 0, sizeof(pe)); - mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_PPPOE); - pe.index = tid; + memset(&pe, 0, sizeof(pe)); + mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_PPPOE); + pe.index = tid; - mvpp2_prs_match_etype(&pe, 0, PPP_IP); + mvpp2_prs_match_etype(&pe, 0, PPP_IP); + mvpp2_prs_tcam_data_byte_set(&pe, MVPP2_ETH_TYPE_LEN, + MVPP2_PRS_IPV4_HEAD | ihl, + MVPP2_PRS_IPV4_HEAD_MASK | + MVPP2_PRS_IPV4_IHL_MASK); - mvpp2_prs_sram_next_lu_set(&pe, MVPP2_PRS_LU_IP4); - mvpp2_prs_sram_ri_update(&pe, MVPP2_PRS_RI_L3_IP4_OPT, - MVPP2_PRS_RI_L3_PROTO_MASK); - /* goto ipv4 dest-address (skip eth_type + IP-header-size - 4) */ - mvpp2_prs_sram_shift_set(&pe, MVPP2_ETH_TYPE_LEN + - sizeof(struct iphdr) - 4, - MVPP2_PRS_SRAM_OP_SEL_SHIFT_ADD); - /* Set L3 offset */ - mvpp2_prs_sram_offset_set(&pe, MVPP2_PRS_SRAM_UDF_TYPE_L3, - MVPP2_ETH_TYPE_LEN, - MVPP2_PRS_SRAM_OP_SEL_UDF_ADD); + mvpp2_prs_sram_next_lu_set(&pe, MVPP2_PRS_LU_IP4); + mvpp2_prs_sram_ri_update(&pe, MVPP2_PRS_RI_L3_IP4, + MVPP2_PRS_RI_L3_PROTO_MASK); + /* goto ipv4 dst-address (skip eth_type + IP-header-size - 4) */ + mvpp2_prs_sram_shift_set(&pe, MVPP2_ETH_TYPE_LEN + + sizeof(struct iphdr) - 4, + MVPP2_PRS_SRAM_OP_SEL_SHIFT_ADD); + /* Set L3 offset */ + mvpp2_prs_sram_offset_set(&pe, MVPP2_PRS_SRAM_UDF_TYPE_L3, + MVPP2_ETH_TYPE_LEN, + MVPP2_PRS_SRAM_OP_SEL_UDF_ADD); + /* Set L4 offset */ + mvpp2_prs_sram_offset_set(&pe, MVPP2_PRS_SRAM_UDF_TYPE_L4, + MVPP2_ETH_TYPE_LEN + (ihl * 4), + MVPP2_PRS_SRAM_OP_SEL_UDF_ADD); - /* Update shadow table and hw entry */ - mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_PPPOE); - mvpp2_prs_hw_write(priv, &pe); - - /* IPv4 over PPPoE without options */ - tid = mvpp2_prs_tcam_first_free(priv, MVPP2_PE_FIRST_FREE_TID, - MVPP2_PE_LAST_FREE_TID); - if (tid < 0) - return tid; - - pe.index = tid; - - mvpp2_prs_tcam_data_byte_set(&pe, MVPP2_ETH_TYPE_LEN, - MVPP2_PRS_IPV4_HEAD | - MVPP2_PRS_IPV4_IHL_MIN, - MVPP2_PRS_IPV4_HEAD_MASK | - MVPP2_PRS_IPV4_IHL_MASK); - - /* Clear ri before updating */ - pe.sram[MVPP2_PRS_SRAM_RI_WORD] = 0x0; - pe.sram[MVPP2_PRS_SRAM_RI_CTRL_WORD] = 0x0; - mvpp2_prs_sram_ri_update(&pe, MVPP2_PRS_RI_L3_IP4, - MVPP2_PRS_RI_L3_PROTO_MASK); - - /* Update shadow table and hw entry */ - mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_PPPOE); - mvpp2_prs_hw_write(priv, &pe); + /* Update shadow table and hw entry */ + mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_PPPOE); + mvpp2_prs_hw_write(priv, &pe); + } /* IPv6 over PPPoE */ tid = mvpp2_prs_tcam_first_free(priv, MVPP2_PE_FIRST_FREE_TID, From 0b9541ae1dd1a7436af697e12ca76ff6a507b3cb Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Mon, 27 Mar 2023 10:31:38 +0200 Subject: [PATCH 102/179] smsc911x: avoid PHY being resumed when interface is not up [ Upstream commit f22c993f31fa9615df46e49cd768b713d39a852f ] SMSC911x doesn't need mdiobus suspend/resume, that's why it sets 'mac_managed_pm'. However, setting it needs to be moved from init to probe, so mdiobus PM functions will really never be called (e.g. when the interface is not up yet during suspend/resume). Fixes: 3ce9f2bef755 ("net: smsc911x: Stop and start PHY during suspend and resume") Suggested-by: Heiner Kallweit Signed-off-by: Wolfram Sang Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20230327083138.6044-1-wsa+renesas@sang-engineering.com Signed-off-by: Paolo Abeni Signed-off-by: Sasha Levin --- drivers/net/ethernet/smsc/smsc911x.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/smsc/smsc911x.c b/drivers/net/ethernet/smsc/smsc911x.c index a2e511912e6a..a690d139e177 100644 --- a/drivers/net/ethernet/smsc/smsc911x.c +++ b/drivers/net/ethernet/smsc/smsc911x.c @@ -1037,8 +1037,6 @@ static int smsc911x_mii_probe(struct net_device *dev) return ret; } - /* Indicate that the MAC is responsible for managing PHY PM */ - phydev->mac_managed_pm = true; phy_attached_info(phydev); phy_set_max_speed(phydev, SPEED_100); @@ -1066,6 +1064,7 @@ static int smsc911x_mii_init(struct platform_device *pdev, struct net_device *dev) { struct smsc911x_data *pdata = netdev_priv(dev); + struct phy_device *phydev; int err = -ENXIO; pdata->mii_bus = mdiobus_alloc(); @@ -1108,6 +1107,10 @@ static int smsc911x_mii_init(struct platform_device *pdev, goto err_out_free_bus_2; } + phydev = phy_find_first(pdata->mii_bus); + if (phydev) + phydev->mac_managed_pm = true; + return 0; err_out_free_bus_2: From 088573884165806248e15b69277c582c31c5f361 Mon Sep 17 00:00:00 2001 From: Brett Creeley Date: Mon, 13 Mar 2023 13:36:08 -0700 Subject: [PATCH 103/179] ice: Fix ice_cfg_rdma_fltr() to only update relevant fields [ Upstream commit d94dbdc4e0209b5e7d736ab696f8d635b034e3ee ] The current implementation causes ice_vsi_update() to update all VSI fields based on the cached VSI context. This also assumes that the ICE_AQ_VSI_PROP_Q_OPT_VALID bit is set. This can cause problems if the VSI context is not correctly synced by the driver. Fix this by only updating the fields that correspond to ICE_AQ_VSI_PROP_Q_OPT_VALID. Also, make sure to save the updated result in the cached VSI context on success. Fixes: 348048e724a0 ("ice: Implement iidc operations") Co-developed-by: Robert Malz Signed-off-by: Robert Malz Signed-off-by: Brett Creeley Signed-off-by: Jesse Brandeburg Reviewed-by: Piotr Raczynski Tested-by: Jakub Andrysiak Signed-off-by: Tony Nguyen Signed-off-by: Sasha Levin --- drivers/net/ethernet/intel/ice/ice_switch.c | 26 +++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_switch.c b/drivers/net/ethernet/intel/ice/ice_switch.c index 61f844d22512..46b36851af46 100644 --- a/drivers/net/ethernet/intel/ice/ice_switch.c +++ b/drivers/net/ethernet/intel/ice/ice_switch.c @@ -1780,18 +1780,36 @@ ice_update_vsi(struct ice_hw *hw, u16 vsi_handle, struct ice_vsi_ctx *vsi_ctx, int ice_cfg_rdma_fltr(struct ice_hw *hw, u16 vsi_handle, bool enable) { - struct ice_vsi_ctx *ctx; + struct ice_vsi_ctx *ctx, *cached_ctx; + int status; - ctx = ice_get_vsi_ctx(hw, vsi_handle); + cached_ctx = ice_get_vsi_ctx(hw, vsi_handle); + if (!cached_ctx) + return -ENOENT; + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) - return -EIO; + return -ENOMEM; + + ctx->info.q_opt_rss = cached_ctx->info.q_opt_rss; + ctx->info.q_opt_tc = cached_ctx->info.q_opt_tc; + ctx->info.q_opt_flags = cached_ctx->info.q_opt_flags; + + ctx->info.valid_sections = cpu_to_le16(ICE_AQ_VSI_PROP_Q_OPT_VALID); if (enable) ctx->info.q_opt_flags |= ICE_AQ_VSI_Q_OPT_PE_FLTR_EN; else ctx->info.q_opt_flags &= ~ICE_AQ_VSI_Q_OPT_PE_FLTR_EN; - return ice_update_vsi(hw, vsi_handle, ctx, NULL); + status = ice_update_vsi(hw, vsi_handle, ctx, NULL); + if (!status) { + cached_ctx->info.q_opt_flags = ctx->info.q_opt_flags; + cached_ctx->info.valid_sections |= ctx->info.valid_sections; + } + + kfree(ctx); + return status; } /** From d10900058fa6d6c0a4ec6f6967a9b4c607e303ce Mon Sep 17 00:00:00 2001 From: Junfeng Guo Date: Tue, 14 Mar 2023 10:03:15 +0800 Subject: [PATCH 104/179] ice: add profile conflict check for AVF FDIR [ Upstream commit 29486b6df3e6a63b57d1ed1dce06051267282ff4 ] Add profile conflict check while adding some FDIR rules to avoid unexpected flow behavior, rules may have conflict including: IPv4 <---> {IPv4_UDP, IPv4_TCP, IPv4_SCTP} IPv6 <---> {IPv6_UDP, IPv6_TCP, IPv6_SCTP} For example, when we create an FDIR rule for IPv4, this rule will work on packets including IPv4, IPv4_UDP, IPv4_TCP and IPv4_SCTP. But if we then create an FDIR rule for IPv4_UDP and then destroy it, the first FDIR rule for IPv4 cannot work on pkt IPv4_UDP then. To prevent this unexpected behavior, we add restriction in software when creating FDIR rules by adding necessary profile conflict check. Fixes: 1f7ea1cd6a37 ("ice: Enable FDIR Configure for AVF") Signed-off-by: Junfeng Guo Tested-by: Rafal Romanowski Signed-off-by: Tony Nguyen Signed-off-by: Sasha Levin --- .../ethernet/intel/ice/ice_virtchnl_fdir.c | 73 +++++++++++++++++++ 1 file changed, 73 insertions(+) diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_fdir.c b/drivers/net/ethernet/intel/ice/ice_virtchnl_fdir.c index c6a58343d81d..a2645ff3100e 100644 --- a/drivers/net/ethernet/intel/ice/ice_virtchnl_fdir.c +++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_fdir.c @@ -541,6 +541,72 @@ static void ice_vc_fdir_rem_prof_all(struct ice_vf *vf) } } +/** + * ice_vc_fdir_has_prof_conflict + * @vf: pointer to the VF structure + * @conf: FDIR configuration for each filter + * + * Check if @conf has conflicting profile with existing profiles + * + * Return: true on success, and false on error. + */ +static bool +ice_vc_fdir_has_prof_conflict(struct ice_vf *vf, + struct virtchnl_fdir_fltr_conf *conf) +{ + struct ice_fdir_fltr *desc; + + list_for_each_entry(desc, &vf->fdir.fdir_rule_list, fltr_node) { + struct virtchnl_fdir_fltr_conf *existing_conf; + enum ice_fltr_ptype flow_type_a, flow_type_b; + struct ice_fdir_fltr *a, *b; + + existing_conf = to_fltr_conf_from_desc(desc); + a = &existing_conf->input; + b = &conf->input; + flow_type_a = a->flow_type; + flow_type_b = b->flow_type; + + /* No need to compare two rules with different tunnel types or + * with the same protocol type. + */ + if (existing_conf->ttype != conf->ttype || + flow_type_a == flow_type_b) + continue; + + switch (flow_type_a) { + case ICE_FLTR_PTYPE_NONF_IPV4_UDP: + case ICE_FLTR_PTYPE_NONF_IPV4_TCP: + case ICE_FLTR_PTYPE_NONF_IPV4_SCTP: + if (flow_type_b == ICE_FLTR_PTYPE_NONF_IPV4_OTHER) + return true; + break; + case ICE_FLTR_PTYPE_NONF_IPV4_OTHER: + if (flow_type_b == ICE_FLTR_PTYPE_NONF_IPV4_UDP || + flow_type_b == ICE_FLTR_PTYPE_NONF_IPV4_TCP || + flow_type_b == ICE_FLTR_PTYPE_NONF_IPV4_SCTP) + return true; + break; + case ICE_FLTR_PTYPE_NONF_IPV6_UDP: + case ICE_FLTR_PTYPE_NONF_IPV6_TCP: + case ICE_FLTR_PTYPE_NONF_IPV6_SCTP: + if (flow_type_b == ICE_FLTR_PTYPE_NONF_IPV6_OTHER) + return true; + break; + case ICE_FLTR_PTYPE_NONF_IPV6_OTHER: + if (flow_type_b == ICE_FLTR_PTYPE_NONF_IPV6_UDP || + flow_type_b == ICE_FLTR_PTYPE_NONF_IPV6_TCP || + flow_type_b == ICE_FLTR_PTYPE_NONF_IPV6_SCTP) + return true; + break; + default: + break; + } + } + + return false; +} + /** * ice_vc_fdir_write_flow_prof * @vf: pointer to the VF structure @@ -677,6 +743,13 @@ ice_vc_fdir_config_input_set(struct ice_vf *vf, struct virtchnl_fdir_add *fltr, enum ice_fltr_ptype flow; int ret; + ret = ice_vc_fdir_has_prof_conflict(vf, conf); + if (ret) { + dev_dbg(dev, "Found flow profile conflict for VF %d\n", + vf->vf_id); + return ret; + } + flow = input->flow_type; ret = ice_vc_fdir_alloc_prof(vf, flow); if (ret) { From 9bc9e4442fe3aee93269b5c6031c025257d98758 Mon Sep 17 00:00:00 2001 From: Jakob Koschel Date: Mon, 20 Mar 2023 13:48:15 +0100 Subject: [PATCH 105/179] ice: fix invalid check for empty list in ice_sched_assoc_vsi_to_agg() [ Upstream commit e9a1cc2e4c4ee7c7e60fb26345618c2522a2a10f ] The code implicitly assumes that the list iterator finds a correct handle. If 'vsi_handle' is not found the 'old_agg_vsi_info' was pointing to an bogus memory location. For safety a separate list iterator variable should be used to make the != NULL check on 'old_agg_vsi_info' correct under any circumstances. Additionally Linus proposed to avoid any use of the list iterator variable after the loop, in the attempt to move the list iterator variable declaration into the macro to avoid any potential misuse after the loop. Using it in a pointer comparison after the loop is undefined behavior and should be omitted if possible [1]. Fixes: 37c592062b16 ("ice: remove the VSI info from previous agg") Link: https://lore.kernel.org/all/CAHk-=wgRr_D8CB-D9Kg-c=EHreAsk5SqXPwr9Y7k9sA6cWXJ6w@mail.gmail.com/ [1] Signed-off-by: Jakob Koschel Tested-by: Arpana Arland (A Contingent worker at Intel) Signed-off-by: Tony Nguyen Signed-off-by: Sasha Levin --- drivers/net/ethernet/intel/ice/ice_sched.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_sched.c b/drivers/net/ethernet/intel/ice/ice_sched.c index 118595763bba..2c62c1763ee0 100644 --- a/drivers/net/ethernet/intel/ice/ice_sched.c +++ b/drivers/net/ethernet/intel/ice/ice_sched.c @@ -2756,7 +2756,7 @@ static int ice_sched_assoc_vsi_to_agg(struct ice_port_info *pi, u32 agg_id, u16 vsi_handle, unsigned long *tc_bitmap) { - struct ice_sched_agg_vsi_info *agg_vsi_info, *old_agg_vsi_info = NULL; + struct ice_sched_agg_vsi_info *agg_vsi_info, *iter, *old_agg_vsi_info = NULL; struct ice_sched_agg_info *agg_info, *old_agg_info; struct ice_hw *hw = pi->hw; int status = 0; @@ -2774,11 +2774,13 @@ ice_sched_assoc_vsi_to_agg(struct ice_port_info *pi, u32 agg_id, if (old_agg_info && old_agg_info != agg_info) { struct ice_sched_agg_vsi_info *vtmp; - list_for_each_entry_safe(old_agg_vsi_info, vtmp, + list_for_each_entry_safe(iter, vtmp, &old_agg_info->agg_vsi_list, list_entry) - if (old_agg_vsi_info->vsi_handle == vsi_handle) + if (iter->vsi_handle == vsi_handle) { + old_agg_vsi_info = iter; break; + } } /* check if entry already exist */ From db7d7782677ff998c06997903d5400a0ba91cebb Mon Sep 17 00:00:00 2001 From: Tasos Sahanidis Date: Wed, 29 Mar 2023 06:24:22 +0300 Subject: [PATCH 106/179] ALSA: ymfpci: Create card with device-managed snd_devm_card_new() [ Upstream commit f33fc1576757741479452255132d6e3aaf558ffe ] snd_card_ymfpci_remove() was removed in commit c6e6bb5eab74 ("ALSA: ymfpci: Allocate resources with device-managed APIs"), but the call to snd_card_new() was not replaced with snd_devm_card_new(). Since there was no longer a call to snd_card_free, unloading the module would eventually result in Oops: [697561.532887] BUG: unable to handle page fault for address: ffffffffc0924480 [697561.532893] #PF: supervisor read access in kernel mode [697561.532896] #PF: error_code(0x0000) - not-present page [697561.532899] PGD ae1e15067 P4D ae1e15067 PUD ae1e17067 PMD 11a8f5067 PTE 0 [697561.532905] Oops: 0000 [#1] PREEMPT SMP NOPTI [697561.532909] CPU: 21 PID: 5080 Comm: wireplumber Tainted: G W OE 6.2.7 #1 [697561.532914] Hardware name: System manufacturer System Product Name/TUF GAMING X570-PLUS, BIOS 4408 10/28/2022 [697561.532916] RIP: 0010:try_module_get.part.0+0x1a/0xe0 [697561.532924] Code: 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 0f 1f 44 00 00 55 48 89 e5 41 55 41 54 49 89 fc bf 01 00 00 00 e8 56 3c f8 ff <41> 83 3c 24 02 0f 84 96 00 00 00 41 8b 84 24 30 03 00 00 85 c0 0f [697561.532927] RSP: 0018:ffffbe9b858c3bd8 EFLAGS: 00010246 [697561.532930] RAX: ffff9815d14f1900 RBX: ffff9815c14e6000 RCX: 0000000000000000 [697561.532933] RDX: 0000000000000000 RSI: ffffffffc055092c RDI: ffffffffb3778c1a [697561.532935] RBP: ffffbe9b858c3be8 R08: 0000000000000040 R09: ffff981a1a741380 [697561.532937] R10: ffffbe9b858c3c80 R11: 00000009d56533a6 R12: ffffffffc0924480 [697561.532939] R13: ffff9823439d8500 R14: 0000000000000025 R15: ffff9815cd109f80 [697561.532942] FS: 00007f13084f1f80(0000) GS:ffff9824aef40000(0000) knlGS:0000000000000000 [697561.532945] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [697561.532947] CR2: ffffffffc0924480 CR3: 0000000145344000 CR4: 0000000000350ee0 [697561.532949] Call Trace: [697561.532951] [697561.532955] try_module_get+0x13/0x30 [697561.532960] snd_ctl_open+0x61/0x1c0 [snd] [697561.532976] snd_open+0xb4/0x1e0 [snd] [697561.532989] chrdev_open+0xc7/0x240 [697561.532995] ? fsnotify_perm.part.0+0x6e/0x160 [697561.533000] ? __pfx_chrdev_open+0x10/0x10 [697561.533005] do_dentry_open+0x169/0x440 [697561.533009] vfs_open+0x2d/0x40 [697561.533012] path_openat+0xa9d/0x10d0 [697561.533017] ? debug_smp_processor_id+0x17/0x20 [697561.533022] ? trigger_load_balance+0x65/0x370 [697561.533026] do_filp_open+0xb2/0x160 [697561.533032] ? _raw_spin_unlock+0x19/0x40 [697561.533036] ? alloc_fd+0xa9/0x190 [697561.533040] do_sys_openat2+0x9f/0x160 [697561.533044] __x64_sys_openat+0x55/0x90 [697561.533048] do_syscall_64+0x3b/0x90 [697561.533052] entry_SYSCALL_64_after_hwframe+0x72/0xdc [697561.533056] RIP: 0033:0x7f1308a40db4 [697561.533059] Code: 24 20 eb 8f 66 90 44 89 54 24 0c e8 46 68 f8 ff 44 8b 54 24 0c 44 89 e2 48 89 ee 41 89 c0 bf 9c ff ff ff b8 01 01 00 00 0f 05 <48> 3d 00 f0 ff ff 77 32 44 89 c7 89 44 24 0c e8 78 68 f8 ff 8b 44 [697561.533062] RSP: 002b:00007ffcce664450 EFLAGS: 00000293 ORIG_RAX: 0000000000000101 [697561.533066] RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00007f1308a40db4 [697561.533068] RDX: 0000000000080000 RSI: 00007ffcce664690 RDI: 00000000ffffff9c [697561.533070] RBP: 00007ffcce664690 R08: 0000000000000000 R09: 0000000000000012 [697561.533072] R10: 0000000000000000 R11: 0000000000000293 R12: 0000000000080000 [697561.533074] R13: 00007f13054b069b R14: 0000565209f83200 R15: 0000000000000000 [697561.533078] Fixes: c6e6bb5eab74 ("ALSA: ymfpci: Allocate resources with device-managed APIs") Signed-off-by: Tasos Sahanidis Link: https://lore.kernel.org/r/20230329032422.170024-1-tasos@tasossah.com Signed-off-by: Takashi Iwai Signed-off-by: Sasha Levin --- sound/pci/ymfpci/ymfpci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/pci/ymfpci/ymfpci.c b/sound/pci/ymfpci/ymfpci.c index 1e198e4d57b8..82d4e0fda91b 100644 --- a/sound/pci/ymfpci/ymfpci.c +++ b/sound/pci/ymfpci/ymfpci.c @@ -170,7 +170,7 @@ static int snd_card_ymfpci_probe(struct pci_dev *pci, return -ENOENT; } - err = snd_card_new(&pci->dev, index[dev], id[dev], THIS_MODULE, + err = snd_devm_card_new(&pci->dev, index[dev], id[dev], THIS_MODULE, sizeof(*chip), &card); if (err < 0) return err; From 32b9bd7cfc2e2d92d595386add4e111b232b351f Mon Sep 17 00:00:00 2001 From: Tasos Sahanidis Date: Wed, 29 Mar 2023 06:28:08 +0300 Subject: [PATCH 107/179] ALSA: ymfpci: Fix BUG_ON in probe function [ Upstream commit 6be2e7522eb529b41c16d459f33bbdbcddbf5c15 ] The snd_dma_buffer.bytes field now contains the aligned size, which this snd_BUG_ON() did not account for, resulting in the following: [ 9.625915] ------------[ cut here ]------------ [ 9.633440] WARNING: CPU: 0 PID: 126 at sound/pci/ymfpci/ymfpci_main.c:2168 snd_ymfpci_create+0x681/0x698 [snd_ymfpci] [ 9.648926] Modules linked in: snd_ymfpci(+) snd_intel_dspcfg kvm(+) snd_intel_sdw_acpi snd_ac97_codec snd_mpu401_uart snd_opl3_lib irqbypass snd_hda_codec gameport snd_rawmidi crct10dif_pclmul crc32_pclmul cfg80211 snd_hda_core polyval_clmulni polyval_generic gf128mul snd_seq_device ghash_clmulni_intel snd_hwdep ac97_bus sha512_ssse3 rfkill snd_pcm aesni_intel tg3 snd_timer crypto_simd snd mxm_wmi libphy cryptd k10temp fam15h_power pcspkr soundcore sp5100_tco wmi acpi_cpufreq mac_hid dm_multipath sg loop fuse dm_mod bpf_preload ip_tables x_tables ext4 crc32c_generic crc16 mbcache jbd2 sr_mod cdrom ata_generic pata_acpi firewire_ohci crc32c_intel firewire_core xhci_pci crc_itu_t pata_via xhci_pci_renesas floppy [ 9.711849] CPU: 0 PID: 126 Comm: kworker/0:2 Not tainted 6.1.21-1-lts #1 08d2e5ece03136efa7c6aeea9a9c40916b1bd8da [ 9.722200] Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./990FX Extreme4, BIOS P2.70 06/05/2014 [ 9.732204] Workqueue: events work_for_cpu_fn [ 9.736580] RIP: 0010:snd_ymfpci_create+0x681/0x698 [snd_ymfpci] [ 9.742594] Code: 8c c0 4c 89 e2 48 89 df 48 c7 c6 92 c6 8c c0 e8 15 d0 e9 ff 48 83 c4 08 44 89 e8 5b 5d 41 5c 41 5d 41 5e 41 5f e9 d3 7a 33 e3 <0f> 0b e9 cb fd ff ff 41 bd fb ff ff ff eb db 41 bd f4 ff ff ff eb [ 9.761358] RSP: 0018:ffffab64804e7da0 EFLAGS: 00010287 [ 9.766594] RAX: ffff8fa2df06c400 RBX: ffff8fa3073a8000 RCX: ffff8fa303fbc4a8 [ 9.773734] RDX: ffff8fa2df06d000 RSI: 0000000000000010 RDI: 0000000000000020 [ 9.780876] RBP: ffff8fa300b5d0d0 R08: ffff8fa3073a8e50 R09: 00000000df06bf00 [ 9.788018] R10: ffff8fa2df06bf00 R11: 00000000df068200 R12: ffff8fa3073a8918 [ 9.795159] R13: 0000000000000000 R14: 0000000000000080 R15: ffff8fa2df068200 [ 9.802317] FS: 0000000000000000(0000) GS:ffff8fa9fec00000(0000) knlGS:0000000000000000 [ 9.810414] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 9.816158] CR2: 000055febaf66500 CR3: 0000000101a2e000 CR4: 00000000000406f0 [ 9.823301] Call Trace: [ 9.825747] [ 9.827889] snd_card_ymfpci_probe+0x194/0x950 [snd_ymfpci b78a5fe64b5663a6390a909c67808567e3e73615] [ 9.837030] ? finish_task_switch.isra.0+0x90/0x2d0 [ 9.841918] local_pci_probe+0x45/0x80 [ 9.845680] work_for_cpu_fn+0x1a/0x30 [ 9.849431] process_one_work+0x1c7/0x380 [ 9.853464] worker_thread+0x1af/0x390 [ 9.857225] ? rescuer_thread+0x3b0/0x3b0 [ 9.861254] kthread+0xde/0x110 [ 9.864414] ? kthread_complete_and_exit+0x20/0x20 [ 9.869210] ret_from_fork+0x22/0x30 [ 9.872792] [ 9.874985] ---[ end trace 0000000000000000 ]--- Fixes: 5c1733e33c88 ("ALSA: memalloc: Align buffer allocations in page size") Signed-off-by: Tasos Sahanidis Link: https://lore.kernel.org/r/20230329032808.170403-1-tasos@tasossah.com Signed-off-by: Takashi Iwai Signed-off-by: Sasha Levin --- sound/pci/ymfpci/ymfpci_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/pci/ymfpci/ymfpci_main.c b/sound/pci/ymfpci/ymfpci_main.c index c80114c0ad7b..b492c32ce070 100644 --- a/sound/pci/ymfpci/ymfpci_main.c +++ b/sound/pci/ymfpci/ymfpci_main.c @@ -2165,7 +2165,7 @@ static int snd_ymfpci_memalloc(struct snd_ymfpci *chip) chip->work_base = ptr; chip->work_base_addr = ptr_addr; - snd_BUG_ON(ptr + chip->work_size != + snd_BUG_ON(ptr + PAGE_ALIGN(chip->work_size) != chip->work_ptr->area + chip->work_ptr->bytes); snd_ymfpci_writel(chip, YDSXGR_PLAYCTRLBASE, chip->bank_base_playback_addr); From 6e43dc9dcf22ebbe8ad54e3f2109f129dccd09cc Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 28 Mar 2023 11:27:51 -0500 Subject: [PATCH 108/179] net: ipa: compute DMA pool size properly [ Upstream commit 6c75dc94f2b27fff57b305af9236eea181a00b6c ] In gsi_trans_pool_init_dma(), the total size of a pool of memory used for DMA transactions is calculated. However the calculation is done incorrectly. For 4KB pages, this total size is currently always more than one page, and as a result, the calculation produces a positive (though incorrect) total size. The code still works in this case; we just end up with fewer DMA pool entries than we intended. Bjorn Andersson tested booting a kernel with 16KB pages, and hit a null pointer derereference in sg_alloc_append_table_from_pages(), descending from gsi_trans_pool_init_dma(). The cause of this was that a 16KB total size was going to be allocated, and with 16KB pages the order of that allocation is 0. The total_size calculation yielded 0, which eventually led to the crash. Correcting the total_size calculation fixes the problem. Reported-by: Bjorn Andersson Tested-by: Bjorn Andersson Fixes: 9dd441e4ed57 ("soc: qcom: ipa: GSI transactions") Reviewed-by: Mark Bloch Signed-off-by: Alex Elder Reviewed-by: Leon Romanovsky Link: https://lore.kernel.org/r/20230328162751.2861791-1-elder@linaro.org Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/net/ipa/gsi_trans.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ipa/gsi_trans.c b/drivers/net/ipa/gsi_trans.c index 26b7f683a3e1..fa6863c265eb 100644 --- a/drivers/net/ipa/gsi_trans.c +++ b/drivers/net/ipa/gsi_trans.c @@ -153,7 +153,7 @@ int gsi_trans_pool_init_dma(struct device *dev, struct gsi_trans_pool *pool, * gsi_trans_pool_exit_dma() can assume the total allocated * size is exactly (count * size). */ - total_size = get_order(total_size) << PAGE_SHIFT; + total_size = PAGE_SIZE << get_order(total_size); virt = dma_alloc_coherent(dev, total_size, &addr, GFP_KERNEL); if (!virt) From 37f9fe34482ef5e19cf8fb9dfb43a26e01f101f1 Mon Sep 17 00:00:00 2001 From: Radoslaw Tyl Date: Tue, 28 Mar 2023 10:26:59 -0700 Subject: [PATCH 109/179] i40e: fix registers dump after run ethtool adapter self test [ Upstream commit c5cff16f461a4a434a9915a7be7ac9ced861a8a4 ] Fix invalid registers dump from ethtool -d ethX after adapter self test by ethtool -t ethY. It causes invalid data display. The problem was caused by overwriting i40e_reg_list[].elements which is common for ethtool self test and dump. Fixes: 22dd9ae8afcc ("i40e: Rework register diagnostic") Signed-off-by: Radoslaw Tyl Reviewed-by: Michal Swiatkowski Tested-by: Arpana Arland (A Contingent worker at Intel) Signed-off-by: Tony Nguyen Reviewed-by: Leon Romanovsky Link: https://lore.kernel.org/r/20230328172659.3906413-1-anthony.l.nguyen@intel.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/net/ethernet/intel/i40e/i40e_diag.c | 11 ++++++----- drivers/net/ethernet/intel/i40e/i40e_diag.h | 2 +- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_diag.c b/drivers/net/ethernet/intel/i40e/i40e_diag.c index ef4d3762bf37..ca229b0efeb6 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_diag.c +++ b/drivers/net/ethernet/intel/i40e/i40e_diag.c @@ -44,7 +44,7 @@ static i40e_status i40e_diag_reg_pattern_test(struct i40e_hw *hw, return 0; } -struct i40e_diag_reg_test_info i40e_reg_list[] = { +const struct i40e_diag_reg_test_info i40e_reg_list[] = { /* offset mask elements stride */ {I40E_QTX_CTL(0), 0x0000FFBF, 1, I40E_QTX_CTL(1) - I40E_QTX_CTL(0)}, @@ -78,27 +78,28 @@ i40e_status i40e_diag_reg_test(struct i40e_hw *hw) { i40e_status ret_code = 0; u32 reg, mask; + u32 elements; u32 i, j; for (i = 0; i40e_reg_list[i].offset != 0 && !ret_code; i++) { + elements = i40e_reg_list[i].elements; /* set actual reg range for dynamically allocated resources */ if (i40e_reg_list[i].offset == I40E_QTX_CTL(0) && hw->func_caps.num_tx_qp != 0) - i40e_reg_list[i].elements = hw->func_caps.num_tx_qp; + elements = hw->func_caps.num_tx_qp; if ((i40e_reg_list[i].offset == I40E_PFINT_ITRN(0, 0) || i40e_reg_list[i].offset == I40E_PFINT_ITRN(1, 0) || i40e_reg_list[i].offset == I40E_PFINT_ITRN(2, 0) || i40e_reg_list[i].offset == I40E_QINT_TQCTL(0) || i40e_reg_list[i].offset == I40E_QINT_RQCTL(0)) && hw->func_caps.num_msix_vectors != 0) - i40e_reg_list[i].elements = - hw->func_caps.num_msix_vectors - 1; + elements = hw->func_caps.num_msix_vectors - 1; /* test register access */ mask = i40e_reg_list[i].mask; - for (j = 0; j < i40e_reg_list[i].elements && !ret_code; j++) { + for (j = 0; j < elements && !ret_code; j++) { reg = i40e_reg_list[i].offset + (j * i40e_reg_list[i].stride); ret_code = i40e_diag_reg_pattern_test(hw, reg, mask); diff --git a/drivers/net/ethernet/intel/i40e/i40e_diag.h b/drivers/net/ethernet/intel/i40e/i40e_diag.h index c3340f320a18..1db7c6d57231 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_diag.h +++ b/drivers/net/ethernet/intel/i40e/i40e_diag.h @@ -20,7 +20,7 @@ struct i40e_diag_reg_test_info { u32 stride; /* bytes between each element */ }; -extern struct i40e_diag_reg_test_info i40e_reg_list[]; +extern const struct i40e_diag_reg_test_info i40e_reg_list[]; i40e_status i40e_diag_reg_test(struct i40e_hw *hw); i40e_status i40e_diag_eeprom_test(struct i40e_hw *hw); From a4dd9f756973dae340c8daaae6e1d992f2553c11 Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Tue, 28 Mar 2023 18:30:19 -0700 Subject: [PATCH 110/179] bnxt_en: Fix reporting of test result in ethtool selftest [ Upstream commit 83714dc3db0e4a088673601bc8099b079bc1a077 ] When the selftest command fails, driver is not reporting the failure by updating the "test->flags" when bnxt_close_nic() fails. Fixes: eb51365846bc ("bnxt_en: Add basic ethtool -t selftest support.") Reviewed-by: Pavan Chebbi Reviewed-by: Somnath Kotur Signed-off-by: Kalesh AP Signed-off-by: Michael Chan Reviewed-by: Simon Horman Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c index 703fc163235f..cdbc62ad659c 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c @@ -3634,6 +3634,7 @@ static void bnxt_self_test(struct net_device *dev, struct ethtool_test *etest, bnxt_ulp_stop(bp); rc = bnxt_close_nic(bp, true, false); if (rc) { + etest->flags |= ETH_TEST_FL_FAILED; bnxt_ulp_start(bp, rc); return; } From f06ae13e995ad5cc0e871d19dee919fd00c92700 Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Tue, 28 Mar 2023 18:30:20 -0700 Subject: [PATCH 111/179] bnxt_en: Fix typo in PCI id to device description string mapping [ Upstream commit 62aad36ed31abc80f35db11e187e690448a79f7d ] Fix 57502 and 57508 NPAR description string entries. The typos caused these devices to not match up with lspci output. Fixes: 49c98421e6ab ("bnxt_en: Add PCI IDs for 57500 series NPAR devices.") Reviewed-by: Pavan Chebbi Signed-off-by: Kalesh AP Signed-off-by: Michael Chan Reviewed-by: Simon Horman Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 251b102d2792..c6e36603bd2d 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -175,12 +175,12 @@ static const struct pci_device_id bnxt_pci_tbl[] = { { PCI_VDEVICE(BROADCOM, 0x1750), .driver_data = BCM57508 }, { PCI_VDEVICE(BROADCOM, 0x1751), .driver_data = BCM57504 }, { PCI_VDEVICE(BROADCOM, 0x1752), .driver_data = BCM57502 }, - { PCI_VDEVICE(BROADCOM, 0x1800), .driver_data = BCM57508_NPAR }, + { PCI_VDEVICE(BROADCOM, 0x1800), .driver_data = BCM57502_NPAR }, { PCI_VDEVICE(BROADCOM, 0x1801), .driver_data = BCM57504_NPAR }, - { PCI_VDEVICE(BROADCOM, 0x1802), .driver_data = BCM57502_NPAR }, - { PCI_VDEVICE(BROADCOM, 0x1803), .driver_data = BCM57508_NPAR }, + { PCI_VDEVICE(BROADCOM, 0x1802), .driver_data = BCM57508_NPAR }, + { PCI_VDEVICE(BROADCOM, 0x1803), .driver_data = BCM57502_NPAR }, { PCI_VDEVICE(BROADCOM, 0x1804), .driver_data = BCM57504_NPAR }, - { PCI_VDEVICE(BROADCOM, 0x1805), .driver_data = BCM57502_NPAR }, + { PCI_VDEVICE(BROADCOM, 0x1805), .driver_data = BCM57508_NPAR }, { PCI_VDEVICE(BROADCOM, 0xd802), .driver_data = BCM58802 }, { PCI_VDEVICE(BROADCOM, 0xd804), .driver_data = BCM58804 }, #ifdef CONFIG_BNXT_SRIOV From b055e322907e4b83d366633fa761d25f7319c509 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Tue, 28 Mar 2023 18:30:21 -0700 Subject: [PATCH 112/179] bnxt_en: Add missing 200G link speed reporting [ Upstream commit 581bce7bcb7e7f100908728e7b292e266c76895b ] bnxt_fw_to_ethtool_speed() is missing the case statement for 200G link speed reported by firmware. As a result, ethtool will report unknown speed when the firmware reports 200G link speed. Fixes: 532262ba3b84 ("bnxt_en: ethtool: support PAM4 link speeds up to 200G") Signed-off-by: Michael Chan Reviewed-by: Simon Horman Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/net/ethernet/broadcom/bnxt/bnxt.h | 1 + drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 2 ++ 2 files changed, 3 insertions(+) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index 02741d499bf4..1d2588c92977 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -1225,6 +1225,7 @@ struct bnxt_link_info { #define BNXT_LINK_SPEED_40GB PORT_PHY_QCFG_RESP_LINK_SPEED_40GB #define BNXT_LINK_SPEED_50GB PORT_PHY_QCFG_RESP_LINK_SPEED_50GB #define BNXT_LINK_SPEED_100GB PORT_PHY_QCFG_RESP_LINK_SPEED_100GB +#define BNXT_LINK_SPEED_200GB PORT_PHY_QCFG_RESP_LINK_SPEED_200GB u16 support_speeds; u16 support_pam4_speeds; u16 auto_link_speeds; /* fw adv setting */ diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c index cdbc62ad659c..01b973bc509f 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c @@ -1712,6 +1712,8 @@ u32 bnxt_fw_to_ethtool_speed(u16 fw_link_speed) return SPEED_50000; case BNXT_LINK_SPEED_100GB: return SPEED_100000; + case BNXT_LINK_SPEED_200GB: + return SPEED_200000; default: return SPEED_UNKNOWN; } From 1b0e8aba451031fa031bfb96711af12c4b902fdf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Steffen=20B=C3=A4tz?= Date: Wed, 29 Mar 2023 12:01:40 -0300 Subject: [PATCH 113/179] net: dsa: mv88e6xxx: Enable IGMP snooping on user ports only MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 7bcad0f0e6fbc1d613e49e0ee35c8e5f2e685bb0 ] Do not set the MV88E6XXX_PORT_CTL0_IGMP_MLD_SNOOP bit on CPU or DSA ports. This allows the host CPU port to be a regular IGMP listener by sending out IGMP Membership Reports, which would otherwise not be forwarded by the mv88exxx chip, but directly looped back to the CPU port itself. Fixes: 54d792f257c6 ("net: dsa: Centralise global and port setup code into mv88e6xxx.") Signed-off-by: Steffen Bätz Signed-off-by: Fabio Estevam Reviewed-by: Andrew Lunn Reviewed-by: Vladimir Oltean Reviewed-by: Florian Fainelli Link: https://lore.kernel.org/r/20230329150140.701559-1-festevam@gmail.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/net/dsa/mv88e6xxx/chip.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index 3a6db36574ad..8cf27e2654fc 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -3354,9 +3354,14 @@ static int mv88e6xxx_setup_port(struct mv88e6xxx_chip *chip, int port) * If this is the upstream port for this switch, enable * forwarding of unknown unicasts and multicasts. */ - reg = MV88E6XXX_PORT_CTL0_IGMP_MLD_SNOOP | - MV88E6185_PORT_CTL0_USE_TAG | MV88E6185_PORT_CTL0_USE_IP | + reg = MV88E6185_PORT_CTL0_USE_TAG | MV88E6185_PORT_CTL0_USE_IP | MV88E6XXX_PORT_CTL0_STATE_FORWARDING; + /* Forward any IPv4 IGMP or IPv6 MLD frames received + * by a USER port to the CPU port to allow snooping. + */ + if (dsa_is_user_port(ds, port)) + reg |= MV88E6XXX_PORT_CTL0_IGMP_MLD_SNOOP; + err = mv88e6xxx_port_write(chip, port, MV88E6XXX_PORT_CTL0, reg); if (err) return err; From a7bd974edf12185341f85e1a2197b7065e2322d9 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Thu, 30 Mar 2023 14:08:38 +0200 Subject: [PATCH 114/179] net: ethernet: mtk_eth_soc: fix flow block refcounting logic [ Upstream commit 8c1cb87c2a5c29da416848451a687473f379611c ] Since we call flow_block_cb_decref on FLOW_BLOCK_UNBIND, we also need to call flow_block_cb_incref for a newly allocated cb. Also fix the accidentally inverted refcount check on unbind. Fixes: 502e84e2382d ("net: ethernet: mtk_eth_soc: add flow offloading support") Reviewed-by: Simon Horman Signed-off-by: Felix Fietkau Reviewed-by: Leon Romanovsky Link: https://lore.kernel.org/r/20230330120840.52079-1-nbd@nbd.name Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/net/ethernet/mediatek/mtk_ppe_offload.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mediatek/mtk_ppe_offload.c b/drivers/net/ethernet/mediatek/mtk_ppe_offload.c index 28bbd1df3e30..6a72687d5b83 100644 --- a/drivers/net/ethernet/mediatek/mtk_ppe_offload.c +++ b/drivers/net/ethernet/mediatek/mtk_ppe_offload.c @@ -570,6 +570,7 @@ mtk_eth_setup_tc_block(struct net_device *dev, struct flow_block_offload *f) if (IS_ERR(block_cb)) return PTR_ERR(block_cb); + flow_block_cb_incref(block_cb); flow_block_cb_add(block_cb, f); list_add_tail(&block_cb->driver_list, &block_cb_list); return 0; @@ -578,7 +579,7 @@ mtk_eth_setup_tc_block(struct net_device *dev, struct flow_block_offload *f) if (!block_cb) return -ENOENT; - if (flow_block_cb_decref(block_cb)) { + if (!flow_block_cb_decref(block_cb)) { flow_block_cb_remove(block_cb, f); list_del(&block_cb->driver_list); } From 106fb49b14af1eeb064616090f8cc9afc27516be Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Thu, 30 Mar 2023 14:08:40 +0200 Subject: [PATCH 115/179] net: ethernet: mtk_eth_soc: add missing ppe cache flush when deleting a flow [ Upstream commit 924531326e2dd4ceabe7240f2b55a88e7d894ec2 ] The cache needs to be flushed to ensure that the hardware stops offloading the flow immediately. Fixes: 33fc42de3327 ("net: ethernet: mtk_eth_soc: support creating mac address based offload entries") Reviewed-by: Simon Horman Signed-off-by: Felix Fietkau Reviewed-by: Leon Romanovsky Link: https://lore.kernel.org/r/20230330120840.52079-3-nbd@nbd.name Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/net/ethernet/mediatek/mtk_ppe.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/mediatek/mtk_ppe.c b/drivers/net/ethernet/mediatek/mtk_ppe.c index 34ea8af48c3d..d6eed204574a 100644 --- a/drivers/net/ethernet/mediatek/mtk_ppe.c +++ b/drivers/net/ethernet/mediatek/mtk_ppe.c @@ -438,6 +438,7 @@ __mtk_foe_entry_clear(struct mtk_ppe *ppe, struct mtk_flow_entry *entry) hwe->ib1 &= ~MTK_FOE_IB1_STATE; hwe->ib1 |= FIELD_PREP(MTK_FOE_IB1_STATE, MTK_FOE_STATE_INVALID); dma_wmb(); + mtk_ppe_cache_clear(ppe); } entry->hash = 0xffff; From 2a858f3f804ce62a57ed5660bf9068afe7fd6377 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Mon, 6 Feb 2023 21:37:20 +0100 Subject: [PATCH 116/179] pinctrl: ocelot: Fix alt mode for ocelot [ Upstream commit 657fd9da2d4b4aa0a384105b236baa22fa0233bf ] In case the driver was trying to set an alternate mode for gpio 0 or 32 then the mode was not set correctly. The reason is that there is computation error inside the function ocelot_pinmux_set_mux because in this case it was trying to shift to left by -1. Fix this by actually shifting the function bits and not the position. Fixes: 4b36082e2e09 ("pinctrl: ocelot: fix pinmuxing for pins after 31") Signed-off-by: Horatiu Vultur Link: https://lore.kernel.org/r/20230206203720.1177718-1-horatiu.vultur@microchip.com Signed-off-by: Linus Walleij Signed-off-by: Sasha Levin --- drivers/pinctrl/pinctrl-ocelot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pinctrl/pinctrl-ocelot.c b/drivers/pinctrl/pinctrl-ocelot.c index 3d5995cbcb78..c1d58939dd89 100644 --- a/drivers/pinctrl/pinctrl-ocelot.c +++ b/drivers/pinctrl/pinctrl-ocelot.c @@ -1202,7 +1202,7 @@ static int ocelot_pinmux_set_mux(struct pinctrl_dev *pctldev, regmap_update_bits(info->map, REG_ALT(0, info, pin->pin), BIT(p), f << p); regmap_update_bits(info->map, REG_ALT(1, info, pin->pin), - BIT(p), f << (p - 1)); + BIT(p), (f >> 1) << p); return 0; } From db0ac14908af0bb9bf66d1e79804f52a262679f4 Mon Sep 17 00:00:00 2001 From: Matthias Benkmann Date: Sun, 19 Mar 2023 21:30:15 -0700 Subject: [PATCH 117/179] Input: xpad - fix incorrectly applied patch for MAP_PROFILE_BUTTON [ Upstream commit ffa6206ebf8d39e83d87ac226df68dbbe155819a ] When commit commit fff1011a26d6 ("Input: xpad - add X-Box Adaptive Profile button") was applied, one hunk ended up in the wrong function; move it to where it belongs. Fixes: fff1011a26d6 ("Input: xpad - add X-Box Adaptive Profile button") Signed-off-by: Matthias Benkmann Link: https://lore.kernel.org/r/20230318162106.0aef4ba5@ninja Signed-off-by: Dmitry Torokhov Signed-off-by: Sasha Levin --- drivers/input/joystick/xpad.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/input/joystick/xpad.c b/drivers/input/joystick/xpad.c index 2959d80f7fdb..cd36cf716542 100644 --- a/drivers/input/joystick/xpad.c +++ b/drivers/input/joystick/xpad.c @@ -779,9 +779,6 @@ static void xpad_process_packet(struct usb_xpad *xpad, u16 cmd, unsigned char *d input_report_key(dev, BTN_C, data[8]); input_report_key(dev, BTN_Z, data[9]); - /* Profile button has a value of 0-3, so it is reported as an axis */ - if (xpad->mapping & MAP_PROFILE_BUTTON) - input_report_abs(dev, ABS_PROFILE, data[34]); input_sync(dev); } @@ -1059,6 +1056,10 @@ static void xpadone_process_packet(struct usb_xpad *xpad, u16 cmd, unsigned char (__u16) le16_to_cpup((__le16 *)(data + 8))); } + /* Profile button has a value of 0-3, so it is reported as an axis */ + if (xpad->mapping & MAP_PROFILE_BUTTON) + input_report_abs(dev, ABS_PROFILE, data[34]); + /* paddle handling */ /* based on SDL's SDL_hidapi_xboxone.c */ if (xpad->mapping & MAP_PADDLES) { From 973043d6bdea219b74b3a51442d5c77c11242c71 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Wed, 29 Mar 2023 21:47:20 +0800 Subject: [PATCH 118/179] iommu/vt-d: Allow zero SAGAW if second-stage not supported [ Upstream commit bfd3c6b9fa4a1dc78139dd1621d5bea321ffa69d ] The VT-d spec states (in section 11.4.2) that hardware implementations reporting second-stage translation support (SSTS) field as Clear also report the SAGAW field as 0. Fix an inappropriate check in alloc_iommu(). Fixes: 792fb43ce2c9 ("iommu/vt-d: Enable Intel IOMMU scalable mode by default") Suggested-by: Raghunathan Srinivasan Reviewed-by: Kevin Tian Signed-off-by: Jacob Pan Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20230318024824.124542-1-baolu.lu@linux.intel.com Link: https://lore.kernel.org/r/20230329134721.469447-3-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel Signed-off-by: Sasha Levin --- drivers/iommu/intel/dmar.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c index bc94059a5b87..f800989ea046 100644 --- a/drivers/iommu/intel/dmar.c +++ b/drivers/iommu/intel/dmar.c @@ -1057,7 +1057,8 @@ static int alloc_iommu(struct dmar_drhd_unit *drhd) } err = -EINVAL; - if (cap_sagaw(iommu->cap) == 0) { + if (!cap_sagaw(iommu->cap) && + (!ecap_smts(iommu->ecap) || ecap_slts(iommu->ecap))) { pr_info("%s: No supported address widths. Not attempting DMA translation.\n", iommu->name); drhd->ignored = 1; From 30a8863f21ec8451142241a77cade1b8b8ab17f7 Mon Sep 17 00:00:00 2001 From: Werner Sembach Date: Thu, 23 Mar 2023 18:13:11 -0700 Subject: [PATCH 119/179] Input: i8042 - add TUXEDO devices to i8042 quirk tables for partial fix commit cbedf1a33970c9b825ae75b81fbd3e88e224a418 upstream. A lot of modern Clevo barebones have touchpad and/or keyboard issues after suspend fixable with nomux + reset + noloop + nopnp. Luckily, none of them have an external PS/2 port so this can safely be set for all of them. I'm not entirely sure if every device listed really needs all four quirks, but after testing and production use, no negative effects could be observed when setting all four. Setting SERIO_QUIRK_NOMUX or SERIO_QUIRK_RESET_ALWAYS on the Clevo N150CU and the Clevo NHxxRZQ makes the keyboard very laggy for ~5 seconds after boot and sometimes also after resume. However both are required for the keyboard to not fail completely sometimes after boot or resume. Signed-off-by: Werner Sembach Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20230321191619.647911-1-wse@tuxedocomputers.com Signed-off-by: Dmitry Torokhov Signed-off-by: Greg Kroah-Hartman --- drivers/input/serio/i8042-acpipnpio.h | 28 +++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/drivers/input/serio/i8042-acpipnpio.h b/drivers/input/serio/i8042-acpipnpio.h index efc61736099b..371406f85ab3 100644 --- a/drivers/input/serio/i8042-acpipnpio.h +++ b/drivers/input/serio/i8042-acpipnpio.h @@ -1116,6 +1116,20 @@ static const struct dmi_system_id i8042_dmi_quirk_table[] __initconst = { .driver_data = (void *)(SERIO_QUIRK_NOMUX | SERIO_QUIRK_RESET_ALWAYS | SERIO_QUIRK_NOLOOP | SERIO_QUIRK_NOPNP) }, + { + /* + * Setting SERIO_QUIRK_NOMUX or SERIO_QUIRK_RESET_ALWAYS makes + * the keyboard very laggy for ~5 seconds after boot and + * sometimes also after resume. + * However both are required for the keyboard to not fail + * completely sometimes after boot or resume. + */ + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "N150CU"), + }, + .driver_data = (void *)(SERIO_QUIRK_NOMUX | SERIO_QUIRK_RESET_ALWAYS | + SERIO_QUIRK_NOLOOP | SERIO_QUIRK_NOPNP) + }, { .matches = { DMI_MATCH(DMI_BOARD_NAME, "NH5xAx"), @@ -1123,6 +1137,20 @@ static const struct dmi_system_id i8042_dmi_quirk_table[] __initconst = { .driver_data = (void *)(SERIO_QUIRK_NOMUX | SERIO_QUIRK_RESET_ALWAYS | SERIO_QUIRK_NOLOOP | SERIO_QUIRK_NOPNP) }, + { + /* + * Setting SERIO_QUIRK_NOMUX or SERIO_QUIRK_RESET_ALWAYS makes + * the keyboard very laggy for ~5 seconds after boot and + * sometimes also after resume. + * However both are required for the keyboard to not fail + * completely sometimes after boot or resume. + */ + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "NHxxRZQ"), + }, + .driver_data = (void *)(SERIO_QUIRK_NOMUX | SERIO_QUIRK_RESET_ALWAYS | + SERIO_QUIRK_NOLOOP | SERIO_QUIRK_NOPNP) + }, { .matches = { DMI_MATCH(DMI_BOARD_NAME, "NL5xRU"), From 5ee6df525715002649ba8138f59d77f5d7a1e3c2 Mon Sep 17 00:00:00 2001 From: msizanoen Date: Sun, 19 Mar 2023 23:02:56 -0700 Subject: [PATCH 120/179] Input: alps - fix compatibility with -funsigned-char commit 754ff5060daf5a1cf4474eff9b4edeb6c17ef7ab upstream. The AlpsPS/2 code previously relied on the assumption that `char` is a signed type, which was true on x86 platforms (the only place where this driver is used) before kernel 6.2. However, on 6.2 and later, this assumption is broken due to the introduction of -funsigned-char as a new global compiler flag. Fix this by explicitly specifying the signedness of `char` when sign extending the values received from the device. Fixes: f3f33c677699 ("Input: alps - Rushmore and v7 resolution support") Signed-off-by: msizanoen Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20230320045228.182259-1-msizanoen@qtmlabs.xyz Signed-off-by: Dmitry Torokhov Signed-off-by: Greg Kroah-Hartman --- drivers/input/mouse/alps.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/input/mouse/alps.c b/drivers/input/mouse/alps.c index 4a6b33bbe7ea..dd08ce97e7c9 100644 --- a/drivers/input/mouse/alps.c +++ b/drivers/input/mouse/alps.c @@ -852,8 +852,8 @@ static void alps_process_packet_v6(struct psmouse *psmouse) x = y = z = 0; /* Divide 4 since trackpoint's speed is too fast */ - input_report_rel(dev2, REL_X, (char)x / 4); - input_report_rel(dev2, REL_Y, -((char)y / 4)); + input_report_rel(dev2, REL_X, (s8)x / 4); + input_report_rel(dev2, REL_Y, -((s8)y / 4)); psmouse_report_standard_buttons(dev2, packet[3]); @@ -1104,8 +1104,8 @@ static void alps_process_trackstick_packet_v7(struct psmouse *psmouse) ((packet[3] & 0x20) << 1); z = (packet[5] & 0x3f) | ((packet[3] & 0x80) >> 1); - input_report_rel(dev2, REL_X, (char)x); - input_report_rel(dev2, REL_Y, -((char)y)); + input_report_rel(dev2, REL_X, (s8)x); + input_report_rel(dev2, REL_Y, -((s8)y)); input_report_abs(dev2, ABS_PRESSURE, z); psmouse_report_standard_buttons(dev2, packet[1]); @@ -2294,20 +2294,20 @@ static int alps_get_v3_v7_resolution(struct psmouse *psmouse, int reg_pitch) if (reg < 0) return reg; - x_pitch = (char)(reg << 4) >> 4; /* sign extend lower 4 bits */ + x_pitch = (s8)(reg << 4) >> 4; /* sign extend lower 4 bits */ x_pitch = 50 + 2 * x_pitch; /* In 0.1 mm units */ - y_pitch = (char)reg >> 4; /* sign extend upper 4 bits */ + y_pitch = (s8)reg >> 4; /* sign extend upper 4 bits */ y_pitch = 36 + 2 * y_pitch; /* In 0.1 mm units */ reg = alps_command_mode_read_reg(psmouse, reg_pitch + 1); if (reg < 0) return reg; - x_electrode = (char)(reg << 4) >> 4; /* sign extend lower 4 bits */ + x_electrode = (s8)(reg << 4) >> 4; /* sign extend lower 4 bits */ x_electrode = 17 + x_electrode; - y_electrode = (char)reg >> 4; /* sign extend upper 4 bits */ + y_electrode = (s8)reg >> 4; /* sign extend upper 4 bits */ y_electrode = 13 + y_electrode; x_phys = x_pitch * (x_electrode - 1); /* In 0.1 mm units */ From f8bdc959cefee58a1a17712f031bfcf3ea3e6ba1 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Sun, 19 Mar 2023 21:36:36 -0700 Subject: [PATCH 121/179] Input: focaltech - use explicitly signed char type commit 8980f190947ba29f23110408e712444884b74251 upstream. The recent change of -funsigned-char causes additions of negative numbers to become additions of large positive numbers, leading to wrong calculations of mouse movement. Change these casts to be explicitly signed, to take into account negative offsets. Fixes: 3bc753c06dd0 ("kbuild: treat char as always unsigned") Signed-off-by: Jason A. Donenfeld Reviewed-by: Hans de Goede Cc: stable@vger.kernel.org Link: https://bugzilla.kernel.org/show_bug.cgi?id=217211 Link: https://lore.kernel.org/r/20230318133010.1285202-1-Jason@zx2c4.com Signed-off-by: Dmitry Torokhov Signed-off-by: Greg Kroah-Hartman --- drivers/input/mouse/focaltech.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/input/mouse/focaltech.c b/drivers/input/mouse/focaltech.c index 6fd5fff0cbff..c74b99077d16 100644 --- a/drivers/input/mouse/focaltech.c +++ b/drivers/input/mouse/focaltech.c @@ -202,8 +202,8 @@ static void focaltech_process_rel_packet(struct psmouse *psmouse, state->pressed = packet[0] >> 7; finger1 = ((packet[0] >> 4) & 0x7) - 1; if (finger1 < FOC_MAX_FINGERS) { - state->fingers[finger1].x += (char)packet[1]; - state->fingers[finger1].y += (char)packet[2]; + state->fingers[finger1].x += (s8)packet[1]; + state->fingers[finger1].y += (s8)packet[2]; } else { psmouse_err(psmouse, "First finger in rel packet invalid: %d\n", finger1); @@ -218,8 +218,8 @@ static void focaltech_process_rel_packet(struct psmouse *psmouse, */ finger2 = ((packet[3] >> 4) & 0x7) - 1; if (finger2 < FOC_MAX_FINGERS) { - state->fingers[finger2].x += (char)packet[4]; - state->fingers[finger2].y += (char)packet[5]; + state->fingers[finger2].x += (s8)packet[4]; + state->fingers[finger2].y += (s8)packet[5]; } } From eb94ea52afff7f0d4dfcb200ba45855fcf7986a3 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Wed, 29 Mar 2023 17:14:22 -0300 Subject: [PATCH 122/179] cifs: prevent infinite recursion in CIFSGetDFSRefer() commit 09ba47b44d26b475bbdf9c80db9e0193d2b58956 upstream. We can't call smb_init() in CIFSGetDFSRefer() as cifs_reconnect_tcon() may end up calling CIFSGetDFSRefer() again to get new DFS referrals and thus causing an infinite recursion. Signed-off-by: Paulo Alcantara (SUSE) Reviewed-by: Ronnie Sahlberg Cc: stable@vger.kernel.org # 6.2 Signed-off-by: Steve French Signed-off-by: Greg Kroah-Hartman --- fs/cifs/cifssmb.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 4bc6ba87baf4..c90d4ec9292c 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -4319,8 +4319,13 @@ CIFSGetDFSRefer(const unsigned int xid, struct cifs_ses *ses, return -ENODEV; getDFSRetry: - rc = smb_init(SMB_COM_TRANSACTION2, 15, ses->tcon_ipc, (void **) &pSMB, - (void **) &pSMBr); + /* + * Use smb_init_no_reconnect() instead of smb_init() as + * CIFSGetDFSRefer() may be called from cifs_reconnect_tcon() and thus + * causing an infinite recursion. + */ + rc = smb_init_no_reconnect(SMB_COM_TRANSACTION2, 15, ses->tcon_ipc, + (void **)&pSMB, (void **)&pSMBr); if (rc) return rc; From b7d854c33ab48e55fc233699bbefe39ec9bb5c05 Mon Sep 17 00:00:00 2001 From: David Disseldorp Date: Wed, 29 Mar 2023 22:24:06 +0200 Subject: [PATCH 123/179] cifs: fix DFS traversal oops without CONFIG_CIFS_DFS_UPCALL commit 179a88a8558bbf42991d361595281f3e45d7edfc upstream. When compiled with CONFIG_CIFS_DFS_UPCALL disabled, cifs_dfs_d_automount is NULL. cifs.ko logic for mapping CIFS_FATTR_DFS_REFERRAL attributes to S_AUTOMOUNT and corresponding dentry flags is retained regardless of CONFIG_CIFS_DFS_UPCALL, leading to a NULL pointer dereference in VFS follow_automount() when traversing a DFS referral link: BUG: kernel NULL pointer dereference, address: 0000000000000000 ... Call Trace: __traverse_mounts+0xb5/0x220 ? cifs_revalidate_mapping+0x65/0xc0 [cifs] step_into+0x195/0x610 ? lookup_fast+0xe2/0xf0 path_lookupat+0x64/0x140 filename_lookup+0xc2/0x140 ? __create_object+0x299/0x380 ? kmem_cache_alloc+0x119/0x220 ? user_path_at_empty+0x31/0x50 user_path_at_empty+0x31/0x50 __x64_sys_chdir+0x2a/0xd0 ? exit_to_user_mode_prepare+0xca/0x100 do_syscall_64+0x42/0x90 entry_SYSCALL_64_after_hwframe+0x72/0xdc This fix adds an inline cifs_dfs_d_automount() {return -EREMOTE} handler when CONFIG_CIFS_DFS_UPCALL is disabled. An alternative would be to avoid flagging S_AUTOMOUNT, etc. without CONFIG_CIFS_DFS_UPCALL. This approach was chosen as it provides more control over the error path. Signed-off-by: David Disseldorp Cc: stable@vger.kernel.org Reviewed-by: Paulo Alcantara (SUSE) Reviewed-by: Ronnie Sahlberg Signed-off-by: Steve French Signed-off-by: Greg Kroah-Hartman --- fs/cifs/cifsfs.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 388b745a978e..b6c38896fb2d 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -118,7 +118,10 @@ extern const struct dentry_operations cifs_ci_dentry_ops; #ifdef CONFIG_CIFS_DFS_UPCALL extern struct vfsmount *cifs_dfs_d_automount(struct path *path); #else -#define cifs_dfs_d_automount NULL +static inline struct vfsmount *cifs_dfs_d_automount(struct path *path) +{ + return ERR_PTR(-EREMOTE); +} #endif /* Functions related to symlinks */ From 89c4b695431d33811d9b0103d5a2f059b75c10d3 Mon Sep 17 00:00:00 2001 From: Jonathan Denose Date: Fri, 17 Mar 2023 03:19:51 -0700 Subject: [PATCH 124/179] Input: i8042 - add quirk for Fujitsu Lifebook A574/H commit f5bad62f9107b701a6def7cac1f5f65862219b83 upstream. Fujitsu Lifebook A574/H requires the nomux option to properly probe the touchpad, especially when waking from sleep. Signed-off-by: Jonathan Denose Reviewed-by: Hans de Goede Link: https://lore.kernel.org/r/20230303152623.45859-1-jdenose@google.com Signed-off-by: Dmitry Torokhov Signed-off-by: Greg Kroah-Hartman --- drivers/input/serio/i8042-acpipnpio.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/input/serio/i8042-acpipnpio.h b/drivers/input/serio/i8042-acpipnpio.h index 371406f85ab3..028e45bd050b 100644 --- a/drivers/input/serio/i8042-acpipnpio.h +++ b/drivers/input/serio/i8042-acpipnpio.h @@ -610,6 +610,14 @@ static const struct dmi_system_id i8042_dmi_quirk_table[] __initconst = { }, .driver_data = (void *)(SERIO_QUIRK_NOMUX) }, + { + /* Fujitsu Lifebook A574/H */ + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "FUJITSU"), + DMI_MATCH(DMI_PRODUCT_NAME, "FMVA0501PZ"), + }, + .driver_data = (void *)(SERIO_QUIRK_NOMUX) + }, { /* Gigabyte M912 */ .matches = { From 85ec44199b5a4408f40109d960919cde45c22c89 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Fri, 17 Mar 2023 03:13:12 -0700 Subject: [PATCH 125/179] Input: goodix - add Lenovo Yoga Book X90F to nine_bytes_report DMI table commit 8a0432bab6ea3203d220785da7ab3c7677f70ecb upstream. The Android Lenovo Yoga Book X90F / X90L uses the same goodix touchscreen with 9 bytes touch reports for its touch keyboard as the already supported Windows Lenovo Yoga Book X91F/L, add a DMI match for this to the nine_bytes_report DMI table. When the quirk for the X91F/L was initially added it was written to also apply to the X90F/L but this does not work because the Android version of the Yoga Book uses completely different DMI strings. Also adjust the X91F/L quirk to reflect that it only applies to the X91F/L models. Signed-off-by: Hans de Goede Reviewed-by: Bastien Nocera Link: https://lore.kernel.org/r/20230315134442.71787-1-hdegoede@redhat.com Signed-off-by: Dmitry Torokhov Signed-off-by: Greg Kroah-Hartman --- drivers/input/touchscreen/goodix.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/input/touchscreen/goodix.c b/drivers/input/touchscreen/goodix.c index c281e49826c2..25e575183dd1 100644 --- a/drivers/input/touchscreen/goodix.c +++ b/drivers/input/touchscreen/goodix.c @@ -124,10 +124,18 @@ static const unsigned long goodix_irq_flags[] = { static const struct dmi_system_id nine_bytes_report[] = { #if defined(CONFIG_DMI) && defined(CONFIG_X86) { - .ident = "Lenovo YogaBook", - /* YB1-X91L/F and YB1-X90L/F */ + /* Lenovo Yoga Book X90F / X90L */ .matches = { - DMI_MATCH(DMI_PRODUCT_NAME, "Lenovo YB1-X9") + DMI_EXACT_MATCH(DMI_SYS_VENDOR, "Intel Corporation"), + DMI_EXACT_MATCH(DMI_PRODUCT_NAME, "CHERRYVIEW D1 PLATFORM"), + DMI_EXACT_MATCH(DMI_PRODUCT_VERSION, "YETI-11"), + } + }, + { + /* Lenovo Yoga Book X91F / X91L */ + .matches = { + /* Non exact match to match F + L versions */ + DMI_MATCH(DMI_PRODUCT_NAME, "Lenovo YB1-X91"), } }, #endif From 6134a4bb6b1c411a244edee041ac89266c78d45c Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 22 Mar 2023 09:46:34 +0000 Subject: [PATCH 126/179] btrfs: fix deadlock when aborting transaction during relocation with scrub commit 2d82a40aa7d6fcae0250ec68b8566cdee7bfd44c upstream. Before relocating a block group we pause scrub, then do the relocation and then unpause scrub. The relocation process requires starting and committing a transaction, and if we have a failure in the critical section of the transaction commit path (transaction state >= TRANS_STATE_COMMIT_START), we will deadlock if there is a paused scrub. That results in stack traces like the following: [42.479] BTRFS info (device sdc): relocating block group 53876686848 flags metadata|raid6 [42.936] BTRFS warning (device sdc): Skipping commit of aborted transaction. [42.936] ------------[ cut here ]------------ [42.936] BTRFS: Transaction aborted (error -28) [42.936] WARNING: CPU: 11 PID: 346822 at fs/btrfs/transaction.c:1977 btrfs_commit_transaction+0xcc8/0xeb0 [btrfs] [42.936] Modules linked in: dm_flakey dm_mod loop btrfs (...) [42.936] CPU: 11 PID: 346822 Comm: btrfs Tainted: G W 6.3.0-rc2-btrfs-next-127+ #1 [42.936] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 [42.936] RIP: 0010:btrfs_commit_transaction+0xcc8/0xeb0 [btrfs] [42.936] Code: ff ff 45 8b (...) [42.936] RSP: 0018:ffffb58649633b48 EFLAGS: 00010282 [42.936] RAX: 0000000000000000 RBX: ffff8be6ef4d5bd8 RCX: 0000000000000000 [42.936] RDX: 0000000000000002 RSI: ffffffffb35e7782 RDI: 00000000ffffffff [42.936] RBP: ffff8be6ef4d5c98 R08: 0000000000000000 R09: ffffb586496339e8 [42.936] R10: 0000000000000001 R11: 0000000000000001 R12: ffff8be6d38c7c00 [42.936] R13: 00000000ffffffe4 R14: ffff8be6c268c000 R15: ffff8be6ef4d5cf0 [42.936] FS: 00007f381a82b340(0000) GS:ffff8beddfcc0000(0000) knlGS:0000000000000000 [42.936] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [42.936] CR2: 00007f1e35fb7638 CR3: 0000000117680006 CR4: 0000000000370ee0 [42.936] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [42.936] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [42.936] Call Trace: [42.936] [42.936] ? start_transaction+0xcb/0x610 [btrfs] [42.936] prepare_to_relocate+0x111/0x1a0 [btrfs] [42.936] relocate_block_group+0x57/0x5d0 [btrfs] [42.936] ? btrfs_wait_nocow_writers+0x25/0xb0 [btrfs] [42.936] btrfs_relocate_block_group+0x248/0x3c0 [btrfs] [42.936] ? __pfx_autoremove_wake_function+0x10/0x10 [42.936] btrfs_relocate_chunk+0x3b/0x150 [btrfs] [42.936] btrfs_balance+0x8ff/0x11d0 [btrfs] [42.936] ? __kmem_cache_alloc_node+0x14a/0x410 [42.936] btrfs_ioctl+0x2334/0x32c0 [btrfs] [42.937] ? mod_objcg_state+0xd2/0x360 [42.937] ? refill_obj_stock+0xb0/0x160 [42.937] ? seq_release+0x25/0x30 [42.937] ? __rseq_handle_notify_resume+0x3b5/0x4b0 [42.937] ? percpu_counter_add_batch+0x2e/0xa0 [42.937] ? __x64_sys_ioctl+0x88/0xc0 [42.937] __x64_sys_ioctl+0x88/0xc0 [42.937] do_syscall_64+0x38/0x90 [42.937] entry_SYSCALL_64_after_hwframe+0x72/0xdc [42.937] RIP: 0033:0x7f381a6ffe9b [42.937] Code: 00 48 89 44 24 (...) [42.937] RSP: 002b:00007ffd45ecf060 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 [42.937] RAX: ffffffffffffffda RBX: 0000000000000001 RCX: 00007f381a6ffe9b [42.937] RDX: 00007ffd45ecf150 RSI: 00000000c4009420 RDI: 0000000000000003 [42.937] RBP: 0000000000000003 R08: 0000000000000013 R09: 0000000000000000 [42.937] R10: 00007f381a60c878 R11: 0000000000000246 R12: 00007ffd45ed0423 [42.937] R13: 00007ffd45ecf150 R14: 0000000000000000 R15: 00007ffd45ecf148 [42.937] [42.937] ---[ end trace 0000000000000000 ]--- [42.937] BTRFS: error (device sdc: state A) in cleanup_transaction:1977: errno=-28 No space left [59.196] INFO: task btrfs:346772 blocked for more than 120 seconds. [59.196] Tainted: G W 6.3.0-rc2-btrfs-next-127+ #1 [59.196] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [59.196] task:btrfs state:D stack:0 pid:346772 ppid:1 flags:0x00004002 [59.196] Call Trace: [59.196] [59.196] __schedule+0x392/0xa70 [59.196] ? __pv_queued_spin_lock_slowpath+0x165/0x370 [59.196] schedule+0x5d/0xd0 [59.196] __scrub_blocked_if_needed+0x74/0xc0 [btrfs] [59.197] ? __pfx_autoremove_wake_function+0x10/0x10 [59.197] scrub_pause_off+0x21/0x50 [btrfs] [59.197] scrub_simple_mirror+0x1c7/0x950 [btrfs] [59.197] ? scrub_parity_put+0x1a5/0x1d0 [btrfs] [59.198] ? __pfx_autoremove_wake_function+0x10/0x10 [59.198] scrub_stripe+0x20d/0x740 [btrfs] [59.198] scrub_chunk+0xc4/0x130 [btrfs] [59.198] scrub_enumerate_chunks+0x3e4/0x7a0 [btrfs] [59.198] ? __pfx_autoremove_wake_function+0x10/0x10 [59.198] btrfs_scrub_dev+0x236/0x6a0 [btrfs] [59.199] ? btrfs_ioctl+0xd97/0x32c0 [btrfs] [59.199] ? _copy_from_user+0x7b/0x80 [59.199] btrfs_ioctl+0xde1/0x32c0 [btrfs] [59.199] ? refill_stock+0x33/0x50 [59.199] ? should_failslab+0xa/0x20 [59.199] ? kmem_cache_alloc_node+0x151/0x460 [59.199] ? alloc_io_context+0x1b/0x80 [59.199] ? preempt_count_add+0x70/0xa0 [59.199] ? __x64_sys_ioctl+0x88/0xc0 [59.199] __x64_sys_ioctl+0x88/0xc0 [59.199] do_syscall_64+0x38/0x90 [59.199] entry_SYSCALL_64_after_hwframe+0x72/0xdc [59.199] RIP: 0033:0x7f82ffaffe9b [59.199] RSP: 002b:00007f82ff9fcc50 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 [59.199] RAX: ffffffffffffffda RBX: 000055b191e36310 RCX: 00007f82ffaffe9b [59.199] RDX: 000055b191e36310 RSI: 00000000c400941b RDI: 0000000000000003 [59.199] RBP: 0000000000000000 R08: 00007fff1575016f R09: 0000000000000000 [59.199] R10: 0000000000000000 R11: 0000000000000246 R12: 00007f82ff9fd640 [59.199] R13: 000000000000006b R14: 00007f82ffa87580 R15: 0000000000000000 [59.199] [59.199] INFO: task btrfs:346773 blocked for more than 120 seconds. [59.200] Tainted: G W 6.3.0-rc2-btrfs-next-127+ #1 [59.200] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [59.201] task:btrfs state:D stack:0 pid:346773 ppid:1 flags:0x00004002 [59.201] Call Trace: [59.201] [59.201] __schedule+0x392/0xa70 [59.201] ? __pv_queued_spin_lock_slowpath+0x165/0x370 [59.201] schedule+0x5d/0xd0 [59.201] __scrub_blocked_if_needed+0x74/0xc0 [btrfs] [59.201] ? __pfx_autoremove_wake_function+0x10/0x10 [59.201] scrub_pause_off+0x21/0x50 [btrfs] [59.202] scrub_simple_mirror+0x1c7/0x950 [btrfs] [59.202] ? scrub_parity_put+0x1a5/0x1d0 [btrfs] [59.202] ? __pfx_autoremove_wake_function+0x10/0x10 [59.202] scrub_stripe+0x20d/0x740 [btrfs] [59.202] scrub_chunk+0xc4/0x130 [btrfs] [59.203] scrub_enumerate_chunks+0x3e4/0x7a0 [btrfs] [59.203] ? __pfx_autoremove_wake_function+0x10/0x10 [59.203] btrfs_scrub_dev+0x236/0x6a0 [btrfs] [59.203] ? btrfs_ioctl+0xd97/0x32c0 [btrfs] [59.203] ? _copy_from_user+0x7b/0x80 [59.203] btrfs_ioctl+0xde1/0x32c0 [btrfs] [59.204] ? should_failslab+0xa/0x20 [59.204] ? kmem_cache_alloc_node+0x151/0x460 [59.204] ? alloc_io_context+0x1b/0x80 [59.204] ? preempt_count_add+0x70/0xa0 [59.204] ? __x64_sys_ioctl+0x88/0xc0 [59.204] __x64_sys_ioctl+0x88/0xc0 [59.204] do_syscall_64+0x38/0x90 [59.204] entry_SYSCALL_64_after_hwframe+0x72/0xdc [59.204] RIP: 0033:0x7f82ffaffe9b [59.204] RSP: 002b:00007f82ff1fbc50 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 [59.204] RAX: ffffffffffffffda RBX: 000055b191e36790 RCX: 00007f82ffaffe9b [59.204] RDX: 000055b191e36790 RSI: 00000000c400941b RDI: 0000000000000003 [59.204] RBP: 0000000000000000 R08: 00007fff1575016f R09: 0000000000000000 [59.204] R10: 0000000000000000 R11: 0000000000000246 R12: 00007f82ff1fc640 [59.204] R13: 000000000000006b R14: 00007f82ffa87580 R15: 0000000000000000 [59.204] [59.204] INFO: task btrfs:346774 blocked for more than 120 seconds. [59.205] Tainted: G W 6.3.0-rc2-btrfs-next-127+ #1 [59.205] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [59.206] task:btrfs state:D stack:0 pid:346774 ppid:1 flags:0x00004002 [59.206] Call Trace: [59.206] [59.206] __schedule+0x392/0xa70 [59.206] schedule+0x5d/0xd0 [59.206] __scrub_blocked_if_needed+0x74/0xc0 [btrfs] [59.206] ? __pfx_autoremove_wake_function+0x10/0x10 [59.206] scrub_pause_off+0x21/0x50 [btrfs] [59.207] scrub_simple_mirror+0x1c7/0x950 [btrfs] [59.207] ? scrub_parity_put+0x1a5/0x1d0 [btrfs] [59.207] ? __pfx_autoremove_wake_function+0x10/0x10 [59.207] scrub_stripe+0x20d/0x740 [btrfs] [59.208] scrub_chunk+0xc4/0x130 [btrfs] [59.208] scrub_enumerate_chunks+0x3e4/0x7a0 [btrfs] [59.208] ? __mutex_unlock_slowpath.isra.0+0x9a/0x120 [59.208] btrfs_scrub_dev+0x236/0x6a0 [btrfs] [59.208] ? btrfs_ioctl+0xd97/0x32c0 [btrfs] [59.209] ? _copy_from_user+0x7b/0x80 [59.209] btrfs_ioctl+0xde1/0x32c0 [btrfs] [59.209] ? should_failslab+0xa/0x20 [59.209] ? kmem_cache_alloc_node+0x151/0x460 [59.209] ? alloc_io_context+0x1b/0x80 [59.209] ? preempt_count_add+0x70/0xa0 [59.209] ? __x64_sys_ioctl+0x88/0xc0 [59.209] __x64_sys_ioctl+0x88/0xc0 [59.209] do_syscall_64+0x38/0x90 [59.209] entry_SYSCALL_64_after_hwframe+0x72/0xdc [59.209] RIP: 0033:0x7f82ffaffe9b [59.209] RSP: 002b:00007f82fe9fac50 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 [59.209] RAX: ffffffffffffffda RBX: 000055b191e36c10 RCX: 00007f82ffaffe9b [59.209] RDX: 000055b191e36c10 RSI: 00000000c400941b RDI: 0000000000000003 [59.209] RBP: 0000000000000000 R08: 00007fff1575016f R09: 0000000000000000 [59.209] R10: 0000000000000000 R11: 0000000000000246 R12: 00007f82fe9fb640 [59.209] R13: 000000000000006b R14: 00007f82ffa87580 R15: 0000000000000000 [59.209] [59.209] INFO: task btrfs:346775 blocked for more than 120 seconds. [59.210] Tainted: G W 6.3.0-rc2-btrfs-next-127+ #1 [59.210] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [59.211] task:btrfs state:D stack:0 pid:346775 ppid:1 flags:0x00004002 [59.211] Call Trace: [59.211] [59.211] __schedule+0x392/0xa70 [59.211] schedule+0x5d/0xd0 [59.211] __scrub_blocked_if_needed+0x74/0xc0 [btrfs] [59.211] ? __pfx_autoremove_wake_function+0x10/0x10 [59.211] scrub_pause_off+0x21/0x50 [btrfs] [59.212] scrub_simple_mirror+0x1c7/0x950 [btrfs] [59.212] ? scrub_parity_put+0x1a5/0x1d0 [btrfs] [59.212] ? __pfx_autoremove_wake_function+0x10/0x10 [59.212] scrub_stripe+0x20d/0x740 [btrfs] [59.213] scrub_chunk+0xc4/0x130 [btrfs] [59.213] scrub_enumerate_chunks+0x3e4/0x7a0 [btrfs] [59.213] ? __mutex_unlock_slowpath.isra.0+0x9a/0x120 [59.213] btrfs_scrub_dev+0x236/0x6a0 [btrfs] [59.213] ? btrfs_ioctl+0xd97/0x32c0 [btrfs] [59.214] ? _copy_from_user+0x7b/0x80 [59.214] btrfs_ioctl+0xde1/0x32c0 [btrfs] [59.214] ? should_failslab+0xa/0x20 [59.214] ? kmem_cache_alloc_node+0x151/0x460 [59.214] ? alloc_io_context+0x1b/0x80 [59.214] ? preempt_count_add+0x70/0xa0 [59.214] ? __x64_sys_ioctl+0x88/0xc0 [59.214] __x64_sys_ioctl+0x88/0xc0 [59.214] do_syscall_64+0x38/0x90 [59.214] entry_SYSCALL_64_after_hwframe+0x72/0xdc [59.214] RIP: 0033:0x7f82ffaffe9b [59.214] RSP: 002b:00007f82fe1f9c50 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 [59.214] RAX: ffffffffffffffda RBX: 000055b191e37090 RCX: 00007f82ffaffe9b [59.214] RDX: 000055b191e37090 RSI: 00000000c400941b RDI: 0000000000000003 [59.214] RBP: 0000000000000000 R08: 00007fff1575016f R09: 0000000000000000 [59.214] R10: 0000000000000000 R11: 0000000000000246 R12: 00007f82fe1fa640 [59.214] R13: 000000000000006b R14: 00007f82ffa87580 R15: 0000000000000000 [59.214] [59.214] INFO: task btrfs:346776 blocked for more than 120 seconds. [59.215] Tainted: G W 6.3.0-rc2-btrfs-next-127+ #1 [59.216] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [59.217] task:btrfs state:D stack:0 pid:346776 ppid:1 flags:0x00004002 [59.217] Call Trace: [59.217] [59.217] __schedule+0x392/0xa70 [59.217] ? __pv_queued_spin_lock_slowpath+0x165/0x370 [59.217] schedule+0x5d/0xd0 [59.217] __scrub_blocked_if_needed+0x74/0xc0 [btrfs] [59.217] ? __pfx_autoremove_wake_function+0x10/0x10 [59.217] scrub_pause_off+0x21/0x50 [btrfs] [59.217] scrub_simple_mirror+0x1c7/0x950 [btrfs] [59.217] ? scrub_parity_put+0x1a5/0x1d0 [btrfs] [59.218] ? __pfx_autoremove_wake_function+0x10/0x10 [59.218] scrub_stripe+0x20d/0x740 [btrfs] [59.218] scrub_chunk+0xc4/0x130 [btrfs] [59.218] scrub_enumerate_chunks+0x3e4/0x7a0 [btrfs] [59.219] ? __pfx_autoremove_wake_function+0x10/0x10 [59.219] btrfs_scrub_dev+0x236/0x6a0 [btrfs] [59.219] ? btrfs_ioctl+0xd97/0x32c0 [btrfs] [59.219] ? _copy_from_user+0x7b/0x80 [59.219] btrfs_ioctl+0xde1/0x32c0 [btrfs] [59.219] ? should_failslab+0xa/0x20 [59.219] ? kmem_cache_alloc_node+0x151/0x460 [59.219] ? alloc_io_context+0x1b/0x80 [59.219] ? preempt_count_add+0x70/0xa0 [59.219] ? __x64_sys_ioctl+0x88/0xc0 [59.219] __x64_sys_ioctl+0x88/0xc0 [59.219] do_syscall_64+0x38/0x90 [59.219] entry_SYSCALL_64_after_hwframe+0x72/0xdc [59.219] RIP: 0033:0x7f82ffaffe9b [59.219] RSP: 002b:00007f82fd9f8c50 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 [59.219] RAX: ffffffffffffffda RBX: 000055b191e37510 RCX: 00007f82ffaffe9b [59.219] RDX: 000055b191e37510 RSI: 00000000c400941b RDI: 0000000000000003 [59.219] RBP: 0000000000000000 R08: 00007fff1575016f R09: 0000000000000000 [59.219] R10: 0000000000000000 R11: 0000000000000246 R12: 00007f82fd9f9640 [59.219] R13: 000000000000006b R14: 00007f82ffa87580 R15: 0000000000000000 [59.219] [59.219] INFO: task btrfs:346822 blocked for more than 120 seconds. [59.220] Tainted: G W 6.3.0-rc2-btrfs-next-127+ #1 [59.221] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [59.222] task:btrfs state:D stack:0 pid:346822 ppid:1 flags:0x00004002 [59.222] Call Trace: [59.222] [59.222] __schedule+0x392/0xa70 [59.222] schedule+0x5d/0xd0 [59.222] btrfs_scrub_cancel+0x91/0x100 [btrfs] [59.222] ? __pfx_autoremove_wake_function+0x10/0x10 [59.222] btrfs_commit_transaction+0x572/0xeb0 [btrfs] [59.223] ? start_transaction+0xcb/0x610 [btrfs] [59.223] prepare_to_relocate+0x111/0x1a0 [btrfs] [59.223] relocate_block_group+0x57/0x5d0 [btrfs] [59.223] ? btrfs_wait_nocow_writers+0x25/0xb0 [btrfs] [59.223] btrfs_relocate_block_group+0x248/0x3c0 [btrfs] [59.224] ? __pfx_autoremove_wake_function+0x10/0x10 [59.224] btrfs_relocate_chunk+0x3b/0x150 [btrfs] [59.224] btrfs_balance+0x8ff/0x11d0 [btrfs] [59.224] ? __kmem_cache_alloc_node+0x14a/0x410 [59.224] btrfs_ioctl+0x2334/0x32c0 [btrfs] [59.225] ? mod_objcg_state+0xd2/0x360 [59.225] ? refill_obj_stock+0xb0/0x160 [59.225] ? seq_release+0x25/0x30 [59.225] ? __rseq_handle_notify_resume+0x3b5/0x4b0 [59.225] ? percpu_counter_add_batch+0x2e/0xa0 [59.225] ? __x64_sys_ioctl+0x88/0xc0 [59.225] __x64_sys_ioctl+0x88/0xc0 [59.225] do_syscall_64+0x38/0x90 [59.225] entry_SYSCALL_64_after_hwframe+0x72/0xdc [59.225] RIP: 0033:0x7f381a6ffe9b [59.225] RSP: 002b:00007ffd45ecf060 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 [59.225] RAX: ffffffffffffffda RBX: 0000000000000001 RCX: 00007f381a6ffe9b [59.225] RDX: 00007ffd45ecf150 RSI: 00000000c4009420 RDI: 0000000000000003 [59.225] RBP: 0000000000000003 R08: 0000000000000013 R09: 0000000000000000 [59.225] R10: 00007f381a60c878 R11: 0000000000000246 R12: 00007ffd45ed0423 [59.225] R13: 00007ffd45ecf150 R14: 0000000000000000 R15: 00007ffd45ecf148 [59.225] What happens is the following: 1) A scrub is running, so fs_info->scrubs_running is 1; 2) Task A starts block group relocation, and at btrfs_relocate_chunk() it pauses scrub by calling btrfs_scrub_pause(). That increments fs_info->scrub_pause_req from 0 to 1 and waits for the scrub task to pause (for fs_info->scrubs_paused to be == to fs_info->scrubs_running); 3) The scrub task pauses at scrub_pause_off(), waiting for fs_info->scrub_pause_req to decrease to 0; 4) Task A then enters btrfs_relocate_block_group(), and down that call chain we start a transaction and then attempt to commit it; 5) When task A calls btrfs_commit_transaction(), it either will do the commit itself or wait for some other task that already started the commit of the transaction - it doesn't matter which case; 6) The transaction commit enters state TRANS_STATE_COMMIT_START; 7) An error happens during the transaction commit, like -ENOSPC when running delayed refs or delayed items for example; 8) This results in calling transaction.c:cleanup_transaction(), where we call btrfs_scrub_cancel(), incrementing fs_info->scrub_cancel_req from 0 to 1, and blocking this task waiting for fs_info->scrubs_running to decrease to 0; 9) From this point on, both the transaction commit and the scrub task hang forever: 1) The transaction commit is waiting for fs_info->scrubs_running to be decreased to 0; 2) The scrub task is at scrub_pause_off() waiting for fs_info->scrub_pause_req to decrease to 0 - so it can not proceed to stop the scrub and decrement fs_info->scrubs_running from 0 to 1. Therefore resulting in a deadlock. Fix this by having cleanup_transaction(), called if a transaction commit fails, not call btrfs_scrub_cancel() if relocation is in progress, and having btrfs_relocate_block_group() call btrfs_scrub_cancel() instead if the relocation failed and a transaction abort happened. This was triggered with btrfs/061 from fstests. Fixes: 55e3a601c81c ("btrfs: Fix data checksum error cause by replace with io-load.") CC: stable@vger.kernel.org # 4.14+ Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/transaction.c | 15 ++++++++++++++- fs/btrfs/volumes.c | 9 ++++++++- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index d1f1da6820fb..682b463a7633 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -2009,7 +2009,20 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, int err) if (current->journal_info == trans) current->journal_info = NULL; - btrfs_scrub_cancel(fs_info); + + /* + * If relocation is running, we can't cancel scrub because that will + * result in a deadlock. Before relocating a block group, relocation + * pauses scrub, then starts and commits a transaction before unpausing + * scrub. If the transaction commit is being done by the relocation + * task or triggered by another task and the relocation task is waiting + * for the commit, and we end up here due to an error in the commit + * path, then calling btrfs_scrub_cancel() will deadlock, as we are + * asking for scrub to stop while having it asked to be paused higher + * above in relocation code. + */ + if (!test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)) + btrfs_scrub_cancel(fs_info); kmem_cache_free(btrfs_trans_handle_cachep, trans); } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f02b8cbd6ec4..6a2b6042845c 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3286,8 +3286,15 @@ int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) btrfs_scrub_pause(fs_info); ret = btrfs_relocate_block_group(fs_info, chunk_offset); btrfs_scrub_continue(fs_info); - if (ret) + if (ret) { + /* + * If we had a transaction abort, stop all running scrubs. + * See transaction.c:cleanup_transaction() why we do it here. + */ + if (BTRFS_FS_ERROR(fs_info)) + btrfs_scrub_cancel(fs_info); return ret; + } block_group = btrfs_lookup_block_group(fs_info, chunk_offset); if (!block_group) From a38ff2024805a30d9b96f52557c6ea0bbc31252a Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 22 Mar 2023 10:33:28 +0000 Subject: [PATCH 127/179] btrfs: fix race between quota disable and quota assign ioctls commit 2f1a6be12ab6c8470d5776e68644726c94257c54 upstream. The quota assign ioctl can currently run in parallel with a quota disable ioctl call. The assign ioctl uses the quota root, while the disable ioctl frees that root, and therefore we can have a use-after-free triggered in the assign ioctl, leading to a trace like the following when KASAN is enabled: [672.723][T736] BUG: KASAN: slab-use-after-free in btrfs_search_slot+0x2962/0x2db0 [672.723][T736] Read of size 8 at addr ffff888022ec0208 by task btrfs_search_sl/27736 [672.724][T736] [672.725][T736] CPU: 1 PID: 27736 Comm: btrfs_search_sl Not tainted 6.3.0-rc3 #37 [672.723][T736] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014 [672.727][T736] Call Trace: [672.728][T736] [672.728][T736] dump_stack_lvl+0xd9/0x150 [672.725][T736] print_report+0xc1/0x5e0 [672.720][T736] ? __virt_addr_valid+0x61/0x2e0 [672.727][T736] ? __phys_addr+0xc9/0x150 [672.725][T736] ? btrfs_search_slot+0x2962/0x2db0 [672.722][T736] kasan_report+0xc0/0xf0 [672.729][T736] ? btrfs_search_slot+0x2962/0x2db0 [672.724][T736] btrfs_search_slot+0x2962/0x2db0 [672.723][T736] ? fs_reclaim_acquire+0xba/0x160 [672.722][T736] ? split_leaf+0x13d0/0x13d0 [672.726][T736] ? rcu_is_watching+0x12/0xb0 [672.723][T736] ? kmem_cache_alloc+0x338/0x3c0 [672.722][T736] update_qgroup_status_item+0xf7/0x320 [672.724][T736] ? add_qgroup_rb+0x3d0/0x3d0 [672.739][T736] ? do_raw_spin_lock+0x12d/0x2b0 [672.730][T736] ? spin_bug+0x1d0/0x1d0 [672.737][T736] btrfs_run_qgroups+0x5de/0x840 [672.730][T736] ? btrfs_qgroup_rescan_worker+0xa70/0xa70 [672.738][T736] ? __del_qgroup_relation+0x4ba/0xe00 [672.738][T736] btrfs_ioctl+0x3d58/0x5d80 [672.735][T736] ? tomoyo_path_number_perm+0x16a/0x550 [672.737][T736] ? tomoyo_execute_permission+0x4a0/0x4a0 [672.731][T736] ? btrfs_ioctl_get_supported_features+0x50/0x50 [672.737][T736] ? __sanitizer_cov_trace_switch+0x54/0x90 [672.734][T736] ? do_vfs_ioctl+0x132/0x1660 [672.730][T736] ? vfs_fileattr_set+0xc40/0xc40 [672.730][T736] ? _raw_spin_unlock_irq+0x2e/0x50 [672.732][T736] ? sigprocmask+0xf2/0x340 [672.737][T736] ? __fget_files+0x26a/0x480 [672.732][T736] ? bpf_lsm_file_ioctl+0x9/0x10 [672.738][T736] ? btrfs_ioctl_get_supported_features+0x50/0x50 [672.736][T736] __x64_sys_ioctl+0x198/0x210 [672.736][T736] do_syscall_64+0x39/0xb0 [672.731][T736] entry_SYSCALL_64_after_hwframe+0x63/0xcd [672.739][T736] RIP: 0033:0x4556ad [672.742][T736] [672.743][T736] [672.748][T736] Allocated by task 27677: [672.743][T736] kasan_save_stack+0x22/0x40 [672.741][T736] kasan_set_track+0x25/0x30 [672.741][T736] __kasan_kmalloc+0xa4/0xb0 [672.749][T736] btrfs_alloc_root+0x48/0x90 [672.746][T736] btrfs_create_tree+0x146/0xa20 [672.744][T736] btrfs_quota_enable+0x461/0x1d20 [672.743][T736] btrfs_ioctl+0x4a1c/0x5d80 [672.747][T736] __x64_sys_ioctl+0x198/0x210 [672.749][T736] do_syscall_64+0x39/0xb0 [672.744][T736] entry_SYSCALL_64_after_hwframe+0x63/0xcd [672.756][T736] [672.757][T736] Freed by task 27677: [672.759][T736] kasan_save_stack+0x22/0x40 [672.759][T736] kasan_set_track+0x25/0x30 [672.756][T736] kasan_save_free_info+0x2e/0x50 [672.751][T736] ____kasan_slab_free+0x162/0x1c0 [672.758][T736] slab_free_freelist_hook+0x89/0x1c0 [672.752][T736] __kmem_cache_free+0xaf/0x2e0 [672.752][T736] btrfs_put_root+0x1ff/0x2b0 [672.759][T736] btrfs_quota_disable+0x80a/0xbc0 [672.752][T736] btrfs_ioctl+0x3e5f/0x5d80 [672.756][T736] __x64_sys_ioctl+0x198/0x210 [672.753][T736] do_syscall_64+0x39/0xb0 [672.765][T736] entry_SYSCALL_64_after_hwframe+0x63/0xcd [672.769][T736] [672.768][T736] The buggy address belongs to the object at ffff888022ec0000 [672.768][T736] which belongs to the cache kmalloc-4k of size 4096 [672.769][T736] The buggy address is located 520 bytes inside of [672.769][T736] freed 4096-byte region [ffff888022ec0000, ffff888022ec1000) [672.760][T736] [672.764][T736] The buggy address belongs to the physical page: [672.761][T736] page:ffffea00008bb000 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x22ec0 [672.766][T736] head:ffffea00008bb000 order:3 entire_mapcount:0 nr_pages_mapped:0 pincount:0 [672.779][T736] flags: 0xfff00000010200(slab|head|node=0|zone=1|lastcpupid=0x7ff) [672.770][T736] raw: 00fff00000010200 ffff888012842140 ffffea000054ba00 dead000000000002 [672.770][T736] raw: 0000000000000000 0000000000040004 00000001ffffffff 0000000000000000 [672.771][T736] page dumped because: kasan: bad access detected [672.778][T736] page_owner tracks the page as allocated [672.777][T736] page last allocated via order 3, migratetype Unmovable, gfp_mask 0xd2040(__GFP_IO|__GFP_NOWARN|__GFP_NORETRY|__GFP_COMP|__GFP_NOMEMALLOC), pid 88 [672.779][T736] get_page_from_freelist+0x119c/0x2d50 [672.779][T736] __alloc_pages+0x1cb/0x4a0 [672.776][T736] alloc_pages+0x1aa/0x270 [672.773][T736] allocate_slab+0x260/0x390 [672.771][T736] ___slab_alloc+0xa9a/0x13e0 [672.778][T736] __slab_alloc.constprop.0+0x56/0xb0 [672.771][T736] __kmem_cache_alloc_node+0x136/0x320 [672.789][T736] __kmalloc+0x4e/0x1a0 [672.783][T736] tomoyo_realpath_from_path+0xc3/0x600 [672.781][T736] tomoyo_path_perm+0x22f/0x420 [672.782][T736] tomoyo_path_unlink+0x92/0xd0 [672.780][T736] security_path_unlink+0xdb/0x150 [672.788][T736] do_unlinkat+0x377/0x680 [672.788][T736] __x64_sys_unlink+0xca/0x110 [672.789][T736] do_syscall_64+0x39/0xb0 [672.783][T736] entry_SYSCALL_64_after_hwframe+0x63/0xcd [672.784][T736] page last free stack trace: [672.787][T736] free_pcp_prepare+0x4e5/0x920 [672.787][T736] free_unref_page+0x1d/0x4e0 [672.784][T736] __unfreeze_partials+0x17c/0x1a0 [672.797][T736] qlist_free_all+0x6a/0x180 [672.796][T736] kasan_quarantine_reduce+0x189/0x1d0 [672.797][T736] __kasan_slab_alloc+0x64/0x90 [672.793][T736] kmem_cache_alloc+0x17c/0x3c0 [672.799][T736] getname_flags.part.0+0x50/0x4e0 [672.799][T736] getname_flags+0x9e/0xe0 [672.792][T736] vfs_fstatat+0x77/0xb0 [672.791][T736] __do_sys_newlstat+0x84/0x100 [672.798][T736] do_syscall_64+0x39/0xb0 [672.796][T736] entry_SYSCALL_64_after_hwframe+0x63/0xcd [672.790][T736] [672.791][T736] Memory state around the buggy address: [672.799][T736] ffff888022ec0100: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [672.805][T736] ffff888022ec0180: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [672.802][T736] >ffff888022ec0200: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [672.809][T736] ^ [672.809][T736] ffff888022ec0280: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [672.809][T736] ffff888022ec0300: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb Fix this by having the qgroup assign ioctl take the qgroup ioctl mutex before calling btrfs_run_qgroups(), which is what all qgroup ioctls should call. Reported-by: butt3rflyh4ck Link: https://lore.kernel.org/linux-btrfs/CAFcO6XN3VD8ogmHwqRk4kbiwtpUSNySu2VAxN8waEPciCHJvMA@mail.gmail.com/ CC: stable@vger.kernel.org # 5.10+ Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/ioctl.c | 2 ++ fs/btrfs/qgroup.c | 11 ++++++++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index c05f16a35bca..fe2fb81da46b 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4621,7 +4621,9 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg) } /* update qgroup status and info */ + mutex_lock(&fs_info->qgroup_ioctl_lock); err = btrfs_run_qgroups(trans); + mutex_unlock(&fs_info->qgroup_ioctl_lock); if (err < 0) btrfs_handle_fs_error(fs_info, err, "failed to update qgroup status and info"); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 5ac65384c947..f8b054898885 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2812,13 +2812,22 @@ cleanup: } /* - * called from commit_transaction. Writes all changed qgroups to disk. + * Writes all changed qgroups to disk. + * Called by the transaction commit path and the qgroup assign ioctl. */ int btrfs_run_qgroups(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; int ret = 0; + /* + * In case we are called from the qgroup assign ioctl, assert that we + * are holding the qgroup_ioctl_lock, otherwise we can race with a quota + * disable operation (ioctl) and access a freed quota root. + */ + if (trans->transaction->state != TRANS_STATE_COMMIT_DOING) + lockdep_assert_held(&fs_info->qgroup_ioctl_lock); + if (!fs_info->quota_root) return ret; From 8a6539ea362eb2cae3148012dbc9c6bc943c1b6a Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Thu, 23 Mar 2023 15:56:48 +0800 Subject: [PATCH 128/179] btrfs: scan device in non-exclusive mode commit 50d281fc434cb8e2497f5e70a309ccca6b1a09f0 upstream. This fixes mkfs/mount/check failures due to race with systemd-udevd scan. During the device scan initiated by systemd-udevd, other user space EXCL operations such as mkfs, mount, or check may get blocked and result in a "Device or resource busy" error. This is because the device scan process opens the device with the EXCL flag in the kernel. Two reports were received: - btrfs/179 test case, where the fsck command failed with the -EBUSY error - LTP pwritev03 test case, where mkfs.vfs failed with the -EBUSY error, when mkfs.vfs tried to overwrite old btrfs filesystem on the device. In both cases, fsck and mkfs (respectively) were racing with a systemd-udevd device scan, and systemd-udevd won, resulting in the -EBUSY error for fsck and mkfs. Reproducing the problem has been difficult because there is a very small window during which these userspace threads can race to acquire the exclusive device open. Even on the system where the problem was observed, the problem occurrences were anywhere between 10 to 400 iterations and chances of reproducing decreases with debug printk()s. However, an exclusive device open is unnecessary for the scan process, as there are no write operations on the device during scan. Furthermore, during the mount process, the superblock is re-read in the below function call chain: btrfs_mount_root btrfs_open_devices open_fs_devices btrfs_open_one_device btrfs_get_bdev_and_sb So, to fix this issue, removes the FMODE_EXCL flag from the scan operation, and add a comment. The case where mkfs may still write to the device and a scan is running, the btrfs signature is not written at that time so scan will not recognize such device. Reported-by: Sherry Yang Reported-by: kernel test robot Link: https://lore.kernel.org/oe-lkp/202303170839.fdf23068-oliver.sang@intel.com CC: stable@vger.kernel.org # 5.4+ Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/volumes.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 6a2b6042845c..67b2aa552d22 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1379,8 +1379,17 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags, * So, we need to add a special mount option to scan for * later supers, using BTRFS_SUPER_MIRROR_MAX instead */ - flags |= FMODE_EXCL; + /* + * Avoid using flag |= FMODE_EXCL here, as the systemd-udev may + * initiate the device scan which may race with the user's mount + * or mkfs command, resulting in failure. + * Since the device scan is solely for reading purposes, there is + * no need for FMODE_EXCL. Additionally, the devices are read again + * during the mount process. It is ok to get some inconsistent + * values temporarily, as the device paths of the fsid are the only + * required information for assembling the volume. + */ bdev = blkdev_get_by_path(path, flags, holder); if (IS_ERR(bdev)) return ERR_CAST(bdev); From 4a8f1f5122667b1e54327ddbab7cfd096ac7dc00 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 30 Mar 2023 09:47:58 +0900 Subject: [PATCH 129/179] zonefs: Do not propagate iomap_dio_rw() ENOTBLK error to user space commit 77af13ba3c7f91d91c377c7e2d122849bbc17128 upstream. The call to invalidate_inode_pages2_range() in __iomap_dio_rw() may fail, in which case -ENOTBLK is returned and this error code is propagated back to user space trhough iomap_dio_rw() -> zonefs_file_dio_write() return chain. This error code is fairly obscure and may confuse the user. Avoid this and be consistent with the behavior of zonefs_file_dio_append() for similar invalidate_inode_pages2_range() errors by returning -EBUSY to user space when iomap_dio_rw() returns -ENOTBLK. Suggested-by: Christoph Hellwig Fixes: 8dcc1a9d90c1 ("fs: New zonefs file system") Cc: stable@vger.kernel.org Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Tested-by: Hans Holmberg Signed-off-by: Greg Kroah-Hartman --- fs/zonefs/file.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c index c71cc0fcb3ec..763f10a61999 100644 --- a/fs/zonefs/file.c +++ b/fs/zonefs/file.c @@ -567,11 +567,21 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from) append = sync; } - if (append) + if (append) { ret = zonefs_file_dio_append(iocb, from); - else + } else { + /* + * iomap_dio_rw() may return ENOTBLK if there was an issue with + * page invalidation. Overwrite that error code with EBUSY to + * be consistent with zonefs_file_dio_append() return value for + * similar issues. + */ ret = iomap_dio_rw(iocb, from, &zonefs_write_iomap_ops, &zonefs_write_dio_ops, 0, NULL, 0); + if (ret == -ENOTBLK) + ret = -EBUSY; + } + if (zonefs_zone_is_seq(z) && (ret > 0 || ret == -EIOCBQUEUED)) { if (ret > 0) From e5da11825ef7c3f4760b235e04e225e7c8746279 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 20 Mar 2023 20:01:25 -0600 Subject: [PATCH 130/179] block/io_uring: pass in issue_flags for uring_cmd task_work handling commit 9d2789ac9d60c049d26ef6d3005d9c94c5a559e9 upstream. io_uring_cmd_done() currently assumes that the uring_lock is held when invoked, and while it generally is, this is not guaranteed. Pass in the issue_flags associated with it, so that we have IO_URING_F_UNLOCKED available to be able to lock the CQ ring appropriately when completing events. Cc: stable@vger.kernel.org Fixes: ee692a21e9bf ("fs,io_uring: add infrastructure for uring-cmd") Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- drivers/block/ublk_drv.c | 31 ++++++++++++++++++------------- drivers/nvme/host/ioctl.c | 14 ++++++++------ include/linux/io_uring.h | 11 ++++++----- io_uring/uring_cmd.c | 10 ++++++---- 4 files changed, 38 insertions(+), 28 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 4aec9be0ab77..22a790d51284 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -656,7 +656,8 @@ static void __ublk_fail_req(struct ublk_queue *ubq, struct ublk_io *io, } } -static void ubq_complete_io_cmd(struct ublk_io *io, int res) +static void ubq_complete_io_cmd(struct ublk_io *io, int res, + unsigned issue_flags) { /* mark this cmd owned by ublksrv */ io->flags |= UBLK_IO_FLAG_OWNED_BY_SRV; @@ -668,7 +669,7 @@ static void ubq_complete_io_cmd(struct ublk_io *io, int res) io->flags &= ~UBLK_IO_FLAG_ACTIVE; /* tell ublksrv one io request is coming */ - io_uring_cmd_done(io->cmd, res, 0); + io_uring_cmd_done(io->cmd, res, 0, issue_flags); } #define UBLK_REQUEUE_DELAY_MS 3 @@ -685,7 +686,8 @@ static inline void __ublk_abort_rq(struct ublk_queue *ubq, mod_delayed_work(system_wq, &ubq->dev->monitor_work, 0); } -static inline void __ublk_rq_task_work(struct request *req) +static inline void __ublk_rq_task_work(struct request *req, + unsigned issue_flags) { struct ublk_queue *ubq = req->mq_hctx->driver_data; int tag = req->tag; @@ -723,7 +725,7 @@ static inline void __ublk_rq_task_work(struct request *req) pr_devel("%s: need get data. op %d, qid %d tag %d io_flags %x\n", __func__, io->cmd->cmd_op, ubq->q_id, req->tag, io->flags); - ubq_complete_io_cmd(io, UBLK_IO_RES_NEED_GET_DATA); + ubq_complete_io_cmd(io, UBLK_IO_RES_NEED_GET_DATA, issue_flags); return; } /* @@ -761,17 +763,18 @@ static inline void __ublk_rq_task_work(struct request *req) mapped_bytes >> 9; } - ubq_complete_io_cmd(io, UBLK_IO_RES_OK); + ubq_complete_io_cmd(io, UBLK_IO_RES_OK, issue_flags); } -static inline void ublk_forward_io_cmds(struct ublk_queue *ubq) +static inline void ublk_forward_io_cmds(struct ublk_queue *ubq, + unsigned issue_flags) { struct llist_node *io_cmds = llist_del_all(&ubq->io_cmds); struct ublk_rq_data *data, *tmp; io_cmds = llist_reverse_order(io_cmds); llist_for_each_entry_safe(data, tmp, io_cmds, node) - __ublk_rq_task_work(blk_mq_rq_from_pdu(data)); + __ublk_rq_task_work(blk_mq_rq_from_pdu(data), issue_flags); } static inline void ublk_abort_io_cmds(struct ublk_queue *ubq) @@ -783,12 +786,12 @@ static inline void ublk_abort_io_cmds(struct ublk_queue *ubq) __ublk_abort_rq(ubq, blk_mq_rq_from_pdu(data)); } -static void ublk_rq_task_work_cb(struct io_uring_cmd *cmd) +static void ublk_rq_task_work_cb(struct io_uring_cmd *cmd, unsigned issue_flags) { struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); struct ublk_queue *ubq = pdu->ubq; - ublk_forward_io_cmds(ubq); + ublk_forward_io_cmds(ubq, issue_flags); } static void ublk_rq_task_work_fn(struct callback_head *work) @@ -797,8 +800,9 @@ static void ublk_rq_task_work_fn(struct callback_head *work) struct ublk_rq_data, work); struct request *req = blk_mq_rq_from_pdu(data); struct ublk_queue *ubq = req->mq_hctx->driver_data; + unsigned issue_flags = IO_URING_F_UNLOCKED; - ublk_forward_io_cmds(ubq); + ublk_forward_io_cmds(ubq, issue_flags); } static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq) @@ -1052,7 +1056,8 @@ static void ublk_cancel_queue(struct ublk_queue *ubq) struct ublk_io *io = &ubq->ios[i]; if (io->flags & UBLK_IO_FLAG_ACTIVE) - io_uring_cmd_done(io->cmd, UBLK_IO_RES_ABORT, 0); + io_uring_cmd_done(io->cmd, UBLK_IO_RES_ABORT, 0, + IO_URING_F_UNLOCKED); } /* all io commands are canceled */ @@ -1295,7 +1300,7 @@ static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) return -EIOCBQUEUED; out: - io_uring_cmd_done(cmd, ret, 0); + io_uring_cmd_done(cmd, ret, 0, issue_flags); pr_devel("%s: complete: cmd op %d, tag %d ret %x io_flags %x\n", __func__, cmd_op, tag, ret, io->flags); return -EIOCBQUEUED; @@ -2053,7 +2058,7 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd, break; } out: - io_uring_cmd_done(cmd, ret, 0); + io_uring_cmd_done(cmd, ret, 0, issue_flags); pr_devel("%s: cmd done ret %d cmd_op %x, dev id %d qid %d\n", __func__, ret, cmd->cmd_op, header->dev_id, header->queue_id); return -EIOCBQUEUED; diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index 81f5550b670d..8224675f8de2 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -387,7 +387,8 @@ static inline struct nvme_uring_cmd_pdu *nvme_uring_cmd_pdu( return (struct nvme_uring_cmd_pdu *)&ioucmd->pdu; } -static void nvme_uring_task_meta_cb(struct io_uring_cmd *ioucmd) +static void nvme_uring_task_meta_cb(struct io_uring_cmd *ioucmd, + unsigned issue_flags) { struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd); struct request *req = pdu->req; @@ -408,17 +409,18 @@ static void nvme_uring_task_meta_cb(struct io_uring_cmd *ioucmd) blk_rq_unmap_user(req->bio); blk_mq_free_request(req); - io_uring_cmd_done(ioucmd, status, result); + io_uring_cmd_done(ioucmd, status, result, issue_flags); } -static void nvme_uring_task_cb(struct io_uring_cmd *ioucmd) +static void nvme_uring_task_cb(struct io_uring_cmd *ioucmd, + unsigned issue_flags) { struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd); if (pdu->bio) blk_rq_unmap_user(pdu->bio); - io_uring_cmd_done(ioucmd, pdu->nvme_status, pdu->u.result); + io_uring_cmd_done(ioucmd, pdu->nvme_status, pdu->u.result, issue_flags); } static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req, @@ -440,7 +442,7 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req, * Otherwise, move the completion to task work. */ if (cookie != NULL && blk_rq_is_poll(req)) - nvme_uring_task_cb(ioucmd); + nvme_uring_task_cb(ioucmd, IO_URING_F_UNLOCKED); else io_uring_cmd_complete_in_task(ioucmd, nvme_uring_task_cb); @@ -462,7 +464,7 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io_meta(struct request *req, * Otherwise, move the completion to task work. */ if (cookie != NULL && blk_rq_is_poll(req)) - nvme_uring_task_meta_cb(ioucmd); + nvme_uring_task_meta_cb(ioucmd, IO_URING_F_UNLOCKED); else io_uring_cmd_complete_in_task(ioucmd, nvme_uring_task_meta_cb); diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index 0ded9e271523..a1484cdb3158 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -26,7 +26,7 @@ struct io_uring_cmd { const void *cmd; union { /* callback to defer completions to task context */ - void (*task_work_cb)(struct io_uring_cmd *cmd); + void (*task_work_cb)(struct io_uring_cmd *cmd, unsigned); /* used for polled completion */ void *cookie; }; @@ -38,9 +38,10 @@ struct io_uring_cmd { #if defined(CONFIG_IO_URING) int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, struct iov_iter *iter, void *ioucmd); -void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, ssize_t res2); +void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, ssize_t res2, + unsigned issue_flags); void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, - void (*task_work_cb)(struct io_uring_cmd *)); + void (*task_work_cb)(struct io_uring_cmd *, unsigned)); struct sock *io_uring_get_socket(struct file *file); void __io_uring_cancel(bool cancel_all); void __io_uring_free(struct task_struct *tsk); @@ -71,11 +72,11 @@ static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, return -EOPNOTSUPP; } static inline void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, - ssize_t ret2) + ssize_t ret2, unsigned issue_flags) { } static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, - void (*task_work_cb)(struct io_uring_cmd *)) + void (*task_work_cb)(struct io_uring_cmd *, unsigned)) { } static inline struct sock *io_uring_get_socket(struct file *file) diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 18dfc5f6a8b7..92f310dfb9fd 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -15,12 +15,13 @@ static void io_uring_cmd_work(struct io_kiocb *req, bool *locked) { struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); + unsigned issue_flags = *locked ? 0 : IO_URING_F_UNLOCKED; - ioucmd->task_work_cb(ioucmd); + ioucmd->task_work_cb(ioucmd, issue_flags); } void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, - void (*task_work_cb)(struct io_uring_cmd *)) + void (*task_work_cb)(struct io_uring_cmd *, unsigned)) { struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); @@ -42,7 +43,8 @@ static inline void io_req_set_cqe32_extra(struct io_kiocb *req, * Called by consumers of io_uring_cmd, if they originally returned * -EIOCBQUEUED upon receiving the command. */ -void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2) +void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2, + unsigned issue_flags) { struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); @@ -56,7 +58,7 @@ void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2) /* order with io_iopoll_req_issued() checking ->iopoll_complete */ smp_store_release(&req->iopoll_completed, 1); else - __io_req_complete(req, 0); + __io_req_complete(req, issue_flags); } EXPORT_SYMBOL_GPL(io_uring_cmd_done); From 3eb2138d4693d81aa6e5514f439be255117cae63 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 27 Mar 2023 19:56:18 -0600 Subject: [PATCH 131/179] io_uring/poll: clear single/double poll flags on poll arming commit 005308f7bdacf5685ed1a431244a183dbbb9e0e8 upstream. Unless we have at least one entry queued, then don't call into io_poll_remove_entries(). Normally this isn't possible, but if we retry poll then we can have ->nr_entries cleared again as we're setting it up. If this happens for a poll retry, then we'll still have at least REQ_F_SINGLE_POLL set. io_poll_remove_entries() then thinks it has entries to remove. Clear REQ_F_SINGLE_POLL and REQ_F_DOUBLE_POLL unconditionally when arming a poll request. Fixes: c16bda37594f ("io_uring/poll: allow some retries for poll triggering spuriously") Cc: stable@vger.kernel.org Reported-by: Pengfei Xu Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- io_uring/poll.c | 1 + 1 file changed, 1 insertion(+) diff --git a/io_uring/poll.c b/io_uring/poll.c index 56dbd1863c78..4788073ec45d 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -742,6 +742,7 @@ int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) apoll = io_req_alloc_apoll(req, issue_flags); if (!apoll) return IO_APOLL_ABORTED; + req->flags &= ~(REQ_F_SINGLE_POLL | REQ_F_DOUBLE_POLL); req->flags |= REQ_F_POLLED; ipt.pt._qproc = io_async_queue_proc; From ef329fa764c3c27f43db0f7c57adbe84e9a07f00 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 29 Mar 2023 15:03:43 +0100 Subject: [PATCH 132/179] io_uring/rsrc: fix rogue rsrc node grabbing commit 4ff0b50de8cabba055efe50bbcb7506c41a69835 upstream. We should not be looking at ctx->rsrc_node and anyhow modifying the node without holding uring_lock, grabbing references in such a way is not safe either. Cc: stable@vger.kernel.org Fixes: 5106dd6e74ab6 ("io_uring: propagate issue_flags state down to file assignment") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/1202ede2d7bb90136e3482b2b84aad9ed483e5d6.1680098433.git.asml.silence@gmail.com Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- io_uring/rsrc.h | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 81445a477622..d60c758326b4 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -143,15 +143,13 @@ static inline void io_req_set_rsrc_node(struct io_kiocb *req, unsigned int issue_flags) { if (!req->rsrc_node) { + io_ring_submit_lock(ctx, issue_flags); + + lockdep_assert_held(&ctx->uring_lock); + req->rsrc_node = ctx->rsrc_node; - - if (!(issue_flags & IO_URING_F_UNLOCKED)) { - lockdep_assert_held(&ctx->uring_lock); - - io_charge_rsrc_node(ctx); - } else { - percpu_ref_get(&req->rsrc_node->refs); - } + io_charge_rsrc_node(ctx); + io_ring_submit_unlock(ctx, issue_flags); } } From d67b3cdacac911958c6353f043fe98d42c84f5ad Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 30 Mar 2023 06:52:38 -0600 Subject: [PATCH 133/179] io_uring: fix poll/netmsg alloc caches commit fd30d1cdcc4ff405fc54765edf2e11b03f2ed4f3 upstream. We increase cache->nr_cached when we free into the cache but don't decrease when we take from it, so in some time we'll get an empty cache with cache->nr_cached larger than IO_ALLOC_CACHE_MAX, that fails io_alloc_cache_put() and effectively disables caching. Fixes: 9b797a37c4bd8 ("io_uring: add abstraction around apoll cache") Cc: stable@vger.kernel.org Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- io_uring/alloc_cache.h | 1 + 1 file changed, 1 insertion(+) diff --git a/io_uring/alloc_cache.h b/io_uring/alloc_cache.h index 729793ae9712..c2cde88aeed5 100644 --- a/io_uring/alloc_cache.h +++ b/io_uring/alloc_cache.h @@ -27,6 +27,7 @@ static inline struct io_cache_entry *io_alloc_cache_get(struct io_alloc_cache *c struct hlist_node *node = cache->list.first; hlist_del(node); + cache->nr_cached--; return container_of(node, struct io_cache_entry, node); } From 4d35d375efedce73bdc3c66472bad740c5ced9de Mon Sep 17 00:00:00 2001 From: Ronak Doshi Date: Thu, 23 Mar 2023 13:07:21 -0700 Subject: [PATCH 134/179] vmxnet3: use gro callback when UPT is enabled commit 3bced313b9a5a237c347e0f079c8c2fe4b3935aa upstream. Currently, vmxnet3 uses GRO callback only if LRO is disabled. However, on smartNic based setups where UPT is supported, LRO can be enabled from guest VM but UPT devicve does not support LRO as of now. In such cases, there can be performance degradation as GRO is not being done. This patch fixes this issue by calling GRO API when UPT is enabled. We use updateRxProd to determine if UPT mode is active or not. To clarify few things discussed over the thread: The patch is not neglecting any feature bits nor disabling GRO. It uses GRO callback when UPT is active as LRO is not available in UPT. GRO callback cannot be used as default for all cases as it degrades performance for non-UPT cases or for cases when LRO is already done in ESXi. Cc: stable@vger.kernel.org Fixes: 6f91f4ba046e ("vmxnet3: add support for capability registers") Signed-off-by: Ronak Doshi Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20230323200721.27622-1-doshir@vmware.com Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman --- drivers/net/vmxnet3/vmxnet3_drv.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/vmxnet3/vmxnet3_drv.c b/drivers/net/vmxnet3/vmxnet3_drv.c index 682987040ea8..da488cbb0542 100644 --- a/drivers/net/vmxnet3/vmxnet3_drv.c +++ b/drivers/net/vmxnet3/vmxnet3_drv.c @@ -1688,7 +1688,9 @@ not_lro: if (unlikely(rcd->ts)) __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), rcd->tci); - if (adapter->netdev->features & NETIF_F_LRO) + /* Use GRO callback if UPT is enabled */ + if ((adapter->netdev->features & NETIF_F_LRO) && + !rq->shared->updateRxProd) netif_receive_skb(skb); else napi_gro_receive(&rq->napi, skb); From d7c67be755ccc488c7c2ae9b98bbd8b6409be4f6 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 29 Mar 2023 13:16:01 +0900 Subject: [PATCH 135/179] zonefs: Always invalidate last cached page on append write commit c1976bd8f23016d8706973908f2bb0ac0d852a8f upstream. When a direct append write is executed, the append offset may correspond to the last page of a sequential file inode which might have been cached already by buffered reads, page faults with mmap-read or non-direct readahead. To ensure that the on-disk and cached data is consistant for such last cached page, make sure to always invalidate it in zonefs_file_dio_append(). If the invalidation fails, return -EBUSY to userspace to differentiate from IO errors. This invalidation will always be a no-op when the FS block size (device zone write granularity) is equal to the page size (e.g. 4K). Reported-by: Hans Holmberg Fixes: 02ef12a663c7 ("zonefs: use REQ_OP_ZONE_APPEND for sync DIO") Cc: stable@vger.kernel.org Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Tested-by: Hans Holmberg Signed-off-by: Greg Kroah-Hartman --- fs/zonefs/file.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c index 763f10a61999..63cd50840419 100644 --- a/fs/zonefs/file.c +++ b/fs/zonefs/file.c @@ -382,6 +382,7 @@ static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from) struct zonefs_zone *z = zonefs_inode_zone(inode); struct block_device *bdev = inode->i_sb->s_bdev; unsigned int max = bdev_max_zone_append_sectors(bdev); + pgoff_t start, end; struct bio *bio; ssize_t size; int nr_pages; @@ -390,6 +391,19 @@ static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from) max = ALIGN_DOWN(max << SECTOR_SHIFT, inode->i_sb->s_blocksize); iov_iter_truncate(from, max); + /* + * If the inode block size (zone write granularity) is smaller than the + * page size, we may be appending data belonging to the last page of the + * inode straddling inode->i_size, with that page already cached due to + * a buffered read or readahead. So make sure to invalidate that page. + * This will always be a no-op for the case where the block size is + * equal to the page size. + */ + start = iocb->ki_pos >> PAGE_SHIFT; + end = (iocb->ki_pos + iov_iter_count(from) - 1) >> PAGE_SHIFT; + if (invalidate_inode_pages2_range(inode->i_mapping, start, end)) + return -EBUSY; + nr_pages = iov_iter_npages(from, BIO_MAX_VECS); if (!nr_pages) return 0; From 8b7c731e5444efbab518c3acf9a4d1b62d18920e Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 30 Mar 2023 15:09:29 -0400 Subject: [PATCH 136/179] dm: fix __send_duplicate_bios() to always allow for splitting IO commit 666eed46769d929c3e13636134ecfc67d75ef548 upstream. Commit 7dd76d1feec70 ("dm: improve bio splitting and associated IO accounting") only called setup_split_accounting() from __send_duplicate_bios() if a single bio were being issued. But the case where duplicate bios are issued must call it too. Otherwise the bio won't be split and resubmitted (via recursion through block core back to DM) to submit the later portions of a bio (which may map to an entirely different target). For example, when discarding an entire DM striped device with the following DM table: vg-lvol0: 0 159744 striped 2 128 7:0 2048 7:1 2048 vg-lvol0: 159744 45056 striped 2 128 7:2 2048 7:3 2048 Before (broken, discards the first striped target's devices twice): device-mapper: striped: target_stripe=0, bdev=7:0, start=2048 len=79872 device-mapper: striped: target_stripe=1, bdev=7:1, start=2048 len=79872 device-mapper: striped: target_stripe=0, bdev=7:0, start=2049 len=22528 device-mapper: striped: target_stripe=1, bdev=7:1, start=2048 len=22528 After (works as expected): device-mapper: striped: target_stripe=0, bdev=7:0, start=2048 len=79872 device-mapper: striped: target_stripe=1, bdev=7:1, start=2048 len=79872 device-mapper: striped: target_stripe=0, bdev=7:2, start=2048 len=22528 device-mapper: striped: target_stripe=1, bdev=7:3, start=2048 len=22528 Fixes: 7dd76d1feec70 ("dm: improve bio splitting and associated IO accounting") Cc: stable@vger.kernel.org Reported-by: Orange Kao Signed-off-by: Mike Snitzer Signed-off-by: Greg Kroah-Hartman --- drivers/md/dm.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 89bf28ed874c..38d8aa21ef7a 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1522,6 +1522,8 @@ static int __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti, ret = 1; break; default: + if (len) + setup_split_accounting(ci, *len); /* dm_accept_partial_bio() is not supported with shared tio->len_ptr */ alloc_multiple_bios(&blist, ci, ti, num_bios); while ((clone = bio_list_pop(&blist))) { From ace6aa2ab5ba5869563ca689bbd912100514ae7b Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Fri, 24 Mar 2023 14:01:41 +0100 Subject: [PATCH 137/179] can: j1939: prevent deadlock by moving j1939_sk_errqueue() commit d1366b283d94ac4537a4b3a1e8668da4df7ce7e9 upstream. This commit addresses a deadlock situation that can occur in certain scenarios, such as when running data TP/ETP transfer and subscribing to the error queue while receiving a net down event. The deadlock involves locks in the following order: 3 j1939_session_list_lock -> active_session_list_lock j1939_session_activate ... j1939_sk_queue_activate_next -> sk_session_queue_lock ... j1939_xtp_rx_eoma_one 2 j1939_sk_queue_drop_all -> sk_session_queue_lock ... j1939_sk_netdev_event_netdown -> j1939_socks_lock j1939_netdev_notify 1 j1939_sk_errqueue -> j1939_socks_lock __j1939_session_cancel -> active_session_list_lock j1939_tp_rxtimer CPU0 CPU1 ---- ---- lock(&priv->active_session_list_lock); lock(&jsk->sk_session_queue_lock); lock(&priv->active_session_list_lock); lock(&priv->j1939_socks_lock); The solution implemented in this commit is to move the j1939_sk_errqueue() call out of the active_session_list_lock context, thus preventing the deadlock situation. Reported-by: syzbot+ee1cd780f69483a8616b@syzkaller.appspotmail.com Fixes: 5b9272e93f2e ("can: j1939: extend UAPI to notify about RX status") Co-developed-by: Hillf Danton Signed-off-by: Hillf Danton Signed-off-by: Oleksij Rempel Link: https://lore.kernel.org/all/20230324130141.2132787-1-o.rempel@pengutronix.de Cc: stable@vger.kernel.org Signed-off-by: Marc Kleine-Budde Signed-off-by: Greg Kroah-Hartman --- net/can/j1939/transport.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c index 4177e9617070..848b60c9ef79 100644 --- a/net/can/j1939/transport.c +++ b/net/can/j1939/transport.c @@ -1124,8 +1124,6 @@ static void __j1939_session_cancel(struct j1939_session *session, if (session->sk) j1939_sk_send_loop_abort(session->sk, session->err); - else - j1939_sk_errqueue(session, J1939_ERRQUEUE_RX_ABORT); } static void j1939_session_cancel(struct j1939_session *session, @@ -1140,6 +1138,9 @@ static void j1939_session_cancel(struct j1939_session *session, } j1939_session_list_unlock(session->priv); + + if (!session->sk) + j1939_sk_errqueue(session, J1939_ERRQUEUE_RX_ABORT); } static enum hrtimer_restart j1939_tp_txtimer(struct hrtimer *hrtimer) @@ -1253,6 +1254,9 @@ static enum hrtimer_restart j1939_tp_rxtimer(struct hrtimer *hrtimer) __j1939_session_cancel(session, J1939_XTP_ABORT_TIMEOUT); } j1939_session_list_unlock(session->priv); + + if (!session->sk) + j1939_sk_errqueue(session, J1939_ERRQUEUE_RX_ABORT); } j1939_session_put(session); From cdfac0a5064187219c88ddcc9283fbeb80abf522 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 27 Mar 2023 10:36:45 +0200 Subject: [PATCH 138/179] xen/netback: don't do grant copy across page boundary commit 05310f31ca74673a96567fb14637b7d5d6c82ea5 upstream. Fix xenvif_get_requests() not to do grant copy operations across local page boundaries. This requires to double the maximum number of copy operations per queue, as each copy could now be split into 2. Make sure that struct xenvif_tx_cb doesn't grow too large. Cc: stable@vger.kernel.org Fixes: ad7f402ae4f4 ("xen/netback: Ensure protocol headers don't fall in the non-linear area") Signed-off-by: Juergen Gross Reviewed-by: Paul Durrant Signed-off-by: Paolo Abeni Signed-off-by: Greg Kroah-Hartman --- drivers/net/xen-netback/common.h | 2 +- drivers/net/xen-netback/netback.c | 25 +++++++++++++++++++++++-- 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h index 3dbfc8a6924e..1fcbd83f7ff2 100644 --- a/drivers/net/xen-netback/common.h +++ b/drivers/net/xen-netback/common.h @@ -166,7 +166,7 @@ struct xenvif_queue { /* Per-queue data for xenvif */ struct pending_tx_info pending_tx_info[MAX_PENDING_REQS]; grant_handle_t grant_tx_handle[MAX_PENDING_REQS]; - struct gnttab_copy tx_copy_ops[MAX_PENDING_REQS]; + struct gnttab_copy tx_copy_ops[2 * MAX_PENDING_REQS]; struct gnttab_map_grant_ref tx_map_ops[MAX_PENDING_REQS]; struct gnttab_unmap_grant_ref tx_unmap_ops[MAX_PENDING_REQS]; /* passed to gnttab_[un]map_refs with pages under (un)mapping */ diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c index bf627af723bf..5c266062c08f 100644 --- a/drivers/net/xen-netback/netback.c +++ b/drivers/net/xen-netback/netback.c @@ -334,6 +334,7 @@ static int xenvif_count_requests(struct xenvif_queue *queue, struct xenvif_tx_cb { u16 copy_pending_idx[XEN_NETBK_LEGACY_SLOTS_MAX + 1]; u8 copy_count; + u32 split_mask; }; #define XENVIF_TX_CB(skb) ((struct xenvif_tx_cb *)(skb)->cb) @@ -361,6 +362,8 @@ static inline struct sk_buff *xenvif_alloc_skb(unsigned int size) struct sk_buff *skb = alloc_skb(size + NET_SKB_PAD + NET_IP_ALIGN, GFP_ATOMIC | __GFP_NOWARN); + + BUILD_BUG_ON(sizeof(*XENVIF_TX_CB(skb)) > sizeof(skb->cb)); if (unlikely(skb == NULL)) return NULL; @@ -396,11 +399,13 @@ static void xenvif_get_requests(struct xenvif_queue *queue, nr_slots = shinfo->nr_frags + 1; copy_count(skb) = 0; + XENVIF_TX_CB(skb)->split_mask = 0; /* Create copy ops for exactly data_len bytes into the skb head. */ __skb_put(skb, data_len); while (data_len > 0) { int amount = data_len > txp->size ? txp->size : data_len; + bool split = false; cop->source.u.ref = txp->gref; cop->source.domid = queue->vif->domid; @@ -413,6 +418,13 @@ static void xenvif_get_requests(struct xenvif_queue *queue, cop->dest.u.gmfn = virt_to_gfn(skb->data + skb_headlen(skb) - data_len); + /* Don't cross local page boundary! */ + if (cop->dest.offset + amount > XEN_PAGE_SIZE) { + amount = XEN_PAGE_SIZE - cop->dest.offset; + XENVIF_TX_CB(skb)->split_mask |= 1U << copy_count(skb); + split = true; + } + cop->len = amount; cop->flags = GNTCOPY_source_gref; @@ -420,7 +432,8 @@ static void xenvif_get_requests(struct xenvif_queue *queue, pending_idx = queue->pending_ring[index]; callback_param(queue, pending_idx).ctx = NULL; copy_pending_idx(skb, copy_count(skb)) = pending_idx; - copy_count(skb)++; + if (!split) + copy_count(skb)++; cop++; data_len -= amount; @@ -441,7 +454,8 @@ static void xenvif_get_requests(struct xenvif_queue *queue, nr_slots--; } else { /* The copy op partially covered the tx_request. - * The remainder will be mapped. + * The remainder will be mapped or copied in the next + * iteration. */ txp->offset += amount; txp->size -= amount; @@ -539,6 +553,13 @@ static int xenvif_tx_check_gop(struct xenvif_queue *queue, pending_idx = copy_pending_idx(skb, i); newerr = (*gopp_copy)->status; + + /* Split copies need to be handled together. */ + if (XENVIF_TX_CB(skb)->split_mask & (1U << i)) { + (*gopp_copy)++; + if (!newerr) + newerr = (*gopp_copy)->status; + } if (likely(!newerr)) { /* The first frag might still have this slot mapped */ if (i < copy_count(skb) - 1 || !sharedslot) From 2269be4951420af24896dd43987333ab35a72fc8 Mon Sep 17 00:00:00 2001 From: Josua Mayer Date: Thu, 23 Mar 2023 12:25:36 +0200 Subject: [PATCH 139/179] net: phy: dp83869: fix default value for tx-/rx-internal-delay commit 82e2c39f9ef78896e9b634dfd82dc042e6956bb7 upstream. dp83869 internally uses a look-up table for mapping supported delays in nanoseconds to register values. When specific delays are defined in device-tree, phy_get_internal_delay does the lookup automatically returning an index. The default case wrongly assigns the nanoseconds value from the lookup table, resulting in numeric value 2000 applied to delay configuration register, rather than the expected index values 0-7 (7 for 2000). Ultimately this issue broke RX for 1Gbps links. Fix default delay configuration by assigning the intended index value directly. Cc: stable@vger.kernel.org Fixes: 736b25afe284 ("net: dp83869: Add RGMII internal delay configuration") Co-developed-by: Yazan Shhady Signed-off-by: Yazan Shhady Signed-off-by: Josua Mayer Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20230323102536.31988-1-josua@solid-run.com Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman --- drivers/net/phy/dp83869.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/net/phy/dp83869.c b/drivers/net/phy/dp83869.c index b4ff9c5073a3..9ab5eff502b7 100644 --- a/drivers/net/phy/dp83869.c +++ b/drivers/net/phy/dp83869.c @@ -588,15 +588,13 @@ static int dp83869_of_init(struct phy_device *phydev) &dp83869_internal_delay[0], delay_size, true); if (dp83869->rx_int_delay < 0) - dp83869->rx_int_delay = - dp83869_internal_delay[DP83869_CLK_DELAY_DEF]; + dp83869->rx_int_delay = DP83869_CLK_DELAY_DEF; dp83869->tx_int_delay = phy_get_internal_delay(phydev, dev, &dp83869_internal_delay[0], delay_size, false); if (dp83869->tx_int_delay < 0) - dp83869->tx_int_delay = - dp83869_internal_delay[DP83869_CLK_DELAY_DEF]; + dp83869->tx_int_delay = DP83869_CLK_DELAY_DEF; return ret; } From 0e7ac17634d2254e0fc65307a5f430e4f1fdb7e9 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Wed, 22 Mar 2023 19:11:45 +0100 Subject: [PATCH 140/179] modpost: Fix processing of CRCs on 32-bit build machines commit fb27e70f6e408dee5d22b083e7a38a59e6118253 upstream. modpost now reads CRCs from .*.cmd files, parsing them using strtol(). This is inconsistent with its parsing of Module.symvers and with their definition as *unsigned* 32-bit values. strtol() clamps values to [LONG_MIN, LONG_MAX], and when building on a 32-bit system this changes all CRCs >= 0x80000000 to be 0x7fffffff. Change extract_crcs_for_object() to use strtoul() instead. Cc: stable@vger.kernel.org Fixes: f292d875d0dc ("modpost: extract symbol versions from *.cmd files") Signed-off-by: Ben Hutchings Signed-off-by: Masahiro Yamada Signed-off-by: Greg Kroah-Hartman --- scripts/mod/modpost.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index 2c80da0220c3..1dfa80c6b471 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -1722,7 +1722,7 @@ static void extract_crcs_for_object(const char *object, struct module *mod) if (!isdigit(*p)) continue; /* skip this line */ - crc = strtol(p, &p, 0); + crc = strtoul(p, &p, 0); if (*p != '\n') continue; /* skip this line */ From d9c63daa576b24486c66024414f63d224897c796 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kornel=20Dul=C4=99ba?= Date: Mon, 20 Mar 2023 09:32:59 +0000 Subject: [PATCH 141/179] pinctrl: amd: Disable and mask interrupts on resume MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit b26cd9325be4c1fcd331b77f10acb627c560d4d7 upstream. This fixes a similar problem to the one observed in: commit 4e5a04be88fe ("pinctrl: amd: disable and mask interrupts on probe"). On some systems, during suspend/resume cycle firmware leaves an interrupt enabled on a pin that is not used by the kernel. This confuses the AMD pinctrl driver and causes spurious interrupts. The driver already has logic to detect if a pin is used by the kernel. Leverage it to re-initialize interrupt fields of a pin only if it's not used by us. Cc: stable@vger.kernel.org Fixes: dbad75dd1f25 ("pinctrl: add AMD GPIO driver support.") Signed-off-by: Kornel Dulęba Link: https://lore.kernel.org/r/20230320093259.845178-1-korneld@chromium.org Signed-off-by: Linus Walleij Signed-off-by: Greg Kroah-Hartman --- drivers/pinctrl/pinctrl-amd.c | 36 +++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/drivers/pinctrl/pinctrl-amd.c b/drivers/pinctrl/pinctrl-amd.c index 32c3edaf9038..5e7b82a2b13d 100644 --- a/drivers/pinctrl/pinctrl-amd.c +++ b/drivers/pinctrl/pinctrl-amd.c @@ -865,32 +865,34 @@ static const struct pinconf_ops amd_pinconf_ops = { .pin_config_group_set = amd_pinconf_group_set, }; -static void amd_gpio_irq_init(struct amd_gpio *gpio_dev) +static void amd_gpio_irq_init_pin(struct amd_gpio *gpio_dev, int pin) { - struct pinctrl_desc *desc = gpio_dev->pctrl->desc; + const struct pin_desc *pd; unsigned long flags; u32 pin_reg, mask; - int i; mask = BIT(WAKE_CNTRL_OFF_S0I3) | BIT(WAKE_CNTRL_OFF_S3) | BIT(INTERRUPT_MASK_OFF) | BIT(INTERRUPT_ENABLE_OFF) | BIT(WAKE_CNTRL_OFF_S4); - for (i = 0; i < desc->npins; i++) { - int pin = desc->pins[i].number; - const struct pin_desc *pd = pin_desc_get(gpio_dev->pctrl, pin); + pd = pin_desc_get(gpio_dev->pctrl, pin); + if (!pd) + return; - if (!pd) - continue; + raw_spin_lock_irqsave(&gpio_dev->lock, flags); + pin_reg = readl(gpio_dev->base + pin * 4); + pin_reg &= ~mask; + writel(pin_reg, gpio_dev->base + pin * 4); + raw_spin_unlock_irqrestore(&gpio_dev->lock, flags); +} - raw_spin_lock_irqsave(&gpio_dev->lock, flags); +static void amd_gpio_irq_init(struct amd_gpio *gpio_dev) +{ + struct pinctrl_desc *desc = gpio_dev->pctrl->desc; + int i; - pin_reg = readl(gpio_dev->base + i * 4); - pin_reg &= ~mask; - writel(pin_reg, gpio_dev->base + i * 4); - - raw_spin_unlock_irqrestore(&gpio_dev->lock, flags); - } + for (i = 0; i < desc->npins; i++) + amd_gpio_irq_init_pin(gpio_dev, i); } #ifdef CONFIG_PM_SLEEP @@ -943,8 +945,10 @@ static int amd_gpio_resume(struct device *dev) for (i = 0; i < desc->npins; i++) { int pin = desc->pins[i].number; - if (!amd_gpio_should_save(gpio_dev, pin)) + if (!amd_gpio_should_save(gpio_dev, pin)) { + amd_gpio_irq_init_pin(gpio_dev, pin); continue; + } raw_spin_lock_irqsave(&gpio_dev->lock, flags); gpio_dev->saved_regs[i] |= readl(gpio_dev->base + pin * 4) & PIN_IRQ_PENDING; From 61c1f420bb01f33707a15894b26c0ae9906e913e Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Fri, 24 Feb 2023 14:08:28 +0100 Subject: [PATCH 142/179] pinctrl: at91-pio4: fix domain name assignment commit 7bb97e360acdd38b68ad0a1defb89c6e89c85596 upstream. Since commit d59f6617eef0 ("genirq: Allow fwnode to carry name information only") an IRQ domain is always given a name during allocation (e.g. used for the debugfs entry). Drop the no longer valid name assignment, which would lead to an attempt to free a string constant when removing the domain on late probe failures (e.g. probe deferral). Fixes: d59f6617eef0 ("genirq: Allow fwnode to carry name information only") Cc: stable@vger.kernel.org # 4.13 Signed-off-by: Johan Hovold Reviewed-by: Claudiu Beznea Tested-by: Claudiu Beznea # on SAMA7G5 Link: https://lore.kernel.org/r/20230224130828.27985-1-johan+linaro@kernel.org Signed-off-by: Linus Walleij Signed-off-by: Greg Kroah-Hartman --- drivers/pinctrl/pinctrl-at91-pio4.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/pinctrl/pinctrl-at91-pio4.c b/drivers/pinctrl/pinctrl-at91-pio4.c index 7f193f2b1566..0b7cc6f063e0 100644 --- a/drivers/pinctrl/pinctrl-at91-pio4.c +++ b/drivers/pinctrl/pinctrl-at91-pio4.c @@ -1176,7 +1176,6 @@ static int atmel_pinctrl_probe(struct platform_device *pdev) dev_err(dev, "can't add the irq domain\n"); return -ENODEV; } - atmel_pioctrl->irq_domain->name = "atmel gpio"; for (i = 0; i < atmel_pioctrl->npins; i++) { int irq = irq_create_mapping(atmel_pioctrl->irq_domain, i); From 44917e8c38d89abd4a7e9a1694e868b83713c743 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Thu, 30 Mar 2023 21:46:44 +0200 Subject: [PATCH 143/179] platform/x86: ideapad-laptop: Stop sending KEY_TOUCHPAD_TOGGLE commit e3271a5917d1501089b1a224d702aa053e2877f4 upstream. Commit 5829f8a897e4 ("platform/x86: ideapad-laptop: Send KEY_TOUCHPAD_TOGGLE on some models") made ideapad-laptop send KEY_TOUCHPAD_TOGGLE when we receive an ACPI notify with VPC event bit 5 set and the touchpad-state has not been changed by the EC itself already. This was done under the assumption that this would be good to do to make the touchpad-toggle hotkey work on newer models where the EC does not toggle the touchpad on/off itself (because it is not routed through the PS/2 controller, but uses I2C). But it turns out that at least some models, e.g. the Yoga 7-15ITL5 the EC triggers an ACPI notify with VPC event bit 5 set on resume, which would now cause a spurious KEY_TOUCHPAD_TOGGLE on resume to which the desktop environment responds by disabling the touchpad in software, breaking the touchpad (until manually re-enabled) on resume. It was never confirmed that sending KEY_TOUCHPAD_TOGGLE actually improves things on new models and at least some new models like the Yoga 7-15ITL5 don't have a touchpad on/off toggle hotkey at all, while still sending ACPI notify events with VPC event bit 5 set. So it seems best to revert the change to send KEY_TOUCHPAD_TOGGLE when receiving an ACPI notify events with VPC event bit 5 and the touchpad state as reported by the EC has not changed. Note this is not a full revert the code to cache the last EC touchpad state is kept to avoid sending spurious KEY_TOUCHPAD_ON / _OFF events on resume. Fixes: 5829f8a897e4 ("platform/x86: ideapad-laptop: Send KEY_TOUCHPAD_TOGGLE on some models") Link: https://bugzilla.kernel.org/show_bug.cgi?id=217234 Cc: stable@vger.kernel.org Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20230330194644.64628-1-hdegoede@redhat.com Signed-off-by: Greg Kroah-Hartman --- drivers/platform/x86/ideapad-laptop.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/drivers/platform/x86/ideapad-laptop.c b/drivers/platform/x86/ideapad-laptop.c index 4e28c55f0ea5..bd38c7dcae34 100644 --- a/drivers/platform/x86/ideapad-laptop.c +++ b/drivers/platform/x86/ideapad-laptop.c @@ -1164,7 +1164,6 @@ static const struct key_entry ideapad_keymap[] = { { KE_KEY, 65, { KEY_PROG4 } }, { KE_KEY, 66, { KEY_TOUCHPAD_OFF } }, { KE_KEY, 67, { KEY_TOUCHPAD_ON } }, - { KE_KEY, 68, { KEY_TOUCHPAD_TOGGLE } }, { KE_KEY, 128, { KEY_ESC } }, /* @@ -1520,18 +1519,16 @@ static void ideapad_sync_touchpad_state(struct ideapad_private *priv, bool send_ if (priv->features.ctrl_ps2_aux_port) i8042_command(¶m, value ? I8042_CMD_AUX_ENABLE : I8042_CMD_AUX_DISABLE); - if (send_events) { - /* - * On older models the EC controls the touchpad and toggles it - * on/off itself, in this case we report KEY_TOUCHPAD_ON/_OFF. - * If the EC did not toggle, report KEY_TOUCHPAD_TOGGLE. - */ - if (value != priv->r_touchpad_val) { - ideapad_input_report(priv, value ? 67 : 66); - sysfs_notify(&priv->platform_device->dev.kobj, NULL, "touchpad"); - } else { - ideapad_input_report(priv, 68); - } + /* + * On older models the EC controls the touchpad and toggles it on/off + * itself, in this case we report KEY_TOUCHPAD_ON/_OFF. Some models do + * an acpi-notify with VPC bit 5 set on resume, so this function get + * called with send_events=true on every resume. Therefor if the EC did + * not toggle, do nothing to avoid sending spurious KEY_TOUCHPAD_TOGGLE. + */ + if (send_events && value != priv->r_touchpad_val) { + ideapad_input_report(priv, value ? 67 : 66); + sysfs_notify(&priv->platform_device->dev.kobj, NULL, "touchpad"); } priv->r_touchpad_val = value; From 064a1c7b0f8403260d77627e62424a72ca26cee2 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 26 Mar 2023 16:15:57 -0600 Subject: [PATCH 144/179] powerpc: Don't try to copy PPR for task with NULL pt_regs commit fd7276189450110ed835eb0a334e62d2f1c4e3be upstream. powerpc sets up PF_KTHREAD and PF_IO_WORKER with a NULL pt_regs, which from my (arguably very short) checking is not commonly done for other archs. This is fine, except when PF_IO_WORKER's have been created and the task does something that causes a coredump to be generated. Then we get this crash: Kernel attempted to read user page (160) - exploit attempt? (uid: 1000) BUG: Kernel NULL pointer dereference on read at 0x00000160 Faulting instruction address: 0xc0000000000c3a60 Oops: Kernel access of bad area, sig: 11 [#1] LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=32 NUMA pSeries Modules linked in: bochs drm_vram_helper drm_kms_helper xts binfmt_misc ecb ctr syscopyarea sysfillrect cbc sysimgblt drm_ttm_helper aes_generic ttm sg libaes evdev joydev virtio_balloon vmx_crypto gf128mul drm dm_mod fuse loop configfs drm_panel_orientation_quirks ip_tables x_tables autofs4 hid_generic usbhid hid xhci_pci xhci_hcd usbcore usb_common sd_mod CPU: 1 PID: 1982 Comm: ppc-crash Not tainted 6.3.0-rc2+ #88 Hardware name: IBM pSeries (emulated by qemu) POWER9 (raw) 0x4e1202 0xf000005 of:SLOF,HEAD hv:linux,kvm pSeries NIP: c0000000000c3a60 LR: c000000000039944 CTR: c0000000000398e0 REGS: c0000000041833b0 TRAP: 0300 Not tainted (6.3.0-rc2+) MSR: 800000000280b033 CR: 88082828 XER: 200400f8 ... NIP memcpy_power7+0x200/0x7d0 LR ppr_get+0x64/0xb0 Call Trace: ppr_get+0x40/0xb0 (unreliable) __regset_get+0x180/0x1f0 regset_get_alloc+0x64/0x90 elf_core_dump+0xb98/0x1b60 do_coredump+0x1c34/0x24a0 get_signal+0x71c/0x1410 do_notify_resume+0x140/0x6f0 interrupt_exit_user_prepare_main+0x29c/0x320 interrupt_exit_user_prepare+0x6c/0xa0 interrupt_return_srr_user+0x8/0x138 Because ppr_get() is trying to copy from a PF_IO_WORKER with a NULL pt_regs. Check for a valid pt_regs in both ppc_get/ppr_set, and return an error if not set. The actual error value doesn't seem to be important here, so just pick -EINVAL. Fixes: fa439810cc1b ("powerpc/ptrace: Enable support for NT_PPPC_TAR, NT_PPC_PPR, NT_PPC_DSCR") Cc: stable@vger.kernel.org # v4.8+ Signed-off-by: Jens Axboe [mpe: Trim oops in change log, add Fixes & Cc stable] Signed-off-by: Michael Ellerman Link: https://msgid.link/d9f63344-fe7c-56ae-b420-4a1a04a2ae4c@kernel.dk Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/kernel/ptrace/ptrace-view.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/powerpc/kernel/ptrace/ptrace-view.c b/arch/powerpc/kernel/ptrace/ptrace-view.c index 076d867412c7..31876db8e996 100644 --- a/arch/powerpc/kernel/ptrace/ptrace-view.c +++ b/arch/powerpc/kernel/ptrace/ptrace-view.c @@ -290,6 +290,9 @@ static int gpr_set(struct task_struct *target, const struct user_regset *regset, static int ppr_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { + if (!target->thread.regs) + return -EINVAL; + return membuf_write(&to, &target->thread.regs->ppr, sizeof(u64)); } @@ -297,6 +300,9 @@ static int ppr_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { + if (!target->thread.regs) + return -EINVAL; + return user_regset_copyin(&pos, &count, &kbuf, &ubuf, &target->thread.regs->ppr, 0, sizeof(u64)); } From 0bb88976bdd2936caf8a2b35746f43c2d7cb6c27 Mon Sep 17 00:00:00 2001 From: Haren Myneni Date: Mon, 20 Mar 2023 19:50:08 -0700 Subject: [PATCH 145/179] powerpc/pseries/vas: Ignore VAS update for DLPAR if copy/paste is not enabled commit eca9f6e6f83b6725b84e1c76fdde19b003cff0eb upstream. The hypervisor supports user-mode NX from Power10. pseries_vas_dlpar_cpu() is called from lparcfg_write() to update VAS windows for DLPAR event in shared processor mode and the kernel gets -ENOTSUPP for HCALLs if the user-mode NX is not supported. The current VAS implementation also supports only with Radix page tables. Whereas in dedicated processor mode, pseries_vas_notifier() is registered only if the copy/paste feature is enabled. So instead of displaying HCALL error messages, update VAS capabilities if the copy/paste feature is available. This patch ignores updating VAS capabilities in pseries_vas_dlpar_cpu() and returns success if the copy/paste feature is not enabled. Then lparcfg_write() completes the processor DLPAR operations without any failures. Fixes: 2147783d6bf0 ("powerpc/pseries: Use lparcfg to reconfig VAS windows for DLPAR CPU") Cc: stable@vger.kernel.org # v6.1+ Signed-off-by: Haren Myneni Reviewed-by: Nathan Lynch Signed-off-by: Michael Ellerman Link: https://msgid.link/1d0e727e7dbd9a28627ef08ca9df9c86a50175e2.camel@linux.ibm.com Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/platforms/pseries/vas.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/powerpc/platforms/pseries/vas.c b/arch/powerpc/platforms/pseries/vas.c index 4ad6e510d405..94c023bb13e0 100644 --- a/arch/powerpc/platforms/pseries/vas.c +++ b/arch/powerpc/platforms/pseries/vas.c @@ -857,6 +857,13 @@ int pseries_vas_dlpar_cpu(void) { int new_nr_creds, rc; + /* + * NX-GZIP is not enabled. Nothing to do for DLPAR event + */ + if (!copypaste_feat) + return 0; + + rc = h_query_vas_capabilities(H_QUERY_VAS_CAPABILITIES, vascaps[VAS_GZIP_DEF_FEAT_TYPE].feat, (u64)virt_to_phys(&hv_cop_caps)); @@ -1013,6 +1020,7 @@ static int __init pseries_vas_init(void) * Linux supports user space COPY/PASTE only with Radix */ if (!radix_enabled()) { + copypaste_feat = false; pr_err("API is supported only with radix page tables\n"); return -ENOTSUPP; } From 3f5ded246953dd41608444a61b5ae1cfda3ef61e Mon Sep 17 00:00:00 2001 From: Benjamin Gray Date: Fri, 3 Mar 2023 09:59:47 +1100 Subject: [PATCH 146/179] powerpc/64s: Fix __pte_needs_flush() false positive warning commit 1abce0580b89464546ae06abd5891ebec43c9470 upstream. Userspace PROT_NONE ptes set _PAGE_PRIVILEGED, triggering a false positive debug assertion that __pte_flags_need_flush() is not called on a kernel mapping. Detect when it is a userspace PROT_NONE page by checking the required bits of PAGE_NONE are set, and none of the RWX bits are set. pte_protnone() is insufficient here because it always returns 0 when CONFIG_NUMA_BALANCING=n. Fixes: b11931e9adc1 ("powerpc/64s: add pte_needs_flush and huge_pmd_needs_flush") Cc: stable@vger.kernel.org # v6.1+ Reported-by: Russell Currey Signed-off-by: Benjamin Gray Signed-off-by: Michael Ellerman Link: https://msgid.link/20230302225947.81083-1-bgray@linux.ibm.com Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/include/asm/book3s/64/tlbflush.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush.h b/arch/powerpc/include/asm/book3s/64/tlbflush.h index 67655cd60545..985bd8e69bdc 100644 --- a/arch/powerpc/include/asm/book3s/64/tlbflush.h +++ b/arch/powerpc/include/asm/book3s/64/tlbflush.h @@ -163,6 +163,11 @@ static inline void flush_tlb_fix_spurious_fault(struct vm_area_struct *vma, */ } +static inline bool __pte_protnone(unsigned long pte) +{ + return (pte & (pgprot_val(PAGE_NONE) | _PAGE_RWX)) == pgprot_val(PAGE_NONE); +} + static inline bool __pte_flags_need_flush(unsigned long oldval, unsigned long newval) { @@ -179,8 +184,8 @@ static inline bool __pte_flags_need_flush(unsigned long oldval, /* * We do not expect kernel mappings or non-PTEs or not-present PTEs. */ - VM_WARN_ON_ONCE(oldval & _PAGE_PRIVILEGED); - VM_WARN_ON_ONCE(newval & _PAGE_PRIVILEGED); + VM_WARN_ON_ONCE(!__pte_protnone(oldval) && oldval & _PAGE_PRIVILEGED); + VM_WARN_ON_ONCE(!__pte_protnone(newval) && newval & _PAGE_PRIVILEGED); VM_WARN_ON_ONCE(!(oldval & _PAGE_PTE)); VM_WARN_ON_ONCE(!(newval & _PAGE_PTE)); VM_WARN_ON_ONCE(!(oldval & _PAGE_PRESENT)); From f6bcbd556978edb98ad96bb2038995221f58fa58 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 21 Mar 2023 00:17:36 -0400 Subject: [PATCH 147/179] NFSv4: Fix hangs when recovering open state after a server reboot commit 6165a16a5ad9b237bb3131cff4d3c601ccb8f9a3 upstream. When we're using a cached open stateid or a delegation in order to avoid sending a CLAIM_PREVIOUS open RPC call to the server, we don't have a new open stateid to present to update_open_stateid(). Instead rely on nfs4_try_open_cached(), just as if we were doing a normal open. Fixes: d2bfda2e7aa0 ("NFSv4: don't reprocess cached open CLAIM_PREVIOUS") Cc: stable@vger.kernel.org Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker Signed-off-by: Greg Kroah-Hartman --- fs/nfs/nfs4proc.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index d70da78e698d..70e76359909c 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1980,8 +1980,7 @@ _nfs4_opendata_reclaim_to_nfs4_state(struct nfs4_opendata *data) if (!data->rpc_done) { if (data->rpc_status) return ERR_PTR(data->rpc_status); - /* cached opens have already been processed */ - goto update; + return nfs4_try_open_cached(data); } ret = nfs_refresh_inode(inode, &data->f_attr); @@ -1990,7 +1989,7 @@ _nfs4_opendata_reclaim_to_nfs4_state(struct nfs4_opendata *data) if (data->o_res.delegation_type != 0) nfs4_opendata_check_deleg(data, state); -update: + if (!update_open_stateid(state, &data->o_res.stateid, NULL, data->o_arg.fmode)) return ERR_PTR(-EAGAIN); From 0c60b9c0b77479eda2e3ab05f0e804722ec2f8b1 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 20 Mar 2023 15:09:54 +0100 Subject: [PATCH 148/179] ALSA: hda/conexant: Partial revert of a quirk for Lenovo commit b871cb971c683f7f212e7ca3c9a6709a75785116 upstream. The recent commit f83bb2592482 ("ALSA: hda/conexant: Add quirk for LENOVO 20149 Notebook model") introduced a quirk for the device with 17aa:3977, but this caused a regression on another model (Lenovo Ideadpad U31) with the very same PCI SSID. And, through skimming over the net, it seems that this PCI SSID is used for multiple different models, so it's no good idea to apply the quirk with the SSID. Although we may take a different ID check (e.g. the codec SSID instead of the PCI SSID), unfortunately, the original patch author couldn't identify the hardware details any longer as the machine was returned, and we can't develop the further proper fix. In this patch, instead, we partially revert the change so that the quirk won't be applied as default for addressing the regression. Meanwhile, the quirk function itself is kept, and it's now made to be applicable via the explicit model=lenovo-20149 option. Fixes: f83bb2592482 ("ALSA: hda/conexant: Add quirk for LENOVO 20149 Notebook model") Reported-by: Jetro Jormalainen Link: https://lore.kernel.org/r/20230308215009.4d3e58a6@mopti Cc: Link: https://lore.kernel.org/r/20230320140954.31154-1-tiwai@suse.de Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/pci/hda/patch_conexant.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/sound/pci/hda/patch_conexant.c b/sound/pci/hda/patch_conexant.c index 75e1d00074b9..a889cccdd607 100644 --- a/sound/pci/hda/patch_conexant.c +++ b/sound/pci/hda/patch_conexant.c @@ -980,7 +980,10 @@ static const struct snd_pci_quirk cxt5066_fixups[] = { SND_PCI_QUIRK(0x17aa, 0x3905, "Lenovo G50-30", CXT_FIXUP_STEREO_DMIC), SND_PCI_QUIRK(0x17aa, 0x390b, "Lenovo G50-80", CXT_FIXUP_STEREO_DMIC), SND_PCI_QUIRK(0x17aa, 0x3975, "Lenovo U300s", CXT_FIXUP_STEREO_DMIC), - SND_PCI_QUIRK(0x17aa, 0x3977, "Lenovo IdeaPad U310", CXT_PINCFG_LENOVO_NOTEBOOK), + /* NOTE: we'd need to extend the quirk for 17aa:3977 as the same + * PCI SSID is used on multiple Lenovo models + */ + SND_PCI_QUIRK(0x17aa, 0x3977, "Lenovo IdeaPad U310", CXT_FIXUP_STEREO_DMIC), SND_PCI_QUIRK(0x17aa, 0x3978, "Lenovo G50-70", CXT_FIXUP_STEREO_DMIC), SND_PCI_QUIRK(0x17aa, 0x397b, "Lenovo S205", CXT_FIXUP_STEREO_DMIC), SND_PCI_QUIRK_VENDOR(0x17aa, "Thinkpad", CXT_FIXUP_THINKPAD_ACPI), @@ -1003,6 +1006,7 @@ static const struct hda_model_fixup cxt5066_fixup_models[] = { { .id = CXT_FIXUP_MUTE_LED_GPIO, .name = "mute-led-gpio" }, { .id = CXT_FIXUP_HP_ZBOOK_MUTE_LED, .name = "hp-zbook-mute-led" }, { .id = CXT_FIXUP_HP_MIC_NO_PRESENCE, .name = "hp-mic-fix" }, + { .id = CXT_PINCFG_LENOVO_NOTEBOOK, .name = "lenovo-20149" }, {} }; From 3e120e9200160e177298db4d1ffc363942b8fcfb Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Fri, 24 Mar 2023 08:50:05 +0100 Subject: [PATCH 149/179] ALSA: usb-audio: Fix regression on detection of Roland VS-100 commit fa4e7a6fa12b1132340785e14bd439cbe95b7a5a upstream. It's been reported that the recent kernel can't probe the PCM devices on Roland VS-100 properly, and it turned out to be a regression by the recent addition of the bit shift range check for the format bits. In the old code, we just did bit-shift and it resulted in zero, which is then corrected to the standard PCM format, while the new code explicitly returns an error in such a case. For addressing the regression, relax the check and fallback to the standard PCM type (with the info output). Fixes: 43d5ca88dfcd ("ALSA: usb-audio: Fix potential out-of-bounds shift") Cc: Link: https://bugzilla.kernel.org/show_bug.cgi?id=217084 Link: https://lore.kernel.org/r/20230324075005.19403-1-tiwai@suse.de Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/usb/format.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/sound/usb/format.c b/sound/usb/format.c index 405dc0bf6678..4b1c5ba121f3 100644 --- a/sound/usb/format.c +++ b/sound/usb/format.c @@ -39,8 +39,12 @@ static u64 parse_audio_format_i_type(struct snd_usb_audio *chip, case UAC_VERSION_1: default: { struct uac_format_type_i_discrete_descriptor *fmt = _fmt; - if (format >= 64) - return 0; /* invalid format */ + if (format >= 64) { + usb_audio_info(chip, + "%u:%d: invalid format type 0x%llx is detected, processed as PCM\n", + fp->iface, fp->altsetting, format); + format = UAC_FORMAT_TYPE_I_PCM; + } sample_width = fmt->bBitResolution; sample_bytes = fmt->bSubframeSize; format = 1ULL << format; From 036d5ae0a7cf0e80617bb467e7d1fd8c8abf882f Mon Sep 17 00:00:00 2001 From: Tim Crawford Date: Fri, 17 Mar 2023 08:18:25 -0600 Subject: [PATCH 150/179] ALSA: hda/realtek: Add quirks for some Clevo laptops commit b7a5822810c4398515300d614d988cf638adecad upstream. Add the audio quirk for some of Clevo's latest RPL laptops: - NP50RNJS (ALC256) - NP70SNE (ALC256) - PD50SNE (ALC1220) - PE60RNE (ALC1220) Co-authored-by: Jeremy Soller Signed-off-by: Tim Crawford Cc: Link: https://lore.kernel.org/r/20230317141825.11807-1-tcrawford@system76.com Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/pci/hda/patch_realtek.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 28ac6c159b2a..62c5486d3af6 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -2631,6 +2631,7 @@ static const struct snd_pci_quirk alc882_fixup_tbl[] = { SND_PCI_QUIRK(0x1558, 0x65e5, "Clevo PC50D[PRS](?:-D|-G)?", ALC1220_FIXUP_CLEVO_PB51ED_PINS), SND_PCI_QUIRK(0x1558, 0x65f1, "Clevo PC50HS", ALC1220_FIXUP_CLEVO_PB51ED_PINS), SND_PCI_QUIRK(0x1558, 0x65f5, "Clevo PD50PN[NRT]", ALC1220_FIXUP_CLEVO_PB51ED_PINS), + SND_PCI_QUIRK(0x1558, 0x66a2, "Clevo PE60RNE", ALC1220_FIXUP_CLEVO_PB51ED_PINS), SND_PCI_QUIRK(0x1558, 0x67d1, "Clevo PB71[ER][CDF]", ALC1220_FIXUP_CLEVO_PB51ED_PINS), SND_PCI_QUIRK(0x1558, 0x67e1, "Clevo PB71[DE][CDF]", ALC1220_FIXUP_CLEVO_PB51ED_PINS), SND_PCI_QUIRK(0x1558, 0x67e5, "Clevo PC70D[PRS](?:-D|-G)?", ALC1220_FIXUP_CLEVO_PB51ED_PINS), @@ -2651,6 +2652,7 @@ static const struct snd_pci_quirk alc882_fixup_tbl[] = { SND_PCI_QUIRK(0x1558, 0x96e1, "Clevo P960[ER][CDFN]-K", ALC1220_FIXUP_CLEVO_P950), SND_PCI_QUIRK(0x1558, 0x97e1, "Clevo P970[ER][CDFN]", ALC1220_FIXUP_CLEVO_P950), SND_PCI_QUIRK(0x1558, 0x97e2, "Clevo P970RC-M", ALC1220_FIXUP_CLEVO_P950), + SND_PCI_QUIRK(0x1558, 0xd502, "Clevo PD50SNE", ALC1220_FIXUP_CLEVO_PB51ED_PINS), SND_PCI_QUIRK_VENDOR(0x1558, "Clevo laptop", ALC882_FIXUP_EAPD), SND_PCI_QUIRK(0x161f, 0x2054, "Medion laptop", ALC883_FIXUP_EAPD), SND_PCI_QUIRK(0x17aa, 0x3a0d, "Lenovo Y530", ALC882_FIXUP_LENOVO_Y530), @@ -9574,6 +9576,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1558, 0x5101, "Clevo S510WU", ALC293_FIXUP_SYSTEM76_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1558, 0x5157, "Clevo W517GU1", ALC293_FIXUP_SYSTEM76_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1558, 0x51a1, "Clevo NS50MU", ALC293_FIXUP_SYSTEM76_MIC_NO_PRESENCE), + SND_PCI_QUIRK(0x1558, 0x5630, "Clevo NP50RNJS", ALC256_FIXUP_SYSTEM76_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1558, 0x70a1, "Clevo NB70T[HJK]", ALC293_FIXUP_SYSTEM76_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1558, 0x70b3, "Clevo NK70SB", ALC293_FIXUP_SYSTEM76_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1558, 0x70f2, "Clevo NH79EPY", ALC293_FIXUP_SYSTEM76_MIC_NO_PRESENCE), @@ -9608,6 +9611,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1558, 0x971d, "Clevo N970T[CDF]", ALC293_FIXUP_SYSTEM76_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1558, 0xa500, "Clevo NL5[03]RU", ALC293_FIXUP_SYSTEM76_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1558, 0xa600, "Clevo NL50NU", ALC293_FIXUP_SYSTEM76_MIC_NO_PRESENCE), + SND_PCI_QUIRK(0x1558, 0xa671, "Clevo NP70SN[CDE]", ALC256_FIXUP_SYSTEM76_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1558, 0xb018, "Clevo NP50D[BE]", ALC293_FIXUP_SYSTEM76_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1558, 0xb019, "Clevo NH77D[BE]Q", ALC293_FIXUP_SYSTEM76_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1558, 0xb022, "Clevo NH77D[DC][QW]", ALC293_FIXUP_SYSTEM76_MIC_NO_PRESENCE), From 3a0e34af6bffee89c5344e00f15844806b9d44b1 Mon Sep 17 00:00:00 2001 From: huangwenhui Date: Tue, 28 Mar 2023 15:46:44 +0800 Subject: [PATCH 151/179] ALSA: hda/realtek: Add quirk for Lenovo ZhaoYang CF4620Z commit 52aad39385e1bfdb34a1b405f699a8ef302c58b0 upstream. Fix headset microphone detection on Lenovo ZhaoYang CF4620Z. [ adjusted to be applicable to the latest tree -- tiwai ] Signed-off-by: huangwenhui Cc: Link: https://lore.kernel.org/r/20230328074644.30142-1-huangwenhuia@uniontech.com Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 62c5486d3af6..070150bbd355 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -9712,6 +9712,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x17aa, 0x511e, "Thinkpad", ALC298_FIXUP_TPT470_DOCK), SND_PCI_QUIRK(0x17aa, 0x511f, "Thinkpad", ALC298_FIXUP_TPT470_DOCK), SND_PCI_QUIRK(0x17aa, 0x9e54, "LENOVO NB", ALC269_FIXUP_LENOVO_EAPD), + SND_PCI_QUIRK(0x17aa, 0x9e56, "Lenovo ZhaoYang CF4620Z", ALC286_FIXUP_SONY_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1849, 0x1233, "ASRock NUC Box 1100", ALC233_FIXUP_NO_AUDIO_JACK), SND_PCI_QUIRK(0x1849, 0xa233, "Positivo Master C6300", ALC269_FIXUP_HEADSET_MIC), SND_PCI_QUIRK(0x19e5, 0x3204, "Huawei MACH-WX9", ALC256_FIXUP_HUAWEI_MACH_WX9_PINS), From 75289cdbe125e62036a72369206ce661e3e4cff2 Mon Sep 17 00:00:00 2001 From: Max Filippov Date: Thu, 16 Mar 2023 23:00:21 -0700 Subject: [PATCH 152/179] xtensa: fix KASAN report for show_stack commit 1d3b7a788ca7435156809a6bd5b20c95b2370d45 upstream. show_stack dumps raw stack contents which may trigger an unnecessary KASAN report. Fix it by copying stack contents to a temporary buffer with __memcpy and then printing that buffer instead of passing stack pointer directly to the print_hex_dump. Cc: stable@vger.kernel.org Signed-off-by: Max Filippov Signed-off-by: Greg Kroah-Hartman --- arch/xtensa/kernel/traps.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/arch/xtensa/kernel/traps.c b/arch/xtensa/kernel/traps.c index 0c25e035ff10..9e9ade20a7ce 100644 --- a/arch/xtensa/kernel/traps.c +++ b/arch/xtensa/kernel/traps.c @@ -541,7 +541,7 @@ static size_t kstack_depth_to_print = CONFIG_PRINT_STACK_DEPTH; void show_stack(struct task_struct *task, unsigned long *sp, const char *loglvl) { - size_t len; + size_t len, off = 0; if (!sp) sp = stack_pointer(task); @@ -550,9 +550,17 @@ void show_stack(struct task_struct *task, unsigned long *sp, const char *loglvl) kstack_depth_to_print * STACK_DUMP_ENTRY_SIZE); printk("%sStack:\n", loglvl); - print_hex_dump(loglvl, " ", DUMP_PREFIX_NONE, - STACK_DUMP_LINE_SIZE, STACK_DUMP_ENTRY_SIZE, - sp, len, false); + while (off < len) { + u8 line[STACK_DUMP_LINE_SIZE]; + size_t line_len = len - off > STACK_DUMP_LINE_SIZE ? + STACK_DUMP_LINE_SIZE : len - off; + + __memcpy(line, (u8 *)sp + off, line_len); + print_hex_dump(loglvl, " ", DUMP_PREFIX_NONE, + STACK_DUMP_LINE_SIZE, STACK_DUMP_ENTRY_SIZE, + line, line_len, false); + off += STACK_DUMP_LINE_SIZE; + } show_trace(task, sp, loglvl); } From 69bec5ac6ea078c6df9018ac3c26c37cb3b0b6c0 Mon Sep 17 00:00:00 2001 From: Douglas Raillard Date: Mon, 6 Mar 2023 12:27:43 +0000 Subject: [PATCH 153/179] rcu: Fix rcu_torture_read ftrace event commit d18a04157fc171fd48075e3dc96471bd3b87f0dd upstream. Fix the rcutorturename field so that its size is correctly reported in the text format embedded in trace.dat files. As it stands, it is reported as being of size 1: field:char rcutorturename[8]; offset:8; size:1; signed:0; Signed-off-by: Douglas Raillard Reviewed-by: Mukesh Ojha Cc: stable@vger.kernel.org Fixes: 04ae87a52074e ("ftrace: Rework event_create_dir()") Reviewed-by: Steven Rostedt (Google) [ boqun: Add "Cc" and "Fixes" tags per Steven ] Signed-off-by: Boqun Feng Signed-off-by: Paul E. McKenney Signed-off-by: Greg Kroah-Hartman --- include/trace/events/rcu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 90b2fb0292cb..012fa0d171b2 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -768,7 +768,7 @@ TRACE_EVENT_RCU(rcu_torture_read, TP_ARGS(rcutorturename, rhp, secs, c_old, c), TP_STRUCT__entry( - __field(char, rcutorturename[RCUTORTURENAME_LEN]) + __array(char, rcutorturename, RCUTORTURENAME_LEN) __field(struct rcu_head *, rhp) __field(unsigned long, secs) __field(unsigned long, c_old) From bc2f8b56217b50ef95498dc56736b955c002aade Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 14 Feb 2023 15:26:43 +0100 Subject: [PATCH 154/179] dt-bindings: mtd: jedec,spi-nor: Document CPOL/CPHA support commit a56cde41340ac4049fa6edac9e6cfbcd2804074e upstream. SPI EEPROMs typically support both SPI Mode 0 (CPOL=CPHA=0) and Mode 3 (CPOL=CPHA=1). However, using the latter is currently flagged as an error by "make dtbs_check", e.g.: arch/arm/boot/dts/r8a7791-koelsch.dtb: flash@0: Unevaluated properties are not allowed ('spi-cpha', 'spi-cpol' were unexpected) From schema: Documentation/devicetree/bindings/mtd/jedec,spi-nor.yaml Fix this by documenting support for CPOL=CPHA=1. Fixes: 233363aba72ac638 ("spi/panel: dt-bindings: drop CPHA and CPOL from common properties") Cc: stable@vger.kernel.org Signed-off-by: Geert Uytterhoeven Reviewed-by: Miquel Raynal Reviewed-by: Krzysztof Kozlowski Reviewed-by: Tudor Ambarus Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/afe470603028db9374930b0c57464b1f6d52bdd3.1676384304.git.geert+renesas@glider.be Signed-off-by: Greg Kroah-Hartman --- Documentation/devicetree/bindings/mtd/jedec,spi-nor.yaml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Documentation/devicetree/bindings/mtd/jedec,spi-nor.yaml b/Documentation/devicetree/bindings/mtd/jedec,spi-nor.yaml index 7149784a36ac..3ee77af5a74c 100644 --- a/Documentation/devicetree/bindings/mtd/jedec,spi-nor.yaml +++ b/Documentation/devicetree/bindings/mtd/jedec,spi-nor.yaml @@ -84,6 +84,13 @@ patternProperties: "^otp(-[0-9]+)?$": type: object + spi-cpol: true + spi-cpha: true + +dependencies: + spi-cpol: [ spi-cpha ] + spi-cpha: [ spi-cpol ] + unevaluatedProperties: false examples: From a028d92967bba61b4909ea26044d98260db1eaed Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 23 Mar 2023 13:09:16 +0100 Subject: [PATCH 155/179] s390/uaccess: add missing earlyclobber annotations to __clear_user() commit 89aba4c26fae4e459f755a18912845c348ee48f3 upstream. Add missing earlyclobber annotation to size, to, and tmp2 operands of the __clear_user() inline assembly since they are modified or written to before the last usage of all input operands. This can lead to incorrect register allocation for the inline assembly. Fixes: 6c2a9e6df604 ("[S390] Use alternative user-copy operations for new hardware.") Reported-by: Mark Rutland Link: https://lore.kernel.org/all/20230321122514.1743889-3-mark.rutland@arm.com/ Cc: stable@vger.kernel.org Reviewed-by: Gerald Schaefer Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik Signed-off-by: Greg Kroah-Hartman --- arch/s390/lib/uaccess.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/lib/uaccess.c b/arch/s390/lib/uaccess.c index 720036fb1924..d44214072779 100644 --- a/arch/s390/lib/uaccess.c +++ b/arch/s390/lib/uaccess.c @@ -172,7 +172,7 @@ unsigned long __clear_user(void __user *to, unsigned long size) "4: slgr %0,%0\n" "5:\n" EX_TABLE(0b,2b) EX_TABLE(6b,2b) EX_TABLE(3b,5b) EX_TABLE(7b,5b) - : "+a" (size), "+a" (to), "+a" (tmp1), "=a" (tmp2) + : "+&a" (size), "+&a" (to), "+a" (tmp1), "=&a" (tmp2) : "a" (empty_zero_page), [spec] "d" (spec.val) : "cc", "memory", "0"); return size; From 3f878da42862ca0a75051846bf992ff2c370bea6 Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Thu, 16 Mar 2023 12:28:09 +0100 Subject: [PATCH 156/179] s390: reintroduce expoline dependence to scripts commit 7bb2107e63d8a4a13bbb6fe0e1cbd68784a2e9ac upstream. Expolines depend on scripts/basic/fixdep. And build of expolines can now race with the fixdep build: make[1]: *** Deleting file 'arch/s390/lib/expoline/expoline.o' /bin/sh: line 1: scripts/basic/fixdep: Permission denied make[1]: *** [../scripts/Makefile.build:385: arch/s390/lib/expoline/expoline.o] Error 126 make: *** [../arch/s390/Makefile:166: expoline_prepare] Error 2 The dependence was removed in the below Fixes: commit. So reintroduce the dependence on scripts. Fixes: a0b0987a7811 ("s390/nospec: remove unneeded header includes") Cc: Joe Lawrence Cc: stable@vger.kernel.org Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Alexander Gordeev Cc: Christian Borntraeger Cc: Sven Schnelle Cc: linux-s390@vger.kernel.org Signed-off-by: Jiri Slaby (SUSE) Link: https://lore.kernel.org/r/20230316112809.7903-1-jirislaby@kernel.org Signed-off-by: Vasily Gorbik Signed-off-by: Greg Kroah-Hartman --- arch/s390/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/Makefile b/arch/s390/Makefile index b3235ab0ace8..ed646c583e4f 100644 --- a/arch/s390/Makefile +++ b/arch/s390/Makefile @@ -162,7 +162,7 @@ vdso_prepare: prepare0 ifdef CONFIG_EXPOLINE_EXTERN modules_prepare: expoline_prepare -expoline_prepare: +expoline_prepare: scripts $(Q)$(MAKE) $(build)=arch/s390/lib/expoline arch/s390/lib/expoline/expoline.o endif endif From f931ca46773a4f02040f63b4cdd1b14abcc130c1 Mon Sep 17 00:00:00 2001 From: Lucas Stach Date: Fri, 24 Feb 2023 18:21:54 +0100 Subject: [PATCH 157/179] drm/etnaviv: fix reference leak when mmaping imported buffer commit 963b2e8c428f79489ceeb058e8314554ec9cbe6f upstream. drm_gem_prime_mmap() takes a reference on the GEM object, but before that drm_gem_mmap_obj() already takes a reference, which will be leaked as only one reference is dropped when the mapping is closed. Drop the extra reference when dma_buf_mmap() succeeds. Cc: stable@vger.kernel.org Signed-off-by: Lucas Stach Reviewed-by: Christian Gmeiner Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/etnaviv/etnaviv_gem_prime.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem_prime.c b/drivers/gpu/drm/etnaviv/etnaviv_gem_prime.c index 3fa2da149639..fc594ea5bbc1 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_gem_prime.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_gem_prime.c @@ -91,7 +91,15 @@ static void *etnaviv_gem_prime_vmap_impl(struct etnaviv_gem_object *etnaviv_obj) static int etnaviv_gem_prime_mmap_obj(struct etnaviv_gem_object *etnaviv_obj, struct vm_area_struct *vma) { - return dma_buf_mmap(etnaviv_obj->base.dma_buf, vma, 0); + int ret; + + ret = dma_buf_mmap(etnaviv_obj->base.dma_buf, vma, 0); + if (!ret) { + /* Drop the reference acquired by drm_gem_mmap_obj(). */ + drm_gem_object_put(&etnaviv_obj->base); + } + + return ret; } static const struct etnaviv_gem_ops etnaviv_gem_prime_ops = { From febacc33298f8d72d5bfd4d23a556cc21b5cd6c2 Mon Sep 17 00:00:00 2001 From: Tim Huang Date: Thu, 30 Mar 2023 10:33:02 +0800 Subject: [PATCH 158/179] drm/amdgpu: allow more APUs to do mode2 reset when go to S4 commit 2fec9dc8e0acc3dfb56d1389151bcf405f087b10 upstream. Skip mode2 reset only for IMU enabled APUs when do S4. This patch is to fix the regression issue https://gitlab.freedesktop.org/drm/amd/-/issues/2483 It is generated by commit b589626674de ("drm/amdgpu: skip ASIC reset for APUs when go to S4"). Fixes: b589626674de ("drm/amdgpu: skip ASIC reset for APUs when go to S4") Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2483 Tested-by: Yuan Perry Signed-off-by: Tim Huang Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org # 6.1.x Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c index 435d81d6ffd9..a78e80f9f65c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c @@ -981,7 +981,12 @@ static bool amdgpu_atcs_pci_probe_handle(struct pci_dev *pdev) */ bool amdgpu_acpi_should_gpu_reset(struct amdgpu_device *adev) { - if (adev->flags & AMD_IS_APU) + if ((adev->flags & AMD_IS_APU) && + adev->gfx.imu.funcs) /* Not need to do mode2 reset for IMU enabled APUs */ + return false; + + if ((adev->flags & AMD_IS_APU) && + amdgpu_acpi_is_s3_active(adev)) return false; if (amdgpu_sriov_vf(adev)) From 41abe8828c83e8f73940790a3861b498a8b5ee3f Mon Sep 17 00:00:00 2001 From: Fangzhi Zuo Date: Fri, 24 Feb 2023 13:45:21 -0500 Subject: [PATCH 159/179] drm/amd/display: Add DSC Support for Synaptics Cascaded MST Hub commit f4f3b7dedbe849e780c779ba67365bb1db0d8637 upstream. Traditional synaptics hub has one MST branch device without virtual dpcd. Synaptics cascaded hub has two chained MST branch devices. DSC decoding is performed via root MST branch device, instead of the second MST branch device. Reviewed-by: Hersen Wu Acked-by: Qingqing Zhuo Signed-off-by: Fangzhi Zuo Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- .../display/amdgpu_dm/amdgpu_dm_mst_types.c | 19 +++++++++++++++++++ .../display/amdgpu_dm/amdgpu_dm_mst_types.h | 12 ++++++++++++ 2 files changed, 31 insertions(+) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c index 8561e9b017a2..36c19579f2d1 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c @@ -208,6 +208,21 @@ bool needs_dsc_aux_workaround(struct dc_link *link) return false; } +bool is_synaptics_cascaded_panamera(struct dc_link *link, struct drm_dp_mst_port *port) +{ + u8 branch_vendor_data[4] = { 0 }; // Vendor data 0x50C ~ 0x50F + + if (drm_dp_dpcd_read(port->mgr->aux, DP_BRANCH_VENDOR_SPECIFIC_START, &branch_vendor_data, 4) == 4) { + if (link->dpcd_caps.branch_dev_id == DP_BRANCH_DEVICE_ID_90CC24 && + IS_SYNAPTICS_CASCADED_PANAMERA(link->dpcd_caps.branch_dev_name, branch_vendor_data)) { + DRM_INFO("Synaptics Cascaded MST hub\n"); + return true; + } + } + + return false; +} + static bool validate_dsc_caps_on_connector(struct amdgpu_dm_connector *aconnector) { struct dc_sink *dc_sink = aconnector->dc_sink; @@ -231,6 +246,10 @@ static bool validate_dsc_caps_on_connector(struct amdgpu_dm_connector *aconnecto needs_dsc_aux_workaround(aconnector->dc_link)) aconnector->dsc_aux = &aconnector->mst_port->dm_dp_aux.aux; + /* synaptics cascaded MST hub case */ + if (!aconnector->dsc_aux && is_synaptics_cascaded_panamera(aconnector->dc_link, port)) + aconnector->dsc_aux = port->mgr->aux; + if (!aconnector->dsc_aux) return false; diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.h b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.h index 97fd70df531b..0b5750202e73 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.h +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.h @@ -34,6 +34,18 @@ #define SYNAPTICS_RC_OFFSET 0x4BC #define SYNAPTICS_RC_DATA 0x4C0 +#define DP_BRANCH_VENDOR_SPECIFIC_START 0x50C + +/** + * Panamera MST Hub detection + * Offset DPCD 050Eh == 0x5A indicates cascaded MST hub case + * Check from beginning of branch device vendor specific field (050Ch) + */ +#define IS_SYNAPTICS_PANAMERA(branchDevName) (((int)branchDevName[4] & 0xF0) == 0x50 ? 1 : 0) +#define BRANCH_HW_REVISION_PANAMERA_A2 0x10 +#define SYNAPTICS_CASCADED_HUB_ID 0x5A +#define IS_SYNAPTICS_CASCADED_PANAMERA(devName, data) ((IS_SYNAPTICS_PANAMERA(devName) && ((int)data[2] == SYNAPTICS_CASCADED_HUB_ID)) ? 1 : 0) + struct amdgpu_display_manager; struct amdgpu_dm_connector; From fd71f4c9e3fa7454f9797c539abd5fcfc8b92b41 Mon Sep 17 00:00:00 2001 From: Fangzhi Zuo Date: Tue, 28 Feb 2023 21:34:58 -0500 Subject: [PATCH 160/179] drm/amd/display: Take FEC Overhead into Timeslot Calculation commit 68dc1846c3a44d5e633be145c169ce2fd5420695 upstream. 8b/10b encoding needs to add 3% fec overhead into the pbn. In the Synapcis Cascaded MST hub, the first stage MST branch device needs the information to determine the timeslot count for the second stage MST branch device. Missing this overhead will leads to insufficient timeslot allocation. Cc: stable@vger.kernel.org Cc: Mario Limonciello Reviewed-by: Hersen Wu Acked-by: Qingqing Zhuo Signed-off-by: Fangzhi Zuo Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher Signed-off-by: Greg Kroah-Hartman --- .../display/amdgpu_dm/amdgpu_dm_mst_types.c | 32 ++++++++++++++----- .../display/amdgpu_dm/amdgpu_dm_mst_types.h | 3 ++ 2 files changed, 27 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c index 36c19579f2d1..df74bc88e460 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c @@ -646,12 +646,25 @@ struct dsc_mst_fairness_params { struct amdgpu_dm_connector *aconnector; }; -static int kbps_to_peak_pbn(int kbps) +static uint16_t get_fec_overhead_multiplier(struct dc_link *dc_link) +{ + u8 link_coding_cap; + uint16_t fec_overhead_multiplier_x1000 = PBN_FEC_OVERHEAD_MULTIPLIER_8B_10B; + + link_coding_cap = dc_link_dp_mst_decide_link_encoding_format(dc_link); + if (link_coding_cap == DP_128b_132b_ENCODING) + fec_overhead_multiplier_x1000 = PBN_FEC_OVERHEAD_MULTIPLIER_128B_132B; + + return fec_overhead_multiplier_x1000; +} + +static int kbps_to_peak_pbn(int kbps, uint16_t fec_overhead_multiplier_x1000) { u64 peak_kbps = kbps; peak_kbps *= 1006; - peak_kbps = div_u64(peak_kbps, 1000); + peak_kbps *= fec_overhead_multiplier_x1000; + peak_kbps = div_u64(peak_kbps, 1000 * 1000); return (int) DIV64_U64_ROUND_UP(peak_kbps * 64, (54 * 8 * 1000)); } @@ -738,11 +751,12 @@ static int increase_dsc_bpp(struct drm_atomic_state *state, int link_timeslots_used; int fair_pbn_alloc; int ret = 0; + uint16_t fec_overhead_multiplier_x1000 = get_fec_overhead_multiplier(dc_link); for (i = 0; i < count; i++) { if (vars[i + k].dsc_enabled) { initial_slack[i] = - kbps_to_peak_pbn(params[i].bw_range.max_kbps) - vars[i + k].pbn; + kbps_to_peak_pbn(params[i].bw_range.max_kbps, fec_overhead_multiplier_x1000) - vars[i + k].pbn; bpp_increased[i] = false; remaining_to_increase += 1; } else { @@ -838,6 +852,7 @@ static int try_disable_dsc(struct drm_atomic_state *state, int next_index; int remaining_to_try = 0; int ret; + uint16_t fec_overhead_multiplier_x1000 = get_fec_overhead_multiplier(dc_link); for (i = 0; i < count; i++) { if (vars[i + k].dsc_enabled @@ -867,7 +882,7 @@ static int try_disable_dsc(struct drm_atomic_state *state, if (next_index == -1) break; - vars[next_index].pbn = kbps_to_peak_pbn(params[next_index].bw_range.stream_kbps); + vars[next_index].pbn = kbps_to_peak_pbn(params[next_index].bw_range.stream_kbps, fec_overhead_multiplier_x1000); ret = drm_dp_atomic_find_time_slots(state, params[next_index].port->mgr, params[next_index].port, @@ -880,7 +895,7 @@ static int try_disable_dsc(struct drm_atomic_state *state, vars[next_index].dsc_enabled = false; vars[next_index].bpp_x16 = 0; } else { - vars[next_index].pbn = kbps_to_peak_pbn(params[next_index].bw_range.max_kbps); + vars[next_index].pbn = kbps_to_peak_pbn(params[next_index].bw_range.max_kbps, fec_overhead_multiplier_x1000); ret = drm_dp_atomic_find_time_slots(state, params[next_index].port->mgr, params[next_index].port, @@ -909,6 +924,7 @@ static int compute_mst_dsc_configs_for_link(struct drm_atomic_state *state, int count = 0; int i, k, ret; bool debugfs_overwrite = false; + uint16_t fec_overhead_multiplier_x1000 = get_fec_overhead_multiplier(dc_link); memset(params, 0, sizeof(params)); @@ -970,7 +986,7 @@ static int compute_mst_dsc_configs_for_link(struct drm_atomic_state *state, /* Try no compression */ for (i = 0; i < count; i++) { vars[i + k].aconnector = params[i].aconnector; - vars[i + k].pbn = kbps_to_peak_pbn(params[i].bw_range.stream_kbps); + vars[i + k].pbn = kbps_to_peak_pbn(params[i].bw_range.stream_kbps, fec_overhead_multiplier_x1000); vars[i + k].dsc_enabled = false; vars[i + k].bpp_x16 = 0; ret = drm_dp_atomic_find_time_slots(state, params[i].port->mgr, params[i].port, @@ -989,7 +1005,7 @@ static int compute_mst_dsc_configs_for_link(struct drm_atomic_state *state, /* Try max compression */ for (i = 0; i < count; i++) { if (params[i].compression_possible && params[i].clock_force_enable != DSC_CLK_FORCE_DISABLE) { - vars[i + k].pbn = kbps_to_peak_pbn(params[i].bw_range.min_kbps); + vars[i + k].pbn = kbps_to_peak_pbn(params[i].bw_range.min_kbps, fec_overhead_multiplier_x1000); vars[i + k].dsc_enabled = true; vars[i + k].bpp_x16 = params[i].bw_range.min_target_bpp_x16; ret = drm_dp_atomic_find_time_slots(state, params[i].port->mgr, @@ -997,7 +1013,7 @@ static int compute_mst_dsc_configs_for_link(struct drm_atomic_state *state, if (ret < 0) return ret; } else { - vars[i + k].pbn = kbps_to_peak_pbn(params[i].bw_range.stream_kbps); + vars[i + k].pbn = kbps_to_peak_pbn(params[i].bw_range.stream_kbps, fec_overhead_multiplier_x1000); vars[i + k].dsc_enabled = false; vars[i + k].bpp_x16 = 0; ret = drm_dp_atomic_find_time_slots(state, params[i].port->mgr, diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.h b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.h index 0b5750202e73..1e4ede1e57ab 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.h +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.h @@ -46,6 +46,9 @@ #define SYNAPTICS_CASCADED_HUB_ID 0x5A #define IS_SYNAPTICS_CASCADED_PANAMERA(devName, data) ((IS_SYNAPTICS_PANAMERA(devName) && ((int)data[2] == SYNAPTICS_CASCADED_HUB_ID)) ? 1 : 0) +#define PBN_FEC_OVERHEAD_MULTIPLIER_8B_10B 1031 +#define PBN_FEC_OVERHEAD_MULTIPLIER_128B_132B 1000 + struct amdgpu_display_manager; struct amdgpu_dm_connector; From 21ee19974b19edcda7a8e4f9bd5b02c3a750a69e Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 16 Mar 2023 17:59:18 +0100 Subject: [PATCH 161/179] drm/i915/gem: Flush lmem contents after construction commit d032ca43f2c80049ce5aabd3f208dc3849359497 upstream. i915_gem_object_create_lmem_from_data() lacks the flush of the data written to lmem to ensure the object is marked as dirty and the writes flushed to the backing store. Once created, we can immediately release the obj->mm.mapping caching of the vmap. Fixes: 7acbbc7cf485 ("drm/i915/guc: put all guc objects in lmem when available") Cc: Matthew Auld Cc: Daniele Ceraolo Spurio Cc: Andi Shyti Cc: Matthew Brost Cc: John Harrison Signed-off-by: Chris Wilson Cc: # v5.16+ Signed-off-by: Nirmoy Das Reviewed-by: Andi Shyti Reviewed-by: Nirmoy Das Link: https://patchwork.freedesktop.org/patch/msgid/20230316165918.13074-1-nirmoy.das@intel.com (cherry picked from commit e2ee10474ce766686e7a7496585cdfaf79e3a1bf) Signed-off-by: Jani Nikula Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/i915/gem/i915_gem_lmem.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_lmem.c b/drivers/gpu/drm/i915/gem/i915_gem_lmem.c index 8949fb0a944f..3198b64ad7db 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_lmem.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_lmem.c @@ -127,7 +127,8 @@ i915_gem_object_create_lmem_from_data(struct drm_i915_private *i915, memcpy(map, data, size); - i915_gem_object_unpin_map(obj); + i915_gem_object_flush_map(obj); + __i915_gem_object_release_map(obj); return obj; } From c781c107731fc09ce4330c8c636b8446d0f72aa4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Mon, 20 Mar 2023 11:05:17 +0200 Subject: [PATCH 162/179] drm/i915/dpt: Treat the DPT BO as a framebuffer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 3413881e1ecc3cba722a2e87ec099692eed5be28 upstream. Currently i915_gem_object_is_framebuffer() doesn't treat the BO containing the framebuffer's DPT as a framebuffer itself. This means eg. that the shrinker can evict the DPT BO while leaving the actual FB BO bound, when the DPT is allocated from regular shmem. That causes an immediate oops during hibernate as we try to rewrite the PTEs inside the already evicted DPT obj. TODO: presumably this might also be the reason for the DPT related display faults under heavy memory pressure, but I'm still not sure how that would happen as the object should be pinned by intel_dpt_pin() while in active use by the display engine... Cc: stable@vger.kernel.org Cc: Juha-Pekka Heikkila Cc: Matthew Auld Cc: Imre Deak Fixes: 0dc987b699ce ("drm/i915/display: Add smem fallback allocation for dpt") Signed-off-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20230320090522.9909-2-ville.syrjala@linux.intel.com Reviewed-by: Juha-Pekka Heikkila (cherry picked from commit 779cb5ba64ec7df80675a956c9022929514f517a) Signed-off-by: Jani Nikula Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/i915/display/intel_dpt.c | 2 ++ drivers/gpu/drm/i915/gem/i915_gem_object.h | 2 +- drivers/gpu/drm/i915/gem/i915_gem_object_types.h | 3 +++ 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/display/intel_dpt.c b/drivers/gpu/drm/i915/display/intel_dpt.c index ac587647e1f5..a3893aff3861 100644 --- a/drivers/gpu/drm/i915/display/intel_dpt.c +++ b/drivers/gpu/drm/i915/display/intel_dpt.c @@ -300,6 +300,7 @@ intel_dpt_create(struct intel_framebuffer *fb) vm->pte_encode = gen8_ggtt_pte_encode; dpt->obj = dpt_obj; + dpt->obj->is_dpt = true; return &dpt->vm; } @@ -308,5 +309,6 @@ void intel_dpt_destroy(struct i915_address_space *vm) { struct i915_dpt *dpt = i915_vm_to_dpt(vm); + dpt->obj->is_dpt = false; i915_vm_put(&dpt->vm); } diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object.h b/drivers/gpu/drm/i915/gem/i915_gem_object.h index 1723af9b0f6a..ea951e2f55b1 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_object.h +++ b/drivers/gpu/drm/i915/gem/i915_gem_object.h @@ -319,7 +319,7 @@ i915_gem_object_never_mmap(const struct drm_i915_gem_object *obj) static inline bool i915_gem_object_is_framebuffer(const struct drm_i915_gem_object *obj) { - return READ_ONCE(obj->frontbuffer); + return READ_ONCE(obj->frontbuffer) || obj->is_dpt; } static inline unsigned int diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object_types.h b/drivers/gpu/drm/i915/gem/i915_gem_object_types.h index ab4c2f90a564..1d0d8ee9d707 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_object_types.h +++ b/drivers/gpu/drm/i915/gem/i915_gem_object_types.h @@ -491,6 +491,9 @@ struct drm_i915_gem_object { */ unsigned int cache_dirty:1; + /* @is_dpt: Object houses a display page table (DPT) */ + unsigned int is_dpt:1; + /** * @read_domains: Read memory domains. * From 0fc6fea41c7122aa5f2088117f50144b507e13d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Mon, 20 Mar 2023 20:35:32 +0200 Subject: [PATCH 163/179] drm/i915: Disable DC states for all commits MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit a2b6e99d8a623544f3bdccd28ee35b9c1b00daa5 upstream. Keeping DC states enabled is incompatible with the _noarm()/_arm() split we use for writing pipe/plane registers. When DC5 and PSR are enabled, all pipe/plane registers effectively become self-arming on account of DC5 exit arming the update, and PSR exit latching it. What probably saves us most of the time is that (with PIPE_MISC[21]=0) all pipe register writes themselves trigger PSR exit, and then we don't re-enter PSR until the idle frame count has elapsed. So it may be that the PSR exit happens already before we've updated the state too much. Also the PSR1 panel (at least on this KBL) seems to discard the first frame we trasmit, presumably still scanning out from its internal framebuffer at that point. So only the second frame we transmit is actually visible. But I suppose that could also be panel specific behaviour. I haven't checked out how other PSR panels behave, nor did I bother to check what the eDP spec has to say about this. And since this really is all about DC states, let's switch from the MODESET domain to the DC_OFF domain. Functionally they are 100% identical. We should probably remove the MODESET domain... And for good measure let's toss in an assert to the place where we do the _noarm() register writes to make sure DC states are in fact off. v2: Just use intel_display_power_is_enabled() (Imre) Cc: #v5.17+ Cc: Manasi Navare Cc: Drew Davenport Cc: Jouni Högander Reviewed-by: Imre Deak Fixes: d13dde449580 ("drm/i915: Split pipe+output CSC programming to noarm+arm pair") Fixes: f8a005eb8972 ("drm/i915: Optimize icl+ universal plane programming") Fixes: 890b6ec4a522 ("drm/i915: Split skl+ plane update into noarm+arm pair") Signed-off-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20230320183532.17727-1-ville.syrjala@linux.intel.com (cherry picked from commit 41b4c7fe72b6105a4b49395eea9aa40cef94288d) Signed-off-by: Jani Nikula Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/i915/display/intel_display.c | 28 +++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_display.c b/drivers/gpu/drm/i915/display/intel_display.c index e0d36edd1e3f..3f3982ae9974 100644 --- a/drivers/gpu/drm/i915/display/intel_display.c +++ b/drivers/gpu/drm/i915/display/intel_display.c @@ -7124,6 +7124,8 @@ static void intel_update_crtc(struct intel_atomic_state *state, intel_fbc_update(state, crtc); + drm_WARN_ON(&i915->drm, !intel_display_power_is_enabled(i915, POWER_DOMAIN_DC_OFF)); + if (!modeset && (new_crtc_state->uapi.color_mgmt_changed || new_crtc_state->update_pipe)) @@ -7500,8 +7502,28 @@ static void intel_atomic_commit_tail(struct intel_atomic_state *state) drm_atomic_helper_wait_for_dependencies(&state->base); drm_dp_mst_atomic_wait_for_dependencies(&state->base); - if (state->modeset) - wakeref = intel_display_power_get(dev_priv, POWER_DOMAIN_MODESET); + /* + * During full modesets we write a lot of registers, wait + * for PLLs, etc. Doing that while DC states are enabled + * is not a good idea. + * + * During fastsets and other updates we also need to + * disable DC states due to the following scenario: + * 1. DC5 exit and PSR exit happen + * 2. Some or all _noarm() registers are written + * 3. Due to some long delay PSR is re-entered + * 4. DC5 entry -> DMC saves the already written new + * _noarm() registers and the old not yet written + * _arm() registers + * 5. DC5 exit -> DMC restores a mixture of old and + * new register values and arms the update + * 6. PSR exit -> hardware latches a mixture of old and + * new register values -> corrupted frame, or worse + * 7. New _arm() registers are finally written + * 8. Hardware finally latches a complete set of new + * register values, and subsequent frames will be OK again + */ + wakeref = intel_display_power_get(dev_priv, POWER_DOMAIN_DC_OFF); intel_atomic_prepare_plane_clear_colors(state); @@ -7640,8 +7662,8 @@ static void intel_atomic_commit_tail(struct intel_atomic_state *state) * the culprit. */ intel_uncore_arm_unclaimed_mmio_detection(&dev_priv->uncore); - intel_display_power_put(dev_priv, POWER_DOMAIN_MODESET, wakeref); } + intel_display_power_put(dev_priv, POWER_DOMAIN_DC_OFF, wakeref); intel_runtime_pm_put(&dev_priv->runtime_pm, state->wakeref); /* From fcf712b4e5d0aacbc193e71962bdaa4d4afe3335 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Mon, 20 Mar 2023 11:54:34 +0200 Subject: [PATCH 164/179] drm/i915: Move CSC load back into .color_commit_arm() when PSR is enabled on skl/glk MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit a8e03e00b62073b494886dbff32f8b5338066c8b upstream. SKL/GLK CSC unit suffers from a nasty issue where a CSC coeff/offset register read or write between DC5 exit and PSR exit will undo the CSC arming performed by DMC, and then during PSR exit the hardware will latch zeroes into the active CSC registers. This causes any plane going through the CSC to output all black. We can sidestep the issue by making sure the PSR exit has already actually happened before we touch the CSC coeff/offset registers. Easiest way to guarantee that is to just move the CSC programming back into the .color_commir_arm() as we force a PSR exit (and crucially wait for it to actually happen) prior to touching the arming registers. When PSR (and thus also DC states) are disabled we don't have anything to worry about, so we can keep using the more optional _noarm() hook for writing the CSC registers. Cc: #v5.19+ Cc: Manasi Navare Cc: Drew Davenport Cc: Imre Deak Cc: Jouni Högander Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/8283 Fixes: d13dde449580 ("drm/i915: Split pipe+output CSC programming to noarm+arm pair") Signed-off-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20230320095438.17328-3-ville.syrjala@linux.intel.com Reviewed-by: Imre Deak (cherry picked from commit 80a892a4c2428b65366721599fc5fe50eaed35fd) Signed-off-by: Jani Nikula Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/i915/display/intel_color.c | 23 ++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_color.c b/drivers/gpu/drm/i915/display/intel_color.c index 6bda4274eae9..c85757f55112 100644 --- a/drivers/gpu/drm/i915/display/intel_color.c +++ b/drivers/gpu/drm/i915/display/intel_color.c @@ -499,6 +499,22 @@ static void icl_color_commit_noarm(const struct intel_crtc_state *crtc_state) icl_load_csc_matrix(crtc_state); } +static void skl_color_commit_noarm(const struct intel_crtc_state *crtc_state) +{ + /* + * Possibly related to display WA #1184, SKL CSC loses the latched + * CSC coeff/offset register values if the CSC registers are disarmed + * between DC5 exit and PSR exit. This will cause the plane(s) to + * output all black (until CSC_MODE is rearmed and properly latched). + * Once PSR exit (and proper register latching) has occurred the + * danger is over. Thus when PSR is enabled the CSC coeff/offset + * register programming will be peformed from skl_color_commit_arm() + * which is called after PSR exit. + */ + if (!crtc_state->has_psr) + ilk_load_csc_matrix(crtc_state); +} + static void ilk_color_commit_noarm(const struct intel_crtc_state *crtc_state) { ilk_load_csc_matrix(crtc_state); @@ -541,6 +557,9 @@ static void skl_color_commit_arm(const struct intel_crtc_state *crtc_state) enum pipe pipe = crtc->pipe; u32 val = 0; + if (crtc_state->has_psr) + ilk_load_csc_matrix(crtc_state); + /* * We don't (yet) allow userspace to control the pipe background color, * so force it to black, but apply pipe gamma and CSC appropriately @@ -2171,7 +2190,7 @@ static const struct intel_color_funcs icl_color_funcs = { static const struct intel_color_funcs glk_color_funcs = { .color_check = glk_color_check, - .color_commit_noarm = ilk_color_commit_noarm, + .color_commit_noarm = skl_color_commit_noarm, .color_commit_arm = skl_color_commit_arm, .load_luts = glk_load_luts, .read_luts = glk_read_luts, @@ -2179,7 +2198,7 @@ static const struct intel_color_funcs glk_color_funcs = { static const struct intel_color_funcs skl_color_funcs = { .color_check = ivb_color_check, - .color_commit_noarm = ilk_color_commit_noarm, + .color_commit_noarm = skl_color_commit_noarm, .color_commit_arm = skl_color_commit_arm, .load_luts = bdw_load_luts, .read_luts = NULL, From 051e660c8185e4c4bee6fb30d796cfe4ae2c345f Mon Sep 17 00:00:00 2001 From: Reiji Watanabe Date: Sun, 12 Mar 2023 20:32:08 -0700 Subject: [PATCH 165/179] KVM: arm64: PMU: Fix GET_ONE_REG for vPMC regs to return the current value commit 9228b26194d1cc00449f12f306f53ef2e234a55b upstream. Have KVM_GET_ONE_REG for vPMU counter (vPMC) registers (PMCCNTR_EL0 and PMEVCNTR_EL0) return the sum of the register value in the sysreg file and the current perf event counter value. Values of vPMC registers are saved in sysreg files on certain occasions. These saved values don't represent the current values of the vPMC registers if the perf events for the vPMCs count events after the save. The current values of those registers are the sum of the sysreg file value and the current perf event counter value. But, when userspace reads those registers (using KVM_GET_ONE_REG), KVM returns the sysreg file value to userspace (not the sum value). Fix this to return the sum value for KVM_GET_ONE_REG. Fixes: 051ff581ce70 ("arm64: KVM: Add access handler for event counter register") Cc: stable@vger.kernel.org Reviewed-by: Marc Zyngier Signed-off-by: Reiji Watanabe Link: https://lore.kernel.org/r/20230313033208.1475499-1-reijiw@google.com Signed-off-by: Oliver Upton Signed-off-by: Greg Kroah-Hartman --- arch/arm64/kvm/sys_regs.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index f4a7c5abcbca..bfe4f17232b3 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -767,6 +767,22 @@ static bool pmu_counter_idx_valid(struct kvm_vcpu *vcpu, u64 idx) return true; } +static int get_pmu_evcntr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 *val) +{ + u64 idx; + + if (r->CRn == 9 && r->CRm == 13 && r->Op2 == 0) + /* PMCCNTR_EL0 */ + idx = ARMV8_PMU_CYCLE_IDX; + else + /* PMEVCNTRn_EL0 */ + idx = ((r->CRm & 3) << 3) | (r->Op2 & 7); + + *val = kvm_pmu_get_counter_value(vcpu, idx); + return 0; +} + static bool access_pmu_evcntr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) @@ -983,7 +999,7 @@ static bool access_pmuserenr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, /* Macro to expand the PMEVCNTRn_EL0 register */ #define PMU_PMEVCNTR_EL0(n) \ { PMU_SYS_REG(SYS_PMEVCNTRn_EL0(n)), \ - .reset = reset_pmevcntr, \ + .reset = reset_pmevcntr, .get_user = get_pmu_evcntr, \ .access = access_pmu_evcntr, .reg = (PMEVCNTR0_EL0 + n), } /* Macro to expand the PMEVTYPERn_EL0 register */ @@ -1632,7 +1648,8 @@ static const struct sys_reg_desc sys_reg_descs[] = { { PMU_SYS_REG(SYS_PMCEID1_EL0), .access = access_pmceid, .reset = NULL }, { PMU_SYS_REG(SYS_PMCCNTR_EL0), - .access = access_pmu_evcntr, .reset = reset_unknown, .reg = PMCCNTR_EL0 }, + .access = access_pmu_evcntr, .reset = reset_unknown, + .reg = PMCCNTR_EL0, .get_user = get_pmu_evcntr}, { PMU_SYS_REG(SYS_PMXEVTYPER_EL0), .access = access_pmu_evtyper, .reset = NULL }, { PMU_SYS_REG(SYS_PMXEVCNTR_EL0), From e4ca4572de06013c6aeedb44c7fadbeec63fa4e7 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 16 Mar 2023 17:45:45 +0000 Subject: [PATCH 166/179] KVM: arm64: Disable interrupts while walking userspace PTs commit e86fc1a3a3e9b4850fe74d738e3cfcf4297d8bba upstream. We walk the userspace PTs to discover what mapping size was used there. However, this can race against the userspace tables being freed, and we end-up in the weeds. Thankfully, the mm code is being generous and will IPI us when doing so. So let's implement our part of the bargain and disable interrupts around the walk. This ensures that nothing terrible happens during that time. We still need to handle the removal of the page tables before the walk. For that, allow get_user_mapping_size() to return an error, and make sure this error can be propagated all the way to the the exit handler. Signed-off-by: Marc Zyngier Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20230316174546.3777507-2-maz@kernel.org Signed-off-by: Oliver Upton Signed-off-by: Greg Kroah-Hartman --- arch/arm64/kvm/mmu.c | 45 +++++++++++++++++++++++++++++++++++++------- 1 file changed, 38 insertions(+), 7 deletions(-) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 2c3759f1f2c5..019472dd98ff 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -646,14 +646,33 @@ static int get_user_mapping_size(struct kvm *kvm, u64 addr) CONFIG_PGTABLE_LEVELS), .mm_ops = &kvm_user_mm_ops, }; + unsigned long flags; kvm_pte_t pte = 0; /* Keep GCC quiet... */ u32 level = ~0; int ret; + /* + * Disable IRQs so that we hazard against a concurrent + * teardown of the userspace page tables (which relies on + * IPI-ing threads). + */ + local_irq_save(flags); ret = kvm_pgtable_get_leaf(&pgt, addr, &pte, &level); - VM_BUG_ON(ret); - VM_BUG_ON(level >= KVM_PGTABLE_MAX_LEVELS); - VM_BUG_ON(!(pte & PTE_VALID)); + local_irq_restore(flags); + + if (ret) + return ret; + + /* + * Not seeing an error, but not updating level? Something went + * deeply wrong... + */ + if (WARN_ON(level >= KVM_PGTABLE_MAX_LEVELS)) + return -EFAULT; + + /* Oops, the userspace PTs are gone... Replay the fault */ + if (!kvm_pte_valid(pte)) + return -EAGAIN; return BIT(ARM64_HW_PGTABLE_LEVEL_SHIFT(level)); } @@ -1006,7 +1025,7 @@ static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot, * * Returns the size of the mapping. */ -static unsigned long +static long transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long hva, kvm_pfn_t *pfnp, phys_addr_t *ipap) @@ -1018,8 +1037,15 @@ transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot, * sure that the HVA and IPA are sufficiently aligned and that the * block map is contained within the memslot. */ - if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE) && - get_user_mapping_size(kvm, hva) >= PMD_SIZE) { + if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) { + int sz = get_user_mapping_size(kvm, hva); + + if (sz < 0) + return sz; + + if (sz < PMD_SIZE) + return PAGE_SIZE; + /* * The address we faulted on is backed by a transparent huge * page. However, because we map the compound huge page and @@ -1138,7 +1164,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, bool logging_active = memslot_is_logging(memslot); bool use_read_lock = false; unsigned long fault_level = kvm_vcpu_trap_get_fault_level(vcpu); - unsigned long vma_pagesize, fault_granule; + long vma_pagesize, fault_granule; enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R; struct kvm_pgtable *pgt; @@ -1295,6 +1321,11 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, vma_pagesize = transparent_hugepage_adjust(kvm, memslot, hva, &pfn, &fault_ipa); + + if (vma_pagesize < 0) { + ret = vma_pagesize; + goto out_unlock; + } } if (fault_status != FSC_PERM && !device && kvm_has_mte(kvm)) { From 8f872c781f6476665c0b09260a429ab03139339e Mon Sep 17 00:00:00 2001 From: "Hans J. Schultz" Date: Fri, 9 Dec 2022 19:28:15 +0200 Subject: [PATCH 167/179] net: dsa: mv88e6xxx: read FID when handling ATU violations commit 4bf24ad09bc0b05e97fb48b962b2c9246fc76727 upstream. When an ATU violation occurs, the switch uses the ATU FID register to report the FID of the MAC address that incurred the violation. It would be good for the driver to know the FID value for purposes such as logging and CPU-based authentication. Up until now, the driver has been calling the mv88e6xxx_g1_atu_op() function to read ATU violations, but that doesn't do exactly what we want, namely it calls mv88e6xxx_g1_atu_fid_write() with FID 0. (side note, the documentation for the ATU Get/Clear Violation command says that writes to the ATU FID register have no effect before the operation starts, it's only that we disregard the value that this register provides once the operation completes) So mv88e6xxx_g1_atu_fid_write() is not what we want, but rather mv88e6xxx_g1_atu_fid_read(). However, the latter doesn't exist, we need to write it. The remainder of mv88e6xxx_g1_atu_op() except for mv88e6xxx_g1_atu_fid_write() is still needed, namely to send a GET_CLR_VIOLATION command to the ATU. In principle we could have still kept calling mv88e6xxx_g1_atu_op(), but the MDIO writes to the ATU FID register are pointless, but in the interest of doing less CPU work per interrupt, write a new function called mv88e6xxx_g1_read_atu_violation() and call it. The FID will be the port default FID as set by mv88e6xxx_port_set_fid() if the VID from the packet cannot be found in the VTU. Otherwise it is the FID derived from the VTU entry associated with that VID. Signed-off-by: Hans J. Schultz Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski Cc: Fabio Estevam Signed-off-by: Greg Kroah-Hartman --- drivers/net/dsa/mv88e6xxx/global1_atu.c | 72 +++++++++++++++++++++---- 1 file changed, 61 insertions(+), 11 deletions(-) diff --git a/drivers/net/dsa/mv88e6xxx/global1_atu.c b/drivers/net/dsa/mv88e6xxx/global1_atu.c index 40bd67a5c8e9..4f689396fc40 100644 --- a/drivers/net/dsa/mv88e6xxx/global1_atu.c +++ b/drivers/net/dsa/mv88e6xxx/global1_atu.c @@ -114,6 +114,19 @@ static int mv88e6xxx_g1_atu_op_wait(struct mv88e6xxx_chip *chip) return mv88e6xxx_g1_wait_bit(chip, MV88E6XXX_G1_ATU_OP, bit, 0); } +static int mv88e6xxx_g1_read_atu_violation(struct mv88e6xxx_chip *chip) +{ + int err; + + err = mv88e6xxx_g1_write(chip, MV88E6XXX_G1_ATU_OP, + MV88E6XXX_G1_ATU_OP_BUSY | + MV88E6XXX_G1_ATU_OP_GET_CLR_VIOLATION); + if (err) + return err; + + return mv88e6xxx_g1_atu_op_wait(chip); +} + static int mv88e6xxx_g1_atu_op(struct mv88e6xxx_chip *chip, u16 fid, u16 op) { u16 val; @@ -159,6 +172,41 @@ int mv88e6xxx_g1_atu_get_next(struct mv88e6xxx_chip *chip, u16 fid) return mv88e6xxx_g1_atu_op(chip, fid, MV88E6XXX_G1_ATU_OP_GET_NEXT_DB); } +static int mv88e6xxx_g1_atu_fid_read(struct mv88e6xxx_chip *chip, u16 *fid) +{ + u16 val = 0, upper = 0, op = 0; + int err = -EOPNOTSUPP; + + if (mv88e6xxx_num_databases(chip) > 256) { + err = mv88e6xxx_g1_read(chip, MV88E6352_G1_ATU_FID, &val); + val &= 0xfff; + if (err) + return err; + } else { + err = mv88e6xxx_g1_read(chip, MV88E6XXX_G1_ATU_OP, &op); + if (err) + return err; + if (mv88e6xxx_num_databases(chip) > 64) { + /* ATU DBNum[7:4] are located in ATU Control 15:12 */ + err = mv88e6xxx_g1_read(chip, MV88E6XXX_G1_ATU_CTL, + &upper); + if (err) + return err; + + upper = (upper >> 8) & 0x00f0; + } else if (mv88e6xxx_num_databases(chip) > 16) { + /* ATU DBNum[5:4] are located in ATU Operation 9:8 */ + upper = (op >> 4) & 0x30; + } + + /* ATU DBNum[3:0] are located in ATU Operation 3:0 */ + val = (op & 0xf) | upper; + } + *fid = val; + + return err; +} + /* Offset 0x0C: ATU Data Register */ static int mv88e6xxx_g1_atu_data_read(struct mv88e6xxx_chip *chip, @@ -353,14 +401,12 @@ static irqreturn_t mv88e6xxx_g1_atu_prob_irq_thread_fn(int irq, void *dev_id) { struct mv88e6xxx_chip *chip = dev_id; struct mv88e6xxx_atu_entry entry; - int spid; - int err; - u16 val; + int err, spid; + u16 val, fid; mv88e6xxx_reg_lock(chip); - err = mv88e6xxx_g1_atu_op(chip, 0, - MV88E6XXX_G1_ATU_OP_GET_CLR_VIOLATION); + err = mv88e6xxx_g1_read_atu_violation(chip); if (err) goto out; @@ -368,6 +414,10 @@ static irqreturn_t mv88e6xxx_g1_atu_prob_irq_thread_fn(int irq, void *dev_id) if (err) goto out; + err = mv88e6xxx_g1_atu_fid_read(chip, &fid); + if (err) + goto out; + err = mv88e6xxx_g1_atu_data_read(chip, &entry); if (err) goto out; @@ -386,22 +436,22 @@ static irqreturn_t mv88e6xxx_g1_atu_prob_irq_thread_fn(int irq, void *dev_id) if (val & MV88E6XXX_G1_ATU_OP_MEMBER_VIOLATION) { dev_err_ratelimited(chip->dev, - "ATU member violation for %pM portvec %x spid %d\n", - entry.mac, entry.portvec, spid); + "ATU member violation for %pM fid %u portvec %x spid %d\n", + entry.mac, fid, entry.portvec, spid); chip->ports[spid].atu_member_violation++; } if (val & MV88E6XXX_G1_ATU_OP_MISS_VIOLATION) { dev_err_ratelimited(chip->dev, - "ATU miss violation for %pM portvec %x spid %d\n", - entry.mac, entry.portvec, spid); + "ATU miss violation for %pM fid %u portvec %x spid %d\n", + entry.mac, fid, entry.portvec, spid); chip->ports[spid].atu_miss_violation++; } if (val & MV88E6XXX_G1_ATU_OP_FULL_VIOLATION) { dev_err_ratelimited(chip->dev, - "ATU full violation for %pM portvec %x spid %d\n", - entry.mac, entry.portvec, spid); + "ATU full violation for %pM fid %u portvec %x spid %d\n", + entry.mac, fid, entry.portvec, spid); chip->ports[spid].atu_full_violation++; } mv88e6xxx_reg_unlock(chip); From be831b5c696334e8ee6cfce2cf9b92923f07aae9 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 9 Dec 2022 19:28:16 +0200 Subject: [PATCH 168/179] net: dsa: mv88e6xxx: replace ATU violation prints with trace points commit 8646384d80f3d3b4a66b3284dbbd8232d1b8799e upstream. In applications where the switch ports must perform 802.1X based authentication and are therefore locked, ATU violation interrupts are quite to be expected as part of normal operation. The problem is that they currently spam the kernel log, even if rate limited. Create a series of trace points, all derived from the same event class, which log these violations to the kernel's trace buffer, which is both much faster and much easier to ignore than printing to a serial console. New usage model: $ trace-cmd list | grep mv88e6xxx mv88e6xxx mv88e6xxx:mv88e6xxx_atu_full_violation mv88e6xxx:mv88e6xxx_atu_miss_violation mv88e6xxx:mv88e6xxx_atu_member_violation $ trace-cmd record -e mv88e6xxx sleep 10 Signed-off-by: Vladimir Oltean Reviewed-by: Saeed Mahameed Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski Cc: Fabio Estevam Signed-off-by: Greg Kroah-Hartman --- drivers/net/dsa/mv88e6xxx/Makefile | 4 ++ drivers/net/dsa/mv88e6xxx/global1_atu.c | 19 +++---- drivers/net/dsa/mv88e6xxx/trace.c | 6 +++ drivers/net/dsa/mv88e6xxx/trace.h | 66 +++++++++++++++++++++++++ 4 files changed, 86 insertions(+), 9 deletions(-) create mode 100644 drivers/net/dsa/mv88e6xxx/trace.c create mode 100644 drivers/net/dsa/mv88e6xxx/trace.h diff --git a/drivers/net/dsa/mv88e6xxx/Makefile b/drivers/net/dsa/mv88e6xxx/Makefile index c8eca2b6f959..49bf358b9c4f 100644 --- a/drivers/net/dsa/mv88e6xxx/Makefile +++ b/drivers/net/dsa/mv88e6xxx/Makefile @@ -15,3 +15,7 @@ mv88e6xxx-objs += port_hidden.o mv88e6xxx-$(CONFIG_NET_DSA_MV88E6XXX_PTP) += ptp.o mv88e6xxx-objs += serdes.o mv88e6xxx-objs += smi.o +mv88e6xxx-objs += trace.o + +# for tracing framework to find trace.h +CFLAGS_trace.o := -I$(src) diff --git a/drivers/net/dsa/mv88e6xxx/global1_atu.c b/drivers/net/dsa/mv88e6xxx/global1_atu.c index 4f689396fc40..7c513a03789c 100644 --- a/drivers/net/dsa/mv88e6xxx/global1_atu.c +++ b/drivers/net/dsa/mv88e6xxx/global1_atu.c @@ -12,6 +12,7 @@ #include "chip.h" #include "global1.h" +#include "trace.h" /* Offset 0x01: ATU FID Register */ @@ -435,23 +436,23 @@ static irqreturn_t mv88e6xxx_g1_atu_prob_irq_thread_fn(int irq, void *dev_id) } if (val & MV88E6XXX_G1_ATU_OP_MEMBER_VIOLATION) { - dev_err_ratelimited(chip->dev, - "ATU member violation for %pM fid %u portvec %x spid %d\n", - entry.mac, fid, entry.portvec, spid); + trace_mv88e6xxx_atu_member_violation(chip->dev, spid, + entry.portvec, entry.mac, + fid); chip->ports[spid].atu_member_violation++; } if (val & MV88E6XXX_G1_ATU_OP_MISS_VIOLATION) { - dev_err_ratelimited(chip->dev, - "ATU miss violation for %pM fid %u portvec %x spid %d\n", - entry.mac, fid, entry.portvec, spid); + trace_mv88e6xxx_atu_miss_violation(chip->dev, spid, + entry.portvec, entry.mac, + fid); chip->ports[spid].atu_miss_violation++; } if (val & MV88E6XXX_G1_ATU_OP_FULL_VIOLATION) { - dev_err_ratelimited(chip->dev, - "ATU full violation for %pM fid %u portvec %x spid %d\n", - entry.mac, fid, entry.portvec, spid); + trace_mv88e6xxx_atu_full_violation(chip->dev, spid, + entry.portvec, entry.mac, + fid); chip->ports[spid].atu_full_violation++; } mv88e6xxx_reg_unlock(chip); diff --git a/drivers/net/dsa/mv88e6xxx/trace.c b/drivers/net/dsa/mv88e6xxx/trace.c new file mode 100644 index 000000000000..7833cb50ca5d --- /dev/null +++ b/drivers/net/dsa/mv88e6xxx/trace.c @@ -0,0 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Copyright 2022 NXP + */ + +#define CREATE_TRACE_POINTS +#include "trace.h" diff --git a/drivers/net/dsa/mv88e6xxx/trace.h b/drivers/net/dsa/mv88e6xxx/trace.h new file mode 100644 index 000000000000..d9ab5c8dee55 --- /dev/null +++ b/drivers/net/dsa/mv88e6xxx/trace.h @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* Copyright 2022 NXP + */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM mv88e6xxx + +#if !defined(_MV88E6XXX_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) +#define _MV88E6XXX_TRACE_H + +#include +#include +#include + +DECLARE_EVENT_CLASS(mv88e6xxx_atu_violation, + + TP_PROTO(const struct device *dev, int spid, u16 portvec, + const unsigned char *addr, u16 fid), + + TP_ARGS(dev, spid, portvec, addr, fid), + + TP_STRUCT__entry( + __string(name, dev_name(dev)) + __field(int, spid) + __field(u16, portvec) + __array(unsigned char, addr, ETH_ALEN) + __field(u16, fid) + ), + + TP_fast_assign( + __assign_str(name, dev_name(dev)); + __entry->spid = spid; + __entry->portvec = portvec; + memcpy(__entry->addr, addr, ETH_ALEN); + __entry->fid = fid; + ), + + TP_printk("dev %s spid %d portvec 0x%x addr %pM fid %u", + __get_str(name), __entry->spid, __entry->portvec, + __entry->addr, __entry->fid) +); + +DEFINE_EVENT(mv88e6xxx_atu_violation, mv88e6xxx_atu_member_violation, + TP_PROTO(const struct device *dev, int spid, u16 portvec, + const unsigned char *addr, u16 fid), + TP_ARGS(dev, spid, portvec, addr, fid)); + +DEFINE_EVENT(mv88e6xxx_atu_violation, mv88e6xxx_atu_miss_violation, + TP_PROTO(const struct device *dev, int spid, u16 portvec, + const unsigned char *addr, u16 fid), + TP_ARGS(dev, spid, portvec, addr, fid)); + +DEFINE_EVENT(mv88e6xxx_atu_violation, mv88e6xxx_atu_full_violation, + TP_PROTO(const struct device *dev, int spid, u16 portvec, + const unsigned char *addr, u16 fid), + TP_ARGS(dev, spid, portvec, addr, fid)); + +#endif /* _MV88E6XXX_TRACE_H */ + +/* We don't want to use include/trace/events */ +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE trace +/* This part must be outside protection */ +#include From 0f9e728e1a6c83dfebb367f9dec6fc6ef1dc6cbb Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 9 Dec 2022 19:28:17 +0200 Subject: [PATCH 169/179] net: dsa: mv88e6xxx: replace VTU violation prints with trace points commit 9e3d9ae52b5657399a7b61258cc7482434a911bb upstream. It is possible to trigger these VTU violation messages very easily, it's only necessary to send packets with an unknown VLAN ID to a port that belongs to a VLAN-aware bridge. Do a similar thing as for ATU violation messages, and hide them in the kernel's trace buffer. New usage model: $ trace-cmd list | grep mv88e6xxx mv88e6xxx mv88e6xxx:mv88e6xxx_vtu_miss_violation mv88e6xxx:mv88e6xxx_vtu_member_violation $ trace-cmd report Signed-off-by: Vladimir Oltean Reviewed-by: Saeed Mahameed Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski Cc: Fabio Estevam Signed-off-by: Greg Kroah-Hartman --- drivers/net/dsa/mv88e6xxx/global1_vtu.c | 7 +++--- drivers/net/dsa/mv88e6xxx/trace.h | 30 +++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 4 deletions(-) diff --git a/drivers/net/dsa/mv88e6xxx/global1_vtu.c b/drivers/net/dsa/mv88e6xxx/global1_vtu.c index 38e18f5811bf..bcfb4a812055 100644 --- a/drivers/net/dsa/mv88e6xxx/global1_vtu.c +++ b/drivers/net/dsa/mv88e6xxx/global1_vtu.c @@ -13,6 +13,7 @@ #include "chip.h" #include "global1.h" +#include "trace.h" /* Offset 0x02: VTU FID Register */ @@ -628,14 +629,12 @@ static irqreturn_t mv88e6xxx_g1_vtu_prob_irq_thread_fn(int irq, void *dev_id) spid = val & MV88E6XXX_G1_VTU_OP_SPID_MASK; if (val & MV88E6XXX_G1_VTU_OP_MEMBER_VIOLATION) { - dev_err_ratelimited(chip->dev, "VTU member violation for vid %d, source port %d\n", - vid, spid); + trace_mv88e6xxx_vtu_member_violation(chip->dev, spid, vid); chip->ports[spid].vtu_member_violation++; } if (val & MV88E6XXX_G1_VTU_OP_MISS_VIOLATION) { - dev_dbg_ratelimited(chip->dev, "VTU miss violation for vid %d, source port %d\n", - vid, spid); + trace_mv88e6xxx_vtu_miss_violation(chip->dev, spid, vid); chip->ports[spid].vtu_miss_violation++; } diff --git a/drivers/net/dsa/mv88e6xxx/trace.h b/drivers/net/dsa/mv88e6xxx/trace.h index d9ab5c8dee55..f59ca04768e7 100644 --- a/drivers/net/dsa/mv88e6xxx/trace.h +++ b/drivers/net/dsa/mv88e6xxx/trace.h @@ -55,6 +55,36 @@ DEFINE_EVENT(mv88e6xxx_atu_violation, mv88e6xxx_atu_full_violation, const unsigned char *addr, u16 fid), TP_ARGS(dev, spid, portvec, addr, fid)); +DECLARE_EVENT_CLASS(mv88e6xxx_vtu_violation, + + TP_PROTO(const struct device *dev, int spid, u16 vid), + + TP_ARGS(dev, spid, vid), + + TP_STRUCT__entry( + __string(name, dev_name(dev)) + __field(int, spid) + __field(u16, vid) + ), + + TP_fast_assign( + __assign_str(name, dev_name(dev)); + __entry->spid = spid; + __entry->vid = vid; + ), + + TP_printk("dev %s spid %d vid %u", + __get_str(name), __entry->spid, __entry->vid) +); + +DEFINE_EVENT(mv88e6xxx_vtu_violation, mv88e6xxx_vtu_member_violation, + TP_PROTO(const struct device *dev, int spid, u16 vid), + TP_ARGS(dev, spid, vid)); + +DEFINE_EVENT(mv88e6xxx_vtu_violation, mv88e6xxx_vtu_miss_violation, + TP_PROTO(const struct device *dev, int spid, u16 vid), + TP_ARGS(dev, spid, vid)); + #endif /* _MV88E6XXX_TRACE_H */ /* We don't want to use include/trace/events */ From 2e35b08b66b0b2558216b3764df729d9fe2012ed Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Sat, 1 Oct 2022 13:44:25 +0300 Subject: [PATCH 170/179] selftests/bpf: Test btf dump for struct with padding only fields [ Upstream commit d503f1176b14f722a40ea5110312614982f9a80b ] Structures with zero regular fields but some padding constitute a special case in btf_dump.c:btf_dump_emit_struct_def with regards to newline before closing '}'. Signed-off-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20221001104425.415768-2-eddyz87@gmail.com Stable-dep-of: ea2ce1ba99aa ("libbpf: Fix BTF-to-C converter's padding logic") Signed-off-by: Sasha Levin --- .../selftests/bpf/progs/btf_dump_test_case_padding.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/btf_dump_test_case_padding.c b/tools/testing/selftests/bpf/progs/btf_dump_test_case_padding.c index f2661c8d2d90..7cb522d22a66 100644 --- a/tools/testing/selftests/bpf/progs/btf_dump_test_case_padding.c +++ b/tools/testing/selftests/bpf/progs/btf_dump_test_case_padding.c @@ -102,12 +102,21 @@ struct zone { struct zone_padding __pad__; }; +/* ----- START-EXPECTED-OUTPUT ----- */ +struct padding_wo_named_members { + long: 64; + long: 64; +}; + +/* ------ END-EXPECTED-OUTPUT ------ */ + int f(struct { struct padded_implicitly _1; struct padded_explicitly _2; struct padded_a_lot _3; struct padded_cache_line _4; struct zone _5; + struct padding_wo_named_members _6; } *_) { return 0; From 524617e553bc33f881ec3d4d5233fa1bb1e119de Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 12 Dec 2022 13:15:04 -0800 Subject: [PATCH 171/179] libbpf: Fix BTF-to-C converter's padding logic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit ea2ce1ba99aa6a60c8d8a706e3abadf3de372163 ] Turns out that btf_dump API doesn't handle a bunch of tricky corner cases, as reported by Per, and further discovered using his testing Python script ([0]). This patch revamps btf_dump's padding logic significantly, making it more correct and also avoiding unnecessary explicit padding, where compiler would pad naturally. This overall topic turned out to be very tricky and subtle, there are lots of subtle corner cases. The comments in the code tries to give some clues, but comments themselves are supposed to be paired with good understanding of C alignment and padding rules. Plus some experimentation to figure out subtle things like whether `long :0;` means that struct is now forced to be long-aligned (no, it's not, turns out). Anyways, Per's script, while not completely correct in some known situations, doesn't show any obvious cases where this logic breaks, so this is a nice improvement over the previous state of this logic. Some selftests had to be adjusted to accommodate better use of natural alignment rules, eliminating some unnecessary padding, or changing it to `type: 0;` alignment markers. Note also that for when we are in between bitfields, we emit explicit bit size, while otherwise we use `: 0`, this feels much more natural in practice. Next patch will add few more test cases, found through randomized Per's script. [0] https://lore.kernel.org/bpf/85f83c333f5355c8ac026f835b18d15060725fcb.camel@ericsson.com/ Reported-by: Per Sundström XP Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20221212211505.558851-6-andrii@kernel.org Signed-off-by: Sasha Levin --- tools/lib/bpf/btf_dump.c | 167 +++++++++++++----- .../bpf/progs/btf_dump_test_case_bitfields.c | 2 +- .../bpf/progs/btf_dump_test_case_padding.c | 58 ++++-- 3 files changed, 163 insertions(+), 64 deletions(-) diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c index 0b470169729e..09c78a797518 100644 --- a/tools/lib/bpf/btf_dump.c +++ b/tools/lib/bpf/btf_dump.c @@ -829,6 +829,25 @@ static void btf_dump_emit_type(struct btf_dump *d, __u32 id, __u32 cont_id) } } +static int btf_natural_align_of(const struct btf *btf, __u32 id) +{ + const struct btf_type *t = btf__type_by_id(btf, id); + int i, align, vlen; + const struct btf_member *m; + + if (!btf_is_composite(t)) + return btf__align_of(btf, id); + + align = 1; + m = btf_members(t); + vlen = btf_vlen(t); + for (i = 0; i < vlen; i++, m++) { + align = max(align, btf__align_of(btf, m->type)); + } + + return align; +} + static bool btf_is_struct_packed(const struct btf *btf, __u32 id, const struct btf_type *t) { @@ -836,16 +855,16 @@ static bool btf_is_struct_packed(const struct btf *btf, __u32 id, int align, i, bit_sz; __u16 vlen; - align = btf__align_of(btf, id); - /* size of a non-packed struct has to be a multiple of its alignment*/ - if (align && t->size % align) + align = btf_natural_align_of(btf, id); + /* size of a non-packed struct has to be a multiple of its alignment */ + if (align && (t->size % align) != 0) return true; m = btf_members(t); vlen = btf_vlen(t); /* all non-bitfield fields have to be naturally aligned */ for (i = 0; i < vlen; i++, m++) { - align = btf__align_of(btf, m->type); + align = btf_natural_align_of(btf, m->type); bit_sz = btf_member_bitfield_size(t, i); if (align && bit_sz == 0 && m->offset % (8 * align) != 0) return true; @@ -858,44 +877,97 @@ static bool btf_is_struct_packed(const struct btf *btf, __u32 id, return false; } -static int chip_away_bits(int total, int at_most) -{ - return total % at_most ? : at_most; -} - static void btf_dump_emit_bit_padding(const struct btf_dump *d, - int cur_off, int m_off, int m_bit_sz, - int align, int lvl) + int cur_off, int next_off, int next_align, + bool in_bitfield, int lvl) { - int off_diff = m_off - cur_off; - int ptr_bits = d->ptr_sz * 8; + const struct { + const char *name; + int bits; + } pads[] = { + {"long", d->ptr_sz * 8}, {"int", 32}, {"short", 16}, {"char", 8} + }; + int new_off, pad_bits, bits, i; + const char *pad_type; - if (off_diff <= 0) - /* no gap */ - return; - if (m_bit_sz == 0 && off_diff < align * 8) - /* natural padding will take care of a gap */ - return; + if (cur_off >= next_off) + return; /* no gap */ - while (off_diff > 0) { - const char *pad_type; - int pad_bits; + /* For filling out padding we want to take advantage of + * natural alignment rules to minimize unnecessary explicit + * padding. First, we find the largest type (among long, int, + * short, or char) that can be used to force naturally aligned + * boundary. Once determined, we'll use such type to fill in + * the remaining padding gap. In some cases we can rely on + * compiler filling some gaps, but sometimes we need to force + * alignment to close natural alignment with markers like + * `long: 0` (this is always the case for bitfields). Note + * that even if struct itself has, let's say 4-byte alignment + * (i.e., it only uses up to int-aligned types), using `long: + * X;` explicit padding doesn't actually change struct's + * overall alignment requirements, but compiler does take into + * account that type's (long, in this example) natural + * alignment requirements when adding implicit padding. We use + * this fact heavily and don't worry about ruining correct + * struct alignment requirement. + */ + for (i = 0; i < ARRAY_SIZE(pads); i++) { + pad_bits = pads[i].bits; + pad_type = pads[i].name; - if (ptr_bits > 32 && off_diff > 32) { - pad_type = "long"; - pad_bits = chip_away_bits(off_diff, ptr_bits); - } else if (off_diff > 16) { - pad_type = "int"; - pad_bits = chip_away_bits(off_diff, 32); - } else if (off_diff > 8) { - pad_type = "short"; - pad_bits = chip_away_bits(off_diff, 16); - } else { - pad_type = "char"; - pad_bits = chip_away_bits(off_diff, 8); + new_off = roundup(cur_off, pad_bits); + if (new_off <= next_off) + break; + } + + if (new_off > cur_off && new_off <= next_off) { + /* We need explicit `: 0` aligning mark if next + * field is right on alignment offset and its + * alignment requirement is less strict than 's + * alignment (so compiler won't naturally align to the + * offset we expect), or if subsequent `: X`, + * will actually completely fit in the remaining hole, + * making compiler basically ignore `: X` + * completely. + */ + if (in_bitfield || + (new_off == next_off && roundup(cur_off, next_align * 8) != new_off) || + (new_off != next_off && next_off - new_off <= new_off - cur_off)) + /* but for bitfields we'll emit explicit bit count */ + btf_dump_printf(d, "\n%s%s: %d;", pfx(lvl), pad_type, + in_bitfield ? new_off - cur_off : 0); + cur_off = new_off; + } + + /* Now we know we start at naturally aligned offset for a chosen + * padding type (long, int, short, or char), and so the rest is just + * a straightforward filling of remaining padding gap with full + * `: sizeof();` markers, except for the last one, which + * might need smaller than sizeof() padding. + */ + while (cur_off != next_off) { + bits = min(next_off - cur_off, pad_bits); + if (bits == pad_bits) { + btf_dump_printf(d, "\n%s%s: %d;", pfx(lvl), pad_type, pad_bits); + cur_off += bits; + continue; + } + /* For the remainder padding that doesn't cover entire + * pad_type bit length, we pick the smallest necessary type. + * This is pure aesthetics, we could have just used `long`, + * but having smallest necessary one communicates better the + * scale of the padding gap. + */ + for (i = ARRAY_SIZE(pads) - 1; i >= 0; i--) { + pad_type = pads[i].name; + pad_bits = pads[i].bits; + if (pad_bits < bits) + continue; + + btf_dump_printf(d, "\n%s%s: %d;", pfx(lvl), pad_type, bits); + cur_off += bits; + break; } - btf_dump_printf(d, "\n%s%s: %d;", pfx(lvl), pad_type, pad_bits); - off_diff -= pad_bits; } } @@ -915,9 +987,11 @@ static void btf_dump_emit_struct_def(struct btf_dump *d, { const struct btf_member *m = btf_members(t); bool is_struct = btf_is_struct(t); - int align, i, packed, off = 0; + bool packed, prev_bitfield = false; + int align, i, off = 0; __u16 vlen = btf_vlen(t); + align = btf__align_of(d->btf, id); packed = is_struct ? btf_is_struct_packed(d->btf, id, t) : 0; btf_dump_printf(d, "%s%s%s {", @@ -927,33 +1001,36 @@ static void btf_dump_emit_struct_def(struct btf_dump *d, for (i = 0; i < vlen; i++, m++) { const char *fname; - int m_off, m_sz; + int m_off, m_sz, m_align; + bool in_bitfield; fname = btf_name_of(d, m->name_off); m_sz = btf_member_bitfield_size(t, i); m_off = btf_member_bit_offset(t, i); - align = packed ? 1 : btf__align_of(d->btf, m->type); + m_align = packed ? 1 : btf__align_of(d->btf, m->type); - btf_dump_emit_bit_padding(d, off, m_off, m_sz, align, lvl + 1); + in_bitfield = prev_bitfield && m_sz != 0; + + btf_dump_emit_bit_padding(d, off, m_off, m_align, in_bitfield, lvl + 1); btf_dump_printf(d, "\n%s", pfx(lvl + 1)); btf_dump_emit_type_decl(d, m->type, fname, lvl + 1); if (m_sz) { btf_dump_printf(d, ": %d", m_sz); off = m_off + m_sz; + prev_bitfield = true; } else { m_sz = max((__s64)0, btf__resolve_size(d->btf, m->type)); off = m_off + m_sz * 8; + prev_bitfield = false; } + btf_dump_printf(d, ";"); } /* pad at the end, if necessary */ - if (is_struct) { - align = packed ? 1 : btf__align_of(d->btf, id); - btf_dump_emit_bit_padding(d, off, t->size * 8, 0, align, - lvl + 1); - } + if (is_struct) + btf_dump_emit_bit_padding(d, off, t->size * 8, align, false, lvl + 1); if (vlen) btf_dump_printf(d, "\n"); diff --git a/tools/testing/selftests/bpf/progs/btf_dump_test_case_bitfields.c b/tools/testing/selftests/bpf/progs/btf_dump_test_case_bitfields.c index e5560a656030..e01690618e1e 100644 --- a/tools/testing/selftests/bpf/progs/btf_dump_test_case_bitfields.c +++ b/tools/testing/selftests/bpf/progs/btf_dump_test_case_bitfields.c @@ -53,7 +53,7 @@ struct bitfields_only_mixed_types { */ /* ------ END-EXPECTED-OUTPUT ------ */ struct bitfield_mixed_with_others { - long: 4; /* char is enough as a backing field */ + char: 4; /* char is enough as a backing field */ int a: 4; /* 8-bit implicit padding */ short b; /* combined with previous bitfield */ diff --git a/tools/testing/selftests/bpf/progs/btf_dump_test_case_padding.c b/tools/testing/selftests/bpf/progs/btf_dump_test_case_padding.c index 7cb522d22a66..6f963d34c45b 100644 --- a/tools/testing/selftests/bpf/progs/btf_dump_test_case_padding.c +++ b/tools/testing/selftests/bpf/progs/btf_dump_test_case_padding.c @@ -19,7 +19,7 @@ struct padded_implicitly { /* *struct padded_explicitly { * int a; - * int: 32; + * long: 0; * int b; *}; * @@ -28,41 +28,28 @@ struct padded_implicitly { struct padded_explicitly { int a; - int: 1; /* algo will explicitly pad with full 32 bits here */ + int: 1; /* algo will emit aligning `long: 0;` here */ int b; }; /* ----- START-EXPECTED-OUTPUT ----- */ -/* - *struct padded_a_lot { - * int a; - * long: 32; - * long: 64; - * long: 64; - * int b; - *}; - * - */ -/* ------ END-EXPECTED-OUTPUT ------ */ - struct padded_a_lot { int a; - /* 32 bit of implicit padding here, which algo will make explicit */ long: 64; long: 64; int b; }; +/* ------ END-EXPECTED-OUTPUT ------ */ + /* ----- START-EXPECTED-OUTPUT ----- */ /* *struct padded_cache_line { * int a; - * long: 32; * long: 64; * long: 64; * long: 64; * int b; - * long: 32; * long: 64; * long: 64; * long: 64; @@ -85,7 +72,7 @@ struct padded_cache_line { *struct zone { * int a; * short b; - * short: 16; + * long: 0; * struct zone_padding __pad__; *}; * @@ -108,6 +95,39 @@ struct padding_wo_named_members { long: 64; }; +struct padding_weird_1 { + int a; + long: 64; + short: 16; + short b; +}; + +/* ------ END-EXPECTED-OUTPUT ------ */ + +/* ----- START-EXPECTED-OUTPUT ----- */ +/* + *struct padding_weird_2 { + * long: 56; + * char a; + * long: 56; + * char b; + * char: 8; + *}; + * + */ +/* ------ END-EXPECTED-OUTPUT ------ */ +struct padding_weird_2 { + int: 32; /* these paddings will be collapsed into `long: 56;` */ + short: 16; + char: 8; + char a; + int: 32; /* these paddings will be collapsed into `long: 56;` */ + short: 16; + char: 8; + char b; + char: 8; +}; + /* ------ END-EXPECTED-OUTPUT ------ */ int f(struct { @@ -117,6 +137,8 @@ int f(struct { struct padded_cache_line _4; struct zone _5; struct padding_wo_named_members _6; + struct padding_weird_1 _7; + struct padding_weird_2 _8; } *_) { return 0; From 6c8afd54f8e99508148099e17cfec21b9fbadb6a Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 12 Dec 2022 13:15:05 -0800 Subject: [PATCH 172/179] selftests/bpf: Add few corner cases to test padding handling of btf_dump [ Upstream commit b148c8b9b926e257a59c8eb2cd6fa3adfd443254 ] Add few hand-crafted cases and few randomized cases found using script from [0] that tests btf_dump's padding logic. [0] https://lore.kernel.org/bpf/85f83c333f5355c8ac026f835b18d15060725fcb.camel@ericsson.com/ Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20221212211505.558851-7-andrii@kernel.org Stable-dep-of: 4fb877aaa179 ("libbpf: Fix btf_dump's packed struct determination") Signed-off-by: Sasha Levin --- .../bpf/progs/btf_dump_test_case_packing.c | 61 +++++++++- .../bpf/progs/btf_dump_test_case_padding.c | 104 ++++++++++++++++++ 2 files changed, 164 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/progs/btf_dump_test_case_packing.c b/tools/testing/selftests/bpf/progs/btf_dump_test_case_packing.c index e304b6204bd9..5c6c62f7ed32 100644 --- a/tools/testing/selftests/bpf/progs/btf_dump_test_case_packing.c +++ b/tools/testing/selftests/bpf/progs/btf_dump_test_case_packing.c @@ -58,7 +58,64 @@ union jump_code_union { } __attribute__((packed)); }; -/*------ END-EXPECTED-OUTPUT ------ */ +/* ----- START-EXPECTED-OUTPUT ----- */ +/* + *struct nested_packed_but_aligned_struct { + * int x1; + * int x2; + *}; + * + *struct outer_implicitly_packed_struct { + * char y1; + * struct nested_packed_but_aligned_struct y2; + *} __attribute__((packed)); + * + */ +/* ------ END-EXPECTED-OUTPUT ------ */ + +struct nested_packed_but_aligned_struct { + int x1; + int x2; +} __attribute__((packed)); + +struct outer_implicitly_packed_struct { + char y1; + struct nested_packed_but_aligned_struct y2; +}; +/* ----- START-EXPECTED-OUTPUT ----- */ +/* + *struct usb_ss_ep_comp_descriptor { + * char: 8; + * char bDescriptorType; + * char bMaxBurst; + * short wBytesPerInterval; + *}; + * + *struct usb_host_endpoint { + * long: 64; + * char: 8; + * struct usb_ss_ep_comp_descriptor ss_ep_comp; + * long: 0; + *} __attribute__((packed)); + * + */ +/* ------ END-EXPECTED-OUTPUT ------ */ + +struct usb_ss_ep_comp_descriptor { + char: 8; + char bDescriptorType; + char bMaxBurst; + int: 0; + short wBytesPerInterval; +} __attribute__((packed)); + +struct usb_host_endpoint { + long: 64; + char: 8; + struct usb_ss_ep_comp_descriptor ss_ep_comp; + long: 0; +}; + int f(struct { struct packed_trailing_space _1; @@ -69,6 +126,8 @@ int f(struct { union union_is_never_packed _6; union union_does_not_need_packing _7; union jump_code_union _8; + struct outer_implicitly_packed_struct _9; + struct usb_host_endpoint _10; } *_) { return 0; diff --git a/tools/testing/selftests/bpf/progs/btf_dump_test_case_padding.c b/tools/testing/selftests/bpf/progs/btf_dump_test_case_padding.c index 6f963d34c45b..79276fbe454a 100644 --- a/tools/testing/selftests/bpf/progs/btf_dump_test_case_padding.c +++ b/tools/testing/selftests/bpf/progs/btf_dump_test_case_padding.c @@ -128,6 +128,98 @@ struct padding_weird_2 { char: 8; }; +/* ----- START-EXPECTED-OUTPUT ----- */ +struct exact_1byte { + char x; +}; + +struct padded_1byte { + char: 8; +}; + +struct exact_2bytes { + short x; +}; + +struct padded_2bytes { + short: 16; +}; + +struct exact_4bytes { + int x; +}; + +struct padded_4bytes { + int: 32; +}; + +struct exact_8bytes { + long x; +}; + +struct padded_8bytes { + long: 64; +}; + +struct ff_periodic_effect { + int: 32; + short magnitude; + long: 0; + short phase; + long: 0; + int: 32; + int custom_len; + short *custom_data; +}; + +struct ib_wc { + long: 64; + long: 64; + int: 32; + int byte_len; + void *qp; + union {} ex; + long: 64; + int slid; + int wc_flags; + long: 64; + char smac[6]; + long: 0; + char network_hdr_type; +}; + +struct acpi_object_method { + long: 64; + char: 8; + char type; + short reference_count; + char flags; + short: 0; + char: 8; + char sync_level; + long: 64; + void *node; + void *aml_start; + union {} dispatch; + long: 64; + int aml_length; +}; + +struct nested_unpacked { + int x; +}; + +struct nested_packed { + struct nested_unpacked a; + char c; +} __attribute__((packed)); + +struct outer_mixed_but_unpacked { + struct nested_packed b1; + short a1; + struct nested_packed b2; +}; + /* ------ END-EXPECTED-OUTPUT ------ */ int f(struct { @@ -139,6 +231,18 @@ int f(struct { struct padding_wo_named_members _6; struct padding_weird_1 _7; struct padding_weird_2 _8; + struct exact_1byte _100; + struct padded_1byte _101; + struct exact_2bytes _102; + struct padded_2bytes _103; + struct exact_4bytes _104; + struct padded_4bytes _105; + struct exact_8bytes _106; + struct padded_8bytes _107; + struct ff_periodic_effect _200; + struct ib_wc _201; + struct acpi_object_method _202; + struct outer_mixed_but_unpacked _203; } *_) { return 0; From e5c5cb47a9ebbcfda9ba10df94c5241072ab773b Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 15 Dec 2022 10:36:05 -0800 Subject: [PATCH 173/179] libbpf: Fix btf_dump's packed struct determination [ Upstream commit 4fb877aaa179dcdb1676d55216482febaada457e ] Fix bug in btf_dump's logic of determining if a given struct type is packed or not. The notion of "natural alignment" is not needed and is even harmful in this case, so drop it altogether. The biggest difference in btf_is_struct_packed() compared to its original implementation is that we don't really use btf__align_of() to determine overall alignment of a struct type (because it could be 1 for both packed and non-packed struct, depending on specifci field definitions), and just use field's actual alignment to calculate whether any field is requiring packing or struct's size overall necessitates packing. Add two simple test cases that demonstrate the difference this change would make. Fixes: ea2ce1ba99aa ("libbpf: Fix BTF-to-C converter's padding logic") Reported-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20221215183605.4149488-1-andrii@kernel.org Signed-off-by: Sasha Levin --- tools/lib/bpf/btf_dump.c | 33 ++++--------------- .../bpf/progs/btf_dump_test_case_packing.c | 19 +++++++++++ 2 files changed, 25 insertions(+), 27 deletions(-) diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c index 09c78a797518..56102711f395 100644 --- a/tools/lib/bpf/btf_dump.c +++ b/tools/lib/bpf/btf_dump.c @@ -829,47 +829,26 @@ static void btf_dump_emit_type(struct btf_dump *d, __u32 id, __u32 cont_id) } } -static int btf_natural_align_of(const struct btf *btf, __u32 id) -{ - const struct btf_type *t = btf__type_by_id(btf, id); - int i, align, vlen; - const struct btf_member *m; - - if (!btf_is_composite(t)) - return btf__align_of(btf, id); - - align = 1; - m = btf_members(t); - vlen = btf_vlen(t); - for (i = 0; i < vlen; i++, m++) { - align = max(align, btf__align_of(btf, m->type)); - } - - return align; -} - static bool btf_is_struct_packed(const struct btf *btf, __u32 id, const struct btf_type *t) { const struct btf_member *m; - int align, i, bit_sz; + int max_align = 1, align, i, bit_sz; __u16 vlen; - align = btf_natural_align_of(btf, id); - /* size of a non-packed struct has to be a multiple of its alignment */ - if (align && (t->size % align) != 0) - return true; - m = btf_members(t); vlen = btf_vlen(t); /* all non-bitfield fields have to be naturally aligned */ for (i = 0; i < vlen; i++, m++) { - align = btf_natural_align_of(btf, m->type); + align = btf__align_of(btf, m->type); bit_sz = btf_member_bitfield_size(t, i); if (align && bit_sz == 0 && m->offset % (8 * align) != 0) return true; + max_align = max(align, max_align); } - + /* size of a non-packed struct has to be a multiple of its alignment */ + if (t->size % max_align != 0) + return true; /* * if original struct was marked as packed, but its layout is * naturally aligned, we'll detect that it's not packed diff --git a/tools/testing/selftests/bpf/progs/btf_dump_test_case_packing.c b/tools/testing/selftests/bpf/progs/btf_dump_test_case_packing.c index 5c6c62f7ed32..7998f27df7dd 100644 --- a/tools/testing/selftests/bpf/progs/btf_dump_test_case_packing.c +++ b/tools/testing/selftests/bpf/progs/btf_dump_test_case_packing.c @@ -116,6 +116,23 @@ struct usb_host_endpoint { long: 0; }; +/* ----- START-EXPECTED-OUTPUT ----- */ +struct nested_packed_struct { + int a; + char b; +} __attribute__((packed)); + +struct outer_nonpacked_struct { + short a; + struct nested_packed_struct b; +}; + +struct outer_packed_struct { + short a; + struct nested_packed_struct b; +} __attribute__((packed)); + +/* ------ END-EXPECTED-OUTPUT ------ */ int f(struct { struct packed_trailing_space _1; @@ -128,6 +145,8 @@ int f(struct { union jump_code_union _8; struct outer_implicitly_packed_struct _9; struct usb_host_endpoint _10; + struct outer_nonpacked_struct _11; + struct outer_packed_struct _12; } *_) { return 0; From fbfe493874e98970071b15c6753116fba054487f Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Wed, 8 Mar 2023 16:42:43 +0100 Subject: [PATCH 174/179] usb: ucsi: Fix ucsi->connector race commit 0482c34ec6f8557e06cd0f8e2d0e20e8ede6a22c upstream. ucsi_init() which runs from a workqueue sets ucsi->connector and on an error will clear it again. ucsi->connector gets dereferenced by ucsi_resume(), this checks for ucsi->connector being NULL in case ucsi_init() has not finished yet; or in case ucsi_init() has failed. ucsi_init() setting ucsi->connector and then clearing it again on an error creates a race where the check in ucsi_resume() may pass, only to have ucsi->connector free-ed underneath it when ucsi_init() hits an error. Fix this race by making ucsi_init() store the connector array in a local variable and only assign it to ucsi->connector on success. Fixes: bdc62f2bae8f ("usb: typec: ucsi: Simplified registration and I/O API") Cc: stable@vger.kernel.org Reviewed-by: Heikki Krogerus Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20230308154244.722337-3-hdegoede@redhat.com Signed-off-by: Joakim Tjernlund Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/ucsi/ucsi.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/drivers/usb/typec/ucsi/ucsi.c b/drivers/usb/typec/ucsi/ucsi.c index 8cbbb002fefe..086b50968983 100644 --- a/drivers/usb/typec/ucsi/ucsi.c +++ b/drivers/usb/typec/ucsi/ucsi.c @@ -1039,9 +1039,8 @@ static struct fwnode_handle *ucsi_find_fwnode(struct ucsi_connector *con) return NULL; } -static int ucsi_register_port(struct ucsi *ucsi, int index) +static int ucsi_register_port(struct ucsi *ucsi, struct ucsi_connector *con) { - struct ucsi_connector *con = &ucsi->connector[index]; struct typec_capability *cap = &con->typec_cap; enum typec_accessory *accessory = cap->accessory; enum usb_role u_role = USB_ROLE_NONE; @@ -1062,7 +1061,6 @@ static int ucsi_register_port(struct ucsi *ucsi, int index) init_completion(&con->complete); mutex_init(&con->lock); INIT_LIST_HEAD(&con->partner_tasks); - con->num = index + 1; con->ucsi = ucsi; cap->fwnode = ucsi_find_fwnode(con); @@ -1204,7 +1202,7 @@ out_unlock: */ static int ucsi_init(struct ucsi *ucsi) { - struct ucsi_connector *con; + struct ucsi_connector *con, *connector; u64 command, ntfy; int ret; int i; @@ -1235,16 +1233,16 @@ static int ucsi_init(struct ucsi *ucsi) } /* Allocate the connectors. Released in ucsi_unregister() */ - ucsi->connector = kcalloc(ucsi->cap.num_connectors + 1, - sizeof(*ucsi->connector), GFP_KERNEL); - if (!ucsi->connector) { + connector = kcalloc(ucsi->cap.num_connectors + 1, sizeof(*connector), GFP_KERNEL); + if (!connector) { ret = -ENOMEM; goto err_reset; } /* Register all connectors */ for (i = 0; i < ucsi->cap.num_connectors; i++) { - ret = ucsi_register_port(ucsi, i); + connector[i].num = i + 1; + ret = ucsi_register_port(ucsi, &connector[i]); if (ret) goto err_unregister; } @@ -1256,11 +1254,12 @@ static int ucsi_init(struct ucsi *ucsi) if (ret < 0) goto err_unregister; + ucsi->connector = connector; ucsi->ntfy = ntfy; return 0; err_unregister: - for (con = ucsi->connector; con->port; con++) { + for (con = connector; con->port; con++) { ucsi_unregister_partner(con); ucsi_unregister_altmodes(con, UCSI_RECIPIENT_CON); ucsi_unregister_port_psy(con); @@ -1269,10 +1268,7 @@ err_unregister: typec_unregister_port(con->port); con->port = NULL; } - - kfree(ucsi->connector); - ucsi->connector = NULL; - + kfree(connector); err_reset: memset(&ucsi->cap, 0, sizeof(ucsi->cap)); ucsi_reset_ppm(ucsi); From d0386bd84e6d81990ef22e10657d5a5d9c209763 Mon Sep 17 00:00:00 2001 From: Xiaogang Chen Date: Thu, 9 Mar 2023 17:44:55 -0600 Subject: [PATCH 175/179] drm/amdkfd: Get prange->offset after svm_range_vram_node_new commit 8eeddc0d4200762063e1c66b9cc63afa7b24ebf0 upstream. During miration to vram prange->offset is valid after vram buffer is located, either use old one or allocate a new one. Move svm_range_vram_node_new before migrate for each vma to get valid prange->offset. v2: squash in warning fix Fixes: b4ee9606378b ("drm/amdkfd: Fix BO offset for multi-VMA page migration") Signed-off-by: Xiaogang Chen Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c index fad500dd224d..88bf6221d4be 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c @@ -311,12 +311,6 @@ svm_migrate_copy_to_vram(struct amdgpu_device *adev, struct svm_range *prange, src = scratch; dst = (uint64_t *)(scratch + npages); - r = svm_range_vram_node_new(adev, prange, true); - if (r) { - dev_dbg(adev->dev, "fail %d to alloc vram\n", r); - goto out; - } - amdgpu_res_first(prange->ttm_res, ttm_res_offset, npages << PAGE_SHIFT, &cursor); for (i = j = 0; i < npages; i++) { @@ -397,7 +391,7 @@ out_free_vram_pages: migrate->dst[i + 3] = 0; } #endif -out: + return r; } @@ -526,6 +520,12 @@ svm_migrate_ram_to_vram(struct svm_range *prange, uint32_t best_loc, start = prange->start << PAGE_SHIFT; end = (prange->last + 1) << PAGE_SHIFT; + + r = svm_range_vram_node_new(adev, prange, true); + if (r) { + dev_dbg(adev->dev, "fail %ld to alloc vram\n", r); + return r; + } ttm_res_offset = prange->offset << PAGE_SHIFT; for (addr = start; addr < end;) { @@ -549,6 +549,8 @@ svm_migrate_ram_to_vram(struct svm_range *prange, uint32_t best_loc, if (cpages) prange->actual_loc = best_loc; + else + svm_range_vram_node_free(prange); return r < 0 ? r : 0; } From 9f291f2348ed65eef879da461acc732e505b389f Mon Sep 17 00:00:00 2001 From: Matthieu Baerts Date: Wed, 15 Mar 2023 21:25:17 +0100 Subject: [PATCH 176/179] hsr: ratelimit only when errors are printed commit 1b0120e4db0bf2838d1ce741195ce4b7cc100b91 upstream. Recently, when automatically merging -net and net-next in MPTCP devel tree, our CI reported [1] a conflict in hsr, the same as the one reported by Stephen in netdev [2]. When looking at the conflict, I noticed it is in fact the v1 [3] that has been applied in -net and the v2 [4] in net-next. Maybe the v1 was applied by accident. As mentioned by Jakub Kicinski [5], the new condition makes more sense before the net_ratelimit(), not to update net_ratelimit's state which is unnecessary if we're not going to print either way. Here, this modification applies the v2 but in -net. Link: https://github.com/multipath-tcp/mptcp_net-next/actions/runs/4423171069 [1] Link: https://lore.kernel.org/netdev/20230315100914.53fc1760@canb.auug.org.au/ [2] Link: https://lore.kernel.org/netdev/20230307133229.127442-1-koverskeid@gmail.com/ [3] Link: https://lore.kernel.org/netdev/20230309092302.179586-1-koverskeid@gmail.com/ [4] Link: https://lore.kernel.org/netdev/20230308232001.2fb62013@kernel.org/ [5] Fixes: 28e8cabe80f3 ("net: hsr: Don't log netdev_err message on unknown prp dst node") Signed-off-by: Matthieu Baerts Reviewed-by: Steen Hegelund Link: https://lore.kernel.org/r/20230315-net-20230315-hsr_framereg-ratelimit-v1-1-61d2ef176d11@tessares.net Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman --- net/hsr/hsr_framereg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c index bd0afb899117..a16f0445023a 100644 --- a/net/hsr/hsr_framereg.c +++ b/net/hsr/hsr_framereg.c @@ -422,7 +422,7 @@ void hsr_addr_subst_dest(struct hsr_node *node_src, struct sk_buff *skb, node_dst = find_node_by_addr_A(&port->hsr->node_db, eth_hdr(skb)->h_dest); if (!node_dst) { - if (net_ratelimit() && port->hsr->prot_version != PRP_V1) + if (port->hsr->prot_version != PRP_V1 && net_ratelimit()) netdev_err(skb->dev, "%s: Unknown node\n", __func__); return; } From cafb47f5f59bf372f93ebfaab129a0e712137345 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 21 Mar 2023 09:03:26 +0100 Subject: [PATCH 177/179] x86/PVH: avoid 32-bit build warning when obtaining VGA console info commit aadbd07ff8a75ed342388846da78dfaddb8b106a upstream. In the commit referenced below I failed to pay attention to this code also being buildable as 32-bit. Adjust the type of "ret" - there's no real need for it to be wider than 32 bits. Fixes: 934ef33ee75c ("x86/PVH: obtain VGA console info in Dom0") Reported-by: kernel test robot Signed-off-by: Jan Beulich Reviewed-by: Juergen Gross Signed-off-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/2d2193ff-670b-0a27-e12d-2c5c4c121c79@suse.com Signed-off-by: Juergen Gross --- arch/x86/xen/enlighten_pvh.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/xen/enlighten_pvh.c b/arch/x86/xen/enlighten_pvh.c index 1da44aca896c..ada3868c02c2 100644 --- a/arch/x86/xen/enlighten_pvh.c +++ b/arch/x86/xen/enlighten_pvh.c @@ -48,7 +48,7 @@ void __init xen_pvh_init(struct boot_params *boot_params) struct xen_platform_op op = { .cmd = XENPF_get_dom0_console, }; - long ret = HYPERVISOR_platform_op(&op); + int ret = HYPERVISOR_platform_op(&op); if (ret > 0) xen_init_vga(&op.u.dom0_console, From 9c5aa3c8619fe9c1668bcb8eccc63ff9302666e4 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Wed, 5 Apr 2023 07:31:15 -0400 Subject: [PATCH 178/179] Revert "cpuidle, intel_idle: Fix CPUIDLE_FLAG_IRQ_ENABLE *again*" This reverts commit 07fc78d8f0c960f7ca241de98bc8c6bfe7d200f3 which was upstream commit 6d9c7f51b1d9179bf7c3542267c656a934e8af23. Lockdep warnings on boot that are not seen with Linus's tree. Signed-off-by: Sasha Levin --- drivers/idle/intel_idle.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index f060ac7376e6..cfeb24d40d37 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -168,7 +168,13 @@ static __cpuidle int intel_idle_irq(struct cpuidle_device *dev, raw_local_irq_enable(); ret = __intel_idle(dev, drv, index); - raw_local_irq_disable(); + + /* + * The lockdep hardirqs state may be changed to 'on' with timer + * tick interrupt followed by __do_softirq(). Use local_irq_disable() + * to keep the hardirqs state correct. + */ + local_irq_disable(); return ret; } From 543aff194ab6286af7791c5a138978ee7da4c93f Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 6 Apr 2023 12:10:58 +0200 Subject: [PATCH 179/179] Linux 6.1.23 Link: https://lore.kernel.org/r/20230403140415.090615502@linuxfoundation.org Tested-by: Markus Reichelt Tested-by: Shuah Khan Tested-by: Takeshi Ogasawara Tested-by: Florian Fainelli Tested-by: Bagas Sanjaya Tested-by: Jon Hunter Tested-by: Chris Paterson (CIP) Tested-by: Conor Dooley Tested-by: Linux Kernel Functional Testing Link: https://lore.kernel.org/r/20230404183150.381314754@linuxfoundation.org Tested-by: Florian Fainelli Tested-by: Ron Economos Tested-by: Bagas Sanjaya Tested-by: Linux Kernel Functional Testing Tested-by: Salvatore Bonaccorso Link: https://lore.kernel.org/r/20230405100302.540890806@linuxfoundation.org Tested-by: Takeshi Ogasawara Tested-by: Florian Fainelli Tested-by: Rudi Heitbaum Tested-by: Jon Hunter Tested-by: Markus Reichelt Tested-by: Linux Kernel Functional Testing Tested-by: Guenter Roeck Tested-by: Kelsey Steele Tested-by: Bagas Sanjaya Tested-by: Allen Pais Signed-off-by: Greg Kroah-Hartman --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index c3d44de8850c..a162f6cdf77c 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 VERSION = 6 PATCHLEVEL = 1 -SUBLEVEL = 22 +SUBLEVEL = 23 EXTRAVERSION = NAME = Hurr durr I'ma ninja sloth