diff --git a/.mailmap b/.mailmap index 6d484937f901..9ba38a82aba4 100644 --- a/.mailmap +++ b/.mailmap @@ -236,6 +236,7 @@ Linus Lüssing Li Yang Li Yang +Lorenzo Pieralisi Lukasz Luba Maciej W. Rozycki Maciej W. Rozycki diff --git a/Documentation/admin-guide/nfs/nfs-client.rst b/Documentation/admin-guide/nfs/nfs-client.rst index 6adb6457bc69..36760685dd34 100644 --- a/Documentation/admin-guide/nfs/nfs-client.rst +++ b/Documentation/admin-guide/nfs/nfs-client.rst @@ -36,10 +36,9 @@ administrative requirements that require particular behavior that does not work well as part of an nfs_client_id4 string. The nfs.nfs4_unique_id boot parameter specifies a unique string that can be -used instead of a system's node name when an NFS client identifies itself to -a server. Thus, if the system's node name is not unique, or it changes, its -nfs.nfs4_unique_id stays the same, preventing collision with other clients -or loss of state during NFS reboot recovery or transparent state migration. +used together with a system's node name when an NFS client identifies itself to +a server. Thus, if the system's node name is not unique, its +nfs.nfs4_unique_id can help prevent collisions with other clients. The nfs.nfs4_unique_id string is typically a UUID, though it can contain anything that is believed to be unique across all NFS clients. An @@ -53,8 +52,12 @@ outstanding NFSv4 state has expired, to prevent loss of NFSv4 state. This string can be stored in an NFS client's grub.conf, or it can be provided via a net boot facility such as PXE. It may also be specified as an nfs.ko -module parameter. Specifying a uniquifier string is not support for NFS -clients running in containers. +module parameter. + +This uniquifier string will be the same for all NFS clients running in +containers unless it is overridden by a value written to +/sys/fs/nfs/net/nfs_client/identifier, which will be local to the network +namespace of the process that writes it. The DNS resolver diff --git a/Documentation/devicetree/bindings/i3c/cdns,i3c-master.txt b/Documentation/devicetree/bindings/i3c/cdns,i3c-master.txt deleted file mode 100644 index 3716589d6999..000000000000 --- a/Documentation/devicetree/bindings/i3c/cdns,i3c-master.txt +++ /dev/null @@ -1,43 +0,0 @@ -Bindings for cadence I3C master block -===================================== - -Required properties: --------------------- -- compatible: shall be "cdns,i3c-master" -- clocks: shall reference the pclk and sysclk -- clock-names: shall contain "pclk" and "sysclk" -- interrupts: the interrupt line connected to this I3C master -- reg: I3C master registers - -Mandatory properties defined by the generic binding (see -Documentation/devicetree/bindings/i3c/i3c.yaml for more details): - -- #address-cells: shall be set to 1 -- #size-cells: shall be set to 0 - -Optional properties defined by the generic binding (see -Documentation/devicetree/bindings/i3c/i3c.yaml for more details): - -- i2c-scl-hz -- i3c-scl-hz - -I3C device connected on the bus follow the generic description (see -Documentation/devicetree/bindings/i3c/i3c.yaml for more details).
- -Example: - - i3c-master@0d040000 { - compatible = "cdns,i3c-master"; - clocks = <&coreclock>, <&i3csysclock>; - clock-names = "pclk", "sysclk"; - interrupts = <3 0>; - reg = <0x0d040000 0x1000>; - #address-cells = <1>; - #size-cells = <0>; - i2c-scl-hz = <100000>; - - nunchuk: nunchuk@52 { - compatible = "nintendo,nunchuk"; - reg = <0x52 0x0 0x10>; - }; - }; diff --git a/Documentation/devicetree/bindings/i3c/cdns,i3c-master.yaml b/Documentation/devicetree/bindings/i3c/cdns,i3c-master.yaml new file mode 100644 index 000000000000..cc40d25358ec --- /dev/null +++ b/Documentation/devicetree/bindings/i3c/cdns,i3c-master.yaml @@ -0,0 +1,60 @@ +# SPDX-License-Identifier: GPL-2.0 +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/i3c/cdns,i3c-master.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Cadence I3C master block + +maintainers: + - Boris Brezillon + +allOf: + - $ref: i3c.yaml# + +properties: + compatible: + const: cdns,i3c-master + + reg: + maxItems: 1 + + clocks: + maxItems: 2 + + clock-names: + items: + - const: pclk + - const: sysclk + + interrupts: + maxItems: 1 + +required: + - compatible + - reg + - clocks + - clock-names + - interrupts + +unevaluatedProperties: false + +examples: + - | + i3c-master@d040000 { + compatible = "cdns,i3c-master"; + clocks = <&coreclock>, <&i3csysclock>; + clock-names = "pclk", "sysclk"; + interrupts = <3 0>; + reg = <0x0d040000 0x1000>; + #address-cells = <3>; + #size-cells = <0>; + i2c-scl-hz = <100000>; + + eeprom@57{ + compatible = "atmel,24c01"; + reg = <0x57 0x0 0x10>; + pagesize = <0x8>; + }; + }; +... diff --git a/Documentation/devicetree/bindings/i3c/snps,dw-i3c-master.txt b/Documentation/devicetree/bindings/i3c/snps,dw-i3c-master.txt deleted file mode 100644 index 07f35f36085d..000000000000 --- a/Documentation/devicetree/bindings/i3c/snps,dw-i3c-master.txt +++ /dev/null @@ -1,41 +0,0 @@ -Bindings for Synopsys DesignWare I3C master block -================================================= - -Required properties: --------------------- -- compatible: shall be "snps,dw-i3c-master-1.00a" -- clocks: shall reference the core_clk -- interrupts: the interrupt line connected to this I3C master -- reg: Offset and length of I3C master registers - -Mandatory properties defined by the generic binding (see -Documentation/devicetree/bindings/i3c/i3c.yaml for more details): - -- #address-cells: shall be set to 3 -- #size-cells: shall be set to 0 - -Optional properties defined by the generic binding (see -Documentation/devicetree/bindings/i3c/i3c.yaml for more details): - -- i2c-scl-hz -- i3c-scl-hz - -I3C device connected on the bus follow the generic description (see -Documentation/devicetree/bindings/i3c/i3c.yaml for more details). 
- -Example: - - i3c-master@2000 { - compatible = "snps,dw-i3c-master-1.00a"; - #address-cells = <3>; - #size-cells = <0>; - reg = <0x02000 0x1000>; - interrupts = <0>; - clocks = <&i3cclk>; - - eeprom@57{ - compatible = "atmel,24c01"; - reg = <0x57 0x0 0x10>; - pagesize = <0x8>; - }; - }; diff --git a/Documentation/devicetree/bindings/i3c/snps,dw-i3c-master.yaml b/Documentation/devicetree/bindings/i3c/snps,dw-i3c-master.yaml new file mode 100644 index 000000000000..7a76fd32962a --- /dev/null +++ b/Documentation/devicetree/bindings/i3c/snps,dw-i3c-master.yaml @@ -0,0 +1,52 @@ +# SPDX-License-Identifier: GPL-2.0 +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/i3c/snps,dw-i3c-master.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Synopsys DesignWare I3C master block + +maintainers: + - Alexandre Belloni + +allOf: + - $ref: i3c.yaml# + +properties: + compatible: + const: snps,dw-i3c-master-1.00a + + reg: + maxItems: 1 + + clocks: + maxItems: 1 + + interrupts: + maxItems: 1 + +required: + - compatible + - reg + - clocks + - interrupts + +unevaluatedProperties: false + +examples: + - | + i3c-master@2000 { + compatible = "snps,dw-i3c-master-1.00a"; + #address-cells = <3>; + #size-cells = <0>; + reg = <0x02000 0x1000>; + interrupts = <0>; + clocks = <&i3cclk>; + + eeprom@57{ + compatible = "atmel,24c01"; + reg = <0x57 0x0 0x10>; + pagesize = <0x8>; + }; + }; +... diff --git a/Documentation/devicetree/bindings/mfd/da9063.txt b/Documentation/devicetree/bindings/mfd/da9063.txt index 91b79a21d403..aa8b800cc4ad 100644 --- a/Documentation/devicetree/bindings/mfd/da9063.txt +++ b/Documentation/devicetree/bindings/mfd/da9063.txt @@ -64,10 +64,13 @@ Sub-nodes: and KEY_SLEEP. - watchdog : This node defines settings for the Watchdog timer associated - with the DA9063 and DA9063L. There are currently no entries in this - binding, however compatible = "dlg,da9063-watchdog" should be added - if a node is created. + with the DA9063 and DA9063L. The node should contain the compatible property + with the value "dlg,da9063-watchdog". + Optional watchdog properties: + - dlg,use-sw-pm: Add this property to disable the watchdog during suspend. + Only use this option if you can't use the watchdog automatic suspend + function during a suspend (see register CONTROL_B). Example: diff --git a/Documentation/devicetree/bindings/pwm/atmel,at91sam-pwm.yaml b/Documentation/devicetree/bindings/pwm/atmel,at91sam-pwm.yaml new file mode 100644 index 000000000000..ab45df80345d --- /dev/null +++ b/Documentation/devicetree/bindings/pwm/atmel,at91sam-pwm.yaml @@ -0,0 +1,47 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +# Copyright (C) 2022 Microchip Technology, Inc. 
and its subsidiaries +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/pwm/atmel,at91sam-pwm.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Atmel/Microchip PWM controller + +maintainers: + - Claudiu Beznea + +allOf: + - $ref: "pwm.yaml#" + +properties: + compatible: + oneOf: + - items: + - enum: + - atmel,at91sam9rl-pwm + - atmel,sama5d3-pwm + - atmel,sama5d2-pwm + - microchip,sam9x60-pwm + - items: + - const: microchip,sama7g5-pwm + - const: atmel,sama5d2-pwm + + reg: + maxItems: 1 + + "#pwm-cells": + const: 3 + +required: + - compatible + - reg + +unevaluatedProperties: false + +examples: + - | + pwm0: pwm@f8034000 { + compatible = "atmel,at91sam9rl-pwm"; + reg = <0xf8034000 0x400>; + #pwm-cells = <3>; + }; diff --git a/Documentation/devicetree/bindings/pwm/atmel-pwm.txt b/Documentation/devicetree/bindings/pwm/atmel-pwm.txt deleted file mode 100644 index fbb5325be1f0..000000000000 --- a/Documentation/devicetree/bindings/pwm/atmel-pwm.txt +++ /dev/null @@ -1,35 +0,0 @@ -Atmel PWM controller - -Required properties: - - compatible: should be one of: - - "atmel,at91sam9rl-pwm" - - "atmel,sama5d3-pwm" - - "atmel,sama5d2-pwm" - - "microchip,sam9x60-pwm" - - reg: physical base address and length of the controller's registers - - #pwm-cells: Should be 3. See pwm.yaml in this directory for a - description of the cells format. - -Example: - - pwm0: pwm@f8034000 { - compatible = "atmel,at91sam9rl-pwm"; - reg = <0xf8034000 0x400>; - #pwm-cells = <3>; - }; - - pwmleds { - compatible = "pwm-leds"; - - d1 { - label = "d1"; - pwms = <&pwm0 3 5000 0> - max-brightness = <255>; - }; - - d2 { - label = "d2"; - pwms = <&pwm0 1 5000 1> - max-brightness = <255>; - }; - }; diff --git a/Documentation/devicetree/bindings/pwm/google,cros-ec-pwm.yaml b/Documentation/devicetree/bindings/pwm/google,cros-ec-pwm.yaml index 7ab6912a845f..c8577bdf6c94 100644 --- a/Documentation/devicetree/bindings/pwm/google,cros-ec-pwm.yaml +++ b/Documentation/devicetree/bindings/pwm/google,cros-ec-pwm.yaml @@ -21,7 +21,14 @@ allOf: properties: compatible: - const: google,cros-ec-pwm + oneOf: + - description: PWM controlled using EC_PWM_TYPE_GENERIC channels. + items: + - const: google,cros-ec-pwm + - description: PWM controlled using CROS_EC_PWM_DT_<...> types. + items: + - const: google,cros-ec-pwm-type + "#pwm-cells": description: The cell specifies the PWM index. 
const: 1 diff --git a/Documentation/devicetree/bindings/pwm/mediatek,pwm-disp.yaml b/Documentation/devicetree/bindings/pwm/mediatek,pwm-disp.yaml new file mode 100644 index 000000000000..e4fe2d1bfef5 --- /dev/null +++ b/Documentation/devicetree/bindings/pwm/mediatek,pwm-disp.yaml @@ -0,0 +1,75 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/pwm/mediatek,pwm-disp.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: MediaTek DISP_PWM Controller Device Tree Bindings + +maintainers: + - Jitao Shi + - Xinlei Lee + +allOf: + - $ref: pwm.yaml# + +properties: + compatible: + oneOf: + - enum: + - mediatek,mt2701-disp-pwm + - mediatek,mt6595-disp-pwm + - mediatek,mt8173-disp-pwm + - mediatek,mt8183-disp-pwm + - items: + - const: mediatek,mt8167-disp-pwm + - const: mediatek,mt8173-disp-pwm + - items: + - enum: + - mediatek,mt8186-disp-pwm + - mediatek,mt8192-disp-pwm + - mediatek,mt8195-disp-pwm + - const: mediatek,mt8183-disp-pwm + + reg: + maxItems: 1 + + "#pwm-cells": + const: 2 + + interrupts: + maxItems: 1 + + clocks: + items: + - description: Main Clock + - description: Mm Clock + + clock-names: + items: + - const: main + - const: mm + +required: + - compatible + - reg + - "#pwm-cells" + - clocks + - clock-names + +additionalProperties: false + +examples: + - | + #include + #include + #include + + pwm0: pwm@1401e000 { + compatible = "mediatek,mt8173-disp-pwm"; + reg = <0x1401e000 0x1000>; + #pwm-cells = <2>; + clocks = <&mmsys CLK_MM_DISP_PWM026M>, + <&mmsys CLK_MM_DISP_PWM0MM>; + clock-names = "main", "mm"; + }; diff --git a/Documentation/devicetree/bindings/pwm/pwm-mediatek.txt b/Documentation/devicetree/bindings/pwm/pwm-mediatek.txt index 25ed214473d7..033d1fc0f405 100644 --- a/Documentation/devicetree/bindings/pwm/pwm-mediatek.txt +++ b/Documentation/devicetree/bindings/pwm/pwm-mediatek.txt @@ -3,6 +3,7 @@ MediaTek PWM controller Required properties: - compatible: should be "mediatek,-pwm": - "mediatek,mt2712-pwm": found on mt2712 SoC. + - "mediatek,mt6795-pwm": found on mt6795 SoC. - "mediatek,mt7622-pwm": found on mt7622 SoC. - "mediatek,mt7623-pwm": found on mt7623 SoC. - "mediatek,mt7628-pwm": found on mt7628 SoC. diff --git a/Documentation/devicetree/bindings/pwm/pwm-mtk-disp.txt b/Documentation/devicetree/bindings/pwm/pwm-mtk-disp.txt deleted file mode 100644 index 691e58b6c223..000000000000 --- a/Documentation/devicetree/bindings/pwm/pwm-mtk-disp.txt +++ /dev/null @@ -1,45 +0,0 @@ -MediaTek display PWM controller - -Required properties: - - compatible: should be "mediatek,-disp-pwm": - - "mediatek,mt2701-disp-pwm": found on mt2701 SoC. - - "mediatek,mt6595-disp-pwm": found on mt6595 SoC. - - "mediatek,mt8167-disp-pwm", "mediatek,mt8173-disp-pwm": found on mt8167 SoC. - - "mediatek,mt8173-disp-pwm": found on mt8173 SoC. - - "mediatek,mt8183-disp-pwm": found on mt8183 SoC.$ - - reg: physical base address and length of the controller's registers. - - #pwm-cells: must be 2. See pwm.yaml in this directory for a description of - the cell format. - - clocks: phandle and clock specifier of the PWM reference clock. - - clock-names: must contain the following: - - "main": clock used to generate PWM signals. - - "mm": sync signals from the modules of mmsys. - - pinctrl-names: Must contain a "default" entry. - - pinctrl-0: One property must exist for each entry in pinctrl-names. - See pinctrl/pinctrl-bindings.txt for details of the property values. 
- -Example: - pwm0: pwm@1401e000 { - compatible = "mediatek,mt8173-disp-pwm", - "mediatek,mt6595-disp-pwm"; - reg = <0 0x1401e000 0 0x1000>; - #pwm-cells = <2>; - clocks = <&mmsys CLK_MM_DISP_PWM026M>, - <&mmsys CLK_MM_DISP_PWM0MM>; - clock-names = "main", "mm"; - pinctrl-names = "default"; - pinctrl-0 = <&disp_pwm0_pins>; - }; - - backlight_lcd: backlight_lcd { - compatible = "pwm-backlight"; - pwms = <&pwm0 0 1000000>; - brightness-levels = < - 0 16 32 48 64 80 96 112 - 128 144 160 176 192 208 224 240 - 255 - >; - default-brightness-level = <9>; - power-supply = <&mt6397_vio18_reg>; - enable-gpios = <&pio 95 GPIO_ACTIVE_HIGH>; - }; diff --git a/Documentation/devicetree/bindings/pwm/sunplus,sp7021-pwm.yaml b/Documentation/devicetree/bindings/pwm/sunplus,sp7021-pwm.yaml new file mode 100644 index 000000000000..d4fc9e8db1d1 --- /dev/null +++ b/Documentation/devicetree/bindings/pwm/sunplus,sp7021-pwm.yaml @@ -0,0 +1,43 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +# Copyright (C) Sunplus Co., Ltd. 2021 +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/pwm/sunplus,sp7021-pwm.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Sunplus SoC SP7021 PWM Controller + +maintainers: + - Hammer Hsieh + +allOf: + - $ref: pwm.yaml# + +properties: + compatible: + const: sunplus,sp7021-pwm + + reg: + maxItems: 1 + + clocks: + maxItems: 1 + + '#pwm-cells': + const: 2 + +unevaluatedProperties: false + +required: + - compatible + - reg + - clocks + +examples: + - | + pwm: pwm@9c007a00 { + compatible = "sunplus,sp7021-pwm"; + reg = <0x9c007a00 0x80>; + clocks = <&clkc 0xa2>; + #pwm-cells = <2>; + }; diff --git a/Documentation/devicetree/bindings/remoteproc/fsl,imx-rproc.yaml b/Documentation/devicetree/bindings/remoteproc/fsl,imx-rproc.yaml index fc16d903353e..3a1f59ad79e2 100644 --- a/Documentation/devicetree/bindings/remoteproc/fsl,imx-rproc.yaml +++ b/Documentation/devicetree/bindings/remoteproc/fsl,imx-rproc.yaml @@ -15,14 +15,15 @@ maintainers: properties: compatible: enum: - - fsl,imx8mq-cm4 + - fsl,imx6sx-cm4 + - fsl,imx7d-cm4 + - fsl,imx7ulp-cm4 - fsl,imx8mm-cm4 - fsl,imx8mn-cm7 - fsl,imx8mp-cm7 + - fsl,imx8mq-cm4 - fsl,imx8ulp-cm33 - - fsl,imx7d-cm4 - - fsl,imx7ulp-cm4 - - fsl,imx6sx-cm4 + - fsl,imx93-cm33 clocks: maxItems: 1 diff --git a/Documentation/devicetree/bindings/remoteproc/mtk,scp.yaml b/Documentation/devicetree/bindings/remoteproc/mtk,scp.yaml index 5b693a2d049c..eec3b9c4c713 100644 --- a/Documentation/devicetree/bindings/remoteproc/mtk,scp.yaml +++ b/Documentation/devicetree/bindings/remoteproc/mtk,scp.yaml @@ -23,11 +23,13 @@ properties: reg: description: - Should contain the address ranges for memory regions SRAM, CFG, and - L1TCM. + Should contain the address ranges for memory regions SRAM, CFG, and, + on some platforms, L1TCM. + minItems: 2 maxItems: 3 reg-names: + minItems: 2 items: - const: sram - const: cfg @@ -42,21 +44,48 @@ properties: clock-names: const: main + interrupts: + maxItems: 1 + + firmware-name: + $ref: /schemas/types.yaml#/definitions/string + description: + If present, name (or relative path) of the file within the + firmware search path containing the firmware image used when + initializing SCP.
+ + memory-region: + maxItems: 1 + required: - compatible - reg - reg-names -if: - properties: - compatible: - enum: - - mediatek,mt8183-scp - - mediatek,mt8192-scp -then: - required: - - clocks - - clock-names +allOf: + - if: + properties: + compatible: + enum: + - mediatek,mt8183-scp + - mediatek,mt8192-scp + then: + required: + - clocks + - clock-names + + - if: + properties: + compatible: + enum: + - mediatek,mt8183-scp + - mediatek,mt8186-scp + then: + properties: + reg: + maxItems: 2 + reg-names: + maxItems: 2 additionalProperties: type: object @@ -76,10 +105,10 @@ additionalProperties: examples: - | - #include + #include scp@10500000 { - compatible = "mediatek,mt8183-scp"; + compatible = "mediatek,mt8192-scp"; reg = <0x10500000 0x80000>, <0x10700000 0x8000>, <0x10720000 0xe0000>; diff --git a/Documentation/devicetree/bindings/remoteproc/qcom,adsp.yaml b/Documentation/devicetree/bindings/remoteproc/qcom,adsp.yaml index a4409c398193..947f94548d0e 100644 --- a/Documentation/devicetree/bindings/remoteproc/qcom,adsp.yaml +++ b/Documentation/devicetree/bindings/remoteproc/qcom,adsp.yaml @@ -16,6 +16,7 @@ description: properties: compatible: enum: + - qcom,msm8226-adsp-pil - qcom,msm8974-adsp-pil - qcom,msm8996-adsp-pil - qcom,msm8996-slpi-pil @@ -29,6 +30,9 @@ properties: - qcom,sc8180x-adsp-pas - qcom,sc8180x-cdsp-pas - qcom,sc8180x-mpss-pas + - qcom,sc8280xp-adsp-pas + - qcom,sc8280xp-nsp0-pas + - qcom,sc8280xp-nsp1-pas - qcom,sdm660-adsp-pas - qcom,sdm845-adsp-pas - qcom,sdm845-cdsp-pas @@ -159,6 +163,7 @@ allOf: compatible: contains: enum: + - qcom,msm8226-adsp-pil - qcom,msm8974-adsp-pil - qcom,msm8996-adsp-pil - qcom,msm8996-slpi-pil @@ -169,6 +174,9 @@ allOf: - qcom,sc8180x-adsp-pas - qcom,sc8180x-cdsp-pas - qcom,sc8180x-mpss-pas + - qcom,sc8280xp-adsp-pas + - qcom,sc8280xp-nsp0-pas + - qcom,sc8280xp-nsp1-pas - qcom,sdm845-adsp-pas - qcom,sdm845-cdsp-pas - qcom,sm6350-adsp-pas @@ -274,6 +282,7 @@ allOf: compatible: contains: enum: + - qcom,msm8226-adsp-pil - qcom,msm8974-adsp-pil - qcom,msm8996-adsp-pil - qcom,msm8996-slpi-pil @@ -284,6 +293,9 @@ allOf: - qcom,qcs404-wcss-pas - qcom,sc8180x-adsp-pas - qcom,sc8180x-cdsp-pas + - qcom,sc8280xp-adsp-pas + - qcom,sc8280xp-nsp0-pas + - qcom,sc8280xp-nsp1-pas - qcom,sdm845-adsp-pas - qcom,sdm845-cdsp-pas - qcom,sm6350-adsp-pas @@ -364,6 +376,7 @@ allOf: compatible: contains: enum: + - qcom,msm8226-adsp-pil - qcom,msm8996-adsp-pil - qcom,msm8998-adsp-pas then: @@ -471,6 +484,7 @@ allOf: enum: - qcom,sc8180x-adsp-pas - qcom,sc8180x-cdsp-pas + - qcom,sc8280xp-adsp-pas - qcom,sm6350-adsp-pas - qcom,sm8150-slpi-pas - qcom,sm8250-adsp-pas @@ -508,6 +522,22 @@ allOf: - const: cx - const: mxc + - if: + properties: + compatible: + contains: + enum: + - qcom,sc8280xp-nsp0-pas + - qcom,sc8280xp-nsp1-pas + then: + properties: + power-domains: + items: + - description: NSP power domain + power-domain-names: + items: + - const: nsp + - if: properties: compatible: @@ -546,6 +576,7 @@ allOf: compatible: contains: enum: + - qcom,msm8226-adsp-pil - qcom,msm8974-adsp-pil - qcom,msm8996-adsp-pil - qcom,msm8996-slpi-pil diff --git a/Documentation/devicetree/bindings/remoteproc/st,stm32-rproc.yaml b/Documentation/devicetree/bindings/remoteproc/st,stm32-rproc.yaml index be3d9b0e876b..da50f0e99fe2 100644 --- a/Documentation/devicetree/bindings/remoteproc/st,stm32-rproc.yaml +++ b/Documentation/devicetree/bindings/remoteproc/st,stm32-rproc.yaml @@ -43,8 +43,8 @@ properties: items: - items: - description: Phandle of syscon block - - description: FIXME - - 
description: FIXME + - description: The offset of the trust zone setting register + - description: The field mask of the trust zone state interrupts: description: Should contain the WWDG1 watchdog reset interrupt @@ -101,8 +101,8 @@ properties: items: - items: - description: Phandle of syscon block - - description: FIXME - - description: FIXME + - description: The offset of the power setting register + - description: The field mask of the PDDS selection st,syscfg-m4-state: $ref: "/schemas/types.yaml#/definitions/phandle-array" @@ -111,8 +111,8 @@ properties: items: - items: - description: Phandle of syscon block with the tamp register - - description: FIXME - - description: FIXME + - description: The offset of the tamp register + - description: The field mask of the Cortex-M4 state st,syscfg-rsc-tbl: $ref: "/schemas/types.yaml#/definitions/phandle-array" @@ -122,8 +122,8 @@ properties: items: - items: - description: Phandle of syscon block with the tamp register - - description: FIXME - - description: FIXME + - description: The offset of the tamp register + - description: The field mask of the Cortex-M4 resource table address st,auto-boot: $ref: /schemas/types.yaml#/definitions/flag diff --git a/Documentation/devicetree/bindings/rtc/nxp,pcf85063.txt b/Documentation/devicetree/bindings/rtc/nxp,pcf85063.txt index 6439682c9319..217b7cd06c11 100644 --- a/Documentation/devicetree/bindings/rtc/nxp,pcf85063.txt +++ b/Documentation/devicetree/bindings/rtc/nxp,pcf85063.txt @@ -2,6 +2,7 @@ Required properties: - compatible: Should one of contain: + "nxp,pca85073a", "nxp,pcf85063", "nxp,pcf85063a", "nxp,pcf85063tp", diff --git a/Documentation/devicetree/bindings/rtc/renesas,rzn1-rtc.yaml b/Documentation/devicetree/bindings/rtc/renesas,rzn1-rtc.yaml new file mode 100644 index 000000000000..2d4741f51663 --- /dev/null +++ b/Documentation/devicetree/bindings/rtc/renesas,rzn1-rtc.yaml @@ -0,0 +1,70 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/rtc/renesas,rzn1-rtc.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Renesas RZ/N1 SoCs Real-Time Clock DT bindings + +maintainers: + - Miquel Raynal + +allOf: + - $ref: rtc.yaml# + +properties: + compatible: + items: + - enum: + - renesas,r9a06g032-rtc + - const: renesas,rzn1-rtc + + reg: + maxItems: 1 + + interrupts: + minItems: 3 + maxItems: 3 + + interrupt-names: + items: + - const: alarm + - const: timer + - const: pps + + clocks: + maxItems: 1 + + clock-names: + const: hclk + + power-domains: + maxItems: 1 + +required: + - compatible + - reg + - interrupts + - interrupt-names + - clocks + - clock-names + - power-domains + +unevaluatedProperties: false + +examples: + - | + #include + #include + rtc@40006000 { + compatible = "renesas,r9a06g032-rtc", "renesas,rzn1-rtc"; + reg = <0x40006000 0x1000>; + interrupts = , + , + ; + interrupt-names = "alarm", "timer", "pps"; + clocks = <&sysctrl R9A06G032_HCLK_RTC>; + clock-names = "hclk"; + power-domains = <&sysctrl>; + start-year = <2000>; + }; diff --git a/Documentation/devicetree/bindings/timer/xlnx,xps-timer.yaml b/Documentation/devicetree/bindings/timer/xlnx,xps-timer.yaml new file mode 100644 index 000000000000..dd168d41d2e0 --- /dev/null +++ b/Documentation/devicetree/bindings/timer/xlnx,xps-timer.yaml @@ -0,0 +1,92 @@ +# SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/timer/xlnx,xps-timer.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Xilinx 
LogiCORE IP AXI Timer Device Tree Binding + +maintainers: + - Sean Anderson + +properties: + compatible: + contains: + const: xlnx,xps-timer-1.00.a + + clocks: + maxItems: 1 + + clock-names: + const: s_axi_aclk + + interrupts: + maxItems: 1 + + reg: + maxItems: 1 + + '#pwm-cells': true + + xlnx,count-width: + $ref: /schemas/types.yaml#/definitions/uint32 + enum: [8, 16, 32] + default: 32 + description: + The width of the counter(s), in bits. + + xlnx,one-timer-only: + $ref: /schemas/types.yaml#/definitions/uint32 + enum: [ 0, 1 ] + description: + Whether only one timer is present in this block. + +required: + - compatible + - reg + - xlnx,one-timer-only + +allOf: + - if: + required: + - '#pwm-cells' + then: + allOf: + - required: + - clocks + - properties: + xlnx,one-timer-only: + const: 0 + else: + required: + - interrupts + - if: + required: + - clocks + then: + required: + - clock-names + +additionalProperties: false + +examples: + - | + timer@800e0000 { + clock-names = "s_axi_aclk"; + clocks = <&zynqmp_clk 71>; + compatible = "xlnx,xps-timer-1.00.a"; + reg = <0x800e0000 0x10000>; + interrupts = <0 39 2>; + xlnx,count-width = <16>; + xlnx,one-timer-only = <0x0>; + }; + + timer@800f0000 { + #pwm-cells = <0>; + clock-names = "s_axi_aclk"; + clocks = <&zynqmp_clk 71>; + compatible = "xlnx,xps-timer-1.00.a"; + reg = <0x800f0000 0x10000>; + xlnx,count-width = <32>; + xlnx,one-timer-only = <0x0>; + }; diff --git a/Documentation/devicetree/bindings/watchdog/da9062-wdt.txt b/Documentation/devicetree/bindings/watchdog/da9062-wdt.txt index 950e4fba8dbc..354314d854ef 100644 --- a/Documentation/devicetree/bindings/watchdog/da9062-wdt.txt +++ b/Documentation/devicetree/bindings/watchdog/da9062-wdt.txt @@ -10,6 +10,12 @@ Optional properties: - dlg,use-sw-pm: Add this property to disable the watchdog during suspend. Only use this option if you can't use the watchdog automatic suspend function during a suspend (see register CONTROL_B). +- dlg,wdt-sd: Set what happens on watchdog timeout. If this bit is set, a + watchdog timeout triggers SHUTDOWN; if cleared, it triggers + POWERDOWN. Can be 0 or 1. Only use this option if you want to change the + chip's default OTP setting for the WATCHDOG_SD bit. If this property is + NOT set, the WATCHDOG_SD bit is left at its OTP default and the watchdog + behavior on timeout will match the chip's OTP settings. Example: DA9062 diff --git a/Documentation/devicetree/bindings/watchdog/faraday,ftwdt010.txt b/Documentation/devicetree/bindings/watchdog/faraday,ftwdt010.txt deleted file mode 100644 index 9ecdb502e605..000000000000 --- a/Documentation/devicetree/bindings/watchdog/faraday,ftwdt010.txt +++ /dev/null @@ -1,22 +0,0 @@ -Faraday Technology FTWDT010 watchdog - -This is an IP part from Faraday Technology found in the Gemini -SoCs and others. - -Required properties: -- compatible : must be one of - "faraday,ftwdt010" - "cortina,gemini-watchdog", "faraday,ftwdt010" -- reg : shall contain base register location and length -- interrupts : shall contain the interrupt for the watchdog - -Optional properties: -- timeout-sec : the default watchdog timeout in seconds.
- -Example: - -watchdog@41000000 { - compatible = "faraday,ftwdt010"; - reg = <0x41000000 0x1000>; - interrupts = <3 IRQ_TYPE_LEVEL_HIGH>; -}; diff --git a/Documentation/devicetree/bindings/watchdog/faraday,ftwdt010.yaml b/Documentation/devicetree/bindings/watchdog/faraday,ftwdt010.yaml new file mode 100644 index 000000000000..ca9e1beff76b --- /dev/null +++ b/Documentation/devicetree/bindings/watchdog/faraday,ftwdt010.yaml @@ -0,0 +1,67 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/watchdog/faraday,ftwdt010.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Faraday Technology FTWDT010 watchdog + +maintainers: + - Linus Walleij + - Corentin Labbe + +description: | + This is an IP part from Faraday Technology found in the Gemini + SoCs and others. + +allOf: + - $ref: "watchdog.yaml#" + +properties: + compatible: + oneOf: + - const: faraday,ftwdt010 + - items: + - enum: + - cortina,gemini-watchdog + - moxa,moxart-watchdog + - const: faraday,ftwdt010 + + reg: + maxItems: 1 + + resets: + maxItems: 1 + + clocks: + maxItems: 1 + + clock-names: + const: PCLK + + interrupts: + maxItems: 1 + +required: + - compatible + - reg + +unevaluatedProperties: false + +examples: + - | + #include <dt-bindings/interrupt-controller/irq.h> + watchdog@41000000 { + compatible = "faraday,ftwdt010"; + reg = <0x41000000 0x1000>; + interrupts = <3 IRQ_TYPE_LEVEL_HIGH>; + timeout-sec = <5>; + }; + - | + watchdog: watchdog@98500000 { + compatible = "moxa,moxart-watchdog", "faraday,ftwdt010"; + reg = <0x98500000 0x10>; + clocks = <&clk_apb>; + clock-names = "PCLK"; + }; +... diff --git a/Documentation/devicetree/bindings/watchdog/fsl-imx7ulp-wdt.yaml b/Documentation/devicetree/bindings/watchdog/fsl-imx7ulp-wdt.yaml index 4ca8a31359a5..8562978aa0c8 100644 --- a/Documentation/devicetree/bindings/watchdog/fsl-imx7ulp-wdt.yaml +++ b/Documentation/devicetree/bindings/watchdog/fsl-imx7ulp-wdt.yaml @@ -19,6 +19,7 @@ properties: - items: - const: fsl,imx8ulp-wdt - const: fsl,imx7ulp-wdt + - const: fsl,imx93-wdt reg: maxItems: 1 diff --git a/Documentation/devicetree/bindings/watchdog/mtk-wdt.txt b/Documentation/devicetree/bindings/watchdog/mtk-wdt.txt index a97418c74f6b..762c62e428ef 100644 --- a/Documentation/devicetree/bindings/watchdog/mtk-wdt.txt +++ b/Documentation/devicetree/bindings/watchdog/mtk-wdt.txt @@ -16,6 +16,7 @@ Required properties: "mediatek,mt7629-wdt", "mediatek,mt6589-wdt": for MT7629 "mediatek,mt7986-wdt", "mediatek,mt6589-wdt": for MT7986 "mediatek,mt8183-wdt": for MT8183 + "mediatek,mt8186-wdt", "mediatek,mt6589-wdt": for MT8186 "mediatek,mt8516-wdt", "mediatek,mt6589-wdt": for MT8516 "mediatek,mt8192-wdt": for MT8192 "mediatek,mt8195-wdt", "mediatek,mt6589-wdt": for MT8195 diff --git a/Documentation/devicetree/bindings/watchdog/qcom-wdt.yaml b/Documentation/devicetree/bindings/watchdog/qcom-wdt.yaml index 16c6f82a13ca..2bd6b4a52637 100644 --- a/Documentation/devicetree/bindings/watchdog/qcom-wdt.yaml +++ b/Documentation/devicetree/bindings/watchdog/qcom-wdt.yaml @@ -14,22 +14,29 @@ allOf: properties: compatible: - enum: - - qcom,apss-wdt-qcs404 - - qcom,apss-wdt-sc7180 - - qcom,apss-wdt-sc7280 - - qcom,apss-wdt-sdm845 - - qcom,apss-wdt-sdx55 - - qcom,apss-wdt-sm6350 - - qcom,apss-wdt-sm8150 - - qcom,apss-wdt-sm8250 - - qcom,kpss-timer - - qcom,kpss-wdt - - qcom,kpss-wdt-apq8064 - - qcom,kpss-wdt-ipq4019 - - qcom,kpss-wdt-ipq8064 - - qcom,kpss-wdt-msm8960 - - qcom,scss-timer + oneOf: + - items: + - enum: + - qcom,apss-wdt-qcs404 + - qcom,apss-wdt-sc7180 + - 
qcom,apss-wdt-sc7280 + - qcom,apss-wdt-sc8180x + - qcom,apss-wdt-sc8280xp + - qcom,apss-wdt-sdm845 + - qcom,apss-wdt-sdx55 + - qcom,apss-wdt-sm6350 + - qcom,apss-wdt-sm8150 + - qcom,apss-wdt-sm8250 + - const: qcom,kpss-wdt + - items: + - enum: + - qcom,kpss-wdt + - qcom,kpss-timer + - qcom,kpss-wdt-apq8064 + - qcom,kpss-wdt-ipq4019 + - qcom,kpss-wdt-ipq8064 + - qcom,kpss-wdt-msm8960 + - qcom,scss-timer reg: maxItems: 1 diff --git a/Documentation/devicetree/bindings/watchdog/renesas,wdt.yaml b/Documentation/devicetree/bindings/watchdog/renesas,wdt.yaml index d060438e1402..a8d7dde5271b 100644 --- a/Documentation/devicetree/bindings/watchdog/renesas,wdt.yaml +++ b/Documentation/devicetree/bindings/watchdog/renesas,wdt.yaml @@ -21,8 +21,15 @@ properties: - items: - enum: + - renesas,r9a06g032-wdt # RZ/N1D + - const: renesas,rzn1-wdt # RZ/N1 + + - items: + - enum: + - renesas,r9a07g043-wdt # RZ/G2UL - renesas,r9a07g044-wdt # RZ/G2{L,LC} - - const: renesas,rzg2l-wdt # RZ/G2L + - renesas,r9a07g054-wdt # RZ/V2L + - const: renesas,rzg2l-wdt - items: - enum: @@ -52,11 +59,11 @@ properties: - renesas,r8a77980-wdt # R-Car V3H - renesas,r8a77990-wdt # R-Car E3 - renesas,r8a77995-wdt # R-Car D3 - - renesas,r8a779a0-wdt # R-Car V3U - const: renesas,rcar-gen3-wdt # R-Car Gen3 and RZ/G2 - items: - enum: + - renesas,r8a779a0-wdt # R-Car V3U - renesas,r8a779f0-wdt # R-Car S4-8 - const: renesas,rcar-gen4-wdt # R-Car Gen4 @@ -94,6 +101,7 @@ allOf: contains: enum: - renesas,rza-wdt + - renesas,rzn1-wdt then: required: - power-domains diff --git a/Documentation/devicetree/bindings/watchdog/socionext,uniphier-wdt.yaml b/Documentation/devicetree/bindings/watchdog/socionext,uniphier-wdt.yaml index a059d16cb4f2..90698cfa8f94 100644 --- a/Documentation/devicetree/bindings/watchdog/socionext,uniphier-wdt.yaml +++ b/Documentation/devicetree/bindings/watchdog/socionext,uniphier-wdt.yaml @@ -19,7 +19,7 @@ properties: required: - compatible -additionalProperties: false +unevaluatedProperties: false examples: - | diff --git a/Documentation/devicetree/bindings/watchdog/sunplus,sp7021-wdt.yaml b/Documentation/devicetree/bindings/watchdog/sunplus,sp7021-wdt.yaml new file mode 100644 index 000000000000..d90271013191 --- /dev/null +++ b/Documentation/devicetree/bindings/watchdog/sunplus,sp7021-wdt.yaml @@ -0,0 +1,47 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +# Copyright (C) Sunplus Co., Ltd. 2021 +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/watchdog/sunplus,sp7021-wdt.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Sunplus SoCs Watchdog + +maintainers: + - XianTao Hu + +allOf: + - $ref: watchdog.yaml# + +properties: + compatible: + const: sunplus,sp7021-wdt + + reg: + items: + - description: watchdog registers regions + - description: miscellaneous control registers regions + + clocks: + maxItems: 1 + + resets: + maxItems: 1 + +required: + - compatible + - reg + - clocks + - resets + +additionalProperties: false + +examples: + - | + watchdog: watchdog@9c000630 { + compatible = "sunplus,sp7021-wdt"; + reg = <0x9c000630 0x08>, <0x9c000274 0x04>; + clocks = <&clkc 0x24>; + resets = <&rstc 0x14>; + }; +... diff --git a/Documentation/driver-api/pwm.rst b/Documentation/driver-api/pwm.rst index ccb06e485756..fd26c3d895b6 100644 --- a/Documentation/driver-api/pwm.rst +++ b/Documentation/driver-api/pwm.rst @@ -49,6 +49,12 @@ After being requested, a PWM has to be configured using:: This API controls both the PWM period/duty_cycle config and the enable/disable state. 
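+
+For example, a consumer that wants a 50% relative duty cycle could do the
+following (a minimal sketch: it assumes ``pwm`` was already requested, e.g.
+with pwm_get(), and that the caller checks the return value of
+pwm_apply_state())::
+
+	struct pwm_state state;
+
+	pwm_init_state(pwm, &state);
+	state.period = 100000;			/* nanoseconds */
+	state.duty_cycle = state.period / 2;	/* 50% duty cycle */
+	state.enabled = true;
+	pwm_apply_state(pwm, &state);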
+ +As a consumer, don't rely on the output's state for a disabled PWM. If it's +easily possible, drivers are supposed to emit the inactive state, but some +drivers cannot. If you rely on getting the inactive state, use .duty_cycle=0, +.enabled=true. + There is also a usage_power setting: If set, the PWM driver is only required to maintain the power output but has more freedom regarding signal form. If supported by the driver, the signal can be optimized, for example to improve diff --git a/Documentation/driver-api/vfio-mediated-device.rst b/Documentation/driver-api/vfio-mediated-device.rst index 784bbeb22adc..eded8719180f 100644 --- a/Documentation/driver-api/vfio-mediated-device.rst +++ b/Documentation/driver-api/vfio-mediated-device.rst @@ -262,10 +262,10 @@ Translation APIs for Mediated Devices The following APIs are provided for translating user pfn to host pfn in a VFIO driver:: - extern int vfio_pin_pages(struct device *dev, unsigned long *user_pfn, + int vfio_pin_pages(struct vfio_device *device, unsigned long *user_pfn, int npage, int prot, unsigned long *phys_pfn); - extern int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn, + int vfio_unpin_pages(struct vfio_device *device, unsigned long *user_pfn, int npage); These functions call back into the back-end IOMMU module by using the pin_pages diff --git a/Documentation/filesystems/erofs.rst b/Documentation/filesystems/erofs.rst index bef6d3040ce4..05e03d54af1a 100644 --- a/Documentation/filesystems/erofs.rst +++ b/Documentation/filesystems/erofs.rst @@ -1,63 +1,82 @@ .. SPDX-License-Identifier: GPL-2.0 ====================================== -Enhanced Read-Only File System - EROFS +EROFS - Enhanced Read-Only File System ====================================== Overview ======== -EROFS file-system stands for Enhanced Read-Only File System. Different -from other read-only file systems, it aims to be designed for flexibility, -scalability, but be kept simple and high performance. +EROFS filesystem stands for Enhanced Read-Only File System. It aims to form a +generic read-only filesystem solution for various read-only use cases instead +of just focusing on storage space saving without considering any side effects +of runtime performance. -It is designed as a better filesystem solution for the following scenarios: +It is designed to meet the needs of flexibility, feature extendability and +user-payload friendliness. Apart from those, it is still kept as a simple, +random-access friendly, high-performance filesystem to get rid of unneeded I/O +amplification and memory-resident overhead compared to similar approaches. + +It is implemented to be a better choice for the following scenarios: - read-only storage media or - part of a fully trusted read-only solution, which means it needs to be immutable and bit-for-bit identical to the official golden image for - their releases due to security and other considerations and + their releases due to security or other considerations and - hope to minimize extra storage space with guaranteed end-to-end performance by using compact layout, transparent file compression and direct access, especially for those embedded devices with limited memory and high-density - hosts with numerous containers; + hosts with numerous containers.
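+
+For example, an image for such a scenario can be built from a directory tree
+and mounted with the erofs-utils tools like this (an illustrative sketch; the
+available compressors and exact options depend on the erofs-utils and kernel
+configuration in use)::
+
+	mkfs.erofs -zlz4hc foo.erofs.img foo/
+	mount -t erofs -o loop foo.erofs.img /mnt
+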
Here is the main features of EROFS: - Little endian on-disk design; - - Currently 4KB block size (nobh) and therefore maximum 16TB address space; + - 4KiB block size and 32-bit block addresses, therefore 16TiB address space + at most for now; - - Metadata & data could be mixed by design; + - Two inode layouts for different requirements: - - 2 inode versions for different requirements: - - ===================== ============ ===================================== + ===================== ============ ====================================== compact (v1) extended (v2) - ===================== ============ ===================================== + ===================== ============ ====================================== Inode metadata size 32 bytes 64 bytes - Max file size 4 GB 16 EB (also limited by max. vol size) + Max file size 4 GiB 16 EiB (also limited by max. vol size) Max uids/gids 65536 4294967296 Per-inode timestamp no yes (64 + 32-bit timestamp) Max hardlinks 65536 4294967296 - Metadata reserved 4 bytes 14 bytes - ===================== ============ ===================================== + Metadata reserved 8 bytes 18 bytes + ===================== ============ ====================================== + + - Metadata and data could be mixed as an option; - Support extended attributes (xattrs) as an option; - - Support xattr inline and tail-end data inline for all files; + - Support tailpacking data and xattr inline compared to byte-addressed + unaligned metadata or smaller block size alternatives; - Support POSIX.1e ACLs by using xattrs; - Support transparent data compression as an option: - LZ4 algorithm with the fixed-sized output compression for high performance; + LZ4 and MicroLZMA algorithms can be used on a per-file basis. In addition, + in-place decompression is also supported to avoid bounce buffers for + compressed data and page cache thrashing; - - Multiple device support for multi-layer container images. + - Support direct I/O on uncompressed files to avoid double caching for loop + devices; + + - Support FSDAX on uncompressed images for secure containers and ramdisks in + order to get rid of unnecessary page cache; + + - Support multiple devices for multi-blob container images; + + - Support file-based on-demand loading with the Fscache infrastructure. The following git tree provides the file system user-space tools under -development (ex, formatting tool mkfs.erofs): +development, such as a formatting tool (mkfs.erofs), an on-disk consistency & +compatibility checking tool (fsck.erofs), and a debugging tool (dump.erofs): - git://git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs-utils.git @@ -91,6 +110,7 @@ dax={always,never} Use direct access (no page cache). See Documentation/filesystems/dax.rst. dax A legacy option which is an alias for ``dax=always``. device=%s Specify a path to an extra device to be used together. +fsid=%s Specify a filesystem image ID for Fscache back-end. =================== ========================================================= Sysfs Entries @@ -226,8 +246,8 @@ Note that apart from the offset of the first filename, nameoff0 also indicates the total number of directory entries in this block since it is no need to introduce another on-disk field at all.
-Chunk-based file ----------------- +Chunk-based files +----------------- In order to support chunk-based data deduplication, a new inode data layout has been supported since Linux v5.15: Files are split in equal-sized data chunks with ``extents`` area of the inode metadata indicating how to get the chunk diff --git a/Documentation/filesystems/nfs/client-identifier.rst b/Documentation/filesystems/nfs/client-identifier.rst new file mode 100644 index 000000000000..5147e15815a1 --- /dev/null +++ b/Documentation/filesystems/nfs/client-identifier.rst @@ -0,0 +1,216 @@ +.. SPDX-License-Identifier: GPL-2.0 + +======================= +NFSv4 client identifier +======================= + +This document explains how the NFSv4 protocol identifies client +instances in order to maintain file open and lock state during +system restarts. A special identifier and principal are maintained +on each client. These can be set by administrators, scripts +provided by site administrators, or tools provided by Linux +distributors. + +There are risks if a client's NFSv4 identifier and its principal +are not chosen carefully. + + +Introduction +------------ + +The NFSv4 protocol uses "lease-based file locking". Leases help +NFSv4 servers provide file lock guarantees and manage their +resources. + +Simply put, an NFSv4 server creates a lease for each NFSv4 client. +The server collects each client's file open and lock state under +the lease for that client. + +The client is responsible for periodically renewing its leases. +While a lease remains valid, the server holding that lease +guarantees the file locks the client has created remain in place. + +If a client stops renewing its lease (for example, if it crashes), +the NFSv4 protocol allows the server to remove the client's open +and lock state after a certain period of time. When a client +restarts, it indicates to servers that open and lock state +associated with its previous leases is no longer valid and can be +destroyed immediately. + +In addition, each NFSv4 server manages a persistent list of client +leases. When the server restarts and clients attempt to recover +their state, the server uses this list to distinguish amongst +clients that held state before the server restarted and clients +sending fresh OPEN and LOCK requests. This enables file locks to +persist safely across server restarts. + +NFSv4 client identifiers +------------------------ + +Each NFSv4 client presents an identifier to NFSv4 servers so that +they can associate the client with its lease. Each client's +identifier consists of two elements: + + - co_ownerid: An arbitrary but fixed string. + + - boot verifier: A 64-bit incarnation verifier that enables a + server to distinguish successive boot epochs of the same client. + +The NFSv4.0 specification refers to these two items as an +"nfs_client_id4". The NFSv4.1 specification refers to these two +items as a "client_owner4". + +NFSv4 servers tie this identifier to the principal and security +flavor that the client used when presenting it. Servers use this +principal to authorize subsequent lease modification operations +sent by the client. Effectively this principal is a third element of +the identifier. + +As part of the identity presented to servers, a good +"co_ownerid" string has several important properties: + + - The "co_ownerid" string identifies the client during reboot + recovery, therefore the string is persistent across client + reboots. 
+ - The "co_ownerid" string helps servers distinguish the client + from others, therefore the string is globally unique. Note + that there is no central authority that assigns "co_ownerid" + strings. + - Because it often appears on the network in the clear, the + "co_ownerid" string does not reveal private information about + the client itself. + - The content of the "co_ownerid" string is set and unchanging + before the client attempts NFSv4 mounts after a restart. + - The NFSv4 protocol places a 1024-byte limit on the size of the + "co_ownerid" string. + +Protecting NFSv4 lease state +---------------------------- + +NFSv4 servers utilize the "client_owner4" as described above to +assign a unique lease to each client. Under this scheme, there are +circumstances where clients can interfere with each other. This is +referred to as "lease stealing". + +If distinct clients present the same "co_ownerid" string and use +the same principal (for example, AUTH_SYS and UID 0), a server is +unable to tell that the clients are not the same. Each distinct +client presents a different boot verifier, so it appears to the +server as if there is one client that is rebooting frequently. +Neither client can maintain open or lock state in this scenario. + +If distinct clients present the same "co_ownerid" string and use +distinct principals, the server is likely to allow the first client +to operate normally but reject subsequent clients with the same +"co_ownerid" string. + +If a client's "co_ownerid" string or principal are not stable, +state recovery after a server or client reboot is not guaranteed. +If a client unexpectedly restarts but presents a different +"co_ownerid" string or principal to the server, the server orphans +the client's previous open and lock state. This blocks access to +locked files until the server removes the orphaned state. + +If the server restarts and a client presents a changed "co_ownerid" +string or principal to the server, the server will not allow the +client to reclaim its open and lock state, and may give those locks +to other clients in the meantime. This is referred to as "lock +stealing". + +Lease stealing and lock stealing increase the potential for denial +of service and in rare cases even data corruption. + +Selecting an appropriate client identifier +------------------------------------------ + +By default, the Linux NFSv4 client implementation constructs its +"co_ownerid" string starting with the words "Linux NFS" followed by +the client's UTS node name (the same node name, incidentally, that +is used as the "machine name" in an AUTH_SYS credential). In small +deployments, this construction is usually adequate. Often, however, +the node name by itself is not adequately unique, and can change +unexpectedly. Problematic situations include: + + - NFS-root (diskless) clients, where the local DCHP server (or + equivalent) does not provide a unique host name. + + - "Containers" within a single Linux host. If each container has + a separate network namespace, but does not use the UTS namespace + to provide a unique host name, then there can be multiple NFS + client instances with the same host name. + + - Clients across multiple administrative domains that access a + common NFS server. If hostnames are not assigned centrally + then uniqueness cannot be guaranteed unless a domain name is + included in the hostname. 
+ +Linux provides two mechanisms to add uniqueness to its "co_ownerid" +string: + + nfs.nfs4_unique_id + This module parameter can set an arbitrary uniquifier string + via the kernel command line, or when the "nfs" module is + loaded. + + /sys/fs/nfs/net/nfs_client/identifier + This virtual file, available since Linux 5.3, is local to the + network namespace in which it is accessed and so can provide + distinction between network namespaces (containers) when the + hostname remains uniform. + +Note that this file is empty on namespace creation. If the +container system has access to some sort of per-container identity +then that uniquifier can be used. For example, a uniquifier might +be formed at boot using the container's internal identifier: + + sha256sum /etc/machine-id | awk '{print $1}' \\ + > /sys/fs/nfs/net/nfs_client/identifier + +Security considerations +----------------------- + +The use of cryptographic security for lease management operations +is strongly encouraged. + +If NFS with Kerberos is not configured, a Linux NFSv4 client uses +AUTH_SYS and UID 0 as the principal part of its client identity. +This configuration is not only insecure, it increases the risk of +lease and lock stealing. However, it might be the only choice for +client configurations that have no local persistent storage. +"co_ownerid" string uniqueness and persistence is critical in this +case. + +When a Kerberos keytab is present on a Linux NFS client, the client +attempts to use one of the principals in that keytab when +identifying itself to servers. The "sec=" mount option does not +control this behavior. Alternately, a single-user client with a +Kerberos principal can use that principal in place of the client's +host principal. + +Using Kerberos for this purpose enables the client and server to +use the same lease for operations covered by all "sec=" settings. +Additionally, the Linux NFS client uses the RPCSEC_GSS security +flavor with Kerberos and the integrity QOS to prevent in-transit +modification of lease modification requests. + +Additional notes +---------------- +The Linux NFSv4 client establishes a single lease on each NFSv4 +server it accesses. NFSv4 mounts from a Linux NFSv4 client of a +particular server then share that lease. + +Once a client establishes open and lock state, the NFSv4 protocol +enables lease state to transition to other servers, following data +that has been migrated. This hides data migration completely from +running applications. The Linux NFSv4 client facilitates state +migration by presenting the same "client_owner4" to all servers it +encounters. + +See Also +-------- + + - nfs(5) + - kerberos(7) + - RFC 7530 for the NFSv4.0 specification + - RFC 8881 for the NFSv4.1 specification diff --git a/Documentation/filesystems/nfs/index.rst b/Documentation/filesystems/nfs/index.rst index 288d8ddb2bc6..8536134f31fd 100644 --- a/Documentation/filesystems/nfs/index.rst +++ b/Documentation/filesystems/nfs/index.rst @@ -6,6 +6,8 @@ NFS ..
toctree:: :maxdepth: 1 + client-identifier + exporting pnfs rpc-cache rpc-server-gss diff --git a/MAINTAINERS b/MAINTAINERS index 790b80f8ac91..53caa85fe779 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -382,7 +382,7 @@ F: include/acpi/ F: tools/power/acpi/ ACPI FOR ARM64 (ACPI/arm64) -M: Lorenzo Pieralisi +M: Lorenzo Pieralisi M: Hanjun Guo M: Sudeep Holla L: linux-acpi@vger.kernel.org @@ -2940,7 +2940,7 @@ N: uniphier ARM/VERSATILE EXPRESS PLATFORM M: Liviu Dudau M: Sudeep Holla -M: Lorenzo Pieralisi +M: Lorenzo Pieralisi L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained F: */*/*/vexpress* @@ -4566,8 +4566,8 @@ F: drivers/power/supply/cw2015_battery.c CEPH COMMON CODE (LIBCEPH) M: Ilya Dryomov -M: Jeff Layton M: Xiubo Li +R: Jeff Layton L: ceph-devel@vger.kernel.org S: Supported W: http://ceph.com/ @@ -4577,9 +4577,9 @@ F: include/linux/crush/ F: net/ceph/ CEPH DISTRIBUTED FILE SYSTEM CLIENT (CEPH) -M: Jeff Layton M: Xiubo Li M: Ilya Dryomov +R: Jeff Layton L: ceph-devel@vger.kernel.org S: Supported W: http://ceph.com/ @@ -5156,7 +5156,7 @@ F: arch/x86/kernel/cpuid.c F: arch/x86/kernel/msr.c CPUIDLE DRIVER - ARM BIG LITTLE -M: Lorenzo Pieralisi +M: Lorenzo Pieralisi M: Daniel Lezcano L: linux-pm@vger.kernel.org L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) @@ -5176,7 +5176,7 @@ F: drivers/cpuidle/cpuidle-exynos.c F: include/linux/platform_data/cpuidle-exynos.h CPUIDLE DRIVER - ARM PSCI -M: Lorenzo Pieralisi +M: Lorenzo Pieralisi M: Sudeep Holla L: linux-pm@vger.kernel.org L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) @@ -9319,13 +9319,13 @@ F: drivers/i2c/i2c-stub.c I3C DRIVER FOR CADENCE I3C MASTER IP M: Przemysław Gaj S: Maintained -F: Documentation/devicetree/bindings/i3c/cdns,i3c-master.txt +F: Documentation/devicetree/bindings/i3c/cdns,i3c-master.yaml F: drivers/i3c/master/i3c-master-cdns.c I3C DRIVER FOR SYNOPSYS DESIGNWARE M: Vitor Soares S: Maintained -F: Documentation/devicetree/bindings/i3c/snps,dw-i3c-master.txt +F: Documentation/devicetree/bindings/i3c/snps,dw-i3c-master.yaml F: drivers/i3c/master/dw* I3C SUBSYSTEM @@ -11441,8 +11441,6 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/livepatching/livepatching.g F: Documentation/ABI/testing/sysfs-kernel-livepatch F: Documentation/livepatch/ F: arch/powerpc/include/asm/livepatch.h -F: arch/s390/include/asm/livepatch.h -F: arch/x86/include/asm/livepatch.h F: include/linux/livepatch.h F: kernel/livepatch/ F: kernel/module/livepatch.c @@ -13069,7 +13067,7 @@ M: Claudiu Beznea L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) L: linux-pwm@vger.kernel.org S: Supported -F: Documentation/devicetree/bindings/pwm/atmel-pwm.txt +F: Documentation/devicetree/bindings/pwm/atmel,at91sam-pwm.yaml F: drivers/pwm/pwm-atmel.c MICROCHIP SAMA5D2-COMPATIBLE ADC DRIVER @@ -15291,7 +15289,7 @@ F: drivers/pci/controller/pci-v3-semi.c PCI ENDPOINT SUBSYSTEM M: Kishon Vijay Abraham I -M: Lorenzo Pieralisi +M: Lorenzo Pieralisi R: Krzysztof Wilczyński L: linux-pci@vger.kernel.org S: Supported @@ -15354,7 +15352,7 @@ F: Documentation/devicetree/bindings/pci/xgene-pci-msi.txt F: drivers/pci/controller/pci-xgene-msi.c PCI NATIVE HOST BRIDGE AND ENDPOINT DRIVERS -M: Lorenzo Pieralisi +M: Lorenzo Pieralisi R: Rob Herring R: Krzysztof Wilczyński L: linux-pci@vger.kernel.org @@ -15907,7 +15905,7 @@ F: include/linux/dtpm.h POWER STATE COORDINATION INTERFACE (PSCI) M: Mark Rutland -M: Lorenzo Pieralisi +M: Lorenzo Pieralisi L: 
linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained F: drivers/firmware/psci/ @@ -17002,6 +17000,14 @@ S: Supported F: Documentation/devicetree/bindings/iio/adc/renesas,rzg2l-adc.yaml F: drivers/iio/adc/rzg2l_adc.c +RENESAS RZ/N1 RTC CONTROLLER DRIVER +M: Miquel Raynal +L: linux-rtc@vger.kernel.org +L: linux-renesas-soc@vger.kernel.org +S: Maintained +F: Documentation/devicetree/bindings/rtc/renesas,rzn1-rtc.yaml +F: drivers/rtc/rtc-rzn1.c + RENESAS R-CAR GEN3 & RZ/N1 NAND CONTROLLER DRIVER M: Miquel Raynal L: linux-mtd@lists.infradead.org @@ -18286,7 +18292,7 @@ F: drivers/net/ethernet/smsc/smc91x.* SECURE MONITOR CALL(SMC) CALLING CONVENTION (SMCCC) M: Mark Rutland -M: Lorenzo Pieralisi +M: Lorenzo Pieralisi M: Sudeep Holla L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained @@ -19038,6 +19044,12 @@ S: Maintained F: Documentation/devicetree/bindings/nvmem/sunplus,sp7021-ocotp.yaml F: drivers/nvmem/sunplus-ocotp.c +SUNPLUS PWM DRIVER +M: Hammer Hsieh +S: Maintained +F: Documentation/devicetree/bindings/pwm/sunplus,sp7021-pwm.yaml +F: drivers/pwm/pwm-sunplus.c + SUNPLUS RTC DRIVER M: Vincent Shih L: linux-rtc@vger.kernel.org @@ -19058,6 +19070,13 @@ S: Maintained F: Documentation/devicetree/bindings/serial/sunplus,sp7021-uart.yaml F: drivers/tty/serial/sunplus-uart.c +SUNPLUS WATCHDOG DRIVER +M: Xiantao Hu +L: linux-watchdog@vger.kernel.org +S: Maintained +F: Documentation/devicetree/bindings/watchdog/sunplus,sp7021-wdt.yaml +F: drivers/watchdog/sunplus_wdt.c + SUPERH M: Yoshinori Sato M: Rich Felker @@ -21827,6 +21846,12 @@ F: drivers/misc/Makefile F: drivers/misc/xilinx_sdfec.c F: include/uapi/misc/xilinx_sdfec.h +XILINX PWM DRIVER +M: Sean Anderson +S: Maintained +F: drivers/pwm/pwm-xilinx.c +F: include/clocksource/timer-xilinx.h + XILINX UARTLITE SERIAL DRIVER M: Peter Korsgaard L: linux-serial@vger.kernel.org diff --git a/arch/microblaze/kernel/timer.c b/arch/microblaze/kernel/timer.c index f8832cf49384..26c385582c3b 100644 --- a/arch/microblaze/kernel/timer.c +++ b/arch/microblaze/kernel/timer.c @@ -251,6 +251,10 @@ static int __init xilinx_timer_init(struct device_node *timer) u32 timer_num = 1; int ret; + /* If this property is present, the device is a PWM and not a timer */ + if (of_property_read_bool(timer, "#pwm-cells")) + return 0; + if (initialized) return -EINVAL; diff --git a/arch/powerpc/include/asm/livepatch.h b/arch/powerpc/include/asm/livepatch.h index 1c60094ea0cd..d044a1fd4f44 100644 --- a/arch/powerpc/include/asm/livepatch.h +++ b/arch/powerpc/include/asm/livepatch.h @@ -7,17 +7,9 @@ #ifndef _ASM_POWERPC_LIVEPATCH_H #define _ASM_POWERPC_LIVEPATCH_H -#include -#include +#include #include -#ifdef CONFIG_LIVEPATCH -static inline void klp_arch_set_pc(struct ftrace_regs *fregs, unsigned long ip) -{ - ftrace_instruction_pointer_set(fregs, ip); -} -#endif /* CONFIG_LIVEPATCH */ - #ifdef CONFIG_LIVEPATCH_64 static inline void klp_init_thread_info(struct task_struct *p) { diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index ea38c13936c7..dd09919c3c66 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -63,7 +63,6 @@ #include #include #include -#include #include #include diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 0e8fc1cd1c55..5761f08dae95 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -60,7 +60,7 @@ #include #include #include -#include +#include #include #include #include diff --git 
a/arch/s390/include/asm/livepatch.h b/arch/s390/include/asm/livepatch.h deleted file mode 100644 index 5209f223331a..000000000000 --- a/arch/s390/include/asm/livepatch.h +++ /dev/null @@ -1,22 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0+ */ -/* - * livepatch.h - s390-specific Kernel Live Patching Core - * - * Copyright (c) 2013-2015 SUSE - * Authors: Jiri Kosina - * Vojtech Pavlik - * Jiri Slaby - */ - -#ifndef ASM_LIVEPATCH_H -#define ASM_LIVEPATCH_H - -#include -#include - -static inline void klp_arch_set_pc(struct ftrace_regs *fregs, unsigned long ip) -{ - ftrace_instruction_pointer_set(fregs, ip); -} - -#endif diff --git a/arch/x86/include/asm/livepatch.h b/arch/x86/include/asm/livepatch.h deleted file mode 100644 index 7c5cc6660e4b..000000000000 --- a/arch/x86/include/asm/livepatch.h +++ /dev/null @@ -1,20 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * livepatch.h - x86-specific Kernel Live Patching Core - * - * Copyright (C) 2014 Seth Jennings - * Copyright (C) 2014 SUSE - */ - -#ifndef _ASM_X86_LIVEPATCH_H -#define _ASM_X86_LIVEPATCH_H - -#include -#include - -static inline void klp_arch_set_pc(struct ftrace_regs *fregs, unsigned long ip) -{ - ftrace_instruction_pointer_set(fregs, ip); -} - -#endif /* _ASM_X86_LIVEPATCH_H */ diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 2b21f717cce1..ef9bc62e9afd 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -756,24 +756,23 @@ static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc) */ static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts) { - struct rbd_client *client_node; - bool found = false; + struct rbd_client *rbdc = NULL, *iter; if (ceph_opts->flags & CEPH_OPT_NOSHARE) return NULL; spin_lock(&rbd_client_list_lock); - list_for_each_entry(client_node, &rbd_client_list, node) { - if (!ceph_compare_options(ceph_opts, client_node->client)) { - __rbd_get_client(client_node); + list_for_each_entry(iter, &rbd_client_list, node) { + if (!ceph_compare_options(ceph_opts, iter->client)) { + __rbd_get_client(iter); - found = true; + rbdc = iter; break; } } spin_unlock(&rbd_client_list_lock); - return found ? 
client_node : NULL; + return rbdc; } /* diff --git a/drivers/gpu/drm/i915/gvt/gtt.c b/drivers/gpu/drm/i915/gvt/gtt.c index 9c5cc2800975..b4f69364f9a1 100644 --- a/drivers/gpu/drm/i915/gvt/gtt.c +++ b/drivers/gpu/drm/i915/gvt/gtt.c @@ -51,7 +51,7 @@ static int preallocated_oos_pages = 8192; static bool intel_gvt_is_valid_gfn(struct intel_vgpu *vgpu, unsigned long gfn) { - struct kvm *kvm = vgpu->kvm; + struct kvm *kvm = vgpu->vfio_device.kvm; int idx; bool ret; @@ -1185,7 +1185,7 @@ static int is_2MB_gtt_possible(struct intel_vgpu *vgpu, if (!vgpu->attached) return -EINVAL; - pfn = gfn_to_pfn(vgpu->kvm, ops->get_pfn(entry)); + pfn = gfn_to_pfn(vgpu->vfio_device.kvm, ops->get_pfn(entry)); if (is_error_noslot_pfn(pfn)) return -EINVAL; return PageTransHuge(pfn_to_page(pfn)); diff --git a/drivers/gpu/drm/i915/gvt/gvt.h b/drivers/gpu/drm/i915/gvt/gvt.h index 03ecffc2ba56..aee1a45da74b 100644 --- a/drivers/gpu/drm/i915/gvt/gvt.h +++ b/drivers/gpu/drm/i915/gvt/gvt.h @@ -227,11 +227,7 @@ struct intel_vgpu { struct mutex cache_lock; struct notifier_block iommu_notifier; - struct notifier_block group_notifier; - struct kvm *kvm; - struct work_struct release_work; atomic_t released; - struct vfio_group *vfio_group; struct kvm_page_track_notifier_node track_node; #define NR_BKT (1 << 18) @@ -732,7 +728,7 @@ static inline int intel_gvt_read_gpa(struct intel_vgpu *vgpu, unsigned long gpa, { if (!vgpu->attached) return -ESRCH; - return vfio_dma_rw(vgpu->vfio_group, gpa, buf, len, false); + return vfio_dma_rw(&vgpu->vfio_device, gpa, buf, len, false); } /** @@ -750,7 +746,7 @@ static inline int intel_gvt_write_gpa(struct intel_vgpu *vgpu, { if (!vgpu->attached) return -ESRCH; - return vfio_dma_rw(vgpu->vfio_group, gpa, buf, len, true); + return vfio_dma_rw(&vgpu->vfio_device, gpa, buf, len, true); } void intel_gvt_debugfs_remove_vgpu(struct intel_vgpu *vgpu); diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c index 0787ba5c301f..e2f6c56ab342 100644 --- a/drivers/gpu/drm/i915/gvt/kvmgt.c +++ b/drivers/gpu/drm/i915/gvt/kvmgt.c @@ -228,8 +228,6 @@ static void intel_gvt_cleanup_vgpu_type_groups(struct intel_gvt *gvt) } } -static void intel_vgpu_release_work(struct work_struct *work); - static void gvt_unpin_guest_page(struct intel_vgpu *vgpu, unsigned long gfn, unsigned long size) { @@ -243,7 +241,7 @@ static void gvt_unpin_guest_page(struct intel_vgpu *vgpu, unsigned long gfn, for (npage = 0; npage < total_pages; npage++) { unsigned long cur_gfn = gfn + npage; - ret = vfio_group_unpin_pages(vgpu->vfio_group, &cur_gfn, 1); + ret = vfio_unpin_pages(&vgpu->vfio_device, &cur_gfn, 1); drm_WARN_ON(&i915->drm, ret != 1); } } @@ -266,8 +264,8 @@ static int gvt_pin_guest_page(struct intel_vgpu *vgpu, unsigned long gfn, unsigned long cur_gfn = gfn + npage; unsigned long pfn; - ret = vfio_group_pin_pages(vgpu->vfio_group, &cur_gfn, 1, - IOMMU_READ | IOMMU_WRITE, &pfn); + ret = vfio_pin_pages(&vgpu->vfio_device, &cur_gfn, 1, + IOMMU_READ | IOMMU_WRITE, &pfn); if (ret != 1) { gvt_vgpu_err("vfio_pin_pages failed for gfn 0x%lx, ret %d\n", cur_gfn, ret); @@ -761,23 +759,6 @@ static int intel_vgpu_iommu_notifier(struct notifier_block *nb, return NOTIFY_OK; } -static int intel_vgpu_group_notifier(struct notifier_block *nb, - unsigned long action, void *data) -{ - struct intel_vgpu *vgpu = - container_of(nb, struct intel_vgpu, group_notifier); - - /* the only action we care about */ - if (action == VFIO_GROUP_NOTIFY_SET_KVM) { - vgpu->kvm = data; - - if (!data) - schedule_work(&vgpu->release_work); 
- } - - return NOTIFY_OK; -} - static bool __kvmgt_vgpu_exist(struct intel_vgpu *vgpu) { struct intel_vgpu *itr; @@ -789,7 +770,7 @@ static bool __kvmgt_vgpu_exist(struct intel_vgpu *vgpu) if (!itr->attached) continue; - if (vgpu->kvm == itr->kvm) { + if (vgpu->vfio_device.kvm == itr->vfio_device.kvm) { ret = true; goto out; } @@ -804,61 +785,44 @@ static int intel_vgpu_open_device(struct vfio_device *vfio_dev) struct intel_vgpu *vgpu = vfio_dev_to_vgpu(vfio_dev); unsigned long events; int ret; - struct vfio_group *vfio_group; vgpu->iommu_notifier.notifier_call = intel_vgpu_iommu_notifier; - vgpu->group_notifier.notifier_call = intel_vgpu_group_notifier; events = VFIO_IOMMU_NOTIFY_DMA_UNMAP; - ret = vfio_register_notifier(vfio_dev->dev, VFIO_IOMMU_NOTIFY, &events, - &vgpu->iommu_notifier); + ret = vfio_register_notifier(vfio_dev, VFIO_IOMMU_NOTIFY, &events, + &vgpu->iommu_notifier); if (ret != 0) { gvt_vgpu_err("vfio_register_notifier for iommu failed: %d\n", ret); goto out; } - events = VFIO_GROUP_NOTIFY_SET_KVM; - ret = vfio_register_notifier(vfio_dev->dev, VFIO_GROUP_NOTIFY, &events, - &vgpu->group_notifier); - if (ret != 0) { - gvt_vgpu_err("vfio_register_notifier for group failed: %d\n", - ret); + ret = -EEXIST; + if (vgpu->attached) + goto undo_iommu; + + ret = -ESRCH; + if (!vgpu->vfio_device.kvm || + vgpu->vfio_device.kvm->mm != current->mm) { + gvt_vgpu_err("KVM is required to use Intel vGPU\n"); goto undo_iommu; } - vfio_group = - vfio_group_get_external_user_from_dev(vgpu->vfio_device.dev); - if (IS_ERR_OR_NULL(vfio_group)) { - ret = !vfio_group ? -EFAULT : PTR_ERR(vfio_group); - gvt_vgpu_err("vfio_group_get_external_user_from_dev failed\n"); - goto undo_register; - } - vgpu->vfio_group = vfio_group; - - ret = -EEXIST; - if (vgpu->attached) - goto undo_group; - - ret = -ESRCH; - if (!vgpu->kvm || vgpu->kvm->mm != current->mm) { - gvt_vgpu_err("KVM is required to use Intel vGPU\n"); - goto undo_group; - } + kvm_get_kvm(vgpu->vfio_device.kvm); ret = -EEXIST; if (__kvmgt_vgpu_exist(vgpu)) - goto undo_group; + goto undo_iommu; vgpu->attached = true; - kvm_get_kvm(vgpu->kvm); kvmgt_protect_table_init(vgpu); gvt_cache_init(vgpu); vgpu->track_node.track_write = kvmgt_page_track_write; vgpu->track_node.track_flush_slot = kvmgt_page_track_flush_slot; - kvm_page_track_register_notifier(vgpu->kvm, &vgpu->track_node); + kvm_page_track_register_notifier(vgpu->vfio_device.kvm, + &vgpu->track_node); debugfs_create_ulong(KVMGT_DEBUGFS_FILENAME, 0444, vgpu->debugfs, &vgpu->nr_cache_entries); @@ -868,17 +832,9 @@ static int intel_vgpu_open_device(struct vfio_device *vfio_dev) atomic_set(&vgpu->released, 0); return 0; -undo_group: - vfio_group_put_external_user(vgpu->vfio_group); - vgpu->vfio_group = NULL; - -undo_register: - vfio_unregister_notifier(vfio_dev->dev, VFIO_GROUP_NOTIFY, - &vgpu->group_notifier); - undo_iommu: - vfio_unregister_notifier(vfio_dev->dev, VFIO_IOMMU_NOTIFY, - &vgpu->iommu_notifier); + vfio_unregister_notifier(vfio_dev, VFIO_IOMMU_NOTIFY, + &vgpu->iommu_notifier); out: return ret; } @@ -894,8 +850,9 @@ static void intel_vgpu_release_msi_eventfd_ctx(struct intel_vgpu *vgpu) } } -static void __intel_vgpu_release(struct intel_vgpu *vgpu) +static void intel_vgpu_close_device(struct vfio_device *vfio_dev) { + struct intel_vgpu *vgpu = vfio_dev_to_vgpu(vfio_dev); struct drm_i915_private *i915 = vgpu->gvt->gt->i915; int ret; @@ -907,41 +864,24 @@ static void __intel_vgpu_release(struct intel_vgpu *vgpu) intel_gvt_release_vgpu(vgpu); - ret = 
vfio_unregister_notifier(vgpu->vfio_device.dev, VFIO_IOMMU_NOTIFY, - &vgpu->iommu_notifier); + ret = vfio_unregister_notifier(&vgpu->vfio_device, VFIO_IOMMU_NOTIFY, + &vgpu->iommu_notifier); drm_WARN(&i915->drm, ret, "vfio_unregister_notifier for iommu failed: %d\n", ret); - ret = vfio_unregister_notifier(vgpu->vfio_device.dev, VFIO_GROUP_NOTIFY, - &vgpu->group_notifier); - drm_WARN(&i915->drm, ret, - "vfio_unregister_notifier for group failed: %d\n", ret); - debugfs_remove(debugfs_lookup(KVMGT_DEBUGFS_FILENAME, vgpu->debugfs)); - kvm_page_track_unregister_notifier(vgpu->kvm, &vgpu->track_node); - kvm_put_kvm(vgpu->kvm); + kvm_page_track_unregister_notifier(vgpu->vfio_device.kvm, + &vgpu->track_node); kvmgt_protect_table_destroy(vgpu); gvt_cache_destroy(vgpu); intel_vgpu_release_msi_eventfd_ctx(vgpu); - vfio_group_put_external_user(vgpu->vfio_group); - vgpu->kvm = NULL; vgpu->attached = false; -} -static void intel_vgpu_close_device(struct vfio_device *vfio_dev) -{ - __intel_vgpu_release(vfio_dev_to_vgpu(vfio_dev)); -} - -static void intel_vgpu_release_work(struct work_struct *work) -{ - struct intel_vgpu *vgpu = - container_of(work, struct intel_vgpu, release_work); - - __intel_vgpu_release(vgpu); + if (vgpu->vfio_device.kvm) + kvm_put_kvm(vgpu->vfio_device.kvm); } static u64 intel_vgpu_get_bar_addr(struct intel_vgpu *vgpu, int bar) @@ -1690,7 +1630,6 @@ static int intel_vgpu_probe(struct mdev_device *mdev) return PTR_ERR(vgpu); } - INIT_WORK(&vgpu->release_work, intel_vgpu_release_work); vfio_init_group_dev(&vgpu->vfio_device, &mdev->dev, &intel_vgpu_dev_ops); @@ -1728,7 +1667,7 @@ static struct mdev_driver intel_vgpu_mdev_driver = { int intel_gvt_page_track_add(struct intel_vgpu *info, u64 gfn) { - struct kvm *kvm = info->kvm; + struct kvm *kvm = info->vfio_device.kvm; struct kvm_memory_slot *slot; int idx; @@ -1758,7 +1697,7 @@ out: int intel_gvt_page_track_remove(struct intel_vgpu *info, u64 gfn) { - struct kvm *kvm = info->kvm; + struct kvm *kvm = info->vfio_device.kvm; struct kvm_memory_slot *slot; int idx; diff --git a/drivers/i2c/busses/i2c-ismt.c b/drivers/i2c/busses/i2c-ismt.c index c16157ee8c52..6078fa0c0d48 100644 --- a/drivers/i2c/busses/i2c-ismt.c +++ b/drivers/i2c/busses/i2c-ismt.c @@ -528,6 +528,9 @@ static int ismt_access(struct i2c_adapter *adap, u16 addr, case I2C_SMBUS_BLOCK_PROC_CALL: dev_dbg(dev, "I2C_SMBUS_BLOCK_PROC_CALL\n"); + if (data->block[0] > I2C_SMBUS_BLOCK_MAX) + return -EINVAL; + dma_size = I2C_SMBUS_BLOCK_MAX; desc->tgtaddr_rw = ISMT_DESC_ADDR_RW(addr, 1); desc->wr_len_cmd = data->block[0] + 1; diff --git a/drivers/i3c/master/mipi-i3c-hci/core.c b/drivers/i3c/master/mipi-i3c-hci/core.c index 8c01123dc4ed..6aef5ce43cc1 100644 --- a/drivers/i3c/master/mipi-i3c-hci/core.c +++ b/drivers/i3c/master/mipi-i3c-hci/core.c @@ -768,13 +768,8 @@ static int i3c_hci_probe(struct platform_device *pdev) static int i3c_hci_remove(struct platform_device *pdev) { struct i3c_hci *hci = platform_get_drvdata(pdev); - int ret; - ret = i3c_master_unregister(&hci->master); - if (ret) - return ret; - - return 0; + return i3c_master_unregister(&hci->master); } static const __maybe_unused struct of_device_id i3c_hci_of_match[] = { diff --git a/drivers/i3c/master/svc-i3c-master.c b/drivers/i3c/master/svc-i3c-master.c index 7550dad64ecf..d6e9ed74cdcf 100644 --- a/drivers/i3c/master/svc-i3c-master.c +++ b/drivers/i3c/master/svc-i3c-master.c @@ -1597,12 +1597,11 @@ static int __maybe_unused svc_i3c_runtime_suspend(struct device *dev) static int __maybe_unused 
svc_i3c_runtime_resume(struct device *dev) { struct svc_i3c_master *master = dev_get_drvdata(dev); - int ret = 0; pinctrl_pm_select_default_state(dev); svc_i3c_master_prepare_clks(master); - return ret; + return 0; } static const struct dev_pm_ops svc_i3c_pm_ops = { diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 9e1d4a98f6ed..9b43c5695ac4 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1005,7 +1005,7 @@ bool dm_table_request_based(struct dm_table *t) return __table_type_request_based(dm_table_get_type(t)); } -static int dm_table_supports_poll(struct dm_table *t); +static bool dm_table_supports_poll(struct dm_table *t); static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device *md) { @@ -1027,7 +1027,7 @@ static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device * per_io_data_size = max(per_io_data_size, ti->per_io_data_size); min_pool_size = max(min_pool_size, ti->num_flush_bios); } - poll_supported = !!dm_table_supports_poll(t); + poll_supported = dm_table_supports_poll(t); } t->mempools = dm_alloc_md_mempools(md, type, per_io_data_size, min_pool_size, @@ -1613,9 +1613,20 @@ static int count_device(struct dm_target *ti, struct dm_dev *dev, return 0; } -static int dm_table_supports_poll(struct dm_table *t) +static bool dm_table_supports_poll(struct dm_table *t) { - return !dm_table_any_dev_attr(t, device_not_poll_capable, NULL); + struct dm_target *ti; + unsigned i = 0; + + while (i < dm_table_get_num_targets(t)) { + ti = dm_table_get_target(t, i++); + + if (!ti->type->iterate_devices || + ti->type->iterate_devices(ti, device_not_poll_capable, NULL)) + return false; + } + + return true; } /* diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 80133aae0db3..d6dbd47492a8 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -1312,6 +1312,7 @@ bad: static struct target_type verity_target = { .name = "verity", + .features = DM_TARGET_IMMUTABLE, .version = {1, 8, 0}, .module = THIS_MODULE, .ctr = verity_ctr, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sriov.c b/drivers/net/ethernet/mellanox/mlx5/core/sriov.c index 887ee0f729d1..2935614f6fa9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/sriov.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/sriov.c @@ -87,6 +87,11 @@ static int mlx5_device_enable_sriov(struct mlx5_core_dev *dev, int num_vfs) enable_vfs_hca: num_msix_count = mlx5_get_default_msix_vec_count(dev, num_vfs); for (vf = 0; vf < num_vfs; vf++) { + /* Notify the VF before its enablement to let it perform + * any required setup. + */ + blocking_notifier_call_chain(&sriov->vfs_ctx[vf].notifier, + MLX5_PF_NOTIFY_ENABLE_VF, dev); err = mlx5_core_enable_hca(dev, vf + 1); if (err) { mlx5_core_warn(dev, "failed to enable VF %d (%d)\n", vf, err); @@ -127,6 +132,11 @@ mlx5_device_disable_sriov(struct mlx5_core_dev *dev, int num_vfs, bool clear_vf) for (vf = num_vfs - 1; vf >= 0; vf--) { if (!sriov->vfs_ctx[vf].enabled) continue; + /* Notify the VF before its disablement to let it clean + * up its resources.
+ */ + blocking_notifier_call_chain(&sriov->vfs_ctx[vf].notifier, + MLX5_PF_NOTIFY_DISABLE_VF, dev); err = mlx5_core_disable_hca(dev, vf + 1); if (err) { mlx5_core_warn(dev, "failed to disable VF %d\n", vf); @@ -257,7 +267,7 @@ int mlx5_sriov_init(struct mlx5_core_dev *dev) { struct mlx5_core_sriov *sriov = &dev->priv.sriov; struct pci_dev *pdev = dev->pdev; - int total_vfs; + int total_vfs, i; if (!mlx5_core_is_pf(dev)) return 0; @@ -269,6 +279,9 @@ int mlx5_sriov_init(struct mlx5_core_dev *dev) if (!sriov->vfs_ctx) return -ENOMEM; + for (i = 0; i < total_vfs; i++) + BLOCKING_INIT_NOTIFIER_HEAD(&sriov->vfs_ctx[i].notifier); + return 0; } @@ -281,3 +294,53 @@ void mlx5_sriov_cleanup(struct mlx5_core_dev *dev) kfree(sriov->vfs_ctx); } + +/** + * mlx5_sriov_blocking_notifier_unregister - Unregister a VF from + * a notification block chain. + * + * @mdev: The mlx5 core device. + * @vf_id: The VF id. + * @nb: The notifier block to be unregistered. + */ +void mlx5_sriov_blocking_notifier_unregister(struct mlx5_core_dev *mdev, + int vf_id, + struct notifier_block *nb) +{ + struct mlx5_vf_context *vfs_ctx; + struct mlx5_core_sriov *sriov; + + sriov = &mdev->priv.sriov; + if (WARN_ON(vf_id < 0 || vf_id >= sriov->num_vfs)) + return; + + vfs_ctx = &sriov->vfs_ctx[vf_id]; + blocking_notifier_chain_unregister(&vfs_ctx->notifier, nb); +} +EXPORT_SYMBOL(mlx5_sriov_blocking_notifier_unregister); + +/** + * mlx5_sriov_blocking_notifier_register - Register a VF notification + * block chain. + * + * @mdev: The mlx5 core device. + * @vf_id: The VF id. + * @nb: The notifier block to be called upon the VF events. + * + * Returns 0 on success or an error code. + */ +int mlx5_sriov_blocking_notifier_register(struct mlx5_core_dev *mdev, + int vf_id, + struct notifier_block *nb) +{ + struct mlx5_vf_context *vfs_ctx; + struct mlx5_core_sriov *sriov; + + sriov = &mdev->priv.sriov; + if (vf_id < 0 || vf_id >= sriov->num_vfs) + return -EINVAL; + + vfs_ctx = &sriov->vfs_ctx[vf_id]; + return blocking_notifier_chain_register(&vfs_ctx->notifier, nb); +} +EXPORT_SYMBOL(mlx5_sriov_blocking_notifier_register); diff --git a/drivers/pci/controller/pcie-brcmstb.c b/drivers/pci/controller/pcie-brcmstb.c index 375c0c40bbf8..e61058e13818 100644 --- a/drivers/pci/controller/pcie-brcmstb.c +++ b/drivers/pci/controller/pcie-brcmstb.c @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include @@ -196,8 +195,6 @@ static inline void brcm_pcie_bridge_sw_init_set_generic(struct brcm_pcie *pcie, static inline void brcm_pcie_perst_set_4908(struct brcm_pcie *pcie, u32 val); static inline void brcm_pcie_perst_set_7278(struct brcm_pcie *pcie, u32 val); static inline void brcm_pcie_perst_set_generic(struct brcm_pcie *pcie, u32 val); -static int brcm_pcie_linkup(struct brcm_pcie *pcie); -static int brcm_pcie_add_bus(struct pci_bus *bus); enum { RGR1_SW_INIT_1, @@ -286,14 +283,6 @@ static const struct pcie_cfg_data bcm2711_cfg = { .bridge_sw_init_set = brcm_pcie_bridge_sw_init_set_generic, }; -struct subdev_regulators { - unsigned int num_supplies; - struct regulator_bulk_data supplies[]; -}; - -static int pci_subdev_regulators_add_bus(struct pci_bus *bus); -static void pci_subdev_regulators_remove_bus(struct pci_bus *bus); - struct brcm_msi { struct device *dev; void __iomem *base; @@ -331,9 +320,6 @@ struct brcm_pcie { u32 hw_rev; void (*perst_set)(struct brcm_pcie *pcie, u32 val); void (*bridge_sw_init_set)(struct brcm_pcie *pcie, u32 val); - bool refusal_mode; - struct subdev_regulators *sr; - bool ep_wakeup_capable; }; 
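For orientation, the mlx5/core/sriov.c hunks above add a per-VF blocking notifier chain: the PF calls blocking_notifier_call_chain() with MLX5_PF_NOTIFY_ENABLE_VF just before enabling a VF's HCA and with MLX5_PF_NOTIFY_DISABLE_VF just before disabling it, and exports a register/unregister pair. A minimal sketch of a consumer follows; the my_* names are hypothetical, and it assumes the prototypes and event enum land in linux/mlx5/driver.h (that header hunk is not shown here).

    /* Sketch of a hypothetical consumer of the per-VF notifier chain
     * added above; my_vf_ctx, my_vf_event() and my_vf_attach() are
     * illustrative names, not part of this series.
     */
    #include <linux/notifier.h>
    #include <linux/mlx5/driver.h>

    struct my_vf_ctx {
            struct notifier_block nb;
    };

    static int my_vf_event(struct notifier_block *nb, unsigned long event,
                           void *data)
    {
            switch (event) {
            case MLX5_PF_NOTIFY_ENABLE_VF:
                    /* Called just before the PF enables this VF's HCA. */
                    break;
            case MLX5_PF_NOTIFY_DISABLE_VF:
                    /* Called just before the PF disables this VF's HCA. */
                    break;
            }
            return NOTIFY_OK;
    }

    static int my_vf_attach(struct mlx5_core_dev *mdev, int vf_id,
                            struct my_vf_ctx *ctx)
    {
            ctx->nb.notifier_call = my_vf_event;
            /* Fails with -EINVAL when vf_id is out of range. */
            return mlx5_sriov_blocking_notifier_register(mdev, vf_id, &ctx->nb);
    }

Registration fails with -EINVAL when vf_id is outside [0, num_vfs), matching the range check in mlx5_sriov_blocking_notifier_register() above.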
static inline bool is_bmips(const struct brcm_pcie *pcie) @@ -450,99 +436,6 @@ static int brcm_pcie_set_ssc(struct brcm_pcie *pcie) return ssc && pll ? 0 : -EIO; } -static void *alloc_subdev_regulators(struct device *dev) -{ - static const char * const supplies[] = { - "vpcie3v3", - "vpcie3v3aux", - "vpcie12v", - }; - const size_t size = sizeof(struct subdev_regulators) - + sizeof(struct regulator_bulk_data) * ARRAY_SIZE(supplies); - struct subdev_regulators *sr; - int i; - - sr = devm_kzalloc(dev, size, GFP_KERNEL); - if (sr) { - sr->num_supplies = ARRAY_SIZE(supplies); - for (i = 0; i < ARRAY_SIZE(supplies); i++) - sr->supplies[i].supply = supplies[i]; - } - - return sr; -} - -static int pci_subdev_regulators_add_bus(struct pci_bus *bus) -{ - struct device *dev = &bus->dev; - struct subdev_regulators *sr; - int ret; - - if (!dev->of_node || !bus->parent || !pci_is_root_bus(bus->parent)) - return 0; - - if (dev->driver_data) - dev_err(dev, "dev.driver_data unexpectedly non-NULL\n"); - - sr = alloc_subdev_regulators(dev); - if (!sr) - return -ENOMEM; - - dev->driver_data = sr; - ret = regulator_bulk_get(dev, sr->num_supplies, sr->supplies); - if (ret) - return ret; - - ret = regulator_bulk_enable(sr->num_supplies, sr->supplies); - if (ret) { - dev_err(dev, "failed to enable regulators for downstream device\n"); - return ret; - } - - return 0; -} - -static int brcm_pcie_add_bus(struct pci_bus *bus) -{ - struct device *dev = &bus->dev; - struct brcm_pcie *pcie = (struct brcm_pcie *) bus->sysdata; - int ret; - - if (!dev->of_node || !bus->parent || !pci_is_root_bus(bus->parent)) - return 0; - - ret = pci_subdev_regulators_add_bus(bus); - if (ret) - return ret; - - /* Grab the regulators for suspend/resume */ - pcie->sr = bus->dev.driver_data; - - /* - * If we have failed linkup there is no point to return an error as - * currently it will cause a WARNING() from pci_alloc_child_bus(). - * We return 0 and turn on the "refusal_mode" so that any further - * accesses to the pci_dev just get 0xffffffff - */ - if (brcm_pcie_linkup(pcie) != 0) - pcie->refusal_mode = true; - - return 0; -} - -static void pci_subdev_regulators_remove_bus(struct pci_bus *bus) -{ - struct device *dev = &bus->dev; - struct subdev_regulators *sr = dev->driver_data; - - if (!sr || !bus->parent || !pci_is_root_bus(bus->parent)) - return; - - if (regulator_bulk_disable(sr->num_supplies, sr->supplies)) - dev_err(dev, "failed to disable regulators for downstream device\n"); - dev->driver_data = NULL; -} - /* Limits operation to a specific generation (1, 2, or 3) */ static void brcm_pcie_set_gen(struct brcm_pcie *pcie, int gen) { @@ -858,18 +751,6 @@ static void __iomem *brcm_pcie_map_conf(struct pci_bus *bus, unsigned int devfn, /* Accesses to the RC go right to the RC registers if slot==0 */ if (pci_is_root_bus(bus)) return PCI_SLOT(devfn) ? NULL : base + where; - if (pcie->refusal_mode) { - /* - * At this point we do not have link. There will be a CPU - * abort -- a quirk with this controller --if Linux tries - * to read any config-space registers besides those - * targeting the host bridge. To prevent this we hijack - * the address to point to a safe access that will return - * 0xffffffff. 
- */ - writel(0xffffffff, base + PCIE_MISC_RC_BAR2_CONFIG_HI); - return base + PCIE_MISC_RC_BAR2_CONFIG_HI + (where & 0x3); - } /* For devices, write to the config space index register */ idx = PCIE_ECAM_OFFSET(bus->number, devfn, 0); @@ -898,8 +779,6 @@ static struct pci_ops brcm_pcie_ops = { .map_bus = brcm_pcie_map_conf, .read = pci_generic_config_read, .write = pci_generic_config_write, - .add_bus = brcm_pcie_add_bus, - .remove_bus = pci_subdev_regulators_remove_bus, }; static struct pci_ops brcm_pcie_ops32 = { @@ -1047,9 +926,16 @@ static inline int brcm_pcie_get_rc_bar2_size_and_offset(struct brcm_pcie *pcie, static int brcm_pcie_setup(struct brcm_pcie *pcie) { + struct pci_host_bridge *bridge = pci_host_bridge_from_priv(pcie); u64 rc_bar2_offset, rc_bar2_size; void __iomem *base = pcie->base; - int ret, memc; + struct device *dev = pcie->dev; + struct resource_entry *entry; + bool ssc_good = false; + struct resource *res; + int num_out_wins = 0; + u16 nlw, cls, lnksta; + int i, ret, memc; u32 tmp, burst, aspm_support; /* Reset the bridge */ @@ -1139,40 +1025,6 @@ static int brcm_pcie_setup(struct brcm_pcie *pcie) if (pcie->gen) brcm_pcie_set_gen(pcie, pcie->gen); - /* Don't advertise L0s capability if 'aspm-no-l0s' */ - aspm_support = PCIE_LINK_STATE_L1; - if (!of_property_read_bool(pcie->np, "aspm-no-l0s")) - aspm_support |= PCIE_LINK_STATE_L0S; - tmp = readl(base + PCIE_RC_CFG_PRIV1_LINK_CAPABILITY); - u32p_replace_bits(&tmp, aspm_support, - PCIE_RC_CFG_PRIV1_LINK_CAPABILITY_ASPM_SUPPORT_MASK); - writel(tmp, base + PCIE_RC_CFG_PRIV1_LINK_CAPABILITY); - - /* - * For config space accesses on the RC, show the right class for - * a PCIe-PCIe bridge (the default setting is to be EP mode). - */ - tmp = readl(base + PCIE_RC_CFG_PRIV1_ID_VAL3); - u32p_replace_bits(&tmp, 0x060400, - PCIE_RC_CFG_PRIV1_ID_VAL3_CLASS_CODE_MASK); - writel(tmp, base + PCIE_RC_CFG_PRIV1_ID_VAL3); - - return 0; -} - -static int brcm_pcie_linkup(struct brcm_pcie *pcie) -{ - struct pci_host_bridge *bridge = pci_host_bridge_from_priv(pcie); - struct device *dev = pcie->dev; - void __iomem *base = pcie->base; - struct resource_entry *entry; - struct resource *res; - int num_out_wins = 0; - u16 nlw, cls, lnksta; - bool ssc_good = false; - u32 tmp; - int ret, i; - /* Unassert the fundamental reset */ pcie->perst_set(pcie, 0); @@ -1223,6 +1075,24 @@ static int brcm_pcie_linkup(struct brcm_pcie *pcie) num_out_wins++; } + /* Don't advertise L0s capability if 'aspm-no-l0s' */ + aspm_support = PCIE_LINK_STATE_L1; + if (!of_property_read_bool(pcie->np, "aspm-no-l0s")) + aspm_support |= PCIE_LINK_STATE_L0S; + tmp = readl(base + PCIE_RC_CFG_PRIV1_LINK_CAPABILITY); + u32p_replace_bits(&tmp, aspm_support, + PCIE_RC_CFG_PRIV1_LINK_CAPABILITY_ASPM_SUPPORT_MASK); + writel(tmp, base + PCIE_RC_CFG_PRIV1_LINK_CAPABILITY); + + /* + * For config space accesses on the RC, show the right class for + * a PCIe-PCIe bridge (the default setting is to be EP mode). 
+ */ + tmp = readl(base + PCIE_RC_CFG_PRIV1_ID_VAL3); + u32p_replace_bits(&tmp, 0x060400, + PCIE_RC_CFG_PRIV1_ID_VAL3_CLASS_CODE_MASK); + writel(tmp, base + PCIE_RC_CFG_PRIV1_ID_VAL3); + if (pcie->ssc) { ret = brcm_pcie_set_ssc(pcie); if (ret == 0) @@ -1351,21 +1221,9 @@ static void brcm_pcie_turn_off(struct brcm_pcie *pcie) pcie->bridge_sw_init_set(pcie, 1); } -static int pci_dev_may_wakeup(struct pci_dev *dev, void *data) -{ - bool *ret = data; - - if (device_may_wakeup(&dev->dev)) { - *ret = true; - dev_info(&dev->dev, "disable cancelled for wake-up device\n"); - } - return (int) *ret; -} - static int brcm_pcie_suspend(struct device *dev) { struct brcm_pcie *pcie = dev_get_drvdata(dev); - struct pci_host_bridge *bridge = pci_host_bridge_from_priv(pcie); int ret; brcm_pcie_turn_off(pcie); @@ -1383,25 +1241,6 @@ static int brcm_pcie_suspend(struct device *dev) return ret; } - if (pcie->sr) { - /* - * Now turn off the regulators, but if at least one - * downstream device is enabled as a wake-up source, do not - * turn off regulators. - */ - pcie->ep_wakeup_capable = false; - pci_walk_bus(bridge->bus, pci_dev_may_wakeup, - &pcie->ep_wakeup_capable); - if (!pcie->ep_wakeup_capable) { - ret = regulator_bulk_disable(pcie->sr->num_supplies, - pcie->sr->supplies); - if (ret) { - dev_err(dev, "Could not turn off regulators\n"); - reset_control_reset(pcie->rescal); - return ret; - } - } - } clk_disable_unprepare(pcie->clk); return 0; @@ -1419,28 +1258,9 @@ static int brcm_pcie_resume(struct device *dev) if (ret) return ret; - if (pcie->sr) { - if (pcie->ep_wakeup_capable) { - /* - * We are resuming from a suspend. In the suspend we - * did not disable the power supplies, so there is - * no need to enable them (and falsely increase their - * usage count). - */ - pcie->ep_wakeup_capable = false; - } else { - ret = regulator_bulk_enable(pcie->sr->num_supplies, - pcie->sr->supplies); - if (ret) { - dev_err(dev, "Could not turn on regulators\n"); - goto err_disable_clk; - } - } - } - ret = reset_control_reset(pcie->rescal); if (ret) - goto err_regulator; + goto err_disable_clk; ret = brcm_phy_start(pcie); if (ret) @@ -1461,10 +1281,6 @@ static int brcm_pcie_resume(struct device *dev) if (ret) goto err_reset; - ret = brcm_pcie_linkup(pcie); - if (ret) - goto err_reset; - if (pcie->msi) brcm_msi_set_regs(pcie->msi); @@ -1472,9 +1288,6 @@ static int brcm_pcie_resume(struct device *dev) err_reset: reset_control_rearm(pcie->rescal); -err_regulator: - if (pcie->sr) - regulator_bulk_disable(pcie->sr->num_supplies, pcie->sr->supplies); err_disable_clk: clk_disable_unprepare(pcie->clk); return ret; @@ -1606,17 +1419,7 @@ static int brcm_pcie_probe(struct platform_device *pdev) platform_set_drvdata(pdev, pcie); - ret = pci_host_probe(bridge); - if (!ret && !brcm_pcie_link_up(pcie)) - ret = -ENODEV; - - if (ret) { - brcm_pcie_remove(pdev); - return ret; - } - - return 0; - + return pci_host_probe(bridge); fail: __brcm_pcie_remove(pcie); return ret; @@ -1625,8 +1428,8 @@ fail: MODULE_DEVICE_TABLE(of, brcm_pcie_match); static const struct dev_pm_ops brcm_pcie_pm_ops = { - .suspend_noirq = brcm_pcie_suspend, - .resume_noirq = brcm_pcie_resume, + .suspend = brcm_pcie_suspend, + .resume = brcm_pcie_resume, }; static struct platform_driver brcm_pcie_driver = { diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 876182873468..cfaf40a540a8 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -2967,6 +2967,8 @@ static const struct dmi_system_id bridge_d3_blacklist[] = { DMI_MATCH(DMI_BOARD_VENDOR, "Gigabyte 
Technology Co., Ltd."), DMI_MATCH(DMI_BOARD_NAME, "X299 DESIGNARE EX-CF"), }, + }, + { /* * Downstream device is not accessible after putting a root port * into D3cold and back into D0 on Elo i2. diff --git a/drivers/pcmcia/Kconfig b/drivers/pcmcia/Kconfig index ec977f031bc2..bf495bf0f48a 100644 --- a/drivers/pcmcia/Kconfig +++ b/drivers/pcmcia/Kconfig @@ -151,7 +151,7 @@ config TCIC config PCMCIA_ALCHEMY_DEVBOARD tristate "Alchemy Db/Pb1xxx PCMCIA socket services" - depends on MIPS_ALCHEMY && PCMCIA + depends on MIPS_DB1XXX && PCMCIA help Enable this driver if you want PCMCIA support on your Alchemy Db1000, Db/Pb1100, Db/Pb1500, Db/Pb1550, Db/Pb1200, DB1300 diff --git a/drivers/pcmcia/bcm63xx_pcmcia.c b/drivers/pcmcia/bcm63xx_pcmcia.c index 16f573173471..bb06311d0b5f 100644 --- a/drivers/pcmcia/bcm63xx_pcmcia.c +++ b/drivers/pcmcia/bcm63xx_pcmcia.c @@ -327,10 +327,11 @@ static int bcm63xx_drv_pcmcia_probe(struct platform_device *pdev) { struct bcm63xx_pcmcia_socket *skt; struct pcmcia_socket *sock; - struct resource *res, *irq_res; + struct resource *res; unsigned int regmem_size = 0, iomem_size = 0; u32 val; int ret; + int irq; skt = kzalloc(sizeof(*skt), GFP_KERNEL); if (!skt) @@ -342,9 +343,9 @@ static int bcm63xx_drv_pcmcia_probe(struct platform_device *pdev) /* make sure we have all resources we need */ skt->common_res = platform_get_resource(pdev, IORESOURCE_MEM, 1); skt->attr_res = platform_get_resource(pdev, IORESOURCE_MEM, 2); - irq_res = platform_get_resource(pdev, IORESOURCE_IRQ, 0); + irq = platform_get_irq(pdev, 0); skt->pd = pdev->dev.platform_data; - if (!skt->common_res || !skt->attr_res || !irq_res || !skt->pd) { + if (!skt->common_res || !skt->attr_res || (irq < 0) || !skt->pd) { ret = -EINVAL; goto err; } @@ -380,7 +381,7 @@ static int bcm63xx_drv_pcmcia_probe(struct platform_device *pdev) sock->dev.parent = &pdev->dev; sock->features = SS_CAP_STATIC_MAP | SS_CAP_PCCARD; sock->io_offset = (unsigned long)skt->io_base; - sock->pci_irq = irq_res->start; + sock->pci_irq = irq; #ifdef CONFIG_CARDBUS sock->cb_dev = bcm63xx_cb_dev; diff --git a/drivers/pcmcia/rsrc_nonstatic.c b/drivers/pcmcia/rsrc_nonstatic.c index 6b6c578b5f92..ad1141fddb4c 100644 --- a/drivers/pcmcia/rsrc_nonstatic.c +++ b/drivers/pcmcia/rsrc_nonstatic.c @@ -394,7 +394,7 @@ static int do_validate_mem(struct pcmcia_socket *s, * do_mem_probe() checks a memory region for use by the PCMCIA subsystem. * To do so, the area is split up into sensible parts, and then passed * into the @validate() function. Only if @validate() and @fallback() fail, - * the area is marked as unavaibale for use by the PCMCIA subsystem. The + * the area is marked as unavailable for use by the PCMCIA subsystem. The * function returns the size of the usable memory area.
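The bcm63xx_pcmcia hunk above is the standard migration from platform_get_resource(pdev, IORESOURCE_IRQ, 0) to platform_get_irq(), which also works when the interrupt comes from devicetree or ACPI and can propagate -EPROBE_DEFER. A minimal sketch of the pattern, using a hypothetical demo_probe():

    #include <linux/platform_device.h>

    static int demo_probe(struct platform_device *pdev)
    {
            int irq;

            /*
             * platform_get_irq() returns the interrupt number on
             * success and a negative errno (possibly -EPROBE_DEFER)
             * on failure, so a simple < 0 check is sufficient.
             */
            irq = platform_get_irq(pdev, 0);
            if (irq < 0)
                    return irq;

            /* ... request the irq and set up the device ... */
            return 0;
    }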
*/ static int do_mem_probe(struct pcmcia_socket *s, u_long base, u_long num, diff --git a/drivers/power/supply/ab8500_fg.c b/drivers/power/supply/ab8500_fg.c index 97ac588a9e9c..ec8a404d71b4 100644 --- a/drivers/power/supply/ab8500_fg.c +++ b/drivers/power/supply/ab8500_fg.c @@ -3037,13 +3037,6 @@ static int ab8500_fg_bind(struct device *dev, struct device *master, { struct ab8500_fg *di = dev_get_drvdata(dev); - /* Create a work queue for running the FG algorithm */ - di->fg_wq = alloc_ordered_workqueue("ab8500_fg_wq", WQ_MEM_RECLAIM); - if (di->fg_wq == NULL) { - dev_err(dev, "failed to create work queue\n"); - return -ENOMEM; - } - di->bat_cap.max_mah_design = di->bm->bi->charge_full_design_uah; di->bat_cap.max_mah = di->bat_cap.max_mah_design; di->vbat_nom_uv = di->bm->bi->voltage_max_design_uv; @@ -3067,8 +3060,7 @@ static void ab8500_fg_unbind(struct device *dev, struct device *master, if (ret) dev_err(dev, "failed to disable coulomb counter\n"); - destroy_workqueue(di->fg_wq); - flush_scheduled_work(); + flush_workqueue(di->fg_wq); } static const struct component_ops ab8500_fg_component_ops = { @@ -3117,6 +3109,13 @@ static int ab8500_fg_probe(struct platform_device *pdev) ab8500_fg_charge_state_to(di, AB8500_FG_CHARGE_INIT); ab8500_fg_discharge_state_to(di, AB8500_FG_DISCHARGE_INIT); + /* Create a work queue for running the FG algorithm */ + di->fg_wq = alloc_ordered_workqueue("ab8500_fg_wq", WQ_MEM_RECLAIM); + if (di->fg_wq == NULL) { + dev_err(dev, "failed to create work queue\n"); + return -ENOMEM; + } + /* Init work for running the fg algorithm instantly */ INIT_WORK(&di->fg_work, ab8500_fg_instant_work); @@ -3227,6 +3226,8 @@ static int ab8500_fg_remove(struct platform_device *pdev) { struct ab8500_fg *di = platform_get_drvdata(pdev); + destroy_workqueue(di->fg_wq); + flush_scheduled_work(); component_del(&pdev->dev, &ab8500_fg_component_ops); list_del(&di->node); ab8500_fg_sysfs_exit(di); diff --git a/drivers/power/supply/axp288_fuel_gauge.c b/drivers/power/supply/axp288_fuel_gauge.c index e9f285dae489..8e6f8a655079 100644 --- a/drivers/power/supply/axp288_fuel_gauge.c +++ b/drivers/power/supply/axp288_fuel_gauge.c @@ -90,6 +90,8 @@ #define AXP288_REG_UPDATE_INTERVAL (60 * HZ) #define AXP288_FG_INTR_NUM 6 +#define AXP288_QUIRK_NO_BATTERY BIT(0) + static bool no_current_sense_res; module_param(no_current_sense_res, bool, 0444); MODULE_PARM_DESC(no_current_sense_res, "No (or broken) current sense resistor"); @@ -524,7 +526,7 @@ static struct power_supply_desc fuel_gauge_desc = { * detection reports one despite it not being there. * Please keep this list sorted alphabetically.
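The hunks that follow convert the axp288 "no battery" DMI blocklist into a quirk table whose driver_data carries flag bits (AXP288_QUIRK_NO_BATTERY above), looked up once at probe time. A compact sketch of that dmi_first_match() pattern, using hypothetical demo_* names:

    #include <linux/bits.h>
    #include <linux/dmi.h>

    #define DEMO_QUIRK_NO_BATTERY   BIT(0)

    static const struct dmi_system_id demo_quirks[] = {
            {
                    /* Entries match on DMI strings; flags ride in driver_data. */
                    .matches = {
                            DMI_MATCH(DMI_SYS_VENDOR, "Some Vendor"),
                    },
                    .driver_data = (void *)DEMO_QUIRK_NO_BATTERY,
            },
            {} /* terminator */
    };

    static unsigned long demo_get_quirks(void)
    {
            const struct dmi_system_id *id = dmi_first_match(demo_quirks);

            /* dmi_first_match() returns NULL when no entry matches. */
            return id ? (unsigned long)id->driver_data : 0;
    }

Because dmi_first_match() stops at the first hit, the generic catch-all entry must stay last so that more specific entries can attach different quirks, as the comment in the table notes.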
*/ -static const struct dmi_system_id axp288_no_battery_list[] = { +static const struct dmi_system_id axp288_quirks[] = { { /* ACEPC T8 Cherry Trail Z8350 mini PC */ .matches = { @@ -534,6 +536,7 @@ static const struct dmi_system_id axp288_no_battery_list[] = { /* also match on somewhat unique bios-version */ DMI_EXACT_MATCH(DMI_BIOS_VERSION, "1.000"), }, + .driver_data = (void *)AXP288_QUIRK_NO_BATTERY, }, { /* ACEPC T11 Cherry Trail Z8350 mini PC */ @@ -544,6 +547,7 @@ static const struct dmi_system_id axp288_no_battery_list[] = { /* also match on somewhat unique bios-version */ DMI_EXACT_MATCH(DMI_BIOS_VERSION, "1.000"), }, + .driver_data = (void *)AXP288_QUIRK_NO_BATTERY, }, { /* Intel Cherry Trail Compute Stick, Windows version */ @@ -551,6 +555,7 @@ static const struct dmi_system_id axp288_no_battery_list[] = { DMI_MATCH(DMI_SYS_VENDOR, "Intel"), DMI_MATCH(DMI_PRODUCT_NAME, "STK1AW32SC"), }, + .driver_data = (void *)AXP288_QUIRK_NO_BATTERY, }, { /* Intel Cherry Trail Compute Stick, version without an OS */ @@ -558,34 +563,54 @@ static const struct dmi_system_id axp288_no_battery_list[] = { DMI_MATCH(DMI_SYS_VENDOR, "Intel"), DMI_MATCH(DMI_PRODUCT_NAME, "STK1A32SC"), }, + .driver_data = (void *)AXP288_QUIRK_NO_BATTERY, }, { /* Meegopad T02 */ .matches = { DMI_MATCH(DMI_PRODUCT_NAME, "MEEGOPAD T02"), }, + .driver_data = (void *)AXP288_QUIRK_NO_BATTERY, }, { /* Mele PCG03 Mini PC */ .matches = { DMI_EXACT_MATCH(DMI_BOARD_VENDOR, "Mini PC"), DMI_EXACT_MATCH(DMI_BOARD_NAME, "Mini PC"), }, + .driver_data = (void *)AXP288_QUIRK_NO_BATTERY, }, { /* Minix Neo Z83-4 mini PC */ .matches = { DMI_MATCH(DMI_SYS_VENDOR, "MINIX"), DMI_MATCH(DMI_PRODUCT_NAME, "Z83-4"), - } + }, + .driver_data = (void *)AXP288_QUIRK_NO_BATTERY, }, { - /* Various Ace PC/Meegopad/MinisForum/Wintel Mini-PCs/HDMI-sticks */ + /* + * One Mix 1, this uses the "T3 MRD" boardname used by + * generic mini PCs, but it is a mini laptop so it does + * actually have a battery! + */ + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "T3 MRD"), + DMI_MATCH(DMI_BIOS_DATE, "06/14/2018"), + }, + .driver_data = NULL, + }, + { + /* + * Various Ace PC/Meegopad/MinisForum/Wintel Mini-PCs/HDMI-sticks + * This entry must be last because it is generic, this allows + * adding more specific quirks overriding this generic entry.
+ */ .matches = { DMI_MATCH(DMI_BOARD_NAME, "T3 MRD"), DMI_MATCH(DMI_CHASSIS_TYPE, "3"), DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."), - DMI_MATCH(DMI_BIOS_VERSION, "5.11"), }, + .driver_data = (void *)AXP288_QUIRK_NO_BATTERY, }, {} }; @@ -665,7 +690,9 @@ static int axp288_fuel_gauge_probe(struct platform_device *pdev) [BAT_D_CURR] = "axp288-chrg-d-curr", [BAT_VOLT] = "axp288-batt-volt", }; + const struct dmi_system_id *dmi_id; struct device *dev = &pdev->dev; + unsigned long quirks = 0; int i, pirq, ret; /* @@ -675,7 +702,11 @@ static int axp288_fuel_gauge_probe(struct platform_device *pdev) if (!acpi_quirk_skip_acpi_ac_and_battery()) return -ENODEV; - if (dmi_check_system(axp288_no_battery_list)) + dmi_id = dmi_first_match(axp288_quirks); + if (dmi_id) + quirks = (unsigned long)dmi_id->driver_data; + + if (quirks & AXP288_QUIRK_NO_BATTERY) return -ENODEV; info = devm_kzalloc(dev, sizeof(*info), GFP_KERNEL); diff --git a/drivers/power/supply/bq24190_charger.c b/drivers/power/supply/bq24190_charger.c index aa1a589eb9f2..27f5c7648617 100644 --- a/drivers/power/supply/bq24190_charger.c +++ b/drivers/power/supply/bq24190_charger.c @@ -455,11 +455,9 @@ static ssize_t bq24190_sysfs_show(struct device *dev, if (!info) return -EINVAL; - ret = pm_runtime_get_sync(bdi->dev); - if (ret < 0) { - pm_runtime_put_noidle(bdi->dev); + ret = pm_runtime_resume_and_get(bdi->dev); + if (ret < 0) return ret; - } ret = bq24190_read_mask(bdi, info->reg, info->mask, info->shift, &v); if (ret) @@ -490,11 +488,9 @@ static ssize_t bq24190_sysfs_store(struct device *dev, if (ret < 0) return ret; - ret = pm_runtime_get_sync(bdi->dev); - if (ret < 0) { - pm_runtime_put_noidle(bdi->dev); + ret = pm_runtime_resume_and_get(bdi->dev); + if (ret < 0) return ret; - } ret = bq24190_write_mask(bdi, info->reg, info->mask, info->shift, v); if (ret) @@ -512,10 +508,9 @@ static int bq24190_set_otg_vbus(struct bq24190_dev_info *bdi, bool enable) union power_supply_propval val = { .intval = bdi->charge_type }; int ret; - ret = pm_runtime_get_sync(bdi->dev); + ret = pm_runtime_resume_and_get(bdi->dev); if (ret < 0) { dev_warn(bdi->dev, "pm_runtime_get failed: %i\n", ret); - pm_runtime_put_noidle(bdi->dev); return ret; } @@ -551,10 +546,9 @@ static int bq24190_vbus_is_enabled(struct regulator_dev *dev) int ret; u8 val; - ret = pm_runtime_get_sync(bdi->dev); + ret = pm_runtime_resume_and_get(bdi->dev); if (ret < 0) { dev_warn(bdi->dev, "pm_runtime_get failed: %i\n", ret); - pm_runtime_put_noidle(bdi->dev); return ret; } @@ -1128,11 +1122,9 @@ static int bq24190_charger_get_property(struct power_supply *psy, dev_dbg(bdi->dev, "prop: %d\n", psp); - ret = pm_runtime_get_sync(bdi->dev); - if (ret < 0) { - pm_runtime_put_noidle(bdi->dev); + ret = pm_runtime_resume_and_get(bdi->dev); + if (ret < 0) return ret; - } switch (psp) { case POWER_SUPPLY_PROP_CHARGE_TYPE: @@ -1204,11 +1196,9 @@ static int bq24190_charger_set_property(struct power_supply *psy, dev_dbg(bdi->dev, "prop: %d\n", psp); - ret = pm_runtime_get_sync(bdi->dev); - if (ret < 0) { - pm_runtime_put_noidle(bdi->dev); + ret = pm_runtime_resume_and_get(bdi->dev); + if (ret < 0) return ret; - } switch (psp) { case POWER_SUPPLY_PROP_ONLINE: @@ -1477,11 +1467,9 @@ static int bq24190_battery_get_property(struct power_supply *psy, dev_warn(bdi->dev, "warning: /sys/class/power_supply/bq24190-battery is deprecated\n"); dev_dbg(bdi->dev, "prop: %d\n", psp); - ret = pm_runtime_get_sync(bdi->dev); - if (ret < 0) { - pm_runtime_put_noidle(bdi->dev); + ret = 
pm_runtime_resume_and_get(bdi->dev); + if (ret < 0) return ret; - } switch (psp) { case POWER_SUPPLY_PROP_STATUS: @@ -1525,11 +1513,9 @@ static int bq24190_battery_set_property(struct power_supply *psy, dev_warn(bdi->dev, "warning: /sys/class/power_supply/bq24190-battery is deprecated\n"); dev_dbg(bdi->dev, "prop: %d\n", psp); - ret = pm_runtime_get_sync(bdi->dev); - if (ret < 0) { - pm_runtime_put_noidle(bdi->dev); + ret = pm_runtime_resume_and_get(bdi->dev); + if (ret < 0) return ret; - } switch (psp) { case POWER_SUPPLY_PROP_ONLINE: @@ -1683,10 +1669,9 @@ static irqreturn_t bq24190_irq_handler_thread(int irq, void *data) int error; bdi->irq_event = true; - error = pm_runtime_get_sync(bdi->dev); + error = pm_runtime_resume_and_get(bdi->dev); if (error < 0) { dev_warn(bdi->dev, "pm_runtime_get failed: %i\n", error); - pm_runtime_put_noidle(bdi->dev); return IRQ_NONE; } bq24190_check_status(bdi); @@ -1921,11 +1906,9 @@ static int bq24190_remove(struct i2c_client *client) struct bq24190_dev_info *bdi = i2c_get_clientdata(client); int error; - error = pm_runtime_get_sync(bdi->dev); - if (error < 0) { + error = pm_runtime_resume_and_get(bdi->dev); + if (error < 0) dev_warn(bdi->dev, "pm_runtime_get failed: %i\n", error); - pm_runtime_put_noidle(bdi->dev); - } bq24190_register_reset(bdi); if (bdi->battery) @@ -1982,11 +1965,9 @@ static __maybe_unused int bq24190_pm_suspend(struct device *dev) struct bq24190_dev_info *bdi = i2c_get_clientdata(client); int error; - error = pm_runtime_get_sync(bdi->dev); - if (error < 0) { + error = pm_runtime_resume_and_get(bdi->dev); + if (error < 0) dev_warn(bdi->dev, "pm_runtime_get failed: %i\n", error); - pm_runtime_put_noidle(bdi->dev); - } bq24190_register_reset(bdi); @@ -2007,11 +1988,9 @@ static __maybe_unused int bq24190_pm_resume(struct device *dev) bdi->f_reg = 0; bdi->ss_reg = BQ24190_REG_SS_VBUS_STAT_MASK; /* impossible state */ - error = pm_runtime_get_sync(bdi->dev); - if (error < 0) { + error = pm_runtime_resume_and_get(bdi->dev); + if (error < 0) dev_warn(bdi->dev, "pm_runtime_get failed: %i\n", error); - pm_runtime_put_noidle(bdi->dev); - } bq24190_register_reset(bdi); bq24190_set_config(bdi); diff --git a/drivers/power/supply/bq27xxx_battery.c b/drivers/power/supply/bq27xxx_battery.c index 72e727cd31e8..35e6a394c0df 100644 --- a/drivers/power/supply/bq27xxx_battery.c +++ b/drivers/power/supply/bq27xxx_battery.c @@ -1572,14 +1572,6 @@ static int bq27xxx_battery_read_charge(struct bq27xxx_device_info *di, u8 reg) */ static inline int bq27xxx_battery_read_nac(struct bq27xxx_device_info *di) { - int flags; - - if (di->opts & BQ27XXX_O_ZERO) { - flags = bq27xxx_read(di, BQ27XXX_REG_FLAGS, true); - if (flags >= 0 && (flags & BQ27000_FLAG_CI)) - return -ENODATA; - } - return bq27xxx_battery_read_charge(di, BQ27XXX_REG_NAC); } @@ -1742,6 +1734,18 @@ static bool bq27xxx_battery_dead(struct bq27xxx_device_info *di, u16 flags) return flags & (BQ27XXX_FLAG_SOC1 | BQ27XXX_FLAG_SOCF); } +/* + * Returns true if reported battery capacity is inaccurate + */ +static bool bq27xxx_battery_capacity_inaccurate(struct bq27xxx_device_info *di, + u16 flags) +{ + if (di->opts & BQ27XXX_O_HAS_CI) + return (flags & BQ27000_FLAG_CI); + else + return false; +} + static int bq27xxx_battery_read_health(struct bq27xxx_device_info *di) { /* Unlikely but important to return first */ @@ -1751,6 +1755,8 @@ static int bq27xxx_battery_read_health(struct bq27xxx_device_info *di) return POWER_SUPPLY_HEALTH_COLD; if (unlikely(bq27xxx_battery_dead(di, di->cache.flags))) return 
POWER_SUPPLY_HEALTH_DEAD; + if (unlikely(bq27xxx_battery_capacity_inaccurate(di, di->cache.flags))) + return POWER_SUPPLY_HEALTH_CALIBRATION_REQUIRED; return POWER_SUPPLY_HEALTH_GOOD; } @@ -1758,7 +1764,6 @@ static int bq27xxx_battery_read_health(struct bq27xxx_device_info *di) void bq27xxx_battery_update(struct bq27xxx_device_info *di) { struct bq27xxx_reg_cache cache = {0, }; - bool has_ci_flag = di->opts & BQ27XXX_O_HAS_CI; bool has_singe_flag = di->opts & BQ27XXX_O_ZERO; cache.flags = bq27xxx_read(di, BQ27XXX_REG_FLAGS, has_singe_flag); @@ -1766,30 +1771,19 @@ void bq27xxx_battery_update(struct bq27xxx_device_info *di) cache.flags = -1; /* read error */ if (cache.flags >= 0) { cache.temperature = bq27xxx_battery_read_temperature(di); - if (has_ci_flag && (cache.flags & BQ27000_FLAG_CI)) { - dev_info_once(di->dev, "battery is not calibrated! ignoring capacity values\n"); - cache.capacity = -ENODATA; - cache.energy = -ENODATA; - cache.time_to_empty = -ENODATA; - cache.time_to_empty_avg = -ENODATA; - cache.time_to_full = -ENODATA; - cache.charge_full = -ENODATA; - cache.health = -ENODATA; - } else { - if (di->regs[BQ27XXX_REG_TTE] != INVALID_REG_ADDR) - cache.time_to_empty = bq27xxx_battery_read_time(di, BQ27XXX_REG_TTE); - if (di->regs[BQ27XXX_REG_TTECP] != INVALID_REG_ADDR) - cache.time_to_empty_avg = bq27xxx_battery_read_time(di, BQ27XXX_REG_TTECP); - if (di->regs[BQ27XXX_REG_TTF] != INVALID_REG_ADDR) - cache.time_to_full = bq27xxx_battery_read_time(di, BQ27XXX_REG_TTF); + if (di->regs[BQ27XXX_REG_TTE] != INVALID_REG_ADDR) + cache.time_to_empty = bq27xxx_battery_read_time(di, BQ27XXX_REG_TTE); + if (di->regs[BQ27XXX_REG_TTECP] != INVALID_REG_ADDR) + cache.time_to_empty_avg = bq27xxx_battery_read_time(di, BQ27XXX_REG_TTECP); + if (di->regs[BQ27XXX_REG_TTF] != INVALID_REG_ADDR) + cache.time_to_full = bq27xxx_battery_read_time(di, BQ27XXX_REG_TTF); - cache.charge_full = bq27xxx_battery_read_fcc(di); - cache.capacity = bq27xxx_battery_read_soc(di); - if (di->regs[BQ27XXX_REG_AE] != INVALID_REG_ADDR) - cache.energy = bq27xxx_battery_read_energy(di); - di->cache.flags = cache.flags; - cache.health = bq27xxx_battery_read_health(di); - } + cache.charge_full = bq27xxx_battery_read_fcc(di); + cache.capacity = bq27xxx_battery_read_soc(di); + if (di->regs[BQ27XXX_REG_AE] != INVALID_REG_ADDR) + cache.energy = bq27xxx_battery_read_energy(di); + di->cache.flags = cache.flags; + cache.health = bq27xxx_battery_read_health(di); if (di->regs[BQ27XXX_REG_CYCT] != INVALID_REG_ADDR) cache.cycle_count = bq27xxx_battery_read_cyct(di); diff --git a/drivers/power/supply/power_supply_core.c b/drivers/power/supply/power_supply_core.c index c887678b3d33..f9ee0807ef68 100644 --- a/drivers/power/supply/power_supply_core.c +++ b/drivers/power/supply/power_supply_core.c @@ -694,7 +694,7 @@ int power_supply_get_battery_info(struct power_supply *psy, goto out_put_node; } - info = devm_kmalloc(&psy->dev, sizeof(*info), GFP_KERNEL); + info = devm_kzalloc(&psy->dev, sizeof(*info), GFP_KERNEL); if (!info) { err = -ENOMEM; goto out_put_node; diff --git a/drivers/pwm/Kconfig b/drivers/pwm/Kconfig index 21e3b05a5153..904de8d61828 100644 --- a/drivers/pwm/Kconfig +++ b/drivers/pwm/Kconfig @@ -572,6 +572,17 @@ config PWM_SUN4I To compile this driver as a module, choose M here: the module will be called pwm-sun4i. +config PWM_SUNPLUS + tristate "Sunplus PWM support" + depends on ARCH_SUNPLUS || COMPILE_TEST + depends on HAS_IOMEM && OF + help + Generic PWM framework driver for the PWM controller on + Sunplus SoCs. 
+ + To compile this driver as a module, choose M here: the module + will be called pwm-sunplus. + config PWM_TEGRA tristate "NVIDIA Tegra PWM support" depends on ARCH_TEGRA || COMPILE_TEST @@ -640,4 +651,18 @@ config PWM_VT8500 To compile this driver as a module, choose M here: the module will be called pwm-vt8500. +config PWM_XILINX + tristate "Xilinx AXI Timer PWM support" + depends on OF_ADDRESS + depends on COMMON_CLK + select REGMAP_MMIO + help + PWM driver for Xilinx LogiCORE IP AXI timers. This timer is + typically a soft core which may be present in Xilinx FPGAs. + This device may also be present in Microblaze soft processors. + If you don't have this IP in your design, choose N. + + To compile this driver as a module, choose M here: the module + will be called pwm-xilinx. + endif diff --git a/drivers/pwm/Makefile b/drivers/pwm/Makefile index 708840b7fba8..5c08bdb817b4 100644 --- a/drivers/pwm/Makefile +++ b/drivers/pwm/Makefile @@ -53,6 +53,7 @@ obj-$(CONFIG_PWM_STM32) += pwm-stm32.o obj-$(CONFIG_PWM_STM32_LP) += pwm-stm32-lp.o obj-$(CONFIG_PWM_STMPE) += pwm-stmpe.o obj-$(CONFIG_PWM_SUN4I) += pwm-sun4i.o +obj-$(CONFIG_PWM_SUNPLUS) += pwm-sunplus.o obj-$(CONFIG_PWM_TEGRA) += pwm-tegra.o obj-$(CONFIG_PWM_TIECAP) += pwm-tiecap.o obj-$(CONFIG_PWM_TIEHRPWM) += pwm-tiehrpwm.o @@ -60,3 +61,4 @@ obj-$(CONFIG_PWM_TWL) += pwm-twl.o obj-$(CONFIG_PWM_TWL_LED) += pwm-twl-led.o obj-$(CONFIG_PWM_VISCONTI) += pwm-visconti.o obj-$(CONFIG_PWM_VT8500) += pwm-vt8500.o +obj-$(CONFIG_PWM_XILINX) += pwm-xilinx.o diff --git a/drivers/pwm/pwm-atmel-tcb.c b/drivers/pwm/pwm-atmel-tcb.c index 36f7ea381838..3977a0f9d132 100644 --- a/drivers/pwm/pwm-atmel-tcb.c +++ b/drivers/pwm/pwm-atmel-tcb.c @@ -61,7 +61,7 @@ struct atmel_tcb_pwm_chip { struct atmel_tcb_channel bkup; }; -const u8 atmel_tcb_divisors[] = { 2, 8, 32, 128, 0, }; +static const u8 atmel_tcb_divisors[] = { 2, 8, 32, 128, 0, }; static inline struct atmel_tcb_pwm_chip *to_tcb_chip(struct pwm_chip *chip) { @@ -72,7 +72,8 @@ static int atmel_tcb_pwm_set_polarity(struct pwm_chip *chip, struct pwm_device *pwm, enum pwm_polarity polarity) { - struct atmel_tcb_pwm_device *tcbpwm = pwm_get_chip_data(pwm); + struct atmel_tcb_pwm_chip *tcbpwmc = to_tcb_chip(chip); + struct atmel_tcb_pwm_device *tcbpwm = tcbpwmc->pwms[pwm->hwpwm]; tcbpwm->polarity = polarity; @@ -97,7 +98,6 @@ static int atmel_tcb_pwm_request(struct pwm_chip *chip, return ret; } - pwm_set_chip_data(pwm, tcbpwm); tcbpwm->polarity = PWM_POLARITY_NORMAL; tcbpwm->duty = 0; tcbpwm->period = 0; @@ -139,7 +139,7 @@ static int atmel_tcb_pwm_request(struct pwm_chip *chip, static void atmel_tcb_pwm_free(struct pwm_chip *chip, struct pwm_device *pwm) { struct atmel_tcb_pwm_chip *tcbpwmc = to_tcb_chip(chip); - struct atmel_tcb_pwm_device *tcbpwm = pwm_get_chip_data(pwm); + struct atmel_tcb_pwm_device *tcbpwm = tcbpwmc->pwms[pwm->hwpwm]; clk_disable_unprepare(tcbpwmc->clk); tcbpwmc->pwms[pwm->hwpwm] = NULL; @@ -149,7 +149,7 @@ static void atmel_tcb_pwm_free(struct pwm_chip *chip, struct pwm_device *pwm) static void atmel_tcb_pwm_disable(struct pwm_chip *chip, struct pwm_device *pwm) { struct atmel_tcb_pwm_chip *tcbpwmc = to_tcb_chip(chip); - struct atmel_tcb_pwm_device *tcbpwm = pwm_get_chip_data(pwm); + struct atmel_tcb_pwm_device *tcbpwm = tcbpwmc->pwms[pwm->hwpwm]; unsigned cmr; enum pwm_polarity polarity = tcbpwm->polarity; @@ -206,7 +206,7 @@ static void atmel_tcb_pwm_disable(struct pwm_chip *chip, struct pwm_device *pwm) static int atmel_tcb_pwm_enable(struct pwm_chip *chip, struct pwm_device *pwm) 
{ struct atmel_tcb_pwm_chip *tcbpwmc = to_tcb_chip(chip); - struct atmel_tcb_pwm_device *tcbpwm = pwm_get_chip_data(pwm); + struct atmel_tcb_pwm_device *tcbpwm = tcbpwmc->pwms[pwm->hwpwm]; u32 cmr; enum pwm_polarity polarity = tcbpwm->polarity; @@ -291,7 +291,7 @@ static int atmel_tcb_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, int duty_ns, int period_ns) { struct atmel_tcb_pwm_chip *tcbpwmc = to_tcb_chip(chip); - struct atmel_tcb_pwm_device *tcbpwm = pwm_get_chip_data(pwm); + struct atmel_tcb_pwm_device *tcbpwm = tcbpwmc->pwms[pwm->hwpwm]; struct atmel_tcb_pwm_device *atcbpwm = NULL; int i = 0; int slowclk = 0; diff --git a/drivers/pwm/pwm-clps711x.c b/drivers/pwm/pwm-clps711x.c index d7ad88685830..b0d91142da8d 100644 --- a/drivers/pwm/pwm-clps711x.c +++ b/drivers/pwm/pwm-clps711x.c @@ -23,29 +23,6 @@ static inline struct clps711x_chip *to_clps711x_chip(struct pwm_chip *chip) return container_of(chip, struct clps711x_chip, chip); } -static void clps711x_pwm_update_val(struct clps711x_chip *priv, u32 n, u32 v) -{ - /* PWM0 - bits 4..7, PWM1 - bits 8..11 */ - u32 shift = (n + 1) * 4; - unsigned long flags; - u32 tmp; - - spin_lock_irqsave(&priv->lock, flags); - - tmp = readl(priv->pmpcon); - tmp &= ~(0xf << shift); - tmp |= v << shift; - writel(tmp, priv->pmpcon); - - spin_unlock_irqrestore(&priv->lock, flags); -} - -static unsigned int clps711x_get_duty(struct pwm_device *pwm, unsigned int v) -{ - /* Duty cycle 0..15 max */ - return DIV64_U64_ROUND_CLOSEST(v * 0xf, pwm->args.period); -} - static int clps711x_pwm_request(struct pwm_chip *chip, struct pwm_device *pwm) { struct clps711x_chip *priv = to_clps711x_chip(chip); @@ -60,44 +37,41 @@ static int clps711x_pwm_request(struct pwm_chip *chip, struct pwm_device *pwm) return 0; } -static int clps711x_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, - int duty_ns, int period_ns) +static int clps711x_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, + const struct pwm_state *state) { struct clps711x_chip *priv = to_clps711x_chip(chip); - unsigned int duty; + /* PWM0 - bits 4..7, PWM1 - bits 8..11 */ + u32 shift = (pwm->hwpwm + 1) * 4; + unsigned long flags; + u32 pmpcon, val; - if (period_ns != pwm->args.period) + if (state->polarity != PWM_POLARITY_NORMAL) return -EINVAL; - duty = clps711x_get_duty(pwm, duty_ns); - clps711x_pwm_update_val(priv, pwm->hwpwm, duty); + if (state->period != pwm->args.period) + return -EINVAL; + + if (state->enabled) + val = mul_u64_u64_div_u64(state->duty_cycle, 0xf, state->period); + else + val = 0; + + spin_lock_irqsave(&priv->lock, flags); + + pmpcon = readl(priv->pmpcon); + pmpcon &= ~(0xf << shift); + pmpcon |= val << shift; + writel(pmpcon, priv->pmpcon); + + spin_unlock_irqrestore(&priv->lock, flags); return 0; } -static int clps711x_pwm_enable(struct pwm_chip *chip, struct pwm_device *pwm) -{ - struct clps711x_chip *priv = to_clps711x_chip(chip); - unsigned int duty; - - duty = clps711x_get_duty(pwm, pwm_get_duty_cycle(pwm)); - clps711x_pwm_update_val(priv, pwm->hwpwm, duty); - - return 0; -} - -static void clps711x_pwm_disable(struct pwm_chip *chip, struct pwm_device *pwm) -{ - struct clps711x_chip *priv = to_clps711x_chip(chip); - - clps711x_pwm_update_val(priv, pwm->hwpwm, 0); -} - static const struct pwm_ops clps711x_pwm_ops = { .request = clps711x_pwm_request, - .config = clps711x_pwm_config, - .enable = clps711x_pwm_enable, - .disable = clps711x_pwm_disable, + .apply = clps711x_pwm_apply, .owner = THIS_MODULE, }; diff --git a/drivers/pwm/pwm-cros-ec.c 
b/drivers/pwm/pwm-cros-ec.c index 5e29d9c682c3..7f10f56c3eb6 100644 --- a/drivers/pwm/pwm-cros-ec.c +++ b/drivers/pwm/pwm-cros-ec.c @@ -12,17 +12,21 @@ #include #include +#include + /** * struct cros_ec_pwm_device - Driver data for EC PWM * * @dev: Device node * @ec: Pointer to EC device * @chip: PWM controller chip + * @use_pwm_type: Use PWM types instead of generic channels */ struct cros_ec_pwm_device { struct device *dev; struct cros_ec_device *ec; struct pwm_chip chip; + bool use_pwm_type; }; /** @@ -58,14 +62,31 @@ static void cros_ec_pwm_free(struct pwm_chip *chip, struct pwm_device *pwm) kfree(channel); } -static int cros_ec_pwm_set_duty(struct cros_ec_device *ec, u8 index, u16 duty) +static int cros_ec_dt_type_to_pwm_type(u8 dt_index, u8 *pwm_type) { + switch (dt_index) { + case CROS_EC_PWM_DT_KB_LIGHT: + *pwm_type = EC_PWM_TYPE_KB_LIGHT; + return 0; + case CROS_EC_PWM_DT_DISPLAY_LIGHT: + *pwm_type = EC_PWM_TYPE_DISPLAY_LIGHT; + return 0; + default: + return -EINVAL; + } +} + +static int cros_ec_pwm_set_duty(struct cros_ec_pwm_device *ec_pwm, u8 index, + u16 duty) +{ + struct cros_ec_device *ec = ec_pwm->ec; struct { struct cros_ec_command msg; struct ec_params_pwm_set_duty params; } __packed buf; struct ec_params_pwm_set_duty *params = &buf.params; struct cros_ec_command *msg = &buf.msg; + int ret; memset(&buf, 0, sizeof(buf)); @@ -75,14 +96,25 @@ static int cros_ec_pwm_set_duty(struct cros_ec_device *ec, u8 index, u16 duty) msg->outsize = sizeof(*params); params->duty = duty; - params->pwm_type = EC_PWM_TYPE_GENERIC; - params->index = index; + + if (ec_pwm->use_pwm_type) { + ret = cros_ec_dt_type_to_pwm_type(index, ¶ms->pwm_type); + if (ret) { + dev_err(ec->dev, "Invalid PWM type index: %d\n", index); + return ret; + } + params->index = 0; + } else { + params->pwm_type = EC_PWM_TYPE_GENERIC; + params->index = index; + } return cros_ec_cmd_xfer_status(ec, msg); } -static int cros_ec_pwm_get_duty(struct cros_ec_device *ec, u8 index) +static int cros_ec_pwm_get_duty(struct cros_ec_pwm_device *ec_pwm, u8 index) { + struct cros_ec_device *ec = ec_pwm->ec; struct { struct cros_ec_command msg; union { @@ -102,8 +134,17 @@ static int cros_ec_pwm_get_duty(struct cros_ec_device *ec, u8 index) msg->insize = sizeof(*resp); msg->outsize = sizeof(*params); - params->pwm_type = EC_PWM_TYPE_GENERIC; - params->index = index; + if (ec_pwm->use_pwm_type) { + ret = cros_ec_dt_type_to_pwm_type(index, ¶ms->pwm_type); + if (ret) { + dev_err(ec->dev, "Invalid PWM type index: %d\n", index); + return ret; + } + params->index = 0; + } else { + params->pwm_type = EC_PWM_TYPE_GENERIC; + params->index = index; + } ret = cros_ec_cmd_xfer_status(ec, msg); if (ret < 0) @@ -133,7 +174,7 @@ static int cros_ec_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, */ duty_cycle = state->enabled ? state->duty_cycle : 0; - ret = cros_ec_pwm_set_duty(ec_pwm->ec, pwm->hwpwm, duty_cycle); + ret = cros_ec_pwm_set_duty(ec_pwm, pwm->hwpwm, duty_cycle); if (ret < 0) return ret; @@ -149,7 +190,7 @@ static void cros_ec_pwm_get_state(struct pwm_chip *chip, struct pwm_device *pwm, struct cros_ec_pwm *channel = pwm_get_chip_data(pwm); int ret; - ret = cros_ec_pwm_get_duty(ec_pwm->ec, pwm->hwpwm); + ret = cros_ec_pwm_get_duty(ec_pwm, pwm->hwpwm); if (ret < 0) { dev_err(chip->dev, "error getting initial duty: %d\n", ret); return; @@ -204,13 +245,13 @@ static const struct pwm_ops cros_ec_pwm_ops = { * of PWMs it supports directly, so we have to read the pwm duty cycle for * subsequent channels until we get an error. 
*/ -static int cros_ec_num_pwms(struct cros_ec_device *ec) +static int cros_ec_num_pwms(struct cros_ec_pwm_device *ec_pwm) { int i, ret; /* The index field is only 8 bits */ for (i = 0; i <= U8_MAX; i++) { - ret = cros_ec_pwm_get_duty(ec, i); + ret = cros_ec_pwm_get_duty(ec_pwm, i); /* * We look for SUCCESS, INVALID_COMMAND, or INVALID_PARAM * responses; everything else is treated as an error. @@ -236,6 +277,7 @@ static int cros_ec_pwm_probe(struct platform_device *pdev) { struct cros_ec_device *ec = dev_get_drvdata(pdev->dev.parent); struct device *dev = &pdev->dev; + struct device_node *np = pdev->dev.of_node; struct cros_ec_pwm_device *ec_pwm; struct pwm_chip *chip; int ret; @@ -251,17 +293,26 @@ static int cros_ec_pwm_probe(struct platform_device *pdev) chip = &ec_pwm->chip; ec_pwm->ec = ec; + if (of_device_is_compatible(np, "google,cros-ec-pwm-type")) + ec_pwm->use_pwm_type = true; + /* PWM chip */ chip->dev = dev; chip->ops = &cros_ec_pwm_ops; chip->of_xlate = cros_ec_pwm_xlate; chip->of_pwm_n_cells = 1; - ret = cros_ec_num_pwms(ec); - if (ret < 0) { - dev_err(dev, "Couldn't find PWMs: %d\n", ret); - return ret; + + if (ec_pwm->use_pwm_type) { + chip->npwm = CROS_EC_PWM_DT_COUNT; + } else { + ret = cros_ec_num_pwms(ec_pwm); + if (ret < 0) { + dev_err(dev, "Couldn't find PWMs: %d\n", ret); + return ret; + } + chip->npwm = ret; } - chip->npwm = ret; + dev_dbg(dev, "Probed %u PWMs\n", chip->npwm); ret = pwmchip_add(chip); @@ -288,6 +339,7 @@ static int cros_ec_pwm_remove(struct platform_device *dev) #ifdef CONFIG_OF static const struct of_device_id cros_ec_pwm_of_match[] = { { .compatible = "google,cros-ec-pwm" }, + { .compatible = "google,cros-ec-pwm-type" }, {}, }; MODULE_DEVICE_TABLE(of, cros_ec_pwm_of_match); diff --git a/drivers/pwm/pwm-lp3943.c b/drivers/pwm/pwm-lp3943.c index ea17d446a627..215ef9069114 100644 --- a/drivers/pwm/pwm-lp3943.c +++ b/drivers/pwm/pwm-lp3943.c @@ -93,7 +93,7 @@ static void lp3943_pwm_free(struct pwm_chip *chip, struct pwm_device *pwm) } static int lp3943_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, - int duty_ns, int period_ns) + u64 duty_ns, u64 period_ns) { struct lp3943_pwm *lp3943_pwm = to_lp3943_pwm(chip); struct lp3943 *lp3943 = lp3943_pwm->lp3943; @@ -118,14 +118,20 @@ static int lp3943_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, reg_duty = LP3943_REG_PWM1; } - period_ns = clamp(period_ns, LP3943_MIN_PERIOD, LP3943_MAX_PERIOD); - val = (u8)(period_ns / LP3943_MIN_PERIOD - 1); + /* + * Note that after this clamping, period_ns fits into an int. This is + * helpful because we can resort to integer division below instead of + * the (more expensive) 64 bit division. 
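+ * For example (assuming the driver's usual bounds of LP3943_MIN_PERIOD = + * 6250 ns and LP3943_MAX_PERIOD = 1600000 ns), a requested period of + * 100000 ns survives the clamp and yields the prescale value + * 100000 / 6250 - 1 = 15.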
+ */ + period_ns = clamp(period_ns, (u64)LP3943_MIN_PERIOD, (u64)LP3943_MAX_PERIOD); + val = (u8)((int)period_ns / LP3943_MIN_PERIOD - 1); err = lp3943_write_byte(lp3943, reg_prescale, val); if (err) return err; - val = (u8)(duty_ns * LP3943_MAX_DUTY / period_ns); + duty_ns = min(duty_ns, period_ns); + val = (u8)((int)duty_ns * LP3943_MAX_DUTY / (int)period_ns); return lp3943_write_byte(lp3943, reg_duty, val); } @@ -182,12 +188,34 @@ static void lp3943_pwm_disable(struct pwm_chip *chip, struct pwm_device *pwm) lp3943_pwm_set_mode(lp3943_pwm, pwm_map, LP3943_GPIO_OUT_HIGH); } +static int lp3943_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, + const struct pwm_state *state) +{ + int err; + + if (state->polarity != PWM_POLARITY_NORMAL) + return -EINVAL; + + if (!state->enabled) { + if (pwm->state.enabled) + lp3943_pwm_disable(chip, pwm); + return 0; + } + + err = lp3943_pwm_config(chip, pwm, state->duty_cycle, state->period); + if (err) + return err; + + if (!pwm->state.enabled) + err = lp3943_pwm_enable(chip, pwm); + + return err; +} + static const struct pwm_ops lp3943_pwm_ops = { .request = lp3943_pwm_request, .free = lp3943_pwm_free, - .config = lp3943_pwm_config, - .enable = lp3943_pwm_enable, - .disable = lp3943_pwm_disable, + .apply = lp3943_pwm_apply, .owner = THIS_MODULE, }; diff --git a/drivers/pwm/pwm-lpc18xx-sct.c b/drivers/pwm/pwm-lpc18xx-sct.c index b909096dba2f..272e0b5d01b8 100644 --- a/drivers/pwm/pwm-lpc18xx-sct.c +++ b/drivers/pwm/pwm-lpc18xx-sct.c @@ -226,14 +226,7 @@ static int lpc18xx_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, return 0; } -static int lpc18xx_pwm_set_polarity(struct pwm_chip *chip, - struct pwm_device *pwm, - enum pwm_polarity polarity) -{ - return 0; -} - -static int lpc18xx_pwm_enable(struct pwm_chip *chip, struct pwm_device *pwm) +static int lpc18xx_pwm_enable(struct pwm_chip *chip, struct pwm_device *pwm, enum pwm_polarity polarity) { struct lpc18xx_pwm_chip *lpc18xx_pwm = to_lpc18xx_pwm_chip(chip); struct lpc18xx_pwm_data *lpc18xx_data = &lpc18xx_pwm->channeldata[pwm->hwpwm]; @@ -249,7 +242,7 @@ static int lpc18xx_pwm_enable(struct pwm_chip *chip, struct pwm_device *pwm) LPC18XX_PWM_EVSTATEMSK(lpc18xx_data->duty_event), LPC18XX_PWM_EVSTATEMSK_ALL); - if (pwm_get_polarity(pwm) == PWM_POLARITY_NORMAL) { + if (polarity == PWM_POLARITY_NORMAL) { set_event = lpc18xx_pwm->period_event; clear_event = lpc18xx_data->duty_event; res_action = LPC18XX_PWM_RES_SET; @@ -308,11 +301,35 @@ static void lpc18xx_pwm_free(struct pwm_chip *chip, struct pwm_device *pwm) clear_bit(lpc18xx_data->duty_event, &lpc18xx_pwm->event_map); } +static int lpc18xx_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, + const struct pwm_state *state) +{ + int err; + bool enabled = pwm->state.enabled; + + if (state->polarity != pwm->state.polarity && pwm->state.enabled) { + lpc18xx_pwm_disable(chip, pwm); + enabled = false; + } + + if (!state->enabled) { + if (enabled) + lpc18xx_pwm_disable(chip, pwm); + + return 0; + } + + err = lpc18xx_pwm_config(pwm->chip, pwm, state->duty_cycle, state->period); + if (err) + return err; + + if (!enabled) + err = lpc18xx_pwm_enable(chip, pwm, state->polarity); + + return err; +} static const struct pwm_ops lpc18xx_pwm_ops = { - .config = lpc18xx_pwm_config, - .set_polarity = lpc18xx_pwm_set_polarity, - .enable = lpc18xx_pwm_enable, - .disable = lpc18xx_pwm_disable, + .apply = lpc18xx_pwm_apply, .request = lpc18xx_pwm_request, .free = lpc18xx_pwm_free, .owner = THIS_MODULE, diff --git a/drivers/pwm/pwm-lpc32xx.c 
b/drivers/pwm/pwm-lpc32xx.c index ddeab5687cb8..86a0ea0f6955 100644 --- a/drivers/pwm/pwm-lpc32xx.c +++ b/drivers/pwm/pwm-lpc32xx.c @@ -88,10 +88,33 @@ static void lpc32xx_pwm_disable(struct pwm_chip *chip, struct pwm_device *pwm) clk_disable_unprepare(lpc32xx->clk); } +static int lpc32xx_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, + const struct pwm_state *state) +{ + int err; + + if (state->polarity != PWM_POLARITY_NORMAL) + return -EINVAL; + + if (!state->enabled) { + if (pwm->state.enabled) + lpc32xx_pwm_disable(chip, pwm); + + return 0; + } + + err = lpc32xx_pwm_config(pwm->chip, pwm, state->duty_cycle, state->period); + if (err) + return err; + + if (!pwm->state.enabled) + err = lpc32xx_pwm_enable(chip, pwm); + + return err; +} + static const struct pwm_ops lpc32xx_pwm_ops = { - .config = lpc32xx_pwm_config, - .enable = lpc32xx_pwm_enable, - .disable = lpc32xx_pwm_disable, + .apply = lpc32xx_pwm_apply, .owner = THIS_MODULE, }; diff --git a/drivers/pwm/pwm-mediatek.c b/drivers/pwm/pwm-mediatek.c index 568b13a48717..d28c0874c7f2 100644 --- a/drivers/pwm/pwm-mediatek.c +++ b/drivers/pwm/pwm-mediatek.c @@ -198,10 +198,33 @@ static void pwm_mediatek_disable(struct pwm_chip *chip, struct pwm_device *pwm) pwm_mediatek_clk_disable(chip, pwm); } +static int pwm_mediatek_apply(struct pwm_chip *chip, struct pwm_device *pwm, + const struct pwm_state *state) +{ + int err; + + if (state->polarity != PWM_POLARITY_NORMAL) + return -EINVAL; + + if (!state->enabled) { + if (pwm->state.enabled) + pwm_mediatek_disable(chip, pwm); + + return 0; + } + + err = pwm_mediatek_config(pwm->chip, pwm, state->duty_cycle, state->period); + if (err) + return err; + + if (!pwm->state.enabled) + err = pwm_mediatek_enable(chip, pwm); + + return err; +} + static const struct pwm_ops pwm_mediatek_ops = { - .config = pwm_mediatek_config, - .enable = pwm_mediatek_enable, - .disable = pwm_mediatek_disable, + .apply = pwm_mediatek_apply, .owner = THIS_MODULE, }; @@ -264,6 +287,12 @@ static const struct pwm_mediatek_of_data mt2712_pwm_data = { .has_ck_26m_sel = false, }; +static const struct pwm_mediatek_of_data mt6795_pwm_data = { + .num_pwms = 7, + .pwm45_fixup = false, + .has_ck_26m_sel = false, +}; + static const struct pwm_mediatek_of_data mt7622_pwm_data = { .num_pwms = 6, .pwm45_fixup = false, @@ -302,6 +331,7 @@ static const struct pwm_mediatek_of_data mt8516_pwm_data = { static const struct of_device_id pwm_mediatek_of_match[] = { { .compatible = "mediatek,mt2712-pwm", .data = &mt2712_pwm_data }, + { .compatible = "mediatek,mt6795-pwm", .data = &mt6795_pwm_data }, { .compatible = "mediatek,mt7622-pwm", .data = &mt7622_pwm_data }, { .compatible = "mediatek,mt7623-pwm", .data = &mt7623_pwm_data }, { .compatible = "mediatek,mt7628-pwm", .data = &mt7628_pwm_data }, diff --git a/drivers/pwm/pwm-raspberrypi-poe.c b/drivers/pwm/pwm-raspberrypi-poe.c index e52e29fc8231..6ff73029f367 100644 --- a/drivers/pwm/pwm-raspberrypi-poe.c +++ b/drivers/pwm/pwm-raspberrypi-poe.c @@ -66,7 +66,7 @@ static int raspberrypi_pwm_get_property(struct rpi_firmware *firmware, u32 reg, u32 *val) { struct raspberrypi_pwm_prop msg = { - .reg = reg + .reg = cpu_to_le32(reg), }; int ret; diff --git a/drivers/pwm/pwm-renesas-tpu.c b/drivers/pwm/pwm-renesas-tpu.c index 4381df90a527..d7311614c846 100644 --- a/drivers/pwm/pwm-renesas-tpu.c +++ b/drivers/pwm/pwm-renesas-tpu.c @@ -89,71 +89,71 @@ struct tpu_device { #define to_tpu_device(c) container_of(c, struct tpu_device, chip) -static void tpu_pwm_write(struct tpu_pwm_device *pwm, int 
reg_nr, u16 value) +static void tpu_pwm_write(struct tpu_pwm_device *tpd, int reg_nr, u16 value) { - void __iomem *base = pwm->tpu->base + TPU_CHANNEL_OFFSET - + pwm->channel * TPU_CHANNEL_SIZE; + void __iomem *base = tpd->tpu->base + TPU_CHANNEL_OFFSET + + tpd->channel * TPU_CHANNEL_SIZE; iowrite16(value, base + reg_nr); } -static void tpu_pwm_set_pin(struct tpu_pwm_device *pwm, +static void tpu_pwm_set_pin(struct tpu_pwm_device *tpd, enum tpu_pin_state state) { static const char * const states[] = { "inactive", "PWM", "active" }; - dev_dbg(&pwm->tpu->pdev->dev, "%u: configuring pin as %s\n", - pwm->channel, states[state]); + dev_dbg(&tpd->tpu->pdev->dev, "%u: configuring pin as %s\n", + tpd->channel, states[state]); switch (state) { case TPU_PIN_INACTIVE: - tpu_pwm_write(pwm, TPU_TIORn, - pwm->polarity == PWM_POLARITY_INVERSED ? + tpu_pwm_write(tpd, TPU_TIORn, + tpd->polarity == PWM_POLARITY_INVERSED ? TPU_TIOR_IOA_1 : TPU_TIOR_IOA_0); break; case TPU_PIN_PWM: - tpu_pwm_write(pwm, TPU_TIORn, - pwm->polarity == PWM_POLARITY_INVERSED ? + tpu_pwm_write(tpd, TPU_TIORn, + tpd->polarity == PWM_POLARITY_INVERSED ? TPU_TIOR_IOA_0_SET : TPU_TIOR_IOA_1_CLR); break; case TPU_PIN_ACTIVE: - tpu_pwm_write(pwm, TPU_TIORn, - pwm->polarity == PWM_POLARITY_INVERSED ? + tpu_pwm_write(tpd, TPU_TIORn, + tpd->polarity == PWM_POLARITY_INVERSED ? TPU_TIOR_IOA_0 : TPU_TIOR_IOA_1); break; } } -static void tpu_pwm_start_stop(struct tpu_pwm_device *pwm, int start) +static void tpu_pwm_start_stop(struct tpu_pwm_device *tpd, int start) { unsigned long flags; u16 value; - spin_lock_irqsave(&pwm->tpu->lock, flags); - value = ioread16(pwm->tpu->base + TPU_TSTR); + spin_lock_irqsave(&tpd->tpu->lock, flags); + value = ioread16(tpd->tpu->base + TPU_TSTR); if (start) - value |= 1 << pwm->channel; + value |= 1 << tpd->channel; else - value &= ~(1 << pwm->channel); + value &= ~(1 << tpd->channel); - iowrite16(value, pwm->tpu->base + TPU_TSTR); - spin_unlock_irqrestore(&pwm->tpu->lock, flags); + iowrite16(value, tpd->tpu->base + TPU_TSTR); + spin_unlock_irqrestore(&tpd->tpu->lock, flags); } -static int tpu_pwm_timer_start(struct tpu_pwm_device *pwm) +static int tpu_pwm_timer_start(struct tpu_pwm_device *tpd) { int ret; - if (!pwm->timer_on) { + if (!tpd->timer_on) { /* Wake up device and enable clock. */ - pm_runtime_get_sync(&pwm->tpu->pdev->dev); - ret = clk_prepare_enable(pwm->tpu->clk); + pm_runtime_get_sync(&tpd->tpu->pdev->dev); + ret = clk_prepare_enable(tpd->tpu->clk); if (ret) { - dev_err(&pwm->tpu->pdev->dev, "cannot enable clock\n"); + dev_err(&tpd->tpu->pdev->dev, "cannot enable clock\n"); return ret; } - pwm->timer_on = true; + tpd->timer_on = true; } /* @@ -161,8 +161,8 @@ static int tpu_pwm_timer_start(struct tpu_pwm_device *pwm) * completely. First drive the pin to the inactive state to avoid * glitches. 
*/ - tpu_pwm_set_pin(pwm, TPU_PIN_INACTIVE); - tpu_pwm_start_stop(pwm, false); + tpu_pwm_set_pin(tpd, TPU_PIN_INACTIVE); + tpu_pwm_start_stop(tpd, false); /* * - Clear TCNT on TGRB match @@ -172,142 +172,168 @@ static int tpu_pwm_timer_start(struct tpu_pwm_device *pwm) * - Output 1 until TGRA, output 0 until TGRB (active high polarity * - PWM mode */ - tpu_pwm_write(pwm, TPU_TCRn, TPU_TCR_CCLR_TGRB | TPU_TCR_CKEG_RISING | - pwm->prescaler); - tpu_pwm_write(pwm, TPU_TMDRn, TPU_TMDR_MD_PWM); - tpu_pwm_set_pin(pwm, TPU_PIN_PWM); - tpu_pwm_write(pwm, TPU_TGRAn, pwm->duty); - tpu_pwm_write(pwm, TPU_TGRBn, pwm->period); + tpu_pwm_write(tpd, TPU_TCRn, TPU_TCR_CCLR_TGRB | TPU_TCR_CKEG_RISING | + tpd->prescaler); + tpu_pwm_write(tpd, TPU_TMDRn, TPU_TMDR_MD_PWM); + tpu_pwm_set_pin(tpd, TPU_PIN_PWM); + tpu_pwm_write(tpd, TPU_TGRAn, tpd->duty); + tpu_pwm_write(tpd, TPU_TGRBn, tpd->period); - dev_dbg(&pwm->tpu->pdev->dev, "%u: TGRA 0x%04x TGRB 0x%04x\n", - pwm->channel, pwm->duty, pwm->period); + dev_dbg(&tpd->tpu->pdev->dev, "%u: TGRA 0x%04x TGRB 0x%04x\n", + tpd->channel, tpd->duty, tpd->period); /* Start the channel. */ - tpu_pwm_start_stop(pwm, true); + tpu_pwm_start_stop(tpd, true); return 0; } -static void tpu_pwm_timer_stop(struct tpu_pwm_device *pwm) +static void tpu_pwm_timer_stop(struct tpu_pwm_device *tpd) { - if (!pwm->timer_on) + if (!tpd->timer_on) return; /* Disable channel. */ - tpu_pwm_start_stop(pwm, false); + tpu_pwm_start_stop(tpd, false); /* Stop clock and mark device as idle. */ - clk_disable_unprepare(pwm->tpu->clk); - pm_runtime_put(&pwm->tpu->pdev->dev); + clk_disable_unprepare(tpd->tpu->clk); + pm_runtime_put(&tpd->tpu->pdev->dev); - pwm->timer_on = false; + tpd->timer_on = false; } /* ----------------------------------------------------------------------------- * PWM API */ -static int tpu_pwm_request(struct pwm_chip *chip, struct pwm_device *_pwm) +static int tpu_pwm_request(struct pwm_chip *chip, struct pwm_device *pwm) { struct tpu_device *tpu = to_tpu_device(chip); - struct tpu_pwm_device *pwm; + struct tpu_pwm_device *tpd; - if (_pwm->hwpwm >= TPU_CHANNEL_MAX) + if (pwm->hwpwm >= TPU_CHANNEL_MAX) return -EINVAL; - pwm = kzalloc(sizeof(*pwm), GFP_KERNEL); - if (pwm == NULL) + tpd = kzalloc(sizeof(*tpd), GFP_KERNEL); + if (tpd == NULL) return -ENOMEM; - pwm->tpu = tpu; - pwm->channel = _pwm->hwpwm; - pwm->polarity = PWM_POLARITY_NORMAL; - pwm->prescaler = 0; - pwm->period = 0; - pwm->duty = 0; + tpd->tpu = tpu; + tpd->channel = pwm->hwpwm; + tpd->polarity = PWM_POLARITY_NORMAL; + tpd->prescaler = 0; + tpd->period = 0; + tpd->duty = 0; - pwm->timer_on = false; + tpd->timer_on = false; - pwm_set_chip_data(_pwm, pwm); + pwm_set_chip_data(pwm, tpd); return 0; } -static void tpu_pwm_free(struct pwm_chip *chip, struct pwm_device *_pwm) +static void tpu_pwm_free(struct pwm_chip *chip, struct pwm_device *pwm) { - struct tpu_pwm_device *pwm = pwm_get_chip_data(_pwm); + struct tpu_pwm_device *tpd = pwm_get_chip_data(pwm); - tpu_pwm_timer_stop(pwm); - kfree(pwm); + tpu_pwm_timer_stop(tpd); + kfree(tpd); } -static int tpu_pwm_config(struct pwm_chip *chip, struct pwm_device *_pwm, - int duty_ns, int period_ns) +static int tpu_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, + u64 duty_ns, u64 period_ns, bool enabled) { - static const unsigned int prescalers[] = { 1, 4, 16, 64 }; - struct tpu_pwm_device *pwm = pwm_get_chip_data(_pwm); + struct tpu_pwm_device *tpd = pwm_get_chip_data(pwm); struct tpu_device *tpu = to_tpu_device(chip); unsigned int prescaler; bool duty_only = 
false; u32 clk_rate; - u32 period; + u64 period; u32 duty; int ret; - /* - * Pick a prescaler to avoid overflowing the counter. - * TODO: Pick the highest acceptable prescaler. - */ clk_rate = clk_get_rate(tpu->clk); - - for (prescaler = 0; prescaler < ARRAY_SIZE(prescalers); ++prescaler) { - period = clk_rate / prescalers[prescaler] - / (NSEC_PER_SEC / period_ns); - if (period <= 0xffff) - break; + if (unlikely(clk_rate > NSEC_PER_SEC)) { + /* + * This won't happen in the near future, so this is only a + * safeguard to prevent the following calculation from + * overflowing. With this, clk_rate * period_ns / NSEC_PER_SEC is + * not greater than period_ns and so fits into a u64. + */ + return -EINVAL; } - if (prescaler == ARRAY_SIZE(prescalers) || period == 0) { - dev_err(&tpu->pdev->dev, "clock rate mismatch\n"); - return -ENOTSUPP; + period = mul_u64_u64_div_u64(clk_rate, period_ns, NSEC_PER_SEC); + + /* + * Find the minimal prescaler in [0..3] such that + * + * period >> (2 * prescaler) < 0x10000 + * + * This could be calculated using something like: + * + * prescaler = max(ilog2(period) / 2, 7) - 7; + * + * but given there are only four allowed results and that ilog2 isn't + * cheap on all platforms, using a switch statement is more effective. + */ + switch (period) { + case 1 ... 0xffff: + prescaler = 0; + break; + + case 0x10000 ... 0x3ffff: + prescaler = 1; + break; + + case 0x40000 ... 0xfffff: + prescaler = 2; + break; + + case 0x100000 ... 0x3fffff: + prescaler = 3; + break; + + default: + return -EINVAL; } - if (duty_ns) { - duty = clk_rate / prescalers[prescaler] - / (NSEC_PER_SEC / duty_ns); - if (duty > period) - return -EINVAL; - } else { + period >>= 2 * prescaler; + + if (duty_ns) + duty = mul_u64_u64_div_u64(clk_rate, duty_ns, + (u64)NSEC_PER_SEC << (2 * prescaler)); + else duty = 0; - } dev_dbg(&tpu->pdev->dev, "rate %u, prescaler %u, period %u, duty %u\n", - clk_rate, prescalers[prescaler], period, duty); + clk_rate, 1 << (2 * prescaler), (u32)period, duty); - if (pwm->prescaler == prescaler && pwm->period == period) + if (tpd->prescaler == prescaler && tpd->period == period) duty_only = true; - pwm->prescaler = prescaler; - pwm->period = period; - pwm->duty = duty; + tpd->prescaler = prescaler; + tpd->period = period; + tpd->duty = duty; /* If the channel is disabled we're done. */ - if (!pwm_is_enabled(_pwm)) + if (!enabled) return 0; - if (duty_only && pwm->timer_on) { + if (duty_only && tpd->timer_on) { /* * If only the duty cycle changed and the timer is already * running, there's no need to reconfigure it completely. Just * modify the duty cycle. */ - tpu_pwm_write(pwm, TPU_TGRAn, pwm->duty); - dev_dbg(&tpu->pdev->dev, "%u: TGRA 0x%04x\n", pwm->channel, - pwm->duty); + tpu_pwm_write(tpd, TPU_TGRAn, tpd->duty); + dev_dbg(&tpu->pdev->dev, "%u: TGRA 0x%04x\n", tpd->channel, + tpd->duty); } else { /* Otherwise perform a full reconfiguration. */ - ret = tpu_pwm_timer_start(pwm); + ret = tpu_pwm_timer_start(tpd); if (ret < 0) return ret; } @@ -317,29 +343,29 @@ static int tpu_pwm_config(struct pwm_chip *chip, struct pwm_device *_pwm, * To avoid running the timer when not strictly required, handle * 0% and 100% duty cycles as fixed levels and stop the timer. */ - tpu_pwm_set_pin(pwm, duty ?
TPU_PIN_ACTIVE : TPU_PIN_INACTIVE); + tpu_pwm_timer_stop(tpd); } return 0; } -static int tpu_pwm_set_polarity(struct pwm_chip *chip, struct pwm_device *_pwm, +static int tpu_pwm_set_polarity(struct pwm_chip *chip, struct pwm_device *pwm, enum pwm_polarity polarity) { - struct tpu_pwm_device *pwm = pwm_get_chip_data(_pwm); + struct tpu_pwm_device *tpd = pwm_get_chip_data(pwm); - pwm->polarity = polarity; + tpd->polarity = polarity; return 0; } -static int tpu_pwm_enable(struct pwm_chip *chip, struct pwm_device *_pwm) +static int tpu_pwm_enable(struct pwm_chip *chip, struct pwm_device *pwm) { - struct tpu_pwm_device *pwm = pwm_get_chip_data(_pwm); + struct tpu_pwm_device *tpd = pwm_get_chip_data(pwm); int ret; - ret = tpu_pwm_timer_start(pwm); + ret = tpu_pwm_timer_start(tpd); if (ret < 0) return ret; @@ -347,32 +373,64 @@ static int tpu_pwm_enable(struct pwm_chip *chip, struct pwm_device *_pwm) * To avoid running the timer when not strictly required, handle 0% and * 100% duty cycles as fixed levels and stop the timer. */ - if (pwm->duty == 0 || pwm->duty == pwm->period) { - tpu_pwm_set_pin(pwm, pwm->duty ? + if (tpd->duty == 0 || tpd->duty == tpd->period) { + tpu_pwm_set_pin(tpd, tpd->duty ? TPU_PIN_ACTIVE : TPU_PIN_INACTIVE); - tpu_pwm_timer_stop(pwm); + tpu_pwm_timer_stop(tpd); } return 0; } -static void tpu_pwm_disable(struct pwm_chip *chip, struct pwm_device *_pwm) +static void tpu_pwm_disable(struct pwm_chip *chip, struct pwm_device *pwm) { - struct tpu_pwm_device *pwm = pwm_get_chip_data(_pwm); + struct tpu_pwm_device *tpd = pwm_get_chip_data(pwm); /* The timer must be running to modify the pin output configuration. */ - tpu_pwm_timer_start(pwm); - tpu_pwm_set_pin(pwm, TPU_PIN_INACTIVE); - tpu_pwm_timer_stop(pwm); + tpu_pwm_timer_start(tpd); + tpu_pwm_set_pin(tpd, TPU_PIN_INACTIVE); + tpu_pwm_timer_stop(tpd); +} + +static int tpu_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, + const struct pwm_state *state) +{ + int err; + bool enabled = pwm->state.enabled; + + if (state->polarity != pwm->state.polarity) { + if (enabled) { + tpu_pwm_disable(chip, pwm); + enabled = false; + } + + err = tpu_pwm_set_polarity(chip, pwm, state->polarity); + if (err) + return err; + } + + if (!state->enabled) { + if (enabled) + tpu_pwm_disable(chip, pwm); + + return 0; + } + + err = tpu_pwm_config(pwm->chip, pwm, + state->duty_cycle, state->period, enabled); + if (err) + return err; + + if (!enabled) + err = tpu_pwm_enable(chip, pwm); + + return err; } static const struct pwm_ops tpu_pwm_ops = { .request = tpu_pwm_request, .free = tpu_pwm_free, - .config = tpu_pwm_config, - .set_polarity = tpu_pwm_set_polarity, - .enable = tpu_pwm_enable, - .disable = tpu_pwm_disable, + .apply = tpu_pwm_apply, .owner = THIS_MODULE, }; @@ -398,10 +456,8 @@ static int tpu_probe(struct platform_device *pdev) return PTR_ERR(tpu->base); tpu->clk = devm_clk_get(&pdev->dev, NULL); - if (IS_ERR(tpu->clk)) { - dev_err(&pdev->dev, "cannot get clock\n"); - return PTR_ERR(tpu->clk); - } + if (IS_ERR(tpu->clk)) + return dev_err_probe(&pdev->dev, PTR_ERR(tpu->clk), "Failed to get clock\n"); /* Initialize and register the device. 
*/ platform_set_drvdata(pdev, tpu); @@ -410,25 +466,13 @@ static int tpu_probe(struct platform_device *pdev) tpu->chip.ops = &tpu_pwm_ops; tpu->chip.npwm = TPU_CHANNEL_MAX; - pm_runtime_enable(&pdev->dev); + ret = devm_pm_runtime_enable(&pdev->dev); + if (ret < 0) + return dev_err_probe(&pdev->dev, ret, "Failed to enable runtime PM\n"); - ret = pwmchip_add(&tpu->chip); - if (ret < 0) { - dev_err(&pdev->dev, "failed to register PWM chip\n"); - pm_runtime_disable(&pdev->dev); - return ret; - } - - return 0; -} - -static int tpu_remove(struct platform_device *pdev) -{ - struct tpu_device *tpu = platform_get_drvdata(pdev); - - pwmchip_remove(&tpu->chip); - - pm_runtime_disable(&pdev->dev); + ret = devm_pwmchip_add(&pdev->dev, &tpu->chip); + if (ret < 0) + return dev_err_probe(&pdev->dev, ret, "Failed to register PWM chip\n"); return 0; } @@ -447,7 +491,6 @@ MODULE_DEVICE_TABLE(of, tpu_of_table); static struct platform_driver tpu_driver = { .probe = tpu_probe, - .remove = tpu_remove, .driver = { .name = "renesas-tpu-pwm", .of_match_table = of_match_ptr(tpu_of_table), diff --git a/drivers/pwm/pwm-samsung.c b/drivers/pwm/pwm-samsung.c index 0a4ff55fad04..9c5b4f515641 100644 --- a/drivers/pwm/pwm-samsung.c +++ b/drivers/pwm/pwm-samsung.c @@ -321,14 +321,6 @@ static int __pwm_samsung_config(struct pwm_chip *chip, struct pwm_device *pwm, struct samsung_pwm_channel *chan = pwm_get_chip_data(pwm); u32 tin_ns = chan->tin_ns, tcnt, tcmp, oldtcmp; - /* - * We currently avoid using 64bit arithmetic by using the - * fact that anything faster than 1Hz is easily representable - * by 32bits. - */ - if (period_ns > NSEC_PER_SEC) - return -ERANGE; - tcnt = readl(our_chip->base + REG_TCNTB(pwm->hwpwm)); oldtcmp = readl(our_chip->base + REG_TCMPB(pwm->hwpwm)); @@ -438,13 +430,51 @@ static int pwm_samsung_set_polarity(struct pwm_chip *chip, return 0; } +static int pwm_samsung_apply(struct pwm_chip *chip, struct pwm_device *pwm, + const struct pwm_state *state) +{ + int err, enabled = pwm->state.enabled; + + if (state->polarity != pwm->state.polarity) { + if (enabled) { + pwm_samsung_disable(chip, pwm); + enabled = false; + } + + err = pwm_samsung_set_polarity(chip, pwm, state->polarity); + if (err) + return err; + } + + if (!state->enabled) { + if (enabled) + pwm_samsung_disable(chip, pwm); + + return 0; + } + + /* + * We currently avoid using 64bit arithmetic by using the + * fact that anything faster than 1Hz is easily representable + * by 32bits. 
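+ * (A sketch of the bound: the check below caps state->period at + * NSEC_PER_SEC = 10^9, and 2^32 - 1 is roughly 4.29 * 10^9, so the + * period always fits in a u32.)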
+ */ + if (state->period > NSEC_PER_SEC) + return -ERANGE; + + err = pwm_samsung_config(chip, pwm, state->duty_cycle, state->period); + if (err) + return err; + + if (!pwm->state.enabled) + err = pwm_samsung_enable(chip, pwm); + + return err; +} + static const struct pwm_ops pwm_samsung_ops = { .request = pwm_samsung_request, .free = pwm_samsung_free, - .enable = pwm_samsung_enable, - .disable = pwm_samsung_disable, - .config = pwm_samsung_config, - .set_polarity = pwm_samsung_set_polarity, + .apply = pwm_samsung_apply, .owner = THIS_MODULE, }; diff --git a/drivers/pwm/pwm-sifive.c b/drivers/pwm/pwm-sifive.c index 253c4a17d255..e6d05a329002 100644 --- a/drivers/pwm/pwm-sifive.c +++ b/drivers/pwm/pwm-sifive.c @@ -138,10 +138,9 @@ static int pwm_sifive_enable(struct pwm_chip *chip, bool enable) dev_err(ddata->chip.dev, "Enable clk failed\n"); return ret; } - } - - if (!enable) + } else { clk_disable(ddata->clk); + } return 0; } diff --git a/drivers/pwm/pwm-sti.c b/drivers/pwm/pwm-sti.c index f491d56254d7..44b1f93256b3 100644 --- a/drivers/pwm/pwm-sti.c +++ b/drivers/pwm/pwm-sti.c @@ -391,11 +391,34 @@ out: return ret; } +static int sti_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, + const struct pwm_state *state) +{ + int err; + + if (state->polarity != PWM_POLARITY_NORMAL) + return -EINVAL; + + if (!state->enabled) { + if (pwm->state.enabled) + sti_pwm_disable(chip, pwm); + + return 0; + } + + err = sti_pwm_config(pwm->chip, pwm, state->duty_cycle, state->period); + if (err) + return err; + + if (!pwm->state.enabled) + err = sti_pwm_enable(chip, pwm); + + return err; +} + static const struct pwm_ops sti_pwm_ops = { .capture = sti_pwm_capture, - .config = sti_pwm_config, - .enable = sti_pwm_enable, - .disable = sti_pwm_disable, + .apply = sti_pwm_apply, .free = sti_pwm_free, .owner = THIS_MODULE, }; diff --git a/drivers/pwm/pwm-stmpe.c b/drivers/pwm/pwm-stmpe.c index c4336d3bace3..5d4a4762ce0c 100644 --- a/drivers/pwm/pwm-stmpe.c +++ b/drivers/pwm/pwm-stmpe.c @@ -259,10 +259,33 @@ static int stmpe_24xx_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, return 0; } +static int stmpe_24xx_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, + const struct pwm_state *state) +{ + int err; + + if (state->polarity != PWM_POLARITY_NORMAL) + return -EINVAL; + + if (!state->enabled) { + if (pwm->state.enabled) + stmpe_24xx_pwm_disable(chip, pwm); + + return 0; + } + + err = stmpe_24xx_pwm_config(pwm->chip, pwm, state->duty_cycle, state->period); + if (err) + return err; + + if (!pwm->state.enabled) + err = stmpe_24xx_pwm_enable(chip, pwm); + + return err; +} + static const struct pwm_ops stmpe_24xx_pwm_ops = { - .config = stmpe_24xx_pwm_config, - .enable = stmpe_24xx_pwm_enable, - .disable = stmpe_24xx_pwm_disable, + .apply = stmpe_24xx_pwm_apply, .owner = THIS_MODULE, }; diff --git a/drivers/pwm/pwm-sun4i.c b/drivers/pwm/pwm-sun4i.c index 16d75f9aa36a..c8445b0a3339 100644 --- a/drivers/pwm/pwm-sun4i.c +++ b/drivers/pwm/pwm-sun4i.c @@ -89,7 +89,6 @@ struct sun4i_pwm_chip { void __iomem *base; spinlock_t ctrl_lock; const struct sun4i_pwm_data *data; - unsigned long next_period[2]; }; static inline struct sun4i_pwm_chip *to_sun4i_pwm_chip(struct pwm_chip *chip) @@ -236,7 +235,6 @@ static int sun4i_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, u32 ctrl, duty = 0, period = 0, val; int ret; unsigned int delay_us, prescaler = 0; - unsigned long now; bool bypass; pwm_get_state(pwm, &cstate); @@ -284,8 +282,6 @@ static int sun4i_pwm_apply(struct pwm_chip *chip, struct 
pwm_device *pwm, val = (duty & PWM_DTY_MASK) | PWM_PRD(period); sun4i_pwm_writel(sun4i_pwm, val, PWM_CH_PRD(pwm->hwpwm)); - sun4i_pwm->next_period[pwm->hwpwm] = jiffies + - nsecs_to_jiffies(cstate.period + 1000); if (state->polarity != PWM_POLARITY_NORMAL) ctrl &= ~BIT_CH(PWM_ACT_STATE, pwm->hwpwm); @@ -305,15 +301,11 @@ static int sun4i_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, return 0; /* We need a full period to elapse before disabling the channel. */ - now = jiffies; - if (time_before(now, sun4i_pwm->next_period[pwm->hwpwm])) { - delay_us = jiffies_to_usecs(sun4i_pwm->next_period[pwm->hwpwm] - - now); - if ((delay_us / 500) > MAX_UDELAY_MS) - msleep(delay_us / 1000 + 1); - else - usleep_range(delay_us, delay_us * 2); - } + delay_us = DIV_ROUND_UP_ULL(cstate.period, NSEC_PER_USEC); + if ((delay_us / 500) > MAX_UDELAY_MS) + msleep(delay_us / 1000 + 1); + else + usleep_range(delay_us, delay_us * 2); spin_lock(&sun4i_pwm->ctrl_lock); ctrl = sun4i_pwm_readl(sun4i_pwm, PWM_CTRL_REG); diff --git a/drivers/pwm/pwm-sunplus.c b/drivers/pwm/pwm-sunplus.c new file mode 100644 index 000000000000..e776fd16512d --- /dev/null +++ b/drivers/pwm/pwm-sunplus.c @@ -0,0 +1,232 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * PWM device driver for SUNPLUS SP7021 SoC + * + * Links: + * Reference Manual: + * https://sunplus-tibbo.atlassian.net/wiki/spaces/doc/overview + * + * Reference Manual (PWM module): + * https://sunplus.atlassian.net/wiki/spaces/doc/pages/461144198/12.+Pulse+Width+Modulation+PWM + * + * Limitations: + * - Only supports normal polarity. + * - The output is low while a PWM channel is disabled. + * - When the parameters change, the currently running period is not completed; + * the new settings take effect immediately. + * - In .apply() the FREQ and DUTY registers must both be written. Between the + * write to FREQ and the write to DUTY there is a short window in which the + * new FREQ is combined with the old DUTY.
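+ * + * Worked example (a sketch, assuming the typical 202.5 MHz source clock + * mentioned below): a 1 ms period maps to + * FREQ = 202500000 * 1000000 / (256 * 10^9) = 791, and a 50% duty cycle + * then maps to DUTY = 128, i.e. half of the 256-step scale.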
+ * + * Author: Hammer Hsieh + */ +#include +#include +#include +#include +#include +#include +#include + +#define SP7021_PWM_MODE0 0x000 +#define SP7021_PWM_MODE0_PWMEN(ch) BIT(ch) +#define SP7021_PWM_MODE0_BYPASS(ch) BIT(8 + (ch)) +#define SP7021_PWM_MODE1 0x004 +#define SP7021_PWM_MODE1_CNT_EN(ch) BIT(ch) +#define SP7021_PWM_FREQ(ch) (0x008 + 4 * (ch)) +#define SP7021_PWM_FREQ_MAX GENMASK(15, 0) +#define SP7021_PWM_DUTY(ch) (0x018 + 4 * (ch)) +#define SP7021_PWM_DUTY_DD_SEL(ch) FIELD_PREP(GENMASK(9, 8), ch) +#define SP7021_PWM_DUTY_MAX GENMASK(7, 0) +#define SP7021_PWM_DUTY_MASK SP7021_PWM_DUTY_MAX +#define SP7021_PWM_FREQ_SCALER 256 +#define SP7021_PWM_NUM 4 + +struct sunplus_pwm { + struct pwm_chip chip; + void __iomem *base; + struct clk *clk; +}; + +static inline struct sunplus_pwm *to_sunplus_pwm(struct pwm_chip *chip) +{ + return container_of(chip, struct sunplus_pwm, chip); +} + +static int sunplus_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, + const struct pwm_state *state) +{ + struct sunplus_pwm *priv = to_sunplus_pwm(chip); + u32 dd_freq, duty, mode0, mode1; + u64 clk_rate; + + if (state->polarity != pwm->state.polarity) + return -EINVAL; + + if (!state->enabled) { + /* disable pwm channel output */ + mode0 = readl(priv->base + SP7021_PWM_MODE0); + mode0 &= ~SP7021_PWM_MODE0_PWMEN(pwm->hwpwm); + writel(mode0, priv->base + SP7021_PWM_MODE0); + /* disable pwm channel clk source */ + mode1 = readl(priv->base + SP7021_PWM_MODE1); + mode1 &= ~SP7021_PWM_MODE1_CNT_EN(pwm->hwpwm); + writel(mode1, priv->base + SP7021_PWM_MODE1); + return 0; + } + + clk_rate = clk_get_rate(priv->clk); + + /* + * The following calculations might overflow if clk is bigger + * than 256 GHz. In practice it's 202.5 MHz, so this limitation + * is only theoretical. + */ + if (clk_rate > (u64)SP7021_PWM_FREQ_SCALER * NSEC_PER_SEC) + return -EINVAL; + + /* + * With clk_rate limited above we have dd_freq <= state->period, + * so this cannot overflow. + */ + dd_freq = mul_u64_u64_div_u64(clk_rate, state->period, (u64)SP7021_PWM_FREQ_SCALER + * NSEC_PER_SEC); + + if (dd_freq == 0) + return -EINVAL; + + if (dd_freq > SP7021_PWM_FREQ_MAX) + dd_freq = SP7021_PWM_FREQ_MAX; + + writel(dd_freq, priv->base + SP7021_PWM_FREQ(pwm->hwpwm)); + + /* calculate and set pwm duty */ + mode0 = readl(priv->base + SP7021_PWM_MODE0); + mode0 |= SP7021_PWM_MODE0_PWMEN(pwm->hwpwm); + mode1 = readl(priv->base + SP7021_PWM_MODE1); + mode1 |= SP7021_PWM_MODE1_CNT_EN(pwm->hwpwm); + if (state->duty_cycle == state->period) { + /* PWM channel output = high */ + mode0 |= SP7021_PWM_MODE0_BYPASS(pwm->hwpwm); + duty = SP7021_PWM_DUTY_DD_SEL(pwm->hwpwm) | SP7021_PWM_DUTY_MAX; + } else { + mode0 &= ~SP7021_PWM_MODE0_BYPASS(pwm->hwpwm); + /* + * duty_ns <= period_ns (27 bits) and clk_rate (28 bits), so this won't overflow.
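+ * The quotient is the requested duty cycle scaled into the DUTY + * register's 0..255 range.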
+ */ + duty = mul_u64_u64_div_u64(state->duty_cycle, clk_rate, + (u64)dd_freq * NSEC_PER_SEC); + duty = SP7021_PWM_DUTY_DD_SEL(pwm->hwpwm) | duty; + } + writel(duty, priv->base + SP7021_PWM_DUTY(pwm->hwpwm)); + writel(mode1, priv->base + SP7021_PWM_MODE1); + writel(mode0, priv->base + SP7021_PWM_MODE0); + + return 0; +} + +static void sunplus_pwm_get_state(struct pwm_chip *chip, struct pwm_device *pwm, + struct pwm_state *state) +{ + struct sunplus_pwm *priv = to_sunplus_pwm(chip); + u32 mode0, dd_freq, duty; + u64 clk_rate; + + mode0 = readl(priv->base + SP7021_PWM_MODE0); + + if (mode0 & BIT(pwm->hwpwm)) { + clk_rate = clk_get_rate(priv->clk); + dd_freq = readl(priv->base + SP7021_PWM_FREQ(pwm->hwpwm)); + duty = readl(priv->base + SP7021_PWM_DUTY(pwm->hwpwm)); + duty = FIELD_GET(SP7021_PWM_DUTY_MASK, duty); + /* + * dd_freq 16 bits, SP7021_PWM_FREQ_SCALER 8 bits + * NSEC_PER_SEC 30 bits, won't overflow. + */ + state->period = DIV64_U64_ROUND_UP((u64)dd_freq * (u64)SP7021_PWM_FREQ_SCALER + * NSEC_PER_SEC, clk_rate); + /* + * dd_freq 16 bits, duty 8 bits, NSEC_PER_SEC 30 bits, won't overflow. + */ + state->duty_cycle = DIV64_U64_ROUND_UP((u64)dd_freq * (u64)duty * NSEC_PER_SEC, + clk_rate); + state->enabled = true; + } else { + state->enabled = false; + } + + state->polarity = PWM_POLARITY_NORMAL; +} + +static const struct pwm_ops sunplus_pwm_ops = { + .apply = sunplus_pwm_apply, + .get_state = sunplus_pwm_get_state, + .owner = THIS_MODULE, +}; + +static void sunplus_pwm_clk_release(void *data) +{ + struct clk *clk = data; + + clk_disable_unprepare(clk); +} + +static int sunplus_pwm_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct sunplus_pwm *priv; + int ret; + + priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + priv->base = devm_platform_ioremap_resource(pdev, 0); + if (IS_ERR(priv->base)) + return PTR_ERR(priv->base); + + priv->clk = devm_clk_get(dev, NULL); + if (IS_ERR(priv->clk)) + return dev_err_probe(dev, PTR_ERR(priv->clk), + "get pwm clock failed\n"); + + ret = clk_prepare_enable(priv->clk); + if (ret < 0) { + dev_err(dev, "failed to enable clock: %d\n", ret); + return ret; + } + + ret = devm_add_action_or_reset(dev, sunplus_pwm_clk_release, priv->clk); + if (ret < 0) { + dev_err(dev, "failed to release clock: %d\n", ret); + return ret; + } + + priv->chip.dev = dev; + priv->chip.ops = &sunplus_pwm_ops; + priv->chip.npwm = SP7021_PWM_NUM; + + ret = devm_pwmchip_add(dev, &priv->chip); + if (ret < 0) + return dev_err_probe(dev, ret, "Cannot register sunplus PWM\n"); + + return 0; +} + +static const struct of_device_id sunplus_pwm_of_match[] = { + { .compatible = "sunplus,sp7021-pwm", }, + {} +}; +MODULE_DEVICE_TABLE(of, sunplus_pwm_of_match); + +static struct platform_driver sunplus_pwm_driver = { + .probe = sunplus_pwm_probe, + .driver = { + .name = "sunplus-pwm", + .of_match_table = sunplus_pwm_of_match, + }, +}; +module_platform_driver(sunplus_pwm_driver); + +MODULE_DESCRIPTION("Sunplus SoC PWM Driver"); +MODULE_AUTHOR("Hammer Hsieh "); +MODULE_LICENSE("GPL"); diff --git a/drivers/pwm/pwm-tegra.c b/drivers/pwm/pwm-tegra.c index e5a9ffef4a71..dad9978c9186 100644 --- a/drivers/pwm/pwm-tegra.c +++ b/drivers/pwm/pwm-tegra.c @@ -99,7 +99,7 @@ static int tegra_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, int duty_ns, int period_ns) { struct tegra_pwm_chip *pc = to_tegra_pwm_chip(chip); - unsigned long long c = duty_ns, hz; + unsigned long long c = duty_ns; unsigned long rate, required_clk_rate; u32 val 
= 0; int err; @@ -156,11 +156,9 @@ static int tegra_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, pc->clk_rate = clk_get_rate(pc->clk); } - rate = pc->clk_rate >> PWM_DUTY_WIDTH; - /* Consider precision in PWM_SCALE_WIDTH rate calculation */ - hz = DIV_ROUND_CLOSEST_ULL(100ULL * NSEC_PER_SEC, period_ns); - rate = DIV_ROUND_CLOSEST_ULL(100ULL * rate, hz); + rate = mul_u64_u64_div_u64(pc->clk_rate, period_ns, + (u64)NSEC_PER_SEC << PWM_DUTY_WIDTH); /* * Since the actual PWM divider is the register's frequency divider @@ -169,6 +167,8 @@ static int tegra_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, */ if (rate > 0) rate--; + else + return -EINVAL; /* * Make sure that the rate will fit in the register's frequency @@ -230,10 +230,34 @@ static void tegra_pwm_disable(struct pwm_chip *chip, struct pwm_device *pwm) pm_runtime_put_sync(pc->dev); } +static int tegra_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, + const struct pwm_state *state) +{ + int err; + bool enabled = pwm->state.enabled; + + if (state->polarity != PWM_POLARITY_NORMAL) + return -EINVAL; + + if (!state->enabled) { + if (enabled) + tegra_pwm_disable(chip, pwm); + + return 0; + } + + err = tegra_pwm_config(pwm->chip, pwm, state->duty_cycle, state->period); + if (err) + return err; + + if (!enabled) + err = tegra_pwm_enable(chip, pwm); + + return err; +} + static const struct pwm_ops tegra_pwm_ops = { - .config = tegra_pwm_config, - .enable = tegra_pwm_enable, - .disable = tegra_pwm_disable, + .apply = tegra_pwm_apply, .owner = THIS_MODULE, }; diff --git a/drivers/pwm/pwm-twl-led.c b/drivers/pwm/pwm-twl-led.c index 49d9f7a78012..ed0b63dd38f1 100644 --- a/drivers/pwm/pwm-twl-led.c +++ b/drivers/pwm/pwm-twl-led.c @@ -137,6 +137,45 @@ out: mutex_unlock(&twl->mutex); } +static int twl4030_pwmled_apply(struct pwm_chip *chip, struct pwm_device *pwm, + const struct pwm_state *state) +{ + int ret; + + if (state->polarity != PWM_POLARITY_NORMAL) + return -EINVAL; + + if (!state->enabled) { + if (pwm->state.enabled) + twl4030_pwmled_disable(chip, pwm); + + return 0; + } + + /* + * We cannot skip calling ->config even if state->period == + * pwm->state.period && state->duty_cycle == pwm->state.duty_cycle + * because we might have exited early in the last call to + * pwm_apply_state because of !state->enabled and so the two values in + * pwm->state might not be configured in hardware. 
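+ * For example, applying { .enabled = false } and then re-applying the + * same period and duty cycle with .enabled = true must still program the + * hardware, because the first call returned before ->config ran.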
+ */ + ret = twl4030_pwmled_config(pwm->chip, pwm, + state->duty_cycle, state->period); + if (ret) + return ret; + + if (!pwm->state.enabled) + ret = twl4030_pwmled_enable(chip, pwm); + + return ret; +} + + +static const struct pwm_ops twl4030_pwmled_ops = { + .apply = twl4030_pwmled_apply, + .owner = THIS_MODULE, +}; + static int twl6030_pwmled_config(struct pwm_chip *chip, struct pwm_device *pwm, int duty_ns, int period_ns) { @@ -206,6 +245,32 @@ out: mutex_unlock(&twl->mutex); } +static int twl6030_pwmled_apply(struct pwm_chip *chip, struct pwm_device *pwm, + const struct pwm_state *state) +{ + int err; + + if (state->polarity != pwm->state.polarity) + return -EINVAL; + + if (!state->enabled) { + if (pwm->state.enabled) + twl6030_pwmled_disable(chip, pwm); + + return 0; + } + + err = twl6030_pwmled_config(pwm->chip, pwm, + state->duty_cycle, state->period); + if (err) + return err; + + if (!pwm->state.enabled) + err = twl6030_pwmled_enable(chip, pwm); + + return err; +} + static int twl6030_pwmled_request(struct pwm_chip *chip, struct pwm_device *pwm) { struct twl_pwmled_chip *twl = to_twl(chip); @@ -257,17 +322,8 @@ out: mutex_unlock(&twl->mutex); } -static const struct pwm_ops twl4030_pwmled_ops = { - .enable = twl4030_pwmled_enable, - .disable = twl4030_pwmled_disable, - .config = twl4030_pwmled_config, - .owner = THIS_MODULE, -}; - static const struct pwm_ops twl6030_pwmled_ops = { - .enable = twl6030_pwmled_enable, - .disable = twl6030_pwmled_disable, - .config = twl6030_pwmled_config, + .apply = twl6030_pwmled_apply, .request = twl6030_pwmled_request, .free = twl6030_pwmled_free, .owner = THIS_MODULE, diff --git a/drivers/pwm/pwm-xilinx.c b/drivers/pwm/pwm-xilinx.c new file mode 100644 index 000000000000..4dab2b86c427 --- /dev/null +++ b/drivers/pwm/pwm-xilinx.c @@ -0,0 +1,321 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Copyright (C) 2021 Sean Anderson + * + * Limitations: + * - When changing both duty cycle and period, we may end up with one cycle + * with the old duty cycle and the new period. This is because the counters + * may only be reloaded by first stopping them, or by letting them be + * automatically reloaded at the end of a cycle. If this automatic reload + * happens after we set TLR0 but before we set TLR1 then we will have a + * bad cycle. This could probably be fixed by reading TCR0 just before + * reprogramming, but I think it would add complexity for little gain. + * - Cannot produce 100% duty cycle by configuring the TLRs. This might be + * possible by stopping the counters at an appropriate point in the cycle, + * but this is not (yet) implemented. + * - Only produces "normal" output. + * - Always produces low output if disabled. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * The following functions are "common" to drivers for this device, and may be + * exported at a future date. 
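+ * As a worked example (assuming a 100 MHz s_axi_aclk; the real rate comes + * from the clock provider): in up-counting mode (TCSR_UDT set) a TLR of 98 + * corresponds to 98 + 2 = 100 cycles, which xilinx_timer_get_period() + * reports as a 1000 ns period.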
+ */ +u32 xilinx_timer_tlr_cycles(struct xilinx_timer_priv *priv, u32 tcsr, + u64 cycles) +{ + WARN_ON(cycles < 2 || cycles - 2 > priv->max); + + if (tcsr & TCSR_UDT) + return cycles - 2; + return priv->max - cycles + 2; +} + +unsigned int xilinx_timer_get_period(struct xilinx_timer_priv *priv, + u32 tlr, u32 tcsr) +{ + u64 cycles; + + if (tcsr & TCSR_UDT) + cycles = tlr + 2; + else + cycles = (u64)priv->max - tlr + 2; + + /* cycles has a max of 2^32 + 2, so we can't overflow */ + return DIV64_U64_ROUND_UP(cycles * NSEC_PER_SEC, + clk_get_rate(priv->clk)); +} + +/* + * The idea here is to capture whether the PWM is actually running (e.g. + * because we or the bootloader set it up) and we need to be careful to ensure + * we don't cause a glitch. According to the data sheet, to enable the PWM we + * need to + * + * - Set both timers to generate mode (MDT=1) + * - Set both timers to PWM mode (PWMA=1) + * - Enable the generate out signals (GENT=1) + * + * In addition, + * + * - The timer must be running (ENT=1) + * - The timer must auto-reload TLR into TCR (ARHT=1) + * - We must not be in the process of loading TLR into TCR (LOAD=0) + * - Cascade mode must be disabled (CASC=0) + * + * If any of these differ from usual, then the PWM is either disabled, or is + * running in a mode that this driver does not support. + */ +#define TCSR_PWM_SET (TCSR_GENT | TCSR_ARHT | TCSR_ENT | TCSR_PWMA) +#define TCSR_PWM_CLEAR (TCSR_MDT | TCSR_LOAD) +#define TCSR_PWM_MASK (TCSR_PWM_SET | TCSR_PWM_CLEAR) + +struct xilinx_pwm_device { + struct pwm_chip chip; + struct xilinx_timer_priv priv; +}; + +static inline struct xilinx_timer_priv +*xilinx_pwm_chip_to_priv(struct pwm_chip *chip) +{ + return &container_of(chip, struct xilinx_pwm_device, chip)->priv; +} + +static bool xilinx_timer_pwm_enabled(u32 tcsr0, u32 tcsr1) +{ + return ((TCSR_PWM_MASK | TCSR_CASC) & tcsr0) == TCSR_PWM_SET && + (TCSR_PWM_MASK & tcsr1) == TCSR_PWM_SET; +} + +static int xilinx_pwm_apply(struct pwm_chip *chip, struct pwm_device *unused, + const struct pwm_state *state) +{ + struct xilinx_timer_priv *priv = xilinx_pwm_chip_to_priv(chip); + u32 tlr0, tlr1, tcsr0, tcsr1; + u64 period_cycles, duty_cycles; + unsigned long rate; + + if (state->polarity != PWM_POLARITY_NORMAL) + return -EINVAL; + + /* + * To be representable by TLR, cycles must be between 2 and + * priv->max + 2. To enforce this we can reduce the cycles, but we may + * not increase them. Caveat emptor: while this does result in more + * predictable rounding, it may also result in a completely different + * duty cycle (% high time) than what was requested. + */ + rate = clk_get_rate(priv->clk); + /* Avoid overflow */ + period_cycles = min_t(u64, state->period, U32_MAX * NSEC_PER_SEC); + period_cycles = mul_u64_u32_div(period_cycles, rate, NSEC_PER_SEC); + period_cycles = min_t(u64, period_cycles, priv->max + 2); + if (period_cycles < 2) + return -ERANGE; + + /* Same thing for duty cycles */ + duty_cycles = min_t(u64, state->duty_cycle, U32_MAX * NSEC_PER_SEC); + duty_cycles = mul_u64_u32_div(duty_cycles, rate, NSEC_PER_SEC); + duty_cycles = min_t(u64, duty_cycles, priv->max + 2); + + /* + * If we specify 100% duty cycle, we will get 0% instead, so decrease + * the duty cycle count by one. 
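+ * For example, a request that computes to 100 duty cycles over a 100-cycle + * period is programmed as 99 duty cycles, i.e. 99% rather than an + * unachievable 100%.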
+ */ + if (duty_cycles >= period_cycles) + duty_cycles = period_cycles - 1; + + /* Round down to 0% duty cycle for unrepresentable duty cycles */ + if (duty_cycles < 2) + duty_cycles = period_cycles; + + regmap_read(priv->map, TCSR0, &tcsr0); + regmap_read(priv->map, TCSR1, &tcsr1); + tlr0 = xilinx_timer_tlr_cycles(priv, tcsr0, period_cycles); + tlr1 = xilinx_timer_tlr_cycles(priv, tcsr1, duty_cycles); + regmap_write(priv->map, TLR0, tlr0); + regmap_write(priv->map, TLR1, tlr1); + + if (state->enabled) { + /* + * If the PWM is already running, then the counters will be + * reloaded at the end of the current cycle. + */ + if (!xilinx_timer_pwm_enabled(tcsr0, tcsr1)) { + /* Load TLR into TCR */ + regmap_write(priv->map, TCSR0, tcsr0 | TCSR_LOAD); + regmap_write(priv->map, TCSR1, tcsr1 | TCSR_LOAD); + /* Enable timers all at once with ENALL */ + tcsr0 = (TCSR_PWM_SET & ~TCSR_ENT) | (tcsr0 & TCSR_UDT); + tcsr1 = TCSR_PWM_SET | TCSR_ENALL | (tcsr1 & TCSR_UDT); + regmap_write(priv->map, TCSR0, tcsr0); + regmap_write(priv->map, TCSR1, tcsr1); + } + } else { + regmap_write(priv->map, TCSR0, 0); + regmap_write(priv->map, TCSR1, 0); + } + + return 0; +} + +static void xilinx_pwm_get_state(struct pwm_chip *chip, + struct pwm_device *unused, + struct pwm_state *state) +{ + struct xilinx_timer_priv *priv = xilinx_pwm_chip_to_priv(chip); + u32 tlr0, tlr1, tcsr0, tcsr1; + + regmap_read(priv->map, TLR0, &tlr0); + regmap_read(priv->map, TLR1, &tlr1); + regmap_read(priv->map, TCSR0, &tcsr0); + regmap_read(priv->map, TCSR1, &tcsr1); + state->period = xilinx_timer_get_period(priv, tlr0, tcsr0); + state->duty_cycle = xilinx_timer_get_period(priv, tlr1, tcsr1); + state->enabled = xilinx_timer_pwm_enabled(tcsr0, tcsr1); + state->polarity = PWM_POLARITY_NORMAL; + + /* + * 100% duty cycle results in constant low output. 
This may be (very) + * wrong if rate > 1 GHz, so fix this if you have such hardware :) + */ + if (state->period == state->duty_cycle) + state->duty_cycle = 0; +} + +static const struct pwm_ops xilinx_pwm_ops = { + .apply = xilinx_pwm_apply, + .get_state = xilinx_pwm_get_state, + .owner = THIS_MODULE, +}; + +static const struct regmap_config xilinx_pwm_regmap_config = { + .reg_bits = 32, + .reg_stride = 4, + .val_bits = 32, + .val_format_endian = REGMAP_ENDIAN_LITTLE, + .max_register = TCR1, +}; + +static int xilinx_pwm_probe(struct platform_device *pdev) +{ + int ret; + struct device *dev = &pdev->dev; + struct device_node *np = dev->of_node; + struct xilinx_timer_priv *priv; + struct xilinx_pwm_device *xilinx_pwm; + u32 pwm_cells, one_timer, width; + void __iomem *regs; + + /* If there are no PWM cells, this binding is for a timer */ + ret = of_property_read_u32(np, "#pwm-cells", &pwm_cells); + if (ret == -EINVAL) + return -ENODEV; + if (ret) + return dev_err_probe(dev, ret, "could not read #pwm-cells\n"); + + xilinx_pwm = devm_kzalloc(dev, sizeof(*xilinx_pwm), GFP_KERNEL); + if (!xilinx_pwm) + return -ENOMEM; + platform_set_drvdata(pdev, xilinx_pwm); + priv = &xilinx_pwm->priv; + + regs = devm_platform_ioremap_resource(pdev, 0); + if (IS_ERR(regs)) + return PTR_ERR(regs); + + priv->map = devm_regmap_init_mmio(dev, regs, + &xilinx_pwm_regmap_config); + if (IS_ERR(priv->map)) + return dev_err_probe(dev, PTR_ERR(priv->map), + "Could not create regmap\n"); + + ret = of_property_read_u32(np, "xlnx,one-timer-only", &one_timer); + if (ret) + return dev_err_probe(dev, ret, + "Could not read xlnx,one-timer-only\n"); + + if (one_timer) + return dev_err_probe(dev, -EINVAL, + "Two timers required for PWM mode\n"); + + ret = of_property_read_u32(np, "xlnx,count-width", &width); + if (ret == -EINVAL) + width = 32; + else if (ret) + return dev_err_probe(dev, ret, + "Could not read xlnx,count-width\n"); + + if (width != 8 && width != 16 && width != 32) + return dev_err_probe(dev, -EINVAL, + "Invalid counter width %d\n", width); + priv->max = BIT_ULL(width) - 1; + + /* + * The polarity of the Generate Out signals must be active high for PWM + * mode to work. We could determine this from the device tree, but + * alas, such properties are not allowed to be used. 
+ */ + + priv->clk = devm_clk_get(dev, "s_axi_aclk"); + if (IS_ERR(priv->clk)) + return dev_err_probe(dev, PTR_ERR(priv->clk), + "Could not get clock\n"); + + ret = clk_prepare_enable(priv->clk); + if (ret) + return dev_err_probe(dev, ret, "Clock enable failed\n"); + clk_rate_exclusive_get(priv->clk); + + xilinx_pwm->chip.dev = dev; + xilinx_pwm->chip.ops = &xilinx_pwm_ops; + xilinx_pwm->chip.npwm = 1; + ret = pwmchip_add(&xilinx_pwm->chip); + if (ret) { + clk_rate_exclusive_put(priv->clk); + clk_disable_unprepare(priv->clk); + return dev_err_probe(dev, ret, "Could not register PWM chip\n"); + } + + return 0; +} + +static int xilinx_pwm_remove(struct platform_device *pdev) +{ + struct xilinx_pwm_device *xilinx_pwm = platform_get_drvdata(pdev); + + pwmchip_remove(&xilinx_pwm->chip); + clk_rate_exclusive_put(xilinx_pwm->priv.clk); + clk_disable_unprepare(xilinx_pwm->priv.clk); + return 0; +} + +static const struct of_device_id xilinx_pwm_of_match[] = { + { .compatible = "xlnx,xps-timer-1.00.a", }, + {}, +}; +MODULE_DEVICE_TABLE(of, xilinx_pwm_of_match); + +static struct platform_driver xilinx_pwm_driver = { + .probe = xilinx_pwm_probe, + .remove = xilinx_pwm_remove, + .driver = { + .name = "xilinx-pwm", + .of_match_table = of_match_ptr(xilinx_pwm_of_match), + }, +}; +module_platform_driver(xilinx_pwm_driver); + +MODULE_ALIAS("platform:xilinx-pwm"); +MODULE_DESCRIPTION("PWM driver for Xilinx LogiCORE IP AXI Timer"); +MODULE_LICENSE("GPL"); diff --git a/drivers/remoteproc/imx_dsp_rproc.c b/drivers/remoteproc/imx_dsp_rproc.c index 2abee78df96e..ca0817f8e41e 100644 --- a/drivers/remoteproc/imx_dsp_rproc.c +++ b/drivers/remoteproc/imx_dsp_rproc.c @@ -649,99 +649,6 @@ static int imx_dsp_rproc_add_carveout(struct imx_dsp_rproc *priv) return 0; } -/** - * imx_dsp_rproc_elf_load_segments() - load firmware segments to memory - * @rproc: remote processor which will be booted using these fw segments - * @fw: the ELF firmware image - * - * This function specially checks if memsz is zero or not, otherwise it - * is mostly same as rproc_elf_load_segments(). - */ -static int imx_dsp_rproc_elf_load_segments(struct rproc *rproc, - const struct firmware *fw) -{ - struct device *dev = &rproc->dev; - u8 class = fw_elf_get_class(fw); - u32 elf_phdr_get_size = elf_size_of_phdr(class); - const u8 *elf_data = fw->data; - const void *ehdr, *phdr; - int i, ret = 0; - u16 phnum; - - ehdr = elf_data; - phnum = elf_hdr_get_e_phnum(class, ehdr); - phdr = elf_data + elf_hdr_get_e_phoff(class, ehdr); - - /* go through the available ELF segments */ - for (i = 0; i < phnum; i++, phdr += elf_phdr_get_size) { - u64 da = elf_phdr_get_p_paddr(class, phdr); - u64 memsz = elf_phdr_get_p_memsz(class, phdr); - u64 filesz = elf_phdr_get_p_filesz(class, phdr); - u64 offset = elf_phdr_get_p_offset(class, phdr); - u32 type = elf_phdr_get_p_type(class, phdr); - void *ptr; - - /* - * There is a case that with PT_LOAD type, the - * filesz = memsz = 0. If memsz = 0, rproc_da_to_va - * should return NULL ptr, then error is returned. - * So this case should be skipped from the loop. - * Add !memsz checking here. 
- */ - if (type != PT_LOAD || !memsz) - continue; - - dev_dbg(dev, "phdr: type %d da 0x%llx memsz 0x%llx filesz 0x%llx\n", - type, da, memsz, filesz); - - if (filesz > memsz) { - dev_err(dev, "bad phdr filesz 0x%llx memsz 0x%llx\n", - filesz, memsz); - ret = -EINVAL; - break; - } - - if (offset + filesz > fw->size) { - dev_err(dev, "truncated fw: need 0x%llx avail 0x%zx\n", - offset + filesz, fw->size); - ret = -EINVAL; - break; - } - - if (!rproc_u64_fit_in_size_t(memsz)) { - dev_err(dev, "size (%llx) does not fit in size_t type\n", - memsz); - ret = -EOVERFLOW; - break; - } - - /* grab the kernel address for this device address */ - ptr = rproc_da_to_va(rproc, da, memsz, NULL); - if (!ptr) { - dev_err(dev, "bad phdr da 0x%llx mem 0x%llx\n", da, - memsz); - ret = -EINVAL; - break; - } - - /* put the segment where the remote processor expects it */ - if (filesz) - memcpy(ptr, elf_data + offset, filesz); - - /* - * Zero out remaining memory for this segment. - * - * This isn't strictly required since dma_alloc_coherent already - * did this for us. albeit harmless, we may consider removing - * this. - */ - if (memsz > filesz) - memset(ptr + filesz, 0, memsz - filesz); - } - - return ret; -} - /* Prepare function for rproc_ops */ static int imx_dsp_rproc_prepare(struct rproc *rproc) { @@ -802,14 +709,22 @@ static void imx_dsp_rproc_kick(struct rproc *rproc, int vqid) dev_err(dev, "%s: failed (%d, err:%d)\n", __func__, vqid, err); } +static int imx_dsp_rproc_parse_fw(struct rproc *rproc, const struct firmware *fw) +{ + if (rproc_elf_load_rsc_table(rproc, fw)) + dev_warn(&rproc->dev, "no resource table found for this firmware\n"); + + return 0; +} + static const struct rproc_ops imx_dsp_rproc_ops = { .prepare = imx_dsp_rproc_prepare, .unprepare = imx_dsp_rproc_unprepare, .start = imx_dsp_rproc_start, .stop = imx_dsp_rproc_stop, .kick = imx_dsp_rproc_kick, - .load = imx_dsp_rproc_elf_load_segments, - .parse_fw = rproc_elf_load_rsc_table, + .load = rproc_elf_load_segments, + .parse_fw = imx_dsp_rproc_parse_fw, .sanity_check = rproc_elf_sanity_check, .get_boot_addr = rproc_elf_get_boot_addr, }; diff --git a/drivers/remoteproc/imx_rproc.c b/drivers/remoteproc/imx_rproc.c index 7a096f1891e6..4a3352821b1d 100644 --- a/drivers/remoteproc/imx_rproc.c +++ b/drivers/remoteproc/imx_rproc.c @@ -91,6 +91,32 @@ struct imx_rproc { void __iomem *rsc_table; }; +static const struct imx_rproc_att imx_rproc_att_imx93[] = { + /* dev addr , sys addr , size , flags */ + /* TCM CODE NON-SECURE */ + { 0x0FFC0000, 0x201C0000, 0x00020000, ATT_OWN | ATT_IOMEM }, + { 0x0FFE0000, 0x201E0000, 0x00020000, ATT_OWN | ATT_IOMEM }, + + /* TCM CODE SECURE */ + { 0x1FFC0000, 0x201C0000, 0x00020000, ATT_OWN | ATT_IOMEM }, + { 0x1FFE0000, 0x201E0000, 0x00020000, ATT_OWN | ATT_IOMEM }, + + /* TCM SYS NON-SECURE*/ + { 0x20000000, 0x20200000, 0x00020000, ATT_OWN | ATT_IOMEM }, + { 0x20020000, 0x20220000, 0x00020000, ATT_OWN | ATT_IOMEM }, + + /* TCM SYS SECURE*/ + { 0x30000000, 0x20200000, 0x00020000, ATT_OWN | ATT_IOMEM }, + { 0x30020000, 0x20220000, 0x00020000, ATT_OWN | ATT_IOMEM }, + + /* DDR */ + { 0x80000000, 0x80000000, 0x10000000, 0 }, + { 0x90000000, 0x80000000, 0x10000000, 0 }, + + { 0xC0000000, 0xa0000000, 0x10000000, 0 }, + { 0xD0000000, 0xa0000000, 0x10000000, 0 }, +}; + static const struct imx_rproc_att imx_rproc_att_imx8mn[] = { /* dev addr , sys addr , size , flags */ /* ITCM */ @@ -261,6 +287,12 @@ static const struct imx_rproc_dcfg imx_rproc_cfg_imx6sx = { .method = IMX_RPROC_MMIO, }; +static const struct 
imx_rproc_dcfg imx_rproc_cfg_imx93 = { + .att = imx_rproc_att_imx93, + .att_size = ARRAY_SIZE(imx_rproc_att_imx93), + .method = IMX_RPROC_SMC, +}; + static int imx_rproc_start(struct rproc *rproc) { struct imx_rproc *priv = rproc->priv; @@ -423,6 +455,9 @@ static int imx_rproc_prepare(struct rproc *rproc) if (!strcmp(it.node->name, "vdev0buffer")) continue; + if (!strcmp(it.node->name, "rsc-table")) + continue; + rmem = of_reserved_mem_lookup(it.node); if (!rmem) { dev_err(priv->dev, "unable to acquire memory-region\n"); @@ -821,6 +856,7 @@ static const struct of_device_id imx_rproc_of_match[] = { { .compatible = "fsl,imx8mn-cm7", .data = &imx_rproc_cfg_imx8mn }, { .compatible = "fsl,imx8mp-cm7", .data = &imx_rproc_cfg_imx8mn }, { .compatible = "fsl,imx8ulp-cm33", .data = &imx_rproc_cfg_imx8ulp }, + { .compatible = "fsl,imx93-cm33", .data = &imx_rproc_cfg_imx93 }, {}, }; MODULE_DEVICE_TABLE(of, imx_rproc_of_match); diff --git a/drivers/remoteproc/mtk_common.h b/drivers/remoteproc/mtk_common.h index 71ce4977cb0b..ea6fa1100a00 100644 --- a/drivers/remoteproc/mtk_common.h +++ b/drivers/remoteproc/mtk_common.h @@ -54,6 +54,8 @@ #define MT8192_CORE0_WDT_IRQ 0x10030 #define MT8192_CORE0_WDT_CFG 0x10034 +#define MT8195_L1TCM_SRAM_PDN_RESERVED_RSI_BITS GENMASK(7, 4) + #define SCP_FW_VER_LEN 32 #define SCP_SHARE_BUFFER_SIZE 288 diff --git a/drivers/remoteproc/mtk_scp.c b/drivers/remoteproc/mtk_scp.c index 38609153bf64..47b2a40e1b4a 100644 --- a/drivers/remoteproc/mtk_scp.c +++ b/drivers/remoteproc/mtk_scp.c @@ -365,22 +365,22 @@ static int mt8183_scp_before_load(struct mtk_scp *scp) return 0; } -static void mt8192_power_on_sram(void __iomem *addr) +static void scp_sram_power_on(void __iomem *addr, u32 reserved_mask) { int i; for (i = 31; i >= 0; i--) - writel(GENMASK(i, 0), addr); + writel(GENMASK(i, 0) & ~reserved_mask, addr); writel(0, addr); } -static void mt8192_power_off_sram(void __iomem *addr) +static void scp_sram_power_off(void __iomem *addr, u32 reserved_mask) { int i; writel(0, addr); for (i = 0; i < 32; i++) - writel(GENMASK(i, 0), addr); + writel(GENMASK(i, 0) & ~reserved_mask, addr); } static int mt8186_scp_before_load(struct mtk_scp *scp) @@ -393,7 +393,7 @@ static int mt8186_scp_before_load(struct mtk_scp *scp) writel(0x0, scp->reg_base + MT8183_SCP_CLK_DIV_SEL); /* Turn on the power of SCP's SRAM before using it. Enable 1 block per time*/ - mt8192_power_on_sram(scp->reg_base + MT8183_SCP_SRAM_PDN); + scp_sram_power_on(scp->reg_base + MT8183_SCP_SRAM_PDN, 0); /* Initialize TCM before loading FW. 
*/ writel(0x0, scp->reg_base + MT8183_SCP_L1_SRAM_PD); @@ -412,11 +412,32 @@ static int mt8192_scp_before_load(struct mtk_scp *scp) writel(1, scp->reg_base + MT8192_CORE0_SW_RSTN_SET); /* enable SRAM clock */ - mt8192_power_on_sram(scp->reg_base + MT8192_L2TCM_SRAM_PD_0); - mt8192_power_on_sram(scp->reg_base + MT8192_L2TCM_SRAM_PD_1); - mt8192_power_on_sram(scp->reg_base + MT8192_L2TCM_SRAM_PD_2); - mt8192_power_on_sram(scp->reg_base + MT8192_L1TCM_SRAM_PDN); - mt8192_power_on_sram(scp->reg_base + MT8192_CPU0_SRAM_PD); + scp_sram_power_on(scp->reg_base + MT8192_L2TCM_SRAM_PD_0, 0); + scp_sram_power_on(scp->reg_base + MT8192_L2TCM_SRAM_PD_1, 0); + scp_sram_power_on(scp->reg_base + MT8192_L2TCM_SRAM_PD_2, 0); + scp_sram_power_on(scp->reg_base + MT8192_L1TCM_SRAM_PDN, 0); + scp_sram_power_on(scp->reg_base + MT8192_CPU0_SRAM_PD, 0); + + /* enable MPU for all memory regions */ + writel(0xff, scp->reg_base + MT8192_CORE0_MEM_ATT_PREDEF); + + return 0; +} + +static int mt8195_scp_before_load(struct mtk_scp *scp) +{ + /* clear SPM interrupt, SCP2SPM_IPC_CLR */ + writel(0xff, scp->reg_base + MT8192_SCP2SPM_IPC_CLR); + + writel(1, scp->reg_base + MT8192_CORE0_SW_RSTN_SET); + + /* enable SRAM clock */ + scp_sram_power_on(scp->reg_base + MT8192_L2TCM_SRAM_PD_0, 0); + scp_sram_power_on(scp->reg_base + MT8192_L2TCM_SRAM_PD_1, 0); + scp_sram_power_on(scp->reg_base + MT8192_L2TCM_SRAM_PD_2, 0); + scp_sram_power_on(scp->reg_base + MT8192_L1TCM_SRAM_PDN, + MT8195_L1TCM_SRAM_PDN_RESERVED_RSI_BITS); + scp_sram_power_on(scp->reg_base + MT8192_CPU0_SRAM_PD, 0); /* enable MPU for all memory regions */ writel(0xff, scp->reg_base + MT8192_CORE0_MEM_ATT_PREDEF); @@ -572,11 +593,25 @@ static void mt8183_scp_stop(struct mtk_scp *scp) static void mt8192_scp_stop(struct mtk_scp *scp) { /* Disable SRAM clock */ - mt8192_power_off_sram(scp->reg_base + MT8192_L2TCM_SRAM_PD_0); - mt8192_power_off_sram(scp->reg_base + MT8192_L2TCM_SRAM_PD_1); - mt8192_power_off_sram(scp->reg_base + MT8192_L2TCM_SRAM_PD_2); - mt8192_power_off_sram(scp->reg_base + MT8192_L1TCM_SRAM_PDN); - mt8192_power_off_sram(scp->reg_base + MT8192_CPU0_SRAM_PD); + scp_sram_power_off(scp->reg_base + MT8192_L2TCM_SRAM_PD_0, 0); + scp_sram_power_off(scp->reg_base + MT8192_L2TCM_SRAM_PD_1, 0); + scp_sram_power_off(scp->reg_base + MT8192_L2TCM_SRAM_PD_2, 0); + scp_sram_power_off(scp->reg_base + MT8192_L1TCM_SRAM_PDN, 0); + scp_sram_power_off(scp->reg_base + MT8192_CPU0_SRAM_PD, 0); + + /* Disable SCP watchdog */ + writel(0, scp->reg_base + MT8192_CORE0_WDT_CFG); +} + +static void mt8195_scp_stop(struct mtk_scp *scp) +{ + /* Disable SRAM clock */ + scp_sram_power_off(scp->reg_base + MT8192_L2TCM_SRAM_PD_0, 0); + scp_sram_power_off(scp->reg_base + MT8192_L2TCM_SRAM_PD_1, 0); + scp_sram_power_off(scp->reg_base + MT8192_L2TCM_SRAM_PD_2, 0); + scp_sram_power_off(scp->reg_base + MT8192_L1TCM_SRAM_PDN, + MT8195_L1TCM_SRAM_PDN_RESERVED_RSI_BITS); + scp_sram_power_off(scp->reg_base + MT8192_CPU0_SRAM_PD, 0); /* Disable SCP watchdog */ writel(0, scp->reg_base + MT8192_CORE0_WDT_CFG); @@ -774,9 +809,13 @@ static int scp_probe(struct platform_device *pdev) struct mtk_scp *scp; struct rproc *rproc; struct resource *res; - char *fw_name = "scp.img"; + const char *fw_name = "scp.img"; int ret, i; + ret = rproc_of_parse_firmware(dev, 0, &fw_name); + if (ret < 0 && ret != -EINVAL) + return ret; + rproc = devm_rproc_alloc(dev, np->name, &scp_ops, fw_name, sizeof(*scp)); if (!rproc) return dev_err_probe(dev, -ENOMEM, "unable to allocate remoteproc\n"); @@ -877,7 +916,6 @@ static 
int scp_remove(struct platform_device *pdev) for (i = 0; i < SCP_IPI_MAX; i++) mutex_destroy(&scp->ipi_desc[i].lock); mutex_destroy(&scp->send_lock); - rproc_free(scp->rproc); return 0; } @@ -922,11 +960,11 @@ static const struct mtk_scp_of_data mt8192_of_data = { static const struct mtk_scp_of_data mt8195_of_data = { .scp_clk_get = mt8195_scp_clk_get, - .scp_before_load = mt8192_scp_before_load, + .scp_before_load = mt8195_scp_before_load, .scp_irq_handler = mt8192_scp_irq_handler, .scp_reset_assert = mt8192_scp_reset_assert, .scp_reset_deassert = mt8192_scp_reset_deassert, - .scp_stop = mt8192_scp_stop, + .scp_stop = mt8195_scp_stop, .scp_da_to_va = mt8192_scp_da_to_va, .host_to_scp_reg = MT8192_GIPC_IN_SET, .host_to_scp_int_bit = MT8192_HOST_IPC_INT_BIT, diff --git a/drivers/remoteproc/qcom_q6v5_pas.c b/drivers/remoteproc/qcom_q6v5_pas.c index 1ae47cc153e5..6ae39c5653b1 100644 --- a/drivers/remoteproc/qcom_q6v5_pas.c +++ b/drivers/remoteproc/qcom_q6v5_pas.c @@ -704,6 +704,36 @@ static const struct adsp_data sm8250_cdsp_resource = { .ssctl_id = 0x17, }; +static const struct adsp_data sc8280xp_nsp0_resource = { + .crash_reason_smem = 601, + .firmware_name = "cdsp.mdt", + .pas_id = 18, + .has_aggre2_clk = false, + .auto_boot = true, + .proxy_pd_names = (char*[]){ + "nsp", + NULL + }, + .ssr_name = "cdsp0", + .sysmon_name = "cdsp", + .ssctl_id = 0x17, +}; + +static const struct adsp_data sc8280xp_nsp1_resource = { + .crash_reason_smem = 633, + .firmware_name = "cdsp.mdt", + .pas_id = 30, + .has_aggre2_clk = false, + .auto_boot = true, + .proxy_pd_names = (char*[]){ + "nsp", + NULL + }, + .ssr_name = "cdsp1", + .sysmon_name = "cdsp1", + .ssctl_id = 0x20, +}; + static const struct adsp_data sm8350_cdsp_resource = { .crash_reason_smem = 601, .firmware_name = "cdsp.mdt", @@ -848,6 +878,7 @@ static const struct adsp_data sdx55_mpss_resource = { }; static const struct of_device_id adsp_of_match[] = { + { .compatible = "qcom,msm8226-adsp-pil", .data = &adsp_resource_init}, { .compatible = "qcom,msm8974-adsp-pil", .data = &adsp_resource_init}, { .compatible = "qcom,msm8996-adsp-pil", .data = &msm8996_adsp_resource}, { .compatible = "qcom,msm8996-slpi-pil", .data = &slpi_resource_init}, @@ -861,6 +892,9 @@ static const struct of_device_id adsp_of_match[] = { { .compatible = "qcom,sc8180x-adsp-pas", .data = &sm8150_adsp_resource}, { .compatible = "qcom,sc8180x-cdsp-pas", .data = &sm8150_cdsp_resource}, { .compatible = "qcom,sc8180x-mpss-pas", .data = &sc8180x_mpss_resource}, + { .compatible = "qcom,sc8280xp-adsp-pas", .data = &sm8250_adsp_resource}, + { .compatible = "qcom,sc8280xp-nsp0-pas", .data = &sc8280xp_nsp0_resource}, + { .compatible = "qcom,sc8280xp-nsp1-pas", .data = &sc8280xp_nsp1_resource}, { .compatible = "qcom,sdm660-adsp-pas", .data = &adsp_resource_init}, { .compatible = "qcom,sdm845-adsp-pas", .data = &sdm845_adsp_resource_init}, { .compatible = "qcom,sdm845-cdsp-pas", .data = &sdm845_cdsp_resource_init}, diff --git a/drivers/remoteproc/remoteproc_cdev.c b/drivers/remoteproc/remoteproc_cdev.c index 906ff3c4dfdd..687f205fd70a 100644 --- a/drivers/remoteproc/remoteproc_cdev.c +++ b/drivers/remoteproc/remoteproc_cdev.c @@ -32,21 +32,10 @@ static ssize_t rproc_cdev_write(struct file *filp, const char __user *buf, size_ return -EFAULT; if (!strncmp(cmd, "start", len)) { - if (rproc->state == RPROC_RUNNING || - rproc->state == RPROC_ATTACHED) - return -EBUSY; - ret = rproc_boot(rproc); } else if (!strncmp(cmd, "stop", len)) { - if (rproc->state != RPROC_RUNNING && - rproc->state != 
RPROC_ATTACHED) - return -EINVAL; - ret = rproc_shutdown(rproc); } else if (!strncmp(cmd, "detach", len)) { - if (rproc->state != RPROC_ATTACHED) - return -EINVAL; - ret = rproc_detach(rproc); } else { dev_err(&rproc->dev, "Unrecognized option\n"); diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c index c510125769b9..02a04ab34a23 100644 --- a/drivers/remoteproc/remoteproc_core.c +++ b/drivers/remoteproc/remoteproc_core.c @@ -684,10 +684,6 @@ static int rproc_handle_trace(struct rproc *rproc, void *ptr, /* create the debugfs entry */ trace->tfile = rproc_create_trace_file(name, rproc, trace); - if (!trace->tfile) { - kfree(trace); - return -EINVAL; - } list_add_tail(&trace->node, &rproc->traces); @@ -2075,6 +2071,12 @@ int rproc_shutdown(struct rproc *rproc) return ret; } + if (rproc->state != RPROC_RUNNING && + rproc->state != RPROC_ATTACHED) { + ret = -EINVAL; + goto out; + } + /* if the remote proc is still needed, bail out */ if (!atomic_dec_and_test(&rproc->power)) goto out; @@ -2134,6 +2136,11 @@ int rproc_detach(struct rproc *rproc) return ret; } + if (rproc->state != RPROC_ATTACHED) { + ret = -EINVAL; + goto out; + } + /* if the remote proc is still needed, bail out */ if (!atomic_dec_and_test(&rproc->power)) { ret = 0; diff --git a/drivers/remoteproc/remoteproc_debugfs.c b/drivers/remoteproc/remoteproc_debugfs.c index 581930483ef8..b86c1d09c70c 100644 --- a/drivers/remoteproc/remoteproc_debugfs.c +++ b/drivers/remoteproc/remoteproc_debugfs.c @@ -386,16 +386,8 @@ void rproc_remove_trace_file(struct dentry *tfile) struct dentry *rproc_create_trace_file(const char *name, struct rproc *rproc, struct rproc_debug_trace *trace) { - struct dentry *tfile; - - tfile = debugfs_create_file(name, 0400, rproc->dbg_dir, trace, + return debugfs_create_file(name, 0400, rproc->dbg_dir, trace, &trace_rproc_ops); - if (!tfile) { - dev_err(&rproc->dev, "failed to create debugfs trace entry\n"); - return NULL; - } - - return tfile; } void rproc_delete_debug_dir(struct rproc *rproc) @@ -411,8 +403,6 @@ void rproc_create_debug_dir(struct rproc *rproc) return; rproc->dbg_dir = debugfs_create_dir(dev_name(dev), rproc_dbg); - if (!rproc->dbg_dir) - return; debugfs_create_file("name", 0400, rproc->dbg_dir, rproc, &rproc_name_ops); @@ -430,11 +420,8 @@ void rproc_create_debug_dir(struct rproc *rproc) void __init rproc_init_debugfs(void) { - if (debugfs_initialized()) { + if (debugfs_initialized()) rproc_dbg = debugfs_create_dir(KBUILD_MODNAME, NULL); - if (!rproc_dbg) - pr_err("can't create debugfs dir\n"); - } } void __exit rproc_exit_debugfs(void) diff --git a/drivers/remoteproc/remoteproc_elf_loader.c b/drivers/remoteproc/remoteproc_elf_loader.c index d635d19a5aa8..5a412d7b6e0b 100644 --- a/drivers/remoteproc/remoteproc_elf_loader.c +++ b/drivers/remoteproc/remoteproc_elf_loader.c @@ -181,7 +181,7 @@ int rproc_elf_load_segments(struct rproc *rproc, const struct firmware *fw) bool is_iomem = false; void *ptr; - if (type != PT_LOAD) + if (type != PT_LOAD || !memsz) continue; dev_dbg(dev, "phdr: type %d da 0x%llx memsz 0x%llx filesz 0x%llx\n", diff --git a/drivers/remoteproc/remoteproc_sysfs.c b/drivers/remoteproc/remoteproc_sysfs.c index 51a04bc6ba7a..8c7ea8922638 100644 --- a/drivers/remoteproc/remoteproc_sysfs.c +++ b/drivers/remoteproc/remoteproc_sysfs.c @@ -194,23 +194,12 @@ static ssize_t state_store(struct device *dev, int ret = 0; if (sysfs_streq(buf, "start")) { - if (rproc->state == RPROC_RUNNING || - rproc->state == RPROC_ATTACHED) - return -EBUSY; - ret = 
rproc_boot(rproc);
 		if (ret)
 			dev_err(&rproc->dev, "Boot failed: %d\n", ret);
 	} else if (sysfs_streq(buf, "stop")) {
-		if (rproc->state != RPROC_RUNNING &&
-		    rproc->state != RPROC_ATTACHED)
-			return -EINVAL;
-
 		ret = rproc_shutdown(rproc);
 	} else if (sysfs_streq(buf, "detach")) {
-		if (rproc->state != RPROC_ATTACHED)
-			return -EINVAL;
-
 		ret = rproc_detach(rproc);
 	} else {
 		dev_err(&rproc->dev, "Unrecognised option: %s\n", buf);
diff --git a/drivers/rpmsg/qcom_smd.c b/drivers/rpmsg/qcom_smd.c
index 764c980507be..1957b27c4cf3 100644
--- a/drivers/rpmsg/qcom_smd.c
+++ b/drivers/rpmsg/qcom_smd.c
@@ -1407,9 +1407,9 @@ static int qcom_smd_parse_edge(struct device *dev,
 	edge->name = node->name;
 
 	irq = irq_of_parse_and_map(node, 0);
-	if (irq < 0) {
+	if (!irq) {
 		dev_err(dev, "required smd interrupt missing\n");
-		ret = irq;
+		ret = -EINVAL;
 		goto put_node;
 	}
 
diff --git a/drivers/rpmsg/virtio_rpmsg_bus.c b/drivers/rpmsg/virtio_rpmsg_bus.c
index 3ede25b1f2e4..905ac7910c98 100644
--- a/drivers/rpmsg/virtio_rpmsg_bus.c
+++ b/drivers/rpmsg/virtio_rpmsg_bus.c
@@ -851,7 +851,7 @@ static struct rpmsg_device *rpmsg_virtio_add_ctrl_dev(struct virtio_device *vdev
 
 	err = rpmsg_ctrldev_register_device(rpdev_ctrl);
 	if (err) {
-		kfree(vch);
+		/* vch will be freed in virtio_rpmsg_release_device() */
 		return ERR_PTR(err);
 	}
 
@@ -862,7 +862,7 @@ static void rpmsg_virtio_del_ctrl_dev(struct rpmsg_device *rpdev_ctrl)
 {
 	if (!rpdev_ctrl)
 		return;
-	kfree(to_virtio_rpmsg_channel(rpdev_ctrl));
+	device_unregister(&rpdev_ctrl->dev);
 }
 
 static int rpmsg_probe(struct virtio_device *vdev)
@@ -973,7 +973,8 @@ static int rpmsg_probe(struct virtio_device *vdev)
 
 		err = rpmsg_ns_register_device(rpdev_ns);
 		if (err)
-			goto free_vch;
+			/* vch will be freed in virtio_rpmsg_release_device() */
+			goto free_ctrldev;
 	}
 
 	/*
@@ -997,8 +998,6 @@ static int rpmsg_probe(struct virtio_device *vdev)
 
 	return 0;
 
-free_vch:
-	kfree(vch);
 free_ctrldev:
 	rpmsg_virtio_del_ctrl_dev(rpdev_ctrl);
 free_coherent:
diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig
index 41c65b4d2baf..a00f901b5c1d 100644
--- a/drivers/rtc/Kconfig
+++ b/drivers/rtc/Kconfig
@@ -1548,6 +1548,13 @@ config RTC_DRV_RS5C313
 	help
 	  If you say yes here you get support for the Ricoh RS5C313 RTC chips.
 
+config RTC_DRV_RZN1
+	tristate "Renesas RZ/N1 RTC"
+	depends on ARCH_RZN1 || COMPILE_TEST
+	depends on OF && HAS_IOMEM
+	help
+	  If you say yes here you get support for the Renesas RZ/N1 RTC.
+ config RTC_DRV_GENERIC tristate "Generic RTC support" # Please consider writing a new RTC driver instead of using the generic diff --git a/drivers/rtc/Makefile b/drivers/rtc/Makefile index 2d827d8261d5..fb04467b652d 100644 --- a/drivers/rtc/Makefile +++ b/drivers/rtc/Makefile @@ -151,6 +151,7 @@ obj-$(CONFIG_RTC_DRV_RX6110) += rtc-rx6110.o obj-$(CONFIG_RTC_DRV_RX8010) += rtc-rx8010.o obj-$(CONFIG_RTC_DRV_RX8025) += rtc-rx8025.o obj-$(CONFIG_RTC_DRV_RX8581) += rtc-rx8581.o +obj-$(CONFIG_RTC_DRV_RZN1) += rtc-rzn1.o obj-$(CONFIG_RTC_DRV_S35390A) += rtc-s35390a.o obj-$(CONFIG_RTC_DRV_S3C) += rtc-s3c.o obj-$(CONFIG_RTC_DRV_S5M) += rtc-s5m.o diff --git a/drivers/rtc/rtc-ftrtc010.c b/drivers/rtc/rtc-ftrtc010.c index 53bb08fe1cd4..25c6e7d9570f 100644 --- a/drivers/rtc/rtc-ftrtc010.c +++ b/drivers/rtc/rtc-ftrtc010.c @@ -137,26 +137,34 @@ static int ftrtc010_rtc_probe(struct platform_device *pdev) ret = clk_prepare_enable(rtc->extclk); if (ret) { dev_err(dev, "failed to enable EXTCLK\n"); - return ret; + goto err_disable_pclk; } } rtc->rtc_irq = platform_get_irq(pdev, 0); - if (rtc->rtc_irq < 0) - return rtc->rtc_irq; + if (rtc->rtc_irq < 0) { + ret = rtc->rtc_irq; + goto err_disable_extclk; + } res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (!res) - return -ENODEV; + if (!res) { + ret = -ENODEV; + goto err_disable_extclk; + } rtc->rtc_base = devm_ioremap(dev, res->start, resource_size(res)); - if (!rtc->rtc_base) - return -ENOMEM; + if (!rtc->rtc_base) { + ret = -ENOMEM; + goto err_disable_extclk; + } rtc->rtc_dev = devm_rtc_allocate_device(dev); - if (IS_ERR(rtc->rtc_dev)) - return PTR_ERR(rtc->rtc_dev); + if (IS_ERR(rtc->rtc_dev)) { + ret = PTR_ERR(rtc->rtc_dev); + goto err_disable_extclk; + } rtc->rtc_dev->ops = &ftrtc010_rtc_ops; @@ -172,9 +180,15 @@ static int ftrtc010_rtc_probe(struct platform_device *pdev) ret = devm_request_irq(dev, rtc->rtc_irq, ftrtc010_rtc_interrupt, IRQF_SHARED, pdev->name, dev); if (unlikely(ret)) - return ret; + goto err_disable_extclk; return devm_rtc_register_device(rtc->rtc_dev); + +err_disable_extclk: + clk_disable_unprepare(rtc->extclk); +err_disable_pclk: + clk_disable_unprepare(rtc->pclk); + return ret; } static int ftrtc010_rtc_remove(struct platform_device *pdev) diff --git a/drivers/rtc/rtc-gamecube.c b/drivers/rtc/rtc-gamecube.c index 18ca3b38b2d0..c2717bb52b2b 100644 --- a/drivers/rtc/rtc-gamecube.c +++ b/drivers/rtc/rtc-gamecube.c @@ -267,6 +267,7 @@ static int gamecube_rtc_read_offset_from_sram(struct priv *d) ret = regmap_read(d->regmap, RTC_SRAM_BIAS, &d->rtc_bias); if (ret) { pr_err("failed to get the RTC bias\n"); + iounmap(hw_srnprot); return -1; } diff --git a/drivers/rtc/rtc-meson.c b/drivers/rtc/rtc-meson.c index 44bdc8b4a90d..db1d626edca5 100644 --- a/drivers/rtc/rtc-meson.c +++ b/drivers/rtc/rtc-meson.c @@ -399,7 +399,7 @@ static struct platform_driver meson_rtc_driver = { module_platform_driver(meson_rtc_driver); MODULE_DESCRIPTION("Amlogic Meson RTC Driver"); -MODULE_AUTHOR("Ben Dooks "); +MODULE_AUTHOR("Ben Dooks "); MODULE_AUTHOR("Martin Blumenstingl "); MODULE_LICENSE("GPL v2"); MODULE_ALIAS("platform:meson-rtc"); diff --git a/drivers/rtc/rtc-mt6397.c b/drivers/rtc/rtc-mt6397.c index 80dc479a6ff0..1d297af80f87 100644 --- a/drivers/rtc/rtc-mt6397.c +++ b/drivers/rtc/rtc-mt6397.c @@ -269,6 +269,8 @@ static int mtk_rtc_probe(struct platform_device *pdev) return -ENOMEM; res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!res) + return -EINVAL; rtc->addr_base = res->start; rtc->data = 
of_device_get_match_data(&pdev->dev);
diff --git a/drivers/rtc/rtc-mxc.c b/drivers/rtc/rtc-mxc.c
index 0f08f22df869..53d4e253e81f 100644
--- a/drivers/rtc/rtc-mxc.c
+++ b/drivers/rtc/rtc-mxc.c
@@ -311,7 +311,7 @@ static int mxc_rtc_probe(struct platform_device *pdev)
 	if (!pdata)
 		return -ENOMEM;
 
-	pdata->devtype = (enum imx_rtc_type)of_device_get_match_data(&pdev->dev);
+	pdata->devtype = (uintptr_t)of_device_get_match_data(&pdev->dev);
 
 	pdata->ioaddr = devm_platform_ioremap_resource(pdev, 0);
 	if (IS_ERR(pdata->ioaddr))
diff --git a/drivers/rtc/rtc-pcf85063.c b/drivers/rtc/rtc-pcf85063.c
index 9760824ec199..095891999da1 100644
--- a/drivers/rtc/rtc-pcf85063.c
+++ b/drivers/rtc/rtc-pcf85063.c
@@ -650,6 +650,7 @@ static int pcf85063_probe(struct i2c_client *client)
 }
 
 static const struct i2c_device_id pcf85063_ids[] = {
+	{ "pca85073a", PCF85063A },
 	{ "pcf85063", PCF85063 },
 	{ "pcf85063tp", PCF85063TP },
 	{ "pcf85063a", PCF85063A },
@@ -660,6 +661,7 @@ MODULE_DEVICE_TABLE(i2c, pcf85063_ids);
 
 #ifdef CONFIG_OF
 static const struct of_device_id pcf85063_of_match[] = {
+	{ .compatible = "nxp,pca85073a", .data = &pcf85063_cfg[PCF85063A] },
 	{ .compatible = "nxp,pcf85063", .data = &pcf85063_cfg[PCF85063] },
 	{ .compatible = "nxp,pcf85063tp", .data = &pcf85063_cfg[PCF85063TP] },
 	{ .compatible = "nxp,pcf85063a", .data = &pcf85063_cfg[PCF85063A] },
diff --git a/drivers/rtc/rtc-rx8025.c b/drivers/rtc/rtc-rx8025.c
index 5bfdd34a72ff..b32117ccd74b 100644
--- a/drivers/rtc/rtc-rx8025.c
+++ b/drivers/rtc/rtc-rx8025.c
@@ -436,7 +436,6 @@ static int rx8025_set_offset(struct device *dev, long offset)
 {
 	struct i2c_client *client = to_i2c_client(dev);
 	u8 digoff;
-	int err;
 
 	offset /= RX8025_ADJ_RESOLUTION;
 	if (offset > RX8025_ADJ_DATA_MAX)
@@ -449,11 +448,7 @@ static int rx8025_set_offset(struct device *dev, long offset)
 		offset += 128;
 	digoff = offset;
 
-	err = rx8025_write_reg(client, RX8025_REG_DIGOFF, digoff);
-	if (err)
-		return err;
-
-	return 0;
+	return rx8025_write_reg(client, RX8025_REG_DIGOFF, digoff);
 }
 
 static const struct rtc_class_ops rx8025_rtc_ops = {
diff --git a/drivers/rtc/rtc-rzn1.c b/drivers/rtc/rtc-rzn1.c
new file mode 100644
index 000000000000..ac788799c8e3
--- /dev/null
+++ b/drivers/rtc/rtc-rzn1.c
@@ -0,0 +1,418 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Renesas RZ/N1 Real Time Clock interface for Linux
+ *
+ * Copyright:
+ * - 2014 Renesas Electronics Europe Limited
+ * - 2022 Schneider Electric
+ *
+ * Authors:
+ * - Michel Pollet ,
+ * - Miquel Raynal
+ */
+
+#include <linux/bcd.h>
+#include <linux/init.h>
+#include <linux/iopoll.h>
+#include <linux/module.h>
+#include <linux/mod_devicetable.h>
+#include <linux/platform_device.h>
+#include <linux/pm_runtime.h>
+#include <linux/rtc.h>
+
+#define RZN1_RTC_CTL0			0x00
+#define RZN1_RTC_CTL0_SLSB_SUBU		0
+#define RZN1_RTC_CTL0_SLSB_SCMP		BIT(4)
+#define RZN1_RTC_CTL0_AMPM		BIT(5)
+#define RZN1_RTC_CTL0_CE		BIT(7)
+
+#define RZN1_RTC_CTL1			0x04
+#define RZN1_RTC_CTL1_ALME		BIT(4)
+
+#define RZN1_RTC_CTL2			0x08
+#define RZN1_RTC_CTL2_WAIT		BIT(0)
+#define RZN1_RTC_CTL2_WST		BIT(1)
+#define RZN1_RTC_CTL2_WUST		BIT(5)
+#define RZN1_RTC_CTL2_STOPPED		(RZN1_RTC_CTL2_WAIT | RZN1_RTC_CTL2_WST)
+
+#define RZN1_RTC_SEC			0x14
+#define RZN1_RTC_MIN			0x18
+#define RZN1_RTC_HOUR			0x1c
+#define RZN1_RTC_WEEK			0x20
+#define RZN1_RTC_DAY			0x24
+#define RZN1_RTC_MONTH			0x28
+#define RZN1_RTC_YEAR			0x2c
+
+#define RZN1_RTC_SUBU			0x38
+#define RZN1_RTC_SUBU_DEV		BIT(7)
+#define RZN1_RTC_SUBU_DECR		BIT(6)
+
+#define RZN1_RTC_ALM			0x40
+#define RZN1_RTC_ALH			0x44
+#define RZN1_RTC_ALW			0x48
+
+#define RZN1_RTC_SECC			0x4c
+#define RZN1_RTC_MINC			0x50
+#define RZN1_RTC_HOURC			0x54
+#define RZN1_RTC_WEEKC			0x58
+#define RZN1_RTC_DAYC			0x5c
+#define RZN1_RTC_MONTHC 0x60 +#define RZN1_RTC_YEARC 0x64 + +struct rzn1_rtc { + struct rtc_device *rtcdev; + void __iomem *base; +}; + +static void rzn1_rtc_get_time_snapshot(struct rzn1_rtc *rtc, struct rtc_time *tm) +{ + tm->tm_sec = readl(rtc->base + RZN1_RTC_SECC); + tm->tm_min = readl(rtc->base + RZN1_RTC_MINC); + tm->tm_hour = readl(rtc->base + RZN1_RTC_HOURC); + tm->tm_wday = readl(rtc->base + RZN1_RTC_WEEKC); + tm->tm_mday = readl(rtc->base + RZN1_RTC_DAYC); + tm->tm_mon = readl(rtc->base + RZN1_RTC_MONTHC); + tm->tm_year = readl(rtc->base + RZN1_RTC_YEARC); +} + +static unsigned int rzn1_rtc_tm_to_wday(struct rtc_time *tm) +{ + time64_t time; + unsigned int days; + u32 secs; + + time = rtc_tm_to_time64(tm); + days = div_s64_rem(time, 86400, &secs); + + /* day of the week, 1970-01-01 was a Thursday */ + return (days + 4) % 7; +} + +static int rzn1_rtc_read_time(struct device *dev, struct rtc_time *tm) +{ + struct rzn1_rtc *rtc = dev_get_drvdata(dev); + u32 val, secs; + + /* + * The RTC was not started or is stopped and thus does not carry the + * proper time/date. + */ + val = readl(rtc->base + RZN1_RTC_CTL2); + if (val & RZN1_RTC_CTL2_STOPPED) + return -EINVAL; + + rzn1_rtc_get_time_snapshot(rtc, tm); + secs = readl(rtc->base + RZN1_RTC_SECC); + if (tm->tm_sec != secs) + rzn1_rtc_get_time_snapshot(rtc, tm); + + tm->tm_sec = bcd2bin(tm->tm_sec); + tm->tm_min = bcd2bin(tm->tm_min); + tm->tm_hour = bcd2bin(tm->tm_hour); + tm->tm_wday = bcd2bin(tm->tm_wday); + tm->tm_mday = bcd2bin(tm->tm_mday); + tm->tm_mon = bcd2bin(tm->tm_mon); + tm->tm_year = bcd2bin(tm->tm_year); + + return 0; +} + +static int rzn1_rtc_set_time(struct device *dev, struct rtc_time *tm) +{ + struct rzn1_rtc *rtc = dev_get_drvdata(dev); + u32 val; + int ret; + + tm->tm_sec = bin2bcd(tm->tm_sec); + tm->tm_min = bin2bcd(tm->tm_min); + tm->tm_hour = bin2bcd(tm->tm_hour); + tm->tm_wday = bin2bcd(rzn1_rtc_tm_to_wday(tm)); + tm->tm_mday = bin2bcd(tm->tm_mday); + tm->tm_mon = bin2bcd(tm->tm_mon); + tm->tm_year = bin2bcd(tm->tm_year); + + val = readl(rtc->base + RZN1_RTC_CTL2); + if (!(val & RZN1_RTC_CTL2_STOPPED)) { + /* Hold the counter if it was counting up */ + writel(RZN1_RTC_CTL2_WAIT, rtc->base + RZN1_RTC_CTL2); + + /* Wait for the counter to stop: two 32k clock cycles */ + usleep_range(61, 100); + ret = readl_poll_timeout(rtc->base + RZN1_RTC_CTL2, val, + val & RZN1_RTC_CTL2_WST, 0, 100); + if (ret) + return ret; + } + + writel(tm->tm_sec, rtc->base + RZN1_RTC_SEC); + writel(tm->tm_min, rtc->base + RZN1_RTC_MIN); + writel(tm->tm_hour, rtc->base + RZN1_RTC_HOUR); + writel(tm->tm_wday, rtc->base + RZN1_RTC_WEEK); + writel(tm->tm_mday, rtc->base + RZN1_RTC_DAY); + writel(tm->tm_mon, rtc->base + RZN1_RTC_MONTH); + writel(tm->tm_year, rtc->base + RZN1_RTC_YEAR); + writel(0, rtc->base + RZN1_RTC_CTL2); + + return 0; +} + +static irqreturn_t rzn1_rtc_alarm_irq(int irq, void *dev_id) +{ + struct rzn1_rtc *rtc = dev_id; + + rtc_update_irq(rtc->rtcdev, 1, RTC_AF | RTC_IRQF); + + return IRQ_HANDLED; +} + +static int rzn1_rtc_alarm_irq_enable(struct device *dev, unsigned int enable) +{ + struct rzn1_rtc *rtc = dev_get_drvdata(dev); + u32 ctl1 = readl(rtc->base + RZN1_RTC_CTL1); + + if (enable) + ctl1 |= RZN1_RTC_CTL1_ALME; + else + ctl1 &= ~RZN1_RTC_CTL1_ALME; + + writel(ctl1, rtc->base + RZN1_RTC_CTL1); + + return 0; +} + +static int rzn1_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alrm) +{ + struct rzn1_rtc *rtc = dev_get_drvdata(dev); + struct rtc_time *tm = &alrm->time; + unsigned int min, hour, wday, 
delta_days;
+	time64_t alarm;
+	u32 ctl1;
+	int ret;
+
+	ret = rzn1_rtc_read_time(dev, tm);
+	if (ret)
+		return ret;
+
+	min = readl(rtc->base + RZN1_RTC_ALM);
+	hour = readl(rtc->base + RZN1_RTC_ALH);
+	wday = readl(rtc->base + RZN1_RTC_ALW);
+
+	tm->tm_sec = 0;
+	tm->tm_min = bcd2bin(min);
+	tm->tm_hour = bcd2bin(hour);
+	delta_days = ((fls(wday) - 1) - tm->tm_wday + 7) % 7;
+	tm->tm_wday = fls(wday) - 1;
+
+	if (delta_days) {
+		alarm = rtc_tm_to_time64(tm) + (delta_days * 86400);
+		rtc_time64_to_tm(alarm, tm);
+	}
+
+	ctl1 = readl(rtc->base + RZN1_RTC_CTL1);
+	alrm->enabled = !!(ctl1 & RZN1_RTC_CTL1_ALME);
+
+	return 0;
+}
+
+static int rzn1_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm)
+{
+	struct rzn1_rtc *rtc = dev_get_drvdata(dev);
+	struct rtc_time *tm = &alrm->time, tm_now;
+	unsigned long alarm, farthest;
+	unsigned int days_ahead, wday;
+	int ret;
+
+	ret = rzn1_rtc_read_time(dev, &tm_now);
+	if (ret)
+		return ret;
+
+	/* We cannot set alarms more than one week ahead */
+	farthest = rtc_tm_to_time64(&tm_now) + (7 * 86400);
+	alarm = rtc_tm_to_time64(tm);
+	if (time_after(alarm, farthest))
+		return -ERANGE;
+
+	/* Convert alarm day into week day */
+	days_ahead = tm->tm_mday - tm_now.tm_mday;
+	wday = (tm_now.tm_wday + days_ahead) % 7;
+
+	writel(bin2bcd(tm->tm_min), rtc->base + RZN1_RTC_ALM);
+	writel(bin2bcd(tm->tm_hour), rtc->base + RZN1_RTC_ALH);
+	writel(BIT(wday), rtc->base + RZN1_RTC_ALW);
+
+	rzn1_rtc_alarm_irq_enable(dev, alrm->enabled);
+
+	return 0;
+}
+
+static int rzn1_rtc_read_offset(struct device *dev, long *offset)
+{
+	struct rzn1_rtc *rtc = dev_get_drvdata(dev);
+	unsigned int ppb_per_step;
+	bool subtract;
+	u32 val;
+
+	val = readl(rtc->base + RZN1_RTC_SUBU);
+	ppb_per_step = val & RZN1_RTC_SUBU_DEV ? 1017 : 3051;
+	subtract = val & RZN1_RTC_SUBU_DECR;
+	val &= 0x3F;
+
+	if (!val)
+		*offset = 0;
+	else if (subtract)
+		*offset = -(((~val) & 0x3F) + 1) * ppb_per_step;
+	else
+		*offset = (val - 1) * ppb_per_step;
+
+	return 0;
+}
+
+static int rzn1_rtc_set_offset(struct device *dev, long offset)
+{
+	struct rzn1_rtc *rtc = dev_get_drvdata(dev);
+	int stepsh, stepsl, steps;
+	u32 subu = 0, ctl2;
+	int ret;
+
+	/*
+	 * Check which resolution mode (every 20 or 60s) can be used.
+	 * Between 2 and 124 clock pulses can be added or subtracted.
+	 *
+	 * In 20s mode, the minimum resolution is 2 / (32768 * 20) which is
+	 * close to 3051 ppb. In 60s mode, the resolution is closer to 1017.
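+	 * (2 / (32768 * 60) is roughly 1017 ppb, which is why the 60s mode
+	 * is tried first below: its finer step size loses less precision.)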
+ */ + stepsh = DIV_ROUND_CLOSEST(offset, 1017); + stepsl = DIV_ROUND_CLOSEST(offset, 3051); + + if (stepsh >= -0x3E && stepsh <= 0x3E) { + /* 1017 ppb per step */ + steps = stepsh; + subu |= RZN1_RTC_SUBU_DEV; + } else if (stepsl >= -0x3E && stepsl <= 0x3E) { + /* 3051 ppb per step */ + steps = stepsl; + } else { + return -ERANGE; + } + + if (!steps) + return 0; + + if (steps > 0) { + subu |= steps + 1; + } else { + subu |= RZN1_RTC_SUBU_DECR; + subu |= (~(-steps - 1)) & 0x3F; + } + + ret = readl_poll_timeout(rtc->base + RZN1_RTC_CTL2, ctl2, + !(ctl2 & RZN1_RTC_CTL2_WUST), 100, 2000000); + if (ret) + return ret; + + writel(subu, rtc->base + RZN1_RTC_SUBU); + + return 0; +} + +static const struct rtc_class_ops rzn1_rtc_ops = { + .read_time = rzn1_rtc_read_time, + .set_time = rzn1_rtc_set_time, + .read_alarm = rzn1_rtc_read_alarm, + .set_alarm = rzn1_rtc_set_alarm, + .alarm_irq_enable = rzn1_rtc_alarm_irq_enable, + .read_offset = rzn1_rtc_read_offset, + .set_offset = rzn1_rtc_set_offset, +}; + +static int rzn1_rtc_probe(struct platform_device *pdev) +{ + struct rzn1_rtc *rtc; + int alarm_irq; + int ret; + + rtc = devm_kzalloc(&pdev->dev, sizeof(*rtc), GFP_KERNEL); + if (!rtc) + return -ENOMEM; + + platform_set_drvdata(pdev, rtc); + + rtc->base = devm_platform_ioremap_resource(pdev, 0); + if (IS_ERR(rtc->base)) + return dev_err_probe(&pdev->dev, PTR_ERR(rtc->base), "Missing reg\n"); + + alarm_irq = platform_get_irq(pdev, 0); + if (alarm_irq < 0) + return alarm_irq; + + rtc->rtcdev = devm_rtc_allocate_device(&pdev->dev); + if (IS_ERR(rtc->rtcdev)) + return PTR_ERR(rtc->rtcdev); + + rtc->rtcdev->range_min = RTC_TIMESTAMP_BEGIN_2000; + rtc->rtcdev->range_max = RTC_TIMESTAMP_END_2099; + rtc->rtcdev->ops = &rzn1_rtc_ops; + set_bit(RTC_FEATURE_ALARM_RES_MINUTE, rtc->rtcdev->features); + clear_bit(RTC_FEATURE_UPDATE_INTERRUPT, rtc->rtcdev->features); + + devm_pm_runtime_enable(&pdev->dev); + ret = pm_runtime_resume_and_get(&pdev->dev); + if (ret < 0) + return ret; + + /* + * Ensure the clock counter is enabled. + * Set 24-hour mode and possible oscillator offset compensation in SUBU mode. 
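+	 * (CE starts the counter, AMPM selects the 24-hour counter, and
+	 * leaving SLSB clear selects SUBU-style correction rather than SCMP.)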
+	 */
+	writel(RZN1_RTC_CTL0_CE | RZN1_RTC_CTL0_AMPM | RZN1_RTC_CTL0_SLSB_SUBU,
+	       rtc->base + RZN1_RTC_CTL0);
+
+	/* Disable all interrupts */
+	writel(0, rtc->base + RZN1_RTC_CTL1);
+
+	ret = devm_request_irq(&pdev->dev, alarm_irq, rzn1_rtc_alarm_irq, 0,
+			       dev_name(&pdev->dev), rtc);
+	if (ret) {
+		dev_err(&pdev->dev, "RTC timer interrupt not available\n");
+		goto dis_runtime_pm;
+	}
+
+	ret = devm_rtc_register_device(rtc->rtcdev);
+	if (ret)
+		goto dis_runtime_pm;
+
+	return 0;
+
+dis_runtime_pm:
+	pm_runtime_put(&pdev->dev);
+
+	return ret;
+}
+
+static int rzn1_rtc_remove(struct platform_device *pdev)
+{
+	pm_runtime_put(&pdev->dev);
+
+	return 0;
+}
+
+static const struct of_device_id rzn1_rtc_of_match[] = {
+	{ .compatible = "renesas,rzn1-rtc" },
+	{},
+};
+MODULE_DEVICE_TABLE(of, rzn1_rtc_of_match);
+
+static struct platform_driver rzn1_rtc_driver = {
+	.probe = rzn1_rtc_probe,
+	.remove = rzn1_rtc_remove,
+	.driver = {
+		.name = "rzn1-rtc",
+		.of_match_table = rzn1_rtc_of_match,
+	},
+};
+module_platform_driver(rzn1_rtc_driver);
+
+MODULE_AUTHOR("Michel Pollet ");
+MODULE_AUTHOR("Miquel Raynal ");
+MODULE_DESCRIPTION("RZ/N1 RTC driver");
+MODULE_LICENSE("GPL");
diff --git a/drivers/rtc/rtc-sun6i.c b/drivers/rtc/rtc-sun6i.c
--- a/drivers/rtc/rtc-sun6i.c
+++ b/drivers/rtc/rtc-sun6i.c
+static int sun6i_rtc_nvmem_read(void *priv, unsigned int offset, void *_val, size_t bytes)
+{
+	struct sun6i_rtc_dev *chip = priv;
+	u32 *val = _val;
+	int i;
+
+	for (i = 0; i < bytes / 4; ++i)
+		val[i] = readl(chip->base + SUN6I_GP_DATA + offset + 4 * i);
+
+	return 0;
+}
+
+static int sun6i_rtc_nvmem_write(void *priv, unsigned int offset, void *_val, size_t bytes)
+{
+	struct sun6i_rtc_dev *chip = priv;
+	u32 *val = _val;
+	int i;
+
+	for (i = 0; i < bytes / 4; ++i)
+		writel(val[i], chip->base + SUN6I_GP_DATA + offset + 4 * i);
+
+	return 0;
+}
+
+static struct nvmem_config sun6i_rtc_nvmem_cfg = {
+	.type		= NVMEM_TYPE_BATTERY_BACKED,
+	.reg_read	= sun6i_rtc_nvmem_read,
+	.reg_write	= sun6i_rtc_nvmem_write,
+	.size		= SUN6I_GP_DATA_SIZE,
+	.word_size	= 4,
+	.stride		= 4,
+};
+
 #ifdef CONFIG_PM_SLEEP
 /* Enable IRQ wake on suspend, to wake up from RTC. */
 static int sun6i_rtc_suspend(struct device *dev)
@@ -812,6 +849,11 @@ static int sun6i_rtc_probe(struct platform_device *pdev)
 	if (ret)
 		return ret;
 
+	sun6i_rtc_nvmem_cfg.priv = chip;
+	ret = devm_rtc_nvmem_register(chip->rtc, &sun6i_rtc_nvmem_cfg);
+	if (ret)
+		return ret;
+
 	dev_info(&pdev->dev, "RTC enabled\n");
 
 	return 0;
diff --git a/drivers/s390/cio/vfio_ccw_cp.c b/drivers/s390/cio/vfio_ccw_cp.c
index 8d1b2771c1aa..0c2be9421ab7 100644
--- a/drivers/s390/cio/vfio_ccw_cp.c
+++ b/drivers/s390/cio/vfio_ccw_cp.c
@@ -16,6 +16,7 @@
 #include
 
 #include "vfio_ccw_cp.h"
+#include "vfio_ccw_private.h"
 
 struct pfn_array {
 	/* Starting guest physical I/O address. */
@@ -98,17 +99,17 @@ static int pfn_array_alloc(struct pfn_array *pa, u64 iova, unsigned int len)
  * If the pin request partially succeeds, or fails completely,
  * all pages are left unpinned and a negative error value is returned.
  */
-static int pfn_array_pin(struct pfn_array *pa, struct device *mdev)
+static int pfn_array_pin(struct pfn_array *pa, struct vfio_device *vdev)
 {
 	int ret = 0;
 
-	ret = vfio_pin_pages(mdev, pa->pa_iova_pfn, pa->pa_nr,
+	ret = vfio_pin_pages(vdev, pa->pa_iova_pfn, pa->pa_nr,
 			     IOMMU_READ | IOMMU_WRITE, pa->pa_pfn);
 	if (ret < 0) {
 		goto err_out;
 	} else if (ret > 0 && ret != pa->pa_nr) {
-		vfio_unpin_pages(mdev, pa->pa_iova_pfn, ret);
+		vfio_unpin_pages(vdev, pa->pa_iova_pfn, ret);
 		ret = -EINVAL;
 		goto err_out;
 	}
@@ -122,11 +123,11 @@ err_out:
 }
 
 /* Unpin the pages before releasing the memory.
*/ -static void pfn_array_unpin_free(struct pfn_array *pa, struct device *mdev) +static void pfn_array_unpin_free(struct pfn_array *pa, struct vfio_device *vdev) { /* Only unpin if any pages were pinned to begin with */ if (pa->pa_nr) - vfio_unpin_pages(mdev, pa->pa_iova_pfn, pa->pa_nr); + vfio_unpin_pages(vdev, pa->pa_iova_pfn, pa->pa_nr); pa->pa_nr = 0; kfree(pa->pa_iova_pfn); } @@ -190,8 +191,7 @@ static void convert_ccw0_to_ccw1(struct ccw1 *source, unsigned long len) * Within the domain (@mdev), copy @n bytes from a guest physical * address (@iova) to a host physical address (@to). */ -static long copy_from_iova(struct device *mdev, - void *to, u64 iova, +static long copy_from_iova(struct vfio_device *vdev, void *to, u64 iova, unsigned long n) { struct pfn_array pa = {0}; @@ -203,9 +203,9 @@ static long copy_from_iova(struct device *mdev, if (ret < 0) return ret; - ret = pfn_array_pin(&pa, mdev); + ret = pfn_array_pin(&pa, vdev); if (ret < 0) { - pfn_array_unpin_free(&pa, mdev); + pfn_array_unpin_free(&pa, vdev); return ret; } @@ -226,7 +226,7 @@ static long copy_from_iova(struct device *mdev, break; } - pfn_array_unpin_free(&pa, mdev); + pfn_array_unpin_free(&pa, vdev); return l; } @@ -423,11 +423,13 @@ static int ccwchain_loop_tic(struct ccwchain *chain, static int ccwchain_handle_ccw(u32 cda, struct channel_program *cp) { + struct vfio_device *vdev = + &container_of(cp, struct vfio_ccw_private, cp)->vdev; struct ccwchain *chain; int len, ret; /* Copy 2K (the most we support today) of possible CCWs */ - len = copy_from_iova(cp->mdev, cp->guest_cp, cda, + len = copy_from_iova(vdev, cp->guest_cp, cda, CCWCHAIN_LEN_MAX * sizeof(struct ccw1)); if (len) return len; @@ -508,6 +510,8 @@ static int ccwchain_fetch_direct(struct ccwchain *chain, int idx, struct channel_program *cp) { + struct vfio_device *vdev = + &container_of(cp, struct vfio_ccw_private, cp)->vdev; struct ccw1 *ccw; struct pfn_array *pa; u64 iova; @@ -526,7 +530,7 @@ static int ccwchain_fetch_direct(struct ccwchain *chain, if (ccw_is_idal(ccw)) { /* Read first IDAW to see if it's 4K-aligned or not. */ /* All subsequent IDAws will be 4K-aligned. */ - ret = copy_from_iova(cp->mdev, &iova, ccw->cda, sizeof(iova)); + ret = copy_from_iova(vdev, &iova, ccw->cda, sizeof(iova)); if (ret) return ret; } else { @@ -555,7 +559,7 @@ static int ccwchain_fetch_direct(struct ccwchain *chain, if (ccw_is_idal(ccw)) { /* Copy guest IDAL into host IDAL */ - ret = copy_from_iova(cp->mdev, idaws, ccw->cda, idal_len); + ret = copy_from_iova(vdev, idaws, ccw->cda, idal_len); if (ret) goto out_unpin; @@ -574,7 +578,7 @@ static int ccwchain_fetch_direct(struct ccwchain *chain, } if (ccw_does_data_transfer(ccw)) { - ret = pfn_array_pin(pa, cp->mdev); + ret = pfn_array_pin(pa, vdev); if (ret < 0) goto out_unpin; } else { @@ -590,7 +594,7 @@ static int ccwchain_fetch_direct(struct ccwchain *chain, return 0; out_unpin: - pfn_array_unpin_free(pa, cp->mdev); + pfn_array_unpin_free(pa, vdev); out_free_idaws: kfree(idaws); out_init: @@ -632,8 +636,10 @@ static int ccwchain_fetch_one(struct ccwchain *chain, * Returns: * %0 on success and a negative error value on failure. 
*/ -int cp_init(struct channel_program *cp, struct device *mdev, union orb *orb) +int cp_init(struct channel_program *cp, union orb *orb) { + struct vfio_device *vdev = + &container_of(cp, struct vfio_ccw_private, cp)->vdev; /* custom ratelimit used to avoid flood during guest IPL */ static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 1); int ret; @@ -650,11 +656,12 @@ int cp_init(struct channel_program *cp, struct device *mdev, union orb *orb) * the problem if something does break. */ if (!orb->cmd.pfch && __ratelimit(&ratelimit_state)) - dev_warn(mdev, "Prefetching channel program even though prefetch not specified in ORB"); + dev_warn( + vdev->dev, + "Prefetching channel program even though prefetch not specified in ORB"); INIT_LIST_HEAD(&cp->ccwchain_list); memcpy(&cp->orb, orb, sizeof(*orb)); - cp->mdev = mdev; /* Build a ccwchain for the first CCW segment */ ret = ccwchain_handle_ccw(orb->cmd.cpa, cp); @@ -682,6 +689,8 @@ int cp_init(struct channel_program *cp, struct device *mdev, union orb *orb) */ void cp_free(struct channel_program *cp) { + struct vfio_device *vdev = + &container_of(cp, struct vfio_ccw_private, cp)->vdev; struct ccwchain *chain, *temp; int i; @@ -691,7 +700,7 @@ void cp_free(struct channel_program *cp) cp->initialized = false; list_for_each_entry_safe(chain, temp, &cp->ccwchain_list, next) { for (i = 0; i < chain->ch_len; i++) { - pfn_array_unpin_free(chain->ch_pa + i, cp->mdev); + pfn_array_unpin_free(chain->ch_pa + i, vdev); ccwchain_cda_free(chain, i); } ccwchain_free(chain); diff --git a/drivers/s390/cio/vfio_ccw_cp.h b/drivers/s390/cio/vfio_ccw_cp.h index ba31240ce965..e4c436199b4c 100644 --- a/drivers/s390/cio/vfio_ccw_cp.h +++ b/drivers/s390/cio/vfio_ccw_cp.h @@ -37,13 +37,11 @@ struct channel_program { struct list_head ccwchain_list; union orb orb; - struct device *mdev; bool initialized; struct ccw1 *guest_cp; }; -extern int cp_init(struct channel_program *cp, struct device *mdev, - union orb *orb); +extern int cp_init(struct channel_program *cp, union orb *orb); extern void cp_free(struct channel_program *cp); extern int cp_prefetch(struct channel_program *cp); extern union orb *cp_get_orb(struct channel_program *cp, u32 intparm, u8 lpm); diff --git a/drivers/s390/cio/vfio_ccw_fsm.c b/drivers/s390/cio/vfio_ccw_fsm.c index e435a9cd92da..8483a266051c 100644 --- a/drivers/s390/cio/vfio_ccw_fsm.c +++ b/drivers/s390/cio/vfio_ccw_fsm.c @@ -262,8 +262,7 @@ static void fsm_io_request(struct vfio_ccw_private *private, errstr = "transport mode"; goto err_out; } - io_region->ret_code = cp_init(&private->cp, mdev_dev(mdev), - orb); + io_region->ret_code = cp_init(&private->cp, orb); if (io_region->ret_code) { VFIO_CCW_MSG_EVENT(2, "%pUl (%x.%x.%04x): cp_init=%d\n", diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c index c4d60cdbf247..b49e2e9db2dc 100644 --- a/drivers/s390/cio/vfio_ccw_ops.c +++ b/drivers/s390/cio/vfio_ccw_ops.c @@ -183,7 +183,7 @@ static int vfio_ccw_mdev_open_device(struct vfio_device *vdev) private->nb.notifier_call = vfio_ccw_mdev_notifier; - ret = vfio_register_notifier(vdev->dev, VFIO_IOMMU_NOTIFY, + ret = vfio_register_notifier(vdev, VFIO_IOMMU_NOTIFY, &events, &private->nb); if (ret) return ret; @@ -204,8 +204,7 @@ static int vfio_ccw_mdev_open_device(struct vfio_device *vdev) out_unregister: vfio_ccw_unregister_dev_regions(private); - vfio_unregister_notifier(vdev->dev, VFIO_IOMMU_NOTIFY, - &private->nb); + vfio_unregister_notifier(vdev, VFIO_IOMMU_NOTIFY, &private->nb); return ret; } @@ -223,7 +222,7 @@ 
static void vfio_ccw_mdev_close_device(struct vfio_device *vdev)
 	cp_free(&private->cp);
 	vfio_ccw_unregister_dev_regions(private);
-	vfio_unregister_notifier(vdev->dev, VFIO_IOMMU_NOTIFY, &private->nb);
+	vfio_unregister_notifier(vdev, VFIO_IOMMU_NOTIFY, &private->nb);
 }
 
 static ssize_t vfio_ccw_mdev_read_io_region(struct vfio_ccw_private *private,
diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c
index ee0a3bf8f476..a7d2a95796d3 100644
--- a/drivers/s390/crypto/vfio_ap_ops.c
+++ b/drivers/s390/crypto/vfio_ap_ops.c
@@ -124,8 +124,7 @@ static void vfio_ap_free_aqic_resources(struct vfio_ap_queue *q)
 		q->saved_isc = VFIO_AP_ISC_INVALID;
 	}
 	if (q->saved_pfn && !WARN_ON(!q->matrix_mdev)) {
-		vfio_unpin_pages(mdev_dev(q->matrix_mdev->mdev),
-				 &q->saved_pfn, 1);
+		vfio_unpin_pages(&q->matrix_mdev->vdev, &q->saved_pfn, 1);
 		q->saved_pfn = 0;
 	}
 }
@@ -258,7 +257,7 @@ static struct ap_queue_status vfio_ap_irq_enable(struct vfio_ap_queue *q,
 		return status;
 	}
 
-	ret = vfio_pin_pages(mdev_dev(q->matrix_mdev->mdev), &g_pfn, 1,
+	ret = vfio_pin_pages(&q->matrix_mdev->vdev, &g_pfn, 1,
 			     IOMMU_READ | IOMMU_WRITE, &h_pfn);
 	switch (ret) {
 	case 1:
@@ -301,7 +300,7 @@ static struct ap_queue_status vfio_ap_irq_enable(struct vfio_ap_queue *q,
 		break;
 	case AP_RESPONSE_OTHERWISE_CHANGED:
 		/* We could not modify IRQ settings: clear new configuration */
-		vfio_unpin_pages(mdev_dev(q->matrix_mdev->mdev), &g_pfn, 1);
+		vfio_unpin_pages(&q->matrix_mdev->vdev, &g_pfn, 1);
 		kvm_s390_gisc_unregister(kvm, isc);
 		break;
 	default:
@@ -1250,7 +1249,7 @@ static int vfio_ap_mdev_iommu_notifier(struct notifier_block *nb,
 		struct vfio_iommu_type1_dma_unmap *unmap = data;
 		unsigned long g_pfn = unmap->iova >> PAGE_SHIFT;
 
-		vfio_unpin_pages(mdev_dev(matrix_mdev->mdev), &g_pfn, 1);
+		vfio_unpin_pages(&matrix_mdev->vdev, &g_pfn, 1);
 		return NOTIFY_OK;
 	}
 
@@ -1285,25 +1284,6 @@ static void vfio_ap_mdev_unset_kvm(struct ap_matrix_mdev *matrix_mdev)
 	}
 }
 
-static int vfio_ap_mdev_group_notifier(struct notifier_block *nb,
-				       unsigned long action, void *data)
-{
-	int notify_rc = NOTIFY_OK;
-	struct ap_matrix_mdev *matrix_mdev;
-
-	if (action != VFIO_GROUP_NOTIFY_SET_KVM)
-		return NOTIFY_OK;
-
-	matrix_mdev = container_of(nb, struct ap_matrix_mdev, group_notifier);
-
-	if (!data)
-		vfio_ap_mdev_unset_kvm(matrix_mdev);
-	else if (vfio_ap_mdev_set_kvm(matrix_mdev, data))
-		notify_rc = NOTIFY_DONE;
-
-	return notify_rc;
-}
-
 static struct vfio_ap_queue *vfio_ap_find_queue(int apqn)
 {
 	struct device *dev;
@@ -1403,25 +1383,23 @@ static int vfio_ap_mdev_open_device(struct vfio_device *vdev)
 	unsigned long events;
 	int ret;
 
-	matrix_mdev->group_notifier.notifier_call = vfio_ap_mdev_group_notifier;
-	events = VFIO_GROUP_NOTIFY_SET_KVM;
+	if (!vdev->kvm)
+		return -EINVAL;
 
-	ret = vfio_register_notifier(vdev->dev, VFIO_GROUP_NOTIFY,
-				     &events, &matrix_mdev->group_notifier);
+	ret = vfio_ap_mdev_set_kvm(matrix_mdev, vdev->kvm);
 	if (ret)
 		return ret;
 
 	matrix_mdev->iommu_notifier.notifier_call = vfio_ap_mdev_iommu_notifier;
 	events = VFIO_IOMMU_NOTIFY_DMA_UNMAP;
-	ret = vfio_register_notifier(vdev->dev, VFIO_IOMMU_NOTIFY,
-				     &events, &matrix_mdev->iommu_notifier);
+	ret = vfio_register_notifier(vdev, VFIO_IOMMU_NOTIFY, &events,
+				     &matrix_mdev->iommu_notifier);
 	if (ret)
-		goto out_unregister_group;
+		goto err_kvm;
 	return 0;
 
-out_unregister_group:
-	vfio_unregister_notifier(vdev->dev, VFIO_GROUP_NOTIFY,
-				 &matrix_mdev->group_notifier);
+err_kvm:
+	vfio_ap_mdev_unset_kvm(matrix_mdev);
 	return ret;
 }
 
@@ -1430,10 +1408,8 @@ static void
vfio_ap_mdev_close_device(struct vfio_device *vdev)
 	struct ap_matrix_mdev *matrix_mdev =
 		container_of(vdev, struct ap_matrix_mdev, vdev);
 
-	vfio_unregister_notifier(vdev->dev, VFIO_IOMMU_NOTIFY,
+	vfio_unregister_notifier(vdev, VFIO_IOMMU_NOTIFY,
 				 &matrix_mdev->iommu_notifier);
-	vfio_unregister_notifier(vdev->dev, VFIO_GROUP_NOTIFY,
-				 &matrix_mdev->group_notifier);
 	vfio_ap_mdev_unset_kvm(matrix_mdev);
 }
 
diff --git a/drivers/s390/crypto/vfio_ap_private.h b/drivers/s390/crypto/vfio_ap_private.h
index 648fcaf8104a..a26efd804d0d 100644
--- a/drivers/s390/crypto/vfio_ap_private.h
+++ b/drivers/s390/crypto/vfio_ap_private.h
@@ -81,8 +81,6 @@ struct ap_matrix {
 * @node: allows the ap_matrix_mdev struct to be added to a list
 * @matrix: the adapters, usage domains and control domains assigned to the
 *	    mediated matrix device.
- * @group_notifier: notifier block used for specifying callback function for
- *		    handling the VFIO_GROUP_NOTIFY_SET_KVM event
 * @iommu_notifier: notifier block used for specifying callback function for
 *		    handling the VFIO_IOMMU_NOTIFY_DMA_UNMAP event
 * @kvm: the struct holding guest's state
@@ -94,7 +92,6 @@ struct ap_matrix_mdev {
 	struct vfio_device vdev;
 	struct list_head node;
 	struct ap_matrix matrix;
-	struct notifier_block group_notifier;
 	struct notifier_block iommu_notifier;
 	struct kvm *kvm;
 	crypto_hook pqap_hook;
diff --git a/drivers/spi/spi-fsi.c b/drivers/spi/spi-fsi.c
index d403a7a3021d..72ab066ce552 100644
--- a/drivers/spi/spi-fsi.c
+++ b/drivers/spi/spi-fsi.c
@@ -319,12 +319,12 @@ static int fsi_spi_transfer_data(struct fsi_spi *ctx,
 
 			end = jiffies + msecs_to_jiffies(SPI_FSI_STATUS_TIMEOUT_MS);
 			do {
+				if (time_after(jiffies, end))
+					return -ETIMEDOUT;
+
 				rc = fsi_spi_status(ctx, &status, "TX");
 				if (rc)
 					return rc;
-
-				if (time_after(jiffies, end))
-					return -ETIMEDOUT;
 			} while (status & SPI_FSI_STATUS_TDR_FULL);
 
 			sent += nb;
@@ -337,12 +337,12 @@ static int fsi_spi_transfer_data(struct fsi_spi *ctx,
 		while (transfer->len > recv) {
 			end = jiffies + msecs_to_jiffies(SPI_FSI_STATUS_TIMEOUT_MS);
 			do {
+				if (time_after(jiffies, end))
+					return -ETIMEDOUT;
+
 				rc = fsi_spi_status(ctx, &status, "RX");
 				if (rc)
 					return rc;
-
-				if (time_after(jiffies, end))
-					return -ETIMEDOUT;
 			} while (!(status & SPI_FSI_STATUS_RDR_FULL));
 
 			rc = fsi_spi_read_reg(ctx, SPI_FSI_DATA_RX, &in);
diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c
index fe252a8075a7..b9e2c7e7c580 100644
--- a/drivers/spi/spi.c
+++ b/drivers/spi/spi.c
@@ -1672,7 +1672,8 @@ static void __spi_pump_messages(struct spi_controller *ctlr, bool in_kthread)
 	ret = ctlr->transfer_one_message(ctlr, msg);
 	if (ret) {
 		dev_err(&ctlr->dev,
-			"failed to transfer one message from queue\n");
+			"failed to transfer one message from queue: %d\n",
+			ret);
 		goto out;
 	}
 
diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
index 767b5d47631a..4def43f5f7b6 100644
--- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
+++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
@@ -337,6 +337,14 @@ static int vf_qm_cache_wb(struct hisi_qm *qm)
 	return 0;
 }
 
+static struct hisi_acc_vf_core_device *hisi_acc_drvdata(struct pci_dev *pdev)
+{
+	struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev);
+
+	return container_of(core_device, struct hisi_acc_vf_core_device,
+			    core_device);
+}
+
 static void vf_qm_fun_reset(struct hisi_acc_vf_core_device *hisi_acc_vdev,
 			    struct hisi_qm *qm)
 {
@@ -962,7 +970,7 @@ hisi_acc_vfio_pci_get_device_state(struct vfio_device *vdev,
 
 static void
hisi_acc_vf_pci_aer_reset_done(struct pci_dev *pdev)
 {
-	struct hisi_acc_vf_core_device *hisi_acc_vdev = dev_get_drvdata(&pdev->dev);
+	struct hisi_acc_vf_core_device *hisi_acc_vdev = hisi_acc_drvdata(pdev);
 
 	if (hisi_acc_vdev->core_device.vdev.migration_flags !=
 		VFIO_MIGRATION_STOP_COPY)
@@ -1274,11 +1282,10 @@ static int hisi_acc_vfio_pci_probe(struct pci_dev *pdev, const struct pci_device
 					  &hisi_acc_vfio_pci_ops);
 	}
 
+	dev_set_drvdata(&pdev->dev, &hisi_acc_vdev->core_device);
 	ret = vfio_pci_core_register_device(&hisi_acc_vdev->core_device);
 	if (ret)
 		goto out_free;
-
-	dev_set_drvdata(&pdev->dev, hisi_acc_vdev);
 	return 0;
 
 out_free:
@@ -1289,7 +1296,7 @@ out_free:
 
 static void hisi_acc_vfio_pci_remove(struct pci_dev *pdev)
 {
-	struct hisi_acc_vf_core_device *hisi_acc_vdev = dev_get_drvdata(&pdev->dev);
+	struct hisi_acc_vf_core_device *hisi_acc_vdev = hisi_acc_drvdata(pdev);
 
 	vfio_pci_core_unregister_device(&hisi_acc_vdev->core_device);
 	vfio_pci_core_uninit_device(&hisi_acc_vdev->core_device);
@@ -1316,6 +1323,7 @@ static struct pci_driver hisi_acc_vfio_pci_driver = {
 	.probe = hisi_acc_vfio_pci_probe,
 	.remove = hisi_acc_vfio_pci_remove,
 	.err_handler = &hisi_acc_vf_err_handlers,
+	.driver_managed_dma = true,
 };
 
 module_pci_driver(hisi_acc_vfio_pci_driver);
diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c
index 5c9f9218cc1d..9b9f33ca270a 100644
--- a/drivers/vfio/pci/mlx5/cmd.c
+++ b/drivers/vfio/pci/mlx5/cmd.c
@@ -5,89 +5,157 @@
 
 #include "cmd.h"
 
-int mlx5vf_cmd_suspend_vhca(struct pci_dev *pdev, u16 vhca_id, u16 op_mod)
+static int mlx5vf_cmd_get_vhca_id(struct mlx5_core_dev *mdev, u16 function_id,
+				  u16 *vhca_id);
+
+int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod)
 {
-	struct mlx5_core_dev *mdev = mlx5_vf_get_core_dev(pdev);
 	u32 out[MLX5_ST_SZ_DW(suspend_vhca_out)] = {};
 	u32 in[MLX5_ST_SZ_DW(suspend_vhca_in)] = {};
-	int ret;
 
-	if (!mdev)
+	lockdep_assert_held(&mvdev->state_mutex);
+	if (mvdev->mdev_detach)
 		return -ENOTCONN;
 
 	MLX5_SET(suspend_vhca_in, in, opcode, MLX5_CMD_OP_SUSPEND_VHCA);
-	MLX5_SET(suspend_vhca_in, in, vhca_id, vhca_id);
+	MLX5_SET(suspend_vhca_in, in, vhca_id, mvdev->vhca_id);
 	MLX5_SET(suspend_vhca_in, in, op_mod, op_mod);
 
-	ret = mlx5_cmd_exec_inout(mdev, suspend_vhca, in, out);
-	mlx5_vf_put_core_dev(mdev);
-	return ret;
+	return mlx5_cmd_exec_inout(mvdev->mdev, suspend_vhca, in, out);
 }
 
-int mlx5vf_cmd_resume_vhca(struct pci_dev *pdev, u16 vhca_id, u16 op_mod)
+int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod)
 {
-	struct mlx5_core_dev *mdev = mlx5_vf_get_core_dev(pdev);
 	u32 out[MLX5_ST_SZ_DW(resume_vhca_out)] = {};
 	u32 in[MLX5_ST_SZ_DW(resume_vhca_in)] = {};
-	int ret;
 
-	if (!mdev)
+	lockdep_assert_held(&mvdev->state_mutex);
+	if (mvdev->mdev_detach)
 		return -ENOTCONN;
 
 	MLX5_SET(resume_vhca_in, in, opcode, MLX5_CMD_OP_RESUME_VHCA);
-	MLX5_SET(resume_vhca_in, in, vhca_id, vhca_id);
+	MLX5_SET(resume_vhca_in, in, vhca_id, mvdev->vhca_id);
 	MLX5_SET(resume_vhca_in, in, op_mod, op_mod);
 
-	ret = mlx5_cmd_exec_inout(mdev, resume_vhca, in, out);
-	mlx5_vf_put_core_dev(mdev);
-	return ret;
+	return mlx5_cmd_exec_inout(mvdev->mdev, resume_vhca, in, out);
 }
 
-int mlx5vf_cmd_query_vhca_migration_state(struct pci_dev *pdev, u16 vhca_id,
+int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev,
 					  size_t *state_size)
 {
-	struct mlx5_core_dev *mdev = mlx5_vf_get_core_dev(pdev);
 	u32 out[MLX5_ST_SZ_DW(query_vhca_migration_state_out)] = {};
 	u32 in[MLX5_ST_SZ_DW(query_vhca_migration_state_in)] =
{}; int ret; - if (!mdev) + lockdep_assert_held(&mvdev->state_mutex); + if (mvdev->mdev_detach) return -ENOTCONN; MLX5_SET(query_vhca_migration_state_in, in, opcode, MLX5_CMD_OP_QUERY_VHCA_MIGRATION_STATE); - MLX5_SET(query_vhca_migration_state_in, in, vhca_id, vhca_id); + MLX5_SET(query_vhca_migration_state_in, in, vhca_id, mvdev->vhca_id); MLX5_SET(query_vhca_migration_state_in, in, op_mod, 0); - ret = mlx5_cmd_exec_inout(mdev, query_vhca_migration_state, in, out); + ret = mlx5_cmd_exec_inout(mvdev->mdev, query_vhca_migration_state, in, + out); if (ret) - goto end; + return ret; *state_size = MLX5_GET(query_vhca_migration_state_out, out, required_umem_size); - -end: - mlx5_vf_put_core_dev(mdev); - return ret; + return 0; } -int mlx5vf_cmd_get_vhca_id(struct pci_dev *pdev, u16 function_id, u16 *vhca_id) +static int mlx5fv_vf_event(struct notifier_block *nb, + unsigned long event, void *data) +{ + struct mlx5vf_pci_core_device *mvdev = + container_of(nb, struct mlx5vf_pci_core_device, nb); + + mutex_lock(&mvdev->state_mutex); + switch (event) { + case MLX5_PF_NOTIFY_ENABLE_VF: + mvdev->mdev_detach = false; + break; + case MLX5_PF_NOTIFY_DISABLE_VF: + mlx5vf_disable_fds(mvdev); + mvdev->mdev_detach = true; + break; + default: + break; + } + mlx5vf_state_mutex_unlock(mvdev); + return 0; +} + +void mlx5vf_cmd_remove_migratable(struct mlx5vf_pci_core_device *mvdev) +{ + if (!mvdev->migrate_cap) + return; + + mlx5_sriov_blocking_notifier_unregister(mvdev->mdev, mvdev->vf_id, + &mvdev->nb); + destroy_workqueue(mvdev->cb_wq); +} + +void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev) +{ + struct pci_dev *pdev = mvdev->core_device.pdev; + int ret; + + if (!pdev->is_virtfn) + return; + + mvdev->mdev = mlx5_vf_get_core_dev(pdev); + if (!mvdev->mdev) + return; + + if (!MLX5_CAP_GEN(mvdev->mdev, migration)) + goto end; + + mvdev->vf_id = pci_iov_vf_id(pdev); + if (mvdev->vf_id < 0) + goto end; + + if (mlx5vf_cmd_get_vhca_id(mvdev->mdev, mvdev->vf_id + 1, + &mvdev->vhca_id)) + goto end; + + mvdev->cb_wq = alloc_ordered_workqueue("mlx5vf_wq", 0); + if (!mvdev->cb_wq) + goto end; + + mutex_init(&mvdev->state_mutex); + spin_lock_init(&mvdev->reset_lock); + mvdev->nb.notifier_call = mlx5fv_vf_event; + ret = mlx5_sriov_blocking_notifier_register(mvdev->mdev, mvdev->vf_id, + &mvdev->nb); + if (ret) { + destroy_workqueue(mvdev->cb_wq); + goto end; + } + + mvdev->migrate_cap = 1; + mvdev->core_device.vdev.migration_flags = + VFIO_MIGRATION_STOP_COPY | + VFIO_MIGRATION_P2P; + +end: + mlx5_vf_put_core_dev(mvdev->mdev); +} + +static int mlx5vf_cmd_get_vhca_id(struct mlx5_core_dev *mdev, u16 function_id, + u16 *vhca_id) { - struct mlx5_core_dev *mdev = mlx5_vf_get_core_dev(pdev); u32 in[MLX5_ST_SZ_DW(query_hca_cap_in)] = {}; int out_size; void *out; int ret; - if (!mdev) - return -ENOTCONN; - out_size = MLX5_ST_SZ_BYTES(query_hca_cap_out); out = kzalloc(out_size, GFP_KERNEL); - if (!out) { - ret = -ENOMEM; - goto end; - } + if (!out) + return -ENOMEM; MLX5_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP); MLX5_SET(query_hca_cap_in, in, other_function, 1); @@ -105,8 +173,6 @@ int mlx5vf_cmd_get_vhca_id(struct pci_dev *pdev, u16 function_id, u16 *vhca_id) err_exec: kfree(out); -end: - mlx5_vf_put_core_dev(mdev); return ret; } @@ -151,21 +217,68 @@ static int _create_state_mkey(struct mlx5_core_dev *mdev, u32 pdn, return err; } -int mlx5vf_cmd_save_vhca_state(struct pci_dev *pdev, u16 vhca_id, +void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work) +{ + struct mlx5vf_async_data 
*async_data = container_of(_work, + struct mlx5vf_async_data, work); + struct mlx5_vf_migration_file *migf = container_of(async_data, + struct mlx5_vf_migration_file, async_data); + struct mlx5_core_dev *mdev = migf->mvdev->mdev; + + mutex_lock(&migf->lock); + if (async_data->status) { + migf->is_err = true; + wake_up_interruptible(&migf->poll_wait); + } + mutex_unlock(&migf->lock); + + mlx5_core_destroy_mkey(mdev, async_data->mkey); + dma_unmap_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE, 0); + mlx5_core_dealloc_pd(mdev, async_data->pdn); + kvfree(async_data->out); + fput(migf->filp); +} + +static void mlx5vf_save_callback(int status, struct mlx5_async_work *context) +{ + struct mlx5vf_async_data *async_data = container_of(context, + struct mlx5vf_async_data, cb_work); + struct mlx5_vf_migration_file *migf = container_of(async_data, + struct mlx5_vf_migration_file, async_data); + + if (!status) { + WRITE_ONCE(migf->total_length, + MLX5_GET(save_vhca_state_out, async_data->out, + actual_image_size)); + wake_up_interruptible(&migf->poll_wait); + } + + /* + * The error and the cleanup flows can't run from an + * interrupt context + */ + async_data->status = status; + queue_work(migf->mvdev->cb_wq, &async_data->work); +} + +int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, struct mlx5_vf_migration_file *migf) { - struct mlx5_core_dev *mdev = mlx5_vf_get_core_dev(pdev); - u32 out[MLX5_ST_SZ_DW(save_vhca_state_out)] = {}; + u32 out_size = MLX5_ST_SZ_BYTES(save_vhca_state_out); u32 in[MLX5_ST_SZ_DW(save_vhca_state_in)] = {}; + struct mlx5vf_async_data *async_data; + struct mlx5_core_dev *mdev; u32 pdn, mkey; int err; - if (!mdev) + lockdep_assert_held(&mvdev->state_mutex); + if (mvdev->mdev_detach) return -ENOTCONN; + mdev = mvdev->mdev; err = mlx5_core_alloc_pd(mdev, &pdn); if (err) - goto end; + return err; err = dma_map_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE, 0); @@ -179,45 +292,54 @@ int mlx5vf_cmd_save_vhca_state(struct pci_dev *pdev, u16 vhca_id, MLX5_SET(save_vhca_state_in, in, opcode, MLX5_CMD_OP_SAVE_VHCA_STATE); MLX5_SET(save_vhca_state_in, in, op_mod, 0); - MLX5_SET(save_vhca_state_in, in, vhca_id, vhca_id); + MLX5_SET(save_vhca_state_in, in, vhca_id, mvdev->vhca_id); MLX5_SET(save_vhca_state_in, in, mkey, mkey); MLX5_SET(save_vhca_state_in, in, size, migf->total_length); - err = mlx5_cmd_exec_inout(mdev, save_vhca_state, in, out); + async_data = &migf->async_data; + async_data->out = kvzalloc(out_size, GFP_KERNEL); + if (!async_data->out) { + err = -ENOMEM; + goto err_out; + } + + /* no data exists till the callback comes back */ + migf->total_length = 0; + get_file(migf->filp); + async_data->mkey = mkey; + async_data->pdn = pdn; + err = mlx5_cmd_exec_cb(&migf->async_ctx, in, sizeof(in), + async_data->out, + out_size, mlx5vf_save_callback, + &async_data->cb_work); if (err) goto err_exec; - migf->total_length = - MLX5_GET(save_vhca_state_out, out, actual_image_size); - - mlx5_core_destroy_mkey(mdev, mkey); - mlx5_core_dealloc_pd(mdev, pdn); - dma_unmap_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE, 0); - mlx5_vf_put_core_dev(mdev); - return 0; err_exec: + fput(migf->filp); + kvfree(async_data->out); +err_out: mlx5_core_destroy_mkey(mdev, mkey); err_create_mkey: dma_unmap_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE, 0); err_dma_map: mlx5_core_dealloc_pd(mdev, pdn); -end: - mlx5_vf_put_core_dev(mdev); return err; } -int mlx5vf_cmd_load_vhca_state(struct pci_dev *pdev, u16 vhca_id, +int 
mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev, struct mlx5_vf_migration_file *migf) { - struct mlx5_core_dev *mdev = mlx5_vf_get_core_dev(pdev); + struct mlx5_core_dev *mdev; u32 out[MLX5_ST_SZ_DW(save_vhca_state_out)] = {}; u32 in[MLX5_ST_SZ_DW(save_vhca_state_in)] = {}; u32 pdn, mkey; int err; - if (!mdev) + lockdep_assert_held(&mvdev->state_mutex); + if (mvdev->mdev_detach) return -ENOTCONN; mutex_lock(&migf->lock); @@ -226,6 +348,7 @@ int mlx5vf_cmd_load_vhca_state(struct pci_dev *pdev, u16 vhca_id, goto end; } + mdev = mvdev->mdev; err = mlx5_core_alloc_pd(mdev, &pdn); if (err) goto end; @@ -241,7 +364,7 @@ int mlx5vf_cmd_load_vhca_state(struct pci_dev *pdev, u16 vhca_id, MLX5_SET(load_vhca_state_in, in, opcode, MLX5_CMD_OP_LOAD_VHCA_STATE); MLX5_SET(load_vhca_state_in, in, op_mod, 0); - MLX5_SET(load_vhca_state_in, in, vhca_id, vhca_id); + MLX5_SET(load_vhca_state_in, in, vhca_id, mvdev->vhca_id); MLX5_SET(load_vhca_state_in, in, mkey, mkey); MLX5_SET(load_vhca_state_in, in, size, migf->total_length); @@ -253,7 +376,6 @@ err_mkey: err_reg: mlx5_core_dealloc_pd(mdev, pdn); end: - mlx5_vf_put_core_dev(mdev); mutex_unlock(&migf->lock); return err; } diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h index 1392a11a9cc0..6c3112fdd8b1 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -7,12 +7,23 @@ #define MLX5_VFIO_CMD_H #include +#include #include +struct mlx5vf_async_data { + struct mlx5_async_work cb_work; + struct work_struct work; + int status; + u32 pdn; + u32 mkey; + void *out; +}; + struct mlx5_vf_migration_file { struct file *filp; struct mutex lock; - bool disabled; + u8 disabled:1; + u8 is_err:1; struct sg_append_table table; size_t total_length; @@ -22,15 +33,42 @@ struct mlx5_vf_migration_file { struct scatterlist *last_offset_sg; unsigned int sg_last_entry; unsigned long last_offset; + struct mlx5vf_pci_core_device *mvdev; + wait_queue_head_t poll_wait; + struct mlx5_async_ctx async_ctx; + struct mlx5vf_async_data async_data; }; -int mlx5vf_cmd_suspend_vhca(struct pci_dev *pdev, u16 vhca_id, u16 op_mod); -int mlx5vf_cmd_resume_vhca(struct pci_dev *pdev, u16 vhca_id, u16 op_mod); -int mlx5vf_cmd_query_vhca_migration_state(struct pci_dev *pdev, u16 vhca_id, +struct mlx5vf_pci_core_device { + struct vfio_pci_core_device core_device; + int vf_id; + u16 vhca_id; + u8 migrate_cap:1; + u8 deferred_reset:1; + u8 mdev_detach:1; + /* protect migration state */ + struct mutex state_mutex; + enum vfio_device_mig_state mig_state; + /* protect the reset_done flow */ + spinlock_t reset_lock; + struct mlx5_vf_migration_file *resuming_migf; + struct mlx5_vf_migration_file *saving_migf; + struct workqueue_struct *cb_wq; + struct notifier_block nb; + struct mlx5_core_dev *mdev; +}; + +int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod); +int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod); +int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev, size_t *state_size); -int mlx5vf_cmd_get_vhca_id(struct pci_dev *pdev, u16 function_id, u16 *vhca_id); -int mlx5vf_cmd_save_vhca_state(struct pci_dev *pdev, u16 vhca_id, +void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev); +void mlx5vf_cmd_remove_migratable(struct mlx5vf_pci_core_device *mvdev); +int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, struct mlx5_vf_migration_file *migf); -int mlx5vf_cmd_load_vhca_state(struct pci_dev *pdev, u16 vhca_id, +int 
mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev, struct mlx5_vf_migration_file *migf); +void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev); +void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev); +void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work); #endif /* MLX5_VFIO_CMD_H */ diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index bbec5d288fee..0558d0649ddb 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -17,7 +17,6 @@ #include #include #include -#include #include #include "cmd.h" @@ -25,19 +24,13 @@ /* Arbitrary to prevent userspace from consuming endless memory */ #define MAX_MIGRATION_SIZE (512*1024*1024) -struct mlx5vf_pci_core_device { - struct vfio_pci_core_device core_device; - u16 vhca_id; - u8 migrate_cap:1; - u8 deferred_reset:1; - /* protect migration state */ - struct mutex state_mutex; - enum vfio_device_mig_state mig_state; - /* protect the reset_done flow */ - spinlock_t reset_lock; - struct mlx5_vf_migration_file *resuming_migf; - struct mlx5_vf_migration_file *saving_migf; -}; +static struct mlx5vf_pci_core_device *mlx5vf_drvdata(struct pci_dev *pdev) +{ + struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev); + + return container_of(core_device, struct mlx5vf_pci_core_device, + core_device); +} static struct page * mlx5vf_get_migration_page(struct mlx5_vf_migration_file *migf, @@ -149,12 +142,22 @@ static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len, return -ESPIPE; pos = &filp->f_pos; + if (!(filp->f_flags & O_NONBLOCK)) { + if (wait_event_interruptible(migf->poll_wait, + READ_ONCE(migf->total_length) || migf->is_err)) + return -ERESTARTSYS; + } + mutex_lock(&migf->lock); + if ((filp->f_flags & O_NONBLOCK) && !READ_ONCE(migf->total_length)) { + done = -EAGAIN; + goto out_unlock; + } if (*pos > migf->total_length) { done = -EINVAL; goto out_unlock; } - if (migf->disabled) { + if (migf->disabled || migf->is_err) { done = -ENODEV; goto out_unlock; } @@ -194,9 +197,28 @@ out_unlock: return done; } +static __poll_t mlx5vf_save_poll(struct file *filp, + struct poll_table_struct *wait) +{ + struct mlx5_vf_migration_file *migf = filp->private_data; + __poll_t pollflags = 0; + + poll_wait(filp, &migf->poll_wait, wait); + + mutex_lock(&migf->lock); + if (migf->disabled || migf->is_err) + pollflags = EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; + else if (READ_ONCE(migf->total_length)) + pollflags = EPOLLIN | EPOLLRDNORM; + mutex_unlock(&migf->lock); + + return pollflags; +} + static const struct file_operations mlx5vf_save_fops = { .owner = THIS_MODULE, .read = mlx5vf_save_read, + .poll = mlx5vf_save_poll, .release = mlx5vf_release_file, .llseek = no_llseek, }; @@ -222,9 +244,11 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev) stream_open(migf->filp->f_inode, migf->filp); mutex_init(&migf->lock); - - ret = mlx5vf_cmd_query_vhca_migration_state( - mvdev->core_device.pdev, mvdev->vhca_id, &migf->total_length); + init_waitqueue_head(&migf->poll_wait); + mlx5_cmd_init_async_ctx(mvdev->mdev, &migf->async_ctx); + INIT_WORK(&migf->async_data.work, mlx5vf_mig_file_cleanup_cb); + ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, + &migf->total_length); if (ret) goto out_free; @@ -233,8 +257,8 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev) if (ret) goto out_free; - ret = mlx5vf_cmd_save_vhca_state(mvdev->core_device.pdev, - mvdev->vhca_id, migf); + migf->mvdev = mvdev; + ret = 
mlx5vf_cmd_save_vhca_state(mvdev, migf); if (ret) goto out_free; return migf; @@ -339,7 +363,7 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev) return migf; } -static void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev) +void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev) { if (mvdev->resuming_migf) { mlx5vf_disable_fd(mvdev->resuming_migf); @@ -347,6 +371,8 @@ static void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev) mvdev->resuming_migf = NULL; } if (mvdev->saving_migf) { + mlx5_cmd_cleanup_async_ctx(&mvdev->saving_migf->async_ctx); + cancel_work_sync(&mvdev->saving_migf->async_data.work); mlx5vf_disable_fd(mvdev->saving_migf); fput(mvdev->saving_migf->filp); mvdev->saving_migf = NULL; @@ -361,8 +387,7 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev, int ret; if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_STOP) { - ret = mlx5vf_cmd_suspend_vhca( - mvdev->core_device.pdev, mvdev->vhca_id, + ret = mlx5vf_cmd_suspend_vhca(mvdev, MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER); if (ret) return ERR_PTR(ret); @@ -370,8 +395,7 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev, } if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RUNNING_P2P) { - ret = mlx5vf_cmd_resume_vhca( - mvdev->core_device.pdev, mvdev->vhca_id, + ret = mlx5vf_cmd_resume_vhca(mvdev, MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_RESPONDER); if (ret) return ERR_PTR(ret); @@ -379,8 +403,7 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev, } if (cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) { - ret = mlx5vf_cmd_suspend_vhca( - mvdev->core_device.pdev, mvdev->vhca_id, + ret = mlx5vf_cmd_suspend_vhca(mvdev, MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_INITIATOR); if (ret) return ERR_PTR(ret); @@ -388,8 +411,7 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev, } if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) { - ret = mlx5vf_cmd_resume_vhca( - mvdev->core_device.pdev, mvdev->vhca_id, + ret = mlx5vf_cmd_resume_vhca(mvdev, MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_INITIATOR); if (ret) return ERR_PTR(ret); @@ -424,8 +446,7 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev, } if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) { - ret = mlx5vf_cmd_load_vhca_state(mvdev->core_device.pdev, - mvdev->vhca_id, + ret = mlx5vf_cmd_load_vhca_state(mvdev, mvdev->resuming_migf); if (ret) return ERR_PTR(ret); @@ -444,7 +465,7 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev, * This function is called in all state_mutex unlock cases to * handle a 'deferred_reset' if exists. 
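
The poll() support and non-blocking read added above let userspace wait for the device state that the asynchronous SAVE_VHCA_STATE callback publishes. A rough userspace consumer of such a migration data FD might look like the following sketch (the drain_migration_fd/out_fd names and the buffer size are illustrative only, and error reporting is elided):

#define _GNU_SOURCE		/* for POLLRDHUP */
#include <poll.h>
#include <unistd.h>

/* Copy the device's saved state out of a poll-capable migration FD. */
static int drain_migration_fd(int data_fd, int out_fd)
{
	char buf[4096];

	for (;;) {
		struct pollfd pfd = { .fd = data_fd, .events = POLLIN };
		ssize_t n;

		/* Sleep until the async save callback publishes data */
		if (poll(&pfd, 1, -1) < 0)
			return -1;
		if (pfd.revents & (POLLERR | POLLRDHUP))
			return -1;	/* device signalled an error */

		n = read(data_fd, buf, sizeof(buf));
		if (n < 0)
			return -1;
		if (n == 0)
			return 0;	/* entire image transferred */
		if (write(out_fd, buf, n) != n)
			return -1;
	}
}
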
*/ -static void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev) +void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev) { again: spin_lock(&mvdev->reset_lock); @@ -505,7 +526,7 @@ static int mlx5vf_pci_get_device_state(struct vfio_device *vdev, static void mlx5vf_pci_aer_reset_done(struct pci_dev *pdev) { - struct mlx5vf_pci_core_device *mvdev = dev_get_drvdata(&pdev->dev); + struct mlx5vf_pci_core_device *mvdev = mlx5vf_drvdata(pdev); if (!mvdev->migrate_cap) return; @@ -532,34 +553,16 @@ static int mlx5vf_pci_open_device(struct vfio_device *core_vdev) struct mlx5vf_pci_core_device *mvdev = container_of( core_vdev, struct mlx5vf_pci_core_device, core_device.vdev); struct vfio_pci_core_device *vdev = &mvdev->core_device; - int vf_id; int ret; ret = vfio_pci_core_enable(vdev); if (ret) return ret; - if (!mvdev->migrate_cap) { - vfio_pci_core_finish_enable(vdev); - return 0; - } - - vf_id = pci_iov_vf_id(vdev->pdev); - if (vf_id < 0) { - ret = vf_id; - goto out_disable; - } - - ret = mlx5vf_cmd_get_vhca_id(vdev->pdev, vf_id + 1, &mvdev->vhca_id); - if (ret) - goto out_disable; - - mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING; + if (mvdev->migrate_cap) + mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING; vfio_pci_core_finish_enable(vdev); return 0; -out_disable: - vfio_pci_core_disable(vdev); - return ret; } static void mlx5vf_pci_close_device(struct vfio_device *core_vdev) @@ -596,32 +599,15 @@ static int mlx5vf_pci_probe(struct pci_dev *pdev, if (!mvdev) return -ENOMEM; vfio_pci_core_init_device(&mvdev->core_device, pdev, &mlx5vf_pci_ops); - - if (pdev->is_virtfn) { - struct mlx5_core_dev *mdev = - mlx5_vf_get_core_dev(pdev); - - if (mdev) { - if (MLX5_CAP_GEN(mdev, migration)) { - mvdev->migrate_cap = 1; - mvdev->core_device.vdev.migration_flags = - VFIO_MIGRATION_STOP_COPY | - VFIO_MIGRATION_P2P; - mutex_init(&mvdev->state_mutex); - spin_lock_init(&mvdev->reset_lock); - } - mlx5_vf_put_core_dev(mdev); - } - } - + mlx5vf_cmd_set_migratable(mvdev); + dev_set_drvdata(&pdev->dev, &mvdev->core_device); ret = vfio_pci_core_register_device(&mvdev->core_device); if (ret) goto out_free; - - dev_set_drvdata(&pdev->dev, mvdev); return 0; out_free: + mlx5vf_cmd_remove_migratable(mvdev); vfio_pci_core_uninit_device(&mvdev->core_device); kfree(mvdev); return ret; @@ -629,9 +615,10 @@ out_free: static void mlx5vf_pci_remove(struct pci_dev *pdev) { - struct mlx5vf_pci_core_device *mvdev = dev_get_drvdata(&pdev->dev); + struct mlx5vf_pci_core_device *mvdev = mlx5vf_drvdata(pdev); vfio_pci_core_unregister_device(&mvdev->core_device); + mlx5vf_cmd_remove_migratable(mvdev); vfio_pci_core_uninit_device(&mvdev->core_device); kfree(mvdev); } @@ -654,6 +641,7 @@ static struct pci_driver mlx5vf_pci_driver = { .probe = mlx5vf_pci_probe, .remove = mlx5vf_pci_remove, .err_handler = &mlx5vf_err_handlers, + .driver_managed_dma = true, }; static void __exit mlx5vf_pci_cleanup(void) diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 58839206d1ca..4d1a97415a27 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -151,10 +151,10 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) return -ENOMEM; vfio_pci_core_init_device(vdev, pdev, &vfio_pci_ops); + dev_set_drvdata(&pdev->dev, vdev); ret = vfio_pci_core_register_device(vdev); if (ret) goto out_free; - dev_set_drvdata(&pdev->dev, vdev); return 0; out_free: @@ -174,10 +174,12 @@ static void vfio_pci_remove(struct pci_dev *pdev) static int 
vfio_pci_sriov_configure(struct pci_dev *pdev, int nr_virtfn)
 {
+	struct vfio_pci_core_device *vdev = dev_get_drvdata(&pdev->dev);
+
 	if (!enable_sriov)
 		return -ENOENT;
 
-	return vfio_pci_core_sriov_configure(pdev, nr_virtfn);
+	return vfio_pci_core_sriov_configure(vdev, nr_virtfn);
 }
 
 static const struct pci_device_id vfio_pci_table[] = {
diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c
index 6e58b4bf7a60..9343f597182d 100644
--- a/drivers/vfio/pci/vfio_pci_config.c
+++ b/drivers/vfio/pci/vfio_pci_config.c
@@ -402,11 +402,14 @@ bool __vfio_pci_memory_enabled(struct vfio_pci_core_device *vdev)
 	u16 cmd = le16_to_cpu(*(__le16 *)&vdev->vconfig[PCI_COMMAND]);
 
 	/*
+	 * The memory region cannot be accessed if the device power state is D3.
+	 *
 	 * SR-IOV VF memory enable is handled by the MSE bit in the
 	 * PF SR-IOV capability, there's therefore no need to trigger
 	 * faults based on the virtual value.
 	 */
-	return pdev->no_command_memory || (cmd & PCI_COMMAND_MEMORY);
+	return pdev->current_state < PCI_D3hot &&
+	       (pdev->no_command_memory || (cmd & PCI_COMMAND_MEMORY));
 }
 
 /*
@@ -692,6 +695,22 @@ static int __init init_pci_cap_basic_perm(struct perm_bits *perm)
 	return 0;
 }
 
+/*
+ * Take all the locks required to protect access to the power-related
+ * variables, then invoke vfio_pci_set_power_state().
+ */
+static void vfio_lock_and_set_power_state(struct vfio_pci_core_device *vdev,
+					  pci_power_t state)
+{
+	if (state >= PCI_D3hot)
+		vfio_pci_zap_and_down_write_memory_lock(vdev);
+	else
+		down_write(&vdev->memory_lock);
+
+	vfio_pci_set_power_state(vdev, state);
+	up_write(&vdev->memory_lock);
+}
+
 static int vfio_pm_config_write(struct vfio_pci_core_device *vdev, int pos,
 				int count, struct perm_bits *perm,
 				int offset, __le32 val)
@@ -718,7 +737,7 @@ static int vfio_pm_config_write(struct vfio_pci_core_device *vdev, int pos,
 			break;
 		}
 
-		vfio_pci_set_power_state(vdev, state);
+		vfio_lock_and_set_power_state(vdev, state);
 	}
 
 	return count;
@@ -738,12 +757,29 @@ static int __init init_pci_cap_pm_perm(struct perm_bits *perm)
 	 */
 	p_setb(perm, PCI_CAP_LIST_NEXT, (u8)ALL_VIRT, NO_WRITE);
 
+	/*
+	 * Guests can't process PME events. If a PME event is generated, it
+	 * will mostly be handled by the host, and the host will clear
+	 * PME_STATUS. So virtualize the PME_Support bits. The vconfig bits
+	 * will be cleared during device capability initialization.
+	 */
+	p_setw(perm, PCI_PM_PMC, PCI_PM_CAP_PME_MASK, NO_WRITE);
+
 	/*
 	 * Power management is defined *per function*, so we can let
 	 * the user change power state, but we trap and initiate the
 	 * change ourselves, so the state bits are read-only.
+	 *
+	 * The guest can't process PME events from D3cold, so virtualize the
+	 * PME_Status and PME_En bits. The vconfig bits will be cleared
+	 * during device capability initialization.
 	 */
-	p_setd(perm, PCI_PM_CTRL, NO_VIRT, ~PCI_PM_CTRL_STATE_MASK);
+	p_setd(perm, PCI_PM_CTRL,
+	       PCI_PM_CTRL_PME_ENABLE | PCI_PM_CTRL_PME_STATUS,
+	       ~(PCI_PM_CTRL_PME_ENABLE | PCI_PM_CTRL_PME_STATUS |
+		 PCI_PM_CTRL_STATE_MASK));
+
 	return 0;
 }
 
@@ -1412,6 +1448,17 @@ static int vfio_ext_cap_len(struct vfio_pci_core_device *vdev, u16 ecap, u16 epo
 	return 0;
 }
 
+static void vfio_update_pm_vconfig_bytes(struct vfio_pci_core_device *vdev,
+					 int offset)
+{
+	__le16 *pmc = (__le16 *)&vdev->vconfig[offset + PCI_PM_PMC];
+	__le16 *ctrl = (__le16 *)&vdev->vconfig[offset + PCI_PM_CTRL];
+
+	/* Clear vconfig PME_Support, PME_Status, and PME_En bits */
+	*pmc &= ~cpu_to_le16(PCI_PM_CAP_PME_MASK);
+	*ctrl &= ~cpu_to_le16(PCI_PM_CTRL_PME_ENABLE | PCI_PM_CTRL_PME_STATUS);
+}
+
 static int vfio_fill_vconfig_bytes(struct vfio_pci_core_device *vdev,
 				   int offset, int size)
 {
@@ -1535,6 +1582,9 @@ static int vfio_cap_init(struct vfio_pci_core_device *vdev)
 		if (ret)
 			return ret;
 
+		if (cap == PCI_CAP_ID_PM)
+			vfio_update_pm_vconfig_bytes(vdev, pos);
+
 		prev = &vdev->vconfig[pos + PCI_CAP_LIST_NEXT];
 		pos = next;
 		caps++;
diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index 06b6f3594a13..a0d69ddaf90d 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -156,7 +156,7 @@ no_mmap:
 }
 
 struct vfio_pci_group_info;
-static bool vfio_pci_dev_set_try_reset(struct vfio_device_set *dev_set);
+static void vfio_pci_dev_set_try_reset(struct vfio_device_set *dev_set);
 static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
 				      struct vfio_pci_group_info *groups);
 
@@ -217,6 +217,10 @@ int vfio_pci_set_power_state(struct vfio_pci_core_device *vdev, pci_power_t stat
 	bool needs_restore = false, needs_save = false;
 	int ret;
 
+	/* Prevent changing power state for PFs with VFs enabled */
+	if (pci_num_vf(pdev) && state > PCI_D0)
+		return -EBUSY;
+
 	if (vdev->needs_pm_restore) {
 		if (pdev->current_state < PCI_D3hot && state >= PCI_D3hot) {
 			pci_save_state(pdev);
@@ -255,6 +259,17 @@ int vfio_pci_set_power_state(struct vfio_pci_core_device *vdev, pci_power_t stat
 	return ret;
 }
 
+/*
+ * A dev_pm_ops structure must be provided for pci-driver runtime PM to
+ * work, so use a structure without any callbacks.
+ *
+ * The pci-driver core runtime PM routines always save the device state
+ * before going into a suspended state. If the device goes into a low power
+ * state with only runtime PM ops, no explicit handling is needed for
+ * devices which have NoSoftRst-.
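
The same no-callback dev_pm_ops trick can be applied by any PCI driver that wants only the PCI core's generic runtime-PM state handling. A minimal sketch, assuming a hypothetical "demo" driver (id table and remove path omitted; the probe-time usage-count drop follows the usual PCI runtime-PM pattern):

#include <linux/pci.h>
#include <linux/pm_runtime.h>

/* Empty on purpose: the PCI core saves/restores device state around
 * runtime suspend/resume, so no driver callbacks are required.
 */
static const struct dev_pm_ops demo_pm_ops = { };

static int demo_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
	int ret = pcim_enable_device(pdev);

	if (ret)
		return ret;
	pm_runtime_put_noidle(&pdev->dev);	/* drop the probe-time ref */
	pm_runtime_allow(&pdev->dev);		/* device may idle into D3 */
	return 0;
}

static struct pci_driver demo_driver = {
	.name		= "demo-pci",
	/* .id_table and .remove omitted in this sketch */
	.probe		= demo_probe,
	.driver.pm	= &demo_pm_ops,
};
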
+ */ +static const struct dev_pm_ops vfio_pci_core_pm_ops = { }; + int vfio_pci_core_enable(struct vfio_pci_core_device *vdev) { struct pci_dev *pdev = vdev->pdev; @@ -262,21 +277,23 @@ int vfio_pci_core_enable(struct vfio_pci_core_device *vdev) u16 cmd; u8 msix_pos; - vfio_pci_set_power_state(vdev, PCI_D0); + if (!disable_idle_d3) { + ret = pm_runtime_resume_and_get(&pdev->dev); + if (ret < 0) + return ret; + } /* Don't allow our initial saved state to include busmaster */ pci_clear_master(pdev); ret = pci_enable_device(pdev); if (ret) - return ret; + goto out_power; /* If reset fails because of the device lock, fail this path entirely */ ret = pci_try_reset_function(pdev); - if (ret == -EAGAIN) { - pci_disable_device(pdev); - return ret; - } + if (ret == -EAGAIN) + goto out_disable_device; vdev->reset_works = !ret; pci_save_state(pdev); @@ -300,12 +317,8 @@ int vfio_pci_core_enable(struct vfio_pci_core_device *vdev) } ret = vfio_config_init(vdev); - if (ret) { - kfree(vdev->pci_saved_state); - vdev->pci_saved_state = NULL; - pci_disable_device(pdev); - return ret; - } + if (ret) + goto out_free_state; msix_pos = pdev->msix_cap; if (msix_pos) { @@ -326,6 +339,16 @@ int vfio_pci_core_enable(struct vfio_pci_core_device *vdev) return 0; + +out_free_state: + kfree(vdev->pci_saved_state); + vdev->pci_saved_state = NULL; +out_disable_device: + pci_disable_device(pdev); +out_power: + if (!disable_idle_d3) + pm_runtime_put(&pdev->dev); + return ret; } EXPORT_SYMBOL_GPL(vfio_pci_core_enable); @@ -433,8 +456,11 @@ void vfio_pci_core_disable(struct vfio_pci_core_device *vdev) out: pci_disable_device(pdev); - if (!vfio_pci_dev_set_try_reset(vdev->vdev.dev_set) && !disable_idle_d3) - vfio_pci_set_power_state(vdev, PCI_D3hot); + vfio_pci_dev_set_try_reset(vdev->vdev.dev_set); + + /* Put the pm-runtime usage counter acquired during enable */ + if (!disable_idle_d3) + pm_runtime_put(&pdev->dev); } EXPORT_SYMBOL_GPL(vfio_pci_core_disable); @@ -556,7 +582,7 @@ static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data) struct vfio_pci_group_info { int count; - struct vfio_group **groups; + struct file **files; }; static bool vfio_pci_dev_below_slot(struct pci_dev *pdev, struct pci_slot *slot) @@ -1018,10 +1044,10 @@ reset_info_exit: } else if (cmd == VFIO_DEVICE_PCI_HOT_RESET) { struct vfio_pci_hot_reset hdr; int32_t *group_fds; - struct vfio_group **groups; + struct file **files; struct vfio_pci_group_info info; bool slot = false; - int group_idx, count = 0, ret = 0; + int file_idx, count = 0, ret = 0; minsz = offsetofend(struct vfio_pci_hot_reset, count); @@ -1054,17 +1080,17 @@ reset_info_exit: return -EINVAL; group_fds = kcalloc(hdr.count, sizeof(*group_fds), GFP_KERNEL); - groups = kcalloc(hdr.count, sizeof(*groups), GFP_KERNEL); - if (!group_fds || !groups) { + files = kcalloc(hdr.count, sizeof(*files), GFP_KERNEL); + if (!group_fds || !files) { kfree(group_fds); - kfree(groups); + kfree(files); return -ENOMEM; } if (copy_from_user(group_fds, (void __user *)(arg + minsz), hdr.count * sizeof(*group_fds))) { kfree(group_fds); - kfree(groups); + kfree(files); return -EFAULT; } @@ -1073,22 +1099,22 @@ reset_info_exit: * user interface and store the group and iommu ID. This * ensures the group is held across the reset. 
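
Stripped of the vfio specifics, the usage-counter pairing introduced in vfio_pci_core_enable()/vfio_pci_core_disable() above reduces to the following sketch (hypothetical demo_dev wrapper: the reference taken in demo_open() keeps the device in D0 until demo_close() releases it):

#include <linux/pm_runtime.h>

struct demo_dev {
	struct device *dev;
};

/* Open: resume the device and hold a usage-count reference so runtime
 * PM cannot suspend it while a user owns it.
 */
static int demo_open(struct demo_dev *d)
{
	int ret = pm_runtime_resume_and_get(d->dev);

	if (ret < 0)
		return ret;	/* counter already dropped on failure */
	return 0;
}

/* Close: drop the reference taken in demo_open(); the device may now
 * runtime-suspend (reach D3) once it goes idle.
 */
static void demo_close(struct demo_dev *d)
{
	pm_runtime_put(d->dev);
}
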
*/ - for (group_idx = 0; group_idx < hdr.count; group_idx++) { - struct vfio_group *group; - struct fd f = fdget(group_fds[group_idx]); - if (!f.file) { + for (file_idx = 0; file_idx < hdr.count; file_idx++) { + struct file *file = fget(group_fds[file_idx]); + + if (!file) { ret = -EBADF; break; } - group = vfio_group_get_external_user(f.file); - fdput(f); - if (IS_ERR(group)) { - ret = PTR_ERR(group); + /* Ensure the FD is a vfio group FD.*/ + if (!vfio_file_iommu_group(file)) { + fput(file); + ret = -EINVAL; break; } - groups[group_idx] = group; + files[file_idx] = file; } kfree(group_fds); @@ -1098,15 +1124,15 @@ reset_info_exit: goto hot_reset_release; info.count = hdr.count; - info.groups = groups; + info.files = files; ret = vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, &info); hot_reset_release: - for (group_idx--; group_idx >= 0; group_idx--) - vfio_group_put_external_user(groups[group_idx]); + for (file_idx--; file_idx >= 0; file_idx--) + fput(files[file_idx]); - kfree(groups); + kfree(files); return ret; } else if (cmd == VFIO_DEVICE_IOEVENTFD) { struct vfio_device_ioeventfd ioeventfd; @@ -1819,8 +1845,13 @@ EXPORT_SYMBOL_GPL(vfio_pci_core_uninit_device); int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev) { struct pci_dev *pdev = vdev->pdev; + struct device *dev = &pdev->dev; int ret; + /* Drivers must set the vfio_pci_core_device to their drvdata */ + if (WARN_ON(vdev != dev_get_drvdata(dev))) + return -EINVAL; + if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL) return -EINVAL; @@ -1860,19 +1891,21 @@ int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev) vfio_pci_probe_power_state(vdev); - if (!disable_idle_d3) { - /* - * pci-core sets the device power state to an unknown value at - * bootup and after being removed from a driver. The only - * transition it allows from this unknown state is to D0, which - * typically happens when a driver calls pci_enable_device(). - * We're not ready to enable the device yet, but we do want to - * be able to get to D3. Therefore first do a D0 transition - * before going to D3. - */ - vfio_pci_set_power_state(vdev, PCI_D0); - vfio_pci_set_power_state(vdev, PCI_D3hot); - } + /* + * pci-core sets the device power state to an unknown value at + * bootup and after being removed from a driver. The only + * transition it allows from this unknown state is to D0, which + * typically happens when a driver calls pci_enable_device(). + * We're not ready to enable the device yet, but we do want to + * be able to get to D3. Therefore first do a D0 transition + * before enabling runtime PM. 
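
On the userspace side, the reworked VFIO_DEVICE_PCI_HOT_RESET path is driven by passing the group FDs that cover every affected device; a hypothetical caller (fixed eight-group limit, no error detail) could be:

#include <linux/vfio.h>
#include <string.h>
#include <sys/ioctl.h>

static int hot_reset(int device_fd, const int *group_fds, int ngroups)
{
	char buf[sizeof(struct vfio_pci_hot_reset) + 8 * sizeof(__s32)];
	struct vfio_pci_hot_reset *reset = (void *)buf;

	if (ngroups > 8)
		return -1;
	memset(buf, 0, sizeof(buf));
	/* The kernel verifies each FD really is a vfio group FD */
	reset->argsz = sizeof(*reset) + ngroups * sizeof(__s32);
	reset->count = ngroups;
	memcpy(reset->group_fds, group_fds, ngroups * sizeof(__s32));
	return ioctl(device_fd, VFIO_DEVICE_PCI_HOT_RESET, reset);
}
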
+ */ + vfio_pci_set_power_state(vdev, PCI_D0); + + dev->driver->pm = &vfio_pci_core_pm_ops; + pm_runtime_allow(dev); + if (!disable_idle_d3) + pm_runtime_put(dev); ret = vfio_register_group_dev(&vdev->vdev); if (ret) @@ -1881,7 +1914,9 @@ int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev) out_power: if (!disable_idle_d3) - vfio_pci_set_power_state(vdev, PCI_D0); + pm_runtime_get_noresume(dev); + + pm_runtime_forbid(dev); out_vf: vfio_pci_vf_uninit(vdev); return ret; @@ -1890,9 +1925,7 @@ EXPORT_SYMBOL_GPL(vfio_pci_core_register_device); void vfio_pci_core_unregister_device(struct vfio_pci_core_device *vdev) { - struct pci_dev *pdev = vdev->pdev; - - vfio_pci_core_sriov_configure(pdev, 0); + vfio_pci_core_sriov_configure(vdev, 0); vfio_unregister_group_dev(&vdev->vdev); @@ -1900,21 +1933,16 @@ void vfio_pci_core_unregister_device(struct vfio_pci_core_device *vdev) vfio_pci_vga_uninit(vdev); if (!disable_idle_d3) - vfio_pci_set_power_state(vdev, PCI_D0); + pm_runtime_get_noresume(&vdev->pdev->dev); + + pm_runtime_forbid(&vdev->pdev->dev); } EXPORT_SYMBOL_GPL(vfio_pci_core_unregister_device); pci_ers_result_t vfio_pci_core_aer_err_detected(struct pci_dev *pdev, pci_channel_state_t state) { - struct vfio_pci_core_device *vdev; - struct vfio_device *device; - - device = vfio_device_get_from_dev(&pdev->dev); - if (device == NULL) - return PCI_ERS_RESULT_DISCONNECT; - - vdev = container_of(device, struct vfio_pci_core_device, vdev); + struct vfio_pci_core_device *vdev = dev_get_drvdata(&pdev->dev); mutex_lock(&vdev->igate); @@ -1923,26 +1951,18 @@ pci_ers_result_t vfio_pci_core_aer_err_detected(struct pci_dev *pdev, mutex_unlock(&vdev->igate); - vfio_device_put(device); - return PCI_ERS_RESULT_CAN_RECOVER; } EXPORT_SYMBOL_GPL(vfio_pci_core_aer_err_detected); -int vfio_pci_core_sriov_configure(struct pci_dev *pdev, int nr_virtfn) +int vfio_pci_core_sriov_configure(struct vfio_pci_core_device *vdev, + int nr_virtfn) { - struct vfio_pci_core_device *vdev; - struct vfio_device *device; + struct pci_dev *pdev = vdev->pdev; int ret = 0; device_lock_assert(&pdev->dev); - device = vfio_device_get_from_dev(&pdev->dev); - if (!device) - return -ENODEV; - - vdev = container_of(device, struct vfio_pci_core_device, vdev); - if (nr_virtfn) { mutex_lock(&vfio_pci_sriov_pfs_mutex); /* @@ -1957,22 +1977,42 @@ int vfio_pci_core_sriov_configure(struct pci_dev *pdev, int nr_virtfn) } list_add_tail(&vdev->sriov_pfs_item, &vfio_pci_sriov_pfs); mutex_unlock(&vfio_pci_sriov_pfs_mutex); - ret = pci_enable_sriov(pdev, nr_virtfn); + + /* + * The PF power state should always be higher than the VF power + * state. The PF can be in low power state either with runtime + * power management (when there is no user) or PCI_PM_CTRL + * register write by the user. If PF is in the low power state, + * then change the power state to D0 first before enabling + * SR-IOV. Also, this function can be called at any time, and + * userspace PCI_PM_CTRL write can race against this code path, + * so protect the same with 'memory_lock'. 
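
The WARN_ON() added to vfio_pci_core_register_device() above makes setting drvdata before registration a hard requirement for variant drivers, matching the mlx5 probe change earlier in this series. A probe sketch under that contract (hypothetical "demo" driver; demo_vfio_ops is assumed to be defined elsewhere):

#include <linux/pci.h>
#include <linux/slab.h>
#include <linux/vfio_pci_core.h>

static const struct vfio_device_ops demo_vfio_ops;	/* ops omitted */

struct demo_vfio_device {
	struct vfio_pci_core_device core_device;
	/* driver-private state would live here */
};

static int demo_vfio_probe(struct pci_dev *pdev,
			   const struct pci_device_id *id)
{
	struct demo_vfio_device *dvdev;
	int ret;

	dvdev = kzalloc(sizeof(*dvdev), GFP_KERNEL);
	if (!dvdev)
		return -ENOMEM;
	vfio_pci_core_init_device(&dvdev->core_device, pdev, &demo_vfio_ops);
	/* Must precede register: the core now WARNs and fails otherwise */
	dev_set_drvdata(&pdev->dev, &dvdev->core_device);
	ret = vfio_pci_core_register_device(&dvdev->core_device);
	if (ret) {
		vfio_pci_core_uninit_device(&dvdev->core_device);
		kfree(dvdev);
	}
	return ret;
}
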
+ */ + ret = pm_runtime_resume_and_get(&pdev->dev); if (ret) goto out_del; - ret = nr_virtfn; - goto out_put; + + down_write(&vdev->memory_lock); + vfio_pci_set_power_state(vdev, PCI_D0); + ret = pci_enable_sriov(pdev, nr_virtfn); + up_write(&vdev->memory_lock); + if (ret) { + pm_runtime_put(&pdev->dev); + goto out_del; + } + return nr_virtfn; } - pci_disable_sriov(pdev); + if (pci_num_vf(pdev)) { + pci_disable_sriov(pdev); + pm_runtime_put(&pdev->dev); + } out_del: mutex_lock(&vfio_pci_sriov_pfs_mutex); list_del_init(&vdev->sriov_pfs_item); out_unlock: mutex_unlock(&vfio_pci_sriov_pfs_mutex); -out_put: - vfio_device_put(device); return ret; } EXPORT_SYMBOL_GPL(vfio_pci_core_sriov_configure); @@ -1988,7 +2028,7 @@ static bool vfio_dev_in_groups(struct vfio_pci_core_device *vdev, unsigned int i; for (i = 0; i < groups->count; i++) - if (groups->groups[i] == vdev->vdev.group) + if (vfio_file_has_dev(groups->files[i], &vdev->vdev)) return true; return false; } @@ -2041,6 +2081,27 @@ vfio_pci_dev_set_resettable(struct vfio_device_set *dev_set) return pdev; } +static int vfio_pci_dev_set_pm_runtime_get(struct vfio_device_set *dev_set) +{ + struct vfio_pci_core_device *cur; + int ret; + + list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) { + ret = pm_runtime_resume_and_get(&cur->pdev->dev); + if (ret) + goto unwind; + } + + return 0; + +unwind: + list_for_each_entry_continue_reverse(cur, &dev_set->device_list, + vdev.dev_set_list) + pm_runtime_put(&cur->pdev->dev); + + return ret; +} + /* * We need to get memory_lock for each device, but devices can share mmap_lock, * therefore we need to zap and hold the vma_lock for each device, and only then @@ -2147,43 +2208,38 @@ static bool vfio_pci_dev_set_needs_reset(struct vfio_device_set *dev_set) * - At least one of the affected devices is marked dirty via * needs_reset (such as by lack of FLR support) * Then attempt to perform that bus or slot reset. - * Returns true if the dev_set was reset. */ -static bool vfio_pci_dev_set_try_reset(struct vfio_device_set *dev_set) +static void vfio_pci_dev_set_try_reset(struct vfio_device_set *dev_set) { struct vfio_pci_core_device *cur; struct pci_dev *pdev; - int ret; + bool reset_done = false; if (!vfio_pci_dev_set_needs_reset(dev_set)) - return false; + return; pdev = vfio_pci_dev_set_resettable(dev_set); if (!pdev) - return false; + return; /* - * The pci_reset_bus() will reset all the devices in the bus. - * The power state can be non-D0 for some of the devices in the bus. - * For these devices, the pci_reset_bus() will internally set - * the power state to D0 without vfio driver involvement. - * For the devices which have NoSoftRst-, the reset function can - * cause the PCI config space reset without restoring the original - * state (saved locally in 'vdev->pm_save'). + * Some of the devices in the bus can be in the runtime suspended + * state. Increment the usage count for all the devices in the dev_set + * before reset and decrement the same after reset. 
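
vfio_pci_dev_set_pm_runtime_get() above uses the all-or-nothing acquire pattern built around list_for_each_entry_continue_reverse(), which walks backwards from the current position and therefore skips the entry that failed. In isolation (hypothetical demo_entry list):

#include <linux/list.h>
#include <linux/pm_runtime.h>

struct demo_entry {
	struct device *dev;
	struct list_head node;
};

/* Resume every device on the list or none: on failure, walk back over
 * only the entries whose usage count was successfully raised.
 */
static int demo_resume_all_or_none(struct list_head *devs)
{
	struct demo_entry *cur;
	int ret;

	list_for_each_entry(cur, devs, node) {
		ret = pm_runtime_resume_and_get(cur->dev);
		if (ret)
			goto unwind;
	}
	return 0;

unwind:
	list_for_each_entry_continue_reverse(cur, devs, node)
		pm_runtime_put(cur->dev);
	return ret;
}
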
*/ - list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) - vfio_pci_set_power_state(cur, PCI_D0); + if (!disable_idle_d3 && vfio_pci_dev_set_pm_runtime_get(dev_set)) + return; - ret = pci_reset_bus(pdev); - if (ret) - return false; + if (!pci_reset_bus(pdev)) + reset_done = true; list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) { - cur->needs_reset = false; + if (reset_done) + cur->needs_reset = false; + if (!disable_idle_d3) - vfio_pci_set_power_state(cur, PCI_D3hot); + pm_runtime_put(&cur->pdev->dev); } - return true; } void vfio_pci_core_set_params(bool is_nointxmask, bool is_disable_vga, diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index 6a7fb8d91aaa..61e71c1154be 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -66,18 +66,18 @@ struct vfio_group { struct device dev; struct cdev cdev; refcount_t users; - atomic_t container_users; + unsigned int container_users; struct iommu_group *iommu_group; struct vfio_container *container; struct list_head device_list; struct mutex device_lock; struct list_head vfio_next; struct list_head container_next; - atomic_t opened; - wait_queue_head_t container_q; enum vfio_group_type type; unsigned int dev_counter; + struct rw_semaphore group_rwsem; struct kvm *kvm; + struct file *opened_file; struct blocking_notifier_head notifier; }; @@ -361,9 +361,9 @@ static struct vfio_group *vfio_group_alloc(struct iommu_group *iommu_group, group->cdev.owner = THIS_MODULE; refcount_set(&group->users, 1); + init_rwsem(&group->group_rwsem); INIT_LIST_HEAD(&group->device_list); mutex_init(&group->device_lock); - init_waitqueue_head(&group->container_q); group->iommu_group = iommu_group; /* put in vfio_group_release() */ iommu_group_ref_get(iommu_group); @@ -429,7 +429,7 @@ static void vfio_group_put(struct vfio_group *group) * properly hold the group reference. */ WARN_ON(!list_empty(&group->device_list)); - WARN_ON(atomic_read(&group->container_users)); + WARN_ON(group->container || group->container_users); WARN_ON(group->notifier.head); list_del(&group->vfio_next); @@ -444,31 +444,15 @@ static void vfio_group_get(struct vfio_group *group) refcount_inc(&group->users); } -static struct vfio_group *vfio_group_get_from_dev(struct device *dev) -{ - struct iommu_group *iommu_group; - struct vfio_group *group; - - iommu_group = iommu_group_get(dev); - if (!iommu_group) - return NULL; - - group = vfio_group_get_from_iommu(iommu_group); - iommu_group_put(iommu_group); - - return group; -} - /* * Device objects - create, release, get, put, search */ /* Device reference always implies a group reference */ -void vfio_device_put(struct vfio_device *device) +static void vfio_device_put(struct vfio_device *device) { if (refcount_dec_and_test(&device->refcount)) complete(&device->comp); } -EXPORT_SYMBOL_GPL(vfio_device_put); static bool vfio_device_try_get(struct vfio_device *device) { @@ -547,11 +531,11 @@ static struct vfio_group *vfio_group_find_or_alloc(struct device *dev) iommu_group = iommu_group_get(dev); #ifdef CONFIG_VFIO_NOIOMMU - if (!iommu_group && noiommu && !iommu_present(dev->bus)) { + if (!iommu_group && noiommu) { /* * With noiommu enabled, create an IOMMU group for devices that - * don't already have one and don't have an iommu_ops on their - * bus. Taint the kernel because we're about to give a DMA + * don't already have one, implying no IOMMU hardware/driver + * exists. Taint the kernel because we're about to give a DMA * capable device to a user without IOMMU protection. 
*/ group = vfio_noiommu_group_alloc(dev, VFIO_NO_IOMMU); @@ -640,29 +624,6 @@ int vfio_register_emulated_iommu_dev(struct vfio_device *device) } EXPORT_SYMBOL_GPL(vfio_register_emulated_iommu_dev); -/* - * Get a reference to the vfio_device for a device. Even if the - * caller thinks they own the device, they could be racing with a - * release call path, so we can't trust drvdata for the shortcut. - * Go the long way around, from the iommu_group to the vfio_group - * to the vfio_device. - */ -struct vfio_device *vfio_device_get_from_dev(struct device *dev) -{ - struct vfio_group *group; - struct vfio_device *device; - - group = vfio_group_get_from_dev(dev); - if (!group) - return NULL; - - device = vfio_group_get_device(group, dev); - vfio_group_put(group); - - return device; -} -EXPORT_SYMBOL_GPL(vfio_device_get_from_dev); - static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group, char *buf) { @@ -730,23 +691,6 @@ void vfio_unregister_group_dev(struct vfio_device *device) group->dev_counter--; mutex_unlock(&group->device_lock); - /* - * In order to support multiple devices per group, devices can be - * plucked from the group while other devices in the group are still - * in use. The container persists with this group and those remaining - * devices still attached. If the user creates an isolation violation - * by binding this device to another driver while the group is still in - * use, that's their fault. However, in the case of removing the last, - * or potentially the only, device in the group there can be no other - * in-use devices in the group. The user has done their due diligence - * and we should lay no claims to those devices. In order to do that, - * we need to make sure the group is detached from the container. - * Without this stall, we're potentially racing with a user process - * that may attempt to immediately bind this device to another driver. - */ - if (list_empty(&group->device_list)) - wait_event(group->container_q, !group->container); - if (group->type == VFIO_NO_IOMMU || group->type == VFIO_EMULATED_IOMMU) iommu_group_remove_device(device->dev); @@ -981,6 +925,8 @@ static void __vfio_group_unset_container(struct vfio_group *group) struct vfio_container *container = group->container; struct vfio_iommu_driver *driver; + lockdep_assert_held_write(&group->group_rwsem); + down_write(&container->group_lock); driver = container->iommu_driver; @@ -988,10 +934,11 @@ static void __vfio_group_unset_container(struct vfio_group *group) driver->ops->detach_group(container->iommu_data, group->iommu_group); - iommu_group_release_dma_owner(group->iommu_group); + if (group->type == VFIO_IOMMU) + iommu_group_release_dma_owner(group->iommu_group); group->container = NULL; - wake_up(&group->container_q); + group->container_users = 0; list_del(&group->container_next); /* Detaching the last group deprivileges a container, remove iommu */ @@ -1015,30 +962,16 @@ static void __vfio_group_unset_container(struct vfio_group *group) */ static int vfio_group_unset_container(struct vfio_group *group) { - int users = atomic_cmpxchg(&group->container_users, 1, 0); + lockdep_assert_held_write(&group->group_rwsem); - if (!users) + if (!group->container) return -EINVAL; - if (users != 1) + if (group->container_users != 1) return -EBUSY; - __vfio_group_unset_container(group); - return 0; } -/* - * When removing container users, anything that removes the last user - * implicitly removes the group from the container. 
That is, if the - * group file descriptor is closed, as well as any device file descriptors, - * the group is free. - */ -static void vfio_group_try_dissolve_container(struct vfio_group *group) -{ - if (0 == atomic_dec_if_positive(&group->container_users)) - __vfio_group_unset_container(group); -} - static int vfio_group_set_container(struct vfio_group *group, int container_fd) { struct fd f; @@ -1046,7 +979,9 @@ static int vfio_group_set_container(struct vfio_group *group, int container_fd) struct vfio_iommu_driver *driver; int ret = 0; - if (atomic_read(&group->container_users)) + lockdep_assert_held_write(&group->group_rwsem); + + if (group->container || WARN_ON(group->container_users)) return -EINVAL; if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO)) @@ -1074,9 +1009,11 @@ static int vfio_group_set_container(struct vfio_group *group, int container_fd) goto unlock_out; } - ret = iommu_group_claim_dma_owner(group->iommu_group, f.file); - if (ret) - goto unlock_out; + if (group->type == VFIO_IOMMU) { + ret = iommu_group_claim_dma_owner(group->iommu_group, f.file); + if (ret) + goto unlock_out; + } driver = container->iommu_driver; if (driver) { @@ -1084,18 +1021,20 @@ static int vfio_group_set_container(struct vfio_group *group, int container_fd) group->iommu_group, group->type); if (ret) { - iommu_group_release_dma_owner(group->iommu_group); + if (group->type == VFIO_IOMMU) + iommu_group_release_dma_owner( + group->iommu_group); goto unlock_out; } } group->container = container; + group->container_users = 1; container->noiommu = (group->type == VFIO_NO_IOMMU); list_add(&group->container_next, &container->group_list); /* Get a reference on the container and mark a user within the group */ vfio_container_get(container); - atomic_inc(&group->container_users); unlock_out: up_write(&container->group_lock); @@ -1103,54 +1042,74 @@ unlock_out: return ret; } -static int vfio_group_add_container_user(struct vfio_group *group) -{ - if (!atomic_inc_not_zero(&group->container_users)) - return -EINVAL; - - if (group->type == VFIO_NO_IOMMU) { - atomic_dec(&group->container_users); - return -EPERM; - } - if (!group->container->iommu_driver) { - atomic_dec(&group->container_users); - return -EINVAL; - } - - return 0; -} - static const struct file_operations vfio_device_fops; -static int vfio_group_get_device_fd(struct vfio_group *group, char *buf) +/* true if the vfio_device has open_device() called but not close_device() */ +static bool vfio_assert_device_open(struct vfio_device *device) { - struct vfio_device *device; - struct file *filep; - int fdno; - int ret = 0; + return !WARN_ON_ONCE(!READ_ONCE(device->open_count)); +} - if (0 == atomic_read(&group->container_users) || - !group->container->iommu_driver) +static int vfio_device_assign_container(struct vfio_device *device) +{ + struct vfio_group *group = device->group; + + lockdep_assert_held_write(&group->group_rwsem); + + if (!group->container || !group->container->iommu_driver || + WARN_ON(!group->container_users)) return -EINVAL; if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO)) return -EPERM; - device = vfio_device_get_from_name(group, buf); - if (IS_ERR(device)) - return PTR_ERR(device); + get_file(group->opened_file); + group->container_users++; + return 0; +} + +static void vfio_device_unassign_container(struct vfio_device *device) +{ + down_write(&device->group->group_rwsem); + WARN_ON(device->group->container_users <= 1); + device->group->container_users--; + fput(device->group->opened_file); + 
up_write(&device->group->group_rwsem); +} + +static struct file *vfio_device_open(struct vfio_device *device) +{ + struct file *filep; + int ret; + + down_write(&device->group->group_rwsem); + ret = vfio_device_assign_container(device); + up_write(&device->group->group_rwsem); + if (ret) + return ERR_PTR(ret); if (!try_module_get(device->dev->driver->owner)) { ret = -ENODEV; - goto err_device_put; + goto err_unassign_container; } mutex_lock(&device->dev_set->lock); device->open_count++; - if (device->open_count == 1 && device->ops->open_device) { - ret = device->ops->open_device(device); - if (ret) - goto err_undo_count; + if (device->open_count == 1) { + /* + * Here we pass the KVM pointer with the group under the read + * lock. If the device driver will use it, it must obtain a + * reference and release it during close_device. + */ + down_read(&device->group->group_rwsem); + device->kvm = device->group->kvm; + + if (device->ops->open_device) { + ret = device->ops->open_device(device); + if (ret) + goto err_undo_count; + } + up_read(&device->group->group_rwsem); } mutex_unlock(&device->dev_set->lock); @@ -1158,15 +1117,11 @@ static int vfio_group_get_device_fd(struct vfio_group *group, char *buf) * We can't use anon_inode_getfd() because we need to modify * the f_mode flags directly to allow more than just ioctls */ - fdno = ret = get_unused_fd_flags(O_CLOEXEC); - if (ret < 0) - goto err_close_device; - filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops, device, O_RDWR); if (IS_ERR(filep)) { ret = PTR_ERR(filep); - goto err_fd; + goto err_close_device; } /* @@ -1176,26 +1131,61 @@ static int vfio_group_get_device_fd(struct vfio_group *group, char *buf) */ filep->f_mode |= (FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE); - atomic_inc(&group->container_users); - - fd_install(fdno, filep); - - if (group->type == VFIO_NO_IOMMU) + if (device->group->type == VFIO_NO_IOMMU) dev_warn(device->dev, "vfio-noiommu device opened by user " "(%s:%d)\n", current->comm, task_pid_nr(current)); - return fdno; + /* + * On success the ref of device is moved to the file and + * put in vfio_device_fops_release() + */ + return filep; -err_fd: - put_unused_fd(fdno); err_close_device: mutex_lock(&device->dev_set->lock); + down_read(&device->group->group_rwsem); if (device->open_count == 1 && device->ops->close_device) device->ops->close_device(device); err_undo_count: device->open_count--; + if (device->open_count == 0 && device->kvm) + device->kvm = NULL; + up_read(&device->group->group_rwsem); mutex_unlock(&device->dev_set->lock); module_put(device->dev->driver->owner); -err_device_put: +err_unassign_container: + vfio_device_unassign_container(device); + return ERR_PTR(ret); +} + +static int vfio_group_get_device_fd(struct vfio_group *group, char *buf) +{ + struct vfio_device *device; + struct file *filep; + int fdno; + int ret; + + device = vfio_device_get_from_name(group, buf); + if (IS_ERR(device)) + return PTR_ERR(device); + + fdno = get_unused_fd_flags(O_CLOEXEC); + if (fdno < 0) { + ret = fdno; + goto err_put_device; + } + + filep = vfio_device_open(device); + if (IS_ERR(filep)) { + ret = PTR_ERR(filep); + goto err_put_fdno; + } + + fd_install(fdno, filep); + return fdno; + +err_put_fdno: + put_unused_fd(fdno); +err_put_device: vfio_device_put(device); return ret; } @@ -1222,11 +1212,13 @@ static long vfio_group_fops_unl_ioctl(struct file *filep, status.flags = 0; + down_read(&group->group_rwsem); if (group->container) status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET | VFIO_GROUP_FLAGS_VIABLE; else if 
(!iommu_group_dma_owner_claimed(group->iommu_group)) status.flags |= VFIO_GROUP_FLAGS_VIABLE; + up_read(&group->group_rwsem); if (copy_to_user((void __user *)arg, &status, minsz)) return -EFAULT; @@ -1244,11 +1236,15 @@ static long vfio_group_fops_unl_ioctl(struct file *filep, if (fd < 0) return -EINVAL; + down_write(&group->group_rwsem); ret = vfio_group_set_container(group, fd); + up_write(&group->group_rwsem); break; } case VFIO_GROUP_UNSET_CONTAINER: + down_write(&group->group_rwsem); ret = vfio_group_unset_container(group); + up_write(&group->group_rwsem); break; case VFIO_GROUP_GET_DEVICE_FD: { @@ -1271,38 +1267,38 @@ static int vfio_group_fops_open(struct inode *inode, struct file *filep) { struct vfio_group *group = container_of(inode->i_cdev, struct vfio_group, cdev); - int opened; + int ret; + + down_write(&group->group_rwsem); /* users can be zero if this races with vfio_group_put() */ - if (!refcount_inc_not_zero(&group->users)) - return -ENODEV; + if (!refcount_inc_not_zero(&group->users)) { + ret = -ENODEV; + goto err_unlock; + } if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO)) { - vfio_group_put(group); - return -EPERM; + ret = -EPERM; + goto err_put; } - /* Do we need multiple instances of the group open? Seems not. */ - opened = atomic_cmpxchg(&group->opened, 0, 1); - if (opened) { - vfio_group_put(group); - return -EBUSY; + /* + * Do we need multiple instances of the group open? Seems not. + */ + if (group->opened_file) { + ret = -EBUSY; + goto err_put; } - - /* Is something still in use from a previous open? */ - if (group->container) { - atomic_dec(&group->opened); - vfio_group_put(group); - return -EBUSY; - } - - /* Warn if previous user didn't cleanup and re-init to drop them */ - if (WARN_ON(group->notifier.head)) - BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier); - + group->opened_file = filep; filep->private_data = group; + up_write(&group->group_rwsem); return 0; +err_put: + vfio_group_put(group); +err_unlock: + up_write(&group->group_rwsem); + return ret; } static int vfio_group_fops_release(struct inode *inode, struct file *filep) @@ -1311,9 +1307,18 @@ static int vfio_group_fops_release(struct inode *inode, struct file *filep) filep->private_data = NULL; - vfio_group_try_dissolve_container(group); - - atomic_dec(&group->opened); + down_write(&group->group_rwsem); + /* + * Device FDs hold a group file reference, therefore the group release + * is only called when there are no open devices. 
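
For orientation, the group-FD lifecycle that group_rwsem and opened_file now serialize is the classic userspace bring-up sequence (sketch only; FDs are leaked on the error paths for brevity):

#include <fcntl.h>
#include <linux/vfio.h>
#include <sys/ioctl.h>

static int open_device(const char *group_path, const char *bdf)
{
	int container = open("/dev/vfio/vfio", O_RDWR);
	int group = open(group_path, O_RDWR);	/* e.g. "/dev/vfio/26" */
	struct vfio_group_status status = { .argsz = sizeof(status) };

	if (container < 0 || group < 0)
		return -1;
	if (ioctl(group, VFIO_GROUP_GET_STATUS, &status) ||
	    !(status.flags & VFIO_GROUP_FLAGS_VIABLE))
		return -1;	/* some device in the group is still bound */
	if (ioctl(group, VFIO_GROUP_SET_CONTAINER, &container))
		return -1;
	if (ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1v2_IOMMU))
		return -1;
	/* The device FD holds a group file reference from here on */
	return ioctl(group, VFIO_GROUP_GET_DEVICE_FD, bdf);
}
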
+ */ + WARN_ON(group->notifier.head); + if (group->container) { + WARN_ON(group->container_users != 1); + __vfio_group_unset_container(group); + } + group->opened_file = NULL; + up_write(&group->group_rwsem); vfio_group_put(group); @@ -1336,13 +1341,19 @@ static int vfio_device_fops_release(struct inode *inode, struct file *filep) struct vfio_device *device = filep->private_data; mutex_lock(&device->dev_set->lock); - if (!--device->open_count && device->ops->close_device) + vfio_assert_device_open(device); + down_read(&device->group->group_rwsem); + if (device->open_count == 1 && device->ops->close_device) device->ops->close_device(device); + up_read(&device->group->group_rwsem); + device->open_count--; + if (device->open_count == 0) + device->kvm = NULL; mutex_unlock(&device->dev_set->lock); module_put(device->dev->driver->owner); - vfio_group_try_dissolve_container(device->group); + vfio_device_unassign_container(device); vfio_device_put(device); @@ -1691,119 +1702,94 @@ static const struct file_operations vfio_device_fops = { .mmap = vfio_device_fops_mmap, }; -/* - * External user API, exported by symbols to be linked dynamically. +/** + * vfio_file_iommu_group - Return the struct iommu_group for the vfio group file + * @file: VFIO group file * - * The protocol includes: - * 1. do normal VFIO init operation: - * - opening a new container; - * - attaching group(s) to it; - * - setting an IOMMU driver for a container. - * When IOMMU is set for a container, all groups in it are - * considered ready to use by an external user. - * - * 2. User space passes a group fd to an external user. - * The external user calls vfio_group_get_external_user() - * to verify that: - * - the group is initialized; - * - IOMMU is set for it. - * If both checks passed, vfio_group_get_external_user() - * increments the container user counter to prevent - * the VFIO group from disposal before KVM exits. - * - * 3. The external user calls vfio_external_user_iommu_id() - * to know an IOMMU ID. - * - * 4. When the external KVM finishes, it calls - * vfio_group_put_external_user() to release the VFIO group. - * This call decrements the container user counter. + * The returned iommu_group is valid as long as a ref is held on the file. */ -struct vfio_group *vfio_group_get_external_user(struct file *filep) +struct iommu_group *vfio_file_iommu_group(struct file *file) { - struct vfio_group *group = filep->private_data; - int ret; + struct vfio_group *group = file->private_data; - if (filep->f_op != &vfio_group_fops) - return ERR_PTR(-EINVAL); - - ret = vfio_group_add_container_user(group); - if (ret) - return ERR_PTR(ret); - - /* - * Since the caller holds the fget on the file group->users must be >= 1 - */ - vfio_group_get(group); - - return group; + if (file->f_op != &vfio_group_fops) + return NULL; + return group->iommu_group; } -EXPORT_SYMBOL_GPL(vfio_group_get_external_user); +EXPORT_SYMBOL_GPL(vfio_file_iommu_group); -/* - * External user API, exported by symbols to be linked dynamically. - * The external user passes in a device pointer - * to verify that: - * - A VFIO group is assiciated with the device; - * - IOMMU is set for the group. - * If both checks passed, vfio_group_get_external_user_from_dev() - * increments the container user counter to prevent the VFIO group - * from disposal before external user exits and returns the pointer - * to the VFIO group. 
+/** + * vfio_file_enforced_coherent - True if the DMA associated with the VFIO file + * is always CPU cache coherent + * @file: VFIO group file * - * When the external user finishes using the VFIO group, it calls - * vfio_group_put_external_user() to release the VFIO group and - * decrement the container user counter. - * - * @dev [in] : device - * Return error PTR or pointer to VFIO group. + * Enforced coherency means that the IOMMU ignores things like the PCIe no-snoop + * bit in DMA transactions. A return of false indicates that the user has + * rights to access additional instructions such as wbinvd on x86. */ - -struct vfio_group *vfio_group_get_external_user_from_dev(struct device *dev) +bool vfio_file_enforced_coherent(struct file *file) { - struct vfio_group *group; - int ret; + struct vfio_group *group = file->private_data; + bool ret; - group = vfio_group_get_from_dev(dev); - if (!group) - return ERR_PTR(-ENODEV); + if (file->f_op != &vfio_group_fops) + return true; - ret = vfio_group_add_container_user(group); - if (ret) { - vfio_group_put(group); - return ERR_PTR(ret); + down_read(&group->group_rwsem); + if (group->container) { + ret = vfio_ioctl_check_extension(group->container, + VFIO_DMA_CC_IOMMU); + } else { + /* + * Since the coherency state is determined only once a container + * is attached the user must do so before they can prove they + * have permission. + */ + ret = true; } - - return group; + up_read(&group->group_rwsem); + return ret; } -EXPORT_SYMBOL_GPL(vfio_group_get_external_user_from_dev); +EXPORT_SYMBOL_GPL(vfio_file_enforced_coherent); -void vfio_group_put_external_user(struct vfio_group *group) +/** + * vfio_file_set_kvm - Link a kvm with VFIO drivers + * @file: VFIO group file + * @kvm: KVM to link + * + * When a VFIO device is first opened the KVM will be available in + * device->kvm if one was associated with the group. + */ +void vfio_file_set_kvm(struct file *file, struct kvm *kvm) { - vfio_group_try_dissolve_container(group); - vfio_group_put(group); -} -EXPORT_SYMBOL_GPL(vfio_group_put_external_user); + struct vfio_group *group = file->private_data; -bool vfio_external_group_match_file(struct vfio_group *test_group, - struct file *filep) + if (file->f_op != &vfio_group_fops) + return; + + down_write(&group->group_rwsem); + group->kvm = kvm; + up_write(&group->group_rwsem); +} +EXPORT_SYMBOL_GPL(vfio_file_set_kvm); + +/** + * vfio_file_has_dev - True if the VFIO file is a handle for device + * @file: VFIO file to check + * @device: Device that must be part of the file + * + * Returns true if given file has permission to manipulate the given device. 
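
An in-kernel consumer such as the kvm-vfio device is the intended user of these file-based helpers. A sketch of that usage (hypothetical demo structure; the caller is assumed to hold a reference on filp):

#include <linux/file.h>
#include <linux/vfio.h>

struct kvm;

struct demo_viommu {
	struct kvm *kvm;
	bool noncoherent_dma;	/* guest WBINVD must be honored if set */
};

static int demo_attach_vfio_group(struct demo_viommu *v, struct file *filp)
{
	/* Reject anything that is not a VFIO group file */
	if (!vfio_file_iommu_group(filp))
		return -EINVAL;

	/* Devices opened through this group will see v->kvm */
	vfio_file_set_kvm(filp, v->kvm);

	/* No-snoop DMA may bypass the CPU caches on this container */
	if (!vfio_file_enforced_coherent(filp))
		v->noncoherent_dma = true;
	return 0;
}
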
+ */ +bool vfio_file_has_dev(struct file *file, struct vfio_device *device) { - struct vfio_group *group = filep->private_data; + struct vfio_group *group = file->private_data; - return (filep->f_op == &vfio_group_fops) && (group == test_group); -} -EXPORT_SYMBOL_GPL(vfio_external_group_match_file); + if (file->f_op != &vfio_group_fops) + return false; -int vfio_external_user_iommu_id(struct vfio_group *group) -{ - return iommu_group_id(group->iommu_group); + return group == device->group; } -EXPORT_SYMBOL_GPL(vfio_external_user_iommu_id); - -long vfio_external_check_extension(struct vfio_group *group, unsigned long arg) -{ - return vfio_ioctl_check_extension(group->container, arg); -} -EXPORT_SYMBOL_GPL(vfio_external_check_extension); +EXPORT_SYMBOL_GPL(vfio_file_has_dev); /* * Sub-module support @@ -1926,7 +1912,7 @@ EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare); /* * Pin a set of guest PFNs and return their associated host PFNs for local * domain only. - * @dev [in] : device + * @device [in] : device * @user_pfn [in]: array of user/guest PFNs to be pinned. * @npage [in] : count of elements in user_pfn array. This count should not * be greater VFIO_PIN_PAGES_MAX_ENTRIES. @@ -1934,33 +1920,25 @@ EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare); * @phys_pfn[out]: array of host PFNs * Return error or number of pages pinned. */ -int vfio_pin_pages(struct device *dev, unsigned long *user_pfn, int npage, - int prot, unsigned long *phys_pfn) +int vfio_pin_pages(struct vfio_device *device, unsigned long *user_pfn, + int npage, int prot, unsigned long *phys_pfn) { struct vfio_container *container; - struct vfio_group *group; + struct vfio_group *group = device->group; struct vfio_iommu_driver *driver; int ret; - if (!dev || !user_pfn || !phys_pfn || !npage) + if (!user_pfn || !phys_pfn || !npage || + !vfio_assert_device_open(device)) return -EINVAL; if (npage > VFIO_PIN_PAGES_MAX_ENTRIES) return -E2BIG; - group = vfio_group_get_from_dev(dev); - if (!group) - return -ENODEV; - - if (group->dev_counter > 1) { - ret = -EINVAL; - goto err_pin_pages; - } - - ret = vfio_group_add_container_user(group); - if (ret) - goto err_pin_pages; + if (group->dev_counter > 1) + return -EINVAL; + /* group->container cannot change while a vfio device is open */ container = group->container; driver = container->iommu_driver; if (likely(driver && driver->ops->pin_pages)) @@ -1970,45 +1948,34 @@ int vfio_pin_pages(struct device *dev, unsigned long *user_pfn, int npage, else ret = -ENOTTY; - vfio_group_try_dissolve_container(group); - -err_pin_pages: - vfio_group_put(group); return ret; } EXPORT_SYMBOL(vfio_pin_pages); /* * Unpin set of host PFNs for local domain only. - * @dev [in] : device + * @device [in] : device * @user_pfn [in]: array of user/guest PFNs to be unpinned. Number of user/guest * PFNs should not be greater than VFIO_PIN_PAGES_MAX_ENTRIES. * @npage [in] : count of elements in user_pfn array. This count should not * be greater than VFIO_PIN_PAGES_MAX_ENTRIES. * Return error or number of pages unpinned. 
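
With the device-based API above, an mdev-style driver pins guest pages only inside its open_device()/close_device() window, which is exactly what the vfio_assert_device_open() checks enforce. A hypothetical sketch:

#include <linux/iommu.h>
#include <linux/vfio.h>

/* Pin one guest page for DMA and release it again. Must run between
 * open_device() and close_device().
 */
static int demo_pin_one(struct vfio_device *vdev, unsigned long gfn,
			unsigned long *hpfn)
{
	int ret = vfio_pin_pages(vdev, &gfn, 1, IOMMU_READ | IOMMU_WRITE,
				 hpfn);

	if (ret != 1)
		return ret < 0 ? ret : -EFAULT;

	/* ... program the device to DMA to/from *hpfn ... */

	ret = vfio_unpin_pages(vdev, &gfn, 1);
	return ret == 1 ? 0 : ret;
}
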
*/ -int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn, int npage) +int vfio_unpin_pages(struct vfio_device *device, unsigned long *user_pfn, + int npage) { struct vfio_container *container; - struct vfio_group *group; struct vfio_iommu_driver *driver; int ret; - if (!dev || !user_pfn || !npage) + if (!user_pfn || !npage || !vfio_assert_device_open(device)) return -EINVAL; if (npage > VFIO_PIN_PAGES_MAX_ENTRIES) return -E2BIG; - group = vfio_group_get_from_dev(dev); - if (!group) - return -ENODEV; - - ret = vfio_group_add_container_user(group); - if (ret) - goto err_unpin_pages; - - container = group->container; + /* group->container cannot change while a vfio device is open */ + container = device->group->container; driver = container->iommu_driver; if (likely(driver && driver->ops->unpin_pages)) ret = driver->ops->unpin_pages(container->iommu_data, user_pfn, @@ -2016,109 +1983,10 @@ int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn, int npage) else ret = -ENOTTY; - vfio_group_try_dissolve_container(group); - -err_unpin_pages: - vfio_group_put(group); return ret; } EXPORT_SYMBOL(vfio_unpin_pages); -/* - * Pin a set of guest IOVA PFNs and return their associated host PFNs for a - * VFIO group. - * - * The caller needs to call vfio_group_get_external_user() or - * vfio_group_get_external_user_from_dev() prior to calling this interface, - * so as to prevent the VFIO group from disposal in the middle of the call. - * But it can keep the reference to the VFIO group for several calls into - * this interface. - * After finishing using of the VFIO group, the caller needs to release the - * VFIO group by calling vfio_group_put_external_user(). - * - * @group [in] : VFIO group - * @user_iova_pfn [in] : array of user/guest IOVA PFNs to be pinned. - * @npage [in] : count of elements in user_iova_pfn array. - * This count should not be greater - * VFIO_PIN_PAGES_MAX_ENTRIES. - * @prot [in] : protection flags - * @phys_pfn [out] : array of host PFNs - * Return error or number of pages pinned. - */ -int vfio_group_pin_pages(struct vfio_group *group, - unsigned long *user_iova_pfn, int npage, - int prot, unsigned long *phys_pfn) -{ - struct vfio_container *container; - struct vfio_iommu_driver *driver; - int ret; - - if (!group || !user_iova_pfn || !phys_pfn || !npage) - return -EINVAL; - - if (group->dev_counter > 1) - return -EINVAL; - - if (npage > VFIO_PIN_PAGES_MAX_ENTRIES) - return -E2BIG; - - container = group->container; - driver = container->iommu_driver; - if (likely(driver && driver->ops->pin_pages)) - ret = driver->ops->pin_pages(container->iommu_data, - group->iommu_group, user_iova_pfn, - npage, prot, phys_pfn); - else - ret = -ENOTTY; - - return ret; -} -EXPORT_SYMBOL(vfio_group_pin_pages); - -/* - * Unpin a set of guest IOVA PFNs for a VFIO group. - * - * The caller needs to call vfio_group_get_external_user() or - * vfio_group_get_external_user_from_dev() prior to calling this interface, - * so as to prevent the VFIO group from disposal in the middle of the call. - * But it can keep the reference to the VFIO group for several calls into - * this interface. - * After finishing using of the VFIO group, the caller needs to release the - * VFIO group by calling vfio_group_put_external_user(). - * - * @group [in] : vfio group - * @user_iova_pfn [in] : array of user/guest IOVA PFNs to be unpinned. - * @npage [in] : count of elements in user_iova_pfn array. - * This count should not be greater than - * VFIO_PIN_PAGES_MAX_ENTRIES. 
- * Return error or number of pages unpinned. - */ -int vfio_group_unpin_pages(struct vfio_group *group, - unsigned long *user_iova_pfn, int npage) -{ - struct vfio_container *container; - struct vfio_iommu_driver *driver; - int ret; - - if (!group || !user_iova_pfn || !npage) - return -EINVAL; - - if (npage > VFIO_PIN_PAGES_MAX_ENTRIES) - return -E2BIG; - - container = group->container; - driver = container->iommu_driver; - if (likely(driver && driver->ops->unpin_pages)) - ret = driver->ops->unpin_pages(container->iommu_data, - user_iova_pfn, npage); - else - ret = -ENOTTY; - - return ret; -} -EXPORT_SYMBOL(vfio_group_unpin_pages); - - /* * This interface allows the CPUs to perform some sort of virtual DMA on * behalf of the device. @@ -2129,32 +1997,25 @@ EXPORT_SYMBOL(vfio_group_unpin_pages); * As the read/write of user space memory is conducted via the CPUs and is * not a real device DMA, it is not necessary to pin the user space memory. * - * The caller needs to call vfio_group_get_external_user() or - * vfio_group_get_external_user_from_dev() prior to calling this interface, - * so as to prevent the VFIO group from disposal in the middle of the call. - * But it can keep the reference to the VFIO group for several calls into - * this interface. - * After finishing using of the VFIO group, the caller needs to release the - * VFIO group by calling vfio_group_put_external_user(). - * - * @group [in] : VFIO group + * @device [in] : VFIO device * @user_iova [in] : base IOVA of a user space buffer * @data [in] : pointer to kernel buffer * @len [in] : kernel buffer length * @write : indicate read or write * Return error code on failure or 0 on success. */ -int vfio_dma_rw(struct vfio_group *group, dma_addr_t user_iova, - void *data, size_t len, bool write) +int vfio_dma_rw(struct vfio_device *device, dma_addr_t user_iova, void *data, + size_t len, bool write) { struct vfio_container *container; struct vfio_iommu_driver *driver; int ret = 0; - if (!group || !data || len <= 0) + if (!data || len <= 0 || !vfio_assert_device_open(device)) return -EINVAL; - container = group->container; + /* group->container cannot change while a vfio device is open */ + container = device->group->container; driver = container->iommu_driver; if (likely(driver && driver->ops->dma_rw)) @@ -2162,7 +2023,6 @@ int vfio_dma_rw(struct vfio_group *group, dma_addr_t user_iova, user_iova, data, len, write); else ret = -ENOTTY; - return ret; } EXPORT_SYMBOL(vfio_dma_rw); @@ -2175,9 +2035,7 @@ static int vfio_register_iommu_notifier(struct vfio_group *group, struct vfio_iommu_driver *driver; int ret; - ret = vfio_group_add_container_user(group); - if (ret) - return -EINVAL; + lockdep_assert_held_read(&group->group_rwsem); container = group->container; driver = container->iommu_driver; @@ -2187,8 +2045,6 @@ static int vfio_register_iommu_notifier(struct vfio_group *group, else ret = -ENOTTY; - vfio_group_try_dissolve_container(group); - return ret; } @@ -2199,9 +2055,7 @@ static int vfio_unregister_iommu_notifier(struct vfio_group *group, struct vfio_iommu_driver *driver; int ret; - ret = vfio_group_add_container_user(group); - if (ret) - return -EINVAL; + lockdep_assert_held_read(&group->group_rwsem); container = group->container; driver = container->iommu_driver; @@ -2211,147 +2065,52 @@ static int vfio_unregister_iommu_notifier(struct vfio_group *group, else ret = -ENOTTY; - vfio_group_try_dissolve_container(group); - return ret; } -void vfio_group_set_kvm(struct vfio_group *group, struct kvm *kvm) -{ - group->kvm = kvm; - 
blocking_notifier_call_chain(&group->notifier, - VFIO_GROUP_NOTIFY_SET_KVM, kvm); -} -EXPORT_SYMBOL_GPL(vfio_group_set_kvm); - -static int vfio_register_group_notifier(struct vfio_group *group, - unsigned long *events, - struct notifier_block *nb) -{ - int ret; - bool set_kvm = false; - - if (*events & VFIO_GROUP_NOTIFY_SET_KVM) - set_kvm = true; - - /* clear known events */ - *events &= ~VFIO_GROUP_NOTIFY_SET_KVM; - - /* refuse to continue if still events remaining */ - if (*events) - return -EINVAL; - - ret = vfio_group_add_container_user(group); - if (ret) - return -EINVAL; - - ret = blocking_notifier_chain_register(&group->notifier, nb); - - /* - * The attaching of kvm and vfio_group might already happen, so - * here we replay once upon registration. - */ - if (!ret && set_kvm && group->kvm) - blocking_notifier_call_chain(&group->notifier, - VFIO_GROUP_NOTIFY_SET_KVM, group->kvm); - - vfio_group_try_dissolve_container(group); - - return ret; -} - -static int vfio_unregister_group_notifier(struct vfio_group *group, - struct notifier_block *nb) +int vfio_register_notifier(struct vfio_device *device, + enum vfio_notify_type type, unsigned long *events, + struct notifier_block *nb) { + struct vfio_group *group = device->group; int ret; - ret = vfio_group_add_container_user(group); - if (ret) + if (!nb || !events || (*events == 0) || + !vfio_assert_device_open(device)) return -EINVAL; - ret = blocking_notifier_chain_unregister(&group->notifier, nb); - - vfio_group_try_dissolve_container(group); - - return ret; -} - -int vfio_register_notifier(struct device *dev, enum vfio_notify_type type, - unsigned long *events, struct notifier_block *nb) -{ - struct vfio_group *group; - int ret; - - if (!dev || !nb || !events || (*events == 0)) - return -EINVAL; - - group = vfio_group_get_from_dev(dev); - if (!group) - return -ENODEV; - switch (type) { case VFIO_IOMMU_NOTIFY: ret = vfio_register_iommu_notifier(group, events, nb); break; - case VFIO_GROUP_NOTIFY: - ret = vfio_register_group_notifier(group, events, nb); - break; default: ret = -EINVAL; } - - vfio_group_put(group); return ret; } EXPORT_SYMBOL(vfio_register_notifier); -int vfio_unregister_notifier(struct device *dev, enum vfio_notify_type type, +int vfio_unregister_notifier(struct vfio_device *device, + enum vfio_notify_type type, struct notifier_block *nb) { - struct vfio_group *group; + struct vfio_group *group = device->group; int ret; - if (!dev || !nb) + if (!nb || !vfio_assert_device_open(device)) return -EINVAL; - group = vfio_group_get_from_dev(dev); - if (!group) - return -ENODEV; - switch (type) { case VFIO_IOMMU_NOTIFY: ret = vfio_unregister_iommu_notifier(group, nb); break; - case VFIO_GROUP_NOTIFY: - ret = vfio_unregister_group_notifier(group, nb); - break; default: ret = -EINVAL; } - - vfio_group_put(group); return ret; } EXPORT_SYMBOL(vfio_unregister_notifier); -struct iommu_domain *vfio_group_iommu_domain(struct vfio_group *group) -{ - struct vfio_container *container; - struct vfio_iommu_driver *driver; - - if (!group) - return ERR_PTR(-EINVAL); - - container = group->container; - driver = container->iommu_driver; - if (likely(driver && driver->ops->group_iommu_domain)) - return driver->ops->group_iommu_domain(container->iommu_data, - group->iommu_group); - - return ERR_PTR(-ENOTTY); -} -EXPORT_SYMBOL_GPL(vfio_group_iommu_domain); - /* * Module/class support */ diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig index c4e82a8d863f..001ae1be9b61 100644 --- a/drivers/watchdog/Kconfig +++ b/drivers/watchdog/Kconfig 
@@ -883,6 +883,14 @@ config RENESAS_RZAWDT This driver adds watchdog support for the integrated watchdogs in the Renesas RZ/A SoCs. These watchdogs can be used to reset a system. +config RENESAS_RZN1WDT + tristate "Renesas RZ/N1 watchdog" + depends on ARCH_RENESAS || COMPILE_TEST + select WATCHDOG_CORE + help + This driver adds watchdog support for the integrated watchdogs in the + Renesas RZ/N1 SoCs. These watchdogs can be used to reset a system. + config RENESAS_RZG2LWDT tristate "Renesas RZ/G2L WDT Watchdog" depends on ARCH_RENESAS || COMPILE_TEST @@ -1011,6 +1019,17 @@ config APPLE_WATCHDOG To compile this driver as a module, choose M here: the module will be called apple_wdt. +config SUNPLUS_WATCHDOG + tristate "Sunplus watchdog support" + depends on ARCH_SUNPLUS || COMPILE_TEST + select WATCHDOG_CORE + help + Say Y here to include support for the watchdog timer + in Sunplus SoCs. + + To compile this driver as a module, choose M here: the + module will be called sunplus_wdt. + # X86 (i386 + ia64 + x86_64) Architecture config ACQUIRE_WDT diff --git a/drivers/watchdog/Makefile b/drivers/watchdog/Makefile index f7da867e8782..5f88a6237f7c 100644 --- a/drivers/watchdog/Makefile +++ b/drivers/watchdog/Makefile @@ -84,6 +84,7 @@ obj-$(CONFIG_LPC18XX_WATCHDOG) += lpc18xx_wdt.o obj-$(CONFIG_BCM7038_WDT) += bcm7038_wdt.o obj-$(CONFIG_RENESAS_WDT) += renesas_wdt.o obj-$(CONFIG_RENESAS_RZAWDT) += rza_wdt.o +obj-$(CONFIG_RENESAS_RZN1WDT) += rzn1_wdt.o obj-$(CONFIG_RENESAS_RZG2LWDT) += rzg2l_wdt.o obj-$(CONFIG_ASPEED_WATCHDOG) += aspeed_wdt.o obj-$(CONFIG_STM32_WATCHDOG) += stm32_iwdg.o @@ -95,6 +96,7 @@ obj-$(CONFIG_ARM_SMC_WATCHDOG) += arm_smc_wdt.o obj-$(CONFIG_VISCONTI_WATCHDOG) += visconti_wdt.o obj-$(CONFIG_MSC313E_WATCHDOG) += msc313e_wdt.o obj-$(CONFIG_APPLE_WATCHDOG) += apple_wdt.o +obj-$(CONFIG_SUNPLUS_WATCHDOG) += sunplus_wdt.o # X86 (i386 + ia64 + x86_64) Architecture obj-$(CONFIG_ACQUIRE_WDT) += acquirewdt.o diff --git a/drivers/watchdog/bcm7038_wdt.c b/drivers/watchdog/bcm7038_wdt.c index 8656a137e9a4..1ffcf6aca6ae 100644 --- a/drivers/watchdog/bcm7038_wdt.c +++ b/drivers/watchdog/bcm7038_wdt.c @@ -218,6 +218,7 @@ static SIMPLE_DEV_PM_OPS(bcm7038_wdt_pm_ops, bcm7038_wdt_suspend, bcm7038_wdt_resume); static const struct of_device_id bcm7038_wdt_match[] = { + { .compatible = "brcm,bcm6345-wdt" }, { .compatible = "brcm,bcm7038-wdt" }, {}, }; diff --git a/drivers/watchdog/da9063_wdt.c b/drivers/watchdog/da9063_wdt.c index 9adad1862bbd..09a4af4c58fc 100644 --- a/drivers/watchdog/da9063_wdt.c +++ b/drivers/watchdog/da9063_wdt.c @@ -18,6 +18,7 @@ #include #include #include +#include #include /* @@ -26,6 +27,8 @@ * others: timeout = 2048 ms * 2^(TWDSCALE-1). 
*/ static const unsigned int wdt_timeout[] = { 0, 2, 4, 8, 16, 32, 65, 131 }; +static bool use_sw_pm; + #define DA9063_TWDSCALE_DISABLE 0 #define DA9063_TWDSCALE_MIN 1 #define DA9063_TWDSCALE_MAX (ARRAY_SIZE(wdt_timeout) - 1) @@ -218,6 +221,8 @@ static int da9063_wdt_probe(struct platform_device *pdev) if (!wdd) return -ENOMEM; + use_sw_pm = device_property_present(dev, "dlg,use-sw-pm"); + wdd->info = &da9063_watchdog_info; wdd->ops = &da9063_watchdog_ops; wdd->min_timeout = DA9063_WDT_MIN_TIMEOUT; @@ -228,6 +233,7 @@ static int da9063_wdt_probe(struct platform_device *pdev) watchdog_set_restart_priority(wdd, 128); watchdog_set_drvdata(wdd, da9063); + dev_set_drvdata(dev, wdd); wdd->timeout = DA9063_WDG_TIMEOUT; @@ -249,10 +255,40 @@ static int da9063_wdt_probe(struct platform_device *pdev) return devm_watchdog_register_device(dev, wdd); } +static int __maybe_unused da9063_wdt_suspend(struct device *dev) +{ + struct watchdog_device *wdd = dev_get_drvdata(dev); + + if (!use_sw_pm) + return 0; + + if (watchdog_active(wdd)) + return da9063_wdt_stop(wdd); + + return 0; +} + +static int __maybe_unused da9063_wdt_resume(struct device *dev) +{ + struct watchdog_device *wdd = dev_get_drvdata(dev); + + if (!use_sw_pm) + return 0; + + if (watchdog_active(wdd)) + return da9063_wdt_start(wdd); + + return 0; +} + +static SIMPLE_DEV_PM_OPS(da9063_wdt_pm_ops, + da9063_wdt_suspend, da9063_wdt_resume); + static struct platform_driver da9063_wdt_driver = { .probe = da9063_wdt_probe, .driver = { .name = DA9063_DRVNAME_WATCHDOG, + .pm = &da9063_wdt_pm_ops, }, }; module_platform_driver(da9063_wdt_driver); diff --git a/drivers/watchdog/iTCO_wdt.c b/drivers/watchdog/iTCO_wdt.c index 3f2f4343644f..34693f11385f 100644 --- a/drivers/watchdog/iTCO_wdt.c +++ b/drivers/watchdog/iTCO_wdt.c @@ -596,7 +596,6 @@ static int iTCO_wdt_probe(struct platform_device *pdev) return 0; } -#ifdef CONFIG_PM_SLEEP /* * Suspend-to-idle requires this, because it stops the ticks and timekeeping, so * the watchdog cannot be pinged while in that state. 
In ACPI sleep states the @@ -604,15 +603,15 @@ static int iTCO_wdt_probe(struct platform_device *pdev) */ #ifdef CONFIG_ACPI -static inline bool need_suspend(void) +static inline bool __maybe_unused need_suspend(void) { return acpi_target_system_state() == ACPI_STATE_S0; } #else -static inline bool need_suspend(void) { return true; } +static inline bool __maybe_unused need_suspend(void) { return true; } #endif -static int iTCO_wdt_suspend_noirq(struct device *dev) +static int __maybe_unused iTCO_wdt_suspend_noirq(struct device *dev) { struct iTCO_wdt_private *p = dev_get_drvdata(dev); int ret = 0; @@ -626,7 +625,7 @@ static int iTCO_wdt_suspend_noirq(struct device *dev) return ret; } -static int iTCO_wdt_resume_noirq(struct device *dev) +static int __maybe_unused iTCO_wdt_resume_noirq(struct device *dev) { struct iTCO_wdt_private *p = dev_get_drvdata(dev); @@ -637,20 +636,15 @@ static int iTCO_wdt_resume_noirq(struct device *dev) } static const struct dev_pm_ops iTCO_wdt_pm = { - .suspend_noirq = iTCO_wdt_suspend_noirq, - .resume_noirq = iTCO_wdt_resume_noirq, + SET_NOIRQ_SYSTEM_SLEEP_PM_OPS(iTCO_wdt_suspend_noirq, + iTCO_wdt_resume_noirq) }; -#define ITCO_WDT_PM_OPS (&iTCO_wdt_pm) -#else -#define ITCO_WDT_PM_OPS NULL -#endif /* CONFIG_PM_SLEEP */ - static struct platform_driver iTCO_wdt_driver = { .probe = iTCO_wdt_probe, .driver = { .name = DRV_NAME, - .pm = ITCO_WDT_PM_OPS, + .pm = &iTCO_wdt_pm, }, }; diff --git a/drivers/watchdog/mtk_wdt.c b/drivers/watchdog/mtk_wdt.c index 4577a76dd464..f0d4e3cc7459 100644 --- a/drivers/watchdog/mtk_wdt.c +++ b/drivers/watchdog/mtk_wdt.c @@ -10,7 +10,9 @@ */ #include +#include #include +#include #include #include #include @@ -76,10 +78,18 @@ static const struct mtk_wdt_data mt2712_data = { .toprgu_sw_rst_num = MT2712_TOPRGU_SW_RST_NUM, }; +static const struct mtk_wdt_data mt7986_data = { + .toprgu_sw_rst_num = MT7986_TOPRGU_SW_RST_NUM, +}; + static const struct mtk_wdt_data mt8183_data = { .toprgu_sw_rst_num = MT8183_TOPRGU_SW_RST_NUM, }; +static const struct mtk_wdt_data mt8186_data = { + .toprgu_sw_rst_num = MT8186_TOPRGU_SW_RST_NUM, +}; + static const struct mtk_wdt_data mt8192_data = { .toprgu_sw_rst_num = MT8192_TOPRGU_SW_RST_NUM, }; @@ -418,7 +428,9 @@ static int mtk_wdt_resume(struct device *dev) static const struct of_device_id mtk_wdt_dt_ids[] = { { .compatible = "mediatek,mt2712-wdt", .data = &mt2712_data }, { .compatible = "mediatek,mt6589-wdt" }, + { .compatible = "mediatek,mt7986-wdt", .data = &mt7986_data }, { .compatible = "mediatek,mt8183-wdt", .data = &mt8183_data }, + { .compatible = "mediatek,mt8186-wdt", .data = &mt8186_data }, { .compatible = "mediatek,mt8192-wdt", .data = &mt8192_data }, { .compatible = "mediatek,mt8195-wdt", .data = &mt8195_data }, { /* sentinel */ } diff --git a/drivers/watchdog/rti_wdt.c b/drivers/watchdog/rti_wdt.c index db843f825860..053ef3bde12d 100644 --- a/drivers/watchdog/rti_wdt.c +++ b/drivers/watchdog/rti_wdt.c @@ -226,7 +226,7 @@ static int rti_wdt_probe(struct platform_device *pdev) pm_runtime_enable(dev); ret = pm_runtime_get_sync(dev); - if (ret) { + if (ret < 0) { pm_runtime_put_noidle(dev); pm_runtime_disable(&pdev->dev); return dev_err_probe(dev, ret, "runtime pm failed\n"); @@ -253,6 +253,7 @@ static int rti_wdt_probe(struct platform_device *pdev) } if (readl(wdt->base + RTIDWDCTRL) == WDENABLE_KEY) { + int preset_heartbeat; u32 time_left_ms; u64 heartbeat_ms; u32 wsize; @@ -263,11 +264,12 @@ static int rti_wdt_probe(struct platform_device *pdev) heartbeat_ms <<= WDT_PRELOAD_SHIFT; 
heartbeat_ms *= 1000; do_div(heartbeat_ms, wdt->freq); - if (heartbeat_ms != heartbeat * 1000) + preset_heartbeat = heartbeat_ms + 500; + preset_heartbeat /= 1000; + if (preset_heartbeat != heartbeat) dev_warn(dev, "watchdog already running, ignoring heartbeat config!\n"); - heartbeat = heartbeat_ms; - heartbeat /= 1000; + heartbeat = preset_heartbeat; wsize = readl(wdt->base + RTIWWDSIZECTRL); ret = rti_wdt_setup_hw_hb(wdd, wsize); diff --git a/drivers/watchdog/rzg2l_wdt.c b/drivers/watchdog/rzg2l_wdt.c index 6b426df34fd6..6eea0ee4af49 100644 --- a/drivers/watchdog/rzg2l_wdt.c +++ b/drivers/watchdog/rzg2l_wdt.c @@ -21,8 +21,11 @@ #define WDTSET 0x04 #define WDTTIM 0x08 #define WDTINT 0x0C +#define PECR 0x10 +#define PEEN 0x14 #define WDTCNT_WDTEN BIT(0) #define WDTINT_INTDISP BIT(0) +#define PEEN_FORCE BIT(0) #define WDT_DEFAULT_TIMEOUT 60U @@ -43,6 +46,8 @@ struct rzg2l_wdt_priv { struct reset_control *rstc; unsigned long osc_clk_rate; unsigned long delay; + struct clk *pclk; + struct clk *osc_clk; }; static void rzg2l_wdt_wait_delay(struct rzg2l_wdt_priv *priv) @@ -53,7 +58,7 @@ static void rzg2l_wdt_wait_delay(struct rzg2l_wdt_priv *priv) static u32 rzg2l_wdt_get_cycle_usec(unsigned long cycle, u32 wdttime) { - u64 timer_cycle_us = 1024 * 1024 * (wdttime + 1) * MICRO; + u64 timer_cycle_us = 1024 * 1024ULL * (wdttime + 1) * MICRO; return div64_ul(timer_cycle_us, cycle); } @@ -86,7 +91,6 @@ static int rzg2l_wdt_start(struct watchdog_device *wdev) { struct rzg2l_wdt_priv *priv = watchdog_get_drvdata(wdev); - reset_control_deassert(priv->rstc); pm_runtime_get_sync(wdev->parent); /* Initialize time out */ @@ -106,7 +110,26 @@ static int rzg2l_wdt_stop(struct watchdog_device *wdev) struct rzg2l_wdt_priv *priv = watchdog_get_drvdata(wdev); pm_runtime_put(wdev->parent); - reset_control_assert(priv->rstc); + reset_control_reset(priv->rstc); + + return 0; +} + +static int rzg2l_wdt_set_timeout(struct watchdog_device *wdev, unsigned int timeout) +{ + struct rzg2l_wdt_priv *priv = watchdog_get_drvdata(wdev); + + wdev->timeout = timeout; + + /* + * If the watchdog is active, reset the module for updating the WDTSET + * register so that it is updated with new timeout values. 
+ */ + if (watchdog_active(wdev)) { + pm_runtime_put(wdev->parent); + reset_control_reset(priv->rstc); + rzg2l_wdt_start(wdev); + } return 0; } @@ -116,15 +139,14 @@ static int rzg2l_wdt_restart(struct watchdog_device *wdev, { struct rzg2l_wdt_priv *priv = watchdog_get_drvdata(wdev); - /* Reset the module before we modify any register */ - reset_control_reset(priv->rstc); - pm_runtime_get_sync(wdev->parent); + clk_prepare_enable(priv->pclk); + clk_prepare_enable(priv->osc_clk); - /* smallest counter value to reboot soon */ - rzg2l_wdt_write(priv, WDTSET_COUNTER_VAL(1), WDTSET); + /* Generate Reset (WDTRSTB) Signal on parity error */ + rzg2l_wdt_write(priv, 0, PECR); - /* Enable watchdog timer*/ - rzg2l_wdt_write(priv, WDTCNT_WDTEN, WDTCNT); + /* Force parity error */ + rzg2l_wdt_write(priv, PEEN_FORCE, PEEN); return 0; } @@ -148,15 +170,15 @@ static const struct watchdog_ops rzg2l_wdt_ops = { .start = rzg2l_wdt_start, .stop = rzg2l_wdt_stop, .ping = rzg2l_wdt_ping, + .set_timeout = rzg2l_wdt_set_timeout, .restart = rzg2l_wdt_restart, }; -static void rzg2l_wdt_reset_assert_pm_disable_put(void *data) +static void rzg2l_wdt_reset_assert_pm_disable(void *data) { struct watchdog_device *wdev = data; struct rzg2l_wdt_priv *priv = watchdog_get_drvdata(wdev); - pm_runtime_put(wdev->parent); pm_runtime_disable(wdev->parent); reset_control_assert(priv->rstc); } @@ -166,7 +188,6 @@ static int rzg2l_wdt_probe(struct platform_device *pdev) struct device *dev = &pdev->dev; struct rzg2l_wdt_priv *priv; unsigned long pclk_rate; - struct clk *wdt_clk; int ret; priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL); @@ -178,22 +199,20 @@ static int rzg2l_wdt_probe(struct platform_device *pdev) return PTR_ERR(priv->base); /* Get watchdog main clock */ - wdt_clk = clk_get(&pdev->dev, "oscclk"); - if (IS_ERR(wdt_clk)) - return dev_err_probe(&pdev->dev, PTR_ERR(wdt_clk), "no oscclk"); + priv->osc_clk = devm_clk_get(&pdev->dev, "oscclk"); + if (IS_ERR(priv->osc_clk)) + return dev_err_probe(&pdev->dev, PTR_ERR(priv->osc_clk), "no oscclk"); - priv->osc_clk_rate = clk_get_rate(wdt_clk); - clk_put(wdt_clk); + priv->osc_clk_rate = clk_get_rate(priv->osc_clk); if (!priv->osc_clk_rate) return dev_err_probe(&pdev->dev, -EINVAL, "oscclk rate is 0"); /* Get Peripheral clock */ - wdt_clk = clk_get(&pdev->dev, "pclk"); - if (IS_ERR(wdt_clk)) - return dev_err_probe(&pdev->dev, PTR_ERR(wdt_clk), "no pclk"); + priv->pclk = devm_clk_get(&pdev->dev, "pclk"); + if (IS_ERR(priv->pclk)) + return dev_err_probe(&pdev->dev, PTR_ERR(priv->pclk), "no pclk"); - pclk_rate = clk_get_rate(wdt_clk); - clk_put(wdt_clk); + pclk_rate = clk_get_rate(priv->pclk); if (!pclk_rate) return dev_err_probe(&pdev->dev, -EINVAL, "pclk rate is 0"); @@ -204,13 +223,11 @@ static int rzg2l_wdt_probe(struct platform_device *pdev) return dev_err_probe(&pdev->dev, PTR_ERR(priv->rstc), "failed to get cpg reset"); - reset_control_deassert(priv->rstc); + ret = reset_control_deassert(priv->rstc); + if (ret) + return dev_err_probe(dev, ret, "failed to deassert"); + pm_runtime_enable(&pdev->dev); - ret = pm_runtime_resume_and_get(&pdev->dev); - if (ret < 0) { - dev_err(dev, "pm_runtime_resume_and_get failed ret=%pe", ERR_PTR(ret)); - goto out_pm_get; - } priv->wdev.info = &rzg2l_wdt_ident; priv->wdev.ops = &rzg2l_wdt_ops; @@ -222,7 +239,7 @@ static int rzg2l_wdt_probe(struct platform_device *pdev) watchdog_set_drvdata(&priv->wdev, priv); ret = devm_add_action_or_reset(&pdev->dev, - rzg2l_wdt_reset_assert_pm_disable_put, + rzg2l_wdt_reset_assert_pm_disable, &priv->wdev); 
 	if (ret < 0)
 		return ret;
@@ -235,12 +252,6 @@ static int rzg2l_wdt_probe(struct platform_device *pdev)
 		dev_warn(dev, "Specified timeout invalid, using default");
 
 	return devm_watchdog_register_device(&pdev->dev, &priv->wdev);
-
-out_pm_get:
-	pm_runtime_disable(dev);
-	reset_control_assert(priv->rstc);
-
-	return ret;
 }
 
 static const struct of_device_id rzg2l_wdt_ids[] = {
diff --git a/drivers/watchdog/rzn1_wdt.c b/drivers/watchdog/rzn1_wdt.c
new file mode 100644
index 000000000000..55ab384b9965
--- /dev/null
+++ b/drivers/watchdog/rzn1_wdt.c
@@ -0,0 +1,203 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Renesas RZ/N1 Watchdog timer.
+ * This is a 12-bit timer fed by a (62.5/16384) MHz clock; it cannot even
+ * cope with 2 seconds.
+ *
+ * Copyright 2018 Renesas Electronics Europe Ltd.
+ *
+ * Derived from Ralink RT288x watchdog timer.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#define DEFAULT_TIMEOUT			60
+
+#define RZN1_WDT_RETRIGGER			0x0
+#define RZN1_WDT_RETRIGGER_RELOAD_VAL		0
+#define RZN1_WDT_RETRIGGER_RELOAD_VAL_MASK	0xfff
+#define RZN1_WDT_RETRIGGER_PRESCALE		BIT(12)
+#define RZN1_WDT_RETRIGGER_ENABLE		BIT(13)
+#define RZN1_WDT_RETRIGGER_WDSI			(0x2 << 14)
+
+#define RZN1_WDT_PRESCALER		16384
+#define RZN1_WDT_MAX			4095
+
+struct rzn1_watchdog {
+	struct watchdog_device		wdtdev;
+	void __iomem			*base;
+	unsigned long			clk_rate_khz;
+};
+
+static inline uint32_t max_heart_beat_ms(unsigned long clk_rate_khz)
+{
+	return (RZN1_WDT_MAX * RZN1_WDT_PRESCALER) / clk_rate_khz;
+}
+
+static inline uint32_t compute_reload_value(uint32_t tick_ms,
+					    unsigned long clk_rate_khz)
+{
+	return (tick_ms * clk_rate_khz) / RZN1_WDT_PRESCALER;
+}
+
+static int rzn1_wdt_ping(struct watchdog_device *w)
+{
+	struct rzn1_watchdog *wdt = watchdog_get_drvdata(w);
+
+	/* Any value retriggers the watchdog */
+	writel(0, wdt->base + RZN1_WDT_RETRIGGER);
+
+	return 0;
+}
+
+static int rzn1_wdt_start(struct watchdog_device *w)
+{
+	struct rzn1_watchdog *wdt = watchdog_get_drvdata(w);
+	u32 val;
+
+	/*
+	 * The hardware allows you to write to this reg only once.
+	 * Since this includes the reload value, there is no way to change the
+	 * timeout once started. Also note that the WDT clock is half the bus
+	 * fabric clock rate, so if the bus fabric clock rate is changed after
+	 * the WDT is started, the WDT interval will be wrong.
+	 */
+	val = RZN1_WDT_RETRIGGER_WDSI;
+	val |= RZN1_WDT_RETRIGGER_ENABLE;
+	val |= RZN1_WDT_RETRIGGER_PRESCALE;
+	val |= compute_reload_value(w->max_hw_heartbeat_ms, wdt->clk_rate_khz);
+	writel(val, wdt->base + RZN1_WDT_RETRIGGER);
+
+	return 0;
+}
+
+static irqreturn_t rzn1_wdt_irq(int irq, void *_wdt)
+{
+	pr_crit("RZN1 Watchdog. 
Initiating system reboot\n"); + emergency_restart(); + + return IRQ_HANDLED; +} + +static struct watchdog_info rzn1_wdt_info = { + .identity = "RZ/N1 Watchdog", + .options = WDIOF_MAGICCLOSE | WDIOF_SETTIMEOUT | WDIOF_KEEPALIVEPING, +}; + +static const struct watchdog_ops rzn1_wdt_ops = { + .owner = THIS_MODULE, + .start = rzn1_wdt_start, + .ping = rzn1_wdt_ping, +}; + +static void rzn1_wdt_clk_disable_unprepare(void *data) +{ + clk_disable_unprepare(data); +} + +static int rzn1_wdt_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct rzn1_watchdog *wdt; + struct device_node *np = dev->of_node; + struct clk *clk; + unsigned long clk_rate; + int ret; + int irq; + + wdt = devm_kzalloc(dev, sizeof(*wdt), GFP_KERNEL); + if (!wdt) + return -ENOMEM; + + wdt->base = devm_platform_ioremap_resource(pdev, 0); + if (IS_ERR(wdt->base)) + return PTR_ERR(wdt->base); + + irq = platform_get_irq(pdev, 0); + if (irq < 0) + return irq; + + ret = devm_request_irq(dev, irq, rzn1_wdt_irq, 0, + np->name, wdt); + if (ret) { + dev_err(dev, "failed to request irq %d\n", irq); + return ret; + } + + clk = devm_clk_get(dev, NULL); + if (IS_ERR(clk)) { + dev_err(dev, "failed to get the clock\n"); + return PTR_ERR(clk); + } + + ret = clk_prepare_enable(clk); + if (ret) { + dev_err(dev, "failed to prepare/enable the clock\n"); + return ret; + } + + ret = devm_add_action_or_reset(dev, rzn1_wdt_clk_disable_unprepare, + clk); + if (ret) + return ret; + + clk_rate = clk_get_rate(clk); + if (!clk_rate) { + dev_err(dev, "failed to get the clock rate\n"); + return -EINVAL; + } + + wdt->clk_rate_khz = clk_rate / 1000; + wdt->wdtdev.info = &rzn1_wdt_info, + wdt->wdtdev.ops = &rzn1_wdt_ops, + wdt->wdtdev.status = WATCHDOG_NOWAYOUT_INIT_STATUS, + wdt->wdtdev.parent = dev; + /* + * The period of the watchdog cannot be changed once set + * and is limited to a very short period. + * Configure it for a 1s period once and for all, and + * rely on the heart-beat provided by the watchdog core + * to make this usable by the user-space. + */ + wdt->wdtdev.max_hw_heartbeat_ms = max_heart_beat_ms(wdt->clk_rate_khz); + if (wdt->wdtdev.max_hw_heartbeat_ms > 1000) + wdt->wdtdev.max_hw_heartbeat_ms = 1000; + + wdt->wdtdev.timeout = DEFAULT_TIMEOUT; + ret = watchdog_init_timeout(&wdt->wdtdev, 0, dev); + if (ret) + return ret; + + watchdog_set_drvdata(&wdt->wdtdev, wdt); + + return devm_watchdog_register_device(dev, &wdt->wdtdev); +} + + +static const struct of_device_id rzn1_wdt_match[] = { + { .compatible = "renesas,rzn1-wdt" }, + {}, +}; +MODULE_DEVICE_TABLE(of, rzn1_wdt_match); + +static struct platform_driver rzn1_wdt_driver = { + .probe = rzn1_wdt_probe, + .driver = { + .name = KBUILD_MODNAME, + .of_match_table = rzn1_wdt_match, + }, +}; + +module_platform_driver(rzn1_wdt_driver); + +MODULE_DESCRIPTION("Renesas RZ/N1 hardware watchdog"); +MODULE_AUTHOR("Phil Edworthy "); +MODULE_LICENSE("GPL"); diff --git a/drivers/watchdog/sp805_wdt.c b/drivers/watchdog/sp805_wdt.c index dbeb2146c968..f9479a3fe2a6 100644 --- a/drivers/watchdog/sp805_wdt.c +++ b/drivers/watchdog/sp805_wdt.c @@ -272,6 +272,7 @@ sp805_wdt_probe(struct amba_device *adev, const struct amba_id *id) watchdog_set_nowayout(&wdt->wdd, nowayout); watchdog_set_drvdata(&wdt->wdd, wdt); watchdog_set_restart_priority(&wdt->wdd, 128); + watchdog_stop_on_unregister(&wdt->wdd); /* * If 'timeout-sec' devicetree property is specified, use that. 
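Stepping back to the RZ/N1 driver above: it caps max_hw_heartbeat_ms at 1000 ms and leaves longer user timeouts to the watchdog core's software pings. A quick user-space sanity check of the timing math (a sketch, not part of the series; the 62.5 MHz clock figure is taken from the driver's header comment):

#include <stdio.h>

#define RZN1_WDT_PRESCALER	16384
#define RZN1_WDT_MAX		4095

/* Mirrors max_heart_beat_ms() from rzn1_wdt.c above. */
static unsigned int max_heart_beat_ms(unsigned long clk_rate_khz)
{
	return (RZN1_WDT_MAX * RZN1_WDT_PRESCALER) / clk_rate_khz;
}

int main(void)
{
	/* 62.5 MHz fabric-derived clock = 62500 kHz */
	printf("%u ms\n", max_heart_beat_ms(62500));	/* prints 1073 */
	return 0;
}

With a 12-bit down-counter and a /16384 prescaler the hardware period tops out at roughly 1.07 seconds, which is why the driver clamps it to one second and relies on core-driven pings to honour the 60 second default timeout.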
diff --git a/drivers/watchdog/sunplus_wdt.c b/drivers/watchdog/sunplus_wdt.c new file mode 100644 index 000000000000..e2d8c532bcb1 --- /dev/null +++ b/drivers/watchdog/sunplus_wdt.c @@ -0,0 +1,220 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * sunplus Watchdog Driver + * + * Copyright (C) 2021 Sunplus Technology Co., Ltd. + * + */ + +#include +#include +#include +#include +#include +#include +#include + +#define WDT_CTRL 0x00 +#define WDT_CNT 0x04 + +#define WDT_STOP 0x3877 +#define WDT_RESUME 0x4A4B +#define WDT_CLRIRQ 0x7482 +#define WDT_UNLOCK 0xAB00 +#define WDT_LOCK 0xAB01 +#define WDT_CONMAX 0xDEAF + +/* TIMEOUT_MAX = ffff0/90kHz =11.65, so longer than 11 seconds will time out. */ +#define SP_WDT_MAX_TIMEOUT 11U +#define SP_WDT_DEFAULT_TIMEOUT 10 + +#define STC_CLK 90000 + +#define DEVICE_NAME "sunplus-wdt" + +static unsigned int timeout; +module_param(timeout, int, 0); +MODULE_PARM_DESC(timeout, "Watchdog timeout in seconds"); + +static bool nowayout = WATCHDOG_NOWAYOUT; +module_param(nowayout, bool, 0); +MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started (default=" + __MODULE_STRING(WATCHDOG_NOWAYOUT) ")"); + +struct sp_wdt_priv { + struct watchdog_device wdev; + void __iomem *base; + struct clk *clk; + struct reset_control *rstc; +}; + +static int sp_wdt_restart(struct watchdog_device *wdev, + unsigned long action, void *data) +{ + struct sp_wdt_priv *priv = watchdog_get_drvdata(wdev); + void __iomem *base = priv->base; + + writel(WDT_STOP, base + WDT_CTRL); + writel(WDT_UNLOCK, base + WDT_CTRL); + writel(0x0001, base + WDT_CNT); + writel(WDT_LOCK, base + WDT_CTRL); + writel(WDT_RESUME, base + WDT_CTRL); + + return 0; +} + +static int sp_wdt_ping(struct watchdog_device *wdev) +{ + struct sp_wdt_priv *priv = watchdog_get_drvdata(wdev); + void __iomem *base = priv->base; + u32 count; + + if (wdev->timeout > SP_WDT_MAX_TIMEOUT) { + /* WDT_CONMAX sets the count to the maximum (down-counting). */ + writel(WDT_CONMAX, base + WDT_CTRL); + } else { + writel(WDT_UNLOCK, base + WDT_CTRL); + /* + * Watchdog timer is a 20-bit down-counting based on STC_CLK. + * This register bits[16:0] is from bit[19:4] of the watchdog + * timer counter. 
+ */ + count = (wdev->timeout * STC_CLK) >> 4; + writel(count, base + WDT_CNT); + writel(WDT_LOCK, base + WDT_CTRL); + } + + return 0; +} + +static int sp_wdt_stop(struct watchdog_device *wdev) +{ + struct sp_wdt_priv *priv = watchdog_get_drvdata(wdev); + void __iomem *base = priv->base; + + writel(WDT_STOP, base + WDT_CTRL); + + return 0; +} + +static int sp_wdt_start(struct watchdog_device *wdev) +{ + struct sp_wdt_priv *priv = watchdog_get_drvdata(wdev); + void __iomem *base = priv->base; + + writel(WDT_RESUME, base + WDT_CTRL); + + return 0; +} + +static unsigned int sp_wdt_get_timeleft(struct watchdog_device *wdev) +{ + struct sp_wdt_priv *priv = watchdog_get_drvdata(wdev); + void __iomem *base = priv->base; + u32 val; + + val = readl(base + WDT_CNT); + val &= 0xffff; + val = val << 4; + + return val; +} + +static const struct watchdog_info sp_wdt_info = { + .identity = DEVICE_NAME, + .options = WDIOF_SETTIMEOUT | + WDIOF_MAGICCLOSE | + WDIOF_KEEPALIVEPING, +}; + +static const struct watchdog_ops sp_wdt_ops = { + .owner = THIS_MODULE, + .start = sp_wdt_start, + .stop = sp_wdt_stop, + .ping = sp_wdt_ping, + .get_timeleft = sp_wdt_get_timeleft, + .restart = sp_wdt_restart, +}; + +static void sp_clk_disable_unprepare(void *data) +{ + clk_disable_unprepare(data); +} + +static void sp_reset_control_assert(void *data) +{ + reset_control_assert(data); +} + +static int sp_wdt_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct sp_wdt_priv *priv; + int ret; + + priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + priv->clk = devm_clk_get(dev, NULL); + if (IS_ERR(priv->clk)) + return dev_err_probe(dev, PTR_ERR(priv->clk), "Failed to get clock\n"); + + ret = clk_prepare_enable(priv->clk); + if (ret) + return dev_err_probe(dev, ret, "Failed to enable clock\n"); + + ret = devm_add_action_or_reset(dev, sp_clk_disable_unprepare, priv->clk); + if (ret) + return ret; + + /* The timer and watchdog shared the STC reset */ + priv->rstc = devm_reset_control_get_shared(dev, NULL); + if (IS_ERR(priv->rstc)) + return dev_err_probe(dev, PTR_ERR(priv->rstc), "Failed to get reset\n"); + + reset_control_deassert(priv->rstc); + + ret = devm_add_action_or_reset(dev, sp_reset_control_assert, priv->rstc); + if (ret) + return ret; + + priv->base = devm_platform_ioremap_resource(pdev, 0); + if (IS_ERR(priv->base)) + return PTR_ERR(priv->base); + + priv->wdev.info = &sp_wdt_info; + priv->wdev.ops = &sp_wdt_ops; + priv->wdev.timeout = SP_WDT_DEFAULT_TIMEOUT; + priv->wdev.max_hw_heartbeat_ms = SP_WDT_MAX_TIMEOUT * 1000; + priv->wdev.min_timeout = 1; + priv->wdev.parent = dev; + + watchdog_set_drvdata(&priv->wdev, priv); + watchdog_init_timeout(&priv->wdev, timeout, dev); + watchdog_set_nowayout(&priv->wdev, nowayout); + watchdog_stop_on_reboot(&priv->wdev); + watchdog_set_restart_priority(&priv->wdev, 128); + + return devm_watchdog_register_device(dev, &priv->wdev); +} + +static const struct of_device_id sp_wdt_of_match[] = { + {.compatible = "sunplus,sp7021-wdt", }, + { /* sentinel */ } +}; +MODULE_DEVICE_TABLE(of, sp_wdt_of_match); + +static struct platform_driver sp_wdt_driver = { + .probe = sp_wdt_probe, + .driver = { + .name = DEVICE_NAME, + .of_match_table = sp_wdt_of_match, + }, +}; + +module_platform_driver(sp_wdt_driver); + +MODULE_AUTHOR("Xiantao Hu "); +MODULE_DESCRIPTION("Sunplus Watchdog Timer Driver"); +MODULE_LICENSE("GPL"); diff --git a/drivers/watchdog/ts4800_wdt.c b/drivers/watchdog/ts4800_wdt.c index c137ad2bd5c3..0ea554c7cda5 100644 --- 
a/drivers/watchdog/ts4800_wdt.c +++ b/drivers/watchdog/ts4800_wdt.c @@ -125,13 +125,16 @@ static int ts4800_wdt_probe(struct platform_device *pdev) ret = of_property_read_u32_index(np, "syscon", 1, ®); if (ret < 0) { dev_err(dev, "no offset in syscon\n"); + of_node_put(syscon_np); return ret; } /* allocate memory for watchdog struct */ wdt = devm_kzalloc(dev, sizeof(*wdt), GFP_KERNEL); - if (!wdt) + if (!wdt) { + of_node_put(syscon_np); return -ENOMEM; + } /* set regmap and offset to know where to write */ wdt->feed_offset = reg; diff --git a/drivers/watchdog/wdat_wdt.c b/drivers/watchdog/wdat_wdt.c index 195c8c004b69..e6f95e99156d 100644 --- a/drivers/watchdog/wdat_wdt.c +++ b/drivers/watchdog/wdat_wdt.c @@ -344,6 +344,7 @@ static int wdat_wdt_probe(struct platform_device *pdev) wdat->period = tbl->timer_period; wdat->wdd.min_hw_heartbeat_ms = wdat->period * tbl->min_count; wdat->wdd.max_hw_heartbeat_ms = wdat->period * tbl->max_count; + wdat->wdd.min_timeout = 1; wdat->stopped_in_sleep = tbl->flags & ACPI_WDAT_STOPPED; wdat->wdd.info = &wdat_wdt_info; wdat->wdd.ops = &wdat_wdt_ops; @@ -450,8 +451,7 @@ static int wdat_wdt_probe(struct platform_device *pdev) * watchdog properly after it has opened the device. In some cases * the BIOS default is too short and causes immediate reboot. */ - if (timeout * 1000 < wdat->wdd.min_hw_heartbeat_ms || - timeout * 1000 > wdat->wdd.max_hw_heartbeat_ms) { + if (watchdog_timeout_invalid(&wdat->wdd, timeout)) { dev_warn(dev, "Invalid timeout %d given, using %d\n", timeout, WDAT_DEFAULT_TIMEOUT); timeout = WDAT_DEFAULT_TIMEOUT; @@ -462,6 +462,8 @@ static int wdat_wdt_probe(struct platform_device *pdev) return ret; watchdog_set_nowayout(&wdat->wdd, nowayout); + watchdog_stop_on_reboot(&wdat->wdd); + watchdog_stop_on_unregister(&wdat->wdd); return devm_watchdog_register_device(dev, &wdat->wdd); } diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 94aa7356248e..79f6b74336d2 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -463,8 +463,11 @@ static int afs_dir_iterate_block(struct afs_vnode *dvnode, } /* skip if starts before the current position */ - if (offset < curr) + if (offset < curr) { + if (next > curr) + ctx->pos = blkoff + next * sizeof(union afs_xdr_dirent); continue; + } /* found the next entry */ if (!dir_emit(ctx, dire->u.name, nlen, diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 7584aa6e5025..e5221be6eb55 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -256,6 +256,7 @@ static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq) struct iov_iter iter; ssize_t err = 0; size_t len; + int mode; __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); __clear_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags); @@ -264,7 +265,8 @@ static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq) goto out; /* We need to fetch the inline data. 
*/ - req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS); + mode = ceph_try_to_choose_auth_mds(inode, CEPH_STAT_CAP_INLINE_DATA); + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode); if (IS_ERR(req)) { err = PTR_ERR(req); goto out; @@ -604,8 +606,10 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, snapc, ceph_wbc.truncate_seq, ceph_wbc.truncate_size, true); - if (IS_ERR(req)) + if (IS_ERR(req)) { + redirty_page_for_writepage(wbc, page); return PTR_ERR(req); + } set_page_writeback(page); if (caching) @@ -1644,7 +1648,7 @@ int ceph_uninline_data(struct file *file) struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); - struct ceph_osd_request *req; + struct ceph_osd_request *req = NULL; struct ceph_cap_flush *prealloc_cf; struct folio *folio = NULL; u64 inline_version = CEPH_INLINE_NONE; @@ -1652,18 +1656,6 @@ int ceph_uninline_data(struct file *file) int err = 0; u64 len; - prealloc_cf = ceph_alloc_cap_flush(); - if (!prealloc_cf) - return -ENOMEM; - - folio = read_mapping_folio(inode->i_mapping, 0, file); - if (IS_ERR(folio)) { - err = PTR_ERR(folio); - goto out; - } - - folio_lock(folio); - spin_lock(&ci->i_ceph_lock); inline_version = ci->i_inline_version; spin_unlock(&ci->i_ceph_lock); @@ -1671,9 +1663,23 @@ int ceph_uninline_data(struct file *file) dout("uninline_data %p %llx.%llx inline_version %llu\n", inode, ceph_vinop(inode), inline_version); - if (inline_version == 1 || /* initial version, no data */ - inline_version == CEPH_INLINE_NONE) - goto out_unlock; + if (inline_version == CEPH_INLINE_NONE) + return 0; + + prealloc_cf = ceph_alloc_cap_flush(); + if (!prealloc_cf) + return -ENOMEM; + + if (inline_version == 1) /* initial version, no data */ + goto out_uninline; + + folio = read_mapping_folio(inode->i_mapping, 0, file); + if (IS_ERR(folio)) { + err = PTR_ERR(folio); + goto out; + } + + folio_lock(folio); len = i_size_read(inode); if (len > folio_size(folio)) @@ -1739,6 +1745,7 @@ int ceph_uninline_data(struct file *file) ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency, req->r_end_latency, len, err); +out_uninline: if (!err) { int dirty; @@ -1757,8 +1764,10 @@ out_put_req: if (err == -ECANCELED) err = 0; out_unlock: - folio_unlock(folio); - folio_put(folio); + if (folio) { + folio_unlock(folio); + folio_put(folio); + } out: ceph_free_cap_flush(prealloc_cf); dout("uninline_data %p %llx.%llx inline_version %llu = %d\n", @@ -1777,7 +1786,6 @@ int ceph_mmap(struct file *file, struct vm_area_struct *vma) if (!mapping->a_ops->read_folio) return -ENOEXEC; - file_accessed(file); vma->vm_ops = &ceph_vmops; return 0; } diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 5c14ef04e474..bf2e94005598 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1577,7 +1577,7 @@ static void __ceph_flush_snaps(struct ceph_inode_info *ci, while (first_tid <= last_tid) { struct ceph_cap *cap = ci->i_auth_cap; - struct ceph_cap_flush *cf; + struct ceph_cap_flush *cf = NULL, *iter; int ret; if (!(cap && cap->session == session)) { @@ -1587,8 +1587,9 @@ static void __ceph_flush_snaps(struct ceph_inode_info *ci, } ret = -ENOENT; - list_for_each_entry(cf, &ci->i_cap_flush_list, i_list) { - if (cf->tid >= first_tid) { + list_for_each_entry(iter, &ci->i_cap_flush_list, i_list) { + if (iter->tid >= first_tid) { + cf = iter; ret = 0; break; } @@ -1910,6 +1911,7 @@ void ceph_check_caps(struct 
ceph_inode_info *ci, int flags,
 	struct rb_node *p;
 	bool queue_invalidate = false;
 	bool tried_invalidate = false;
+	bool queue_writeback = false;
 
 	if (session)
 		ceph_get_mds_session(session);
@@ -2062,10 +2064,27 @@ retry:
 		}
 
 		/* completed revocation? going down and there are no caps? */
-		if (revoking && (revoking & cap_used) == 0) {
-			dout("completed revocation of %s\n",
-			     ceph_cap_string(cap->implemented & ~cap->issued));
-			goto ack;
+		if (revoking) {
+			if ((revoking & cap_used) == 0) {
+				dout("completed revocation of %s\n",
+				     ceph_cap_string(cap->implemented & ~cap->issued));
+				goto ack;
+			}
+
+			/*
+			 * If "i_wrbuffer_ref" was increased by an mmap or a
+			 * generic cache write just before ceph_check_caps()
+			 * was called, revoking the Fb capability will fail
+			 * this time. We must then wait for the BDI's delayed
+			 * work to flush the dirty pages and release
+			 * "i_wrbuffer_ref", which can take up to 5 seconds.
+			 * That means the MDS may need to wait up to 5 seconds
+			 * for the Fb capability revocation to finish.
+			 *
+			 * Let's queue a writeback for it.
+			 */
+			if (S_ISREG(inode->i_mode) && ci->i_wrbuffer_ref &&
+			    (revoking & CEPH_CAP_FILE_BUFFER))
+				queue_writeback = true;
 		}
 
 		/* want more caps from mds? */
@@ -2135,6 +2154,8 @@ ack:
 
 	spin_unlock(&ci->i_ceph_lock);
 	ceph_put_mds_session(session);
+	if (queue_writeback)
+		ceph_queue_writeback(inode);
 	if (queue_invalidate)
 		ceph_queue_invalidate(inode);
 }
@@ -2218,9 +2239,9 @@ static int caps_are_flushed(struct inode *inode, u64 flush_tid)
 }
 
 /*
- * wait for any unsafe requests to complete.
+ * flush the mdlog and wait for any unsafe requests to complete.
 */
-static int unsafe_request_wait(struct inode *inode)
+static int flush_mdlog_and_wait_inode_unsafe_requests(struct inode *inode)
 {
 	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
 	struct ceph_inode_info *ci = ceph_inode(inode);
@@ -2336,7 +2357,7 @@ retry:
 		kfree(sessions);
 	}
 
-	dout("unsafe_request_wait %p wait on tid %llu %llu\n",
+	dout("%s %p wait on tid %llu %llu\n", __func__,
 	     inode, req1 ? req1->r_tid : 0ULL, req2 ? req2->r_tid : 0ULL);
 	if (req1) {
 		ret = !wait_for_completion_timeout(&req1->r_safe_completion,
@@ -2380,7 +2401,7 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 
 	dirty = try_flush_caps(inode, &flush_tid);
 	dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
-	err = unsafe_request_wait(inode);
+	err = flush_mdlog_and_wait_inode_unsafe_requests(inode);
 
 	/*
 	 * only wait on non-file metadata writeback (the mds
@@ -3182,10 +3203,9 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
 				struct ceph_snap_context *snapc)
 {
 	struct inode *inode = &ci->vfs_inode;
-	struct ceph_cap_snap *capsnap = NULL;
+	struct ceph_cap_snap *capsnap = NULL, *iter;
 	int put = 0;
 	bool last = false;
-	bool found = false;
 	bool flush_snaps = false;
 	bool complete_capsnap = false;
 
@@ -3212,14 +3232,14 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
 		     ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
 		     last ? " LAST" : "");
 	} else {
-		list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
-			if (capsnap->context == snapc) {
-				found = true;
+		list_for_each_entry(iter, &ci->i_cap_snaps, ci_item) {
+			if (iter->context == snapc) {
+				capsnap = iter;
 				break;
			}
		}
 
-		if (!found) {
+		if (!capsnap) {
			/*
			 * The capsnap should already be removed when removing
			 * auth cap in the case of a forced unmount.
			 */
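The caps.c hunks above (and the flushsnap_ack hunk that follows) all convert the same idiom: instead of reusing the list_for_each_entry() cursor after the loop, a dedicated iterator walks the list and a result pointer is published only on a match, so the separate found/flushed booleans can go away. Reduced to its core, the pattern looks like this (a sketch with illustrative names, not code from the patch):

#include <linux/list.h>

struct item {
	struct list_head list;
	int key;
};

static struct item *find_item(struct list_head *head, int key)
{
	struct item *result = NULL, *iter;

	list_for_each_entry(iter, head, list) {
		if (iter->key == key) {
			result = iter;	/* publish only on a real match */
			break;
		}
	}
	/*
	 * If the loop ran to completion, 'iter' does not point at a valid
	 * entry; 'result' is either NULL or the matching element, so no
	 * separate 'found' flag is needed.
	 */
	return result;
}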
@@ -3769,8 +3789,7 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid, struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; u64 follows = le64_to_cpu(m->snap_follows); - struct ceph_cap_snap *capsnap; - bool flushed = false; + struct ceph_cap_snap *capsnap = NULL, *iter; bool wake_ci = false; bool wake_mdsc = false; @@ -3778,26 +3797,26 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid, inode, ci, session->s_mds, follows); spin_lock(&ci->i_ceph_lock); - list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { - if (capsnap->follows == follows) { - if (capsnap->cap_flush.tid != flush_tid) { + list_for_each_entry(iter, &ci->i_cap_snaps, ci_item) { + if (iter->follows == follows) { + if (iter->cap_flush.tid != flush_tid) { dout(" cap_snap %p follows %lld tid %lld !=" - " %lld\n", capsnap, follows, - flush_tid, capsnap->cap_flush.tid); + " %lld\n", iter, follows, + flush_tid, iter->cap_flush.tid); break; } - flushed = true; + capsnap = iter; break; } else { dout(" skipping cap_snap %p follows %lld\n", - capsnap, capsnap->follows); + iter, iter->follows); } } - if (flushed) + if (capsnap) ceph_remove_capsnap(inode, capsnap, &wake_ci, &wake_mdsc); spin_unlock(&ci->i_ceph_lock); - if (flushed) { + if (capsnap) { ceph_put_snap_context(capsnap->context); ceph_put_cap_snap(capsnap); if (wake_ci) diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 63113e2a4890..b7e9cac3aeef 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -578,7 +578,7 @@ void ceph_evict_inode(struct inode *inode) __ceph_remove_caps(ci); - if (__ceph_has_any_quota(ci)) + if (__ceph_has_quota(ci, QUOTA_GET_ANY)) ceph_adjust_quota_realms_count(inode, false); /* @@ -1466,10 +1466,12 @@ retry_lookup: } else if (have_lease) { if (d_unhashed(dn)) d_add(dn, NULL); + } + + if (!d_unhashed(dn) && have_lease) update_dentry_lease(dir, dn, rinfo->dlease, session, req->r_request_started); - } goto done; } @@ -1884,7 +1886,6 @@ static void ceph_do_invalidate_pages(struct inode *inode) orig_gen = ci->i_rdcache_gen; spin_unlock(&ci->i_ceph_lock); - ceph_fscache_invalidate(inode, false); if (invalidate_inode_pages2(inode->i_mapping) < 0) { pr_err("invalidate_inode_pages2 %llx.%llx failed\n", ceph_vinop(inode)); @@ -2258,6 +2259,30 @@ int ceph_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, return err; } +int ceph_try_to_choose_auth_mds(struct inode *inode, int mask) +{ + int issued = ceph_caps_issued(ceph_inode(inode)); + + /* + * If any 'x' caps is issued we can just choose the auth MDS + * instead of the random replica MDSes. Because only when the + * Locker is in LOCK_EXEC state will the loner client could + * get the 'x' caps. And if we send the getattr requests to + * any replica MDS it must auth pin and tries to rdlock from + * the auth MDS, and then the auth MDS need to do the Locker + * state transition to LOCK_SYNC. And after that the lock state + * will change back. + * + * This cost much when doing the Locker state transition and + * usually will need to revoke caps from clients. + */ + if (((mask & CEPH_CAP_ANY_SHARED) && (issued & CEPH_CAP_ANY_EXCL)) + || (mask & CEPH_STAT_RSTAT)) + return USE_AUTH_MDS; + else + return USE_ANY_MDS; +} + /* * Verify that we have a lease on the given mask. If not, * do a getattr against an mds. 
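 *
 * To make the new policy concrete, ceph_try_to_choose_auth_mds() above
 * resolves a few representative getattr cases as follows (illustrative
 * only, using the cap masks from the surrounding code):
 *
 *   mask = CEPH_STAT_CAP_INLINE_DATA, no exclusive caps issued
 *	-> USE_ANY_MDS: a shared stat with no 'x' caps held can be served
 *	   by any replica without a Locker state transition.
 *
 *   mask = CEPH_STAT_CAP_INLINE_DATA, CEPH_CAP_FILE_EXCL issued
 *	-> USE_AUTH_MDS: an issued 'x' cap means the Locker may sit in
 *	   LOCK_EXEC, and a replica would bounce to the auth MDS (forcing
 *	   a lock transition) anyway.
 *
 *   mask & CEPH_STAT_RSTAT
 *	-> USE_AUTH_MDS: recursive stats remain auth-only, as before.
 *
 * The next hunk switches the plain getattr path over to this helper.
 */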
@@ -2281,7 +2306,7 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
 	if (!force && ceph_caps_issued_mask_metric(ceph_inode(inode), mask, 1))
 		return 0;
 
-	mode = (mask & CEPH_STAT_RSTAT) ? USE_AUTH_MDS : USE_ANY_MDS;
+	mode = ceph_try_to_choose_auth_mds(inode, mask);
 	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
@@ -2423,7 +2448,7 @@ int ceph_getattr(struct user_namespace *mnt_userns, const struct path *path,
 		return -ESTALE;
 
 	/* Skip the getattr altogether if we're asked not to sync */
-	if (!(flags & AT_STATX_DONT_SYNC)) {
+	if ((flags & AT_STATX_SYNC_TYPE) != AT_STATX_DONT_SYNC) {
 		err = ceph_do_getattr(inode,
 				statx_to_caps(request_mask, inode->i_mode),
 				flags & AT_STATX_FORCE_SYNC);
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 00c3de177dd6..f5d110d90b77 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -437,7 +437,7 @@ static int ceph_parse_deleg_inos(void **p, void *end,
 	ceph_decode_32_safe(p, end, sets, bad);
 	dout("got %u sets of delegated inodes\n", sets);
 	while (sets--) {
-		u64 start, len, ino;
+		u64 start, len;
 
 		ceph_decode_64_safe(p, end, start, bad);
 		ceph_decode_64_safe(p, end, len, bad);
@@ -449,7 +449,7 @@ static int ceph_parse_deleg_inos(void **p, void *end,
 			continue;
 		}
 		while (len--) {
-			int err = xa_insert(&s->s_delegated_inos, ino = start++,
+			int err = xa_insert(&s->s_delegated_inos, start++,
 					    DELEGATED_INO_AVAILABLE,
 					    GFP_KERNEL);
 			if (!err) {
@@ -2651,7 +2651,28 @@ static int __prepare_send_request(struct ceph_mds_session *session,
 	struct ceph_mds_client *mdsc = session->s_mdsc;
 	struct ceph_mds_request_head_old *rhead;
 	struct ceph_msg *msg;
-	int flags = 0;
+	int flags = 0, max_retry;
+
+	/*
+	 * The type of 'r_attempts' in the kernel's 'ceph_mds_request'
+	 * is 'int', while in 'ceph_mds_request_head' the type of
+	 * 'num_retry' is '__u8'. So if the request is retried more
+	 * than 256 times, the MDS will receive an incorrect retry seq.
+	 *
+	 * In this case it's usually a bug in the MDS, and continuing
+	 * to retry the request makes no sense.
+	 *
+	 * In the future this could be fixed in ceph code, so avoid
+	 * hardcoding the limit here.
+	 */
+	max_retry = sizeof_field(struct ceph_mds_request_head, num_retry);
+	max_retry = 1 << (max_retry * BITS_PER_BYTE);
+	if (req->r_attempts >= max_retry) {
+		pr_warn_ratelimited("%s request tid %llu seq overflow\n",
+				    __func__, req->r_tid);
+		return -EMULTIHOP;
+	}
 
 	req->r_attempts++;
 	if (req->r_inode) {
@@ -2663,7 +2684,7 @@ static int __prepare_send_request(struct ceph_mds_session *session,
 		else
 			req->r_sent_on_mseq = -1;
 	}
-	dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req,
+	dout("%s %p tid %lld %s (attempt %d)\n", __func__, req,
 	     req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts);
 
 	if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
@@ -3265,6 +3286,7 @@ static void handle_forward(struct ceph_mds_client *mdsc,
 	int err = -EINVAL;
 	void *p = msg->front.iov_base;
 	void *end = p + msg->front.iov_len;
+	bool aborted = false;
 
 	ceph_decode_need(&p, end, 2*sizeof(u32), bad);
 	next_mds = ceph_decode_32(&p);
@@ -3273,16 +3295,41 @@ static void handle_forward(struct ceph_mds_client *mdsc,
 	mutex_lock(&mdsc->mutex);
 	req = lookup_get_request(mdsc, tid);
 	if (!req) {
+		mutex_unlock(&mdsc->mutex);
 		dout("forward tid %llu to mds%d - req dne\n", tid, next_mds);
-		goto out;	/* dup reply? */
 	}
 
 	if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
 		dout("forward tid %llu aborted, unregistering\n", tid);
 		__unregister_request(mdsc, req);
 	} else if (fwd_seq <= req->r_num_fwd) {
-		dout("forward tid %llu to mds%d - old seq %d <= %d\n",
-		     tid, next_mds, req->r_num_fwd, fwd_seq);
+		/*
+		 * The type of 'num_fwd' in ceph 'MClientRequestForward'
+		 * is 'int32_t', while in 'ceph_mds_request_head' the
+		 * type is '__u8'. So if the request bounces between
+		 * MDSes more than 256 times, the client will get stuck.
+		 *
+		 * In this case it's usually a bug in the MDS, and
+		 * continuing to bounce the request makes no sense.
+		 *
+		 * In the future this could be fixed in ceph code, so
+		 * avoid hardcoding the limit here.
+		 */
+		int max = sizeof_field(struct ceph_mds_request_head, num_fwd);
+		max = 1 << (max * BITS_PER_BYTE);
+		if (req->r_num_fwd >= max) {
+			mutex_lock(&req->r_fill_mutex);
+			req->r_err = -EMULTIHOP;
+			set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);
+			mutex_unlock(&req->r_fill_mutex);
+			aborted = true;
+			pr_warn_ratelimited("forward tid %llu seq overflow\n",
+					    tid);
+		} else {
+			dout("forward tid %llu to mds%d - old seq %d <= %d\n",
+			     tid, next_mds, req->r_num_fwd, fwd_seq);
+		}
 	} else {
 		/* resend. forward race not possible; mds would drop */
 		dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds);
@@ -3294,9 +3341,12 @@ static void handle_forward(struct ceph_mds_client *mdsc,
 		put_request_session(req);
 		__do_request(mdsc, req);
 	}
-	ceph_mdsc_put_request(req);
-out:
 	mutex_unlock(&mdsc->mutex);
+
+	/* kick calling process */
+	if (aborted)
+		complete_request(mdsc, req);
+	ceph_mdsc_put_request(req);
 	return;
 
 bad:
@@ -3375,13 +3425,17 @@ static void handle_session(struct ceph_mds_session *session,
 	}
 
 	if (msg_version >= 5) {
-		u32 flags;
-		/* version >= 4, struct_v, struct_cv, len, metric_spec */
-		ceph_decode_skip_n(&p, end, 2 + sizeof(u32) * 2, bad);
+		u32 flags, len;
+
+		/* version >= 4 */
+		ceph_decode_skip_16(&p, end, bad); /* struct_v, struct_cv */
+		ceph_decode_32_safe(&p, end, len, bad); /* len */
+		ceph_decode_skip_n(&p, end, len, bad); /* metric_spec */
+
 		/* version >= 5, flags */
-		ceph_decode_32_safe(&p, end, flags, bad);
+		ceph_decode_32_safe(&p, end, flags, bad);
 		if (flags & CEPH_SESSION_BLOCKLISTED) {
-			pr_warn("mds%d session blocklisted\n", session->s_mds);
+			pr_warn("mds%d session blocklisted\n",
+				session->s_mds);
 			blocklisted = true;
 		}
 	}
@@ -4396,12 +4450,6 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
 	memcpy((void *)(lease + 1) + 4,
 	       dentry->d_name.name, dentry->d_name.len);
 	spin_unlock(&dentry->d_lock);
-	/*
-	 * if this is a preemptive lease RELEASE, no need to
-	 * flush request stream, since the actual request will
-	 * soon follow.
-	 */
-	msg->more_to_follow = (action == CEPH_MDS_LEASE_RELEASE);
 
 	ceph_con_send(&session->s_con, msg);
 }
@@ -4696,15 +4744,17 @@ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
 }
 
 /*
- * wait for all write mds requests to flush.
+ * flush the mdlog and wait for all write mds requests to flush. 
*/ -static void wait_unsafe_requests(struct ceph_mds_client *mdsc, u64 want_tid) +static void flush_mdlog_and_wait_mdsc_unsafe_requests(struct ceph_mds_client *mdsc, + u64 want_tid) { struct ceph_mds_request *req = NULL, *nextreq; + struct ceph_mds_session *last_session = NULL; struct rb_node *n; mutex_lock(&mdsc->mutex); - dout("wait_unsafe_requests want %lld\n", want_tid); + dout("%s want %lld\n", __func__, want_tid); restart: req = __get_oldest_req(mdsc); while (req && req->r_tid <= want_tid) { @@ -4716,14 +4766,32 @@ restart: nextreq = NULL; if (req->r_op != CEPH_MDS_OP_SETFILELOCK && (req->r_op & CEPH_MDS_OP_WRITE)) { + struct ceph_mds_session *s = req->r_session; + + if (!s) { + req = nextreq; + continue; + } + /* write op */ ceph_mdsc_get_request(req); if (nextreq) ceph_mdsc_get_request(nextreq); + s = ceph_get_mds_session(s); mutex_unlock(&mdsc->mutex); - dout("wait_unsafe_requests wait on %llu (want %llu)\n", + + /* send flush mdlog request to MDS */ + if (last_session != s) { + send_flush_mdlog(s); + ceph_put_mds_session(last_session); + last_session = s; + } else { + ceph_put_mds_session(s); + } + dout("%s wait on %llu (want %llu)\n", __func__, req->r_tid, want_tid); wait_for_completion(&req->r_safe_completion); + mutex_lock(&mdsc->mutex); ceph_mdsc_put_request(req); if (!nextreq) @@ -4738,7 +4806,8 @@ restart: req = nextreq; } mutex_unlock(&mdsc->mutex); - dout("wait_unsafe_requests done\n"); + ceph_put_mds_session(last_session); + dout("%s done\n", __func__); } void ceph_mdsc_sync(struct ceph_mds_client *mdsc) @@ -4767,7 +4836,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc) dout("sync want tid %lld flush_seq %lld\n", want_tid, want_flush); - wait_unsafe_requests(mdsc, want_tid); + flush_mdlog_and_wait_mdsc_unsafe_requests(mdsc, want_tid); wait_caps_flush(mdsc, want_flush); } diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 33497846e47e..1140aecd82ce 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -579,7 +579,7 @@ static inline int ceph_wait_on_async_create(struct inode *inode) struct ceph_inode_info *ci = ceph_inode(inode); return wait_on_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT, - TASK_INTERRUPTIBLE); + TASK_KILLABLE); } extern u64 ceph_get_deleg_ino(struct ceph_mds_session *session); diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c index a338a3ec0dc4..64592adfe48f 100644 --- a/fs/ceph/quota.c +++ b/fs/ceph/quota.c @@ -195,9 +195,9 @@ void ceph_cleanup_quotarealms_inodes(struct ceph_mds_client *mdsc) /* * This function walks through the snaprealm for an inode and returns the - * ceph_snap_realm for the first snaprealm that has quotas set (either max_files - * or max_bytes). If the root is reached, return the root ceph_snap_realm - * instead. + * ceph_snap_realm for the first snaprealm that has quotas set (max_files, + * max_bytes, or any, depending on the 'which_quota' argument). If the root is + * reached, return the root ceph_snap_realm instead. * * Note that the caller is responsible for calling ceph_put_snap_realm() on the * returned realm. @@ -209,7 +209,9 @@ void ceph_cleanup_quotarealms_inodes(struct ceph_mds_client *mdsc) * will be restarted. 
*/ static struct ceph_snap_realm *get_quota_realm(struct ceph_mds_client *mdsc, - struct inode *inode, bool retry) + struct inode *inode, + enum quota_get_realm which_quota, + bool retry) { struct ceph_inode_info *ci = NULL; struct ceph_snap_realm *realm, *next; @@ -248,7 +250,7 @@ restart: } ci = ceph_inode(in); - has_quota = __ceph_has_any_quota(ci); + has_quota = __ceph_has_quota(ci, which_quota); iput(in); next = realm->parent; @@ -279,8 +281,8 @@ restart: * dropped and we can then restart the whole operation. */ down_read(&mdsc->snap_rwsem); - old_realm = get_quota_realm(mdsc, old, true); - new_realm = get_quota_realm(mdsc, new, false); + old_realm = get_quota_realm(mdsc, old, QUOTA_GET_ANY, true); + new_realm = get_quota_realm(mdsc, new, QUOTA_GET_ANY, false); if (PTR_ERR(new_realm) == -EAGAIN) { up_read(&mdsc->snap_rwsem); if (old_realm) @@ -483,7 +485,8 @@ bool ceph_quota_update_statfs(struct ceph_fs_client *fsc, struct kstatfs *buf) bool is_updated = false; down_read(&mdsc->snap_rwsem); - realm = get_quota_realm(mdsc, d_inode(fsc->sb->s_root), true); + realm = get_quota_realm(mdsc, d_inode(fsc->sb->s_root), + QUOTA_GET_MAX_BYTES, true); up_read(&mdsc->snap_rwsem); if (!realm) return false; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index e6987d295079..b73b4f75462c 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -1119,6 +1119,7 @@ static int ceph_set_super(struct super_block *s, struct fs_context *fc) s->s_time_gran = 1; s->s_time_min = 0; s->s_time_max = U32_MAX; + s->s_flags |= SB_NODIRATIME | SB_NOATIME; ret = set_anon_super_fc(s, fc); if (ret != 0) diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 20ceab74e871..dd7dac0f984a 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -1022,6 +1022,7 @@ static inline void ceph_queue_flush_snaps(struct inode *inode) ceph_queue_inode_work(inode, CEPH_I_WORK_FLUSH_SNAPS); } +extern int ceph_try_to_choose_auth_mds(struct inode *inode, int mask); extern int __ceph_do_getattr(struct inode *inode, struct page *locked_page, int mask, bool force); static inline int ceph_do_getattr(struct inode *inode, int mask, bool force) @@ -1278,9 +1279,29 @@ extern void ceph_fs_debugfs_init(struct ceph_fs_client *client); extern void ceph_fs_debugfs_cleanup(struct ceph_fs_client *client); /* quota.c */ -static inline bool __ceph_has_any_quota(struct ceph_inode_info *ci) + +enum quota_get_realm { + QUOTA_GET_MAX_FILES, + QUOTA_GET_MAX_BYTES, + QUOTA_GET_ANY +}; + +static inline bool __ceph_has_quota(struct ceph_inode_info *ci, + enum quota_get_realm which) { - return ci->i_max_files || ci->i_max_bytes; + bool has_quota = false; + + switch (which) { + case QUOTA_GET_MAX_BYTES: + has_quota = !!ci->i_max_bytes; + break; + case QUOTA_GET_MAX_FILES: + has_quota = !!ci->i_max_files; + break; + default: + has_quota = !!(ci->i_max_files || ci->i_max_bytes); + } + return has_quota; } extern void ceph_adjust_quota_realms_count(struct inode *inode, bool inc); @@ -1289,10 +1310,10 @@ static inline void __ceph_update_quota(struct ceph_inode_info *ci, u64 max_bytes, u64 max_files) { bool had_quota, has_quota; - had_quota = __ceph_has_any_quota(ci); + had_quota = __ceph_has_quota(ci, QUOTA_GET_ANY); ci->i_max_bytes = max_bytes; ci->i_max_files = max_files; - has_quota = __ceph_has_any_quota(ci); + has_quota = __ceph_has_quota(ci, QUOTA_GET_ANY); if (had_quota != has_quota) ceph_adjust_quota_realms_count(&ci->vfs_inode, has_quota); diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index afec84088471..8c2dc2c762a4 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c 
@@ -366,6 +366,14 @@ static ssize_t ceph_vxattrcb_auth_mds(struct ceph_inode_info *ci, } #define XATTR_RSTAT_FIELD(_type, _name) \ XATTR_NAME_CEPH(_type, _name, VXATTR_FLAG_RSTAT) +#define XATTR_RSTAT_FIELD_UPDATABLE(_type, _name) \ + { \ + .name = CEPH_XATTR_NAME(_type, _name), \ + .name_size = sizeof (CEPH_XATTR_NAME(_type, _name)), \ + .getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \ + .exists_cb = NULL, \ + .flags = VXATTR_FLAG_RSTAT, \ + } #define XATTR_LAYOUT_FIELD(_type, _name, _field) \ { \ .name = CEPH_XATTR_NAME2(_type, _name, _field), \ @@ -404,7 +412,7 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = { XATTR_RSTAT_FIELD(dir, rsubdirs), XATTR_RSTAT_FIELD(dir, rsnaps), XATTR_RSTAT_FIELD(dir, rbytes), - XATTR_RSTAT_FIELD(dir, rctime), + XATTR_RSTAT_FIELD_UPDATABLE(dir, rctime), { .name = "ceph.dir.pin", .name_size = sizeof("ceph.dir.pin"), diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c index a5cc4ed2cd0d..8e01d89c3319 100644 --- a/fs/erofs/fscache.c +++ b/fs/erofs/fscache.c @@ -17,6 +17,7 @@ static struct netfs_io_request *erofs_fscache_alloc_request(struct address_space rreq->start = start; rreq->len = len; rreq->mapping = mapping; + rreq->inode = mapping->host; INIT_LIST_HEAD(&rreq->subrequests); refcount_set(&rreq->ref, 1); return rreq; diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index bcc8335b46b3..95a403720e8c 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -288,7 +288,10 @@ static int erofs_fill_inode(struct inode *inode, int isdir) } if (erofs_inode_is_data_compressed(vi->datalayout)) { - err = z_erofs_fill_inode(inode); + if (!erofs_is_fscache_mode(inode->i_sb)) + err = z_erofs_fill_inode(inode); + else + err = -EOPNOTSUPP; goto out_unlock; } inode->i_mapping->a_ops = &erofs_raw_access_aops; diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 95efc127b2ba..724bb57075f6 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -199,7 +199,6 @@ struct z_erofs_decompress_frontend { struct z_erofs_pagevec_ctor vector; struct z_erofs_pcluster *pcl, *tailpcl; - struct z_erofs_collection *cl; /* a pointer used to pick up inplace I/O pages */ struct page **icpage_ptr; z_erofs_next_pcluster_t owned_head; @@ -214,7 +213,7 @@ struct z_erofs_decompress_frontend { #define DECOMPRESS_FRONTEND_INIT(__i) { \ .inode = __i, .owned_head = Z_EROFS_PCLUSTER_TAIL, \ - .mode = COLLECT_PRIMARY_FOLLOWED } + .mode = COLLECT_PRIMARY_FOLLOWED, .backmost = true } static struct page *z_pagemap_global[Z_EROFS_VMAP_GLOBAL_PAGES]; static DEFINE_MUTEX(z_pagemap_global_lock); @@ -357,7 +356,7 @@ static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe, return false; } -/* callers must be with collection lock held */ +/* callers must be with pcluster lock held */ static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe, struct page *page, enum z_erofs_page_type type, bool pvec_safereuse) @@ -372,7 +371,7 @@ static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe, ret = z_erofs_pagevec_enqueue(&fe->vector, page, type, pvec_safereuse); - fe->cl->vcnt += (unsigned int)ret; + fe->pcl->vcnt += (unsigned int)ret; return ret ? 
0 : -EAGAIN; } @@ -405,12 +404,11 @@ static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f) f->mode = COLLECT_PRIMARY; } -static int z_erofs_lookup_collection(struct z_erofs_decompress_frontend *fe, - struct inode *inode, - struct erofs_map_blocks *map) +static int z_erofs_lookup_pcluster(struct z_erofs_decompress_frontend *fe, + struct inode *inode, + struct erofs_map_blocks *map) { struct z_erofs_pcluster *pcl = fe->pcl; - struct z_erofs_collection *cl; unsigned int length; /* to avoid unexpected loop formed by corrupted images */ @@ -419,8 +417,7 @@ static int z_erofs_lookup_collection(struct z_erofs_decompress_frontend *fe, return -EFSCORRUPTED; } - cl = z_erofs_primarycollection(pcl); - if (cl->pageofs != (map->m_la & ~PAGE_MASK)) { + if (pcl->pageofs_out != (map->m_la & ~PAGE_MASK)) { DBG_BUGON(1); return -EFSCORRUPTED; } @@ -443,23 +440,21 @@ static int z_erofs_lookup_collection(struct z_erofs_decompress_frontend *fe, length = READ_ONCE(pcl->length); } } - mutex_lock(&cl->lock); + mutex_lock(&pcl->lock); /* used to check tail merging loop due to corrupted images */ if (fe->owned_head == Z_EROFS_PCLUSTER_TAIL) fe->tailpcl = pcl; z_erofs_try_to_claim_pcluster(fe); - fe->cl = cl; return 0; } -static int z_erofs_register_collection(struct z_erofs_decompress_frontend *fe, - struct inode *inode, - struct erofs_map_blocks *map) +static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe, + struct inode *inode, + struct erofs_map_blocks *map) { bool ztailpacking = map->m_flags & EROFS_MAP_META; struct z_erofs_pcluster *pcl; - struct z_erofs_collection *cl; struct erofs_workgroup *grp; int err; @@ -482,17 +477,15 @@ static int z_erofs_register_collection(struct z_erofs_decompress_frontend *fe, /* new pclusters should be claimed as type 1, primary and followed */ pcl->next = fe->owned_head; + pcl->pageofs_out = map->m_la & ~PAGE_MASK; fe->mode = COLLECT_PRIMARY_FOLLOWED; - cl = z_erofs_primarycollection(pcl); - cl->pageofs = map->m_la & ~PAGE_MASK; - /* * lock all primary followed works before visible to others * and mutex_trylock *never* fails for a new pcluster. 
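[Editor's note: the lock-before-publish idiom the comment above describes is not erofs-specific. In general form — alloc_object(), publish() and free_object() are hypothetical helpers, not functions from this patch:

	obj = alloc_object();
	mutex_init(&obj->lock);
	/* cannot contend: the object is not visible to anyone else yet */
	DBG_BUGON(!mutex_trylock(&obj->lock));
	err = publish(obj);	/* e.g. an xarray or radix-tree insert */
	if (err) {
		mutex_unlock(&obj->lock);
		free_object(obj);
	}

Lookers that find the published object immediately block on its mutex until the creator finishes initializing it.]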
*/ - mutex_init(&cl->lock); - DBG_BUGON(!mutex_trylock(&cl->lock)); + mutex_init(&pcl->lock); + DBG_BUGON(!mutex_trylock(&pcl->lock)); if (ztailpacking) { pcl->obj.index = 0; /* which indicates ztailpacking */ @@ -519,11 +512,10 @@ static int z_erofs_register_collection(struct z_erofs_decompress_frontend *fe, fe->tailpcl = pcl; fe->owned_head = &pcl->next; fe->pcl = pcl; - fe->cl = cl; return 0; err_out: - mutex_unlock(&cl->lock); + mutex_unlock(&pcl->lock); z_erofs_free_pcluster(pcl); return err; } @@ -535,9 +527,9 @@ static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe, struct erofs_workgroup *grp; int ret; - DBG_BUGON(fe->cl); + DBG_BUGON(fe->pcl); - /* must be Z_EROFS_PCLUSTER_TAIL or pointed to previous collection */ + /* must be Z_EROFS_PCLUSTER_TAIL or pointed to previous pcluster */ DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_NIL); DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED); @@ -554,14 +546,14 @@ static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe, fe->pcl = container_of(grp, struct z_erofs_pcluster, obj); } else { tailpacking: - ret = z_erofs_register_collection(fe, inode, map); + ret = z_erofs_register_pcluster(fe, inode, map); if (!ret) goto out; if (ret != -EEXIST) return ret; } - ret = z_erofs_lookup_collection(fe, inode, map); + ret = z_erofs_lookup_pcluster(fe, inode, map); if (ret) { erofs_workgroup_put(&fe->pcl->obj); return ret; @@ -569,7 +561,7 @@ tailpacking: out: z_erofs_pagevec_ctor_init(&fe->vector, Z_EROFS_NR_INLINE_PAGEVECS, - fe->cl->pagevec, fe->cl->vcnt); + fe->pcl->pagevec, fe->pcl->vcnt); /* since file-backed online pages are traversed in reverse order */ fe->icpage_ptr = fe->pcl->compressed_pages + z_erofs_pclusterpages(fe->pcl); @@ -582,48 +574,36 @@ out: */ static void z_erofs_rcu_callback(struct rcu_head *head) { - struct z_erofs_collection *const cl = - container_of(head, struct z_erofs_collection, rcu); - - z_erofs_free_pcluster(container_of(cl, struct z_erofs_pcluster, - primary_collection)); + z_erofs_free_pcluster(container_of(head, + struct z_erofs_pcluster, rcu)); } void erofs_workgroup_free_rcu(struct erofs_workgroup *grp) { struct z_erofs_pcluster *const pcl = container_of(grp, struct z_erofs_pcluster, obj); - struct z_erofs_collection *const cl = z_erofs_primarycollection(pcl); - call_rcu(&cl->rcu, z_erofs_rcu_callback); -} - -static void z_erofs_collection_put(struct z_erofs_collection *cl) -{ - struct z_erofs_pcluster *const pcl = - container_of(cl, struct z_erofs_pcluster, primary_collection); - - erofs_workgroup_put(&pcl->obj); + call_rcu(&pcl->rcu, z_erofs_rcu_callback); } static bool z_erofs_collector_end(struct z_erofs_decompress_frontend *fe) { - struct z_erofs_collection *cl = fe->cl; + struct z_erofs_pcluster *pcl = fe->pcl; - if (!cl) + if (!pcl) return false; z_erofs_pagevec_ctor_exit(&fe->vector, false); - mutex_unlock(&cl->lock); + mutex_unlock(&pcl->lock); /* * if all pending pages are added, don't hold its reference * any longer if the pcluster isn't hosted by ourselves. 
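[Editor's note: the RCU hunk just below benefits directly from folding the collection into the pcluster — with rcu_head embedded in the pcluster itself, the free callback becomes a single container_of(). The general pattern, with hypothetical names:

	struct foo {
		struct rcu_head rcu;
		/* ... payload ... */
	};

	static void foo_rcu_free(struct rcu_head *head)
	{
		kfree(container_of(head, struct foo, rcu));
	}

	/* caller, once the last reference is gone: */
	call_rcu(&f->rcu, foo_rcu_free);

The old code had to container_of() twice: first to the collection, then from the collection to the pcluster.]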
*/ if (fe->mode < COLLECT_PRIMARY_FOLLOWED_NOINPLACE) - z_erofs_collection_put(cl); + erofs_workgroup_put(&pcl->obj); - fe->cl = NULL; + fe->pcl = NULL; return true; } @@ -663,28 +643,23 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, repeat: cur = end - 1; - /* lucky, within the range of the current map_blocks */ - if (offset + cur >= map->m_la && - offset + cur < map->m_la + map->m_llen) { - /* didn't get a valid collection previously (very rare) */ - if (!fe->cl) - goto restart_now; - goto hitted; + if (offset + cur < map->m_la || + offset + cur >= map->m_la + map->m_llen) { + erofs_dbg("out-of-range map @ pos %llu", offset + cur); + + if (z_erofs_collector_end(fe)) + fe->backmost = false; + map->m_la = offset + cur; + map->m_llen = 0; + err = z_erofs_map_blocks_iter(inode, map, 0); + if (err) + goto err_out; + } else { + if (fe->pcl) + goto hitted; + /* didn't get a valid pcluster previously (very rare) */ } - /* go ahead the next map_blocks */ - erofs_dbg("%s: [out-of-range] pos %llu", __func__, offset + cur); - - if (z_erofs_collector_end(fe)) - fe->backmost = false; - - map->m_la = offset + cur; - map->m_llen = 0; - err = z_erofs_map_blocks_iter(inode, map, 0); - if (err) - goto err_out; - -restart_now: if (!(map->m_flags & EROFS_MAP_MAPPED)) goto hitted; @@ -766,7 +741,7 @@ retry: /* bump up the number of spiltted parts of a page */ ++spiltted; /* also update nr_pages */ - fe->cl->nr_pages = max_t(pgoff_t, fe->cl->nr_pages, index + 1); + fe->pcl->nr_pages = max_t(pgoff_t, fe->pcl->nr_pages, index + 1); next_part: /* can be used for verification */ map->m_llen = offset + cur - map->m_la; @@ -821,15 +796,13 @@ static int z_erofs_decompress_pcluster(struct super_block *sb, enum z_erofs_page_type page_type; bool overlapped, partial; - struct z_erofs_collection *cl; int err; might_sleep(); - cl = z_erofs_primarycollection(pcl); - DBG_BUGON(!READ_ONCE(cl->nr_pages)); + DBG_BUGON(!READ_ONCE(pcl->nr_pages)); - mutex_lock(&cl->lock); - nr_pages = cl->nr_pages; + mutex_lock(&pcl->lock); + nr_pages = pcl->nr_pages; if (nr_pages <= Z_EROFS_VMAP_ONSTACK_PAGES) { pages = pages_onstack; @@ -857,9 +830,9 @@ static int z_erofs_decompress_pcluster(struct super_block *sb, err = 0; z_erofs_pagevec_ctor_init(&ctor, Z_EROFS_NR_INLINE_PAGEVECS, - cl->pagevec, 0); + pcl->pagevec, 0); - for (i = 0; i < cl->vcnt; ++i) { + for (i = 0; i < pcl->vcnt; ++i) { unsigned int pagenr; page = z_erofs_pagevec_dequeue(&ctor, &page_type); @@ -945,11 +918,11 @@ static int z_erofs_decompress_pcluster(struct super_block *sb, goto out; llen = pcl->length >> Z_EROFS_PCLUSTER_LENGTH_BIT; - if (nr_pages << PAGE_SHIFT >= cl->pageofs + llen) { + if (nr_pages << PAGE_SHIFT >= pcl->pageofs_out + llen) { outputsize = llen; partial = !(pcl->length & Z_EROFS_PCLUSTER_FULL_LENGTH); } else { - outputsize = (nr_pages << PAGE_SHIFT) - cl->pageofs; + outputsize = (nr_pages << PAGE_SHIFT) - pcl->pageofs_out; partial = true; } @@ -963,7 +936,7 @@ static int z_erofs_decompress_pcluster(struct super_block *sb, .in = compressed_pages, .out = pages, .pageofs_in = pcl->pageofs_in, - .pageofs_out = cl->pageofs, + .pageofs_out = pcl->pageofs_out, .inputsize = inputsize, .outputsize = outputsize, .alg = pcl->algorithmformat, @@ -1012,16 +985,12 @@ out: else if (pages != pages_onstack) kvfree(pages); - cl->nr_pages = 0; - cl->vcnt = 0; + pcl->nr_pages = 0; + pcl->vcnt = 0; - /* all cl locks MUST be taken before the following line */ + /* pcluster lock MUST be taken before the following line */ WRITE_ONCE(pcl->next, 
Z_EROFS_PCLUSTER_NIL); - - /* all cl locks SHOULD be released right now */ - mutex_unlock(&cl->lock); - - z_erofs_collection_put(cl); + mutex_unlock(&pcl->lock); return err; } @@ -1043,6 +1012,7 @@ static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io, owned = READ_ONCE(pcl->next); z_erofs_decompress_pcluster(io->sb, pcl, pagepool); + erofs_workgroup_put(&pcl->obj); } } @@ -1466,22 +1436,19 @@ static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f, struct page *page; page = erofs_grab_cache_page_nowait(inode->i_mapping, index); - if (!page) - goto skip; - - if (PageUptodate(page)) { - unlock_page(page); + if (page) { + if (PageUptodate(page)) { + unlock_page(page); + } else { + err = z_erofs_do_read_page(f, page, pagepool); + if (err) + erofs_err(inode->i_sb, + "readmore error at page %lu @ nid %llu", + index, EROFS_I(inode)->nid); + } put_page(page); - goto skip; } - err = z_erofs_do_read_page(f, page, pagepool); - if (err) - erofs_err(inode->i_sb, - "readmore error at page %lu @ nid %llu", - index, EROFS_I(inode)->nid); - put_page(page); -skip: if (cur < PAGE_SIZE) break; cur = (index << PAGE_SHIFT) - 1; diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h index 800b11c53f57..58053bb5066f 100644 --- a/fs/erofs/zdata.h +++ b/fs/erofs/zdata.h @@ -12,21 +12,40 @@ #define Z_EROFS_PCLUSTER_MAX_PAGES (Z_EROFS_PCLUSTER_MAX_SIZE / PAGE_SIZE) #define Z_EROFS_NR_INLINE_PAGEVECS 3 +#define Z_EROFS_PCLUSTER_FULL_LENGTH 0x00000001 +#define Z_EROFS_PCLUSTER_LENGTH_BIT 1 + +/* + * let's leave a type here in case of introducing + * another tagged pointer later. + */ +typedef void *z_erofs_next_pcluster_t; + /* * Structure fields follow one of the following exclusion rules. * * I: Modifiable by initialization/destruction paths and read-only * for everyone else; * - * L: Field should be protected by pageset lock; + * L: Field should be protected by the pcluster lock; * * A: Field should be accessed / updated in atomic for parallelized code. */ -struct z_erofs_collection { +struct z_erofs_pcluster { + struct erofs_workgroup obj; struct mutex lock; + /* A: point to next chained pcluster or TAILs */ + z_erofs_next_pcluster_t next; + + /* A: lower limit of decompressed length and if full length or not */ + unsigned int length; + /* I: page offset of start position of decompression */ - unsigned short pageofs; + unsigned short pageofs_out; + + /* I: page offset of inline compressed data */ + unsigned short pageofs_in; /* L: maximum relative page index in pagevec[] */ unsigned short nr_pages; @@ -41,29 +60,6 @@ struct z_erofs_collection { /* I: can be used to free the pcluster by RCU. */ struct rcu_head rcu; }; -}; - -#define Z_EROFS_PCLUSTER_FULL_LENGTH 0x00000001 -#define Z_EROFS_PCLUSTER_LENGTH_BIT 1 - -/* - * let's leave a type here in case of introducing - * another tagged pointer later. 
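[Editor's note: z_erofs_next_pcluster_t stays a bare void * so that a tagged pointer could be adopted later without touching every user, as the comment above says. A low-bit tagging scheme of the kind it anticipates — purely illustrative, not part of this patch:

	/* assumes pointers are at least 2-byte aligned, so bit 0 is free */
	static inline void *tag_next(void *p)
	{
		return (void *)((unsigned long)p | 1UL);
	}

	static inline bool next_is_tagged(void *p)
	{
		return (unsigned long)p & 1UL;
	}

	static inline void *untag_next(void *p)
	{
		return (void *)((unsigned long)p & ~1UL);
	}
]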
- */ -typedef void *z_erofs_next_pcluster_t; - -struct z_erofs_pcluster { - struct erofs_workgroup obj; - struct z_erofs_collection primary_collection; - - /* A: point to next chained pcluster or TAILs */ - z_erofs_next_pcluster_t next; - - /* A: lower limit of decompressed length and if full length or not */ - unsigned int length; - - /* I: page offset of inline compressed data */ - unsigned short pageofs_in; union { /* I: physical cluster size in pages */ @@ -80,8 +76,6 @@ struct z_erofs_pcluster { struct page *compressed_pages[]; }; -#define z_erofs_primarycollection(pcluster) (&(pcluster)->primary_collection) - /* let's avoid the valid 32-bit kernel addresses */ /* the chained workgroup has't submitted io (still open) */ diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 456c1e89386a..6d8b2bf14de0 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -98,13 +98,7 @@ repeat: } if (unlikely(!PageUptodate(page))) { - if (page->index == sbi->metapage_eio_ofs) { - if (sbi->metapage_eio_cnt++ == MAX_RETRY_META_PAGE_EIO) - set_ckpt_flags(sbi, CP_ERROR_FLAG); - } else { - sbi->metapage_eio_ofs = page->index; - sbi->metapage_eio_cnt = 0; - } + f2fs_handle_page_eio(sbi, page->index, META); f2fs_put_page(page, 1); return ERR_PTR(-EIO); } @@ -158,7 +152,7 @@ static bool __is_bitmap_valid(struct f2fs_sb_info *sbi, block_t blkaddr, f2fs_err(sbi, "Inconsistent error blkaddr:%u, sit bitmap:%d", blkaddr, exist); set_sbi_flag(sbi, SBI_NEED_FSCK); - WARN_ON(1); + dump_stack(); } return exist; } @@ -196,7 +190,7 @@ bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, f2fs_warn(sbi, "access invalid blkaddr:%u", blkaddr); set_sbi_flag(sbi, SBI_NEED_FSCK); - WARN_ON(1); + dump_stack(); return false; } else { return __is_bitmap_valid(sbi, blkaddr, type); @@ -1010,9 +1004,7 @@ static void __add_dirty_inode(struct inode *inode, enum inode_type type) return; set_inode_flag(inode, flag); - if (!f2fs_is_volatile_file(inode)) - list_add_tail(&F2FS_I(inode)->dirty_list, - &sbi->inode_list[type]); + list_add_tail(&F2FS_I(inode)->dirty_list, &sbi->inode_list[type]); stat_inc_dirty_inode(sbi, type); } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 28a1ddf56407..7d524d9d183d 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -69,8 +69,7 @@ static bool __is_cp_guaranteed(struct page *page) if (f2fs_is_compressed_page(page)) return false; - if ((S_ISREG(inode->i_mode) && - (f2fs_is_atomic_file(inode) || IS_NOQUOTA(inode))) || + if ((S_ISREG(inode->i_mode) && IS_NOQUOTA(inode)) || page_private_gcing(page)) return true; return false; @@ -589,6 +588,34 @@ static bool __has_merged_page(struct bio *bio, struct inode *inode, return false; } +int f2fs_init_write_merge_io(struct f2fs_sb_info *sbi) +{ + int i; + + for (i = 0; i < NR_PAGE_TYPE; i++) { + int n = (i == META) ? 
1 : NR_TEMP_TYPE; + int j; + + sbi->write_io[i] = f2fs_kmalloc(sbi, + array_size(n, sizeof(struct f2fs_bio_info)), + GFP_KERNEL); + if (!sbi->write_io[i]) + return -ENOMEM; + + for (j = HOT; j < n; j++) { + init_f2fs_rwsem(&sbi->write_io[i][j].io_rwsem); + sbi->write_io[i][j].sbi = sbi; + sbi->write_io[i][j].bio = NULL; + spin_lock_init(&sbi->write_io[i][j].io_lock); + INIT_LIST_HEAD(&sbi->write_io[i][j].io_list); + INIT_LIST_HEAD(&sbi->write_io[i][j].bio_list); + init_f2fs_rwsem(&sbi->write_io[i][j].bio_list_lock); + } + } + + return 0; +} + static void __f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type, enum temp_type temp) { @@ -2568,7 +2595,12 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio) bool ipu_force = false; int err = 0; - set_new_dnode(&dn, inode, NULL, NULL, 0); + /* Use COW inode to make dnode_of_data for atomic write */ + if (f2fs_is_atomic_file(inode)) + set_new_dnode(&dn, F2FS_I(inode)->cow_inode, NULL, NULL, 0); + else + set_new_dnode(&dn, inode, NULL, NULL, 0); + if (need_inplace_update(fio) && f2fs_lookup_extent_cache(inode, page->index, &ei)) { fio->old_blkaddr = ei.blk + page->index - ei.fofs; @@ -2605,6 +2637,7 @@ got_it: err = -EFSCORRUPTED; goto out_writepage; } + /* * If current allocation needs SSR, * it had better in-place writes for updated data. @@ -2741,11 +2774,6 @@ int f2fs_write_single_data_page(struct page *page, int *submitted, write: if (f2fs_is_drop_cache(inode)) goto out; - /* we should not write 0'th page having journal header */ - if (f2fs_is_volatile_file(inode) && (!page->index || - (!wbc->for_reclaim && - f2fs_available_free_memory(sbi, BASE_CHECK)))) - goto redirty_out; /* Dentry/quota blocks are controlled by checkpoint */ if (S_ISDIR(inode->i_mode) || IS_NOQUOTA(inode)) { @@ -3318,6 +3346,100 @@ unlock_out: return err; } +static int __find_data_block(struct inode *inode, pgoff_t index, + block_t *blk_addr) +{ + struct dnode_of_data dn; + struct page *ipage; + struct extent_info ei = {0, }; + int err = 0; + + ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino); + if (IS_ERR(ipage)) + return PTR_ERR(ipage); + + set_new_dnode(&dn, inode, ipage, ipage, 0); + + if (f2fs_lookup_extent_cache(inode, index, &ei)) { + dn.data_blkaddr = ei.blk + index - ei.fofs; + } else { + /* hole case */ + err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE); + if (err) { + dn.data_blkaddr = NULL_ADDR; + err = 0; + } + } + *blk_addr = dn.data_blkaddr; + f2fs_put_dnode(&dn); + return err; +} + +static int __reserve_data_block(struct inode *inode, pgoff_t index, + block_t *blk_addr, bool *node_changed) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct dnode_of_data dn; + struct page *ipage; + int err = 0; + + f2fs_do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true); + + ipage = f2fs_get_node_page(sbi, inode->i_ino); + if (IS_ERR(ipage)) { + err = PTR_ERR(ipage); + goto unlock_out; + } + set_new_dnode(&dn, inode, ipage, ipage, 0); + + err = f2fs_get_block(&dn, index); + + *blk_addr = dn.data_blkaddr; + *node_changed = dn.node_changed; + f2fs_put_dnode(&dn); + +unlock_out: + f2fs_do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false); + return err; +} + +static int prepare_atomic_write_begin(struct f2fs_sb_info *sbi, + struct page *page, loff_t pos, unsigned int len, + block_t *blk_addr, bool *node_changed) +{ + struct inode *inode = page->mapping->host; + struct inode *cow_inode = F2FS_I(inode)->cow_inode; + pgoff_t index = page->index; + int err = 0; + block_t ori_blk_addr; + + /* If pos is beyond the end of file, reserve a new block 
in COW inode */ + if ((pos & PAGE_MASK) >= i_size_read(inode)) + return __reserve_data_block(cow_inode, index, blk_addr, + node_changed); + + /* Look for the block in COW inode first */ + err = __find_data_block(cow_inode, index, blk_addr); + if (err) + return err; + else if (*blk_addr != NULL_ADDR) + return 0; + + /* Look for the block in the original inode */ + err = __find_data_block(inode, index, &ori_blk_addr); + if (err) + return err; + + /* Finally, we should reserve a new block in COW inode for the update */ + err = __reserve_data_block(cow_inode, index, blk_addr, node_changed); + if (err) + return err; + + if (ori_blk_addr != NULL_ADDR) + *blk_addr = ori_blk_addr; + return 0; +} + static int f2fs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, struct page **pagep, void **fsdata) { @@ -3325,7 +3447,7 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct page *page = NULL; pgoff_t index = ((unsigned long long) pos) >> PAGE_SHIFT; - bool need_balance = false, drop_atomic = false; + bool need_balance = false; block_t blkaddr = NULL_ADDR; int err = 0; @@ -3336,14 +3458,6 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, goto fail; } - if ((f2fs_is_atomic_file(inode) && - !f2fs_available_free_memory(sbi, INMEM_PAGES)) || - is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST)) { - err = -ENOMEM; - drop_atomic = true; - goto fail; - } - /* * We should check this at this moment to avoid deadlock on inode page * and #0 page. The locking rule for inline_data conversion should be: @@ -3391,7 +3505,11 @@ repeat: *pagep = page; - err = prepare_write_begin(sbi, page, pos, len, + if (f2fs_is_atomic_file(inode)) + err = prepare_atomic_write_begin(sbi, page, pos, len, + &blkaddr, &need_balance); + else + err = prepare_write_begin(sbi, page, pos, len, &blkaddr, &need_balance); if (err) goto fail; @@ -3447,8 +3565,6 @@ repeat: fail: f2fs_put_page(page, 1); f2fs_write_failed(inode, pos + len); - if (drop_atomic) - f2fs_drop_inmem_pages_all(sbi, false); return err; } @@ -3492,8 +3608,12 @@ static int f2fs_write_end(struct file *file, set_page_dirty(page); if (pos + copied > i_size_read(inode) && - !f2fs_verity_in_progress(inode)) + !f2fs_verity_in_progress(inode)) { f2fs_i_size_write(inode, pos + copied); + if (f2fs_is_atomic_file(inode)) + f2fs_i_size_write(F2FS_I(inode)->cow_inode, + pos + copied); + } unlock_out: f2fs_put_page(page, 1); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); @@ -3526,9 +3646,6 @@ void f2fs_invalidate_folio(struct folio *folio, size_t offset, size_t length) inode->i_ino == F2FS_COMPRESS_INO(sbi)) clear_page_private_data(&folio->page); - if (page_private_atomic(&folio->page)) - return f2fs_drop_inmem_page(inode, &folio->page); - folio_detach_private(folio); } @@ -3540,10 +3657,6 @@ bool f2fs_release_folio(struct folio *folio, gfp_t wait) if (folio_test_dirty(folio)) return false; - /* This is atomic written page, keep Private */ - if (page_private_atomic(&folio->page)) - return false; - sbi = F2FS_M_SB(folio->mapping); if (test_opt(sbi, COMPRESS_CACHE)) { struct inode *inode = folio->mapping->host; @@ -3569,18 +3682,6 @@ static bool f2fs_dirty_data_folio(struct address_space *mapping, folio_mark_uptodate(folio); BUG_ON(folio_test_swapcache(folio)); - if (f2fs_is_atomic_file(inode) && !f2fs_is_commit_atomic_write(inode)) { - if (!page_private_atomic(&folio->page)) { - f2fs_register_inmem_page(inode, &folio->page); - return true; - } 
- /* - * Previously, this page has been registered, we just - * return here. - */ - return false; - } - if (!folio_test_dirty(folio)) { filemap_dirty_folio(mapping, folio); f2fs_update_dirty_folio(inode, folio); @@ -3660,42 +3761,14 @@ out: int f2fs_migrate_page(struct address_space *mapping, struct page *newpage, struct page *page, enum migrate_mode mode) { - int rc, extra_count; - struct f2fs_inode_info *fi = F2FS_I(mapping->host); - bool atomic_written = page_private_atomic(page); + int rc, extra_count = 0; BUG_ON(PageWriteback(page)); - /* migrating an atomic written page is safe with the inmem_lock hold */ - if (atomic_written) { - if (mode != MIGRATE_SYNC) - return -EBUSY; - if (!mutex_trylock(&fi->inmem_lock)) - return -EAGAIN; - } - - /* one extra reference was held for atomic_write page */ - extra_count = atomic_written ? 1 : 0; rc = migrate_page_move_mapping(mapping, newpage, page, extra_count); - if (rc != MIGRATEPAGE_SUCCESS) { - if (atomic_written) - mutex_unlock(&fi->inmem_lock); + if (rc != MIGRATEPAGE_SUCCESS) return rc; - } - - if (atomic_written) { - struct inmem_pages *cur; - - list_for_each_entry(cur, &fi->inmem_pages, list) - if (cur->page == page) { - cur->page = newpage; - break; - } - mutex_unlock(&fi->inmem_lock); - put_page(page); - get_page(newpage); - } /* guarantee to start from no stale private field */ set_page_private(newpage, 0); diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index fcdf253cd211..c92625ef16d0 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -91,11 +91,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->ndirty_files = sbi->ndirty_inode[FILE_INODE]; si->nquota_files = sbi->nquota_files; si->ndirty_all = sbi->ndirty_inode[DIRTY_META]; - si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES); si->aw_cnt = sbi->atomic_files; - si->vw_cnt = atomic_read(&sbi->vw_cnt); si->max_aw_cnt = atomic_read(&sbi->max_aw_cnt); - si->max_vw_cnt = atomic_read(&sbi->max_vw_cnt); si->nr_dio_read = get_pages(sbi, F2FS_DIO_READ); si->nr_dio_write = get_pages(sbi, F2FS_DIO_WRITE); si->nr_wb_cp_data = get_pages(sbi, F2FS_WB_CP_DATA); @@ -167,8 +164,6 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->alloc_nids = NM_I(sbi)->nid_cnt[PREALLOC_NID]; si->io_skip_bggc = sbi->io_skip_bggc; si->other_skip_bggc = sbi->other_skip_bggc; - si->skipped_atomic_files[BG_GC] = sbi->skipped_atomic_files[BG_GC]; - si->skipped_atomic_files[FG_GC] = sbi->skipped_atomic_files[FG_GC]; si->util_free = (int)(free_user_blocks(sbi) >> sbi->log_blocks_per_seg) * 100 / (int)(sbi->user_block_count >> sbi->log_blocks_per_seg) / 2; @@ -296,7 +291,6 @@ get_cache: sizeof(struct nat_entry); si->cache_mem += NM_I(sbi)->nat_cnt[DIRTY_NAT] * sizeof(struct nat_entry_set); - si->cache_mem += si->inmem_pages * sizeof(struct inmem_pages); for (i = 0; i < MAX_INO_ENTRY; i++) si->cache_mem += sbi->im[i].ino_num * sizeof(struct ino_entry); si->cache_mem += atomic_read(&sbi->total_ext_tree) * @@ -491,10 +485,6 @@ static int stat_show(struct seq_file *s, void *v) si->bg_data_blks); seq_printf(s, " - node blocks : %d (%d)\n", si->node_blks, si->bg_node_blks); - seq_printf(s, "Skipped : atomic write %llu (%llu)\n", - si->skipped_atomic_files[BG_GC] + - si->skipped_atomic_files[FG_GC], - si->skipped_atomic_files[BG_GC]); seq_printf(s, "BG skip : IO: %u, Other: %u\n", si->io_skip_bggc, si->other_skip_bggc); seq_puts(s, "\nExtent Cache:\n"); @@ -519,10 +509,8 @@ static int stat_show(struct seq_file *s, void *v) si->flush_list_empty, si->nr_discarding, si->nr_discarded, 
si->nr_discard_cmd, si->undiscard_blks); - seq_printf(s, " - inmem: %4d, atomic IO: %4d (Max. %4d), " - "volatile IO: %4d (Max. %4d)\n", - si->inmem_pages, si->aw_cnt, si->max_aw_cnt, - si->vw_cnt, si->max_vw_cnt); + seq_printf(s, " - atomic IO: %4d (Max. %4d)\n", + si->aw_cnt, si->max_aw_cnt); seq_printf(s, " - compress: %4d, hit:%8d\n", si->compress_pages, si->compress_page_hit); seq_printf(s, " - nodes: %4d in %4d\n", si->ndirty_node, si->node_pages); @@ -623,9 +611,7 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) for (i = META_CP; i < META_MAX; i++) atomic_set(&sbi->meta_count[i], 0); - atomic_set(&sbi->vw_cnt, 0); atomic_set(&sbi->max_aw_cnt, 0); - atomic_set(&sbi->max_vw_cnt, 0); raw_spin_lock_irqsave(&f2fs_stat_lock, flags); list_add_tail(&si->stat_list, &f2fs_stat_list); diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index a0e51937d92e..d5bd7932fb64 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -82,7 +82,8 @@ int f2fs_init_casefolded_name(const struct inode *dir, #if IS_ENABLED(CONFIG_UNICODE) struct super_block *sb = dir->i_sb; - if (IS_CASEFOLDED(dir)) { + if (IS_CASEFOLDED(dir) && + !is_dot_dotdot(fname->usr_fname->name, fname->usr_fname->len)) { fname->cf_name.name = f2fs_kmem_cache_alloc(f2fs_cf_name_slab, GFP_NOFS, false, F2FS_SB(sb)); if (!fname->cf_name.name) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 10d1f138d14f..d9bbecd008d2 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -509,11 +509,11 @@ struct f2fs_filename { #if IS_ENABLED(CONFIG_UNICODE) /* * For casefolded directories: the casefolded name, but it's left NULL - * if the original name is not valid Unicode, if the directory is both - * casefolded and encrypted and its encryption key is unavailable, or if - * the filesystem is doing an internal operation where usr_fname is also - * NULL. In all these cases we fall back to treating the name as an - * opaque byte sequence. + * if the original name is not valid Unicode, if the original name is + * "." or "..", if the directory is both casefolded and encrypted and + * its encryption key is unavailable, or if the filesystem is doing an + * internal operation where usr_fname is also NULL. In all these cases + * we fall back to treating the name as an opaque byte sequence. 
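[Editor's note on the dir.c hunk above: "." and ".." must always match byte-for-byte, never case-insensitively, so f2fs_init_casefolded_name() now skips allocating a casefolded copy for them. The guard is the classic dot/dotdot test; the in-tree is_dot_dotdot() helper has roughly this shape:

	static inline bool is_dot_dotdot(const unsigned char *name, size_t len)
	{
		return (len == 1 && name[0] == '.') ||
		       (len == 2 && name[0] == '.' && name[1] == '.');
	}
]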
*/ struct fscrypt_str cf_name; #endif @@ -579,8 +579,8 @@ enum { /* maximum retry quota flush count */ #define DEFAULT_RETRY_QUOTA_FLUSH_COUNT 8 -/* maximum retry of EIO'ed meta page */ -#define MAX_RETRY_META_PAGE_EIO 100 +/* maximum retry of EIO'ed page */ +#define MAX_RETRY_PAGE_EIO 100 #define F2FS_LINK_MAX 0xffffffff /* maximum link count per file */ @@ -717,7 +717,6 @@ enum { enum { GC_FAILURE_PIN, - GC_FAILURE_ATOMIC, MAX_GC_FAILURE }; @@ -739,8 +738,6 @@ enum { FI_UPDATE_WRITE, /* inode has in-place-update data */ FI_NEED_IPU, /* used for ipu per file */ FI_ATOMIC_FILE, /* indicate atomic file */ - FI_ATOMIC_COMMIT, /* indicate the state of atomical committing */ - FI_VOLATILE_FILE, /* indicate volatile file */ FI_FIRST_BLOCK_WRITTEN, /* indicate #0 data block was written */ FI_DROP_CACHE, /* drop dirty page cache */ FI_DATA_EXIST, /* indicate data exists */ @@ -753,7 +750,6 @@ enum { FI_EXTRA_ATTR, /* indicate file has extra attribute */ FI_PROJ_INHERIT, /* indicate file inherits projectid */ FI_PIN_FILE, /* indicate file should not be gced */ - FI_ATOMIC_REVOKE_REQUEST, /* request to drop atomic data */ FI_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */ FI_COMPRESSED_FILE, /* indicate file's data can be compressed */ FI_COMPRESS_CORRUPT, /* indicate compressed cluster is corrupted */ @@ -795,11 +791,9 @@ struct f2fs_inode_info { #endif struct list_head dirty_list; /* dirty list for dirs and files */ struct list_head gdirty_list; /* linked in global dirty list */ - struct list_head inmem_ilist; /* list for inmem inodes */ - struct list_head inmem_pages; /* inmemory pages managed by f2fs */ - struct task_struct *inmem_task; /* store inmemory task */ - struct mutex inmem_lock; /* lock for inmemory pages */ + struct task_struct *atomic_write_task; /* store atomic write task */ struct extent_tree *extent_tree; /* cached extent_tree entry */ + struct inode *cow_inode; /* copy-on-write inode for atomic write */ /* avoid racing between foreground op and gc */ struct f2fs_rwsem i_gc_rwsem[2]; @@ -1093,7 +1087,6 @@ enum count_type { F2FS_DIRTY_QDATA, F2FS_DIRTY_NODES, F2FS_DIRTY_META, - F2FS_INMEM_PAGES, F2FS_DIRTY_IMETA, F2FS_WB_CP_DATA, F2FS_WB_DATA, @@ -1118,16 +1111,12 @@ enum count_type { */ #define PAGE_TYPE_OF_BIO(type) ((type) > META ? META : (type)) enum page_type { - DATA, - NODE, + DATA = 0, + NODE = 1, /* should not change this */ META, NR_PAGE_TYPE, META_FLUSH, - INMEM, /* the below types are used by tracepoints only. */ - INMEM_DROP, - INMEM_INVALIDATE, - INMEM_REVOKE, - IPU, + IPU, /* the below types are used by tracepoints only. 
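[Editor's note: the "should not change this" remark pins DATA and NODE to 0 and 1 because they index sbi->write_io[] as laid out by f2fs_init_write_merge_io() further up — NR_TEMP_TYPE queues each for DATA and NODE, a single queue for META. A sketch of the indexing that depends on that layout (temp is HOT/WARM/COLD; exact call sites may differ):

	enum page_type btype = PAGE_TYPE_OF_BIO(fio->type); /* META_FLUSH -> META */
	struct f2fs_bio_info *io =
		sbi->write_io[btype] + (btype == META ? 0 : temp);
]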
*/ OPU, }; @@ -1277,6 +1266,15 @@ struct atgc_management { unsigned long long age_threshold; /* age threshold */ }; +struct f2fs_gc_control { + unsigned int victim_segno; /* target victim segment number */ + int init_gc_type; /* FG_GC or BG_GC */ + bool no_bg_gc; /* check the space and stop bg_gc */ + bool should_migrate_blocks; /* should migrate blocks */ + bool err_gc_skipped; /* return EAGAIN if GC skipped */ + unsigned int nr_free_secs; /* # of free sections to do GC */ +}; + /* For s_flag in struct f2fs_sb_info */ enum { SBI_IS_DIRTY, /* dirty flag for checkpoint */ @@ -1615,8 +1613,8 @@ struct f2fs_sb_info { /* keep migration IO order for LFS mode */ struct f2fs_rwsem io_order_lock; mempool_t *write_io_dummy; /* Dummy pages */ - pgoff_t metapage_eio_ofs; /* EIO page offset */ - int metapage_eio_cnt; /* EIO count */ + pgoff_t page_eio_ofs[NR_PAGE_TYPE]; /* EIO page offset */ + int page_eio_cnt[NR_PAGE_TYPE]; /* EIO count */ /* for checkpoint */ struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */ @@ -1719,7 +1717,6 @@ struct f2fs_sb_info { /* for skip statistic */ unsigned int atomic_files; /* # of opened atomic file */ - unsigned long long skipped_atomic_files[2]; /* FG_GC and BG_GC */ unsigned long long skipped_gc_rwsem; /* FG_GC only */ /* threshold for gc trials on pinned files */ @@ -1750,9 +1747,7 @@ struct f2fs_sb_info { atomic_t inline_dir; /* # of inline_dentry inodes */ atomic_t compr_inode; /* # of compressed inodes */ atomic64_t compr_blocks; /* # of compressed blocks */ - atomic_t vw_cnt; /* # of volatile writes */ atomic_t max_aw_cnt; /* max # of atomic writes */ - atomic_t max_vw_cnt; /* max # of volatile writes */ unsigned int io_skip_bggc; /* skip background gc for in-flight IO */ unsigned int other_skip_bggc; /* skip background gc for other reasons */ unsigned int ndirty_inode[NR_INODE_TYPE]; /* # of dirty inodes */ @@ -1763,7 +1758,7 @@ struct f2fs_sb_info { unsigned int data_io_flag; unsigned int node_io_flag; - /* For sysfs suppport */ + /* For sysfs support */ struct kobject s_kobj; /* /sys/fs/f2fs/ */ struct completion s_kobj_unregister; @@ -2606,11 +2601,17 @@ static inline void dec_valid_node_count(struct f2fs_sb_info *sbi, { spin_lock(&sbi->stat_lock); - f2fs_bug_on(sbi, !sbi->total_valid_block_count); - f2fs_bug_on(sbi, !sbi->total_valid_node_count); + if (unlikely(!sbi->total_valid_block_count || + !sbi->total_valid_node_count)) { + f2fs_warn(sbi, "dec_valid_node_count: inconsistent block counts, total_valid_block:%u, total_valid_node:%u", + sbi->total_valid_block_count, + sbi->total_valid_node_count); + set_sbi_flag(sbi, SBI_NEED_FSCK); + } else { + sbi->total_valid_block_count--; + sbi->total_valid_node_count--; + } - sbi->total_valid_node_count--; - sbi->total_valid_block_count--; if (sbi->reserved_blocks && sbi->current_reserved_blocks < sbi->reserved_blocks) sbi->current_reserved_blocks++; @@ -3173,6 +3174,10 @@ static inline int inline_xattr_size(struct inode *inode) return 0; } +/* + * Notice: check inline_data flag without inode page lock is unsafe. + * It could change at any time by f2fs_convert_inline_page(). 
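[Editor's note: struct f2fs_gc_control above replaces the old bool/segno argument list of f2fs_gc(). Every caller in this patch builds one with designated initializers; a representative (hypothetical) background invocation looks like:

	struct f2fs_gc_control gc_control = {
		.victim_segno = NULL_SEGNO,
		.init_gc_type = BG_GC,
		.no_bg_gc = false,
		.should_migrate_blocks = false,
		.err_gc_skipped = false,
		.nr_free_secs = 0,
	};

	err = f2fs_gc(sbi, &gc_control);

New knobs can now be added without churning every call site again.]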
+ */ static inline int f2fs_has_inline_data(struct inode *inode) { return is_inode_flag_set(inode, FI_INLINE_DATA); @@ -3203,16 +3208,6 @@ static inline bool f2fs_is_atomic_file(struct inode *inode) return is_inode_flag_set(inode, FI_ATOMIC_FILE); } -static inline bool f2fs_is_commit_atomic_write(struct inode *inode) -{ - return is_inode_flag_set(inode, FI_ATOMIC_COMMIT); -} - -static inline bool f2fs_is_volatile_file(struct inode *inode) -{ - return is_inode_flag_set(inode, FI_VOLATILE_FILE); -} - static inline bool f2fs_is_first_block_written(struct inode *inode) { return is_inode_flag_set(inode, FI_FIRST_BLOCK_WRITTEN); @@ -3445,6 +3440,8 @@ void f2fs_handle_failed_inode(struct inode *inode); int f2fs_update_extension_list(struct f2fs_sb_info *sbi, const char *name, bool hot, bool set); struct dentry *f2fs_get_parent(struct dentry *child); +int f2fs_get_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, + struct inode **new_inode); /* * dir.c @@ -3580,11 +3577,8 @@ void f2fs_destroy_node_manager_caches(void); * segment.c */ bool f2fs_need_SSR(struct f2fs_sb_info *sbi); -void f2fs_register_inmem_page(struct inode *inode, struct page *page); -void f2fs_drop_inmem_pages_all(struct f2fs_sb_info *sbi, bool gc_failure); -void f2fs_drop_inmem_pages(struct inode *inode); -void f2fs_drop_inmem_page(struct inode *inode, struct page *page); -int f2fs_commit_inmem_pages(struct inode *inode); +int f2fs_commit_atomic_write(struct inode *inode); +void f2fs_abort_atomic_write(struct inode *inode, bool clean); void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need); void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg); int f2fs_issue_flush(struct f2fs_sb_info *sbi, nid_t ino); @@ -3726,6 +3720,7 @@ int f2fs_init_bio_entry_cache(void); void f2fs_destroy_bio_entry_cache(void); void f2fs_submit_bio(struct f2fs_sb_info *sbi, struct bio *bio, enum page_type type); +int f2fs_init_write_merge_io(struct f2fs_sb_info *sbi); void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type); void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi, struct inode *inode, struct page *page, @@ -3787,8 +3782,7 @@ extern const struct iomap_ops f2fs_iomap_ops; int f2fs_start_gc_thread(struct f2fs_sb_info *sbi); void f2fs_stop_gc_thread(struct f2fs_sb_info *sbi); block_t f2fs_start_bidx_of_node(unsigned int node_ofs, struct inode *inode); -int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background, bool force, - unsigned int segno); +int f2fs_gc(struct f2fs_sb_info *sbi, struct f2fs_gc_control *gc_control); void f2fs_build_gc_manager(struct f2fs_sb_info *sbi); int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count); int __init f2fs_create_garbage_collection_cache(void); @@ -3816,7 +3810,6 @@ struct f2fs_stat_info { int ext_tree, zombie_tree, ext_node; int ndirty_node, ndirty_dent, ndirty_meta, ndirty_imeta; int ndirty_data, ndirty_qdata; - int inmem_pages; unsigned int ndirty_dirs, ndirty_files, nquota_files, ndirty_all; int nats, dirty_nats, sits, dirty_sits; int free_nids, avail_nids, alloc_nids; @@ -3834,7 +3827,7 @@ struct f2fs_stat_info { int inline_xattr, inline_inode, inline_dir, append, update, orphans; int compr_inode; unsigned long long compr_blocks; - int aw_cnt, max_aw_cnt, vw_cnt, max_vw_cnt; + int aw_cnt, max_aw_cnt; unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks; unsigned int bimodal, avg_vblocks; int util_free, util_valid, util_invalid; @@ -3846,7 +3839,6 @@ struct f2fs_stat_info { int bg_node_segs, bg_data_segs; int 
tot_blks, data_blks, node_blks; int bg_data_blks, bg_node_blks; - unsigned long long skipped_atomic_files[2]; int curseg[NR_CURSEG_TYPE]; int cursec[NR_CURSEG_TYPE]; int curzone[NR_CURSEG_TYPE]; @@ -3946,17 +3938,6 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) if (cur > max) \ atomic_set(&F2FS_I_SB(inode)->max_aw_cnt, cur); \ } while (0) -#define stat_inc_volatile_write(inode) \ - (atomic_inc(&F2FS_I_SB(inode)->vw_cnt)) -#define stat_dec_volatile_write(inode) \ - (atomic_dec(&F2FS_I_SB(inode)->vw_cnt)) -#define stat_update_max_volatile_write(inode) \ - do { \ - int cur = atomic_read(&F2FS_I_SB(inode)->vw_cnt); \ - int max = atomic_read(&F2FS_I_SB(inode)->max_vw_cnt); \ - if (cur > max) \ - atomic_set(&F2FS_I_SB(inode)->max_vw_cnt, cur); \ - } while (0) #define stat_inc_seg_count(sbi, type, gc_type) \ do { \ struct f2fs_stat_info *si = F2FS_STAT(sbi); \ @@ -4018,9 +3999,6 @@ void f2fs_update_sit_info(struct f2fs_sb_info *sbi); #define stat_add_compr_blocks(inode, blocks) do { } while (0) #define stat_sub_compr_blocks(inode, blocks) do { } while (0) #define stat_update_max_atomic_write(inode) do { } while (0) -#define stat_inc_volatile_write(inode) do { } while (0) -#define stat_dec_volatile_write(inode) do { } while (0) -#define stat_update_max_volatile_write(inode) do { } while (0) #define stat_inc_meta_count(sbi, blkaddr) do { } while (0) #define stat_inc_seg_type(sbi, curseg) do { } while (0) #define stat_inc_block_count(sbi, curseg) do { } while (0) @@ -4053,6 +4031,7 @@ extern struct kmem_cache *f2fs_inode_entry_slab; * inline.c */ bool f2fs_may_inline_data(struct inode *inode); +bool f2fs_sanity_check_inline_data(struct inode *inode); bool f2fs_may_inline_dentry(struct inode *inode); void f2fs_do_read_inline_data(struct page *page, struct page *ipage); void f2fs_truncate_inline_inode(struct inode *inode, @@ -4422,8 +4401,7 @@ static inline bool f2fs_lfs_mode(struct f2fs_sb_info *sbi) static inline bool f2fs_may_compress(struct inode *inode) { if (IS_SWAPFILE(inode) || f2fs_is_pinned_file(inode) || - f2fs_is_atomic_file(inode) || - f2fs_is_volatile_file(inode)) + f2fs_is_atomic_file(inode)) return false; return S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode); } @@ -4431,8 +4409,8 @@ static inline bool f2fs_may_compress(struct inode *inode) static inline void f2fs_i_compr_blocks_update(struct inode *inode, u64 blocks, bool add) { - int diff = F2FS_I(inode)->i_cluster_size - blocks; struct f2fs_inode_info *fi = F2FS_I(inode); + int diff = fi->i_cluster_size - blocks; /* don't update i_compr_blocks if saved blocks were released */ if (!add && !atomic_read(&fi->i_compr_blocks)) @@ -4540,6 +4518,21 @@ static inline void f2fs_io_schedule_timeout(long timeout) io_schedule_timeout(timeout); } +static inline void f2fs_handle_page_eio(struct f2fs_sb_info *sbi, pgoff_t ofs, + enum page_type type) +{ + if (unlikely(f2fs_cp_error(sbi))) + return; + + if (ofs == sbi->page_eio_ofs[type]) { + if (sbi->page_eio_cnt[type]++ == MAX_RETRY_PAGE_EIO) + set_ckpt_flags(sbi, CP_ERROR_FLAG); + } else { + sbi->page_eio_ofs[type] = ofs; + sbi->page_eio_cnt[type] = 0; + } +} + #define EFSBADCRC EBADMSG /* Bad CRC detected */ #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 100637b1adb3..bd14cef1b08f 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -372,7 +372,8 @@ sync_nodes: f2fs_remove_ino_entry(sbi, ino, APPEND_INO); clear_inode_flag(inode, FI_APPEND_WRITE); flush_out: - if (!atomic && F2FS_OPTION(sbi).fsync_mode != 
FSYNC_MODE_NOBARRIER) + if ((!atomic && F2FS_OPTION(sbi).fsync_mode != FSYNC_MODE_NOBARRIER) || + (atomic && !test_opt(sbi, NOBARRIER) && f2fs_sb_has_blkzoned(sbi))) ret = f2fs_issue_flush(sbi, inode->i_ino); if (!ret) { f2fs_remove_ino_entry(sbi, ino, UPDATE_INO); @@ -1437,11 +1438,19 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start, ret = -ENOSPC; break; } - if (dn->data_blkaddr != NEW_ADDR) { - f2fs_invalidate_blocks(sbi, dn->data_blkaddr); - dn->data_blkaddr = NEW_ADDR; - f2fs_set_data_blkaddr(dn); + + if (dn->data_blkaddr == NEW_ADDR) + continue; + + if (!f2fs_is_valid_blkaddr(sbi, dn->data_blkaddr, + DATA_GENERIC_ENHANCE)) { + ret = -EFSCORRUPTED; + break; } + + f2fs_invalidate_blocks(sbi, dn->data_blkaddr); + dn->data_blkaddr = NEW_ADDR; + f2fs_set_data_blkaddr(dn); } f2fs_update_extent_cache_range(dn, start, 0, index - start); @@ -1638,6 +1647,11 @@ static int expand_inode_data(struct inode *inode, loff_t offset, struct f2fs_map_blocks map = { .m_next_pgofs = NULL, .m_next_extent = NULL, .m_seg_type = NO_CHECK_TYPE, .m_may_create = true }; + struct f2fs_gc_control gc_control = { .victim_segno = NULL_SEGNO, + .init_gc_type = FG_GC, + .should_migrate_blocks = false, + .err_gc_skipped = true, + .nr_free_secs = 0 }; pgoff_t pg_start, pg_end; loff_t new_size = i_size_read(inode); loff_t off_end; @@ -1675,8 +1689,8 @@ next_alloc: if (has_not_enough_free_secs(sbi, 0, GET_SEC_FROM_SEG(sbi, overprovision_segments(sbi)))) { f2fs_down_write(&sbi->gc_lock); - err = f2fs_gc(sbi, true, false, false, NULL_SEGNO); - if (err && err != -ENODATA && err != -EAGAIN) + err = f2fs_gc(sbi, &gc_control); + if (err && err != -ENODATA) goto out_err; } @@ -1766,6 +1780,10 @@ static long f2fs_fallocate(struct file *file, int mode, inode_lock(inode); + ret = file_modified(file); + if (ret) + goto out; + if (mode & FALLOC_FL_PUNCH_HOLE) { if (offset >= inode->i_size) goto out; @@ -1804,16 +1822,8 @@ static int f2fs_release_file(struct inode *inode, struct file *filp) atomic_read(&inode->i_writecount) != 1) return 0; - /* some remained atomic pages should discarded */ if (f2fs_is_atomic_file(inode)) - f2fs_drop_inmem_pages(inode); - if (f2fs_is_volatile_file(inode)) { - set_inode_flag(inode, FI_DROP_CACHE); - filemap_fdatawrite(inode->i_mapping); - clear_inode_flag(inode, FI_DROP_CACHE); - clear_inode_flag(inode, FI_VOLATILE_FILE); - stat_dec_volatile_write(inode); - } + f2fs_abort_atomic_write(inode, true); return 0; } @@ -1828,8 +1838,8 @@ static int f2fs_file_flush(struct file *file, fl_owner_t id) * before dropping file lock, it needs to do in ->flush. 
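[Editor's note, before the atomic-write ioctl rework below: switching from in-memory page lists to a COW inode does not change the userspace contract. A minimal user — ioctl error handling elided, and assuming the F2FS_IOC_* definitions from the f2fs uapi header:

	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/f2fs.h>

	static int atomic_update(const char *path, const void *buf,
				 size_t len, off_t off)
	{
		int fd = open(path, O_RDWR);

		if (fd < 0)
			return -1;
		ioctl(fd, F2FS_IOC_START_ATOMIC_WRITE);  /* stage in the COW inode */
		pwrite(fd, buf, len, off);
		ioctl(fd, F2FS_IOC_COMMIT_ATOMIC_WRITE); /* all-or-nothing commit */
		return close(fd);
	}
]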
*/ if (f2fs_is_atomic_file(inode) && - F2FS_I(inode)->inmem_task == current) - f2fs_drop_inmem_pages(inode); + F2FS_I(inode)->atomic_write_task == current) + f2fs_abort_atomic_write(inode, true); return 0; } @@ -1992,6 +2002,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) struct user_namespace *mnt_userns = file_mnt_user_ns(filp); struct f2fs_inode_info *fi = F2FS_I(inode); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct inode *pinode; int ret; if (!inode_owner_or_capable(mnt_userns, inode)) @@ -2014,44 +2025,55 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) goto out; } - if (f2fs_is_atomic_file(inode)) { - if (is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST)) - ret = -EINVAL; + if (f2fs_is_atomic_file(inode)) goto out; - } ret = f2fs_convert_inline_inode(inode); if (ret) goto out; - f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_down_write(&fi->i_gc_rwsem[WRITE]); /* * Should wait end_io to count F2FS_WB_CP_DATA correctly by * f2fs_is_atomic_file. */ if (get_dirty_pages(inode)) - f2fs_warn(F2FS_I_SB(inode), "Unexpected flush for atomic writes: ino=%lu, npages=%u", + f2fs_warn(sbi, "Unexpected flush for atomic writes: ino=%lu, npages=%u", inode->i_ino, get_dirty_pages(inode)); ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); if (ret) { - f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_up_write(&fi->i_gc_rwsem[WRITE]); goto out; } + /* Create a COW inode for atomic write */ + pinode = f2fs_iget(inode->i_sb, fi->i_pino); + if (IS_ERR(pinode)) { + f2fs_up_write(&fi->i_gc_rwsem[WRITE]); + ret = PTR_ERR(pinode); + goto out; + } + + ret = f2fs_get_tmpfile(mnt_userns, pinode, &fi->cow_inode); + iput(pinode); + if (ret) { + f2fs_up_write(&fi->i_gc_rwsem[WRITE]); + goto out; + } + f2fs_i_size_write(fi->cow_inode, i_size_read(inode)); + spin_lock(&sbi->inode_lock[ATOMIC_FILE]); - if (list_empty(&fi->inmem_ilist)) - list_add_tail(&fi->inmem_ilist, &sbi->inode_list[ATOMIC_FILE]); sbi->atomic_files++; spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); - /* add inode in inmem_list first and set atomic_file */ set_inode_flag(inode, FI_ATOMIC_FILE); - clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); - f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + set_inode_flag(fi->cow_inode, FI_ATOMIC_FILE); + clear_inode_flag(fi->cow_inode, FI_INLINE_DATA); + f2fs_up_write(&fi->i_gc_rwsem[WRITE]); - f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); - F2FS_I(inode)->inmem_task = current; + f2fs_update_time(sbi, REQ_TIME); + fi->atomic_write_task = current; stat_update_max_atomic_write(inode); out: inode_unlock(inode); @@ -2076,130 +2098,23 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) inode_lock(inode); - if (f2fs_is_volatile_file(inode)) { - ret = -EINVAL; - goto err_out; - } - if (f2fs_is_atomic_file(inode)) { - ret = f2fs_commit_inmem_pages(inode); + ret = f2fs_commit_atomic_write(inode); if (ret) - goto err_out; + goto unlock_out; ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); if (!ret) - f2fs_drop_inmem_pages(inode); + f2fs_abort_atomic_write(inode, false); } else { ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 1, false); } -err_out: - if (is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST)) { - clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); - ret = -EINVAL; - } +unlock_out: inode_unlock(inode); mnt_drop_write_file(filp); return ret; } -static int f2fs_ioc_start_volatile_write(struct file *filp) -{ - struct inode *inode = file_inode(filp); - struct user_namespace *mnt_userns = file_mnt_user_ns(filp); - int ret; - - if 
(!inode_owner_or_capable(mnt_userns, inode)) - return -EACCES; - - if (!S_ISREG(inode->i_mode)) - return -EINVAL; - - ret = mnt_want_write_file(filp); - if (ret) - return ret; - - inode_lock(inode); - - if (f2fs_is_volatile_file(inode)) - goto out; - - ret = f2fs_convert_inline_inode(inode); - if (ret) - goto out; - - stat_inc_volatile_write(inode); - stat_update_max_volatile_write(inode); - - set_inode_flag(inode, FI_VOLATILE_FILE); - f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); -out: - inode_unlock(inode); - mnt_drop_write_file(filp); - return ret; -} - -static int f2fs_ioc_release_volatile_write(struct file *filp) -{ - struct inode *inode = file_inode(filp); - struct user_namespace *mnt_userns = file_mnt_user_ns(filp); - int ret; - - if (!inode_owner_or_capable(mnt_userns, inode)) - return -EACCES; - - ret = mnt_want_write_file(filp); - if (ret) - return ret; - - inode_lock(inode); - - if (!f2fs_is_volatile_file(inode)) - goto out; - - if (!f2fs_is_first_block_written(inode)) { - ret = truncate_partial_data_page(inode, 0, true); - goto out; - } - - ret = punch_hole(inode, 0, F2FS_BLKSIZE); -out: - inode_unlock(inode); - mnt_drop_write_file(filp); - return ret; -} - -static int f2fs_ioc_abort_volatile_write(struct file *filp) -{ - struct inode *inode = file_inode(filp); - struct user_namespace *mnt_userns = file_mnt_user_ns(filp); - int ret; - - if (!inode_owner_or_capable(mnt_userns, inode)) - return -EACCES; - - ret = mnt_want_write_file(filp); - if (ret) - return ret; - - inode_lock(inode); - - if (f2fs_is_atomic_file(inode)) - f2fs_drop_inmem_pages(inode); - if (f2fs_is_volatile_file(inode)) { - clear_inode_flag(inode, FI_VOLATILE_FILE); - stat_dec_volatile_write(inode); - ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); - } - - clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); - - inode_unlock(inode); - - mnt_drop_write_file(filp); - f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); - return ret; -} - static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -2437,6 +2352,10 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_gc_control gc_control = { .victim_segno = NULL_SEGNO, + .no_bg_gc = false, + .should_migrate_blocks = false, + .nr_free_secs = 0 }; __u32 sync; int ret; @@ -2462,7 +2381,9 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg) f2fs_down_write(&sbi->gc_lock); } - ret = f2fs_gc(sbi, sync, true, false, NULL_SEGNO); + gc_control.init_gc_type = sync ? FG_GC : BG_GC; + gc_control.err_gc_skipped = sync; + ret = f2fs_gc(sbi, &gc_control); out: mnt_drop_write_file(filp); return ret; @@ -2471,6 +2392,12 @@ out: static int __f2fs_ioc_gc_range(struct file *filp, struct f2fs_gc_range *range) { struct f2fs_sb_info *sbi = F2FS_I_SB(file_inode(filp)); + struct f2fs_gc_control gc_control = { + .init_gc_type = range->sync ? 
FG_GC : BG_GC, + .no_bg_gc = false, + .should_migrate_blocks = false, + .err_gc_skipped = range->sync, + .nr_free_secs = 0 }; u64 end; int ret; @@ -2498,8 +2425,8 @@ do_more: f2fs_down_write(&sbi->gc_lock); } - ret = f2fs_gc(sbi, range->sync, true, false, - GET_SEGNO(sbi, range->start)); + gc_control.victim_segno = GET_SEGNO(sbi, range->start); + ret = f2fs_gc(sbi, &gc_control); if (ret) { if (ret == -EBUSY) ret = -EAGAIN; @@ -2674,6 +2601,7 @@ do_map: } set_page_dirty(page); + set_page_private_gcing(page); f2fs_put_page(page, 1); idx++; @@ -2913,6 +2841,11 @@ static int f2fs_ioc_flush_device(struct file *filp, unsigned long arg) unsigned int start_segno = 0, end_segno = 0; unsigned int dev_start_segno = 0, dev_end_segno = 0; struct f2fs_flush_device range; + struct f2fs_gc_control gc_control = { + .init_gc_type = FG_GC, + .should_migrate_blocks = true, + .err_gc_skipped = true, + .nr_free_secs = 0 }; int ret; if (!capable(CAP_SYS_ADMIN)) @@ -2956,7 +2889,9 @@ static int f2fs_ioc_flush_device(struct file *filp, unsigned long arg) sm->last_victim[GC_CB] = end_segno + 1; sm->last_victim[GC_GREEDY] = end_segno + 1; sm->last_victim[ALLOC_NEXT] = end_segno + 1; - ret = f2fs_gc(sbi, true, true, true, start_segno); + + gc_control.victim_segno = start_segno; + ret = f2fs_gc(sbi, &gc_control); if (ret == -EAGAIN) ret = 0; else if (ret < 0) @@ -3017,7 +2952,7 @@ static int f2fs_ioc_setproject(struct inode *inode, __u32 projid) kprojid = make_kprojid(&init_user_ns, (projid_t)projid); - if (projid_eq(kprojid, F2FS_I(inode)->i_projid)) + if (projid_eq(kprojid, fi->i_projid)) return 0; err = -EPERM; @@ -3037,7 +2972,7 @@ static int f2fs_ioc_setproject(struct inode *inode, __u32 projid) if (err) goto out_unlock; - F2FS_I(inode)->i_projid = kprojid; + fi->i_projid = kprojid; inode->i_ctime = current_time(inode); f2fs_mark_inode_dirty_sync(inode, true); out_unlock: @@ -3987,7 +3922,7 @@ static int f2fs_ioc_decompress_file(struct file *filp, unsigned long arg) struct f2fs_inode_info *fi = F2FS_I(inode); pgoff_t page_idx = 0, last_idx; unsigned int blk_per_seg = sbi->blocks_per_seg; - int cluster_size = F2FS_I(inode)->i_cluster_size; + int cluster_size = fi->i_cluster_size; int count, ret; if (!f2fs_sb_has_compression(sbi) || @@ -4010,11 +3945,6 @@ static int f2fs_ioc_decompress_file(struct file *filp, unsigned long arg) goto out; } - if (f2fs_is_mmap_file(inode)) { - ret = -EBUSY; - goto out; - } - ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); if (ret) goto out; @@ -4082,11 +4012,6 @@ static int f2fs_ioc_compress_file(struct file *filp, unsigned long arg) goto out; } - if (f2fs_is_mmap_file(inode)) { - ret = -EBUSY; - goto out; - } - ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); if (ret) goto out; @@ -4136,11 +4061,9 @@ static long __f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case F2FS_IOC_COMMIT_ATOMIC_WRITE: return f2fs_ioc_commit_atomic_write(filp); case F2FS_IOC_START_VOLATILE_WRITE: - return f2fs_ioc_start_volatile_write(filp); case F2FS_IOC_RELEASE_VOLATILE_WRITE: - return f2fs_ioc_release_volatile_write(filp); case F2FS_IOC_ABORT_VOLATILE_WRITE: - return f2fs_ioc_abort_volatile_write(filp); + return -EOPNOTSUPP; case F2FS_IOC_SHUTDOWN: return f2fs_ioc_shutdown(filp, arg); case FITRIM: @@ -4328,17 +4251,39 @@ out: static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct inode *inode = file_inode(iocb->ki_filp); + const loff_t pos = iocb->ki_pos; ssize_t ret; if (!f2fs_is_compress_backend_ready(inode)) 
return -EOPNOTSUPP; - if (f2fs_should_use_dio(inode, iocb, to)) - return f2fs_dio_read_iter(iocb, to); + if (trace_f2fs_dataread_start_enabled()) { + char *p = f2fs_kmalloc(F2FS_I_SB(inode), PATH_MAX, GFP_KERNEL); + char *path; - ret = filemap_read(iocb, to, 0); - if (ret > 0) - f2fs_update_iostat(F2FS_I_SB(inode), APP_BUFFERED_READ_IO, ret); + if (!p) + goto skip_read_trace; + + path = dentry_path_raw(file_dentry(iocb->ki_filp), p, PATH_MAX); + if (IS_ERR(path)) { + kfree(p); + goto skip_read_trace; + } + + trace_f2fs_dataread_start(inode, pos, iov_iter_count(to), + current->pid, path, current->comm); + kfree(p); + } +skip_read_trace: + if (f2fs_should_use_dio(inode, iocb, to)) { + ret = f2fs_dio_read_iter(iocb, to); + } else { + ret = filemap_read(iocb, to, 0); + if (ret > 0) + f2fs_update_iostat(F2FS_I_SB(inode), APP_BUFFERED_READ_IO, ret); + } + if (trace_f2fs_dataread_end_enabled()) + trace_f2fs_dataread_end(inode, pos, ret); return ret; } @@ -4630,14 +4575,36 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) /* Possibly preallocate the blocks for the write. */ target_size = iocb->ki_pos + iov_iter_count(from); preallocated = f2fs_preallocate_blocks(iocb, from, dio); - if (preallocated < 0) + if (preallocated < 0) { ret = preallocated; - else + } else { + if (trace_f2fs_datawrite_start_enabled()) { + char *p = f2fs_kmalloc(F2FS_I_SB(inode), + PATH_MAX, GFP_KERNEL); + char *path; + + if (!p) + goto skip_write_trace; + path = dentry_path_raw(file_dentry(iocb->ki_filp), + p, PATH_MAX); + if (IS_ERR(path)) { + kfree(p); + goto skip_write_trace; + } + trace_f2fs_datawrite_start(inode, orig_pos, orig_count, + current->pid, path, current->comm); + kfree(p); + } +skip_write_trace: /* Do the actual write. */ ret = dio ? f2fs_dio_write_iter(iocb, from, &may_need_sync): f2fs_buffered_write_iter(iocb, from); + if (trace_f2fs_datawrite_end_enabled()) + trace_f2fs_datawrite_end(inode, orig_pos, ret); + } + /* Don't leave any preallocated blocks around past i_size. */ if (preallocated && i_size_read(inode) < target_size) { f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index ea5b93b689cd..d5fb426e0747 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -35,6 +35,10 @@ static int gc_thread_func(void *data) wait_queue_head_t *wq = &sbi->gc_thread->gc_wait_queue_head; wait_queue_head_t *fggc_wq = &sbi->gc_thread->fggc_wq; unsigned int wait_ms; + struct f2fs_gc_control gc_control = { + .victim_segno = NULL_SEGNO, + .should_migrate_blocks = false, + .err_gc_skipped = false }; wait_ms = gc_th->min_sleep_time; @@ -141,8 +145,12 @@ do_gc: if (foreground) sync_mode = false; + gc_control.init_gc_type = sync_mode ? FG_GC : BG_GC; + gc_control.no_bg_gc = foreground; + gc_control.nr_free_secs = foreground ? 
1 : 0; + /* if return value is not zero, no victim was selected */ - if (f2fs_gc(sbi, sync_mode, !foreground, false, NULL_SEGNO)) + if (f2fs_gc(sbi, &gc_control)) wait_ms = gc_th->no_gc_sleep_time; if (foreground) @@ -646,6 +654,54 @@ static void release_victim_entry(struct f2fs_sb_info *sbi) f2fs_bug_on(sbi, !list_empty(&am->victim_list)); } +static bool f2fs_pin_section(struct f2fs_sb_info *sbi, unsigned int segno) +{ + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + + if (!dirty_i->enable_pin_section) + return false; + if (!test_and_set_bit(secno, dirty_i->pinned_secmap)) + dirty_i->pinned_secmap_cnt++; + return true; +} + +static bool f2fs_pinned_section_exists(struct dirty_seglist_info *dirty_i) +{ + return dirty_i->pinned_secmap_cnt; +} + +static bool f2fs_section_is_pinned(struct dirty_seglist_info *dirty_i, + unsigned int secno) +{ + return dirty_i->enable_pin_section && + f2fs_pinned_section_exists(dirty_i) && + test_bit(secno, dirty_i->pinned_secmap); +} + +static void f2fs_unpin_all_sections(struct f2fs_sb_info *sbi, bool enable) +{ + unsigned int bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi)); + + if (f2fs_pinned_section_exists(DIRTY_I(sbi))) { + memset(DIRTY_I(sbi)->pinned_secmap, 0, bitmap_size); + DIRTY_I(sbi)->pinned_secmap_cnt = 0; + } + DIRTY_I(sbi)->enable_pin_section = enable; +} + +static int f2fs_gc_pinned_control(struct inode *inode, int gc_type, + unsigned int segno) +{ + if (!f2fs_is_pinned_file(inode)) + return 0; + if (gc_type != FG_GC) + return -EBUSY; + if (!f2fs_pin_section(F2FS_I_SB(inode), segno)) + f2fs_pin_file_control(inode, true); + return -EAGAIN; +} + /* * This function is called from two paths. * One is garbage collection and the other is SSR segment selection. 
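[Editor's note: the pinning helpers above are consumed through f2fs_gc_pinned_control(), whose return value steers the data-move paths below — 0 means the inode is not pinned and migration proceeds; -EBUSY tells background GC to leave pinned files alone; -EAGAIN means the section was just recorded in pinned_secmap (or pinning was broken via f2fs_pin_file_control()) so victim selection should skip it. The caller shape, as used by move_data_block()/move_data_page():

	err = f2fs_gc_pinned_control(inode, gc_type, segno);
	if (err)
		goto out;	/* -EBUSY or -EAGAIN: do not migrate this block */
]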
@@ -787,6 +843,9 @@ retry: if (gc_type == BG_GC && test_bit(secno, dirty_i->victim_secmap)) goto next; + if (gc_type == FG_GC && f2fs_section_is_pinned(dirty_i, secno)) + goto next; + if (is_atgc) { add_victim_entry(sbi, &p, segno); goto next; @@ -1194,18 +1253,9 @@ static int move_data_block(struct inode *inode, block_t bidx, goto out; } - if (f2fs_is_atomic_file(inode)) { - F2FS_I(inode)->i_gc_failures[GC_FAILURE_ATOMIC]++; - F2FS_I_SB(inode)->skipped_atomic_files[gc_type]++; - err = -EAGAIN; + err = f2fs_gc_pinned_control(inode, gc_type, segno); + if (err) goto out; - } - - if (f2fs_is_pinned_file(inode)) { - f2fs_pin_file_control(inode, true); - err = -EAGAIN; - goto out; - } set_new_dnode(&dn, inode, NULL, NULL, 0); err = f2fs_get_dnode_of_data(&dn, bidx, LOOKUP_NODE); @@ -1344,18 +1394,9 @@ static int move_data_page(struct inode *inode, block_t bidx, int gc_type, goto out; } - if (f2fs_is_atomic_file(inode)) { - F2FS_I(inode)->i_gc_failures[GC_FAILURE_ATOMIC]++; - F2FS_I_SB(inode)->skipped_atomic_files[gc_type]++; - err = -EAGAIN; + err = f2fs_gc_pinned_control(inode, gc_type, segno); + if (err) goto out; - } - if (f2fs_is_pinned_file(inode)) { - if (gc_type == FG_GC) - f2fs_pin_file_control(inode, true); - err = -EAGAIN; - goto out; - } if (gc_type == BG_GC) { if (PageWriteback(page)) { @@ -1475,11 +1516,19 @@ next_step: ofs_in_node = le16_to_cpu(entry->ofs_in_node); if (phase == 3) { + int err; + inode = f2fs_iget(sb, dni.ino); if (IS_ERR(inode) || is_bad_inode(inode) || special_file(inode->i_mode)) continue; + err = f2fs_gc_pinned_control(inode, gc_type, segno); + if (err == -EAGAIN) { + iput(inode); + return submitted; + } + if (!f2fs_down_write_trylock( &F2FS_I(inode)->i_gc_rwsem[WRITE])) { iput(inode); @@ -1699,23 +1748,21 @@ skip: return seg_freed; } -int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, - bool background, bool force, unsigned int segno) +int f2fs_gc(struct f2fs_sb_info *sbi, struct f2fs_gc_control *gc_control) { - int gc_type = sync ? FG_GC : BG_GC; + int gc_type = gc_control->init_gc_type; + unsigned int segno = gc_control->victim_segno; int sec_freed = 0, seg_freed = 0, total_freed = 0; int ret = 0; struct cp_control cpc; - unsigned int init_segno = segno; struct gc_inode_list gc_list = { .ilist = LIST_HEAD_INIT(gc_list.ilist), .iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS), }; - unsigned long long last_skipped = sbi->skipped_atomic_files[FG_GC]; - unsigned long long first_skipped; unsigned int skipped_round = 0, round = 0; - trace_f2fs_gc_begin(sbi->sb, sync, background, + trace_f2fs_gc_begin(sbi->sb, gc_type, gc_control->no_bg_gc, + gc_control->nr_free_secs, get_pages(sbi, F2FS_DIRTY_NODES), get_pages(sbi, F2FS_DIRTY_DENTS), get_pages(sbi, F2FS_DIRTY_IMETA), @@ -1726,7 +1773,6 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, cpc.reason = __get_cp_reason(sbi); sbi->skipped_gc_rwsem = 0; - first_skipped = last_skipped; gc_more: if (unlikely(!(sbi->sb->s_flags & SB_ACTIVE))) { ret = -EINVAL; @@ -1743,8 +1789,7 @@ gc_more: * threshold, we can make them free by checkpoint. Then, we * secure free segments which doesn't need fggc any more. */ - if (prefree_segments(sbi) && - !is_sbi_flag_set(sbi, SBI_CP_DISABLED)) { + if (prefree_segments(sbi)) { ret = f2fs_write_checkpoint(sbi, &cpc); if (ret) goto stop; @@ -1754,54 +1799,69 @@ gc_more: } /* f2fs_balance_fs doesn't need to do BG_GC in critical path. 
*/ - if (gc_type == BG_GC && !background) { + if (gc_type == BG_GC && gc_control->no_bg_gc) { ret = -EINVAL; goto stop; } +retry: ret = __get_victim(sbi, &segno, gc_type); - if (ret) + if (ret) { + /* allow searching for a victim in sections that have pinned data */ + if (ret == -ENODATA && gc_type == FG_GC && + f2fs_pinned_section_exists(DIRTY_I(sbi))) { + f2fs_unpin_all_sections(sbi, false); + goto retry; + } goto stop; + } - seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type, force); - if (gc_type == FG_GC && - seg_freed == f2fs_usable_segs_in_sec(sbi, segno)) - sec_freed++; + seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type, + gc_control->should_migrate_blocks); total_freed += seg_freed; - if (gc_type == FG_GC) { - if (sbi->skipped_atomic_files[FG_GC] > last_skipped || - sbi->skipped_gc_rwsem) - skipped_round++; - last_skipped = sbi->skipped_atomic_files[FG_GC]; - round++; - } + if (seg_freed == f2fs_usable_segs_in_sec(sbi, segno)) + sec_freed++; if (gc_type == FG_GC) sbi->cur_victim_sec = NULL_SEGNO; - if (sync) + if (gc_control->init_gc_type == FG_GC || + !has_not_enough_free_secs(sbi, + (gc_type == FG_GC) ? sec_freed : 0, 0)) { + if (gc_type == FG_GC && sec_freed < gc_control->nr_free_secs) + goto go_gc_more; goto stop; - - if (has_not_enough_free_secs(sbi, sec_freed, 0)) { - if (skipped_round <= MAX_SKIP_GC_COUNT || - skipped_round * 2 < round) { - segno = NULL_SEGNO; - goto gc_more; - } - - if (first_skipped < last_skipped && - (last_skipped - first_skipped) > - sbi->skipped_gc_rwsem) { - f2fs_drop_inmem_pages_all(sbi, true); - segno = NULL_SEGNO; - goto gc_more; - } - if (gc_type == FG_GC && !is_sbi_flag_set(sbi, SBI_CP_DISABLED)) - ret = f2fs_write_checkpoint(sbi, &cpc); } + + /* FG_GC stops GC by skip_count */ + if (gc_type == FG_GC) { + if (sbi->skipped_gc_rwsem) + skipped_round++; + round++; + if (skipped_round > MAX_SKIP_GC_COUNT && + skipped_round * 2 >= round) { + ret = f2fs_write_checkpoint(sbi, &cpc); + goto stop; + } + } + + /* Write checkpoint to reclaim prefree segments */ + if (free_sections(sbi) < NR_CURSEG_PERSIST_TYPE && + prefree_segments(sbi)) { + ret = f2fs_write_checkpoint(sbi, &cpc); + if (ret) + goto stop; + } +go_gc_more: + segno = NULL_SEGNO; + goto gc_more; + stop: SIT_I(sbi)->last_victim[ALLOC_NEXT] = 0; - SIT_I(sbi)->last_victim[FLUSH_DEVICE] = init_segno; + SIT_I(sbi)->last_victim[FLUSH_DEVICE] = gc_control->victim_segno; + + if (gc_type == FG_GC) + f2fs_unpin_all_sections(sbi, true); trace_f2fs_gc_end(sbi->sb, ret, total_freed, sec_freed, get_pages(sbi, F2FS_DIRTY_NODES), @@ -1816,7 +1876,7 @@ stop: put_gc_inode(&gc_list); - if (sync && !ret) + if (gc_control->err_gc_skipped && !ret) ret = sec_freed ? 0 : -EAGAIN; return ret; } diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c index 3cb1e7a24740..049ce50cec9b 100644 --- a/fs/f2fs/hash.c +++ b/fs/f2fs/hash.c @@ -91,7 +91,7 @@ static u32 TEA_hash_name(const u8 *p, size_t len) /* * Compute @fname->hash. For all directories, @fname->disk_name must be set. * For casefolded directories, @fname->usr_fname must be set, and also - * @fname->cf_name if the filename is valid Unicode. + * @fname->cf_name if the filename is valid Unicode and is not "." or "..". */ void f2fs_hash_filename(const struct inode *dir, struct f2fs_filename *fname) { @@ -110,10 +110,11 @@ void f2fs_hash_filename(const struct inode *dir, struct f2fs_filename *fname) /* * If the casefolded name is provided, hash it instead of the * on-disk name.
If the casefolded name is *not* provided, that - * should only be because the name wasn't valid Unicode, so fall - * back to treating the name as an opaque byte sequence. Note - * that to handle encrypted directories, the fallback must use - * usr_fname (plaintext) rather than disk_name (ciphertext). + * should only be because the name wasn't valid Unicode or was + * "." or "..", so fall back to treating the name as an opaque + * byte sequence. Note that to handle encrypted directories, + * the fallback must use usr_fname (plaintext) rather than + * disk_name (ciphertext). */ WARN_ON_ONCE(!fname->usr_fname->name); if (fname->cf_name.name) { diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index a578bf83b803..bf46a7dfbea2 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -14,21 +14,40 @@ #include "node.h" #include <trace/events/f2fs.h> -bool f2fs_may_inline_data(struct inode *inode) +static bool support_inline_data(struct inode *inode) { if (f2fs_is_atomic_file(inode)) return false; - if (!S_ISREG(inode->i_mode) && !S_ISLNK(inode->i_mode)) return false; - if (i_size_read(inode) > MAX_INLINE_DATA(inode)) return false; + return true; +} - if (f2fs_post_read_required(inode)) +bool f2fs_may_inline_data(struct inode *inode) +{ + if (!support_inline_data(inode)) return false; - return true; + return !f2fs_post_read_required(inode); +} + +bool f2fs_sanity_check_inline_data(struct inode *inode) +{ + if (!f2fs_has_inline_data(inode)) + return false; + + if (!support_inline_data(inode)) + return true; + + /* + * used by sanity_check_inode(), when disk layout fields have not + * been synchronized to inmem fields. + */ + return (S_ISREG(inode->i_mode) && + (file_is_encrypt(inode) || file_is_verity(inode) || + (F2FS_I(inode)->i_flags & F2FS_COMPR_FL))); } bool f2fs_may_inline_dentry(struct inode *inode) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 83639238a1fe..fc55f5bd1fcc 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -260,8 +260,8 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) return false; } - if (F2FS_I(inode)->extent_tree) { - struct extent_info *ei = &F2FS_I(inode)->extent_tree->largest; + if (fi->extent_tree) { + struct extent_info *ei = &fi->extent_tree->largest; if (ei->len && (!f2fs_is_valid_blkaddr(sbi, ei->blk, @@ -276,8 +276,7 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) return false; } } - if (f2fs_has_inline_data(inode) && - (!S_ISREG(inode->i_mode) && !S_ISLNK(inode->i_mode))) { + if (f2fs_sanity_check_inline_data(inode)) { set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "%s: inode (ino=%lx, mode=%u) should not have inline_data, run fsck to fix", __func__, inode->i_ino, inode->i_mode); @@ -466,10 +465,10 @@ static int do_read_inode(struct inode *inode) } } - F2FS_I(inode)->i_disk_time[0] = inode->i_atime; - F2FS_I(inode)->i_disk_time[1] = inode->i_ctime; - F2FS_I(inode)->i_disk_time[2] = inode->i_mtime; - F2FS_I(inode)->i_disk_time[3] = F2FS_I(inode)->i_crtime; + fi->i_disk_time[0] = inode->i_atime; + fi->i_disk_time[1] = inode->i_ctime; + fi->i_disk_time[2] = inode->i_mtime; + fi->i_disk_time[3] = fi->i_crtime; f2fs_put_page(node_page, 1); stat_inc_inline_xattr(inode); @@ -745,9 +744,8 @@ void f2fs_evict_inode(struct inode *inode) nid_t xnid = F2FS_I(inode)->i_xattr_nid; int err = 0; - /* some remained atomic pages should discarded */ if (f2fs_is_atomic_file(inode)) - f2fs_drop_inmem_pages(inode); + f2fs_abort_atomic_write(inode, true); trace_f2fs_evict_inode(inode); truncate_inode_pages_final(&inode->i_data); @@ -796,8 +794,22 @@
retry: f2fs_lock_op(sbi); err = f2fs_remove_inode_page(inode); f2fs_unlock_op(sbi); - if (err == -ENOENT) + if (err == -ENOENT) { err = 0; + + /* + * in a fuzzed image, another node may have the same + * block address as the inode's; if it was truncated + * previously, truncation of the inode node will fail. + */ + if (is_inode_flag_set(inode, FI_DIRTY_INODE)) { + f2fs_warn(F2FS_I_SB(inode), + "f2fs_evict_inode: inconsistent node id, ino:%lu", + inode->i_ino); + f2fs_inode_synced(inode); + set_sbi_flag(sbi, SBI_NEED_FSCK); + } + } } /* give more chances, if ENOMEM case */ diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 5ed79b29999f..c549acb52ac4 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -37,13 +37,10 @@ static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns, if (!inode) return ERR_PTR(-ENOMEM); - f2fs_lock_op(sbi); if (!f2fs_alloc_nid(sbi, &ino)) { - f2fs_unlock_op(sbi); err = -ENOSPC; goto fail; } - f2fs_unlock_op(sbi); nid_free = true; @@ -461,6 +458,13 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino) return 0; } + if (!S_ISDIR(dir->i_mode)) { + f2fs_err(sbi, "inconsistent inode status, skip recovering inline_dots inode (ino:%lu, i_mode:%u, pino:%u)", + dir->i_ino, dir->i_mode, pino); + set_sbi_flag(sbi, SBI_NEED_FSCK); + return -ENOTDIR; + } + err = f2fs_dquot_initialize(dir); if (err) return err; @@ -836,8 +840,8 @@ out: } static int __f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, - struct dentry *dentry, umode_t mode, - struct inode **whiteout) + struct dentry *dentry, umode_t mode, bool is_whiteout, + struct inode **new_inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct inode *inode; @@ -851,7 +855,7 @@ static int __f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, if (IS_ERR(inode)) return PTR_ERR(inode); - if (whiteout) { + if (is_whiteout) { init_special_inode(inode, inode->i_mode, WHITEOUT_DEV); inode->i_op = &f2fs_special_inode_operations; } else { @@ -876,21 +880,25 @@ static int __f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, f2fs_add_orphan_inode(inode); f2fs_alloc_nid_done(sbi, inode->i_ino); - if (whiteout) { + if (is_whiteout) { f2fs_i_links_write(inode, false); spin_lock(&inode->i_lock); inode->i_state |= I_LINKABLE; spin_unlock(&inode->i_lock); - - *whiteout = inode; } else { - d_tmpfile(dentry, inode); + if (dentry) + d_tmpfile(dentry, inode); + else + f2fs_i_links_write(inode, false); } /* link_count was changed by d_tmpfile as well.
*/ f2fs_unlock_op(sbi); unlock_new_inode(inode); + if (new_inode) + *new_inode = inode; + f2fs_balance_fs(sbi, true); return 0; @@ -911,7 +919,7 @@ static int f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, if (!f2fs_is_checkpoint_ready(sbi)) return -ENOSPC; - return __f2fs_tmpfile(mnt_userns, dir, dentry, mode, NULL); + return __f2fs_tmpfile(mnt_userns, dir, dentry, mode, false, NULL); } static int f2fs_create_whiteout(struct user_namespace *mnt_userns, @@ -921,7 +929,13 @@ static int f2fs_create_whiteout(struct user_namespace *mnt_userns, return -EIO; return __f2fs_tmpfile(mnt_userns, dir, NULL, - S_IFCHR | WHITEOUT_MODE, whiteout); + S_IFCHR | WHITEOUT_MODE, true, whiteout); +} + +int f2fs_get_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, + struct inode **new_inode) +{ + return __f2fs_tmpfile(mnt_userns, dir, NULL, S_IFREG, false, new_inode); } static int f2fs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 8ccff18560ff..836c79a20afc 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -90,10 +90,6 @@ bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type) atomic_read(&sbi->total_ext_node) * sizeof(struct extent_node)) >> PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); - } else if (type == INMEM_PAGES) { - /* it allows 20% / total_ram for inmemory pages */ - mem_size = get_pages(sbi, F2FS_INMEM_PAGES); - res = mem_size < (val.totalram / 5); } else if (type == DISCARD_CACHE) { mem_size = (atomic_read(&dcc->discard_cmd_cnt) * sizeof(struct discard_cmd)) >> PAGE_SHIFT; @@ -1416,8 +1412,7 @@ repeat: err = read_node_page(page, 0); if (err < 0) { - f2fs_put_page(page, 1); - return ERR_PTR(err); + goto out_put_err; } else if (err == LOCKED_PAGE) { err = 0; goto page_hit; @@ -1443,19 +1438,21 @@ repeat: goto out_err; } page_hit: - if (unlikely(nid != nid_of_node(page))) { - f2fs_warn(sbi, "inconsistent node block, nid:%lu, node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]", + if (likely(nid == nid_of_node(page))) + return page; + + f2fs_warn(sbi, "inconsistent node block, nid:%lu, node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]", nid, nid_of_node(page), ino_of_node(page), ofs_of_node(page), cpver_of_node(page), next_blkaddr_of_node(page)); - set_sbi_flag(sbi, SBI_NEED_FSCK); - err = -EINVAL; + set_sbi_flag(sbi, SBI_NEED_FSCK); + err = -EINVAL; out_err: - ClearPageUptodate(page); - f2fs_put_page(page, 1); - return ERR_PTR(err); - } - return page; + ClearPageUptodate(page); +out_put_err: + f2fs_handle_page_eio(sbi, page->index, NODE); + f2fs_put_page(page, 1); + return ERR_PTR(err); } struct page *f2fs_get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid) @@ -1631,7 +1628,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, goto redirty_out; } - if (atomic && !test_opt(sbi, NOBARRIER)) + if (atomic && !test_opt(sbi, NOBARRIER) && !f2fs_sb_has_blkzoned(sbi)) fio.op_flags |= REQ_PREFLUSH | REQ_FUA; /* should add to global list before clearing PAGECACHE status */ diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 4c1d34bfea78..3c09cae058b0 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -147,7 +147,6 @@ enum mem_type { DIRTY_DENTS, /* indicates dirty dentry pages */ INO_ENTRIES, /* indicates inode entries */ EXTENT_CACHE, /* indicates extent cache */ - INMEM_PAGES, /* indicates inmemory pages */ DISCARD_CACHE, /* indicates memory of cached discard cmds */ COMPRESS_PAGE, /* indicates memory of cached compressed pages 
*/ BASE_CHECK, /* check kernel status */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 7225ce09f3ab..874c1b9c41a2 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -30,7 +30,7 @@ static struct kmem_cache *discard_entry_slab; static struct kmem_cache *discard_cmd_slab; static struct kmem_cache *sit_entry_set_slab; -static struct kmem_cache *inmem_entry_slab; +static struct kmem_cache *revoke_entry_slab; static unsigned long __reverse_ulong(unsigned char *str) { @@ -185,301 +185,175 @@ bool f2fs_need_SSR(struct f2fs_sb_info *sbi) SM_I(sbi)->min_ssr_sections + reserved_sections(sbi)); } -void f2fs_register_inmem_page(struct inode *inode, struct page *page) -{ - struct inmem_pages *new; - - set_page_private_atomic(page); - - new = f2fs_kmem_cache_alloc(inmem_entry_slab, - GFP_NOFS, true, NULL); - - /* add atomic page indices to the list */ - new->page = page; - INIT_LIST_HEAD(&new->list); - - /* increase reference count with clean state */ - get_page(page); - mutex_lock(&F2FS_I(inode)->inmem_lock); - list_add_tail(&new->list, &F2FS_I(inode)->inmem_pages); - inc_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES); - mutex_unlock(&F2FS_I(inode)->inmem_lock); - - trace_f2fs_register_inmem_page(page, INMEM); -} - -static int __revoke_inmem_pages(struct inode *inode, - struct list_head *head, bool drop, bool recover, - bool trylock) +void f2fs_abort_atomic_write(struct inode *inode, bool clean) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct inmem_pages *cur, *tmp; - int err = 0; + struct f2fs_inode_info *fi = F2FS_I(inode); + + if (f2fs_is_atomic_file(inode)) { + if (clean) + truncate_inode_pages_final(inode->i_mapping); + clear_inode_flag(fi->cow_inode, FI_ATOMIC_FILE); + iput(fi->cow_inode); + fi->cow_inode = NULL; + clear_inode_flag(inode, FI_ATOMIC_FILE); + + spin_lock(&sbi->inode_lock[ATOMIC_FILE]); + sbi->atomic_files--; + spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); + } +} + +static int __replace_atomic_write_block(struct inode *inode, pgoff_t index, + block_t new_addr, block_t *old_addr, bool recover) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct dnode_of_data dn; + struct node_info ni; + int err; + +retry: + set_new_dnode(&dn, inode, NULL, NULL, 0); + err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE_RA); + if (err) { + if (err == -ENOMEM) { + f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); + goto retry; + } + return err; + } + + err = f2fs_get_node_info(sbi, dn.nid, &ni, false); + if (err) { + f2fs_put_dnode(&dn); + return err; + } + + if (recover) { + /* dn.data_blkaddr is always valid */ + if (!__is_valid_data_blkaddr(new_addr)) { + if (new_addr == NULL_ADDR) + dec_valid_block_count(sbi, inode, 1); + f2fs_invalidate_blocks(sbi, dn.data_blkaddr); + f2fs_update_data_blkaddr(&dn, new_addr); + } else { + f2fs_replace_block(sbi, &dn, dn.data_blkaddr, + new_addr, ni.version, true, true); + } + } else { + blkcnt_t count = 1; + + *old_addr = dn.data_blkaddr; + f2fs_truncate_data_blocks_range(&dn, 1); + dec_valid_block_count(sbi, F2FS_I(inode)->cow_inode, count); + inc_valid_block_count(sbi, inode, &count); + f2fs_replace_block(sbi, &dn, dn.data_blkaddr, new_addr, + ni.version, true, false); + } + + f2fs_put_dnode(&dn); + return 0; +} + +static void __complete_revoke_list(struct inode *inode, struct list_head *head, + bool revoke) +{ + struct revoke_entry *cur, *tmp; list_for_each_entry_safe(cur, tmp, head, list) { - struct page *page = cur->page; - - if (drop) - trace_f2fs_commit_inmem_page(page, INMEM_DROP); - - if (trylock) { - /* - * to avoid deadlock in 
between page lock and - * inmem_lock. - */ - if (!trylock_page(page)) - continue; - } else { - lock_page(page); - } - - f2fs_wait_on_page_writeback(page, DATA, true, true); - - if (recover) { - struct dnode_of_data dn; - struct node_info ni; - - trace_f2fs_commit_inmem_page(page, INMEM_REVOKE); -retry: - set_new_dnode(&dn, inode, NULL, NULL, 0); - err = f2fs_get_dnode_of_data(&dn, page->index, - LOOKUP_NODE); - if (err) { - if (err == -ENOMEM) { - memalloc_retry_wait(GFP_NOFS); - goto retry; - } - err = -EAGAIN; - goto next; - } - - err = f2fs_get_node_info(sbi, dn.nid, &ni, false); - if (err) { - f2fs_put_dnode(&dn); - return err; - } - - if (cur->old_addr == NEW_ADDR) { - f2fs_invalidate_blocks(sbi, dn.data_blkaddr); - f2fs_update_data_blkaddr(&dn, NEW_ADDR); - } else - f2fs_replace_block(sbi, &dn, dn.data_blkaddr, - cur->old_addr, ni.version, true, true); - f2fs_put_dnode(&dn); - } -next: - /* we don't need to invalidate this in the sccessful status */ - if (drop || recover) { - ClearPageUptodate(page); - clear_page_private_gcing(page); - } - detach_page_private(page); - set_page_private(page, 0); - f2fs_put_page(page, 1); - + if (revoke) + __replace_atomic_write_block(inode, cur->index, + cur->old_addr, NULL, true); list_del(&cur->list); - kmem_cache_free(inmem_entry_slab, cur); - dec_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES); + kmem_cache_free(revoke_entry_slab, cur); } - return err; } -void f2fs_drop_inmem_pages_all(struct f2fs_sb_info *sbi, bool gc_failure) -{ - struct list_head *head = &sbi->inode_list[ATOMIC_FILE]; - struct inode *inode; - struct f2fs_inode_info *fi; - unsigned int count = sbi->atomic_files; - unsigned int looped = 0; -next: - spin_lock(&sbi->inode_lock[ATOMIC_FILE]); - if (list_empty(head)) { - spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); - return; - } - fi = list_first_entry(head, struct f2fs_inode_info, inmem_ilist); - inode = igrab(&fi->vfs_inode); - if (inode) - list_move_tail(&fi->inmem_ilist, head); - spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); - - if (inode) { - if (gc_failure) { - if (!fi->i_gc_failures[GC_FAILURE_ATOMIC]) - goto skip; - } - set_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); - f2fs_drop_inmem_pages(inode); -skip: - iput(inode); - } - f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); - if (gc_failure) { - if (++looped >= count) - return; - } - goto next; -} - -void f2fs_drop_inmem_pages(struct inode *inode) +static int __f2fs_commit_atomic_write(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); - - do { - mutex_lock(&fi->inmem_lock); - if (list_empty(&fi->inmem_pages)) { - fi->i_gc_failures[GC_FAILURE_ATOMIC] = 0; - - spin_lock(&sbi->inode_lock[ATOMIC_FILE]); - if (!list_empty(&fi->inmem_ilist)) - list_del_init(&fi->inmem_ilist); - if (f2fs_is_atomic_file(inode)) { - clear_inode_flag(inode, FI_ATOMIC_FILE); - sbi->atomic_files--; - } - spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); - - mutex_unlock(&fi->inmem_lock); - break; - } - __revoke_inmem_pages(inode, &fi->inmem_pages, - true, false, true); - mutex_unlock(&fi->inmem_lock); - } while (1); -} - -void f2fs_drop_inmem_page(struct inode *inode, struct page *page) -{ - struct f2fs_inode_info *fi = F2FS_I(inode); - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct list_head *head = &fi->inmem_pages; - struct inmem_pages *cur = NULL; - - f2fs_bug_on(sbi, !page_private_atomic(page)); - - mutex_lock(&fi->inmem_lock); - list_for_each_entry(cur, head, list) { - if (cur->page == page) - break; - } - - f2fs_bug_on(sbi, list_empty(head) 
|| cur->page != page); - list_del(&cur->list); - mutex_unlock(&fi->inmem_lock); - - dec_page_count(sbi, F2FS_INMEM_PAGES); - kmem_cache_free(inmem_entry_slab, cur); - - ClearPageUptodate(page); - clear_page_private_atomic(page); - f2fs_put_page(page, 0); - - detach_page_private(page); - set_page_private(page, 0); - - trace_f2fs_commit_inmem_page(page, INMEM_INVALIDATE); -} - -static int __f2fs_commit_inmem_pages(struct inode *inode) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct f2fs_inode_info *fi = F2FS_I(inode); - struct inmem_pages *cur, *tmp; - struct f2fs_io_info fio = { - .sbi = sbi, - .ino = inode->i_ino, - .type = DATA, - .op = REQ_OP_WRITE, - .op_flags = REQ_SYNC | REQ_PRIO, - .io_type = FS_DATA_IO, - }; + struct inode *cow_inode = fi->cow_inode; + struct revoke_entry *new; struct list_head revoke_list; - bool submit_bio = false; - int err = 0; + block_t blkaddr; + struct dnode_of_data dn; + pgoff_t len = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); + pgoff_t off = 0, blen, index; + int ret = 0, i; INIT_LIST_HEAD(&revoke_list); - list_for_each_entry_safe(cur, tmp, &fi->inmem_pages, list) { - struct page *page = cur->page; + while (len) { + blen = min_t(pgoff_t, ADDRS_PER_BLOCK(cow_inode), len); - lock_page(page); - if (page->mapping == inode->i_mapping) { - trace_f2fs_commit_inmem_page(page, INMEM); - - f2fs_wait_on_page_writeback(page, DATA, true, true); - - set_page_dirty(page); - if (clear_page_dirty_for_io(page)) { - inode_dec_dirty_pages(inode); - f2fs_remove_dirty_inode(inode); - } -retry: - fio.page = page; - fio.old_blkaddr = NULL_ADDR; - fio.encrypted_page = NULL; - fio.need_lock = LOCK_DONE; - err = f2fs_do_write_data_page(&fio); - if (err) { - if (err == -ENOMEM) { - memalloc_retry_wait(GFP_NOFS); - goto retry; - } - unlock_page(page); - break; - } - /* record old blkaddr for revoking */ - cur->old_addr = fio.old_blkaddr; - submit_bio = true; + set_new_dnode(&dn, cow_inode, NULL, NULL, 0); + ret = f2fs_get_dnode_of_data(&dn, off, LOOKUP_NODE_RA); + if (ret && ret != -ENOENT) { + goto out; + } else if (ret == -ENOENT) { + ret = 0; + if (dn.max_level == 0) + goto out; + goto next; } - unlock_page(page); - list_move_tail(&cur->list, &revoke_list); + + blen = min((pgoff_t)ADDRS_PER_PAGE(dn.node_page, cow_inode), + len); + index = off; + for (i = 0; i < blen; i++, dn.ofs_in_node++, index++) { + blkaddr = f2fs_data_blkaddr(&dn); + + if (!__is_valid_data_blkaddr(blkaddr)) { + continue; + } else if (!f2fs_is_valid_blkaddr(sbi, blkaddr, + DATA_GENERIC_ENHANCE)) { + f2fs_put_dnode(&dn); + ret = -EFSCORRUPTED; + goto out; + } + + new = f2fs_kmem_cache_alloc(revoke_entry_slab, GFP_NOFS, + true, NULL); + + ret = __replace_atomic_write_block(inode, index, blkaddr, + &new->old_addr, false); + if (ret) { + f2fs_put_dnode(&dn); + kmem_cache_free(revoke_entry_slab, new); + goto out; + } + + f2fs_update_data_blkaddr(&dn, NULL_ADDR); + new->index = index; + list_add_tail(&new->list, &revoke_list); + } + f2fs_put_dnode(&dn); +next: + off += blen; + len -= blen; } - if (submit_bio) - f2fs_submit_merged_write_cond(sbi, inode, NULL, 0, DATA); +out: + __complete_revoke_list(inode, &revoke_list, ret ? true : false); - if (err) { - /* - * try to revoke all committed pages, but still we could fail - * due to no memory or other reason, if that happened, EAGAIN - * will be returned, which means in such case, transaction is - * already not integrity, caller should use journal to do the - * recovery or rewrite & commit last transaction. 
For other - * error number, revoking was done by filesystem itself. - */ - err = __revoke_inmem_pages(inode, &revoke_list, - false, true, false); - - /* drop all uncommitted pages */ - __revoke_inmem_pages(inode, &fi->inmem_pages, - true, false, false); - } else { - __revoke_inmem_pages(inode, &revoke_list, - false, false, false); - } - - return err; + return ret; } -int f2fs_commit_inmem_pages(struct inode *inode) +int f2fs_commit_atomic_write(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); int err; - f2fs_balance_fs(sbi, true); + err = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); + if (err) + return err; f2fs_down_write(&fi->i_gc_rwsem[WRITE]); - f2fs_lock_op(sbi); - set_inode_flag(inode, FI_ATOMIC_COMMIT); - mutex_lock(&fi->inmem_lock); - err = __f2fs_commit_inmem_pages(inode); - mutex_unlock(&fi->inmem_lock); - - clear_inode_flag(inode, FI_ATOMIC_COMMIT); + err = __f2fs_commit_atomic_write(inode); f2fs_unlock_op(sbi); f2fs_up_write(&fi->i_gc_rwsem[WRITE]); @@ -520,8 +394,15 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) io_schedule(); finish_wait(&sbi->gc_thread->fggc_wq, &wait); } else { + struct f2fs_gc_control gc_control = { + .victim_segno = NULL_SEGNO, + .init_gc_type = BG_GC, + .no_bg_gc = true, + .should_migrate_blocks = false, + .err_gc_skipped = false, + .nr_free_secs = 1 }; f2fs_down_write(&sbi->gc_lock); - f2fs_gc(sbi, false, false, false, NULL_SEGNO); + f2fs_gc(sbi, &gc_control); } } } @@ -1664,33 +1545,32 @@ static unsigned int __wait_discard_cmd_range(struct f2fs_sb_info *sbi, struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct list_head *wait_list = (dpolicy->type == DPOLICY_FSTRIM) ? &(dcc->fstrim_list) : &(dcc->wait_list); - struct discard_cmd *dc, *tmp; - bool need_wait; + struct discard_cmd *dc = NULL, *iter, *tmp; unsigned int trimmed = 0; next: - need_wait = false; + dc = NULL; mutex_lock(&dcc->cmd_lock); - list_for_each_entry_safe(dc, tmp, wait_list, list) { - if (dc->lstart + dc->len <= start || end <= dc->lstart) + list_for_each_entry_safe(iter, tmp, wait_list, list) { + if (iter->lstart + iter->len <= start || end <= iter->lstart) continue; - if (dc->len < dpolicy->granularity) + if (iter->len < dpolicy->granularity) continue; - if (dc->state == D_DONE && !dc->ref) { - wait_for_completion_io(&dc->wait); - if (!dc->error) - trimmed += dc->len; - __remove_discard_cmd(sbi, dc); + if (iter->state == D_DONE && !iter->ref) { + wait_for_completion_io(&iter->wait); + if (!iter->error) + trimmed += iter->len; + __remove_discard_cmd(sbi, iter); } else { - dc->ref++; - need_wait = true; + iter->ref++; + dc = iter; break; } } mutex_unlock(&dcc->cmd_lock); - if (need_wait) { + if (dc) { trimmed += __wait_one_discard_bio(sbi, dc); goto next; } @@ -3286,8 +3166,7 @@ static int __get_segment_type_6(struct f2fs_io_info *fio) return CURSEG_COLD_DATA; if (file_is_hot(inode) || is_inode_flag_set(inode, FI_HOT_DATA) || - f2fs_is_atomic_file(inode) || - f2fs_is_volatile_file(inode)) + f2fs_is_atomic_file(inode)) return CURSEG_HOT_DATA; return f2fs_rw_hint_to_seg_type(inode->i_write_hint); } else { @@ -4084,10 +3963,12 @@ static void adjust_sit_entry_set(struct sit_entry_set *ses, return; list_for_each_entry_continue(next, head, set_list) - if (ses->entry_cnt <= next->entry_cnt) - break; + if (ses->entry_cnt <= next->entry_cnt) { + list_move_tail(&ses->set_list, &next->set_list); + return; + } - list_move_tail(&ses->set_list, &next->set_list); + list_move_tail(&ses->set_list, 
head); } static void add_sit_entry(unsigned int segno, struct list_head *head) @@ -4455,7 +4336,7 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) unsigned int i, start, end; unsigned int readed, start_blk = 0; int err = 0; - block_t total_node_blocks = 0; + block_t sit_valid_blocks[2] = {0, 0}; do { readed = f2fs_ra_meta_pages(sbi, start_blk, BIO_MAX_VECS, @@ -4480,8 +4361,8 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) if (err) return err; seg_info_from_raw_sit(se, &sit); - if (IS_NODESEG(se->type)) - total_node_blocks += se->valid_blocks; + + sit_valid_blocks[SE_PAGETYPE(se)] += se->valid_blocks; if (f2fs_block_unit_discard(sbi)) { /* build discard map only one time */ @@ -4521,15 +4402,15 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) sit = sit_in_journal(journal, i); old_valid_blocks = se->valid_blocks; - if (IS_NODESEG(se->type)) - total_node_blocks -= old_valid_blocks; + + sit_valid_blocks[SE_PAGETYPE(se)] -= old_valid_blocks; err = check_block_count(sbi, start, &sit); if (err) break; seg_info_from_raw_sit(se, &sit); - if (IS_NODESEG(se->type)) - total_node_blocks += se->valid_blocks; + + sit_valid_blocks[SE_PAGETYPE(se)] += se->valid_blocks; if (f2fs_block_unit_discard(sbi)) { if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) { @@ -4551,13 +4432,24 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) } up_read(&curseg->journal_rwsem); - if (!err && total_node_blocks != valid_node_count(sbi)) { + if (err) + return err; + + if (sit_valid_blocks[NODE] != valid_node_count(sbi)) { f2fs_err(sbi, "SIT is corrupted node# %u vs %u", - total_node_blocks, valid_node_count(sbi)); - err = -EFSCORRUPTED; + sit_valid_blocks[NODE], valid_node_count(sbi)); + return -EFSCORRUPTED; } - return err; + if (sit_valid_blocks[DATA] + sit_valid_blocks[NODE] > + valid_user_blocks(sbi)) { + f2fs_err(sbi, "SIT is corrupted data# %u %u vs %u", + sit_valid_blocks[DATA], sit_valid_blocks[NODE], + valid_user_blocks(sbi)); + return -EFSCORRUPTED; + } + + return 0; } static void init_free_segmap(struct f2fs_sb_info *sbi) @@ -4637,6 +4529,13 @@ static int init_victim_secmap(struct f2fs_sb_info *sbi) dirty_i->victim_secmap = f2fs_kvzalloc(sbi, bitmap_size, GFP_KERNEL); if (!dirty_i->victim_secmap) return -ENOMEM; + + dirty_i->pinned_secmap = f2fs_kvzalloc(sbi, bitmap_size, GFP_KERNEL); + if (!dirty_i->pinned_secmap) + return -ENOMEM; + + dirty_i->pinned_secmap_cnt = 0; + dirty_i->enable_pin_section = true; return 0; } @@ -5225,6 +5124,7 @@ static void destroy_victim_secmap(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + kvfree(dirty_i->pinned_secmap); kvfree(dirty_i->victim_secmap); } @@ -5335,9 +5235,9 @@ int __init f2fs_create_segment_manager_caches(void) if (!sit_entry_set_slab) goto destroy_discard_cmd; - inmem_entry_slab = f2fs_kmem_cache_create("f2fs_inmem_page_entry", - sizeof(struct inmem_pages)); - if (!inmem_entry_slab) + revoke_entry_slab = f2fs_kmem_cache_create("f2fs_revoke_entry", + sizeof(struct revoke_entry)); + if (!revoke_entry_slab) goto destroy_sit_entry_set; return 0; @@ -5356,5 +5256,5 @@ void f2fs_destroy_segment_manager_caches(void) kmem_cache_destroy(sit_entry_set_slab); kmem_cache_destroy(discard_cmd_slab); kmem_cache_destroy(discard_entry_slab); - kmem_cache_destroy(inmem_entry_slab); + kmem_cache_destroy(revoke_entry_slab); } diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 5c94caf0c0a1..3f277dfcb131 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -24,6 +24,7 @@ #define IS_DATASEG(t) ((t) <= CURSEG_COLD_DATA) 
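build_sit_entries() now folds the old total_node_blocks check into a two-entry array indexed by SE_PAGETYPE() and adds a second bound against valid_user_blocks(). A user-space sketch of that double consistency check, with made-up counters standing in for the on-disk SIT and superblock values:

#include <stdio.h>

enum page_type { DATA = 0, NODE = 1 };

struct seg_entry {
	enum page_type type;
	unsigned int valid_blocks;
};

/* mirrors the sit_valid_blocks[2] accounting in build_sit_entries() */
static int check_sit(const struct seg_entry *se, int nr_segs,
		     unsigned int valid_node_count,
		     unsigned int valid_user_blocks)
{
	unsigned int sit_valid_blocks[2] = { 0, 0 };
	int i;

	for (i = 0; i < nr_segs; i++)
		sit_valid_blocks[se[i].type] += se[i].valid_blocks;

	if (sit_valid_blocks[NODE] != valid_node_count) {
		fprintf(stderr, "SIT is corrupted node# %u vs %u\n",
			sit_valid_blocks[NODE], valid_node_count);
		return -1;	/* stands in for -EFSCORRUPTED */
	}
	if (sit_valid_blocks[DATA] + sit_valid_blocks[NODE] >
	    valid_user_blocks) {
		fprintf(stderr, "SIT is corrupted data# %u %u vs %u\n",
			sit_valid_blocks[DATA], sit_valid_blocks[NODE],
			valid_user_blocks);
		return -1;
	}
	return 0;
}

int main(void)
{
	struct seg_entry segs[] = {
		{ DATA, 100 }, { NODE, 40 }, { DATA, 60 },
	};

	/* consistent image: 40 node blocks, 200 valid user blocks */
	printf("ok image: %d\n", check_sit(segs, 3, 40, 200));
	/* corrupted image: superblock claims only 30 valid node blocks */
	printf("bad image: %d\n", check_sit(segs, 3, 30, 200));
	return 0;
}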
#define IS_NODESEG(t) ((t) >= CURSEG_HOT_NODE && (t) <= CURSEG_COLD_NODE) +#define SE_PAGETYPE(se) ((IS_NODESEG((se)->type) ? NODE : DATA)) static inline void sanity_check_seg_type(struct f2fs_sb_info *sbi, unsigned short seg_type) @@ -224,10 +225,10 @@ struct segment_allocation { #define MAX_SKIP_GC_COUNT 16 -struct inmem_pages { +struct revoke_entry { struct list_head list; - struct page *page; block_t old_addr; /* for revoking when fail to commit */ + pgoff_t index; }; struct sit_info { @@ -294,6 +295,9 @@ struct dirty_seglist_info { struct mutex seglist_lock; /* lock for segment bitmaps */ int nr_dirty[NR_DIRTY_TYPE]; /* # of dirty segments */ unsigned long *victim_secmap; /* background GC victims */ + unsigned long *pinned_secmap; /* pinned victims from foreground GC */ + unsigned int pinned_secmap_cnt; /* count of victims which has pinned data */ + bool enable_pin_section; /* enable pinning section */ }; /* victim selection function for cleaning and SSR */ @@ -572,11 +576,10 @@ static inline int reserved_sections(struct f2fs_sb_info *sbi) return GET_SEC_FROM_SEG(sbi, reserved_segments(sbi)); } -static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi) +static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi, + unsigned int node_blocks, unsigned int dent_blocks) { - unsigned int node_blocks = get_pages(sbi, F2FS_DIRTY_NODES) + - get_pages(sbi, F2FS_DIRTY_DENTS); - unsigned int dent_blocks = get_pages(sbi, F2FS_DIRTY_DENTS); + unsigned int segno, left_blocks; int i; @@ -602,19 +605,28 @@ static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi) static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed, int needed) { - int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); - int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); - int imeta_secs = get_blocktype_secs(sbi, F2FS_DIRTY_IMETA); + unsigned int total_node_blocks = get_pages(sbi, F2FS_DIRTY_NODES) + + get_pages(sbi, F2FS_DIRTY_DENTS) + + get_pages(sbi, F2FS_DIRTY_IMETA); + unsigned int total_dent_blocks = get_pages(sbi, F2FS_DIRTY_DENTS); + unsigned int node_secs = total_node_blocks / BLKS_PER_SEC(sbi); + unsigned int dent_secs = total_dent_blocks / BLKS_PER_SEC(sbi); + unsigned int node_blocks = total_node_blocks % BLKS_PER_SEC(sbi); + unsigned int dent_blocks = total_dent_blocks % BLKS_PER_SEC(sbi); + unsigned int free, need_lower, need_upper; if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) return false; - if (free_sections(sbi) + freed == reserved_sections(sbi) + needed && - has_curseg_enough_space(sbi)) + free = free_sections(sbi) + freed; + need_lower = node_secs + dent_secs + reserved_sections(sbi) + needed; + need_upper = need_lower + (node_blocks ? 1 : 0) + (dent_blocks ? 
1 : 0); + + if (free > need_upper) return false; - return (free_sections(sbi) + freed) <= - (node_secs + 2 * dent_secs + imeta_secs + - reserved_sections(sbi) + needed); + else if (free <= need_lower) + return true; + return !has_curseg_enough_space(sbi, node_blocks, dent_blocks); } static inline bool f2fs_is_checkpoint_ready(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index ed3e8b7a8260..37221e94e5ef 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -525,10 +525,11 @@ static int f2fs_set_test_dummy_encryption(struct super_block *sb, return -EINVAL; } f2fs_warn(sbi, "Test dummy encryption mode enabled"); -#else - f2fs_warn(sbi, "Test dummy encryption mount option ignored"); -#endif return 0; +#else + f2fs_warn(sbi, "test_dummy_encryption option not supported"); + return -EINVAL; +#endif } #ifdef CONFIG_F2FS_FS_COMPRESSION @@ -1339,9 +1340,6 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) spin_lock_init(&fi->i_size_lock); INIT_LIST_HEAD(&fi->dirty_list); INIT_LIST_HEAD(&fi->gdirty_list); - INIT_LIST_HEAD(&fi->inmem_ilist); - INIT_LIST_HEAD(&fi->inmem_pages); - mutex_init(&fi->inmem_lock); init_f2fs_rwsem(&fi->i_gc_rwsem[READ]); init_f2fs_rwsem(&fi->i_gc_rwsem[WRITE]); init_f2fs_rwsem(&fi->i_xattr_sem); @@ -1382,9 +1380,8 @@ static int f2fs_drop_inode(struct inode *inode) atomic_inc(&inode->i_count); spin_unlock(&inode->i_lock); - /* some remained atomic pages should discarded */ if (f2fs_is_atomic_file(inode)) - f2fs_drop_inmem_pages(inode); + f2fs_abort_atomic_write(inode, true); /* should remain fi->extent_tree for writepage */ f2fs_destroy_extent_node(inode); @@ -1707,18 +1704,23 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) u64 id = huge_encode_dev(sb->s_bdev->bd_dev); block_t total_count, user_block_count, start_count; u64 avail_node_count; + unsigned int total_valid_node_count; total_count = le64_to_cpu(sbi->raw_super->block_count); - user_block_count = sbi->user_block_count; start_count = le32_to_cpu(sbi->raw_super->segment0_blkaddr); buf->f_type = F2FS_SUPER_MAGIC; buf->f_bsize = sbi->blocksize; buf->f_blocks = total_count - start_count; + + spin_lock(&sbi->stat_lock); + + user_block_count = sbi->user_block_count; + total_valid_node_count = valid_node_count(sbi); + avail_node_count = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; buf->f_bfree = user_block_count - valid_user_blocks(sbi) - sbi->current_reserved_blocks; - spin_lock(&sbi->stat_lock); if (unlikely(buf->f_bfree <= sbi->unusable_block_count)) buf->f_bfree = 0; else @@ -1731,14 +1733,12 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) else buf->f_bavail = 0; - avail_node_count = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; - if (avail_node_count > user_block_count) { buf->f_files = user_block_count; buf->f_ffree = buf->f_bavail; } else { buf->f_files = avail_node_count; - buf->f_ffree = min(avail_node_count - valid_node_count(sbi), + buf->f_ffree = min(avail_node_count - total_valid_node_count, buf->f_bavail); } @@ -2055,7 +2055,7 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) { unsigned int s_flags = sbi->sb->s_flags; struct cp_control cpc; - unsigned int gc_mode; + unsigned int gc_mode = sbi->gc_mode; int err = 0; int ret; block_t unusable; @@ -2066,14 +2066,25 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) } sbi->sb->s_flags |= SB_ACTIVE; + /* check if we need more GC first */ + unusable = f2fs_get_unusable_blocks(sbi); + if (!f2fs_disable_cp_again(sbi, unusable)) + goto skip_gc; + 
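The reworked has_not_enough_free_secs() above computes a lower bound (whole dirty sections plus the reservation) and an upper bound (one extra section per partial remainder), and only probes the open segments when the free count falls between the two. A user-space rendering of that three-way decision, with BLKS_PER_SEC fixed at 512 and the curseg probe stubbed out; the inputs are illustrative, not real counters:

#include <stdbool.h>
#include <stdio.h>

#define BLKS_PER_SEC 512	/* assumed blocks per section */

/* stand-in for has_curseg_enough_space(): assume the open segments can
 * always absorb the partial-section remainders in this model */
static bool curseg_has_space(unsigned int node_blocks,
			     unsigned int dent_blocks)
{
	return true;
}

/* mirrors the new lower/upper bound check */
static bool not_enough_free_secs(unsigned int dirty_node_blocks,
				 unsigned int dirty_dent_blocks,
				 unsigned int free_secs,
				 unsigned int reserved_secs)
{
	unsigned int node_secs = dirty_node_blocks / BLKS_PER_SEC;
	unsigned int dent_secs = dirty_dent_blocks / BLKS_PER_SEC;
	unsigned int node_blocks = dirty_node_blocks % BLKS_PER_SEC;
	unsigned int dent_blocks = dirty_dent_blocks % BLKS_PER_SEC;
	unsigned int need_lower = node_secs + dent_secs + reserved_secs;
	unsigned int need_upper = need_lower + (node_blocks ? 1 : 0) +
						(dent_blocks ? 1 : 0);

	if (free_secs > need_upper)
		return false;		/* plenty of room */
	else if (free_secs <= need_lower)
		return true;		/* certainly short */
	/* in between: depends on space left in the open segments */
	return !curseg_has_space(node_blocks, dent_blocks);
}

int main(void)
{
	/* 1.5 sections of dirty node blocks, half a section of dents */
	printf("free=10: short=%d\n",
	       not_enough_free_secs(768, 256, 10, 4));	/* upper=7 -> no */
	printf("free=5:  short=%d\n",
	       not_enough_free_secs(768, 256, 5, 4));	/* lower=5 -> yes */
	printf("free=6:  short=%d\n",
	       not_enough_free_secs(768, 256, 6, 4));	/* boundary case */
	return 0;
}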
f2fs_update_time(sbi, DISABLE_TIME); - gc_mode = sbi->gc_mode; sbi->gc_mode = GC_URGENT_HIGH; while (!f2fs_time_over(sbi, DISABLE_TIME)) { + struct f2fs_gc_control gc_control = { + .victim_segno = NULL_SEGNO, + .init_gc_type = FG_GC, + .should_migrate_blocks = false, + .err_gc_skipped = true, + .nr_free_secs = 1 }; + f2fs_down_write(&sbi->gc_lock); - err = f2fs_gc(sbi, true, false, false, NULL_SEGNO); + err = f2fs_gc(sbi, &gc_control); if (err == -ENODATA) { err = 0; break; @@ -2094,6 +2105,7 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) goto restore_flag; } +skip_gc: f2fs_down_write(&sbi->gc_lock); cpc.reason = CP_PAUSE; set_sbi_flag(sbi, SBI_CP_DISABLED); @@ -2684,7 +2696,8 @@ int f2fs_quota_sync(struct super_block *sb, int type) if (!sb_has_quota_active(sb, cnt)) continue; - inode_lock(dqopt->files[cnt]); + if (!f2fs_sb_has_quota_ino(sbi)) + inode_lock(dqopt->files[cnt]); /* * do_quotactl @@ -2703,7 +2716,8 @@ int f2fs_quota_sync(struct super_block *sb, int type) f2fs_up_read(&sbi->quota_sem); f2fs_unlock_op(sbi); - inode_unlock(dqopt->files[cnt]); + if (!f2fs_sb_has_quota_ino(sbi)) + inode_unlock(dqopt->files[cnt]); if (ret) break; @@ -3648,22 +3662,29 @@ static int init_blkz_info(struct f2fs_sb_info *sbi, int devi) struct block_device *bdev = FDEV(devi).bdev; sector_t nr_sectors = bdev_nr_sectors(bdev); struct f2fs_report_zones_args rep_zone_arg; + u64 zone_sectors; int ret; if (!f2fs_sb_has_blkzoned(sbi)) return 0; - if (sbi->blocks_per_blkz && sbi->blocks_per_blkz != - SECTOR_TO_BLOCK(bdev_zone_sectors(bdev))) + zone_sectors = bdev_zone_sectors(bdev); + if (!is_power_of_2(zone_sectors)) { + f2fs_err(sbi, "F2FS does not support non power of 2 zone sizes\n"); return -EINVAL; - sbi->blocks_per_blkz = SECTOR_TO_BLOCK(bdev_zone_sectors(bdev)); + } + + if (sbi->blocks_per_blkz && sbi->blocks_per_blkz != + SECTOR_TO_BLOCK(zone_sectors)) + return -EINVAL; + sbi->blocks_per_blkz = SECTOR_TO_BLOCK(zone_sectors); if (sbi->log_blocks_per_blkz && sbi->log_blocks_per_blkz != __ilog2_u32(sbi->blocks_per_blkz)) return -EINVAL; sbi->log_blocks_per_blkz = __ilog2_u32(sbi->blocks_per_blkz); FDEV(devi).nr_blkz = SECTOR_TO_BLOCK(nr_sectors) >> sbi->log_blocks_per_blkz; - if (nr_sectors & (bdev_zone_sectors(bdev) - 1)) + if (nr_sectors & (zone_sectors - 1)) FDEV(devi).nr_blkz++; FDEV(devi).blkz_seq = f2fs_kvzalloc(sbi, @@ -4070,30 +4091,9 @@ try_onemore: set_sbi_flag(sbi, SBI_POR_DOING); spin_lock_init(&sbi->stat_lock); - for (i = 0; i < NR_PAGE_TYPE; i++) { - int n = (i == META) ? 
1 : NR_TEMP_TYPE; - int j; - - sbi->write_io[i] = - f2fs_kmalloc(sbi, - array_size(n, - sizeof(struct f2fs_bio_info)), - GFP_KERNEL); - if (!sbi->write_io[i]) { - err = -ENOMEM; - goto free_bio_info; - } - - for (j = HOT; j < n; j++) { - init_f2fs_rwsem(&sbi->write_io[i][j].io_rwsem); - sbi->write_io[i][j].sbi = sbi; - sbi->write_io[i][j].bio = NULL; - spin_lock_init(&sbi->write_io[i][j].io_lock); - INIT_LIST_HEAD(&sbi->write_io[i][j].io_list); - INIT_LIST_HEAD(&sbi->write_io[i][j].bio_list); - init_f2fs_rwsem(&sbi->write_io[i][j].bio_list_lock); - } - } + err = f2fs_init_write_merge_io(sbi); + if (err) + goto free_bio_info; init_f2fs_rwsem(&sbi->cp_rwsem); init_f2fs_rwsem(&sbi->quota_sem); diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c index 65395ae188aa..7b8f2b41c29b 100644 --- a/fs/f2fs/verity.c +++ b/fs/f2fs/verity.c @@ -129,7 +129,7 @@ static int f2fs_begin_enable_verity(struct file *filp) if (f2fs_verity_in_progress(inode)) return -EBUSY; - if (f2fs_is_atomic_file(inode) || f2fs_is_volatile_file(inode)) + if (f2fs_is_atomic_file(inode)) return -EOPNOTSUPP; /* diff --git a/fs/ksmbd/connection.c b/fs/ksmbd/connection.c index 208d2cff7bd3..e8f476c5f189 100644 --- a/fs/ksmbd/connection.c +++ b/fs/ksmbd/connection.c @@ -62,7 +62,7 @@ struct ksmbd_conn *ksmbd_conn_alloc(void) atomic_set(&conn->req_running, 0); atomic_set(&conn->r_count, 0); conn->total_credits = 1; - conn->outstanding_credits = 1; + conn->outstanding_credits = 0; init_waitqueue_head(&conn->req_running_q); INIT_LIST_HEAD(&conn->conns_list); @@ -205,31 +205,31 @@ int ksmbd_conn_write(struct ksmbd_work *work) return 0; } -int ksmbd_conn_rdma_read(struct ksmbd_conn *conn, void *buf, - unsigned int buflen, u32 remote_key, u64 remote_offset, - u32 remote_len) +int ksmbd_conn_rdma_read(struct ksmbd_conn *conn, + void *buf, unsigned int buflen, + struct smb2_buffer_desc_v1 *desc, + unsigned int desc_len) { int ret = -EINVAL; if (conn->transport->ops->rdma_read) ret = conn->transport->ops->rdma_read(conn->transport, buf, buflen, - remote_key, remote_offset, - remote_len); + desc, desc_len); return ret; } -int ksmbd_conn_rdma_write(struct ksmbd_conn *conn, void *buf, - unsigned int buflen, u32 remote_key, - u64 remote_offset, u32 remote_len) +int ksmbd_conn_rdma_write(struct ksmbd_conn *conn, + void *buf, unsigned int buflen, + struct smb2_buffer_desc_v1 *desc, + unsigned int desc_len) { int ret = -EINVAL; if (conn->transport->ops->rdma_write) ret = conn->transport->ops->rdma_write(conn->transport, buf, buflen, - remote_key, remote_offset, - remote_len); + desc, desc_len); return ret; } diff --git a/fs/ksmbd/connection.h b/fs/ksmbd/connection.h index 7a59aacb5daa..98c1cbe45ec9 100644 --- a/fs/ksmbd/connection.h +++ b/fs/ksmbd/connection.h @@ -122,11 +122,14 @@ struct ksmbd_transport_ops { int (*writev)(struct ksmbd_transport *t, struct kvec *iovs, int niov, int size, bool need_invalidate_rkey, unsigned int remote_key); - int (*rdma_read)(struct ksmbd_transport *t, void *buf, unsigned int len, - u32 remote_key, u64 remote_offset, u32 remote_len); - int (*rdma_write)(struct ksmbd_transport *t, void *buf, - unsigned int len, u32 remote_key, u64 remote_offset, - u32 remote_len); + int (*rdma_read)(struct ksmbd_transport *t, + void *buf, unsigned int len, + struct smb2_buffer_desc_v1 *desc, + unsigned int desc_len); + int (*rdma_write)(struct ksmbd_transport *t, + void *buf, unsigned int len, + struct smb2_buffer_desc_v1 *desc, + unsigned int desc_len); }; struct ksmbd_transport { @@ -148,12 +151,14 @@ struct ksmbd_conn 
*ksmbd_conn_alloc(void); void ksmbd_conn_free(struct ksmbd_conn *conn); bool ksmbd_conn_lookup_dialect(struct ksmbd_conn *c); int ksmbd_conn_write(struct ksmbd_work *work); -int ksmbd_conn_rdma_read(struct ksmbd_conn *conn, void *buf, - unsigned int buflen, u32 remote_key, u64 remote_offset, - u32 remote_len); -int ksmbd_conn_rdma_write(struct ksmbd_conn *conn, void *buf, - unsigned int buflen, u32 remote_key, u64 remote_offset, - u32 remote_len); +int ksmbd_conn_rdma_read(struct ksmbd_conn *conn, + void *buf, unsigned int buflen, + struct smb2_buffer_desc_v1 *desc, + unsigned int desc_len); +int ksmbd_conn_rdma_write(struct ksmbd_conn *conn, + void *buf, unsigned int buflen, + struct smb2_buffer_desc_v1 *desc, + unsigned int desc_len); void ksmbd_conn_enqueue_request(struct ksmbd_work *work); int ksmbd_conn_try_dequeue_request(struct ksmbd_work *work); void ksmbd_conn_init_server_callbacks(struct ksmbd_conn_ops *ops); diff --git a/fs/ksmbd/ksmbd_netlink.h b/fs/ksmbd/ksmbd_netlink.h index ebe6ca08467a..52aa0adeb951 100644 --- a/fs/ksmbd/ksmbd_netlink.h +++ b/fs/ksmbd/ksmbd_netlink.h @@ -104,7 +104,8 @@ struct ksmbd_startup_request { */ __u32 sub_auth[3]; /* Subauth value for Security ID */ __u32 smb2_max_credits; /* MAX credits */ - __u32 reserved[128]; /* Reserved room */ + __u32 smbd_max_io_size; /* smbd read write size */ + __u32 reserved[127]; /* Reserved room */ __u32 ifc_list_sz; /* interfaces list size */ __s8 ____payload[]; }; diff --git a/fs/ksmbd/misc.c b/fs/ksmbd/misc.c index 1e2076a53bed..df991107ad2c 100644 --- a/fs/ksmbd/misc.c +++ b/fs/ksmbd/misc.c @@ -20,7 +20,7 @@ * wildcard '*' and '?' * TODO : implement consideration about DOS_DOT, DOS_QM and DOS_STAR * - * @string: string to compare with a pattern + * @str: string to compare with a pattern * @len: string length * @pattern: pattern string which might include wildcard '*' and '?' * @@ -152,8 +152,8 @@ out: /** * convert_to_nt_pathname() - extract and return windows path string * whose share directory prefix was removed from file path - * @filename : unix filename - * @sharepath: share path string + * @share: ksmbd_share_config pointer + * @path: path to report * * Return : windows path string or error */ @@ -250,8 +250,8 @@ char *ksmbd_extract_sharename(char *treename) /** * convert_to_unix_name() - convert windows name to unix format - * @path: name to be converted - * @tid: tree id of mathing share + * @share: ksmbd_share_config pointer + * @name: file name that is relative to share * * Return: converted name on success, otherwise NULL */ diff --git a/fs/ksmbd/smb2misc.c b/fs/ksmbd/smb2misc.c index 4a9460153b59..f8f456377a51 100644 --- a/fs/ksmbd/smb2misc.c +++ b/fs/ksmbd/smb2misc.c @@ -338,7 +338,7 @@ static int smb2_validate_credit_charge(struct ksmbd_conn *conn, ret = 1; } - if ((u64)conn->outstanding_credits + credit_charge > conn->vals->max_credits) { + if ((u64)conn->outstanding_credits + credit_charge > conn->total_credits) { ksmbd_debug(SMB, "Limits exceeding the maximum allowable outstanding requests, given : %u, pending : %u\n", credit_charge, conn->outstanding_credits); ret = 1; diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index 16c803a9d996..e6f4ccc12f49 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -3938,6 +3938,12 @@ int smb2_query_dir(struct ksmbd_work *work) set_ctx_actor(&dir_fp->readdir_data.ctx, __query_dir); rc = iterate_dir(dir_fp->filp, &dir_fp->readdir_data.ctx); + /* + * req->OutputBufferLength is too small to contain even one entry. 
+ * In this case, it immediately returns OutputBufferLength 0 to client. + */ + if (!d_info.out_buf_len && !d_info.num_entry) + goto no_buf_len; if (rc == 0) restart_ctx(&dir_fp->readdir_data.ctx); if (rc == -ENOSPC) @@ -3964,10 +3970,12 @@ int smb2_query_dir(struct ksmbd_work *work) rsp->Buffer[0] = 0; inc_rfc1001_len(work->response_buf, 9); } else { +no_buf_len: ((struct file_directory_info *) ((char *)rsp->Buffer + d_info.last_entry_offset)) ->NextEntryOffset = 0; - d_info.data_count -= d_info.last_entry_off_align; + if (d_info.data_count >= d_info.last_entry_off_align) + d_info.data_count -= d_info.last_entry_off_align; rsp->StructureSize = cpu_to_le16(9); rsp->OutputBufferOffset = cpu_to_le16(72); @@ -6116,7 +6124,6 @@ out: static int smb2_set_remote_key_for_rdma(struct ksmbd_work *work, struct smb2_buffer_desc_v1 *desc, __le32 Channel, - __le16 ChannelInfoOffset, __le16 ChannelInfoLength) { unsigned int i, ch_count; @@ -6134,15 +6141,13 @@ static int smb2_set_remote_key_for_rdma(struct ksmbd_work *work, le32_to_cpu(desc[i].length)); } } - if (ch_count != 1) { - ksmbd_debug(RDMA, "RDMA multiple buffer descriptors %d are not supported yet\n", - ch_count); + if (!ch_count) return -EINVAL; - } work->need_invalidate_rkey = (Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE); - work->remote_key = le32_to_cpu(desc->token); + if (Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE) + work->remote_key = le32_to_cpu(desc->token); return 0; } @@ -6150,14 +6155,12 @@ static ssize_t smb2_read_rdma_channel(struct ksmbd_work *work, struct smb2_read_req *req, void *data_buf, size_t length) { - struct smb2_buffer_desc_v1 *desc = - (struct smb2_buffer_desc_v1 *)&req->Buffer[0]; int err; err = ksmbd_conn_rdma_write(work->conn, data_buf, length, - le32_to_cpu(desc->token), - le64_to_cpu(desc->offset), - le32_to_cpu(desc->length)); + (struct smb2_buffer_desc_v1 *) + ((char *)req + le16_to_cpu(req->ReadChannelInfoOffset)), + le16_to_cpu(req->ReadChannelInfoLength)); if (err) return err; @@ -6180,6 +6183,8 @@ int smb2_read(struct ksmbd_work *work) size_t length, mincount; ssize_t nbytes = 0, remain_bytes = 0; int err = 0; + bool is_rdma_channel = false; + unsigned int max_read_size = conn->vals->max_read_size; WORK_BUFFERS(work, req, rsp); @@ -6191,6 +6196,11 @@ int smb2_read(struct ksmbd_work *work) if (req->Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE || req->Channel == SMB2_CHANNEL_RDMA_V1) { + is_rdma_channel = true; + max_read_size = get_smbd_max_read_write_size(); + } + + if (is_rdma_channel == true) { unsigned int ch_offset = le16_to_cpu(req->ReadChannelInfoOffset); if (ch_offset < offsetof(struct smb2_read_req, Buffer)) { @@ -6201,7 +6211,6 @@ int smb2_read(struct ksmbd_work *work) (struct smb2_buffer_desc_v1 *) ((char *)req + ch_offset), req->Channel, - req->ReadChannelInfoOffset, req->ReadChannelInfoLength); if (err) goto out; @@ -6223,9 +6232,9 @@ int smb2_read(struct ksmbd_work *work) length = le32_to_cpu(req->Length); mincount = le32_to_cpu(req->MinimumCount); - if (length > conn->vals->max_read_size) { + if (length > max_read_size) { ksmbd_debug(SMB, "limiting read size to max size(%u)\n", - conn->vals->max_read_size); + max_read_size); err = -EINVAL; goto out; } @@ -6257,8 +6266,7 @@ int smb2_read(struct ksmbd_work *work) ksmbd_debug(SMB, "nbytes %zu, offset %lld mincount %zu\n", nbytes, offset, mincount); - if (req->Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE || - req->Channel == SMB2_CHANNEL_RDMA_V1) { + if (is_rdma_channel == true) { /* write data to the client using rdma channel */ remain_bytes = 
smb2_read_rdma_channel(work, req, work->aux_payload_buf, @@ -6328,23 +6336,18 @@ static noinline int smb2_write_pipe(struct ksmbd_work *work) length = le32_to_cpu(req->Length); id = req->VolatileFileId; - if (le16_to_cpu(req->DataOffset) == - offsetof(struct smb2_write_req, Buffer)) { - data_buf = (char *)&req->Buffer[0]; - } else { - if ((u64)le16_to_cpu(req->DataOffset) + length > - get_rfc1002_len(work->request_buf)) { - pr_err("invalid write data offset %u, smb_len %u\n", - le16_to_cpu(req->DataOffset), - get_rfc1002_len(work->request_buf)); - err = -EINVAL; - goto out; - } - - data_buf = (char *)(((char *)&req->hdr.ProtocolId) + - le16_to_cpu(req->DataOffset)); + if ((u64)le16_to_cpu(req->DataOffset) + length > + get_rfc1002_len(work->request_buf)) { + pr_err("invalid write data offset %u, smb_len %u\n", + le16_to_cpu(req->DataOffset), + get_rfc1002_len(work->request_buf)); + err = -EINVAL; + goto out; } + data_buf = (char *)(((char *)&req->hdr.ProtocolId) + + le16_to_cpu(req->DataOffset)); + rpc_resp = ksmbd_rpc_write(work->sess, id, data_buf, length); if (rpc_resp) { if (rpc_resp->flags == KSMBD_RPC_ENOTIMPLEMENTED) { @@ -6384,21 +6387,18 @@ static ssize_t smb2_write_rdma_channel(struct ksmbd_work *work, struct ksmbd_file *fp, loff_t offset, size_t length, bool sync) { - struct smb2_buffer_desc_v1 *desc; char *data_buf; int ret; ssize_t nbytes; - desc = (struct smb2_buffer_desc_v1 *)&req->Buffer[0]; - data_buf = kvmalloc(length, GFP_KERNEL | __GFP_ZERO); if (!data_buf) return -ENOMEM; ret = ksmbd_conn_rdma_read(work->conn, data_buf, length, - le32_to_cpu(desc->token), - le64_to_cpu(desc->offset), - le32_to_cpu(desc->length)); + (struct smb2_buffer_desc_v1 *) + ((char *)req + le16_to_cpu(req->WriteChannelInfoOffset)), + le16_to_cpu(req->WriteChannelInfoLength)); if (ret < 0) { kvfree(data_buf); return ret; @@ -6427,8 +6427,9 @@ int smb2_write(struct ksmbd_work *work) size_t length; ssize_t nbytes; char *data_buf; - bool writethrough = false; + bool writethrough = false, is_rdma_channel = false; int err = 0; + unsigned int max_write_size = work->conn->vals->max_write_size; WORK_BUFFERS(work, req, rsp); @@ -6437,8 +6438,17 @@ int smb2_write(struct ksmbd_work *work) return smb2_write_pipe(work); } + offset = le64_to_cpu(req->Offset); + length = le32_to_cpu(req->Length); + if (req->Channel == SMB2_CHANNEL_RDMA_V1 || req->Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE) { + is_rdma_channel = true; + max_write_size = get_smbd_max_read_write_size(); + length = le32_to_cpu(req->RemainingBytes); + } + + if (is_rdma_channel == true) { unsigned int ch_offset = le16_to_cpu(req->WriteChannelInfoOffset); if (req->Length != 0 || req->DataOffset != 0 || @@ -6450,7 +6460,6 @@ int smb2_write(struct ksmbd_work *work) (struct smb2_buffer_desc_v1 *) ((char *)req + ch_offset), req->Channel, - req->WriteChannelInfoOffset, req->WriteChannelInfoLength); if (err) goto out; @@ -6474,12 +6483,9 @@ int smb2_write(struct ksmbd_work *work) goto out; } - offset = le64_to_cpu(req->Offset); - length = le32_to_cpu(req->Length); - - if (length > work->conn->vals->max_write_size) { + if (length > max_write_size) { ksmbd_debug(SMB, "limiting write size to max size(%u)\n", - work->conn->vals->max_write_size); + max_write_size); err = -EINVAL; goto out; } @@ -6487,24 +6493,17 @@ int smb2_write(struct ksmbd_work *work) if (le32_to_cpu(req->Flags) & SMB2_WRITEFLAG_WRITE_THROUGH) writethrough = true; - if (req->Channel != SMB2_CHANNEL_RDMA_V1 && - req->Channel != SMB2_CHANNEL_RDMA_V1_INVALIDATE) { - if 
(le16_to_cpu(req->DataOffset) == - offsetof(struct smb2_write_req, Buffer)) { - data_buf = (char *)&req->Buffer[0]; - } else { - if ((u64)le16_to_cpu(req->DataOffset) + length > - get_rfc1002_len(work->request_buf)) { - pr_err("invalid write data offset %u, smb_len %u\n", - le16_to_cpu(req->DataOffset), - get_rfc1002_len(work->request_buf)); - err = -EINVAL; - goto out; - } - - data_buf = (char *)(((char *)&req->hdr.ProtocolId) + - le16_to_cpu(req->DataOffset)); + if (is_rdma_channel == false) { + if ((u64)le16_to_cpu(req->DataOffset) + length > + get_rfc1002_len(work->request_buf)) { + pr_err("invalid write data offset %u, smb_len %u\n", + le16_to_cpu(req->DataOffset), + get_rfc1002_len(work->request_buf)); + err = -EINVAL; + goto out; } + data_buf = (char *)(((char *)&req->hdr.ProtocolId) + + le16_to_cpu(req->DataOffset)); ksmbd_debug(SMB, "flags %u\n", le32_to_cpu(req->Flags)); if (le32_to_cpu(req->Flags) & SMB2_WRITEFLAG_WRITE_THROUGH) @@ -6520,8 +6519,7 @@ int smb2_write(struct ksmbd_work *work) /* read data from the client using rdma channel, and * write the data. */ - nbytes = smb2_write_rdma_channel(work, req, fp, offset, - le32_to_cpu(req->RemainingBytes), + nbytes = smb2_write_rdma_channel(work, req, fp, offset, length, writethrough); if (nbytes < 0) { err = (int)nbytes; diff --git a/fs/ksmbd/smb_common.c b/fs/ksmbd/smb_common.c index 9a7e211dbf4f..7f8ab14fb8ec 100644 --- a/fs/ksmbd/smb_common.c +++ b/fs/ksmbd/smb_common.c @@ -140,8 +140,10 @@ int ksmbd_verify_smb_message(struct ksmbd_work *work) hdr = work->request_buf; if (*(__le32 *)hdr->Protocol == SMB1_PROTO_NUMBER && - hdr->Command == SMB_COM_NEGOTIATE) + hdr->Command == SMB_COM_NEGOTIATE) { + work->conn->outstanding_credits++; return 0; + } return -EINVAL; } diff --git a/fs/ksmbd/smbacl.c b/fs/ksmbd/smbacl.c index 6ecf55ea1fed..38f23bf981ac 100644 --- a/fs/ksmbd/smbacl.c +++ b/fs/ksmbd/smbacl.c @@ -1261,6 +1261,7 @@ int smb_check_perm_dacl(struct ksmbd_conn *conn, struct path *path, if (!access_bits) access_bits = SET_MINIMUM_RIGHTS; + posix_acl_release(posix_acls); goto check_access_bits; } } diff --git a/fs/ksmbd/transport_ipc.c b/fs/ksmbd/transport_ipc.c index 3ad6881e0f7e..7cb0eeb07c80 100644 --- a/fs/ksmbd/transport_ipc.c +++ b/fs/ksmbd/transport_ipc.c @@ -26,6 +26,7 @@ #include "mgmt/ksmbd_ida.h" #include "connection.h" #include "transport_tcp.h" +#include "transport_rdma.h" #define IPC_WAIT_TIMEOUT (2 * HZ) @@ -303,6 +304,8 @@ static int ipc_server_config_on_startup(struct ksmbd_startup_request *req) init_smb2_max_trans_size(req->smb2_max_trans); if (req->smb2_max_credits) init_smb2_max_credits(req->smb2_max_credits); + if (req->smbd_max_io_size) + init_smbd_max_io_size(req->smbd_max_io_size); ret = ksmbd_set_netbios_name(req->netbios_name); ret |= ksmbd_set_server_string(req->server_string); diff --git a/fs/ksmbd/transport_rdma.c b/fs/ksmbd/transport_rdma.c index e646d79554b8..d035e060c2f0 100644 --- a/fs/ksmbd/transport_rdma.c +++ b/fs/ksmbd/transport_rdma.c @@ -80,9 +80,7 @@ static int smb_direct_max_fragmented_recv_size = 1024 * 1024; /* The maximum single-message size which can be received */ static int smb_direct_max_receive_size = 8192; -static int smb_direct_max_read_write_size = 524224; - -static int smb_direct_max_outstanding_rw_ops = 8; +static int smb_direct_max_read_write_size = SMBD_DEFAULT_IOSIZE; static LIST_HEAD(smb_direct_device_list); static DEFINE_RWLOCK(smb_direct_device_lock); @@ -147,18 +145,18 @@ struct smb_direct_transport { atomic_t send_credits; spinlock_t lock_new_recv_credits; int 
new_recv_credits; - atomic_t rw_avail_ops; + int max_rw_credits; + int pages_per_rw_credit; + atomic_t rw_credits; wait_queue_head_t wait_send_credits; - wait_queue_head_t wait_rw_avail_ops; + wait_queue_head_t wait_rw_credits; mempool_t *sendmsg_mempool; struct kmem_cache *sendmsg_cache; mempool_t *recvmsg_mempool; struct kmem_cache *recvmsg_cache; - wait_queue_head_t wait_send_payload_pending; - atomic_t send_payload_pending; wait_queue_head_t wait_send_pending; atomic_t send_pending; @@ -208,12 +206,25 @@ struct smb_direct_recvmsg { struct smb_direct_rdma_rw_msg { struct smb_direct_transport *t; struct ib_cqe cqe; + int status; struct completion *completion; + struct list_head list; struct rdma_rw_ctx rw_ctx; struct sg_table sgt; struct scatterlist sg_list[]; }; +void init_smbd_max_io_size(unsigned int sz) +{ + sz = clamp_val(sz, SMBD_MIN_IOSIZE, SMBD_MAX_IOSIZE); + smb_direct_max_read_write_size = sz; +} + +unsigned int get_smbd_max_read_write_size(void) +{ + return smb_direct_max_read_write_size; +} + static inline int get_buf_page_count(void *buf, int size) { return DIV_ROUND_UP((uintptr_t)buf + size, PAGE_SIZE) - @@ -377,7 +388,7 @@ static struct smb_direct_transport *alloc_transport(struct rdma_cm_id *cm_id) t->reassembly_queue_length = 0; init_waitqueue_head(&t->wait_reassembly_queue); init_waitqueue_head(&t->wait_send_credits); - init_waitqueue_head(&t->wait_rw_avail_ops); + init_waitqueue_head(&t->wait_rw_credits); spin_lock_init(&t->receive_credit_lock); spin_lock_init(&t->recvmsg_queue_lock); @@ -386,8 +397,6 @@ static struct smb_direct_transport *alloc_transport(struct rdma_cm_id *cm_id) spin_lock_init(&t->empty_recvmsg_queue_lock); INIT_LIST_HEAD(&t->empty_recvmsg_queue); - init_waitqueue_head(&t->wait_send_payload_pending); - atomic_set(&t->send_payload_pending, 0); init_waitqueue_head(&t->wait_send_pending); atomic_set(&t->send_pending, 0); @@ -417,8 +426,6 @@ static void free_transport(struct smb_direct_transport *t) wake_up_interruptible(&t->wait_send_credits); ksmbd_debug(RDMA, "wait for all send posted to IB to finish\n"); - wait_event(t->wait_send_payload_pending, - atomic_read(&t->send_payload_pending) == 0); wait_event(t->wait_send_pending, atomic_read(&t->send_pending) == 0); @@ -569,6 +576,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) } t->negotiation_requested = true; t->full_packet_received = true; + t->status = SMB_DIRECT_CS_CONNECTED; enqueue_reassembly(t, recvmsg, 0); wake_up_interruptible(&t->wait_status); break; @@ -873,13 +881,8 @@ static void send_done(struct ib_cq *cq, struct ib_wc *wc) smb_direct_disconnect_rdma_connection(t); } - if (sendmsg->num_sge > 1) { - if (atomic_dec_and_test(&t->send_payload_pending)) - wake_up(&t->wait_send_payload_pending); - } else { - if (atomic_dec_and_test(&t->send_pending)) - wake_up(&t->wait_send_pending); - } + if (atomic_dec_and_test(&t->send_pending)) + wake_up(&t->wait_send_pending); /* iterate and free the list of messages in reverse. the list's head * is invalid. 
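The two helpers above replace the hard-coded 524224-byte default: userspace can now choose the SMBD I/O size, but init_smbd_max_io_size() clamps it into [SMBD_MIN_IOSIZE, SMBD_MAX_IOSIZE] before it takes effect, and get_smbd_max_read_write_size() hands the result to the SMB2 read/write paths. A minimal userspace sketch of this clamped-tunable pattern (the 512 KiB / 8 MiB / 16 MiB bounds come from the SMBD_*_IOSIZE macros added later in this series; clamp_val() is reimplemented here since the kernel's is unavailable in userspace):

#include <stdio.h>

#define SMBD_DEFAULT_IOSIZE (8 * 1024 * 1024)
#define SMBD_MIN_IOSIZE     (512 * 1024)
#define SMBD_MAX_IOSIZE     (16 * 1024 * 1024)

/* userspace stand-in for the kernel's clamp_val() */
#define clamp_val(v, lo, hi) ((v) < (lo) ? (lo) : ((v) > (hi) ? (hi) : (v)))

static unsigned int smb_direct_max_read_write_size = SMBD_DEFAULT_IOSIZE;

/* accept any configured value, but never beyond what the transport honors */
static void init_smbd_max_io_size(unsigned int sz)
{
	smb_direct_max_read_write_size =
		clamp_val(sz, SMBD_MIN_IOSIZE, SMBD_MAX_IOSIZE);
}

static unsigned int get_smbd_max_read_write_size(void)
{
	return smb_direct_max_read_write_size;
}

int main(void)
{
	init_smbd_max_io_size(4 * 1024);          /* too small: clamped to 512 KiB */
	printf("%u\n", get_smbd_max_read_write_size());
	init_smbd_max_io_size(64 * 1024 * 1024);  /* too large: clamped to 16 MiB */
	printf("%u\n", get_smbd_max_read_write_size());
	return 0;
}

Clamping once at configuration time means smb2_read() and smb2_write() can trust the value they copy into max_read_size/max_write_size without revalidating it per request.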
@@ -911,21 +914,12 @@ static int smb_direct_post_send(struct smb_direct_transport *t, { int ret; - if (wr->num_sge > 1) - atomic_inc(&t->send_payload_pending); - else - atomic_inc(&t->send_pending); - + atomic_inc(&t->send_pending); ret = ib_post_send(t->qp, wr, NULL); if (ret) { pr_err("failed to post send: %d\n", ret); - if (wr->num_sge > 1) { - if (atomic_dec_and_test(&t->send_payload_pending)) - wake_up(&t->wait_send_payload_pending); - } else { - if (atomic_dec_and_test(&t->send_pending)) - wake_up(&t->wait_send_pending); - } + if (atomic_dec_and_test(&t->send_pending)) + wake_up(&t->wait_send_pending); smb_direct_disconnect_rdma_connection(t); } return ret; @@ -983,18 +977,19 @@ static int smb_direct_flush_send_list(struct smb_direct_transport *t, } static int wait_for_credits(struct smb_direct_transport *t, - wait_queue_head_t *waitq, atomic_t *credits) + wait_queue_head_t *waitq, atomic_t *total_credits, + int needed) { int ret; do { - if (atomic_dec_return(credits) >= 0) + if (atomic_sub_return(needed, total_credits) >= 0) return 0; - atomic_inc(credits); + atomic_add(needed, total_credits); ret = wait_event_interruptible(*waitq, - atomic_read(credits) > 0 || - t->status != SMB_DIRECT_CS_CONNECTED); + atomic_read(total_credits) >= needed || + t->status != SMB_DIRECT_CS_CONNECTED); if (t->status != SMB_DIRECT_CS_CONNECTED) return -ENOTCONN; @@ -1015,7 +1010,19 @@ static int wait_for_send_credits(struct smb_direct_transport *t, return ret; } - return wait_for_credits(t, &t->wait_send_credits, &t->send_credits); + return wait_for_credits(t, &t->wait_send_credits, &t->send_credits, 1); +} + +static int wait_for_rw_credits(struct smb_direct_transport *t, int credits) +{ + return wait_for_credits(t, &t->wait_rw_credits, &t->rw_credits, credits); +} + +static int calc_rw_credits(struct smb_direct_transport *t, + char *buf, unsigned int len) +{ + return DIV_ROUND_UP(get_buf_page_count(buf, len), + t->pages_per_rw_credit); } static int smb_direct_create_header(struct smb_direct_transport *t, @@ -1086,7 +1093,7 @@ static int get_sg_list(void *buf, int size, struct scatterlist *sg_list, int nen int offset, len; int i = 0; - if (nentries < get_buf_page_count(buf, size)) + if (size <= 0 || nentries < get_buf_page_count(buf, size)) return -EINVAL; offset = offset_in_page(buf); @@ -1118,7 +1125,7 @@ static int get_mapped_sg_list(struct ib_device *device, void *buf, int size, int npages; npages = get_sg_list(buf, size, sg_list, nentries); - if (npages <= 0) + if (npages < 0) return -EINVAL; return ib_dma_map_sg(device, sg_list, npages, dir); } @@ -1313,11 +1320,21 @@ done: * that means all the I/Os have been out and we are good to return */ - wait_event(st->wait_send_payload_pending, - atomic_read(&st->send_payload_pending) == 0); + wait_event(st->wait_send_pending, + atomic_read(&st->send_pending) == 0); return ret; } +static void smb_direct_free_rdma_rw_msg(struct smb_direct_transport *t, + struct smb_direct_rdma_rw_msg *msg, + enum dma_data_direction dir) +{ + rdma_rw_ctx_destroy(&msg->rw_ctx, t->qp, t->qp->port, + msg->sgt.sgl, msg->sgt.nents, dir); + sg_free_table_chained(&msg->sgt, SG_CHUNK_SIZE); + kfree(msg); +} + static void read_write_done(struct ib_cq *cq, struct ib_wc *wc, enum dma_data_direction dir) { @@ -1326,19 +1343,14 @@ static void read_write_done(struct ib_cq *cq, struct ib_wc *wc, struct smb_direct_transport *t = msg->t; if (wc->status != IB_WC_SUCCESS) { + msg->status = -EIO; pr_err("read/write error. 
opcode = %d, status = %s(%d)\n", wc->opcode, ib_wc_status_msg(wc->status), wc->status); - smb_direct_disconnect_rdma_connection(t); + if (wc->status != IB_WC_WR_FLUSH_ERR) + smb_direct_disconnect_rdma_connection(t); } - if (atomic_inc_return(&t->rw_avail_ops) > 0) - wake_up(&t->wait_rw_avail_ops); - - rdma_rw_ctx_destroy(&msg->rw_ctx, t->qp, t->qp->port, - msg->sg_list, msg->sgt.nents, dir); - sg_free_table_chained(&msg->sgt, SG_CHUNK_SIZE); complete(msg->completion); - kfree(msg); } static void read_done(struct ib_cq *cq, struct ib_wc *wc) @@ -1351,94 +1363,141 @@ static void write_done(struct ib_cq *cq, struct ib_wc *wc) read_write_done(cq, wc, DMA_TO_DEVICE); } -static int smb_direct_rdma_xmit(struct smb_direct_transport *t, void *buf, - int buf_len, u32 remote_key, u64 remote_offset, - u32 remote_len, bool is_read) +static int smb_direct_rdma_xmit(struct smb_direct_transport *t, + void *buf, int buf_len, + struct smb2_buffer_desc_v1 *desc, + unsigned int desc_len, + bool is_read) { - struct smb_direct_rdma_rw_msg *msg; - int ret; + struct smb_direct_rdma_rw_msg *msg, *next_msg; + int i, ret; DECLARE_COMPLETION_ONSTACK(completion); - struct ib_send_wr *first_wr = NULL; + struct ib_send_wr *first_wr; + LIST_HEAD(msg_list); + char *desc_buf; + int credits_needed; + unsigned int desc_buf_len; + size_t total_length = 0; - ret = wait_for_credits(t, &t->wait_rw_avail_ops, &t->rw_avail_ops); + if (t->status != SMB_DIRECT_CS_CONNECTED) + return -ENOTCONN; + + /* calculate needed credits */ + credits_needed = 0; + desc_buf = buf; + for (i = 0; i < desc_len / sizeof(*desc); i++) { + desc_buf_len = le32_to_cpu(desc[i].length); + + credits_needed += calc_rw_credits(t, desc_buf, desc_buf_len); + desc_buf += desc_buf_len; + total_length += desc_buf_len; + if (desc_buf_len == 0 || total_length > buf_len || + total_length > t->max_rdma_rw_size) + return -EINVAL; + } + + ksmbd_debug(RDMA, "RDMA %s, len %#x, needed credits %#x\n", + is_read ? "read" : "write", buf_len, credits_needed); + + ret = wait_for_rw_credits(t, credits_needed); if (ret < 0) return ret; - /* TODO: mempool */ - msg = kmalloc(offsetof(struct smb_direct_rdma_rw_msg, sg_list) + - sizeof(struct scatterlist) * SG_CHUNK_SIZE, GFP_KERNEL); - if (!msg) { - atomic_inc(&t->rw_avail_ops); - return -ENOMEM; + /* build rdma_rw_ctx for each descriptor */ + desc_buf = buf; + for (i = 0; i < desc_len / sizeof(*desc); i++) { + msg = kzalloc(offsetof(struct smb_direct_rdma_rw_msg, sg_list) + + sizeof(struct scatterlist) * SG_CHUNK_SIZE, GFP_KERNEL); + if (!msg) { + ret = -ENOMEM; + goto out; + } + + desc_buf_len = le32_to_cpu(desc[i].length); + + msg->t = t; + msg->cqe.done = is_read ? read_done : write_done; + msg->completion = &completion; + + msg->sgt.sgl = &msg->sg_list[0]; + ret = sg_alloc_table_chained(&msg->sgt, + get_buf_page_count(desc_buf, desc_buf_len), + msg->sg_list, SG_CHUNK_SIZE); + if (ret) { + kfree(msg); + ret = -ENOMEM; + goto out; + } + + ret = get_sg_list(desc_buf, desc_buf_len, + msg->sgt.sgl, msg->sgt.orig_nents); + if (ret < 0) { + sg_free_table_chained(&msg->sgt, SG_CHUNK_SIZE); + kfree(msg); + goto out; + } + + ret = rdma_rw_ctx_init(&msg->rw_ctx, t->qp, t->qp->port, + msg->sgt.sgl, + get_buf_page_count(desc_buf, desc_buf_len), + 0, + le64_to_cpu(desc[i].offset), + le32_to_cpu(desc[i].token), + is_read ? 
DMA_FROM_DEVICE : DMA_TO_DEVICE); + if (ret < 0) { + pr_err("failed to init rdma_rw_ctx: %d\n", ret); + sg_free_table_chained(&msg->sgt, SG_CHUNK_SIZE); + kfree(msg); + goto out; + } + + list_add_tail(&msg->list, &msg_list); + desc_buf += desc_buf_len; } - msg->sgt.sgl = &msg->sg_list[0]; - ret = sg_alloc_table_chained(&msg->sgt, - get_buf_page_count(buf, buf_len), - msg->sg_list, SG_CHUNK_SIZE); - if (ret) { - atomic_inc(&t->rw_avail_ops); - kfree(msg); - return -ENOMEM; + /* concatenate work requests of rdma_rw_ctxs */ + first_wr = NULL; + list_for_each_entry_reverse(msg, &msg_list, list) { + first_wr = rdma_rw_ctx_wrs(&msg->rw_ctx, t->qp, t->qp->port, + &msg->cqe, first_wr); } - ret = get_sg_list(buf, buf_len, msg->sgt.sgl, msg->sgt.orig_nents); - if (ret <= 0) { - pr_err("failed to get pages\n"); - goto err; - } - - ret = rdma_rw_ctx_init(&msg->rw_ctx, t->qp, t->qp->port, - msg->sg_list, get_buf_page_count(buf, buf_len), - 0, remote_offset, remote_key, - is_read ? DMA_FROM_DEVICE : DMA_TO_DEVICE); - if (ret < 0) { - pr_err("failed to init rdma_rw_ctx: %d\n", ret); - goto err; - } - - msg->t = t; - msg->cqe.done = is_read ? read_done : write_done; - msg->completion = &completion; - first_wr = rdma_rw_ctx_wrs(&msg->rw_ctx, t->qp, t->qp->port, - &msg->cqe, NULL); - ret = ib_post_send(t->qp, first_wr, NULL); if (ret) { - pr_err("failed to post send wr: %d\n", ret); - goto err; + pr_err("failed to post send wr for RDMA R/W: %d\n", ret); + goto out; } + msg = list_last_entry(&msg_list, struct smb_direct_rdma_rw_msg, list); wait_for_completion(&completion); - return 0; - -err: - atomic_inc(&t->rw_avail_ops); - if (first_wr) - rdma_rw_ctx_destroy(&msg->rw_ctx, t->qp, t->qp->port, - msg->sg_list, msg->sgt.nents, - is_read ? DMA_FROM_DEVICE : DMA_TO_DEVICE); - sg_free_table_chained(&msg->sgt, SG_CHUNK_SIZE); - kfree(msg); + ret = msg->status; +out: + list_for_each_entry_safe(msg, next_msg, &msg_list, list) { + list_del(&msg->list); + smb_direct_free_rdma_rw_msg(t, msg, + is_read ? 
DMA_FROM_DEVICE : DMA_TO_DEVICE); + } + atomic_add(credits_needed, &t->rw_credits); + wake_up(&t->wait_rw_credits); return ret; } -static int smb_direct_rdma_write(struct ksmbd_transport *t, void *buf, - unsigned int buflen, u32 remote_key, - u64 remote_offset, u32 remote_len) +static int smb_direct_rdma_write(struct ksmbd_transport *t, + void *buf, unsigned int buflen, + struct smb2_buffer_desc_v1 *desc, + unsigned int desc_len) { return smb_direct_rdma_xmit(smb_trans_direct_transfort(t), buf, buflen, - remote_key, remote_offset, - remote_len, false); + desc, desc_len, false); } -static int smb_direct_rdma_read(struct ksmbd_transport *t, void *buf, - unsigned int buflen, u32 remote_key, - u64 remote_offset, u32 remote_len) +static int smb_direct_rdma_read(struct ksmbd_transport *t, + void *buf, unsigned int buflen, + struct smb2_buffer_desc_v1 *desc, + unsigned int desc_len) { return smb_direct_rdma_xmit(smb_trans_direct_transfort(t), buf, buflen, - remote_key, remote_offset, - remote_len, true); + desc, desc_len, true); } static void smb_direct_disconnect(struct ksmbd_transport *t) @@ -1638,41 +1697,57 @@ out_err: return ret; } +static unsigned int smb_direct_get_max_fr_pages(struct smb_direct_transport *t) +{ + return min_t(unsigned int, + t->cm_id->device->attrs.max_fast_reg_page_list_len, + 256); +} + static int smb_direct_init_params(struct smb_direct_transport *t, struct ib_qp_cap *cap) { struct ib_device *device = t->cm_id->device; - int max_send_sges, max_pages, max_rw_wrs, max_send_wrs; + int max_send_sges, max_rw_wrs, max_send_wrs; + unsigned int max_sge_per_wr, wrs_per_credit; - /* need 2 more sge. because a SMB_DIRECT header will be mapped, - * and maybe a send buffer could be not page aligned. + /* need 3 more sge. because a SMB_DIRECT header, SMB2 header, + * SMB2 response could be mapped. */ t->max_send_size = smb_direct_max_send_size; - max_send_sges = DIV_ROUND_UP(t->max_send_size, PAGE_SIZE) + 2; + max_send_sges = DIV_ROUND_UP(t->max_send_size, PAGE_SIZE) + 3; if (max_send_sges > SMB_DIRECT_MAX_SEND_SGES) { pr_err("max_send_size %d is too large\n", t->max_send_size); return -EINVAL; } - /* - * allow smb_direct_max_outstanding_rw_ops of in-flight RDMA - * read/writes. HCA guarantees at least max_send_sge of sges for - * a RDMA read/write work request, and if memory registration is used, - * we need reg_mr, local_inv wrs for each read/write. + /* Calculate the number of work requests for RDMA R/W. + * The maximum number of pages which can be registered + * with one Memory region can be transferred with one + * R/W credit. And at least 4 work requests for each credit + * are needed for MR registration, RDMA R/W, local & remote + * MR invalidation. 
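The comment above captures the new budgeting model: the old rw_avail_ops counter rationed whole operations, while rw_credits rations MR-sized chunks, one credit per pages_per_rw_credit pages, with at least four work requests provisioned per credit. Rough arithmetic in userspace C, assuming 4 KiB pages and a device whose max_fast_reg_page_list_len is at least 256, so that smb_direct_get_max_fr_pages() yields 256:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE 4096u
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

/* pages spanned by a possibly unaligned buffer [buf, buf + size) */
static unsigned int get_buf_page_count(uintptr_t buf, unsigned int size)
{
	return DIV_ROUND_UP(buf + size, PAGE_SIZE) - buf / PAGE_SIZE;
}

static unsigned int calc_rw_credits(unsigned int pages_per_rw_credit,
				    uintptr_t buf, unsigned int len)
{
	return DIV_ROUND_UP(get_buf_page_count(buf, len), pages_per_rw_credit);
}

int main(void)
{
	unsigned int pages_per_rw_credit = 256;          /* min(device cap, 256) */
	unsigned int max_rdma_rw_size = 8 * 1024 * 1024; /* SMBD_DEFAULT_IOSIZE */

	/* a misaligned transfer can straddle one extra page, so each credit
	 * only guarantees pages_per_rw_credit - 1 pages of payload when the
	 * total credit pool is sized */
	unsigned int max_rw_credits =
		DIV_ROUND_UP(max_rdma_rw_size,
			     (pages_per_rw_credit - 1) * PAGE_SIZE);

	printf("max_rw_credits = %u\n", max_rw_credits);  /* 9 for 8 MiB */

	/* an unaligned 1 MiB descriptor spans 257 pages: 2 credits */
	printf("credits = %u\n",
	       calc_rw_credits(pages_per_rw_credit, 0x1008, 1024 * 1024));
	return 0;
}

wait_for_credits() was widened to take a count for the same reason: a multi-descriptor transfer reserves all of its credits atomically via atomic_sub_return(), or sleeps until enough have been returned.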
*/ t->max_rdma_rw_size = smb_direct_max_read_write_size; - max_pages = DIV_ROUND_UP(t->max_rdma_rw_size, PAGE_SIZE) + 1; - max_rw_wrs = DIV_ROUND_UP(max_pages, SMB_DIRECT_MAX_SEND_SGES); - max_rw_wrs += rdma_rw_mr_factor(device, t->cm_id->port_num, - max_pages) * 2; - max_rw_wrs *= smb_direct_max_outstanding_rw_ops; + t->pages_per_rw_credit = smb_direct_get_max_fr_pages(t); + t->max_rw_credits = DIV_ROUND_UP(t->max_rdma_rw_size, + (t->pages_per_rw_credit - 1) * + PAGE_SIZE); + + max_sge_per_wr = min_t(unsigned int, device->attrs.max_send_sge, + device->attrs.max_sge_rd); + max_sge_per_wr = max_t(unsigned int, max_sge_per_wr, + max_send_sges); + wrs_per_credit = max_t(unsigned int, 4, + DIV_ROUND_UP(t->pages_per_rw_credit, + max_sge_per_wr) + 1); + max_rw_wrs = t->max_rw_credits * wrs_per_credit; max_send_wrs = smb_direct_send_credit_target + max_rw_wrs; if (max_send_wrs > device->attrs.max_cqe || max_send_wrs > device->attrs.max_qp_wr) { - pr_err("consider lowering send_credit_target = %d, or max_outstanding_rw_ops = %d\n", - smb_direct_send_credit_target, - smb_direct_max_outstanding_rw_ops); + pr_err("consider lowering send_credit_target = %d\n", + smb_direct_send_credit_target); pr_err("Possible CQE overrun, device reporting max_cqe %d max_qp_wr %d\n", device->attrs.max_cqe, device->attrs.max_qp_wr); return -EINVAL; @@ -1687,11 +1762,6 @@ static int smb_direct_init_params(struct smb_direct_transport *t, return -EINVAL; } - if (device->attrs.max_send_sge < SMB_DIRECT_MAX_SEND_SGES) { - pr_err("warning: device max_send_sge = %d too small\n", - device->attrs.max_send_sge); - return -EINVAL; - } if (device->attrs.max_recv_sge < SMB_DIRECT_MAX_RECV_SGES) { pr_err("warning: device max_recv_sge = %d too small\n", device->attrs.max_recv_sge); @@ -1707,7 +1777,7 @@ static int smb_direct_init_params(struct smb_direct_transport *t, t->send_credit_target = smb_direct_send_credit_target; atomic_set(&t->send_credits, 0); - atomic_set(&t->rw_avail_ops, smb_direct_max_outstanding_rw_ops); + atomic_set(&t->rw_credits, t->max_rw_credits); t->max_send_size = smb_direct_max_send_size; t->max_recv_size = smb_direct_max_receive_size; @@ -1715,12 +1785,10 @@ static int smb_direct_init_params(struct smb_direct_transport *t, cap->max_send_wr = max_send_wrs; cap->max_recv_wr = t->recv_credit_max; - cap->max_send_sge = SMB_DIRECT_MAX_SEND_SGES; + cap->max_send_sge = max_sge_per_wr; cap->max_recv_sge = SMB_DIRECT_MAX_RECV_SGES; cap->max_inline_data = 0; - cap->max_rdma_ctxs = - rdma_rw_mr_factor(device, t->cm_id->port_num, max_pages) * - smb_direct_max_outstanding_rw_ops; + cap->max_rdma_ctxs = t->max_rw_credits; return 0; } @@ -1813,7 +1881,8 @@ static int smb_direct_create_qpair(struct smb_direct_transport *t, } t->send_cq = ib_alloc_cq(t->cm_id->device, t, - t->send_credit_target, 0, IB_POLL_WORKQUEUE); + smb_direct_send_credit_target + cap->max_rdma_ctxs, + 0, IB_POLL_WORKQUEUE); if (IS_ERR(t->send_cq)) { pr_err("Can't create RDMA send CQ\n"); ret = PTR_ERR(t->send_cq); @@ -1822,8 +1891,7 @@ static int smb_direct_create_qpair(struct smb_direct_transport *t, } t->recv_cq = ib_alloc_cq(t->cm_id->device, t, - cap->max_send_wr + cap->max_rdma_ctxs, - 0, IB_POLL_WORKQUEUE); + t->recv_credit_max, 0, IB_POLL_WORKQUEUE); if (IS_ERR(t->recv_cq)) { pr_err("Can't create RDMA recv CQ\n"); ret = PTR_ERR(t->recv_cq); @@ -1852,17 +1920,12 @@ static int smb_direct_create_qpair(struct smb_direct_transport *t, pages_per_rw = DIV_ROUND_UP(t->max_rdma_rw_size, PAGE_SIZE) + 1; if (pages_per_rw > t->cm_id->device->attrs.max_sgl_rd) 
{ - int pages_per_mr, mr_count; - - pages_per_mr = min_t(int, pages_per_rw, - t->cm_id->device->attrs.max_fast_reg_page_list_len); - mr_count = DIV_ROUND_UP(pages_per_rw, pages_per_mr) * - atomic_read(&t->rw_avail_ops); - ret = ib_mr_pool_init(t->qp, &t->qp->rdma_mrs, mr_count, - IB_MR_TYPE_MEM_REG, pages_per_mr, 0); + ret = ib_mr_pool_init(t->qp, &t->qp->rdma_mrs, + t->max_rw_credits, IB_MR_TYPE_MEM_REG, + t->pages_per_rw_credit, 0); if (ret) { pr_err("failed to init mr pool count %d pages %d\n", - mr_count, pages_per_mr); + t->max_rw_credits, t->pages_per_rw_credit); goto err; } } diff --git a/fs/ksmbd/transport_rdma.h b/fs/ksmbd/transport_rdma.h index 5567d93a6f96..77aee4e5c9dc 100644 --- a/fs/ksmbd/transport_rdma.h +++ b/fs/ksmbd/transport_rdma.h @@ -7,6 +7,10 @@ #ifndef __KSMBD_TRANSPORT_RDMA_H__ #define __KSMBD_TRANSPORT_RDMA_H__ +#define SMBD_DEFAULT_IOSIZE (8 * 1024 * 1024) +#define SMBD_MIN_IOSIZE (512 * 1024) +#define SMBD_MAX_IOSIZE (16 * 1024 * 1024) + /* SMB DIRECT negotiation request packet [MS-SMBD] 2.2.1 */ struct smb_direct_negotiate_req { __le16 min_version; @@ -52,10 +56,14 @@ struct smb_direct_data_transfer { int ksmbd_rdma_init(void); void ksmbd_rdma_destroy(void); bool ksmbd_rdma_capable_netdev(struct net_device *netdev); +void init_smbd_max_io_size(unsigned int sz); +unsigned int get_smbd_max_read_write_size(void); #else static inline int ksmbd_rdma_init(void) { return 0; } static inline int ksmbd_rdma_destroy(void) { return 0; } static inline bool ksmbd_rdma_capable_netdev(struct net_device *netdev) { return false; } +static inline void init_smbd_max_io_size(unsigned int sz) { } +static inline unsigned int get_smbd_max_read_write_size(void) { return 0; } #endif #endif /* __KSMBD_TRANSPORT_RDMA_H__ */ diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 6f5425e89ca6..2d72b1b7ed74 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -206,15 +206,16 @@ static int nfs_file_fsync_commit(struct file *file, int datasync) { struct inode *inode = file_inode(file); - int ret; + int ret, ret2; dprintk("NFS: fsync file(%pD2) datasync %d\n", file, datasync); nfs_inc_stats(inode, NFSIOS_VFSFSYNC); ret = nfs_commit_inode(inode, FLUSH_SYNC); - if (ret < 0) - return ret; - return file_check_and_advance_wb_err(file); + ret2 = file_check_and_advance_wb_err(file); + if (ret2 < 0) + return ret2; + return ret; } int @@ -387,11 +388,8 @@ static int nfs_write_end(struct file *file, struct address_space *mapping, return status; NFS_I(mapping->host)->write_io += copied; - if (nfs_ctx_key_to_expire(ctx, mapping->host)) { - status = nfs_wb_all(mapping->host); - if (status < 0) - return status; - } + if (nfs_ctx_key_to_expire(ctx, mapping->host)) + nfs_wb_all(mapping->host); return copied; } @@ -606,18 +604,6 @@ static const struct vm_operations_struct nfs_file_vm_ops = { .page_mkwrite = nfs_vm_page_mkwrite, }; -static int nfs_need_check_write(struct file *filp, struct inode *inode, - int error) -{ - struct nfs_open_context *ctx; - - ctx = nfs_file_open_context(filp); - if (nfs_error_is_fatal_on_server(error) || - nfs_ctx_key_to_expire(ctx, inode)) - return 1; - return 0; -} - ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; @@ -645,7 +631,7 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) if (iocb->ki_flags & IOCB_APPEND || iocb->ki_pos > i_size_read(inode)) { result = nfs_revalidate_file_size(inode, file); if (result) - goto out; + return result; } nfs_clear_invalid_mapping(file->f_mapping); @@ -664,6 +650,7 @@ ssize_t 
nfs_file_write(struct kiocb *iocb, struct iov_iter *from) written = result; iocb->ki_pos += written; + nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written); if (mntflags & NFS_MOUNT_WRITE_EAGER) { result = filemap_fdatawrite_range(file->f_mapping, @@ -681,17 +668,22 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) } result = generic_write_sync(iocb, written); if (result < 0) - goto out; + return result; +out: /* Return error values */ error = filemap_check_wb_err(file->f_mapping, since); - if (nfs_need_check_write(file, inode, error)) { - int err = nfs_wb_all(inode); - if (err < 0) - result = err; + switch (error) { + default: + break; + case -EDQUOT: + case -EFBIG: + case -ENOSPC: + nfs_wb_all(inode); + error = file_check_and_advance_wb_err(file); + if (error < 0) + result = error; } - nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written); -out: return result; out_swapfile: diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index 76deddab0a8f..2b2661582bbe 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -839,7 +839,12 @@ fl_pnfs_update_layout(struct inode *ino, lseg = pnfs_update_layout(ino, ctx, pos, count, iomode, strict_iomode, gfp_flags); - if (IS_ERR_OR_NULL(lseg)) + if (IS_ERR(lseg)) { + /* Fall back to MDS on recoverable errors */ + if (!nfs_error_is_fatal_on_server(PTR_ERR(lseg))) + lseg = NULL; + goto out; + } else if (!lseg) goto out; lo = NFS_I(ino)->layout; diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index f73c09a9cf0a..e861d7bae305 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -231,11 +231,10 @@ void nfs_fscache_release_file(struct inode *inode, struct file *filp) { struct nfs_fscache_inode_auxdata auxdata; struct fscache_cookie *cookie = nfs_i_fscache(inode); + loff_t i_size = i_size_read(inode); - if (fscache_cookie_valid(cookie)) { - nfs_fscache_update_auxdata(&auxdata, inode); - fscache_unuse_cookie(cookie, &auxdata, NULL); - } + nfs_fscache_update_auxdata(&auxdata, inode); + fscache_unuse_cookie(cookie, &auxdata, &i_size); } /* diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 7eefa16ed381..8f8cd6e2d4db 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -841,6 +841,7 @@ static inline bool nfs_error_is_fatal_on_server(int err) case 0: case -ERESTARTSYS: case -EINTR: + case -ENOMEM: return false; } return nfs_error_is_fatal(err); diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index 3680c8da510c..f2dbf904c598 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -417,6 +417,9 @@ static int nfs_do_refmount(struct fs_context *fc, struct rpc_clnt *client) fs_locations = kmalloc(sizeof(struct nfs4_fs_locations), GFP_KERNEL); if (!fs_locations) goto out_free; + fs_locations->fattr = nfs_alloc_fattr(); + if (!fs_locations->fattr) + goto out_free_2; /* Get locations */ dentry = ctx->clone_data.dentry; @@ -427,14 +430,16 @@ static int nfs_do_refmount(struct fs_context *fc, struct rpc_clnt *client) err = nfs4_proc_fs_locations(client, d_inode(parent), &dentry->d_name, fs_locations, page); dput(parent); if (err != 0) - goto out_free_2; + goto out_free_3; err = -ENOENT; if (fs_locations->nlocations <= 0 || fs_locations->fs_path.ncomponents <= 0) - goto out_free_2; + goto out_free_3; err = nfs_follow_referral(fc, fs_locations); +out_free_3: + kfree(fs_locations->fattr); out_free_2: kfree(fs_locations); out_free: diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index a79f66432bd3..c0fdcf8c0032 100644 --- a/fs/nfs/nfs4proc.c +++ 
b/fs/nfs/nfs4proc.c @@ -1162,7 +1162,7 @@ static int nfs4_call_sync_sequence(struct rpc_clnt *clnt, { unsigned short task_flags = 0; - if (server->nfs_client->cl_minorversion) + if (server->caps & NFS_CAP_MOVEABLE) task_flags = RPC_TASK_MOVEABLE; return nfs4_do_call_sync(clnt, server, msg, args, res, task_flags); } @@ -2568,7 +2568,7 @@ static int nfs4_run_open_task(struct nfs4_opendata *data, }; int status; - if (server->nfs_client->cl_minorversion) + if (nfs_server_capable(dir, NFS_CAP_MOVEABLE)) task_setup_data.flags |= RPC_TASK_MOVEABLE; kref_get(&data->kref); @@ -3098,6 +3098,10 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, } out: + if (opendata->lgp) { + nfs4_lgopen_release(opendata->lgp); + opendata->lgp = NULL; + } if (!opendata->cancelled) nfs4_sequence_free_slot(&opendata->o_res.seq_res); return ret; @@ -3733,7 +3737,7 @@ int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait) }; int status = -ENOMEM; - if (server->nfs_client->cl_minorversion) + if (nfs_server_capable(state->inode, NFS_CAP_MOVEABLE)) task_setup_data.flags |= RPC_TASK_MOVEABLE; nfs4_state_protect(server->nfs_client, NFS_SP4_MACH_CRED_CLEANUP, @@ -4243,6 +4247,8 @@ static int nfs4_get_referral(struct rpc_clnt *client, struct inode *dir, if (locations == NULL) goto out; + locations->fattr = fattr; + status = nfs4_proc_fs_locations(client, dir, name, locations, page); if (status != 0) goto out; @@ -4252,17 +4258,14 @@ static int nfs4_get_referral(struct rpc_clnt *client, struct inode *dir, * referral. Cause us to drop into the exception handler, which * will kick off migration recovery. */ - if (nfs_fsid_equal(&NFS_SERVER(dir)->fsid, &locations->fattr.fsid)) { + if (nfs_fsid_equal(&NFS_SERVER(dir)->fsid, &fattr->fsid)) { dprintk("%s: server did not return a different fsid for" " a referral at %s\n", __func__, name->name); status = -NFS4ERR_MOVED; goto out; } /* Fixup attributes for the nfs_lookup() call to nfs_fhget() */ - nfs_fixup_referral_attributes(&locations->fattr); - - /* replace the lookup nfs_fattr with the locations nfs_fattr */ - memcpy(fattr, &locations->fattr, sizeof(struct nfs_fattr)); + nfs_fixup_referral_attributes(fattr); memset(fhandle, 0, sizeof(struct nfs_fh)); out: if (page) @@ -4404,7 +4407,7 @@ static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, }; unsigned short task_flags = 0; - if (server->nfs_client->cl_minorversion) + if (nfs_server_capable(dir, NFS_CAP_MOVEABLE)) task_flags = RPC_TASK_MOVEABLE; /* Is this is an attribute revalidation, subject to softreval? 
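Because struct nfs4_fs_locations now carries a separately allocated fattr, nfs_do_refmount() above, and nfs4_try_migration() later in this series, each grow one more unwind step. A condensed userspace sketch of the layered goto-unwind idiom those hunks extend (all names here are illustrative, not the kernel's):

#include <errno.h>
#include <stdlib.h>

struct fattr { int valid; };
struct fs_locations { struct fattr *fattr; int nlocations; };

static int fetch_locations(struct fs_locations *loc)
{
	/* stand-in for the GETATTR round trip that fills the structures */
	loc->nlocations = 1;
	loc->fattr->valid = 1;
	return 0;
}

static int do_refmount_sketch(void)
{
	int err = -ENOMEM;
	struct fs_locations *loc;
	struct fattr *fattr;

	loc = malloc(sizeof(*loc));
	if (!loc)
		goto out;
	fattr = malloc(sizeof(*fattr));
	if (!fattr)
		goto out_free_loc;
	loc->fattr = fattr;

	err = fetch_locations(loc);
	if (err)
		goto out_free_fattr;
	/* success consumes the data, then falls through the same unwind */
out_free_fattr:
	free(loc->fattr);
out_free_loc:
	free(loc);
out:
	return err;
}

int main(void)
{
	return do_refmount_sketch() ? 1 : 0;
}

Each allocation gets exactly one label, and every failure jumps past the frees for things it never acquired, which is why adding the fattr allocation only required renumbering the labels.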
*/ @@ -5768,9 +5771,17 @@ static int nfs4_proc_renew(struct nfs_client *clp, const struct cred *cred) return 0; } -static inline int nfs4_server_supports_acls(struct nfs_server *server) +static bool nfs4_server_supports_acls(const struct nfs_server *server, + enum nfs4_acl_type type) { - return server->caps & NFS_CAP_ACLS; + switch (type) { + default: + return server->attr_bitmask[0] & FATTR4_WORD0_ACL; + case NFS4ACL_DACL: + return server->attr_bitmask[1] & FATTR4_WORD1_DACL; + case NFS4ACL_SACL: + return server->attr_bitmask[1] & FATTR4_WORD1_SACL; + } } /* Assuming that XATTR_SIZE_MAX is a multiple of PAGE_SIZE, and that @@ -5809,6 +5820,7 @@ unwind: } struct nfs4_cached_acl { + enum nfs4_acl_type type; int cached; size_t len; char data[]; @@ -5829,7 +5841,8 @@ static void nfs4_zap_acl_attr(struct inode *inode) nfs4_set_cached_acl(inode, NULL); } -static inline ssize_t nfs4_read_cached_acl(struct inode *inode, char *buf, size_t buflen) +static ssize_t nfs4_read_cached_acl(struct inode *inode, char *buf, + size_t buflen, enum nfs4_acl_type type) { struct nfs_inode *nfsi = NFS_I(inode); struct nfs4_cached_acl *acl; @@ -5839,6 +5852,8 @@ static inline ssize_t nfs4_read_cached_acl(struct inode *inode, char *buf, size_ acl = nfsi->nfs4_acl; if (acl == NULL) goto out; + if (acl->type != type) + goto out; if (buf == NULL) /* user is just asking for length */ goto out_len; if (acl->cached == 0) @@ -5854,7 +5869,9 @@ out: return ret; } -static void nfs4_write_cached_acl(struct inode *inode, struct page **pages, size_t pgbase, size_t acl_len) +static void nfs4_write_cached_acl(struct inode *inode, struct page **pages, + size_t pgbase, size_t acl_len, + enum nfs4_acl_type type) { struct nfs4_cached_acl *acl; size_t buflen = sizeof(*acl) + acl_len; @@ -5871,6 +5888,7 @@ static void nfs4_write_cached_acl(struct inode *inode, struct page **pages, size goto out; acl->cached = 0; } + acl->type = type; acl->len = acl_len; out: nfs4_set_cached_acl(inode, acl); @@ -5886,14 +5904,17 @@ out: * length. The next getxattr call will then produce another round trip to * the server, this time with the input buf of the required size. 
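With DACL and SACL flavors now sharing the single cached slot, nfs4_read_cached_acl() must also miss when the cached type differs, behaving exactly like a cold cache. A userspace analogue of the one-slot typed cache, including the convention from the comment above that a NULL buffer only asks for the length (structure and field names are illustrative):

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>

enum acl_type { ACL_ACL, ACL_DACL, ACL_SACL };

struct cached_acl {
	enum acl_type type;
	size_t len;
	char data[64];
};

static struct cached_acl cache;  /* one slot, like nfsi->nfs4_acl */
static int cache_valid;

static ssize_t read_cached_acl(char *buf, size_t buflen, enum acl_type type)
{
	if (!cache_valid || cache.type != type)
		return -ENOENT;          /* wrong flavor cached: refetch */
	if (!buf)
		return cache.len;        /* caller only wants the length */
	if (buflen < cache.len)
		return -ERANGE;          /* retry with a bigger buffer */
	memcpy(buf, cache.data, cache.len);
	return cache.len;
}

static void write_cached_acl(const char *data, size_t len, enum acl_type type)
{
	if (len > sizeof(cache.data)) {  /* cache is best effort only */
		cache_valid = 0;
		return;
	}
	cache.type = type;
	cache.len = len;
	memcpy(cache.data, data, len);
	cache_valid = 1;
}

int main(void)
{
	char buf[64];

	write_cached_acl("dacl-bytes", 10, ACL_DACL);
	printf("%zd\n", read_cached_acl(NULL, 0, ACL_DACL)); /* 10 */
	printf("%zd\n", read_cached_acl(buf, 64, ACL_SACL)); /* -ENOENT */
	return 0;
}

A type mismatch is indistinguishable from an empty cache, so the existing -ENOENT retry path in nfs4_proc_get_acl() needs no new cases.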
*/ -static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t buflen) +static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, + size_t buflen, enum nfs4_acl_type type) { struct page **pages; struct nfs_getaclargs args = { .fh = NFS_FH(inode), + .acl_type = type, .acl_len = buflen, }; struct nfs_getaclres res = { + .acl_type = type, .acl_len = buflen, }; struct rpc_message msg = { @@ -5943,7 +5964,8 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu ret = -ERANGE; goto out_free; } - nfs4_write_cached_acl(inode, pages, res.acl_data_offset, res.acl_len); + nfs4_write_cached_acl(inode, pages, res.acl_data_offset, res.acl_len, + type); if (buf) { if (res.acl_len > buflen) { ret = -ERANGE; @@ -5963,14 +5985,15 @@ out_free: return ret; } -static ssize_t nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t buflen) +static ssize_t nfs4_get_acl_uncached(struct inode *inode, void *buf, + size_t buflen, enum nfs4_acl_type type) { struct nfs4_exception exception = { .interruptible = true, }; ssize_t ret; do { - ret = __nfs4_get_acl_uncached(inode, buf, buflen); + ret = __nfs4_get_acl_uncached(inode, buf, buflen, type); trace_nfs4_get_acl(inode, ret); if (ret >= 0) break; @@ -5979,34 +6002,37 @@ static ssize_t nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bufl return ret; } -static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen) +static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen, + enum nfs4_acl_type type) { struct nfs_server *server = NFS_SERVER(inode); int ret; - if (!nfs4_server_supports_acls(server)) + if (!nfs4_server_supports_acls(server, type)) return -EOPNOTSUPP; ret = nfs_revalidate_inode(inode, NFS_INO_INVALID_CHANGE); if (ret < 0) return ret; if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_ACL) nfs_zap_acl_cache(inode); - ret = nfs4_read_cached_acl(inode, buf, buflen); + ret = nfs4_read_cached_acl(inode, buf, buflen, type); if (ret != -ENOENT) /* -ENOENT is returned if there is no ACL or if there is an ACL * but no cached acl data, just the acl length */ return ret; - return nfs4_get_acl_uncached(inode, buf, buflen); + return nfs4_get_acl_uncached(inode, buf, buflen, type); } -static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen) +static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, + size_t buflen, enum nfs4_acl_type type) { struct nfs_server *server = NFS_SERVER(inode); struct page *pages[NFS4ACL_MAXPAGES]; struct nfs_setaclargs arg = { - .fh = NFS_FH(inode), - .acl_pages = pages, - .acl_len = buflen, + .fh = NFS_FH(inode), + .acl_type = type, + .acl_len = buflen, + .acl_pages = pages, }; struct nfs_setaclres res; struct rpc_message msg = { @@ -6020,7 +6046,7 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl /* You can't remove system.nfs4_acl: */ if (buflen == 0) return -EINVAL; - if (!nfs4_server_supports_acls(server)) + if (!nfs4_server_supports_acls(server, type)) return -EOPNOTSUPP; if (npages > ARRAY_SIZE(pages)) return -ERANGE; @@ -6051,12 +6077,13 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl return ret; } -static int nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen) +static int nfs4_proc_set_acl(struct inode *inode, const void *buf, + size_t buflen, enum nfs4_acl_type type) { struct nfs4_exception exception = { }; int err; do { - err = __nfs4_proc_set_acl(inode, buf, buflen); + err = 
__nfs4_proc_set_acl(inode, buf, buflen, type); trace_nfs4_set_acl(inode, err); if (err == -NFS4ERR_BADOWNER || err == -NFS4ERR_BADNAME) { /* @@ -6612,10 +6639,13 @@ static int _nfs4_proc_delegreturn(struct inode *inode, const struct cred *cred, .rpc_client = server->client, .rpc_message = &msg, .callback_ops = &nfs4_delegreturn_ops, - .flags = RPC_TASK_ASYNC | RPC_TASK_TIMEOUT | RPC_TASK_MOVEABLE, + .flags = RPC_TASK_ASYNC | RPC_TASK_TIMEOUT, }; int status = 0; + if (nfs_server_capable(inode, NFS_CAP_MOVEABLE)) + task_setup_data.flags |= RPC_TASK_MOVEABLE; + data = kzalloc(sizeof(*data), GFP_KERNEL); if (data == NULL) return -ENOMEM; @@ -6929,10 +6959,8 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl, .workqueue = nfsiod_workqueue, .flags = RPC_TASK_ASYNC, }; - struct nfs_client *client = - NFS_SERVER(lsp->ls_state->inode)->nfs_client; - if (client->cl_minorversion) + if (nfs_server_capable(lsp->ls_state->inode, NFS_CAP_MOVEABLE)) task_setup_data.flags |= RPC_TASK_MOVEABLE; nfs4_state_protect(NFS_SERVER(lsp->ls_state->inode)->nfs_client, @@ -7203,9 +7231,8 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF, }; int ret; - struct nfs_client *client = NFS_SERVER(state->inode)->nfs_client; - if (client->cl_minorversion) + if (nfs_server_capable(state->inode, NFS_CAP_MOVEABLE)) task_setup_data.flags |= RPC_TASK_MOVEABLE; data = nfs4_alloc_lockdata(fl, nfs_file_open_context(fl->fl_file), @@ -7655,21 +7682,70 @@ static int nfs4_xattr_set_nfs4_acl(const struct xattr_handler *handler, const char *key, const void *buf, size_t buflen, int flags) { - return nfs4_proc_set_acl(inode, buf, buflen); + return nfs4_proc_set_acl(inode, buf, buflen, NFS4ACL_ACL); } static int nfs4_xattr_get_nfs4_acl(const struct xattr_handler *handler, struct dentry *unused, struct inode *inode, const char *key, void *buf, size_t buflen) { - return nfs4_proc_get_acl(inode, buf, buflen); + return nfs4_proc_get_acl(inode, buf, buflen, NFS4ACL_ACL); } static bool nfs4_xattr_list_nfs4_acl(struct dentry *dentry) { - return nfs4_server_supports_acls(NFS_SERVER(d_inode(dentry))); + return nfs4_server_supports_acls(NFS_SB(dentry->d_sb), NFS4ACL_ACL); } +#if defined(CONFIG_NFS_V4_1) +#define XATTR_NAME_NFSV4_DACL "system.nfs4_dacl" + +static int nfs4_xattr_set_nfs4_dacl(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, + struct dentry *unused, struct inode *inode, + const char *key, const void *buf, + size_t buflen, int flags) +{ + return nfs4_proc_set_acl(inode, buf, buflen, NFS4ACL_DACL); +} + +static int nfs4_xattr_get_nfs4_dacl(const struct xattr_handler *handler, + struct dentry *unused, struct inode *inode, + const char *key, void *buf, size_t buflen) +{ + return nfs4_proc_get_acl(inode, buf, buflen, NFS4ACL_DACL); +} + +static bool nfs4_xattr_list_nfs4_dacl(struct dentry *dentry) +{ + return nfs4_server_supports_acls(NFS_SB(dentry->d_sb), NFS4ACL_DACL); +} + +#define XATTR_NAME_NFSV4_SACL "system.nfs4_sacl" + +static int nfs4_xattr_set_nfs4_sacl(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, + struct dentry *unused, struct inode *inode, + const char *key, const void *buf, + size_t buflen, int flags) +{ + return nfs4_proc_set_acl(inode, buf, buflen, NFS4ACL_SACL); +} + +static int nfs4_xattr_get_nfs4_sacl(const struct xattr_handler *handler, + struct dentry *unused, struct inode *inode, + const char *key, void *buf, size_t buflen) +{ + return nfs4_proc_get_acl(inode, buf, buflen, 
NFS4ACL_SACL); +} + +static bool nfs4_xattr_list_nfs4_sacl(struct dentry *dentry) +{ + return nfs4_server_supports_acls(NFS_SB(dentry->d_sb), NFS4ACL_SACL); +} + +#endif + #ifdef CONFIG_NFS_V4_SECURITY_LABEL static int nfs4_xattr_set_nfs4_label(const struct xattr_handler *handler, @@ -7902,7 +7978,7 @@ static int _nfs4_proc_fs_locations(struct rpc_clnt *client, struct inode *dir, else bitmask[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID; - nfs_fattr_init(&fs_locations->fattr); + nfs_fattr_init(fs_locations->fattr); fs_locations->server = server; fs_locations->nlocations = 0; status = nfs4_call_sync(client, server, &msg, &args.seq_args, &res.seq_res, 0); @@ -7967,7 +8043,7 @@ static int _nfs40_proc_get_locations(struct nfs_server *server, unsigned long now = jiffies; int status; - nfs_fattr_init(&locations->fattr); + nfs_fattr_init(locations->fattr); locations->server = server; locations->nlocations = 0; @@ -8032,7 +8108,7 @@ static int _nfs41_proc_get_locations(struct nfs_server *server, }; int status; - nfs_fattr_init(&locations->fattr); + nfs_fattr_init(locations->fattr); locations->server = server; locations->nlocations = 0; @@ -10391,7 +10467,8 @@ static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = { | NFS_CAP_POSIX_LOCK | NFS_CAP_STATEID_NFSV41 | NFS_CAP_ATOMIC_OPEN_V1 - | NFS_CAP_LGOPEN, + | NFS_CAP_LGOPEN + | NFS_CAP_MOVEABLE, .init_client = nfs41_init_client, .shutdown_client = nfs41_shutdown_client, .match_stateid = nfs41_match_stateid, @@ -10426,7 +10503,8 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = { | NFS_CAP_LAYOUTSTATS | NFS_CAP_CLONE | NFS_CAP_LAYOUTERROR - | NFS_CAP_READ_PLUS, + | NFS_CAP_READ_PLUS + | NFS_CAP_MOVEABLE, .init_client = nfs41_init_client, .shutdown_client = nfs41_shutdown_client, .match_stateid = nfs41_match_stateid, @@ -10587,6 +10665,22 @@ static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = { .set = nfs4_xattr_set_nfs4_acl, }; +#if defined(CONFIG_NFS_V4_1) +static const struct xattr_handler nfs4_xattr_nfs4_dacl_handler = { + .name = XATTR_NAME_NFSV4_DACL, + .list = nfs4_xattr_list_nfs4_dacl, + .get = nfs4_xattr_get_nfs4_dacl, + .set = nfs4_xattr_set_nfs4_dacl, +}; + +static const struct xattr_handler nfs4_xattr_nfs4_sacl_handler = { + .name = XATTR_NAME_NFSV4_SACL, + .list = nfs4_xattr_list_nfs4_sacl, + .get = nfs4_xattr_get_nfs4_sacl, + .set = nfs4_xattr_set_nfs4_sacl, +}; +#endif + #ifdef CONFIG_NFS_V4_2 static const struct xattr_handler nfs4_xattr_nfs4_user_handler = { .prefix = XATTR_USER_PREFIX, @@ -10597,6 +10691,10 @@ static const struct xattr_handler nfs4_xattr_nfs4_user_handler = { const struct xattr_handler *nfs4_xattr_handlers[] = { &nfs4_xattr_nfs4_acl_handler, +#if defined(CONFIG_NFS_V4_1) + &nfs4_xattr_nfs4_dacl_handler, + &nfs4_xattr_nfs4_sacl_handler, +#endif #ifdef CONFIG_NFS_V4_SECURITY_LABEL &nfs4_xattr_nfs4_label_handler, #endif diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 9e1c987c81e7..2540b35ec187 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1602,7 +1602,8 @@ static inline void nfs42_complete_copies(struct nfs4_state_owner *sp, #endif /* CONFIG_NFS_V4_2 */ static int __nfs4_reclaim_open_state(struct nfs4_state_owner *sp, struct nfs4_state *state, - const struct nfs4_state_recovery_ops *ops) + const struct nfs4_state_recovery_ops *ops, + int *lost_locks) { struct nfs4_lock_state *lock; int status; @@ -1620,7 +1621,7 @@ static int __nfs4_reclaim_open_state(struct nfs4_state_owner *sp, struct nfs4_st list_for_each_entry(lock, &state->lock_states, ls_locks) { 
trace_nfs4_state_lock_reclaim(state, lock); if (!test_bit(NFS_LOCK_INITIALIZED, &lock->ls_flags)) - pr_warn_ratelimited("NFS: %s: Lock reclaim failed!\n", __func__); + *lost_locks += 1; } spin_unlock(&state->state_lock); } @@ -1630,7 +1631,9 @@ static int __nfs4_reclaim_open_state(struct nfs4_state_owner *sp, struct nfs4_st return status; } -static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp, const struct nfs4_state_recovery_ops *ops) +static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp, + const struct nfs4_state_recovery_ops *ops, + int *lost_locks) { struct nfs4_state *state; unsigned int loop = 0; @@ -1666,7 +1669,7 @@ restart: #endif /* CONFIG_NFS_V4_2 */ refcount_inc(&state->count); spin_unlock(&sp->so_lock); - status = __nfs4_reclaim_open_state(sp, state, ops); + status = __nfs4_reclaim_open_state(sp, state, ops, lost_locks); switch (status) { default: @@ -1909,6 +1912,7 @@ static int nfs4_do_reclaim(struct nfs_client *clp, const struct nfs4_state_recov struct rb_node *pos; LIST_HEAD(freeme); int status = 0; + int lost_locks = 0; restart: rcu_read_lock(); @@ -1928,8 +1932,11 @@ restart: spin_unlock(&clp->cl_lock); rcu_read_unlock(); - status = nfs4_reclaim_open_state(sp, ops); + status = nfs4_reclaim_open_state(sp, ops, &lost_locks); if (status < 0) { + if (lost_locks) + pr_warn("NFS: %s: lost %d locks\n", + clp->cl_hostname, lost_locks); set_bit(ops->owner_flag_bit, &sp->so_flags); nfs4_put_state_owner(sp); status = nfs4_recovery_handle_error(clp, status); @@ -1943,6 +1950,9 @@ restart: } rcu_read_unlock(); nfs4_free_state_owners(&freeme); + if (lost_locks) + pr_warn("NFS: %s: lost %d locks\n", + clp->cl_hostname, lost_locks); return 0; } @@ -2106,6 +2116,11 @@ static int nfs4_try_migration(struct nfs_server *server, const struct cred *cred dprintk("<-- %s: no memory\n", __func__); goto out; } + locations->fattr = nfs_alloc_fattr(); + if (locations->fattr == NULL) { + dprintk("<-- %s: no memory\n", __func__); + goto out; + } inode = d_inode(server->super->s_root); result = nfs4_proc_get_locations(server, NFS_FH(inode), locations, @@ -2120,7 +2135,7 @@ static int nfs4_try_migration(struct nfs_server *server, const struct cred *cred if (!locations->nlocations) goto out; - if (!(locations->fattr.valid & NFS_ATTR_FATTR_V4_LOCATIONS)) { + if (!(locations->fattr->valid & NFS_ATTR_FATTR_V4_LOCATIONS)) { dprintk("<-- %s: No fs_locations data, migration skipped\n", __func__); goto out; @@ -2145,6 +2160,8 @@ static int nfs4_try_migration(struct nfs_server *server, const struct cred *cred out: if (page != NULL) __free_page(page); + if (locations != NULL) + kfree(locations->fattr); kfree(locations); if (result) { pr_err("NFS: migration recovery failed (server %s)\n", diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 86a5f6516928..acfe5f4bda48 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1680,19 +1680,35 @@ encode_restorefh(struct xdr_stream *xdr, struct compound_hdr *hdr) encode_op_hdr(xdr, OP_RESTOREFH, decode_restorefh_maxsz, hdr); } -static void -encode_setacl(struct xdr_stream *xdr, const struct nfs_setaclargs *arg, - struct compound_hdr *hdr) +static void nfs4_acltype_to_bitmap(enum nfs4_acl_type type, __u32 bitmap[2]) { - __be32 *p; + switch (type) { + default: + bitmap[0] = FATTR4_WORD0_ACL; + bitmap[1] = 0; + break; + case NFS4ACL_DACL: + bitmap[0] = 0; + bitmap[1] = FATTR4_WORD1_DACL; + break; + case NFS4ACL_SACL: + bitmap[0] = 0; + bitmap[1] = FATTR4_WORD1_SACL; + } +} + +static void encode_setacl(struct xdr_stream *xdr, + const struct 
nfs_setaclargs *arg, + struct compound_hdr *hdr) +{ + __u32 bitmap[2]; + + nfs4_acltype_to_bitmap(arg->acl_type, bitmap); encode_op_hdr(xdr, OP_SETATTR, decode_setacl_maxsz, hdr); encode_nfs4_stateid(xdr, &zero_stateid); - p = reserve_space(xdr, 2*4); - *p++ = cpu_to_be32(1); - *p = cpu_to_be32(FATTR4_WORD0_ACL); - p = reserve_space(xdr, 4); - *p = cpu_to_be32(arg->acl_len); + xdr_encode_bitmap4(xdr, bitmap, ARRAY_SIZE(bitmap)); + encode_uint32(xdr, arg->acl_len); xdr_write_pages(xdr, arg->acl_pages, 0, arg->acl_len); } @@ -2587,11 +2603,11 @@ static void nfs4_xdr_enc_getacl(struct rpc_rqst *req, struct xdr_stream *xdr, struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - const __u32 nfs4_acl_bitmap[1] = { - [0] = FATTR4_WORD0_ACL, - }; + __u32 nfs4_acl_bitmap[2]; uint32_t replen; + nfs4_acltype_to_bitmap(args->acl_type, nfs4_acl_bitmap); + encode_compound_hdr(xdr, req, &hdr); encode_sequence(xdr, &args->seq_args, &hdr); encode_putfh(xdr, args->fh, &hdr); @@ -5386,7 +5402,7 @@ decode_restorefh(struct xdr_stream *xdr) } static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req, - struct nfs_getaclres *res) + struct nfs_getaclres *res, enum nfs4_acl_type type) { unsigned int savep; uint32_t attrlen, @@ -5404,26 +5420,39 @@ static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req, if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0) goto out; - if (unlikely(bitmap[0] & (FATTR4_WORD0_ACL - 1U))) - return -EIO; - if (likely(bitmap[0] & FATTR4_WORD0_ACL)) { + switch (type) { + default: + if (unlikely(bitmap[0] & (FATTR4_WORD0_ACL - 1U))) + return -EIO; + if (!(bitmap[0] & FATTR4_WORD0_ACL)) + return -EOPNOTSUPP; + break; + case NFS4ACL_DACL: + if (unlikely(bitmap[0] || bitmap[1] & (FATTR4_WORD1_DACL - 1U))) + return -EIO; + if (!(bitmap[1] & FATTR4_WORD1_DACL)) + return -EOPNOTSUPP; + break; + case NFS4ACL_SACL: + if (unlikely(bitmap[0] || bitmap[1] & (FATTR4_WORD1_SACL - 1U))) + return -EIO; + if (!(bitmap[1] & FATTR4_WORD1_SACL)) + return -EOPNOTSUPP; + } - /* The bitmap (xdr len + bitmaps) and the attr xdr len words - * are stored with the acl data to handle the problem of - * variable length bitmaps.*/ - res->acl_data_offset = xdr_page_pos(xdr); - res->acl_len = attrlen; - - /* Check for receive buffer overflow */ - if (res->acl_len > xdr_stream_remaining(xdr) || - res->acl_len + res->acl_data_offset > xdr->buf->page_len) { - res->acl_flags |= NFS4_ACL_TRUNC; - dprintk("NFS: acl reply: attrlen %u > page_len %zu\n", - attrlen, xdr_stream_remaining(xdr)); - } - } else - status = -EOPNOTSUPP; + /* The bitmap (xdr len + bitmaps) and the attr xdr len words + * are stored with the acl data to handle the problem of + * variable length bitmaps.*/ + res->acl_data_offset = xdr_page_pos(xdr); + res->acl_len = attrlen; + /* Check for receive buffer overflow */ + if (res->acl_len > xdr_stream_remaining(xdr) || + res->acl_len + res->acl_data_offset > xdr->buf->page_len) { + res->acl_flags |= NFS4_ACL_TRUNC; + dprintk("NFS: acl reply: attrlen %u > page_len %zu\n", + attrlen, xdr_stream_remaining(xdr)); + } out: return status; } @@ -6486,7 +6515,7 @@ nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, struct xdr_stream *xdr, status = decode_putfh(xdr); if (status) goto out; - status = decode_getacl(xdr, rqstp, res); + status = decode_getacl(xdr, rqstp, res, res->acl_type); out: return status; @@ -7051,7 +7080,7 @@ static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, if (res->migration) { xdr_enter_page(xdr, PAGE_SIZE); status = 
decode_getfattr_generic(xdr, - &res->fs_locations->fattr, + res->fs_locations->fattr, NULL, res->fs_locations, res->fs_locations->server); if (status) @@ -7064,7 +7093,7 @@ static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, goto out; xdr_enter_page(xdr, PAGE_SIZE); status = decode_getfattr_generic(xdr, - &res->fs_locations->fattr, + res->fs_locations->fattr, NULL, res->fs_locations, res->fs_locations->server); } diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 9157dd19b8b4..317cedfa52bf 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -767,6 +767,9 @@ int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr, .flags = RPC_TASK_ASYNC | flags, }; + if (nfs_server_capable(hdr->inode, NFS_CAP_MOVEABLE)) + task_setup_data.flags |= RPC_TASK_MOVEABLE; + hdr->rw_ops->rw_initiate(hdr, &msg, rpc_ops, &task_setup_data, how); dprintk("NFS: initiated pgio call " diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 856c962273c7..68a87be3e6f9 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -2000,6 +2000,7 @@ lookup_again: lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags); if (lo == NULL) { spin_unlock(&ino->i_lock); + lseg = ERR_PTR(-ENOMEM); trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, PNFS_UPDATE_LAYOUT_NOMEM); goto out; @@ -2128,6 +2129,7 @@ lookup_again: lgp = pnfs_alloc_init_layoutget_args(ino, ctx, &stateid, &arg, gfp_flags); if (!lgp) { + lseg = ERR_PTR(-ENOMEM); trace_pnfs_update_layout(ino, pos, count, iomode, lo, NULL, PNFS_UPDATE_LAYOUT_NOMEM); nfs_layoutget_end(lo); diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index 6f325e10056c..9697cd5d2561 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -102,6 +102,10 @@ static void nfs_do_call_unlink(struct inode *inode, struct nfs_unlinkdata *data) }; struct rpc_task *task; struct inode *dir = d_inode(data->dentry->d_parent); + + if (nfs_server_capable(inode, NFS_CAP_MOVEABLE)) + task_setup_data.flags |= RPC_TASK_MOVEABLE; + nfs_sb_active(dir->i_sb); data->args.fh = NFS_FH(dir); nfs_fattr_init(data->res.dir_attr); @@ -344,6 +348,10 @@ nfs_async_rename(struct inode *old_dir, struct inode *new_dir, .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF, }; + if (nfs_server_capable(old_dir, NFS_CAP_MOVEABLE) && + nfs_server_capable(new_dir, NFS_CAP_MOVEABLE)) + task_setup_data.flags |= RPC_TASK_MOVEABLE; + data = kzalloc(sizeof(*data), GFP_KERNEL); if (data == NULL) return ERR_PTR(-ENOMEM); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index f00d45cf80ef..1c706465d090 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -603,8 +603,9 @@ static void nfs_write_error(struct nfs_page *req, int error) * Find an associated nfs write request, and prepare to flush it out * May return an error if the user signalled nfs_wait_on_request(). 
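The pnfs_update_layout() hunks above stop returning bare NULL on allocation failure and return ERR_PTR(-ENOMEM) instead, which is what allows fl_pnfs_update_layout() earlier in this series to distinguish "no layout, go through the MDS" from a recoverable error. A compact self-contained rendering of the ERR_PTR idiom this relies on (the kernel's own version lives in include/linux/err.h; the stub below is only for illustration):

#include <errno.h>
#include <stdio.h>
#include <stdint.h>

#define MAX_ERRNO 4095

static inline void *ERR_PTR(long err) { return (void *)err; }
static inline long PTR_ERR(const void *p) { return (long)p; }
static inline int IS_ERR(const void *p)
{
	/* errno values occupy the top page of the address space */
	return (uintptr_t)p >= (uintptr_t)-MAX_ERRNO;
}

struct lseg { int id; };

/* stand-in for pnfs_update_layout(): NULL means no layout is available,
 * ERR_PTR(-ENOMEM) means a transient failure worth retrying via the MDS */
static struct lseg *update_layout(int mode)
{
	static struct lseg seg = { 1 };

	if (mode == 0)
		return NULL;
	if (mode == 1)
		return ERR_PTR(-ENOMEM);
	return &seg;
}

int main(void)
{
	for (int mode = 0; mode < 3; mode++) {
		struct lseg *l = update_layout(mode);

		if (IS_ERR(l))
			printf("error %ld: fall back to MDS\n", -PTR_ERR(l));
		else if (!l)
			printf("no layout\n");
		else
			printf("got lseg %d\n", l->id);
	}
	return 0;
}

Folding the error into the pointer keeps the function's single return value while still letting IS_ERR_OR_NULL-style callers be split into the two distinct cases the filelayout fix needs.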
*/ -static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, - struct page *page) +static int nfs_page_async_flush(struct page *page, + struct writeback_control *wbc, + struct nfs_pageio_descriptor *pgio) { struct nfs_page *req; int ret = 0; @@ -630,11 +631,11 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, /* * Remove the problematic req upon fatal errors on the server */ - if (nfs_error_is_fatal(ret)) { - if (nfs_error_is_fatal_on_server(ret)) - goto out_launder; - } else - ret = -EAGAIN; + if (nfs_error_is_fatal_on_server(ret)) + goto out_launder; + if (wbc->sync_mode == WB_SYNC_NONE) + ret = AOP_WRITEPAGE_ACTIVATE; + redirty_page_for_writepage(wbc, page); nfs_redirty_request(req); pgio->pg_error = 0; } else @@ -650,15 +651,8 @@ out_launder: static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio) { - int ret; - nfs_pageio_cond_complete(pgio, page_index(page)); - ret = nfs_page_async_flush(pgio, page); - if (ret == -EAGAIN) { - redirty_page_for_writepage(wbc, page); - ret = AOP_WRITEPAGE_ACTIVATE; - } - return ret; + return nfs_page_async_flush(page, wbc, pgio); } /* @@ -681,11 +675,7 @@ static int nfs_writepage_locked(struct page *page, err = nfs_do_writepage(page, wbc, &pgio); pgio.pg_error = 0; nfs_pageio_complete(&pgio); - if (err < 0) - return err; - if (nfs_error_is_fatal(pgio.pg_error)) - return pgio.pg_error; - return 0; + return err; } int nfs_writepage(struct page *page, struct writeback_control *wbc) @@ -737,19 +727,19 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) priority = wb_priority(wbc); } - nfs_pageio_init_write(&pgio, inode, priority, false, - &nfs_async_write_completion_ops); - pgio.pg_io_completion = ioc; - err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio); - pgio.pg_error = 0; - nfs_pageio_complete(&pgio); + do { + nfs_pageio_init_write(&pgio, inode, priority, false, + &nfs_async_write_completion_ops); + pgio.pg_io_completion = ioc; + err = write_cache_pages(mapping, wbc, nfs_writepages_callback, + &pgio); + pgio.pg_error = 0; + nfs_pageio_complete(&pgio); + } while (err < 0 && !nfs_error_is_fatal(err)); nfs_io_completion_put(ioc); if (err < 0) goto out_err; - err = pgio.pg_error; - if (nfs_error_is_fatal(err)) - goto out_err; return 0; out_err: return err; @@ -1444,7 +1434,7 @@ static void nfs_async_write_error(struct list_head *head, int error) while (!list_empty(head)) { req = nfs_list_entry(head->next); nfs_list_remove_request(req); - if (nfs_error_is_fatal(error)) + if (nfs_error_is_fatal_on_server(error)) nfs_write_error(req, error); else nfs_redirty_request(req); @@ -1719,6 +1709,10 @@ int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data, .flags = RPC_TASK_ASYNC | flags, .priority = priority, }; + + if (nfs_server_capable(data->inode, NFS_CAP_MOVEABLE)) + task_setup_data.flags |= RPC_TASK_MOVEABLE; + /* Set up the initial task struct. 
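With this rework the redirty decision lives inside nfs_page_async_flush() itself: errors that are fatal on the server launder the request, everything else redirties the page, and background writeback (WB_SYNC_NONE) additionally returns AOP_WRITEPAGE_ACTIVATE rather than an error. A stripped-down decision function in userspace C (stub types; the classifier is only a rough stand-in for nfs_error_is_fatal_on_server(), and the AOP value is copied for illustration):

#include <errno.h>
#include <stdio.h>

enum sync_mode { WB_SYNC_NONE, WB_SYNC_ALL };
#define AOP_WRITEPAGE_ACTIVATE 0x80000

struct wbc { enum sync_mode sync_mode; };

static int fatal_on_server(int err)
{
	/* rough stand-in: the series adds -ENOMEM to the non-fatal set */
	switch (err) {
	case 0:
	case -EINTR:
	case -ENOMEM:
		return 0;
	default:
		return err == -EIO;
	}
}

static int page_async_flush(int ret, struct wbc *wbc)
{
	if (ret == 0)
		return 0;               /* request queued for I/O */
	if (fatal_on_server(ret))
		return ret;             /* launder: drop the request */
	/* transient: leave the page dirty and let writeback retry */
	if (wbc->sync_mode == WB_SYNC_NONE)
		ret = AOP_WRITEPAGE_ACTIVATE;
	/* redirty_page_for_writepage(wbc, page) would go here */
	return ret;
}

int main(void)
{
	struct wbc bg = { WB_SYNC_NONE }, sync = { WB_SYNC_ALL };

	printf("%#x\n", page_async_flush(-ENOMEM, &bg));   /* ACTIVATE */
	printf("%d\n",  page_async_flush(-ENOMEM, &sync)); /* caller retries */
	printf("%d\n",  page_async_flush(-EIO, &bg));      /* fatal: -EIO */
	return 0;
}

This is also why nfs_writepages() grows its retry loop: a non-fatal error propagated from the flush path now means "redo the pass", not "fail the writeback".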
*/ nfs_ops->commit_setup(data, &msg, &task_setup_data.rpc_client); trace_nfs_initiate_commit(data); diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index 1e4ee042d52f..3e920cf1b454 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -173,7 +173,6 @@ __xfs_free_perag( struct xfs_perag *pag = container_of(head, struct xfs_perag, rcu_head); ASSERT(!delayed_work_pending(&pag->pag_blockgc_work)); - ASSERT(atomic_read(&pag->pag_ref) == 0); kmem_free(pag); } @@ -192,7 +191,7 @@ xfs_free_perag( pag = radix_tree_delete(&mp->m_perag_tree, agno); spin_unlock(&mp->m_perag_lock); ASSERT(pag); - ASSERT(atomic_read(&pag->pag_ref) == 0); + XFS_IS_CORRUPT(pag->pag_mount, atomic_read(&pag->pag_ref) != 0); cancel_delayed_work_sync(&pag->pag_blockgc_work); xfs_iunlink_destroy(pag); diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 14ae0826bc15..836ab1b8ed7b 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -25,10 +25,9 @@ #include "xfs_trans_space.h" #include "xfs_trace.h" #include "xfs_attr_item.h" -#include "xfs_log.h" +#include "xfs_xattr.h" -struct kmem_cache *xfs_attri_cache; -struct kmem_cache *xfs_attrd_cache; +struct kmem_cache *xfs_attr_intent_cache; /* * xfs_attr.c @@ -58,11 +57,11 @@ STATIC int xfs_attr_leaf_try_add(struct xfs_da_args *args, struct xfs_buf *bp); */ STATIC int xfs_attr_node_get(xfs_da_args_t *args); STATIC void xfs_attr_restore_rmt_blk(struct xfs_da_args *args); -static int xfs_attr_node_try_addname(struct xfs_attr_item *attr); -STATIC int xfs_attr_node_addname_find_attr(struct xfs_attr_item *attr); -STATIC int xfs_attr_node_remove_attr(struct xfs_attr_item *attr); -STATIC int xfs_attr_node_hasname(xfs_da_args_t *args, - struct xfs_da_state **state); +static int xfs_attr_node_try_addname(struct xfs_attr_intent *attr); +STATIC int xfs_attr_node_addname_find_attr(struct xfs_attr_intent *attr); +STATIC int xfs_attr_node_remove_attr(struct xfs_attr_intent *attr); +STATIC int xfs_attr_node_lookup(struct xfs_da_args *args, + struct xfs_da_state *state); int xfs_inode_hasattr( @@ -377,7 +376,7 @@ xfs_attr_try_sf_addname( static int xfs_attr_sf_addname( - struct xfs_attr_item *attr) + struct xfs_attr_intent *attr) { struct xfs_da_args *args = attr->xattri_da_args; struct xfs_inode *dp = args->dp; @@ -423,7 +422,7 @@ out: */ static enum xfs_delattr_state xfs_attr_complete_op( - struct xfs_attr_item *attr, + struct xfs_attr_intent *attr, enum xfs_delattr_state replace_state) { struct xfs_da_args *args = attr->xattri_da_args; @@ -439,7 +438,7 @@ xfs_attr_complete_op( static int xfs_attr_leaf_addname( - struct xfs_attr_item *attr) + struct xfs_attr_intent *attr) { struct xfs_da_args *args = attr->xattri_da_args; int error; @@ -493,7 +492,7 @@ out: */ static int xfs_attr_node_addname( - struct xfs_attr_item *attr) + struct xfs_attr_intent *attr) { struct xfs_da_args *args = attr->xattri_da_args; int error; @@ -530,7 +529,7 @@ out: static int xfs_attr_rmtval_alloc( - struct xfs_attr_item *attr) + struct xfs_attr_intent *attr) { struct xfs_da_args *args = attr->xattri_da_args; int error = 0; @@ -594,6 +593,19 @@ xfs_attr_leaf_mark_incomplete( return xfs_attr3_leaf_setflag(args); } +/* Ensure the da state of an xattr deferred work item is ready to go. 
*/ +static inline void +xfs_attr_item_init_da_state( + struct xfs_attr_intent *attr) +{ + struct xfs_da_args *args = attr->xattri_da_args; + + if (!attr->xattri_da_state) + attr->xattri_da_state = xfs_da_state_alloc(args); + else + xfs_da_state_reset(attr->xattri_da_state, args); +} + /* * Initial setup for xfs_attr_node_removename. Make sure the attr is there and * the blocks are valid. Attr keys with remote blocks will be marked @@ -601,29 +613,33 @@ xfs_attr_leaf_mark_incomplete( */ static int xfs_attr_node_removename_setup( - struct xfs_attr_item *attr) + struct xfs_attr_intent *attr) { struct xfs_da_args *args = attr->xattri_da_args; - struct xfs_da_state **state = &attr->xattri_da_state; + struct xfs_da_state *state; int error; - error = xfs_attr_node_hasname(args, state); + xfs_attr_item_init_da_state(attr); + error = xfs_attr_node_lookup(args, attr->xattri_da_state); if (error != -EEXIST) goto out; error = 0; - ASSERT((*state)->path.blk[(*state)->path.active - 1].bp != NULL); - ASSERT((*state)->path.blk[(*state)->path.active - 1].magic == + state = attr->xattri_da_state; + ASSERT(state->path.blk[state->path.active - 1].bp != NULL); + ASSERT(state->path.blk[state->path.active - 1].magic == XFS_ATTR_LEAF_MAGIC); - error = xfs_attr_leaf_mark_incomplete(args, *state); + error = xfs_attr_leaf_mark_incomplete(args, state); if (error) goto out; if (args->rmtblkno > 0) error = xfs_attr_rmtval_invalidate(args); out: - if (error) - xfs_da_state_free(*state); + if (error) { + xfs_da_state_free(attr->xattri_da_state); + attr->xattri_da_state = NULL; + } return error; } @@ -635,7 +651,7 @@ out: */ static int xfs_attr_leaf_remove_attr( - struct xfs_attr_item *attr) + struct xfs_attr_intent *attr) { struct xfs_da_args *args = attr->xattri_da_args; struct xfs_inode *dp = args->dp; @@ -700,7 +716,7 @@ xfs_attr_leaf_shrink( */ int xfs_attr_set_iter( - struct xfs_attr_item *attr) + struct xfs_attr_intent *attr) { struct xfs_da_args *args = attr->xattri_da_args; int error = 0; @@ -852,6 +868,7 @@ xfs_attr_lookup( { struct xfs_inode *dp = args->dp; struct xfs_buf *bp = NULL; + struct xfs_da_state *state; int error; if (!xfs_inode_hasattr(dp)) @@ -869,19 +886,22 @@ xfs_attr_lookup( return error; } - return xfs_attr_node_hasname(args, NULL); + state = xfs_da_state_alloc(args); + error = xfs_attr_node_lookup(args, state); + xfs_da_state_free(state); + return error; } static int -xfs_attr_item_init( +xfs_attr_intent_init( struct xfs_da_args *args, unsigned int op_flags, /* op flag (set or remove) */ - struct xfs_attr_item **attr) /* new xfs_attr_item */ + struct xfs_attr_intent **attr) /* new xfs_attr_intent */ { - struct xfs_attr_item *new; + struct xfs_attr_intent *new; - new = kmem_zalloc(sizeof(struct xfs_attr_item), KM_NOFS); + new = kmem_cache_zalloc(xfs_attr_intent_cache, GFP_NOFS | __GFP_NOFAIL); new->xattri_op_flags = op_flags; new->xattri_da_args = args; @@ -894,10 +914,10 @@ static int xfs_attr_defer_add( struct xfs_da_args *args) { - struct xfs_attr_item *new; + struct xfs_attr_intent *new; int error = 0; - error = xfs_attr_item_init(args, XFS_ATTR_OP_FLAGS_SET, &new); + error = xfs_attr_intent_init(args, XFS_ATTRI_OP_FLAGS_SET, &new); if (error) return error; @@ -913,10 +933,10 @@ static int xfs_attr_defer_replace( struct xfs_da_args *args) { - struct xfs_attr_item *new; + struct xfs_attr_intent *new; int error = 0; - error = xfs_attr_item_init(args, XFS_ATTR_OP_FLAGS_REPLACE, &new); + error = xfs_attr_intent_init(args, XFS_ATTRI_OP_FLAGS_REPLACE, &new); if (error) return error; @@ -933,10 
+953,10 @@ xfs_attr_defer_remove( struct xfs_da_args *args) { - struct xfs_attr_item *new; + struct xfs_attr_intent *new; int error; - error = xfs_attr_item_init(args, XFS_ATTR_OP_FLAGS_REMOVE, &new); + error = xfs_attr_intent_init(args, XFS_ATTRI_OP_FLAGS_REMOVE, &new); if (error) return error; @@ -962,7 +982,6 @@ xfs_attr_set( int error, local; int rmt_blks = 0; unsigned int total; - int delayed = xfs_has_larp(mp); if (xfs_is_shutdown(dp->i_mount)) return -EIO; @@ -1007,12 +1026,6 @@ xfs_attr_set( rmt_blks = xfs_attr3_rmt_blocks(mp, XFS_XATTR_SIZE_MAX); } - if (delayed) { - error = xfs_attr_use_log_assist(mp); - if (error) - return error; - } - /* * Root fork attributes can use reserved data blocks for this * operation if necessary @@ -1020,7 +1033,7 @@ xfs_attr_set( xfs_init_attr_trans(args, &tres, &total); error = xfs_trans_alloc_inode(dp, &tres, total, 0, rsvd, &args->trans); if (error) - goto drop_incompat; + return error; if (args->value || xfs_inode_hasattr(dp)) { error = xfs_iext_count_may_overflow(dp, XFS_ATTR_FORK, @@ -1080,9 +1093,6 @@ xfs_attr_set( error = xfs_trans_commit(args->trans); out_unlock: xfs_iunlock(dp, XFS_ILOCK_EXCL); -drop_incompat: - if (delayed) - xlog_drop_incompat_feat(mp->m_log); return error; out_trans_cancel: @@ -1091,40 +1101,6 @@ out_trans_cancel: goto out_unlock; } -int __init -xfs_attri_init_cache(void) -{ - xfs_attri_cache = kmem_cache_create("xfs_attri", - sizeof(struct xfs_attri_log_item), - 0, 0, NULL); - - return xfs_attri_cache != NULL ? 0 : -ENOMEM; -} - -void -xfs_attri_destroy_cache(void) -{ - kmem_cache_destroy(xfs_attri_cache); - xfs_attri_cache = NULL; -} - -int __init -xfs_attrd_init_cache(void) -{ - xfs_attrd_cache = kmem_cache_create("xfs_attrd", - sizeof(struct xfs_attrd_log_item), - 0, 0, NULL); - - return xfs_attrd_cache != NULL ? 0 : -ENOMEM; -} - -void -xfs_attrd_destroy_cache(void) -{ - kmem_cache_destroy(xfs_attrd_cache); - xfs_attrd_cache = NULL; -} - /*======================================================================== * External routines when attribute list is inside the inode *========================================================================*/ @@ -1384,32 +1360,20 @@ xfs_attr_leaf_get(xfs_da_args_t *args) return error; } -/* - * Return EEXIST if attr is found, or ENOATTR if not - * statep: If not null is set to point at the found state. Caller will - * be responsible for freeing the state in this case. - */ +/* Return EEXIST if attr is found, or ENOATTR if not. */ STATIC int -xfs_attr_node_hasname( +xfs_attr_node_lookup( struct xfs_da_args *args, - struct xfs_da_state **statep) + struct xfs_da_state *state) { - struct xfs_da_state *state; int retval, error; - state = xfs_da_state_alloc(args); - if (statep != NULL) - *statep = state; - /* * Search to see if name exists, and get back a pointer to it. */ error = xfs_da3_node_lookup_int(state, &retval); if (error) - retval = error; - - if (!statep) - xfs_da_state_free(state); + return error; return retval; } @@ -1420,7 +1384,7 @@ xfs_attr_node_hasname( STATIC int xfs_attr_node_addname_find_attr( - struct xfs_attr_item *attr) + struct xfs_attr_intent *attr) { struct xfs_da_args *args = attr->xattri_da_args; int error; @@ -1429,7 +1393,8 @@ xfs_attr_node_addname_find_attr( * Search to see if name already exists, and get back a pointer * to where it should go. 
*/ - error = xfs_attr_node_hasname(args, &attr->xattri_da_state); + xfs_attr_item_init_da_state(attr); + error = xfs_attr_node_lookup(args, attr->xattri_da_state); switch (error) { case -ENOATTR: if (args->op_flags & XFS_DA_OP_REPLACE) @@ -1456,8 +1421,10 @@ xfs_attr_node_addname_find_attr( return 0; error: - if (attr->xattri_da_state) + if (attr->xattri_da_state) { xfs_da_state_free(attr->xattri_da_state); + attr->xattri_da_state = NULL; + } return error; } @@ -1470,7 +1437,7 @@ error: */ static int xfs_attr_node_try_addname( - struct xfs_attr_item *attr) + struct xfs_attr_intent *attr) { struct xfs_da_args *args = attr->xattri_da_args; struct xfs_da_state *state = attr->xattri_da_state; @@ -1511,6 +1478,7 @@ xfs_attr_node_try_addname( out: xfs_da_state_free(state); + attr->xattri_da_state = NULL; return error; } @@ -1535,10 +1503,10 @@ xfs_attr_node_removename( static int xfs_attr_node_remove_attr( - struct xfs_attr_item *attr) + struct xfs_attr_intent *attr) { struct xfs_da_args *args = attr->xattri_da_args; - struct xfs_da_state *state = NULL; + struct xfs_da_state *state = xfs_da_state_alloc(args); int retval = 0; int error = 0; @@ -1548,8 +1516,6 @@ xfs_attr_node_remove_attr( * attribute entry after any split ops. */ args->attr_filter |= XFS_ATTR_INCOMPLETE; - state = xfs_da_state_alloc(args); - state->inleaf = 0; error = xfs_da3_node_lookup_int(state, &retval); if (error) goto out; @@ -1567,8 +1533,7 @@ xfs_attr_node_remove_attr( retval = error = 0; out: - if (state) - xfs_da_state_free(state); + xfs_da_state_free(state); if (error) return error; return retval; @@ -1597,7 +1562,8 @@ xfs_attr_node_get( /* * Search to see if name exists, and get back a pointer to it. */ - error = xfs_attr_node_hasname(args, &state); + state = xfs_da_state_alloc(args); + error = xfs_attr_node_lookup(args, state); if (error != -EEXIST) goto out_release; @@ -1616,8 +1582,7 @@ out_release: state->path.blk[i].bp = NULL; } - if (state) - xfs_da_state_free(state); + xfs_da_state_free(state); return error; } @@ -1637,3 +1602,20 @@ xfs_attr_namecheck( /* There shouldn't be any nulls here */ return !memchr(name, 0, length); } + +int __init +xfs_attr_intent_init_cache(void) +{ + xfs_attr_intent_cache = kmem_cache_create("xfs_attr_intent", + sizeof(struct xfs_attr_intent), + 0, 0, NULL); + + return xfs_attr_intent_cache != NULL ? 0 : -ENOMEM; +} + +void +xfs_attr_intent_destroy_cache(void) +{ + kmem_cache_destroy(xfs_attr_intent_cache); + xfs_attr_intent_cache = NULL; +} diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h index 1af7abe29eef..e329da3e7afa 100644 --- a/fs/xfs/libxfs/xfs_attr.h +++ b/fs/xfs/libxfs/xfs_attr.h @@ -31,7 +31,8 @@ struct xfs_attr_list_context; static inline bool xfs_has_larp(struct xfs_mount *mp) { #ifdef DEBUG - return xfs_globals.larp; + /* Logged xattrs require a V5 super for log_incompat */ + return xfs_has_crc(mp) && xfs_globals.larp; #else return false; #endif @@ -434,7 +435,7 @@ struct xfs_attr_list_context { */ /* - * Enum values for xfs_attr_item.xattri_da_state + * Enum values for xfs_attr_intent.xattri_da_state * * These values are used by delayed attribute operations to keep track of where * they were before they returned -EAGAIN. 
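
xfs_attr_item_init_da_state(), used by the lookups above, avoids a free/alloc cycle every time a deferred operation restarts after -EAGAIN: the da_state is allocated on first use and scrubbed in place on each retry via the new xfs_da_state_reset(). A standalone sketch of that alloc-or-reset shape, with hypothetical names:

    #include <stdlib.h>
    #include <string.h>

    struct args { int dummy; };

    struct state {
            struct args *args;
            /* ...per-lookup scratch space... */
    };

    static struct state *state_alloc(struct args *a)
    {
            struct state *s = calloc(1, sizeof(*s));

            if (s)
                    s->args = a;
            return s;
    }

    /* Like xfs_da_state_reset(): zero the scratch space in place and
     * rebind it, instead of free()+alloc() on every -EAGAIN cycle. */
    static void state_reset(struct state *s, struct args *a)
    {
            memset(s, 0, sizeof(*s));
            s->args = a;
    }

    static void init_state(struct state **sp, struct args *a)
    {
            if (!*sp)
                    *sp = state_alloc(a);
            else
                    state_reset(*sp, a);
    }

    int main(void)
    {
            struct args a = { 0 };
            struct state *s = NULL;

            init_state(&s, &a);     /* first pass: allocate */
            init_state(&s, &a);     /* retry: reset in place */
            free(s);
            return 0;
    }
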
A return code of -EAGAIN signals the @@ -501,44 +502,46 @@ enum xfs_delattr_state { { XFS_DAS_NODE_REMOVE_ATTR, "XFS_DAS_NODE_REMOVE_ATTR" }, \ { XFS_DAS_DONE, "XFS_DAS_DONE" } -/* - * Defines for xfs_attr_item.xattri_flags - */ -#define XFS_DAC_LEAF_ADDNAME_INIT 0x01 /* xfs_attr_leaf_addname init*/ +struct xfs_attri_log_nameval; /* * Context used for keeping track of delayed attribute operations */ -struct xfs_attr_item { +struct xfs_attr_intent { + /* + * used to log this item to an intent containing a list of attrs to + * commit later + */ + struct list_head xattri_list; + + /* Used in xfs_attr_node_removename to roll through removing blocks */ + struct xfs_da_state *xattri_da_state; + struct xfs_da_args *xattri_da_args; + /* + * Shared buffer containing the attr name and value so that the logging + * code can share large memory buffers between log items. + */ + struct xfs_attri_log_nameval *xattri_nameval; + /* * Used by xfs_attr_set to hold a leaf buffer across a transaction roll */ struct xfs_buf *xattri_leaf_bp; - /* Used in xfs_attr_rmtval_set_blk to roll through allocating blocks */ - struct xfs_bmbt_irec xattri_map; - xfs_dablk_t xattri_lblkno; - int xattri_blkcnt; - - /* Used in xfs_attr_node_removename to roll through removing blocks */ - struct xfs_da_state *xattri_da_state; - /* Used to keep track of current state of delayed operation */ - unsigned int xattri_flags; enum xfs_delattr_state xattri_dela_state; /* - * Attr operation being performed - XFS_ATTR_OP_FLAGS_* + * Attr operation being performed - XFS_ATTRI_OP_FLAGS_* */ unsigned int xattri_op_flags; - /* - * used to log this item to an intent containing a list of attrs to - * commit later - */ - struct list_head xattri_list; + /* Used in xfs_attr_rmtval_set_blk to roll through allocating blocks */ + xfs_dablk_t xattri_lblkno; + int xattri_blkcnt; + struct xfs_bmbt_irec xattri_map; }; @@ -557,21 +560,13 @@ bool xfs_attr_is_leaf(struct xfs_inode *ip); int xfs_attr_get_ilocked(struct xfs_da_args *args); int xfs_attr_get(struct xfs_da_args *args); int xfs_attr_set(struct xfs_da_args *args); -int xfs_attr_set_iter(struct xfs_attr_item *attr); -int xfs_attr_remove_iter(struct xfs_attr_item *attr); +int xfs_attr_set_iter(struct xfs_attr_intent *attr); +int xfs_attr_remove_iter(struct xfs_attr_intent *attr); bool xfs_attr_namecheck(const void *name, size_t length); int xfs_attr_calc_size(struct xfs_da_args *args, int *local); void xfs_init_attr_trans(struct xfs_da_args *args, struct xfs_trans_res *tres, unsigned int *total); -extern struct kmem_cache *xfs_attri_cache; -extern struct kmem_cache *xfs_attrd_cache; - -int __init xfs_attri_init_cache(void); -void xfs_attri_destroy_cache(void); -int __init xfs_attrd_init_cache(void); -void xfs_attrd_destroy_cache(void); - /* * Check to see if the attr should be upgraded from non-existent or shortform to * single-leaf-block attribute list. 
@@ -634,4 +629,8 @@ xfs_attr_init_replace_state(struct xfs_da_args *args) return xfs_attr_init_add_state(args); } +extern struct kmem_cache *xfs_attr_intent_cache; +int __init xfs_attr_intent_init_cache(void); +void xfs_attr_intent_destroy_cache(void); + #endif /* __XFS_ATTR_H__ */ diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index 4250159ecced..7298c148f848 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -568,7 +568,7 @@ xfs_attr_rmtval_stale( */ int xfs_attr_rmtval_find_space( - struct xfs_attr_item *attr) + struct xfs_attr_intent *attr) { struct xfs_da_args *args = attr->xattri_da_args; struct xfs_bmbt_irec *map = &attr->xattri_map; @@ -598,7 +598,7 @@ xfs_attr_rmtval_find_space( */ int xfs_attr_rmtval_set_blk( - struct xfs_attr_item *attr) + struct xfs_attr_intent *attr) { struct xfs_da_args *args = attr->xattri_da_args; struct xfs_inode *dp = args->dp; @@ -674,7 +674,7 @@ xfs_attr_rmtval_invalidate( */ int xfs_attr_rmtval_remove( - struct xfs_attr_item *attr) + struct xfs_attr_intent *attr) { struct xfs_da_args *args = attr->xattri_da_args; int error, done; diff --git a/fs/xfs/libxfs/xfs_attr_remote.h b/fs/xfs/libxfs/xfs_attr_remote.h index 62b398edec3f..d097ec6c4dc3 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.h +++ b/fs/xfs/libxfs/xfs_attr_remote.h @@ -12,9 +12,9 @@ int xfs_attr_rmtval_get(struct xfs_da_args *args); int xfs_attr_rmtval_stale(struct xfs_inode *ip, struct xfs_bmbt_irec *map, xfs_buf_flags_t incore_flags); int xfs_attr_rmtval_invalidate(struct xfs_da_args *args); -int xfs_attr_rmtval_remove(struct xfs_attr_item *attr); +int xfs_attr_rmtval_remove(struct xfs_attr_intent *attr); int xfs_attr_rmt_find_hole(struct xfs_da_args *args); int xfs_attr_rmtval_set_value(struct xfs_da_args *args); -int xfs_attr_rmtval_set_blk(struct xfs_attr_item *attr); -int xfs_attr_rmtval_find_space(struct xfs_attr_item *attr); +int xfs_attr_rmtval_set_blk(struct xfs_attr_intent *attr); +int xfs_attr_rmtval_find_space(struct xfs_attr_intent *attr); #endif /* __XFS_ATTR_REMOTE_H__ */ diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 2aa300f7461f..2eecc49fc1b2 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -51,16 +51,31 @@ xfs_btree_magic( return magic; } -static xfs_failaddr_t +/* + * These sibling pointer checks are optimised for null sibling pointers. This + * happens a lot, and we don't need to byte swap at runtime if the sibling + * pointer is NULL. + * + * These are explicitly marked at inline because the cost of calling them as + * functions instead of inlining them is about 36 bytes extra code per call site + * on x86-64. Yes, gcc-11 fails to inline them, and explicit inlining of these + * two sibling check functions reduces the compiled code size by over 300 + * bytes. 
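
Concretely, what the inlined sibling checks gain is that NULLFSBLOCK and NULLAGBLOCK are converted to on-disk (big-endian) form at compile time, so the common null-sibling case is a raw integer compare and only real sibling pointers are byte-swapped. The same pattern, runnable in userspace with htonl() standing in for cpu_to_be32():

    #include <arpa/inet.h>
    #include <stdint.h>
    #include <stdio.h>

    #define NULLAGBLOCK ((uint32_t)-1)

    /* The constant is byte-swapped once, up front; the hot path compares
     * the raw on-disk value without converting it first. */
    static int sibling_is_null(uint32_t disk_sibling_be)
    {
            return disk_sibling_be == htonl(NULLAGBLOCK);
    }

    int main(void)
    {
            uint32_t null_be = htonl(NULLAGBLOCK);
            uint32_t real_be = htonl(42);

            printf("null sibling: %d, real sibling: %d\n",
                   sibling_is_null(null_be), sibling_is_null(real_be));
            return 0;
    }
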
+ */ +static inline xfs_failaddr_t xfs_btree_check_lblock_siblings( struct xfs_mount *mp, struct xfs_btree_cur *cur, int level, xfs_fsblock_t fsb, - xfs_fsblock_t sibling) + __be64 dsibling) { - if (sibling == NULLFSBLOCK) + xfs_fsblock_t sibling; + + if (dsibling == cpu_to_be64(NULLFSBLOCK)) return NULL; + + sibling = be64_to_cpu(dsibling); if (sibling == fsb) return __this_address; if (level >= 0) { @@ -74,17 +89,21 @@ xfs_btree_check_lblock_siblings( return NULL; } -static xfs_failaddr_t +static inline xfs_failaddr_t xfs_btree_check_sblock_siblings( struct xfs_mount *mp, struct xfs_btree_cur *cur, int level, xfs_agnumber_t agno, xfs_agblock_t agbno, - xfs_agblock_t sibling) + __be32 dsibling) { - if (sibling == NULLAGBLOCK) + xfs_agblock_t sibling; + + if (dsibling == cpu_to_be32(NULLAGBLOCK)) return NULL; + + sibling = be32_to_cpu(dsibling); if (sibling == agbno) return __this_address; if (level >= 0) { @@ -136,10 +155,10 @@ __xfs_btree_check_lblock( fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp)); fa = xfs_btree_check_lblock_siblings(mp, cur, level, fsb, - be64_to_cpu(block->bb_u.l.bb_leftsib)); + block->bb_u.l.bb_leftsib); if (!fa) fa = xfs_btree_check_lblock_siblings(mp, cur, level, fsb, - be64_to_cpu(block->bb_u.l.bb_rightsib)); + block->bb_u.l.bb_rightsib); return fa; } @@ -204,10 +223,10 @@ __xfs_btree_check_sblock( } fa = xfs_btree_check_sblock_siblings(mp, cur, level, agno, agbno, - be32_to_cpu(block->bb_u.s.bb_leftsib)); + block->bb_u.s.bb_leftsib); if (!fa) fa = xfs_btree_check_sblock_siblings(mp, cur, level, agno, - agbno, be32_to_cpu(block->bb_u.s.bb_rightsib)); + agbno, block->bb_u.s.bb_rightsib); return fa; } @@ -426,8 +445,14 @@ xfs_btree_del_cursor( break; } + /* + * If we are doing a BMBT update, the number of unaccounted blocks + * allocated during this cursor life time should be zero. If it's not + * zero, then we should be shut down or on our way to shutdown due to + * cancelling a dirty transaction on error. 
+ */ ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP || cur->bc_ino.allocated == 0 || - xfs_is_shutdown(cur->bc_mp)); + xfs_is_shutdown(cur->bc_mp) || error != 0); if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) kmem_free(cur->bc_ops); if (!(cur->bc_flags & XFS_BTREE_LONG_PTRS) && cur->bc_ag.pag) @@ -3247,7 +3272,7 @@ xfs_btree_insrec( struct xfs_btree_block *block; /* btree block */ struct xfs_buf *bp; /* buffer for block */ union xfs_btree_ptr nptr; /* new block ptr */ - struct xfs_btree_cur *ncur; /* new btree cursor */ + struct xfs_btree_cur *ncur = NULL; /* new btree cursor */ union xfs_btree_key nkey; /* new block key */ union xfs_btree_key *lkey; int optr; /* old key/record index */ @@ -3327,7 +3352,7 @@ xfs_btree_insrec( #ifdef DEBUG error = xfs_btree_check_block(cur, block, level, bp); if (error) - return error; + goto error0; #endif /* @@ -3347,7 +3372,7 @@ xfs_btree_insrec( for (i = numrecs - ptr; i >= 0; i--) { error = xfs_btree_debug_check_ptr(cur, pp, i, level); if (error) - return error; + goto error0; } xfs_btree_shift_keys(cur, kp, 1, numrecs - ptr + 1); @@ -3432,6 +3457,8 @@ xfs_btree_insrec( return 0; error0: + if (ncur) + xfs_btree_del_cursor(ncur, error); return error; } @@ -4523,10 +4550,10 @@ xfs_btree_lblock_verify( /* sibling pointer verification */ fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp)); fa = xfs_btree_check_lblock_siblings(mp, NULL, -1, fsb, - be64_to_cpu(block->bb_u.l.bb_leftsib)); + block->bb_u.l.bb_leftsib); if (!fa) fa = xfs_btree_check_lblock_siblings(mp, NULL, -1, fsb, - be64_to_cpu(block->bb_u.l.bb_rightsib)); + block->bb_u.l.bb_rightsib); return fa; } @@ -4580,10 +4607,10 @@ xfs_btree_sblock_verify( agno = xfs_daddr_to_agno(mp, xfs_buf_daddr(bp)); agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp)); fa = xfs_btree_check_sblock_siblings(mp, NULL, -1, agno, agbno, - be32_to_cpu(block->bb_u.s.bb_leftsib)); + block->bb_u.s.bb_leftsib); if (!fa) fa = xfs_btree_check_sblock_siblings(mp, NULL, -1, agno, agbno, - be32_to_cpu(block->bb_u.s.bb_rightsib)); + block->bb_u.s.bb_rightsib); return fa; } diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index aa74f3fdb571..e7201dc68f43 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -117,6 +117,17 @@ xfs_da_state_free(xfs_da_state_t *state) kmem_cache_free(xfs_da_state_cache, state); } +void +xfs_da_state_reset( + struct xfs_da_state *state, + struct xfs_da_args *args) +{ + xfs_da_state_kill_altpath(state); + memset(state, 0, sizeof(struct xfs_da_state)); + state->args = args; + state->mp = state->args->dp->i_mount; +} + static inline int xfs_dabuf_nfsb(struct xfs_mount *mp, int whichfork) { if (whichfork == XFS_DATA_FORK) diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h index ed2303e4d46a..d33b7686a0b3 100644 --- a/fs/xfs/libxfs/xfs_da_btree.h +++ b/fs/xfs/libxfs/xfs_da_btree.h @@ -225,6 +225,7 @@ enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args, struct xfs_da_state *xfs_da_state_alloc(struct xfs_da_args *args); void xfs_da_state_free(xfs_da_state_t *state); +void xfs_da_state_reset(struct xfs_da_state *state, struct xfs_da_args *args); void xfs_da3_node_hdr_from_disk(struct xfs_mount *mp, struct xfs_da3_icnode_hdr *to, struct xfs_da_intnode *from); diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index ceb222b4f261..5a321b783398 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -191,35 +191,56 @@ static const struct xfs_defer_op_type *defer_op_types[] = { [XFS_DEFER_OPS_TYPE_ATTR] = 
&xfs_attr_defer_type, }; -static bool +/* + * Ensure there's a log intent item associated with this deferred work item if + * the operation must be restarted on crash. Returns 1 if there's a log item; + * 0 if there isn't; or a negative errno. + */ +static int xfs_defer_create_intent( struct xfs_trans *tp, struct xfs_defer_pending *dfp, bool sort) { const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type]; + struct xfs_log_item *lip; - if (!dfp->dfp_intent) - dfp->dfp_intent = ops->create_intent(tp, &dfp->dfp_work, - dfp->dfp_count, sort); - return dfp->dfp_intent != NULL; + if (dfp->dfp_intent) + return 1; + + lip = ops->create_intent(tp, &dfp->dfp_work, dfp->dfp_count, sort); + if (!lip) + return 0; + if (IS_ERR(lip)) + return PTR_ERR(lip); + + dfp->dfp_intent = lip; + return 1; } /* * For each pending item in the intake list, log its intent item and the * associated extents, then add the entire intake list to the end of * the pending list. + * + * Returns 1 if at least one log item was associated with the deferred work; + * 0 if there are no log items; or a negative errno. */ -static bool +static int xfs_defer_create_intents( struct xfs_trans *tp) { struct xfs_defer_pending *dfp; - bool ret = false; + int ret = 0; list_for_each_entry(dfp, &tp->t_dfops, dfp_list) { + int ret2; + trace_xfs_defer_create_intent(tp->t_mountp, dfp); - ret |= xfs_defer_create_intent(tp, dfp, true); + ret2 = xfs_defer_create_intent(tp, dfp, true); + if (ret2 < 0) + return ret2; + ret |= ret2; } return ret; } @@ -457,6 +478,8 @@ xfs_defer_finish_one( dfp->dfp_count--; error = ops->finish_item(tp, dfp->dfp_done, li, &state); if (error == -EAGAIN) { + int ret; + /* * Caller wants a fresh transaction; put the work item * back on the list and log a new log intent item to @@ -467,7 +490,9 @@ xfs_defer_finish_one( dfp->dfp_count++; dfp->dfp_done = NULL; dfp->dfp_intent = NULL; - xfs_defer_create_intent(tp, dfp, false); + ret = xfs_defer_create_intent(tp, dfp, false); + if (ret < 0) + error = ret; } if (error) @@ -514,10 +539,14 @@ xfs_defer_finish_noroll( * of time that any one intent item can stick around in memory, * pinning the log tail. */ - bool has_intents = xfs_defer_create_intents(*tp); + int has_intents = xfs_defer_create_intents(*tp); list_splice_init(&(*tp)->t_dfops, &dop_pending); + if (has_intents < 0) { + error = has_intents; + goto out_shutdown; + } if (has_intents || dfp) { error = xfs_defer_trans_roll(tp); if (error) @@ -676,13 +705,15 @@ xfs_defer_ops_capture( if (list_empty(&tp->t_dfops)) return NULL; + error = xfs_defer_create_intents(tp); + if (error < 0) + return ERR_PTR(error); + /* Create an object to capture the defer ops. */ dfc = kmem_zalloc(sizeof(*dfc), KM_NOFS); INIT_LIST_HEAD(&dfc->dfc_list); INIT_LIST_HEAD(&dfc->dfc_dfops); - xfs_defer_create_intents(tp); - /* Move the dfops chain and transaction state to the capture struct. */ list_splice_init(&tp->t_dfops, &dfc->dfc_dfops); dfc->dfc_tpflags = tp->t_flags & XFS_TRANS_LOWMODE; @@ -759,6 +790,10 @@ xfs_defer_ops_capture_and_commit( /* If we don't capture anything, commit transaction and exit. 
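
xfs_defer_create_intents() above now follows a tri-state convention: a negative errno aborts, 0 means no log item was needed, 1 means at least one intent was logged, and ret |= ret2 keeps a positive answer sticky across the loop. A compact model of that convention:

    #include <errno.h>
    #include <stdio.h>

    /* Returns 1 if an intent was created, 0 if none was needed, or a
     * negative errno on failure -- the shape of the reworked
     * xfs_defer_create_intent(). */
    static int create_intent(int want_item, int fail)
    {
            if (fail)
                    return -ENOMEM;
            return want_item ? 1 : 0;
    }

    int main(void)
    {
            int inputs[3][2] = { {0, 0}, {1, 0}, {0, 0} };
            int ret = 0;

            for (unsigned int i = 0; i < 3; i++) {
                    int ret2 = create_intent(inputs[i][0], inputs[i][1]);

                    if (ret2 < 0)
                            return -ret2;   /* abort on the first error */
                    ret |= ret2;            /* any 1 survives the loop */
            }
            printf("at least one intent logged: %d\n", ret);
            return 0;
    }
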
*/ dfc = xfs_defer_ops_capture(tp); + if (IS_ERR(dfc)) { + xfs_trans_cancel(tp); + return PTR_ERR(dfc); + } if (!dfc) return xfs_trans_commit(tp); @@ -873,10 +908,7 @@ xfs_defer_init_item_caches(void) error = xfs_extfree_intent_init_cache(); if (error) goto err; - error = xfs_attri_init_cache(); - if (error) - goto err; - error = xfs_attrd_init_cache(); + error = xfs_attr_intent_init_cache(); if (error) goto err; return 0; @@ -889,8 +921,7 @@ err: void xfs_defer_destroy_item_caches(void) { - xfs_attri_destroy_cache(); - xfs_attrd_destroy_cache(); + xfs_attr_intent_destroy_cache(); xfs_extfree_intent_destroy_cache(); xfs_bmap_intent_destroy_cache(); xfs_refcount_intent_destroy_cache(); diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index f7edd1ecf6d9..b351b9dc6561 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -906,10 +906,18 @@ struct xfs_icreate_log { * Flags for deferred attribute operations. * Upper bits are flags, lower byte is type code */ -#define XFS_ATTR_OP_FLAGS_SET 1 /* Set the attribute */ -#define XFS_ATTR_OP_FLAGS_REMOVE 2 /* Remove the attribute */ -#define XFS_ATTR_OP_FLAGS_REPLACE 3 /* Replace the attribute */ -#define XFS_ATTR_OP_FLAGS_TYPE_MASK 0xFF /* Flags type mask */ +#define XFS_ATTRI_OP_FLAGS_SET 1 /* Set the attribute */ +#define XFS_ATTRI_OP_FLAGS_REMOVE 2 /* Remove the attribute */ +#define XFS_ATTRI_OP_FLAGS_REPLACE 3 /* Replace the attribute */ +#define XFS_ATTRI_OP_FLAGS_TYPE_MASK 0xFF /* Flags type mask */ + +/* + * alfi_attr_filter captures the state of xfs_da_args.attr_filter, so it should + * never have any other bits set. + */ +#define XFS_ATTRI_FILTER_MASK (XFS_ATTR_ROOT | \ + XFS_ATTR_SECURE | \ + XFS_ATTR_INCOMPLETE) /* * This is the structure used to lay out an attr log item in the @@ -924,7 +932,7 @@ struct xfs_attri_log_format { uint32_t alfi_op_flags; /* marks the op as a set or remove */ uint32_t alfi_name_len; /* attr name length */ uint32_t alfi_value_len; /* attr value length */ - uint32_t alfi_attr_flags;/* attr flags */ + uint32_t alfi_attr_filter;/* attr filter flags */ }; struct xfs_attrd_log_format { diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h index 32e216255cb0..2420865f3007 100644 --- a/fs/xfs/libxfs/xfs_log_recover.h +++ b/fs/xfs/libxfs/xfs_log_recover.h @@ -110,12 +110,6 @@ struct xlog_recover { #define ITEM_TYPE(i) (*(unsigned short *)(i)->ri_buf[0].i_addr) -/* - * This is the number of entries in the l_buf_cancel_table used during - * recovery. - */ -#define XLOG_BC_TABLE_SIZE 64 - #define XLOG_RECOVER_CRCPASS 0 #define XLOG_RECOVER_PASS1 1 #define XLOG_RECOVER_PASS2 2 @@ -128,5 +122,13 @@ int xlog_recover_iget(struct xfs_mount *mp, xfs_ino_t ino, struct xfs_inode **ipp); void xlog_recover_release_intent(struct xlog *log, unsigned short intent_type, uint64_t intent_id); +int xlog_alloc_buf_cancel_table(struct xlog *log); +void xlog_free_buf_cancel_table(struct xlog *log); + +#ifdef DEBUG +void xlog_check_buf_cancel_table(struct xlog *log); +#else +#define xlog_check_buf_cancel_table(log) do { } while (0) +#endif #endif /* __XFS_LOG_RECOVER_H__ */ diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c index f0b38f4aba80..8b9bd178a487 100644 --- a/fs/xfs/libxfs/xfs_symlink_remote.c +++ b/fs/xfs/libxfs/xfs_symlink_remote.c @@ -213,7 +213,7 @@ xfs_symlink_shortform_verify( /* * Zero length symlinks should never occur in memory as they are - * never alllowed to exist on disk. 
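
xfs_defer_ops_capture() can now hand back three different things, which xfs_defer_ops_capture_and_commit() distinguishes with the kernel's encoded-pointer idiom: NULL for nothing to capture, an ERR_PTR()-wrapped errno for failure, and a real pointer otherwise. A userspace rendition of the idiom (the kernel's own definitions live in include/linux/err.h):

    #include <errno.h>
    #include <stdio.h>

    #define MAX_ERRNO 4095

    /* Small negative errnos occupy the top page of the address space,
     * which no valid object pointer ever can. */
    static inline void *ERR_PTR(long error) { return (void *)error; }
    static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
    static inline int IS_ERR(const void *ptr)
    {
            return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
    }

    int main(void)
    {
            void *dfc = ERR_PTR(-ENOMEM);

            if (IS_ERR(dfc))
                    printf("capture failed: %ld\n", PTR_ERR(dfc));
            else if (!dfc)
                    printf("nothing captured, plain commit\n");
            else
                    printf("captured, defer the commit\n");
            return 0;
    }

This is what lets an allocation failure while logging an intent cancel the transaction and shut down cleanly instead of being silently dropped.
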
+ * never allowed to exist on disk. */ if (!size) return __this_address; diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index b11870d07c56..2e8e400f10a9 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -340,20 +340,6 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { }, }; -/* This isn't a stable feature, warn once per day. */ -static inline void -xchk_experimental_warning( - struct xfs_mount *mp) -{ - static struct ratelimit_state scrub_warning = RATELIMIT_STATE_INIT( - "xchk_warning", 86400 * HZ, 1); - ratelimit_set_flags(&scrub_warning, RATELIMIT_MSG_ON_RELEASE); - - if (__ratelimit(&scrub_warning)) - xfs_alert(mp, -"EXPERIMENTAL online scrub feature in use. Use at your own risk!"); -} - static int xchk_validate_inputs( struct xfs_mount *mp, @@ -478,7 +464,8 @@ xfs_scrub_metadata( if (error) goto out; - xchk_experimental_warning(mp); + xfs_warn_mount(mp, XFS_OPSTATE_WARNED_SCRUB, + "EXPERIMENTAL online scrub feature in use. Use at your own risk!"); sc = kmem_zalloc(sizeof(struct xfs_scrub), KM_NOFS | KM_MAYFAIL); if (!sc) { diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index 3df9c1782ead..b744c62052b6 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -17,6 +17,7 @@ #include "xfs_error.h" #include "xfs_acl.h" #include "xfs_trans.h" +#include "xfs_xattr.h" #include @@ -202,7 +203,7 @@ __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) xfs_acl_to_disk(args.value, acl); } - error = xfs_attr_set(&args); + error = xfs_attr_change(&args); kmem_free(args.value); /* diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index e8ac88d9fd14..4a28c2d77070 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -22,13 +22,15 @@ #include "xfs_attr.h" #include "xfs_attr_item.h" #include "xfs_trace.h" -#include "xfs_inode.h" #include "xfs_trans_space.h" #include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_log_priv.h" #include "xfs_log_recover.h" +struct kmem_cache *xfs_attri_cache; +struct kmem_cache *xfs_attrd_cache; + static const struct xfs_item_ops xfs_attri_item_ops; static const struct xfs_item_ops xfs_attrd_item_ops; static struct xfs_attrd_log_item *xfs_trans_get_attrd(struct xfs_trans *tp, @@ -39,12 +41,80 @@ static inline struct xfs_attri_log_item *ATTRI_ITEM(struct xfs_log_item *lip) return container_of(lip, struct xfs_attri_log_item, attri_item); } +/* + * Shared xattr name/value buffers for logged extended attribute operations + * + * When logging updates to extended attributes, we can create quite a few + * attribute log intent items for a single xattr update. To avoid cycling the + * memory allocator and memcpy overhead, the name (and value, for setxattr) + * are kept in a refcounted object that is shared across all related log items + * and the upper-level deferred work state structure. The shared buffer has + * a control structure, followed by the name, and then the value. 
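
The layout just described -- a refcounted control structure with the name bytes and then the value bytes trailing it in a single allocation -- looks roughly like the sketch below. This is a simplified userspace model: the real object embeds two struct xfs_log_iovec members, is allocated with xlog_kvmalloc(), and its get helper uses refcount_inc_not_zero() rather than an unconditional increment.

    #include <stdatomic.h>
    #include <stdlib.h>
    #include <string.h>

    /* Header + name + value in one allocation; the refcount lets the
     * deferred work item and any number of log items share it. */
    struct nameval {
            atomic_int refcount;
            size_t name_len;
            size_t value_len;
            /* name bytes, then value bytes, follow this struct */
    };

    static struct nameval *nameval_alloc(const void *name, size_t nlen,
                                         const void *value, size_t vlen)
    {
            struct nameval *nv = malloc(sizeof(*nv) + nlen + vlen);

            if (!nv)
                    return NULL;
            atomic_init(&nv->refcount, 1);
            nv->name_len = nlen;
            nv->value_len = vlen;
            memcpy(nv + 1, name, nlen);
            if (vlen)
                    memcpy((char *)(nv + 1) + nlen, value, vlen);
            return nv;
    }

    static struct nameval *nameval_get(struct nameval *nv)
    {
            atomic_fetch_add(&nv->refcount, 1);     /* simplified */
            return nv;
    }

    static void nameval_put(struct nameval *nv)
    {
            if (nv && atomic_fetch_sub(&nv->refcount, 1) == 1)
                    free(nv);
    }

    int main(void)
    {
            struct nameval *nv = nameval_alloc("user.test", 9, "hi", 2);
            struct nameval *item_ref = nameval_get(nv); /* log item's ref */

            nameval_put(nv);        /* original owner drops its reference */
            nameval_put(item_ref);  /* last put frees the whole buffer */
            return 0;
    }

Sharing one buffer this way is also what lets xfs_attri_item_relog(), later in this series, create the replacement log item without copying the name and value again.
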
+ */ + +static inline struct xfs_attri_log_nameval * +xfs_attri_log_nameval_get( + struct xfs_attri_log_nameval *nv) +{ + if (!refcount_inc_not_zero(&nv->refcount)) + return NULL; + return nv; +} + +static inline void +xfs_attri_log_nameval_put( + struct xfs_attri_log_nameval *nv) +{ + if (!nv) + return; + if (refcount_dec_and_test(&nv->refcount)) + kvfree(nv); +} + +static inline struct xfs_attri_log_nameval * +xfs_attri_log_nameval_alloc( + const void *name, + unsigned int name_len, + const void *value, + unsigned int value_len) +{ + struct xfs_attri_log_nameval *nv; + + /* + * This could be over 64kB in length, so we have to use kvmalloc() for + * this. But kvmalloc() utterly sucks, so we use our own version. + */ + nv = xlog_kvmalloc(sizeof(struct xfs_attri_log_nameval) + + name_len + value_len); + if (!nv) + return nv; + + nv->name.i_addr = nv + 1; + nv->name.i_len = name_len; + nv->name.i_type = XLOG_REG_TYPE_ATTR_NAME; + memcpy(nv->name.i_addr, name, name_len); + + if (value_len) { + nv->value.i_addr = nv->name.i_addr + name_len; + nv->value.i_len = value_len; + memcpy(nv->value.i_addr, value, value_len); + } else { + nv->value.i_addr = NULL; + nv->value.i_len = 0; + } + nv->value.i_type = XLOG_REG_TYPE_ATTR_VALUE; + + refcount_set(&nv->refcount, 1); + return nv; +} + STATIC void xfs_attri_item_free( struct xfs_attri_log_item *attrip) { kmem_free(attrip->attri_item.li_lv_shadow); - kvfree(attrip); + xfs_attri_log_nameval_put(attrip->attri_nameval); + kmem_cache_free(xfs_attri_cache, attrip); } /* @@ -73,16 +143,17 @@ xfs_attri_item_size( int *nbytes) { struct xfs_attri_log_item *attrip = ATTRI_ITEM(lip); + struct xfs_attri_log_nameval *nv = attrip->attri_nameval; *nvecs += 2; *nbytes += sizeof(struct xfs_attri_log_format) + - xlog_calc_iovec_len(attrip->attri_name_len); + xlog_calc_iovec_len(nv->name.i_len); - if (!attrip->attri_value_len) + if (!nv->value.i_len) return; *nvecs += 1; - *nbytes += xlog_calc_iovec_len(attrip->attri_value_len); + *nbytes += xlog_calc_iovec_len(nv->value.i_len); } /* @@ -97,6 +168,7 @@ xfs_attri_item_format( { struct xfs_attri_log_item *attrip = ATTRI_ITEM(lip); struct xfs_log_iovec *vecp = NULL; + struct xfs_attri_log_nameval *nv = attrip->attri_nameval; attrip->attri_format.alfi_type = XFS_LI_ATTRI; attrip->attri_format.alfi_size = 1; @@ -108,22 +180,18 @@ xfs_attri_item_format( * the log recovery. */ - ASSERT(attrip->attri_name_len > 0); + ASSERT(nv->name.i_len > 0); attrip->attri_format.alfi_size++; - if (attrip->attri_value_len > 0) + if (nv->value.i_len > 0) attrip->attri_format.alfi_size++; xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ATTRI_FORMAT, &attrip->attri_format, sizeof(struct xfs_attri_log_format)); - xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ATTR_NAME, - attrip->attri_name, - attrip->attri_name_len); - if (attrip->attri_value_len > 0) - xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ATTR_VALUE, - attrip->attri_value, - attrip->attri_value_len); + xlog_copy_from_iovec(lv, &vecp, &nv->name); + if (nv->value.i_len > 0) + xlog_copy_from_iovec(lv, &vecp, &nv->value); } /* @@ -158,41 +226,18 @@ xfs_attri_item_release( STATIC struct xfs_attri_log_item * xfs_attri_init( struct xfs_mount *mp, - uint32_t name_len, - uint32_t value_len) - + struct xfs_attri_log_nameval *nv) { struct xfs_attri_log_item *attrip; - uint32_t buffer_size = name_len + value_len; - if (buffer_size) { - /* - * This could be over 64kB in length, so we have to use - * kvmalloc() for this. But kvmalloc() utterly sucks, so we - * use own version. 
- */ - attrip = xlog_kvmalloc(sizeof(struct xfs_attri_log_item) + - buffer_size); - } else { - attrip = kmem_cache_alloc(xfs_attri_cache, - GFP_NOFS | __GFP_NOFAIL); - } - memset(attrip, 0, sizeof(struct xfs_attri_log_item)); + attrip = kmem_cache_zalloc(xfs_attri_cache, GFP_NOFS | __GFP_NOFAIL); - attrip->attri_name_len = name_len; - if (name_len) - attrip->attri_name = ((char *)attrip) + - sizeof(struct xfs_attri_log_item); - else - attrip->attri_name = NULL; - - attrip->attri_value_len = value_len; - if (value_len) - attrip->attri_value = ((char *)attrip) + - sizeof(struct xfs_attri_log_item) + - name_len; - else - attrip->attri_value = NULL; + /* + * Grab an extra reference to the name/value buffer for this log item. + * The caller retains its own reference! + */ + attrip->attri_nameval = xfs_attri_log_nameval_get(nv); + ASSERT(attrip->attri_nameval); xfs_log_item_init(mp, &attrip->attri_item, XFS_LI_ATTRI, &xfs_attri_item_ops); @@ -233,7 +278,7 @@ STATIC void xfs_attrd_item_free(struct xfs_attrd_log_item *attrdp) { kmem_free(attrdp->attrd_item.li_lv_shadow); - kmem_free(attrdp); + kmem_cache_free(xfs_attrd_cache, attrdp); } STATIC void @@ -297,7 +342,7 @@ xfs_attrd_item_intent( */ STATIC int xfs_xattri_finish_update( - struct xfs_attr_item *attr, + struct xfs_attr_intent *attr, struct xfs_attrd_log_item *attrdp) { struct xfs_da_args *args = attr->xattri_da_args; @@ -335,7 +380,7 @@ STATIC void xfs_attr_log_item( struct xfs_trans *tp, struct xfs_attri_log_item *attrip, - struct xfs_attr_item *attr) + const struct xfs_attr_intent *attr) { struct xfs_attri_log_format *attrp; @@ -343,23 +388,18 @@ xfs_attr_log_item( set_bit(XFS_LI_DIRTY, &attrip->attri_item.li_flags); /* - * At this point the xfs_attr_item has been constructed, and we've + * At this point the xfs_attr_intent has been constructed, and we've * created the log intent. Fill in the attri log item and log format - * structure with fields from this xfs_attr_item + * structure with fields from this xfs_attr_intent */ attrp = &attrip->attri_format; attrp->alfi_ino = attr->xattri_da_args->dp->i_ino; + ASSERT(!(attr->xattri_op_flags & ~XFS_ATTRI_OP_FLAGS_TYPE_MASK)); attrp->alfi_op_flags = attr->xattri_op_flags; - attrp->alfi_value_len = attr->xattri_da_args->valuelen; - attrp->alfi_name_len = attr->xattri_da_args->namelen; - attrp->alfi_attr_flags = attr->xattri_da_args->attr_filter; - - memcpy(attrip->attri_name, attr->xattri_da_args->name, - attr->xattri_da_args->namelen); - memcpy(attrip->attri_value, attr->xattri_da_args->value, - attr->xattri_da_args->valuelen); - attrip->attri_name_len = attr->xattri_da_args->namelen; - attrip->attri_value_len = attr->xattri_da_args->valuelen; + attrp->alfi_value_len = attr->xattri_nameval->value.i_len; + attrp->alfi_name_len = attr->xattri_nameval->name.i_len; + ASSERT(!(attr->xattri_da_args->attr_filter & ~XFS_ATTRI_FILTER_MASK)); + attrp->alfi_attr_filter = attr->xattri_da_args->attr_filter; } /* Get an ATTRI. 
*/ @@ -372,7 +412,7 @@ xfs_attr_create_intent( { struct xfs_mount *mp = tp->t_mountp; struct xfs_attri_log_item *attrip; - struct xfs_attr_item *attr; + struct xfs_attr_intent *attr; ASSERT(count == 1); @@ -383,19 +423,47 @@ xfs_attr_create_intent( * Each attr item only performs one attribute operation at a time, so * this is a list of one */ - list_for_each_entry(attr, items, xattri_list) { - attrip = xfs_attri_init(mp, attr->xattri_da_args->namelen, - attr->xattri_da_args->valuelen); - if (attrip == NULL) - return NULL; + attr = list_first_entry_or_null(items, struct xfs_attr_intent, + xattri_list); - xfs_trans_add_item(tp, &attrip->attri_item); - xfs_attr_log_item(tp, attrip, attr); + /* + * Create a buffer to store the attribute name and value. This buffer + * will be shared between the higher level deferred xattr work state + * and the lower level xattr log items. + */ + if (!attr->xattri_nameval) { + struct xfs_da_args *args = attr->xattri_da_args; + + /* + * Transfer our reference to the name/value buffer to the + * deferred work state structure. + */ + attr->xattri_nameval = xfs_attri_log_nameval_alloc(args->name, + args->namelen, args->value, args->valuelen); } + if (!attr->xattri_nameval) + return ERR_PTR(-ENOMEM); + + attrip = xfs_attri_init(mp, attr->xattri_nameval); + xfs_trans_add_item(tp, &attrip->attri_item); + xfs_attr_log_item(tp, attrip, attr); return &attrip->attri_item; } +static inline void +xfs_attr_free_item( + struct xfs_attr_intent *attr) +{ + if (attr->xattri_da_state) + xfs_da_state_free(attr->xattri_da_state); + xfs_attri_log_nameval_put(attr->xattri_nameval); + if (attr->xattri_da_args->op_flags & XFS_DA_OP_RECOVERY) + kmem_free(attr); + else + kmem_cache_free(xfs_attr_intent_cache, attr); +} + /* Process an attr. */ STATIC int xfs_attr_finish_item( @@ -404,11 +472,11 @@ xfs_attr_finish_item( struct list_head *item, struct xfs_btree_cur **state) { - struct xfs_attr_item *attr; + struct xfs_attr_intent *attr; struct xfs_attrd_log_item *done_item = NULL; int error; - attr = container_of(item, struct xfs_attr_item, xattri_list); + attr = container_of(item, struct xfs_attr_intent, xattri_list); if (done) done_item = ATTRD_ITEM(done); @@ -420,7 +488,7 @@ xfs_attr_finish_item( error = xfs_xattri_finish_update(attr, done_item); if (error != -EAGAIN) - kmem_free(attr); + xfs_attr_free_item(attr); return error; } @@ -438,33 +506,10 @@ STATIC void xfs_attr_cancel_item( struct list_head *item) { - struct xfs_attr_item *attr; + struct xfs_attr_intent *attr; - attr = container_of(item, struct xfs_attr_item, xattri_list); - kmem_free(attr); -} - -STATIC xfs_lsn_t -xfs_attri_item_committed( - struct xfs_log_item *lip, - xfs_lsn_t lsn) -{ - struct xfs_attri_log_item *attrip = ATTRI_ITEM(lip); - - /* - * The attrip refers to xfs_attr_item memory to log the name and value - * with the intent item. This already occurred when the intent was - * committed so these fields are no longer accessed. Clear them out of - * caution since we're about to free the xfs_attr_item. - */ - attrip->attri_name = NULL; - attrip->attri_value = NULL; - - /* - * The ATTRI is logged only once and cannot be moved in the log, so - * simply return the lsn at which it's been logged. 
- */ - return lsn; + attr = container_of(item, struct xfs_attr_intent, xattri_list); + xfs_attr_free_item(attr); } STATIC bool @@ -482,16 +527,22 @@ xfs_attri_validate( struct xfs_attri_log_format *attrp) { unsigned int op = attrp->alfi_op_flags & - XFS_ATTR_OP_FLAGS_TYPE_MASK; + XFS_ATTRI_OP_FLAGS_TYPE_MASK; if (attrp->__pad != 0) return false; + if (attrp->alfi_op_flags & ~XFS_ATTRI_OP_FLAGS_TYPE_MASK) + return false; + + if (attrp->alfi_attr_filter & ~XFS_ATTRI_FILTER_MASK) + return false; + /* alfi_op_flags should be either a set or remove */ switch (op) { - case XFS_ATTR_OP_FLAGS_SET: - case XFS_ATTR_OP_FLAGS_REPLACE: - case XFS_ATTR_OP_FLAGS_REMOVE: + case XFS_ATTRI_OP_FLAGS_SET: + case XFS_ATTRI_OP_FLAGS_REPLACE: + case XFS_ATTRI_OP_FLAGS_REMOVE: break; default: return false; @@ -517,13 +568,14 @@ xfs_attri_item_recover( struct list_head *capture_list) { struct xfs_attri_log_item *attrip = ATTRI_ITEM(lip); - struct xfs_attr_item *attr; + struct xfs_attr_intent *attr; struct xfs_mount *mp = lip->li_log->l_mp; struct xfs_inode *ip; struct xfs_da_args *args; struct xfs_trans *tp; struct xfs_trans_res tres; struct xfs_attri_log_format *attrp; + struct xfs_attri_log_nameval *nv = attrip->attri_nameval; int error, ret = 0; int total; int local; @@ -535,41 +587,50 @@ xfs_attri_item_recover( */ attrp = &attrip->attri_format; if (!xfs_attri_validate(mp, attrp) || - !xfs_attr_namecheck(attrip->attri_name, attrip->attri_name_len)) + !xfs_attr_namecheck(nv->name.i_addr, nv->name.i_len)) return -EFSCORRUPTED; error = xlog_recover_iget(mp, attrp->alfi_ino, &ip); if (error) return error; - attr = kmem_zalloc(sizeof(struct xfs_attr_item) + + attr = kmem_zalloc(sizeof(struct xfs_attr_intent) + sizeof(struct xfs_da_args), KM_NOFS); args = (struct xfs_da_args *)(attr + 1); attr->xattri_da_args = args; - attr->xattri_op_flags = attrp->alfi_op_flags; + attr->xattri_op_flags = attrp->alfi_op_flags & + XFS_ATTRI_OP_FLAGS_TYPE_MASK; + + /* + * We're reconstructing the deferred work state structure from the + * recovered log item. Grab a reference to the name/value buffer and + * attach it to the new work state. 
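
Note how little recovery trusts the on-disk bits: the op flags are masked with XFS_ATTRI_OP_FLAGS_TYPE_MASK above, and the filter word is masked with XFS_ATTRI_FILTER_MASK just below, even though xfs_attri_validate() already rejected undefined bits. The pattern in miniature, using illustrative mask values rather than the real XFS_ATTR_* bits:

    #include <stdint.h>
    #include <stdio.h>

    #define OP_TYPE_MASK  0xFFu  /* low byte carries the operation code */
    #define FILTER_MASK   0x07u  /* illustrative: only defined bits pass */

    int main(void)
    {
            /* Pretend these came off disk with stray high bits set. */
            uint32_t logged_op = 0xABCD0002u;
            uint32_t logged_filter = 0xF1u;

            /* Validation would reject undefined bits outright... */
            if (logged_op & ~OP_TYPE_MASK)
                    printf("corrupt: undefined op bits 0x%x\n",
                           (unsigned int)(logged_op & ~OP_TYPE_MASK));

            /* ...and the reconstructed work item keeps only known ones,
             * so nothing downstream can trip over garbage. */
            printf("op %u, filter 0x%x\n",
                   (unsigned int)(logged_op & OP_TYPE_MASK),
                   (unsigned int)(logged_filter & FILTER_MASK));
            return 0;
    }
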
+ */ + attr->xattri_nameval = xfs_attri_log_nameval_get(nv); + ASSERT(attr->xattri_nameval); args->dp = ip; args->geo = mp->m_attr_geo; args->whichfork = XFS_ATTR_FORK; - args->name = attrip->attri_name; - args->namelen = attrp->alfi_name_len; + args->name = nv->name.i_addr; + args->namelen = nv->name.i_len; args->hashval = xfs_da_hashname(args->name, args->namelen); - args->attr_filter = attrp->alfi_attr_flags; + args->attr_filter = attrp->alfi_attr_filter & XFS_ATTRI_FILTER_MASK; args->op_flags = XFS_DA_OP_RECOVERY | XFS_DA_OP_OKNOENT; - switch (attrp->alfi_op_flags & XFS_ATTR_OP_FLAGS_TYPE_MASK) { - case XFS_ATTR_OP_FLAGS_SET: - case XFS_ATTR_OP_FLAGS_REPLACE: - args->value = attrip->attri_value; - args->valuelen = attrp->alfi_value_len; + switch (attr->xattri_op_flags) { + case XFS_ATTRI_OP_FLAGS_SET: + case XFS_ATTRI_OP_FLAGS_REPLACE: + args->value = nv->value.i_addr; + args->valuelen = nv->value.i_len; args->total = xfs_attr_calc_size(args, &local); if (xfs_inode_hasattr(args->dp)) attr->xattri_dela_state = xfs_attr_init_replace_state(args); else attr->xattri_dela_state = xfs_attr_init_add_state(args); break; - case XFS_ATTR_OP_FLAGS_REMOVE: + case XFS_ATTRI_OP_FLAGS_REMOVE: if (!xfs_inode_hasattr(args->dp)) goto out; attr->xattri_dela_state = xfs_attr_init_remove_state(args); @@ -613,7 +674,7 @@ out_unlock: xfs_irele(ip); out: if (ret != -EAGAIN) - kmem_free(attr); + xfs_attr_free_item(attr); return error; } @@ -636,22 +697,18 @@ xfs_attri_item_relog( attrdp = xfs_trans_get_attrd(tp, old_attrip); set_bit(XFS_LI_DIRTY, &attrdp->attrd_item.li_flags); - new_attrip = xfs_attri_init(tp->t_mountp, old_attrp->alfi_name_len, - old_attrp->alfi_value_len); + /* + * Create a new log item that shares the same name/value buffer as the + * old log item. + */ + new_attrip = xfs_attri_init(tp->t_mountp, old_attrip->attri_nameval); new_attrp = &new_attrip->attri_format; new_attrp->alfi_ino = old_attrp->alfi_ino; new_attrp->alfi_op_flags = old_attrp->alfi_op_flags; new_attrp->alfi_value_len = old_attrp->alfi_value_len; new_attrp->alfi_name_len = old_attrp->alfi_name_len; - new_attrp->alfi_attr_flags = old_attrp->alfi_attr_flags; - - memcpy(new_attrip->attri_name, old_attrip->attri_name, - new_attrip->attri_name_len); - - if (new_attrip->attri_value_len > 0) - memcpy(new_attrip->attri_value, old_attrip->attri_value, - new_attrip->attri_value_len); + new_attrp->alfi_attr_filter = old_attrp->alfi_attr_filter; xfs_trans_add_item(tp, &new_attrip->attri_item); set_bit(XFS_LI_DIRTY, &new_attrip->attri_item.li_flags); @@ -666,47 +723,47 @@ xlog_recover_attri_commit_pass2( struct xlog_recover_item *item, xfs_lsn_t lsn) { - int error; struct xfs_mount *mp = log->l_mp; struct xfs_attri_log_item *attrip; struct xfs_attri_log_format *attri_formatp; - int region = 0; + struct xfs_attri_log_nameval *nv; + const void *attr_value = NULL; + const void *attr_name; + int error; - attri_formatp = item->ri_buf[region].i_addr; + attri_formatp = item->ri_buf[0].i_addr; + attr_name = item->ri_buf[1].i_addr; - /* Validate xfs_attri_log_format */ + /* Validate xfs_attri_log_format before the large memory allocation */ if (!xfs_attri_validate(mp, attri_formatp)) { XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); return -EFSCORRUPTED; } - /* memory alloc failure will cause replay to abort */ - attrip = xfs_attri_init(mp, attri_formatp->alfi_name_len, - attri_formatp->alfi_value_len); - if (attrip == NULL) + if (!xfs_attr_namecheck(attr_name, attri_formatp->alfi_name_len)) { + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); + 
return -EFSCORRUPTED; + } + + if (attri_formatp->alfi_value_len) + attr_value = item->ri_buf[2].i_addr; + + /* + * Memory alloc failure will cause replay to abort. We attach the + * name/value buffer to the recovered incore log item and drop our + * reference. + */ + nv = xfs_attri_log_nameval_alloc(attr_name, + attri_formatp->alfi_name_len, attr_value, + attri_formatp->alfi_value_len); + if (!nv) return -ENOMEM; - error = xfs_attri_copy_format(&item->ri_buf[region], - &attrip->attri_format); + attrip = xfs_attri_init(mp, nv); + error = xfs_attri_copy_format(&item->ri_buf[0], &attrip->attri_format); if (error) goto out; - region++; - memcpy(attrip->attri_name, item->ri_buf[region].i_addr, - attrip->attri_name_len); - - if (!xfs_attr_namecheck(attrip->attri_name, attrip->attri_name_len)) { - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); - error = -EFSCORRUPTED; - goto out; - } - - if (attrip->attri_value_len > 0) { - region++; - memcpy(attrip->attri_value, item->ri_buf[region].i_addr, - attrip->attri_value_len); - } - /* * The ATTRI has two references. One for the ATTRD and one for ATTRI to * ensure it makes it into the AIL. Insert the ATTRI into the AIL @@ -715,9 +772,11 @@ xlog_recover_attri_commit_pass2( */ xfs_trans_ail_insert(log->l_ailp, &attrip->attri_item, lsn); xfs_attri_release(attrip); + xfs_attri_log_nameval_put(nv); return 0; out: xfs_attri_item_free(attrip); + xfs_attri_log_nameval_put(nv); return error; } @@ -797,7 +856,6 @@ static const struct xfs_item_ops xfs_attri_item_ops = { .iop_size = xfs_attri_item_size, .iop_format = xfs_attri_item_format, .iop_unpin = xfs_attri_item_unpin, - .iop_committed = xfs_attri_item_committed, .iop_release = xfs_attri_item_release, .iop_recover = xfs_attri_item_recover, .iop_match = xfs_attri_item_match, diff --git a/fs/xfs/xfs_attr_item.h b/fs/xfs/xfs_attr_item.h index c3b779f82adb..3280a7930287 100644 --- a/fs/xfs/xfs_attr_item.h +++ b/fs/xfs/xfs_attr_item.h @@ -11,25 +11,30 @@ struct xfs_mount; struct kmem_zone; +struct xfs_attri_log_nameval { + struct xfs_log_iovec name; + struct xfs_log_iovec value; + refcount_t refcount; + + /* name and value follow the end of this struct */ +}; + /* * This is the "attr intention" log item. It is used to log the fact that some * extended attribute operations need to be processed. An operation is * currently either a set or remove. Set or remove operations are described by - * the xfs_attr_item which may be logged to this intent. + * the xfs_attr_intent which may be logged to this intent. * * During a normal attr operation, name and value point to the name and value * fields of the caller's xfs_da_args structure. During a recovery, the name * and value buffers are copied from the log, and stored in a trailing buffer - * attached to the xfs_attr_item until they are committed. They are freed when - * the xfs_attr_item itself is freed when the work is done. + * attached to the xfs_attr_intent until they are committed. They are freed + * when the xfs_attr_intent itself is freed when the work is done. 
*/ struct xfs_attri_log_item { struct xfs_log_item attri_item; atomic_t attri_refcount; - int attri_name_len; - int attri_value_len; - void *attri_name; - void *attri_value; + struct xfs_attri_log_nameval *attri_nameval; struct xfs_attri_log_format attri_format; }; @@ -43,4 +48,7 @@ struct xfs_attrd_log_item { struct xfs_attrd_log_format attrd_format; }; +extern struct kmem_cache *xfs_attri_cache; +extern struct kmem_cache *xfs_attrd_cache; + #endif /* __XFS_ATTR_ITEM_H__ */ diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c index e484251dc9c8..ffa94102094d 100644 --- a/fs/xfs/xfs_buf_item_recover.c +++ b/fs/xfs/xfs_buf_item_recover.c @@ -23,6 +23,15 @@ #include "xfs_dir2.h" #include "xfs_quota.h" +/* + * This is the number of entries in the l_buf_cancel_table used during + * recovery. + */ +#define XLOG_BC_TABLE_SIZE 64 + +#define XLOG_BUF_CANCEL_BUCKET(log, blkno) \ + ((log)->l_buf_cancel_table + ((uint64_t)blkno % XLOG_BC_TABLE_SIZE)) + /* * This structure is used during recovery to record the buf log items which * have been canceled and should not be replayed. @@ -993,3 +1002,60 @@ const struct xlog_recover_item_ops xlog_buf_item_ops = { .commit_pass1 = xlog_recover_buf_commit_pass1, .commit_pass2 = xlog_recover_buf_commit_pass2, }; + +#ifdef DEBUG +void +xlog_check_buf_cancel_table( + struct xlog *log) +{ + int i; + + for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) + ASSERT(list_empty(&log->l_buf_cancel_table[i])); +} +#endif + +int +xlog_alloc_buf_cancel_table( + struct xlog *log) +{ + void *p; + int i; + + ASSERT(log->l_buf_cancel_table == NULL); + + p = kmalloc_array(XLOG_BC_TABLE_SIZE, sizeof(struct list_head), + GFP_KERNEL); + if (!p) + return -ENOMEM; + + log->l_buf_cancel_table = p; + for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) + INIT_LIST_HEAD(&log->l_buf_cancel_table[i]); + + return 0; +} + +void +xlog_free_buf_cancel_table( + struct xlog *log) +{ + int i; + + if (!log->l_buf_cancel_table) + return; + + for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) { + struct xfs_buf_cancel *bc; + + while ((bc = list_first_entry_or_null( + &log->l_buf_cancel_table[i], + struct xfs_buf_cancel, bc_list))) { + list_del(&bc->bc_list); + kmem_free(bc); + } + } + + kmem_free(log->l_buf_cancel_table); + log->l_buf_cancel_table = NULL; +} diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index a60632ecc3f0..5a171c0b244b 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -576,9 +576,9 @@ xfs_file_dio_write_unaligned( * don't even bother trying the fast path in this case. */ if (iocb->ki_pos > isize || iocb->ki_pos + count >= isize) { -retry_exclusive: if (iocb->ki_flags & IOCB_NOWAIT) return -EAGAIN; +retry_exclusive: iolock = XFS_IOLOCK_EXCL; flags = IOMAP_DIO_FORCE_WAIT; } diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 888839e75d11..d4a77c53f94b 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -149,12 +149,7 @@ xfs_growfs_data_private( error = xfs_resizefs_init_new_ags(tp, &id, oagcount, nagcount, delta, &lastag_extended); } else { - static struct ratelimit_state shrink_warning = \ - RATELIMIT_STATE_INIT("shrink_warning", 86400 * HZ, 1); - ratelimit_set_flags(&shrink_warning, RATELIMIT_MSG_ON_RELEASE); - - if (__ratelimit(&shrink_warning)) - xfs_alert(mp, + xfs_warn_mount(mp, XFS_OPSTATE_WARNED_SHRINK, "EXPERIMENTAL online shrink feature in use. 
Use at your own risk!"); error = xfs_ag_shrink_space(mp, &tp, nagcount - 1, -delta); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index b2879870a17e..52d6f2c7d58b 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2622,7 +2622,7 @@ xfs_ifree( */ error = xfs_difree(tp, pag, ip->i_ino, &xic); if (error) - return error; + goto out; error = xfs_iunlink_remove(tp, pag, ip); if (error) diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 0e5cb7936206..5a364a7d58fd 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -37,6 +37,7 @@ #include "xfs_health.h" #include "xfs_reflink.h" #include "xfs_ioctl.h" +#include "xfs_xattr.h" #include #include @@ -524,7 +525,7 @@ xfs_attrmulti_attr_set( args.valuelen = len; } - error = xfs_attr_set(&args); + error = xfs_attr_change(&args); if (!error && (flags & XFS_IOC_ATTR_ROOT)) xfs_forget_acl(inode, name); kfree(args.value); diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index e912b7fee714..29f5b8b8aca6 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -24,6 +24,7 @@ #include "xfs_iomap.h" #include "xfs_error.h" #include "xfs_ioctl.h" +#include "xfs_xattr.h" #include #include @@ -61,7 +62,7 @@ xfs_initxattrs( .value = xattr->value, .valuelen = xattr->value_len, }; - error = xfs_attr_set(&args); + error = xfs_attr_change(&args); if (error < 0) break; } diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 9dc748abdf33..1e972f884a81 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -3877,44 +3877,3 @@ xlog_drop_incompat_feat( { up_read(&log->l_incompat_users); } - -/* - * Get permission to use log-assisted atomic exchange of file extents. - * - * Callers must not be running any transactions or hold any inode locks, and - * they must release the permission by calling xlog_drop_incompat_feat - * when they're done. - */ -int -xfs_attr_use_log_assist( - struct xfs_mount *mp) -{ - int error = 0; - - /* - * Protect ourselves from an idle log clearing the logged xattrs log - * incompat feature bit. - */ - xlog_use_incompat_feat(mp->m_log); - - /* - * If log-assisted xattrs are already enabled, the caller can use the - * log assisted swap functions with the log-incompat reference we got. - */ - if (xfs_sb_version_haslogxattrs(&mp->m_sb)) - return 0; - - /* Enable log-assisted xattrs. */ - error = xfs_add_incompat_log_feature(mp, - XFS_SB_FEAT_INCOMPAT_LOG_XATTRS); - if (error) - goto drop_incompat; - - xfs_warn_once(mp, -"EXPERIMENTAL logged extended attributes feature added. 
Use at your own risk!"); - - return 0; -drop_incompat: - xlog_drop_incompat_feat(mp->m_log); - return error; -} diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 252b098cde1f..f3ce046a7d45 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -86,6 +86,13 @@ xlog_copy_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp, return buf; } +static inline void * +xlog_copy_from_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp, + const struct xfs_log_iovec *src) +{ + return xlog_copy_iovec(lv, vecp, src->i_type, src->i_addr, src->i_len); +} + /* * By comparing each component, we don't have to worry about extra * endian issues in treating two 32 bit numbers as one 64 bit number diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 67fd9789e69a..686c01eb3661 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -428,9 +428,6 @@ struct xlog { struct rw_semaphore l_incompat_users; }; -#define XLOG_BUF_CANCEL_BUCKET(log, blkno) \ - ((log)->l_buf_cancel_table + ((uint64_t)blkno % XLOG_BC_TABLE_SIZE)) - /* * Bits for operational state */ diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 97b941c07957..5f7e4e6e33ce 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -39,13 +39,6 @@ STATIC int xlog_clear_stale_blocks( struct xlog *, xfs_lsn_t); -#if defined(DEBUG) -STATIC void -xlog_recover_check_summary( - struct xlog *); -#else -#define xlog_recover_check_summary(log) -#endif STATIC int xlog_do_recovery_pass( struct xlog *, xfs_daddr_t, xfs_daddr_t, int, xfs_daddr_t *); @@ -3230,7 +3223,7 @@ xlog_do_log_recovery( xfs_daddr_t head_blk, xfs_daddr_t tail_blk) { - int error, i; + int error; ASSERT(head_blk != tail_blk); @@ -3238,37 +3231,25 @@ xlog_do_log_recovery( * First do a pass to find all of the cancelled buf log items. * Store them in the buf_cancel_table for use in the second pass. */ - log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE * - sizeof(struct list_head), - 0); - for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) - INIT_LIST_HEAD(&log->l_buf_cancel_table[i]); + error = xlog_alloc_buf_cancel_table(log); + if (error) + return error; error = xlog_do_recovery_pass(log, head_blk, tail_blk, XLOG_RECOVER_PASS1, NULL); - if (error != 0) { - kmem_free(log->l_buf_cancel_table); - log->l_buf_cancel_table = NULL; - return error; - } + if (error != 0) + goto out_cancel; + /* * Then do a second pass to actually recover the items in the log. * When it is complete free the table of buf cancel items. 
*/ error = xlog_do_recovery_pass(log, head_blk, tail_blk, XLOG_RECOVER_PASS2, NULL); -#ifdef DEBUG - if (!error) { - int i; - - for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) - ASSERT(list_empty(&log->l_buf_cancel_table[i])); - } -#endif /* DEBUG */ - - kmem_free(log->l_buf_cancel_table); - log->l_buf_cancel_table = NULL; - + if (!error) + xlog_check_buf_cancel_table(log); +out_cancel: + xlog_free_buf_cancel_table(log); return error; } @@ -3339,8 +3320,6 @@ xlog_do_recover( } mp->m_alloc_set_aside = xfs_alloc_set_aside(mp); - xlog_recover_check_summary(log); - /* Normal transactions can now occur */ clear_bit(XLOG_ACTIVE_RECOVERY, &log->l_opstate); return 0; @@ -3483,7 +3462,6 @@ xlog_recover_finish( } xlog_recover_process_iunlinks(log); - xlog_recover_check_summary(log); /* * Recover any CoW staging blocks that are still referenced by the @@ -3517,52 +3495,3 @@ xlog_recover_cancel( xlog_recover_cancel_intents(log); } -#if defined(DEBUG) -/* - * Read all of the agf and agi counters and check that they - * are consistent with the superblock counters. - */ -STATIC void -xlog_recover_check_summary( - struct xlog *log) -{ - struct xfs_mount *mp = log->l_mp; - struct xfs_perag *pag; - struct xfs_buf *agfbp; - struct xfs_buf *agibp; - xfs_agnumber_t agno; - uint64_t freeblks; - uint64_t itotal; - uint64_t ifree; - int error; - - freeblks = 0LL; - itotal = 0LL; - ifree = 0LL; - for_each_perag(mp, agno, pag) { - error = xfs_read_agf(mp, NULL, pag->pag_agno, 0, &agfbp); - if (error) { - xfs_alert(mp, "%s agf read failed agno %d error %d", - __func__, pag->pag_agno, error); - } else { - struct xfs_agf *agfp = agfbp->b_addr; - - freeblks += be32_to_cpu(agfp->agf_freeblks) + - be32_to_cpu(agfp->agf_flcount); - xfs_buf_relse(agfbp); - } - - error = xfs_read_agi(mp, NULL, pag->pag_agno, &agibp); - if (error) { - xfs_alert(mp, "%s agi read failed agno %d error %d", - __func__, pag->pag_agno, error); - } else { - struct xfs_agi *agi = agibp->b_addr; - - itotal += be32_to_cpu(agi->agi_count); - ifree += be32_to_cpu(agi->agi_freecount); - xfs_buf_relse(agibp); - } - } -} -#endif /* DEBUG */ diff --git a/fs/xfs/xfs_message.h b/fs/xfs/xfs_message.h index 55ee464ab59f..cc323775a12c 100644 --- a/fs/xfs/xfs_message.h +++ b/fs/xfs/xfs_message.h @@ -75,6 +75,12 @@ do { \ #define xfs_debug_ratelimited(dev, fmt, ...) \ xfs_printk_ratelimited(xfs_debug, dev, fmt, ##__VA_ARGS__) +#define xfs_warn_mount(mp, warntag, fmt, ...) \ +do { \ + if (xfs_should_warn((mp), (warntag))) \ + xfs_warn((mp), (fmt), ##__VA_ARGS__); \ +} while (0) + #define xfs_warn_once(dev, fmt, ...) \ xfs_printk_once(xfs_warn, dev, fmt, ##__VA_ARGS__) #define xfs_notice_once(dev, fmt, ...) \ diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 0c0bcbd4949d..daa8d29c46b4 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1356,7 +1356,6 @@ xfs_clear_incompat_log_features( if (xfs_sb_has_incompat_log_feature(&mp->m_sb, XFS_SB_FEAT_INCOMPAT_LOG_ALL)) { - xfs_info(mp, "Clearing log incompat feature flags."); xfs_sb_remove_incompat_log_features(&mp->m_sb); ret = true; } diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 8c42786e4942..ba5d42abf66e 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -391,6 +391,13 @@ __XFS_HAS_FEAT(nouuid, NOUUID) */ #define XFS_OPSTATE_BLOCKGC_ENABLED 6 +/* Kernel has logged a warning about online fsck being used on this fs. */ +#define XFS_OPSTATE_WARNED_SCRUB 7 +/* Kernel has logged a warning about shrink being used on this fs. 
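These per-mount XFS_OPSTATE_WARNED_* bits exist so each experimental-feature warning fires once per mount: xfs_should_warn() is just !test_and_set_bit(), so only the first caller sees true. A compact userspace model of the same idea, with a C11 atomic flag standing in for the opstate bit (the _demo names are invented for illustration):

#include <stdatomic.h>
#include <stdio.h>

static atomic_flag warned_shrink_demo = ATOMIC_FLAG_INIT;

static void warn_shrink_once_demo(void)
{
	/* test-and-set returns the old value, like test_and_set_bit(). */
	if (!atomic_flag_test_and_set(&warned_shrink_demo))
		fprintf(stderr, "EXPERIMENTAL online shrink feature in use. Use at your own risk!\n");
}

int main(void)
{
	warn_shrink_once_demo();	/* prints */
	warn_shrink_once_demo();	/* silent */
	return 0;
}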
*/ +#define XFS_OPSTATE_WARNED_SHRINK 8 +/* Kernel has logged a warning about logged xattr updates being used. */ +#define XFS_OPSTATE_WARNED_LARP 9 + #define __XFS_IS_OPSTATE(name, NAME) \ static inline bool xfs_is_ ## name (struct xfs_mount *mp) \ { \ @@ -413,6 +420,12 @@ __XFS_IS_OPSTATE(readonly, READONLY) __XFS_IS_OPSTATE(inodegc_enabled, INODEGC_ENABLED) __XFS_IS_OPSTATE(blockgc_enabled, BLOCKGC_ENABLED) +static inline bool +xfs_should_warn(struct xfs_mount *mp, long nr) +{ + return !test_and_set_bit(nr, &mp->m_opstate); +} + #define XFS_OPSTATE_STRINGS \ { (1UL << XFS_OPSTATE_UNMOUNTING), "unmounting" }, \ { (1UL << XFS_OPSTATE_CLEAN), "clean" }, \ @@ -420,7 +433,10 @@ __XFS_IS_OPSTATE(blockgc_enabled, BLOCKGC_ENABLED) { (1UL << XFS_OPSTATE_INODE32), "inode32" }, \ { (1UL << XFS_OPSTATE_READONLY), "read_only" }, \ { (1UL << XFS_OPSTATE_INODEGC_ENABLED), "inodegc" }, \ - { (1UL << XFS_OPSTATE_BLOCKGC_ENABLED), "blockgc" } + { (1UL << XFS_OPSTATE_BLOCKGC_ENABLED), "blockgc" }, \ + { (1UL << XFS_OPSTATE_WARNED_SCRUB), "wscrub" }, \ + { (1UL << XFS_OPSTATE_WARNED_SHRINK), "wshrink" }, \ + { (1UL << XFS_OPSTATE_WARNED_LARP), "wlarp" } /* * Max and min values for mount-option defined I/O diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 8fc813cb6011..abf08bbf34a9 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -1308,8 +1308,15 @@ xfs_qm_quotacheck( error = xfs_iwalk_threaded(mp, 0, 0, xfs_qm_dqusage_adjust, 0, true, NULL); - if (error) + if (error) { + /* + * The inode walk may have partially populated the dquot + * caches. We must purge them before disabling quota and + * tearing down the quotainfo, or else the dquots will leak. + */ + xfs_qm_dqpurge_all(mp); goto error_return; + } /* * We've made all the changes that we need to make incore. Flush them diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 8495ef076ffc..ed18160e6181 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -38,6 +38,8 @@ #include "xfs_pwork.h" #include "xfs_ag.h" #include "xfs_defer.h" +#include "xfs_attr_item.h" +#include "xfs_xattr.h" #include #include @@ -2079,8 +2081,24 @@ xfs_init_caches(void) if (!xfs_bui_cache) goto out_destroy_bud_cache; + xfs_attrd_cache = kmem_cache_create("xfs_attrd_item", + sizeof(struct xfs_attrd_log_item), + 0, 0, NULL); + if (!xfs_attrd_cache) + goto out_destroy_bui_cache; + + xfs_attri_cache = kmem_cache_create("xfs_attri_item", + sizeof(struct xfs_attri_log_item), + 0, 0, NULL); + if (!xfs_attri_cache) + goto out_destroy_attrd_cache; + return 0; + out_destroy_attrd_cache: + kmem_cache_destroy(xfs_attrd_cache); + out_destroy_bui_cache: + kmem_cache_destroy(xfs_bui_cache); out_destroy_bud_cache: kmem_cache_destroy(xfs_bud_cache); out_destroy_cui_cache: @@ -2127,6 +2145,8 @@ xfs_destroy_caches(void) * destroy caches. 
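The attrd/attri cache registrations slot into xfs_init_caches()'s stacked-goto unwind, where each failure label destroys exactly what earlier steps created, in reverse order. A stripped-down sketch of that shape, using malloc/free as stand-ins for kmem_cache_create()/kmem_cache_destroy():

#include <errno.h>
#include <stdlib.h>

static void *bui_cache_demo, *attrd_cache_demo, *attri_cache_demo;

static int init_caches_demo(void)
{
	bui_cache_demo = malloc(16);
	if (!bui_cache_demo)
		return -ENOMEM;

	attrd_cache_demo = malloc(16);
	if (!attrd_cache_demo)
		goto out_destroy_bui_cache;

	attri_cache_demo = malloc(16);
	if (!attri_cache_demo)
		goto out_destroy_attrd_cache;

	return 0;

out_destroy_attrd_cache:
	free(attrd_cache_demo);
out_destroy_bui_cache:
	free(bui_cache_demo);
	return -ENOMEM;
}

int main(void)
{
	return init_caches_demo() ? 1 : 0;
}

The ordering discipline is the point: a new allocation goes at the end of the create sequence, and its cleanup label goes at the top of the unwind ladder.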
*/ rcu_barrier(); + kmem_cache_destroy(xfs_attri_cache); + kmem_cache_destroy(xfs_attrd_cache); kmem_cache_destroy(xfs_bui_cache); kmem_cache_destroy(xfs_bud_cache); kmem_cache_destroy(xfs_cui_cache); diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h index 167d23f92ffe..3cd5a51bace1 100644 --- a/fs/xfs/xfs_super.h +++ b/fs/xfs/xfs_super.h @@ -91,7 +91,6 @@ extern xfs_agnumber_t xfs_set_inode_alloc(struct xfs_mount *, xfs_agnumber_t agcount); extern const struct export_operations xfs_export_operations; -extern const struct xattr_handler *xfs_xattr_handlers[]; extern const struct quotactl_ops xfs_quotactl_operations; extern void xfs_reinit_percpu_counters(struct xfs_mount *mp); diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index 7a044afd4c46..35e13e125ec6 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -15,9 +15,86 @@ #include "xfs_da_btree.h" #include "xfs_attr.h" #include "xfs_acl.h" +#include "xfs_log.h" +#include "xfs_xattr.h" #include +/* + * Get permission to use log-assisted atomic exchange of file extents. + * + * Callers must not be running any transactions or hold any inode locks, and + * they must release the permission by calling xlog_drop_incompat_feat + * when they're done. + */ +static inline int +xfs_attr_grab_log_assist( + struct xfs_mount *mp) +{ + int error = 0; + + /* + * Protect ourselves from an idle log clearing the logged xattrs log + * incompat feature bit. + */ + xlog_use_incompat_feat(mp->m_log); + + /* + * If log-assisted xattrs are already enabled, the caller can use the + * log assisted swap functions with the log-incompat reference we got. + */ + if (xfs_sb_version_haslogxattrs(&mp->m_sb)) + return 0; + + /* Enable log-assisted xattrs. */ + error = xfs_add_incompat_log_feature(mp, + XFS_SB_FEAT_INCOMPAT_LOG_XATTRS); + if (error) + goto drop_incompat; + + xfs_warn_mount(mp, XFS_OPSTATE_WARNED_LARP, + "EXPERIMENTAL logged extended attributes feature in use. Use at your own risk!"); + + return 0; +drop_incompat: + xlog_drop_incompat_feat(mp->m_log); + return error; +} + +static inline void +xfs_attr_rele_log_assist( + struct xfs_mount *mp) +{ + xlog_drop_incompat_feat(mp->m_log); +} + +/* + * Set or remove an xattr, having grabbed the appropriate logging resources + * prior to calling libxfs. + */ +int +xfs_attr_change( + struct xfs_da_args *args) +{ + struct xfs_mount *mp = args->dp->i_mount; + bool use_logging = false; + int error; + + if (xfs_has_larp(mp)) { + error = xfs_attr_grab_log_assist(mp); + if (error) + return error; + + use_logging = true; + } + + error = xfs_attr_set(args); + + if (use_logging) + xfs_attr_rele_log_assist(mp); + return error; +} + static int xfs_xattr_get(const struct xattr_handler *handler, struct dentry *unused, @@ -56,7 +133,7 @@ xfs_xattr_set(const struct xattr_handler *handler, }; int error; - error = xfs_attr_set(&args); + error = xfs_attr_change(&args); if (!error && (handler->flags & XFS_ATTR_ROOT)) xfs_forget_acl(inode, name); return error; diff --git a/fs/xfs/xfs_xattr.h b/fs/xfs/xfs_xattr.h new file mode 100644 index 000000000000..2b09133b1b9b --- /dev/null +++ b/fs/xfs/xfs_xattr.h @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. 
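xfs_attr_change() is deliberately thin: when LARP is enabled it brackets the xattr operation with grab/release of the log-incompat reference, and otherwise calls straight through. A minimal sketch of that bracketing, with _demo stubs standing in for xfs_attr_grab_log_assist(), xfs_attr_set() and xfs_attr_rele_log_assist():

#include <stdbool.h>

static int grab_log_assist_demo(void)	{ return 0; /* take incompat ref */ }
static void rele_log_assist_demo(void)	{ /* drop incompat ref */ }
static int attr_set_demo(void)		{ return 0; /* the actual xattr op */ }

static int attr_change_demo(bool larp_enabled)
{
	bool use_logging = false;
	int error;

	if (larp_enabled) {
		error = grab_log_assist_demo();
		if (error)
			return error;
		use_logging = true;
	}

	error = attr_set_demo();

	/* Release even if the operation failed; the reference was ours either way. */
	if (use_logging)
		rele_log_assist_demo();
	return error;
}

int main(void)
{
	return attr_change_demo(true);
}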
+ */ +#ifndef __XFS_XATTR_H__ +#define __XFS_XATTR_H__ + +int xfs_attr_change(struct xfs_da_args *args); + +extern const struct xattr_handler *xfs_xattr_handlers[]; + +#endif /* __XFS_XATTR_H__ */ diff --git a/include/clocksource/timer-xilinx.h b/include/clocksource/timer-xilinx.h new file mode 100644 index 000000000000..c0f56fe6d22a --- /dev/null +++ b/include/clocksource/timer-xilinx.h @@ -0,0 +1,73 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * Copyright (C) 2021 Sean Anderson + */ + +#ifndef XILINX_TIMER_H +#define XILINX_TIMER_H + +#include + +#define TCSR0 0x00 +#define TLR0 0x04 +#define TCR0 0x08 +#define TCSR1 0x10 +#define TLR1 0x14 +#define TCR1 0x18 + +#define TCSR_MDT BIT(0) +#define TCSR_UDT BIT(1) +#define TCSR_GENT BIT(2) +#define TCSR_CAPT BIT(3) +#define TCSR_ARHT BIT(4) +#define TCSR_LOAD BIT(5) +#define TCSR_ENIT BIT(6) +#define TCSR_ENT BIT(7) +#define TCSR_TINT BIT(8) +#define TCSR_PWMA BIT(9) +#define TCSR_ENALL BIT(10) +#define TCSR_CASC BIT(11) + +struct clk; +struct device_node; +struct regmap; + +/** + * struct xilinx_timer_priv - Private data for Xilinx AXI timer drivers + * @map: Regmap of the device, possibly with an offset + * @clk: Parent clock + * @max: Maximum value of the counters + */ +struct xilinx_timer_priv { + struct regmap *map; + struct clk *clk; + u32 max; +}; + +/** + * xilinx_timer_tlr_cycles() - Calculate the TLR for a period specified + * in clock cycles + * @priv: The timer's private data + * @tcsr: The value of the TCSR register for this counter + * @cycles: The number of cycles in this period + * + * Callers of this function MUST ensure that @cycles is representable as + * a TLR. + * + * Return: The calculated value for TLR + */ +u32 xilinx_timer_tlr_cycles(struct xilinx_timer_priv *priv, u32 tcsr, + u64 cycles); + +/** + * xilinx_timer_get_period() - Get the current period of a counter + * @priv: The timer's private data + * @tlr: The value of TLR for this counter + * @tcsr: The value of TCSR for this counter + * + * Return: The period, in ns + */ +unsigned int xilinx_timer_get_period(struct xilinx_timer_priv *priv, + u32 tlr, u32 tcsr); + +#endif /* XILINX_TIMER_H */ diff --git a/include/dt-bindings/mfd/cros_ec.h b/include/dt-bindings/mfd/cros_ec.h new file mode 100644 index 000000000000..3b29cd049578 --- /dev/null +++ b/include/dt-bindings/mfd/cros_ec.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* + * DTS binding definitions used for the Chromium OS Embedded Controller. + * + * Copyright (c) 2022 The Chromium OS Authors. All rights reserved. + */ + +#ifndef _DT_BINDINGS_MFD_CROS_EC_H +#define _DT_BINDINGS_MFD_CROS_EC_H + +/* Typed channel for keyboard backlight. */ +#define CROS_EC_PWM_DT_KB_LIGHT 0 +/* Typed channel for display backlight. */ +#define CROS_EC_PWM_DT_DISPLAY_LIGHT 1 +/* Number of typed channels. */ +#define CROS_EC_PWM_DT_COUNT 2 + +#endif diff --git a/include/dt-bindings/reset/mt7986-resets.h b/include/dt-bindings/reset/mt7986-resets.h new file mode 100644 index 000000000000..af3d16c81192 --- /dev/null +++ b/include/dt-bindings/reset/mt7986-resets.h @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause) */ +/* + * Copyright (c) 2022 MediaTek Inc. 
+ * Author: Sam Shih
+ */
+
+#ifndef _DT_BINDINGS_RESET_CONTROLLER_MT7986
+#define _DT_BINDINGS_RESET_CONTROLLER_MT7986
+
+/* INFRACFG resets */
+#define MT7986_INFRACFG_PEXTP_MAC_SW_RST 6
+#define MT7986_INFRACFG_SSUSB_SW_RST 7
+#define MT7986_INFRACFG_EIP97_SW_RST 8
+#define MT7986_INFRACFG_AUDIO_SW_RST 13
+#define MT7986_INFRACFG_CQ_DMA_SW_RST 14
+
+#define MT7986_INFRACFG_TRNG_SW_RST 17
+#define MT7986_INFRACFG_AP_DMA_SW_RST 32
+#define MT7986_INFRACFG_I2C_SW_RST 33
+#define MT7986_INFRACFG_NFI_SW_RST 34
+#define MT7986_INFRACFG_SPI0_SW_RST 35
+#define MT7986_INFRACFG_SPI1_SW_RST 36
+#define MT7986_INFRACFG_UART0_SW_RST 37
+#define MT7986_INFRACFG_UART1_SW_RST 38
+#define MT7986_INFRACFG_UART2_SW_RST 39
+#define MT7986_INFRACFG_AUXADC_SW_RST 43
+
+#define MT7986_INFRACFG_APXGPT_SW_RST 66
+#define MT7986_INFRACFG_PWM_SW_RST 68
+
+#define MT7986_INFRACFG_SW_RST_NUM 69
+
+/* TOPRGU resets */
+#define MT7986_TOPRGU_APMIXEDSYS_SW_RST 0
+#define MT7986_TOPRGU_SGMII0_SW_RST 1
+#define MT7986_TOPRGU_SGMII1_SW_RST 2
+#define MT7986_TOPRGU_INFRA_SW_RST 3
+#define MT7986_TOPRGU_U2PHY_SW_RST 5
+#define MT7986_TOPRGU_PCIE_SW_RST 6
+#define MT7986_TOPRGU_SSUSB_SW_RST 7
+#define MT7986_TOPRGU_ETHDMA_SW_RST 20
+#define MT7986_TOPRGU_CONSYS_SW_RST 23
+
+#define MT7986_TOPRGU_SW_RST_NUM 24
+
+/* ETHSYS Subsystem resets */
+#define MT7986_ETHSYS_FE_SW_RST 6
+#define MT7986_ETHSYS_PMTR_SW_RST 8
+#define MT7986_ETHSYS_GMAC_SW_RST 23
+#define MT7986_ETHSYS_PPE0_SW_RST 30
+#define MT7986_ETHSYS_PPE1_SW_RST 31
+
+#define MT7986_ETHSYS_SW_RST_NUM 32
+
+#endif /* _DT_BINDINGS_RESET_CONTROLLER_MT7986 */
diff --git a/include/dt-bindings/reset/mt8186-resets.h b/include/dt-bindings/reset/mt8186-resets.h
new file mode 100644
index 000000000000..5f850370c42c
--- /dev/null
+++ b/include/dt-bindings/reset/mt8186-resets.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause) */
+/*
+ * Copyright (c) 2022 MediaTek Inc.
+ * Author: Runyang Chen + */ + +#ifndef _DT_BINDINGS_RESET_CONTROLLER_MT8186 +#define _DT_BINDINGS_RESET_CONTROLLER_MT8186 + +#define MT8186_TOPRGU_INFRA_SW_RST 0 +#define MT8186_TOPRGU_MM_SW_RST 1 +#define MT8186_TOPRGU_MFG_SW_RST 2 +#define MT8186_TOPRGU_VENC_SW_RST 3 +#define MT8186_TOPRGU_VDEC_SW_RST 4 +#define MT8186_TOPRGU_IMG_SW_RST 5 +#define MT8186_TOPRGU_DDR_SW_RST 6 +#define MT8186_TOPRGU_INFRA_AO_SW_RST 8 +#define MT8186_TOPRGU_CONNSYS_SW_RST 9 +#define MT8186_TOPRGU_APMIXED_SW_RST 10 +#define MT8186_TOPRGU_PWRAP_SW_RST 11 +#define MT8186_TOPRGU_CONN_MCU_SW_RST 12 +#define MT8186_TOPRGU_IPNNA_SW_RST 13 +#define MT8186_TOPRGU_WPE_SW_RST 14 +#define MT8186_TOPRGU_ADSP_SW_RST 15 +#define MT8186_TOPRGU_AUDIO_SW_RST 17 +#define MT8186_TOPRGU_CAM_MAIN_SW_RST 18 +#define MT8186_TOPRGU_CAM_RAWA_SW_RST 19 +#define MT8186_TOPRGU_CAM_RAWB_SW_RST 20 +#define MT8186_TOPRGU_IPE_SW_RST 21 +#define MT8186_TOPRGU_IMG2_SW_RST 22 +#define MT8186_TOPRGU_SW_RST_NUM 23 + +/* MMSYS resets */ +#define MT8186_MMSYS_SW0_RST_B_DISP_DSI0 19 + +#endif /* _DT_BINDINGS_RESET_CONTROLLER_MT8186 */ diff --git a/include/linux/livepatch.h b/include/linux/livepatch.h index 2614247a9781..293e29960c6e 100644 --- a/include/linux/livepatch.h +++ b/include/linux/livepatch.h @@ -16,8 +16,6 @@ #if IS_ENABLED(CONFIG_LIVEPATCH) -#include - /* task patch states */ #define KLP_UNDEFINED -1 #define KLP_UNPATCHED 0 diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index b064bc278f52..5040cd774c5a 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -447,6 +447,11 @@ struct mlx5_qp_table { struct radix_tree_root tree; }; +enum { + MLX5_PF_NOTIFY_DISABLE_VF, + MLX5_PF_NOTIFY_ENABLE_VF, +}; + struct mlx5_vf_context { int enabled; u64 port_guid; @@ -457,6 +462,7 @@ struct mlx5_vf_context { u8 port_guid_valid:1; u8 node_guid_valid:1; enum port_state_policy policy; + struct blocking_notifier_head notifier; }; struct mlx5_core_sriov { @@ -1162,6 +1168,12 @@ int mlx5_dm_sw_icm_dealloc(struct mlx5_core_dev *dev, enum mlx5_sw_icm_type type struct mlx5_core_dev *mlx5_vf_get_core_dev(struct pci_dev *pdev); void mlx5_vf_put_core_dev(struct mlx5_core_dev *mdev); +int mlx5_sriov_blocking_notifier_register(struct mlx5_core_dev *mdev, + int vf_id, + struct notifier_block *nb); +void mlx5_sriov_blocking_notifier_unregister(struct mlx5_core_dev *mdev, + int vf_id, + struct notifier_block *nb); #ifdef CONFIG_MLX5_CORE_IPOIB struct net_device *mlx5_rdma_netdev_alloc(struct mlx5_core_dev *mdev, struct ib_device *ibdev, diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index 5662d8be04eb..8d04b6a5964c 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -451,6 +451,8 @@ enum lock_type4 { #define FATTR4_WORD1_TIME_MODIFY (1UL << 21) #define FATTR4_WORD1_TIME_MODIFY_SET (1UL << 22) #define FATTR4_WORD1_MOUNTED_ON_FILEID (1UL << 23) +#define FATTR4_WORD1_DACL (1UL << 26) +#define FATTR4_WORD1_SACL (1UL << 27) #define FATTR4_WORD1_FS_LAYOUT_TYPES (1UL << 30) #define FATTR4_WORD2_LAYOUT_TYPES (1UL << 0) #define FATTR4_WORD2_LAYOUT_BLKSIZE (1UL << 1) diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 157d2bd6b241..ea2f7e6b1b0b 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -287,4 +287,5 @@ struct nfs_server { #define NFS_CAP_XATTR (1U << 28) #define NFS_CAP_READ_PLUS (1U << 29) #define NFS_CAP_FS_LOCATIONS (1U << 30) +#define NFS_CAP_MOVEABLE (1U << 31) #endif diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 
2863e5a69c6a..0e3aa0f5f324 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -800,9 +800,17 @@ struct nfs_setattrargs { const struct nfs4_label *label; }; +enum nfs4_acl_type { + NFS4ACL_NONE = 0, + NFS4ACL_ACL, + NFS4ACL_DACL, + NFS4ACL_SACL, +}; + struct nfs_setaclargs { struct nfs4_sequence_args seq_args; struct nfs_fh * fh; + enum nfs4_acl_type acl_type; size_t acl_len; struct page ** acl_pages; }; @@ -814,6 +822,7 @@ struct nfs_setaclres { struct nfs_getaclargs { struct nfs4_sequence_args seq_args; struct nfs_fh * fh; + enum nfs4_acl_type acl_type; size_t acl_len; struct page ** acl_pages; }; @@ -822,6 +831,7 @@ struct nfs_getaclargs { #define NFS4_ACL_TRUNC 0x0001 /* ACL was truncated */ struct nfs_getaclres { struct nfs4_sequence_res seq_res; + enum nfs4_acl_type acl_type; size_t acl_len; size_t acl_data_offset; int acl_flags; @@ -1212,7 +1222,7 @@ struct nfs4_fs_location { #define NFS4_FS_LOCATIONS_MAXENTRIES 10 struct nfs4_fs_locations { - struct nfs_fattr fattr; + struct nfs_fattr *fattr; const struct nfs_server *server; struct nfs4_pathname fs_path; int nlocations; diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 66dda06ec42d..aa888cc51757 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -15,6 +15,8 @@ #include #include +struct kvm; + /* * VFIO devices can be placed in a set, this allows all devices to share this * structure and the VFIO core will provide a lock that is held around @@ -34,6 +36,8 @@ struct vfio_device { struct vfio_device_set *dev_set; struct list_head dev_set_list; unsigned int migration_flags; + /* Driver must reference the kvm during open_device or never touch it */ + struct kvm *kvm; /* Members below here are private, not for driver use */ refcount_t refcount; @@ -125,8 +129,6 @@ void vfio_uninit_group_dev(struct vfio_device *device); int vfio_register_group_dev(struct vfio_device *device); int vfio_register_emulated_iommu_dev(struct vfio_device *device); void vfio_unregister_group_dev(struct vfio_device *device); -extern struct vfio_device *vfio_device_get_from_dev(struct device *dev); -extern void vfio_device_put(struct vfio_device *device); int vfio_assign_device_set(struct vfio_device *device, void *set_id); @@ -138,56 +140,36 @@ int vfio_mig_get_next_state(struct vfio_device *device, /* * External user API */ -extern struct vfio_group *vfio_group_get_external_user(struct file *filep); -extern void vfio_group_put_external_user(struct vfio_group *group); -extern struct vfio_group *vfio_group_get_external_user_from_dev(struct device - *dev); -extern bool vfio_external_group_match_file(struct vfio_group *group, - struct file *filep); -extern int vfio_external_user_iommu_id(struct vfio_group *group); -extern long vfio_external_check_extension(struct vfio_group *group, - unsigned long arg); +extern struct iommu_group *vfio_file_iommu_group(struct file *file); +extern bool vfio_file_enforced_coherent(struct file *file); +extern void vfio_file_set_kvm(struct file *file, struct kvm *kvm); +extern bool vfio_file_has_dev(struct file *file, struct vfio_device *device); #define VFIO_PIN_PAGES_MAX_ENTRIES (PAGE_SIZE/sizeof(unsigned long)) -extern int vfio_pin_pages(struct device *dev, unsigned long *user_pfn, +extern int vfio_pin_pages(struct vfio_device *device, unsigned long *user_pfn, int npage, int prot, unsigned long *phys_pfn); -extern int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn, +extern int vfio_unpin_pages(struct vfio_device *device, unsigned long *user_pfn, int npage); - -extern int 
vfio_group_pin_pages(struct vfio_group *group, - unsigned long *user_iova_pfn, int npage, - int prot, unsigned long *phys_pfn); -extern int vfio_group_unpin_pages(struct vfio_group *group, - unsigned long *user_iova_pfn, int npage); - -extern int vfio_dma_rw(struct vfio_group *group, dma_addr_t user_iova, +extern int vfio_dma_rw(struct vfio_device *device, dma_addr_t user_iova, void *data, size_t len, bool write); -extern struct iommu_domain *vfio_group_iommu_domain(struct vfio_group *group); - /* each type has independent events */ enum vfio_notify_type { VFIO_IOMMU_NOTIFY = 0, - VFIO_GROUP_NOTIFY = 1, }; /* events for VFIO_IOMMU_NOTIFY */ #define VFIO_IOMMU_NOTIFY_DMA_UNMAP BIT(0) -/* events for VFIO_GROUP_NOTIFY */ -#define VFIO_GROUP_NOTIFY_SET_KVM BIT(0) - -extern int vfio_register_notifier(struct device *dev, +extern int vfio_register_notifier(struct vfio_device *device, enum vfio_notify_type type, unsigned long *required_events, struct notifier_block *nb); -extern int vfio_unregister_notifier(struct device *dev, +extern int vfio_unregister_notifier(struct vfio_device *device, enum vfio_notify_type type, struct notifier_block *nb); -struct kvm; -extern void vfio_group_set_kvm(struct vfio_group *group, struct kvm *kvm); /* * Sub-module helpers diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index 48f2dd3c568c..23c176d4b073 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -227,8 +227,9 @@ void vfio_pci_core_init_device(struct vfio_pci_core_device *vdev, int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev); void vfio_pci_core_uninit_device(struct vfio_pci_core_device *vdev); void vfio_pci_core_unregister_device(struct vfio_pci_core_device *vdev); -int vfio_pci_core_sriov_configure(struct pci_dev *pdev, int nr_virtfn); extern const struct pci_error_handlers vfio_pci_core_err_handlers; +int vfio_pci_core_sriov_configure(struct vfio_pci_core_device *vdev, + int nr_virtfn); long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, unsigned long arg); int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags, diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index bea654a85e6b..513e889ef8aa 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -15,10 +15,6 @@ TRACE_DEFINE_ENUM(NODE); TRACE_DEFINE_ENUM(DATA); TRACE_DEFINE_ENUM(META); TRACE_DEFINE_ENUM(META_FLUSH); -TRACE_DEFINE_ENUM(INMEM); -TRACE_DEFINE_ENUM(INMEM_DROP); -TRACE_DEFINE_ENUM(INMEM_INVALIDATE); -TRACE_DEFINE_ENUM(INMEM_REVOKE); TRACE_DEFINE_ENUM(IPU); TRACE_DEFINE_ENUM(OPU); TRACE_DEFINE_ENUM(HOT); @@ -59,10 +55,6 @@ TRACE_DEFINE_ENUM(CP_RESIZE); { DATA, "DATA" }, \ { META, "META" }, \ { META_FLUSH, "META_FLUSH" }, \ - { INMEM, "INMEM" }, \ - { INMEM_DROP, "INMEM_DROP" }, \ - { INMEM_INVALIDATE, "INMEM_INVALIDATE" }, \ - { INMEM_REVOKE, "INMEM_REVOKE" }, \ { IPU, "IN-PLACE" }, \ { OPU, "OUT-OF-PLACE" }) @@ -652,19 +644,22 @@ TRACE_EVENT(f2fs_background_gc, TRACE_EVENT(f2fs_gc_begin, - TP_PROTO(struct super_block *sb, bool sync, bool background, + TP_PROTO(struct super_block *sb, int gc_type, bool no_bg_gc, + unsigned int nr_free_secs, long long dirty_nodes, long long dirty_dents, long long dirty_imeta, unsigned int free_sec, unsigned int free_seg, int reserved_seg, unsigned int prefree_seg), - TP_ARGS(sb, sync, background, dirty_nodes, dirty_dents, dirty_imeta, + TP_ARGS(sb, gc_type, no_bg_gc, nr_free_secs, dirty_nodes, + dirty_dents, dirty_imeta, free_sec, free_seg, reserved_seg, 
prefree_seg), TP_STRUCT__entry( __field(dev_t, dev) - __field(bool, sync) - __field(bool, background) + __field(int, gc_type) + __field(bool, no_bg_gc) + __field(unsigned int, nr_free_secs) __field(long long, dirty_nodes) __field(long long, dirty_dents) __field(long long, dirty_imeta) @@ -676,8 +671,9 @@ TRACE_EVENT(f2fs_gc_begin, TP_fast_assign( __entry->dev = sb->s_dev; - __entry->sync = sync; - __entry->background = background; + __entry->gc_type = gc_type; + __entry->no_bg_gc = no_bg_gc; + __entry->nr_free_secs = nr_free_secs; __entry->dirty_nodes = dirty_nodes; __entry->dirty_dents = dirty_dents; __entry->dirty_imeta = dirty_imeta; @@ -687,12 +683,13 @@ TRACE_EVENT(f2fs_gc_begin, __entry->prefree_seg = prefree_seg; ), - TP_printk("dev = (%d,%d), sync = %d, background = %d, nodes = %lld, " - "dents = %lld, imeta = %lld, free_sec:%u, free_seg:%u, " + TP_printk("dev = (%d,%d), gc_type = %s, no_background_GC = %d, nr_free_secs = %u, " + "nodes = %lld, dents = %lld, imeta = %lld, free_sec:%u, free_seg:%u, " "rsv_seg:%d, prefree_seg:%u", show_dev(__entry->dev), - __entry->sync, - __entry->background, + show_gc_type(__entry->gc_type), + (__entry->gc_type == BG_GC) ? __entry->no_bg_gc : -1, + __entry->nr_free_secs, __entry->dirty_nodes, __entry->dirty_dents, __entry->dirty_imeta, @@ -1285,20 +1282,6 @@ DEFINE_EVENT(f2fs__page, f2fs_vm_page_mkwrite, TP_ARGS(page, type) ); -DEFINE_EVENT(f2fs__page, f2fs_register_inmem_page, - - TP_PROTO(struct page *page, int type), - - TP_ARGS(page, type) -); - -DEFINE_EVENT(f2fs__page, f2fs_commit_inmem_page, - - TP_PROTO(struct page *page, int type), - - TP_ARGS(page, type) -); - TRACE_EVENT(f2fs_filemap_fault, TP_PROTO(struct inode *inode, pgoff_t index, unsigned long ret), @@ -2063,6 +2046,100 @@ TRACE_EVENT(f2fs_fiemap, __entry->ret) ); +DECLARE_EVENT_CLASS(f2fs__rw_start, + + TP_PROTO(struct inode *inode, loff_t offset, int bytes, + pid_t pid, char *pathname, char *command), + + TP_ARGS(inode, offset, bytes, pid, pathname, command), + + TP_STRUCT__entry( + __string(pathbuf, pathname) + __field(loff_t, offset) + __field(int, bytes) + __field(loff_t, i_size) + __string(cmdline, command) + __field(pid_t, pid) + __field(ino_t, ino) + ), + + TP_fast_assign( + /* + * Replace the spaces in filenames and cmdlines + * because this screws up the tooling that parses + * the traces. 
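For reference, strreplace() rewrites every occurrence in place and returns a pointer to the terminating NUL, which is why a single call per string suffices here. A small userspace equivalent (strreplace_demo is a local re-implementation, not the kernel symbol):

#include <stdio.h>

static char *strreplace_demo(char *s, char old, char new)
{
	for (; *s; ++s)
		if (*s == old)
			*s = new;
	return s;	/* pointer to the terminating NUL, as in the kernel */
}

int main(void)
{
	char path[] = "/data/My Documents/a b.txt";

	strreplace_demo(path, ' ', '_');
	printf("%s\n", path);	/* /data/My_Documents/a_b.txt */
	return 0;
}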
+ */ + __assign_str(pathbuf, pathname); + (void)strreplace(__get_str(pathbuf), ' ', '_'); + __entry->offset = offset; + __entry->bytes = bytes; + __entry->i_size = i_size_read(inode); + __assign_str(cmdline, command); + (void)strreplace(__get_str(cmdline), ' ', '_'); + __entry->pid = pid; + __entry->ino = inode->i_ino; + ), + + TP_printk("entry_name %s, offset %llu, bytes %d, cmdline %s," + " pid %d, i_size %llu, ino %lu", + __get_str(pathbuf), __entry->offset, __entry->bytes, + __get_str(cmdline), __entry->pid, __entry->i_size, + (unsigned long) __entry->ino) +); + +DECLARE_EVENT_CLASS(f2fs__rw_end, + + TP_PROTO(struct inode *inode, loff_t offset, int bytes), + + TP_ARGS(inode, offset, bytes), + + TP_STRUCT__entry( + __field(ino_t, ino) + __field(loff_t, offset) + __field(int, bytes) + ), + + TP_fast_assign( + __entry->ino = inode->i_ino; + __entry->offset = offset; + __entry->bytes = bytes; + ), + + TP_printk("ino %lu, offset %llu, bytes %d", + (unsigned long) __entry->ino, + __entry->offset, __entry->bytes) +); + +DEFINE_EVENT(f2fs__rw_start, f2fs_dataread_start, + + TP_PROTO(struct inode *inode, loff_t offset, int bytes, + pid_t pid, char *pathname, char *command), + + TP_ARGS(inode, offset, bytes, pid, pathname, command) +); + +DEFINE_EVENT(f2fs__rw_end, f2fs_dataread_end, + + TP_PROTO(struct inode *inode, loff_t offset, int bytes), + + TP_ARGS(inode, offset, bytes) +); + +DEFINE_EVENT(f2fs__rw_start, f2fs_datawrite_start, + + TP_PROTO(struct inode *inode, loff_t offset, int bytes, + pid_t pid, char *pathname, char *command), + + TP_ARGS(inode, offset, bytes, pid, pathname, command) +); + +DEFINE_EVENT(f2fs__rw_end, f2fs_datawrite_end, + + TP_PROTO(struct inode *inode, loff_t offset, int bytes), + + TP_ARGS(inode, offset, bytes) +); + #endif /* _TRACE_F2FS_H */ /* This part must be outside protection */ diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index fea86061b44e..733a1cddde30 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -643,7 +643,7 @@ enum { }; /** - * VFIO_DEVICE_GET_PCI_HOT_RESET_INFO - _IORW(VFIO_TYPE, VFIO_BASE + 12, + * VFIO_DEVICE_GET_PCI_HOT_RESET_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 12, * struct vfio_pci_hot_reset_info) * * Return: 0 on success, -errno on failure: @@ -770,7 +770,7 @@ struct vfio_device_ioeventfd { #define VFIO_DEVICE_IOEVENTFD _IO(VFIO_TYPE, VFIO_BASE + 16) /** - * VFIO_DEVICE_FEATURE - _IORW(VFIO_TYPE, VFIO_BASE + 17, + * VFIO_DEVICE_FEATURE - _IOWR(VFIO_TYPE, VFIO_BASE + 17, * struct vfio_device_feature) * * Get, set, or probe feature data of the device. 
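The uapi comment fix here is s/_IORW/_IOWR/: there is no _IORW macro, and _IOWR() is the encoder for ioctls whose argument is both written and read. The spelling can be checked from userspace via <sys/ioctl.h>; the DEMO_* names and struct below are invented for illustration and are not the real VFIO definitions:

#include <stdio.h>
#include <sys/ioctl.h>

struct demo_feature {
	unsigned int argsz;
	unsigned int flags;
};

#define DEMO_TYPE ';'	/* stand-in for an ioctl type byte */
#define DEMO_BASE 100

#define DEMO_DEVICE_FEATURE _IOWR(DEMO_TYPE, DEMO_BASE + 17, struct demo_feature)

int main(void)
{
	printf("cmd = %#lx\n", (unsigned long)DEMO_DEVICE_FEATURE);
	return 0;
}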
The feature is selected diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c index c172bf92b576..4c4f5a776d80 100644 --- a/kernel/livepatch/patch.c +++ b/kernel/livepatch/patch.c @@ -118,7 +118,7 @@ static void notrace klp_ftrace_handler(unsigned long ip, if (func->nop) goto unlock; - klp_arch_set_pc(fregs, (unsigned long)func->new_func); + ftrace_instruction_pointer_set(fregs, (unsigned long)func->new_func); unlock: ftrace_test_recursion_unlock(bit); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 90398d10fd84..4c65fdf940c4 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -3912,7 +3912,7 @@ static void wake_up_klogd_work_func(struct irq_work *irq_work) } if (pending & PRINTK_PENDING_WAKEUP) - wake_up_interruptible_all(&log_wait); + wake_up_interruptible(&log_wait); } static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = diff --git a/lib/assoc_array.c b/lib/assoc_array.c index 079c72e26493..ca0b4f360c1a 100644 --- a/lib/assoc_array.c +++ b/lib/assoc_array.c @@ -1461,6 +1461,7 @@ int assoc_array_gc(struct assoc_array *array, struct assoc_array_ptr *cursor, *ptr; struct assoc_array_ptr *new_root, *new_parent, **new_ptr_pp; unsigned long nr_leaves_on_tree; + bool retained; int keylen, slot, nr_free, next_slot, i; pr_devel("-->%s()\n", __func__); @@ -1536,6 +1537,7 @@ continue_node: goto descend; } +retry_compress: pr_devel("-- compress node %p --\n", new_n); /* Count up the number of empty slots in this node and work out the @@ -1553,6 +1555,7 @@ continue_node: pr_devel("free=%d, leaves=%lu\n", nr_free, new_n->nr_leaves_on_branch); /* See what we can fold in */ + retained = false; next_slot = 0; for (slot = 0; slot < ASSOC_ARRAY_FAN_OUT; slot++) { struct assoc_array_shortcut *s; @@ -1602,9 +1605,14 @@ continue_node: pr_devel("[%d] retain node %lu/%d [nx %d]\n", slot, child->nr_leaves_on_branch, nr_free + 1, next_slot); + retained = true; } } + if (retained && new_n->nr_leaves_on_branch <= ASSOC_ARRAY_FAN_OUT) { + pr_devel("internal nodes remain despite enough space, retrying\n"); + goto retry_compress; + } pr_devel("after: %lu\n", new_n->nr_leaves_on_branch); nr_leaves_on_tree = new_n->nr_leaves_on_branch; diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index 7057f8db4f99..1daf95e17d67 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -906,7 +906,6 @@ int crush_do_rule(const struct crush_map *map, int recurse_to_leaf; int wsize = 0; int osize; - int *tmp; const struct crush_rule *rule; __u32 step; int i, j; @@ -1073,9 +1072,7 @@ int crush_do_rule(const struct crush_map *map, memcpy(o, c, osize*sizeof(*o)); /* swap o and w arrays */ - tmp = o; - o = w; - w = tmp; + swap(o, w); wsize = osize; break; diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 281ddb87ac8d..190a4de239c8 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -1121,6 +1121,7 @@ static bool rpcrdma_is_bcall(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep) #if defined(CONFIG_SUNRPC_BACKCHANNEL) { + struct rpc_xprt *xprt = &r_xprt->rx_xprt; struct xdr_stream *xdr = &rep->rr_stream; __be32 *p; @@ -1144,6 +1145,10 @@ rpcrdma_is_bcall(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep) if (*p != cpu_to_be32(RPC_CALL)) return false; + /* No bc service. */ + if (xprt->bc_serv == NULL) + return false; + /* Now that we are sure this is a backchannel call, * advance to the RPC header. 
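One small cleanup above deserves a note: crush_do_rule() drops its hand-rolled three-assignment pointer swap in favor of the kernel's swap() macro. A local stand-in shows the behavior (swap_demo mimics swap(); it is not the kernel macro, and it relies on the GCC/Clang __typeof__ extension):

#include <stdio.h>

#define swap_demo(a, b) \
	do { __typeof__(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)

int main(void)
{
	int bufa[1] = { 1 }, bufb[1] = { 2 };
	int *o = bufa, *w = bufb;

	swap_demo(o, w);	/* o and w exchange targets; no tmp at the call site */
	printf("*o=%d *w=%d\n", *o, *w);	/* *o=2 *w=1 */
	return 0;
}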
*/ diff --git a/tools/testing/memblock/TODO b/tools/testing/memblock/TODO index c25b2fdec45e..cd1a30d5acc9 100644 --- a/tools/testing/memblock/TODO +++ b/tools/testing/memblock/TODO @@ -23,6 +23,3 @@ TODO 5. Add tests for memblock_alloc_node() to check if the correct NUMA node is set for the new region - -6. Update comments in tests/basic_api.c to match the style used in - tests/alloc_*.c diff --git a/tools/testing/memblock/tests/basic_api.c b/tools/testing/memblock/tests/basic_api.c index fbc1ce160303..a7bc180316d6 100644 --- a/tools/testing/memblock/tests/basic_api.c +++ b/tools/testing/memblock/tests/basic_api.c @@ -26,8 +26,8 @@ static int memblock_initialization_check(void) /* * A simple test that adds a memory block of a specified base address * and size to the collection of available memory regions (memblock.memory). - * It checks if a new entry was created and if region counter and total memory - * were correctly updated. + * Expect to create a new entry. The region counter and total memory get + * updated. */ static int memblock_add_simple_check(void) { @@ -53,10 +53,10 @@ static int memblock_add_simple_check(void) } /* - * A simple test that adds a memory block of a specified base address, size + * A simple test that adds a memory block of a specified base address, size, * NUMA node and memory flags to the collection of available memory regions. - * It checks if the new entry, region counter and total memory size have - * expected values. + * Expect to create a new entry. The region counter and total memory get + * updated. */ static int memblock_add_node_simple_check(void) { @@ -87,9 +87,15 @@ static int memblock_add_node_simple_check(void) /* * A test that tries to add two memory blocks that don't overlap with one - * another. It checks if two correctly initialized entries were added to the - * collection of available memory regions (memblock.memory) and if this - * change was reflected in memblock.memory's total size and region counter. + * another: + * + * | +--------+ +--------+ | + * | | r1 | | r2 | | + * +--------+--------+--------+--------+--+ + * + * Expect to add two correctly initialized entries to the collection of + * available memory regions (memblock.memory). The total size and + * region counter fields get updated. */ static int memblock_add_disjoint_check(void) { @@ -124,11 +130,21 @@ static int memblock_add_disjoint_check(void) } /* - * A test that tries to add two memory blocks, where the second one overlaps - * with the beginning of the first entry (that is r1.base < r2.base + r2.size). - * After this, it checks if two entries are merged into one region that starts - * at r2.base and has size of two regions minus their intersection. It also - * verifies the reported total size of the available memory and region counter. + * A test that tries to add two memory blocks r1 and r2, where r2 overlaps + * with the beginning of r1 (that is r1.base < r2.base + r2.size): + * + * | +----+----+------------+ | + * | | |r2 | r1 | | + * +----+----+----+------------+----------+ + * ^ ^ + * | | + * | r1.base + * | + * r2.base + * + * Expect to merge the two entries into one region that starts at r2.base + * and has size of two regions minus their intersection. The total size of + * the available memory is updated, and the region counter stays the same. 
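The merge rule these rewritten comments keep repeating ("starts at r2.base, size of two regions minus their intersection") is easy to sanity-check numerically. A standalone check of the overlap-top case with made-up base/size values:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* r2 overlaps the beginning of r1: r1.base < r2.base + r2.size */
	uint64_t r1_base = 0x1000, r1_size = 0x2000;
	uint64_t r2_base = 0x0800, r2_size = 0x1000;

	uint64_t isect = (r2_base + r2_size) - r1_base;		/* 0x800 */
	uint64_t merged_base = r2_base;				/* 0x800 */
	uint64_t merged_size = r1_size + r2_size - isect;	/* 0x2800 */

	printf("merged: base=%#llx size=%#llx\n",
	       (unsigned long long)merged_base,
	       (unsigned long long)merged_size);
	return 0;
}

The merged region ends at r1.base + r1.size (0x3000 here), which matches the diagrams: the total size grows by the non-overlapping part while the region counter stays put.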
*/ static int memblock_add_overlap_top_check(void) { @@ -162,12 +178,21 @@ static int memblock_add_overlap_top_check(void) } /* - * A test that tries to add two memory blocks, where the second one overlaps - * with the end of the first entry (that is r2.base < r1.base + r1.size). - * After this, it checks if two entries are merged into one region that starts - * at r1.base and has size of two regions minus their intersection. It verifies - * that memblock can still see only one entry and has a correct total size of - * the available memory. + * A test that tries to add two memory blocks r1 and r2, where r2 overlaps + * with the end of r1 (that is r2.base < r1.base + r1.size): + * + * | +--+------+----------+ | + * | | | r1 | r2 | | + * +--+--+------+----------+--------------+ + * ^ ^ + * | | + * | r2.base + * | + * r1.base + * + * Expect to merge the two entries into one region that starts at r1.base + * and has size of two regions minus their intersection. The total size of + * the available memory is updated, and the region counter stays the same. */ static int memblock_add_overlap_bottom_check(void) { @@ -201,11 +226,19 @@ static int memblock_add_overlap_bottom_check(void) } /* - * A test that tries to add two memory blocks, where the second one is - * within the range of the first entry (that is r1.base < r2.base && - * r2.base + r2.size < r1.base + r1.size). It checks if two entries are merged - * into one region that stays the same. The counter and total size of available - * memory are expected to not be updated. + * A test that tries to add two memory blocks r1 and r2, where r2 is + * within the range of r1 (that is r1.base < r2.base && + * r2.base + r2.size < r1.base + r1.size): + * + * | +-------+--+-----------------------+ + * | | |r2| r1 | + * +---+-------+--+-----------------------+ + * ^ + * | + * r1.base + * + * Expect to merge two entries into one region that stays the same. + * The counter and total size of available memory are not updated. */ static int memblock_add_within_check(void) { @@ -236,8 +269,8 @@ static int memblock_add_within_check(void) } /* - * A simple test that tries to add the same memory block twice. The counter - * and total size of available memory are expected to not be updated. + * A simple test that tries to add the same memory block twice. Expect + * the counter and total size of available memory to not be updated. */ static int memblock_add_twice_check(void) { @@ -270,12 +303,12 @@ static int memblock_add_checks(void) return 0; } - /* - * A simple test that marks a memory block of a specified base address - * and size as reserved and to the collection of reserved memory regions - * (memblock.reserved). It checks if a new entry was created and if region - * counter and total memory size were correctly updated. - */ +/* + * A simple test that marks a memory block of a specified base address + * and size as reserved and to the collection of reserved memory regions + * (memblock.reserved). Expect to create a new entry. The region counter + * and total memory size are updated. + */ static int memblock_reserve_simple_check(void) { struct memblock_region *rgn; @@ -297,10 +330,15 @@ static int memblock_reserve_simple_check(void) } /* - * A test that tries to mark two memory blocks that don't overlap as reserved - * and checks if two entries were correctly added to the collection of reserved - * memory regions (memblock.reserved) and if this change was reflected in - * memblock.reserved's total size and region counter. 
+ * A test that tries to mark two memory blocks that don't overlap as reserved: + * + * | +--+ +----------------+ | + * | |r1| | r2 | | + * +--------+--+------+----------------+--+ + * + * Expect to add two entries to the collection of reserved memory regions + * (memblock.reserved). The total size and region counter for + * memblock.reserved are updated. */ static int memblock_reserve_disjoint_check(void) { @@ -335,13 +373,22 @@ static int memblock_reserve_disjoint_check(void) } /* - * A test that tries to mark two memory blocks as reserved, where the - * second one overlaps with the beginning of the first (that is - * r1.base < r2.base + r2.size). - * It checks if two entries are merged into one region that starts at r2.base - * and has size of two regions minus their intersection. The test also verifies - * that memblock can still see only one entry and has a correct total size of - * the reserved memory. + * A test that tries to mark two memory blocks r1 and r2 as reserved, + * where r2 overlaps with the beginning of r1 (that is + * r1.base < r2.base + r2.size): + * + * | +--------------+--+--------------+ | + * | | r2 | | r1 | | + * +--+--------------+--+--------------+--+ + * ^ ^ + * | | + * | r1.base + * | + * r2.base + * + * Expect to merge two entries into one region that starts at r2.base and + * has size of two regions minus their intersection. The total size of the + * reserved memory is updated, and the region counter is not updated. */ static int memblock_reserve_overlap_top_check(void) { @@ -375,13 +422,22 @@ static int memblock_reserve_overlap_top_check(void) } /* - * A test that tries to mark two memory blocks as reserved, where the - * second one overlaps with the end of the first entry (that is - * r2.base < r1.base + r1.size). - * It checks if two entries are merged into one region that starts at r1.base - * and has size of two regions minus their intersection. It verifies that - * memblock can still see only one entry and has a correct total size of the - * reserved memory. + * A test that tries to mark two memory blocks r1 and r2 as reserved, + * where r2 overlaps with the end of r1 (that is + * r2.base < r1.base + r1.size): + * + * | +--------------+--+--------------+ | + * | | r1 | | r2 | | + * +--+--------------+--+--------------+--+ + * ^ ^ + * | | + * | r2.base + * | + * r1.base + * + * Expect to merge two entries into one region that starts at r1.base and + * has size of two regions minus their intersection. The total size of the + * reserved memory is updated, and the region counter is not updated. */ static int memblock_reserve_overlap_bottom_check(void) { @@ -415,12 +471,21 @@ static int memblock_reserve_overlap_bottom_check(void) } /* - * A test that tries to mark two memory blocks as reserved, where the second - * one is within the range of the first entry (that is - * (r1.base < r2.base) && (r2.base + r2.size < r1.base + r1.size)). - * It checks if two entries are merged into one region that stays the - * same. The counter and total size of available memory are expected to not be - * updated. + * A test that tries to mark two memory blocks r1 and r2 as reserved, + * where r2 is within the range of r1 (that is + * (r1.base < r2.base) && (r2.base + r2.size < r1.base + r1.size)): + * + * | +-----+--+---------------------------| + * | | |r2| r1 | + * +-+-----+--+---------------------------+ + * ^ ^ + * | | + * | r2.base + * | + * r1.base + * + * Expect to merge two entries into one region that stays the same. 
The + * counter and total size of available memory are not updated. */ static int memblock_reserve_within_check(void) { @@ -452,7 +517,7 @@ static int memblock_reserve_within_check(void) /* * A simple test that tries to reserve the same memory block twice. - * The region counter and total size of reserved memory are expected to not + * Expect the region counter and total size of reserved memory to not * be updated. */ static int memblock_reserve_twice_check(void) @@ -485,14 +550,22 @@ static int memblock_reserve_checks(void) return 0; } - /* - * A simple test that tries to remove the first entry of the array of - * available memory regions. By "removing" a region we mean overwriting it - * with the next region in memblock.memory. To check this is the case, the - * test adds two memory blocks and verifies that the value of the latter - * was used to erase r1 region. It also checks if the region counter and - * total size were updated to expected values. - */ +/* + * A simple test that tries to remove a region r1 from the array of + * available memory regions. By "removing" a region we mean overwriting it + * with the next region r2 in memblock.memory: + * + * | ...... +----------------+ | + * | : r1 : | r2 | | + * +--+----+----------+----------------+--+ + * ^ + * | + * rgn.base + * + * Expect to add two memory blocks r1 and r2 and then remove r1 so that + * r2 is the first available region. The region counter and total size + * are updated. + */ static int memblock_remove_simple_check(void) { struct memblock_region *rgn; @@ -522,11 +595,22 @@ static int memblock_remove_simple_check(void) return 0; } - /* - * A test that tries to remove a region that was not registered as available - * memory (i.e. has no corresponding entry in memblock.memory). It verifies - * that array, regions counter and total size were not modified. - */ +/* + * A test that tries to remove a region r2 that was not registered as + * available memory (i.e. has no corresponding entry in memblock.memory): + * + * +----------------+ + * | r2 | + * +----------------+ + * | +----+ | + * | | r1 | | + * +--+----+------------------------------+ + * ^ + * | + * rgn.base + * + * Expect the array, regions counter and total size to not be modified. + */ static int memblock_remove_absent_check(void) { struct memblock_region *rgn; @@ -556,11 +640,23 @@ static int memblock_remove_absent_check(void) } /* - * A test that tries to remove a region which overlaps with the beginning of - * the already existing entry r1 (that is r1.base < r2.base + r2.size). It - * checks if only the intersection of both regions is removed from the available - * memory pool. The test also checks if the regions counter and total size are - * updated to expected values. + * A test that tries to remove a region r2 that overlaps with the + * beginning of the already existing entry r1 + * (that is r1.base < r2.base + r2.size): + * + * +-----------------+ + * | r2 | + * +-----------------+ + * | .........+--------+ | + * | : r1 | rgn | | + * +-----------------+--------+--------+--+ + * ^ ^ + * | | + * | rgn.base + * r1.base + * + * Expect that only the intersection of both regions is removed from the + * available memory pool. The regions counter and total size are updated. */ static int memblock_remove_overlap_top_check(void) { @@ -596,11 +692,21 @@ static int memblock_remove_overlap_top_check(void) } /* - * A test that tries to remove a region which overlaps with the end of the - * first entry (that is r2.base < r1.base + r1.size). 
It checks if only the - * intersection of both regions is removed from the available memory pool. - * The test also checks if the regions counter and total size are updated to - * expected values. + * A test that tries to remove a region r2 that overlaps with the end of + * the already existing region r1 (that is r2.base < r1.base + r1.size): + * + * +--------------------------------+ + * | r2 | + * +--------------------------------+ + * | +---+..... | + * | |rgn| r1 : | + * +-+---+----+---------------------------+ + * ^ + * | + * r1.base + * + * Expect that only the intersection of both regions is removed from the + * available memory pool. The regions counter and total size are updated. */ static int memblock_remove_overlap_bottom_check(void) { @@ -633,13 +739,23 @@ static int memblock_remove_overlap_bottom_check(void) } /* - * A test that tries to remove a region which is within the range of the - * already existing entry (that is - * (r1.base < r2.base) && (r2.base + r2.size < r1.base + r1.size)). - * It checks if the region is split into two - one that ends at r2.base and - * second that starts at r2.base + size, with appropriate sizes. The test - * also checks if the region counter and total size were updated to - * expected values. + * A test that tries to remove a region r2 that is within the range of + * the already existing entry r1 (that is + * (r1.base < r2.base) && (r2.base + r2.size < r1.base + r1.size)): + * + * +----+ + * | r2 | + * +----+ + * | +-------------+....+---------------+ | + * | | rgn1 | r1 | rgn2 | | + * +-+-------------+----+---------------+-+ + * ^ + * | + * r1.base + * + * Expect that the region is split into two - one that ends at r2.base and + * another that starts at r2.base + r2.size, with appropriate sizes. The + * region counter and total size are updated. */ static int memblock_remove_within_check(void) { @@ -690,12 +806,19 @@ static int memblock_remove_checks(void) } /* - * A simple test that tries to free a memory block that was marked earlier - * as reserved. By "freeing" a region we mean overwriting it with the next - * entry in memblock.reserved. To check this is the case, the test reserves - * two memory regions and verifies that the value of the latter was used to - * erase r1 region. - * The test also checks if the region counter and total size were updated. + * A simple test that tries to free a memory block r1 that was marked + * earlier as reserved. By "freeing" a region we mean overwriting it with + * the next entry r2 in memblock.reserved: + * + * | ...... +----+ | + * | : r1 : | r2 | | + * +--------------+----+-----------+----+-+ + * ^ + * | + * rgn.base + * + * Expect to reserve two memory regions and then erase r1 region with the + * value of r2. The region counter and total size are updated. */ static int memblock_free_simple_check(void) { @@ -726,11 +849,22 @@ static int memblock_free_simple_check(void) return 0; } - /* - * A test that tries to free a region that was not marked as reserved - * (i.e. has no corresponding entry in memblock.reserved). It verifies - * that array, regions counter and total size were not modified. - */ +/* + * A test that tries to free a region r2 that was not marked as reserved + * (i.e. has no corresponding entry in memblock.reserved): + * + * +----------------+ + * | r2 | + * +----------------+ + * | +----+ | + * | | r1 | | + * +--+----+------------------------------+ + * ^ + * | + * rgn.base + * + * The array, regions counter and total size are not modified. 
+ */ static int memblock_free_absent_check(void) { struct memblock_region *rgn; @@ -760,11 +894,23 @@ static int memblock_free_absent_check(void) } /* - * A test that tries to free a region which overlaps with the beginning of - * the already existing entry r1 (that is r1.base < r2.base + r2.size). It - * checks if only the intersection of both regions is freed. The test also - * checks if the regions counter and total size are updated to expected - * values. + * A test that tries to free a region r2 that overlaps with the beginning + * of the already existing entry r1 (that is r1.base < r2.base + r2.size): + * + * +----+ + * | r2 | + * +----+ + * | ...+--------------+ | + * | : | r1 | | + * +----+--+--------------+---------------+ + * ^ ^ + * | | + * | rgn.base + * | + * r1.base + * + * Expect that only the intersection of both regions is freed. The + * regions counter and total size are updated. */ static int memblock_free_overlap_top_check(void) { @@ -798,10 +944,18 @@ static int memblock_free_overlap_top_check(void) } /* - * A test that tries to free a region which overlaps with the end of the - * first entry (that is r2.base < r1.base + r1.size). It checks if only the - * intersection of both regions is freed. The test also checks if the - * regions counter and total size are updated to expected values. + * A test that tries to free a region r2 that overlaps with the end of + * the already existing entry r1 (that is r2.base < r1.base + r1.size): + * + * +----------------+ + * | r2 | + * +----------------+ + * | +-----------+..... | + * | | r1 | : | + * +----+-----------+----+----------------+ + * + * Expect that only the intersection of both regions is freed. The + * regions counter and total size are updated. */ static int memblock_free_overlap_bottom_check(void) { @@ -835,13 +989,23 @@ static int memblock_free_overlap_bottom_check(void) } /* - * A test that tries to free a region which is within the range of the - * already existing entry (that is - * (r1.base < r2.base) && (r2.base + r2.size < r1.base + r1.size)). - * It checks if the region is split into two - one that ends at r2.base and - * second that starts at r2.base + size, with appropriate sizes. It is - * expected that the region counter and total size fields were updated t - * reflect that change. + * A test that tries to free a region r2 that is within the range of the + * already existing entry r1 (that is + * (r1.base < r2.base) && (r2.base + r2.size < r1.base + r1.size)): + * + * +----+ + * | r2 | + * +----+ + * | +------------+....+---------------+ + * | | rgn1 | r1 | rgn2 | + * +----+------------+----+---------------+ + * ^ + * | + * r1.base + * + * Expect that the region is split into two - one that ends at r2.base and + * another that starts at r2.base + r2.size, with appropriate sizes. The + * region counter and total size fields are updated. 
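The within-range cases split one region into two pieces, [r1.base, r2.base) and [r2.base + r2.size, r1.base + r1.size). A standalone arithmetic check with made-up values:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t r1_base = 0x1000, r1_size = 0x4000;
	uint64_t r2_base = 0x2000, r2_size = 0x1000;	/* strictly inside r1 */

	uint64_t rgn1_base = r1_base;
	uint64_t rgn1_size = r2_base - r1_base;			/* 0x1000 */
	uint64_t rgn2_base = r2_base + r2_size;			/* 0x3000 */
	uint64_t rgn2_size = (r1_base + r1_size) - rgn2_base;	/* 0x2000 */

	printf("rgn1 %#llx+%#llx, rgn2 %#llx+%#llx\n",
	       (unsigned long long)rgn1_base, (unsigned long long)rgn1_size,
	       (unsigned long long)rgn2_base, (unsigned long long)rgn2_size);
	return 0;
}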
*/ static int memblock_free_within_check(void) { diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c index 8fcbc50221c2..ce1b01d02c51 100644 --- a/virt/kvm/vfio.c +++ b/virt/kvm/vfio.c @@ -23,7 +23,7 @@ struct kvm_vfio_group { struct list_head node; - struct vfio_group *vfio_group; + struct file *file; }; struct kvm_vfio { @@ -32,118 +32,61 @@ struct kvm_vfio { bool noncoherent; }; -static struct vfio_group *kvm_vfio_group_get_external_user(struct file *filep) +static void kvm_vfio_file_set_kvm(struct file *file, struct kvm *kvm) { - struct vfio_group *vfio_group; - struct vfio_group *(*fn)(struct file *); + void (*fn)(struct file *file, struct kvm *kvm); - fn = symbol_get(vfio_group_get_external_user); + fn = symbol_get(vfio_file_set_kvm); if (!fn) - return ERR_PTR(-EINVAL); + return; - vfio_group = fn(filep); + fn(file, kvm); - symbol_put(vfio_group_get_external_user); - - return vfio_group; + symbol_put(vfio_file_set_kvm); } -static bool kvm_vfio_external_group_match_file(struct vfio_group *group, - struct file *filep) +static bool kvm_vfio_file_enforced_coherent(struct file *file) { - bool ret, (*fn)(struct vfio_group *, struct file *); + bool (*fn)(struct file *file); + bool ret; - fn = symbol_get(vfio_external_group_match_file); + fn = symbol_get(vfio_file_enforced_coherent); if (!fn) return false; - ret = fn(group, filep); + ret = fn(file); - symbol_put(vfio_external_group_match_file); + symbol_put(vfio_file_enforced_coherent); return ret; } -static void kvm_vfio_group_put_external_user(struct vfio_group *vfio_group) +static struct iommu_group *kvm_vfio_file_iommu_group(struct file *file) { - void (*fn)(struct vfio_group *); + struct iommu_group *(*fn)(struct file *file); + struct iommu_group *ret; - fn = symbol_get(vfio_group_put_external_user); + fn = symbol_get(vfio_file_iommu_group); if (!fn) - return; + return NULL; - fn(vfio_group); + ret = fn(file); - symbol_put(vfio_group_put_external_user); -} + symbol_put(vfio_file_iommu_group); -static void kvm_vfio_group_set_kvm(struct vfio_group *group, struct kvm *kvm) -{ - void (*fn)(struct vfio_group *, struct kvm *); - - fn = symbol_get(vfio_group_set_kvm); - if (!fn) - return; - - fn(group, kvm); - - symbol_put(vfio_group_set_kvm); -} - -static bool kvm_vfio_group_is_coherent(struct vfio_group *vfio_group) -{ - long (*fn)(struct vfio_group *, unsigned long); - long ret; - - fn = symbol_get(vfio_external_check_extension); - if (!fn) - return false; - - ret = fn(vfio_group, VFIO_DMA_CC_IOMMU); - - symbol_put(vfio_external_check_extension); - - return ret > 0; + return ret; } #ifdef CONFIG_SPAPR_TCE_IOMMU -static int kvm_vfio_external_user_iommu_id(struct vfio_group *vfio_group) -{ - int (*fn)(struct vfio_group *); - int ret = -EINVAL; - - fn = symbol_get(vfio_external_user_iommu_id); - if (!fn) - return ret; - - ret = fn(vfio_group); - - symbol_put(vfio_external_user_iommu_id); - - return ret; -} - -static struct iommu_group *kvm_vfio_group_get_iommu_group( - struct vfio_group *group) -{ - int group_id = kvm_vfio_external_user_iommu_id(group); - - if (group_id < 0) - return NULL; - - return iommu_group_get_by_id(group_id); -} - static void kvm_spapr_tce_release_vfio_group(struct kvm *kvm, - struct vfio_group *vfio_group) + struct kvm_vfio_group *kvg) { - struct iommu_group *grp = kvm_vfio_group_get_iommu_group(vfio_group); + struct iommu_group *grp = kvm_vfio_file_iommu_group(kvg->file); if (WARN_ON_ONCE(!grp)) return; kvm_spapr_tce_release_iommu_group(kvm, grp); - iommu_group_put(grp); } #endif @@ -163,7 +106,7 @@ static void 
@@ -163,7 +106,7 @@ static void kvm_vfio_update_coherency(struct kvm_device *dev)
	mutex_lock(&kv->lock);
 
	list_for_each_entry(kvg, &kv->group_list, node) {
-		if (!kvm_vfio_group_is_coherent(kvg->vfio_group)) {
+		if (!kvm_vfio_file_enforced_coherent(kvg->file)) {
			noncoherent = true;
			break;
		}
@@ -181,149 +124,162 @@ static void kvm_vfio_update_coherency(struct kvm_device *dev)
	mutex_unlock(&kv->lock);
 }
 
-static int kvm_vfio_set_group(struct kvm_device *dev, long attr, u64 arg)
+static int kvm_vfio_group_add(struct kvm_device *dev, unsigned int fd)
 {
	struct kvm_vfio *kv = dev->private;
-	struct vfio_group *vfio_group;
	struct kvm_vfio_group *kvg;
-	int32_t __user *argp = (int32_t __user *)(unsigned long)arg;
-	struct fd f;
-	int32_t fd;
+	struct file *filp;
	int ret;
 
+	filp = fget(fd);
+	if (!filp)
+		return -EBADF;
+
+	/* Ensure the FD is a vfio group FD. */
+	if (!kvm_vfio_file_iommu_group(filp)) {
+		ret = -EINVAL;
+		goto err_fput;
+	}
+
+	mutex_lock(&kv->lock);
+
+	list_for_each_entry(kvg, &kv->group_list, node) {
+		if (kvg->file == filp) {
+			ret = -EEXIST;
+			goto err_unlock;
+		}
+	}
+
+	kvg = kzalloc(sizeof(*kvg), GFP_KERNEL_ACCOUNT);
+	if (!kvg) {
+		ret = -ENOMEM;
+		goto err_unlock;
+	}
+
+	kvg->file = filp;
+	list_add_tail(&kvg->node, &kv->group_list);
+
+	kvm_arch_start_assignment(dev->kvm);
+
+	mutex_unlock(&kv->lock);
+
+	kvm_vfio_file_set_kvm(kvg->file, dev->kvm);
+	kvm_vfio_update_coherency(dev);
+
+	return 0;
+err_unlock:
+	mutex_unlock(&kv->lock);
+err_fput:
+	fput(filp);
+	return ret;
+}
+
+static int kvm_vfio_group_del(struct kvm_device *dev, unsigned int fd)
+{
+	struct kvm_vfio *kv = dev->private;
+	struct kvm_vfio_group *kvg;
+	struct fd f;
+	int ret;
+
+	f = fdget(fd);
+	if (!f.file)
+		return -EBADF;
+
+	ret = -ENOENT;
+
+	mutex_lock(&kv->lock);
+
+	list_for_each_entry(kvg, &kv->group_list, node) {
+		if (kvg->file != f.file)
+			continue;
+
+		list_del(&kvg->node);
+		kvm_arch_end_assignment(dev->kvm);
+#ifdef CONFIG_SPAPR_TCE_IOMMU
+		kvm_spapr_tce_release_vfio_group(dev->kvm, kvg);
+#endif
+		kvm_vfio_file_set_kvm(kvg->file, NULL);
+		fput(kvg->file);
+		kfree(kvg);
+		ret = 0;
+		break;
+	}
+
+	mutex_unlock(&kv->lock);
+
+	fdput(f);
+
+	kvm_vfio_update_coherency(dev);
+
+	return ret;
+}
+
+#ifdef CONFIG_SPAPR_TCE_IOMMU
+static int kvm_vfio_group_set_spapr_tce(struct kvm_device *dev,
+					void __user *arg)
+{
+	struct kvm_vfio_spapr_tce param;
+	struct kvm_vfio *kv = dev->private;
+	struct kvm_vfio_group *kvg;
+	struct fd f;
+	int ret;
+
+	if (copy_from_user(&param, arg, sizeof(struct kvm_vfio_spapr_tce)))
+		return -EFAULT;
+
+	f = fdget(param.groupfd);
+	if (!f.file)
+		return -EBADF;
+
+	ret = -ENOENT;
+
+	mutex_lock(&kv->lock);
+
+	list_for_each_entry(kvg, &kv->group_list, node) {
+		struct iommu_group *grp;
+
+		if (kvg->file != f.file)
+			continue;
+
+		grp = kvm_vfio_file_iommu_group(kvg->file);
+		if (WARN_ON_ONCE(!grp)) {
+			ret = -EIO;
+			goto err_fdput;
+		}
+
+		ret = kvm_spapr_tce_attach_iommu_group(dev->kvm, param.tablefd,
+						       grp);
+		break;
+	}
+
+err_fdput:
+	mutex_unlock(&kv->lock);
+	fdput(f);
+	return ret;
+}
+#endif
+
+static int kvm_vfio_set_group(struct kvm_device *dev, long attr,
+			      void __user *arg)
+{
+	int32_t __user *argp = arg;
+	int32_t fd;
+
	switch (attr) {
	case KVM_DEV_VFIO_GROUP_ADD:
		if (get_user(fd, argp))
			return -EFAULT;
-
-		f = fdget(fd);
-		if (!f.file)
-			return -EBADF;
-
-		vfio_group = kvm_vfio_group_get_external_user(f.file);
-		fdput(f);
-
-		if (IS_ERR(vfio_group))
-			return PTR_ERR(vfio_group);
-
-		mutex_lock(&kv->lock);
-
-		list_for_each_entry(kvg, &kv->group_list, node) {
-			if (kvg->vfio_group == vfio_group) {
-				mutex_unlock(&kv->lock);
-				kvm_vfio_group_put_external_user(vfio_group);
-				return -EEXIST;
-			}
-		}
-
-		kvg = kzalloc(sizeof(*kvg), GFP_KERNEL_ACCOUNT);
-		if (!kvg) {
-			mutex_unlock(&kv->lock);
-			kvm_vfio_group_put_external_user(vfio_group);
-			return -ENOMEM;
-		}
-
-		list_add_tail(&kvg->node, &kv->group_list);
-		kvg->vfio_group = vfio_group;
-
-		kvm_arch_start_assignment(dev->kvm);
-
-		mutex_unlock(&kv->lock);
-
-		kvm_vfio_group_set_kvm(vfio_group, dev->kvm);
-
-		kvm_vfio_update_coherency(dev);
-
-		return 0;
+		return kvm_vfio_group_add(dev, fd);
 
	case KVM_DEV_VFIO_GROUP_DEL:
		if (get_user(fd, argp))
			return -EFAULT;
+		return kvm_vfio_group_del(dev, fd);
 
-		f = fdget(fd);
-		if (!f.file)
-			return -EBADF;
-
-		ret = -ENOENT;
-
-		mutex_lock(&kv->lock);
-
-		list_for_each_entry(kvg, &kv->group_list, node) {
-			if (!kvm_vfio_external_group_match_file(kvg->vfio_group,
-								f.file))
-				continue;
-
-			list_del(&kvg->node);
-			kvm_arch_end_assignment(dev->kvm);
 #ifdef CONFIG_SPAPR_TCE_IOMMU
-			kvm_spapr_tce_release_vfio_group(dev->kvm,
-							 kvg->vfio_group);
+	case KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE:
+		return kvm_vfio_group_set_spapr_tce(dev, arg);
 #endif
-			kvm_vfio_group_set_kvm(kvg->vfio_group, NULL);
-			kvm_vfio_group_put_external_user(kvg->vfio_group);
-			kfree(kvg);
-			ret = 0;
-			break;
-		}
-
-		mutex_unlock(&kv->lock);
-
-		fdput(f);
-
-		kvm_vfio_update_coherency(dev);
-
-		return ret;
-
-#ifdef CONFIG_SPAPR_TCE_IOMMU
-	case KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE: {
-		struct kvm_vfio_spapr_tce param;
-		struct kvm_vfio *kv = dev->private;
-		struct vfio_group *vfio_group;
-		struct kvm_vfio_group *kvg;
-		struct fd f;
-		struct iommu_group *grp;
-
-		if (copy_from_user(&param, (void __user *)arg,
-				   sizeof(struct kvm_vfio_spapr_tce)))
-			return -EFAULT;
-
-		f = fdget(param.groupfd);
-		if (!f.file)
-			return -EBADF;
-
-		vfio_group = kvm_vfio_group_get_external_user(f.file);
-		fdput(f);
-
-		if (IS_ERR(vfio_group))
-			return PTR_ERR(vfio_group);
-
-		grp = kvm_vfio_group_get_iommu_group(vfio_group);
-		if (WARN_ON_ONCE(!grp)) {
-			kvm_vfio_group_put_external_user(vfio_group);
-			return -EIO;
-		}
-
-		ret = -ENOENT;
-
-		mutex_lock(&kv->lock);
-
-		list_for_each_entry(kvg, &kv->group_list, node) {
-			if (kvg->vfio_group != vfio_group)
-				continue;
-
-			ret = kvm_spapr_tce_attach_iommu_group(dev->kvm,
-							       param.tablefd, grp);
-			break;
-		}
-
-		mutex_unlock(&kv->lock);
-
-		iommu_group_put(grp);
-		kvm_vfio_group_put_external_user(vfio_group);
-
-		return ret;
-	}
-#endif /* CONFIG_SPAPR_TCE_IOMMU */
	}
 
	return -ENXIO;
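The remaining hunks switch the attr handler over to passing a real __user
pointer instead of carrying the userspace address around as a u64. For
reference, u64_to_user_ptr() is essentially the following cast (simplified
from the kernel's definition, which additionally typechecks its argument):

/* Simplified sketch of the kernel's u64_to_user_ptr() macro. */
#define u64_to_user_ptr(x)	((void __user *)(uintptr_t)(x))

Doing the conversion once in kvm_vfio_set_attr() means the helpers below
it take void __user * directly and no longer need their own casts.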
@@ -334,7 +290,8 @@ static int kvm_vfio_set_attr(struct kvm_device *dev,
 {
	switch (attr->group) {
	case KVM_DEV_VFIO_GROUP:
-		return kvm_vfio_set_group(dev, attr->attr, attr->addr);
+		return kvm_vfio_set_group(dev, attr->attr,
+					  u64_to_user_ptr(attr->addr));
	}
 
	return -ENXIO;
@@ -367,10 +324,10 @@ static void kvm_vfio_destroy(struct kvm_device *dev)
 
	list_for_each_entry_safe(kvg, tmp, &kv->group_list, node) {
 #ifdef CONFIG_SPAPR_TCE_IOMMU
-		kvm_spapr_tce_release_vfio_group(dev->kvm, kvg->vfio_group);
+		kvm_spapr_tce_release_vfio_group(dev->kvm, kvg);
 #endif
-		kvm_vfio_group_set_kvm(kvg->vfio_group, NULL);
-		kvm_vfio_group_put_external_user(kvg->vfio_group);
+		kvm_vfio_file_set_kvm(kvg->file, NULL);
+		fput(kvg->file);
		list_del(&kvg->node);
		kfree(kvg);
		kvm_arch_end_assignment(dev->kvm);
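A lifetime detail worth keeping in mind when reading the refactor:
KVM_DEV_VFIO_GROUP_ADD takes a long-lived reference with fget() that is
only dropped by fput() on KVM_DEV_VFIO_GROUP_DEL or at device destruction,
while the lookup paths use the transient fdget()/fdput() pair. A sketch of
the two idioms (take_long_lived() and use_transient() are illustrative
names only):

#include <linux/errno.h>
#include <linux/file.h>
#include <linux/fs.h>

/* Long-lived: the struct file stays pinned until a matching fput(). */
static struct file *take_long_lived(unsigned int fd)
{
	return fget(fd);	/* NULL if fd is not a valid descriptor */
}

/* Transient: the reference is only valid until fdput() below. */
static int use_transient(unsigned int fd)
{
	struct fd f = fdget(fd);

	if (!f.file)
		return -EBADF;

	/* ... compare or inspect f.file here, e.g. kvg->file == f.file ... */

	fdput(f);
	return 0;
}

This is why kvm_vfio_group_del() and kvm_vfio_group_set_spapr_tce() can use
fdget() for their lookups: the group list itself already holds the fget()
reference taken in kvm_vfio_group_add().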