diff --git a/Documentation/core-api/maple_tree.rst b/Documentation/core-api/maple_tree.rst index 45defcf15da7..285e2d2b21ae 100644 --- a/Documentation/core-api/maple_tree.rst +++ b/Documentation/core-api/maple_tree.rst @@ -81,6 +81,9 @@ section. Sometimes it is necessary to ensure the next call to store to a maple tree does not allocate memory, please see :ref:`maple-tree-advanced-api` for this use case. +You can use mtree_dup() to duplicate an entire maple tree. It is a more +efficient way than inserting all elements one by one into a new tree. + Finally, you can remove all entries from a maple tree by calling mtree_destroy(). If the maple tree entries are pointers, you may wish to free the entries first. @@ -112,6 +115,7 @@ Takes ma_lock internally: * mtree_insert() * mtree_insert_range() * mtree_erase() + * mtree_dup() * mtree_destroy() * mt_set_in_rcu() * mt_clear_in_rcu() diff --git a/android/abi_gki_aarch64.stg b/android/abi_gki_aarch64.stg index e570ab134873..573e477d70d3 100644 --- a/android/abi_gki_aarch64.stg +++ b/android/abi_gki_aarch64.stg @@ -9018,6 +9018,11 @@ pointer_reference { kind: POINTER pointee_type_id: 0x72d62916 } +pointer_reference { + id: 0x1625e208 + kind: POINTER + pointee_type_id: 0x72d76ebd +} pointer_reference { id: 0x162c7a70 kind: POINTER @@ -18183,6 +18188,11 @@ pointer_reference { kind: POINTER pointee_type_id: 0x9d41cc1a } +pointer_reference { + id: 0x2dc069c5 + kind: POINTER + pointee_type_id: 0x9d414188 +} pointer_reference { id: 0x2dc1540f kind: POINTER @@ -30713,6 +30723,11 @@ typedef { name: "p4d_t" referred_type_id: 0x148546d4 } +typedef { + id: 0xbad82a2c + name: "parse_endpoint_func" + referred_type_id: 0x2dc069c5 +} typedef { id: 0x8ef19fe7 name: "pci_bus_flags_t" @@ -52189,6 +52204,11 @@ member { name: "base" type_id: 0x180f82e8 } +member { + id: 0x85d2e2e4 + name: "base" + type_id: 0x080c6fc2 +} member { id: 0x85d6188a name: "base" @@ -56419,6 +56439,12 @@ member { name: "bus" type_id: 0x2309ad3e } +member { + id: 0xdaf846cc + name: "bus" + type_id: 0x286a95aa + offset: 160 +} member { id: 0x1639ef00 name: "bus_cleanup" @@ -56648,6 +56674,12 @@ member { type_id: 0x945e7ef6 offset: 448 } +member { + id: 0x2c928e64 + name: "bus_type" + type_id: 0x3c57148f + offset: 128 +} member { id: 0xb43c45b4 name: "bus_width" @@ -116596,6 +116628,12 @@ member { name: "link_fd" type_id: 0xe62ebf07 } +member { + id: 0x6075ccdc + name: "link_frequencies" + type_id: 0x2e18f543 + offset: 512 +} member { id: 0x178cf8a4 name: "link_gen" @@ -126808,6 +126846,18 @@ member { name: "mipi_csi1" type_id: 0xe49bfc8b } +member { + id: 0xa7e5d7c1 + name: "mipi_csi1" + type_id: 0xe49bfc8b + offset: 64 +} +member { + id: 0xeda56411 + name: "mipi_csi2" + type_id: 0xe72f0de6 + offset: 128 +} member { id: 0xeda56dd3 name: "mipi_csi2" @@ -136120,6 +136170,12 @@ member { type_id: 0xe62ebf07 offset: 672 } +member { + id: 0x4519d21b + name: "nr_of_link_frequencies" + type_id: 0x4585663f + offset: 576 +} member { id: 0x9c6b34f7 name: "nr_off" @@ -211032,6 +211088,16 @@ struct_union { member_id: 0x9683f73d } } +struct_union { + id: 0x286a95aa + kind: STRUCT + definition { + bytesize: 40 + member_id: 0xc0bc4db7 + member_id: 0xa7e5d7c1 + member_id: 0xeda56411 + } +} struct_union { id: 0x2880e524 kind: STRUCT @@ -265808,6 +265874,19 @@ struct_union { member_id: 0x465224ed } } +struct_union { + id: 0x72d76ebd + kind: STRUCT + name: "v4l2_fwnode_endpoint" + definition { + bytesize: 80 + member_id: 0x85d2e2e4 + member_id: 0x2c928e64 + member_id: 0xdaf846cc + member_id: 0x6075ccdc + 
member_id: 0x4519d21b + } +} struct_union { id: 0xccd4dc1a kind: STRUCT @@ -287051,6 +287130,13 @@ enumeration { } } } +function { + id: 0x003279c7 + return_type_id: 0x3c2dd1ca + parameter_id: 0x3cfe7778 + parameter_id: 0x0490bb4a + parameter_id: 0x4585663f +} function { id: 0x004cf563 return_type_id: 0x48b5725f @@ -291664,6 +291750,11 @@ function { parameter_id: 0x14528516 parameter_id: 0x2712b6f9 } +function { + id: 0x15112911 + return_type_id: 0x48b5725f + parameter_id: 0x1625e208 +} function { id: 0x151457b1 return_type_id: 0xd5cc9c9a @@ -298544,6 +298635,11 @@ function { parameter_id: 0x3e10b518 parameter_id: 0x0bb0c019 } +function { + id: 0x1f821b4c + return_type_id: 0x48b5725f + parameter_id: 0x3c692b7e +} function { id: 0x1f835b6f return_type_id: 0x48b5725f @@ -298655,6 +298751,11 @@ function { parameter_id: 0x3c2755a3 parameter_id: 0x0cbf60eb } +function { + id: 0x1fa7cc4d + return_type_id: 0x48b5725f + parameter_id: 0x3cfe7778 +} function { id: 0x1fa8b2bc return_type_id: 0x48b5725f @@ -315510,6 +315611,14 @@ function { parameter_id: 0x6720d32f parameter_id: 0x3c2755a3 } +function { + id: 0x9a2ab624 + return_type_id: 0x6720d32f + parameter_id: 0x18bd6530 + parameter_id: 0x6720d32f + parameter_id: 0x6720d32f + parameter_id: 0x11cfee5a +} function { id: 0x9a2abc7b return_type_id: 0x6720d32f @@ -321566,6 +321675,12 @@ function { parameter_id: 0x04b193cc parameter_id: 0x0335a07f } +function { + id: 0x9ca0dc77 + return_type_id: 0x6720d32f + parameter_id: 0x074f1a14 + parameter_id: 0x3cfe7778 +} function { id: 0x9ca1921c return_type_id: 0x6720d32f @@ -322030,6 +322145,12 @@ function { parameter_id: 0x054f691a parameter_id: 0x0aa1f0ee } +function { + id: 0x9cfc5a75 + return_type_id: 0x6720d32f + parameter_id: 0x0490bb4a + parameter_id: 0x1625e208 +} function { id: 0x9cfd713b return_type_id: 0x6720d32f @@ -322052,6 +322173,12 @@ function { parameter_id: 0x02ed0755 parameter_id: 0x0e68dab6 } +function { + id: 0x9d027320 + return_type_id: 0x6720d32f + parameter_id: 0x01c5a749 + parameter_id: 0x3cfe7778 +} function { id: 0x9d038726 return_type_id: 0x6720d32f @@ -322600,6 +322727,13 @@ function { parameter_id: 0x0258f96e parameter_id: 0x15f20052 } +function { + id: 0x9d414188 + return_type_id: 0x6720d32f + parameter_id: 0x0258f96e + parameter_id: 0x1625e208 + parameter_id: 0x3c2dd1ca +} function { id: 0x9d419277 return_type_id: 0x6720d32f @@ -323720,6 +323854,14 @@ function { parameter_id: 0x33756485 parameter_id: 0x064d6086 } +function { + id: 0x9ddac293 + return_type_id: 0x6720d32f + parameter_id: 0x0258f96e + parameter_id: 0x3cfe7778 + parameter_id: 0xf435685e + parameter_id: 0xbad82a2c +} function { id: 0x9ddaf106 return_type_id: 0x6720d32f @@ -337636,6 +337778,15 @@ elf_symbol { type_id: 0x9b08a261 full_name: "__traceiter_android_vh_rproc_recovery_set" } +elf_symbol { + id: 0xd56fbf76 + name: "__traceiter_android_vh_rt_mutex_steal" + is_defined: true + symbol_type: FUNCTION + crc: 0xf0a6d2df + type_id: 0x9a2ab624 + full_name: "__traceiter_android_vh_rt_mutex_steal" +} elf_symbol { id: 0x3ef508a2 name: "__traceiter_android_vh_rtmutex_wait_finish" @@ -341632,6 +341783,15 @@ elf_symbol { type_id: 0x18ccbd2c full_name: "__tracepoint_android_vh_rproc_recovery_set" } +elf_symbol { + id: 0xed43b088 + name: "__tracepoint_android_vh_rt_mutex_steal" + is_defined: true + symbol_type: OBJECT + crc: 0xdc6b8d43 + type_id: 0x18ccbd2c + full_name: "__tracepoint_android_vh_rt_mutex_steal" +} elf_symbol { id: 0xa3915d70 name: "__tracepoint_android_vh_rtmutex_wait_finish" @@ -343000,6 +343160,24 @@ 
elf_symbol { type_id: 0x20cd94dc full_name: "__usecs_to_jiffies" } +elf_symbol { + id: 0xf51d746f + name: "__v4l2_async_nf_add_fwnode" + is_defined: true + symbol_type: FUNCTION + crc: 0x03599cac + type_id: 0x003279c7 + full_name: "__v4l2_async_nf_add_fwnode" +} +elf_symbol { + id: 0xe13e16ca + name: "__v4l2_async_nf_add_fwnode_remote" + is_defined: true + symbol_type: FUNCTION + crc: 0x82966749 + type_id: 0x003279c7 + full_name: "__v4l2_async_nf_add_fwnode_remote" +} elf_symbol { id: 0x4c0a941a name: "__v4l2_ctrl_handler_setup" @@ -359191,6 +359369,15 @@ elf_symbol { type_id: 0x10985193 full_name: "dump_stack" } +elf_symbol { + id: 0x652fbf96 + name: "dump_tasks" + is_defined: true + symbol_type: FUNCTION + crc: 0x6fe3e49b + type_id: 0x1f821b4c + full_name: "dump_tasks" +} elf_symbol { id: 0xda364c85 name: "dw_handle_msi_irq" @@ -365125,6 +365312,15 @@ elf_symbol { type_id: 0x16dc304e full_name: "iio_trigger_unregister" } +elf_symbol { + id: 0xfb09b362 + name: "iio_trigger_using_own" + is_defined: true + symbol_type: FUNCTION + crc: 0xe2c1359e + type_id: 0xf886bca4 + full_name: "iio_trigger_using_own" +} elf_symbol { id: 0xdf3e8655 name: "iio_update_buffers" @@ -388565,6 +388761,15 @@ elf_symbol { type_id: 0x4585663f full_name: "sysctl_sched_features" } +elf_symbol { + id: 0xe6ea21b1 + name: "sysctl_sched_idle_min_granularity" + is_defined: true + symbol_type: OBJECT + crc: 0x69545cfa + type_id: 0x4585663f + full_name: "sysctl_sched_idle_min_granularity" +} elf_symbol { id: 0x87812861 name: "sysctl_sched_latency" @@ -388574,6 +388779,15 @@ elf_symbol { type_id: 0x4585663f full_name: "sysctl_sched_latency" } +elf_symbol { + id: 0x34555a8a + name: "sysctl_sched_min_granularity" + is_defined: true + symbol_type: OBJECT + crc: 0x04390257 + type_id: 0x4585663f + full_name: "sysctl_sched_min_granularity" +} elf_symbol { id: 0x18d0dd21 name: "sysctl_vals" @@ -394559,6 +394773,87 @@ elf_symbol { type_id: 0x927d452a full_name: "uuid_parse" } +elf_symbol { + id: 0x4e2f55da + name: "v4l2_async_nf_cleanup" + is_defined: true + symbol_type: FUNCTION + crc: 0xdad12cba + type_id: 0x1fa7cc4d + full_name: "v4l2_async_nf_cleanup" +} +elf_symbol { + id: 0x04aadf7f + name: "v4l2_async_nf_init" + is_defined: true + symbol_type: FUNCTION + crc: 0xc88abf32 + type_id: 0x1fa7cc4d + full_name: "v4l2_async_nf_init" +} +elf_symbol { + id: 0x7920fabe + name: "v4l2_async_nf_parse_fwnode_endpoints" + is_defined: true + symbol_type: FUNCTION + crc: 0xde590e4b + type_id: 0x9ddac293 + full_name: "v4l2_async_nf_parse_fwnode_endpoints" +} +elf_symbol { + id: 0x48e55006 + name: "v4l2_async_nf_register" + is_defined: true + symbol_type: FUNCTION + crc: 0x8be566ca + type_id: 0x9ca0dc77 + full_name: "v4l2_async_nf_register" +} +elf_symbol { + id: 0x65ffd1d0 + name: "v4l2_async_nf_unregister" + is_defined: true + symbol_type: FUNCTION + crc: 0xc74894f9 + type_id: 0x1fa7cc4d + full_name: "v4l2_async_nf_unregister" +} +elf_symbol { + id: 0x507a9ef5 + name: "v4l2_async_register_subdev" + is_defined: true + symbol_type: FUNCTION + crc: 0x64ab86bc + type_id: 0x9df18afd + full_name: "v4l2_async_register_subdev" +} +elf_symbol { + id: 0x050dd932 + name: "v4l2_async_register_subdev_sensor" + is_defined: true + symbol_type: FUNCTION + crc: 0x61c8f608 + type_id: 0x9df18afd + full_name: "v4l2_async_register_subdev_sensor" +} +elf_symbol { + id: 0x0664687c + name: "v4l2_async_subdev_nf_register" + is_defined: true + symbol_type: FUNCTION + crc: 0x4d890f4b + type_id: 0x9d027320 + full_name: "v4l2_async_subdev_nf_register" +} +elf_symbol { + 
id: 0xf440f7f1 + name: "v4l2_async_unregister_subdev" + is_defined: true + symbol_type: FUNCTION + crc: 0x2592ea78 + type_id: 0x10e93841 + full_name: "v4l2_async_unregister_subdev" +} elf_symbol { id: 0xf39bae65 name: "v4l2_compat_ioctl32" @@ -394964,6 +395259,33 @@ elf_symbol { type_id: 0x209ae488 full_name: "v4l2_format_info" } +elf_symbol { + id: 0x7ba36329 + name: "v4l2_fwnode_endpoint_alloc_parse" + is_defined: true + symbol_type: FUNCTION + crc: 0x05930b06 + type_id: 0x9cfc5a75 + full_name: "v4l2_fwnode_endpoint_alloc_parse" +} +elf_symbol { + id: 0x2643c2c9 + name: "v4l2_fwnode_endpoint_free" + is_defined: true + symbol_type: FUNCTION + crc: 0xf01d6f06 + type_id: 0x15112911 + full_name: "v4l2_fwnode_endpoint_free" +} +elf_symbol { + id: 0xcb8b4f14 + name: "v4l2_fwnode_endpoint_parse" + is_defined: true + symbol_type: FUNCTION + crc: 0x9dcd6cfe + type_id: 0x9cfc5a75 + full_name: "v4l2_fwnode_endpoint_parse" +} elf_symbol { id: 0x58330374 name: "v4l2_g_parm_cap" @@ -399133,6 +399455,7 @@ interface { symbol_id: 0x8d62858f symbol_id: 0xcef5d79f symbol_id: 0x91384eff + symbol_id: 0xd56fbf76 symbol_id: 0x3ef508a2 symbol_id: 0xfb1b8d64 symbol_id: 0xc56d7179 @@ -399577,6 +399900,7 @@ interface { symbol_id: 0x04365139 symbol_id: 0xd94bc301 symbol_id: 0x3fc5ffc9 + symbol_id: 0xed43b088 symbol_id: 0xa3915d70 symbol_id: 0xf01f02ea symbol_id: 0xeaebbadf @@ -399729,6 +400053,8 @@ interface { symbol_id: 0x7c261545 symbol_id: 0xf497de36 symbol_id: 0xf44f6a18 + symbol_id: 0xf51d746f + symbol_id: 0xe13e16ca symbol_id: 0x4c0a941a symbol_id: 0xfc85c168 symbol_id: 0xb6af2644 @@ -401527,6 +401853,7 @@ interface { symbol_id: 0xe09fd784 symbol_id: 0xded28924 symbol_id: 0xe3421d56 + symbol_id: 0x652fbf96 symbol_id: 0xda364c85 symbol_id: 0x68e0756b symbol_id: 0x12cb063e @@ -402186,6 +402513,7 @@ interface { symbol_id: 0x7551a60b symbol_id: 0x08fd4b84 symbol_id: 0xc6d8f246 + symbol_id: 0xfb09b362 symbol_id: 0xdf3e8655 symbol_id: 0x6f2f4bd1 symbol_id: 0xf87ecda4 @@ -404791,7 +405119,9 @@ interface { symbol_id: 0x2f857527 symbol_id: 0x3e5f4f82 symbol_id: 0xbf1515af + symbol_id: 0xe6ea21b1 symbol_id: 0x87812861 + symbol_id: 0x34555a8a symbol_id: 0x18d0dd21 symbol_id: 0x92705587 symbol_id: 0xdbe66171 @@ -405457,6 +405787,15 @@ interface { symbol_id: 0xb0c1eaf9 symbol_id: 0xe7b3f166 symbol_id: 0xb21b47da + symbol_id: 0x4e2f55da + symbol_id: 0x04aadf7f + symbol_id: 0x7920fabe + symbol_id: 0x48e55006 + symbol_id: 0x65ffd1d0 + symbol_id: 0x507a9ef5 + symbol_id: 0x050dd932 + symbol_id: 0x0664687c + symbol_id: 0xf440f7f1 symbol_id: 0xf39bae65 symbol_id: 0xfd78bf45 symbol_id: 0x218d39b6 @@ -405502,6 +405841,9 @@ interface { symbol_id: 0xe66642fe symbol_id: 0x538ad5cc symbol_id: 0x2244c8f0 + symbol_id: 0x7ba36329 + symbol_id: 0x2643c2c9 + symbol_id: 0xcb8b4f14 symbol_id: 0x58330374 symbol_id: 0xdb18c924 symbol_id: 0x5e36dba6 diff --git a/android/abi_gki_aarch64_imx b/android/abi_gki_aarch64_imx index c3797c477a4c..1b556c8705f6 100644 --- a/android/abi_gki_aarch64_imx +++ b/android/abi_gki_aarch64_imx @@ -1025,6 +1025,7 @@ iio_trigger_poll_chained iio_trigger_register iio_trigger_unregister + iio_trigger_using_own import_iovec in4_pton inet_csk_get_port diff --git a/android/abi_gki_aarch64_oplus b/android/abi_gki_aarch64_oplus index a374dcf65636..8c9846b3a27d 100644 --- a/android/abi_gki_aarch64_oplus +++ b/android/abi_gki_aarch64_oplus @@ -158,6 +158,7 @@ __traceiter_android_vh_dm_bufio_shrink_scan_bypass __traceiter_android_vh_mutex_unlock_slowpath __traceiter_android_vh_rtmutex_waiter_prio + 
__traceiter_android_vh_rt_mutex_steal __traceiter_android_vh_rwsem_can_spin_on_owner __traceiter_android_vh_rwsem_opt_spin_finish __traceiter_android_vh_rwsem_opt_spin_start @@ -258,6 +259,7 @@ __tracepoint_android_vh_record_rtmutex_lock_starttime __tracepoint_android_vh_record_rwsem_lock_starttime __tracepoint_android_vh_rtmutex_waiter_prio + __tracepoint_android_vh_rt_mutex_steal __tracepoint_android_vh_rwsem_can_spin_on_owner __tracepoint_android_vh_rwsem_opt_spin_finish __tracepoint_android_vh_rwsem_opt_spin_start diff --git a/android/abi_gki_aarch64_pixel b/android/abi_gki_aarch64_pixel index b7d8f1baa215..d0f6d7be74ff 100644 --- a/android/abi_gki_aarch64_pixel +++ b/android/abi_gki_aarch64_pixel @@ -340,6 +340,7 @@ desc_to_gpio destroy_workqueue dev_addr_mod + _dev_alert dev_alloc_name __dev_change_net_namespace dev_close @@ -542,6 +543,7 @@ dmaengine_unmap_put dma_fence_add_callback dma_fence_array_create + dma_fence_array_ops dma_fence_context_alloc dma_fence_default_wait dma_fence_enable_sw_signaling @@ -738,6 +740,7 @@ drm_kms_helper_poll_fini drm_kms_helper_poll_init drm_match_cea_mode + __drmm_crtc_alloc_with_planes drmm_kmalloc drmm_mode_config_init drm_mode_config_reset @@ -810,6 +813,7 @@ drm_writeback_signal_completion dump_backtrace dump_stack + dump_tasks dw_handle_msi_irq dw_pcie_find_capability dw_pcie_host_init @@ -2089,7 +2093,9 @@ synchronize_rcu syscon_regmap_lookup_by_phandle sysctl_sched_features + sysctl_sched_idle_min_granularity sysctl_sched_latency + sysctl_sched_min_granularity sysfs_add_file_to_group sysfs_add_link_to_group sysfs_create_file_ns diff --git a/android/abi_gki_aarch64_rockchip b/android/abi_gki_aarch64_rockchip index 0010cf2300b6..a051b3843047 100644 --- a/android/abi_gki_aarch64_rockchip +++ b/android/abi_gki_aarch64_rockchip @@ -1268,6 +1268,15 @@ usb_submit_urb __usecs_to_jiffies usleep_range_state + __v4l2_async_nf_add_fwnode_remote + v4l2_async_nf_cleanup + v4l2_async_nf_init + v4l2_async_nf_parse_fwnode_endpoints + v4l2_async_nf_register + v4l2_async_register_subdev + v4l2_async_register_subdev_sensor + v4l2_async_subdev_nf_register + v4l2_async_unregister_subdev v4l2_ctrl_find v4l2_ctrl_g_ctrl v4l2_ctrl_g_ctrl_int64 @@ -1295,6 +1304,9 @@ v4l2_event_subscribe v4l2_event_unsubscribe v4l2_fh_open + v4l2_fwnode_endpoint_alloc_parse + v4l2_fwnode_endpoint_free + v4l2_fwnode_endpoint_parse v4l2_i2c_subdev_init v4l2_match_dv_timings v4l2_pipeline_link_notify @@ -2871,9 +2883,11 @@ # required by video_rkcif.ko media_entity_setup_link + __v4l2_async_nf_add_fwnode # required by video_rkisp.ko param_ops_ullong + v4l2_async_nf_unregister v4l2_ctrl_poll # required by videobuf2-cma-sg.ko diff --git a/arch/arm64/configs/gki_defconfig b/arch/arm64/configs/gki_defconfig index ff7758dc92b0..38139e71219a 100644 --- a/arch/arm64/configs/gki_defconfig +++ b/arch/arm64/configs/gki_defconfig @@ -431,6 +431,7 @@ CONFIG_THERMAL_WRITABLE_TRIPS=y CONFIG_THERMAL_GOV_USER_SPACE=y CONFIG_THERMAL_GOV_POWER_ALLOCATOR=y CONFIG_CPU_THERMAL=y +CONFIG_CPU_IDLE_THERMAL=y CONFIG_DEVFREQ_THERMAL=y CONFIG_THERMAL_EMULATION=y CONFIG_WATCHDOG=y @@ -580,6 +581,7 @@ CONFIG_IIO_TRIGGER=y CONFIG_PWM=y CONFIG_GENERIC_PHY=y CONFIG_POWERCAP=y +CONFIG_IDLE_INJECT=y CONFIG_ANDROID_BINDER_IPC=y CONFIG_ANDROID_BINDERFS=y CONFIG_ANDROID_DEBUG_SYMBOLS=y diff --git a/arch/arm64/include/asm/kvm_hypevents.h b/arch/arm64/include/asm/kvm_hypevents.h index 8a2dd41b8569..c507d8978444 100644 --- a/arch/arm64/include/asm/kvm_hypevents.h +++ b/arch/arm64/include/asm/kvm_hypevents.h @@ -53,7 +53,7 @@ 
HYP_EVENT(host_smc, __entry->id = id; __entry->forwarded = forwarded; ), - HE_PRINTK("id=%llu invalid=%u", + HE_PRINTK("id=%llu forwarded=%u", __entry->id, __entry->forwarded) ); diff --git a/arch/arm64/include/asm/kvm_hypevents_defs.h b/arch/arm64/include/asm/kvm_hypevents_defs.h index e228d894a898..606f3477ecd3 100644 --- a/arch/arm64/include/asm/kvm_hypevents_defs.h +++ b/arch/arm64/include/asm/kvm_hypevents_defs.h @@ -15,10 +15,10 @@ struct hyp_entry_hdr { /* * Hyp events definitions common to the hyp and the host */ -#define HYP_EVENT_FORMAT(__name, __struct) \ - struct trace_hyp_format_##__name { \ - struct hyp_entry_hdr hdr; \ - __struct \ +#define HYP_EVENT_FORMAT(__name, __struct) \ + struct __packed trace_hyp_format_##__name { \ + struct hyp_entry_hdr hdr; \ + __struct \ } #define HE_PROTO(args...) args diff --git a/arch/arm64/include/asm/kvm_pkvm_module.h b/arch/arm64/include/asm/kvm_pkvm_module.h index bf68d862b7d8..b8e5f8064358 100644 --- a/arch/arm64/include/asm/kvm_pkvm_module.h +++ b/arch/arm64/include/asm/kvm_pkvm_module.h @@ -72,6 +72,11 @@ enum pkvm_psci_notification { * @register_host_perm_fault_handler), otherwise * pKVM will be unable to handle this fault and the * CPU will be stuck in an infinite loop. + * @host_stage2_mod_prot_range: Similar to @host_stage2_mod_prot, but takes a + * range as an argument (@nr_pages). This + * considerably speeds up the process for a + * contiguous memory region, compared to the + * per-page @host_stage2_mod_prot. * @host_stage2_get_leaf: Query the host's stage2 page-table entry for * the page @phys. * @register_host_smc_handler: @cb is called whenever the host issues an SMC diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 2c1032a59826..3a5193ca0fb3 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -149,22 +149,16 @@ static void prepare_host_vtcr(void) static int prepopulate_host_stage2(void) { struct memblock_region *reg; - u64 addr = 0; - int i, ret; + int i, ret = 0; for (i = 0; i < hyp_memblock_nr; i++) { reg = &hyp_memory[i]; - ret = host_stage2_idmap_locked(addr, reg->base - addr, PKVM_HOST_MMIO_PROT, false); - if (ret) - return ret; ret = host_stage2_idmap_locked(reg->base, reg->size, PKVM_HOST_MEM_PROT, false); if (ret) return ret; - addr = reg->base + reg->size; } - return host_stage2_idmap_locked(addr, BIT(host_mmu.pgt.ia_bits) - addr, PKVM_HOST_MMIO_PROT, - false); + return ret; } int kvm_host_prepare_stage2(void *pgt_pool_base) @@ -881,7 +875,14 @@ void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt) int ret = -EPERM; esr = read_sysreg_el2(SYS_ESR); - BUG_ON(!__get_fault_info(esr, &fault)); + if (!__get_fault_info(esr, &fault)) { + addr = (u64)-1; + /* + * We've presumably raced with a page-table change which caused + * AT to fail, try again. 
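To illustrate the @host_stage2_mod_prot_range kernel-doc added above: a module that owns a physically contiguous buffer can change its protection with one range call instead of a per-page loop. A minimal sketch, assuming the ops take pfn, page count and protection in that order (the exact prototypes live in struct pkvm_module_ops and are not spelled out in this hunk):

#include <asm/kvm_pkvm_module.h>

/* Sketch only: argument order is assumed, not taken from this patch. */
static int mod_prot_buffer(const struct pkvm_module_ops *ops, u64 pfn,
			   u64 nr_pages, enum kvm_pgtable_prot prot)
{
	u64 i;
	int ret;

	/* One stage-2 update covering the whole contiguous region... */
	if (ops->host_stage2_mod_prot_range)
		return ops->host_stage2_mod_prot_range(pfn, nr_pages, prot);

	/* ...instead of nr_pages independent single-page updates. */
	for (i = 0; i < nr_pages; i++) {
		ret = ops->host_stage2_mod_prot(pfn + i, prot);
		if (ret)
			return ret;
	}
	return 0;
}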
+ */ + goto return_to_host; + } fault.esr_el2 = esr; addr = (fault.hpfar_el2 & HPFAR_MASK) << 8; @@ -908,6 +909,7 @@ void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt) else BUG_ON(ret && ret != -EAGAIN); +return_to_host: trace_host_mem_abort(esr, addr); } diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 9970a4785819..7d522b037d9a 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -619,6 +619,8 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr, goto done; } count_vm_vma_lock_event(VMA_LOCK_RETRY); + if (fault & VM_FAULT_MAJOR) + mm_flags |= FAULT_FLAG_TRIED; /* Quick path to respond to signals */ if (fault_signal_pending(fault, regs)) { diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index b1723094d464..ec23164ad768 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -496,6 +496,8 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address, goto done; } count_vm_vma_lock_event(VMA_LOCK_RETRY); + if (fault & VM_FAULT_MAJOR) + flags |= FAULT_FLAG_TRIED; if (fault_signal_pending(fault, regs)) return user_mode(regs) ? 0 : SIGBUS; diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c index 34a44febae86..d710bb834a2a 100644 --- a/arch/riscv/mm/fault.c +++ b/arch/riscv/mm/fault.c @@ -310,6 +310,8 @@ asmlinkage void do_page_fault(struct pt_regs *regs) goto done; } count_vm_vma_lock_event(VMA_LOCK_RETRY); + if (fault & VM_FAULT_MAJOR) + flags |= FAULT_FLAG_TRIED; if (fault_signal_pending(fault, regs)) { if (!user_mode(regs)) diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 0843adb266d1..60fed3c88332 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -420,6 +420,9 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access) goto out; } count_vm_vma_lock_event(VMA_LOCK_RETRY); + if (fault & VM_FAULT_MAJOR) + flags |= FAULT_FLAG_TRIED; + /* Quick path to respond to signals */ if (fault_signal_pending(fault, regs)) { fault = VM_FAULT_SIGNAL; diff --git a/arch/x86/configs/gki_defconfig b/arch/x86/configs/gki_defconfig index 22797e912979..7e2df44033bc 100644 --- a/arch/x86/configs/gki_defconfig +++ b/arch/x86/configs/gki_defconfig @@ -396,6 +396,7 @@ CONFIG_THERMAL_EMERGENCY_POWEROFF_DELAY_MS=100 CONFIG_THERMAL_WRITABLE_TRIPS=y CONFIG_THERMAL_GOV_USER_SPACE=y CONFIG_CPU_THERMAL=y +CONFIG_CPU_IDLE_THERMAL=y CONFIG_DEVFREQ_THERMAL=y CONFIG_THERMAL_EMULATION=y # CONFIG_X86_PKG_TEMP_THERMAL is not set @@ -523,6 +524,7 @@ CONFIG_IIO=y CONFIG_IIO_BUFFER=y CONFIG_IIO_TRIGGER=y CONFIG_POWERCAP=y +CONFIG_IDLE_INJECT=y CONFIG_ANDROID_BINDER_IPC=y CONFIG_ANDROID_BINDERFS=y CONFIG_ANDROID_DEBUG_SYMBOLS=y diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 97599581ec6b..bcb5678b5b91 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1369,6 +1369,8 @@ void do_user_addr_fault(struct pt_regs *regs, goto done; } count_vm_vma_lock_event(VMA_LOCK_RETRY); + if (fault & VM_FAULT_MAJOR) + flags |= FAULT_FLAG_TRIED; /* Quick path to respond to signals */ if (fault_signal_pending(fault, regs)) { diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 5e680e039d0e..4686c1d9d0cf 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -2553,3 +2553,49 @@ void bpf_jit_free(struct bpf_prog *prog) bpf_prog_unlock_free(prog); } + +void bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke, + struct bpf_prog *new, struct bpf_prog *old) +{ + u8 *old_addr, *new_addr, *old_bypass_addr; + int ret; + + 
old_bypass_addr = old ? NULL : poke->bypass_addr; + old_addr = old ? (u8 *)old->bpf_func + poke->adj_off : NULL; + new_addr = new ? (u8 *)new->bpf_func + poke->adj_off : NULL; + + /* + * On program loading or teardown, the program's kallsym entry + * might not be in place, so we use __bpf_arch_text_poke to skip + * the kallsyms check. + */ + if (new) { + ret = __bpf_arch_text_poke(poke->tailcall_target, + BPF_MOD_JUMP, + old_addr, new_addr); + BUG_ON(ret < 0); + if (!old) { + ret = __bpf_arch_text_poke(poke->tailcall_bypass, + BPF_MOD_JUMP, + poke->bypass_addr, + NULL); + BUG_ON(ret < 0); + } + } else { + ret = __bpf_arch_text_poke(poke->tailcall_bypass, + BPF_MOD_JUMP, + old_bypass_addr, + poke->bypass_addr); + BUG_ON(ret < 0); + /* let other CPUs finish the execution of program + * so that it will not be possible to expose them + * to invalid nop, stack unwind, nop state + */ + if (!ret) + synchronize_rcu(); + ret = __bpf_arch_text_poke(poke->tailcall_target, + BPF_MOD_JUMP, + old_addr, NULL); + BUG_ON(ret < 0); + } +} diff --git a/drivers/android/vendor_hooks.c b/drivers/android/vendor_hooks.c index 43eb748c1103..ee68032b1918 100644 --- a/drivers/android/vendor_hooks.c +++ b/drivers/android/vendor_hooks.c @@ -95,6 +95,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_task_blocks_on_rtmutex); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rtmutex_waiter_prio); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rtmutex_wait_start); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rtmutex_wait_finish); +EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_rt_mutex_steal); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mutex_opt_spin_start); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mutex_opt_spin_finish); EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_mutex_can_spin_on_owner); diff --git a/drivers/devfreq/devfreq.c b/drivers/devfreq/devfreq.c index fe6644f99887..8e9ba701a643 100644 --- a/drivers/devfreq/devfreq.c +++ b/drivers/devfreq/devfreq.c @@ -461,10 +461,14 @@ static void devfreq_monitor(struct work_struct *work) if (err) dev_err(&devfreq->dev, "dvfs failed with (%d) error\n", err); + if (devfreq->stop_polling) + goto out; + queue_delayed_work(devfreq_wq, &devfreq->work, msecs_to_jiffies(devfreq->profile->polling_ms)); - mutex_unlock(&devfreq->lock); +out: + mutex_unlock(&devfreq->lock); trace_devfreq_monitor(devfreq); } @@ -482,6 +486,10 @@ void devfreq_monitor_start(struct devfreq *devfreq) if (IS_SUPPORTED_FLAG(devfreq->governor->flags, IRQ_DRIVEN)) return; + mutex_lock(&devfreq->lock); + if (delayed_work_pending(&devfreq->work)) + goto out; + switch (devfreq->profile->timer) { case DEVFREQ_TIMER_DEFERRABLE: INIT_DEFERRABLE_WORK(&devfreq->work, devfreq_monitor); @@ -490,12 +498,16 @@ void devfreq_monitor_start(struct devfreq *devfreq) INIT_DELAYED_WORK(&devfreq->work, devfreq_monitor); break; default: - return; + goto out; } if (devfreq->profile->polling_ms) queue_delayed_work(devfreq_wq, &devfreq->work, msecs_to_jiffies(devfreq->profile->polling_ms)); + +out: + devfreq->stop_polling = false; + mutex_unlock(&devfreq->lock); } EXPORT_SYMBOL(devfreq_monitor_start); @@ -512,6 +524,14 @@ void devfreq_monitor_stop(struct devfreq *devfreq) if (IS_SUPPORTED_FLAG(devfreq->governor->flags, IRQ_DRIVEN)) return; + mutex_lock(&devfreq->lock); + if (devfreq->stop_polling) { + mutex_unlock(&devfreq->lock); + return; + } + + devfreq->stop_polling = true; + mutex_unlock(&devfreq->lock); cancel_delayed_work_sync(&devfreq->work); } EXPORT_SYMBOL(devfreq_monitor_stop); diff --git a/drivers/dma-buf/dma-buf-sysfs-stats.c b/drivers/dma-buf/dma-buf-sysfs-stats.c
index 4b680e10c15a..4f3ee92dbe1b 100644 --- a/drivers/dma-buf/dma-buf-sysfs-stats.c +++ b/drivers/dma-buf/dma-buf-sysfs-stats.c @@ -11,6 +11,7 @@ #include <linux/printk.h> #include <linux/slab.h> #include <linux/sysfs.h> +#include <linux/workqueue.h> #include "dma-buf-sysfs-stats.h" @@ -168,35 +169,76 @@ void dma_buf_uninit_sysfs_statistics(void) kset_unregister(dma_buf_stats_kset); } +struct dma_buf_create_sysfs_entry { + struct dma_buf *dmabuf; + struct work_struct work; +}; + +union dma_buf_create_sysfs_work_entry { + struct dma_buf_create_sysfs_entry create_entry; + struct dma_buf_sysfs_entry sysfs_entry; +}; + +static void sysfs_add_workfn(struct work_struct *work) +{ + struct dma_buf_create_sysfs_entry *create_entry = + container_of(work, struct dma_buf_create_sysfs_entry, work); + struct dma_buf *dmabuf = create_entry->dmabuf; + + /* + * A dmabuf is ref-counted via its file member. If this handler holds the only + * reference to the dmabuf, there is no need for sysfs kobject creation. This is an + * optimization and a race; when the reference count drops to 1 immediately after + * this check it is not harmful as the sysfs entry will still get cleaned up in + * dma_buf_stats_teardown, which won't get called until the final dmabuf reference + * is released, and that can't happen until the end of this function. + */ + if (file_count(dmabuf->file) > 1) { + dmabuf->sysfs_entry->dmabuf = dmabuf; + /* + * kobject_init_and_add expects kobject to be zero-filled, but we have populated it + * (the sysfs_add_work union member) to trigger this work function. + */ + memset(&dmabuf->sysfs_entry->kobj, 0, sizeof(dmabuf->sysfs_entry->kobj)); + dmabuf->sysfs_entry->kobj.kset = dma_buf_per_buffer_stats_kset; + if (kobject_init_and_add(&dmabuf->sysfs_entry->kobj, &dma_buf_ktype, NULL, + "%lu", file_inode(dmabuf->file)->i_ino)) { + kobject_put(&dmabuf->sysfs_entry->kobj); + dmabuf->sysfs_entry = NULL; + } + } else { + /* + * Free the sysfs_entry and reset the pointer so dma_buf_stats_teardown doesn't + * attempt to operate on it. + */ + kfree(dmabuf->sysfs_entry); + dmabuf->sysfs_entry = NULL; + } + dma_buf_put(dmabuf); +} + int dma_buf_stats_setup(struct dma_buf *dmabuf, struct file *file) { - struct dma_buf_sysfs_entry *sysfs_entry; - int ret; + struct dma_buf_create_sysfs_entry *create_entry; + union dma_buf_create_sysfs_work_entry *work_entry; if (!dmabuf->exp_name) { pr_err("exporter name must not be empty if stats needed\n"); return -EINVAL; } - sysfs_entry = kzalloc(sizeof(struct dma_buf_sysfs_entry), GFP_KERNEL); - if (!sysfs_entry) + work_entry = kmalloc(sizeof(union dma_buf_create_sysfs_work_entry), GFP_KERNEL); + if (!work_entry) return -ENOMEM; - sysfs_entry->kobj.kset = dma_buf_per_buffer_stats_kset; - sysfs_entry->dmabuf = dmabuf; + dmabuf->sysfs_entry = &work_entry->sysfs_entry; - dmabuf->sysfs_entry = sysfs_entry; + create_entry = &work_entry->create_entry; + create_entry->dmabuf = dmabuf; - /* create the directory for buffer stats */ - ret = kobject_init_and_add(&sysfs_entry->kobj, &dma_buf_ktype, NULL, - "%lu", file_inode(file)->i_ino); - if (ret) - goto err_sysfs_dmabuf; + INIT_WORK(&create_entry->work, sysfs_add_workfn); + get_dma_buf(dmabuf); /* This reference will be dropped in sysfs_add_workfn.
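The lifetime rule spelled out in the comment above is the crux of moving sysfs creation off the export path: one extra dmabuf reference is taken before the work is queued, and sysfs_add_workfn() drops it on every exit path, so the buffer cannot disappear while the work is pending. The same defer-with-reference pattern in stripped-down form (all names here are hypothetical, not this driver's API):

#include <linux/workqueue.h>

struct obj_setup_work {
	struct my_obj *obj;		/* hypothetical refcounted object */
	struct work_struct work;
};

static void obj_setup_workfn(struct work_struct *work)
{
	struct obj_setup_work *ow =
		container_of(work, struct obj_setup_work, work);

	do_slow_setup(ow->obj);		/* safe: reference held since queueing */
	my_obj_put(ow->obj);		/* drop the reference taken below */
}

static void obj_setup_async(struct my_obj *obj, struct obj_setup_work *ow)
{
	ow->obj = obj;
	INIT_WORK(&ow->work, obj_setup_workfn);
	my_obj_get(obj);		/* pin the object across the async window */
	schedule_work(&ow->work);
}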
*/ + schedule_work(&create_entry->work); return 0; - -err_sysfs_dmabuf: - kobject_put(&sysfs_entry->kobj); - dmabuf->sysfs_entry = NULL; - return ret; } diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c index e0d42ee76b43..fbe8a07552ef 100644 --- a/drivers/dma-buf/dma-buf.c +++ b/drivers/dma-buf/dma-buf.c @@ -727,10 +727,6 @@ struct dma_buf *dma_buf_export(const struct dma_buf_export_info *exp_info) dmabuf->resv = resv; } - ret = dma_buf_stats_setup(dmabuf, file); - if (ret) - goto err_dmabuf; - file->private_data = dmabuf; file->f_path.dentry->d_fsdata = dmabuf; dmabuf->file = file; @@ -739,9 +735,19 @@ struct dma_buf *dma_buf_export(const struct dma_buf_export_info *exp_info) list_add(&dmabuf->list_node, &db_list.head); mutex_unlock(&db_list.lock); + ret = dma_buf_stats_setup(dmabuf, file); + if (ret) + goto err_sysfs; + return dmabuf; -err_dmabuf: +err_sysfs: + mutex_lock(&db_list.lock); + list_del(&dmabuf->list_node); + mutex_unlock(&db_list.lock); + dmabuf->file = NULL; + file->f_path.dentry->d_fsdata = NULL; + file->private_data = NULL; if (!resv) dma_resv_fini(dmabuf->resv); kfree(dmabuf); diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index 9618ddcb09e3..0d0fdbdbbdc4 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -8683,7 +8683,6 @@ static int ufshcd_add_lus(struct ufs_hba *hba) ufs_bsg_probe(hba); ufshpb_init(hba); scsi_scan_host(hba->host); - pm_runtime_put_sync(hba->dev); out: return ret; @@ -8916,15 +8915,12 @@ static void ufshcd_async_scan(void *data, async_cookie_t cookie) /* Probe and add UFS logical units */ ret = ufshcd_add_lus(hba); + out: - /* - * If we failed to initialize the device or the device is not - * present, turn off the power/clocks etc. - */ - if (ret) { - pm_runtime_put_sync(hba->dev); - ufshcd_hba_exit(hba); - } + pm_runtime_put_sync(hba->dev); + + if (ret) + dev_err(hba->dev, "%s failed: %d\n", __func__, ret); } static enum scsi_timeout_action ufshcd_eh_timed_out(struct scsi_cmnd *scmd) diff --git a/drivers/usb/dwc3/core.c b/drivers/usb/dwc3/core.c index 1f23c96fa94f..255230fa1e47 100644 --- a/drivers/usb/dwc3/core.c +++ b/drivers/usb/dwc3/core.c @@ -1375,6 +1375,9 @@ static int dwc3_core_init(struct dwc3 *dwc) if (dwc->parkmode_disable_ss_quirk) reg |= DWC3_GUCTL1_PARKMODE_DISABLE_SS; + if (dwc->parkmode_disable_hs_quirk) + reg |= DWC3_GUCTL1_PARKMODE_DISABLE_HS; + if (DWC3_VER_IS_WITHIN(DWC3, 290A, ANY) && (dwc->maximum_speed == USB_SPEED_HIGH || dwc->maximum_speed == USB_SPEED_FULL)) @@ -1658,6 +1661,8 @@ static void dwc3_get_properties(struct dwc3 *dwc) "snps,resume-hs-terminations"); dwc->parkmode_disable_ss_quirk = device_property_read_bool(dev, "snps,parkmode-disable-ss-quirk"); + dwc->parkmode_disable_hs_quirk = device_property_read_bool(dev, + "snps,parkmode-disable-hs-quirk"); dwc->gfladj_refclk_lpm_sel = device_property_read_bool(dev, "snps,gfladj-refclk-lpm-sel-quirk"); diff --git a/drivers/usb/dwc3/core.h b/drivers/usb/dwc3/core.h index fed5cd235752..52045b8d4c79 100644 --- a/drivers/usb/dwc3/core.h +++ b/drivers/usb/dwc3/core.h @@ -268,6 +268,7 @@ #define DWC3_GUCTL1_DEV_FORCE_20_CLK_FOR_30_CLK BIT(26) #define DWC3_GUCTL1_DEV_L1_EXIT_BY_HW BIT(24) #define DWC3_GUCTL1_PARKMODE_DISABLE_SS BIT(17) +#define DWC3_GUCTL1_PARKMODE_DISABLE_HS BIT(16) #define DWC3_GUCTL1_RESUME_OPMODE_HS_HOST BIT(10) /* Global Status Register */ @@ -1122,6 +1123,8 @@ struct dwc3_scratchpad_array { * generation after resume from suspend. 
* @parkmode_disable_ss_quirk: set if we need to disable all SuperSpeed * instances in park mode. + * @parkmode_disable_hs_quirk: set if we need to disable all HighSpeed + * instances in park mode. * @tx_de_emphasis_quirk: set if we enable Tx de-emphasis quirk * @tx_de_emphasis: Tx de-emphasis value * 0 - -6dB de-emphasis @@ -1343,6 +1346,7 @@ struct dwc3 { unsigned dis_tx_ipgap_linecheck_quirk:1; unsigned resume_hs_terminations:1; unsigned parkmode_disable_ss_quirk:1; + unsigned parkmode_disable_hs_quirk:1; unsigned gfladj_refclk_lpm_sel:1; unsigned tx_de_emphasis_quirk:1; diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c index 6fdac0fae461..121092e35ec6 100644 --- a/drivers/usb/dwc3/gadget.c +++ b/drivers/usb/dwc3/gadget.c @@ -2093,7 +2093,17 @@ static int dwc3_gadget_ep_dequeue(struct usb_ep *ep, list_for_each_entry(r, &dep->pending_list, list) { if (r == req) { - dwc3_gadget_giveback(dep, req, -ECONNRESET); + /* + * Explicitly check for EP0/1 as dequeue for those + * EPs need to be handled differently. Control EP + * only deals with one USB req, and giveback will + * occur during dwc3_ep0_stall_and_restart(). EP0 + * requests are never added to started_list. + */ + if (dep->number > 1) + dwc3_gadget_giveback(dep, req, -ECONNRESET); + else + dwc3_ep0_reset_state(dwc); goto out; } } diff --git a/drivers/usb/host/xhci-hub.c b/drivers/usb/host/xhci-hub.c index 544028862de0..10eb2175e749 100644 --- a/drivers/usb/host/xhci-hub.c +++ b/drivers/usb/host/xhci-hub.c @@ -1053,19 +1053,19 @@ static void xhci_get_usb3_port_status(struct xhci_port *port, u32 *status, *status |= USB_PORT_STAT_C_CONFIG_ERROR << 16; /* USB3 specific wPortStatus bits */ - if (portsc & PORT_POWER) { + if (portsc & PORT_POWER) *status |= USB_SS_PORT_STAT_POWER; - /* link state handling */ - if (link_state == XDEV_U0) - bus_state->suspended_ports &= ~(1 << portnum); - } - /* remote wake resume signaling complete */ - if (bus_state->port_remote_wakeup & (1 << portnum) && + /* no longer suspended or resuming */ + if (link_state != XDEV_U3 && link_state != XDEV_RESUME && link_state != XDEV_RECOVERY) { - bus_state->port_remote_wakeup &= ~(1 << portnum); - usb_hcd_end_port_resume(&hcd->self, portnum); + /* remote wake resume signaling complete */ + if (bus_state->port_remote_wakeup & (1 << portnum)) { + bus_state->port_remote_wakeup &= ~(1 << portnum); + usb_hcd_end_port_resume(&hcd->self, portnum); + } + bus_state->suspended_ports &= ~(1 << portnum); } xhci_hub_report_usb3_link_state(xhci, status, portsc); @@ -1111,6 +1111,21 @@ static void xhci_get_usb2_port_status(struct xhci_port *port, u32 *status, return; } } + + /* + * Clear usb2 resume signalling variables if port is no longer suspended + * or resuming. Port either resumed to U0/U1/U2, disconnected, or in an + * error state. Resume related variables should be cleared in all those cases.
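Both hub-status paths above now enforce the same invariant: once the link state has left U3/RESUME (and RECOVERY for USB3), any remote-wakeup and suspended-port bookkeeping for that port must be dropped. Restated for the USB3 case as a standalone helper (illustrative only, not code from this patch; the USB2 path additionally clears resume_done and resuming_ports):

static void clear_port_resume_state(struct xhci_bus_state *bus_state,
				    struct usb_hcd *hcd, int portnum)
{
	/* finish a pending remote-wakeup resume, if any */
	if (bus_state->port_remote_wakeup & (1 << portnum)) {
		bus_state->port_remote_wakeup &= ~(1 << portnum);
		usb_hcd_end_port_resume(&hcd->self, portnum);
	}
	/* the port is no longer considered suspended */
	bus_state->suspended_ports &= ~(1 << portnum);
}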
+ */ + if (link_state != XDEV_U3 && link_state != XDEV_RESUME) { + if (bus_state->resume_done[portnum] || + test_bit(portnum, &bus_state->resuming_ports)) { + bus_state->resume_done[portnum] = 0; + clear_bit(portnum, &bus_state->resuming_ports); + usb_hcd_end_port_resume(&port->rhub->hcd->self, portnum); + } + bus_state->suspended_ports &= ~(1 << portnum); + } } /* diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h index 26fa170090b8..b1b846504027 100644 --- a/fs/erofs/compress.h +++ b/fs/erofs/compress.h @@ -89,8 +89,7 @@ static inline bool erofs_page_is_managed(const struct erofs_sb_info *sbi, int z_erofs_fixup_insize(struct z_erofs_decompress_req *rq, const char *padbuf, unsigned int padbufsize); -int z_erofs_decompress(struct z_erofs_decompress_req *rq, - struct page **pagepool); +extern const struct z_erofs_decompressor erofs_decompressors[]; /* prototypes for specific algorithms */ int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq, diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 83532525282e..bbfbaf25ee59 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -404,6 +404,8 @@ const struct address_space_operations erofs_raw_access_aops = { .readahead = erofs_readahead, .bmap = erofs_bmap, .direct_IO = noop_direct_IO, + .release_folio = iomap_release_folio, + .invalidate_folio = iomap_invalidate_folio, }; #ifdef CONFIG_FS_DAX diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index 7021e2cf6146..38c7f9c96c68 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -122,11 +122,11 @@ static int z_erofs_lz4_prepare_dstpages(struct z_erofs_lz4_decompress_ctx *ctx, } static void *z_erofs_lz4_handle_overlap(struct z_erofs_lz4_decompress_ctx *ctx, - void *inpage, unsigned int *inputmargin, int *maptype, - bool may_inplace) + void *inpage, void *out, unsigned int *inputmargin, + int *maptype, bool may_inplace) { struct z_erofs_decompress_req *rq = ctx->rq; - unsigned int omargin, total, i, j; + unsigned int omargin, total, i; struct page **in; void *src, *tmp; @@ -136,20 +136,20 @@ static void *z_erofs_lz4_handle_overlap(struct z_erofs_lz4_decompress_ctx *ctx, omargin < LZ4_DECOMPRESS_INPLACE_MARGIN(rq->inputsize)) goto docopy; - for (i = 0; i < ctx->inpages; ++i) { - DBG_BUGON(rq->in[i] == NULL); - for (j = 0; j < ctx->outpages - ctx->inpages + i; ++j) - if (rq->out[j] == rq->in[i]) - goto docopy; - } + for (i = 0; i < ctx->inpages; ++i) + if (rq->out[ctx->outpages - ctx->inpages + i] != + rq->in[i]) + goto docopy; + kunmap_local(inpage); + *maptype = 3; + return out + ((ctx->outpages - ctx->inpages) << PAGE_SHIFT); } if (ctx->inpages <= 1) { *maptype = 0; return inpage; } - kunmap_atomic(inpage); - might_sleep(); + kunmap_local(inpage); src = erofs_vm_map_ram(rq->in, ctx->inpages); if (!src) return ERR_PTR(-ENOMEM); @@ -162,7 +162,7 @@ docopy: src = erofs_get_pcpubuf(ctx->inpages); if (!src) { DBG_BUGON(1); - kunmap_atomic(inpage); + kunmap_local(inpage); return ERR_PTR(-EFAULT); } @@ -173,9 +173,9 @@ docopy: min_t(unsigned int, total, PAGE_SIZE - *inputmargin); if (!inpage) - inpage = kmap_atomic(*in); + inpage = kmap_local_page(*in); memcpy(tmp, inpage + *inputmargin, page_copycnt); - kunmap_atomic(inpage); + kunmap_local(inpage); inpage = NULL; tmp += page_copycnt; total -= page_copycnt; @@ -205,16 +205,16 @@ int z_erofs_fixup_insize(struct z_erofs_decompress_req *rq, const char *padbuf, } static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx, - u8 *out) + u8 *dst) { struct z_erofs_decompress_req *rq = ctx->rq; bool 
support_0padding = false, may_inplace = false; unsigned int inputmargin; - u8 *headpage, *src; + u8 *out, *headpage, *src; int ret, maptype; DBG_BUGON(*rq->in == NULL); - headpage = kmap_atomic(*rq->in); + headpage = kmap_local_page(*rq->in); /* LZ4 decompression inplace is only safe if zero_padding is enabled */ if (erofs_sb_has_zero_padding(EROFS_SB(rq->sb))) { @@ -223,7 +223,7 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx, min_t(unsigned int, rq->inputsize, rq->sb->s_blocksize - rq->pageofs_in)); if (ret) { - kunmap_atomic(headpage); + kunmap_local(headpage); return ret; } may_inplace = !((rq->pageofs_in + rq->inputsize) & @@ -231,11 +231,12 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx, } inputmargin = rq->pageofs_in; - src = z_erofs_lz4_handle_overlap(ctx, headpage, &inputmargin, + src = z_erofs_lz4_handle_overlap(ctx, headpage, dst, &inputmargin, &maptype, may_inplace); if (IS_ERR(src)) return PTR_ERR(src); + out = dst + rq->pageofs_out; /* legacy format could compress extra data in a pcluster. */ if (rq->partial_decoding || !support_0padding) ret = LZ4_decompress_safe_partial(src + inputmargin, out, @@ -261,12 +262,12 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx, } if (maptype == 0) { - kunmap_atomic(headpage); + kunmap_local(headpage); } else if (maptype == 1) { vm_unmap_ram(src, ctx->inpages); } else if (maptype == 2) { erofs_put_pcpubuf(src); - } else { + } else if (maptype != 3) { DBG_BUGON(1); return -EFAULT; } @@ -289,7 +290,7 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, /* one optimized fast path only for non bigpcluster cases yet */ if (ctx.inpages == 1 && ctx.outpages == 1 && !rq->inplace_io) { DBG_BUGON(!*rq->out); - dst = kmap_atomic(*rq->out); + dst = kmap_local_page(*rq->out); dst_maptype = 0; goto dstmap_out; } @@ -309,9 +310,9 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, } dstmap_out: - ret = z_erofs_lz4_decompress_mem(&ctx, dst + rq->pageofs_out); + ret = z_erofs_lz4_decompress_mem(&ctx, dst); if (!dst_maptype) - kunmap_atomic(dst); + kunmap_local(dst); else if (dst_maptype == 2) vm_unmap_ram(dst, ctx.outpages); return ret; @@ -320,50 +321,63 @@ dstmap_out: static int z_erofs_transform_plain(struct z_erofs_decompress_req *rq, struct page **pagepool) { - const unsigned int inpages = PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT; - const unsigned int outpages = + const unsigned int nrpages_in = + PAGE_ALIGN(rq->pageofs_in + rq->inputsize) >> PAGE_SHIFT; + const unsigned int nrpages_out = PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; - const unsigned int righthalf = min_t(unsigned int, rq->outputsize, - PAGE_SIZE - rq->pageofs_out); - const unsigned int lefthalf = rq->outputsize - righthalf; - const unsigned int interlaced_offset = - rq->alg == Z_EROFS_COMPRESSION_SHIFTED ? 
0 : rq->pageofs_out; - unsigned char *src, *dst; + const unsigned int bs = rq->sb->s_blocksize; + unsigned int cur = 0, ni = 0, no, pi, po, insz, cnt; + u8 *kin; - if (outpages > 2 && rq->alg == Z_EROFS_COMPRESSION_SHIFTED) { - DBG_BUGON(1); - return -EFSCORRUPTED; - } - - if (rq->out[0] == *rq->in) { - DBG_BUGON(rq->pageofs_out); - return 0; - } - - src = kmap_local_page(rq->in[inpages - 1]) + rq->pageofs_in; - if (rq->out[0]) { - dst = kmap_local_page(rq->out[0]); - memcpy(dst + rq->pageofs_out, src + interlaced_offset, - righthalf); - kunmap_local(dst); - } - - if (outpages > inpages) { - DBG_BUGON(!rq->out[outpages - 1]); - if (rq->out[outpages - 1] != rq->in[inpages - 1]) { - dst = kmap_local_page(rq->out[outpages - 1]); - memcpy(dst, interlaced_offset ? src : - (src + righthalf), lefthalf); - kunmap_local(dst); - } else if (!interlaced_offset) { - memmove(src, src + righthalf, lefthalf); + DBG_BUGON(rq->outputsize > rq->inputsize); + if (rq->alg == Z_EROFS_COMPRESSION_INTERLACED) { + cur = bs - (rq->pageofs_out & (bs - 1)); + pi = (rq->pageofs_in + rq->inputsize - cur) & ~PAGE_MASK; + cur = min(cur, rq->outputsize); + if (cur && rq->out[0]) { + kin = kmap_local_page(rq->in[nrpages_in - 1]); + if (rq->out[0] == rq->in[nrpages_in - 1]) { + memmove(kin + rq->pageofs_out, kin + pi, cur); + flush_dcache_page(rq->out[0]); + } else { + memcpy_to_page(rq->out[0], rq->pageofs_out, + kin + pi, cur); + } + kunmap_local(kin); } + rq->outputsize -= cur; } - kunmap_local(src); + + for (; rq->outputsize; rq->pageofs_in = 0, cur += PAGE_SIZE, ni++) { + insz = min_t(unsigned int, PAGE_SIZE - rq->pageofs_in, + rq->outputsize); + rq->outputsize -= insz; + if (!rq->in[ni]) + continue; + kin = kmap_local_page(rq->in[ni]); + pi = 0; + do { + no = (rq->pageofs_out + cur + pi) >> PAGE_SHIFT; + po = (rq->pageofs_out + cur + pi) & ~PAGE_MASK; + DBG_BUGON(no >= nrpages_out); + cnt = min_t(unsigned int, insz - pi, PAGE_SIZE - po); + if (rq->out[no] == rq->in[ni]) { + memmove(kin + po, + kin + rq->pageofs_in + pi, cnt); + flush_dcache_page(rq->out[no]); + } else if (rq->out[no]) { + memcpy_to_page(rq->out[no], po, + kin + rq->pageofs_in + pi, cnt); + } + pi += cnt; + } while (pi < insz); + kunmap_local(kin); + } + DBG_BUGON(ni > nrpages_in); return 0; } -static struct z_erofs_decompressor decompressors[] = { +const struct z_erofs_decompressor erofs_decompressors[] = { [Z_EROFS_COMPRESSION_SHIFTED] = { .decompress = z_erofs_transform_plain, .name = "shifted" @@ -383,9 +397,3 @@ static struct z_erofs_decompressor decompressors[] = { }, #endif }; - -int z_erofs_decompress(struct z_erofs_decompress_req *rq, - struct page **pagepool) -{ - return decompressors[rq->alg].decompress(rq, pagepool); -} diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index 8fc41fd1620c..190bc3ad5622 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -291,14 +291,19 @@ static int erofs_fill_inode(struct inode *inode) } if (erofs_inode_is_data_compressed(vi->datalayout)) { - if (!erofs_is_fscache_mode(inode->i_sb) && - inode->i_sb->s_blocksize_bits == PAGE_SHIFT) + if (!erofs_is_fscache_mode(inode->i_sb)) { + DO_ONCE_LITE_IF(inode->i_sb->s_blocksize != PAGE_SIZE, + erofs_info, inode->i_sb, + "EXPERIMENTAL EROFS subpage compressed block support in use. 
Use at your own risk!"); err = z_erofs_fill_inode(inode); + } else err = -EOPNOTSUPP; goto out_unlock; } inode->i_mapping->a_ops = &erofs_raw_access_aops; + if (!erofs_is_fscache_mode(inode->i_sb)) + mapping_set_large_folios(inode->i_mapping); #ifdef CONFIG_EROFS_FS_ONDEMAND if (erofs_is_fscache_mode(inode->i_sb)) inode->i_mapping->a_ops = &erofs_fscache_access_aops; diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 1c03daf83a68..23151da13a23 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -544,7 +544,7 @@ int __init z_erofs_init_zip_subsystem(void); void z_erofs_exit_zip_subsystem(void); int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, struct erofs_workgroup *egrp); -int erofs_try_to_free_cached_page(struct page *page); +int erofs_init_managed_cache(struct super_block *sb); int z_erofs_load_lz4_config(struct super_block *sb, struct erofs_super_block *dsb, struct z_erofs_lz4_cfgs *lz4, int len); @@ -565,6 +565,7 @@ static inline int z_erofs_load_lz4_config(struct super_block *sb, } return 0; } +static inline int erofs_init_managed_cache(struct super_block *sb) { return 0; } #endif /* !CONFIG_EROFS_FS_ZIP */ #ifdef CONFIG_EROFS_FS_ZIP_LZMA diff --git a/fs/erofs/super.c b/fs/erofs/super.c index b073b38c1c77..19af9bbcb8f1 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -597,68 +597,6 @@ static int erofs_fc_parse_param(struct fs_context *fc, return 0; } -#ifdef CONFIG_EROFS_FS_ZIP -static const struct address_space_operations managed_cache_aops; - -static bool erofs_managed_cache_release_folio(struct folio *folio, gfp_t gfp) -{ - bool ret = true; - struct address_space *const mapping = folio->mapping; - - DBG_BUGON(!folio_test_locked(folio)); - DBG_BUGON(mapping->a_ops != &managed_cache_aops); - - if (folio_test_private(folio)) - ret = erofs_try_to_free_cached_page(&folio->page); - - return ret; -} - -/* - * It will be called only on inode eviction. In case that there are still some - * decompression requests in progress, wait with rescheduling for a bit here. - * We could introduce an extra locking instead but it seems unnecessary. 
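One detail in the fs/erofs/inode.c hunk above is worth calling out: DO_ONCE_LITE_IF() evaluates its condition on every call but invokes the handler at most once, so the EXPERIMENTAL warning cannot flood the log no matter how many sub-page-block inodes are instantiated. Generic usage shape (example values, not from this patch):

#include <linux/once_lite.h>

/* Warns exactly once, the first time the condition evaluates true. */
DO_ONCE_LITE_IF(sb->s_blocksize != PAGE_SIZE,
		pr_warn, "sub-page block size in use on %s\n", sb->s_id);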
- */ -static void erofs_managed_cache_invalidate_folio(struct folio *folio, - size_t offset, size_t length) -{ - const size_t stop = length + offset; - - DBG_BUGON(!folio_test_locked(folio)); - - /* Check for potential overflow in debug mode */ - DBG_BUGON(stop > folio_size(folio) || stop < length); - - if (offset == 0 && stop == folio_size(folio)) - while (!erofs_managed_cache_release_folio(folio, GFP_NOFS)) - cond_resched(); -} - -static const struct address_space_operations managed_cache_aops = { - .release_folio = erofs_managed_cache_release_folio, - .invalidate_folio = erofs_managed_cache_invalidate_folio, -}; - -static int erofs_init_managed_cache(struct super_block *sb) -{ - struct erofs_sb_info *const sbi = EROFS_SB(sb); - struct inode *const inode = new_inode(sb); - - if (!inode) - return -ENOMEM; - - set_nlink(inode, 1); - inode->i_size = OFFSET_MAX; - - inode->i_mapping->a_ops = &managed_cache_aops; - mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); - sbi->managed_cache = inode; - return 0; -} -#else -static int erofs_init_managed_cache(struct super_block *sb) { return 0; } -#endif - static struct inode *erofs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation) { diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 451b9a6cba68..0b1b6ca804b3 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -5,7 +5,6 @@ * Copyright (C) 2022 Alibaba Cloud */ #include "compress.h" -#include #include #include #include @@ -58,6 +57,9 @@ struct z_erofs_pcluster { /* L: total number of bvecs */ unsigned int vcnt; + /* I: pcluster size (compressed size) in bytes */ + unsigned int pclustersize; + /* I: page offset of start position of decompression */ unsigned short pageofs_out; @@ -72,14 +74,6 @@ struct z_erofs_pcluster { struct rcu_head rcu; }; - union { - /* I: physical cluster size in pages */ - unsigned short pclusterpages; - - /* I: tailpacking inline compressed size */ - unsigned short tailpacking_size; - }; - /* I: compression algorithm format */ unsigned char algorithmformat; @@ -119,9 +113,7 @@ static inline bool z_erofs_is_inline_pcluster(struct z_erofs_pcluster *pcl) static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl) { - if (z_erofs_is_inline_pcluster(pcl)) - return 1; - return pcl->pclusterpages; + return PAGE_ALIGN(pcl->pclustersize) >> PAGE_SHIFT; } /* @@ -239,14 +231,20 @@ static void z_erofs_bvec_iter_begin(struct z_erofs_bvec_iter *iter, static int z_erofs_bvec_enqueue(struct z_erofs_bvec_iter *iter, struct z_erofs_bvec *bvec, - struct page **candidate_bvpage) + struct page **candidate_bvpage, + struct page **pagepool) { - if (iter->cur == iter->nr) { - if (!*candidate_bvpage) - return -EAGAIN; + if (iter->cur >= iter->nr) { + struct page *nextpage = *candidate_bvpage; + if (!nextpage) { + nextpage = erofs_allocpage(pagepool, GFP_NOFS); + if (!nextpage) + return -ENOMEM; + set_page_private(nextpage, Z_EROFS_SHORTLIVED_PAGE); + } DBG_BUGON(iter->bvset->nextpage); - iter->bvset->nextpage = *candidate_bvpage; + iter->bvset->nextpage = nextpage; z_erofs_bvset_flip(iter); iter->bvset->nextpage = NULL; @@ -301,12 +299,12 @@ static int z_erofs_create_pcluster_pool(void) return 0; } -static struct z_erofs_pcluster *z_erofs_alloc_pcluster(unsigned int nrpages) +static struct z_erofs_pcluster *z_erofs_alloc_pcluster(unsigned int size) { - int i; + unsigned int nrpages = PAGE_ALIGN(size) >> PAGE_SHIFT; + struct z_erofs_pcluster_slab *pcs = pcluster_pool; - for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) { - struct z_erofs_pcluster_slab *pcs = 
pcluster_pool + i; + for (; pcs < pcluster_pool + ARRAY_SIZE(pcluster_pool); ++pcs) { struct z_erofs_pcluster *pcl; if (nrpages > pcs->maxpages) @@ -315,7 +313,7 @@ static struct z_erofs_pcluster *z_erofs_alloc_pcluster(unsigned int nrpages) pcl = kmem_cache_zalloc(pcs->slab, GFP_NOFS); if (!pcl) return ERR_PTR(-ENOMEM); - pcl->pclusterpages = nrpages; + pcl->pclustersize = size; return pcl; } return ERR_PTR(-EINVAL); @@ -507,19 +505,17 @@ enum z_erofs_pclustermode { */ Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE, /* - * The current collection has been linked with the owned chain, and - * could also be linked with the remaining collections, which means - * if the processing page is the tail page of the collection, thus - * the current collection can safely use the whole page (since - * the previous collection is under control) for in-place I/O, as - * illustrated below: - * ________________________________________________________________ - * | tail (partial) page | head (partial) page | - * | (of the current cl) | (of the previous collection) | - * | | | - * |__PCLUSTER_FOLLOWED___|___________PCLUSTER_FOLLOWED____________| + * The pcluster was just linked to a decompression chain by us. It can + * also be linked with the remaining pclusters, which means if the + * processing page is the tail page of a pcluster, this pcluster can + * safely use the whole page (since the previous pcluster is within the + * same chain) for in-place I/O, as illustrated below: + * ___________________________________________________ + * | tail (partial) page | head (partial) page | + * | (of the current pcl) | (of the previous pcl) | + * |___PCLUSTER_FOLLOWED___|_____PCLUSTER_FOLLOWED_____| * - * [ (*) the above page can be used as inplace I/O. ] + * [ (*) the page above can be used as inplace I/O. 
] */ Z_EROFS_PCLUSTER_FOLLOWED, }; @@ -529,12 +525,12 @@ struct z_erofs_decompress_frontend { struct erofs_map_blocks map; struct z_erofs_bvec_iter biter; + struct page *pagepool; struct page *candidate_bvpage; struct z_erofs_pcluster *pcl; z_erofs_next_pcluster_t owned_head; enum z_erofs_pclustermode mode; - bool readahead; /* used for applying cache strategy on the fly */ bool backmost; erofs_off_t headoffset; @@ -564,11 +560,11 @@ static bool z_erofs_should_alloc_cache(struct z_erofs_decompress_frontend *fe) return false; } -static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, - struct page **pagepool) +static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe) { struct address_space *mc = MNGD_MAPPING(EROFS_I_SB(fe->inode)); struct z_erofs_pcluster *pcl = fe->pcl; + unsigned int pclusterpages = z_erofs_pclusterpages(pcl); bool shouldalloc = z_erofs_should_alloc_cache(fe); bool standalone = true; /* @@ -579,13 +575,14 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; unsigned int i; + if (i_blocksize(fe->inode) != PAGE_SIZE) + return; if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED) return; - for (i = 0; i < pcl->pclusterpages; ++i) { - struct page *page; + for (i = 0; i < pclusterpages; ++i) { + struct page *page, *newpage; void *t; /* mark pages just found for debugging */ - struct page *newpage = NULL; /* the compressed page was loaded before */ if (READ_ONCE(pcl->compressed_bvecs[i].page)) @@ -595,6 +592,7 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, if (page) { t = (void *)((unsigned long)page | 1); + newpage = NULL; } else { /* I/O is needed, no possible to decompress directly */ standalone = false; @@ -602,11 +600,10 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, continue; /* - * try to use cached I/O if page allocation - * succeeds or fallback to in-place I/O instead - * to avoid any direct reclaim. + * Try cached I/O if allocation succeeds or fallback to + * in-place I/O instead to avoid any direct reclaim. */ - newpage = erofs_allocpage(pagepool, gfp); + newpage = erofs_allocpage(&fe->pagepool, gfp); if (!newpage) continue; set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE); @@ -619,7 +616,7 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, if (page) put_page(page); else if (newpage) - erofs_pagepool_add(pagepool, newpage); + erofs_pagepool_add(&fe->pagepool, newpage); } /* @@ -636,6 +633,7 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, { struct z_erofs_pcluster *const pcl = container_of(grp, struct z_erofs_pcluster, obj); + unsigned int pclusterpages = z_erofs_pclusterpages(pcl); int i; DBG_BUGON(z_erofs_is_inline_pcluster(pcl)); @@ -643,7 +641,7 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, * refcount of workgroup is now freezed as 1, * therefore no need to worry about available decompression users. 
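For orientation, the reworked z_erofs_pclusterpages() above is pure arithmetic: it rounds the compressed byte size up to whole pages, which is what lets the old pclusterpages/tailpacking_size union disappear. An equivalent open-coded form with two worked values (illustration only):

/* PAGE_ALIGN(size) >> PAGE_SHIFT, spelled out */
static inline unsigned int pcluster_pages(unsigned int pclustersize)
{
	return (pclustersize + PAGE_SIZE - 1) >> PAGE_SHIFT;
}

/* With 4 KiB pages: pcluster_pages(12288) == 3, pcluster_pages(100) == 1. */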
*/ - for (i = 0; i < pcl->pclusterpages; ++i) { + for (i = 0; i < pclusterpages; ++i) { struct page *page = pcl->compressed_bvecs[i].page; if (!page) @@ -664,29 +662,73 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, return 0; } -int erofs_try_to_free_cached_page(struct page *page) +static bool z_erofs_cache_release_folio(struct folio *folio, gfp_t gfp) { - struct z_erofs_pcluster *const pcl = (void *)page_private(page); - int ret, i; + struct z_erofs_pcluster *pcl = folio_get_private(folio); + unsigned int pclusterpages = z_erofs_pclusterpages(pcl); + bool ret; + int i; + + if (!folio_test_private(folio)) + return true; if (!erofs_workgroup_try_to_freeze(&pcl->obj, 1)) - return 0; + return false; - ret = 0; + ret = false; DBG_BUGON(z_erofs_is_inline_pcluster(pcl)); - for (i = 0; i < pcl->pclusterpages; ++i) { - if (pcl->compressed_bvecs[i].page == page) { + for (i = 0; i < pclusterpages; ++i) { + if (pcl->compressed_bvecs[i].page == &folio->page) { WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL); - ret = 1; + ret = true; break; } } erofs_workgroup_unfreeze(&pcl->obj, 1); + if (ret) - detach_page_private(page); + folio_detach_private(folio); return ret; } +/* + * It will be called only on inode eviction. In case that there are still some + * decompression requests in progress, wait with rescheduling for a bit here. + * An extra lock could be introduced instead but it seems unnecessary. + */ +static void z_erofs_cache_invalidate_folio(struct folio *folio, + size_t offset, size_t length) +{ + const size_t stop = length + offset; + + /* Check for potential overflow in debug mode */ + DBG_BUGON(stop > folio_size(folio) || stop < length); + + if (offset == 0 && stop == folio_size(folio)) + while (!z_erofs_cache_release_folio(folio, GFP_NOFS)) + cond_resched(); +} + +static const struct address_space_operations z_erofs_cache_aops = { + .release_folio = z_erofs_cache_release_folio, + .invalidate_folio = z_erofs_cache_invalidate_folio, +}; + +int erofs_init_managed_cache(struct super_block *sb) +{ + struct inode *const inode = new_inode(sb); + + if (!inode) + return -ENOMEM; + + set_nlink(inode, 1); + inode->i_size = OFFSET_MAX; + inode->i_mapping->a_ops = &z_erofs_cache_aops; + mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); + EROFS_SB(sb)->managed_cache = inode; + return 0; +} + static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe, struct z_erofs_bvec *bvec) { @@ -717,7 +759,8 @@ static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe, !fe->candidate_bvpage) fe->candidate_bvpage = bvec->page; } - ret = z_erofs_bvec_enqueue(&fe->biter, bvec, &fe->candidate_bvpage); + ret = z_erofs_bvec_enqueue(&fe->biter, bvec, &fe->candidate_bvpage, + &fe->pagepool); fe->pcl->vcnt += (ret >= 0); return ret; } @@ -743,20 +786,20 @@ static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f) static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe) { struct erofs_map_blocks *map = &fe->map; + struct super_block *sb = fe->inode->i_sb; bool ztailpacking = map->m_flags & EROFS_MAP_META; struct z_erofs_pcluster *pcl; struct erofs_workgroup *grp; int err; if (!(map->m_flags & EROFS_MAP_ENCODED) || - (!ztailpacking && !(map->m_pa >> PAGE_SHIFT))) { + (!ztailpacking && !erofs_blknr(sb, map->m_pa))) { DBG_BUGON(1); return -EFSCORRUPTED; } /* no available pcluster, let's allocate one */ - pcl = z_erofs_alloc_pcluster(ztailpacking ? 
1 : - map->m_plen >> PAGE_SHIFT); + pcl = z_erofs_alloc_pcluster(map->m_plen); if (IS_ERR(pcl)) return PTR_ERR(pcl); @@ -779,10 +822,8 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe) if (ztailpacking) { pcl->obj.index = 0; /* which indicates ztailpacking */ - pcl->pageofs_in = erofs_blkoff(fe->inode->i_sb, map->m_pa); - pcl->tailpacking_size = map->m_plen; } else { - pcl->obj.index = map->m_pa >> PAGE_SHIFT; + pcl->obj.index = erofs_blknr(sb, map->m_pa); grp = erofs_insert_workgroup(fe->inode->i_sb, &pcl->obj); if (IS_ERR(grp)) { @@ -807,9 +848,11 @@ err_out: return err; } -static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe) +static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe) { struct erofs_map_blocks *map = &fe->map; + struct super_block *sb = fe->inode->i_sb; + erofs_blk_t blknr = erofs_blknr(sb, map->m_pa); struct erofs_workgroup *grp = NULL; int ret; @@ -819,8 +862,7 @@ static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe) DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_NIL); if (!(map->m_flags & EROFS_MAP_META)) { - grp = erofs_find_workgroup(fe->inode->i_sb, - map->m_pa >> PAGE_SHIFT); + grp = erofs_find_workgroup(sb, blknr); } else if ((map->m_pa & ~PAGE_MASK) + map->m_plen > PAGE_SIZE) { DBG_BUGON(1); return -EFSCORRUPTED; @@ -839,9 +881,27 @@ static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe) } else if (ret) { return ret; } + z_erofs_bvec_iter_begin(&fe->biter, &fe->pcl->bvset, Z_EROFS_INLINE_BVECS, fe->pcl->vcnt); - /* since file-backed online pages are traversed in reverse order */ + if (!z_erofs_is_inline_pcluster(fe->pcl)) { + /* bind cache first when cached decompression is preferred */ + z_erofs_bind_cache(fe); + } else { + void *mptr; + + mptr = erofs_read_metabuf(&map->buf, sb, blknr, EROFS_NO_KMAP); + if (IS_ERR(mptr)) { + ret = PTR_ERR(mptr); + erofs_err(sb, "failed to get inline data %d", ret); + return ret; + } + get_page(map->buf.page); + WRITE_ONCE(fe->pcl->compressed_bvecs[0].page, map->buf.page); + fe->pcl->pageofs_in = map->m_pa & ~PAGE_MASK; + fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE; + } + /* file-backed inplace I/O pages are traversed in reverse order */ fe->icur = z_erofs_pclusterpages(fe->pcl); return 0; } @@ -864,20 +924,18 @@ void erofs_workgroup_free_rcu(struct erofs_workgroup *grp) call_rcu(&pcl->rcu, z_erofs_rcu_callback); } -static bool z_erofs_collector_end(struct z_erofs_decompress_frontend *fe) +static void z_erofs_pcluster_end(struct z_erofs_decompress_frontend *fe) { struct z_erofs_pcluster *pcl = fe->pcl; if (!pcl) - return false; + return; z_erofs_bvec_iter_end(&fe->biter); mutex_unlock(&pcl->lock); - if (fe->candidate_bvpage) { - DBG_BUGON(z_erofs_is_shortlived_page(fe->candidate_bvpage)); + if (fe->candidate_bvpage) fe->candidate_bvpage = NULL; - } /* * if all pending pages are added, don't hold its reference @@ -887,25 +945,22 @@ static bool z_erofs_collector_end(struct z_erofs_decompress_frontend *fe) erofs_workgroup_put(&pcl->obj); fe->pcl = NULL; - return true; + fe->backmost = false; } -static int z_erofs_read_fragment(struct inode *inode, erofs_off_t pos, - struct page *page, unsigned int pageofs, - unsigned int len) +static int z_erofs_read_fragment(struct super_block *sb, struct page *page, + unsigned int cur, unsigned int end, erofs_off_t pos) { - struct super_block *sb = inode->i_sb; - struct inode *packed_inode = EROFS_I_SB(inode)->packed_inode; + struct inode *packed_inode = EROFS_SB(sb)->packed_inode; struct 
erofs_buf buf = __EROFS_BUF_INITIALIZER; - u8 *src, *dst; - unsigned int i, cnt; + unsigned int cnt; + u8 *src; if (!packed_inode) return -EFSCORRUPTED; - pos += EROFS_I(inode)->z_fragmentoff; - for (i = 0; i < len; i += cnt) { - cnt = min_t(unsigned int, len - i, + for (; cur < end; cur += cnt, pos += cnt) { + cnt = min_t(unsigned int, end - cur, sb->s_blocksize - erofs_blkoff(sb, pos)); src = erofs_bread(&buf, packed_inode, erofs_blknr(sb, pos), EROFS_KMAP); @@ -913,80 +968,66 @@ static int z_erofs_read_fragment(struct inode *inode, erofs_off_t pos, erofs_put_metabuf(&buf); return PTR_ERR(src); } - - dst = kmap_local_page(page); - memcpy(dst + pageofs + i, src + erofs_blkoff(sb, pos), cnt); - kunmap_local(dst); - pos += cnt; + memcpy_to_page(page, cur, src + erofs_blkoff(sb, pos), cnt); } erofs_put_metabuf(&buf); return 0; } static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, - struct page *page, struct page **pagepool) + struct page *page) { struct inode *const inode = fe->inode; struct erofs_map_blocks *const map = &fe->map; const loff_t offset = page_offset(page); + const unsigned int bs = i_blocksize(inode); bool tight = true, exclusive; - unsigned int cur, end, spiltted; + unsigned int cur, end, len, split; int err = 0; - /* register locked file pages as online pages in pack */ z_erofs_onlinepage_init(page); - - spiltted = 0; + split = 0; end = PAGE_SIZE; repeat: - cur = end - 1; - - if (offset + cur < map->m_la || - offset + cur >= map->m_la + map->m_llen) { - erofs_dbg("out-of-range map @ pos %llu", offset + cur); - - if (z_erofs_collector_end(fe)) - fe->backmost = false; - map->m_la = offset + cur; + if (offset + end - 1 < map->m_la || + offset + end - 1 >= map->m_la + map->m_llen) { + erofs_dbg("out-of-range map @ pos %llu", offset + end - 1); + z_erofs_pcluster_end(fe); + map->m_la = offset + end - 1; map->m_llen = 0; err = z_erofs_map_blocks_iter(inode, map, 0); if (err) goto out; - } else { - if (fe->pcl) - goto hitted; - /* didn't get a valid pcluster previously (very rare) */ } - if (!(map->m_flags & EROFS_MAP_MAPPED) || - map->m_flags & EROFS_MAP_FRAGMENT) - goto hitted; + cur = offset > map->m_la ? 
0 : map->m_la - offset; + /* bump split parts first to avoid several separate cases */ + ++split; - err = z_erofs_collector_begin(fe); - if (err) - goto out; + if (!(map->m_flags & EROFS_MAP_MAPPED)) { + zero_user_segment(page, cur, end); + tight = false; + goto next_part; + } - if (z_erofs_is_inline_pcluster(fe->pcl)) { - void *mp; + if (map->m_flags & EROFS_MAP_FRAGMENT) { + erofs_off_t fpos = offset + cur - map->m_la; - mp = erofs_read_metabuf(&fe->map.buf, inode->i_sb, - erofs_blknr(inode->i_sb, map->m_pa), - EROFS_NO_KMAP); - if (IS_ERR(mp)) { - err = PTR_ERR(mp); - erofs_err(inode->i_sb, - "failed to get inline page, err %d", err); + len = min_t(unsigned int, map->m_llen - fpos, end - cur); + err = z_erofs_read_fragment(inode->i_sb, page, cur, cur + len, + EROFS_I(inode)->z_fragmentoff + fpos); + if (err) goto out; - } - get_page(fe->map.buf.page); - WRITE_ONCE(fe->pcl->compressed_bvecs[0].page, - fe->map.buf.page); - fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE; - } else { - /* bind cache first when cached decompression is preferred */ - z_erofs_bind_cache(fe, pagepool); + tight = false; + goto next_part; } -hitted: + + if (!fe->pcl) { + err = z_erofs_pcluster_begin(fe); + if (err) + goto out; + } + /* * Ensure the current partial page belongs to this submit chain rather * than other concurrent submit chains or the noio(bypass) chain since @@ -994,59 +1035,19 @@ hitted: * for inplace I/O or bvpage (should be processed in a strict order.) */ tight &= (fe->mode > Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE); - - cur = end - min_t(erofs_off_t, offset + end - map->m_la, end); - if (!(map->m_flags & EROFS_MAP_MAPPED)) { - zero_user_segment(page, cur, end); - ++spiltted; - tight = false; - goto next_part; - } - if (map->m_flags & EROFS_MAP_FRAGMENT) { - unsigned int pageofs, skip, len; - - if (offset > map->m_la) { - pageofs = 0; - skip = offset - map->m_la; - } else { - pageofs = map->m_la & ~PAGE_MASK; - skip = 0; - } - len = min_t(unsigned int, map->m_llen - skip, end - cur); - err = z_erofs_read_fragment(inode, skip, page, pageofs, len); - if (err) - goto out; - ++spiltted; - tight = false; - goto next_part; - } - - exclusive = (!cur && (!spiltted || tight)); + exclusive = (!cur && ((split <= 1) || (tight && bs == PAGE_SIZE))); if (cur) tight &= (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED); -retry: err = z_erofs_attach_page(fe, &((struct z_erofs_bvec) { .page = page, .offset = offset - map->m_la, .end = end, }), exclusive); - /* should allocate an additional short-lived page for bvset */ - if (err == -EAGAIN && !fe->candidate_bvpage) { - fe->candidate_bvpage = alloc_page(GFP_NOFS | __GFP_NOFAIL); - set_page_private(fe->candidate_bvpage, - Z_EROFS_SHORTLIVED_PAGE); - goto retry; - } - - if (err) { - DBG_BUGON(err == -EAGAIN && fe->candidate_bvpage); + if (err) goto out; - } z_erofs_onlinepage_split(page); - /* bump up the number of spiltted parts of a page */ - ++spiltted; if (fe->pcl->pageofs_out != (map->m_la & ~PAGE_MASK)) fe->pcl->multibases = true; if (fe->pcl->length < offset + end - map->m_la) { @@ -1071,12 +1072,12 @@ out: z_erofs_page_mark_eio(page); z_erofs_onlinepage_endio(page); - erofs_dbg("%s, finish page: %pK spiltted: %u map->m_llen %llu", - __func__, page, spiltted, map->m_llen); + erofs_dbg("%s, finish page: %pK split: %u map->m_llen %llu", + __func__, page, split, map->m_llen); return err; } -static bool z_erofs_get_sync_decompress_policy(struct erofs_sb_info *sbi, +static bool z_erofs_is_sync_decompress(struct erofs_sb_info *sbi, unsigned int readahead_pages) { /* auto: enable 
for read_folio, disable for readahead */ @@ -1256,8 +1257,9 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, struct erofs_sb_info *const sbi = EROFS_SB(be->sb); struct z_erofs_pcluster *pcl = be->pcl; unsigned int pclusterpages = z_erofs_pclusterpages(pcl); - unsigned int i, inputsize; - int err2; + const struct z_erofs_decompressor *decompressor = + &erofs_decompressors[pcl->algorithmformat]; + int i, err2; struct page *page; bool overlapped; @@ -1294,18 +1296,13 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, if (err) goto out; - if (z_erofs_is_inline_pcluster(pcl)) - inputsize = pcl->tailpacking_size; - else - inputsize = pclusterpages * PAGE_SIZE; - - err = z_erofs_decompress(&(struct z_erofs_decompress_req) { + err = decompressor->decompress(&(struct z_erofs_decompress_req) { .sb = be->sb, .in = be->compressed_pages, .out = be->decompressed_pages, .pageofs_in = pcl->pageofs_in, .pageofs_out = pcl->pageofs_out, - .inputsize = inputsize, + .inputsize = pcl->pclustersize, .outputsize = pcl->length, .alg = pcl->algorithmformat, .inplace_io = overlapped, @@ -1321,12 +1318,11 @@ out: put_page(page); } else { for (i = 0; i < pclusterpages; ++i) { - page = pcl->compressed_bvecs[i].page; + /* consider shortlived pages added when decompressing */ + page = be->compressed_pages[i]; if (erofs_page_is_managed(sbi, page)) continue; - - /* recycle all individual short-lived pages */ (void)z_erofs_put_shortlivedpage(be->pagepool, page); WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL); } @@ -1446,86 +1442,85 @@ static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io, z_erofs_decompressqueue_work(&io->u.work); } -static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl, - unsigned int nr, - struct page **pagepool, - struct address_space *mc) +static void z_erofs_fill_bio_vec(struct bio_vec *bvec, + struct z_erofs_decompress_frontend *f, + struct z_erofs_pcluster *pcl, + unsigned int nr, + struct address_space *mc) { - const pgoff_t index = pcl->obj.index; gfp_t gfp = mapping_gfp_mask(mc); bool tocache = false; - + struct z_erofs_bvec *zbv = pcl->compressed_bvecs + nr; struct address_space *mapping; - struct page *oldpage, *page; - int justfound; + struct page *page, *oldpage; + int justfound, bs = i_blocksize(f->inode); + /* Except for inplace pages, the entire page can be used for I/Os */ + bvec->bv_offset = 0; + bvec->bv_len = PAGE_SIZE; repeat: - page = READ_ONCE(pcl->compressed_bvecs[nr].page); - oldpage = page; - - if (!page) + oldpage = READ_ONCE(zbv->page); + if (!oldpage) goto out_allocpage; - justfound = (unsigned long)page & 1UL; - page = (struct page *)((unsigned long)page & ~1UL); + justfound = (unsigned long)oldpage & 1UL; + page = (struct page *)((unsigned long)oldpage & ~1UL); + bvec->bv_page = page; + DBG_BUGON(z_erofs_is_shortlived_page(page)); /* - * preallocated cached pages, which is used to avoid direct reclaim - * otherwise, it will go inplace I/O path instead. + * Handle preallocated cached pages. We tried to allocate such pages + * without triggering direct reclaim. If allocation failed, inplace + * file-backed pages will be used instead. 
*/ if (page->private == Z_EROFS_PREALLOCATED_PAGE) { - WRITE_ONCE(pcl->compressed_bvecs[nr].page, page); set_page_private(page, 0); + WRITE_ONCE(zbv->page, page); tocache = true; goto out_tocache; } + mapping = READ_ONCE(page->mapping); - /* - * file-backed online pages in plcuster are all locked steady, - * therefore it is impossible for `mapping' to be NULL. + * File-backed pages for inplace I/Os are all locked steady, + * therefore it is impossible for `mapping` to be NULL. */ - if (mapping && mapping != mc) - /* ought to be unmanaged pages */ - goto out; - - /* directly return for shortlived page as well */ - if (z_erofs_is_shortlived_page(page)) - goto out; + if (mapping && mapping != mc) { + if (zbv->offset < 0) + bvec->bv_offset = round_up(-zbv->offset, bs); + bvec->bv_len = round_up(zbv->end, bs) - bvec->bv_offset; + return; + } lock_page(page); - /* only true if page reclaim goes wrong, should never happen */ DBG_BUGON(justfound && PagePrivate(page)); - /* the page is still in manage cache */ + /* the cached page is still in managed cache */ if (page->mapping == mc) { - WRITE_ONCE(pcl->compressed_bvecs[nr].page, page); - + WRITE_ONCE(zbv->page, page); + /* + * The cached page is still available but without a valid + * `->private` pcluster hint. Let's reconnect them. + */ if (!PagePrivate(page)) { - /* - * impossible to be !PagePrivate(page) for - * the current restriction as well if - * the page is already in compressed_bvecs[]. - */ DBG_BUGON(!justfound); - - justfound = 0; - set_page_private(page, (unsigned long)pcl); - SetPagePrivate(page); + /* compressed_bvecs[] already takes a ref */ + attach_page_private(page, pcl); + put_page(page); } - /* no need to submit io if it is already up-to-date */ + /* no need to submit if it is already up-to-date */ if (PageUptodate(page)) { unlock_page(page); - page = NULL; + bvec->bv_page = NULL; } - goto out; + return; } /* - * the managed page has been truncated, it's unsafe to - * reuse this one, let's allocate a new cache-managed page. + * It has been truncated, so it's unsafe to reuse this one. Let's + * allocate a new page for compressed data. 
*/ DBG_BUGON(page->mapping); DBG_BUGON(!justfound); @@ -1534,25 +1529,23 @@ repeat: unlock_page(page); put_page(page); out_allocpage: - page = erofs_allocpage(pagepool, gfp | __GFP_NOFAIL); - if (oldpage != cmpxchg(&pcl->compressed_bvecs[nr].page, - oldpage, page)) { - erofs_pagepool_add(pagepool, page); + page = erofs_allocpage(&f->pagepool, gfp | __GFP_NOFAIL); + if (oldpage != cmpxchg(&zbv->page, oldpage, page)) { + erofs_pagepool_add(&f->pagepool, page); cond_resched(); goto repeat; } + bvec->bv_page = page; out_tocache: - if (!tocache || add_to_page_cache_lru(page, mc, index + nr, gfp)) { - /* turn into temporary page if fails (1 ref) */ + if (!tocache || bs != PAGE_SIZE || + add_to_page_cache_lru(page, mc, pcl->obj.index + nr, gfp)) { + /* turn into a temporary shortlived page (1 ref) */ set_page_private(page, Z_EROFS_SHORTLIVED_PAGE); - goto out; + return; } attach_page_private(page, pcl); - /* drop a refcount added by allocpage (then we have 2 refs here) */ + /* drop a refcount added by allocpage (then 2 refs in total here) */ put_page(page); - -out: /* the only exit (for tracing and debugging) */ - return page; } static struct z_erofs_decompressqueue *jobqueue_init(struct super_block *sb, @@ -1607,7 +1600,7 @@ static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl, qtail[JQ_BYPASS] = &pcl->next; } -static void z_erofs_decompressqueue_endio(struct bio *bio) +static void z_erofs_submissionqueue_endio(struct bio *bio) { struct z_erofs_decompressqueue *q = bio->bi_private; blk_status_t err = bio->bi_status; @@ -1619,7 +1612,6 @@ static void z_erofs_decompressqueue_endio(struct bio *bio) DBG_BUGON(PageUptodate(page)); DBG_BUGON(z_erofs_page_is_invalidated(page)); - if (erofs_page_is_managed(EROFS_SB(q->sb), page)) { if (!err) SetPageUptodate(page); @@ -1633,9 +1625,8 @@ static void z_erofs_decompressqueue_endio(struct bio *bio) } static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, - struct page **pagepool, struct z_erofs_decompressqueue *fgq, - bool *force_fg) + bool *force_fg, bool readahead) { struct super_block *sb = f->inode->i_sb; struct address_space *mc = MNGD_MAPPING(EROFS_SB(sb)); @@ -1643,17 +1634,14 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, struct z_erofs_decompressqueue *q[NR_JOBQUEUES]; z_erofs_next_pcluster_t owned_head = f->owned_head; /* bio is NULL initially, so no need to initialize last_{index,bdev} */ - pgoff_t last_index; + erofs_off_t last_pa; struct block_device *last_bdev; unsigned int nr_bios = 0; struct bio *bio = NULL; unsigned long pflags; int memstall = 0; - /* - * if managed cache is enabled, bypass jobqueue is needed, - * no need to read from device for all pclusters in this queue. - */ + /* No need to read from device for pclusters in the bypass queue. 
*/ q[JQ_BYPASS] = jobqueue_init(sb, fgq + JQ_BYPASS, NULL); q[JQ_SUBMIT] = jobqueue_init(sb, fgq + JQ_SUBMIT, force_fg); @@ -1666,7 +1654,8 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, do { struct erofs_map_dev mdev; struct z_erofs_pcluster *pcl; - pgoff_t cur, end; + erofs_off_t cur, end; + struct bio_vec bvec; unsigned int i = 0; bool bypass = true; @@ -1685,18 +1674,14 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, }; (void)erofs_map_dev(sb, &mdev); - cur = erofs_blknr(sb, mdev.m_pa); - end = cur + pcl->pclusterpages; - + cur = mdev.m_pa; + end = cur + pcl->pclustersize; do { - struct page *page; - - page = pickup_page_for_submission(pcl, i++, pagepool, - mc); - if (!page) + z_erofs_fill_bio_vec(&bvec, f, pcl, i++, mc); + if (!bvec.bv_page) continue; - if (bio && (cur != last_index + 1 || + if (bio && (cur != last_pa || last_bdev != mdev.m_bdev)) { submit_bio_retry: submit_bio(bio); @@ -1707,7 +1692,8 @@ submit_bio_retry: bio = NULL; } - if (unlikely(PageWorkingset(page)) && !memstall) { + if (unlikely(PageWorkingset(bvec.bv_page)) && + !memstall) { psi_memstall_enter(&pflags); memstall = 1; } @@ -1715,23 +1701,24 @@ submit_bio_retry: if (!bio) { bio = bio_alloc(mdev.m_bdev, BIO_MAX_VECS, REQ_OP_READ, GFP_NOIO); - bio->bi_end_io = z_erofs_decompressqueue_endio; - - last_bdev = mdev.m_bdev; - bio->bi_iter.bi_sector = (sector_t)cur << - (sb->s_blocksize_bits - 9); + bio->bi_end_io = z_erofs_submissionqueue_endio; + bio->bi_iter.bi_sector = cur >> 9; bio->bi_private = q[JQ_SUBMIT]; - if (f->readahead) + if (readahead) bio->bi_opf |= REQ_RAHEAD; ++nr_bios; + last_bdev = mdev.m_bdev; } - if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) + if (cur + bvec.bv_len > end) + bvec.bv_len = end - cur; + if (!bio_add_page(bio, bvec.bv_page, bvec.bv_len, + bvec.bv_offset)) goto submit_bio_retry; - last_index = cur; + last_pa = cur + bvec.bv_len; bypass = false; - } while (++cur < end); + } while ((cur += bvec.bv_len) < end); if (!bypass) qtail[JQ_SUBMIT] = &pcl->next; @@ -1757,16 +1744,16 @@ submit_bio_retry: } static void z_erofs_runqueue(struct z_erofs_decompress_frontend *f, - struct page **pagepool, bool force_fg) + bool force_fg, bool ra) { struct z_erofs_decompressqueue io[NR_JOBQUEUES]; if (f->owned_head == Z_EROFS_PCLUSTER_TAIL) return; - z_erofs_submit_queue(f, pagepool, io, &force_fg); + z_erofs_submit_queue(f, io, &force_fg, ra); /* handle bypass queue (no i/o pclusters) immediately */ - z_erofs_decompress_queue(&io[JQ_BYPASS], pagepool); + z_erofs_decompress_queue(&io[JQ_BYPASS], &f->pagepool); if (!force_fg) return; @@ -1775,7 +1762,7 @@ static void z_erofs_runqueue(struct z_erofs_decompress_frontend *f, wait_for_completion_io(&io[JQ_SUBMIT].u.done); /* handle synchronous decompress queue in the caller context */ - z_erofs_decompress_queue(&io[JQ_SUBMIT], pagepool); + z_erofs_decompress_queue(&io[JQ_SUBMIT], &f->pagepool); } /* @@ -1783,29 +1770,28 @@ static void z_erofs_runqueue(struct z_erofs_decompress_frontend *f, * approximate readmore strategies as a start. 
*/ static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f, - struct readahead_control *rac, - erofs_off_t end, - struct page **pagepool, - bool backmost) + struct readahead_control *rac, bool backmost) { struct inode *inode = f->inode; struct erofs_map_blocks *map = &f->map; - erofs_off_t cur; + erofs_off_t cur, end, headoffset = f->headoffset; int err; if (backmost) { + if (rac) + end = headoffset + readahead_length(rac) - 1; + else + end = headoffset + PAGE_SIZE - 1; map->m_la = end; err = z_erofs_map_blocks_iter(inode, map, EROFS_GET_BLOCKS_READMORE); if (err) return; - /* expend ra for the trailing edge if readahead */ + /* expand ra for the trailing edge if readahead */ if (rac) { - loff_t newstart = readahead_pos(rac); - cur = round_up(map->m_la + map->m_llen, PAGE_SIZE); - readahead_expand(rac, newstart, cur - newstart); + readahead_expand(rac, headoffset, cur - headoffset); return; } end = round_up(end, PAGE_SIZE); @@ -1826,7 +1812,7 @@ static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f, if (PageUptodate(page)) { unlock_page(page); } else { - err = z_erofs_do_read_page(f, page, pagepool); + err = z_erofs_do_read_page(f, page); if (err) erofs_err(inode->i_sb, "readmore error at page %lu @ nid %llu", @@ -1847,28 +1833,24 @@ static int z_erofs_read_folio(struct file *file, struct folio *folio) struct inode *const inode = page->mapping->host; struct erofs_sb_info *const sbi = EROFS_I_SB(inode); struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); - struct page *pagepool = NULL; int err; trace_erofs_readpage(page, false); f.headoffset = (erofs_off_t)page->index << PAGE_SHIFT; - z_erofs_pcluster_readmore(&f, NULL, f.headoffset + PAGE_SIZE - 1, - &pagepool, true); - err = z_erofs_do_read_page(&f, page, &pagepool); - z_erofs_pcluster_readmore(&f, NULL, 0, &pagepool, false); - - (void)z_erofs_collector_end(&f); + z_erofs_pcluster_readmore(&f, NULL, true); + err = z_erofs_do_read_page(&f, page); + z_erofs_pcluster_readmore(&f, NULL, false); + z_erofs_pcluster_end(&f); /* if some compressed cluster ready, need submit them anyway */ - z_erofs_runqueue(&f, &pagepool, - z_erofs_get_sync_decompress_policy(sbi, 0)); + z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, 0), false); if (err) erofs_err(inode->i_sb, "failed to read, err [%d]", err); erofs_put_metabuf(&f.map.buf); - erofs_release_pages(&pagepool); + erofs_release_pages(&f.pagepool); return err; } @@ -1877,14 +1859,12 @@ static void z_erofs_readahead(struct readahead_control *rac) struct inode *const inode = rac->mapping->host; struct erofs_sb_info *const sbi = EROFS_I_SB(inode); struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); - struct page *pagepool = NULL, *head = NULL, *page; + struct page *head = NULL, *page; unsigned int nr_pages; - f.readahead = true; f.headoffset = readahead_pos(rac); - z_erofs_pcluster_readmore(&f, rac, f.headoffset + - readahead_length(rac) - 1, &pagepool, true); + z_erofs_pcluster_readmore(&f, rac, true); nr_pages = readahead_count(rac); trace_erofs_readpages(inode, readahead_index(rac), nr_pages, false); @@ -1900,20 +1880,19 @@ static void z_erofs_readahead(struct readahead_control *rac) /* traversal in reverse order */ head = (void *)page_private(page); - err = z_erofs_do_read_page(&f, page, &pagepool); + err = z_erofs_do_read_page(&f, page); if (err) erofs_err(inode->i_sb, "readahead error at page %lu @ nid %llu", page->index, EROFS_I(inode)->nid); put_page(page); } - z_erofs_pcluster_readmore(&f, rac, 0, &pagepool, 
false); - (void)z_erofs_collector_end(&f); + z_erofs_pcluster_readmore(&f, rac, false); + z_erofs_pcluster_end(&f); - z_erofs_runqueue(&f, &pagepool, - z_erofs_get_sync_decompress_policy(sbi, nr_pages)); + z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, nr_pages), true); erofs_put_metabuf(&f.map.buf); - erofs_release_pages(&pagepool); + erofs_release_pages(&f.pagepool); } const struct address_space_operations z_erofs_aops = { diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 8973ccad707d..f5d3ba39dd42 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -101,29 +101,26 @@ static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m, } static unsigned int decode_compactedbits(unsigned int lobits, - unsigned int lomask, u8 *in, unsigned int pos, u8 *type) { const unsigned int v = get_unaligned_le32(in + pos / 8) >> (pos & 7); - const unsigned int lo = v & lomask; + const unsigned int lo = v & ((1 << lobits) - 1); *type = (v >> lobits) & 3; return lo; } -static int get_compacted_la_distance(unsigned int lclusterbits, +static int get_compacted_la_distance(unsigned int lobits, unsigned int encodebits, unsigned int vcnt, u8 *in, int i) { - const unsigned int lomask = (1 << lclusterbits) - 1; unsigned int lo, d1 = 0; u8 type; DBG_BUGON(i >= vcnt); do { - lo = decode_compactedbits(lclusterbits, lomask, - in, encodebits * i, &type); + lo = decode_compactedbits(lobits, in, encodebits * i, &type); if (type != Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) return d1; @@ -142,15 +139,14 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, { struct erofs_inode *const vi = EROFS_I(m->inode); const unsigned int lclusterbits = vi->z_logical_clusterbits; - const unsigned int lomask = (1 << lclusterbits) - 1; - unsigned int vcnt, base, lo, encodebits, nblk, eofs; + unsigned int vcnt, base, lo, lobits, encodebits, nblk, eofs; int i; u8 *in, type; bool big_pcluster; if (1 << amortizedshift == 4 && lclusterbits <= 14) vcnt = 2; - else if (1 << amortizedshift == 2 && lclusterbits == 12) + else if (1 << amortizedshift == 2 && lclusterbits <= 12) vcnt = 16; else return -EOPNOTSUPP; @@ -159,6 +155,7 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, m->nextpackoff = round_down(pos, vcnt << amortizedshift) + (vcnt << amortizedshift); big_pcluster = vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1; + lobits = max(lclusterbits, ilog2(Z_EROFS_VLE_DI_D0_CBLKCNT) + 1U); encodebits = ((vcnt << amortizedshift) - sizeof(__le32)) * 8 / vcnt; eofs = erofs_blkoff(m->inode->i_sb, pos); base = round_down(eofs, vcnt << amortizedshift); @@ -166,15 +163,14 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, i = (eofs - base) >> amortizedshift; - lo = decode_compactedbits(lclusterbits, lomask, - in, encodebits * i, &type); + lo = decode_compactedbits(lobits, in, encodebits * i, &type); m->type = type; if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) { m->clusterofs = 1 << lclusterbits; /* figure out lookahead_distance: delta[1] if needed */ if (lookahead) - m->delta[1] = get_compacted_la_distance(lclusterbits, + m->delta[1] = get_compacted_la_distance(lobits, encodebits, vcnt, in, i); if (lo & Z_EROFS_VLE_DI_D0_CBLKCNT) { if (!big_pcluster) { @@ -193,8 +189,8 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, * of which lo saves delta[1] rather than delta[0]. * Hence, get delta[0] by the previous lcluster indirectly. 
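 * (A worked note on the new lobits decoding above: since
 * Z_EROFS_VLE_DI_D0_CBLKCNT is bit 11, lobits evaluates to
 * max(lclusterbits, 12), so unlike the old (1 << lclusterbits) - 1
 * mask, the CBLKCNT marker bit cannot be truncated now that
 * lclusterbits <= 12 is accepted.)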
 */
-		lo = decode_compactedbits(lclusterbits, lomask,
-					  in, encodebits * (i - 1), &type);
+		lo = decode_compactedbits(lobits, in,
+					  encodebits * (i - 1), &type);
 		if (type != Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD)
 			lo = 0;
 		else if (lo & Z_EROFS_VLE_DI_D0_CBLKCNT)
@@ -209,8 +205,8 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m,
 		nblk = 1;
 		while (i > 0) {
 			--i;
-			lo = decode_compactedbits(lclusterbits, lomask,
-						  in, encodebits * i, &type);
+			lo = decode_compactedbits(lobits, in,
+						  encodebits * i, &type);
 			if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD)
 				i -= lo;
@@ -221,8 +217,8 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m,
 		nblk = 0;
 		while (i > 0) {
 			--i;
-			lo = decode_compactedbits(lclusterbits, lomask,
-						  in, encodebits * i, &type);
+			lo = decode_compactedbits(lobits, in,
+						  encodebits * i, &type);
 			if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) {
 				if (lo & Z_EROFS_VLE_DI_D0_CBLKCNT) {
 					--i;
diff --git a/fs/fuse/backing.c b/fs/fuse/backing.c
index e16457c75944..6ca74987f7da 100644
--- a/fs/fuse/backing.c
+++ b/fs/fuse/backing.c
@@ -1117,7 +1117,6 @@ int fuse_lookup_backing(struct fuse_bpf_args *fa, struct inode *dir,
 	struct kstat stat;
 	int err;
-	/* TODO this will not handle lookups over mount points */
 	inode_lock_nested(dir_backing_inode, I_MUTEX_PARENT);
 	backing_entry = lookup_one_len(entry->d_name.name, dir_backing_entry,
 				       strlen(entry->d_name.name));
@@ -1136,16 +1135,22 @@ int fuse_lookup_backing(struct fuse_bpf_args *fa, struct inode *dir,
 		return 0;
 	}
+	err = follow_down(&fuse_entry->backing_path);
+	if (err)
+		goto err_out;
+
 	err = vfs_getattr(&fuse_entry->backing_path, &stat,
			  STATX_BASIC_STATS, 0);
-	if (err) {
-		path_put_init(&fuse_entry->backing_path);
-		return err;
-	}
+	if (err)
+		goto err_out;
 	fuse_stat_to_attr(get_fuse_conn(dir), backing_entry->d_inode,
			  &stat, &feo->attr);
 	return 0;
+
+err_out:
+	path_put_init(&fuse_entry->backing_path);
+	return err;
 }
 int fuse_handle_backing(struct fuse_entry_bpf *feb, struct inode **backing_inode,
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 320d3b287ed0..6a6ff277501d 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2697,6 +2697,9 @@ enum bpf_text_poke_type {
 int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, void *addr1,
		       void *addr2);
+void bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke,
+			       struct bpf_prog *new, struct bpf_prog *old);
+
 void *bpf_arch_text_copy(void *dst, void *src, size_t len);
 int bpf_arch_text_invalidate(void *dst, size_t len);
diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h
index 0ff8ce8cd06a..7ccb05ba08ce 100644
--- a/include/linux/maple_tree.h
+++ b/include/linux/maple_tree.h
@@ -249,6 +249,8 @@ struct maple_tree {
 	struct maple_tree name = MTREE_INIT(name, 0)
 #define mtree_lock(mt) spin_lock((&(mt)->ma_lock))
+#define mtree_lock_nested(mas, subclass) \
+		spin_lock_nested((&(mas)->ma_lock), subclass)
 #define mtree_unlock(mt) spin_unlock((&(mt)->ma_lock))
 /*
@@ -320,6 +322,9 @@ int mtree_store(struct maple_tree *mt, unsigned long index,
		void *entry, gfp_t gfp);
 void *mtree_erase(struct maple_tree *mt, unsigned long index);
+int mtree_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp);
+int __mt_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp);
+
 void mtree_destroy(struct maple_tree *mt);
 void __mt_destroy(struct maple_tree *mt);
@@ -399,6 +404,8 @@ struct ma_wr_state {
 };
 #define mas_lock(mas) spin_lock(&((mas)->tree->ma_lock))
+#define mas_lock_nested(mas, subclass) \
+		spin_lock_nested(&((mas)->tree->ma_lock), subclass)
 #define mas_unlock(mas) spin_unlock(&((mas)->tree->ma_lock))
@@ -525,6 +532,22 @@ static inline void mas_reset(struct ma_state *mas)
  */
 #define mas_for_each(__mas, __entry, __max) \
	while (((__entry) = mas_find((__mas), (__max))) != NULL)
+/**
+ * __mas_set_range() - Set up Maple Tree operation state to a sub-range of the
+ * current location.
+ * @mas: Maple Tree operation state.
+ * @start: New start of range in the Maple Tree.
+ * @last: New end of range in the Maple Tree.
+ *
+ * Set the internal maple state values to a sub-range.
+ * Please use mas_set_range() if you do not know where you are in the tree.
+ */
+static inline void __mas_set_range(struct ma_state *mas, unsigned long start,
+		unsigned long last)
+{
+	mas->index = start;
+	mas->last = last;
+}
 /**
  * mas_set_range() - Set up Maple Tree operation state for a different index.
@@ -539,9 +562,8 @@ static inline void mas_reset(struct ma_state *mas)
 static inline void mas_set_range(struct ma_state *mas, unsigned long start,
		unsigned long last)
 {
-	mas->index = start;
-	mas->last = last;
-	mas->node = MAS_START;
+	__mas_set_range(mas, start, last);
+	mas->node = MAS_START;
 }
 /**
diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 445494c502ba..49dc95d21f01 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -243,7 +243,7 @@ static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net,
		break;
	case NFPROTO_BRIDGE:
 #ifdef CONFIG_NETFILTER_FAMILY_BRIDGE
-		hook_head = rcu_dereference(net->nf.hooks_bridge[hook]);
+		hook_head = rcu_dereference(get_nf_hooks_bridge(net)[hook]);
 #endif
		break;
	default:
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index 8c3587d5c308..6641c4543d18 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -188,6 +188,36 @@ struct net {
 #endif
 } __randomize_layout;
+/*
+ * To work around a KMI issue, hooks_bridge[] could not be
+ * added to struct netns_nf. Since the only use of netns_nf
+ * is embedded in struct net, struct ext_net is added to
+ * contain struct net plus the new field. Users of the new
+ * field must use get_nf_hooks_bridge() to access the field.
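+ *
+ * A hedged sketch of the allocation-side implication (the actual
+ * net_alloc() change is not part of this hunk): whoever creates a
+ * non-init namespace must allocate the enclosing object, e.g.
+ *
+ *	struct ext_net *ext = kzalloc(sizeof(*ext), GFP_KERNEL);
+ *	struct net *net = ext ? &ext->net : NULL;
+ *
+ * so that the container_of() in get_nf_hooks_bridge() below is valid.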
+ */
+struct ext_net {
+	struct net net;
+#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE
+	struct nf_hook_entries __rcu *hooks_bridge[NF_INET_NUMHOOKS];
+#endif
+	ANDROID_VENDOR_DATA(1);
+};
+
+#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE
+extern struct net init_net;
+extern struct nf_hook_entries **init_nf_hooks_bridgep;
+
+static inline struct nf_hook_entries __rcu **get_nf_hooks_bridge(const struct net *net)
+{
+	struct ext_net *ext_net;
+
+	if (net == &init_net)
+		return init_nf_hooks_bridgep;
+	ext_net = container_of(net, struct ext_net, net);
+	return ext_net->hooks_bridge;
+}
+#endif
+
 #include <linux/seq_file_net.h>
 /* Init's network namespace */
diff --git a/include/net/netns/netfilter.h b/include/net/netns/netfilter.h
index 3b7eb0cb1201..56c72117b5b3 100644
--- a/include/net/netns/netfilter.h
+++ b/include/net/netns/netfilter.h
@@ -22,9 +22,6 @@ struct netns_nf {
 #ifdef CONFIG_NETFILTER_FAMILY_ARP
	struct nf_hook_entries __rcu *hooks_arp[NF_ARP_NUMHOOKS];
 #endif
-#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE
-	struct nf_hook_entries __rcu *hooks_bridge[NF_INET_NUMHOOKS];
-#endif
 #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4)
	unsigned int defrag_ipv4_users;
 #endif
diff --git a/include/trace/hooks/dtask.h b/include/trace/hooks/dtask.h
index a63a2868e626..b51147089b2d 100644
--- a/include/trace/hooks/dtask.h
+++ b/include/trace/hooks/dtask.h
@@ -42,6 +42,9 @@ DECLARE_HOOK(android_vh_rtmutex_wait_start,
 DECLARE_HOOK(android_vh_rtmutex_wait_finish,
	TP_PROTO(struct rt_mutex_base *lock),
	TP_ARGS(lock));
+DECLARE_HOOK(android_vh_rt_mutex_steal,
+	TP_PROTO(int waiter_prio, int top_waiter_prio, bool *ret),
+	TP_ARGS(waiter_prio, top_waiter_prio, ret));
 DECLARE_HOOK(android_vh_rwsem_read_wait_start,
	TP_PROTO(struct rw_semaphore *sem),
diff --git a/init/Kconfig.gki b/init/Kconfig.gki
index 081b1cdc9c7e..1a17a3d6e27b 100644
--- a/init/Kconfig.gki
+++ b/init/Kconfig.gki
@@ -202,6 +202,7 @@ config GKI_HIDDEN_NET_CONFIGS
	select PAGE_POOL
	select NET_PTP_CLASSIFY
	select NET_DEVLINK
+	select NETFILTER_FAMILY_BRIDGE
	help
	  Dummy config option used to enable the networking hidden
	  config, required by various SoC platforms.
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 832b2659e96e..00f23febb9a7 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -997,11 +997,16 @@ static void prog_array_map_poke_untrack(struct bpf_map *map,
	mutex_unlock(&aux->poke_mutex);
 }
+void __weak bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke,
+				      struct bpf_prog *new, struct bpf_prog *old)
+{
+	WARN_ON_ONCE(1);
+}
+
 static void prog_array_map_poke_run(struct bpf_map *map, u32 key,
				    struct bpf_prog *old,
				    struct bpf_prog *new)
 {
-	u8 *old_addr, *new_addr, *old_bypass_addr;
	struct prog_poke_elem *elem;
	struct bpf_array_aux *aux;
@@ -1010,7 +1015,7 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key,
	list_for_each_entry(elem, &aux->poke_progs, list) {
		struct bpf_jit_poke_descriptor *poke;
-		int i, ret;
+		int i;
		for (i = 0; i < elem->aux->size_poke_tab; i++) {
			poke = &elem->aux->poke_tab[i];
@@ -1029,21 +1034,10 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key,
		 * activated, so tail call updates can arrive from here
		 * while JIT is still finishing its final fixup for
		 * non-activated poke entries.
-		 * 3) On program teardown, the program's kallsym entry gets
-		 *    removed out of RCU callback, but we can only untrack
-		 *    from sleepable context, therefore bpf_arch_text_poke()
-		 *    might not see that this is in BPF text section and
-		 *    bails out with -EINVAL.
As these are unreachable since - * RCU grace period already passed, we simply skip them. - * 4) Also programs reaching refcount of zero while patching + * 3) Also programs reaching refcount of zero while patching * is in progress is okay since we're protected under * poke_mutex and untrack the programs before the JIT - * buffer is freed. When we're still in the middle of - * patching and suddenly kallsyms entry of the program - * gets evicted, we just skip the rest which is fine due - * to point 3). - * 5) Any other error happening below from bpf_arch_text_poke() - * is a unexpected bug. + * buffer is freed. */ if (!READ_ONCE(poke->tailcall_target_stable)) continue; @@ -1053,39 +1047,7 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key, poke->tail_call.key != key) continue; - old_bypass_addr = old ? NULL : poke->bypass_addr; - old_addr = old ? (u8 *)old->bpf_func + poke->adj_off : NULL; - new_addr = new ? (u8 *)new->bpf_func + poke->adj_off : NULL; - - if (new) { - ret = bpf_arch_text_poke(poke->tailcall_target, - BPF_MOD_JUMP, - old_addr, new_addr); - BUG_ON(ret < 0 && ret != -EINVAL); - if (!old) { - ret = bpf_arch_text_poke(poke->tailcall_bypass, - BPF_MOD_JUMP, - poke->bypass_addr, - NULL); - BUG_ON(ret < 0 && ret != -EINVAL); - } - } else { - ret = bpf_arch_text_poke(poke->tailcall_bypass, - BPF_MOD_JUMP, - old_bypass_addr, - poke->bypass_addr); - BUG_ON(ret < 0 && ret != -EINVAL); - /* let other CPUs finish the execution of program - * so that it will not possible to expose them - * to invalid nop, stack unwind, nop state - */ - if (!ret) - synchronize_rcu(); - ret = bpf_arch_text_poke(poke->tailcall_target, - BPF_MOD_JUMP, - old_addr, NULL); - BUG_ON(ret < 0 && ret != -EINVAL); - } + bpf_arch_poke_desc_update(poke, new, old); } } } diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 71bb2e3440e2..09ca202ac480 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -43,6 +43,7 @@ u64 dma_direct_get_required_mask(struct device *dev) return (1ULL << (fls64(max_dma) - 1)) * 2 - 1; } +EXPORT_SYMBOL_GPL(dma_direct_get_required_mask); static gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, u64 *phys_limit) @@ -320,6 +321,7 @@ out_free_pages: __dma_direct_free_pages(dev, page, size); return NULL; } +EXPORT_SYMBOL_GPL(dma_direct_alloc); void dma_direct_free(struct device *dev, size_t size, void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs) @@ -365,6 +367,7 @@ void dma_direct_free(struct device *dev, size_t size, __dma_direct_free_pages(dev, dma_direct_to_page(dev, dma_addr), size); } +EXPORT_SYMBOL_GPL(dma_direct_free); struct page *dma_direct_alloc_pages(struct device *dev, size_t size, dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp) diff --git a/kernel/dma/ops_helpers.c b/kernel/dma/ops_helpers.c index af4a6ef48ce0..e28e1e17eaf5 100644 --- a/kernel/dma/ops_helpers.c +++ b/kernel/dma/ops_helpers.c @@ -27,6 +27,7 @@ int dma_common_get_sgtable(struct device *dev, struct sg_table *sgt, sg_set_page(sgt->sgl, page, PAGE_ALIGN(size), 0); return ret; } +EXPORT_SYMBOL_GPL(dma_common_get_sgtable); /* * Create userspace mapping for the DMA-coherent memory. 
@@ -57,6 +58,7 @@ int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, return -ENXIO; #endif /* CONFIG_MMU */ } +EXPORT_SYMBOL_GPL(dma_common_mmap); struct page *dma_common_alloc_pages(struct device *dev, size_t size, dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp) diff --git a/kernel/fork.c b/kernel/fork.c index 9ef103c05891..1109a10c5ccd 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -659,7 +659,6 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, int retval; unsigned long charge = 0; LIST_HEAD(uf); - MA_STATE(old_mas, &oldmm->mm_mt, 0, 0); MA_STATE(mas, &mm->mm_mt, 0, 0); uprobe_start_dup_mmap(); @@ -687,16 +686,23 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, goto out; khugepaged_fork(mm, oldmm); - retval = mas_expected_entries(&mas, oldmm->map_count); - if (retval) + /* Use __mt_dup() to efficiently build an identical maple tree. */ + retval = __mt_dup(&oldmm->mm_mt, &mm->mm_mt, GFP_KERNEL); + if (unlikely(retval)) goto out; mt_clear_in_rcu(mas.tree); - mas_for_each(&old_mas, mpnt, ULONG_MAX) { + mas_for_each(&mas, mpnt, ULONG_MAX) { struct file *file; vma_start_write(mpnt); if (mpnt->vm_flags & VM_DONTCOPY) { + __mas_set_range(&mas, mpnt->vm_start, mpnt->vm_end - 1); + mas_store_gfp(&mas, NULL, GFP_KERNEL); + if (unlikely(mas_is_err(&mas))) { + retval = -ENOMEM; + goto loop_out; + } vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt)); continue; } @@ -758,12 +764,13 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, if (is_vm_hugetlb_page(tmp)) hugetlb_dup_vma_private(tmp); - /* Link the vma into the MT */ + /* + * Link the vma into the MT. After using __mt_dup(), memory + * allocation is not necessary here, so it cannot fail. + */ mas.index = tmp->vm_start; mas.last = tmp->vm_end - 1; mas_store(&mas, tmp); - if (mas_is_err(&mas)) - goto fail_nomem_mas_store; mm->map_count++; if (!(tmp->vm_flags & VM_WIPEONFORK)) @@ -772,15 +779,28 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, if (tmp->vm_ops && tmp->vm_ops->open) tmp->vm_ops->open(tmp); - if (retval) + if (retval) { + mpnt = mas_find(&mas, ULONG_MAX); goto loop_out; + } } /* a new mm has just been created */ retval = arch_dup_mmap(oldmm, mm); loop_out: mas_destroy(&mas); - if (!retval) + if (!retval) { mt_set_in_rcu(mas.tree); + } else if (mpnt) { + /* + * The entire maple tree has already been duplicated. If the + * mmap duplication fails, mark the failure point with + * XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered, + * stop releasing VMAs that have not been duplicated after this + * point. 
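+		 * (A hedged note: XA_ZERO_ENTRY is the reserved "zero"
+		 * entry, so readers using the normal API observe NULL
+		 * there, while the exit path can test it with
+		 * xa_is_zero() to see where duplication stopped.)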
+		 */
+		mas_set_range(&mas, mpnt->vm_start, mpnt->vm_end - 1);
+		mas_store(&mas, XA_ZERO_ENTRY);
+	}
 out:
	mmap_write_unlock(mm);
	flush_tlb_mm(oldmm);
@@ -790,8 +810,6 @@ fail_uprobe_end:
	uprobe_end_dup_mmap();
	return retval;
-fail_nomem_mas_store:
-	unlink_anon_vmas(tmp);
 fail_nomem_anon_vma_fork:
	mpol_put(vma_policy(tmp));
 fail_nomem_policy:
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 351716fe9138..8207fdade4a8 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -391,9 +391,15 @@ static __always_inline int rt_mutex_waiter_equal(struct rt_mutex_waiter *left,
 static inline bool rt_mutex_steal(struct rt_mutex_waiter *waiter,
				  struct rt_mutex_waiter *top_waiter)
 {
+	bool ret = false;
+
	if (rt_mutex_waiter_less(waiter, top_waiter))
		return true;
+	trace_android_vh_rt_mutex_steal(waiter->prio, top_waiter->prio, &ret);
+	if (ret)
+		return true;
+
 #ifdef RT_MUTEX_BUILD_SPINLOCKS
	/*
	 * Note that RT tasks are excluded from same priority (lateral)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ffe8fb15145c..f0d68a636da1 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -96,6 +96,7 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
 * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
 */
 unsigned int sysctl_sched_min_granularity = 750000ULL;
+EXPORT_SYMBOL_GPL(sysctl_sched_min_granularity);
 static unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
 /*
@@ -105,6 +106,7 @@ static unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
 * (default: 0.75 msec)
 */
 unsigned int sysctl_sched_idle_min_granularity = 750000ULL;
+EXPORT_SYMBOL_GPL(sysctl_sched_idle_min_granularity);
 /*
 * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity
diff --git a/lib/maple_tree.c b/lib/maple_tree.c
index 826f7b8d5e05..1200ff73c1b0 100644
--- a/lib/maple_tree.c
+++ b/lib/maple_tree.c
@@ -4,6 +4,8 @@
 * Copyright (c) 2018-2022 Oracle Corporation
 * Authors: Liam R. Howlett
 *	    Matthew Wilcox
+ * Copyright (c) 2023 ByteDance
+ * Author: Peng Zhang
 */
 /*
@@ -158,6 +160,11 @@ static inline int mt_alloc_bulk(gfp_t gfp, size_t size, void **nodes)
	return kmem_cache_alloc_bulk(maple_node_cache, gfp, size, nodes);
 }
+static inline void mt_free_one(struct maple_node *node)
+{
+	kmem_cache_free(maple_node_cache, node);
+}
+
 static inline void mt_free_bulk(size_t size, void __rcu **nodes)
 {
	kmem_cache_free_bulk(maple_node_cache, size, (void **)nodes);
@@ -199,6 +206,11 @@ static unsigned int mas_mt_height(struct ma_state *mas)
	return mt_height(mas->tree);
 }
+static inline unsigned int mt_attr(struct maple_tree *mt)
+{
+	return mt->ma_flags & ~MT_FLAGS_HEIGHT_MASK;
+}
+
 static inline enum maple_type mte_node_type(const struct maple_enode *entry)
 {
	return ((unsigned long)entry >> MAPLE_NODE_TYPE_SHIFT) &
@@ -5702,7 +5714,7 @@ void mas_destroy(struct ma_state *mas)
			mt_free_bulk(count, (void __rcu **)&node->slot[1]);
			total -= count;
		}
-		kmem_cache_free(maple_node_cache, node);
+		mt_free_one(ma_mnode_ptr(node));
		total--;
	}
@@ -6527,6 +6539,278 @@ void *mtree_erase(struct maple_tree *mt, unsigned long index)
 }
 EXPORT_SYMBOL(mtree_erase);
+/*
+ * mas_dup_free() - Free an incomplete duplication of a tree.
+ * @mas: The maple state of an incomplete tree.
+ *
+ * The parameter @mas->node passed in indicates that the allocation failed on
+ * this node. This function frees all nodes starting from @mas->node in the
+ * reverse order of mas_dup_build().
There is no need to hold the source tree + * lock at this time. + */ +static void mas_dup_free(struct ma_state *mas) +{ + struct maple_node *node; + enum maple_type type; + void __rcu **slots; + unsigned char count, i; + + /* Maybe the first node allocation failed. */ + if (mas_is_none(mas)) + return; + + while (!mte_is_root(mas->node)) { + mas_ascend(mas); + if (mas->offset) { + mas->offset--; + do { + mas_descend(mas); + mas->offset = mas_data_end(mas); + } while (!mte_is_leaf(mas->node)); + + mas_ascend(mas); + } + + node = mte_to_node(mas->node); + type = mte_node_type(mas->node); + slots = ma_slots(node, type); + count = mas_data_end(mas) + 1; + for (i = 0; i < count; i++) + ((unsigned long *)slots)[i] &= ~MAPLE_NODE_MASK; + mt_free_bulk(count, slots); + } + + node = mte_to_node(mas->node); + mt_free_one(node); +} + +/* + * mas_copy_node() - Copy a maple node and replace the parent. + * @mas: The maple state of source tree. + * @new_mas: The maple state of new tree. + * @parent: The parent of the new node. + * + * Copy @mas->node to @new_mas->node, set @parent to be the parent of + * @new_mas->node. If memory allocation fails, @mas is set to -ENOMEM. + */ +static inline void mas_copy_node(struct ma_state *mas, struct ma_state *new_mas, + struct maple_pnode *parent) +{ + struct maple_node *node = mte_to_node(mas->node); + struct maple_node *new_node = mte_to_node(new_mas->node); + unsigned long val; + + /* Copy the node completely. */ + memcpy(new_node, node, sizeof(struct maple_node)); + /* Update the parent node pointer. */ + val = (unsigned long)node->parent & MAPLE_NODE_MASK; + new_node->parent = ma_parent_ptr(val | (unsigned long)parent); +} + +/* + * mas_dup_alloc() - Allocate child nodes for a maple node. + * @mas: The maple state of source tree. + * @new_mas: The maple state of new tree. + * @gfp: The GFP_FLAGS to use for allocations. + * + * This function allocates child nodes for @new_mas->node during the duplication + * process. If memory allocation fails, @mas is set to -ENOMEM. + */ +static inline void mas_dup_alloc(struct ma_state *mas, struct ma_state *new_mas, + gfp_t gfp) +{ + struct maple_node *node = mte_to_node(mas->node); + struct maple_node *new_node = mte_to_node(new_mas->node); + enum maple_type type; + unsigned char request, count, i; + void __rcu **slots; + void __rcu **new_slots; + unsigned long val; + + /* Allocate memory for child nodes. */ + type = mte_node_type(mas->node); + new_slots = ma_slots(new_node, type); + request = mas_data_end(mas) + 1; + count = mt_alloc_bulk(gfp, request, (void **)new_slots); + if (unlikely(count < request)) { + memset(new_slots, 0, request * sizeof(void *)); + mas_set_err(mas, -ENOMEM); + return; + } + + /* Restore node type information in slots. */ + slots = ma_slots(node, type); + for (i = 0; i < count; i++) { + val = (unsigned long)mt_slot_locked(mas->tree, slots, i); + val &= MAPLE_NODE_MASK; + ((unsigned long *)new_slots)[i] |= val; + } +} + +/* + * mas_dup_build() - Build a new maple tree from a source tree + * @mas: The maple state of source tree, need to be in MAS_START state. + * @new_mas: The maple state of new tree, need to be in MAS_START state. + * @gfp: The GFP_FLAGS to use for allocations. + * + * This function builds a new tree in DFS preorder. If the memory allocation + * fails, the error code -ENOMEM will be set in @mas, and @new_mas points to the + * last node. mas_dup_free() will free the incomplete duplication of a tree. 
+ *
+ * Note that the attributes of the two trees need to be exactly the same, and the
+ * new tree needs to be empty, otherwise -EINVAL will be set in @mas.
+ */
+static inline void mas_dup_build(struct ma_state *mas, struct ma_state *new_mas,
+		gfp_t gfp)
+{
+	struct maple_node *node;
+	struct maple_pnode *parent = NULL;
+	struct maple_enode *root;
+	enum maple_type type;
+
+	if (unlikely(mt_attr(mas->tree) != mt_attr(new_mas->tree)) ||
+	    unlikely(!mtree_empty(new_mas->tree))) {
+		mas_set_err(mas, -EINVAL);
+		return;
+	}
+
+	root = mas_start(mas);
+	if (mas_is_ptr(mas) || mas_is_none(mas))
+		goto set_new_tree;
+
+	node = mt_alloc_one(gfp);
+	if (!node) {
+		new_mas->node = MAS_NONE;
+		mas_set_err(mas, -ENOMEM);
+		return;
+	}
+
+	type = mte_node_type(mas->node);
+	root = mt_mk_node(node, type);
+	new_mas->node = root;
+	new_mas->min = 0;
+	new_mas->max = ULONG_MAX;
+	root = mte_mk_root(root);
+	while (1) {
+		mas_copy_node(mas, new_mas, parent);
+		if (!mte_is_leaf(mas->node)) {
+			/* Only allocate child nodes for non-leaf nodes. */
+			mas_dup_alloc(mas, new_mas, gfp);
+			if (unlikely(mas_is_err(mas)))
+				return;
+		} else {
+			/*
+			 * This is the last leaf node and duplication is
+			 * completed.
+			 */
+			if (mas->max == ULONG_MAX)
+				goto done;
+
+			/* This is not the last leaf node and needs to go up. */
+			do {
+				mas_ascend(mas);
+				mas_ascend(new_mas);
+			} while (mas->offset == mas_data_end(mas));
+
+			/* Move to the next subtree. */
+			mas->offset++;
+			new_mas->offset++;
+		}
+
+		mas_descend(mas);
+		parent = ma_parent_ptr(mte_to_node(new_mas->node));
+		mas_descend(new_mas);
+		mas->offset = 0;
+		new_mas->offset = 0;
+	}
+done:
+	/* Specially handle the parent of the root node. */
+	mte_to_node(root)->parent = ma_parent_ptr(mas_tree_parent(new_mas));
+set_new_tree:
+	/* Make them the same height */
+	new_mas->tree->ma_flags = mas->tree->ma_flags;
+	rcu_assign_pointer(new_mas->tree->ma_root, root);
+}
+
+/**
+ * __mt_dup(): Duplicate an entire maple tree
+ * @mt: The source maple tree
+ * @new: The new maple tree
+ * @gfp: The GFP_FLAGS to use for allocations
+ *
+ * This function duplicates a maple tree in Depth-First Search (DFS) pre-order
+ * traversal. It uses memcpy() to copy nodes in the source tree and allocate
+ * new child nodes in non-leaf nodes. The new node is exactly the same as the
+ * source node except for all the addresses stored in it. It will be faster than
+ * traversing all elements in the source tree and inserting them one by one into
+ * the new tree.
+ * The user needs to ensure that the attributes of the source tree and the new
+ * tree are the same, and the new tree needs to be an empty tree, otherwise
+ * -EINVAL will be returned.
+ * Note that the user needs to manually lock the source tree and the new tree.
+ *
+ * Return: 0 on success, -ENOMEM if memory could not be allocated, -EINVAL if
+ * the attributes of the two trees are different or the new tree is not an empty
+ * tree.
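+ *
+ * A minimal usage sketch, assuming both trees use MT_FLAGS_LOCK_EXTERN
+ * with rwsems as in check_forking() further below (GFP_KERNEL may
+ * sleep, so an internally spinlocked tree would need a non-sleeping
+ * gfp instead):
+ *
+ *	down_write(&mt_lock);
+ *	down_write_nested(&newmt_lock, SINGLE_DEPTH_NESTING);
+ *	ret = __mt_dup(&mt, &newmt, GFP_KERNEL);
+ *	up_write(&newmt_lock);
+ *	up_write(&mt_lock);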
+ */
+int __mt_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp)
+{
+	int ret = 0;
+	MA_STATE(mas, mt, 0, 0);
+	MA_STATE(new_mas, new, 0, 0);
+
+	mas_dup_build(&mas, &new_mas, gfp);
+	if (unlikely(mas_is_err(&mas))) {
+		ret = xa_err(mas.node);
+		if (ret == -ENOMEM)
+			mas_dup_free(&new_mas);
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL(__mt_dup);
+
+/**
+ * mtree_dup(): Duplicate an entire maple tree
+ * @mt: The source maple tree
+ * @new: The new maple tree
+ * @gfp: The GFP_FLAGS to use for allocations
+ *
+ * This function duplicates a maple tree in Depth-First Search (DFS) pre-order
+ * traversal. It uses memcpy() to copy nodes in the source tree and allocate
+ * new child nodes in non-leaf nodes. The new node is exactly the same as the
+ * source node except for all the addresses stored in it. It will be faster than
+ * traversing all elements in the source tree and inserting them one by one into
+ * the new tree.
+ * The user needs to ensure that the attributes of the source tree and the new
+ * tree are the same, and the new tree needs to be an empty tree, otherwise
+ * -EINVAL will be returned.
+ *
+ * Return: 0 on success, -ENOMEM if memory could not be allocated, -EINVAL if
+ * the attributes of the two trees are different or the new tree is not an empty
+ * tree.
+ */
+int mtree_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp)
+{
+	int ret = 0;
+	MA_STATE(mas, mt, 0, 0);
+	MA_STATE(new_mas, new, 0, 0);
+
+	mas_lock(&new_mas);
+	mas_lock_nested(&mas, SINGLE_DEPTH_NESTING);
+	mas_dup_build(&mas, &new_mas, gfp);
+	mas_unlock(&mas);
+	if (unlikely(mas_is_err(&mas))) {
+		ret = xa_err(mas.node);
+		if (ret == -ENOMEM)
+			mas_dup_free(&new_mas);
+	}
+
+	mas_unlock(&new_mas);
+	return ret;
+}
+EXPORT_SYMBOL(mtree_dup);
+
 /**
 * __mt_destroy() - Walk and free all nodes of a locked maple tree.
 * @mt: The maple tree
@@ -6541,7 +6825,7 @@ void __mt_destroy(struct maple_tree *mt)
	if (xa_is_node(root))
		mte_destroy_walk(root, mt);
-	mt->ma_flags = 0;
+	mt->ma_flags = mt_attr(mt);
 }
 EXPORT_SYMBOL_GPL(__mt_destroy);
diff --git a/lib/test_maple_tree.c b/lib/test_maple_tree.c
index ab9d4461abc9..68b2c387fddb 100644
--- a/lib/test_maple_tree.c
+++ b/lib/test_maple_tree.c
@@ -1671,47 +1671,48 @@ static noinline void __init bench_mt_for_each(struct maple_tree *mt)
 #endif
 /* check_forking - simulate the kernel forking sequence with the tree.
*/ -static noinline void __init check_forking(struct maple_tree *mt) +static noinline void __init check_forking(void) { - - struct maple_tree newmt; - int i, nr_entries = 134; + struct maple_tree mt, newmt; + int i, nr_entries = 134, ret; void *val; - MA_STATE(mas, mt, 0, 0); - MA_STATE(newmas, mt, 0, 0); - struct rw_semaphore newmt_lock; + MA_STATE(mas, &mt, 0, 0); + MA_STATE(newmas, &newmt, 0, 0); + struct rw_semaphore mt_lock, newmt_lock; + init_rwsem(&mt_lock); init_rwsem(&newmt_lock); - for (i = 0; i <= nr_entries; i++) - mtree_store_range(mt, i*10, i*10 + 5, - xa_mk_value(i), GFP_KERNEL); + mt_init_flags(&mt, MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN); + mt_set_external_lock(&mt, &mt_lock); - mt_set_non_kernel(99999); mt_init_flags(&newmt, MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN); mt_set_external_lock(&newmt, &newmt_lock); - newmas.tree = &newmt; - mas_reset(&newmas); - mas_reset(&mas); - down_write(&newmt_lock); - mas.index = 0; - mas.last = 0; - if (mas_expected_entries(&newmas, nr_entries)) { + + down_write(&mt_lock); + for (i = 0; i <= nr_entries; i++) { + mas_set_range(&mas, i*10, i*10 + 5); + mas_store_gfp(&mas, xa_mk_value(i), GFP_KERNEL); + } + + down_write_nested(&newmt_lock, SINGLE_DEPTH_NESTING); + ret = __mt_dup(&mt, &newmt, GFP_KERNEL); + if (ret) { pr_err("OOM!"); BUG_ON(1); } - rcu_read_lock(); - mas_for_each(&mas, val, ULONG_MAX) { - newmas.index = mas.index; - newmas.last = mas.last; + + mas_set(&newmas, 0); + mas_for_each(&newmas, val, ULONG_MAX) mas_store(&newmas, val); - } - rcu_read_unlock(); + mas_destroy(&newmas); + mas_destroy(&mas); mt_validate(&newmt); - mt_set_non_kernel(0); __mt_destroy(&newmt); + __mt_destroy(&mt); up_write(&newmt_lock); + up_write(&mt_lock); } static noinline void __init check_iteration(struct maple_tree *mt) @@ -1815,49 +1816,51 @@ static noinline void __init check_mas_store_gfp(struct maple_tree *mt) } #if defined(BENCH_FORK) -static noinline void __init bench_forking(struct maple_tree *mt) +static noinline void __init bench_forking(void) { - - struct maple_tree newmt; - int i, nr_entries = 134, nr_fork = 80000; + struct maple_tree mt, newmt; + int i, nr_entries = 134, nr_fork = 80000, ret; void *val; - MA_STATE(mas, mt, 0, 0); - MA_STATE(newmas, mt, 0, 0); - struct rw_semaphore newmt_lock; + MA_STATE(mas, &mt, 0, 0); + MA_STATE(newmas, &newmt, 0, 0); + struct rw_semaphore mt_lock, newmt_lock; + init_rwsem(&mt_lock); init_rwsem(&newmt_lock); - mt_set_external_lock(&newmt, &newmt_lock); - for (i = 0; i <= nr_entries; i++) - mtree_store_range(mt, i*10, i*10 + 5, - xa_mk_value(i), GFP_KERNEL); + mt_init_flags(&mt, MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN); + mt_set_external_lock(&mt, &mt_lock); + + down_write(&mt_lock); + for (i = 0; i <= nr_entries; i++) { + mas_set_range(&mas, i*10, i*10 + 5); + mas_store_gfp(&mas, xa_mk_value(i), GFP_KERNEL); + } for (i = 0; i < nr_fork; i++) { - mt_set_non_kernel(99999); - mt_init_flags(&newmt, MT_FLAGS_ALLOC_RANGE); - newmas.tree = &newmt; - mas_reset(&newmas); - mas_reset(&mas); - mas.index = 0; - mas.last = 0; - rcu_read_lock(); - down_write(&newmt_lock); - if (mas_expected_entries(&newmas, nr_entries)) { - printk("OOM!"); + mt_init_flags(&newmt, + MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN); + mt_set_external_lock(&newmt, &newmt_lock); + + down_write_nested(&newmt_lock, SINGLE_DEPTH_NESTING); + ret = __mt_dup(&mt, &newmt, GFP_KERNEL); + if (ret) { + pr_err("OOM!"); BUG_ON(1); } - mas_for_each(&mas, val, ULONG_MAX) { - newmas.index = mas.index; - newmas.last = mas.last; + + 
mas_set(&newmas, 0); + mas_for_each(&newmas, val, ULONG_MAX) mas_store(&newmas, val); - } + mas_destroy(&newmas); - rcu_read_unlock(); mt_validate(&newmt); - mt_set_non_kernel(0); __mt_destroy(&newmt); up_write(&newmt_lock); } + mas_destroy(&mas); + __mt_destroy(&mt); + up_write(&mt_lock); } #endif @@ -2741,10 +2744,6 @@ static int __init maple_tree_seed(void) pr_info("\nTEST STARTING\n\n"); - mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); - check_root_expand(&tree); - mtree_destroy(&tree); - #if defined(BENCH_SLOT_STORE) #define BENCH mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); @@ -2775,9 +2774,7 @@ static int __init maple_tree_seed(void) #endif #if defined(BENCH_FORK) #define BENCH - mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); - bench_forking(&tree); - mtree_destroy(&tree); + bench_forking(); goto skip; #endif #if defined(BENCH_MT_FOR_EACH) @@ -2789,13 +2786,15 @@ static int __init maple_tree_seed(void) #endif mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); - check_iteration(&tree); + check_root_expand(&tree); mtree_destroy(&tree); mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); - check_forking(&tree); + check_iteration(&tree); mtree_destroy(&tree); + check_forking(); + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); check_mas_store_gfp(&tree); mtree_destroy(&tree); diff --git a/mm/filemap.c b/mm/filemap.c index 2c9e93f3e62c..bdc1d5e17195 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3021,7 +3021,7 @@ static int lock_folio_maybe_drop_mmap(struct vm_fault *vmf, struct folio *folio, /* * NOTE! This will make us return with VM_FAULT_RETRY, but with - * the mmap_lock still held. That's how FAULT_FLAG_RETRY_NOWAIT + * the fault lock still held. That's how FAULT_FLAG_RETRY_NOWAIT * is supposed to work. We have way too many special cases.. */ if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT) @@ -3031,13 +3031,14 @@ static int lock_folio_maybe_drop_mmap(struct vm_fault *vmf, struct folio *folio, if (vmf->flags & FAULT_FLAG_KILLABLE) { if (__folio_lock_killable(folio)) { /* - * We didn't have the right flags to drop the mmap_lock, - * but all fault_handlers only check for fatal signals - * if we return VM_FAULT_RETRY, so we need to drop the - * mmap_lock here and return 0 if we don't have a fpin. + * We didn't have the right flags to drop the + * fault lock, but all fault_handlers only check + * for fatal signals if we return VM_FAULT_RETRY, + * so we need to drop the fault lock here and + * return 0 if we don't have a fpin. */ if (*fpin == NULL) - mmap_read_unlock(vmf->vma->vm_mm); + release_fault_lock(vmf); return 0; } } else diff --git a/mm/memory.c b/mm/memory.c index 56057f97afaf..5e161551a5d1 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -411,6 +411,8 @@ void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt, * be 0. This will underflow and is okay. 
*/ next = mas_find(&mas, ceiling - 1); + if (unlikely(xa_is_zero(next))) + next = NULL; /* * Hide vma from rmap and truncate_pagecache before freeing @@ -432,6 +434,8 @@ void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt, && !is_vm_hugetlb_page(next)) { vma = next; next = mas_find(&mas, ceiling - 1); + if (unlikely(xa_is_zero(next))) + next = NULL; if (mm_wr_locked) vma_start_write(vma); unlink_anon_vmas(vma); @@ -1736,7 +1740,8 @@ void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt, do { unmap_single_vma(tlb, vma, start_addr, end_addr, &details, mm_wr_locked); - } while ((vma = mas_find(&mas, end_t - 1)) != NULL); + vma = mas_find(&mas, end_t - 1); + } while (vma && likely(!xa_is_zero(vma))); mmu_notifier_invalidate_range_end(&range); } @@ -3099,6 +3104,36 @@ static inline void wp_page_reuse(struct vm_fault *vmf) count_vm_event(PGREUSE); } +/* + * We could add a bitflag somewhere, but for now, we know that all + * vm_ops that have a ->map_pages have been audited and don't need + * the mmap_lock to be held. + */ +static inline vm_fault_t vmf_can_call_fault(const struct vm_fault *vmf) +{ + struct vm_area_struct *vma = vmf->vma; + + if (vma->vm_ops->map_pages || !(vmf->flags & FAULT_FLAG_VMA_LOCK)) + return 0; + vma_end_read(vma); + return VM_FAULT_RETRY; +} + +static vm_fault_t vmf_anon_prepare(struct vm_fault *vmf) +{ + struct vm_area_struct *vma = vmf->vma; + + if (likely(vma->anon_vma)) + return 0; + if (vmf->flags & FAULT_FLAG_VMA_LOCK) { + vma_end_read(vma); + return VM_FAULT_RETRY; + } + if (__anon_vma_prepare(vma)) + return VM_FAULT_OOM; + return 0; +} + /* * Handle the case of a page which we actually need to copy to a new page, * either due to COW or unsharing. @@ -3126,12 +3161,13 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) pte_t entry; int page_copied = 0; struct mmu_notifier_range range; - int ret; + vm_fault_t ret; delayacct_wpcopy_start(); - if (unlikely(anon_vma_prepare(vma))) - goto oom; + ret = vmf_anon_prepare(vmf); + if (unlikely(ret)) + goto out; if (is_zero_pfn(pte_pfn(vmf->orig_pte))) { new_page = alloc_zeroed_user_highpage_movable(vma, @@ -3139,13 +3175,14 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) if (!new_page) goto oom; } else { + int err; new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address); if (!new_page) goto oom; - ret = __wp_page_copy_user(new_page, old_page, vmf); - if (ret) { + err = __wp_page_copy_user(new_page, old_page, vmf); + if (err) { /* * COW failed, if the fault was solved by other, * it's fine. If not, userspace would re-fault on @@ -3158,7 +3195,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) put_page(old_page); delayacct_wpcopy_end(); - return ret == -EHWPOISON ? VM_FAULT_HWPOISON : 0; + return err == -EHWPOISON ? 
VM_FAULT_HWPOISON : 0; } kmsan_copy_page_meta(new_page, old_page); } @@ -3271,11 +3308,13 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) oom_free_new: put_page(new_page); oom: + ret = VM_FAULT_OOM; +out: if (old_page) put_page(old_page); delayacct_wpcopy_end(); - return VM_FAULT_OOM; + return ret; } /** @@ -3324,10 +3363,9 @@ static vm_fault_t wp_pfn_shared(struct vm_fault *vmf) vm_fault_t ret; pte_unmap_unlock(vmf->pte, vmf->ptl); - if (vmf->flags & FAULT_FLAG_VMA_LOCK) { - vma_end_read(vmf->vma); - return VM_FAULT_RETRY; - } + ret = vmf_can_call_fault(vmf); + if (ret) + return ret; vmf->flags |= FAULT_FLAG_MKWRITE; ret = vma->vm_ops->pfn_mkwrite(vmf); @@ -3351,10 +3389,10 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf) vm_fault_t tmp; pte_unmap_unlock(vmf->pte, vmf->ptl); - if (vmf->flags & FAULT_FLAG_VMA_LOCK) { + tmp = vmf_can_call_fault(vmf); + if (tmp) { put_page(vmf->page); - vma_end_read(vmf->vma); - return VM_FAULT_RETRY; + return tmp; } tmp = do_page_mkwrite(vmf); @@ -3510,12 +3548,6 @@ reuse: return wp_page_shared(vmf); } copy: - if ((vmf->flags & FAULT_FLAG_VMA_LOCK) && !vma->anon_vma) { - pte_unmap_unlock(vmf->pte, vmf->ptl); - vma_end_read(vmf->vma); - return VM_FAULT_RETRY; - } - /* * Ok, we need to copy. Oh, well.. */ @@ -4623,10 +4655,9 @@ static vm_fault_t do_read_fault(struct vm_fault *vmf) return ret; } - if (vmf->flags & FAULT_FLAG_VMA_LOCK) { - vma_end_read(vmf->vma); - return VM_FAULT_RETRY; - } + ret = vmf_can_call_fault(vmf); + if (ret) + return ret; ret = __do_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) @@ -4644,13 +4675,11 @@ static vm_fault_t do_cow_fault(struct vm_fault *vmf) struct vm_area_struct *vma = vmf->vma; vm_fault_t ret; - if (vmf->flags & FAULT_FLAG_VMA_LOCK) { - vma_end_read(vma); - return VM_FAULT_RETRY; - } - - if (unlikely(anon_vma_prepare(vma))) - return VM_FAULT_OOM; + ret = vmf_can_call_fault(vmf); + if (!ret) + ret = vmf_anon_prepare(vmf); + if (ret) + return ret; vmf->cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address); if (!vmf->cow_page) @@ -4688,10 +4717,9 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf) struct vm_area_struct *vma = vmf->vma; vm_fault_t ret, tmp; - if (vmf->flags & FAULT_FLAG_VMA_LOCK) { - vma_end_read(vma); - return VM_FAULT_RETRY; - } + ret = vmf_can_call_fault(vmf); + if (ret) + return ret; ret = __do_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) @@ -5514,7 +5542,7 @@ retry: * concurrent mremap() with MREMAP_DONTUNMAP could dissociate the VMA * from its anon_vma. 
*/ - if (unlikely(!vma->anon_vma)) + if (vma_is_anonymous(vma) && !vma->anon_vma) goto inval_end_read; /* Check since vm_start/vm_end might change before we lock the VMA */ diff --git a/mm/migrate.c b/mm/migrate.c index 95dae53c8133..66d3cff06d3b 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -393,6 +393,7 @@ int folio_migrate_mapping(struct address_space *mapping, int dirty; int expected_count = folio_expected_refs(mapping, folio) + extra_count; long nr = folio_nr_pages(folio); + long entries, i; if (!mapping) { /* Anonymous page without mapping */ @@ -430,8 +431,10 @@ int folio_migrate_mapping(struct address_space *mapping, folio_set_swapcache(newfolio); newfolio->private = folio_get_private(folio); } + entries = nr; } else { VM_BUG_ON_FOLIO(folio_test_swapcache(folio), folio); + entries = 1; } /* Move dirty while page refs frozen and newpage not yet exposed */ @@ -441,7 +444,11 @@ int folio_migrate_mapping(struct address_space *mapping, folio_set_dirty(newfolio); } - xas_store(&xas, newfolio); + /* Swap cache still stores N entries instead of a high-order entry */ + for (i = 0; i < entries; i++) { + xas_store(&xas, newfolio); + xas_next(&xas); + } /* * Drop cache reference from old page by unfreezing diff --git a/mm/mmap.c b/mm/mmap.c index 6220edba2e7d..62324b21d721 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3316,10 +3316,11 @@ void exit_mmap(struct mm_struct *mm) arch_exit_mmap(mm); vma = mas_find(&mas, ULONG_MAX); - if (!vma) { + if (!vma || unlikely(xa_is_zero(vma))) { /* Can happen if dup_mmap() received an OOM */ mmap_read_unlock(mm); - return; + mmap_write_lock(mm); + goto destroy; } lru_add_drain(); @@ -3352,11 +3353,13 @@ void exit_mmap(struct mm_struct *mm) remove_vma(vma, true); count++; cond_resched(); - } while ((vma = mas_find(&mas, ULONG_MAX)) != NULL); + vma = mas_find(&mas, ULONG_MAX); + } while (vma && likely(!xa_is_zero(vma))); BUG_ON(count != mm->map_count); trace_exit_mmap(mm); +destroy: __mt_destroy(&mm->mm_mt); mmap_write_unlock(mm); vm_unacct_memory(nr_accounted); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 2c5b854f767b..76a1954071e1 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -420,7 +420,7 @@ static int dump_task(struct task_struct *p, void *arg) * State information includes task's pid, uid, tgid, vm size, rss, * pgtables_bytes, swapents, oom_score_adj value, and name. 
*/ -static void dump_tasks(struct oom_control *oc) +void dump_tasks(struct oom_control *oc) { pr_info("Tasks state (memory values in pages):\n"); pr_info("[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name\n"); @@ -436,6 +436,7 @@ static void dump_tasks(struct oom_control *oc) rcu_read_unlock(); } } +EXPORT_SYMBOL_GPL(dump_tasks); static void dump_oom_summary(struct oom_control *oc, struct task_struct *victim) { diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 6bb272894c96..0da15e1f3f72 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -243,7 +243,7 @@ static int nf_hook_bridge_pre(struct sk_buff *skb, struct sk_buff **pskb) goto frame_finish; #endif - e = rcu_dereference(net->nf.hooks_bridge[NF_BR_PRE_ROUTING]); + e = rcu_dereference(get_nf_hooks_bridge(net)[NF_BR_PRE_ROUTING]); if (!e) goto frame_finish; diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 01d690d9fe5f..c7f0aedf2244 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -1016,7 +1016,7 @@ int br_nf_hook_thresh(unsigned int hook, struct net *net, unsigned int i; int ret; - e = rcu_dereference(net->nf.hooks_bridge[hook]); + e = rcu_dereference(get_nf_hooks_bridge(net)[hook]); if (!e) return okfn(net, sk, skb); diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 4c1707d0eb9b..1d0110152862 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -1093,9 +1093,13 @@ void __init net_ns_init(void) struct net_generic *ng; #ifdef CONFIG_NET_NS - net_cachep = kmem_cache_create("net_namespace", sizeof(struct net), - SMP_CACHE_BYTES, - SLAB_PANIC|SLAB_ACCOUNT, NULL); + /* Allocate size for struct ext_net instead of struct net + * to fix a KMI issue when CONFIG_NETFILTER_FAMILY_BRIDGE + * is enabled + */ + net_cachep = kmem_cache_create("net_namespace", sizeof(struct ext_net), + SMP_CACHE_BYTES, + SLAB_PANIC | SLAB_ACCOUNT, NULL); /* Create workqueue for cleanup */ netns_wq = create_singlethread_workqueue("netns"); diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 55a7f72d547c..6c7a44f84b93 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -39,6 +39,12 @@ struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; EXPORT_SYMBOL(nf_hooks_needed); #endif +#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE +struct nf_hook_entries __rcu *init_nf_hooks_bridge[NF_INET_NUMHOOKS]; +struct nf_hook_entries __rcu **init_nf_hooks_bridgep = &init_nf_hooks_bridge[0]; +EXPORT_SYMBOL_GPL(init_nf_hooks_bridgep); +#endif + static DEFINE_MUTEX(nf_hook_mutex); /* max hooks per family/hooknum */ @@ -278,9 +284,9 @@ nf_hook_entry_head(struct net *net, int pf, unsigned int hooknum, #endif #ifdef CONFIG_NETFILTER_FAMILY_BRIDGE case NFPROTO_BRIDGE: - if (WARN_ON_ONCE(ARRAY_SIZE(net->nf.hooks_bridge) <= hooknum)) + if (WARN_ON_ONCE(hooknum >= NF_INET_NUMHOOKS)) return NULL; - return net->nf.hooks_bridge + hooknum; + return get_nf_hooks_bridge(net) + hooknum; #endif #ifdef CONFIG_NETFILTER_INGRESS case NFPROTO_INET: @@ -747,7 +753,7 @@ static int __net_init netfilter_net_init(struct net *net) __netfilter_net_init(net->nf.hooks_arp, ARRAY_SIZE(net->nf.hooks_arp)); #endif #ifdef CONFIG_NETFILTER_FAMILY_BRIDGE - __netfilter_net_init(net->nf.hooks_bridge, ARRAY_SIZE(net->nf.hooks_bridge)); + __netfilter_net_init(get_nf_hooks_bridge(net), NF_INET_NUMHOOKS); #endif #ifdef CONFIG_PROC_FS net->nf.proc_netfilter = proc_net_mkdir(net, "netfilter", diff --git a/net/netfilter/nf_queue.c 
b/net/netfilter/nf_queue.c index 63d1516816b1..566f7794bf58 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -281,7 +281,7 @@ static struct nf_hook_entries *nf_hook_entries_head(const struct net *net, u8 pf switch (pf) { #ifdef CONFIG_NETFILTER_FAMILY_BRIDGE case NFPROTO_BRIDGE: - return rcu_dereference(net->nf.hooks_bridge[hooknum]); + return rcu_dereference(get_nf_hooks_bridge(net)[hooknum]); #endif case NFPROTO_IPV4: return rcu_dereference(net->nf.hooks_ipv4[hooknum]); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 05fa5141af51..3d6ebb9877a4 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -9480,7 +9480,7 @@ static void nft_set_commit_update(struct list_head *set_update_list) list_for_each_entry_safe(set, next, set_update_list, pending_update) { list_del_init(&set->pending_update); - if (!set->ops->commit) + if (!set->ops->commit || set->dead) continue; set->ops->commit(set); diff --git a/net/netfilter/nfnetlink_hook.c b/net/netfilter/nfnetlink_hook.c index 8120aadf6a0f..3ca3c3a3ba01 100644 --- a/net/netfilter/nfnetlink_hook.c +++ b/net/netfilter/nfnetlink_hook.c @@ -210,9 +210,9 @@ nfnl_hook_entries_head(u8 pf, unsigned int hook, struct net *net, const char *de break; case NFPROTO_BRIDGE: #ifdef CONFIG_NETFILTER_FAMILY_BRIDGE - if (hook >= ARRAY_SIZE(net->nf.hooks_bridge)) + if (hook >= NF_INET_NUMHOOKS) return ERR_PTR(-EINVAL); - hook_head = rcu_dereference(net->nf.hooks_bridge[hook]); + hook_head = rcu_dereference(get_nf_hooks_bridge(net)[hook]); #endif break; #if defined(CONFIG_NETFILTER_INGRESS) || defined(CONFIG_NETFILTER_EGRESS) diff --git a/tools/include/linux/rwsem.h b/tools/include/linux/rwsem.h index 83971b3cbfce..f8bffd4a987c 100644 --- a/tools/include/linux/rwsem.h +++ b/tools/include/linux/rwsem.h @@ -37,4 +37,8 @@ static inline int up_write(struct rw_semaphore *sem) { return pthread_rwlock_unlock(&sem->lock); } + +#define down_read_nested(sem, subclass) down_read(sem) +#define down_write_nested(sem, subclass) down_write(sem) + #endif /* _TOOLS_RWSEM_H */ diff --git a/tools/include/linux/spinlock.h b/tools/include/linux/spinlock.h index 622266b197d0..a6cdf25b6b9d 100644 --- a/tools/include/linux/spinlock.h +++ b/tools/include/linux/spinlock.h @@ -11,6 +11,7 @@ #define spin_lock_init(x) pthread_mutex_init(x, NULL) #define spin_lock(x) pthread_mutex_lock(x) +#define spin_lock_nested(x, subclass) pthread_mutex_lock(x) #define spin_unlock(x) pthread_mutex_unlock(x) #define spin_lock_bh(x) pthread_mutex_lock(x) #define spin_unlock_bh(x) pthread_mutex_unlock(x) diff --git a/tools/testing/radix-tree/linux.c b/tools/testing/radix-tree/linux.c index d587a558997f..64a1645ff94c 100644 --- a/tools/testing/radix-tree/linux.c +++ b/tools/testing/radix-tree/linux.c @@ -93,13 +93,9 @@ void *kmem_cache_alloc_lru(struct kmem_cache *cachep, struct list_lru *lru, return p; } -void kmem_cache_free_locked(struct kmem_cache *cachep, void *objp) +void __kmem_cache_free_locked(struct kmem_cache *cachep, void *objp) { assert(objp); - uatomic_dec(&nr_allocated); - uatomic_dec(&cachep->nr_allocated); - if (kmalloc_verbose) - printf("Freeing %p to slab\n", objp); if (cachep->nr_objs > 10 || cachep->align) { memset(objp, POISON_FREE, cachep->size); free(objp); @@ -111,6 +107,15 @@ void kmem_cache_free_locked(struct kmem_cache *cachep, void *objp) } } +void kmem_cache_free_locked(struct kmem_cache *cachep, void *objp) +{ + uatomic_dec(&nr_allocated); + uatomic_dec(&cachep->nr_allocated); + if (kmalloc_verbose) + 
printf("Freeing %p to slab\n", objp); + __kmem_cache_free_locked(cachep, objp); +} + void kmem_cache_free(struct kmem_cache *cachep, void *objp) { pthread_mutex_lock(&cachep->lock); @@ -141,18 +146,17 @@ int kmem_cache_alloc_bulk(struct kmem_cache *cachep, gfp_t gfp, size_t size, if (kmalloc_verbose) pr_debug("Bulk alloc %lu\n", size); - if (!(gfp & __GFP_DIRECT_RECLAIM)) { - if (cachep->non_kernel < size) - return 0; - - cachep->non_kernel -= size; - } - pthread_mutex_lock(&cachep->lock); if (cachep->nr_objs >= size) { struct radix_tree_node *node; for (i = 0; i < size; i++) { + if (!(gfp & __GFP_DIRECT_RECLAIM)) { + if (!cachep->non_kernel) + break; + cachep->non_kernel--; + } + node = cachep->objs; cachep->nr_objs--; cachep->objs = node->parent; @@ -163,11 +167,19 @@ int kmem_cache_alloc_bulk(struct kmem_cache *cachep, gfp_t gfp, size_t size, } else { pthread_mutex_unlock(&cachep->lock); for (i = 0; i < size; i++) { + if (!(gfp & __GFP_DIRECT_RECLAIM)) { + if (!cachep->non_kernel) + break; + cachep->non_kernel--; + } + if (cachep->align) { posix_memalign(&p[i], cachep->align, cachep->size * size); } else { p[i] = malloc(cachep->size * size); + if (!p[i]) + break; } if (cachep->ctor) cachep->ctor(p[i]); @@ -176,6 +188,15 @@ int kmem_cache_alloc_bulk(struct kmem_cache *cachep, gfp_t gfp, size_t size, } } + if (i < size) { + size = i; + pthread_mutex_lock(&cachep->lock); + for (i = 0; i < size; i++) + __kmem_cache_free_locked(cachep, p[i]); + pthread_mutex_unlock(&cachep->lock); + return 0; + } + for (i = 0; i < size; i++) { uatomic_inc(&nr_allocated); uatomic_inc(&cachep->nr_allocated); diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c index b598b7fe4419..a2626f02f385 100644 --- a/tools/testing/radix-tree/maple.c +++ b/tools/testing/radix-tree/maple.c @@ -35753,6 +35753,363 @@ static noinline void __init check_locky(struct maple_tree *mt) mt_clear_in_rcu(mt); } +/* + * Compares two nodes except for the addresses stored in the nodes. + * Returns zero if they are the same, otherwise returns non-zero. + */ +static int __init compare_node(struct maple_enode *enode_a, + struct maple_enode *enode_b) +{ + struct maple_node *node_a, *node_b; + struct maple_node a, b; + void **slots_a, **slots_b; /* Do not use the rcu tag. */ + enum maple_type type; + int i; + + if (((unsigned long)enode_a & MAPLE_NODE_MASK) != + ((unsigned long)enode_b & MAPLE_NODE_MASK)) { + pr_err("The lower 8 bits of enode are different.\n"); + return -1; + } + + type = mte_node_type(enode_a); + node_a = mte_to_node(enode_a); + node_b = mte_to_node(enode_b); + a = *node_a; + b = *node_b; + + /* Do not compare addresses. */ + if (ma_is_root(node_a) || ma_is_root(node_b)) { + a.parent = (struct maple_pnode *)((unsigned long)a.parent & + MA_ROOT_PARENT); + b.parent = (struct maple_pnode *)((unsigned long)b.parent & + MA_ROOT_PARENT); + } else { + a.parent = (struct maple_pnode *)((unsigned long)a.parent & + MAPLE_NODE_MASK); + b.parent = (struct maple_pnode *)((unsigned long)b.parent & + MAPLE_NODE_MASK); + } + + if (a.parent != b.parent) { + pr_err("The lower 8 bits of parents are different. %p %p\n", + a.parent, b.parent); + return -1; + } + + /* + * If it is a leaf node, the slots do not contain the node address, and + * no special processing of slots is required. 
+ */ + if (ma_is_leaf(type)) + goto cmp; + + slots_a = ma_slots(&a, type); + slots_b = ma_slots(&b, type); + + for (i = 0; i < mt_slots[type]; i++) { + if (!slots_a[i] && !slots_b[i]) + break; + + if (!slots_a[i] || !slots_b[i]) { + pr_err("The number of slots is different.\n"); + return -1; + } + + /* Do not compare addresses in slots. */ + ((unsigned long *)slots_a)[i] &= MAPLE_NODE_MASK; + ((unsigned long *)slots_b)[i] &= MAPLE_NODE_MASK; + } + +cmp: + /* + * Compare all contents of two nodes, including parent (except address), + * slots (except address), pivots, gaps and metadata. + */ + return memcmp(&a, &b, sizeof(struct maple_node)); +} + +/* + * Compare two trees and return 0 if they are the same, non-zero otherwise. + */ +static int __init compare_tree(struct maple_tree *mt_a, struct maple_tree *mt_b) +{ + MA_STATE(mas_a, mt_a, 0, 0); + MA_STATE(mas_b, mt_b, 0, 0); + + if (mt_a->ma_flags != mt_b->ma_flags) { + pr_err("The flags of the two trees are different.\n"); + return -1; + } + + mas_dfs_preorder(&mas_a); + mas_dfs_preorder(&mas_b); + + if (mas_is_ptr(&mas_a) || mas_is_ptr(&mas_b)) { + if (!(mas_is_ptr(&mas_a) && mas_is_ptr(&mas_b))) { + pr_err("One is MAS_ROOT and the other is not.\n"); + return -1; + } + return 0; + } + + while (!mas_is_none(&mas_a) || !mas_is_none(&mas_b)) { + + if (mas_is_none(&mas_a) || mas_is_none(&mas_b)) { + pr_err("One is MAS_NONE and the other is not.\n"); + return -1; + } + + if (mas_a.min != mas_b.min || + mas_a.max != mas_b.max) { + pr_err("mas->min, mas->max do not match.\n"); + return -1; + } + + if (compare_node(mas_a.node, mas_b.node)) { + pr_err("The contents of nodes %p and %p are different.\n", + mas_a.node, mas_b.node); + mt_dump(mt_a, mt_dump_dec); + mt_dump(mt_b, mt_dump_dec); + return -1; + } + + mas_dfs_preorder(&mas_a); + mas_dfs_preorder(&mas_b); + } + + return 0; +} + +static __init void mas_subtree_max_range(struct ma_state *mas) +{ + unsigned long limit = mas->max; + MA_STATE(newmas, mas->tree, 0, 0); + void *entry; + + mas_for_each(mas, entry, limit) { + if (mas->last - mas->index >= + newmas.last - newmas.index) { + newmas = *mas; + } + } + + *mas = newmas; +} + +/* + * build_full_tree() - Build a full tree. + * @mt: The tree to build. + * @flags: Use @flags to build the tree. + * @height: The height of the tree to build. + * + * Build a tree with full leaf nodes and internal nodes. Note that the height + * should not exceed 3, otherwise it will take a long time to build. + * Return: zero if the build is successful, non-zero if it fails. 
+ */
+static __init int build_full_tree(struct maple_tree *mt, unsigned int flags,
+		int height)
+{
+	MA_STATE(mas, mt, 0, 0);
+	unsigned long step;
+	int ret = 0, cnt = 1;
+	enum maple_type type;
+
+	mt_init_flags(mt, flags);
+	mtree_insert_range(mt, 0, ULONG_MAX, xa_mk_value(5), GFP_KERNEL);
+
+	mtree_lock(mt);
+
+	while (1) {
+		mas_set(&mas, 0);
+		if (mt_height(mt) < height) {
+			mas.max = ULONG_MAX;
+			goto store;
+		}
+
+		while (1) {
+			mas_dfs_preorder(&mas);
+			if (mas_is_none(&mas))
+				goto unlock;
+
+			type = mte_node_type(mas.node);
+			if (mas_data_end(&mas) + 1 < mt_slots[type]) {
+				mas_set(&mas, mas.min);
+				goto store;
+			}
+		}
+store:
+		mas_subtree_max_range(&mas);
+		step = mas.last - mas.index;
+		if (step < 1) {
+			ret = -1;
+			goto unlock;
+		}
+
+		step /= 2;
+		mas.last = mas.index + step;
+		mas_store_gfp(&mas, xa_mk_value(5), GFP_KERNEL);
+		++cnt;
+	}
+unlock:
+	mtree_unlock(mt);
+
+	MT_BUG_ON(mt, mt_height(mt) != height);
+	/* pr_info("height:%u number of elements:%d\n", mt_height(mt), cnt); */
+	return ret;
+}
+
+static noinline void __init check_mtree_dup(struct maple_tree *mt)
+{
+	DEFINE_MTREE(new);
+	int i, j, ret, count = 0;
+	unsigned int rand_seed = 17, rand;
+
+	/* Store a value at [0, 0]. */
+	mt_init_flags(mt, 0);
+	mtree_store_range(mt, 0, 0, xa_mk_value(0), GFP_KERNEL);
+	ret = mtree_dup(mt, &new, GFP_KERNEL);
+	MT_BUG_ON(&new, ret);
+	mt_validate(&new);
+	if (compare_tree(mt, &new))
+		MT_BUG_ON(&new, 1);
+
+	mtree_destroy(mt);
+	mtree_destroy(&new);
+
+	/* The two trees have different attributes. */
+	mt_init_flags(mt, 0);
+	mt_init_flags(&new, MT_FLAGS_ALLOC_RANGE);
+	ret = mtree_dup(mt, &new, GFP_KERNEL);
+	MT_BUG_ON(&new, ret != -EINVAL);
+	mtree_destroy(mt);
+	mtree_destroy(&new);
+
+	/* The new tree is not empty. */
+	mt_init_flags(mt, 0);
+	mt_init_flags(&new, 0);
+	mtree_store(&new, 5, xa_mk_value(5), GFP_KERNEL);
+	ret = mtree_dup(mt, &new, GFP_KERNEL);
+	MT_BUG_ON(&new, ret != -EINVAL);
+	mtree_destroy(mt);
+	mtree_destroy(&new);
+
+	/* Test for duplicating full trees. */
+	for (i = 1; i <= 3; i++) {
+		ret = build_full_tree(mt, 0, i);
+		MT_BUG_ON(mt, ret);
+		mt_init_flags(&new, 0);
+
+		ret = mtree_dup(mt, &new, GFP_KERNEL);
+		MT_BUG_ON(&new, ret);
+		mt_validate(&new);
+		if (compare_tree(mt, &new))
+			MT_BUG_ON(&new, 1);
+
+		mtree_destroy(mt);
+		mtree_destroy(&new);
+	}
+
+	for (i = 1; i <= 3; i++) {
+		ret = build_full_tree(mt, MT_FLAGS_ALLOC_RANGE, i);
+		MT_BUG_ON(mt, ret);
+		mt_init_flags(&new, MT_FLAGS_ALLOC_RANGE);
+
+		ret = mtree_dup(mt, &new, GFP_KERNEL);
+		MT_BUG_ON(&new, ret);
+		mt_validate(&new);
+		if (compare_tree(mt, &new))
+			MT_BUG_ON(&new, 1);
+
+		mtree_destroy(mt);
+		mtree_destroy(&new);
+	}
+
+	/* Test for normal duplicating. */
+	for (i = 0; i < 1000; i += 3) {
+		if (i & 1) {
+			mt_init_flags(mt, 0);
+			mt_init_flags(&new, 0);
+		} else {
+			mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE);
+			mt_init_flags(&new, MT_FLAGS_ALLOC_RANGE);
+		}
+
+		for (j = 0; j < i; j++) {
+			mtree_store_range(mt, j * 10, j * 10 + 5,
+					  xa_mk_value(j), GFP_KERNEL);
+		}
+
+		ret = mtree_dup(mt, &new, GFP_KERNEL);
+		MT_BUG_ON(&new, ret);
+		mt_validate(&new);
+		if (compare_tree(mt, &new))
+			MT_BUG_ON(&new, 1);
+
+		mtree_destroy(mt);
+		mtree_destroy(&new);
+	}
+
+	/* Test memory allocation failure. */
+	mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE);
+	for (i = 0; i < 30; i += 3) {
+		mtree_store_range(mt, i * 10, i * 10 + 5,
+				  xa_mk_value(i), GFP_KERNEL);
+	}
+
+	/* Failed at the first node. */
+	mt_init_flags(&new, MT_FLAGS_ALLOC_RANGE);
+	mt_set_non_kernel(0);
+	ret = mtree_dup(mt, &new, GFP_NOWAIT);
+	mt_set_non_kernel(0);
+	MT_BUG_ON(&new, ret != -ENOMEM);
+	mtree_destroy(mt);
+	mtree_destroy(&new);
+
+	/* Random maple tree fails at a random node. */
+	for (i = 0; i < 1000; i += 3) {
+		if (i & 1) {
+			mt_init_flags(mt, 0);
+			mt_init_flags(&new, 0);
+		} else {
+			mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE);
+			mt_init_flags(&new, MT_FLAGS_ALLOC_RANGE);
+		}
+
+		for (j = 0; j < i; j++) {
+			mtree_store_range(mt, j * 10, j * 10 + 5,
+					  xa_mk_value(j), GFP_KERNEL);
+		}
+		/*
+		 * The rand() library function is not used, so we can generate
+		 * the same random numbers on any platform.
+		 */
+		rand_seed = rand_seed * 1103515245 + 12345;
+		rand = rand_seed / 65536 % 128;
+		mt_set_non_kernel(rand);
+
+		ret = mtree_dup(mt, &new, GFP_NOWAIT);
+		mt_set_non_kernel(0);
+		if (ret != 0) {
+			MT_BUG_ON(&new, ret != -ENOMEM);
+			count++;
+			mtree_destroy(mt);
+			continue;
+		}
+
+		mt_validate(&new);
+		if (compare_tree(mt, &new))
+			MT_BUG_ON(&new, 1);
+
+		mtree_destroy(mt);
+		mtree_destroy(&new);
+	}
+
+	/* pr_info("mtree_dup() fail %d times\n", count); */
+	BUG_ON(!count);
+}
+
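The random failure injection in check_mtree_dup() above relies on an inline linear congruential generator so that every platform sees the same sequence. Factored into a helper purely for illustration (next_rand() is a hypothetical name; the seed and constants are the ones used in the test):

	static unsigned int rand_seed = 17;

	/* Deterministic LCG; reproducible, unlike the rand() library function. */
	static unsigned int next_rand(void)
	{
		rand_seed = rand_seed * 1103515245 + 12345;
		return rand_seed / 65536 % 128; /* allocation budget: 0..127 */
	}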
 extern void test_kmem_cache_bulk(void);
 
 void farmer_tests(void)
@@ -35800,6 +36157,10 @@ void farmer_tests(void)
 	check_null_expand(&tree);
 	mtree_destroy(&tree);
 
+	mt_init_flags(&tree, 0);
+	check_mtree_dup(&tree);
+	mtree_destroy(&tree);
+
 	/* RCU testing */
 	mt_init_flags(&tree, 0);
 	check_erase_testset(&tree);
@@ -35834,7 +36195,9 @@ void farmer_tests(void)
 
 void maple_tree_tests(void)
 {
+#if !defined(BENCH)
 	farmer_tests();
+#endif
 	maple_tree_seed();
 	maple_tree_harvest();
 }
diff --git a/tools/testing/selftests/filesystems/fuse/bpf_loader.c b/tools/testing/selftests/filesystems/fuse/bpf_loader.c
index 5bf26eadd421..94f884c64d29 100644
--- a/tools/testing/selftests/filesystems/fuse/bpf_loader.c
+++ b/tools/testing/selftests/filesystems/fuse/bpf_loader.c
@@ -394,6 +394,29 @@ int s_rename(struct s oldpathname, struct s newpathname)
 	return res;
 }
 
+int s_mount(struct s source, struct s target, struct s filesystem,
+	    unsigned long mountflags, struct s data)
+{
+	int res;
+
+	res = mount(source.s, target.s, filesystem.s, mountflags, data.s);
+	free(source.s);
+	free(target.s);
+	free(filesystem.s);
+	free(data.s);
+
+	return res;
+}
+
+int s_umount(struct s target)
+{
+	int res;
+
+	res = umount(target.s);
+	free(target.s);
+	return res;
+}
+
 int s_fuse_attr(struct s pathname, struct fuse_attr *fuse_attr_out)
 {
@@ -574,7 +597,10 @@ static int mount_fuse_maybe_init(const char *mount_dir, int bpf_fd, int dir_fd,
 		}));
 	}
 
-	*fuse_dev_ptr = fuse_dev;
+	if (fuse_dev_ptr)
+		*fuse_dev_ptr = fuse_dev;
+	else
+		TESTSYSCALL(close(fuse_dev));
 	fuse_dev = -1;
 	result = TEST_SUCCESS;
 out:
diff --git a/tools/testing/selftests/filesystems/fuse/fuse_test.c b/tools/testing/selftests/filesystems/fuse/fuse_test.c
index 528595a8e82f..ad24ed48853e 100644
--- a/tools/testing/selftests/filesystems/fuse/fuse_test.c
+++ b/tools/testing/selftests/filesystems/fuse/fuse_test.c
@@ -2114,6 +2114,50 @@ out:
 	return result;
 }
 
+/**
+ * Test that fuse passthrough correctly traverses a mount point on the lower fs
+ */
+static int bpf_test_follow_mounts(const char *mount_dir)
+{
+	const char *bind_src = "bind_src";
+	const char *bind_dst = "bind_dst";
+	const char *file = "file";
+	int fd = -1;
+	int src_fd = -1;
+	int result = TEST_FAILURE;
+
+	TESTSYSCALL(s_mkdir(s_path(s(ft_src), s(bind_src)), 0777));
+	TESTSYSCALL(s_mkdir(s_path(s(ft_src), s(bind_dst)), 0777));
+	TEST(fd =
s_creat(s_pathn(3, s(ft_src), s(bind_src), s(file)), 0777), + fd != -1); + TESTSYSCALL(close(fd)); + fd = -1; + TESTSYSCALL(s_mount(s_path(s(ft_src), s(bind_src)), + s_path(s(ft_src), s(bind_dst)), + s(NULL), MS_BIND, s(NULL))); + TEST(src_fd = open(ft_src, O_DIRECTORY | O_RDONLY | O_CLOEXEC), + src_fd != -1); + TESTEQUAL(mount_fuse_no_init(mount_dir, -1, src_fd, NULL), 0); + TEST(fd = s_open(s_pathn(3, s(mount_dir), s(bind_src), s(file)), + O_RDONLY), + fd != -1); + TESTSYSCALL(close(fd)); + fd = -1; + TEST(fd = s_open(s_pathn(3, s(mount_dir), s(bind_dst), s(file)), + O_RDONLY), + fd != -1); + TESTSYSCALL(close(fd)); + fd = -1; + + result = TEST_SUCCESS; +out: + umount(mount_dir); + close(src_fd); + s_umount(s_path(s(ft_src), s(bind_dst))); + close(fd); + return result; +} + static void parse_range(const char *ranges, bool *run_test, size_t tests) { size_t i; @@ -2244,6 +2288,7 @@ int main(int argc, char *argv[]) MAKE_TEST(bpf_test_create_and_remove_bpf), MAKE_TEST(bpf_test_mkdir_and_remove_bpf), MAKE_TEST(bpf_test_readahead), + MAKE_TEST(bpf_test_follow_mounts), }; #undef MAKE_TEST diff --git a/tools/testing/selftests/filesystems/fuse/test_fuse.h b/tools/testing/selftests/filesystems/fuse/test_fuse.h index 69dadc9c7e45..e62e2ee07713 100644 --- a/tools/testing/selftests/filesystems/fuse/test_fuse.h +++ b/tools/testing/selftests/filesystems/fuse/test_fuse.h @@ -64,6 +64,9 @@ int s_setxattr(struct s pathname, const char name[], const void *value, size_t size, int flags); int s_removexattr(struct s pathname, const char name[]); int s_rename(struct s oldpathname, struct s newpathname); +int s_mount(struct s source, struct s target, struct s filesystem, + unsigned long mountflags, struct s data); +int s_umount(struct s target); struct s tracing_folder(void); int tracing_on(void);
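As a usage sketch for the two wrappers declared here (paths are illustrative): like the other struct s helpers, s_mount() and s_umount() free the strings passed to them, so each argument is built inline with s() and s_path():

	/* Bind-mount one directory under ft_src onto another, then undo it. */
	TESTSYSCALL(s_mount(s_path(s(ft_src), s("src_dir")),
			    s_path(s(ft_src), s("dst_dir")),
			    s(NULL), MS_BIND, s(NULL)));
	/* ... exercise the bind mount ... */
	TESTSYSCALL(s_umount(s_path(s(ft_src), s("dst_dir"))));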