From fcab9b441d2d0e08f55654a4adf2d51cd4680469 Mon Sep 17 00:00:00 2001 From: David Heidelberg Date: Tue, 23 Aug 2022 17:20:33 +0200 Subject: [PATCH 01/25] mm: remove EXPERIMENTAL flag for zswap zswap has been with us since 2013, and it's widely used in many products. Link: https://lkml.kernel.org/r/20220823152033.66682-1-david@ixit.cz Signed-off-by: David Heidelberg Cc: Dan Carpenter Cc: Vitaly Wool Cc: Dan Streetman Cc: Barry Song Cc: Seth Jennings Cc: Sebastian Andrzej Siewior Signed-off-by: Andrew Morton --- mm/Kconfig | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/mm/Kconfig b/mm/Kconfig index 0331f1461f81..e3fbd0788878 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -23,7 +23,7 @@ menuconfig SWAP in your computer. If unsure say Y. config ZSWAP - bool "Compressed cache for swap pages (EXPERIMENTAL)" + bool "Compressed cache for swap pages" depends on SWAP select FRONTSWAP select CRYPTO @@ -36,12 +36,6 @@ config ZSWAP in the case where decompressing from RAM is faster than swap device reads, can also improve workload performance. - This is marked experimental because it is a new feature (as of - v3.11) that interacts heavily with memory reclaim. While these - interactions don't cause any known issues on simple memory setups, - they have not be fully explored on the large set of potential - configurations and workloads that exist. - config ZSWAP_DEFAULT_ON bool "Enable the compressed cache for swap pages by default" depends on ZSWAP From cfdab60bfa66b2dc0391c9e405b8af6039924cd4 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 25 Aug 2022 00:05:04 +0000 Subject: [PATCH 02/25] mm: page_counter: remove unneeded atomic ops for low/min MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "memcg: optimize charge codepath", v2. Recently Linux networking stack has moved from a very old per socket pre-charge caching to per-cpu caching to avoid pre-charge fragmentation and unwarranted OOMs. One impact of this change is that for network traffic workloads, memcg charging codepath can become a bottleneck. The kernel test robot has also reported this regression[1]. This patch series tries to improve the memcg charging for such workloads. This patch series implement three optimizations: (A) Reduce atomic ops in page counter update path. (B) Change layout of struct page_counter to eliminate false sharing between usage and high. (C) Increase the memcg charge batch to 64. To evaluate the impact of these optimizations, on a 72 CPUs machine, we ran the following workload in root memcg and then compared with scenario where the workload is run in a three level of cgroup hierarchy with top level having min and low setup appropriately. $ netserver -6 # 36 instances of netperf with following params $ netperf -6 -H ::1 -l 60 -t TCP_SENDFILE -- -m 10K Results (average throughput of netperf): 1. root memcg 21694.8 Mbps 2. 6.0-rc1 10482.7 Mbps (-51.6%) 3. 6.0-rc1 + (A) 14542.5 Mbps (-32.9%) 4. 6.0-rc1 + (B) 12413.7 Mbps (-42.7%) 5. 6.0-rc1 + (C) 17063.7 Mbps (-21.3%) 6. 6.0-rc1 + (A+B+C) 20120.3 Mbps (-7.2%) With all three optimizations, the memcg overhead of this workload has been reduced from 51.6% to just 7.2%. [1] https://lore.kernel.org/linux-mm/20220619150456.GB34471@xsang-OptiPlex-9020/ This patch (of 3): For cgroups using low or min protections, the function propagate_protected_usage() was doing an atomic xchg() operation irrespectively. 
We can optimize out this atomic operation for one specific scenario where the workload is using the protection (i.e. min > 0) and the usage is above the protection (i.e. usage > min). This scenario is actually very common where the users want a part of their workload to be protected against the external reclaim. Though this optimization does introduce a race when the usage is around the protection and concurrent charges and uncharged trip it over or under the protection. In such cases, we might see lower effective protection but the subsequent charge/uncharge will correct it. To evaluate the impact of this optimization, on a 72 CPUs machine, we ran the following workload in a three level of cgroup hierarchy with top level having min and low setup appropriately to see if this optimization is effective for the mentioned case. $ netserver -6 # 36 instances of netperf with following params $ netperf -6 -H ::1 -l 60 -t TCP_SENDFILE -- -m 10K Results (average throughput of netperf): Without (6.0-rc1) 10482.7 Mbps With patch 14542.5 Mbps (38.7% improvement) With the patch, the throughput improved by 38.7% Link: https://lkml.kernel.org/r/20220825000506.239406-1-shakeelb@google.com Link: https://lkml.kernel.org/r/20220825000506.239406-2-shakeelb@google.com Signed-off-by: Shakeel Butt Reported-by: kernel test robot Acked-by: Soheil Hassas Yeganeh Reviewed-by: Feng Tang Acked-by: Roman Gushchin Acked-by: Michal Hocko Cc: Eric Dumazet Cc: Johannes Weiner Cc: "Michal Koutný" Cc: Muchun Song Cc: Oliver Sang Cc: Soheil Hassas Yeganeh Signed-off-by: Andrew Morton --- mm/page_counter.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/mm/page_counter.c b/mm/page_counter.c index 8a0cc24b60dd..db20d6452b71 100644 --- a/mm/page_counter.c +++ b/mm/page_counter.c @@ -17,24 +17,23 @@ static void propagate_protected_usage(struct page_counter *c, unsigned long usage) { unsigned long protected, old_protected; - unsigned long low, min; long delta; if (!c->parent) return; - min = READ_ONCE(c->min); - if (min || atomic_long_read(&c->min_usage)) { - protected = min(usage, min); + protected = min(usage, READ_ONCE(c->min)); + old_protected = atomic_long_read(&c->min_usage); + if (protected != old_protected) { old_protected = atomic_long_xchg(&c->min_usage, protected); delta = protected - old_protected; if (delta) atomic_long_add(delta, &c->parent->children_min_usage); } - low = READ_ONCE(c->low); - if (low || atomic_long_read(&c->low_usage)) { - protected = min(usage, low); + protected = min(usage, READ_ONCE(c->low)); + old_protected = atomic_long_read(&c->low_usage); + if (protected != old_protected) { old_protected = atomic_long_xchg(&c->low_usage, protected); delta = protected - old_protected; if (delta) From 408587baee39753304dd572dacf374f75545d25a Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 25 Aug 2022 00:05:05 +0000 Subject: [PATCH 03/25] mm: page_counter: rearrange struct page_counter fields MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With memcg v2 enabled, memcg->memory.usage is a very hot member for the workloads doing memcg charging on multiple CPUs concurrently. Particularly the network intensive workloads. In addition, there is a false cache sharing between memory.usage and memory.high on the charge path. This patch moves the usage into a separate cacheline and move all the read most fields into separate cacheline. 
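As a standalone illustration of the layout idea (not the kernel code itself; the 64-byte cacheline size and the field names below are assumptions), the point is to give the hot, frequently written counter its own cacheline so that readers of the read-mostly limit fields never pull in the writer's line:

  #include <stdatomic.h>
  #include <stdio.h>

  #define CACHELINE 64   /* assumed cacheline size */

  struct counter {
          /* written on every charge; keep it alone on its cacheline */
          _Alignas(CACHELINE) atomic_long usage;

          /* read-mostly limits start on a different cacheline */
          _Alignas(CACHELINE) long min;
          long low;
          long high;
          long max;
  };

  int main(void)
  {
          struct counter c = { .high = 1 << 20 };

          atomic_fetch_add(&c.usage, 64);   /* hot path touches only the first line */
          printf("usage=%ld high=%ld\n", atomic_load(&c.usage), c.high);
          return 0;
  }

The PC_PADDING()/____cacheline_internodealigned_in_smp annotations in the diff below achieve the same separation for struct page_counter.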
To evaluate the impact of this optimization, on a 72 CPUs machine, we ran the following workload in a three level of cgroup hierarchy. $ netserver -6 # 36 instances of netperf with following params $ netperf -6 -H ::1 -l 60 -t TCP_SENDFILE -- -m 10K Results (average throughput of netperf): Without (6.0-rc1) 10482.7 Mbps With patch 12413.7 Mbps (18.4% improvement) With the patch, the throughput improved by 18.4%. One side-effect of this patch is the increase in the size of struct mem_cgroup. For example with this patch on 64 bit build, the size of struct mem_cgroup increased from 4032 bytes to 4416 bytes. However for the performance improvement, this additional size is worth it. In addition there are opportunities to reduce the size of struct mem_cgroup like deprecation of kmem and tcpmem page counters and better packing. Link: https://lkml.kernel.org/r/20220825000506.239406-3-shakeelb@google.com Signed-off-by: Shakeel Butt Reported-by: kernel test robot Reviewed-by: Feng Tang Acked-by: Soheil Hassas Yeganeh Acked-by: Roman Gushchin Acked-by: Michal Hocko Cc: Eric Dumazet Cc: Johannes Weiner Cc: "Michal Koutný" Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/page_counter.h | 35 +++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h index 679591301994..78a1c934e416 100644 --- a/include/linux/page_counter.h +++ b/include/linux/page_counter.h @@ -3,15 +3,26 @@ #define _LINUX_PAGE_COUNTER_H #include +#include #include #include +#if defined(CONFIG_SMP) +struct pc_padding { + char x[0]; +} ____cacheline_internodealigned_in_smp; +#define PC_PADDING(name) struct pc_padding name +#else +#define PC_PADDING(name) +#endif + struct page_counter { + /* + * Make sure 'usage' does not share cacheline with any other field. The + * memcg->memory.usage is a hot member of struct mem_cgroup. + */ atomic_long_t usage; - unsigned long min; - unsigned long low; - unsigned long high; - unsigned long max; + PC_PADDING(_pad1_); /* effective memory.min and memory.min usage tracking */ unsigned long emin; @@ -23,18 +34,18 @@ struct page_counter { atomic_long_t low_usage; atomic_long_t children_low_usage; - /* legacy */ unsigned long watermark; unsigned long failcnt; - /* - * 'parent' is placed here to be far from 'usage' to reduce - * cache false sharing, as 'usage' is written mostly while - * parent is frequently read for cgroup's hierarchical - * counting nature. - */ + /* Keep all the read most fields in a separete cacheline. */ + PC_PADDING(_pad2_); + + unsigned long min; + unsigned long low; + unsigned long high; + unsigned long max; struct page_counter *parent; -}; +} ____cacheline_internodealigned_in_smp; #if BITS_PER_LONG == 32 #define PAGE_COUNTER_MAX LONG_MAX From 1813e51eece0ad6f4aacaeb738e7cced46feb470 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 25 Aug 2022 00:05:06 +0000 Subject: [PATCH 04/25] memcg: increase MEMCG_CHARGE_BATCH to 64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For several years, MEMCG_CHARGE_BATCH was kept at 32 but with bigger machines and the network intensive workloads requiring througput in Gbps, 32 is too small and makes the memcg charging path a bottleneck. For now, increase it to 64 for easy acceptance to 6.0. We will need to revisit this in future for ever increasing demand of higher performance. 
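For intuition, a much-simplified userspace sketch of why the batch size matters (the thread-local 'stock' stands in for the per-cpu charge stock; the names and refill policy are illustrative, not the memcg implementation). Each refill touches the shared counter once, so a larger batch means fewer contended atomic updates for streams of small charges:

  #include <stdatomic.h>
  #include <stdio.h>

  #define CHARGE_BATCH 64   /* pages charged to the shared counter per refill */

  static atomic_long total_usage;          /* shared, contended counter */
  static _Thread_local long stock;         /* per-thread pre-charged pages */

  static void charge(long nr_pages)
  {
          long take;

          if (stock >= nr_pages) {         /* fast path: no shared update */
                  stock -= nr_pages;
                  return;
          }
          /* slow path: charge a whole batch once, keep the surplus locally */
          take = nr_pages > CHARGE_BATCH ? nr_pages : CHARGE_BATCH;
          atomic_fetch_add(&total_usage, take);
          stock += take - nr_pages;
  }

  int main(void)
  {
          for (int i = 0; i < 1000; i++)
                  charge(1);               /* 1000 charges, only ~16 shared updates */
          printf("usage=%ld stock=%ld\n", atomic_load(&total_usage), stock);
          return 0;
  }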
Please note that the memcg charge path drain the per-cpu memcg charge stock, so there should not be any oom behavior change. Though it does have impact on rstat flushing and high limit reclaim backoff. To evaluate the impact of this optimization, on a 72 CPUs machine, we ran the following workload in a three level of cgroup hierarchy. $ netserver -6 # 36 instances of netperf with following params $ netperf -6 -H ::1 -l 60 -t TCP_SENDFILE -- -m 10K Results (average throughput of netperf): Without (6.0-rc1) 10482.7 Mbps With patch 17064.7 Mbps (62.7% improvement) With the patch, the throughput improved by 62.7%. Link: https://lkml.kernel.org/r/20220825000506.239406-4-shakeelb@google.com Signed-off-by: Shakeel Butt Reported-by: kernel test robot Acked-by: Soheil Hassas Yeganeh Reviewed-by: Feng Tang Acked-by: Roman Gushchin Acked-by: Muchun Song Acked-by: Michal Hocko Cc: Eric Dumazet Cc: Johannes Weiner Cc: "Michal Koutný" Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 6257867fbf95..a2461f9a8738 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -354,10 +354,11 @@ struct mem_cgroup { }; /* - * size of first charge trial. "32" comes from vmscan.c's magic value. - * TODO: maybe necessary to use big numbers in big irons. + * size of first charge trial. + * TODO: maybe necessary to use big numbers in big irons or dynamic based of the + * workload. */ -#define MEMCG_CHARGE_BATCH 32U +#define MEMCG_CHARGE_BATCH 64U extern struct mem_cgroup *root_mem_cgroup; From 1a6baaa0db733c0dca2e170ca1df2b09834f47ce Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Sun, 11 Sep 2022 20:26:01 -0700 Subject: [PATCH 05/25] s390/hugetlb: switch to generic version of follow_huge_pud() When pud-sized hugepages were introduced for s390, the generic version of follow_huge_pud() was using pte_page() instead of pud_page(). This would be wrong for s390, see also commit 97534127012f ("mm/hugetlb: use pmd_page() in follow_huge_pmd()"). Therefore, and probably because not all archs were supporting pud_page() at that time, a private version of follow_huge_pud() was added for s390, correctly using pud_page(). Since commit 3a194f3f8ad01 ("mm/hugetlb: make pud_huge() and follow_huge_pud() aware of non-present pud entry"), the generic version of follow_huge_pud() is now also using pud_page(), and in general behaves similar to follow_huge_pmd(). Therefore we can now switch to the generic version and get rid of the s390-specific follow_huge_pud(). 
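The generic helper returns pud_page(*pud) plus a subpage index derived from the address, the same expression the s390 version used. A minimal sketch of that index arithmetic, using assumed x86-64-style shift values rather than s390's:

  #include <stdio.h>

  #define PAGE_SHIFT 12                  /* assumed: 4K pages */
  #define PUD_SHIFT  30                  /* assumed: 1G pud-sized hugepages */
  #define PUD_MASK   (~((1UL << PUD_SHIFT) - 1))

  /* subpage index added to pud_page(*pud) by the generic follow_huge_pud() */
  static unsigned long pud_subpage_index(unsigned long address)
  {
          return (address & ~PUD_MASK) >> PAGE_SHIFT;
  }

  int main(void)
  {
          unsigned long addr = (5UL << PUD_SHIFT) + (123UL << PAGE_SHIFT) + 42;

          printf("subpage index = %lu\n", pud_subpage_index(addr));   /* 123 */
          return 0;
  }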
Link: https://lkml.kernel.org/r/20220818135717.609eef8a@thinkpad Signed-off-by: Gerald Schaefer Cc: Haiyue Wang Cc: Miaohe Lin Cc: "Huang, Ying" Cc: Baolin Wang Cc: David Hildenbrand Cc: Muchun Song Signed-off-by: Andrew Morton --- arch/s390/mm/hugetlbpage.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c index 10e51ef9c79a..c299a18273ff 100644 --- a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c @@ -237,16 +237,6 @@ int pud_huge(pud_t pud) return pud_large(pud); } -struct page * -follow_huge_pud(struct mm_struct *mm, unsigned long address, - pud_t *pud, int flags) -{ - if (flags & FOLL_GET) - return NULL; - - return pud_page(*pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT); -} - bool __init arch_hugetlb_valid_size(unsigned long size) { if (MACHINE_HAS_EDAT1 && size == PMD_SIZE) From c4f20f1479c456d9dd1c1e6d8bf956a25de742dc Mon Sep 17 00:00:00 2001 From: Li Zhe Date: Thu, 25 Aug 2022 18:27:14 +0800 Subject: [PATCH 06/25] page_ext: introduce boot parameter 'early_page_ext' In commit 2f1ee0913ce5 ("Revert "mm: use early_pfn_to_nid in page_ext_init""), we call page_ext_init() after page_alloc_init_late() to avoid some panic problem. It seems that we cannot track early page allocations in current kernel even if page structure has been initialized early. This patch introduces a new boot parameter 'early_page_ext' to resolve this problem. If we pass it to the kernel, page_ext_init() will be moved up and the feature 'deferred initialization of struct pages' will be disabled to initialize the page allocator early and prevent the panic problem above. It can help us to catch early page allocations. This is useful especially when we find that the free memory value is not the same right after different kernel booting. [akpm@linux-foundation.org: fix section issue by removing __meminitdata] Link: https://lkml.kernel.org/r/20220825102714.669-1-lizhe.67@bytedance.com Signed-off-by: Li Zhe Suggested-by: Michal Hocko Acked-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Jason A. Donenfeld Cc: Jonathan Corbet Cc: Kees Cook Cc: Mark-PK Tsai Cc: Masami Hiramatsu (Google) Cc: Steven Rostedt Signed-off-by: Andrew Morton --- Documentation/admin-guide/kernel-parameters.txt | 8 ++++++++ include/linux/page_ext.h | 11 +++++++++++ init/main.c | 6 +++++- mm/page_alloc.c | 2 ++ mm/page_ext.c | 8 ++++++++ 5 files changed, 34 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 426fa892d311..3b95f65bafe2 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1471,6 +1471,14 @@ Permit 'security.evm' to be updated regardless of current integrity status. + early_page_ext [KNL] Enforces page_ext initialization to earlier + stages so cover more early boot allocations. + Please note that as side effect some optimizations + might be disabled to achieve that (e.g. parallelized + memory initialization is disabled) so the boot process + might take longer, especially on systems with a lot of + memory. Available with CONFIG_PAGE_EXTENSION=y. 
+ failslab= fail_usercopy= fail_page_alloc= diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h index ed27198cdaf4..22be4582faae 100644 --- a/include/linux/page_ext.h +++ b/include/linux/page_ext.h @@ -36,9 +36,15 @@ struct page_ext { unsigned long flags; }; +extern bool early_page_ext; extern unsigned long page_ext_size; extern void pgdat_page_ext_init(struct pglist_data *pgdat); +static inline bool early_page_ext_enabled(void) +{ + return early_page_ext; +} + #ifdef CONFIG_SPARSEMEM static inline void page_ext_init_flatmem(void) { @@ -68,6 +74,11 @@ static inline struct page_ext *page_ext_next(struct page_ext *curr) #else /* !CONFIG_PAGE_EXTENSION */ struct page_ext; +static inline bool early_page_ext_enabled(void) +{ + return false; +} + static inline void pgdat_page_ext_init(struct pglist_data *pgdat) { } diff --git a/init/main.c b/init/main.c index 1fe7942f5d4a..2a475d40f952 100644 --- a/init/main.c +++ b/init/main.c @@ -849,6 +849,9 @@ static void __init mm_init(void) pgtable_init(); debug_objects_mem_init(); vmalloc_init(); + /* Should be run after vmap initialization */ + if (early_page_ext_enabled()) + page_ext_init(); /* Should be run before the first non-init thread is created */ init_espfix_bsp(); /* Should be run after espfix64 is set up. */ @@ -1618,7 +1621,8 @@ static noinline void __init kernel_init_freeable(void) padata_init(); page_alloc_init_late(); /* Initialize page ext after all struct pages are initialized. */ - page_ext_init(); + if (!early_page_ext_enabled()) + page_ext_init(); do_basic_setup(); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 48c65bf3cb29..1d4278115d71 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -482,6 +482,8 @@ defer_init(int nid, unsigned long pfn, unsigned long end_pfn) { static unsigned long prev_end_pfn, nr_initialised; + if (early_page_ext_enabled()) + return false; /* * prev_end_pfn static that contains the end of previous zone * No need to protect because called very early in boot before smp_init. diff --git a/mm/page_ext.c b/mm/page_ext.c index b236bdd59fa8..affe80243b6d 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -91,6 +91,14 @@ unsigned long page_ext_size = sizeof(struct page_ext); static unsigned long total_usage; static struct page_ext *lookup_page_ext(const struct page *page); +bool early_page_ext; +static int __init setup_early_page_ext(char *str) +{ + early_page_ext = true; + return 0; +} +early_param("early_page_ext", setup_early_page_ext); + static bool __init invoke_need_callbacks(void) { int i; From 3083da7bcf56a4922b996ea3551847488a43a8b6 Mon Sep 17 00:00:00 2001 From: ye xingchen Date: Fri, 26 Aug 2022 07:19:06 +0000 Subject: [PATCH 07/25] mm: backing-dev: Remove the unneeded result variable Return the value cgwb_bdi_init() directly instead of storing it in another redundant variable. 
Link: https://lkml.kernel.org/r/20220826071906.252419-1-ye.xingchen@zte.com.cn Signed-off-by: ye xingchen Reported-by: Zeal Robot Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton --- mm/backing-dev.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/mm/backing-dev.c b/mm/backing-dev.c index de65cb1e5f76..c30419a5e119 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -776,8 +776,6 @@ static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb) int bdi_init(struct backing_dev_info *bdi) { - int ret; - bdi->dev = NULL; kref_init(&bdi->refcnt); @@ -788,9 +786,7 @@ int bdi_init(struct backing_dev_info *bdi) INIT_LIST_HEAD(&bdi->wb_list); init_waitqueue_head(&bdi->wb_waitq); - ret = cgwb_bdi_init(bdi); - - return ret; + return cgwb_bdi_init(bdi); } struct backing_dev_info *bdi_alloc(int node_id) From 641608f36244eb01028c2b30a961c7242144d96a Mon Sep 17 00:00:00 2001 From: Alexey Romanov Date: Wed, 24 Aug 2022 14:31:17 +0300 Subject: [PATCH 08/25] zram: don't retry compress incompressible page It doesn't make sense for us to retry to compress an uncompressible page (comp_len == PAGE_SIZE) in zsmalloc slowpath, because we will be storing it uncompressed anyway. We can avoid wasting time on another compression attempt. It is enough to take lock (zcomp_stream_get) and execute the code below. Link: https://lkml.kernel.org/r/20220824113117.78849-1-avromanov@sberdevices.ru Signed-off-by: Alexey Romanov Signed-off-by: Dmitry Rokosov Reviewed-by: Sergey Senozhatsky Cc: Alexey Romanov Cc: Dmitry Rokosov Cc: Minchan Kim Cc: Nitin Gupta Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index add43f6c8bc0..607f4634c27d 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1410,9 +1410,19 @@ compress_again: handle = zs_malloc(zram->mem_pool, comp_len, GFP_NOIO | __GFP_HIGHMEM | __GFP_MOVABLE); - if (!IS_ERR((void *)handle)) + if (IS_ERR((void *)handle)) + return PTR_ERR((void *)handle); + + if (comp_len != PAGE_SIZE) goto compress_again; - return PTR_ERR((void *)handle); + /* + * If the page is not compressible, you need to acquire the lock and + * execute the code below. The zcomp_stream_get() call is needed to + * disable the cpu hotplug and grab the zstrm buffer back. + * It is necessary that the dereferencing of the zstrm variable below + * occurs correctly. + */ + zstrm = zcomp_stream_get(zram->comp); } alloced_pages = zs_get_total_pages(zram->mem_pool); From 35b471467f88b8d8b52ba5b006a29b9d057d09bf Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Tue, 23 Aug 2022 17:40:17 -0700 Subject: [PATCH 09/25] filemap: add filemap_get_folios_contig() Patch series "Convert to filemap_get_folios_contig()", v3. This patch series replaces find_get_pages_contig() with filemap_get_folios_contig(). This patch (of 7): This function is meant to replace find_get_pages_contig(). Unlike find_get_pages_contig(), filemap_get_folios_contig() no longer takes in a target number of pages to find - It returns up to 15 contiguous folios. To be more consistent with filemap_get_folios(), filemap_get_folios_contig() now also updates the start index passed in, and takes an end index. 
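A sketch of the intended caller pattern (kernel context, modeled on the conversions later in this series; 'mapping', 'first', 'last' and process() are placeholders, and error handling is elided):

  struct folio_batch fbatch;
  unsigned int i, nr;
  pgoff_t index = first;                  /* first page index of the range */

  folio_batch_init(&fbatch);
  while (index <= last) {
          /* fills up to 15 folios and advances 'index' past them */
          nr = filemap_get_folios_contig(mapping, &index, last, &fbatch);
          if (!nr)
                  break;                  /* hole (or value entry) reached */
          for (i = 0; i < nr; i++)
                  process(fbatch.folios[i]);      /* caller-specific work */
          folio_batch_release(&fbatch);
          cond_resched();
  }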
Link: https://lkml.kernel.org/r/20220824004023.77310-1-vishal.moola@gmail.com Link: https://lkml.kernel.org/r/20220824004023.77310-2-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Cc: Chris Mason Cc: Josef Bacik Cc: David Sterba Cc: Ryusuke Konishi Cc: Al Viro Cc: Matthew Wilcox Cc: David Sterba Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 2 ++ mm/filemap.c | 73 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 0178b2040ea3..8689c32d628b 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -718,6 +718,8 @@ static inline struct page *find_subpage(struct page *head, pgoff_t index) unsigned filemap_get_folios(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch); +unsigned filemap_get_folios_contig(struct address_space *mapping, + pgoff_t *start, pgoff_t end, struct folio_batch *fbatch); unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start, unsigned int nr_pages, struct page **pages); unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, diff --git a/mm/filemap.c b/mm/filemap.c index cb740a6b7227..2a9441afe337 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2196,6 +2196,79 @@ bool folio_more_pages(struct folio *folio, pgoff_t index, pgoff_t max) return index < folio->index + folio_nr_pages(folio) - 1; } +/** + * filemap_get_folios_contig - Get a batch of contiguous folios + * @mapping: The address_space to search + * @start: The starting page index + * @end: The final page index (inclusive) + * @fbatch: The batch to fill + * + * filemap_get_folios_contig() works exactly like filemap_get_folios(), + * except the returned folios are guaranteed to be contiguous. This may + * not return all contiguous folios if the batch gets filled up. + * + * Return: The number of folios found. + * Also update @start to be positioned for traversal of the next folio. + */ + +unsigned filemap_get_folios_contig(struct address_space *mapping, + pgoff_t *start, pgoff_t end, struct folio_batch *fbatch) +{ + XA_STATE(xas, &mapping->i_pages, *start); + unsigned long nr; + struct folio *folio; + + rcu_read_lock(); + + for (folio = xas_load(&xas); folio && xas.xa_index <= end; + folio = xas_next(&xas)) { + if (xas_retry(&xas, folio)) + continue; + /* + * If the entry has been swapped out, we can stop looking. + * No current caller is looking for DAX entries. 
+ */ + if (xa_is_value(folio)) + goto update_start; + + if (!folio_try_get_rcu(folio)) + goto retry; + + if (unlikely(folio != xas_reload(&xas))) + goto put_folio; + + if (!folio_batch_add(fbatch, folio)) { + nr = folio_nr_pages(folio); + + if (folio_test_hugetlb(folio)) + nr = 1; + *start = folio->index + nr; + goto out; + } + continue; +put_folio: + folio_put(folio); + +retry: + xas_reset(&xas); + } + +update_start: + nr = folio_batch_count(fbatch); + + if (nr) { + folio = fbatch->folios[nr - 1]; + if (folio_test_hugetlb(folio)) + *start = folio->index + 1; + else + *start = folio->index + folio_nr_pages(folio); + } +out: + rcu_read_unlock(); + return folio_batch_count(fbatch); +} +EXPORT_SYMBOL(filemap_get_folios_contig); + /** * find_get_pages_contig - gang contiguous pagecache lookup * @mapping: The address_space to search From 04c6b79ae4f0bcbd96afd7cea5e1a8848162438e Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Tue, 23 Aug 2022 17:40:18 -0700 Subject: [PATCH 10/25] btrfs: convert __process_pages_contig() to use filemap_get_folios_contig() Convert to use folios throughout. This is in preparation for the removal of find_get_pages_contig(). Now also supports large folios. Since we may receive more than nr_pages pages, nr_pages may underflow. Since nr_pages > 0 is equivalent to index <= end_index, we replaced it with this check instead. Link: https://lkml.kernel.org/r/20220824004023.77310-3-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: David Sterba Cc: Al Viro Cc: Chris Mason Cc: David Sterba Cc: Josef Bacik Cc: Matthew Wilcox Cc: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/btrfs/extent_io.c | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index cf4f19e80e2f..013d6348deee 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1900,9 +1900,8 @@ static int __process_pages_contig(struct address_space *mapping, pgoff_t start_index = start >> PAGE_SHIFT; pgoff_t end_index = end >> PAGE_SHIFT; pgoff_t index = start_index; - unsigned long nr_pages = end_index - start_index + 1; unsigned long pages_processed = 0; - struct page *pages[16]; + struct folio_batch fbatch; int err = 0; int i; @@ -1911,16 +1910,17 @@ static int __process_pages_contig(struct address_space *mapping, ASSERT(processed_end && *processed_end == start); } - if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0) + if ((page_ops & PAGE_SET_ERROR) && start_index <= end_index) mapping_set_error(mapping, -EIO); - while (nr_pages > 0) { - int found_pages; + folio_batch_init(&fbatch); + while (index <= end_index) { + int found_folios; - found_pages = find_get_pages_contig(mapping, index, - min_t(unsigned long, - nr_pages, ARRAY_SIZE(pages)), pages); - if (found_pages == 0) { + found_folios = filemap_get_folios_contig(mapping, &index, + end_index, &fbatch); + + if (found_folios == 0) { /* * Only if we're going to lock these pages, we can find * nothing at @index. 
@@ -1930,23 +1930,20 @@ static int __process_pages_contig(struct address_space *mapping, goto out; } - for (i = 0; i < found_pages; i++) { + for (i = 0; i < found_folios; i++) { int process_ret; - + struct folio *folio = fbatch.folios[i]; process_ret = process_one_page(fs_info, mapping, - pages[i], locked_page, page_ops, + &folio->page, locked_page, page_ops, start, end); if (process_ret < 0) { - for (; i < found_pages; i++) - put_page(pages[i]); err = -EAGAIN; + folio_batch_release(&fbatch); goto out; } - put_page(pages[i]); - pages_processed++; + pages_processed += folio_nr_pages(folio); } - nr_pages -= found_pages; - index += found_pages; + folio_batch_release(&fbatch); cond_resched(); } out: From a75b81c3f63b3f467d5f54942c661bb79fe07505 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Tue, 23 Aug 2022 17:40:19 -0700 Subject: [PATCH 11/25] btrfs: convert end_compressed_writeback() to use filemap_get_folios() Converted function to use folios throughout. This is in preparation for the removal of find_get_pages_contig(). Now also supports large folios. Since we may receive more than nr_pages pages, nr_pages may underflow. Since nr_pages > 0 is equivalent to index <= end_index, we replaced it with this check instead. Also this function does not care about the pages being contiguous so we can just use filemap_get_folios() to be more efficient. Link: https://lkml.kernel.org/r/20220824004023.77310-4-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: David Sterba Cc: Al Viro Cc: Chris Mason Cc: David Sterba Cc: Josef Bacik Cc: Matthew Wilcox Cc: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/btrfs/compression.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index e84d22c5c6a8..05d228efcd31 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -222,8 +223,7 @@ static noinline void end_compressed_writeback(struct inode *inode, struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); unsigned long index = cb->start >> PAGE_SHIFT; unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT; - struct page *pages[16]; - unsigned long nr_pages = end_index - index + 1; + struct folio_batch fbatch; const int errno = blk_status_to_errno(cb->status); int i; int ret; @@ -231,24 +231,23 @@ static noinline void end_compressed_writeback(struct inode *inode, if (errno) mapping_set_error(inode->i_mapping, errno); - while (nr_pages > 0) { - ret = find_get_pages_contig(inode->i_mapping, index, - min_t(unsigned long, - nr_pages, ARRAY_SIZE(pages)), pages); - if (ret == 0) { - nr_pages -= 1; - index += 1; - continue; - } + folio_batch_init(&fbatch); + while (index <= end_index) { + ret = filemap_get_folios(inode->i_mapping, &index, end_index, + &fbatch); + + if (ret == 0) + return; + for (i = 0; i < ret; i++) { + struct folio *folio = fbatch.folios[i]; + if (errno) - SetPageError(pages[i]); - btrfs_page_clamp_clear_writeback(fs_info, pages[i], + folio_set_error(folio); + btrfs_page_clamp_clear_writeback(fs_info, &folio->page, cb->start, cb->len); - put_page(pages[i]); } - nr_pages -= ret; - index += ret; + folio_batch_release(&fbatch); } /* the inode may be gone now */ } From 47d55419951312d723de1b6ad53ee92948b8eace Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Tue, 23 Aug 2022 17:40:20 -0700 Subject: [PATCH 12/25] btrfs: convert process_page_range() to use 
filemap_get_folios_contig() Converted function to use folios throughout. This is in preparation for the removal of find_get_pages_contig(). Now also supports large folios. Since we may receive more than nr_pages pages, nr_pages may underflow. Since nr_pages > 0 is equivalent to index <= end_index, we replaced it with this check instead. Also minor comment renaming for consistency in subpage. Link: https://lkml.kernel.org/r/20220824004023.77310-5-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: David Sterba Cc: Al Viro Cc: Chris Mason Cc: David Sterba Cc: Josef Bacik Cc: Matthew Wilcox Cc: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/btrfs/subpage.c | 2 +- fs/btrfs/tests/extent-io-tests.c | 32 +++++++++++++++++--------------- 2 files changed, 18 insertions(+), 16 deletions(-) diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 6fc2b77ae5c3..9a176af847d7 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -337,7 +337,7 @@ bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info, * * Even with 0 returned, the page still need extra check to make sure * it's really the correct page, as the caller is using - * find_get_pages_contig(), which can race with page invalidating. + * filemap_get_folios_contig(), which can race with page invalidating. */ int btrfs_page_start_writer_lock(const struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len) diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index a232b15b8021..26b0c99f54b8 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -4,6 +4,7 @@ */ #include +#include #include #include #include @@ -20,39 +21,40 @@ static noinline int process_page_range(struct inode *inode, u64 start, u64 end, unsigned long flags) { int ret; - struct page *pages[16]; + struct folio_batch fbatch; unsigned long index = start >> PAGE_SHIFT; unsigned long end_index = end >> PAGE_SHIFT; - unsigned long nr_pages = end_index - index + 1; int i; int count = 0; int loops = 0; - while (nr_pages > 0) { - ret = find_get_pages_contig(inode->i_mapping, index, - min_t(unsigned long, nr_pages, - ARRAY_SIZE(pages)), pages); + folio_batch_init(&fbatch); + + while (index <= end_index) { + ret = filemap_get_folios_contig(inode->i_mapping, &index, + end_index, &fbatch); for (i = 0; i < ret; i++) { + struct folio *folio = fbatch.folios[i]; + if (flags & PROCESS_TEST_LOCKED && - !PageLocked(pages[i])) + !folio_test_locked(folio)) count++; - if (flags & PROCESS_UNLOCK && PageLocked(pages[i])) - unlock_page(pages[i]); - put_page(pages[i]); + if (flags & PROCESS_UNLOCK && folio_test_locked(folio)) + folio_unlock(folio); if (flags & PROCESS_RELEASE) - put_page(pages[i]); + folio_put(folio); } - nr_pages -= ret; - index += ret; + folio_batch_release(&fbatch); cond_resched(); loops++; if (loops > 100000) { printk(KERN_ERR - "stuck in a loop, start %llu, end %llu, nr_pages %lu, ret %d\n", - start, end, nr_pages, ret); + "stuck in a loop, start %llu, end %llu, ret %d\n", + start, end, ret); break; } } + return count; } From 24a1efb4a91283c853a31fa4775d5c10c09129a4 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Tue, 23 Aug 2022 17:40:21 -0700 Subject: [PATCH 13/25] nilfs2: convert nilfs_find_uncommited_extent() to use filemap_get_folios_contig() Convert function to use folios throughout. This is in preparation for the removal of find_get_pages_contig(). Now also supports large folios. 
Also clean up an unnecessary if statement - pvec.pages[0]->index > index will always evaluate to false, and filemap_get_folios_contig() returns 0 if there is no folio found at index. Link: https://lkml.kernel.org/r/20220824004023.77310-6-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Ryusuke Konishi Cc: Al Viro Cc: Chris Mason Cc: David Sterba Cc: David Sterba Cc: Josef Bacik Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- fs/nilfs2/page.c | 45 ++++++++++++++++++--------------------------- 1 file changed, 18 insertions(+), 27 deletions(-) diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 3267e96c256c..39b7eea2642a 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -480,41 +480,36 @@ unsigned long nilfs_find_uncommitted_extent(struct inode *inode, sector_t start_blk, sector_t *blkoff) { - unsigned int i; + unsigned int i, nr_folios; pgoff_t index; - unsigned int nblocks_in_page; unsigned long length = 0; - sector_t b; - struct pagevec pvec; - struct page *page; + struct folio_batch fbatch; + struct folio *folio; if (inode->i_mapping->nrpages == 0) return 0; index = start_blk >> (PAGE_SHIFT - inode->i_blkbits); - nblocks_in_page = 1U << (PAGE_SHIFT - inode->i_blkbits); - pagevec_init(&pvec); + folio_batch_init(&fbatch); repeat: - pvec.nr = find_get_pages_contig(inode->i_mapping, index, PAGEVEC_SIZE, - pvec.pages); - if (pvec.nr == 0) + nr_folios = filemap_get_folios_contig(inode->i_mapping, &index, ULONG_MAX, + &fbatch); + if (nr_folios == 0) return length; - if (length > 0 && pvec.pages[0]->index > index) - goto out; - - b = pvec.pages[0]->index << (PAGE_SHIFT - inode->i_blkbits); i = 0; do { - page = pvec.pages[i]; + folio = fbatch.folios[i]; - lock_page(page); - if (page_has_buffers(page)) { + folio_lock(folio); + if (folio_buffers(folio)) { struct buffer_head *bh, *head; + sector_t b; - bh = head = page_buffers(page); + b = folio->index << (PAGE_SHIFT - inode->i_blkbits); + bh = head = folio_buffers(folio); do { if (b < start_blk) continue; @@ -529,21 +524,17 @@ repeat: } else { if (length > 0) goto out_locked; - - b += nblocks_in_page; } - unlock_page(page); + folio_unlock(folio); - } while (++i < pagevec_count(&pvec)); + } while (++i < nr_folios); - index = page->index + 1; - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); goto repeat; out_locked: - unlock_page(page); -out: - pagevec_release(&pvec); + folio_unlock(folio); + folio_batch_release(&fbatch); return length; } From 60aac486daa48d9caa9370d480d890f2a1f71742 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Tue, 23 Aug 2022 17:40:22 -0700 Subject: [PATCH 14/25] ramfs: convert ramfs_nommu_get_unmapped_area() to use filemap_get_folios_contig() Convert to use folios throughout. This is in preparation for the removal for find_get_pages_contig(). Now also supports large folios. The initial version of this function set the page_address to be returned after finishing all the checks. Since folio_batches have a maximum of 15 folios, the function had to be modified to support getting and checking up to lpages, 15 pages at a time while still returning the initial page address. Now the function sets ret as soon as the first batch arrives, and updates it only if a check fails. The physical adjacency check utilizes the page frame numbers. The page frame number of each folio must be nr_pages away from the first folio. 
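A standalone sketch of that adjacency rule (struct folio_info and the sample values are invented for illustration): each folio's pfn must equal the first folio's pfn plus the number of pages accumulated so far:

  #include <stdbool.h>
  #include <stdio.h>

  /* one folio: its first page frame number and how many pages it spans */
  struct folio_info {
          unsigned long pfn;
          unsigned long nr_pages;
  };

  /* true if the folios form one physically contiguous range */
  static bool physically_contiguous(const struct folio_info *f, int n)
  {
          unsigned long expected = f[0].pfn;

          for (int i = 0; i < n; i++) {
                  if (f[i].pfn != expected)
                          return false;
                  expected += f[i].nr_pages;   /* next folio must start here */
          }
          return true;
  }

  int main(void)
  {
          struct folio_info ok[]  = { {100, 4}, {104, 1}, {105, 2} };
          struct folio_info bad[] = { {100, 4}, {106, 1} };

          printf("%d %d\n", physically_contiguous(ok, 3),
                 physically_contiguous(bad, 2));   /* prints: 1 0 */
          return 0;
  }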
Link: https://lkml.kernel.org/r/20220824004023.77310-7-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Cc: Al Viro Cc: Chris Mason Cc: David Sterba Cc: David Sterba Cc: Josef Bacik Cc: Matthew Wilcox Cc: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/ramfs/file-nommu.c | 50 +++++++++++++++++++++++++------------------ 1 file changed, 29 insertions(+), 21 deletions(-) diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index ba3525ccc27e..cb240eac5036 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -203,9 +203,9 @@ static unsigned long ramfs_nommu_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { - unsigned long maxpages, lpages, nr, loop, ret; + unsigned long maxpages, lpages, nr_folios, loop, ret, nr_pages, pfn; struct inode *inode = file_inode(file); - struct page **pages = NULL, **ptr, *page; + struct folio_batch fbatch; loff_t isize; /* the mapping mustn't extend beyond the EOF */ @@ -221,31 +221,39 @@ static unsigned long ramfs_nommu_get_unmapped_area(struct file *file, goto out; /* gang-find the pages */ - pages = kcalloc(lpages, sizeof(struct page *), GFP_KERNEL); - if (!pages) - goto out_free; - - nr = find_get_pages_contig(inode->i_mapping, pgoff, lpages, pages); - if (nr != lpages) - goto out_free_pages; /* leave if some pages were missing */ + folio_batch_init(&fbatch); + nr_pages = 0; +repeat: + nr_folios = filemap_get_folios_contig(inode->i_mapping, &pgoff, + ULONG_MAX, &fbatch); + if (!nr_folios) { + ret = -ENOSYS; + return ret; + } + if (ret == -ENOSYS) { + ret = (unsigned long) folio_address(fbatch.folios[0]); + pfn = folio_pfn(fbatch.folios[0]); + } /* check the pages for physical adjacency */ - ptr = pages; - page = *ptr++; - page++; - for (loop = lpages; loop > 1; loop--) - if (*ptr++ != page++) - goto out_free_pages; + for (loop = 0; loop < nr_folios; loop++) { + if (pfn + nr_pages != folio_pfn(fbatch.folios[loop])) { + ret = -ENOSYS; + goto out_free; /* leave if not physical adjacent */ + } + nr_pages += folio_nr_pages(fbatch.folios[loop]); + if (nr_pages >= lpages) + goto out_free; /* successfully found desired pages*/ + } + if (nr_pages < lpages) { + folio_batch_release(&fbatch); + goto repeat; /* loop if pages are missing */ + } /* okay - all conditions fulfilled */ - ret = (unsigned long) page_address(pages[0]); -out_free_pages: - ptr = pages; - for (loop = nr; loop > 0; loop--) - put_page(*ptr++); out_free: - kfree(pages); + folio_batch_release(&fbatch); out: return ret; } From 48658d8509d2db3391c95aa74308a2b1fc8e8461 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Tue, 23 Aug 2022 17:40:23 -0700 Subject: [PATCH 15/25] filemap: remove find_get_pages_contig() All callers of find_get_pages_contig() have been removed, so it is no longer needed. 
Link: https://lkml.kernel.org/r/20220824004023.77310-8-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Cc: Al Viro Cc: Chris Mason Cc: David Sterba Cc: David Sterba Cc: Josef Bacik Cc: Matthew Wilcox Cc: Ryusuke Konishi Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 2 -- mm/filemap.c | 60 ----------------------------------------- 2 files changed, 62 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 8689c32d628b..09de43e36a64 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -720,8 +720,6 @@ unsigned filemap_get_folios(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch); unsigned filemap_get_folios_contig(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch); -unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start, - unsigned int nr_pages, struct page **pages); unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, pgoff_t end, xa_mark_t tag, unsigned int nr_pages, struct page **pages); diff --git a/mm/filemap.c b/mm/filemap.c index 2a9441afe337..8151890e9a00 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2269,66 +2269,6 @@ out: } EXPORT_SYMBOL(filemap_get_folios_contig); -/** - * find_get_pages_contig - gang contiguous pagecache lookup - * @mapping: The address_space to search - * @index: The starting page index - * @nr_pages: The maximum number of pages - * @pages: Where the resulting pages are placed - * - * find_get_pages_contig() works exactly like find_get_pages_range(), - * except that the returned number of pages are guaranteed to be - * contiguous. - * - * Return: the number of pages which were found. - */ -unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, - unsigned int nr_pages, struct page **pages) -{ - XA_STATE(xas, &mapping->i_pages, index); - struct folio *folio; - unsigned int ret = 0; - - if (unlikely(!nr_pages)) - return 0; - - rcu_read_lock(); - for (folio = xas_load(&xas); folio; folio = xas_next(&xas)) { - if (xas_retry(&xas, folio)) - continue; - /* - * If the entry has been swapped out, we can stop looking. - * No current caller is looking for DAX entries. - */ - if (xa_is_value(folio)) - break; - - if (!folio_try_get_rcu(folio)) - goto retry; - - if (unlikely(folio != xas_reload(&xas))) - goto put_page; - -again: - pages[ret] = folio_file_page(folio, xas.xa_index); - if (++ret == nr_pages) - break; - if (folio_more_pages(folio, xas.xa_index, ULONG_MAX)) { - xas.xa_index++; - folio_ref_inc(folio); - goto again; - } - continue; -put_page: - folio_put(folio); -retry: - xas_reset(&xas); - } - rcu_read_unlock(); - return ret; -} -EXPORT_SYMBOL(find_get_pages_contig); - /** * find_get_pages_range_tag - Find and return head pages matching @tag. * @mapping: the address_space to search From 639118d1571f70b1157b4bb5ac574b0ab0f38099 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Sat, 27 Aug 2022 19:20:43 +0800 Subject: [PATCH 16/25] mm: kill is_memblock_offlined() Directly check state of struct memory_block, no need a single function. 
Link: https://lkml.kernel.org/r/20220827112043.187028-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: David Hildenbrand Reviewed-by: Oscar Salvador Reviewed-by: Anshuman Khandual Signed-off-by: Andrew Morton --- drivers/base/memory.c | 6 ------ include/linux/memory_hotplug.h | 2 -- mm/memory_hotplug.c | 3 +-- 3 files changed, 1 insertion(+), 10 deletions(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index bc60c9cd3230..9aa0da991cfb 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -869,12 +869,6 @@ void remove_memory_block_devices(unsigned long start, unsigned long size) } } -/* return true if the memory block is offlined, otherwise, return false */ -bool is_memblock_offlined(struct memory_block *mem) -{ - return mem->state == MEM_OFFLINE; -} - static struct attribute *memory_root_attrs[] = { #ifdef CONFIG_ARCH_MEMORY_PROBE &dev_attr_probe.attr, diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index e0b2209ab71c..54675791bc50 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -11,7 +11,6 @@ struct page; struct zone; struct pglist_data; struct mem_section; -struct memory_block; struct memory_group; struct resource; struct vmem_altmap; @@ -333,7 +332,6 @@ extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, extern void remove_pfn_range_from_zone(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages); -extern bool is_memblock_offlined(struct memory_block *mem); extern int sparse_add_section(int nid, unsigned long pfn, unsigned long nr_pages, struct vmem_altmap *altmap, struct dev_pagemap *pgmap); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index fad6d1f2262a..dc727aee4ad3 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1969,11 +1969,10 @@ failed_removal: static int check_memblock_offlined_cb(struct memory_block *mem, void *arg) { - int ret = !is_memblock_offlined(mem); int *nid = arg; *nid = mem->nid; - if (unlikely(ret)) { + if (unlikely(mem->state != MEM_OFFLINE)) { phys_addr_t beginpa, endpa; beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr)); From b4a0215e11dcfe23a48c65c6d6c82c0c2c551a48 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Sat, 27 Aug 2022 19:19:59 +0800 Subject: [PATCH 17/25] mm: fix null-ptr-deref in kswapd_is_running() kswapd_run/stop() will set pgdat->kswapd to NULL, which could race with kswapd_is_running() in kcompactd(), kswapd_run/stop() kcompactd() kswapd_is_running() pgdat->kswapd // error or nomal ptr verify pgdat->kswapd // load non-NULL pgdat->kswapd pgdat->kswapd = NULL task_is_running(pgdat->kswapd) // Null pointer derefence KASAN reports the null-ptr-deref shown below, vmscan: Failed to start kswapd on node 0 ... BUG: KASAN: null-ptr-deref in kcompactd+0x440/0x504 Read of size 8 at addr 0000000000000024 by task kcompactd0/37 CPU: 0 PID: 37 Comm: kcompactd0 Kdump: loaded Tainted: G OE 5.10.60 #1 Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015 Call trace: dump_backtrace+0x0/0x394 show_stack+0x34/0x4c dump_stack+0x158/0x1e4 __kasan_report+0x138/0x140 kasan_report+0x44/0xdc __asan_load8+0x94/0xd0 kcompactd+0x440/0x504 kthread+0x1a4/0x1f0 ret_from_fork+0x10/0x18 At present kswapd/kcompactd_run() and kswapd/kcompactd_stop() are protected by mem_hotplug_begin/done(), but without kcompactd(). There is no need to involve memory hotplug lock in kcompactd(), so let's add a new mutex to protect pgdat->kswapd accesses. 
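A minimal userspace analogue of the fix (pthreads instead of the kernel mutex; the function names mirror the kernel ones but the bodies are simplified): the reader samples the pointer only while holding the same lock that the stop path takes before clearing it, so it can never dereference a pointer that was just freed:

  #include <pthread.h>
  #include <stdio.h>
  #include <stdlib.h>

  static pthread_mutex_t kswapd_lock = PTHREAD_MUTEX_INITIALIZER;
  static int *kswapd;                      /* stand-in for pgdat->kswapd */

  static int kswapd_is_running(void)       /* reader, as after the fix */
  {
          int running;

          pthread_mutex_lock(&kswapd_lock);
          running = kswapd && *kswapd;     /* pointer can't vanish while held */
          pthread_mutex_unlock(&kswapd_lock);
          return running;
  }

  static void kswapd_stop(void)            /* writer */
  {
          pthread_mutex_lock(&kswapd_lock);
          free(kswapd);
          kswapd = NULL;
          pthread_mutex_unlock(&kswapd_lock);
  }

  static void *stopper(void *arg)
  {
          (void)arg;
          kswapd_stop();
          return NULL;
  }

  int main(void)
  {
          pthread_t t;

          kswapd = malloc(sizeof(*kswapd));
          if (!kswapd)
                  return 1;
          *kswapd = 1;
          pthread_create(&t, NULL, stopper, NULL);
          printf("running=%d\n", kswapd_is_running());  /* 1 or 0, never a crash */
          pthread_join(t, NULL);
          return 0;
  }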
Also, because the kcompactd task will check the state of kswapd task, it's better to call kcompactd_stop() before kswapd_stop() to reduce lock conflicts. [akpm@linux-foundation.org: add comments] Link: https://lkml.kernel.org/r/20220827111959.186838-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/memory_hotplug.h | 20 ++++++++++++++++++++ include/linux/mmzone.h | 6 ++++-- mm/compaction.c | 14 +++++++++++++- mm/memory_hotplug.c | 2 +- mm/page_alloc.c | 1 + mm/vmscan.c | 27 ++++++++++++++++----------- 6 files changed, 55 insertions(+), 15 deletions(-) diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 54675791bc50..51052969dbfe 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -215,6 +215,22 @@ void put_online_mems(void); void mem_hotplug_begin(void); void mem_hotplug_done(void); +/* See kswapd_is_running() */ +static inline void pgdat_kswapd_lock(pg_data_t *pgdat) +{ + mutex_lock(&pgdat->kswapd_lock); +} + +static inline void pgdat_kswapd_unlock(pg_data_t *pgdat) +{ + mutex_unlock(&pgdat->kswapd_lock); +} + +static inline void pgdat_kswapd_lock_init(pg_data_t *pgdat) +{ + mutex_init(&pgdat->kswapd_lock); +} + #else /* ! CONFIG_MEMORY_HOTPLUG */ #define pfn_to_online_page(pfn) \ ({ \ @@ -251,6 +267,10 @@ static inline bool movable_node_is_enabled(void) { return false; } + +static inline void pgdat_kswapd_lock(pg_data_t *pgdat) {} +static inline void pgdat_kswapd_unlock(pg_data_t *pgdat) {} +static inline void pgdat_kswapd_lock_init(pg_data_t *pgdat) {} #endif /* ! CONFIG_MEMORY_HOTPLUG */ /* diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index fd61347b4b1f..18cf0fc5ce67 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -956,8 +956,10 @@ typedef struct pglist_data { atomic_t nr_writeback_throttled;/* nr of writeback-throttled tasks */ unsigned long nr_reclaim_start; /* nr pages written while throttled * when throttling started. */ - struct task_struct *kswapd; /* Protected by - mem_hotplug_begin/done() */ +#ifdef CONFIG_MEMORY_HOTPLUG + struct mutex kswapd_lock; +#endif + struct task_struct *kswapd; /* Protected by kswapd_lock */ int kswapd_order; enum zone_type kswapd_highest_zoneidx; diff --git a/mm/compaction.c b/mm/compaction.c index 640fa76228dd..262c4676b32c 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1981,9 +1981,21 @@ static inline bool is_via_compact_memory(int order) return order == -1; } +/* + * Determine whether kswapd is (or recently was!) running on this node. + * + * pgdat_kswapd_lock() pins pgdat->kswapd, so a concurrent kswapd_stop() can't + * zero it. 
+ */ static bool kswapd_is_running(pg_data_t *pgdat) { - return pgdat->kswapd && task_is_running(pgdat->kswapd); + bool running; + + pgdat_kswapd_lock(pgdat); + running = pgdat->kswapd && task_is_running(pgdat->kswapd); + pgdat_kswapd_unlock(pgdat); + + return running; } /* diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index dc727aee4ad3..9ae1f98548b1 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1940,8 +1940,8 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages, node_states_clear_node(node, &arg); if (arg.status_change_nid >= 0) { - kswapd_stop(node); kcompactd_stop(node); + kswapd_stop(node); } writeback_set_ratelimit(); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 1d4278115d71..09386b81e68e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7616,6 +7616,7 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat) int i; pgdat_resize_init(pgdat); + pgdat_kswapd_lock_init(pgdat); pgdat_init_split_queue(pgdat); pgdat_init_kcompactd(pgdat); diff --git a/mm/vmscan.c b/mm/vmscan.c index bb993b21953d..02e720c83901 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4643,16 +4643,17 @@ void kswapd_run(int nid) { pg_data_t *pgdat = NODE_DATA(nid); - if (pgdat->kswapd) - return; - - pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid); - if (IS_ERR(pgdat->kswapd)) { - /* failure at boot is fatal */ - BUG_ON(system_state < SYSTEM_RUNNING); - pr_err("Failed to start kswapd on node %d\n", nid); - pgdat->kswapd = NULL; + pgdat_kswapd_lock(pgdat); + if (!pgdat->kswapd) { + pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid); + if (IS_ERR(pgdat->kswapd)) { + /* failure at boot is fatal */ + BUG_ON(system_state < SYSTEM_RUNNING); + pr_err("Failed to start kswapd on node %d\n", nid); + pgdat->kswapd = NULL; + } } + pgdat_kswapd_unlock(pgdat); } /* @@ -4661,12 +4662,16 @@ void kswapd_run(int nid) */ void kswapd_stop(int nid) { - struct task_struct *kswapd = NODE_DATA(nid)->kswapd; + pg_data_t *pgdat = NODE_DATA(nid); + struct task_struct *kswapd; + pgdat_kswapd_lock(pgdat); + kswapd = pgdat->kswapd; if (kswapd) { kthread_stop(kswapd); - NODE_DATA(nid)->kswapd = NULL; + pgdat->kswapd = NULL; } + pgdat_kswapd_unlock(pgdat); } static int __init kswapd_init(void) From 09876ae73945ca69550ce2cbe0538de11997fd94 Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Sat, 27 Aug 2022 17:02:50 +0800 Subject: [PATCH 18/25] mm/damon: simplify the parameter passing for 'check_accesses' Patch series "mm/damon: Simplify the damon regions access check", v2. This patchset simplifies the operations when checking the damon regions accesses. This patch (of 2): The parameter 'struct damon_ctx *ctx' isn't used in the functions __damon_{p,v}a_check_access(), so we can remove it and simplify the parameter passing. 
Link: https://lkml.kernel.org/r/1661590971-20893-1-git-send-email-kaixuxia@tencent.com Link: https://lkml.kernel.org/r/1661590971-20893-2-git-send-email-kaixuxia@tencent.com Signed-off-by: Kaixu Xia Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/paddr.c | 5 ++--- mm/damon/vaddr.c | 6 +++--- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index dc131c6a5403..6b0d9e6aa677 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -166,8 +166,7 @@ out: return result.accessed; } -static void __damon_pa_check_access(struct damon_ctx *ctx, - struct damon_region *r) +static void __damon_pa_check_access(struct damon_region *r) { static unsigned long last_addr; static unsigned long last_page_sz = PAGE_SIZE; @@ -196,7 +195,7 @@ static unsigned int damon_pa_check_accesses(struct damon_ctx *ctx) damon_for_each_target(t, ctx) { damon_for_each_region(r, t) { - __damon_pa_check_access(ctx, r); + __damon_pa_check_access(r); max_nr_accesses = max(r->nr_accesses, max_nr_accesses); } } diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index cc04d467ba23..34a72f5e13f5 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -542,8 +542,8 @@ static bool damon_va_young(struct mm_struct *mm, unsigned long addr, * mm 'mm_struct' for the given virtual address space * r the region to be checked */ -static void __damon_va_check_access(struct damon_ctx *ctx, - struct mm_struct *mm, struct damon_region *r) +static void __damon_va_check_access(struct mm_struct *mm, + struct damon_region *r) { static struct mm_struct *last_mm; static unsigned long last_addr; @@ -578,7 +578,7 @@ static unsigned int damon_va_check_accesses(struct damon_ctx *ctx) if (!mm) continue; damon_for_each_region(r, t) { - __damon_va_check_access(ctx, mm, r); + __damon_va_check_access(mm, r); max_nr_accesses = max(r->nr_accesses, max_nr_accesses); } mmput(mm); From 95cd2522669243a1804ba8e9583ed10b3b4f51ef Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Sat, 27 Aug 2022 17:02:51 +0800 Subject: [PATCH 19/25] mm/damon/vaddr: remove comparison between mm and last_mm when checking region accesses The damon regions that belong to the same damon target have the same 'struct mm_struct *mm', so it's unnecessary to compare the mm and last_mm objects among the damon regions in one damon target when checking accesses. But the check is necessary when the target changed in '__damon_va_check_accesses()', so we can simplify the whole operation by using the bool 'same_target' to indicate whether the target changed. 
Link: https://lkml.kernel.org/r/1661590971-20893-3-git-send-email-kaixuxia@tencent.com Signed-off-by: Kaixu Xia Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/vaddr.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 34a72f5e13f5..a8505ad47c60 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -543,15 +543,14 @@ static bool damon_va_young(struct mm_struct *mm, unsigned long addr, * r the region to be checked */ static void __damon_va_check_access(struct mm_struct *mm, - struct damon_region *r) + struct damon_region *r, bool same_target) { - static struct mm_struct *last_mm; static unsigned long last_addr; static unsigned long last_page_sz = PAGE_SIZE; static bool last_accessed; /* If the region is in the last checked page, reuse the result */ - if (mm == last_mm && (ALIGN_DOWN(last_addr, last_page_sz) == + if (same_target && (ALIGN_DOWN(last_addr, last_page_sz) == ALIGN_DOWN(r->sampling_addr, last_page_sz))) { if (last_accessed) r->nr_accesses++; @@ -562,7 +561,6 @@ static void __damon_va_check_access(struct mm_struct *mm, if (last_accessed) r->nr_accesses++; - last_mm = mm; last_addr = r->sampling_addr; } @@ -572,14 +570,17 @@ static unsigned int damon_va_check_accesses(struct damon_ctx *ctx) struct mm_struct *mm; struct damon_region *r; unsigned int max_nr_accesses = 0; + bool same_target; damon_for_each_target(t, ctx) { mm = damon_get_mm(t); if (!mm) continue; + same_target = false; damon_for_each_region(r, t) { - __damon_va_check_access(mm, r); + __damon_va_check_access(mm, r, same_target); max_nr_accesses = max(r->nr_accesses, max_nr_accesses); + same_target = true; } mmput(mm); } From a38c94ed59fc312b51a33d867444ce73c6805ff6 Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Mon, 29 Aug 2022 17:57:09 +0800 Subject: [PATCH 20/25] mm/thp: simplify has_transparent_hugepage by using IS_BUILTIN Simplify code of has_transparent_hugepage define by using IS_BUILTIN. No functional change. Link: https://lkml.kernel.org/r/20220829095709.3287462-1-liushixin2@huawei.com Signed-off-by: Liu Shixin Reviewed-by: David Hildenbrand Cc: Hugh Dickins Cc: Kefeng Wang Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 014ee8f0fbaa..5911761487de 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1598,11 +1598,7 @@ typedef unsigned int pgtbl_mod_mask; #endif #ifndef has_transparent_hugepage -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -#define has_transparent_hugepage() 1 -#else -#define has_transparent_hugepage() 0 -#endif +#define has_transparent_hugepage() IS_BUILTIN(CONFIG_TRANSPARENT_HUGEPAGE) #endif /* From bcd0dea5f4fb3d098b03ef3064fe6c8201d7c515 Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Mon, 29 Aug 2022 17:51:25 +0800 Subject: [PATCH 21/25] mm/thp: remove redundant CONFIG_TRANSPARENT_HUGEPAGE Simplify code by removing redundant CONFIG_TRANSPARENT_HUGEPAGE judgment. No functional change. 
Link: https://lkml.kernel.org/r/20220829095125.3284567-1-liushixin2@huawei.com Signed-off-by: Liu Shixin Cc: Kefeng Wang Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 5911761487de..d13b4f7cc5be 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1276,8 +1276,7 @@ static inline int pgd_devmap(pgd_t pgd) #endif #if !defined(CONFIG_TRANSPARENT_HUGEPAGE) || \ - (defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ - !defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)) + !defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) static inline int pud_trans_huge(pud_t pud) { return 0; From 8eabc77c38d8b7345bc0b234b7d26695c29a0822 Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Mon, 29 Aug 2022 17:46:06 +0800 Subject: [PATCH 22/25] mm/damon: get the hotness from damon_hot_score() in damon_pageout_score() We can get the hotness value from damon_hot_score() directly in damon_pageout_score() function and improve the code readability. Link: https://lkml.kernel.org/r/1661766366-20998-1-git-send-email-kaixuxia@tencent.com Signed-off-by: Kaixu Xia Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/ops-common.c | 52 ++++++++----------------------------------- 1 file changed, 9 insertions(+), 43 deletions(-) diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c index b1335de200e7..f599838b5f64 100644 --- a/mm/damon/ops-common.c +++ b/mm/damon/ops-common.c @@ -88,49 +88,6 @@ void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm, unsigned long addr) #define DAMON_MAX_SUBSCORE (100) #define DAMON_MAX_AGE_IN_LOG (32) -int damon_pageout_score(struct damon_ctx *c, struct damon_region *r, - struct damos *s) -{ - unsigned int max_nr_accesses; - int freq_subscore; - unsigned int age_in_sec; - int age_in_log, age_subscore; - unsigned int freq_weight = s->quota.weight_nr_accesses; - unsigned int age_weight = s->quota.weight_age; - int hotness; - - max_nr_accesses = c->aggr_interval / c->sample_interval; - freq_subscore = r->nr_accesses * DAMON_MAX_SUBSCORE / max_nr_accesses; - - age_in_sec = (unsigned long)r->age * c->aggr_interval / 1000000; - for (age_in_log = 0; age_in_log < DAMON_MAX_AGE_IN_LOG && age_in_sec; - age_in_log++, age_in_sec >>= 1) - ; - - /* If frequency is 0, higher age means it's colder */ - if (freq_subscore == 0) - age_in_log *= -1; - - /* - * Now age_in_log is in [-DAMON_MAX_AGE_IN_LOG, DAMON_MAX_AGE_IN_LOG]. - * Scale it to be in [0, 100] and set it as age subscore. 
- */ - age_in_log += DAMON_MAX_AGE_IN_LOG; - age_subscore = age_in_log * DAMON_MAX_SUBSCORE / - DAMON_MAX_AGE_IN_LOG / 2; - - hotness = (freq_weight * freq_subscore + age_weight * age_subscore); - if (freq_weight + age_weight) - hotness /= freq_weight + age_weight; - /* - * Transform it to fit in [0, DAMOS_MAX_SCORE] - */ - hotness = hotness * DAMOS_MAX_SCORE / DAMON_MAX_SUBSCORE; - - /* Return coldness of the region */ - return DAMOS_MAX_SCORE - hotness; -} - int damon_hot_score(struct damon_ctx *c, struct damon_region *r, struct damos *s) { @@ -172,3 +129,12 @@ int damon_hot_score(struct damon_ctx *c, struct damon_region *r, return hotness; } + +int damon_pageout_score(struct damon_ctx *c, struct damon_region *r, + struct damos *s) +{ + int hotness = damon_hot_score(c, r, s); + + /* Return coldness of the region */ + return DAMOS_MAX_SCORE - hotness; +} From 663d0cfd2e77aa3ed170df76d441c8f07efa3cf6 Mon Sep 17 00:00:00 2001 From: zezuo Date: Wed, 31 Aug 2022 01:34:04 +0000 Subject: [PATCH 23/25] mm/page_alloc.c: delete a redundant parameter of rmqueue_pcplist The gfp_flags parameter is not used in rmqueue_pcplist, so directly delete this parameter. Link: https://lkml.kernel.org/r/20220831013404.3360714-1-zuoze1@huawei.com Signed-off-by: zezuo Cc: Kefeng Wang Signed-off-by: Andrew Morton --- mm/page_alloc.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 09386b81e68e..ce3315e22f35 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3779,8 +3779,7 @@ struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order, /* Lock and remove page from the per-cpu list */ static struct page *rmqueue_pcplist(struct zone *preferred_zone, struct zone *zone, unsigned int order, - gfp_t gfp_flags, int migratetype, - unsigned int alloc_flags) + int migratetype, unsigned int alloc_flags) { struct per_cpu_pages *pcp; struct list_head *list; @@ -3841,7 +3840,7 @@ struct page *rmqueue(struct zone *preferred_zone, if (!IS_ENABLED(CONFIG_CMA) || alloc_flags & ALLOC_CMA || migratetype != MIGRATE_MOVABLE) { page = rmqueue_pcplist(preferred_zone, zone, order, - gfp_flags, migratetype, alloc_flags); + migratetype, alloc_flags); if (likely(page)) goto out; } From 0742e49026121371c7ce1f640628c68c7da175d6 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Tue, 30 Aug 2022 12:01:38 +1000 Subject: [PATCH 24/25] mm/migrate_device.c: fix a misleading and outdated comment Commit ab09243aa95a ("mm/migrate.c: remove MIGRATE_PFN_LOCKED") changed the way trylock_page() in migrate_vma_collect_pmd() works without updating the comment. Reword the comment to be less misleading and a better reflection of what happens. Link: https://lkml.kernel.org/r/20220830020138.497063-1-apopple@nvidia.com Fixes: ab09243aa95a ("mm/migrate.c: remove MIGRATE_PFN_LOCKED") Signed-off-by: Alistair Popple Reported-by: Peter Xu Acked-by: Peter Xu Signed-off-by: Andrew Morton --- mm/migrate_device.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/mm/migrate_device.c b/mm/migrate_device.c index 27fb37d65476..72354248dde9 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -185,9 +185,16 @@ again: get_page(page); /* - * Optimize for the common case where page is only mapped once - * in one process. If we can lock the page, then we can safely - * set up a special migration page table entry now. + * We rely on trylock_page() to avoid deadlock between + * concurrent migrations where each is waiting on the others + * page lock. 
If we can't immediately lock the page we fail this + * migration as it is only best effort anyway. + * + * If we can lock the page it's safe to set up a migration entry + * now. In the common case where the page is mapped once in a + * single process setting up the migration entry now is an + * optimisation to avoid walking the rmap later with + * try_to_migrate(). */ if (trylock_page(page)) { bool anon_exclusive; From bd1264c37c15a75a3164852740ad0c9529907d83 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Tue, 30 Aug 2022 22:27:34 -0700 Subject: [PATCH 25/25] mm/vmalloc: extend find_vmap_lowest_match_check with extra arguments find_vmap_lowest_match() is now able to handle different roots. With DEBUG_AUGMENT_LOWEST_MATCH_CHECK enabled as: : --- a/mm/vmalloc.c : +++ b/mm/vmalloc.c : @@ -713,7 +713,7 @@ EXPORT_SYMBOL(vmalloc_to_pfn); : /*** Global kva allocator ***/ : : -#define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0 : +#define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 1 compilation failed as: mm/vmalloc.c: In function 'find_vmap_lowest_match_check': mm/vmalloc.c:1328:32: warning: passing argument 1 of 'find_vmap_lowest_match' makes pointer from integer without a cast [-Wint-conversion] 1328 | va_1 = find_vmap_lowest_match(size, align, vstart, false); | ^~~~ | | | long unsigned int mm/vmalloc.c:1236:40: note: expected 'struct rb_root *' but argument is of type 'long unsigned int' 1236 | find_vmap_lowest_match(struct rb_root *root, unsigned long size, | ~~~~~~~~~~~~~~~~^~~~ mm/vmalloc.c:1328:9: error: too few arguments to function 'find_vmap_lowest_match' 1328 | va_1 = find_vmap_lowest_match(size, align, vstart, false); | ^~~~~~~~~~~~~~~~~~~~~~ mm/vmalloc.c:1236:1: note: declared here 1236 | find_vmap_lowest_match(struct rb_root *root, unsigned long size, | ^~~~~~~~~~~~~~~~~~~~~~ Extend find_vmap_lowest_match_check() and find_vmap_lowest_linear_match() with extra arguments to fix this. 
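For reference, after this patch the two debug helpers mirror
find_vmap_lowest_match() in taking the root/list explicitly (signatures as
in the diff below):

	static struct vmap_area *
	find_vmap_lowest_linear_match(struct list_head *head, unsigned long size,
			unsigned long align, unsigned long vstart);

	static void
	find_vmap_lowest_match_check(struct rb_root *root, struct list_head *head,
			unsigned long size, unsigned long align);

and __alloc_vmap_area() passes its own root and head straight through to the
check.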
Link: https://lkml.kernel.org/r/20220906060548.1127396-1-song@kernel.org Link: https://lkml.kernel.org/r/20220831052734.3423079-1-song@kernel.org Fixes: f9863be49312 ("mm/vmalloc: extend __alloc_vmap_area() with extra arguments") Signed-off-by: Song Liu Reviewed-by: Baoquan He Reviewed-by: Uladzislau Rezki (Sony) Signed-off-by: Andrew Morton --- mm/vmalloc.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index e68c0081e861..a991b909866f 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1300,12 +1300,12 @@ find_vmap_lowest_match(struct rb_root *root, unsigned long size, #include static struct vmap_area * -find_vmap_lowest_linear_match(unsigned long size, +find_vmap_lowest_linear_match(struct list_head *head, unsigned long size, unsigned long align, unsigned long vstart) { struct vmap_area *va; - list_for_each_entry(va, &free_vmap_area_list, list) { + list_for_each_entry(va, head, list) { if (!is_within_this_va(va, size, align, vstart)) continue; @@ -1316,7 +1316,8 @@ find_vmap_lowest_linear_match(unsigned long size, } static void -find_vmap_lowest_match_check(unsigned long size, unsigned long align) +find_vmap_lowest_match_check(struct rb_root *root, struct list_head *head, + unsigned long size, unsigned long align) { struct vmap_area *va_1, *va_2; unsigned long vstart; @@ -1325,8 +1326,8 @@ find_vmap_lowest_match_check(unsigned long size, unsigned long align) get_random_bytes(&rnd, sizeof(rnd)); vstart = VMALLOC_START + rnd; - va_1 = find_vmap_lowest_match(size, align, vstart, false); - va_2 = find_vmap_lowest_linear_match(size, align, vstart); + va_1 = find_vmap_lowest_match(root, size, align, vstart, false); + va_2 = find_vmap_lowest_linear_match(head, size, align, vstart); if (va_1 != va_2) pr_emerg("not lowest: t: 0x%p, l: 0x%p, v: 0x%lx\n", @@ -1513,7 +1514,7 @@ __alloc_vmap_area(struct rb_root *root, struct list_head *head, return vend; #if DEBUG_AUGMENT_LOWEST_MATCH_CHECK - find_vmap_lowest_match_check(size, align); + find_vmap_lowest_match_check(root, head, size, align); #endif return nva_start_addr;