diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 77415ce01a5e..45bd3e475a7a 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1469,6 +1469,14 @@ Permit 'security.evm' to be updated regardless of current integrity status. + early_page_ext [KNL] Enforces page_ext initialization to earlier + stages so cover more early boot allocations. + Please note that as side effect some optimizations + might be disabled to achieve that (e.g. parallelized + memory initialization is disabled) so the boot process + might take longer, especially on systems with a lot of + memory. Available with CONFIG_PAGE_EXTENSION=y. + failslab= fail_usercopy= fail_page_alloc= diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c index 10e51ef9c79a..c299a18273ff 100644 --- a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c @@ -237,16 +237,6 @@ int pud_huge(pud_t pud) return pud_large(pud); } -struct page * -follow_huge_pud(struct mm_struct *mm, unsigned long address, - pud_t *pud, int flags) -{ - if (flags & FOLL_GET) - return NULL; - - return pud_page(*pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT); -} - bool __init arch_hugetlb_valid_size(unsigned long size) { if (MACHINE_HAS_EDAT1 && size == PMD_SIZE) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index bc60c9cd3230..9aa0da991cfb 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -869,12 +869,6 @@ void remove_memory_block_devices(unsigned long start, unsigned long size) } } -/* return true if the memory block is offlined, otherwise, return false */ -bool is_memblock_offlined(struct memory_block *mem) -{ - return mem->state == MEM_OFFLINE; -} - static struct attribute *memory_root_attrs[] = { #ifdef CONFIG_ARCH_MEMORY_PROBE &dev_attr_probe.attr, diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 04077a2bb8a2..44de3d125319 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1410,9 +1410,19 @@ compress_again: handle = zs_malloc(zram->mem_pool, comp_len, GFP_NOIO | __GFP_HIGHMEM | __GFP_MOVABLE); - if (!IS_ERR((void *)handle)) + if (IS_ERR((void *)handle)) + return PTR_ERR((void *)handle); + + if (comp_len != PAGE_SIZE) goto compress_again; - return PTR_ERR((void *)handle); + /* + * If the page is not compressible, you need to acquire the lock and + * execute the code below. The zcomp_stream_get() call is needed to + * disable the cpu hotplug and grab the zstrm buffer back. + * It is necessary that the dereferencing of the zstrm variable below + * occurs correctly. + */ + zstrm = zcomp_stream_get(zram->comp); } alloced_pages = zs_get_total_pages(zram->mem_pool); diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index c0a08064b0a7..f1f051ad3147 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -219,8 +220,7 @@ static noinline void end_compressed_writeback(struct inode *inode, struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); unsigned long index = cb->start >> PAGE_SHIFT; unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT; - struct page *pages[16]; - unsigned long nr_pages = end_index - index + 1; + struct folio_batch fbatch; const int errno = blk_status_to_errno(cb->status); int i; int ret; @@ -228,24 +228,23 @@ static noinline void end_compressed_writeback(struct inode *inode, if (errno) mapping_set_error(inode->i_mapping, errno); - while (nr_pages > 0) { - ret = find_get_pages_contig(inode->i_mapping, index, - min_t(unsigned long, - nr_pages, ARRAY_SIZE(pages)), pages); - if (ret == 0) { - nr_pages -= 1; - index += 1; - continue; - } + folio_batch_init(&fbatch); + while (index <= end_index) { + ret = filemap_get_folios(inode->i_mapping, &index, end_index, + &fbatch); + + if (ret == 0) + return; + for (i = 0; i < ret; i++) { + struct folio *folio = fbatch.folios[i]; + if (errno) - SetPageError(pages[i]); - btrfs_page_clamp_clear_writeback(fs_info, pages[i], + folio_set_error(folio); + btrfs_page_clamp_clear_writeback(fs_info, &folio->page, cb->start, cb->len); - put_page(pages[i]); } - nr_pages -= ret; - index += ret; + folio_batch_release(&fbatch); } /* the inode may be gone now */ } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 1eae68fbae21..4dcf22e051ff 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -270,9 +270,8 @@ static int __process_pages_contig(struct address_space *mapping, pgoff_t start_index = start >> PAGE_SHIFT; pgoff_t end_index = end >> PAGE_SHIFT; pgoff_t index = start_index; - unsigned long nr_pages = end_index - start_index + 1; unsigned long pages_processed = 0; - struct page *pages[16]; + struct folio_batch fbatch; int err = 0; int i; @@ -281,16 +280,17 @@ static int __process_pages_contig(struct address_space *mapping, ASSERT(processed_end && *processed_end == start); } - if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0) + if ((page_ops & PAGE_SET_ERROR) && start_index <= end_index) mapping_set_error(mapping, -EIO); - while (nr_pages > 0) { - int found_pages; + folio_batch_init(&fbatch); + while (index <= end_index) { + int found_folios; - found_pages = find_get_pages_contig(mapping, index, - min_t(unsigned long, - nr_pages, ARRAY_SIZE(pages)), pages); - if (found_pages == 0) { + found_folios = filemap_get_folios_contig(mapping, &index, + end_index, &fbatch); + + if (found_folios == 0) { /* * Only if we're going to lock these pages, we can find * nothing at @index. @@ -300,23 +300,20 @@ static int __process_pages_contig(struct address_space *mapping, goto out; } - for (i = 0; i < found_pages; i++) { + for (i = 0; i < found_folios; i++) { int process_ret; - + struct folio *folio = fbatch.folios[i]; process_ret = process_one_page(fs_info, mapping, - pages[i], locked_page, page_ops, + &folio->page, locked_page, page_ops, start, end); if (process_ret < 0) { - for (; i < found_pages; i++) - put_page(pages[i]); err = -EAGAIN; + folio_batch_release(&fbatch); goto out; } - put_page(pages[i]); - pages_processed++; + pages_processed += folio_nr_pages(folio); } - nr_pages -= found_pages; - index += found_pages; + folio_batch_release(&fbatch); cond_resched(); } out: diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 6fc2b77ae5c3..9a176af847d7 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -337,7 +337,7 @@ bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info, * * Even with 0 returned, the page still need extra check to make sure * it's really the correct page, as the caller is using - * find_get_pages_contig(), which can race with page invalidating. + * filemap_get_folios_contig(), which can race with page invalidating. */ int btrfs_page_start_writer_lock(const struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len) diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index f69ec4d2d6eb..350da449db08 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -4,6 +4,7 @@ */ #include +#include #include #include #include @@ -20,39 +21,40 @@ static noinline int process_page_range(struct inode *inode, u64 start, u64 end, unsigned long flags) { int ret; - struct page *pages[16]; + struct folio_batch fbatch; unsigned long index = start >> PAGE_SHIFT; unsigned long end_index = end >> PAGE_SHIFT; - unsigned long nr_pages = end_index - index + 1; int i; int count = 0; int loops = 0; - while (nr_pages > 0) { - ret = find_get_pages_contig(inode->i_mapping, index, - min_t(unsigned long, nr_pages, - ARRAY_SIZE(pages)), pages); + folio_batch_init(&fbatch); + + while (index <= end_index) { + ret = filemap_get_folios_contig(inode->i_mapping, &index, + end_index, &fbatch); for (i = 0; i < ret; i++) { + struct folio *folio = fbatch.folios[i]; + if (flags & PROCESS_TEST_LOCKED && - !PageLocked(pages[i])) + !folio_test_locked(folio)) count++; - if (flags & PROCESS_UNLOCK && PageLocked(pages[i])) - unlock_page(pages[i]); - put_page(pages[i]); + if (flags & PROCESS_UNLOCK && folio_test_locked(folio)) + folio_unlock(folio); if (flags & PROCESS_RELEASE) - put_page(pages[i]); + folio_put(folio); } - nr_pages -= ret; - index += ret; + folio_batch_release(&fbatch); cond_resched(); loops++; if (loops > 100000) { printk(KERN_ERR - "stuck in a loop, start %llu, end %llu, nr_pages %lu, ret %d\n", - start, end, nr_pages, ret); + "stuck in a loop, start %llu, end %llu, ret %d\n", + start, end, ret); break; } } + return count; } diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 3267e96c256c..39b7eea2642a 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -480,41 +480,36 @@ unsigned long nilfs_find_uncommitted_extent(struct inode *inode, sector_t start_blk, sector_t *blkoff) { - unsigned int i; + unsigned int i, nr_folios; pgoff_t index; - unsigned int nblocks_in_page; unsigned long length = 0; - sector_t b; - struct pagevec pvec; - struct page *page; + struct folio_batch fbatch; + struct folio *folio; if (inode->i_mapping->nrpages == 0) return 0; index = start_blk >> (PAGE_SHIFT - inode->i_blkbits); - nblocks_in_page = 1U << (PAGE_SHIFT - inode->i_blkbits); - pagevec_init(&pvec); + folio_batch_init(&fbatch); repeat: - pvec.nr = find_get_pages_contig(inode->i_mapping, index, PAGEVEC_SIZE, - pvec.pages); - if (pvec.nr == 0) + nr_folios = filemap_get_folios_contig(inode->i_mapping, &index, ULONG_MAX, + &fbatch); + if (nr_folios == 0) return length; - if (length > 0 && pvec.pages[0]->index > index) - goto out; - - b = pvec.pages[0]->index << (PAGE_SHIFT - inode->i_blkbits); i = 0; do { - page = pvec.pages[i]; + folio = fbatch.folios[i]; - lock_page(page); - if (page_has_buffers(page)) { + folio_lock(folio); + if (folio_buffers(folio)) { struct buffer_head *bh, *head; + sector_t b; - bh = head = page_buffers(page); + b = folio->index << (PAGE_SHIFT - inode->i_blkbits); + bh = head = folio_buffers(folio); do { if (b < start_blk) continue; @@ -529,21 +524,17 @@ repeat: } else { if (length > 0) goto out_locked; - - b += nblocks_in_page; } - unlock_page(page); + folio_unlock(folio); - } while (++i < pagevec_count(&pvec)); + } while (++i < nr_folios); - index = page->index + 1; - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); goto repeat; out_locked: - unlock_page(page); -out: - pagevec_release(&pvec); + folio_unlock(folio); + folio_batch_release(&fbatch); return length; } diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index ba3525ccc27e..cb240eac5036 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -203,9 +203,9 @@ static unsigned long ramfs_nommu_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { - unsigned long maxpages, lpages, nr, loop, ret; + unsigned long maxpages, lpages, nr_folios, loop, ret, nr_pages, pfn; struct inode *inode = file_inode(file); - struct page **pages = NULL, **ptr, *page; + struct folio_batch fbatch; loff_t isize; /* the mapping mustn't extend beyond the EOF */ @@ -221,31 +221,39 @@ static unsigned long ramfs_nommu_get_unmapped_area(struct file *file, goto out; /* gang-find the pages */ - pages = kcalloc(lpages, sizeof(struct page *), GFP_KERNEL); - if (!pages) - goto out_free; - - nr = find_get_pages_contig(inode->i_mapping, pgoff, lpages, pages); - if (nr != lpages) - goto out_free_pages; /* leave if some pages were missing */ + folio_batch_init(&fbatch); + nr_pages = 0; +repeat: + nr_folios = filemap_get_folios_contig(inode->i_mapping, &pgoff, + ULONG_MAX, &fbatch); + if (!nr_folios) { + ret = -ENOSYS; + return ret; + } + if (ret == -ENOSYS) { + ret = (unsigned long) folio_address(fbatch.folios[0]); + pfn = folio_pfn(fbatch.folios[0]); + } /* check the pages for physical adjacency */ - ptr = pages; - page = *ptr++; - page++; - for (loop = lpages; loop > 1; loop--) - if (*ptr++ != page++) - goto out_free_pages; + for (loop = 0; loop < nr_folios; loop++) { + if (pfn + nr_pages != folio_pfn(fbatch.folios[loop])) { + ret = -ENOSYS; + goto out_free; /* leave if not physical adjacent */ + } + nr_pages += folio_nr_pages(fbatch.folios[loop]); + if (nr_pages >= lpages) + goto out_free; /* successfully found desired pages*/ + } + if (nr_pages < lpages) { + folio_batch_release(&fbatch); + goto repeat; /* loop if pages are missing */ + } /* okay - all conditions fulfilled */ - ret = (unsigned long) page_address(pages[0]); -out_free_pages: - ptr = pages; - for (loop = nr; loop > 0; loop--) - put_page(*ptr++); out_free: - kfree(pages); + folio_batch_release(&fbatch); out: return ret; } diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 567f12323f55..cf115888f3bf 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -354,10 +354,11 @@ struct mem_cgroup { }; /* - * size of first charge trial. "32" comes from vmscan.c's magic value. - * TODO: maybe necessary to use big numbers in big irons. + * size of first charge trial. + * TODO: maybe necessary to use big numbers in big irons or dynamic based of the + * workload. */ -#define MEMCG_CHARGE_BATCH 32U +#define MEMCG_CHARGE_BATCH 64U extern struct mem_cgroup *root_mem_cgroup; diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index e0b2209ab71c..51052969dbfe 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -11,7 +11,6 @@ struct page; struct zone; struct pglist_data; struct mem_section; -struct memory_block; struct memory_group; struct resource; struct vmem_altmap; @@ -216,6 +215,22 @@ void put_online_mems(void); void mem_hotplug_begin(void); void mem_hotplug_done(void); +/* See kswapd_is_running() */ +static inline void pgdat_kswapd_lock(pg_data_t *pgdat) +{ + mutex_lock(&pgdat->kswapd_lock); +} + +static inline void pgdat_kswapd_unlock(pg_data_t *pgdat) +{ + mutex_unlock(&pgdat->kswapd_lock); +} + +static inline void pgdat_kswapd_lock_init(pg_data_t *pgdat) +{ + mutex_init(&pgdat->kswapd_lock); +} + #else /* ! CONFIG_MEMORY_HOTPLUG */ #define pfn_to_online_page(pfn) \ ({ \ @@ -252,6 +267,10 @@ static inline bool movable_node_is_enabled(void) { return false; } + +static inline void pgdat_kswapd_lock(pg_data_t *pgdat) {} +static inline void pgdat_kswapd_unlock(pg_data_t *pgdat) {} +static inline void pgdat_kswapd_lock_init(pg_data_t *pgdat) {} #endif /* ! CONFIG_MEMORY_HOTPLUG */ /* @@ -333,7 +352,6 @@ extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, extern void remove_pfn_range_from_zone(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages); -extern bool is_memblock_offlined(struct memory_block *mem); extern int sparse_add_section(int nid, unsigned long pfn, unsigned long nr_pages, struct vmem_altmap *altmap, struct dev_pagemap *pgmap); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index ebe073cfcf67..fe6c5e04aeaf 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -955,8 +955,10 @@ typedef struct pglist_data { atomic_t nr_writeback_throttled;/* nr of writeback-throttled tasks */ unsigned long nr_reclaim_start; /* nr pages written while throttled * when throttling started. */ - struct task_struct *kswapd; /* Protected by - mem_hotplug_begin/done() */ +#ifdef CONFIG_MEMORY_HOTPLUG + struct mutex kswapd_lock; +#endif + struct task_struct *kswapd; /* Protected by kswapd_lock */ int kswapd_order; enum zone_type kswapd_highest_zoneidx; diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h index 679591301994..78a1c934e416 100644 --- a/include/linux/page_counter.h +++ b/include/linux/page_counter.h @@ -3,15 +3,26 @@ #define _LINUX_PAGE_COUNTER_H #include +#include #include #include +#if defined(CONFIG_SMP) +struct pc_padding { + char x[0]; +} ____cacheline_internodealigned_in_smp; +#define PC_PADDING(name) struct pc_padding name +#else +#define PC_PADDING(name) +#endif + struct page_counter { + /* + * Make sure 'usage' does not share cacheline with any other field. The + * memcg->memory.usage is a hot member of struct mem_cgroup. + */ atomic_long_t usage; - unsigned long min; - unsigned long low; - unsigned long high; - unsigned long max; + PC_PADDING(_pad1_); /* effective memory.min and memory.min usage tracking */ unsigned long emin; @@ -23,18 +34,18 @@ struct page_counter { atomic_long_t low_usage; atomic_long_t children_low_usage; - /* legacy */ unsigned long watermark; unsigned long failcnt; - /* - * 'parent' is placed here to be far from 'usage' to reduce - * cache false sharing, as 'usage' is written mostly while - * parent is frequently read for cgroup's hierarchical - * counting nature. - */ + /* Keep all the read most fields in a separete cacheline. */ + PC_PADDING(_pad2_); + + unsigned long min; + unsigned long low; + unsigned long high; + unsigned long max; struct page_counter *parent; -}; +} ____cacheline_internodealigned_in_smp; #if BITS_PER_LONG == 32 #define PAGE_COUNTER_MAX LONG_MAX diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h index ed27198cdaf4..22be4582faae 100644 --- a/include/linux/page_ext.h +++ b/include/linux/page_ext.h @@ -36,9 +36,15 @@ struct page_ext { unsigned long flags; }; +extern bool early_page_ext; extern unsigned long page_ext_size; extern void pgdat_page_ext_init(struct pglist_data *pgdat); +static inline bool early_page_ext_enabled(void) +{ + return early_page_ext; +} + #ifdef CONFIG_SPARSEMEM static inline void page_ext_init_flatmem(void) { @@ -68,6 +74,11 @@ static inline struct page_ext *page_ext_next(struct page_ext *curr) #else /* !CONFIG_PAGE_EXTENSION */ struct page_ext; +static inline bool early_page_ext_enabled(void) +{ + return false; +} + static inline void pgdat_page_ext_init(struct pglist_data *pgdat) { } diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 201dc7281640..384e12a1a976 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -718,8 +718,8 @@ static inline struct page *find_subpage(struct page *head, pgoff_t index) unsigned filemap_get_folios(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch); -unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start, - unsigned int nr_pages, struct page **pages); +unsigned filemap_get_folios_contig(struct address_space *mapping, + pgoff_t *start, pgoff_t end, struct folio_batch *fbatch); unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, pgoff_t end, xa_mark_t tag, unsigned int nr_pages, struct page **pages); diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 014ee8f0fbaa..d13b4f7cc5be 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1276,8 +1276,7 @@ static inline int pgd_devmap(pgd_t pgd) #endif #if !defined(CONFIG_TRANSPARENT_HUGEPAGE) || \ - (defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ - !defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)) + !defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) static inline int pud_trans_huge(pud_t pud) { return 0; @@ -1598,11 +1597,7 @@ typedef unsigned int pgtbl_mod_mask; #endif #ifndef has_transparent_hugepage -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -#define has_transparent_hugepage() 1 -#else -#define has_transparent_hugepage() 0 -#endif +#define has_transparent_hugepage() IS_BUILTIN(CONFIG_TRANSPARENT_HUGEPAGE) #endif /* diff --git a/init/main.c b/init/main.c index e827c544b3e3..d97e7b38c34a 100644 --- a/init/main.c +++ b/init/main.c @@ -847,6 +847,9 @@ static void __init mm_init(void) pgtable_init(); debug_objects_mem_init(); vmalloc_init(); + /* Should be run after vmap initialization */ + if (early_page_ext_enabled()) + page_ext_init(); /* Should be run before the first non-init thread is created */ init_espfix_bsp(); /* Should be run after espfix64 is set up. */ @@ -1615,7 +1618,8 @@ static noinline void __init kernel_init_freeable(void) padata_init(); page_alloc_init_late(); /* Initialize page ext after all struct pages are initialized. */ - page_ext_init(); + if (!early_page_ext_enabled()) + page_ext_init(); do_basic_setup(); diff --git a/mm/Kconfig b/mm/Kconfig index 3897e924e40f..39a919586949 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -23,7 +23,7 @@ menuconfig SWAP in your computer. If unsure say Y. config ZSWAP - bool "Compressed cache for swap pages (EXPERIMENTAL)" + bool "Compressed cache for swap pages" depends on SWAP select FRONTSWAP select CRYPTO @@ -36,12 +36,6 @@ config ZSWAP in the case where decompressing from RAM is faster than swap device reads, can also improve workload performance. - This is marked experimental because it is a new feature (as of - v3.11) that interacts heavily with memory reclaim. While these - interactions don't cause any known issues on simple memory setups, - they have not be fully explored on the large set of potential - configurations and workloads that exist. - config ZSWAP_DEFAULT_ON bool "Enable the compressed cache for swap pages by default" depends on ZSWAP diff --git a/mm/backing-dev.c b/mm/backing-dev.c index de65cb1e5f76..c30419a5e119 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -776,8 +776,6 @@ static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb) int bdi_init(struct backing_dev_info *bdi) { - int ret; - bdi->dev = NULL; kref_init(&bdi->refcnt); @@ -788,9 +786,7 @@ int bdi_init(struct backing_dev_info *bdi) INIT_LIST_HEAD(&bdi->wb_list); init_waitqueue_head(&bdi->wb_waitq); - ret = cgwb_bdi_init(bdi); - - return ret; + return cgwb_bdi_init(bdi); } struct backing_dev_info *bdi_alloc(int node_id) diff --git a/mm/compaction.c b/mm/compaction.c index 10561cb1aaad..13144c46618e 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1977,9 +1977,21 @@ static inline bool is_via_compact_memory(int order) return order == -1; } +/* + * Determine whether kswapd is (or recently was!) running on this node. + * + * pgdat_kswapd_lock() pins pgdat->kswapd, so a concurrent kswapd_stop() can't + * zero it. + */ static bool kswapd_is_running(pg_data_t *pgdat) { - return pgdat->kswapd && task_is_running(pgdat->kswapd); + bool running; + + pgdat_kswapd_lock(pgdat); + running = pgdat->kswapd && task_is_running(pgdat->kswapd); + pgdat_kswapd_unlock(pgdat); + + return running; } /* diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c index b1335de200e7..f599838b5f64 100644 --- a/mm/damon/ops-common.c +++ b/mm/damon/ops-common.c @@ -88,49 +88,6 @@ void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm, unsigned long addr) #define DAMON_MAX_SUBSCORE (100) #define DAMON_MAX_AGE_IN_LOG (32) -int damon_pageout_score(struct damon_ctx *c, struct damon_region *r, - struct damos *s) -{ - unsigned int max_nr_accesses; - int freq_subscore; - unsigned int age_in_sec; - int age_in_log, age_subscore; - unsigned int freq_weight = s->quota.weight_nr_accesses; - unsigned int age_weight = s->quota.weight_age; - int hotness; - - max_nr_accesses = c->aggr_interval / c->sample_interval; - freq_subscore = r->nr_accesses * DAMON_MAX_SUBSCORE / max_nr_accesses; - - age_in_sec = (unsigned long)r->age * c->aggr_interval / 1000000; - for (age_in_log = 0; age_in_log < DAMON_MAX_AGE_IN_LOG && age_in_sec; - age_in_log++, age_in_sec >>= 1) - ; - - /* If frequency is 0, higher age means it's colder */ - if (freq_subscore == 0) - age_in_log *= -1; - - /* - * Now age_in_log is in [-DAMON_MAX_AGE_IN_LOG, DAMON_MAX_AGE_IN_LOG]. - * Scale it to be in [0, 100] and set it as age subscore. - */ - age_in_log += DAMON_MAX_AGE_IN_LOG; - age_subscore = age_in_log * DAMON_MAX_SUBSCORE / - DAMON_MAX_AGE_IN_LOG / 2; - - hotness = (freq_weight * freq_subscore + age_weight * age_subscore); - if (freq_weight + age_weight) - hotness /= freq_weight + age_weight; - /* - * Transform it to fit in [0, DAMOS_MAX_SCORE] - */ - hotness = hotness * DAMOS_MAX_SCORE / DAMON_MAX_SUBSCORE; - - /* Return coldness of the region */ - return DAMOS_MAX_SCORE - hotness; -} - int damon_hot_score(struct damon_ctx *c, struct damon_region *r, struct damos *s) { @@ -172,3 +129,12 @@ int damon_hot_score(struct damon_ctx *c, struct damon_region *r, return hotness; } + +int damon_pageout_score(struct damon_ctx *c, struct damon_region *r, + struct damos *s) +{ + int hotness = damon_hot_score(c, r, s); + + /* Return coldness of the region */ + return DAMOS_MAX_SCORE - hotness; +} diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index dc131c6a5403..6b0d9e6aa677 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -166,8 +166,7 @@ out: return result.accessed; } -static void __damon_pa_check_access(struct damon_ctx *ctx, - struct damon_region *r) +static void __damon_pa_check_access(struct damon_region *r) { static unsigned long last_addr; static unsigned long last_page_sz = PAGE_SIZE; @@ -196,7 +195,7 @@ static unsigned int damon_pa_check_accesses(struct damon_ctx *ctx) damon_for_each_target(t, ctx) { damon_for_each_region(r, t) { - __damon_pa_check_access(ctx, r); + __damon_pa_check_access(r); max_nr_accesses = max(r->nr_accesses, max_nr_accesses); } } diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index cc04d467ba23..a8505ad47c60 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -542,16 +542,15 @@ static bool damon_va_young(struct mm_struct *mm, unsigned long addr, * mm 'mm_struct' for the given virtual address space * r the region to be checked */ -static void __damon_va_check_access(struct damon_ctx *ctx, - struct mm_struct *mm, struct damon_region *r) +static void __damon_va_check_access(struct mm_struct *mm, + struct damon_region *r, bool same_target) { - static struct mm_struct *last_mm; static unsigned long last_addr; static unsigned long last_page_sz = PAGE_SIZE; static bool last_accessed; /* If the region is in the last checked page, reuse the result */ - if (mm == last_mm && (ALIGN_DOWN(last_addr, last_page_sz) == + if (same_target && (ALIGN_DOWN(last_addr, last_page_sz) == ALIGN_DOWN(r->sampling_addr, last_page_sz))) { if (last_accessed) r->nr_accesses++; @@ -562,7 +561,6 @@ static void __damon_va_check_access(struct damon_ctx *ctx, if (last_accessed) r->nr_accesses++; - last_mm = mm; last_addr = r->sampling_addr; } @@ -572,14 +570,17 @@ static unsigned int damon_va_check_accesses(struct damon_ctx *ctx) struct mm_struct *mm; struct damon_region *r; unsigned int max_nr_accesses = 0; + bool same_target; damon_for_each_target(t, ctx) { mm = damon_get_mm(t); if (!mm) continue; + same_target = false; damon_for_each_region(r, t) { - __damon_va_check_access(ctx, mm, r); + __damon_va_check_access(mm, r, same_target); max_nr_accesses = max(r->nr_accesses, max_nr_accesses); + same_target = true; } mmput(mm); } diff --git a/mm/filemap.c b/mm/filemap.c index c8d7ef70e48c..8cb832af31ff 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2197,30 +2197,31 @@ bool folio_more_pages(struct folio *folio, pgoff_t index, pgoff_t max) } /** - * find_get_pages_contig - gang contiguous pagecache lookup + * filemap_get_folios_contig - Get a batch of contiguous folios * @mapping: The address_space to search - * @index: The starting page index - * @nr_pages: The maximum number of pages - * @pages: Where the resulting pages are placed + * @start: The starting page index + * @end: The final page index (inclusive) + * @fbatch: The batch to fill * - * find_get_pages_contig() works exactly like find_get_pages_range(), - * except that the returned number of pages are guaranteed to be - * contiguous. + * filemap_get_folios_contig() works exactly like filemap_get_folios(), + * except the returned folios are guaranteed to be contiguous. This may + * not return all contiguous folios if the batch gets filled up. * - * Return: the number of pages which were found. + * Return: The number of folios found. + * Also update @start to be positioned for traversal of the next folio. */ -unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, - unsigned int nr_pages, struct page **pages) -{ - XA_STATE(xas, &mapping->i_pages, index); - struct folio *folio; - unsigned int ret = 0; - if (unlikely(!nr_pages)) - return 0; +unsigned filemap_get_folios_contig(struct address_space *mapping, + pgoff_t *start, pgoff_t end, struct folio_batch *fbatch) +{ + XA_STATE(xas, &mapping->i_pages, *start); + unsigned long nr; + struct folio *folio; rcu_read_lock(); - for (folio = xas_load(&xas); folio; folio = xas_next(&xas)) { + + for (folio = xas_load(&xas); folio && xas.xa_index <= end; + folio = xas_next(&xas)) { if (xas_retry(&xas, folio)) continue; /* @@ -2228,33 +2229,45 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, * No current caller is looking for DAX entries. */ if (xa_is_value(folio)) - break; + goto update_start; if (!folio_try_get_rcu(folio)) goto retry; if (unlikely(folio != xas_reload(&xas))) - goto put_page; + goto put_folio; -again: - pages[ret] = folio_file_page(folio, xas.xa_index); - if (++ret == nr_pages) - break; - if (folio_more_pages(folio, xas.xa_index, ULONG_MAX)) { - xas.xa_index++; - folio_ref_inc(folio); - goto again; + if (!folio_batch_add(fbatch, folio)) { + nr = folio_nr_pages(folio); + + if (folio_test_hugetlb(folio)) + nr = 1; + *start = folio->index + nr; + goto out; } continue; -put_page: +put_folio: folio_put(folio); + retry: xas_reset(&xas); } + +update_start: + nr = folio_batch_count(fbatch); + + if (nr) { + folio = fbatch->folios[nr - 1]; + if (folio_test_hugetlb(folio)) + *start = folio->index + 1; + else + *start = folio->index + folio_nr_pages(folio); + } +out: rcu_read_unlock(); - return ret; + return folio_batch_count(fbatch); } -EXPORT_SYMBOL(find_get_pages_contig); +EXPORT_SYMBOL(filemap_get_folios_contig); /** * find_get_pages_range_tag - Find and return head pages matching @tag. diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index fad6d1f2262a..9ae1f98548b1 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1940,8 +1940,8 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages, node_states_clear_node(node, &arg); if (arg.status_change_nid >= 0) { - kswapd_stop(node); kcompactd_stop(node); + kswapd_stop(node); } writeback_set_ratelimit(); @@ -1969,11 +1969,10 @@ failed_removal: static int check_memblock_offlined_cb(struct memory_block *mem, void *arg) { - int ret = !is_memblock_offlined(mem); int *nid = arg; *nid = mem->nid; - if (unlikely(ret)) { + if (unlikely(mem->state != MEM_OFFLINE)) { phys_addr_t beginpa, endpa; beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr)); diff --git a/mm/migrate_device.c b/mm/migrate_device.c index dbf6c7a7a7c9..d8efd5a0eb40 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -186,9 +186,16 @@ again: get_page(page); /* - * Optimize for the common case where page is only mapped once - * in one process. If we can lock the page, then we can safely - * set up a special migration page table entry now. + * We rely on trylock_page() to avoid deadlock between + * concurrent migrations where each is waiting on the others + * page lock. If we can't immediately lock the page we fail this + * migration as it is only best effort anyway. + * + * If we can lock the page it's safe to set up a migration entry + * now. In the common case where the page is mapped once in a + * single process setting up the migration entry now is an + * optimisation to avoid walking the rmap later with + * try_to_migrate(). */ if (trylock_page(page)) { bool anon_exclusive; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ba0b438d57ab..65fba92d6676 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -482,6 +482,8 @@ defer_init(int nid, unsigned long pfn, unsigned long end_pfn) { static unsigned long prev_end_pfn, nr_initialised; + if (early_page_ext_enabled()) + return false; /* * prev_end_pfn static that contains the end of previous zone * No need to protect because called very early in boot before smp_init. @@ -3777,8 +3779,7 @@ struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order, /* Lock and remove page from the per-cpu list */ static struct page *rmqueue_pcplist(struct zone *preferred_zone, struct zone *zone, unsigned int order, - gfp_t gfp_flags, int migratetype, - unsigned int alloc_flags) + int migratetype, unsigned int alloc_flags) { struct per_cpu_pages *pcp; struct list_head *list; @@ -3839,7 +3840,7 @@ struct page *rmqueue(struct zone *preferred_zone, if (!IS_ENABLED(CONFIG_CMA) || alloc_flags & ALLOC_CMA || migratetype != MIGRATE_MOVABLE) { page = rmqueue_pcplist(preferred_zone, zone, order, - gfp_flags, migratetype, alloc_flags); + migratetype, alloc_flags); if (likely(page)) goto out; } @@ -7663,6 +7664,7 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat) int i; pgdat_resize_init(pgdat); + pgdat_kswapd_lock_init(pgdat); pgdat_init_split_queue(pgdat); pgdat_init_kcompactd(pgdat); diff --git a/mm/page_counter.c b/mm/page_counter.c index 8a0cc24b60dd..db20d6452b71 100644 --- a/mm/page_counter.c +++ b/mm/page_counter.c @@ -17,24 +17,23 @@ static void propagate_protected_usage(struct page_counter *c, unsigned long usage) { unsigned long protected, old_protected; - unsigned long low, min; long delta; if (!c->parent) return; - min = READ_ONCE(c->min); - if (min || atomic_long_read(&c->min_usage)) { - protected = min(usage, min); + protected = min(usage, READ_ONCE(c->min)); + old_protected = atomic_long_read(&c->min_usage); + if (protected != old_protected) { old_protected = atomic_long_xchg(&c->min_usage, protected); delta = protected - old_protected; if (delta) atomic_long_add(delta, &c->parent->children_min_usage); } - low = READ_ONCE(c->low); - if (low || atomic_long_read(&c->low_usage)) { - protected = min(usage, low); + protected = min(usage, READ_ONCE(c->low)); + old_protected = atomic_long_read(&c->low_usage); + if (protected != old_protected) { old_protected = atomic_long_xchg(&c->low_usage, protected); delta = protected - old_protected; if (delta) diff --git a/mm/page_ext.c b/mm/page_ext.c index b236bdd59fa8..affe80243b6d 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -91,6 +91,14 @@ unsigned long page_ext_size = sizeof(struct page_ext); static unsigned long total_usage; static struct page_ext *lookup_page_ext(const struct page *page); +bool early_page_ext; +static int __init setup_early_page_ext(char *str) +{ + early_page_ext = true; + return 0; +} +early_param("early_page_ext", setup_early_page_ext); + static bool __init invoke_need_callbacks(void) { int i; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index e68c0081e861..a991b909866f 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1300,12 +1300,12 @@ find_vmap_lowest_match(struct rb_root *root, unsigned long size, #include static struct vmap_area * -find_vmap_lowest_linear_match(unsigned long size, +find_vmap_lowest_linear_match(struct list_head *head, unsigned long size, unsigned long align, unsigned long vstart) { struct vmap_area *va; - list_for_each_entry(va, &free_vmap_area_list, list) { + list_for_each_entry(va, head, list) { if (!is_within_this_va(va, size, align, vstart)) continue; @@ -1316,7 +1316,8 @@ find_vmap_lowest_linear_match(unsigned long size, } static void -find_vmap_lowest_match_check(unsigned long size, unsigned long align) +find_vmap_lowest_match_check(struct rb_root *root, struct list_head *head, + unsigned long size, unsigned long align) { struct vmap_area *va_1, *va_2; unsigned long vstart; @@ -1325,8 +1326,8 @@ find_vmap_lowest_match_check(unsigned long size, unsigned long align) get_random_bytes(&rnd, sizeof(rnd)); vstart = VMALLOC_START + rnd; - va_1 = find_vmap_lowest_match(size, align, vstart, false); - va_2 = find_vmap_lowest_linear_match(size, align, vstart); + va_1 = find_vmap_lowest_match(root, size, align, vstart, false); + va_2 = find_vmap_lowest_linear_match(head, size, align, vstart); if (va_1 != va_2) pr_emerg("not lowest: t: 0x%p, l: 0x%p, v: 0x%lx\n", @@ -1513,7 +1514,7 @@ __alloc_vmap_area(struct rb_root *root, struct list_head *head, return vend; #if DEBUG_AUGMENT_LOWEST_MATCH_CHECK - find_vmap_lowest_match_check(size, align); + find_vmap_lowest_match_check(root, head, size, align); #endif return nva_start_addr; diff --git a/mm/vmscan.c b/mm/vmscan.c index 380a99e37ef4..0358d87196e9 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4651,16 +4651,17 @@ void kswapd_run(int nid) { pg_data_t *pgdat = NODE_DATA(nid); - if (pgdat->kswapd) - return; - - pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid); - if (IS_ERR(pgdat->kswapd)) { - /* failure at boot is fatal */ - BUG_ON(system_state < SYSTEM_RUNNING); - pr_err("Failed to start kswapd on node %d\n", nid); - pgdat->kswapd = NULL; + pgdat_kswapd_lock(pgdat); + if (!pgdat->kswapd) { + pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid); + if (IS_ERR(pgdat->kswapd)) { + /* failure at boot is fatal */ + BUG_ON(system_state < SYSTEM_RUNNING); + pr_err("Failed to start kswapd on node %d\n", nid); + pgdat->kswapd = NULL; + } } + pgdat_kswapd_unlock(pgdat); } /* @@ -4669,12 +4670,16 @@ void kswapd_run(int nid) */ void kswapd_stop(int nid) { - struct task_struct *kswapd = NODE_DATA(nid)->kswapd; + pg_data_t *pgdat = NODE_DATA(nid); + struct task_struct *kswapd; + pgdat_kswapd_lock(pgdat); + kswapd = pgdat->kswapd; if (kswapd) { kthread_stop(kswapd); - NODE_DATA(nid)->kswapd = NULL; + pgdat->kswapd = NULL; } + pgdat_kswapd_unlock(pgdat); } static int __init kswapd_init(void)