diff --git a/MAINTAINERS b/MAINTAINERS index bbf961b06a7b..80476bee0d67 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -19179,7 +19179,7 @@ S: Maintained F: drivers/infiniband/hw/vmw_pvrdma/ VMware PVSCSI driver -M: Jim Gill +M: Vishal Bhakta M: VMware PV-Drivers L: linux-scsi@vger.kernel.org S: Maintained diff --git a/arch/arm/xen/p2m.c b/arch/arm/xen/p2m.c index acb464547a54..84a1cea1f43b 100644 --- a/arch/arm/xen/p2m.c +++ b/arch/arm/xen/p2m.c @@ -11,6 +11,7 @@ #include #include +#include #include #include @@ -109,7 +110,7 @@ int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, map_ops[i].status = GNTST_general_error; unmap.host_addr = map_ops[i].host_addr, unmap.handle = map_ops[i].handle; - map_ops[i].handle = ~0; + map_ops[i].handle = INVALID_GRANT_HANDLE; if (map_ops[i].flags & GNTMAP_device_map) unmap.dev_bus_addr = map_ops[i].dev_bus_addr; else @@ -130,7 +131,6 @@ int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, return 0; } -EXPORT_SYMBOL_GPL(set_foreign_p2m_mapping); int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops, struct gnttab_unmap_grant_ref *kunmap_ops, @@ -145,7 +145,6 @@ int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops, return 0; } -EXPORT_SYMBOL_GPL(clear_foreign_p2m_mapping); bool __set_phys_to_machine_multi(unsigned long pfn, unsigned long mfn, unsigned long nr_pages) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 1f212b47a48a..5656e7aacd69 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1055,8 +1055,6 @@ config HW_PERF_EVENTS config SYS_SUPPORTS_HUGETLBFS def_bool y -config ARCH_WANT_HUGE_PMD_SHARE - config ARCH_HAS_CACHE_LINE_SIZE def_bool y @@ -1157,8 +1155,8 @@ config XEN config FORCE_MAX_ZONEORDER int - default "14" if (ARM64_64K_PAGES && TRANSPARENT_HUGEPAGE) - default "12" if (ARM64_16K_PAGES && TRANSPARENT_HUGEPAGE) + default "14" if ARM64_64K_PAGES + default "12" if ARM64_16K_PAGES default "11" help The kernel memory allocator divides physically contiguous memory @@ -1855,12 +1853,6 @@ config CMDLINE_FROM_BOOTLOADER the boot loader doesn't provide any, the default kernel command string provided in CMDLINE will be used. -config CMDLINE_EXTEND - bool "Extend bootloader kernel arguments" - help - The command-line arguments provided by the boot loader will be - appended to the default kernel command string. - config CMDLINE_FORCE bool "Always use the default kernel command string" help diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index c759faf7a1ff..0aabc3be9a75 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -328,6 +328,11 @@ static inline void *phys_to_virt(phys_addr_t x) #define ARCH_PFN_OFFSET ((unsigned long)PHYS_PFN_OFFSET) #if !defined(CONFIG_SPARSEMEM_VMEMMAP) || defined(CONFIG_DEBUG_VIRTUAL) +#define page_to_virt(x) ({ \ + __typeof__(x) __page = x; \ + void *__addr = __va(page_to_phys(__page)); \ + (void *)__tag_set((const void *)__addr, page_kasan_tag(__page));\ +}) #define virt_to_page(x) pfn_to_page(virt_to_pfn(x)) #else #define page_to_virt(x) ({ \ diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index 70ce8c1d2b07..bd02e99b1a4c 100644 --- a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -63,23 +63,6 @@ static inline void cpu_switch_mm(pgd_t *pgd, struct mm_struct *mm) extern u64 idmap_t0sz; extern u64 idmap_ptrs_per_pgd; -static inline bool __cpu_uses_extended_idmap(void) -{ - if (IS_ENABLED(CONFIG_ARM64_VA_BITS_52)) - return false; - - return unlikely(idmap_t0sz != TCR_T0SZ(VA_BITS)); -} - -/* - * True if the extended ID map requires an extra level of translation table - * to be configured. - */ -static inline bool __cpu_uses_extended_idmap_level(void) -{ - return ARM64_HW_PGTABLE_LEVELS(64 - idmap_t0sz) > CONFIG_PGTABLE_LEVELS; -} - /* * Ensure TCR.T0SZ is set to the provided value. */ diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h index 046be789fbb4..9a65fb528110 100644 --- a/arch/arm64/include/asm/pgtable-prot.h +++ b/arch/arm64/include/asm/pgtable-prot.h @@ -66,7 +66,6 @@ extern bool arm64_use_ng_mappings; #define _PAGE_DEFAULT (_PROT_DEFAULT | PTE_ATTRINDX(MT_NORMAL)) #define PAGE_KERNEL __pgprot(PROT_NORMAL) -#define PAGE_KERNEL_TAGGED __pgprot(PROT_NORMAL_TAGGED) #define PAGE_KERNEL_RO __pgprot((PROT_NORMAL & ~PTE_WRITE) | PTE_RDONLY) #define PAGE_KERNEL_ROX __pgprot((PROT_NORMAL & ~(PTE_WRITE | PTE_PXN)) | PTE_RDONLY) #define PAGE_KERNEL_EXEC __pgprot(PROT_NORMAL & ~PTE_PXN) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index e17b96d0e4b5..47027796c2f9 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -486,6 +486,9 @@ static inline pmd_t pmd_mkdevmap(pmd_t pmd) __pgprot_modify(prot, PTE_ATTRINDX_MASK, PTE_ATTRINDX(MT_NORMAL_NC) | PTE_PXN | PTE_UXN) #define pgprot_device(prot) \ __pgprot_modify(prot, PTE_ATTRINDX_MASK, PTE_ATTRINDX(MT_DEVICE_nGnRE) | PTE_PXN | PTE_UXN) +#define pgprot_tagged(prot) \ + __pgprot_modify(prot, PTE_ATTRINDX_MASK, PTE_ATTRINDX(MT_NORMAL_TAGGED)) +#define pgprot_mhp pgprot_tagged /* * DMA allocations for non-coherent devices use what the Arm architecture calls * "Normal non-cacheable" memory, which permits speculation, unaligned accesses diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index dfd4edbfe360..d4a5fca984c3 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -796,6 +796,11 @@ #define ID_AA64MMFR0_PARANGE_48 0x5 #define ID_AA64MMFR0_PARANGE_52 0x6 +#define ID_AA64MMFR0_TGRAN_2_SUPPORTED_DEFAULT 0x0 +#define ID_AA64MMFR0_TGRAN_2_SUPPORTED_NONE 0x1 +#define ID_AA64MMFR0_TGRAN_2_SUPPORTED_MIN 0x2 +#define ID_AA64MMFR0_TGRAN_2_SUPPORTED_MAX 0x7 + #ifdef CONFIG_ARM64_PA_BITS_52 #define ID_AA64MMFR0_PARANGE_MAX ID_AA64MMFR0_PARANGE_52 #else @@ -961,14 +966,17 @@ #define ID_PFR1_PROGMOD_SHIFT 0 #if defined(CONFIG_ARM64_4K_PAGES) -#define ID_AA64MMFR0_TGRAN_SHIFT ID_AA64MMFR0_TGRAN4_SHIFT -#define ID_AA64MMFR0_TGRAN_SUPPORTED ID_AA64MMFR0_TGRAN4_SUPPORTED +#define ID_AA64MMFR0_TGRAN_SHIFT ID_AA64MMFR0_TGRAN4_SHIFT +#define ID_AA64MMFR0_TGRAN_SUPPORTED_MIN ID_AA64MMFR0_TGRAN4_SUPPORTED +#define ID_AA64MMFR0_TGRAN_SUPPORTED_MAX 0x7 #elif defined(CONFIG_ARM64_16K_PAGES) -#define ID_AA64MMFR0_TGRAN_SHIFT ID_AA64MMFR0_TGRAN16_SHIFT -#define ID_AA64MMFR0_TGRAN_SUPPORTED ID_AA64MMFR0_TGRAN16_SUPPORTED +#define ID_AA64MMFR0_TGRAN_SHIFT ID_AA64MMFR0_TGRAN16_SHIFT +#define ID_AA64MMFR0_TGRAN_SUPPORTED_MIN ID_AA64MMFR0_TGRAN16_SUPPORTED +#define ID_AA64MMFR0_TGRAN_SUPPORTED_MAX 0xF #elif defined(CONFIG_ARM64_64K_PAGES) -#define ID_AA64MMFR0_TGRAN_SHIFT ID_AA64MMFR0_TGRAN64_SHIFT -#define ID_AA64MMFR0_TGRAN_SUPPORTED ID_AA64MMFR0_TGRAN64_SUPPORTED +#define ID_AA64MMFR0_TGRAN_SHIFT ID_AA64MMFR0_TGRAN64_SHIFT +#define ID_AA64MMFR0_TGRAN_SUPPORTED_MIN ID_AA64MMFR0_TGRAN64_SUPPORTED +#define ID_AA64MMFR0_TGRAN_SUPPORTED_MAX 0x7 #endif #define MVFR2_FPMISC_SHIFT 4 diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 66b0e0b66e31..840bda1869e9 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -319,7 +319,7 @@ SYM_FUNC_START_LOCAL(__create_page_tables) */ adrp x5, __idmap_text_end clz x5, x5 - cmp x5, TCR_T0SZ(VA_BITS) // default T0SZ small enough? + cmp x5, TCR_T0SZ(VA_BITS_MIN) // default T0SZ small enough? b.ge 1f // .. then skip VA range extension adr_l x6, idmap_t0sz @@ -655,8 +655,10 @@ SYM_FUNC_END(__secondary_too_slow) SYM_FUNC_START(__enable_mmu) mrs x2, ID_AA64MMFR0_EL1 ubfx x2, x2, #ID_AA64MMFR0_TGRAN_SHIFT, 4 - cmp x2, #ID_AA64MMFR0_TGRAN_SUPPORTED - b.ne __no_granule_support + cmp x2, #ID_AA64MMFR0_TGRAN_SUPPORTED_MIN + b.lt __no_granule_support + cmp x2, #ID_AA64MMFR0_TGRAN_SUPPORTED_MAX + b.gt __no_granule_support update_early_cpu_boot_status 0, x2, x3 adrp x2, idmap_pg_dir phys_to_ttbr x1, x1 diff --git a/arch/arm64/kernel/idreg-override.c b/arch/arm64/kernel/idreg-override.c index dffb16682330..83f1c4b92095 100644 --- a/arch/arm64/kernel/idreg-override.c +++ b/arch/arm64/kernel/idreg-override.c @@ -163,33 +163,36 @@ static __init void __parse_cmdline(const char *cmdline, bool parse_aliases) } while (1); } +static __init const u8 *get_bootargs_cmdline(void) +{ + const u8 *prop; + void *fdt; + int node; + + fdt = get_early_fdt_ptr(); + if (!fdt) + return NULL; + + node = fdt_path_offset(fdt, "/chosen"); + if (node < 0) + return NULL; + + prop = fdt_getprop(fdt, node, "bootargs", NULL); + if (!prop) + return NULL; + + return strlen(prop) ? prop : NULL; +} + static __init void parse_cmdline(void) { - if (!IS_ENABLED(CONFIG_CMDLINE_FORCE)) { - const u8 *prop; - void *fdt; - int node; + const u8 *prop = get_bootargs_cmdline(); - fdt = get_early_fdt_ptr(); - if (!fdt) - goto out; - - node = fdt_path_offset(fdt, "/chosen"); - if (node < 0) - goto out; - - prop = fdt_getprop(fdt, node, "bootargs", NULL); - if (!prop) - goto out; + if (IS_ENABLED(CONFIG_CMDLINE_FORCE) || !prop) + __parse_cmdline(CONFIG_CMDLINE, true); + if (!IS_ENABLED(CONFIG_CMDLINE_FORCE) && prop) __parse_cmdline(prop, true); - - if (!IS_ENABLED(CONFIG_CMDLINE_EXTEND)) - return; - } - -out: - __parse_cmdline(CONFIG_CMDLINE, true); } /* Keep checkers quiet */ diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c index 7d2318f80955..4658fcf88c2b 100644 --- a/arch/arm64/kernel/perf_event.c +++ b/arch/arm64/kernel/perf_event.c @@ -460,7 +460,7 @@ static inline int armv8pmu_counter_has_overflowed(u32 pmnc, int idx) return pmnc & BIT(ARMV8_IDX_TO_COUNTER(idx)); } -static inline u32 armv8pmu_read_evcntr(int idx) +static inline u64 armv8pmu_read_evcntr(int idx) { u32 counter = ARMV8_IDX_TO_COUNTER(idx); diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index 47f3f035f3ea..e81c7ec9e102 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -311,16 +311,18 @@ int kvm_set_ipa_limit(void) } switch (cpuid_feature_extract_unsigned_field(mmfr0, tgran_2)) { - default: - case 1: + case ID_AA64MMFR0_TGRAN_2_SUPPORTED_NONE: kvm_err("PAGE_SIZE not supported at Stage-2, giving up\n"); return -EINVAL; - case 0: + case ID_AA64MMFR0_TGRAN_2_SUPPORTED_DEFAULT: kvm_debug("PAGE_SIZE supported at Stage-2 (default)\n"); break; - case 2: + case ID_AA64MMFR0_TGRAN_2_SUPPORTED_MIN ... ID_AA64MMFR0_TGRAN_2_SUPPORTED_MAX: kvm_debug("PAGE_SIZE supported at Stage-2 (advertised)\n"); break; + default: + kvm_err("Unsupported value for TGRAN_2, giving up\n"); + return -EINVAL; } kvm_ipa_limit = id_aa64mmfr0_parange_to_phys_shift(parange); diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 0ace5e68efba..3685e12aba9b 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -219,17 +219,40 @@ static void __init zone_sizes_init(unsigned long min, unsigned long max) int pfn_valid(unsigned long pfn) { - phys_addr_t addr = pfn << PAGE_SHIFT; + phys_addr_t addr = PFN_PHYS(pfn); - if ((addr >> PAGE_SHIFT) != pfn) + /* + * Ensure the upper PAGE_SHIFT bits are clear in the + * pfn. Else it might lead to false positives when + * some of the upper bits are set, but the lower bits + * match a valid pfn. + */ + if (PHYS_PFN(addr) != pfn) return 0; #ifdef CONFIG_SPARSEMEM +{ + struct mem_section *ms; + if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS) return 0; - if (!valid_section(__pfn_to_section(pfn))) + ms = __pfn_to_section(pfn); + if (!valid_section(ms)) return 0; + + /* + * ZONE_DEVICE memory does not have the memblock entries. + * memblock_is_map_memory() check for ZONE_DEVICE based + * addresses will always fail. Even the normal hotplugged + * memory will never have MEMBLOCK_NOMAP flag set in their + * memblock entries. Skip memblock search for all non early + * memory sections covering all of hotplug memory including + * both normal and ZONE_DEVICE based. + */ + if (!early_section(ms)) + return pfn_section_valid(ms, pfn); +} #endif return memblock_is_map_memory(addr); } diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 3802cfbdd20d..7484ea4f6ba0 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -40,7 +40,7 @@ #define NO_BLOCK_MAPPINGS BIT(0) #define NO_CONT_MAPPINGS BIT(1) -u64 idmap_t0sz = TCR_T0SZ(VA_BITS); +u64 idmap_t0sz = TCR_T0SZ(VA_BITS_MIN); u64 idmap_ptrs_per_pgd = PTRS_PER_PGD; u64 __section(".mmuoff.data.write") vabits_actual; @@ -512,7 +512,8 @@ static void __init map_mem(pgd_t *pgdp) * if MTE is present. Otherwise, it has the same attributes as * PAGE_KERNEL. */ - __map_memblock(pgdp, start, end, PAGE_KERNEL_TAGGED, flags); + __map_memblock(pgdp, start, end, pgprot_tagged(PAGE_KERNEL), + flags); } /* diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index a3cc33091f46..17d80f751fcb 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -741,7 +741,7 @@ int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, map_ops[i].status = GNTST_general_error; unmap[0].host_addr = map_ops[i].host_addr, unmap[0].handle = map_ops[i].handle; - map_ops[i].handle = ~0; + map_ops[i].handle = INVALID_GRANT_HANDLE; if (map_ops[i].flags & GNTMAP_device_map) unmap[0].dev_bus_addr = map_ops[i].dev_bus_addr; else @@ -751,7 +751,7 @@ int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, kmap_ops[i].status = GNTST_general_error; unmap[1].host_addr = kmap_ops[i].host_addr, unmap[1].handle = kmap_ops[i].handle; - kmap_ops[i].handle = ~0; + kmap_ops[i].handle = INVALID_GRANT_HANDLE; if (kmap_ops[i].flags & GNTMAP_device_map) unmap[1].dev_bus_addr = kmap_ops[i].dev_bus_addr; else @@ -776,7 +776,6 @@ int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, out: return ret; } -EXPORT_SYMBOL_GPL(set_foreign_p2m_mapping); int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops, struct gnttab_unmap_grant_ref *kunmap_ops, @@ -802,7 +801,6 @@ int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops, return ret; } -EXPORT_SYMBOL_GPL(clear_foreign_p2m_mapping); #ifdef CONFIG_XEN_DEBUG_FS #include diff --git a/block/bio.c b/block/bio.c index a1c4d2900c7a..26b7f721cda8 100644 --- a/block/bio.c +++ b/block/bio.c @@ -33,7 +33,7 @@ static struct biovec_slab { { .nr_vecs = 16, .name = "biovec-16" }, { .nr_vecs = 64, .name = "biovec-64" }, { .nr_vecs = 128, .name = "biovec-128" }, - { .nr_vecs = BIO_MAX_PAGES, .name = "biovec-max" }, + { .nr_vecs = BIO_MAX_VECS, .name = "biovec-max" }, }; static struct biovec_slab *biovec_slab(unsigned short nr_vecs) @@ -46,7 +46,7 @@ static struct biovec_slab *biovec_slab(unsigned short nr_vecs) return &bvec_slabs[1]; case 65 ... 128: return &bvec_slabs[2]; - case 129 ... BIO_MAX_PAGES: + case 129 ... BIO_MAX_VECS: return &bvec_slabs[3]; default: BUG(); @@ -151,9 +151,9 @@ out: void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs) { - BIO_BUG_ON(nr_vecs > BIO_MAX_PAGES); + BIO_BUG_ON(nr_vecs > BIO_MAX_VECS); - if (nr_vecs == BIO_MAX_PAGES) + if (nr_vecs == BIO_MAX_VECS) mempool_free(bv, pool); else if (nr_vecs > BIO_INLINE_VECS) kmem_cache_free(biovec_slab(nr_vecs)->slab, bv); @@ -186,15 +186,15 @@ struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs, /* * Try a slab allocation first for all smaller allocations. If that * fails and __GFP_DIRECT_RECLAIM is set retry with the mempool. - * The mempool is sized to handle up to BIO_MAX_PAGES entries. + * The mempool is sized to handle up to BIO_MAX_VECS entries. */ - if (*nr_vecs < BIO_MAX_PAGES) { + if (*nr_vecs < BIO_MAX_VECS) { struct bio_vec *bvl; bvl = kmem_cache_alloc(bvs->slab, bvec_alloc_gfp(gfp_mask)); if (likely(bvl) || !(gfp_mask & __GFP_DIRECT_RECLAIM)) return bvl; - *nr_vecs = BIO_MAX_PAGES; + *nr_vecs = BIO_MAX_VECS; } return mempool_alloc(pool, gfp_mask); diff --git a/block/blk-cgroup-rwstat.c b/block/blk-cgroup-rwstat.c index 85d5790ac49b..3304e841df7c 100644 --- a/block/blk-cgroup-rwstat.c +++ b/block/blk-cgroup-rwstat.c @@ -109,6 +109,7 @@ void blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol, lockdep_assert_held(&blkg->q->queue_lock); + memset(sum, 0, sizeof(*sum)); rcu_read_lock(); blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) { struct blkg_rwstat *rwstat; @@ -122,7 +123,7 @@ void blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol, rwstat = (void *)pos_blkg + off; for (i = 0; i < BLKG_RWSTAT_NR; i++) - sum->cnt[i] = blkg_rwstat_read_counter(rwstat, i); + sum->cnt[i] += blkg_rwstat_read_counter(rwstat, i); } rcu_read_unlock(); } diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c index d5b652c5d9d1..379596813b93 100644 --- a/block/blk-crypto-fallback.c +++ b/block/blk-crypto-fallback.c @@ -221,7 +221,7 @@ static bool blk_crypto_split_bio_if_needed(struct bio **bio_ptr) bio_for_each_segment(bv, bio, iter) { num_sectors += bv.bv_len >> SECTOR_SHIFT; - if (++i == BIO_MAX_PAGES) + if (++i == BIO_MAX_VECS) break; } if (num_sectors < bio_sectors(bio)) { diff --git a/block/blk-lib.c b/block/blk-lib.c index 752f9c722062..7b256131b20b 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -296,7 +296,7 @@ static unsigned int __blkdev_sectors_to_bio_pages(sector_t nr_sects) { sector_t pages = DIV_ROUND_UP_SECTOR_T(nr_sects, PAGE_SIZE / 512); - return min(pages, (sector_t)BIO_MAX_PAGES); + return min(pages, (sector_t)BIO_MAX_VECS); } static int __blkdev_issue_zero_pages(struct block_device *bdev, diff --git a/block/blk-map.c b/block/blk-map.c index 369e204d14d0..1ffef782fcf2 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -249,7 +249,7 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, if (!iov_iter_count(iter)) return -EINVAL; - bio = bio_kmalloc(gfp_mask, iov_iter_npages(iter, BIO_MAX_PAGES)); + bio = bio_kmalloc(gfp_mask, iov_iter_npages(iter, BIO_MAX_VECS)); if (!bio) return -ENOMEM; bio->bi_opf |= req_op(rq); diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 833978c02e60..c0276b42d9fb 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -240,7 +240,7 @@ int blkdev_zone_mgmt(struct block_device *bdev, enum req_opf op, */ if (op == REQ_OP_ZONE_RESET && blkdev_allow_reset_all_zones(bdev, sector, nr_sectors)) { - bio->bi_opf = REQ_OP_ZONE_RESET_ALL; + bio->bi_opf = REQ_OP_ZONE_RESET_ALL | REQ_SYNC; break; } @@ -318,6 +318,22 @@ int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode, return 0; } +static int blkdev_truncate_zone_range(struct block_device *bdev, fmode_t mode, + const struct blk_zone_range *zrange) +{ + loff_t start, end; + + if (zrange->sector + zrange->nr_sectors <= zrange->sector || + zrange->sector + zrange->nr_sectors > get_capacity(bdev->bd_disk)) + /* Out of range */ + return -EINVAL; + + start = zrange->sector << SECTOR_SHIFT; + end = ((zrange->sector + zrange->nr_sectors) << SECTOR_SHIFT) - 1; + + return truncate_bdev_range(bdev, mode, start, end); +} + /* * BLKRESETZONE, BLKOPENZONE, BLKCLOSEZONE and BLKFINISHZONE ioctl processing. * Called from blkdev_ioctl. @@ -329,6 +345,7 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode, struct request_queue *q; struct blk_zone_range zrange; enum req_opf op; + int ret; if (!argp) return -EINVAL; @@ -352,6 +369,11 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode, switch (cmd) { case BLKRESETZONE: op = REQ_OP_ZONE_RESET; + + /* Invalidate the page cache, including dirty pages. */ + ret = blkdev_truncate_zone_range(bdev, mode, &zrange); + if (ret) + return ret; break; case BLKOPENZONE: op = REQ_OP_ZONE_OPEN; @@ -366,8 +388,20 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode, return -ENOTTY; } - return blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors, - GFP_KERNEL); + ret = blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors, + GFP_KERNEL); + + /* + * Invalidate the page cache again for zone reset: writes can only be + * direct for zoned devices so concurrent writes would not add any page + * to the page cache after/during reset. The page cache may be filled + * again due to concurrent reads though and dropping the pages for + * these is fine. + */ + if (!ret && cmd == BLKRESETZONE) + ret = blkdev_truncate_zone_range(bdev, mode, &zrange); + + return ret; } static inline unsigned long *blk_alloc_zone_bitmap(int node, diff --git a/block/bounce.c b/block/bounce.c index 87983a35079c..6c441f4f1cd4 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -229,10 +229,10 @@ static struct bio *bounce_clone_bio(struct bio *bio_src) * - The point of cloning the biovec is to produce a bio with a biovec * the caller can modify: bi_idx and bi_bvec_done should be 0. * - * - The original bio could've had more than BIO_MAX_PAGES biovecs; if + * - The original bio could've had more than BIO_MAX_VECS biovecs; if * we tried to clone the whole thing bio_alloc_bioset() would fail. * But the clone should succeed as long as the number of biovecs we - * actually need to allocate is fewer than BIO_MAX_PAGES. + * actually need to allocate is fewer than BIO_MAX_VECS. * * - Lastly, bi_vcnt should not be looked at or relied upon by code * that does not own the bio - reason being drivers don't use it for @@ -299,7 +299,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, int sectors = 0; bio_for_each_segment(from, *bio_orig, iter) { - if (i++ < BIO_MAX_PAGES) + if (i++ < BIO_MAX_VECS) sectors += from.bv_len >> 9; if (page_to_pfn(from.bv_page) > q->limits.bounce_pfn) bounce = true; diff --git a/block/genhd.c b/block/genhd.c index c55e8f0fced1..8c8f543572e6 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -534,10 +534,8 @@ static void register_disk(struct device *parent, struct gendisk *disk, kobject_create_and_add("holders", &ddev->kobj); disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); - if (disk->flags & GENHD_FL_HIDDEN) { - dev_set_uevent_suppress(ddev, 0); + if (disk->flags & GENHD_FL_HIDDEN) return; - } disk_scan_partitions(disk); diff --git a/drivers/base/swnode.c b/drivers/base/swnode.c index 37179a8b1ceb..fa3719ef80e4 100644 --- a/drivers/base/swnode.c +++ b/drivers/base/swnode.c @@ -938,6 +938,9 @@ int software_node_register(const struct software_node *node) if (software_node_to_swnode(node)) return -EEXIST; + if (node->parent && !parent) + return -EINVAL; + return PTR_ERR_OR_ZERO(swnode_register(node, parent, 0)); } EXPORT_SYMBOL_GPL(software_node_register); @@ -1002,25 +1005,33 @@ EXPORT_SYMBOL_GPL(fwnode_remove_software_node); /** * device_add_software_node - Assign software node to a device * @dev: The device the software node is meant for. - * @swnode: The software node. + * @node: The software node. * - * This function will register @swnode and make it the secondary firmware node - * pointer of @dev. If @dev has no primary node, then @swnode will become the primary - * node. + * This function will make @node the secondary firmware node pointer of @dev. If + * @dev has no primary node, then @node will become the primary node. The + * function will register @node automatically if it wasn't already registered. */ -int device_add_software_node(struct device *dev, const struct software_node *swnode) +int device_add_software_node(struct device *dev, const struct software_node *node) { + struct swnode *swnode; int ret; /* Only one software node per device. */ if (dev_to_swnode(dev)) return -EBUSY; - ret = software_node_register(swnode); - if (ret) - return ret; + swnode = software_node_to_swnode(node); + if (swnode) { + kobject_get(&swnode->kobj); + } else { + ret = software_node_register(node); + if (ret) + return ret; - set_secondary_fwnode(dev, software_node_fwnode(swnode)); + swnode = software_node_to_swnode(node); + } + + set_secondary_fwnode(dev, &swnode->fwnode); return 0; } diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 7d9cc433b758..5d9181382ce1 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1324,7 +1324,7 @@ struct bm_extent { * A followup commit may allow even bigger BIO sizes, * once we thought that through. */ #define DRBD_MAX_BIO_SIZE (1U << 20) -#if DRBD_MAX_BIO_SIZE > (BIO_MAX_PAGES << PAGE_SHIFT) +#if DRBD_MAX_BIO_SIZE > (BIO_MAX_VECS << PAGE_SHIFT) #error Architecture not supported: DRBD_MAX_BIO_SIZE > BIO_MAX_SIZE #endif #define DRBD_MAX_BIO_SIZE_SAFE (1U << 12) /* Works always = 4k */ diff --git a/drivers/block/rsxx/core.c b/drivers/block/rsxx/core.c index 5ac1881396af..227e1be4c6f9 100644 --- a/drivers/block/rsxx/core.c +++ b/drivers/block/rsxx/core.c @@ -871,6 +871,7 @@ static int rsxx_pci_probe(struct pci_dev *dev, card->event_wq = create_singlethread_workqueue(DRIVER_NAME"_event"); if (!card->event_wq) { dev_err(CARD_TO_DEV(card), "Failed card event setup.\n"); + st = -ENOMEM; goto failed_event_handler; } diff --git a/drivers/block/umem.c b/drivers/block/umem.c index 982732dbe82e..664280f23bee 100644 --- a/drivers/block/umem.c +++ b/drivers/block/umem.c @@ -877,6 +877,7 @@ static int mm_pci_probe(struct pci_dev *dev, const struct pci_device_id *id) if (card->mm_pages[0].desc == NULL || card->mm_pages[1].desc == NULL) { dev_printk(KERN_ERR, &card->dev->dev, "alloc failed\n"); + ret = -ENOMEM; goto failed_alloc; } reset_page(&card->mm_pages[0]); @@ -888,8 +889,10 @@ static int mm_pci_probe(struct pci_dev *dev, const struct pci_device_id *id) spin_lock_init(&card->lock); card->queue = blk_alloc_queue(NUMA_NO_NODE); - if (!card->queue) + if (!card->queue) { + ret = -ENOMEM; goto failed_alloc; + } tasklet_init(&card->tasklet, process_page, (unsigned long)card); diff --git a/drivers/cpufreq/cpufreq-dt-platdev.c b/drivers/cpufreq/cpufreq-dt-platdev.c index 3ba2f716fe97..5e07065ec22f 100644 --- a/drivers/cpufreq/cpufreq-dt-platdev.c +++ b/drivers/cpufreq/cpufreq-dt-platdev.c @@ -103,6 +103,8 @@ static const struct of_device_id whitelist[] __initconst = { static const struct of_device_id blacklist[] __initconst = { { .compatible = "allwinner,sun50i-h6", }, + { .compatible = "arm,vexpress", }, + { .compatible = "calxeda,highbank", }, { .compatible = "calxeda,ecx-2000", }, diff --git a/drivers/cpufreq/qcom-cpufreq-hw.c b/drivers/cpufreq/qcom-cpufreq-hw.c index d3c23447b892..f86859bf76f1 100644 --- a/drivers/cpufreq/qcom-cpufreq-hw.c +++ b/drivers/cpufreq/qcom-cpufreq-hw.c @@ -317,9 +317,9 @@ static int qcom_cpufreq_hw_cpu_init(struct cpufreq_policy *policy) } base = ioremap(res->start, resource_size(res)); - if (IS_ERR(base)) { + if (!base) { dev_err(dev, "failed to map resource %pR\n", res); - ret = PTR_ERR(base); + ret = -ENOMEM; goto release_region; } @@ -374,7 +374,7 @@ static int qcom_cpufreq_hw_cpu_init(struct cpufreq_policy *policy) error: kfree(data); unmap_base: - iounmap(data->base); + iounmap(base); release_region: release_mem_region(res->start, resource_size(res)); return ret; diff --git a/drivers/firmware/efi/libstub/arm64-stub.c b/drivers/firmware/efi/libstub/arm64-stub.c index b69d63143e0d..7bf0a7acae5e 100644 --- a/drivers/firmware/efi/libstub/arm64-stub.c +++ b/drivers/firmware/efi/libstub/arm64-stub.c @@ -24,7 +24,7 @@ efi_status_t check_platform_features(void) return EFI_SUCCESS; tg = (read_cpuid(ID_AA64MMFR0_EL1) >> ID_AA64MMFR0_TGRAN_SHIFT) & 0xf; - if (tg != ID_AA64MMFR0_TGRAN_SUPPORTED) { + if (tg < ID_AA64MMFR0_TGRAN_SUPPORTED_MIN || tg > ID_AA64MMFR0_TGRAN_SUPPORTED_MAX) { if (IS_ENABLED(CONFIG_ARM64_64K_PAGES)) efi_err("This 64 KB granular kernel is not supported by your CPU\n"); else diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 71691f32959b..03e1fe4de53d 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -965,7 +965,7 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size, q->limits.max_hw_sectors = UINT_MAX; q->limits.max_sectors = UINT_MAX; q->limits.max_segment_size = UINT_MAX; - q->limits.max_segments = BIO_MAX_PAGES; + q->limits.max_segments = BIO_MAX_VECS; blk_queue_max_discard_sectors(q, UINT_MAX); q->limits.discard_granularity = 512; q->limits.io_min = block_size; diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 11c105ecd165..b0ab080f2567 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -229,7 +229,7 @@ static DEFINE_SPINLOCK(dm_crypt_clients_lock); static unsigned dm_crypt_clients_n = 0; static volatile unsigned long dm_crypt_pages_per_client; #define DM_CRYPT_MEMORY_PERCENT 2 -#define DM_CRYPT_MIN_PAGES_PER_CLIENT (BIO_MAX_PAGES * 16) +#define DM_CRYPT_MIN_PAGES_PER_CLIENT (BIO_MAX_VECS * 16) static void clone_init(struct dm_crypt_io *, struct bio *); static void kcryptd_queue_crypt(struct dm_crypt_io *io); @@ -3246,7 +3246,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) ALIGN(sizeof(struct dm_crypt_io) + cc->dmreq_start + additional_req_size, ARCH_KMALLOC_MINALIGN); - ret = mempool_init(&cc->page_pool, BIO_MAX_PAGES, crypt_page_alloc, crypt_page_free, cc); + ret = mempool_init(&cc->page_pool, BIO_MAX_VECS, crypt_page_alloc, crypt_page_free, cc); if (ret) { ti->error = "Cannot allocate page mempool"; goto bad; @@ -3373,9 +3373,9 @@ static int crypt_map(struct dm_target *ti, struct bio *bio) /* * Check if bio is too large, split as needed. */ - if (unlikely(bio->bi_iter.bi_size > (BIO_MAX_PAGES << PAGE_SHIFT)) && + if (unlikely(bio->bi_iter.bi_size > (BIO_MAX_VECS << PAGE_SHIFT)) && (bio_data_dir(bio) == WRITE || cc->on_disk_tag_size)) - dm_accept_partial_bio(bio, ((BIO_MAX_PAGES << PAGE_SHIFT) >> SECTOR_SHIFT)); + dm_accept_partial_bio(bio, ((BIO_MAX_VECS << PAGE_SHIFT) >> SECTOR_SHIFT)); /* * Ensure that bio is a multiple of internal sector encryption size diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c index 844c4be11768..4f72b6f66c3a 100644 --- a/drivers/md/dm-writecache.c +++ b/drivers/md/dm-writecache.c @@ -1892,10 +1892,10 @@ restart: list_add(&g->lru, &wbl.list); wbl.size++; g->write_in_progress = true; - g->wc_list_contiguous = BIO_MAX_PAGES; + g->wc_list_contiguous = BIO_MAX_VECS; f = g; e->wc_list_contiguous++; - if (unlikely(e->wc_list_contiguous == BIO_MAX_PAGES)) { + if (unlikely(e->wc_list_contiguous == BIO_MAX_VECS)) { if (unlikely(wc->writeback_all)) { next_node = rb_next(&f->rb_node); if (likely(next_node)) diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 4337ae0e6af2..0b5dcaabbc15 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -735,7 +735,7 @@ static void r5l_submit_current_io(struct r5l_log *log) static struct bio *r5l_bio_alloc(struct r5l_log *log) { - struct bio *bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, &log->bs); + struct bio *bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_VECS, &log->bs); bio_set_op_attrs(bio, REQ_OP_WRITE, 0); bio_set_dev(bio, log->rdev->bdev); @@ -1634,7 +1634,7 @@ static int r5l_recovery_allocate_ra_pool(struct r5l_log *log, { struct page *page; - ctx->ra_bio = bio_alloc_bioset(GFP_KERNEL, BIO_MAX_PAGES, &log->bs); + ctx->ra_bio = bio_alloc_bioset(GFP_KERNEL, BIO_MAX_VECS, &log->bs); if (!ctx->ra_bio) return -ENOMEM; diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c index e8c118e05dfd..3ddc2aa0b530 100644 --- a/drivers/md/raid5-ppl.c +++ b/drivers/md/raid5-ppl.c @@ -496,7 +496,7 @@ static void ppl_submit_iounit(struct ppl_io_unit *io) if (!bio_add_page(bio, sh->ppl_page, PAGE_SIZE, 0)) { struct bio *prev = bio; - bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, + bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_VECS, &ppl_conf->bs); bio->bi_opf = prev->bi_opf; bio->bi_write_hint = prev->bi_write_hint; diff --git a/drivers/mmc/core/bus.c b/drivers/mmc/core/bus.c index c2e70b757dd1..4383c262b3f5 100644 --- a/drivers/mmc/core/bus.c +++ b/drivers/mmc/core/bus.c @@ -399,11 +399,6 @@ void mmc_remove_card(struct mmc_card *card) mmc_remove_card_debugfs(card); #endif - if (host->cqe_enabled) { - host->cqe_ops->cqe_disable(host); - host->cqe_enabled = false; - } - if (mmc_card_present(card)) { if (mmc_host_is_spi(card->host)) { pr_info("%s: SPI card removed\n", @@ -416,6 +411,10 @@ void mmc_remove_card(struct mmc_card *card) of_node_put(card->dev.of_node); } + if (host->cqe_enabled) { + host->cqe_ops->cqe_disable(host); + host->cqe_enabled = false; + } + put_device(&card->dev); } - diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c index 0d80b72ddde8..8741271d3971 100644 --- a/drivers/mmc/core/mmc.c +++ b/drivers/mmc/core/mmc.c @@ -423,10 +423,6 @@ static int mmc_decode_ext_csd(struct mmc_card *card, u8 *ext_csd) /* EXT_CSD value is in units of 10ms, but we store in ms */ card->ext_csd.part_time = 10 * ext_csd[EXT_CSD_PART_SWITCH_TIME]; - /* Some eMMC set the value too low so set a minimum */ - if (card->ext_csd.part_time && - card->ext_csd.part_time < MMC_MIN_PART_SWITCH_TIME) - card->ext_csd.part_time = MMC_MIN_PART_SWITCH_TIME; /* Sleep / awake timeout in 100ns units */ if (sa_shift > 0 && sa_shift <= 0x17) @@ -616,6 +612,17 @@ static int mmc_decode_ext_csd(struct mmc_card *card, u8 *ext_csd) card->ext_csd.data_sector_size = 512; } + /* + * GENERIC_CMD6_TIME is to be used "unless a specific timeout is defined + * when accessing a specific field", so use it here if there is no + * PARTITION_SWITCH_TIME. + */ + if (!card->ext_csd.part_time) + card->ext_csd.part_time = card->ext_csd.generic_cmd6_time; + /* Some eMMC set the value too low so set a minimum */ + if (card->ext_csd.part_time < MMC_MIN_PART_SWITCH_TIME) + card->ext_csd.part_time = MMC_MIN_PART_SWITCH_TIME; + /* eMMC v5 or later */ if (card->ext_csd.rev >= 7) { memcpy(card->ext_csd.fwrev, &ext_csd[EXT_CSD_FIRMWARE_VERSION], diff --git a/drivers/mmc/host/mmci.c b/drivers/mmc/host/mmci.c index 17dbc81c221e..984d35055156 100644 --- a/drivers/mmc/host/mmci.c +++ b/drivers/mmc/host/mmci.c @@ -1242,7 +1242,11 @@ mmci_start_command(struct mmci_host *host, struct mmc_command *cmd, u32 c) if (!cmd->busy_timeout) cmd->busy_timeout = 10 * MSEC_PER_SEC; - clks = (unsigned long long)cmd->busy_timeout * host->cclk; + if (cmd->busy_timeout > host->mmc->max_busy_timeout) + clks = (unsigned long long)host->mmc->max_busy_timeout * host->cclk; + else + clks = (unsigned long long)cmd->busy_timeout * host->cclk; + do_div(clks, MSEC_PER_SEC); writel_relaxed(clks, host->base + MMCIDATATIMER); } @@ -2151,6 +2155,10 @@ static int mmci_probe(struct amba_device *dev, mmc->caps |= MMC_CAP_WAIT_WHILE_BUSY; } + /* Variants with mandatory busy timeout in HW needs R1B responses. */ + if (variant->busy_timeout) + mmc->caps |= MMC_CAP_NEED_RSP_BUSY; + /* Prepare a CMD12 - needed to clear the DPSM on some variants. */ host->stop_abort.opcode = MMC_STOP_TRANSMISSION; host->stop_abort.arg = 0; diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index e68a8c4ac5a6..a5653892d773 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -380,6 +380,7 @@ bool nvme_cancel_request(struct request *req, void *data, bool reserved) return true; nvme_req(req)->status = NVME_SC_HOST_ABORTED_CMD; + nvme_req(req)->flags |= NVME_REQ_CANCELLED; blk_mq_complete_request(req); return true; } @@ -1440,7 +1441,7 @@ static int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid, goto out_free_id; } - error = -ENODEV; + error = NVME_SC_INVALID_NS | NVME_SC_DNR; if ((*id)->ncap == 0) /* namespace not allocated or attached */ goto out_free_id; @@ -4038,7 +4039,7 @@ static void nvme_ns_remove_by_nsid(struct nvme_ctrl *ctrl, u32 nsid) static void nvme_validate_ns(struct nvme_ns *ns, struct nvme_ns_ids *ids) { struct nvme_id_ns *id; - int ret = -ENODEV; + int ret = NVME_SC_INVALID_NS | NVME_SC_DNR; if (test_bit(NVME_NS_DEAD, &ns->flags)) goto out; @@ -4047,7 +4048,7 @@ static void nvme_validate_ns(struct nvme_ns *ns, struct nvme_ns_ids *ids) if (ret) goto out; - ret = -ENODEV; + ret = NVME_SC_INVALID_NS | NVME_SC_DNR; if (!nvme_ns_ids_equal(&ns->head->ids, ids)) { dev_err(ns->ctrl->device, "identifiers changed for nsid %d\n", ns->head->ns_id); @@ -4065,7 +4066,7 @@ out: * * TODO: we should probably schedule a delayed retry here. */ - if (ret && ret != -ENOMEM && !(ret > 0 && !(ret & NVME_SC_DNR))) + if (ret > 0 && (ret & NVME_SC_DNR)) nvme_ns_remove(ns); } @@ -4095,6 +4096,12 @@ static void nvme_validate_or_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) nsid); break; } + if (!nvme_multi_css(ctrl)) { + dev_warn(ctrl->device, + "command set not reported for nsid: %d\n", + nsid); + break; + } nvme_alloc_ns(ctrl, nsid, &ids); break; default: diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 20dadd86e981..73d073748389 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -1956,7 +1956,7 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req) sizeof(op->rsp_iu), DMA_FROM_DEVICE); if (opstate == FCPOP_STATE_ABORTED) - status = cpu_to_le16(NVME_SC_HOST_PATH_ERROR << 1); + status = cpu_to_le16(NVME_SC_HOST_ABORTED_CMD << 1); else if (freq->status) { status = cpu_to_le16(NVME_SC_HOST_PATH_ERROR << 1); dev_info(ctrl->ctrl.device, @@ -2055,7 +2055,7 @@ done: nvme_fc_complete_rq(rq); check_error: - if (terminate_assoc) + if (terminate_assoc && ctrl->ctrl.state != NVME_CTRL_RESETTING) queue_work(nvme_reset_wq, &ctrl->ioerr_work); } @@ -2443,6 +2443,7 @@ nvme_fc_terminate_exchange(struct request *req, void *data, bool reserved) struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl); struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(req); + op->nreq.flags |= NVME_REQ_CANCELLED; __nvme_fc_abort_op(ctrl, op); return true; } diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 17ab3320d28b..7249ae74f71f 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -3246,6 +3246,7 @@ static const struct pci_device_id nvme_id_table[] = { .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, }, { PCI_DEVICE(0x144d, 0xa822), /* Samsung PM1725a */ .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY | + NVME_QUIRK_DISABLE_WRITE_ZEROES| NVME_QUIRK_IGNORE_DEV_SUBNQN, }, { PCI_DEVICE(0x1987, 0x5016), /* Phison E16 */ .driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN, }, diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c index c7e3ec561ba0..bc2f344f0ae0 100644 --- a/drivers/nvme/host/zns.c +++ b/drivers/nvme/host/zns.c @@ -9,7 +9,13 @@ int nvme_revalidate_zones(struct nvme_ns *ns) { - return blk_revalidate_disk_zones(ns->disk, NULL); + struct request_queue *q = ns->queue; + int ret; + + ret = blk_revalidate_disk_zones(ns->disk, NULL); + if (!ret) + blk_queue_max_zone_append_sectors(q, ns->ctrl->max_zone_append); + return ret; } static int nvme_set_max_append(struct nvme_ctrl *ctrl) @@ -107,7 +113,6 @@ int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf) blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q); blk_queue_max_open_zones(q, le32_to_cpu(id->mor) + 1); blk_queue_max_active_zones(q, le32_to_cpu(id->mar) + 1); - blk_queue_max_zone_append_sectors(q, ns->ctrl->max_zone_append); free_data: kfree(id); return status; diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c index 26c587ccd152..2798944899b7 100644 --- a/drivers/nvme/target/passthru.c +++ b/drivers/nvme/target/passthru.c @@ -50,9 +50,9 @@ static u16 nvmet_passthru_override_id_ctrl(struct nvmet_req *req) /* * nvmet_passthru_map_sg is limitted to using a single bio so limit - * the mdts based on BIO_MAX_PAGES as well + * the mdts based on BIO_MAX_VECS as well */ - max_hw_sectors = min_not_zero(BIO_MAX_PAGES << (PAGE_SHIFT - 9), + max_hw_sectors = min_not_zero(BIO_MAX_VECS << (PAGE_SHIFT - 9), max_hw_sectors); page_shift = NVME_CAP_MPSMIN(ctrl->cap) + 12; @@ -191,7 +191,7 @@ static int nvmet_passthru_map_sg(struct nvmet_req *req, struct request *rq) struct bio *bio; int i; - if (req->sg_cnt > BIO_MAX_PAGES) + if (req->sg_cnt > BIO_MAX_VECS) return -EINVAL; if (req->transfer_len <= NVMET_MAX_INLINE_DATA_LEN) { diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index 06b6b742bb21..6c1f3ab7649c 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -802,9 +802,8 @@ static void nvmet_rdma_write_data_done(struct ib_cq *cq, struct ib_wc *wc) nvmet_req_uninit(&rsp->req); nvmet_rdma_release_rsp(rsp); if (wc->status != IB_WC_WR_FLUSH_ERR) { - pr_info("RDMA WRITE for CQE 0x%p failed with status %s (%d).\n", - wc->wr_cqe, ib_wc_status_msg(wc->status), - wc->status); + pr_info("RDMA WRITE for CQE failed with status %s (%d).\n", + ib_wc_status_msg(wc->status), wc->status); nvmet_rdma_error_comp(queue); } return; diff --git a/drivers/opp/core.c b/drivers/opp/core.c index c2689386a906..1556998425d5 100644 --- a/drivers/opp/core.c +++ b/drivers/opp/core.c @@ -1492,7 +1492,11 @@ static struct dev_pm_opp *_opp_get_next(struct opp_table *opp_table, mutex_lock(&opp_table->lock); list_for_each_entry(temp, &opp_table->opp_list, node) { - if (dynamic == temp->dynamic) { + /* + * Refcount must be dropped only once for each OPP by OPP core, + * do that with help of "removed" flag. + */ + if (!temp->removed && dynamic == temp->dynamic) { opp = temp; break; } @@ -1502,10 +1506,27 @@ static struct dev_pm_opp *_opp_get_next(struct opp_table *opp_table, return opp; } -bool _opp_remove_all_static(struct opp_table *opp_table) +/* + * Can't call dev_pm_opp_put() from under the lock as debugfs removal needs to + * happen lock less to avoid circular dependency issues. This routine must be + * called without the opp_table->lock held. + */ +static void _opp_remove_all(struct opp_table *opp_table, bool dynamic) { struct dev_pm_opp *opp; + while ((opp = _opp_get_next(opp_table, dynamic))) { + opp->removed = true; + dev_pm_opp_put(opp); + + /* Drop the references taken by dev_pm_opp_add() */ + if (dynamic) + dev_pm_opp_put_opp_table(opp_table); + } +} + +bool _opp_remove_all_static(struct opp_table *opp_table) +{ mutex_lock(&opp_table->lock); if (!opp_table->parsed_static_opps) { @@ -1520,13 +1541,7 @@ bool _opp_remove_all_static(struct opp_table *opp_table) mutex_unlock(&opp_table->lock); - /* - * Can't remove the OPP from under the lock, debugfs removal needs to - * happen lock less to avoid circular dependency issues. - */ - while ((opp = _opp_get_next(opp_table, false))) - dev_pm_opp_put(opp); - + _opp_remove_all(opp_table, false); return true; } @@ -1539,25 +1554,12 @@ bool _opp_remove_all_static(struct opp_table *opp_table) void dev_pm_opp_remove_all_dynamic(struct device *dev) { struct opp_table *opp_table; - struct dev_pm_opp *opp; - int count = 0; opp_table = _find_opp_table(dev); if (IS_ERR(opp_table)) return; - /* - * Can't remove the OPP from under the lock, debugfs removal needs to - * happen lock less to avoid circular dependency issues. - */ - while ((opp = _opp_get_next(opp_table, true))) { - dev_pm_opp_put(opp); - count++; - } - - /* Drop the references taken by dev_pm_opp_add() */ - while (count--) - dev_pm_opp_put_opp_table(opp_table); + _opp_remove_all(opp_table, true); /* Drop the reference taken by _find_opp_table() */ dev_pm_opp_put_opp_table(opp_table); diff --git a/drivers/opp/opp.h b/drivers/opp/opp.h index 50fb9dced3c5..407c3bfe51d9 100644 --- a/drivers/opp/opp.h +++ b/drivers/opp/opp.h @@ -56,6 +56,7 @@ extern struct list_head opp_tables, lazy_opp_tables; * @dynamic: not-created from static DT entries. * @turbo: true if turbo (boost) OPP * @suspend: true if suspend OPP + * @removed: flag indicating that OPP's reference is dropped by OPP core. * @pstate: Device's power domain's performance state. * @rate: Frequency in hertz * @level: Performance level @@ -78,6 +79,7 @@ struct dev_pm_opp { bool dynamic; bool turbo; bool suspend; + bool removed; unsigned int pstate; unsigned long rate; unsigned int level; diff --git a/drivers/pci/xen-pcifront.c b/drivers/pci/xen-pcifront.c index c6fe0cfec0f6..2d7502648219 100644 --- a/drivers/pci/xen-pcifront.c +++ b/drivers/pci/xen-pcifront.c @@ -26,7 +26,7 @@ #include #include -#define INVALID_GRANT_REF (0) + #define INVALID_EVTCHN (-1) struct pci_bus_entry { @@ -42,7 +42,7 @@ struct pcifront_device { struct list_head root_buses; int evtchn; - int gnt_ref; + grant_ref_t gnt_ref; int irq; diff --git a/drivers/perf/arm_dmc620_pmu.c b/drivers/perf/arm_dmc620_pmu.c index 66ad5b3ece19..f2a85500258d 100644 --- a/drivers/perf/arm_dmc620_pmu.c +++ b/drivers/perf/arm_dmc620_pmu.c @@ -681,6 +681,7 @@ static int dmc620_pmu_device_probe(struct platform_device *pdev) if (!name) { dev_err(&pdev->dev, "Create name failed, PMU @%pa\n", &res->start); + ret = -ENOMEM; goto out_teardown_dev; } diff --git a/drivers/regulator/mt6315-regulator.c b/drivers/regulator/mt6315-regulator.c index d49a1534d8e9..9edc34981ee0 100644 --- a/drivers/regulator/mt6315-regulator.c +++ b/drivers/regulator/mt6315-regulator.c @@ -41,7 +41,7 @@ struct mt6315_chip { .type = REGULATOR_VOLTAGE, \ .id = _bid, \ .owner = THIS_MODULE, \ - .n_voltages = 0xbf, \ + .n_voltages = 0xc0, \ .linear_ranges = mt_volt_range1, \ .n_linear_ranges = ARRAY_SIZE(mt_volt_range1), \ .vsel_reg = _vsel, \ @@ -69,7 +69,7 @@ static unsigned int mt6315_map_mode(u32 mode) case MT6315_BUCK_MODE_LP: return REGULATOR_MODE_IDLE; default: - return -EINVAL; + return REGULATOR_MODE_INVALID; } } diff --git a/drivers/regulator/pca9450-regulator.c b/drivers/regulator/pca9450-regulator.c index 833d398c6aa2..2f7ee212cb8c 100644 --- a/drivers/regulator/pca9450-regulator.c +++ b/drivers/regulator/pca9450-regulator.c @@ -797,6 +797,14 @@ static int pca9450_i2c_probe(struct i2c_client *i2c, return ret; } + /* Clear PRESET_EN bit in BUCK123_DVS to use DVS registers */ + ret = regmap_clear_bits(pca9450->regmap, PCA9450_REG_BUCK123_DVS, + BUCK123_PRESET_EN); + if (ret) { + dev_err(&i2c->dev, "Failed to clear PRESET_EN bit: %d\n", ret); + return ret; + } + /* Set reset behavior on assertion of WDOG_B signal */ ret = regmap_update_bits(pca9450->regmap, PCA9450_REG_RESET_CTRL, WDOG_B_CFG_MASK, WDOG_B_CFG_COLD_LDO12); @@ -814,7 +822,7 @@ static int pca9450_i2c_probe(struct i2c_client *i2c, if (IS_ERR(pca9450->sd_vsel_gpio)) { dev_err(&i2c->dev, "Failed to get SD_VSEL GPIO\n"); - return ret; + return PTR_ERR(pca9450->sd_vsel_gpio); } dev_info(&i2c->dev, "%s probed.\n", diff --git a/drivers/regulator/qcom-rpmh-regulator.c b/drivers/regulator/qcom-rpmh-regulator.c index 79a554f1029d..65a108c9121f 100644 --- a/drivers/regulator/qcom-rpmh-regulator.c +++ b/drivers/regulator/qcom-rpmh-regulator.c @@ -726,8 +726,8 @@ static const struct rpmh_vreg_hw_data pmic5_ftsmps510 = { static const struct rpmh_vreg_hw_data pmic5_hfsmps515 = { .regulator_type = VRM, .ops = &rpmh_regulator_vrm_ops, - .voltage_range = REGULATOR_LINEAR_RANGE(2800000, 0, 4, 16000), - .n_voltages = 5, + .voltage_range = REGULATOR_LINEAR_RANGE(320000, 0, 235, 16000), + .n_voltages = 236, .pmic_mode_map = pmic_mode_map_pmic5_smps, .of_map_mode = rpmh_regulator_pmic4_smps_of_map_mode, }; @@ -901,7 +901,7 @@ static const struct rpmh_vreg_init_data pm8350_vreg_data[] = { }; static const struct rpmh_vreg_init_data pm8350c_vreg_data[] = { - RPMH_VREG("smps1", "smp%s1", &pmic5_hfsmps510, "vdd-s1"), + RPMH_VREG("smps1", "smp%s1", &pmic5_hfsmps515, "vdd-s1"), RPMH_VREG("smps2", "smp%s2", &pmic5_ftsmps510, "vdd-s2"), RPMH_VREG("smps3", "smp%s3", &pmic5_ftsmps510, "vdd-s3"), RPMH_VREG("smps4", "smp%s4", &pmic5_ftsmps510, "vdd-s4"), diff --git a/drivers/regulator/rt4831-regulator.c b/drivers/regulator/rt4831-regulator.c index 3d4695ded629..e3aaac90d238 100644 --- a/drivers/regulator/rt4831-regulator.c +++ b/drivers/regulator/rt4831-regulator.c @@ -153,9 +153,9 @@ static int rt4831_regulator_probe(struct platform_device *pdev) int i, ret; regmap = dev_get_regmap(pdev->dev.parent, NULL); - if (IS_ERR(regmap)) { + if (!regmap) { dev_err(&pdev->dev, "Failed to init regmap\n"); - return PTR_ERR(regmap); + return -ENODEV; } /* Configure DSV mode to normal by default */ diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index 28c04a4efa66..ba9ce4e0d30a 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c @@ -3052,7 +3052,8 @@ static blk_status_t do_dasd_request(struct blk_mq_hw_ctx *hctx, basedev = block->base; spin_lock_irq(&dq->lock); - if (basedev->state < DASD_STATE_READY) { + if (basedev->state < DASD_STATE_READY || + test_bit(DASD_FLAG_OFFLINE, &basedev->flags)) { DBF_DEV_EVENT(DBF_ERR, basedev, "device not ready for request %p", req); rc = BLK_STS_IOERR; @@ -3487,8 +3488,6 @@ void dasd_generic_remove(struct ccw_device *cdev) struct dasd_device *device; struct dasd_block *block; - cdev->handler = NULL; - device = dasd_device_from_cdev(cdev); if (IS_ERR(device)) { dasd_remove_sysfs_files(cdev); @@ -3507,6 +3506,7 @@ void dasd_generic_remove(struct ccw_device *cdev) * no quite down yet. */ dasd_set_target_state(device, DASD_STATE_NEW); + cdev->handler = NULL; /* dasd_delete_device destroys the device reference. */ block = device->block; dasd_delete_device(device); diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c index e663085a8944..1b68734940b5 100644 --- a/drivers/scsi/ibmvscsi/ibmvfc.c +++ b/drivers/scsi/ibmvscsi/ibmvfc.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -158,6 +159,9 @@ static void ibmvfc_npiv_logout(struct ibmvfc_host *); static void ibmvfc_tgt_implicit_logout_and_del(struct ibmvfc_target *); static void ibmvfc_tgt_move_login(struct ibmvfc_target *); +static void ibmvfc_release_sub_crqs(struct ibmvfc_host *); +static void ibmvfc_init_sub_crqs(struct ibmvfc_host *); + static const char *unknown_error = "unknown error"; static long h_reg_sub_crq(unsigned long unit_address, unsigned long ioba, @@ -899,6 +903,9 @@ static int ibmvfc_reenable_crq_queue(struct ibmvfc_host *vhost) { int rc = 0; struct vio_dev *vdev = to_vio_dev(vhost->dev); + unsigned long flags; + + ibmvfc_release_sub_crqs(vhost); /* Re-enable the CRQ */ do { @@ -910,6 +917,15 @@ static int ibmvfc_reenable_crq_queue(struct ibmvfc_host *vhost) if (rc) dev_err(vhost->dev, "Error enabling adapter (rc=%d)\n", rc); + spin_lock_irqsave(vhost->host->host_lock, flags); + spin_lock(vhost->crq.q_lock); + vhost->do_enquiry = 1; + vhost->using_channels = 0; + spin_unlock(vhost->crq.q_lock); + spin_unlock_irqrestore(vhost->host->host_lock, flags); + + ibmvfc_init_sub_crqs(vhost); + return rc; } @@ -926,8 +942,8 @@ static int ibmvfc_reset_crq(struct ibmvfc_host *vhost) unsigned long flags; struct vio_dev *vdev = to_vio_dev(vhost->dev); struct ibmvfc_queue *crq = &vhost->crq; - struct ibmvfc_queue *scrq; - int i; + + ibmvfc_release_sub_crqs(vhost); /* Close the CRQ */ do { @@ -947,16 +963,6 @@ static int ibmvfc_reset_crq(struct ibmvfc_host *vhost) memset(crq->msgs.crq, 0, PAGE_SIZE); crq->cur = 0; - if (vhost->scsi_scrqs.scrqs) { - for (i = 0; i < nr_scsi_hw_queues; i++) { - scrq = &vhost->scsi_scrqs.scrqs[i]; - spin_lock(scrq->q_lock); - memset(scrq->msgs.scrq, 0, PAGE_SIZE); - scrq->cur = 0; - spin_unlock(scrq->q_lock); - } - } - /* And re-open it again */ rc = plpar_hcall_norets(H_REG_CRQ, vdev->unit_address, crq->msg_token, PAGE_SIZE); @@ -966,9 +972,12 @@ static int ibmvfc_reset_crq(struct ibmvfc_host *vhost) dev_warn(vhost->dev, "Partner adapter not ready\n"); else if (rc != 0) dev_warn(vhost->dev, "Couldn't register crq (rc=%d)\n", rc); + spin_unlock(vhost->crq.q_lock); spin_unlock_irqrestore(vhost->host->host_lock, flags); + ibmvfc_init_sub_crqs(vhost); + return rc; } @@ -5642,7 +5651,8 @@ static int ibmvfc_register_scsi_channel(struct ibmvfc_host *vhost, rc = h_reg_sub_crq(vdev->unit_address, scrq->msg_token, PAGE_SIZE, &scrq->cookie, &scrq->hw_irq); - if (rc) { + /* H_CLOSED indicates successful register, but no CRQ partner */ + if (rc && rc != H_CLOSED) { dev_warn(dev, "Error registering sub-crq: %d\n", rc); if (rc == H_PARAMETER) dev_warn_once(dev, "Firmware may not support MQ\n"); @@ -5675,8 +5685,8 @@ static int ibmvfc_register_scsi_channel(struct ibmvfc_host *vhost, irq_failed: do { - plpar_hcall_norets(H_FREE_SUB_CRQ, vdev->unit_address, scrq->cookie); - } while (rc == H_BUSY || H_IS_LONG_BUSY(rc)); + rc = plpar_hcall_norets(H_FREE_SUB_CRQ, vdev->unit_address, scrq->cookie); + } while (rtas_busy_delay(rc)); reg_failed: ibmvfc_free_queue(vhost, scrq); LEAVE; @@ -5694,6 +5704,7 @@ static void ibmvfc_deregister_scsi_channel(struct ibmvfc_host *vhost, int index) free_irq(scrq->irq, scrq); irq_dispose_mapping(scrq->irq); + scrq->irq = 0; do { rc = plpar_hcall_norets(H_FREE_SUB_CRQ, vdev->unit_address, @@ -5707,17 +5718,21 @@ static void ibmvfc_deregister_scsi_channel(struct ibmvfc_host *vhost, int index) LEAVE; } -static int ibmvfc_init_sub_crqs(struct ibmvfc_host *vhost) +static void ibmvfc_init_sub_crqs(struct ibmvfc_host *vhost) { int i, j; ENTER; + if (!vhost->mq_enabled) + return; vhost->scsi_scrqs.scrqs = kcalloc(nr_scsi_hw_queues, sizeof(*vhost->scsi_scrqs.scrqs), GFP_KERNEL); - if (!vhost->scsi_scrqs.scrqs) - return -1; + if (!vhost->scsi_scrqs.scrqs) { + vhost->do_enquiry = 0; + return; + } for (i = 0; i < nr_scsi_hw_queues; i++) { if (ibmvfc_register_scsi_channel(vhost, i)) { @@ -5726,13 +5741,12 @@ static int ibmvfc_init_sub_crqs(struct ibmvfc_host *vhost) kfree(vhost->scsi_scrqs.scrqs); vhost->scsi_scrqs.scrqs = NULL; vhost->scsi_scrqs.active_queues = 0; - LEAVE; - return -1; + vhost->do_enquiry = 0; + break; } } LEAVE; - return 0; } static void ibmvfc_release_sub_crqs(struct ibmvfc_host *vhost) @@ -5999,11 +6013,7 @@ static int ibmvfc_probe(struct vio_dev *vdev, const struct vio_device_id *id) goto remove_shost; } - if (vhost->mq_enabled) { - rc = ibmvfc_init_sub_crqs(vhost); - if (rc) - dev_warn(dev, "Failed to allocate Sub-CRQs. rc=%d\n", rc); - } + ibmvfc_init_sub_crqs(vhost); if (shost_to_fc_host(shost)->rqst_q) blk_queue_max_segments(shost_to_fc_host(shost)->rqst_q, 1); diff --git a/drivers/scsi/ufs/ufs-qcom.c b/drivers/scsi/ufs/ufs-qcom.c index 7422a9b886be..ecc3f0f22105 100644 --- a/drivers/scsi/ufs/ufs-qcom.c +++ b/drivers/scsi/ufs/ufs-qcom.c @@ -253,12 +253,17 @@ static int ufs_qcom_host_reset(struct ufs_hba *hba) { int ret = 0; struct ufs_qcom_host *host = ufshcd_get_variant(hba); + bool reenable_intr = false; if (!host->core_reset) { dev_warn(hba->dev, "%s: reset control not set\n", __func__); goto out; } + reenable_intr = hba->is_irq_enabled; + disable_irq(hba->irq); + hba->is_irq_enabled = false; + ret = reset_control_assert(host->core_reset); if (ret) { dev_err(hba->dev, "%s: core_reset assert failed, err = %d\n", @@ -280,6 +285,11 @@ static int ufs_qcom_host_reset(struct ufs_hba *hba) usleep_range(1000, 1100); + if (reenable_intr) { + enable_irq(hba->irq); + hba->is_irq_enabled = true; + } + out: return ret; } diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c index 92df233ebdc4..26a0dc0ecad3 100644 --- a/drivers/scsi/ufs/ufshcd.c +++ b/drivers/scsi/ufs/ufshcd.c @@ -98,8 +98,6 @@ 16, 4, buf, __len, false); \ } while (0) -static bool early_suspend; - int ufshcd_dump_regs(struct ufs_hba *hba, size_t offset, size_t len, const char *prefix) { @@ -1537,7 +1535,7 @@ static ssize_t ufshcd_clkscale_enable_show(struct device *dev, { struct ufs_hba *hba = dev_get_drvdata(dev); - return snprintf(buf, PAGE_SIZE, "%d\n", hba->clk_scaling.is_enabled); + return sysfs_emit(buf, "%d\n", hba->clk_scaling.is_enabled); } static ssize_t ufshcd_clkscale_enable_store(struct device *dev, @@ -5008,6 +5006,7 @@ ufshcd_transfer_rsp_status(struct ufs_hba *hba, struct ufshcd_lrb *lrbp) * UFS device needs urgent BKOPs. */ if (!hba->pm_op_in_progress && + !ufshcd_eh_in_progress(hba) && ufshcd_is_exception_event(lrbp->ucd_rsp_ptr) && schedule_work(&hba->eeh_work)) { /* @@ -5808,13 +5807,20 @@ static void ufshcd_err_handling_prepare(struct ufs_hba *hba) ufshcd_suspend_clkscaling(hba); ufshcd_clk_scaling_allow(hba, false); } + ufshcd_scsi_block_requests(hba); + /* Drain ufshcd_queuecommand() */ + down_write(&hba->clk_scaling_lock); + up_write(&hba->clk_scaling_lock); + cancel_work_sync(&hba->eeh_work); } static void ufshcd_err_handling_unprepare(struct ufs_hba *hba) { + ufshcd_scsi_unblock_requests(hba); ufshcd_release(hba); if (ufshcd_is_clkscaling_supported(hba)) ufshcd_clk_scaling_suspend(hba, false); + ufshcd_clear_ua_wluns(hba); pm_runtime_put(hba->dev); } @@ -5906,8 +5912,8 @@ static void ufshcd_err_handler(struct work_struct *work) spin_unlock_irqrestore(hba->host->host_lock, flags); ufshcd_err_handling_prepare(hba); spin_lock_irqsave(hba->host->host_lock, flags); - ufshcd_scsi_block_requests(hba); - hba->ufshcd_state = UFSHCD_STATE_RESET; + if (hba->ufshcd_state != UFSHCD_STATE_ERROR) + hba->ufshcd_state = UFSHCD_STATE_RESET; /* Complete requests that have door-bell cleared by h/w */ ufshcd_complete_requests(hba); @@ -6066,12 +6072,8 @@ skip_err_handling: } ufshcd_clear_eh_in_progress(hba); spin_unlock_irqrestore(hba->host->host_lock, flags); - ufshcd_scsi_unblock_requests(hba); ufshcd_err_handling_unprepare(hba); up(&hba->host_sem); - - if (!err && needs_reset) - ufshcd_clear_ua_wluns(hba); } /** @@ -7882,6 +7884,8 @@ static int ufshcd_probe_hba(struct ufs_hba *hba, bool async) unsigned long flags; ktime_t start = ktime_get(); + hba->ufshcd_state = UFSHCD_STATE_RESET; + ret = ufshcd_link_startup(hba); if (ret) goto out; @@ -8996,11 +9000,6 @@ int ufshcd_system_suspend(struct ufs_hba *hba) int ret = 0; ktime_t start = ktime_get(); - if (!hba) { - early_suspend = true; - return 0; - } - down(&hba->host_sem); if (!hba->is_powered) @@ -9052,14 +9051,6 @@ int ufshcd_system_resume(struct ufs_hba *hba) int ret = 0; ktime_t start = ktime_get(); - if (!hba) - return -EINVAL; - - if (unlikely(early_suspend)) { - early_suspend = false; - down(&hba->host_sem); - } - if (!hba->is_powered || pm_runtime_suspended(hba->dev)) /* * Let the runtime resume take care of resuming @@ -9092,9 +9083,6 @@ int ufshcd_runtime_suspend(struct ufs_hba *hba) int ret = 0; ktime_t start = ktime_get(); - if (!hba) - return -EINVAL; - if (!hba->is_powered) goto out; else @@ -9133,9 +9121,6 @@ int ufshcd_runtime_resume(struct ufs_hba *hba) int ret = 0; ktime_t start = ktime_get(); - if (!hba) - return -EINVAL; - if (!hba->is_powered) goto out; else diff --git a/drivers/scsi/vmw_pvscsi.c b/drivers/scsi/vmw_pvscsi.c index 081f54ab7d86..8a79605d9652 100644 --- a/drivers/scsi/vmw_pvscsi.c +++ b/drivers/scsi/vmw_pvscsi.c @@ -17,8 +17,6 @@ * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. * - * Maintained by: Jim Gill - * */ #include diff --git a/drivers/scsi/vmw_pvscsi.h b/drivers/scsi/vmw_pvscsi.h index 75966d3f326e..51a82f7803d3 100644 --- a/drivers/scsi/vmw_pvscsi.h +++ b/drivers/scsi/vmw_pvscsi.h @@ -17,8 +17,6 @@ * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. * - * Maintained by: Jim Gill - * */ #ifndef _VMW_PVSCSI_H_ diff --git a/drivers/xen/events/events_2l.c b/drivers/xen/events/events_2l.c index da87f3a1e351..b8f2f971c2f0 100644 --- a/drivers/xen/events/events_2l.c +++ b/drivers/xen/events/events_2l.c @@ -47,6 +47,11 @@ static unsigned evtchn_2l_max_channels(void) return EVTCHN_2L_NR_CHANNELS; } +static void evtchn_2l_remove(evtchn_port_t evtchn, unsigned int cpu) +{ + clear_bit(evtchn, BM(per_cpu(cpu_evtchn_mask, cpu))); +} + static void evtchn_2l_bind_to_cpu(evtchn_port_t evtchn, unsigned int cpu, unsigned int old_cpu) { @@ -72,12 +77,6 @@ static bool evtchn_2l_is_pending(evtchn_port_t port) return sync_test_bit(port, BM(&s->evtchn_pending[0])); } -static bool evtchn_2l_test_and_set_mask(evtchn_port_t port) -{ - struct shared_info *s = HYPERVISOR_shared_info; - return sync_test_and_set_bit(port, BM(&s->evtchn_mask[0])); -} - static void evtchn_2l_mask(evtchn_port_t port) { struct shared_info *s = HYPERVISOR_shared_info; @@ -355,18 +354,27 @@ static void evtchn_2l_resume(void) EVTCHN_2L_NR_CHANNELS/BITS_PER_EVTCHN_WORD); } +static int evtchn_2l_percpu_deinit(unsigned int cpu) +{ + memset(per_cpu(cpu_evtchn_mask, cpu), 0, sizeof(xen_ulong_t) * + EVTCHN_2L_NR_CHANNELS/BITS_PER_EVTCHN_WORD); + + return 0; +} + static const struct evtchn_ops evtchn_ops_2l = { .max_channels = evtchn_2l_max_channels, .nr_channels = evtchn_2l_max_channels, + .remove = evtchn_2l_remove, .bind_to_cpu = evtchn_2l_bind_to_cpu, .clear_pending = evtchn_2l_clear_pending, .set_pending = evtchn_2l_set_pending, .is_pending = evtchn_2l_is_pending, - .test_and_set_mask = evtchn_2l_test_and_set_mask, .mask = evtchn_2l_mask, .unmask = evtchn_2l_unmask, .handle_events = evtchn_2l_handle_events, .resume = evtchn_2l_resume, + .percpu_deinit = evtchn_2l_percpu_deinit, }; void __init xen_evtchn_2l_init(void) diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index adb7260e94b2..8236e2364eeb 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -98,13 +98,19 @@ struct irq_info { short refcnt; u8 spurious_cnt; u8 is_accounted; - enum xen_irq_type type; /* type */ + short type; /* type: IRQT_* */ + u8 mask_reason; /* Why is event channel masked */ +#define EVT_MASK_REASON_EXPLICIT 0x01 +#define EVT_MASK_REASON_TEMPORARY 0x02 +#define EVT_MASK_REASON_EOI_PENDING 0x04 + u8 is_active; /* Is event just being handled? */ unsigned irq; evtchn_port_t evtchn; /* event channel */ unsigned short cpu; /* cpu bound */ unsigned short eoi_cpu; /* EOI must happen on this cpu-1 */ unsigned int irq_epoch; /* If eoi_cpu valid: irq_epoch of event */ u64 eoi_time; /* Time in jiffies when to EOI. */ + spinlock_t lock; union { unsigned short virq; @@ -154,6 +160,7 @@ static DEFINE_RWLOCK(evtchn_rwlock); * evtchn_rwlock * IRQ-desc lock * percpu eoi_list_lock + * irq_info->lock */ static LIST_HEAD(xen_irq_list_head); @@ -304,6 +311,8 @@ static int xen_irq_info_common_setup(struct irq_info *info, info->irq = irq; info->evtchn = evtchn; info->cpu = cpu; + info->mask_reason = EVT_MASK_REASON_EXPLICIT; + spin_lock_init(&info->lock); ret = set_evtchn_to_irq(evtchn, irq); if (ret < 0) @@ -377,6 +386,7 @@ static int xen_irq_info_pirq_setup(unsigned irq, static void xen_irq_info_cleanup(struct irq_info *info) { set_evtchn_to_irq(info->evtchn, -1); + xen_evtchn_port_remove(info->evtchn, info->cpu); info->evtchn = 0; channels_on_cpu_dec(info); } @@ -458,6 +468,34 @@ unsigned int cpu_from_evtchn(evtchn_port_t evtchn) return ret; } +static void do_mask(struct irq_info *info, u8 reason) +{ + unsigned long flags; + + spin_lock_irqsave(&info->lock, flags); + + if (!info->mask_reason) + mask_evtchn(info->evtchn); + + info->mask_reason |= reason; + + spin_unlock_irqrestore(&info->lock, flags); +} + +static void do_unmask(struct irq_info *info, u8 reason) +{ + unsigned long flags; + + spin_lock_irqsave(&info->lock, flags); + + info->mask_reason &= ~reason; + + if (!info->mask_reason) + unmask_evtchn(info->evtchn); + + spin_unlock_irqrestore(&info->lock, flags); +} + #ifdef CONFIG_X86 static bool pirq_check_eoi_map(unsigned irq) { @@ -604,7 +642,7 @@ static void xen_irq_lateeoi_locked(struct irq_info *info, bool spurious) } info->eoi_time = 0; - unmask_evtchn(evtchn); + do_unmask(info, EVT_MASK_REASON_EOI_PENDING); } static void xen_irq_lateeoi_worker(struct work_struct *work) @@ -773,6 +811,12 @@ static void xen_evtchn_close(evtchn_port_t port) BUG(); } +static void event_handler_exit(struct irq_info *info) +{ + smp_store_release(&info->is_active, 0); + clear_evtchn(info->evtchn); +} + static void pirq_query_unmask(int irq) { struct physdev_irq_status_query irq_status; @@ -791,14 +835,15 @@ static void pirq_query_unmask(int irq) static void eoi_pirq(struct irq_data *data) { - evtchn_port_t evtchn = evtchn_from_irq(data->irq); + struct irq_info *info = info_for_irq(data->irq); + evtchn_port_t evtchn = info ? info->evtchn : 0; struct physdev_eoi eoi = { .irq = pirq_from_irq(data->irq) }; int rc = 0; if (!VALID_EVTCHN(evtchn)) return; - clear_evtchn(evtchn); + event_handler_exit(info); if (pirq_needs_eoi(data->irq)) { rc = HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi); @@ -849,7 +894,8 @@ static unsigned int __startup_pirq(unsigned int irq) goto err; out: - unmask_evtchn(evtchn); + do_unmask(info, EVT_MASK_REASON_EXPLICIT); + eoi_pirq(irq_get_irq_data(irq)); return 0; @@ -876,7 +922,7 @@ static void shutdown_pirq(struct irq_data *data) if (!VALID_EVTCHN(evtchn)) return; - mask_evtchn(evtchn); + do_mask(info, EVT_MASK_REASON_EXPLICIT); xen_evtchn_close(evtchn); xen_irq_info_cleanup(info); } @@ -1628,6 +1674,8 @@ void handle_irq_for_port(evtchn_port_t port, struct evtchn_loop_ctrl *ctrl) } info = info_for_irq(irq); + if (xchg_acquire(&info->is_active, 1)) + return; dev = (info->type == IRQT_EVTCHN) ? info->u.interdomain : NULL; if (dev) @@ -1720,10 +1768,10 @@ void rebind_evtchn_irq(evtchn_port_t evtchn, int irq) } /* Rebind an evtchn so that it gets delivered to a specific cpu */ -static int xen_rebind_evtchn_to_cpu(evtchn_port_t evtchn, unsigned int tcpu) +static int xen_rebind_evtchn_to_cpu(struct irq_info *info, unsigned int tcpu) { struct evtchn_bind_vcpu bind_vcpu; - int masked; + evtchn_port_t evtchn = info ? info->evtchn : 0; if (!VALID_EVTCHN(evtchn)) return -1; @@ -1739,7 +1787,7 @@ static int xen_rebind_evtchn_to_cpu(evtchn_port_t evtchn, unsigned int tcpu) * Mask the event while changing the VCPU binding to prevent * it being delivered on an unexpected VCPU. */ - masked = test_and_set_mask(evtchn); + do_mask(info, EVT_MASK_REASON_TEMPORARY); /* * If this fails, it usually just indicates that we're dealing with a @@ -1749,8 +1797,7 @@ static int xen_rebind_evtchn_to_cpu(evtchn_port_t evtchn, unsigned int tcpu) if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0) bind_evtchn_to_cpu(evtchn, tcpu, false); - if (!masked) - unmask_evtchn(evtchn); + do_unmask(info, EVT_MASK_REASON_TEMPORARY); return 0; } @@ -1789,7 +1836,7 @@ static int set_affinity_irq(struct irq_data *data, const struct cpumask *dest, unsigned int tcpu = select_target_cpu(dest); int ret; - ret = xen_rebind_evtchn_to_cpu(evtchn_from_irq(data->irq), tcpu); + ret = xen_rebind_evtchn_to_cpu(info_for_irq(data->irq), tcpu); if (!ret) irq_data_update_effective_affinity(data, cpumask_of(tcpu)); @@ -1798,28 +1845,29 @@ static int set_affinity_irq(struct irq_data *data, const struct cpumask *dest, static void enable_dynirq(struct irq_data *data) { - evtchn_port_t evtchn = evtchn_from_irq(data->irq); + struct irq_info *info = info_for_irq(data->irq); + evtchn_port_t evtchn = info ? info->evtchn : 0; if (VALID_EVTCHN(evtchn)) - unmask_evtchn(evtchn); + do_unmask(info, EVT_MASK_REASON_EXPLICIT); } static void disable_dynirq(struct irq_data *data) { - evtchn_port_t evtchn = evtchn_from_irq(data->irq); + struct irq_info *info = info_for_irq(data->irq); + evtchn_port_t evtchn = info ? info->evtchn : 0; if (VALID_EVTCHN(evtchn)) - mask_evtchn(evtchn); + do_mask(info, EVT_MASK_REASON_EXPLICIT); } static void ack_dynirq(struct irq_data *data) { - evtchn_port_t evtchn = evtchn_from_irq(data->irq); + struct irq_info *info = info_for_irq(data->irq); + evtchn_port_t evtchn = info ? info->evtchn : 0; - if (!VALID_EVTCHN(evtchn)) - return; - - clear_evtchn(evtchn); + if (VALID_EVTCHN(evtchn)) + event_handler_exit(info); } static void mask_ack_dynirq(struct irq_data *data) @@ -1828,18 +1876,39 @@ static void mask_ack_dynirq(struct irq_data *data) ack_dynirq(data); } +static void lateeoi_ack_dynirq(struct irq_data *data) +{ + struct irq_info *info = info_for_irq(data->irq); + evtchn_port_t evtchn = info ? info->evtchn : 0; + + if (VALID_EVTCHN(evtchn)) { + do_mask(info, EVT_MASK_REASON_EOI_PENDING); + event_handler_exit(info); + } +} + +static void lateeoi_mask_ack_dynirq(struct irq_data *data) +{ + struct irq_info *info = info_for_irq(data->irq); + evtchn_port_t evtchn = info ? info->evtchn : 0; + + if (VALID_EVTCHN(evtchn)) { + do_mask(info, EVT_MASK_REASON_EXPLICIT); + event_handler_exit(info); + } +} + static int retrigger_dynirq(struct irq_data *data) { - evtchn_port_t evtchn = evtchn_from_irq(data->irq); - int masked; + struct irq_info *info = info_for_irq(data->irq); + evtchn_port_t evtchn = info ? info->evtchn : 0; if (!VALID_EVTCHN(evtchn)) return 0; - masked = test_and_set_mask(evtchn); + do_mask(info, EVT_MASK_REASON_TEMPORARY); set_evtchn(evtchn); - if (!masked) - unmask_evtchn(evtchn); + do_unmask(info, EVT_MASK_REASON_TEMPORARY); return 1; } @@ -1938,10 +2007,11 @@ static void restore_cpu_ipis(unsigned int cpu) /* Clear an irq's pending state, in preparation for polling on it */ void xen_clear_irq_pending(int irq) { - evtchn_port_t evtchn = evtchn_from_irq(irq); + struct irq_info *info = info_for_irq(irq); + evtchn_port_t evtchn = info ? info->evtchn : 0; if (VALID_EVTCHN(evtchn)) - clear_evtchn(evtchn); + event_handler_exit(info); } EXPORT_SYMBOL(xen_clear_irq_pending); void xen_set_irq_pending(int irq) @@ -2053,8 +2123,8 @@ static struct irq_chip xen_lateeoi_chip __read_mostly = { .irq_mask = disable_dynirq, .irq_unmask = enable_dynirq, - .irq_ack = mask_ack_dynirq, - .irq_mask_ack = mask_ack_dynirq, + .irq_ack = lateeoi_ack_dynirq, + .irq_mask_ack = lateeoi_mask_ack_dynirq, .irq_set_affinity = set_affinity_irq, .irq_retrigger = retrigger_dynirq, diff --git a/drivers/xen/events/events_fifo.c b/drivers/xen/events/events_fifo.c index b234f1766810..ad9fe51d3fb3 100644 --- a/drivers/xen/events/events_fifo.c +++ b/drivers/xen/events/events_fifo.c @@ -209,12 +209,6 @@ static bool evtchn_fifo_is_pending(evtchn_port_t port) return sync_test_bit(EVTCHN_FIFO_BIT(PENDING, word), BM(word)); } -static bool evtchn_fifo_test_and_set_mask(evtchn_port_t port) -{ - event_word_t *word = event_word_from_port(port); - return sync_test_and_set_bit(EVTCHN_FIFO_BIT(MASKED, word), BM(word)); -} - static void evtchn_fifo_mask(evtchn_port_t port) { event_word_t *word = event_word_from_port(port); @@ -423,7 +417,6 @@ static const struct evtchn_ops evtchn_ops_fifo = { .clear_pending = evtchn_fifo_clear_pending, .set_pending = evtchn_fifo_set_pending, .is_pending = evtchn_fifo_is_pending, - .test_and_set_mask = evtchn_fifo_test_and_set_mask, .mask = evtchn_fifo_mask, .unmask = evtchn_fifo_unmask, .handle_events = evtchn_fifo_handle_events, diff --git a/drivers/xen/events/events_internal.h b/drivers/xen/events/events_internal.h index 0a97c0549db7..4d3398eff9cd 100644 --- a/drivers/xen/events/events_internal.h +++ b/drivers/xen/events/events_internal.h @@ -14,13 +14,13 @@ struct evtchn_ops { unsigned (*nr_channels)(void); int (*setup)(evtchn_port_t port); + void (*remove)(evtchn_port_t port, unsigned int cpu); void (*bind_to_cpu)(evtchn_port_t evtchn, unsigned int cpu, unsigned int old_cpu); void (*clear_pending)(evtchn_port_t port); void (*set_pending)(evtchn_port_t port); bool (*is_pending)(evtchn_port_t port); - bool (*test_and_set_mask)(evtchn_port_t port); void (*mask)(evtchn_port_t port); void (*unmask)(evtchn_port_t port); @@ -54,6 +54,13 @@ static inline int xen_evtchn_port_setup(evtchn_port_t evtchn) return 0; } +static inline void xen_evtchn_port_remove(evtchn_port_t evtchn, + unsigned int cpu) +{ + if (evtchn_ops->remove) + evtchn_ops->remove(evtchn, cpu); +} + static inline void xen_evtchn_port_bind_to_cpu(evtchn_port_t evtchn, unsigned int cpu, unsigned int old_cpu) @@ -76,11 +83,6 @@ static inline bool test_evtchn(evtchn_port_t port) return evtchn_ops->is_pending(port); } -static inline bool test_and_set_mask(evtchn_port_t port) -{ - return evtchn_ops->test_and_set_mask(port); -} - static inline void mask_evtchn(evtchn_port_t port) { return evtchn_ops->mask(port); diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index 5447c5156b2e..f01d58c7a042 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -133,20 +133,26 @@ struct gntdev_grant_map *gntdev_alloc_map(struct gntdev_priv *priv, int count, if (NULL == add) return NULL; - add->grants = kvcalloc(count, sizeof(add->grants[0]), GFP_KERNEL); - add->map_ops = kvcalloc(count, sizeof(add->map_ops[0]), GFP_KERNEL); - add->unmap_ops = kvcalloc(count, sizeof(add->unmap_ops[0]), GFP_KERNEL); - add->kmap_ops = kvcalloc(count, sizeof(add->kmap_ops[0]), GFP_KERNEL); - add->kunmap_ops = kvcalloc(count, - sizeof(add->kunmap_ops[0]), GFP_KERNEL); + add->grants = kvmalloc_array(count, sizeof(add->grants[0]), + GFP_KERNEL); + add->map_ops = kvmalloc_array(count, sizeof(add->map_ops[0]), + GFP_KERNEL); + add->unmap_ops = kvmalloc_array(count, sizeof(add->unmap_ops[0]), + GFP_KERNEL); add->pages = kvcalloc(count, sizeof(add->pages[0]), GFP_KERNEL); if (NULL == add->grants || NULL == add->map_ops || NULL == add->unmap_ops || - NULL == add->kmap_ops || - NULL == add->kunmap_ops || NULL == add->pages) goto err; + if (use_ptemod) { + add->kmap_ops = kvmalloc_array(count, sizeof(add->kmap_ops[0]), + GFP_KERNEL); + add->kunmap_ops = kvmalloc_array(count, sizeof(add->kunmap_ops[0]), + GFP_KERNEL); + if (NULL == add->kmap_ops || NULL == add->kunmap_ops) + goto err; + } #ifdef CONFIG_XEN_GRANT_DMA_ALLOC add->dma_flags = dma_flags; @@ -183,10 +189,14 @@ struct gntdev_grant_map *gntdev_alloc_map(struct gntdev_priv *priv, int count, goto err; for (i = 0; i < count; i++) { - add->map_ops[i].handle = -1; - add->unmap_ops[i].handle = -1; - add->kmap_ops[i].handle = -1; - add->kunmap_ops[i].handle = -1; + add->grants[i].domid = DOMID_INVALID; + add->grants[i].ref = INVALID_GRANT_REF; + add->map_ops[i].handle = INVALID_GRANT_HANDLE; + add->unmap_ops[i].handle = INVALID_GRANT_HANDLE; + if (use_ptemod) { + add->kmap_ops[i].handle = INVALID_GRANT_HANDLE; + add->kunmap_ops[i].handle = INVALID_GRANT_HANDLE; + } } add->index = 0; @@ -274,7 +284,7 @@ static int find_grant_ptes(pte_t *pte, unsigned long addr, void *data) map->grants[pgnr].ref, map->grants[pgnr].domid); gnttab_set_unmap_op(&map->unmap_ops[pgnr], pte_maddr, flags, - -1 /* handle */); + INVALID_GRANT_HANDLE); return 0; } @@ -292,7 +302,7 @@ int gntdev_map_grant_pages(struct gntdev_grant_map *map) if (!use_ptemod) { /* Note: it could already be mapped */ - if (map->map_ops[0].handle != -1) + if (map->map_ops[0].handle != INVALID_GRANT_HANDLE) return 0; for (i = 0; i < map->count; i++) { unsigned long addr = (unsigned long) @@ -301,7 +311,7 @@ int gntdev_map_grant_pages(struct gntdev_grant_map *map) map->grants[i].ref, map->grants[i].domid); gnttab_set_unmap_op(&map->unmap_ops[i], addr, - map->flags, -1 /* handle */); + map->flags, INVALID_GRANT_HANDLE); } } else { /* @@ -327,13 +337,13 @@ int gntdev_map_grant_pages(struct gntdev_grant_map *map) map->grants[i].ref, map->grants[i].domid); gnttab_set_unmap_op(&map->kunmap_ops[i], address, - flags, -1); + flags, INVALID_GRANT_HANDLE); } } pr_debug("map %d+%d\n", map->index, map->count); - err = gnttab_map_refs(map->map_ops, use_ptemod ? map->kmap_ops : NULL, - map->pages, map->count); + err = gnttab_map_refs(map->map_ops, map->kmap_ops, map->pages, + map->count); for (i = 0; i < map->count; i++) { if (map->map_ops[i].status == GNTST_okay) @@ -385,7 +395,7 @@ static int __unmap_grant_pages(struct gntdev_grant_map *map, int offset, pr_debug("unmap handle=%d st=%d\n", map->unmap_ops[offset+i].handle, map->unmap_ops[offset+i].status); - map->unmap_ops[offset+i].handle = -1; + map->unmap_ops[offset+i].handle = INVALID_GRANT_HANDLE; } return err; } @@ -401,13 +411,15 @@ static int unmap_grant_pages(struct gntdev_grant_map *map, int offset, * already unmapped some of the grants. Only unmap valid ranges. */ while (pages && !err) { - while (pages && map->unmap_ops[offset].handle == -1) { + while (pages && + map->unmap_ops[offset].handle == INVALID_GRANT_HANDLE) { offset++; pages--; } range = 0; while (range < pages) { - if (map->unmap_ops[offset+range].handle == -1) + if (map->unmap_ops[offset + range].handle == + INVALID_GRANT_HANDLE) break; range++; } diff --git a/fs/block_dev.c b/fs/block_dev.c index 4aa1f88d5bf8..92ed7d5df677 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -118,13 +118,22 @@ int truncate_bdev_range(struct block_device *bdev, fmode_t mode, if (!(mode & FMODE_EXCL)) { int err = bd_prepare_to_claim(bdev, truncate_bdev_range); if (err) - return err; + goto invalidate; } truncate_inode_pages_range(bdev->bd_inode->i_mapping, lstart, lend); if (!(mode & FMODE_EXCL)) bd_abort_claiming(bdev, truncate_bdev_range); return 0; + +invalidate: + /* + * Someone else has handle exclusively open. Try invalidating instead. + * The 'end' argument is inclusive so the rounding is safe. + */ + return invalidate_inode_pages2_range(bdev->bd_inode->i_mapping, + lstart >> PAGE_SHIFT, + lend >> PAGE_SHIFT); } static void set_init_blocksize(struct block_device *bdev) @@ -423,7 +432,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, dio->size += bio->bi_iter.bi_size; pos += bio->bi_iter.bi_size; - nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_PAGES); + nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS); if (!nr_pages) { bool polled = false; @@ -491,8 +500,8 @@ blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) if (!iov_iter_count(iter)) return 0; - nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_PAGES + 1); - if (is_sync_kiocb(iocb) && nr_pages <= BIO_MAX_PAGES) + nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1); + if (is_sync_kiocb(iocb) && nr_pages <= BIO_MAX_VECS) return __blkdev_direct_IO_simple(iocb, iter, nr_pages); return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages)); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 4671c99d468d..191e358f1322 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3059,7 +3059,7 @@ struct bio *btrfs_bio_alloc(u64 first_byte) { struct bio *bio; - bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &btrfs_bioset); + bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_VECS, &btrfs_bioset); bio->bi_iter.bi_sector = first_byte >> 9; btrfs_io_bio_init(btrfs_io_bio(bio)); return bio; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index c2900ebf767a..3d9088eab2fc 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1428,7 +1428,7 @@ static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info, if (!first_page->dev->bdev) goto out; - bio = btrfs_io_bio_alloc(BIO_MAX_PAGES); + bio = btrfs_io_bio_alloc(BIO_MAX_VECS); bio_set_dev(bio, first_page->dev->bdev); for (page_num = 0; page_num < sblock->page_count; page_num++) { diff --git a/fs/configfs/file.c b/fs/configfs/file.c index 1f0270229d7b..da8351d1e455 100644 --- a/fs/configfs/file.c +++ b/fs/configfs/file.c @@ -378,7 +378,7 @@ static int __configfs_open_file(struct inode *inode, struct file *file, int type attr = to_attr(dentry); if (!attr) - goto out_put_item; + goto out_free_buffer; if (type & CONFIGFS_ITEM_BIN_ATTR) { buffer->bin_attr = to_bin_attr(dentry); @@ -391,7 +391,7 @@ static int __configfs_open_file(struct inode *inode, struct file *file, int type /* Grab the module reference for this attribute if we have one */ error = -ENODEV; if (!try_module_get(buffer->owner)) - goto out_put_item; + goto out_free_buffer; error = -EACCES; if (!buffer->item->ci_type) @@ -435,8 +435,6 @@ static int __configfs_open_file(struct inode *inode, struct file *file, int type out_put_module: module_put(buffer->owner); -out_put_item: - config_item_put(buffer->item); out_free_buffer: up_read(&frag->frag_sem); kfree(buffer); diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c index b048a0e38516..68a2de6b5a9b 100644 --- a/fs/crypto/bio.c +++ b/fs/crypto/bio.c @@ -52,7 +52,7 @@ static int fscrypt_zeroout_range_inline_crypt(const struct inode *inode, int num_pages = 0; /* This always succeeds since __GFP_DIRECT_RECLAIM is set. */ - bio = bio_alloc(GFP_NOFS, BIO_MAX_PAGES); + bio = bio_alloc(GFP_NOFS, BIO_MAX_VECS); while (len) { unsigned int blocks_this_page = min(len, blocks_per_page); @@ -74,7 +74,7 @@ static int fscrypt_zeroout_range_inline_crypt(const struct inode *inode, len -= blocks_this_page; lblk += blocks_this_page; pblk += blocks_this_page; - if (num_pages == BIO_MAX_PAGES || !len || + if (num_pages == BIO_MAX_VECS || !len || !fscrypt_mergeable_bio(bio, inode, lblk)) { err = submit_bio_wait(bio); if (err) @@ -126,7 +126,7 @@ int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, return fscrypt_zeroout_range_inline_crypt(inode, lblk, pblk, len); - BUILD_BUG_ON(ARRAY_SIZE(pages) > BIO_MAX_PAGES); + BUILD_BUG_ON(ARRAY_SIZE(pages) > BIO_MAX_VECS); nr_pages = min_t(unsigned int, ARRAY_SIZE(pages), (len + blocks_per_page - 1) >> blocks_per_page_bits); diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 6cb356c4217b..3851e1a64f73 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -1235,7 +1235,7 @@ submit_bio_retry: } if (!bio) { - bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES); + bio = bio_alloc(GFP_NOIO, BIO_MAX_VECS); bio->bi_end_io = z_erofs_decompressqueue_endio; bio_set_dev(bio, sb->s_bdev); diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 03a44a0de86a..f038d578d8d8 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -398,7 +398,7 @@ static void io_submit_init_bio(struct ext4_io_submit *io, * bio_alloc will _always_ be able to allocate a bio if * __GFP_DIRECT_RECLAIM is set, see comments for bio_alloc_bioset(). */ - bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES); + bio = bio_alloc(GFP_NOIO, BIO_MAX_VECS); fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO); bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio_set_dev(bio, bh->b_bdev); diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 174a0819ad96..be5415a0dbbc 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -292,7 +292,7 @@ void f2fs_ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index) f2fs_put_page(page, 0); if (readahead) - f2fs_ra_meta_pages(sbi, index, BIO_MAX_PAGES, META_POR, true); + f2fs_ra_meta_pages(sbi, index, BIO_MAX_VECS, META_POR, true); } static int __f2fs_write_meta_page(struct page *page, diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index d8d17e94fd57..dae125526a89 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -862,7 +862,7 @@ int f2fs_merge_page_bio(struct f2fs_io_info *fio) f2fs_submit_merged_ipu_write(fio->sbi, &bio, NULL); alloc_new: if (!bio) { - bio = __bio_alloc(fio, BIO_MAX_PAGES); + bio = __bio_alloc(fio, BIO_MAX_VECS); __attach_io_flag(fio); f2fs_set_bio_crypt_ctx(bio, fio->page->mapping->host, fio->page->index, fio, GFP_NOIO); @@ -937,7 +937,7 @@ alloc_new: fio->retry = true; goto skip; } - io->bio = __bio_alloc(fio, BIO_MAX_PAGES); + io->bio = __bio_alloc(fio, BIO_MAX_VECS); f2fs_set_bio_crypt_ctx(io->bio, fio->page->mapping->host, bio_page->index, fio, GFP_NOIO); io->fio = *fio; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 993004f06a77..c2866561263e 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -4381,7 +4381,7 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) block_t total_node_blocks = 0; do { - readed = f2fs_ra_meta_pages(sbi, start_blk, BIO_MAX_PAGES, + readed = f2fs_ra_meta_pages(sbi, start_blk, BIO_MAX_VECS, META_SIT, true); start = start_blk * sit_i->sents_per_block; diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 229814b4f4a6..e9a7a637d688 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -851,7 +851,7 @@ static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type) else if (type == NODE) return 8 * sbi->blocks_per_seg; else if (type == META) - return 8 * BIO_MAX_PAGES; + return 8 * BIO_MAX_VECS; else return 0; } @@ -868,7 +868,7 @@ static inline long nr_pages_to_write(struct f2fs_sb_info *sbi, int type, return 0; nr_to_write = wbc->nr_to_write; - desired = BIO_MAX_PAGES; + desired = BIO_MAX_VECS; if (type == NODE) desired <<= 1; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 7069793752f1..82592b19b4e0 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -753,9 +753,9 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) case Opt_io_size_bits: if (args->from && match_int(args, &arg)) return -EINVAL; - if (arg <= 0 || arg > __ilog2_u32(BIO_MAX_PAGES)) { + if (arg <= 0 || arg > __ilog2_u32(BIO_MAX_VECS)) { f2fs_warn(sbi, "Not support %d, larger than %d", - 1 << arg, BIO_MAX_PAGES); + 1 << arg, BIO_MAX_VECS); return -EINVAL; } F2FS_OPTION(sbi).write_io_size_bits = arg; diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 16937ebb2a3e..6410281546f9 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -998,12 +998,16 @@ static void trans_drain(struct gfs2_trans *tr) while (!list_empty(head)) { bd = list_first_entry(head, struct gfs2_bufdata, bd_list); list_del_init(&bd->bd_list); + if (!list_empty(&bd->bd_ail_st_list)) + gfs2_remove_from_ail(bd); kmem_cache_free(gfs2_bufdata_cachep, bd); } head = &tr->tr_databuf; while (!list_empty(head)) { bd = list_first_entry(head, struct gfs2_bufdata, bd_list); list_del_init(&bd->bd_list); + if (!list_empty(&bd->bd_ail_st_list)) + gfs2_remove_from_ail(bd); kmem_cache_free(gfs2_bufdata_cachep, bd); } } @@ -1032,7 +1036,7 @@ repeat: * Do this check while holding the log_flush_lock to prevent new * buffers from being added to the ail via gfs2_pin() */ - if (gfs2_withdrawn(sdp)) + if (gfs2_withdrawn(sdp) || !test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) goto out; /* Log might have been flushed while we waited for the flush lock */ diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index dc1b93a877c6..a82f4747aa8d 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -267,7 +267,7 @@ static struct bio *gfs2_log_alloc_bio(struct gfs2_sbd *sdp, u64 blkno, bio_end_io_t *end_io) { struct super_block *sb = sdp->sd_vfs; - struct bio *bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES); + struct bio *bio = bio_alloc(GFP_NOIO, BIO_MAX_VECS); bio->bi_iter.bi_sector = blkno << sdp->sd_fsb2bb_shift; bio_set_dev(bio, sb->s_bdev); diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 74c7d01723b9..aa4136055a83 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -1539,9 +1539,7 @@ static int gfs2_reconfigure(struct fs_context *fc) return -EINVAL; if (fc->sb_flags & SB_RDONLY) { - error = gfs2_make_fs_ro(sdp); - if (error) - errorfc(fc, "unable to remount read-only"); + gfs2_make_fs_ro(sdp); } else { error = gfs2_make_fs_rw(sdp); if (error) diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 861ed5fe02a5..97076d3f562f 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -587,9 +587,8 @@ out: * Returns: errno */ -int gfs2_make_fs_ro(struct gfs2_sbd *sdp) +void gfs2_make_fs_ro(struct gfs2_sbd *sdp) { - int error = 0; int log_write_allowed = test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); gfs2_flush_delete_work(sdp); @@ -624,8 +623,6 @@ int gfs2_make_fs_ro(struct gfs2_sbd *sdp) if (!log_write_allowed) sdp->sd_vfs->s_flags |= SB_RDONLY; - - return error; } /** @@ -637,7 +634,6 @@ int gfs2_make_fs_ro(struct gfs2_sbd *sdp) static void gfs2_put_super(struct super_block *sb) { struct gfs2_sbd *sdp = sb->s_fs_info; - int error; struct gfs2_jdesc *jd; /* No more recovery requests */ @@ -658,9 +654,7 @@ restart: spin_unlock(&sdp->sd_jindex_spin); if (!sb_rdonly(sb)) { - error = gfs2_make_fs_ro(sdp); - if (error) - gfs2_io_error(sdp); + gfs2_make_fs_ro(sdp); } WARN_ON(gfs2_withdrawing(sdp)); diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h index 08e502dec7ec..ec4affb33ed5 100644 --- a/fs/gfs2/super.h +++ b/fs/gfs2/super.h @@ -34,7 +34,7 @@ extern int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename, struct gfs2_inode **ipp); extern int gfs2_make_fs_rw(struct gfs2_sbd *sdp); -extern int gfs2_make_fs_ro(struct gfs2_sbd *sdp); +extern void gfs2_make_fs_ro(struct gfs2_sbd *sdp); extern void gfs2_online_uevent(struct gfs2_sbd *sdp); extern int gfs2_statfs_init(struct gfs2_sbd *sdp); extern void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free, diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index ab96cf0bf26b..63fec11ef2ce 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c @@ -169,6 +169,8 @@ static struct gfs2_bufdata *gfs2_alloc_bufdata(struct gfs2_glock *gl, bd->bd_bh = bh; bd->bd_gl = gl; INIT_LIST_HEAD(&bd->bd_list); + INIT_LIST_HEAD(&bd->bd_ail_st_list); + INIT_LIST_HEAD(&bd->bd_ail_gl_list); bh->b_private = bd; return bd; } diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c index 8d3c670c990f..4f034b87b427 100644 --- a/fs/gfs2/util.c +++ b/fs/gfs2/util.c @@ -119,17 +119,22 @@ void gfs2_freeze_unlock(struct gfs2_holder *freeze_gh) static void signal_our_withdraw(struct gfs2_sbd *sdp) { struct gfs2_glock *live_gl = sdp->sd_live_gh.gh_gl; - struct inode *inode = sdp->sd_jdesc->jd_inode; - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_glock *i_gl = ip->i_gl; - u64 no_formal_ino = ip->i_no_formal_ino; + struct inode *inode; + struct gfs2_inode *ip; + struct gfs2_glock *i_gl; + u64 no_formal_ino; int log_write_allowed = test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); int ret = 0; int tries; - if (test_bit(SDF_NORECOVERY, &sdp->sd_flags)) + if (test_bit(SDF_NORECOVERY, &sdp->sd_flags) || !sdp->sd_jdesc) return; + inode = sdp->sd_jdesc->jd_inode; + ip = GFS2_I(inode); + i_gl = ip->i_gl; + no_formal_ino = ip->i_no_formal_ino; + /* Prevent any glock dq until withdraw recovery is complete */ set_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags); /* @@ -156,7 +161,7 @@ static void signal_our_withdraw(struct gfs2_sbd *sdp) ret = 0; } if (!ret) - ret = gfs2_make_fs_ro(sdp); + gfs2_make_fs_ro(sdp); gfs2_freeze_unlock(&freeze_gh); } diff --git a/fs/io-wq.c b/fs/io-wq.c index 28868eb4cd09..0ae9ecadf295 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -110,7 +110,6 @@ struct io_wq { io_wq_work_fn *do_work; struct task_struct *manager; - struct user_struct *user; struct io_wq_hash *hash; @@ -592,7 +591,7 @@ static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) tsk->pf_io_worker = worker; worker->task = tsk; set_cpus_allowed_ptr(tsk, cpumask_of_node(wqe->node)); - tsk->flags |= PF_NOFREEZE | PF_NO_SETAFFINITY; + tsk->flags |= PF_NO_SETAFFINITY; raw_spin_lock_irq(&wqe->lock); hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list); @@ -710,7 +709,6 @@ static int io_wq_manager(void *data) set_current_state(TASK_INTERRUPTIBLE); io_wq_check_workers(wq); schedule_timeout(HZ); - try_to_freeze(); if (fatal_signal_pending(current)) set_bit(IO_WQ_BIT_EXIT, &wq->state); } while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)); @@ -722,9 +720,9 @@ static int io_wq_manager(void *data) io_wq_for_each_worker(wq->wqes[node], io_wq_worker_wake, NULL); rcu_read_unlock(); - /* we might not ever have created any workers */ - if (atomic_read(&wq->worker_refs)) - wait_for_completion(&wq->worker_done); + if (atomic_dec_and_test(&wq->worker_refs)) + complete(&wq->worker_done); + wait_for_completion(&wq->worker_done); spin_lock_irq(&wq->hash->wait.lock); for_each_node(node) @@ -774,7 +772,10 @@ static int io_wq_fork_manager(struct io_wq *wq) if (wq->manager) return 0; - reinit_completion(&wq->worker_done); + WARN_ON_ONCE(test_bit(IO_WQ_BIT_EXIT, &wq->state)); + + init_completion(&wq->worker_done); + atomic_set(&wq->worker_refs, 1); tsk = create_io_thread(io_wq_manager, wq, NUMA_NO_NODE); if (!IS_ERR(tsk)) { wq->manager = get_task_struct(tsk); @@ -782,6 +783,9 @@ static int io_wq_fork_manager(struct io_wq *wq) return 0; } + if (atomic_dec_and_test(&wq->worker_refs)) + complete(&wq->worker_done); + return PTR_ERR(tsk); } @@ -794,8 +798,7 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) /* Can only happen if manager creation fails after exec */ if (io_wq_fork_manager(wqe->wq) || test_bit(IO_WQ_BIT_EXIT, &wqe->wq->state)) { - work->flags |= IO_WQ_WORK_CANCEL; - wqe->wq->do_work(work); + io_run_cancel(work, wqe); return; } @@ -1018,13 +1021,9 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) init_completion(&wq->exited); refcount_set(&wq->refs, 1); - init_completion(&wq->worker_done); - atomic_set(&wq->worker_refs, 0); - ret = io_wq_fork_manager(wq); if (!ret) return wq; - err: io_wq_put_hash(data->hash); cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node); diff --git a/fs/io-wq.h b/fs/io-wq.h index 5fbf7997149e..1ac2f3248088 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -79,8 +79,8 @@ static inline void wq_list_del(struct io_wq_work_list *list, struct io_wq_work { struct io_wq_work_node list; + const struct cred *creds; unsigned flags; - unsigned short personality; }; static inline struct io_wq_work *wq_next_work(struct io_wq_work *work) diff --git a/fs/io_uring.c b/fs/io_uring.c index 92c25b5f1349..a4bce17af506 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -258,12 +258,10 @@ enum { struct io_sq_data { refcount_t refs; - struct mutex lock; + struct rw_semaphore rw_lock; /* ctx's that are using this sqd */ struct list_head ctx_list; - struct list_head ctx_new_list; - struct mutex ctx_lock; struct task_struct *thread; struct wait_queue_head wait; @@ -271,10 +269,9 @@ struct io_sq_data { unsigned sq_thread_idle; int sq_cpu; pid_t task_pid; + pid_t task_tgid; unsigned long state; - struct completion startup; - struct completion parked; struct completion exited; }; @@ -336,7 +333,6 @@ struct io_ring_ctx { unsigned int drain_next: 1; unsigned int eventfd_async: 1; unsigned int restricted: 1; - unsigned int sqo_exec: 1; /* * Ring buffer of indices into array of io_uring_sqe, which is @@ -380,6 +376,7 @@ struct io_ring_ctx { /* Only used for accounting purposes */ struct mm_struct *mm_account; + const struct cred *sq_creds; /* cred used for __io_sq_thread() */ struct io_sq_data *sq_data; /* if using sq thread polling */ struct wait_queue_head sqo_sq_wait; @@ -400,7 +397,6 @@ struct io_ring_ctx { struct user_struct *user; struct completion ref_comp; - struct completion sq_thread_comp; #if defined(CONFIG_UNIX) struct socket *ring_sock; @@ -408,7 +404,8 @@ struct io_ring_ctx { struct idr io_buffer_idr; - struct idr personality_idr; + struct xarray personalities; + u32 pers_next; struct { unsigned cached_cq_tail; @@ -454,6 +451,7 @@ struct io_ring_ctx { /* Keep this last, we don't need it for the fast path */ struct work_struct exit_work; + struct list_head tctx_list; }; /* @@ -805,6 +803,12 @@ struct io_kiocb { struct io_wq_work work; }; +struct io_tctx_node { + struct list_head ctx_node; + struct task_struct *task; + struct io_ring_ctx *ctx; +}; + struct io_defer_entry { struct list_head list; struct io_kiocb *req; @@ -979,6 +983,8 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_UNLINKAT] = {}, }; +static bool io_disarm_next(struct io_kiocb *req); +static void io_uring_del_task_file(unsigned long index); static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, struct task_struct *task, struct files_struct *files); @@ -1129,9 +1135,8 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) init_waitqueue_head(&ctx->cq_wait); INIT_LIST_HEAD(&ctx->cq_overflow_list); init_completion(&ctx->ref_comp); - init_completion(&ctx->sq_thread_comp); idr_init(&ctx->io_buffer_idr); - idr_init(&ctx->personality_idr); + xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1); mutex_init(&ctx->uring_lock); init_waitqueue_head(&ctx->wait); spin_lock_init(&ctx->completion_lock); @@ -1144,6 +1149,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_LIST_HEAD(&ctx->rsrc_ref_list); INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work); init_llist_head(&ctx->rsrc_put_llist); + INIT_LIST_HEAD(&ctx->tctx_list); INIT_LIST_HEAD(&ctx->submit_state.comp.free_list); INIT_LIST_HEAD(&ctx->submit_state.comp.locked_free_list); return ctx; @@ -1183,6 +1189,9 @@ static void io_prep_async_work(struct io_kiocb *req) const struct io_op_def *def = &io_op_defs[req->opcode]; struct io_ring_ctx *ctx = req->ctx; + if (!req->work.creds) + req->work.creds = get_current_cred(); + if (req->flags & REQ_F_FORCE_ASYNC) req->work.flags |= IO_WQ_WORK_CONCURRENT; @@ -1514,15 +1523,14 @@ static void io_cqring_fill_event(struct io_kiocb *req, long res) __io_cqring_fill_event(req, res, 0); } -static inline void io_req_complete_post(struct io_kiocb *req, long res, - unsigned int cflags) +static void io_req_complete_post(struct io_kiocb *req, long res, + unsigned int cflags) { struct io_ring_ctx *ctx = req->ctx; unsigned long flags; spin_lock_irqsave(&ctx->completion_lock, flags); __io_cqring_fill_event(req, res, cflags); - io_commit_cqring(ctx); /* * If we're the last reference to this request, add to our locked * free_list cache. @@ -1530,19 +1538,26 @@ static inline void io_req_complete_post(struct io_kiocb *req, long res, if (refcount_dec_and_test(&req->refs)) { struct io_comp_state *cs = &ctx->submit_state.comp; + if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) { + if (req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_FAIL_LINK)) + io_disarm_next(req); + if (req->link) { + io_req_task_queue(req->link); + req->link = NULL; + } + } io_dismantle_req(req); io_put_task(req->task, 1); list_add(&req->compl.list, &cs->locked_free_list); cs->locked_free_nr++; } else req = NULL; + io_commit_cqring(ctx); spin_unlock_irqrestore(&ctx->completion_lock, flags); - io_cqring_ev_posted(ctx); - if (req) { - io_queue_next(req); + + if (req) percpu_ref_put(&ctx->refs); - } } static void io_req_complete_state(struct io_kiocb *req, long res, @@ -1648,6 +1663,10 @@ static void io_dismantle_req(struct io_kiocb *req) io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE)); if (req->fixed_rsrc_refs) percpu_ref_put(req->fixed_rsrc_refs); + if (req->work.creds) { + put_cred(req->work.creds); + req->work.creds = NULL; + } if (req->flags & REQ_F_INFLIGHT) { struct io_ring_ctx *ctx = req->ctx; @@ -1690,15 +1709,11 @@ static inline void io_remove_next_linked(struct io_kiocb *req) nxt->link = NULL; } -static void io_kill_linked_timeout(struct io_kiocb *req) +static bool io_kill_linked_timeout(struct io_kiocb *req) + __must_hold(&req->ctx->completion_lock) { - struct io_ring_ctx *ctx = req->ctx; - struct io_kiocb *link; + struct io_kiocb *link = req->link; bool cancelled = false; - unsigned long flags; - - spin_lock_irqsave(&ctx->completion_lock, flags); - link = req->link; /* * Can happen if a linked timeout fired and link had been like @@ -1713,50 +1728,48 @@ static void io_kill_linked_timeout(struct io_kiocb *req) ret = hrtimer_try_to_cancel(&io->timer); if (ret != -1) { io_cqring_fill_event(link, -ECANCELED); - io_commit_cqring(ctx); + io_put_req_deferred(link, 1); cancelled = true; } } req->flags &= ~REQ_F_LINK_TIMEOUT; - spin_unlock_irqrestore(&ctx->completion_lock, flags); - - if (cancelled) { - io_cqring_ev_posted(ctx); - io_put_req(link); - } + return cancelled; } - static void io_fail_links(struct io_kiocb *req) + __must_hold(&req->ctx->completion_lock) { - struct io_kiocb *link, *nxt; - struct io_ring_ctx *ctx = req->ctx; - unsigned long flags; + struct io_kiocb *nxt, *link = req->link; - spin_lock_irqsave(&ctx->completion_lock, flags); - link = req->link; req->link = NULL; - while (link) { nxt = link->link; link->link = NULL; trace_io_uring_fail_link(req, link); io_cqring_fill_event(link, -ECANCELED); - io_put_req_deferred(link, 2); link = nxt; } - io_commit_cqring(ctx); - spin_unlock_irqrestore(&ctx->completion_lock, flags); +} - io_cqring_ev_posted(ctx); +static bool io_disarm_next(struct io_kiocb *req) + __must_hold(&req->ctx->completion_lock) +{ + bool posted = false; + + if (likely(req->flags & REQ_F_LINK_TIMEOUT)) + posted = io_kill_linked_timeout(req); + if (unlikely(req->flags & REQ_F_FAIL_LINK)) { + posted |= (req->link != NULL); + io_fail_links(req); + } + return posted; } static struct io_kiocb *__io_req_find_next(struct io_kiocb *req) { - if (req->flags & REQ_F_LINK_TIMEOUT) - io_kill_linked_timeout(req); + struct io_kiocb *nxt; /* * If LINK is set, we have dependent requests in this chain. If we @@ -1764,14 +1777,22 @@ static struct io_kiocb *__io_req_find_next(struct io_kiocb *req) * dependencies to the next request. In case of failure, fail the rest * of the chain. */ - if (likely(!(req->flags & REQ_F_FAIL_LINK))) { - struct io_kiocb *nxt = req->link; + if (req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_FAIL_LINK)) { + struct io_ring_ctx *ctx = req->ctx; + unsigned long flags; + bool posted; - req->link = NULL; - return nxt; + spin_lock_irqsave(&ctx->completion_lock, flags); + posted = io_disarm_next(req); + if (posted) + io_commit_cqring(req->ctx); + spin_unlock_irqrestore(&ctx->completion_lock, flags); + if (posted) + io_cqring_ev_posted(ctx); } - io_fail_links(req); - return NULL; + nxt = req->link; + req->link = NULL; + return nxt; } static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req) @@ -5559,22 +5580,30 @@ add: return 0; } +struct io_cancel_data { + struct io_ring_ctx *ctx; + u64 user_data; +}; + static bool io_cancel_cb(struct io_wq_work *work, void *data) { struct io_kiocb *req = container_of(work, struct io_kiocb, work); + struct io_cancel_data *cd = data; - return req->user_data == (unsigned long) data; + return req->ctx == cd->ctx && req->user_data == cd->user_data; } -static int io_async_cancel_one(struct io_uring_task *tctx, void *sqe_addr) +static int io_async_cancel_one(struct io_uring_task *tctx, u64 user_data, + struct io_ring_ctx *ctx) { + struct io_cancel_data data = { .ctx = ctx, .user_data = user_data, }; enum io_wq_cancel cancel_ret; int ret = 0; - if (!tctx->io_wq) + if (!tctx || !tctx->io_wq) return -ENOENT; - cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, sqe_addr, false); + cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, &data, false); switch (cancel_ret) { case IO_WQ_CANCEL_OK: ret = 0; @@ -5597,8 +5626,7 @@ static void io_async_find_and_cancel(struct io_ring_ctx *ctx, unsigned long flags; int ret; - ret = io_async_cancel_one(req->task->io_uring, - (void *) (unsigned long) sqe_addr); + ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx); if (ret != -ENOENT) { spin_lock_irqsave(&ctx->completion_lock, flags); goto done; @@ -5639,8 +5667,47 @@ static int io_async_cancel_prep(struct io_kiocb *req, static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) { struct io_ring_ctx *ctx = req->ctx; + u64 sqe_addr = req->cancel.addr; + struct io_tctx_node *node; + int ret; - io_async_find_and_cancel(ctx, req, req->cancel.addr, 0); + /* tasks should wait for their io-wq threads, so safe w/o sync */ + ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx); + spin_lock_irq(&ctx->completion_lock); + if (ret != -ENOENT) + goto done; + ret = io_timeout_cancel(ctx, sqe_addr); + if (ret != -ENOENT) + goto done; + ret = io_poll_cancel(ctx, sqe_addr); + if (ret != -ENOENT) + goto done; + spin_unlock_irq(&ctx->completion_lock); + + /* slow path, try all io-wq's */ + io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK)); + ret = -ENOENT; + list_for_each_entry(node, &ctx->tctx_list, ctx_node) { + struct io_uring_task *tctx = node->task->io_uring; + + if (!tctx || !tctx->io_wq) + continue; + ret = io_async_cancel_one(tctx, req->cancel.addr, ctx); + if (ret != -ENOENT) + break; + } + io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK)); + + spin_lock_irq(&ctx->completion_lock); +done: + io_cqring_fill_event(req, ret); + io_commit_cqring(ctx); + spin_unlock_irq(&ctx->completion_lock); + io_cqring_ev_posted(ctx); + + if (ret < 0) + req_set_fail_links(req); + io_put_req(req); return 0; } @@ -5916,18 +5983,8 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) const struct cred *creds = NULL; int ret; - if (req->work.personality) { - const struct cred *new_creds; - - if (!(issue_flags & IO_URING_F_NONBLOCK)) - mutex_lock(&ctx->uring_lock); - new_creds = idr_find(&ctx->personality_idr, req->work.personality); - if (!(issue_flags & IO_URING_F_NONBLOCK)) - mutex_unlock(&ctx->uring_lock); - if (!new_creds) - return -EINVAL; - creds = override_creds(new_creds); - } + if (req->work.creds && req->work.creds != current_cred()) + creds = override_creds(req->work.creds); switch (req->opcode) { case IORING_OP_NOP: @@ -6291,7 +6348,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, { struct io_submit_state *state; unsigned int sqe_flags; - int ret = 0; + int personality, ret = 0; req->opcode = READ_ONCE(sqe->opcode); /* same numerical values with corresponding REQ_F_*, safe to copy */ @@ -6306,6 +6363,9 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, refcount_set(&req->refs, 2); req->task = current; req->result = 0; + req->work.list.next = NULL; + req->work.creds = NULL; + req->work.flags = 0; /* enforce forwards compatibility on users */ if (unlikely(sqe_flags & ~SQE_VALID_FLAGS)) { @@ -6323,9 +6383,13 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, !io_op_defs[req->opcode].buffer_select) return -EOPNOTSUPP; - req->work.list.next = NULL; - req->work.flags = 0; - req->work.personality = READ_ONCE(sqe->personality); + personality = READ_ONCE(sqe->personality); + if (personality) { + req->work.creds = xa_load(&ctx->personalities, personality); + if (!req->work.creds) + return -EINVAL; + get_cred(req->work.creds); + } state = &ctx->submit_state; /* @@ -6587,7 +6651,8 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries) if (!list_empty(&ctx->iopoll_list)) io_do_iopoll(ctx, &nr_events, 0); - if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs))) + if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs)) && + !(ctx->flags & IORING_SETUP_R_DISABLED)) ret = io_submit_sqes(ctx, to_submit); mutex_unlock(&ctx->uring_lock); } @@ -6611,58 +6676,6 @@ static void io_sqd_update_thread_idle(struct io_sq_data *sqd) sqd->sq_thread_idle = sq_thread_idle; } -static void io_sqd_init_new(struct io_sq_data *sqd) -{ - struct io_ring_ctx *ctx; - - while (!list_empty(&sqd->ctx_new_list)) { - ctx = list_first_entry(&sqd->ctx_new_list, struct io_ring_ctx, sqd_list); - list_move_tail(&ctx->sqd_list, &sqd->ctx_list); - complete(&ctx->sq_thread_comp); - } - - io_sqd_update_thread_idle(sqd); -} - -static bool io_sq_thread_should_stop(struct io_sq_data *sqd) -{ - return test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); -} - -static bool io_sq_thread_should_park(struct io_sq_data *sqd) -{ - return test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); -} - -static void io_sq_thread_parkme(struct io_sq_data *sqd) -{ - for (;;) { - /* - * TASK_PARKED is a special state; we must serialize against - * possible pending wakeups to avoid store-store collisions on - * task->state. - * - * Such a collision might possibly result in the task state - * changin from TASK_PARKED and us failing the - * wait_task_inactive() in kthread_park(). - */ - set_special_state(TASK_PARKED); - if (!test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state)) - break; - - /* - * Thread is going to call schedule(), do not preempt it, - * or the caller of kthread_park() may spend more time in - * wait_task_inactive(). - */ - preempt_disable(); - complete(&sqd->parked); - schedule_preempt_disabled(); - preempt_enable(); - } - __set_current_state(TASK_RUNNING); -} - static int io_sq_thread(void *data) { struct io_sq_data *sqd = data; @@ -6681,31 +6694,32 @@ static int io_sq_thread(void *data) set_cpus_allowed_ptr(current, cpu_online_mask); current->flags |= PF_NO_SETAFFINITY; - wait_for_completion(&sqd->startup); + down_read(&sqd->rw_lock); - while (!io_sq_thread_should_stop(sqd)) { + while (!test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state)) { int ret; bool cap_entries, sqt_spin, needs_sched; - /* - * Any changes to the sqd lists are synchronized through the - * thread parking. This synchronizes the thread vs users, - * the users are synchronized on the sqd->ctx_lock. - */ - if (io_sq_thread_should_park(sqd)) { - io_sq_thread_parkme(sqd); - continue; - } - if (unlikely(!list_empty(&sqd->ctx_new_list))) { - io_sqd_init_new(sqd); + if (test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state)) { + up_read(&sqd->rw_lock); + cond_resched(); + down_read(&sqd->rw_lock); + io_run_task_work(); timeout = jiffies + sqd->sq_thread_idle; + continue; } if (fatal_signal_pending(current)) break; sqt_spin = false; cap_entries = !list_is_singular(&sqd->ctx_list); list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { + const struct cred *creds = NULL; + + if (ctx->sq_creds != current_cred()) + creds = override_creds(ctx->sq_creds); ret = __io_sq_thread(ctx, cap_entries); + if (creds) + revert_creds(creds); if (!sqt_spin && (ret > 0 || !list_empty(&ctx->iopoll_list))) sqt_spin = true; } @@ -6732,12 +6746,13 @@ static int io_sq_thread(void *data) } } - if (needs_sched && !io_sq_thread_should_park(sqd)) { + if (needs_sched && !test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state)) { list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) io_ring_set_wakeup_flag(ctx); + up_read(&sqd->rw_lock); schedule(); - try_to_freeze(); + down_read(&sqd->rw_lock); list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) io_ring_clear_wakeup_flag(ctx); } @@ -6745,32 +6760,23 @@ static int io_sq_thread(void *data) finish_wait(&sqd->wait, &wait); timeout = jiffies + sqd->sq_thread_idle; } + up_read(&sqd->rw_lock); + down_write(&sqd->rw_lock); + /* + * someone may have parked and added a cancellation task_work, run + * it first because we don't want it in io_uring_cancel_sqpoll() + */ + io_run_task_work(); list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) io_uring_cancel_sqpoll(ctx); + sqd->thread = NULL; + list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) + io_ring_set_wakeup_flag(ctx); + up_write(&sqd->rw_lock); io_run_task_work(); - - /* - * Ensure that we park properly if racing with someone trying to park - * while we're exiting. If we fail to grab the lock, check park and - * park if necessary. The ordering with the park bit and the lock - * ensures that we catch this reliably. - */ - if (!mutex_trylock(&sqd->lock)) { - if (io_sq_thread_should_park(sqd)) - io_sq_thread_parkme(sqd); - mutex_lock(&sqd->lock); - } - - sqd->thread = NULL; - list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { - ctx->sqo_exec = 1; - io_ring_set_wakeup_flag(ctx); - } - complete(&sqd->exited); - mutex_unlock(&sqd->lock); do_exit(0); } @@ -7069,44 +7075,37 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx) } static void io_sq_thread_unpark(struct io_sq_data *sqd) - __releases(&sqd->lock) + __releases(&sqd->rw_lock) { - if (sqd->thread == current) - return; + WARN_ON_ONCE(sqd->thread == current); + clear_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); - if (sqd->thread) - wake_up_state(sqd->thread, TASK_PARKED); - mutex_unlock(&sqd->lock); + up_write(&sqd->rw_lock); } static void io_sq_thread_park(struct io_sq_data *sqd) - __acquires(&sqd->lock) + __acquires(&sqd->rw_lock) { - if (sqd->thread == current) - return; + WARN_ON_ONCE(sqd->thread == current); + set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); - mutex_lock(&sqd->lock); - if (sqd->thread) { + down_write(&sqd->rw_lock); + /* set again for consistency, in case concurrent parks are happening */ + set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); + if (sqd->thread) wake_up_process(sqd->thread); - wait_for_completion(&sqd->parked); - } } static void io_sq_thread_stop(struct io_sq_data *sqd) { - if (test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state)) - return; - mutex_lock(&sqd->lock); - if (sqd->thread) { - set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); - WARN_ON_ONCE(test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state)); + WARN_ON_ONCE(sqd->thread == current); + + down_write(&sqd->rw_lock); + set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); + if (sqd->thread) wake_up_process(sqd->thread); - mutex_unlock(&sqd->lock); - wait_for_completion(&sqd->exited); - WARN_ON_ONCE(sqd->thread); - } else { - mutex_unlock(&sqd->lock); - } + up_write(&sqd->rw_lock); + wait_for_completion(&sqd->exited); } static void io_put_sq_data(struct io_sq_data *sqd) @@ -7122,22 +7121,15 @@ static void io_sq_thread_finish(struct io_ring_ctx *ctx) struct io_sq_data *sqd = ctx->sq_data; if (sqd) { - complete(&sqd->startup); - if (sqd->thread) { - wait_for_completion(&ctx->sq_thread_comp); - io_sq_thread_park(sqd); - } - - mutex_lock(&sqd->ctx_lock); - list_del(&ctx->sqd_list); + io_sq_thread_park(sqd); + list_del_init(&ctx->sqd_list); io_sqd_update_thread_idle(sqd); - mutex_unlock(&sqd->ctx_lock); - - if (sqd->thread) - io_sq_thread_unpark(sqd); + io_sq_thread_unpark(sqd); io_put_sq_data(sqd); ctx->sq_data = NULL; + if (ctx->sq_creds) + put_cred(ctx->sq_creds); } } @@ -7161,18 +7153,32 @@ static struct io_sq_data *io_attach_sq_data(struct io_uring_params *p) fdput(f); return ERR_PTR(-EINVAL); } + if (sqd->task_tgid != current->tgid) { + fdput(f); + return ERR_PTR(-EPERM); + } refcount_inc(&sqd->refs); fdput(f); return sqd; } -static struct io_sq_data *io_get_sq_data(struct io_uring_params *p) +static struct io_sq_data *io_get_sq_data(struct io_uring_params *p, + bool *attached) { struct io_sq_data *sqd; - if (p->flags & IORING_SETUP_ATTACH_WQ) - return io_attach_sq_data(p); + *attached = false; + if (p->flags & IORING_SETUP_ATTACH_WQ) { + sqd = io_attach_sq_data(p); + if (!IS_ERR(sqd)) { + *attached = true; + return sqd; + } + /* fall through for EPERM case, setup new sqd/task */ + if (PTR_ERR(sqd) != -EPERM) + return sqd; + } sqd = kzalloc(sizeof(*sqd), GFP_KERNEL); if (!sqd) @@ -7180,12 +7186,8 @@ static struct io_sq_data *io_get_sq_data(struct io_uring_params *p) refcount_set(&sqd->refs, 1); INIT_LIST_HEAD(&sqd->ctx_list); - INIT_LIST_HEAD(&sqd->ctx_new_list); - mutex_init(&sqd->ctx_lock); - mutex_init(&sqd->lock); + init_rwsem(&sqd->rw_lock); init_waitqueue_head(&sqd->wait); - init_completion(&sqd->startup); - init_completion(&sqd->parked); init_completion(&sqd->exited); return sqd; } @@ -7802,7 +7804,6 @@ static int io_uring_alloc_task_context(struct task_struct *task, init_waitqueue_head(&tctx->wait); tctx->last = NULL; atomic_set(&tctx->in_idle, 0); - tctx->sqpoll = false; task->io_uring = tctx; spin_lock_init(&tctx->task_lock); INIT_WQ_LIST(&tctx->task_list); @@ -7823,26 +7824,6 @@ void __io_uring_free(struct task_struct *tsk) tsk->io_uring = NULL; } -static int io_sq_thread_fork(struct io_sq_data *sqd, struct io_ring_ctx *ctx) -{ - struct task_struct *tsk; - int ret; - - clear_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); - reinit_completion(&sqd->parked); - ctx->sqo_exec = 0; - sqd->task_pid = current->pid; - tsk = create_io_thread(io_sq_thread, sqd, NUMA_NO_NODE); - if (IS_ERR(tsk)) - return PTR_ERR(tsk); - ret = io_uring_alloc_task_context(tsk, ctx); - if (ret) - set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); - sqd->thread = tsk; - wake_up_new_task(tsk); - return ret; -} - static int io_sq_offload_create(struct io_ring_ctx *ctx, struct io_uring_params *p) { @@ -7865,39 +7846,51 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx, if (ctx->flags & IORING_SETUP_SQPOLL) { struct task_struct *tsk; struct io_sq_data *sqd; + bool attached; ret = -EPERM; if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_NICE)) goto err; - sqd = io_get_sq_data(p); + sqd = io_get_sq_data(p, &attached); if (IS_ERR(sqd)) { ret = PTR_ERR(sqd); goto err; } + ctx->sq_creds = get_current_cred(); ctx->sq_data = sqd; - io_sq_thread_park(sqd); - mutex_lock(&sqd->ctx_lock); - list_add(&ctx->sqd_list, &sqd->ctx_new_list); - mutex_unlock(&sqd->ctx_lock); - io_sq_thread_unpark(sqd); - ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle); if (!ctx->sq_thread_idle) ctx->sq_thread_idle = HZ; - if (sqd->thread) + ret = 0; + io_sq_thread_park(sqd); + /* don't attach to a dying SQPOLL thread, would be racy */ + if (attached && !sqd->thread) { + ret = -ENXIO; + } else { + list_add(&ctx->sqd_list, &sqd->ctx_list); + io_sqd_update_thread_idle(sqd); + } + io_sq_thread_unpark(sqd); + + if (ret < 0) { + io_put_sq_data(sqd); + ctx->sq_data = NULL; + return ret; + } else if (attached) { return 0; + } if (p->flags & IORING_SETUP_SQ_AFF) { int cpu = p->sq_thread_cpu; ret = -EINVAL; if (cpu >= nr_cpu_ids) - goto err; + goto err_sqpoll; if (!cpu_online(cpu)) - goto err; + goto err_sqpoll; sqd->sq_cpu = cpu; } else { @@ -7905,15 +7898,15 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx, } sqd->task_pid = current->pid; + sqd->task_tgid = current->tgid; tsk = create_io_thread(io_sq_thread, sqd, NUMA_NO_NODE); if (IS_ERR(tsk)) { ret = PTR_ERR(tsk); - goto err; + goto err_sqpoll; } - ret = io_uring_alloc_task_context(tsk, ctx); - if (ret) - set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); + sqd->thread = tsk; + ret = io_uring_alloc_task_context(tsk, ctx); wake_up_new_task(tsk); if (ret) goto err; @@ -7927,15 +7920,9 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx, err: io_sq_thread_finish(ctx); return ret; -} - -static void io_sq_offload_start(struct io_ring_ctx *ctx) -{ - struct io_sq_data *sqd = ctx->sq_data; - - ctx->flags &= ~IORING_SETUP_R_DISABLED; - if (ctx->flags & IORING_SETUP_SQPOLL) - complete(&sqd->startup); +err_sqpoll: + complete(&ctx->sq_data->exited); + goto err; } static inline void __io_unaccount_mem(struct user_struct *user, @@ -8418,7 +8405,6 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx) mutex_unlock(&ctx->uring_lock); io_eventfd_unregister(ctx); io_destroy_buffers(ctx); - idr_destroy(&ctx->personality_idr); #if defined(CONFIG_UNIX) if (ctx->ring_sock) { @@ -8483,7 +8469,7 @@ static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id) { const struct cred *creds; - creds = idr_remove(&ctx->personality_idr, id); + creds = xa_erase(&ctx->personalities, id); if (creds) { put_cred(creds); return 0; @@ -8492,14 +8478,6 @@ static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id) return -EINVAL; } -static int io_remove_personalities(int id, void *p, void *data) -{ - struct io_ring_ctx *ctx = data; - - io_unregister_personality(ctx, id); - return 0; -} - static bool io_run_ctx_fallback(struct io_ring_ctx *ctx) { struct callback_head *work, *next; @@ -8522,10 +8500,34 @@ static bool io_run_ctx_fallback(struct io_ring_ctx *ctx) return executed; } +struct io_tctx_exit { + struct callback_head task_work; + struct completion completion; + struct io_ring_ctx *ctx; +}; + +static void io_tctx_exit_cb(struct callback_head *cb) +{ + struct io_uring_task *tctx = current->io_uring; + struct io_tctx_exit *work; + + work = container_of(cb, struct io_tctx_exit, task_work); + /* + * When @in_idle, we're in cancellation and it's racy to remove the + * node. It'll be removed by the end of cancellation, just ignore it. + */ + if (!atomic_read(&tctx->in_idle)) + io_uring_del_task_file((unsigned long)work->ctx); + complete(&work->completion); +} + static void io_ring_exit_work(struct work_struct *work) { - struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, - exit_work); + struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work); + unsigned long timeout = jiffies + HZ * 60 * 5; + struct io_tctx_exit exit; + struct io_tctx_node *node; + int ret; /* * If we're doing polled IO and end up having requests being @@ -8535,19 +8537,47 @@ static void io_ring_exit_work(struct work_struct *work) */ do { io_uring_try_cancel_requests(ctx, NULL, NULL); + + WARN_ON_ONCE(time_after(jiffies, timeout)); } while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20)); + + mutex_lock(&ctx->uring_lock); + while (!list_empty(&ctx->tctx_list)) { + WARN_ON_ONCE(time_after(jiffies, timeout)); + + node = list_first_entry(&ctx->tctx_list, struct io_tctx_node, + ctx_node); + exit.ctx = ctx; + init_completion(&exit.completion); + init_task_work(&exit.task_work, io_tctx_exit_cb); + ret = task_work_add(node->task, &exit.task_work, TWA_SIGNAL); + if (WARN_ON_ONCE(ret)) + continue; + wake_up_process(node->task); + + mutex_unlock(&ctx->uring_lock); + wait_for_completion(&exit.completion); + cond_resched(); + mutex_lock(&ctx->uring_lock); + } + mutex_unlock(&ctx->uring_lock); + io_ring_ctx_free(ctx); } static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) { + unsigned long index; + struct creds *creds; + mutex_lock(&ctx->uring_lock); percpu_ref_kill(&ctx->refs); /* if force is set, the ring is going away. always drop after that */ ctx->cq_overflow_flushed = 1; if (ctx->rings) __io_cqring_overflow_flush(ctx, true, NULL, NULL); - idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx); + xa_for_each(&ctx->personalities, index, creds) + io_unregister_personality(ctx, index); mutex_unlock(&ctx->uring_lock); io_kill_timeouts(ctx, NULL, NULL); @@ -8600,11 +8630,11 @@ static bool io_cancel_task_cb(struct io_wq_work *work, void *data) return ret; } -static void io_cancel_defer_files(struct io_ring_ctx *ctx, +static bool io_cancel_defer_files(struct io_ring_ctx *ctx, struct task_struct *task, struct files_struct *files) { - struct io_defer_entry *de = NULL; + struct io_defer_entry *de; LIST_HEAD(list); spin_lock_irq(&ctx->completion_lock); @@ -8615,6 +8645,8 @@ static void io_cancel_defer_files(struct io_ring_ctx *ctx, } } spin_unlock_irq(&ctx->completion_lock); + if (list_empty(&list)) + return false; while (!list_empty(&list)) { de = list_first_entry(&list, struct io_defer_entry, list); @@ -8624,6 +8656,38 @@ static void io_cancel_defer_files(struct io_ring_ctx *ctx, io_req_complete(de->req, -ECANCELED); kfree(de); } + return true; +} + +static bool io_cancel_ctx_cb(struct io_wq_work *work, void *data) +{ + struct io_kiocb *req = container_of(work, struct io_kiocb, work); + + return req->ctx == data; +} + +static bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx) +{ + struct io_tctx_node *node; + enum io_wq_cancel cret; + bool ret = false; + + mutex_lock(&ctx->uring_lock); + list_for_each_entry(node, &ctx->tctx_list, ctx_node) { + struct io_uring_task *tctx = node->task->io_uring; + + /* + * io_wq will stay alive while we hold uring_lock, because it's + * killed after ctx nodes, which requires to take the lock. + */ + if (!tctx || !tctx->io_wq) + continue; + cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_ctx_cb, ctx, true); + ret |= (cret != IO_WQ_CANCEL_NOTFOUND); + } + mutex_unlock(&ctx->uring_lock); + + return ret; } static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, @@ -8631,27 +8695,34 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, struct files_struct *files) { struct io_task_cancel cancel = { .task = task, .files = files, }; - struct task_struct *tctx_task = task ?: current; - struct io_uring_task *tctx = tctx_task->io_uring; + struct io_uring_task *tctx = task ? task->io_uring : NULL; while (1) { enum io_wq_cancel cret; bool ret = false; - if (tctx && tctx->io_wq) { + if (!task) { + ret |= io_uring_try_cancel_iowq(ctx); + } else if (tctx && tctx->io_wq) { + /* + * Cancels requests of all rings, not only @ctx, but + * it's fine as the task is in exit/exec. + */ cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb, &cancel, true); ret |= (cret != IO_WQ_CANCEL_NOTFOUND); } /* SQPOLL thread does its own polling */ - if (!(ctx->flags & IORING_SETUP_SQPOLL) && !files) { + if ((!(ctx->flags & IORING_SETUP_SQPOLL) && !files) || + (ctx->sq_data && ctx->sq_data->thread == current)) { while (!list_empty_careful(&ctx->iopoll_list)) { io_iopoll_try_reap_events(ctx); ret = true; } } + ret |= io_cancel_defer_files(ctx, task, files); ret |= io_poll_remove_all(ctx, task, files); ret |= io_kill_timeouts(ctx, task, files); ret |= io_run_task_work(); @@ -8691,58 +8762,21 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx, io_uring_try_cancel_requests(ctx, task, files); - if (ctx->sq_data) - io_sq_thread_unpark(ctx->sq_data); prepare_to_wait(&task->io_uring->wait, &wait, TASK_UNINTERRUPTIBLE); if (inflight == io_uring_count_inflight(ctx, task, files)) schedule(); finish_wait(&task->io_uring->wait, &wait); - if (ctx->sq_data) - io_sq_thread_park(ctx->sq_data); } } -/* - * We need to iteratively cancel requests, in case a request has dependent - * hard links. These persist even for failure of cancelations, hence keep - * looping until none are found. - */ -static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx, - struct files_struct *files) -{ - struct task_struct *task = current; - - if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) { - /* never started, nothing to cancel */ - if (ctx->flags & IORING_SETUP_R_DISABLED) { - io_sq_offload_start(ctx); - return; - } - io_sq_thread_park(ctx->sq_data); - task = ctx->sq_data->thread; - if (task) - atomic_inc(&task->io_uring->in_idle); - } - - io_cancel_defer_files(ctx, task, files); - - io_uring_cancel_files(ctx, task, files); - if (!files) - io_uring_try_cancel_requests(ctx, task, NULL); - - if (task) - atomic_dec(&task->io_uring->in_idle); - if (ctx->sq_data) - io_sq_thread_unpark(ctx->sq_data); -} - /* * Note that this task has used io_uring. We use it for cancelation purposes. */ -static int io_uring_add_task_file(struct io_ring_ctx *ctx, struct file *file) +static int io_uring_add_task_file(struct io_ring_ctx *ctx) { struct io_uring_task *tctx = current->io_uring; + struct io_tctx_node *node; int ret; if (unlikely(!tctx)) { @@ -8751,102 +8785,151 @@ static int io_uring_add_task_file(struct io_ring_ctx *ctx, struct file *file) return ret; tctx = current->io_uring; } - if (tctx->last != file) { - void *old = xa_load(&tctx->xa, (unsigned long)file); + if (tctx->last != ctx) { + void *old = xa_load(&tctx->xa, (unsigned long)ctx); if (!old) { - get_file(file); - ret = xa_err(xa_store(&tctx->xa, (unsigned long)file, - file, GFP_KERNEL)); + node = kmalloc(sizeof(*node), GFP_KERNEL); + if (!node) + return -ENOMEM; + node->ctx = ctx; + node->task = current; + + ret = xa_err(xa_store(&tctx->xa, (unsigned long)ctx, + node, GFP_KERNEL)); if (ret) { - fput(file); + kfree(node); return ret; } + + mutex_lock(&ctx->uring_lock); + list_add(&node->ctx_node, &ctx->tctx_list); + mutex_unlock(&ctx->uring_lock); } - tctx->last = file; + tctx->last = ctx; } - - /* - * This is race safe in that the task itself is doing this, hence it - * cannot be going through the exit/cancel paths at the same time. - * This cannot be modified while exit/cancel is running. - */ - if (!tctx->sqpoll && (ctx->flags & IORING_SETUP_SQPOLL)) - tctx->sqpoll = true; - return 0; } /* * Remove this io_uring_file -> task mapping. */ -static void io_uring_del_task_file(struct file *file) +static void io_uring_del_task_file(unsigned long index) { struct io_uring_task *tctx = current->io_uring; + struct io_tctx_node *node; - if (tctx->last == file) + if (!tctx) + return; + node = xa_erase(&tctx->xa, index); + if (!node) + return; + + WARN_ON_ONCE(current != node->task); + WARN_ON_ONCE(list_empty(&node->ctx_node)); + + mutex_lock(&node->ctx->uring_lock); + list_del(&node->ctx_node); + mutex_unlock(&node->ctx->uring_lock); + + if (tctx->last == node->ctx) tctx->last = NULL; - file = xa_erase(&tctx->xa, (unsigned long)file); - if (file) - fput(file); + kfree(node); } static void io_uring_clean_tctx(struct io_uring_task *tctx) { - struct file *file; + struct io_tctx_node *node; unsigned long index; - xa_for_each(&tctx->xa, index, file) - io_uring_del_task_file(file); + xa_for_each(&tctx->xa, index, node) + io_uring_del_task_file(index); if (tctx->io_wq) { io_wq_put_and_exit(tctx->io_wq); tctx->io_wq = NULL; } } -void __io_uring_files_cancel(struct files_struct *files) -{ - struct io_uring_task *tctx = current->io_uring; - struct file *file; - unsigned long index; - - /* make sure overflow events are dropped */ - atomic_inc(&tctx->in_idle); - xa_for_each(&tctx->xa, index, file) - io_uring_cancel_task_requests(file->private_data, files); - atomic_dec(&tctx->in_idle); - - if (files) - io_uring_clean_tctx(tctx); -} - static s64 tctx_inflight(struct io_uring_task *tctx) { return percpu_counter_sum(&tctx->inflight); } +static void io_sqpoll_cancel_cb(struct callback_head *cb) +{ + struct io_tctx_exit *work = container_of(cb, struct io_tctx_exit, task_work); + struct io_ring_ctx *ctx = work->ctx; + struct io_sq_data *sqd = ctx->sq_data; + + if (sqd->thread) + io_uring_cancel_sqpoll(ctx); + complete(&work->completion); +} + +static void io_sqpoll_cancel_sync(struct io_ring_ctx *ctx) +{ + struct io_sq_data *sqd = ctx->sq_data; + struct io_tctx_exit work = { .ctx = ctx, }; + struct task_struct *task; + + io_sq_thread_park(sqd); + list_del_init(&ctx->sqd_list); + io_sqd_update_thread_idle(sqd); + task = sqd->thread; + if (task) { + init_completion(&work.completion); + init_task_work(&work.task_work, io_sqpoll_cancel_cb); + WARN_ON_ONCE(task_work_add(task, &work.task_work, TWA_SIGNAL)); + wake_up_process(task); + } + io_sq_thread_unpark(sqd); + + if (task) + wait_for_completion(&work.completion); +} + +void __io_uring_files_cancel(struct files_struct *files) +{ + struct io_uring_task *tctx = current->io_uring; + struct io_tctx_node *node; + unsigned long index; + + /* make sure overflow events are dropped */ + atomic_inc(&tctx->in_idle); + xa_for_each(&tctx->xa, index, node) { + struct io_ring_ctx *ctx = node->ctx; + + if (ctx->sq_data) { + io_sqpoll_cancel_sync(ctx); + continue; + } + io_uring_cancel_files(ctx, current, files); + if (!files) + io_uring_try_cancel_requests(ctx, current, NULL); + } + atomic_dec(&tctx->in_idle); + + if (files) + io_uring_clean_tctx(tctx); +} + +/* should only be called by SQPOLL task */ static void io_uring_cancel_sqpoll(struct io_ring_ctx *ctx) { struct io_sq_data *sqd = ctx->sq_data; - struct io_uring_task *tctx; + struct io_uring_task *tctx = current->io_uring; s64 inflight; DEFINE_WAIT(wait); - if (!sqd) - return; - io_sq_thread_park(sqd); - if (!sqd->thread || !sqd->thread->io_uring) { - io_sq_thread_unpark(sqd); - return; - } - tctx = ctx->sq_data->thread->io_uring; + WARN_ON_ONCE(!sqd || ctx->sq_data->thread != current); + atomic_inc(&tctx->in_idle); do { /* read completions before cancelations */ inflight = tctx_inflight(tctx); if (!inflight) break; - io_uring_cancel_task_requests(ctx, NULL); + io_uring_try_cancel_requests(ctx, current, NULL); prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE); /* @@ -8859,7 +8942,6 @@ static void io_uring_cancel_sqpoll(struct io_ring_ctx *ctx) finish_wait(&tctx->wait, &wait); } while (1); atomic_dec(&tctx->in_idle); - io_sq_thread_unpark(sqd); } /* @@ -8874,15 +8956,6 @@ void __io_uring_task_cancel(void) /* make sure overflow events are dropped */ atomic_inc(&tctx->in_idle); - - if (tctx->sqpoll) { - struct file *file; - unsigned long index; - - xa_for_each(&tctx->xa, index, file) - io_uring_cancel_sqpoll(file->private_data); - } - do { /* read completions before cancelations */ inflight = tctx_inflight(tctx); @@ -8981,7 +9054,6 @@ static unsigned long io_uring_nommu_get_unmapped_area(struct file *file, static int io_sqpoll_wait_sq(struct io_ring_ctx *ctx) { - int ret = 0; DEFINE_WAIT(wait); do { @@ -8995,7 +9067,7 @@ static int io_sqpoll_wait_sq(struct io_ring_ctx *ctx) } while (!signal_pending(current)); finish_wait(&ctx->sqo_sq_wait, &wait); - return ret; + return 0; } static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz, @@ -9069,13 +9141,10 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, if (ctx->flags & IORING_SETUP_SQPOLL) { io_cqring_overflow_flush(ctx, false, NULL, NULL); - if (unlikely(ctx->sqo_exec)) { - ret = io_sq_thread_fork(ctx->sq_data, ctx); - if (ret) - goto out; - ctx->sqo_exec = 0; - } ret = -EOWNERDEAD; + if (unlikely(ctx->sq_data->thread == NULL)) { + goto out; + } if (flags & IORING_ENTER_SQ_WAKEUP) wake_up(&ctx->sq_data->wait); if (flags & IORING_ENTER_SQ_WAIT) { @@ -9085,7 +9154,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, } submitted = to_submit; } else if (to_submit) { - ret = io_uring_add_task_file(ctx, f.file); + ret = io_uring_add_task_file(ctx); if (unlikely(ret)) goto out; mutex_lock(&ctx->uring_lock); @@ -9127,10 +9196,9 @@ out_fput: } #ifdef CONFIG_PROC_FS -static int io_uring_show_cred(int id, void *p, void *data) +static int io_uring_show_cred(struct seq_file *m, unsigned int id, + const struct cred *cred) { - const struct cred *cred = p; - struct seq_file *m = data; struct user_namespace *uns = seq_user_ns(m); struct group_info *gi; kernel_cap_t cap; @@ -9198,9 +9266,13 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m) seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, (unsigned int) buf->len); } - if (has_lock && !idr_is_empty(&ctx->personality_idr)) { + if (has_lock && !xa_empty(&ctx->personalities)) { + unsigned long index; + const struct cred *cred; + seq_printf(m, "Personalities:\n"); - idr_for_each(&ctx->personality_idr, io_uring_show_cred, m); + xa_for_each(&ctx->personalities, index, cred) + io_uring_show_cred(m, index, cred); } seq_printf(m, "PollList:\n"); spin_lock_irq(&ctx->completion_lock); @@ -9294,7 +9366,7 @@ static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file) if (fd < 0) return fd; - ret = io_uring_add_task_file(ctx, file); + ret = io_uring_add_task_file(ctx); if (ret) { put_unused_fd(fd); return ret; @@ -9402,9 +9474,6 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, if (ret) goto err; - if (!(p->flags & IORING_SETUP_R_DISABLED)) - io_sq_offload_start(ctx); - memset(&p->sq_off, 0, sizeof(p->sq_off)); p->sq_off.head = offsetof(struct io_rings, sq.head); p->sq_off.tail = offsetof(struct io_rings, sq.tail); @@ -9532,14 +9601,16 @@ out: static int io_register_personality(struct io_ring_ctx *ctx) { const struct cred *creds; + u32 id; int ret; creds = get_current_cred(); - ret = idr_alloc_cyclic(&ctx->personality_idr, (void *) creds, 1, - USHRT_MAX, GFP_KERNEL); - if (ret < 0) - put_cred(creds); + ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds, + XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL); + if (!ret) + return id; + put_cred(creds); return ret; } @@ -9621,7 +9692,9 @@ static int io_register_enable_rings(struct io_ring_ctx *ctx) if (ctx->restrictions.registered) ctx->restricted = 1; - io_sq_offload_start(ctx); + ctx->flags &= ~IORING_SETUP_R_DISABLED; + if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait)) + wake_up(&ctx->sq_data->wait); return 0; } diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 7ffcd7ef33d4..414769a6ad11 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1221,7 +1221,7 @@ iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc, struct iomap_ioend *ioend; struct bio *bio; - bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &iomap_ioend_bioset); + bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_VECS, &iomap_ioend_bioset); bio_set_dev(bio, wpc->iomap.bdev); bio->bi_iter.bi_sector = sector; bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc); @@ -1252,7 +1252,7 @@ iomap_chain_bio(struct bio *prev) { struct bio *new; - new = bio_alloc(GFP_NOFS, BIO_MAX_PAGES); + new = bio_alloc(GFP_NOFS, BIO_MAX_VECS); bio_copy_dev(new, prev);/* also copies over blkcg information */ new->bi_iter.bi_sector = bio_end_sector(prev); new->bi_opf = prev->bi_opf; diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 4e339bba6afb..c86757f3e244 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -300,7 +300,7 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, */ bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua); - nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_PAGES); + nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS); do { size_t n; if (dio->error) { @@ -344,7 +344,7 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, copied += n; nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, - BIO_MAX_PAGES); + BIO_MAX_VECS); iomap_dio_submit_bio(dio, iomap, bio, pos); pos += n; } while (nr_pages); diff --git a/fs/mpage.c b/fs/mpage.c index 00ac5c329b59..af32fda87896 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -652,7 +652,7 @@ alloc_new: goto out; } bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9), - BIO_MAX_PAGES, GFP_NOFS|__GFP_HIGH); + BIO_MAX_VECS, GFP_NOFS|__GFP_HIGH); if (bio == NULL) goto confused; diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index e2a488d403a6..14a72224b657 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -127,7 +127,7 @@ config PNFS_BLOCK config PNFS_FLEXFILE_LAYOUT tristate depends on NFS_V4_1 && NFS_V3 - default m + default NFS_V4 config NFS_V4_1_IMPLEMENTATION_ID_DOMAIN string "NFSv4.1 Implementation ID Domain" diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 19a9f434442f..fc4f490f2d78 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -81,8 +81,9 @@ static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir spin_lock(&dir->i_lock); if (list_empty(&nfsi->open_files) && (nfsi->cache_validity & NFS_INO_DATA_INVAL_DEFER)) - nfsi->cache_validity |= NFS_INO_INVALID_DATA | - NFS_INO_REVAL_FORCED; + nfs_set_cache_invalid(dir, + NFS_INO_INVALID_DATA | + NFS_INO_REVAL_FORCED); list_add(&ctx->list, &nfsi->open_files); spin_unlock(&dir->i_lock); return ctx; @@ -1401,6 +1402,13 @@ out_force: goto out; } +static void nfs_mark_dir_for_revalidate(struct inode *inode) +{ + spin_lock(&inode->i_lock); + nfs_set_cache_invalid(inode, NFS_INO_REVAL_PAGECACHE); + spin_unlock(&inode->i_lock); +} + /* * We judge how long we want to trust negative * dentries by looking at the parent inode mtime. @@ -1435,19 +1443,14 @@ nfs_lookup_revalidate_done(struct inode *dir, struct dentry *dentry, __func__, dentry); return 1; case 0: - nfs_mark_for_revalidate(dir); - if (inode && S_ISDIR(inode->i_mode)) { - /* Purge readdir caches. */ - nfs_zap_caches(inode); - /* - * We can't d_drop the root of a disconnected tree: - * its d_hash is on the s_anon list and d_drop() would hide - * it from shrink_dcache_for_unmount(), leading to busy - * inodes on unmount and further oopses. - */ - if (IS_ROOT(dentry)) - return 1; - } + /* + * We can't d_drop the root of a disconnected tree: + * its d_hash is on the s_anon list and d_drop() would hide + * it from shrink_dcache_for_unmount(), leading to busy + * inodes on unmount and further oopses. + */ + if (inode && IS_ROOT(dentry)) + return 1; dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is invalid\n", __func__, dentry); return 0; @@ -1525,6 +1528,13 @@ out: nfs_free_fattr(fattr); nfs_free_fhandle(fhandle); nfs4_label_free(label); + + /* + * If the lookup failed despite the dentry change attribute being + * a match, then we should revalidate the directory cache. + */ + if (!ret && nfs_verify_change_attribute(dir, dentry->d_time)) + nfs_mark_dir_for_revalidate(dir); return nfs_lookup_revalidate_done(dir, dentry, inode, ret); } @@ -1567,7 +1577,7 @@ nfs_do_lookup_revalidate(struct inode *dir, struct dentry *dentry, error = nfs_lookup_verify_inode(inode, flags); if (error) { if (error == -ESTALE) - nfs_zap_caches(dir); + nfs_mark_dir_for_revalidate(dir); goto out_bad; } nfs_advise_use_readdirplus(dir); @@ -1691,10 +1701,9 @@ static void nfs_drop_nlink(struct inode *inode) if (inode->i_nlink > 0) drop_nlink(inode); NFS_I(inode)->attr_gencount = nfs_inc_attr_generation_counter(); - NFS_I(inode)->cache_validity |= NFS_INO_INVALID_CHANGE - | NFS_INO_INVALID_CTIME - | NFS_INO_INVALID_OTHER - | NFS_INO_REVAL_FORCED; + nfs_set_cache_invalid( + inode, NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_CTIME | + NFS_INO_INVALID_OTHER | NFS_INO_REVAL_FORCED); spin_unlock(&inode->i_lock); } @@ -1706,7 +1715,7 @@ static void nfs_dentry_iput(struct dentry *dentry, struct inode *inode) { if (S_ISDIR(inode->i_mode)) /* drop any readdir cache as it could easily be old */ - NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA; + nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA); if (dentry->d_flags & DCACHE_NFSFS_RENAMED) { nfs_complete_unlink(dentry, inode); @@ -2064,7 +2073,6 @@ out: dput(parent); return d; out_error: - nfs_mark_for_revalidate(dir); d = ERR_PTR(error); goto out; } @@ -2473,9 +2481,9 @@ int nfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, if (error == 0) { spin_lock(&old_inode->i_lock); NFS_I(old_inode)->attr_gencount = nfs_inc_attr_generation_counter(); - NFS_I(old_inode)->cache_validity |= NFS_INO_INVALID_CHANGE - | NFS_INO_INVALID_CTIME - | NFS_INO_REVAL_FORCED; + nfs_set_cache_invalid(old_inode, NFS_INO_INVALID_CHANGE | + NFS_INO_INVALID_CTIME | + NFS_INO_REVAL_FORCED); spin_unlock(&old_inode->i_lock); } out: diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 749bbea14d99..a7fb076a5f44 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -207,7 +207,7 @@ static bool nfs_has_xattr_cache(const struct nfs_inode *nfsi) } #endif -static void nfs_set_cache_invalid(struct inode *inode, unsigned long flags) +void nfs_set_cache_invalid(struct inode *inode, unsigned long flags) { struct nfs_inode *nfsi = NFS_I(inode); bool have_delegation = NFS_PROTO(inode)->have_delegation(inode, FMODE_READ); @@ -229,6 +229,7 @@ static void nfs_set_cache_invalid(struct inode *inode, unsigned long flags) if (flags & NFS_INO_INVALID_DATA) nfs_fscache_invalidate(inode); } +EXPORT_SYMBOL_GPL(nfs_set_cache_invalid); /* * Invalidate the local caches @@ -1067,8 +1068,8 @@ void nfs_inode_attach_open_context(struct nfs_open_context *ctx) spin_lock(&inode->i_lock); if (list_empty(&nfsi->open_files) && (nfsi->cache_validity & NFS_INO_DATA_INVAL_DEFER)) - nfsi->cache_validity |= NFS_INO_INVALID_DATA | - NFS_INO_REVAL_FORCED; + nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA | + NFS_INO_REVAL_FORCED); list_add_tail_rcu(&ctx->list, &nfsi->open_files); spin_unlock(&inode->i_lock); } diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 25fb43b69e5a..7b644d6c09e4 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -411,7 +411,8 @@ extern int nfs_write_inode(struct inode *, struct writeback_control *); extern int nfs_drop_inode(struct inode *); extern void nfs_clear_inode(struct inode *); extern void nfs_evict_inode(struct inode *); -void nfs_zap_acl_cache(struct inode *inode); +extern void nfs_zap_acl_cache(struct inode *inode); +extern void nfs_set_cache_invalid(struct inode *inode, unsigned long flags); extern bool nfs_check_cache_invalid(struct inode *, unsigned long); extern int nfs_wait_bit_killable(struct wait_bit_key *key, int mode); extern int nfs_wait_atomic_killable(atomic_t *p, unsigned int mode); diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index ca10072644ff..ed1c83738c30 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -36,6 +36,7 @@ #define NFS3_pagepad_sz (1) /* Page padding */ #define NFS3_fhandle_sz (1+16) #define NFS3_fh_sz (NFS3_fhandle_sz) /* shorthand */ +#define NFS3_post_op_fh_sz (1+NFS3_fh_sz) #define NFS3_sattr_sz (15) #define NFS3_filename_sz (1+(NFS3_MAXNAMLEN>>2)) #define NFS3_path_sz (1+(NFS3_MAXPATHLEN>>2)) @@ -73,7 +74,7 @@ #define NFS3_readlinkres_sz (1+NFS3_post_op_attr_sz+1+NFS3_pagepad_sz) #define NFS3_readres_sz (1+NFS3_post_op_attr_sz+3+NFS3_pagepad_sz) #define NFS3_writeres_sz (1+NFS3_wcc_data_sz+4) -#define NFS3_createres_sz (1+NFS3_fh_sz+NFS3_post_op_attr_sz+NFS3_wcc_data_sz) +#define NFS3_createres_sz (1+NFS3_post_op_fh_sz+NFS3_post_op_attr_sz+NFS3_wcc_data_sz) #define NFS3_renameres_sz (1+(2 * NFS3_wcc_data_sz)) #define NFS3_linkres_sz (1+NFS3_post_op_attr_sz+NFS3_wcc_data_sz) #define NFS3_readdirres_sz (1+NFS3_post_op_attr_sz+2+NFS3_pagepad_sz) diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index f3fd935620fc..094024b0aca1 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -357,13 +357,15 @@ static ssize_t _nfs42_proc_copy(struct file *src, truncate_pagecache_range(dst_inode, pos_dst, pos_dst + res->write_res.count); spin_lock(&dst_inode->i_lock); - NFS_I(dst_inode)->cache_validity |= (NFS_INO_REVAL_PAGECACHE | - NFS_INO_REVAL_FORCED | NFS_INO_INVALID_SIZE | - NFS_INO_INVALID_ATTR | NFS_INO_INVALID_DATA); + nfs_set_cache_invalid( + dst_inode, NFS_INO_REVAL_PAGECACHE | NFS_INO_REVAL_FORCED | + NFS_INO_INVALID_SIZE | NFS_INO_INVALID_ATTR | + NFS_INO_INVALID_DATA); spin_unlock(&dst_inode->i_lock); spin_lock(&src_inode->i_lock); - NFS_I(src_inode)->cache_validity |= (NFS_INO_REVAL_PAGECACHE | - NFS_INO_REVAL_FORCED | NFS_INO_INVALID_ATIME); + nfs_set_cache_invalid(src_inode, NFS_INO_REVAL_PAGECACHE | + NFS_INO_REVAL_FORCED | + NFS_INO_INVALID_ATIME); spin_unlock(&src_inode->i_lock); status = res->write_res.count; out: diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 74bc5120013d..c65c4b41e2c1 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1169,14 +1169,14 @@ int nfs4_call_sync(struct rpc_clnt *clnt, static void nfs4_inc_nlink_locked(struct inode *inode) { - NFS_I(inode)->cache_validity |= NFS_INO_INVALID_OTHER; + nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER); inc_nlink(inode); } static void nfs4_dec_nlink_locked(struct inode *inode) { - NFS_I(inode)->cache_validity |= NFS_INO_INVALID_OTHER; + nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER); drop_nlink(inode); } @@ -1187,35 +1187,31 @@ nfs4_update_changeattr_locked(struct inode *inode, { struct nfs_inode *nfsi = NFS_I(inode); - nfsi->cache_validity |= NFS_INO_INVALID_CTIME - | NFS_INO_INVALID_MTIME - | cache_validity; + cache_validity |= NFS_INO_INVALID_CTIME | NFS_INO_INVALID_MTIME; if (cinfo->atomic && cinfo->before == inode_peek_iversion_raw(inode)) { nfsi->cache_validity &= ~NFS_INO_REVAL_PAGECACHE; nfsi->attrtimeo_timestamp = jiffies; } else { if (S_ISDIR(inode->i_mode)) { - nfsi->cache_validity |= NFS_INO_INVALID_DATA; + cache_validity |= NFS_INO_INVALID_DATA; nfs_force_lookup_revalidate(inode); } else { if (!NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) - nfsi->cache_validity |= NFS_INO_REVAL_PAGECACHE; + cache_validity |= NFS_INO_REVAL_PAGECACHE; } if (cinfo->before != inode_peek_iversion_raw(inode)) - nfsi->cache_validity |= NFS_INO_INVALID_ACCESS | - NFS_INO_INVALID_ACL | - NFS_INO_INVALID_XATTR; + cache_validity |= NFS_INO_INVALID_ACCESS | + NFS_INO_INVALID_ACL | + NFS_INO_INVALID_XATTR; } inode_set_iversion_raw(inode, cinfo->after); nfsi->read_cache_jiffies = timestamp; nfsi->attr_gencount = nfs_inc_attr_generation_counter(); + nfs_set_cache_invalid(inode, cache_validity); nfsi->cache_validity &= ~NFS_INO_INVALID_CHANGE; - - if (nfsi->cache_validity & NFS_INO_INVALID_DATA) - nfs_fscache_invalidate(inode); } void @@ -5893,6 +5889,9 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl unsigned int npages = DIV_ROUND_UP(buflen, PAGE_SIZE); int ret, i; + /* You can't remove system.nfs4_acl: */ + if (buflen == 0) + return -EINVAL; if (!nfs4_server_supports_acls(server)) return -EOPNOTSUPP; if (npages > ARRAY_SIZE(pages)) @@ -5915,9 +5914,9 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl * so mark the attribute cache invalid. */ spin_lock(&inode->i_lock); - NFS_I(inode)->cache_validity |= NFS_INO_INVALID_CHANGE - | NFS_INO_INVALID_CTIME - | NFS_INO_REVAL_FORCED; + nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE | + NFS_INO_INVALID_CTIME | + NFS_INO_REVAL_FORCED); spin_unlock(&inode->i_lock); nfs_access_zap_cache(inode); nfs_zap_acl_cache(inode); @@ -5969,7 +5968,7 @@ static int _nfs4_get_security_label(struct inode *inode, void *buf, return ret; if (!(fattr.valid & NFS_ATTR_FATTR_V4_SECURITY_LABEL)) return -ENOENT; - return 0; + return label.len; } static int nfs4_get_security_label(struct inode *inode, void *buf, diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index b27ebdccef70..5fa11e1aca4c 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -500,9 +500,9 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry) nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); spin_lock(&inode->i_lock); NFS_I(inode)->attr_gencount = nfs_inc_attr_generation_counter(); - NFS_I(inode)->cache_validity |= NFS_INO_INVALID_CHANGE - | NFS_INO_INVALID_CTIME - | NFS_INO_REVAL_FORCED; + nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE | + NFS_INO_INVALID_CTIME | + NFS_INO_REVAL_FORCED); spin_unlock(&inode->i_lock); d_move(dentry, sdentry); break; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 82bdcb982186..f05a90338a76 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -303,9 +303,9 @@ static void nfs_set_pageerror(struct address_space *mapping) nfs_zap_mapping(mapping->host, mapping); /* Force file size revalidation */ spin_lock(&inode->i_lock); - NFS_I(inode)->cache_validity |= NFS_INO_REVAL_FORCED | - NFS_INO_REVAL_PAGECACHE | - NFS_INO_INVALID_SIZE; + nfs_set_cache_invalid(inode, NFS_INO_REVAL_FORCED | + NFS_INO_REVAL_PAGECACHE | + NFS_INO_INVALID_SIZE); spin_unlock(&inode->i_lock); } @@ -1604,7 +1604,7 @@ static int nfs_writeback_done(struct rpc_task *task, /* Deal with the suid/sgid bit corner case */ if (nfs_should_remove_suid(inode)) { spin_lock(&inode->i_lock); - NFS_I(inode)->cache_validity |= NFS_INO_INVALID_OTHER; + nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER); spin_unlock(&inode->i_lock); } return 0; diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index 1e75417bfe6e..56872e93823d 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -399,7 +399,7 @@ static void nilfs_segbuf_prepare_write(struct nilfs_segment_buffer *segbuf, { wi->bio = NULL; wi->rest_blocks = segbuf->sb_sum.nblocks; - wi->max_pages = BIO_MAX_PAGES; + wi->max_pages = BIO_MAX_VECS; wi->nr_vecs = min(wi->max_pages, wi->rest_blocks); wi->start = wi->end = 0; wi->blocknr = segbuf->sb_pseg_start; diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index 45f44425d856..b9e87ebb1060 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -87,7 +87,7 @@ static int squashfs_bio_read(struct super_block *sb, u64 index, int length, int error, i; struct bio *bio; - if (page_count <= BIO_MAX_PAGES) + if (page_count <= BIO_MAX_VECS) bio = bio_alloc(GFP_NOIO, page_count); else bio = bio_kmalloc(GFP_NOIO, page_count); diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index b6ff4a21abac..0fe76f376dee 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -684,7 +684,7 @@ static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from) max = ALIGN_DOWN(max << SECTOR_SHIFT, inode->i_sb->s_blocksize); iov_iter_truncate(from, max); - nr_pages = iov_iter_npages(from, BIO_MAX_PAGES); + nr_pages = iov_iter_npages(from, BIO_MAX_VECS); if (!nr_pages) return 0; diff --git a/include/linux/bio.h b/include/linux/bio.h index 983ed2fe7c85..d0246c92a6e8 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -20,11 +20,11 @@ #define BIO_BUG_ON #endif -#define BIO_MAX_PAGES 256U +#define BIO_MAX_VECS 256U static inline unsigned int bio_max_segs(unsigned int nr_segs) { - return min(nr_segs, BIO_MAX_PAGES); + return min(nr_segs, BIO_MAX_VECS); } #define bio_prio(bio) (bio)->bi_ioprio diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index 7cb7bd0e334c..9761a0ec9f95 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -18,7 +18,7 @@ struct io_uring_task { /* submission side */ struct xarray xa; struct wait_queue_head wait; - struct file *last; + void *last; void *io_wq; struct percpu_counter inflight; atomic_t in_idle; diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index cdfc4e9f253e..5e772392a379 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -904,6 +904,10 @@ static inline void ptep_modify_prot_commit(struct vm_area_struct *vma, #define pgprot_device pgprot_noncached #endif +#ifndef pgprot_mhp +#define pgprot_mhp(prot) (prot) +#endif + #ifdef CONFIG_MMU #ifndef pgprot_modify #define pgprot_modify pgprot_modify diff --git a/include/linux/property.h b/include/linux/property.h index dafccfce0262..dd4687b56239 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -488,7 +488,7 @@ fwnode_create_software_node(const struct property_entry *properties, const struct fwnode_handle *parent); void fwnode_remove_software_node(struct fwnode_handle *fwnode); -int device_add_software_node(struct device *dev, const struct software_node *swnode); +int device_add_software_node(struct device *dev, const struct software_node *node); void device_remove_software_node(struct device *dev); int device_create_managed_software_node(struct device *dev, diff --git a/include/linux/regulator/pca9450.h b/include/linux/regulator/pca9450.h index ccdb5320a240..71902f41c919 100644 --- a/include/linux/regulator/pca9450.h +++ b/include/linux/regulator/pca9450.h @@ -147,6 +147,9 @@ enum { #define BUCK6_FPWM 0x04 #define BUCK6_ENMODE_MASK 0x03 +/* PCA9450_REG_BUCK123_PRESET_EN bit */ +#define BUCK123_PRESET_EN 0x80 + /* PCA9450_BUCK1OUT_DVS0 bits */ #define BUCK1OUT_DVS0_MASK 0x7F #define BUCK1OUT_DVS0_DEFAULT 0x14 diff --git a/include/xen/grant_table.h b/include/xen/grant_table.h index 0b1182a3cf41..cb854df031ce 100644 --- a/include/xen/grant_table.h +++ b/include/xen/grant_table.h @@ -50,6 +50,13 @@ #include #include +/* + * Technically there's no reliably invalid grant reference or grant handle, + * so pick the value that is the most unlikely one to be observed valid. + */ +#define INVALID_GRANT_REF ((grant_ref_t)-1) +#define INVALID_GRANT_HANDLE ((grant_handle_t)-1) + #define GNTTAB_RESERVED_XENSTORE 1 /* NR_GRANT_FRAMES must be less than or equal to that configured in Xen */ diff --git a/include/xen/xenbus.h b/include/xen/xenbus.h index 0b1386073d49..b94074c82772 100644 --- a/include/xen/xenbus.h +++ b/include/xen/xenbus.h @@ -51,7 +51,6 @@ #define XENBUS_MAX_RING_GRANT_ORDER 4 #define XENBUS_MAX_RING_GRANTS (1U << XENBUS_MAX_RING_GRANT_ORDER) -#define INVALID_GRANT_HANDLE (~0U) /* Register callback to watch this node. */ struct xenbus_watch diff --git a/kernel/fork.c b/kernel/fork.c index 7a2b4634d198..6cd024f16630 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2441,6 +2441,7 @@ struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node) if (!IS_ERR(tsk)) { sigfillset(&tsk->blocked); sigdelsetmask(&tsk->blocked, sigmask(SIGKILL)); + tsk->flags |= PF_NOFREEZE; } return tsk; } diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 5ba51a8bdaeb..0cdbbfbc5757 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1072,7 +1072,7 @@ static int online_memory_block(struct memory_block *mem, void *arg) */ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) { - struct mhp_params params = { .pgprot = PAGE_KERNEL }; + struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) }; u64 start, size; bool new_node = false; int ret; diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index cf702a5f7fe5..39ed0e0afe6d 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -963,8 +963,11 @@ void rpc_execute(struct rpc_task *task) rpc_set_active(task); rpc_make_runnable(rpciod_workqueue, task); - if (!is_async) + if (!is_async) { + unsigned int pflags = memalloc_nofs_save(); __rpc_execute(task); + memalloc_nofs_restore(pflags); + } } static void rpc_async_schedule(struct work_struct *work) diff --git a/security/commoncap.c b/security/commoncap.c index 28f4d25480df..1c519c875217 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -543,8 +543,7 @@ int cap_convert_nscap(struct user_namespace *mnt_userns, struct dentry *dentry, __u32 magic, nsmagic; struct inode *inode = d_backing_inode(dentry); struct user_namespace *task_ns = current_user_ns(), - *fs_ns = inode->i_sb->s_user_ns, - *ancestor; + *fs_ns = inode->i_sb->s_user_ns; kuid_t rootid; size_t newsize; @@ -567,15 +566,6 @@ int cap_convert_nscap(struct user_namespace *mnt_userns, struct dentry *dentry, if (nsrootid == -1) return -EINVAL; - /* - * Do not allow allow adding a v3 filesystem capability xattr - * if the rootid field is ambiguous. - */ - for (ancestor = task_ns->parent; ancestor; ancestor = ancestor->parent) { - if (from_kuid(ancestor, rootid) == 0) - return -EINVAL; - } - newsize = sizeof(struct vfs_ns_cap_data); nscap = kmalloc(newsize, GFP_ATOMIC); if (!nscap) diff --git a/sound/hda/intel-nhlt.c b/sound/hda/intel-nhlt.c index d053beccfaec..e2237239d922 100644 --- a/sound/hda/intel-nhlt.c +++ b/sound/hda/intel-nhlt.c @@ -39,6 +39,11 @@ int intel_nhlt_get_dmic_geo(struct device *dev, struct nhlt_acpi_table *nhlt) if (!nhlt) return 0; + if (nhlt->header.length <= sizeof(struct acpi_table_header)) { + dev_warn(dev, "Invalid DMIC description table\n"); + return 0; + } + for (j = 0, epnt = nhlt->desc; j < nhlt->endpoint_count; j++, epnt = (struct nhlt_endpoint *)((u8 *)epnt + epnt->length)) { diff --git a/sound/pci/hda/hda_bind.c b/sound/pci/hda/hda_bind.c index 6a8564566375..17a25e453f60 100644 --- a/sound/pci/hda/hda_bind.c +++ b/sound/pci/hda/hda_bind.c @@ -47,6 +47,10 @@ static void hda_codec_unsol_event(struct hdac_device *dev, unsigned int ev) if (codec->bus->shutdown) return; + /* ignore unsol events during system suspend/resume */ + if (codec->core.dev.power.power_state.event != PM_EVENT_ON) + return; + if (codec->patch_ops.unsol_event) codec->patch_ops.unsol_event(codec, ev); } diff --git a/sound/pci/hda/hda_controller.c b/sound/pci/hda/hda_controller.c index 9087981cd1f7..ca2f2ecd1488 100644 --- a/sound/pci/hda/hda_controller.c +++ b/sound/pci/hda/hda_controller.c @@ -609,13 +609,6 @@ static int azx_pcm_open(struct snd_pcm_substream *substream) 20, 178000000); - /* by some reason, the playback stream stalls on PulseAudio with - * tsched=1 when a capture stream triggers. Until we figure out the - * real cause, disable tsched mode by telling the PCM info flag. - */ - if (chip->driver_caps & AZX_DCAPS_AMD_WORKAROUND) - runtime->hw.info |= SNDRV_PCM_INFO_BATCH; - if (chip->align_buffer_size) /* constrain buffer sizes to be multiple of 128 bytes. This is more efficient in terms of memory diff --git a/sound/pci/hda/hda_intel.c b/sound/pci/hda/hda_intel.c index 5b492c3f816c..5eea130dcf0a 100644 --- a/sound/pci/hda/hda_intel.c +++ b/sound/pci/hda/hda_intel.c @@ -1026,6 +1026,8 @@ static int azx_prepare(struct device *dev) chip = card->private_data; chip->pm_prepared = 1; + flush_work(&azx_bus(chip)->unsol_work); + /* HDA controller always requires different WAKEEN for runtime suspend * and system suspend, so don't use direct-complete here. */ diff --git a/sound/pci/hda/patch_ca0132.c b/sound/pci/hda/patch_ca0132.c index c966f49fa942..b2b620f6c832 100644 --- a/sound/pci/hda/patch_ca0132.c +++ b/sound/pci/hda/patch_ca0132.c @@ -1309,6 +1309,7 @@ static const struct snd_pci_quirk ca0132_quirks[] = { SND_PCI_QUIRK(0x1102, 0x0013, "Recon3D", QUIRK_R3D), SND_PCI_QUIRK(0x1102, 0x0018, "Recon3D", QUIRK_R3D), SND_PCI_QUIRK(0x1102, 0x0051, "Sound Blaster AE-5", QUIRK_AE5), + SND_PCI_QUIRK(0x1102, 0x0191, "Sound Blaster AE-5 Plus", QUIRK_AE5), SND_PCI_QUIRK(0x1102, 0x0081, "Sound Blaster AE-7", QUIRK_AE7), {} }; diff --git a/sound/pci/hda/patch_conexant.c b/sound/pci/hda/patch_conexant.c index f2aa226d1373..c20dad46a7c9 100644 --- a/sound/pci/hda/patch_conexant.c +++ b/sound/pci/hda/patch_conexant.c @@ -149,6 +149,21 @@ static int cx_auto_vmaster_mute_led(struct led_classdev *led_cdev, return 0; } +static void cxt_init_gpio_led(struct hda_codec *codec) +{ + struct conexant_spec *spec = codec->spec; + unsigned int mask = spec->gpio_mute_led_mask | spec->gpio_mic_led_mask; + + if (mask) { + snd_hda_codec_write(codec, 0x01, 0, AC_VERB_SET_GPIO_MASK, + mask); + snd_hda_codec_write(codec, 0x01, 0, AC_VERB_SET_GPIO_DIRECTION, + mask); + snd_hda_codec_write(codec, 0x01, 0, AC_VERB_SET_GPIO_DATA, + spec->gpio_led); + } +} + static int cx_auto_init(struct hda_codec *codec) { struct conexant_spec *spec = codec->spec; @@ -156,6 +171,7 @@ static int cx_auto_init(struct hda_codec *codec) if (!spec->dynamic_eapd) cx_auto_turn_eapd(codec, spec->num_eapds, spec->eapds, true); + cxt_init_gpio_led(codec); snd_hda_apply_fixup(codec, HDA_FIXUP_ACT_INIT); return 0; @@ -215,6 +231,7 @@ enum { CXT_FIXUP_HP_SPECTRE, CXT_FIXUP_HP_GATE_MIC, CXT_FIXUP_MUTE_LED_GPIO, + CXT_FIXUP_HP_ZBOOK_MUTE_LED, CXT_FIXUP_HEADSET_MIC, CXT_FIXUP_HP_MIC_NO_PRESENCE, }; @@ -654,31 +671,36 @@ static int cxt_gpio_micmute_update(struct led_classdev *led_cdev, return 0; } +static void cxt_setup_mute_led(struct hda_codec *codec, + unsigned int mute, unsigned int mic_mute) +{ + struct conexant_spec *spec = codec->spec; + + spec->gpio_led = 0; + spec->mute_led_polarity = 0; + if (mute) { + snd_hda_gen_add_mute_led_cdev(codec, cxt_gpio_mute_update); + spec->gpio_mute_led_mask = mute; + } + if (mic_mute) { + snd_hda_gen_add_micmute_led_cdev(codec, cxt_gpio_micmute_update); + spec->gpio_mic_led_mask = mic_mute; + } +} static void cxt_fixup_mute_led_gpio(struct hda_codec *codec, const struct hda_fixup *fix, int action) { - struct conexant_spec *spec = codec->spec; - static const struct hda_verb gpio_init[] = { - { 0x01, AC_VERB_SET_GPIO_MASK, 0x03 }, - { 0x01, AC_VERB_SET_GPIO_DIRECTION, 0x03 }, - {} - }; - - if (action == HDA_FIXUP_ACT_PRE_PROBE) { - snd_hda_gen_add_mute_led_cdev(codec, cxt_gpio_mute_update); - spec->gpio_led = 0; - spec->mute_led_polarity = 0; - spec->gpio_mute_led_mask = 0x01; - spec->gpio_mic_led_mask = 0x02; - snd_hda_gen_add_micmute_led_cdev(codec, cxt_gpio_micmute_update); - } - snd_hda_add_verbs(codec, gpio_init); - if (spec->gpio_led) - snd_hda_codec_write(codec, 0x01, 0, AC_VERB_SET_GPIO_DATA, - spec->gpio_led); + if (action == HDA_FIXUP_ACT_PRE_PROBE) + cxt_setup_mute_led(codec, 0x01, 0x02); } +static void cxt_fixup_hp_zbook_mute_led(struct hda_codec *codec, + const struct hda_fixup *fix, int action) +{ + if (action == HDA_FIXUP_ACT_PRE_PROBE) + cxt_setup_mute_led(codec, 0x10, 0x20); +} /* ThinkPad X200 & co with cxt5051 */ static const struct hda_pintbl cxt_pincfg_lenovo_x200[] = { @@ -839,6 +861,10 @@ static const struct hda_fixup cxt_fixups[] = { .type = HDA_FIXUP_FUNC, .v.func = cxt_fixup_mute_led_gpio, }, + [CXT_FIXUP_HP_ZBOOK_MUTE_LED] = { + .type = HDA_FIXUP_FUNC, + .v.func = cxt_fixup_hp_zbook_mute_led, + }, [CXT_FIXUP_HEADSET_MIC] = { .type = HDA_FIXUP_FUNC, .v.func = cxt_fixup_headset_mic, @@ -917,6 +943,7 @@ static const struct snd_pci_quirk cxt5066_fixups[] = { SND_PCI_QUIRK(0x103c, 0x8299, "HP 800 G3 SFF", CXT_FIXUP_HP_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x103c, 0x829a, "HP 800 G3 DM", CXT_FIXUP_HP_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x103c, 0x8402, "HP ProBook 645 G4", CXT_FIXUP_MUTE_LED_GPIO), + SND_PCI_QUIRK(0x103c, 0x8427, "HP ZBook Studio G5", CXT_FIXUP_HP_ZBOOK_MUTE_LED), SND_PCI_QUIRK(0x103c, 0x8455, "HP Z2 G4", CXT_FIXUP_HP_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x103c, 0x8456, "HP Z2 G4 SFF", CXT_FIXUP_HP_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x103c, 0x8457, "HP Z2 G4 mini", CXT_FIXUP_HP_MIC_NO_PRESENCE), @@ -956,6 +983,7 @@ static const struct hda_model_fixup cxt5066_fixup_models[] = { { .id = CXT_FIXUP_MUTE_LED_EAPD, .name = "mute-led-eapd" }, { .id = CXT_FIXUP_HP_DOCK, .name = "hp-dock" }, { .id = CXT_FIXUP_MUTE_LED_GPIO, .name = "mute-led-gpio" }, + { .id = CXT_FIXUP_HP_ZBOOK_MUTE_LED, .name = "hp-zbook-mute-led" }, { .id = CXT_FIXUP_HP_MIC_NO_PRESENCE, .name = "hp-mic-fix" }, {} }; diff --git a/sound/pci/hda/patch_hdmi.c b/sound/pci/hda/patch_hdmi.c index e6d0843ee9df..45ae845e82df 100644 --- a/sound/pci/hda/patch_hdmi.c +++ b/sound/pci/hda/patch_hdmi.c @@ -2480,6 +2480,18 @@ static void generic_hdmi_free(struct hda_codec *codec) } #ifdef CONFIG_PM +static int generic_hdmi_suspend(struct hda_codec *codec) +{ + struct hdmi_spec *spec = codec->spec; + int pin_idx; + + for (pin_idx = 0; pin_idx < spec->num_pins; pin_idx++) { + struct hdmi_spec_per_pin *per_pin = get_pin(spec, pin_idx); + cancel_delayed_work_sync(&per_pin->work); + } + return 0; +} + static int generic_hdmi_resume(struct hda_codec *codec) { struct hdmi_spec *spec = codec->spec; @@ -2503,6 +2515,7 @@ static const struct hda_codec_ops generic_hdmi_patch_ops = { .build_controls = generic_hdmi_build_controls, .unsol_event = hdmi_unsol_event, #ifdef CONFIG_PM + .suspend = generic_hdmi_suspend, .resume = generic_hdmi_resume, #endif }; diff --git a/sound/usb/card.c b/sound/usb/card.c index 85ed8507e41a..b6f4c0848e66 100644 --- a/sound/usb/card.c +++ b/sound/usb/card.c @@ -830,6 +830,9 @@ static int usb_audio_probe(struct usb_interface *intf, snd_media_device_create(chip, intf); } + if (quirk) + chip->quirk_type = quirk->type; + usb_chip[chip->index] = chip; chip->intf[chip->num_interfaces] = intf; chip->num_interfaces++; @@ -904,6 +907,9 @@ static void usb_audio_disconnect(struct usb_interface *intf) } } + if (chip->quirk_type & QUIRK_SETUP_DISABLE_AUTOSUSPEND) + usb_enable_autosuspend(interface_to_usbdev(intf)); + chip->num_interfaces--; if (chip->num_interfaces <= 0) { usb_chip[chip->index] = NULL; diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c index 737b2729c0d3..d3001fb18141 100644 --- a/sound/usb/quirks.c +++ b/sound/usb/quirks.c @@ -547,7 +547,7 @@ static int setup_disable_autosuspend(struct snd_usb_audio *chip, struct usb_driver *driver, const struct snd_usb_audio_quirk *quirk) { - driver->supports_autosuspend = 0; + usb_disable_autosuspend(interface_to_usbdev(iface)); return 1; /* Continue with creating streams and mixer */ } @@ -1520,6 +1520,7 @@ bool snd_usb_get_sample_rate_quirk(struct snd_usb_audio *chip) case USB_ID(0x1901, 0x0191): /* GE B850V3 CP2114 audio interface */ case USB_ID(0x21b4, 0x0081): /* AudioQuest DragonFly */ case USB_ID(0x2912, 0x30c8): /* Audioengine D1 */ + case USB_ID(0x413c, 0xa506): /* Dell AE515 sound bar */ return true; } @@ -1670,6 +1671,14 @@ void snd_usb_ctl_msg_quirk(struct usb_device *dev, unsigned int pipe, && (requesttype & USB_TYPE_MASK) == USB_TYPE_CLASS) msleep(20); + /* + * Plantronics headsets (C320, C320-M, etc) need a delay to avoid + * random microhpone failures. + */ + if (USB_ID_VENDOR(chip->usb_id) == 0x047f && + (requesttype & USB_TYPE_MASK) == USB_TYPE_CLASS) + msleep(20); + /* Zoom R16/24, many Logitech(at least H650e/H570e/BCC950), * Jabra 550a, Kingston HyperX needs a tiny delay here, * otherwise requests like get/set frequency return diff --git a/sound/usb/usbaudio.h b/sound/usb/usbaudio.h index 215c1771dd57..60b9dd7df6bb 100644 --- a/sound/usb/usbaudio.h +++ b/sound/usb/usbaudio.h @@ -27,6 +27,7 @@ struct snd_usb_audio { struct snd_card *card; struct usb_interface *intf[MAX_CARD_INTERFACES]; u32 usb_id; + uint16_t quirk_type; struct mutex mutex; unsigned int system_suspend; atomic_t active; diff --git a/tools/testing/selftests/arm64/fp/sve-ptrace.c b/tools/testing/selftests/arm64/fp/sve-ptrace.c index b2282be6f938..612d3899614a 100644 --- a/tools/testing/selftests/arm64/fp/sve-ptrace.c +++ b/tools/testing/selftests/arm64/fp/sve-ptrace.c @@ -332,5 +332,5 @@ int main(void) ksft_print_cnts(); - return 0; + return ret; }