diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 08a84f801bfe..c1dcaa3d2d6e 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1324,7 +1324,6 @@ struct kvm_arch { * the thread holds the MMU lock in write mode. */ spinlock_t tdp_mmu_pages_lock; - struct workqueue_struct *tdp_mmu_zap_wq; #endif /* CONFIG_X86_64 */ /* @@ -1727,7 +1726,7 @@ void kvm_mmu_vendor_module_exit(void); void kvm_mmu_destroy(struct kvm_vcpu *vcpu); int kvm_mmu_create(struct kvm_vcpu *vcpu); -int kvm_mmu_init_vm(struct kvm *kvm); +void kvm_mmu_init_vm(struct kvm *kvm); void kvm_mmu_uninit_vm(struct kvm *kvm); void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 2a6fec4e2d19..d30325e297a0 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5994,19 +5994,16 @@ static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm, kvm_mmu_zap_all_fast(kvm); } -int kvm_mmu_init_vm(struct kvm *kvm) +void kvm_mmu_init_vm(struct kvm *kvm) { struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker; - int r; INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages); INIT_LIST_HEAD(&kvm->arch.lpage_disallowed_mmu_pages); spin_lock_init(&kvm->arch.mmu_unsync_pages_lock); - r = kvm_mmu_init_tdp_mmu(kvm); - if (r < 0) - return r; + kvm_mmu_init_tdp_mmu(kvm); node->track_write = kvm_mmu_pte_write; node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot; @@ -6019,8 +6016,6 @@ int kvm_mmu_init_vm(struct kvm *kvm) kvm->arch.split_desc_cache.kmem_cache = pte_list_desc_cache; kvm->arch.split_desc_cache.gfp_zero = __GFP_ZERO; - - return 0; } static void mmu_free_vm_memory_caches(struct kvm *kvm) diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 582def531d4d..0a9d5f2925c3 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -56,7 +56,12 @@ struct kvm_mmu_page { bool tdp_mmu_page; bool unsync; - u8 mmu_valid_gen; + union { + u8 mmu_valid_gen; + + /* Only accessed under slots_lock. */ + bool tdp_mmu_scheduled_root_to_zap; + }; bool lpage_disallowed; /* Can't be replaced by an equiv large page */ /* @@ -92,13 +97,7 @@ struct kvm_mmu_page { struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */ tdp_ptep_t ptep; }; - union { - DECLARE_BITMAP(unsync_child_bitmap, 512); - struct { - struct work_struct tdp_mmu_async_work; - void *tdp_mmu_async_data; - }; - }; + DECLARE_BITMAP(unsync_child_bitmap, 512); struct list_head lpage_disallowed_link; #ifdef CONFIG_X86_32 diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 9b9fc4e834d0..c3b0f973375b 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -14,24 +14,16 @@ static bool __read_mostly tdp_mmu_enabled = true; module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644); /* Initializes the TDP MMU for the VM, if enabled. */ -int kvm_mmu_init_tdp_mmu(struct kvm *kvm) +void kvm_mmu_init_tdp_mmu(struct kvm *kvm) { - struct workqueue_struct *wq; - if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled)) - return 0; - - wq = alloc_workqueue("kvm", WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 0); - if (!wq) - return -ENOMEM; + return; /* This should not be changed for the lifetime of the VM. */ kvm->arch.tdp_mmu_enabled = true; INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots); spin_lock_init(&kvm->arch.tdp_mmu_pages_lock); INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages); - kvm->arch.tdp_mmu_zap_wq = wq; - return 1; } /* Arbitrarily returns true so that this may be used in if statements. */ @@ -57,20 +49,15 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) * ultimately frees all roots. */ kvm_tdp_mmu_invalidate_all_roots(kvm); - - /* - * Destroying a workqueue also first flushes the workqueue, i.e. no - * need to invoke kvm_tdp_mmu_zap_invalidated_roots(). - */ - destroy_workqueue(kvm->arch.tdp_mmu_zap_wq); + kvm_tdp_mmu_zap_invalidated_roots(kvm); WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages)); WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots)); /* * Ensure that all the outstanding RCU callbacks to free shadow pages - * can run before the VM is torn down. Work items on tdp_mmu_zap_wq - * can call kvm_tdp_mmu_put_root and create new callbacks. + * can run before the VM is torn down. Putting the last reference to + * zapped roots will create new callbacks. */ rcu_barrier(); } @@ -97,46 +84,6 @@ static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head) tdp_mmu_free_sp(sp); } -static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root, - bool shared); - -static void tdp_mmu_zap_root_work(struct work_struct *work) -{ - struct kvm_mmu_page *root = container_of(work, struct kvm_mmu_page, - tdp_mmu_async_work); - struct kvm *kvm = root->tdp_mmu_async_data; - - read_lock(&kvm->mmu_lock); - - /* - * A TLB flush is not necessary as KVM performs a local TLB flush when - * allocating a new root (see kvm_mmu_load()), and when migrating vCPU - * to a different pCPU. Note, the local TLB flush on reuse also - * invalidates any paging-structure-cache entries, i.e. TLB entries for - * intermediate paging structures, that may be zapped, as such entries - * are associated with the ASID on both VMX and SVM. - */ - tdp_mmu_zap_root(kvm, root, true); - - /* - * Drop the refcount using kvm_tdp_mmu_put_root() to test its logic for - * avoiding an infinite loop. By design, the root is reachable while - * it's being asynchronously zapped, thus a different task can put its - * last reference, i.e. flowing through kvm_tdp_mmu_put_root() for an - * asynchronously zapped root is unavoidable. - */ - kvm_tdp_mmu_put_root(kvm, root, true); - - read_unlock(&kvm->mmu_lock); -} - -static void tdp_mmu_schedule_zap_root(struct kvm *kvm, struct kvm_mmu_page *root) -{ - root->tdp_mmu_async_data = kvm; - INIT_WORK(&root->tdp_mmu_async_work, tdp_mmu_zap_root_work); - queue_work(kvm->arch.tdp_mmu_zap_wq, &root->tdp_mmu_async_work); -} - void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root, bool shared) { @@ -222,11 +169,11 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, #define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared) \ __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true) -#define for_each_tdp_mmu_root_yield_safe(_kvm, _root) \ - for (_root = tdp_mmu_next_root(_kvm, NULL, false, false); \ +#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _shared) \ + for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, false); \ _root; \ - _root = tdp_mmu_next_root(_kvm, _root, false, false)) \ - if (!kvm_lockdep_assert_mmu_lock_held(_kvm, false)) { \ + _root = tdp_mmu_next_root(_kvm, _root, _shared, false)) \ + if (!kvm_lockdep_assert_mmu_lock_held(_kvm, _shared)) { \ } else /* @@ -305,7 +252,7 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu) * by a memslot update or by the destruction of the VM. Initialize the * refcount to two; one reference for the vCPU, and one reference for * the TDP MMU itself, which is held until the root is invalidated and - * is ultimately put by tdp_mmu_zap_root_work(). + * is ultimately put by kvm_tdp_mmu_zap_invalidated_roots(). */ refcount_set(&root->tdp_mmu_root_count, 2); @@ -963,7 +910,7 @@ bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush) { struct kvm_mmu_page *root; - for_each_tdp_mmu_root_yield_safe(kvm, root) + for_each_tdp_mmu_root_yield_safe(kvm, root, false) flush = tdp_mmu_zap_leafs(kvm, root, start, end, true, flush); return flush; @@ -985,7 +932,7 @@ void kvm_tdp_mmu_zap_all(struct kvm *kvm) * is being destroyed or the userspace VMM has exited. In both cases, * KVM_RUN is unreachable, i.e. no vCPUs will ever service the request. */ - for_each_tdp_mmu_root_yield_safe(kvm, root) + for_each_tdp_mmu_root_yield_safe(kvm, root, false) tdp_mmu_zap_root(kvm, root, false); } @@ -995,18 +942,47 @@ void kvm_tdp_mmu_zap_all(struct kvm *kvm) */ void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm) { - flush_workqueue(kvm->arch.tdp_mmu_zap_wq); + struct kvm_mmu_page *root; + + read_lock(&kvm->mmu_lock); + + for_each_tdp_mmu_root_yield_safe(kvm, root, true) { + if (!root->tdp_mmu_scheduled_root_to_zap) + continue; + + root->tdp_mmu_scheduled_root_to_zap = false; + KVM_BUG_ON(!root->role.invalid, kvm); + + /* + * A TLB flush is not necessary as KVM performs a local TLB + * flush when allocating a new root (see kvm_mmu_load()), and + * when migrating a vCPU to a different pCPU. Note, the local + * TLB flush on reuse also invalidates paging-structure-cache + * entries, i.e. TLB entries for intermediate paging structures, + * that may be zapped, as such entries are associated with the + * ASID on both VMX and SVM. + */ + tdp_mmu_zap_root(kvm, root, true); + + /* + * The referenced needs to be put *after* zapping the root, as + * the root must be reachable by mmu_notifiers while it's being + * zapped + */ + kvm_tdp_mmu_put_root(kvm, root, true); + } + + read_unlock(&kvm->mmu_lock); } /* * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that * is about to be zapped, e.g. in response to a memslots update. The actual - * zapping is performed asynchronously. Using a separate workqueue makes it - * easy to ensure that the destruction is performed before the "fast zap" - * completes, without keeping a separate list of invalidated roots; the list is - * effectively the list of work items in the workqueue. + * zapping is done separately so that it happens with mmu_lock with read, + * whereas invalidating roots must be done with mmu_lock held for write (unless + * the VM is being destroyed). * - * Note, the asynchronous worker is gifted the TDP MMU's reference. + * Note, kvm_tdp_mmu_zap_invalidated_roots() is gifted the TDP MMU's reference. * See kvm_tdp_mmu_get_vcpu_root_hpa(). */ void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm) @@ -1031,19 +1007,20 @@ void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm) /* * As above, mmu_lock isn't held when destroying the VM! There can't * be other references to @kvm, i.e. nothing else can invalidate roots - * or be consuming roots, but walking the list of roots does need to be - * guarded against roots being deleted by the asynchronous zap worker. + * or get/put references to roots. */ - rcu_read_lock(); - - list_for_each_entry_rcu(root, &kvm->arch.tdp_mmu_roots, link) { + list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) { + /* + * Note, invalid roots can outlive a memslot update! Invalid + * roots must be *zapped* before the memslot update completes, + * but a different task can acquire a reference and keep the + * root alive after its been zapped. + */ if (!root->role.invalid) { + root->tdp_mmu_scheduled_root_to_zap = true; root->role.invalid = true; - tdp_mmu_schedule_zap_root(kvm, root); } } - - rcu_read_unlock(); } /* diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index d0a9fe0770fd..c82a8bb321bb 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -65,7 +65,7 @@ u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr, u64 *spte); #ifdef CONFIG_X86_64 -int kvm_mmu_init_tdp_mmu(struct kvm *kvm); +void kvm_mmu_init_tdp_mmu(struct kvm *kvm); void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm); static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return sp->tdp_mmu_page; } @@ -86,7 +86,7 @@ static inline bool is_tdp_mmu(struct kvm_mmu *mmu) return sp && is_tdp_mmu_page(sp) && sp->root_count; } #else -static inline int kvm_mmu_init_tdp_mmu(struct kvm *kvm) { return 0; } +static inline void kvm_mmu_init_tdp_mmu(struct kvm *kvm) {} static inline void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) {} static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return false; } static inline bool is_tdp_mmu(struct kvm_mmu *mmu) { return false; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a020c89a6bb1..4d6baae1ae74 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -12453,9 +12453,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) if (ret) goto out; - ret = kvm_mmu_init_vm(kvm); - if (ret) - goto out_page_track; + kvm_mmu_init_vm(kvm); ret = static_call(kvm_x86_vm_init)(kvm); if (ret) @@ -12500,7 +12498,6 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) out_uninit_mmu: kvm_mmu_uninit_vm(kvm); -out_page_track: kvm_page_track_cleanup(kvm); out: return ret;