Commit bb18842

Ben Gardon authored and bonzini committed
kvm: x86/mmu: Add TDP MMU PF handler
Add functions to handle page faults in the TDP MMU. These page faults
are currently handled in much the same way as the x86 shadow paging
based MMU, however the ordering of some operations is slightly
different. Future patches will add eager NX splitting, a fast page fault
handler, and parallel page faults.

Tested by running kvm-unit-tests and KVM selftests on an Intel Haswell
machine. This series introduced no new failures.

This series can be viewed in Gerrit at:
https://linux-review.googlesource.com/c/virt/kvm/kvm/+/2538

Signed-off-by: Ben Gardon <bgardon@google.com>
Message-Id: <20201014182700.2888246-11-bgardon@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
1 parent 7d94531 commit bb18842

Showing 5 changed files with 194 additions and 37 deletions.

File tree

arch/x86/kvm/mmu/mmu.c
arch/x86/kvm/mmu/mmu_internal.h
arch/x86/kvm/mmu/mmutrace.h
arch/x86/kvm/mmu/tdp_mmu.c
arch/x86/kvm/mmu/tdp_mmu.h

arch/x86/kvm/mmu/mmu.c

Lines changed: 20 additions & 33 deletions

@@ -137,23 +137,6 @@ module_param(dbg, bool, 0644);
 /* make pte_list_desc fit well in cache line */
 #define PTE_LIST_EXT 3
 
-/*
- * Return values of handle_mmio_page_fault, mmu.page_fault, and fast_page_fault().
- *
- * RET_PF_RETRY: let CPU fault again on the address.
- * RET_PF_EMULATE: mmio page fault, emulate the instruction directly.
- * RET_PF_INVALID: the spte is invalid, let the real page fault path update it.
- * RET_PF_FIXED: The faulting entry has been fixed.
- * RET_PF_SPURIOUS: The faulting entry was already fixed, e.g. by another vCPU.
- */
-enum {
-	RET_PF_RETRY = 0,
-	RET_PF_EMULATE,
-	RET_PF_INVALID,
-	RET_PF_FIXED,
-	RET_PF_SPURIOUS,
-};
-
 struct pte_list_desc {
 	u64 *sptes[PTE_LIST_EXT];
 	struct pte_list_desc *more;
@@ -233,11 +216,8 @@ static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
 			   unsigned int access)
 {
 	u64 mask = make_mmio_spte(vcpu, gfn, access);
-	unsigned int gen = get_mmio_spte_generation(mask);
 
-	access = mask & ACC_ALL;
-
-	trace_mark_mmio_spte(sptep, gfn, access, gen);
+	trace_mark_mmio_spte(sptep, gfn, mask);
 	mmu_spte_set(sptep, mask);
 }
 
@@ -2762,9 +2742,9 @@ static int host_pfn_mapping_level(struct kvm_vcpu *vcpu, gfn_t gfn,
 	return level;
 }
 
-static int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn,
-				   int max_level, kvm_pfn_t *pfnp,
-				   bool huge_page_disallowed, int *req_level)
+int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn,
+			    int max_level, kvm_pfn_t *pfnp,
+			    bool huge_page_disallowed, int *req_level)
 {
 	struct kvm_memory_slot *slot;
 	struct kvm_lpage_info *linfo;
@@ -2818,10 +2798,10 @@ static int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn,
 	return level;
 }
 
-static void disallowed_hugepage_adjust(u64 spte, gfn_t gfn, int cur_level,
-				       kvm_pfn_t *pfnp, int *levelp)
+void disallowed_hugepage_adjust(u64 spte, gfn_t gfn, int cur_level,
+				kvm_pfn_t *pfnp, int *goal_levelp)
 {
-	int level = *levelp;
+	int level = *goal_levelp;
 
 	if (cur_level == level && level > PG_LEVEL_4K &&
 	    is_shadow_present_pte(spte) &&
@@ -2836,7 +2816,7 @@ static void disallowed_hugepage_adjust(u64 spte, gfn_t gfn, int cur_level,
 		u64 page_mask = KVM_PAGES_PER_HPAGE(level) -
 				KVM_PAGES_PER_HPAGE(level - 1);
 		*pfnp |= gfn & page_mask;
-		(*levelp)--;
+		(*goal_levelp)--;
 	}
 }
 
@@ -3643,9 +3623,11 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
 	if (page_fault_handle_page_track(vcpu, error_code, gfn))
 		return RET_PF_EMULATE;
 
-	r = fast_page_fault(vcpu, gpa, error_code);
-	if (r != RET_PF_INVALID)
-		return r;
+	if (!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)) {
+		r = fast_page_fault(vcpu, gpa, error_code);
+		if (r != RET_PF_INVALID)
+			return r;
+	}
 
 	r = mmu_topup_memory_caches(vcpu, false);
 	if (r)
@@ -3667,8 +3649,13 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
 	r = make_mmu_pages_available(vcpu);
 	if (r)
 		goto out_unlock;
-	r = __direct_map(vcpu, gpa, error_code, map_writable, max_level, pfn,
-			 prefault, is_tdp);
+
+	if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa))
+		r = kvm_tdp_mmu_map(vcpu, gpa, error_code, map_writable,
+				    max_level, pfn, prefault);
+	else
+		r = __direct_map(vcpu, gpa, error_code, map_writable, max_level,
+				 pfn, prefault, is_tdp);
 
 out_unlock:
 	spin_unlock(&vcpu->kvm->mmu_lock);
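Taken together, the two direct_page_fault() hunks above reshape the fault path: the lockless fast path is skipped for TDP MMU roots (the commit message leaves a TDP MMU fast page fault handler to a future patch), and the final mapping step dispatches on the root type. Below is a condensed sketch of the resulting flow, not the verbatim kernel code; cache top-up, pfn resolution, locking, and retry handling are elided:

	static int direct_page_fault_sketch(struct kvm_vcpu *vcpu, gpa_t gpa,
					    u32 error_code, int map_writable,
					    int max_level, kvm_pfn_t pfn,
					    bool prefault, bool is_tdp)
	{
		bool tdp_mmu_root = is_tdp_mmu_root(vcpu->kvm,
						    vcpu->arch.mmu->root_hpa);
		int r;

		if (!tdp_mmu_root) {
			/* The fast path only understands shadow-MMU roots for now. */
			r = fast_page_fault(vcpu, gpa, error_code);
			if (r != RET_PF_INVALID)
				return r;
		}

		/* ... top up caches, resolve pfn, take mmu_lock ... */

		if (tdp_mmu_root)
			r = kvm_tdp_mmu_map(vcpu, gpa, error_code, map_writable,
					    max_level, pfn, prefault);
		else
			r = __direct_map(vcpu, gpa, error_code, map_writable,
					 max_level, pfn, prefault, is_tdp);
		return r;
	}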

arch/x86/kvm/mmu/mmu_internal.h

Lines changed: 32 additions & 0 deletions

@@ -111,4 +111,36 @@ static inline bool kvm_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *sp)
 	return !sp->root_count;
 }
 
+/*
+ * Return values of handle_mmio_page_fault, mmu.page_fault, and fast_page_fault().
+ *
+ * RET_PF_RETRY: let CPU fault again on the address.
+ * RET_PF_EMULATE: mmio page fault, emulate the instruction directly.
+ * RET_PF_INVALID: the spte is invalid, let the real page fault path update it.
+ * RET_PF_FIXED: The faulting entry has been fixed.
+ * RET_PF_SPURIOUS: The faulting entry was already fixed, e.g. by another vCPU.
+ */
+enum {
+	RET_PF_RETRY = 0,
+	RET_PF_EMULATE,
+	RET_PF_INVALID,
+	RET_PF_FIXED,
+	RET_PF_SPURIOUS,
+};
+
+/* Bits which may be returned by set_spte() */
+#define SET_SPTE_WRITE_PROTECTED_PT	BIT(0)
+#define SET_SPTE_NEED_REMOTE_TLB_FLUSH	BIT(1)
+#define SET_SPTE_SPURIOUS		BIT(2)
+
+int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn,
+			    int max_level, kvm_pfn_t *pfnp,
+			    bool huge_page_disallowed, int *req_level);
+void disallowed_hugepage_adjust(u64 spte, gfn_t gfn, int cur_level,
+				kvm_pfn_t *pfnp, int *goal_levelp);
+
+bool is_nx_huge_page_enabled(void);
+
+void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);
+
 #endif /* __KVM_X86_MMU_INTERNAL_H */
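With the RET_PF_* values moved out of mmu.c, both MMUs now share one contract with their callers. For illustration only, here is a sketch of how a caller is expected to act on each value; resume_guest(), emulate_faulting_insn(), and slow_page_fault() are hypothetical stand-ins, not kernel functions:

	/* Hypothetical consumer of the RET_PF_* contract, for illustration. */
	static int act_on_fault_result(int r)
	{
		switch (r) {
		case RET_PF_RETRY:	/* return to the guest; it will re-fault */
		case RET_PF_FIXED:	/* the SPTE was installed */
		case RET_PF_SPURIOUS:	/* another vCPU already installed it */
			return resume_guest();
		case RET_PF_EMULATE:	/* MMIO or write-protected gfn */
			return emulate_faulting_insn();
		case RET_PF_INVALID:	/* fast path punted */
		default:
			return slow_page_fault();
		}
	}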

arch/x86/kvm/mmu/mmutrace.h

Lines changed: 4 additions & 4 deletions

@@ -202,8 +202,8 @@ DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page,
 
 TRACE_EVENT(
 	mark_mmio_spte,
-	TP_PROTO(u64 *sptep, gfn_t gfn, unsigned access, unsigned int gen),
-	TP_ARGS(sptep, gfn, access, gen),
+	TP_PROTO(u64 *sptep, gfn_t gfn, u64 spte),
+	TP_ARGS(sptep, gfn, spte),
 
 	TP_STRUCT__entry(
 		__field(void *, sptep)
@@ -215,8 +215,8 @@ TRACE_EVENT(
 	TP_fast_assign(
 		__entry->sptep = sptep;
 		__entry->gfn = gfn;
-		__entry->access = access;
-		__entry->gen = gen;
+		__entry->access = spte & ACC_ALL;
+		__entry->gen = get_mmio_spte_generation(spte);
 	),
 
 	TP_printk("sptep:%p gfn %llx access %x gen %x", __entry->sptep,
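The tracepoint now takes the raw SPTE and derives access and gen inside TP_fast_assign(), so the printed format is unchanged while the call sites shrink. Both MMIO paths in this commit log with nothing but the SPTE value:

	/* mmu.c, mark_mmio_spte(): */
	trace_mark_mmio_spte(sptep, gfn, mask);

	/* tdp_mmu.c, tdp_mmu_map_handle_target_level(): */
	trace_mark_mmio_spte(iter->sptep, iter->gfn, new_spte);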

arch/x86/kvm/mmu/tdp_mmu.c

Lines changed: 134 additions & 0 deletions

@@ -2,6 +2,7 @@
 
 #include "mmu.h"
 #include "mmu_internal.h"
+#include "mmutrace.h"
 #include "tdp_iter.h"
 #include "tdp_mmu.h"
 #include "spte.h"
@@ -271,6 +272,10 @@ static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
 #define tdp_root_for_each_pte(_iter, _root, _start, _end) \
 	for_each_tdp_pte(_iter, _root->spt, _root->role.level, _start, _end)
 
+#define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end)		\
+	for_each_tdp_pte(_iter, __va(_mmu->root_hpa),		\
+			 _mmu->shadow_root_level, _start, _end)
+
 /*
  * Flush the TLB if the process should drop kvm->mmu_lock.
  * Return whether the caller still needs to flush the tlb.
@@ -355,3 +360,132 @@ void kvm_tdp_mmu_zap_all(struct kvm *kvm)
 	if (flush)
 		kvm_flush_remote_tlbs(kvm);
 }
+
+/*
+ * Installs a last-level SPTE to handle a TDP page fault.
+ * (NPT/EPT violation/misconfiguration)
+ */
+static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
+					   int map_writable,
+					   struct tdp_iter *iter,
+					   kvm_pfn_t pfn, bool prefault)
+{
+	u64 new_spte;
+	int ret = 0;
+	int make_spte_ret = 0;
+
+	if (unlikely(is_noslot_pfn(pfn))) {
+		new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
+		trace_mark_mmio_spte(iter->sptep, iter->gfn, new_spte);
+	} else
+		make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn,
+					  pfn, iter->old_spte, prefault, true,
+					  map_writable, !shadow_accessed_mask,
+					  &new_spte);
+
+	if (new_spte == iter->old_spte)
+		ret = RET_PF_SPURIOUS;
+	else
+		tdp_mmu_set_spte(vcpu->kvm, iter, new_spte);
+
+	/*
+	 * If the page fault was caused by a write but the page is write
+	 * protected, emulation is needed. If the emulation was skipped,
+	 * the vCPU would have the same fault again.
+	 */
+	if (make_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
+		if (write)
+			ret = RET_PF_EMULATE;
+		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
+	}
+
+	/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
+	if (unlikely(is_mmio_spte(new_spte)))
+		ret = RET_PF_EMULATE;
+
+	trace_kvm_mmu_set_spte(iter->level, iter->gfn, iter->sptep);
+	if (!prefault)
+		vcpu->stat.pf_fixed++;
+
+	return ret;
+}
+
+/*
+ * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
+ * page tables and SPTEs to translate the faulting guest physical address.
+ */
+int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
+		    int map_writable, int max_level, kvm_pfn_t pfn,
+		    bool prefault)
+{
+	bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled();
+	bool write = error_code & PFERR_WRITE_MASK;
+	bool exec = error_code & PFERR_FETCH_MASK;
+	bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled;
+	struct kvm_mmu *mmu = vcpu->arch.mmu;
+	struct tdp_iter iter;
+	struct kvm_mmu_page *sp;
+	u64 *child_pt;
+	u64 new_spte;
+	int ret;
+	gfn_t gfn = gpa >> PAGE_SHIFT;
+	int level;
+	int req_level;
+
+	if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
+		return RET_PF_RETRY;
+	if (WARN_ON(!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)))
+		return RET_PF_RETRY;
+
+	level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn,
+					huge_page_disallowed, &req_level);
+
+	trace_kvm_mmu_spte_requested(gpa, level, pfn);
+	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
+		if (nx_huge_page_workaround_enabled)
+			disallowed_hugepage_adjust(iter.old_spte, gfn,
+						   iter.level, &pfn, &level);
+
+		if (iter.level == level)
+			break;
+
+		/*
+		 * If there is an SPTE mapping a large page at a higher level
+		 * than the target, that SPTE must be cleared and replaced
+		 * with a non-leaf SPTE.
+		 */
+		if (is_shadow_present_pte(iter.old_spte) &&
+		    is_large_pte(iter.old_spte)) {
+			tdp_mmu_set_spte(vcpu->kvm, &iter, 0);
+
+			kvm_flush_remote_tlbs_with_address(vcpu->kvm, iter.gfn,
+					KVM_PAGES_PER_HPAGE(iter.level));
+
+			/*
+			 * The iter must explicitly re-read the spte here
+			 * because the new value informs the !present
+			 * path below.
+			 */
+			iter.old_spte = READ_ONCE(*iter.sptep);
+		}
+
+		if (!is_shadow_present_pte(iter.old_spte)) {
+			sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level);
+			child_pt = sp->spt;
+			clear_page(child_pt);
+			new_spte = make_nonleaf_spte(child_pt,
+						     !shadow_accessed_mask);
+
+			trace_kvm_mmu_get_page(sp, true);
+			tdp_mmu_set_spte(vcpu->kvm, &iter, new_spte);
+		}
+	}
+
+	if (WARN_ON(iter.level != level))
+		return RET_PF_RETRY;
+
+	ret = tdp_mmu_map_handle_target_level(vcpu, write, map_writable, &iter,
+					      pfn, prefault);
+
+	return ret;
+}
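For orientation, the walk in kvm_tdp_mmu_map() is driven by the tdp_mmu_for_each_pte() macro added earlier in this file. For this call it expands to roughly the following, yielding one SPTE per paging-structure level until the loop body breaks at the target level:

	/* Expansion of tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1): */
	for_each_tdp_pte(iter,
			 __va(mmu->root_hpa),	  /* root of the TDP page tables */
			 mmu->shadow_root_level,  /* walk starts at the root level */
			 gfn, gfn + 1) {	  /* range covers only the faulting gfn */
		/* loop body from kvm_tdp_mmu_map() */
	}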

arch/x86/kvm/mmu/tdp_mmu.h

Lines changed: 4 additions & 0 deletions

@@ -14,4 +14,8 @@ void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root);
 
 bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end);
 void kvm_tdp_mmu_zap_all(struct kvm *kvm);
+
+int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
+		    int map_writable, int max_level, kvm_pfn_t pfn,
+		    bool prefault);
 #endif /* __KVM_X86_MMU_TDP_MMU_H */
