/*
 * kvm_mmu_page_role tracks the properties of a shadow page (where shadow page
 * also includes TDP pages) to determine whether or not a page can be used in
 * the given MMU context.  This is a subset of the overall kvm_cpu_role to
 * minimize the size of kvm_memory_slot.arch.gfn_track, i.e. allows allocating
 * 2 bytes per gfn instead of 4 bytes per gfn.
 *
 * Upper-level shadow pages having gptes are tracked for write-protection via
 * gfn_track.  As above, gfn_track is a 16 bit counter, so KVM must not create
 * more than 2^16-1 upper-level shadow pages at a single gfn, otherwise
 * gfn_track will overflow and explosions will ensue.
 *
 * A unique shadow page (SP) for a gfn is created if and only if an existing SP
 * cannot be reused.  The ability to reuse a SP is tracked by its role, which
 * incorporates various mode bits and properties of the SP.  Roughly speaking,
 * the number of unique SPs that can theoretically be created is 2^n, where n
 * is the number of bits that are used to compute the role.
 *
 * But, even though there are 19 bits in the mask below, not all combinations
 * of modes and flags are possible:
 *
 *   - invalid shadow pages are not accounted, so the bits are effectively 18
 *
 *   - quadrant will only be used if has_4_byte_gpte=1 (non-PAE paging);
 *     execonly and ad_disabled are only used for nested EPT which has
 *     has_4_byte_gpte=0.  Therefore, 2 bits are always unused.
 *
 *   - the 4 bits of level are effectively limited to the values 2/3/4/5,
 *     as 4k SPs are not tracked (allowed to go unsync).  In addition non-PAE
 *     paging has exactly one upper level, making level completely redundant
 *     when has_4_byte_gpte=1.
 *
 *   - on top of this, smep_andnot_wp and smap_andnot_wp are only set if
 *     cr0_wp=0, therefore these three bits only give rise to 5 possibilities.
 *
 * Therefore, the maximum number of possible upper-level shadow pages for a
 * single gfn is a bit less than 2^13.
 */
union kvm_mmu_page_role {
u32 word;
struct {
unsigned level:4;
unsigned has_4_byte_gpte:1;
unsigned quadrant:2;
unsigned direct:1;
unsigned access:3;
unsigned invalid:1;
unsigned efer_nx:1;
unsigned cr0_wp:1;
unsigned smep_andnot_wp:1;
unsigned smap_andnot_wp:1;
unsigned ad_disabled:1;
unsigned guest_mode:1;
unsigned passthrough:1;
unsigned :5;
/*
 * This is left at the top of the word so that
 * kvm_memslots_for_spte_role can extract it with a
 * simple shift.  While there is room, give it a whole
 * byte so it is also faster to load it from memory.
 */
unsigned smm:8;
};
};
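Because all of these mode bits share storage with the u32 word, KVM can decide whether an existing shadow page is reusable in a given MMU context with a single integer comparison. A minimal sketch of that idea (role_matches() is a hypothetical helper, not a kernel function; the real lookup compares sp->role.word against the wanted role while walking the hash list):

/* Hypothetical helper: two roles describe interchangeable shadow pages iff their packed words match. */
static bool role_matches(union kvm_mmu_page_role a, union kvm_mmu_page_role b)
{
	return a.word == b.word;
}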
static int vmx_get_max_tdp_level(void)
{
if (cpu_has_vmx_ept_5levels())
return 5;
return 4;
}
void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
int tdp_max_root_level, int tdp_huge_page_level)
{
tdp_enabled = enable_tdp;
tdp_root_level = tdp_forced_root_level;
max_tdp_level = tdp_max_root_level;
...
}
static __init int hardware_setup(void)
{
...
kvm_configure_mmu(enable_ept, 0, vmx_get_max_tdp_level(),
ept_caps_to_lpage_level(vmx_capability.ept));
...
}
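On the common MMU side, the values saved by kvm_configure_mmu() are later used to pick the TDP root level for each vCPU. The logic is roughly the following sketch of kvm_mmu_get_tdp_level() in arch/x86/kvm/mmu/mmu.c (details can differ between kernel versions):

static int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu)
{
	/* A non-zero tdp_root_level forces the root level, e.g. SVM with NPT. */
	if (tdp_root_level)
		return tdp_root_level;

	/* Use 5-level TDP if and only if it's useful/necessary. */
	if (max_tdp_level == 5 && cpuid_maxphyaddr(vcpu) <= 48)
		return 4;

	return max_tdp_level;
}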
struct kvm_mmu_page
struct kvm_mmu_page is the primary data structure for shadow pages.
A shadow page contains 512 SPTEs, which can be leaf SPTEs or non-leaf SPTEs.
A shadow page may contain a mix of leaf and non-leaf SPTEs.
arch/x86/kvm/mmu/mmu_internal.h
struct kvm_mmu_page {
/*
 * Note, "link" through "spt" fit in a single 64 byte cache line on
 * 64-bit kernels, keep it that way unless there's a reason not to.
 */
struct list_head link;
struct hlist_node hash_link;
bool tdp_mmu_page;
bool unsync;
u8 mmu_valid_gen;
bool lpage_disallowed; /* Can't be replaced by an equiv large page */

/*
 * The following two entries are used to key the shadow page in the
 * hash table.
 */
union kvm_mmu_page_role role;
gfn_t gfn;
u64 *spt;
/*
 * Stores the result of the guest translation being shadowed by each
 * SPTE.  KVM shadows two types of guest translations: nGPA -> GPA
 * (shadow EPT/NPT) and GVA -> GPA (traditional shadow paging).  In both
 * cases the result of the translation is a GPA and a set of access
 * constraints.
 *
 * The GFN is stored in the upper bits (PAGE_SHIFT) and the shadowed
 * access permissions are stored in the lower bits.  Note, for
 * convenience and uniformity across guests, the access permissions are
 * stored in KVM format (e.g. ACC_EXEC_MASK) not the raw guest format.
 */
u64 *shadowed_translation;
/* Currently serving as active root */
union {
int root_count;
refcount_t tdp_mmu_root_count;
};
unsigned int unsync_children;
union {
struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */
tdp_ptep_t ptep;
};
union {
DECLARE_BITMAP(unsync_child_bitmap, 512);
struct {
struct work_struct tdp_mmu_async_work;
void *tdp_mmu_async_data;
};
};
struct list_head lpage_disallowed_link;
#ifdef CONFIG_X86_32
/*
 * Used out of the mmu-lock to avoid reading spte values while an
 * update is in progress; see the comments in __get_spte_lockless().
 */
int clear_spte_count;
#endif

/* Number of writes since the last time traversal visited this page. */
atomic_t write_flooding_count;
#ifdef CONFIG_X86_64
/* Used for freeing the page asynchronously if it is a TDP MMU page. */
struct rcu_head rcu_head;
#endif
};
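Several functions below go the other way around: given a pointer to an SPTE inside sp->spt, they recover the owning kvm_mmu_page through the backing struct page's private field, which tdp_mmu_init_sp() (shown later) sets up. Roughly, as a sketch of sptep_to_sp()/to_shadow_page() (the exact definitions live under arch/x86/kvm/mmu and may vary slightly by kernel version):

static inline struct kvm_mmu_page *to_shadow_page(hpa_t shadow_page)
{
	struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT);

	/* page->private points back at the kvm_mmu_page, see set_page_private() in tdp_mmu_init_sp(). */
	return (struct kvm_mmu_page *)page_private(page);
}

static inline struct kvm_mmu_page *sptep_to_sp(u64 *sptep)
{
	return to_shadow_page(__pa(sptep));
}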
/*
 * Given an SPTE and its level, returns a pointer containing the host virtual
 * address of the child page table referenced by the SPTE. Returns null if
 * there is no such entry.
 */
tdp_ptep_t spte_to_child_pt(u64 spte, int level)
{
/*
 * There's no child entry if this entry isn't present or is a
 * last-level entry.
 */
if (!is_shadow_present_pte(spte) || is_last_spte(spte, level))
	return NULL;

/* PA -> KVM PFN -> page-aligned PA -> VA; the page table is then accessed through this VA. */
return (tdp_ptep_t)__va(spte_to_pfn(spte) << PAGE_SHIFT);
}
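The PA -> KVM PFN step above just masks out the physical-address bits of the SPTE and shifts them down, roughly as in this sketch of spte_to_pfn() (the exact mask name differs across kernel versions):

static inline kvm_pfn_t spte_to_pfn(u64 pte)
{
	/* Keep only the physical-address bits of the SPTE and convert them to a page frame number. */
	return (pte & SPTE_BASE_ADDR_MASK) >> PAGE_SHIFT;
}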
Establishing a TDP MMU mapping: kvm_tdp_mmu_map()
arch/x86/kvm/mmu/tdp_mmu.c
static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep,
gfn_t gfn, union kvm_mmu_page_role role)
{
	/* Point page->private of the page table page back at the kvm_mmu_page structure. */
	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);

	sp->role = role;   /* The child inherits the parent's role. */
	sp->gfn = gfn;     /* Starting page frame covered by the new page table page. */
	sp->ptep = sptep;  /* Virtual address of the parent SPTE; a freshly allocated page necessarily has exactly one parent. */
sp->tdp_mmu_page = true;
trace_kvm_mmu_get_page(sp, true);
}
static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
				  struct tdp_iter *iter)
{
struct kvm_mmu_page *parent_sp;
union kvm_mmu_page_role role;
/* Get the kvm_mmu_page of the parent page table page from the parent SPTE pointer. */
parent_sp = sptep_to_sp(rcu_dereference(iter->sptep));
role = parent_sp->role;  /* The child inherits the parent's role. */
role.level--;            /* Go down one level. */

/* Initialize the fields of the child page table page's kvm_mmu_page instance. */
tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role);
}
...
/*
 * Installs a last-level SPTE to handle a TDP page fault.
 * (NPT/EPT violation/misconfiguration)
 */
static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
					   struct kvm_page_fault *fault,
					   struct tdp_iter *iter)
{
struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep));
u64 new_spte;
int ret = RET_PF_FIXED;
bool wrprot = false;
/* By the time we get here, the target level must match the level of the current leaf shadow page; warn otherwise. */
if (WARN_ON_ONCE(sp->role.level != fault->goal_level))
return RET_PF_RETRY;
/* new_spte is the content of the new page table entry, expected to carry a physical address. */
if (unlikely(!fault->slot))
new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
else
	/* Build the content of the leaf SPTE. */
wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn,
fault->pfn, iter->old_spte, fault->prefetch, true,
fault->map_writable, &new_spte);
/* Write the new entry new_spte into the page table entry pointed to by iter->sptep. */
if (new_spte == iter->old_spte)
ret = RET_PF_SPURIOUS;
else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
return RET_PF_RETRY;
else if (is_shadow_present_pte(iter->old_spte) &&
!is_last_spte(iter->old_spte, iter->level))
kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
KVM_PAGES_PER_HPAGE(iter->level + 1));
/*
 * If the page fault was caused by a write but the page is write
 * protected, emulation is needed. If the emulation was skipped,
 * the vCPU would have the same fault again.
 */
if (wrprot) {
if (fault->write)
ret = RET_PF_EMULATE;
}
/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
if (unlikely(is_mmio_spte(new_spte))) {
vcpu->stat.pf_mmio_spte_created++;
trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
new_spte);
ret = RET_PF_EMULATE;
} else {
trace_kvm_mmu_set_spte(iter->level, iter->gfn,
rcu_dereference(iter->sptep));
}
return ret;
}
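For reference, the RET_PF_* codes used above are defined in arch/x86/kvm/mmu/mmu_internal.h and look roughly like this (the exact set, ordering, and comments depend on the kernel version):

enum {
	RET_PF_CONTINUE = 0,	/* So far so good, keep handling the page fault. */
	RET_PF_RETRY,		/* Let the vCPU fault again on the address. */
	RET_PF_EMULATE,		/* MMIO page fault, emulate the instruction directly. */
	RET_PF_INVALID,		/* The SPTE is invalid, let the real page fault path update it. */
	RET_PF_FIXED,		/* The faulting entry has been fixed. */
	RET_PF_SPURIOUS,	/* The faulting entry was already fixed, e.g. by another vCPU. */
};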
/*
 * tdp_mmu_link_sp - Replace the given spte with an spte pointing to the
 * provided page table.
 *
 * @kvm: kvm instance
 * @iter: a tdp_iter instance currently on the SPTE that should be set
 * @sp: The new TDP page table to install.
 * @shared: This operation is running under the MMU lock in read mode.
 *
 * Returns: 0 if the new page table was installed. Non-0 if the page table
 *          could not be installed (e.g. the atomic compare-exchange failed).
 */
static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
			   struct kvm_mmu_page *sp, bool shared)
{
	/* Build a non-leaf SPTE; its main content is the physical address derived from the child page table page's virtual address. */
	u64 spte = make_nonleaf_spte(sp->spt, !kvm_ad_enabled());
int ret = 0;
/* Set the parent page table entry (reached via iter->sptep) to the new SPTE value. */
if (shared) {
ret = tdp_mmu_set_spte_atomic(kvm, iter, spte);
if (ret)
return ret;
} else {
tdp_mmu_set_spte(kvm, iter, spte);
}
tdp_account_mmu_page(kvm, sp);
return 0;
}
...
/*
 * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
 * page tables and SPTEs to translate the faulting guest physical address.
 */
int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
struct kvm_mmu *mmu = vcpu->arch.mmu;
struct kvm *kvm = vcpu->kvm;
struct tdp_iter iter;
struct kvm_mmu_page *sp;
int ret = RET_PF_RETRY;
kvm_mmu_hugepage_adjust(vcpu, fault);
trace_kvm_mmu_spte_requested(fault);
rcu_read_lock();
/* Build the mapping level by level, walking the page table in pre-order. */
tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) {
int r;
/*
 * If the NX (non-executable) huge page workaround is enabled (briefly introduced
 * later), huge-page fault handling must be split up, using small pages in place
 * of the huge page.
 */
if (fault->nx_huge_page_workaround_enabled)
disallowed_hugepage_adjust(fault, iter.old_spte, iter.level);
/*
 * If SPTE has been frozen by another thread, just give up and
 * retry, avoiding unnecessary page table allocation and free.
 */
if (is_removed_spte(iter.old_spte))
goto retry;
/* At the target level, break out of the loop; otherwise keep building page table mappings, level by level, above the target. */
if (iter.level == fault->goal_level)
goto map_target_level;
/* Step down into the lower level page table if it exists and is not a large page. */
if (is_shadow_present_pte(iter.old_spte) &&
!is_large_pte(iter.old_spte))
continue;
/*
 * The SPTE is either non-present or points to a huge page that
 * needs to be split.
 */

/* Allocate a kvm_mmu_page instance for the child page table page, together with the page table page it manages; the spt field (an HVA) points to that page. */
sp = tdp_mmu_alloc_sp(vcpu);
/* Tie the child page table page to its parent and make the child's struct page->private point back at sp. */
tdp_mmu_init_child_sp(sp, &iter);

/* Propagate the "huge pages disallowed" attribute to the new sp. */
sp->nx_huge_page_disallowed = fault->huge_page_disallowed;
/* The SPTE points to a huge page that must be split; each entry of the child page table page will point into the split-up data page. */
if (is_shadow_present_pte(iter.old_spte))
	r = tdp_mmu_split_huge_page(kvm, &iter, sp, true);
/* Otherwise the child page table does not exist yet: set the parent SPTE (reached via iter->sptep) to the physical address of the newly allocated child page table page (derived from sp->spt). */
else
	r = tdp_mmu_link_sp(kvm, &iter, sp, true);
/*
 * Force the guest to retry if installing an upper level SPTE
 * failed, e.g. because a different task modified the SPTE.
 */
if (r) {
tdp_mmu_free_sp(sp);
goto retry;
}
/*
 * If the fault disallows huge pages, add sp to the possible_nx_huge_page_link
 * list; when the page is zapped later it may be restored as a non-executable huge page.
 */
if (fault->huge_page_disallowed &&
fault->req_level >= iter.level) {
spin_lock(&kvm->arch.tdp_mmu_pages_lock);
if (sp->nx_huge_page_disallowed)
track_possible_nx_huge_page(kvm, sp);
spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
}
}
/*
 * The walk aborted before reaching the target level, e.g. because the
 * iterator detected an upper level SPTE was frozen during traversal.
 */
WARN_ON_ONCE(iter.level == fault->goal_level);
goto retry;
/*
 * All the preparation above is done; the data page fault->pfn was already
 * allocated earlier in kvm_faultin_pfn(). Fill it into the SPTE of the
 * last-level page table page to complete the mapping.
 */
map_target_level:
ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);
retry:
rcu_read_unlock();
return ret;
}
#ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE
/*
 * Memory caches are used to preallocate memory ahead of various MMU flows,
 * e.g. page fault handlers.  Gracefully handling allocation failures deep in
 * MMU flows is problematic, as is triggering reclaim, I/O, etc... while
 * holding MMU locks.  Note, these caches act more like prefetch buffers than
 * classical caches, i.e. objects are not returned to the cache on being freed.
 *
 * The @capacity field and @objects array are lazily initialized when the cache
 * is topped up (__kvm_mmu_topup_memory_cache()).
 */
struct kvm_mmu_memory_cache {
int nobjs;
gfp_t gfp_zero;
gfp_t gfp_custom;
structkmem_cache *kmem_cache;
int capacity;
void **objects;
};
#endif
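The usual pattern is to top up the caches before taking mmu_lock (topping up may sleep and may fail) and to pop objects from them while holding the lock (which must neither fail nor sleep). A minimal sketch using the generic helpers from virt/kvm/kvm_main.c; the example_* wrappers and the choice of mmu_shadow_page_cache / PT64_ROOT_MAX_LEVEL are only for illustration:

/* Before taking mmu_lock: make sure enough objects are preallocated. */
static int example_topup(struct kvm_vcpu *vcpu)
{
	return kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadow_page_cache,
					  PT64_ROOT_MAX_LEVEL);
}

/* Under mmu_lock: no allocation happens here, a preallocated object is simply popped. */
static void *example_alloc(struct kvm_vcpu *vcpu)
{
	return kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
}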