xref: /illumos-kvm/kvm_mmu.c (revision 8c4fef53)
1 /*
2  * GPL HEADER START
3  *
4  * This program is free software; you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation; either version 2 of the License.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11  * GNU General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public License
14  * along with this program; if not, write to the Free Software
15  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
16  *
17  * GPL HEADER END
18  *
19  * Copyright 2011 various Linux Kernel contributors.
20  * Copyright 2017 Joyent, Inc.
21  * Copyright 2011 Joshua M. Clulow <josh@sysmgr.org>
22  * Copyright 2011 Richard Lowe
23  */
24 
25 #include <sys/sysmacros.h>
26 
27 #include "kvm_bitops.h"
28 #include "kvm_cache_regs.h"
29 #include "kvm_x86impl.h"
30 #include "kvm_host.h"
31 #include "kvm_mmu.h"
32 #include "msr-index.h"
33 
34 /*
35  * Setting this variable to true enables Two-Dimensional Paging, where
36  * the hardware walks two page tables:
37  * 1. the guest-virtual to guest-physical translation
38  * 2. while doing 1, the guest-physical to host-physical translation
39  * If the hardware supports this, we don't need to do shadow paging.
40  */
41 int tdp_enabled = 0;
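
/*
 * Editorial sketch (not in the original source): with TDP the hardware
 * composes two translations,
 *
 *	guest page tables:	GVA --> GPA
 *	EPT/NPT tables:		GPA --> HPA
 *
 * while with tdp_enabled == 0 the shadow MMU in this file folds both steps
 * into a single GVA --> HPA shadow page table that must be kept in sync
 * with the guest's own page tables.
 */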
42 
43 static int oos_shadow = 1;
44 
45 #define	virt_to_page(addr) pfn_to_page(hat_getpfnum(kas.a_hat, addr))
46 
47 #define	PT_FIRST_AVAIL_BITS_SHIFT 9
48 #define	PT64_SECOND_AVAIL_BITS_SHIFT 52
49 
50 #define	VALID_PAGE(x) ((x) != INVALID_PAGE)
51 
52 #define	PT64_LEVEL_BITS 9
53 
54 #define	PT64_LEVEL_SHIFT(level) \
55 		(PAGESHIFT + (level - 1) * PT64_LEVEL_BITS)
56 
57 #define	PT64_LEVEL_MASK(level) \
58 		(((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level))
59 
60 #define	PT64_INDEX(address, level)\
61 	(((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
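
/*
 * Editorial worked example (assuming PAGESHIFT == 12 on x86): the macros
 * above give PT64_LEVEL_SHIFT(1..4) == 12, 21, 30 and 39, and
 * PT64_INDEX(addr, level) extracts the 9-bit table index for that level,
 * e.g. PT64_INDEX(0x1000, 1) == 1 and PT64_INDEX(1ULL << 30, 3) == 1.
 */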
62 
63 #define	PT32_LEVEL_BITS 10
64 
65 #define	PT32_LEVEL_SHIFT(level) \
66 	(PAGESHIFT + (level - 1) * PT32_LEVEL_BITS)
67 
68 #define	PT32_LEVEL_MASK(level) \
69 	(((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level))
70 
71 #define	PT32_LVL_OFFSET_MASK(level) (PT32_BASE_ADDR_MASK & \
72 	((1ULL << (PAGESHIFT + (((level) - 1) * PT32_LEVEL_BITS))) - 1))
73 
74 
75 #define	PT32_INDEX(address, level) \
76 	(((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
77 
78 
79 #define	PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(uint64_t)(PAGESIZE-1))
80 
81 #define	PT64_DIR_BASE_ADDR_MASK \
82 	(PT64_BASE_ADDR_MASK & ~((1ULL << (PAGESHIFT + PT64_LEVEL_BITS)) - 1))
83 
84 #define	PT64_LVL_ADDR_MASK(level) \
85 	(PT64_BASE_ADDR_MASK & \
86 	~((1ULL << (PAGESHIFT + (((level) - 1) * PT64_LEVEL_BITS))) - 1))
87 
88 #define	PT64_LVL_OFFSET_MASK(level) (PT64_BASE_ADDR_MASK & \
89 	((1ULL << (PAGESHIFT + (((level) - 1) * PT64_LEVEL_BITS))) - 1))
90 
91 #define	PT32_BASE_ADDR_MASK PAGEMASK
92 
93 #define	PT32_DIR_BASE_ADDR_MASK \
94 	(PAGEMASK & ~((1ULL << (PAGESHIFT + PT32_LEVEL_BITS)) - 1))
95 
96 #define	PT32_LVL_ADDR_MASK(level) (PAGEMASK & \
97 	~((1ULL << (PAGESHIFT + (((level) - 1) * PT32_LEVEL_BITS))) - 1))
98 
99 #define	PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
100 	| PT64_NX_MASK)
101 
102 #define	RMAP_EXT 4
103 
104 #define	ACC_EXEC_MASK    1
105 #define	ACC_WRITE_MASK   PT_WRITABLE_MASK
106 #define	ACC_USER_MASK    PT_USER_MASK
107 #define	ACC_ALL		(ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
108 
109 #define	SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
110 
111 #define	SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
112 
113 typedef struct kvm_rmap_desc {
114 	uint64_t *sptes[RMAP_EXT];
115 	struct kvm_rmap_desc *more;
116 } kvm_rmap_desc_t;
117 
118 typedef struct kvm_shadow_walk_iterator {
119 	uint64_t addr;
120 	hpa_t shadow_addr;
121 	uint64_t *sptep;
122 	int level;
123 	unsigned index;
124 } kvm_shadow_walk_iterator_t;
125 
126 #define	for_each_shadow_entry(_vcpu, _addr, _walker)    \
127 	for (shadow_walk_init(&(_walker), _vcpu, _addr);	\
128 	    shadow_walk_okay(&(_walker), _vcpu);		\
129 	    shadow_walk_next(&(_walker)))
130 
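/*
 * Typical usage of the iterator above (editorial sketch; __direct_map()
 * later in this file is a real caller):
 *
 *	struct kvm_shadow_walk_iterator it;
 *
 *	for_each_shadow_entry(vcpu, addr, it) {
 *		... examine or install *it.sptep at it.level ...
 *	}
 */
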
131 typedef int (*mmu_parent_walk_fn) (struct kvm_vcpu *, struct kvm_mmu_page *);
132 
133 struct kmem_cache *pte_chain_cache;
134 struct kmem_cache *rmap_desc_cache;
135 struct kmem_cache *mmu_page_header_cache;
136 
137 static uint64_t shadow_trap_nonpresent_pte;
138 static uint64_t shadow_notrap_nonpresent_pte;
139 static uint64_t shadow_base_present_pte;
140 static uint64_t shadow_nx_mask;
141 static uint64_t shadow_x_mask;	/* mutually exclusive with nx_mask */
142 static uint64_t shadow_user_mask;
143 static uint64_t shadow_accessed_mask;
144 static uint64_t shadow_dirty_mask;
145 
146 static uint64_t
147 rsvd_bits(int s, int e)
148 {
149 	return (((1ULL << (e - s + 1)) - 1) << s);
150 }
151 
152 void
153 kvm_mmu_set_nonpresent_ptes(uint64_t trap_pte, uint64_t notrap_pte)
154 {
155 	shadow_trap_nonpresent_pte = trap_pte;
156 	shadow_notrap_nonpresent_pte = notrap_pte;
157 }
158 
159 void
160 kvm_mmu_set_base_ptes(uint64_t base_pte)
161 {
162 	shadow_base_present_pte = base_pte;
163 }
164 
165 void
166 kvm_mmu_set_mask_ptes(uint64_t user_mask, uint64_t accessed_mask,
167     uint64_t dirty_mask, uint64_t nx_mask, uint64_t x_mask)
168 {
169 	shadow_user_mask = user_mask;
170 	shadow_accessed_mask = accessed_mask;
171 	shadow_dirty_mask = dirty_mask;
172 	shadow_nx_mask = nx_mask;
173 	shadow_x_mask = x_mask;
174 }
175 
176 static int
177 is_write_protection(struct kvm_vcpu *vcpu)
178 {
179 	return (kvm_read_cr0_bits(vcpu, X86_CR0_WP));
180 }
181 
182 static int
183 is_cpuid_PSE36(void)
184 {
185 	return (1);
186 }
187 
188 static int
189 is_nx(struct kvm_vcpu *vcpu)
190 {
191 	return (vcpu->arch.efer & EFER_NX);
192 }
193 
194 static int
195 is_shadow_present_pte(uint64_t pte)
196 {
197 	return (pte != shadow_trap_nonpresent_pte &&
198 	    pte != shadow_notrap_nonpresent_pte);
199 }
200 
201 static int
202 is_large_pte(uint64_t pte)
203 {
204 	return (pte & PT_PAGE_SIZE_MASK);
205 }
206 
207 static int
208 is_writable_pte(unsigned long pte)
209 {
210 	return (pte & PT_WRITABLE_MASK);
211 }
212 
213 static int
214 is_dirty_gpte(unsigned long pte)
215 {
216 	return (pte & PT_DIRTY_MASK);
217 }
218 
219 static int
220 is_rmap_spte(uint64_t pte)
221 {
222 	return (is_shadow_present_pte(pte));
223 }
224 
225 static int
226 is_last_spte(uint64_t pte, int level)
227 {
228 	if (level == PT_PAGE_TABLE_LEVEL)
229 		return (1);
230 	if (is_large_pte(pte))
231 		return (1);
232 	return (0);
233 }
234 
235 static pfn_t
236 spte_to_pfn(uint64_t pte)
237 {
238 	return ((pte & PT64_BASE_ADDR_MASK) >> PAGESHIFT);
239 }
240 
241 static gfn_t
242 pse36_gfn_delta(uint32_t gpte)
243 {
244 	int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGESHIFT;
245 
246 	return ((gpte & PT32_DIR_PSE36_MASK) << shift);
247 }
248 
249 static void
250 __set_spte(uint64_t *sptep, uint64_t spte)
251 {
252 	*sptep = spte;
253 }
254 
255 static int
256 mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
257     struct kmem_cache *base_cache, int min)
258 {
259 	caddr_t obj;
260 
261 	if (cache->nobjs >= min)
262 		return (0);
263 	while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
264 		obj = kmem_cache_alloc(base_cache, KM_SLEEP);
265 		cache->objects[cache->nobjs].kma_object = obj;
266 		cache->objects[cache->nobjs++].kpm_object = NULL;
267 	}
268 	return (0);
269 }
270 
271 static int
272 mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache, int min)
273 {
274 	page_t *page;
275 
276 	if (cache->nobjs >= min)
277 		return (0);
278 
279 	while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
280 		page = alloc_page(KM_SLEEP,
281 		    &cache->objects[cache->nobjs].kma_object);
282 		if (!page)
283 			return (-ENOMEM);
284 
285 		cache->objects[cache->nobjs++].kpm_object = page_address(page);
286 	}
287 
288 	return (0);
289 }
290 
291 static int
292 mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
293 {
294 	int r = 0;
295 
296 	r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_chain_cache,
297 	    pte_chain_cache, 4);
298 
299 	if (r)
300 		goto out;
301 
302 	r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache,
303 	    rmap_desc_cache, 4);
304 
305 	if (r)
306 		goto out;
307 
308 	r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
309 
310 	if (r)
311 		goto out;
312 
313 	r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
314 	    mmu_page_header_cache, 4);
315 
316 out:
317 	return (r);
318 }
319 
320 static void *
321 mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc, size_t size)
322 {
323 	if (mc->objects[--mc->nobjs].kpm_object)
324 		return (mc->objects[mc->nobjs].kpm_object);
325 	else
326 		return (mc->objects[mc->nobjs].kma_object);
327 }
328 
329 static struct kvm_objects
330 mmu_memory_page_cache_alloc(struct kvm_mmu_memory_cache *mc, size_t size)
331 {
332 	return (mc->objects[--mc->nobjs]);
333 }
334 
335 static struct kvm_pte_chain *
336 mmu_alloc_pte_chain(struct kvm_vcpu *vcpu)
337 {
338 	return (mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_chain_cache,
339 	    sizeof (struct kvm_pte_chain)));
340 }
341 
342 static void
343 mmu_free_pte_chain(struct kvm_pte_chain *pc)
344 {
345 	kmem_cache_free(pte_chain_cache, pc);
346 }
347 
348 static struct kvm_rmap_desc *
349 mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
350 {
351 	return (mmu_memory_cache_alloc(&vcpu->arch.mmu_rmap_desc_cache,
352 	    sizeof (struct kvm_rmap_desc)));
353 }
354 
355 static void
356 mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
357 {
358 	kmem_cache_free(rmap_desc_cache, rd);
359 }
360 
361 /*
362  * Return the pointer to the largepage write count for a given
363  * gfn, handling slots that are not large page aligned.
364  */
365 int *
366 slot_largepage_idx(gfn_t gfn, struct kvm_memory_slot *slot, int level)
367 {
368 	unsigned long idx;
369 
370 	idx = (gfn / KVM_PAGES_PER_HPAGE(level)) -
371 	    (slot->base_gfn / KVM_PAGES_PER_HPAGE(level));
372 	return (&slot->lpage_info[level - 2][idx].write_count);
373 }
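
/*
 * Editorial example of the index math above, assuming 2 MB large pages
 * (KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL) == 512): for a slot with
 * base_gfn 0x100, gfn 0x350 yields idx = (0x350 / 512) - (0x100 / 512)
 * = 1 - 0 = 1.
 */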
374 
375 static void
376 account_shadowed(struct kvm *kvm, gfn_t gfn)
377 {
378 	struct kvm_memory_slot *slot;
379 	int *write_count;
380 	int i;
381 
382 	gfn = unalias_gfn(kvm, gfn);
383 
384 	slot = gfn_to_memslot_unaliased(kvm, gfn);
385 	for (i = PT_DIRECTORY_LEVEL;
386 		i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
387 			write_count = slot_largepage_idx(gfn, slot, i);
388 			*write_count += 1;
389 	}
390 }
391 
392 static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
393 {
394 	struct kvm_memory_slot *slot;
395 	int *write_count;
396 	int i;
397 
398 	gfn = unalias_gfn(kvm, gfn);
399 	for (i = PT_DIRECTORY_LEVEL;
400 		i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
401 			slot = gfn_to_memslot_unaliased(kvm, gfn);
402 			write_count = slot_largepage_idx(gfn, slot, i);
403 			*write_count -= 1;
404 			if (*write_count < 0)
405 				cmn_err(CE_WARN,
406 				"unaccount_shadowed: *write_count = %d (< 0)\n",
407 				*write_count);
408 	}
409 }
410 
411 static int
412 has_wrprotected_page(struct kvm *kvm, gfn_t gfn, int level)
413 {
414 	struct kvm_memory_slot *slot;
415 	int *largepage_idx;
416 
417 	gfn = unalias_gfn(kvm, gfn);
418 	slot = gfn_to_memslot_unaliased(kvm, gfn);
419 
420 	if (slot) {
421 		largepage_idx = slot_largepage_idx(gfn, slot, level);
422 		return (*largepage_idx);
423 	}
424 
425 	return (1);
426 }
427 
428 static int
429 host_mapping_level(struct kvm *kvm, gfn_t gfn)
430 {
431 	unsigned long page_size;
432 	int i, ret = 0;
433 
434 	page_size = kvm_host_page_size(kvm, gfn);
435 
436 	for (i = PT_PAGE_TABLE_LEVEL;
437 	    i < (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES); ++i) {
438 		if (page_size >= KVM_HPAGE_SIZE(i))
439 			ret = i;
440 		else
441 			break;
442 	}
443 
444 	return (ret);
445 }
446 
447 static int
448 mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
449 {
450 	struct kvm_memory_slot *slot;
451 	int host_level, level, max_level;
452 
453 	slot = gfn_to_memslot(vcpu->kvm, large_gfn);
454 	if (slot && slot->dirty_bitmap)
455 		return (PT_PAGE_TABLE_LEVEL);
456 
457 	host_level = host_mapping_level(vcpu->kvm, large_gfn);
458 
459 	if (host_level == PT_PAGE_TABLE_LEVEL)
460 		return (host_level);
461 
462 	max_level = kvm_x86_ops->get_lpage_level() < host_level ?
463 		kvm_x86_ops->get_lpage_level() : host_level;
464 
465 	for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
466 		if (has_wrprotected_page(vcpu->kvm, large_gfn, level))
467 			break;
468 
469 	return (level - 1);
470 }
471 
472 /*
473  * Take gfn and return the reverse mapping to it.
474  * Note: gfn must be unaliased before this function gets called
475  */
476 static unsigned long *
477 gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
478 {
479 	struct kvm_memory_slot *slot;
480 	unsigned long idx;
481 
482 	slot = gfn_to_memslot(kvm, gfn);
483 	if (level == PT_PAGE_TABLE_LEVEL)
484 	    return (&slot->rmap[gfn - slot->base_gfn]);
485 
486 	idx = (gfn / KVM_PAGES_PER_HPAGE(level)) -
487 	    (slot->base_gfn / KVM_PAGES_PER_HPAGE(level));
488 
489 	return (&slot->lpage_info[level - 2][idx].rmap_pde);
490 }
491 
492 /*
493  * Reverse mapping data structures:
494  *
495  * If rmapp bit zero is zero, then rmapp points to the shadow page table entry
496  * that points to page_address(page).
497  *
498  * If rmapp bit zero is one, then (rmapp & ~1) points to a struct kvm_rmap_desc
499  * containing more mappings.
500  *
501  * Returns the number of rmap entries before the spte was added or zero if
502  * the spte was not added.
503  *
504  */
505 static int
506 rmap_add(struct kvm_vcpu *vcpu, uint64_t *spte, gfn_t gfn)
507 {
508 	struct kvm_mmu_page *sp;
509 	struct kvm_rmap_desc *desc;
510 	unsigned long *rmapp;
511 	int i, count = 0;
512 
513 	if (!is_rmap_spte(*spte))
514 		return (count);
515 
516 	gfn = unalias_gfn(vcpu->kvm, gfn);
517 	sp = page_header(vcpu->kvm, kvm_va2pa((caddr_t)spte));
518 	sp->gfns[spte - sp->spt] = gfn;
519 	rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
520 	if (!*rmapp) {
521 		*rmapp = (unsigned long)spte;
522 	} else if (!(*rmapp & 1)) {
523 		desc = mmu_alloc_rmap_desc(vcpu);
524 		desc->sptes[0] = (uint64_t *)*rmapp;
525 		desc->sptes[1] = spte;
526 		*rmapp = (unsigned long)desc | 1;
527 	} else {
528 		desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
529 		while (desc->sptes[RMAP_EXT-1] && desc->more) {
530 			desc = desc->more;
531 			count += RMAP_EXT;
532 		}
533 		if (desc->sptes[RMAP_EXT-1]) {
534 			desc->more = mmu_alloc_rmap_desc(vcpu);
535 			desc = desc->more;
536 		}
537 		for (i = 0; desc->sptes[i]; i++)
538 			continue;
539 		desc->sptes[i] = spte;
540 	}
541 	return (count);
542 }
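
/*
 * Editorial decode of the rmapp encoding described above (a sketch that
 * mirrors rmap_add() and rmap_next()):
 *
 *	if (*rmapp == 0)	-> no sptes map this gfn
 *	else if (!(*rmapp & 1))	-> *rmapp is the single spte pointer
 *	else			-> (*rmapp & ~1ul) is a kvm_rmap_desc chain,
 *				   each node holding up to RMAP_EXT sptes
 */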
543 
544 static void
545 rmap_desc_remove_entry(unsigned long *rmapp, struct kvm_rmap_desc *desc,
546     int i, struct kvm_rmap_desc *prev_desc)
547 {
548 	int j;
549 
550 	for (j = RMAP_EXT - 1; !desc->sptes[j] && j > i; --j)
551 		continue;
552 
553 	desc->sptes[i] = desc->sptes[j];
554 	desc->sptes[j] = NULL;
555 
556 	if (j != 0)
557 		return;
558 	if (!prev_desc && !desc->more) {
559 		*rmapp = (unsigned long)desc->sptes[0];
560 	} else {
561 		if (prev_desc)
562 			prev_desc->more = desc->more;
563 		else
564 			*rmapp = (unsigned long)desc->more | 1;
565 	}
566 
567 	mmu_free_rmap_desc(desc);
568 }
569 
570 static void
571 rmap_remove(struct kvm *kvm, uint64_t *spte)
572 {
573 	struct kvm_rmap_desc *desc;
574 	struct kvm_rmap_desc *prev_desc;
575 	struct kvm_mmu_page *sp;
576 	pfn_t pfn;
577 	unsigned long *rmapp;
578 	int i;
579 
580 	if (!is_rmap_spte(*spte))
581 		return;
582 	sp = page_header(kvm, kvm_va2pa((caddr_t)spte));
583 	pfn = spte_to_pfn(*spte);
584 	if (*spte & shadow_accessed_mask)
585 		kvm_set_pfn_accessed(kvm, pfn);
586 	if (is_writable_pte(*spte))
587 		kvm_set_pfn_dirty(pfn);
588 	rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], sp->role.level);
589 	if (!*rmapp) {
590 		panic("rmap_remove: %p %lx 0->BUG\n", spte, *spte);
591 	} else if (!(*rmapp & 1)) {
592 		if ((uint64_t *)*rmapp != spte) {
593 			panic("rmap_remove:  %p %lx 1->BUG\n", spte, *spte);
594 		}
595 		*rmapp = 0;
596 	} else {
597 		desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
598 		prev_desc = NULL;
599 		while (desc) {
600 			for (i = 0; i < RMAP_EXT && desc->sptes[i]; i++) {
601 				if (desc->sptes[i] == spte) {
602 					rmap_desc_remove_entry(rmapp,
603 					    desc, i, prev_desc);
604 					return;
605 				}
606 			}
607 
608 			prev_desc = desc;
609 			desc = desc->more;
610 		}
611 		panic("rmap_remove: %p %lx many->many\n", spte, *spte);
612 	}
613 }
614 
615 static uint64_t *
616 rmap_next(struct kvm *kvm, unsigned long *rmapp, uint64_t *spte)
617 {
618 	struct kvm_rmap_desc *desc;
619 	struct kvm_rmap_desc *prev_desc;
620 	uint64_t *prev_spte;
621 	int i;
622 
623 	if (!*rmapp)
624 		return (NULL);
625 	else if (!(*rmapp & 1)) {
626 		if (!spte)
627 			return ((uint64_t *)*rmapp);
628 		return (NULL);
629 	}
630 
631 	desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
632 	prev_desc = NULL;
633 	prev_spte = NULL;
634 	while (desc) {
635 		for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) {
636 			if (prev_spte == spte)
637 				return (desc->sptes[i]);
638 			prev_spte = desc->sptes[i];
639 		}
640 		desc = desc->more;
641 	}
642 
643 	return (NULL);
644 }
645 
646 static int
647 rmap_write_protect(struct kvm *kvm, uint64_t gfn)
648 {
649 	unsigned long *rmapp;
650 	uint64_t *spte;
651 	int i, write_protected = 0;
652 
653 	gfn = unalias_gfn(kvm, gfn);
654 	rmapp = gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL);
655 
656 	spte = rmap_next(kvm, rmapp, NULL);
657 	while (spte) {
658 		ASSERT(spte);
659 		ASSERT(*spte & PT_PRESENT_MASK);
660 		if (is_writable_pte(*spte)) {
661 			__set_spte(spte, *spte & ~PT_WRITABLE_MASK);
662 			write_protected = 1;
663 		}
664 		spte = rmap_next(kvm, rmapp, spte);
665 	}
666 	if (write_protected) {
667 		pfn_t pfn;
668 
669 		spte = rmap_next(kvm, rmapp, NULL);
670 		pfn = spte_to_pfn(*spte);
671 		kvm_set_pfn_dirty(pfn);
672 	}
673 
674 	/* check for huge page mappings */
675 	for (i = PT_DIRECTORY_LEVEL;
676 	    i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; i++) {
677 		rmapp = gfn_to_rmap(kvm, gfn, i);
678 		spte = rmap_next(kvm, rmapp, NULL);
679 		while (spte) {
680 			ASSERT(spte);
681 			ASSERT(*spte & PT_PRESENT_MASK);
682 			ASSERT((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) ==
683 			    (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
684 
685 			if (is_writable_pte(*spte)) {
686 				rmap_remove(kvm, spte);
687 				KVM_KSTAT_DEC(kvm, kvmks_lpages);
688 				__set_spte(spte, shadow_trap_nonpresent_pte);
689 				spte = NULL;
690 				write_protected = 1;
691 			}
692 			spte = rmap_next(kvm, rmapp, spte);
693 		}
694 	}
695 
696 	return (write_protected);
697 }
698 
699 static void
700 kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
701 {
702 	kmem_free(sp->sptkma, PAGESIZE);
703 	kmem_free(sp->gfnskma, PAGESIZE);
704 
705 	mutex_enter(&kvm->kvm_avllock);
706 	avl_remove(&kvm->kvm_avlmp, sp);
707 	mutex_exit(&kvm->kvm_avllock);
708 	list_remove(&kvm->arch.active_mmu_pages, sp);
709 	kmem_cache_free(mmu_page_header_cache, sp);
710 	++kvm->arch.n_free_mmu_pages;
711 }
712 
713 static unsigned
714 kvm_page_table_hashfn(gfn_t gfn)
715 {
716 	return (gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1));
717 }
718 
719 static void
720 bitmap_zero(unsigned long *dst, int nbits)
721 {
722 	int len = BITS_TO_LONGS(nbits) * sizeof (unsigned long);
723 	memset(dst, 0, len);
724 }
725 
726 static struct kvm_mmu_page *
727 kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, uint64_t *parent_pte)
728 {
729 	struct kvm_mmu_page *sp;
730 	struct kvm_objects kobj;
731 
732 	sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache,
733 	    sizeof (*sp));
734 	kobj = mmu_memory_page_cache_alloc(&vcpu->arch.mmu_page_cache,
735 	    PAGESIZE);
736 	sp->spt = kobj.kpm_object;
737 	sp->sptkma = kobj.kma_object;
738 	kobj = mmu_memory_page_cache_alloc(&vcpu->arch.mmu_page_cache,
739 	    PAGESIZE);
740 	sp->gfns = kobj.kpm_object;
741 	sp->gfnskma = kobj.kma_object;
742 	sp->kmp_avlspt = (uintptr_t)virt_to_page((caddr_t)sp->spt);
743 	sp->vcpu = vcpu;
744 
745 	mutex_enter(&vcpu->kvm->kvm_avllock);
746 	avl_add(&vcpu->kvm->kvm_avlmp, sp);
747 	mutex_exit(&vcpu->kvm->kvm_avllock);
748 
749 	list_insert_head(&vcpu->kvm->arch.active_mmu_pages, sp);
750 
751 	bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
752 	sp->multimapped = 0;
753 	sp->parent_pte = parent_pte;
754 	--vcpu->kvm->arch.n_free_mmu_pages;
755 	return (sp);
756 }
757 
758 static void
759 mmu_page_remove_parent_pte(struct kvm_mmu_page *sp, uint64_t *parent_pte)
760 {
761 	struct kvm_pte_chain *pte_chain;
762 	struct list_t *node;
763 	int i;
764 
765 	if (!sp->multimapped) {
766 		sp->parent_pte = NULL;
767 		return;
768 	}
769 
770 	for (pte_chain = list_head(&sp->parent_ptes); pte_chain != NULL;
771 	    pte_chain = list_next(&sp->parent_ptes, pte_chain)) {
772 		for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
773 			if (!pte_chain->parent_ptes[i])
774 				break;
775 			if (pte_chain->parent_ptes[i] != parent_pte)
776 				continue;
777 			while (i + 1 < NR_PTE_CHAIN_ENTRIES &&
778 			    pte_chain->parent_ptes[i + 1]) {
779 				pte_chain->parent_ptes[i] =
780 				    pte_chain->parent_ptes[i + 1];
781 				i++;
782 			}
783 			pte_chain->parent_ptes[i] = NULL;
784 			if (i == 0) {
785 				list_remove(&sp->parent_ptes, pte_chain);
786 				mmu_free_pte_chain(pte_chain);
787 				if (list_is_empty(&sp->parent_ptes)) {
788 					sp->multimapped = 0;
789 					sp->parent_pte = NULL;
790 				}
791 			}
792 			return;
793 		}
794 	}
795 	panic("We shouldn't make it here\n");
796 }
797 
798 static void
799 mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
800     struct kvm_mmu_page *sp, uint64_t *parent_pte)
801 {
802 	struct kvm_pte_chain *pte_chain;
803 	struct hlist_node *node;
804 	int i;
805 
806 	if (!parent_pte)
807 		return;
808 	if (!sp->multimapped) {
809 		uint64_t *old = sp->parent_pte;
810 
811 		if (!old) {
812 			sp->parent_pte = parent_pte;
813 			return;
814 		}
815 		sp->multimapped = 1;
816 		pte_chain = mmu_alloc_pte_chain(vcpu);
817 		list_create(&sp->parent_ptes, sizeof (struct kvm_pte_chain),
818 			    offsetof(struct kvm_pte_chain, link));
819 		list_insert_head(&sp->parent_ptes, pte_chain);
820 		pte_chain->parent_ptes[0] = old;
821 	}
822 
823 	for (pte_chain = list_head(&sp->parent_ptes); pte_chain != NULL;
824 	    pte_chain = list_next(&sp->parent_ptes, pte_chain)) {
825 		if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1])
826 			continue;
827 		for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
828 			if (!pte_chain->parent_ptes[i]) {
829 				pte_chain->parent_ptes[i] = parent_pte;
830 				return;
831 			}
832 		}
833 	}
834 
835 	pte_chain = mmu_alloc_pte_chain(vcpu);
836 	list_insert_head(&sp->parent_ptes, pte_chain);
837 	pte_chain->parent_ptes[0] = parent_pte;
838 }
839 
840 static void
841 mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
842     mmu_parent_walk_fn fn)
843 {
844 	struct kvm_pte_chain *pte_chain;
845 	struct hlist_node *node;
846 	struct kvm_mmu_page *parent_sp;
847 	int i;
848 
849 	if (!sp->multimapped && sp->parent_pte) {
850 		parent_sp = page_header(vcpu->kvm,
851 		    kvm_va2pa((caddr_t)sp->parent_pte));
852 
853 		fn(vcpu, parent_sp);
854 		mmu_parent_walk(vcpu, parent_sp, fn);
855 		return;
856 	}
857 
858 	for (pte_chain = list_head(&sp->parent_ptes); pte_chain != NULL;
859 	    pte_chain = list_next(&sp->parent_ptes, pte_chain)) {
860 		for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
861 			if (!pte_chain->parent_ptes[i])
862 				break;
863 
864 			parent_sp = page_header(vcpu->kvm, kvm_va2pa(
865 			    (caddr_t)pte_chain->parent_ptes[i]));
866 			fn(vcpu, parent_sp);
867 			mmu_parent_walk(vcpu, parent_sp, fn);
868 		}
869 	}
870 }
871 
872 static void
873 kvm_mmu_update_unsync_bitmap(uint64_t *spte, struct kvm *kvm)
874 {
875 	unsigned int index;
876 	struct kvm_mmu_page *sp = page_header(kvm, kvm_va2pa((caddr_t)spte));
877 
878 	index = spte - sp->spt;
879 	if (!__test_and_set_bit(index, sp->unsync_child_bitmap))
880 		sp->unsync_children++;
881 }
882 
883 static void
884 kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp, struct kvm *kvm)
885 {
886 	struct kvm_pte_chain *pte_chain;
887 	int i;
888 
889 	if (!sp->parent_pte)
890 		return;
891 
892 	if (!sp->multimapped) {
893 		kvm_mmu_update_unsync_bitmap(sp->parent_pte, kvm);
894 		return;
895 	}
896 
897 	for (pte_chain = list_head(&sp->parent_ptes); pte_chain != NULL;
898 	    pte_chain = list_next(&sp->parent_ptes, pte_chain)) {
899 		for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
900 			if (!pte_chain->parent_ptes[i])
901 				break;
902 			kvm_mmu_update_unsync_bitmap(pte_chain->parent_ptes[i],
903 			    kvm);
904 		}
905 	}
906 }
907 
908 static int
909 unsync_walk_fn(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
910 {
911 	kvm_mmu_update_parents_unsync(sp, vcpu->kvm);
912 	return (1);
913 }
914 
915 void
916 kvm_mmu_mark_parents_unsync(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
917 {
918 	mmu_parent_walk(vcpu, sp, unsync_walk_fn);
919 	kvm_mmu_update_parents_unsync(sp, vcpu->kvm);
920 }
921 
922 static void
923 nonpaging_prefetch_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
924 {
925 	int i;
926 
927 	for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
928 		sp->spt[i] = shadow_trap_nonpresent_pte;
929 }
930 
931 static int
932 nonpaging_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
933 {
934 	return (1);
935 }
936 
937 static void
938 nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
939 {}
940 
941 #define	KVM_PAGE_ARRAY_NR 16
942 
943 typedef struct kvm_mmu_pages {
944 	struct mmu_page_and_offset {
945 		struct kvm_mmu_page *sp;
946 		unsigned int idx;
947 	} page[KVM_PAGE_ARRAY_NR];
948 	unsigned int nr;
949 } kvm_mmu_pages_t;
950 
951 #define	for_each_unsync_children(bitmap, idx)		\
952 	for (idx = bt_getlowbit(bitmap, 0, 511);	\
953 	    (idx != -1) && (idx < 512);			\
954 	    idx = bt_getlowbit(bitmap, idx+1, 511))
955 
956 static int
957 mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp, int idx)
958 {
959 	int i;
960 
961 	if (sp->unsync) {
962 		for (i = 0; i < pvec->nr; i++) {
963 			if (pvec->page[i].sp == sp)
964 				return (0);
965 		}
966 	}
967 
968 	pvec->page[pvec->nr].sp = sp;
969 	pvec->page[pvec->nr].idx = idx;
970 	pvec->nr++;
971 
972 	return (pvec->nr == KVM_PAGE_ARRAY_NR);
973 }
974 
975 static int
976 __mmu_unsync_walk(struct kvm_mmu_page *sp, struct kvm_mmu_pages *pvec,
977     struct kvm *kvm)
978 {
979 	int i, ret, nr_unsync_leaf = 0;
980 
981 	for_each_unsync_children(sp->unsync_child_bitmap, i) {
982 		uint64_t ent = sp->spt[i];
983 
984 		if (is_shadow_present_pte(ent) && !is_large_pte(ent)) {
985 			struct kvm_mmu_page *child;
986 			child = page_header(kvm, ent & PT64_BASE_ADDR_MASK);
987 
988 			if (child->unsync_children) {
989 				if (mmu_pages_add(pvec, child, i))
990 					return (-ENOSPC);
991 				ret = __mmu_unsync_walk(child, pvec, kvm);
992 				if (!ret) {
993 					__clear_bit(i, sp->unsync_child_bitmap);
994 				} else if (ret > 0)
995 					nr_unsync_leaf += ret;
996 				else
997 					return (ret);
998 			}
999 
1000 			if (child->unsync) {
1001 				nr_unsync_leaf++;
1002 				if (mmu_pages_add(pvec, child, i))
1003 					return (-ENOSPC);
1004 			}
1005 		}
1006 	}
1007 
1008 	if (bt_getlowbit(sp->unsync_child_bitmap, 0, 511) == -1)
1009 		sp->unsync_children = 0;
1010 
1011 	return (nr_unsync_leaf);
1012 }
1013 
1014 static int
1015 mmu_unsync_walk(struct kvm_mmu_page *sp,
1016     struct kvm_mmu_pages *pvec, struct kvm *kvm)
1017 {
1018 	if (!sp->unsync_children)
1019 		return (0);
1020 
1021 	mmu_pages_add(pvec, sp, 0);
1022 	return (__mmu_unsync_walk(sp, pvec, kvm));
1023 }
1024 
1025 static struct kvm_mmu_page *
1026 kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
1027 {
1028 	unsigned index;
1029 	list_t *bucket;
1030 	struct kvm_mmu_page *sp;
1031 
1032 	index = kvm_page_table_hashfn(gfn);
1033 	bucket = &kvm->arch.mmu_page_hash[index];
1034 	for (sp = list_head(bucket); sp; sp = list_next(bucket, sp)) {
1035 		if (sp->gfn == gfn && !sp->role.direct &&
1036 		    !sp->role.invalid) {
1037 			return (sp);
1038 		}
1039 	}
1040 
1041 	return (NULL);
1042 }
1043 
1044 
1045 static void
1046 kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1047 {
1048 	sp->unsync = 0;
1049 	KVM_KSTAT_DEC(kvm, kvmks_mmu_unsync_page);
1050 }
1051 
1052 
1053 static int kvm_mmu_zap_page(struct kvm *, struct kvm_mmu_page *);
1054 
1055 static int
1056 kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1057 {
1058 	if (sp->role.glevels != vcpu->arch.mmu.root_level) {
1059 		kvm_mmu_zap_page(vcpu->kvm, sp);
1060 		return (1);
1061 	}
1062 
1063 	KVM_TRACE1(mmu__sync__page, struct kvm_mmu_page *, sp);
1064 
1065 	if (rmap_write_protect(vcpu->kvm, sp->gfn))
1066 		kvm_flush_remote_tlbs(vcpu->kvm);
1067 	kvm_unlink_unsync_page(vcpu->kvm, sp);
1068 	if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
1069 		kvm_mmu_zap_page(vcpu->kvm, sp);
1070 		return (1);
1071 	}
1072 
1073 	kvm_mmu_flush_tlb(vcpu);
1074 	return (0);
1075 }
1076 
1077 typedef struct mmu_page_path {
1078 	struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1];
1079 	unsigned int idx[PT64_ROOT_LEVEL-1];
1080 } mmu_page_path_t;
1081 
1082 #define	for_each_sp(pvec, sp, parents, i)				\
1083 		for (i = mmu_pages_next(&pvec, &parents, -1),		\
1084 			sp = pvec.page[i].sp;				\
1085 			/*CSTYLED*/					\
1086 			i < pvec.nr && ({ sp = pvec.page[i].sp; 1; });	\
1087 			i = mmu_pages_next(&pvec, &parents, i))
1088 
1089 static int
1090 mmu_pages_next(struct kvm_mmu_pages *pvec, struct mmu_page_path *parents, int i)
1091 {
1092 	int n;
1093 
1094 	for (n = i + 1; n < pvec->nr; n++) {
1095 		struct kvm_mmu_page *sp = pvec->page[n].sp;
1096 
1097 		if (sp->role.level == PT_PAGE_TABLE_LEVEL) {
1098 			parents->idx[0] = pvec->page[n].idx;
1099 			return (n);
1100 		}
1101 
1102 		parents->parent[sp->role.level-2] = sp;
1103 		parents->idx[sp->role.level-1] = pvec->page[n].idx;
1104 	}
1105 
1106 	return (n);
1107 }
1108 
1109 static void
1110 mmu_pages_clear_parents(struct mmu_page_path *parents)
1111 {
1112 	struct kvm_mmu_page *sp;
1113 	unsigned int level = 0;
1114 
1115 	do {
1116 		unsigned int idx = parents->idx[level];
1117 
1118 		sp = parents->parent[level];
1119 		if (!sp)
1120 			return;
1121 
1122 		--sp->unsync_children;
1123 		if ((int)sp->unsync_children < 0)
1124 			cmn_err(CE_WARN,
1125 			    "mmu_pages_clear_parents: unsync_children (%d)\n",
1126 			    (int)sp->unsync_children);
1127 		__clear_bit(idx, sp->unsync_child_bitmap);
1128 		level++;
1129 	} while (level < PT64_ROOT_LEVEL-1 && !sp->unsync_children);
1130 }
1131 
1132 static void
1133 kvm_mmu_pages_init(struct kvm_mmu_page *parent, struct mmu_page_path *parents,
1134     struct kvm_mmu_pages *pvec)
1135 {
1136 	parents->parent[parent->role.level-1] = NULL;
1137 	pvec->nr = 0;
1138 }
1139 
1140 static void
1141 mmu_sync_children(struct kvm_vcpu *vcpu, struct kvm_mmu_page *parent)
1142 {
1143 	int i;
1144 	struct kvm_mmu_page *sp;
1145 	struct mmu_page_path parents;
1146 	struct kvm_mmu_pages pages;
1147 
1148 	kvm_mmu_pages_init(parent, &parents, &pages);
1149 	while (mmu_unsync_walk(parent, &pages, vcpu->kvm)) {
1150 		int protected = 0;
1151 
1152 		for_each_sp(pages, sp, parents, i)
1153 			protected |= rmap_write_protect(vcpu->kvm, sp->gfn);
1154 
1155 		if (protected)
1156 			kvm_flush_remote_tlbs(vcpu->kvm);
1157 
1158 		for_each_sp(pages, sp, parents, i) {
1159 			kvm_sync_page(vcpu, sp);
1160 			mmu_pages_clear_parents(&parents);
1161 		}
1162 		mutex_enter(&vcpu->kvm->mmu_lock);
1163 		kvm_mmu_pages_init(parent, &parents, &pages);
1164 		mutex_exit(&vcpu->kvm->mmu_lock);
1165 	}
1166 }
1167 
1168 struct kvm_mmu_page *
1169 kvm_mmu_get_page(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gaddr, unsigned level,
1170     int direct, unsigned access, uint64_t *parent_pte)
1171 {
1172 	union kvm_mmu_page_role role;
1173 	unsigned index;
1174 	unsigned quadrant;
1175 	list_t *bucket;
1176 	struct kvm_mmu_page *sp, *nsp = NULL;
1177 	struct hlist_node *node, *tmp;
1178 
1179 	role = vcpu->arch.mmu.base_role;
1180 	role.level = level;
1181 	role.direct = direct;
1182 	role.access = access;
1183 
1184 	if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
1185 		quadrant = gaddr >> (PAGESHIFT + (PT64_PT_BITS * level));
1186 		quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
1187 		role.quadrant = quadrant;
1188 	}
1189 
1190 	index = kvm_page_table_hashfn(gfn);
1191 	bucket = &vcpu->kvm->arch.mmu_page_hash[index];
1192 
1193 	for (sp = list_head(bucket); sp != NULL; sp = nsp) {
1194 		nsp = list_next(bucket, sp);
1195 		if (sp->gfn == gfn) {
1196 			if (sp->unsync)
1197 				if (kvm_sync_page(vcpu, sp))
1198 					continue;
1199 
1200 			if (sp->role.word != role.word)
1201 				continue;
1202 
1203 			mmu_page_add_parent_pte(vcpu, sp, parent_pte);
1204 			if (sp->unsync_children) {
1205 				set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests);
1206 				kvm_mmu_mark_parents_unsync(vcpu, sp);
1207 			}
1208 			return (sp);
1209 		}
1210 	}
1211 
1212 	KVM_KSTAT_INC(vcpu->kvm, kvmks_mmu_cache_miss);
1213 	sp = kvm_mmu_alloc_page(vcpu, parent_pte);
1214 
1215 	if (!sp)
1216 		return (sp);
1217 
1218 	sp->gfn = gfn;
1219 	sp->role = role;
1220 	list_insert_head(bucket, sp);
1221 	if (!direct) {
1222 		if (rmap_write_protect(vcpu->kvm, gfn))
1223 			kvm_flush_remote_tlbs(vcpu->kvm);
1224 		account_shadowed(vcpu->kvm, gfn);
1225 	}
1226 
1227 	if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte)
1228 		vcpu->arch.mmu.prefetch_page(vcpu, sp);
1229 	else
1230 		nonpaging_prefetch_page(vcpu, sp);
1231 
1232 	KVM_TRACE1(mmu__get__page, struct kvm_mmu_page *, sp);
1233 
1234 	return (sp);
1235 }
1236 
1237 static void
1238 shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
1239     struct kvm_vcpu *vcpu, uint64_t addr)
1240 {
1241 	iterator->addr = addr;
1242 	iterator->shadow_addr = vcpu->arch.mmu.root_hpa;
1243 	iterator->level = vcpu->arch.mmu.shadow_root_level;
1244 	if (iterator->level == PT32E_ROOT_LEVEL) {
1245 		iterator->shadow_addr =
1246 		    vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
1247 		iterator->shadow_addr &= PT64_BASE_ADDR_MASK;
1248 		--iterator->level;
1249 		if (!iterator->shadow_addr)
1250 			iterator->level = 0;
1251 	}
1252 }
1253 
1254 static int
1255 shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator,
1256     struct kvm_vcpu *vcpu)
1257 {
1258 	if (iterator->level < PT_PAGE_TABLE_LEVEL)
1259 		return (0);
1260 
1261 	if (iterator->level == PT_PAGE_TABLE_LEVEL) {
1262 		if (is_large_pte(*iterator->sptep))
1263 			return (0);
1264 	}
1265 
1266 	iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level);
1267 	iterator->sptep =
1268 	    (uint64_t *)page_address(pfn_to_page((iterator->shadow_addr) >>
1269 	    PAGESHIFT)) + iterator->index;
1270 
1271 	return (1);
1272 }
1273 
1274 static void
1275 shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
1276 {
1277 	iterator->shadow_addr = *iterator->sptep & PT64_BASE_ADDR_MASK;
1278 	--iterator->level;
1279 }
1280 
1281 static void
1282 kvm_mmu_page_unlink_children(struct kvm *kvm, struct kvm_mmu_page *sp)
1283 {
1284 	unsigned i;
1285 	uint64_t *pt;
1286 	uint64_t ent;
1287 
1288 	pt = sp->spt;
1289 
1290 	for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
1291 		ent = pt[i];
1292 
1293 		if (is_shadow_present_pte(ent)) {
1294 			if (!is_last_spte(ent, sp->role.level)) {
1295 				ent &= PT64_BASE_ADDR_MASK;
1296 				mmu_page_remove_parent_pte(page_header(kvm,
1297 				    ent), &pt[i]);
1298 			} else {
1299 				rmap_remove(kvm, &pt[i]);
1300 			}
1301 		}
1302 		pt[i] = shadow_trap_nonpresent_pte;
1303 	}
1304 }
1305 
1306 static void
1307 kvm_mmu_put_page(struct kvm_mmu_page *sp, uint64_t *parent_pte)
1308 {
1309 	mmu_page_remove_parent_pte(sp, parent_pte);
1310 }
1311 
1312 static void
1313 kvm_mmu_reset_last_pte_updated(struct kvm *kvm)
1314 {
1315 	int i;
1316 	struct kvm_vcpu *vcpu;
1317 
1318 	kvm_for_each_vcpu(i, vcpu, kvm)
1319 		vcpu->arch.last_pte_updated = NULL;
1320 }
1321 
1322 static void
1323 kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
1324 {
1325 	uint64_t *parent_pte;
1326 
1327 	while (sp->multimapped || sp->parent_pte) {
1328 		if (!sp->multimapped)
1329 			parent_pte = sp->parent_pte;
1330 		else {
1331 			struct kvm_pte_chain *chain;
1332 
1333 			chain = list_head(&sp->parent_ptes);
1334 
1335 			parent_pte = chain->parent_ptes[0];
1336 		}
1337 
1338 		kvm_mmu_put_page(sp, parent_pte);
1339 		__set_spte(parent_pte, shadow_trap_nonpresent_pte);
1340 	}
1341 }
1342 
1343 static int
1344 mmu_zap_unsync_children(struct kvm *kvm, struct kvm_mmu_page *parent)
1345 {
1346 	int i, zapped = 0;
1347 	struct mmu_page_path parents;
1348 	struct kvm_mmu_pages pages;
1349 
1350 	if (parent->role.level == PT_PAGE_TABLE_LEVEL)
1351 		return (0);
1352 
1353 	kvm_mmu_pages_init(parent, &parents, &pages);
1354 	while (mmu_unsync_walk(parent, &pages, kvm)) {
1355 		struct kvm_mmu_page *sp;
1356 
1357 		for_each_sp(pages, sp, parents, i) {
1358 			kvm_mmu_zap_page(kvm, sp);
1359 			mmu_pages_clear_parents(&parents);
1360 			zapped++;
1361 		}
1362 		kvm_mmu_pages_init(parent, &parents, &pages);
1363 	}
1364 
1365 	return (zapped);
1366 }
1367 
1368 static int
1369 kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1370 {
1371 	int ret;
1372 
1373 	ret = mmu_zap_unsync_children(kvm, sp);
1374 	kvm_mmu_page_unlink_children(kvm, sp);
1375 	kvm_mmu_unlink_parents(kvm, sp);
1376 	kvm_flush_remote_tlbs(kvm);
1377 
1378 	if (!sp->role.invalid && !sp->role.direct)
1379 		unaccount_shadowed(kvm, sp->gfn);
1380 
1381 	if (sp->unsync)
1382 		kvm_unlink_unsync_page(kvm, sp);
1383 
1384 	if (!sp->root_count) {
1385 		sp->hash_link.list_prev->list_next = sp->hash_link.list_next;
1386 		sp->hash_link.list_next->list_prev = sp->hash_link.list_prev;
1387 		sp->hash_link.list_prev = 0;
1388 		sp->hash_link.list_next = 0;
1389 		kvm_mmu_free_page(kvm, sp);
1390 	} else {
1391 		sp->role.invalid = 1;
1392 		list_remove(&kvm->arch.active_mmu_pages, sp);
1393 		list_insert_head(&kvm->arch.active_mmu_pages, sp);
1394 		kvm_reload_remote_mmus(kvm);
1395 	}
1396 	kvm_mmu_reset_last_pte_updated(kvm);
1397 
1398 	return (ret);
1399 }
1400 
1401 /*
1402  * Change the number of mmu pages allocated to the VM.
1403  * Note: if kvm_nr_mmu_pages is too small, you will get a deadlock.
1404  */
1405 void
1406 kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
1407 {
1408 	int used_pages;
1409 
1410 	used_pages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages;
1411 	used_pages = MAX(0, used_pages);
1412 
1413 	/* for the time being, assume that address space will only grow */
1414 	/* larger.  The following code will be added later. */
1415 
1416 	/*
1417 	 * If we set the number of mmu pages to be smaller than the
1418 	 * number of active pages, we must free some mmu pages before we
1419 	 * change the value.
1420 	 */
1421 
1422 	if (used_pages > kvm_nr_mmu_pages) {
1423 		while (used_pages > kvm_nr_mmu_pages &&
1424 			!list_is_empty(&kvm->arch.active_mmu_pages)) {
1425 			struct kvm_mmu_page *page;
1426 
1427 			page = (struct kvm_mmu_page *)
1428 			    list_tail(&kvm->arch.active_mmu_pages);
1429 
1430 			/* page removed by kvm_mmu_zap_page */
1431 			used_pages -= kvm_mmu_zap_page(kvm, page);
1432 			used_pages--;
1433 		}
1434 		kvm_nr_mmu_pages = used_pages;
1435 		kvm->arch.n_free_mmu_pages = 0;
1436 	} else {
1437 		kvm->arch.n_free_mmu_pages +=
1438 		    kvm_nr_mmu_pages - kvm->arch.n_alloc_mmu_pages;
1439 	}
1440 
1441 	kvm->arch.n_alloc_mmu_pages = kvm_nr_mmu_pages;
1442 }
1443 
1444 static int
1445 kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
1446 {
1447 	unsigned index;
1448 	list_t *bucket;
1449 	struct kvm_mmu_page *sp, *nsp = NULL;
1450 	int r;
1451 
1452 	r = 0;
1453 	index = kvm_page_table_hashfn(gfn);
1454 	bucket = &kvm->arch.mmu_page_hash[index];
1455 
1456 	for (sp = list_head(bucket); sp; sp = nsp) {
1457 		/* preserve link to next node in case we free this one */
1458 		nsp = list_next(bucket, sp);
1459 
1460 		if (sp->gfn == gfn && !sp->role.direct) {
1461 			r = 1;
1462 			if (kvm_mmu_zap_page(kvm, sp))
1463 				nsp = list_head(bucket);
1464 		}
1465 	}
1466 	return (r);
1467 }
1468 
1469 static void
1470 page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
1471 {
1472 	int slot = memslot_id(kvm, gfn);
1473 	struct kvm_mmu_page *sp = page_header(kvm, kvm_va2pa(pte));
1474 
1475 	__set_bit(slot, sp->slot_bitmap);
1476 }
1477 
1478 static void
1479 mmu_convert_notrap(struct kvm_mmu_page *sp)
1480 {
1481 	int i;
1482 	uint64_t *pt = sp->spt;
1483 
1484 	if (shadow_trap_nonpresent_pte == shadow_notrap_nonpresent_pte)
1485 		return;
1486 
1487 	for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
1488 		if (pt[i] == shadow_notrap_nonpresent_pte)
1489 			__set_spte(&pt[i], shadow_trap_nonpresent_pte);
1490 	}
1491 }
1492 
1493 static int
1494 get_mtrr_type(struct mtrr_state_type *mtrr_state, uint64_t start, uint64_t end)
1495 {
1496 	int i;
1497 	uint64_t base, mask;
1498 	uint8_t prev_match, curr_match;
1499 	int num_var_ranges = KVM_NR_VAR_MTRR;
1500 
1501 	if (!mtrr_state->enabled)
1502 		return (0xFF);
1503 
1504 	/* Make end inclusive, instead of exclusive */
1505 	end--;
1506 
1507 	/* Look in fixed ranges. Just return the type as per start */
1508 	if (mtrr_state->have_fixed && (start < 0x100000)) {
1509 		int idx;
1510 
1511 		if (start < 0x80000) {
1512 			idx = 0;
1513 			idx += (start >> 16);
1514 			return (mtrr_state->fixed_ranges[idx]);
1515 		} else if (start < 0xC0000) {
1516 			idx = 1 * 8;
1517 			idx += ((start - 0x80000) >> 14);
1518 			return (mtrr_state->fixed_ranges[idx]);
1519 		} else if (start < 0x1000000) {
1520 			idx = 3 * 8;
1521 			idx += ((start - 0xC0000) >> 12);
1522 			return (mtrr_state->fixed_ranges[idx]);
1523 		}
1524 	}
1525 
1526 	/*
1527 	 * Look in variable ranges.
1528 	 * Look for multiple ranges matching this address and pick the type
1529 	 * as per MTRR precedence.
1530 	 */
1531 	if (!(mtrr_state->enabled & 2))
1532 		return (mtrr_state->def_type);
1533 
1534 	prev_match = 0xFF;
1535 	for (i = 0; i < num_var_ranges; ++i) {
1536 		unsigned short start_state, end_state;
1537 
1538 		if (!(mtrr_state->var_ranges[i].mask_lo & (1 << 11)))
1539 			continue;
1540 
1541 		base = (((uint64_t)mtrr_state->var_ranges[i].base_hi) << 32) +
1542 		    (mtrr_state->var_ranges[i].base_lo & PAGEMASK);
1543 		mask = (((uint64_t)mtrr_state->var_ranges[i].mask_hi) << 32) +
1544 		    (mtrr_state->var_ranges[i].mask_lo & PAGEMASK);
1545 
1546 		start_state = ((start & mask) == (base & mask));
1547 		end_state = ((end & mask) == (base & mask));
1548 		if (start_state != end_state)
1549 			return (0xFE);
1550 
1551 		if ((start & mask) != (base & mask))
1552 			continue;
1553 
1554 		curr_match = mtrr_state->var_ranges[i].base_lo & 0xff;
1555 		if (prev_match == 0xFF) {
1556 			prev_match = curr_match;
1557 			continue;
1558 		}
1559 
1560 		if (prev_match == MTRR_TYPE_UNCACHABLE ||
1561 		    curr_match == MTRR_TYPE_UNCACHABLE)
1562 			return (MTRR_TYPE_UNCACHABLE);
1563 
1564 		if ((prev_match == MTRR_TYPE_WRBACK &&
1565 		    curr_match == MTRR_TYPE_WRTHROUGH) ||
1566 		    (prev_match == MTRR_TYPE_WRTHROUGH &&
1567 		    curr_match == MTRR_TYPE_WRBACK)) {
1568 			prev_match = MTRR_TYPE_WRTHROUGH;
1569 			curr_match = MTRR_TYPE_WRTHROUGH;
1570 		}
1571 
1572 		if (prev_match != curr_match)
1573 			return (MTRR_TYPE_UNCACHABLE);
1574 	}
1575 
1576 	if (prev_match != 0xFF)
1577 		return (prev_match);
1578 
1579 	return (mtrr_state->def_type);
1580 }
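
/*
 * Editorial worked example for the fixed-range lookup above: start =
 * 0xA0000 falls in the second branch, so idx = 8 + ((0xA0000 - 0x80000)
 * >> 14) = 16, i.e. the fixed_ranges[] entry covering 0xA0000-0xA3FFF.
 */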
1581 
1582 uint8_t
1583 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
1584 {
1585 	uint8_t mtrr;
1586 
1587 	mtrr = get_mtrr_type(&vcpu->arch.mtrr_state,
1588 	    gfn << PAGESHIFT, (gfn << PAGESHIFT) + PAGESIZE);
1589 	if (mtrr == 0xfe || mtrr == 0xff)
1590 		mtrr = MTRR_TYPE_WRBACK;
1591 	return (mtrr);
1592 }
1593 
1594 static int
1595 kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1596 {
1597 	unsigned index;
1598 	list_t *bucket;
1599 	struct kvm_mmu_page *s;
1600 
1601 	index = kvm_page_table_hashfn(sp->gfn);
1602 	bucket = &vcpu->kvm->arch.mmu_page_hash[index];
1603 	/* don't unsync if pagetable is shadowed with multiple roles */
1604 	/* XXX - need protection here(?) */
1605 	for (s = list_head(bucket); s; s = list_next(bucket, s)) {
1606 		if (s->gfn != sp->gfn || s->role.direct)
1607 			continue;
1608 		if (s->role.word != sp->role.word)
1609 			return (1);
1610 	}
1611 	KVM_KSTAT_INC(vcpu->kvm, kvmks_mmu_unsync_page);
1612 	sp->unsync = 1;
1613 
1614 	kvm_mmu_mark_parents_unsync(vcpu, sp);
1615 
1616 	mmu_convert_notrap(sp);
1617 	return (0);
1618 }
1619 
1620 static int
1621 mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, int can_unsync)
1622 {
1623 	struct kvm_mmu_page *shadow;
1624 
1625 	shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
1626 	if (shadow) {
1627 		if (shadow->role.level != PT_PAGE_TABLE_LEVEL)
1628 			return (1);
1629 		if (shadow->unsync)
1630 			return (0);
1631 		if (can_unsync && oos_shadow)
1632 			return (kvm_unsync_page(vcpu, shadow));
1633 		return (1);
1634 	}
1635 	return (0);
1636 }
1637 
1638 static int
1639 set_spte(struct kvm_vcpu *vcpu, uint64_t *sptep, unsigned pte_access,
1640     int user_fault, int write_fault, int dirty, int level, gfn_t gfn,
1641     pfn_t pfn, int speculative, int can_unsync, int reset_host_protection)
1642 {
1643 	uint64_t spte;
1644 	int ret = 0;
1645 
1646 	/*
1647 	 * We don't set the accessed bit, since we sometimes want to see
1648 	 * whether the guest actually used the pte (in order to detect
1649 	 * demand paging).
1650 	 */
1651 	spte = shadow_base_present_pte | shadow_dirty_mask;
1652 	if (!speculative)
1653 		spte |= shadow_accessed_mask;
1654 	if (!dirty)
1655 		pte_access &= ~ACC_WRITE_MASK;
1656 	if (pte_access & ACC_EXEC_MASK)
1657 		spte |= shadow_x_mask;
1658 	else
1659 		spte |= shadow_nx_mask;
1660 	if (pte_access & ACC_USER_MASK)
1661 		spte |= shadow_user_mask;
1662 	if (level > PT_PAGE_TABLE_LEVEL)
1663 		spte |= PT_PAGE_SIZE_MASK;
1664 	if (tdp_enabled)
1665 		spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
1666 			kvm_is_mmio_pfn(pfn));
1667 
1668 	if (reset_host_protection)
1669 		spte |= SPTE_HOST_WRITEABLE;
1670 
1671 	spte |= (uint64_t)pfn << PAGESHIFT;
1672 
1673 	if ((pte_access & ACC_WRITE_MASK) ||
1674 	    (write_fault && !is_write_protection(vcpu) && !user_fault)) {
1675 
1676 		if (level > PT_PAGE_TABLE_LEVEL &&
1677 		    has_wrprotected_page(vcpu->kvm, gfn, level)) {
1678 			ret = 1;
1679 			spte = shadow_trap_nonpresent_pte;
1680 			goto set_pte;
1681 		}
1682 
1683 		spte |= PT_WRITABLE_MASK;
1684 
1685 		if (!tdp_enabled && !(pte_access & ACC_WRITE_MASK))
1686 			spte &= ~PT_USER_MASK;
1687 
1688 		/*
1689 		 * Optimization: for pte sync, if spte was writable the hash
1690 		 * lookup is unnecessary (and expensive). Write protection
1691 		 * is responsibility of mmu_get_page / kvm_sync_page.
1692 		 * Same reasoning can be applied to dirty page accounting.
1693 		 */
1694 		if (!can_unsync && is_writable_pte(*sptep))
1695 			goto set_pte;
1696 
1697 		if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
1698 			ret = 1;
1699 			pte_access &= ~ACC_WRITE_MASK;
1700 			if (is_writable_pte(spte))
1701 				spte &= ~PT_WRITABLE_MASK;
1702 		}
1703 	}
1704 
1705 	if (pte_access & ACC_WRITE_MASK)
1706 		mark_page_dirty(vcpu->kvm, gfn);
1707 
1708 set_pte:
1709 	__set_spte(sptep, spte);
1710 
1711 	return (ret);
1712 }
1713 
1714 static int
1715 kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
1716 		unsigned long data)
1717 {
1718 	uint64_t *spte;
1719 	int need_tlb_flush = 0;
1720 
1721 	while ((spte = rmap_next(kvm, rmapp, NULL))) {
1722 		if (!(*spte & PT_PRESENT_MASK)) {
1723 			cmn_err(CE_PANIC,
1724 				"kvm_unmap_rmapp: spte = %p, *spte = %lx\n",
1725 				spte, *spte);
1726 		}
1727 		rmap_remove(kvm, spte);
1728 		__set_spte(spte, shadow_trap_nonpresent_pte);
1729 		need_tlb_flush = 1;
1730 	}
1731 	return (need_tlb_flush);
1732 }
1733 
1734 #define	RMAP_RECYCLE_THRESHOLD	1000
1735 
1736 static void
1737 rmap_recycle(struct kvm_vcpu *vcpu, uint64_t *spte, gfn_t gfn)
1738 {
1739 	unsigned long *rmapp;
1740 	struct kvm_mmu_page *sp;
1741 
1742 	sp = page_header(vcpu->kvm, kvm_va2pa((caddr_t)spte));
1743 
1744 	gfn = unalias_gfn(vcpu->kvm, gfn);
1745 	rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
1746 
1747 	kvm_unmap_rmapp(vcpu->kvm, rmapp, 0);
1748 	kvm_flush_remote_tlbs(vcpu->kvm);
1749 }
1750 
1751 static void
1752 mmu_set_spte(struct kvm_vcpu *vcpu, uint64_t *sptep, unsigned pt_access,
1753     unsigned pte_access, int user_fault, int write_fault, int dirty,
1754     int *ptwrite, int level, gfn_t gfn, pfn_t pfn, int speculative,
1755     int reset_host_protection)
1756 {
1757 	int was_rmapped = 0;
1758 	int was_writable = is_writable_pte(*sptep);
1759 	int rmap_count;
1760 
1761 	if (is_rmap_spte(*sptep)) {
1762 		/*
1763 		 * If we overwrite a PTE page pointer with a 2MB PMD, unlink
1764 		 * the parent of the now unreachable PTE.
1765 		 */
1766 		if (level > PT_PAGE_TABLE_LEVEL &&
1767 		    !is_large_pte(*sptep)) {
1768 			struct kvm_mmu_page *child;
1769 			uint64_t pte = *sptep;
1770 
1771 			child = page_header(vcpu->kvm,
1772 			    pte & PT64_BASE_ADDR_MASK);
1773 			mmu_page_remove_parent_pte(child, sptep);
1774 			__set_spte(sptep, shadow_trap_nonpresent_pte);
1775 			kvm_flush_remote_tlbs(vcpu->kvm);
1776 		} else if (pfn != spte_to_pfn(*sptep)) {
1777 			rmap_remove(vcpu->kvm, sptep);
1778 			__set_spte(sptep, shadow_trap_nonpresent_pte);
1779 			kvm_flush_remote_tlbs(vcpu->kvm);
1780 		} else
1781 			was_rmapped = 1;
1782 	}
1783 
1784 	if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault,
1785 	    dirty, level, gfn, pfn, speculative, 1, reset_host_protection)) {
1786 		if (write_fault)
1787 			*ptwrite = 1;
1788 		kvm_x86_ops->tlb_flush(vcpu);
1789 	}
1790 
1791 	if (!was_rmapped && is_large_pte(*sptep))
1792 		KVM_KSTAT_INC(vcpu->kvm, kvmks_lpages);
1793 
1794 	page_header_update_slot(vcpu->kvm, sptep, gfn);
1795 	if (!was_rmapped) {
1796 		rmap_count = rmap_add(vcpu, sptep, gfn);
1797 		kvm_release_pfn_clean(pfn);
1798 		if (rmap_count > RMAP_RECYCLE_THRESHOLD)
1799 			rmap_recycle(vcpu, sptep, gfn);
1800 	} else {
1801 		if (was_writable)
1802 			kvm_release_pfn_dirty(pfn);
1803 		else
1804 			kvm_release_pfn_clean(pfn);
1805 	}
1806 	if (speculative) {
1807 		vcpu->arch.last_pte_updated = sptep;
1808 		vcpu->arch.last_pte_gfn = gfn;
1809 	}
1810 }
1811 
1812 static void
1813 nonpaging_new_cr3(struct kvm_vcpu *vcpu)
1814 {
1815 }
1816 
1817 static int
1818 __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
1819     int level, gfn_t gfn, pfn_t pfn)
1820 {
1821 	struct kvm_shadow_walk_iterator iterator;
1822 	struct kvm_mmu_page *sp;
1823 	int pt_write = 0;
1824 	gfn_t pseudo_gfn;
1825 
1826 	for_each_shadow_entry(vcpu, (uint64_t)gfn << PAGESHIFT, iterator) {
1827 		if (iterator.level == level) {
1828 			mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL,
1829 			    0, write, 1, &pt_write, level, gfn, pfn, 0, 1);
1830 			KVM_VCPU_KSTAT_INC(vcpu, kvmvs_pf_fixed);
1831 			break;
1832 		}
1833 
1834 		if (*iterator.sptep == shadow_trap_nonpresent_pte) {
1835 			pseudo_gfn = (iterator.addr &
1836 			    PT64_DIR_BASE_ADDR_MASK) >> PAGESHIFT;
1837 
1838 			sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr,
1839 			    iterator.level - 1, 1, ACC_ALL, iterator.sptep);
1840 
1841 			if (!sp) {
1842 				cmn_err(CE_WARN, "nonpaging_map: ENOMEM\n");
1843 				kvm_release_pfn_clean(pfn);
1844 				return (-ENOMEM);
1845 			}
1846 
1847 			__set_spte(iterator.sptep, kvm_va2pa((caddr_t)sp->spt) |
1848 			    PT_PRESENT_MASK | PT_WRITABLE_MASK |
1849 			    shadow_user_mask | shadow_x_mask);
1850 		}
1851 	}
1852 
1853 	return (pt_write);
1854 }
1855 
1856 static int
1857 nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
1858 {
1859 	int r;
1860 	int level;
1861 	pfn_t pfn;
1862 
1863 	level = mapping_level(vcpu, gfn);
1864 
1865 	/*
1866 	 * This path builds a PAE page table, so it can map 2MB pages at
1867 	 * most.  Therefore check whether the level is larger than that.
1868 	 */
1869 	if (level > PT_DIRECTORY_LEVEL)
1870 		level = PT_DIRECTORY_LEVEL;
1871 
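	/*
	 * Round gfn down to the start of the large-page frame: for a 2MB
	 * mapping (PT_DIRECTORY_LEVEL), KVM_PAGES_PER_HPAGE() is 512 4KB
	 * pages, so e.g. gfn 0x12345 becomes 0x12200.
	 */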
1872 	gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
1873 
1874 	smp_rmb();
1875 	pfn = gfn_to_pfn(vcpu->kvm, gfn);
1876 
1877 	/* mmio */
1878 	if (is_error_pfn(pfn)) {
1879 		kvm_release_pfn_clean(pfn);
1880 		return (1);
1881 	}
1882 
1883 	mutex_enter(&vcpu->kvm->mmu_lock);
1884 	kvm_mmu_free_some_pages(vcpu);
1885 	r = __direct_map(vcpu, v, write, level, gfn, pfn);
1886 	mutex_exit(&vcpu->kvm->mmu_lock);
1887 
1888 	return (r);
1889 
1890 out_unlock:
1891 	mutex_exit(&vcpu->kvm->mmu_lock);
1892 	kvm_release_pfn_clean(pfn);
1893 	return (0);
1894 }
1895 
1896 static void
1897 mmu_free_roots(struct kvm_vcpu *vcpu)
1898 {
1899 	int i;
1900 	struct kvm_mmu_page *sp;
1901 
1902 	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
1903 		return;
1904 
1905 	mutex_enter(&vcpu->kvm->mmu_lock);
1906 	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
1907 		hpa_t root = vcpu->arch.mmu.root_hpa;
1908 
1909 		sp = page_header(vcpu->kvm, root);
1910 		--sp->root_count;
1911 		if (!sp->root_count && sp->role.invalid)
1912 			kvm_mmu_zap_page(vcpu->kvm, sp);
1913 		vcpu->arch.mmu.root_hpa = INVALID_PAGE;
1914 		mutex_exit(&vcpu->kvm->mmu_lock);
1915 		return;
1916 	}
1917 
1918 	for (i = 0; i < 4; i++) {
1919 		hpa_t root = vcpu->arch.mmu.pae_root[i];
1920 
1921 		if (root) {
1922 			root &= PT64_BASE_ADDR_MASK;
1923 			sp = page_header(vcpu->kvm, root);
1924 			--sp->root_count;
1925 			if (!sp->root_count && sp->role.invalid)
1926 				kvm_mmu_zap_page(vcpu->kvm, sp);
1927 		}
1928 		vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
1929 	}
1930 	mutex_exit(&vcpu->kvm->mmu_lock);
1931 	vcpu->arch.mmu.root_hpa = INVALID_PAGE;
1932 }
1933 
1934 static int
1935 mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
1936 {
1937 	int ret = 0;
1938 
1939 	if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) {
1940 		set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
1941 		ret = 1;
1942 	}
1943 
1944 	return (ret);
1945 }
1946 
1947 static int
1948 mmu_alloc_roots(struct kvm_vcpu *vcpu)
1949 {
1950 	int i;
1951 	gfn_t root_gfn;
1952 	struct kvm_mmu_page *sp;
1953 	int direct = 0;
1954 	uint64_t pdptr;
1955 
1956 	root_gfn = vcpu->arch.cr3 >> PAGESHIFT;
1957 
1958 	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
1959 		hpa_t root = vcpu->arch.mmu.root_hpa;
1960 
1961 		ASSERT(!VALID_PAGE(root));
1962 		if (tdp_enabled)
1963 			direct = 1;
1964 		if (mmu_check_root(vcpu, root_gfn))
1965 			return (1);
1966 
1967 		mutex_enter(&vcpu->kvm->mmu_lock);
1968 		sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL,
1969 		    direct, ACC_ALL, NULL);
1970 		root = kvm_va2pa((caddr_t)sp->spt);
1971 
1972 		++sp->root_count;
1973 		mutex_exit(&vcpu->kvm->mmu_lock);
1974 		vcpu->arch.mmu.root_hpa = root;
1975 		return (0);
1976 	}
1977 	direct = !is_paging(vcpu);
1978 	if (tdp_enabled)
1979 		direct = 1;
1980 	for (i = 0; i < 4; ++i) {
1981 		hpa_t root = vcpu->arch.mmu.pae_root[i];
1982 
1983 		ASSERT(!VALID_PAGE(root));
1984 		if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
1985 			pdptr = kvm_pdptr_read(vcpu, i);
1986 
1987 			if (!is_present_gpte(pdptr)) {
1988 				vcpu->arch.mmu.pae_root[i] = 0;
1989 				continue;
1990 			}
1991 			root_gfn = pdptr >> PAGESHIFT;
1992 		} else if (vcpu->arch.mmu.root_level == 0)
1993 			root_gfn = 0;
1994 		if (mmu_check_root(vcpu, root_gfn))
1995 			return (1);
1996 
1997 		mutex_enter(&vcpu->kvm->mmu_lock);
1998 		sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
1999 			    PT32_ROOT_LEVEL, direct, ACC_ALL, NULL);
2000 		root = kvm_va2pa((caddr_t)sp->spt);
2001 		++sp->root_count;
2002 		mutex_exit(&vcpu->kvm->mmu_lock);
2003 
2004 		vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
2005 	}
2006 	vcpu->arch.mmu.root_hpa = kvm_va2pa((caddr_t)vcpu->arch.mmu.pae_root);
2007 
2008 	return (0);
2009 }
2010 
2011 static void
2012 mmu_sync_roots(struct kvm_vcpu *vcpu)
2013 {
2014 	int i;
2015 	struct kvm_mmu_page *sp;
2016 
2017 	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
2018 		return;
2019 
2020 	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
2021 		hpa_t root = vcpu->arch.mmu.root_hpa;
2022 		sp = page_header(vcpu->kvm, root);
2023 		mmu_sync_children(vcpu, sp);
2024 		return;
2025 	}
2026 
2027 	for (i = 0; i < 4; i++) {
2028 		hpa_t root = vcpu->arch.mmu.pae_root[i];
2029 
2030 		if (root && VALID_PAGE(root)) {
2031 			root &= PT64_BASE_ADDR_MASK;
2032 			sp = page_header(vcpu->kvm, root);
2033 			mmu_sync_children(vcpu, sp);
2034 		}
2035 	}
2036 }
2037 
2038 void
2039 kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
2040 {
2041 	mutex_enter(&vcpu->kvm->mmu_lock);
2042 	mmu_sync_roots(vcpu);
2043 	mutex_exit(&vcpu->kvm->mmu_lock);
2044 }
2045 
2046 static gpa_t
2047 nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
2048     uint32_t access, uint32_t *error)
2049 {
2050 	if (error)
2051 		*error = 0;
2052 	return (vaddr);
2053 }
2054 
2055 static int
2056 nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, uint32_t error_code)
2057 {
2058 	gfn_t gfn;
2059 	int r;
2060 
2061 	r = mmu_topup_memory_caches(vcpu);
2062 	if (r)
2063 		return (r);
2064 
2065 	ASSERT(vcpu);
2066 	ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
2067 
2068 	gfn = gva >> PAGESHIFT;
2069 
2070 	return (nonpaging_map(vcpu, gva & PAGEMASK,
2071 	    error_code & PFERR_WRITE_MASK, gfn));
2072 }
2073 
2074 static int
2075 tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, uint32_t error_code)
2076 {
2077 	pfn_t pfn;
2078 	int r;
2079 	int level;
2080 	gfn_t gfn = gpa >> PAGESHIFT;
2081 
2082 	ASSERT(vcpu);
2083 	ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
2084 
2085 	r = mmu_topup_memory_caches(vcpu);
2086 	if (r)
2087 		return (r);
2088 
2089 	level = mapping_level(vcpu, gfn);
2090 
2091 	gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
2092 
2093 	smp_rmb();
2094 	pfn = gfn_to_pfn(vcpu->kvm, gfn);
2095 	if (is_error_pfn(pfn)) {
2096 		kvm_release_pfn_clean(pfn);
2097 		return (1);
2098 	}
2099 	mutex_enter(&vcpu->kvm->mmu_lock);
2100 
2101 	kvm_mmu_free_some_pages(vcpu);
2102 	r = __direct_map(vcpu, gpa,
2103 	    error_code & PFERR_WRITE_MASK, level, gfn, pfn);
2104 	mutex_exit(&vcpu->kvm->mmu_lock);
2105 
2106 	return (r);
2107 
2108 out_unlock:
2109 	mutex_exit(&vcpu->kvm->mmu_lock);
2110 	kvm_release_pfn_clean(pfn);
2111 
2112 	return (0);
2113 }
2114 
2115 static void
2116 nonpaging_free(struct kvm_vcpu *vcpu)
2117 {
2118 	mmu_free_roots(vcpu);
2119 }
2120 
2121 static int
2122 nonpaging_init_context(struct kvm_vcpu *vcpu)
2123 {
2124 	struct kvm_mmu *context = &vcpu->arch.mmu;
2125 
2126 	context->new_cr3 = nonpaging_new_cr3;
2127 	context->page_fault = nonpaging_page_fault;
2128 	context->gva_to_gpa = nonpaging_gva_to_gpa;
2129 	context->free = nonpaging_free;
2130 	context->prefetch_page = nonpaging_prefetch_page;
2131 	context->sync_page = nonpaging_sync_page;
2132 	context->invlpg = nonpaging_invlpg;
2133 	context->root_level = 0;
2134 	context->shadow_root_level = PT32E_ROOT_LEVEL;
2135 	context->root_hpa = INVALID_PAGE;
2136 
2137 	return (0);
2138 }
2139 
2140 void
2141 kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
2142 {
2143 	kvm_x86_ops->tlb_flush(vcpu);
2144 }
2145 
2146 static void
2147 paging_new_cr3(struct kvm_vcpu *vcpu)
2148 {
2149 	cmn_err(CE_CONT, "!%s: cr3 %lx\n", __func__, vcpu->arch.cr3);
2150 	mmu_free_roots(vcpu);
2151 }
2152 
2153 static void
2154 inject_page_fault(struct kvm_vcpu *vcpu, uint64_t addr, uint32_t err_code)
2155 {
2156 	kvm_inject_page_fault(vcpu, addr, err_code);
2157 }
2158 
2159 static void
2160 paging_free(struct kvm_vcpu *vcpu)
2161 {
2162 	nonpaging_free(vcpu);
2163 }
2164 
2165 static int
2166 is_rsvd_bits_set(struct kvm_vcpu *vcpu, uint64_t gpte, int level)
2167 {
2168 	int bit7;
2169 
2170 	bit7 = (gpte >> 7) & 1;
2171 	return ((gpte & vcpu->arch.mmu.rsvd_bits_mask[bit7][level - 1]) != 0);
2172 }
2173 
2174 #define	PTTYPE 64
2175 #include "kvm_paging_tmpl.h"
2176 #undef PTTYPE
2177 
2178 #define	PTTYPE 32
2179 #include "kvm_paging_tmpl.h"
2180 #undef PTTYPE
2181 
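/*
 * rsvd_bits_mask[i][j] is the reserved-bit mask for a guest pte at level
 * j + 1, where i is the pte's bit 7 (the PSE/large-page bit), matching the
 * lookup in is_rsvd_bits_set() above.  For example, with maxphyaddr == 36,
 * a 4-level PTE gets rsvd_bits(36, 51), so a gpte with any of bits 51..36
 * set faults with the reserved-bit error code (plus bit 63 via
 * exb_bit_rsvd when NX is disabled).
 */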
2182 static void
2183 reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
2184 {
2185 	struct kvm_mmu *context = &vcpu->arch.mmu;
2186 	int maxphyaddr = cpuid_maxphyaddr(vcpu);
2187 	uint64_t exb_bit_rsvd = 0;
2188 
2189 	if (!is_nx(vcpu))
2190 		exb_bit_rsvd = rsvd_bits(63, 63);
2191 	switch (level) {
2192 	case PT32_ROOT_LEVEL:
2193 		/* no rsvd bits for 2 level 4K page table entries */
2194 		context->rsvd_bits_mask[0][1] = 0;
2195 		context->rsvd_bits_mask[0][0] = 0;
2196 		if (is_cpuid_PSE36())
2197 			/* 36-bit PSE 4MB page */
2198 			context->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
2199 		else
2200 			/* 32-bit PSE 4MB page */
2201 			context->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
2202 		context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
2203 		break;
2204 	case PT32E_ROOT_LEVEL:
2205 		context->rsvd_bits_mask[0][2] =
2206 			rsvd_bits(maxphyaddr, 63) |
2207 			rsvd_bits(7, 8) | rsvd_bits(1, 2);	/* PDPTE */
2208 		context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
2209 			rsvd_bits(maxphyaddr, 62);	/* PDE */
2210 		context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
2211 			rsvd_bits(maxphyaddr, 62); 	/* PTE */
2212 		context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
2213 			rsvd_bits(maxphyaddr, 62) |
2214 			rsvd_bits(13, 20);		/* large page */
2215 		context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
2216 		break;
2217 	case PT64_ROOT_LEVEL:
2218 		context->rsvd_bits_mask[0][3] = exb_bit_rsvd |
2219 			rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
2220 		context->rsvd_bits_mask[0][2] = exb_bit_rsvd |
2221 			rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
2222 		context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
2223 			rsvd_bits(maxphyaddr, 51);
2224 		context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
2225 			rsvd_bits(maxphyaddr, 51);
2226 		context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3];
2227 		context->rsvd_bits_mask[1][2] = exb_bit_rsvd |
2228 			rsvd_bits(maxphyaddr, 51) |
2229 			rsvd_bits(13, 29);
2230 		context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
2231 			rsvd_bits(maxphyaddr, 51) |
2232 			rsvd_bits(13, 20);		/* large page */
2233 		context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
2234 		break;
2235 	}
2236 }
2237 
2238 static int
2239 paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
2240 {
2241 	struct kvm_mmu *context = &vcpu->arch.mmu;
2242 
2243 	ASSERT(is_pae(vcpu));
2244 	context->new_cr3 = paging_new_cr3;
2245 	context->page_fault = paging64_page_fault;
2246 	context->gva_to_gpa = paging64_gva_to_gpa;
2247 	context->prefetch_page = paging64_prefetch_page;
2248 	context->sync_page = paging64_sync_page;
2249 	context->invlpg = paging64_invlpg;
2250 	context->free = paging_free;
2251 	context->root_level = level;
2252 	context->shadow_root_level = level;
2253 	context->root_hpa = INVALID_PAGE;
2254 
2255 	return (0);
2256 }
2257 
2258 static int
2259 paging64_init_context(struct kvm_vcpu *vcpu)
2260 {
2261 	reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL);
2262 	return (paging64_init_context_common(vcpu, PT64_ROOT_LEVEL));
2263 }
2264 
2265 static int
2266 paging32_init_context(struct kvm_vcpu *vcpu)
2267 {
2268 	struct kvm_mmu *context = &vcpu->arch.mmu;
2269 
2270 	reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL);
2271 	context->new_cr3 = paging_new_cr3;
2272 	context->page_fault = paging32_page_fault;
2273 	context->gva_to_gpa = paging32_gva_to_gpa;
2274 	context->free = paging_free;
2275 	context->prefetch_page = paging32_prefetch_page;
2276 	context->sync_page = paging32_sync_page;
2277 	context->invlpg = paging32_invlpg;
2278 	context->root_level = PT32_ROOT_LEVEL;
2279 	context->shadow_root_level = PT32E_ROOT_LEVEL;
2280 	context->root_hpa = INVALID_PAGE;
2281 
2282 	return (0);
2283 }
2284 
2285 static int
2286 paging32E_init_context(struct kvm_vcpu *vcpu)
2287 {
2288 	reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL);
2289 	return (paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL));
2290 }
2291 
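/*
 * With two-dimensional paging the shadow MMU only translates guest-physical
 * to host-physical, so faults are handled by tdp_page_fault() and the
 * shadow root level comes from the hardware via get_tdp_level().  The
 * gva_to_gpa callback must still follow whatever paging mode the guest is
 * using, hence the mode checks below.
 */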
2292 static int
2293 init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
2294 {
2295 	struct kvm_mmu *context = &vcpu->arch.mmu;
2296 
2297 	context->new_cr3 = nonpaging_new_cr3;
2298 	context->page_fault = tdp_page_fault;
2299 	context->free = nonpaging_free;
2300 	context->prefetch_page = nonpaging_prefetch_page;
2301 	context->sync_page = nonpaging_sync_page;
2302 	context->invlpg = nonpaging_invlpg;
2303 	context->shadow_root_level = kvm_x86_ops->get_tdp_level();
2304 	context->root_hpa = INVALID_PAGE;
2305 
2306 	if (!is_paging(vcpu)) {
2307 		context->gva_to_gpa = nonpaging_gva_to_gpa;
2308 		context->root_level = 0;
2309 	} else if (is_long_mode(vcpu)) {
2310 		reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL);
2311 		context->gva_to_gpa = paging64_gva_to_gpa;
2312 		context->root_level = PT64_ROOT_LEVEL;
2313 	} else if (is_pae(vcpu)) {
2314 		reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL);
2315 		context->gva_to_gpa = paging64_gva_to_gpa;
2316 		context->root_level = PT32E_ROOT_LEVEL;
2317 	} else {
2318 		reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL);
2319 		context->gva_to_gpa = paging32_gva_to_gpa;
2320 		context->root_level = PT32_ROOT_LEVEL;
2321 	}
2322 
2323 	return (0);
2324 }
2325 
2326 static int
2327 init_kvm_softmmu(struct kvm_vcpu *vcpu)
2328 {
2329 	int r;
2330 
2331 	ASSERT(vcpu);
2332 	ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
2333 
2334 	if (!is_paging(vcpu))
2335 		r = nonpaging_init_context(vcpu);
2336 	else if (is_long_mode(vcpu))
2337 		r = paging64_init_context(vcpu);
2338 	else if (is_pae(vcpu))
2339 		r = paging32E_init_context(vcpu);
2340 	else
2341 		r = paging32_init_context(vcpu);
2342 
2343 	vcpu->arch.mmu.base_role.glevels = vcpu->arch.mmu.root_level;
2344 	vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu);
2345 
2346 	return (r);
2347 }
2348 
2349 static int
2350 init_kvm_mmu(struct kvm_vcpu *vcpu)
2351 {
2352 	vcpu->arch.update_pte.pfn = -1; /* bad_pfn */
2353 
2354 	if (tdp_enabled)
2355 		return (init_kvm_tdp_mmu(vcpu));
2356 	else
2357 		return (init_kvm_softmmu(vcpu));
2358 
2359 	return (0);
2360 }
2361 
2362 static void
2363 destroy_kvm_mmu(struct kvm_vcpu *vcpu)
2364 {
2365 	ASSERT(vcpu);
2366 	if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) {
2367 		vcpu->arch.mmu.free(vcpu);
2368 		vcpu->arch.mmu.root_hpa = INVALID_PAGE;
2369 	}
2370 }
2371 
2372 int
2373 kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
2374 {
2375 	destroy_kvm_mmu(vcpu);
2376 	return (init_kvm_mmu(vcpu));
2377 }
2378 
2379 int
2380 kvm_mmu_load(struct kvm_vcpu *vcpu)
2381 {
2382 	int r;
2383 
2384 	r = mmu_topup_memory_caches(vcpu);
2385 	if (r)
2386 		goto out;
2387 	mutex_enter(&vcpu->kvm->mmu_lock);
2388 	kvm_mmu_free_some_pages(vcpu);
2389 	mutex_exit(&vcpu->kvm->mmu_lock);
2390 
2391 	r = mmu_alloc_roots(vcpu);
2392 
2393 	mutex_enter(&vcpu->kvm->mmu_lock);
2394 	mmu_sync_roots(vcpu);
2395 	mutex_exit(&vcpu->kvm->mmu_lock);
2396 	if (r)
2397 		goto out;
2398 
2399 	/*
2400 	 * set_cr3() should ensure TLB has been flushed
2401 	 */
2402 	kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
2403 out:
2404 	return (r);
2405 }
2406 
2407 
2408 void
2409 kvm_mmu_unload(struct kvm_vcpu *vcpu)
2410 {
2411 	mmu_free_roots(vcpu);
2412 }
2413 
2414 static void
2415 mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
2416     struct kvm_mmu_page *sp, uint64_t *spte)
2417 {
2418 	uint64_t pte;
2419 	struct kvm_mmu_page *child;
2420 
2421 	pte = *spte;
2422 
2423 	if (is_shadow_present_pte(pte)) {
2424 		if (is_last_spte(pte, sp->role.level)) {
2425 			rmap_remove(vcpu->kvm, spte);
2426 		} else {
2427 			child = page_header(vcpu->kvm,
2428 			    pte & PT64_BASE_ADDR_MASK);
2429 			mmu_page_remove_parent_pte(child, spte);
2430 		}
2431 	}
2432 	__set_spte(spte, shadow_trap_nonpresent_pte);
2433 
2434 	if (is_large_pte(pte))
2435 		KVM_KSTAT_DEC(vcpu->kvm, kvmks_lpages);
2436 }
2437 
2438 static void
2439 mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
2440     uint64_t *spte, const void *new)
2441 {
2442 	if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
2443 		KVM_KSTAT_INC(vcpu->kvm, kvmks_mmu_pte_zapped);
2444 		return;
2445 	}
2446 
2447 	KVM_KSTAT_INC(vcpu->kvm, kvmks_mmu_pte_updated);
2448 
2449 	if (sp->role.glevels == PT32_ROOT_LEVEL)
2450 		paging32_update_pte(vcpu, sp, spte, new);
2451 	else
2452 		paging64_update_pte(vcpu, sp, spte, new);
2453 }
2454 
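/*
 * A remote TLB flush is needed only when an spte change can revoke rights
 * that another vcpu may have cached: the old spte was present and either
 * the frame changed or a permission (present/write/user/exec) was taken
 * away.  Otherwise a local flush is enough.
 */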
2455 static int
2456 need_remote_flush(uint64_t old, uint64_t new)
2457 {
2458 	if (!is_shadow_present_pte(old))
2459 		return (0);
2460 	if (!is_shadow_present_pte(new))
2461 		return (1);
2462 	if ((old ^ new) & PT64_BASE_ADDR_MASK)
2463 		return (1);
2464 	old ^= PT64_NX_MASK;
2465 	new ^= PT64_NX_MASK;
2466 	return ((old & ~new & PT64_PERM_MASK) != 0);
2467 }
2468 
2469 static void
2470 mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, uint64_t old, uint64_t new)
2471 {
2472 	if (need_remote_flush(old, new))
2473 		kvm_flush_remote_tlbs(vcpu->kvm);
2474 	else
2475 		kvm_mmu_flush_tlb(vcpu);
2476 }
2477 
2478 static int
2479 last_updated_pte_accessed(struct kvm_vcpu *vcpu)
2480 {
2481 	uint64_t *spte = vcpu->arch.last_pte_updated;
2482 
2483 	return (!!(spte && (*spte & shadow_accessed_mask)));
2484 }
2485 
2486 static void
2487 mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2488     const uint8_t *new, int bytes)
2489 {
2490 	gfn_t gfn;
2491 	int r;
2492 	uint64_t gpte = 0;
2493 	pfn_t pfn;
2494 
2495 	if (bytes != 4 && bytes != 8)
2496 		return;
2497 
2498 	/*
2499 	 * Assume that the pte write is on a page table of the same type
2500 	 * as the current vcpu paging mode.  This is nearly always true
2501 	 * (might be false while changing modes).  Note it is verified later
2502 	 * by update_pte().
2503 	 */
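	/*
	 * For example, a PAE guest updating the high word of an 8-byte
	 * gpte writes 4 bytes at gpa 0x1004: the full gpte is read back
	 * from the 8-byte-aligned gpa 0x1000 and the 4 new bytes are
	 * copied in at offset gpa % 8 == 4, yielding the candidate gpte
	 * whose target pfn is looked up below.
	 */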
2504 	if (is_pae(vcpu)) {
2505 		/* Handle a 32-bit guest writing two halves of a 64-bit gpte */
2506 		if ((bytes == 4) && (gpa % 4 == 0)) {
2507 			r = kvm_read_guest(vcpu->kvm,
2508 			    gpa & ~(uint64_t)7, &gpte, 8);
2509 
2510 			if (r)
2511 				return;
2512 			memcpy((void *)((uintptr_t)&gpte + (gpa % 8)), new, 4);
2513 		} else if ((bytes == 8) && (gpa % 8 == 0)) {
2514 			memcpy((void *)&gpte, new, 8);
2515 		}
2516 	} else {
2517 		if ((bytes == 4) && (gpa % 4 == 0))
2518 			memcpy((void *)&gpte, new, 4);
2519 	}
2520 	if (!is_present_gpte(gpte))
2521 		return;
2522 
2523 	gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGESHIFT;
2524 
2525 	smp_rmb();
2526 	pfn = gfn_to_pfn(vcpu->kvm, gfn);
2527 
2528 	if (is_error_pfn(pfn)) {
2529 		kvm_release_pfn_clean(pfn);
2530 		return;
2531 	}
2532 	vcpu->arch.update_pte.gfn = gfn;
2533 	vcpu->arch.update_pte.pfn = pfn;
2534 }
2535 
2536 static void
2537 kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn)
2538 {
2539 	uint64_t *spte = vcpu->arch.last_pte_updated;
2540 
2541 	if (spte && vcpu->arch.last_pte_gfn == gfn && shadow_accessed_mask &&
2542 	    !(*spte & shadow_accessed_mask) && is_shadow_present_pte(*spte))
2543 		set_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte);
2544 }
2545 
2546 void
2547 kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2548     const uint8_t *new, int bytes, int guest_initiated)
2549 {
2550 	gfn_t gfn = gpa >> PAGESHIFT;
2551 	struct kvm_mmu_page *sp, *nsp = NULL;
2552 	list_t *bucket;
2553 	unsigned index;
2554 	uint64_t entry, gentry;
2555 	uint64_t *spte;
2556 	unsigned offset = offset_in_page(gpa);
2557 	unsigned pte_size;
2558 	unsigned page_offset;
2559 	unsigned misaligned;
2560 	unsigned quadrant;
2561 	int level;
2562 	int flooded = 0;
2563 	int npte;
2564 	int r;
2565 
2566 	mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes);
2567 	mutex_enter(&vcpu->kvm->mmu_lock);
2568 	kvm_mmu_access_page(vcpu, gfn);
2569 	kvm_mmu_free_some_pages(vcpu);
2570 	KVM_KSTAT_INC(vcpu->kvm, kvmks_mmu_pte_write);
2571 
2572 	if (guest_initiated) {
2573 		if (gfn == vcpu->arch.last_pt_write_gfn &&
2574 		    !last_updated_pte_accessed(vcpu)) {
2575 			++vcpu->arch.last_pt_write_count;
2576 			if (vcpu->arch.last_pt_write_count >= 3)
2577 				flooded = 1;
2578 		} else {
2579 			vcpu->arch.last_pt_write_gfn = gfn;
2580 			vcpu->arch.last_pt_write_count = 1;
2581 			vcpu->arch.last_pte_updated = NULL;
2582 		}
2583 	}
2584 	index = kvm_page_table_hashfn(gfn);
2585 	bucket = &vcpu->kvm->arch.mmu_page_hash[index];
2586 
2587 	for (sp = list_head(bucket); sp; sp = nsp) {
2588 		/*
2589 		 * Keep next list node pointer as we may free the current one
2590 		 */
2591 		nsp = list_next(bucket, sp);
2592 
2593 		if (sp->gfn != gfn || sp->role.direct || sp->role.invalid)
2594 			continue;
2595 
2596 		pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8;
2597 		misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
2598 		misaligned |= bytes < 4;
2599 		if (misaligned || flooded) {
2600 			/*
2601 			 * Misaligned accesses are too much trouble to fix
2602 			 * up; also, they usually indicate a page is not used
2603 			 * as a page table.
2604 			 *
2605 			 * If we're seeing too many writes to a page,
2606 			 * it may no longer be a page table, or we may be
2607 			 * forking, in which case it is better to unmap the
2608 			 * page.
2609 			 */
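			/*
			 * For example, a 4-byte write at offset 6 of an
			 * 8-byte pte spans two ptes: (6 ^ 9) & ~7 != 0, so
			 * it counts as misaligned, as does any write of
			 * fewer than 4 bytes.
			 */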
2610 			if (kvm_mmu_zap_page(vcpu->kvm, sp)) {
2611 				/*
2612 				 * kvm_mmu_zap_page() freed page(s) from
2613 				 * somewhere in the list, so start walking
2614 				 * again from the head.
2615 				 */
2616 				nsp = list_head(bucket);
2617 			}
2618 			KVM_KSTAT_INC(vcpu->kvm, kvmks_mmu_flooded);
2619 			continue;
2620 		}
2621 		page_offset = offset;
2622 		level = sp->role.level;
2623 		npte = 1;
2624 		if (sp->role.glevels == PT32_ROOT_LEVEL) {
2625 			page_offset <<= 1;	/* 32->64 */
2626 			/*
2627 			 * A 32-bit pde maps 4MB while the shadow pdes map
2628 			 * only 2MB.  So we need to double the offset again
2629 			 * and zap two pdes instead of one.
2630 			 */
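			/*
			 * For example, a write to guest pde index 64
			 * (offset 0x100) ends up with page_offset 0x400 and
			 * zaps the two shadow pdes at spt indexes 128 and
			 * 129.  The quadrant check below then skips shadow
			 * pages covering other portions of the guest table,
			 * since 1024 4-byte guest entries cannot fit in one
			 * page of 512 8-byte sptes.
			 */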
2631 			if (level == PT32_ROOT_LEVEL) {
2632 				page_offset &= ~7; /* kill rounding error */
2633 				page_offset <<= 1;
2634 				npte = 2;
2635 			}
2636 			quadrant = page_offset >> PAGESHIFT;
2637 			page_offset &= ~PAGEMASK;
2638 			if (quadrant != sp->role.quadrant)
2639 				continue;
2640 		}
2641 
2642 		spte = &sp->spt[page_offset / sizeof (*spte)];
2643 
2644 		if ((gpa & (pte_size - 1)) || (bytes < pte_size)) {
2645 			gentry = 0;
2646 			r = kvm_read_guest_atomic(vcpu->kvm,
2647 			    gpa & ~(uint64_t)(pte_size - 1), &gentry, pte_size);
2648 			new = (const void *)&gentry;
2649 			if (r < 0)
2650 				new = NULL;
2651 		}
2652 
2653 		while (npte--) {
2654 			entry = *spte;
2655 			mmu_pte_write_zap_pte(vcpu, sp, spte);
2656 			if (new)
2657 				mmu_pte_write_new_pte(vcpu, sp, spte, new);
2658 			mmu_pte_write_flush_tlb(vcpu, entry, *spte);
2659 			++spte;
2660 		}
2661 	}
2662 
2663 	KVM_TRACE1(mmu__audit__post__pte, struct kvm_vcpu *, vcpu);
2664 	mutex_exit(&vcpu->kvm->mmu_lock);
2665 
2666 	if (!is_error_pfn(vcpu->arch.update_pte.pfn)) {
2667 		kvm_release_pfn_clean(vcpu->arch.update_pte.pfn);
2668 		vcpu->arch.update_pte.pfn = bad_pfn;
2669 	}
2670 }
2671 
2672 int
2673 kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
2674 {
2675 	gpa_t gpa;
2676 	int r;
2677 
2678 	if (tdp_enabled)
2679 		return (0);
2680 
2681 	gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
2682 
2683 	mutex_enter(&vcpu->kvm->mmu_lock);
2684 	r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGESHIFT);
2685 	mutex_exit(&vcpu->kvm->mmu_lock);
2686 
2687 	return (r);
2688 }
2689 
2690 void
2691 __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
2692 {
2693 	while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES &&
2694 	    !list_is_empty(&vcpu->kvm->arch.active_mmu_pages)) {
2695 		struct kvm_mmu_page *sp;
2696 
2697 		sp = list_tail(&vcpu->kvm->arch.active_mmu_pages);
2698 		kvm_mmu_zap_page(vcpu->kvm, sp);
2699 		KVM_KSTAT_INC(vcpu->kvm, kvmks_mmu_recycled);
2700 	}
2701 }
2702 
2703 int
2704 kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, uint32_t error_code)
2705 {
2706 	int r;
2707 	enum emulation_result er;
2708 
2709 	if ((r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code)) < 0)
2710 		return (r);
2711 
2712 	if (r == 0)
2713 		return (1);
2714 
2715 	if ((r = mmu_topup_memory_caches(vcpu)) != 0)
2716 		return (r);
2717 
2718 	er = emulate_instruction(vcpu, cr2, error_code, 0);
2719 
2720 	switch (er) {
2721 	case EMULATE_DONE:
2722 		return (1);
2723 
2724 	case EMULATE_DO_MMIO:
2725 		KVM_VCPU_KSTAT_INC(vcpu, kvmvs_mmio_exits);
2726 		return (0);
2727 
2728 	case EMULATE_FAIL:
2729 		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
2730 		vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
2731 		vcpu->run->internal.ndata = 0;
2732 		return (0);
2733 	default:
2734 		panic("kvm_mmu_page_fault: unknown return "
2735 		    "from emulate_instruction: %x\n", er);
2736 	}
2737 
2738 	return (0);
2739 }
2740 
2741 void
2742 kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
2743 {
2744 	vcpu->arch.mmu.invlpg(vcpu, gva);
2745 	kvm_mmu_flush_tlb(vcpu);
2746 	KVM_VCPU_KSTAT_INC(vcpu, kvmvs_invlpg);
2747 }
2748 
2749 void
2750 kvm_enable_tdp(void)
2751 {
2752 	tdp_enabled = 1;
2753 }
2754 
2755 void
2756 kvm_disable_tdp(void)
2757 {
2758 	tdp_enabled = 0;
2759 }
2760 
2761 static int
2762 alloc_mmu_pages(struct kvm_vcpu *vcpu)
2763 {
2764 	page_t *page;
2765 	int i;
2766 
2767 	ASSERT(vcpu);
2768 
2769 	/*
2770 	 * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
2771 	 * Therefore we need to allocate shadow page tables in the first
2772 	 * 4GB of memory, which happens to fit the DMA32 zone.
2773 	 * XXX - for right now, ignore DMA32.  need to use ddi_dma_mem_alloc
2774 	 * to address this issue...
2775 	 * XXX - also, don't need to allocate a full page, we'll look
2776 	 * at htable_t later on solaris.
2777 	 */
2778 	page = alloc_page(KM_SLEEP, &vcpu->arch.mmu.alloc_pae_root);
2779 	if (!page)
2780 		return (-ENOMEM);
2781 
2782 	vcpu->arch.mmu.pae_root = (uint64_t *)page_address(page);
2783 
2784 	for (i = 0; i < 4; ++i)
2785 		vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
2786 
2787 	return (0);
2788 }
2789 
2790 int
2791 kvm_mmu_create(struct kvm_vcpu *vcpu)
2792 {
2793 	int i;
2794 
2795 	ASSERT(vcpu);
2796 	ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
2797 
2798 	/*
2799 	 * We'll initialize hash lists here
2800 	 */
2801 
2802 	for (i = 0; i < KVM_NUM_MMU_PAGES; i++)
2803 		list_create(&vcpu->kvm->arch.mmu_page_hash[i],
2804 		    sizeof (struct kvm_mmu_page),
2805 		    offsetof(struct kvm_mmu_page, hash_link));
2806 
2807 	return (alloc_mmu_pages(vcpu));
2808 }
2809 
2810 int
2811 kvm_mmu_setup(struct kvm_vcpu *vcpu)
2812 {
2813 	ASSERT(vcpu);
2814 
2815 	return (init_kvm_mmu(vcpu));
2816 }
2817 
2818 static void
2819 free_mmu_pages(struct kvm_vcpu *vcpu)
2820 {
2821 	kmem_free(vcpu->arch.mmu.alloc_pae_root, PAGESIZE);
2822 }
2823 
2824 static void
2825 mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc, struct kmem_cache *cp)
2826 {
2827 	while (mc->nobjs)
2828 		kmem_cache_free(cp, mc->objects[--mc->nobjs].kma_object);
2829 }
2830 
2831 static void
2832 mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc)
2833 {
2834 	while (mc->nobjs)
2835 		kmem_free(mc->objects[--mc->nobjs].kma_object, PAGESIZE);
2836 }
2837 
2838 static void
2839 mmu_free_memory_caches(struct kvm_vcpu *vcpu)
2840 {
2841 	mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache, pte_chain_cache);
2842 	mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, rmap_desc_cache);
2843 	mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
2844 	mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache,
2845 	    mmu_page_header_cache);
2846 }
2847 
2848 void
2849 kvm_mmu_destroy(struct kvm_vcpu *vcpu)
2850 {
2851 	ASSERT(vcpu);
2852 
2853 	destroy_kvm_mmu(vcpu);
2854 	free_mmu_pages(vcpu);
2855 	mmu_free_memory_caches(vcpu);
2856 }
2857 
2858 void
2859 kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
2860 {
2861 	struct kvm_mmu_page *sp;
2862 
2863 	for (sp = list_head(&kvm->arch.active_mmu_pages);
2864 	    sp != NULL; sp = list_next(&kvm->arch.active_mmu_pages, sp)) {
2865 		int i;
2866 		uint64_t *pt;
2867 
2868 		if (!test_bit(slot, sp->slot_bitmap))
2869 			continue;
2870 
2871 		pt = sp->spt;
2872 		for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
2873 			/* avoid RMW */
2874 			if (pt[i] & PT_WRITABLE_MASK)
2875 				pt[i] &= ~PT_WRITABLE_MASK;
2876 		}
2877 	}
2878 	kvm_flush_remote_tlbs(kvm);
2879 }
2880 
2881 void
2882 kvm_mmu_zap_all(struct kvm *kvm)
2883 {
2884 	struct kvm_mmu_page *sp, *nsp;
2885 
2886 	/*
2887 	 * In the following loop, sp may be freed and removed from the
2888 	 * list as a side effect of kvm_mmu_zap_page(), so we hold onto
2889 	 * the next element before zapping.
2890 	 */
2891 	mutex_enter(&kvm->mmu_lock);
2892 
2893 	for (sp = list_head(&kvm->arch.active_mmu_pages);
2894 	    sp != NULL; sp = nsp) {
2895 		nsp = list_next(&kvm->arch.active_mmu_pages, sp);
2896 
2897 		if (kvm_mmu_zap_page(kvm, sp))
2898 			nsp = list_head(&kvm->arch.active_mmu_pages);
2899 	}
2900 
2901 	mutex_exit(&kvm->mmu_lock);
2902 	kvm_flush_remote_tlbs(kvm);
2903 }
2904 
2905 void
2906 kvm_mmu_destroy_caches(void)
2907 {
2908 	if (pte_chain_cache)
2909 		kmem_cache_destroy(pte_chain_cache);
2910 	if (rmap_desc_cache)
2911 		kmem_cache_destroy(rmap_desc_cache);
2912 	if (mmu_page_header_cache)
2913 		kmem_cache_destroy(mmu_page_header_cache);
2914 }
2915 
2916 int
2917 kvm_mmu_module_init(void)
2918 {
2919 	if ((pte_chain_cache = kmem_cache_create("kvm_pte_chain",
2920 	    sizeof (struct kvm_pte_chain), 0, zero_constructor, NULL, NULL,
2921 	    (void *)sizeof (struct kvm_pte_chain), NULL, 0)) == NULL)
2922 		goto nomem;
2923 
2924 	if ((rmap_desc_cache = kmem_cache_create("kvm_rmap_desc",
2925 	    sizeof (struct kvm_rmap_desc), 0, zero_constructor, NULL, NULL,
2926 	    (void *)sizeof (struct kvm_rmap_desc), NULL, 0)) == NULL)
2927 		goto nomem;
2928 
2929 	if ((mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
2930 	    sizeof (struct kvm_mmu_page), 0, zero_constructor, NULL, NULL,
2931 	    (void *)sizeof (struct kvm_mmu_page), NULL, 0)) == NULL)
2932 		goto nomem;
2933 
2934 	return (0);
2935 
2936 nomem:
2937 	kvm_mmu_destroy_caches();
2938 	return (ENOMEM);
2939 }
2940 
2941 /*
2942  * Calculate the number of mmu pages needed for kvm.
2943  */
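/*
 * For example, a guest with 1GB of memory spans 262144 4KB pages; assuming
 * the usual KVM_PERMILLE_MMU_PAGES of 20, that allows roughly 5242 shadow
 * pages (about 2% of guest memory), bounded below by
 * KVM_MIN_ALLOC_MMU_PAGES.
 */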
2944 unsigned int
2945 kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
2946 {
2947 	int i;
2948 	unsigned int nr_mmu_pages;
2949 	unsigned int  nr_pages = 0;
2950 	struct kvm_memslots *slots;
2951 
2952 	mutex_enter(&kvm->memslots_lock);
2953 	slots = kvm->memslots;
2954 	for (i = 0; i < slots->nmemslots; i++)
2955 		nr_pages += slots->memslots[i].npages;
2956 	mutex_exit(&kvm->memslots_lock);
2957 	nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
2958 	nr_mmu_pages = MAX(nr_mmu_pages, (unsigned int)KVM_MIN_ALLOC_MMU_PAGES);
2959 
2960 	return (nr_mmu_pages);
2961 }
2962 
2963 int
2964 kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu,
2965     uint64_t addr, uint64_t sptes[4])
2966 {
2967 	struct kvm_shadow_walk_iterator iterator;
2968 	int nr_sptes = 0;
2969 
2970 	mutex_enter(&vcpu->kvm->mmu_lock);
2971 	for_each_shadow_entry(vcpu, addr, iterator) {
2972 		sptes[iterator.level - 1] = *iterator.sptep;
2973 		nr_sptes++;
2974 		if (!is_shadow_present_pte(*iterator.sptep))
2975 			break;
2976 	}
2977 	mutex_exit(&vcpu->kvm->mmu_lock);
2978 
2979 	return (nr_sptes);
2980 }
2981 
2982 void
2983 kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
2984 {
2985 	if (vcpu->kvm->arch.