xref: /illumos-kvm/kvm_x86.c (revision a40ccdeb)
1 /*
2  * GPL HEADER START
3  *
4  * This program is free software; you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation; either version 2 of the License.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11  * GNU General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public License
14  * along with this program; if not, write to the Free Software
15  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
16  *
17  * GPL HEADER END
18  *
19  * Copyright 2011 various Linux Kernel contributors.
20  * Copyright 2019 Joyent, Inc.
21  */
22 
23 #include <sys/types.h>
24 #include <sys/param.h>
25 #include <sys/mutex.h>
26 #include <sys/ksynch.h>
27 #include <sys/condvar_impl.h>
28 #include <sys/ddi.h>
29 #include <sys/regset.h>
30 #include <sys/fp.h>
31 #include <sys/tss.h>
32 #include <sys/x86_archext.h>
33 #include <sys/controlregs.h>
34 #include <sys/smt.h>
35 #include <sys/machsystm.h>
36 
37 #include <vm/page.h>
38 #include <vm/hat.h>
39 
40 #include <asm/cpu.h>
41 
42 #include "kvm_bitops.h"
43 #include "kvm_vmx.h"
44 #include "msr-index.h"
45 #include "kvm_msr.h"
46 #include "kvm_host.h"
47 #include "kvm_lapic.h"
48 #include "processor-flags.h"
49 #include "kvm_cpuid.h"
50 #include "hyperv.h"
51 #include "kvm_apicdef.h"
52 #include "kvm_iodev.h"
53 #include "kvm.h"
54 #include "kvm_x86impl.h"
55 #include "kvm_irq.h"
56 #include "kvm_ioapic.h"
57 #include "kvm_coalesced_mmio.h"
58 #include "kvm_i8254.h"
59 #include "kvm_mmu.h"
60 #include "kvm_cache_regs.h"
61 #include "kvm_para.h"
62 
63 extern caddr_t smmap64(caddr_t addr, size_t len, int prot, int flags,
64     int fd, off_t pos);
65 extern int memcntl(caddr_t, size_t, int, caddr_t, int, int);
66 extern int lwp_sigmask(int, uint_t, uint_t, uint_t, uint_t);
67 extern uint64_t cpu_freq_hz;
68 
69 static unsigned long empty_zero_page[PAGESIZE / sizeof (unsigned long)];
70 static uint64_t cpu_tsc_khz;
71 
72 /*
73  * Globals
74  */
75 struct kvm_x86_ops *kvm_x86_ops;
76 int ignore_msrs = 0;
77 
78 #define	MAX_IO_MSRS 256
79 #define	CR0_RESERVED_BITS						\
80 	(~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
81 	    | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
82 	    | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
83 #define	CR4_RESERVED_BITS						\
84 	(~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
85 	    | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE	\
86 	    | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR	\
87 	    | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
88 
89 #define	CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
90 
91 /*
92  * EFER defaults:
93  * - enable syscall by default because it is emulated by KVM
94  * - enable LME and LMA by default on 64-bit KVM
95  */
96 static uint64_t efer_reserved_bits = 0xfffffffffffffafeULL;
97 
98 static void update_cr8_intercept(struct kvm_vcpu *);
99 static struct kvm_shared_msrs_global shared_msrs_global;
100 static struct kvm_shared_msrs *shared_msrs;
101 
102 void
103 kvm_sigprocmask(int how, sigset_t *setp, sigset_t *osetp)
104 {
105 	k_sigset_t kset;
106 
107 	ASSERT(how == SIG_SETMASK);
108 	ASSERT(setp != NULL);
109 
110 	sigutok(setp, &kset);
111 
112 	if (osetp != NULL)
113 		sigktou(&curthread->t_hold, osetp);
114 
115 	(void) lwp_sigmask(SIG_SETMASK,
116 	    kset.__sigbits[0], kset.__sigbits[1], kset.__sigbits[2], 0);
117 }
118 
119 static void
120 kvm_on_user_return(struct kvm_vcpu *vcpu, struct kvm_user_return_notifier *urn)
121 {
122 	unsigned slot;
123 	struct kvm_shared_msrs *locals =
124 	    (struct kvm_shared_msrs *)(((caddr_t)urn) -
125 		offsetof(struct kvm_shared_msrs, urn));
126 	struct kvm_shared_msr_values *values;
127 
128 	for (slot = 0; slot < shared_msrs_global.nr; ++slot) {
129 		values = &locals->values[slot];
130 		if (values->host != values->curr) {
131 			wrmsrl(shared_msrs_global.msrs[slot], values->host);
132 			values->curr = values->host;
133 		}
134 	}
135 	locals->registered = 0;
136 	/*
137 	 * As the on-user-return handler indicates that this thread is either
138 	 * returning to userspace or going off-cpu, the host MSR values should
139 	 * be queried again prior to the next VM entry.
140 	 */
141 	locals->host_saved = 0;
142 	kvm_user_return_notifier_unregister(vcpu, urn);
143 }
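/*
 * The cast above is the usual container_of() idiom: the user-return
 * notifier is embedded in struct kvm_shared_msrs, so subtracting
 * offsetof(struct kvm_shared_msrs, urn) from the notifier pointer recovers
 * the per-CPU structure that owns it.
 */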
144 
145 void
146 kvm_define_shared_msr(unsigned slot, uint32_t msr)
147 {
148 	if (slot >= shared_msrs_global.nr)
149 		shared_msrs_global.nr = slot + 1;
150 	shared_msrs_global.msrs[slot] = msr;
151 
152 	/* ensure the shared_msrs_global update is visible before it is used */
153 	smp_wmb();
154 }
155 
156 void
157 kvm_set_shared_msr(struct kvm_vcpu *vcpu, unsigned slot, uint64_t value,
158     uint64_t mask)
159 {
160 	struct kvm_shared_msrs *smsr = &shared_msrs[CPU->cpu_id];
161 	const uint32_t msr = shared_msrs_global.msrs[slot];
162 	const uint_t slot_bit = 1 << slot;
163 
164 	ASSERT(slot < KVM_NR_SHARED_MSRS);
165 
166 	/* Preserve host MSR values prior to loading the guest data. */
167 	if ((smsr->host_saved & slot_bit) == 0) {
168 		uint64_t temp;
169 
170 		rdmsrl_safe(msr, (unsigned long long *)&temp);
171 		smsr->values[slot].host = temp;
172 		smsr->values[slot].curr = temp;
173 		smsr->host_saved |= slot_bit;
174 	}
175 
176 	if (((value ^ smsr->values[slot].curr) & mask) == 0)
177 		return;
178 
179 	smsr->values[slot].curr = value;
180 	wrmsrl(msr, value);
181 
182 	if (!smsr->registered) {
183 		smsr->urn.on_user_return = kvm_on_user_return;
184 		kvm_user_return_notifier_register(vcpu, &smsr->urn);
185 		smsr->registered = 1;
186 	}
187 }
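/*
 * Taken together, kvm_define_shared_msr(), kvm_set_shared_msr() and
 * kvm_on_user_return() let a per-CPU MSR carry a guest value across VM
 * entries and be lazily restored to the host value when the thread returns
 * to userspace or goes off-cpu.  A minimal usage sketch (illustrative only;
 * the slot number and the choice of MSR_LSTAR are assumptions, not taken
 * from this file):
 *
 *	kvm_define_shared_msr(0, MSR_LSTAR);			(module setup)
 *	...
 *	kvm_set_shared_msr(vcpu, 0, guest_lstar, -1ULL);	(before VM entry)
 *
 * The host value is written back by kvm_on_user_return() above.
 */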
188 
189 unsigned long
190 segment_base(uint16_t selector)
191 {
192 	struct descriptor_table gdt;
193 	struct desc_struct *d;
194 	unsigned long table_base;
195 	unsigned long v;
196 
197 	if (selector == 0)
198 		return (0);
199 
200 	kvm_get_gdt(&gdt);
201 	table_base = gdt.base;
202 
203 	if (selector & 4) {		/* from ldt */
204 		uint16_t ldt_selector = kvm_read_ldt();
205 
206 		table_base = segment_base(ldt_selector);
207 	}
208 	d = (struct desc_struct *)(table_base + (selector & ~7));
209 	v = get_desc_base(d);
210 
211 	if (d->c.b.s == 0 &&
212 	    (d->c.b.type == 2 || d->c.b.type == 9 || d->c.b.type == 11))
213 		v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
214 
215 	return (v);
216 }
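/*
 * segment_base() walks the GDT, or the LDT when bit 2 (TI) of the selector
 * is set, and returns the descriptor's base address.  System descriptors
 * (LDT and TSS, types 2, 9 and 11) are 16 bytes wide in long mode, which is
 * why the upper 32 bits of the base are taken from base3.
 */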
217 
218 uint64_t
219 kvm_get_apic_base(struct kvm_vcpu *vcpu)
220 {
221 	if (irqchip_in_kernel(vcpu->kvm))
222 		return (vcpu->arch.apic_base);
223 	else
224 		return (vcpu->arch.apic_base);
225 }
226 
227 void
228 kvm_set_apic_base(struct kvm_vcpu *vcpu, uint64_t data)
229 {
230 	/* TODO: reserve bits check */
231 	if (irqchip_in_kernel(vcpu->kvm))
232 		kvm_lapic_set_base(vcpu, data);
233 	else
234 		vcpu->arch.apic_base = data;
235 }
236 
237 #define	EXCPT_BENIGN		0
238 #define	EXCPT_CONTRIBUTORY	1
239 #define	EXCPT_PF		2
240 
241 static int
242 exception_class(int vector)
243 {
244 	switch (vector) {
245 	case PF_VECTOR:
246 		return (EXCPT_PF);
247 	case DE_VECTOR:
248 	case TS_VECTOR:
249 	case NP_VECTOR:
250 	case SS_VECTOR:
251 	case GP_VECTOR:
252 		return (EXCPT_CONTRIBUTORY);
253 	default:
254 		break;
255 	}
256 
257 	return (EXCPT_BENIGN);
258 }
259 
260 static void
261 kvm_multiple_exception(struct kvm_vcpu *vcpu,
262     unsigned nr, int has_error, uint32_t error_code)
263 {
264 	uint32_t prev_nr;
265 	int class1, class2;
266 
267 	if (!vcpu->arch.exception.pending) {
268 queue:
269 		vcpu->arch.exception.pending = 1;
270 		vcpu->arch.exception.has_error_code = has_error;
271 		vcpu->arch.exception.nr = nr;
272 		vcpu->arch.exception.error_code = error_code;
273 		return;
274 	}
275 
276 	/* an exception is already pending; decide how to combine the two */
277 	prev_nr = vcpu->arch.exception.nr;
278 	if (prev_nr == DF_VECTOR) {
279 		/* triple fault -> shutdown */
280 		set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
281 		return;
282 	}
283 	class1 = exception_class(prev_nr);
284 	class2 = exception_class(nr);
285 	if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY) ||
286 	    (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
287 		/* generate double fault per SDM Table 5-5 */
288 		vcpu->arch.exception.pending = 1;
289 		vcpu->arch.exception.has_error_code = 1;
290 		vcpu->arch.exception.nr = DF_VECTOR;
291 		vcpu->arch.exception.error_code = 0;
292 	} else {
293 		/*
294 		 * replace previous exception with a new one in a hope
295 		 * that instruction re-execution will regenerate lost
296 		 * exception
297 		 */
298 		goto queue;
299 	}
300 }
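/*
 * Example of the classification above: a #GP (contributory) raised while a
 * #PF is still pending escalates to #DF per the SDM's exception table, and
 * any further exception while #DF is pending sets KVM_REQ_TRIPLE_FAULT,
 * shutting the guest down.
 */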
301 
302 void
303 kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
304 {
305 	kvm_multiple_exception(vcpu, nr, 0, 0);
306 }
307 
308 void
309 kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
310     uint32_t error_code)
311 {
312 	KVM_VCPU_KSTAT_INC(vcpu, kvmvs_pf_guest);
313 	vcpu->arch.cr2 = addr;
314 	kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
315 }
316 
317 void
318 kvm_inject_nmi(struct kvm_vcpu *vcpu)
319 {
320 	vcpu->arch.nmi_pending = 1;
321 }
322 
323 void
324 kvm_inject_gp(struct kvm_vcpu *vcpu, uint32_t error_code)
325 {
326 	kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
327 }
328 
329 void
330 kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, uint32_t error_code)
331 {
332 	kvm_multiple_exception(vcpu, nr, 1, error_code);
333 }
334 
335 /*
336  * Checks if cpl <= required_cpl; if true, return true.  Otherwise queue
337  * a #GP and return false.
338  */
339 int
340 kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
341 {
342 	if (kvm_x86_ops->get_cpl(vcpu) <= required_cpl)
343 		return (1);
344 	kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
345 	return (0);
346 }
347 
348 /*
349  * Load the PAE pdptrs.  Return true if they are all valid.
350  */
351 int
352 load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
353 {
354 	gfn_t pdpt_gfn = cr3 >> PAGESHIFT;
355 	unsigned offset = ((cr3 & (PAGESIZE-1)) >> 5) << 2;
356 	int i;
357 	int ret;
358 	uint64_t pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
359 
360 	ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn,
361 	    pdpte, offset * sizeof (uint64_t), sizeof (pdpte));
362 
363 	if (ret < 0) {
364 		ret = 0;
365 		goto out;
366 	}
367 
368 	for (i = 0; i < ARRAY_SIZE(pdpte); i++) {
369 		if (is_present_gpte(pdpte[i]) &&
370 		    (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) {
371 			ret = 0;
372 			goto out;
373 		}
374 	}
375 	ret = 1;
376 
377 	memcpy(vcpu->arch.pdptrs, pdpte, sizeof (vcpu->arch.pdptrs));
378 	__set_bit(VCPU_EXREG_PDPTR, (unsigned long *)&vcpu->arch.regs_avail);
379 	__set_bit(VCPU_EXREG_PDPTR, (unsigned long *)&vcpu->arch.regs_dirty);
380 out:
381 	return (ret);
382 }
383 
384 static int
385 pdptrs_changed(struct kvm_vcpu *vcpu)
386 {
387 	uint64_t pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
388 
389 	if (is_long_mode(vcpu) || !is_pae(vcpu))
390 		return (0);
391 
392 	if (!test_bit(VCPU_EXREG_PDPTR,
393 	    (unsigned long *)&vcpu->arch.regs_avail)) {
394 		return (1);
395 	}
396 
397 	if (kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u,
398 	    pdpte, sizeof (pdpte)) < 0)
399 		return (1);
400 
401 	return (memcmp(pdpte, vcpu->arch.pdptrs, sizeof (pdpte)) != 0);
402 }
403 
404 void
405 kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
406 {
407 	cr0 |= X86_CR0_ET;
408 
409 	if (cr0 & 0xffffffff00000000UL) {
410 		kvm_inject_gp(vcpu, 0);
411 		return;
412 	}
413 
414 	cr0 &= ~CR0_RESERVED_BITS;
415 
416 	if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
417 		kvm_inject_gp(vcpu, 0);
418 		return;
419 	}
420 
421 	if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
422 		kvm_inject_gp(vcpu, 0);
423 		return;
424 	}
425 
426 	if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
427 
428 		if ((vcpu->arch.efer & EFER_LME)) {
429 			int cs_db, cs_l;
430 
431 			if (!is_pae(vcpu)) {
432 				kvm_inject_gp(vcpu, 0);
433 				return;
434 			}
435 
436 			kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
437 			if (cs_l) {
438 				kvm_inject_gp(vcpu, 0);
439 				return;
440 
441 			}
442 		} else
443 
444 		if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
445 			kvm_inject_gp(vcpu, 0);
446 			return;
447 		}
448 
449 	}
450 
451 	kvm_x86_ops->set_cr0(vcpu, cr0);
452 	vcpu->arch.cr0 = cr0;
453 	kvm_mmu_reset_context(vcpu);
454 }
455 
456 void
457 kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
458 {
459 	kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f));
460 }
461 
462 void
463 kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
464 {
465 	unsigned long old_cr4 = kvm_read_cr4(vcpu);
466 	unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;
467 
468 	if (cr4 & CR4_RESERVED_BITS) {
469 		kvm_inject_gp(vcpu, 0);
470 		return;
471 	}
472 
473 	if (is_long_mode(vcpu)) {
474 		if (!(cr4 & X86_CR4_PAE)) {
475 			kvm_inject_gp(vcpu, 0);
476 			return;
477 		}
478 	} else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) &&
479 	    ((cr4 ^ old_cr4) & pdptr_bits) &&
480 	    !load_pdptrs(vcpu, vcpu->arch.cr3)) {
481 		kvm_inject_gp(vcpu, 0);
482 		return;
483 	}
484 
485 	if (cr4 & X86_CR4_VMXE) {
486 		kvm_inject_gp(vcpu, 0);
487 		return;
488 	}
489 
490 	kvm_x86_ops->set_cr4(vcpu, cr4);
491 	vcpu->arch.cr4 = cr4;
492 	vcpu->arch.mmu.base_role.cr4_pge = (cr4 & X86_CR4_PGE) && !tdp_enabled;
493 	kvm_mmu_reset_context(vcpu);
494 }
495 
496 void
497 kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
498 {
499 	if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
500 		kvm_mmu_sync_roots(vcpu);
501 		kvm_mmu_flush_tlb(vcpu);
502 		return;
503 	}
504 
505 	if (is_long_mode(vcpu)) {
506 		if (cr3 & CR3_L_MODE_RESERVED_BITS) {
507 			kvm_inject_gp(vcpu, 0);
508 			return;
509 		}
510 	} else {
511 		if (is_pae(vcpu)) {
512 			if (cr3 & CR3_PAE_RESERVED_BITS) {
513 				kvm_inject_gp(vcpu, 0);
514 				return;
515 			}
516 			if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
517 				kvm_inject_gp(vcpu, 0);
518 				return;
519 			}
520 		}
521 		/*
522 		 * We don't check reserved bits in nonpae mode, because
523 		 * this isn't enforced, and VMware depends on this.
524 		 */
525 	}
526 
527 	/*
528 	 * Does the new cr3 value map to physical memory? (Note, we
529 	 * catch an invalid cr3 even in real-mode, because it would
530 	 * cause trouble later on when we turn on paging anyway.)
531 	 *
532 	 * A real CPU would silently accept an invalid cr3 and would
533 	 * attempt to use it - with largely undefined (and often hard
534 	 * to debug) behavior on the guest side.
535 	 */
536 	if ((!gfn_to_memslot(vcpu->kvm, cr3 >> PAGESHIFT)))
537 		kvm_inject_gp(vcpu, 0);
538 	else {
539 		vcpu->arch.cr3 = cr3;
540 		vcpu->arch.mmu.new_cr3(vcpu);
541 	}
542 }
543 
544 void
545 kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
546 {
547 	if (cr8 & CR8_RESERVED_BITS) {
548 		kvm_inject_gp(vcpu, 0);
549 		return;
550 	}
551 
552 	if (irqchip_in_kernel(vcpu->kvm))
553 		kvm_lapic_set_tpr(vcpu, cr8);
554 	else
555 		vcpu->arch.cr8 = cr8;
556 }
557 
558 unsigned long
559 kvm_get_cr8(struct kvm_vcpu *vcpu)
560 {
561 	if (irqchip_in_kernel(vcpu->kvm)) {
562 		return (kvm_lapic_get_cr8(vcpu));
563 	} else {
564 		return (vcpu->arch.cr8);
565 	}
566 }
567 
568 /*
569  * List of msr numbers which we expose to userspace through KVM_GET_MSRS
570  * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
571  *
572  * This list is modified at module load time to reflect the
573  * capabilities of the host cpu. This capability test skips MSRs that are
574  * kvm-specific; those are placed at the beginning of the list.
575  */
576 
577 
578 #define	KVM_SAVE_MSRS_BEGIN	5
579 static uint32_t msrs_to_save[] = {
580 	MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
581 	HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
582 	HV_X64_MSR_APIC_ASSIST_PAGE,
583 	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
584 	MSR_K6_STAR,
585 	MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
586 	MSR_IA32_TSC, MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
587 };
588 
589 static unsigned num_msrs_to_save;
590 
591 static uint32_t emulated_msrs[] = {
592 	MSR_IA32_MISC_ENABLE,
593 };
594 
595 static int
596 set_efer(struct kvm_vcpu *vcpu, uint64_t efer)
597 {
598 	if (efer & efer_reserved_bits)
599 		return (1);
600 
601 	if (is_paging(vcpu) &&
602 	    (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME)) {
603 		return (1);
604 	}
605 
606 	if (efer & EFER_FFXSR) {
607 		struct kvm_cpuid_entry2 *feat;
608 
609 		feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
610 		if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT)))
611 			return (1);
612 	}
613 
614 	if (efer & EFER_SVME) {
615 		struct kvm_cpuid_entry2 *feat;
616 
617 		feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
618 		if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM)))
619 			return (1);
620 	}
621 
622 	efer &= ~EFER_LMA;
623 	efer |= vcpu->arch.efer & EFER_LMA;
624 
625 	kvm_x86_ops->set_efer(vcpu, efer);
626 
627 	vcpu->arch.efer = efer;
628 
629 	vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
630 	kvm_mmu_reset_context(vcpu);
631 
632 	return (0);
633 }
634 
635 void
636 kvm_enable_efer_bits(uint64_t mask)
637 {
638 	efer_reserved_bits &= ~mask;
639 }
640 
641 /*
642  * Writes msr value into the appropriate "register".
643  * Returns 0 on success, non-0 otherwise.
644  * Assumes vcpu_load() was already called.
645  */
646 int
647 kvm_set_msr(struct kvm_vcpu *vcpu, uint32_t msr_index, uint64_t data)
648 {
649 	return (kvm_x86_ops->set_msr(vcpu, msr_index, data));
650 }
651 
652 /*
653  * Adapt set_msr() to msr_io()'s calling convention
654  */
655 static int
656 do_set_msr(struct kvm_vcpu *vcpu, unsigned index, uint64_t *data)
657 {
658 	return (kvm_set_msr(vcpu, index, *data));
659 }
660 
661 static void
662 kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
663 {
664 	uint32_t version;
665 	struct pvclock_wall_clock wc;
666 	timespec_t ts;
667 
668 	if (!wall_clock)
669 		return;
670 
671 	if (kvm_read_guest(kvm, wall_clock, &version, sizeof (version)) != 0)
672 		return;
673 
674 	if (version & 1)
675 		version++;	/* first time write, random junk */
676 
677 	version++;
678 
679 	kvm_write_guest(kvm, wall_clock, &version, sizeof (version));
680 
681 	/* Use recorded time at VM creation */
682 	wc.sec = kvm->arch.boot_wallclock.tv_sec;
683 	wc.nsec = kvm->arch.boot_wallclock.tv_nsec;
684 	wc.version = version;
685 
686 	kvm_write_guest(kvm, wall_clock, &wc, sizeof (wc));
687 
688 	version++;
689 	kvm_write_guest(kvm, wall_clock, &version, sizeof (version));
690 }
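/*
 * The wall-clock version field follows the pvclock convention: odd while an
 * update is in flight, even once the contents are stable.  A guest reader
 * is expected to loop roughly as follows (a sketch, not code from this
 * file):
 *
 *	do {
 *		v = wc->version;
 *		(read barrier)
 *		sec = wc->sec;
 *		nsec = wc->nsec;
 *		(read barrier)
 *	} while ((v & 1) || v != wc->version);
 */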
691 
692 static uint32_t
693 div_frac(uint32_t dividend, uint32_t divisor)
694 {
695 	uint32_t quotient, remainder;
696 
697 	/*
698 	 * Don't try to replace with do_div(), this one calculates
699 	 * "(dividend << 32) / divisor"
700 	 */
701 	__asm__("divl %4"
702 		: "=a" (quotient), "=d" (remainder)
703 		: "0" (0), "1" (dividend), "r" (divisor));
704 
705 	return (quotient);
706 }
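/*
 * For example, div_frac(1, 3) computes (1 * 2^32) / 3 = 0x55555555, i.e.
 * the fraction 1/3 expressed as a 32-bit fixed-point multiplier.
 */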
707 
708 static void
709 kvm_write_guest_time(struct kvm_vcpu *v)
710 {
711 	struct kvm_vcpu_arch *vcpu = &v->arch;
712 	page_t *page;
713 	struct pvclock_vcpu_time_info *pvclock;
714 	hrtime_t hrt;
715 	uint64_t tsc;
716 	uint32_t scale, version;
717 	uint8_t shift;
718 
719 	if (vcpu->time_addr == 0)
720 		return;
721 
722 	page = gfn_to_page(v->kvm, vcpu->time_addr >> PAGESHIFT);
723 	if (page == bad_page) {
724 		vcpu->time_addr = 0;
725 		return;
726 	}
727 	pvclock = (void *)((uintptr_t)page_address(page) +
728 	    offset_in_page(vcpu->time_addr));
729 	version = pvclock->version;
730 
731 	/*
732 	 * A note from Linux upstream about the role of the 'version' field in
733 	 * the pvclock_vcpu_time_info structure:
734 	 *
735 	 * This VCPU is paused, but it's legal for a guest to read another
736 	 * VCPU's kvmclock, so we really have to follow the specification where
737 	 * it says that version is odd if data is being modified, and even
738 	 * after it is consistent.
739 	 */
740 	if (version & 1) {
741 		/* uninitialized state with update bit set */
742 		version += 2;
743 	} else {
744 		/* indicate update in progress */
745 		version++;
746 	}
747 	pvclock->version = version;
748 
749 	membar_producer();
750 
751 	hrt = tsc_gethrtime_params(&tsc, &scale, &shift);
752 	pvclock->tsc_timestamp = tsc + vcpu->tsc_offset;
753 	pvclock->system_time = hrt - v->kvm->arch.boot_hrtime;
754 	pvclock->tsc_to_system_mul = scale;
755 	pvclock->tsc_shift = shift;
756 	pvclock->flags = PVCLOCK_TSC_STABLE_BIT;
757 
758 	membar_producer();
759 
760 	/* indicate update finished */
761 	pvclock->version = version + 1;
762 	vcpu->time_update = hrt;
763 
764 	kvm_release_page_dirty(page);
765 	mark_page_dirty(v->kvm, vcpu->time_addr >> PAGESHIFT);
766 }
767 
768 /*
769  * In the upstream Linux KVM, routine updates to pvclock data are throttled to
770  * a 100ms interval.  We use that value as well.
771  */
772 #define	KVMCLOCK_UPDATE_INTERVAL	(100000000U) /* 100ms in ns */
773 
774 static int
775 kvm_request_guest_time_update(struct kvm_vcpu *v, boolean_t force)
776 {
777 	struct kvm_vcpu_arch *vcpu = &v->arch;
778 
779 	if (vcpu->time_addr == 0)
780 		return (0);
781 
782 	/*
783 	 * If this is not a forced or first update request, check to see if a
784 	 * reasonable (and somewhat arbitrary) amount of time has passed. If
785 	 * the last update was recent, skip the pvclock update request to keep
786 	 * the write rate down.
787 	 */
788 	if (!force || vcpu->time_update != 0) {
789 		hrtime_t hrt;
790 
791 		hrt = gethrtime();
792 		if ((hrt - vcpu->time_update) < KVMCLOCK_UPDATE_INTERVAL)
793 			return (0);
794 	}
795 
796 	set_bit(KVM_REQ_KVMCLOCK_UPDATE, &v->requests);
797 
798 	return (1);
799 }
800 
801 static int
802 msr_mtrr_valid(unsigned msr)
803 {
804 	switch (msr) {
805 	case 0x200 ... 0x200 + 2 * KVM_NR_VAR_MTRR - 1:
806 	case MSR_MTRRfix64K_00000:
807 	case MSR_MTRRfix16K_80000:
808 	case MSR_MTRRfix16K_A0000:
809 	case MSR_MTRRfix4K_C0000:
810 	case MSR_MTRRfix4K_C8000:
811 	case MSR_MTRRfix4K_D0000:
812 	case MSR_MTRRfix4K_D8000:
813 	case MSR_MTRRfix4K_E0000:
814 	case MSR_MTRRfix4K_E8000:
815 	case MSR_MTRRfix4K_F0000:
816 	case MSR_MTRRfix4K_F8000:
817 	case MSR_MTRRdefType:
818 	case MSR_IA32_CR_PAT:
819 		return (1);
820 	case 0x2f8:
821 		return (1);
822 	}
823 
824 	return (0);
825 }
826 
827 static int
828 valid_pat_type(unsigned t)
829 {
830 	return (t < 8 && (1 << t) & 0xf3); /* 0, 1, 4, 5, 6, 7 */
831 }
832 
833 static int
834 valid_mtrr_type(unsigned t)
835 {
836 	return (t < 8 && (1 << t) & 0x73); /* 0, 1, 4, 5, 6 */
837 }
838 
839 static int
840 mtrr_valid(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t data)
841 {
842 	int i;
843 
844 	if (!msr_mtrr_valid(msr))
845 		return (0);
846 
847 	if (msr == MSR_IA32_CR_PAT) {
848 		for (i = 0; i < 8; i++)
849 			if (!valid_pat_type((data >> (i * 8)) & 0xff))
850 				return (0);
851 		return (1);
852 	} else if (msr == MSR_MTRRdefType) {
853 		if (data & ~0xcff)
854 			return (0);
855 		return (valid_mtrr_type(data & 0xff));
856 	} else if (msr >= MSR_MTRRfix64K_00000 && msr <= MSR_MTRRfix4K_F8000) {
857 		for (i = 0; i < 8; i++)
858 			if (!valid_mtrr_type((data >> (i * 8)) & 0xff))
859 				return (0);
860 		return (1);
861 	}
862 
863 	/* variable MTRRs */
864 	return (valid_mtrr_type(data & 0xff));
865 }
866 
867 static int
868 set_msr_mtrr(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t data)
869 {
870 	struct mtrr_state_type *state = &vcpu->arch.mtrr_state;
871 
872 	uint64_t *p = (uint64_t *)&state->fixed_ranges;
873 
874 	if (!mtrr_valid(vcpu, msr, data))
875 		return (1);
876 
877 	if (msr == MSR_MTRRdefType) {
878 		state->def_type = data;
879 		state->enabled = (data & 0xc00) >> 10;
880 	} else if (msr == MSR_MTRRfix64K_00000)
881 		p[0] = data;
882 	else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
883 		p[1 + msr - MSR_MTRRfix16K_80000] = data;
884 	else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
885 		p[3 + msr - MSR_MTRRfix4K_C0000] = data;
886 	else if (msr == MSR_IA32_CR_PAT)
887 		vcpu->arch.pat = data;
888 	else {	/* Variable MTRRs */
889 		int idx, is_mtrr_mask;
890 		uint64_t *pt;
891 
892 		idx = (msr - 0x200) / 2;
893 		is_mtrr_mask = msr - 0x200 - 2 * idx;
894 
895 		if (!is_mtrr_mask) {
896 			pt = (uint64_t *)&state->var_ranges[idx].base_lo;
897 		} else {
898 			pt = (uint64_t *)&state->var_ranges[idx].mask_lo;
899 		}
900 
901 		*pt = data;
902 	}
903 
904 	kvm_mmu_reset_context(vcpu);
905 
906 	return (0);
907 }
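/*
 * Variable-range MTRRs start at MSR 0x200 and come in (PHYSBASEn, PHYSMASKn)
 * pairs, so "(msr - 0x200) / 2" above is the range index and the low bit of
 * (msr - 0x200) selects the mask register rather than the base register.
 */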
908 
909 static int
910 set_msr_mce(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t data)
911 {
912 	uint64_t mcg_cap = vcpu->arch.mcg_cap;
913 	unsigned bank_num = mcg_cap & 0xff;
914 
915 	switch (msr) {
916 	case MSR_IA32_MCG_STATUS:
917 		vcpu->arch.mcg_status = data;
918 		break;
919 	case MSR_IA32_MCG_CTL:
920 		if (!(mcg_cap & MCG_CTL_P))
921 			return (1);
922 		if (data != 0 && data != ~(uint64_t)0)
923 			return (-1);
924 		vcpu->arch.mcg_ctl = data;
925 		break;
926 	default:
927 		if (msr >= MSR_IA32_MC0_CTL &&
928 		    msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
929 			uint32_t offset = msr - MSR_IA32_MC0_CTL;
930 			/*
931 			 * Only 0 or all 1s can be written to IA32_MCi_CTL.
932 			 * Some Linux kernels, however, clear bit 10 in bank 4
933 			 * to work around a BIOS/GART TBL issue on AMD K8s;
934 			 * ignore this to avoid an uncaught #GP in the guest.
935 			 */
936 			if ((offset & 0x3) == 0 &&
937 			    data != 0 && (data | (1 << 10)) != ~(uint64_t)0)
938 				return (-1);
939 			vcpu->arch.mce_banks[offset] = data;
940 			break;
941 		}
942 		return (1);
943 	}
944 	return (0);
945 }
946 
947 static int
948 xen_hvm_config(struct kvm_vcpu *vcpu, uint64_t data)
949 {
950 	struct kvm *kvm = vcpu->kvm;
951 	int lm = is_long_mode(vcpu);
952 	uint8_t *blob_addr = lm ?
953 	    (uint8_t *)(long)kvm->arch.xen_hvm_config.blob_addr_64 :
954 	    (uint8_t *)(long)kvm->arch.xen_hvm_config.blob_addr_32;
955 	uint8_t blob_size = lm ?
956 	    kvm->arch.xen_hvm_config.blob_size_64 :
957 	    kvm->arch.xen_hvm_config.blob_size_32;
958 	uint32_t page_num = data & ~PAGEMASK;
959 	uint64_t page_addr = data & PAGEMASK;
960 	uint8_t *page;
961 	int r;
962 
963 	r = E2BIG;
964 	if (page_num >= blob_size)
965 		goto out;
966 	r = ENOMEM;
967 	page = kmem_alloc(PAGESIZE, KM_SLEEP);
968 	r = EFAULT;
969 	if (copyin(blob_addr + (page_num * PAGESIZE), page, PAGESIZE))
970 		goto out_free;
971 	if (kvm_write_guest(kvm, page_addr, page, PAGESIZE))
972 		goto out_free;
973 	r = 0;
974 out_free:
975 	kmem_free(page, PAGESIZE);
976 out:
977 	return (r);
978 }
979 
980 
981 static int
982 kvm_hv_hypercall_enabled(struct kvm *kvm)
983 {
984 	return (kvm->arch.hv_hypercall & HV_X64_MSR_HYPERCALL_ENABLE);
985 }
986 
987 static int
988 kvm_hv_msr_partition_wide(uint32_t msr)
989 {
990 	int r = 0;
991 	switch (msr) {
992 	case HV_X64_MSR_GUEST_OS_ID:
993 	case HV_X64_MSR_HYPERCALL:
994 		r = 1;
995 		break;
996 	}
997 
998 	return (r);
999 }
1000 
1001 static int
1002 set_msr_hyperv_pw(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t data)
1003 {
1004 	struct kvm *kvm = vcpu->kvm;
1005 
1006 	switch (msr) {
1007 	case HV_X64_MSR_GUEST_OS_ID:
1008 		kvm->arch.hv_guest_os_id = data;
1009 		/* setting guest os id to zero disables hypercall page */
1010 		if (!kvm->arch.hv_guest_os_id)
1011 			kvm->arch.hv_hypercall &= ~HV_X64_MSR_HYPERCALL_ENABLE;
1012 		break;
1013 	case HV_X64_MSR_HYPERCALL: {
1014 		uint64_t gfn;
1015 		unsigned long addr;
1016 		uint8_t instructions[4];
1017 
1018 		/* if guest os id is not set hypercall should remain disabled */
1019 		if (!kvm->arch.hv_guest_os_id)
1020 			break;
1021 		if (!(data & HV_X64_MSR_HYPERCALL_ENABLE)) {
1022 			kvm->arch.hv_hypercall = data;
1023 			break;
1024 		}
1025 		gfn = data >> HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT;
1026 		addr = gfn_to_hva(kvm, gfn);
1027 		if (kvm_is_error_hva(addr))
1028 			return (1);
1029 		kvm_x86_ops->patch_hypercall(vcpu, instructions);
1030 		((unsigned char *)instructions)[3] = 0xc3; /* ret */
1031 		if (copyout(instructions, (caddr_t)addr, 4))
1032 			return (1);
1033 		kvm->arch.hv_hypercall = data;
1034 		break;
1035 	}
1036 	default:
1037 		cmn_err(CE_WARN, "HYPER-V unimplemented wrmsr: 0x%x "
1038 		    "data 0x%lx\n", msr, data);
1039 		return (1);
1040 	}
1041 
1042 	return (0);
1043 }
1044 
1045 static int
1046 clear_user(void *addr, unsigned long size)
1047 {
1048 	caddr_t ka;
1049 	int rval = 0;
1050 
1051 	ka = kmem_zalloc(size, KM_SLEEP);
1052 	rval = copyout(ka, addr, size);
1053 	kmem_free(ka, size);
1054 
1055 	return (rval);
1056 }
1057 
1058 static int
1059 set_msr_hyperv(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t data)
1060 {
1061 	switch (msr) {
1062 	case HV_X64_MSR_APIC_ASSIST_PAGE: {
1063 		unsigned long addr;
1064 
1065 		if (!(data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE)) {
1066 			vcpu->arch.hv_vapic = data;
1067 			break;
1068 		}
1069 
1070 		addr = gfn_to_hva(vcpu->kvm,
1071 		    data >> HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT);
1072 
1073 		if (kvm_is_error_hva(addr))
1074 			return (1);
1075 
1076 		if (clear_user((void *)addr, PAGESIZE))
1077 			return (1);
1078 
1079 		vcpu->arch.hv_vapic = data;
1080 		break;
1081 	}
1082 
1083 	case HV_X64_MSR_EOI:
1084 		return (kvm_hv_vapic_msr_write(vcpu, APIC_EOI, data));
1085 	case HV_X64_MSR_ICR:
1086 		return (kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data));
1087 	case HV_X64_MSR_TPR:
1088 		return (kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data));
1089 
1090 	default:
1091 		cmn_err(CE_WARN, "HYPER-V unimplemented wrmsr: 0x%x "
1092 		    "data 0x%lx\n", msr, data);
1093 		return (1);
1094 	}
1095 
1096 	return (0);
1097 }
1098 
1099 int
1100 kvm_set_msr_common(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t data)
1101 {
1102 	switch (msr) {
1103 	case MSR_EFER:
1104 		return (set_efer(vcpu, data));
1105 	case MSR_K7_HWCR:
1106 		data &= ~(uint64_t)0x40; /* ignore flush filter disable */
1107 		if (data != 0) {
1108 			cmn_err(CE_CONT,
1109 			    "!unimplemented HWCR wrmsr: 0x%lx\n", data);
1110 			return (1);
1111 		}
1112 		break;
1113 	case MSR_FAM10H_MMIO_CONF_BASE:
1114 		if (data != 0) {
1115 			cmn_err(CE_CONT, "!unimplemented MMIO_CONF_BASE wrmsr: "
1116 				"0x%lx\n", data);
1117 			return (1);
1118 		}
1119 		break;
1120 	case MSR_AMD64_NB_CFG:
1121 		break;
1122 	case MSR_IA32_DEBUGCTLMSR:
1123 		if (!data) {
1124 			/* We support the non-activated case already */
1125 			break;
1126 		} else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) {
1127 			/*
1128 			 * Values other than LBR and BTF are vendor-specific,
1129 			 * thus reserved and should throw a #GP
1130 			 */
1131 			return (1);
1132 		}
1133 		cmn_err(CE_CONT, "!%s: MSR_IA32_DEBUGCTLMSR 0x%lx, nop\n",
1134 			__func__, data);
1135 		break;
1136 	case MSR_IA32_UCODE_REV:
1137 	case MSR_IA32_UCODE_WRITE:
1138 	case MSR_VM_HSAVE_PA:
1139 	case MSR_AMD64_PATCH_LOADER:
1140 		break;
1141 	case 0x200 ... 0x2ff:
1142 		return (set_msr_mtrr(vcpu, msr, data));
1143 	case MSR_IA32_APICBASE:
1144 		kvm_set_apic_base(vcpu, data);
1145 		break;
1146 	case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
1147 		return (kvm_x2apic_msr_write(vcpu, msr, data));
1148 	case MSR_IA32_MISC_ENABLE:
1149 		vcpu->arch.ia32_misc_enable_msr = data;
1150 		break;
1151 
1152 	case MSR_KVM_WALL_CLOCK:
1153 	case MSR_KVM_WALL_CLOCK_NEW:
1154 		vcpu->kvm->arch.wall_clock = data;
1155 		kvm_write_wall_clock(vcpu->kvm, data);
1156 		break;
1157 	case MSR_KVM_SYSTEM_TIME:
1158 	case MSR_KVM_SYSTEM_TIME_NEW:
1159 	{
1160 		vcpu->arch.time_addr = 0;
1161 		vcpu->arch.time_val = data;
1162 
1163 		/* nothing further to do if disabled */
1164 		if ((data & 1) == 0)
1165 			break;
1166 
1167 		/* insist that the time output be confined to a single page */
1168 		data &= ~1UL;
1169 		if (((data & PAGEOFFSET) +
1170 		    sizeof (struct pvclock_vcpu_time_info)) > PAGESIZE) {
1171 			break;
1172 		}
1173 
1174 		vcpu->arch.time_addr = data;
1175 		kvm_request_guest_time_update(vcpu, B_TRUE);
1176 		break;
1177 	}
1178 
1179 	case MSR_IA32_MCG_CTL:
1180 	case MSR_IA32_MCG_STATUS:
1181 	case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
1182 		return (set_msr_mce(vcpu, msr, data));
1183 
1184 	/*
1185 	 * Performance counters are not protected by a CPUID bit, so we should
1186 	 * check all of them in the generic path for the sake of cross vendor
1187 	 * migration. Writing a zero into the event select MSRs disables them,
1188 	 * which we perfectly emulate ;-). Any other value should be at least
1189 	 * reported, some guests depend on them.
1190 	 */
1191 	case MSR_P6_EVNTSEL0:
1192 	case MSR_P6_EVNTSEL1:
1193 	case MSR_K7_EVNTSEL0:
1194 	case MSR_K7_EVNTSEL1:
1195 	case MSR_K7_EVNTSEL2:
1196 	case MSR_K7_EVNTSEL3:
1197 		if (data != 0)
1198 			cmn_err(CE_CONT, "!unimplemented perfctr wrmsr: "
1199 				"0x%x data 0x%lx\n", msr, data);
1200 		break;
1201 	/*
1202 	 * at least RHEL 4 unconditionally writes to the perfctr registers,
1203 	 * so we ignore writes to make it happy.
1204 	 */
1205 	case MSR_P6_PERFCTR0:
1206 	case MSR_P6_PERFCTR1:
1207 	case MSR_K7_PERFCTR0:
1208 	case MSR_K7_PERFCTR1:
1209 	case MSR_K7_PERFCTR2:
1210 	case MSR_K7_PERFCTR3:
1211 		cmn_err(CE_CONT, "!unimplemented perfctr wrmsr: "
1212 			"0x%x data 0x%lx\n", msr, data);
1213 		break;
1214 	case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
1215 		if (kvm_hv_msr_partition_wide(msr)) {
1216 			int r;
1217 			mutex_enter(&vcpu->kvm->lock);
1218 			r = set_msr_hyperv_pw(vcpu, msr, data);
1219 			mutex_exit(&vcpu->kvm->lock);
1220 			return (r);
1221 		} else
1222 			return (set_msr_hyperv(vcpu, msr, data));
1223 		break;
1224 	default:
1225 		if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
1226 			return (xen_hvm_config(vcpu, data));
1227 		if (!ignore_msrs) {
1228 			cmn_err(CE_CONT, "!unhandled wrmsr: 0x%x data %lx\n",
1229 				msr, data);
1230 			return (1);
1231 		} else {
1232 			cmn_err(CE_CONT, "!ignored wrmsr: 0x%x data %lx\n",
1233 				msr, data);
1234 			break;
1235 		}
1236 	}
1237 
1238 	return (0);
1239 }
1240 
1241 /*
1242  * Reads an msr value (of 'msr_index') into 'pdata'.
1243  * Returns 0 on success, non-0 otherwise.
1244  * Assumes vcpu_load() was already called.
1245  */
1246 int
1247 kvm_get_msr(struct kvm_vcpu *vcpu, uint32_t msr_index, uint64_t *pdata)
1248 {
1249 	return (kvm_x86_ops->get_msr(vcpu, msr_index, pdata));
1250 }
1251 
1252 static int
1253 get_msr_mtrr(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t *pdata)
1254 {
1255 	struct mtrr_state_type *state = &vcpu->arch.mtrr_state;
1256 	uint64_t *p = (uint64_t *)&state->fixed_ranges;
1257 
1258 	if (!msr_mtrr_valid(msr))
1259 		return (1);
1260 
1261 	if (msr == MSR_MTRRdefType)
1262 		*pdata = vcpu->arch.mtrr_state.def_type +
1263 		    (vcpu->arch.mtrr_state.enabled << 10);
1264 	else if (msr == MSR_MTRRfix64K_00000)
1265 		*pdata = p[0];
1266 	else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
1267 		*pdata = p[1 + msr - MSR_MTRRfix16K_80000];
1268 	else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
1269 		*pdata = p[3 + msr - MSR_MTRRfix4K_C0000];
1270 	else if (msr == MSR_IA32_CR_PAT)
1271 		*pdata = vcpu->arch.pat;
1272 	else {	/* Variable MTRRs */
1273 		int idx, is_mtrr_mask;
1274 		uint64_t *pt;
1275 
1276 		idx = (msr - 0x200) / 2;
1277 		is_mtrr_mask = msr - 0x200 - 2 * idx;
1278 		if (!is_mtrr_mask) {
1279 			pt = (uint64_t *)&state->var_ranges[idx].base_lo;
1280 		} else {
1281 			pt = (uint64_t *)&state->var_ranges[idx].mask_lo;
1282 		}
1283 
1284 		*pdata = *pt;
1285 	}
1286 
1287 	return (0);
1288 }
1289 
1290 static int
1291 get_msr_mce(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t *pdata)
1292 {
1293 	uint64_t data;
1294 	uint64_t mcg_cap = vcpu->arch.mcg_cap;
1295 	unsigned bank_num = mcg_cap & 0xff;
1296 
1297 	switch (msr) {
1298 	case MSR_IA32_P5_MC_ADDR:
1299 	case MSR_IA32_P5_MC_TYPE:
1300 		data = 0;
1301 		break;
1302 	case MSR_IA32_MCG_CAP:
1303 		data = vcpu->arch.mcg_cap;
1304 		break;
1305 	case MSR_IA32_MCG_CTL:
1306 		if (!(mcg_cap & MCG_CTL_P))
1307 			return (1);
1308 		data = vcpu->arch.mcg_ctl;
1309 		break;
1310 	case MSR_IA32_MCG_STATUS:
1311 		data = vcpu->arch.mcg_status;
1312 		break;
1313 	default:
1314 		if (msr >= MSR_IA32_MC0_CTL &&
1315 		    msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
1316 			uint32_t offset = msr - MSR_IA32_MC0_CTL;
1317 			data = vcpu->arch.mce_banks[offset];
1318 			break;
1319 		}
1320 		return (1);
1321 	}
1322 	*pdata = data;
1323 	return (0);
1324 }
1325 
1326 static int
1327 get_msr_hyperv_pw(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t *pdata)
1328 {
1329 	uint64_t data = 0;
1330 	struct kvm *kvm = vcpu->kvm;
1331 
1332 	switch (msr) {
1333 	case HV_X64_MSR_GUEST_OS_ID:
1334 		data = kvm->arch.hv_guest_os_id;
1335 		break;
1336 	case HV_X64_MSR_HYPERCALL:
1337 		data = kvm->arch.hv_hypercall;
1338 		break;
1339 	default:
1340 		cmn_err(CE_WARN, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
1341 		return (1);
1342 	}
1343 
1344 	*pdata = data;
1345 
1346 	return (0);
1347 }
1348 
1349 static int
1350 get_msr_hyperv(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t *pdata)
1351 {
1352 	uint64_t data = 0;
1353 
1354 	switch (msr) {
1355 	case HV_X64_MSR_VP_INDEX: {
1356 		int r;
1357 		struct kvm_vcpu *v;
1358 		kvm_for_each_vcpu(r, v, vcpu->kvm)
1359 			if (v == vcpu)
1360 				data = r;
1361 		break;
1362 	}
1363 	case HV_X64_MSR_EOI:
1364 		return (kvm_hv_vapic_msr_read(vcpu, APIC_EOI, pdata));
1365 	case HV_X64_MSR_ICR:
1366 		return (kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata));
1367 	case HV_X64_MSR_TPR:
1368 		return (kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata));
1369 	default:
1370 		cmn_err(CE_WARN, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
1371 		return (1);
1372 	}
1373 
1374 	*pdata = data;
1375 	return (0);
1376 }
1377 
1378 int
1379 kvm_get_msr_common(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t *pdata)
1380 {
1381 	uint64_t data;
1382 
1383 	switch (msr) {
1384 	case MSR_IA32_PLATFORM_ID:
1385 	case MSR_IA32_UCODE_REV:
1386 	case MSR_IA32_EBL_CR_POWERON:
1387 	case MSR_IA32_DEBUGCTLMSR:
1388 	case MSR_IA32_LASTBRANCHFROMIP:
1389 	case MSR_IA32_LASTBRANCHTOIP:
1390 	case MSR_IA32_LASTINTFROMIP:
1391 	case MSR_IA32_LASTINTTOIP:
1392 	case MSR_K8_SYSCFG:
1393 	case MSR_K7_HWCR:
1394 	case MSR_VM_HSAVE_PA:
1395 	case MSR_P6_PERFCTR0:
1396 	case MSR_P6_PERFCTR1:
1397 	case MSR_P6_EVNTSEL0:
1398 	case MSR_P6_EVNTSEL1:
1399 	case MSR_K7_EVNTSEL0:
1400 	case MSR_K7_PERFCTR0:
1401 	case MSR_K8_INT_PENDING_MSG:
1402 	case MSR_AMD64_NB_CFG:
1403 	case MSR_FAM10H_MMIO_CONF_BASE:
1404 		data = 0;
1405 		break;
1406 	case MSR_MTRRcap:
1407 		data = 0x500 | KVM_NR_VAR_MTRR;
1408 		break;
1409 	case 0x200 ... 0x2ff:
1410 		return (get_msr_mtrr(vcpu, msr, pdata));
1411 	case 0xcd: /* fsb frequency */
1412 		data = 3;
1413 		break;
1414 	case MSR_IA32_APICBASE:
1415 		data = kvm_get_apic_base(vcpu);
1416 		break;
1417 	case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
1418 		return (kvm_x2apic_msr_read(vcpu, msr, pdata));
1419 		break;
1420 	case MSR_IA32_MISC_ENABLE:
1421 		data = vcpu->arch.ia32_misc_enable_msr;
1422 		break;
1423 	case MSR_IA32_PERF_STATUS:
1424 		/* TSC increment by tick */
1425 		data = 1000ULL;
1426 		/* CPU multiplier */
1427 		data |= (((uint64_t)4ULL) << 40);
1428 		break;
1429 	case MSR_EFER:
1430 		data = vcpu->arch.efer;
1431 		break;
1432 	case MSR_KVM_WALL_CLOCK:
1433 	case MSR_KVM_WALL_CLOCK_NEW:
1434 		data = vcpu->kvm->arch.wall_clock;
1435 		break;
1436 	case MSR_KVM_SYSTEM_TIME:
1437 	case MSR_KVM_SYSTEM_TIME_NEW:
1438 		data = vcpu->arch.time_val;
1439 		break;
1440 	case MSR_IA32_P5_MC_ADDR:
1441 	case MSR_IA32_P5_MC_TYPE:
1442 	case MSR_IA32_MCG_CAP:
1443 	case MSR_IA32_MCG_CTL:
1444 	case MSR_IA32_MCG_STATUS:
1445 	case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
1446 		return (get_msr_mce(vcpu, msr, pdata));
1447 	case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
1448 		if (kvm_hv_msr_partition_wide(msr)) {
1449 			int r;
1450 			mutex_enter(&vcpu->kvm->lock);
1451 			r = get_msr_hyperv_pw(vcpu, msr, pdata);
1452 			mutex_exit(&vcpu->kvm->lock);
1453 			return (r);
1454 		} else
1455 			return (get_msr_hyperv(vcpu, msr, pdata));
1456 		break;
1457 	default:
1458 		if (!ignore_msrs) {
1459 			cmn_err(CE_CONT, "!unhandled rdmsr: 0x%x\n", msr);
1460 			return (1);
1461 		} else {
1462 			cmn_err(CE_CONT, "!ignored rdmsr: 0x%x\n", msr);
1463 			data = 0;
1464 		}
1465 		break;
1466 	}
1467 	*pdata = data;
1468 
1469 	return (0);
1470 }
1471 
1472 /*
1473  * Read or write a bunch of msrs. All parameters are kernel addresses.
1474  *
1475  * @return number of msrs set successfully.
1476  */
1477 static int
1478 __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
1479     struct kvm_msr_entry *entries, int (*do_msr)(struct kvm_vcpu *vcpu,
1480     unsigned index, uint64_t *data))
1481 {
1482 	int i, idx;
1483 
1484 	vcpu_load(vcpu);
1485 
1486 	for (i = 0; i < msrs->nmsrs; i++) {
1487 		if (do_msr(vcpu, entries[i].index, &entries[i].data))
1488 			break;
1489 	}
1490 
1491 	vcpu_put(vcpu);
1492 
1493 	return (i);
1494 }
1495 
1496 int
1497 kvm_vcpu_ioctl_get_msrs(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, int *rv)
1498 {
1499 	int r;
1500 
1501 	if (msrs->nmsrs >= MAX_IO_MSRS)
1502 		return (-E2BIG);
1503 
1504 	if ((r = __msr_io(vcpu, msrs, msrs->entries, kvm_get_msr)) < 0)
1505 		return (r);
1506 
1507 	*rv = r;
1508 
1509 	return (0);
1510 }
1511 
1512 int
1513 kvm_vcpu_ioctl_set_msrs(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, int *rv)
1514 {
1515 	int r;
1516 
1517 	if (msrs->nmsrs >= MAX_IO_MSRS)
1518 		return (-E2BIG);
1519 
1520 	if ((r = __msr_io(vcpu, msrs, msrs->entries, do_set_msr)) < 0)
1521 		return (-EINVAL);
1522 
1523 	*rv = r;
1524 
1525 	return (0);
1526 }
1527 
1528 int
1529 kvm_dev_ioctl_check_extension(long ext, int *rval_p)
1530 {
1531 	int r;
1532 
1533 	switch (ext) {
1534 	case KVM_CAP_IRQCHIP:
1535 	case KVM_CAP_HLT:
1536 	case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
1537 	case KVM_CAP_SET_TSS_ADDR:
1538 	case KVM_CAP_EXT_CPUID:
1539 	case KVM_CAP_CLOCKSOURCE:
1540 	case KVM_CAP_PIT:
1541 	case KVM_CAP_NOP_IO_DELAY:
1542 	case KVM_CAP_MP_STATE:
1543 	case KVM_CAP_SYNC_MMU:
1544 	case KVM_CAP_REINJECT_CONTROL:
1545 	case KVM_CAP_IRQ_INJECT_STATUS:
1546 	case KVM_CAP_ASSIGN_DEV_IRQ:
1547 	case KVM_CAP_IOEVENTFD:
1548 	case KVM_CAP_PIT2:
1549 	case KVM_CAP_PIT_STATE2:
1550 	case KVM_CAP_SET_IDENTITY_MAP_ADDR:
1551 	case KVM_CAP_XEN_HVM:
1552 	case KVM_CAP_ADJUST_CLOCK:
1553 	case KVM_CAP_VCPU_EVENTS:
1554 	case KVM_CAP_HYPERV:
1555 	case KVM_CAP_HYPERV_VAPIC:
1556 	case KVM_CAP_HYPERV_SPIN:
1557 	case KVM_CAP_PCI_SEGMENT:
1558 	case KVM_CAP_X86_ROBUST_SINGLESTEP:
1559 		*rval_p = 1;
1560 		r = DDI_SUCCESS;
1561 		break;
1562 	case KVM_CAP_COALESCED_MMIO:
1563 		*rval_p = KVM_COALESCED_MMIO_PAGE_OFFSET;
1564 		r = DDI_SUCCESS;
1565 		break;
1566 	case KVM_CAP_VAPIC:
1567 		*rval_p = !kvm_x86_ops->cpu_has_accelerated_tpr();
1568 		r = DDI_SUCCESS;
1569 		break;
1570 	case KVM_CAP_NR_VCPUS:
1571 		*rval_p = KVM_MAX_VCPUS;
1572 		r = DDI_SUCCESS;
1573 		break;
1574 	case KVM_CAP_NR_MEMSLOTS:
1575 		*rval_p = KVM_MEMORY_SLOTS;
1576 		r = DDI_SUCCESS;
1577 		break;
1578 	case KVM_CAP_PV_MMU:	/* obsolete */
1579 		r = EINVAL;
1580 		break;
1581 	case KVM_CAP_IOMMU:
1582 		*rval_p = 0;
1583 		r = DDI_SUCCESS;
1584 		break;
1585 	case KVM_CAP_MCE:
1586 		*rval_p = KVM_MAX_MCE_BANKS;
1587 		r = DDI_SUCCESS;
1588 		break;
1589 	default:
1590 		r = EINVAL;
1591 		break;
1592 	}
1593 
1594 	return (r);
1595 }
1596 
1597 void
1598 kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1599 {
1600 	kvm_x86_ops->vcpu_load(vcpu, cpu);
1601 	kvm_request_guest_time_update(vcpu, B_FALSE);
1602 }
1603 
1604 void
1605 kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
1606 {
1607 	kvm_put_guest_fpu(vcpu);
1608 
1609 	kvm_x86_ops->vcpu_put(vcpu);
1610 }
1611 
1612 static int
1613 is_efer_nx(void)
1614 {
1615 	unsigned long long efer = 0;
1616 
1617 	rdmsrl_safe(MSR_EFER, &efer);
1618 	return (efer & EFER_NX);
1619 }
1620 
1621 int
1622 kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
1623     int *rv, intptr_t arg)
1624 {
1625 	struct kvm_cpuid2 *id;
1626 
1627 	id = (void *)arg;
1628 
1629 	if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
1630 		return (E2BIG);
1631 
1632 	if (copyin(id->entries, vcpu->arch.cpuid_entries,
1633 	    cpuid->nent * sizeof (struct kvm_cpuid_entry2)) < 0)
1634 		return (EFAULT);
1635 
1636 	vcpu_load(vcpu);
1637 	vcpu->arch.cpuid_nent = cpuid->nent;
1638 	kvm_apic_set_version(vcpu);
1639 	kvm_x86_ops->cpuid_update(vcpu);
1640 	vcpu_put(vcpu);
1641 
1642 	return (0);
1643 }
1644 
1645 int
1646 kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid)
1647 {
1648 	int r;
1649 	struct kvm_cpuid_entry2 *entries = cpuid->entries;
1650 
1651 	vcpu_load(vcpu);
1652 
1653 	cpuid->nent = vcpu->arch.cpuid_nent;
1654 
1655 	if (cpuid->nent < vcpu->arch.cpuid_nent) {
1656 		vcpu_put(vcpu);
1657 		return (E2BIG);
1658 	}
1659 
1660 	bcopy(&vcpu->arch.cpuid_entries, cpuid->entries,
1661 	    vcpu->arch.cpuid_nent * sizeof (struct kvm_cpuid_entry2));
1662 
1663 	vcpu_put(vcpu);
1664 
1665 	return (0);
1666 }
1667 
1668 static inline void native_cpuid(unsigned int *eax, unsigned int *ebx,
1669 				unsigned int *ecx, unsigned int *edx)
1670 {
1671 	/* ecx is often an input as well as an output. */
1672 	__asm__ volatile("cpuid"
1673 	    : "=a" (*eax),
1674 	    "=b" (*ebx),
1675 	    "=c" (*ecx),
1676 	    "=d" (*edx)
1677 	    : "0" (*eax), "2" (*ecx));
1678 }
1679 
1680 #define	__cpuid			native_cpuid
1681 
1682 /* Some CPUID calls want 'count' to be placed in ecx */
1683 static inline void
1684 cpuid_count(unsigned int op, int count, unsigned int *eax, unsigned int *ebx,
1685     unsigned int *ecx, unsigned int *edx)
1686 {
1687 	*eax = op;
1688 	*ecx = count;
1689 	__cpuid(eax, ebx, ecx, edx);
1690 }
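/*
 * cpuid_count() is used for leaves whose sub-leaves are selected through
 * %ecx; for instance, cpuid_count(4, 1, &eax, &ebx, &ecx, &edx) would read
 * the second cache descriptor of the deterministic cache parameters leaf
 * (the example index is illustrative).  do_cpuid_1_ent() below wraps it for
 * a kvm_cpuid_entry2.
 */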
1691 
1692 static void
1693 do_cpuid_1_ent(kvm_cpuid_entry2_t *entry, uint32_t function, uint32_t index)
1694 {
1695 	entry->function = function;
1696 	entry->index = index;
1697 	cpuid_count(entry->function, entry->index,
1698 		    &entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
1699 	entry->flags = 0;
1700 }
1701 
1702 
1703 #define	F(x) bit(X86_FEATURE_##x)
1704 
1705 static void
1706 do_cpuid_ent(struct kvm_cpuid_entry2 *entry, uint32_t function,
1707     uint32_t index, int *nent, int maxnent)
1708 {
1709 	unsigned int ddic;
1710 	unsigned f_nx = is_efer_nx() ? F(NX) : 0;
1711 	unsigned f_gbpages = (kvm_x86_ops->get_lpage_level() == PT_PDPE_LEVEL)
1712 				? F(GBPAGES) : 0;
1713 	unsigned f_lm = F(LM);
1714 	unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0;
1715 
1716 	/* cpuid 1.edx */
1717 	const uint32_t kvm_supported_word0_x86_features =
1718 		F(FPU) | F(VME) | F(DE) | F(PSE) |
1719 		F(TSC) | F(MSR) | F(PAE) | F(MCE) |
1720 		F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) |
1721 		F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
1722 		F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLSH) |
1723 		0 /* Reserved, DS, ACPI */ | F(MMX) |
1724 		F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) |
1725 		0 /* HTT, TM, Reserved, PBE */;
1726 	/* cpuid 0x80000001.edx */
1727 	const uint32_t kvm_supported_word1_x86_features =
1728 		F(FPU) | F(VME) | F(DE) | F(PSE) |
1729 		F(TSC) | F(MSR) | F(PAE) | F(MCE) |
1730 		F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) |
1731 		F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
1732 		F(PAT) | F(PSE36) | 0 /* Reserved */ |
1733 		f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) |
1734 		F(FXSR) | F(FXSR_OPT) | f_gbpages | f_rdtscp |
1735 		0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
1736 	/* cpuid 1.ecx */
1737 	const uint32_t kvm_supported_word4_x86_features =
1738 		F(XMM3) | 0 /* Reserved, DTES64, MONITOR */ |
1739 		0 /* DS-CPL, VMX, SMX, EST */ |
1740 		0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
1741 		0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ |
1742 		0 /* Reserved, DCA */ | F(XMM4_1) |
1743 		F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
1744 		0 /* Reserved, XSAVE, OSXSAVE */ | F(AES) |
1745 		0 /* AVX, Reserved */;
1746 	/* cpuid 0x80000001.ecx */
1747 	const uint32_t kvm_supported_word6_x86_features =
1748 		F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ |
1749 		F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
1750 		F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(SSE5) |
1751 		0 /* SKINIT */ | 0 /* WDT */;
1752 
1753 	/*
1754 	 * Keep us from migrating between cpuid calls.
1755 	 */
1756 	kpreempt_disable();
1757 
1758 	do_cpuid_1_ent(entry, function, index);
1759 	++*nent;
1760 
1761 	switch (function) {
1762 	case 0:
1763 		entry->eax = min(entry->eax, (uint32_t)0xb);
1764 		break;
1765 	case 1:
1766 		entry->edx &= kvm_supported_word0_x86_features;
1767 		entry->ecx &= kvm_supported_word4_x86_features;
1768 		/*
1769 		 * we support x2apic emulation even if host does not support
1770 		 * it since we emulate x2apic in software
1771 		 */
1772 		entry->ecx |= F(X2APIC);
1773 		break;
1774 	/*
1775 	 * function 2 entries are STATEFUL. That is, repeated cpuid commands
1776 	 * may return different values. This forces us to get_cpu() before
1777 	 * issuing the first command, and also to emulate this annoying behavior
1778 	 * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT
1779 	 */
1780 	case 2: {
1781 		int t, times = entry->eax & 0xff;
1782 
1783 		entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
1784 		entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
1785 		for (t = 1; t < times && *nent < maxnent; ++t) {
1786 			do_cpuid_1_ent(&entry[t], function, 0);
1787 			entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
1788 			++*nent;
1789 		}
1790 		break;
1791 	}
1792 	/* function 4 and 0xb have additional index. */
1793 	case 4: {
1794 		int i, cache_type;
1795 
1796 		entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1797 		/* read more entries until cache_type is zero */
1798 		for (i = 1; *nent < maxnent; ++i) {
1799 			cache_type = entry[i - 1].eax & 0x1f;
1800 			if (!cache_type)
1801 				break;
1802 			do_cpuid_1_ent(&entry[i], function, i);
1803 			entry[i].flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1804 			++*nent;
1805 		}
1806 		break;
1807 	}
1808 	case 0xb: {
1809 		int i, level_type;
1810 
1811 		entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1812 		/* read more entries until level_type is zero */
1813 		for (i = 1; *nent < maxnent; ++i) {
1814 			level_type = entry[i - 1].ecx & 0xff00;
1815 			if (!level_type)
1816 				break;
1817 			do_cpuid_1_ent(&entry[i], function, i);
1818 			entry[i].flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1819 			++*nent;
1820 		}
1821 		break;
1822 	}
1823 	case 0x80000000:
1824 		entry->eax = min(entry->eax, 0x8000001a);
1825 		break;
1826 	case 0x80000001:
1827 		entry->edx &= kvm_supported_word1_x86_features;
1828 		entry->ecx &= kvm_supported_word6_x86_features;
1829 		break;
1830 	}
1831 
1832 	kpreempt_enable();
1833 }
1834 
1835 #undef F
1836 
1837 int
1838 kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
1839     struct kvm_cpuid_entry2  *entries)
1840 {
1841 	struct kvm_cpuid_entry2 *cpuid_entries;
1842 	int limit, nent = 0, r = E2BIG;
1843 	uint32_t func;
1844 	int allocsize = 0;
1845 
1846 	if (cpuid->nent < 1)
1847 		goto out;
1848 	if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
1849 		cpuid->nent = KVM_MAX_CPUID_ENTRIES;
1850 	r = ENOMEM;
1851 	allocsize = sizeof (struct kvm_cpuid_entry2) * cpuid->nent;
1852 	cpuid_entries = kmem_zalloc(allocsize, KM_SLEEP);
1853 
1854 	do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent);
1855 	limit = cpuid_entries[0].eax;
1856 	for (func = 1; func <= limit && nent < cpuid->nent; ++func)
1857 		do_cpuid_ent(&cpuid_entries[nent], func, 0, &nent, cpuid->nent);
1858 
1859 	r = E2BIG;
1860 	if (nent >= cpuid->nent)
1861 		goto out_free;
1862 
1863 	do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent);
1864 	limit = cpuid_entries[nent - 1].eax;
1865 	for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func)
1866 		do_cpuid_ent(&cpuid_entries[nent], func, 0, &nent, cpuid->nent);
1867 	r = E2BIG;
1868 	if (nent >= cpuid->nent)
1869 		goto out_free;
1870 
1871 	r = EFAULT;
1872 	if (copyout(cpuid_entries, entries,
1873 	    nent * sizeof (kvm_cpuid_entry2_t)))
1874 		goto out_free;
1875 
1876 	cpuid->nent = nent;
1877 	r = 0;
1878 
1879 out_free:
1880 	kmem_free(cpuid_entries, allocsize);
1881 out:
1882 	return (r);
1883 }
1884 
1885 int
1886 kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
1887 {
1888 	vcpu_load(vcpu);
1889 	bcopy(vcpu->arch.apic->regs, s->regs, sizeof (*s));
1890 	vcpu_put(vcpu);
1891 
1892 	return (0);
1893 }
1894 
1895 int
1896 kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
1897 {
1898 	vcpu_load(vcpu);
1899 	bcopy(s->regs, vcpu->arch.apic->regs, sizeof (*s));
1900 	kvm_apic_post_state_restore(vcpu);
1901 	update_cr8_intercept(vcpu);
1902 	vcpu_put(vcpu);
1903 
1904 	return (0);
1905 }
1906 
1907 int
1908 kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq)
1909 {
1910 	if (irq->irq < 0 || irq->irq >= 256)
1911 		return (-EINVAL);
1912 
1913 	if (irqchip_in_kernel(vcpu->kvm))
1914 		return (-ENXIO);
1915 
1916 	vcpu_load(vcpu);
1917 
1918 	kvm_queue_interrupt(vcpu, irq->irq, 0);
1919 
1920 	vcpu_put(vcpu);
1921 
1922 	return (0);
1923 }
1924 
1925 int
1926 kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
1927 {
1928 	vcpu_load(vcpu);
1929 	kvm_inject_nmi(vcpu);
1930 	vcpu_put(vcpu);
1931 
1932 	return (0);
1933 }
1934 
1935 int
1936 kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, uint64_t *mcg_capp)
1937 {
1938 	int rval;
1939 	uint64_t mcg_cap = *mcg_capp;
1940 	unsigned bank_num = mcg_cap & 0xff, bank;
1941 
1942 	vcpu_load(vcpu);
1943 	rval = -EINVAL;
1944 	if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS)
1945 		goto out;
1946 	if (mcg_cap & ~(KVM_MCE_CAP_SUPPORTED | 0xff | 0xff0000))
1947 		goto out;
1948 	rval = 0;
1949 	vcpu->arch.mcg_cap = mcg_cap;
1950 	/* Init IA32_MCG_CTL to all 1s */
1951 	if (mcg_cap & MCG_CTL_P)
1952 		vcpu->arch.mcg_ctl = ~(uint64_t)0;
1953 	/* Init IA32_MCi_CTL to all 1s */
1954 	for (bank = 0; bank < bank_num; bank++)
1955 		vcpu->arch.mce_banks[bank * 4] = ~(uint64_t)0;
1956 out:
1957 	vcpu_put(vcpu);
1958 	return (rval);
1959 }
1960 
1961 int
1962 kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
1963     struct kvm_vcpu_events *events)
1964 {
1965 	vcpu_load(vcpu);
1966 
1967 	events->exception.injected = vcpu->arch.exception.pending;
1968 	events->exception.nr = vcpu->arch.exception.nr;
1969 	events->exception.has_error_code = vcpu->arch.exception.has_error_code;
1970 	events->exception.error_code = vcpu->arch.exception.error_code;
1971 
1972 	events->interrupt.injected = vcpu->arch.interrupt.pending;
1973 	events->interrupt.nr = vcpu->arch.interrupt.nr;
1974 	events->interrupt.soft = vcpu->arch.interrupt.soft;
1975 
1976 	events->nmi.injected = vcpu->arch.nmi_injected;
1977 	events->nmi.pending = vcpu->arch.nmi_pending;
1978 	events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu);
1979 
1980 	events->sipi_vector = vcpu->arch.sipi_vector;
1981 
1982 	events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING |
1983 	    KVM_VCPUEVENT_VALID_SIPI_VECTOR);
1984 
1985 	vcpu_put(vcpu);
1986 
1987 	return (0);
1988 }
1989 
1990 int
1991 kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
1992     struct kvm_vcpu_events *events)
1993 {
1994 	if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING |
1995 	    KVM_VCPUEVENT_VALID_SIPI_VECTOR))
1996 		return (-EINVAL);
1997 
1998 	vcpu_load(vcpu);
1999 
2000 	vcpu->arch.exception.pending = events->exception.injected;
2001 	vcpu->arch.exception.nr = events->exception.nr;
2002 	vcpu->arch.exception.has_error_code = events->exception.has_error_code;
2003 	vcpu->arch.exception.error_code = events->exception.error_code;
2004 
2005 	vcpu->arch.interrupt.pending = events->interrupt.injected;
2006 	vcpu->arch.interrupt.nr = events->interrupt.nr;
2007 	vcpu->arch.interrupt.soft = events->interrupt.soft;
2008 
2009 	if (vcpu->arch.interrupt.pending && irqchip_in_kernel(vcpu->kvm))
2010 		kvm_pic_clear_isr_ack(vcpu->kvm);
2011 
2012 	vcpu->arch.nmi_injected = events->nmi.injected;
2013 
2014 	if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING)
2015 		vcpu->arch.nmi_pending = events->nmi.pending;
2016 
2017 	kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked);
2018 
2019 	if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR)
2020 		vcpu->arch.sipi_vector = events->sipi_vector;
2021 
2022 	vcpu_put(vcpu);
2023 
2024 	return (0);
2025 }
2026 
2027 int
2028 kvm_vm_ioctl_set_tss_addr(struct kvm *kvmp, uintptr_t addr)
2029 {
2030 	return (kvm_x86_ops->set_tss_addr(kvmp, addr));
2031 }
2032 
2033 int
2034 kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm, uint64_t ident_addr)
2035 {
2036 	kvm->arch.ept_identity_map_addr = ident_addr;
2037 	return (0);
2038 }
2039 
2040 gfn_t
2041 unalias_gfn_instantiation(struct kvm *kvm, gfn_t gfn)
2042 {
2043 	int i;
2044 	struct kvm_mem_alias *alias;
2045 	struct kvm_mem_aliases *aliases;
2046 #ifdef XXX
2047 	aliases = rcu_dereference(kvm->arch.aliases);
2048 #else
2049 	XXX_KVM_SYNC_PROBE;
2050 	aliases = kvm->arch.aliases;
2051 #endif
2052 
2053 	for (i = 0; i < aliases->naliases; i++) {
2054 		alias = &aliases->aliases[i];
2055 		if (alias->flags & KVM_ALIAS_INVALID)
2056 			continue;
2057 		if (gfn >= alias->base_gfn &&
2058 		    gfn < alias->base_gfn + alias->npages)
2059 			return (alias->target_gfn + gfn - alias->base_gfn);
2060 	}
2061 
2062 	return (gfn);
2063 }
2064 
2065 gfn_t
2066 unalias_gfn(struct kvm *kvm, gfn_t gfn)
2067 {
2068 	int i;
2069 	struct kvm_mem_alias *alias;
2070 	struct kvm_mem_aliases *aliases;
2071 
2072 	/* XXX need protection */
2073 	aliases = kvm->arch.aliases;
2074 
2075 	for (i = 0; i < aliases->naliases; ++i) {
2076 		alias = &aliases->aliases[i];
2077 		if (gfn >= alias->base_gfn &&
2078 		    gfn < alias->base_gfn + alias->npages)
2079 			return (alias->target_gfn + gfn - alias->base_gfn);
2080 	}
2081 	return (gfn);
2082 }
2083 
2084 int
2085 kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
2086 {
2087 	int r;
2088 
2089 	r = 0;
2090 	switch (chip->chip_id) {
2091 	case KVM_IRQCHIP_PIC_MASTER:
2092 		memcpy(&chip->chip.pic, &pic_irqchip(kvm)->pics[0],
2093 		    sizeof (struct kvm_pic_state));
2094 		break;
2095 	case KVM_IRQCHIP_PIC_SLAVE:
2096 		memcpy(&chip->chip.pic, &pic_irqchip(kvm)->pics[1],
2097 		    sizeof (struct kvm_pic_state));
2098 		break;
2099 	case KVM_IRQCHIP_IOAPIC:
2100 		r = kvm_get_ioapic(kvm, &chip->chip.ioapic);
2101 		break;
2102 	default:
2103 		r = EINVAL;
2104 		break;
2105 	}
2106 
2107 	return (r);
2108 }
2109 
2110 int
2111 kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
2112 {
2113 	int r;
2114 
2115 	r = 0;
2116 
2117 	switch (chip->chip_id) {
2118 	case KVM_IRQCHIP_PIC_MASTER:
2119 		mutex_enter(&pic_irqchip(kvm)->lock);
2120 		memcpy(&pic_irqchip(kvm)->pics[0], &chip->chip.pic,
2121 		    sizeof (struct kvm_pic_state));
2122 		mutex_exit(&pic_irqchip(kvm)->lock);
2123 		break;
2124 	case KVM_IRQCHIP_PIC_SLAVE:
2125 		mutex_enter(&pic_irqchip(kvm)->lock);
2126 		memcpy(&pic_irqchip(kvm)->pics[1], &chip->chip.pic,
2127 		    sizeof (struct kvm_pic_state));
2128 		mutex_exit(&pic_irqchip(kvm)->lock);
2129 		break;
2130 	case KVM_IRQCHIP_IOAPIC:
2131 		r = kvm_set_ioapic(kvm, &chip->chip.ioapic);
2132 		break;
2133 	default:
2134 		r = EINVAL;
2135 		break;
2136 	}
2137 
2138 	kvm_pic_update_irq(pic_irqchip(kvm));
2139 
2140 	return (r);
2141 }
2142 
2143 int
2144 kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
2145 {
2146 	struct kvm_pit *vpit = kvm->arch.vpit;
2147 
2148 	mutex_enter(&vpit->pit_state.lock);
2149 	memcpy(ps->channels, &vpit->pit_state.channels, sizeof (ps->channels));
2150 	ps->flags = vpit->pit_state.flags;
2151 	mutex_exit(&vpit->pit_state.lock);
2152 
2153 	return (0);
2154 }
2155 
2156 int
2157 kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
2158 {
2159 	boolean_t prev_legacy, cur_legacy, start = B_FALSE;
2160 	struct kvm_pit *vpit = kvm->arch.vpit;
2161 
2162 	mutex_enter(&vpit->pit_state.lock);
2163 	prev_legacy = vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
2164 	cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY;
2165 
2166 	if (!prev_legacy && cur_legacy)
2167 		start = B_TRUE;
2168 
2169 	memcpy(&vpit->pit_state.channels, &ps->channels,
2170 	    sizeof (vpit->pit_state.channels));
2171 
2172 	vpit->pit_state.flags = ps->flags;
2173 	kvm_pit_load_count(kvm, 0, vpit->pit_state.channels[0].count, start);
2174 
2175 	mutex_exit(&vpit->pit_state.lock);
2176 
2177 	return (0);
2178 }
2179 
2180 /* TODO: As Pascal would say, we can do better */
2181 int
2182 kvm_vm_ioctl_get_msr_index_list(struct kvm *kvm, uintptr_t arg)
2183 {
2184 
2185 	struct kvm_msr_list *user_msr_list = (struct kvm_msr_list *)arg;
2186 	struct kvm_msr_list *msr_list;
2187 	size_t sz = sizeof (struct kvm_msr_list);
2188 	unsigned n;
2189 
2190 	msr_list = kmem_zalloc(sz, KM_SLEEP);
2191 
2192 	if (copyin(user_msr_list, msr_list, sz) != 0) {
2193 		kmem_free(msr_list, sz);
2194 		return (EFAULT);
2195 	}
2196 
2197 	n = msr_list->nmsrs;
2198 	msr_list->nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
2199 
2200 	if (copyout(msr_list, user_msr_list, sz) != 0) {
2201 		kmem_free(msr_list, sz);
2202 		return (EFAULT);
2203 	}
2204 
2205 	if (n < msr_list->nmsrs) {
2206 		kmem_free(msr_list, sz);
2207 		return (E2BIG);
2208 	}
2209 
2210 	if (copyout(&msrs_to_save, user_msr_list->indices,
2211 	    num_msrs_to_save * sizeof (uint32_t))) {
2212 		kmem_free(msr_list, sz);
2213 		return (EFAULT);
2214 	}
2215 
2216 	if (copyout(&emulated_msrs, user_msr_list->indices +
2217 	    num_msrs_to_save, ARRAY_SIZE(emulated_msrs) *
2218 	    sizeof (uint32_t)) != 0) {
2219 		kmem_free(msr_list, sz);
2220 		return (EFAULT);
2221 	}
2222 
2223 	kmem_free(msr_list, sz);
2224 
2225 	return (0);
2226 }
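
/*
 * Illustrative sketch (not compiled): the two-call pattern a userland
 * consumer would use against the handler above -- probe with nmsrs set
 * to zero so the handler writes back the total count and fails with
 * E2BIG, then allocate the index array and call again.  The ioctl
 * name, header paths and errno-based error reporting are assumptions,
 * not taken from this file.
 */
#if 0
#include <sys/types.h>
#include <sys/ioctl.h>
#include <stdlib.h>
#include <errno.h>

static struct kvm_msr_list *
get_msr_index_list(int kvm_fd)
{
	struct kvm_msr_list probe;
	struct kvm_msr_list *list;

	probe.nmsrs = 0;
	if (ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, &probe) == 0 ||
	    errno != E2BIG)
		return (NULL);	/* expect E2BIG with probe.nmsrs filled in */

	list = calloc(1, sizeof (*list) + probe.nmsrs * sizeof (uint32_t));
	if (list == NULL)
		return (NULL);
	list->nmsrs = probe.nmsrs;

	if (ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, list) != 0) {
		free(list);
		return (NULL);
	}

	return (list);	/* list->indices[0 .. list->nmsrs - 1] are valid */
}
#endif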
2227 
2228 /*
2229  * Get (and clear) the dirty memory log for a memory slot.
2230  */
2231 int
2232 kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
2233 {
2234 	int r, i;
2235 	struct kvm_memory_slot *memslot;
2236 	unsigned long n;
2237 	unsigned long is_dirty = 0;
2238 	unsigned long *dirty_bitmap = NULL;
2239 
2240 	mutex_enter(&kvm->slots_lock);
2241 
2242 	r = EINVAL;
2243 	if (log->slot >= KVM_MEMORY_SLOTS)
2244 		goto out;
2245 
2246 	memslot = &kvm->memslots->memslots[log->slot];
2247 	r = ENOENT;
2248 	if (!memslot->dirty_bitmap)
2249 		goto out;
2250 
2251 	n = kvm_dirty_bitmap_bytes(memslot);
2252 
2253 	dirty_bitmap = kmem_alloc(n, KM_SLEEP);
2254 	memset(dirty_bitmap, 0, n);
2255 
2256 	for (i = 0; !is_dirty && i < n / sizeof (long); i++)
2257 		is_dirty = memslot->dirty_bitmap[i];
2258 
2259 	/* If nothing is dirty, don't bother messing with page tables. */
2260 	if (is_dirty) {
2261 		struct kvm_memslots *slots, *old_slots;
2262 
2263 		mutex_enter(&kvm->mmu_lock);
2264 		kvm_mmu_slot_remove_write_access(kvm, log->slot);
2265 		mutex_exit(&kvm->mmu_lock);
2266 
2267 		slots = kmem_zalloc(sizeof (struct kvm_memslots), KM_SLEEP);
2268 		if (!slots)
2269 			goto out_free;
2270 
2271 		memcpy(slots, kvm->memslots, sizeof (struct kvm_memslots));
2272 		slots->memslots[log->slot].dirty_bitmap = dirty_bitmap;
2273 
2274 		mutex_enter(&kvm->memslots_lock);
2275 		old_slots = kvm->memslots;
2276 		kvm->memslots = slots;
2277 		mutex_exit(&kvm->memslots_lock);
2278 		dirty_bitmap = old_slots->memslots[log->slot].dirty_bitmap;
2279 		kmem_free(old_slots, sizeof (struct kvm_memslots));
2280 	}
2281 
2282 	r = 0;
2283 	if (copyout(dirty_bitmap, log->dirty_bitmap, n) != 0)
2284 		r = EFAULT;
2285 out_free:
2286 	kmem_free(dirty_bitmap, n);
2287 out:
2288 	mutex_exit(&kvm->slots_lock);
2289 	return (r);
2290 }
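
/*
 * Illustrative sketch (not compiled): walking the bitmap that the
 * handler above copies out.  Bit i covers the i'th page of the memory
 * slot; bits are packed into unsigned longs, least significant bit
 * first, which matches the long-at-a-time scan above.  The callback is
 * hypothetical.
 */
#if 0
static void
walk_dirty_log(const struct kvm_dirty_log *log, unsigned long npages,
    void (*page_dirty)(unsigned long pgidx))
{
	const unsigned long *map = log->dirty_bitmap;
	const unsigned long bits = 8 * sizeof (unsigned long);
	unsigned long i;

	for (i = 0; i < npages; i++) {
		if (map[i / bits] & (1UL << (i % bits)))
			page_dirty(i);	/* page i was written since last call */
	}
}
#endif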
2291 
2292 static void
2293 kvm_init_msr_list(void)
2294 {
2295 	uint32_t dummy[2];
2296 	unsigned i, j;
2297 
2298 	/* Skip the first MSRs in the list; they are KVM-specific. */
2299 	for (i = j = KVM_SAVE_MSRS_BEGIN; i < ARRAY_SIZE(msrs_to_save); i++) {
2300 		if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
2301 			continue;
2302 		if (j < i)
2303 			msrs_to_save[j] = msrs_to_save[i];
2304 		j++;
2305 	}
2306 	num_msrs_to_save = j;
2307 }
2308 
2309 static int
2310 vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len, const void *v)
2311 {
2312 	if (vcpu->arch.apic &&
2313 	    !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v))
2314 		return (0);
2315 
2316 	return (kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, addr, len, v));
2317 }
2318 
2319 static int
2320 vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
2321 {
2322 	if (vcpu->arch.apic &&
2323 	    !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, len, v))
2324 		return (0);
2325 
2326 	return (kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, len, v));
2327 }
2328 
2329 gpa_t
2330 kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, uint32_t *error)
2331 {
2332 	uint32_t access = (kvm_x86_ops->get_cpl(vcpu) == 3) ?
2333 	    PFERR_USER_MASK : 0;
2334 
2335 	return (vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error));
2336 }
2337 
2338 gpa_t
2339 kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, uint32_t *error)
2340 {
2341 	uint32_t access = (kvm_x86_ops->get_cpl(vcpu) == 3) ?
2342 	    PFERR_USER_MASK : 0;
2343 
2344 	access |= PFERR_WRITE_MASK;
2345 
2346 	return (vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error));
2347 }
2348 
2349 /* used for instruction fetching */
2350 static int
2351 kvm_fetch_guest_virt(gva_t addr, void *val, unsigned int bytes,
2352     struct kvm_vcpu *vcpu, uint32_t *error)
2353 {
2354 	uint32_t access = (kvm_x86_ops->get_cpl(vcpu) == 3) ?
2355 	    PFERR_USER_MASK : 0;
2356 
2357 	return (kvm_read_guest_virt_helper(addr, val, bytes, vcpu,
2358 	    access | PFERR_FETCH_MASK, error));
2359 }
2360 
2361 static int
2362 kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
2363     struct kvm_vcpu *vcpu, uint32_t *error)
2364 {
2365 	uint32_t access = (kvm_x86_ops->get_cpl(vcpu) == 3) ?
2366 	    PFERR_USER_MASK : 0;
2367 
2368 	return (kvm_read_guest_virt_helper(addr, val,
2369 	    bytes, vcpu, access, error));
2370 }
2371 
2372 static int
2373 kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes,
2374     struct kvm_vcpu *vcpu, uint32_t *error)
2375 {
2376 	return (kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, error));
2377 }
2378 
2379 static int
2380 kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes,
2381     struct kvm_vcpu *vcpu, uint32_t *error)
2382 {
2383 	uintptr_t data = (uintptr_t)val;
2384 
2385 	while (bytes) {
2386 		gpa_t gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, error);
2387 		unsigned offset = addr & (PAGESIZE-1);
2388 		unsigned towrite = min(bytes, (unsigned)PAGESIZE - offset);
2389 		int ret;
2390 
2391 		if (gpa == UNMAPPED_GVA)
2392 			return (X86EMUL_PROPAGATE_FAULT);
2393 
2394 		if (kvm_write_guest(vcpu->kvm, gpa, (void *)data, towrite) < 0)
2395 			return (X86EMUL_UNHANDLEABLE);
2396 
2397 		bytes -= towrite;
2398 		data += towrite;
2399 		addr += towrite;
2400 	}
2401 
2402 	return (0);
2403 }
2404 
2405 static int
2406 emulator_read_emulated(unsigned long addr, void *val,
2407     unsigned int bytes, struct kvm_vcpu *vcpu)
2408 {
2409 	gpa_t gpa;
2410 	uint32_t error_code;
2411 
2412 	if (vcpu->mmio_read_completed) {
2413 		memcpy(val, vcpu->mmio_data, bytes);
2414 		KVM_TRACE3(mmio__read, unsigned int, bytes, uintptr_t,
2415 		    vcpu->mmio_phys_addr, uint64_t, *(uint64_t *)val);
2416 
2417 		vcpu->mmio_read_completed = 0;
2418 		return (X86EMUL_CONTINUE);
2419 	}
2420 
2421 	gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, &error_code);
2422 
2423 	if (gpa == UNMAPPED_GVA) {
2424 		kvm_inject_page_fault(vcpu, addr, error_code);
2425 		return (X86EMUL_PROPAGATE_FAULT);
2426 	}
2427 
2428 	/* For APIC access vmexit */
2429 	if ((gpa & PAGEMASK) == APIC_DEFAULT_PHYS_BASE)
2430 		goto mmio;
2431 
2432 	if (kvm_read_guest_virt(addr, val,
2433 	    bytes, vcpu, NULL) == X86EMUL_CONTINUE)
2434 		return (X86EMUL_CONTINUE);
2435 
2436 mmio:
2437 	/*
2438 	 * Is this MMIO handled locally?
2439 	 */
2440 	if (!vcpu_mmio_read(vcpu, gpa, bytes, val)) {
2441 		KVM_TRACE3(mmio__read, unsigned int, bytes, uintptr_t, gpa,
2442 		    uint64_t, *(uint64_t *)val);
2443 		return (X86EMUL_CONTINUE);
2444 	}
2445 
2446 	KVM_TRACE2(mmio__read__unsatisfied, unsigned int, bytes,
2447 	    uintptr_t, gpa);
2448 
2449 	vcpu->mmio_needed = 1;
2450 	vcpu->mmio_phys_addr = gpa;
2451 	vcpu->mmio_size = bytes;
2452 	vcpu->mmio_is_write = 0;
2453 
2454 	return (X86EMUL_UNHANDLEABLE);
2455 }
2456 
2457 int
2458 emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
2459     const void *val, int bytes)
2460 {
2461 	int ret;
2462 
2463 	ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
2464 
2465 	if (ret < 0)
2466 		return (0);
2467 
2468 	kvm_mmu_pte_write(vcpu, gpa, val, bytes, 1);
2469 
2470 	return (1);
2471 }
2472 
2473 static int
2474 emulator_write_emulated_onepage(unsigned long addr, const void *val,
2475     unsigned int bytes, struct kvm_vcpu *vcpu)
2476 {
2477 	gpa_t gpa;
2478 	uint32_t error_code;
2479 
2480 	gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, &error_code);
2481 
2482 	if (gpa == UNMAPPED_GVA) {
2483 		kvm_inject_page_fault(vcpu, addr, error_code);
2484 		return (X86EMUL_PROPAGATE_FAULT);
2485 	}
2486 
2487 	/* For APIC access vmexit */
2488 	if ((gpa & PAGEMASK) == APIC_DEFAULT_PHYS_BASE)
2489 		goto mmio;
2490 
2491 	if (emulator_write_phys(vcpu, gpa, val, bytes))
2492 		return (X86EMUL_CONTINUE);
2493 
2494 mmio:
2495 	KVM_TRACE3(mmio__write, unsigned int, bytes, uintptr_t, gpa,
2496 	    uint64_t, *(uint64_t *)val);
2497 
2498 	/*
2499 	 * Is this MMIO handled locally?
2500 	 */
2501 	if (!vcpu_mmio_write(vcpu, gpa, bytes, val))
2502 		return (X86EMUL_CONTINUE);
2503 
2504 	vcpu->mmio_needed = 1;
2505 	vcpu->mmio_phys_addr = gpa;
2506 	vcpu->mmio_size = bytes;
2507 	vcpu->mmio_is_write = 1;
2508 	memcpy(vcpu->mmio_data, val, bytes);
2509 
2510 	return (X86EMUL_CONTINUE);
2511 }
2512 
2513 int
2514 emulator_write_emulated(unsigned long addr, const void *val,
2515     unsigned int bytes, struct kvm_vcpu *vcpu)
2516 {
2517 	uintptr_t data = (uintptr_t)val;
2518 
2519 	/* Crossing a page boundary? */
2520 	if (((addr + bytes - 1) ^ addr) & PAGEMASK) {
2521 		int rc, now;
2522 
2523 		now = -addr & ~PAGEMASK;
2524 		rc = emulator_write_emulated_onepage(addr,
2525 		    (void *)data, now, vcpu);
2526 
2527 		if (rc != X86EMUL_CONTINUE)
2528 			return (rc);
2529 
2530 		addr += now;
2531 		data += now;
2532 		bytes -= now;
2533 	}
2534 
2535 	return (emulator_write_emulated_onepage(addr, val, bytes, vcpu));
2536 }
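
/*
 * Illustrative sketch (not compiled): the page-split arithmetic used
 * above.  With PAGEMASK == ~(PAGESIZE - 1), "-addr & ~PAGEMASK" is the
 * number of bytes from addr to the end of its page, computed modulo the
 * page size (it is zero for a page-aligned addr, a case the crossing
 * test above never passes down), so a write that straddles a boundary
 * is emulated as two single-page writes.
 */
#if 0
/* bytes remaining in addr's page (0 if addr is page-aligned) */
static unsigned int
bytes_to_page_end(unsigned long addr)
{
	return (-addr & ~PAGEMASK);
}

/*
 * e.g. with 4K pages, bytes_to_page_end(0x1ff8) == 8: a 16-byte write
 * at 0x1ff8 is split into [0x1ff8, 0x2000) and [0x2000, 0x2008).
 */
#endif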
2537 
2538 static int
2539 emulator_cmpxchg_emulated(unsigned long addr, const void *old,
2540     const void *new, unsigned int bytes, struct kvm_vcpu *vcpu)
2541 {
2542 	cmn_err(CE_WARN, "kvm: emulating exchange as write\n");
2543 	return (emulator_write_emulated(addr, new, bytes, vcpu));
2544 }
2545 
2546 static unsigned long
2547 get_segment_base(struct kvm_vcpu *vcpu, int seg)
2548 {
2549 	return (kvm_x86_ops->get_segment_base(vcpu, seg));
2550 }
2551 
2552 void
2553 kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
2554 {
2555 	uint64_t ops, ctx = (uint64_t)context;
2556 	unsigned long rip = kvm_rip_read(vcpu);
2557 	unsigned long rip_linear;
2558 
2559 	rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);
2560 
2561 	kvm_read_guest_virt(rip_linear, &ops, 8, vcpu, NULL);
2562 
2563 	kvm_ringbuf_record(&vcpu->kvcpu_ringbuf, KVM_RINGBUF_TAG_EMUFAIL0, ctx);
2564 	kvm_ringbuf_record(&vcpu->kvcpu_ringbuf, KVM_RINGBUF_TAG_EMUFAIL1, rip);
2565 	kvm_ringbuf_record(&vcpu->kvcpu_ringbuf, KVM_RINGBUF_TAG_EMUFAIL2, ops);
2566 }
2567 
2568 static struct x86_emulate_ops emulate_ops = {
2569 	.read_std		= kvm_read_guest_virt_system,
2570 	.fetch			= kvm_fetch_guest_virt,
2571 	.read_emulated		= emulator_read_emulated,
2572 	.write_emulated		= emulator_write_emulated,
2573 	.cmpxchg_emulated	= emulator_cmpxchg_emulated,
2574 };
2575 
2576 static void
2577 cache_all_regs(struct kvm_vcpu *vcpu)
2578 {
2579 	kvm_register_read(vcpu, VCPU_REGS_RAX);
2580 	kvm_register_read(vcpu, VCPU_REGS_RSP);
2581 	kvm_register_read(vcpu, VCPU_REGS_RIP);
2582 	vcpu->arch.regs_dirty = ~0;
2583 }
2584 
2585 int
2586 emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2,
2587     uint16_t error_code, int emulation_type)
2588 {
2589 	int r, shadow_mask;
2590 	struct decode_cache *c;
2591 	struct kvm_run *run = vcpu->run;
2592 	char *ctx = NULL;
2593 
2594 	kvm_clear_exception_queue(vcpu);
2595 	vcpu->arch.mmio_fault_cr2 = cr2;
2596 
2597 	/*
2598 	 * TODO: fix emulate.c to use guest_read/write_register
2599 	 * instead of direct ->regs accesses; this can save hundreds of
2600 	 * cycles on Intel for instructions that don't read/change RSP,
2601 	 * for example.
2602 	 */
2603 	cache_all_regs(vcpu);
2604 
2605 	vcpu->mmio_is_write = 0;
2606 	vcpu->arch.pio.string = 0;
2607 
2608 	if (!(emulation_type & EMULTYPE_NO_DECODE)) {
2609 		int cs_db, cs_l;
2610 		kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
2611 
2612 		vcpu->arch.emulate_ctxt.vcpu = vcpu;
2613 		vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu);
2614 		vcpu->arch.emulate_ctxt.mode = (!is_protmode(vcpu)) ?
2615 		    X86EMUL_MODE_REAL :
2616 		    (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) ?
2617 		    X86EMUL_MODE_VM86 : cs_l ? X86EMUL_MODE_PROT64 :
2618 		    cs_db ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
2619 
2620 		r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
2621 
2622 		/*
2623 		 * Only allow emulation of specific instructions on #UD
2624 		 * (namely VMMCALL, sysenter, sysexit, syscall)
2625 		 */
2626 		c = &vcpu->arch.emulate_ctxt.decode;
2627 		if (emulation_type & EMULTYPE_TRAP_UD) {
2628 			if (!c->twobyte) {
2629 				ctx = "non-twobyte";
2630 				goto fail;
2631 			}
2632 
2633 			switch (c->b) {
2634 			case 0x01: /* VMMCALL */
2635 				if (c->modrm_mod != 3 || c->modrm_rm != 1) {
2636 					ctx = "vmmcall";
2637 					goto fail;
2638 				}
2639 
2640 				break;
2641 			case 0x34: /* sysenter */
2642 			case 0x35: /* sysexit */
2643 				if (c->modrm_mod != 0 || c->modrm_rm != 0) {
2644 					ctx = "sysenter/sysexit";
2645 					goto fail;
2646 				}
2647 
2648 				break;
2649 			case 0x05: /* syscall */
2650 				if (c->modrm_mod != 0 || c->modrm_rm != 0) {
2651 					ctx = "syscall";
2652 					goto fail;
2653 				}
2654 
2655 				break;
2656 			default:
2657 				ctx = "unknown";
2658 				goto fail;
2659 			}
2660 
2661 			if (!(c->modrm_reg == 0 || c->modrm_reg == 3)) {
2662 				ctx = "modrm";
2663 				goto fail;
2664 			}
2665 		}
2666 
2667 		KVM_VCPU_KSTAT_INC(vcpu, kvmvs_insn_emulation);
2668 
2669 		if (r)  {
2670 			KVM_VCPU_KSTAT_INC(vcpu, kvmvs_insn_emulation_fail);
2671 
2672 			if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
2673 				return (EMULATE_DONE);
2674 
2675 			ctx = "decode";
2676 			goto fail;
2677 		}
2678 	}
2679 
2680 	if (emulation_type & EMULTYPE_SKIP) {
2681 		kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.decode.eip);
2682 		return (EMULATE_DONE);
2683 	}
2684 
2685 	r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
2686 	shadow_mask = vcpu->arch.emulate_ctxt.interruptibility;
2687 
2688 	if (r == 0)
2689 		kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask);
2690 
2691 	if (vcpu->arch.pio.string)
2692 		return (EMULATE_DO_MMIO);
2693 
2694 	if ((r || vcpu->mmio_is_write) && run) {
2695 		run->exit_reason = KVM_EXIT_MMIO;
2696 		run->mmio.phys_addr = vcpu->mmio_phys_addr;
2697 		memcpy(run->mmio.data, vcpu->mmio_data, 8);
2698 		run->mmio.len = vcpu->mmio_size;
2699 		run->mmio.is_write = vcpu->mmio_is_write;
2700 	}
2701 
2702 	if (r) {
2703 		if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
2704 			return (EMULATE_DONE);
2705 
2706 		if (!vcpu->mmio_needed) {
2707 			ctx = "mmio";
2708 			goto fail;
2709 		}
2710 
2711 		return (EMULATE_DO_MMIO);
2712 	}
2713 
2714 	kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
2715 
2716 	if (vcpu->mmio_is_write) {
2717 		vcpu->mmio_needed = 0;
2718 		return (EMULATE_DO_MMIO);
2719 	}
2720 
2721 	return (EMULATE_DONE);
2722 
2723 fail:
2724 	kvm_report_emulation_failure(vcpu, ctx != NULL ? ctx : "????");
2725 	return (EMULATE_FAIL);
2726 }
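
/*
 * Illustrative sketch (not compiled): the userland half of the
 * EMULATE_DO_MMIO path set up above.  On a KVM_EXIT_MMIO exit the VMM
 * services run->mmio and, for a read, leaves the result in
 * run->mmio.data before re-entering the guest so that
 * emulator_read_emulated() can complete the faulting instruction on
 * the next pass.  The device-model callbacks are hypothetical.
 */
#if 0
static void
handle_mmio_exit(struct kvm_run *run)
{
	if (run->mmio.is_write) {
		device_mmio_write(run->mmio.phys_addr, run->mmio.data,
		    run->mmio.len);
	} else {
		/* result is handed back to the in-kernel emulator on re-entry */
		device_mmio_read(run->mmio.phys_addr, run->mmio.data,
		    run->mmio.len);
	}
}
#endif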
2727 
2728 static int
2729 pio_copy_data(struct kvm_vcpu *vcpu)
2730 {
2731 	void *p = vcpu->arch.pio_data;
2732 	gva_t q = vcpu->arch.pio.guest_gva;
2733 	unsigned bytes;
2734 	int ret;
2735 	uint32_t error_code;
2736 
2737 	bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count;
2738 
2739 	if (vcpu->arch.pio.in)
2740 		ret = kvm_write_guest_virt(q, p, bytes, vcpu, &error_code);
2741 	else
2742 		ret = kvm_read_guest_virt(q, p, bytes, vcpu, &error_code);
2743 
2744 	if (ret == X86EMUL_PROPAGATE_FAULT)
2745 		kvm_inject_page_fault(vcpu, q, error_code);
2746 
2747 	return (ret);
2748 }
2749 
2750 int
2751 complete_pio(struct kvm_vcpu *vcpu)
2752 {
2753 	struct kvm_pio_request *io = &vcpu->arch.pio;
2754 	long delta;
2755 	int r;
2756 	unsigned long val;
2757 
2758 	if (!io->string) {
2759 		if (io->in) {
2760 			val = kvm_register_read(vcpu, VCPU_REGS_RAX);
2761 			memcpy(&val, vcpu->arch.pio_data, io->size);
2762 			kvm_register_write(vcpu, VCPU_REGS_RAX, val);
2763 		}
2764 	} else {
2765 		if (io->in) {
2766 			r = pio_copy_data(vcpu);
2767 			if (r)
2768 				goto out;
2769 		}
2770 
2771 		delta = 1;
2772 		if (io->rep) {
2773 			delta *= io->cur_count;
2774 			/*
2775 			 * The size of the register should really depend on
2776 			 * current address size.
2777 			 */
2778 			val = kvm_register_read(vcpu, VCPU_REGS_RCX);
2779 			val -= delta;
2780 			kvm_register_write(vcpu, VCPU_REGS_RCX, val);
2781 		}
2782 		if (io->down)
2783 			delta = -delta;
2784 		delta *= io->size;
2785 		if (io->in) {
2786 			val = kvm_register_read(vcpu, VCPU_REGS_RDI);
2787 			val += delta;
2788 			kvm_register_write(vcpu, VCPU_REGS_RDI, val);
2789 		} else {
2790 			val = kvm_register_read(vcpu, VCPU_REGS_RSI);
2791 			val += delta;
2792 			kvm_register_write(vcpu, VCPU_REGS_RSI, val);
2793 		}
2794 	}
2795 out:
2796 	io->count -= io->cur_count;
2797 	io->cur_count = 0;
2798 
2799 	return (0);
2800 }
2801 
2802 static int
2803 kernel_pio(struct kvm_vcpu *vcpu, void *pd)
2804 {
2805 	/* TODO: String I/O for in-kernel devices */
2806 	int r;
2807 
2808 	if (vcpu->arch.pio.in) {
2809 		r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port,
2810 		    vcpu->arch.pio.size, pd);
2811 	} else {
2812 		r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,
2813 		    vcpu->arch.pio.port, vcpu->arch.pio.size, pd);
2814 	}
2815 
2816 	return (r);
2817 }
2818 
2819 int
2820 kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, int size, unsigned port)
2821 {
2822 	unsigned long val;
2823 
2824 	DTRACE_PROBE4(kvm__pio, int, !in, unsigned, port, int, size,
2825 	    unsigned long, 1)
2826 
2827 	vcpu->run->exit_reason = KVM_EXIT_IO;
2828 	vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
2829 	vcpu->run->io.size = vcpu->arch.pio.size = size;
2830 	vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGESIZE;
2831 	vcpu->run->io.count = vcpu->arch.pio.count =
2832 	    vcpu->arch.pio.cur_count = 1;
2833 	vcpu->run->io.port = vcpu->arch.pio.port = port;
2834 	vcpu->arch.pio.in = in;
2835 	vcpu->arch.pio.string = 0;
2836 	vcpu->arch.pio.down = 0;
2837 	vcpu->arch.pio.rep = 0;
2838 
2839 	if (!vcpu->arch.pio.in) {
2840 		val = kvm_register_read(vcpu, VCPU_REGS_RAX);
2841 		memcpy(vcpu->arch.pio_data, &val, 4);
2842 	}
2843 
2844 	if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
2845 		complete_pio(vcpu);
2846 		return (1);
2847 	}
2848 
2849 	return (0);
2850 }
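
/*
 * Illustrative sketch (not compiled): the userland half of a
 * KVM_EXIT_IO exit raised above when no in-kernel device claims the
 * port.  run->io.data_offset locates the pio_data page within the
 * shared run structure mapping; the device-model callbacks are
 * hypothetical.
 */
#if 0
static void
handle_io_exit(struct kvm_run *run)
{
	uint8_t *data = (uint8_t *)run + run->io.data_offset;

	if (run->io.direction == KVM_EXIT_IO_IN)
		device_port_read(run->io.port, data, run->io.size);
	else
		device_port_write(run->io.port, data, run->io.size);
}
#endif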
2851 
2852 void
2853 kvm_timer_fire(void *arg)
2854 {
2855 	struct kvm_timer *timer = (struct kvm_timer *)arg;
2856 	struct kvm_vcpu *vcpu = timer->vcpu;
2857 
2858 	if (vcpu == NULL)
2859 		return;
2860 
2861 	mutex_enter(&vcpu->kvcpu_kick_lock);
2862 
2863 	if (timer->reinject || !timer->pending) {
2864 		atomic_add_32((volatile uint32_t *)&timer->pending, 1);
2865 		set_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
2866 	}
2867 
2868 	timer->intervals++;
2869 
2870 	cv_broadcast(&vcpu->kvcpu_kick_cv);
2871 	mutex_exit(&vcpu->kvcpu_kick_lock);
2872 }
2873 
2874 static void
2875 kvm_timer_init(void)
2876 {
2877 	int cpu;
2878 
2879 	/*
2880 	 * We assume a constant time stamp counter increment rate, which
2881 	 * is true for all CPUs that support hardware virtualization
2882 	 * extensions.
2883 	 */
2884 	cpu_tsc_khz = (cpu_freq_hz / 1000);
2885 }
2886 
2887 int
2888 kvm_arch_init(void *opaque)
2889 {
2890 	int r;
2891 	struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque;
2892 
2893 	if (ops->cpu_has_kvm_support()) {
2894 		cmn_err(CE_WARN, "kvm: no hardware support\n");
2895 		r = ENOTSUP;
2896 		goto out;
2897 	}
2898 	if (ops->disabled_by_bios()) {
2899 		cmn_err(CE_WARN, "kvm: disabled by bios\n");
2900 		r = ENOTSUP;
2901 		goto out;
2902 	}
2903 
2904 	r = kvm_mmu_module_init();
2905 	if (r)
2906 		goto out;
2907 
2908 	kvm_init_msr_list();
2909 
2910 	kvm_x86_ops = ops;
2911 	kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
2912 	kvm_mmu_set_base_ptes(PT_PRESENT_MASK);
2913 	kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
2914 	    PT_DIRTY_MASK, PT64_NX_MASK, 0);
2915 
2916 	kvm_timer_init();
2917 
2918 	return (0);
2919 
2920 out:
2921 	return (r);
2922 }
2923 
2924 int
2925 kvm_emulate_halt(struct kvm_vcpu *vcpu)
2926 {
2927 	KVM_VCPU_KSTAT_INC(vcpu, kvmvs_halt_exits);
2928 
2929 	if (irqchip_in_kernel(vcpu->kvm)) {
2930 		vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
2931 		return (1);
2932 	} else {
2933 		vcpu->run->exit_reason = KVM_EXIT_HLT;
2934 		return (0);
2935 	}
2936 }
2937 
2938 int
2939 kvm_hv_hypercall(struct kvm_vcpu *vcpu)
2940 {
2941 	uint64_t param, ingpa, outgpa, ret;
2942 	uint16_t code, rep_idx, rep_cnt, res = HV_STATUS_SUCCESS, rep_done = 0;
2943 	int fast, longmode;
2944 	int cs_db, cs_l;
2945 
2946 	/*
2947 	 * Per the Hyper-V spec, a hypercall generates #UD when issued
2948 	 * from non-zero CPL or from real mode.
2949 	 */
2950 	if (kvm_x86_ops->get_cpl(vcpu) != 0 || !is_protmode(vcpu)) {
2951 		kvm_queue_exception(vcpu, UD_VECTOR);
2952 		return (0);
2953 	}
2954 
2955 	kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
2956 	longmode = is_long_mode(vcpu) && cs_l == 1;
2957 
2958 	if (!longmode) {
2959 		param = ((uint64_t)kvm_register_read(vcpu,
2960 		    VCPU_REGS_RDX) << 32) | (kvm_register_read(vcpu,
2961 		    VCPU_REGS_RAX) & 0xffffffff);
2962 
2963 		ingpa = ((uint64_t)kvm_register_read(vcpu,
2964 		    VCPU_REGS_RBX) << 32) | (kvm_register_read(vcpu,
2965 		    VCPU_REGS_RCX) & 0xffffffff);
2966 
2967 		outgpa = ((uint64_t)kvm_register_read(vcpu,
2968 		    VCPU_REGS_RDI) << 32) | (kvm_register_read(vcpu,
2969 		    VCPU_REGS_RSI) & 0xffffffff);
2970 	} else {
2971 		param = kvm_register_read(vcpu, VCPU_REGS_RCX);
2972 		ingpa = kvm_register_read(vcpu, VCPU_REGS_RDX);
2973 		outgpa = kvm_register_read(vcpu, VCPU_REGS_R8);
2974 	}
2975 
2976 	code = param & 0xffff;
2977 	fast = (param >> 16) & 0x1;
2978 	rep_cnt = (param >> 32) & 0xfff;
2979 	rep_idx = (param >> 48) & 0xfff;
2980 
2981 	KVM_TRACE6(hv__hypercall, uintptr_t, code, uintptr_t, fast,
2982 	    uintptr_t, rep_cnt, uintptr_t, rep_idx, uintptr_t, ingpa,
2983 	    uintptr_t, outgpa);
2984 
2985 	switch (code) {
2986 	case HV_X64_HV_NOTIFY_LONG_SPIN_WAIT:
2987 #ifdef XXX
2988 		kvm_vcpu_on_spin(vcpu);
2989 #else
2990 		XXX_KVM_PROBE;
2991 #endif
2992 		break;
2993 	default:
2994 		res = HV_STATUS_INVALID_HYPERCALL_CODE;
2995 		break;
2996 	}
2997 
2998 	ret = res | (((uint64_t)rep_done & 0xfff) << 32);
2999 
3000 	if (longmode) {
3001 		kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
3002 	} else {
3003 		kvm_register_write(vcpu, VCPU_REGS_RDX, ret >> 32);
3004 		kvm_register_write(vcpu, VCPU_REGS_RAX, ret & 0xffffffff);
3005 	}
3006 
3007 	return (1);
3008 }
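
/*
 * Reader's aid (not compiled): the bit layout that kvm_hv_hypercall()
 * decodes from the input value and packs into the result.  The struct
 * and macro below are illustrative only, not types used by this file,
 * and bit-field layout is compiler-dependent.
 */
#if 0
typedef struct hv_hypercall_input {
	uint64_t code:16;	/* bits 15:0  - hypercall code */
	uint64_t fast:1;	/* bit  16    - register-based arguments */
	uint64_t rsvd1:15;	/* bits 31:17 */
	uint64_t rep_cnt:12;	/* bits 43:32 - rep count */
	uint64_t rsvd2:4;	/* bits 47:44 */
	uint64_t rep_idx:12;	/* bits 59:48 - rep start index */
	uint64_t rsvd3:4;	/* bits 63:60 */
} hv_hypercall_input_t;

/* result: status in the low 16 bits, reps completed in bits 43:32 */
#define	HV_HYPERCALL_RESULT(status, rep_done)	\
	((uint64_t)(status) | (((uint64_t)(rep_done) & 0xfff) << 32))
#endif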
3009 
3010 #define	KVM_HC_VAPIC_POLL_IRQ		1
3011 #define	KVM_HC_MMU_OP			2
3012 
3013 int
3014 kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
3015 {
3016 	unsigned long nr, a0, a1, a2, a3, ret;
3017 	int r = 1;
3018 
3019 	if (kvm_hv_hypercall_enabled(vcpu->kvm))
3020 		return (kvm_hv_hypercall(vcpu));
3021 
3022 	nr = kvm_register_read(vcpu, VCPU_REGS_RAX);
3023 	a0 = kvm_register_read(vcpu, VCPU_REGS_RBX);
3024 	a1 = kvm_register_read(vcpu, VCPU_REGS_RCX);
3025 	a2 = kvm_register_read(vcpu, VCPU_REGS_RDX);
3026 	a3 = kvm_register_read(vcpu, VCPU_REGS_RSI);
3027 
3028 	KVM_TRACE5(hypercall, uintptr_t, nr, uintptr_t, a0, uintptr_t, a1,
3029 	    uintptr_t, a2, uintptr_t, a3);
3030 
3031 	if (!is_long_mode(vcpu)) {
3032 		nr &= 0xFFFFFFFF;
3033 		a0 &= 0xFFFFFFFF;
3034 		a1 &= 0xFFFFFFFF;
3035 		a2 &= 0xFFFFFFFF;
3036 		a3 &= 0xFFFFFFFF;
3037 	}
3038 
3039 	if (kvm_x86_ops->get_cpl(vcpu) != 0) {
3040 		ret = -EPERM;
3041 		goto out;
3042 	}
3043 
3044 	switch (nr) {
3045 	case KVM_HC_VAPIC_POLL_IRQ:
3046 		ret = 0;
3047 		break;
3048 	case KVM_HC_MMU_OP:
3049 		ret = -ENOSYS;
3050 		break;
3051 	default:
3052 		ret = -ENOSYS;
3053 		break;
3054 	}
3055 out:
3056 	kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
3057 
3058 	KVM_VCPU_KSTAT_INC(vcpu, kvmvs_hypercalls);
3059 
3060 	return (r);
3061 }
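
/*
 * Illustrative sketch (not compiled): the guest side of the ABI that
 * kvm_emulate_hypercall() decodes -- hypercall number in RAX,
 * arguments in RBX, RCX, RDX and RSI, result returned in RAX.  The
 * "vmcall" spelling (vs. AMD's "vmmcall") and the GCC inline-assembly
 * syntax are assumptions, not taken from this file.
 */
#if 0
static inline unsigned long
kvm_guest_hypercall1(unsigned long nr, unsigned long a0)
{
	unsigned long ret;

	__asm__ __volatile__("vmcall"
	    : "=a" (ret)
	    : "a" (nr), "b" (a0)
	    : "memory");

	return (ret);
}
#endif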
3062 
3063 static int
3064 move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
3065 {
3066 	struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i];
3067 	int j, nent = vcpu->arch.cpuid_nent;
3068 
3069 	e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT;
3070 	/* when no next entry is found, the current entry[i] is reselected */
3071 	for (j = i + 1; ; j = (j + 1) % nent) {
3072 		struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j];
3073 		if (ej->function == e->function) {
3074 			ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
3075 			return (j);
3076 		}
3077 	}
3078 
3079 	return (0); /* silence gcc, even though control never reaches here */
3080 }
3081 
3082 /*
3083  * find an entry with matching function, matching index (if needed), and that
3084  * should be read next (if it's stateful)
3085  */
3086 static int
3087 is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e,
3088     uint32_t function, uint32_t index)
3089 {
3090 	if (e->function != function)
3091 		return (0);
3092 	if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index)
3093 		return (0);
3094 	if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) &&
3095 	    !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT))
3096 		return (0);
3097 	return (1);
3098 }
3099 
3100 struct kvm_cpuid_entry2 *
3101 kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, uint32_t function, uint32_t index)
3102 {
3103 	int i;
3104 	struct kvm_cpuid_entry2 *best = NULL;
3105 
3106 	for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
3107 		struct kvm_cpuid_entry2 *e;
3108 
3109 		e = &vcpu->arch.cpuid_entries[i];
3110 		if (is_matching_cpuid_entry(e, function, index)) {
3111 			if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC)
3112 				move_to_next_stateful_cpuid_entry(vcpu, i);
3113 			best = e;
3114 			break;
3115 		}
3116 		/*
3117 		 * Same class as the requested function (both basic or both extended)?
3118 		 */
3119 		if (((e->function ^ function) & 0x80000000) == 0)
3120 			if (!best || e->function > best->function)
3121 				best = e;
3122 	}
3123 
3124 	return (best);
3125 }
3126 
3127 int
3128 cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
3129 {
3130 	struct kvm_cpuid_entry2 *best;
3131 
3132 	if ((best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0)) != NULL)
3133 		return (best->eax & 0xff);
3134 
3135 	return (36);
3136 }
3137 
3138 void
3139 kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
3140 {
3141 	uint32_t function, index;
3142 	struct kvm_cpuid_entry2 *best;
3143 
3144 	function = kvm_register_read(vcpu, VCPU_REGS_RAX);
3145 	index = kvm_register_read(vcpu, VCPU_REGS_RCX);
3146