xref: /illumos-kvm-cmd/qemu-kvm.c (revision 36b02c7d)
1 /*
2  * qemu/kvm integration
3  *
4  * Copyright (C) 2006-2008 Qumranet Technologies
5  * Portions Copyright 2011 Joyent, Inc.
6  *
7  * Licensed under the terms of the GNU GPL version 2 or higher.
8  */
9 #include "config.h"
10 #include "config-host.h"
11 
12 #include <assert.h>
13 #include <string.h>
14 #include "hw/hw.h"
15 #include "sysemu.h"
16 #include "qemu-common.h"
17 #include "console.h"
18 #include "block.h"
19 #include "compatfd.h"
20 #include "gdbstub.h"
21 #include "monitor.h"
22 
23 #include "qemu-kvm.h"
24 #include "libkvm.h"
25 
26 #include <pthread.h>
27 #include <sys/utsname.h>
28 #include <sys/syscall.h>
29 #include <sys/mman.h>
30 #include <sys/ioctl.h>
31 #include "compatfd.h"
32 #ifdef __linux__
33 #include <sys/prctl.h>
34 #endif
35 
36 #define false 0
37 #define true 1
38 
39 #ifndef PR_MCE_KILL
40 #define PR_MCE_KILL 33
41 #endif
42 
43 #ifndef BUS_MCEERR_AR
44 #define BUS_MCEERR_AR 4
45 #endif
46 #ifndef BUS_MCEERR_AO
47 #define BUS_MCEERR_AO 5
48 #endif
49 
50 #define EXPECTED_KVM_API_VERSION 12
51 
52 #if EXPECTED_KVM_API_VERSION != KVM_API_VERSION
53 #error libkvm: userspace and kernel version mismatch
54 #endif
55 
56 int kvm_irqchip = 1;
57 int kvm_pit = 1;
58 int kvm_pit_reinject = 1;
59 int kvm_nested = 0;
60 
61 
62 KVMState *kvm_state;
63 kvm_context_t kvm_context;
64 
65 pthread_mutex_t qemu_mutex = PTHREAD_MUTEX_INITIALIZER;
66 pthread_cond_t qemu_vcpu_cond = PTHREAD_COND_INITIALIZER;
67 pthread_cond_t qemu_system_cond = PTHREAD_COND_INITIALIZER;
68 pthread_cond_t qemu_pause_cond = PTHREAD_COND_INITIALIZER;
69 pthread_cond_t qemu_work_cond = PTHREAD_COND_INITIALIZER;
70 __thread CPUState *current_env;
71 
72 static int qemu_system_ready;
73 
74 #define SIG_IPI (SIGRTMIN+4)
75 
76 pthread_t io_thread;
77 static int io_thread_sigfd = -1;
78 
79 static CPUState *kvm_debug_cpu_requested;
80 
81 #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
82 /* The list of ioperm_data */
83 static QLIST_HEAD(, ioperm_data) ioperm_head;
84 #endif
85 
86 #define ALIGN(x, y) (((x)+(y)-1) & ~((y)-1))
87 
88 int kvm_abi = EXPECTED_KVM_API_VERSION;
89 int kvm_page_size;
90 
91 #ifdef KVM_CAP_SET_GUEST_DEBUG
/*
 * Forward a KVM_EXIT_DEBUG exit to the architecture-specific handler.
 * If the handler claims the event, remember which vCPU hit it (consumed
 * later when handing control to the gdbstub) and park that vCPU.
 * Returns non-zero when the debug exit was consumed.
 */
static int kvm_debug(CPUState *env,
                     struct kvm_debug_exit_arch *arch_info)
{
    int handle = kvm_arch_debug(arch_info);

    if (handle) {
        kvm_debug_cpu_requested = env;
        env->stopped = 1;
    }
    return handle;
}
103 #endif
104 
/*
 * Last-resort handler for exit reasons nothing else recognizes: report
 * the raw reason code and fail the run loop with -EINVAL.
 */
static int handle_unhandled(uint64_t reason)
{
    fprintf(stderr, "kvm: unhandled exit %" PRIx64 "\n", reason);

    return -EINVAL;
}
110 
#define VMX_INVALID_GUEST_STATE 0x80000021

/*
 * Report a KVM_EXIT_FAIL_ENTRY exit.  For the Intel "invalid guest state"
 * code, print a hint about big real mode on CPUs without unrestricted
 * guest support.  Always returns -EINVAL so the vCPU loop bails out.
 *
 * Fixes: user-visible typos in the diagnostic text ("runnning",
 * "maybe running" -> "may be running").
 */
static int handle_failed_vmentry(uint64_t reason)
{
    fprintf(stderr, "kvm: vm entry failed with error 0x%" PRIx64 "\n\n", reason);

    /* Exit reason 0x21 has a different interpretation on AMD SVM; ideally
       we would verify this is an Intel machine before printing the hint. */
    if (reason == VMX_INVALID_GUEST_STATE) {
        fprintf(stderr, "If you're running a guest on an Intel machine without\n");
        fprintf(stderr, "unrestricted mode support, the failure can be most likely\n");
        fprintf(stderr, "due to the guest entering an invalid state for Intel VT.\n");
        fprintf(stderr, "For example, the guest may be running in big real mode\n");
        fprintf(stderr, "which is not supported on less recent Intel processors.\n\n");
    }

    return -EINVAL;
}
129 
/* Mark GSI `gsi` as in use in the context's allocation bitmap. */
static inline void set_gsi(kvm_context_t kvm, unsigned int gsi)
{
    uint32_t *map = kvm->used_gsi_bitmap;

    if (gsi >= kvm->max_gsi) {
        DPRINTF("Invalid GSI %u\n", gsi);
        return;
    }
    map[gsi / 32] |= 1U << (gsi % 32);
}
139 
/* Mark GSI `gsi` as free again in the context's allocation bitmap. */
static inline void clear_gsi(kvm_context_t kvm, unsigned int gsi)
{
    uint32_t *map = kvm->used_gsi_bitmap;

    if (gsi >= kvm->max_gsi) {
        DPRINTF("Invalid GSI %u\n", gsi);
        return;
    }
    map[gsi / 32] &= ~(1U << (gsi % 32));
}
149 
150 static int kvm_create_context(void);
151 
/*
 * Global KVM bring-up: open /dev/kvm, verify the kernel speaks exactly
 * the API version this binary was built against, allocate the global
 * kvm_state/kvm_context, size the GSI allocation bitmap, and register
 * the physical-memory client.  On success, returns the result of
 * kvm_create_context() with qemu_mutex held; returns -1 on failure.
 */
int kvm_init(void)
{
    int fd;
    int r, gsi_count;


    fd = open("/dev/kvm", O_RDWR);
    if (fd == -1) {
        perror("open /dev/kvm");
        return -1;
    }
    r = ioctl(fd, KVM_GET_API_VERSION, 0);
    if (r == -1) {
        fprintf(stderr,
                "kvm kernel version too old: "
                "KVM_GET_API_VERSION ioctl not supported\n");
        goto out_close;
    }
    if (r < EXPECTED_KVM_API_VERSION) {
        fprintf(stderr, "kvm kernel version too old: "
                "We expect API version %d or newer, but got "
                "version %d\n", EXPECTED_KVM_API_VERSION, r);
        goto out_close;
    }
    /* Only an exact version match is accepted (see the #error check above). */
    if (r > EXPECTED_KVM_API_VERSION) {
        fprintf(stderr, "kvm userspace version too old\n");
        goto out_close;
    }
    kvm_abi = r;
    kvm_page_size = getpagesize();
    kvm_state = qemu_mallocz(sizeof(*kvm_state));
    kvm_context = &kvm_state->kvm_context;

    kvm_state->fd = fd;
    kvm_state->vmfd = -1;
    kvm_context->opaque = cpu_single_env;
    kvm_context->dirty_pages_log_all = 0;
    kvm_context->no_irqchip_creation = 0;
    kvm_context->no_pit_creation = 0;

#ifdef KVM_CAP_SET_GUEST_DEBUG
    QTAILQ_INIT(&kvm_state->kvm_sw_breakpoints);
#endif

    gsi_count = kvm_get_gsi_count(kvm_context);
    if (gsi_count > 0) {
        int gsi_bits, i;

        /* Round up so we can search ints using ffs */
        gsi_bits = ALIGN(gsi_count, 32);
        kvm_context->used_gsi_bitmap = qemu_mallocz(gsi_bits / 8);
        kvm_context->max_gsi = gsi_bits;

        /* Mark any over-allocated bits as already in use */
        for (i = gsi_count; i < gsi_bits; i++) {
            set_gsi(kvm_context, i);
        }
    }

    kvm_cpu_register_phys_memory_client();

    /* NOTE: returns with qemu_mutex held; the caller continues under it. */
    pthread_mutex_lock(&qemu_mutex);
    return kvm_create_context();

  out_close:
    close(fd);
    return -1;
}
220 
/*
 * Tear down the global KVM state: close the /dev/kvm fd and free the
 * state structure.  vCPU and VM fds are not closed here (see FIXME).
 */
static void kvm_finalize(KVMState *s)
{
    /* FIXME
       if (kvm->vcpu_fd[0] != -1)
           close(kvm->vcpu_fd[0]);
       if (kvm->vm_fd != -1)
           close(kvm->vm_fd);
     */
    close(s->fd);
    free(s);
}
232 
/* Request that no in-kernel irqchip be created (e.g. -no-kvm-irqchip). */
void kvm_disable_irqchip_creation(kvm_context_t kvm)
{
    kvm->no_irqchip_creation = 1;
}
237 
/* Request that no in-kernel PIT be created (e.g. -no-kvm-pit). */
void kvm_disable_pit_creation(kvm_context_t kvm)
{
    kvm->no_pit_creation = 1;
}
242 
/* System-reset callback: reset one vCPU's architectural state.
 * `opaque` is the CPUState registered in kvm_create_vcpu(). */
static void kvm_reset_vcpu(void *opaque)
{
    CPUState *env = opaque;

    kvm_arch_cpu_reset(env);
}
249 
/*
 * Create and initialize one vCPU.  On Solaris, each vCPU works through a
 * private clone of the driver fd; elsewhere KVM_CREATE_VCPU returns a
 * dedicated vCPU fd.  The kernel-shared kvm_run area is then mmap'ed and
 * arch-specific init is performed.  Aborts on any failure, since a
 * half-created vCPU cannot be recovered.
 */
static void kvm_create_vcpu(CPUState *env, int id)
{
    long mmap_size;
    int r;
    KVMState *s = kvm_state;

#ifdef CONFIG_SOLARIS
    /* Clone the VM fd so this vCPU has its own handle for ioctls. */
    r = kvm_vm_clone(kvm_state);

    if (r < 0) {
        fprintf(stderr, "kvm_create_vcpu could not clone fd: %m\n");
        goto err;
    }

    env->kvm_fd = r;
    env->kvm_state = kvm_state;

    r = ioctl(env->kvm_fd, KVM_CREATE_VCPU, id);
#else
    r = kvm_vm_ioctl(kvm_state, KVM_CREATE_VCPU, id);
#endif

    if (r < 0) {
        fprintf(stderr, "kvm_create_vcpu: %m\n");
        fprintf(stderr, "Failed to create vCPU. Check the -smp parameter.\n");
        goto err;
    }

#ifndef CONFIG_SOLARIS
    /* On non-Solaris hosts the ioctl's return value is the vCPU fd. */
    env->kvm_fd = r;
    env->kvm_state = kvm_state;
#endif

    mmap_size = kvm_ioctl(kvm_state, KVM_GET_VCPU_MMAP_SIZE, 0);
    if (mmap_size < 0) {
        fprintf(stderr, "get vcpu mmap size: %m\n");
        goto err_fd;
    }
    /* Map the kernel-shared struct kvm_run for this vCPU. */
    env->kvm_run =
        mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, env->kvm_fd,
             0);
    if (env->kvm_run == MAP_FAILED) {
        fprintf(stderr, "mmap vcpu area: %m\n");
        goto err_fd;
    }

#ifdef KVM_CAP_COALESCED_MMIO
    /* The coalesced-MMIO ring lives `coalesced_mmio` pages past kvm_run;
       record it once, from whichever vCPU is created first. */
    if (s->coalesced_mmio && !s->coalesced_mmio_ring)
        s->coalesced_mmio_ring = (void *) env->kvm_run +
               s->coalesced_mmio * PAGE_SIZE;
#endif

    r = kvm_arch_init_vcpu(env);
    if (r == 0) {
        qemu_register_reset(kvm_reset_vcpu, env);
    }

    return;
  err_fd:
    close(env->kvm_fd);
  err:
    /* We're no good with semi-broken states. */
    abort();
}
314 
/*
 * Tell the kernel which vCPU id is the boot processor (BSP).
 * Returns the ioctl result, or -ENOSYS when the capability is absent.
 */
static int kvm_set_boot_vcpu_id(kvm_context_t kvm, uint32_t id)
{
#ifdef KVM_CAP_SET_BOOT_CPU_ID
    if (kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_SET_BOOT_CPU_ID) > 0) {
        return kvm_vm_ioctl(kvm_state, KVM_SET_BOOT_CPU_ID, id);
    }
#endif
    return -ENOSYS;
}
327 
/*
 * Create the VM container via KVM_CREATE_VM and allocate an empty IRQ
 * routing table.  On Solaris the driver multiplexes VM ioctls over the
 * already-open driver fd; elsewhere the fd returned by the ioctl is the
 * VM fd.  Returns 0 on success, -1 on failure.
 */
int kvm_create_vm(kvm_context_t kvm)
{
    int fd;
#ifdef KVM_CAP_IRQ_ROUTING
    kvm->irq_routes = qemu_mallocz(sizeof(*kvm->irq_routes));
    kvm->nr_allocated_irq_routes = 0;
#endif

    fd = kvm_ioctl(kvm_state, KVM_CREATE_VM, 0);
    if (fd < 0) {
        fprintf(stderr, "kvm_create_vm: %m\n");
        return -1;
    }
#ifdef CONFIG_SOLARIS
    kvm_state->vmfd = kvm_state->fd;
#else
    kvm_state->vmfd = fd;
#endif
    return 0;
}
348 
/*
 * Verify the kernel supports user-allocated guest memory
 * (KVM_CAP_USER_MEMORY).  Actual RAM registration happens elsewhere;
 * phys_mem_bytes and vm_mem are unused here.  Returns 0 when supported,
 * -1 otherwise.
 */
static int kvm_create_default_phys_mem(kvm_context_t kvm,
                                       unsigned long phys_mem_bytes,
                                       void **vm_mem)
{
#ifdef KVM_CAP_USER_MEMORY
    int r = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_USER_MEMORY);
    if (r > 0)
        return 0;
    fprintf(stderr,
            "Hypervisor too old: KVM_CAP_USER_MEMORY extension not supported\n");
#else
#error Hypervisor too old: KVM_CAP_USER_MEMORY extension not supported
#endif
    return -1;
}
364 
/*
 * Try to create an in-kernel irqchip (PIC/IOAPIC) unless explicitly
 * disabled.  Also selects the injection ioctl, preferring
 * KVM_IRQ_LINE_STATUS (which reports coalesced interrupts) when the
 * kernel supports it.  Mirrors the result into kvm_state.
 */
void kvm_create_irqchip(kvm_context_t kvm)
{
    int r;

    kvm->irqchip_in_kernel = 0;
#ifdef KVM_CAP_IRQCHIP
    if (!kvm->no_irqchip_creation) {
        r = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_IRQCHIP);
        if (r > 0) {            /* kernel irqchip supported */
            r = kvm_vm_ioctl(kvm_state, KVM_CREATE_IRQCHIP);
            if (r >= 0) {
                kvm->irqchip_inject_ioctl = KVM_IRQ_LINE;
#if defined(KVM_CAP_IRQ_INJECT_STATUS) && defined(KVM_IRQ_LINE_STATUS)
                r = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION,
                              KVM_CAP_IRQ_INJECT_STATUS);
                if (r > 0) {
                    kvm->irqchip_inject_ioctl = KVM_IRQ_LINE_STATUS;
                }
#endif
                kvm->irqchip_in_kernel = 1;
            } else
                fprintf(stderr, "Create kernel PIC irqchip failed\n");
        }
    }
#endif
    kvm_state->irqchip_in_kernel = kvm->irqchip_in_kernel;
}
392 
/*
 * Full VM construction sequence: create the VM container, run the
 * arch-specific setup, initialize the memory-slot table, confirm
 * user-memory support, and create the irqchip.  Returns 0 on success
 * or the first negative error encountered.
 */
int kvm_create(kvm_context_t kvm, unsigned long phys_mem_bytes, void **vm_mem)
{
    int r, i;

    r = kvm_create_vm(kvm);
    if (r < 0) {
        return r;
    }
    r = kvm_arch_create(kvm, phys_mem_bytes, vm_mem);
    if (r < 0) {
        return r;
    }
    /* Pre-assign each memory slot its kernel slot index. */
    for (i = 0; i < ARRAY_SIZE(kvm_state->slots); i++) {
        kvm_state->slots[i].slot = i;
    }

    r = kvm_create_default_phys_mem(kvm, phys_mem_bytes, vm_mem);
    if (r < 0) {
        return r;
    }

    kvm_create_irqchip(kvm);

    return 0;
}
418 
419 #ifdef KVM_CAP_IRQCHIP
420 
/*
 * Raise or lower an IRQ line on the in-kernel irqchip.  No-op (returns 0)
 * without an in-kernel irqchip.  If `status` is non-NULL it receives the
 * injection status when KVM_IRQ_LINE_STATUS is in use, or 1 otherwise.
 * Returns 1 when the request was issued.
 */
int kvm_set_irq_level(kvm_context_t kvm, int irq, int level, int *status)
{
    struct kvm_irq_level event;
    int r;

    if (!kvm->irqchip_in_kernel) {
        return 0;
    }
    event.level = level;
    event.irq = irq;
    r = kvm_vm_ioctl(kvm_state, kvm->irqchip_inject_ioctl, &event);
    if (r < 0) {
        perror("kvm_set_irq_level");
    }

    if (status) {
#ifdef KVM_CAP_IRQ_INJECT_STATUS
        /* Plain KVM_IRQ_LINE cannot report status; pretend success. */
        *status =
            (kvm->irqchip_inject_ioctl == KVM_IRQ_LINE) ? 1 : event.status;
#else
        *status = 1;
#endif
    }

    return 1;
}
447 
/*
 * Read the state of one in-kernel irqchip component into `chip`.
 * Returns 0 (no-op) without an in-kernel irqchip, otherwise the ioctl
 * result.
 *
 * Fixes: dropped the stray "\n" from the perror() argument — perror
 * appends ": <error message>\n" itself, so the newline split the output
 * across two lines.
 */
int kvm_get_irqchip(kvm_context_t kvm, struct kvm_irqchip *chip)
{
    int r;

    if (!kvm->irqchip_in_kernel) {
        return 0;
    }
    r = kvm_vm_ioctl(kvm_state, KVM_GET_IRQCHIP, chip);
    if (r < 0) {
        perror("kvm_get_irqchip");
    }
    return r;
}
461 
/*
 * Write the state in `chip` back to one in-kernel irqchip component.
 * Returns 0 (no-op) without an in-kernel irqchip, otherwise the ioctl
 * result.
 *
 * Fixes: dropped the stray "\n" from the perror() argument — perror
 * appends ": <error message>\n" itself.
 */
int kvm_set_irqchip(kvm_context_t kvm, struct kvm_irqchip *chip)
{
    int r;

    if (!kvm->irqchip_in_kernel) {
        return 0;
    }
    r = kvm_vm_ioctl(kvm_state, KVM_SET_IRQCHIP, chip);
    if (r < 0) {
        perror("kvm_set_irqchip");
    }
    return r;
}
475 
476 #endif
477 
/*
 * Dispatch a KVM_EXIT_DEBUG exit.  Without guest-debug support this is
 * a no-op that lets the run loop continue.
 */
static int handle_debug(CPUState *env)
{
#ifdef KVM_CAP_SET_GUEST_DEBUG
    struct kvm_run *run = env->kvm_run;

    return kvm_debug(env, &run->debug.arch);
#else
    return 0;
#endif
}
488 
/* Fetch the vCPU's general-purpose registers from the kernel. */
int kvm_get_regs(CPUState *env, struct kvm_regs *regs)
{
    return kvm_vcpu_ioctl(env, KVM_GET_REGS, regs);
}
493 
/* Load the vCPU's general-purpose registers into the kernel. */
int kvm_set_regs(CPUState *env, struct kvm_regs *regs)
{
    return kvm_vcpu_ioctl(env, KVM_SET_REGS, regs);
}
498 
499 #ifdef KVM_CAP_MP_STATE
/*
 * Read the vCPU's multiprocessing (runnable/halted/...) state.
 * Returns -ENOSYS when the kernel lacks KVM_CAP_MP_STATE.
 */
int kvm_get_mpstate(CPUState *env, struct kvm_mp_state *mp_state)
{
    if (kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_MP_STATE) > 0) {
        return kvm_vcpu_ioctl(env, KVM_GET_MP_STATE, mp_state);
    }
    return -ENOSYS;
}
510 
/*
 * Write the vCPU's multiprocessing state back to the kernel.
 * Returns -ENOSYS when the kernel lacks KVM_CAP_MP_STATE.
 */
int kvm_set_mpstate(CPUState *env, struct kvm_mp_state *mp_state)
{
    if (kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_MP_STATE) > 0) {
        return kvm_vcpu_ioctl(env, KVM_SET_MP_STATE, mp_state);
    }
    return -ENOSYS;
}
521 #endif
522 
/*
 * Handle a KVM_EXIT_MMIO exit by forwarding the access to QEMU's
 * physical-memory layer.  Always returns 0 (continue running).
 */
static int handle_mmio(CPUState *env)
{
    unsigned long addr = env->kvm_run->mmio.phys_addr;
    struct kvm_run *kvm_run = env->kvm_run;
    void *data = kvm_run->mmio.data;

    /* hack: Red Hat 7.1 generates these weird accesses. */
    if ((addr > 0xa0000 - 4 && addr <= 0xa0000) && kvm_run->mmio.len == 3) {
        return 0;
    }

    cpu_physical_memory_rw(addr, data, kvm_run->mmio.len, kvm_run->mmio.is_write);
    return 0;
}
537 
/* Called when KVM_RUN was interrupted for I/O; returning 1 tells the
 * run loop to exit so the I/O thread can make progress. */
int handle_io_window(kvm_context_t kvm)
{
    return 1;
}
542 
/*
 * Handle KVM_EXIT_SHUTDOWN (e.g. triple fault): park this vCPU and ask
 * the main loop for a system reset.  Returns 1 to leave the run loop.
 */
int handle_shutdown(kvm_context_t kvm, CPUState *env)
{
    /* stop the current vcpu from going back to guest mode */
    env->stopped = 1;

    qemu_system_reset_request();
    return 1;
}
551 
/* Deliver any pending user-injected NMI before entering the guest
 * (no-op when the kernel lacks KVM_CAP_USER_NMI). */
static inline void push_nmi(kvm_context_t kvm)
{
#ifdef KVM_CAP_USER_NMI
    kvm_arch_push_nmi(kvm->opaque);
#endif                          /* KVM_CAP_USER_NMI */
}
558 
/*
 * Re-enter QEMU context after KVM_RUN: re-acquire the global mutex
 * (dropped in pre_kvm_run), run arch post-exit work, and restore
 * cpu_single_env for this thread.  NOTE: deliberately leaves qemu_mutex
 * held — it pairs with the unlock in pre_kvm_run.
 */
void post_kvm_run(kvm_context_t kvm, CPUState *env)
{
    pthread_mutex_lock(&qemu_mutex);
    kvm_arch_post_run(env, env->kvm_run);
    cpu_single_env = env;
}
565 
/*
 * Prepare to enter the guest: run arch pre-entry work, then drop the
 * global mutex for the duration of KVM_RUN (re-taken in post_kvm_run).
 * Always returns 0.
 */
int pre_kvm_run(kvm_context_t kvm, CPUState *env)
{
    kvm_arch_pre_run(env, env->kvm_run);

    pthread_mutex_unlock(&qemu_mutex);
    return 0;
}
573 
/* Non-zero when the kernel says userspace may inject an interrupt now
 * (field is maintained by the kernel in the shared kvm_run area). */
int kvm_is_ready_for_interrupt_injection(CPUState *env)
{
    return env->kvm_run->ready_for_interrupt_injection;
}
578 
/*
 * Main vCPU execution loop: enter the guest via KVM_RUN and dispatch
 * the resulting exit reason, looping until a handler returns non-zero
 * or a fatal error occurs.  The global mutex is dropped while the guest
 * runs (pre_kvm_run/post_kvm_run).  Returns the first non-zero handler
 * result, or a negative errno from the KVM_RUN ioctl itself.
 */
int kvm_run(CPUState *env)
{
    int r;
    kvm_context_t kvm = &env->kvm_state->kvm_context;
    struct kvm_run *run = env->kvm_run;
    int fd = env->kvm_fd;

  again:
    /* Flush register state QEMU modified back into the kernel. */
    if (env->kvm_vcpu_dirty) {
        kvm_arch_load_regs(env, KVM_PUT_RUNTIME_STATE);
        env->kvm_vcpu_dirty = 0;
    }
    push_nmi(kvm);
#if !defined(__s390__)
    /* Without an in-kernel irqchip, userspace injects interrupts itself
       and must request an interrupt window to do so. */
    if (!kvm->irqchip_in_kernel) {
        run->request_interrupt_window = kvm_arch_try_push_interrupts(env);
    }
#endif

    r = pre_kvm_run(kvm, env);
    if (r) {
        return r;
    }
    /* An exit was requested while we held the lock: kick ourselves so
       KVM_RUN returns immediately via SIG_IPI. */
    if (env->exit_request) {
        env->exit_request = 0;
        pthread_kill(env->kvm_cpu_state.thread, SIG_IPI);
    }
    r = ioctl(fd, KVM_RUN, 0);

    if (r == -1 && errno != EINTR && errno != EAGAIN) {
        r = -errno;
        post_kvm_run(kvm, env);
        fprintf(stderr, "kvm_run: %s\n", strerror(-r));
        return r;
    }

    post_kvm_run(kvm, env);

    kvm_flush_coalesced_mmio_buffer();

#if !defined(__s390__)
    /* EINTR/EAGAIN: interrupted by a signal — service the I/O window. */
    if (r == -1) {
        r = handle_io_window(kvm);
        goto more;
    }
#endif
    if (1) {
        switch (run->exit_reason) {
        case KVM_EXIT_UNKNOWN:
            r = handle_unhandled(run->hw.hardware_exit_reason);
            break;
        case KVM_EXIT_FAIL_ENTRY:
            r = handle_failed_vmentry(run->fail_entry.hardware_entry_failure_reason);
            break;
        case KVM_EXIT_EXCEPTION:
            fprintf(stderr, "exception %d (%x)\n", run->ex.exception,
                    run->ex.error_code);
            kvm_show_regs(env);
            kvm_show_code(env);
            abort();
            break;
        case KVM_EXIT_IO:
            r = kvm_handle_io(run->io.port,
                                (uint8_t *)run + run->io.data_offset,
                                run->io.direction,
                                run->io.size,
                                run->io.count);
            r = 0;
            break;
        case KVM_EXIT_DEBUG:
            r = handle_debug(env);
            break;
        case KVM_EXIT_MMIO:
            r = handle_mmio(env);
            break;
        case KVM_EXIT_HLT:
            r = kvm_arch_halt(env);
            break;
        case KVM_EXIT_IRQ_WINDOW_OPEN:
#ifdef CONFIG_SOLARIS
        case KVM_EXIT_INTR:
#endif
            /* Nothing to do; loop and re-enter the guest. */
            break;
        case KVM_EXIT_SHUTDOWN:
            r = handle_shutdown(kvm, env);
            break;
#if defined(__s390__)
        case KVM_EXIT_S390_SIEIC:
            r = kvm_s390_handle_intercept(kvm, env, run);
            break;
        case KVM_EXIT_S390_RESET:
            r = kvm_s390_handle_reset(kvm, env, run);
            break;
#endif
        case KVM_EXIT_INTERNAL_ERROR:
            kvm_handle_internal_error(env, run);
            r = 1;
            break;
        default:
            /* Give the architecture code a chance before declaring the
               exit fatal. */
            if (kvm_arch_run(env)) {
                fprintf(stderr, "unhandled vm exit: 0x%x\n", run->exit_reason);
                kvm_show_regs(env);
                abort();
            }
            break;
        }
    }
more:
    if (!r) {
        goto again;
    }
    return r;
}
692 
/* Inject interrupt vector `irq` into the vCPU (userspace irqchip path). */
int kvm_inject_irq(CPUState *env, unsigned irq)
{
    struct kvm_interrupt intr;

    intr.irq = irq;
    return kvm_vcpu_ioctl(env, KVM_INTERRUPT, &intr);
}
700 
/* Inject an NMI into the vCPU; -ENOSYS when the kernel lacks
 * KVM_CAP_USER_NMI. */
int kvm_inject_nmi(CPUState *env)
{
#ifdef KVM_CAP_USER_NMI
    return kvm_vcpu_ioctl(env, KVM_NMI);
#else
    return -ENOSYS;
#endif
}
709 
/*
 * Probe for coalesced-MMIO support.  On success kvm_state->coalesced_mmio
 * holds the page offset of the ring within the vCPU mmap area (consumed
 * in kvm_create_vcpu).  Returns 0 when supported or unavailable-by-build,
 * otherwise the probe result.
 */
int kvm_init_coalesced_mmio(kvm_context_t kvm)
{
    int r = 0;
    kvm_state->coalesced_mmio = 0;
#ifdef KVM_CAP_COALESCED_MMIO
    r = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_COALESCED_MMIO);
    if (r > 0) {
        kvm_state->coalesced_mmio = r;
        return 0;
    }
#endif
    return r;
}
723 
724 #ifdef KVM_CAP_DEVICE_ASSIGNMENT
/* Assign a host PCI device to the guest (device passthrough). */
int kvm_assign_pci_device(kvm_context_t kvm,
                          struct kvm_assigned_pci_dev *assigned_dev)
{
    return kvm_vm_ioctl(kvm_state, KVM_ASSIGN_PCI_DEVICE, assigned_dev);
}
730 
/* Legacy IRQ-assignment ioctl, used when KVM_CAP_ASSIGN_DEV_IRQ is absent. */
static int kvm_old_assign_irq(kvm_context_t kvm,
                              struct kvm_assigned_irq *assigned_irq)
{
    return kvm_vm_ioctl(kvm_state, KVM_ASSIGN_IRQ, assigned_irq);
}
736 
737 #ifdef KVM_CAP_ASSIGN_DEV_IRQ
/*
 * Assign a host IRQ to a passed-through device, preferring the newer
 * KVM_ASSIGN_DEV_IRQ ioctl and falling back to the legacy one.
 */
int kvm_assign_irq(kvm_context_t kvm, struct kvm_assigned_irq *assigned_irq)
{
    if (kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_ASSIGN_DEV_IRQ) > 0) {
        return kvm_vm_ioctl(kvm_state, KVM_ASSIGN_DEV_IRQ, assigned_irq);
    }

    return kvm_old_assign_irq(kvm, assigned_irq);
}
749 
/* Remove an IRQ assignment from a passed-through device. */
int kvm_deassign_irq(kvm_context_t kvm, struct kvm_assigned_irq *assigned_irq)
{
    return kvm_vm_ioctl(kvm_state, KVM_DEASSIGN_DEV_IRQ, assigned_irq);
}
754 #else
/* Fallback when built without KVM_CAP_ASSIGN_DEV_IRQ: legacy ioctl only. */
int kvm_assign_irq(kvm_context_t kvm, struct kvm_assigned_irq *assigned_irq)
{
    return kvm_old_assign_irq(kvm, assigned_irq);
}
759 #endif
760 #endif
761 
762 #ifdef KVM_CAP_DEVICE_DEASSIGNMENT
/* Return a previously assigned PCI device to the host. */
int kvm_deassign_pci_device(kvm_context_t kvm,
                            struct kvm_assigned_pci_dev *assigned_dev)
{
    return kvm_vm_ioctl(kvm_state, KVM_DEASSIGN_PCI_DEVICE, assigned_dev);
}
768 #endif
769 
/*
 * Enable or disable PIT interrupt reinjection (lost-tick compensation).
 * Returns the ioctl result, or -ENOSYS when the kernel/build lacks
 * KVM_CAP_REINJECT_CONTROL.
 */
int kvm_reinject_control(kvm_context_t kvm, int pit_reinject)
{
#ifdef KVM_CAP_REINJECT_CONTROL
    int r;
    struct kvm_reinject_control control;

    control.pit_reinject = pit_reinject;

    r = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_REINJECT_CONTROL);
    if (r > 0) {
        return kvm_vm_ioctl(kvm_state, KVM_REINJECT_CONTROL, &control);
    }
#endif
    return -ENOSYS;
}
785 
/* Non-zero when the kernel supports programmable GSI routing. */
int kvm_has_gsi_routing(void)
{
    int r = 0;

#ifdef KVM_CAP_IRQ_ROUTING
    r = kvm_check_extension(kvm_state, KVM_CAP_IRQ_ROUTING);
#endif
    return r;
}
795 
/* Number of GSIs the kernel can route (the capability check returns the
 * count), or -EINVAL when routing support is not compiled in. */
int kvm_get_gsi_count(kvm_context_t kvm)
{
#ifdef KVM_CAP_IRQ_ROUTING
    return kvm_check_extension(kvm_state, KVM_CAP_IRQ_ROUTING);
#else
    return -EINVAL;
#endif
}
804 
/* Empty the pending routing table (takes effect at the next
 * kvm_commit_irq_routes()).  -EINVAL without routing support. */
int kvm_clear_gsi_routes(void)
{
#ifdef KVM_CAP_IRQ_ROUTING
    kvm_context_t kvm = kvm_context;

    kvm->irq_routes->nr = 0;
    return 0;
#else
    return -EINVAL;
#endif
}
816 
/*
 * Append a routing entry to the in-memory table, growing it (doubling,
 * minimum 64 entries) as needed, and mark the entry's GSI as used.
 * Changes take effect only after kvm_commit_irq_routes().  Returns 0,
 * -ENOMEM on allocation failure, or -ENOSYS without routing support.
 */
int kvm_add_routing_entry(struct kvm_irq_routing_entry *entry)
{
#ifdef KVM_CAP_IRQ_ROUTING
    kvm_context_t kvm = kvm_context;
    struct kvm_irq_routing *z;
    struct kvm_irq_routing_entry *new;
    int n, size;

    if (kvm->irq_routes->nr == kvm->nr_allocated_irq_routes) {
        n = kvm->nr_allocated_irq_routes * 2;
        if (n < 64) {
            n = 64;
        }
        size = sizeof(struct kvm_irq_routing);
        size += n * sizeof(*new);
        /* realloc via temporary so the old table survives failure */
        z = realloc(kvm->irq_routes, size);
        if (!z) {
            return -ENOMEM;
        }
        kvm->nr_allocated_irq_routes = n;
        kvm->irq_routes = z;
    }
    n = kvm->irq_routes->nr++;
    new = &kvm->irq_routes->entries[n];
    memset(new, 0, sizeof(*new));
    new->gsi = entry->gsi;
    new->type = entry->type;
    new->flags = entry->flags;
    new->u = entry->u;

    set_gsi(kvm, entry->gsi);

    return 0;
#else
    return -ENOSYS;
#endif
}
854 
/*
 * Convenience wrapper: add an irqchip-pin -> GSI routing entry.
 * Returns the result of kvm_add_routing_entry(), or -ENOSYS without
 * routing support.
 *
 * Fixes: zero-initialize the whole entry.  kvm_add_routing_entry()
 * copies the complete `u` union into the table handed to the kernel,
 * so the bytes beyond u.irqchip must not be left uninitialized stack
 * garbage.
 */
int kvm_add_irq_route(int gsi, int irqchip, int pin)
{
#ifdef KVM_CAP_IRQ_ROUTING
    struct kvm_irq_routing_entry e = { 0 };

    e.gsi = gsi;
    e.type = KVM_IRQ_ROUTING_IRQCHIP;
    e.flags = 0;
    e.u.irqchip.irqchip = irqchip;
    e.u.irqchip.pin = pin;
    return kvm_add_routing_entry(&e);
#else
    return -ENOSYS;
#endif
}
870 
/*
 * Remove the first routing entry matching `entry` (same GSI, type, and
 * type-specific payload).  The removed slot is back-filled with the last
 * table entry, and the GSI bit is cleared if no other route still uses
 * that GSI.  Returns 0 on success, -ESRCH if no match, -ENOSYS without
 * routing support.
 */
int kvm_del_routing_entry(struct kvm_irq_routing_entry *entry)
{
#ifdef KVM_CAP_IRQ_ROUTING
    kvm_context_t kvm = kvm_context;
    struct kvm_irq_routing_entry *e, *p;
    int i, gsi, found = 0;

    gsi = entry->gsi;

    for (i = 0; i < kvm->irq_routes->nr; ++i) {
        e = &kvm->irq_routes->entries[i];
        if (e->type == entry->type && e->gsi == gsi) {
            switch (e->type) {
            case KVM_IRQ_ROUTING_IRQCHIP:{
                    if (e->u.irqchip.irqchip ==
                        entry->u.irqchip.irqchip
                        && e->u.irqchip.pin == entry->u.irqchip.pin) {
                        /* swap-remove: move the last entry into this slot */
                        p = &kvm->irq_routes->entries[--kvm->irq_routes->nr];
                        *e = *p;
                        found = 1;
                    }
                    break;
                }
            case KVM_IRQ_ROUTING_MSI:{
                    if (e->u.msi.address_lo ==
                        entry->u.msi.address_lo
                        && e->u.msi.address_hi ==
                        entry->u.msi.address_hi
                        && e->u.msi.data == entry->u.msi.data) {
                        p = &kvm->irq_routes->entries[--kvm->irq_routes->nr];
                        *e = *p;
                        found = 1;
                    }
                    break;
                }
            default:
                break;
            }
            if (found) {
                /* If there are no other users of this GSI
                 * mark it available in the bitmap */
                /* (reusing `i` is fine here: we return below) */
                for (i = 0; i < kvm->irq_routes->nr; i++) {
                    e = &kvm->irq_routes->entries[i];
                    if (e->gsi == gsi)
                        break;
                }
                if (i == kvm->irq_routes->nr) {
                    clear_gsi(kvm, gsi);
                }

                return 0;
            }
        }
    }
    return -ESRCH;
#else
    return -ENOSYS;
#endif
}
930 
/*
 * Replace the payload of the routing entry matching `entry` with
 * `newentry`'s payload.  Both must share the same GSI and type (the
 * match key), otherwise -EINVAL.  Returns 0 on success, -ESRCH if no
 * match, -ENOSYS without routing support.
 */
int kvm_update_routing_entry(struct kvm_irq_routing_entry *entry,
                             struct kvm_irq_routing_entry *newentry)
{
#ifdef KVM_CAP_IRQ_ROUTING
    kvm_context_t kvm = kvm_context;
    struct kvm_irq_routing_entry *e;
    int i;

    if (entry->gsi != newentry->gsi || entry->type != newentry->type) {
        return -EINVAL;
    }

    for (i = 0; i < kvm->irq_routes->nr; ++i) {
        e = &kvm->irq_routes->entries[i];
        if (e->type != entry->type || e->gsi != entry->gsi) {
            continue;
        }
        switch (e->type) {
        case KVM_IRQ_ROUTING_IRQCHIP:
            if (e->u.irqchip.irqchip == entry->u.irqchip.irqchip &&
                e->u.irqchip.pin == entry->u.irqchip.pin) {
                memcpy(&e->u.irqchip, &newentry->u.irqchip,
                       sizeof e->u.irqchip);
                return 0;
            }
            break;
        case KVM_IRQ_ROUTING_MSI:
            if (e->u.msi.address_lo == entry->u.msi.address_lo &&
                e->u.msi.address_hi == entry->u.msi.address_hi &&
                e->u.msi.data == entry->u.msi.data) {
                memcpy(&e->u.msi, &newentry->u.msi, sizeof e->u.msi);
                return 0;
            }
            break;
        default:
            break;
        }
    }
    return -ESRCH;
#else
    return -ENOSYS;
#endif
}
974 
/*
 * Convenience wrapper: delete an irqchip-pin -> GSI routing entry.
 * Returns the result of kvm_del_routing_entry(), or -ENOSYS without
 * routing support.
 *
 * Fixes: zero-initialize the entry so no uninitialized stack bytes sit
 * in the union (mirrors the kvm_add_irq_route fix; the irqchip match in
 * kvm_del_routing_entry only inspects the fields set here, so behavior
 * is unchanged).
 */
int kvm_del_irq_route(int gsi, int irqchip, int pin)
{
#ifdef KVM_CAP_IRQ_ROUTING
    struct kvm_irq_routing_entry e = { 0 };

    e.gsi = gsi;
    e.type = KVM_IRQ_ROUTING_IRQCHIP;
    e.flags = 0;
    e.u.irqchip.irqchip = irqchip;
    e.u.irqchip.pin = pin;
    return kvm_del_routing_entry(&e);
#else
    return -ENOSYS;
#endif
}
990 
/* Push the accumulated routing table to the kernel in one
 * KVM_SET_GSI_ROUTING call.  -ENOSYS without routing support. */
int kvm_commit_irq_routes(void)
{
#ifdef KVM_CAP_IRQ_ROUTING
    kvm_context_t kvm = kvm_context;

    kvm->irq_routes->flags = 0;
    return kvm_vm_ioctl(kvm_state, KVM_SET_GSI_ROUTING, kvm->irq_routes);
#else
    return -ENOSYS;
#endif
}
1002 
/*
 * Allocate a GSI: scan the used-GSI bitmap one 32-bit word at a time and
 * return the lowest free GSI number, or -ENOSPC when all are taken.
 * (Bits above the real GSI count were pre-marked used in kvm_init.)
 */
int kvm_get_irq_route_gsi(void)
{
    kvm_context_t kvm = kvm_context;
    int i, bit;
    uint32_t *buf = kvm->used_gsi_bitmap;

    /* Return the lowest unused GSI in the bitmap */
    for (i = 0; i < kvm->max_gsi / 32; i++) {
        /* ffs on the complement finds the lowest zero (free) bit */
        bit = ffs(~buf[i]);
        if (!bit) {
            continue;
        }

        return bit - 1 + i * 32;
    }

    return -ENOSPC;
}
1021 
/* Fill `e` as an MSI routing entry for `gsi` with the given MSI
 * address/data payload (helper for the kvm_*_msix functions below). */
static void kvm_msix_routing_entry(struct kvm_irq_routing_entry *e,
                                   uint32_t gsi, uint32_t addr_lo,
                                   uint32_t addr_hi, uint32_t data)

{
    e->gsi = gsi;
    e->type = KVM_IRQ_ROUTING_MSI;
    e->flags = 0;
    e->u.msi.address_lo = addr_lo;
    e->u.msi.address_hi = addr_hi;
    e->u.msi.data = data;
}
1034 
/*
 * Install an MSI-X routing entry for @gsi.  Returns the result of
 * kvm_add_routing_entry().
 */
int kvm_add_msix(uint32_t gsi, uint32_t addr_lo,
                        uint32_t addr_hi, uint32_t data)
{
    struct kvm_irq_routing_entry e;

    kvm_msix_routing_entry(&e, gsi, addr_lo, addr_hi, data);
    return kvm_add_routing_entry(&e);
}
1043 
/*
 * Remove the MSI-X routing entry matching @gsi and the given MSI
 * address/data.  Returns the result of kvm_del_routing_entry().
 */
int kvm_del_msix(uint32_t gsi, uint32_t addr_lo,
                        uint32_t addr_hi, uint32_t data)
{
    struct kvm_irq_routing_entry e;

    kvm_msix_routing_entry(&e, gsi, addr_lo, addr_hi, data);
    return kvm_del_routing_entry(&e);
}
1052 
/*
 * Replace an existing MSI-X route (old_*) with a new one (new_*) in a
 * single update, rather than a del followed by an add.  Returns the
 * result of kvm_update_routing_entry().
 */
int kvm_update_msix(uint32_t old_gsi, uint32_t old_addr_lo,
                    uint32_t old_addr_hi, uint32_t old_data,
                    uint32_t new_gsi, uint32_t new_addr_lo,
                    uint32_t new_addr_hi, uint32_t new_data)
{
    struct kvm_irq_routing_entry e1, e2;

    kvm_msix_routing_entry(&e1, old_gsi, old_addr_lo, old_addr_hi, old_data);
    kvm_msix_routing_entry(&e2, new_gsi, new_addr_lo, new_addr_hi, new_data);
    return kvm_update_routing_entry(&e1, &e2);
}
1064 
1065 
1066 #ifdef KVM_CAP_DEVICE_MSIX
/*
 * Tell the kernel how many MSI-X entries an assigned device uses.
 * The kvm parameter is unused (the global kvm_state is used instead)
 * but kept for API symmetry with the other assignment helpers.
 */
int kvm_assign_set_msix_nr(kvm_context_t kvm,
                           struct kvm_assigned_msix_nr *msix_nr)
{
    return kvm_vm_ioctl(kvm_state, KVM_ASSIGN_SET_MSIX_NR, msix_nr);
}
1072 
/*
 * Program one MSI-X table entry of an assigned device in the kernel.
 * The kvm parameter is unused; see kvm_assign_set_msix_nr().
 */
int kvm_assign_set_msix_entry(kvm_context_t kvm,
                              struct kvm_assigned_msix_entry *entry)
{
    return kvm_vm_ioctl(kvm_state, KVM_ASSIGN_SET_MSIX_ENTRY, entry);
}
1078 #endif
1079 
1080 #if defined(KVM_CAP_IRQFD) && defined(CONFIG_EVENTFD)
1081 
1082 #include <sys/eventfd.h>
1083 
/*
 * Thin wrapper around the KVM_IRQFD ioctl: associate eventfd @fd with
 * @gsi so that writes to the eventfd inject that interrupt.
 * Returns the ioctl result (0 on success, negative on error).
 */
static int _kvm_irqfd(kvm_context_t kvm, int fd, int gsi, int flags)
{
    struct kvm_irqfd data = {
        .fd = fd,
        .gsi = gsi,
        .flags = flags,
    };

    return kvm_vm_ioctl(kvm_state, KVM_IRQFD, &data);
}
1094 
/*
 * Create an eventfd and bind it to @gsi via KVM_IRQFD, so writing to
 * the returned fd injects that interrupt.  Returns the eventfd on
 * success, or a negative errno value on failure (-ENOENT when the
 * kernel lacks KVM_CAP_IRQFD).  The caller owns and must close the fd.
 */
int kvm_irqfd(kvm_context_t kvm, int gsi, int flags)
{
    int r;
    int fd;

    if (!kvm_check_extension(kvm_state, KVM_CAP_IRQFD))
        return -ENOENT;

    fd = eventfd(0, 0);
    if (fd < 0) {
        return -errno;
    }

    /* NOTE(review): the 'flags' parameter is accepted but not forwarded
     * to the ioctl (0 is passed, matching historical behavior) —
     * confirm whether any caller relies on passing flags here. */
    r = _kvm_irqfd(kvm, fd, gsi, 0);
    if (r < 0) {
        /* Save errno before close(), which may clobber it. */
        int err = errno;

        close(fd);
        return -err;
    }

    return fd;
}
1116 
1117 #else                           /* KVM_CAP_IRQFD */
1118 
/* Stub used when KVM_CAP_IRQFD or eventfd support is not compiled in. */
int kvm_irqfd(kvm_context_t kvm, int gsi, int flags)
{
    return -ENOSYS;
}
1123 
1124 #endif                          /* KVM_CAP_IRQFD */
/*
 * Return an identifier for the calling thread.  Uses pthread_self()
 * converted to unsigned long; on this platform thread ids are
 * integral, but the value is only meaningful for display/comparison.
 */
unsigned long kvm_get_thread_id(void)
{
    return pthread_self();
}
1129 
/*
 * Wait on @cond with qemu_mutex held, preserving cpu_single_env across
 * the wait: another thread may run with the mutex and change it, so it
 * is restored to the caller's value before returning.
 */
static void qemu_cond_wait(pthread_cond_t *cond)
{
    CPUState *env = cpu_single_env;

    pthread_cond_wait(cond, &qemu_mutex);
    cpu_single_env = env;
}
1137 
/*
 * Handler for SIG_IPI.  Deliberately empty: the signal's only purpose
 * is to kick a vcpu thread out of KVM_RUN / sigtimedwait.
 */
static void sig_ipi_handler(int n)
{
}
1141 
sigbus_reraise(void)1142 static void sigbus_reraise(void)
1143 {
1144     sigset_t set;
1145     struct sigaction action;
1146 
1147     memset(&action, 0, sizeof(action));
1148     action.sa_handler = SIG_DFL;
1149     if (!sigaction(SIGBUS, &action, NULL)) {
1150         raise(SIGBUS);
1151         sigemptyset(&set);
1152         sigaddset(&set, SIGBUS);
1153         sigprocmask(SIG_UNBLOCK, &set, NULL);
1154     }
1155     perror("Failed to re-raise SIGBUS!\n");
1156     abort();
1157 }
1158 
sigbus_handler(int n,struct qemu_signalfd_siginfo * siginfo,void * ctx)1159 static void sigbus_handler(int n, struct qemu_signalfd_siginfo *siginfo,
1160                            void *ctx)
1161 {
1162     if (kvm_on_sigbus(siginfo->ssi_code, (void *)(intptr_t)siginfo->ssi_addr))
1163         sigbus_reraise();
1164 }
1165 
/*
 * Run func(data) on the vcpu thread owning @env and wait for completion.
 * If the caller already is that vcpu thread, the function runs inline.
 * Otherwise a work item (allocated on this thread's stack — safe
 * because we block until 'done') is appended to the target vcpu's
 * queue, the vcpu is kicked with SIG_IPI, and we sleep on
 * qemu_work_cond until flush_queued_work() marks the item done.
 * Caller must hold qemu_mutex, which protects the work queue.
 */
void on_vcpu(CPUState *env, void (*func)(void *data), void *data)
{
    struct qemu_work_item wi;

    if (env == current_env) {
        func(data);
        return;
    }

    wi.func = func;
    wi.data = data;
    if (!env->kvm_cpu_state.queued_work_first) {
        env->kvm_cpu_state.queued_work_first = &wi;
    } else {
        env->kvm_cpu_state.queued_work_last->next = &wi;
    }
    env->kvm_cpu_state.queued_work_last = &wi;
    wi.next = NULL;
    wi.done = false;

    /* Kick the target vcpu out of KVM_RUN so it processes the queue. */
    pthread_kill(env->kvm_cpu_state.thread, SIG_IPI);
    while (!wi.done) {
        qemu_cond_wait(&qemu_work_cond);
    }
}
1191 
do_kvm_cpu_synchronize_state(void * _env)1192 static void do_kvm_cpu_synchronize_state(void *_env)
1193 {
1194     CPUState *env = _env;
1195 
1196     if (!env->kvm_vcpu_dirty) {
1197         kvm_arch_save_regs(env);
1198         env->kvm_vcpu_dirty = 1;
1199     }
1200 }
1201 
/*
 * Ensure @env's register cache reflects the in-kernel state, fetching
 * it on the owning vcpu thread if it has not been fetched yet.
 */
void kvm_cpu_synchronize_state(CPUState *env)
{
    if (!env->kvm_vcpu_dirty) {
        on_vcpu(env, do_kvm_cpu_synchronize_state, env);
    }
}
1208 
/*
 * Push QEMU's register state into the kernel after a system reset and
 * mark the cache clean.
 */
void kvm_cpu_synchronize_post_reset(CPUState *env)
{
    kvm_arch_load_regs(env, KVM_PUT_RESET_STATE);
    env->kvm_vcpu_dirty = 0;
}
1214 
/*
 * Push the full register state into the kernel after machine init and
 * mark the cache clean.
 */
void kvm_cpu_synchronize_post_init(CPUState *env)
{
    kvm_arch_load_regs(env, KVM_PUT_FULL_STATE);
    env->kvm_vcpu_dirty = 0;
}
1220 
/*
 * on_vcpu() callback: raise the interrupt mask (smuggled through the
 * data pointer as a long) on the vcpu it runs on.
 */
static void inject_interrupt(void *data)
{
    cpu_interrupt(current_env, (long) data);
}
1225 
/*
 * Inject interrupt @mask into @env on its owning vcpu thread.
 */
void kvm_inject_interrupt(CPUState *env, int mask)
{
    on_vcpu(env, inject_interrupt, (void *) (long) mask);
}
1230 
/*
 * Kick @env's vcpu thread with SIG_IPI if a pending interrupt request
 * needs servicing, either because no vcpu is current yet (early boot)
 * or because the request targets a different, not-yet-signalled vcpu.
 */
void kvm_update_interrupt_request(CPUState *env)
{
    int need_signal = 0;

    if (!env) {
        return;
    }

    if (!current_env || !current_env->created) {
        need_signal = 1;
    }
    /*
     * Testing for created here is really redundant
     */
    if (current_env && current_env->created &&
        env != current_env && !env->kvm_cpu_state.signalled) {
        need_signal = 1;
    }

    if (!need_signal) {
        return;
    }

    env->kvm_cpu_state.signalled = 1;
    if (env->kvm_cpu_state.thread) {
        pthread_kill(env->kvm_cpu_state.thread, SIG_IPI);
    }
}
1255 
/*
 * Enter the guest on this vcpu via kvm_run().  On failure, report the
 * error and stop the VM rather than looping on a broken vcpu.
 * Always returns 0 (errors are handled by stopping the machine).
 */
int kvm_cpu_exec(CPUState *env)
{
    int r;

    r = kvm_run(env);
    if (r < 0) {
        /* Diagnostics belong on stderr, not stdout. */
        fprintf(stderr, "kvm_run returned %d\n", r);
        vm_stop(0);
    }

    return 0;
}
1268 
/*
 * A vcpu is considered stopped when the whole VM is not running or
 * this particular vcpu has been individually paused.
 */
int kvm_cpu_is_stopped(CPUState *env)
{
    return !vm_running || env->stopped;
}
1273 
/*
 * Run all work items queued on this vcpu by on_vcpu().  Called on the
 * vcpu thread with qemu_mutex held.  Each item's 'done' flag is set
 * after its func runs (the items live on the submitters' stacks), and
 * a single broadcast at the end wakes every waiter on qemu_work_cond.
 */
static void flush_queued_work(CPUState *env)
{
    struct qemu_work_item *wi;

    if (!env->kvm_cpu_state.queued_work_first) {
        return;
    }

    /* Unlink each item before running it, in submission order. */
    while ((wi = env->kvm_cpu_state.queued_work_first)) {
        env->kvm_cpu_state.queued_work_first = wi->next;
        wi->func(wi->data);
        wi->done = true;
    }
    env->kvm_cpu_state.queued_work_last = NULL;
    pthread_cond_broadcast(&qemu_work_cond);
}
1290 
/*
 * Per-vcpu idle/housekeeping wait.  Drops qemu_mutex and waits up to
 * @timeout ms for SIG_IPI (kick) or SIGBUS (machine check), handling
 * any that arrive, and loops until neither is still pending.  Then,
 * with the mutex re-held, runs queued work, completes a pending pause
 * request, and clears the vcpu's 'signalled' flag.
 */
static void kvm_main_loop_wait(CPUState *env, int timeout)
{
    struct timespec ts;
    int r, e;
    siginfo_t siginfo;
    sigset_t waitset;
    sigset_t chkset;

    ts.tv_sec = timeout / 1000;
    ts.tv_nsec = (timeout % 1000) * 1000000;
    sigemptyset(&waitset);
    sigaddset(&waitset, SIG_IPI);
    sigaddset(&waitset, SIGBUS);

    do {
        /* Drop the global lock while blocked in sigtimedwait(). */
        pthread_mutex_unlock(&qemu_mutex);

        r = sigtimedwait(&waitset, &siginfo, &ts);
        e = errno;

        pthread_mutex_lock(&qemu_mutex);

        if (r == -1 && !(e == EAGAIN || e == EINTR)) {
            printf("sigtimedwait: %s\n", strerror(e));
            exit(1);
        }

        switch (r) {
        case SIGBUS:
            if (kvm_on_sigbus_vcpu(env, siginfo.si_code, siginfo.si_addr))
                sigbus_reraise();
            break;
        default:
            break;
        }

        r = sigpending(&chkset);
        if (r == -1) {
            /*
             * Bug fix: report sigpending()'s own errno, not the stale
             * 'e' saved from the earlier sigtimedwait() call.
             */
            printf("sigpending: %s\n", strerror(errno));
            exit(1);
        }
    } while (sigismember(&chkset, SIG_IPI) || sigismember(&chkset, SIGBUS));

    cpu_single_env = env;
    flush_queued_work(env);

    if (env->stop) {
        env->stop = 0;
        env->stopped = 1;
        pthread_cond_signal(&qemu_pause_cond);
    }

    env->kvm_cpu_state.signalled = 0;
}
1345 
all_threads_paused(void)1346 static int all_threads_paused(void)
1347 {
1348     CPUState *penv = first_cpu;
1349 
1350     while (penv) {
1351         if (penv->stop) {
1352             return 0;
1353         }
1354         penv = (CPUState *) penv->next_cpu;
1355     }
1356 
1357     return 1;
1358 }
1359 
pause_all_threads(void)1360 static void pause_all_threads(void)
1361 {
1362     CPUState *penv = first_cpu;
1363 
1364     while (penv) {
1365         if (penv != cpu_single_env) {
1366             penv->stop = 1;
1367             pthread_kill(penv->kvm_cpu_state.thread, SIG_IPI);
1368         } else {
1369             penv->stop = 0;
1370             penv->stopped = 1;
1371             cpu_exit(penv);
1372         }
1373         penv = (CPUState *) penv->next_cpu;
1374     }
1375 
1376     while (!all_threads_paused()) {
1377         qemu_cond_wait(&qemu_pause_cond);
1378     }
1379 }
1380 
resume_all_threads(void)1381 static void resume_all_threads(void)
1382 {
1383     CPUState *penv = first_cpu;
1384 
1385     assert(!cpu_single_env);
1386 
1387     while (penv) {
1388         penv->stop = 0;
1389         penv->stopped = 0;
1390         pthread_kill(penv->kvm_cpu_state.thread, SIG_IPI);
1391         penv = (CPUState *) penv->next_cpu;
1392     }
1393 }
1394 
/*
 * VM state change callback: pause all vcpus when the machine stops,
 * resume them when it starts running again.  context/reason unused.
 */
static void kvm_vm_state_change_handler(void *context, int running, int reason)
{
    if (!running) {
        pause_all_threads();
        return;
    }
    resume_all_threads();
}
1403 
/*
 * Configure which signals may interrupt KVM_RUN for this vcpu thread.
 * SIGUSR2/SIGIO/SIGALRM are blocked at the thread level (they belong
 * to the io-thread).  The mask handed to the kernel is the thread's
 * current blocked set minus SIG_IPI and SIGBUS, so those two can kick
 * the vcpu out of guest mode while staying blocked in user mode
 * (they are consumed via sigtimedwait in kvm_main_loop_wait).
 */
static void setup_kernel_sigmask(CPUState *env)
{
    sigset_t set;

    sigemptyset(&set);
    sigaddset(&set, SIGUSR2);
    sigaddset(&set, SIGIO);
    sigaddset(&set, SIGALRM);
    sigprocmask(SIG_BLOCK, &set, NULL);

    /* Fetch the resulting full blocked set, then carve out the kickers. */
    sigprocmask(SIG_BLOCK, NULL, &set);
    sigdelset(&set, SIG_IPI);
    sigdelset(&set, SIGBUS);

    kvm_set_signal_mask(env, &set);
}
1420 
/*
 * Perform a full system reset with all vcpus quiesced: pause them,
 * reset the machine, then let them run again.
 */
static void qemu_kvm_system_reset(void)
{
    pause_all_threads();

    qemu_system_reset();

    resume_all_threads();
}
1429 
/*
 * For the userspace-irqchip case: process pending irqchip events and
 * un-halt the vcpu if work (e.g. a pending interrupt) is now available.
 */
static void process_irqchip_events(CPUState *env)
{
    kvm_arch_process_irqchip_events(env);
    if (kvm_arch_has_work(env))
        env->halted = 0;
}
1436 
/*
 * Main loop of a vcpu thread: run the guest when runnable, otherwise
 * sleep up to 1s in kvm_main_loop_wait() servicing signals and queued
 * work.  With a userspace irqchip, irqchip events are processed here
 * and a halted vcpu is treated as not runnable.
 * Note: the loop never exits; the unlock/return below are unreachable
 * and kept for form.
 */
static int kvm_main_loop_cpu(CPUState *env)
{
    while (1) {
        int run_cpu = !kvm_cpu_is_stopped(env);
        if (run_cpu && !kvm_irqchip_in_kernel()) {
            process_irqchip_events(env);
            run_cpu = !env->halted;
        }
        if (run_cpu) {
            kvm_cpu_exec(env);
            /* Poll (timeout 0) between guest entries */
            kvm_main_loop_wait(env, 0);
        } else {
            kvm_main_loop_wait(env, 1000);
        }
    }
    pthread_mutex_unlock(&qemu_mutex);
    return 0;
}
1455 
/*
 * Entry point of each vcpu thread (started by kvm_init_vcpu).
 * Startup handshake, in order: block all signals (per-thread masks are
 * set up later in setup_kernel_sigmask), apply any assigned-device
 * ioperm data, take qemu_mutex, create the kernel vcpu, signal
 * creation to the spawner via qemu_vcpu_cond, wait for machine init
 * on qemu_system_cond, then enter the per-vcpu main loop forever.
 */
static void *ap_main_loop(void *_env)
{
    CPUState *env = _env;
    sigset_t signals;
#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
    struct ioperm_data *data = NULL;
#endif

    current_env = env;
    env->thread_id = kvm_get_thread_id();
    /* Block everything; the real per-thread mask is set below. */
    sigfillset(&signals);
    sigprocmask(SIG_BLOCK, &signals, NULL);

#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
    /* do ioperm for io ports of assigned devices */
    QLIST_FOREACH(data, &ioperm_head, entries)
        on_vcpu(env, kvm_arch_do_ioperm, data);
#endif

    pthread_mutex_lock(&qemu_mutex);
    cpu_single_env = env;

    kvm_create_vcpu(env, env->cpu_index);
    setup_kernel_sigmask(env);

    /* signal VCPU creation */
    current_env->created = 1;
    pthread_cond_signal(&qemu_vcpu_cond);

    /* and wait for machine initialization */
    while (!qemu_system_ready) {
        qemu_cond_wait(&qemu_system_cond);
    }

    /* re-initialize cpu_single_env after re-acquiring qemu_mutex */
    cpu_single_env = env;

    kvm_main_loop_cpu(env);
    return NULL;
}
1496 
/*
 * Spawn the vcpu thread for @env and wait until it has created its
 * kernel vcpu (signalled via qemu_vcpu_cond by ap_main_loop).
 * Returns 0 on success, negative errno on failure.
 *
 * Bug fix: the pthread_create() result was previously ignored; on
 * failure the created-wait loop below would hang forever.
 */
int kvm_init_vcpu(CPUState *env)
{
    int r;

    r = pthread_create(&env->kvm_cpu_state.thread, NULL, ap_main_loop, env);
    if (r != 0) {
        /* pthread_create returns a positive errno value, not -1. */
        fprintf(stderr, "kvm: vcpu thread creation failed: %s\n",
                strerror(r));
        return -r;
    }

    while (env->created == 0) {
        qemu_cond_wait(&qemu_vcpu_cond);
    }

    return 0;
}
1507 
/* Return non-zero once the vcpu thread has created its kernel vcpu. */
int kvm_vcpu_inited(CPUState *env)
{
    return env->created;
}
1512 
1513 #ifdef TARGET_I386
kvm_hpet_disable_kpit(void)1514 void kvm_hpet_disable_kpit(void)
1515 {
1516     struct kvm_pit_state2 ps2;
1517 
1518     kvm_get_pit2(kvm_context, &ps2);
1519     ps2.flags |= KVM_PIT_FLAGS_HPET_LEGACY;
1520     kvm_set_pit2(kvm_context, &ps2);
1521 }
1522 
kvm_hpet_enable_kpit(void)1523 void kvm_hpet_enable_kpit(void)
1524 {
1525     struct kvm_pit_state2 ps2;
1526 
1527     kvm_get_pit2(kvm_context, &ps2);
1528     ps2.flags &= ~KVM_PIT_FLAGS_HPET_LEGACY;
1529     kvm_set_pit2(kvm_context, &ps2);
1530 }
1531 #endif
1532 
/*
 * One-time setup for the vcpu ("AP") infrastructure: register the VM
 * state-change handler and install handlers for SIG_IPI (no-op kick)
 * and SIGBUS (machine-check forwarding).  Always returns 0.
 */
int kvm_init_ap(void)
{
    struct sigaction action;

    qemu_add_vm_change_state_handler(kvm_vm_state_change_handler, NULL);

    memset(&action, 0, sizeof(action));
    /* NOTE(review): sa_sigaction is assigned without SA_SIGINFO; this
     * relies on sa_handler/sa_sigaction sharing storage — confirm on
     * all supported platforms. */
    action.sa_sigaction = (void (*)(int, siginfo_t*, void*))sig_ipi_handler;
    sigaction(SIG_IPI, &action, NULL);

    memset(&action, 0, sizeof(action));
    action.sa_flags = SA_SIGINFO;
    action.sa_sigaction = (void (*)(int, siginfo_t*, void*))sigbus_handler;
    sigaction(SIGBUS, &action, NULL);
#ifdef __linux__
    /* Opt in to early machine-check kill semantics (Linux only). */
    prctl(PR_MCE_KILL, 1, 1, 0, 0);
#endif
    return 0;
}
1552 
1553 /* If we have signalfd, we mask out the signals we want to handle and then
1554  * use signalfd to listen for them.  We rely on whatever the current signal
1555  * handler is to dispatch the signals when we receive them.
1556  */
1557 
/*
 * fd-ready callback for the io-thread's signalfd: drain all queued
 * siginfo records (the fd is non-blocking) and dispatch each one to
 * whatever handler is currently installed for that signal, preserving
 * the SA_SIGINFO vs. plain-handler distinction.
 */
static void sigfd_handler(void *opaque)
{
    int fd = (unsigned long) opaque;
    struct qemu_signalfd_siginfo info;
    struct sigaction action;
    ssize_t len;

    while (1) {
        do {
            len = read(fd, &info, sizeof(info));
        } while (len == -1 && errno == EINTR);

        /* Non-blocking fd: EAGAIN means the queue is drained. */
        if (len == -1 && errno == EAGAIN) {
            break;
        }

        if (len != sizeof(info)) {
            /* %m (strerror of errno) is a glibc/illumos extension */
            printf("read from sigfd returned %zd: %m\n", len);
            return;
        }

        /* Look up and invoke the currently-installed handler. */
        sigaction(info.ssi_signo, NULL, &action);
        if ((action.sa_flags & SA_SIGINFO) && action.sa_sigaction) {
            action.sa_sigaction(info.ssi_signo,
                                (siginfo_t *)&info, NULL);
        } else if (action.sa_handler) {
            action.sa_handler(info.ssi_signo);
        }
    }
}
1588 
/*
 * Main loop of the io-thread.  Routes SIGIO/SIGALRM/SIGBUS through a
 * signalfd (so they are handled synchronously here rather than in
 * arbitrary threads), wakes the vcpu threads waiting on machine init,
 * then services main_loop_wait() plus shutdown / powerdown / reset /
 * debug-stop requests until a shutdown breaks the loop.
 * Returns 0; -errno only if signalfd creation fails.
 */
int kvm_main_loop(void)
{
    sigset_t mask;
    int sigfd;

    io_thread = pthread_self();
    qemu_system_ready = 1;

    sigemptyset(&mask);
    sigaddset(&mask, SIGIO);
    sigaddset(&mask, SIGALRM);
    sigaddset(&mask, SIGBUS);
    sigprocmask(SIG_BLOCK, &mask, NULL);

    sigfd = qemu_signalfd(&mask);
    if (sigfd == -1) {
        fprintf(stderr, "failed to create signalfd\n");
        return -errno;
    }

    fcntl(sigfd, F_SETFL, O_NONBLOCK);

    qemu_set_fd_handler2(sigfd, NULL, sigfd_handler, NULL,
                         (void *)(unsigned long) sigfd);

    /* Release the vcpu threads blocked in ap_main_loop(). */
    pthread_cond_broadcast(&qemu_system_cond);

    io_thread_sigfd = sigfd;
    cpu_single_env = NULL;

    while (1) {
        main_loop_wait(0);
        if (qemu_shutdown_requested()) {
            monitor_protocol_event(QEVENT_SHUTDOWN, NULL);
            if (qemu_no_shutdown()) {
                /* -no-shutdown: stop instead of exiting */
                vm_stop(0);
            } else {
                break;
            }
        } else if (qemu_powerdown_requested()) {
            monitor_protocol_event(QEVENT_POWERDOWN, NULL);
            qemu_irq_raise(qemu_system_powerdown);
        } else if (qemu_reset_requested()) {
            qemu_kvm_system_reset();
        } else if (kvm_debug_cpu_requested) {
            gdb_set_stop_cpu(kvm_debug_cpu_requested);
            vm_stop(EXCP_DEBUG);
            kvm_debug_cpu_requested = NULL;
        }
    }

    bdrv_close_all();
    pause_all_threads();
    pthread_mutex_unlock(&qemu_mutex);

    return 0;
}
1646 
1647 #if !defined(TARGET_I386)
/* Non-x86 targets have no arch-specific irq routing setup. */
int kvm_arch_init_irq_routing(void)
{
    return 0;
}
1652 #endif
1653 
1654 extern int no_hpet;
1655 
/*
 * qemu-kvm half of KVM initialization: create the VM, run arch setup,
 * refuse kernels lacking required capabilities, probe optional ones
 * into kvm_state, and start the vcpu/signal infrastructure.
 * Returns 0 on success, -1 or a negative errno on failure.
 */
static int kvm_create_context(void)
{
    static const char upgrade_note[] =
    "Please upgrade to at least kernel 2.6.29 or recent kvm-kmod\n"
    "(see http://sourceforge.net/projects/kvm).\n";

    int r;

    if (!kvm_irqchip) {
        kvm_disable_irqchip_creation(kvm_context);
    }
    if (!kvm_pit) {
        kvm_disable_pit_creation(kvm_context);
    }
    if (kvm_create(kvm_context, 0, NULL) < 0) {
        kvm_finalize(kvm_state);
        return -1;
    }
    r = kvm_arch_qemu_create_context();
    if (r < 0) {
        kvm_finalize(kvm_state);
        return -1;
    }
    /* -no-kvm-pit-reinjection: disable lost-tick re-injection */
    if (kvm_pit && !kvm_pit_reinject) {
        if (kvm_reinject_control(kvm_context, 0)) {
            fprintf(stderr, "failure to disable in-kernel PIT reinjection\n");
            return -1;
        }
    }

    /* There was a nasty bug in < kvm-80 that prevents memory slots from being
     * destroyed properly.  Since we rely on this capability, refuse to work
     * with any kernel without this capability. */
    if (!kvm_check_extension(kvm_state, KVM_CAP_DESTROY_MEMORY_REGION_WORKS)) {
        fprintf(stderr,
                "KVM kernel module broken (DESTROY_MEMORY_REGION).\n%s",
                upgrade_note);
        return -EINVAL;
    }

    r = kvm_arch_init_irq_routing();
    if (r < 0) {
        return r;
    }

    /* Probe optional capabilities; each defaults to "absent". */
    kvm_state->vcpu_events = 0;
#ifdef KVM_CAP_VCPU_EVENTS
    kvm_state->vcpu_events = kvm_check_extension(kvm_state, KVM_CAP_VCPU_EVENTS);
#endif

    kvm_state->debugregs = 0;
#ifdef KVM_CAP_DEBUGREGS
    kvm_state->debugregs = kvm_check_extension(kvm_state, KVM_CAP_DEBUGREGS);
#endif

    kvm_state->xsave = 0;
#ifdef KVM_CAP_XSAVE
    kvm_state->xsave = kvm_check_extension(kvm_state, KVM_CAP_XSAVE);
#endif

    kvm_state->xcrs = 0;
#ifdef KVM_CAP_XCRS
    kvm_state->xcrs = kvm_check_extension(kvm_state, KVM_CAP_XCRS);
#endif

    kvm_state->many_ioeventfds = kvm_check_many_ioeventfds();

    kvm_init_ap();
    /* NOTE: the brace structure below is interleaved with the
     * TARGET_I386 preprocessor conditional — both branches close the
     * same 'if'; edit with care. */
    if (kvm_irqchip) {
        if (!qemu_kvm_has_gsi_routing()) {
            irq0override = 0;
#ifdef TARGET_I386
            /* if kernel can't do irq routing, interrupt source
             * override 0->2 can not be set up as required by hpet,
             * so disable hpet.
             */
            no_hpet = 1;
        } else if (!qemu_kvm_has_pit_state2()) {
            no_hpet = 1;
        }
#else
        }
#endif
    }

    return 0;
}
1743 
1744 #ifdef KVM_CAP_IRQCHIP
1745 
/*
 * Set the level of in-kernel irqchip line @irq; *status (if non-NULL)
 * receives the injection status from the kernel.
 */
int kvm_set_irq(int irq, int level, int *status)
{
    return kvm_set_irq_level(kvm_context, irq, level, status);
}
1750 
1751 #endif
1752 
/*
 * Release the global qemu_mutex from the io-thread.  A vcpu thread
 * must never call this (it would leave cpu_single_env dangling),
 * hence the assertion.
 */
static void kvm_mutex_unlock(void)
{
    assert(!cpu_single_env);
    pthread_mutex_unlock(&qemu_mutex);
}
1758 
/*
 * Acquire the global qemu_mutex for the io-thread; cpu_single_env is
 * NULL while the io-thread holds the lock.
 */
static void kvm_mutex_lock(void)
{
    pthread_mutex_lock(&qemu_mutex);
    cpu_single_env = NULL;
}
1764 
/* Public wrapper: drop the iothread lock (no-op when KVM is disabled). */
void qemu_mutex_unlock_iothread(void)
{
    if (kvm_enabled()) {
        kvm_mutex_unlock();
    }
}
1771 
/* Public wrapper: take the iothread lock (no-op when KVM is disabled). */
void qemu_mutex_lock_iothread(void)
{
    if (kvm_enabled()) {
        kvm_mutex_lock();
    }
}
1778 
1779 #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
/*
 * Record an ioperm range for an assigned device; the list is replayed
 * on each new vcpu thread in ap_main_loop().  Takes ownership of @data.
 */
void kvm_add_ioperm_data(struct ioperm_data *data)
{
    QLIST_INSERT_HEAD(&ioperm_head, data, entries);
}
1784 
kvm_remove_ioperm_data(unsigned long start_port,unsigned long num)1785 void kvm_remove_ioperm_data(unsigned long start_port, unsigned long num)
1786 {
1787     struct ioperm_data *data;
1788 
1789     data = QLIST_FIRST(&ioperm_head);
1790     while (data) {
1791         struct ioperm_data *next = QLIST_NEXT(data, entries);
1792 
1793         if (data->start_port == start_port && data->num == num) {
1794             QLIST_REMOVE(data, entries);
1795             qemu_free(data);
1796         }
1797 
1798         data = next;
1799     }
1800 }
1801 
/*
 * Apply an ioperm range on @env's vcpu thread, but only once KVM is
 * active and the machine has finished initializing (before that,
 * ap_main_loop replays the recorded list itself).
 */
void kvm_ioperm(CPUState *env, void *data)
{
    if (kvm_enabled() && qemu_system_ready) {
        on_vcpu(env, kvm_arch_do_ioperm, data);
    }
}
1808 
1809 #endif
1810 
/* Tell the kernel which vcpu id is the boot processor (BSP). */
int kvm_set_boot_cpu_id(uint32_t id)
{
    return kvm_set_boot_vcpu_id(kvm_context, id);
}
1815 
1816