1 /*
2 * Copyright (c) 2007, Neocleus Corporation.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 *
13 * You should have received a copy of the GNU General Public License along with
14 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15 * Place - Suite 330, Boston, MA 02111-1307 USA.
16 *
17 *
18 * Assign a PCI device from the host to a guest VM.
19 *
20 * Adapted for KVM by Qumranet.
21 *
22 * Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
23 * Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
24 * Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
25 * Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
26 * Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
27 */
28 #include <stdio.h>
29 #include <unistd.h>
30 #include <sys/io.h>
31 #include <sys/types.h>
32 #include <sys/stat.h>
33 #include "qemu-kvm.h"
34 #include "hw.h"
35 #include "pc.h"
36 #include "qemu-error.h"
37 #include "console.h"
38 #include "device-assignment.h"
39 #include "loader.h"
40 #include "monitor.h"
41 #include "range.h"
42 #include <pci/header.h>
43 #include "sysemu.h"
44
45 /* From linux/ioport.h */
46 #define IORESOURCE_IO 0x00000100 /* Resource type */
47 #define IORESOURCE_MEM 0x00000200
48 #define IORESOURCE_IRQ 0x00000400
49 #define IORESOURCE_DMA 0x00000800
50 #define IORESOURCE_PREFETCH 0x00001000 /* No side effects */
51
52 /* #define DEVICE_ASSIGNMENT_DEBUG 1 */
53
54 #ifdef DEVICE_ASSIGNMENT_DEBUG
55 #define DEBUG(fmt, ...) \
56 do { \
57 fprintf(stderr, "%s: " fmt, __func__ , __VA_ARGS__); \
58 } while (0)
59 #else
60 #define DEBUG(fmt, ...) do { } while(0)
61 #endif
62
63 static void assigned_dev_load_option_rom(AssignedDevice *dev);
64
65 static void assigned_dev_unregister_msix_mmio(AssignedDevice *dev);
66
67 static void assigned_device_pci_cap_write_config(PCIDevice *pci_dev,
68 uint32_t address,
69 uint32_t val, int len);
70
71 static uint32_t assigned_device_pci_cap_read_config(PCIDevice *pci_dev,
72 uint32_t address, int len);
73
/*
 * Perform a guest I/O-port access on the assigned device's region.
 *
 * If the sysfs resource file is usable (resource_fd >= 0), the access is
 * done with pread()/pwrite() at the offset within the region.  Otherwise
 * we fall back to raw in/out instructions on the host port (which
 * requires the ioperm grant set up in assigned_dev_ioport_map()).
 *
 * @dev_region: region descriptor; e_physbase is the guest base address
 * @addr: guest port address of the access
 * @len: access width in bytes (1, 2 or 4)
 * @val: pointer to the value to write, or NULL to request a read
 *
 * Returns the value read, or 0 for writes.  A failed pread() returns
 * all-ones of the access width, mimicking a master abort.
 */
static uint32_t assigned_dev_ioport_rw(AssignedDevRegion *dev_region,
                                       uint32_t addr, int len, uint32_t *val)
{
    uint32_t ret = 0;
    uint32_t offset = addr - dev_region->e_physbase;
    int fd = dev_region->region->resource_fd;

    if (fd >= 0) {
        if (val) {
            DEBUG("pwrite val=%x, len=%d, e_phys=%x, offset=%x\n",
                  *val, len, addr, offset);
            if (pwrite(fd, val, len, offset) != len) {
                fprintf(stderr, "%s - pwrite failed %s\n",
                        __func__, strerror(errno));
            }
        } else {
            if (pread(fd, &ret, len, offset) != len) {
                fprintf(stderr, "%s - pread failed %s\n",
                        __func__, strerror(errno));
                /* Use 1ULL: for len == 4 a 32-bit unsigned long shift of
                 * 32 would be undefined behavior. */
                ret = (uint32_t)((1ULL << (len * 8)) - 1);
            }
            DEBUG("pread ret=%x, len=%d, e_phys=%x, offset=%x\n",
                  ret, len, addr, offset);
        }
    } else {
        /* Raw access path: translate to the host port number. */
        uint32_t port = offset + dev_region->u.r_baseport;

        if (val) {
            DEBUG("out val=%x, len=%d, e_phys=%x, host=%x\n",
                  *val, len, addr, port);
            switch (len) {
            case 1:
                outb(*val, port);
                break;
            case 2:
                outw(*val, port);
                break;
            case 4:
                outl(*val, port);
                break;
            }
        } else {
            switch (len) {
            case 1:
                ret = inb(port);
                break;
            case 2:
                ret = inw(port);
                break;
            case 4:
                ret = inl(port);
                break;
            }
            DEBUG("in val=%x, len=%d, e_phys=%x, host=%x\n",
                  ret, len, addr, port);
        }
    }
    return ret;
}
133
/* 1-byte guest port write, forwarded to the host device region. */
static void assigned_dev_ioport_writeb(void *opaque, uint32_t addr,
                                       uint32_t value)
{
    assigned_dev_ioport_rw(opaque, addr, 1, &value);
}
140
/* 2-byte guest port write, forwarded to the host device region. */
static void assigned_dev_ioport_writew(void *opaque, uint32_t addr,
                                       uint32_t value)
{
    assigned_dev_ioport_rw(opaque, addr, 2, &value);
}
147
/* 4-byte guest port write, forwarded to the host device region. */
static void assigned_dev_ioport_writel(void *opaque, uint32_t addr,
                                       uint32_t value)
{
    assigned_dev_ioport_rw(opaque, addr, 4, &value);
}
154
/* 1-byte guest port read (NULL val selects the read path). */
static uint32_t assigned_dev_ioport_readb(void *opaque, uint32_t addr)
{
    return assigned_dev_ioport_rw(opaque, addr, 1, NULL);
}
159
/* 2-byte guest port read (NULL val selects the read path). */
static uint32_t assigned_dev_ioport_readw(void *opaque, uint32_t addr)
{
    return assigned_dev_ioport_rw(opaque, addr, 2, NULL);
}
164
/* 4-byte guest port read (NULL val selects the read path). */
static uint32_t assigned_dev_ioport_readl(void *opaque, uint32_t addr)
{
    return assigned_dev_ioport_rw(opaque, addr, 4, NULL);
}
169
/* Trapped 1-byte read from a "slow" (non page-aligned) memory BAR. */
static uint32_t slow_bar_readb(void *opaque, target_phys_addr_t addr)
{
    AssignedDevRegion *d = opaque;
    uint8_t *in = d->u.r_virtbase + addr;
    uint32_t r;

    r = *in;
    /* Fixed copy/paste: message previously said "slow_bar_readl"/%08x. */
    DEBUG("slow_bar_readb addr=0x" TARGET_FMT_plx " val=0x%02x\n", addr, r);

    return r;
}
181
/* Trapped 2-byte read from a "slow" (non page-aligned) memory BAR. */
static uint32_t slow_bar_readw(void *opaque, target_phys_addr_t addr)
{
    AssignedDevRegion *d = opaque;
    uint16_t *in = d->u.r_virtbase + addr;
    uint32_t r;

    r = *in;
    /* Fixed copy/paste: message previously said "slow_bar_readl"/%08x. */
    DEBUG("slow_bar_readw addr=0x" TARGET_FMT_plx " val=0x%04x\n", addr, r);

    return r;
}
193
/* Trapped 4-byte read from a "slow" (non page-aligned) memory BAR. */
static uint32_t slow_bar_readl(void *opaque, target_phys_addr_t addr)
{
    AssignedDevRegion *region = opaque;
    uint32_t *src = (uint32_t *)((uint8_t *)region->u.r_virtbase + addr);
    uint32_t value = *src;

    DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, value);

    return value;
}
205
/* Trapped 1-byte write to a "slow" (non page-aligned) memory BAR. */
static void slow_bar_writeb(void *opaque, target_phys_addr_t addr, uint32_t val)
{
    AssignedDevRegion *region = opaque;
    uint8_t *dst = (uint8_t *)region->u.r_virtbase + addr;

    DEBUG("slow_bar_writeb addr=0x" TARGET_FMT_plx " val=0x%02x\n", addr, val);
    *dst = val;
}
214
/* Trapped 2-byte write to a "slow" (non page-aligned) memory BAR. */
static void slow_bar_writew(void *opaque, target_phys_addr_t addr, uint32_t val)
{
    AssignedDevRegion *region = opaque;
    uint16_t *dst = (uint16_t *)((uint8_t *)region->u.r_virtbase + addr);

    DEBUG("slow_bar_writew addr=0x" TARGET_FMT_plx " val=0x%04x\n", addr, val);
    *dst = val;
}
223
/* Trapped 4-byte write to a "slow" (non page-aligned) memory BAR. */
static void slow_bar_writel(void *opaque, target_phys_addr_t addr, uint32_t val)
{
    AssignedDevRegion *region = opaque;
    uint32_t *dst = (uint32_t *)((uint8_t *)region->u.r_virtbase + addr);

    DEBUG("slow_bar_writel addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, val);
    *dst = val;
}
232
/* Dispatch tables for trapped ("slow") BAR accesses, indexed by log2 of
 * the access width, as expected by cpu_register_io_memory(). */
static CPUWriteMemoryFunc * const slow_bar_write[] = {
    &slow_bar_writeb,
    &slow_bar_writew,
    &slow_bar_writel
};

static CPUReadMemoryFunc * const slow_bar_read[] = {
    &slow_bar_readb,
    &slow_bar_readw,
    &slow_bar_readl
};
244
/*
 * Map a memory BAR whose size is not page-aligned: every guest access is
 * trapped through the slow_bar_read/slow_bar_write callbacks instead of
 * being backed by a direct RAM mapping.
 */
static void assigned_dev_iomem_map_slow(PCIDevice *pci_dev, int region_num,
                                        pcibus_t e_phys, pcibus_t e_size,
                                        int type)
{
    AssignedDevice *r_dev = container_of(pci_dev, AssignedDevice, dev);
    AssignedDevRegion *region = &r_dev->v_addrs[region_num];
    PCIRegion *real_region = &r_dev->real_device.regions[region_num];
    int m;

    DEBUG("%s", "slow map\n");
    m = cpu_register_io_memory(slow_bar_read, slow_bar_write, region,
                               DEVICE_NATIVE_ENDIAN);
    cpu_register_physical_memory(e_phys, e_size, m);

    /* MSI-X MMIO page: if the MSI-X table falls inside this BAR, overlay
     * its page with the dedicated MSI-X mapping so guest accesses to the
     * table are intercepted. */
    if ((e_size > 0) &&
        real_region->base_addr <= r_dev->msix_table_addr &&
        real_region->base_addr + real_region->size >= r_dev->msix_table_addr) {
        int offset = r_dev->msix_table_addr - real_region->base_addr;

        cpu_register_physical_memory(e_phys + offset,
                                     TARGET_PAGE_SIZE, r_dev->mmio_index);
    }
}
269
/*
 * Map a page-aligned memory BAR directly into guest physical memory via
 * the pre-registered RAM block (region->memory_index), overlaying the
 * MSI-X table page where applicable.
 *
 * The previous version carried an `int ret = 0` that was never set and a
 * dead "create new mapping failed" error branch; both removed.
 */
static void assigned_dev_iomem_map(PCIDevice *pci_dev, int region_num,
                                   pcibus_t e_phys, pcibus_t e_size, int type)
{
    AssignedDevice *r_dev = container_of(pci_dev, AssignedDevice, dev);
    AssignedDevRegion *region = &r_dev->v_addrs[region_num];
    PCIRegion *real_region = &r_dev->real_device.regions[region_num];

    DEBUG("e_phys=%08" FMT_PCIBUS " r_virt=%p type=%d len=%08" FMT_PCIBUS " region_num=%d \n",
          e_phys, region->u.r_virtbase, type, e_size, region_num);

    region->e_physbase = e_phys;
    region->e_size = e_size;

    if (e_size > 0) {
        cpu_register_physical_memory(e_phys, e_size, region->memory_index);

        /* deal with MSI-X MMIO page: overlay the table page so guest
         * accesses are intercepted rather than hitting the device. */
        if (real_region->base_addr <= r_dev->msix_table_addr &&
            real_region->base_addr + real_region->size >=
            r_dev->msix_table_addr) {
            int offset = r_dev->msix_table_addr - real_region->base_addr;

            cpu_register_physical_memory(e_phys + offset,
                                         TARGET_PAGE_SIZE, r_dev->mmio_index);
        }
    }
}
303
/*
 * Map an I/O-port BAR: register byte/word/long ioport handlers for the
 * guest range.  If the sysfs resource file cannot be used for port access
 * (resource_fd < 0), additionally grant the guest direct access to the
 * host ports via kvm ioperm on the first mapping.
 */
static void assigned_dev_ioport_map(PCIDevice *pci_dev, int region_num,
                                    pcibus_t addr, pcibus_t size, int type)
{
    AssignedDevice *r_dev = container_of(pci_dev, AssignedDevice, dev);
    AssignedDevRegion *region = &r_dev->v_addrs[region_num];
    int first_map = (region->e_size == 0);  /* e_size stays 0 until first map */
    CPUState *env;

    region->e_physbase = addr;
    region->e_size = size;

    DEBUG("e_phys=0x%" FMT_PCIBUS " r_baseport=%x type=0x%x len=%" FMT_PCIBUS " region_num=%d \n",
          addr, region->u.r_baseport, type, size, region_num);

    if (first_map && region->region->resource_fd < 0) {
        struct ioperm_data *data;

        data = qemu_mallocz(sizeof(struct ioperm_data));
        if (data == NULL) {
            fprintf(stderr, "%s: Out of memory\n", __func__);
            exit(1);
        }

        data->start_port = region->u.r_baseport;
        data->num = region->r_size;
        data->turn_on = 1;

        kvm_add_ioperm_data(data);

        /* Apply the ioperm grant on every vcpu. */
        for (env = first_cpu; env; env = env->next_cpu)
            kvm_ioperm(env, data);
    }

    register_ioport_read(addr, size, 1, assigned_dev_ioport_readb,
                         (r_dev->v_addrs + region_num));
    register_ioport_read(addr, size, 2, assigned_dev_ioport_readw,
                         (r_dev->v_addrs + region_num));
    register_ioport_read(addr, size, 4, assigned_dev_ioport_readl,
                         (r_dev->v_addrs + region_num));
    register_ioport_write(addr, size, 1, assigned_dev_ioport_writeb,
                          (r_dev->v_addrs + region_num));
    register_ioport_write(addr, size, 2, assigned_dev_ioport_writew,
                          (r_dev->v_addrs + region_num));
    register_ioport_write(addr, size, 4, assigned_dev_ioport_writel,
                          (r_dev->v_addrs + region_num));
}
350
/*
 * Read @len bytes at @pos from the real device's config space (sysfs
 * config fd), retrying on EINTR/EAGAIN.  Any other failure is fatal.
 */
static uint32_t assigned_dev_pci_read(PCIDevice *d, int pos, int len)
{
    AssignedDevice *adev = container_of(d, AssignedDevice, dev);
    int fd = adev->real_device.config_fd;
    uint32_t val;
    ssize_t r;

    do {
        r = pread(fd, &val, len, pos);
    } while (r < 0 && (errno == EINTR || errno == EAGAIN));

    if (r != len) {
        fprintf(stderr, "%s: pread failed, ret = %zd errno = %d\n",
                __func__, r, errno);

        exit(1);
    }

    return val;
}
372
/* Single-byte config-space read from the real device. */
static uint8_t assigned_dev_pci_read_byte(PCIDevice *d, int pos)
{
    uint32_t word = assigned_dev_pci_read(d, pos, 1);
    return (uint8_t)word;
}
377
/*
 * Write @len bytes of @val at @pos into the real device's config space
 * (sysfs config fd), retrying on EINTR/EAGAIN.  Any other failure is
 * fatal.
 */
static void assigned_dev_pci_write(PCIDevice *d, int pos, uint32_t val, int len)
{
    AssignedDevice *adev = container_of(d, AssignedDevice, dev);
    int fd = adev->real_device.config_fd;
    ssize_t r;

    do {
        r = pwrite(fd, &val, len, pos);
    } while (r < 0 && (errno == EINTR || errno == EAGAIN));

    if (r != len) {
        fprintf(stderr, "%s: pwrite failed, ret = %zd errno = %d\n",
                __func__, r, errno);

        exit(1);
    }
}
398
/*
 * Walk the real device's PCI capability list looking for capability id
 * @cap, starting from @start (or the standard list head when @start is
 * 0).  Returns the config-space offset of the capability, or 0 if not
 * found.  The walk is bounded to 48 links to guard against loops.
 */
static uint8_t pci_find_cap_offset(PCIDevice *d, uint8_t cap, uint8_t start)
{
    int pos = start ? start : PCI_CAPABILITY_LIST;
    int remaining;

    /* Device advertises no capability list at all? */
    if (!(assigned_dev_pci_read_byte(d, PCI_STATUS) & PCI_STATUS_CAP_LIST)) {
        return 0;
    }

    for (remaining = 48; remaining > 0; remaining--) {
        int id;

        /* Follow the "next capability" pointer stored at @pos. */
        pos = assigned_dev_pci_read_byte(d, pos);
        if (pos < 0x40) {
            /* Pointers below 0x40 are invalid -> end of list. */
            break;
        }

        pos &= ~3;
        id = assigned_dev_pci_read_byte(d, pos + PCI_CAP_LIST_ID);

        if (id == 0xff) {
            break;
        }
        if (id == cap) {
            return pos;
        }

        pos += PCI_CAP_LIST_NEXT;
    }
    return 0;
}
427
/*
 * Guest write to the device's PCI config space.
 *
 * Routing:
 *  - Offsets within an emulated capability (tracked by config_map) go to
 *    the capability write handler.
 *  - The command register (0x4) is written to the emulated config AND
 *    then also passed through to the real device below.
 *  - BAR registers (0x10-0x24), expansion ROM (0x30), capability pointer
 *    (0x34) and interrupt line/pin (0x3c/0x3d) are handled purely by
 *    QEMU's config-space emulation (BAR mapping updates).
 *  - Everything else is written straight through to the host device via
 *    the sysfs config fd, retrying on EINTR/EAGAIN; any other write
 *    failure is fatal.
 */
static void assigned_dev_pci_write_config(PCIDevice *d, uint32_t address,
                                          uint32_t val, int len)
{
    int fd;
    ssize_t ret;
    AssignedDevice *pci_dev = container_of(d, AssignedDevice, dev);

    DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n",
          ((d->devfn >> 3) & 0x1F), (d->devfn & 0x7),
          (uint16_t) address, val, len);

    if (address >= PCI_CONFIG_HEADER_SIZE && d->config_map[address]) {
        return assigned_device_pci_cap_write_config(d, address, val, len);
    }

    if (address == 0x4) {
        pci_default_write_config(d, address, val, len);
        /* Continue to program the card */
    }

    if ((address >= 0x10 && address <= 0x24) || address == 0x30 ||
        address == 0x34 || address == 0x3c || address == 0x3d) {
        /* used for update-mappings (BAR emulation) */
        pci_default_write_config(d, address, val, len);
        return;
    }

    DEBUG("NON BAR (%x.%x): address=%04x val=0x%08x len=%d\n",
          ((d->devfn >> 3) & 0x1F), (d->devfn & 0x7),
          (uint16_t) address, val, len);

    fd = pci_dev->real_device.config_fd;

again:
    ret = pwrite(fd, &val, len, address);
    if (ret != len) {
        if ((ret < 0) && (errno == EINTR || errno == EAGAIN))
            goto again;

        fprintf(stderr, "%s: pwrite failed, ret = %zd errno = %d\n",
                __func__, ret, errno);

        exit(1);
    }
}
473
/*
 * Guest read of the device's PCI config space.
 *
 * Routing mirrors assigned_dev_pci_write_config():
 *  - Emulated capability ranges go to the capability read handler.
 *  - Vendor/device IDs (< 0x4), the command register when the device
 *    needs command emulation (VF case), BARs (0x10-0x24), ROM (0x30),
 *    cap pointer (0x34) and interrupt line/pin (0x3c/0x3d) are served
 *    from QEMU's emulated config.
 *  - Everything else is read from the host device via the sysfs config
 *    fd (EINTR/EAGAIN retried; other failures fatal).
 * When the device has no usable capabilities, the capability-list status
 * bit and the INTx-disable bit are masked out of the raw value.
 */
static uint32_t assigned_dev_pci_read_config(PCIDevice *d, uint32_t address,
                                             int len)
{
    uint32_t val = 0;
    int fd;
    ssize_t ret;
    AssignedDevice *pci_dev = container_of(d, AssignedDevice, dev);

    if (address >= PCI_CONFIG_HEADER_SIZE && d->config_map[address]) {
        val = assigned_device_pci_cap_read_config(d, address, len);
        DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n",
              (d->devfn >> 3) & 0x1F, (d->devfn & 0x7), address, val, len);
        return val;
    }

    if (address < 0x4 || (pci_dev->need_emulate_cmd && address == 0x4) ||
        (address >= 0x10 && address <= 0x24) || address == 0x30 ||
        address == 0x34 || address == 0x3c || address == 0x3d) {
        val = pci_default_read_config(d, address, len);
        DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n",
              (d->devfn >> 3) & 0x1F, (d->devfn & 0x7), address, val, len);
        return val;
    }

    /* vga specific, remove later */
    if (address == 0xFC)
        goto do_log;

    fd = pci_dev->real_device.config_fd;

again:
    ret = pread(fd, &val, len, address);
    if (ret != len) {
        if ((ret < 0) && (errno == EINTR || errno == EAGAIN))
            goto again;

        fprintf(stderr, "%s: pread failed, ret = %zd errno = %d\n",
                __func__, ret, errno);

        exit(1);
    }

do_log:
    DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n",
          (d->devfn >> 3) & 0x1F, (d->devfn & 0x7), address, val, len);

    if (!pci_dev->cap.available) {
        /* kill the special capabilities: hide the capability-list bit in
         * STATUS (dword read at 4) and the INTx-disable bit (read at 6). */
        if (address == 4 && len == 4)
            val &= ~0x100000;
        else if (address == 6)
            val &= ~0x10;
    }

    return val;
}
530
/*
 * Create QEMU BARs mirroring the host device's valid PCI regions.
 *
 * Memory regions are mmap()ed from their sysfs resource files; regions
 * whose size is a multiple of 4K are registered as directly-mapped RAM
 * (fast path), others fall back to trapped "slow" MMIO.  I/O port
 * regions get ioport-mapped BARs, preferring sysfs resource-file access
 * and falling back to raw in/out if the kernel doesn't support it.
 *
 * Returns 0 on success, -1 if an mmap fails.
 */
static int assigned_dev_register_regions(PCIRegion *io_regions,
                                         unsigned long regions_num,
                                         AssignedDevice *pci_dev)
{
    uint32_t i;
    PCIRegion *cur_region = io_regions;

    for (i = 0; i < regions_num; i++, cur_region++) {
        if (!cur_region->valid)
            continue;
        pci_dev->v_addrs[i].num = i;

        /* handle memory io regions */
        if (cur_region->type & IORESOURCE_MEM) {
            int slow_map = 0;
            int t = cur_region->type & IORESOURCE_PREFETCH
                ? PCI_BASE_ADDRESS_MEM_PREFETCH
                : PCI_BASE_ADDRESS_SPACE_MEMORY;

            /* Non-4K-multiple regions cannot be direct-mapped; use the
             * trapped slow path instead. */
            if (cur_region->size & 0xFFF) {
                fprintf(stderr, "PCI region %d at address 0x%llx "
                        "has size 0x%x, which is not a multiple of 4K. "
                        "You might experience some performance hit "
                        "due to that.\n",
                        i, (unsigned long long)cur_region->base_addr,
                        cur_region->size);
                slow_map = 1;
            }

            /* map physical memory */
            pci_dev->v_addrs[i].e_physbase = cur_region->base_addr;
            pci_dev->v_addrs[i].u.r_virtbase = mmap(NULL, cur_region->size,
                                                    PROT_WRITE | PROT_READ,
                                                    MAP_SHARED,
                                                    cur_region->resource_fd,
                                                    (off_t)0);

            if (pci_dev->v_addrs[i].u.r_virtbase == MAP_FAILED) {
                pci_dev->v_addrs[i].u.r_virtbase = NULL;
                fprintf(stderr, "%s: Error: Couldn't mmap 0x%x!"
                        "\n", __func__,
                        (uint32_t) (cur_region->base_addr));
                return -1;
            }

            pci_dev->v_addrs[i].r_size = cur_region->size;
            pci_dev->v_addrs[i].e_size = 0;

            /* add offset: keep the sub-page offset of the host base
             * address so accesses land on the right bytes. */
            pci_dev->v_addrs[i].u.r_virtbase +=
                (cur_region->base_addr & 0xFFF);


            if (!slow_map) {
                /* Fast path: expose the mmap()ed area as guest RAM. */
                void *virtbase = pci_dev->v_addrs[i].u.r_virtbase;
                char name[32];
                snprintf(name, sizeof(name), "%s.bar%d",
                         pci_dev->dev.qdev.info->name, i);
                pci_dev->v_addrs[i].memory_index =
                    qemu_ram_alloc_from_ptr(
                        &pci_dev->dev.qdev,
                        name, cur_region->size,
                        virtbase);
            } else
                pci_dev->v_addrs[i].memory_index = 0;

            pci_register_bar((PCIDevice *) pci_dev, i,
                             cur_region->size, t,
                             slow_map ? assigned_dev_iomem_map_slow
                                      : assigned_dev_iomem_map);
            continue;
        } else {
            /* handle port io regions */
            uint32_t val;
            int ret;

            /* Test kernel support for ioport resource read/write. Old
             * kernels return EIO. New kernels only allow 1/2/4 byte reads
             * so should return EINVAL for a 3 byte read */
            ret = pread(pci_dev->v_addrs[i].region->resource_fd, &val, 3, 0);
            if (ret == 3) {
                fprintf(stderr, "I/O port resource supports 3 byte read?!\n");
                abort();
            } else if (errno != EINVAL) {
                /* No sysfs access; fall back to raw in/out (marked by
                 * resource_fd = -1, see assigned_dev_ioport_rw()). */
                fprintf(stderr, "Using raw in/out ioport access (sysfs - %s)\n",
                        strerror(errno));
                close(pci_dev->v_addrs[i].region->resource_fd);
                pci_dev->v_addrs[i].region->resource_fd = -1;
            }

            pci_dev->v_addrs[i].e_physbase = cur_region->base_addr;
            pci_dev->v_addrs[i].u.r_baseport = cur_region->base_addr;
            pci_dev->v_addrs[i].r_size = cur_region->size;
            pci_dev->v_addrs[i].e_size = 0;

            pci_register_bar((PCIDevice *) pci_dev, i,
                             cur_region->size, PCI_BASE_ADDRESS_SPACE_IO,
                             assigned_dev_ioport_map);

            /* not relevant for port io */
            pci_dev->v_addrs[i].memory_index = 0;
        }
    }

    /* success */
    return 0;
}
638
/*
 * Read a 16-bit ID (e.g. "vendor" or "device") from the sysfs attribute
 * file @devpath@idname into *val.
 *
 * Returns 0 on success, -1 if the file can't be opened or parsed.
 * Fix: the original leaked the FILE* when fscanf() failed to parse.
 */
static int get_real_id(const char *devpath, const char *idname, uint16_t *val)
{
    FILE *f;
    char name[128];
    long id;
    int ret = -1;

    snprintf(name, sizeof(name), "%s%s", devpath, idname);
    f = fopen(name, "r");
    if (f == NULL) {
        fprintf(stderr, "%s: %s: %m\n", __func__, name);
        return -1;
    }
    /* sysfs id files contain a single 0x-prefixed value; %li parses it. */
    if (fscanf(f, "%li\n", &id) == 1) {
        *val = id;
        ret = 0;
    }
    fclose(f);

    return ret;
}
660
/* Read the device's vendor ID from its sysfs "vendor" attribute. */
static int get_real_vendor_id(const char *devpath, uint16_t *val)
{
    return get_real_id(devpath, "vendor", val);
}
665
/* Read the device's device ID from its sysfs "device" attribute. */
static int get_real_device_id(const char *devpath, uint16_t *val)
{
    return get_real_id(devpath, "device", val);
}
670
/*
 * Open and describe the real host device at seg:bus:dev.func.
 *
 * Opens the sysfs config file (or uses a monitor-passed/configfd), reads
 * the raw config space into the emulated config, parses the "resource"
 * file to discover and open each BAR's resource file, and fills in
 * vendor/device IDs.  Also detects SR-IOV virtual functions (presence of
 * a physfn/ link), which need command-register emulation.
 *
 * Returns 0 on success, 1 on failure.
 * Fix: corrected "unkown" -> "unknown" in the configfd error message.
 */
static int get_real_device(AssignedDevice *pci_dev, uint16_t r_seg,
                           uint8_t r_bus, uint8_t r_dev, uint8_t r_func)
{
    char dir[128], name[128];
    int fd, r = 0, v;
    FILE *f;
    unsigned long long start, end, size, flags;
    uint16_t id;
    struct stat statbuf;
    PCIRegion *rp;
    PCIDevRegions *dev = &pci_dev->real_device;

    dev->region_number = 0;

    snprintf(dir, sizeof(dir), "/sys/bus/pci/devices/%04x:%02x:%02x.%x/",
             r_seg, r_bus, r_dev, r_func);

    snprintf(name, sizeof(name), "%sconfig", dir);

    /* A config fd may have been handed in (number or monitor fd name)
     * instead of opening the sysfs config file ourselves. */
    if (pci_dev->configfd_name && *pci_dev->configfd_name) {
        if (qemu_isdigit(pci_dev->configfd_name[0])) {
            dev->config_fd = strtol(pci_dev->configfd_name, NULL, 0);
        } else {
            dev->config_fd = monitor_get_fd(cur_mon, pci_dev->configfd_name);
            if (dev->config_fd < 0) {
                fprintf(stderr, "%s: (%s) unknown\n", __func__,
                        pci_dev->configfd_name);
                return 1;
            }
        }
    } else {
        dev->config_fd = open(name, O_RDWR);

        if (dev->config_fd == -1) {
            fprintf(stderr, "%s: %s: %m\n", __func__, name);
            return 1;
        }
    }
again:
    r = read(dev->config_fd, pci_dev->dev.config,
             pci_config_size(&pci_dev->dev));
    if (r < 0) {
        if (errno == EINTR || errno == EAGAIN)
            goto again;
        fprintf(stderr, "%s: read failed, errno = %d\n", __func__, errno);
    }

    /* Clear host resource mapping info. If we choose not to register a
     * BAR, such as might be the case with the option ROM, we can get
     * confusing, unwritable, residual addresses from the host here. */
    memset(&pci_dev->dev.config[PCI_BASE_ADDRESS_0], 0, 24);
    memset(&pci_dev->dev.config[PCI_ROM_ADDRESS], 0, 4);

    snprintf(name, sizeof(name), "%sresource", dir);

    f = fopen(name, "r");
    if (f == NULL) {
        fprintf(stderr, "%s: %s: %m\n", __func__, name);
        return 1;
    }

    /* One line per BAR: "start end flags". */
    for (r = 0; r < PCI_ROM_SLOT; r++) {
        if (fscanf(f, "%lli %lli %lli\n", &start, &end, &flags) != 3)
            break;

        rp = dev->regions + r;
        rp->valid = 0;
        rp->resource_fd = -1;
        size = end - start + 1;
        flags &= IORESOURCE_IO | IORESOURCE_MEM | IORESOURCE_PREFETCH;
        /* Skip empty or typeless slots. */
        if (size == 0 || (flags & ~IORESOURCE_PREFETCH) == 0)
            continue;
        if (flags & IORESOURCE_MEM) {
            flags &= ~IORESOURCE_IO;
        } else {
            flags &= ~IORESOURCE_PREFETCH;
        }
        snprintf(name, sizeof(name), "%sresource%d", dir, r);
        fd = open(name, O_RDWR);
        if (fd == -1)
            continue;
        rp->resource_fd = fd;

        rp->type = flags;
        rp->valid = 1;
        rp->base_addr = start;
        rp->size = size;
        pci_dev->v_addrs[r].region = rp;
        DEBUG("region %d size %d start 0x%llx type %d resource_fd %d\n",
              r, rp->size, start, rp->type, rp->resource_fd);
    }

    fclose(f);

    /* read and fill vendor ID */
    v = get_real_vendor_id(dir, &id);
    if (v) {
        return 1;
    }
    pci_dev->dev.config[0] = id & 0xff;
    pci_dev->dev.config[1] = (id & 0xff00) >> 8;

    /* read and fill device ID */
    v = get_real_device_id(dir, &id);
    if (v) {
        return 1;
    }
    pci_dev->dev.config[2] = id & 0xff;
    pci_dev->dev.config[3] = (id & 0xff00) >> 8;

    /* dealing with virtual function device: a physfn/ link marks an
     * SR-IOV VF, whose command register must be emulated. */
    snprintf(name, sizeof(name), "%sphysfn/", dir);
    if (!stat(name, &statbuf))
        pci_dev->need_emulate_cmd = 1;
    else
        pci_dev->need_emulate_cmd = 0;

    dev->region_number = r;
    return 0;
}
791
/* List of all currently assigned devices; walked by
 * assigned_dev_update_irqs() when guest IRQ routing may have changed. */
static QLIST_HEAD(, AssignedDevice) devs = QLIST_HEAD_INITIALIZER(devs);
793
794 #ifdef KVM_CAP_IRQ_ROUTING
/* Remove and free all KVM IRQ routing entries owned by @dev. */
static void free_dev_irq_entries(AssignedDevice *dev)
{
    int idx;

    for (idx = 0; idx < dev->irq_entries_nr; idx++) {
        kvm_del_routing_entry(&dev->entry[idx]);
    }
    free(dev->entry);
    dev->entry = NULL;
    dev->irq_entries_nr = 0;
}
805 #endif
806
/*
 * Tear down an assigned device: revoke ioperm grants, unregister and
 * unmap memory regions, close resource and config fds, drop MSI-X MMIO
 * mapping and KVM IRQ routing entries.  Safe to call with dev == NULL.
 */
static void free_assigned_device(AssignedDevice *dev)
{
    if (dev) {
        int i;

        for (i = 0; i < dev->real_device.region_number; i++) {
            PCIRegion *pci_region = &dev->real_device.regions[i];
            AssignedDevRegion *region = &dev->v_addrs[i];

            if (!pci_region->valid)
                continue;

            if (pci_region->type & IORESOURCE_IO) {
                /* resource_fd < 0 means raw in/out was used, which
                 * required an ioperm grant -> revoke it. */
                if (pci_region->resource_fd < 0) {
                    kvm_remove_ioperm_data(region->u.r_baseport,
                                           region->r_size);
                }
            } else if (pci_region->type & IORESOURCE_MEM) {
                if (region->u.r_virtbase) {
                    /* Unregister the guest mapping before dropping the
                     * backing RAM block. */
                    if (region->memory_index) {
                        cpu_register_physical_memory(region->e_physbase,
                                                     region->e_size,
                                                     IO_MEM_UNASSIGNED);
                        qemu_ram_unmap(region->memory_index);
                    }
                    /* munmap a whole number of pages (size rounded up). */
                    if (munmap(region->u.r_virtbase,
                               (pci_region->size + 0xFFF) & 0xFFFFF000))
                        fprintf(stderr,
                                "Failed to unmap assigned device region: %s\n",
                                strerror(errno));
                }
            }
            if (pci_region->resource_fd >= 0) {
                close(pci_region->resource_fd);
            }
        }

        if (dev->cap.available & ASSIGNED_DEVICE_CAP_MSIX)
            assigned_dev_unregister_msix_mmio(dev);

        if (dev->real_device.config_fd >= 0) {
            close(dev->real_device.config_fd);
        }

#ifdef KVM_CAP_IRQ_ROUTING
        free_dev_irq_entries(dev);
#endif
    }
}
856
/* Pack segment, bus and devfn into KVM's 32-bit assigned-device id:
 * seg in bits 31..16, bus in bits 15..8, devfn in bits 7..0. */
static uint32_t calc_assigned_dev_id(uint16_t seg, uint8_t bus, uint8_t devfn)
{
    uint32_t id = seg;

    id = (id << 8) | bus;
    id = (id << 8) | devfn;
    return id;
}
861
/*
 * Diagnose a failed (-EBUSY) device assignment: find which host driver
 * is bound to the device and print pci-stub rebinding instructions.
 *
 * Fix: unbounded sprintf() into the fixed-size path buffers replaced by
 * snprintf().
 */
static void assign_failed_examine(AssignedDevice *dev)
{
    char name[PATH_MAX], dir[PATH_MAX], driver[PATH_MAX] = {}, *ns;
    uint16_t vendor_id, device_id;
    int r;

    snprintf(dir, sizeof(dir), "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/",
             dev->host.seg, dev->host.bus, dev->host.dev, dev->host.func);

    snprintf(name, sizeof(name), "%sdriver", dir);

    /* driver[] is zero-filled, so a short readlink() leaves it
     * NUL-terminated. */
    r = readlink(name, driver, sizeof(driver));
    if ((r <= 0) || r >= sizeof(driver) || !(ns = strrchr(driver, '/'))) {
        goto fail;
    }

    ns++;  /* skip past the final '/' to the driver name */

    if (get_real_vendor_id(dir, &vendor_id) ||
        get_real_device_id(dir, &device_id)) {
        goto fail;
    }

    fprintf(stderr, "*** The driver '%s' is occupying your device "
            "%04x:%02x:%02x.%x.\n",
            ns, dev->host.seg, dev->host.bus, dev->host.dev, dev->host.func);
    fprintf(stderr, "***\n");
    fprintf(stderr, "*** You can try the following commands to free it:\n");
    fprintf(stderr, "***\n");
    fprintf(stderr, "*** $ echo \"%04x %04x\" > /sys/bus/pci/drivers/pci-stub/"
            "new_id\n", vendor_id, device_id);
    fprintf(stderr, "*** $ echo \"%04x:%02x:%02x.%x\" > /sys/bus/pci/drivers/"
            "%s/unbind\n",
            dev->host.seg, dev->host.bus, dev->host.dev, dev->host.func, ns);
    fprintf(stderr, "*** $ echo \"%04x:%02x:%02x.%x\" > /sys/bus/pci/drivers/"
            "pci-stub/bind\n",
            dev->host.seg, dev->host.bus, dev->host.dev, dev->host.func);
    fprintf(stderr, "*** $ echo \"%04x %04x\" > /sys/bus/pci/drivers/pci-stub"
            "/remove_id\n", vendor_id, device_id);
    fprintf(stderr, "***\n");

    return;

fail:
    fprintf(stderr, "Couldn't find out why.\n");
}
908
/*
 * Hand the host device over to KVM (KVM_ASSIGN_PCI_DEVICE), enabling
 * IOMMU protection when requested and supported.  On -EBUSY the likely
 * cause (a bound host driver) is diagnosed for the user.
 *
 * Returns 0 on success, a negative errno on failure.
 */
static int assign_device(AssignedDevice *dev)
{
    struct kvm_assigned_pci_dev assigned_dev_data;
    int r;

#ifdef KVM_CAP_PCI_SEGMENT
    /* Only pass non-zero PCI segment to capable module */
    if (!kvm_check_extension(kvm_state, KVM_CAP_PCI_SEGMENT) &&
        dev->h_segnr) {
        fprintf(stderr, "Can't assign device inside non-zero PCI segment "
                "as this KVM module doesn't support it.\n");
        return -ENODEV;
    }
#endif

    memset(&assigned_dev_data, 0, sizeof(assigned_dev_data));
    assigned_dev_data.assigned_dev_id =
        calc_assigned_dev_id(dev->h_segnr, dev->h_busnr, dev->h_devfn);
#ifdef KVM_CAP_PCI_SEGMENT
    assigned_dev_data.segnr = dev->h_segnr;
#endif
    assigned_dev_data.busnr = dev->h_busnr;
    assigned_dev_data.devfn = dev->h_devfn;

#ifdef KVM_CAP_IOMMU
    /* We always enable the IOMMU unless disabled on the command line */
    if (dev->features & ASSIGNED_DEVICE_USE_IOMMU_MASK) {
        if (!kvm_check_extension(kvm_state, KVM_CAP_IOMMU)) {
            fprintf(stderr, "No IOMMU found. Unable to assign device \"%s\"\n",
                    dev->dev.qdev.id);
            return -ENODEV;
        }
        assigned_dev_data.flags |= KVM_DEV_ASSIGN_ENABLE_IOMMU;
    }
#else
    dev->features &= ~ASSIGNED_DEVICE_USE_IOMMU_MASK;
#endif
    if (!(dev->features & ASSIGNED_DEVICE_USE_IOMMU_MASK)) {
        fprintf(stderr,
                "WARNING: Assigning a device without IOMMU protection can "
                "cause host memory corruption if the device issues DMA write "
                "requests!\n");
    }

    r = kvm_assign_pci_device(kvm_context, &assigned_dev_data);
    if (r < 0) {
        fprintf(stderr, "Failed to assign device \"%s\" : %s\n",
                dev->dev.qdev.id, strerror(-r));

        switch (r) {
        case -EBUSY:
            /* Most often another host driver holds the device. */
            assign_failed_examine(dev);
            break;
        default:
            break;
        }
    }
    return r;
}
968
/*
 * (Re-)route the device's interrupt to the guest.
 *
 * Computes the guest IRQ from the interrupt pin, deassigns any previous
 * routing, and assigns a new one — host side via MSI when preferred and
 * available, otherwise INTx.  No-op when the device uses no INTx pin or
 * the guest IRQ is unchanged.
 *
 * Returns 0 on success or a negative errno.
 */
static int assign_irq(AssignedDevice *dev)
{
    struct kvm_assigned_irq assigned_irq_data;
    int irq, r = 0;

    /* Interrupt PIN 0 means don't use INTx */
    if (assigned_dev_pci_read_byte(&dev->dev, PCI_INTERRUPT_PIN) == 0)
        return 0;

    irq = pci_map_irq(&dev->dev, dev->intpin);
    irq = piix_get_irq(irq);

#ifdef TARGET_IA64
    irq = ipf_map_irq(&dev->dev, irq);
#endif

    /* Routing unchanged -> nothing to do. */
    if (dev->girq == irq)
        return r;

    memset(&assigned_irq_data, 0, sizeof(assigned_irq_data));
    assigned_irq_data.assigned_dev_id =
        calc_assigned_dev_id(dev->h_segnr, dev->h_busnr, dev->h_devfn);
    assigned_irq_data.guest_irq = irq;
    assigned_irq_data.host_irq = dev->real_device.irq;
#ifdef KVM_CAP_ASSIGN_DEV_IRQ
    if (dev->irq_requested_type) {
        /* Drop the previous routing before installing the new one. */
        assigned_irq_data.flags = dev->irq_requested_type;
        r = kvm_deassign_irq(kvm_context, &assigned_irq_data);
        /* -ENXIO means no assigned irq */
        if (r && r != -ENXIO)
            perror("assign_irq: deassign");
    }

    assigned_irq_data.flags = KVM_DEV_IRQ_GUEST_INTX;
    if (dev->features & ASSIGNED_DEVICE_PREFER_MSI_MASK &&
        dev->cap.available & ASSIGNED_DEVICE_CAP_MSI)
        assigned_irq_data.flags |= KVM_DEV_IRQ_HOST_MSI;
    else
        assigned_irq_data.flags |= KVM_DEV_IRQ_HOST_INTX;
#endif

    r = kvm_assign_irq(kvm_context, &assigned_irq_data);
    if (r < 0) {
        fprintf(stderr, "Failed to assign irq for \"%s\": %s\n",
                dev->dev.qdev.id, strerror(-r));
        fprintf(stderr, "Perhaps you are assigning a device "
                "that shares an IRQ with another device?\n");
        return r;
    }

    dev->girq = irq;
    dev->irq_requested_type = assigned_irq_data.flags;
    return r;
}
1023
/*
 * Return the device to the host (KVM_DEASSIGN_PCI_DEVICE).  Compiled to
 * a no-op when the KVM headers lack deassignment support.
 */
static void deassign_device(AssignedDevice *dev)
{
#ifdef KVM_CAP_DEVICE_DEASSIGNMENT
    struct kvm_assigned_pci_dev assigned_dev_data;
    int r;

    memset(&assigned_dev_data, 0, sizeof(assigned_dev_data));
    assigned_dev_data.assigned_dev_id =
        calc_assigned_dev_id(dev->h_segnr, dev->h_busnr, dev->h_devfn);

    r = kvm_deassign_pci_device(kvm_context, &assigned_dev_data);
    if (r < 0)
        fprintf(stderr, "Failed to deassign device \"%s\" : %s\n",
                dev->dev.qdev.id, strerror(-r));
#endif
}
1040
#if 0
/* Dead code (compiled out): looked up an assigned device by its guest PCI
 * bus/slot location.  Kept only for reference. */
AssignedDevInfo *get_assigned_device(int pcibus, int slot)
{
    AssignedDevice *assigned_dev = NULL;
    AssignedDevInfo *adev = NULL;

    QLIST_FOREACH(adev, &adev_head, next) {
        assigned_dev = adev->assigned_dev;
        if (pci_bus_num(assigned_dev->dev.bus) == pcibus &&
            PCI_SLOT(assigned_dev->dev.devfn) == slot)
            return adev;
    }

    return NULL;
}
#endif
1057
1058 /* The pci config space got updated. Check if irq numbers have changed
1059 * for our devices
1060 */
assigned_dev_update_irqs(void)1061 void assigned_dev_update_irqs(void)
1062 {
1063 AssignedDevice *dev, *next;
1064 int r;
1065
1066 dev = QLIST_FIRST(&devs);
1067 while (dev) {
1068 next = QLIST_NEXT(dev, next);
1069 r = assign_irq(dev);
1070 if (r < 0)
1071 qdev_unplug(&dev->dev.qdev);
1072 dev = next;
1073 }
1074 }
1075
1076 #ifdef KVM_CAP_IRQ_ROUTING
1077
1078 #ifdef KVM_CAP_DEVICE_MSI
/* React to a guest write to the MSI capability's control byte.
 *
 * Any enable/disable transition first tears down the previously requested
 * IRQ routing.  When the guest enables MSI, a fresh routing entry is built
 * from the capability's address/data registers and assigned; on disable,
 * legacy INTx routing is restored via assign_irq().
 */
static void assigned_dev_update_msi(PCIDevice *pci_dev, unsigned int ctrl_pos)
{
    struct kvm_assigned_irq assigned_irq_data;
    AssignedDevice *assigned_dev = container_of(pci_dev, AssignedDevice, dev);
    uint8_t ctrl_byte = pci_dev->config[ctrl_pos];
    int r;

    memset(&assigned_irq_data, 0, sizeof assigned_irq_data);
    assigned_irq_data.assigned_dev_id =
        calc_assigned_dev_id(assigned_dev->h_segnr, assigned_dev->h_busnr,
                             (uint8_t)assigned_dev->h_devfn);

    /* Some guests gratuitously disable MSI even if they're not using it,
     * try to catch this by only deassigning irqs if the guest is using
     * MSI or intends to start. */
    if ((assigned_dev->irq_requested_type & KVM_DEV_IRQ_GUEST_MSI) ||
        (ctrl_byte & PCI_MSI_FLAGS_ENABLE)) {

        assigned_irq_data.flags = assigned_dev->irq_requested_type;
        free_dev_irq_entries(assigned_dev);
        r = kvm_deassign_irq(kvm_context, &assigned_irq_data);
        /* -ENXIO means no assigned irq */
        if (r && r != -ENXIO)
            perror("assigned_dev_update_msi: deassign irq");

        assigned_dev->irq_requested_type = 0;
    }

    if (ctrl_byte & PCI_MSI_FLAGS_ENABLE) {
        int pos = ctrl_pos - PCI_MSI_FLAGS;
        assigned_dev->entry = calloc(1, sizeof(struct kvm_irq_routing_entry));
        if (!assigned_dev->entry) {
            perror("assigned_dev_update_msi: ");
            return;
        }
        /* Only 32-bit MSI addresses are exposed to the guest (see the
         * capability setup in assigned_device_pci_cap_init). */
        assigned_dev->entry->u.msi.address_lo =
            pci_get_long(pci_dev->config + pos + PCI_MSI_ADDRESS_LO);
        assigned_dev->entry->u.msi.address_hi = 0;
        assigned_dev->entry->u.msi.data =
            pci_get_word(pci_dev->config + pos + PCI_MSI_DATA_32);
        assigned_dev->entry->type = KVM_IRQ_ROUTING_MSI;
        r = kvm_get_irq_route_gsi();
        if (r < 0) {
            perror("assigned_dev_update_msi: kvm_get_irq_route_gsi");
            /* Fix: the freshly allocated entry was leaked here --
             * irq_entries_nr is still 0, so free_dev_irq_entries()
             * would never release it. */
            free(assigned_dev->entry);
            assigned_dev->entry = NULL;
            return;
        }
        assigned_dev->entry->gsi = r;

        kvm_add_routing_entry(assigned_dev->entry);
        if (kvm_commit_irq_routes() < 0) {
            perror("assigned_dev_update_msi: kvm_commit_irq_routes");
            assigned_dev->cap.state &= ~ASSIGNED_DEVICE_MSI_ENABLED;
            return;
        }
        assigned_dev->irq_entries_nr = 1;

        assigned_irq_data.guest_irq = assigned_dev->entry->gsi;
        assigned_irq_data.flags = KVM_DEV_IRQ_HOST_MSI | KVM_DEV_IRQ_GUEST_MSI;
        if (kvm_assign_irq(kvm_context, &assigned_irq_data) < 0)
            perror("assigned_dev_enable_msi: assign irq");

        assigned_dev->girq = -1;
        assigned_dev->irq_requested_type = assigned_irq_data.flags;
    } else {
        assign_irq(assigned_dev);
    }
}
1146 #endif
1147
1148 #ifdef KVM_CAP_DEVICE_MSIX
/* Program KVM's MSI-X routing from the shadow MSI-X table page.
 *
 * Walks the shadow table twice: the first pass counts the entries the
 * guest actually programmed (non-zero message data), the second pass
 * allocates a GSI and a routing entry for each of them and hands them to
 * the kernel.  Returns 0 on success or a negative error code.
 */
static int assigned_dev_update_msix_mmio(PCIDevice *pci_dev)
{
    AssignedDevice *adev = container_of(pci_dev, AssignedDevice, dev);
    uint16_t entries_nr = 0, entries_max_nr;
    int pos = 0, i, r = 0;
    uint32_t msg_addr, msg_upper_addr, msg_data, msg_ctrl;
    struct kvm_assigned_msix_nr msix_nr;
    struct kvm_assigned_msix_entry msix_entry;
    void *va = adev->msix_table_page;

    pos = pci_find_capability(pci_dev, PCI_CAP_ID_MSIX);

    /* Table size field is N-1 encoded in the low bits of the message
     * control word (config offset pos + 2). */
    entries_max_nr = *(uint16_t *)(pci_dev->config + pos + 2);
    entries_max_nr &= PCI_MSIX_TABSIZE;
    entries_max_nr += 1;

    /* Get the usable entry number for allocating */
    for (i = 0; i < entries_max_nr; i++) {
        /* Each MSI-X table entry is 16 bytes: addr lo/hi, data, control. */
        memcpy(&msg_ctrl, va + i * 16 + 12, 4);
        memcpy(&msg_data, va + i * 16 + 8, 4);
        /* Ignore unused entry even it's unmasked */
        if (msg_data == 0)
            continue;
        entries_nr ++;
    }

    if (entries_nr == 0) {
        fprintf(stderr, "MSI-X entry number is zero!\n");
        return -EINVAL;
    }
    msix_nr.assigned_dev_id = calc_assigned_dev_id(adev->h_segnr, adev->h_busnr,
                                          (uint8_t)adev->h_devfn);
    msix_nr.entry_nr = entries_nr;
    r = kvm_assign_set_msix_nr(kvm_context, &msix_nr);
    if (r != 0) {
        fprintf(stderr, "fail to set MSI-X entry number for MSIX! %s\n",
			strerror(-r));
        return r;
    }

    /* Drop any previous routing before building the new entry array;
     * entries allocated below are released by free_dev_irq_entries(). */
    free_dev_irq_entries(adev);
    adev->irq_entries_nr = entries_nr;
    adev->entry = calloc(entries_nr, sizeof(struct kvm_irq_routing_entry));
    if (!adev->entry) {
        perror("assigned_dev_update_msix_mmio: ");
        return -errno;
    }

    msix_entry.assigned_dev_id = msix_nr.assigned_dev_id;
    entries_nr = 0;
    /* Second pass: allocate a GSI per programmed entry and register the
     * table-index -> GSI mapping with the kernel. */
    for (i = 0; i < entries_max_nr; i++) {
        if (entries_nr >= msix_nr.entry_nr)
            break;
        memcpy(&msg_ctrl, va + i * 16 + 12, 4);
        memcpy(&msg_data, va + i * 16 + 8, 4);
        if (msg_data == 0)
            continue;

        memcpy(&msg_addr, va + i * 16, 4);
        memcpy(&msg_upper_addr, va + i * 16 + 4, 4);

        r = kvm_get_irq_route_gsi();
        if (r < 0)
            return r;

        adev->entry[entries_nr].gsi = r;
        adev->entry[entries_nr].type = KVM_IRQ_ROUTING_MSI;
        adev->entry[entries_nr].flags = 0;
        adev->entry[entries_nr].u.msi.address_lo = msg_addr;
        adev->entry[entries_nr].u.msi.address_hi = msg_upper_addr;
        adev->entry[entries_nr].u.msi.data = msg_data;
        DEBUG("MSI-X data 0x%x, MSI-X addr_lo 0x%x\n!", msg_data, msg_addr);
	kvm_add_routing_entry(&adev->entry[entries_nr]);

        msix_entry.gsi = adev->entry[entries_nr].gsi;
        msix_entry.entry = i;
        r = kvm_assign_set_msix_entry(kvm_context, &msix_entry);
        if (r) {
            fprintf(stderr, "fail to set MSI-X entry! %s\n", strerror(-r));
            break;
        }
        DEBUG("MSI-X entry gsi 0x%x, entry %d\n!",
              msix_entry.gsi, msix_entry.entry);
        entries_nr ++;
    }

    if (r == 0 && kvm_commit_irq_routes() < 0) {
        perror("assigned_dev_update_msix_mmio: kvm_commit_irq_routes");
        return -EINVAL;
    }

    return r;
}
1242
/* React to a guest write to the MSI-X capability's control word.
 *
 * Mirrors assigned_dev_update_msi(): tear down the old routing on any
 * enable/disable transition, then either program MSI-X routing from the
 * shadow table or fall back to INTx via assign_irq().
 */
static void assigned_dev_update_msix(PCIDevice *pci_dev, unsigned int ctrl_pos)
{
    struct kvm_assigned_irq assigned_irq_data;
    AssignedDevice *assigned_dev = container_of(pci_dev, AssignedDevice, dev);
    uint16_t *ctrl_word = (uint16_t *)(pci_dev->config + ctrl_pos);
    int r;

    memset(&assigned_irq_data, 0, sizeof assigned_irq_data);
    assigned_irq_data.assigned_dev_id =
        calc_assigned_dev_id(assigned_dev->h_segnr, assigned_dev->h_busnr,
                             (uint8_t)assigned_dev->h_devfn);

    /* Some guests gratuitously disable MSIX even if they're not using it,
     * try to catch this by only deassigning irqs if the guest is using
     * MSIX or intends to start. */
    if ((assigned_dev->irq_requested_type & KVM_DEV_IRQ_GUEST_MSIX) ||
        (*ctrl_word & PCI_MSIX_ENABLE)) {

        assigned_irq_data.flags = assigned_dev->irq_requested_type;
        free_dev_irq_entries(assigned_dev);
        r = kvm_deassign_irq(kvm_context, &assigned_irq_data);
        /* -ENXIO means no assigned irq */
        if (r && r != -ENXIO)
            perror("assigned_dev_update_msix: deassign irq");

        assigned_dev->irq_requested_type = 0;
    }

    if (*ctrl_word & PCI_MSIX_ENABLE) {
        assigned_irq_data.flags = KVM_DEV_IRQ_HOST_MSIX |
                                  KVM_DEV_IRQ_GUEST_MSIX;

        /* Build the kernel routing from the shadow MSI-X table first,
         * then attach the device's interrupts to it. */
        if (assigned_dev_update_msix_mmio(pci_dev) < 0) {
            perror("assigned_dev_update_msix_mmio");
            return;
        }
        if (kvm_assign_irq(kvm_context, &assigned_irq_data) < 0) {
            perror("assigned_dev_enable_msix: assign irq");
            return;
        }
        assigned_dev->girq = -1;
        assigned_dev->irq_requested_type = assigned_irq_data.flags;
    } else {
        assign_irq(assigned_dev);
    }
}
1289 #endif
1290 #endif
1291
1292 /* There can be multiple VNDR capabilities per device, we need to find the
1293 * one that starts closet to the given address without going over. */
static uint8_t find_vndr_start(PCIDevice *pci_dev, uint32_t address)
{
    uint8_t best = 0;
    uint8_t offset = 0;

    /* Walk every vendor-specific capability and remember the highest
     * start offset that does not exceed the accessed address. */
    while ((offset = pci_find_cap_offset(pci_dev, PCI_CAP_ID_VNDR, offset))) {
        if (offset <= address && offset > best) {
            best = offset;
        }
        offset += PCI_CAP_LIST_NEXT;
    }
    return best;
}
1307
1308 /* Merge the bits set in mask from mval into val. Both val and mval are
1309 * at the same addr offset, pos is the starting offset of the mask. */
static uint32_t merge_bits(uint32_t val, uint32_t mval, uint8_t addr,
                           int len, uint8_t pos, uint32_t mask)
{
    /* Nothing to merge when the access window misses the masked dword. */
    if (!ranges_overlap(addr, len, pos, 4)) {
        return val;
    }

    /* Align the mask with the start of the access window... */
    if (addr >= pos) {
        mask >>= (addr - pos) * 8;
    } else {
        mask <<= (pos - addr) * 8;
    }
    /* ...and trim it to the access width (len is 1, 2 or 4 bytes). */
    mask &= 0xffffffffU >> (4 - len) * 8;

    /* Splice the masked bits of mval into val. */
    return (val & ~mask) | (mval & mask);
}
1329
static uint32_t assigned_device_pci_cap_read_config(PCIDevice *pci_dev,
                                                    uint32_t address, int len)
{
    uint8_t cap_id = pci_dev->config_map[address];
    uint8_t cap;

    /* VPD and vendor capabilities are read straight from hardware, but
     * the emulated next-capability pointer must be merged back in so the
     * guest keeps walking our virtual capability chain. */
    switch (cap_id) {
    case PCI_CAP_ID_VPD:
        cap = pci_find_capability(pci_dev, cap_id);
        break;
    case PCI_CAP_ID_VNDR:
        cap = find_vndr_start(pci_dev, address);
        break;
    default:
        return pci_default_read_config(pci_dev, address, len);
    }

    return merge_bits(assigned_dev_pci_read(pci_dev, address, len),
                      pci_get_long(pci_dev->config + address),
                      address, len, cap + PCI_CAP_LIST_NEXT, 0xff);
}
1353
/* Config-space write hook for capability regions.
 *
 * Applies the default emulated write first, then reacts to writes that
 * need side effects: MSI/MSI-X enable-bit flips re-program the KVM IRQ
 * routing, and VPD/vendor capability writes are passed through to the
 * physical device.
 */
static void assigned_device_pci_cap_write_config(PCIDevice *pci_dev,
                                                 uint32_t address,
                                                 uint32_t val, int len)
{
    uint8_t cap_id = pci_dev->config_map[address];

    pci_default_write_config(pci_dev, address, val, len);
    switch (cap_id) {
#ifdef KVM_CAP_IRQ_ROUTING
    case PCI_CAP_ID_MSI:
#ifdef KVM_CAP_DEVICE_MSI
        {
            uint8_t cap = pci_find_capability(pci_dev, cap_id);
            /* Only the flags byte (holding the enable bit) matters. */
            if (ranges_overlap(address - cap, len, PCI_MSI_FLAGS, 1)) {
                assigned_dev_update_msi(pci_dev, cap + PCI_MSI_FLAGS);
            }
        }
#endif
        break;

    case PCI_CAP_ID_MSIX:
#ifdef KVM_CAP_DEVICE_MSIX
        {
            uint8_t cap = pci_find_capability(pci_dev, cap_id);
            /* The MSI-X enable bit lives in the upper byte of the
             * message control word. */
            if (ranges_overlap(address - cap, len, PCI_MSIX_FLAGS + 1, 1)) {
                assigned_dev_update_msix(pci_dev, cap + PCI_MSIX_FLAGS);
            }
        }
#endif
        break;
#endif

    case PCI_CAP_ID_VPD:
    case PCI_CAP_ID_VNDR:
        /* Direct passthrough to the real device for these capabilities. */
        assigned_dev_pci_write(pci_dev, address, val, len);
        break;
    }
}
1392
/* Build the virtual capability list exposed to the guest.
 *
 * Starts from an empty capability chain and selectively re-adds sanitized
 * copies of the physical device's capabilities: MSI, MSI-X, PM, PCIe,
 * PCI-X, VPD and vendor-specific.  Fields the guest must not control are
 * masked read-only; host-specific values are cleared or rewritten.
 * Returns 0 on success or a negative error code.
 */
static int assigned_device_pci_cap_init(PCIDevice *pci_dev)
{
    AssignedDevice *dev = container_of(pci_dev, AssignedDevice, dev);
    PCIRegion *pci_region = dev->real_device.regions;
    int ret, pos;

    /* Clear initial capabilities pointer and status copied from hw */
    pci_set_byte(pci_dev->config + PCI_CAPABILITY_LIST, 0);
    pci_set_word(pci_dev->config + PCI_STATUS,
                 pci_get_word(pci_dev->config + PCI_STATUS) &
                 ~PCI_STATUS_CAP_LIST);

#ifdef KVM_CAP_IRQ_ROUTING
#ifdef KVM_CAP_DEVICE_MSI
    /* Expose MSI capability
     * MSI capability is the 1st capability in capability config */
    if ((pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_MSI, 0))) {
        dev->cap.available |= ASSIGNED_DEVICE_CAP_MSI;
        /* Only 32-bit/no-mask currently supported */
        if ((ret = pci_add_capability(pci_dev, PCI_CAP_ID_MSI, pos, 10)) < 0) {
            return ret;
        }

        /* Keep only the queue-size bits of the flags; address/data start
         * zeroed and are filled in by the guest. */
        pci_set_word(pci_dev->config + pos + PCI_MSI_FLAGS,
                     pci_get_word(pci_dev->config + pos + PCI_MSI_FLAGS) &
                     PCI_MSI_FLAGS_QMASK);
        pci_set_long(pci_dev->config + pos + PCI_MSI_ADDRESS_LO, 0);
        pci_set_word(pci_dev->config + pos + PCI_MSI_DATA_32, 0);

        /* Set writable fields */
        pci_set_word(pci_dev->wmask + pos + PCI_MSI_FLAGS,
                     PCI_MSI_FLAGS_QSIZE | PCI_MSI_FLAGS_ENABLE);
        pci_set_long(pci_dev->wmask + pos + PCI_MSI_ADDRESS_LO, 0xfffffffc);
        pci_set_word(pci_dev->wmask + pos + PCI_MSI_DATA_32, 0xffff);
    }
#endif
#ifdef KVM_CAP_DEVICE_MSIX
    /* Expose MSI-X capability */
    if ((pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_MSIX, 0))) {
        int bar_nr;
        uint32_t msix_table_entry;

        dev->cap.available |= ASSIGNED_DEVICE_CAP_MSIX;
        if ((ret = pci_add_capability(pci_dev, PCI_CAP_ID_MSIX, pos, 12)) < 0) {
            return ret;
        }

        /* Keep only the (N-1 encoded) table size in the control word. */
        pci_set_word(pci_dev->config + pos + PCI_MSIX_FLAGS,
                     pci_get_word(pci_dev->config + pos + PCI_MSIX_FLAGS) &
                     PCI_MSIX_TABSIZE);

        /* Only enable and function mask bits are writable */
        pci_set_word(pci_dev->wmask + pos + PCI_MSIX_FLAGS,
                     PCI_MSIX_FLAGS_ENABLE | PCI_MSIX_FLAGS_MASKALL);

        /* Remember where the table lives so the MMIO shadow page can be
         * mapped over it later. */
        msix_table_entry = pci_get_long(pci_dev->config + pos + PCI_MSIX_TABLE);
        bar_nr = msix_table_entry & PCI_MSIX_BIR;
        msix_table_entry &= ~PCI_MSIX_BIR;
        dev->msix_table_addr = pci_region[bar_nr].base_addr + msix_table_entry;
    }
#endif
#endif

    /* Minimal PM support, nothing writable, device appears to NAK changes */
    if ((pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_PM, 0))) {
        uint16_t pmc;
        if ((ret = pci_add_capability(pci_dev, PCI_CAP_ID_PM, pos,
                                      PCI_PM_SIZEOF)) < 0) {
            return ret;
        }

        pmc = pci_get_word(pci_dev->config + pos + PCI_CAP_FLAGS);
        pmc &= (PCI_PM_CAP_VER_MASK | PCI_PM_CAP_DSI);
        pci_set_word(pci_dev->config + pos + PCI_CAP_FLAGS, pmc);

        /* assign_device will bring the device up to D0, so we don't need
         * to worry about doing that ourselves here. */
        pci_set_word(pci_dev->config + pos + PCI_PM_CTRL,
                     PCI_PM_CTRL_NO_SOFT_RESET);

        pci_set_byte(pci_dev->config + pos + PCI_PM_PPB_EXTENSIONS, 0);
        pci_set_byte(pci_dev->config + pos + PCI_PM_DATA_REGISTER, 0);
    }

    if ((pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_EXP, 0))) {
        uint8_t version;
        uint16_t type, devctl, lnkcap, lnksta;
        uint32_t devcap;
        int size = 0x3c; /* version 2 size */

        version = pci_get_byte(pci_dev->config + pos + PCI_EXP_FLAGS);
        version &= PCI_EXP_FLAGS_VERS;
        if (version == 1) {
            size = 0x14;
        } else if (version > 2) {
            fprintf(stderr, "Unsupported PCI express capability version %d\n",
                    version);
            return -EINVAL;
        }

        if ((ret = pci_add_capability(pci_dev, PCI_CAP_ID_EXP,
                                      pos, size)) < 0) {
            return ret;
        }

        /* Only endpoint device types can be passed through. */
        type = pci_get_word(pci_dev->config + pos + PCI_EXP_FLAGS);
        type = (type & PCI_EXP_FLAGS_TYPE) >> 8;
        if (type != PCI_EXP_TYPE_ENDPOINT &&
            type != PCI_EXP_TYPE_LEG_END && type != PCI_EXP_TYPE_RC_END) {
            fprintf(stderr,
                    "Device assignment only supports endpoint assignment, "
                    "device type %d\n", type);
            return -EINVAL;
        }

        /* capabilities, pass existing read-only copy
         * PCI_EXP_FLAGS_IRQ: updated by hardware, should be direct read */

        /* device capabilities: hide FLR */
        devcap = pci_get_long(pci_dev->config + pos + PCI_EXP_DEVCAP);
        devcap &= ~PCI_EXP_DEVCAP_FLR;
        pci_set_long(pci_dev->config + pos + PCI_EXP_DEVCAP, devcap);

        /* device control: clear all error reporting enable bits, leaving
         *                 leaving only a few host values.  Note, these are
         *                 all writable, but not passed to hw.
         */
        devctl = pci_get_word(pci_dev->config + pos + PCI_EXP_DEVCTL);
        devctl = (devctl & (PCI_EXP_DEVCTL_READRQ | PCI_EXP_DEVCTL_PAYLOAD)) |
                 PCI_EXP_DEVCTL_RELAX_EN | PCI_EXP_DEVCTL_NOSNOOP_EN;
        pci_set_word(pci_dev->config + pos + PCI_EXP_DEVCTL, devctl);
        devctl = PCI_EXP_DEVCTL_BCR_FLR | PCI_EXP_DEVCTL_AUX_PME;
        pci_set_word(pci_dev->wmask + pos + PCI_EXP_DEVCTL, ~devctl);

        /* Clear device status */
        pci_set_word(pci_dev->config + pos + PCI_EXP_DEVSTA, 0);

        /* Link capabilities, expose links and latencues, clear reporting */
        lnkcap = pci_get_word(pci_dev->config + pos + PCI_EXP_LNKCAP);
        lnkcap &= (PCI_EXP_LNKCAP_SLS | PCI_EXP_LNKCAP_MLW |
                   PCI_EXP_LNKCAP_ASPMS | PCI_EXP_LNKCAP_L0SEL |
                   PCI_EXP_LNKCAP_L1EL);
        pci_set_word(pci_dev->config + pos + PCI_EXP_LNKCAP, lnkcap);
        pci_set_word(pci_dev->wmask + pos + PCI_EXP_LNKCAP,
                     PCI_EXP_LNKCTL_ASPMC | PCI_EXP_LNKCTL_RCB |
                     PCI_EXP_LNKCTL_CCC | PCI_EXP_LNKCTL_ES |
                     PCI_EXP_LNKCTL_CLKREQ_EN | PCI_EXP_LNKCTL_HAWD);

        /* Link control, pass existing read-only copy.  Should be writable? */

        /* Link status, only expose current speed and width */
        lnksta = pci_get_word(pci_dev->config + pos + PCI_EXP_LNKSTA);
        lnksta &= (PCI_EXP_LNKSTA_CLS | PCI_EXP_LNKSTA_NLW);
        pci_set_word(pci_dev->config + pos + PCI_EXP_LNKSTA, lnksta);

        if (version >= 2) {
            /* Slot capabilities, control, status - not needed for endpoints */
            pci_set_long(pci_dev->config + pos + PCI_EXP_SLTCAP, 0);
            pci_set_word(pci_dev->config + pos + PCI_EXP_SLTCTL, 0);
            pci_set_word(pci_dev->config + pos + PCI_EXP_SLTSTA, 0);

            /* Root control, capabilities, status - not needed for endpoints */
            pci_set_word(pci_dev->config + pos + PCI_EXP_RTCTL, 0);
            pci_set_word(pci_dev->config + pos + PCI_EXP_RTCAP, 0);
            pci_set_long(pci_dev->config + pos + PCI_EXP_RTSTA, 0);

            /* Device capabilities/control 2, pass existing read-only copy */
            /* Link control 2, pass existing read-only copy */
        }
    }

    if ((pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_PCIX, 0))) {
        uint16_t cmd;
        uint32_t status;

        /* Only expose the minimum, 8 byte capability */
        if ((ret = pci_add_capability(pci_dev, PCI_CAP_ID_PCIX, pos, 8)) < 0) {
            return ret;
        }

        /* Command register, clear upper bits, including extended modes */
        cmd = pci_get_word(pci_dev->config + pos + PCI_X_CMD);
        cmd &= (PCI_X_CMD_DPERR_E | PCI_X_CMD_ERO | PCI_X_CMD_MAX_READ |
                PCI_X_CMD_MAX_SPLIT);
        pci_set_word(pci_dev->config + pos + PCI_X_CMD, cmd);

        /* Status register, update with emulated PCI bus location, clear
         * error bits, leave the rest. */
        status = pci_get_long(pci_dev->config + pos + PCI_X_STATUS);
        status &= ~(PCI_X_STATUS_BUS | PCI_X_STATUS_DEVFN);
        status |= (pci_bus_num(pci_dev->bus) << 8) | pci_dev->devfn;
        status &= ~(PCI_X_STATUS_SPL_DISC | PCI_X_STATUS_UNX_SPL |
                    PCI_X_STATUS_SPL_ERR);
        pci_set_long(pci_dev->config + pos + PCI_X_STATUS, status);
    }

    if ((pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_VPD, 0))) {
        /* Direct R/W passthrough */
        if ((ret = pci_add_capability(pci_dev, PCI_CAP_ID_VPD, pos, 8)) < 0) {
            return ret;
        }
    }

    /* Devices can have multiple vendor capabilities, get them all */
    for (pos = 0; (pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_VNDR, pos));
        pos += PCI_CAP_LIST_NEXT) {
        uint8_t len = pci_get_byte(pci_dev->config + pos + PCI_CAP_FLAGS);
        /* Direct R/W passthrough */
        if ((ret = pci_add_capability(pci_dev, PCI_CAP_ID_VNDR,
                                      pos, len)) < 0) {
            return ret;
        }
    }

    return 0;
}
1609
/* 32-bit read from the shadow MSI-X table page. */
static uint32_t msix_mmio_readl(void *opaque, target_phys_addr_t addr)
{
    AssignedDevice *adev = opaque;
    uint32_t val;

    /* The table is confined to one 4K page; serve the dword from the
     * shadow copy. */
    memcpy(&val, (char *)adev->msix_table_page + (addr & 0xfff),
           sizeof(val));
    return val;
}
1621
/* 8-bit read: fetch the containing dword and extract the byte. */
static uint32_t msix_mmio_readb(void *opaque, target_phys_addr_t addr)
{
    uint32_t dword = msix_mmio_readl(opaque, addr & ~3);
    return (dword >> (8 * (addr & 3))) & 0xffU;
}
1627
/* 16-bit read: fetch the containing dword and extract the word. */
static uint32_t msix_mmio_readw(void *opaque, target_phys_addr_t addr)
{
    uint32_t dword = msix_mmio_readl(opaque, addr & ~3);
    return (dword >> (8 * (addr & 3))) & 0xffffU;
}
1633
/* 32-bit write to the shadow MSI-X table page. */
static void msix_mmio_writel(void *opaque,
                             target_phys_addr_t addr, uint32_t val)
{
    AssignedDevice *adev = opaque;
    unsigned int offset = addr & 0xfff;
    void *page = adev->msix_table_page;

    /* Fix: target_phys_addr_t is not necessarily unsigned long, so the
     * old "%lx" conversion was undefined behavior whenever
     * DEVICE_ASSIGNMENT_DEBUG was enabled; use TARGET_FMT_plx. */
    DEBUG("write to MSI-X entry table mmio offset 0x" TARGET_FMT_plx
          ", val 0x%x\n", addr, val);
    memcpy((char *)page + offset, &val, 4);
}
1645
/* 16-bit write to the shadow MSI-X table page.
 *
 * Fix: the old code passed (val << shift) straight to msix_mmio_writel(),
 * zero-filling the untouched bytes of the containing dword and thereby
 * corrupting neighboring table fields on sub-dword guest accesses.
 * Do a proper read-modify-write instead.
 */
static void msix_mmio_writew(void *opaque,
                             target_phys_addr_t addr, uint32_t val)
{
    unsigned int shift = 8 * (addr & 3);
    uint32_t dword = msix_mmio_readl(opaque, addr & ~3);

    dword &= ~(0xffffU << shift);
    dword |= (val & 0xffff) << shift;
    msix_mmio_writel(opaque, addr & ~3, dword);
}
1652
/* 8-bit write to the shadow MSI-X table page.
 *
 * Fix: same defect as msix_mmio_writew() had -- the shifted byte was
 * written as a full dword, zeroing the three sibling bytes.  Preserve
 * them with a read-modify-write.
 */
static void msix_mmio_writeb(void *opaque,
                             target_phys_addr_t addr, uint32_t val)
{
    unsigned int shift = 8 * (addr & 3);
    uint32_t dword = msix_mmio_readl(opaque, addr & ~3);

    dword &= ~(0xffU << shift);
    dword |= (val & 0xff) << shift;
    msix_mmio_writel(opaque, addr & ~3, dword);
}
1659
/* MMIO dispatch tables for the shadow MSI-X table region, indexed by
 * access size (byte, word, long). */
static CPUWriteMemoryFunc *msix_mmio_write[] = {
    msix_mmio_writeb,	msix_mmio_writew,	msix_mmio_writel
};

static CPUReadMemoryFunc *msix_mmio_read[] = {
    msix_mmio_readb,	msix_mmio_readw,	msix_mmio_readl
};
1667
/* Allocate the one-page shadow MSI-X table and register its MMIO handlers.
 * Returns 0 on success, -EFAULT if the page cannot be mapped. */
static int assigned_dev_register_msix_mmio(AssignedDevice *dev)
{
    /* Fix: pass fd = -1 for the anonymous mapping.  POSIX leaves the fd
     * unspecified for MAP_ANONYMOUS and several implementations require
     * -1; fd 0 only happened to work on Linux. */
    dev->msix_table_page = mmap(NULL, 0x1000,
                                PROT_READ|PROT_WRITE,
                                MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
    if (dev->msix_table_page == MAP_FAILED) {
        fprintf(stderr, "fail allocate msix_table_page! %s\n",
                strerror(errno));
        return -EFAULT;
    }
    memset(dev->msix_table_page, 0, 0x1000);
    dev->mmio_index = cpu_register_io_memory(
                        msix_mmio_read, msix_mmio_write, dev,
                        DEVICE_NATIVE_ENDIAN);
    return 0;
}
1684
/* Tear down the shadow MSI-X table page and its MMIO registration. */
static void assigned_dev_unregister_msix_mmio(AssignedDevice *dev)
{
    if (dev->msix_table_page == NULL) {
        return;
    }

    cpu_unregister_io_memory(dev->mmio_index);
    dev->mmio_index = 0;

    if (munmap(dev->msix_table_page, 0x1000) < 0) {
        fprintf(stderr, "error unmapping msix_table_page! %s\n",
                strerror(errno));
    }
    dev->msix_table_page = NULL;
}
1699
/* Empty vmstate descriptor: assigned devices cannot be migrated, so this
 * exists only to give the unmigratable registration a name. */
static const VMStateDescription vmstate_assigned_device = {
    .name = "pci-assign",
    .fields          = (VMStateField []) {
        VMSTATE_END_OF_LIST()
    }
};
1706
/* qdev reset callback: quiesce the physical device on guest reset. */
static void reset_assigned_device(DeviceState *dev)
{
    PCIDevice *d = DO_UPCAST(PCIDevice, qdev, dev);

    /*
     * When a 0 is written to the command register, the device is logically
     * disconnected from the PCI bus. This avoids further DMA transfers.
     */
    assigned_dev_pci_write_config(d, PCI_COMMAND, 0, 2);
}
1717
assigned_initfn(struct PCIDevice * pci_dev)1718 static int assigned_initfn(struct PCIDevice *pci_dev)
1719 {
1720 AssignedDevice *dev = DO_UPCAST(AssignedDevice, dev, pci_dev);
1721 uint8_t e_device, e_intx;
1722 int r;
1723
1724 if (!kvm_enabled()) {
1725 error_report("pci-assign: error: requires KVM support");
1726 return -1;
1727 }
1728
1729 if (!dev->host.seg && !dev->host.bus && !dev->host.dev && !dev->host.func) {
1730 error_report("pci-assign: error: no host device specified");
1731 return -1;
1732 }
1733
1734 if (get_real_device(dev, dev->host.seg, dev->host.bus,
1735 dev->host.dev, dev->host.func)) {
1736 error_report("pci-assign: Error: Couldn't get real device (%s)!",
1737 dev->dev.qdev.id);
1738 goto out;
1739 }
1740
1741 /* handle real device's MMIO/PIO BARs */
1742 if (assigned_dev_register_regions(dev->real_device.regions,
1743 dev->real_device.region_number,
1744 dev))
1745 goto out;
1746
1747 /* handle interrupt routing */
1748 e_device = (dev->dev.devfn >> 3) & 0x1f;
1749 e_intx = dev->dev.config[0x3d] - 1;
1750 dev->intpin = e_intx;
1751 dev->run = 0;
1752 dev->girq = -1;
1753 dev->h_segnr = dev->host.seg;
1754 dev->h_busnr = dev->host.bus;
1755 dev->h_devfn = PCI_DEVFN(dev->host.dev, dev->host.func);
1756
1757 if (assigned_device_pci_cap_init(pci_dev) < 0)
1758 goto out;
1759
1760 /* assign device to guest */
1761 r = assign_device(dev);
1762 if (r < 0)
1763 goto out;
1764
1765 /* assign irq for the device */
1766 r = assign_irq(dev);
1767 if (r < 0)
1768 goto assigned_out;
1769
1770 /* intercept MSI-X entry page in the MMIO */
1771 if (dev->cap.available & ASSIGNED_DEVICE_CAP_MSIX)
1772 if (assigned_dev_register_msix_mmio(dev))
1773 goto assigned_out;
1774
1775 assigned_dev_load_option_rom(dev);
1776 QLIST_INSERT_HEAD(&devs, dev, next);
1777
1778 add_boot_device_path(dev->bootindex, &pci_dev->qdev, NULL);
1779
1780 /* Register a vmsd so that we can mark it unmigratable. */
1781 vmstate_register(&dev->dev.qdev, 0, &vmstate_assigned_device, dev);
1782 register_device_unmigratable(&dev->dev.qdev,
1783 vmstate_assigned_device.name, dev);
1784
1785 return 0;
1786
1787 assigned_out:
1788 deassign_device(dev);
1789 out:
1790 free_assigned_device(dev);
1791 return -1;
1792 }
1793
/* qdev exit callback: undo everything assigned_initfn() set up. */
static int assigned_exitfn(struct PCIDevice *pci_dev)
{
    AssignedDevice *dev = DO_UPCAST(AssignedDevice, dev, pci_dev);

    vmstate_unregister(&dev->dev.qdev, &vmstate_assigned_device, dev);
    QLIST_REMOVE(dev, next);
    deassign_device(dev);
    free_assigned_device(dev);
    return 0;
}
1804
/* qdev property parser for the "host" address ([seg:]bus:dev.func). */
static int parse_hostaddr(DeviceState *dev, Property *prop, const char *str)
{
    PCIHostDevice *ptr = qdev_get_prop_ptr(dev, prop);

    if (pci_parse_host_devaddr(str, &ptr->seg, &ptr->bus,
                               &ptr->dev, &ptr->func) != 0) {
        return -1;
    }
    return 0;
}
1815
/* qdev property printer for the "host" address, formatted as bus:dev.func.
 * NOTE(review): the segment (ptr->seg) is not printed, so non-zero PCI
 * domains do not round-trip through print/parse -- confirm whether that
 * is intentional. */
static int print_hostaddr(DeviceState *dev, Property *prop, char *dest, size_t len)
{
    PCIHostDevice *ptr = qdev_get_prop_ptr(dev, prop);

    return snprintf(dest, len, "%02x:%02x.%x", ptr->bus, ptr->dev, ptr->func);
}
1822
/* Custom qdev property type for specifying a host PCI device address. */
PropertyInfo qdev_prop_hostaddr = {
    .name  = "pci-hostaddr",
    .type  = -1,
    .size  = sizeof(PCIHostDevice),
    .parse = parse_hostaddr,
    .print = print_hostaddr,
};
1830
/* qdev registration record for the "pci-assign" device, including its
 * user-settable properties (host address, IOMMU/MSI preferences, boot
 * index, and an optional pre-opened config fd). */
static PCIDeviceInfo assign_info = {
    .qdev.name    = "pci-assign",
    .qdev.desc    = "pass through host pci devices to the guest",
    .qdev.size    = sizeof(AssignedDevice),
    .qdev.reset   = reset_assigned_device,
    .init         = assigned_initfn,
    .exit         = assigned_exitfn,
    .config_read  = assigned_dev_pci_read_config,
    .config_write = assigned_dev_pci_write_config,
    .qdev.props   = (Property[]) {
        DEFINE_PROP("host", AssignedDevice, host, qdev_prop_hostaddr, PCIHostDevice),
        DEFINE_PROP_BIT("iommu", AssignedDevice, features,
                        ASSIGNED_DEVICE_USE_IOMMU_BIT, true),
        DEFINE_PROP_BIT("prefer_msi", AssignedDevice, features,
                        ASSIGNED_DEVICE_PREFER_MSI_BIT, true),
        DEFINE_PROP_INT32("bootindex", AssignedDevice, bootindex, -1),
        DEFINE_PROP_STRING("configfd", AssignedDevice, configfd_name),
        DEFINE_PROP_END_OF_LIST(),
    },
};
1851
/* Register the pci-assign qdev type with the PCI qdev core. */
static void assign_register_devices(void)
{
    pci_qdev_register(&assign_info);
}
1856
/* Hook the registration function into QEMU's module-init machinery. */
device_init(assign_register_devices)
1858
1859 /*
1860 * Scan the assigned devices for the devices that have an option ROM, and then
1861 * load the corresponding ROM data to RAM. If an error occurs while loading an
1862 * option ROM, we just ignore that option ROM and continue with the next one.
1863 */
static void assigned_dev_load_option_rom(AssignedDevice *dev)
{
    char name[32], rom_file[64];
    FILE *fp;
    uint8_t val;
    struct stat st;
    void *ptr;

    /* If loading ROM from file, pci handles it */
    if (dev->dev.romfile || !dev->dev.rom_bar)
        return;

    snprintf(rom_file, sizeof(rom_file),
             "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/rom",
             dev->host.seg, dev->host.bus, dev->host.dev, dev->host.func);

    /* Silently skip devices without an option ROM. */
    if (stat(rom_file, &st)) {
        return;
    }

    /* NOTE(review): F_OK only tests existence, not read/write permission,
     * yet the message claims "insufficient privileges" -- presumably
     * R_OK|W_OK was intended; confirm before changing behavior. */
    if (access(rom_file, F_OK)) {
        fprintf(stderr, "pci-assign: Insufficient privileges for %s\n",
                rom_file);
        return;
    }

    /* Write "1" to the ROM file to enable it */
    fp = fopen(rom_file, "r+");
    if (fp == NULL) {
        return;
    }
    val = 1;
    if (fwrite(&val, 1, 1, fp) != 1) {
        goto close_rom;
    }
    fseek(fp, 0, SEEK_SET);

    /* Copy the ROM contents into guest RAM, 0xff-filled like real
     * unprogrammed flash. */
    snprintf(name, sizeof(name), "%s.rom", dev->dev.qdev.info->name);
    dev->dev.rom_offset = qemu_ram_alloc(&dev->dev.qdev, name, st.st_size);
    ptr = qemu_get_ram_ptr(dev->dev.rom_offset);
    memset(ptr, 0xff, st.st_size);

    if (!fread(ptr, 1, st.st_size, fp)) {
        fprintf(stderr, "pci-assign: Cannot read from host %s\n"
                "\tDevice option ROM contents are probably invalid "
                "(check dmesg).\n\tSkip option ROM probe with rombar=0, "
                "or load from file with romfile=\n", rom_file);
        qemu_ram_free(dev->dev.rom_offset);
        dev->dev.rom_offset = 0;
        goto close_rom;
    }

    pci_register_bar(&dev->dev, PCI_ROM_SLOT,
                     st.st_size, 0, pci_map_option_rom);
close_rom:
    /* Write "0" to disable ROM */
    fseek(fp, 0, SEEK_SET);
    val = 0;
    if (!fwrite(&val, 1, 1, fp)) {
        DEBUG("%s\n", "Failed to disable pci-sysfs rom file");
    }
    fclose(fp);
}
1927