xref: /illumos-kvm-cmd/hw/device-assignment.c (revision 68396ea9)
1 /*
2  * Copyright (c) 2007, Neocleus Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  *
18  *  Assign a PCI device from the host to a guest VM.
19  *
20  *  Adapted for KVM by Qumranet.
21  *
22  *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
23  *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
24  *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
25  *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
26  *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
27  */
28 #include <stdio.h>
29 #include <unistd.h>
30 #include <sys/io.h>
31 #include <sys/types.h>
32 #include <sys/stat.h>
33 #include "qemu-kvm.h"
34 #include "hw.h"
35 #include "pc.h"
36 #include "qemu-error.h"
37 #include "console.h"
38 #include "device-assignment.h"
39 #include "loader.h"
40 #include "monitor.h"
41 #include "range.h"
42 #include <pci/header.h>
43 #include "sysemu.h"
44 
45 /* From linux/ioport.h */
46 #define IORESOURCE_IO       0x00000100  /* Resource type */
47 #define IORESOURCE_MEM      0x00000200
48 #define IORESOURCE_IRQ      0x00000400
49 #define IORESOURCE_DMA      0x00000800
50 #define IORESOURCE_PREFETCH 0x00001000  /* No side effects */
51 
52 /* #define DEVICE_ASSIGNMENT_DEBUG 1 */
53 
54 #ifdef DEVICE_ASSIGNMENT_DEBUG
55 #define DEBUG(fmt, ...)                                       \
56     do {                                                      \
57       fprintf(stderr, "%s: " fmt, __func__ , __VA_ARGS__);    \
58     } while (0)
59 #else
60 #define DEBUG(fmt, ...) do { } while(0)
61 #endif
62 
63 static void assigned_dev_load_option_rom(AssignedDevice *dev);
64 
65 static void assigned_dev_unregister_msix_mmio(AssignedDevice *dev);
66 
67 static void assigned_device_pci_cap_write_config(PCIDevice *pci_dev,
68                                                  uint32_t address,
69                                                  uint32_t val, int len);
70 
71 static uint32_t assigned_device_pci_cap_read_config(PCIDevice *pci_dev,
72                                                     uint32_t address, int len);
73 
/*
 * Common I/O-port access helper for an assigned device's port BAR.
 *
 * If val is non-NULL this is a write of *val, otherwise a read whose
 * result is returned.  When the region has a sysfs resource fd the
 * access goes through pread()/pwrite() on that fd; otherwise raw
 * in/out instructions are issued on the host port directly.
 *
 * On a failed pread the all-ones pattern for the access width is
 * returned, mimicking a master-abort on a real PCI bus.
 */
static uint32_t assigned_dev_ioport_rw(AssignedDevRegion *dev_region,
                                       uint32_t addr, int len, uint32_t *val)
{
    uint32_t ret = 0;
    uint32_t offset = addr - dev_region->e_physbase;
    int fd = dev_region->region->resource_fd;

    if (fd >= 0) {
        if (val) {
            DEBUG("pwrite val=%x, len=%d, e_phys=%x, offset=%x\n",
                  *val, len, addr, offset);
            if (pwrite(fd, val, len, offset) != len) {
                fprintf(stderr, "%s - pwrite failed %s\n",
                        __func__, strerror(errno));
            }
        } else {
            if (pread(fd, &ret, len, offset) != len) {
                fprintf(stderr, "%s - pread failed %s\n",
                        __func__, strerror(errno));
                /* Use 1ULL so the shift is well-defined for len == 4;
                 * 1UL << 32 is undefined behavior on 32-bit hosts. */
                ret = (uint32_t)((1ULL << (len * 8)) - 1);
            }
            DEBUG("pread ret=%x, len=%d, e_phys=%x, offset=%x\n",
                  ret, len, addr, offset);
        }
    } else {
        /* No resource fd: fall back to raw port access on the host. */
        uint32_t port = offset + dev_region->u.r_baseport;

        if (val) {
            DEBUG("out val=%x, len=%d, e_phys=%x, host=%x\n",
                  *val, len, addr, port);
            switch (len) {
                case 1:
                    outb(*val, port);
                    break;
                case 2:
                    outw(*val, port);
                    break;
                case 4:
                    outl(*val, port);
                    break;
            }
        } else {
            switch (len) {
                case 1:
                    ret = inb(port);
                    break;
                case 2:
                    ret = inw(port);
                    break;
                case 4:
                    ret = inl(port);
                    break;
            }
            DEBUG("in val=%x, len=%d, e_phys=%x, host=%x\n",
                  ret, len, addr, port);
        }
    }
    return ret;
}
133 
/* Byte-wide guest port write: forward to the common helper. */
static void assigned_dev_ioport_writeb(void *opaque, uint32_t addr,
                                       uint32_t value)
{
    assigned_dev_ioport_rw(opaque, addr, 1, &value);
}
140 
/* Word-wide guest port write: forward to the common helper. */
static void assigned_dev_ioport_writew(void *opaque, uint32_t addr,
                                       uint32_t value)
{
    assigned_dev_ioport_rw(opaque, addr, 2, &value);
}
147 
/* Long-wide guest port write: forward to the common helper. */
static void assigned_dev_ioport_writel(void *opaque, uint32_t addr,
                                       uint32_t value)
{
    assigned_dev_ioport_rw(opaque, addr, 4, &value);
}
154 
assigned_dev_ioport_readb(void * opaque,uint32_t addr)155 static uint32_t assigned_dev_ioport_readb(void *opaque, uint32_t addr)
156 {
157     return assigned_dev_ioport_rw(opaque, addr, 1, NULL);
158 }
159 
assigned_dev_ioport_readw(void * opaque,uint32_t addr)160 static uint32_t assigned_dev_ioport_readw(void *opaque, uint32_t addr)
161 {
162     return assigned_dev_ioport_rw(opaque, addr, 2, NULL);
163 }
164 
assigned_dev_ioport_readl(void * opaque,uint32_t addr)165 static uint32_t assigned_dev_ioport_readl(void *opaque, uint32_t addr)
166 {
167     return assigned_dev_ioport_rw(opaque, addr, 4, NULL);
168 }
169 
/* Trapped byte read from a slow-mapped BAR (direct load from the
 * mmap()ed host region). */
static uint32_t slow_bar_readb(void *opaque, target_phys_addr_t addr)
{
    AssignedDevRegion *d = opaque;
    uint8_t *in = d->u.r_virtbase + addr;
    uint32_t r;

    r = *in;
    /* Fix: the message previously claimed "slow_bar_readl" and used a
     * 32-bit width specifier for a byte value. */
    DEBUG("slow_bar_readb addr=0x" TARGET_FMT_plx " val=0x%02x\n", addr, r);

    return r;
}
181 
/* Trapped word read from a slow-mapped BAR. */
static uint32_t slow_bar_readw(void *opaque, target_phys_addr_t addr)
{
    AssignedDevRegion *d = opaque;
    uint16_t *in = d->u.r_virtbase + addr;
    uint32_t r;

    r = *in;
    /* Fix: the message previously claimed "slow_bar_readl" and used a
     * 32-bit width specifier for a 16-bit value. */
    DEBUG("slow_bar_readw addr=0x" TARGET_FMT_plx " val=0x%04x\n", addr, r);

    return r;
}
193 
/* Trapped long read from a slow-mapped BAR. */
static uint32_t slow_bar_readl(void *opaque, target_phys_addr_t addr)
{
    AssignedDevRegion *region = opaque;
    uint32_t *src = region->u.r_virtbase + addr;
    uint32_t value = *src;

    DEBUG("slow_bar_readl addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, value);
    return value;
}
205 
/* Trapped byte write to a slow-mapped BAR (direct store into the
 * mmap()ed host region). */
static void slow_bar_writeb(void *opaque, target_phys_addr_t addr, uint32_t val)
{
    AssignedDevRegion *region = opaque;
    uint8_t *dst = region->u.r_virtbase + addr;

    DEBUG("slow_bar_writeb addr=0x" TARGET_FMT_plx " val=0x%02x\n", addr, val);
    *dst = val;
}
214 
/* Trapped word write to a slow-mapped BAR. */
static void slow_bar_writew(void *opaque, target_phys_addr_t addr, uint32_t val)
{
    AssignedDevRegion *region = opaque;
    uint16_t *dst = region->u.r_virtbase + addr;

    DEBUG("slow_bar_writew addr=0x" TARGET_FMT_plx " val=0x%04x\n", addr, val);
    *dst = val;
}
223 
/* Trapped long write to a slow-mapped BAR. */
static void slow_bar_writel(void *opaque, target_phys_addr_t addr, uint32_t val)
{
    AssignedDevRegion *region = opaque;
    uint32_t *dst = region->u.r_virtbase + addr;

    DEBUG("slow_bar_writel addr=0x" TARGET_FMT_plx " val=0x%08x\n", addr, val);
    *dst = val;
}
232 
/* Dispatch tables (byte/word/long) for trapped access to slow-mapped
 * BARs, passed to cpu_register_io_memory() below. */
static CPUWriteMemoryFunc * const slow_bar_write[] = {
    &slow_bar_writeb,
    &slow_bar_writew,
    &slow_bar_writel
};

static CPUReadMemoryFunc * const slow_bar_read[] = {
    &slow_bar_readb,
    &slow_bar_readw,
    &slow_bar_readl
};
244 
/*
 * BAR mapping callback for memory regions that are not mapped directly
 * into the guest (see the slow_map decision in
 * assigned_dev_register_regions): every guest access traps into the
 * slow_bar_* handlers above.
 */
static void assigned_dev_iomem_map_slow(PCIDevice *pci_dev, int region_num,
                                        pcibus_t e_phys, pcibus_t e_size,
                                        int type)
{
    AssignedDevice *r_dev = container_of(pci_dev, AssignedDevice, dev);
    AssignedDevRegion *region = &r_dev->v_addrs[region_num];
    PCIRegion *real_region = &r_dev->real_device.regions[region_num];
    int m;

    DEBUG("%s", "slow map\n");
    m = cpu_register_io_memory(slow_bar_read, slow_bar_write, region,
                               DEVICE_NATIVE_ENDIAN);
    cpu_register_physical_memory(e_phys, e_size, m);

    /* MSI-X MMIO page: if the MSI-X table falls inside this BAR, overlay
     * its page with the emulated MSI-X region (r_dev->mmio_index). */
    if ((e_size > 0) &&
        real_region->base_addr <= r_dev->msix_table_addr &&
        real_region->base_addr + real_region->size >= r_dev->msix_table_addr) {
        int offset = r_dev->msix_table_addr - real_region->base_addr;

        cpu_register_physical_memory(e_phys + offset,
                TARGET_PAGE_SIZE, r_dev->mmio_index);
    }
}
269 
/*
 * BAR mapping callback for fast (RAM-backed) memory regions: register
 * the mmap()ed host memory at the guest physical address programmed
 * into the BAR, and overlay the MSI-X table page with the emulated
 * MSI-X MMIO region when the table falls inside this BAR.
 *
 * The former "if (ret != 0) ... exit(1)" branch was dead code: ret was
 * initialized to 0 and never assigned, so it has been removed.
 */
static void assigned_dev_iomem_map(PCIDevice *pci_dev, int region_num,
                                   pcibus_t e_phys, pcibus_t e_size, int type)
{
    AssignedDevice *r_dev = container_of(pci_dev, AssignedDevice, dev);
    AssignedDevRegion *region = &r_dev->v_addrs[region_num];
    PCIRegion *real_region = &r_dev->real_device.regions[region_num];

    DEBUG("e_phys=%08" FMT_PCIBUS " r_virt=%p type=%d len=%08" FMT_PCIBUS " region_num=%d \n",
          e_phys, region->u.r_virtbase, type, e_size, region_num);

    region->e_physbase = e_phys;
    region->e_size = e_size;

    if (e_size > 0) {
        cpu_register_physical_memory(e_phys, e_size, region->memory_index);

        /* deal with MSI-X MMIO page */
        if (real_region->base_addr <= r_dev->msix_table_addr &&
                real_region->base_addr + real_region->size >=
                r_dev->msix_table_addr) {
            int offset = r_dev->msix_table_addr - real_region->base_addr;

            cpu_register_physical_memory(e_phys + offset,
                    TARGET_PAGE_SIZE, r_dev->mmio_index);
        }
    }
}
303 
/*
 * I/O-port BAR mapping callback.  Records the guest address of the
 * region, and on the first mapping of a region with no usable sysfs
 * resource fd, grants the guest direct host port access through
 * kvm_ioperm() on every vcpu.  Byte/word/long ioport handlers that
 * forward to assigned_dev_ioport_rw() are then (re)registered at the
 * new address.
 */
static void assigned_dev_ioport_map(PCIDevice *pci_dev, int region_num,
                                    pcibus_t addr, pcibus_t size, int type)
{
    AssignedDevice *r_dev = container_of(pci_dev, AssignedDevice, dev);
    AssignedDevRegion *region = &r_dev->v_addrs[region_num];
    int first_map = (region->e_size == 0);  /* e_size stays 0 until mapped */
    CPUState *env;

    region->e_physbase = addr;
    region->e_size = size;

    DEBUG("e_phys=0x%" FMT_PCIBUS " r_baseport=%x type=0x%x len=%" FMT_PCIBUS " region_num=%d \n",
          addr, region->u.r_baseport, type, size, region_num);

    if (first_map && region->region->resource_fd < 0) {
        struct ioperm_data *data;

        data = qemu_mallocz(sizeof(struct ioperm_data));
        if (data == NULL) {
            fprintf(stderr, "%s: Out of memory\n", __func__);
            exit(1);
        }

        /* Open the host port range [r_baseport, r_baseport + r_size)
         * to the guest on each vcpu. */
        data->start_port = region->u.r_baseport;
        data->num = region->r_size;
        data->turn_on = 1;

        kvm_add_ioperm_data(data);

        for (env = first_cpu; env; env = env->next_cpu)
            kvm_ioperm(env, data);
    }

    register_ioport_read(addr, size, 1, assigned_dev_ioport_readb,
                         (r_dev->v_addrs + region_num));
    register_ioport_read(addr, size, 2, assigned_dev_ioport_readw,
                         (r_dev->v_addrs + region_num));
    register_ioport_read(addr, size, 4, assigned_dev_ioport_readl,
                         (r_dev->v_addrs + region_num));
    register_ioport_write(addr, size, 1, assigned_dev_ioport_writeb,
                          (r_dev->v_addrs + region_num));
    register_ioport_write(addr, size, 2, assigned_dev_ioport_writew,
                          (r_dev->v_addrs + region_num));
    register_ioport_write(addr, size, 4, assigned_dev_ioport_writel,
                          (r_dev->v_addrs + region_num));
}
350 
/*
 * Read len bytes at offset pos from the host device's real config
 * space.  Retries on EINTR/EAGAIN; any other failure is fatal.
 */
static uint32_t assigned_dev_pci_read(PCIDevice *d, int pos, int len)
{
    AssignedDevice *pci_dev = container_of(d, AssignedDevice, dev);
    uint32_t val;
    ssize_t ret;
    int fd = pci_dev->real_device.config_fd;

    do {
        ret = pread(fd, &val, len, pos);
    } while (ret < 0 && (errno == EINTR || errno == EAGAIN));

    if (ret != len) {
        fprintf(stderr, "%s: pread failed, ret = %zd errno = %d\n",
                __func__, ret, errno);
        exit(1);
    }

    return val;
}
372 
/* Single-byte convenience wrapper around assigned_dev_pci_read(). */
static uint8_t assigned_dev_pci_read_byte(PCIDevice *d, int pos)
{
    uint32_t word = assigned_dev_pci_read(d, pos, 1);

    return (uint8_t)word;
}
377 
/*
 * Write len bytes of val at offset pos into the host device's real
 * config space.  Retries on EINTR/EAGAIN; any other failure is fatal.
 */
static void assigned_dev_pci_write(PCIDevice *d, int pos, uint32_t val, int len)
{
    AssignedDevice *pci_dev = container_of(d, AssignedDevice, dev);
    ssize_t ret;
    int fd = pci_dev->real_device.config_fd;

    do {
        ret = pwrite(fd, &val, len, pos);
    } while (ret < 0 && (errno == EINTR || errno == EAGAIN));

    if (ret != len) {
        fprintf(stderr, "%s: pwrite failed, ret = %zd errno = %d\n",
                __func__, ret, errno);
        exit(1);
    }
}
398 
/*
 * Walk the device's real capability list looking for capability id
 * `cap`, starting at `start` (or the standard list head when start is
 * 0).  Returns the config-space offset of the capability, or 0 when it
 * is absent.  The walk is bounded to guard against looped lists.
 */
static uint8_t pci_find_cap_offset(PCIDevice *d, uint8_t cap, uint8_t start)
{
    int pos = start ? start : PCI_CAPABILITY_LIST;
    int status = assigned_dev_pci_read_byte(d, PCI_STATUS);
    int id;
    int loops;

    if (!(status & PCI_STATUS_CAP_LIST)) {
        return 0;
    }

    /* 48 iterations is an upper bound on list length (256-byte space,
     * 4-byte-aligned entries outside the 0x40-byte header). */
    for (loops = 48; loops > 0; loops--) {
        pos = assigned_dev_pci_read_byte(d, pos);
        if (pos < 0x40) {
            break;
        }

        pos &= ~3;
        id = assigned_dev_pci_read_byte(d, pos + PCI_CAP_LIST_ID);

        if (id == 0xff) {
            break;
        }
        if (id == cap) {
            return pos;
        }

        pos += PCI_CAP_LIST_NEXT;
    }
    return 0;
}
427 
/*
 * Guest write to PCI config space.  Emulated ranges (capabilities,
 * COMMAND, BARs, ROM address, capability pointer, interrupt line/pin)
 * go through QEMU's config emulation; everything else is passed
 * straight through to the real device via the config fd.
 */
static void assigned_dev_pci_write_config(PCIDevice *d, uint32_t address,
                                          uint32_t val, int len)
{
    int fd;
    ssize_t ret;
    AssignedDevice *pci_dev = container_of(d, AssignedDevice, dev);

    DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n",
          ((d->devfn >> 3) & 0x1F), (d->devfn & 0x7),
          (uint16_t) address, val, len);

    /* Writes landing in an emulated capability are handled separately. */
    if (address >= PCI_CONFIG_HEADER_SIZE && d->config_map[address]) {
        return assigned_device_pci_cap_write_config(d, address, val, len);
    }

    if (address == 0x4) {   /* COMMAND: emulate AND pass through below */
        pci_default_write_config(d, address, val, len);
        /* Continue to program the card */
    }

    /* 0x10-0x24: BARs, 0x30: expansion ROM, 0x34: capability pointer,
     * 0x3c/0x3d: interrupt line/pin. */
    if ((address >= 0x10 && address <= 0x24) || address == 0x30 ||
        address == 0x34 || address == 0x3c || address == 0x3d) {
        /* used for update-mappings (BAR emulation) */
        pci_default_write_config(d, address, val, len);
        return;
    }

    DEBUG("NON BAR (%x.%x): address=%04x val=0x%08x len=%d\n",
          ((d->devfn >> 3) & 0x1F), (d->devfn & 0x7),
          (uint16_t) address, val, len);

    fd = pci_dev->real_device.config_fd;

again:
    /* Pass the write through to the real device, retrying on signal. */
    ret = pwrite(fd, &val, len, address);
    if (ret != len) {
        if ((ret < 0) && (errno == EINTR || errno == EAGAIN))
            goto again;

        fprintf(stderr, "%s: pwrite failed, ret = %zd errno = %d\n",
                __func__, ret, errno);

        exit(1);
    }
}
473 
/*
 * Guest read from PCI config space.  Emulated capabilities and the
 * ID/BAR/ROM/interrupt fields are served from QEMU's emulated config
 * space; everything else is read from the real device.  When no
 * capabilities are exposed to the guest, the capability-list status
 * bit is masked out of the result.
 */
static uint32_t assigned_dev_pci_read_config(PCIDevice *d, uint32_t address,
                                             int len)
{
    uint32_t val = 0;
    int fd;
    ssize_t ret;
    AssignedDevice *pci_dev = container_of(d, AssignedDevice, dev);

    /* Reads inside an emulated capability are handled separately. */
    if (address >= PCI_CONFIG_HEADER_SIZE && d->config_map[address]) {
        val = assigned_device_pci_cap_read_config(d, address, len);
        DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n",
              (d->devfn >> 3) & 0x1F, (d->devfn & 0x7), address, val, len);
        return val;
    }

    /* IDs, COMMAND (only for VFs needing emulation), BARs, ROM,
     * capability pointer and interrupt line/pin come from the emulated
     * config space. */
    if (address < 0x4 || (pci_dev->need_emulate_cmd && address == 0x4) ||
        (address >= 0x10 && address <= 0x24) || address == 0x30 ||
        address == 0x34 || address == 0x3c || address == 0x3d) {
        val = pci_default_read_config(d, address, len);
        DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n",
              (d->devfn >> 3) & 0x1F, (d->devfn & 0x7), address, val, len);
        return val;
    }

    /* vga specific, remove later */
    if (address == 0xFC)
        goto do_log;    /* returns 0 without touching the device */

    fd = pci_dev->real_device.config_fd;

again:
    /* Read through to the real device, retrying on signal. */
    ret = pread(fd, &val, len, address);
    if (ret != len) {
        if ((ret < 0) && (errno == EINTR || errno == EAGAIN))
            goto again;

        fprintf(stderr, "%s: pread failed, ret = %zd errno = %d\n",
                __func__, ret, errno);

        exit(1);
    }

do_log:
    DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n",
          (d->devfn >> 3) & 0x1F, (d->devfn & 0x7), address, val, len);

    if (!pci_dev->cap.available) {
        /* kill the special capabilities: clear the capability-list
         * status bit whether STATUS was read as a dword at 4 (bit 20)
         * or a word/byte at 6 (bit 4) */
        if (address == 4 && len == 4)
            val &= ~0x100000;
        else if (address == 6)
            val &= ~0x10;
    }

    return val;
}
530 
/*
 * Register every valid host BAR with QEMU's PCI layer.
 *
 * Memory BARs are mmap()ed from the sysfs resource file; page-multiple
 * regions are exposed to the guest as directly-mapped RAM (fast path),
 * while oddly-sized regions fall back to trapped access through the
 * slow_bar_* handlers.  Port-I/O BARs use the sysfs resource fd when
 * the kernel supports 1/2/4-byte access on it, or raw in/out otherwise.
 *
 * Returns 0 on success, -1 if a memory region could not be mapped.
 */
static int assigned_dev_register_regions(PCIRegion *io_regions,
                                         unsigned long regions_num,
                                         AssignedDevice *pci_dev)
{
    uint32_t i;
    PCIRegion *cur_region = io_regions;

    for (i = 0; i < regions_num; i++, cur_region++) {
        if (!cur_region->valid)
            continue;
        pci_dev->v_addrs[i].num = i;

        /* handle memory io regions */
        if (cur_region->type & IORESOURCE_MEM) {
            int slow_map = 0;
            int t = cur_region->type & IORESOURCE_PREFETCH
                ? PCI_BASE_ADDRESS_MEM_PREFETCH
                : PCI_BASE_ADDRESS_SPACE_MEMORY;

            /* Non-page-multiple regions cannot be handed to the guest
             * as direct RAM; trap each access instead. */
            if (cur_region->size & 0xFFF) {
                fprintf(stderr, "PCI region %d at address 0x%llx "
                        "has size 0x%x, which is not a multiple of 4K. "
                        "You might experience some performance hit "
                        "due to that.\n",
                        i, (unsigned long long)cur_region->base_addr,
                        cur_region->size);
                slow_map = 1;
            }

            /* map physical memory */
            pci_dev->v_addrs[i].e_physbase = cur_region->base_addr;
            pci_dev->v_addrs[i].u.r_virtbase = mmap(NULL, cur_region->size,
                                                    PROT_WRITE | PROT_READ,
                                                    MAP_SHARED,
                                                    cur_region->resource_fd,
                                                    (off_t)0);

            if (pci_dev->v_addrs[i].u.r_virtbase == MAP_FAILED) {
                pci_dev->v_addrs[i].u.r_virtbase = NULL;
                fprintf(stderr, "%s: Error: Couldn't mmap 0x%x!"
                        "\n", __func__,
                        (uint32_t) (cur_region->base_addr));
                return -1;
            }

            pci_dev->v_addrs[i].r_size = cur_region->size;
            pci_dev->v_addrs[i].e_size = 0;  /* not guest-mapped yet */

            /* add offset: point at the BAR's offset within the first
             * page (presumably the mapping starts page-aligned --
             * see the matching munmap in free_assigned_device) */
            pci_dev->v_addrs[i].u.r_virtbase +=
                (cur_region->base_addr & 0xFFF);


            if (!slow_map) {
                /* Fast path: back a guest RAM block with the mmap()ed
                 * host memory. */
                void *virtbase = pci_dev->v_addrs[i].u.r_virtbase;
                char name[32];
                snprintf(name, sizeof(name), "%s.bar%d",
                         pci_dev->dev.qdev.info->name, i);
                pci_dev->v_addrs[i].memory_index =
                                            qemu_ram_alloc_from_ptr(
                                                         &pci_dev->dev.qdev,
                                                         name, cur_region->size,
                                                         virtbase);
            } else
                pci_dev->v_addrs[i].memory_index = 0;

            pci_register_bar((PCIDevice *) pci_dev, i,
                             cur_region->size, t,
                             slow_map ? assigned_dev_iomem_map_slow
                                      : assigned_dev_iomem_map);
            continue;
        } else {
            /* handle port io regions */
            uint32_t val;
            int ret;

            /* Test kernel support for ioport resource read/write.  Old
             * kernels return EIO.  New kernels only allow 1/2/4 byte reads
             * so should return EINVAL for a 3 byte read */
            ret = pread(pci_dev->v_addrs[i].region->resource_fd, &val, 3, 0);
            if (ret == 3) {
                fprintf(stderr, "I/O port resource supports 3 byte read?!\n");
                abort();
            } else if (errno != EINVAL) {
                /* Fall back to raw in/out: mark the fd unusable. */
                fprintf(stderr, "Using raw in/out ioport access (sysfs - %s)\n",
                        strerror(errno));
                close(pci_dev->v_addrs[i].region->resource_fd);
                pci_dev->v_addrs[i].region->resource_fd = -1;
            }

            pci_dev->v_addrs[i].e_physbase = cur_region->base_addr;
            pci_dev->v_addrs[i].u.r_baseport = cur_region->base_addr;
            pci_dev->v_addrs[i].r_size = cur_region->size;
            pci_dev->v_addrs[i].e_size = 0;

            pci_register_bar((PCIDevice *) pci_dev, i,
                             cur_region->size, PCI_BASE_ADDRESS_SPACE_IO,
                             assigned_dev_ioport_map);

            /* not relevant for port io */
            pci_dev->v_addrs[i].memory_index = 0;
        }
    }

    /* success */
    return 0;
}
638 
/*
 * Read a numeric ID attribute (e.g. "vendor" or "device") from a
 * device's sysfs directory into *val.  devpath must end with '/'.
 * Returns 0 on success, -1 on open or parse failure.
 *
 * Fix: the stream was previously leaked when fscanf failed; it is now
 * closed on every path.
 */
static int get_real_id(const char *devpath, const char *idname, uint16_t *val)
{
    FILE *f;
    char name[128];
    long id;

    snprintf(name, sizeof(name), "%s%s", devpath, idname);
    f = fopen(name, "r");
    if (f == NULL) {
        fprintf(stderr, "%s: %s: %m\n", __func__, name);
        return -1;
    }
    if (fscanf(f, "%li\n", &id) == 1) {
        *val = id;
    } else {
        fclose(f);
        return -1;
    }
    fclose(f);

    return 0;
}
660 
/* Read the device's PCI vendor ID from its sysfs "vendor" attribute. */
static int get_real_vendor_id(const char *devpath, uint16_t *val)
{
    return get_real_id(devpath, "vendor", val);
}
665 
/* Read the device's PCI device ID from its sysfs "device" attribute. */
static int get_real_device_id(const char *devpath, uint16_t *val)
{
    return get_real_id(devpath, "device", val);
}
670 
/*
 * Open and characterize the host PCI device seg:bus:dev.func via sysfs:
 * obtain the config-space fd (from a user-supplied number/monitor fd
 * name, or by opening the sysfs "config" file), snapshot the config
 * header into the emulated device, parse "resource" to discover each
 * BAR's address/size/type, and read the vendor/device IDs.
 *
 * Returns 0 on success, 1 on failure.
 */
static int get_real_device(AssignedDevice *pci_dev, uint16_t r_seg,
                           uint8_t r_bus, uint8_t r_dev, uint8_t r_func)
{
    char dir[128], name[128];
    int fd, r = 0, v;
    FILE *f;
    unsigned long long start, end, size, flags;
    uint16_t id;
    struct stat statbuf;
    PCIRegion *rp;
    PCIDevRegions *dev = &pci_dev->real_device;

    dev->region_number = 0;

    snprintf(dir, sizeof(dir), "/sys/bus/pci/devices/%04x:%02x:%02x.%x/",
             r_seg, r_bus, r_dev, r_func);

    snprintf(name, sizeof(name), "%sconfig", dir);

    /* The config fd may be handed to us -- as a literal fd number or a
     * monitor fd name -- instead of opened from sysfs. */
    if (pci_dev->configfd_name && *pci_dev->configfd_name) {
        if (qemu_isdigit(pci_dev->configfd_name[0])) {
            dev->config_fd = strtol(pci_dev->configfd_name, NULL, 0);
        } else {
            dev->config_fd = monitor_get_fd(cur_mon, pci_dev->configfd_name);
            if (dev->config_fd < 0) {
                fprintf(stderr, "%s: (%s) unkown\n", __func__,
                        pci_dev->configfd_name);
                return 1;
            }
        }
    } else {
        dev->config_fd = open(name, O_RDWR);

        if (dev->config_fd == -1) {
            fprintf(stderr, "%s: %s: %m\n", __func__, name);
            return 1;
        }
    }
again:
    /* Seed the emulated config space with the real device's header,
     * retrying on signal interruption. */
    r = read(dev->config_fd, pci_dev->dev.config,
             pci_config_size(&pci_dev->dev));
    if (r < 0) {
        if (errno == EINTR || errno == EAGAIN)
            goto again;
        fprintf(stderr, "%s: read failed, errno = %d\n", __func__, errno);
    }

    /* Clear host resource mapping info.  If we choose not to register a
     * BAR, such as might be the case with the option ROM, we can get
     * confusing, unwritable, residual addresses from the host here. */
    memset(&pci_dev->dev.config[PCI_BASE_ADDRESS_0], 0, 24);
    memset(&pci_dev->dev.config[PCI_ROM_ADDRESS], 0, 4);

    snprintf(name, sizeof(name), "%sresource", dir);

    f = fopen(name, "r");
    if (f == NULL) {
        fprintf(stderr, "%s: %s: %m\n", __func__, name);
        return 1;
    }

    /* One "start end flags" line per BAR (ROM slot excluded). */
    for (r = 0; r < PCI_ROM_SLOT; r++) {
        if (fscanf(f, "%lli %lli %lli\n", &start, &end, &flags) != 3)
            break;

        rp = dev->regions + r;
        rp->valid = 0;
        rp->resource_fd = -1;
        size = end - start + 1;
        flags &= IORESOURCE_IO | IORESOURCE_MEM | IORESOURCE_PREFETCH;
        /* Skip unimplemented BARs. */
        if (size == 0 || (flags & ~IORESOURCE_PREFETCH) == 0)
            continue;
        if (flags & IORESOURCE_MEM) {
            flags &= ~IORESOURCE_IO;
        } else {
            flags &= ~IORESOURCE_PREFETCH;
        }
        snprintf(name, sizeof(name), "%sresource%d", dir, r);
        fd = open(name, O_RDWR);
        if (fd == -1)
            continue;   /* region stays invalid without a resource fd */
        rp->resource_fd = fd;

        rp->type = flags;
        rp->valid = 1;
        rp->base_addr = start;
        rp->size = size;
        pci_dev->v_addrs[r].region = rp;
        DEBUG("region %d size %d start 0x%llx type %d resource_fd %d\n",
              r, rp->size, start, rp->type, rp->resource_fd);
    }

    fclose(f);

    /* read and fill vendor ID */
    v = get_real_vendor_id(dir, &id);
    if (v) {
        return 1;
    }
    pci_dev->dev.config[0] = id & 0xff;
    pci_dev->dev.config[1] = (id & 0xff00) >> 8;

    /* read and fill device ID */
    v = get_real_device_id(dir, &id);
    if (v) {
        return 1;
    }
    pci_dev->dev.config[2] = id & 0xff;
    pci_dev->dev.config[3] = (id & 0xff00) >> 8;

    /* dealing with virtual function device: a "physfn" link indicates
     * an SR-IOV virtual function, whose COMMAND register is emulated
     * (see need_emulate_cmd use in assigned_dev_pci_read_config) */
    snprintf(name, sizeof(name), "%sphysfn/", dir);
    if (!stat(name, &statbuf))
        pci_dev->need_emulate_cmd = 1;
    else
        pci_dev->need_emulate_cmd = 0;

    dev->region_number = r;
    return 0;
}
791 
792 static QLIST_HEAD(, AssignedDevice) devs = QLIST_HEAD_INITIALIZER(devs);
793 
#ifdef KVM_CAP_IRQ_ROUTING
/* Remove every GSI routing entry registered for this device and
 * release the entry array. */
static void free_dev_irq_entries(AssignedDevice *dev)
{
    int idx;

    for (idx = 0; idx < dev->irq_entries_nr; idx++) {
        kvm_del_routing_entry(&dev->entry[idx]);
    }
    free(dev->entry);
    dev->entry = NULL;
    dev->irq_entries_nr = 0;
}
#endif
806 
/*
 * Tear down everything get_real_device() and
 * assigned_dev_register_regions() set up: ioperm grants, mmap()ed
 * BARs and their RAM registrations, the MSI-X MMIO region, the
 * resource/config fds and the IRQ routing entries.  NULL-safe.
 */
static void free_assigned_device(AssignedDevice *dev)
{
    if (dev) {
        int i;

        for (i = 0; i < dev->real_device.region_number; i++) {
            PCIRegion *pci_region = &dev->real_device.regions[i];
            AssignedDevRegion *region = &dev->v_addrs[i];

            if (!pci_region->valid)
                continue;

            if (pci_region->type & IORESOURCE_IO) {
                /* Raw in/out was used only when no resource fd was
                 * available; revoke the matching ioperm grant. */
                if (pci_region->resource_fd < 0) {
                    kvm_remove_ioperm_data(region->u.r_baseport,
                                           region->r_size);
                }
            } else if (pci_region->type & IORESOURCE_MEM) {
                if (region->u.r_virtbase) {
                    if (region->memory_index) {
                        /* Unregister from the guest before dropping the
                         * backing RAM block. */
                        cpu_register_physical_memory(region->e_physbase,
                                                     region->e_size,
                                                     IO_MEM_UNASSIGNED);
                        qemu_ram_unmap(region->memory_index);
                    }
                    /* NOTE(review): r_virtbase had the sub-page offset
                     * added at map time, so this munmap address may not
                     * be page-aligned -- confirm against the mmap in
                     * assigned_dev_register_regions. */
                    if (munmap(region->u.r_virtbase,
                               (pci_region->size + 0xFFF) & 0xFFFFF000))
                        fprintf(stderr,
                                "Failed to unmap assigned device region: %s\n",
                                strerror(errno));
                }
            }
            if (pci_region->resource_fd >= 0) {
                close(pci_region->resource_fd);
            }
        }

        if (dev->cap.available & ASSIGNED_DEVICE_CAP_MSIX)
            assigned_dev_unregister_msix_mmio(dev);

        if (dev->real_device.config_fd >= 0) {
            close(dev->real_device.config_fd);
        }

#ifdef KVM_CAP_IRQ_ROUTING
        free_dev_irq_entries(dev);
#endif
    }
}
856 
/*
 * Pack a PCI segment/bus/devfn triple into the 32-bit device id used by
 * the KVM assignment ioctls: seg in bits 31:16, bus in 15:8, devfn in 7:0.
 */
static uint32_t calc_assigned_dev_id(uint16_t seg, uint8_t bus, uint8_t devfn)
{
    uint32_t id = (uint32_t)seg << 16;

    id |= (uint32_t)bus << 8;
    id |= devfn;
    return id;
}
861 
/*
 * Called when device assignment fails with -EBUSY: inspect sysfs to find
 * which host driver is bound to the device and print step-by-step
 * instructions for rebinding it to pci-stub.  Diagnostics only.
 *
 * Fixes: the sysfs paths were built with unbounded sprintf() into
 * fixed-size buffers; use snprintf() instead.  The readlink() result is
 * now explicitly NUL-terminated rather than relying on zero-initialized
 * trailing bytes.
 */
static void assign_failed_examine(AssignedDevice *dev)
{
    char name[PATH_MAX], dir[PATH_MAX], driver[PATH_MAX] = {}, *ns;
    uint16_t vendor_id, device_id;
    int r;

    snprintf(dir, sizeof(dir), "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/",
             dev->host.seg, dev->host.bus, dev->host.dev, dev->host.func);

    snprintf(name, sizeof(name), "%sdriver", dir);

    /* readlink() does not NUL-terminate; reserve one byte and terminate. */
    r = readlink(name, driver, sizeof(driver) - 1);
    if (r <= 0 || r >= (int)sizeof(driver) - 1) {
        goto fail;
    }
    driver[r] = '\0';

    ns = strrchr(driver, '/');
    if (!ns) {
        goto fail;
    }

    /* Driver name is the final path component of the symlink target. */
    ns++;

    if (get_real_vendor_id(dir, &vendor_id) ||
        get_real_device_id(dir, &device_id)) {
        goto fail;
    }

    fprintf(stderr, "*** The driver '%s' is occupying your device "
                    "%04x:%02x:%02x.%x.\n",
            ns, dev->host.seg, dev->host.bus, dev->host.dev, dev->host.func);
    fprintf(stderr, "***\n");
    fprintf(stderr, "*** You can try the following commands to free it:\n");
    fprintf(stderr, "***\n");
    fprintf(stderr, "*** $ echo \"%04x %04x\" > /sys/bus/pci/drivers/pci-stub/"
                    "new_id\n", vendor_id, device_id);
    fprintf(stderr, "*** $ echo \"%04x:%02x:%02x.%x\" > /sys/bus/pci/drivers/"
                    "%s/unbind\n",
            dev->host.seg, dev->host.bus, dev->host.dev, dev->host.func, ns);
    fprintf(stderr, "*** $ echo \"%04x:%02x:%02x.%x\" > /sys/bus/pci/drivers/"
                    "pci-stub/bind\n",
            dev->host.seg, dev->host.bus, dev->host.dev, dev->host.func);
    fprintf(stderr, "*** $ echo \"%04x %04x\" > /sys/bus/pci/drivers/pci-stub"
                    "/remove_id\n", vendor_id, device_id);
    fprintf(stderr, "***\n");

    return;

fail:
    fprintf(stderr, "Couldn't find out why.\n");
}
908 
/*
 * Hand the host PCI device over to the guest via the KVM assignment
 * ioctl.  Enables IOMMU protection unless disabled on the command line,
 * in which case a loud warning is printed.  Returns 0 on success or a
 * negative errno from the ioctl / capability checks.
 */
static int assign_device(AssignedDevice *dev)
{
    struct kvm_assigned_pci_dev assigned_dev_data;
    int r;

#ifdef KVM_CAP_PCI_SEGMENT
    /* Only pass non-zero PCI segment to capable module */
    if (!kvm_check_extension(kvm_state, KVM_CAP_PCI_SEGMENT) &&
        dev->h_segnr) {
        fprintf(stderr, "Can't assign device inside non-zero PCI segment "
                "as this KVM module doesn't support it.\n");
        return -ENODEV;
    }
#endif

    memset(&assigned_dev_data, 0, sizeof(assigned_dev_data));
    assigned_dev_data.assigned_dev_id  =
        calc_assigned_dev_id(dev->h_segnr, dev->h_busnr, dev->h_devfn);
#ifdef KVM_CAP_PCI_SEGMENT
    assigned_dev_data.segnr = dev->h_segnr;
#endif
    assigned_dev_data.busnr = dev->h_busnr;
    assigned_dev_data.devfn = dev->h_devfn;

#ifdef KVM_CAP_IOMMU
    /* We always enable the IOMMU unless disabled on the command line */
    if (dev->features & ASSIGNED_DEVICE_USE_IOMMU_MASK) {
        if (!kvm_check_extension(kvm_state, KVM_CAP_IOMMU)) {
            fprintf(stderr, "No IOMMU found.  Unable to assign device \"%s\"\n",
                    dev->dev.qdev.id);
            return -ENODEV;
        }
        assigned_dev_data.flags |= KVM_DEV_ASSIGN_ENABLE_IOMMU;
    }
#else
    /* Kernel headers lack IOMMU support: force the feature bit off so the
     * warning below fires. */
    dev->features &= ~ASSIGNED_DEVICE_USE_IOMMU_MASK;
#endif
    if (!(dev->features & ASSIGNED_DEVICE_USE_IOMMU_MASK)) {
        fprintf(stderr,
                "WARNING: Assigning a device without IOMMU protection can "
                "cause host memory corruption if the device issues DMA write "
                "requests!\n");
    }

    r = kvm_assign_pci_device(kvm_context, &assigned_dev_data);
    if (r < 0) {
        fprintf(stderr, "Failed to assign device \"%s\" : %s\n",
                dev->dev.qdev.id, strerror(-r));

        switch (r) {
            case -EBUSY:
                /* Most likely a host driver still holds the device;
                 * print rebinding instructions. */
                assign_failed_examine(dev);
                break;
            default:
                break;
        }
    }
    return r;
}
968 
/*
 * (Re)assign the device's interrupt to the guest.  Resolves the current
 * guest IRQ for the device's interrupt pin; if it differs from the one
 * already assigned, any previous assignment is torn down and a new one
 * installed.  Host-side delivery prefers MSI when requested and
 * available.  Returns 0 on success or a negative errno.
 */
static int assign_irq(AssignedDevice *dev)
{
    struct kvm_assigned_irq assigned_irq_data;
    int irq, r = 0;

    /* Interrupt PIN 0 means don't use INTx */
    if (assigned_dev_pci_read_byte(&dev->dev, PCI_INTERRUPT_PIN) == 0)
        return 0;

    irq = pci_map_irq(&dev->dev, dev->intpin);
    irq = piix_get_irq(irq);

#ifdef TARGET_IA64
    irq = ipf_map_irq(&dev->dev, irq);
#endif

    /* Nothing to do if the guest IRQ is unchanged. */
    if (dev->girq == irq)
        return r;

    memset(&assigned_irq_data, 0, sizeof(assigned_irq_data));
    assigned_irq_data.assigned_dev_id =
        calc_assigned_dev_id(dev->h_segnr, dev->h_busnr, dev->h_devfn);
    assigned_irq_data.guest_irq = irq;
    assigned_irq_data.host_irq = dev->real_device.irq;
#ifdef KVM_CAP_ASSIGN_DEV_IRQ
    if (dev->irq_requested_type) {
        /* Drop the previous assignment before installing the new one. */
        assigned_irq_data.flags = dev->irq_requested_type;
        r = kvm_deassign_irq(kvm_context, &assigned_irq_data);
        /* -ENXIO means no assigned irq */
        if (r && r != -ENXIO)
            perror("assign_irq: deassign");
    }

    assigned_irq_data.flags = KVM_DEV_IRQ_GUEST_INTX;
    if (dev->features & ASSIGNED_DEVICE_PREFER_MSI_MASK &&
        dev->cap.available & ASSIGNED_DEVICE_CAP_MSI)
        assigned_irq_data.flags |= KVM_DEV_IRQ_HOST_MSI;
    else
        assigned_irq_data.flags |= KVM_DEV_IRQ_HOST_INTX;
#endif

    r = kvm_assign_irq(kvm_context, &assigned_irq_data);
    if (r < 0) {
        fprintf(stderr, "Failed to assign irq for \"%s\": %s\n",
                dev->dev.qdev.id, strerror(-r));
        fprintf(stderr, "Perhaps you are assigning a device "
                "that shares an IRQ with another device?\n");
        return r;
    }

    /* Remember what we installed so the next call can detect changes. */
    dev->girq = irq;
    dev->irq_requested_type = assigned_irq_data.flags;
    return r;
}
1023 
/*
 * Tell KVM the host device is no longer assigned to this guest.
 * Compiles to a no-op when the kernel headers lack
 * KVM_CAP_DEVICE_DEASSIGNMENT.
 */
static void deassign_device(AssignedDevice *dev)
{
#ifdef KVM_CAP_DEVICE_DEASSIGNMENT
    struct kvm_assigned_pci_dev pci_data;
    int ret;

    memset(&pci_data, 0, sizeof(pci_data));
    pci_data.assigned_dev_id = calc_assigned_dev_id(dev->h_segnr,
                                                    dev->h_busnr,
                                                    dev->h_devfn);

    ret = kvm_deassign_pci_device(kvm_context, &pci_data);
    if (ret < 0) {
        fprintf(stderr, "Failed to deassign device \"%s\" : %s\n",
                dev->dev.qdev.id, strerror(-ret));
    }
#endif
}
1040 
#if 0
/* Disabled code, kept for reference: looked up the AssignedDevInfo for a
 * given guest bus/slot pair.  NOTE(review): references adev_head, which is
 * not visible in this file's current state -- would need updating before
 * re-enabling. */
AssignedDevInfo *get_assigned_device(int pcibus, int slot)
{
    AssignedDevice *assigned_dev = NULL;
    AssignedDevInfo *adev = NULL;

    QLIST_FOREACH(adev, &adev_head, next) {
        assigned_dev = adev->assigned_dev;
        if (pci_bus_num(assigned_dev->dev.bus) == pcibus &&
            PCI_SLOT(assigned_dev->dev.devfn) == slot)
            return adev;
    }

    return NULL;
}
#endif
1057 
1058 /* The pci config space got updated. Check if irq numbers have changed
1059  * for our devices
1060  */
assigned_dev_update_irqs(void)1061 void assigned_dev_update_irqs(void)
1062 {
1063     AssignedDevice *dev, *next;
1064     int r;
1065 
1066     dev = QLIST_FIRST(&devs);
1067     while (dev) {
1068         next = QLIST_NEXT(dev, next);
1069         r = assign_irq(dev);
1070         if (r < 0)
1071             qdev_unplug(&dev->dev.qdev);
1072         dev = next;
1073     }
1074 }
1075 
1076 #ifdef KVM_CAP_IRQ_ROUTING
1077 
#ifdef KVM_CAP_DEVICE_MSI
/*
 * React to a guest write of the MSI control byte at ctrl_pos.  If MSI
 * was in use (or is about to be), the existing irq assignment and
 * routing entry are torn down first.  When the guest enables MSI, a new
 * routing entry is built from the guest-programmed address/data pair and
 * assigned as host+guest MSI; on disable, fall back to INTx via
 * assign_irq().
 */
static void assigned_dev_update_msi(PCIDevice *pci_dev, unsigned int ctrl_pos)
{
    struct kvm_assigned_irq assigned_irq_data;
    AssignedDevice *assigned_dev = container_of(pci_dev, AssignedDevice, dev);
    uint8_t ctrl_byte = pci_dev->config[ctrl_pos];
    int r;

    memset(&assigned_irq_data, 0, sizeof assigned_irq_data);
    assigned_irq_data.assigned_dev_id  =
        calc_assigned_dev_id(assigned_dev->h_segnr, assigned_dev->h_busnr,
                (uint8_t)assigned_dev->h_devfn);

    /* Some guests gratuitously disable MSI even if they're not using it,
     * try to catch this by only deassigning irqs if the guest is using
     * MSI or intends to start. */
    if ((assigned_dev->irq_requested_type & KVM_DEV_IRQ_GUEST_MSI) ||
        (ctrl_byte & PCI_MSI_FLAGS_ENABLE)) {

        assigned_irq_data.flags = assigned_dev->irq_requested_type;
        free_dev_irq_entries(assigned_dev);
        r = kvm_deassign_irq(kvm_context, &assigned_irq_data);
        /* -ENXIO means no assigned irq */
        if (r && r != -ENXIO)
            perror("assigned_dev_update_msi: deassign irq");

        assigned_dev->irq_requested_type = 0;
    }

    if (ctrl_byte & PCI_MSI_FLAGS_ENABLE) {
        /* ctrl_pos points at PCI_MSI_FLAGS, so pos is the capability base. */
        int pos = ctrl_pos - PCI_MSI_FLAGS;
        assigned_dev->entry = calloc(1, sizeof(struct kvm_irq_routing_entry));
        if (!assigned_dev->entry) {
            perror("assigned_dev_update_msi: ");
            return;
        }
        /* Only 32-bit MSI addresses supported here (address_hi forced 0). */
        assigned_dev->entry->u.msi.address_lo =
            pci_get_long(pci_dev->config + pos + PCI_MSI_ADDRESS_LO);
        assigned_dev->entry->u.msi.address_hi = 0;
        assigned_dev->entry->u.msi.data =
            pci_get_word(pci_dev->config + pos + PCI_MSI_DATA_32);
        assigned_dev->entry->type = KVM_IRQ_ROUTING_MSI;
        r = kvm_get_irq_route_gsi();
        if (r < 0) {
            perror("assigned_dev_update_msi: kvm_get_irq_route_gsi");
            return;
        }
        assigned_dev->entry->gsi = r;

        kvm_add_routing_entry(assigned_dev->entry);
        if (kvm_commit_irq_routes() < 0) {
            perror("assigned_dev_update_msi: kvm_commit_irq_routes");
            assigned_dev->cap.state &= ~ASSIGNED_DEVICE_MSI_ENABLED;
            return;
        }
        assigned_dev->irq_entries_nr = 1;

        assigned_irq_data.guest_irq = assigned_dev->entry->gsi;
        assigned_irq_data.flags = KVM_DEV_IRQ_HOST_MSI | KVM_DEV_IRQ_GUEST_MSI;
        if (kvm_assign_irq(kvm_context, &assigned_irq_data) < 0)
            perror("assigned_dev_enable_msi: assign irq");

        /* girq == -1 marks "INTx not in use". */
        assigned_dev->girq = -1;
        assigned_dev->irq_requested_type = assigned_irq_data.flags;
    } else {
        assign_irq(assigned_dev);
    }
}
#endif
1147 
1148 #ifdef KVM_CAP_DEVICE_MSIX
/*
 * Build KVM MSI-X routing state from the guest's shadow MSI-X table.
 * First counts how many table entries the guest actually programmed
 * (data word != 0), tells the kernel how many to expect, then allocates
 * and registers a routing entry per used slot.  Returns 0 on success or
 * a negative errno.
 */
static int assigned_dev_update_msix_mmio(PCIDevice *pci_dev)
{
    AssignedDevice *adev = container_of(pci_dev, AssignedDevice, dev);
    uint16_t entries_nr = 0, entries_max_nr;
    int pos = 0, i, r = 0;
    uint32_t msg_addr, msg_upper_addr, msg_data, msg_ctrl;
    struct kvm_assigned_msix_nr msix_nr;
    struct kvm_assigned_msix_entry msix_entry;
    void *va = adev->msix_table_page;

    pos = pci_find_capability(pci_dev, PCI_CAP_ID_MSIX);

    /* Table-size field encodes N-1; each table entry occupies 16 bytes. */
    entries_max_nr = *(uint16_t *)(pci_dev->config + pos + 2);
    entries_max_nr &= PCI_MSIX_TABSIZE;
    entries_max_nr += 1;

    /* Get the usable entry number for allocating */
    for (i = 0; i < entries_max_nr; i++) {
        memcpy(&msg_ctrl, va + i * 16 + 12, 4);
        memcpy(&msg_data, va + i * 16 + 8, 4);
        /* Ignore unused entry even it's unmasked */
        if (msg_data == 0)
            continue;
        entries_nr ++;
    }

    if (entries_nr == 0) {
        fprintf(stderr, "MSI-X entry number is zero!\n");
        return -EINVAL;
    }
    msix_nr.assigned_dev_id = calc_assigned_dev_id(adev->h_segnr, adev->h_busnr,
                                          (uint8_t)adev->h_devfn);
    msix_nr.entry_nr = entries_nr;
    r = kvm_assign_set_msix_nr(kvm_context, &msix_nr);
    if (r != 0) {
        fprintf(stderr, "fail to set MSI-X entry number for MSIX! %s\n",
                        strerror(-r));
        return r;
    }

    /* Replace any previously registered routing entries. */
    free_dev_irq_entries(adev);
    adev->irq_entries_nr = entries_nr;
    adev->entry = calloc(entries_nr, sizeof(struct kvm_irq_routing_entry));
    if (!adev->entry) {
        perror("assigned_dev_update_msix_mmio: ");
        return -errno;
    }

    msix_entry.assigned_dev_id = msix_nr.assigned_dev_id;
    entries_nr = 0;
    /* Second pass: allocate a GSI and routing entry per used slot. */
    for (i = 0; i < entries_max_nr; i++) {
        if (entries_nr >= msix_nr.entry_nr)
            break;
        memcpy(&msg_ctrl, va + i * 16 + 12, 4);
        memcpy(&msg_data, va + i * 16 + 8, 4);
        if (msg_data == 0)
            continue;

        memcpy(&msg_addr, va + i * 16, 4);
        memcpy(&msg_upper_addr, va + i * 16 + 4, 4);

        r = kvm_get_irq_route_gsi();
        if (r < 0)
            return r;

        adev->entry[entries_nr].gsi = r;
        adev->entry[entries_nr].type = KVM_IRQ_ROUTING_MSI;
        adev->entry[entries_nr].flags = 0;
        adev->entry[entries_nr].u.msi.address_lo = msg_addr;
        adev->entry[entries_nr].u.msi.address_hi = msg_upper_addr;
        adev->entry[entries_nr].u.msi.data = msg_data;
        DEBUG("MSI-X data 0x%x, MSI-X addr_lo 0x%x\n!", msg_data, msg_addr);
        kvm_add_routing_entry(&adev->entry[entries_nr]);

        msix_entry.gsi = adev->entry[entries_nr].gsi;
        msix_entry.entry = i;
        r = kvm_assign_set_msix_entry(kvm_context, &msix_entry);
        if (r) {
            fprintf(stderr, "fail to set MSI-X entry! %s\n", strerror(-r));
            break;
        }
        DEBUG("MSI-X entry gsi 0x%x, entry %d\n!",
                msix_entry.gsi, msix_entry.entry);
        entries_nr ++;
    }

    if (r == 0 && kvm_commit_irq_routes() < 0) {
            perror("assigned_dev_update_msix_mmio: kvm_commit_irq_routes");
            return -EINVAL;
    }

    return r;
}
1242 
/*
 * React to a guest write of the MSI-X control word at ctrl_pos.  Mirrors
 * assigned_dev_update_msi(): tear down the old assignment when MSI-X was
 * (or is about to be) in use, then either program the MSI-X table into
 * KVM and assign host+guest MSI-X, or fall back to INTx on disable.
 */
static void assigned_dev_update_msix(PCIDevice *pci_dev, unsigned int ctrl_pos)
{
    struct kvm_assigned_irq assigned_irq_data;
    AssignedDevice *assigned_dev = container_of(pci_dev, AssignedDevice, dev);
    uint16_t *ctrl_word = (uint16_t *)(pci_dev->config + ctrl_pos);
    int r;

    memset(&assigned_irq_data, 0, sizeof assigned_irq_data);
    assigned_irq_data.assigned_dev_id  =
            calc_assigned_dev_id(assigned_dev->h_segnr, assigned_dev->h_busnr,
                    (uint8_t)assigned_dev->h_devfn);

    /* Some guests gratuitously disable MSIX even if they're not using it,
     * try to catch this by only deassigning irqs if the guest is using
     * MSIX or intends to start. */
    if ((assigned_dev->irq_requested_type & KVM_DEV_IRQ_GUEST_MSIX) ||
        (*ctrl_word & PCI_MSIX_ENABLE)) {

        assigned_irq_data.flags = assigned_dev->irq_requested_type;
        free_dev_irq_entries(assigned_dev);
        r = kvm_deassign_irq(kvm_context, &assigned_irq_data);
        /* -ENXIO means no assigned irq */
        if (r && r != -ENXIO)
            perror("assigned_dev_update_msix: deassign irq");

        assigned_dev->irq_requested_type = 0;
    }

    if (*ctrl_word & PCI_MSIX_ENABLE) {
        assigned_irq_data.flags = KVM_DEV_IRQ_HOST_MSIX |
                                  KVM_DEV_IRQ_GUEST_MSIX;

        /* Program routing entries from the shadow table first. */
        if (assigned_dev_update_msix_mmio(pci_dev) < 0) {
            perror("assigned_dev_update_msix_mmio");
            return;
        }
        if (kvm_assign_irq(kvm_context, &assigned_irq_data) < 0) {
            perror("assigned_dev_enable_msix: assign irq");
            return;
        }
        /* girq == -1 marks "INTx not in use". */
        assigned_dev->girq = -1;
        assigned_dev->irq_requested_type = assigned_irq_data.flags;
    } else {
        assign_irq(assigned_dev);
    }
}
1289 #endif
1290 #endif
1291 
1292 /* There can be multiple VNDR capabilities per device, we need to find the
1293  * one that starts closet to the given address without going over. */
find_vndr_start(PCIDevice * pci_dev,uint32_t address)1294 static uint8_t find_vndr_start(PCIDevice *pci_dev, uint32_t address)
1295 {
1296     uint8_t cap, pos;
1297 
1298     for (cap = pos = 0;
1299          (pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_VNDR, pos));
1300          pos += PCI_CAP_LIST_NEXT) {
1301         if (pos <= address) {
1302             cap = MAX(pos, cap);
1303         }
1304     }
1305     return cap;
1306 }
1307 
/* Merge the bits set in mask from mval into val.  Both val and mval are
 * at the same addr offset, pos is the starting offset of the mask. */
static uint32_t merge_bits(uint32_t val, uint32_t mval, uint8_t addr,
                           int len, uint8_t pos, uint32_t mask)
{
    if (ranges_overlap(addr, len, pos, 4)) {
        /* Align the mask with the bytes covered by this access. */
        if (addr >= pos) {
            mask >>= (addr - pos) * 8;
        } else {
            mask <<= (pos - addr) * 8;
        }
        /* Clip the mask to the access width (len bytes). */
        mask &= 0xffffffffU >> (4 - len) * 8;

        val = (val & ~mask) | (mval & mask);
    }

    return val;
}
1329 
/*
 * Config-space read handler for capability regions.  VPD and vendor
 * capabilities read through to the physical device, except for the
 * capability's next pointer, which must come from the emulated config
 * space (the virtual capability chain differs from the physical one).
 * Everything else uses the default emulated read.
 */
static uint32_t assigned_device_pci_cap_read_config(PCIDevice *pci_dev,
                                                    uint32_t address, int len)
{
    uint8_t cap_id = pci_dev->config_map[address];
    uint8_t cap;
    uint32_t val;

    switch (cap_id) {
    case PCI_CAP_ID_VPD:
    case PCI_CAP_ID_VNDR:
        /* VNDR caps may appear more than once, so locate the one
         * containing this address; VPD is unique. */
        cap = (cap_id == PCI_CAP_ID_VPD)
            ? pci_find_capability(pci_dev, cap_id)
            : find_vndr_start(pci_dev, address);
        val = assigned_dev_pci_read(pci_dev, address, len);
        return merge_bits(val, pci_get_long(pci_dev->config + address),
                          address, len, cap + PCI_CAP_LIST_NEXT, 0xff);
    }

    return pci_default_read_config(pci_dev, address, len);
}
1353 
/*
 * Config-space write handler for capability regions.  The write is first
 * applied to the emulated config space; MSI/MSI-X control-register writes
 * then trigger irq reprogramming, and VPD/VNDR writes are additionally
 * passed through to the physical device.
 */
static void assigned_device_pci_cap_write_config(PCIDevice *pci_dev,
                                                 uint32_t address,
                                                 uint32_t val, int len)
{
    uint8_t cap_id = pci_dev->config_map[address];

    pci_default_write_config(pci_dev, address, val, len);
    switch (cap_id) {
#ifdef KVM_CAP_IRQ_ROUTING
    case PCI_CAP_ID_MSI:
#ifdef KVM_CAP_DEVICE_MSI
        {
            uint8_t cap = pci_find_capability(pci_dev, cap_id);
            /* Only react when the write touches the MSI control byte. */
            if (ranges_overlap(address - cap, len, PCI_MSI_FLAGS, 1)) {
                assigned_dev_update_msi(pci_dev, cap + PCI_MSI_FLAGS);
            }
        }
#endif
        break;

    case PCI_CAP_ID_MSIX:
#ifdef KVM_CAP_DEVICE_MSIX
        {
            uint8_t cap = pci_find_capability(pci_dev, cap_id);
            /* The MSI-X enable bit lives in the upper byte of the flags
             * word, hence PCI_MSIX_FLAGS + 1. */
            if (ranges_overlap(address - cap, len, PCI_MSIX_FLAGS + 1, 1)) {
                assigned_dev_update_msix(pci_dev, cap + PCI_MSIX_FLAGS);
            }
        }
#endif
        break;
#endif

    case PCI_CAP_ID_VPD:
    case PCI_CAP_ID_VNDR:
        /* Direct passthrough to hardware in addition to the emulated copy. */
        assigned_dev_pci_write(pci_dev, address, val, len);
        break;
    }
}
1392 
/*
 * Construct the virtual capability list exposed to the guest.  The
 * physical device's capability pointer/status are cleared, then each
 * supported capability (MSI, MSI-X, PM, PCIe, PCI-X, VPD, VNDR) is
 * re-added with guest-appropriate contents and write masks; unsupported
 * capabilities are simply not exposed.  Returns 0 on success or a
 * negative errno.
 */
static int assigned_device_pci_cap_init(PCIDevice *pci_dev)
{
    AssignedDevice *dev = container_of(pci_dev, AssignedDevice, dev);
    PCIRegion *pci_region = dev->real_device.regions;
    int ret, pos;

    /* Clear initial capabilities pointer and status copied from hw */
    pci_set_byte(pci_dev->config + PCI_CAPABILITY_LIST, 0);
    pci_set_word(pci_dev->config + PCI_STATUS,
                 pci_get_word(pci_dev->config + PCI_STATUS) &
                 ~PCI_STATUS_CAP_LIST);

#ifdef KVM_CAP_IRQ_ROUTING
#ifdef KVM_CAP_DEVICE_MSI
    /* Expose MSI capability
     * MSI capability is the 1st capability in capability config */
    if ((pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_MSI, 0))) {
        dev->cap.available |= ASSIGNED_DEVICE_CAP_MSI;
        /* Only 32-bit/no-mask currently supported */
        if ((ret = pci_add_capability(pci_dev, PCI_CAP_ID_MSI, pos, 10)) < 0) {
            return ret;
        }

        pci_set_word(pci_dev->config + pos + PCI_MSI_FLAGS,
                     pci_get_word(pci_dev->config + pos + PCI_MSI_FLAGS) &
                     PCI_MSI_FLAGS_QMASK);
        pci_set_long(pci_dev->config + pos + PCI_MSI_ADDRESS_LO, 0);
        pci_set_word(pci_dev->config + pos + PCI_MSI_DATA_32, 0);

        /* Set writable fields */
        pci_set_word(pci_dev->wmask + pos + PCI_MSI_FLAGS,
                     PCI_MSI_FLAGS_QSIZE | PCI_MSI_FLAGS_ENABLE);
        pci_set_long(pci_dev->wmask + pos + PCI_MSI_ADDRESS_LO, 0xfffffffc);
        pci_set_word(pci_dev->wmask + pos + PCI_MSI_DATA_32, 0xffff);
    }
#endif
#ifdef KVM_CAP_DEVICE_MSIX
    /* Expose MSI-X capability */
    if ((pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_MSIX, 0))) {
        int bar_nr;
        uint32_t msix_table_entry;

        dev->cap.available |= ASSIGNED_DEVICE_CAP_MSIX;
        if ((ret = pci_add_capability(pci_dev, PCI_CAP_ID_MSIX, pos, 12)) < 0) {
            return ret;
        }

        pci_set_word(pci_dev->config + pos + PCI_MSIX_FLAGS,
                     pci_get_word(pci_dev->config + pos + PCI_MSIX_FLAGS) &
                     PCI_MSIX_TABSIZE);

        /* Only enable and function mask bits are writable */
        pci_set_word(pci_dev->wmask + pos + PCI_MSIX_FLAGS,
                     PCI_MSIX_FLAGS_ENABLE | PCI_MSIX_FLAGS_MASKALL);

        /* Record where the MSI-X table lives within the device's BARs. */
        msix_table_entry = pci_get_long(pci_dev->config + pos + PCI_MSIX_TABLE);
        bar_nr = msix_table_entry & PCI_MSIX_BIR;
        msix_table_entry &= ~PCI_MSIX_BIR;
        dev->msix_table_addr = pci_region[bar_nr].base_addr + msix_table_entry;
    }
#endif
#endif

    /* Minimal PM support, nothing writable, device appears to NAK changes */
    if ((pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_PM, 0))) {
        uint16_t pmc;
        if ((ret = pci_add_capability(pci_dev, PCI_CAP_ID_PM, pos,
                                      PCI_PM_SIZEOF)) < 0) {
            return ret;
        }

        pmc = pci_get_word(pci_dev->config + pos + PCI_CAP_FLAGS);
        pmc &= (PCI_PM_CAP_VER_MASK | PCI_PM_CAP_DSI);
        pci_set_word(pci_dev->config + pos + PCI_CAP_FLAGS, pmc);

        /* assign_device will bring the device up to D0, so we don't need
         * to worry about doing that ourselves here. */
        pci_set_word(pci_dev->config + pos + PCI_PM_CTRL,
                     PCI_PM_CTRL_NO_SOFT_RESET);

        pci_set_byte(pci_dev->config + pos + PCI_PM_PPB_EXTENSIONS, 0);
        pci_set_byte(pci_dev->config + pos + PCI_PM_DATA_REGISTER, 0);
    }

    if ((pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_EXP, 0))) {
        uint8_t version;
        uint16_t type, devctl, lnkcap, lnksta;
        uint32_t devcap;
        int size = 0x3c; /* version 2 size */

        version = pci_get_byte(pci_dev->config + pos + PCI_EXP_FLAGS);
        version &= PCI_EXP_FLAGS_VERS;
        if (version == 1) {
            size = 0x14;
        } else if (version > 2) {
            fprintf(stderr, "Unsupported PCI express capability version %d\n",
                    version);
            return -EINVAL;
        }

        if ((ret = pci_add_capability(pci_dev, PCI_CAP_ID_EXP,
                                      pos, size)) < 0) {
            return ret;
        }

        /* Only endpoint device types can be passed through. */
        type = pci_get_word(pci_dev->config + pos + PCI_EXP_FLAGS);
        type = (type & PCI_EXP_FLAGS_TYPE) >> 8;
        if (type != PCI_EXP_TYPE_ENDPOINT &&
            type != PCI_EXP_TYPE_LEG_END && type != PCI_EXP_TYPE_RC_END) {
            fprintf(stderr,
                    "Device assignment only supports endpoint assignment, "
                    "device type %d\n", type);
            return -EINVAL;
        }

        /* capabilities, pass existing read-only copy
         * PCI_EXP_FLAGS_IRQ: updated by hardware, should be direct read */

        /* device capabilities: hide FLR */
        devcap = pci_get_long(pci_dev->config + pos + PCI_EXP_DEVCAP);
        devcap &= ~PCI_EXP_DEVCAP_FLR;
        pci_set_long(pci_dev->config + pos + PCI_EXP_DEVCAP, devcap);

        /* device control: clear all error reporting enable bits, leaving
         *                 leaving only a few host values.  Note, these are
         *                 all writable, but not passed to hw.
         */
        devctl = pci_get_word(pci_dev->config + pos + PCI_EXP_DEVCTL);
        devctl = (devctl & (PCI_EXP_DEVCTL_READRQ | PCI_EXP_DEVCTL_PAYLOAD)) |
                  PCI_EXP_DEVCTL_RELAX_EN | PCI_EXP_DEVCTL_NOSNOOP_EN;
        pci_set_word(pci_dev->config + pos + PCI_EXP_DEVCTL, devctl);
        devctl = PCI_EXP_DEVCTL_BCR_FLR | PCI_EXP_DEVCTL_AUX_PME;
        pci_set_word(pci_dev->wmask + pos + PCI_EXP_DEVCTL, ~devctl);

        /* Clear device status */
        pci_set_word(pci_dev->config + pos + PCI_EXP_DEVSTA, 0);

        /* Link capabilities, expose links and latencues, clear reporting */
        lnkcap = pci_get_word(pci_dev->config + pos + PCI_EXP_LNKCAP);
        lnkcap &= (PCI_EXP_LNKCAP_SLS | PCI_EXP_LNKCAP_MLW |
                   PCI_EXP_LNKCAP_ASPMS | PCI_EXP_LNKCAP_L0SEL |
                   PCI_EXP_LNKCAP_L1EL);
        pci_set_word(pci_dev->config + pos + PCI_EXP_LNKCAP, lnkcap);
        pci_set_word(pci_dev->wmask + pos + PCI_EXP_LNKCAP,
                     PCI_EXP_LNKCTL_ASPMC | PCI_EXP_LNKCTL_RCB |
                     PCI_EXP_LNKCTL_CCC | PCI_EXP_LNKCTL_ES |
                     PCI_EXP_LNKCTL_CLKREQ_EN | PCI_EXP_LNKCTL_HAWD);

        /* Link control, pass existing read-only copy.  Should be writable? */

        /* Link status, only expose current speed and width */
        lnksta = pci_get_word(pci_dev->config + pos + PCI_EXP_LNKSTA);
        lnksta &= (PCI_EXP_LNKSTA_CLS | PCI_EXP_LNKSTA_NLW);
        pci_set_word(pci_dev->config + pos + PCI_EXP_LNKSTA, lnksta);

        if (version >= 2) {
            /* Slot capabilities, control, status - not needed for endpoints */
            pci_set_long(pci_dev->config + pos + PCI_EXP_SLTCAP, 0);
            pci_set_word(pci_dev->config + pos + PCI_EXP_SLTCTL, 0);
            pci_set_word(pci_dev->config + pos + PCI_EXP_SLTSTA, 0);

            /* Root control, capabilities, status - not needed for endpoints */
            pci_set_word(pci_dev->config + pos + PCI_EXP_RTCTL, 0);
            pci_set_word(pci_dev->config + pos + PCI_EXP_RTCAP, 0);
            pci_set_long(pci_dev->config + pos + PCI_EXP_RTSTA, 0);

            /* Device capabilities/control 2, pass existing read-only copy */
            /* Link control 2, pass existing read-only copy */
        }
    }

    if ((pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_PCIX, 0))) {
        uint16_t cmd;
        uint32_t status;

        /* Only expose the minimum, 8 byte capability */
        if ((ret = pci_add_capability(pci_dev, PCI_CAP_ID_PCIX, pos, 8)) < 0) {
            return ret;
        }

        /* Command register, clear upper bits, including extended modes */
        cmd = pci_get_word(pci_dev->config + pos + PCI_X_CMD);
        cmd &= (PCI_X_CMD_DPERR_E | PCI_X_CMD_ERO | PCI_X_CMD_MAX_READ |
                PCI_X_CMD_MAX_SPLIT);
        pci_set_word(pci_dev->config + pos + PCI_X_CMD, cmd);

        /* Status register, update with emulated PCI bus location, clear
         * error bits, leave the rest. */
        status = pci_get_long(pci_dev->config + pos + PCI_X_STATUS);
        status &= ~(PCI_X_STATUS_BUS | PCI_X_STATUS_DEVFN);
        status |= (pci_bus_num(pci_dev->bus) << 8) | pci_dev->devfn;
        status &= ~(PCI_X_STATUS_SPL_DISC | PCI_X_STATUS_UNX_SPL |
                    PCI_X_STATUS_SPL_ERR);
        pci_set_long(pci_dev->config + pos + PCI_X_STATUS, status);
    }

    if ((pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_VPD, 0))) {
        /* Direct R/W passthrough */
        if ((ret = pci_add_capability(pci_dev, PCI_CAP_ID_VPD, pos, 8)) < 0) {
            return ret;
        }
    }

    /* Devices can have multiple vendor capabilities, get them all */
    for (pos = 0; (pos = pci_find_cap_offset(pci_dev, PCI_CAP_ID_VNDR, pos));
        pos += PCI_CAP_LIST_NEXT) {
        uint8_t len = pci_get_byte(pci_dev->config + pos + PCI_CAP_FLAGS);
        /* Direct R/W passthrough */
        if ((ret = pci_add_capability(pci_dev, PCI_CAP_ID_VNDR,
                                      pos, len)) < 0) {
            return ret;
        }
    }

    return 0;
}
1609 
/*
 * 32-bit read from the MSI-X table MMIO region, served from the shadow
 * table page; only the low 12 address bits index into the page.
 * NOTE(review): a read at offset 0xffd..0xfff would run past the page --
 * callers appear to issue aligned accesses only; confirm.
 */
static uint32_t msix_mmio_readl(void *opaque, target_phys_addr_t addr)
{
    AssignedDevice *adev = opaque;
    uint32_t result = 0;

    memcpy(&result, (char *)adev->msix_table_page + (addr & 0xfff), 4);

    return result;
}
1621 
msix_mmio_readb(void * opaque,target_phys_addr_t addr)1622 static uint32_t msix_mmio_readb(void *opaque, target_phys_addr_t addr)
1623 {
1624     return ((msix_mmio_readl(opaque, addr & ~3)) >>
1625             (8 * (addr & 3))) & 0xff;
1626 }
1627 
msix_mmio_readw(void * opaque,target_phys_addr_t addr)1628 static uint32_t msix_mmio_readw(void *opaque, target_phys_addr_t addr)
1629 {
1630     return ((msix_mmio_readl(opaque, addr & ~3)) >>
1631             (8 * (addr & 3))) & 0xffff;
1632 }
1633 
/* Write a 32-bit value into the shadow MSI-X table page at the given
 * MMIO offset (wrapped to the single backing page). */
static void msix_mmio_writel(void *opaque,
                             target_phys_addr_t addr, uint32_t val)
{
    AssignedDevice *adev = opaque;
    unsigned int offset = addr & 0xfff;
    void *page = adev->msix_table_page;

    /* target_phys_addr_t may be wider than long (64-bit phys addrs on a
     * 32-bit host), so %lx was a varargs mismatch; use the matching
     * QEMU format macro instead. */
    DEBUG("write to MSI-X entry table mmio offset 0x" TARGET_FMT_plx
          ", val 0x%x\n", addr, val);
    memcpy((void *)((char *)page + offset), &val, 4);
}
1645 
/* Word write: merge the 16-bit value into the containing dword.
 * The previous code wrote the whole dword with the remaining bytes
 * forced to zero, clobbering adjacent MSI-X table data; do a proper
 * read-modify-write instead. */
static void msix_mmio_writew(void *opaque,
                             target_phys_addr_t addr, uint32_t val)
{
    unsigned int shift = 8 * (addr & 3);
    uint32_t mask = 0xffffU << shift;
    uint32_t old = msix_mmio_readl(opaque, addr & ~3);

    msix_mmio_writel(opaque, addr & ~3,
                     (old & ~mask) | ((val & 0xffff) << shift));
}
1652 
/* Byte write: merge the 8-bit value into the containing dword.
 * The previous code wrote the whole dword with the remaining bytes
 * forced to zero, clobbering adjacent MSI-X table data; do a proper
 * read-modify-write instead. */
static void msix_mmio_writeb(void *opaque,
                             target_phys_addr_t addr, uint32_t val)
{
    unsigned int shift = 8 * (addr & 3);
    uint32_t mask = 0xffU << shift;
    uint32_t old = msix_mmio_readl(opaque, addr & ~3);

    msix_mmio_writel(opaque, addr & ~3,
                     (old & ~mask) | ((val & 0xff) << shift));
}
1659 
/* MMIO dispatch tables for the emulated MSI-X table page, indexed by
 * access size (0 = byte, 1 = word, 2 = long). */
static CPUWriteMemoryFunc *msix_mmio_write[] = {
    msix_mmio_writeb,	msix_mmio_writew,	msix_mmio_writel
};

static CPUReadMemoryFunc *msix_mmio_read[] = {
    msix_mmio_readb,	msix_mmio_readw,	msix_mmio_readl
};
1667 
/*
 * Allocate a page-sized shadow buffer for the device's MSI-X table and
 * register MMIO callbacks so guest accesses land in that buffer.
 * Returns 0 on success, -EFAULT if the page cannot be mapped.
 */
static int assigned_dev_register_msix_mmio(AssignedDevice *dev)
{
    /* Anonymous mappings require fd == -1 on some platforms (illumos,
     * the BSDs); Linux merely ignores the descriptor.  Passing 0 here
     * made the mmap fail on non-Linux hosts. */
    dev->msix_table_page = mmap(NULL, 0x1000,
                                PROT_READ|PROT_WRITE,
                                MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
    if (dev->msix_table_page == MAP_FAILED) {
        fprintf(stderr, "fail allocate msix_table_page! %s\n",
                strerror(errno));
        return -EFAULT;
    }
    /* Anonymous pages are already zero-filled; this also pre-faults
     * the page in. */
    memset(dev->msix_table_page, 0, 0x1000);
    dev->mmio_index = cpu_register_io_memory(
                        msix_mmio_read, msix_mmio_write, dev,
                        DEVICE_NATIVE_ENDIAN);
    return 0;
}
1684 
/* Tear down the MSI-X table MMIO region and release its backing page.
 * Safe to call when no region was ever registered. */
static void assigned_dev_unregister_msix_mmio(AssignedDevice *dev)
{
    if (dev->msix_table_page == NULL) {
        return;
    }

    cpu_unregister_io_memory(dev->mmio_index);
    dev->mmio_index = 0;

    if (munmap(dev->msix_table_page, 0x1000) < 0) {
        fprintf(stderr, "error unmapping msix_table_page! %s\n",
                strerror(errno));
    }
    dev->msix_table_page = NULL;
}
1699 
/* Empty vmsd: registered only so the device can be marked unmigratable
 * (see assigned_initfn). */
static const VMStateDescription vmstate_assigned_device = {
    .name = "pci-assign",
    .fields = (VMStateField []) {
        VMSTATE_END_OF_LIST()
    }
};
1706 
/*
 * qdev reset hook.  Writing 0 to the command register logically
 * disconnects the device from the PCI bus, which stops any further
 * DMA transfers across the reset.
 */
static void reset_assigned_device(DeviceState *dev)
{
    PCIDevice *pci_dev = DO_UPCAST(PCIDevice, qdev, dev);

    assigned_dev_pci_write_config(pci_dev, PCI_COMMAND, 0, 2);
}
1717 
assigned_initfn(struct PCIDevice * pci_dev)1718 static int assigned_initfn(struct PCIDevice *pci_dev)
1719 {
1720     AssignedDevice *dev = DO_UPCAST(AssignedDevice, dev, pci_dev);
1721     uint8_t e_device, e_intx;
1722     int r;
1723 
1724     if (!kvm_enabled()) {
1725         error_report("pci-assign: error: requires KVM support");
1726         return -1;
1727     }
1728 
1729     if (!dev->host.seg && !dev->host.bus && !dev->host.dev && !dev->host.func) {
1730         error_report("pci-assign: error: no host device specified");
1731         return -1;
1732     }
1733 
1734     if (get_real_device(dev, dev->host.seg, dev->host.bus,
1735                         dev->host.dev, dev->host.func)) {
1736         error_report("pci-assign: Error: Couldn't get real device (%s)!",
1737                      dev->dev.qdev.id);
1738         goto out;
1739     }
1740 
1741     /* handle real device's MMIO/PIO BARs */
1742     if (assigned_dev_register_regions(dev->real_device.regions,
1743                                       dev->real_device.region_number,
1744                                       dev))
1745         goto out;
1746 
1747     /* handle interrupt routing */
1748     e_device = (dev->dev.devfn >> 3) & 0x1f;
1749     e_intx = dev->dev.config[0x3d] - 1;
1750     dev->intpin = e_intx;
1751     dev->run = 0;
1752     dev->girq = -1;
1753     dev->h_segnr = dev->host.seg;
1754     dev->h_busnr = dev->host.bus;
1755     dev->h_devfn = PCI_DEVFN(dev->host.dev, dev->host.func);
1756 
1757     if (assigned_device_pci_cap_init(pci_dev) < 0)
1758         goto out;
1759 
1760     /* assign device to guest */
1761     r = assign_device(dev);
1762     if (r < 0)
1763         goto out;
1764 
1765     /* assign irq for the device */
1766     r = assign_irq(dev);
1767     if (r < 0)
1768         goto assigned_out;
1769 
1770     /* intercept MSI-X entry page in the MMIO */
1771     if (dev->cap.available & ASSIGNED_DEVICE_CAP_MSIX)
1772         if (assigned_dev_register_msix_mmio(dev))
1773             goto assigned_out;
1774 
1775     assigned_dev_load_option_rom(dev);
1776     QLIST_INSERT_HEAD(&devs, dev, next);
1777 
1778     add_boot_device_path(dev->bootindex, &pci_dev->qdev, NULL);
1779 
1780     /* Register a vmsd so that we can mark it unmigratable. */
1781     vmstate_register(&dev->dev.qdev, 0, &vmstate_assigned_device, dev);
1782     register_device_unmigratable(&dev->dev.qdev,
1783                                  vmstate_assigned_device.name, dev);
1784 
1785     return 0;
1786 
1787 assigned_out:
1788     deassign_device(dev);
1789 out:
1790     free_assigned_device(dev);
1791     return -1;
1792 }
1793 
/*
 * qdev exit hook: undo assigned_initfn — drop the vmsd, unlink from
 * the global device list, detach the device from KVM, then release
 * the host-side resources.
 */
static int assigned_exitfn(struct PCIDevice *pci_dev)
{
    AssignedDevice *dev = DO_UPCAST(AssignedDevice, dev, pci_dev);

    vmstate_unregister(&dev->dev.qdev, &vmstate_assigned_device, dev);
    QLIST_REMOVE(dev, next);
    deassign_device(dev);
    free_assigned_device(dev);
    return 0;
}
1804 
/* qdev property parser: fill the PCIHostDevice (seg/bus/dev/func) from
 * the textual host address.  Returns 0 on success, -1 on parse error. */
static int parse_hostaddr(DeviceState *dev, Property *prop, const char *str)
{
    PCIHostDevice *host = qdev_get_prop_ptr(dev, prop);

    return pci_parse_host_devaddr(str, &host->seg, &host->bus,
                                  &host->dev, &host->func) ? -1 : 0;
}
1815 
/* qdev property printer for the host PCI address.  Include the
 * segment/domain: parse_hostaddr fills ptr->seg, so dropping it here
 * meant the property value did not round-trip through print/parse. */
static int print_hostaddr(DeviceState *dev, Property *prop, char *dest, size_t len)
{
    PCIHostDevice *ptr = qdev_get_prop_ptr(dev, prop);

    return snprintf(dest, len, "%04x:%02x:%02x.%x",
                    ptr->seg, ptr->bus, ptr->dev, ptr->func);
}
1822 
/* Custom qdev property type holding a host PCI device address,
 * backed by parse_hostaddr/print_hostaddr above. */
PropertyInfo qdev_prop_hostaddr = {
    .name  = "pci-hostaddr",
    .type  = -1, /* presumably no dedicated PROP_TYPE_* slot exists for
                  * this custom type — TODO confirm against qdev code */
    .size  = sizeof(PCIHostDevice),
    .parse = parse_hostaddr,
    .print = print_hostaddr,
};
1830 
/* qdev device model for "pci-assign" host PCI device passthrough. */
static PCIDeviceInfo assign_info = {
    .qdev.name    = "pci-assign",
    .qdev.desc    = "pass through host pci devices to the guest",
    .qdev.size    = sizeof(AssignedDevice),
    .qdev.reset   = reset_assigned_device,
    .init         = assigned_initfn,
    .exit         = assigned_exitfn,
    .config_read  = assigned_dev_pci_read_config,
    .config_write = assigned_dev_pci_write_config,
    .qdev.props   = (Property[]) {
        /* Host address of the PCI device to assign. */
        DEFINE_PROP("host", AssignedDevice, host, qdev_prop_hostaddr, PCIHostDevice),
        /* Both feature bits default to enabled. */
        DEFINE_PROP_BIT("iommu", AssignedDevice, features,
                        ASSIGNED_DEVICE_USE_IOMMU_BIT, true),
        DEFINE_PROP_BIT("prefer_msi", AssignedDevice, features,
                        ASSIGNED_DEVICE_PREFER_MSI_BIT, true),
        /* -1 = device not in the boot order. */
        DEFINE_PROP_INT32("bootindex", AssignedDevice, bootindex, -1),
        DEFINE_PROP_STRING("configfd", AssignedDevice, configfd_name),
        DEFINE_PROP_END_OF_LIST(),
    },
};
1851 
/* Register the "pci-assign" device model with qdev at startup. */
static void assign_register_devices(void)
{
    pci_qdev_register(&assign_info);
}

device_init(assign_register_devices)
1858 
1859 /*
1860  * Scan the assigned devices for the devices that have an option ROM, and then
1861  * load the corresponding ROM data to RAM. If an error occurs while loading an
1862  * option ROM, we just ignore that option ROM and continue with the next one.
1863  */
1864 static void assigned_dev_load_option_rom(AssignedDevice *dev)
1865 {
1866     char name[32], rom_file[64];
1867     FILE *fp;
1868     uint8_t val;
1869     struct stat st;
1870     void *ptr;
1871 
1872     /* If loading ROM from file, pci handles it */
1873     if (dev->dev.romfile || !dev->dev.rom_bar)
1874         return;
1875 
1876     snprintf(rom_file, sizeof(rom_file),
1877              "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/rom",
1878              dev->host.seg, dev->host.bus, dev->host.dev, dev->host.func);
1879 
1880     if (stat(rom_file, &st)) {
1881         return;
1882     }
1883 
1884     if (access(rom_file, F_OK)) {
1885         fprintf(stderr, "pci-assign: Insufficient privileges for %s\n",
1886                 rom_file);
1887         return;
1888     }
1889 
1890     /* Write "1" to the ROM file to enable it */
1891     fp = fopen(rom_file, "r+");
1892     if (fp == NULL) {
1893         return;
1894     }
1895     val = 1;
1896     if (fwrite(&val, 1, 1, fp) != 1) {
1897         goto close_rom;
1898     }
1899     fseek(fp, 0, SEEK_SET);
1900 
1901     snprintf(name, sizeof(name), "%s.rom", dev->dev.qdev.info->name);
1902     dev->dev.rom_offset = qemu_ram_alloc(&dev->dev.qdev, name, st.st_size);
1903     ptr = qemu_get_ram_ptr(dev->dev.rom_offset);
1904     memset(ptr, 0xff, st.st_size);
1905 
1906     if (!fread(ptr, 1, st.st_size, fp)) {
1907         fprintf(stderr, "pci-assign: Cannot read from host %s\n"
1908                 "\tDevice option ROM contents are probably invalid "
1909                 "(check dmesg).\n\tSkip option ROM probe with rombar=0, "
1910                 "or load from file with romfile=\n", rom_file);
1911         qemu_ram_free(dev->dev.rom_offset);
1912         dev->dev.rom_offset = 0;
1913         goto close_rom;
1914     }
1915 
1916     pci_register_bar(&dev->dev, PCI_ROM_SLOT,
1917                      st.st_size, 0, pci_map_option_rom);
1918 close_rom:
1919     /* Write "0" to disable ROM */
1920     fseek(fp, 0, SEEK_SET);
1921     val = 0;
1922     if (!fwrite(&val, 1, 1, fp)) {
1923         DEBUG("%s\n", "Failed to disable pci-sysfs rom file");
1924     }
1925     fclose(fp);
1926 }
1927