(很多内容是网上找的,+上我个人的一点理解,推荐大家去看 http://mnstory.net/2014/10/qemu-device-simulation 这篇文章)
qemu启动时,如果配置了相应virtio设备,会对guest的pci总线,virtio设备等进行模拟,先来看看qemu的设备模拟,那i8254/PIT为例(PIT的硬件规范略过,有兴趣的话可以参考 http://wiki.osdev.org/Programmable_Interval_Timer)
hw/timer/i8254.c定义了PIT设备的模拟,通过qom来定义设备对象模型,e.g.
static const TypeInfo pit_info = { .name = TYPE_I8254, .parent = TYPE_PIT_COMMON, .instance_size = sizeof(PITCommonState), .class_init = pit_class_initfn, .class_size = sizeof(PITClass), }; static void pit_register_types(void) { type_register_static(&pit_info); } type_init(pit_register_types)
type_init宏遵循qom的设备定义规范,实际调用的是module_init宏,这个宏被定义为__attribute__((constructor))属性,类似于cpp里面的构建函数,会在main函数之前被调用。
typedef enum { MODULE_INIT_BLOCK, MODULE_INIT_MACHINE, MODULE_INIT_QAPI, MODULE_INIT_QOM, MODULE_INIT_MAX } module_init_type; #define type_init(function) module_init(function, MODULE_INIT_QOM) /* This should not be used directly. Use block_init etc. instead. */ #define module_init(function, type) static void __attribute__((constructor)) do_qemu_init_ ## function(void) { register_module_init(function, type); }
register_module_init的作用就是初始化type类型的ModuleEntry结构,并插入到init_type_list[type]的QLIST列表中。module_call_init则是对type的所有ModuleEntry,调用注册的初始化函数。
typedef struct ModuleEntry { void (*init)(void); QTAILQ_ENTRY(ModuleEntry) node; module_init_type type; } ModuleEntry; typedef QTAILQ_HEAD(, ModuleEntry) ModuleTypeList; static ModuleTypeList init_type_list[MODULE_INIT_MAX]; static ModuleTypeList *find_type(module_init_type type) { ModuleTypeList *l; init_lists(); l = &init_type_list[type]; return l; } void module_call_init(module_init_type type) { ModuleTypeList *l; ModuleEntry *e; module_load(type); l = find_type(type); QTAILQ_FOREACH(e, l, node) { e->init(); } }
现在回到PIT的设备注册函数pit_register_types,最终是通过传入的TypeInfo生成一个TypeImpl,并把这个TypeImpl插入到一个全局静态的GHashTable
type_table中。
static const TypeInfo pit_info = { .name = TYPE_I8254, .parent = TYPE_PIT_COMMON, .instance_size = sizeof(PITCommonState), .class_init = pit_class_initfn, .class_size = sizeof(PITClass), }; static void pit_register_types(void) { type_register_static(&pit_info); } TypeImpl *type_register_static(const TypeInfo *info) { return type_register(info); } TypeImpl *type_register(const TypeInfo *info) { assert(info->parent); return type_register_internal(info); } static TypeImpl *type_register_internal(const TypeInfo *info) { TypeImpl *ti; ti = type_new(info); type_table_add(ti); return ti; } static void type_table_add(TypeImpl *ti) { assert(!enumerating_types); g_hash_table_insert(type_table_get(), (void *)ti->name, ti); }
从PITClass也可以看出,qom实际上把qemu的设备对象模型搞成了类似cpp的对象,通过父子类,继承,虚函数等一系列特性,让qemu设备对象形成了一个树形结构,树根就是object,同时TypeInfo, TypeImpl, Object, ObjectClass都可以支持这种树形结构,e.g.
static const TypeInfo pit_info = { .name = TYPE_I8254, .parent = TYPE_PIT_COMMON, .instance_size = sizeof(PITCommonState), .class_init = pit_class_initfn, .class_size = sizeof(PITClass), }; static const TypeInfo pit_common_type = { .name = TYPE_PIT_COMMON, .parent = TYPE_ISA_DEVICE, .instance_size = sizeof(PITCommonState), .class_size = sizeof(PITCommonClass), .class_init = pit_common_class_init, .abstract = true, }; static const TypeInfo isa_device_type_info = { .name = TYPE_ISA_DEVICE, .parent = TYPE_DEVICE, .instance_size = sizeof(ISADevice), .instance_init = isa_device_init, .abstract = true, .class_size = sizeof(ISADeviceClass), .class_init = isa_device_class_init, }; static const TypeInfo device_type_info = { .name = TYPE_DEVICE, .parent = TYPE_OBJECT, .instance_size = sizeof(DeviceState), .instance_init = device_initfn, .instance_post_init = device_post_init, .instance_finalize = device_finalize, .class_base_init = device_class_base_init, .class_init = device_class_init, .abstract = true, .class_size = sizeof(DeviceClass), }; static TypeInfo object_info = { .name = TYPE_OBJECT, .instance_size = sizeof(Object), .instance_init = object_instance_init, .abstract = true, };
ObjectClass这层同样体现了这种继承关系,同时ObjectClass也被用来实现多态,即Object对象的ObjectClass指针实际是一个虚函数的指针,e.g.
typedef struct PITCommonClass { ISADeviceClass parent_class; void (*set_channel_gate)(PITCommonState *s, PITChannelState *sc, int val); void (*get_channel_info)(PITCommonState *s, PITChannelState *sc, PITChannelInfo *info); void (*pre_save)(PITCommonState *s); void (*post_load)(PITCommonState *s); } PITCommonClass; typedef struct ISADeviceClass { DeviceClass parent_class; } ISADeviceClass; typedef struct DeviceClass { /*< private >*/ ObjectClass parent_class; /*< public >*/ DECLARE_BITMAP(categories, DEVICE_CATEGORY_MAX); const char *fw_name; const char *desc; Property *props; ... } DeviceClass struct ObjectClass { /*< private >*/ Type type; GSList *interfaces; const char *object_cast_cache[OBJECT_CLASS_CAST_CACHE]; const char *class_cast_cache[OBJECT_CLASS_CAST_CACHE]; ObjectUnparent *unparent; };
实际的设备对象继承关系如下:
typedef struct PITCommonState { ISADevice dev; MemoryRegion ioports; uint32_t iobase; PITChannelState channels[3]; } PITCommonState; struct ISADevice { /*< private >*/ DeviceState parent_obj; /*< public >*/ uint32_t isairq[2]; int nirqs; int ioport_id; }; struct DeviceState { /*< private >*/ Object parent_obj; /*< public >*/ const char *id; bool realized; bool pending_deleted_event; QemuOpts *opts; int hotplugged; BusState *parent_bus; QLIST_HEAD(, NamedGPIOList) gpios; QLIST_HEAD(, BusState) child_bus; int num_child_bus; int instance_id_alias; int alias_required_for_version; }; struct Object { ObjectClass *class; ObjectFree *free; QTAILQ_HEAD(, ObjectProperty) properties; uint32_t ref; Object *parent; };
下图清晰的解释了多态是如何实现的,注意ObjectClass* 指针实际指向的是PITCommonClass
言归正传,现在来看下qemu端的virtio pci设备。在qemu里,virtio pci设备是所有其他virtio设备,e.g. virtio block, virtio net, virtio ballon的父类,其定义如下
static void virtio_device_class_init(ObjectClass *klass, void *data) { /* Set the default value here. */ DeviceClass *dc = DEVICE_CLASS(klass); dc->realize = virtio_device_realize; dc->unrealize = virtio_device_unrealize; dc->bus_type = TYPE_VIRTIO_BUS; } static const TypeInfo virtio_device_info = { .name = TYPE_VIRTIO_DEVICE, .parent = TYPE_DEVICE, .instance_size = sizeof(VirtIODevice), .class_init = virtio_device_class_init, .abstract = true, .class_size = sizeof(VirtioDeviceClass), }; static void virtio_register_types(void) { type_register_static(&virtio_device_info); } type_init(virtio_register_types)
virtio设备的结构如下,其中最关键的是VirtQueue的指针,我理解VirtIODevice主要是封装了VirtQueue
struct VirtIODevice { DeviceState parent_obj; const char *name; uint8_t status; uint8_t isr; uint16_t queue_sel; uint32_t guest_features; size_t config_len; void *config; uint16_t config_vector; int nvectors; VirtQueue *vq; uint16_t device_id; bool vm_running; VMChangeStateEntry *vmstate; char *bus_name; uint8_t device_endian; }; typedef struct VirtioDeviceClass { /*< private >*/ DeviceClass parent; /*< public >*/ /* This is what a VirtioDevice must implement */ DeviceRealize realize; DeviceUnrealize unrealize; uint32_t (*get_features)(VirtIODevice *vdev, uint32_t requested_features); uint32_t (*bad_features)(VirtIODevice *vdev); void (*set_features)(VirtIODevice *vdev, uint32_t val); void (*get_config)(VirtIODevice *vdev, uint8_t *config); void (*set_config)(VirtIODevice *vdev, const uint8_t *config); void (*reset)(VirtIODevice *vdev); void (*set_status)(VirtIODevice *vdev, uint8_t val); void (*set_version)(VirtIODevice *vdev, uint32_t val); /* Test and clear event pending status. * Should be called after unmask to avoid losing events. * If backend does not support masking, * must check in frontend instead. */ bool (*guest_notifier_pending)(VirtIODevice *vdev, int n); /* Mask/unmask events from this vq. Any events reported * while masked will become pending. * If backend does not support masking, * must mask in frontend instead. */ void (*guest_notifier_mask)(VirtIODevice *vdev, int n, bool mask); void (*save)(VirtIODevice *vdev, QEMUFile *f); int (*load)(VirtIODevice *vdev, QEMUFile *f, int version_id); } VirtioDeviceClass;
qemu内部也定义了VirtQueue和VRing的结构,注意这里需要保证前端virtio驱动和qemu的vring两者的ABI一致,即内存结构保证一致性,e.g.
/* The standard layout for the ring is a continuous chunk of memory which looks * like this. We assume num is a power of 2. * * struct vring * { * // The actual descriptors (16 bytes each) * struct vring_desc desc[num]; * * // A ring of available descriptor heads with free-running index. * __u16 avail_flags; * __u16 avail_idx; * __u16 available[num]; * __u16 used_event_idx; * * // Padding to the next align boundary. * char pad[]; * * // A ring of used descriptor heads with free-running index. * __u16 used_flags; * __u16 used_idx; * struct vring_used_elem used[num]; * __u16 avail_event_idx; * };
注意qemu对于VRing的操作一般都只限于VRingUsed这部分以及avail_event_idx的更新
#define VIRTIO_PCI_VRING_ALIGN 4096 typedef struct VRingDesc { uint64_t addr; uint32_t len; uint16_t flags; uint16_t next; } VRingDesc; typedef struct VRingAvail { uint16_t flags; uint16_t idx; uint16_t ring[0]; } VRingAvail; typedef struct VRingUsedElem { uint32_t id; uint32_t len; } VRingUsedElem; typedef struct VRingUsed { uint16_t flags; uint16_t idx; VRingUsedElem ring[0]; } VRingUsed; typedef struct VRing { unsigned int num; unsigned int align; hwaddr desc; hwaddr avail; hwaddr used; } VRing; struct VirtQueue { VRing vring; hwaddr pa; uint16_t last_avail_idx; /* Last used index value we have signalled on */ uint16_t signalled_used; /* Last used index value we have signalled on */ bool signalled_used_valid; /* Notification enabled? */ bool notification; /* 是否使能notification,只有enable了event_idx才有效 */ uint16_t queue_index; int inuse; uint16_t vector; void (*handle_output)(VirtIODevice *vdev, VirtQueue *vq); VirtIODevice *vdev; EventNotifier guest_notifier; /* guest notifier fd,目前只有vhost使用 */ EventNotifier host_notifier; /* host notifier fd, 目前只有vhost使用 */ };
qemu的VRing通过查看avail_event_idx来判断是否有新的avail buffer到达。关于VirtQueue和VRing的部分后面再详细阐述。
除了TYPE_DEVICE -> TYPE_VIRTIO_DEVICE -> TYPE_VIRTIO_XXX_DEVICE这条继承关系之外,qemu的virtio设备同时也是PCI设备,因此还存在有TYPE_DEVICE -> TYPE_PCI_DEVICE -> TYPE_VIRTIO_PCI -> TYPE_VIRTIO_XXX_PCI的继承关系,同时总线上也存在TYPE_BUS -> TYPE_VIRTIO_BUS -> TYPE_VIRTIO_PCI_BUS
virtio bus定义如下
static const TypeInfo virtio_bus_info = { .name = TYPE_VIRTIO_BUS, .parent = TYPE_BUS, .instance_size = sizeof(VirtioBusState), .abstract = true, .class_size = sizeof(VirtioBusClass), .class_init = virtio_bus_class_init }; static void virtio_register_types(void) { type_register_static(&virtio_bus_info); } type_init(virtio_register_types)
总线设备virtio-bus,对应类型是VirtioBusClass。设备定义了多个虚函数的接口,具体实现分为TYPE_VIRTIO_MMIO_BUS和TYPE_VIRTIO_PCI_BUS两种,对应设备是virtio-pci-bus和virtio-mmio-bus
static const TypeInfo virtio_pci_bus_info = { .name = TYPE_VIRTIO_PCI_BUS, .parent = TYPE_VIRTIO_BUS, .instance_size = sizeof(VirtioPCIBusState), .class_init = virtio_pci_bus_class_init, }; static void virtio_pci_bus_class_init(ObjectClass *klass, void *data) { BusClass *bus_class = BUS_CLASS(klass); VirtioBusClass *k = VIRTIO_BUS_CLASS(klass); bus_class->max_dev = 1; k->notify = virtio_pci_notify; k->save_config = virtio_pci_save_config; k->load_config = virtio_pci_load_config; k->save_queue = virtio_pci_save_queue; k->load_queue = virtio_pci_load_queue; k->get_features = virtio_pci_get_features; k->query_guest_notifiers = virtio_pci_query_guest_notifiers; k->set_host_notifier = virtio_pci_set_host_notifier; k->start_host_notifier = virtio_pci_start_host_notifier; k->set_guest_notifiers = virtio_pci_set_guest_notifiers; k->vmstate_change = virtio_pci_vmstate_change; k->device_plugged = virtio_pci_device_plugged; k->device_unplugged = virtio_pci_device_unplugged; }
另一个相关的设备是virtio-pci,代表了挂载virtio pci总线上的pci设备模型,定义如下
static const TypeInfo virtio_pci_info = { .name = TYPE_VIRTIO_PCI, .parent = TYPE_PCI_DEVICE, .instance_size = sizeof(VirtIOPCIProxy), .class_init = virtio_pci_class_init, .class_size = sizeof(VirtioPCIClass), .abstract = true, }; struct VirtIOPCIProxy { PCIDevice pci_dev; MemoryRegion bar; uint32_t flags; uint32_t class_code; uint32_t nvectors; uint32_t host_features; bool ioeventfd_disabled; bool ioeventfd_started; VirtIOIRQFD *vector_irqfd; int nvqs_with_notifiers; VirtioBusState bus; }; typedef struct VirtioPCIClass { PCIDeviceClass parent_class; int (*init)(VirtIOPCIProxy *vpci_dev); } VirtioPCIClass;
VirtioPCIBusState除了虚函数的实现,其他地方和VirtioBusState完全一致,同样VirtioBusClass和VirtioPCIBusClass也是完全一致。从代码实现上看,和内核的virtio pci设备基本类似。
virtio_pci_notify用于通知guest中断,根据pci设备是否enable msix
static void virtio_pci_notify(DeviceState *d, uint16_t vector) { VirtIOPCIProxy *proxy = to_virtio_pci_proxy_fast(d); if (msix_enabled(&proxy->pci_dev)) msix_notify(&proxy->pci_dev, vector); /* msix_notify通过写pci配置空间来传递中断 */ else { VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus); pci_set_irq(&proxy->pci_dev, vdev->isr & 1); /* 通过irq传递中断 */ } }
virtio_pci_get_features用于获取pci配置空间的HOST_FEATURES
static unsigned virtio_pci_get_features(DeviceState *d) { VirtIOPCIProxy *proxy = to_virtio_pci_proxy(d); return proxy->host_features; }
virtio_pci_save_config/virtio_pci_load_config,virtio_pci_save_queue/virtio_pci_load_queue用于save/load pci的配置信息和队列信息
virtio_pci_device_plugged当有virtio-pci设备被virtio-pci-bus启动时被调用
static void virtio_pci_device_plugged(DeviceState *d) { VirtIOPCIProxy *proxy = VIRTIO_PCI(d); VirtioBusState *bus = &proxy->bus; uint8_t *config; uint32_t size; config = proxy->pci_dev.config; if (proxy->class_code) { pci_config_set_class(config, proxy->class_code); /* 设置pci配置空间的class code */ } pci_set_word(config + PCI_SUBSYSTEM_VENDOR_ID, pci_get_word(config + PCI_VENDOR_ID)); /* 设置pci配置空间的VENDOR_ID */ pci_set_word(config + PCI_SUBSYSTEM_ID, virtio_bus_get_vdev_id(bus)); /* 设置pci配置空间的SUBSYSTEM_ID */ config[PCI_INTERRUPT_PIN] = 1; if (proxy->nvectors && msix_init_exclusive_bar(&proxy->pci_dev, proxy->nvectors, 1)) { /* 初始化virtio bar的msix配置 */ error_report("unable to init msix vectors to %" PRIu32, proxy->nvectors); proxy->nvectors = 0; } proxy->pci_dev.config_write = virtio_write_config; size = VIRTIO_PCI_REGION_SIZE(&proxy->pci_dev) + virtio_bus_get_vdev_config_len(bus); if (size & (size - 1)) { size = 1 << qemu_fls(size); } memory_region_init_io(&proxy->bar, OBJECT(proxy), &virtio_pci_config_ops, proxy, "virtio-pci", size); /* 初始化bar0的内存为virtio的配置空间 */ pci_register_bar(&proxy->pci_dev, 0, PCI_BASE_ADDRESS_SPACE_IO, &proxy->bar); /* 注册virtio pci的bar0 */ if (!kvm_has_many_ioeventfds()) { proxy->flags &= ~VIRTIO_PCI_FLAG_USE_IOEVENTFD; } proxy->host_features |= 0x1 << VIRTIO_F_NOTIFY_ON_EMPTY; proxy->host_features |= 0x1 << VIRTIO_F_BAD_FEATURE; proxy->host_features = virtio_bus_get_vdev_features(bus, proxy->host_features); }
virtio_pci_device_unplugged当设备停用并从总线上删除时调用,对vhost而言,ioeventfd用于guest通过pci通知host,irqfd用于host把中断注入guest。这里调用了virtio_pci_stop_ioeventfd,该函数最终通过virtio_pci_set_host_notifier_internal来配置ioveventfd
static void virtio_pci_device_unplugged(DeviceState *d) { VirtIOPCIProxy *proxy = VIRTIO_PCI(d); virtio_pci_stop_ioeventfd(proxy); }
VirtioBusClass需要支持guest notifier和host
notifier的相关操作,e.g.
static int virtio_pci_set_host_notifier(DeviceState *d, int n, bool assign) { VirtIOPCIProxy *proxy = to_virtio_pci_proxy(d); /* Stop using ioeventfd for virtqueue kick if the device starts using host * notifiers. This makes it easy to avoid stepping on each others' toes. */ proxy->ioeventfd_disabled = assign; /* assign为true,说明设置host notifier;assign为false,说明清理host notifier */ if (assign) { /* 如果是assign新的fd,需要先清理掉老的fd */ virtio_pci_stop_ioeventfd(proxy); } /* We don't need to start here: it's not needed because backend * currently only stops on status change away from ok, * reset, vmstop and such. If we do add code to start here, * need to check vmstate, device state etc. */ return virtio_pci_set_host_notifier_internal(proxy, n, assign, false); } static int virtio_pci_set_host_notifier_internal(VirtIOPCIProxy *proxy, int n, bool assign, bool set_handler) { VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus); VirtQueue *vq = virtio_get_queue(vdev, n); EventNotifier *notifier = virtio_queue_get_host_notifier(vq); int r = 0; if (assign) { r = event_notifier_init(notifier, 1); if (r < 0) { error_report("%s: unable to init event notifier: %d", __func__, r); return r; } virtio_queue_set_host_notifier_fd_handler(vq, true, set_handler); /* set_host_notifier的时候set_handler设置为false */ /* start_ioeventfd的时候设置为true */ memory_region_add_eventfd(&proxy->bar, VIRTIO_PCI_QUEUE_NOTIFY, 2, true, n, notifier); /* 把ioport/iomem对应的memregion和fd关联起来 */ } else { memory_region_del_eventfd(&proxy->bar, VIRTIO_PCI_QUEUE_NOTIFY, 2, true, n, notifier); virtio_queue_set_host_notifier_fd_handler(vq, false, false); event_notifier_cleanup(notifier); } return r; }
上述的host notifier,实际对应VirtQueue EventNotifier的一个read fd,通常叫做ioevent_fd,可以通过virtio_pci_start_ioeventfd,virtio_pci_stop_ioeventfd来控制;而guest notifier通常对应于VirtIOPCIProxy的vector_irqfd,通常叫做irqfd,e.g.
static int virtio_pci_set_guest_notifiers(DeviceState *d, int nvqs, bool assign) { VirtIOPCIProxy *proxy = to_virtio_pci_proxy(d); VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus); VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev); int r, n; bool with_irqfd = msix_enabled(&proxy->pci_dev) && kvm_msi_via_irqfd_enabled(); nvqs = MIN(nvqs, VIRTIO_PCI_QUEUE_MAX); /* When deassigning, pass a consistent nvqs value * to avoid leaking notifiers. */ assert(assign || nvqs == proxy->nvqs_with_notifiers); proxy->nvqs_with_notifiers = nvqs; /* Must unset vector notifier while guest notifier is still assigned */ if ((proxy->vector_irqfd || k->guest_notifier_mask) && !assign) { msix_unset_vector_notifiers(&proxy->pci_dev); /* 修改pci配置空间清理msix vector */ if (proxy->vector_irqfd) { kvm_virtio_pci_vector_release(proxy, nvqs); g_free(proxy->vector_irqfd); proxy->vector_irqfd = NULL; } } for (n = 0; n < nvqs; n++) { if (!virtio_queue_get_num(vdev, n)) { break; } r = virtio_pci_set_guest_notifier(d, n, assign, with_irqfd); /* 对每个VirtQueue设置irqfd */ if (r < 0) { goto assign_error; } } /* Must set vector notifier after guest notifier has been assigned */ if ((with_irqfd || k->guest_notifier_mask) && assign) { if (with_irqfd) { proxy->vector_irqfd = g_malloc0(sizeof(*proxy->vector_irqfd) * msix_nr_vectors_allocated(&proxy->pci_dev)); r = kvm_virtio_pci_vector_use(proxy, nvqs); if (r < 0) { goto assign_error; } } r = msix_set_vector_notifiers(&proxy->pci_dev, virtio_pci_vector_unmask, virtio_pci_vector_mask, virtio_pci_vector_poll); /* 同样配置pci空间增加msix vector */ if (r < 0) { goto notifiers_error; } } return 0; notifiers_error: if (with_irqfd) { assert(assign); kvm_virtio_pci_vector_release(proxy, nvqs); } assign_error: /* We get here on assignment failure. Recover by undoing for VQs 0 .. n. */ assert(assign); while (--n >= 0) { virtio_pci_set_guest_notifier(d, n, !assign, with_irqfd); } return r; }
virtio的pci配置空间的读写(bar0的IO空间)操作,用于pci配置空间的offset + size的读写操作,并分为单字节,双字节,四字节三种。对于virtio配置空间20字节之前的读写通过io port完成,20字节之后的通过mmio完成。
static const MemoryRegionOps virtio_pci_config_ops = { .read = virtio_pci_config_read, .write = virtio_pci_config_write, .impl = { .min_access_size = 1, .max_access_size = 4, }, .endianness = DEVICE_LITTLE_ENDIAN, };