OpenStack之虚机热迁移代码解析
话说虚机迁移分为冷迁移以及热迁移,所谓热迁移用度娘的话说即是:热迁移(Live Migration,又叫动态迁移、实时迁移),即虚机保存/恢复(Save/Restore):将整个虚拟机的运行状态完整保存下来,同时可以快速地恢复到原有硬件平台甚至是不同硬件平台上。恢复以后,虚机仍旧平滑运行,用户不会察觉到任何差异。OpenStack的虚机迁移是基于Libvirt实现的,下面来看看OpenStack虚机热迁移的具体代码实现。
首先,由API入口进入到nova/api/openstack/compute/contrib/admin_actions.py
1 @wsgi.action(‘os-migrateLive‘) 2 def _migrate_live(self, req, id, body): 3 """Permit admins to (live) migrate a server to a new host.""" 4 context = req.environ["nova.context"] 5 authorize(context, ‘migrateLive‘) 6 7 try: 8 block_migration = body["os-migrateLive"]["block_migration"] 9 disk_over_commit = body["os-migrateLive"]["disk_over_commit"] 10 host = body["os-migrateLive"]["host"] 11 except (TypeError, KeyError): 12 msg = _("host, block_migration and disk_over_commit must " 13 "be specified for live migration.") 14 raise exc.HTTPBadRequest(explanation=msg) 15 16 try: 17 block_migration = strutils.bool_from_string(block_migration, 18 strict=True) 19 disk_over_commit = strutils.bool_from_string(disk_over_commit, 20 strict=True) 21 except ValueError as err: 22 raise exc.HTTPBadRequest(explanation=str(err)) 23 24 try: 25 instance = self.compute_api.get(context, id, want_objects=True) 26 self.compute_api.live_migrate(context, instance, block_migration, 27 disk_over_commit, host) 28 except (exception.ComputeServiceUnavailable, 29 exception.InvalidHypervisorType, 30 exception.UnableToMigrateToSelf, 31 exception.DestinationHypervisorTooOld, 32 exception.NoValidHost, 33 exception.InvalidLocalStorage, 34 exception.InvalidSharedStorage, 35 exception.MigrationPreCheckError) as ex: 36 raise exc.HTTPBadRequest(explanation=ex.format_message()) 37 except exception.InstanceNotFound as e: 38 raise exc.HTTPNotFound(explanation=e.format_message()) 39 except exception.InstanceInvalidState as state_error: 40 common.raise_http_conflict_for_instance_invalid_state(state_error, 41 ‘os-migrateLive‘) 42 except Exception: 43 if host is None: 44 msg = _("Live migration of instance %s to another host " 45 "failed") % id 46 else: 47 msg = _("Live migration of instance %(id)s to host %(host)s " 48 "failed") % {‘id‘: id, ‘host‘: host} 49 LOG.exception(msg) 50 # Return messages from scheduler 51 raise exc.HTTPBadRequest(explanation=msg) 52 53 return webob.Response(status_int=202)
这里第一行可以看到是与API文档的第二行照应的:
1 { 2 "os-migrateLive": { 3 "host": "0443e9a1254044d8b99f35eace132080", 4 "block_migration": false, 5 "disk_over_commit": false 6 } 7 }
好了,源码中其实执行迁移工作的就是第26、27行的一条语句:
1 self.compute_api.live_migrate(context, instance, block_migration, 2 disk_over_commit, host)
由这句进入到nova/compute/api.py中,源码如下:
1 @check_instance_cell 2 @check_instance_state(vm_state=[vm_states.ACTIVE]) 3 def live_migrate(self, context, instance, block_migration, 4 disk_over_commit, host_name): 5 """Migrate a server lively to a new host.""" 6 LOG.debug(_("Going to try to live migrate instance to %s"), 7 host_name or "another host", instance=instance) 8 9 instance.task_state = task_states.MIGRATING 10 instance.save(expected_task_state=[None]) 11 12 self.compute_task_api.live_migrate_instance(context, instance, 13 host_name, block_migration=block_migration, 14 disk_over_commit=disk_over_commit)
第2行是一个装饰器,用于在进入API方法之前检测虚拟机和/或任务的状态,如果实例处于错误的状态,将会引发异常;接下来(第9、10行)将虚机的任务状态(task_state)置为MIGRATING并保存,然后由第12行进入nova/conductor/api.py:
1 def live_migrate_instance(self, context, instance, host_name, 2 block_migration, disk_over_commit): 3 scheduler_hint = {‘host‘: host_name} 4 self._manager.migrate_server( 5 context, instance, scheduler_hint, True, False, None, 6 block_migration, disk_over_commit, None)
将主机名存入字典scheduler_hint中,然后调用nova/conductor/manager.py中的migrate_server方法:
1 def migrate_server(self, context, instance, scheduler_hint, live, rebuild, 2 flavor, block_migration, disk_over_commit, reservations=None): 3 if instance and not isinstance(instance, instance_obj.Instance): 4 # NOTE(danms): Until v2 of the RPC API, we need to tolerate 5 # old-world instance objects here 6 attrs = [‘metadata‘, ‘system_metadata‘, ‘info_cache‘, 7 ‘security_groups‘] 8 instance = instance_obj.Instance._from_db_object( 9 context, instance_obj.Instance(), instance, 10 expected_attrs=attrs) 11 if live and not rebuild and not flavor: 12 self._live_migrate(context, instance, scheduler_hint, 13 block_migration, disk_over_commit) 14 elif not live and not rebuild and flavor: 15 instance_uuid = instance[‘uuid‘] 16 with compute_utils.EventReporter(context, self.db, 17 ‘cold_migrate‘, instance_uuid): 18 self._cold_migrate(context, instance, flavor, 19 scheduler_hint[‘filter_properties‘], 20 reservations) 21 else: 22 raise NotImplementedError()
由于在nova/conductor/api.py中传过来的参数是
1 self._manager.migrate_server( 2 context, instance, scheduler_hint, True, False, None, 3 block_migration, disk_over_commit, None)
因此live是True,rebuild是False,flavor是None,执行第11~13行代码:
1 if live and not rebuild and not flavor: 2 self._live_migrate(context, instance, scheduler_hint, 3 block_migration, disk_over_commit)
_live_migrate代码如下:
1 def _live_migrate(self, context, instance, scheduler_hint, 2 block_migration, disk_over_commit): 3 destination = scheduler_hint.get("host") 4 try: 5 live_migrate.execute(context, instance, destination, 6 block_migration, disk_over_commit) 7 except (exception.NoValidHost, 8 exception.ComputeServiceUnavailable, 9 exception.InvalidHypervisorType, 10 exception.InvalidCPUInfo, 11 exception.UnableToMigrateToSelf, 12 exception.DestinationHypervisorTooOld, 13 exception.InvalidLocalStorage, 14 exception.InvalidSharedStorage, 15 exception.HypervisorUnavailable, 16 exception.MigrationPreCheckError) as ex: 17 with excutils.save_and_reraise_exception(): 18 #TODO(johngarbutt) - eventually need instance actions here 19 request_spec = {‘instance_properties‘: { 20 ‘uuid‘: instance[‘uuid‘], }, 21 } 22 scheduler_utils.set_vm_state_and_notify(context, 23 ‘compute_task‘, ‘migrate_server‘, 24 dict(vm_state=instance[‘vm_state‘], 25 task_state=None, 26 expected_task_state=task_states.MIGRATING,), 27 ex, request_spec, self.db) 28 except Exception as ex: 29 LOG.error(_(‘Migration of instance %(instance_id)s to host‘ 30 ‘ %(dest)s unexpectedly failed.‘), 31 {‘instance_id‘: instance[‘uuid‘], ‘dest‘: destination}, 32 exc_info=True) 33 raise exception.MigrationError(reason=ex)
首先,第三行中将主机名赋给destination,然后执行迁移,后面的都是异常的捕捉,执行迁移的代码分为两部分,先看第一部分,在nova/conductor/tasks/live_migrate.py的184行左右:
1 def execute(context, instance, destination, 2 block_migration, disk_over_commit): 3 task = LiveMigrationTask(context, instance, 4 destination, 5 block_migration, 6 disk_over_commit) 7 #TODO(johngarbutt) create a superclass that contains a safe_execute call 8 return task.execute()
先创建一个LiveMigrationTask任务对象(代码中的TODO注释只是说明以后打算抽出一个包含safe_execute调用的超类,目前并未实现),然后返回task.execute()的结果,也即执行迁移的第二部分代码,在54行左右:
1 def execute(self): 2 self._check_instance_is_running() 3 self._check_host_is_up(self.source) 4 5 if not self.destination: 6 self.destination = self._find_destination() 7 else: 8 self._check_requested_destination() 9 10 #TODO(johngarbutt) need to move complexity out of compute manager 11 return self.compute_rpcapi.live_migration(self.context, 12 host=self.source, 13 instance=self.instance, 14 dest=self.destination, 15 block_migration=self.block_migration, 16 migrate_data=self.migrate_data) 17 #TODO(johngarbutt) disk_over_commit?
这里有三部分内容:
- 如果没有指定目标主机,则由调度算法选取一个目标主机,并且进行相关的检测,确保能够进行实时迁移操作;
- 如果目标主机存在,则直接进行相关的检测操作,确保能够进行实时迁移操作;
- 执行迁移操作。
前两部分不再赘述,直接看第三部分代码,在nova/compute/rpcapi.py中:
1 def live_migration(self, ctxt, instance, dest, block_migration, host, 2 migrate_data=None): 3 # NOTE(russellb) Havana compat 4 version = self._get_compat_version(‘3.0‘, ‘2.0‘) 5 instance_p = jsonutils.to_primitive(instance) 6 cctxt = self.client.prepare(server=host, version=version) 7 cctxt.cast(ctxt, ‘live_migration‘, instance=instance_p, 8 dest=dest, block_migration=block_migration, 9 migrate_data=migrate_data)
热迁移开始执行:
1 def live_migration(self, context, instance, dest, 2 post_method, recover_method, block_migration=False, 3 migrate_data=None): 4 """Spawning live_migration operation for distributing high-load. 5 6 :param context: security context 7 :param instance: 8 nova.db.sqlalchemy.models.Instance object 9 instance object that is migrated. 10 :param dest: destination host 11 :param post_method: 12 post operation method. 13 expected nova.compute.manager.post_live_migration. 14 :param recover_method: 15 recovery method when any exception occurs. 16 expected nova.compute.manager.recover_live_migration. 17 :param block_migration: if true, do block migration. 18 :param migrate_data: implementation specific params 19 20 """ 21 22 greenthread.spawn(self._live_migration, context, instance, dest, 23 post_method, recover_method, block_migration, 24 migrate_data)
这个方法中建立一个绿色线程来运行方法_live_migration,来执行实时迁移; 主要是调用libvirt python接口方法migrateToURI(对应libvirt C API中的virDomainMigrateToURI),来实现从当前主机迁移domain对象到给定的目标主机;
spawn:建立一个绿色线程来运行方法“func(*args, **kwargs)”,这里就是来运行方法_live_migration;
_live_migration:执行实时迁移; 主要是调用libvirt python接口方法migrateToURI(对应libvirt C API中的virDomainMigrateToURI),来实现从当前主机迁移domain对象到给定的目标主机;
接着在绿色线程中调用_live_migration方法:
1 def _live_migration(self, context, instance, dest, post_method, 2 recover_method, block_migration=False, 3 migrate_data=None): 4 """Do live migration. 5 6 :param context: security context 7 :param instance: 8 nova.db.sqlalchemy.models.Instance object 9 instance object that is migrated. 10 :param dest: destination host 11 :param post_method: 12 post operation method. 13 expected nova.compute.manager.post_live_migration. 14 :param recover_method: 15 recovery method when any exception occurs. 16 expected nova.compute.manager.recover_live_migration. 17 :param block_migration: if true, do block migration. 18 :param migrate_data: implementation specific params 19 """ 20 21 # Do live migration. 22 try: 23 if block_migration: 24 flaglist = CONF.libvirt.block_migration_flag.split(‘,‘) 25 else: 26 flaglist = CONF.libvirt.live_migration_flag.split(‘,‘) 27 flagvals = [getattr(libvirt, x.strip()) for x in flaglist] 28 logical_sum = reduce(lambda x, y: x | y, flagvals) 29 30 dom = self._lookup_by_name(instance["name"]) 31 dom.migrateToURI(CONF.libvirt.live_migration_uri % dest, 32 logical_sum, 33 None, 34 CONF.libvirt.live_migration_bandwidth) 35 36 except Exception as e: 37 with excutils.save_and_reraise_exception(): 38 LOG.error(_("Live Migration failure: %s"), e, 39 instance=instance) 40 recover_method(context, instance, dest, block_migration) 41 42 # Waiting for completion of live_migration. 43 timer = loopingcall.FixedIntervalLoopingCall(f=None)
1 if block_migration: 2 flaglist = CONF.libvirt.block_migration_flag.split(‘,‘)
这里获取块迁移标志列表;block_migration_flag这个配置项定义了块迁移时使用的迁移标志(以逗号分隔)。
1 else: 2 flaglist = CONF.libvirt.live_migration_flag.split(‘,‘) 3 flagvals = [getattr(libvirt, x.strip()) for x in flaglist] 4 logical_sum = reduce(lambda x, y: x | y, flagvals)
这部分获取实时迁移标志列表,live_migration_flag这个参数定义了实时迁移的迁移标志;随后通过getattr把每个标志名转换成libvirt模块中对应的常量值,再用按位或(|)把它们合并成一个整数logical_sum,作为迁移调用的flags参数。
1 dom = self._lookup_by_name(instance["name"])
根据给定的实例名称检索libvirt域对象。
1 timer = loopingcall.FixedIntervalLoopingCall(f=None)
创建一个固定时间间隔的循环调用(FixedIntervalLoopingCall)定时器,用于等待实时迁移完成(此处摘录的代码到这一行为止,定时器回调函数的设置及启动部分未列出)。
热迁移代码部分至此结束。