Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions compute/src/main/java/org/zstack/compute/vm/VmInstanceBase.java
Original file line number Diff line number Diff line change
Expand Up @@ -7226,6 +7226,7 @@ protected void scripts() {
self.setZoneUuid(spec.getDestHost().getZoneUuid());
}
}.execute());
syncVmDevicesAddressInfo(self.getUuid());
logger.debug(String.format("vm[uuid:%s] is running ..", self.getUuid()));
VmInstanceInventory inv = VmInstanceInventory.valueOf(self);
extEmitter.afterStartVm(inv);
Expand Down Expand Up @@ -7495,6 +7496,9 @@ public void run(FlowTrigger trigger, Map data) {
self.setHypervisorType(spec.getDestHost().getHypervisorType());
self.setRootVolumeUuid(spec.getDestRootVolume().getUuid());
});
if (struct.getStrategy() == VmCreationStrategy.InstantStart) {
syncVmDevicesAddressInfo(self.getUuid());
}
logger.debug(String.format("vm[uuid:%s] is started ..", self.getUuid()));
VmInstanceInventory inv = VmInstanceInventory.valueOf(self);
extEmitter.afterStartNewCreatedVm(inv);
Expand Down Expand Up @@ -7931,6 +7935,7 @@ public void handle(Map data) {
public void done() {
self = changeVmStateInDb(VmInstanceStateEvent.running,
() -> self.setHostUuid(originalCopy.getHostUuid()));
syncVmDevicesAddressInfo(self.getUuid());
VmInstanceInventory inv = VmInstanceInventory.valueOf(self);
extEmitter.afterRebootVm(inv);
new StaticIpOperator().deleteIpChange(self.getUuid());
Expand Down Expand Up @@ -8347,6 +8352,7 @@ protected void resumeVm(final Message msg, Completion completion) {
@Override
public void handle(Map Data) {
self = changeVmStateInDb(VmInstanceStateEvent.running);
syncVmDevicesAddressInfo(self.getUuid());
completion.success();
}
}).error(new FlowErrorHandler(completion) {
Expand Down Expand Up @@ -8467,6 +8473,28 @@ public String getName() {
});
}

/**
 * Sends a {@code SyncVmDeviceInfoMsg} for the given VM to the host the VM is
 * currently recorded on. Does nothing when the VM has no host uuid.
 * <p>
 * The call is fire-and-forget: the reply is only logged (warn on failure,
 * debug on success) and never propagated to the caller.
 *
 * @param vmUuid uuid of the VM whose device info should be synced
 */
private void syncVmDevicesAddressInfo(String vmUuid) {
    // Snapshot the host uuid once. `self` is re-assigned on state changes, so
    // re-reading it later (in particular inside the asynchronous reply callback)
    // could observe a different or null host than the one the message was sent to.
    final String hostUuid = self.getHostUuid();
    if (hostUuid == null) {
        return;
    }

    SyncVmDeviceInfoMsg msg = new SyncVmDeviceInfoMsg();
    msg.setVmInstanceUuid(vmUuid);
    msg.setHostUuid(hostUuid);
    bus.makeTargetServiceIdByResourceUuid(msg, HostConstant.SERVICE_ID, hostUuid);
    bus.send(msg, new CloudBusCallBack(msg) {
        @Override
        public void run(MessageReply reply) {
            if (!reply.isSuccess()) {
                logger.warn(String.format("Failed to sync vm device info for vm[uuid:%s], %s",
                        vmUuid, reply.getError()));
            } else {
                logger.debug(String.format("Sent SyncVmDeviceInfoMsg for vm[uuid:%s] on host[uuid:%s]",
                        vmUuid, hostUuid));
            }
        }
    });
}

private void deleteVmCdRom(String cdRomUuid, Completion completion) {
boolean exist = dbf.isExist(cdRomUuid, VmCdRomVO.class);
if (!exist) {
Expand Down
2 changes: 1 addition & 1 deletion conf/i18n/globalErrorCodeMapping/global-error-en_US.json
Original file line number Diff line number Diff line change
Expand Up @@ -3374,7 +3374,7 @@
"ORG_ZSTACK_NETWORK_HUAWEI_IMASTER_10019": "delete token of SDN controller [IP:%s] failed because %s",
"ORG_ZSTACK_STORAGE_PRIMARY_BLOCK_10004": "Cannot execute volume mapping to host flow due to invalid volume ID.%s",
"ORG_ZSTACK_NETWORK_SERVICE_PORTFORWARDING_10007": "port forwarding rule [uuid:%s] has not been attached to any virtual machine network interface, cannot detach",
"ORG_ZSTACK_MEVOCO_10088": "cannot take a snapshot for volumes[%s] when volume[uuid: %s] is not attached",
"ORG_ZSTACK_MEVOCO_10088": "cannot create snapshot for volume[uuid:%s] because it is not attached to any VM instance. Please attach the volume to a VM first. Affected volumes: %s",
"ORG_ZSTACK_STORAGE_PRIMARY_BLOCK_10005": "Cannot execute map LUN to host flow due to invalid LUN type: %s",
"ORG_ZSTACK_NETWORK_SERVICE_PORTFORWARDING_10008": "port forwarding rule [uuid:%s] has been associated with vm nic [uuid:%s], cannot be reassigned again",
"ORG_ZSTACK_MEVOCO_10087": "A Running VM[uuid:%s] has no associated Host UUID.",
Expand Down
2 changes: 1 addition & 1 deletion conf/i18n/globalErrorCodeMapping/global-error-zh_CN.json
Original file line number Diff line number Diff line change
Expand Up @@ -3374,7 +3374,7 @@
"ORG_ZSTACK_NETWORK_HUAWEI_IMASTER_10019": "删除 SDN 控制器 [IP:%s] 的令牌失败,因为 %s",
"ORG_ZSTACK_STORAGE_PRIMARY_BLOCK_10004": "无法执行映射LUN到主机流程,无效的LUN ID",
"ORG_ZSTACK_NETWORK_SERVICE_PORTFORWARDING_10007": "端口转发规则 rule[uuid:%s] 没有绑定到任何 VM 的网卡上,无法解除绑定",
"ORG_ZSTACK_MEVOCO_10088": "无法为挂载状态以外的卷[%s]创建快照",
"ORG_ZSTACK_MEVOCO_10088": "无法为云盘[uuid:%s]创建快照,因为该云盘未挂载到任何云主机。请先将云盘挂载到云主机后再创建快照。相关云盘: %s",
"ORG_ZSTACK_STORAGE_PRIMARY_BLOCK_10005": "无法执行映射LUN到主机流程,无效的LUN类型",
"ORG_ZSTACK_NETWORK_SERVICE_PORTFORWARDING_10008": "端口转发规则[uuid:%s]已绑定到VM网卡[uuid:%s],无法再次绑定",
"ORG_ZSTACK_MEVOCO_10087": "如何一个运行中的VM[uuid:%s]没有宿主机uuid?",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -847,7 +847,9 @@ public void setBootMode(String bootMode) {

/**
 * Returns the size to allocate for the root disk.
 * <p>
 * When a root disk offering is set, its disk size is used. Otherwise the size
 * is derived from the image: the larger of the image's size and its actual
 * size.
 *
 * @return the root disk allocation size
 */
public long getRootDiskAllocateSize() {
    if (rootDiskOffering != null) {
        return rootDiskOffering.getDiskSize();
    }

    long virtualSize = this.getImageSpec().getInventory().getSize();
    // Held as a boxed value so that a missing actual size falls back to the
    // virtual size instead of failing on unboxing; this is harmless if the
    // getter returns a primitive. NOTE(review): confirm the getter's return type.
    Long actualSize = this.getImageSpec().getInventory().getActualSize();
    return actualSize == null ? virtualSize : Math.max(virtualSize, actualSize);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,7 @@ public enum VmInstanceState {
new Transaction(VmInstanceStateEvent.destroyed, VmInstanceState.Destroyed),
new Transaction(VmInstanceStateEvent.destroying, VmInstanceState.Destroying),
new Transaction(VmInstanceStateEvent.running, VmInstanceState.Running),
new Transaction(VmInstanceStateEvent.stopped, VmInstanceState.Stopped),
new Transaction(VmInstanceStateEvent.expunging, VmInstanceState.Expunging)
);
Destroyed.transactions(
Expand Down
62 changes: 60 additions & 2 deletions plugin/kvm/src/main/java/org/zstack/kvm/KvmVmSyncPingTask.java
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,12 @@ public class KvmVmSyncPingTask extends VmTracer implements KVMPingAgentNoFailure
private List<Class> skipVmTracerReplies = new ArrayList<>();
private Map<String, Integer> vmInShutdownMap = new ConcurrentHashMap<>();

// Orphaned skip entries from departed MN nodes. Key=vmUuid, Value=timestamp when orphaned.
// These VMs remain in skip-trace state for ORPHAN_TTL_MS to avoid false HA triggers
// when a MN restarts and its in-flight VM operations haven't completed yet. See ZSTAC-80821.
private final ConcurrentHashMap<String, Long> orphanedSkipVms = new ConcurrentHashMap<>();
private static final long ORPHAN_TTL_MS = 10 * 60 * 1000; // 10 minutes
Comment on lines +72 to +76
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

🧩 Analysis chain

🏁 Script executed:

find . -name "KVMGlobalConfig.java" -type f

Repository: MatheMatrix/zstack

Length of output: 124


🏁 Script executed:

rg -n "orphanedVmSkipTimeout|orphanedSkipTimeout|ORPHAN.*TIMEOUT" --type java -A 2 -B 2

Repository: MatheMatrix/zstack

Length of output: 552


🏁 Script executed:

rg -n "class KVMGlobalConfig" --type java -A 50

Repository: MatheMatrix/zstack

Length of output: 6922


🏁 Script executed:

cat ./plugin/kvm/src/main/java/org/zstack/kvm/KVMGlobalConfig.java

Repository: MatheMatrix/zstack

Length of output: 9331


🏁 Script executed:

rg -n "ORPHAN_TTL_MS|orphanedVmSkipTimeout|orphanedSkipTimeout" --type java -B 3 -A 3

Repository: MatheMatrix/zstack

Length of output: 4660


建议将硬编码的超时常量改为 KVMGlobalConfig 配置项

当前使用硬编码常量 ORPHAN_TTL_MS = 10 * 60 * 1000 限制了运维灵活性。建议在 KVMGlobalConfig 中添加可配置的超时参数(如 kvm.orphanedVmSkipTimeout),使不同环境可以根据实际需求调整孤立VM的保留时间。

🤖 Prompt for AI Agents
In `@plugin/kvm/src/main/java/org/zstack/kvm/KvmVmSyncPingTask.java` around lines
72 - 76, The hardcoded ORPHAN_TTL_MS in KvmVmSyncPingTask reduces operational
flexibility; replace it with a KVMGlobalConfig entry (e.g.,
kvm.orphanedVmSkipTimeout) and read that value at startup (and/or subscribe to
updates) instead of using the fixed constant. Update KvmVmSyncPingTask to
retrieve the timeout from the new config key and use that value when managing
orphanedSkipVms/expiry checks (keep a sensible default of 10*60*1000 ms if the
config is missing), and ensure every reference to ORPHAN_TTL_MS is replaced
with the config-backed field or accessor so the TTL becomes configurable at
runtime.


{
getReflections().getTypesAnnotatedWith(SkipVmTracer.class).forEach(clz -> {
skipVmTracerMessages.add(clz.asSubclass(Message.class));
Expand Down Expand Up @@ -196,8 +202,13 @@ private void syncVm(final HostInventory host, final Completion completion) {
// Get vms to skip before send command to host to confirm the vm will be skipped after sync command finished.
// The problem is if one vm-sync skipped operation is started and finished during vm sync command's handling
// vm state would still be sync to mn
// ZSTAC-80821: clean up expired orphaned entries each sync cycle
cleanupExpiredOrphanedSkipVms();

Set<String> vmsToSkipSetHostSide = new HashSet<>();
vmsToSkip.values().forEach(vmsToSkipSetHostSide::addAll);
// ZSTAC-80821: also skip VMs from departed MN nodes that are still within TTL
vmsToSkipSetHostSide.addAll(orphanedSkipVms.keySet());

// if the vm is not running on host when sync command executing but started as soon as possible
// before response handling of vm sync, mgmtSideStates will including the running vm but not result in
Expand Down Expand Up @@ -228,6 +239,8 @@ public void run(MessageReply reply) {

// Get vms to skip after sync result returned.
vmsToSkip.values().forEach(vmsToSkipSetHostSide::addAll);
// ZSTAC-80821: include orphaned entries from departed MN nodes
vmsToSkipSetHostSide.addAll(orphanedSkipVms.keySet());

Collection<String> vmUuidsInDeleteVmGC = DeleteVmGC.queryVmInGC(host.getUuid(), ret.getStates().keySet());

Expand Down Expand Up @@ -446,7 +459,19 @@ public void nodeJoin(ManagementNodeInventory inv) {
/**
 * Handles a management node leaving: forgets its VM API registrations and
 * moves the VMs in its skip list to the orphaned set, stamped with the
 * current time.
 *
 * @param inv the management node that left
 */
@Override
public void nodeLeft(ManagementNodeInventory inv) {
    vmApis.remove(inv.getUuid());

    // ZSTAC-80821: Instead of immediately dropping skip list entries, move them
    // to the orphaned set with a TTL. This prevents false HA triggers for VMs that
    // are still being started by kvmagent but whose controlling MN has restarted.
    // The skip list must be removed exactly once here: a second remove would
    // return null and no VM would ever be orphaned.
    Set<String> skippedVms = vmsToSkip.remove(inv.getUuid());
    if (skippedVms == null || skippedVms.isEmpty()) {
        return;
    }

    long now = System.currentTimeMillis();
    for (String vmUuid : skippedVms) {
        orphanedSkipVms.put(vmUuid, now);
        logger.info(String.format("moved VM[uuid:%s] from departed MN[uuid:%s] skip list to orphaned set" +
                " (will expire in %d minutes)", vmUuid, inv.getUuid(), ORPHAN_TTL_MS / 60000));
    }
}

@Override
Expand All @@ -460,6 +485,39 @@ public void iJoin(ManagementNodeInventory inv) {
}

/**
 * Tells whether state tracing must be skipped for the given VM.
 * <p>
 * A VM is skipped when any management node's skip set contains it, or when it
 * is in the orphaned set and its entry is younger than {@code ORPHAN_TTL_MS}.
 * An orphaned entry found to be expired is removed as a side effect.
 *
 * @param vmUuid uuid of the VM to check
 * @return {@code true} if the VM must not be traced, {@code false} otherwise
 */
public boolean isVmDoNotNeedToTrace(String vmUuid) {
    if (vmsToSkip.values().stream().anyMatch(vmsToSkipSet -> vmsToSkipSet.contains(vmUuid))) {
        return true;
    }

    // ZSTAC-80821: Also check orphaned skip entries from departed MN nodes
    Long orphanedAt = orphanedSkipVms.get(vmUuid);
    if (orphanedAt == null) {
        return false;
    }

    if (System.currentTimeMillis() - orphanedAt < ORPHAN_TTL_MS) {
        logger.debug(String.format("VM[uuid:%s] is in orphaned skip set, skipping trace", vmUuid));
        return true;
    }

    // Expired: remove only if the entry still holds the timestamp we read, so
    // an entry re-added concurrently with a newer timestamp is kept.
    orphanedSkipVms.remove(vmUuid, orphanedAt);
    logger.info(String.format("orphaned skip entry for VM[uuid:%s] expired after %d minutes, resuming trace",
            vmUuid, ORPHAN_TTL_MS / 60000));
    return false;
}

/**
 * Drops every orphaned skip entry whose age has reached {@code ORPHAN_TTL_MS}.
 * Invoked from the VM sync cycle.
 */
private void cleanupExpiredOrphanedSkipVms() {
    if (!orphanedSkipVms.isEmpty()) {
        final long current = System.currentTimeMillis();
        orphanedSkipVms.forEach((vmUuid, orphanedAt) -> {
            if (current - orphanedAt < ORPHAN_TTL_MS) {
                // still within its TTL, keep it
                return;
            }
            // conditional remove: leaves the entry alone if it was refreshed meanwhile
            orphanedSkipVms.remove(vmUuid, orphanedAt);
            logger.info(String.format("cleaned up expired orphaned skip entry for VM[uuid:%s]", vmUuid));
        });
    }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,10 @@ public List<ActiveVolumeClient> getActiveClients(String installPath, String prot
if (VolumeProtocol.CBD.toString().equals(protocol)) {
GetVolumeClientsCmd cmd = new GetVolumeClientsCmd();
cmd.setPath(installPath);
GetVolumeClientsRsp rsp = syncHttpCall(GET_VOLUME_CLIENTS_PATH, cmd, GetVolumeClientsRsp.class);
GetVolumeClientsRsp rsp = new HttpCaller<>(GET_VOLUME_CLIENTS_PATH, cmd, GetVolumeClientsRsp.class,
null, TimeUnit.SECONDS, 30, true)
.setTryNext(true)
.syncCall();
List<ActiveVolumeClient> clients = new ArrayList<>();

if (!rsp.isSuccess()) {
Expand Down Expand Up @@ -1411,6 +1414,11 @@ public class HttpCaller<T extends AgentResponse> {

private boolean tryNext = false;

/**
 * Sets the {@code tryNext} flag of this caller.
 *
 * @param tryNext the new flag value
 * @return this caller, so the call can be chained
 */
HttpCaller<T> setTryNext(boolean tryNext) {
this.tryNext = tryNext;
return this;
}

/**
 * Convenience constructor that delegates to the seven-argument constructor,
 * passing {@code null}, {@code 0} and {@code false} for the three trailing
 * arguments.
 * NOTE(review): judging by the call in getActiveClients the trailing arguments
 * look like a time unit, a timeout and a boolean option - confirm against the
 * full constructor.
 */
public HttpCaller(String path, AgentCommand cmd, Class<T> retClass, ReturnValueCompletion<T> callback) {
this(path, cmd, retClass, callback, null, 0, false);
}
Expand Down