fix(runtime): materialise from applied artifact + restore served state on bootstrap
v2-ci / build (push) Failing after 47s
v2-ci / unit-tests (tests/Core/ZB.MOM.WW.OtOpcUa.Cluster.Tests) (push) Has been skipped
v2-ci / unit-tests (tests/Server/ZB.MOM.WW.OtOpcUa.ControlPlane.Tests) (push) Has been skipped
v2-ci / unit-tests (tests/Server/ZB.MOM.WW.OtOpcUa.OpcUaServer.Tests) (push) Has been skipped
v2-ci / unit-tests (tests/Server/ZB.MOM.WW.OtOpcUa.Runtime.Tests) (push) Has been skipped
v2-ci / unit-tests (tests/Server/ZB.MOM.WW.OtOpcUa.Security.Tests) (push) Has been skipped
v2-ci / integration (tests/Server/ZB.MOM.WW.OtOpcUa.Host.IntegrationTests) (push) Has been skipped
v2-ci / integration (tests/Server/ZB.MOM.WW.OtOpcUa.OpcUaServer.IntegrationTests) (push) Has been skipped
v2-ci / build (push) Failing after 47s
v2-ci / unit-tests (tests/Core/ZB.MOM.WW.OtOpcUa.Cluster.Tests) (push) Has been skipped
v2-ci / unit-tests (tests/Server/ZB.MOM.WW.OtOpcUa.ControlPlane.Tests) (push) Has been skipped
v2-ci / unit-tests (tests/Server/ZB.MOM.WW.OtOpcUa.OpcUaServer.Tests) (push) Has been skipped
v2-ci / unit-tests (tests/Server/ZB.MOM.WW.OtOpcUa.Runtime.Tests) (push) Has been skipped
v2-ci / unit-tests (tests/Server/ZB.MOM.WW.OtOpcUa.Security.Tests) (push) Has been skipped
v2-ci / integration (tests/Server/ZB.MOM.WW.OtOpcUa.Host.IntegrationTests) (push) Has been skipped
v2-ci / integration (tests/Server/ZB.MOM.WW.OtOpcUa.OpcUaServer.IntegrationTests) (push) Has been skipped
Two ordering/lifecycle gaps surfaced once tag values began streaming:

1. OpcUaPublishActor.HandleRebuild loaded the latest *Sealed* artifact, but the rebuild fires at apply time — before this deployment seals — so it materialised the PREVIOUS revision while SubscribeBulk subscribed to the applied one. The two disagreed (4 variables materialised vs 396 subscribed) and every config needed two deploys. RebuildAddressSpace now carries the applied DeploymentId and the rebuild loads that exact artifact.

2. On restart a node recovered its revision from NodeDeploymentState but left the driver children + address space empty (and an identical-config redeploy no-ops on the unchanged revision), so a rebuilt node served nothing until a config change. Bootstrap now calls RestoreApplied: re-spawn drivers, rebuild from the applied artifact, re-push SubscribeBulk — no re-ack.

Verified live: recreating the driver nodes auto-restores all 396 galaxy mirror tags across 40 machines with Good live values, no deploy required.
This commit is contained in:
@@ -173,6 +173,11 @@ public sealed class DriverHostActor : ReceiveActor, IWithTimers
|
||||
_currentRevision = revision;
|
||||
_log.Info("DriverHost {Node}: recovered Applied state at rev {Rev}", _localNode, revision);
|
||||
Become(Steady);
|
||||
// The revision is recovered but the in-memory driver children + OPC UA address
|
||||
// space were lost on restart. Re-spawn + re-materialise + re-subscribe from the
|
||||
// applied deployment so a restarted/rebuilt node restores its served state instead
|
||||
// of waiting for a config change (whose identical-config revision would no-op).
|
||||
RestoreApplied(new DeploymentId(latest.DeploymentId));
|
||||
break;
|
||||
case NodeDeploymentStatus.Applying:
|
||||
_log.Warning("DriverHost {Node}: found orphan Applying row for deployment {Id}; replaying",
|
||||
@@ -310,7 +315,7 @@ public sealed class DriverHostActor : ReceiveActor, IWithTimers
|
||||
// Trigger the OPC UA address-space rebuild so the local SDK reflects the new
|
||||
// composition. The publish actor handles the load-compose-diff-apply pipeline; we
|
||||
// just forward the same correlation id so the audit trail joins up.
|
||||
_opcUaPublishActor?.Tell(new ZB.MOM.WW.OtOpcUa.Runtime.OpcUa.OpcUaPublishActor.RebuildAddressSpace(correlation));
|
||||
_opcUaPublishActor?.Tell(new ZB.MOM.WW.OtOpcUa.Runtime.OpcUa.OpcUaPublishActor.RebuildAddressSpace(correlation, deploymentId));
|
||||
// SubscribeBulk pass: hand each driver its desired tag references so live values flow into
|
||||
// the just-rebuilt address space instead of staying BadWaitingForInitialData.
|
||||
PushDesiredSubscriptions(deploymentId);
|
||||
@@ -371,6 +376,31 @@ public sealed class DriverHostActor : ReceiveActor, IWithTimers
|
||||
foreach (var spec in plan.ToSpawn) SpawnChild(spec);
|
||||
}
|
||||
|
||||
/// <summary>
/// Brings a restarted node back to serving its already-applied deployment.
/// <see cref="Bootstrap"/> recovers the revision from NodeDeploymentState, but the driver
/// children and the OPC UA address space only exist in memory, so they are gone after a
/// process restart. Without this step the node would serve an empty address space until the
/// next config change (an identical-config redeploy no-ops on the unchanged revision).
/// </summary>
/// <remarks>
/// Runs, in order: driver reconciliation for the deployment, an address-space rebuild request
/// to the publish actor (when one is wired up) under a fresh correlation id, and the
/// SubscribeBulk push. Nothing is re-acknowledged because the deployment is already Applied.
/// Any exception is logged as a warning and not rethrown, so a failed restore does not abort
/// bootstrap.
/// </remarks>
/// <param name="deploymentId">The applied deployment whose served state is restored.</param>
private void RestoreApplied(DeploymentId deploymentId)
{
    var correlation = CorrelationId.NewId();

    try
    {
        ReconcileDrivers(deploymentId);

        // Rebuild from the exact applied artifact; skipped when no publish actor is present.
        if (_opcUaPublishActor != null)
        {
            var rebuildRequest = new ZB.MOM.WW.OtOpcUa.Runtime.OpcUa.OpcUaPublishActor.RebuildAddressSpace(correlation, deploymentId);
            _opcUaPublishActor.Tell(rebuildRequest);
        }

        PushDesiredSubscriptions(deploymentId);

        _log.Info("DriverHost {Node}: restored served state for applied deployment {Id} on bootstrap", _localNode, deploymentId);
    }
    catch (Exception restoreFailure)
    {
        // Best-effort: report the failure and carry on.
        _log.Warning(restoreFailure, "DriverHost {Node}: failed to restore served state for {Id} on bootstrap", _localNode, deploymentId);
    }
}
|
||||
|
||||
/// <summary>
|
||||
/// SubscribeBulk pass. After an apply, read the deployment's SystemPlatform / Galaxy tags,
|
||||
/// group their dot-form MXAccess references by driver instance, and hand each running driver
|
||||
|
||||
Reference in New Issue
Block a user