fix(runtime): materialise from applied artifact + restore served state on bootstrap
v2-ci / build (push) Failing after 47s
v2-ci / unit-tests (tests/Core/ZB.MOM.WW.OtOpcUa.Cluster.Tests) (push) Has been skipped
v2-ci / unit-tests (tests/Server/ZB.MOM.WW.OtOpcUa.ControlPlane.Tests) (push) Has been skipped
v2-ci / unit-tests (tests/Server/ZB.MOM.WW.OtOpcUa.OpcUaServer.Tests) (push) Has been skipped
v2-ci / unit-tests (tests/Server/ZB.MOM.WW.OtOpcUa.Runtime.Tests) (push) Has been skipped
v2-ci / unit-tests (tests/Server/ZB.MOM.WW.OtOpcUa.Security.Tests) (push) Has been skipped
v2-ci / integration (tests/Server/ZB.MOM.WW.OtOpcUa.Host.IntegrationTests) (push) Has been skipped
v2-ci / integration (tests/Server/ZB.MOM.WW.OtOpcUa.OpcUaServer.IntegrationTests) (push) Has been skipped
v2-ci / build (push) Failing after 47s
v2-ci / unit-tests (tests/Core/ZB.MOM.WW.OtOpcUa.Cluster.Tests) (push) Has been skipped
v2-ci / unit-tests (tests/Server/ZB.MOM.WW.OtOpcUa.ControlPlane.Tests) (push) Has been skipped
v2-ci / unit-tests (tests/Server/ZB.MOM.WW.OtOpcUa.OpcUaServer.Tests) (push) Has been skipped
v2-ci / unit-tests (tests/Server/ZB.MOM.WW.OtOpcUa.Runtime.Tests) (push) Has been skipped
v2-ci / unit-tests (tests/Server/ZB.MOM.WW.OtOpcUa.Security.Tests) (push) Has been skipped
v2-ci / integration (tests/Server/ZB.MOM.WW.OtOpcUa.Host.IntegrationTests) (push) Has been skipped
v2-ci / integration (tests/Server/ZB.MOM.WW.OtOpcUa.OpcUaServer.IntegrationTests) (push) Has been skipped
Two ordering/lifecycle gaps surfaced once tag values began streaming:

1. OpcUaPublishActor.HandleRebuild loaded the latest *Sealed* artifact, but the rebuild fires at apply time — before this deployment seals — so it materialised the PREVIOUS revision while SubscribeBulk subscribed to the applied one. The two disagreed (4 variables materialised vs 396 subscribed) and every config needed two deploys. RebuildAddressSpace now carries the applied DeploymentId and the rebuild loads that exact artifact.

2. On restart a node recovered its revision from NodeDeploymentState but left the driver children + address space empty (and an identical-config redeploy no-ops on the unchanged revision), so a rebuilt node served nothing until a config change. Bootstrap now calls RestoreApplied: re-spawn drivers, rebuild from the applied artifact, re-push SubscribeBulk — no re-ack.

Verified live: recreating the driver nodes auto-restores all 396 galaxy mirror tags across 40 machines with Good live values, no deploy required.
This commit is contained in:
@@ -173,6 +173,11 @@ public sealed class DriverHostActor : ReceiveActor, IWithTimers
|
||||
_currentRevision = revision;
|
||||
_log.Info("DriverHost {Node}: recovered Applied state at rev {Rev}", _localNode, revision);
|
||||
Become(Steady);
|
||||
// The revision is recovered but the in-memory driver children + OPC UA address
|
||||
// space were lost on restart. Re-spawn + re-materialise + re-subscribe from the
|
||||
// applied deployment so a restarted/rebuilt node restores its served state instead
|
||||
// of waiting for a config change (whose identical-config revision would no-op).
|
||||
RestoreApplied(new DeploymentId(latest.DeploymentId));
|
||||
break;
|
||||
case NodeDeploymentStatus.Applying:
|
||||
_log.Warning("DriverHost {Node}: found orphan Applying row for deployment {Id}; replaying",
|
||||
@@ -310,7 +315,7 @@ public sealed class DriverHostActor : ReceiveActor, IWithTimers
|
||||
// Trigger the OPC UA address-space rebuild so the local SDK reflects the new
|
||||
// composition. The publish actor handles the load-compose-diff-apply pipeline; we
|
||||
// just forward the same correlation id so the audit trail joins up.
|
||||
_opcUaPublishActor?.Tell(new ZB.MOM.WW.OtOpcUa.Runtime.OpcUa.OpcUaPublishActor.RebuildAddressSpace(correlation));
|
||||
_opcUaPublishActor?.Tell(new ZB.MOM.WW.OtOpcUa.Runtime.OpcUa.OpcUaPublishActor.RebuildAddressSpace(correlation, deploymentId));
|
||||
// SubscribeBulk pass: hand each driver its desired tag references so live values flow into
|
||||
// the just-rebuilt address space instead of staying BadWaitingForInitialData.
|
||||
PushDesiredSubscriptions(deploymentId);
|
||||
@@ -371,6 +376,31 @@ public sealed class DriverHostActor : ReceiveActor, IWithTimers
|
||||
foreach (var spec in plan.ToSpawn) SpawnChild(spec);
|
||||
}
|
||||
|
||||
/// <summary>
/// Rebuilds the in-memory served state for a deployment that is already Applied, for use
/// right after a process restart. <see cref="Bootstrap"/> only recovers
/// <see cref="_currentRevision"/> from NodeDeploymentState; the driver children and the
/// OPC UA address space live in memory and do not survive a restart. Without this step a
/// restarted node would keep serving an empty address space until the next config change,
/// because an identical-config redeploy no-ops on the unchanged revision.
/// </summary>
/// <remarks>
/// Steps, in order: re-spawn the drivers, ask the publish actor to rebuild the address space
/// from the applied artifact, then re-push SubscribeBulk. Nothing is re-acked, since the
/// deployment is already Applied. The restore is best-effort: any exception is logged as a
/// warning and swallowed.
/// </remarks>
/// <param name="deploymentId">Id of the already-applied deployment to restore from.</param>
private void RestoreApplied(DeploymentId deploymentId)
{
    var correlationId = CorrelationId.NewId();

    try
    {
        // 1. Driver children first, so there is something to subscribe against.
        ReconcileDrivers(deploymentId);

        // 2. Address-space rebuild, pinned to the applied deployment's artifact.
        if (_opcUaPublishActor is { } publisher)
        {
            publisher.Tell(
                new ZB.MOM.WW.OtOpcUa.Runtime.OpcUa.OpcUaPublishActor.RebuildAddressSpace(correlationId, deploymentId));
        }

        // 3. Hand each driver its desired tag references again.
        PushDesiredSubscriptions(deploymentId);

        _log.Info("DriverHost {Node}: restored served state for applied deployment {Id} on bootstrap", _localNode, deploymentId);
    }
    catch (Exception restoreFailure)
    {
        _log.Warning(restoreFailure, "DriverHost {Node}: failed to restore served state for {Id} on bootstrap", _localNode, deploymentId);
    }
}
|
||||
|
||||
/// <summary>
|
||||
/// SubscribeBulk pass. After an apply, read the deployment's SystemPlatform / Galaxy tags,
|
||||
/// group their dot-form MXAccess references by driver instance, and hand each running driver
|
||||
|
||||
@@ -31,7 +31,13 @@ public sealed class OpcUaPublishActor : ReceiveActor
|
||||
|
||||
public sealed record AttributeValueUpdate(string NodeId, object? Value, OpcUaQuality Quality, DateTime TimestampUtc);
|
||||
public sealed record AlarmStateUpdate(string AlarmNodeId, bool Active, bool Acknowledged, DateTime TimestampUtc);
|
||||
public sealed record RebuildAddressSpace(CorrelationId Correlation);
|
||||
/// <summary>
|
||||
/// Triggers an address-space rebuild. <paramref name="DeploymentId"/> is the deployment
|
||||
/// just applied by the host; the rebuild loads THAT artifact so materialisation matches the
|
||||
/// applied config + the SubscribeBulk pass. It is null only for legacy/dev callers, which
|
||||
/// fall back to the latest sealed deployment (lags a not-yet-sealed apply by one revision).
|
||||
/// </summary>
|
||||
public sealed record RebuildAddressSpace(CorrelationId Correlation, DeploymentId? DeploymentId = null);
|
||||
public sealed record ServiceLevelChanged(byte ServiceLevel);
|
||||
|
||||
private readonly IOpcUaAddressSpaceSink _sink;
|
||||
@@ -196,7 +202,13 @@ public sealed class OpcUaPublishActor : ReceiveActor
|
||||
|
||||
try
|
||||
{
|
||||
var artifact = LoadLatestArtifact();
|
||||
// Prefer the artifact of the deployment the host just applied — at apply time it is not
|
||||
// yet Sealed, so LoadLatestArtifact would return the PREVIOUS revision and materialise a
|
||||
// stale composition (variables that don't match the SubscribeBulk refs). Fall back to
|
||||
// latest-sealed only for legacy callers that don't carry a DeploymentId.
|
||||
var artifact = msg.DeploymentId is { } depId
|
||||
? LoadArtifact(depId)
|
||||
: LoadLatestArtifact();
|
||||
var composition = DeploymentArtifact.ParseComposition(artifact);
|
||||
var plan = Phase7Planner.Compute(_lastApplied, composition);
|
||||
|
||||
@@ -229,6 +241,25 @@ public sealed class OpcUaPublishActor : ReceiveActor
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Fetches the artifact blob of one specific deployment from ConfigDb — the deployment the
/// host just applied, which may not be Sealed yet.
/// </summary>
/// <param name="deploymentId">Id of the deployment whose artifact should be loaded.</param>
/// <returns>
/// The stored artifact bytes, or an empty array when the lookup yields no blob or when any
/// exception occurs (the parser treats an empty blob as "no composition").
/// </returns>
private byte[] LoadArtifact(DeploymentId deploymentId)
{
    try
    {
        using var db = _dbFactory!.CreateDbContext();

        // Read-only lookup: project straight to the blob column, no change tracking.
        var blobQuery =
            from d in db.Deployments.AsNoTracking()
            where d.DeploymentId == deploymentId.Value
            select d.ArtifactBlob;

        var blob = blobQuery.FirstOrDefault();
        return blob ?? Array.Empty<byte>();
    }
    catch (Exception loadFailure)
    {
        _log.Warning(loadFailure, "OpcUaPublish: failed to load artifact for deployment {Id}; rebuild becomes no-op", deploymentId);
        return Array.Empty<byte>();
    }
}
|
||||
|
||||
/// <summary>Read the most recent <c>Sealed</c> deployment's artifact blob from ConfigDb.
|
||||
/// Empty array on any failure — the parser treats empty blob as "no composition".</summary>
|
||||
private byte[] LoadLatestArtifact()
|
||||
|
||||
Reference in New Issue
Block a user