fix(runtime): materialise from applied artifact + restore served state on bootstrap
v2-ci / build (push) Failing after 47s
v2-ci / unit-tests (tests/Core/ZB.MOM.WW.OtOpcUa.Cluster.Tests) (push) Has been skipped
v2-ci / unit-tests (tests/Server/ZB.MOM.WW.OtOpcUa.ControlPlane.Tests) (push) Has been skipped
v2-ci / unit-tests (tests/Server/ZB.MOM.WW.OtOpcUa.OpcUaServer.Tests) (push) Has been skipped
v2-ci / unit-tests (tests/Server/ZB.MOM.WW.OtOpcUa.Runtime.Tests) (push) Has been skipped
v2-ci / unit-tests (tests/Server/ZB.MOM.WW.OtOpcUa.Security.Tests) (push) Has been skipped
v2-ci / integration (tests/Server/ZB.MOM.WW.OtOpcUa.Host.IntegrationTests) (push) Has been skipped
v2-ci / integration (tests/Server/ZB.MOM.WW.OtOpcUa.OpcUaServer.IntegrationTests) (push) Has been skipped

Two ordering/lifecycle gaps surfaced once tag values began streaming:

1. OpcUaPublishActor.HandleRebuild loaded the latest *Sealed* artifact, but the
   rebuild fires at apply time — before this deployment seals — so it materialised
   the PREVIOUS revision while SubscribeBulk subscribed to the applied one. The two
   disagreed (4 variables materialised vs 396 subscribed) and every config needed
   two deploys. RebuildAddressSpace now carries the applied DeploymentId and the
   rebuild loads that exact artifact.

2. On restart a node recovered its revision from NodeDeploymentState but left the
   driver children + address space empty (and an identical-config redeploy no-ops on
   the unchanged revision), so a rebuilt node served nothing until a config change.
   Bootstrap now calls RestoreApplied: re-spawn drivers, rebuild from the applied
   artifact, re-push SubscribeBulk — no re-ack.

Verified live: recreating the driver nodes auto-restores all 396 galaxy mirror
tags across 40 machines with Good live values, no deploy required.
This commit is contained in:
Joseph Doherty
2026-06-06 12:53:38 -04:00
parent c1ce5833e9
commit b1b3f3ff23
2 changed files with 64 additions and 3 deletions
@@ -173,6 +173,11 @@ public sealed class DriverHostActor : ReceiveActor, IWithTimers
_currentRevision = revision;
_log.Info("DriverHost {Node}: recovered Applied state at rev {Rev}", _localNode, revision);
Become(Steady);
// The revision is recovered but the in-memory driver children + OPC UA address
// space were lost on restart. Re-spawn + re-materialise + re-subscribe from the
// applied deployment so a restarted/rebuilt node restores its served state instead
// of waiting for a config change (whose identical-config revision would no-op).
RestoreApplied(new DeploymentId(latest.DeploymentId));
break;
case NodeDeploymentStatus.Applying:
_log.Warning("DriverHost {Node}: found orphan Applying row for deployment {Id}; replaying",
@@ -310,7 +315,7 @@ public sealed class DriverHostActor : ReceiveActor, IWithTimers
// Trigger the OPC UA address-space rebuild so the local SDK reflects the new
// composition. The publish actor handles the load-compose-diff-apply pipeline; we
// just forward the same correlation id so the audit trail joins up.
_opcUaPublishActor?.Tell(new ZB.MOM.WW.OtOpcUa.Runtime.OpcUa.OpcUaPublishActor.RebuildAddressSpace(correlation));
_opcUaPublishActor?.Tell(new ZB.MOM.WW.OtOpcUa.Runtime.OpcUa.OpcUaPublishActor.RebuildAddressSpace(correlation, deploymentId));
// SubscribeBulk pass: hand each driver its desired tag references so live values flow into
// the just-rebuilt address space instead of staying BadWaitingForInitialData.
PushDesiredSubscriptions(deploymentId);
@@ -371,6 +376,31 @@ public sealed class DriverHostActor : ReceiveActor, IWithTimers
foreach (var spec in plan.ToSpawn) SpawnChild(spec);
}
/// <summary>
/// Brings back everything this node serves for a deployment that is already Applied, for use
/// right after a process restart. <see cref="Bootstrap"/> only recovers
/// <see cref="_currentRevision"/> from NodeDeploymentState; the driver children and the OPC UA
/// address space are in-memory and do not survive the restart, so without this step the node
/// would serve an empty address space until the next config change (an identical-config
/// redeploy no-ops on the unchanged revision). The sequence is: reconcile drivers, ask the
/// publish actor to rebuild from the applied artifact, then re-push SubscribeBulk.
/// No ack is sent because the deployment is already Applied.
/// </summary>
/// <param name="deploymentId">The already-applied deployment whose served state is restored.</param>
private void RestoreApplied(DeploymentId deploymentId)
{
    // Fresh correlation id for this restore; it is handed to the rebuild request below.
    var restoreCorrelation = CorrelationId.NewId();

    try
    {
        // Order matters: drivers first, then the address space, then the subscriptions.
        ReconcileDrivers(deploymentId);

        if (_opcUaPublishActor is { } publisher)
        {
            publisher.Tell(
                new ZB.MOM.WW.OtOpcUa.Runtime.OpcUa.OpcUaPublishActor.RebuildAddressSpace(restoreCorrelation, deploymentId));
        }

        PushDesiredSubscriptions(deploymentId);

        _log.Info("DriverHost {Node}: restored served state for applied deployment {Id} on bootstrap", _localNode, deploymentId);
    }
    catch (Exception restoreFailure)
    {
        // Best effort: a failed restore is logged as a warning and not rethrown.
        _log.Warning(restoreFailure, "DriverHost {Node}: failed to restore served state for {Id} on bootstrap", _localNode, deploymentId);
    }
}
/// <summary>
/// SubscribeBulk pass. After an apply, read the deployment's SystemPlatform / Galaxy tags,
/// group their dot-form MXAccess references by driver instance, and hand each running driver
@@ -31,7 +31,13 @@ public sealed class OpcUaPublishActor : ReceiveActor
public sealed record AttributeValueUpdate(string NodeId, object? Value, OpcUaQuality Quality, DateTime TimestampUtc);
public sealed record AlarmStateUpdate(string AlarmNodeId, bool Active, bool Acknowledged, DateTime TimestampUtc);
public sealed record RebuildAddressSpace(CorrelationId Correlation);
/// <summary>
/// Triggers an address-space rebuild. <paramref name="DeploymentId"/> is the deployment
/// just applied by the host; the rebuild loads THAT artifact so materialisation matches the
/// applied config + the SubscribeBulk pass. It is null only for legacy/dev callers, which
/// fall back to the latest sealed deployment (lags a not-yet-sealed apply by one revision).
/// </summary>
/// <param name="Correlation">Correlation id supplied by the sender of the rebuild request.</param>
/// <param name="DeploymentId">
/// Deployment whose artifact the rebuild should load. Optional and defaults to <c>null</c>,
/// in which case the rebuild uses the latest sealed deployment's artifact instead.
/// </param>
public sealed record RebuildAddressSpace(CorrelationId Correlation, DeploymentId? DeploymentId = null);
public sealed record ServiceLevelChanged(byte ServiceLevel);
private readonly IOpcUaAddressSpaceSink _sink;
@@ -196,7 +202,13 @@ public sealed class OpcUaPublishActor : ReceiveActor
try
{
var artifact = LoadLatestArtifact();
// Prefer the artifact of the deployment the host just applied — at apply time it is not
// yet Sealed, so LoadLatestArtifact would return the PREVIOUS revision and materialise a
// stale composition (variables that don't match the SubscribeBulk refs). Fall back to
// latest-sealed only for legacy callers that don't carry a DeploymentId.
var artifact = msg.DeploymentId is { } depId
? LoadArtifact(depId)
: LoadLatestArtifact();
var composition = DeploymentArtifact.ParseComposition(artifact);
var plan = Phase7Planner.Compute(_lastApplied, composition);
@@ -229,6 +241,25 @@ public sealed class OpcUaPublishActor : ReceiveActor
}
}
/// <summary>
/// Read a specific deployment's artifact blob from ConfigDb. The query filters on the
/// deployment id only (no status filter), so it finds the deployment that was just applied
/// even though it may not be Sealed yet.
/// </summary>
/// <param name="deploymentId">Identifier of the deployment whose artifact blob is loaded.</param>
/// <returns>
/// The stored artifact bytes. An empty array when no ConfigDb factory is available, when no
/// blob is stored for the deployment, or when the read throws — the parser treats an empty
/// blob as "no composition". Every empty-array outcome is logged as a warning.
/// </returns>
private byte[] LoadArtifact(DeploymentId deploymentId)
{
    // Guard explicitly instead of dereferencing with '!' and relying on the broad catch to
    // swallow a NullReferenceException.
    if (_dbFactory is null)
    {
        _log.Warning("OpcUaPublish: no ConfigDb factory available to load artifact for deployment {Id}; rebuild becomes no-op", deploymentId);
        return Array.Empty<byte>();
    }

    try
    {
        using var db = _dbFactory.CreateDbContext();
        var blob = db.Deployments.AsNoTracking()
            .Where(d => d.DeploymentId == deploymentId.Value)
            .Select(d => d.ArtifactBlob)
            .FirstOrDefault();

        if (blob is null)
        {
            // Missing row or null blob: previously this returned empty with no diagnostic.
            _log.Warning("OpcUaPublish: no artifact found for deployment {Id}; rebuild becomes no-op", deploymentId);
            return Array.Empty<byte>();
        }

        return blob;
    }
    catch (Exception ex)
    {
        _log.Warning(ex, "OpcUaPublish: failed to load artifact for deployment {Id}; rebuild becomes no-op", deploymentId);
        return Array.Empty<byte>();
    }
}
/// <summary>Read the most recent <c>Sealed</c> deployment's artifact blob from ConfigDb.
/// Empty array on any failure — the parser treats empty blob as "no composition".</summary>
private byte[] LoadLatestArtifact()