Files
lmxopcua/src/Server/ZB.MOM.WW.OtOpcUa.Runtime/OpcUa/OpcUaPublishActor.cs
T
Joseph Doherty b1b3f3ff23
v2-ci / build (push) Failing after 47s
v2-ci / unit-tests (tests/Core/ZB.MOM.WW.OtOpcUa.Cluster.Tests) (push) Has been skipped
v2-ci / unit-tests (tests/Server/ZB.MOM.WW.OtOpcUa.ControlPlane.Tests) (push) Has been skipped
v2-ci / unit-tests (tests/Server/ZB.MOM.WW.OtOpcUa.OpcUaServer.Tests) (push) Has been skipped
v2-ci / unit-tests (tests/Server/ZB.MOM.WW.OtOpcUa.Runtime.Tests) (push) Has been skipped
v2-ci / unit-tests (tests/Server/ZB.MOM.WW.OtOpcUa.Security.Tests) (push) Has been skipped
v2-ci / integration (tests/Server/ZB.MOM.WW.OtOpcUa.Host.IntegrationTests) (push) Has been skipped
v2-ci / integration (tests/Server/ZB.MOM.WW.OtOpcUa.OpcUaServer.IntegrationTests) (push) Has been skipped
fix(runtime): materialise from applied artifact + restore served state on bootstrap
Two ordering/lifecycle gaps surfaced once tag values began streaming:

1. OpcUaPublishActor.HandleRebuild loaded the latest *Sealed* artifact, but the
   rebuild fires at apply time — before this deployment seals — so it materialised
   the PREVIOUS revision while SubscribeBulk subscribed to the applied one. The two
   disagreed (4 variables materialised vs 396 subscribed) and every config needed
   two deploys. RebuildAddressSpace now carries the applied DeploymentId and the
   rebuild loads that exact artifact.

2. On restart a node recovered its revision from NodeDeploymentState but left the
   driver children + address space empty (and an identical-config redeploy no-ops on
   the unchanged revision), so a rebuilt node served nothing until a config change.
   Bootstrap now calls RestoreApplied: re-spawn drivers, rebuild from the applied
   artifact, re-push SubscribeBulk — no re-ack.

Verified live: recreating the driver nodes auto-restores all 396 galaxy mirror
tags across 40 machines with Good live values, no deploy required.
2026-06-06 12:53:38 -04:00

327 lines
15 KiB
C#

using Akka.Actor;
using Akka.Cluster.Tools.PublishSubscribe;
using Akka.Event;
using Microsoft.EntityFrameworkCore;
using ZB.MOM.WW.OtOpcUa.Commons.Messages.Redundancy;
using ZB.MOM.WW.OtOpcUa.Commons.Observability;
using ZB.MOM.WW.OtOpcUa.Commons.OpcUa;
using ZB.MOM.WW.OtOpcUa.Commons.Types;
using ZB.MOM.WW.OtOpcUa.Configuration;
using ZB.MOM.WW.OtOpcUa.OpcUaServer;
using ZB.MOM.WW.OtOpcUa.Runtime.Drivers;
namespace ZB.MOM.WW.OtOpcUa.Runtime.OpcUa;
/// <summary>
/// Single-threaded bridge between Akka messages and the OPC UA SDK address space. Hosted on
/// the pinned <c>opcua-synchronized-dispatcher</c> (Task 19 HOCON) so the OPC UA SDK sees
/// only one thread per actor instance — its session/subscription locks expect strict
/// single-threaded access.
///
/// Address-space writes route through <see cref="IOpcUaAddressSpaceSink"/>; ServiceLevel
/// writes route through <see cref="IServiceLevelPublisher"/>. Production binds SDK-backed
/// implementations; dev/Mac/tests bind the Null* defaults so the actor stays decoupled from
/// <c>Opc.Ua.Server</c>. The remaining piece is wiring those bindings to a real
/// <c>StandardServer</c> address space — tracked as F10b.
/// </summary>
public sealed class OpcUaPublishActor : ReceiveActor
{
    /// <summary>HOCON id of the dispatcher the production <see cref="Props"/> pins this actor to.</summary>
    public const string DispatcherId = "opcua-synchronized-dispatcher";

    /// <summary>Distributed pub/sub topic the actor subscribes to in <see cref="PreStart"/>.</summary>
    public const string RedundancyStateTopic = "redundancy-state";

    /// <summary>Requests a value write for one node; forwarded to <see cref="IOpcUaAddressSpaceSink"/>.</summary>
    public sealed record AttributeValueUpdate(string NodeId, object? Value, OpcUaQuality Quality, DateTime TimestampUtc);

    /// <summary>Requests an alarm-state write for one alarm node; forwarded to <see cref="IOpcUaAddressSpaceSink"/>.</summary>
    public sealed record AlarmStateUpdate(string AlarmNodeId, bool Active, bool Acknowledged, DateTime TimestampUtc);

    /// <summary>
    /// Triggers an address-space rebuild. <paramref name="DeploymentId"/> is the deployment
    /// just applied by the host; the rebuild loads THAT artifact so materialisation matches the
    /// applied config + the SubscribeBulk pass. It is null only for legacy/dev callers, which
    /// fall back to the latest sealed deployment (lags a not-yet-sealed apply by one revision).
    /// </summary>
    public sealed record RebuildAddressSpace(CorrelationId Correlation, DeploymentId? DeploymentId = null);

    /// <summary>Requests that <paramref name="ServiceLevel"/> be published if it differs from the last published level.</summary>
    public sealed record ServiceLevelChanged(byte ServiceLevel);

    private readonly IOpcUaAddressSpaceSink _sink;
    private readonly IServiceLevelPublisher _serviceLevel;
    private readonly bool _subscribeRedundancyTopic;
    private readonly NodeId? _localNode;
    private readonly IDbContextFactory<OtOpcUaConfigDbContext>? _dbFactory;
    private readonly Phase7Applier? _applier;
    private readonly ILoggingAdapter _log = Context.GetLogger();

    private int _writes;
    private byte _lastServiceLevel;

    // Baseline for the planner diff: the composition most recently pushed through the applier.
    // Starts empty so the first rebuild is diffed against "nothing applied".
    private Phase7CompositionResult _lastApplied = new(
        Array.Empty<UnsAreaProjection>(),
        Array.Empty<UnsLineProjection>(),
        Array.Empty<EquipmentNode>(),
        Array.Empty<DriverInstancePlan>(),
        Array.Empty<ScriptedAlarmPlan>(),
        Array.Empty<GalaxyTagPlan>());

    /// <summary>Gets the number of writes performed.</summary>
    // Volatile read: the counter is bumped with Interlocked inside the actor, and this getter
    // is public, so it may be read from outside the actor's thread.
    public int WriteCount => Volatile.Read(ref _writes);

    /// <summary>Gets the last published service level.</summary>
    public byte LastServiceLevel => _lastServiceLevel;

    /// <summary>Production Props — pins the OPC UA dispatcher + subscribes to the
    /// <c>redundancy-state</c> DPS topic so cluster transitions drive the local ServiceLevel
    /// publish path. When <paramref name="dbFactory"/> + <paramref name="applier"/> are supplied,
    /// <see cref="RebuildAddressSpace"/> reads the applied deployment's artifact (or the latest
    /// sealed one when the message carries no deployment id) + drives the applier through the
    /// sink.</summary>
    /// <param name="sink">The OPC UA address space sink.</param>
    /// <param name="serviceLevel">The service level publisher.</param>
    /// <param name="localNode">The local cluster node ID.</param>
    /// <param name="dbFactory">The optional database context factory.</param>
    /// <param name="applier">The optional Phase 7 applier.</param>
    public static Props Props(
        IOpcUaAddressSpaceSink? sink = null,
        IServiceLevelPublisher? serviceLevel = null,
        NodeId? localNode = null,
        IDbContextFactory<OtOpcUaConfigDbContext>? dbFactory = null,
        Phase7Applier? applier = null) =>
        Akka.Actor.Props.Create(() => new OpcUaPublishActor(
            sink ?? NullOpcUaAddressSpaceSink.Instance,
            serviceLevel ?? NullServiceLevelPublisher.Instance,
            // subscribeRedundancyTopic — passed positionally because this lambda is compiled
            // to an expression tree, where named arguments are rejected (CS0853).
            true,
            localNode,
            dbFactory,
            applier)).WithDispatcher(DispatcherId);

    /// <summary>Test-only Props that omits the pinned-dispatcher requirement and skips the
    /// DPS subscribe so unit tests can spin up the actor on a vanilla TestKit cluster.</summary>
    /// <param name="sink">The OPC UA address space sink.</param>
    /// <param name="serviceLevel">The service level publisher.</param>
    /// <param name="subscribeRedundancyTopic">Whether to subscribe to the redundancy topic.</param>
    /// <param name="localNode">The local cluster node ID.</param>
    /// <param name="dbFactory">The optional database context factory.</param>
    /// <param name="applier">The optional Phase 7 applier.</param>
    public static Props PropsForTests(
        IOpcUaAddressSpaceSink? sink = null,
        IServiceLevelPublisher? serviceLevel = null,
        bool subscribeRedundancyTopic = false,
        NodeId? localNode = null,
        IDbContextFactory<OtOpcUaConfigDbContext>? dbFactory = null,
        Phase7Applier? applier = null) =>
        Akka.Actor.Props.Create(() => new OpcUaPublishActor(
            sink ?? NullOpcUaAddressSpaceSink.Instance,
            serviceLevel ?? NullServiceLevelPublisher.Instance,
            subscribeRedundancyTopic,
            localNode,
            dbFactory,
            applier));

    /// <summary>Initializes a new instance of the <see cref="OpcUaPublishActor"/> class.</summary>
    /// <param name="sink">The OPC UA address space sink.</param>
    /// <param name="serviceLevel">The service level publisher.</param>
    /// <param name="subscribeRedundancyTopic">Whether to subscribe to the redundancy topic.</param>
    /// <param name="localNode">The local cluster node ID.</param>
    /// <param name="dbFactory">The optional database context factory.</param>
    /// <param name="applier">The optional Phase 7 applier.</param>
    public OpcUaPublishActor(
        IOpcUaAddressSpaceSink sink,
        IServiceLevelPublisher serviceLevel,
        bool subscribeRedundancyTopic,
        NodeId? localNode,
        IDbContextFactory<OtOpcUaConfigDbContext>? dbFactory = null,
        Phase7Applier? applier = null)
    {
        _sink = sink;
        _serviceLevel = serviceLevel;
        _subscribeRedundancyTopic = subscribeRedundancyTopic;
        _localNode = localNode;
        _dbFactory = dbFactory;
        _applier = applier;

        Receive<AttributeValueUpdate>(HandleAttributeUpdate);
        Receive<AlarmStateUpdate>(HandleAlarmUpdate);
        Receive<RebuildAddressSpace>(HandleRebuild);
        Receive<ServiceLevelChanged>(HandleServiceLevelChanged);
        Receive<RedundancyStateChanged>(HandleRedundancyStateChanged);
        Receive<SubscribeAck>(_ => { /* PubSub ack */ });
    }

    /// <inheritdoc />
    protected override void PreStart()
    {
        if (_subscribeRedundancyTopic)
        {
            DistributedPubSub.Get(Context.System).Mediator.Tell(new Subscribe(RedundancyStateTopic, Self));
        }
    }

    /// <summary>Writes one attribute value through the sink. A throwing sink is logged and
    /// swallowed so a single bad write does not fail the actor; the write counter and
    /// telemetry only advance on success.</summary>
    private void HandleAttributeUpdate(AttributeValueUpdate msg)
    {
        try
        {
            _sink.WriteValue(msg.NodeId, msg.Value, msg.Quality, msg.TimestampUtc);
            Interlocked.Increment(ref _writes);
            OtOpcUaTelemetry.OpcUaSinkWrite.Add(1, new KeyValuePair<string, object?>("kind", "value"));
        }
        catch (Exception ex)
        {
            _log.Warning(ex, "OpcUaPublish: sink.WriteValue threw for {Node}", msg.NodeId);
        }
    }

    /// <summary>Writes one alarm state through the sink, with the same log-and-continue
    /// failure handling as <see cref="HandleAttributeUpdate"/>.</summary>
    private void HandleAlarmUpdate(AlarmStateUpdate msg)
    {
        try
        {
            _sink.WriteAlarmState(msg.AlarmNodeId, msg.Active, msg.Acknowledged, msg.TimestampUtc);
            Interlocked.Increment(ref _writes);
            OtOpcUaTelemetry.OpcUaSinkWrite.Add(1, new KeyValuePair<string, object?>("kind", "alarm"));
        }
        catch (Exception ex)
        {
            _log.Warning(ex, "OpcUaPublish: sink.WriteAlarmState threw for {Node}", msg.AlarmNodeId);
        }
    }

    /// <summary>
    /// Rebuilds the address space. With <c>dbFactory</c> + <c>applier</c> wired this loads the
    /// deployment artifact, diffs it against the last applied composition and applies the
    /// resulting plan; otherwise it asks the sink for a raw rebuild. Exceptions are logged and
    /// swallowed in both modes.
    /// </summary>
    private void HandleRebuild(RebuildAddressSpace msg)
    {
        using var span = OtOpcUaTelemetry.StartAddressSpaceRebuildSpan();
        span?.SetTag("otopcua.correlation_id", msg.Correlation.ToString());

        // Two modes: when dbFactory + applier are wired, do a real diff-and-apply pass against
        // the deployment artifact. Without them, fall back to a raw sink rebuild — the
        // F10b/dev path before the integration completes.
        if (_dbFactory is null || _applier is null)
        {
            try
            {
                _sink.RebuildAddressSpace();
                OtOpcUaTelemetry.OpcUaSinkWrite.Add(1, new KeyValuePair<string, object?>("kind", "rebuild"));
            }
            catch (Exception ex)
            {
                _log.Error(ex, "OpcUaPublish: sink.RebuildAddressSpace threw (correlation={Correlation})",
                    msg.Correlation);
            }

            return;
        }

        try
        {
            // Prefer the artifact of the deployment the host just applied — at apply time it is not
            // yet Sealed, so LoadLatestArtifact would return the PREVIOUS revision and materialise a
            // stale composition (variables that don't match the SubscribeBulk refs). Fall back to
            // latest-sealed only for legacy callers that don't carry a DeploymentId.
            var artifact = msg.DeploymentId is { } depId
                ? LoadArtifact(depId)
                : LoadLatestArtifact();

            // An empty blob means the load failed or no matching row exists. Diffing an empty
            // composition against _lastApplied would plan the removal of everything currently
            // served, so a transient ConfigDb failure must leave the address space untouched.
            if (artifact.Length == 0)
            {
                _log.Warning("OpcUaPublish: no deployment artifact available; keeping current address space (correlation={Correlation})",
                    msg.Correlation);
                return;
            }

            var composition = DeploymentArtifact.ParseComposition(artifact);
            var plan = Phase7Planner.Compute(_lastApplied, composition);
            if (plan.IsEmpty)
            {
                _log.Debug("OpcUaPublish: rebuild requested but plan is empty (correlation={Correlation})",
                    msg.Correlation);
                return;
            }

            var outcome = _applier.Apply(plan);
            _lastApplied = composition;

            // #85 — after the plan diff lands, rebuild the UNS folder hierarchy so OPC UA
            // clients see Area/Line/Equipment as proper folders. Idempotent; Phase7Applier
            // skips folders that already exist with the same node id.
            _applier.MaterialiseHierarchy(composition);

            // Galaxy / SystemPlatform tags get their own pass: ensures their FolderPath folder
            // + Variable node exist so clients can browse them. The Galaxy driver fills values
            // on a future SubscribeBulk pass; until then variables show BadWaitingForInitialData.
            _applier.MaterialiseGalaxyTags(composition);

            OtOpcUaTelemetry.OpcUaSinkWrite.Add(1, new KeyValuePair<string, object?>("kind", "rebuild"));
            _log.Info("OpcUaPublish: applied rebuild (correlation={Correlation}, added={Added}, removed={Removed}, changed={Changed}, rebuild={Rebuild})",
                msg.Correlation, outcome.AddedNodes, outcome.RemovedNodes, outcome.ChangedNodes, outcome.RebuildCalled);
        }
        catch (Exception ex)
        {
            _log.Error(ex, "OpcUaPublish: rebuild pipeline threw (correlation={Correlation})", msg.Correlation);
        }
    }

    /// <summary>Read a specific deployment's artifact blob from ConfigDb (the one just applied,
    /// which may not be Sealed yet). Returns an empty array when the row is missing, its blob is
    /// null, or the query throws — <see cref="HandleRebuild"/> skips the rebuild in that case.</summary>
    private byte[] LoadArtifact(DeploymentId deploymentId)
    {
        try
        {
            using var db = _dbFactory!.CreateDbContext();
            return db.Deployments.AsNoTracking()
                .Where(d => d.DeploymentId == deploymentId.Value)
                .Select(d => d.ArtifactBlob)
                .FirstOrDefault() ?? Array.Empty<byte>();
        }
        catch (Exception ex)
        {
            _log.Warning(ex, "OpcUaPublish: failed to load artifact for deployment {Id}; rebuild becomes no-op", deploymentId);
            return Array.Empty<byte>();
        }
    }

    /// <summary>Read the most recent <c>Sealed</c> deployment's artifact blob from ConfigDb.
    /// Returns an empty array when there is no sealed deployment, its blob is null, or the query
    /// throws — <see cref="HandleRebuild"/> skips the rebuild in that case.</summary>
    private byte[] LoadLatestArtifact()
    {
        try
        {
            using var db = _dbFactory!.CreateDbContext();
            return db.Deployments.AsNoTracking()
                .Where(d => d.Status == Configuration.Enums.DeploymentStatus.Sealed)
                .OrderByDescending(d => d.SealedAtUtc)
                .Select(d => d.ArtifactBlob)
                .FirstOrDefault() ?? Array.Empty<byte>();
        }
        catch (Exception ex)
        {
            _log.Warning(ex, "OpcUaPublish: failed to load latest deployment artifact; rebuild becomes no-op");
            return Array.Empty<byte>();
        }
    }

    /// <summary>Publishes a new ServiceLevel, skipping values equal to the last one that was
    /// published successfully. Publisher exceptions are logged and swallowed.</summary>
    private void HandleServiceLevelChanged(ServiceLevelChanged msg)
    {
        // NOTE(review): _lastServiceLevel starts at 0, so an initial level of 0 is never
        // published — confirm the publisher's own default already reflects that.
        if (msg.ServiceLevel == _lastServiceLevel) return;

        try
        {
            _serviceLevel.Publish(msg.ServiceLevel);

            // Record the level only once the publish succeeded; otherwise a failed publish
            // would be remembered as done and the identical retry would be deduplicated away.
            _lastServiceLevel = msg.ServiceLevel;

            OtOpcUaTelemetry.ServiceLevelChange.Add(1,
                new KeyValuePair<string, object?>("level", msg.ServiceLevel));
            _log.Debug("OpcUaPublish: ServiceLevel={Level}", msg.ServiceLevel);
        }
        catch (Exception ex)
        {
            _log.Warning(ex, "OpcUaPublish: ServiceLevel publisher threw at level {Level}", msg.ServiceLevel);
        }
    }

    /// <summary>
    /// Compute a coarse ServiceLevel from the cluster snapshot and forward to the
    /// <see cref="IServiceLevelPublisher"/>. This is a placeholder for F10b's full health
    /// aggregation — for now we surface "primary-leader → 240, primary → 200, secondary → 100,
    /// detached → 0" so the local SDK at least reflects role state. The full
    /// <see cref="ServiceLevelCalculator"/> path (with DB-reachable, OPC UA probe inputs) lives
    /// in <c>RedundancyStateActor</c> on admin nodes; this driver-side mirror exists so each
    /// node's own SDK exposes a sensible ServiceLevel without round-tripping back through the
    /// admin singleton.
    /// </summary>
    private void HandleRedundancyStateChanged(RedundancyStateChanged msg)
    {
        // Without a local node id there is nothing to look up in the snapshot.
        if (_localNode is null) return;

        var local = msg.Nodes.FirstOrDefault(n => n.NodeId == _localNode.Value);
        if (local is null) return;

        byte level = local.Role switch
        {
            RedundancyRole.Primary when local.IsRoleLeaderForDriver => 240,
            RedundancyRole.Primary => 200,
            RedundancyRole.Secondary => 100,
            RedundancyRole.Detached => 0,
            _ => 0,
        };

        // Routed through the mailbox (not a direct call) so it shares the dedupe + publish
        // path used by externally sent ServiceLevelChanged messages.
        Self.Tell(new ServiceLevelChanged(level));
    }
}