feat(observability): F13d Prometheus + OpenTelemetry instrumentation
v2-ci / build (push) Failing after 38s
v2-ci / unit-tests (tests/Core/ZB.MOM.WW.OtOpcUa.Cluster.Tests) (push) Has been skipped
v2-ci / unit-tests (tests/Server/ZB.MOM.WW.OtOpcUa.ControlPlane.Tests) (push) Has been skipped
v2-ci / unit-tests (tests/Server/ZB.MOM.WW.OtOpcUa.OpcUaServer.Tests) (push) Has been skipped
v2-ci / unit-tests (tests/Server/ZB.MOM.WW.OtOpcUa.Runtime.Tests) (push) Has been skipped
v2-ci / unit-tests (tests/Server/ZB.MOM.WW.OtOpcUa.Security.Tests) (push) Has been skipped
v2-ci / integration (push) Has been skipped

OtOpcUaTelemetry (Commons/Observability) centralizes the project's Meter
+ ActivitySource so all instrumentation points emit through a single
named surface. Counters cover the hot paths:

  otopcua.deploy.applied               (outcome=ack|reject)
  otopcua.deploy.apply.duration        (s, histogram)
  otopcua.driver.lifecycle             (event=spawn|spawn_stub|stop|fault)
  otopcua.virtualtag.eval              (outcome=ok|fail|skip)
  otopcua.scriptedalarm.transition     (state=activated|acknowledged|cleared)
  otopcua.opcua.sink.write             (kind=value|alarm|rebuild)
  otopcua.redundancy.service_level_change (level=byte)

Plus two ActivitySource spans:

  otopcua.deploy.apply                 wraps DriverHostActor.ApplyAndAck
  otopcua.opcua.address_space_rebuild  wraps OpcUaPublishActor.HandleRebuild

Instruments are no-op until a listener attaches, so tests + dev hosts
pay nothing for unread telemetry.

Host Program.cs gains AddOtOpcUaObservability() (binds the OtOpcUa Meter
+ ActivitySource to OpenTelemetry, attaches a Prometheus exporter) and
MapOtOpcUaMetrics() (mounts /metrics scrape endpoint). Driver-side
internals + ASP.NET request metrics deliberately stay off — the scrape
payload is scoped to OtOpcUa signals only.

Tests use MeterListener + ActivityListener to verify
VirtualTagActor.eval, OpcUaPublishActor.AttributeValueUpdate, and
RebuildAddressSpace actually emit on the central instruments. Runtime
suite is 72 / 72 green (+3).

Closes #105. Path A (F13b/c/d) complete; next batch options: #85 UNS
folder hierarchy in SDK, or F8b/F9b production engine bindings.
This commit is contained in:
Joseph Doherty
2026-05-26 10:29:40 -04:00
parent 21eac21409
commit 52997ee164
10 changed files with 352 additions and 3 deletions
@@ -3,6 +3,7 @@ using Akka.Cluster.Tools.PublishSubscribe;
using Akka.Event;
using Microsoft.EntityFrameworkCore;
using ZB.MOM.WW.OtOpcUa.Commons.Messages.Redundancy;
using ZB.MOM.WW.OtOpcUa.Commons.Observability;
using ZB.MOM.WW.OtOpcUa.Commons.OpcUa;
using ZB.MOM.WW.OtOpcUa.Commons.Types;
using ZB.MOM.WW.OtOpcUa.Configuration;
@@ -124,6 +125,7 @@ public sealed class OpcUaPublishActor : ReceiveActor
{
_sink.WriteValue(msg.NodeId, msg.Value, msg.Quality, msg.TimestampUtc);
Interlocked.Increment(ref _writes);
OtOpcUaTelemetry.OpcUaSinkWrite.Add(1, new KeyValuePair<string, object?>("kind", "value"));
}
catch (Exception ex)
{
@@ -137,6 +139,7 @@ public sealed class OpcUaPublishActor : ReceiveActor
{
_sink.WriteAlarmState(msg.AlarmNodeId, msg.Active, msg.Acknowledged, msg.TimestampUtc);
Interlocked.Increment(ref _writes);
OtOpcUaTelemetry.OpcUaSinkWrite.Add(1, new KeyValuePair<string, object?>("kind", "alarm"));
}
catch (Exception ex)
{
@@ -146,12 +149,19 @@ public sealed class OpcUaPublishActor : ReceiveActor
private void HandleRebuild(RebuildAddressSpace msg)
{
using var span = OtOpcUaTelemetry.StartAddressSpaceRebuildSpan();
span?.SetTag("otopcua.correlation_id", msg.Correlation.ToString());
// Two modes: when dbFactory + applier are wired, do a real diff-and-apply pass against
// the latest deployment artifact. Without them, fall back to a raw sink rebuild — the
// F10b/dev path before the integration completes.
if (_dbFactory is null || _applier is null)
{
try { _sink.RebuildAddressSpace(); }
try
{
_sink.RebuildAddressSpace();
OtOpcUaTelemetry.OpcUaSinkWrite.Add(1, new KeyValuePair<string, object?>("kind", "rebuild"));
}
catch (Exception ex)
{
_log.Error(ex, "OpcUaPublish: sink.RebuildAddressSpace threw (correlation={Correlation})",
@@ -175,6 +185,7 @@ public sealed class OpcUaPublishActor : ReceiveActor
var outcome = _applier.Apply(plan);
_lastApplied = composition;
OtOpcUaTelemetry.OpcUaSinkWrite.Add(1, new KeyValuePair<string, object?>("kind", "rebuild"));
_log.Info("OpcUaPublish: applied rebuild (correlation={Correlation}, added={Added}, removed={Removed}, changed={Changed}, rebuild={Rebuild})",
msg.Correlation, outcome.AddedNodes, outcome.RemovedNodes, outcome.ChangedNodes, outcome.RebuildCalled);
}
@@ -211,6 +222,8 @@ public sealed class OpcUaPublishActor : ReceiveActor
try
{
_serviceLevel.Publish(msg.ServiceLevel);
OtOpcUaTelemetry.ServiceLevelChange.Add(1,
new KeyValuePair<string, object?>("level", msg.ServiceLevel));
_log.Debug("OpcUaPublish: ServiceLevel={Level}", msg.ServiceLevel);
}
catch (Exception ex)