feat(observability): F13d Prometheus + OpenTelemetry instrumentation
v2-ci / build (push) Failing after 38s
v2-ci / unit-tests (tests/Core/ZB.MOM.WW.OtOpcUa.Cluster.Tests) (push) Has been skipped
v2-ci / unit-tests (tests/Server/ZB.MOM.WW.OtOpcUa.ControlPlane.Tests) (push) Has been skipped
v2-ci / unit-tests (tests/Server/ZB.MOM.WW.OtOpcUa.OpcUaServer.Tests) (push) Has been skipped
v2-ci / unit-tests (tests/Server/ZB.MOM.WW.OtOpcUa.Runtime.Tests) (push) Has been skipped
v2-ci / unit-tests (tests/Server/ZB.MOM.WW.OtOpcUa.Security.Tests) (push) Has been skipped
v2-ci / integration (push) Has been skipped
v2-ci / build (push) Failing after 38s
v2-ci / unit-tests (tests/Core/ZB.MOM.WW.OtOpcUa.Cluster.Tests) (push) Has been skipped
v2-ci / unit-tests (tests/Server/ZB.MOM.WW.OtOpcUa.ControlPlane.Tests) (push) Has been skipped
v2-ci / unit-tests (tests/Server/ZB.MOM.WW.OtOpcUa.OpcUaServer.Tests) (push) Has been skipped
v2-ci / unit-tests (tests/Server/ZB.MOM.WW.OtOpcUa.Runtime.Tests) (push) Has been skipped
v2-ci / unit-tests (tests/Server/ZB.MOM.WW.OtOpcUa.Security.Tests) (push) Has been skipped
v2-ci / integration (push) Has been skipped
OtOpcUaTelemetry (Commons/Observability) centralizes the project's Meter + ActivitySource so all instrumentation points emit through a single named surface. Counters cover the hot paths: otopcua.deploy.applied (outcome=ack|reject) otopcua.deploy.apply.duration (s, histogram) otopcua.driver.lifecycle (event=spawn|spawn_stub|stop|fault) otopcua.virtualtag.eval (outcome=ok|fail|skip) otopcua.scriptedalarm.transition (state=activated|acknowledged|cleared) otopcua.opcua.sink.write (kind=value|alarm|rebuild) otopcua.redundancy.service_level_change (level=byte) Plus two ActivitySource spans: otopcua.deploy.apply wraps DriverHostActor.ApplyAndAck otopcua.opcua.address_space_rebuild wraps OpcUaPublishActor.HandleRebuild Instruments are no-op until a listener attaches, so tests + dev hosts pay nothing for unread telemetry. Host Program.cs gains AddOtOpcUaObservability() (binds the OtOpcUa Meter + ActivitySource to OpenTelemetry, attaches a Prometheus exporter) and MapOtOpcUaMetrics() (mounts /metrics scrape endpoint). Driver-side internals + ASP.NET request metrics deliberately stay off — the scrape payload is scoped to OtOpcUa signals only. Tests use MeterListener + ActivityListener to verify VirtualTagActor.eval, OpcUaPublishActor.AttributeValueUpdate, and RebuildAddressSpace actually emit on the central instruments. Runtime suite is 72 / 72 green (+3). Closes #105. Path A (F13b/c/d) complete; next batch options: #85 UNS folder hierarchy in SDK, or F8b/F9b production engine bindings.
This commit is contained in:
@@ -3,6 +3,7 @@ using Akka.Cluster.Tools.PublishSubscribe;
|
||||
using Akka.Event;
|
||||
using Microsoft.EntityFrameworkCore;
|
||||
using ZB.MOM.WW.OtOpcUa.Commons.Messages.Redundancy;
|
||||
using ZB.MOM.WW.OtOpcUa.Commons.Observability;
|
||||
using ZB.MOM.WW.OtOpcUa.Commons.OpcUa;
|
||||
using ZB.MOM.WW.OtOpcUa.Commons.Types;
|
||||
using ZB.MOM.WW.OtOpcUa.Configuration;
|
||||
@@ -124,6 +125,7 @@ public sealed class OpcUaPublishActor : ReceiveActor
|
||||
{
|
||||
_sink.WriteValue(msg.NodeId, msg.Value, msg.Quality, msg.TimestampUtc);
|
||||
Interlocked.Increment(ref _writes);
|
||||
OtOpcUaTelemetry.OpcUaSinkWrite.Add(1, new KeyValuePair<string, object?>("kind", "value"));
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
@@ -137,6 +139,7 @@ public sealed class OpcUaPublishActor : ReceiveActor
|
||||
{
|
||||
_sink.WriteAlarmState(msg.AlarmNodeId, msg.Active, msg.Acknowledged, msg.TimestampUtc);
|
||||
Interlocked.Increment(ref _writes);
|
||||
OtOpcUaTelemetry.OpcUaSinkWrite.Add(1, new KeyValuePair<string, object?>("kind", "alarm"));
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
@@ -146,12 +149,19 @@ public sealed class OpcUaPublishActor : ReceiveActor
|
||||
|
||||
private void HandleRebuild(RebuildAddressSpace msg)
|
||||
{
|
||||
using var span = OtOpcUaTelemetry.StartAddressSpaceRebuildSpan();
|
||||
span?.SetTag("otopcua.correlation_id", msg.Correlation.ToString());
|
||||
|
||||
// Two modes: when dbFactory + applier are wired, do a real diff-and-apply pass against
|
||||
// the latest deployment artifact. Without them, fall back to a raw sink rebuild — the
|
||||
// F10b/dev path before the integration completes.
|
||||
if (_dbFactory is null || _applier is null)
|
||||
{
|
||||
try { _sink.RebuildAddressSpace(); }
|
||||
try
|
||||
{
|
||||
_sink.RebuildAddressSpace();
|
||||
OtOpcUaTelemetry.OpcUaSinkWrite.Add(1, new KeyValuePair<string, object?>("kind", "rebuild"));
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_log.Error(ex, "OpcUaPublish: sink.RebuildAddressSpace threw (correlation={Correlation})",
|
||||
@@ -175,6 +185,7 @@ public sealed class OpcUaPublishActor : ReceiveActor
|
||||
|
||||
var outcome = _applier.Apply(plan);
|
||||
_lastApplied = composition;
|
||||
OtOpcUaTelemetry.OpcUaSinkWrite.Add(1, new KeyValuePair<string, object?>("kind", "rebuild"));
|
||||
_log.Info("OpcUaPublish: applied rebuild (correlation={Correlation}, added={Added}, removed={Removed}, changed={Changed}, rebuild={Rebuild})",
|
||||
msg.Correlation, outcome.AddedNodes, outcome.RemovedNodes, outcome.ChangedNodes, outcome.RebuildCalled);
|
||||
}
|
||||
@@ -211,6 +222,8 @@ public sealed class OpcUaPublishActor : ReceiveActor
|
||||
try
|
||||
{
|
||||
_serviceLevel.Publish(msg.ServiceLevel);
|
||||
OtOpcUaTelemetry.ServiceLevelChange.Add(1,
|
||||
new KeyValuePair<string, object?>("level", msg.ServiceLevel));
|
||||
_log.Debug("OpcUaPublish: ServiceLevel={Level}", msg.ServiceLevel);
|
||||
}
|
||||
catch (Exception ex)
|
||||
|
||||
Reference in New Issue
Block a user