feat(observability): F13d Prometheus + OpenTelemetry instrumentation
v2-ci / build (push) Failing after 38s
v2-ci / unit-tests (tests/Core/ZB.MOM.WW.OtOpcUa.Cluster.Tests) (push) Has been skipped
v2-ci / unit-tests (tests/Server/ZB.MOM.WW.OtOpcUa.ControlPlane.Tests) (push) Has been skipped
v2-ci / unit-tests (tests/Server/ZB.MOM.WW.OtOpcUa.OpcUaServer.Tests) (push) Has been skipped
v2-ci / unit-tests (tests/Server/ZB.MOM.WW.OtOpcUa.Runtime.Tests) (push) Has been skipped
v2-ci / unit-tests (tests/Server/ZB.MOM.WW.OtOpcUa.Security.Tests) (push) Has been skipped
v2-ci / integration (push) Has been skipped
v2-ci / build (push) Failing after 38s
v2-ci / unit-tests (tests/Core/ZB.MOM.WW.OtOpcUa.Cluster.Tests) (push) Has been skipped
v2-ci / unit-tests (tests/Server/ZB.MOM.WW.OtOpcUa.ControlPlane.Tests) (push) Has been skipped
v2-ci / unit-tests (tests/Server/ZB.MOM.WW.OtOpcUa.OpcUaServer.Tests) (push) Has been skipped
v2-ci / unit-tests (tests/Server/ZB.MOM.WW.OtOpcUa.Runtime.Tests) (push) Has been skipped
v2-ci / unit-tests (tests/Server/ZB.MOM.WW.OtOpcUa.Security.Tests) (push) Has been skipped
v2-ci / integration (push) Has been skipped
OtOpcUaTelemetry (Commons/Observability) centralizes the project's Meter + ActivitySource so all instrumentation points emit through a single named surface. Counters cover the hot paths: otopcua.deploy.applied (outcome=ack|reject) otopcua.deploy.apply.duration (s, histogram) otopcua.driver.lifecycle (event=spawn|spawn_stub|stop|fault) otopcua.virtualtag.eval (outcome=ok|fail|skip) otopcua.scriptedalarm.transition (state=activated|acknowledged|cleared) otopcua.opcua.sink.write (kind=value|alarm|rebuild) otopcua.redundancy.service_level_change (level=byte) Plus two ActivitySource spans: otopcua.deploy.apply wraps DriverHostActor.ApplyAndAck otopcua.opcua.address_space_rebuild wraps OpcUaPublishActor.HandleRebuild Instruments are no-op until a listener attaches, so tests + dev hosts pay nothing for unread telemetry. Host Program.cs gains AddOtOpcUaObservability() (binds the OtOpcUa Meter + ActivitySource to OpenTelemetry, attaches a Prometheus exporter) and MapOtOpcUaMetrics() (mounts /metrics scrape endpoint). Driver-side internals + ASP.NET request metrics deliberately stay off — the scrape payload is scoped to OtOpcUa signals only. Tests use MeterListener + ActivityListener to verify VirtualTagActor.eval, OpcUaPublishActor.AttributeValueUpdate, and RebuildAddressSpace actually emit on the central instruments. Runtime suite is 72 / 72 green (+3). Closes #105. Path A (F13b/c/d) complete; next batch options: #85 UNS folder hierarchy in SDK, or F8b/F9b production engine bindings.
This commit is contained in:
@@ -1,3 +1,4 @@
|
||||
using System.Diagnostics;
|
||||
using Akka.Actor;
|
||||
using Akka.Cluster.Tools.PublishSubscribe;
|
||||
using Akka.Event;
|
||||
@@ -5,6 +6,7 @@ using Microsoft.EntityFrameworkCore;
|
||||
using ZB.MOM.WW.OtOpcUa.Commons.Interfaces;
|
||||
using ZB.MOM.WW.OtOpcUa.Commons.Messages.Deploy;
|
||||
using ZB.MOM.WW.OtOpcUa.Commons.Messages.Fleet;
|
||||
using ZB.MOM.WW.OtOpcUa.Commons.Observability;
|
||||
using ZB.MOM.WW.OtOpcUa.Commons.Types;
|
||||
using ZB.MOM.WW.OtOpcUa.Configuration;
|
||||
using ZB.MOM.WW.OtOpcUa.Configuration.Entities;
|
||||
@@ -239,6 +241,12 @@ public sealed class DriverHostActor : ReceiveActor, IWithTimers
|
||||
_applyingDeploymentId = deploymentId;
|
||||
Become(Applying);
|
||||
|
||||
using var span = OtOpcUaTelemetry.StartDeployApplySpan(deploymentId.ToString());
|
||||
span?.SetTag("otopcua.node_id", _localNode.ToString());
|
||||
span?.SetTag("otopcua.revision", revision.ToString());
|
||||
span?.SetTag("otopcua.correlation_id", correlation.ToString());
|
||||
var sw = Stopwatch.StartNew();
|
||||
|
||||
// Persist Applying row (idempotent on PK).
|
||||
UpsertNodeDeploymentState(deploymentId, NodeDeploymentStatus.Applying, failureReason: null);
|
||||
|
||||
@@ -252,6 +260,7 @@ public sealed class DriverHostActor : ReceiveActor, IWithTimers
|
||||
// composition. The publish actor handles the load-compose-diff-apply pipeline; we
|
||||
// just forward the same correlation id so the audit trail joins up.
|
||||
_opcUaPublishActor?.Tell(new ZB.MOM.WW.OtOpcUa.Runtime.OpcUa.OpcUaPublishActor.RebuildAddressSpace(correlation));
|
||||
OtOpcUaTelemetry.DeploymentApplied.Add(1, new KeyValuePair<string, object?>("outcome", "ack"));
|
||||
_log.Info("DriverHost {Node}: applied deployment {Id} (rev {Rev}, children={Count})",
|
||||
_localNode, deploymentId, revision, _children.Count);
|
||||
}
|
||||
@@ -259,10 +268,13 @@ public sealed class DriverHostActor : ReceiveActor, IWithTimers
|
||||
{
|
||||
UpsertNodeDeploymentState(deploymentId, NodeDeploymentStatus.Failed, ex.Message);
|
||||
SendAck(deploymentId, ApplyAckOutcome.Failed, ex.Message, correlation);
|
||||
OtOpcUaTelemetry.DeploymentApplied.Add(1, new KeyValuePair<string, object?>("outcome", "reject"));
|
||||
span?.SetStatus(ActivityStatusCode.Error, ex.Message);
|
||||
_log.Error(ex, "DriverHost {Node}: apply of {Id} failed", _localNode, deploymentId);
|
||||
}
|
||||
finally
|
||||
{
|
||||
OtOpcUaTelemetry.DeploymentApplyDurationSec.Record(sw.Elapsed.TotalSeconds);
|
||||
_applyingDeploymentId = null;
|
||||
Become(Steady);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user