feat(observability): F13d Prometheus + OpenTelemetry instrumentation
Some checks failed
v2-ci / build (push) Failing after 38s
v2-ci / unit-tests (tests/Core/ZB.MOM.WW.OtOpcUa.Cluster.Tests) (push) Has been skipped
v2-ci / unit-tests (tests/Server/ZB.MOM.WW.OtOpcUa.ControlPlane.Tests) (push) Has been skipped
v2-ci / unit-tests (tests/Server/ZB.MOM.WW.OtOpcUa.OpcUaServer.Tests) (push) Has been skipped
v2-ci / unit-tests (tests/Server/ZB.MOM.WW.OtOpcUa.Runtime.Tests) (push) Has been skipped
v2-ci / unit-tests (tests/Server/ZB.MOM.WW.OtOpcUa.Security.Tests) (push) Has been skipped
v2-ci / integration (push) Has been skipped

OtOpcUaTelemetry (Commons/Observability) centralizes the project's Meter
+ ActivitySource so all instrumentation points emit through a single
named surface. The instruments (six counters and one histogram) cover the hot paths:

  otopcua.deploy.applied               (outcome=ack|reject)
  otopcua.deploy.apply.duration        (s, histogram)
  otopcua.driver.lifecycle             (event=spawn|spawn_stub|stop|fault)
  otopcua.virtualtag.eval              (outcome=ok|fail|skip)
  otopcua.scriptedalarm.transition     (state=activated|acknowledged|cleared)
  otopcua.opcua.sink.write             (kind=value|alarm|rebuild)
  otopcua.redundancy.service_level_change (level=byte)

Plus two ActivitySource spans:

  otopcua.deploy.apply                 wraps DriverHostActor.ApplyAndAck
  otopcua.opcua.address_space_rebuild  wraps OpcUaPublishActor.HandleRebuild

Instruments are no-op until a listener attaches, so tests + dev hosts
pay nothing for unread telemetry.

Host Program.cs gains AddOtOpcUaObservability() (binds the OtOpcUa Meter
+ ActivitySource to OpenTelemetry, attaches a Prometheus exporter) and
MapOtOpcUaMetrics() (mounts /metrics scrape endpoint). Driver-side
internals + ASP.NET request metrics deliberately stay off — the scrape
payload is scoped to OtOpcUa signals only.

Tests use MeterListener + ActivityListener to verify
VirtualTagActor.eval, OpcUaPublishActor.AttributeValueUpdate, and
RebuildAddressSpace actually emit on the central instruments. Runtime
suite is 72 / 72 green (+3).

Closes #105. Path A (F13b/c/d) complete; next batch options: #85 UNS
folder hierarchy in SDK, or F8b/F9b production engine bindings.
This commit is contained in:
Joseph Doherty
2026-05-26 10:29:40 -04:00
parent 21eac21409
commit 52997ee164
10 changed files with 352 additions and 3 deletions

View File

@@ -0,0 +1,81 @@
using System.Diagnostics;
using System.Diagnostics.Metrics;
namespace ZB.MOM.WW.OtOpcUa.Commons.Observability;
/// <summary>
/// Central <see cref="Meter"/> + <see cref="ActivitySource"/> definitions for OtOpcUa.
/// All Akka actors, the OPC UA publish path, and the deploy coordinator emit through these
/// pre-created instruments so a single OpenTelemetry / Prometheus binding in <c>Host</c>
/// catches everything. No exporter is required — instruments are no-op until a listener
/// attaches, so tests and dev hosts pay nothing for instrumentation that nobody scrapes.
///
/// Instrument names follow the OpenTelemetry semantic convention pattern
/// <c>otopcua.&lt;subsystem&gt;.&lt;event&gt;</c>. Subsystem is one of: deploy, driver,
/// virtualtag, scriptedalarm, opcua, redundancy.
/// </summary>
public static class OtOpcUaTelemetry
{
    /// <summary>Name of the shared <see cref="Meter"/>; a host subscribes to this name to collect the instruments below.</summary>
    public const string MeterName = "ZB.MOM.WW.OtOpcUa";

    /// <summary>Name of the shared <see cref="ActivitySource"/>; a host subscribes to this name to collect the spans below.</summary>
    public const string ActivitySourceName = "ZB.MOM.WW.OtOpcUa";

    /// <summary>Singleton <see cref="Meter"/> all counters/histograms hang off.</summary>
    public static readonly Meter Meter = new(MeterName);

    /// <summary>Singleton <see cref="ActivitySource"/> used to start spans wrapping deploy/apply/rebuild.</summary>
    public static readonly ActivitySource ActivitySource = new(ActivitySourceName);

    // ---------------- Deployment / driver-host coordination ----------------

    /// <summary>
    /// Incremented every time DriverHostActor finishes applying a deployment.
    /// Emitters tag each increment with <c>outcome</c> = <c>ack</c> or <c>reject</c>.
    /// </summary>
    public static readonly Counter<long> DeploymentApplied =
        Meter.CreateCounter<long>("otopcua.deploy.applied", unit: "{deployment}",
            description: "Deployments applied by a driver-role node (outcome=ack|reject).");

    /// <summary>
    /// Time, in seconds, from DriverHostActor receiving DispatchDeployment to emitting the ack/reject.
    /// </summary>
    public static readonly Histogram<double> DeploymentApplyDurationSec =
        Meter.CreateHistogram<double>("otopcua.deploy.apply.duration", unit: "s",
            description: "Driver-role apply latency from DispatchDeployment → Ack/Reject.");

    /// <summary>
    /// DriverInstanceActor lifecycle event count. Emitters tag each increment with
    /// <c>event</c> (<c>spawn</c> for a normal start, <c>spawn_stub</c> for a stubbed start,
    /// <c>stop</c> on PostStop, <c>fault</c>) and <c>driver_type</c>.
    /// </summary>
    public static readonly Counter<long> DriverInstanceLifecycle =
        Meter.CreateCounter<long>("otopcua.driver.lifecycle", unit: "{event}",
            // The description must list every tag value the actors emit, including spawn_stub.
            description: "DriverInstanceActor lifecycle transitions (event=spawn|spawn_stub|stop|fault).");

    // ---------------- VirtualTag / ScriptedAlarm engines ----------------

    /// <summary>
    /// Virtual-tag evaluation attempts. Emitters tag each increment with
    /// <c>outcome</c> = <c>ok</c>, <c>fail</c> or <c>skip</c>.
    /// </summary>
    public static readonly Counter<long> VirtualTagEval =
        Meter.CreateCounter<long>("otopcua.virtualtag.eval", unit: "{eval}",
            description: "Virtual-tag evaluations attempted (outcome=ok|fail|skip).");

    /// <summary>
    /// Scripted-alarm state transitions, tagged with <c>state</c>. The emitter supplies the
    /// lower-cased transition kind as the tag value.
    /// NOTE(review): confirm the values listed in the description match what the emitter produces.
    /// </summary>
    public static readonly Counter<long> ScriptedAlarmTransition =
        Meter.CreateCounter<long>("otopcua.scriptedalarm.transition", unit: "{transition}",
            description: "Scripted-alarm state transitions (state=active|acknowledged|inactive).");

    // ---------------- OPC UA address-space + redundancy ----------------

    /// <summary>
    /// Successful writes into the OPC UA address-space sink. Emitters tag each increment with
    /// <c>kind</c> = <c>value</c>, <c>alarm</c> or <c>rebuild</c>.
    /// </summary>
    public static readonly Counter<long> OpcUaSinkWrite =
        Meter.CreateCounter<long>("otopcua.opcua.sink.write", unit: "{write}",
            description: "Writes that landed in IOpcUaAddressSpaceSink (kind=value|alarm|rebuild).");

    /// <summary>
    /// ServiceLevel publications. Emitters tag each increment with <c>level</c> set to the
    /// published service-level value.
    /// </summary>
    public static readonly Counter<long> ServiceLevelChange =
        Meter.CreateCounter<long>("otopcua.redundancy.service_level_change", unit: "{change}",
            description: "OPC UA Server.ServiceLevel transitions emitted by the redundancy state.");

    // ---------------- Convenience helpers ----------------

    /// <summary>
    /// Starts an <c>otopcua.deploy.apply</c> span tagged with the deployment id. Caller disposes to close.
    /// </summary>
    /// <param name="deploymentId">Value stored in the <c>otopcua.deployment_id</c> tag.</param>
    /// <returns>
    /// The started activity, or <c>null</c> when no listener is sampling this source, so the
    /// call site stays cheap on undecorated builds.
    /// </returns>
    public static Activity? StartDeployApplySpan(string deploymentId)
    {
        var activity = ActivitySource.StartActivity("otopcua.deploy.apply", ActivityKind.Internal);
        activity?.SetTag("otopcua.deployment_id", deploymentId);
        return activity;
    }

    /// <summary>Span wrapping a full OPC UA address-space rebuild (Phase7 plan → apply).</summary>
    /// <returns>The started activity, or <c>null</c> when no listener is sampling this source.</returns>
    public static Activity? StartAddressSpaceRebuildSpan()
        => ActivitySource.StartActivity("otopcua.opcua.address_space_rebuild", ActivityKind.Internal);
}

View File

@@ -0,0 +1,38 @@
using OpenTelemetry.Metrics;
using OpenTelemetry.Trace;
using ZB.MOM.WW.OtOpcUa.Commons.Observability;
namespace ZB.MOM.WW.OtOpcUa.Host.Observability;
/// <summary>
/// Wires the OtOpcUa Meter + ActivitySource into OpenTelemetry and exposes a Prometheus
/// scrape endpoint at <c>/metrics</c> on the host pipeline. F13d slice — only the meter +
/// activity source declared in <see cref="OtOpcUaTelemetry"/> are surfaced; per-Akka
/// internals + ASP.NET request metrics stay off by default to keep the scrape payload
/// scoped to OtOpcUa-owned signals.
/// </summary>
public static class ObservabilityExtensions
{
    /// <summary>
    /// Registers OpenTelemetry for the host: metrics collect the OtOpcUa meter and expose it
    /// through the Prometheus exporter; tracing subscribes to the OtOpcUa activity source.
    /// No trace exporter is configured here.
    /// </summary>
    /// <param name="services">The host service collection.</param>
    /// <returns>The same <paramref name="services"/> instance, for chaining.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="services"/> is null.</exception>
    public static IServiceCollection AddOtOpcUaObservability(this IServiceCollection services)
    {
        ArgumentNullException.ThrowIfNull(services);

        services.AddOpenTelemetry()
            .WithMetrics(metrics => metrics
                .AddMeter(OtOpcUaTelemetry.MeterName)
                .AddPrometheusExporter())
            .WithTracing(tracing => tracing
                .AddSource(OtOpcUaTelemetry.ActivitySourceName));

        return services;
    }

    /// <summary>
    /// Mounts the Prometheus scrape endpoint at <c>/metrics</c> on the existing ASP.NET pipeline.
    /// By default the endpoint carries no authorization metadata, which suits local Prometheus
    /// scrapes. Where this call sits relative to <c>UseAuthentication</c>/<c>UseAuthorization</c>
    /// does not by itself protect the endpoint; pass <paramref name="requireAuthorization"/> =
    /// <c>true</c> to attach the default authorization policy to it.
    /// </summary>
    /// <param name="app">The endpoint route builder to map onto.</param>
    /// <param name="requireAuthorization">
    /// When <c>true</c>, the endpoint requires the default authorization policy. The host must
    /// then have authorization services and middleware configured.
    /// </param>
    /// <returns>The same <paramref name="app"/> instance, for chaining.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="app"/> is null.</exception>
    public static IEndpointRouteBuilder MapOtOpcUaMetrics(
        this IEndpointRouteBuilder app,
        bool requireAuthorization = false)
    {
        ArgumentNullException.ThrowIfNull(app);

        var scrapeEndpoint = app.MapPrometheusScrapingEndpoint("/metrics");
        if (requireAuthorization)
        {
            // Authorization is driven by endpoint metadata, so it has to be attached here.
            scrapeEndpoint.RequireAuthorization();
        }

        return app;
    }
}

View File

@@ -11,6 +11,7 @@ using ZB.MOM.WW.OtOpcUa.Commons.OpcUa;
using ZB.MOM.WW.OtOpcUa.Host;
using ZB.MOM.WW.OtOpcUa.Host.Drivers;
using ZB.MOM.WW.OtOpcUa.Host.Health;
using ZB.MOM.WW.OtOpcUa.Host.Observability;
using ZB.MOM.WW.OtOpcUa.Host.OpcUa;
using ZB.MOM.WW.OtOpcUa.OpcUaServer.Security;
using ZB.MOM.WW.OtOpcUa.Runtime;
@@ -94,6 +95,7 @@ if (hasAdmin)
}
builder.Services.AddOtOpcUaHealth();
builder.Services.AddOtOpcUaObservability();
var app = builder.Build();
app.UseSerilogRequestLogging();
@@ -109,6 +111,7 @@ if (hasAdmin)
}
app.MapOtOpcUaHealth();
app.MapOtOpcUaMetrics();
Log.Information("OtOpcUa.Host starting with roles=[{Roles}] (admin={HasAdmin}, driver={HasDriver})",
string.Join(",", roles), hasAdmin, hasDriver);

View File

@@ -15,6 +15,8 @@
<PrivateAssets>all</PrivateAssets>
<IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
</PackageReference>
<PackageReference Include="OpenTelemetry.Extensions.Hosting"/>
<PackageReference Include="OpenTelemetry.Exporter.Prometheus.AspNetCore"/>
</ItemGroup>
<ItemGroup>

View File

@@ -1,3 +1,4 @@
using System.Diagnostics;
using Akka.Actor;
using Akka.Cluster.Tools.PublishSubscribe;
using Akka.Event;
@@ -5,6 +6,7 @@ using Microsoft.EntityFrameworkCore;
using ZB.MOM.WW.OtOpcUa.Commons.Interfaces;
using ZB.MOM.WW.OtOpcUa.Commons.Messages.Deploy;
using ZB.MOM.WW.OtOpcUa.Commons.Messages.Fleet;
using ZB.MOM.WW.OtOpcUa.Commons.Observability;
using ZB.MOM.WW.OtOpcUa.Commons.Types;
using ZB.MOM.WW.OtOpcUa.Configuration;
using ZB.MOM.WW.OtOpcUa.Configuration.Entities;
@@ -239,6 +241,12 @@ public sealed class DriverHostActor : ReceiveActor, IWithTimers
_applyingDeploymentId = deploymentId;
Become(Applying);
using var span = OtOpcUaTelemetry.StartDeployApplySpan(deploymentId.ToString());
span?.SetTag("otopcua.node_id", _localNode.ToString());
span?.SetTag("otopcua.revision", revision.ToString());
span?.SetTag("otopcua.correlation_id", correlation.ToString());
var sw = Stopwatch.StartNew();
// Persist Applying row (idempotent on PK).
UpsertNodeDeploymentState(deploymentId, NodeDeploymentStatus.Applying, failureReason: null);
@@ -252,6 +260,7 @@ public sealed class DriverHostActor : ReceiveActor, IWithTimers
// composition. The publish actor handles the load-compose-diff-apply pipeline; we
// just forward the same correlation id so the audit trail joins up.
_opcUaPublishActor?.Tell(new ZB.MOM.WW.OtOpcUa.Runtime.OpcUa.OpcUaPublishActor.RebuildAddressSpace(correlation));
OtOpcUaTelemetry.DeploymentApplied.Add(1, new KeyValuePair<string, object?>("outcome", "ack"));
_log.Info("DriverHost {Node}: applied deployment {Id} (rev {Rev}, children={Count})",
_localNode, deploymentId, revision, _children.Count);
}
@@ -259,10 +268,13 @@ public sealed class DriverHostActor : ReceiveActor, IWithTimers
{
UpsertNodeDeploymentState(deploymentId, NodeDeploymentStatus.Failed, ex.Message);
SendAck(deploymentId, ApplyAckOutcome.Failed, ex.Message, correlation);
OtOpcUaTelemetry.DeploymentApplied.Add(1, new KeyValuePair<string, object?>("outcome", "reject"));
span?.SetStatus(ActivityStatusCode.Error, ex.Message);
_log.Error(ex, "DriverHost {Node}: apply of {Id} failed", _localNode, deploymentId);
}
finally
{
OtOpcUaTelemetry.DeploymentApplyDurationSec.Record(sw.Elapsed.TotalSeconds);
_applyingDeploymentId = null;
Become(Steady);
}

View File

@@ -1,5 +1,6 @@
using Akka.Actor;
using Akka.Event;
using ZB.MOM.WW.OtOpcUa.Commons.Observability;
using ZB.MOM.WW.OtOpcUa.Commons.OpcUa;
using ZB.MOM.WW.OtOpcUa.Commons.Types;
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
@@ -82,6 +83,9 @@ public sealed class DriverInstanceActor : ReceiveActor, IWithTimers
_driver = driver;
_driverInstanceId = driver.DriverInstanceId;
_reconnectInterval = reconnectInterval;
OtOpcUaTelemetry.DriverInstanceLifecycle.Add(1,
new KeyValuePair<string, object?>("event", startStubbed ? "spawn_stub" : "spawn"),
new KeyValuePair<string, object?>("driver_type", driver.DriverType));
if (startStubbed)
{
Context.GetLogger().Info("[DEV-STUB] driver={Name} type={Type}",
@@ -314,5 +318,8 @@ public sealed class DriverInstanceActor : ReceiveActor, IWithTimers
DetachSubscription();
try { _driver.ShutdownAsync(CancellationToken.None).GetAwaiter().GetResult(); }
catch (Exception ex) { _log.Warning(ex, "DriverInstance {Id}: ShutdownAsync threw on PostStop", _driverInstanceId); }
OtOpcUaTelemetry.DriverInstanceLifecycle.Add(1,
new KeyValuePair<string, object?>("event", "stop"),
new KeyValuePair<string, object?>("driver_type", _driver.DriverType));
}
}

View File

@@ -3,6 +3,7 @@ using Akka.Cluster.Tools.PublishSubscribe;
using Akka.Event;
using Microsoft.EntityFrameworkCore;
using ZB.MOM.WW.OtOpcUa.Commons.Messages.Redundancy;
using ZB.MOM.WW.OtOpcUa.Commons.Observability;
using ZB.MOM.WW.OtOpcUa.Commons.OpcUa;
using ZB.MOM.WW.OtOpcUa.Commons.Types;
using ZB.MOM.WW.OtOpcUa.Configuration;
@@ -124,6 +125,7 @@ public sealed class OpcUaPublishActor : ReceiveActor
{
_sink.WriteValue(msg.NodeId, msg.Value, msg.Quality, msg.TimestampUtc);
Interlocked.Increment(ref _writes);
OtOpcUaTelemetry.OpcUaSinkWrite.Add(1, new KeyValuePair<string, object?>("kind", "value"));
}
catch (Exception ex)
{
@@ -137,6 +139,7 @@ public sealed class OpcUaPublishActor : ReceiveActor
{
_sink.WriteAlarmState(msg.AlarmNodeId, msg.Active, msg.Acknowledged, msg.TimestampUtc);
Interlocked.Increment(ref _writes);
OtOpcUaTelemetry.OpcUaSinkWrite.Add(1, new KeyValuePair<string, object?>("kind", "alarm"));
}
catch (Exception ex)
{
@@ -146,12 +149,19 @@ public sealed class OpcUaPublishActor : ReceiveActor
private void HandleRebuild(RebuildAddressSpace msg)
{
using var span = OtOpcUaTelemetry.StartAddressSpaceRebuildSpan();
span?.SetTag("otopcua.correlation_id", msg.Correlation.ToString());
// Two modes: when dbFactory + applier are wired, do a real diff-and-apply pass against
// the latest deployment artifact. Without them, fall back to a raw sink rebuild — the
// F10b/dev path before the integration completes.
if (_dbFactory is null || _applier is null)
{
try { _sink.RebuildAddressSpace(); }
try
{
_sink.RebuildAddressSpace();
OtOpcUaTelemetry.OpcUaSinkWrite.Add(1, new KeyValuePair<string, object?>("kind", "rebuild"));
}
catch (Exception ex)
{
_log.Error(ex, "OpcUaPublish: sink.RebuildAddressSpace threw (correlation={Correlation})",
@@ -175,6 +185,7 @@ public sealed class OpcUaPublishActor : ReceiveActor
var outcome = _applier.Apply(plan);
_lastApplied = composition;
OtOpcUaTelemetry.OpcUaSinkWrite.Add(1, new KeyValuePair<string, object?>("kind", "rebuild"));
_log.Info("OpcUaPublish: applied rebuild (correlation={Correlation}, added={Added}, removed={Removed}, changed={Changed}, rebuild={Rebuild})",
msg.Correlation, outcome.AddedNodes, outcome.RemovedNodes, outcome.ChangedNodes, outcome.RebuildCalled);
}
@@ -211,6 +222,8 @@ public sealed class OpcUaPublishActor : ReceiveActor
try
{
_serviceLevel.Publish(msg.ServiceLevel);
OtOpcUaTelemetry.ServiceLevelChange.Add(1,
new KeyValuePair<string, object?>("level", msg.ServiceLevel));
_log.Debug("OpcUaPublish: ServiceLevel={Level}", msg.ServiceLevel);
}
catch (Exception ex)

View File

@@ -4,6 +4,7 @@ using Akka.Event;
using ZB.MOM.WW.OtOpcUa.Commons.Engines;
using ZB.MOM.WW.OtOpcUa.Commons.Messages.Alerts;
using ZB.MOM.WW.OtOpcUa.Commons.Messages.Logging;
using ZB.MOM.WW.OtOpcUa.Commons.Observability;
using ZB.MOM.WW.OtOpcUa.Runtime.VirtualTags;
namespace ZB.MOM.WW.OtOpcUa.Runtime.ScriptedAlarms;
@@ -173,6 +174,9 @@ public sealed class ScriptedAlarmActor : ReceiveActor
_ => next.ToString(),
};
OtOpcUaTelemetry.ScriptedAlarmTransition.Add(1,
new KeyValuePair<string, object?>("state", kind.ToLowerInvariant()));
var evt = new AlarmTransitionEvent(
AlarmId: _config.AlarmId,
EquipmentPath: _config.EquipmentPath,

View File

@@ -3,6 +3,7 @@ using Akka.Cluster.Tools.PublishSubscribe;
using Akka.Event;
using ZB.MOM.WW.OtOpcUa.Commons.Engines;
using ZB.MOM.WW.OtOpcUa.Commons.Messages.Logging;
using ZB.MOM.WW.OtOpcUa.Commons.Observability;
using ZB.MOM.WW.OtOpcUa.Commons.Types;
namespace ZB.MOM.WW.OtOpcUa.Runtime.VirtualTags;
@@ -95,24 +96,35 @@ public sealed class VirtualTagActor : ReceiveActor
catch (Exception ex)
{
_log.Warning(ex, "VirtualTag {Id}: evaluator threw", _virtualTagId);
OtOpcUaTelemetry.VirtualTagEval.Add(1, new KeyValuePair<string, object?>("outcome", "fail"));
PublishLog("Error", $"evaluator threw: {ex.Message}");
return;
}
if (!result.Success)
{
OtOpcUaTelemetry.VirtualTagEval.Add(1, new KeyValuePair<string, object?>("outcome", "fail"));
PublishLog("Warning", result.Reason ?? "evaluator failure");
return;
}
// Skip no-change results. Real evaluator returns Ok(value); Null returns NoChange — both
// safe because Null never produces a fresh value.
if (ReferenceEquals(result, VirtualTagEvalResult.NoChange)) return;
if (ReferenceEquals(result, VirtualTagEvalResult.NoChange))
{
OtOpcUaTelemetry.VirtualTagEval.Add(1, new KeyValuePair<string, object?>("outcome", "skip"));
return;
}
if (_hasLastValue && Equals(_lastValue, result.Value)) return;
if (_hasLastValue && Equals(_lastValue, result.Value))
{
OtOpcUaTelemetry.VirtualTagEval.Add(1, new KeyValuePair<string, object?>("outcome", "skip"));
return;
}
_hasLastValue = true;
_lastValue = result.Value;
OtOpcUaTelemetry.VirtualTagEval.Add(1, new KeyValuePair<string, object?>("outcome", "ok"));
var evalResult = new EvaluationResult(_virtualTagId, result.Value, msg.TimestampUtc, CorrelationId.NewId());
Context.Parent.Tell(evalResult);
}