PR 6.1 — OpenTelemetry traces around gw calls
In-box ActivitySource ("ZB.MOM.WW.OtOpcUa.Driver.Galaxy") wrapped around
the three gw-facing seams via decorators:
- TracedGalaxySubscriber — galaxy.subscribe_bulk / galaxy.unsubscribe_bulk
/ galaxy.stream_events spans. Stream span covers the entire stream
lifetime with a galaxy.event_count tag (per-event spans would dominate
the trace volume at 50k tags / 1Hz; PR 6.2 owns per-event metrics).
- TracedGalaxyDataWriter — galaxy.write spans tagged with
galaxy.tag_count, galaxy.secured_write_count (split between FreeAccess
/Operate vs Tune/Configure/VerifiedWrite, computed only when a listener
is recording so the hot path stays free), galaxy.success_count.
- TracedGalaxyHierarchySource — galaxy.get_hierarchy spans tagged with
galaxy.object_count.
GalaxyDriver.BuildProductionRuntimeAsync wraps the production seams in
the decorators. The driver itself doesn't take an OpenTelemetry package
dependency — System.Diagnostics.ActivitySource is in-box; the host
process picks the listener.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,30 @@
|
||||
using MxGateway.Contracts.Proto.Galaxy;
|
||||
using ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Runtime;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Browse;
|
||||
|
||||
/// <summary>
|
||||
/// PR 6.1 — Decorator that emits one <see cref="System.Diagnostics.Activity"/> span
|
||||
/// per <c>GetHierarchy</c> RPC. <c>galaxy.object_count</c> on the span lets ops
|
||||
/// correlate slow Discover passes with Galaxy size without instrumenting the
|
||||
/// discoverer's translation step.
|
||||
/// </summary>
|
||||
internal sealed class TracedGalaxyHierarchySource(IGalaxyHierarchySource inner, string clientName) : IGalaxyHierarchySource
|
||||
{
|
||||
public async Task<IReadOnlyList<GalaxyObject>> GetHierarchyAsync(CancellationToken cancellationToken)
|
||||
{
|
||||
using var activity = GalaxyTelemetry.ActivitySource.StartActivity("galaxy.get_hierarchy");
|
||||
activity?.SetTag("galaxy.client", clientName);
|
||||
try
|
||||
{
|
||||
var hierarchy = await inner.GetHierarchyAsync(cancellationToken).ConfigureAwait(false);
|
||||
activity?.SetTag("galaxy.object_count", hierarchy.Count);
|
||||
return hierarchy;
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
activity.RecordError(ex);
|
||||
throw;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -185,8 +185,16 @@ public sealed class GalaxyDriver
|
||||
_ownedMxSession = new GalaxyMxSession(_options.MxAccess, _logger);
|
||||
await _ownedMxSession.ConnectAsync(clientOptions, cancellationToken).ConfigureAwait(false);
|
||||
|
||||
_subscriber = new GatewayGalaxySubscriber(_ownedMxSession);
|
||||
_dataWriter = new GatewayGalaxyDataWriter(_ownedMxSession, _options.MxAccess.WriteUserId, _logger);
|
||||
// PR 6.1 — wrap the gw-facing seams in tracing decorators so every Subscribe /
|
||||
// Unsubscribe / Write / StreamEvents call emits a span on the
|
||||
// "ZB.MOM.WW.OtOpcUa.Driver.Galaxy" ActivitySource. The host process's tracing
|
||||
// listener (OTLP exporter, dotnet-trace, etc.) consumes these without the driver
|
||||
// taking a dependency on the OpenTelemetry packages.
|
||||
_subscriber = new TracedGalaxySubscriber(
|
||||
new GatewayGalaxySubscriber(_ownedMxSession), _options.MxAccess.ClientName);
|
||||
_dataWriter = new TracedGalaxyDataWriter(
|
||||
new GatewayGalaxyDataWriter(_ownedMxSession, _options.MxAccess.WriteUserId, _logger),
|
||||
_options.MxAccess.ClientName);
|
||||
|
||||
_supervisor = new ReconnectSupervisor(
|
||||
reopen: ReopenAsync,
|
||||
@@ -559,7 +567,8 @@ public sealed class GalaxyDriver
|
||||
: null,
|
||||
};
|
||||
_ownedRepositoryClient = GalaxyRepositoryClient.Create(clientOptions);
|
||||
return new GatewayGalaxyHierarchySource(_ownedRepositoryClient);
|
||||
return new TracedGalaxyHierarchySource(
|
||||
new GatewayGalaxyHierarchySource(_ownedRepositoryClient), _options.MxAccess.ClientName);
|
||||
}
|
||||
|
||||
public void Dispose()
|
||||
|
||||
@@ -0,0 +1,35 @@
|
||||
using System.Diagnostics;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Runtime;
|
||||
|
||||
/// <summary>
|
||||
/// PR 6.1 — In-box <see cref="ActivitySource"/> wired around every gw call the
|
||||
/// driver makes (Subscribe/Unsubscribe, Write/WriteSecured, GetHierarchy). The
|
||||
/// decorators in this folder produce one span per call, tagged with the inputs
|
||||
/// ops needs to triage a slow or failing operation:
|
||||
/// <c>galaxy.tag_count</c>, <c>galaxy.success_count</c>, <c>galaxy.client</c>.
|
||||
/// <para>
|
||||
/// The driver itself doesn't take a dependency on the OpenTelemetry packages —
|
||||
/// <c>System.Diagnostics.ActivitySource</c> is in the BCL. The host process
|
||||
/// decides which listener (OTLP exporter, Application Insights, dotnet-trace)
|
||||
/// subscribes to <see cref="ActivitySourceName"/>.
|
||||
/// </para>
|
||||
/// </summary>
|
||||
internal static class GalaxyTelemetry
|
||||
{
|
||||
public const string ActivitySourceName = "ZB.MOM.WW.OtOpcUa.Driver.Galaxy";
|
||||
|
||||
public static readonly ActivitySource ActivitySource = new(ActivitySourceName);
|
||||
|
||||
/// <summary>
|
||||
/// Tag a span with a failure reason and set its status to <c>Error</c>. Helper
|
||||
/// so the decorators don't repeat the four-line idiom on every catch block.
|
||||
/// </summary>
|
||||
public static void RecordError(this Activity? activity, Exception ex)
|
||||
{
|
||||
if (activity is null) return;
|
||||
activity.SetStatus(ActivityStatusCode.Error, ex.Message);
|
||||
activity.SetTag("exception.type", ex.GetType().FullName);
|
||||
activity.SetTag("exception.message", ex.Message);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,54 @@
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Runtime;
|
||||
|
||||
/// <summary>
|
||||
/// PR 6.1 — Decorator that emits one <see cref="System.Diagnostics.Activity"/> span
|
||||
/// per gw write batch. Tags secured-write counts so ops can see the routing-by-
|
||||
/// classification split (FreeAccess/Operate vs Tune/Configure) without re-reading
|
||||
/// the discovery dictionary.
|
||||
/// </summary>
|
||||
internal sealed class TracedGalaxyDataWriter(IGalaxyDataWriter inner, string clientName) : IGalaxyDataWriter
|
||||
{
|
||||
public async Task<IReadOnlyList<WriteResult>> WriteAsync(
|
||||
IReadOnlyList<WriteRequest> writes,
|
||||
Func<string, SecurityClassification> securityResolver,
|
||||
CancellationToken cancellationToken)
|
||||
{
|
||||
using var activity = GalaxyTelemetry.ActivitySource.StartActivity("galaxy.write");
|
||||
activity?.SetTag("galaxy.client", clientName);
|
||||
activity?.SetTag("galaxy.tag_count", writes.Count);
|
||||
|
||||
if (activity is { IsAllDataRequested: true })
|
||||
{
|
||||
// Counting the secured-write split is cheap (one resolver call per request)
|
||||
// and only happens when a tracing listener is actively recording — keeps the
|
||||
// hot path free when no one's listening.
|
||||
var securedCount = 0;
|
||||
foreach (var w in writes)
|
||||
{
|
||||
var sc = securityResolver(w.FullReference);
|
||||
if (sc is SecurityClassification.Tune
|
||||
or SecurityClassification.Configure
|
||||
or SecurityClassification.VerifiedWrite)
|
||||
{
|
||||
securedCount++;
|
||||
}
|
||||
}
|
||||
activity.SetTag("galaxy.secured_write_count", securedCount);
|
||||
}
|
||||
|
||||
try
|
||||
{
|
||||
var results = await inner.WriteAsync(writes, securityResolver, cancellationToken)
|
||||
.ConfigureAwait(false);
|
||||
activity?.SetTag("galaxy.success_count", results.Count(r => r.StatusCode < 0x80000000u));
|
||||
return results;
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
activity.RecordError(ex);
|
||||
throw;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,91 @@
|
||||
using System.Runtime.CompilerServices;
|
||||
using MxGateway.Contracts.Proto;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Runtime;
|
||||
|
||||
/// <summary>
|
||||
/// PR 6.1 — Decorator that emits one <see cref="System.Diagnostics.Activity"/> span
|
||||
/// per gw subscription RPC. Wraps the production <see cref="GatewayGalaxySubscriber"/>;
|
||||
/// tests substitute a fake at the same seam without taking the tracing overhead.
|
||||
/// </summary>
|
||||
internal sealed class TracedGalaxySubscriber(IGalaxySubscriber inner, string clientName) : IGalaxySubscriber
|
||||
{
|
||||
public async Task<IReadOnlyList<SubscribeResult>> SubscribeBulkAsync(
|
||||
IReadOnlyList<string> fullReferences, int bufferedUpdateIntervalMs, CancellationToken cancellationToken)
|
||||
{
|
||||
using var activity = GalaxyTelemetry.ActivitySource.StartActivity("galaxy.subscribe_bulk");
|
||||
activity?.SetTag("galaxy.client", clientName);
|
||||
activity?.SetTag("galaxy.tag_count", fullReferences.Count);
|
||||
activity?.SetTag("galaxy.buffered_interval_ms", bufferedUpdateIntervalMs);
|
||||
try
|
||||
{
|
||||
var results = await inner.SubscribeBulkAsync(fullReferences, bufferedUpdateIntervalMs, cancellationToken)
|
||||
.ConfigureAwait(false);
|
||||
activity?.SetTag("galaxy.success_count", results.Count(r => r.WasSuccessful));
|
||||
return results;
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
activity.RecordError(ex);
|
||||
throw;
|
||||
}
|
||||
}
|
||||
|
||||
public async Task UnsubscribeBulkAsync(IReadOnlyList<int> itemHandles, CancellationToken cancellationToken)
|
||||
{
|
||||
using var activity = GalaxyTelemetry.ActivitySource.StartActivity("galaxy.unsubscribe_bulk");
|
||||
activity?.SetTag("galaxy.client", clientName);
|
||||
activity?.SetTag("galaxy.tag_count", itemHandles.Count);
|
||||
try
|
||||
{
|
||||
await inner.UnsubscribeBulkAsync(itemHandles, cancellationToken).ConfigureAwait(false);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
activity.RecordError(ex);
|
||||
throw;
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Streaming RPC — one parent span covers the entire stream lifetime. Per-event
|
||||
/// spans would dominate the trace volume at 50k tags / 1Hz; ops gets per-event
|
||||
/// visibility through <see cref="EventPump"/>'s metrics in PR 6.2 instead.
|
||||
/// </summary>
|
||||
public async IAsyncEnumerable<MxEvent> StreamEventsAsync(
|
||||
[EnumeratorCancellation] CancellationToken cancellationToken)
|
||||
{
|
||||
using var activity = GalaxyTelemetry.ActivitySource.StartActivity("galaxy.stream_events");
|
||||
activity?.SetTag("galaxy.client", clientName);
|
||||
|
||||
IAsyncEnumerator<MxEvent>? enumerator = null;
|
||||
try
|
||||
{
|
||||
enumerator = inner.StreamEventsAsync(cancellationToken).GetAsyncEnumerator(cancellationToken);
|
||||
var eventCount = 0L;
|
||||
while (true)
|
||||
{
|
||||
bool moveNext;
|
||||
try
|
||||
{
|
||||
moveNext = await enumerator.MoveNextAsync().ConfigureAwait(false);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
activity.RecordError(ex);
|
||||
activity?.SetTag("galaxy.event_count", eventCount);
|
||||
throw;
|
||||
}
|
||||
|
||||
if (!moveNext) break;
|
||||
eventCount++;
|
||||
yield return enumerator.Current;
|
||||
}
|
||||
activity?.SetTag("galaxy.event_count", eventCount);
|
||||
}
|
||||
finally
|
||||
{
|
||||
if (enumerator is not null) await enumerator.DisposeAsync().ConfigureAwait(false);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,179 @@
|
||||
using System.Diagnostics;
|
||||
using MxGateway.Contracts.Proto;
|
||||
using Shouldly;
|
||||
using Xunit;
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
using ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Browse;
|
||||
using ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Runtime;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Tests.Runtime;
|
||||
|
||||
/// <summary>
|
||||
/// PR 6.1 — pins that every gw-facing call produces a span on the
|
||||
/// <c>ZB.MOM.WW.OtOpcUa.Driver.Galaxy</c> ActivitySource. We listen via
|
||||
/// <see cref="ActivityListener"/> rather than asserting on internal state, so the
|
||||
/// tests double as documentation of the listener-side contract.
|
||||
/// </summary>
|
||||
public sealed class GalaxyTelemetryTests
|
||||
{
|
||||
/// <summary>Subscribes an ActivityListener for the test, captures each spawned activity.</summary>
|
||||
private static (ActivityListener Listener, List<Activity> Captured) StartCapture()
|
||||
{
|
||||
var captured = new List<Activity>();
|
||||
var listener = new ActivityListener
|
||||
{
|
||||
ShouldListenTo = src => src.Name == GalaxyTelemetry.ActivitySourceName,
|
||||
Sample = (ref ActivityCreationOptions<ActivityContext> _) => ActivitySamplingResult.AllDataAndRecorded,
|
||||
ActivityStopped = activity => captured.Add(activity),
|
||||
};
|
||||
ActivitySource.AddActivityListener(listener);
|
||||
return (listener, captured);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task TracedGalaxySubscriber_emits_subscribe_bulk_span_with_tag_count()
|
||||
{
|
||||
var (listener, captured) = StartCapture();
|
||||
try
|
||||
{
|
||||
var inner = new FakeSubscriber();
|
||||
var sut = new TracedGalaxySubscriber(inner, "OtOpcUa-Test");
|
||||
await sut.SubscribeBulkAsync(["A", "B", "C"], 500, CancellationToken.None);
|
||||
|
||||
var span = captured.ShouldHaveSingleItem();
|
||||
span.OperationName.ShouldBe("galaxy.subscribe_bulk");
|
||||
span.GetTagItem("galaxy.client").ShouldBe("OtOpcUa-Test");
|
||||
span.GetTagItem("galaxy.tag_count").ShouldBe(3);
|
||||
span.GetTagItem("galaxy.buffered_interval_ms").ShouldBe(500);
|
||||
span.GetTagItem("galaxy.success_count").ShouldBe(3);
|
||||
}
|
||||
finally { listener.Dispose(); }
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task TracedGalaxySubscriber_records_error_and_rethrows_on_failure()
|
||||
{
|
||||
var (listener, captured) = StartCapture();
|
||||
try
|
||||
{
|
||||
var sut = new TracedGalaxySubscriber(new ThrowingSubscriber(), "OtOpcUa-Test");
|
||||
await Should.ThrowAsync<InvalidOperationException>(() =>
|
||||
sut.SubscribeBulkAsync(["A"], 0, CancellationToken.None));
|
||||
|
||||
var span = captured.ShouldHaveSingleItem();
|
||||
span.Status.ShouldBe(ActivityStatusCode.Error);
|
||||
span.GetTagItem("exception.type").ShouldBe(typeof(InvalidOperationException).FullName);
|
||||
}
|
||||
finally { listener.Dispose(); }
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task TracedGalaxyDataWriter_tags_secured_write_count()
|
||||
{
|
||||
var (listener, captured) = StartCapture();
|
||||
try
|
||||
{
|
||||
var inner = new RecordingWriter();
|
||||
var sut = new TracedGalaxyDataWriter(inner, "OtOpcUa-Test");
|
||||
|
||||
var requests = new[]
|
||||
{
|
||||
new WriteRequest("FreeTag", 1.0),
|
||||
new WriteRequest("OperateTag", 2.0),
|
||||
new WriteRequest("TuneTag", 3.0),
|
||||
new WriteRequest("ConfigTag", 4.0),
|
||||
};
|
||||
SecurityClassification Resolver(string fullRef) => fullRef switch
|
||||
{
|
||||
"FreeTag" => SecurityClassification.FreeAccess,
|
||||
"OperateTag" => SecurityClassification.Operate,
|
||||
"TuneTag" => SecurityClassification.Tune,
|
||||
"ConfigTag" => SecurityClassification.Configure,
|
||||
_ => SecurityClassification.FreeAccess,
|
||||
};
|
||||
|
||||
await sut.WriteAsync(requests, Resolver, CancellationToken.None);
|
||||
|
||||
var span = captured.ShouldHaveSingleItem();
|
||||
span.OperationName.ShouldBe("galaxy.write");
|
||||
span.GetTagItem("galaxy.tag_count").ShouldBe(4);
|
||||
span.GetTagItem("galaxy.secured_write_count").ShouldBe(2); // Tune + Configure
|
||||
}
|
||||
finally { listener.Dispose(); }
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task TracedGalaxyHierarchySource_tags_object_count()
|
||||
{
|
||||
var (listener, captured) = StartCapture();
|
||||
try
|
||||
{
|
||||
var sut = new TracedGalaxyHierarchySource(new FakeHierarchy(), "OtOpcUa-Test");
|
||||
var hierarchy = await sut.GetHierarchyAsync(CancellationToken.None);
|
||||
hierarchy.Count.ShouldBe(2);
|
||||
|
||||
var span = captured.ShouldHaveSingleItem();
|
||||
span.OperationName.ShouldBe("galaxy.get_hierarchy");
|
||||
span.GetTagItem("galaxy.object_count").ShouldBe(2);
|
||||
}
|
||||
finally { listener.Dispose(); }
|
||||
}
|
||||
|
||||
private sealed class FakeSubscriber : IGalaxySubscriber
|
||||
{
|
||||
public Task<IReadOnlyList<SubscribeResult>> SubscribeBulkAsync(
|
||||
IReadOnlyList<string> fullReferences, int bufferedUpdateIntervalMs, CancellationToken cancellationToken)
|
||||
=> Task.FromResult<IReadOnlyList<SubscribeResult>>(
|
||||
fullReferences.Select((r, i) => new SubscribeResult
|
||||
{
|
||||
TagAddress = r,
|
||||
ItemHandle = i + 1,
|
||||
WasSuccessful = true,
|
||||
}).ToList());
|
||||
|
||||
public Task UnsubscribeBulkAsync(IReadOnlyList<int> itemHandles, CancellationToken cancellationToken)
|
||||
=> Task.CompletedTask;
|
||||
|
||||
public async IAsyncEnumerable<MxEvent> StreamEventsAsync(
|
||||
[System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken)
|
||||
{
|
||||
await Task.CompletedTask;
|
||||
yield break;
|
||||
}
|
||||
}
|
||||
|
||||
private sealed class ThrowingSubscriber : IGalaxySubscriber
|
||||
{
|
||||
public Task<IReadOnlyList<SubscribeResult>> SubscribeBulkAsync(
|
||||
IReadOnlyList<string> fullReferences, int bufferedUpdateIntervalMs, CancellationToken cancellationToken)
|
||||
=> throw new InvalidOperationException("gw down");
|
||||
|
||||
public Task UnsubscribeBulkAsync(IReadOnlyList<int> itemHandles, CancellationToken cancellationToken)
|
||||
=> Task.CompletedTask;
|
||||
|
||||
public async IAsyncEnumerable<MxEvent> StreamEventsAsync(
|
||||
[System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken)
|
||||
{
|
||||
await Task.CompletedTask;
|
||||
yield break;
|
||||
}
|
||||
}
|
||||
|
||||
private sealed class RecordingWriter : IGalaxyDataWriter
|
||||
{
|
||||
public Task<IReadOnlyList<WriteResult>> WriteAsync(
|
||||
IReadOnlyList<WriteRequest> writes,
|
||||
Func<string, SecurityClassification> securityResolver,
|
||||
CancellationToken cancellationToken)
|
||||
=> Task.FromResult<IReadOnlyList<WriteResult>>(
|
||||
writes.Select(_ => new WriteResult(0u)).ToList());
|
||||
}
|
||||
|
||||
private sealed class FakeHierarchy : IGalaxyHierarchySource
|
||||
{
|
||||
public Task<IReadOnlyList<MxGateway.Contracts.Proto.Galaxy.GalaxyObject>> GetHierarchyAsync(
|
||||
CancellationToken cancellationToken)
|
||||
=> Task.FromResult<IReadOnlyList<MxGateway.Contracts.Proto.Galaxy.GalaxyObject>>(
|
||||
[new(), new()]);
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user