PR 6.1 — OpenTelemetry traces around gw calls
In-box ActivitySource ("ZB.MOM.WW.OtOpcUa.Driver.Galaxy") wrapped around
the three gw-facing seams via decorators:
- TracedGalaxySubscriber — galaxy.subscribe_bulk / galaxy.unsubscribe_bulk
/ galaxy.stream_events spans. Stream span covers the entire stream
lifetime with a galaxy.event_count tag (per-event spans would dominate
the trace volume at 50k tags / 1Hz; PR 6.2 owns per-event metrics).
- TracedGalaxyDataWriter — galaxy.write spans tagged with
galaxy.tag_count, galaxy.secured_write_count (the number of writes
classified Tune/Configure/VerifiedWrite, as opposed to FreeAccess/Operate;
computed only when a listener is recording so the hot path stays free),
and galaxy.success_count.
- TracedGalaxyHierarchySource — galaxy.get_hierarchy spans tagged with
galaxy.object_count.
GalaxyDriver.BuildProductionRuntimeAsync wraps the production seams in
the decorators. The driver itself doesn't take an OpenTelemetry package
dependency — System.Diagnostics.ActivitySource is in-box; the host
process picks the listener.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,30 @@
|
||||
using MxGateway.Contracts.Proto.Galaxy;
|
||||
using ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Runtime;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Browse;
|
||||
|
||||
/// <summary>
/// PR 6.1 — Tracing decorator over an <see cref="IGalaxyHierarchySource"/>. Every
/// <c>GetHierarchyAsync</c> call is wrapped in a single <c>galaxy.get_hierarchy</c>
/// <see cref="System.Diagnostics.Activity"/> span tagged with <c>galaxy.client</c>
/// and, on success, <c>galaxy.object_count</c> (the number of objects returned), so
/// slow Discover passes can be correlated with Galaxy size.
/// </summary>
internal sealed class TracedGalaxyHierarchySource(IGalaxyHierarchySource inner, string clientName) : IGalaxyHierarchySource
{
    /// <summary>
    /// Delegates to the wrapped source inside a <c>galaxy.get_hierarchy</c> span.
    /// </summary>
    /// <param name="cancellationToken">Forwarded unchanged to the wrapped source.</param>
    /// <returns>The list produced by the wrapped source, unmodified.</returns>
    /// <remarks>
    /// Any exception from the wrapped source is recorded on the span and rethrown
    /// as-is.
    /// </remarks>
    public async Task<IReadOnlyList<GalaxyObject>> GetHierarchyAsync(CancellationToken cancellationToken)
    {
        // StartActivity may hand back null; every span access below tolerates that.
        using var span = GalaxyTelemetry.ActivitySource.StartActivity("galaxy.get_hierarchy");
        span?.SetTag("galaxy.client", clientName);

        IReadOnlyList<GalaxyObject> objects;
        try
        {
            objects = await inner.GetHierarchyAsync(cancellationToken).ConfigureAwait(false);
            span?.SetTag("galaxy.object_count", objects.Count);
        }
        catch (Exception failure)
        {
            span.RecordError(failure);
            throw;
        }

        return objects;
    }
}
|
||||
@@ -185,8 +185,16 @@ public sealed class GalaxyDriver
|
||||
_ownedMxSession = new GalaxyMxSession(_options.MxAccess, _logger);
|
||||
await _ownedMxSession.ConnectAsync(clientOptions, cancellationToken).ConfigureAwait(false);
|
||||
|
||||
_subscriber = new GatewayGalaxySubscriber(_ownedMxSession);
|
||||
_dataWriter = new GatewayGalaxyDataWriter(_ownedMxSession, _options.MxAccess.WriteUserId, _logger);
|
||||
// PR 6.1 — wrap the gw-facing seams in tracing decorators so every Subscribe /
|
||||
// Unsubscribe / Write / StreamEvents call emits a span on the
|
||||
// "ZB.MOM.WW.OtOpcUa.Driver.Galaxy" ActivitySource. The host process's tracing
|
||||
// listener (OTLP exporter, dotnet-trace, etc.) consumes these without the driver
|
||||
// taking a dependency on the OpenTelemetry packages.
|
||||
_subscriber = new TracedGalaxySubscriber(
|
||||
new GatewayGalaxySubscriber(_ownedMxSession), _options.MxAccess.ClientName);
|
||||
_dataWriter = new TracedGalaxyDataWriter(
|
||||
new GatewayGalaxyDataWriter(_ownedMxSession, _options.MxAccess.WriteUserId, _logger),
|
||||
_options.MxAccess.ClientName);
|
||||
|
||||
_supervisor = new ReconnectSupervisor(
|
||||
reopen: ReopenAsync,
|
||||
@@ -559,7 +567,8 @@ public sealed class GalaxyDriver
|
||||
: null,
|
||||
};
|
||||
_ownedRepositoryClient = GalaxyRepositoryClient.Create(clientOptions);
|
||||
return new GatewayGalaxyHierarchySource(_ownedRepositoryClient);
|
||||
return new TracedGalaxyHierarchySource(
|
||||
new GatewayGalaxyHierarchySource(_ownedRepositoryClient), _options.MxAccess.ClientName);
|
||||
}
|
||||
|
||||
public void Dispose()
|
||||
|
||||
@@ -0,0 +1,35 @@
|
||||
using System.Diagnostics;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Runtime;
|
||||
|
||||
/// <summary>
/// PR 6.1 — Holds the in-box <see cref="ActivitySource"/> that the tracing
/// decorators use to emit one span per gw call (Subscribe/Unsubscribe,
/// Write/WriteSecured, GetHierarchy). Spans carry the inputs ops needs to triage
/// a slow or failing operation:
/// <c>galaxy.tag_count</c>, <c>galaxy.success_count</c>, <c>galaxy.client</c>.
/// <para>
/// No OpenTelemetry package reference is required —
/// <c>System.Diagnostics.ActivitySource</c> ships in the BCL. Whichever listener
/// the host process attaches (OTLP exporter, Application Insights, dotnet-trace)
/// subscribes by <see cref="ActivitySourceName"/>.
/// </para>
/// </summary>
internal static class GalaxyTelemetry
{
    /// <summary>Name under which the driver's spans are published.</summary>
    public const string ActivitySourceName = "ZB.MOM.WW.OtOpcUa.Driver.Galaxy";

    /// <summary>Shared source instance used by every decorator.</summary>
    public static readonly ActivitySource ActivitySource = new ActivitySource(ActivitySourceName);

    /// <summary>
    /// Marks a span as failed: sets its status to <c>Error</c> (with the exception
    /// message as description) and adds <c>exception.type</c> /
    /// <c>exception.message</c> tags. Lets catch blocks record a failure in one call.
    /// </summary>
    /// <param name="activity">Span to annotate; a null span makes the call a no-op.</param>
    /// <param name="ex">Exception whose type name and message are recorded.</param>
    public static void RecordError(this Activity? activity, Exception ex)
    {
        // SetStatus and SetTag both return the activity, so the calls chain; the
        // null-conditional short-circuits the whole chain when there is no span.
        activity?
            .SetStatus(ActivityStatusCode.Error, ex.Message)
            .SetTag("exception.type", ex.GetType().FullName)
            .SetTag("exception.message", ex.Message);
    }
}
|
||||
@@ -0,0 +1,54 @@
|
||||
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Runtime;
|
||||
|
||||
/// <summary>
/// PR 6.1 — Decorator that emits one <see cref="System.Diagnostics.Activity"/> span
/// per gw write batch. Tags secured-write counts so ops can see the routing-by-
/// classification split (FreeAccess/Operate vs Tune/Configure) without re-reading
/// the discovery dictionary.
/// </summary>
internal sealed class TracedGalaxyDataWriter(IGalaxyDataWriter inner, string clientName) : IGalaxyDataWriter
{
    /// <summary>
    /// Forwards the batch to the wrapped writer inside a <c>galaxy.write</c> span
    /// tagged with <c>galaxy.client</c>, <c>galaxy.tag_count</c>,
    /// <c>galaxy.secured_write_count</c> (only when the span requests all data) and,
    /// on success, <c>galaxy.success_count</c>.
    /// </summary>
    /// <param name="writes">Batch of write requests; passed through unchanged.</param>
    /// <param name="securityResolver">
    /// Maps a full reference to its security classification. Passed through to the
    /// wrapped writer, and also invoked once per request here when the span is
    /// recording.
    /// </param>
    /// <param name="cancellationToken">Forwarded unchanged to the wrapped writer.</param>
    /// <returns>The results produced by the wrapped writer, unmodified.</returns>
    /// <remarks>
    /// Any exception — from the wrapped writer or from <paramref name="securityResolver"/>
    /// while computing the secured-write tag — is recorded on the span and rethrown.
    /// </remarks>
    public async Task<IReadOnlyList<WriteResult>> WriteAsync(
        IReadOnlyList<WriteRequest> writes,
        Func<string, SecurityClassification> securityResolver,
        CancellationToken cancellationToken)
    {
        using var activity = GalaxyTelemetry.ActivitySource.StartActivity("galaxy.write");
        activity?.SetTag("galaxy.client", clientName);
        activity?.SetTag("galaxy.tag_count", writes.Count);

        try
        {
            // Counting the secured-write split costs one resolver call per request, so
            // it only runs when a tracing listener is actively recording. It sits inside
            // the try so a throwing resolver is recorded on the span instead of leaving
            // a span with no error status.
            if (activity is { IsAllDataRequested: true })
            {
                activity.SetTag("galaxy.secured_write_count", CountSecuredWrites(writes, securityResolver));
            }

            var results = await inner.WriteAsync(writes, securityResolver, cancellationToken)
                .ConfigureAwait(false);

            // Status codes below 0x80000000 are counted as successes.
            // NOTE(review): presumably the OPC UA "Bad" severity bit — confirm that
            // Uncertain codes are meant to count as successful here.
            activity?.SetTag("galaxy.success_count", results.Count(r => r.StatusCode < 0x80000000u));
            return results;
        }
        catch (Exception ex)
        {
            activity.RecordError(ex);
            throw;
        }
    }

    /// <summary>
    /// Counts the requests whose resolved classification is Tune, Configure or
    /// VerifiedWrite.
    /// </summary>
    private static int CountSecuredWrites(
        IReadOnlyList<WriteRequest> writes,
        Func<string, SecurityClassification> securityResolver)
    {
        var securedCount = 0;
        foreach (var write in writes)
        {
            if (securityResolver(write.FullReference) is SecurityClassification.Tune
                or SecurityClassification.Configure
                or SecurityClassification.VerifiedWrite)
            {
                securedCount++;
            }
        }

        return securedCount;
    }
}
|
||||
@@ -0,0 +1,91 @@
|
||||
using System.Runtime.CompilerServices;
|
||||
using MxGateway.Contracts.Proto;
|
||||
|
||||
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Runtime;
|
||||
|
||||
/// <summary>
/// PR 6.1 — Decorator that emits one <see cref="System.Diagnostics.Activity"/> span
/// per gw subscription RPC. Wraps the production <see cref="GatewayGalaxySubscriber"/>;
/// tests substitute a fake at the same seam without taking the tracing overhead.
/// </summary>
internal sealed class TracedGalaxySubscriber(IGalaxySubscriber inner, string clientName) : IGalaxySubscriber
{
    /// <summary>
    /// Forwards the bulk subscribe to the wrapped subscriber inside a
    /// <c>galaxy.subscribe_bulk</c> span tagged with <c>galaxy.client</c>,
    /// <c>galaxy.tag_count</c>, <c>galaxy.buffered_interval_ms</c> and, on success,
    /// <c>galaxy.success_count</c> (results with <c>WasSuccessful</c> set).
    /// </summary>
    /// <param name="fullReferences">References to subscribe; passed through unchanged.</param>
    /// <param name="bufferedUpdateIntervalMs">Passed through unchanged and recorded as a tag.</param>
    /// <param name="cancellationToken">Forwarded unchanged to the wrapped subscriber.</param>
    /// <returns>The results produced by the wrapped subscriber, unmodified.</returns>
    /// <remarks>Exceptions from the wrapped subscriber are recorded on the span and rethrown.</remarks>
    public async Task<IReadOnlyList<SubscribeResult>> SubscribeBulkAsync(
        IReadOnlyList<string> fullReferences, int bufferedUpdateIntervalMs, CancellationToken cancellationToken)
    {
        using var activity = GalaxyTelemetry.ActivitySource.StartActivity("galaxy.subscribe_bulk");
        activity?.SetTag("galaxy.client", clientName);
        activity?.SetTag("galaxy.tag_count", fullReferences.Count);
        activity?.SetTag("galaxy.buffered_interval_ms", bufferedUpdateIntervalMs);
        try
        {
            var results = await inner.SubscribeBulkAsync(fullReferences, bufferedUpdateIntervalMs, cancellationToken)
                .ConfigureAwait(false);
            activity?.SetTag("galaxy.success_count", results.Count(r => r.WasSuccessful));
            return results;
        }
        catch (Exception ex)
        {
            activity.RecordError(ex);
            throw;
        }
    }

    /// <summary>
    /// Forwards the bulk unsubscribe to the wrapped subscriber inside a
    /// <c>galaxy.unsubscribe_bulk</c> span tagged with <c>galaxy.client</c> and
    /// <c>galaxy.tag_count</c> (the number of item handles).
    /// </summary>
    /// <param name="itemHandles">Handles to unsubscribe; passed through unchanged.</param>
    /// <param name="cancellationToken">Forwarded unchanged to the wrapped subscriber.</param>
    /// <remarks>Exceptions from the wrapped subscriber are recorded on the span and rethrown.</remarks>
    public async Task UnsubscribeBulkAsync(IReadOnlyList<int> itemHandles, CancellationToken cancellationToken)
    {
        using var activity = GalaxyTelemetry.ActivitySource.StartActivity("galaxy.unsubscribe_bulk");
        activity?.SetTag("galaxy.client", clientName);
        activity?.SetTag("galaxy.tag_count", itemHandles.Count);
        try
        {
            await inner.UnsubscribeBulkAsync(itemHandles, cancellationToken).ConfigureAwait(false);
        }
        catch (Exception ex)
        {
            activity.RecordError(ex);
            throw;
        }
    }

    /// <summary>
    /// Streaming RPC — one parent span covers the entire stream lifetime. Per-event
    /// spans would dominate the trace volume at 50k tags / 1Hz; ops gets per-event
    /// visibility through <see cref="EventPump"/>'s metrics in PR 6.2 instead.
    /// </summary>
    /// <param name="cancellationToken">
    /// Forwarded to the wrapped stream and to its enumerator.
    /// </param>
    /// <returns>The wrapped stream's events, yielded one-for-one in order.</returns>
    /// <remarks>
    /// <c>galaxy.event_count</c> is written however the stream ends: natural
    /// completion, a failure while advancing the wrapped stream (also recorded as
    /// an error on the span), or the consumer disposing the enumerator early.
    /// </remarks>
    public async IAsyncEnumerable<MxEvent> StreamEventsAsync(
        [EnumeratorCancellation] CancellationToken cancellationToken)
    {
        using var activity = GalaxyTelemetry.ActivitySource.StartActivity("galaxy.stream_events");
        activity?.SetTag("galaxy.client", clientName);

        IAsyncEnumerator<MxEvent>? enumerator = null;

        // Declared outside the try so the finally block can report it on every exit path.
        var eventCount = 0L;
        try
        {
            enumerator = inner.StreamEventsAsync(cancellationToken).GetAsyncEnumerator(cancellationToken);
            while (true)
            {
                // C# forbids `yield return` inside a try that has a catch clause, so only
                // the MoveNextAsync call is guarded and the yield happens outside it.
                bool moveNext;
                try
                {
                    moveNext = await enumerator.MoveNextAsync().ConfigureAwait(false);
                }
                catch (Exception ex)
                {
                    activity.RecordError(ex);
                    throw;
                }

                if (!moveNext) break;
                eventCount++;
                yield return enumerator.Current;
            }
        }
        finally
        {
            // When the consumer stops enumerating early, control jumps from the
            // `yield return` straight to this block — so the tag has to be set here
            // rather than after the loop. Set it before disposing the inner
            // enumerator so a throwing DisposeAsync cannot drop it.
            activity?.SetTag("galaxy.event_count", eventCount);
            if (enumerator is not null) await enumerator.DisposeAsync().ConfigureAwait(false);
        }
    }
}
|
||||
Reference in New Issue
Block a user