PR 6.2 — Bounded EventPump channel + drop-newest metrics

Decouples the gw stream-read loop from the listener-fanout loop with a
bounded Channel<MxEvent> (default capacity 50_000) sitting between them.
When a slow listener fills the channel, the producer's TryWrite returns
false and we count the drop rather than back-pressuring the gw stream.

Three counters on the ZB.MOM.WW.OtOpcUa.Driver.Galaxy meter expose the
pressure curve before it manifests as user-visible loss:

- galaxy.events.received  — MxEvents read from StreamEvents
- galaxy.events.dispatched — MxEvents that made it through to OnDataChange
- galaxy.events.dropped   — MxEvents discarded because the channel was full

Each measurement carries a galaxy.client tag so multi-driver hosts can
split by source. The driver wires _options.MxAccess.ClientName into the
new EventPump constructor parameter.

Tests: drop-newest under pressure, capacity validation, and per-pump
measurement filtering (xUnit can run other pump tests in parallel and
their measurements land on the same listener — the test filters to its
own client name).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Joseph Doherty
2026-04-29 16:50:39 -04:00
parent 619207e7f5
commit 7b21c3b428
3 changed files with 272 additions and 3 deletions

View File

@@ -511,7 +511,9 @@ public sealed class GalaxyDriver
lock (_pumpLock)
{
if (_eventPump is not null) return _eventPump;
_eventPump = new EventPump(_subscriber!, _subscriptions, _logger);
_eventPump = new EventPump(
_subscriber!, _subscriptions, _logger,
clientName: _options.MxAccess.ClientName);
_eventPump.OnDataChange += OnPumpDataChange;
_eventPump.Start();
return _eventPump;

View File

@@ -1,3 +1,5 @@
using System.Diagnostics.Metrics;
using System.Threading.Channels;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Logging.Abstractions;
using MxGateway.Contracts.Proto;
@@ -13,19 +15,47 @@ namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Runtime;
/// <see cref="SubscriptionRegistry.ResolveSubscribers"/>).
/// </summary>
/// <remarks>
/// <para>
/// One pump per connected <see cref="GalaxyMxSession"/>. Reconnect lives in PR 4.5's
/// supervisor; on transport failure here we log + propagate so the supervisor can
/// decide whether to restart.
/// </para>
/// <para>
/// PR 6.2 — the network-read loop and the listener-fanout loop are decoupled by a
/// bounded <see cref="Channel{T}"/>. When a listener is slow enough to fill the
/// channel, new events are dropped (newest-dropped semantics: producer's
/// <c>TryWrite</c> fails) rather than back-pressuring the gw stream. Three counters
/// on the <c>ZB.MOM.WW.OtOpcUa.Driver.Galaxy</c> meter expose received / dispatched
/// / dropped totals so ops sees pressure before it manifests as user-visible loss.
/// </para>
/// </remarks>
internal sealed class EventPump : IAsyncDisposable
{
public const string MeterName = "ZB.MOM.WW.OtOpcUa.Driver.Galaxy";
private const int DefaultChannelCapacity = 50_000;
// Single static meter so a host-level MeterListener catches all pump instances.
private static readonly Meter Meter = new(MeterName);
private static readonly Counter<long> EventsReceived =
Meter.CreateCounter<long>("galaxy.events.received", unit: "{event}",
description: "MxEvents read from the gateway StreamEvents stream.");
private static readonly Counter<long> EventsDispatched =
Meter.CreateCounter<long>("galaxy.events.dispatched", unit: "{event}",
description: "MxEvents passed through the bounded channel and into OnDataChange.");
private static readonly Counter<long> EventsDropped =
Meter.CreateCounter<long>("galaxy.events.dropped", unit: "{event}",
description: "MxEvents dropped because the bounded channel was full (newest-dropped).");
private readonly IGalaxySubscriber _subscriber;
private readonly SubscriptionRegistry _registry;
private readonly ILogger _logger;
private readonly Func<long, ISubscriptionHandle> _handleFactory;
private readonly Channel<MxEvent> _channel;
private readonly KeyValuePair<string, object?> _clientTag;
private readonly CancellationTokenSource _cts = new();
private Task? _loop;
private Task? _dispatchLoop;
private bool _disposed;
public event EventHandler<DataChangeEventArgs>? OnDataChange;
@@ -34,12 +64,30 @@ internal sealed class EventPump : IAsyncDisposable
IGalaxySubscriber subscriber,
SubscriptionRegistry registry,
ILogger? logger = null,
Func<long, ISubscriptionHandle>? handleFactory = null)
Func<long, ISubscriptionHandle>? handleFactory = null,
int channelCapacity = DefaultChannelCapacity,
string? clientName = null)
{
_subscriber = subscriber ?? throw new ArgumentNullException(nameof(subscriber));
_registry = registry ?? throw new ArgumentNullException(nameof(registry));
_logger = logger ?? NullLogger.Instance;
_handleFactory = handleFactory ?? (id => new GalaxySubscriptionHandle(id));
if (channelCapacity < 1)
{
throw new ArgumentOutOfRangeException(nameof(channelCapacity),
"channelCapacity must be >= 1; recommended 50_000 for 50k-tag deployments.");
}
_channel = Channel.CreateBounded<MxEvent>(new BoundedChannelOptions(channelCapacity)
{
// Newest-dropped policy: when full, the producer's TryWrite returns false
// and we account for the drop. We do this manually rather than relying on
// BoundedChannelFullMode.DropWrite so we can count drops without polling.
FullMode = BoundedChannelFullMode.Wait,
SingleReader = true,
SingleWriter = true,
});
_clientTag = new KeyValuePair<string, object?>("galaxy.client", clientName ?? "<unknown>");
}
/// <summary>
@@ -51,6 +99,7 @@ internal sealed class EventPump : IAsyncDisposable
ObjectDisposedException.ThrowIf(_disposed, this);
if (_loop is not null) return;
_loop = Task.Run(() => RunAsync(_cts.Token));
_dispatchLoop = Task.Run(() => DispatchLoopAsync(_cts.Token));
}
private async Task RunAsync(CancellationToken ct)
@@ -60,7 +109,15 @@ internal sealed class EventPump : IAsyncDisposable
await foreach (var ev in _subscriber.StreamEventsAsync(ct).WithCancellation(ct).ConfigureAwait(false))
{
if (ct.IsCancellationRequested) break;
Dispatch(ev);
EventsReceived.Add(1, _clientTag);
// Newest-dropped: TryWrite fast-paths the common case (channel has room).
// When full we count the drop and continue reading the gw stream so
// back-pressure doesn't propagate upstream.
if (!_channel.Writer.TryWrite(ev))
{
EventsDropped.Add(1, _clientTag);
}
}
}
catch (OperationCanceledException) when (ct.IsCancellationRequested)
@@ -72,6 +129,32 @@ internal sealed class EventPump : IAsyncDisposable
_logger.LogWarning(ex,
"Galaxy EventPump loop ended with an exception — reconnect supervisor (PR 4.5) handles restart.");
}
finally
{
// Tell the dispatch loop the producer is done so it drains and exits.
_channel.Writer.TryComplete();
}
}
private async Task DispatchLoopAsync(CancellationToken ct)
{
try
{
await foreach (var ev in _channel.Reader.ReadAllAsync(ct).ConfigureAwait(false))
{
Dispatch(ev);
EventsDispatched.Add(1, _clientTag);
}
}
catch (OperationCanceledException) when (ct.IsCancellationRequested)
{
// Clean shutdown.
}
catch (Exception ex)
{
_logger.LogWarning(ex,
"Galaxy EventPump dispatch loop ended with an exception — events past this point will be lost until restart.");
}
}
private void Dispatch(MxEvent ev)
@@ -121,10 +204,15 @@ internal sealed class EventPump : IAsyncDisposable
if (_disposed) return;
_disposed = true;
_cts.Cancel();
_channel.Writer.TryComplete();
if (_loop is not null)
{
try { await _loop.ConfigureAwait(false); } catch { /* shutdown */ }
}
if (_dispatchLoop is not null)
{
try { await _dispatchLoop.ConfigureAwait(false); } catch { /* shutdown */ }
}
_cts.Dispose();
}
}

View File

@@ -0,0 +1,179 @@
using System.Diagnostics.Metrics;
using System.Threading.Channels;
using Google.Protobuf.WellKnownTypes;
using MxGateway.Contracts.Proto;
using Shouldly;
using Xunit;
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
using ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Runtime;
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Tests.Runtime;
/// <summary>
/// PR 6.2 — pins the EventPump's bounded-channel + drop-newest behavior. We
/// hold the dispatch loop with a slow handler so the channel fills, then verify
/// the producer keeps reading from the gw stream and increments the
/// <c>galaxy.events.dropped</c> counter rather than blocking.
/// </summary>
public sealed class EventPumpBoundedChannelTests
{
[Fact]
public async Task Drops_newest_when_channel_fills_and_records_metric()
{
var counters = StartMeterCapture();
try
{
var subscriber = new ManualSubscriber();
var registry = new SubscriptionRegistry();
registry.Register(1, [new TagBinding("Tag.A", ItemHandle: 7)]);
// Tiny channel + slow handler ⇒ producer hits FullMode.Wait → TryWrite false
// for every overflow event.
var dispatchGate = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously);
await using var pump = new EventPump(
subscriber, registry, channelCapacity: 2, clientName: "PumpTest");
pump.OnDataChange += async (_, _) =>
{
// Block the dispatch loop until we've shoved enough events through to
// overflow the bounded channel. Consume the gate exactly once.
await dispatchGate.Task.ConfigureAwait(false);
};
pump.Start();
const int totalEvents = 10;
for (var i = 0; i < totalEvents; i++)
{
await subscriber.EmitAsync(itemHandle: 7, value: i);
}
// Give the producer a beat to run TryWrite for every event.
await Task.Delay(150);
// Capacity 2 + 1 in-flight in the dispatcher = 3 may have been accepted; the
// remainder should have hit the dropped counter. Don't pin exact counts —
// the scheduler can interleave; pin the invariants instead.
counters.Received.ShouldBeGreaterThanOrEqualTo(totalEvents);
counters.Dropped.ShouldBeGreaterThan(0,
"with capacity=2 and a held dispatcher we must drop at least one of 10 events");
(counters.Received).ShouldBe(counters.Dispatched + counters.Dropped + counters.InFlight,
"received = dispatched + dropped + (events still queued)");
// Release the dispatcher so DisposeAsync can drain cleanly.
dispatchGate.TrySetResult();
}
finally
{
counters.Dispose();
}
}
[Fact]
public async Task Throws_when_channelCapacity_is_invalid()
{
var subscriber = new ManualSubscriber();
var registry = new SubscriptionRegistry();
Should.Throw<ArgumentOutOfRangeException>(() =>
new EventPump(subscriber, registry, channelCapacity: 0));
Should.Throw<ArgumentOutOfRangeException>(() =>
new EventPump(subscriber, registry, channelCapacity: -1));
await Task.CompletedTask;
}
[Fact]
public async Task Tags_metrics_with_client_name_for_multi_driver_hosts()
{
var captured = new List<(string Instrument, KeyValuePair<string, object?>[] Tags)>();
using var listener = new MeterListener();
listener.InstrumentPublished = (instr, l) =>
{
if (instr.Meter.Name == EventPump.MeterName) l.EnableMeasurementEvents(instr);
};
listener.SetMeasurementEventCallback<long>((instr, _, tags, _) =>
{
captured.Add((instr.Name, tags.ToArray()));
});
listener.Start();
var subscriber = new ManualSubscriber();
var registry = new SubscriptionRegistry();
registry.Register(1, [new TagBinding("Tag.A", ItemHandle: 7)]);
await using (var pump = new EventPump(subscriber, registry, channelCapacity: 4, clientName: "Driver-X"))
{
pump.Start();
await subscriber.EmitAsync(7, 42.0);
await Task.Delay(100);
listener.RecordObservableInstruments();
}
// The static Meter is shared across all EventPump instances in the test
// assembly; xUnit may run other pump tests in parallel and their
// measurements land on the same listener. Filter to our pump's tag value.
var ours = captured
.Where(c => c.Tags.Any(t => t.Key == "galaxy.client"
&& string.Equals((string?)t.Value, "Driver-X", StringComparison.Ordinal)))
.ToList();
ours.ShouldNotBeEmpty(
"at least one measurement from this test's pump must carry galaxy.client=Driver-X");
ours.ShouldContain(c => c.Instrument == "galaxy.events.received");
}
private static CounterCapture StartMeterCapture()
{
var capture = new CounterCapture();
var listener = new MeterListener();
listener.InstrumentPublished = (instr, l) =>
{
if (instr.Meter.Name == EventPump.MeterName) l.EnableMeasurementEvents(instr);
};
listener.SetMeasurementEventCallback<long>((instr, value, _, _) =>
{
switch (instr.Name)
{
case "galaxy.events.received": Interlocked.Add(ref capture._received, value); break;
case "galaxy.events.dispatched": Interlocked.Add(ref capture._dispatched, value); break;
case "galaxy.events.dropped": Interlocked.Add(ref capture._dropped, value); break;
}
});
listener.Start();
capture.Listener = listener;
return capture;
}
private sealed class CounterCapture : IDisposable
{
public MeterListener? Listener;
internal long _received, _dispatched, _dropped;
public long Received => Interlocked.Read(ref _received);
public long Dispatched => Interlocked.Read(ref _dispatched);
public long Dropped => Interlocked.Read(ref _dropped);
public long InFlight => Math.Max(0, Received - Dispatched - Dropped);
public void Dispose() => Listener?.Dispose();
}
private sealed class ManualSubscriber : IGalaxySubscriber
{
private readonly Channel<MxEvent> _stream =
Channel.CreateUnbounded<MxEvent>(new UnboundedChannelOptions { SingleReader = true });
public Task<IReadOnlyList<SubscribeResult>> SubscribeBulkAsync(
IReadOnlyList<string> fullReferences, int bufferedUpdateIntervalMs, CancellationToken cancellationToken)
=> Task.FromResult<IReadOnlyList<SubscribeResult>>([]);
public Task UnsubscribeBulkAsync(IReadOnlyList<int> itemHandles, CancellationToken cancellationToken)
=> Task.CompletedTask;
public IAsyncEnumerable<MxEvent> StreamEventsAsync(CancellationToken cancellationToken)
=> _stream.Reader.ReadAllAsync(cancellationToken);
public ValueTask EmitAsync(int itemHandle, double value) =>
_stream.Writer.WriteAsync(new MxEvent
{
Family = MxEventFamily.OnDataChange,
ItemHandle = itemHandle,
Value = new MxValue { DoubleValue = value },
Quality = 192,
SourceTimestamp = Timestamp.FromDateTime(DateTime.UtcNow),
});
}
}