PR 6.2 — Bounded EventPump channel + drop-newest metrics
Decouples the gw stream-read loop from the listener-fanout loop with a bounded Channel<MxEvent> (default capacity 50_000) sitting between them. When a slow listener fills the channel, the producer's TryWrite returns false and we count the drop rather than back-pressuring the gw stream. Three counters on the ZB.MOM.WW.OtOpcUa.Driver.Galaxy meter expose the pressure curve before it manifests as user-visible loss: - galaxy.events.received — MxEvents read from StreamEvents - galaxy.events.dispatched — MxEvents that made it through to OnDataChange - galaxy.events.dropped — MxEvents discarded because the channel was full Each measurement carries a galaxy.client tag so multi-driver hosts can split by source. The driver wires _options.MxAccess.ClientName into the new EventPump constructor parameter. Tests: drop-newest under pressure, capacity validation, and per-pump measurement filtering (xUnit can run other pump tests in parallel and their measurements land on the same listener — the test filters to its own client name). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -511,7 +511,9 @@ public sealed class GalaxyDriver
|
|||||||
lock (_pumpLock)
|
lock (_pumpLock)
|
||||||
{
|
{
|
||||||
if (_eventPump is not null) return _eventPump;
|
if (_eventPump is not null) return _eventPump;
|
||||||
_eventPump = new EventPump(_subscriber!, _subscriptions, _logger);
|
_eventPump = new EventPump(
|
||||||
|
_subscriber!, _subscriptions, _logger,
|
||||||
|
clientName: _options.MxAccess.ClientName);
|
||||||
_eventPump.OnDataChange += OnPumpDataChange;
|
_eventPump.OnDataChange += OnPumpDataChange;
|
||||||
_eventPump.Start();
|
_eventPump.Start();
|
||||||
return _eventPump;
|
return _eventPump;
|
||||||
|
|||||||
@@ -1,3 +1,5 @@
|
|||||||
|
using System.Diagnostics.Metrics;
|
||||||
|
using System.Threading.Channels;
|
||||||
using Microsoft.Extensions.Logging;
|
using Microsoft.Extensions.Logging;
|
||||||
using Microsoft.Extensions.Logging.Abstractions;
|
using Microsoft.Extensions.Logging.Abstractions;
|
||||||
using MxGateway.Contracts.Proto;
|
using MxGateway.Contracts.Proto;
|
||||||
@@ -13,19 +15,47 @@ namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Runtime;
|
|||||||
/// <see cref="SubscriptionRegistry.ResolveSubscribers"/>).
|
/// <see cref="SubscriptionRegistry.ResolveSubscribers"/>).
|
||||||
/// </summary>
|
/// </summary>
|
||||||
/// <remarks>
|
/// <remarks>
|
||||||
|
/// <para>
|
||||||
/// One pump per connected <see cref="GalaxyMxSession"/>. Reconnect lives in PR 4.5's
|
/// One pump per connected <see cref="GalaxyMxSession"/>. Reconnect lives in PR 4.5's
|
||||||
/// supervisor; on transport failure here we log + propagate so the supervisor can
|
/// supervisor; on transport failure here we log + propagate so the supervisor can
|
||||||
/// decide whether to restart.
|
/// decide whether to restart.
|
||||||
|
/// </para>
|
||||||
|
/// <para>
|
||||||
|
/// PR 6.2 — the network-read loop and the listener-fanout loop are decoupled by a
|
||||||
|
/// bounded <see cref="Channel{T}"/>. When a listener is slow enough to fill the
|
||||||
|
/// channel, new events are dropped (newest-dropped semantics: producer's
|
||||||
|
/// <c>TryWrite</c> fails) rather than back-pressuring the gw stream. Three counters
|
||||||
|
/// on the <c>ZB.MOM.WW.OtOpcUa.Driver.Galaxy</c> meter expose received / dispatched
|
||||||
|
/// / dropped totals so ops sees pressure before it manifests as user-visible loss.
|
||||||
|
/// </para>
|
||||||
/// </remarks>
|
/// </remarks>
|
||||||
internal sealed class EventPump : IAsyncDisposable
|
internal sealed class EventPump : IAsyncDisposable
|
||||||
{
|
{
|
||||||
|
public const string MeterName = "ZB.MOM.WW.OtOpcUa.Driver.Galaxy";
|
||||||
|
private const int DefaultChannelCapacity = 50_000;
|
||||||
|
|
||||||
|
// Single static meter so a host-level MeterListener catches all pump instances.
|
||||||
|
private static readonly Meter Meter = new(MeterName);
|
||||||
|
private static readonly Counter<long> EventsReceived =
|
||||||
|
Meter.CreateCounter<long>("galaxy.events.received", unit: "{event}",
|
||||||
|
description: "MxEvents read from the gateway StreamEvents stream.");
|
||||||
|
private static readonly Counter<long> EventsDispatched =
|
||||||
|
Meter.CreateCounter<long>("galaxy.events.dispatched", unit: "{event}",
|
||||||
|
description: "MxEvents passed through the bounded channel and into OnDataChange.");
|
||||||
|
private static readonly Counter<long> EventsDropped =
|
||||||
|
Meter.CreateCounter<long>("galaxy.events.dropped", unit: "{event}",
|
||||||
|
description: "MxEvents dropped because the bounded channel was full (newest-dropped).");
|
||||||
|
|
||||||
private readonly IGalaxySubscriber _subscriber;
|
private readonly IGalaxySubscriber _subscriber;
|
||||||
private readonly SubscriptionRegistry _registry;
|
private readonly SubscriptionRegistry _registry;
|
||||||
private readonly ILogger _logger;
|
private readonly ILogger _logger;
|
||||||
private readonly Func<long, ISubscriptionHandle> _handleFactory;
|
private readonly Func<long, ISubscriptionHandle> _handleFactory;
|
||||||
|
private readonly Channel<MxEvent> _channel;
|
||||||
|
private readonly KeyValuePair<string, object?> _clientTag;
|
||||||
private readonly CancellationTokenSource _cts = new();
|
private readonly CancellationTokenSource _cts = new();
|
||||||
|
|
||||||
private Task? _loop;
|
private Task? _loop;
|
||||||
|
private Task? _dispatchLoop;
|
||||||
private bool _disposed;
|
private bool _disposed;
|
||||||
|
|
||||||
public event EventHandler<DataChangeEventArgs>? OnDataChange;
|
public event EventHandler<DataChangeEventArgs>? OnDataChange;
|
||||||
@@ -34,12 +64,30 @@ internal sealed class EventPump : IAsyncDisposable
|
|||||||
IGalaxySubscriber subscriber,
|
IGalaxySubscriber subscriber,
|
||||||
SubscriptionRegistry registry,
|
SubscriptionRegistry registry,
|
||||||
ILogger? logger = null,
|
ILogger? logger = null,
|
||||||
Func<long, ISubscriptionHandle>? handleFactory = null)
|
Func<long, ISubscriptionHandle>? handleFactory = null,
|
||||||
|
int channelCapacity = DefaultChannelCapacity,
|
||||||
|
string? clientName = null)
|
||||||
{
|
{
|
||||||
_subscriber = subscriber ?? throw new ArgumentNullException(nameof(subscriber));
|
_subscriber = subscriber ?? throw new ArgumentNullException(nameof(subscriber));
|
||||||
_registry = registry ?? throw new ArgumentNullException(nameof(registry));
|
_registry = registry ?? throw new ArgumentNullException(nameof(registry));
|
||||||
_logger = logger ?? NullLogger.Instance;
|
_logger = logger ?? NullLogger.Instance;
|
||||||
_handleFactory = handleFactory ?? (id => new GalaxySubscriptionHandle(id));
|
_handleFactory = handleFactory ?? (id => new GalaxySubscriptionHandle(id));
|
||||||
|
|
||||||
|
if (channelCapacity < 1)
|
||||||
|
{
|
||||||
|
throw new ArgumentOutOfRangeException(nameof(channelCapacity),
|
||||||
|
"channelCapacity must be >= 1; recommended 50_000 for 50k-tag deployments.");
|
||||||
|
}
|
||||||
|
_channel = Channel.CreateBounded<MxEvent>(new BoundedChannelOptions(channelCapacity)
|
||||||
|
{
|
||||||
|
// Newest-dropped policy: when full, the producer's TryWrite returns false
|
||||||
|
// and we account for the drop. We do this manually rather than relying on
|
||||||
|
// BoundedChannelFullMode.DropWrite so we can count drops without polling.
|
||||||
|
FullMode = BoundedChannelFullMode.Wait,
|
||||||
|
SingleReader = true,
|
||||||
|
SingleWriter = true,
|
||||||
|
});
|
||||||
|
_clientTag = new KeyValuePair<string, object?>("galaxy.client", clientName ?? "<unknown>");
|
||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
@@ -51,6 +99,7 @@ internal sealed class EventPump : IAsyncDisposable
|
|||||||
ObjectDisposedException.ThrowIf(_disposed, this);
|
ObjectDisposedException.ThrowIf(_disposed, this);
|
||||||
if (_loop is not null) return;
|
if (_loop is not null) return;
|
||||||
_loop = Task.Run(() => RunAsync(_cts.Token));
|
_loop = Task.Run(() => RunAsync(_cts.Token));
|
||||||
|
_dispatchLoop = Task.Run(() => DispatchLoopAsync(_cts.Token));
|
||||||
}
|
}
|
||||||
|
|
||||||
private async Task RunAsync(CancellationToken ct)
|
private async Task RunAsync(CancellationToken ct)
|
||||||
@@ -60,7 +109,15 @@ internal sealed class EventPump : IAsyncDisposable
|
|||||||
await foreach (var ev in _subscriber.StreamEventsAsync(ct).WithCancellation(ct).ConfigureAwait(false))
|
await foreach (var ev in _subscriber.StreamEventsAsync(ct).WithCancellation(ct).ConfigureAwait(false))
|
||||||
{
|
{
|
||||||
if (ct.IsCancellationRequested) break;
|
if (ct.IsCancellationRequested) break;
|
||||||
Dispatch(ev);
|
EventsReceived.Add(1, _clientTag);
|
||||||
|
|
||||||
|
// Newest-dropped: TryWrite fast-paths the common case (channel has room).
|
||||||
|
// When full we count the drop and continue reading the gw stream so
|
||||||
|
// back-pressure doesn't propagate upstream.
|
||||||
|
if (!_channel.Writer.TryWrite(ev))
|
||||||
|
{
|
||||||
|
EventsDropped.Add(1, _clientTag);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
catch (OperationCanceledException) when (ct.IsCancellationRequested)
|
catch (OperationCanceledException) when (ct.IsCancellationRequested)
|
||||||
@@ -72,6 +129,32 @@ internal sealed class EventPump : IAsyncDisposable
|
|||||||
_logger.LogWarning(ex,
|
_logger.LogWarning(ex,
|
||||||
"Galaxy EventPump loop ended with an exception — reconnect supervisor (PR 4.5) handles restart.");
|
"Galaxy EventPump loop ended with an exception — reconnect supervisor (PR 4.5) handles restart.");
|
||||||
}
|
}
|
||||||
|
finally
|
||||||
|
{
|
||||||
|
// Tell the dispatch loop the producer is done so it drains and exits.
|
||||||
|
_channel.Writer.TryComplete();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private async Task DispatchLoopAsync(CancellationToken ct)
|
||||||
|
{
|
||||||
|
try
|
||||||
|
{
|
||||||
|
await foreach (var ev in _channel.Reader.ReadAllAsync(ct).ConfigureAwait(false))
|
||||||
|
{
|
||||||
|
Dispatch(ev);
|
||||||
|
EventsDispatched.Add(1, _clientTag);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
catch (OperationCanceledException) when (ct.IsCancellationRequested)
|
||||||
|
{
|
||||||
|
// Clean shutdown.
|
||||||
|
}
|
||||||
|
catch (Exception ex)
|
||||||
|
{
|
||||||
|
_logger.LogWarning(ex,
|
||||||
|
"Galaxy EventPump dispatch loop ended with an exception — events past this point will be lost until restart.");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private void Dispatch(MxEvent ev)
|
private void Dispatch(MxEvent ev)
|
||||||
@@ -121,10 +204,15 @@ internal sealed class EventPump : IAsyncDisposable
|
|||||||
if (_disposed) return;
|
if (_disposed) return;
|
||||||
_disposed = true;
|
_disposed = true;
|
||||||
_cts.Cancel();
|
_cts.Cancel();
|
||||||
|
_channel.Writer.TryComplete();
|
||||||
if (_loop is not null)
|
if (_loop is not null)
|
||||||
{
|
{
|
||||||
try { await _loop.ConfigureAwait(false); } catch { /* shutdown */ }
|
try { await _loop.ConfigureAwait(false); } catch { /* shutdown */ }
|
||||||
}
|
}
|
||||||
|
if (_dispatchLoop is not null)
|
||||||
|
{
|
||||||
|
try { await _dispatchLoop.ConfigureAwait(false); } catch { /* shutdown */ }
|
||||||
|
}
|
||||||
_cts.Dispose();
|
_cts.Dispose();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -0,0 +1,179 @@
|
|||||||
|
using System.Diagnostics.Metrics;
|
||||||
|
using System.Threading.Channels;
|
||||||
|
using Google.Protobuf.WellKnownTypes;
|
||||||
|
using MxGateway.Contracts.Proto;
|
||||||
|
using Shouldly;
|
||||||
|
using Xunit;
|
||||||
|
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||||||
|
using ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Runtime;
|
||||||
|
|
||||||
|
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Tests.Runtime;
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// PR 6.2 — pins the EventPump's bounded-channel + drop-newest behavior. We
|
||||||
|
/// hold the dispatch loop with a slow handler so the channel fills, then verify
|
||||||
|
/// the producer keeps reading from the gw stream and increments the
|
||||||
|
/// <c>galaxy.events.dropped</c> counter rather than blocking.
|
||||||
|
/// </summary>
|
||||||
|
public sealed class EventPumpBoundedChannelTests
|
||||||
|
{
|
||||||
|
[Fact]
|
||||||
|
public async Task Drops_newest_when_channel_fills_and_records_metric()
|
||||||
|
{
|
||||||
|
var counters = StartMeterCapture();
|
||||||
|
try
|
||||||
|
{
|
||||||
|
var subscriber = new ManualSubscriber();
|
||||||
|
var registry = new SubscriptionRegistry();
|
||||||
|
registry.Register(1, [new TagBinding("Tag.A", ItemHandle: 7)]);
|
||||||
|
|
||||||
|
// Tiny channel + slow handler ⇒ producer hits FullMode.Wait → TryWrite false
|
||||||
|
// for every overflow event.
|
||||||
|
var dispatchGate = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously);
|
||||||
|
await using var pump = new EventPump(
|
||||||
|
subscriber, registry, channelCapacity: 2, clientName: "PumpTest");
|
||||||
|
pump.OnDataChange += async (_, _) =>
|
||||||
|
{
|
||||||
|
// Block the dispatch loop until we've shoved enough events through to
|
||||||
|
// overflow the bounded channel. Consume the gate exactly once.
|
||||||
|
await dispatchGate.Task.ConfigureAwait(false);
|
||||||
|
};
|
||||||
|
pump.Start();
|
||||||
|
|
||||||
|
const int totalEvents = 10;
|
||||||
|
for (var i = 0; i < totalEvents; i++)
|
||||||
|
{
|
||||||
|
await subscriber.EmitAsync(itemHandle: 7, value: i);
|
||||||
|
}
|
||||||
|
// Give the producer a beat to run TryWrite for every event.
|
||||||
|
await Task.Delay(150);
|
||||||
|
|
||||||
|
// Capacity 2 + 1 in-flight in the dispatcher = 3 may have been accepted; the
|
||||||
|
// remainder should have hit the dropped counter. Don't pin exact counts —
|
||||||
|
// the scheduler can interleave; pin the invariants instead.
|
||||||
|
counters.Received.ShouldBeGreaterThanOrEqualTo(totalEvents);
|
||||||
|
counters.Dropped.ShouldBeGreaterThan(0,
|
||||||
|
"with capacity=2 and a held dispatcher we must drop at least one of 10 events");
|
||||||
|
(counters.Received).ShouldBe(counters.Dispatched + counters.Dropped + counters.InFlight,
|
||||||
|
"received = dispatched + dropped + (events still queued)");
|
||||||
|
|
||||||
|
// Release the dispatcher so DisposeAsync can drain cleanly.
|
||||||
|
dispatchGate.TrySetResult();
|
||||||
|
}
|
||||||
|
finally
|
||||||
|
{
|
||||||
|
counters.Dispose();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
[Fact]
|
||||||
|
public async Task Throws_when_channelCapacity_is_invalid()
|
||||||
|
{
|
||||||
|
var subscriber = new ManualSubscriber();
|
||||||
|
var registry = new SubscriptionRegistry();
|
||||||
|
Should.Throw<ArgumentOutOfRangeException>(() =>
|
||||||
|
new EventPump(subscriber, registry, channelCapacity: 0));
|
||||||
|
Should.Throw<ArgumentOutOfRangeException>(() =>
|
||||||
|
new EventPump(subscriber, registry, channelCapacity: -1));
|
||||||
|
await Task.CompletedTask;
|
||||||
|
}
|
||||||
|
|
||||||
|
[Fact]
|
||||||
|
public async Task Tags_metrics_with_client_name_for_multi_driver_hosts()
|
||||||
|
{
|
||||||
|
var captured = new List<(string Instrument, KeyValuePair<string, object?>[] Tags)>();
|
||||||
|
using var listener = new MeterListener();
|
||||||
|
listener.InstrumentPublished = (instr, l) =>
|
||||||
|
{
|
||||||
|
if (instr.Meter.Name == EventPump.MeterName) l.EnableMeasurementEvents(instr);
|
||||||
|
};
|
||||||
|
listener.SetMeasurementEventCallback<long>((instr, _, tags, _) =>
|
||||||
|
{
|
||||||
|
captured.Add((instr.Name, tags.ToArray()));
|
||||||
|
});
|
||||||
|
listener.Start();
|
||||||
|
|
||||||
|
var subscriber = new ManualSubscriber();
|
||||||
|
var registry = new SubscriptionRegistry();
|
||||||
|
registry.Register(1, [new TagBinding("Tag.A", ItemHandle: 7)]);
|
||||||
|
|
||||||
|
await using (var pump = new EventPump(subscriber, registry, channelCapacity: 4, clientName: "Driver-X"))
|
||||||
|
{
|
||||||
|
pump.Start();
|
||||||
|
await subscriber.EmitAsync(7, 42.0);
|
||||||
|
await Task.Delay(100);
|
||||||
|
listener.RecordObservableInstruments();
|
||||||
|
}
|
||||||
|
|
||||||
|
// The static Meter is shared across all EventPump instances in the test
|
||||||
|
// assembly; xUnit may run other pump tests in parallel and their
|
||||||
|
// measurements land on the same listener. Filter to our pump's tag value.
|
||||||
|
var ours = captured
|
||||||
|
.Where(c => c.Tags.Any(t => t.Key == "galaxy.client"
|
||||||
|
&& string.Equals((string?)t.Value, "Driver-X", StringComparison.Ordinal)))
|
||||||
|
.ToList();
|
||||||
|
|
||||||
|
ours.ShouldNotBeEmpty(
|
||||||
|
"at least one measurement from this test's pump must carry galaxy.client=Driver-X");
|
||||||
|
ours.ShouldContain(c => c.Instrument == "galaxy.events.received");
|
||||||
|
}
|
||||||
|
|
||||||
|
private static CounterCapture StartMeterCapture()
|
||||||
|
{
|
||||||
|
var capture = new CounterCapture();
|
||||||
|
var listener = new MeterListener();
|
||||||
|
listener.InstrumentPublished = (instr, l) =>
|
||||||
|
{
|
||||||
|
if (instr.Meter.Name == EventPump.MeterName) l.EnableMeasurementEvents(instr);
|
||||||
|
};
|
||||||
|
listener.SetMeasurementEventCallback<long>((instr, value, _, _) =>
|
||||||
|
{
|
||||||
|
switch (instr.Name)
|
||||||
|
{
|
||||||
|
case "galaxy.events.received": Interlocked.Add(ref capture._received, value); break;
|
||||||
|
case "galaxy.events.dispatched": Interlocked.Add(ref capture._dispatched, value); break;
|
||||||
|
case "galaxy.events.dropped": Interlocked.Add(ref capture._dropped, value); break;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
listener.Start();
|
||||||
|
capture.Listener = listener;
|
||||||
|
return capture;
|
||||||
|
}
|
||||||
|
|
||||||
|
private sealed class CounterCapture : IDisposable
|
||||||
|
{
|
||||||
|
public MeterListener? Listener;
|
||||||
|
internal long _received, _dispatched, _dropped;
|
||||||
|
public long Received => Interlocked.Read(ref _received);
|
||||||
|
public long Dispatched => Interlocked.Read(ref _dispatched);
|
||||||
|
public long Dropped => Interlocked.Read(ref _dropped);
|
||||||
|
public long InFlight => Math.Max(0, Received - Dispatched - Dropped);
|
||||||
|
public void Dispose() => Listener?.Dispose();
|
||||||
|
}
|
||||||
|
|
||||||
|
private sealed class ManualSubscriber : IGalaxySubscriber
|
||||||
|
{
|
||||||
|
private readonly Channel<MxEvent> _stream =
|
||||||
|
Channel.CreateUnbounded<MxEvent>(new UnboundedChannelOptions { SingleReader = true });
|
||||||
|
|
||||||
|
public Task<IReadOnlyList<SubscribeResult>> SubscribeBulkAsync(
|
||||||
|
IReadOnlyList<string> fullReferences, int bufferedUpdateIntervalMs, CancellationToken cancellationToken)
|
||||||
|
=> Task.FromResult<IReadOnlyList<SubscribeResult>>([]);
|
||||||
|
|
||||||
|
public Task UnsubscribeBulkAsync(IReadOnlyList<int> itemHandles, CancellationToken cancellationToken)
|
||||||
|
=> Task.CompletedTask;
|
||||||
|
|
||||||
|
public IAsyncEnumerable<MxEvent> StreamEventsAsync(CancellationToken cancellationToken)
|
||||||
|
=> _stream.Reader.ReadAllAsync(cancellationToken);
|
||||||
|
|
||||||
|
public ValueTask EmitAsync(int itemHandle, double value) =>
|
||||||
|
_stream.Writer.WriteAsync(new MxEvent
|
||||||
|
{
|
||||||
|
Family = MxEventFamily.OnDataChange,
|
||||||
|
ItemHandle = itemHandle,
|
||||||
|
Value = new MxValue { DoubleValue = value },
|
||||||
|
Quality = 192,
|
||||||
|
SourceTimestamp = Timestamp.FromDateTime(DateTime.UtcNow),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user