feat(scadabridge): wire scadabridge.store_and_forward.queue.depth gauge to buffered count
This commit is contained in:
@@ -1,5 +1,6 @@
|
||||
using Microsoft.Extensions.Logging;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Services;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Observability;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Types;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums;
|
||||
|
||||
@@ -98,6 +99,37 @@ public class StoreAndForwardService
|
||||
/// </summary>
|
||||
private static readonly TimeSpan SweepShutdownWaitTimeout = TimeSpan.FromSeconds(10);
|
||||
|
||||
/// <summary>
|
||||
/// WP-14 (telemetry): cached count of messages currently buffered for
|
||||
/// forwarding — i.e. rows in <see cref="StoreAndForwardMessageStatus.Pending"/>,
|
||||
/// the live store-and-forward queue waiting to be delivered. This backs the
|
||||
/// <c>scadabridge.store_and_forward.queue.depth</c> observable gauge.
|
||||
/// <para>
|
||||
/// The gauge's collection callback is synchronous and is invoked frequently by
|
||||
/// the OpenTelemetry/Prometheus collector, so it must never run an async SQLite
|
||||
/// <c>COUNT(*)</c>. Instead this <see cref="long"/> is seeded once from storage
|
||||
/// in <see cref="StartAsync"/> and then adjusted in-process on the existing
|
||||
/// paths that change the Pending population: <see cref="BufferAsync"/> (+1),
|
||||
/// successful-retry removal and Pending→Parked transitions in
|
||||
/// <see cref="RetryMessageAsync"/> (-1), and operator requeue in
|
||||
/// <see cref="RetryParkedMessageAsync"/> (+1). The provider registered with
|
||||
/// <see cref="ScadaBridgeTelemetry.SetQueueDepthProvider"/> reads it via
|
||||
/// <see cref="Interlocked.Read"/> — non-blocking and sync-safe. It is an
|
||||
/// approximate, eventually-consistent gauge (concurrent failover replication
|
||||
/// applies to the standby's own store, not this counter), which is exactly
|
||||
/// what a queue-depth metric needs.
|
||||
/// </para>
|
||||
/// </summary>
|
||||
private long _bufferedCount;
|
||||
|
||||
/// <summary>
|
||||
/// WP-14 (telemetry): guards one-time registration of the queue-depth provider
|
||||
/// with <see cref="ScadaBridgeTelemetry"/>. The gauge is process-global, but this
|
||||
/// flag is per-instance: it only stops repeated <see cref="StartAsync"/> calls on the
|
||||
/// same service from re-registering; a second instance still registers. 0 = not yet registered, 1 = done.
|
||||
/// </summary>
|
||||
private int _queueDepthProviderRegistered;
|
||||
|
||||
/// <summary>
|
||||
/// WP-10: Delivery handler delegate. The return value / exception is interpreted
|
||||
/// the same way on both the immediate-delivery path (<see cref="EnqueueAsync"/>)
|
||||
@@ -170,6 +202,19 @@ public class StoreAndForwardService
|
||||
public async Task StartAsync()
|
||||
{
|
||||
await _storage.InitializeAsync();
|
||||
|
||||
// WP-14 (telemetry): seed the cached buffered-message count from the
|
||||
// store on each start (the gauge callback cannot run an async COUNT), then
|
||||
// register the sync, non-blocking provider with the process-global
|
||||
// ScadaBridgeTelemetry gauge — registration is guarded to run once per instance.
|
||||
var pending = await _storage.GetMessageCountByStatusAsync(
|
||||
StoreAndForwardMessageStatus.Pending);
|
||||
Interlocked.Exchange(ref _bufferedCount, pending);
|
||||
if (Interlocked.CompareExchange(ref _queueDepthProviderRegistered, 1, 0) == 0)
|
||||
{
|
||||
ScadaBridgeTelemetry.SetQueueDepthProvider(() => Interlocked.Read(ref _bufferedCount));
|
||||
}
|
||||
|
||||
_retryTimer = new Timer(
|
||||
// StoreAndForward-024: capture the sweep Task on each tick so
|
||||
// StopAsync can await any in-flight invocation before the host
|
||||
@@ -396,6 +441,10 @@ public class StoreAndForwardService
|
||||
{
|
||||
await _storage.EnqueueAsync(message);
|
||||
_replication?.ReplicateEnqueue(message);
|
||||
// WP-14 (telemetry): a freshly buffered row is Pending → grows the live
|
||||
// queue depth. Bumped after the durable write so the gauge never leads the
|
||||
// store.
|
||||
Interlocked.Increment(ref _bufferedCount);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
@@ -452,6 +501,8 @@ public class StoreAndForwardService
|
||||
{
|
||||
await _storage.RemoveMessageAsync(message.Id);
|
||||
_replication?.ReplicateRemove(message.Id);
|
||||
// WP-14 (telemetry): a delivered row leaves the Pending queue.
|
||||
Interlocked.Decrement(ref _bufferedCount);
|
||||
RaiseActivity("Delivered", message.Category,
|
||||
$"Delivered to {message.Target} after {message.RetryCount} retries");
|
||||
|
||||
@@ -483,6 +534,9 @@ public class StoreAndForwardService
|
||||
message.Id);
|
||||
return;
|
||||
}
|
||||
// WP-14 (telemetry): the row committed Pending→Parked, leaving the live
|
||||
// forward queue. Only counted when the conditional update actually won.
|
||||
Interlocked.Decrement(ref _bufferedCount);
|
||||
_replication?.ReplicatePark(message);
|
||||
RaiseActivity("Parked", message.Category,
|
||||
$"Permanent failure for {message.Target}: handler returned false");
|
||||
@@ -519,6 +573,9 @@ public class StoreAndForwardService
|
||||
message.Id);
|
||||
return;
|
||||
}
|
||||
// WP-14 (telemetry): the row committed Pending→Parked, leaving the
|
||||
// live forward queue. Only counted when the conditional update won.
|
||||
Interlocked.Decrement(ref _bufferedCount);
|
||||
_replication?.ReplicatePark(message);
|
||||
RaiseActivity("Parked", message.Category,
|
||||
$"Max retries ({message.MaxRetries}) reached for {message.Target}");
|
||||
@@ -737,6 +794,11 @@ public class StoreAndForwardService
|
||||
return false;
|
||||
}
|
||||
|
||||
// WP-14 (telemetry): an operator requeue moves Parked→Pending, re-adding the
|
||||
// row to the live forward queue. Counted only when the conditional storage
|
||||
// update actually flipped the row.
|
||||
Interlocked.Increment(ref _bufferedCount);
|
||||
|
||||
// The active node just rewrote this row to Pending with retry_count = 0
|
||||
// and cleared last_error / last_attempt_at (see
|
||||
// StoreAndForwardStorage.RetryParkedMessageAsync). Reconstruct the
|
||||
|
||||
Reference in New Issue
Block a user