fix(communication): resolve Communication-004..008 — Resume supervision, gRPC option wiring, address-load logging, sync dispose, flap detection

This commit is contained in:
Joseph Doherty
2026-05-16 20:58:03 -04:00
parent 3e7a3d7e31
commit 31a6995d24
12 changed files with 656 additions and 51 deletions

View File

@@ -84,6 +84,15 @@ public class CentralCommunicationActor : ReceiveActor
// Periodic refresh trigger
Receive<RefreshSiteAddresses>(_ => LoadSiteAddressesFromDb());
// Communication-006: a faulted LoadSiteAddressesFromDb task is piped here as a
// Status.Failure. Without this handler the failure was an unhandled message
// (debug-level only) and the refresh failed silently — operators could not
// distinguish "no sites configured" from "database is down". Log at Warning.
Receive<Status.Failure>(failure =>
_log.Warning(failure.Cause,
"Failed to load site addresses from the database; the site ClusterClient "
+ "cache was not refreshed and may be stale or empty"));
// Health monitoring: heartbeats and health reports from sites
Receive<HeartbeatMessage>(HandleHeartbeat);
Receive<SiteHealthReport>(HandleSiteHealthReport);
@@ -296,6 +305,25 @@ public class CentralCommunicationActor : ReceiveActor
}
}
/// <summary>
/// Supervision policy for this coordinator (CLAUDE.md: "Resume for coordinator actors").
/// CentralCommunicationActor is long-lived and owns the per-site ClusterClient map, so
/// every child failure (e.g. in a ClusterClient child) is logged at Warning and answered
/// with <see cref="Directive.Resume"/>: the child keeps its connection state instead of
/// having it wiped by a Restart. Retries are unlimited (-1) with an infinite time window.
/// </summary>
/// <returns>A one-for-one strategy whose decider resumes on any exception.</returns>
protected override SupervisorStrategy SupervisorStrategy()
{
    // One decider for every failure type: warn, then resume the child.
    var resumeAlways = Decider.From(failure =>
    {
        _log.Warning(failure, "Child actor of CentralCommunicationActor faulted, resuming (state preserved)");
        return Directive.Resume;
    });

    return new OneForOneStrategy(
        maxNrOfRetries: -1,
        withinTimeRange: Timeout.InfiniteTimeSpan,
        decider: resumeAlways);
}
protected override void PreStart()
{
_log.Info("CentralCommunicationActor started");

View File

@@ -28,7 +28,19 @@ public class DebugStreamBridgeActor : ReceiveActor, IWithTimers
private const int MaxRetries = 3;
private const string ReconnectTimerKey = "grpc-reconnect";
private const string StabilityTimerKey = "grpc-stability";
internal static TimeSpan ReconnectDelay { get; set; } = TimeSpan.FromSeconds(5);
/// <summary>
/// How long a freshly-opened gRPC stream must stay up before its retry budget
/// is considered "recovered" and <see cref="_retryCount"/> is reset to 0.
/// Communication-008: the retry count must NOT be reset by individual events —
/// a stream that connects, delivers one event, then fails repeatedly would
/// otherwise reconnect forever and never trip <see cref="MaxRetries"/>. Resetting
/// only after a stable interval bounds a flapping stream.
/// </summary>
internal static TimeSpan StabilityWindow { get; set; } = TimeSpan.FromSeconds(60);
private int _retryCount;
private bool _useNodeA = true;
private bool _stopped;
@@ -66,16 +78,21 @@ public class DebugStreamBridgeActor : ReceiveActor, IWithTimers
OpenGrpcStream();
});
// Domain events arriving via Self.Tell from gRPC callback
Receive<AttributeValueChanged>(changed =>
{
_retryCount = 0; // Successful event resets retry count
_onEvent(changed);
});
Receive<AlarmStateChanged>(changed =>
// Domain events arriving via Self.Tell from gRPC callback.
// Communication-008: receiving an event must NOT reset _retryCount — a
// flapping stream that delivers a single event between failures would
// otherwise never trip MaxRetries. The retry budget is recovered only by
// GrpcStreamStable (a stream that has stayed up for StabilityWindow).
Receive<AttributeValueChanged>(changed => _onEvent(changed));
Receive<AlarmStateChanged>(changed => _onEvent(changed));
// Stream has been stably connected for StabilityWindow — recover the
// retry budget so a future transient fault gets a fresh set of retries.
Receive<GrpcStreamStable>(_ =>
{
if (_stopped) return;
_retryCount = 0;
_onEvent(changed);
_log.Debug("gRPC stream for {0} stable, retry count reset", _instanceUniqueName);
});
// gRPC stream error — attempt reconnection
@@ -151,6 +168,10 @@ public class DebugStreamBridgeActor : ReceiveActor, IWithTimers
_grpcCts?.Dispose();
_grpcCts = new CancellationTokenSource();
// Arm the stability timer: if the stream stays up for StabilityWindow the
// retry budget is recovered (Communication-008). Cancelled by HandleGrpcError.
Timers.StartSingleTimer(StabilityTimerKey, new GrpcStreamStable(), StabilityWindow);
var client = _grpcFactory.GetOrCreate(_siteIdentifier, endpoint);
var self = Self;
var ct = _grpcCts.Token;
@@ -171,6 +192,10 @@ public class DebugStreamBridgeActor : ReceiveActor, IWithTimers
{
if (_stopped) return;
// The stream failed before reaching the stability window — its retry
// budget is NOT recovered (Communication-008).
Timers.Cancel(StabilityTimerKey);
_retryCount++;
if (_retryCount > MaxRetries)
@@ -239,3 +264,10 @@ internal record GrpcStreamError(Exception Exception);
/// Internal message to trigger gRPC stream reconnection.
/// </summary>
internal record ReconnectGrpcStream;
/// <summary>
/// Internal message indicating the current gRPC stream has been connected long
/// enough (<see cref="DebugStreamBridgeActor.StabilityWindow"/>) to be considered
/// stable, so the reconnect retry budget can be recovered.
/// </summary>
internal record GrpcStreamStable;

View File

@@ -177,6 +177,24 @@ public class SiteCommunicationActor : ReceiveActor, IWithTimers
}
/// <summary>
/// Supervision policy for this coordinator (CLAUDE.md: "Resume for coordinator actors").
/// SiteCommunicationActor is long-lived and routes all message patterns to local
/// handlers, so every child failure is logged at Warning and answered with
/// <see cref="Directive.Resume"/>: the child's in-memory state is kept rather than
/// discarded by a Restart. Retries are unlimited (-1) with an infinite time window.
/// </summary>
/// <returns>A one-for-one strategy whose decider resumes on any exception.</returns>
protected override SupervisorStrategy SupervisorStrategy()
{
    // One decider for every failure type: warn, then resume the child.
    var resumeAlways = Decider.From(failure =>
    {
        _log.Warning(failure, "Child actor of SiteCommunicationActor faulted, resuming (state preserved)");
        return Directive.Resume;
    });

    return new OneForOneStrategy(
        maxNrOfRetries: -1,
        withinTimeRange: Timeout.InfiniteTimeSpan,
        decider: resumeAlways);
}
protected override void PreStart()
{
_log.Info("SiteCommunicationActor started for site {0}", _siteId);

View File

@@ -13,21 +13,45 @@ namespace ScadaLink.Communication.Grpc;
/// SiteStreamGrpcServer. The central-side DebugStreamBridgeActor uses this
/// to open server-streaming calls for individual instances.
/// </summary>
public class SiteStreamGrpcClient : IAsyncDisposable
public class SiteStreamGrpcClient : IAsyncDisposable, IDisposable
{
private readonly GrpcChannel? _channel;
private readonly SiteStreamService.SiteStreamServiceClient? _client;
private readonly ILogger? _logger;
private readonly ConcurrentDictionary<string, CancellationTokenSource> _subscriptions = new();
/// <summary>
/// The HTTP/2 keepalive ping delay actually applied to this client's channel.
/// Exposed for tests verifying that <see cref="CommunicationOptions"/> is honoured.
/// </summary>
internal TimeSpan KeepAlivePingDelay { get; }
/// <summary>
/// The HTTP/2 keepalive ping timeout actually applied to this client's channel.
/// Exposed for tests verifying that <see cref="CommunicationOptions"/> is honoured.
/// </summary>
internal TimeSpan KeepAlivePingTimeout { get; }
/// <summary>
/// Convenience constructor: forwards to the three-argument overload with a
/// default-constructed <see cref="CommunicationOptions"/>, so the keepalive
/// settings come from whatever defaults that type declares.
/// </summary>
/// <param name="endpoint">gRPC endpoint address, forwarded unchanged.</param>
/// <param name="logger">Logger, forwarded unchanged.</param>
public SiteStreamGrpcClient(string endpoint, ILogger logger)
: this(endpoint, logger, new CommunicationOptions())
{
}
/// <summary>
/// Creates a client whose HTTP/2 keepalive is taken from <see cref="CommunicationOptions"/>
/// rather than hard-coded, satisfying the design doc's "gRPC Connection Keepalive"
/// section which states these values are configurable.
/// </summary>
public SiteStreamGrpcClient(string endpoint, ILogger logger, CommunicationOptions options)
{
KeepAlivePingDelay = options.GrpcKeepAlivePingDelay;
KeepAlivePingTimeout = options.GrpcKeepAlivePingTimeout;
_channel = GrpcChannel.ForAddress(endpoint, new GrpcChannelOptions
{
HttpHandler = new SocketsHttpHandler
{
KeepAlivePingDelay = TimeSpan.FromSeconds(15),
KeepAlivePingTimeout = TimeSpan.FromSeconds(10),
KeepAlivePingDelay = options.GrpcKeepAlivePingDelay,
KeepAlivePingTimeout = options.GrpcKeepAlivePingTimeout,
KeepAlivePingPolicy = HttpKeepAlivePingPolicy.Always
}
});
@@ -205,7 +229,13 @@ public class SiteStreamGrpcClient : IAsyncDisposable
_ => AlarmLevel.None
};
public async ValueTask DisposeAsync()
/// <summary>
/// Releases all subscription CancellationTokenSources and the underlying
/// gRPC channel. All teardown here is synchronous (CTS disposal and
/// <see cref="GrpcChannel.Dispose"/>), so a synchronous <see cref="Dispose"/>
/// can release everything without sync-over-async blocking.
/// </summary>
private void ReleaseResources()
{
foreach (var cts in _subscriptions.Values)
{
@@ -214,9 +244,22 @@ public class SiteStreamGrpcClient : IAsyncDisposable
}
_subscriptions.Clear();
if (_channel is not null)
_channel.Dispose();
_channel?.Dispose();
}
await ValueTask.CompletedTask;
/// <summary>
/// Asynchronous disposal. Delegates to <c>ReleaseResources</c> and returns an
/// already-completed <see cref="ValueTask"/>; nothing is awaited here.
/// </summary>
/// <returns>A completed <see cref="ValueTask"/>.</returns>
public virtual ValueTask DisposeAsync()
{
    ReleaseResources();

    // CA1816: this type is not sealed, so a derived type may add a finalizer;
    // tell the GC it no longer needs to run once disposal has happened.
    GC.SuppressFinalize(this);
    return ValueTask.CompletedTask;
}
/// <summary>
/// Synchronous disposal. All resources held by this client are released
/// synchronously, so callers (e.g. <see cref="SiteStreamGrpcClientFactory.Dispose"/>)
/// need not block on the async disposal path.
/// </summary>
public virtual void Dispose()
{
    ReleaseResources();

    // CA1816: this type is not sealed, so a derived type may add a finalizer;
    // tell the GC it no longer needs to run once disposal has happened.
    GC.SuppressFinalize(this);
}
}

View File

@@ -1,5 +1,6 @@
using System.Collections.Concurrent;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
namespace ScadaLink.Communication.Grpc;
@@ -12,22 +13,43 @@ public class SiteStreamGrpcClientFactory : IAsyncDisposable, IDisposable
{
private readonly ConcurrentDictionary<string, SiteStreamGrpcClient> _clients = new();
private readonly ILoggerFactory _loggerFactory;
private readonly CommunicationOptions _options;
public SiteStreamGrpcClientFactory(ILoggerFactory loggerFactory)
: this(loggerFactory, Options.Create(new CommunicationOptions()))
{
_loggerFactory = loggerFactory;
}
/// <summary>
/// Returns an existing client for the site or creates a new one.
/// DI constructor — flows <see cref="CommunicationOptions"/> into every created
/// <see cref="SiteStreamGrpcClient"/> so the configured gRPC keepalive settings
/// are applied rather than hard-coded defaults.
/// </summary>
public SiteStreamGrpcClientFactory(ILoggerFactory loggerFactory, IOptions<CommunicationOptions> options)
{
    // Reject a missing options wrapper with an ArgumentNullException that names
    // the parameter, rather than a bare NullReferenceException from options.Value.
    ArgumentNullException.ThrowIfNull(options);

    _loggerFactory = loggerFactory;
    // Snapshot the options once; every client created later receives this instance.
    _options = options.Value;
}
/// <summary>
/// Returns an existing client for the site or creates a new one. The new
/// client is created via <see cref="CreateClient"/> and tracked so the
/// factory's <see cref="Dispose"/> / <see cref="DisposeAsync"/> release it.
/// </summary>
public virtual SiteStreamGrpcClient GetOrCreate(string siteIdentifier, string grpcEndpoint)
{
return _clients.GetOrAdd(siteIdentifier, _ =>
{
var logger = _loggerFactory.CreateLogger<SiteStreamGrpcClient>();
return new SiteStreamGrpcClient(grpcEndpoint, logger);
});
return _clients.GetOrAdd(siteIdentifier, _ => CreateClient(grpcEndpoint));
}
/// <summary>
/// Factory hook that builds one <see cref="SiteStreamGrpcClient"/> for the given
/// endpoint, passing it a typed logger and this factory's
/// <see cref="CommunicationOptions"/>. Virtual so tests can substitute a tracking
/// client while still exercising the factory's real caching and disposal machinery.
/// </summary>
/// <param name="grpcEndpoint">Endpoint address handed straight to the client constructor.</param>
/// <returns>A newly constructed client; this method itself does not cache it.</returns>
protected virtual SiteStreamGrpcClient CreateClient(string grpcEndpoint) =>
    new SiteStreamGrpcClient(
        grpcEndpoint,
        _loggerFactory.CreateLogger<SiteStreamGrpcClient>(),
        _options);
/// <summary>
@@ -50,8 +72,19 @@ public class SiteStreamGrpcClientFactory : IAsyncDisposable, IDisposable
_clients.Clear();
}
/// <summary>
/// Synchronous disposal. Communication-007: this used to block on
/// <c>DisposeAsync().AsTask().GetAwaiter().GetResult()</c> (sync-over-async,
/// a stall/deadlock risk during host shutdown). Each
/// <see cref="SiteStreamGrpcClient"/> releases all of its resources
/// synchronously, so we dispose them directly with no async path.
/// </summary>
public void Dispose()
{
DisposeAsync().AsTask().GetAwaiter().GetResult();
foreach (var client in _clients.Values)
{
client.Dispose();
}
_clients.Clear();
}
}

View File

@@ -3,6 +3,7 @@ using System.Threading.Channels;
using Akka.Actor;
using Grpc.Core;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using GrpcStatus = Grpc.Core.Status;
namespace ScadaLink.Communication.Grpc;
@@ -19,6 +20,7 @@ public class SiteStreamGrpcServer : SiteStreamService.SiteStreamServiceBase
private readonly ILogger<SiteStreamGrpcServer> _logger;
private readonly ConcurrentDictionary<string, StreamEntry> _activeStreams = new();
private readonly int _maxConcurrentStreams;
private readonly TimeSpan _maxStreamLifetime;
private volatile bool _ready;
private long _actorCounter;
@@ -26,10 +28,36 @@ public class SiteStreamGrpcServer : SiteStreamService.SiteStreamServiceBase
ISiteStreamSubscriber streamSubscriber,
ILogger<SiteStreamGrpcServer> logger,
int maxConcurrentStreams = 100)
: this(streamSubscriber, logger, maxConcurrentStreams, TimeSpan.FromHours(4))
{
}
/// <summary>
/// DI constructor — binds <see cref="CommunicationOptions.GrpcMaxConcurrentStreams"/>
/// and <see cref="CommunicationOptions.GrpcMaxStreamLifetime"/> so the documented
/// concurrency limit and the 4-hour zombie-stream session timeout are honoured
/// rather than hard-coded.
/// </summary>
/// <param name="streamSubscriber">Subscriber forwarded unchanged to the private constructor.</param>
/// <param name="logger">Logger forwarded unchanged to the private constructor.</param>
/// <param name="options">Options wrapper; <c>options.Value</c> is read once per bound setting.</param>
public SiteStreamGrpcServer(
ISiteStreamSubscriber streamSubscriber,
ILogger<SiteStreamGrpcServer> logger,
IOptions<CommunicationOptions> options)
: this(streamSubscriber, logger,
options.Value.GrpcMaxConcurrentStreams,
options.Value.GrpcMaxStreamLifetime)
{
}
/// <summary>
/// Shared initializer that the public constructors delegate to: stores the
/// subscriber, logger, concurrent-stream limit and per-stream lifetime as given,
/// without validating or adjusting them.
/// </summary>
private SiteStreamGrpcServer(
    ISiteStreamSubscriber streamSubscriber,
    ILogger<SiteStreamGrpcServer> logger,
    int maxConcurrentStreams,
    TimeSpan maxStreamLifetime)
{
    (_streamSubscriber, _logger, _maxConcurrentStreams, _maxStreamLifetime) =
        (streamSubscriber, logger, maxConcurrentStreams, maxStreamLifetime);
}
/// <summary>
@@ -49,6 +77,12 @@ public class SiteStreamGrpcServer : SiteStreamService.SiteStreamServiceBase
/// </summary>
public int ActiveStreamCount => _activeStreams.Count;
/// <summary>Effective max concurrent stream limit. Exposed for tests.</summary>
internal int MaxConcurrentStreams => _maxConcurrentStreams;
/// <summary>Effective per-stream session lifetime. Exposed for tests.</summary>
internal TimeSpan MaxStreamLifetime => _maxStreamLifetime;
public override async Task SubscribeInstance(
InstanceStreamRequest request,
IServerStreamWriter<SiteStreamEvent> responseStream,
@@ -69,6 +103,11 @@ public class SiteStreamGrpcServer : SiteStreamService.SiteStreamServiceBase
throw new RpcException(new GrpcStatus(StatusCode.ResourceExhausted, "Max concurrent streams reached"));
using var streamCts = CancellationTokenSource.CreateLinkedTokenSource(context.CancellationToken);
// Session timeout (design doc "gRPC Connection Keepalive": 4-hour third layer
// of dead-client detection) — forces a long-lived zombie stream to terminate
// even if keepalive PINGs never detect the loss.
if (_maxStreamLifetime > TimeSpan.Zero && _maxStreamLifetime != Timeout.InfiniteTimeSpan)
streamCts.CancelAfter(_maxStreamLifetime);
var entry = new StreamEntry(streamCts);
_activeStreams[request.CorrelationId] = entry;