fix(communication): resolve Communication-004..008 — Resume supervision, gRPC option wiring, address-load logging, sync dispose, flap detection
This commit is contained in:
@@ -84,6 +84,15 @@ public class CentralCommunicationActor : ReceiveActor
|
||||
// Periodic refresh trigger
|
||||
Receive<RefreshSiteAddresses>(_ => LoadSiteAddressesFromDb());
|
||||
|
||||
// Communication-006: a faulted LoadSiteAddressesFromDb task is piped here as a
|
||||
// Status.Failure. Without this handler the failure was an unhandled message
|
||||
// (debug-level only) and the refresh failed silently — operators could not
|
||||
// distinguish "no sites configured" from "database is down". Log at Warning.
|
||||
Receive<Status.Failure>(failure =>
|
||||
_log.Warning(failure.Cause,
|
||||
"Failed to load site addresses from the database; the site ClusterClient "
|
||||
+ "cache was not refreshed and may be stale or empty"));
|
||||
|
||||
// Health monitoring: heartbeats and health reports from sites
|
||||
Receive<HeartbeatMessage>(HandleHeartbeat);
|
||||
Receive<SiteHealthReport>(HandleSiteHealthReport);
|
||||
@@ -296,6 +305,25 @@ public class CentralCommunicationActor : ReceiveActor
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Supervision policy for this coordinator (CLAUDE.md: "Resume for coordinator actors").
/// CentralCommunicationActor is long-lived and owns the per-site ClusterClient map,
/// so when a child (e.g. a ClusterClient child) hits a transient fault it is told to
/// Resume: the child keeps its connection state instead of having it wiped by a
/// Restart. Every fault is logged at Warning before the directive is returned.
/// </summary>
/// <returns>
/// A one-for-one strategy with no retry limit and no time window whose decider
/// always answers <see cref="Directive.Resume"/>.
/// </returns>
protected override SupervisorStrategy SupervisorStrategy()
{
    // The decider treats every exception the same way: log it, then resume the child.
    var resumeOnAnyFault = Decider.From(fault =>
    {
        _log.Warning(fault, "Child actor of CentralCommunicationActor faulted, resuming (state preserved)");
        return Directive.Resume;
    });

    // -1 retries + infinite window = the strategy never gives up on a child.
    return new OneForOneStrategy(
        maxNrOfRetries: -1,
        withinTimeRange: Timeout.InfiniteTimeSpan,
        decider: resumeOnAnyFault);
}
|
||||
|
||||
protected override void PreStart()
|
||||
{
|
||||
_log.Info("CentralCommunicationActor started");
|
||||
|
||||
@@ -28,7 +28,19 @@ public class DebugStreamBridgeActor : ReceiveActor, IWithTimers
|
||||
|
||||
private const int MaxRetries = 3;
|
||||
private const string ReconnectTimerKey = "grpc-reconnect";
|
||||
private const string StabilityTimerKey = "grpc-stability";
|
||||
internal static TimeSpan ReconnectDelay { get; set; } = TimeSpan.FromSeconds(5);
|
||||
|
||||
/// <summary>
|
||||
/// How long a freshly-opened gRPC stream must stay up before its retry budget
|
||||
/// is considered "recovered" and <see cref="_retryCount"/> is reset to 0.
|
||||
/// Communication-008: the retry count must NOT be reset by individual events —
|
||||
/// a stream that connects, delivers one event, then fails repeatedly would
|
||||
/// otherwise reconnect forever and never trip <see cref="MaxRetries"/>. Resetting
|
||||
/// only after a stable interval bounds a flapping stream.
|
||||
/// </summary>
|
||||
internal static TimeSpan StabilityWindow { get; set; } = TimeSpan.FromSeconds(60);
|
||||
|
||||
private int _retryCount;
|
||||
private bool _useNodeA = true;
|
||||
private bool _stopped;
|
||||
@@ -66,16 +78,21 @@ public class DebugStreamBridgeActor : ReceiveActor, IWithTimers
|
||||
OpenGrpcStream();
|
||||
});
|
||||
|
||||
// Domain events arriving via Self.Tell from the gRPC callback are forwarded
// unchanged to the consumer callback.
// Communication-008: receiving an event must NOT reset _retryCount — a
// flapping stream that delivers a single event between failures would
// otherwise never trip MaxRetries. The retry budget is recovered only by
// GrpcStreamStable (a stream that has stayed up for StabilityWindow).
Receive<AttributeValueChanged>(changed => _onEvent(changed));
Receive<AlarmStateChanged>(changed => _onEvent(changed));

// Stream has been stably connected for StabilityWindow — recover the
// retry budget so a future transient fault gets a fresh set of retries.
Receive<GrpcStreamStable>(_ =>
{
    // Ignore the stability tick once _stopped has been set.
    if (_stopped) return;
    _retryCount = 0;
    _log.Debug("gRPC stream for {0} stable, retry count reset", _instanceUniqueName);
});
|
||||
|
||||
// gRPC stream error — attempt reconnection
|
||||
@@ -151,6 +168,10 @@ public class DebugStreamBridgeActor : ReceiveActor, IWithTimers
|
||||
_grpcCts?.Dispose();
|
||||
_grpcCts = new CancellationTokenSource();
|
||||
|
||||
// Arm the stability timer: if the stream stays up for StabilityWindow the
|
||||
// retry budget is recovered (Communication-008). Cancelled by HandleGrpcError.
|
||||
Timers.StartSingleTimer(StabilityTimerKey, new GrpcStreamStable(), StabilityWindow);
|
||||
|
||||
var client = _grpcFactory.GetOrCreate(_siteIdentifier, endpoint);
|
||||
var self = Self;
|
||||
var ct = _grpcCts.Token;
|
||||
@@ -171,6 +192,10 @@ public class DebugStreamBridgeActor : ReceiveActor, IWithTimers
|
||||
{
|
||||
if (_stopped) return;
|
||||
|
||||
// The stream failed before reaching the stability window — its retry
|
||||
// budget is NOT recovered (Communication-008).
|
||||
Timers.Cancel(StabilityTimerKey);
|
||||
|
||||
_retryCount++;
|
||||
|
||||
if (_retryCount > MaxRetries)
|
||||
@@ -239,3 +264,10 @@ internal record GrpcStreamError(Exception Exception);
|
||||
/// Internal message to trigger gRPC stream reconnection.
|
||||
/// </summary>
|
||||
internal record ReconnectGrpcStream;
|
||||
|
||||
/// <summary>
|
||||
/// Internal message indicating the current gRPC stream has been connected long
|
||||
/// enough (<see cref="DebugStreamBridgeActor.StabilityWindow"/>) to be considered
|
||||
/// stable, so the reconnect retry budget can be recovered.
|
||||
/// </summary>
|
||||
internal record GrpcStreamStable;
|
||||
|
||||
@@ -177,6 +177,24 @@ public class SiteCommunicationActor : ReceiveActor, IWithTimers
|
||||
|
||||
}
|
||||
|
||||
/// <summary>
/// Supervision policy for this coordinator (CLAUDE.md: "Resume for coordinator actors").
/// SiteCommunicationActor is long-lived and routes all message patterns to local
/// handlers, so when a child hits a transient fault it is told to Resume: the
/// child's in-memory state is kept instead of being discarded by a Restart.
/// Every fault is logged at Warning before the directive is returned.
/// </summary>
/// <returns>
/// A one-for-one strategy with no retry limit and no time window whose decider
/// always answers <see cref="Directive.Resume"/>.
/// </returns>
protected override SupervisorStrategy SupervisorStrategy()
{
    // The decider treats every exception the same way: log it, then resume the child.
    var resumeOnAnyFault = Decider.From(fault =>
    {
        _log.Warning(fault, "Child actor of SiteCommunicationActor faulted, resuming (state preserved)");
        return Directive.Resume;
    });

    // -1 retries + infinite window = the strategy never gives up on a child.
    return new OneForOneStrategy(
        maxNrOfRetries: -1,
        withinTimeRange: Timeout.InfiniteTimeSpan,
        decider: resumeOnAnyFault);
}
|
||||
|
||||
protected override void PreStart()
|
||||
{
|
||||
_log.Info("SiteCommunicationActor started for site {0}", _siteId);
|
||||
|
||||
Reference in New Issue
Block a user