fix(communication): resolve Communication-004..008 — Resume supervision, gRPC option wiring, address-load logging, sync dispose, flap detection

This commit is contained in:
Joseph Doherty
2026-05-16 20:58:03 -04:00
parent 3e7a3d7e31
commit 31a6995d24
12 changed files with 656 additions and 51 deletions
@@ -84,6 +84,15 @@ public class CentralCommunicationActor : ReceiveActor
// Periodic refresh trigger
Receive<RefreshSiteAddresses>(_ => LoadSiteAddressesFromDb());
// Communication-006: a faulted LoadSiteAddressesFromDb task is piped here as a
// Status.Failure. Without this handler the failure was an unhandled message
// (debug-level only) and the refresh failed silently — operators could not
// distinguish "no sites configured" from "database is down". Log at Warning.
Receive<Status.Failure>(failure =>
_log.Warning(failure.Cause,
"Failed to load site addresses from the database; the site ClusterClient "
+ "cache was not refreshed and may be stale or empty"));
// Health monitoring: heartbeats and health reports from sites
Receive<HeartbeatMessage>(HandleHeartbeat);
Receive<SiteHealthReport>(HandleSiteHealthReport);
@@ -296,6 +305,25 @@ public class CentralCommunicationActor : ReceiveActor
}
}
/// <summary>
/// Coordinator supervision strategy (CLAUDE.md: "Resume for coordinator actors").
/// CentralCommunicationActor is a long-lived coordinator that owns the per-site
/// ClusterClient map; a transient fault in a child (e.g. a ClusterClient child)
/// must Resume so the child's connection state is preserved rather than wiped by
/// a Restart.
/// </summary>
protected override SupervisorStrategy SupervisorStrategy()
{
return new OneForOneStrategy(
maxNrOfRetries: -1,
withinTimeRange: Timeout.InfiniteTimeSpan,
decider: Decider.From(ex =>
{
_log.Warning(ex, "Child actor of CentralCommunicationActor faulted, resuming (state preserved)");
return Directive.Resume;
}));
}
protected override void PreStart()
{
_log.Info("CentralCommunicationActor started");