feat(site): per-node startup reconciliation actor (self-heal missing/stale configs)

This commit is contained in:
Joseph Doherty
2026-06-26 16:35:57 -04:00
parent 96192950a0
commit eb59c4244f
4 changed files with 609 additions and 0 deletions
@@ -1016,6 +1016,39 @@ akka {{
contacts.Count, _nodeOptions.SiteId);
}
// Task 18c — startup reconciliation, run per node. Every site node gets its own
// instance (this is NOT a singleton), so a standby that was DOWN while a deploy
// happened self-heals the next time it restarts: it reports its local deployed
// inventory to central through the SiteCommunicationActor Ask, fetches the gap
// (missing/stale) over HTTP, and guarded-writes it (orphans are logged, never
// deleted). The HTTP config fetcher is a prerequisite; should it somehow be
// absent, the self-heal is skipped (best-effort — replication remains the
// primary path and the next restart retries).
if (deploymentConfigFetcher == null)
{
    // No fetcher: warn and carry on without creating the reconciliation actor.
    _logger.LogWarning(
        "No IDeploymentConfigFetcher available; SiteReconciliationActor not created — "
        + "startup self-heal disabled (replication remains the primary path)");
}
else
{
    // Resolve a logger typed to the actor before building its Props.
    var reconciliationLog = _serviceProvider
        .GetRequiredService<ILoggerFactory>()
        .CreateLogger<SiteReconciliationActor>();

    // The two trailing constructor arguments are passed as null, as before.
    var reconciliationProps = Props.Create(() => new SiteReconciliationActor(
        storage,
        deploymentConfigFetcher,
        siteCommActor,
        _nodeOptions.SiteId!,
        _nodeOptions.NodeName,
        reconciliationLog,
        null,
        null));

    _actorSystem.ActorOf(reconciliationProps, "site-reconciliation");

    _logger.LogInformation(
        "SiteReconciliationActor created (per-node startup self-heal) for site {SiteId} node {Node}",
        _nodeOptions.SiteId, _nodeOptions.NodeName);
}
// Audit Log (#23) — site-side telemetry actor that drains the SQLite
// Pending queue and pushes to central via IngestAuditEvents. Not a
// cluster singleton: each site is its own cluster, and the actor reads