feat(site): per-node startup reconciliation actor (self-heal missing/stale configs)

2026-06-26 16:35:57 -04:00
parent 96192950a0
commit eb59c4244f
4 changed files with 609 additions and 0 deletions
@@ -359,6 +359,34 @@ public class SiteCommunicationActor : ReceiveActor, IWithTimers
                new ClusterClient.Send("/user/central-communication", msg), Sender);
        });

+        // Site startup reconciliation (Task 18). A node's SiteReconciliationActor Asks this
+        // actor with its local-inventory ReconcileSiteRequest; the request is relayed to the
+        // central cluster through the registered ClusterClient. The asker is kept as the
+        // ClusterClient.Send sender so the ReconcileSiteResponse routes straight back to the
+        // waiting Ask instead of to this actor. Same forwarding shape as
+        // IngestAuditEventsCommand.
+        Receive<ReconcileSiteRequest>(request =>
+        {
+            if (_centralClient != null)
+            {
+                _log.Debug(
+                    "Forwarding ReconcileSiteRequest for site {0} node {1} ({2} local instance(s)) to central",
+                    request.SiteIdentifier, request.NodeId, request.LocalNameToRevisionHash.Count);
+
+                var envelope = new ClusterClient.Send("/user/central-communication", request);
+                _centralClient.Tell(envelope, Sender);
+                return;
+            }
+
+            // No ClusterClient registered yet (e.g. central contact points not configured,
+            // or registration not yet completed). Replying with Status.Failure faults the
+            // Ask, so the SiteReconciliationActor treats the pass as best-effort-failed: it
+            // logs a warning and reconcile is retried on the next node startup.
+            _log.Warning(
+                "Cannot forward ReconcileSiteRequest for site {0} node {1} — no central ClusterClient registered",
+                request.SiteIdentifier, request.NodeId);
+
+            var notRegistered = new InvalidOperationException("Central ClusterClient not registered");
+            Sender.Tell(new Status.Failure(notRegistered));
+        });
+
        // Internal: send heartbeat tick
        Receive<SendHeartbeat>(_ => SendHeartbeatToCentral());

@@ -1016,6 +1016,39 @@ akka {{
                contacts.Count, _nodeOptions.SiteId);
        }

+        // Task 18c — per-node startup reconciliation. The actor is created on EVERY site
+        // node (NOT as a singleton) so that a standby which was DOWN during a deploy
+        // self-heals on its next restart: it reports its local deployed inventory to
+        // central through the SiteCommunicationActor Ask, fetches the gap (missing/stale)
+        // over HTTP, and guarded-writes it (orphans are logged, never deleted). The HTTP
+        // config fetcher is a hard requirement; without it the self-heal is skipped
+        // (best-effort — replication remains the primary path and the next restart retries).
+        if (deploymentConfigFetcher == null)
+        {
+            _logger.LogWarning(
+                "No IDeploymentConfigFetcher available; SiteReconciliationActor not created — "
+                + "startup self-heal disabled (replication remains the primary path)");
+        }
+        else
+        {
+            var siteReconciliationLogger = _serviceProvider
+                .GetRequiredService<ILoggerFactory>()
+                .CreateLogger<SiteReconciliationActor>();
+
+            // The two trailing nulls select the actor's default initial delay and Ask
+            // timeout; they are spelled out because the constructor call sits inside a
+            // Props.Create expression.
+            var siteReconciliationProps = Props.Create(() => new SiteReconciliationActor(
+                storage,
+                deploymentConfigFetcher,
+                siteCommActor,
+                _nodeOptions.SiteId!,
+                _nodeOptions.NodeName,
+                siteReconciliationLogger,
+                null,
+                null));
+
+            _actorSystem.ActorOf(siteReconciliationProps, "site-reconciliation");
+
+            _logger.LogInformation(
+                "SiteReconciliationActor created (per-node startup self-heal) for site {SiteId} node {Node}",
+                _nodeOptions.SiteId, _nodeOptions.NodeName);
+        }
+
        // Audit Log (#23) — site-side telemetry actor that drains the SQLite
        // Pending queue and pushes to central via IngestAuditEvents. Not a
        // cluster singleton: each site is its own cluster, and the actor reads
@@ -0,0 +1,239 @@
+using Akka.Actor;
+using Microsoft.Extensions.Logging;
+using ZB.MOM.WW.ScadaBridge.Commons.Messages.Deployment;
+using ZB.MOM.WW.ScadaBridge.SiteRuntime.Deployment;
+using ZB.MOM.WW.ScadaBridge.SiteRuntime.Persistence;
+
+namespace ZB.MOM.WW.ScadaBridge.SiteRuntime.Actors;
+
+/// <summary>
+/// Runs on EVERY site node (NOT a singleton) so a standby that was DOWN during a deploy
+/// self-heals on its next restart. On startup the actor performs one best-effort
+/// reconciliation pass:
+/// <list type="number">
+///   <item>read the node's local deployed inventory from SQLite,</item>
+///   <item>report it to central via the <c>SiteCommunicationActor</c> Ask
+///         (<see cref="ReconcileSiteRequest"/> → <see cref="ReconcileSiteResponse"/>),</item>
+///   <item>fetch each gap item's config over HTTP and guarded-write it, and</item>
+///   <item>LOG (never delete) any orphan the node still has but central no longer deploys.</item>
+/// </list>
+/// </summary>
+/// <remarks>
+/// <para>
+/// <b>Best-effort throughout.</b> A central-unreachable / timed-out Ask is caught, logged
+/// at Warning, and the pass simply ends — reconcile re-runs on the next node startup; it is a
+/// self-heal, not a critical path. A per-item fetch/write failure is caught and logged, then
+/// the remaining gap items continue (one bad item must not abort the rest). The actor never
+/// crashes on these failures.
+/// </para>
+/// <para>
+/// The pass runs after a small startup delay (so the central ClusterClient has time to
+/// register) and is driven entirely off the actor thread: the Ask + fetch + write happen in
+/// an awaited continuation whose summary is captured in an internal message
+/// <see cref="ReconcilePassResult"/> piped back to <c>Self</c>. The actor thread never blocks.
+/// </para>
+/// <para>
+/// The pass is tied to the actor's lifetime: <see cref="PostStop"/> cancels a token that is
+/// handed to the HTTP fetch and checked before each gap item, so a stopping actor does not
+/// keep fetching and writing in the background.
+/// </para>
+/// <para>
+/// The site does NOT carry the central fetch base URL in its own config — it uses
+/// <see cref="ReconcileSiteResponse.CentralFetchBaseUrl"/> from central's reply.
+/// </para>
+/// </remarks>
+public sealed class SiteReconciliationActor : ReceiveActor, IWithTimers
+{
+    private const string StartupTimerKey = "reconcile-startup";
+
+    private readonly SiteStorageService _storage;
+    private readonly IDeploymentConfigFetcher _configFetcher;
+    private readonly IActorRef _siteCommunicationActor;
+    private readonly string _siteIdentifier;
+    private readonly string _nodeId;
+    private readonly ILogger<SiteReconciliationActor> _logger;
+    private readonly TimeSpan _initialDelay;
+    private readonly TimeSpan _askTimeout;
+
+    // Cancelled in PostStop so the off-thread reconcile pass stops fetching/writing once the
+    // actor is gone. One source per actor instance; a restarted actor gets a fresh one.
+    private readonly CancellationTokenSource _stopCts = new();
+
+    /// <summary>Akka timer scheduler injected by the framework via <see cref="IWithTimers"/>.</summary>
+    public ITimerScheduler Timers { get; set; } = null!;
+
+    /// <summary>
+    /// Initializes the per-node startup-reconciliation actor.
+    /// </summary>
+    /// <param name="storage">Site-local SQLite store — read for the inventory, written for the gap.</param>
+    /// <param name="configFetcher">Fetches a deployment's flattened config JSON from central over HTTP.</param>
+    /// <param name="siteCommunicationActor">
+    /// The site's <c>SiteCommunicationActor</c>; it forwards the
+    /// <see cref="ReconcileSiteRequest"/> over the registered central ClusterClient and routes
+    /// the <see cref="ReconcileSiteResponse"/> back to this actor's Ask.
+    /// </param>
+    /// <param name="siteIdentifier">This node's site identifier (resolved by central).</param>
+    /// <param name="nodeId">This node's semantic id (e.g. <c>node-a</c>/<c>node-b</c>), for logging/diagnostics.</param>
+    /// <param name="logger">Logger.</param>
+    /// <param name="initialDelay">
+    /// Delay before the single startup pass, giving the central ClusterClient time to register.
+    /// Defaults to 5 seconds.
+    /// </param>
+    /// <param name="askTimeout">Round-trip timeout for the reconcile Ask to central. Defaults to 30 seconds.</param>
+    public SiteReconciliationActor(
+        SiteStorageService storage,
+        IDeploymentConfigFetcher configFetcher,
+        IActorRef siteCommunicationActor,
+        string siteIdentifier,
+        string nodeId,
+        ILogger<SiteReconciliationActor> logger,
+        TimeSpan? initialDelay = null,
+        TimeSpan? askTimeout = null)
+    {
+        _storage = storage;
+        _configFetcher = configFetcher;
+        _siteCommunicationActor = siteCommunicationActor;
+        _siteIdentifier = siteIdentifier;
+        _nodeId = nodeId;
+        _logger = logger;
+        _initialDelay = initialDelay ?? TimeSpan.FromSeconds(5);
+        _askTimeout = askTimeout ?? TimeSpan.FromSeconds(30);
+
+        // The token is read here, on the actor thread while the actor is alive, and handed to
+        // the pass by value; the pass never touches _stopCts itself, so disposing the source
+        // in PostStop cannot race with a property access from the background continuation.
+        Receive<RunReconcile>(_ => RunReconcilePassAsync(_stopCts.Token).PipeTo(Self));
+        Receive<ReconcilePassResult>(HandleReconcilePassResult);
+
+        // Defensive: RunReconcilePassAsync is designed never to throw (it returns a faulted
+        // ReconcilePassResult instead), but if anything unexpected faults the piped Task the
+        // Status.Failure would otherwise go to dead letters silently. Log it instead.
+        Receive<Status.Failure>(f => _logger.LogWarning(f.Cause,
+            "Reconcile pass faulted unexpectedly for site {Site} node {Node}",
+            _siteIdentifier, _nodeId));
+    }
+
+    /// <inheritdoc />
+    protected override void PreStart()
+    {
+        base.PreStart();
+        // One-shot pass after a small delay so the central ClusterClient can register first.
+        // Non-blocking: the timer fires RunReconcile back onto this actor's mailbox.
+        Timers.StartSingleTimer(StartupTimerKey, RunReconcile.Instance, _initialDelay);
+        _logger.LogInformation(
+            "SiteReconciliationActor started for site {Site} node {Node}; startup reconcile scheduled in {Delay}",
+            _siteIdentifier, _nodeId, _initialDelay);
+    }
+
+    /// <inheritdoc />
+    protected override void PostStop()
+    {
+        // Signal any in-flight reconcile pass to stop: aborts the current HTTP fetch and
+        // prevents further gap items from being fetched/written after the actor is gone.
+        _stopCts.Cancel();
+        _stopCts.Dispose();
+        base.PostStop();
+    }
+
+    /// <summary>
+    /// Runs the full reconcile pass off the actor thread. Never throws for expected failures:
+    /// a central-unreachable Ask (or a local inventory read error) is captured as a faulted
+    /// <see cref="ReconcilePassResult"/>; per-item fetch/write failures are caught per item so
+    /// the rest of the gap still applies.
+    /// </summary>
+    /// <param name="stopToken">
+    /// Cancelled when the actor stops. Passed to the HTTP fetch and checked before each gap
+    /// item; once cancelled, the remaining gap items are abandoned (the next startup
+    /// reconcile picks them up).
+    /// </param>
+    /// <returns>The pass summary (counts, or the error that ended the pass early).</returns>
+    private async Task<ReconcilePassResult> RunReconcilePassAsync(CancellationToken stopToken)
+    {
+        // Build the name → revision-hash inventory from the local store.
+        Dictionary<string, string> localMap;
+        try
+        {
+            var configs = await _storage.GetAllDeployedConfigsAsync().ConfigureAwait(false);
+            localMap = new Dictionary<string, string>(configs.Count, StringComparer.Ordinal);
+            foreach (var c in configs)
+                localMap[c.InstanceUniqueName] = c.RevisionHash;
+        }
+        catch (Exception ex)
+        {
+            return ReconcilePassResult.Faulted(ex);
+        }
+
+        // Report inventory to central and get fresh fetch tokens for the gap. Best-effort:
+        // a central-unreachable / timed-out Ask faults here and is reported as a faulted pass
+        // (logged Warning; reconcile re-runs next startup).
+        ReconcileSiteResponse response;
+        try
+        {
+            response = await _siteCommunicationActor
+                .Ask<ReconcileSiteResponse>(
+                    new ReconcileSiteRequest(_siteIdentifier, _nodeId, localMap),
+                    _askTimeout)
+                .ConfigureAwait(false);
+        }
+        catch (Exception ex)
+        {
+            return ReconcilePassResult.Faulted(ex);
+        }
+
+        var fetched = 0;
+        var failed = 0;
+        var interrupted = false;
+
+        // Fetch + guarded-write each gap item. Per-item failure must not abort the rest.
+        foreach (var item in response.Gap)
+        {
+            // Actor stopped between items: do not start new fetches/writes.
+            if (stopToken.IsCancellationRequested)
+            {
+                interrupted = true;
+                break;
+            }
+
+            try
+            {
+                var configJson = await _configFetcher
+                    .FetchAsync(response.CentralFetchBaseUrl, item.DeploymentId, item.FetchToken, stopToken)
+                    .ConfigureAwait(false);
+
+                await _storage.StoreDeployedConfigIfNewerAsync(
+                    item.InstanceUniqueName, configJson, item.DeploymentId, item.RevisionHash, item.IsEnabled)
+                    .ConfigureAwait(false);
+
+                fetched++;
+                _logger.LogInformation(
+                    "Reconcile: fetched + stored config for {Instance} (deployment {DeploymentId}, rev {Revision})",
+                    item.InstanceUniqueName, item.DeploymentId, item.RevisionHash);
+            }
+            catch (DeploymentConfigFetchException ex) when (ex.IsSuperseded)
+            {
+                // 404 = superseded/expired between staging and fetch; a newer deploy will
+                // replicate it. Not a failure — skip quietly.
+                _logger.LogInformation(
+                    "Reconcile: skip {Instance} (deployment {DeploymentId}) — superseded/expired",
+                    item.InstanceUniqueName, item.DeploymentId);
+            }
+            catch (OperationCanceledException) when (stopToken.IsCancellationRequested)
+            {
+                // The in-flight fetch was aborted because the actor stopped. This is a
+                // shutdown, not an item failure — do not count it and do not continue.
+                interrupted = true;
+                break;
+            }
+            catch (Exception ex)
+            {
+                failed++;
+                _logger.LogError(ex,
+                    "Reconcile: failed to fetch/store config for {Instance} (deployment {DeploymentId}) — continuing with remaining items",
+                    item.InstanceUniqueName, item.DeploymentId);
+            }
+        }
+
+        if (interrupted)
+        {
+            _logger.LogInformation(
+                "Reconcile pass for site {Site} node {Node} interrupted — actor is stopping; remaining gap items are left for the next startup reconcile",
+                _siteIdentifier, _nodeId);
+        }
+
+        // Orphans: present locally but no longer deployed at central. LOG only — never delete
+        // (a stale local row is harmless; deleting risks dropping a config a later deploy needs).
+        foreach (var name in response.OrphanNames)
+        {
+            _logger.LogWarning(
+                "Reconcile: local instance {Instance} is no longer deployed at central — leaving in place; manual cleanup may be needed",
+                name);
+        }
+
+        return ReconcilePassResult.Completed(fetched, failed, response.OrphanNames.Count);
+    }
+
+    /// <summary>
+    /// Logs the outcome of a reconcile pass on the actor thread: Warning when the pass ended
+    /// with an error, otherwise an Information summary of the fetched/failed/orphan counts.
+    /// </summary>
+    /// <param name="result">The summary piped back from the pass.</param>
+    private void HandleReconcilePassResult(ReconcilePassResult result)
+    {
+        if (result.Error != null)
+        {
+            // Best-effort: a failed pass (central unreachable, Ask timeout, local read error) is
+            // logged at Warning and the actor stays alive. Reconcile re-runs on the next startup.
+            _logger.LogWarning(result.Error,
+                "Reconcile pass for site {Site} node {Node} did not complete (central unreachable or read error) — will retry on next startup",
+                _siteIdentifier, _nodeId);
+            return;
+        }
+
+        _logger.LogInformation(
+            "Reconcile pass for site {Site} node {Node} complete: {Fetched} fetched, {Failed} failed, {Orphans} orphan(s)",
+            _siteIdentifier, _nodeId, result.Fetched, result.Failed, result.Orphans);
+    }
+
+    // ── Internal messages ──
+
+    /// <summary>Self-tick that drives the one-shot startup reconcile pass.</summary>
+    private sealed class RunReconcile
+    {
+        public static readonly RunReconcile Instance = new();
+        private RunReconcile() { }
+    }
+
+    /// <summary>Summary of one reconcile pass, piped to <c>Self</c> for logging.</summary>
+    private sealed record ReconcilePassResult(int Fetched, int Failed, int Orphans, Exception? Error)
+    {
+        public static ReconcilePassResult Completed(int fetched, int failed, int orphans)
+            => new(fetched, failed, orphans, null);
+
+        public static ReconcilePassResult Faulted(Exception error)
+            => new(0, 0, 0, error);
+    }
+}