feat(site): per-node startup reconciliation actor (self-heal missing/stale configs)
This commit is contained in:
@@ -359,6 +359,34 @@ public class SiteCommunicationActor : ReceiveActor, IWithTimers
|
|||||||
new ClusterClient.Send("/user/central-communication", msg), Sender);
|
new ClusterClient.Send("/user/central-communication", msg), Sender);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Site startup reconciliation (Task 18). A node's ReconcileSiteRequest (its local
// inventory) is relayed to the central cluster through the registered ClusterClient.
// The current Sender is handed to the ClusterClient as the sender of the relayed
// message, so central's reply is addressed to that original requester rather than to
// this actor. Same forwarding shape as IngestAuditEventsCommand.
Receive<ReconcileSiteRequest>(request =>
{
    if (_centralClient is not null)
    {
        _log.Debug(
            "Forwarding ReconcileSiteRequest for site {0} node {1} ({2} local instance(s)) to central",
            request.SiteIdentifier, request.NodeId, request.LocalNameToRevisionHash.Count);

        var envelope = new ClusterClient.Send("/user/central-communication", request);
        _centralClient.Tell(envelope, Sender);
        return;
    }

    // No ClusterClient to forward through (contact points not configured, or
    // registration still pending). Answer with Status.Failure so the requester's Ask
    // faults right away instead of waiting for its timeout.
    _log.Warning(
        "Cannot forward ReconcileSiteRequest for site {0} node {1} — no central ClusterClient registered",
        request.SiteIdentifier, request.NodeId);

    var notRegistered = new InvalidOperationException("Central ClusterClient not registered");
    Sender.Tell(new Status.Failure(notRegistered));
});
|
||||||
|
|
||||||
// Internal: send heartbeat tick
|
// Internal: send heartbeat tick
|
||||||
Receive<SendHeartbeat>(_ => SendHeartbeatToCentral());
|
Receive<SendHeartbeat>(_ => SendHeartbeatToCentral());
|
||||||
|
|
||||||
|
|||||||
@@ -1016,6 +1016,39 @@ akka {{
|
|||||||
contacts.Count, _nodeOptions.SiteId);
|
contacts.Count, _nodeOptions.SiteId);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Task 18c — per-node startup reconciliation. The actor is created on EVERY site node
// (it is NOT a cluster singleton), so a standby that was DOWN during a deploy repairs
// its own store on its next restart. It needs the HTTP config fetcher; without one the
// self-heal is skipped with a warning (best-effort — replication remains the primary
// path and the next restart tries again).
if (deploymentConfigFetcher is null)
{
    _logger.LogWarning(
        "No IDeploymentConfigFetcher available; SiteReconciliationActor not created — "
        + "startup self-heal disabled (replication remains the primary path)");
}
else
{
    var loggerFactory = _serviceProvider.GetRequiredService<ILoggerFactory>();
    var reconciliationLogger = loggerFactory.CreateLogger<SiteReconciliationActor>();

    // The two trailing nulls are spelled out on purpose: this constructor call lives in
    // an expression tree, which cannot omit optional arguments. Passing null selects the
    // actor's built-in defaults for the initial delay and the Ask timeout.
    var reconciliationProps = Props.Create(() => new SiteReconciliationActor(
        storage,
        deploymentConfigFetcher,
        siteCommActor,
        _nodeOptions.SiteId!,
        _nodeOptions.NodeName,
        reconciliationLogger,
        null,
        null));

    _actorSystem.ActorOf(reconciliationProps, "site-reconciliation");

    _logger.LogInformation(
        "SiteReconciliationActor created (per-node startup self-heal) for site {SiteId} node {Node}",
        _nodeOptions.SiteId, _nodeOptions.NodeName);
}
|
||||||
|
|
||||||
// Audit Log (#23) — site-side telemetry actor that drains the SQLite
|
// Audit Log (#23) — site-side telemetry actor that drains the SQLite
|
||||||
// Pending queue and pushes to central via IngestAuditEvents. Not a
|
// Pending queue and pushes to central via IngestAuditEvents. Not a
|
||||||
// cluster singleton: each site is its own cluster, and the actor reads
|
// cluster singleton: each site is its own cluster, and the actor reads
|
||||||
|
|||||||
@@ -0,0 +1,239 @@
|
|||||||
|
using Akka.Actor;
|
||||||
|
using Microsoft.Extensions.Logging;
|
||||||
|
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Deployment;
|
||||||
|
using ZB.MOM.WW.ScadaBridge.SiteRuntime.Deployment;
|
||||||
|
using ZB.MOM.WW.ScadaBridge.SiteRuntime.Persistence;
|
||||||
|
|
||||||
|
namespace ZB.MOM.WW.ScadaBridge.SiteRuntime.Actors;
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// Runs on EVERY site node (NOT a singleton) so a standby that was DOWN during a deploy
|
||||||
|
/// self-heals on its next restart. On startup the actor performs one best-effort
|
||||||
|
/// reconciliation pass:
|
||||||
|
/// <list type="number">
|
||||||
|
/// <item>read the node's local deployed inventory from SQLite,</item>
|
||||||
|
/// <item>report it to central via the <c>SiteCommunicationActor</c> Ask
|
||||||
|
/// (<see cref="ReconcileSiteRequest"/> → <see cref="ReconcileSiteResponse"/>),</item>
|
||||||
|
/// <item>fetch each gap item's config over HTTP and guarded-write it, and</item>
|
||||||
|
/// <item>LOG (never delete) any orphan the node still has but central no longer deploys.</item>
|
||||||
|
/// </list>
|
||||||
|
/// </summary>
|
||||||
|
/// <remarks>
|
||||||
|
/// <para>
|
||||||
|
/// <b>Best-effort throughout.</b> A central-unreachable / timed-out Ask is caught, logged
|
||||||
|
/// at Warning, and the pass simply ends — reconcile re-runs on the next node startup; it is a
|
||||||
|
/// self-heal, not a critical path. A per-item fetch/write failure is caught and logged, then
|
||||||
|
/// the remaining gap items continue (one bad item must not abort the rest). The actor never
|
||||||
|
/// crashes on these failures.
|
||||||
|
/// </para>
|
||||||
|
/// <para>
|
||||||
|
/// The pass runs after a small startup delay (so the central ClusterClient has time to
|
||||||
|
/// register) and is driven entirely off the actor thread: the Ask + fetch + write happen in
|
||||||
|
/// an awaited continuation whose summary is captured in an internal message
|
||||||
|
/// <see cref="ReconcilePassResult"/> piped back to <c>Self</c>. The actor thread never blocks.
|
||||||
|
/// </para>
|
||||||
|
/// <para>
|
||||||
|
/// The site does NOT carry the central fetch base URL in its own config — it uses
|
||||||
|
/// <see cref="ReconcileSiteResponse.CentralFetchBaseUrl"/> from central's reply.
|
||||||
|
/// </para>
|
||||||
|
/// </remarks>
|
||||||
|
public sealed class SiteReconciliationActor : ReceiveActor, IWithTimers
{
    private const string StartupTimerKey = "reconcile-startup";

    private readonly SiteStorageService _storage;
    private readonly IDeploymentConfigFetcher _configFetcher;
    private readonly IActorRef _siteCommunicationActor;
    private readonly string _siteIdentifier;
    private readonly string _nodeId;
    private readonly ILogger<SiteReconciliationActor> _logger;
    private readonly TimeSpan _initialDelay;
    private readonly TimeSpan _askTimeout;

    // Cancelled in PostStop. The reconcile pass runs off the actor thread, so without this
    // it would keep fetching and writing after the actor has stopped (or, after a
    // supervised restart, alongside the new incarnation's own pass).
    private readonly CancellationTokenSource _stopCts = new();

    /// <summary>Akka timer scheduler injected by the framework via <see cref="IWithTimers"/>.</summary>
    public ITimerScheduler Timers { get; set; } = null!;

    /// <summary>
    /// Initializes the per-node startup-reconciliation actor.
    /// </summary>
    /// <param name="storage">Site-local SQLite store — read for the inventory, written for the gap.</param>
    /// <param name="configFetcher">Fetches a deployment's flattened config JSON from central over HTTP.</param>
    /// <param name="siteCommunicationActor">
    /// The site's <c>SiteCommunicationActor</c>; it forwards the
    /// <see cref="ReconcileSiteRequest"/> over the registered central ClusterClient and routes
    /// the <see cref="ReconcileSiteResponse"/> back to this actor's Ask.
    /// </param>
    /// <param name="siteIdentifier">This node's site identifier (resolved by central).</param>
    /// <param name="nodeId">This node's semantic id (e.g. <c>node-a</c>/<c>node-b</c>), for logging/diagnostics.</param>
    /// <param name="logger">Logger.</param>
    /// <param name="initialDelay">
    /// Delay before the single startup pass, giving the central ClusterClient time to register.
    /// Defaults to 5 seconds.
    /// </param>
    /// <param name="askTimeout">Round-trip timeout for the reconcile Ask to central. Defaults to 30 seconds.</param>
    public SiteReconciliationActor(
        SiteStorageService storage,
        IDeploymentConfigFetcher configFetcher,
        IActorRef siteCommunicationActor,
        string siteIdentifier,
        string nodeId,
        ILogger<SiteReconciliationActor> logger,
        TimeSpan? initialDelay = null,
        TimeSpan? askTimeout = null)
    {
        _storage = storage;
        _configFetcher = configFetcher;
        _siteCommunicationActor = siteCommunicationActor;
        _siteIdentifier = siteIdentifier;
        _nodeId = nodeId;
        _logger = logger;
        _initialDelay = initialDelay ?? TimeSpan.FromSeconds(5);
        _askTimeout = askTimeout ?? TimeSpan.FromSeconds(30);

        // The token is read here, on the actor thread while the actor is alive, and handed
        // to the off-thread pass; the pass itself never touches _stopCts.
        Receive<RunReconcile>(_ => RunReconcilePassAsync(_stopCts.Token).PipeTo(Self));
        Receive<ReconcilePassResult>(HandleReconcilePassResult);

        // Defensive: RunReconcilePassAsync is designed never to throw (it returns a faulted
        // ReconcilePassResult instead), but if anything unexpected faults the piped Task the
        // Status.Failure would otherwise go unhandled. Log it instead.
        Receive<Status.Failure>(f => _logger.LogWarning(f.Cause,
            "Reconcile pass faulted unexpectedly for site {Site} node {Node}",
            _siteIdentifier, _nodeId));
    }

    /// <inheritdoc />
    protected override void PreStart()
    {
        base.PreStart();

        // One-shot pass after a small delay so the central ClusterClient can register first.
        // Non-blocking: the timer fires RunReconcile back onto this actor's mailbox.
        Timers.StartSingleTimer(StartupTimerKey, RunReconcile.Instance, _initialDelay);

        _logger.LogInformation(
            "SiteReconciliationActor started for site {Site} node {Node}; startup reconcile scheduled in {Delay}",
            _siteIdentifier, _nodeId, _initialDelay);
    }

    /// <inheritdoc />
    protected override void PostStop()
    {
        // Signal any in-flight pass to stop. Cancel before Dispose so every token already
        // handed out observes the cancellation.
        _stopCts.Cancel();
        _stopCts.Dispose();
        base.PostStop();
    }

    /// <summary>
    /// Runs the full reconcile pass off the actor thread. Designed not to throw: a local
    /// read error or a failed/timed-out Ask is captured as a faulted
    /// <see cref="ReconcilePassResult"/>; per-item fetch/write failures are caught per item so
    /// the rest of the gap still applies.
    /// </summary>
    /// <param name="stopToken">
    /// Cancelled when the actor stops. It is passed to each config fetch and checked between
    /// gap items; once cancelled, the pass is abandoned and reported as faulted.
    /// </param>
    /// <returns>A summary of the pass (counts on completion, the error on a faulted pass).</returns>
    private async Task<ReconcilePassResult> RunReconcilePassAsync(CancellationToken stopToken)
    {
        Dictionary<string, string> localMap;
        try
        {
            var configs = await _storage.GetAllDeployedConfigsAsync().ConfigureAwait(false);
            localMap = new Dictionary<string, string>(configs.Count, StringComparer.Ordinal);
            foreach (var c in configs)
            {
                localMap[c.InstanceUniqueName] = c.RevisionHash;
            }
        }
        catch (Exception ex)
        {
            return ReconcilePassResult.Faulted(ex);
        }

        // Report inventory to central and get fresh fetch tokens for the gap. Best-effort:
        // a central-unreachable / timed-out Ask faults here and is reported as a faulted pass
        // (logged Warning; reconcile re-runs next startup).
        ReconcileSiteResponse response;
        try
        {
            response = await _siteCommunicationActor
                .Ask<ReconcileSiteResponse>(
                    new ReconcileSiteRequest(_siteIdentifier, _nodeId, localMap),
                    _askTimeout)
                .ConfigureAwait(false);
        }
        catch (Exception ex)
        {
            return ReconcilePassResult.Faulted(ex);
        }

        var fetched = 0;
        var failed = 0;

        // Fetch + guarded-write each gap item. Per-item failure must not abort the rest.
        foreach (var item in response.Gap)
        {
            if (stopToken.IsCancellationRequested)
            {
                break;
            }

            try
            {
                var configJson = await _configFetcher
                    .FetchAsync(response.CentralFetchBaseUrl, item.DeploymentId, item.FetchToken, stopToken)
                    .ConfigureAwait(false);

                await _storage.StoreDeployedConfigIfNewerAsync(
                        item.InstanceUniqueName, configJson, item.DeploymentId, item.RevisionHash, item.IsEnabled)
                    .ConfigureAwait(false);

                fetched++;
                _logger.LogInformation(
                    "Reconcile: fetched + stored config for {Instance} (deployment {DeploymentId}, rev {Revision})",
                    item.InstanceUniqueName, item.DeploymentId, item.RevisionHash);
            }
            catch (OperationCanceledException) when (stopToken.IsCancellationRequested)
            {
                // The actor stopped mid-fetch. This is not an item failure: leave the loop
                // without counting it or logging an error.
                break;
            }
            catch (DeploymentConfigFetchException ex) when (ex.IsSuperseded)
            {
                // 404 = superseded/expired between staging and fetch; a newer deploy will
                // replicate it. Not a failure — skip quietly.
                _logger.LogInformation(
                    "Reconcile: skip {Instance} (deployment {DeploymentId}) — superseded/expired",
                    item.InstanceUniqueName, item.DeploymentId);
            }
            catch (Exception ex)
            {
                failed++;
                _logger.LogError(ex,
                    "Reconcile: failed to fetch/store config for {Instance} (deployment {DeploymentId}) — continuing with remaining items",
                    item.InstanceUniqueName, item.DeploymentId);
            }
        }

        if (stopToken.IsCancellationRequested)
        {
            _logger.LogInformation(
                "Reconcile pass for site {Site} node {Node} abandoned after {Fetched} fetched item(s) — actor stopped",
                _siteIdentifier, _nodeId, fetched);
            return ReconcilePassResult.Faulted(new OperationCanceledException(stopToken));
        }

        // Orphans: present locally but no longer deployed at central. LOG only — never delete
        // (a stale local row is harmless; deleting risks dropping a config a later deploy needs).
        foreach (var name in response.OrphanNames)
        {
            _logger.LogWarning(
                "Reconcile: local instance {Instance} is no longer deployed at central — leaving in place; manual cleanup may be needed",
                name);
        }

        return ReconcilePassResult.Completed(fetched, failed, response.OrphanNames.Count);
    }

    /// <summary>
    /// Logs the outcome of a reconcile pass: a Warning (with the captured exception) for a
    /// faulted pass, otherwise an Information summary of the counts.
    /// </summary>
    /// <param name="result">The pass summary piped back to this actor.</param>
    private void HandleReconcilePassResult(ReconcilePassResult result)
    {
        if (result.Error != null)
        {
            // Best-effort: a failed pass (central unreachable, Ask timeout, local read error) is
            // logged at Warning and the actor stays alive. Reconcile re-runs on the next startup.
            _logger.LogWarning(result.Error,
                "Reconcile pass for site {Site} node {Node} did not complete (central unreachable or read error) — will retry on next startup",
                _siteIdentifier, _nodeId);
            return;
        }

        _logger.LogInformation(
            "Reconcile pass for site {Site} node {Node} complete: {Fetched} fetched, {Failed} failed, {Orphans} orphan(s)",
            _siteIdentifier, _nodeId, result.Fetched, result.Failed, result.Orphans);
    }

    // ── Internal messages ──

    /// <summary>Self-tick that drives the one-shot startup reconcile pass.</summary>
    private sealed class RunReconcile
    {
        public static readonly RunReconcile Instance = new();

        private RunReconcile() { }
    }

    /// <summary>Summary of one reconcile pass, piped to <c>Self</c> for logging.</summary>
    private sealed record ReconcilePassResult(int Fetched, int Failed, int Orphans, Exception? Error)
    {
        public static ReconcilePassResult Completed(int fetched, int failed, int orphans)
            => new(fetched, failed, orphans, null);

        public static ReconcilePassResult Faulted(Exception error)
            => new(0, 0, 0, error);
    }
}
|
||||||
@@ -0,0 +1,309 @@
|
|||||||
|
using System.Collections.Concurrent;
|
||||||
|
using Akka.Actor;
|
||||||
|
using Akka.TestKit.Xunit2;
|
||||||
|
using Microsoft.Extensions.Logging;
|
||||||
|
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Deployment;
|
||||||
|
using ZB.MOM.WW.ScadaBridge.SiteRuntime.Actors;
|
||||||
|
using ZB.MOM.WW.ScadaBridge.SiteRuntime.Deployment;
|
||||||
|
using ZB.MOM.WW.ScadaBridge.SiteRuntime.Persistence;
|
||||||
|
|
||||||
|
namespace ZB.MOM.WW.ScadaBridge.SiteRuntime.Tests.Actors;
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// Tests for <see cref="SiteReconciliationActor"/>: the per-node startup self-heal that
|
||||||
|
/// reports the node's local deployed inventory to central (over the SiteCommunicationActor
|
||||||
|
/// Ask), fetches the gap (missing/stale configs) over HTTP, guarded-writes them, and only
|
||||||
|
/// LOGS orphans (never deletes). Best-effort throughout — a central-unreachable Ask or a
|
||||||
|
/// per-item fetch failure must never crash the actor.
|
||||||
|
/// </summary>
|
||||||
|
public class SiteReconciliationActorTests : TestKit, IDisposable
{
    private const string SiteIdentifier = "site-1";
    private const string NodeId = "node-a";

    private readonly SiteStorageService _storage;
    private readonly string _dbFile;

    /// <summary>Creates a fresh SQLite-backed store in a unique temp file for each test.</summary>
    public SiteReconciliationActorTests()
    {
        _dbFile = Path.Combine(Path.GetTempPath(), $"site-reconcile-test-{Guid.NewGuid():N}.db");
        _storage = new SiteStorageService(
            $"Data Source={_dbFile}", Microsoft.Extensions.Logging.Abstractions.NullLogger<SiteStorageService>.Instance);
        _storage.InitializeAsync().GetAwaiter().GetResult();
    }

    /// <summary>Shuts the test actor system down and removes the temp database file.</summary>
    void IDisposable.Dispose()
    {
        Shutdown();
        try { File.Delete(_dbFile); } catch { /* cleanup */ }
    }

    /// <summary>
    /// Creates the actor under test with a 50 ms startup delay so the pass begins quickly.
    /// Every constructor argument is passed explicitly because the call sits inside the
    /// <c>Props.Create</c> expression tree, which cannot omit optional arguments.
    /// </summary>
    private IActorRef CreateReconciliationActor(
        IActorRef siteCommunicationActor,
        IDeploymentConfigFetcher fetcher,
        ILogger<SiteReconciliationActor>? logger = null,
        TimeSpan? askTimeout = null) =>
        ActorOf(Props.Create(() => new SiteReconciliationActor(
            _storage,
            fetcher,
            siteCommunicationActor,
            SiteIdentifier,
            NodeId,
            logger ?? Microsoft.Extensions.Logging.Abstractions.NullLogger<SiteReconciliationActor>.Instance,
            TimeSpan.FromMilliseconds(50),              // initial delay — fast for tests
            askTimeout ?? TimeSpan.FromSeconds(5))));   // ask timeout

    [Fact]
    public async Task MissingInstance_IsFetchedAndStored()
    {
        // Local inventory has A@rev1; central reports B is missing → fetch + store B.
        await _storage.StoreDeployedConfigAsync("A", "{\"instanceUniqueName\":\"A\"}", "depA", "rev1", true);

        const string configB = "{\"instanceUniqueName\":\"B\"}";
        var fetcher = new FakeConfigFetcher(_ => Task.FromResult(configB));
        var commProbe = CreateTestProbe();

        CreateReconciliationActor(commProbe, fetcher);

        var req = commProbe.ExpectMsg<ReconcileSiteRequest>(TimeSpan.FromSeconds(5));
        Assert.Equal(SiteIdentifier, req.SiteIdentifier);
        Assert.Equal(NodeId, req.NodeId);
        Assert.True(req.LocalNameToRevisionHash.TryGetValue("A", out var hashA));
        Assert.Equal("rev1", hashA);

        commProbe.Reply(new ReconcileSiteResponse(
            [new ReconcileGapItem("B", "depB", "rev2", true, "tok-b")],
            [],
            "http://central:9000"));

        await AwaitAssertAsync(async () =>
        {
            var configs = await _storage.GetAllDeployedConfigsAsync();
            var row = Assert.Single(configs, c => c.InstanceUniqueName == "B");
            Assert.Equal(configB, row.ConfigJson);
            Assert.Equal("depB", row.DeploymentId);
            Assert.Equal("rev2", row.RevisionHash);
            Assert.True(row.IsEnabled);
        }, TimeSpan.FromSeconds(5));

        var call = Assert.Single(fetcher.Calls);
        Assert.Equal("http://central:9000", call.BaseUrl);
        Assert.Equal("depB", call.DeploymentId);
        Assert.Equal("tok-b", call.Token);
    }

    [Fact]
    public async Task StaleInstance_IsRefreshed()
    {
        // Local A@rev1; central reports A is stale (now depA2/rev2) → fetch + guarded-write A.
        await _storage.StoreDeployedConfigAsync("A", "{\"old\":true}", "depA", "rev1", true);

        const string configA2 = "{\"instanceUniqueName\":\"A\",\"v\":2}";
        var fetcher = new FakeConfigFetcher(_ => Task.FromResult(configA2));
        var commProbe = CreateTestProbe();

        CreateReconciliationActor(commProbe, fetcher);

        commProbe.ExpectMsg<ReconcileSiteRequest>(TimeSpan.FromSeconds(5));
        commProbe.Reply(new ReconcileSiteResponse(
            [new ReconcileGapItem("A", "depA2", "rev2", false, "tok-a")],
            [],
            "http://central:9000"));

        await AwaitAssertAsync(async () =>
        {
            var configs = await _storage.GetAllDeployedConfigsAsync();
            var row = Assert.Single(configs, c => c.InstanceUniqueName == "A");
            Assert.Equal(configA2, row.ConfigJson);
            Assert.Equal("depA2", row.DeploymentId);
            Assert.Equal("rev2", row.RevisionHash);
            Assert.False(row.IsEnabled);
        }, TimeSpan.FromSeconds(5));

        var call = Assert.Single(fetcher.Calls);
        Assert.Equal("depA2", call.DeploymentId);
        Assert.Equal("tok-a", call.Token);
    }

    [Fact]
    public async Task Orphan_IsLoggedNotDeleted()
    {
        // Local Z exists; central reports Z as an orphan → log a warning, leave Z in place.
        await _storage.StoreDeployedConfigAsync("Z", "{\"instanceUniqueName\":\"Z\"}", "depZ", "revZ", true);

        var fetcher = new FakeConfigFetcher(_ => Task.FromResult("never"));
        var logger = new CapturingLogger<SiteReconciliationActor>();
        var commProbe = CreateTestProbe();

        CreateReconciliationActor(commProbe, fetcher, logger);

        commProbe.ExpectMsg<ReconcileSiteRequest>(TimeSpan.FromSeconds(5));
        commProbe.Reply(new ReconcileSiteResponse([], ["Z"], "http://central:9000"));

        // A warning mentioning the orphan name must be logged.
        await AwaitAssertAsync(() =>
        {
            Assert.Contains(logger.Entries, e =>
                e.Level == LogLevel.Warning && e.Message.Contains("Z"));
            return Task.CompletedTask;
        }, TimeSpan.FromSeconds(5));

        // ...and Z is still present (never deleted) and no fetch happened.
        var configs = await _storage.GetAllDeployedConfigsAsync();
        Assert.Contains(configs, c => c.InstanceUniqueName == "Z");
        Assert.Empty(fetcher.Calls);
    }

    [Fact]
    public void NoGap_DoesNotFetch()
    {
        // Empty gap and no orphans → the fetcher is never called.
        // Synchronous on purpose: nothing in this test is awaited.
        var fetcher = new FakeConfigFetcher(_ => Task.FromResult("never"));
        var commProbe = CreateTestProbe();

        var actor = CreateReconciliationActor(commProbe, fetcher);

        commProbe.ExpectMsg<ReconcileSiteRequest>(TimeSpan.FromSeconds(5));
        commProbe.Reply(new ReconcileSiteResponse([], [], "http://central:9000"));

        // Give any (erroneous) continuation time to run, then prove no fetch happened.
        Watch(actor);
        ExpectNoMsg(TimeSpan.FromMilliseconds(500));
        Assert.Empty(fetcher.Calls);
    }

    [Fact]
    public async Task PerItemFetchFailure_DoesNotAbortTheRest()
    {
        // Gap = [Bad, Good]; the Bad fetch throws but Good must still be fetched + stored.
        const string configGood = "{\"instanceUniqueName\":\"Good\"}";
        var fetcher = new FakeConfigFetcher(depId => depId == "depBad"
            ? Task.FromException<string>(new DeploymentConfigFetchException("boom", isSuperseded: false))
            : Task.FromResult(configGood));
        var commProbe = CreateTestProbe();

        var actor = CreateReconciliationActor(commProbe, fetcher);
        Watch(actor);

        commProbe.ExpectMsg<ReconcileSiteRequest>(TimeSpan.FromSeconds(5));
        commProbe.Reply(new ReconcileSiteResponse(
            [
                new ReconcileGapItem("Bad", "depBad", "revBad", true, "tok-bad"),
                new ReconcileGapItem("Good", "depGood", "revGood", true, "tok-good")
            ],
            [],
            "http://central:9000"));

        await AwaitAssertAsync(async () =>
        {
            var configs = await _storage.GetAllDeployedConfigsAsync();
            var row = Assert.Single(configs, c => c.InstanceUniqueName == "Good");
            Assert.Equal(configGood, row.ConfigJson);
        }, TimeSpan.FromSeconds(5));

        // Bad was attempted but never stored; the actor survived the per-item failure.
        var all = await _storage.GetAllDeployedConfigsAsync();
        Assert.DoesNotContain(all, c => c.InstanceUniqueName == "Bad");
        ExpectNoMsg(TimeSpan.FromMilliseconds(300));
    }

    [Fact]
    public async Task SupersededItem_IsSkippedQuietly_OthersStillApply()
    {
        // Gap = [Gone, Good]; the Gone fetch throws a 404 (superseded/expired). That branch is
        // a quiet skip (logged Info, NOT counted as a failure) — the Good item must still apply
        // and the actor must not crash.
        const string configGood = "{\"instanceUniqueName\":\"Good\"}";
        var fetcher = new FakeConfigFetcher(depId => depId == "depGone"
            ? Task.FromException<string>(new DeploymentConfigFetchException("expired", isSuperseded: true))
            : Task.FromResult(configGood));
        var logger = new CapturingLogger<SiteReconciliationActor>();
        var commProbe = CreateTestProbe();

        var actor = CreateReconciliationActor(commProbe, fetcher, logger);
        Watch(actor);

        commProbe.ExpectMsg<ReconcileSiteRequest>(TimeSpan.FromSeconds(5));
        commProbe.Reply(new ReconcileSiteResponse(
            [
                new ReconcileGapItem("Gone", "depGone", "revGone", true, "tok-gone"),
                new ReconcileGapItem("Good", "depGood", "revGood", true, "tok-good")
            ],
            [],
            "http://central:9000"));

        await AwaitAssertAsync(async () =>
        {
            var configs = await _storage.GetAllDeployedConfigsAsync();
            var row = Assert.Single(configs, c => c.InstanceUniqueName == "Good");
            Assert.Equal(configGood, row.ConfigJson);
        }, TimeSpan.FromSeconds(5));

        // The superseded item was attempted but never stored, and was logged at Info as a skip
        // (not Error) — and the actor survived.
        var all = await _storage.GetAllDeployedConfigsAsync();
        Assert.DoesNotContain(all, c => c.InstanceUniqueName == "Gone");
        Assert.Contains(logger.Entries, e =>
            e.Level == LogLevel.Information && e.Message.Contains("Gone"));
        Assert.DoesNotContain(logger.Entries, e =>
            e.Level == LogLevel.Error && e.Message.Contains("Gone"));
        ExpectNoMsg(TimeSpan.FromMilliseconds(300));
    }

    [Fact]
    public void CentralAskFails_ActorSurvives()
    {
        // The SiteCommunicationActor probe never replies → the Ask times out. The actor must
        // log + survive (reconcile re-runs on the next startup), not crash.
        var fetcher = new FakeConfigFetcher(_ => Task.FromResult("never"));
        var logger = new CapturingLogger<SiteReconciliationActor>();
        var commProbe = CreateTestProbe();

        var actor = CreateReconciliationActor(
            commProbe, fetcher, logger, askTimeout: TimeSpan.FromMilliseconds(300));
        Watch(actor);

        // The request is sent...
        commProbe.ExpectMsg<ReconcileSiteRequest>(TimeSpan.FromSeconds(5));
        // ...but the probe does NOT reply, forcing an Ask timeout.

        // The actor does not die (no Terminated within the window) and never fetched.
        ExpectNoMsg(TimeSpan.FromSeconds(1));
        Assert.Empty(fetcher.Calls);

        // The timed-out pass must be reported at Warning. Polled rather than asserted once,
        // because the result is piped back to the actor asynchronously.
        AwaitAssert(
            () => Assert.Contains(logger.Entries, e =>
                e.Level == LogLevel.Warning && e.Message.Contains("did not complete")),
            TimeSpan.FromSeconds(5));
    }

    /// <summary>
    /// In-test fake <see cref="IDeploymentConfigFetcher"/>: runs a per-deploymentId behavior
    /// (return config JSON or throw, as a Task) and records every call's coords in a
    /// concurrent queue, since the fetch does not run on the test thread.
    /// </summary>
    private sealed class FakeConfigFetcher : IDeploymentConfigFetcher
    {
        private readonly Func<string, Task<string>> _behavior;

        public ConcurrentQueue<(string BaseUrl, string DeploymentId, string Token)> Calls { get; } = new();

        public FakeConfigFetcher(Func<string, Task<string>> behavior) => _behavior = behavior;

        public async Task<string> FetchAsync(
            string centralFetchBaseUrl, string deploymentId, string token, CancellationToken ct)
        {
            Calls.Enqueue((centralFetchBaseUrl, deploymentId, token));
            await Task.Yield();
            return await _behavior(deploymentId);
        }
    }

    /// <summary>Capturing logger (concurrent queue) so tests can assert on emitted log entries.</summary>
    private sealed class CapturingLogger<T> : ILogger<T>
    {
        public ConcurrentQueue<(LogLevel Level, string Message)> Entries { get; } = new();

        IDisposable? ILogger.BeginScope<TState>(TState state) => null;

        public bool IsEnabled(LogLevel logLevel) => true;

        public void Log<TState>(
            LogLevel logLevel, EventId eventId, TState state, Exception? exception,
            Func<TState, Exception?, string> formatter)
            => Entries.Enqueue((logLevel, formatter(state, exception)));
    }
}
|
||||||
Reference in New Issue
Block a user