feat(reconcile): central handler — gap diff + fresh tokens + orphans

This commit is contained in:
Joseph Doherty
2026-06-26 16:20:17 -04:00
parent ec2aa2bbac
commit 96192950a0
5 changed files with 462 additions and 0 deletions
@@ -7,6 +7,7 @@ using Microsoft.Extensions.DependencyInjection;
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Repositories; using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Repositories;
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Audit; using ZB.MOM.WW.ScadaBridge.Commons.Messages.Audit;
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Communication; using ZB.MOM.WW.ScadaBridge.Commons.Messages.Communication;
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Deployment;
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Health; using ZB.MOM.WW.ScadaBridge.Commons.Messages.Health;
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Notification; using ZB.MOM.WW.ScadaBridge.Commons.Messages.Notification;
using ZB.MOM.WW.ScadaBridge.HealthMonitoring; using ZB.MOM.WW.ScadaBridge.HealthMonitoring;
@@ -213,6 +214,12 @@ public class CentralCommunicationActor : ReceiveActor
// Audit Log (#23 M3) combined-telemetry ingest: routes to the same proxy // Audit Log (#23 M3) combined-telemetry ingest: routes to the same proxy
// the same way; the proxy replies with an IngestCachedTelemetryReply. // the same way; the proxy replies with an IngestCachedTelemetryReply.
Receive<IngestCachedTelemetryCommand>(HandleIngestCachedTelemetry); Receive<IngestCachedTelemetryCommand>(HandleIngestCachedTelemetry);
// Startup reconciliation: a site node forwards its local deployed inventory on
// startup via ClusterClient. Resolve the scoped ReconcileService, diff the
// inventory against central's expected set, and pipe the ReconcileSiteResponse
// (gap fetch tokens + orphans) straight back to the site node's ClusterClient.
Receive<ReconcileSiteRequest>(HandleReconcileSiteRequest);
} }
private void HandleNotificationSubmit(NotificationSubmit msg) private void HandleNotificationSubmit(NotificationSubmit msg)
@@ -297,6 +304,48 @@ public class CentralCommunicationActor : ReceiveActor
.PipeTo(replyTo); .PipeTo(replyTo);
} }
/// <summary>
/// Startup reconciliation (site→central over ClusterClient): resolves the scoped
/// <see cref="ReconcileService"/> inside a fresh DI scope on a thread-pool task, lets it
/// diff the node's reported inventory against central's expected set, and pipes the
/// resulting <see cref="ReconcileSiteResponse"/> back to the original sender. The actor
/// stays thin — all diff and staging logic lives in the service. Mirrors the DB-access
/// pattern used by <see cref="LoadSiteAddressesFromDb"/> (Task.Run + CreateScope +
/// PipeTo) and the Sender-preservation pattern of <see cref="HandleIngestAuditEvents"/>.
///
/// On a faulted task PipeTo delivers a <see cref="Status.Failure"/> to the node; its
/// Ask faults and it simply retries reconcile on the next startup — reconcile is
/// best-effort, so the fault is allowed to propagate rather than being swallowed.
/// Nothing in this handler dereferences the request's inventory map synchronously, so
/// a malformed request (null inventory) also surfaces through that task/PipeTo path
/// rather than as an exception thrown inside the actor's message handler.
/// </summary>
/// <param name="msg">The reconcile request forwarded by a site node.</param>
private void HandleReconcileSiteRequest(ReconcileSiteRequest msg)
{
    // Capture Sender before the async/PipeTo — Akka resets Sender between dispatches.
    var replyTo = Sender;

    // Bound the DB work by the actor lifecycle (Communication-019). The CTS may have
    // been disposed by PostStop on a racing late message; treat that as "actor gone".
    CancellationToken ct;
    try
    {
        ct = _lifecycleCts.Token;
    }
    catch (ObjectDisposedException)
    {
        return;
    }

    // Null-tolerant count: this is only a diagnostic line, and throwing here would
    // bypass the Status.Failure reply path described above.
    var localInstanceCount = msg.LocalNameToRevisionHash?.Count ?? 0;
    _log.Debug(
        "Handling ReconcileSiteRequest from site {0} node {1} ({2} local instance(s))",
        msg.SiteIdentifier, msg.NodeId, localInstanceCount);

    Task.Run(async () =>
    {
        // The scope lives exactly as long as the reconcile call it serves.
        using var scope = _serviceProvider.CreateScope();
        var service = scope.ServiceProvider.GetRequiredService<ReconcileService>();
        return await service.ReconcileAsync(msg, ct).ConfigureAwait(false);
    }).PipeTo(replyTo);
}
private void HandleHeartbeat(HeartbeatMessage heartbeat) private void HandleHeartbeat(HeartbeatMessage heartbeat)
{ {
var aggregator = _serviceProvider.GetService<ICentralHealthAggregator>(); var aggregator = _serviceProvider.GetService<ICentralHealthAggregator>();
@@ -0,0 +1,155 @@
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Repositories;
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Deployment;
using ZB.MOM.WW.ScadaBridge.Commons.Types.Deployment;
namespace ZB.MOM.WW.ScadaBridge.Communication;
/// <summary>
/// Central-side startup-reconciliation handler. A site node, on startup, reports its
/// local deployed inventory via <see cref="ReconcileSiteRequest"/> (delivered over
/// ClusterClient to <see cref="Actors.CentralCommunicationActor"/>); this service diffs
/// it against central's expected deployed set and replies with fresh fetch tokens for the
/// gap — instances the node is missing or has at a stale revision — plus the orphan names
/// (present locally but no longer deployed centrally, which the node only logs).
///
/// This self-heals a node that was DOWN during a deploy and therefore missed the
/// replicate. The node fetches each gap item's config over the existing token-gated HTTP
/// endpoint; this service stages those tokens as <c>PendingDeployment</c> rows.
///
/// Scoped service (holds scoped repositories); resolved per-request by the actor inside a
/// DI scope. The actor stays thin: it captures <c>Sender</c>, resolves this service in a
/// scope, awaits <see cref="ReconcileAsync"/>, and pipes the response back.
/// </summary>
public class ReconcileService
{
    private readonly IDeploymentManagerRepository _deploymentRepository;
    private readonly ISiteRepository _siteRepository;
    private readonly CommunicationOptions _options;
    private readonly ILogger<ReconcileService> _logger;

    /// <summary>Initializes the reconciliation service.</summary>
    /// <param name="deploymentRepository">Repository for the expected-set query and pending-row staging.</param>
    /// <param name="siteRepository">Repository used to resolve a site's numeric id from its identifier.</param>
    /// <param name="options">Communication options carrying <see cref="CommunicationOptions.CentralFetchBaseUrl"/> and <see cref="CommunicationOptions.PendingDeploymentTtl"/>.</param>
    /// <param name="logger">Logger.</param>
    public ReconcileService(
        IDeploymentManagerRepository deploymentRepository,
        ISiteRepository siteRepository,
        IOptions<CommunicationOptions> options,
        ILogger<ReconcileService> logger)
    {
        _deploymentRepository = deploymentRepository;
        _siteRepository = siteRepository;
        _options = options.Value;
        _logger = logger;
    }

    /// <summary>
    /// Diffs the node's reported inventory against central's expected deployed set,
    /// stages fresh fetch tokens for the gap, and returns the reconcile response.
    /// </summary>
    /// <param name="request">The site node's reconcile request (its local name→revision-hash map).</param>
    /// <param name="cancellationToken">A cancellation token, forwarded to every repository call.</param>
    /// <returns>The gap (with fresh tokens), orphan names, and the central fetch base URL.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="request"/> is null.</exception>
    /// <exception cref="ArgumentException">The request carries no local inventory map.</exception>
    public async Task<ReconcileSiteResponse> ReconcileAsync(
        ReconcileSiteRequest request, CancellationToken cancellationToken = default)
    {
        // Fail fast on a malformed request, before any DB round trip, with an
        // exception that names the problem (instead of a late NullReferenceException
        // from the diff loop or the orphan pass).
        ArgumentNullException.ThrowIfNull(request);
        var localMap = request.LocalNameToRevisionHash
            ?? throw new ArgumentException(
                "Reconcile request must carry a local inventory map (it may be empty).",
                nameof(request));

        var baseUrl = _options.CentralFetchBaseUrl;

        // 1. Resolve the numeric site id. An unknown site is non-fatal: reply empty
        //    (the node simply finds no gap to fetch) and log a warning so a
        //    misconfigured SiteIdentifier is visible to operators.
        var site = await _siteRepository
            .GetSiteByIdentifierAsync(request.SiteIdentifier, cancellationToken)
            .ConfigureAwait(false);
        if (site is null)
        {
            _logger.LogWarning(
                "Reconcile request from unknown site '{SiteIdentifier}' (node {NodeId}); replying with empty gap",
                request.SiteIdentifier, request.NodeId);
            return new ReconcileSiteResponse(
                Array.Empty<ReconcileGapItem>(), Array.Empty<string>(), baseUrl);
        }

        // 2. Central's expected deployed set for this site (instances with a snapshot).
        var expected = await _deploymentRepository
            .GetExpectedDeploymentsForSiteAsync(site.Id, cancellationToken)
            .ConfigureAwait(false);

        var gap = new List<ReconcileGapItem>();

        // One staging timestamp for the whole request: every pending row staged by
        // this call shares the same creation and expiry instants.
        var now = DateTimeOffset.UtcNow;
        var expiresAt = now + _options.PendingDeploymentTtl;

        // 3. GAP = expected items the node is MISSING (name absent locally) or STALE
        //    (local revision != expected revision). Current items are omitted.
        foreach (var exp in expected)
        {
            var isCurrent =
                localMap.TryGetValue(exp.InstanceUniqueName, out var localHash)
                && string.Equals(localHash, exp.RevisionHash, StringComparison.Ordinal);
            if (isCurrent)
            {
                continue; // node already has the current revision
            }

            // 4. Read the frozen snapshot config to stage. Null = the snapshot was deleted
            //    between the expected-set query and now (instance removed mid-reconcile);
            //    skip it — best-effort reconcile re-runs on the next node startup.
            var snapshot = await _deploymentRepository
                .GetDeployedSnapshotByInstanceIdAsync(exp.InstanceId, cancellationToken)
                .ConfigureAwait(false);
            if (snapshot is null)
            {
                _logger.LogDebug(
                    "Reconcile: snapshot for instance {Instance} disappeared (deleted race); skipping",
                    exp.InstanceUniqueName);
                continue;
            }

            var token = DeploymentFetchToken.Generate();

            // Stage with the snapshot's DeploymentId as the deploymentId so the gap item's
            // DeploymentId + token point the node at the right pending row to fetch.
            //
            // Reconcile staging is safe without a DB uniqueness guard: a gap arises only
            // from one-node-down-during-a-successful-deploy, so at most one node ever
            // reconciles a given instance (if BOTH were down the deploy failed and no
            // snapshot exists, so it is never in the expected set). Deploy-time
            // supersession serializes via the per-instance operation lock.
            //
            // NOTE(review): the revision hash staged here comes from the expected-set
            // query while the config JSON comes from the later snapshot read. If a new
            // deploy lands between the two, they could presumably describe different
            // revisions — confirm whether the snapshot's own revision should be used.
            var staged = await _deploymentRepository.StagePendingIfAbsentAsync(
                    exp.InstanceId, snapshot.DeploymentId, exp.RevisionHash,
                    snapshot.ConfigurationJson, token, now, expiresAt, cancellationToken)
                .ConfigureAwait(false);
            if (!staged)
            {
                // A pending row already exists — an in-flight deploy is mid-flight and its
                // replication will deliver this instance to the node shortly. Omit it from
                // the gap (reconcile is best-effort and re-runs).
                //
                // NOTE(review): a still-unexpired row left by an earlier reconcile of the
                // same node would presumably also land here — verify against the
                // StagePendingIfAbsentAsync contract.
                _logger.LogDebug(
                    "Reconcile: pending row already exists for instance {Instance} (in-flight deploy); omitting from gap",
                    exp.InstanceUniqueName);
                continue;
            }

            gap.Add(new ReconcileGapItem(
                exp.InstanceUniqueName, snapshot.DeploymentId, exp.RevisionHash, exp.IsEnabled, token));
        }

        // 5. ORPHANS = names the node has locally that central no longer considers deployed.
        //    The node only LOGS these (never deletes). The hash set keeps the membership
        //    test O(1) per local name.
        var expectedNames = new HashSet<string>(
            expected.Select(e => e.InstanceUniqueName), StringComparer.Ordinal);
        var orphans = localMap.Keys
            .Where(name => !expectedNames.Contains(name))
            .ToList();

        _logger.LogDebug(
            "Reconcile for site {SiteIdentifier} (node {NodeId}): {GapCount} gap, {OrphanCount} orphan(s)",
            request.SiteIdentifier, request.NodeId, gap.Count, orphans.Count);

        // 6. Reply.
        return new ReconcileSiteResponse(gap, orphans, baseUrl);
    }
}
@@ -17,6 +17,11 @@ public static class ServiceCollectionExtensions
services.AddSingleton<SiteStreamGrpcClientFactory>(); services.AddSingleton<SiteStreamGrpcClientFactory>();
services.AddSingleton<DebugStreamService>(); services.AddSingleton<DebugStreamService>();
// Startup reconciliation handler — scoped (holds scoped repositories), resolved
// per-request by CentralCommunicationActor inside a DI scope. Harmless on site
// hosts: only the central actor ever resolves it.
services.AddScoped<ReconcileService>();
return services; return services;
} }
@@ -0,0 +1,79 @@
using Akka.Actor;
using Akka.TestKit.Xunit2;
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Logging.Abstractions;
using Microsoft.Extensions.Options;
using NSubstitute;
using ZB.MOM.WW.ScadaBridge.Commons.Entities.Deployment;
using ZB.MOM.WW.ScadaBridge.Commons.Entities.Sites;
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Repositories;
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Deployment;
using ZB.MOM.WW.ScadaBridge.Commons.Types.Deployment;
using ZB.MOM.WW.ScadaBridge.Communication.Actors;
namespace ZB.MOM.WW.ScadaBridge.Communication.Tests;
/// <summary>
/// Verifies the actor-level wiring for startup reconciliation: a
/// <see cref="ReconcileSiteRequest"/> told to <see cref="CentralCommunicationActor"/>
/// is handled by the scoped <see cref="ReconcileService"/>, and the resulting
/// <see cref="ReconcileSiteResponse"/> arrives back at the original sender (here the
/// TestKit test actor). Follows the same shape as the audit-ingest routing tests.
/// </summary>
public class CentralCommunicationActorReconcileTests : TestKit
{
    [Fact]
    public void ReconcileSiteRequest_RoutesResponseToSender()
    {
        // --- Arrange: repositories -------------------------------------------------
        var siteRepository = Substitute.For<ISiteRepository>();
        var deploymentRepository = Substitute.For<IDeploymentManagerRepository>();

        // The actor's periodic refresh calls GetAllSitesAsync; give it nothing to load.
        siteRepository.GetAllSitesAsync(Arg.Any<CancellationToken>())
            .Returns(new List<Site>());

        var knownSite = new Site("Site One", "site1") { Id = 7 };
        siteRepository.GetSiteByIdentifierAsync("site1", Arg.Any<CancellationToken>())
            .Returns(knownSite);

        // Central expects exactly one instance on the site, with a snapshot available.
        var expectedDeployments = new List<ExpectedDeployment>
        {
            new ExpectedDeployment(2, "inst-B", "rev2", "dep-B", true),
        };
        deploymentRepository.GetExpectedDeploymentsForSiteAsync(7, Arg.Any<CancellationToken>())
            .Returns(expectedDeployments);

        var snapshotB = new DeployedConfigSnapshot("dep-B", "rev2", "{\"cfg\":\"B\"}");
        deploymentRepository.GetDeployedSnapshotByInstanceIdAsync(2, Arg.Any<CancellationToken>())
            .Returns(snapshotB);

        // Staging always succeeds (no pending row in the way).
        deploymentRepository.StagePendingIfAbsentAsync(
                Arg.Any<int>(), Arg.Any<string>(), Arg.Any<string>(), Arg.Any<string>(),
                Arg.Any<string>(), Arg.Any<DateTimeOffset>(), Arg.Any<DateTimeOffset>(),
                Arg.Any<CancellationToken>())
            .Returns(true);

        // --- Arrange: DI container the actor resolves ReconcileService from ---------
        var communicationOptions = new CommunicationOptions
        {
            CentralFetchBaseUrl = "https://central.example:9000",
            PendingDeploymentTtl = TimeSpan.FromMinutes(5),
        };
        var options = Options.Create(communicationOptions);

        var registrations = new ServiceCollection();
        registrations.AddScoped(_ => deploymentRepository);
        registrations.AddScoped(_ => siteRepository);
        registrations.AddSingleton(options);
        registrations.AddSingleton<Microsoft.Extensions.Logging.ILogger<ReconcileService>>(
            NullLogger<ReconcileService>.Instance);
        registrations.AddScoped<ReconcileService>();
        var provider = registrations.BuildServiceProvider();

        var clientFactory = Substitute.For<ISiteClientFactory>();
        var centralActor = Sys.ActorOf(
            Props.Create(() => new CentralCommunicationActor(provider, clientFactory, null)));

        // --- Act: node B reports an empty inventory, so inst-B is missing -----------
        var request = new ReconcileSiteRequest(
            "site1", "node-b",
            new Dictionary<string, string>());
        centralActor.Tell(request);

        // --- Assert: the response reaches the sender and lists inst-B as the gap ----
        var reply = ExpectMsg<ReconcileSiteResponse>(TimeSpan.FromSeconds(5));
        var onlyGapItem = Assert.Single(reply.Gap);
        Assert.Equal("inst-B", onlyGapItem.InstanceUniqueName);
        Assert.Equal("dep-B", onlyGapItem.DeploymentId);
        Assert.False(string.IsNullOrWhiteSpace(onlyGapItem.FetchToken));
    }
}
@@ -0,0 +1,174 @@
using Microsoft.Extensions.Logging.Abstractions;
using Microsoft.Extensions.Options;
using NSubstitute;
using ZB.MOM.WW.ScadaBridge.Commons.Entities.Deployment;
using ZB.MOM.WW.ScadaBridge.Commons.Entities.Sites;
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Repositories;
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Deployment;
using ZB.MOM.WW.ScadaBridge.Commons.Types.Deployment;
namespace ZB.MOM.WW.ScadaBridge.Communication.Tests;
/// <summary>
/// Unit tests for <see cref="ReconcileService"/>, the central-side startup-reconciliation
/// handler. Each test feeds the service a node's local inventory plus a substituted view
/// of central's expected deployed set, then checks the gap (missing/stale instances with
/// freshly staged fetch tokens) and the orphan names it reports.
/// </summary>
public class ReconcileServiceTests
{
    private const string BaseUrl = "https://central.example:9000";
    private const string SiteIdentifier = "site1";
    private const int SiteId = 7;

    private readonly IDeploymentManagerRepository _deploymentRepo =
        Substitute.For<IDeploymentManagerRepository>();
    private readonly ISiteRepository _siteRepo = Substitute.For<ISiteRepository>();

    // ----- fixture helpers -----------------------------------------------------------

    /// <summary>Builds the service under test over the substituted repositories.</summary>
    private ReconcileService BuildSut(TimeSpan? ttl = null)
    {
        var communicationOptions = new CommunicationOptions
        {
            CentralFetchBaseUrl = BaseUrl,
            PendingDeploymentTtl = ttl ?? TimeSpan.FromMinutes(5),
        };
        return new ReconcileService(
            _deploymentRepo,
            _siteRepo,
            Options.Create(communicationOptions),
            NullLogger<ReconcileService>.Instance);
    }

    /// <summary>Makes the site lookup for <see cref="SiteIdentifier"/> succeed.</summary>
    private void GivenKnownSite()
    {
        _siteRepo.GetSiteByIdentifierAsync(SiteIdentifier, Arg.Any<CancellationToken>())
            .Returns(new Site("Site One", SiteIdentifier) { Id = SiteId });
    }

    /// <summary>Shorthand for one row of central's expected deployed set.</summary>
    private static ExpectedDeployment Deployment(
        int id, string name, string rev, string dep, bool enabled = true)
    {
        return new ExpectedDeployment(id, name, rev, dep, enabled);
    }

    /// <summary>Sets central's expected deployed set for the test site.</summary>
    private void GivenExpected(params ExpectedDeployment[] deployments)
    {
        _deploymentRepo.GetExpectedDeploymentsForSiteAsync(SiteId, Arg.Any<CancellationToken>())
            .Returns(deployments.ToList());
    }

    /// <summary>Makes a deployed snapshot available for the given expected instance.</summary>
    private void GivenSnapshot(ExpectedDeployment deployment)
    {
        var snapshot = new DeployedConfigSnapshot(
            deployment.DeploymentId,
            deployment.RevisionHash,
            $"{{\"cfg\":\"{deployment.InstanceUniqueName}\"}}");
        _deploymentRepo.GetDeployedSnapshotByInstanceIdAsync(deployment.InstanceId, Arg.Any<CancellationToken>())
            .Returns(snapshot);
    }

    /// <summary>Sets the staging outcome for every instance.</summary>
    private void GivenStagingResult(bool outcome)
    {
        _deploymentRepo.StagePendingIfAbsentAsync(
                Arg.Any<int>(), Arg.Any<string>(), Arg.Any<string>(), Arg.Any<string>(),
                Arg.Any<string>(), Arg.Any<DateTimeOffset>(), Arg.Any<DateTimeOffset>(),
                Arg.Any<CancellationToken>())
            .Returns(outcome);
    }

    /// <summary>Builds a reconcile request from node-a carrying the given local inventory.</summary>
    private static ReconcileSiteRequest RequestWith(params (string Name, string Rev)[] inventory)
    {
        var localMap = inventory.ToDictionary(entry => entry.Name, entry => entry.Rev);
        return new ReconcileSiteRequest(SiteIdentifier, "node-a", localMap);
    }

    // ----- tests ---------------------------------------------------------------------

    [Fact]
    public async Task Reconcile_GapIsMissingAndStale_CurrentOmitted_WithFreshTokensAndSnapshotDeploymentIds()
    {
        GivenKnownSite();
        var instA = Deployment(1, "inst-A", "rev1", "dep-A");
        var instB = Deployment(2, "inst-B", "rev2", "dep-B");
        var instC = Deployment(3, "inst-C", "rev3", "dep-C", enabled: false);
        GivenExpected(instA, instB, instC);
        GivenSnapshot(instB);
        GivenSnapshot(instC);
        GivenStagingResult(true);

        // The node holds A at the current revision and B at an old one; C is absent.
        var sut = BuildSut();
        var result = await sut.ReconcileAsync(
            RequestWith(("inst-A", "rev1"), ("inst-B", "revOLD")));

        Assert.Equal(BaseUrl, result.CentralFetchBaseUrl);
        Assert.Equal(2, result.Gap.Count);
        Assert.DoesNotContain(result.Gap, item => item.InstanceUniqueName == "inst-A");

        var staleB = Assert.Single(result.Gap, item => item.InstanceUniqueName == "inst-B");
        Assert.Equal("dep-B", staleB.DeploymentId);
        Assert.Equal("rev2", staleB.RevisionHash);
        Assert.True(staleB.IsEnabled);
        Assert.False(string.IsNullOrWhiteSpace(staleB.FetchToken));

        var missingC = Assert.Single(result.Gap, item => item.InstanceUniqueName == "inst-C");
        Assert.Equal("dep-C", missingC.DeploymentId);
        Assert.Equal("rev3", missingC.RevisionHash);
        Assert.False(missingC.IsEnabled);
        Assert.False(string.IsNullOrWhiteSpace(missingC.FetchToken));

        // Every gap item gets its own token.
        Assert.NotEqual(staleB.FetchToken, missingC.FetchToken);
        Assert.Empty(result.OrphanNames);
    }

    [Fact]
    public async Task Reconcile_LocalNameNotInExpected_IsReportedAsOrphan()
    {
        GivenKnownSite();
        var instA = Deployment(1, "inst-A", "rev1", "dep-A");
        GivenExpected(instA);
        GivenStagingResult(true);

        // inst-A matches central; inst-Z exists only on the node, so it is an orphan.
        var sut = BuildSut();
        var result = await sut.ReconcileAsync(
            RequestWith(("inst-A", "rev1"), ("inst-Z", "revX")));

        Assert.Empty(result.Gap);
        var onlyOrphan = Assert.Single(result.OrphanNames);
        Assert.Equal("inst-Z", onlyOrphan);
    }

    [Fact]
    public async Task Reconcile_StagePendingReturnsFalse_OmitsThatGapItem()
    {
        GivenKnownSite();
        var instB = Deployment(2, "inst-B", "rev2", "dep-B");
        var instC = Deployment(3, "inst-C", "rev3", "dep-C");
        GivenExpected(instB, instC);
        GivenSnapshot(instB);
        GivenSnapshot(instC);

        // Staging succeeds in general, but instance 3 (inst-C) already has a pending
        // row. The more specific setup is registered last so it takes precedence.
        GivenStagingResult(true);
        _deploymentRepo.StagePendingIfAbsentAsync(
                3, Arg.Any<string>(), Arg.Any<string>(), Arg.Any<string>(),
                Arg.Any<string>(), Arg.Any<DateTimeOffset>(), Arg.Any<DateTimeOffset>(),
                Arg.Any<CancellationToken>())
            .Returns(false);

        // Empty local inventory: both instances are missing on the node.
        var sut = BuildSut();
        var result = await sut.ReconcileAsync(RequestWith());

        var onlyGapItem = Assert.Single(result.Gap);
        Assert.Equal("inst-B", onlyGapItem.InstanceUniqueName);
        Assert.DoesNotContain(result.Gap, item => item.InstanceUniqueName == "inst-C");
    }

    [Fact]
    public async Task Reconcile_SnapshotMissing_SkipsGapItem()
    {
        GivenKnownSite();
        var instB = Deployment(2, "inst-B", "rev2", "dep-B");
        GivenExpected(instB);
        // Deliberately no GivenSnapshot(instB): the snapshot lookup is left unconfigured
        // to model the "deleted between queries" race.
        GivenStagingResult(true);

        var sut = BuildSut();
        var result = await sut.ReconcileAsync(RequestWith());

        Assert.Empty(result.Gap);
        Assert.Empty(result.OrphanNames);
    }

    [Fact]
    public async Task Reconcile_UnknownSite_ReturnsEmptyResponse_NoThrow()
    {
        _siteRepo.GetSiteByIdentifierAsync(SiteIdentifier, Arg.Any<CancellationToken>())
            .Returns((Site?)null);

        var sut = BuildSut();
        var result = await sut.ReconcileAsync(
            RequestWith(("inst-A", "rev1")));

        Assert.Empty(result.Gap);
        Assert.Empty(result.OrphanNames);
        Assert.Equal(BaseUrl, result.CentralFetchBaseUrl);

        // An unknown site short-circuits before the expected-set query.
        await _deploymentRepo.DidNotReceive()
            .GetExpectedDeploymentsForSiteAsync(Arg.Any<int>(), Arg.Any<CancellationToken>());
    }
}