feat(reconcile): central handler — gap diff + fresh tokens + orphans
This commit is contained in:
@@ -7,6 +7,7 @@ using Microsoft.Extensions.DependencyInjection;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Repositories;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Audit;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Communication;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Deployment;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Health;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Notification;
|
||||
using ZB.MOM.WW.ScadaBridge.HealthMonitoring;
|
||||
@@ -213,6 +214,12 @@ public class CentralCommunicationActor : ReceiveActor
|
||||
// Audit Log (#23 M3) combined-telemetry ingest: routes to the same proxy
|
||||
// the same way; the proxy replies with an IngestCachedTelemetryReply.
|
||||
Receive<IngestCachedTelemetryCommand>(HandleIngestCachedTelemetry);
|
||||
|
||||
// Startup reconciliation: a site node forwards its local deployed inventory on
|
||||
// startup via ClusterClient. Resolve the scoped ReconcileService, diff the
|
||||
// inventory against central's expected set, and pipe the ReconcileSiteResponse
|
||||
// (gap fetch tokens + orphans) straight back to the site node's ClusterClient.
|
||||
Receive<ReconcileSiteRequest>(HandleReconcileSiteRequest);
|
||||
}
|
||||
|
||||
private void HandleNotificationSubmit(NotificationSubmit msg)
|
||||
@@ -297,6 +304,48 @@ public class CentralCommunicationActor : ReceiveActor
|
||||
.PipeTo(replyTo);
|
||||
}
|
||||
|
||||
/// <summary>
/// Startup reconciliation (site→central over ClusterClient): resolve the scoped
/// <see cref="ReconcileService"/> in a DI scope, diff the node's reported inventory
/// against central's expected set, and pipe the <see cref="ReconcileSiteResponse"/>
/// back to the site node's ClusterClient path. The actor stays thin — all the diff
/// and staging logic lives in the service. Mirrors the DB-access pattern used by
/// <see cref="LoadSiteAddressesFromDb"/> (Task.Run + CreateScope + PipeTo) and the
/// Sender-preservation pattern of <see cref="HandleIngestAuditEvents"/>.
///
/// On a faulted task PipeTo delivers a <see cref="Status.Failure"/> to the node; its
/// Ask faults and it simply retries reconcile on the next startup — reconcile is
/// best-effort, so the fault is allowed to propagate rather than being swallowed.
///
/// If the lifecycle token source has already been disposed, the message is dropped
/// without a reply.
/// </summary>
/// <param name="msg">The site node's reconcile request, carrying its local name→revision-hash inventory.</param>
private void HandleReconcileSiteRequest(ReconcileSiteRequest msg)
{
    // Capture Sender before the async/PipeTo — Akka resets Sender between dispatches.
    var replyTo = Sender;

    // Bound the DB work by the actor lifecycle (Communication-019). The CTS may have
    // been disposed by PostStop on a racing late message; treat that as "actor gone".
    CancellationToken ct;
    try
    {
        ct = _lifecycleCts.Token;
    }
    catch (ObjectDisposedException)
    {
        return;
    }

    // This log runs inside the receive handler, so it must not throw on a request
    // with a null inventory map: an exception here would escape the handler instead
    // of reaching the node as a Status.Failure through the piped task below.
    _log.Debug(
        "Handling ReconcileSiteRequest from site {0} node {1} ({2} local instance(s))",
        msg.SiteIdentifier, msg.NodeId, msg.LocalNameToRevisionHash?.Count ?? 0);

    Task.Run(async () =>
    {
        // The scope lives for the whole reconcile so the scoped service (and whatever
        // it resolved) is disposed only after the response has been produced.
        using var scope = _serviceProvider.CreateScope();
        var service = scope.ServiceProvider.GetRequiredService<ReconcileService>();
        return await service.ReconcileAsync(msg, ct).ConfigureAwait(false);
    }).PipeTo(replyTo);
}
|
||||
|
||||
private void HandleHeartbeat(HeartbeatMessage heartbeat)
|
||||
{
|
||||
var aggregator = _serviceProvider.GetService<ICentralHealthAggregator>();
|
||||
|
||||
@@ -0,0 +1,155 @@
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Repositories;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Deployment;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Types.Deployment;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.Communication;
|
||||
|
||||
/// <summary>
/// Central-side startup-reconciliation handler. A site node, on startup, reports its
/// local deployed inventory via <see cref="ReconcileSiteRequest"/> (delivered over
/// ClusterClient to <see cref="Actors.CentralCommunicationActor"/>); this service diffs
/// it against central's expected deployed set and replies with fresh fetch tokens for the
/// gap — instances the node is missing or has at a stale revision — plus the orphan names
/// (present locally but no longer deployed centrally, which the node only logs).
///
/// This self-heals a node that was DOWN during a deploy and therefore missed the
/// replicate. The node fetches each gap item's config over the existing token-gated HTTP
/// endpoint; this service stages those tokens as <c>PendingDeployment</c> rows.
///
/// Scoped service (holds scoped repositories); resolved per-request by the actor inside a
/// DI scope. The actor stays thin: it captures <c>Sender</c>, resolves this service in a
/// scope, awaits <see cref="ReconcileAsync"/>, and pipes the response back.
/// </summary>
public class ReconcileService
{
    private readonly IDeploymentManagerRepository _deploymentRepository;
    private readonly ISiteRepository _siteRepository;
    private readonly CommunicationOptions _options;
    private readonly ILogger<ReconcileService> _logger;

    /// <summary>Initializes the reconciliation service.</summary>
    /// <param name="deploymentRepository">Repository for the expected-set query and pending-row staging.</param>
    /// <param name="siteRepository">Repository used to resolve a site's numeric id from its identifier.</param>
    /// <param name="options">Communication options carrying <see cref="CommunicationOptions.CentralFetchBaseUrl"/> and <see cref="CommunicationOptions.PendingDeploymentTtl"/>.</param>
    /// <param name="logger">Logger.</param>
    public ReconcileService(
        IDeploymentManagerRepository deploymentRepository,
        ISiteRepository siteRepository,
        IOptions<CommunicationOptions> options,
        ILogger<ReconcileService> logger)
    {
        _deploymentRepository = deploymentRepository;
        _siteRepository = siteRepository;
        _options = options.Value;
        _logger = logger;
    }

    /// <summary>
    /// Diffs the node's reported inventory against central's expected deployed set,
    /// stages fresh fetch tokens for the gap, and returns the reconcile response.
    /// </summary>
    /// <param name="request">The site node's reconcile request (its local name→revision-hash map).</param>
    /// <param name="cancellationToken">A cancellation token, passed to every repository call.</param>
    /// <returns>The gap (with fresh tokens), orphan names, and the central fetch base URL.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="request"/> is null.</exception>
    /// <exception cref="ArgumentException">The request's local inventory map is null.</exception>
    public async Task<ReconcileSiteResponse> ReconcileAsync(
        ReconcileSiteRequest request, CancellationToken cancellationToken = default)
    {
        // Reject a malformed request before any repository work, with an exception that
        // names the problem instead of a NullReferenceException part-way through the diff.
        ArgumentNullException.ThrowIfNull(request);
        var localMap = request.LocalNameToRevisionHash
            ?? throw new ArgumentException(
                "ReconcileSiteRequest.LocalNameToRevisionHash must not be null.", nameof(request));

        var baseUrl = _options.CentralFetchBaseUrl;

        // 1. Resolve the numeric site id. An unknown site is non-fatal: reply empty
        //    (the node simply finds no gap to fetch) and log a warning so a
        //    misconfigured SiteIdentifier is visible to operators.
        var site = await _siteRepository
            .GetSiteByIdentifierAsync(request.SiteIdentifier, cancellationToken)
            .ConfigureAwait(false);
        if (site is null)
        {
            _logger.LogWarning(
                "Reconcile request from unknown site '{SiteIdentifier}' (node {NodeId}); replying with empty gap",
                request.SiteIdentifier, request.NodeId);
            return new ReconcileSiteResponse(
                Array.Empty<ReconcileGapItem>(), Array.Empty<string>(), baseUrl);
        }

        // 2. Central's expected deployed set for this site (instances with a snapshot).
        var expected = await _deploymentRepository
            .GetExpectedDeploymentsForSiteAsync(site.Id, cancellationToken)
            .ConfigureAwait(false);

        var gap = new List<ReconcileGapItem>();

        // Collected during the single pass over the expected set below and used for the
        // orphan diff afterwards, so the expected set is enumerated only once.
        var expectedNames = new HashSet<string>(StringComparer.Ordinal);

        // One staging timestamp / expiry shared by every row staged in this reconcile.
        var now = DateTimeOffset.UtcNow;
        var expiresAt = now + _options.PendingDeploymentTtl;

        // 3. GAP = expected items the node is MISSING (name absent locally) or STALE
        //    (local revision != expected revision). Current items are omitted.
        foreach (var exp in expected)
        {
            expectedNames.Add(exp.InstanceUniqueName);

            var isCurrent =
                localMap.TryGetValue(exp.InstanceUniqueName, out var localHash)
                && string.Equals(localHash, exp.RevisionHash, StringComparison.Ordinal);
            if (isCurrent)
            {
                continue; // node already has the current revision
            }

            // 4. Read the frozen snapshot config to stage. Null = the snapshot was deleted
            //    between the expected-set query and now (instance removed mid-reconcile);
            //    skip it — best-effort reconcile re-runs on the next node startup.
            var snapshot = await _deploymentRepository
                .GetDeployedSnapshotByInstanceIdAsync(exp.InstanceId, cancellationToken)
                .ConfigureAwait(false);
            if (snapshot is null)
            {
                _logger.LogDebug(
                    "Reconcile: snapshot for instance {Instance} disappeared (deleted race); skipping",
                    exp.InstanceUniqueName);
                continue;
            }

            var token = DeploymentFetchToken.Generate();

            // Stage with the snapshot's DeploymentId as the deploymentId so the gap item's
            // DeploymentId + token point the node at the right pending row to fetch.
            //
            // Reconcile staging is safe without a DB uniqueness guard: a gap arises only
            // from one-node-down-during-a-successful-deploy, so at most one node ever
            // reconciles a given instance (if BOTH were down the deploy failed and no
            // snapshot exists, so it is never in the expected set). Deploy-time
            // supersession serializes via the per-instance operation lock.
            var staged = await _deploymentRepository.StagePendingIfAbsentAsync(
                    exp.InstanceId, snapshot.DeploymentId, exp.RevisionHash,
                    snapshot.ConfigurationJson, token, now, expiresAt, cancellationToken)
                .ConfigureAwait(false);

            if (!staged)
            {
                // A pending row already exists — an in-flight deploy is mid-flight and its
                // replication will deliver this instance to the node shortly. Omit it from
                // the gap (reconcile is best-effort and re-runs).
                //
                // NOTE(review): a pending row left behind by an earlier reconcile whose
                // reply never reached the node would presumably land here too, until that
                // row expires — confirm against StagePendingIfAbsentAsync's semantics.
                _logger.LogDebug(
                    "Reconcile: pending row already exists for instance {Instance} (in-flight deploy); omitting from gap",
                    exp.InstanceUniqueName);
                continue;
            }

            gap.Add(new ReconcileGapItem(
                exp.InstanceUniqueName, snapshot.DeploymentId, exp.RevisionHash, exp.IsEnabled, token));
        }

        // 5. ORPHANS = names the node has locally that central no longer considers deployed.
        //    The node only LOGS these (never deletes).
        var orphans = localMap.Keys
            .Where(name => !expectedNames.Contains(name))
            .ToList();

        _logger.LogDebug(
            "Reconcile for site {SiteIdentifier} (node {NodeId}): {GapCount} gap, {OrphanCount} orphan(s)",
            request.SiteIdentifier, request.NodeId, gap.Count, orphans.Count);

        // 6. Reply.
        return new ReconcileSiteResponse(gap, orphans, baseUrl);
    }
}
|
||||
@@ -17,6 +17,11 @@ public static class ServiceCollectionExtensions
|
||||
services.AddSingleton<SiteStreamGrpcClientFactory>();
|
||||
services.AddSingleton<DebugStreamService>();
|
||||
|
||||
// Startup reconciliation handler — scoped (holds scoped repositories), resolved
|
||||
// per-request by CentralCommunicationActor inside a DI scope. Harmless on site
|
||||
// hosts: only the central actor ever resolves it.
|
||||
services.AddScoped<ReconcileService>();
|
||||
|
||||
return services;
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user