feat(deploy): wire periodic PendingDeployment purge + SQL Server same-id re-stage test

Notify-and-fetch follow-ups:

- PendingDeploymentPurgeActor: a central cluster singleton (not readiness-gated, best-effort) that sweeps expired PendingDeployment staging rows on CommunicationOptions.PendingDeploymentPurgeInterval (default 1h). Modeled on the kpi-history-recorder pattern: self-scheduling timer, per-tick DI scope -> IDeploymentManagerRepository, continue-on-error. Wired in AkkaHostedService.RegisterCentralActors (manager + proxy + drain); resolves the deferred TODO in DeploymentService. Correctness never depends on it (supersession bounds rows to <=1/instance; the fetch endpoint enforces the TTL), so it is deliberately absent from RequiredSingletonsHealthCheck.
- SQL Server integration test for StagePendingIfAbsentAsync re-staging an instance's OWN DeploymentId over an expired row against the real UNIQUE index on DeploymentId — confirms EF orders DELETE before INSERT in one SaveChanges (SQLite's constraint timing differs from SQL Server's). Plus a same-instance supersession variant on real SQL Server.

Tests: 2 TestKit actor tests + 2 SQL Server integration tests (both ran green against the infra MSSQL container); 235 Communication + 15 PendingDeployment tests pass; Host builds 0 warnings.
2026-06-26 23:19:29 -04:00
parent d9f5fbb664
commit 06f2df4f89
6 changed files with 431 additions and 4 deletions
@@ -0,0 +1,126 @@
+using Akka.Actor;
+using Microsoft.Extensions.DependencyInjection;
+using Microsoft.Extensions.Logging;
+using Microsoft.Extensions.Options;
+using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Repositories;
+
+namespace ZB.MOM.WW.ScadaBridge.Communication.Actors;
+
+/// <summary>
+/// Central cluster singleton that periodically reclaims expired (TTL-elapsed)
+/// <c>PendingDeployment</c> staging rows from the central configuration database —
+/// the notify-and-fetch deploy transport stages each deploy's flattened config in a
+/// <c>PendingDeployment</c> row that the site fetches over HTTP using a per-deployment
+/// token, and this actor is the maintenance cadence that sweeps rows whose TTL has
+/// elapsed without a re-deploy superseding them.
+/// </summary>
+/// <remarks>
+/// <para>
+/// <b>Best-effort, not readiness-gated.</b> This purge is pure hygiene: supersession
+/// already bounds the table to ≤1 pending row per instance, and the config-fetch
+/// endpoint enforces the TTL on read (expired rows return 404), so correctness never
+/// depends on this purge running. It exists only to reclaim rows left behind by
+/// instances that are deployed once and never re-deployed. Like
+/// <c>KpiHistoryRecorderActor</c>, it is deliberately absent from
+/// <c>RequiredSingletonsHealthCheck</c> — it must never gate <c>/health/ready</c>.
+/// </para>
+/// <para>
+/// <b>Continue-on-error.</b> The tick handler swallows every exception and logs it,
+/// including failures while creating or disposing the per-tick DI scope; a transient
+/// SQL failure on one tick must not crash the singleton — the next tick retries. The
+/// per-tick try/catch (not the supervisor) is what keeps the actor alive.
+/// </para>
+/// <para>
+/// <b>DI scopes.</b> <see cref="IDeploymentManagerRepository"/> is a scoped EF Core
+/// service registered by <c>AddConfigurationDatabase</c>. The singleton opens one DI
+/// scope per tick and resolves the repository there, mirroring the
+/// <c>AuditLogPurgeActor</c> / <c>KpiHistoryRecorderActor</c> pattern.
+/// </para>
+/// </remarks>
+public class PendingDeploymentPurgeActor : ReceiveActor
+{
+    private readonly IServiceProvider _services;
+    private readonly TimeSpan _interval;
+    private readonly ILogger<PendingDeploymentPurgeActor> _logger;
+
+    // Handle for the repeating tick; created in PreStart and cancelled in PostStop.
+    private ICancelable? _timer;
+
+    /// <summary>Initializes a new instance of <see cref="PendingDeploymentPurgeActor"/> and registers the tick handler.</summary>
+    /// <param name="services">Root DI provider used to create a scoped repository per tick.</param>
+    /// <param name="options">Communication options supplying the purge interval.</param>
+    /// <param name="logger">Logger instance.</param>
+    /// <exception cref="ArgumentNullException">Any argument is <see langword="null"/>.</exception>
+    public PendingDeploymentPurgeActor(
+        IServiceProvider services,
+        IOptions<CommunicationOptions> options,
+        ILogger<PendingDeploymentPurgeActor> logger)
+    {
+        ArgumentNullException.ThrowIfNull(services);
+        ArgumentNullException.ThrowIfNull(options);
+        ArgumentNullException.ThrowIfNull(logger);
+
+        _services = services;
+        _interval = options.Value.PendingDeploymentPurgeInterval;
+        _logger = logger;
+
+        ReceiveAsync<PurgeTick>(_ => OnTickAsync());
+    }
+
+    /// <inheritdoc />
+    protected override void PreStart()
+    {
+        base.PreStart();
+
+        // The same interval is used for the initial delay and the repeat period, so the
+        // first purge pass runs one full interval after the actor starts.
+        _timer = Context.System.Scheduler.ScheduleTellRepeatedlyCancelable(
+            initialDelay: _interval,
+            interval: _interval,
+            receiver: Self,
+            message: PurgeTick.Instance,
+            sender: Self);
+    }
+
+    /// <inheritdoc />
+    protected override void PostStop()
+    {
+        _timer?.Cancel();
+        base.PostStop();
+    }
+
+    /// <summary>
+    /// Runs one purge pass: opens a DI scope, resolves the repository, deletes rows that
+    /// are expired as of <see cref="DateTimeOffset.UtcNow"/>, and logs the count when
+    /// it is greater than zero. Never throws — every failure is logged and swallowed.
+    /// </summary>
+    private async Task OnTickAsync()
+    {
+        try
+        {
+            // The scope is created and (through await using) disposed INSIDE this try, so a
+            // failure in either step — for example the root provider already being disposed
+            // during shutdown — is logged like any other tick failure instead of escaping
+            // the handler and faulting the singleton.
+            // CreateAsyncScope + await using lets the scoped EF Core DbContext
+            // (IAsyncDisposable) dispose asynchronously without blocking on connection cleanup.
+            await using var scope = _services.CreateAsyncScope();
+
+            IDeploymentManagerRepository repository;
+            try
+            {
+                repository = scope.ServiceProvider.GetRequiredService<IDeploymentManagerRepository>();
+            }
+            catch (Exception ex)
+            {
+                // Kept separate from the outer catch so a DI wiring problem is
+                // distinguishable from a failed purge in the logs.
+                _logger.LogError(ex, "Failed to resolve IDeploymentManagerRepository for PendingDeployment purge tick.");
+                return;
+            }
+
+            // Nothing after this await touches actor state (Context/Self/Sender) — only the
+            // logger and the local result — so resuming off the actor's context is harmless.
+            var purged = await repository
+                .PurgeExpiredPendingDeploymentsAsync(DateTimeOffset.UtcNow)
+                .ConfigureAwait(false);
+            if (purged > 0)
+            {
+                _logger.LogInformation(
+                    "Purged {Count} expired PendingDeployment staging row(s).", purged);
+            }
+        }
+        catch (Exception ex)
+        {
+            // Best-effort: a failed tick must not crash the singleton. The next
+            // interval retries; correctness does not depend on any single tick.
+            _logger.LogError(ex, "PendingDeployment purge tick failed; retrying on the next interval.");
+        }
+    }
+
+    /// <summary>Self-tick triggering one expired-row purge pass.</summary>
+    internal sealed class PurgeTick
+    {
+        /// <summary>The single shared tick message; the private constructor prevents other instances.</summary>
+        public static readonly PurgeTick Instance = new();
+        private PurgeTick() { }
+    }
+}
@@ -71,4 +71,13 @@ public class CommunicationOptions
    /// comfortably cover both site nodes' fetches within one deploy window.
    /// </summary>
    public TimeSpan PendingDeploymentTtl { get; set; } = TimeSpan.FromMinutes(5);
+
+    /// <summary>
+    /// How often the central <c>PendingDeploymentPurgeActor</c> singleton reclaims expired (TTL-elapsed) PendingDeployment
+    /// staging rows. The actor uses this value as both the initial delay and the repeat interval, so the first sweep runs
+    /// one full interval after the actor starts. Best-effort hygiene only: supersession bounds pending rows to ≤1 per
+    /// instance and the config-fetch endpoint already enforces the TTL, so this purge merely sweeps rows left behind by
+    /// instances that are deployed once and never re-deployed. Default 1 hour ≫ <see cref="PendingDeploymentTtl"/>.
+    /// </summary>
+    public TimeSpan PendingDeploymentPurgeInterval { get; set; } = TimeSpan.FromHours(1);
 }