feat(deploy): wire periodic PendingDeployment purge + SQL Server same-id re-stage test

Notify-and-fetch follow-ups:

- PendingDeploymentPurgeActor: a central cluster singleton (not
  readiness-gated, best-effort) that sweeps expired PendingDeployment
  staging rows on CommunicationOptions.PendingDeploymentPurgeInterval
  (default 1h). Modeled on the kpi-history-recorder pattern: self-scheduling
  timer, per-tick DI scope -> IDeploymentManagerRepository, continue-on-error.
  Wired in AkkaHostedService.RegisterCentralActors (manager + proxy + drain);
  resolves the deferred TODO in DeploymentService. Correctness never depends
  on it (supersession bounds rows to <=1/instance; the fetch endpoint enforces
  the TTL), so it is deliberately absent from RequiredSingletonsHealthCheck.

- SQL Server integration test for StagePendingIfAbsentAsync re-staging an
  instance's OWN DeploymentId over an expired row against the real UNIQUE
  index on DeploymentId — confirms EF orders DELETE before INSERT in one
  SaveChanges (SQLite's constraint timing differs from SQL Server's). Plus
  a same-instance supersession variant on real SQL Server.

Tests: 2 TestKit actor tests + 2 SQL Server integration tests (both ran
green against the infra MSSQL container); 235 Communication + 15
PendingDeployment tests pass; Host builds 0 warnings.
This commit is contained in:
Joseph Doherty
2026-06-26 23:19:29 -04:00
parent d9f5fbb664
commit 06f2df4f89
6 changed files with 431 additions and 4 deletions
@@ -0,0 +1,126 @@
using Akka.Actor;
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Repositories;
namespace ZB.MOM.WW.ScadaBridge.Communication.Actors;
/// <summary>
/// Central cluster singleton that periodically reclaims expired (TTL-elapsed)
/// <c>PendingDeployment</c> staging rows from the central configuration database —
/// the notify-and-fetch deploy transport stages each deploy's flattened config in a
/// <c>PendingDeployment</c> row that the site fetches over HTTP using a per-deployment
/// token, and this actor is the maintenance cadence that sweeps rows whose TTL has
/// elapsed without a re-deploy superseding them.
/// </summary>
/// <remarks>
/// <para>
/// <b>Best-effort, not readiness-gated.</b> This purge is pure hygiene: supersession
/// already bounds the table to ≤1 pending row per instance, and the config-fetch
/// endpoint enforces the TTL on read (expired rows return 404), so correctness never
/// depends on this purge running. It exists only to reclaim rows left behind by
/// instances that are deployed once and never re-deployed. Like
/// <c>KpiHistoryRecorderActor</c>, it is deliberately absent from
/// <c>RequiredSingletonsHealthCheck</c> — it must never gate <c>/health/ready</c>.
/// </para>
/// <para>
/// <b>Continue-on-error.</b> The tick handler swallows every exception and logs it; a
/// transient SQL failure on one tick must not crash the singleton — the next tick
/// retries. The per-tick try/catch (not the supervisor) is what keeps the actor alive.
/// </para>
/// <para>
/// <b>DI scopes.</b> <see cref="IDeploymentManagerRepository"/> is a scoped EF Core
/// service registered by <c>AddConfigurationDatabase</c>. The singleton opens one DI
/// scope per tick and resolves the repository there, mirroring the
/// <c>AuditLogPurgeActor</c> / <c>KpiHistoryRecorderActor</c> pattern.
/// </para>
/// </remarks>
public class PendingDeploymentPurgeActor : ReceiveActor
{
    /// <summary>
    /// Interval used when the configured purge interval is zero or negative. Matches the
    /// default of <see cref="CommunicationOptions.PendingDeploymentPurgeInterval"/>.
    /// </summary>
    private static readonly TimeSpan FallbackInterval = TimeSpan.FromHours(1);

    private readonly IServiceProvider _services;
    private readonly TimeSpan _interval;
    private readonly ILogger<PendingDeploymentPurgeActor> _logger;
    private ICancelable? _timer;

    /// <summary>Initializes a new instance of <see cref="PendingDeploymentPurgeActor"/> and registers the tick handler.</summary>
    /// <param name="services">Root DI provider used to create a scoped repository per tick.</param>
    /// <param name="options">
    /// Communication options supplying the purge interval. A zero or negative interval is
    /// replaced by a 1-hour fallback (with a warning) instead of failing actor start-up.
    /// </param>
    /// <param name="logger">Logger instance.</param>
    /// <exception cref="ArgumentNullException">Any argument is <see langword="null"/>.</exception>
    public PendingDeploymentPurgeActor(
        IServiceProvider services,
        IOptions<CommunicationOptions> options,
        ILogger<PendingDeploymentPurgeActor> logger)
    {
        ArgumentNullException.ThrowIfNull(services);
        ArgumentNullException.ThrowIfNull(options);
        ArgumentNullException.ThrowIfNull(logger);

        _services = services;
        _logger = logger;

        // The scheduler rejects a non-positive repeat interval, which would fail PreStart
        // and take this best-effort singleton down over a configuration mistake. Fall back
        // to the documented default rather than crash.
        var configured = options.Value.PendingDeploymentPurgeInterval;
        if (configured > TimeSpan.Zero)
        {
            _interval = configured;
        }
        else
        {
            _logger.LogWarning(
                "Configured PendingDeploymentPurgeInterval {Configured} is not positive; falling back to {Fallback}.",
                configured,
                FallbackInterval);
            _interval = FallbackInterval;
        }

        ReceiveAsync<PurgeTick>(_ => OnTickAsync());
    }

    /// <summary>
    /// Starts the repeating purge timer. The first tick fires one full interval after
    /// start, and every subsequent tick one interval after the previous one.
    /// </summary>
    protected override void PreStart()
    {
        base.PreStart();
        _timer = Context.System.Scheduler.ScheduleTellRepeatedlyCancelable(
            initialDelay: _interval,
            interval: _interval,
            receiver: Self,
            message: PurgeTick.Instance,
            sender: Self);
    }

    /// <summary>Cancels the repeating purge timer so no further ticks are sent to this actor.</summary>
    protected override void PostStop()
    {
        _timer?.Cancel();
        base.PostStop();
    }

    /// <summary>
    /// Runs one purge pass: opens a DI scope, resolves the repository, and deletes
    /// PendingDeployment rows that are expired as of the current UTC time.
    /// </summary>
    /// <remarks>
    /// Never throws. Scope creation, repository resolution, the purge call and scope
    /// disposal are all inside the outer try/catch, so a failure anywhere in the tick is
    /// logged and the next interval retries.
    /// </remarks>
    private async Task OnTickAsync()
    {
        try
        {
            // CreateAsyncScope + await using so the scoped EF Core DbContext (IAsyncDisposable)
            // disposes asynchronously without blocking on connection cleanup. The scope lives
            // inside the try block so that creation and disposal failures are swallowed too.
            await using var scope = _services.CreateAsyncScope();

            IDeploymentManagerRepository repository;
            try
            {
                repository = scope.ServiceProvider.GetRequiredService<IDeploymentManagerRepository>();
            }
            catch (Exception ex)
            {
                // Kept separate from the generic handler below so a DI wiring problem is
                // distinguishable from a database failure in the logs.
                _logger.LogError(ex, "Failed to resolve IDeploymentManagerRepository for PendingDeployment purge tick.");
                return;
            }

            var purged = await repository
                .PurgeExpiredPendingDeploymentsAsync(DateTimeOffset.UtcNow)
                .ConfigureAwait(false);

            // Stay quiet on empty sweeps; only log when rows were actually reclaimed.
            if (purged > 0)
            {
                _logger.LogInformation(
                    "Purged {Count} expired PendingDeployment staging row(s).", purged);
            }
        }
        catch (Exception ex)
        {
            // Best-effort: a failed tick must not crash the singleton. The next
            // interval retries; correctness does not depend on any single tick.
            _logger.LogError(ex, "PendingDeployment purge tick failed; retrying on the next interval.");
        }
    }

    /// <summary>Self-tick triggering one expired-row purge pass.</summary>
    internal sealed class PurgeTick
    {
        /// <summary>Shared singleton instance; the message carries no state.</summary>
        public static readonly PurgeTick Instance = new();

        private PurgeTick() { }
    }
}
@@ -71,4 +71,13 @@ public class CommunicationOptions
/// comfortably cover both site nodes' fetches within one deploy window.
/// </summary>
public TimeSpan PendingDeploymentTtl { get; set; } = TimeSpan.FromMinutes(5);
/// <summary>
/// How often the central <c>PendingDeploymentPurgeActor</c> singleton reclaims
/// expired (TTL-elapsed) PendingDeployment staging rows. Best-effort hygiene only:
/// supersession bounds pending rows to ≤1 per instance and the config-fetch endpoint
/// already enforces the TTL, so this purge merely sweeps rows left behind by instances
/// that are deployed once and never re-deployed. Default 1 hour ≫ <see cref="PendingDeploymentTtl"/>.
/// </summary>
/// <remarks>
/// <c>PendingDeploymentPurgeActor</c> hands this value to the scheduler as both the
/// initial delay and the repeat interval, so the first sweep runs one full interval
/// after the actor starts rather than immediately.
/// </remarks>
public TimeSpan PendingDeploymentPurgeInterval { get; set; } = TimeSpan.FromHours(1);
}