feat(deploy): wire periodic PendingDeployment purge + SQL Server same-id re-stage test
Notify-and-fetch follow-ups: - PendingDeploymentPurgeActor: a central cluster singleton (not readiness-gated, best-effort) that sweeps expired PendingDeployment staging rows on CommunicationOptions.PendingDeploymentPurgeInterval (default 1h). Modeled on the kpi-history-recorder pattern: self-scheduling timer, per-tick DI scope -> IDeploymentManagerRepository, continue-on-error. Wired in AkkaHostedService.RegisterCentralActors (manager + proxy + drain); resolves the deferred TODO in DeploymentService. Correctness never depends on it (supersession bounds rows to <=1/instance; the fetch endpoint enforces the TTL), so it is deliberately absent from RequiredSingletonsHealthCheck. - SQL Server integration test for StagePendingIfAbsentAsync re-staging an instance's OWN DeploymentId over an expired row against the real UNIQUE index on DeploymentId — confirms EF orders DELETE before INSERT in one SaveChanges (SQLite's constraint timing differs from SQL Server's). Plus a same-instance supersession variant on real SQL Server. Tests: 2 TestKit actor tests + 2 SQL Server integration tests (both ran green against the infra MSSQL container); 235 Communication + 15 PendingDeployment tests pass; Host builds 0 warnings.
This commit is contained in:
@@ -0,0 +1,126 @@
|
||||
using Akka.Actor;
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Repositories;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.Communication.Actors;
|
||||
|
||||
/// <summary>
|
||||
/// Central cluster singleton that periodically reclaims expired (TTL-elapsed)
|
||||
/// <c>PendingDeployment</c> staging rows from the central configuration database —
|
||||
/// the notify-and-fetch deploy transport stages each deploy's flattened config in a
|
||||
/// <c>PendingDeployment</c> row that the site fetches over HTTP using a per-deployment
|
||||
/// token, and this actor is the maintenance cadence that sweeps rows whose TTL has
|
||||
/// elapsed without a re-deploy superseding them.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <para>
|
||||
/// <b>Best-effort, not readiness-gated.</b> This purge is pure hygiene: supersession
|
||||
/// already bounds the table to ≤1 pending row per instance, and the config-fetch
|
||||
/// endpoint enforces the TTL on read (expired rows return 404), so correctness never
|
||||
/// depends on this purge running. It exists only to reclaim rows left behind by
|
||||
/// instances that are deployed once and never re-deployed. Like
|
||||
/// <c>KpiHistoryRecorderActor</c>, it is deliberately absent from
|
||||
/// <c>RequiredSingletonsHealthCheck</c> — it must never gate <c>/health/ready</c>.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// <b>Continue-on-error.</b> The tick handler swallows every exception and logs it; a
|
||||
/// transient SQL failure on one tick must not crash the singleton — the next tick
|
||||
/// retries. The per-tick try/catch (not the supervisor) is what keeps the actor alive.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// <b>DI scopes.</b> <see cref="IDeploymentManagerRepository"/> is a scoped EF Core
|
||||
/// service registered by <c>AddConfigurationDatabase</c>. The singleton opens one DI
|
||||
/// scope per tick and resolves the repository there, mirroring the
|
||||
/// <c>AuditLogPurgeActor</c> / <c>KpiHistoryRecorderActor</c> pattern.
|
||||
/// </para>
|
||||
/// </remarks>
|
||||
public class PendingDeploymentPurgeActor : ReceiveActor
{
    // Root provider; a fresh DI scope is opened from it on every tick.
    private readonly IServiceProvider _services;

    // Used as both the initial delay and the repeat interval of the purge timer.
    private readonly TimeSpan _interval;

    private readonly ILogger<PendingDeploymentPurgeActor> _logger;

    // Handle to the repeating tick schedule; created in PreStart, cancelled in PostStop.
    private ICancelable? _timer;

    /// <summary>Initializes a new instance of <see cref="PendingDeploymentPurgeActor"/> and registers the tick handler.</summary>
    /// <param name="services">Root DI provider used to create a scoped repository per tick.</param>
    /// <param name="options">Communication options supplying the purge interval.</param>
    /// <param name="logger">Logger instance.</param>
    /// <exception cref="ArgumentNullException">Any argument is <see langword="null"/>.</exception>
    public PendingDeploymentPurgeActor(
        IServiceProvider services,
        IOptions<CommunicationOptions> options,
        ILogger<PendingDeploymentPurgeActor> logger)
    {
        ArgumentNullException.ThrowIfNull(services);
        ArgumentNullException.ThrowIfNull(options);
        ArgumentNullException.ThrowIfNull(logger);

        _services = services;
        _interval = options.Value.PendingDeploymentPurgeInterval;
        _logger = logger;

        ReceiveAsync<PurgeTick>(_ => OnTickAsync());
    }

    /// <summary>
    /// Schedules a repeating <see cref="PurgeTick"/> to this actor. The first tick is
    /// delivered after one full interval, then once per interval thereafter.
    /// </summary>
    protected override void PreStart()
    {
        base.PreStart();
        _timer = Context.System.Scheduler.ScheduleTellRepeatedlyCancelable(
            initialDelay: _interval,
            interval: _interval,
            receiver: Self,
            message: PurgeTick.Instance,
            sender: Self);
    }

    /// <summary>Cancels the repeating tick schedule (if one was created) when the actor stops.</summary>
    protected override void PostStop()
    {
        _timer?.Cancel();
        base.PostStop();
    }

    /// <summary>
    /// Runs one purge pass: opens a DI scope, resolves <see cref="IDeploymentManagerRepository"/>,
    /// and asks it to purge expired PendingDeployment rows as of the current UTC time.
    /// </summary>
    /// <remarks>
    /// Every failure is logged and swallowed so it never escapes the message handler. That
    /// includes failures creating or disposing the DI scope, not only failures from the
    /// repository call.
    /// </remarks>
    private async Task OnTickAsync()
    {
        try
        {
            // CreateAsyncScope + await using so the scoped EF Core DbContext (IAsyncDisposable)
            // disposes asynchronously without blocking on connection cleanup. The scope lives
            // inside the outer try so that a failure while creating or disposing it is handled
            // like any other failed tick.
            await using var scope = _services.CreateAsyncScope();

            IDeploymentManagerRepository repository;
            try
            {
                repository = scope.ServiceProvider.GetRequiredService<IDeploymentManagerRepository>();
            }
            catch (Exception ex)
            {
                _logger.LogError(ex, "Failed to resolve IDeploymentManagerRepository for PendingDeployment purge tick.");
                return;
            }

            var purged = await repository
                .PurgeExpiredPendingDeploymentsAsync(DateTimeOffset.UtcNow)
                .ConfigureAwait(false);

            // Stay quiet on empty sweeps; only report ticks that actually removed rows.
            if (purged > 0)
            {
                _logger.LogInformation(
                    "Purged {Count} expired PendingDeployment staging row(s).", purged);
            }
        }
        catch (Exception ex)
        {
            // Best-effort: a failed tick must not crash the singleton. The next
            // interval retries; correctness does not depend on any single tick.
            _logger.LogError(ex, "PendingDeployment purge tick failed; retrying on the next interval.");
        }
    }

    /// <summary>Self-tick triggering one expired-row purge pass.</summary>
    internal sealed class PurgeTick
    {
        /// <summary>The single shared instance; the private constructor prevents creating others.</summary>
        public static readonly PurgeTick Instance = new();

        private PurgeTick() { }
    }
}
|
||||
@@ -71,4 +71,13 @@ public class CommunicationOptions
|
||||
/// comfortably cover both site nodes' fetches within one deploy window.
|
||||
/// </summary>
|
||||
public TimeSpan PendingDeploymentTtl { get; set; } = TimeSpan.FromMinutes(5);
|
||||
|
||||
/// <summary>
|
||||
/// How often the central <c>PendingDeploymentPurgeActor</c> singleton reclaims
|
||||
/// expired (TTL-elapsed) PendingDeployment staging rows. Best-effort hygiene only:
|
||||
/// supersession bounds pending rows to ≤1 per instance and the config-fetch endpoint
|
||||
/// already enforces the TTL, so this purge merely sweeps rows left behind by instances
|
||||
/// that are deployed once and never re-deployed. Default 1 hour ≫ <see cref="PendingDeploymentTtl"/>.
|
||||
/// </summary>
/// <remarks>
/// <c>PendingDeploymentPurgeActor</c> reads this value once in its constructor and passes it
/// as both the initial delay and the repeat interval of its purge timer, so the first sweep
/// runs one full interval after the actor starts.
/// </remarks>
|
||||
public TimeSpan PendingDeploymentPurgeInterval { get; set; } = TimeSpan.FromHours(1);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user