Files
scadalink-design/src/ScadaLink.AuditLog/Central/AuditLogPartitionMaintenanceService.cs

146 lines
5.9 KiB
C#

using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Hosting;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using ScadaLink.Commons.Interfaces;
namespace ScadaLink.AuditLog.Central;
/// <summary>
/// Central <see cref="IHostedService"/> (M6-T5, Bundle D) that rolls
/// <c>pf_AuditLog_Month</c> forward once a day. Each tick opens a fresh DI
/// scope, resolves <see cref="IPartitionMaintenance"/>, and calls
/// <see cref="IPartitionMaintenance.EnsureLookaheadAsync"/> to SPLIT any
/// missing future boundaries — the partition function must always cover at
/// least <see cref="AuditLogPartitionMaintenanceOptions.LookaheadMonths"/>
/// future months, otherwise inserts past the highest boundary accumulate in
/// a single unbounded tail partition that <c>SwitchOutPartitionAsync</c>
/// cannot purge cleanly.
/// </summary>
/// <remarks>
/// <para>
/// <b>Why a hosted service, not an actor.</b> Bundle C's
/// <see cref="AuditLogPurgeActor"/> sits inside the central singleton
/// because it needs supervised lifecycle alongside the rest of the
/// reconciliation / ingest pipeline. Roll-forward is genuinely a once-a-day
/// chore with no cross-actor coordination, so we use the much simpler
/// hosted-service pattern: <c>Task.Run</c> on start, <c>Task.Delay</c>
/// between ticks, cancellation on stop. Reusing
/// <see cref="IPartitionMaintenance"/> from the central node-only DI graph
/// keeps the contract testable without any actor framework involvement.
/// </para>
/// <para>
/// <b>Failure containment.</b> The tick body wraps the maintenance call in
/// a try/catch so a transient SQL Server error never tears down the hosted
/// service — the next tick simply retries. The exception is logged with
/// the original stack trace at <c>Error</c> level; ops surfaces (M6 Bundle
/// E's central health collector) can subscribe to the logger to alert on
/// repeated failures.
/// </para>
/// <para>
/// <b>Startup ordering.</b> A first tick fires immediately at
/// <see cref="StartAsync"/> so a fresh deployment doesn't need to wait
/// <see cref="AuditLogPartitionMaintenanceOptions.IntervalSeconds"/> for
/// the partition function to come up to spec. This is also what the brief
/// asks for ("Run once on startup").
/// </para>
/// <para>
/// <b>DI scope per tick.</b> <see cref="IPartitionMaintenance"/> is scoped
/// (alongside the rest of the EF repositories) because the implementation
/// reuses the per-scope <c>ScadaLinkDbContext</c>. A hosted service is a
/// singleton, so it must open and dispose a scope around each tick — the
/// same pattern <see cref="AuditLogPurgeActor"/> uses.
/// </para>
/// </remarks>
public sealed class AuditLogPartitionMaintenanceService : IHostedService, IDisposable
{
private readonly IServiceScopeFactory _scopeFactory;
private readonly IOptions<AuditLogPartitionMaintenanceOptions> _options;
private readonly ILogger<AuditLogPartitionMaintenanceService> _logger;
private CancellationTokenSource? _cts;
private Task? _loop;
public AuditLogPartitionMaintenanceService(
IServiceScopeFactory scopeFactory,
IOptions<AuditLogPartitionMaintenanceOptions> options,
ILogger<AuditLogPartitionMaintenanceService> logger)
{
_scopeFactory = scopeFactory ?? throw new ArgumentNullException(nameof(scopeFactory));
_options = options ?? throw new ArgumentNullException(nameof(options));
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
}
/// <inheritdoc />
public Task StartAsync(CancellationToken ct)
{
// Linked CTS lets StopAsync's cancellation AND the host's shutdown
// token both terminate the loop; either side firing aborts the
// pending Task.Delay.
_cts = CancellationTokenSource.CreateLinkedTokenSource(ct);
_loop = Task.Run(() => RunLoopAsync(_cts.Token));
return Task.CompletedTask;
}
private async Task RunLoopAsync(CancellationToken ct)
{
// Run once on startup so a fresh deployment isn't gated on the
// IntervalSeconds initial wait — the brief calls this out explicitly.
await SafeMaintainAsync(ct).ConfigureAwait(false);
while (!ct.IsCancellationRequested)
{
try
{
await Task.Delay(TimeSpan.FromSeconds(_options.Value.IntervalSeconds), ct)
.ConfigureAwait(false);
}
catch (OperationCanceledException)
{
break;
}
await SafeMaintainAsync(ct).ConfigureAwait(false);
}
}
private async Task SafeMaintainAsync(CancellationToken ct)
{
try
{
await using var scope = _scopeFactory.CreateAsyncScope();
var maintenance = scope.ServiceProvider.GetRequiredService<IPartitionMaintenance>();
var added = await maintenance
.EnsureLookaheadAsync(_options.Value.LookaheadMonths, ct)
.ConfigureAwait(false);
if (added.Count > 0)
{
_logger.LogInformation(
"AuditLogPartitionMaintenance added {Count} boundaries: {Boundaries}",
added.Count,
string.Join(", ", added.Select(b => b.ToString("yyyy-MM-dd"))));
}
}
catch (Exception ex)
{
// Catch-all is deliberate: the hosted service must survive every
// class of tick failure (transient SQL, DI resolution, etc.) so
// the next tick gets a chance. The brief's contract is
// "exception logged, not propagated".
_logger.LogError(ex, "AuditLogPartitionMaintenance tick failed");
}
}
/// <inheritdoc />
public Task StopAsync(CancellationToken ct)
{
_cts?.Cancel();
return _loop ?? Task.CompletedTask;
}
/// <inheritdoc />
public void Dispose()
{
_cts?.Dispose();
}
}