scadalink-design/src/ScadaLink.AuditLog/Central/AuditLogPartitionMaintenanceService.cs

using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Hosting;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using ScadaLink.Commons.Interfaces;

namespace ScadaLink.AuditLog.Central;

/// <summary>
/// Central <see cref="IHostedService"/> (M6-T5, Bundle D) that rolls
/// <c>pf_AuditLog_Month</c> forward once a day. Each tick opens a fresh DI
/// scope, resolves <see cref="IPartitionMaintenance"/>, and calls
/// <see cref="IPartitionMaintenance.EnsureLookaheadAsync"/> to SPLIT any
/// missing future boundaries — the partition function must always cover at
/// least <see cref="AuditLogPartitionMaintenanceOptions.LookaheadMonths"/>
/// future months, otherwise inserts past the highest boundary accumulate in
/// a single unbounded tail partition that <c>SwitchOutPartitionAsync</c>
/// cannot purge cleanly.
/// </summary>
/// <remarks>
/// <para>
/// <b>Why a hosted service, not an actor.</b> Bundle C's
/// <see cref="AuditLogPurgeActor"/> sits inside the central singleton
/// because it needs supervised lifecycle alongside the rest of the
/// reconciliation / ingest pipeline. Roll-forward is genuinely a once-a-day
/// chore with no cross-actor coordination, so we use the much simpler
/// hosted-service pattern: <c>Task.Run</c> on start, <c>Task.Delay</c>
/// between ticks, cancellation on stop. Reusing
/// <see cref="IPartitionMaintenance"/> from the central node-only DI graph
/// keeps the contract testable without any actor framework involvement.
/// </para>
/// <para>
/// <b>Failure containment.</b> The tick body wraps the maintenance call in
/// a try/catch so a transient SQL Server error never tears down the hosted
/// service — the next tick simply retries. The exception is logged with
/// the original stack trace at <c>Error</c> level; ops surfaces (M6 Bundle
/// E's central health collector) can subscribe to the logger to alert on
/// repeated failures.
/// </para>
/// <para>
/// <b>Startup ordering.</b> A first tick fires immediately at
/// <see cref="StartAsync"/> so a fresh deployment doesn't need to wait
/// <see cref="AuditLogPartitionMaintenanceOptions.IntervalSeconds"/> for
/// the partition function to come up to spec. This is also what the brief
/// asks for ("Run once on startup").
/// </para>
/// <para>
/// <b>DI scope per tick.</b> <see cref="IPartitionMaintenance"/> is scoped
/// (alongside the rest of the EF repositories) because the implementation
/// reuses the per-scope <c>ScadaLinkDbContext</c>. A hosted service is a
/// singleton, so it must open and dispose a scope around each tick — the
/// same pattern <see cref="AuditLogPurgeActor"/> uses.
/// </para>
/// </remarks>
public sealed class AuditLogPartitionMaintenanceService : IHostedService, IDisposable
{
    private readonly IServiceScopeFactory _scopeFactory;
    private readonly IOptions<AuditLogPartitionMaintenanceOptions> _options;
    private readonly ILogger<AuditLogPartitionMaintenanceService> _logger;

    // Cancels the background loop. Created in StartAsync, released in Dispose.
    private CancellationTokenSource? _cts;

    // The running maintenance loop; null until StartAsync has been called.
    private Task? _loop;

    /// <summary>
    /// Creates the service. No work is started until <see cref="StartAsync"/>.
    /// </summary>
    /// <param name="scopeFactory">Factory used to open one DI scope per tick.</param>
    /// <param name="options">Supplies <c>IntervalSeconds</c> and <c>LookaheadMonths</c>; re-read on every tick.</param>
    /// <param name="logger">Receives the per-tick information, debug and error entries.</param>
    /// <exception cref="ArgumentNullException">Any argument is <c>null</c>.</exception>
    public AuditLogPartitionMaintenanceService(
        IServiceScopeFactory scopeFactory,
        IOptions<AuditLogPartitionMaintenanceOptions> options,
        ILogger<AuditLogPartitionMaintenanceService> logger)
    {
        _scopeFactory = scopeFactory ?? throw new ArgumentNullException(nameof(scopeFactory));
        _options = options ?? throw new ArgumentNullException(nameof(options));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <summary>
    /// Starts the background maintenance loop and returns immediately.
    /// </summary>
    /// <param name="ct">Signals that startup has been aborted.</param>
    /// <returns>An already-completed task; the loop runs in the background.</returns>
    public Task StartAsync(CancellationToken ct)
    {
        // The token passed to StartAsync signals that host startup was
        // aborted. Linking to it means an aborted start and a later
        // StopAsync/Dispose both terminate the loop and abort a pending
        // Task.Delay.
        _cts = CancellationTokenSource.CreateLinkedTokenSource(ct);

        // Capture the token now rather than reading the _cts field from
        // inside the lambda: the lambda runs later on a pool thread and must
        // not observe a CTS that Dispose has already released.
        var loopToken = _cts.Token;
        _loop = Task.Run(() => RunLoopAsync(loopToken), CancellationToken.None);
        return Task.CompletedTask;
    }

    /// <summary>
    /// Runs one tick immediately, then one tick after every
    /// <c>IntervalSeconds</c> delay until <paramref name="ct"/> is cancelled.
    /// </summary>
    private async Task RunLoopAsync(CancellationToken ct)
    {
        // Run once on startup so a fresh deployment isn't gated on the
        // IntervalSeconds initial wait — the brief calls this out explicitly.
        await SafeMaintainAsync(ct).ConfigureAwait(false);

        while (!ct.IsCancellationRequested)
        {
            try
            {
                // The interval is re-read from the options on every pass.
                await Task.Delay(TimeSpan.FromSeconds(_options.Value.IntervalSeconds), ct)
                    .ConfigureAwait(false);
            }
            catch (OperationCanceledException)
            {
                // Cancellation during the wait is the normal shutdown path.
                break;
            }

            await SafeMaintainAsync(ct).ConfigureAwait(false);
        }
    }

    /// <summary>
    /// Executes a single maintenance tick inside its own DI scope. Never
    /// throws: failures are logged so the next tick still gets a chance.
    /// </summary>
    private async Task SafeMaintainAsync(CancellationToken ct)
    {
        try
        {
            await using var scope = _scopeFactory.CreateAsyncScope();
            var maintenance = scope.ServiceProvider.GetRequiredService<IPartitionMaintenance>();
            var added = await maintenance
                .EnsureLookaheadAsync(_options.Value.LookaheadMonths, ct)
                .ConfigureAwait(false);
            if (added.Count > 0)
            {
                // Invariant culture keeps the logged dates Gregorian and
                // machine-readable regardless of the host's default culture.
                _logger.LogInformation(
                    "AuditLogPartitionMaintenance added {Count} boundaries: {Boundaries}",
                    added.Count,
                    string.Join(
                        ", ",
                        added.Select(b => b.ToString(
                            "yyyy-MM-dd",
                            System.Globalization.CultureInfo.InvariantCulture))));
            }
        }
        catch (OperationCanceledException) when (ct.IsCancellationRequested)
        {
            // The tick was interrupted by our own shutdown request. That is
            // not a failure, so keep it out of the Error stream that ops
            // alerting watches.
            _logger.LogDebug("AuditLogPartitionMaintenance tick cancelled by shutdown");
        }
        catch (Exception ex)
        {
            // Catch-all is deliberate: the hosted service must survive every
            // class of tick failure (transient SQL, DI resolution, etc.) so
            // the next tick gets a chance. The brief's contract is
            // "exception logged, not propagated".
            _logger.LogError(ex, "AuditLogPartitionMaintenance tick failed");
        }
    }

    /// <summary>
    /// Requests the loop to stop and waits for it to finish, but no longer
    /// than <paramref name="ct"/> allows.
    /// </summary>
    /// <param name="ct">Signals that shutdown should no longer be graceful.</param>
    /// <returns>
    /// A task that completes when the loop has exited or when
    /// <paramref name="ct"/> is cancelled, whichever happens first.
    /// </returns>
    public async Task StopAsync(CancellationToken ct)
    {
        var loop = _loop;

        try
        {
            _cts?.Cancel();
        }
        finally
        {
            if (loop is not null)
            {
                // Race the loop against the stop token so an in-flight tick
                // that is slow to observe cancellation cannot hold shutdown
                // open indefinitely.
                var finished = await Task
                    .WhenAny(loop, Task.Delay(Timeout.Infinite, ct))
                    .ConfigureAwait(false);

                if (ReferenceEquals(finished, loop))
                {
                    // Surface an unexpected loop fault to the host, exactly
                    // as returning the loop task directly would have done.
                    await loop.ConfigureAwait(false);
                }
            }
        }
    }

    /// <summary>
    /// Cancels the loop if it is still running and releases the cancellation
    /// source. Safe to call more than once.
    /// </summary>
    public void Dispose()
    {
        // Swap the field out first so a repeated Dispose (or a StopAsync that
        // runs afterwards) sees null instead of a disposed instance.
        var cts = Interlocked.Exchange(ref _cts, null);
        if (cts is null)
        {
            return;
        }

        try
        {
            // Dispose without StopAsync must still end the loop; otherwise
            // it would keep ticking for the life of the process.
            cts.Cancel();
        }
        finally
        {
            cts.Dispose();
        }
    }
}