feat(auditlog): AuditLogPartitionMaintenanceService monthly roll-forward (#23 M6)

This commit is contained in:
Joseph Doherty
2026-05-20 18:51:43 -04:00
parent cc2d6e91f1
commit 75b060e0a8
9 changed files with 834 additions and 0 deletions

View File

@@ -0,0 +1,37 @@
namespace ScadaLink.AuditLog.Central;
/// <summary>
/// Settings consumed by the central
/// <see cref="AuditLogPartitionMaintenanceService"/> hosted service (M6-T5).
/// Out of the box the service ticks once every 24 hours and keeps at least
/// one future monthly boundary ahead of <see cref="DateTime.UtcNow"/>.
/// </summary>
/// <remarks>
/// <para>
/// Each tick of the hosted service rolls <c>pf_AuditLog_Month</c> forward:
/// it reads the current max boundary and SPLITs new monthly boundaries
/// until at least <see cref="LookaheadMonths"/> future months are covered.
/// The 1-month default is deliberately conservative. Anything less risks an
/// end-of-month race in which inserts land in the unbounded tail partition;
/// anything more costs nothing at runtime but commits to boundaries earlier
/// than necessary.
/// </para>
/// <para>
/// A 24-hour period is the cheapest cadence that still guarantees at most
/// one missed boundary in steady state: even after a hard failover, the
/// hosted service recovers on its very next tick. Going below one hour
/// would generate more metadata churn than it saves.
/// </para>
/// </remarks>
public sealed class AuditLogPartitionMaintenanceOptions
{
    // Defaults are spelled out as named constants so the arithmetic behind
    // them is visible at a glance.
    private const int SecondsPerDay = 24 * 60 * 60;
    private const int DefaultLookaheadMonths = 1;

    /// <summary>
    /// Time between maintenance ticks, in seconds. Defaults to 86 400,
    /// i.e. 24 hours.
    /// </summary>
    public int IntervalSeconds { get; set; } = SecondsPerDay;

    /// <summary>
    /// Minimum number of future months <c>pf_AuditLog_Month</c> must cover
    /// once a tick has completed. Defaults to 1: in mid-May, for example,
    /// the partition for the next full month (June) must already exist.
    /// </summary>
    public int LookaheadMonths { get; set; } = DefaultLookaheadMonths;
}

View File

@@ -0,0 +1,145 @@
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Hosting;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using ScadaLink.Commons.Interfaces;
namespace ScadaLink.AuditLog.Central;
/// <summary>
/// Central <see cref="IHostedService"/> (M6-T5, Bundle D) that rolls
/// <c>pf_AuditLog_Month</c> forward once a day. Each tick opens a fresh DI
/// scope, resolves <see cref="IPartitionMaintenance"/>, and calls
/// <see cref="IPartitionMaintenance.EnsureLookaheadAsync"/> to SPLIT any
/// missing future boundaries — the partition function must always cover at
/// least <see cref="AuditLogPartitionMaintenanceOptions.LookaheadMonths"/>
/// future months, otherwise inserts past the highest boundary accumulate in
/// a single unbounded tail partition that <c>SwitchOutPartitionAsync</c>
/// cannot purge cleanly.
/// </summary>
/// <remarks>
/// <para>
/// <b>Why a hosted service, not an actor.</b> Bundle C's
/// <see cref="AuditLogPurgeActor"/> sits inside the central singleton
/// because it needs supervised lifecycle alongside the rest of the
/// reconciliation / ingest pipeline. Roll-forward is genuinely a once-a-day
/// chore with no cross-actor coordination, so we use the much simpler
/// hosted-service pattern: <c>Task.Run</c> on start, <c>Task.Delay</c>
/// between ticks, cancellation on stop. Reusing
/// <see cref="IPartitionMaintenance"/> from the central node-only DI graph
/// keeps the contract testable without any actor framework involvement.
/// </para>
/// <para>
/// <b>Failure containment.</b> The tick body wraps the maintenance call in
/// a try/catch so a transient SQL Server error never tears down the hosted
/// service — the next tick simply retries. The exception is logged with
/// the original stack trace at <c>Error</c> level; ops surfaces (M6 Bundle
/// E's central health collector) can subscribe to the logger to alert on
/// repeated failures. A cancellation observed while the loop's own token is
/// cancelled is shutdown, not failure, and is logged at <c>Debug</c> level
/// so a graceful stop never produces a false alarm.
/// </para>
/// <para>
/// <b>Interval hardening.</b> The configured
/// <see cref="AuditLogPartitionMaintenanceOptions.IntervalSeconds"/> is
/// sanitised before it reaches <c>Task.Delay</c>: a non-positive or
/// unreadable value falls back to the options type's default, and a value
/// too large for <c>Task.Delay</c> is clamped. Without this, zero would
/// spin the loop against the database and an out-of-range value would
/// fault the loop outside any catch block.
/// </para>
/// <para>
/// <b>Startup ordering.</b> A first tick fires immediately at
/// <see cref="StartAsync"/> so a fresh deployment doesn't need to wait
/// <see cref="AuditLogPartitionMaintenanceOptions.IntervalSeconds"/> for
/// the partition function to come up to spec. This is also what the brief
/// asks for ("Run once on startup").
/// </para>
/// <para>
/// <b>Shutdown.</b> <see cref="StopAsync"/> cancels the loop and waits for
/// it to finish, but stops waiting as soon as the token it was given is
/// cancelled, so a tick that is slow to observe cancellation cannot hold
/// the caller past its own deadline.
/// </para>
/// <para>
/// <b>DI scope per tick.</b> <see cref="IPartitionMaintenance"/> is scoped
/// (alongside the rest of the EF repositories) because the implementation
/// reuses the per-scope <c>ScadaLinkDbContext</c>. A hosted service is a
/// singleton, so it must open and dispose a scope around each tick — the
/// same pattern <see cref="AuditLogPurgeActor"/> uses.
/// </para>
/// </remarks>
public sealed class AuditLogPartitionMaintenanceService : IHostedService, IDisposable
{
    // Fallback cadence, taken from the options type itself so the two
    // defaults cannot drift apart.
    private static readonly TimeSpan DefaultInterval =
        TimeSpan.FromSeconds(new AuditLogPartitionMaintenanceOptions().IntervalSeconds);

    // Upper bound for a single delay, chosen so the millisecond count
    // always fits in an int.
    private static readonly TimeSpan MaxInterval = TimeSpan.FromMilliseconds(int.MaxValue);

    private readonly IServiceScopeFactory _scopeFactory;
    private readonly IOptions<AuditLogPartitionMaintenanceOptions> _options;
    private readonly ILogger<AuditLogPartitionMaintenanceService> _logger;
    private CancellationTokenSource? _cts;
    private Task? _loop;

    /// <summary>Creates the service; every dependency is required.</summary>
    /// <param name="scopeFactory">Factory used to open one DI scope per tick.</param>
    /// <param name="options">Interval and lookahead settings.</param>
    /// <param name="logger">Sink for tick results and failures.</param>
    /// <exception cref="ArgumentNullException">Any argument is <c>null</c>.</exception>
    public AuditLogPartitionMaintenanceService(
        IServiceScopeFactory scopeFactory,
        IOptions<AuditLogPartitionMaintenanceOptions> options,
        ILogger<AuditLogPartitionMaintenanceService> logger)
    {
        _scopeFactory = scopeFactory ?? throw new ArgumentNullException(nameof(scopeFactory));
        _options = options ?? throw new ArgumentNullException(nameof(options));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <inheritdoc />
    public Task StartAsync(CancellationToken ct)
    {
        // The linked CTS lets either StopAsync/Dispose or the token passed
        // to StartAsync terminate the loop; whichever fires first aborts
        // the pending Task.Delay.
        _cts = CancellationTokenSource.CreateLinkedTokenSource(ct);

        // Capture the token now rather than reading the _cts field inside
        // the lambda: the lambda runs later on a pool thread, by which time
        // the source may already have been disposed.
        var loopToken = _cts.Token;
        _loop = Task.Run(() => RunLoopAsync(loopToken));
        return Task.CompletedTask;
    }

    /// <summary>
    /// Runs one tick immediately, then one tick per sanitised interval until
    /// <paramref name="ct"/> is cancelled.
    /// </summary>
    private async Task RunLoopAsync(CancellationToken ct)
    {
        // Run once on startup so a fresh deployment isn't gated on the
        // IntervalSeconds initial wait — the brief calls this out explicitly.
        await SafeMaintainAsync(ct).ConfigureAwait(false);

        while (!ct.IsCancellationRequested)
        {
            try
            {
                await Task.Delay(ResolveTickInterval(), ct).ConfigureAwait(false);
            }
            catch (OperationCanceledException)
            {
                break;
            }

            await SafeMaintainAsync(ct).ConfigureAwait(false);
        }
    }

    /// <summary>
    /// Converts the configured interval into a delay that <c>Task.Delay</c>
    /// is guaranteed to accept. Never throws.
    /// </summary>
    private TimeSpan ResolveTickInterval()
    {
        int seconds;
        try
        {
            seconds = _options.Value.IntervalSeconds;
        }
        catch (Exception ex)
        {
            // Options materialisation can throw (for example on a value
            // that cannot be bound). That must not end the loop.
            _logger.LogError(
                ex,
                "AuditLogPartitionMaintenance options could not be read; using default interval {Interval}",
                DefaultInterval);
            return DefaultInterval;
        }

        if (seconds <= 0)
        {
            // Zero would spin the loop; a negative value makes Task.Delay throw.
            _logger.LogWarning(
                "AuditLogPartitionMaintenance IntervalSeconds {IntervalSeconds} is not positive; using default interval {Interval}",
                seconds,
                DefaultInterval);
            return DefaultInterval;
        }

        var interval = TimeSpan.FromSeconds(seconds);
        return interval > MaxInterval ? MaxInterval : interval;
    }

    /// <summary>
    /// Executes a single maintenance tick inside its own DI scope and logs,
    /// rather than propagates, any exception it raises.
    /// </summary>
    private async Task SafeMaintainAsync(CancellationToken ct)
    {
        try
        {
            await using var scope = _scopeFactory.CreateAsyncScope();
            var maintenance = scope.ServiceProvider.GetRequiredService<IPartitionMaintenance>();
            var added = await maintenance
                .EnsureLookaheadAsync(_options.Value.LookaheadMonths, ct)
                .ConfigureAwait(false);

            if (added.Count > 0)
            {
                _logger.LogInformation(
                    "AuditLogPartitionMaintenance added {Count} boundaries: {Boundaries}",
                    added.Count,
                    string.Join(", ", added.Select(b => b.ToString("yyyy-MM-dd"))));
            }
        }
        catch (OperationCanceledException) when (ct.IsCancellationRequested)
        {
            // The loop token was cancelled mid-tick: this is shutdown, not
            // a maintenance failure, so it must not be reported at Error.
            _logger.LogDebug("AuditLogPartitionMaintenance tick cancelled by shutdown");
        }
        catch (Exception ex)
        {
            // Catch-all is deliberate: the hosted service must survive every
            // class of tick failure (transient SQL, DI resolution, etc.) so
            // the next tick gets a chance. The brief's contract is
            // "exception logged, not propagated".
            _logger.LogError(ex, "AuditLogPartitionMaintenance tick failed");
        }
    }

    /// <inheritdoc />
    public async Task StopAsync(CancellationToken ct)
    {
        var loop = _loop;
        if (loop is null)
        {
            // Never started: nothing to cancel or wait for.
            return;
        }

        Task finished;
        try
        {
            _cts?.Cancel();
        }
        finally
        {
            // Wait for the loop to drain, but stop waiting once the stop
            // token fires so a slow tick cannot block the caller forever.
            finished = await Task
                .WhenAny(loop, Task.Delay(Timeout.Infinite, ct))
                .ConfigureAwait(false);
        }

        if (ReferenceEquals(finished, loop))
        {
            // Surface an unexpected loop fault to the caller, exactly as
            // returning the loop task directly would have done.
            await loop.ConfigureAwait(false);
        }
    }

    /// <inheritdoc />
    public void Dispose()
    {
        var cts = _cts;
        if (cts is null)
        {
            return;
        }

        // Clearing the field first makes a repeated Dispose, or a StopAsync
        // issued after Dispose, a harmless no-op.
        _cts = null;
        try
        {
            // Cancel before disposing so a loop that was started but never
            // stopped does not keep ticking against a disposed source.
            cts.Cancel();
        }
        finally
        {
            cts.Dispose();
        }
    }
}

View File

@@ -1,6 +1,7 @@
using Microsoft.Extensions.Configuration;
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.DependencyInjection.Extensions;
using Microsoft.Extensions.Hosting;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using ScadaLink.AuditLog.Central;
@@ -43,6 +44,9 @@ public static class ServiceCollectionExtensions
/// <summary>Configuration section bound to <see cref="SiteAuditTelemetryOptions"/>.</summary>
public const string SiteTelemetrySectionName = "AuditLog:SiteTelemetry";
/// <summary>Configuration section bound to <see cref="AuditLogPartitionMaintenanceOptions"/>.</summary>
/// <remarks>
/// Passed to <c>IConfiguration.GetSection</c> by
/// <c>AddAuditLogCentralMaintenance</c>, which binds the resulting section
/// onto the options type.
/// </remarks>
public const string PartitionMaintenanceSectionName = "AuditLog:PartitionMaintenance";
/// <summary>
/// Registers the Audit Log (#23) component services: options, the site
/// SQLite writer chain (primary + ring fallback + failure-counter sink),
@@ -216,4 +220,37 @@ public static class ServiceCollectionExtensions
ServiceDescriptor.Singleton<IAuditRedactionFailureCounter, HealthMetricsAuditRedactionFailureCounter>());
return services;
}
/// <summary>
/// Audit Log (#23) M6-T5 Bundle D — central-only wiring for the
/// <see cref="AuditLogPartitionMaintenanceService"/> hosted service and the
/// <see cref="AuditLogPartitionMaintenanceOptions"/> it reads. Call this
/// from the Central role's composition root only, never from a site
/// composition root: the <c>IPartitionMaintenance</c> implementation the
/// service resolves is registered by <c>AddConfigurationDatabase</c> and
/// exists solely on the central node.
/// </summary>
/// <remarks>
/// <para>
/// This lives outside <see cref="AddAuditLog"/> because <c>AddAuditLog</c>
/// is invoked from site composition roots as well. Starting the hosted
/// service there would resolve an unregistered dependency and fail on
/// every tick. A dedicated helper keeps the "every <c>Add*</c> call is
/// safe to issue from any composition root" invariant intact.
/// </para>
/// </remarks>
/// <param name="services">Service collection to register into.</param>
/// <param name="config">
/// Configuration root containing the
/// <see cref="PartitionMaintenanceSectionName"/> section.
/// </param>
/// <returns>The same <paramref name="services"/> instance, for chaining.</returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="services"/> or <paramref name="config"/> is <c>null</c>.
/// </exception>
public static IServiceCollection AddAuditLogCentralMaintenance(
    this IServiceCollection services,
    IConfiguration config)
{
    if (services is null)
    {
        throw new ArgumentNullException(nameof(services));
    }

    if (config is null)
    {
        throw new ArgumentNullException(nameof(config));
    }

    // Bind the maintenance section onto the options type.
    var optionsBuilder = services.AddOptions<AuditLogPartitionMaintenanceOptions>();
    var maintenanceSection = config.GetSection(PartitionMaintenanceSectionName);
    optionsBuilder.Bind(maintenanceSection);

    // Register the daily roll-forward loop with the host.
    services.AddHostedService<AuditLogPartitionMaintenanceService>();

    return services;
}
}