// Source: ScadaBridge/src/ZB.MOM.WW.ScadaBridge.KpiHistory/KpiHistoryRecorderActor.cs (C#, 306 lines, 13 KiB)

using Akka.Actor;
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Logging;
using ZB.MOM.WW.ScadaBridge.Commons.Entities.Kpi;
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Kpi;
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Repositories;
namespace ZB.MOM.WW.ScadaBridge.KpiHistory;
/// <summary>
/// Central cluster-singleton (M6 "KPI History &amp; Trends", K4) that drives the
/// KPI sampling backbone. On a periodic timer (default 60 s) it:
/// <list type="number">
/// <item>Stamps a single <c>capturedAtUtc</c> for the pass.</item>
/// <item>Fans out to every DI-registered <see cref="IKpiSampleSource"/>,
/// collecting each source's point-in-time samples.</item>
/// <item>Bulk-writes the combined batch through
/// <see cref="IKpiHistoryRepository.RecordSamplesAsync"/>.</item>
/// </list>
/// A separate daily timer (default 1 d) runs the retention purge, dropping rows
/// older than <see cref="KpiHistoryOptions.RetentionDays"/> via
/// <see cref="IKpiHistoryRepository.PurgeOlderThanAsync"/>.
/// </summary>
/// <remarks>
/// <para>
/// <b>Best-effort.</b> KPI history is observability — it must NEVER disrupt the
/// system it observes. Every fault is contained: a throwing
/// <see cref="IKpiSampleSource"/> is caught per-source so it neither aborts the
/// pass nor suppresses the other sources; a throwing repository write/purge is
/// caught and logged. No exception escapes either tick handler, so the singleton
/// stays alive across transient DB outages and misbehaving sources.
/// </para>
/// <para>
/// <b>Off-thread work.</b> The actual sampling and purge I/O runs off the actor
/// thread; the result is piped back to <see cref="Self"/> via
/// <see cref="PipeToSupport.PipeTo{T}(Task{T}, ICanTell, IActorRef, System.Func{T, object}, System.Func{System.Exception, object})"/>
/// so the mailbox is never blocked while a DB round-trip is in flight. This
/// mirrors the <see cref="ZB.MOM.WW.ScadaBridge.NotificationOutbox.NotificationOutboxActor"/>
/// timer + scope-per-tick + PipeTo pattern.
/// </para>
/// <para>
/// <b>DI scopes.</b> <see cref="IKpiSampleSource"/>s and
/// <see cref="IKpiHistoryRepository"/> are scoped EF Core-backed services; the
/// recorder opens a fresh <see cref="IServiceScope"/> per tick and resolves
/// there, asynchronously disposing it (<c>CreateAsyncScope</c>) so EF Core's
/// async connection cleanup does not block.
/// </para>
/// <para>
/// <b>Singleton wiring.</b> The <c>Props</c> is built in the Host (K5) on the
/// active central node with the constructor args resolved from DI — this actor
/// has no knowledge of <c>Akka.Cluster.Tools</c>, mirroring the Notification
/// Outbox (#21) singleton split.
/// </para>
/// </remarks>
public class KpiHistoryRecorderActor : ReceiveActor, IWithTimers
{
/// <summary>Key under which the periodic sample timer is registered in <see cref="PreStart"/>.</summary>
private const string SampleTimerKey = "kpi-sample";
/// <summary>Key under which the periodic purge timer is registered in <see cref="PreStart"/>.</summary>
private const string PurgeTimerKey = "kpi-purge";
/// <summary>DI provider from which every sample/purge pass opens its own async scope.</summary>
private readonly IServiceProvider _serviceProvider;
/// <summary>Configuration read for the sample interval, the purge interval and the retention window.</summary>
private readonly KpiHistoryOptions _options;
/// <summary>Logger used to report the faults this actor contains (source, write and purge failures).</summary>
private readonly ILogger<KpiHistoryRecorderActor> _logger;
/// <summary>
/// Lifecycle-scoped cancellation source, cancelled in <see cref="PostStop"/> so any
/// in-flight sample/purge pass observes a coordinated shutdown / failover promptly
/// instead of blocking the singleton handover for a full DB round-trip timeout.
/// Created in <see cref="PreStart"/> and reset to <c>null</c> once disposed, which is
/// why the tick handlers fall back to <see cref="CancellationToken.None"/>.
/// </summary>
private CancellationTokenSource? _shutdownCts;
/// <summary>Akka timer scheduler, assigned by the actor system via <see cref="IWithTimers"/>.</summary>
public ITimerScheduler Timers { get; set; } = null!;
/// <summary>
/// Creates the recorder: validates and captures its collaborators, then wires the
/// handlers for the sample/purge ticks and their piped-back completions.
/// </summary>
/// <param name="serviceProvider">Provider from which a DI scope is opened on every pass to resolve the sample sources and the repository.</param>
/// <param name="options">KPI history settings (sample interval, purge interval, retention).</param>
/// <param name="logger">Logger used by this actor.</param>
/// <exception cref="ArgumentNullException">Any of the arguments is <see langword="null"/>.</exception>
public KpiHistoryRecorderActor(
    IServiceProvider serviceProvider,
    KpiHistoryOptions options,
    ILogger<KpiHistoryRecorderActor> logger)
{
    // Guards run in parameter order so the first missing dependency is the one reported.
    if (serviceProvider is null)
    {
        throw new ArgumentNullException(nameof(serviceProvider));
    }

    if (options is null)
    {
        throw new ArgumentNullException(nameof(options));
    }

    if (logger is null)
    {
        throw new ArgumentNullException(nameof(logger));
    }

    _serviceProvider = serviceProvider;
    _options = options;
    _logger = logger;

    // Ticks start a pass; completions are accepted and ignored because the actor
    // keeps no per-pass state that would need resetting (best-effort).
    Receive<SampleTick>(_ => { HandleSampleTick(); });
    Receive<SampleComplete>(_ => { });
    Receive<PurgeTick>(_ => { HandlePurgeTick(); });
    Receive<PurgeComplete>(_ => { });
}
/// <inheritdoc />
/// <remarks>
/// Creates the shutdown cancellation source and registers the two periodic timers
/// that drive sampling and retention purging.
/// </remarks>
protected override void PreStart()
{
    base.PreStart();
    _shutdownCts = new CancellationTokenSource();

    // Sampling: the first tick fires after a short fixed delay so a trend point lands
    // quickly; afterwards the timer repeats every SampleInterval.
    var firstSampleDelay = TimeSpan.FromSeconds(5);
    Timers.StartPeriodicTimer(SampleTimerKey, SampleTick.Instance, firstSampleDelay, _options.SampleInterval);

    // Purging: idempotent and infrequent, so there is no early first tick — the first
    // sweep happens one full PurgeInterval after start.
    Timers.StartPeriodicTimer(PurgeTimerKey, PurgeTick.Instance, _options.PurgeInterval);
}
/// <inheritdoc />
/// <remarks>
/// Signals shutdown to any in-flight sample/purge pass and releases the cancellation
/// source. Cleanup is exception-safe: whatever <see cref="CancellationTokenSource.Cancel()"/>
/// throws, the source is still disposed and <c>base.PostStop()</c> still runs.
/// </remarks>
protected override void PostStop()
{
    // Detach the field first so nothing can pick up a token from a source that is
    // about to be disposed.
    var shutdownCts = _shutdownCts;
    _shutdownCts = null;

    try
    {
        // Cancel before disposing so an in-flight pass observes cancellation; disposing
        // first would race with a pass registering against the token.
        shutdownCts?.Cancel();
    }
    catch (ObjectDisposedException)
    {
        // Already disposed under a restarted-actor race; nothing to do.
    }
    catch (AggregateException ex)
    {
        // Cancel() surfaces exceptions thrown by callbacks registered on the token.
        // Those belong to the in-flight pass, not to actor shutdown, so they are
        // logged and contained instead of aborting the rest of the cleanup.
        _logger.LogWarning(ex, "KPI history shutdown cancellation callback(s) faulted.");
    }
    finally
    {
        shutdownCts?.Dispose();
        base.PostStop();
    }
}
/// <summary>
/// Reacts to a sample tick. The pass-wide <c>capturedAtUtc</c> instant and the shutdown
/// token are read synchronously in the handler; the sampling pass itself is started as
/// a task whose outcome is piped back to <see cref="Self"/> as a
/// <see cref="SampleComplete"/>, so the mailbox is not held up while sources are
/// collected and the batch is written.
/// </summary>
private void HandleSampleTick()
{
    var passInstant = DateTime.UtcNow;
    var shutdownToken = _shutdownCts is { } cts ? cts.Token : CancellationToken.None;

    // RunSamplePass contains its own faults, so this projection is a second line of
    // defence: even a faulted task is logged and still reported as SampleComplete.
    object OnPassFaulted(Exception fault)
    {
        _logger.LogError(fault, "KPI sample pass faulted unexpectedly.");
        return SampleComplete.Instance;
    }

    var samplePass = RunSamplePass(passInstant, shutdownToken);
    samplePass.PipeTo(
        Self,
        success: () => SampleComplete.Instance,
        failure: fault => OnPassFaulted(fault));
}
/// <summary>
/// Executes one sampling pass inside a fresh DI scope: resolves every registered
/// <see cref="IKpiSampleSource"/> and the <see cref="IKpiHistoryRepository"/>, gathers
/// the samples of each source (a failing source is logged and skipped), and writes the
/// combined batch in one call. All exceptions are handled inside, so the returned task
/// completes normally whatever happens.
/// </summary>
/// <param name="capturedAt">Instant handed to every source as the timestamp of this pass.</param>
/// <param name="cancellationToken">Shutdown token; when it fires, the pass is abandoned without being logged as a failure.</param>
private async Task RunSamplePass(DateTime capturedAt, CancellationToken cancellationToken)
{
    try
    {
        await using var passScope = _serviceProvider.CreateAsyncScope();
        var scopedServices = passScope.ServiceProvider;
        var kpiSources = scopedServices.GetServices<IKpiSampleSource>();
        var historyRepository = scopedServices.GetRequiredService<IKpiHistoryRepository>();

        var batch = new List<KpiSample>();
        foreach (var kpiSource in kpiSources)
        {
            try
            {
                batch.AddRange(await kpiSource.CollectAsync(capturedAt, cancellationToken));
            }
            catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
            {
                // Shutdown cut collection short: drop the whole pass. Whichever node is
                // active next samples on its own schedule, so this is not an error.
                return;
            }
            catch (Exception ex)
            {
                // One broken source must not take the pass, or its sibling sources, down.
                _logger.LogError(ex, "KPI source {Source} failed to collect samples.", kpiSource.Source);
            }
        }

        // Nothing collected means nothing to persist.
        if (batch.Count > 0)
        {
            try
            {
                await historyRepository.RecordSamplesAsync(batch, cancellationToken);
            }
            catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
            {
                // Shutdown cut the write short; the batch is discarded (best-effort), not an error.
            }
            catch (Exception ex)
            {
                _logger.LogError(ex, "KPI sample write failed for {SampleCount} sample(s).", batch.Count);
            }
        }
    }
    catch (Exception ex)
    {
        // Covers scope creation and service resolution: log and swallow so the task
        // still completes normally and the singleton keeps running.
        _logger.LogError(ex, "KPI sample pass failed unexpectedly.");
    }
}
/// <summary>
/// Handles a purge tick: computes the retention cut-off on the actor thread, then runs
/// the bulk delete off-thread and pipes a completion back to <see cref="Self"/>. Purges
/// are daily and idempotent, so no in-flight guard is needed.
/// </summary>
/// <remarks>
/// The cut-off arithmetic runs synchronously in the handler, so it is guarded: a
/// retention value outside the representable <see cref="TimeSpan"/> /
/// <see cref="DateTime"/> range is logged and the sweep skipped, rather than letting
/// the exception escape the handler and fault the actor.
/// </remarks>
private void HandlePurgeTick()
{
    DateTime before;
    try
    {
        // NOTE(review): a zero or negative RetentionDays puts the cut-off at or after
        // "now", so the sweep would target every stored row — confirm the options are
        // validated upstream.
        before = DateTime.UtcNow - TimeSpan.FromDays(_options.RetentionDays);
    }
    catch (Exception ex) when (ex is ArgumentException or OverflowException)
    {
        // TimeSpan.FromDays overflows for very large values and the DateTime
        // subtraction throws once the result leaves the supported range.
        _logger.LogError(
            ex,
            "KPI history purge skipped: a retention of {RetentionDays} day(s) does not yield a valid cut-off.",
            _options.RetentionDays);
        return;
    }

    var cancellationToken = _shutdownCts?.Token ?? CancellationToken.None;
    RunPurgePass(before, cancellationToken).PipeTo(
        Self,
        success: deleted =>
        {
            // Only report sweeps that actually removed something, to keep the log quiet.
            if (deleted > 0)
            {
                _logger.LogInformation(
                    "KPI history purge removed {DeletedCount} sample(s) older than {Cutoff:o}.",
                    deleted, before);
            }
            return PurgeComplete.Instance;
        },
        failure: ex =>
        {
            // Belt-and-braces: RunPurgePass contains its own faults, but a faulted task
            // must still produce a PurgeComplete.
            _logger.LogError(ex, "KPI history purge faulted unexpectedly.");
            return PurgeComplete.Instance;
        });
}
/// <summary>
/// Executes one retention sweep inside a fresh DI scope: resolves the
/// <see cref="IKpiHistoryRepository"/> and deletes every row captured before
/// <paramref name="before"/>. Faults are handled inside, so the returned task always
/// completes normally.
/// </summary>
/// <param name="before">Cut-off instant; rows captured earlier than this are purged.</param>
/// <param name="cancellationToken">Shutdown token; when it fires, the sweep is abandoned without being logged as a failure.</param>
/// <returns>The number of rows the repository reports as deleted, or 0 when the sweep was cancelled or failed.</returns>
private async Task<int> RunPurgePass(DateTime before, CancellationToken cancellationToken)
{
    try
    {
        await using var sweepScope = _serviceProvider.CreateAsyncScope();
        var historyRepository = sweepScope.ServiceProvider.GetRequiredService<IKpiHistoryRepository>();
        var removed = await historyRepository.PurgeOlderThanAsync(before, cancellationToken);

        // Returning from inside the try keeps scope disposal under the same handlers,
        // so a faulting disposal also ends in the logged "0 rows" outcome.
        return removed;
    }
    catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
    {
        // Shutdown cut the sweep short; a later sweep picks it up again. Not an error.
        return 0;
    }
    catch (Exception ex)
    {
        _logger.LogError(ex, "KPI history purge failed unexpectedly.");
        return 0;
    }
}
/// <summary>Message the actor sends to itself to start a sampling pass over all registered sources.</summary>
internal sealed class SampleTick
{
    // Private so that Instance is the only way to obtain the message.
    private SampleTick()
    {
    }

    /// <summary>The single shared instance; the message carries no data.</summary>
    public static readonly SampleTick Instance = new SampleTick();
}
/// <summary>Completion message piped back to the actor when a sampling pass ends, which is what allows the pass to run off the actor thread.</summary>
internal sealed class SampleComplete
{
    // Private so that Instance is the only way to obtain the message.
    private SampleComplete()
    {
    }

    /// <summary>The single shared instance; the message carries no data.</summary>
    public static readonly SampleComplete Instance = new SampleComplete();
}
/// <summary>Message the actor sends to itself to start a retention purge sweep.</summary>
internal sealed class PurgeTick
{
    // Private so that Instance is the only way to obtain the message.
    private PurgeTick()
    {
    }

    /// <summary>The single shared instance; the message carries no data.</summary>
    public static readonly PurgeTick Instance = new PurgeTick();
}
/// <summary>Completion message piped back to the actor when a purge sweep ends, which is what allows the sweep to run off the actor thread.</summary>
internal sealed class PurgeComplete
{
    // Private so that Instance is the only way to obtain the message.
    private PurgeComplete()
    {
    }

    /// <summary>The single shared instance; the message carries no data.</summary>
    public static readonly PurgeComplete Instance = new PurgeComplete();
}
}