306 lines
13 KiB
C#
306 lines
13 KiB
C#
using Akka.Actor;
|
|
using Microsoft.Extensions.DependencyInjection;
|
|
using Microsoft.Extensions.Logging;
|
|
using ZB.MOM.WW.ScadaBridge.Commons.Entities.Kpi;
|
|
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Kpi;
|
|
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Repositories;
|
|
|
|
namespace ZB.MOM.WW.ScadaBridge.KpiHistory;
|
|
|
|
/// <summary>
/// Central cluster-singleton (M6 "KPI History &amp; Trends", K4) that drives the
/// KPI sampling backbone. On a periodic timer (default 60 s) it:
/// <list type="number">
///   <item>Stamps a single <c>capturedAtUtc</c> for the pass.</item>
///   <item>Fans out to every DI-registered <see cref="IKpiSampleSource"/>,
///   collecting each source's point-in-time samples.</item>
///   <item>Bulk-writes the combined batch through
///   <see cref="IKpiHistoryRepository.RecordSamplesAsync"/>.</item>
/// </list>
/// A separate daily timer (default 1 d) runs the retention purge, dropping rows
/// older than <see cref="KpiHistoryOptions.RetentionDays"/> via
/// <see cref="IKpiHistoryRepository.PurgeOlderThanAsync"/>.
/// </summary>
/// <remarks>
/// <para>
/// <b>Best-effort.</b> KPI history is observability — it must NEVER disrupt the
/// system it observes. Every fault is contained: a throwing
/// <see cref="IKpiSampleSource"/> is caught per-source so it neither aborts the
/// pass nor suppresses the other sources; a throwing repository write/purge is
/// caught and logged; an unusable <see cref="KpiHistoryOptions.RetentionDays"/>
/// value skips the purge sweep instead of throwing on the actor thread. No
/// exception escapes either tick handler, so the singleton stays alive across
/// transient DB outages, misbehaving sources and misconfiguration.
/// </para>
/// <para>
/// <b>Off-thread work.</b> The actual sampling and purge I/O runs off the actor
/// thread; the result is piped back to <see cref="Self"/> via
/// <see cref="PipeToSupport.PipeTo{T}(Task{T}, ICanTell, IActorRef, System.Func{T, object}, System.Func{System.Exception, object})"/>
/// so the mailbox is never blocked while a DB round-trip is in flight. This
/// mirrors the <see cref="ZB.MOM.WW.ScadaBridge.NotificationOutbox.NotificationOutboxActor"/>
/// timer + scope-per-tick + PipeTo pattern.
/// </para>
/// <para>
/// <b>DI scopes.</b> <see cref="IKpiSampleSource"/>s and
/// <see cref="IKpiHistoryRepository"/> are scoped EF Core-backed services; the
/// recorder opens a fresh <see cref="IServiceScope"/> per tick and resolves
/// there, asynchronously disposing it (<c>CreateAsyncScope</c>) so EF Core's
/// async connection cleanup does not block.
/// </para>
/// <para>
/// <b>Singleton wiring.</b> The <c>Props</c> is built in the Host (K5) on the
/// active central node with the constructor args resolved from DI — this actor
/// has no knowledge of <c>Akka.Cluster.Tools</c>, mirroring the Notification
/// Outbox (#21) singleton split.
/// </para>
/// </remarks>
public class KpiHistoryRecorderActor : ReceiveActor, IWithTimers
{
    private const string SampleTimerKey = "kpi-sample";
    private const string PurgeTimerKey = "kpi-purge";

    private readonly IServiceProvider _serviceProvider;
    private readonly KpiHistoryOptions _options;
    private readonly ILogger<KpiHistoryRecorderActor> _logger;

    /// <summary>
    /// Lifecycle-scoped cancellation source, cancelled in <see cref="PostStop"/> so any
    /// in-flight sample/purge pass observes a coordinated shutdown / failover promptly
    /// instead of blocking the singleton handover for a full DB round-trip timeout.
    /// </summary>
    private CancellationTokenSource? _shutdownCts;

    /// <summary>Akka timer scheduler, assigned by the actor system via <see cref="IWithTimers"/>.</summary>
    public ITimerScheduler Timers { get; set; } = null!;

    /// <summary>
    /// Initializes the recorder with its dependencies and registers the tick handlers.
    /// </summary>
    /// <param name="serviceProvider">DI service provider used to open a scope per tick for the sample sources and repository.</param>
    /// <param name="options">KPI history configuration options.</param>
    /// <param name="logger">Logger for this actor.</param>
    /// <exception cref="ArgumentNullException">Any argument is <c>null</c>.</exception>
    public KpiHistoryRecorderActor(
        IServiceProvider serviceProvider,
        KpiHistoryOptions options,
        ILogger<KpiHistoryRecorderActor> logger)
    {
        _serviceProvider = serviceProvider ?? throw new ArgumentNullException(nameof(serviceProvider));
        _options = options ?? throw new ArgumentNullException(nameof(options));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));

        Receive<SampleTick>(_ => HandleSampleTick());
        Receive<SampleComplete>(_ => { }); // best-effort: no actor state to reset on completion
        Receive<PurgeTick>(_ => HandlePurgeTick());
        Receive<PurgeComplete>(_ => { }); // best-effort: no actor state to reset on completion
    }

    /// <inheritdoc />
    protected override void PreStart()
    {
        base.PreStart();
        _shutdownCts = new CancellationTokenSource();

        // Fire an initial sample shortly after start so the first trend point lands
        // without waiting a full SampleInterval, then settle into the periodic cadence.
        Timers.StartPeriodicTimer(
            SampleTimerKey,
            SampleTick.Instance,
            initialDelay: TimeSpan.FromSeconds(5),
            interval: _options.SampleInterval);

        // The purge is daily and idempotent — no initial fast tick; the first sweep
        // fires after a full PurgeInterval.
        Timers.StartPeriodicTimer(
            PurgeTimerKey,
            PurgeTick.Instance,
            _options.PurgeInterval);
    }

    /// <inheritdoc />
    protected override void PostStop()
    {
        // Cancel before disposing so an in-flight pass observes cancellation; disposing
        // first would race with a pass registering against the token.
        try
        {
            _shutdownCts?.Cancel();
        }
        catch (ObjectDisposedException)
        {
            // Already disposed under a restarted-actor race; nothing to do.
        }

        _shutdownCts?.Dispose();
        _shutdownCts = null;

        base.PostStop();
    }

    /// <summary>
    /// Handles a sample tick: captures the shared <c>capturedAtUtc</c> instant on the actor
    /// thread, then launches the asynchronous sampling pass off-thread and pipes a
    /// completion back to <see cref="Self"/> so the mailbox is never blocked while sources
    /// are collected and the batch is written.
    /// </summary>
    private void HandleSampleTick()
    {
        var capturedAt = DateTime.UtcNow;
        var cancellationToken = _shutdownCts?.Token ?? CancellationToken.None;

        // RunSamplePass self-isolates its faults (it never throws), but the failure
        // projection is kept as a belt-and-braces guard so even a faulted task still
        // produces a SampleComplete.
        RunSamplePass(capturedAt, cancellationToken).PipeTo(
            Self,
            success: () => SampleComplete.Instance,
            failure: ex =>
            {
                _logger.LogError(ex, "KPI sample pass faulted unexpectedly.");
                return SampleComplete.Instance;
            });
    }

    /// <summary>
    /// Runs a single sampling pass: opens a DI scope, enumerates every registered
    /// <see cref="IKpiSampleSource"/>, collects each source's samples (isolating per-source
    /// faults), and bulk-writes the combined batch. The whole body is wrapped so the
    /// returned task never faults — best-effort observability must never disrupt anything.
    /// </summary>
    /// <param name="capturedAt">UTC instant stamped once for the whole pass and handed to every source.</param>
    /// <param name="cancellationToken">Shutdown token; when it fires the pass is abandoned without logging a failure.</param>
    private async Task RunSamplePass(DateTime capturedAt, CancellationToken cancellationToken)
    {
        try
        {
            await using var scope = _serviceProvider.CreateAsyncScope();
            var sources = scope.ServiceProvider.GetServices<IKpiSampleSource>();
            var repository = scope.ServiceProvider.GetRequiredService<IKpiHistoryRepository>();

            var samples = new List<KpiSample>();
            foreach (var source in sources)
            {
                try
                {
                    var collected = await source.CollectAsync(capturedAt, cancellationToken);
                    samples.AddRange(collected);
                }
                catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
                {
                    // Shutdown interrupted collection; abandon the rest of the pass. The next
                    // active node samples on its own cadence. Not a failure.
                    return;
                }
                catch (Exception ex)
                {
                    // A throwing source must NOT abort the pass or the other sources.
                    _logger.LogError(ex, "KPI source {Source} failed to collect samples.", source.Source);
                }
            }

            // Nothing collected (no sources, or all of them failed/returned empty): skip the write.
            if (samples.Count == 0)
            {
                return;
            }

            try
            {
                await repository.RecordSamplesAsync(samples, cancellationToken);
            }
            catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
            {
                // Shutdown interrupted the write; the batch is dropped (best-effort). Not a failure.
            }
            catch (Exception ex)
            {
                _logger.LogError(ex, "KPI sample write failed for {SampleCount} sample(s).", samples.Count);
            }
        }
        catch (Exception ex)
        {
            // Scope creation or service resolution faulted; swallow and log so the returned
            // task completes normally and the singleton stays alive.
            _logger.LogError(ex, "KPI sample pass failed unexpectedly.");
        }
    }

    /// <summary>
    /// Handles a purge tick: computes the retention cut-off on the actor thread, then runs
    /// the bulk delete off-thread and pipes a completion back to <see cref="Self"/>. Purges
    /// are daily and idempotent, so no in-flight guard is needed. When the cut-off cannot be
    /// computed safely (see <see cref="TryComputePurgeCutoff"/>) the sweep is skipped, so a
    /// misconfigured retention can neither fault the actor nor wipe the history table.
    /// </summary>
    private void HandlePurgeTick()
    {
        if (!TryComputePurgeCutoff(out var cutoff))
        {
            return;
        }

        var cancellationToken = _shutdownCts?.Token ?? CancellationToken.None;

        RunPurgePass(cutoff, cancellationToken).PipeTo(
            Self,
            success: deleted =>
            {
                if (deleted > 0)
                {
                    _logger.LogInformation(
                        "KPI history purge removed {DeletedCount} sample(s) older than {Cutoff:o}.",
                        deleted, cutoff);
                }

                return PurgeComplete.Instance;
            },
            failure: ex =>
            {
                _logger.LogError(ex, "KPI history purge faulted unexpectedly.");
                return PurgeComplete.Instance;
            });
    }

    /// <summary>
    /// Computes the retention cut-off (<c>UtcNow - RetentionDays</c>) without ever throwing.
    /// </summary>
    /// <param name="before">On success, the UTC instant before which rows are purged; otherwise <c>default</c>.</param>
    /// <returns>
    /// <c>true</c> when a usable cut-off was produced; <c>false</c> when the sweep must be
    /// skipped because <see cref="KpiHistoryOptions.RetentionDays"/> is non-positive or so
    /// large that the arithmetic overflows.
    /// </returns>
    private bool TryComputePurgeCutoff(out DateTime before)
    {
        before = default;

        // A non-positive retention would place the cut-off at or after "now", which would
        // delete every row in the table. Refuse rather than destroy the whole history.
        if (_options.RetentionDays <= 0)
        {
            _logger.LogWarning(
                "KPI history purge skipped: RetentionDays {RetentionDays} is not positive.",
                _options.RetentionDays);
            return false;
        }

        try
        {
            before = DateTime.UtcNow - TimeSpan.FromDays(_options.RetentionDays);
            return true;
        }
        catch (Exception ex) when (ex is OverflowException or ArgumentException)
        {
            // TimeSpan.FromDays overflows (or rejects NaN) and the DateTime subtraction throws
            // ArgumentOutOfRangeException when the result precedes DateTime.MinValue. Contain
            // it here: an exception on the actor thread would fault the singleton.
            _logger.LogError(
                ex,
                "KPI history purge skipped: could not compute a retention cut-off from RetentionDays {RetentionDays}.",
                _options.RetentionDays);
            return false;
        }
    }

    /// <summary>
    /// Runs a single purge sweep: opens a DI scope, resolves the repository, and bulk-deletes
    /// rows captured before <paramref name="before"/>, returning the deleted count. The whole
    /// body is wrapped so the returned task never faults — on failure the exception is logged
    /// and 0 is returned, mirroring <see cref="RunSamplePass"/>'s best-effort contract.
    /// </summary>
    /// <param name="before">UTC cut-off; rows captured earlier than this are deleted.</param>
    /// <param name="cancellationToken">Shutdown token; when it fires the sweep ends quietly with 0.</param>
    /// <returns>The number of rows the repository reported as deleted, or 0 on cancellation/failure.</returns>
    private async Task<int> RunPurgePass(DateTime before, CancellationToken cancellationToken)
    {
        try
        {
            await using var scope = _serviceProvider.CreateAsyncScope();
            var repository = scope.ServiceProvider.GetRequiredService<IKpiHistoryRepository>();
            return await repository.PurgeOlderThanAsync(before, cancellationToken);
        }
        catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
        {
            // Shutdown interrupted the purge; the next active sweep retries. Not a failure.
            return 0;
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "KPI history purge failed unexpectedly.");
            return 0;
        }
    }

    /// <summary>Self-tick triggering a sampling pass across all registered sources.</summary>
    internal sealed class SampleTick
    {
        public static readonly SampleTick Instance = new();
        private SampleTick() { }
    }

    /// <summary>Piped-back completion of a sampling pass; lets the pass run off the actor thread.</summary>
    internal sealed class SampleComplete
    {
        public static readonly SampleComplete Instance = new();
        private SampleComplete() { }
    }

    /// <summary>Self-tick triggering a retention purge sweep.</summary>
    internal sealed class PurgeTick
    {
        public static readonly PurgeTick Instance = new();
        private PurgeTick() { }
    }

    /// <summary>Piped-back completion of a purge sweep; lets the sweep run off the actor thread.</summary>
    internal sealed class PurgeComplete
    {
        public static readonly PurgeComplete Instance = new();
        private PurgeComplete() { }
    }
}
|