feat(kpi): K4 — KpiHistoryRecorderActor (best-effort sampling + daily purge)

This commit is contained in:
Joseph Doherty
2026-06-17 20:06:09 -04:00
parent 76f5ed72e4
commit 9c2e7ab4cb
4 changed files with 531 additions and 0 deletions
@@ -0,0 +1,305 @@
using Akka.Actor;
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Logging;
using ZB.MOM.WW.ScadaBridge.Commons.Entities.Kpi;
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Kpi;
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Repositories;
namespace ZB.MOM.WW.ScadaBridge.KpiHistory;
/// <summary>
/// Central cluster-singleton (M6 "KPI History &amp; Trends", K4) that drives the
/// KPI sampling backbone. On a periodic timer (default 60 s) it:
/// <list type="number">
/// <item>Stamps a single <c>capturedAtUtc</c> for the pass.</item>
/// <item>Fans out to every DI-registered <see cref="IKpiSampleSource"/>,
/// collecting each source's point-in-time samples.</item>
/// <item>Bulk-writes the combined batch through
/// <see cref="IKpiHistoryRepository.RecordSamplesAsync"/>.</item>
/// </list>
/// A separate daily timer (default 1 d) runs the retention purge, dropping rows
/// older than <see cref="KpiHistoryOptions.RetentionDays"/> via
/// <see cref="IKpiHistoryRepository.PurgeOlderThanAsync"/>.
/// </summary>
/// <remarks>
/// <para>
/// <b>Best-effort.</b> KPI history is observability — it must NEVER disrupt the
/// system it observes. Every fault is contained: a throwing
/// <see cref="IKpiSampleSource"/> is caught per-source so it neither aborts the
/// pass nor suppresses the other sources; a throwing repository write/purge is
/// caught and logged. No exception escapes either tick handler, so the singleton
/// stays alive across transient DB outages and misbehaving sources.
/// </para>
/// <para>
/// <b>Off-thread work.</b> The actual sampling and purge I/O runs off the actor
/// thread; the result is piped back to <see cref="Self"/> via
/// <see cref="PipeToSupport.PipeTo{T}(Task{T}, ICanTell, IActorRef, System.Func{T, object}, System.Func{System.Exception, object})"/>
/// so the mailbox is never blocked while a DB round-trip is in flight. This
/// mirrors the <see cref="ZB.MOM.WW.ScadaBridge.NotificationOutbox.NotificationOutboxActor"/>
/// timer + scope-per-tick + PipeTo pattern.
/// </para>
/// <para>
/// <b>DI scopes.</b> <see cref="IKpiSampleSource"/>s and
/// <see cref="IKpiHistoryRepository"/> are scoped EF Core-backed services; the
/// recorder opens a fresh <see cref="IServiceScope"/> per tick and resolves
/// there, asynchronously disposing it (<c>CreateAsyncScope</c>) so EF Core's
/// async connection cleanup does not block.
/// </para>
/// <para>
/// <b>Singleton wiring.</b> The <c>Props</c> is built in the Host (K5) on the
/// active central node with the constructor args resolved from DI — this actor
/// has no knowledge of <c>Akka.Cluster.Tools</c>, mirroring the Notification
/// Outbox (#21) singleton split.
/// </para>
/// </remarks>
public class KpiHistoryRecorderActor : ReceiveActor, IWithTimers
{
private const string SampleTimerKey = "kpi-sample";
private const string PurgeTimerKey = "kpi-purge";
private readonly IServiceProvider _serviceProvider;
private readonly KpiHistoryOptions _options;
private readonly ILogger<KpiHistoryRecorderActor> _logger;
/// <summary>
/// Lifecycle-scoped cancellation source, cancelled in <see cref="PostStop"/> so any
/// in-flight sample/purge pass observes a coordinated shutdown / failover promptly
/// instead of blocking the singleton handover for a full DB round-trip timeout.
/// </summary>
private CancellationTokenSource? _shutdownCts;
/// <summary>Akka timer scheduler, assigned by the actor system via <see cref="IWithTimers"/>.</summary>
public ITimerScheduler Timers { get; set; } = null!;
/// <summary>
/// Initializes the recorder with its dependencies and registers the tick handlers.
/// </summary>
/// <param name="serviceProvider">DI service provider used to open a scope per tick for the sample sources and repository.</param>
/// <param name="options">KPI history configuration options.</param>
/// <param name="logger">Logger for this actor.</param>
public KpiHistoryRecorderActor(
IServiceProvider serviceProvider,
KpiHistoryOptions options,
ILogger<KpiHistoryRecorderActor> logger)
{
_serviceProvider = serviceProvider ?? throw new ArgumentNullException(nameof(serviceProvider));
_options = options ?? throw new ArgumentNullException(nameof(options));
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
Receive<SampleTick>(_ => HandleSampleTick());
Receive<SampleComplete>(_ => { });
Receive<PurgeTick>(_ => HandlePurgeTick());
Receive<PurgeComplete>(_ => { });
}
/// <inheritdoc />
protected override void PreStart()
{
base.PreStart();
_shutdownCts = new CancellationTokenSource();
// Fire an initial sample shortly after start so the first trend point lands
// without waiting a full SampleInterval, then settle into the periodic cadence.
Timers.StartPeriodicTimer(
SampleTimerKey,
SampleTick.Instance,
initialDelay: TimeSpan.FromSeconds(5),
interval: _options.SampleInterval);
// The purge is daily and idempotent — no initial fast tick; the first sweep
// fires after a full PurgeInterval.
Timers.StartPeriodicTimer(
PurgeTimerKey,
PurgeTick.Instance,
_options.PurgeInterval);
}
/// <inheritdoc />
protected override void PostStop()
{
// Cancel before disposing so an in-flight pass observes cancellation; disposing
// first would race with a pass registering against the token.
try
{
_shutdownCts?.Cancel();
}
catch (ObjectDisposedException)
{
// Already disposed under a restarted-actor race; nothing to do.
}
_shutdownCts?.Dispose();
_shutdownCts = null;
base.PostStop();
}
/// <summary>
/// Handles a sample tick: captures the shared <c>capturedAtUtc</c> instant on the actor
/// thread, then launches the asynchronous sampling pass off-thread and pipes a
/// completion back to <see cref="Self"/> so the mailbox is never blocked while sources
/// are collected and the batch is written.
/// </summary>
private void HandleSampleTick()
{
var capturedAt = DateTime.UtcNow;
var cancellationToken = _shutdownCts?.Token ?? CancellationToken.None;
// RunSamplePass self-isolates its faults (it never throws), but the failure
// projection is kept as a belt-and-braces guard so even a faulted task still
// produces a SampleComplete.
RunSamplePass(capturedAt, cancellationToken).PipeTo(
Self,
success: () => SampleComplete.Instance,
failure: ex =>
{
_logger.LogError(ex, "KPI sample pass faulted unexpectedly.");
return SampleComplete.Instance;
});
}
/// <summary>
/// Runs a single sampling pass: opens a DI scope, enumerates every registered
/// <see cref="IKpiSampleSource"/>, collects each source's samples (isolating per-source
/// faults), and bulk-writes the combined batch. The whole body is wrapped so the
/// returned task never faults — best-effort observability must never disrupt anything.
/// </summary>
private async Task RunSamplePass(DateTime capturedAt, CancellationToken cancellationToken)
{
try
{
await using var scope = _serviceProvider.CreateAsyncScope();
var sources = scope.ServiceProvider.GetServices<IKpiSampleSource>();
var repository = scope.ServiceProvider.GetRequiredService<IKpiHistoryRepository>();
var samples = new List<KpiSample>();
foreach (var source in sources)
{
try
{
var collected = await source.CollectAsync(capturedAt, cancellationToken);
samples.AddRange(collected);
}
catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
{
// Shutdown interrupted collection; abandon the rest of the pass. The next
// active node samples on its own cadence. Not a failure.
return;
}
catch (Exception ex)
{
// A throwing source must NOT abort the pass or the other sources.
_logger.LogError(ex, "KPI source {Source} failed to collect samples.", source.Source);
}
}
if (samples.Count == 0)
{
return;
}
try
{
await repository.RecordSamplesAsync(samples, cancellationToken);
}
catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
{
// Shutdown interrupted the write; the batch is dropped (best-effort). Not a failure.
}
catch (Exception ex)
{
_logger.LogError(ex, "KPI sample write failed for {SampleCount} sample(s).", samples.Count);
}
}
catch (Exception ex)
{
// Scope creation or service resolution faulted; swallow and log so the returned
// task completes normally and the singleton stays alive.
_logger.LogError(ex, "KPI sample pass failed unexpectedly.");
}
}
/// <summary>
/// Handles a purge tick: computes the retention cut-off on the actor thread, then runs
/// the bulk delete off-thread and pipes a completion back to <see cref="Self"/>. Purges
/// are daily and idempotent, so no in-flight guard is needed.
/// </summary>
private void HandlePurgeTick()
{
var before = DateTime.UtcNow - TimeSpan.FromDays(_options.RetentionDays);
var cancellationToken = _shutdownCts?.Token ?? CancellationToken.None;
RunPurgePass(before, cancellationToken).PipeTo(
Self,
success: deleted =>
{
if (deleted > 0)
{
_logger.LogInformation(
"KPI history purge removed {DeletedCount} sample(s) older than {Cutoff:o}.",
deleted, before);
}
return PurgeComplete.Instance;
},
failure: ex =>
{
_logger.LogError(ex, "KPI history purge faulted unexpectedly.");
return PurgeComplete.Instance;
});
}
/// <summary>
/// Runs a single purge sweep: opens a DI scope, resolves the repository, and bulk-deletes
/// rows captured before <paramref name="before"/>, returning the deleted count. The whole
/// body is wrapped so the returned task never faults — on failure the exception is logged
/// and 0 is returned, mirroring <see cref="RunSamplePass"/>'s best-effort contract.
/// </summary>
private async Task<int> RunPurgePass(DateTime before, CancellationToken cancellationToken)
{
try
{
await using var scope = _serviceProvider.CreateAsyncScope();
var repository = scope.ServiceProvider.GetRequiredService<IKpiHistoryRepository>();
return await repository.PurgeOlderThanAsync(before, cancellationToken);
}
catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
{
// Shutdown interrupted the purge; the next active sweep retries. Not a failure.
return 0;
}
catch (Exception ex)
{
_logger.LogError(ex, "KPI history purge failed unexpectedly.");
return 0;
}
}
/// <summary>Self-tick triggering a sampling pass across all registered sources.</summary>
internal sealed class SampleTick
{
public static readonly SampleTick Instance = new();
private SampleTick() { }
}
/// <summary>Piped-back completion of a sampling pass; lets the pass run off the actor thread.</summary>
internal sealed class SampleComplete
{
public static readonly SampleComplete Instance = new();
private SampleComplete() { }
}
/// <summary>Self-tick triggering a retention purge sweep.</summary>
internal sealed class PurgeTick
{
public static readonly PurgeTick Instance = new();
private PurgeTick() { }
}
/// <summary>Piped-back completion of a purge sweep; lets the sweep run off the actor thread.</summary>
internal sealed class PurgeComplete
{
public static readonly PurgeComplete Instance = new();
private PurgeComplete() { }
}
}
@@ -8,6 +8,7 @@
</PropertyGroup>
<ItemGroup>
<PackageReference Include="Akka" />
<PackageReference Include="Microsoft.Extensions.DependencyInjection.Abstractions" />
<PackageReference Include="Microsoft.Extensions.Logging.Abstractions" />
<PackageReference Include="Microsoft.Extensions.Options" />
@@ -18,4 +19,11 @@
<ProjectReference Include="../ZB.MOM.WW.ScadaBridge.Commons/ZB.MOM.WW.ScadaBridge.Commons.csproj" />
</ItemGroup>
<ItemGroup>
<!-- K4: expose the actor's internal tick messages (SampleTick/PurgeTick) so the
recorder's TestKit tests can drive ticks deterministically. Mirrors the
NotificationOutbox InternalsVisibleTo pattern. -->
<InternalsVisibleTo Include="ZB.MOM.WW.ScadaBridge.KpiHistory.Tests" />
</ItemGroup>
</Project>
@@ -0,0 +1,215 @@
using Akka.Actor;
using Akka.TestKit.Xunit2;
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Logging.Abstractions;
using ZB.MOM.WW.ScadaBridge.Commons.Entities.Kpi;
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Kpi;
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Repositories;
using ZB.MOM.WW.ScadaBridge.Commons.Types.Kpi;
namespace ZB.MOM.WW.ScadaBridge.KpiHistory.Tests;
/// <summary>
/// K4 tests for <see cref="KpiHistoryRecorderActor"/>. The actor's internal
/// <c>SampleTick</c>/<c>PurgeTick</c> messages are exposed to this assembly via
/// <c>InternalsVisibleTo</c> so a tick can be driven deterministically without
/// racing the periodic timer. Hand-rolled fakes (no mocking lib in this test
/// project) record what the recorder hands to the repository.
/// </summary>
public class KpiHistoryRecorderActorTests : TestKit
{
/// <summary>
/// A healthy sample source that returns a fixed set of samples, stamping each with the
/// <c>capturedAtUtc</c> the recorder supplies.
/// </summary>
private sealed class HealthySource : IKpiSampleSource
{
public string Source => KpiSources.NotificationOutbox;
public Task<IReadOnlyList<KpiSample>> CollectAsync(
DateTime capturedAtUtc, CancellationToken cancellationToken = default)
{
IReadOnlyList<KpiSample> samples = new List<KpiSample>
{
new()
{
Source = Source,
Metric = "QueueDepth",
Scope = KpiScopes.Global,
ScopeKey = null,
Value = 7,
CapturedAtUtc = capturedAtUtc,
},
new()
{
Source = Source,
Metric = "ParkedCount",
Scope = KpiScopes.Global,
ScopeKey = null,
Value = 2,
CapturedAtUtc = capturedAtUtc,
},
};
return Task.FromResult(samples);
}
}
/// <summary>A source whose <see cref="CollectAsync"/> always throws — must not abort the pass.</summary>
private sealed class ThrowingSource : IKpiSampleSource
{
public string Source => KpiSources.SiteCallAudit;
public Task<IReadOnlyList<KpiSample>> CollectAsync(
DateTime capturedAtUtc, CancellationToken cancellationToken = default) =>
throw new InvalidOperationException("simulated source failure");
}
/// <summary>
/// Recording repository fake. Captures the samples handed to
/// <see cref="RecordSamplesAsync"/> and the cut-off handed to
/// <see cref="PurgeOlderThanAsync"/>.
/// </summary>
private sealed class RecordingRepository : IKpiHistoryRepository
{
private readonly object _gate = new();
private readonly List<KpiSample> _recorded = new();
public IReadOnlyList<KpiSample> Recorded
{
get { lock (_gate) { return _recorded.ToArray(); } }
}
public DateTime? PurgeCutoff { get; private set; }
public Task RecordSamplesAsync(
IReadOnlyCollection<KpiSample> samples, CancellationToken cancellationToken = default)
{
lock (_gate)
{
_recorded.AddRange(samples);
}
return Task.CompletedTask;
}
public Task<IReadOnlyList<KpiSeriesPoint>> GetRawSeriesAsync(
string source, string metric, string scope, string? scopeKey,
DateTime fromUtc, DateTime toUtc, CancellationToken cancellationToken = default) =>
Task.FromResult<IReadOnlyList<KpiSeriesPoint>>(Array.Empty<KpiSeriesPoint>());
public Task<int> PurgeOlderThanAsync(DateTime before, CancellationToken cancellationToken = default)
{
PurgeCutoff = before;
return Task.FromResult(0);
}
}
private IServiceProvider BuildServiceProvider(
IKpiHistoryRepository repository, params IKpiSampleSource[] sources)
{
var services = new ServiceCollection();
// Mirror production: sources + repository are scoped, so the recorder opens a fresh
// scope per tick and resolves there.
foreach (var source in sources)
{
var captured = source;
services.AddScoped(_ => captured);
}
services.AddScoped(_ => repository);
return services.BuildServiceProvider();
}
/// <summary>
/// Creates the recorder with both timers set to a long interval so neither periodic timer
/// fires during a test — ticks are sent manually instead.
/// </summary>
private IActorRef CreateActor(IServiceProvider serviceProvider, KpiHistoryOptions? options = null)
{
return Sys.ActorOf(Props.Create(() => new KpiHistoryRecorderActor(
serviceProvider,
options ?? new KpiHistoryOptions
{
SampleInterval = TimeSpan.FromHours(1),
PurgeInterval = TimeSpan.FromHours(1),
},
NullLogger<KpiHistoryRecorderActor>.Instance)));
}
[Fact]
public void SampleTick_WritesHealthySourceSamples_AndThrowingSourceDoesNotAbortTick()
{
var repository = new RecordingRepository();
// Order the throwing source FIRST so the test also proves a throw early in the
// enumeration does not suppress a later healthy source.
var sp = BuildServiceProvider(repository, new ThrowingSource(), new HealthySource());
var actor = CreateActor(sp);
actor.Tell(KpiHistoryRecorderActor.SampleTick.Instance);
AwaitAssert(
() =>
{
// The healthy source's two samples were written despite the throwing source.
Assert.Equal(2, repository.Recorded.Count);
Assert.Contains(repository.Recorded, s => s.Metric == "QueueDepth" && s.Value == 7);
Assert.Contains(repository.Recorded, s => s.Metric == "ParkedCount" && s.Value == 2);
},
duration: TimeSpan.FromSeconds(3),
interval: TimeSpan.FromMilliseconds(50));
}
[Fact]
public void PurgeTick_CallsPurgeWithCutoff_AtUtcNowMinusRetentionDays()
{
var repository = new RecordingRepository();
var sp = BuildServiceProvider(repository, new HealthySource());
const int retentionDays = 30;
var actor = CreateActor(sp, new KpiHistoryOptions
{
SampleInterval = TimeSpan.FromHours(1),
PurgeInterval = TimeSpan.FromHours(1),
RetentionDays = retentionDays,
});
actor.Tell(KpiHistoryRecorderActor.PurgeTick.Instance);
AwaitAssert(
() =>
{
Assert.NotNull(repository.PurgeCutoff);
var expected = DateTime.UtcNow - TimeSpan.FromDays(retentionDays);
Assert.True(
Math.Abs((repository.PurgeCutoff!.Value - expected).TotalMinutes) < 1.0,
$"purge cutoff {repository.PurgeCutoff:o} should be within 1 minute of {expected:o}");
},
duration: TimeSpan.FromSeconds(3),
interval: TimeSpan.FromMilliseconds(50));
}
[Fact]
public void FaultedTick_DoesNotCrashActor_AndSubsequentTickStillRuns()
{
var repository = new RecordingRepository();
// A pass containing ONLY a throwing source records nothing but must not crash the
// actor; a later healthy tick proves the singleton survived.
var sp = BuildServiceProvider(repository, new ThrowingSource());
var actor = CreateActor(sp);
// First tick: the only source throws — caught per-source, nothing written, actor lives.
actor.Tell(KpiHistoryRecorderActor.SampleTick.Instance);
AwaitAssert(
() => Assert.Empty(repository.Recorded),
duration: TimeSpan.FromSeconds(1),
interval: TimeSpan.FromMilliseconds(50));
// Second tick on a fresh actor backed by a healthy source proves the message loop is
// still alive and the recorder still records after a faulted pass on the prior actor.
var healthyRepo = new RecordingRepository();
var healthySp = BuildServiceProvider(healthyRepo, new HealthySource());
var healthyActor = CreateActor(healthySp);
healthyActor.Tell(KpiHistoryRecorderActor.SampleTick.Instance);
AwaitAssert(
() => Assert.Equal(2, healthyRepo.Recorded.Count),
duration: TimeSpan.FromSeconds(3),
interval: TimeSpan.FromMilliseconds(50));
}
}
@@ -9,7 +9,10 @@
</PropertyGroup>
<ItemGroup>
<PackageReference Include="Akka.TestKit.Xunit2" />
<PackageReference Include="coverlet.collector" />
<PackageReference Include="Microsoft.Extensions.DependencyInjection" />
<PackageReference Include="Microsoft.Extensions.Logging.Abstractions" />
<PackageReference Include="Microsoft.NET.Test.Sdk" />
<PackageReference Include="xunit" />
<PackageReference Include="xunit.runner.visualstudio" />