1148 lines
50 KiB
C#
1148 lines
50 KiB
C#
using Akka.Actor;
|
|
using Microsoft.Extensions.DependencyInjection;
|
|
using Microsoft.Extensions.Logging;
|
|
using ZB.MOM.WW.ScadaBridge.AuditLog.Central;
|
|
using ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit;
|
|
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Repositories;
|
|
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Audit;
|
|
using ZB.MOM.WW.ScadaBridge.Commons.Messages.RemoteQuery;
|
|
using ZB.MOM.WW.ScadaBridge.Commons.Types;
|
|
using ZB.MOM.WW.ScadaBridge.Commons.Types.Audit;
|
|
using ZB.MOM.WW.ScadaBridge.Communication;
|
|
|
|
namespace ZB.MOM.WW.ScadaBridge.SiteCallAudit;
|
|
|
|
/// <summary>
|
|
/// Central singleton for Site Call Audit (#22). Receives
|
|
/// <see cref="UpsertSiteCallCommand"/> messages and persists each
|
|
/// <see cref="ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit.SiteCall"/> row via
|
|
/// <see cref="ISiteCallAuditRepository.UpsertAsync"/> — idempotent monotonic
|
|
/// upsert. Out-of-order or duplicate updates are silent no-ops at the
|
|
/// repository layer; the actor always replies <see cref="UpsertSiteCallReply"/>
|
|
/// with <c>Accepted=true</c> in that case because storage state is consistent
|
|
/// and the site is free to consider its packet acked.
|
|
/// </summary>
|
|
/// <remarks>
|
|
/// <para>
|
|
/// Implemented: direct <see cref="UpsertSiteCallCommand"/> telemetry ingest,
|
|
/// query, detail and KPI handlers (Task 4), the central→site Retry/Discard
|
|
/// relay (Task 5 — the relay handlers live in this actor), the periodic
|
|
/// per-site reconciliation puller that backfills lost telemetry (Piece A —
|
|
/// <see cref="OnReconciliationTickAsync"/>, the documented self-heal pull), and
|
|
/// the daily terminal-row purge scheduler (Piece B —
|
|
/// <see cref="OnPurgeTickAsync"/>, which invokes
|
|
/// <see cref="ISiteCallAuditRepository.PurgeTerminalAsync"/> on a timer). Both
|
|
/// background timers are started in <see cref="PreStart"/> and gate on the
|
|
/// reconciliation collaborators (<see cref="IPullSiteCallsClient"/> +
|
|
/// <see cref="ISiteEnumerator"/>) being available — the repo-only test ctor
|
|
/// injects neither, so neither timer runs there.
|
|
/// </para>
|
|
/// <para>
|
|
/// Per CLAUDE.md "audit-write failure NEVER aborts the user-facing action" —
|
|
/// the actor catches every exception from the repository call and replies
|
|
/// <c>Accepted=false</c> without rethrowing, so the central singleton stays
|
|
/// alive. The <see cref="SupervisorStrategy"/> override governs the actor's
|
|
/// <em>children</em>, not the actor itself; this actor has no children today,
|
|
/// so the override is currently inert. It returns a one-for-one strategy with
|
|
/// <see cref="Akka.Actor.SupervisorStrategy.DefaultDecider"/> (Restart on most
|
|
/// exceptions, Stop on <see cref="ActorInitializationException"/> /
|
|
/// <see cref="ActorKilledException"/>) and <c>maxNrOfRetries: 0</c>, so any
|
|
/// future child that throws is Stopped on the first failure — a deliberate
|
|
/// "fail loudly" posture for the central singleton's eventual sub-actors
|
|
/// (reconciliation puller, purge scheduler). Self-supervision of this actor
|
|
/// is whatever the parent <see cref="Akka.Cluster.Tools.Singleton.ClusterSingletonManager"/>
|
|
/// supplies; the in-handler <c>try/catch</c> in <see cref="OnUpsertAsync"/>
|
|
/// is what actually keeps the singleton alive across repository faults.
|
|
/// </para>
|
|
/// <para>
|
|
/// Separate production and test constructors exist for the same reason as
|
|
/// <c>AuditLogIngestActor</c>: production wiring (Bundle F) resolves the
|
|
/// scoped EF repository from a fresh DI scope per message because the actor
|
|
/// is a long-lived cluster singleton, while tests inject a concrete
|
|
/// <see cref="ISiteCallAuditRepository"/> against a per-test MSSQL fixture
|
|
/// so the actor exercises the real monotonic upsert SQL end to end.
|
|
/// </para>
|
|
/// </remarks>
|
|
public class SiteCallAuditActor : ReceiveActor
|
|
{
|
|
/// <summary>Upper bound applied to the page size requested by a <see cref="SiteCallQueryRequest"/>.</summary>
private const int MaxPageSize = 200;

/// <summary>Root provider used to open a DI scope per message/tick; assigned only by the production constructor.</summary>
private readonly IServiceProvider? _serviceProvider;

/// <summary>Repository handed in directly by the test constructors; when present no DI scope is opened.</summary>
private readonly ISiteCallAuditRepository? _injectedRepository;

/// <summary>Actor configuration (stuck threshold, KPI interval, timer intervals, retention, batch size).</summary>
private readonly SiteCallAuditOptions _options;

/// <summary>Logger used for diagnostics and fault reporting.</summary>
private readonly ILogger<SiteCallAuditActor> _logger;

/// <summary>
/// Reconciliation collaborator (Piece A): the per-site self-heal pull client.
/// The production constructor resolves it once from the root
/// <see cref="IServiceProvider"/> (a central singleton registered by
/// <c>AddAuditLogCentralReconciliationClient</c>); the reconciliation test
/// constructor injects it directly. It stays <c>null</c> when the repo-only
/// test constructor is used, in which case neither the reconciliation tick
/// (<see cref="StartReconciliationTimer"/>) nor the purge tick
/// (<see cref="StartPurgeTimer"/>) is started.
/// </summary>
private readonly IPullSiteCallsClient? _pullClient;

/// <summary>
/// Reconciliation collaborator (Piece A): supplies the list of sites to
/// reconcile. Resolved/injected and gated exactly like <see cref="_pullClient"/>.
/// </summary>
private readonly ISiteEnumerator? _siteEnumerator;

/// <summary>
/// Per-site reconciliation watermark: the highest
/// <see cref="SiteCall.UpdatedAtUtc"/> observed for the site on an earlier
/// tick. The following tick requests rows at or after this value, and the
/// idempotent monotonic <see cref="ISiteCallAuditRepository.UpsertAsync"/>
/// absorbs rows that repeat the boundary timestamp. The map lives only in
/// memory, so a failover or restart starts every site again from
/// <see cref="DateTime.MinValue"/> — conservative but correct, since the
/// re-pulled rows are deduplicated by the upsert. Mirrors
/// <c>SiteAuditReconciliationActor</c>.
/// </summary>
private readonly Dictionary<string, DateTime> _reconciliationCursors = new Dictionary<string, DateTime>();

/// <summary>Handle for the repeating reconciliation tick; <c>null</c> while the tick is gated off.</summary>
private ICancelable? _reconciliationTimer;

/// <summary>Handle for the repeating purge tick; <c>null</c> while the tick is gated off.</summary>
private ICancelable? _purgeTimer;

/// <summary>
/// Task 5 (#22): the central→site command transport — the
/// <c>CentralCommunicationActor</c>, which owns the per-site
/// <c>ClusterClient</c> map and routes a <see cref="SiteEnvelope"/> to the
/// owning site. Assigned when a <see cref="RegisterCentralCommunication"/>
/// message is handled; the Host sends it after both actors exist (this actor
/// is a cluster singleton and the transport actor is created separately).
/// Until then the field is <c>null</c>, and a relay arriving that early is
/// answered with a <see cref="SiteCallRelayOutcome.SiteUnreachable"/> outcome
/// because there is no route to any site yet.
/// </summary>
private IActorRef? _centralCommunication;
|
|
|
|
/// <summary>
|
|
/// Test-mode constructor — injects a concrete repository instance whose
|
|
/// lifetime exceeds the test, so the actor reuses the same instance
|
|
/// across every message. Used by Bundle C's MSSQL-backed TestKit fixture.
|
|
/// An optional <paramref name="options"/> lets a test pin the stuck/KPI
|
|
/// windows; when omitted the production defaults apply.
|
|
/// <para>
|
|
/// This ctor injects NO reconciliation client/enumerator, so the
|
|
/// reconciliation tick is gated off (see <see cref="StartReconciliationTimer"/>)
|
|
/// — the MSSQL-backed read/upsert tests must not fire phantom pulls.
|
|
/// </para>
|
|
/// </summary>
|
|
/// <param name="repository">Concrete repository instance to use for all messages.</param>
|
|
/// <param name="logger">Logger for diagnostics and error reporting.</param>
|
|
/// <param name="options">Optional configuration overrides; production defaults apply when null.</param>
|
|
public SiteCallAuditActor(
    ISiteCallAuditRepository repository,
    ILogger<SiteCallAuditActor> logger,
    SiteCallAuditOptions? options = null)
{
    // Guards run in parameter order, so the first null argument is the one reported.
    _injectedRepository = repository ?? throw new ArgumentNullException(nameof(repository));
    _logger = logger ?? throw new ArgumentNullException(nameof(logger));

    // No options supplied: fall back to a default-constructed instance.
    _options = options is null ? new SiteCallAuditOptions() : options;

    // Neither reconciliation collaborator is assigned by this constructor,
    // so both background timers remain gated off when the actor starts.
    RegisterHandlers();
}
|
|
|
|
/// <summary>
|
|
/// Test-mode constructor for the reconciliation tick (Piece A) — injects a
|
|
/// concrete repository PLUS the two reconciliation collaborators directly,
|
|
/// so the per-site self-heal pull is unit-testable in-memory without a DI
|
|
/// container or a live gRPC channel. Because the client + enumerator are
|
|
/// present, the reconciliation tick IS started; the purge tick is also
|
|
/// started (both gate on the collaborators being available — see
|
|
/// <see cref="StartReconciliationTimer"/> / <see cref="StartPurgeTimer"/>).
|
|
/// </summary>
|
|
/// <param name="repository">Concrete repository instance used for upserts and purges.</param>
|
|
/// <param name="siteEnumerator">Enumerates the sites to reconcile each tick.</param>
|
|
/// <param name="pullClient">Pull client used to fetch changed rows from each site.</param>
|
|
/// <param name="logger">Logger for diagnostics and error reporting.</param>
|
|
/// <param name="options">Optional configuration overrides; production defaults apply when null.</param>
|
|
/// <remarks>
|
|
/// Public (not internal) because Akka's default <c>ActivatorProducer</c>
|
|
/// instantiates the actor via reflection with public-only binding flags —
|
|
/// an internal ctor yields a <c>MissingMethodException</c> at actor
|
|
/// creation. Distinguished from the production <see cref="IServiceProvider"/>
|
|
/// ctor by its concrete-collaborator parameter list; only the test project
|
|
/// (or a host that hand-resolves the collaborators) constructs it this way.
|
|
/// </remarks>
|
|
public SiteCallAuditActor(
    ISiteCallAuditRepository repository,
    ISiteEnumerator siteEnumerator,
    IPullSiteCallsClient pullClient,
    ILogger<SiteCallAuditActor> logger,
    SiteCallAuditOptions? options = null)
{
    // Guards run in parameter order, so the first null argument is the one reported.
    _injectedRepository = repository ?? throw new ArgumentNullException(nameof(repository));
    _siteEnumerator = siteEnumerator ?? throw new ArgumentNullException(nameof(siteEnumerator));
    _pullClient = pullClient ?? throw new ArgumentNullException(nameof(pullClient));
    _logger = logger ?? throw new ArgumentNullException(nameof(logger));

    // No options supplied: fall back to a default-constructed instance.
    _options = options is null ? new SiteCallAuditOptions() : options;

    // Both collaborators are set, so the timer gates in PreStart will open.
    RegisterHandlers();
}
|
|
|
|
/// <summary>
|
|
/// Production constructor — resolves <see cref="ISiteCallAuditRepository"/>
|
|
/// from a fresh DI scope per message because the repository is a scoped EF
|
|
/// Core service registered by <c>AddConfigurationDatabase</c>. The actor
|
|
/// itself is a long-lived cluster singleton, so it cannot hold a scope
|
|
/// across messages.
|
|
/// </summary>
|
|
/// <param name="serviceProvider">DI service provider used to create a scoped repository per message.</param>
|
|
/// <param name="options">Actor configuration (stuck threshold, KPI interval, relay timeout).</param>
|
|
/// <param name="logger">Logger for diagnostics and error reporting.</param>
|
|
public SiteCallAuditActor(
    IServiceProvider serviceProvider,
    SiteCallAuditOptions options,
    ILogger<SiteCallAuditActor> logger)
{
    // Guards run in parameter order, so the first null argument is the one reported.
    _serviceProvider = serviceProvider ?? throw new ArgumentNullException(nameof(serviceProvider));
    _options = options ?? throw new ArgumentNullException(nameof(options));
    _logger = logger ?? throw new ArgumentNullException(nameof(logger));

    // The reconciliation collaborators (Piece A) are resolved a single time
    // from the root provider; only the repository is resolved per message or
    // per tick from a scope. GetService is used on purpose instead of
    // GetRequiredService: a host that did not register the collaborators gets
    // null here, which simply leaves the background timers switched off
    // rather than failing actor construction.
    _pullClient = serviceProvider.GetService<IPullSiteCallsClient>();
    _siteEnumerator = serviceProvider.GetService<ISiteEnumerator>();

    RegisterHandlers();
}
|
|
|
|
/// <summary>
|
|
/// Wires up the message handlers shared by all three constructors: the M3
|
|
/// ingest path plus the Task 4 read-side (query, detail, global + per-site
|
|
/// KPI). All read handlers reply to an Ask, so they capture <c>Sender</c>
|
|
/// before the first await and <c>PipeTo</c> the result back.
|
|
/// </summary>
|
|
private void RegisterHandlers()
{
    // Remembers the transport actor carried by the registration message so
    // the Retry/Discard relay handlers have a route to the sites.
    void OnRegisterCentralCommunication(RegisterCentralCommunication registration)
    {
        _centralCommunication = registration.CentralCommunication;
        _logger.LogInformation("SiteCallAudit registered central→site communication transport");
    }

    // Telemetry ingest (write side).
    ReceiveAsync<UpsertSiteCallCommand>(OnUpsertAsync);

    // Task 4 read side: query, detail and the KPI variants.
    Receive<SiteCallQueryRequest>(HandleQuery);
    Receive<SiteCallDetailRequest>(HandleDetail);
    Receive<SiteCallKpiRequest>(HandleKpi);
    Receive<PerSiteSiteCallKpiRequest>(HandlePerSiteKpi);
    Receive<PerNodeSiteCallKpiRequest>(HandlePerNodeKpi);

    // Task 5 (#22): central→site Retry/Discard relay for parked cached calls.
    Receive<RegisterCentralCommunication>(OnRegisterCentralCommunication);
    Receive<RetrySiteCallRequest>(HandleRetrySiteCall);
    Receive<DiscardSiteCallRequest>(HandleDiscardSiteCall);

    // Piece A/B (#22): self-addressed ticks driving the periodic
    // reconciliation pull and the terminal-row purge. The handlers are always
    // registered; whether ticks ever arrive depends on the timers, which are
    // only started when the reconciliation collaborators are available.
    ReceiveAsync<ReconciliationTick>(_ => OnReconciliationTickAsync());
    ReceiveAsync<PurgeTick>(_ => OnPurgeTickAsync());
}
|
|
|
|
/// <inheritdoc />
|
|
protected override void PreStart()
{
    // Run the base start hook first, then arm both background timers.
    // Each Start* method decides for itself whether its timer is allowed to run.
    base.PreStart();

    StartReconciliationTimer();
    StartPurgeTimer();
}
|
|
|
|
/// <inheritdoc />
|
|
protected override void PostStop()
{
    // Cancel whichever timers were actually started; a timer that was gated
    // off is still null and is skipped.
    if (_reconciliationTimer is { } reconciliationTimer)
    {
        reconciliationTimer.Cancel();
    }

    if (_purgeTimer is { } purgeTimer)
    {
        purgeTimer.Cancel();
    }

    base.PostStop();
}
|
|
|
|
/// <summary>
|
|
/// Starts the periodic reconciliation tick — but ONLY when both the pull
|
|
/// client and the site enumerator are available. The repo-only test ctor
|
|
/// injects neither, so the tick is gated off there (the MSSQL read/upsert
|
|
/// tests must not fire phantom pulls); the reconciliation test ctor and the
|
|
/// production ctor (which resolves both from the SP) start it.
|
|
/// </summary>
|
|
private void StartReconciliationTimer()
{
    // Gate: the tick needs both the pull client and the site enumerator.
    var collaboratorsAvailable = _pullClient is not null && _siteEnumerator is not null;
    if (!collaboratorsAvailable)
    {
        return;
    }

    // The same span is used as the initial delay and as the repeat period,
    // so the first tick fires one full interval after the actor starts.
    var period = _options.ResolvedReconciliationInterval;
    var scheduler = Context.System.Scheduler;

    _reconciliationTimer = scheduler.ScheduleTellRepeatedlyCancelable(
        initialDelay: period,
        interval: period,
        receiver: Self,
        message: ReconciliationTick.Instance,
        sender: Self);
}
|
|
|
|
/// <summary>
|
|
/// Starts the daily purge tick — gated on the same collaborator presence as
|
|
/// the reconciliation tick. The purge itself only needs the repository, but
|
|
/// gating both schedulers together keeps the repo-only test ctor (no
|
|
/// client/enumerator) free of BOTH background timers, so the MSSQL read/
|
|
/// upsert tests see no scheduled side effects.
|
|
/// </summary>
|
|
private void StartPurgeTimer()
{
    // Same gate as the reconciliation timer: without both collaborators the
    // actor runs with no background timers at all.
    var collaboratorsAvailable = _pullClient is not null && _siteEnumerator is not null;
    if (!collaboratorsAvailable)
    {
        return;
    }

    // The same span is used as the initial delay and as the repeat period,
    // so the first purge fires one full interval after the actor starts.
    var period = _options.ResolvedPurgeInterval;
    var scheduler = Context.System.Scheduler;

    _purgeTimer = scheduler.ScheduleTellRepeatedlyCancelable(
        initialDelay: period,
        interval: period,
        receiver: Self,
        message: PurgeTick.Instance,
        sender: Self);
}
|
|
|
|
/// <inheritdoc />
|
|
protected override SupervisorStrategy SupervisorStrategy() =>
    // One-for-one strategy over the framework's default decider with a retry
    // budget of zero. The decider is fully qualified because inside this
    // method the simple name "SupervisorStrategy" refers to the method itself.
    new OneForOneStrategy(
        maxNrOfRetries: 0,
        withinTimeRange: TimeSpan.Zero,
        decider: Akka.Actor.SupervisorStrategy.DefaultDecider);
|
|
|
|
private async Task OnUpsertAsync(UpsertSiteCallCommand cmd)
{
    // Sender is captured before the first await — Akka resets Sender between
    // message dispatches, so a post-await Tell on the property would go to
    // DeadLetters.
    var replyTo = Sender;
    var id = cmd.SiteCall.TrackedOperationId;

    try
    {
        // SiteCallAudit-003: stamp IngestedAtUtc at central-side persist time
        // on every upsert. Callers (telemetry, reconciliation, direct writes)
        // cannot in general know they are running on central, so the actor
        // owns the stamp.
        var siteCall = cmd.SiteCall with { IngestedAtUtc = DateTime.UtcNow };

        if (_injectedRepository is not null)
        {
            // Test mode: the injected repository outlives the message, so no
            // DI scope is opened.
            await _injectedRepository.UpsertAsync(siteCall).ConfigureAwait(false);
        }
        else
        {
            // Production mode: one scope per message. The scope is created
            // INSIDE the try so that a failure to create it or to resolve the
            // repository is reported as Accepted=false instead of escaping
            // the handler and crashing the singleton without a reply. It is
            // an async scope so disposal does not block the dispatcher
            // thread, matching the reconciliation and purge ticks.
            await using var scope = _serviceProvider!.CreateAsyncScope();
            var repository = scope.ServiceProvider.GetRequiredService<ISiteCallAuditRepository>();
            await repository.UpsertAsync(siteCall).ConfigureAwait(false);
        }

        replyTo.Tell(new UpsertSiteCallReply(id, Accepted: true));
    }
    catch (Exception ex)
    {
        // Per CLAUDE.md: audit-write failure NEVER aborts the user-facing
        // action — log and reply Accepted=false; do NOT rethrow (the central
        // singleton MUST stay alive). This now also covers scope creation,
        // repository resolution and scope disposal faults.
        _logger.LogError(ex, "SiteCallAudit upsert failed for {TrackedOperationId}", id);
        replyTo.Tell(new UpsertSiteCallReply(id, Accepted: false));
    }
}
|
|
|
|
// ── Piece A: periodic per-site reconciliation pull (self-heal) ──
|
|
|
|
/// <summary>
|
|
/// One reconciliation pass: enumerate every known site and, per site, pull
|
|
/// changed <see cref="SiteCall"/> rows since that site's cursor and upsert
|
|
/// them idempotently — the documented self-heal when best-effort gRPC push
|
|
/// telemetry is lost. This is a mirror, NOT a dispatcher: cached-call
|
|
/// delivery stays site-local; upserting reconciled rows only refreshes the
|
|
/// eventually-consistent central <c>SiteCalls</c> mirror.
|
|
/// </summary>
|
|
/// <remarks>
|
|
/// Mirrors <c>SiteAuditReconciliationActor</c>'s structure (per-site cursor,
|
|
/// per-site try/catch failure isolation, advance the cursor by the max
|
|
/// observed <see cref="SiteCall.UpdatedAtUtc"/>) but is deliberately simpler:
|
|
/// no stalled-detection EventStream machinery — just cursor + pull + upsert
|
|
/// + advance. One DI scope per tick is opened and the same repository reused
|
|
/// across every site in that tick.
|
|
/// </remarks>
|
|
private async Task OnReconciliationTickAsync()
{
    // The timer that produces this tick is only started when both
    // collaborators are present (StartReconciliationTimer). The null-forgiving
    // operator is a compile-time annotation only — it performs no runtime
    // check; if the gate were ever dropped, the resulting
    // NullReferenceException from the enumerate call below is caught and
    // logged by the enumeration catch.
    var enumerator = _siteEnumerator!;
    var client = _pullClient!;

    IReadOnlyList<SiteEntry> sites;
    try
    {
        // No ambient CancellationToken exists in a ReceiveActor handler, so
        // the enumerator is called without one.
        sites = await enumerator.EnumerateAsync().ConfigureAwait(false);
    }
    catch (Exception ex)
    {
        _logger.LogError(ex, "SiteCallAudit site enumeration failed; skipping reconciliation tick.");
        return;
    }

    if (sites.Count == 0)
    {
        return;
    }

    // Injected-repository (test) path: there is no scope to open and the
    // test repository is reused. ReconcileSitesAsync isolates per-site faults.
    if (_injectedRepository is not null)
    {
        await ReconcileSitesAsync(sites, client, _injectedRepository).ConfigureAwait(false);
        return;
    }

    try
    {
        // AuditLog-003: open the scope inline with CreateAsyncScope + await
        // using so the scoped repository is disposed asynchronously at the end
        // of the tick instead of blocking the dispatcher thread. One scope is
        // held for the whole tick and its repository reused across all sites.
        await using var scope = _serviceProvider!.CreateAsyncScope();
        var repository = scope.ServiceProvider.GetRequiredService<ISiteCallAuditRepository>();
        await ReconcileSitesAsync(sites, client, repository).ConfigureAwait(false);
    }
    catch (Exception ex)
    {
        // Per-site faults never reach this point (ReconcileSitesAsync catches
        // them), so anything caught here is a failure to create the scope,
        // resolve the repository, or dispose the scope. Swallow it so the
        // tick handler does not fault the actor — a restart would discard the
        // in-memory cursors and the registered central communication
        // transport. No cursor was advanced by the failing step, so the next
        // tick simply retries.
        _logger.LogError(
            ex,
            "SiteCallAudit reconciliation tick failed while opening or disposing the repository scope; will retry next tick.");
    }
}
|
|
|
|
/// <summary>
|
|
/// Reconciles every site in the tick against a single resolved repository,
|
|
/// isolating per-site faults so one bad site never sinks the rest of the
|
|
/// pass (the failing site's cursor is left at its previous value so the next
|
|
/// tick retries the same window).
|
|
/// </summary>
|
|
private async Task ReconcileSitesAsync(
    IReadOnlyList<SiteEntry> sites,
    IPullSiteCallsClient client,
    ISiteCallAuditRepository repository)
{
    // Sites are processed one after another, in list order.
    foreach (var entry in sites)
    {
        try
        {
            await ReconcileSiteAsync(entry, client, repository).ConfigureAwait(false);
        }
        catch (Exception fault)
        {
            // Failure isolation: whatever went wrong for this site (pull or
            // repository write) is logged and the loop moves on to the next
            // site. ReconcileSiteAsync only stores the new cursor as its last
            // statement, so a site that threw keeps its previous cursor and
            // the next tick asks for the same window again.
            _logger.LogWarning(
                fault,
                "SiteCallAudit reconciliation pull failed for site {SiteId}; other sites continue.",
                entry.SiteId);
        }
    }
}
|
|
|
|
/// <summary>
|
|
/// Issues one <c>PullSiteCalls</c> RPC against the site, upserts the
|
|
/// returned rows idempotently, and advances the site's cursor to the maximum
|
|
/// <see cref="SiteCall.UpdatedAtUtc"/> observed. The pull client returns rows
|
|
/// oldest-first with <c>SourceSite</c> already re-stamped from the dialed
|
|
/// site id, so the actor upserts them verbatim (re-stamping
|
|
/// <c>IngestedAtUtc</c> at central persist time, as the telemetry path does).
|
|
/// </summary>
|
|
/// <remarks>
|
|
/// <para>
|
|
/// <b>Coarse per-site retry — a deliberate divergence from
|
|
/// <c>SiteAuditReconciliationActor</c>.</b> That sibling (AuditLog-004) tracks
|
|
/// a per-EventId attempt counter and permanently abandons a row after a
|
|
/// threshold so a single un-insertable row cannot block a site's cursor
|
|
/// forever. This actor deliberately does NOT: any throw inside the loop
|
|
/// propagates to <see cref="ReconcileSitesAsync"/>'s per-site catch,
|
|
/// which leaves the site's cursor at its previous value, so the next tick
|
|
/// re-pulls the whole batch from <c>since</c>. A persistently-bad row therefore
|
|
/// holds the site's cursor and re-pulls the batch every tick. This is
|
|
/// acceptable here because <see cref="ISiteCallAuditRepository.UpsertAsync"/> is
|
|
/// monotonic and idempotent — re-pulling already-ingested rows is a cheap
|
|
/// no-op — and the <c>SiteCalls</c> table is an eventually-consistent mirror,
|
|
/// not the source of truth, so a slow site simply lags rather than corrupts.
|
|
/// </para>
|
|
/// <para>
|
|
/// <b>Inclusive cursor boundary.</b> The cursor is advanced to the maximum
|
|
/// <see cref="SiteCall.UpdatedAtUtc"/> seen, and the pull asks for rows at or
|
|
/// after it (<c>since</c> is <c>>=</c>, not <c>></c>). The row whose
|
|
/// timestamp equals the cursor is therefore re-pulled on the next tick and
|
|
/// deduplicated by the idempotent monotonic upsert — the same inclusive-boundary
|
|
/// contract as <c>SiteAuditReconciliationActor</c>'s cursor.
|
|
/// </para>
|
|
/// </remarks>
|
|
private async Task ReconcileSiteAsync(
    SiteEntry site,
    IPullSiteCallsClient client,
    ISiteCallAuditRepository repository)
{
    // A site that has never been reconciled has no cursor yet and starts
    // from the minimum timestamp.
    if (!_reconciliationCursors.TryGetValue(site.SiteId, out var since))
    {
        since = DateTime.MinValue;
    }

    var pulled = await client
        .PullAsync(site.SiteId, since, _options.ReconciliationBatchSize, CancellationToken.None)
        .ConfigureAwait(false);

    // One ingest stamp is taken after the pull and shared by every row of
    // the batch.
    var ingestedAt = DateTime.UtcNow;
    var newestSeen = since;

    foreach (var pulledRow in pulled.SiteCalls)
    {
        // IngestedAtUtc is owned by the central actor, exactly as in the
        // telemetry upsert path; the rest of the row is persisted as pulled.
        await repository
            .UpsertAsync(pulledRow with { IngestedAtUtc = ingestedAt })
            .ConfigureAwait(false);

        // Track the newest UpdatedAtUtc of the rows persisted so far.
        newestSeen = pulledRow.UpdatedAtUtc > newestSeen ? pulledRow.UpdatedAtUtc : newestSeen;
    }

    // The cursor is written only after every row was upserted, so an
    // exception above leaves the previous cursor in place. There is no
    // immediate re-pull when the batch was saturated; the next tick continues
    // from the advanced cursor.
    _reconciliationCursors[site.SiteId] = newestSeen;
}
|
|
|
|
// ── Piece B: daily terminal-row purge scheduler ──
|
|
|
|
/// <summary>
|
|
/// One purge pass: drops terminal <c>SiteCalls</c> rows whose
|
|
/// <see cref="SiteCall.TerminalAtUtc"/> is older than
|
|
/// <c>UtcNow - RetentionDays</c> via
|
|
/// <see cref="ISiteCallAuditRepository.PurgeTerminalAsync"/>. Non-terminal
|
|
/// rows are never purged (enforced in the repository). The threshold is
|
|
/// computed each tick so an operator who lowers <c>RetentionDays</c> sees it
|
|
/// applied on the next purge without an actor restart. Mirrors
|
|
/// <c>AuditLogPurgeActor</c>'s daily cadence + continue-on-error posture: a
|
|
/// purge fault is logged and swallowed so the singleton stays alive.
|
|
/// </summary>
|
|
private async Task OnPurgeTickAsync()
{
    try
    {
        // The threshold is recomputed on every tick from the current options
        // value rather than cached at start-up.
        var threshold = DateTime.UtcNow - TimeSpan.FromDays(_options.RetentionDays);

        // Injected-repository (test) path: reuse the test repository, no scope.
        if (_injectedRepository is not null)
        {
            await PurgeWithRepositoryAsync(_injectedRepository, threshold).ConfigureAwait(false);
            return;
        }

        // AuditLog-003: open the scope inline with CreateAsyncScope + await
        // using so the scoped repository is disposed asynchronously instead of
        // blocking the dispatcher thread on a synchronous Dispose().
        await using var scope = _serviceProvider!.CreateAsyncScope();
        var repository = scope.ServiceProvider.GetRequiredService<ISiteCallAuditRepository>();
        await PurgeWithRepositoryAsync(repository, threshold).ConfigureAwait(false);
    }
    catch (Exception ex)
    {
        // PurgeWithRepositoryAsync already swallows faults of the purge call
        // itself, so what arrives here is a fault in the surrounding steps:
        // computing the threshold, creating the scope, resolving the
        // repository, or disposing the scope. Continue-on-error applies to
        // those as well — the tick must not fault the central singleton.
        _logger.LogError(
            ex,
            "SiteCallAudit purge tick failed outside the repository purge call (threshold computation or DI scope); will retry next tick.");
    }
}
|
|
|
|
/// <summary>
|
|
/// Runs one terminal-row purge against the resolved repository, logging and
|
|
/// swallowing any fault (continue-on-error) so a transient SQL failure or
|
|
/// contention never crashes the central singleton — the next tick retries
|
|
/// the same window.
|
|
/// </summary>
|
|
private async Task PurgeWithRepositoryAsync(ISiteCallAuditRepository repository, DateTime threshold)
{
    try
    {
        var purgedCount = await repository.PurgeTerminalAsync(threshold).ConfigureAwait(false);

        // Nothing removed: stay quiet rather than logging every tick.
        if (!(purgedCount > 0))
        {
            return;
        }

        _logger.LogInformation(
            "SiteCallAudit purged {RowsDeleted} terminal SiteCalls rows older than {ThresholdUtc:o}.",
            purgedCount,
            threshold);
    }
    catch (Exception fault)
    {
        // Continue-on-error: the purge failure is logged and swallowed so it
        // cannot fault the actor; the next tick runs the purge again.
        _logger.LogError(
            fault,
            "SiteCallAudit terminal-row purge failed (threshold {ThresholdUtc:o}); will retry next tick.",
            threshold);
    }
}
|
|
|
|
// ── Task 4: read-side (query / detail / KPI) ──
|
|
|
|
/// <summary>
|
|
/// Handles a paginated, filtered query over the <c>SiteCalls</c> table.
|
|
/// Builds a <see cref="SiteCallQueryFilter"/> + <see cref="SiteCallPaging"/>
|
|
/// keyset cursor from the request, runs the query on a scoped repository,
|
|
/// and pipes the mapped response back to the captured sender. A repository
|
|
/// fault yields a failure response with an empty list.
|
|
/// </summary>
|
|
private void HandleQuery(SiteCallQueryRequest request)
{
    // Capture the asker and the reference time synchronously, before any
    // asynchronous work starts.
    var replyTo = Sender;
    var receivedAtUtc = DateTime.UtcNow;

    // A successful query result is forwarded unchanged.
    Func<SiteCallQueryResponse, object> forward = response => response;

    // A faulted task is mapped to a failure response carrying the innermost
    // exception message, an empty list and no continuation cursor.
    Func<Exception, object> toFailure = fault => new SiteCallQueryResponse(
        request.CorrelationId,
        Success: false,
        ErrorMessage: fault.GetBaseException().Message,
        SiteCalls: Array.Empty<SiteCallSummary>(),
        NextAfterCreatedAtUtc: null,
        NextAfterId: null);

    QueryAsync(request, receivedAtUtc).PipeTo(replyTo, success: forward, failure: toFailure);
}
|
|
|
|
private async Task<SiteCallQueryResponse> QueryAsync(SiteCallQueryRequest request, DateTime now)
{
    // Cutoff derived from the configured stuck-age threshold; it is used both
    // for the optional stuck-only filter and when mapping rows to summaries.
    var stuckBefore = now - _options.StuckAgeThreshold;

    // Blank text filters are normalised through NullIfBlank. The stuck cutoff
    // is only handed to the repository when the caller asked for stuck rows,
    // so that restriction is applied by the repository query together with
    // the keyset cursor rather than after paging.
    var criteria = new SiteCallQueryFilter(
        Channel: NullIfBlank(request.ChannelFilter),
        SourceSite: NullIfBlank(request.SourceSiteFilter),
        Status: NullIfBlank(request.StatusFilter),
        Target: NullIfBlank(request.TargetKeyword),
        FromUtc: request.FromUtc,
        ToUtc: request.ToUtc,
        StuckCutoffUtc: request.StuckOnly ? stuckBefore : null,
        SourceNode: NullIfBlank(request.SourceNodeFilter));

    // The requested page size is forced into the range 1..MaxPageSize.
    var page = new SiteCallPaging(
        PageSize: Math.Clamp(request.PageSize, 1, MaxPageSize),
        AfterCreatedAtUtc: request.AfterCreatedAtUtc,
        AfterId: request.AfterId is { } id ? new TrackedOperationId(id) : null);

    var (scope, repository) = ResolveRepository();
    try
    {
        var rows = await repository.QueryAsync(criteria, page).ConfigureAwait(false);

        var summaries = rows
            .Select(row => ToSummary(row, stuckBefore))
            .ToList();

        // The continuation cursor is taken from the last row of the page; an
        // empty page yields no cursor.
        var lastRow = rows.Count > 0 ? rows[^1] : null;

        return new SiteCallQueryResponse(
            request.CorrelationId,
            Success: true,
            ErrorMessage: null,
            SiteCalls: summaries,
            NextAfterCreatedAtUtc: lastRow?.CreatedAtUtc,
            NextAfterId: lastRow?.TrackedOperationId.Value);
    }
    finally
    {
        // The scope is null when no scope was opened for this call.
        scope?.Dispose();
    }
}
|
|
|
|
/// <summary>
|
|
/// Handles a full-detail query for a single cached call — backs the report
|
|
/// detail modal. A missing row yields <c>Success=false</c> with a "not
|
|
/// found" message; a repository fault yields <c>Success=false</c> with the
|
|
/// fault message.
|
|
/// </summary>
|
|
private void HandleDetail(SiteCallDetailRequest request)
{
    // Capture the asker synchronously, before any asynchronous work starts.
    var replyTo = Sender;

    // A successful lookup result is forwarded unchanged.
    Func<SiteCallDetailResponse, object> forward = response => response;

    // A faulted task is mapped to a failure response carrying the innermost
    // exception message and no detail payload.
    Func<Exception, object> toFailure = fault => new SiteCallDetailResponse(
        request.CorrelationId,
        Success: false,
        ErrorMessage: fault.GetBaseException().Message,
        Detail: null);

    DetailAsync(request).PipeTo(replyTo, success: forward, failure: toFailure);
}
|
|
|
|
private async Task<SiteCallDetailResponse> DetailAsync(SiteCallDetailRequest request)
{
    var (scope, repository) = ResolveRepository();
    try
    {
        var key = new TrackedOperationId(request.TrackedOperationId);
        var found = await repository.GetAsync(key).ConfigureAwait(false);

        // A missing row is reported as an unsuccessful response, not as an
        // exception; a present row is mapped through ToDetail.
        return found is null
            ? new SiteCallDetailResponse(
                request.CorrelationId,
                Success: false,
                ErrorMessage: "site call not found",
                Detail: null)
            : new SiteCallDetailResponse(
                request.CorrelationId,
                Success: true,
                ErrorMessage: null,
                Detail: ToDetail(found));
    }
    finally
    {
        // The scope is null when no scope was opened for this call.
        scope?.Dispose();
    }
}
|
|
|
|
/// <summary>
/// Answers a global KPI snapshot request. The stuck cutoff is "now minus
/// <see cref="SiteCallAuditOptions.StuckAgeThreshold"/>" and the
/// failed/delivered window starts at "now minus
/// <see cref="SiteCallAuditOptions.KpiInterval"/>". The computation runs in
/// <see cref="KpiAsync"/>; if it faults, the requester receives a
/// <c>Success=false</c> response with zeroed counters and the base
/// exception's message.
/// </summary>
/// <param name="request">The KPI request; its correlation id is echoed on the reply.</param>
private void HandleKpi(SiteCallKpiRequest request)
{
    var replyTo = Sender;

    // Both bounds are derived from one clock read.
    var utcNow = DateTime.UtcNow;
    var stuckBefore = utcNow - _options.StuckAgeThreshold;
    var windowStart = utcNow - _options.KpiInterval;

    // Builds the all-zero reply used when the KPI task faults.
    SiteCallKpiResponse Faulted(Exception fault) =>
        new SiteCallKpiResponse(
            request.CorrelationId,
            Success: false,
            ErrorMessage: fault.GetBaseException().Message,
            BufferedCount: 0,
            ParkedCount: 0,
            FailedLastInterval: 0,
            DeliveredLastInterval: 0,
            OldestPendingAge: null,
            StuckCount: 0);

    var computation = KpiAsync(request.CorrelationId, stuckBefore, windowStart);
    computation.PipeTo(
        replyTo,
        success: kpis => kpis,
        failure: fault => Faulted(fault));
}
|
|
|
|
/// <summary>
/// Asks the repository for the global KPI snapshot and copies its fields into
/// a successful <see cref="SiteCallKpiResponse"/>. Repository exceptions are
/// not caught here and fault the returned task.
/// </summary>
/// <param name="correlationId">Correlation id echoed on the response.</param>
/// <param name="stuckCutoff">Cutoff instant passed to the repository for the stuck count.</param>
/// <param name="intervalSince">Start of the failed/delivered interval passed to the repository.</param>
/// <returns>A <c>Success=true</c> response carrying the snapshot values.</returns>
private async Task<SiteCallKpiResponse> KpiAsync(
    string correlationId, DateTime stuckCutoff, DateTime intervalSince)
{
    var (scope, repository) = ResolveRepository();

    // The scope may be null (injected repository); a using statement tolerates that.
    using (scope)
    {
        var kpis = await repository
            .ComputeKpisAsync(stuckCutoff, intervalSince)
            .ConfigureAwait(false);

        return new SiteCallKpiResponse(
            correlationId,
            Success: true,
            ErrorMessage: null,
            BufferedCount: kpis.BufferedCount,
            ParkedCount: kpis.ParkedCount,
            FailedLastInterval: kpis.FailedLastInterval,
            DeliveredLastInterval: kpis.DeliveredLastInterval,
            OldestPendingAge: kpis.OldestPendingAge,
            StuckCount: kpis.StuckCount);
    }
}
|
|
|
|
/// <summary>
/// Answers a per-source-site KPI request. The stuck cutoff and interval start
/// are derived from <c>_options</c> exactly as in <see cref="HandleKpi"/>. The
/// computation runs in <see cref="PerSiteKpiAsync"/>; if it faults, the
/// requester receives a <c>Success=false</c> response with an empty site list
/// and the base exception's message.
/// </summary>
/// <param name="request">The per-site KPI request; its correlation id is echoed on the reply.</param>
private void HandlePerSiteKpi(PerSiteSiteCallKpiRequest request)
{
    var replyTo = Sender;

    // Both bounds are derived from one clock read.
    var utcNow = DateTime.UtcNow;
    var stuckBefore = utcNow - _options.StuckAgeThreshold;
    var windowStart = utcNow - _options.KpiInterval;

    var computation = PerSiteKpiAsync(request.CorrelationId, stuckBefore, windowStart);

    computation.PipeTo(
        replyTo,
        success: perSite => perSite,
        failure: fault =>
        {
            var reason = fault.GetBaseException().Message;
            return new PerSiteSiteCallKpiResponse(
                request.CorrelationId,
                Success: false,
                ErrorMessage: reason,
                Sites: Array.Empty<SiteCallSiteKpiSnapshot>());
        });
}
|
|
|
|
/// <summary>
/// Asks the repository for the per-site KPI snapshots and wraps them in a
/// successful <see cref="PerSiteSiteCallKpiResponse"/>. Repository exceptions
/// are not caught here and fault the returned task.
/// </summary>
/// <param name="correlationId">Correlation id echoed on the response.</param>
/// <param name="stuckCutoff">Cutoff instant passed to the repository for the stuck count.</param>
/// <param name="intervalSince">Start of the failed/delivered interval passed to the repository.</param>
/// <returns>A <c>Success=true</c> response carrying the per-site snapshots.</returns>
private async Task<PerSiteSiteCallKpiResponse> PerSiteKpiAsync(
    string correlationId, DateTime stuckCutoff, DateTime intervalSince)
{
    var (scope, repository) = ResolveRepository();

    // The scope may be null (injected repository); a using statement tolerates that.
    using (scope)
    {
        var perSite = await repository
            .ComputePerSiteKpisAsync(stuckCutoff, intervalSince)
            .ConfigureAwait(false);

        return new PerSiteSiteCallKpiResponse(
            correlationId,
            Success: true,
            ErrorMessage: null,
            Sites: perSite);
    }
}
|
|
|
|
/// <summary>
/// Answers a per-node KPI request. The stuck cutoff and interval start are
/// derived from <c>_options</c> exactly as in <see cref="HandleKpi"/>. This
/// handler sits alongside <see cref="HandlePerSiteKpi"/> and does not alter
/// the per-site path. If <see cref="PerNodeKpiAsync"/> faults, the requester
/// receives a <c>Success=false</c> response with an empty node list and the
/// base exception's message.
/// </summary>
/// <param name="request">The per-node KPI request; its correlation id is echoed on the reply.</param>
private void HandlePerNodeKpi(PerNodeSiteCallKpiRequest request)
{
    var replyTo = Sender;

    // Both bounds are derived from one clock read.
    var utcNow = DateTime.UtcNow;
    var stuckBefore = utcNow - _options.StuckAgeThreshold;
    var windowStart = utcNow - _options.KpiInterval;

    var computation = PerNodeKpiAsync(request.CorrelationId, stuckBefore, windowStart);

    computation.PipeTo(
        replyTo,
        success: perNode => perNode,
        failure: fault =>
        {
            var reason = fault.GetBaseException().Message;
            return new PerNodeSiteCallKpiResponse(
                request.CorrelationId,
                Success: false,
                ErrorMessage: reason,
                Nodes: Array.Empty<SiteCallNodeKpiSnapshot>());
        });
}
|
|
|
|
/// <summary>
/// Asks the repository for the per-node KPI snapshots and wraps them in a
/// successful <see cref="PerNodeSiteCallKpiResponse"/>. Repository exceptions
/// are not caught here and fault the returned task.
/// </summary>
/// <param name="correlationId">Correlation id echoed on the response.</param>
/// <param name="stuckCutoff">Cutoff instant passed to the repository for the stuck count.</param>
/// <param name="intervalSince">Start of the failed/delivered interval passed to the repository.</param>
/// <returns>A <c>Success=true</c> response carrying the per-node snapshots.</returns>
private async Task<PerNodeSiteCallKpiResponse> PerNodeKpiAsync(
    string correlationId, DateTime stuckCutoff, DateTime intervalSince)
{
    var (scope, repository) = ResolveRepository();

    // The scope may be null (injected repository); a using statement tolerates that.
    using (scope)
    {
        var perNode = await repository
            .ComputePerNodeKpisAsync(stuckCutoff, intervalSince)
            .ConfigureAwait(false);

        return new PerNodeSiteCallKpiResponse(
            correlationId,
            Success: true,
            ErrorMessage: null,
            Nodes: perNode);
    }
}
|
|
|
|
// ── Task 5: central→site Retry/Discard relay ──
|
|
|
|
/// <summary>
/// Relays an operator Retry of a parked cached call to the site that owns it.
/// The site is the source of truth, so this handler never writes the central
/// <c>SiteCalls</c> mirror row. It wraps a <see cref="RetryParkedOperation"/>
/// in a <see cref="SiteEnvelope"/> addressed to <c>SourceSite</c>, Asks the
/// registered <c>CentralCommunicationActor</c> (which routes it over the
/// per-site <c>ClusterClient</c>) and pipes the outcome back as a
/// <see cref="RetrySiteCallResponse"/>. A returned
/// <see cref="ParkedOperationActionAck"/> goes through
/// <see cref="MapRetryResponse"/>; any Ask fault (such as a timeout) goes
/// through <see cref="MapRetryFailure"/> and is reported as
/// <see cref="SiteCallRelayOutcome.SiteUnreachable"/>, so the Central UI can
/// tell "site offline" apart from "operation failed".
/// </summary>
/// <param name="request">The retry request naming the operation and its owning site.</param>
private void HandleRetrySiteCall(RetrySiteCallRequest request)
{
    var replyTo = Sender;

    if (_centralCommunication is not { } transport)
    {
        // Nothing is registered to carry the command, so no site can be
        // reached; unreachable is the only honest answer.
        _logger.LogWarning(
            "RetrySiteCall {TrackedOperationId} for site {SourceSite} arrived before the "
            + "central→site transport was registered; reporting site unreachable",
            request.TrackedOperationId, request.SourceSite);
        replyTo.Tell(UnreachableRetry(request.CorrelationId));
        return;
    }

    var command = new RetryParkedOperation(
        request.CorrelationId, new TrackedOperationId(request.TrackedOperationId));
    var addressed = new SiteEnvelope(request.SourceSite, command);

    var pendingAck = transport.Ask<ParkedOperationActionAck>(addressed, _options.RelayTimeout);
    pendingAck.PipeTo(
        replyTo,
        success: siteAck => MapRetryResponse(request.CorrelationId, siteAck),
        failure: fault => MapRetryFailure(request.CorrelationId, request.SourceSite, fault));
}
|
|
|
|
/// <summary>
/// Relays an operator Discard of a parked cached call to the site that owns
/// it. This is the Discard twin of <see cref="HandleRetrySiteCall"/>: it sends
/// a <see cref="DiscardParkedOperation"/> in a <see cref="SiteEnvelope"/>,
/// maps a returned ack through <see cref="MapDiscardResponse"/> and any Ask
/// fault through <see cref="MapDiscardFailure"/>. See
/// <see cref="HandleRetrySiteCall"/> for the source-of-truth and
/// site-unreachable rationale.
/// </summary>
/// <param name="request">The discard request naming the operation and its owning site.</param>
private void HandleDiscardSiteCall(DiscardSiteCallRequest request)
{
    var replyTo = Sender;

    if (_centralCommunication is not { } transport)
    {
        // Nothing is registered to carry the command, so report unreachable.
        _logger.LogWarning(
            "DiscardSiteCall {TrackedOperationId} for site {SourceSite} arrived before the "
            + "central→site transport was registered; reporting site unreachable",
            request.TrackedOperationId, request.SourceSite);
        replyTo.Tell(UnreachableDiscard(request.CorrelationId));
        return;
    }

    var command = new DiscardParkedOperation(
        request.CorrelationId, new TrackedOperationId(request.TrackedOperationId));
    var addressed = new SiteEnvelope(request.SourceSite, command);

    var pendingAck = transport.Ask<ParkedOperationActionAck>(addressed, _options.RelayTimeout);
    pendingAck.PipeTo(
        replyTo,
        success: siteAck => MapDiscardResponse(request.CorrelationId, siteAck),
        failure: fault => MapDiscardFailure(request.CorrelationId, request.SourceSite, fault));
}
|
|
|
|
/// <summary>
/// Converts the site's <see cref="ParkedOperationActionAck"/> for a Retry into
/// a <see cref="RetrySiteCallResponse"/>. The outcome comes from
/// <see cref="ClassifyAck"/>: an applied action is
/// <see cref="SiteCallRelayOutcome.Applied"/>, a clean no-op
/// (<c>Applied=false</c>, no error) is <see cref="SiteCallRelayOutcome.NotParked"/>,
/// and an ack carrying an error is <see cref="SiteCallRelayOutcome.OperationFailed"/>.
/// <c>SiteReachable</c> is always <c>true</c> here because an ack was received.
/// </summary>
/// <param name="correlationId">Correlation id echoed on the response.</param>
/// <param name="ack">The ack returned by the site.</param>
/// <returns>The response; <c>Success</c> is true only for the applied outcome.</returns>
private static RetrySiteCallResponse MapRetryResponse(string correlationId, ParkedOperationActionAck ack)
{
    var outcome = ClassifyAck(ack);
    var applied = outcome == SiteCallRelayOutcome.Applied;
    var detail = AckErrorMessage(outcome, ack);

    return new RetrySiteCallResponse(
        correlationId,
        outcome,
        Success: applied,
        SiteReachable: true,
        ErrorMessage: detail);
}
|
|
|
|
/// <summary>
/// Converts the site's <see cref="ParkedOperationActionAck"/> for a Discard
/// into a <see cref="DiscardSiteCallResponse"/>, using the same
/// <see cref="ClassifyAck"/> / <see cref="AckErrorMessage"/> mapping as the
/// Retry path. <c>SiteReachable</c> is always <c>true</c> here because an ack
/// was received.
/// </summary>
/// <param name="correlationId">Correlation id echoed on the response.</param>
/// <param name="ack">The ack returned by the site.</param>
/// <returns>The response; <c>Success</c> is true only for the applied outcome.</returns>
private static DiscardSiteCallResponse MapDiscardResponse(string correlationId, ParkedOperationActionAck ack)
{
    var outcome = ClassifyAck(ack);
    var applied = outcome == SiteCallRelayOutcome.Applied;
    var detail = AckErrorMessage(outcome, ack);

    return new DiscardSiteCallResponse(
        correlationId,
        outcome,
        Success: applied,
        SiteReachable: true,
        ErrorMessage: detail);
}
|
|
|
|
/// <summary>
/// Maps a faulted Retry relay onto the site-unreachable response, logging the
/// exception as a warning. Every exception is treated the same way.
/// </summary>
/// <param name="correlationId">Correlation id echoed on the response.</param>
/// <param name="sourceSite">Site the relay was addressed to; used only in the log entry.</param>
/// <param name="ex">The exception that faulted the relay.</param>
/// <returns>The response built by <see cref="UnreachableRetry"/>.</returns>
private RetrySiteCallResponse MapRetryFailure(string correlationId, string sourceSite, Exception ex)
{
    var unreachable = UnreachableRetry(correlationId);

    _logger.LogWarning(
        ex,
        "Retry relay to site {SourceSite} did not complete; reporting site unreachable",
        sourceSite);

    return unreachable;
}
|
|
|
|
/// <summary>
/// Maps a faulted Discard relay onto the site-unreachable response, logging
/// the exception as a warning. Every exception is treated the same way.
/// </summary>
/// <param name="correlationId">Correlation id echoed on the response.</param>
/// <param name="sourceSite">Site the relay was addressed to; used only in the log entry.</param>
/// <param name="ex">The exception that faulted the relay.</param>
/// <returns>The response built by <see cref="UnreachableDiscard"/>.</returns>
private DiscardSiteCallResponse MapDiscardFailure(string correlationId, string sourceSite, Exception ex)
{
    var unreachable = UnreachableDiscard(correlationId);

    _logger.LogWarning(
        ex,
        "Discard relay to site {SourceSite} did not complete; reporting site unreachable",
        sourceSite);

    return unreachable;
}
|
|
|
|
/// <summary>
/// Turns a site ack into a relay outcome:
/// <list type="bullet">
/// <item><description><c>Applied=true</c> → <see cref="SiteCallRelayOutcome.Applied"/>.</description></item>
/// <item><description><c>Applied=false</c> with a <c>null</c> error → <see cref="SiteCallRelayOutcome.NotParked"/> (the site had nothing parked).</description></item>
/// <item><description><c>Applied=false</c> with an error → <see cref="SiteCallRelayOutcome.OperationFailed"/> (the site could not apply the action).</description></item>
/// </list>
/// </summary>
/// <param name="ack">The ack returned by the site.</param>
/// <returns>The classified outcome; never <see cref="SiteCallRelayOutcome.SiteUnreachable"/>.</returns>
private static SiteCallRelayOutcome ClassifyAck(ParkedOperationActionAck ack)
{
    return (ack.Applied, ack.ErrorMessage) switch
    {
        (true, _) => SiteCallRelayOutcome.Applied,
        (false, null) => SiteCallRelayOutcome.NotParked,
        _ => SiteCallRelayOutcome.OperationFailed,
    };
}
|
|
|
|
/// <summary>
/// Chooses the error text that accompanies a classified relay outcome.
/// </summary>
/// <param name="outcome">The outcome previously derived from <paramref name="ack"/>.</param>
/// <param name="ack">The ack returned by the site; supplies the site's own error text.</param>
/// <returns>
/// <c>null</c> for <see cref="SiteCallRelayOutcome.Applied"/>; a fixed
/// explanation for <see cref="SiteCallRelayOutcome.NotParked"/>; the ack's own
/// error text for <see cref="SiteCallRelayOutcome.OperationFailed"/>; and, for
/// <see cref="SiteCallRelayOutcome.SiteUnreachable"/>, the ack's error text or
/// the shared unreachable message when the ack carries none.
/// </returns>
/// <exception cref="ArgumentOutOfRangeException">
/// <paramref name="outcome"/> is not a known <see cref="SiteCallRelayOutcome"/> value.
/// </exception>
private static string? AckErrorMessage(SiteCallRelayOutcome outcome, ParkedOperationActionAck ack)
{
    return outcome switch
    {
        SiteCallRelayOutcome.Applied => null,
        SiteCallRelayOutcome.NotParked =>
            "The operation is no longer parked at the site (already delivered, discarded, or retrying).",
        SiteCallRelayOutcome.OperationFailed => ack.ErrorMessage,
        // ClassifyAck never yields SiteUnreachable; unreachable responses are
        // built by UnreachableRetry/UnreachableDiscard without going through
        // this method. The arm exists only to keep the switch total without
        // throwing. It falls back to the shared unreachable text so a failure
        // outcome can never be returned with a null error message.
        SiteCallRelayOutcome.SiteUnreachable => ack.ErrorMessage ?? SiteUnreachableMessage,
        _ => throw new ArgumentOutOfRangeException(
            nameof(outcome), outcome, "unknown SiteCallRelayOutcome"),
    };
}
|
|
|
|
/// <summary>
/// Error text placed on every site-unreachable response, shared by the Retry
/// and Discard relay paths.
/// </summary>
private const string SiteUnreachableMessage =
    "The owning site is unreachable; the action was not applied. Retry when the site is back online.";
|
|
|
|
/// <summary>
/// Builds the Retry response for a site that could not be reached:
/// outcome <see cref="SiteCallRelayOutcome.SiteUnreachable"/>,
/// <c>Success=false</c>, <c>SiteReachable=false</c> and the shared
/// unreachable error text.
/// </summary>
/// <param name="correlationId">Correlation id echoed on the response.</param>
private static RetrySiteCallResponse UnreachableRetry(string correlationId) =>
    new RetrySiteCallResponse(
        correlationId,
        SiteCallRelayOutcome.SiteUnreachable,
        Success: false,
        SiteReachable: false,
        ErrorMessage: SiteUnreachableMessage);
|
|
|
|
/// <summary>
/// Builds the Discard response for a site that could not be reached:
/// outcome <see cref="SiteCallRelayOutcome.SiteUnreachable"/>,
/// <c>Success=false</c>, <c>SiteReachable=false</c> and the shared
/// unreachable error text.
/// </summary>
/// <param name="correlationId">Correlation id echoed on the response.</param>
private static DiscardSiteCallResponse UnreachableDiscard(string correlationId) =>
    new DiscardSiteCallResponse(
        correlationId,
        SiteCallRelayOutcome.SiteUnreachable,
        Success: false,
        SiteReachable: false,
        ErrorMessage: SiteUnreachableMessage);
|
|
|
|
/// <summary>
/// Resolves an <see cref="ISiteCallAuditRepository"/> for one read message.
/// When a repository instance was injected (test mode) it is returned with a
/// <c>null</c> scope. Otherwise a fresh DI scope is created and returned
/// together with the repository resolved from it, and the caller must dispose
/// that scope once the read completes — the same scope-per-message pattern as
/// <see cref="OnUpsertAsync"/>.
/// </summary>
/// <returns>
/// The scope to dispose (or <c>null</c> when the injected repository is used)
/// and the repository to read from.
/// </returns>
private (IServiceScope? Scope, ISiteCallAuditRepository Repository) ResolveRepository()
{
    if (_injectedRepository is not null)
    {
        return (null, _injectedRepository);
    }

    var scope = _serviceProvider!.CreateScope();
    try
    {
        return (scope, scope.ServiceProvider.GetRequiredService<ISiteCallAuditRepository>());
    }
    catch
    {
        // The caller never receives the scope when resolution throws, so it
        // must be released here or it would leak.
        scope.Dispose();
        throw;
    }
}
|
|
|
|
/// <summary>
/// Decides whether a cached call is stuck: it must still be non-terminal and
/// have been created before <paramref name="stuckCutoff"/>. "Non-terminal"
/// means <see cref="SiteCall.TerminalAtUtc"/> is <c>null</c>. The
/// <c>SiteCalls</c> operational mirror stores <c>AuditStatus</c>-derived
/// status strings (<c>Attempted</c>/<c>Delivered</c>/<c>Parked</c>/...), not
/// the tracking-lifecycle <c>Pending</c>/<c>Retrying</c> names the spec's KPI
/// section uses, so no status string means "buffered". <c>TerminalAtUtc</c>
/// is the entity's own active/terminal discriminator and is consistent with
/// the repository KPI counts and <c>PurgeTerminalAsync</c>.
/// </summary>
/// <param name="row">The row to test.</param>
/// <param name="stuckCutoff">Rows created strictly before this instant are old enough to be stuck.</param>
/// <returns><c>true</c> when the row is non-terminal and older than the cutoff.</returns>
private static bool IsStuck(SiteCall row, DateTime stuckCutoff)
{
    // A row that has reached a terminal state is never stuck.
    if (row.TerminalAtUtc is not null)
    {
        return false;
    }

    return row.CreatedAtUtc < stuckCutoff;
}
|
|
|
|
/// <summary>
/// Projects a <see cref="SiteCall"/> row onto the list-view
/// <see cref="SiteCallSummary"/>, copying its fields and adding the
/// <c>IsStuck</c> flag computed against <paramref name="stuckCutoff"/>.
/// </summary>
/// <param name="row">The row to project.</param>
/// <param name="stuckCutoff">Cutoff instant handed to <see cref="IsStuck"/>.</param>
private static SiteCallSummary ToSummary(SiteCall row, DateTime stuckCutoff) =>
    new SiteCallSummary(
        TrackedOperationId: row.TrackedOperationId.Value,
        SourceSite: row.SourceSite,
        Channel: row.Channel,
        Target: row.Target,
        Status: row.Status,
        RetryCount: row.RetryCount,
        LastError: row.LastError,
        HttpStatus: row.HttpStatus,
        CreatedAtUtc: row.CreatedAtUtc,
        UpdatedAtUtc: row.UpdatedAtUtc,
        TerminalAtUtc: row.TerminalAtUtc,
        IsStuck: IsStuck(row, stuckCutoff),
        SourceNode: row.SourceNode);
|
|
|
|
/// <summary>
/// Projects a <see cref="SiteCall"/> row onto the full
/// <see cref="SiteCallDetail"/> shape. Unlike the summary projection it
/// carries <c>IngestedAtUtc</c> and has no stuck flag.
/// </summary>
/// <param name="row">The row to project.</param>
private static SiteCallDetail ToDetail(SiteCall row) =>
    new SiteCallDetail(
        TrackedOperationId: row.TrackedOperationId.Value,
        SourceSite: row.SourceSite,
        Channel: row.Channel,
        Target: row.Target,
        Status: row.Status,
        RetryCount: row.RetryCount,
        LastError: row.LastError,
        HttpStatus: row.HttpStatus,
        CreatedAtUtc: row.CreatedAtUtc,
        UpdatedAtUtc: row.UpdatedAtUtc,
        TerminalAtUtc: row.TerminalAtUtc,
        IngestedAtUtc: row.IngestedAtUtc,
        SourceNode: row.SourceNode);
|
|
|
|
/// <summary>
/// Collapses a blank filter string to <c>null</c>. The repository's
/// <see cref="SiteCallQueryFilter"/> reads <c>null</c> as "no constraint", so
/// an empty or whitespace-only UI filter must become <c>null</c> rather than
/// be matched literally.
/// </summary>
/// <param name="value">The raw filter text; may be <c>null</c>.</param>
/// <returns>
/// <c>null</c> when <paramref name="value"/> is null, empty or whitespace;
/// otherwise <paramref name="value"/> unchanged (not trimmed).
/// </returns>
private static string? NullIfBlank(string? value)
{
    if (string.IsNullOrWhiteSpace(value))
    {
        return null;
    }

    return value;
}
|
|
|
|
/// <summary>
/// Self-tick message that triggers a reconciliation pass across all sites
/// (Piece A). Only the shared <see cref="Instance"/> exists; the constructor
/// is private.
/// </summary>
internal sealed class ReconciliationTick
{
    private ReconciliationTick()
    {
    }

    /// <summary>The single shared tick instance.</summary>
    public static readonly ReconciliationTick Instance = new ReconciliationTick();
}
|
|
|
|
/// <summary>
/// Self-tick message that triggers a terminal-row purge pass (Piece B). Only
/// the shared <see cref="Instance"/> exists; the constructor is private.
/// </summary>
internal sealed class PurgeTick
{
    private PurgeTick()
    {
    }

    /// <summary>The single shared tick instance.</summary>
    public static readonly PurgeTick Instance = new PurgeTick();
}
|
|
}
|
|
|
|
/// <summary>
/// Message that hands the central→site command transport (the
/// <c>CentralCommunicationActor</c>) to the <see cref="SiteCallAuditActor"/>,
/// enabling it to relay Retry/Discard actions on parked cached calls to their
/// owning sites. The Host sends it once both actors exist. It is declared
/// here rather than in Commons because it carries an <see cref="IActorRef"/>
/// and <c>ZB.MOM.WW.ScadaBridge.Commons</c> has no Akka reference — the same
/// rationale as <c>RegisterAuditIngest</c>.
/// </summary>
/// <param name="CentralCommunication">Reference to the central→site transport actor.</param>
public sealed record RegisterCentralCommunication(
    IActorRef CentralCommunication);
|