7b0b9c7365
Solution + 23 src projects + 26 test projects renamed; folders, csproj, namespaces, and ScadaLinkDbContext/ScadaBridgeDbContext class updated. ActorSystem "scadalink" → "scadabridge", Akka seed-node URLs migrated. SQL roles/logins, LDAP domains, CLI command name, and CLI config dir (~/.scadalink → ~/.scadabridge) also renamed. Build green; 5 Host.Tests fail awaiting SQL login rename in next commit. Pre-existing StaleTagMonitor timing flakes unchanged. Rename script committed at tools/rename-to-scadabridge.sh.
718 lines
31 KiB
C#
718 lines
31 KiB
C#
using Akka.Actor;
|
|
using Microsoft.Extensions.DependencyInjection;
|
|
using Microsoft.Extensions.Logging;
|
|
using ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit;
|
|
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Repositories;
|
|
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Audit;
|
|
using ZB.MOM.WW.ScadaBridge.Commons.Messages.RemoteQuery;
|
|
using ZB.MOM.WW.ScadaBridge.Commons.Types;
|
|
using ZB.MOM.WW.ScadaBridge.Commons.Types.Audit;
|
|
using ZB.MOM.WW.ScadaBridge.Communication;
|
|
|
|
namespace ZB.MOM.WW.ScadaBridge.SiteCallAudit;
|
|
|
|
/// <summary>
|
|
/// Central singleton for Site Call Audit (#22). Receives
|
|
/// <see cref="UpsertSiteCallCommand"/> messages and persists each
|
|
/// <see cref="ZB.MOM.WW.ScadaBridge.Commons.Entities.Audit.SiteCall"/> row via
|
|
/// <see cref="ISiteCallAuditRepository.UpsertAsync"/> — idempotent monotonic
|
|
/// upsert. Out-of-order or duplicate updates are silent no-ops at the
|
|
/// repository layer; the actor always replies <see cref="UpsertSiteCallReply"/>
|
|
/// with <c>Accepted=true</c> in that case because storage state is consistent
|
|
/// and the site is free to consider its packet acked.
|
|
/// </summary>
|
|
/// <remarks>
|
|
/// <para>
|
|
/// Implemented: direct <see cref="UpsertSiteCallCommand"/> telemetry ingest,
|
|
/// query, detail and KPI handlers (Task 4), and the central→site Retry/Discard
|
|
/// relay (Task 5 — the relay handlers live in this actor). Deferred (per
|
|
/// CLAUDE.md scope discipline — both land in a later follow-up): the periodic
|
|
/// per-site reconciliation puller that backfills lost telemetry, and the daily
|
|
/// terminal-row purge scheduler (the repository exposes
|
|
/// <c>PurgeTerminalAsync</c> but nothing in this module currently invokes it
|
|
/// on a schedule).
|
|
/// </para>
|
|
/// <para>
|
|
/// Per CLAUDE.md "audit-write failure NEVER aborts the user-facing action" —
|
|
/// the actor catches every exception from the repository call and replies
|
|
/// <c>Accepted=false</c> without rethrowing, so the central singleton stays
|
|
/// alive. The <see cref="SupervisorStrategy"/> override governs the actor's
|
|
/// <em>children</em>, not the actor itself; this actor has no children today,
|
|
/// so the override is currently inert. It returns a one-for-one strategy with
|
|
/// <see cref="Akka.Actor.SupervisorStrategy.DefaultDecider"/> (Restart on most
|
|
/// exceptions, Stop on <see cref="ActorInitializationException"/> /
|
|
/// <see cref="ActorKilledException"/>) and <c>maxNrOfRetries: 0</c>, so any
|
|
/// future child that throws is Stopped on the first failure — a deliberate
|
|
/// "fail loudly" posture for the central singleton's eventual sub-actors
|
|
/// (reconciliation puller, purge scheduler). Self-supervision of this actor
|
|
/// is whatever the parent <see cref="Akka.Cluster.Tools.Singleton.ClusterSingletonManager"/>
|
|
/// supplies; the in-handler <c>try/catch</c> in <see cref="OnUpsertAsync"/>
|
|
/// is what actually keeps the singleton alive across repository faults.
|
|
/// </para>
|
|
/// <para>
|
|
/// Two constructors exist for the same reason as
|
|
/// <c>AuditLogIngestActor</c>: production wiring (Bundle F) resolves the
|
|
/// scoped EF repository from a fresh DI scope per message because the actor
|
|
/// is a long-lived cluster singleton, while tests inject a concrete
|
|
/// <see cref="ISiteCallAuditRepository"/> against a per-test MSSQL fixture
|
|
/// so the actor exercises the real monotonic upsert SQL end to end.
|
|
/// </para>
|
|
/// </remarks>
|
|
public class SiteCallAuditActor : ReceiveActor
|
|
{
|
|
/// <summary>Maximum page size honoured by a <see cref="SiteCallQueryRequest"/>.</summary>
|
|
private const int MaxPageSize = 200;
|
|
|
|
private readonly IServiceProvider? _serviceProvider;
|
|
private readonly ISiteCallAuditRepository? _injectedRepository;
|
|
private readonly SiteCallAuditOptions _options;
|
|
private readonly ILogger<SiteCallAuditActor> _logger;
|
|
|
|
/// <summary>
|
|
/// Task 5 (#22): the central→site command transport — the
|
|
/// <c>CentralCommunicationActor</c>, which owns the per-site
|
|
/// <c>ClusterClient</c> map and routes a <see cref="SiteEnvelope"/> to the
|
|
/// owning site. Set via <see cref="RegisterCentralCommunication"/> by the
|
|
/// Host after both actors exist (this actor is a cluster singleton; the
|
|
/// transport actor is created separately). Null until registration
|
|
/// completes — a relay arriving before then is answered with a
|
|
/// <see cref="SiteCallRelayOutcome.SiteUnreachable"/> outcome, because there
|
|
/// is genuinely no route to any site yet.
|
|
/// </summary>
|
|
private IActorRef? _centralCommunication;
|
|
|
|
/// <summary>
/// Test-mode constructor. The supplied repository is held for the lifetime of
/// the actor and reused for every message, so no DI scope is created in this
/// mode. Used by Bundle C's MSSQL-backed TestKit fixture.
/// </summary>
/// <param name="repository">Repository instance shared by all messages; must not be null.</param>
/// <param name="logger">Logger for diagnostics and error reporting; must not be null.</param>
/// <param name="options">
/// Optional overrides (for example to pin the stuck/KPI windows in a test); a
/// default-constructed <see cref="SiteCallAuditOptions"/> is used when null.
/// </param>
/// <exception cref="ArgumentNullException">
/// <paramref name="repository"/> or <paramref name="logger"/> is null.
/// </exception>
public SiteCallAuditActor(
    ISiteCallAuditRepository repository,
    ILogger<SiteCallAuditActor> logger,
    SiteCallAuditOptions? options = null)
{
    // Validate in parameter order before touching any state.
    ArgumentNullException.ThrowIfNull(repository);
    ArgumentNullException.ThrowIfNull(logger);

    _logger = logger;
    _options = options ?? new SiteCallAuditOptions();
    _injectedRepository = repository;

    RegisterHandlers();
}
|
|
|
|
/// <summary>
/// Production constructor. Keeps only the root <see cref="IServiceProvider"/>;
/// each message creates its own DI scope and resolves
/// <see cref="ISiteCallAuditRepository"/> from it, because the repository is a
/// scoped EF Core service (registered by <c>AddConfigurationDatabase</c>) and
/// this actor is a long-lived cluster singleton that cannot hold one scope
/// across messages.
/// </summary>
/// <param name="serviceProvider">Root provider used to create a scope per message; must not be null.</param>
/// <param name="options">Actor configuration (stuck threshold, KPI interval, relay timeout); must not be null.</param>
/// <param name="logger">Logger for diagnostics and error reporting; must not be null.</param>
/// <exception cref="ArgumentNullException">Any argument is null.</exception>
public SiteCallAuditActor(
    IServiceProvider serviceProvider,
    SiteCallAuditOptions options,
    ILogger<SiteCallAuditActor> logger)
{
    // Validate in parameter order before touching any state.
    ArgumentNullException.ThrowIfNull(serviceProvider);
    ArgumentNullException.ThrowIfNull(options);
    ArgumentNullException.ThrowIfNull(logger);

    _logger = logger;
    _options = options;
    _serviceProvider = serviceProvider;

    RegisterHandlers();
}
|
|
|
|
/// <summary>
/// Installs the message handlers shared by both constructors: the upsert
/// ingest path, the Task 4 read side (query, detail, global and per-site KPI),
/// and the Task 5 central→site relay (transport registration, Retry, Discard).
/// Handlers that reply asynchronously capture <c>Sender</c> up front and
/// <c>PipeTo</c> the result back to it.
/// </summary>
private void RegisterHandlers()
{
    // Write side: telemetry ingest.
    ReceiveAsync<UpsertSiteCallCommand>(OnUpsertAsync);

    // Read side: query / detail / KPI.
    Receive<SiteCallQueryRequest>(HandleQuery);
    Receive<SiteCallDetailRequest>(HandleDetail);
    Receive<SiteCallKpiRequest>(HandleKpi);
    Receive<PerSiteSiteCallKpiRequest>(HandlePerSiteKpi);

    // Task 5 (#22): central→site Retry/Discard relay for parked cached calls.
    Receive<RegisterCentralCommunication>(OnRegisterCentralCommunication);
    Receive<RetrySiteCallRequest>(HandleRetrySiteCall);
    Receive<DiscardSiteCallRequest>(HandleDiscardSiteCall);

    // Stores the transport actor so later relay requests have a route to sites.
    void OnRegisterCentralCommunication(RegisterCentralCommunication registration)
    {
        _centralCommunication = registration.CentralCommunication;
        _logger.LogInformation("SiteCallAudit registered central→site communication transport");
    }
}
|
|
|
|
/// <summary>
/// SiteCallAudit-001: supervision strategy applied to this actor's
/// <em>children</em>, not to the actor itself. There are no children today, so
/// the override currently has no effect. It is a one-for-one strategy built on
/// <see cref="Akka.Actor.SupervisorStrategy.DefaultDecider"/> (Restart on most
/// exceptions; Stop on <see cref="ActorInitializationException"/> /
/// <see cref="ActorKilledException"/>) with <c>maxNrOfRetries: 0</c>, so a
/// future child that throws is Stopped on its first failure. This actor's own
/// resilience comes from the <c>try/catch</c> in <see cref="OnUpsertAsync"/>
/// and from the parent
/// <see cref="Akka.Cluster.Tools.Singleton.ClusterSingletonManager"/>.
/// </summary>
/// <returns>The one-for-one strategy described above.</returns>
protected override SupervisorStrategy SupervisorStrategy() =>
    new OneForOneStrategy(
        maxNrOfRetries: 0,
        withinTimeRange: TimeSpan.Zero,
        decider: Akka.Actor.SupervisorStrategy.DefaultDecider);
|
|
|
|
/// <summary>
/// Persists one <see cref="UpsertSiteCallCommand"/> and acknowledges it to the sender.
/// </summary>
/// <remarks>
/// The row is stamped with the current UTC time as <c>IngestedAtUtc</c> and
/// handed to <see cref="ISiteCallAuditRepository.UpsertAsync"/>. On success the
/// sender receives an <see cref="UpsertSiteCallReply"/> with
/// <c>Accepted=true</c>. Any exception — including a failure to create the DI
/// scope or to resolve the repository — is logged and answered with
/// <c>Accepted=false</c>. Nothing is rethrown, so neither a storage fault nor a
/// wiring fault takes the central singleton down or leaves the sender without
/// a reply.
/// </remarks>
/// <param name="cmd">The upsert command carrying the site call row to persist.</param>
/// <returns>A task that completes once the reply has been sent and any scope disposed.</returns>
private async Task OnUpsertAsync(UpsertSiteCallCommand cmd)
{
    // Sender is captured before the first await — Akka resets Sender between
    // message dispatches, so a post-await Tell would go to DeadLetters.
    var replyTo = Sender;
    var id = cmd.SiteCall.TrackedOperationId;

    // Declared outside the try so the finally block can dispose it; stays null
    // in injected-repository (test) mode, which needs no scope.
    IServiceScope? scope = null;

    try
    {
        // Scope-per-message mirrors AuditLogIngestActor — the production EF
        // repository is scoped. Resolution happens INSIDE the try so that a DI
        // failure is reported as Accepted=false (instead of escaping the
        // handler) and a scope created just before the failure is disposed.
        ISiteCallAuditRepository repository;
        if (_injectedRepository is not null)
        {
            repository = _injectedRepository;
        }
        else
        {
            scope = _serviceProvider!.CreateScope();
            repository = scope.ServiceProvider.GetRequiredService<ISiteCallAuditRepository>();
        }

        // SiteCallAudit-003: stamp IngestedAtUtc at central-side persist time
        // on every upsert. Callers cannot in general know they are running on
        // central, so the actor owns the stamp.
        var siteCall = cmd.SiteCall with { IngestedAtUtc = DateTime.UtcNow };
        await repository.UpsertAsync(siteCall).ConfigureAwait(false);
        replyTo.Tell(new UpsertSiteCallReply(id, Accepted: true));
    }
    catch (Exception ex)
    {
        // Per CLAUDE.md: audit-write failure NEVER aborts the user-facing
        // action — log and reply Accepted=false; do NOT rethrow (the central
        // singleton MUST stay alive).
        _logger.LogError(ex, "SiteCallAudit upsert failed for {TrackedOperationId}", id);
        replyTo.Tell(new UpsertSiteCallReply(id, Accepted: false));
    }
    finally
    {
        scope?.Dispose();
    }
}
|
|
|
|
// ── Task 4: read-side (query / detail / KPI) ──
|
|
|
|
/// <summary>
/// Entry point for a paginated, filtered query over the <c>SiteCalls</c>
/// table. Captures the sender and the current UTC time, starts
/// <see cref="QueryAsync"/>, and pipes its result back to the sender. If the
/// query task faults, the sender instead receives a
/// <see cref="SiteCallQueryResponse"/> with <c>Success=false</c>, the base
/// exception's message, an empty list and no next-page cursor.
/// </summary>
/// <param name="request">The query request (filters, page size, keyset cursor).</param>
private void HandleQuery(SiteCallQueryRequest request)
{
    var requestedAtUtc = DateTime.UtcNow;
    var replyTo = Sender;

    var pendingPage = QueryAsync(request, requestedAtUtc);

    pendingPage.PipeTo(
        replyTo,
        success: page => page,
        failure: fault => new SiteCallQueryResponse(
            request.CorrelationId,
            Success: false,
            ErrorMessage: fault.GetBaseException().Message,
            SiteCalls: Array.Empty<SiteCallSummary>(),
            NextAfterCreatedAtUtc: null,
            NextAfterId: null));
}
|
|
|
|
/// <summary>
/// Builds the repository filter and keyset paging from <paramref name="request"/>,
/// runs the query, and maps the rows to a successful
/// <see cref="SiteCallQueryResponse"/>.
/// </summary>
/// <param name="request">The query request; blank string filters are treated as "no constraint".</param>
/// <param name="now">Reference time from which the stuck cutoff is derived.</param>
/// <returns>
/// The mapped page. The next-page cursor is taken from the last row returned,
/// or is null when the page is empty.
/// </returns>
private async Task<SiteCallQueryResponse> QueryAsync(SiteCallQueryRequest request, DateTime now)
{
    var stuckCutoff = now - _options.StuckAgeThreshold;

    var filter = new SiteCallQueryFilter(
        Channel: NullIfBlank(request.ChannelFilter),
        SourceSite: NullIfBlank(request.SourceSiteFilter),
        Status: NullIfBlank(request.StatusFilter),
        Target: NullIfBlank(request.TargetKeyword),
        FromUtc: request.FromUtc,
        ToUtc: request.ToUtc,
        // StuckOnly is pushed down to the repository via StuckCutoffUtc
        // (TerminalAtUtc IS NULL AND CreatedAtUtc < cutoff), so it composes
        // with the keyset cursor instead of being filtered after paging.
        StuckCutoffUtc: request.StuckOnly ? stuckCutoff : null,
        SourceNode: NullIfBlank(request.SourceNodeFilter));

    // Requested page size is bounded to [1, MaxPageSize].
    var paging = new SiteCallPaging(
        PageSize: Math.Clamp(request.PageSize, 1, MaxPageSize),
        AfterCreatedAtUtc: request.AfterCreatedAtUtc,
        AfterId: request.AfterId is { } id ? new TrackedOperationId(id) : null);

    var (scope, repository) = ResolveRepository();

    // scope is null in injected-repository mode; a using over null disposes nothing.
    using (scope)
    {
        var rows = await repository.QueryAsync(filter, paging).ConfigureAwait(false);

        var summaries = rows
            .Select(row => ToSummary(row, stuckCutoff))
            .ToList();

        // The next-page cursor is the last row of the materialised page.
        var lastRow = rows.Count == 0 ? null : rows[^1];

        return new SiteCallQueryResponse(
            request.CorrelationId,
            Success: true,
            ErrorMessage: null,
            SiteCalls: summaries,
            NextAfterCreatedAtUtc: lastRow?.CreatedAtUtc,
            NextAfterId: lastRow?.TrackedOperationId.Value);
    }
}
|
|
|
|
/// <summary>
/// Entry point for a full-detail lookup of a single cached call (backs the
/// report detail modal). Starts <see cref="DetailAsync"/> and pipes its result
/// to the captured sender. If the lookup task faults, the sender receives a
/// <see cref="SiteCallDetailResponse"/> with <c>Success=false</c>, the base
/// exception's message and no detail.
/// </summary>
/// <param name="request">The detail request identifying the tracked operation.</param>
private void HandleDetail(SiteCallDetailRequest request)
{
    var replyTo = Sender;

    var pendingDetail = DetailAsync(request);

    pendingDetail.PipeTo(
        replyTo,
        success: detail => detail,
        failure: fault => new SiteCallDetailResponse(
            request.CorrelationId,
            Success: false,
            ErrorMessage: fault.GetBaseException().Message,
            Detail: null));
}
|
|
|
|
/// <summary>
/// Loads one <c>SiteCalls</c> row by tracked operation id and maps it to a
/// <see cref="SiteCallDetailResponse"/>.
/// </summary>
/// <param name="request">The detail request identifying the tracked operation.</param>
/// <returns>
/// <c>Success=true</c> with the mapped detail when the row exists; otherwise
/// <c>Success=false</c> with the message "site call not found".
/// </returns>
private async Task<SiteCallDetailResponse> DetailAsync(SiteCallDetailRequest request)
{
    var (scope, repository) = ResolveRepository();

    // scope is null in injected-repository mode; a using over null disposes nothing.
    using (scope)
    {
        var key = new TrackedOperationId(request.TrackedOperationId);
        var row = await repository.GetAsync(key).ConfigureAwait(false);

        return row is null
            ? new SiteCallDetailResponse(
                request.CorrelationId,
                Success: false,
                ErrorMessage: "site call not found",
                Detail: null)
            : new SiteCallDetailResponse(
                request.CorrelationId,
                Success: true,
                ErrorMessage: null,
                Detail: ToDetail(row));
    }
}
|
|
|
|
/// <summary>
/// Entry point for a global KPI snapshot. Derives the stuck cutoff from
/// <see cref="SiteCallAuditOptions.StuckAgeThreshold"/> and the
/// failed/delivered interval bound from
/// <see cref="SiteCallAuditOptions.KpiInterval"/>, both relative to the current
/// UTC time, then pipes the result of <see cref="KpiAsync"/> to the sender. If
/// the KPI task faults, the sender receives a <see cref="SiteCallKpiResponse"/>
/// with <c>Success=false</c>, the base exception's message and zeroed counters.
/// </summary>
/// <param name="request">The KPI request (supplies the correlation id).</param>
private void HandleKpi(SiteCallKpiRequest request)
{
    var replyTo = Sender;

    var nowUtc = DateTime.UtcNow;
    var intervalSince = nowUtc - _options.KpiInterval;
    var stuckCutoff = nowUtc - _options.StuckAgeThreshold;

    var pendingKpis = KpiAsync(request.CorrelationId, stuckCutoff, intervalSince);

    pendingKpis.PipeTo(
        replyTo,
        success: kpis => kpis,
        failure: fault => new SiteCallKpiResponse(
            request.CorrelationId,
            Success: false,
            ErrorMessage: fault.GetBaseException().Message,
            BufferedCount: 0,
            ParkedCount: 0,
            FailedLastInterval: 0,
            DeliveredLastInterval: 0,
            OldestPendingAge: null,
            StuckCount: 0));
}
|
|
|
|
/// <summary>
/// Computes the global KPI snapshot through the repository and wraps it in a
/// successful <see cref="SiteCallKpiResponse"/>.
/// </summary>
/// <param name="correlationId">Correlation id echoed back in the response.</param>
/// <param name="stuckCutoff">Cutoff passed to the repository for the stuck count.</param>
/// <param name="intervalSince">Lower bound passed to the repository for the failed/delivered interval counts.</param>
/// <returns>A response with <c>Success=true</c> carrying the snapshot values.</returns>
private async Task<SiteCallKpiResponse> KpiAsync(
    string correlationId, DateTime stuckCutoff, DateTime intervalSince)
{
    var (scope, repository) = ResolveRepository();

    // scope is null in injected-repository mode; a using over null disposes nothing.
    using (scope)
    {
        var kpis = await repository
            .ComputeKpisAsync(stuckCutoff, intervalSince)
            .ConfigureAwait(false);

        return new SiteCallKpiResponse(
            correlationId,
            Success: true,
            ErrorMessage: null,
            BufferedCount: kpis.BufferedCount,
            ParkedCount: kpis.ParkedCount,
            FailedLastInterval: kpis.FailedLastInterval,
            DeliveredLastInterval: kpis.DeliveredLastInterval,
            OldestPendingAge: kpis.OldestPendingAge,
            StuckCount: kpis.StuckCount);
    }
}
|
|
|
|
/// <summary>
/// Entry point for a per-source-site KPI request. Uses the same stuck cutoff
/// and interval bound derivation as <see cref="HandleKpi"/>, then pipes the
/// result of <see cref="PerSiteKpiAsync"/> to the sender. If the KPI task
/// faults, the sender receives a <see cref="PerSiteSiteCallKpiResponse"/> with
/// <c>Success=false</c>, the base exception's message and an empty site list.
/// </summary>
/// <param name="request">The per-site KPI request (supplies the correlation id).</param>
private void HandlePerSiteKpi(PerSiteSiteCallKpiRequest request)
{
    var replyTo = Sender;

    var nowUtc = DateTime.UtcNow;
    var intervalSince = nowUtc - _options.KpiInterval;
    var stuckCutoff = nowUtc - _options.StuckAgeThreshold;

    var pendingKpis = PerSiteKpiAsync(request.CorrelationId, stuckCutoff, intervalSince);

    pendingKpis.PipeTo(
        replyTo,
        success: kpis => kpis,
        failure: fault => new PerSiteSiteCallKpiResponse(
            request.CorrelationId,
            Success: false,
            ErrorMessage: fault.GetBaseException().Message,
            Sites: Array.Empty<SiteCallSiteKpiSnapshot>()));
}
|
|
|
|
/// <summary>
/// Computes the per-source-site KPI snapshots through the repository and wraps
/// them in a successful <see cref="PerSiteSiteCallKpiResponse"/>.
/// </summary>
/// <param name="correlationId">Correlation id echoed back in the response.</param>
/// <param name="stuckCutoff">Cutoff passed to the repository for the stuck counts.</param>
/// <param name="intervalSince">Lower bound passed to the repository for the interval counts.</param>
/// <returns>A response with <c>Success=true</c> carrying the per-site snapshots.</returns>
private async Task<PerSiteSiteCallKpiResponse> PerSiteKpiAsync(
    string correlationId, DateTime stuckCutoff, DateTime intervalSince)
{
    var (scope, repository) = ResolveRepository();

    // scope is null in injected-repository mode; a using over null disposes nothing.
    using (scope)
    {
        var perSite = await repository
            .ComputePerSiteKpisAsync(stuckCutoff, intervalSince)
            .ConfigureAwait(false);

        return new PerSiteSiteCallKpiResponse(
            correlationId,
            Success: true,
            ErrorMessage: null,
            Sites: perSite);
    }
}
|
|
|
|
// ── Task 5: central→site Retry/Discard relay ──
|
|
|
|
/// <summary>
/// Relays an operator Retry of a parked cached call to its owning site. The
/// site is the source of truth — this handler never writes the central
/// <c>SiteCalls</c> mirror row.
/// </summary>
/// <remarks>
/// A <see cref="RetryParkedOperation"/> is wrapped in a
/// <see cref="SiteEnvelope"/> addressed to the request's <c>SourceSite</c> and
/// sent to the registered transport actor with an Ask bounded by
/// <c>RelayTimeout</c>. The site's <see cref="ParkedOperationActionAck"/> is
/// mapped by <see cref="MapRetryResponse"/>; a faulted Ask (for example a
/// timeout) is mapped by <see cref="MapRetryFailure"/>. When no transport has
/// been registered yet the sender is answered immediately with a
/// <see cref="SiteCallRelayOutcome.SiteUnreachable"/> response, so the Central
/// UI can tell "site offline" from "operation failed".
/// </remarks>
/// <param name="request">The retry request naming the tracked operation and its source site.</param>
private void HandleRetrySiteCall(RetrySiteCallRequest request)
{
    var replyTo = Sender;

    if (_centralCommunication is not { } transport)
    {
        // No transport registered yet — there is genuinely no route to any
        // site, so the only honest answer is unreachable.
        _logger.LogWarning(
            "RetrySiteCall {TrackedOperationId} for site {SourceSite} arrived before the "
            + "central→site transport was registered; reporting site unreachable",
            request.TrackedOperationId, request.SourceSite);
        replyTo.Tell(UnreachableRetry(request.CorrelationId));
        return;
    }

    var operationId = new TrackedOperationId(request.TrackedOperationId);
    var envelope = new SiteEnvelope(
        request.SourceSite,
        new RetryParkedOperation(request.CorrelationId, operationId));

    transport
        .Ask<ParkedOperationActionAck>(envelope, _options.RelayTimeout)
        .PipeTo(
            replyTo,
            success: ack => MapRetryResponse(request.CorrelationId, ack),
            failure: fault => MapRetryFailure(request.CorrelationId, request.SourceSite, fault));
}
|
|
|
|
/// <summary>
/// Relays an operator Discard of a parked cached call to its owning site.
/// Follows the same flow as <see cref="HandleRetrySiteCall"/>: a
/// <see cref="DiscardParkedOperation"/> is wrapped in a
/// <see cref="SiteEnvelope"/>, sent to the registered transport with an Ask
/// bounded by <c>RelayTimeout</c>, and the ack or fault is mapped onto a
/// <see cref="DiscardSiteCallResponse"/>. With no transport registered the
/// sender is answered immediately with a site-unreachable response.
/// </summary>
/// <param name="request">The discard request naming the tracked operation and its source site.</param>
private void HandleDiscardSiteCall(DiscardSiteCallRequest request)
{
    var replyTo = Sender;

    if (_centralCommunication is not { } transport)
    {
        // No transport registered yet, so there is no route to any site.
        _logger.LogWarning(
            "DiscardSiteCall {TrackedOperationId} for site {SourceSite} arrived before the "
            + "central→site transport was registered; reporting site unreachable",
            request.TrackedOperationId, request.SourceSite);
        replyTo.Tell(UnreachableDiscard(request.CorrelationId));
        return;
    }

    var operationId = new TrackedOperationId(request.TrackedOperationId);
    var envelope = new SiteEnvelope(
        request.SourceSite,
        new DiscardParkedOperation(request.CorrelationId, operationId));

    transport
        .Ask<ParkedOperationActionAck>(envelope, _options.RelayTimeout)
        .PipeTo(
            replyTo,
            success: ack => MapDiscardResponse(request.CorrelationId, ack),
            failure: fault => MapDiscardFailure(request.CorrelationId, request.SourceSite, fault));
}
|
|
|
|
/// <summary>
/// Converts the site's <see cref="ParkedOperationActionAck"/> for a Retry into
/// a <see cref="RetrySiteCallResponse"/>. The outcome comes from
/// <see cref="ClassifyAck"/>; <c>Success</c> is true only for
/// <see cref="SiteCallRelayOutcome.Applied"/>, and <c>SiteReachable</c> is
/// always true because an ack means the site answered.
/// </summary>
/// <param name="correlationId">Correlation id echoed back in the response.</param>
/// <param name="ack">The acknowledgement received from the site.</param>
/// <returns>The response to send to the original requester.</returns>
private static RetrySiteCallResponse MapRetryResponse(string correlationId, ParkedOperationActionAck ack)
{
    var classified = ClassifyAck(ack);
    var applied = classified == SiteCallRelayOutcome.Applied;
    var detail = AckErrorMessage(classified, ack);

    return new RetrySiteCallResponse(
        correlationId,
        classified,
        Success: applied,
        SiteReachable: true,
        ErrorMessage: detail);
}
|
|
|
|
/// <summary>
/// Converts the site's <see cref="ParkedOperationActionAck"/> for a Discard
/// into a <see cref="DiscardSiteCallResponse"/>. The outcome comes from
/// <see cref="ClassifyAck"/>; <c>Success</c> is true only for
/// <see cref="SiteCallRelayOutcome.Applied"/>, and <c>SiteReachable</c> is
/// always true because an ack means the site answered.
/// </summary>
/// <param name="correlationId">Correlation id echoed back in the response.</param>
/// <param name="ack">The acknowledgement received from the site.</param>
/// <returns>The response to send to the original requester.</returns>
private static DiscardSiteCallResponse MapDiscardResponse(string correlationId, ParkedOperationActionAck ack)
{
    var classified = ClassifyAck(ack);
    var applied = classified == SiteCallRelayOutcome.Applied;
    var detail = AckErrorMessage(classified, ack);

    return new DiscardSiteCallResponse(
        correlationId,
        classified,
        Success: applied,
        SiteReachable: true,
        ErrorMessage: detail);
}
|
|
|
|
/// <summary>
/// Maps a faulted Retry relay onto a site-unreachable response. Every fault
/// from the Ask is treated the same way: it is logged as a warning and
/// reported as <see cref="SiteCallRelayOutcome.SiteUnreachable"/>.
/// </summary>
/// <param name="correlationId">Correlation id echoed back in the response.</param>
/// <param name="sourceSite">Site the relay was addressed to; used for logging only.</param>
/// <param name="ex">The exception the relay task faulted with.</param>
/// <returns>The site-unreachable Retry response.</returns>
private RetrySiteCallResponse MapRetryFailure(string correlationId, string sourceSite, Exception ex)
{
    _logger.LogWarning(
        ex,
        "Retry relay to site {SourceSite} did not complete; reporting site unreachable",
        sourceSite);

    return UnreachableRetry(correlationId);
}
|
|
|
|
/// <summary>
/// Maps a faulted Discard relay onto a site-unreachable response. Every fault
/// from the Ask is treated the same way: it is logged as a warning and
/// reported as <see cref="SiteCallRelayOutcome.SiteUnreachable"/>.
/// </summary>
/// <param name="correlationId">Correlation id echoed back in the response.</param>
/// <param name="sourceSite">Site the relay was addressed to; used for logging only.</param>
/// <param name="ex">The exception the relay task faulted with.</param>
/// <returns>The site-unreachable Discard response.</returns>
private DiscardSiteCallResponse MapDiscardFailure(string correlationId, string sourceSite, Exception ex)
{
    _logger.LogWarning(
        ex,
        "Discard relay to site {SourceSite} did not complete; reporting site unreachable",
        sourceSite);

    return UnreachableDiscard(correlationId);
}
|
|
|
|
/// <summary>
/// Derives the relay outcome from a site ack.
/// </summary>
/// <param name="ack">The acknowledgement received from the site.</param>
/// <returns>
/// <see cref="SiteCallRelayOutcome.Applied"/> when <c>Applied</c> is true;
/// otherwise <see cref="SiteCallRelayOutcome.NotParked"/> when the ack carries
/// no error message (the site had nothing parked), or
/// <see cref="SiteCallRelayOutcome.OperationFailed"/> when it does (the site
/// could not apply the action).
/// </returns>
private static SiteCallRelayOutcome ClassifyAck(ParkedOperationActionAck ack) =>
    ack.Applied
        ? SiteCallRelayOutcome.Applied
        : ack.ErrorMessage is null
            ? SiteCallRelayOutcome.NotParked
            : SiteCallRelayOutcome.OperationFailed;
|
|
|
|
/// <summary>
/// Picks the user-facing detail text that accompanies a relay outcome.
/// </summary>
/// <param name="outcome">The outcome, normally produced by <see cref="ClassifyAck"/>.</param>
/// <param name="ack">The site ack; only its <c>ErrorMessage</c> is read.</param>
/// <returns>
/// Null for <see cref="SiteCallRelayOutcome.Applied"/>; a fixed explanation for
/// <see cref="SiteCallRelayOutcome.NotParked"/>; the ack's own error text for
/// <see cref="SiteCallRelayOutcome.OperationFailed"/>; the shared
/// site-unreachable text for <see cref="SiteCallRelayOutcome.SiteUnreachable"/>.
/// </returns>
/// <exception cref="ArgumentOutOfRangeException">
/// <paramref name="outcome"/> is not a known <see cref="SiteCallRelayOutcome"/> value.
/// </exception>
private static string? AckErrorMessage(SiteCallRelayOutcome outcome, ParkedOperationActionAck ack)
{
    return outcome switch
    {
        SiteCallRelayOutcome.Applied => null,
        SiteCallRelayOutcome.NotParked =>
            "The operation is no longer parked at the site (already delivered, discarded, or retrying).",
        SiteCallRelayOutcome.OperationFailed => ack.ErrorMessage,
        // ClassifyAck never yields SiteUnreachable — unreachable responses are
        // built directly by UnreachableRetry/UnreachableDiscard — so this arm
        // is defensive only. It returns the same shared text those helpers use
        // (rather than throwing, or surfacing a possibly-null ack message) so
        // that a future refactor breaking the invariant still produces a
        // coherent message instead of crashing the relay path.
        SiteCallRelayOutcome.SiteUnreachable => SiteUnreachableMessage,
        _ => throw new ArgumentOutOfRangeException(
            nameof(outcome), outcome, "unknown SiteCallRelayOutcome"),
    };
}
|
|
|
|
/// <summary>Shared "site unreachable" detail text for both relay directions.</summary>
|
|
private const string SiteUnreachableMessage =
|
|
"The owning site is unreachable; the action was not applied. Retry when the site is back online.";
|
|
|
|
/// <summary>
/// Builds the Retry response used when the owning site could not be reached:
/// outcome <see cref="SiteCallRelayOutcome.SiteUnreachable"/>, not successful,
/// site not reachable, with the shared unreachable detail text.
/// </summary>
/// <param name="correlationId">Correlation id echoed back in the response.</param>
/// <returns>The site-unreachable Retry response.</returns>
private static RetrySiteCallResponse UnreachableRetry(string correlationId) =>
    new(
        correlationId,
        SiteCallRelayOutcome.SiteUnreachable,
        Success: false,
        SiteReachable: false,
        ErrorMessage: SiteUnreachableMessage);
|
|
|
|
/// <summary>
/// Builds the Discard response used when the owning site could not be reached:
/// outcome <see cref="SiteCallRelayOutcome.SiteUnreachable"/>, not successful,
/// site not reachable, with the shared unreachable detail text.
/// </summary>
/// <param name="correlationId">Correlation id echoed back in the response.</param>
/// <returns>The site-unreachable Discard response.</returns>
private static DiscardSiteCallResponse UnreachableDiscard(string correlationId) =>
    new(
        correlationId,
        SiteCallRelayOutcome.SiteUnreachable,
        Success: false,
        SiteReachable: false,
        ErrorMessage: SiteUnreachableMessage);
|
|
|
|
/// <summary>
/// Resolves an <see cref="ISiteCallAuditRepository"/> for one read message.
/// </summary>
/// <remarks>
/// In test mode the injected instance is returned with a null scope. In
/// production a fresh DI scope is created and returned alongside the
/// repository resolved from it; the caller owns that scope and must dispose it
/// once the read completes. If resolving the repository throws, the scope
/// created for it is disposed here before the exception propagates, so a
/// failed resolution does not leak a scope.
/// </remarks>
/// <returns>
/// The scope to dispose (null in test mode) and the repository to use.
/// </returns>
private (IServiceScope? Scope, ISiteCallAuditRepository Repository) ResolveRepository()
{
    if (_injectedRepository is not null)
    {
        return (null, _injectedRepository);
    }

    var scope = _serviceProvider!.CreateScope();
    try
    {
        return (scope, scope.ServiceProvider.GetRequiredService<ISiteCallAuditRepository>());
    }
    catch
    {
        // The caller never receives the scope on this path, so it must be
        // released here; rethrow so the caller's failure handling still runs.
        scope.Dispose();
        throw;
    }
}
|
|
|
|
/// <summary>
/// Decides whether a cached call counts as stuck: it must still be
/// non-terminal and have been created before <paramref name="stuckCutoff"/>.
/// </summary>
/// <remarks>
/// Non-terminal is keyed off <see cref="SiteCall.TerminalAtUtc"/> being
/// <c>null</c> rather than off the status string. The <c>SiteCalls</c> mirror
/// stores <c>AuditStatus</c>-derived status strings
/// (<c>Attempted</c>/<c>Delivered</c>/<c>Parked</c>/...), not the
/// tracking-lifecycle <c>Pending</c>/<c>Retrying</c> names used by the spec's
/// KPI section, so no status string means "buffered". <c>TerminalAtUtc</c> is
/// the entity's own active/terminal discriminator and matches the repository
/// KPI counts and <c>PurgeTerminalAsync</c>.
/// </remarks>
/// <param name="row">The site call row to test.</param>
/// <param name="stuckCutoff">Rows created strictly before this instant are old enough to be stuck.</param>
/// <returns>True when the row is non-terminal and older than the cutoff.</returns>
private static bool IsStuck(SiteCall row, DateTime stuckCutoff)
{
    // A row that has reached a terminal state can never be stuck.
    if (row.TerminalAtUtc is not null)
    {
        return false;
    }

    return row.CreatedAtUtc < stuckCutoff;
}
|
|
|
|
/// <summary>
/// Maps a <see cref="SiteCall"/> row to the <see cref="SiteCallSummary"/> sent
/// in query responses, copying the row's fields and computing <c>IsStuck</c>
/// via <see cref="IsStuck"/>.
/// </summary>
/// <param name="row">The row to map.</param>
/// <param name="stuckCutoff">Cutoff used to compute the summary's <c>IsStuck</c> flag.</param>
/// <returns>The summary for <paramref name="row"/>.</returns>
private static SiteCallSummary ToSummary(SiteCall row, DateTime stuckCutoff) =>
    new(
        TrackedOperationId: row.TrackedOperationId.Value,
        SourceSite: row.SourceSite,
        Channel: row.Channel,
        Target: row.Target,
        Status: row.Status,
        RetryCount: row.RetryCount,
        LastError: row.LastError,
        HttpStatus: row.HttpStatus,
        CreatedAtUtc: row.CreatedAtUtc,
        UpdatedAtUtc: row.UpdatedAtUtc,
        TerminalAtUtc: row.TerminalAtUtc,
        IsStuck: IsStuck(row, stuckCutoff),
        SourceNode: row.SourceNode);
|
|
|
|
/// <summary>
/// Maps a <see cref="SiteCall"/> row to the <see cref="SiteCallDetail"/> sent
/// in detail responses. Unlike the summary it carries <c>IngestedAtUtc</c> and
/// has no <c>IsStuck</c> flag.
/// </summary>
/// <param name="row">The row to map.</param>
/// <returns>The detail view of <paramref name="row"/>.</returns>
private static SiteCallDetail ToDetail(SiteCall row) =>
    new(
        TrackedOperationId: row.TrackedOperationId.Value,
        SourceSite: row.SourceSite,
        Channel: row.Channel,
        Target: row.Target,
        Status: row.Status,
        RetryCount: row.RetryCount,
        LastError: row.LastError,
        HttpStatus: row.HttpStatus,
        CreatedAtUtc: row.CreatedAtUtc,
        UpdatedAtUtc: row.UpdatedAtUtc,
        TerminalAtUtc: row.TerminalAtUtc,
        IngestedAtUtc: row.IngestedAtUtc,
        SourceNode: row.SourceNode);
|
|
|
|
/// <summary>
/// Collapses a null, empty or whitespace-only filter string to <c>null</c>.
/// The repository's <see cref="SiteCallQueryFilter"/> treats <c>null</c> as
/// "no constraint", so a blank UI filter must not be passed through as text.
/// </summary>
/// <param name="value">The raw filter text, possibly null or blank.</param>
/// <returns>
/// <c>null</c> when <paramref name="value"/> is null, empty or whitespace;
/// otherwise <paramref name="value"/> unchanged (it is not trimmed).
/// </returns>
private static string? NullIfBlank(string? value) =>
    string.IsNullOrWhiteSpace(value) ? null : value;
|
|
}
|
|
|
|
/// <summary>
/// Message that hands the central→site command transport (the
/// <c>CentralCommunicationActor</c>) to the <see cref="SiteCallAuditActor"/>,
/// which stores it and uses it to relay Retry/Discard actions on parked cached
/// calls to their owning sites. Sent by the Host after both actors exist.
/// </summary>
/// <remarks>
/// Declared here rather than in Commons because it carries an
/// <see cref="IActorRef"/> and <c>ZB.MOM.WW.ScadaBridge.Commons</c> has no Akka
/// reference — the same rationale as <c>RegisterAuditIngest</c>.
/// </remarks>
/// <param name="CentralCommunication">Reference to the transport actor that routes envelopes to sites.</param>
public sealed record RegisterCentralCommunication(IActorRef CentralCommunication);
|