feat(sitecallaudit): central→site Retry/Discard relay for parked operations

This commit is contained in:
Joseph Doherty
2026-05-21 04:36:04 -04:00
parent ac1f73cf8a
commit 7816b840c1
13 changed files with 1025 additions and 1 deletions

View File

@@ -24,6 +24,11 @@
project reference is documented here so the actor's scope-per-message
GetRequiredService<ISiteCallAuditRepository>() compiles. -->
<ProjectReference Include="../ScadaLink.ConfigurationDatabase/ScadaLink.ConfigurationDatabase.csproj" />
<!-- Task 5 (#22): the central→site Retry/Discard relay routes RetryParkedOperation /
DiscardParkedOperation to the owning site via SiteEnvelope + CentralCommunicationActor,
the same transport every other central→site command uses. SiteEnvelope is defined
in ScadaLink.Communication (no cycle: Communication does not reference SiteCallAudit). -->
<ProjectReference Include="../ScadaLink.Communication/ScadaLink.Communication.csproj" />
</ItemGroup>
<ItemGroup>

View File

@@ -4,8 +4,10 @@ using Microsoft.Extensions.Logging;
using ScadaLink.Commons.Entities.Audit;
using ScadaLink.Commons.Interfaces.Repositories;
using ScadaLink.Commons.Messages.Audit;
using ScadaLink.Commons.Messages.RemoteQuery;
using ScadaLink.Commons.Types;
using ScadaLink.Commons.Types.Audit;
using ScadaLink.Communication;
namespace ScadaLink.SiteCallAudit;
@@ -52,6 +54,19 @@ public class SiteCallAuditActor : ReceiveActor
private readonly SiteCallAuditOptions _options;
private readonly ILogger<SiteCallAuditActor> _logger;
/// <summary>
/// Task 5 (#22): the central→site command transport — the
/// <c>CentralCommunicationActor</c>, which owns the per-site
/// <c>ClusterClient</c> map and routes a <see cref="SiteEnvelope"/> to the
/// owning site. Set via <see cref="RegisterCentralCommunication"/> by the
/// Host after both actors exist (this actor is a cluster singleton; the
/// transport actor is created separately). Null until registration
/// completes — a relay arriving before then is answered with a
/// <see cref="SiteCallRelayOutcome.SiteUnreachable"/> outcome, because there
/// is genuinely no route to any site yet.
/// </summary>
/// <remarks>
/// Written only by the <see cref="RegisterCentralCommunication"/> receive
/// handler; <see cref="HandleRetrySiteCall"/> and
/// <see cref="HandleDiscardSiteCall"/> null-check it before every relay.
/// </remarks>
private IActorRef? _centralCommunication;
/// <summary>
/// Test-mode constructor — injects a concrete repository instance whose
/// lifetime exceeds the test, so the actor reuses the same instance
@@ -110,6 +125,15 @@ public class SiteCallAuditActor : ReceiveActor
Receive<SiteCallDetailRequest>(HandleDetail);
Receive<SiteCallKpiRequest>(HandleKpi);
Receive<PerSiteSiteCallKpiRequest>(HandlePerSiteKpi);
// Task 5 (#22): central→site Retry/Discard relay for parked cached calls.
Receive<RegisterCentralCommunication>(msg =>
{
_centralCommunication = msg.CentralCommunication;
_logger.LogInformation("SiteCallAudit registered central→site communication transport");
});
Receive<RetrySiteCallRequest>(HandleRetrySiteCall);
Receive<DiscardSiteCallRequest>(HandleDiscardSiteCall);
}
/// <summary>
@@ -385,6 +409,175 @@ public class SiteCallAuditActor : ReceiveActor
}
}
// ── Task 5: central→site Retry/Discard relay ──
/// <summary>
/// Forwards an operator-initiated Retry of a parked cached call to the site
/// that owns it and replies to the requester with a
/// <see cref="RetrySiteCallResponse"/>.
/// </summary>
/// <remarks>
/// The handler performs no repository access — the central <c>SiteCalls</c>
/// mirror row is never written here; the site remains the source of truth.
/// The request is turned into a <see cref="RetryParkedOperation"/>, wrapped
/// in a <see cref="SiteEnvelope"/> addressed to <c>SourceSite</c>, and sent
/// to the registered transport with an Ask bounded by
/// <c>RelayTimeout</c>. The site's <see cref="ParkedOperationActionAck"/> is
/// mapped by <see cref="MapRetryResponse"/>; any Ask failure (including a
/// timeout) is mapped by <see cref="MapRetryFailure"/> to the distinct
/// <see cref="SiteCallRelayOutcome.SiteUnreachable"/> outcome so the caller
/// can tell "site offline" apart from "operation failed".
/// </remarks>
/// <param name="request">The retry request identifying the tracked operation and its owning site.</param>
private void HandleRetrySiteCall(RetrySiteCallRequest request)
{
    // Capture the requester before the asynchronous Ask so the piped reply
    // is addressed to whoever sent this request.
    var replyTo = Sender;

    if (_centralCommunication is { } transport)
    {
        var command = new RetryParkedOperation(
            request.CorrelationId, new TrackedOperationId(request.TrackedOperationId));

        transport
            .Ask<ParkedOperationActionAck>(
                new SiteEnvelope(request.SourceSite, command), _options.RelayTimeout)
            .PipeTo(
                replyTo,
                success: ack => MapRetryResponse(request.CorrelationId, ack),
                failure: ex => MapRetryFailure(request.CorrelationId, request.SourceSite, ex));
        return;
    }

    // No transport has been registered, so no site can be reached at all;
    // answer immediately with the unreachable outcome.
    _logger.LogWarning(
        "RetrySiteCall {TrackedOperationId} for site {SourceSite} arrived before the "
        + "central→site transport was registered; reporting site unreachable",
        request.TrackedOperationId, request.SourceSite);
    replyTo.Tell(UnreachableRetry(request.CorrelationId));
}
/// <summary>
/// Forwards an operator-initiated Discard of a parked cached call to the
/// site that owns it and replies to the requester with a
/// <see cref="DiscardSiteCallResponse"/>.
/// </summary>
/// <remarks>
/// Follows the same shape as <see cref="HandleRetrySiteCall"/>: no
/// repository access happens here, the request becomes a
/// <see cref="DiscardParkedOperation"/> inside a <see cref="SiteEnvelope"/>
/// addressed to <c>SourceSite</c>, and the Ask is bounded by
/// <c>RelayTimeout</c>. An ack is mapped by <see cref="MapDiscardResponse"/>;
/// any Ask failure (including a timeout) is mapped by
/// <see cref="MapDiscardFailure"/> to
/// <see cref="SiteCallRelayOutcome.SiteUnreachable"/>.
/// </remarks>
/// <param name="request">The discard request identifying the tracked operation and its owning site.</param>
private void HandleDiscardSiteCall(DiscardSiteCallRequest request)
{
    // Capture the requester before the asynchronous Ask so the piped reply
    // is addressed to whoever sent this request.
    var replyTo = Sender;

    if (_centralCommunication is { } transport)
    {
        var command = new DiscardParkedOperation(
            request.CorrelationId, new TrackedOperationId(request.TrackedOperationId));

        transport
            .Ask<ParkedOperationActionAck>(
                new SiteEnvelope(request.SourceSite, command), _options.RelayTimeout)
            .PipeTo(
                replyTo,
                success: ack => MapDiscardResponse(request.CorrelationId, ack),
                failure: ex => MapDiscardFailure(request.CorrelationId, request.SourceSite, ex));
        return;
    }

    // No transport has been registered, so no site can be reached at all;
    // answer immediately with the unreachable outcome.
    _logger.LogWarning(
        "DiscardSiteCall {TrackedOperationId} for site {SourceSite} arrived before the "
        + "central→site transport was registered; reporting site unreachable",
        request.TrackedOperationId, request.SourceSite);
    replyTo.Tell(UnreachableDiscard(request.CorrelationId));
}
/// <summary>
/// Translates the site's <see cref="ParkedOperationActionAck"/> for a Retry
/// into a <see cref="RetrySiteCallResponse"/>.
/// </summary>
/// <remarks>
/// The outcome comes from <see cref="ClassifyAck"/>:
/// <see cref="SiteCallRelayOutcome.Applied"/> when the action was applied,
/// <see cref="SiteCallRelayOutcome.NotParked"/> for a clean no-op
/// (<c>Applied=false</c>, no error), and
/// <see cref="SiteCallRelayOutcome.OperationFailed"/> when the ack carries an
/// error. <c>SiteReachable</c> is always <c>true</c> because an ack was
/// received; <c>Success</c> is <c>true</c> only for the applied outcome.
/// </remarks>
/// <param name="correlationId">Correlation id echoed back on the response.</param>
/// <param name="ack">The ack returned by the site.</param>
/// <returns>The response to send to the original requester.</returns>
private static RetrySiteCallResponse MapRetryResponse(string correlationId, ParkedOperationActionAck ack)
{
    SiteCallRelayOutcome classified = ClassifyAck(ack);
    bool wasApplied = classified == SiteCallRelayOutcome.Applied;
    string? detail = AckErrorMessage(classified, ack);

    return new RetrySiteCallResponse(
        correlationId,
        classified,
        Success: wasApplied,
        SiteReachable: true,
        ErrorMessage: detail);
}
/// <summary>
/// Translates the site's <see cref="ParkedOperationActionAck"/> for a Discard
/// into a <see cref="DiscardSiteCallResponse"/>.
/// </summary>
/// <remarks>
/// The outcome comes from <see cref="ClassifyAck"/>. <c>SiteReachable</c> is
/// always <c>true</c> because an ack was received; <c>Success</c> is
/// <c>true</c> only for <see cref="SiteCallRelayOutcome.Applied"/>.
/// </remarks>
/// <param name="correlationId">Correlation id echoed back on the response.</param>
/// <param name="ack">The ack returned by the site.</param>
/// <returns>The response to send to the original requester.</returns>
private static DiscardSiteCallResponse MapDiscardResponse(string correlationId, ParkedOperationActionAck ack)
{
    SiteCallRelayOutcome classified = ClassifyAck(ack);
    bool wasApplied = classified == SiteCallRelayOutcome.Applied;
    string? detail = AckErrorMessage(classified, ack);

    return new DiscardSiteCallResponse(
        correlationId,
        classified,
        Success: wasApplied,
        SiteReachable: true,
        ErrorMessage: detail);
}
/// <summary>
/// Converts a failed Retry relay (the Ask did not produce an ack) into the
/// site-unreachable response, logging the exception as a warning.
/// </summary>
/// <param name="correlationId">Correlation id echoed back on the response.</param>
/// <param name="sourceSite">Site the relay was addressed to; used only for the log entry.</param>
/// <param name="ex">The exception that ended the Ask.</param>
/// <returns>A response with the <see cref="SiteCallRelayOutcome.SiteUnreachable"/> outcome.</returns>
private RetrySiteCallResponse MapRetryFailure(string correlationId, string sourceSite, Exception ex)
{
    var unreachable = UnreachableRetry(correlationId);

    _logger.LogWarning(
        ex,
        "Retry relay to site {SourceSite} did not complete; reporting site unreachable",
        sourceSite);

    return unreachable;
}
/// <summary>
/// Converts a failed Discard relay (the Ask did not produce an ack) into the
/// site-unreachable response, logging the exception as a warning.
/// </summary>
/// <param name="correlationId">Correlation id echoed back on the response.</param>
/// <param name="sourceSite">Site the relay was addressed to; used only for the log entry.</param>
/// <param name="ex">The exception that ended the Ask.</param>
/// <returns>A response with the <see cref="SiteCallRelayOutcome.SiteUnreachable"/> outcome.</returns>
private DiscardSiteCallResponse MapDiscardFailure(string correlationId, string sourceSite, Exception ex)
{
    var unreachable = UnreachableDiscard(correlationId);

    _logger.LogWarning(
        ex,
        "Discard relay to site {SourceSite} did not complete; reporting site unreachable",
        sourceSite);

    return unreachable;
}
/// <summary>
/// Derives the relay outcome from a site ack.
/// </summary>
/// <remarks>
/// <c>Applied=true</c> yields <see cref="SiteCallRelayOutcome.Applied"/>.
/// Otherwise a null <c>ErrorMessage</c> yields
/// <see cref="SiteCallRelayOutcome.NotParked"/> (the site had nothing parked
/// to act on) and a non-null one yields
/// <see cref="SiteCallRelayOutcome.OperationFailed"/>. <c>ErrorMessage</c> is
/// only inspected when <c>Applied</c> is <c>false</c>.
/// </remarks>
/// <param name="ack">The ack returned by the site.</param>
/// <returns>The classified outcome; never <see cref="SiteCallRelayOutcome.SiteUnreachable"/>.</returns>
private static SiteCallRelayOutcome ClassifyAck(ParkedOperationActionAck ack) =>
    ack.Applied
        ? SiteCallRelayOutcome.Applied
        : ack.ErrorMessage is null
            ? SiteCallRelayOutcome.NotParked
            : SiteCallRelayOutcome.OperationFailed;
/// <summary>
/// Chooses the detail text that accompanies a classified ack.
/// </summary>
/// <param name="outcome">The outcome previously derived for <paramref name="ack"/>.</param>
/// <param name="ack">The ack returned by the site; read only when the outcome is neither applied nor not-parked.</param>
/// <returns>
/// <c>null</c> for <see cref="SiteCallRelayOutcome.Applied"/>; a fixed
/// explanatory sentence for <see cref="SiteCallRelayOutcome.NotParked"/>;
/// the ack's own <c>ErrorMessage</c> for every other outcome.
/// </returns>
private static string? AckErrorMessage(SiteCallRelayOutcome outcome, ParkedOperationActionAck ack)
{
    if (outcome == SiteCallRelayOutcome.Applied)
    {
        return null;
    }

    if (outcome == SiteCallRelayOutcome.NotParked)
    {
        return "The operation is no longer parked at the site (already delivered, discarded, or retrying).";
    }

    // OperationFailed and any other outcome surface the site's own message.
    return ack.ErrorMessage;
}
/// <summary>Shared "site unreachable" detail text for both relay directions.</summary>
private const string SiteUnreachableMessage =
"The owning site is unreachable; the action was not applied. Retry when the site is back online.";
/// <summary>
/// Builds the Retry response used whenever the owning site could not be
/// reached: outcome <see cref="SiteCallRelayOutcome.SiteUnreachable"/>,
/// <c>Success</c> and <c>SiteReachable</c> both <c>false</c>, and the shared
/// <see cref="SiteUnreachableMessage"/> as detail.
/// </summary>
/// <param name="correlationId">Correlation id echoed back on the response.</param>
/// <returns>The site-unreachable Retry response.</returns>
private static RetrySiteCallResponse UnreachableRetry(string correlationId) =>
    new RetrySiteCallResponse(
        correlationId,
        SiteCallRelayOutcome.SiteUnreachable,
        Success: false,
        SiteReachable: false,
        ErrorMessage: SiteUnreachableMessage);
/// <summary>
/// Builds the Discard response used whenever the owning site could not be
/// reached: outcome <see cref="SiteCallRelayOutcome.SiteUnreachable"/>,
/// <c>Success</c> and <c>SiteReachable</c> both <c>false</c>, and the shared
/// <see cref="SiteUnreachableMessage"/> as detail.
/// </summary>
/// <param name="correlationId">Correlation id echoed back on the response.</param>
/// <returns>The site-unreachable Discard response.</returns>
private static DiscardSiteCallResponse UnreachableDiscard(string correlationId) =>
    new DiscardSiteCallResponse(
        correlationId,
        SiteCallRelayOutcome.SiteUnreachable,
        Success: false,
        SiteReachable: false,
        ErrorMessage: SiteUnreachableMessage);
/// <summary>
/// Resolves an <see cref="ISiteCallAuditRepository"/> for one read message.
/// In test mode the injected instance is returned with a null scope; in
@@ -464,3 +657,13 @@ public class SiteCallAuditActor : ReceiveActor
return string.IsNullOrWhiteSpace(value) ? null : value;
}
}
/// <summary>
/// Registers the central→site command transport (the <c>CentralCommunicationActor</c>)
/// with the <see cref="SiteCallAuditActor"/> so it can relay Retry/Discard
/// actions on parked cached calls to their owning sites. Sent by the Host after
/// both actors exist. Lives here (not in Commons) because it carries an
/// <see cref="IActorRef"/> and <c>ScadaLink.Commons</c> has no Akka reference —
/// the same rationale as <c>RegisterAuditIngest</c>.
/// </summary>
/// <param name="CentralCommunication">
/// The transport actor reference. <see cref="SiteCallAuditActor"/> stores it
/// on receipt and later Asks it with <c>SiteEnvelope</c>-wrapped relay commands.
/// </param>
public sealed record RegisterCentralCommunication(IActorRef CentralCommunication);

View File

@@ -23,4 +23,15 @@ public class SiteCallAuditOptions
/// <c>NotificationOutboxOptions.DeliveredKpiWindow</c>.
/// </summary>
public TimeSpan KpiInterval { get; set; } = TimeSpan.FromMinutes(1);
/// <summary>
/// Task 5 (#22): Ask timeout for the central→site Retry/Discard relay. When
/// the owning site does not ack a <c>RetryParkedOperation</c> /
/// <c>DiscardParkedOperation</c> within this window — site offline, no
/// ClusterClient route, or central buffering deliberately absent — the relay
/// reports a <c>SiteUnreachable</c> outcome. Default 10 seconds: long enough
/// to absorb a healthy cross-cluster round-trip, short enough that an
/// operator clicking Retry on an offline site gets a fast, honest answer.
/// </summary>
/// <remarks>
/// <c>SiteCallAuditActor</c> passes this value unchanged as the timeout of
/// the Ask in both its Retry and Discard relay handlers.
/// NOTE(review): no range validation is visible here — confirm that a zero
/// or negative value is rejected elsewhere (e.g. options validation).
/// </remarks>
public TimeSpan RelayTimeout { get; set; } = TimeSpan.FromSeconds(10);
}