refactor: rename ScadaLink → ZB.MOM.WW.ScadaBridge (code + projects + namespaces)
Solution + 23 src projects + 26 test projects renamed; folders, csproj, namespaces, and ScadaLinkDbContext/ScadaBridgeDbContext class updated. ActorSystem "scadalink" → "scadabridge", Akka seed-node URLs migrated. SQL roles/logins, LDAP domains, CLI command name, and CLI config dir (~/.scadalink → ~/.scadabridge) also renamed. Build green; 5 Host.Tests fail awaiting SQL login rename in next commit. Pre-existing StaleTagMonitor timing flakes unchanged. Rename script committed at tools/rename-to-scadabridge.sh.
This commit is contained in:
@@ -0,0 +1,27 @@
|
||||
namespace ZB.MOM.WW.ScadaBridge.StoreAndForward;
|
||||
|
||||
/// <summary>
/// Ambient site context for the Store-and-Forward (S&amp;F) service. It exposes the
/// site identifier that is stamped onto cached-call audit telemetry
/// (Audit Log #23 / M3 Bundle F).
/// </summary>
/// <remarks>
/// <para>
/// The contract lives in the <c>StoreAndForward</c> project rather than next to
/// <c>ISiteIdentityProvider</c> in <c>HealthMonitoring</c> because
/// <c>HealthMonitoring</c> already references <c>StoreAndForward</c>; a reference in
/// the opposite direction would create a project-reference cycle.
/// </para>
/// <para>
/// The Host registers a trivial adapter over the same <c>NodeOptions.SiteId</c> that
/// <c>ISiteIdentityProvider</c> reads. The binding is optional: the DI wiring
/// resolves it with <c>GetService</c> and hands the S&amp;F service an empty site id
/// when nothing is registered.
/// </para>
/// </remarks>
public interface IStoreAndForwardSiteContext
{
    /// <summary>Gets the site id stamped onto cached-call audit telemetry.</summary>
    string SiteId { get; }
}
|
||||
@@ -0,0 +1,203 @@
|
||||
using System.Text.Json;
|
||||
using Akka.Actor;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Logging.Abstractions;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Notification;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.StoreAndForward;
|
||||
|
||||
/// <summary>
/// Notification Outbox: the site Store-and-Forward (S&amp;F) delivery handler for the
/// <see cref="ZB.MOM.WW.ScadaBridge.Commons.Types.Enums.StoreAndForwardCategory.Notification"/>
/// category.
/// </summary>
/// <remarks>
/// <para>
/// In the outbox design the site no longer sends notification email itself.
/// "Delivering" a buffered notification means forwarding it to the central cluster
/// and treating central's <see cref="NotificationSubmitAck"/> as the outcome:
/// </para>
/// <list type="bullet">
/// <item><description>ack <c>Accepted</c> → <see cref="DeliverAsync"/> returns
/// <c>true</c>; the S&amp;F engine removes the message from the buffer.</description></item>
/// <item><description>ack not <c>Accepted</c>, or the Ask times out / fails →
/// <see cref="DeliverAsync"/> throws; the S&amp;F engine treats any thrown
/// exception as transient and retries the forward at the fixed interval.</description></item>
/// <item><description>buffered payload unreadable → the row is logged at Warning and
/// discarded (<see cref="DeliverAsync"/> returns <c>true</c>).</description></item>
/// </list>
/// <para>
/// The forward travels over the ClusterClient command/control transport: the handler
/// <see cref="ActorRefImplicitSenderExtensions.Ask{T}(ICanTell, object, TimeSpan?)">Asks</see>
/// the site communication actor, which wraps the message in a
/// <c>ClusterClient.Send("/user/central-communication", …)</c> and routes central's
/// reply straight back to this Ask.
/// </para>
/// </remarks>
public sealed class NotificationForwarder
{
    /// <summary>Maximum number of payload characters echoed into the discard Warning.</summary>
    private const int CorruptPayloadPreviewMaxLength = 200;

    private readonly IActorRef _siteCommunicationActor;
    private readonly string _sourceSiteId;
    private readonly TimeSpan _forwardTimeout;
    private readonly ILogger<NotificationForwarder> _logger;

    /// <summary>Initializes a new <see cref="NotificationForwarder"/>.</summary>
    /// <param name="siteCommunicationActor">
    /// The site communication actor. It forwards a <see cref="NotificationSubmit"/> to
    /// central via the registered ClusterClient and replies with the
    /// <see cref="NotificationSubmitAck"/>.
    /// </param>
    /// <param name="sourceSiteId">This site's identifier, stamped on every submit.</param>
    /// <param name="forwardTimeout">
    /// How long to wait for central's ack before treating the forward as a transient
    /// failure. Sourced from host configuration.
    /// </param>
    /// <param name="logger">
    /// Optional logger. StoreAndForward-018: a corrupt buffered payload is logged at
    /// Warning before being discarded so an operator has a forensic trail of the row
    /// that vanished from the buffer. A no-op logger is used when omitted.
    /// </param>
    public NotificationForwarder(
        IActorRef siteCommunicationActor,
        string sourceSiteId,
        TimeSpan forwardTimeout,
        ILogger<NotificationForwarder>? logger = null)
    {
        _siteCommunicationActor = siteCommunicationActor;
        _sourceSiteId = sourceSiteId;
        _forwardTimeout = forwardTimeout;
        _logger = logger ?? NullLogger<NotificationForwarder>.Instance;
    }

    /// <summary>
    /// Store-and-Forward delivery handler entry point — matches the
    /// <c>Func&lt;StoreAndForwardMessage, Task&lt;bool&gt;&gt;</c> handler contract.
    /// </summary>
    /// <param name="message">The buffered store-and-forward message to deliver to central.</param>
    /// <returns>
    /// <c>true</c> when central accepts the notification, or when the buffered payload
    /// is corrupt and the row is deliberately discarded.
    /// </returns>
    /// <exception cref="NotificationForwardException">Central replied with a non-accepted ack.</exception>
    /// <remarks>
    /// An Ask timeout/failure propagates as whatever exception the Ask raised; like any
    /// other thrown exception the engine treats it as transient and retries.
    /// </remarks>
    public async Task<bool> DeliverAsync(StoreAndForwardMessage message)
    {
        // StoreAndForward-018: an unreadable payload cannot be fixed by retrying, and
        // the design doc forbids parking notifications ("notifications do not park —
        // they are retried at the fixed forward interval until central acks";
        // Component-StoreAndForward.md). Returning false would make the engine park
        // the row, so the only consistent outcome is to DISCARD: log a Warning with
        // the row id + payload preview for forensics, then return true so the engine
        // clears the row through its normal success-path cleanup.
        if (!TryBuildSubmit(message, out var submit))
        {
            _logger.LogWarning(
                "Discarding corrupt buffered notification {NotificationId} (payload is not deserialisable as NotificationSubmit). " +
                "Payload preview: {PayloadPreview}",
                message.Id,
                PreviewPayload(message.PayloadJson));
            return true;
        }

        // The reply may legitimately be a non-accepted ack, so it is not requested as
        // a status-failing Ask: ask for the bare NotificationSubmitAck and classify it
        // here. An Ask timeout surfaces as an exception, which the engine treats as
        // transient.
        var ack = await _siteCommunicationActor
            .Ask<NotificationSubmitAck>(submit, _forwardTimeout)
            .ConfigureAwait(false);

        if (ack.Accepted)
        {
            return true;
        }

        // A non-accepted ack is a transient failure: central could not persist the
        // notification right now. Throw so the engine keeps buffering and retries.
        throw new NotificationForwardException(
            $"Central rejected notification {submit.NotificationId}: {ack.Error ?? "no detail"}");
    }

    /// <summary>
    /// Maps a buffered S&amp;F notification message onto the <see cref="NotificationSubmit"/>
    /// forwarded to central.
    /// </summary>
    /// <param name="message">The buffered row whose payload is a serialized <see cref="NotificationSubmit"/>.</param>
    /// <param name="submit">The submit to forward; non-null only when the method returns <c>true</c>.</param>
    /// <returns><c>false</c> when the payload is missing, blank, or not deserialisable.</returns>
    /// <remarks>
    /// The payload is forwarded as-is except for the fields the forwarder
    /// authoritatively owns: <see cref="NotificationSubmit.SourceSiteId"/> (this site's
    /// id) and <see cref="NotificationSubmit.SourceInstanceId"/> (the buffered row's
    /// origin instance). A missing <see cref="NotificationSubmit.NotificationId"/> falls
    /// back to <see cref="StoreAndForwardMessage.Id"/>, and a null/blank list name falls
    /// back to <see cref="StoreAndForwardMessage.Target"/>.
    /// </remarks>
    private bool TryBuildSubmit(
        StoreAndForwardMessage message,
        [System.Diagnostics.CodeAnalysis.NotNullWhen(true)] out NotificationSubmit? submit)
    {
        submit = null;

        // A null or blank payload can never deserialise. Short-circuit it so a null
        // payload is classified as corrupt instead of escaping Deserialize as an
        // ArgumentNullException, which the engine would treat as transient and retry.
        if (string.IsNullOrWhiteSpace(message.PayloadJson))
        {
            return false;
        }

        NotificationSubmit? payload;
        try
        {
            payload = JsonSerializer.Deserialize<NotificationSubmit>(message.PayloadJson);
        }
        catch (JsonException)
        {
            return false;
        }

        // The JSON literal "null" deserialises to a null reference.
        if (payload is null)
        {
            return false;
        }

        submit = payload with
        {
            // The NotificationId is the script-generated idempotency key carried in the
            // payload. Defend against a payload missing it by falling back to the
            // buffered row id, which the enqueue path pins to the same value.
            NotificationId = string.IsNullOrEmpty(payload.NotificationId)
                ? message.Id
                : payload.NotificationId,
            // A null, empty OR whitespace-only ListName falls back to the S&F Target,
            // so a blank list name is never forwarded to central.
            ListName = string.IsNullOrWhiteSpace(payload.ListName) ? message.Target : payload.ListName,
            // SourceSiteId/SourceInstanceId are authoritatively owned by the site: the
            // forwarder knows the real site id, and the buffered row records the origin
            // instance even after the instance is deleted.
            SourceSiteId = _sourceSiteId,
            SourceInstanceId = message.OriginInstanceName,
        };
        return true;
    }

    /// <summary>
    /// Returns a length-capped preview of a corrupt buffered payload for the Warning
    /// log line emitted on discard. The full payload may be large and is not suitable
    /// for the structured log; the preview keeps the leading characters.
    /// </summary>
    /// <param name="payloadJson">The raw buffered payload; may be null or empty.</param>
    /// <returns>
    /// <c>&lt;empty&gt;</c> for a null/empty payload, the payload itself when it fits the
    /// cap, otherwise the leading characters followed by an ellipsis.
    /// </returns>
    private static string PreviewPayload(string? payloadJson)
    {
        if (string.IsNullOrEmpty(payloadJson))
        {
            return "<empty>";
        }

        if (payloadJson.Length <= CorruptPayloadPreviewMaxLength)
        {
            return payloadJson;
        }

        // Do not cut a surrogate pair in half: a lone high surrogate is invalid UTF-16
        // and renders as a replacement character in log sinks.
        var cut = CorruptPayloadPreviewMaxLength;
        if (char.IsHighSurrogate(payloadJson[cut - 1]))
        {
            cut--;
        }

        return payloadJson[..cut] + "…";
    }
}
|
||||
|
||||
/// <summary>
/// Raised by <see cref="NotificationForwarder"/> on a transient forward failure —
/// a non-accepted central ack. The Store-and-Forward engine treats any thrown
/// exception as transient and retries the forward at the fixed interval.
/// </summary>
public sealed class NotificationForwardException : Exception
{
    /// <summary>
    /// Initializes a new exception with the default message.
    /// </summary>
    public NotificationForwardException()
    {
    }

    /// <summary>
    /// Initializes a new exception with the specified message.
    /// </summary>
    /// <param name="message">Message describing the forward failure.</param>
    public NotificationForwardException(string message)
        : base(message)
    {
    }

    /// <summary>
    /// Initializes a new exception with the specified message and the underlying
    /// cause, so a wrapped failure keeps its original stack trace.
    /// </summary>
    /// <param name="message">Message describing the forward failure.</param>
    /// <param name="innerException">The exception that caused the forward failure.</param>
    public NotificationForwardException(string message, Exception innerException)
        : base(message, innerException)
    {
    }
}
|
||||
@@ -0,0 +1,165 @@
|
||||
using System.Text.Json;
|
||||
using Akka.Actor;
|
||||
using Akka.Event;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Messages.RemoteQuery;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.StoreAndForward;
|
||||
|
||||
/// <summary>
/// Akka actor bridge for <see cref="StoreAndForwardService"/> parked-message operations.
/// Receives Query/Retry/Discard requests from the SiteCommunicationActor and replies
/// with the matching response records.
/// </summary>
/// <remarks>
/// Every handler captures <c>Sender</c> into a local before starting the asynchronous
/// service call and replies via <c>PipeTo</c>; the success/failure projections only
/// touch locals and the immutable request, never actor state.
/// </remarks>
public class ParkedMessageHandlerActor : ReceiveActor
{
    private readonly ILoggingAdapter _log = Context.GetLogger();
    private readonly StoreAndForwardService _service;
    private readonly string _siteId;

    /// <summary>
    /// Initializes the actor and registers message handlers for query, retry, and discard operations.
    /// </summary>
    /// <param name="service">The store-and-forward service used to execute parked-message operations.</param>
    /// <param name="siteId">The site identifier this actor manages parked messages for.</param>
    public ParkedMessageHandlerActor(StoreAndForwardService service, string siteId)
    {
        _service = service;
        _siteId = siteId;

        Receive<ParkedMessageQueryRequest>(HandleQuery);
        Receive<ParkedMessageRetryRequest>(HandleRetry);
        Receive<ParkedMessageDiscardRequest>(HandleDiscard);

        // Task 5 (#22): central→site Retry/Discard relay for parked cached
        // operations. The cached call's S&F buffer message id is the
        // TrackedOperationId, so these reuse the same parked-message primitive
        // as HandleRetry/HandleDiscard, keyed off the tracked id.
        Receive<RetryParkedOperation>(HandleRetryParkedOperation);
        Receive<DiscardParkedOperation>(HandleDiscardParkedOperation);
    }

    /// <summary>
    /// Replies with one page of parked messages, or with a failed response carrying
    /// the base exception message when the service call faults.
    /// </summary>
    private void HandleQuery(ParkedMessageQueryRequest msg)
    {
        var sender = Sender;
        var siteId = _siteId;

        // StoreAndForward-007: idiomatic PipeTo with explicit success/failure
        // projections instead of ContinueWith. Both projections touch only locals
        // (captured before the await), so they are safe to run off the actor thread.
        _service.GetParkedMessagesAsync(category: null, msg.PageNumber, msg.PageSize)
            .PipeTo(
                sender,
                success: result =>
                {
                    var entries = result.Messages
                        .Select(m => ToEntry(m))
                        .ToList();

                    return new ParkedMessageQueryResponse(
                        msg.CorrelationId, siteId, entries, result.TotalCount,
                        msg.PageNumber, msg.PageSize, true, null, DateTimeOffset.UtcNow);
                },
                failure: ex => new ParkedMessageQueryResponse(
                    msg.CorrelationId, siteId, [], 0, msg.PageNumber, msg.PageSize,
                    false, ex.GetBaseException().Message, DateTimeOffset.UtcNow));
    }

    /// <summary>
    /// Moves a parked message back to the retry queue and replies with the outcome.
    /// </summary>
    private void HandleRetry(ParkedMessageRetryRequest msg)
    {
        var sender = Sender;

        _service.RetryParkedMessageAsync(msg.MessageId)
            .PipeTo(
                sender,
                success: retried => new ParkedMessageRetryResponse(
                    msg.CorrelationId, retried,
                    retried ? null : "Message not found or no longer parked."),
                failure: ex => new ParkedMessageRetryResponse(
                    msg.CorrelationId, false, ex.GetBaseException().Message));
    }

    /// <summary>
    /// Discards a parked message and replies with the outcome.
    /// </summary>
    private void HandleDiscard(ParkedMessageDiscardRequest msg)
    {
        var sender = Sender;

        _service.DiscardParkedMessageAsync(msg.MessageId)
            .PipeTo(
                sender,
                success: discarded => new ParkedMessageDiscardResponse(
                    msg.CorrelationId, discarded,
                    discarded ? null : "Message not found or no longer parked."),
                failure: ex => new ParkedMessageDiscardResponse(
                    msg.CorrelationId, false, ex.GetBaseException().Message));
    }

    /// <summary>
    /// Task 5 (#22): executes a central-relayed Retry of a parked cached call.
    /// The tracked id is the S&amp;F buffer message id, so this reuses
    /// <see cref="StoreAndForwardService.RetryParkedMessageAsync"/> — which only
    /// touches rows that are actually <c>Parked</c> (a non-parked or unknown
    /// operation yields <c>false</c>, a safe no-op). Central never mutates the
    /// central <c>SiteCalls</c> mirror; the reset row's corrected state flows
    /// back via the normal cached-call telemetry path.
    /// </summary>
    private void HandleRetryParkedOperation(RetryParkedOperation msg)
    {
        var sender = Sender;

        _service.RetryParkedMessageAsync(msg.TrackedOperationId.ToString())
            .PipeTo(
                sender,
                success: applied => new ParkedOperationActionAck(
                    msg.CorrelationId, applied, ErrorMessage: null),
                failure: ex => new ParkedOperationActionAck(
                    msg.CorrelationId, Applied: false, ex.GetBaseException().Message));
    }

    /// <summary>
    /// Task 5 (#22): executes a central-relayed Discard of a parked cached call.
    /// Mirrors <see cref="HandleRetryParkedOperation"/>; Discard removes the
    /// parked S&amp;F buffer row (only when it is actually <c>Parked</c>).
    /// </summary>
    private void HandleDiscardParkedOperation(DiscardParkedOperation msg)
    {
        var sender = Sender;

        _service.DiscardParkedMessageAsync(msg.TrackedOperationId.ToString())
            .PipeTo(
                sender,
                success: applied => new ParkedOperationActionAck(
                    msg.CorrelationId, applied, ErrorMessage: null),
                failure: ex => new ParkedOperationActionAck(
                    msg.CorrelationId, Applied: false, ex.GetBaseException().Message));
    }

    /// <summary>
    /// Projects a buffered message onto the wire entry returned to central. Static and
    /// free of actor state, so it is safe to call from the PipeTo success projection.
    /// </summary>
    private static ParkedMessageEntry ToEntry(StoreAndForwardMessage m) =>
        new ParkedMessageEntry(
            MessageId: m.Id,
            TargetSystem: m.Target,
            MethodName: ExtractMethodName(m.PayloadJson, m.Category),
            ErrorMessage: m.LastError ?? string.Empty,
            AttemptCount: m.RetryCount,
            OriginalTimestamp: m.CreatedAt,
            // A message that was never attempted reports its creation time instead.
            LastAttemptTimestamp: m.LastAttemptAt ?? m.CreatedAt,
            MaxAttempts: m.MaxRetries,
            Category: m.Category,
            OriginInstance: m.OriginInstanceName);

    /// <summary>
    /// Derives a display label for a parked message from its JSON payload: the string
    /// <c>MethodName</c> property if present, otherwise the string <c>Subject</c>
    /// property, otherwise the category name.
    /// </summary>
    /// <param name="payloadJson">The buffered payload; may be empty or malformed.</param>
    /// <param name="category">The message category, used as the fallback label.</param>
    /// <returns>The extracted label; never throws for a bad payload.</returns>
    private static string ExtractMethodName(string payloadJson, Commons.Types.Enums.StoreAndForwardCategory category)
    {
        if (string.IsNullOrEmpty(payloadJson))
        {
            return category.ToString();
        }

        try
        {
            using var doc = JsonDocument.Parse(payloadJson);
            var root = doc.RootElement;

            // TryGetProperty throws InvalidOperationException on a non-object root
            // (array, string, number, ...). That exception would escape the PipeTo
            // success projection and fail the whole query page because of a single
            // oddly-shaped payload, so non-object roots use the fallback label.
            if (root.ValueKind != JsonValueKind.Object)
            {
                return category.ToString();
            }

            if (root.TryGetProperty("MethodName", out var method) && method.ValueKind == JsonValueKind.String)
            {
                return method.GetString() ?? category.ToString();
            }

            if (root.TryGetProperty("Subject", out var subject) && subject.ValueKind == JsonValueKind.String)
            {
                return subject.GetString() ?? category.ToString();
            }
        }
        catch (JsonException)
        {
            // Deliberate best-effort: a malformed payload just gets the fallback label.
        }

        return category.ToString();
    }
}
|
||||
@@ -0,0 +1,176 @@
|
||||
using Microsoft.Extensions.Logging;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.StoreAndForward;
|
||||
|
||||
/// <summary>
/// WP-11: Async replication of buffer operations to standby node.
/// </summary>
/// <remarks>
/// <list type="bullet">
/// <item><description>Forwards add/remove/park/requeue operations to standby via a replication handler.</description></item>
/// <item><description>No ack wait (fire-and-forget per design); handler failures are logged and dropped.</description></item>
/// <item><description>Standby applies operations to its own SQLite.</description></item>
/// <item><description>On failover, standby resumes delivery from its replicated state.</description></item>
/// </list>
/// </remarks>
public class ReplicationService
{
    private readonly StoreAndForwardOptions _options;
    private readonly ILogger<ReplicationService> _logger;
    private Func<ReplicationOperation, Task>? _replicationHandler;

    /// <summary>Initializes a new instance of <see cref="ReplicationService"/>.</summary>
    /// <param name="options">Store-and-forward configuration options.</param>
    /// <param name="logger">Logger instance.</param>
    public ReplicationService(
        StoreAndForwardOptions options,
        ILogger<ReplicationService> logger)
    {
        _options = options;
        _logger = logger;
    }

    /// <summary>
    /// Sets the handler for forwarding replication operations to the standby node.
    /// Typically wraps Akka Tell to the standby's replication actor. Until a handler
    /// is set, every <c>Replicate*</c> call is a no-op.
    /// </summary>
    /// <param name="handler">The async delegate that forwards each replication operation to the standby.</param>
    public void SetReplicationHandler(Func<ReplicationOperation, Task> handler)
    {
        _replicationHandler = handler;
    }

    /// <summary>
    /// WP-11: Replicates an enqueue operation to standby (fire-and-forget).
    /// </summary>
    /// <param name="message">The message that was enqueued on the active node.</param>
    public void ReplicateEnqueue(StoreAndForwardMessage message)
    {
        if (!TryGetActiveHandler(out var handler))
        {
            return;
        }

        FireAndForget(handler, new ReplicationOperation(
            ReplicationOperationType.Add,
            message.Id,
            message));
    }

    /// <summary>
    /// WP-11: Replicates a remove operation to standby (fire-and-forget).
    /// </summary>
    /// <param name="messageId">The identifier of the message to remove from the standby buffer.</param>
    public void ReplicateRemove(string messageId)
    {
        if (!TryGetActiveHandler(out var handler))
        {
            return;
        }

        FireAndForget(handler, new ReplicationOperation(
            ReplicationOperationType.Remove,
            messageId,
            null));
    }

    /// <summary>
    /// WP-11: Replicates a park operation to standby (fire-and-forget).
    /// </summary>
    /// <param name="message">The message that was parked on the active node.</param>
    public void ReplicatePark(StoreAndForwardMessage message)
    {
        if (!TryGetActiveHandler(out var handler))
        {
            return;
        }

        FireAndForget(handler, new ReplicationOperation(
            ReplicationOperationType.Park,
            message.Id,
            message));
    }

    /// <summary>
    /// WP-11 / StoreAndForward-016: Replicates an operator-initiated requeue (a parked
    /// message moved back to the pending queue) to standby (fire-and-forget). The
    /// carried message reflects the active node's post-requeue state (Pending,
    /// retry_count = 0) so the standby's copy can be brought into sync.
    /// </summary>
    /// <param name="message">The message in its post-requeue (Pending, retry_count=0) state.</param>
    public void ReplicateRequeue(StoreAndForwardMessage message)
    {
        if (!TryGetActiveHandler(out var handler))
        {
            return;
        }

        FireAndForget(handler, new ReplicationOperation(
            ReplicationOperationType.Requeue,
            message.Id,
            message));
    }

    /// <summary>
    /// WP-11: Applies a replicated operation received from the active node.
    /// Used by the standby node to keep its SQLite in sync.
    /// </summary>
    /// <param name="operation">The replication operation to apply.</param>
    /// <param name="storage">The standby node's store-and-forward storage to update.</param>
    /// <returns>A task that completes once the storage call has finished.</returns>
    /// <remarks>
    /// Park and Requeue overwrite the status (and, for Requeue, the retry count) on the
    /// carried message instance before persisting it. An operation that cannot be
    /// applied — an Add/Park/Requeue without a message, or an unrecognised operation
    /// type — is logged at Warning and skipped.
    /// </remarks>
    public async Task ApplyReplicatedOperationAsync(
        ReplicationOperation operation,
        StoreAndForwardStorage storage)
    {
        switch (operation.OperationType)
        {
            case ReplicationOperationType.Add when operation.Message != null:
                await storage.EnqueueAsync(operation.Message);
                break;

            case ReplicationOperationType.Remove:
                await storage.RemoveMessageAsync(operation.MessageId);
                break;

            case ReplicationOperationType.Park when operation.Message != null:
                operation.Message.Status = StoreAndForwardMessageStatus.Parked;
                await storage.UpdateMessageAsync(operation.Message);
                break;

            case ReplicationOperationType.Requeue when operation.Message != null:
                // StoreAndForward-016: an operator retried a parked message on the
                // active node; mirror that on the standby by moving its row back to
                // Pending with retry_count = 0 so a failover preserves the retry.
                operation.Message.Status = StoreAndForwardMessageStatus.Pending;
                operation.Message.RetryCount = 0;
                await storage.UpdateMessageAsync(operation.Message);
                break;

            default:
                // Reached when a message-carrying operation arrives without its message
                // or the type is unknown. Surface it instead of letting the standby
                // drift silently.
                _logger.LogWarning(
                    "Ignoring replicated {OpType} for message {MessageId}: operation carries no message or has an unknown type",
                    operation.OperationType, operation.MessageId);
                break;
        }
    }

    /// <summary>
    /// Reads the replication handler once and reports whether replication should run.
    /// </summary>
    /// <param name="handler">The handler to invoke; non-null only when the method returns <c>true</c>.</param>
    /// <returns><c>true</c> when replication is enabled and a handler has been set.</returns>
    private bool TryGetActiveHandler(
        [System.Diagnostics.CodeAnalysis.NotNullWhen(true)] out Func<ReplicationOperation, Task>? handler)
    {
        handler = _options.ReplicationEnabled ? _replicationHandler : null;
        return handler is not null;
    }

    /// <summary>
    /// Invokes the handler on the thread pool without awaiting it. The handler
    /// reference is passed in so the background work uses exactly the delegate that
    /// was null-checked by the caller rather than re-reading the field later.
    /// </summary>
    private void FireAndForget(Func<ReplicationOperation, Task> handler, ReplicationOperation operation)
    {
        _ = Task.Run(async () =>
        {
            try
            {
                await handler(operation);
            }
            catch (Exception ex)
            {
                // WP-11: No ack wait — log and move on
                _logger.LogDebug(ex,
                    "Replication of {OpType} for message {MessageId} failed (best-effort)",
                    operation.OperationType, operation.MessageId);
            }
        });
    }
}
|
||||
|
||||
/// <summary>
/// WP-11: A single buffer operation replicated from the active node to the standby.
/// </summary>
/// <param name="OperationType">The kind of buffer change being replicated.</param>
/// <param name="MessageId">Identifier of the buffered message the operation targets.</param>
/// <param name="Message">
/// The message carried with the operation. <c>ReplicationService.ReplicateRemove</c>
/// sends <c>null</c> here because a removal only needs the id.
/// </param>
public record ReplicationOperation(
    ReplicationOperationType OperationType,
    string MessageId,
    StoreAndForwardMessage? Message);
|
||||
|
||||
/// <summary>
/// WP-11: The kinds of buffer operations that are replicated to the standby node.
/// </summary>
public enum ReplicationOperationType
{
    /// <summary>A message was enqueued on the active node.</summary>
    Add = 0,

    /// <summary>A message was removed from the active node's buffer.</summary>
    Remove = 1,

    /// <summary>A message was parked on the active node.</summary>
    Park = 2,

    /// <summary>
    /// StoreAndForward-016: an operator moved a parked message back to the pending
    /// queue. The standby resets its matching row to Pending with retry_count = 0.
    /// </summary>
    Requeue = 3,
}
|
||||
@@ -0,0 +1,79 @@
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Services;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.StoreAndForward;
|
||||
|
||||
/// <summary>
/// Dependency-injection registration helpers for the Store-and-Forward component.
/// </summary>
public static class ServiceCollectionExtensions
{
    /// <summary>
    /// Registers Store-and-Forward services including storage, the delivery service, and the replication service.
    /// All three are registered as singletons.
    /// </summary>
    /// <param name="services">The service collection to register into.</param>
    /// <returns>The same <paramref name="services"/> instance, for chaining.</returns>
    public static IServiceCollection AddStoreAndForward(this IServiceCollection services)
    {
        services.AddSingleton<StoreAndForwardStorage>(sp => CreateStorage(sp));
        services.AddSingleton<StoreAndForwardService>(sp => CreateService(sp));
        services.AddSingleton<ReplicationService>(sp => CreateReplicationService(sp));

        return services;
    }

    /// <summary>
    /// Registers Store-and-Forward Akka actor bindings. Actor creation is handled by the Host during actor system startup.
    /// </summary>
    /// <param name="services">The service collection to register into.</param>
    /// <returns>The same <paramref name="services"/> instance, for chaining.</returns>
    public static IServiceCollection AddStoreAndForwardActors(this IServiceCollection services)
    {
        // Akka actor registration handled by Host component during actor system startup
        return services;
    }

    /// <summary>Builds the SQLite-backed storage from the configured database path.</summary>
    private static StoreAndForwardStorage CreateStorage(IServiceProvider sp)
    {
        var options = sp.GetRequiredService<IOptions<StoreAndForwardOptions>>().Value;
        var logger = sp.GetRequiredService<ILogger<StoreAndForwardStorage>>();
        var connectionString = $"Data Source={options.SqliteDbPath}";

        return new StoreAndForwardStorage(connectionString, logger);
    }

    /// <summary>
    /// Builds the delivery service, wiring in the optional audit observer and site identity.
    /// </summary>
    private static StoreAndForwardService CreateService(IServiceProvider sp)
    {
        var storage = sp.GetRequiredService<StoreAndForwardStorage>();
        var options = sp.GetRequiredService<IOptions<StoreAndForwardOptions>>().Value;
        var logger = sp.GetRequiredService<ILogger<StoreAndForwardService>>();
        var replication = sp.GetRequiredService<ReplicationService>();

        // Audit Log #23 (M3 Bundle F): the cached-call lifecycle observer and the
        // site identity flow through DI so the S&F retry loop emits per-attempt and
        // terminal telemetry under the same TrackedOperationId the script-thread
        // CachedSubmit row used. Both bindings are optional (GetService, not
        // GetRequiredService) — when absent the legacy pre-M3 retry behaviour is
        // preserved (tests, central nodes without sites, hosts that haven't called
        // AddAuditLog).
        //
        // Site identity comes from the optional IStoreAndForwardSiteContext binding
        // (registered by the Host) rather than HealthMonitoring's
        // ISiteIdentityProvider, because HealthMonitoring already references S&F and
        // the reverse reference would be a project-reference cycle.
        var cachedCallObserver = sp.GetService<ICachedCallLifecycleObserver>();
        var siteContext = sp.GetService<IStoreAndForwardSiteContext>();

        // StoreAndForward-023: a missing binding or null id is handed over as an
        // empty string — the service constructor normalises it to
        // UnknownSiteSentinel so a host without an IStoreAndForwardSiteContext
        // registration is observable in the central audit log instead of producing
        // a silent empty-string SourceSite.
        var siteId = siteContext?.SiteId ?? string.Empty;

        return new StoreAndForwardService(
            storage,
            options,
            logger,
            replication,
            cachedCallObserver,
            siteId);
    }

    /// <summary>Builds the standby replication service from the shared options.</summary>
    private static ReplicationService CreateReplicationService(IServiceProvider sp)
    {
        var options = sp.GetRequiredService<IOptions<StoreAndForwardOptions>>().Value;
        var logger = sp.GetRequiredService<ILogger<ReplicationService>>();

        return new ReplicationService(options, logger);
    }
}
|
||||
@@ -0,0 +1,94 @@
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.StoreAndForward;
|
||||
|
||||
/// <summary>
|
||||
/// WP-9: Represents a single store-and-forward message as stored in SQLite.
|
||||
/// Maps to the sf_messages table.
|
||||
/// </summary>
|
||||
public class StoreAndForwardMessage
|
||||
{
|
||||
/// <summary>Unique message ID (GUID).</summary>
|
||||
public string Id { get; set; } = string.Empty;
|
||||
|
||||
/// <summary>WP-9: Category: ExternalSystem, Notification, or CachedDbWrite.</summary>
|
||||
public StoreAndForwardCategory Category { get; set; }
|
||||
|
||||
/// <summary>Target system name (external system, notification list, or DB connection).</summary>
|
||||
public string Target { get; set; } = string.Empty;
|
||||
|
||||
/// <summary>JSON-serialized payload containing the call details.</summary>
|
||||
public string PayloadJson { get; set; } = string.Empty;
|
||||
|
||||
/// <summary>
|
||||
/// Number of retry-sweep attempts performed so far. The initial (immediate or
|
||||
/// caller-made) delivery attempt is attempt 0 and is not counted here; this
|
||||
/// field counts only background retry attempts (StoreAndForward-003).
|
||||
/// </summary>
|
||||
public int RetryCount { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// Maximum retry-sweep attempts before the message is parked.
|
||||
/// <c>0</c> = no limit — the message is retried on every sweep until delivered
|
||||
/// and is never parked for exhausting retries. This is <b>not</b> a "never retry"
|
||||
/// value; a positive value is required to bound delivery attempts.
|
||||
/// </summary>
|
||||
public int MaxRetries { get; set; }
|
||||
|
||||
/// <summary>Retry interval in milliseconds.</summary>
|
||||
public long RetryIntervalMs { get; set; }
|
||||
|
||||
/// <summary>When this message was first enqueued.</summary>
|
||||
public DateTimeOffset CreatedAt { get; set; }
|
||||
|
||||
/// <summary>When delivery was last attempted (null if never attempted).</summary>
|
||||
public DateTimeOffset? LastAttemptAt { get; set; }
|
||||
|
||||
/// <summary>Current status of the message.</summary>
|
||||
public StoreAndForwardMessageStatus Status { get; set; }
|
||||
|
||||
/// <summary>Last error message from a failed delivery attempt.</summary>
|
||||
public string? LastError { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// Instance that originated this message (for S&F-survives-delete behavior).
|
||||
/// WP-13: Messages are NOT cleared when instance is deleted.
|
||||
/// </summary>
|
||||
public string? OriginInstanceName { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// Audit Log #23 (ExecutionId Task 4): the originating script execution's
|
||||
/// per-run correlation id, threaded from <c>ScriptRuntimeContext</c> through
|
||||
/// the cached-call enqueue path. Carried so the store-and-forward retry loop
|
||||
/// can stamp it onto the per-attempt / terminal cached-call audit rows
|
||||
/// (<c>ApiCallCached</c>/<c>DbWriteCached</c> Attempted, <c>CachedResolve</c>).
|
||||
/// <c>null</c> for non-cached-call categories (notifications) and for rows
|
||||
/// buffered before this field existed — back-compat with old persisted rows
|
||||
/// (the column is added by an additive migration and read as null when absent).
|
||||
/// </summary>
|
||||
public Guid? ExecutionId { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// Audit Log #23 (ExecutionId Task 4): the originating script identifier,
|
||||
/// threaded alongside <see cref="ExecutionId"/> from the cached-call enqueue
|
||||
/// path so the retry-loop audit rows carry the same <c>SourceScript</c>
|
||||
/// provenance the script-side cached rows already carry. <c>null</c> when not
|
||||
/// known (non-cached categories, pre-migration rows).
|
||||
/// </summary>
|
||||
public string? SourceScript { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// Audit Log #23 (ParentExecutionId Task 6): the <c>ExecutionId</c> of the
|
||||
/// inbound-API request that spawned the originating script execution,
|
||||
/// threaded alongside <see cref="ExecutionId"/> from the cached-call enqueue
|
||||
/// path. Carried so the store-and-forward retry loop can stamp it onto the
|
||||
/// per-attempt / terminal cached-call audit rows
|
||||
/// (<c>ApiCallCached</c>/<c>DbWriteCached</c> Attempted, <c>CachedResolve</c>),
|
||||
/// keeping them correlated with the cross-execution chain. <c>null</c> for a
|
||||
/// non-routed run, for non-cached-call categories (notifications), and for
|
||||
/// rows buffered before this field existed — back-compat with old persisted
|
||||
/// rows (the column is added by an additive migration and read as null when
|
||||
/// absent).
|
||||
/// </summary>
|
||||
public Guid? ParentExecutionId { get; set; }
|
||||
}
|
||||
@@ -0,0 +1,39 @@
|
||||
namespace ZB.MOM.WW.ScadaBridge.StoreAndForward;
|
||||
|
||||
/// <summary>
/// WP-9/10: Tunable settings for the Store-and-Forward engine — where the
/// buffer is persisted, whether it is replicated, and how retries are paced.
/// </summary>
public class StoreAndForwardOptions
{
    /// <summary>File path of the SQLite database that persists buffered S&amp;F messages.</summary>
    public string SqliteDbPath { get; set; } = "./data/store-and-forward.db";

    /// <summary>WP-11: When <c>true</c>, buffer operations are replicated to the standby node.</summary>
    public bool ReplicationEnabled { get; set; } = true;

    /// <summary>WP-10: Retry interval used for messages that carry no per-source setting.</summary>
    public TimeSpan DefaultRetryInterval { get; set; } = TimeSpan.FromSeconds(30);

    /// <summary>
    /// WP-10: Retry cap applied before a message is parked whenever an
    /// <c>EnqueueAsync</c> caller omits an explicit <c>maxRetries</c>.
    /// <para>
    /// <b>StoreAndForward-019:</b> the cap is applied identically to every
    /// category, <see cref="Commons.Types.Enums.StoreAndForwardCategory.Notification"/>
    /// included: when a buffered message's retry count reaches it, the engine
    /// parks the row. The "notifications do not park" wording in
    /// Component-StoreAndForward.md describes the operational <i>intent</i>
    /// while central is reachable on its normal cadence; during a sustained
    /// central outage longer than <c>DefaultMaxRetries × forward-interval</c>
    /// a buffered notification <i>will</i> park and appear in the
    /// parked-message UI, consistent with the bounded-retry-then-park
    /// behaviour of the rest of the system. A caller that truly needs
    /// unbounded retry must pass <c>maxRetries: 0</c> to <c>EnqueueAsync</c>
    /// (the documented "no limit" escape hatch — see
    /// <c>StoreAndForwardService.EnqueueAsync</c>).
    /// </para>
    /// </summary>
    public int DefaultMaxRetries { get; set; } = 50;

    /// <summary>WP-10: Period of the background retry timer sweep.</summary>
    public TimeSpan RetryTimerInterval { get; set; } = TimeSpan.FromSeconds(10);
}
|
||||
@@ -0,0 +1,855 @@
|
||||
using Microsoft.Extensions.Logging;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Services;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Types;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.StoreAndForward;
|
||||
|
||||
/// <summary>
|
||||
/// WP-9/10: Core store-and-forward service.
|
||||
///
|
||||
/// Lifecycle:
|
||||
/// 1. Caller attempts immediate delivery via IDeliveryHandler
|
||||
/// 2. On transient failure → buffer in SQLite → retry loop
|
||||
/// 3. On success → remove from buffer
|
||||
/// 4. On reaching MaxRetries → park (a MaxRetries of 0 means "no limit" — the
|
||||
/// message is retried until delivered and is never parked for retry exhaustion)
|
||||
/// 5. Permanent failures are returned to caller immediately (never buffered)
|
||||
///
|
||||
/// WP-10: Fixed retry interval (not exponential). Per-source-entity retry settings.
|
||||
/// Background timer-based retry sweep.
|
||||
///
|
||||
/// WP-12: Parked messages queryable, retryable, and discardable.
|
||||
///
|
||||
/// WP-14: Buffer depth reported as health metric. Activity logged to site event log.
|
||||
///
|
||||
/// WP-15: CachedCall idempotency is the caller's responsibility.
|
||||
/// This service does not deduplicate — if the same message is enqueued twice,
|
||||
/// it will be delivered twice. Callers using ExternalSystem.CachedCall() must
|
||||
/// design their payloads to be idempotent (e.g., include unique request IDs
|
||||
/// and handle duplicate detection on the remote end).
|
||||
/// </summary>
|
||||
public class StoreAndForwardService
|
||||
{
|
||||
private readonly StoreAndForwardStorage _storage;
|
||||
private readonly StoreAndForwardOptions _options;
|
||||
private readonly ReplicationService? _replication;
|
||||
private readonly ILogger<StoreAndForwardService> _logger;
|
||||
/// <summary>
|
||||
/// Audit Log #23 (M3 Bundle E — Task E4): site-side observer notified
|
||||
/// after every cached-call delivery attempt. Optional — when null no
|
||||
/// telemetry is emitted; the legacy pre-M3 retry loop behaviour is
|
||||
/// preserved exactly.
|
||||
/// </summary>
|
||||
private readonly ICachedCallLifecycleObserver? _cachedCallObserver;
|
||||
/// <summary>
|
||||
/// Audit Log #23 (M3 Bundle E — Task E4): site id stamped onto the
|
||||
/// cached-call attempt context so the audit bridge can build the
|
||||
/// <see cref="SiteCallOperational"/> half of the telemetry packet.
|
||||
/// <para>
|
||||
/// <b>StoreAndForward-023:</b> an empty-string site id must never reach
|
||||
/// downstream consumers — the central audit pipeline keys
|
||||
/// <c>(SourceSite, TrackedOperationId)</c> off this value, so an empty
|
||||
/// string degrades correlation to a per-id-only index and breaks the
|
||||
/// per-site routing of <c>RetryParkedOperation</c>/<c>DiscardParkedOperation</c>
|
||||
/// commands. The constructor normalises a null/empty/whitespace
|
||||
/// <paramref name="siteId"/> argument to <see cref="UnknownSiteSentinel"/>
|
||||
/// so a misconfigured host (no <c>IStoreAndForwardSiteContext</c>
|
||||
/// registered) produces a distinctive marker in the central audit log
|
||||
/// rather than silently merging multiple sites into the empty bucket.
|
||||
/// </para>
|
||||
/// </summary>
|
||||
private readonly string _siteId;
|
||||
|
||||
/// <summary>
|
||||
/// StoreAndForward-023: distinctive marker stamped onto cached-call audit
|
||||
/// telemetry when the host has not registered an
|
||||
/// <see cref="IStoreAndForwardSiteContext"/>. Chosen with a leading <c>$</c>
|
||||
/// so it cannot collide with a real site id (which is a configuration
|
||||
/// identifier and never starts with <c>$</c>). Surfacing this in the
|
||||
/// central audit log makes a missing site-context binding immediately
|
||||
/// recognisable instead of an unattributable empty string.
|
||||
/// </summary>
|
||||
public const string UnknownSiteSentinel = "$unknown-site";
|
||||
private Timer? _retryTimer;
|
||||
private int _retryInProgress;
|
||||
|
||||
/// <summary>
|
||||
/// StoreAndForward-024: the in-flight retry sweep <see cref="Task"/>, or
|
||||
/// <c>null</c> when no sweep is currently running. Captured when the timer
|
||||
/// callback starts a sweep so <see cref="StopAsync"/> can wait for it to
|
||||
/// finish before the host disposes downstream dependencies
|
||||
/// (<see cref="_storage"/>, <see cref="_replication"/>) that the sweep is
|
||||
/// still touching. Written from the timer thread and from
|
||||
/// <see cref="StopAsync"/>, so reads are synchronised via the
|
||||
/// <see cref="Volatile"/> APIs.
|
||||
/// </summary>
|
||||
private Task? _sweepTask;
|
||||
|
||||
/// <summary>
|
||||
/// StoreAndForward-024: how long <see cref="StopAsync"/> waits for an
|
||||
/// in-flight retry sweep to finish before returning. The default — 10 s —
|
||||
/// is generous enough to let a typical sweep over the buffered queue drain,
|
||||
/// but bounded so a hung downstream call (a stuck SQLite write, a
|
||||
/// long-running delivery handler) cannot block host shutdown indefinitely.
|
||||
/// On timeout the wait is abandoned and the timer is still disposed; the
|
||||
/// sweep keeps running but will throw on the next call into a disposed
|
||||
/// dependency — preferred to blocking shutdown forever.
|
||||
/// </summary>
|
||||
private static readonly TimeSpan SweepShutdownWaitTimeout = TimeSpan.FromSeconds(10);
|
||||
|
||||
/// <summary>
|
||||
/// WP-10: Delivery handler delegate. The return value / exception is interpreted
|
||||
/// the same way on both the immediate-delivery path (<see cref="EnqueueAsync"/>)
|
||||
/// and the background retry path (<c>RetryMessageAsync</c>):
|
||||
/// <list type="bullet">
|
||||
/// <item><description><c>true</c> — delivered successfully. The message is
|
||||
/// removed from the buffer (or, on the immediate path, never buffered).</description></item>
|
||||
/// <item><description><c>false</c> — permanent failure. On the immediate path
|
||||
/// the message is NOT buffered; on a retry the message is already buffered and
|
||||
/// is parked immediately (no further retries).</description></item>
|
||||
/// <item><description>throws — transient failure. On the immediate path the
|
||||
/// message is buffered for retry; on a retry the retry count is incremented and
|
||||
/// the message is parked once <see cref="StoreAndForwardMessage.MaxRetries"/> is
|
||||
/// reached.</description></item>
|
||||
/// </list>
|
||||
/// </summary>
|
||||
private readonly Dictionary<StoreAndForwardCategory, Func<StoreAndForwardMessage, Task<bool>>> _deliveryHandlers = new();
|
||||
|
||||
/// <summary>
|
||||
/// WP-14: Event callback for logging S&F activity to site event log.
|
||||
/// </summary>
|
||||
public event Action<string, StoreAndForwardCategory, string>? OnActivity;
|
||||
|
||||
/// <summary>
/// Creates the service and captures its collaborators. Nothing is started
/// here: storage initialisation and the retry timer are set up by
/// <see cref="StartAsync"/>.
/// </summary>
/// <param name="storage">Storage backend holding the buffered messages.</param>
/// <param name="options">Engine configuration (retry defaults, timer interval).</param>
/// <param name="logger">Logger used for service diagnostics.</param>
/// <param name="replication">Optional replication service for standby synchronization; <c>null</c> disables replication calls.</param>
/// <param name="cachedCallObserver">Optional observer for cached-call lifecycle events; <c>null</c> disables the notifications.</param>
/// <param name="siteId">
/// Site identifier this service belongs to. A null, empty, or whitespace value
/// is replaced by <see cref="UnknownSiteSentinel"/>.
/// </param>
public StoreAndForwardService(
    StoreAndForwardStorage storage,
    StoreAndForwardOptions options,
    ILogger<StoreAndForwardService> logger,
    ReplicationService? replication = null,
    ICachedCallLifecycleObserver? cachedCallObserver = null,
    string siteId = "")
{
    (_storage, _options, _logger) = (storage, options, logger);
    _replication = replication;
    _cachedCallObserver = cachedCallObserver;

    // StoreAndForward-023: never let a blank site id escape. Downstream
    // consumers (the central audit pipeline keys off SourceSite) get the
    // distinctive sentinel instead, which also makes a misconfigured host
    // easy to spot in the central log.
    if (string.IsNullOrWhiteSpace(siteId))
    {
        _siteId = UnknownSiteSentinel;
    }
    else
    {
        _siteId = siteId;
    }
}
|
||||
|
||||
/// <summary>
/// Registers the delivery handler for a message category, replacing any
/// handler previously registered for that category. See the
/// <c>_deliveryHandlers</c> field documentation for the true/false/throws
/// contract, which applies identically on the immediate and retry paths.
/// </summary>
/// <param name="category">The message category to handle.</param>
/// <param name="handler">The delivery handler function. Must not be <c>null</c>.</param>
/// <exception cref="ArgumentNullException"><paramref name="handler"/> is <c>null</c>.</exception>
public void RegisterDeliveryHandler(
    StoreAndForwardCategory category,
    Func<StoreAndForwardMessage, Task<bool>> handler)
{
    // Fail fast: a null handler would otherwise only surface as a
    // NullReferenceException when it is invoked, inside delivery code that
    // catches every exception and treats it as a transient delivery failure.
    ArgumentNullException.ThrowIfNull(handler);

    _deliveryHandlers[category] = handler;
}
|
||||
|
||||
/// <summary>
/// Initializes storage and starts the background retry timer.
/// </summary>
/// <remarks>
/// StoreAndForward-024: each timer tick starts a sweep and, when that sweep is
/// actually running, records its <see cref="Task"/> in <c>_sweepTask</c> so
/// <see cref="StopAsync"/> can wait for it. A tick that fires while an earlier
/// sweep is still in flight gets back an already-completed task, because
/// <see cref="RetryPendingMessagesAsync"/> short-circuits on its overlap guard.
/// That task is deliberately not stored: overwriting the field with it would
/// hide the still-running sweep from <see cref="StopAsync"/>, which would then
/// return while the sweep is still using storage and replication.
/// </remarks>
public async Task StartAsync()
{
    await _storage.InitializeAsync();

    _retryTimer = new Timer(
        _ =>
        {
            var sweep = RetryPendingMessagesAsync();

            // Only a sweep that is still running is worth tracking. A
            // completed task here is either a short-circuited overlap tick
            // or a sweep that finished synchronously; storing it could
            // replace the reference to an in-flight sweep.
            if (!sweep.IsCompleted)
            {
                Volatile.Write(ref _sweepTask, sweep);
            }
        },
        null,
        _options.RetryTimerInterval,
        _options.RetryTimerInterval);

    // NOTE(review): the value logged is the default per-message retry
    // interval, not the timer sweep period (RetryTimerInterval) used above.
    _logger.LogInformation(
        "Store-and-forward service started. Retry interval: {Interval}s",
        _options.DefaultRetryInterval.TotalSeconds);
}
|
||||
|
||||
/// <summary>
/// Shuts down the background retry timer, then waits — for at most
/// <see cref="SweepShutdownWaitTimeout"/> — for a retry sweep that is still
/// running to complete.
/// </summary>
/// <remarks>
/// StoreAndForward-024: disposing the timer alone is not enough. A sweep that
/// is already inside <see cref="RetryPendingMessagesAsync"/> keeps using
/// <see cref="_storage"/> and <see cref="_replication"/>, and could fail on a
/// disposed dependency once the DI container runs its own shutdown. The wait
/// is bounded so that a hung dependency cannot hold up host shutdown forever.
/// A timeout or a faulted sweep is logged as a warning and never rethrown.
/// </remarks>
public async Task StopAsync()
{
    var timer = _retryTimer;
    if (timer != null)
    {
        // Dispose the timer before waiting so that no new sweep can start
        // while the in-flight one drains.
        await timer.DisposeAsync();
        _retryTimer = null;
    }

    var pendingSweep = Volatile.Read(ref _sweepTask);
    if (pendingSweep is { IsCompleted: false })
    {
        try
        {
            // Bounded wait: if the sweep is stuck in a delivery handler or
            // a storage call, give up after the timeout and let the host
            // carry on with disposal. The sweep itself keeps running.
            await pendingSweep.WaitAsync(SweepShutdownWaitTimeout).ConfigureAwait(false);
        }
        catch (TimeoutException)
        {
            _logger.LogWarning(
                "Store-and-forward retry sweep did not finish within {Timeout}; " +
                "shutdown is proceeding while the sweep is still in-flight",
                SweepShutdownWaitTimeout);
        }
        catch (Exception ex)
        {
            // RetryPendingMessagesAsync logs its own failures at Error, so
            // this warning only makes an unexpected fault during the
            // shutdown wait visible. It is swallowed so the host's shutdown
            // sequence can continue.
            _logger.LogWarning(ex,
                "Store-and-forward retry sweep faulted during shutdown wait");
        }
    }
}
|
||||
|
||||
/// <summary>
/// WP-10: Submits a message for store-and-forward delivery. Unless the caller
/// opts out, delivery is tried once immediately through the handler registered
/// for <paramref name="category"/>; the outcome of that attempt decides whether
/// the message is finished, rejected, or buffered for the background retry sweep.
/// </summary>
/// <remarks>
/// <para>
/// Outcomes of the immediate attempt:
/// <list type="bullet">
/// <item><description>handler returns <c>true</c> — delivered; nothing is buffered.</description></item>
/// <item><description>handler returns <c>false</c> — permanent failure; the message is not buffered.</description></item>
/// <item><description>handler throws — transient failure; the message is buffered for retry.</description></item>
/// <item><description>no handler registered, or <paramref name="attemptImmediateDelivery"/>
/// is <c>false</c> — the handler is not invoked and the message is buffered directly.</description></item>
/// </list>
/// </para>
/// <para>
/// WP-10 retry counting: the immediate (or caller-made) attempt is attempt 0
/// and is not counted, so a freshly buffered message always has
/// <see cref="StoreAndForwardMessage.RetryCount"/> 0; only the background sweep
/// increments it. A buffered message is parked once <c>RetryCount</c> reaches
/// <paramref name="maxRetries"/> — <b>but only when that value is greater than
/// 0</b>. <c>0</c> means <b>no limit</b>: the message is retried on every sweep
/// until delivered and is <b>never parked</b> for retry exhaustion. It is
/// <i>not</i> a "do not retry" value; callers wanting bounded attempts must
/// pass a positive value.
/// </para>
/// <para>
/// WP-15: this method does not deduplicate. The caller (e.g.
/// ExternalSystem.CachedCall()) is responsible for making sure the remote
/// system tolerates duplicate deliveries.
/// </para>
/// </remarks>
/// <param name="category">Message category; selects the delivery handler.</param>
/// <param name="target">Target system name (external system / notification list / DB connection).</param>
/// <param name="payloadJson">JSON-serialized call payload; stored and passed on opaquely.</param>
/// <param name="originInstanceName">Instance that originated the message (WP-13: survives instance deletion).</param>
/// <param name="maxRetries">
/// Maximum number of background retry-sweep attempts before the message is
/// parked. <b><c>0</c> = no limit</b> (never parked for exhausting retries —
/// <b>not</b> "never retry"). <c>null</c> falls back to
/// <see cref="StoreAndForwardOptions.DefaultMaxRetries"/>. Must be positive to
/// bound delivery attempts. Mirrors the
/// <see cref="StoreAndForwardMessage.MaxRetries"/> contract.
/// </param>
/// <param name="retryInterval">Fixed interval between retries of this message; <c>null</c> uses the configured default.</param>
/// <param name="attemptImmediateDelivery">
/// Pass <c>false</c> when the caller has already made its own delivery attempt:
/// the handler is then not invoked here and the message goes straight to the
/// buffer for the retry sweep.
/// </param>
/// <param name="messageId">
/// Explicit caller-supplied message id; <c>null</c> (the default) makes the
/// service mint a fresh GUID. The Notification Outbox enqueue path supplies its
/// own id so the script-generated <c>NotificationId</c> is the single
/// idempotency key — it is the buffered row's
/// <see cref="StoreAndForwardMessage.Id"/>, it is carried inside the payload,
/// and it is the id the forwarder submits to central.
/// </param>
/// <param name="executionId">
/// Audit Log #23 (ExecutionId Task 4): per-run correlation id of the
/// originating script execution, stored on the buffered row so retry-loop
/// cached-call audit rows can carry it. <c>null</c> for callers (notifications,
/// pre-Task-4 callers) that do not supply one.
/// </param>
/// <param name="sourceScript">
/// Audit Log #23 (ExecutionId Task 4): identifier of the originating script,
/// stored alongside <paramref name="executionId"/> so retry-loop audit rows
/// carry the same provenance as the script-side cached rows. <c>null</c> when
/// not known.
/// </param>
/// <param name="parentExecutionId">
/// Audit Log #23 (ParentExecutionId Task 6): <c>ExecutionId</c> of the
/// inbound-API request that spawned the originating script execution, stored
/// alongside <paramref name="executionId"/>. <c>null</c> for a non-routed run
/// and for callers (notifications, pre-Task-6 callers) that do not supply one.
/// </param>
/// <returns>
/// A <see cref="StoreAndForwardResult"/> carrying the message id. Its first
/// positional value is <c>false</c> only for a permanent failure and its last
/// is <c>true</c> only when the message was buffered — presumably
/// "accepted" / "was buffered"; confirm against the result type.
/// </returns>
public async Task<StoreAndForwardResult> EnqueueAsync(
    StoreAndForwardCategory category,
    string target,
    string payloadJson,
    string? originInstanceName = null,
    int? maxRetries = null,
    TimeSpan? retryInterval = null,
    bool attemptImmediateDelivery = true,
    string? messageId = null,
    Guid? executionId = null,
    string? sourceScript = null,
    Guid? parentExecutionId = null)
{
    var effectiveInterval = retryInterval ?? _options.DefaultRetryInterval;

    var message = new StoreAndForwardMessage
    {
        Id = messageId ?? Guid.NewGuid().ToString("N"),
        Category = category,
        Target = target,
        PayloadJson = payloadJson,
        RetryCount = 0,
        MaxRetries = maxRetries ?? _options.DefaultMaxRetries,
        RetryIntervalMs = (long)effectiveInterval.TotalMilliseconds,
        CreatedAt = DateTimeOffset.UtcNow,
        Status = StoreAndForwardMessageStatus.Pending,
        OriginInstanceName = originInstanceName,
        ExecutionId = executionId,
        SourceScript = sourceScript,
        ParentExecutionId = parentExecutionId
    };

    // Direct-to-buffer path: either the caller already made its own attempt
    // (invoking the handler again would dispatch the request twice), or no
    // handler is registered yet. The initial attempt counts as attempt 0, so
    // RetryCount stays 0 (StoreAndForward-003).
    if (!attemptImmediateDelivery || !_deliveryHandlers.TryGetValue(category, out var deliver))
    {
        if (!attemptImmediateDelivery)
        {
            // The caller's own attempt is the last attempt made so far.
            message.LastAttemptAt = DateTimeOffset.UtcNow;
        }

        await BufferAsync(message);

        if (attemptImmediateDelivery)
        {
            RaiseActivity("Queued", category, $"No handler registered, buffered: {target}");
        }
        else
        {
            RaiseActivity("Queued", category, $"Buffered for retry: {target}");
        }

        return new StoreAndForwardResult(true, message.Id, true);
    }

    try
    {
        var delivered = await deliver(message);
        if (!delivered)
        {
            // Permanent failure — reported to the caller, never buffered.
            return new StoreAndForwardResult(false, message.Id, false);
        }

        RaiseActivity("Delivered", category, $"Immediate delivery to {target}");
        return new StoreAndForwardResult(true, message.Id, false);
    }
    catch (Exception ex)
    {
        // Transient failure — hand the message to the retry sweep. This was
        // attempt 0, so RetryCount is left at 0 (StoreAndForward-003).
        _logger.LogWarning(ex,
            "Immediate delivery to {Target} failed (transient), buffering for retry",
            target);

        message.LastAttemptAt = DateTimeOffset.UtcNow;
        message.LastError = ex.Message;
        await BufferAsync(message);

        RaiseActivity("Queued", category, $"Buffered for retry: {target} ({ex.Message})");
        return new StoreAndForwardResult(true, message.Id, true);
    }
}
|
||||
|
||||
/// <summary>
/// Writes <paramref name="message"/> to the local buffer storage and then,
/// when a replication service is configured (WP-11), forwards the add to it
/// so the standby node also holds the buffered message.
/// </summary>
/// <param name="message">The message to persist and replicate.</param>
private async Task BufferAsync(StoreAndForwardMessage message)
{
    await _storage.EnqueueAsync(message);

    if (_replication is { } standby)
    {
        standby.ReplicateEnqueue(message);
    }
}
|
||||
|
||||
/// <summary>
/// WP-10: One pass of the background retry sweep. Fetches the messages that
/// storage reports as due for retry and retries them one after another.
/// </summary>
/// <remarks>
/// Only one sweep runs at a time: a call made while another sweep holds the
/// <c>_retryInProgress</c> flag returns immediately without doing anything.
/// Any exception raised during the sweep is logged at Error and swallowed, and
/// the flag is always released.
/// </remarks>
internal async Task RetryPendingMessagesAsync()
{
    // Claim the single sweep slot. A non-zero previous value means another
    // sweep already owns it.
    var alreadyRunning = Interlocked.CompareExchange(ref _retryInProgress, 1, 0) != 0;
    if (alreadyRunning)
    {
        return;
    }

    try
    {
        var due = await _storage.GetMessagesForRetryAsync();
        if (due.Count > 0)
        {
            _logger.LogDebug("Retry sweep: {Count} messages due for retry", due.Count);

            foreach (var pending in due)
            {
                await RetryMessageAsync(pending);
            }
        }
    }
    catch (Exception ex)
    {
        _logger.LogError(ex, "Error during retry sweep");
    }
    finally
    {
        // Release the slot so the next timer tick can sweep again.
        Interlocked.Exchange(ref _retryInProgress, 0);
    }
}
|
||||
|
||||
/// <summary>
/// Performs one background retry attempt for a buffered message and applies
/// the outcome: remove it on success, park it on a permanent failure
/// (handler returned <c>false</c>), or — on a transient failure (handler
/// threw) — increment its retry count and park it once
/// <see cref="StoreAndForwardMessage.MaxRetries"/> is reached (a value of 0
/// or less never parks).
/// </summary>
/// <remarks>
/// Only the handler invocation is guarded by a catch. An exception from
/// storage, replication or activity reporting is not a delivery failure, so
/// it must not increment <see cref="StoreAndForwardMessage.RetryCount"/>,
/// overwrite <see cref="StoreAndForwardMessage.LastError"/>, or park a message
/// that was in fact delivered. Such an exception propagates to the caller
/// (the sweep logs it), and the row is left untouched for the next sweep.
/// </remarks>
/// <param name="message">The buffered message to retry. Mutated in place with the attempt's bookkeeping.</param>
private async Task RetryMessageAsync(StoreAndForwardMessage message)
{
    if (!_deliveryHandlers.TryGetValue(message.Category, out var handler))
    {
        _logger.LogWarning("No delivery handler for category {Category}", message.Category);
        return;
    }

    // Audit Log #23 (M3 Bundle E — Tasks E4/E5): time the handler invocation
    // only, so the audit row's DurationMs excludes storage / replication
    // overhead.
    var attemptStartUtc = DateTime.UtcNow;
    var attemptStopwatch = System.Diagnostics.Stopwatch.StartNew();

    bool delivered;
    Exception? transientFailure = null;
    try
    {
        delivered = await handler(message);
    }
    catch (Exception ex)
    {
        // Handler contract: a throw means a transient delivery failure.
        delivered = false;
        transientFailure = ex;
    }
    finally
    {
        attemptStopwatch.Stop();
    }

    var durationMs = (int)attemptStopwatch.ElapsedMilliseconds;

    if (transientFailure is null && delivered)
    {
        await _storage.RemoveMessageAsync(message.Id);
        _replication?.ReplicateRemove(message.Id);
        RaiseActivity("Delivered", message.Category,
            $"Delivered to {message.Target} after {message.RetryCount} retries");

        // M3: terminal Delivered observer notification — the audit bridge
        // maps this to Attempted + CachedResolve(Delivered).
        await NotifyCachedCallObserverAsync(
            message,
            CachedCallAttemptOutcome.Delivered,
            lastError: null,
            httpStatus: null,
            occurredAtUtc: attemptStartUtc,
            durationMs: durationMs);
        return;
    }

    if (transientFailure is null)
    {
        // Permanent failure on retry — park immediately.
        // StoreAndForward-005: the sweep observed this row as Pending; only
        // commit the park if it is still Pending so a concurrent operator
        // action that moved it (retry/discard) is not silently overwritten.
        message.Status = StoreAndForwardMessageStatus.Parked;
        message.LastAttemptAt = DateTimeOffset.UtcNow;
        message.LastError = "Permanent failure (handler returned false)";

        var parkedPermanent = await _storage.UpdateMessageIfStatusAsync(
            message, StoreAndForwardMessageStatus.Pending);
        if (!parkedPermanent)
        {
            _logger.LogDebug(
                "Message {MessageId} changed status during delivery; sweep park skipped",
                message.Id);
            return;
        }

        _replication?.ReplicatePark(message);
        RaiseActivity("Parked", message.Category,
            $"Permanent failure for {message.Target}: handler returned false");

        // M3: terminal PermanentFailure observer notification — the audit
        // bridge maps this to Attempted(Failed) + CachedResolve(Parked).
        await NotifyCachedCallObserverAsync(
            message,
            CachedCallAttemptOutcome.PermanentFailure,
            lastError: message.LastError,
            httpStatus: null,
            occurredAtUtc: attemptStartUtc,
            durationMs: durationMs);
        return;
    }

    // Transient failure — count the retry, then decide between parking and
    // leaving the message pending for the next sweep.
    message.RetryCount++;
    message.LastAttemptAt = DateTimeOffset.UtcNow;
    message.LastError = transientFailure.Message;

    if (message.MaxRetries > 0 && message.RetryCount >= message.MaxRetries)
    {
        // StoreAndForward-005: conditional park — commit only if the row is
        // still Pending so a concurrent operator action is not overwritten.
        message.Status = StoreAndForwardMessageStatus.Parked;

        var parkedExhausted = await _storage.UpdateMessageIfStatusAsync(
            message, StoreAndForwardMessageStatus.Pending);
        if (!parkedExhausted)
        {
            _logger.LogDebug(
                "Message {MessageId} changed status during delivery; sweep park skipped",
                message.Id);
            return;
        }

        _replication?.ReplicatePark(message);
        RaiseActivity("Parked", message.Category,
            $"Max retries ({message.MaxRetries}) reached for {message.Target}");
        _logger.LogWarning(
            "Message {MessageId} parked after {MaxRetries} retries to {Target}",
            message.Id, message.MaxRetries, message.Target);

        // M3: terminal ParkedMaxRetries observer notification — the audit
        // bridge maps this to Attempted(Failed) + CachedResolve(Parked).
        await NotifyCachedCallObserverAsync(
            message,
            CachedCallAttemptOutcome.ParkedMaxRetries,
            lastError: transientFailure.Message,
            httpStatus: null,
            occurredAtUtc: attemptStartUtc,
            durationMs: durationMs);
        return;
    }

    // StoreAndForward-005: the retry-count increment is also conditional on
    // the row still being Pending so it cannot clobber an operator action
    // that ran during the failed delivery.
    var countUpdated = await _storage.UpdateMessageIfStatusAsync(
        message, StoreAndForwardMessageStatus.Pending);
    if (!countUpdated)
    {
        _logger.LogDebug(
            "Message {MessageId} changed status during delivery; sweep retry-count update skipped",
            message.Id);
        return;
    }

    RaiseActivity("Retried", message.Category,
        $"Retry {message.RetryCount}/{message.MaxRetries} for {message.Target}: {transientFailure.Message}");

    // M3: per-attempt TransientFailure observer notification — the audit
    // bridge maps this to Attempted(Failed).
    await NotifyCachedCallObserverAsync(
        message,
        CachedCallAttemptOutcome.TransientFailure,
        lastError: transientFailure.Message,
        httpStatus: null,
        occurredAtUtc: attemptStartUtc,
        durationMs: durationMs);
}
|
||||
|
||||
/// <summary>
/// Audit Log #23 (M3 Bundle E — Tasks E4/E5): reports a just-completed
/// delivery attempt to the registered
/// <see cref="ICachedCallLifecycleObserver"/>, when one is present.
/// </summary>
/// <remarks>
/// <para>
/// Only the cached-call categories
/// (<see cref="StoreAndForwardCategory.ExternalSystem"/> and
/// <see cref="StoreAndForwardCategory.CachedDbWrite"/>) are reported. Every
/// other category — including
/// <see cref="StoreAndForwardCategory.Notification"/>, which has its own
/// central-side audit pipeline (Notification Outbox / #21) — returns without
/// calling the observer.
/// </para>
/// <para>
/// Best-effort (alog.md §7 contract): an exception thrown while building the
/// attempt context, or thrown by the observer itself, is logged at Warning
/// and swallowed so a failing audit pipeline cannot corrupt store-and-forward
/// retry bookkeeping. A message whose id does not parse as a
/// <c>TrackedOperationId</c> is skipped and a Warning is logged
/// (StoreAndForward-022).
/// </para>
/// </remarks>
/// <param name="message">The buffered message the attempt was made for.</param>
/// <param name="outcome">The outcome of the attempt being reported.</param>
/// <param name="lastError">Error text of the attempt, or null.</param>
/// <param name="httpStatus">HTTP status of the attempt, or null.</param>
/// <param name="occurredAtUtc">When the attempt started; re-stamped as <see cref="DateTimeKind.Utc"/>.</param>
/// <param name="durationMs">Duration of the attempt in milliseconds, or null.</param>
private async Task NotifyCachedCallObserverAsync(
    StoreAndForwardMessage message,
    CachedCallAttemptOutcome outcome,
    string? lastError,
    int? httpStatus,
    DateTime occurredAtUtc,
    int? durationMs)
{
    // Nothing to notify when no observer has been registered.
    if (_cachedCallObserver == null)
    {
        return;
    }

    // Map the category to its audit channel. Anything that is not a
    // cached-call category (e.g. notifications, which are audited on the
    // outbox side) produces no telemetry on this hook.
    string channel;
    switch (message.Category)
    {
        case StoreAndForwardCategory.ExternalSystem:
            channel = "ApiOutbound";
            break;
        case StoreAndForwardCategory.CachedDbWrite:
            channel = "DbOutbound";
            break;
        default:
            return;
    }

    // StoreAndForward-022: a non-parseable id still drops the notification
    // (retry bookkeeping must never depend on the audit pipeline), but the
    // drop is surfaced as a Warning so the offending caller can be diagnosed.
    var idIsTracked = TrackedOperationId.TryParse(message.Id, out var trackedId);
    if (!idIsTracked)
    {
        _logger.LogWarning(
            "Cached-call audit observer skipped: message id {MessageId} is not a parseable TrackedOperationId (category {Category}, outcome {Outcome}). " +
            "Audit lifecycle for this operation will have no rows.",
            message.Id, message.Category, outcome);
        return;
    }

    CachedCallAttemptContext attempt;
    try
    {
        var createdUtc = message.CreatedAt.UtcDateTime;
        var occurredUtc = DateTime.SpecifyKind(occurredAtUtc, DateTimeKind.Utc);

        attempt = new CachedCallAttemptContext(
            TrackedOperationId: trackedId,
            Channel: channel,
            Target: message.Target,
            SourceSite: _siteId,
            Outcome: outcome,
            RetryCount: message.RetryCount,
            LastError: lastError,
            HttpStatus: httpStatus,
            CreatedAtUtc: createdUtc,
            OccurredAtUtc: occurredUtc,
            DurationMs: durationMs,
            SourceInstanceId: message.OriginInstanceName,
            // Audit Log #23 (ExecutionId Task 4): execution id and source
            // script carried on the buffered message; null on rows buffered
            // before Task 4 (back-compat).
            ExecutionId: message.ExecutionId,
            SourceScript: message.SourceScript,
            // Audit Log #23 (ParentExecutionId Task 6): execution id of the
            // spawning inbound-API request; null for a non-routed run and on
            // rows buffered before Task 6 (back-compat).
            ParentExecutionId: message.ParentExecutionId);
    }
    catch (Exception buildEx)
    {
        // Record construction is not expected to throw, but the alog.md §7
        // contract requires this path to be exception-safe regardless.
        _logger.LogWarning(buildEx,
            "Failed to build cached-call attempt context for {MessageId}; observer skipped",
            message.Id);
        return;
    }

    try
    {
        var notification = _cachedCallObserver.OnAttemptCompletedAsync(attempt, CancellationToken.None);
        await notification.ConfigureAwait(false);
    }
    catch (Exception ex)
    {
        // alog.md §7 best-effort: an observer outage must never be
        // misclassified as a transient delivery failure or disturb the
        // retry bookkeeping, so it is logged and dropped here.
        _logger.LogWarning(ex,
            "ICachedCallLifecycleObserver threw for {MessageId} (Outcome {Outcome}); ignored",
            message.Id, outcome);
    }
}
|
||||
|
||||
/// <summary>
/// WP-12: Returns one page of parked messages for the central query
/// (Pattern 8). The call is forwarded unchanged to the storage layer.
/// </summary>
/// <param name="category">Category to filter on, or null for every category.</param>
/// <param name="pageNumber">The page to return (1-based).</param>
/// <param name="pageSize">The number of messages per page.</param>
/// <returns>The parked messages on the requested page together with the total count.</returns>
public async Task<(List<StoreAndForwardMessage> Messages, int TotalCount)> GetParkedMessagesAsync(
    StoreAndForwardCategory? category = null,
    int pageNumber = 1,
    int pageSize = 50)
    => await _storage.GetParkedMessagesAsync(category, pageNumber, pageSize);
|
||||
|
||||
/// <summary>
/// WP-12: Moves a parked message back to the pending queue.
/// </summary>
/// <remarks>
/// <para>
/// StoreAndForward-016: the requeue is a buffer state change, so it is
/// replicated to the standby (as a
/// <see cref="ReplicationOperationType.Requeue"/>) to preserve the operator's
/// retry intent across a failover.
/// </para>
/// <para>
/// StoreAndForward-017: the activity-log entry uses the message's own
/// category rather than a hard-coded one.
/// </para>
/// <para>
/// StoreAndForward-020: the parked row is read <i>before</i> the local
/// requeue write instead of being re-read afterwards. A concurrent
/// <c>RemoveMessageAsync</c> or <c>DiscardParkedMessageAsync</c> landing
/// between the two storage calls therefore cannot leave this method without
/// a row to send in the <c>Requeue</c> replication.
/// </para>
/// </remarks>
/// <param name="messageId">The identifier of the message to retry.</param>
/// <returns>True if the message was requeued; false if it was missing or no longer parked.</returns>
public async Task<bool> RetryParkedMessageAsync(string messageId)
{
    // Snapshot the row first (StoreAndForward-020). If it is absent or is not
    // parked there is nothing to requeue and nothing to replicate.
    var snapshot = await _storage.GetMessageByIdAsync(messageId);
    if (snapshot is not { Status: StoreAndForwardMessageStatus.Parked })
    {
        return false;
    }

    // The storage update is itself conditional on status = Parked, so a row
    // that moved since the snapshot makes it report failure; in that case we
    // return without replicating.
    var requeued = await _storage.RetryParkedMessageAsync(messageId);
    if (!requeued)
    {
        return false;
    }

    // Mirror what the storage layer just wrote (Pending, retry count 0,
    // cleared error and last-attempt timestamp) onto the snapshot, so the
    // standby receives the post-requeue state even if a concurrent writer
    // has already removed the row locally.
    snapshot.Status = StoreAndForwardMessageStatus.Pending;
    snapshot.RetryCount = 0;
    snapshot.LastError = null;
    snapshot.LastAttemptAt = null;

    _replication?.ReplicateRequeue(snapshot);
    RaiseActivity("Retry", snapshot.Category, $"Parked message {messageId} moved back to queue");

    return true;
}
|
||||
|
||||
/// <summary>
/// WP-12: Permanently discards a parked message.
/// </summary>
/// <remarks>
/// <para>
/// StoreAndForward-016: a discard is a buffer removal, so it is replicated
/// to the standby (as a <see cref="ReplicationOperationType.Remove"/>) to
/// keep the discarded message from reappearing after a failover.
/// </para>
/// <para>
/// StoreAndForward-017: the activity-log entry uses the message's own
/// category when the row could be read before deletion; otherwise it falls
/// back to <see cref="StoreAndForwardCategory.ExternalSystem"/>.
/// </para>
/// </remarks>
/// <param name="messageId">The identifier of the message to discard.</param>
/// <returns>True if the message was discarded, false otherwise.</returns>
public async Task<bool> DiscardParkedMessageAsync(string messageId)
{
    // Read the row before deleting it: once the delete has run the category
    // needed for the activity entry is gone.
    var doomed = await _storage.GetMessageByIdAsync(messageId);

    var discarded = await _storage.DiscardParkedMessageAsync(messageId);
    if (!discarded)
    {
        return false;
    }

    _replication?.ReplicateRemove(messageId);

    var category = doomed?.Category ?? StoreAndForwardCategory.ExternalSystem;
    RaiseActivity("Discard", category, $"Parked message {messageId} discarded");

    return true;
}
|
||||
|
||||
/// <summary>
/// WP-14: Returns the buffer depth per category for health reporting, as
/// reported by the storage layer.
/// </summary>
/// <returns>A dictionary mapping each category to its buffer depth.</returns>
public async Task<Dictionary<StoreAndForwardCategory, int>> GetBufferDepthAsync()
    => await _storage.GetBufferDepthByCategoryAsync();
|
||||
|
||||
/// <summary>
/// WP-13: Returns how many store-and-forward messages originate from the
/// given instance (used to verify that messages survive instance deletion).
/// </summary>
/// <param name="instanceName">The origin instance name to count messages for.</param>
/// <returns>The number of messages originating from the instance.</returns>
public async Task<int> GetMessageCountForInstanceAsync(string instanceName)
    => await _storage.GetMessageCountByOriginInstanceAsync(instanceName);
|
||||
|
||||
/// <summary>
/// Notification Outbox: fetches a buffered message by id, yielding
/// <c>null</c> when the buffer does not (or no longer) hold it.
/// <c>Notify.Status</c> relies on this to recognise a notification that is
/// still in transit at the site — central reports it not-found while the
/// store-and-forward buffer still holds it, which is the site-local
/// <c>Forwarding</c> state.
/// </summary>
/// <param name="messageId">The message identifier.</param>
/// <returns>The message, or null if not found.</returns>
public async Task<StoreAndForwardMessage?> GetMessageByIdAsync(string messageId)
    => await _storage.GetMessageByIdAsync(messageId);
|
||||
|
||||
/// <summary>
/// WP-14: Publishes a store-and-forward activity notification to every
/// <c>OnActivity</c> subscriber. Activity logging is best-effort.
/// </summary>
/// <remarks>
/// StoreAndForward-009: the delegate is copied to a local before use, so a
/// concurrent unsubscribe cannot cause a null dereference, and each
/// subscriber is invoked inside its own try/catch so that one throwing
/// subscriber (e.g. the site event log) neither aborts the caller nor stops
/// the remaining subscribers from running. This matters for callers such as
/// <see cref="EnqueueAsync"/> and <c>RetryMessageAsync</c>: a subscriber
/// exception must not escape into their delivery error handling, where it
/// would be misclassified as a transient delivery failure.
/// </remarks>
/// <param name="action">Short action label for the activity entry.</param>
/// <param name="category">Category of the message the activity relates to.</param>
/// <param name="detail">Human-readable detail text.</param>
private void RaiseActivity(string action, StoreAndForwardCategory category, string detail)
{
    var snapshot = OnActivity;
    if (snapshot == null)
    {
        return;
    }

    foreach (var subscriber in snapshot.GetInvocationList())
    {
        // The cast stays outside the try block so only the subscriber's own
        // exceptions are swallowed.
        var callback = (Action<string, StoreAndForwardCategory, string>)subscriber;

        try
        {
            callback(action, category, detail);
        }
        catch (Exception ex)
        {
            _logger.LogWarning(ex,
                "Store-and-forward activity subscriber threw for action {Action}; ignored",
                action);
        }
    }
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Result of an enqueue operation.
/// </summary>
/// <remarks>
/// The positional parameters are documented with <c>param</c> tags on the
/// record itself: an XML doc comment placed directly on a positional
/// parameter is not attached to a valid language element (compiler warning
/// CS1587) and is dropped from the generated documentation.
/// </remarks>
/// <param name="Accepted">True if the message was accepted (either delivered immediately or buffered).</param>
/// <param name="MessageId">Unique message ID for tracking.</param>
/// <param name="WasBuffered">True if the message was buffered (not delivered immediately).</param>
public record StoreAndForwardResult(
    bool Accepted,
    string MessageId,
    bool WasBuffered);
|
||||
@@ -0,0 +1,563 @@
|
||||
using Microsoft.Data.Sqlite;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums;
|
||||
|
||||
namespace ZB.MOM.WW.ScadaBridge.StoreAndForward;
|
||||
|
||||
/// <summary>
|
||||
/// WP-9: SQLite persistence layer for store-and-forward messages.
|
||||
/// Uses direct Microsoft.Data.Sqlite (not EF Core) for lightweight site-side storage.
|
||||
/// No max buffer size per design decision.
|
||||
///
|
||||
/// StoreAndForward-008: every method opens a fresh <see cref="SqliteConnection"/> for
|
||||
/// the duration of the call rather than holding a long-lived connection. This is a
|
||||
/// deliberate trade-off, not an oversight: Microsoft.Data.Sqlite maintains an internal
|
||||
/// connection pool keyed on the connection string, so <c>OpenAsync</c> on a previously
|
||||
/// used connection string reuses a pooled handle instead of performing a real file
|
||||
/// open. The retry sweep therefore relies on that pool for acceptable performance —
|
||||
/// it calls <see cref="RemoveMessageAsync"/> / <see cref="UpdateMessageIfStatusAsync"/>
|
||||
/// once per due message, and with no max buffer size (by design) the buffer can grow
|
||||
/// large. The connection-per-call style keeps each method self-contained and
|
||||
/// transaction-scoped; if profiling ever shows the pooled open to be a bottleneck on
|
||||
/// the hot retry path, the remedy is a batched sweep API that opens one connection (and
|
||||
/// one transaction) per sweep.
|
||||
/// </summary>
|
||||
public class StoreAndForwardStorage
|
||||
{
|
||||
private readonly string _connectionString;
|
||||
private readonly ILogger<StoreAndForwardStorage> _logger;
|
||||
|
||||
/// <summary>
/// Creates a <see cref="StoreAndForwardStorage"/> bound to the given SQLite
/// connection string. No connection is opened here; the arguments are only
/// stored.
/// </summary>
/// <param name="connectionString">SQLite connection string for the store-and-forward database.</param>
/// <param name="logger">Logger for diagnostics.</param>
public StoreAndForwardStorage(string connectionString, ILogger<StoreAndForwardStorage> logger)
    => (_connectionString, _logger) = (connectionString, logger);
|
||||
|
||||
/// <summary>
/// Prepares the SQLite store: makes sure the database directory exists,
/// creates the <c>sf_messages</c> table and its indexes when absent, and
/// adds the later-introduced audit columns to a pre-existing table.
/// </summary>
public async Task InitializeAsync()
{
    EnsureDatabaseDirectoryExists();

    await using var connection = new SqliteConnection(_connectionString);
    await connection.OpenAsync();

    await using var createSchema = connection.CreateCommand();
    createSchema.CommandText = @"
        CREATE TABLE IF NOT EXISTS sf_messages (
            id TEXT PRIMARY KEY,
            category INTEGER NOT NULL,
            target TEXT NOT NULL,
            payload_json TEXT NOT NULL,
            retry_count INTEGER NOT NULL DEFAULT 0,
            max_retries INTEGER NOT NULL DEFAULT 50,
            retry_interval_ms INTEGER NOT NULL DEFAULT 30000,
            created_at TEXT NOT NULL,
            last_attempt_at TEXT,
            status INTEGER NOT NULL DEFAULT 0,
            last_error TEXT,
            origin_instance TEXT
        );

        CREATE INDEX IF NOT EXISTS idx_sf_messages_status ON sf_messages(status);
        CREATE INDEX IF NOT EXISTS idx_sf_messages_category ON sf_messages(category);
    ";
    await createSchema.ExecuteNonQueryAsync();

    // Additive migrations. CREATE TABLE IF NOT EXISTS does not add columns to
    // a table that already exists, so a database created by an older build
    // needs these columns ALTER-ed in. Each is a nullable TEXT column with no
    // default, so rows buffered before the migration read back null for it
    // (back-compat).
    //   execution_id, source_script : Audit Log #23 (ExecutionId Task 4)
    //   parent_execution_id         : Audit Log #23 (ParentExecutionId Task 6)
    var migratedColumns = new[] { "execution_id", "source_script", "parent_execution_id" };
    foreach (var column in migratedColumns)
    {
        await AddColumnIfMissingAsync(connection, column, "TEXT");
    }

    _logger.LogInformation("Store-and-forward SQLite storage initialized");
}
|
||||
|
||||
/// <summary>
/// Audit Log #23 (ExecutionId Task 4): adds a column to <c>sf_messages</c>
/// unless the table already has it. SQLite has no <c>ADD COLUMN IF NOT
/// EXISTS</c>, so <c>pragma_table_info</c> is queried for the column name
/// first and the <c>ALTER TABLE</c> is skipped on a match, which makes the
/// call safe to repeat on every <see cref="InitializeAsync"/>.
/// </summary>
/// <param name="connection">An open connection to the store-and-forward database.</param>
/// <param name="columnName">Name of the column to add.</param>
/// <param name="columnType">SQLite type of the column to add.</param>
private static async Task AddColumnIfMissingAsync(
    SqliteConnection connection, string columnName, string columnType)
{
    await using var lookup = connection.CreateCommand();
    lookup.CommandText = "SELECT COUNT(*) FROM pragma_table_info('sf_messages') WHERE name = @name";
    lookup.Parameters.AddWithValue("@name", columnName);

    var matchingColumns = Convert.ToInt32(await lookup.ExecuteScalarAsync());
    if (matchingColumns > 0)
    {
        // Column already present — nothing to do.
        return;
    }

    // DDL cannot take parameters, so the name and type are interpolated.
    // Both come from constants in this class, never from user input.
    await using var addColumn = connection.CreateCommand();
    addColumn.CommandText = $"ALTER TABLE sf_messages ADD COLUMN {columnName} {columnType}";
    await addColumn.ExecuteNonQueryAsync();
}
|
||||
|
||||
/// <summary>
/// Creates the parent directory of a file-backed SQLite database when it is
/// missing. SQLite creates the database file on demand but not the directory
/// above it, so a configured path such as "./data/store-and-forward.db" fails
/// to open ("unable to open database file") while "data" does not exist.
/// In-memory databases and empty data sources are skipped.
/// </summary>
private void EnsureDatabaseDirectoryExists()
{
    var settings = new SqliteConnectionStringBuilder(_connectionString);

    // Nothing on disk to prepare for in-memory or unspecified data sources.
    var hasNoFile = settings.Mode == SqliteOpenMode.Memory
        || string.IsNullOrEmpty(settings.DataSource)
        || settings.DataSource == ":memory:";
    if (hasNoFile)
    {
        return;
    }

    var absolutePath = System.IO.Path.GetFullPath(settings.DataSource);
    var parent = System.IO.Path.GetDirectoryName(absolutePath);
    if (string.IsNullOrEmpty(parent) || System.IO.Directory.Exists(parent))
    {
        return;
    }

    System.IO.Directory.CreateDirectory(parent);
    _logger.LogInformation("Created store-and-forward database directory: {Directory}", parent);
}
|
||||
|
||||
/// <summary>
/// WP-9: Inserts a message row into <c>sf_messages</c>. All persisted fields,
/// including status and retry count, are written as they are on
/// <paramref name="message"/>.
/// </summary>
/// <param name="message">The message to enqueue.</param>
public async Task EnqueueAsync(StoreAndForwardMessage message)
{
    await using var connection = new SqliteConnection(_connectionString);
    await connection.OpenAsync();

    await using var insert = connection.CreateCommand();
    insert.CommandText = @"
        INSERT INTO sf_messages (id, category, target, payload_json, retry_count, max_retries,
            retry_interval_ms, created_at, last_attempt_at, status, last_error,
            origin_instance, execution_id, source_script, parent_execution_id)
        VALUES (@id, @category, @target, @payload, @retryCount, @maxRetries,
            @retryIntervalMs, @createdAt, @lastAttempt, @status, @lastError,
            @origin, @executionId, @sourceScript, @parentExecutionId)";

    // Optional values are bound as DBNull when absent.
    static object OrDbNull(object? value) => value ?? DBNull.Value;

    var parameters = insert.Parameters;
    parameters.AddWithValue("@id", message.Id);
    parameters.AddWithValue("@category", (int)message.Category);
    parameters.AddWithValue("@target", message.Target);
    parameters.AddWithValue("@payload", message.PayloadJson);
    parameters.AddWithValue("@retryCount", message.RetryCount);
    parameters.AddWithValue("@maxRetries", message.MaxRetries);
    parameters.AddWithValue("@retryIntervalMs", message.RetryIntervalMs);
    // Timestamps are stored as round-trip ("O") strings in TEXT columns.
    parameters.AddWithValue("@createdAt", message.CreatedAt.ToString("O"));
    parameters.AddWithValue("@lastAttempt", OrDbNull(message.LastAttemptAt?.ToString("O")));
    parameters.AddWithValue("@status", (int)message.Status);
    parameters.AddWithValue("@lastError", OrDbNull(message.LastError));
    parameters.AddWithValue("@origin", OrDbNull(message.OriginInstanceName));
    // Audit Log #23 (ExecutionId Task 4 / ParentExecutionId Task 6): both
    // ids are stored in their canonical "D" string form so they round-trip
    // through the TEXT columns; absent ids are stored as NULL.
    parameters.AddWithValue("@executionId", OrDbNull(message.ExecutionId?.ToString("D")));
    parameters.AddWithValue("@sourceScript", OrDbNull(message.SourceScript));
    parameters.AddWithValue("@parentExecutionId", OrDbNull(message.ParentExecutionId?.ToString("D")));

    await insert.ExecuteNonQueryAsync();
}
|
||||
|
||||
/// <summary>
/// WP-10: Returns the pending messages that are due for a delivery attempt,
/// oldest first by <c>created_at</c>. A pending message is due when it has
/// never been attempted, when its retry interval is zero, or when at least
/// its retry interval has elapsed since the last attempt.
/// </summary>
/// <returns>The due messages, ordered by creation time ascending.</returns>
public async Task<List<StoreAndForwardMessage>> GetMessagesForRetryAsync()
{
    await using var connection = new SqliteConnection(_connectionString);
    await connection.OpenAsync();

    await using var dueQuery = connection.CreateCommand();
    // julianday() differences are in days; * 86400000 converts to milliseconds
    // for comparison against retry_interval_ms.
    dueQuery.CommandText = @"
        SELECT id, category, target, payload_json, retry_count, max_retries,
            retry_interval_ms, created_at, last_attempt_at, status, last_error, origin_instance,
            execution_id, source_script, parent_execution_id
        FROM sf_messages
        WHERE status = @pending
        AND (last_attempt_at IS NULL
            OR retry_interval_ms = 0
            OR (julianday('now') - julianday(last_attempt_at)) * 86400000 >= retry_interval_ms)
        ORDER BY created_at ASC";
    dueQuery.Parameters.AddWithValue("@pending", (int)StoreAndForwardMessageStatus.Pending);

    var dueMessages = await ReadMessagesAsync(dueQuery);
    return dueMessages;
}
|
||||
|
||||
/// <summary>
/// WP-10: Persists the outcome of a delivery attempt. Retry count,
/// last-attempt timestamp, status and last error are overwritten for the row
/// with the message's id, regardless of the row's current status.
/// </summary>
/// <param name="message">The message with updated retry count, status, and last error.</param>
public async Task UpdateMessageAsync(StoreAndForwardMessage message)
{
    await using var connection = new SqliteConnection(_connectionString);
    await connection.OpenAsync();

    await using var update = connection.CreateCommand();
    update.CommandText = @"
        UPDATE sf_messages
        SET retry_count = @retryCount,
            last_attempt_at = @lastAttempt,
            status = @status,
            last_error = @lastError
        WHERE id = @id";

    var lastAttemptText = message.LastAttemptAt?.ToString("O");

    var parameters = update.Parameters;
    parameters.AddWithValue("@id", message.Id);
    parameters.AddWithValue("@retryCount", message.RetryCount);
    parameters.AddWithValue("@lastAttempt", (object?)lastAttemptText ?? DBNull.Value);
    parameters.AddWithValue("@status", (int)message.Status);
    parameters.AddWithValue("@lastError", (object?)message.LastError ?? DBNull.Value);

    await update.ExecuteNonQueryAsync();
}
|
||||
|
||||
/// <summary>
/// WP-10: Persists the outcome of a delivery attempt only while the row is
/// still in <paramref name="expectedStatus"/>.
/// </summary>
/// <remarks>
/// StoreAndForward-005: the retry sweep performs its state-changing writes
/// through this method so it cannot overwrite a concurrent operator action
/// (RetryParkedMessageAsync / DiscardParkedMessageAsync). Those operator
/// operations are SQL-conditional on <c>status = Parked</c>; making the
/// sweep's write conditional on the status it observed closes the
/// sweep-versus-management race instead of relying solely on the in-process
/// overlapping-sweep guard.
/// </remarks>
/// <param name="message">The message with the updated values to persist.</param>
/// <param name="expectedStatus">The status the row must currently have for the update to proceed.</param>
/// <returns>
/// True if a row was updated; false if no row matched the id and expected
/// status (for example because an operator retried or discarded the message).
/// </returns>
public async Task<bool> UpdateMessageIfStatusAsync(
    StoreAndForwardMessage message,
    StoreAndForwardMessageStatus expectedStatus)
{
    await using var connection = new SqliteConnection(_connectionString);
    await connection.OpenAsync();

    await using var guardedUpdate = connection.CreateCommand();
    guardedUpdate.CommandText = @"
        UPDATE sf_messages
        SET retry_count = @retryCount,
            last_attempt_at = @lastAttempt,
            status = @status,
            last_error = @lastError
        WHERE id = @id AND status = @expectedStatus";

    var lastAttemptText = message.LastAttemptAt?.ToString("O");

    var parameters = guardedUpdate.Parameters;
    parameters.AddWithValue("@id", message.Id);
    parameters.AddWithValue("@retryCount", message.RetryCount);
    parameters.AddWithValue("@lastAttempt", (object?)lastAttemptText ?? DBNull.Value);
    parameters.AddWithValue("@status", (int)message.Status);
    parameters.AddWithValue("@lastError", (object?)message.LastError ?? DBNull.Value);
    parameters.AddWithValue("@expectedStatus", (int)expectedStatus);

    // Zero affected rows means the status guard did not match.
    var affected = await guardedUpdate.ExecuteNonQueryAsync();
    return affected > 0;
}
|
||||
|
||||
/// <summary>
/// WP-10: Deletes the row with the given id (used once a message has been
/// delivered). The delete is unconditional on status; deleting an id that is
/// not present is a no-op.
/// </summary>
/// <param name="messageId">The id of the message to remove.</param>
public async Task RemoveMessageAsync(string messageId)
{
    await using var connection = new SqliteConnection(_connectionString);
    await connection.OpenAsync();

    await using var delete = connection.CreateCommand();
    delete.CommandText = "DELETE FROM sf_messages WHERE id = @id";
    delete.Parameters.AddWithValue("@id", messageId);

    await delete.ExecuteNonQueryAsync();
}
|
||||
|
||||
/// <summary>
/// WP-12: Gets parked messages, optionally filtered by category, one page at
/// a time and ordered by <c>created_at</c> ascending.
/// </summary>
/// <remarks>
/// <para>
/// StoreAndForward-006: the COUNT(*) and the paged SELECT run inside a single
/// transaction so they observe one consistent snapshot. Without it, a
/// concurrent enqueue/park/discard arriving between the two statements yields
/// a TotalCount inconsistent with the returned page (flickering totals /
/// off-by-one page math in the paginated UI).
/// </para>
/// <para>
/// The row offset is computed in 64-bit arithmetic. The previous 32-bit
/// <c>(pageNumber - 1) * pageSize</c> silently wrapped for very large page
/// numbers; a wrapped negative OFFSET is treated by SQLite as zero, so the
/// caller received the first page instead of an empty one. Offsets that are
/// genuinely negative (page numbers below 1) are clamped to zero, which is
/// what SQLite already did with them.
/// </para>
/// </remarks>
/// <param name="category">Optional category filter; null returns parked messages from all categories.</param>
/// <param name="pageNumber">1-based page number.</param>
/// <param name="pageSize">Maximum number of messages to return per page.</param>
/// <returns>The messages on the requested page and the total number of parked messages matching the filter.</returns>
public async Task<(List<StoreAndForwardMessage> Messages, int TotalCount)> GetParkedMessagesAsync(
    StoreAndForwardCategory? category = null,
    int pageNumber = 1,
    int pageSize = 50)
{
    // Widen before multiplying so the product cannot overflow Int32.
    long offset = Math.Max(0L, ((long)pageNumber - 1) * pageSize);

    await using var connection = new SqliteConnection(_connectionString);
    await connection.OpenAsync();

    await using var transaction = (SqliteTransaction)await connection.BeginTransactionAsync();

    // Both statements share the same filter so the count always describes
    // the set the page is drawn from.
    var categoryFilter = category.HasValue ? " AND category = @category" : "";

    // Count
    await using var countCmd = connection.CreateCommand();
    countCmd.Transaction = transaction;
    countCmd.CommandText = $"SELECT COUNT(*) FROM sf_messages WHERE status = @parked{categoryFilter}";
    countCmd.Parameters.AddWithValue("@parked", (int)StoreAndForwardMessageStatus.Parked);
    if (category.HasValue)
    {
        countCmd.Parameters.AddWithValue("@category", (int)category.Value);
    }

    var totalCount = Convert.ToInt32(await countCmd.ExecuteScalarAsync());

    // Page
    await using var pageCmd = connection.CreateCommand();
    pageCmd.Transaction = transaction;
    pageCmd.CommandText = $@"
        SELECT id, category, target, payload_json, retry_count, max_retries,
            retry_interval_ms, created_at, last_attempt_at, status, last_error, origin_instance,
            execution_id, source_script, parent_execution_id
        FROM sf_messages
        WHERE status = @parked{categoryFilter}
        ORDER BY created_at ASC
        LIMIT @limit OFFSET @offset";
    pageCmd.Parameters.AddWithValue("@parked", (int)StoreAndForwardMessageStatus.Parked);
    if (category.HasValue)
    {
        pageCmd.Parameters.AddWithValue("@category", (int)category.Value);
    }

    pageCmd.Parameters.AddWithValue("@limit", pageSize);
    pageCmd.Parameters.AddWithValue("@offset", offset);

    var messages = await ReadMessagesAsync(pageCmd);

    await transaction.CommitAsync();
    return (messages, totalCount);
}
|
||||
|
||||
/// <summary>
/// WP-12: Moves a parked message back to pending for retry. The update only
/// applies to a row that is currently parked; it resets the retry count to
/// zero and clears the last error and last-attempt timestamp.
/// </summary>
/// <remarks>
/// StoreAndForward-010: <c>last_attempt_at</c> is reset to NULL so the
/// re-queued message is unambiguously due on the next retry sweep. An
/// operator-initiated retry means "attempt this again now"; keeping the stale
/// pre-park timestamp would tie the retry timing to the configured retry
/// interval measured from the original attempt, so a long interval would
/// delay the operator's retry.
/// </remarks>
/// <param name="messageId">The id of the parked message to move back to Pending.</param>
/// <returns>True if a parked row with that id was updated; otherwise false.</returns>
public async Task<bool> RetryParkedMessageAsync(string messageId)
{
    await using var connection = new SqliteConnection(_connectionString);
    await connection.OpenAsync();

    await using var requeue = connection.CreateCommand();
    requeue.CommandText = @"
        UPDATE sf_messages
        SET status = @pending, retry_count = 0, last_error = NULL, last_attempt_at = NULL
        WHERE id = @id AND status = @parked";

    var parameters = requeue.Parameters;
    parameters.AddWithValue("@id", messageId);
    parameters.AddWithValue("@pending", (int)StoreAndForwardMessageStatus.Pending);
    parameters.AddWithValue("@parked", (int)StoreAndForwardMessageStatus.Parked);

    var affected = await requeue.ExecuteNonQueryAsync();
    return affected > 0;
}
|
||||
|
||||
/// <summary>
/// WP-12: Permanently removes a parked message from <c>sf_messages</c>.
/// Only rows whose status is Parked are eligible for deletion.
/// </summary>
/// <param name="messageId">The id of the parked message to discard.</param>
/// <returns><c>true</c> when a parked row with this id was deleted; otherwise <c>false</c>.</returns>
public async Task<bool> DiscardParkedMessageAsync(string messageId)
{
    await using var db = new SqliteConnection(_connectionString);
    await db.OpenAsync();

    await using var delete = db.CreateCommand();
    delete.CommandText = "DELETE FROM sf_messages WHERE id = @id AND status = @parked";

    var parameters = delete.Parameters;
    parameters.AddWithValue("@id", messageId);
    parameters.AddWithValue("@parked", (int)StoreAndForwardMessageStatus.Parked);

    var deleted = await delete.ExecuteNonQueryAsync();
    return deleted > 0;
}
|
||||
|
||||
/// <summary>
/// WP-14: Reports the buffer depth per category, i.e. how many messages with
/// status Pending exist for each category value present in <c>sf_messages</c>.
/// </summary>
/// <returns>
/// A dictionary keyed by category holding the pending-message count. Categories
/// with no pending rows are not produced by the GROUP BY and so are absent.
/// </returns>
public async Task<Dictionary<StoreAndForwardCategory, int>> GetBufferDepthByCategoryAsync()
{
    await using var db = new SqliteConnection(_connectionString);
    await db.OpenAsync();

    await using var query = db.CreateCommand();
    query.CommandText = @"
        SELECT category, COUNT(*) as cnt
        FROM sf_messages
        WHERE status = @pending
        GROUP BY category";
    query.Parameters.AddWithValue("@pending", (int)StoreAndForwardMessageStatus.Pending);

    var depthByCategory = new Dictionary<StoreAndForwardCategory, int>();

    await using var rows = await query.ExecuteReaderAsync();
    while (await rows.ReadAsync())
    {
        // Column 0 is the category code, column 1 the aggregated count.
        depthByCategory[(StoreAndForwardCategory)rows.GetInt32(0)] = rows.GetInt32(1);
    }

    return depthByCategory;
}
|
||||
|
||||
/// <summary>
/// WP-13: Counts the messages recorded for a given origin instance, regardless
/// of status. Used to verify that messages are NOT deleted when an instance is
/// deleted.
/// </summary>
/// <param name="instanceName">The origin instance name to count messages for.</param>
/// <returns>The number of rows whose <c>origin_instance</c> equals <paramref name="instanceName"/>.</returns>
public async Task<int> GetMessageCountByOriginInstanceAsync(string instanceName)
{
    await using var db = new SqliteConnection(_connectionString);
    await db.OpenAsync();

    await using var count = db.CreateCommand();
    count.CommandText = @"
        SELECT COUNT(*)
        FROM sf_messages
        WHERE origin_instance = @origin";
    count.Parameters.AddWithValue("@origin", instanceName);

    var scalar = await count.ExecuteScalarAsync();
    return Convert.ToInt32(scalar);
}
|
||||
|
||||
/// <summary>
/// Looks up a single message by its id, in any status.
/// </summary>
/// <param name="messageId">The id of the message to retrieve.</param>
/// <returns>The first matching message, or the default value when no row has this id.</returns>
public async Task<StoreAndForwardMessage?> GetMessageByIdAsync(string messageId)
{
    await using var db = new SqliteConnection(_connectionString);
    await db.OpenAsync();

    await using var select = db.CreateCommand();
    // Column order must match the ordinals consumed by ReadMessagesAsync.
    select.CommandText = @"
        SELECT id, category, target, payload_json, retry_count, max_retries,
        retry_interval_ms, created_at, last_attempt_at, status, last_error, origin_instance,
        execution_id, source_script, parent_execution_id
        FROM sf_messages
        WHERE id = @id";
    select.Parameters.AddWithValue("@id", messageId);

    return (await ReadMessagesAsync(select)).FirstOrDefault();
}
|
||||
|
||||
/// <summary>
/// Counts the messages currently in the Parked status (for health reporting).
/// </summary>
/// <returns>The number of rows in <c>sf_messages</c> whose status is Parked.</returns>
public async Task<int> GetParkedMessageCountAsync()
{
    await using var db = new SqliteConnection(_connectionString);
    await db.OpenAsync();

    await using var count = db.CreateCommand();
    count.CommandText = "SELECT COUNT(*) FROM sf_messages WHERE status = @parked";
    count.Parameters.AddWithValue("@parked", (int)StoreAndForwardMessageStatus.Parked);

    return Convert.ToInt32(await count.ExecuteScalarAsync());
}
|
||||
|
||||
/// <summary>
/// Counts all messages that are in the given status.
/// </summary>
/// <param name="status">The status to filter by.</param>
/// <returns>The number of rows in <c>sf_messages</c> with that status.</returns>
public async Task<int> GetMessageCountByStatusAsync(StoreAndForwardMessageStatus status)
{
    await using var db = new SqliteConnection(_connectionString);
    await db.OpenAsync();

    await using var count = db.CreateCommand();
    count.CommandText = "SELECT COUNT(*) FROM sf_messages WHERE status = @status";
    count.Parameters.AddWithValue("@status", (int)status);

    var scalar = await count.ExecuteScalarAsync();
    return Convert.ToInt32(scalar);
}
|
||||
|
||||
/// <summary>
/// Executes <paramref name="cmd"/> and materialises every returned row as a
/// <see cref="StoreAndForwardMessage"/>. Columns are read by ordinal, so the
/// command's SELECT list must be: id, category, target, payload_json,
/// retry_count, max_retries, retry_interval_ms, created_at, last_attempt_at,
/// status, last_error, origin_instance, execution_id, source_script,
/// parent_execution_id.
/// </summary>
/// <param name="cmd">A fully prepared command (text and parameters already set).</param>
/// <returns>The mapped messages in the order the reader yields them.</returns>
private static async Task<List<StoreAndForwardMessage>> ReadMessagesAsync(SqliteCommand cmd)
{
    var messages = new List<StoreAndForwardMessage>();

    await using var row = await cmd.ExecuteReaderAsync();
    while (await row.ReadAsync())
    {
        var message = new StoreAndForwardMessage
        {
            Id = row.GetString(0),
            Category = (StoreAndForwardCategory)row.GetInt32(1),
            Target = row.GetString(2),
            PayloadJson = row.GetString(3),
            RetryCount = row.GetInt32(4),
            MaxRetries = row.GetInt32(5),
            RetryIntervalMs = row.GetInt64(6),
            // NOTE(review): Parse uses the current culture; confirm the writer
            // persists timestamps in a culture-invariant format.
            CreatedAt = DateTimeOffset.Parse(row.GetString(7)),
            LastAttemptAt = OptionalTimestamp(row, 8),
            Status = (StoreAndForwardMessageStatus)row.GetInt32(9),
            LastError = OptionalString(row, 10),
            OriginInstanceName = OptionalString(row, 11),
            // Audit Log #23 (ExecutionId Task 4): rows written before the
            // additive migration carry no execution_id / source_script, so
            // NULL reads back as null (back-compat). ParseGuidColumn uses
            // Guid.TryParse rather than Parse: a corrupt non-null
            // execution_id counts as "no execution id" instead of raising
            // FormatException and aborting the whole retry sweep.
            ExecutionId = ParseGuidColumn(row, 12),
            SourceScript = OptionalString(row, 13),
            // Audit Log #23 (ParentExecutionId Task 6): same treatment for
            // parent_execution_id — NULL on pre-migration rows, and a
            // malformed value degrades to null rather than throwing.
            ParentExecutionId = ParseGuidColumn(row, 14)
        };

        messages.Add(message);
    }

    return messages;

    // Reads a nullable TEXT column, mapping SQL NULL to null.
    static string? OptionalString(System.Data.Common.DbDataReader r, int ordinal) =>
        r.IsDBNull(ordinal) ? null : r.GetString(ordinal);

    // Reads a nullable timestamp column stored as text, mapping SQL NULL to null.
    static DateTimeOffset? OptionalTimestamp(System.Data.Common.DbDataReader r, int ordinal) =>
        r.IsDBNull(ordinal) ? null : DateTimeOffset.Parse(r.GetString(ordinal));
}
|
||||
|
||||
/// <summary>
/// Audit Log #23 (ExecutionId Task 4 / ParentExecutionId Task 6):
/// tolerant reader for a nullable GUID column (<c>execution_id</c> or
/// <c>parent_execution_id</c>). Both a SQL NULL (legacy pre-migration rows)
/// and a non-null value that is not a valid GUID produce <c>null</c> — a
/// corrupt id must never throw, because that would abort the retry sweep,
/// which reads many rows.
/// </summary>
/// <param name="reader">The reader positioned on the row to inspect.</param>
/// <param name="ordinal">Zero-based ordinal of the GUID column.</param>
/// <returns>The parsed GUID, or <c>null</c> when the column is NULL or malformed.</returns>
private static Guid? ParseGuidColumn(System.Data.Common.DbDataReader reader, int ordinal)
{
    if (!reader.IsDBNull(ordinal) && Guid.TryParse(reader.GetString(ordinal), out var parsed))
    {
        return parsed;
    }

    return null;
}
|
||||
}
|
||||
+26
@@ -0,0 +1,26 @@
|
||||
<Project Sdk="Microsoft.NET.Sdk">
|
||||
|
||||
<!-- Compiler settings: targets net10.0 with implicit usings and nullable
     reference types enabled; every warning is promoted to a build error. -->
<PropertyGroup>
|
||||
<TargetFramework>net10.0</TargetFramework>
|
||||
<ImplicitUsings>enable</ImplicitUsings>
|
||||
<Nullable>enable</Nullable>
|
||||
<TreatWarningsAsErrors>true</TreatWarningsAsErrors>
|
||||
</PropertyGroup>
|
||||
|
||||
<!-- NuGet dependencies. No Version attributes are given here; presumably
     versions are pinned centrally (e.g. Directory.Packages.props) — TODO confirm. -->
<ItemGroup>
|
||||
<PackageReference Include="Akka" />
|
||||
<PackageReference Include="Microsoft.Data.Sqlite" />
|
||||
<PackageReference Include="Microsoft.Extensions.DependencyInjection.Abstractions" />
|
||||
<PackageReference Include="Microsoft.Extensions.Logging.Abstractions" />
|
||||
<PackageReference Include="Microsoft.Extensions.Options" />
|
||||
</ItemGroup>
|
||||
|
||||
<!-- The only project-to-project dependency is the shared Commons project. -->
<ItemGroup>
|
||||
<ProjectReference Include="../ZB.MOM.WW.ScadaBridge.Commons/ZB.MOM.WW.ScadaBridge.Commons.csproj" />
|
||||
</ItemGroup>
|
||||
|
||||
<!-- Exposes internal members to the StoreAndForward test assembly. -->
<ItemGroup>
|
||||
<InternalsVisibleTo Include="ZB.MOM.WW.ScadaBridge.StoreAndForward.Tests" />
|
||||
</ItemGroup>
|
||||
|
||||
</Project>
|
||||
Reference in New Issue
Block a user