refactor: rename ScadaLink → ZB.MOM.WW.ScadaBridge (code + projects + namespaces)

Solution + 23 src projects + 26 test projects renamed; folders, csproj files,
namespaces, and the ScadaLinkDbContext → ScadaBridgeDbContext class renamed to match.
ActorSystem "scadalink" → "scadabridge", Akka seed-node URLs migrated.
SQL roles/logins, LDAP domains, CLI command name, and CLI config dir
(~/.scadalink → ~/.scadabridge) also renamed.

Build green; 5 Host.Tests fail awaiting SQL login rename in next commit.
Pre-existing StaleTagMonitor timing flakes unchanged.

Rename script committed at tools/rename-to-scadabridge.sh.
This commit is contained in:
Joseph Doherty
2026-05-28 09:37:45 -04:00
parent 6d87ee3c3b
commit 7b0b9c7365
1531 changed files with 11180 additions and 11054 deletions
@@ -0,0 +1,27 @@
namespace ZB.MOM.WW.ScadaBridge.StoreAndForward;
/// <summary>
/// Optional ambient site context the Store-and-Forward service consults at
/// construction time. Carries the site identifier the S&amp;F retry loop
/// stamps onto cached-call audit telemetry (Audit Log #23 / M3 Bundle F).
/// </summary>
/// <remarks>
/// <para>
/// Defined here (not in <c>HealthMonitoring</c> alongside the existing
/// <c>ISiteIdentityProvider</c>) so the dependency arrow does not flip:
/// <c>HealthMonitoring</c> already references <c>StoreAndForward</c>, and
/// having S&amp;F take a dependency on <c>HealthMonitoring</c> would create a
/// project-reference cycle.
/// </para>
/// <para>
/// The Host registers a trivial adapter that forwards to the same
/// <c>NodeOptions.SiteId</c> the existing <c>ISiteIdentityProvider</c> reads.
/// Resolution is optional: when no binding is registered,
/// <c>AddStoreAndForward</c> passes an empty site id to
/// <c>StoreAndForwardService</c>, whose constructor is documented
/// (StoreAndForward-023) to normalise a null/empty/whitespace value to
/// <c>UnknownSiteSentinel</c> rather than stamping an empty string.
/// </para>
/// </remarks>
public interface IStoreAndForwardSiteContext
{
/// <summary>The site id stamped onto cached-call audit telemetry.</summary>
string SiteId { get; }
}
@@ -0,0 +1,203 @@
using System.Text.Json;
using Akka.Actor;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Logging.Abstractions;
using ZB.MOM.WW.ScadaBridge.Commons.Messages.Notification;
namespace ZB.MOM.WW.ScadaBridge.StoreAndForward;
/// <summary>
/// Notification Outbox: the site Store-and-Forward delivery handler for the
/// <see cref="ZB.MOM.WW.ScadaBridge.Commons.Types.Enums.StoreAndForwardCategory.Notification"/>
/// category.
///
/// In the outbox design the site no longer sends notification email itself.
/// "Delivering" a buffered notification means forwarding it to the central cluster
/// and treating central's <see cref="NotificationSubmitAck"/> as the outcome:
/// <list type="bullet">
///   <item><description>ack <c>Accepted</c> → <see cref="DeliverAsync"/> returns
///   <c>true</c>; the S&amp;F engine removes the message from the buffer.</description></item>
///   <item><description>ack not <c>Accepted</c>, or the Ask times out / fails →
///   <see cref="DeliverAsync"/> throws; the S&amp;F engine treats any thrown
///   exception as transient and retries the forward at the fixed interval.</description></item>
///   <item><description>buffered payload missing or not deserialisable →
///   <see cref="DeliverAsync"/> logs a Warning and returns <c>true</c> so the
///   unrecoverable row is discarded instead of retried or parked.</description></item>
/// </list>
///
/// The forward travels over the ClusterClient command/control transport: the handler
/// <see cref="ActorRefImplicitSenderExtensions.Ask{T}(ICanTell, object, TimeSpan?)">Asks</see>
/// the site communication actor, which wraps the message in a
/// <c>ClusterClient.Send("/user/central-communication", …)</c> and routes central's
/// reply straight back to this Ask.
/// </summary>
public sealed class NotificationForwarder
{
    /// <summary>Maximum number of payload characters echoed into the discard Warning.</summary>
    private const int CorruptPayloadPreviewMaxLength = 200;

    private readonly IActorRef _siteCommunicationActor;
    private readonly string _sourceSiteId;
    private readonly TimeSpan _forwardTimeout;
    private readonly ILogger<NotificationForwarder> _logger;

    /// <summary>Creates a forwarder bound to one site communication actor and site id.</summary>
    /// <param name="siteCommunicationActor">
    /// The site communication actor. It forwards a <see cref="NotificationSubmit"/> to
    /// central via the registered ClusterClient and replies with the
    /// <see cref="NotificationSubmitAck"/>.
    /// </param>
    /// <param name="sourceSiteId">This site's identifier, stamped on every submit.</param>
    /// <param name="forwardTimeout">
    /// How long to wait for central's ack before treating the forward as a transient
    /// failure. Sourced from host configuration.
    /// </param>
    /// <param name="logger">
    /// Optional logger. StoreAndForward-018: a corrupt buffered payload is logged at
    /// Warning before being discarded so an operator has a forensic trail of the row
    /// that vanished from the buffer. Falls back to a no-op logger when omitted.
    /// </param>
    public NotificationForwarder(
        IActorRef siteCommunicationActor,
        string sourceSiteId,
        TimeSpan forwardTimeout,
        ILogger<NotificationForwarder>? logger = null)
    {
        _siteCommunicationActor = siteCommunicationActor;
        _sourceSiteId = sourceSiteId;
        _forwardTimeout = forwardTimeout;
        _logger = logger ?? NullLogger<NotificationForwarder>.Instance;
    }

    /// <summary>
    /// Store-and-Forward delivery handler entry point — matches the
    /// <c>Func&lt;StoreAndForwardMessage, Task&lt;bool&gt;&gt;</c> handler contract.
    /// </summary>
    /// <param name="message">The buffered store-and-forward message to deliver to central.</param>
    /// <returns>
    /// <c>true</c> when central accepts the notification, or when the buffered payload
    /// is missing/corrupt and the row is deliberately discarded.
    /// </returns>
    /// <exception cref="NotificationForwardException">Central replied with a non-accepted ack.</exception>
    public async Task<bool> DeliverAsync(StoreAndForwardMessage message)
    {
        // StoreAndForward-018: an unreadable payload cannot be fixed by retrying.
        // The design doc explicitly forbids parking notifications ("notifications do
        // not park — they are retried at the fixed forward interval until central
        // acks"; Component-StoreAndForward.md). The earlier behaviour returned false
        // here, which the S&F engine interprets as a permanent failure and parks
        // the row — contradicting the invariant and surfacing the row in the
        // central UI's parked-message list. The correct outcome for a corrupt-payload
        // notification is to DISCARD: log a Warning with the buffered row id +
        // payload preview for forensics, then return true so the engine clears the
        // buffer via its standard success-path cleanup.
        if (!TryBuildSubmit(message, out var submit))
        {
            _logger.LogWarning(
                "Discarding corrupt buffered notification {NotificationId} (payload is not deserialisable as NotificationSubmit). " +
                "Payload preview: {PayloadPreview}",
                message.Id,
                PreviewPayload(message.PayloadJson));
            return true;
        }

        // The reply may legitimately be a non-accepted ack, so it is not requested as
        // a status-failing Ask: ask for the bare NotificationSubmitAck and classify it
        // here. An Ask timeout surfaces as an exception (AskTimeoutException in current
        // Akka.NET — NOTE(review): confirm for the pinned version), which — like any
        // other thrown exception — the S&F engine treats as transient.
        var ack = await _siteCommunicationActor
            .Ask<NotificationSubmitAck>(submit, _forwardTimeout)
            .ConfigureAwait(false);

        if (ack.Accepted)
        {
            return true;
        }

        // A non-accepted ack is a transient failure: central could not persist the
        // notification right now. Throw so the engine keeps buffering and retries.
        throw new NotificationForwardException(
            $"Central rejected notification {submit.NotificationId}: {ack.Error ?? "no detail"}");
    }

    /// <summary>
    /// Maps a buffered S&amp;F notification message onto the <see cref="NotificationSubmit"/>
    /// forwarded to central, returning <c>false</c> if the payload is missing, blank,
    /// or otherwise unreadable.
    ///
    /// The buffered payload IS a serialized <see cref="NotificationSubmit"/> written by
    /// the site <c>Notify.Send</c> enqueue path (Task 19). Its
    /// <see cref="NotificationSubmit.NotificationId"/> is the central idempotency key —
    /// it was generated by the script, equals the buffered row's
    /// <see cref="StoreAndForwardMessage.Id"/>, and is stable across every retry. The
    /// forwarder forwards the payload as-is except that it re-stamps the fields it
    /// authoritatively owns: <see cref="NotificationSubmit.SourceSiteId"/> (this site's
    /// id) and <see cref="NotificationSubmit.SourceInstanceId"/> (the buffered row's
    /// origin instance), and it falls the list name back to the S&amp;F
    /// <see cref="StoreAndForwardMessage.Target"/> when the payload list name is blank.
    /// </summary>
    /// <param name="message">The buffered row whose payload is deserialised.</param>
    /// <param name="submit">The submit to forward; only meaningful when the method returns <c>true</c>.</param>
    private bool TryBuildSubmit(StoreAndForwardMessage message, out NotificationSubmit submit)
    {
        submit = null!;

        // A null payload would make JsonSerializer.Deserialize throw
        // ArgumentNullException, which is NOT a JsonException: it would escape
        // DeliverAsync, be classified as transient by the engine, and the row would be
        // retried forever. Treat a missing/blank payload as corrupt up front.
        if (string.IsNullOrWhiteSpace(message.PayloadJson))
        {
            return false;
        }

        NotificationSubmit? payload;
        try
        {
            payload = JsonSerializer.Deserialize<NotificationSubmit>(message.PayloadJson);
        }
        catch (JsonException)
        {
            return false;
        }

        // The JSON literal "null" deserialises successfully to a null reference.
        if (payload is null)
        {
            return false;
        }

        submit = payload with
        {
            // The NotificationId is the script-generated idempotency key carried in the
            // payload. Defend against a payload missing it by falling back to the
            // buffered row id, which the enqueue path pins to the same value.
            NotificationId = string.IsNullOrEmpty(payload.NotificationId)
                ? message.Id
                : payload.NotificationId,
            // A null, empty OR whitespace-only ListName falls back to the S&F Target —
            // so a blank list name is never forwarded to central.
            ListName = string.IsNullOrWhiteSpace(payload.ListName) ? message.Target : payload.ListName,
            // SourceSiteId/SourceInstanceId are authoritatively owned by the site: the
            // forwarder knows the real site id, and the buffered row records the origin
            // instance even after the instance is deleted.
            SourceSiteId = _sourceSiteId,
            SourceInstanceId = message.OriginInstanceName,
        };
        return true;
    }

    /// <summary>
    /// Returns a length-capped preview of a corrupt buffered payload for the Warning
    /// log line emitted on discard. The full payload may be megabytes and is not
    /// suitable for the structured log; the preview retains the leading characters,
    /// which is what an operator typically uses to identify the producing script.
    /// </summary>
    /// <param name="payloadJson">The raw buffered payload; may be null or empty.</param>
    /// <returns>
    /// <c>&lt;empty&gt;</c> for a null/empty payload, the payload itself when it fits
    /// the cap, otherwise its leading characters followed by an ellipsis.
    /// </returns>
    private static string PreviewPayload(string? payloadJson)
    {
        if (string.IsNullOrEmpty(payloadJson))
        {
            return "<empty>";
        }

        if (payloadJson.Length <= CorruptPayloadPreviewMaxLength)
        {
            return payloadJson;
        }

        // Do not cut between the two halves of a surrogate pair: a lone high
        // surrogate is invalid UTF-16 and renders as garbage in log sinks.
        var previewLength = CorruptPayloadPreviewMaxLength;
        if (char.IsHighSurrogate(payloadJson[previewLength - 1]))
        {
            previewLength--;
        }

        return payloadJson.Substring(0, previewLength) + "…";
    }
}
/// <summary>
/// Raised by <see cref="NotificationForwarder"/> on a transient forward failure —
/// a non-accepted central ack. The Store-and-Forward engine treats any thrown
/// exception as transient and retries the forward at the fixed interval.
/// </summary>
public sealed class NotificationForwardException : Exception
{
    /// <summary>
    /// Initializes a new exception with the default message.
    /// </summary>
    public NotificationForwardException()
    {
    }

    /// <summary>
    /// Initializes a new exception with the specified message.
    /// </summary>
    /// <param name="message">Message describing the forward failure.</param>
    public NotificationForwardException(string message) : base(message)
    {
    }

    /// <summary>
    /// Initializes a new exception with the specified message and underlying cause,
    /// so a wrapped failure keeps its original stack trace via
    /// <see cref="Exception.InnerException"/>.
    /// </summary>
    /// <param name="message">Message describing the forward failure.</param>
    /// <param name="innerException">The exception that caused the forward failure.</param>
    public NotificationForwardException(string message, Exception innerException)
        : base(message, innerException)
    {
    }
}
@@ -0,0 +1,165 @@
using System.Text.Json;
using Akka.Actor;
using Akka.Event;
using ZB.MOM.WW.ScadaBridge.Commons.Messages.RemoteQuery;
namespace ZB.MOM.WW.ScadaBridge.StoreAndForward;
/// <summary>
/// Akka actor bridge for <see cref="StoreAndForwardService"/> parked-message operations.
/// Receives Query/Retry/Discard requests from the SiteCommunicationActor and replies
/// with the matching response records. Every handler captures <c>Sender</c> before
/// starting the asynchronous service call and pipes the projected result back to it.
/// </summary>
public class ParkedMessageHandlerActor : ReceiveActor
{
    private readonly ILoggingAdapter _log = Context.GetLogger();
    private readonly StoreAndForwardService _service;
    private readonly string _siteId;

    /// <summary>
    /// Initializes the actor and registers message handlers for query, retry, and discard operations.
    /// </summary>
    /// <param name="service">The store-and-forward service used to execute parked-message operations.</param>
    /// <param name="siteId">The site identifier this actor manages parked messages for.</param>
    public ParkedMessageHandlerActor(StoreAndForwardService service, string siteId)
    {
        _service = service;
        _siteId = siteId;

        Receive<ParkedMessageQueryRequest>(HandleQuery);
        Receive<ParkedMessageRetryRequest>(HandleRetry);
        Receive<ParkedMessageDiscardRequest>(HandleDiscard);

        // Task 5 (#22): central→site Retry/Discard relay for parked cached
        // operations. The cached call's S&F buffer message id is the
        // TrackedOperationId, so these reuse the same parked-message primitive
        // as HandleRetry/HandleDiscard, keyed off the tracked id.
        Receive<RetryParkedOperation>(HandleRetryParkedOperation);
        Receive<DiscardParkedOperation>(HandleDiscardParkedOperation);
    }

    /// <summary>
    /// Replies with one page of parked messages (all categories), or with a failed
    /// response carrying the base exception message when the query throws.
    /// </summary>
    private void HandleQuery(ParkedMessageQueryRequest msg)
    {
        var sender = Sender;
        var siteId = _siteId;

        // StoreAndForward-007: idiomatic PipeTo with explicit success/failure
        // projections instead of ContinueWith. Both projections touch only locals
        // (captured before the await), so they are safe to run off the actor thread.
        _service.GetParkedMessagesAsync(category: null, msg.PageNumber, msg.PageSize)
            .PipeTo(
                sender,
                success: result =>
                {
                    var entries = result.Messages
                        .Select(m => new ParkedMessageEntry(
                            MessageId: m.Id,
                            TargetSystem: m.Target,
                            MethodName: ExtractMethodName(m.PayloadJson, m.Category),
                            ErrorMessage: m.LastError ?? string.Empty,
                            AttemptCount: m.RetryCount,
                            OriginalTimestamp: m.CreatedAt,
                            // A never-attempted row reports its creation time instead.
                            LastAttemptTimestamp: m.LastAttemptAt ?? m.CreatedAt,
                            MaxAttempts: m.MaxRetries,
                            Category: m.Category,
                            OriginInstance: m.OriginInstanceName))
                        .ToList();
                    return new ParkedMessageQueryResponse(
                        msg.CorrelationId, siteId, entries, result.TotalCount,
                        msg.PageNumber, msg.PageSize, true, null, DateTimeOffset.UtcNow);
                },
                failure: ex => new ParkedMessageQueryResponse(
                    msg.CorrelationId, siteId, [], 0, msg.PageNumber, msg.PageSize,
                    false, ex.GetBaseException().Message, DateTimeOffset.UtcNow));
    }

    /// <summary>
    /// Retries one parked message by id and replies with whether it was retried;
    /// a <c>false</c> result carries a "not found or no longer parked" explanation.
    /// </summary>
    private void HandleRetry(ParkedMessageRetryRequest msg)
    {
        var sender = Sender;
        _service.RetryParkedMessageAsync(msg.MessageId)
            .PipeTo(
                sender,
                success: retried => new ParkedMessageRetryResponse(
                    msg.CorrelationId, retried,
                    retried ? null : "Message not found or no longer parked."),
                failure: ex => new ParkedMessageRetryResponse(
                    msg.CorrelationId, false, ex.GetBaseException().Message));
    }

    /// <summary>
    /// Discards one parked message by id and replies with whether it was discarded;
    /// a <c>false</c> result carries a "not found or no longer parked" explanation.
    /// </summary>
    private void HandleDiscard(ParkedMessageDiscardRequest msg)
    {
        var sender = Sender;
        _service.DiscardParkedMessageAsync(msg.MessageId)
            .PipeTo(
                sender,
                success: discarded => new ParkedMessageDiscardResponse(
                    msg.CorrelationId, discarded,
                    discarded ? null : "Message not found or no longer parked."),
                failure: ex => new ParkedMessageDiscardResponse(
                    msg.CorrelationId, false, ex.GetBaseException().Message));
    }

    /// <summary>
    /// Task 5 (#22): executes a central-relayed Retry of a parked cached call.
    /// The tracked id is the S&amp;F buffer message id, so this reuses
    /// <see cref="StoreAndForwardService.RetryParkedMessageAsync"/> — which only
    /// touches rows that are actually <c>Parked</c> (a non-parked or unknown
    /// operation yields <c>false</c>, a safe no-op). Central never mutates the
    /// central <c>SiteCalls</c> mirror; the reset row's corrected state flows
    /// back via the normal cached-call telemetry path.
    /// </summary>
    private void HandleRetryParkedOperation(RetryParkedOperation msg)
    {
        var sender = Sender;
        _service.RetryParkedMessageAsync(msg.TrackedOperationId.ToString())
            .PipeTo(
                sender,
                success: applied => new ParkedOperationActionAck(
                    msg.CorrelationId, applied, ErrorMessage: null),
                failure: ex => new ParkedOperationActionAck(
                    msg.CorrelationId, Applied: false, ex.GetBaseException().Message));
    }

    /// <summary>
    /// Task 5 (#22): executes a central-relayed Discard of a parked cached call.
    /// Mirrors <see cref="HandleRetryParkedOperation"/>; Discard removes the
    /// parked S&amp;F buffer row (only when it is actually <c>Parked</c>).
    /// </summary>
    private void HandleDiscardParkedOperation(DiscardParkedOperation msg)
    {
        var sender = Sender;
        _service.DiscardParkedMessageAsync(msg.TrackedOperationId.ToString())
            .PipeTo(
                sender,
                success: applied => new ParkedOperationActionAck(
                    msg.CorrelationId, applied, ErrorMessage: null),
                failure: ex => new ParkedOperationActionAck(
                    msg.CorrelationId, Applied: false, ex.GetBaseException().Message));
    }

    /// <summary>
    /// Derives a display name for a parked message from its JSON payload: the string
    /// <c>MethodName</c> property if present, else the string <c>Subject</c> property,
    /// else the category name. Never throws for malformed or unexpected payloads.
    /// </summary>
    /// <param name="payloadJson">The buffered payload; may be null or empty.</param>
    /// <param name="category">The message category, used as the fallback label.</param>
    private static string ExtractMethodName(string payloadJson, Commons.Types.Enums.StoreAndForwardCategory category)
    {
        if (string.IsNullOrEmpty(payloadJson))
        {
            return category.ToString();
        }

        try
        {
            using var doc = JsonDocument.Parse(payloadJson);
            var root = doc.RootElement;

            // JsonElement.TryGetProperty throws InvalidOperationException when the
            // element is not a JSON object (e.g. a payload of "[]" or "42"). That
            // exception is not a JsonException, so without this guard one odd row
            // would blow up the whole parked-message query projection.
            if (root.ValueKind != JsonValueKind.Object)
            {
                return category.ToString();
            }

            if (root.TryGetProperty("MethodName", out var method) && method.ValueKind == JsonValueKind.String)
            {
                return method.GetString() ?? category.ToString();
            }

            if (root.TryGetProperty("Subject", out var subject) && subject.ValueKind == JsonValueKind.String)
            {
                return subject.GetString() ?? category.ToString();
            }
        }
        catch (JsonException)
        {
            // Deliberate best-effort: an unparseable payload simply falls back to
            // the category name below.
        }

        return category.ToString();
    }
}
@@ -0,0 +1,176 @@
using Microsoft.Extensions.Logging;
using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums;
namespace ZB.MOM.WW.ScadaBridge.StoreAndForward;
/// <summary>
/// WP-11: Async replication of buffer operations to standby node.
///
/// - Forwards add/remove/park/requeue operations to standby via a replication handler.
/// - No ack wait (fire-and-forget per design).
/// - Standby applies operations to its own SQLite.
/// - On failover, standby resumes delivery from its replicated state.
///
/// NOTE(review): each operation is dispatched on its own thread-pool task, so nothing
/// in this class orders back-to-back operations for the same message — confirm the
/// standby tolerates reordering.
/// </summary>
public class ReplicationService
{
    private readonly StoreAndForwardOptions _options;
    private readonly ILogger<ReplicationService> _logger;
    private Func<ReplicationOperation, Task>? _replicationHandler;

    /// <summary>Initializes a new instance of <see cref="ReplicationService"/>.</summary>
    /// <param name="options">Store-and-forward configuration options.</param>
    /// <param name="logger">Logger instance.</param>
    public ReplicationService(
        StoreAndForwardOptions options,
        ILogger<ReplicationService> logger)
    {
        _options = options;
        _logger = logger;
    }

    /// <summary>
    /// Sets the handler for forwarding replication operations to the standby node.
    /// Typically wraps Akka Tell to the standby's replication actor. Until a handler
    /// is set, every <c>Replicate*</c> call is a no-op.
    /// </summary>
    /// <param name="handler">The async delegate that forwards each replication operation to the standby.</param>
    public void SetReplicationHandler(Func<ReplicationOperation, Task> handler)
    {
        _replicationHandler = handler;
    }

    /// <summary>
    /// WP-11: Replicates an enqueue operation to standby (fire-and-forget).
    /// </summary>
    /// <param name="message">The message that was enqueued on the active node.</param>
    public void ReplicateEnqueue(StoreAndForwardMessage message)
        => ReplicateMessage(ReplicationOperationType.Add, message);

    /// <summary>
    /// WP-11: Replicates a remove operation to standby (fire-and-forget).
    /// </summary>
    /// <param name="messageId">The identifier of the message to remove from the standby buffer.</param>
    public void ReplicateRemove(string messageId)
    {
        if (GetActiveHandler() is { } handler)
        {
            FireAndForget(
                handler,
                new ReplicationOperation(ReplicationOperationType.Remove, messageId, null));
        }
    }

    /// <summary>
    /// WP-11: Replicates a park operation to standby (fire-and-forget).
    /// </summary>
    /// <param name="message">The message that was parked on the active node.</param>
    public void ReplicatePark(StoreAndForwardMessage message)
        => ReplicateMessage(ReplicationOperationType.Park, message);

    /// <summary>
    /// WP-11 / StoreAndForward-016: Replicates an operator-initiated requeue (a parked
    /// message moved back to the pending queue) to standby (fire-and-forget). The
    /// carried message reflects the active node's post-requeue state (Pending,
    /// retry_count = 0) so the standby's copy can be brought into sync.
    /// </summary>
    /// <param name="message">The message in its post-requeue (Pending, retry_count=0) state.</param>
    public void ReplicateRequeue(StoreAndForwardMessage message)
        => ReplicateMessage(ReplicationOperationType.Requeue, message);

    /// <summary>
    /// WP-11: Applies a replicated operation received from the active node.
    /// Used by the standby node to keep its SQLite in sync. An operation that cannot
    /// be applied (unknown type, or an Add/Park/Requeue without its message) is
    /// logged at Warning and skipped rather than dropped silently.
    /// </summary>
    /// <param name="operation">The replication operation to apply.</param>
    /// <param name="storage">The standby node's store-and-forward storage to update.</param>
    public async Task ApplyReplicatedOperationAsync(
        ReplicationOperation operation,
        StoreAndForwardStorage storage)
    {
        switch (operation.OperationType)
        {
            case ReplicationOperationType.Add when operation.Message != null:
                await storage.EnqueueAsync(operation.Message);
                break;

            case ReplicationOperationType.Remove:
                await storage.RemoveMessageAsync(operation.MessageId);
                break;

            case ReplicationOperationType.Park when operation.Message != null:
                // Force the status regardless of what the carried copy says.
                operation.Message.Status = StoreAndForwardMessageStatus.Parked;
                await storage.UpdateMessageAsync(operation.Message);
                break;

            case ReplicationOperationType.Requeue when operation.Message != null:
                // StoreAndForward-016: an operator retried a parked message on the
                // active node; mirror that on the standby by moving its row back to
                // Pending with retry_count = 0 so a failover preserves the retry.
                operation.Message.Status = StoreAndForwardMessageStatus.Pending;
                operation.Message.RetryCount = 0;
                await storage.UpdateMessageAsync(operation.Message);
                break;

            default:
                // Reached for an unrecognised operation type, or for Add/Park/Requeue
                // whose message payload is missing. Skipping keeps replication
                // best-effort, but leave a trace so standby drift is diagnosable.
                _logger.LogWarning(
                    "Ignoring replicated {OpType} for message {MessageId}: unknown operation type or missing message payload",
                    operation.OperationType, operation.MessageId);
                break;
        }
    }

    /// <summary>
    /// Returns the handler to dispatch to, or <c>null</c> when replication is
    /// disabled in options or no handler has been set yet.
    /// </summary>
    private Func<ReplicationOperation, Task>? GetActiveHandler()
        => _options.ReplicationEnabled ? _replicationHandler : null;

    /// <summary>
    /// Dispatches a message-carrying operation (Add/Park/Requeue) when replication
    /// is active. The message is only dereferenced after the active check.
    /// </summary>
    private void ReplicateMessage(ReplicationOperationType operationType, StoreAndForwardMessage message)
    {
        if (GetActiveHandler() is { } handler)
        {
            FireAndForget(handler, new ReplicationOperation(operationType, message.Id, message));
        }
    }

    /// <summary>
    /// Invokes <paramref name="handler"/> on the thread pool without awaiting it.
    /// The handler is passed in (captured once by the caller) so the background
    /// task never re-reads the mutable handler field.
    /// </summary>
    private void FireAndForget(Func<ReplicationOperation, Task> handler, ReplicationOperation operation)
    {
        // Explicit discard: the task is intentionally unobserved; all failures are
        // handled inside the delegate.
        _ = Task.Run(async () =>
        {
            try
            {
                await handler(operation);
            }
            catch (Exception ex)
            {
                // WP-11: No ack wait — log and move on
                _logger.LogDebug(ex,
                    "Replication of {OpType} for message {MessageId} failed (best-effort)",
                    operation.OperationType, operation.MessageId);
            }
        });
    }
}
/// <summary>
/// WP-11: Represents a buffer operation to be replicated to standby.
/// </summary>
/// <param name="OperationType">Which buffer operation the standby should mirror.</param>
/// <param name="MessageId">Identifier of the buffered message the operation targets.</param>
/// <param name="Message">
/// The message state carried with the operation. <see cref="ReplicationService"/>
/// sends <c>null</c> for <see cref="ReplicationOperationType.Remove"/> and the
/// affected message for Add, Park and Requeue.
/// </param>
public record ReplicationOperation(
ReplicationOperationType OperationType,
string MessageId,
StoreAndForwardMessage? Message);
/// <summary>
/// WP-11: Types of buffer operations that are replicated.
/// </summary>
public enum ReplicationOperationType
{
/// <summary>A message was enqueued; the standby enqueues the carried message.</summary>
Add,
/// <summary>A message was removed; the standby removes the row with the same id.</summary>
Remove,
/// <summary>A message was parked; the standby marks the carried message Parked and updates its row.</summary>
Park,
/// <summary>
/// StoreAndForward-016: an operator moved a parked message back to the pending
/// queue. The standby resets its matching row to Pending with retry_count = 0.
/// </summary>
Requeue
}
@@ -0,0 +1,79 @@
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Services;
namespace ZB.MOM.WW.ScadaBridge.StoreAndForward;
/// <summary>
/// Dependency-injection wiring for the Store-and-Forward component.
/// </summary>
public static class ServiceCollectionExtensions
{
    /// <summary>
    /// Adds the Store-and-Forward singletons — SQLite-backed storage, the delivery
    /// service, and the standby replication service — to the container.
    /// </summary>
    /// <param name="services">The service collection to register into.</param>
    /// <returns>The same <paramref name="services"/> instance, for chaining.</returns>
    public static IServiceCollection AddStoreAndForward(this IServiceCollection services)
    {
        services.AddSingleton<StoreAndForwardStorage>(BuildStorage);
        services.AddSingleton<StoreAndForwardService>(BuildService);
        services.AddSingleton<ReplicationService>(BuildReplication);
        return services;
    }

    /// <summary>
    /// Placeholder for Store-and-Forward Akka actor bindings; registers nothing
    /// because the Host creates the actors during actor system startup.
    /// </summary>
    /// <param name="services">The service collection to register into.</param>
    /// <returns>The same <paramref name="services"/> instance, for chaining.</returns>
    public static IServiceCollection AddStoreAndForwardActors(this IServiceCollection services)
    {
        // Akka actor registration handled by Host component during actor system startup
        return services;
    }

    /// <summary>Creates the storage over the SQLite file named in the bound options.</summary>
    private static StoreAndForwardStorage BuildStorage(IServiceProvider provider)
    {
        var settings = provider.GetRequiredService<IOptions<StoreAndForwardOptions>>().Value;
        var storageLogger = provider.GetRequiredService<ILogger<StoreAndForwardStorage>>();
        var connectionString = $"Data Source={settings.SqliteDbPath}";
        return new StoreAndForwardStorage(connectionString, storageLogger);
    }

    /// <summary>Creates the delivery service with its required and optional collaborators.</summary>
    private static StoreAndForwardService BuildService(IServiceProvider provider)
    {
        var store = provider.GetRequiredService<StoreAndForwardStorage>();
        var settings = provider.GetRequiredService<IOptions<StoreAndForwardOptions>>().Value;
        var serviceLogger = provider.GetRequiredService<ILogger<StoreAndForwardService>>();
        var replicator = provider.GetRequiredService<ReplicationService>();

        // Audit Log #23 (M3 Bundle F): the cached-call lifecycle observer and the
        // site identity are pulled from DI so the S&F retry loop can emit
        // per-attempt + terminal telemetry under the same TrackedOperationId the
        // script-thread CachedSubmit row used. Both are optional bindings; when
        // absent the legacy pre-M3 retry behaviour is kept (tests, central nodes
        // without sites, hosts that haven't called AddAuditLog).
        //
        // The site id comes through IStoreAndForwardSiteContext (registered by the
        // Host) rather than HealthMonitoring's ISiteIdentityProvider, because
        // HealthMonitoring already references S&F and the reverse reference would
        // form a project cycle.
        var lifecycleObserver = provider.GetService<ICachedCallLifecycleObserver>();
        var ambientSite = provider.GetService<IStoreAndForwardSiteContext>();

        // StoreAndForward-023: a missing or null site id is handed over as an empty
        // string, untouched — the service constructor turns it into
        // UnknownSiteSentinel so an unconfigured host is visible in the central
        // audit log instead of yielding a silent empty-string SourceSite.
        string resolvedSiteId = string.Empty;
        if (ambientSite?.SiteId is { } configuredSiteId)
        {
            resolvedSiteId = configuredSiteId;
        }

        return new StoreAndForwardService(
            store,
            settings,
            serviceLogger,
            replicator,
            lifecycleObserver,
            resolvedSiteId);
    }

    /// <summary>Creates the replication service from the bound options.</summary>
    private static ReplicationService BuildReplication(IServiceProvider provider)
    {
        var settings = provider.GetRequiredService<IOptions<StoreAndForwardOptions>>().Value;
        var replicationLogger = provider.GetRequiredService<ILogger<ReplicationService>>();
        return new ReplicationService(settings, replicationLogger);
    }
}
@@ -0,0 +1,94 @@
using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums;
namespace ZB.MOM.WW.ScadaBridge.StoreAndForward;
/// <summary>
/// WP-9: Represents a single store-and-forward message as stored in SQLite.
/// Maps to the sf_messages table.
/// </summary>
/// <remarks>
/// A plain mutable data holder: every property has a public setter, and
/// <c>ReplicationService.ApplyReplicatedOperationAsync</c> overwrites
/// <see cref="Status"/> / <see cref="RetryCount"/> on replicated copies before
/// persisting them.
/// </remarks>
public class StoreAndForwardMessage
{
/// <summary>
/// Unique message ID (GUID). For notifications the enqueue path is documented
/// (see <c>NotificationForwarder</c>) to pin this to the payload's
/// <c>NotificationId</c>; for cached calls <c>ParkedMessageHandlerActor</c>
/// looks rows up by the <c>TrackedOperationId</c> string.
/// </summary>
public string Id { get; set; } = string.Empty;
/// <summary>WP-9: Category: ExternalSystem, Notification, or CachedDbWrite.</summary>
public StoreAndForwardCategory Category { get; set; }
/// <summary>Target system name (external system, notification list, or DB connection).</summary>
public string Target { get; set; } = string.Empty;
/// <summary>
/// JSON-serialized payload containing the call details. For the Notification
/// category <c>NotificationForwarder</c> deserialises it as a
/// <c>NotificationSubmit</c>.
/// </summary>
public string PayloadJson { get; set; } = string.Empty;
/// <summary>
/// Number of retry-sweep attempts performed so far. The initial (immediate or
/// caller-made) delivery attempt is attempt 0 and is not counted here; this
/// field counts only background retry attempts (StoreAndForward-003).
/// </summary>
public int RetryCount { get; set; }
/// <summary>
/// Maximum retry-sweep attempts before the message is parked.
/// <c>0</c> = no limit — the message is retried on every sweep until delivered
/// and is never parked for exhausting retries. This is <b>not</b> a "never retry"
/// value; a positive value is required to bound delivery attempts.
/// </summary>
public int MaxRetries { get; set; }
/// <summary>Retry interval in milliseconds.</summary>
public long RetryIntervalMs { get; set; }
/// <summary>When this message was first enqueued.</summary>
public DateTimeOffset CreatedAt { get; set; }
/// <summary>When delivery was last attempted (null if never attempted).</summary>
public DateTimeOffset? LastAttemptAt { get; set; }
/// <summary>Current status of the message.</summary>
public StoreAndForwardMessageStatus Status { get; set; }
/// <summary>Last error message from a failed delivery attempt (null when none is recorded).</summary>
public string? LastError { get; set; }
/// <summary>
/// Instance that originated this message (for S&amp;F-survives-delete behavior).
/// WP-13: Messages are NOT cleared when instance is deleted.
/// </summary>
public string? OriginInstanceName { get; set; }
/// <summary>
/// Audit Log #23 (ExecutionId Task 4): the originating script execution's
/// per-run correlation id, threaded from <c>ScriptRuntimeContext</c> through
/// the cached-call enqueue path. Carried so the store-and-forward retry loop
/// can stamp it onto the per-attempt / terminal cached-call audit rows
/// (<c>ApiCallCached</c>/<c>DbWriteCached</c> Attempted, <c>CachedResolve</c>).
/// <c>null</c> for non-cached-call categories (notifications) and for rows
/// buffered before this field existed — back-compat with old persisted rows
/// (the column is added by an additive migration and read as null when absent).
/// </summary>
public Guid? ExecutionId { get; set; }
/// <summary>
/// Audit Log #23 (ExecutionId Task 4): the originating script identifier,
/// threaded alongside <see cref="ExecutionId"/> from the cached-call enqueue
/// path so the retry-loop audit rows carry the same <c>SourceScript</c>
/// provenance the script-side cached rows already carry. <c>null</c> when not
/// known (non-cached categories, pre-migration rows).
/// </summary>
public string? SourceScript { get; set; }
/// <summary>
/// Audit Log #23 (ParentExecutionId Task 6): the <c>ExecutionId</c> of the
/// inbound-API request that spawned the originating script execution,
/// threaded alongside <see cref="ExecutionId"/> from the cached-call enqueue
/// path. Carried so the store-and-forward retry loop can stamp it onto the
/// per-attempt / terminal cached-call audit rows
/// (<c>ApiCallCached</c>/<c>DbWriteCached</c> Attempted, <c>CachedResolve</c>),
/// keeping them correlated with the cross-execution chain. <c>null</c> for a
/// non-routed run, for non-cached-call categories (notifications), and for
/// rows buffered before this field existed — back-compat with old persisted
/// rows (the column is added by an additive migration and read as null when
/// absent).
/// </summary>
public Guid? ParentExecutionId { get; set; }
}
@@ -0,0 +1,39 @@
namespace ZB.MOM.WW.ScadaBridge.StoreAndForward;
/// <summary>
/// WP-9/10: Configuration options for the Store-and-Forward Engine.
/// </summary>
public class StoreAndForwardOptions
{
/// <summary>
/// Path to the SQLite database for S&amp;F message persistence.
/// <c>AddStoreAndForward</c> embeds this value verbatim in a
/// <c>Data Source=…</c> connection string.
/// </summary>
public string SqliteDbPath { get; set; } = "./data/store-and-forward.db";
/// <summary>
/// WP-11: Whether to replicate buffer operations to standby node. When
/// <c>false</c>, the <c>ReplicationService.Replicate*</c> methods return
/// without dispatching anything. Defaults to <c>true</c>.
/// </summary>
public bool ReplicationEnabled { get; set; } = true;
/// <summary>WP-10: Default retry interval for messages without per-source settings (default 30 seconds).</summary>
public TimeSpan DefaultRetryInterval { get; set; } = TimeSpan.FromSeconds(30);
/// <summary>
/// WP-10: Default maximum retry count before parking (default 50). Applied when an
/// <c>EnqueueAsync</c> caller does not pass an explicit <c>maxRetries</c>.
/// <para>
/// <b>StoreAndForward-019:</b> this default is enforced uniformly across
/// every category, including <see cref="Commons.Types.Enums.StoreAndForwardCategory.Notification"/>:
/// once the buffered message's retry count reaches this cap the engine
/// parks the row. The Component-StoreAndForward.md "notifications do not
/// park" wording reflects the operational <i>intent</i> when central is
/// reachable on the normal cadence; under a sustained central outage that
/// exceeds <c>DefaultMaxRetries × forward-interval</c> a buffered
/// notification <i>will</i> park and surface in the parked-message UI,
/// matching the rest of the system's bounded-retry-then-park behaviour.
/// Callers that genuinely require unbounded retry must pass
/// <c>maxRetries: 0</c> on <c>EnqueueAsync</c> (the documented "no limit"
/// escape hatch — see <c>StoreAndForwardService.EnqueueAsync</c>).
/// </para>
/// </summary>
public int DefaultMaxRetries { get; set; } = 50;
/// <summary>WP-10: Interval for the background retry timer sweep (default 10 seconds).</summary>
public TimeSpan RetryTimerInterval { get; set; } = TimeSpan.FromSeconds(10);
}
@@ -0,0 +1,855 @@
using Microsoft.Extensions.Logging;
using ZB.MOM.WW.ScadaBridge.Commons.Interfaces.Services;
using ZB.MOM.WW.ScadaBridge.Commons.Types;
using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums;
namespace ZB.MOM.WW.ScadaBridge.StoreAndForward;
/// <summary>
/// WP-9/10: Core store-and-forward service.
///
/// Lifecycle:
/// 1. Caller attempts immediate delivery via IDeliveryHandler
/// 2. On transient failure → buffer in SQLite → retry loop
/// 3. On success → remove from buffer
/// 4. On reaching MaxRetries → park (a MaxRetries of 0 means "no limit" — the
/// message is retried until delivered and is never parked for retry exhaustion)
/// 5. Permanent failures are returned to caller immediately (never buffered)
///
/// WP-10: Fixed retry interval (not exponential). Per-source-entity retry settings.
/// Background timer-based retry sweep.
///
/// WP-12: Parked messages queryable, retryable, and discardable.
///
/// WP-14: Buffer depth reported as health metric. Activity logged to site event log.
///
/// WP-15: CachedCall idempotency is the caller's responsibility.
/// This service does not deduplicate — if the same message is enqueued twice,
/// it will be delivered twice. Callers using ExternalSystem.CachedCall() must
/// design their payloads to be idempotent (e.g., include unique request IDs
/// and handle duplicate detection on the remote end).
/// </summary>
public class StoreAndForwardService
{
/// <summary>SQLite-backed persistence for buffered messages.</summary>
private readonly StoreAndForwardStorage _storage;
/// <summary>Service configuration: default retry settings and the sweep timer interval.</summary>
private readonly StoreAndForwardOptions _options;
/// <summary>
/// Optional replication service. When null, buffer changes are not replicated
/// to a standby node (every call site uses the null-conditional operator).
/// </summary>
private readonly ReplicationService? _replication;
/// <summary>Logger for service diagnostics.</summary>
private readonly ILogger<StoreAndForwardService> _logger;
/// <summary>
/// Audit Log #23 (M3 Bundle E — Task E4): site-side observer notified
/// after every cached-call delivery attempt. Optional — when null no
/// telemetry is emitted; the legacy pre-M3 retry loop behaviour is
/// preserved exactly.
/// </summary>
private readonly ICachedCallLifecycleObserver? _cachedCallObserver;
/// <summary>
/// Audit Log #23 (M3 Bundle E — Task E4): site id stamped onto the
/// cached-call attempt context so the audit bridge can build the
/// <see cref="SiteCallOperational"/> half of the telemetry packet.
/// <para>
/// <b>StoreAndForward-023:</b> an empty-string site id must never reach
/// downstream consumers — the central audit pipeline keys
/// <c>(SourceSite, TrackedOperationId)</c> off this value, so an empty
/// string degrades correlation to a per-id-only index and breaks the
/// per-site routing of <c>RetryParkedOperation</c>/<c>DiscardParkedOperation</c>
/// commands. The constructor normalises a null/empty/whitespace
/// <c>siteId</c> argument to <see cref="UnknownSiteSentinel"/>
/// so a misconfigured host (no <c>IStoreAndForwardSiteContext</c>
/// registered) produces a distinctive marker in the central audit log
/// rather than silently merging multiple sites into the empty bucket.
/// </para>
/// </summary>
private readonly string _siteId;
/// <summary>
/// StoreAndForward-023: distinctive marker stamped onto cached-call audit
/// telemetry when the host has not registered an
/// <see cref="IStoreAndForwardSiteContext"/>. Chosen with a leading <c>$</c>
/// so it cannot collide with a real site id (which is a configuration
/// identifier and never starts with <c>$</c>). Surfacing this in the
/// central audit log makes a missing site-context binding immediately
/// recognisable instead of an unattributable empty string.
/// </summary>
public const string UnknownSiteSentinel = "$unknown-site";
/// <summary>
/// Periodic timer that drives the background retry sweep. Created in
/// <see cref="StartAsync"/> and disposed (then nulled) in <see cref="StopAsync"/>.
/// </summary>
private Timer? _retryTimer;
/// <summary>
/// 0/1 flag toggled through <see cref="Interlocked"/> so that only one retry
/// sweep runs at a time: 1 while a sweep is in progress, 0 otherwise.
/// </summary>
private int _retryInProgress;
/// <summary>
/// StoreAndForward-024: the in-flight retry sweep <see cref="Task"/>, or
/// <c>null</c> when no sweep has been recorded yet. Captured when the timer
/// callback starts a sweep so <see cref="StopAsync"/> can wait for it to
/// finish before the host disposes downstream dependencies
/// (<see cref="_storage"/>, <see cref="_replication"/>) that the sweep is
/// still touching. Written from the timer callback and read from
/// <see cref="StopAsync"/>, so both accesses go through the
/// <see cref="Volatile"/> APIs.
/// </summary>
private Task? _sweepTask;
/// <summary>
/// StoreAndForward-024: how long <see cref="StopAsync"/> waits for an
/// in-flight retry sweep to finish before returning. The default — 10 s —
/// is generous enough to let a typical sweep over the buffered queue drain,
/// but bounded so a hung downstream call (a stuck SQLite write, a
/// long-running delivery handler) cannot block host shutdown indefinitely.
/// On timeout the wait is abandoned and the timer is still disposed; the
/// sweep keeps running but will throw on the next call into a disposed
/// dependency — preferred to blocking shutdown forever.
/// </summary>
private static readonly TimeSpan SweepShutdownWaitTimeout = TimeSpan.FromSeconds(10);
/// <summary>
/// WP-10: Delivery handlers keyed by message category. A handler's return
/// value / exception is interpreted
/// the same way on both the immediate-delivery path (<see cref="EnqueueAsync"/>)
/// and the background retry path (<c>RetryMessageAsync</c>):
/// <list type="bullet">
/// <item><description><c>true</c> — delivered successfully. The message is
/// removed from the buffer (or, on the immediate path, never buffered).</description></item>
/// <item><description><c>false</c> — permanent failure. On the immediate path
/// the message is NOT buffered; on a retry the message is already buffered and
/// is parked immediately (no further retries).</description></item>
/// <item><description>throws — transient failure. On the immediate path the
/// message is buffered for retry; on a retry the retry count is incremented and
/// the message is parked once <see cref="StoreAndForwardMessage.MaxRetries"/> is
/// reached.</description></item>
/// </list>
/// </summary>
private readonly Dictionary<StoreAndForwardCategory, Func<StoreAndForwardMessage, Task<bool>>> _deliveryHandlers = new();
/// <summary>
/// WP-14: Event callback for logging S&amp;F activity to site event log.
/// Arguments are, in order: the action name (e.g. "Delivered", "Queued",
/// "Parked"), the message category, and a human-readable detail string.
/// </summary>
public event Action<string, StoreAndForwardCategory, string>? OnActivity;
/// <summary>
/// Creates the store-and-forward service and captures its collaborators.
/// </summary>
/// <param name="storage">Persistence backend that holds the buffered messages.</param>
/// <param name="options">Service configuration (default retry settings, sweep timer interval).</param>
/// <param name="logger">Logger used for service diagnostics.</param>
/// <param name="replication">
/// Optional replication service used to mirror buffer changes to the standby;
/// <c>null</c> disables every replication call.
/// </param>
/// <param name="cachedCallObserver">
/// Optional observer notified after cached-call delivery attempts; <c>null</c>
/// means no cached-call telemetry is emitted.
/// </param>
/// <param name="siteId">
/// Site identifier stamped onto cached-call telemetry. A null, empty, or
/// whitespace value is replaced by <see cref="UnknownSiteSentinel"/>.
/// </param>
public StoreAndForwardService(
    StoreAndForwardStorage storage,
    StoreAndForwardOptions options,
    ILogger<StoreAndForwardService> logger,
    ReplicationService? replication = null,
    ICachedCallLifecycleObserver? cachedCallObserver = null,
    string siteId = "")
{
    (_storage, _options, _logger) = (storage, options, logger);
    (_replication, _cachedCallObserver) = (replication, cachedCallObserver);

    // StoreAndForward-023: downstream consumers (the central audit pipeline
    // keys off SourceSite) must never observe a blank site id. Substitute the
    // sentinel so a host without a site-context binding stands out in the
    // central log instead of collapsing into an empty bucket.
    if (string.IsNullOrWhiteSpace(siteId))
    {
        siteId = UnknownSiteSentinel;
    }

    _siteId = siteId;
}
/// <summary>
/// Associates <paramref name="handler"/> with <paramref name="category"/>,
/// replacing any handler previously registered for that category. The
/// true / false / throws contract described on the <c>_deliveryHandlers</c>
/// field applies to the handler on both the immediate and the retry path.
/// </summary>
/// <param name="category">Message category the handler is responsible for.</param>
/// <param name="handler">Delegate invoked to deliver a message of that category.</param>
public void RegisterDeliveryHandler(
    StoreAndForwardCategory category,
    Func<StoreAndForwardMessage, Task<bool>> handler)
    => _deliveryHandlers[category] = handler;
/// <summary>
/// Initializes storage and starts the background retry timer.
/// </summary>
/// <remarks>
/// StoreAndForward-024: every tick that actually starts a sweep records the
/// sweep's <see cref="Task"/> in <c>_sweepTask</c> so <see cref="StopAsync"/>
/// can await it before the host disposes <c>_storage</c> / <c>_replication</c>.
/// <para>
/// A tick that fires while an earlier sweep is still running gets an
/// <i>already-completed</i> task back from <c>RetryPendingMessagesAsync</c>
/// (its <c>_retryInProgress</c> guard returns before the first await). That
/// task must NOT be stored: overwriting the field with it would discard the
/// reference to the sweep that is still running, and <see cref="StopAsync"/>
/// would then see a completed task and return without waiting. Only tasks
/// that are still running are therefore recorded.
/// </para>
/// </remarks>
public async Task StartAsync()
{
    await _storage.InitializeAsync();

    _retryTimer = new Timer(
        _ =>
        {
            var sweep = RetryPendingMessagesAsync();

            // Completed here means either the overlap guard short-circuited
            // or the sweep finished synchronously. In both cases there is
            // nothing for StopAsync to wait on, and the previously recorded
            // (possibly still running) sweep must stay visible.
            if (!sweep.IsCompleted)
            {
                Volatile.Write(ref _sweepTask, sweep);
            }
        },
        null,
        _options.RetryTimerInterval,
        _options.RetryTimerInterval);

    // NOTE(review): this logs DefaultRetryInterval (the per-message default),
    // not the RetryTimerInterval the timer above is scheduled with — confirm
    // which of the two the message is meant to report.
    _logger.LogInformation(
        "Store-and-forward service started. Retry interval: {Interval}s",
        _options.DefaultRetryInterval.TotalSeconds);
}
/// <summary>
/// Shuts the background retry machinery down: disposes the sweep timer and
/// then waits, for at most <see cref="SweepShutdownWaitTimeout"/>, for a sweep
/// that is still running.
///
/// StoreAndForward-024: disposing the timer alone is not enough. A sweep that
/// is already inside <see cref="RetryPendingMessagesAsync"/> keeps using
/// <see cref="_storage"/> and <see cref="_replication"/> after the timer is
/// gone and could fail on a disposed dependency once the DI container shuts
/// down. The recorded sweep task is therefore awaited here, with a bounded
/// timeout so a hung dependency cannot stall host shutdown forever.
/// </summary>
public async Task StopAsync()
{
    // The timer goes first: once it is disposed no further sweep can start
    // while the in-flight one is being drained.
    var timer = _retryTimer;
    if (timer is not null)
    {
        await timer.DisposeAsync();
        _retryTimer = null;
    }

    var pendingSweep = Volatile.Read(ref _sweepTask);
    if (pendingSweep is null or { IsCompleted: true })
    {
        return;
    }

    try
    {
        // Bounded wait: if a delivery handler or storage call hangs, the
        // sweep is left running and shutdown carries on regardless.
        await pendingSweep.WaitAsync(SweepShutdownWaitTimeout).ConfigureAwait(false);
    }
    catch (TimeoutException)
    {
        _logger.LogWarning(
            "Store-and-forward retry sweep did not finish within {Timeout}; " +
            "shutdown is proceeding while the sweep is still in-flight",
            SweepShutdownWaitTimeout);
    }
    catch (Exception sweepFault)
    {
        // RetryPendingMessagesAsync logs its own failures at Error level;
        // this entry only makes an unexpected fault during the shutdown
        // wait visible. It is swallowed so the host shutdown sequence is
        // never interrupted by it.
        _logger.LogWarning(sweepFault,
            "Store-and-forward retry sweep faulted during shutdown wait");
    }
}
/// <summary>
/// WP-10: Accepts a message for store-and-forward delivery. Unless the caller
/// opts out, delivery is attempted immediately through the handler registered
/// for <paramref name="category"/>:
/// <list type="bullet">
/// <item><description>handler returns <c>true</c> — delivered, nothing is buffered;</description></item>
/// <item><description>handler returns <c>false</c> — permanent failure, the message is
/// rejected and not buffered;</description></item>
/// <item><description>handler throws — transient failure, the message is buffered for the
/// background retry sweep.</description></item>
/// </list>
/// When no handler is registered, or the caller already attempted delivery
/// itself, the message is buffered directly.
///
/// WP-10 retry-count lifecycle: the immediate (or caller-made) attempt is
/// attempt 0 and is not counted, so a freshly buffered message always starts
/// with <see cref="StoreAndForwardMessage.RetryCount"/> 0
/// (StoreAndForward-003). Only the background sweep increments the count, and
/// it parks the message once the count reaches <paramref name="maxRetries"/> —
/// provided <paramref name="maxRetries"/> is greater than 0. A value of
/// <c>0</c> means <b>no limit</b>, not "do not retry".
///
/// WP-15: no de-duplication happens here. Callers such as
/// ExternalSystem.CachedCall() must make sure the remote system tolerates
/// duplicate deliveries.
/// </summary>
/// <param name="category">Message category; selects the delivery handler.</param>
/// <param name="target">Name of the target (external system / notification list / DB connection).</param>
/// <param name="payloadJson">JSON-serialized payload; stored and passed on without inspection.</param>
/// <param name="originInstanceName">Name of the originating instance (WP-13: survives instance deletion).</param>
/// <param name="maxRetries">
/// Number of background sweep retries after which the message is parked.
/// <c>0</c> = <b>no limit</b> (retried on every sweep until delivered, never
/// parked for retry exhaustion). <c>null</c> falls back to
/// <see cref="StoreAndForwardOptions.DefaultMaxRetries"/>. Mirrors the
/// <see cref="StoreAndForwardMessage.MaxRetries"/> contract.
/// </param>
/// <param name="retryInterval">Fixed interval between retries of this message; <c>null</c> uses the configured default.</param>
/// <param name="attemptImmediateDelivery">
/// Pass <c>false</c> when the caller has already made its own delivery
/// attempt; the handler is then not invoked here and the message goes straight
/// into the buffer.
/// </param>
/// <param name="messageId">
/// Explicit message id supplied by the caller; <c>null</c> lets the service
/// mint a new GUID. The Notification Outbox path passes the script-generated
/// <c>NotificationId</c> so that a single id serves as the buffered row's
/// <see cref="StoreAndForwardMessage.Id"/>, travels in the payload, and is
/// submitted to central.
/// </param>
/// <param name="executionId">
/// Audit Log #23 (ExecutionId Task 4): correlation id of the originating
/// script execution, stored on the buffered row for the retry-loop audit
/// rows. <c>null</c> when the caller does not supply one.
/// </param>
/// <param name="sourceScript">
/// Audit Log #23 (ExecutionId Task 4): identifier of the originating script,
/// stored on the buffered row next to <paramref name="executionId"/>.
/// <c>null</c> when not known.
/// </param>
/// <param name="parentExecutionId">
/// Audit Log #23 (ParentExecutionId Task 6): <c>ExecutionId</c> of the
/// inbound-API request that spawned the originating script execution, stored
/// on the buffered row. <c>null</c> for a non-routed run or when the caller
/// does not supply one.
/// </param>
/// <returns>
/// A <see cref="StoreAndForwardResult"/> carrying the message id, whether the
/// message was accepted, and whether it was buffered rather than delivered.
/// </returns>
public async Task<StoreAndForwardResult> EnqueueAsync(
    StoreAndForwardCategory category,
    string target,
    string payloadJson,
    string? originInstanceName = null,
    int? maxRetries = null,
    TimeSpan? retryInterval = null,
    bool attemptImmediateDelivery = true,
    string? messageId = null,
    Guid? executionId = null,
    string? sourceScript = null,
    Guid? parentExecutionId = null)
{
    var effectiveInterval = retryInterval ?? _options.DefaultRetryInterval;

    var message = new StoreAndForwardMessage
    {
        Id = messageId ?? Guid.NewGuid().ToString("N"),
        Category = category,
        Target = target,
        PayloadJson = payloadJson,
        RetryCount = 0,
        MaxRetries = maxRetries ?? _options.DefaultMaxRetries,
        RetryIntervalMs = (long)effectiveInterval.TotalMilliseconds,
        CreatedAt = DateTimeOffset.UtcNow,
        Status = StoreAndForwardMessageStatus.Pending,
        OriginInstanceName = originInstanceName,
        ExecutionId = executionId,
        SourceScript = sourceScript,
        ParentExecutionId = parentExecutionId
    };

    // Direct-to-buffer path: either the caller already tried delivery itself
    // (invoking the handler again would dispatch the request twice) or no
    // handler is registered yet. Whatever attempt preceded this counts as
    // attempt 0, so RetryCount stays 0 (StoreAndForward-003).
    if (!attemptImmediateDelivery || !_deliveryHandlers.TryGetValue(category, out var handler))
    {
        string queuedDetail;
        if (attemptImmediateDelivery)
        {
            queuedDetail = $"No handler registered, buffered: {target}";
        }
        else
        {
            message.LastAttemptAt = DateTimeOffset.UtcNow;
            queuedDetail = $"Buffered for retry: {target}";
        }

        await BufferAsync(message);
        RaiseActivity("Queued", category, queuedDetail);
        return new StoreAndForwardResult(Accepted: true, MessageId: message.Id, WasBuffered: true);
    }

    try
    {
        if (!await handler(message))
        {
            // Permanent failure: rejected outright, never buffered.
            return new StoreAndForwardResult(Accepted: false, MessageId: message.Id, WasBuffered: false);
        }

        RaiseActivity("Delivered", category, $"Immediate delivery to {target}");
        return new StoreAndForwardResult(Accepted: true, MessageId: message.Id, WasBuffered: false);
    }
    catch (Exception deliveryError)
    {
        // Transient failure: hand the message to the retry sweep. This was
        // attempt 0, so RetryCount is left at 0 (StoreAndForward-003).
        _logger.LogWarning(deliveryError,
            "Immediate delivery to {Target} failed (transient), buffering for retry",
            target);

        message.LastAttemptAt = DateTimeOffset.UtcNow;
        message.LastError = deliveryError.Message;
        await BufferAsync(message);

        RaiseActivity("Queued", category, $"Buffered for retry: {target} ({deliveryError.Message})");
        return new StoreAndForwardResult(Accepted: true, MessageId: message.Id, WasBuffered: true);
    }
}
/// <summary>
/// Writes <paramref name="message"/> to the local SQLite buffer and then
/// (WP-11) forwards the add to the standby node, when replication is
/// configured, so that a failover does not lose the buffered message.
/// </summary>
/// <param name="message">The message to buffer.</param>
private async Task BufferAsync(StoreAndForwardMessage message)
{
    await _storage.EnqueueAsync(message);

    if (_replication is { } standby)
    {
        standby.ReplicateEnqueue(message);
    }
}
/// <summary>
/// WP-10: One pass of the background retry sweep. Loads every message that
/// storage reports as due for retry and retries them one after another.
/// Only one sweep runs at a time; a call made while another sweep is in
/// progress returns immediately. Exceptions raised during the pass are
/// logged and not rethrown.
/// </summary>
internal async Task RetryPendingMessagesAsync()
{
    // Claim the single sweep slot (0 -> 1). Losing the race means another
    // sweep is already running, so this call has nothing to do.
    var alreadyRunning = Interlocked.CompareExchange(ref _retryInProgress, 1, 0) != 0;
    if (alreadyRunning)
    {
        return;
    }

    try
    {
        var dueMessages = await _storage.GetMessagesForRetryAsync();
        if (dueMessages.Count > 0)
        {
            _logger.LogDebug("Retry sweep: {Count} messages due for retry", dueMessages.Count);

            foreach (var dueMessage in dueMessages)
            {
                await RetryMessageAsync(dueMessage);
            }
        }
    }
    catch (Exception sweepError)
    {
        _logger.LogError(sweepError, "Error during retry sweep");
    }
    finally
    {
        // Release the slot whatever happened, so the next tick can sweep.
        Interlocked.Exchange(ref _retryInProgress, 0);
    }
}
/// <summary>
/// Performs one background retry of a buffered message and records the
/// outcome:
/// <list type="bullet">
/// <item><description>handler returns <c>true</c> — the row is removed (and the
/// removal replicated);</description></item>
/// <item><description>handler returns <c>false</c> — permanent failure, the row is
/// parked immediately;</description></item>
/// <item><description>handler throws — transient failure, the retry count is
/// incremented and the row is parked once a positive
/// <see cref="StoreAndForwardMessage.MaxRetries"/> is reached.</description></item>
/// </list>
/// </summary>
/// <remarks>
/// Only the handler invocation is guarded by the transient-failure
/// <c>catch</c>. An exception thrown by the bookkeeping that follows a
/// completed delivery attempt (storage remove/update, replication) is NOT a
/// delivery failure: it must not bump the retry count, emit failure
/// telemetry, or park a message that was in fact delivered. Such exceptions
/// propagate to the caller (the sweep logs them), exactly as bookkeeping
/// failures on the transient path already do.
/// </remarks>
/// <param name="message">The buffered message to retry.</param>
private async Task RetryMessageAsync(StoreAndForwardMessage message)
{
    if (!_deliveryHandlers.TryGetValue(message.Category, out var handler))
    {
        _logger.LogWarning("No delivery handler for category {Category}", message.Category);
        return;
    }

    // Audit Log #23 (M3 Bundle E — Tasks E4/E5): measure per-attempt
    // duration so the audit row carries a meaningful DurationMs. Captured
    // around the handler invocation only — storage / replication overhead
    // is excluded.
    var attemptStartUtc = DateTime.UtcNow;
    var attemptStopwatch = System.Diagnostics.Stopwatch.StartNew();

    bool success;
    try
    {
        success = await handler(message);
    }
    catch (Exception ex)
    {
        attemptStopwatch.Stop();

        // Transient failure — increment retry, check max
        message.RetryCount++;
        message.LastAttemptAt = DateTimeOffset.UtcNow;
        message.LastError = ex.Message;

        if (message.MaxRetries > 0 && message.RetryCount >= message.MaxRetries)
        {
            // StoreAndForward-005: conditional park — only commit if the row
            // is still Pending, so a concurrent operator action that moved
            // it (retry/discard) is not silently overwritten.
            message.Status = StoreAndForwardMessageStatus.Parked;
            var parkedAfterRetries = await _storage.UpdateMessageIfStatusAsync(
                message, StoreAndForwardMessageStatus.Pending);
            if (!parkedAfterRetries)
            {
                _logger.LogDebug(
                    "Message {MessageId} changed status during delivery; sweep park skipped",
                    message.Id);
                return;
            }

            _replication?.ReplicatePark(message);
            RaiseActivity("Parked", message.Category,
                $"Max retries ({message.MaxRetries}) reached for {message.Target}");
            _logger.LogWarning(
                "Message {MessageId} parked after {MaxRetries} retries to {Target}",
                message.Id, message.MaxRetries, message.Target);

            // M3: terminal ParkedMaxRetries observer notification — the
            // audit bridge maps this to Attempted(Failed) + CachedResolve(Parked).
            await NotifyCachedCallObserverAsync(
                message,
                CachedCallAttemptOutcome.ParkedMaxRetries,
                lastError: ex.Message,
                httpStatus: null,
                occurredAtUtc: attemptStartUtc,
                durationMs: (int)attemptStopwatch.ElapsedMilliseconds);
        }
        else
        {
            // StoreAndForward-005: the retry-count increment is also conditional
            // on the row still being Pending so it cannot clobber an operator
            // action that ran during the failed delivery.
            if (!await _storage.UpdateMessageIfStatusAsync(
                    message, StoreAndForwardMessageStatus.Pending))
            {
                _logger.LogDebug(
                    "Message {MessageId} changed status during delivery; sweep retry-count update skipped",
                    message.Id);
                return;
            }

            RaiseActivity("Retried", message.Category,
                $"Retry {message.RetryCount}/{message.MaxRetries} for {message.Target}: {ex.Message}");

            // M3: per-attempt TransientFailure observer notification —
            // the audit bridge maps this to Attempted(Failed).
            await NotifyCachedCallObserverAsync(
                message,
                CachedCallAttemptOutcome.TransientFailure,
                lastError: ex.Message,
                httpStatus: null,
                occurredAtUtc: attemptStartUtc,
                durationMs: (int)attemptStopwatch.ElapsedMilliseconds);
        }

        return;
    }

    attemptStopwatch.Stop();

    if (success)
    {
        await _storage.RemoveMessageAsync(message.Id);
        _replication?.ReplicateRemove(message.Id);
        RaiseActivity("Delivered", message.Category,
            $"Delivered to {message.Target} after {message.RetryCount} retries");

        // M3: terminal Delivered observer notification — the audit
        // bridge maps this to Attempted + CachedResolve(Delivered).
        await NotifyCachedCallObserverAsync(
            message,
            CachedCallAttemptOutcome.Delivered,
            lastError: null,
            httpStatus: null,
            occurredAtUtc: attemptStartUtc,
            durationMs: (int)attemptStopwatch.ElapsedMilliseconds);
        return;
    }

    // Permanent failure on retry — park immediately.
    // StoreAndForward-005: the sweep observed this row as Pending; only commit
    // the park if it is still Pending so a concurrent operator action that
    // moved it (retry/discard) is not silently overwritten.
    message.Status = StoreAndForwardMessageStatus.Parked;
    message.LastAttemptAt = DateTimeOffset.UtcNow;
    message.LastError = "Permanent failure (handler returned false)";
    var parked = await _storage.UpdateMessageIfStatusAsync(
        message, StoreAndForwardMessageStatus.Pending);
    if (!parked)
    {
        _logger.LogDebug(
            "Message {MessageId} changed status during delivery; sweep park skipped",
            message.Id);
        return;
    }

    _replication?.ReplicatePark(message);
    RaiseActivity("Parked", message.Category,
        $"Permanent failure for {message.Target}: handler returned false");

    // M3: terminal PermanentFailure observer notification — the
    // audit bridge maps this to Attempted(Failed) + CachedResolve(Parked).
    await NotifyCachedCallObserverAsync(
        message,
        CachedCallAttemptOutcome.PermanentFailure,
        lastError: message.LastError,
        httpStatus: null,
        occurredAtUtc: attemptStartUtc,
        durationMs: (int)attemptStopwatch.ElapsedMilliseconds);
}
/// <summary>
/// Audit Log #23 (M3 Bundle E — Tasks E4/E5): reports a just-completed
/// delivery attempt to the registered
/// <see cref="ICachedCallLifecycleObserver"/>, if there is one. The hook fires
/// only for the cached-call categories
/// (<see cref="StoreAndForwardCategory.ExternalSystem"/> and
/// <see cref="StoreAndForwardCategory.CachedDbWrite"/>).
/// <see cref="StoreAndForwardCategory.Notification"/> is audited through its
/// own central-side pipeline (Notification Outbox / #21) and is never
/// reported here.
/// </summary>
/// <remarks>
/// Best-effort by contract (alog.md §7): nothing that goes wrong in here may
/// influence S&amp;F retry bookkeeping, so failures to build the context and
/// exceptions thrown by the observer are logged and swallowed. A message whose
/// id does not parse as a <c>TrackedOperationId</c> is skipped with a Warning
/// (StoreAndForward-022).
/// </remarks>
/// <param name="message">The message whose delivery attempt just completed.</param>
/// <param name="outcome">Outcome of the attempt.</param>
/// <param name="lastError">Error text of the attempt, or <c>null</c> on success.</param>
/// <param name="httpStatus">HTTP status forwarded to the observer; may be <c>null</c>.</param>
/// <param name="occurredAtUtc">Start time of the attempt; stamped as UTC on the context.</param>
/// <param name="durationMs">Duration of the attempt in milliseconds; may be <c>null</c>.</param>
private async Task NotifyCachedCallObserverAsync(
    StoreAndForwardMessage message,
    CachedCallAttemptOutcome outcome,
    string? lastError,
    int? httpStatus,
    DateTime occurredAtUtc,
    int? durationMs)
{
    var observer = _cachedCallObserver;
    if (observer is null)
    {
        return;
    }

    // Map the category to its audit channel. Anything that is not a
    // cached-call category (notifications have their own outbox-side audit
    // pipeline) produces no telemetry on this hook.
    string channel;
    switch (message.Category)
    {
        case StoreAndForwardCategory.ExternalSystem:
            channel = "ApiOutbound";
            break;
        case StoreAndForwardCategory.CachedDbWrite:
            channel = "DbOutbound";
            break;
        default:
            return;
    }

    if (!TrackedOperationId.TryParse(message.Id, out var trackedId))
    {
        // StoreAndForward-022: a non-GUID id means the caller bypassed the
        // audit hot path. The drop stays best-effort (retry bookkeeping must
        // never depend on the audit pipeline) but is surfaced as a Warning so
        // a misconfigured caller can be diagnosed. Engine-minted ids and the
        // current callers (NotificationOutbox with NotificationId, cached-call
        // enqueue with TrackedOperationId.ToString()) all parse, so this only
        // fires for a future caller that supplies a non-GUID id.
        _logger.LogWarning(
            "Cached-call audit observer skipped: message id {MessageId} is not a parseable TrackedOperationId (category {Category}, outcome {Outcome}). " +
            "Audit lifecycle for this operation will have no rows.",
            message.Id, message.Category, outcome);
        return;
    }

    CachedCallAttemptContext attemptContext;
    try
    {
        attemptContext = new CachedCallAttemptContext(
            TrackedOperationId: trackedId,
            Channel: channel,
            Target: message.Target,
            SourceSite: _siteId,
            Outcome: outcome,
            RetryCount: message.RetryCount,
            LastError: lastError,
            HttpStatus: httpStatus,
            CreatedAtUtc: message.CreatedAt.UtcDateTime,
            OccurredAtUtc: DateTime.SpecifyKind(occurredAtUtc, DateTimeKind.Utc),
            DurationMs: durationMs,
            SourceInstanceId: message.OriginInstanceName,
            // Audit Log #23 (ExecutionId Task 4): provenance of the
            // originating script execution, carried on the buffered row.
            // Null on rows buffered before Task 4 (back-compat).
            ExecutionId: message.ExecutionId,
            SourceScript: message.SourceScript,
            // Audit Log #23 (ParentExecutionId Task 6): ExecutionId of the
            // spawning inbound-API request. Null for a non-routed run and on
            // rows buffered before Task 6 (back-compat).
            ParentExecutionId: message.ParentExecutionId);
    }
    catch (Exception contextError)
    {
        // Defensive: building the record is not expected to throw, but the
        // alog.md §7 contract requires this path to be exception-safe anyway.
        _logger.LogWarning(contextError,
            "Failed to build cached-call attempt context for {MessageId}; observer skipped",
            message.Id);
        return;
    }

    try
    {
        await observer.OnAttemptCompletedAsync(attemptContext, CancellationToken.None)
            .ConfigureAwait(false);
    }
    catch (Exception observerError)
    {
        // alog.md §7 best-effort: an outage of the audit observer must never
        // be mistaken for a transient delivery failure or disturb the S&F
        // retry bookkeeping.
        _logger.LogWarning(observerError,
            "ICachedCallLifecycleObserver threw for {MessageId} (Outcome {Outcome}); ignored",
            message.Id, outcome);
    }
}
/// <summary>
/// WP-12: Returns one page of parked messages for the central query
/// (Pattern 8). The call is forwarded unchanged to storage.
/// </summary>
/// <param name="category">Category to filter by, or <c>null</c> for every category.</param>
/// <param name="pageNumber">Page to return (1-based).</param>
/// <param name="pageSize">Number of messages per page.</param>
/// <returns>The parked messages of the requested page together with the total count.</returns>
public async Task<(List<StoreAndForwardMessage> Messages, int TotalCount)> GetParkedMessagesAsync(
    StoreAndForwardCategory? category = null,
    int pageNumber = 1,
    int pageSize = 50)
    => await _storage.GetParkedMessagesAsync(category, pageNumber, pageSize);
/// <summary>
/// WP-12: Moves a parked message back into the pending queue on operator
/// request.
///
/// StoreAndForward-016: the requeue is a buffer state change and is therefore
/// replicated to the standby (as a
/// <see cref="ReplicationOperationType.Requeue"/>) so the operator's intent
/// survives a failover.
/// StoreAndForward-017: the activity entry is labelled with the message's own
/// category, not a hard-coded one.
/// StoreAndForward-020: the parked row is read <i>before</i> the local requeue
/// instead of being re-read afterwards. A concurrent
/// <c>RemoveMessageAsync</c> or <c>DiscardParkedMessageAsync</c> that slips in
/// between the two storage calls can then no longer leave the standby at
/// <c>Parked</c> while the active node has already requeued — the row needed
/// for the <c>Requeue</c> replication is always in hand.
/// </summary>
/// <param name="messageId">Id of the parked message to retry.</param>
/// <returns><c>true</c> when the message was requeued; otherwise <c>false</c>.</returns>
public async Task<bool> RetryParkedMessageAsync(string messageId)
{
    // StoreAndForward-020: snapshot the row first. If it is missing or no
    // longer Parked there is nothing to requeue and nothing to replicate —
    // whichever other path moved the row is responsible for the standby.
    var parkedRow = await _storage.GetMessageByIdAsync(messageId);
    if (parkedRow is not { Status: StoreAndForwardMessageStatus.Parked })
    {
        return false;
    }

    // The storage update is itself conditional on status = Parked, so it can
    // still lose a race that started after the snapshot was taken.
    if (!await _storage.RetryParkedMessageAsync(messageId))
    {
        return false;
    }

    // Storage has reset the row to Pending with retry_count = 0 and cleared
    // last_error / last_attempt_at (see
    // StoreAndForwardStorage.RetryParkedMessageAsync). Apply the same
    // mutations to the snapshot so the standby receives the post-requeue
    // state even if the row has meanwhile been deleted locally.
    parkedRow.LastAttemptAt = null;
    parkedRow.LastError = null;
    parkedRow.RetryCount = 0;
    parkedRow.Status = StoreAndForwardMessageStatus.Pending;

    _replication?.ReplicateRequeue(parkedRow);
    RaiseActivity("Retry", parkedRow.Category,
        $"Parked message {messageId} moved back to queue");
    return true;
}
/// <summary>
/// WP-12: Permanently removes a parked message on operator request.
///
/// StoreAndForward-016: the discard is a buffer removal and is replicated to
/// the standby (as a <see cref="ReplicationOperationType.Remove"/>) so the
/// message cannot resurface after a failover.
/// StoreAndForward-017: the activity entry is labelled with the message's own
/// category, not a hard-coded one.
/// </summary>
/// <param name="messageId">Id of the parked message to discard.</param>
/// <returns><c>true</c> when the message was discarded; otherwise <c>false</c>.</returns>
public async Task<bool> DiscardParkedMessageAsync(string messageId)
{
    // Read the row before it is deleted: its category is needed afterwards
    // to label the activity entry.
    var doomed = await _storage.GetMessageByIdAsync(messageId);

    if (!await _storage.DiscardParkedMessageAsync(messageId))
    {
        return false;
    }

    _replication?.ReplicateRemove(messageId);

    // Falls back to ExternalSystem when the row could not be read beforehand.
    var category = doomed is null ? StoreAndForwardCategory.ExternalSystem : doomed.Category;
    RaiseActivity("Discard", category, $"Parked message {messageId} discarded");
    return true;
}
/// <summary>
/// WP-14: Reports the current buffer depth per category for health
/// reporting, as returned by storage.
/// </summary>
/// <returns>Buffer depth keyed by message category.</returns>
public async Task<Dictionary<StoreAndForwardCategory, int>> GetBufferDepthAsync()
    => await _storage.GetBufferDepthByCategoryAsync();
/// <summary>
/// WP-13: Counts the S&amp;F messages that originated from the given instance
/// (used to verify that buffered messages survive instance deletion).
/// </summary>
/// <param name="instanceName">Name of the originating instance.</param>
/// <returns>Number of messages whose origin is that instance.</returns>
public async Task<int> GetMessageCountForInstanceAsync(string instanceName)
    => await _storage.GetMessageCountByOriginInstanceAsync(instanceName);
/// <summary>
/// Notification Outbox: fetches a buffered message by id. Returns <c>null</c>
/// when the message is not, or is no longer, in the buffer.
/// <c>Notify.Status</c> relies on this to recognise a notification that is
/// still in transit at the site: central reports it as not found while the
/// S&amp;F buffer still holds it, which corresponds to the site-local
/// <c>Forwarding</c> state.
/// </summary>
/// <param name="messageId">Id of the message to look up.</param>
/// <returns>The buffered message, or <c>null</c> when there is none with that id.</returns>
public async Task<StoreAndForwardMessage?> GetMessageByIdAsync(string messageId)
    => await _storage.GetMessageByIdAsync(messageId);
/// <summary>
/// WP-14: Publishes an S&amp;F activity notification to every subscriber of
/// <c>OnActivity</c>. StoreAndForward-009: the delegate is copied to a local
/// first (so a concurrent unsubscribe cannot cause a null dereference) and each
/// subscriber is invoked inside its own try/catch, so a throwing subscriber
/// (e.g. the site event log) is logged and skipped instead of aborting the
/// caller. This matters because an exception escaping from here while inside
/// <see cref="EnqueueAsync"/> or <c>RetryMessageAsync</c> must NOT be mistaken
/// for a transient delivery failure — before the fix it leaked into the delivery
/// try/catch and made a successfully delivered message get buffered (or had its
/// retry count bumped). Activity logging is best-effort.
/// </summary>
private void RaiseActivity(string action, StoreAndForwardCategory category, string detail)
{
    var snapshot = OnActivity;
    if (snapshot is null)
    {
        return;
    }

    // The explicitly typed loop variable casts each invocation-list entry as it
    // is visited, one subscriber at a time.
    foreach (Action<string, StoreAndForwardCategory, string> subscriber in snapshot.GetInvocationList())
    {
        try
        {
            subscriber(action, category, detail);
        }
        catch (Exception ex)
        {
            // Best-effort: record the failure and carry on with the next subscriber.
            _logger.LogWarning(ex,
                "Store-and-forward activity subscriber threw for action {Action}; ignored",
                action);
        }
    }
}
}
/// <summary>
/// Result of an enqueue operation.
/// </summary>
/// <param name="Accepted">True if the message was accepted (either delivered immediately or buffered).</param>
/// <param name="MessageId">Unique message ID for tracking.</param>
/// <param name="WasBuffered">True if the message was buffered (not delivered immediately).</param>
public record StoreAndForwardResult(
bool Accepted,
string MessageId,
bool WasBuffered);
@@ -0,0 +1,563 @@
using Microsoft.Data.Sqlite;
using Microsoft.Extensions.Logging;
using ZB.MOM.WW.ScadaBridge.Commons.Types.Enums;
namespace ZB.MOM.WW.ScadaBridge.StoreAndForward;
/// <summary>
/// WP-9: SQLite persistence layer for store-and-forward messages.
/// Uses direct Microsoft.Data.Sqlite (not EF Core) for lightweight site-side storage.
/// No max buffer size per design decision.
///
/// StoreAndForward-008: every method opens a fresh <see cref="SqliteConnection"/> for
/// the duration of the call rather than holding a long-lived connection. This is a
/// deliberate trade-off, not an oversight: Microsoft.Data.Sqlite maintains an internal
/// connection pool keyed on the connection string, so <c>OpenAsync</c> on a previously
/// used connection string reuses a pooled handle instead of performing a real file
/// open. The retry sweep therefore relies on that pool for acceptable performance —
/// it calls <see cref="RemoveMessageAsync"/> / <see cref="UpdateMessageIfStatusAsync"/>
/// once per due message, and with no max buffer size (by design) the buffer can grow
/// large. The connection-per-call style keeps each method self-contained and
/// transaction-scoped; if profiling ever shows the pooled open to be a bottleneck on
/// the hot retry path, the remedy is a batched sweep API that opens one connection (and
/// one transaction) per sweep.
/// </summary>
public class StoreAndForwardStorage
{
private readonly string _connectionString;
private readonly ILogger<StoreAndForwardStorage> _logger;

/// <summary>
/// Creates a storage instance bound to the supplied SQLite connection string.
/// Nothing is opened here; each method opens its own connection when called.
/// </summary>
/// <param name="connectionString">SQLite connection string for the store-and-forward database.</param>
/// <param name="logger">Logger for diagnostics.</param>
public StoreAndForwardStorage(string connectionString, ILogger<StoreAndForwardStorage> logger)
    => (_connectionString, _logger) = (connectionString, logger);
/// <summary>
/// Prepares the SQLite store: makes sure the database directory exists, creates
/// the <c>sf_messages</c> table and its status/category indexes when absent, and
/// then additively adds the audit columns to tables created by older builds.
/// </summary>
public async Task InitializeAsync()
{
    EnsureDatabaseDirectoryExists();

    await using var db = new SqliteConnection(_connectionString);
    await db.OpenAsync();

    await using var createSchema = db.CreateCommand();
    createSchema.CommandText = @"
CREATE TABLE IF NOT EXISTS sf_messages (
id TEXT PRIMARY KEY,
category INTEGER NOT NULL,
target TEXT NOT NULL,
payload_json TEXT NOT NULL,
retry_count INTEGER NOT NULL DEFAULT 0,
max_retries INTEGER NOT NULL DEFAULT 50,
retry_interval_ms INTEGER NOT NULL DEFAULT 30000,
created_at TEXT NOT NULL,
last_attempt_at TEXT,
status INTEGER NOT NULL DEFAULT 0,
last_error TEXT,
origin_instance TEXT
);
CREATE INDEX IF NOT EXISTS idx_sf_messages_status ON sf_messages(status);
CREATE INDEX IF NOT EXISTS idx_sf_messages_category ON sf_messages(category);
";
    await createSchema.ExecuteNonQueryAsync();

    // Audit Log #23 (ExecutionId Task 4 / ParentExecutionId Task 6): additive
    // column migration. CREATE TABLE IF NOT EXISTS does NOT add columns to a
    // table that already exists, so a database created by an older build needs
    // these columns ALTER-ed in. SQLite has no "ADD COLUMN IF NOT EXISTS", so
    // the helper probes for each column and skips the ALTER when it is present.
    // All three are nullable TEXT with no default, so rows buffered before the
    // migration read back ExecutionId / SourceScript / ParentExecutionId = null
    // (back-compat).
    var auditColumns = new[] { "execution_id", "source_script", "parent_execution_id" };
    foreach (var column in auditColumns)
    {
        await AddColumnIfMissingAsync(db, column, "TEXT");
    }

    _logger.LogInformation("Store-and-forward SQLite storage initialized");
}
/// <summary>
/// Audit Log #23 (ExecutionId Task 4): appends a column to <c>sf_messages</c>
/// unless the table already has it. Because SQLite offers no <c>ADD COLUMN IF
/// NOT EXISTS</c>, the table schema is inspected through
/// <c>pragma_table_info</c> before any ALTER is issued. Idempotent — safe to
/// call on every <see cref="InitializeAsync"/>.
/// </summary>
private static async Task AddColumnIfMissingAsync(
    SqliteConnection connection, string columnName, string columnType)
{
    await using var probe = connection.CreateCommand();
    probe.CommandText = "SELECT COUNT(*) FROM pragma_table_info('sf_messages') WHERE name = @name";
    probe.Parameters.AddWithValue("@name", columnName);

    var matchingColumns = Convert.ToInt32(await probe.ExecuteScalarAsync());
    if (matchingColumns <= 0)
    {
        await using var alter = connection.CreateCommand();
        // The column name and type come from constants chosen by the caller,
        // never from user input, so interpolating them is safe (DDL statements
        // do not accept bound parameters).
        alter.CommandText = $"ALTER TABLE sf_messages ADD COLUMN {columnName} {columnType}";
        await alter.ExecuteNonQueryAsync();
    }
}
/// <summary>
/// Creates the parent directory of a file-backed SQLite database when it is
/// missing. SQLite creates the database file on demand but not the directory
/// that contains it, so a configured path such as "./data/store-and-forward.db"
/// fails to open ("unable to open database file") while "data" does not exist.
/// In-memory databases and connection strings without a data source are skipped,
/// as is any path whose resolved directory already exists.
/// </summary>
private void EnsureDatabaseDirectoryExists()
{
    var settings = new SqliteConnectionStringBuilder(_connectionString);

    var hasNoFileOnDisk =
        settings.Mode == SqliteOpenMode.Memory
        || string.IsNullOrEmpty(settings.DataSource)
        || settings.DataSource == ":memory:";
    if (hasNoFileOnDisk)
    {
        return;
    }

    var absolutePath = System.IO.Path.GetFullPath(settings.DataSource);
    var parent = System.IO.Path.GetDirectoryName(absolutePath);
    if (string.IsNullOrEmpty(parent) || System.IO.Directory.Exists(parent))
    {
        return;
    }

    System.IO.Directory.CreateDirectory(parent);
    _logger.LogInformation("Created store-and-forward database directory: {Directory}", parent);
}
/// <summary>
/// WP-9: Inserts a message row into <c>sf_messages</c>. Every column is taken
/// from the supplied message as-is (including its status and retry counters).
/// </summary>
/// <param name="message">The message to enqueue.</param>
public async Task EnqueueAsync(StoreAndForwardMessage message)
{
    await using var db = new SqliteConnection(_connectionString);
    await db.OpenAsync();

    await using var insert = db.CreateCommand();
    insert.CommandText = @"
INSERT INTO sf_messages (id, category, target, payload_json, retry_count, max_retries,
retry_interval_ms, created_at, last_attempt_at, status, last_error,
origin_instance, execution_id, source_script, parent_execution_id)
VALUES (@id, @category, @target, @payload, @retryCount, @maxRetries,
@retryIntervalMs, @createdAt, @lastAttempt, @status, @lastError,
@origin, @executionId, @sourceScript, @parentExecutionId)";

    var args = insert.Parameters;
    args.AddWithValue("@id", message.Id);
    args.AddWithValue("@category", (int)message.Category);
    args.AddWithValue("@target", message.Target);
    args.AddWithValue("@payload", message.PayloadJson);
    args.AddWithValue("@retryCount", message.RetryCount);
    args.AddWithValue("@maxRetries", message.MaxRetries);
    args.AddWithValue("@retryIntervalMs", message.RetryIntervalMs);

    // Timestamps are persisted in the round-trip ("O") text format; a missing
    // last-attempt time becomes SQL NULL.
    args.AddWithValue("@createdAt", message.CreatedAt.ToString("O"));
    args.AddWithValue("@lastAttempt", (object?)message.LastAttemptAt?.ToString("O") ?? DBNull.Value);

    args.AddWithValue("@status", (int)message.Status);
    args.AddWithValue("@lastError", (object?)message.LastError ?? DBNull.Value);
    args.AddWithValue("@origin", (object?)message.OriginInstanceName ?? DBNull.Value);

    // Audit Log #23 (ExecutionId Task 4 / ParentExecutionId Task 6): both ids
    // are written in their canonical "D" string form so they round-trip
    // cleanly through the TEXT columns; an absent id is stored as NULL.
    args.AddWithValue("@executionId", (object?)message.ExecutionId?.ToString("D") ?? DBNull.Value);
    args.AddWithValue("@sourceScript", (object?)message.SourceScript ?? DBNull.Value);
    args.AddWithValue("@parentExecutionId", (object?)message.ParentExecutionId?.ToString("D") ?? DBNull.Value);

    await insert.ExecuteNonQueryAsync();
}
/// <summary>
/// WP-10: Returns the Pending messages that are due for a delivery attempt,
/// oldest <c>created_at</c> first. A row is due when it has never been
/// attempted, when its retry interval is zero, or when at least
/// <c>retry_interval_ms</c> milliseconds have elapsed since its last attempt.
/// </summary>
public async Task<List<StoreAndForwardMessage>> GetMessagesForRetryAsync()
{
    await using var db = new SqliteConnection(_connectionString);
    await db.OpenAsync();

    await using var query = db.CreateCommand();
    query.Parameters.AddWithValue("@pending", (int)StoreAndForwardMessageStatus.Pending);
    query.CommandText = @"
SELECT id, category, target, payload_json, retry_count, max_retries,
retry_interval_ms, created_at, last_attempt_at, status, last_error, origin_instance,
execution_id, source_script, parent_execution_id
FROM sf_messages
WHERE status = @pending
AND (last_attempt_at IS NULL
OR retry_interval_ms = 0
OR (julianday('now') - julianday(last_attempt_at)) * 86400000 >= retry_interval_ms)
ORDER BY created_at ASC";

    var dueMessages = await ReadMessagesAsync(query);
    return dueMessages;
}
/// <summary>
/// WP-10: Persists the outcome of a delivery attempt by overwriting the row's
/// retry count, last-attempt time, status and last error. The write is keyed on
/// the message id only (no check of the row's current status).
/// </summary>
/// <param name="message">The message with updated retry count, status, and last error.</param>
public async Task UpdateMessageAsync(StoreAndForwardMessage message)
{
    await using var db = new SqliteConnection(_connectionString);
    await db.OpenAsync();

    await using var update = db.CreateCommand();
    update.CommandText = @"
UPDATE sf_messages
SET retry_count = @retryCount,
last_attempt_at = @lastAttempt,
status = @status,
last_error = @lastError
WHERE id = @id";

    var args = update.Parameters;
    args.AddWithValue("@id", message.Id);
    args.AddWithValue("@retryCount", message.RetryCount);
    args.AddWithValue("@lastAttempt", (object?)message.LastAttemptAt?.ToString("O") ?? DBNull.Value);
    args.AddWithValue("@status", (int)message.Status);
    args.AddWithValue("@lastError", (object?)message.LastError ?? DBNull.Value);

    await update.ExecuteNonQueryAsync();
}
/// <summary>
/// WP-10: Persists the outcome of a delivery attempt, but only while the row
/// still carries the expected status. Yields true when a row was written and
/// false when the row had already moved on (for example because an operator
/// retried or discarded the message) and the write was therefore skipped.
///
/// StoreAndForward-005: the retry sweep performs its state-changing writes
/// through this method so that it cannot overwrite a concurrent operator action
/// (RetryParkedMessageAsync / DiscardParkedMessageAsync). Those operator
/// operations are already SQL-conditional on <c>status = Parked</c>; making the
/// sweep's writes conditional on the status it observed closes the
/// sweep-vs-management race instead of depending solely on the in-process
/// overlapping-sweep guard.
/// </summary>
/// <param name="message">The message with the updated values to persist.</param>
/// <param name="expectedStatus">The status the row must currently have for the update to proceed.</param>
public async Task<bool> UpdateMessageIfStatusAsync(
    StoreAndForwardMessage message,
    StoreAndForwardMessageStatus expectedStatus)
{
    await using var db = new SqliteConnection(_connectionString);
    await db.OpenAsync();

    await using var update = db.CreateCommand();
    update.CommandText = @"
UPDATE sf_messages
SET retry_count = @retryCount,
last_attempt_at = @lastAttempt,
status = @status,
last_error = @lastError
WHERE id = @id AND status = @expectedStatus";

    var args = update.Parameters;
    args.AddWithValue("@id", message.Id);
    args.AddWithValue("@retryCount", message.RetryCount);
    args.AddWithValue("@lastAttempt", (object?)message.LastAttemptAt?.ToString("O") ?? DBNull.Value);
    args.AddWithValue("@status", (int)message.Status);
    args.AddWithValue("@lastError", (object?)message.LastError ?? DBNull.Value);
    args.AddWithValue("@expectedStatus", (int)expectedStatus);

    // Zero affected rows means the status guard did not match (or the row is gone).
    return await update.ExecuteNonQueryAsync() > 0;
}
/// <summary>
/// WP-10: Deletes the row of a successfully delivered message. The delete is
/// keyed on the id alone, whatever the row's status.
/// </summary>
/// <param name="messageId">The id of the message to remove.</param>
public async Task RemoveMessageAsync(string messageId)
{
    await using var db = new SqliteConnection(_connectionString);
    await db.OpenAsync();

    await using var delete = db.CreateCommand();
    delete.Parameters.AddWithValue("@id", messageId);
    delete.CommandText = "DELETE FROM sf_messages WHERE id = @id";

    await delete.ExecuteNonQueryAsync();
}
/// <summary>
/// WP-12: Gets all parked messages, optionally filtered by category, with pagination.
///
/// StoreAndForward-006: the COUNT(*) and the paged SELECT run inside a single
/// transaction so they observe one consistent snapshot. Without it, a concurrent
/// enqueue/park/discard arriving between the two statements yields a TotalCount
/// inconsistent with the returned page (flickering totals / off-by-one page math
/// in the paginated UI).
/// </summary>
/// <param name="category">Optional category filter; null returns parked messages from all categories.</param>
/// <param name="pageNumber">1-based page number. Values below 1 are treated as the first page.</param>
/// <param name="pageSize">
/// Maximum number of messages to return per page. The value is handed to SQLite's
/// LIMIT unchanged (SQLite treats a negative LIMIT as "no upper bound").
/// </param>
/// <returns>The requested page of parked messages and the total number of matching parked messages.</returns>
public async Task<(List<StoreAndForwardMessage> Messages, int TotalCount)> GetParkedMessagesAsync(
    StoreAndForwardCategory? category = null,
    int pageNumber = 1,
    int pageSize = 50)
{
    // Compute the row offset in 64-bit arithmetic: (pageNumber - 1) * pageSize in
    // int silently wraps for large page numbers, and a wrapped (negative) OFFSET is
    // treated by SQLite as 0 — i.e. the first page would be returned for a page far
    // past the end. Clamping at 0 keeps the existing "page < 1 means first page"
    // behaviour explicit instead of relying on SQLite's negative-OFFSET rule.
    var offset = Math.Max(0L, ((long)pageNumber - 1) * pageSize);

    // Both statements share the same WHERE clause and the same bound parameters.
    var categoryFilter = category.HasValue ? " AND category = @category" : "";

    void BindFilter(SqliteCommand command)
    {
        command.Parameters.AddWithValue("@parked", (int)StoreAndForwardMessageStatus.Parked);
        if (category.HasValue)
        {
            command.Parameters.AddWithValue("@category", (int)category.Value);
        }
    }

    await using var connection = new SqliteConnection(_connectionString);
    await connection.OpenAsync();
    await using var transaction = (SqliteTransaction)await connection.BeginTransactionAsync();

    // Count
    await using var countCmd = connection.CreateCommand();
    countCmd.Transaction = transaction;
    countCmd.CommandText = $"SELECT COUNT(*) FROM sf_messages WHERE status = @parked{categoryFilter}";
    BindFilter(countCmd);
    var totalCount = Convert.ToInt32(await countCmd.ExecuteScalarAsync());

    // Page
    await using var pageCmd = connection.CreateCommand();
    pageCmd.Transaction = transaction;
    pageCmd.CommandText = $@"
SELECT id, category, target, payload_json, retry_count, max_retries,
retry_interval_ms, created_at, last_attempt_at, status, last_error, origin_instance,
execution_id, source_script, parent_execution_id
FROM sf_messages
WHERE status = @parked{categoryFilter}
ORDER BY created_at ASC
LIMIT @limit OFFSET @offset";
    BindFilter(pageCmd);
    pageCmd.Parameters.AddWithValue("@limit", pageSize);
    pageCmd.Parameters.AddWithValue("@offset", offset);
    var messages = await ReadMessagesAsync(pageCmd);

    await transaction.CommitAsync();
    return (messages, totalCount);
}
/// <summary>
/// WP-12: Re-queues a parked message by flipping it back to Pending, zeroing its
/// retry count and clearing its last error. Only rows currently in the Parked
/// status are affected.
///
/// StoreAndForward-010: <c>last_attempt_at</c> is cleared to NULL as well, which
/// makes the re-queued message unambiguously due on the next retry sweep. An
/// operator-initiated retry means "attempt this again now"; keeping the stale
/// pre-park timestamp would tie the retry timing to the configured retry interval
/// measured from the original attempt — immediate only by accident, and delayed
/// when the interval is long.
/// </summary>
/// <param name="messageId">The id of the parked message to move back to Pending.</param>
/// <returns>True when a parked row was re-queued; false when no parked row matched the id.</returns>
public async Task<bool> RetryParkedMessageAsync(string messageId)
{
    await using var db = new SqliteConnection(_connectionString);
    await db.OpenAsync();

    await using var requeue = db.CreateCommand();
    requeue.CommandText = @"
UPDATE sf_messages
SET status = @pending, retry_count = 0, last_error = NULL, last_attempt_at = NULL
WHERE id = @id AND status = @parked";

    var args = requeue.Parameters;
    args.AddWithValue("@id", messageId);
    args.AddWithValue("@pending", (int)StoreAndForwardMessageStatus.Pending);
    args.AddWithValue("@parked", (int)StoreAndForwardMessageStatus.Parked);

    return await requeue.ExecuteNonQueryAsync() > 0;
}
/// <summary>
/// WP-12: Permanently deletes a parked message. The delete is conditional on the
/// row still being in the Parked status.
/// </summary>
/// <param name="messageId">The id of the parked message to discard.</param>
/// <returns>True when a parked row was deleted; false when no parked row matched the id.</returns>
public async Task<bool> DiscardParkedMessageAsync(string messageId)
{
    await using var db = new SqliteConnection(_connectionString);
    await db.OpenAsync();

    await using var delete = db.CreateCommand();
    delete.CommandText = "DELETE FROM sf_messages WHERE id = @id AND status = @parked";
    delete.Parameters.AddWithValue("@id", messageId);
    delete.Parameters.AddWithValue("@parked", (int)StoreAndForwardMessageStatus.Parked);

    var deletedRows = await delete.ExecuteNonQueryAsync();
    return deletedRows > 0;
}
/// <summary>
/// WP-14: Computes the buffer depth per category, i.e. the number of Pending
/// messages grouped by category. Categories with no pending messages do not
/// appear in the result.
/// </summary>
public async Task<Dictionary<StoreAndForwardCategory, int>> GetBufferDepthByCategoryAsync()
{
    await using var db = new SqliteConnection(_connectionString);
    await db.OpenAsync();

    await using var query = db.CreateCommand();
    query.Parameters.AddWithValue("@pending", (int)StoreAndForwardMessageStatus.Pending);
    query.CommandText = @"
SELECT category, COUNT(*) as cnt
FROM sf_messages
WHERE status = @pending
GROUP BY category";

    var depths = new Dictionary<StoreAndForwardCategory, int>();
    await using var rows = await query.ExecuteReaderAsync();
    while (await rows.ReadAsync())
    {
        // Column 0 is the category, column 1 its pending-row count.
        depths[(StoreAndForwardCategory)rows.GetInt32(0)] = rows.GetInt32(1);
    }

    return depths;
}
/// <summary>
/// WP-13: Counts the messages (in any status) recorded for a given origin
/// instance. Used to verify that messages are NOT deleted when an instance is
/// deleted.
/// </summary>
/// <param name="instanceName">The origin instance name to count messages for.</param>
public async Task<int> GetMessageCountByOriginInstanceAsync(string instanceName)
{
    await using var db = new SqliteConnection(_connectionString);
    await db.OpenAsync();

    await using var query = db.CreateCommand();
    query.Parameters.AddWithValue("@origin", instanceName);
    query.CommandText = @"
SELECT COUNT(*)
FROM sf_messages
WHERE origin_instance = @origin";

    var scalar = await query.ExecuteScalarAsync();
    return Convert.ToInt32(scalar);
}
/// <summary>
/// Looks up a single message by its id, regardless of status.
/// </summary>
/// <param name="messageId">The id of the message to retrieve.</param>
/// <returns>The matching message, or null when no row has that id.</returns>
public async Task<StoreAndForwardMessage?> GetMessageByIdAsync(string messageId)
{
    await using var db = new SqliteConnection(_connectionString);
    await db.OpenAsync();

    await using var query = db.CreateCommand();
    query.Parameters.AddWithValue("@id", messageId);
    query.CommandText = @"
SELECT id, category, target, payload_json, retry_count, max_retries,
retry_interval_ms, created_at, last_attempt_at, status, last_error, origin_instance,
execution_id, source_script, parent_execution_id
FROM sf_messages
WHERE id = @id";

    var matches = await ReadMessagesAsync(query);
    return matches.Count > 0 ? matches[0] : null;
}
/// <summary>
/// Counts the messages currently in the Parked status (for health reporting).
/// </summary>
public async Task<int> GetParkedMessageCountAsync()
{
    await using var db = new SqliteConnection(_connectionString);
    await db.OpenAsync();

    await using var query = db.CreateCommand();
    query.Parameters.AddWithValue("@parked", (int)StoreAndForwardMessageStatus.Parked);
    query.CommandText = "SELECT COUNT(*) FROM sf_messages WHERE status = @parked";

    return Convert.ToInt32(await query.ExecuteScalarAsync());
}
/// <summary>
/// Counts the messages that currently have the given status.
/// </summary>
/// <param name="status">The status to filter by.</param>
public async Task<int> GetMessageCountByStatusAsync(StoreAndForwardMessageStatus status)
{
    await using var db = new SqliteConnection(_connectionString);
    await db.OpenAsync();

    await using var query = db.CreateCommand();
    query.Parameters.AddWithValue("@status", (int)status);
    query.CommandText = "SELECT COUNT(*) FROM sf_messages WHERE status = @status";

    var scalar = await query.ExecuteScalarAsync();
    return Convert.ToInt32(scalar);
}
/// <summary>
/// Executes the given SELECT and materialises every returned row as a
/// <see cref="StoreAndForwardMessage"/>. Columns are read by ordinal, so the
/// command must select them in this exact order: id, category, target,
/// payload_json, retry_count, max_retries, retry_interval_ms, created_at,
/// last_attempt_at, status, last_error, origin_instance, execution_id,
/// source_script, parent_execution_id.
/// </summary>
/// <param name="cmd">A fully prepared command (text and parameters set) on an open connection.</param>
/// <returns>The messages in the order the query returned them; empty when no row matched.</returns>
private static async Task<List<StoreAndForwardMessage>> ReadMessagesAsync(SqliteCommand cmd)
{
    // The timestamp columns are machine-written in the round-trip ("O") format,
    // so they are parsed with the invariant culture rather than whatever culture
    // the current thread happens to run under.
    static DateTimeOffset ParseTimestamp(string text) =>
        DateTimeOffset.Parse(text, System.Globalization.CultureInfo.InvariantCulture);

    var results = new List<StoreAndForwardMessage>();
    await using var reader = await cmd.ExecuteReaderAsync();
    while (await reader.ReadAsync())
    {
        results.Add(new StoreAndForwardMessage
        {
            Id = reader.GetString(0),
            Category = (StoreAndForwardCategory)reader.GetInt32(1),
            Target = reader.GetString(2),
            PayloadJson = reader.GetString(3),
            RetryCount = reader.GetInt32(4),
            MaxRetries = reader.GetInt32(5),
            RetryIntervalMs = reader.GetInt64(6),
            CreatedAt = ParseTimestamp(reader.GetString(7)),
            LastAttemptAt = reader.IsDBNull(8) ? null : ParseTimestamp(reader.GetString(8)),
            Status = (StoreAndForwardMessageStatus)reader.GetInt32(9),
            LastError = reader.IsDBNull(10) ? null : reader.GetString(10),
            OriginInstanceName = reader.IsDBNull(11) ? null : reader.GetString(11),
            // Audit Log #23 (ExecutionId Task 4): rows persisted before the
            // additive migration have no execution_id / source_script value;
            // the null guards keep those reading back as null (back-compat).
            // ParseGuidColumn uses Guid.TryParse (not Parse), so a corrupt
            // non-null execution_id is treated as "no execution id" rather
            // than throwing FormatException and aborting the whole sweep.
            ExecutionId = ParseGuidColumn(reader, 12),
            SourceScript = reader.IsDBNull(13) ? null : reader.GetString(13),
            // Audit Log #23 (ParentExecutionId Task 6): same handling as
            // execution_id — null for pre-migration rows, and null (not an
            // exception) for a corrupt non-null value.
            ParentExecutionId = ParseGuidColumn(reader, 14)
        });
    }
    return results;
}
/// <summary>
/// Audit Log #23 (ExecutionId Task 4 / ParentExecutionId Task 6): tolerant
/// reader for a nullable GUID column (<c>execution_id</c> or
/// <c>parent_execution_id</c>). Both a SQL NULL (legacy pre-migration rows) and
/// a non-null value that is not a valid GUID produce <c>null</c>; a corrupt id
/// must never throw, because that would abort the retry sweep, which reads many
/// rows.
/// </summary>
private static Guid? ParseGuidColumn(System.Data.Common.DbDataReader reader, int ordinal)
{
    if (!reader.IsDBNull(ordinal) && Guid.TryParse(reader.GetString(ordinal), out var parsed))
    {
        return parsed;
    }

    return null;
}
}
@@ -0,0 +1,26 @@
<Project Sdk="Microsoft.NET.Sdk">
<!-- Build settings: implicit usings and nullable reference types are on, and every compiler warning fails the build. -->
<PropertyGroup>
<TargetFramework>net10.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
<TreatWarningsAsErrors>true</TreatWarningsAsErrors>
</PropertyGroup>
<!-- NuGet dependencies. No Version attributes are given here; presumably versions are pinned centrally (e.g. Directory.Packages.props) - TODO confirm. -->
<ItemGroup>
<PackageReference Include="Akka" />
<PackageReference Include="Microsoft.Data.Sqlite" />
<PackageReference Include="Microsoft.Extensions.DependencyInjection.Abstractions" />
<PackageReference Include="Microsoft.Extensions.Logging.Abstractions" />
<PackageReference Include="Microsoft.Extensions.Options" />
</ItemGroup>
<!-- The only project reference is Commons. -->
<ItemGroup>
<ProjectReference Include="../ZB.MOM.WW.ScadaBridge.Commons/ZB.MOM.WW.ScadaBridge.Commons.csproj" />
</ItemGroup>
<!-- Exposes internal members to the StoreAndForward test assembly. -->
<ItemGroup>
<InternalsVisibleTo Include="ZB.MOM.WW.ScadaBridge.StoreAndForward.Tests" />
</ItemGroup>
</Project>