feat(snf): per-attempt and terminal cached-call lifecycle observer (#23 M3)

Hook the store-and-forward retry loop so the audit pipeline can emit
per-attempt + terminal telemetry under the original TrackedOperationId
(Bundle E Tasks E4 + E5).

New seam:

* ICachedCallLifecycleObserver + CachedCallAttemptContext in
  Commons.Interfaces.Services. Outcome enum
  (Delivered / TransientFailure / PermanentFailure / ParkedMaxRetries)
  is S&F-vocabulary; the bridge living in ScadaLink.AuditLog (Bundle F)
  will map it to the AuditKind/AuditStatus pair when building the
  CachedCallTelemetry packet.

* StoreAndForwardService gains an optional cachedCallObserver
  constructor parameter + siteId. RetryMessageAsync fires the observer
  exactly once per attempt with the appropriate outcome:
    - handler returns true               -> Delivered
    - handler returns false              -> PermanentFailure (and parks)
    - handler throws + retries remaining -> TransientFailure
    - handler throws + max retries hit   -> ParkedMaxRetries (and parks)

Hook is best-effort: a thrown observer is logged + swallowed so a
failing audit pipeline can never be misclassified as a transient
delivery failure or corrupt the retry-count bookkeeping (alog.md §7).

Only cached-call categories (ExternalSystem, CachedDbWrite) generate
notifications — Notification category has its own central-side
audit pipeline (Notification Outbox / #21).

Pre-M3 callers that didn't thread a TrackedOperationId into the S&F
message id are silently skipped — the observer requires a parseable id
by contract. New S&F callers stamp the id as messageId (Bundle E3).

Bundle E tasks E4 + E5.
This commit is contained in:
Joseph Doherty
2026-05-20 14:52:34 -04:00
parent 42430dd10a
commit 63eb1f4225
3 changed files with 553 additions and 1 deletions

View File

@@ -1,4 +1,6 @@
using Microsoft.Extensions.Logging;
using ScadaLink.Commons.Interfaces.Services;
using ScadaLink.Commons.Types;
using ScadaLink.Commons.Types.Enums;
namespace ScadaLink.StoreAndForward;
@@ -33,6 +35,19 @@ public class StoreAndForwardService
private readonly StoreAndForwardOptions _options;
private readonly ReplicationService? _replication;
private readonly ILogger<StoreAndForwardService> _logger;
/// <summary>
/// Audit Log #23 (M3 Bundle E — Task E4): site-side observer notified
/// after every cached-call delivery attempt. Optional — when null no
/// telemetry is emitted; the legacy pre-M3 retry loop behaviour is
/// preserved exactly.
/// </summary>
private readonly ICachedCallLifecycleObserver? _cachedCallObserver;
/// <summary>
/// Audit Log #23 (M3 Bundle E — Task E4): site id stamped onto the
/// cached-call attempt context so the audit bridge can build the
/// <see cref="SiteCallOperational"/> half of the telemetry packet.
/// </summary>
private readonly string _siteId;
private Timer? _retryTimer;
private int _retryInProgress;
@@ -63,12 +78,16 @@ public class StoreAndForwardService
StoreAndForwardStorage storage,
StoreAndForwardOptions options,
ILogger<StoreAndForwardService> logger,
ReplicationService? replication = null)
ReplicationService? replication = null,
ICachedCallLifecycleObserver? cachedCallObserver = null,
string siteId = "")
{
_storage = storage;
_options = options;
_logger = logger;
_replication = replication;
_cachedCallObserver = cachedCallObserver;
_siteId = siteId;
}
/// <summary>
@@ -280,15 +299,33 @@ public class StoreAndForwardService
return;
}
// Audit Log #23 (M3 Bundle E — Tasks E4/E5): measure per-attempt
// duration so the audit row carries a meaningful DurationMs. Captured
// around the handler invocation only — storage / replication overhead
// is excluded.
var attemptStartUtc = DateTime.UtcNow;
var attemptStopwatch = System.Diagnostics.Stopwatch.StartNew();
try
{
var success = await handler(message);
attemptStopwatch.Stop();
if (success)
{
await _storage.RemoveMessageAsync(message.Id);
_replication?.ReplicateRemove(message.Id);
RaiseActivity("Delivered", message.Category,
$"Delivered to {message.Target} after {message.RetryCount} retries");
// M3: terminal Delivered observer notification — the audit
// bridge maps this to Attempted + CachedResolve(Delivered).
await NotifyCachedCallObserverAsync(
message,
CachedCallAttemptOutcome.Delivered,
lastError: null,
httpStatus: null,
occurredAtUtc: attemptStartUtc,
durationMs: (int)attemptStopwatch.ElapsedMilliseconds);
return;
}
@@ -311,9 +348,20 @@ public class StoreAndForwardService
_replication?.ReplicatePark(message);
RaiseActivity("Parked", message.Category,
$"Permanent failure for {message.Target}: handler returned false");
// M3: terminal PermanentFailure observer notification — the
// audit bridge maps this to Attempted(Failed) + CachedResolve(Parked).
await NotifyCachedCallObserverAsync(
message,
CachedCallAttemptOutcome.PermanentFailure,
lastError: message.LastError,
httpStatus: null,
occurredAtUtc: attemptStartUtc,
durationMs: (int)attemptStopwatch.ElapsedMilliseconds);
}
catch (Exception ex)
{
attemptStopwatch.Stop();
// Transient failure — increment retry, check max
message.RetryCount++;
message.LastAttemptAt = DateTimeOffset.UtcNow;
@@ -339,6 +387,16 @@ public class StoreAndForwardService
_logger.LogWarning(
"Message {MessageId} parked after {MaxRetries} retries to {Target}",
message.Id, message.MaxRetries, message.Target);
// M3: terminal ParkedMaxRetries observer notification — the
// audit bridge maps this to Attempted(Failed) + CachedResolve(Parked).
await NotifyCachedCallObserverAsync(
message,
CachedCallAttemptOutcome.ParkedMaxRetries,
lastError: ex.Message,
httpStatus: null,
occurredAtUtc: attemptStartUtc,
durationMs: (int)attemptStopwatch.ElapsedMilliseconds);
}
else
{
@@ -355,10 +413,113 @@ public class StoreAndForwardService
}
RaiseActivity("Retried", message.Category,
$"Retry {message.RetryCount}/{message.MaxRetries} for {message.Target}: {ex.Message}");
// M3: per-attempt TransientFailure observer notification —
// the audit bridge maps this to Attempted(Failed).
await NotifyCachedCallObserverAsync(
message,
CachedCallAttemptOutcome.TransientFailure,
lastError: ex.Message,
httpStatus: null,
occurredAtUtc: attemptStartUtc,
durationMs: (int)attemptStopwatch.ElapsedMilliseconds);
}
}
}
/// <summary>
/// Audit Log #23 (M3 Bundle E — Tasks E4/E5): notify the registered
/// <see cref="ICachedCallLifecycleObserver"/> of the just-completed
/// attempt. Only fires for cached-call categories
/// (<see cref="StoreAndForwardCategory.ExternalSystem"/> and
/// <see cref="StoreAndForwardCategory.CachedDbWrite"/>); the
/// <see cref="StoreAndForwardCategory.Notification"/> category has its
/// own central-side audit pipeline (Notification Outbox / #21) and must
/// not surface on this hook.
/// </summary>
/// <remarks>
/// Best-effort: an observer that throws is logged and swallowed so a
/// failing audit pipeline cannot corrupt S&amp;F retry bookkeeping
/// (alog.md §7 contract). Messages whose ids are not valid GUIDs (pre-M3
/// callers that didn't thread a TrackedOperationId in) are silently
/// skipped — the observer requires a parseable id by contract.
/// </remarks>
private async Task NotifyCachedCallObserverAsync(
StoreAndForwardMessage message,
CachedCallAttemptOutcome outcome,
string? lastError,
int? httpStatus,
DateTime occurredAtUtc,
int? durationMs)
{
if (_cachedCallObserver == null)
{
return;
}
// Only cached-call categories generate audit telemetry on this hook —
// notifications have their own outbox-side audit pipeline.
var channel = message.Category switch
{
StoreAndForwardCategory.ExternalSystem => "ApiOutbound",
StoreAndForwardCategory.CachedDbWrite => "DbOutbound",
_ => null,
};
if (channel is null)
{
return;
}
if (!TrackedOperationId.TryParse(message.Id, out var trackedId))
{
// Pre-M3 message (random GUID-N id from S&F itself, no
// TrackedOperationId threaded in). Skip — no audit row to bind to.
return;
}
CachedCallAttemptContext context;
try
{
context = new CachedCallAttemptContext(
TrackedOperationId: trackedId,
Channel: channel,
Target: message.Target,
SourceSite: _siteId,
Outcome: outcome,
RetryCount: message.RetryCount,
LastError: lastError,
HttpStatus: httpStatus,
CreatedAtUtc: message.CreatedAt.UtcDateTime,
OccurredAtUtc: DateTime.SpecifyKind(occurredAtUtc, DateTimeKind.Utc),
DurationMs: durationMs,
SourceInstanceId: message.OriginInstanceName);
}
catch (Exception buildEx)
{
// Defensive — record construction shouldn't throw, but the alog.md
// §7 contract requires this path be exception-safe regardless.
_logger.LogWarning(buildEx,
"Failed to build cached-call attempt context for {MessageId}; observer skipped",
message.Id);
return;
}
try
{
await _cachedCallObserver.OnAttemptCompletedAsync(context, CancellationToken.None)
.ConfigureAwait(false);
}
catch (Exception ex)
{
// alog.md §7 best-effort: an audit observer outage must NEVER be
// misclassified as a transient delivery failure or corrupt the
// S&F retry bookkeeping.
_logger.LogWarning(ex,
"ICachedCallLifecycleObserver threw for {MessageId} (Outcome {Outcome}); ignored",
message.Id, outcome);
}
}
/// <summary>
/// WP-12: Gets parked messages for central query (Pattern 8).
/// </summary>