feat(snf): per-attempt and terminal cached-call lifecycle observer (#23 M3)
Hook the store-and-forward retry loop so the audit pipeline can emit
per-attempt + terminal telemetry under the original TrackedOperationId
(Bundle E Tasks E4 + E5).
New seam:
* ICachedCallLifecycleObserver + CachedCallAttemptContext in
Commons.Interfaces.Services. Outcome enum
(Delivered / TransientFailure / PermanentFailure / ParkedMaxRetries)
is S&F-vocabulary; the bridge living in ScadaLink.AuditLog (Bundle F)
will map it to the AuditKind/AuditStatus pair when building the
CachedCallTelemetry packet.
* StoreAndForwardService gains an optional cachedCallObserver
constructor parameter + siteId. RetryMessageAsync fires the observer
exactly once per attempt with the appropriate outcome:
- handler returns true -> Delivered
- handler returns false -> PermanentFailure (and parks)
- handler throws + retries remaining -> TransientFailure
- handler throws + max retries hit -> ParkedMaxRetries (and parks)
Hook is best-effort: a thrown observer is logged + swallowed so a
failing audit pipeline can never be misclassified as a transient
delivery failure or corrupt the retry-count bookkeeping (alog.md §7).
Only cached-call categories (ExternalSystem, CachedDbWrite) trigger the
observer — the Notification category has its own central-side
audit pipeline (Notification Outbox / #21) and never surfaces on this hook.
Pre-M3 callers that didn't thread a TrackedOperationId into the S&F
message id are silently skipped — the observer requires a parseable id
by contract. New S&F callers stamp the id as messageId (Bundle E3).
Bundle E tasks E4 + E5.
This commit is contained in:
@@ -0,0 +1,93 @@
|
||||
using ScadaLink.Commons.Types;
|
||||
|
||||
namespace ScadaLink.Commons.Interfaces.Services;
|
||||
|
||||
/// <summary>
/// Audit Log #23 (M3 Bundle E — Tasks E4/E5): site-side hook the
/// store-and-forward retry loop invokes after every cached-call attempt,
/// including the attempt that moves the message into a terminal state, so
/// the audit pipeline can emit <c>ApiCallCached</c>/<c>DbWriteCached</c>
/// per-attempt rows and the <c>CachedResolve</c> terminal row under the
/// original <see cref="TrackedOperationId"/>.
/// </summary>
/// <remarks>
/// <para>
/// The interface deliberately uses <see cref="CachedCallAttemptOutcome"/>
/// rather than <see cref="ScadaLink.Commons.Types.Enums.AuditStatus"/> so the
/// S&amp;F project does not need to depend on the audit vocabulary — the
/// bridge living in <c>ScadaLink.AuditLog</c> maps the outcome to the right
/// audit kind + status when materialising the <c>CachedCallTelemetry</c>
/// packet.
/// </para>
/// <para>
/// <b>Best-effort contract (alog.md §7):</b> implementations MUST swallow
/// internal failures rather than propagating to the S&amp;F service — a
/// thrown observer must not be misclassified as a transient delivery
/// failure and must not corrupt the retry-count bookkeeping.
/// </para>
/// </remarks>
public interface ICachedCallLifecycleObserver
{
    /// <summary>
    /// Called by the store-and-forward retry loop after every cached-call
    /// delivery attempt.
    /// </summary>
    /// <param name="context">
    /// Snapshot of the attempt: the tracked operation id, the per-category
    /// channel discriminator, retry-count + last-error context, and the
    /// attempt outcome. There is no separate "terminal" flag — whether the
    /// attempt was terminal is conveyed by
    /// <see cref="CachedCallAttemptContext.Outcome"/>.
    /// </param>
    /// <param name="ct">Token that can be used to cancel the notification.</param>
    /// <returns>A task that completes once the observer has handled the attempt.</returns>
    Task OnAttemptCompletedAsync(CachedCallAttemptContext context, CancellationToken ct = default);
}
|
||||
|
||||
/// <summary>
/// Per-attempt context handed to <see cref="ICachedCallLifecycleObserver"/>.
/// </summary>
/// <param name="TrackedOperationId">
/// Tracking id parsed from the underlying <c>StoreAndForwardMessage.Id</c>.
/// </param>
/// <param name="Channel">
/// Trust-boundary channel string — <c>"ApiOutbound"</c> for ExternalSystem
/// cached calls, <c>"DbOutbound"</c> for cached DB writes.
/// </param>
/// <param name="Target">Human-readable target (system name / DB connection).</param>
/// <param name="SourceSite">Site id that submitted the cached call.</param>
/// <param name="Outcome">Per-attempt outcome.</param>
/// <param name="RetryCount">Number of retries performed so far (S&amp;F bookkeeping).</param>
/// <param name="LastError">Most recent error message (null on success).</param>
/// <param name="HttpStatus">Most recent HTTP status (null when not applicable).</param>
/// <param name="CreatedAtUtc">When the underlying S&amp;F message was first enqueued.</param>
/// <param name="OccurredAtUtc">
/// UTC timestamp associated with this attempt. The store-and-forward retry
/// loop supplies the instant captured immediately before the delivery
/// handler is invoked (attempt start); add <paramref name="DurationMs"/> to
/// approximate the completion time.
/// </param>
/// <param name="DurationMs">Duration of the attempt in milliseconds (null when not measured).</param>
/// <param name="SourceInstanceId">Originating instance, when known.</param>
public sealed record CachedCallAttemptContext(
    TrackedOperationId TrackedOperationId,
    string Channel,
    string Target,
    string SourceSite,
    CachedCallAttemptOutcome Outcome,
    int RetryCount,
    string? LastError,
    int? HttpStatus,
    DateTime CreatedAtUtc,
    DateTime OccurredAtUtc,
    int? DurationMs,
    string? SourceInstanceId);
|
||||
|
||||
/// <summary>
/// Coarse outcome of one cached-call delivery attempt, observed from inside
/// the store-and-forward retry loop. The audit bridge maps this to the
/// <c>ApiCallCached</c>/<c>DbWriteCached</c> Attempted row and, when terminal,
/// the corresponding <c>CachedResolve</c> row.
/// </summary>
public enum CachedCallAttemptOutcome
{
    /// <summary>Attempt delivered successfully — terminal Delivered state.</summary>
    Delivered,

    /// <summary>Attempt failed transiently; another retry will follow.</summary>
    TransientFailure,

    /// <summary>Attempt returned permanent failure — terminal Parked state (S&amp;F semantics).</summary>
    PermanentFailure,

    /// <summary>Retry budget exhausted — terminal Parked state.</summary>
    ParkedMaxRetries,
}
|
||||
@@ -1,4 +1,6 @@
|
||||
using Microsoft.Extensions.Logging;
|
||||
using ScadaLink.Commons.Interfaces.Services;
|
||||
using ScadaLink.Commons.Types;
|
||||
using ScadaLink.Commons.Types.Enums;
|
||||
|
||||
namespace ScadaLink.StoreAndForward;
|
||||
@@ -33,6 +35,19 @@ public class StoreAndForwardService
|
||||
private readonly StoreAndForwardOptions _options;
|
||||
private readonly ReplicationService? _replication;
|
||||
private readonly ILogger<StoreAndForwardService> _logger;
|
||||
/// <summary>
|
||||
/// Audit Log #23 (M3 Bundle E — Task E4): site-side observer notified
|
||||
/// after every cached-call delivery attempt. Optional — when null no
|
||||
/// telemetry is emitted; the legacy pre-M3 retry loop behaviour is
|
||||
/// preserved exactly.
|
||||
/// </summary>
|
||||
private readonly ICachedCallLifecycleObserver? _cachedCallObserver;
|
||||
/// <summary>
|
||||
/// Audit Log #23 (M3 Bundle E — Task E4): site id stamped onto the
|
||||
/// cached-call attempt context so the audit bridge can build the
|
||||
/// <see cref="SiteCallOperational"/> half of the telemetry packet.
|
||||
/// </summary>
|
||||
private readonly string _siteId;
|
||||
private Timer? _retryTimer;
|
||||
private int _retryInProgress;
|
||||
|
||||
@@ -63,12 +78,16 @@ public class StoreAndForwardService
|
||||
StoreAndForwardStorage storage,
|
||||
StoreAndForwardOptions options,
|
||||
ILogger<StoreAndForwardService> logger,
|
||||
ReplicationService? replication = null)
|
||||
ReplicationService? replication = null,
|
||||
ICachedCallLifecycleObserver? cachedCallObserver = null,
|
||||
string siteId = "")
|
||||
{
|
||||
_storage = storage;
|
||||
_options = options;
|
||||
_logger = logger;
|
||||
_replication = replication;
|
||||
_cachedCallObserver = cachedCallObserver;
|
||||
_siteId = siteId;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
@@ -280,15 +299,33 @@ public class StoreAndForwardService
|
||||
return;
|
||||
}
|
||||
|
||||
// Audit Log #23 (M3 Bundle E — Tasks E4/E5): measure per-attempt
|
||||
// duration so the audit row carries a meaningful DurationMs. Captured
|
||||
// around the handler invocation only — storage / replication overhead
|
||||
// is excluded.
|
||||
var attemptStartUtc = DateTime.UtcNow;
|
||||
var attemptStopwatch = System.Diagnostics.Stopwatch.StartNew();
|
||||
|
||||
try
|
||||
{
|
||||
var success = await handler(message);
|
||||
attemptStopwatch.Stop();
|
||||
if (success)
|
||||
{
|
||||
await _storage.RemoveMessageAsync(message.Id);
|
||||
_replication?.ReplicateRemove(message.Id);
|
||||
RaiseActivity("Delivered", message.Category,
|
||||
$"Delivered to {message.Target} after {message.RetryCount} retries");
|
||||
|
||||
// M3: terminal Delivered observer notification — the audit
|
||||
// bridge maps this to Attempted + CachedResolve(Delivered).
|
||||
await NotifyCachedCallObserverAsync(
|
||||
message,
|
||||
CachedCallAttemptOutcome.Delivered,
|
||||
lastError: null,
|
||||
httpStatus: null,
|
||||
occurredAtUtc: attemptStartUtc,
|
||||
durationMs: (int)attemptStopwatch.ElapsedMilliseconds);
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -311,9 +348,20 @@ public class StoreAndForwardService
|
||||
_replication?.ReplicatePark(message);
|
||||
RaiseActivity("Parked", message.Category,
|
||||
$"Permanent failure for {message.Target}: handler returned false");
|
||||
|
||||
// M3: terminal PermanentFailure observer notification — the
|
||||
// audit bridge maps this to Attempted(Failed) + CachedResolve(Parked).
|
||||
await NotifyCachedCallObserverAsync(
|
||||
message,
|
||||
CachedCallAttemptOutcome.PermanentFailure,
|
||||
lastError: message.LastError,
|
||||
httpStatus: null,
|
||||
occurredAtUtc: attemptStartUtc,
|
||||
durationMs: (int)attemptStopwatch.ElapsedMilliseconds);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
attemptStopwatch.Stop();
|
||||
// Transient failure — increment retry, check max
|
||||
message.RetryCount++;
|
||||
message.LastAttemptAt = DateTimeOffset.UtcNow;
|
||||
@@ -339,6 +387,16 @@ public class StoreAndForwardService
|
||||
_logger.LogWarning(
|
||||
"Message {MessageId} parked after {MaxRetries} retries to {Target}",
|
||||
message.Id, message.MaxRetries, message.Target);
|
||||
|
||||
// M3: terminal ParkedMaxRetries observer notification — the
|
||||
// audit bridge maps this to Attempted(Failed) + CachedResolve(Parked).
|
||||
await NotifyCachedCallObserverAsync(
|
||||
message,
|
||||
CachedCallAttemptOutcome.ParkedMaxRetries,
|
||||
lastError: ex.Message,
|
||||
httpStatus: null,
|
||||
occurredAtUtc: attemptStartUtc,
|
||||
durationMs: (int)attemptStopwatch.ElapsedMilliseconds);
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -355,10 +413,113 @@ public class StoreAndForwardService
|
||||
}
|
||||
RaiseActivity("Retried", message.Category,
|
||||
$"Retry {message.RetryCount}/{message.MaxRetries} for {message.Target}: {ex.Message}");
|
||||
|
||||
// M3: per-attempt TransientFailure observer notification —
|
||||
// the audit bridge maps this to Attempted(Failed).
|
||||
await NotifyCachedCallObserverAsync(
|
||||
message,
|
||||
CachedCallAttemptOutcome.TransientFailure,
|
||||
lastError: ex.Message,
|
||||
httpStatus: null,
|
||||
occurredAtUtc: attemptStartUtc,
|
||||
durationMs: (int)attemptStopwatch.ElapsedMilliseconds);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Audit Log #23 (M3 Bundle E — Tasks E4/E5): reports a just-completed
/// delivery attempt to the registered
/// <see cref="ICachedCallLifecycleObserver"/>, when one is configured.
/// </summary>
/// <remarks>
/// <para>
/// Only the cached-call categories
/// (<see cref="StoreAndForwardCategory.ExternalSystem"/> and
/// <see cref="StoreAndForwardCategory.CachedDbWrite"/>) are reported. The
/// <see cref="StoreAndForwardCategory.Notification"/> category has its own
/// central-side audit pipeline (Notification Outbox / #21) and must not
/// surface on this hook.
/// </para>
/// <para>
/// Best-effort (alog.md §7 contract): a failure while building the context
/// or inside the observer is logged as a warning and swallowed, so a failing
/// audit pipeline cannot corrupt S&amp;F retry bookkeeping. Messages whose id
/// does not parse as a <see cref="TrackedOperationId"/> (pre-M3 callers that
/// never threaded one in) are skipped without logging.
/// </para>
/// </remarks>
/// <param name="message">The store-and-forward message whose attempt just finished.</param>
/// <param name="outcome">Outcome of the attempt.</param>
/// <param name="lastError">Error text for the attempt, or null on success.</param>
/// <param name="httpStatus">HTTP status for the attempt, or null when not applicable.</param>
/// <param name="occurredAtUtc">Attempt timestamp; its kind is forced to UTC before being handed on.</param>
/// <param name="durationMs">Measured attempt duration in milliseconds, or null.</param>
private async Task NotifyCachedCallObserverAsync(
    StoreAndForwardMessage message,
    CachedCallAttemptOutcome outcome,
    string? lastError,
    int? httpStatus,
    DateTime occurredAtUtc,
    int? durationMs)
{
    // No observer wired up: behave exactly like the pre-M3 retry loop.
    if (_cachedCallObserver is not { } observer)
    {
        return;
    }

    // Map the category onto its audit channel; any category without a
    // channel (e.g. notifications) is audited elsewhere and is not reported.
    string channel;
    switch (message.Category)
    {
        case StoreAndForwardCategory.ExternalSystem:
            channel = "ApiOutbound";
            break;
        case StoreAndForwardCategory.CachedDbWrite:
            channel = "DbOutbound";
            break;
        default:
            return;
    }

    // The message id must parse as a TrackedOperationId; otherwise there is
    // no audit row to bind this attempt to (pre-M3 message), so skip it.
    if (!TrackedOperationId.TryParse(message.Id, out var operationId))
    {
        return;
    }

    CachedCallAttemptContext attempt;
    try
    {
        attempt = new CachedCallAttemptContext(
            operationId,
            channel,
            message.Target,
            _siteId,
            outcome,
            message.RetryCount,
            lastError,
            httpStatus,
            message.CreatedAt.UtcDateTime,
            DateTime.SpecifyKind(occurredAtUtc, DateTimeKind.Utc),
            durationMs,
            message.OriginInstanceName);
    }
    catch (Exception contextFailure)
    {
        // Building the record is not expected to fail, but alog.md §7
        // requires this whole path to be exception-safe regardless.
        _logger.LogWarning(contextFailure,
            "Failed to build cached-call attempt context for {MessageId}; observer skipped",
            message.Id);
        return;
    }

    try
    {
        var notification = observer.OnAttemptCompletedAsync(attempt, CancellationToken.None);
        await notification.ConfigureAwait(false);
    }
    catch (Exception observerFailure)
    {
        // alog.md §7 best-effort: an audit observer outage must never be
        // treated as a transient delivery failure nor disturb the S&F
        // retry bookkeeping, so the exception stops here.
        _logger.LogWarning(observerFailure,
            "ICachedCallLifecycleObserver threw for {MessageId} (Outcome {Outcome}); ignored",
            message.Id, outcome);
    }
}
|
||||
|
||||
/// <summary>
|
||||
/// WP-12: Gets parked messages for central query (Pattern 8).
|
||||
/// </summary>
|
||||
|
||||
Reference in New Issue
Block a user