fix(store-and-forward): resolve StoreAndForward-015..017 — document maxRetries=0 contract, replicate operator retry/discard, real category in activity log
This commit is contained in:
@@ -8,7 +8,7 @@
|
||||
| Last reviewed | 2026-05-17 |
|
||||
| Reviewer | claude-agent |
|
||||
| Commit reviewed | `39d737e` |
|
||||
| Open findings | 3 (3 Deferred: 002, 011, 012 — see notes) |
|
||||
| Open findings | 0 (3 Deferred: 002, 011, 012 — see notes) |
|
||||
|
||||
## Summary
|
||||
|
||||
@@ -735,7 +735,7 @@ all six `SiteActorPathTests` now pass. Fixed by the commit whose message referen
|
||||
|--|--|
|
||||
| Severity | Medium |
|
||||
| Category | Documentation & comments |
|
||||
| Status | Open |
|
||||
| Status | Resolved |
|
||||
| Location | `src/ScadaLink.StoreAndForward/StoreAndForwardService.cs:114`–`:130`, `:285` |
|
||||
|
||||
**Description**
|
||||
@@ -784,7 +784,17 @@ public `EnqueueAsync` contract must state the chosen meaning; today it states no
|
||||
|
||||
**Resolution**
|
||||
|
||||
_Unresolved._
|
||||
Resolved 2026-05-17. Documentation-only fix — retry semantics confirmed correct and
|
||||
left unchanged. Root cause verified against the source: `EnqueueAsync`'s `maxRetries`
|
||||
parameter had no `<param>` documentation and the method/class summaries described only
|
||||
the "park on max retries" path, never the `0 = no limit / retry forever` special case
|
||||
that `RetryMessageAsync`'s `MaxRetries > 0` guard actually enforces. Added an explicit
|
||||
`<param>` tag for every `EnqueueAsync` parameter — `maxRetries` now states in bold that
|
||||
`0` means "no limit, never parked for retry exhaustion" and is **not** a "never retry"
|
||||
value — extended the method summary with a retry-count lifecycle paragraph, updated the
|
||||
class-level lifecycle bullet, and tightened the `StoreAndForwardMessage.MaxRetries` field
|
||||
doc to the same wording. No behavioural code touched; an XML comment is not
|
||||
test-observable so no regression test was added.
|
||||
|
||||
### StoreAndForward-016 — Operator-initiated parked-message retry and discard are not replicated to the standby
|
||||
|
||||
@@ -792,7 +802,7 @@ _Unresolved._
|
||||
|--|--|
|
||||
| Severity | Medium |
|
||||
| Category | Error handling & resilience |
|
||||
| Status | Open |
|
||||
| Status | Resolved |
|
||||
| Location | `src/ScadaLink.StoreAndForward/StoreAndForwardService.cs:339`–`:362`; `src/ScadaLink.StoreAndForward/ReplicationService.cs:131`–`:136` |
|
||||
|
||||
**Description**
|
||||
@@ -842,7 +852,18 @@ paths).
|
||||
|
||||
**Resolution**
|
||||
|
||||
_Unresolved._
|
||||
Resolved 2026-05-17. Root cause confirmed: the two operator paths
|
||||
(`RetryParkedMessageAsync`, `DiscardParkedMessageAsync`) changed local SQLite state but
|
||||
never touched `_replication`, so a failover after an operator action diverged the
|
||||
standby buffer. `DiscardParkedMessageAsync` now calls `_replication?.ReplicateRemove`
|
||||
after a successful local delete (the existing `Remove` op deletes on the standby). A new
|
||||
`ReplicationOperationType.Requeue` was added; `RetryParkedMessageAsync` re-loads the
|
||||
requeued row and calls `_replication?.ReplicateRequeue`, and the standby's
|
||||
`ApplyReplicatedOperationAsync` `Requeue` case resets its matching row to `Pending` with
|
||||
`retry_count = 0`. Regression tests `DiscardingAParkedMessage_ReplicatesARemoveOperation`,
|
||||
`RetryingAParkedMessage_ReplicatesARequeueOperation` and
|
||||
`ApplyReplicatedOperation_Requeue_MovesStandbyRowBackToPending` (in
|
||||
`StoreAndForwardReplicationTests`) all pass; the first two fail against the pre-fix code.
|
||||
|
||||
### StoreAndForward-017 — Retry/Discard activity-log entries hard-code the `ExternalSystem` category
|
||||
|
||||
@@ -850,7 +871,7 @@ _Unresolved._
|
||||
|--|--|
|
||||
| Severity | Low |
|
||||
| Category | Correctness & logic bugs |
|
||||
| Status | Open |
|
||||
| Status | Resolved |
|
||||
| Location | `src/ScadaLink.StoreAndForward/StoreAndForwardService.cs:344`, `:358` |
|
||||
|
||||
**Description**
|
||||
@@ -882,4 +903,14 @@ allow a nullable category for management actions rather than asserting a false o
|
||||
|
||||
**Resolution**
|
||||
|
||||
_Unresolved._
|
||||
Resolved 2026-05-17. Root cause confirmed: `RetryParkedMessageAsync` and
|
||||
`DiscardParkedMessageAsync` raised activity notifications with a hard-coded
|
||||
`StoreAndForwardCategory.ExternalSystem`, mislabelling parked `Notification` and
|
||||
`CachedDbWrite` messages in the site event log. Both methods now obtain the message's
|
||||
real category — `DiscardParkedMessageAsync` loads the row via `GetMessageByIdAsync`
|
||||
before the delete, `RetryParkedMessageAsync` re-loads the requeued row (also used for
|
||||
the StoreAndForward-016 replication) — and pass it to `RaiseActivity` (falling back to
|
||||
`ExternalSystem` only if the row is unexpectedly absent). Regression tests
|
||||
`RetryParkedMessageAsync_ActivityUsesMessageRealCategory` and
|
||||
`DiscardParkedMessageAsync_ActivityUsesMessageRealCategory` assert the activity carries
|
||||
`Notification` / `CachedDbWrite` respectively; both fail against the pre-fix code.
|
||||
|
||||
@@ -73,6 +73,22 @@ public class ReplicationService
|
||||
message));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// WP-11 / StoreAndForward-016: Replicates an operator-initiated requeue (a parked
|
||||
/// message moved back to the pending queue) to standby (fire-and-forget). The
|
||||
/// carried message reflects the active node's post-requeue state (Pending,
|
||||
/// retry_count = 0) so the standby's copy can be brought into sync.
|
||||
/// </summary>
|
||||
public void ReplicateRequeue(StoreAndForwardMessage message)
|
||||
{
|
||||
if (!_options.ReplicationEnabled || _replicationHandler == null) return;
|
||||
|
||||
FireAndForget(new ReplicationOperation(
|
||||
ReplicationOperationType.Requeue,
|
||||
message.Id,
|
||||
message));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// WP-11: Applies a replicated operation received from the active node.
|
||||
/// Used by the standby node to keep its SQLite in sync.
|
||||
@@ -95,6 +111,15 @@ public class ReplicationService
|
||||
operation.Message.Status = StoreAndForwardMessageStatus.Parked;
|
||||
await storage.UpdateMessageAsync(operation.Message);
|
||||
break;
|
||||
|
||||
case ReplicationOperationType.Requeue when operation.Message != null:
|
||||
// StoreAndForward-016: an operator retried a parked message on the
|
||||
// active node; mirror that on the standby by moving its row back to
|
||||
// Pending with retry_count = 0 so a failover preserves the retry.
|
||||
operation.Message.Status = StoreAndForwardMessageStatus.Pending;
|
||||
operation.Message.RetryCount = 0;
|
||||
await storage.UpdateMessageAsync(operation.Message);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -132,5 +157,10 @@ public enum ReplicationOperationType
|
||||
{
|
||||
Add,
|
||||
Remove,
|
||||
Park
|
||||
Park,
|
||||
/// <summary>
|
||||
/// StoreAndForward-016: an operator moved a parked message back to the pending
|
||||
/// queue. The standby resets its matching row to Pending with retry_count = 0.
|
||||
/// </summary>
|
||||
Requeue
|
||||
}
|
||||
|
||||
@@ -27,7 +27,12 @@ public class StoreAndForwardMessage
|
||||
/// </summary>
|
||||
public int RetryCount { get; set; }
|
||||
|
||||
/// <summary>Maximum retry-sweep attempts before parking (0 = no limit).</summary>
|
||||
/// <summary>
|
||||
/// Maximum retry-sweep attempts before the message is parked.
|
||||
/// <c>0</c> = no limit — the message is retried on every sweep until delivered
|
||||
/// and is never parked for exhausting retries. This is <b>not</b> a "never retry"
|
||||
/// value; a positive value is required to bound delivery attempts.
|
||||
/// </summary>
|
||||
public int MaxRetries { get; set; }
|
||||
|
||||
/// <summary>Retry interval in milliseconds.</summary>
|
||||
|
||||
@@ -10,7 +10,8 @@ namespace ScadaLink.StoreAndForward;
|
||||
/// 1. Caller attempts immediate delivery via IDeliveryHandler
|
||||
/// 2. On transient failure → buffer in SQLite → retry loop
|
||||
/// 3. On success → remove from buffer
|
||||
/// 4. On max retries → park
|
||||
/// 4. On reaching MaxRetries → park (a MaxRetries of 0 means "no limit" — the
|
||||
/// message is retried until delivered and is never parked for retry exhaustion)
|
||||
/// 5. Permanent failures are returned to caller immediately (never buffered)
|
||||
///
|
||||
/// WP-10: Fixed retry interval (not exponential). Per-source-entity retry settings.
|
||||
@@ -116,10 +117,38 @@ public class StoreAndForwardService
|
||||
/// Attempts immediate delivery first. On transient failure, buffers for retry.
|
||||
/// On permanent failure (handler returns false), returns false immediately.
|
||||
///
|
||||
/// WP-10: Retry-count lifecycle — the immediate (or caller-made) delivery attempt
|
||||
/// is attempt 0 and is not counted; the background retry sweep increments
|
||||
/// <see cref="StoreAndForwardMessage.RetryCount"/> on each retry. A buffered
|
||||
/// message is parked once <c>RetryCount</c> reaches <paramref name="maxRetries"/>
|
||||
/// — <b>but only when <paramref name="maxRetries"/> is greater than 0</b>. A
|
||||
/// <paramref name="maxRetries"/> of <c>0</c> means <b>no limit</b>: the message is
|
||||
/// retried on every sweep until it is delivered and is <b>never parked</b> on a
|
||||
/// retry-count basis. It is therefore <i>not</i> a "do not retry" value — callers
|
||||
/// that want delivery abandoned after a bounded number of attempts must pass a
|
||||
/// positive <paramref name="maxRetries"/>.
|
||||
///
|
||||
/// WP-15: CachedCall idempotency note — this method does not deduplicate.
|
||||
/// The caller (e.g., ExternalSystem.CachedCall()) is responsible for ensuring
|
||||
/// that the remote system can handle duplicate deliveries safely.
|
||||
/// </summary>
|
||||
/// <param name="category">Message category (selects the delivery handler).</param>
|
||||
/// <param name="target">Target system name (external system / notification list / DB connection).</param>
|
||||
/// <param name="payloadJson">JSON-serialized call payload, treated opaquely.</param>
|
||||
/// <param name="originInstanceName">Instance that originated the message (WP-13: survives instance deletion).</param>
|
||||
/// <param name="maxRetries">
|
||||
/// Maximum background retry-sweep attempts before the message is parked.
|
||||
/// <b><c>0</c> = no limit</b> — the message is retried on every sweep until
|
||||
/// delivered and is never parked for exhausting retries; it is <b>not</b> a
|
||||
/// "never retry" value. <c>null</c> uses <see cref="StoreAndForwardOptions.DefaultMaxRetries"/>.
|
||||
/// Must be positive to bound delivery attempts. Mirrors the
|
||||
/// <see cref="StoreAndForwardMessage.MaxRetries"/> contract.
|
||||
/// </param>
|
||||
/// <param name="retryInterval">Fixed interval between retry sweeps for this message; <c>null</c> uses the configured default.</param>
|
||||
/// <param name="attemptImmediateDelivery">
|
||||
/// When <c>false</c>, the caller has already made its own delivery attempt and the
|
||||
/// message is buffered directly for the retry sweep (the handler is not invoked here).
|
||||
/// </param>
|
||||
public async Task<StoreAndForwardResult> EnqueueAsync(
|
||||
StoreAndForwardCategory category,
|
||||
string target,
|
||||
@@ -335,13 +364,27 @@ public class StoreAndForwardService
|
||||
|
||||
/// <summary>
|
||||
/// WP-12: Retries a parked message (moves back to pending queue).
|
||||
///
|
||||
/// StoreAndForward-016: an operator requeue is a buffer state change and is
|
||||
/// replicated to the standby (as a <see cref="ReplicationOperationType.Requeue"/>)
|
||||
/// so a failover preserves the operator's retry intent.
|
||||
/// StoreAndForward-017: the activity-log entry carries the message's true
|
||||
/// category rather than a hard-coded one.
|
||||
/// </summary>
|
||||
public async Task<bool> RetryParkedMessageAsync(string messageId)
|
||||
{
|
||||
var success = await _storage.RetryParkedMessageAsync(messageId);
|
||||
if (success)
|
||||
{
|
||||
RaiseActivity("Retry", StoreAndForwardCategory.ExternalSystem,
|
||||
// Re-load the requeued row so the activity log gets the real category
|
||||
// and the standby gets the post-requeue state (Pending, retry_count = 0).
|
||||
var message = await _storage.GetMessageByIdAsync(messageId);
|
||||
var category = message?.Category ?? StoreAndForwardCategory.ExternalSystem;
|
||||
if (message != null)
|
||||
{
|
||||
_replication?.ReplicateRequeue(message);
|
||||
}
|
||||
RaiseActivity("Retry", category,
|
||||
$"Parked message {messageId} moved back to queue");
|
||||
}
|
||||
return success;
|
||||
@@ -349,13 +392,23 @@ public class StoreAndForwardService
|
||||
|
||||
/// <summary>
|
||||
/// WP-12: Permanently discards a parked message.
|
||||
///
|
||||
/// StoreAndForward-016: an operator discard is a buffer removal and is replicated
|
||||
/// to the standby (as a <see cref="ReplicationOperationType.Remove"/>) so the
|
||||
/// discarded message does not reappear after a failover.
|
||||
/// StoreAndForward-017: the activity-log entry carries the message's true
|
||||
/// category rather than a hard-coded one.
|
||||
/// </summary>
|
||||
public async Task<bool> DiscardParkedMessageAsync(string messageId)
|
||||
{
|
||||
// Capture the category before the row is deleted so the activity log is
|
||||
// labelled correctly.
|
||||
var message = await _storage.GetMessageByIdAsync(messageId);
|
||||
var success = await _storage.DiscardParkedMessageAsync(messageId);
|
||||
if (success)
|
||||
{
|
||||
RaiseActivity("Discard", StoreAndForwardCategory.ExternalSystem,
|
||||
_replication?.ReplicateRemove(messageId);
|
||||
RaiseActivity("Discard", message?.Category ?? StoreAndForwardCategory.ExternalSystem,
|
||||
$"Parked message {messageId} discarded");
|
||||
}
|
||||
return success;
|
||||
|
||||
@@ -105,4 +105,102 @@ public class StoreAndForwardReplicationTests : IAsyncLifetime, IDisposable
|
||||
Assert.Contains(ops, o =>
|
||||
o.OperationType == ReplicationOperationType.Park && o.MessageId == result.MessageId);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// StoreAndForward-016: an operator discarding a parked message must replicate
|
||||
/// a Remove so the standby's copy is also deleted (otherwise the discarded
|
||||
/// message reappears in the parked list after a failover).
|
||||
/// </summary>
|
||||
[Fact]
|
||||
public async Task DiscardingAParkedMessage_ReplicatesARemoveOperation()
|
||||
{
|
||||
_service.RegisterDeliveryHandler(StoreAndForwardCategory.ExternalSystem,
|
||||
_ => throw new HttpRequestException("always fails"));
|
||||
|
||||
var result = await _service.EnqueueAsync(
|
||||
StoreAndForwardCategory.ExternalSystem, "api", """{}""", maxRetries: 1);
|
||||
await _service.RetryPendingMessagesAsync(); // -> parked
|
||||
await WaitForReplicationAsync(2);
|
||||
|
||||
var discarded = await _service.DiscardParkedMessageAsync(result.MessageId);
|
||||
Assert.True(discarded);
|
||||
|
||||
var ops = await WaitForReplicationAsync(3);
|
||||
Assert.Contains(ops, o =>
|
||||
o.OperationType == ReplicationOperationType.Remove && o.MessageId == result.MessageId);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// StoreAndForward-016: an operator retrying a parked message must replicate a
|
||||
/// Requeue so the standby's copy moves back to Pending (otherwise it stays
|
||||
/// Parked on the standby and the operator's retry is lost across a failover).
|
||||
/// </summary>
|
||||
[Fact]
|
||||
public async Task RetryingAParkedMessage_ReplicatesARequeueOperation()
|
||||
{
|
||||
_service.RegisterDeliveryHandler(StoreAndForwardCategory.ExternalSystem,
|
||||
_ => throw new HttpRequestException("always fails"));
|
||||
|
||||
var result = await _service.EnqueueAsync(
|
||||
StoreAndForwardCategory.ExternalSystem, "api", """{}""", maxRetries: 1);
|
||||
await _service.RetryPendingMessagesAsync(); // -> parked
|
||||
await WaitForReplicationAsync(2);
|
||||
|
||||
var retried = await _service.RetryParkedMessageAsync(result.MessageId);
|
||||
Assert.True(retried);
|
||||
|
||||
var ops = await WaitForReplicationAsync(3);
|
||||
var requeue = ops.SingleOrDefault(o =>
|
||||
o.OperationType == ReplicationOperationType.Requeue && o.MessageId == result.MessageId);
|
||||
Assert.NotNull(requeue);
|
||||
Assert.NotNull(requeue!.Message);
|
||||
Assert.Equal(StoreAndForwardMessageStatus.Pending, requeue.Message!.Status);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// StoreAndForward-016: the standby applies a Requeue by moving its row back to
|
||||
/// Pending with retry_count = 0, mirroring the active node's local state.
|
||||
/// </summary>
|
||||
[Fact]
|
||||
public async Task ApplyReplicatedOperation_Requeue_MovesStandbyRowBackToPending()
|
||||
{
|
||||
var replication = new ReplicationService(
|
||||
new StoreAndForwardOptions { ReplicationEnabled = true },
|
||||
NullLogger<ReplicationService>.Instance);
|
||||
|
||||
var parked = new StoreAndForwardMessage
|
||||
{
|
||||
Id = "requeue1",
|
||||
Category = StoreAndForwardCategory.ExternalSystem,
|
||||
Target = "api",
|
||||
PayloadJson = "{}",
|
||||
RetryCount = 5,
|
||||
MaxRetries = 1,
|
||||
RetryIntervalMs = 0,
|
||||
CreatedAt = DateTimeOffset.UtcNow,
|
||||
Status = StoreAndForwardMessageStatus.Parked,
|
||||
};
|
||||
await _storage.EnqueueAsync(parked);
|
||||
|
||||
var requeued = new StoreAndForwardMessage
|
||||
{
|
||||
Id = parked.Id,
|
||||
Category = parked.Category,
|
||||
Target = parked.Target,
|
||||
PayloadJson = parked.PayloadJson,
|
||||
RetryCount = 0,
|
||||
MaxRetries = parked.MaxRetries,
|
||||
RetryIntervalMs = parked.RetryIntervalMs,
|
||||
CreatedAt = parked.CreatedAt,
|
||||
Status = StoreAndForwardMessageStatus.Pending,
|
||||
};
|
||||
await replication.ApplyReplicatedOperationAsync(
|
||||
new ReplicationOperation(ReplicationOperationType.Requeue, parked.Id, requeued),
|
||||
_storage);
|
||||
|
||||
var row = await _storage.GetMessageByIdAsync(parked.Id);
|
||||
Assert.NotNull(row);
|
||||
Assert.Equal(StoreAndForwardMessageStatus.Pending, row!.Status);
|
||||
Assert.Equal(0, row.RetryCount);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -267,6 +267,60 @@ public class StoreAndForwardServiceTests : IAsyncLifetime, IDisposable
|
||||
Assert.Equal(0, msg.RetryCount);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// StoreAndForward-017: the Retry activity-log entry must carry the parked
|
||||
/// message's true category, not a hard-coded ExternalSystem.
|
||||
/// </summary>
|
||||
[Fact]
|
||||
public async Task RetryParkedMessageAsync_ActivityUsesMessageRealCategory()
|
||||
{
|
||||
_service.RegisterDeliveryHandler(StoreAndForwardCategory.Notification,
|
||||
_ => throw new HttpRequestException("fail"));
|
||||
|
||||
var result = await _service.EnqueueAsync(
|
||||
StoreAndForwardCategory.Notification, "ops-list", """{}""",
|
||||
maxRetries: 1);
|
||||
await _service.RetryPendingMessagesAsync(); // -> parked
|
||||
|
||||
var categories = new List<StoreAndForwardCategory>();
|
||||
_service.OnActivity += (action, category, _) =>
|
||||
{
|
||||
if (action == "Retry") categories.Add(category);
|
||||
};
|
||||
|
||||
var retried = await _service.RetryParkedMessageAsync(result.MessageId);
|
||||
Assert.True(retried);
|
||||
|
||||
Assert.Equal(new[] { StoreAndForwardCategory.Notification }, categories);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// StoreAndForward-017: the Discard activity-log entry must carry the parked
|
||||
/// message's true category, not a hard-coded ExternalSystem.
|
||||
/// </summary>
|
||||
[Fact]
|
||||
public async Task DiscardParkedMessageAsync_ActivityUsesMessageRealCategory()
|
||||
{
|
||||
_service.RegisterDeliveryHandler(StoreAndForwardCategory.CachedDbWrite,
|
||||
_ => throw new HttpRequestException("fail"));
|
||||
|
||||
var result = await _service.EnqueueAsync(
|
||||
StoreAndForwardCategory.CachedDbWrite, "site-db", """{}""",
|
||||
maxRetries: 1);
|
||||
await _service.RetryPendingMessagesAsync(); // -> parked
|
||||
|
||||
var categories = new List<StoreAndForwardCategory>();
|
||||
_service.OnActivity += (action, category, _) =>
|
||||
{
|
||||
if (action == "Discard") categories.Add(category);
|
||||
};
|
||||
|
||||
var discarded = await _service.DiscardParkedMessageAsync(result.MessageId);
|
||||
Assert.True(discarded);
|
||||
|
||||
Assert.Equal(new[] { StoreAndForwardCategory.CachedDbWrite }, categories);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task DiscardParkedMessageAsync_PermanentlyRemoves()
|
||||
{
|
||||
|
||||
Reference in New Issue
Block a user