From 71c0564ec05fb1c34e4795f274df696f90a67f10 Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Sat, 16 May 2026 19:57:28 -0400 Subject: [PATCH] =?UTF-8?q?fix(store-and-forward):=20resolve=20StoreAndFor?= =?UTF-8?q?ward-003,=20re-triage=20002=20=E2=80=94=20fix=20retry-count=20o?= =?UTF-8?q?ff-by-one?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- code-reviews/StoreAndForward/findings.md | 54 +++++++++++++++++-- .../StoreAndForwardMessage.cs | 8 ++- .../StoreAndForwardService.cs | 11 ++-- .../StoreAndForwardServiceTests.cs | 42 ++++++++++++++- 4 files changed, 101 insertions(+), 14 deletions(-) diff --git a/code-reviews/StoreAndForward/findings.md b/code-reviews/StoreAndForward/findings.md index 2e9c996..2b8e27b 100644 --- a/code-reviews/StoreAndForward/findings.md +++ b/code-reviews/StoreAndForward/findings.md @@ -8,7 +8,7 @@ | Last reviewed | 2026-05-16 | | Reviewer | claude-agent | | Commit reviewed | `9c60592` | -| Open findings | 12 | +| Open findings | 11 | ## Summary @@ -94,7 +94,7 @@ commit whose message references `StoreAndForward-001`. | | | |--|--| -| Severity | High | +| Severity | ~~High~~ → Low (re-triaged) | | Category | Error handling & resilience | | Status | Open | | Location | `src/ScadaLink.StoreAndForward/StoreAndForwardService.cs:162`, `:201` | @@ -121,9 +121,39 @@ handler exists rather than silently buffering an undeliverable message, and wire registration is intended, the retry sweep should treat a still-missing handler as a transient condition with bounded logging rather than a permanent no-op. +**Re-triage note (2026-05-16)** + +The finding's central factual claim — *"No caller in the codebase ever calls +`RegisterDeliveryHandler`"* and therefore *"every buffered message lands in this dead +state"* — is **no longer true at the reviewed code**. `ScadaLink.Host` +(`AkkaHostedService.RegisterSiteActors`, `AkkaHostedService.cs:353-379`) registers all +three delivery handlers (`ExternalSystem`, `CachedDbWrite`, `Notification`) at site +startup, immediately after `StoreAndForwardService.StartAsync()`. The finding was +written against commit `9c60592` before that wiring existed; the High-severity +"engine cannot deliver anything" outcome no longer occurs. + +The remaining residual risk is narrow: a message enqueued for a category that genuinely +has no handler (e.g. an enqueue racing ahead of `RegisterDeliveryHandler`, or a future +category added without a handler) is still buffered and then skipped by the sweep +forever. That is a real but minor robustness gap, hence the **downgrade to Low**. + +It is left **Open** rather than fixed in this pass because the finding's recommended +fix — making `EnqueueAsync` reject when no handler is registered — is a behavioural +contract change, not a localised bug fix: the "buffer with no handler yet" path is +exercised by `StoreAndForwardReplicationTests` and by three NotificationService and +ExternalSystemGateway tests (`Send_TransientError_WithStoreAndForward_BuffersMessage`, +`Send_Smtp4xxCommandException_ClassifiedTransientAndBuffered`, +`Send_SmtpProtocolException_ClassifiedTransient`) which construct a real +`StoreAndForwardService` without registering a handler and assert `WasBuffered`. +Changing the contract requires deciding whether late handler registration is supported +and updating tests in modules outside this review's edit scope — a design decision that +should be made deliberately rather than forced here. + **Resolution** -_Unresolved._ +_Open — re-triaged to Low. Premise (no handler registration anywhere) is stale: Host +now wires all three handlers. Residual gap is minor and the prescribed fix is a +cross-module contract change needing a design decision._ ### StoreAndForward-003 — Off-by-one in retry accounting: immediate failure pre-counts as retry 1 @@ -131,7 +161,7 @@ _Unresolved._ |--|--| | Severity | High | | Category | Correctness & logic bugs | -| Status | Open | +| Status | Resolved | | Location | `src/ScadaLink.StoreAndForward/StoreAndForwardService.cs:153`, `:229`, `:233` | **Description** @@ -159,7 +189,21 @@ the comparison. Update the affected test to match the chosen semantics. **Resolution** -_Unresolved._ +Resolved 2026-05-16 (commit ``). `RetryCount` now consistently means "number +of background retry-sweep attempts so far"; the initial immediate (or caller-made) +delivery attempt is attempt 0 and is not counted, and `MaxRetries` bounds retry-sweep +attempts after that initial attempt. `EnqueueAsync` no longer seeds `RetryCount = 1` on +either the transient-immediate-failure path or the `attemptImmediateDelivery: false` +path — a freshly buffered message has `RetryCount = 0`. `RetryMessageAsync` already +increments before the `>= MaxRetries` check, which is now correct, so a message with +`MaxRetries = 1` gets exactly one real retry before parking (previously zero). The +`StoreAndForwardMessage.RetryCount` XML doc was corrected to match. Regression test +`RetryPendingMessagesAsync_MaxRetriesOne_PerformsExactlyOneRetryBeforeParking` asserts +the immediate attempt plus exactly one retry occur before parking; the affected +existing tests (`EnqueueAsync_TransientFailure_BuffersForRetry`, +`EnqueueAsync_AttemptImmediateDeliveryFalse_BuffersWithoutInvokingHandler`, +`RetryPendingMessagesAsync_MaxRetriesReached_ParksMessage`) were updated to the +corrected semantics. ### StoreAndForward-004 — `RegisterDeliveryHandler` XML doc contradicts the implemented contract diff --git a/src/ScadaLink.StoreAndForward/StoreAndForwardMessage.cs b/src/ScadaLink.StoreAndForward/StoreAndForwardMessage.cs index ab0a5cf..5a62445 100644 --- a/src/ScadaLink.StoreAndForward/StoreAndForwardMessage.cs +++ b/src/ScadaLink.StoreAndForward/StoreAndForwardMessage.cs @@ -20,10 +20,14 @@ public class StoreAndForwardMessage /// JSON-serialized payload containing the call details. public string PayloadJson { get; set; } = string.Empty; - /// Number of delivery attempts so far. + /// + /// Number of retry-sweep attempts performed so far. The initial (immediate or + /// caller-made) delivery attempt is attempt 0 and is not counted here; this + /// field counts only background retry attempts (StoreAndForward-003). + /// public int RetryCount { get; set; } - /// Maximum retry attempts before parking (0 = no limit). + /// Maximum retry-sweep attempts before parking (0 = no limit). public int MaxRetries { get; set; } /// Retry interval in milliseconds. diff --git a/src/ScadaLink.StoreAndForward/StoreAndForwardService.cs b/src/ScadaLink.StoreAndForward/StoreAndForwardService.cs index c342d2b..4fdac2f 100644 --- a/src/ScadaLink.StoreAndForward/StoreAndForwardService.cs +++ b/src/ScadaLink.StoreAndForward/StoreAndForwardService.cs @@ -148,13 +148,14 @@ public class StoreAndForwardService } catch (Exception ex) { - // Transient failure — buffer for retry + // Transient failure — buffer for retry. The immediate attempt is + // attempt 0; RetryCount tracks only sweep retries, so it stays 0 + // here (StoreAndForward-003). _logger.LogWarning(ex, "Immediate delivery to {Target} failed (transient), buffering for retry", target); message.LastAttemptAt = DateTimeOffset.UtcNow; - message.RetryCount = 1; message.LastError = ex.Message; await BufferAsync(message); @@ -165,11 +166,11 @@ public class StoreAndForwardService // Either no handler is registered yet, or the caller already attempted // delivery itself — buffer for the background retry sweep to deliver. + // The initial attempt (caller-made, or skipped because no handler is + // registered) is attempt 0; RetryCount tracks only sweep retries and + // therefore stays 0 here (StoreAndForward-003). if (!attemptImmediateDelivery) { - // The caller made (and failed) one attempt before handing the - // message over, so it counts as the first retry. - message.RetryCount = 1; message.LastAttemptAt = DateTimeOffset.UtcNow; } await BufferAsync(message); diff --git a/tests/ScadaLink.StoreAndForward.Tests/StoreAndForwardServiceTests.cs b/tests/ScadaLink.StoreAndForward.Tests/StoreAndForwardServiceTests.cs index 464354e..e7e4dda 100644 --- a/tests/ScadaLink.StoreAndForward.Tests/StoreAndForwardServiceTests.cs +++ b/tests/ScadaLink.StoreAndForward.Tests/StoreAndForwardServiceTests.cs @@ -86,7 +86,9 @@ public class StoreAndForwardServiceTests : IAsyncLifetime, IDisposable var msg = await _storage.GetMessageByIdAsync(result.MessageId); Assert.NotNull(msg); Assert.Equal(StoreAndForwardMessageStatus.Pending, msg!.Status); - Assert.Equal(1, msg.RetryCount); + // StoreAndForward-003: RetryCount counts sweep retries only; the immediate + // attempt is attempt 0, so a freshly buffered message has RetryCount 0. + Assert.Equal(0, msg.RetryCount); } [Fact] @@ -134,6 +136,12 @@ public class StoreAndForwardServiceTests : IAsyncLifetime, IDisposable StoreAndForwardCategory.ExternalSystem, "api", """{}""", maxRetries: 2); + // StoreAndForward-003: MaxRetries bounds sweep retries (not the immediate + // attempt), so a message with MaxRetries=2 needs two retry sweeps to park. + await _service.RetryPendingMessagesAsync(); + var afterFirst = await _storage.GetMessageByIdAsync(result.MessageId); + Assert.Equal(StoreAndForwardMessageStatus.Pending, afterFirst!.Status); + await _service.RetryPendingMessagesAsync(); var msg = await _storage.GetMessageByIdAsync(result.MessageId); @@ -141,6 +149,34 @@ public class StoreAndForwardServiceTests : IAsyncLifetime, IDisposable Assert.Equal(StoreAndForwardMessageStatus.Parked, msg!.Status); } + // ── StoreAndForward-003: retry-count accounting ── + + [Fact] + public async Task RetryPendingMessagesAsync_MaxRetriesOne_PerformsExactlyOneRetryBeforeParking() + { + // The immediate attempt is attempt 0; MaxRetries=1 must allow exactly one + // retry sweep before parking. The pre-fix off-by-one parked with zero retries. + var attempts = 0; + _service.RegisterDeliveryHandler(StoreAndForwardCategory.ExternalSystem, + _ => { Interlocked.Increment(ref attempts); throw new HttpRequestException("always fails"); }); + + var result = await _service.EnqueueAsync( + StoreAndForwardCategory.ExternalSystem, "api", """{}""", + maxRetries: 1); + + // After the immediate failed attempt the message is buffered, not parked. + var buffered = await _storage.GetMessageByIdAsync(result.MessageId); + Assert.Equal(StoreAndForwardMessageStatus.Pending, buffered!.Status); + Assert.Equal(1, attempts); // only the immediate attempt so far + + await _service.RetryPendingMessagesAsync(); + + var msg = await _storage.GetMessageByIdAsync(result.MessageId); + Assert.Equal(StoreAndForwardMessageStatus.Parked, msg!.Status); + Assert.Equal(2, attempts); // immediate attempt + exactly one retry + Assert.Equal(1, msg.RetryCount); // one sweep retry recorded + } + [Fact] public async Task RetryPendingMessagesAsync_PermanentFailureOnRetry_ParksMessage() { @@ -332,6 +368,8 @@ public class StoreAndForwardServiceTests : IAsyncLifetime, IDisposable var msg = await _storage.GetMessageByIdAsync(result.MessageId); Assert.NotNull(msg); Assert.Equal(StoreAndForwardMessageStatus.Pending, msg!.Status); - Assert.Equal(1, msg.RetryCount); // counts as the caller's first attempt + // StoreAndForward-003: the caller's own attempt is attempt 0; RetryCount + // counts only sweep retries, so a freshly buffered message has RetryCount 0. + Assert.Equal(0, msg.RetryCount); } }