diff --git a/code-reviews/Driver.Historian.Wonderware/findings.md b/code-reviews/Driver.Historian.Wonderware/findings.md index d154c7b..4052a95 100644 --- a/code-reviews/Driver.Historian.Wonderware/findings.md +++ b/code-reviews/Driver.Historian.Wonderware/findings.md @@ -161,7 +161,7 @@ lock), so the snapshot is internally consistent. | Severity | Medium | | Category | Error handling and resilience | | Location | `Ipc/PipeServer.cs:120-128` | -| Status | Open | +| Status | Resolved | **Description:** `RunAsync` re-accepts connections in a `while` loop. If `RunOneConnectionAsync` throws synchronously and immediately on every iteration @@ -175,7 +175,7 @@ seconds) before re-accepting after a caught exception, and consider a consecutive-failure threshold that escalates to a fatal exit so the supervisor can restart the sidecar cleanly. -**Resolution:** _(open)_ +**Resolution:** Resolved 2026-05-22 — added exponential backoff (250 ms → 8 s, six steps) after each connection-loop failure and a `MaxConsecutiveFailures=20` threshold that re-throws so the SCM/NSSM supervisor can restart the sidecar cleanly. ### Driver.Historian.Wonderware-007 diff --git a/src/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware/Ipc/PipeServer.cs b/src/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware/Ipc/PipeServer.cs index 8737bc9..d20fe49 100644 --- a/src/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware/Ipc/PipeServer.cs +++ b/src/Drivers/ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware/Ipc/PipeServer.cs @@ -113,17 +113,62 @@ public sealed class PipeServer : IDisposable } } + // Backoff sequence for consecutive RunOneConnection failures: 250 ms → 500 ms → + // 1 000 ms → 2 000 ms → 4 000 ms → capped at 8 000 ms thereafter. + private static readonly TimeSpan[] BackoffSteps = + { + TimeSpan.FromMilliseconds(250), + TimeSpan.FromMilliseconds(500), + TimeSpan.FromSeconds(1), + TimeSpan.FromSeconds(2), + TimeSpan.FromSeconds(4), + TimeSpan.FromSeconds(8), + }; + + /// + /// Maximum consecutive failures before the server gives up and lets the process exit + /// so the supervisor (NSSM / SCM) can restart the sidecar cleanly. + /// + private const int MaxConsecutiveFailures = 20; + /// /// Runs the server continuously, handling one connection at a time. When a connection - /// ends (clean or error), accepts the next. + /// ends (clean or error), waits with exponential backoff before accepting the next. + /// If consecutive failures occur the method + /// throws so the supervisor can restart the sidecar. /// public async Task RunAsync(IFrameHandler handler, CancellationToken ct) { + var consecutiveFailures = 0; + while (!ct.IsCancellationRequested) { - try { await RunOneConnectionAsync(handler, ct).ConfigureAwait(false); } + try + { + await RunOneConnectionAsync(handler, ct).ConfigureAwait(false); + consecutiveFailures = 0; // a clean connection (even a short-lived one) resets the counter + } catch (OperationCanceledException) { break; } - catch (Exception ex) { _logger.Error(ex, "Sidecar IPC connection loop error — accepting next"); } + catch (Exception ex) + { + consecutiveFailures++; + + if (consecutiveFailures >= MaxConsecutiveFailures) + { + _logger.Fatal(ex, + "Sidecar IPC connection loop failed {Count} consecutive times — giving up so supervisor can restart", + consecutiveFailures); + throw; + } + + var delay = BackoffSteps[Math.Min(consecutiveFailures - 1, BackoffSteps.Length - 1)]; + _logger.Error(ex, + "Sidecar IPC connection loop error (consecutive failure {Count}/{Max}) — retrying in {Delay}", + consecutiveFailures, MaxConsecutiveFailures, delay); + + try { await Task.Delay(delay, ct).ConfigureAwait(false); } + catch (OperationCanceledException) { break; } + } } }