fix(driver-historian-wonderware): resolve Medium code-review finding (Driver.Historian.Wonderware-006)
Add exponential backoff (250 ms → 500 ms → 1 s → 2 s → 4 s → 8 s cap) to PipeServer.RunAsync after each connection-loop exception, replacing the spin loop that previously pegged a CPU core and flooded the log on persistent errors such as a duplicate pipe name or a failing PipeAcl.Create. After 20 consecutive failures the method re-throws so the SCM / NSSM supervisor can restart the sidecar cleanly. A clean connection (even a short-lived one) resets the counter. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -161,7 +161,7 @@ lock), so the snapshot is internally consistent.
|
||||
| Severity | Medium |
|
||||
| Category | Error handling and resilience |
|
||||
| Location | `Ipc/PipeServer.cs:120-128` |
|
||||
| Status | Open |
|
||||
| Status | Resolved |
|
||||
|
||||
**Description:** `RunAsync` re-accepts connections in a `while` loop. If
|
||||
`RunOneConnectionAsync` throws synchronously and immediately on every iteration
|
||||
@@ -175,7 +175,7 @@ seconds) before re-accepting after a caught exception, and consider a
|
||||
consecutive-failure threshold that escalates to a fatal exit so the supervisor can
|
||||
restart the sidecar cleanly.
|
||||
|
||||
**Resolution:** _(open)_
|
||||
**Resolution:** Resolved 2026-05-22 — added exponential backoff (250 ms → 8 s, six steps) after each connection-loop failure and a `MaxConsecutiveFailures=20` threshold that re-throws so the SCM/NSSM supervisor can restart the sidecar cleanly.
|
||||
|
||||
### Driver.Historian.Wonderware-007
|
||||
|
||||
|
||||
@@ -113,17 +113,62 @@ public sealed class PipeServer : IDisposable
|
||||
}
|
||||
}
|
||||
|
||||
// Backoff sequence for consecutive RunOneConnection failures: 250 ms → 500 ms →
|
||||
// 1 000 ms → 2 000 ms → 4 000 ms → capped at 8 000 ms thereafter.
|
||||
private static readonly TimeSpan[] BackoffSteps =
|
||||
{
|
||||
TimeSpan.FromMilliseconds(250),
|
||||
TimeSpan.FromMilliseconds(500),
|
||||
TimeSpan.FromSeconds(1),
|
||||
TimeSpan.FromSeconds(2),
|
||||
TimeSpan.FromSeconds(4),
|
||||
TimeSpan.FromSeconds(8),
|
||||
};
|
||||
|
||||
/// <summary>
|
||||
/// Maximum consecutive failures before the server gives up and lets the process exit
|
||||
/// so the supervisor (NSSM / SCM) can restart the sidecar cleanly.
|
||||
/// </summary>
|
||||
private const int MaxConsecutiveFailures = 20;
|
||||
|
||||
/// <summary>
|
||||
/// Runs the server continuously, handling one connection at a time. When a connection
|
||||
/// ends (clean or error), accepts the next.
|
||||
/// ends (clean or error), waits with exponential backoff before accepting the next.
|
||||
/// If <see cref="MaxConsecutiveFailures"/> consecutive failures occur the method
|
||||
/// throws so the supervisor can restart the sidecar.
|
||||
/// </summary>
|
||||
public async Task RunAsync(IFrameHandler handler, CancellationToken ct)
|
||||
{
|
||||
var consecutiveFailures = 0;
|
||||
|
||||
while (!ct.IsCancellationRequested)
|
||||
{
|
||||
try { await RunOneConnectionAsync(handler, ct).ConfigureAwait(false); }
|
||||
try
|
||||
{
|
||||
await RunOneConnectionAsync(handler, ct).ConfigureAwait(false);
|
||||
consecutiveFailures = 0; // a clean connection (even a short-lived one) resets the counter
|
||||
}
|
||||
catch (OperationCanceledException) { break; }
|
||||
catch (Exception ex) { _logger.Error(ex, "Sidecar IPC connection loop error — accepting next"); }
|
||||
catch (Exception ex)
|
||||
{
|
||||
consecutiveFailures++;
|
||||
|
||||
if (consecutiveFailures >= MaxConsecutiveFailures)
|
||||
{
|
||||
_logger.Fatal(ex,
|
||||
"Sidecar IPC connection loop failed {Count} consecutive times — giving up so supervisor can restart",
|
||||
consecutiveFailures);
|
||||
throw;
|
||||
}
|
||||
|
||||
var delay = BackoffSteps[Math.Min(consecutiveFailures - 1, BackoffSteps.Length - 1)];
|
||||
_logger.Error(ex,
|
||||
"Sidecar IPC connection loop error (consecutive failure {Count}/{Max}) — retrying in {Delay}",
|
||||
consecutiveFailures, MaxConsecutiveFailures, delay);
|
||||
|
||||
try { await Task.Delay(delay, ct).ConfigureAwait(false); }
|
||||
catch (OperationCanceledException) { break; }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user