fix(driver-historian-wonderware): resolve Medium code-review finding (Driver.Historian.Wonderware-006)
Add exponential backoff (250 ms → 500 ms → 1 s → 2 s → 4 s → 8 s cap) to PipeServer.RunAsync after each connection-loop exception, replacing the spin loop that previously pegged a CPU core and flooded the log on persistent errors such as a duplicate pipe name or a failing PipeAcl.Create. After 20 consecutive failures the method re-throws so the SCM / NSSM supervisor can restart the sidecar cleanly. A clean connection (even a short-lived one) resets the counter. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -161,7 +161,7 @@ lock), so the snapshot is internally consistent.
|
|||||||
| Severity | Medium |
|
| Severity | Medium |
|
||||||
| Category | Error handling and resilience |
|
| Category | Error handling and resilience |
|
||||||
| Location | `Ipc/PipeServer.cs:120-128` |
|
| Location | `Ipc/PipeServer.cs:120-128` |
|
||||||
| Status | Open |
|
| Status | Resolved |
|
||||||
|
|
||||||
**Description:** `RunAsync` re-accepts connections in a `while` loop. If
|
**Description:** `RunAsync` re-accepts connections in a `while` loop. If
|
||||||
`RunOneConnectionAsync` throws synchronously and immediately on every iteration
|
`RunOneConnectionAsync` throws synchronously and immediately on every iteration
|
||||||
@@ -175,7 +175,7 @@ seconds) before re-accepting after a caught exception, and consider a
|
|||||||
consecutive-failure threshold that escalates to a fatal exit so the supervisor can
|
consecutive-failure threshold that escalates to a fatal exit so the supervisor can
|
||||||
restart the sidecar cleanly.
|
restart the sidecar cleanly.
|
||||||
|
|
||||||
**Resolution:** _(open)_
|
**Resolution:** Resolved 2026-05-22 — added exponential backoff (250 ms → 8 s, six steps) after each connection-loop failure and a `MaxConsecutiveFailures=20` threshold that re-throws so the SCM/NSSM supervisor can restart the sidecar cleanly.
|
||||||
|
|
||||||
### Driver.Historian.Wonderware-007
|
### Driver.Historian.Wonderware-007
|
||||||
|
|
||||||
|
|||||||
@@ -113,17 +113,62 @@ public sealed class PipeServer : IDisposable
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Backoff sequence for consecutive RunOneConnection failures: 250 ms → 500 ms →
|
||||||
|
// 1 000 ms → 2 000 ms → 4 000 ms → capped at 8 000 ms thereafter.
|
||||||
|
private static readonly TimeSpan[] BackoffSteps =
|
||||||
|
{
|
||||||
|
TimeSpan.FromMilliseconds(250),
|
||||||
|
TimeSpan.FromMilliseconds(500),
|
||||||
|
TimeSpan.FromSeconds(1),
|
||||||
|
TimeSpan.FromSeconds(2),
|
||||||
|
TimeSpan.FromSeconds(4),
|
||||||
|
TimeSpan.FromSeconds(8),
|
||||||
|
};
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// Maximum consecutive failures before the server gives up and lets the process exit
|
||||||
|
/// so the supervisor (NSSM / SCM) can restart the sidecar cleanly.
|
||||||
|
/// </summary>
|
||||||
|
private const int MaxConsecutiveFailures = 20;
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Runs the server continuously, handling one connection at a time. When a connection
|
/// Runs the server continuously, handling one connection at a time. When a connection
|
||||||
/// ends (clean or error), accepts the next.
|
/// ends (clean or error), waits with exponential backoff before accepting the next.
|
||||||
|
/// If <see cref="MaxConsecutiveFailures"/> consecutive failures occur the method
|
||||||
|
/// throws so the supervisor can restart the sidecar.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
public async Task RunAsync(IFrameHandler handler, CancellationToken ct)
|
public async Task RunAsync(IFrameHandler handler, CancellationToken ct)
|
||||||
{
|
{
|
||||||
|
var consecutiveFailures = 0;
|
||||||
|
|
||||||
while (!ct.IsCancellationRequested)
|
while (!ct.IsCancellationRequested)
|
||||||
{
|
{
|
||||||
try { await RunOneConnectionAsync(handler, ct).ConfigureAwait(false); }
|
try
|
||||||
|
{
|
||||||
|
await RunOneConnectionAsync(handler, ct).ConfigureAwait(false);
|
||||||
|
consecutiveFailures = 0; // a clean connection (even a short-lived one) resets the counter
|
||||||
|
}
|
||||||
catch (OperationCanceledException) { break; }
|
catch (OperationCanceledException) { break; }
|
||||||
catch (Exception ex) { _logger.Error(ex, "Sidecar IPC connection loop error — accepting next"); }
|
catch (Exception ex)
|
||||||
|
{
|
||||||
|
consecutiveFailures++;
|
||||||
|
|
||||||
|
if (consecutiveFailures >= MaxConsecutiveFailures)
|
||||||
|
{
|
||||||
|
_logger.Fatal(ex,
|
||||||
|
"Sidecar IPC connection loop failed {Count} consecutive times — giving up so supervisor can restart",
|
||||||
|
consecutiveFailures);
|
||||||
|
throw;
|
||||||
|
}
|
||||||
|
|
||||||
|
var delay = BackoffSteps[Math.Min(consecutiveFailures - 1, BackoffSteps.Length - 1)];
|
||||||
|
_logger.Error(ex,
|
||||||
|
"Sidecar IPC connection loop error (consecutive failure {Count}/{Max}) — retrying in {Delay}",
|
||||||
|
consecutiveFailures, MaxConsecutiveFailures, delay);
|
||||||
|
|
||||||
|
try { await Task.Delay(delay, ct).ConfigureAwait(false); }
|
||||||
|
catch (OperationCanceledException) { break; }
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user