Merge branch 'worktree-agent-ad34cad856c59bbc1' into feat/scripted-alarm-shelve-routing
This commit is contained in:
@@ -7,7 +7,7 @@
|
||||
| Review date | 2026-05-22 |
|
||||
| Commit reviewed | `76d35d1` |
|
||||
| Status | Reviewed |
|
||||
| Open findings | 11 |
|
||||
| Open findings | 7 |
|
||||
|
||||
## Checklist coverage
|
||||
|
||||
@@ -63,7 +63,7 @@ the reconnect path can re-open with `ReadOnly = false`) or at minimum as
|
||||
| Severity | Medium |
|
||||
| Category | Correctness and logic bugs |
|
||||
| Location | `Ipc/HistorianFrameHandler.cs:162`, `:181` |
|
||||
| Status | Open |
|
||||
| Status | Resolved |
|
||||
|
||||
**Description:** `HandleWriteAlarmEventsAsync` dereferences `req.Events.Length`
|
||||
in both the `_alarmWriter is null` branch (line 162) and the catch block (line
|
||||
@@ -79,7 +79,7 @@ already null-guards `events`; the frame handler does not.
|
||||
immediately after deserialization (or guard each `.Length` access), consistent
|
||||
with the null-tolerance the writer already has.
|
||||
|
||||
**Resolution:** _(open)_
|
||||
**Resolution:** Resolved 2026-05-22 — normalise `req.Events` to `Array.Empty<AlarmHistorianEventDto>()` immediately after deserialization so all subsequent `.Length` accesses are safe against null frames.
|
||||
|
||||
### Driver.Historian.Wonderware-003
|
||||
|
||||
@@ -88,7 +88,7 @@ with the null-tolerance the writer already has.
|
||||
| Severity | Medium |
|
||||
| Category | Correctness and logic bugs |
|
||||
| Location | `Backend/HistorianDataSource.cs:320-323`, `:457-460` |
|
||||
| Status | Open |
|
||||
| Status | Resolved |
|
||||
|
||||
**Description:** Raw and at-time reads decide whether a sample is a string or a
|
||||
numeric with `if (!string.IsNullOrEmpty(result.StringValue) && result.Value == 0)`.
|
||||
@@ -106,7 +106,7 @@ field rather than from `Value == 0`. If the type field is genuinely unavailable
|
||||
the bound SDK version, document the limitation explicitly and prefer numeric for
|
||||
analog/integer tags.
|
||||
|
||||
**Resolution:** _(open)_
|
||||
**Resolution:** Resolved 2026-05-22 — extracted the heuristic into a `SelectValue` helper with a detailed XML doc comment explaining the SDK limitation (`HistoryQueryResult` has no data type field in the bound `aahClientManaged` version); the existing `Value == 0` discriminator is preserved as the best available heuristic with the known edge-case documented.
|
||||
|
||||
### Driver.Historian.Wonderware-004
|
||||
|
||||
@@ -161,7 +161,7 @@ lock), so the snapshot is internally consistent.
|
||||
| Severity | Medium |
|
||||
| Category | Error handling and resilience |
|
||||
| Location | `Ipc/PipeServer.cs:120-128` |
|
||||
| Status | Open |
|
||||
| Status | Resolved |
|
||||
|
||||
**Description:** `RunAsync` re-accepts connections in a `while` loop. If
|
||||
`RunOneConnectionAsync` throws synchronously and immediately on every iteration
|
||||
@@ -175,7 +175,7 @@ seconds) before re-accepting after a caught exception, and consider a
|
||||
consecutive-failure threshold that escalates to a fatal exit so the supervisor can
|
||||
restart the sidecar cleanly.
|
||||
|
||||
**Resolution:** _(open)_
|
||||
**Resolution:** Resolved 2026-05-22 — added exponential backoff (250 ms → 8 s, six steps) after each connection-loop failure and a `MaxConsecutiveFailures=20` threshold that re-throws so the SCM/NSSM supervisor can restart the sidecar cleanly.
|
||||
|
||||
### Driver.Historian.Wonderware-007
|
||||
|
||||
@@ -235,7 +235,7 @@ treat an SDK error as an empty history.
|
||||
| Severity | Medium |
|
||||
| Category | Performance and resource management |
|
||||
| Location | `Backend/HistorianDataSource.cs:382-395`, `Ipc/Contracts.cs:85-99` |
|
||||
| Status | Open |
|
||||
| Status | Resolved |
|
||||
|
||||
**Description:** `ReadAggregateAsync` drains `query.MoveNext` into `results` with
|
||||
no upper bound, unlike `ReadRawAsync`, which honours `maxValues` /
|
||||
@@ -252,7 +252,7 @@ sidecar holds the whole result set in memory.
|
||||
`ReadProcessedRequest`. Reject or truncate result sets that would exceed the frame
|
||||
cap with an explicit error reply rather than letting `WriteAsync` throw.
|
||||
|
||||
**Resolution:** _(open)_
|
||||
**Resolution:** Resolved 2026-05-22 — applied `_config.MaxValuesPerRead` as a bucket cap in `ReadAggregateAsync` mirroring the raw-read path; truncation logs a Warning with the limit and a hint to widen `IntervalMs` or reduce the time range.
|
||||
|
||||
### Driver.Historian.Wonderware-010
|
||||
|
||||
|
||||
@@ -316,15 +316,9 @@ namespace ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Backend
|
||||
var result = query.QueryResult;
|
||||
var timestamp = DateTime.SpecifyKind(result.StartDateTime, DateTimeKind.Utc);
|
||||
|
||||
object? value;
|
||||
if (!string.IsNullOrEmpty(result.StringValue) && result.Value == 0)
|
||||
value = result.StringValue;
|
||||
else
|
||||
value = result.Value;
|
||||
|
||||
results.Add(new HistorianSample
|
||||
{
|
||||
Value = value,
|
||||
Value = SelectValue(result),
|
||||
TimestampUtc = timestamp,
|
||||
Quality = (byte)(result.OpcQuality & 0xFF),
|
||||
});
|
||||
@@ -379,6 +373,12 @@ namespace ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Backend
|
||||
return Task.FromResult(results);
|
||||
}
|
||||
|
||||
// Apply the same bucket cap as the raw-read path so a wide time range with a
|
||||
// small IntervalMs cannot produce an unbounded result set that would overflow
|
||||
// the 16 MiB FrameWriter frame cap and lose the entire reply.
|
||||
var bucketLimit = _config.MaxValuesPerRead;
|
||||
var bucketCount = 0;
|
||||
|
||||
while (query.MoveNext(out error))
|
||||
{
|
||||
ct.ThrowIfCancellationRequested();
|
||||
@@ -392,6 +392,15 @@ namespace ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Backend
|
||||
Value = value,
|
||||
TimestampUtc = timestamp,
|
||||
});
|
||||
|
||||
bucketCount++;
|
||||
if (bucketLimit > 0 && bucketCount >= bucketLimit)
|
||||
{
|
||||
Log.Warning(
|
||||
"HistoryRead aggregate ({Aggregate}): {Tag} truncated at {Limit} buckets — widen IntervalMs or reduce time range",
|
||||
aggregateColumn, tagName, bucketLimit);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
query.EndQuery(out _);
|
||||
@@ -453,15 +462,9 @@ namespace ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Backend
|
||||
if (query.MoveNext(out error))
|
||||
{
|
||||
var result = query.QueryResult;
|
||||
object? value;
|
||||
if (!string.IsNullOrEmpty(result.StringValue) && result.Value == 0)
|
||||
value = result.StringValue;
|
||||
else
|
||||
value = result.Value;
|
||||
|
||||
results.Add(new HistorianSample
|
||||
{
|
||||
Value = value,
|
||||
Value = SelectValue(result),
|
||||
TimestampUtc = DateTime.SpecifyKind(timestamp, DateTimeKind.Utc),
|
||||
Quality = (byte)(result.OpcQuality & 0xFF),
|
||||
});
|
||||
@@ -574,6 +577,29 @@ namespace ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Backend
|
||||
#pragma warning restore CS0618
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Selects the typed value from a <see cref="HistoryQueryResult"/> row.
|
||||
/// <para>
|
||||
/// <b>SDK limitation:</b> <c>HistoryQueryResult</c> exposes only <c>Value</c>
|
||||
/// (double) and <c>StringValue</c> (string) — there is no tag data-type field on
|
||||
/// the result. The correct approach would be to branch on the tag's declared
|
||||
/// data type, but the bound version of <c>aahClientManaged</c> does not surface
|
||||
/// it per query result. The heuristic below is the best available: prefer
|
||||
/// <c>StringValue</c> only when it is non-empty AND <c>Value</c> is zero,
|
||||
/// because string tags in the Historian SDK always project to <c>Value=0</c>
|
||||
/// while numeric tags may legitimately sample to zero (in which case the SDK
|
||||
/// does not populate <c>StringValue</c>). A numeric tag at exactly zero with a
|
||||
/// non-empty formatted <c>StringValue</c> (e.g. "0.00") would be mis-reported
|
||||
/// as a string; this is a known edge case of the SDK binding.
|
||||
/// </para>
|
||||
/// </summary>
|
||||
private static object? SelectValue(HistoryQueryResult result)
|
||||
{
|
||||
if (!string.IsNullOrEmpty(result.StringValue) && result.Value == 0)
|
||||
return result.StringValue;
|
||||
return result.Value;
|
||||
}
|
||||
|
||||
internal static double? ExtractAggregateValue(AnalogSummaryQueryResult result, string column)
|
||||
{
|
||||
switch (column)
|
||||
|
||||
@@ -153,6 +153,11 @@ public sealed class HistorianFrameHandler : IFrameHandler
|
||||
private async Task HandleWriteAlarmEventsAsync(byte[] body, FrameWriter writer, CancellationToken ct)
|
||||
{
|
||||
var req = MessagePackSerializer.Deserialize<WriteAlarmEventsRequest>(body);
|
||||
|
||||
// MessagePack deserializes an absent or explicit-nil array as null, not Array.Empty.
|
||||
// Normalise here so every path below can safely dereference .Length without an NRE.
|
||||
req.Events ??= Array.Empty<AlarmHistorianEventDto>();
|
||||
|
||||
var reply = new WriteAlarmEventsReply { CorrelationId = req.CorrelationId };
|
||||
|
||||
if (_alarmWriter is null)
|
||||
|
||||
@@ -113,17 +113,62 @@ public sealed class PipeServer : IDisposable
|
||||
}
|
||||
}
|
||||
|
||||
// Backoff sequence for consecutive RunOneConnection failures: 250 ms → 500 ms →
|
||||
// 1 000 ms → 2 000 ms → 4 000 ms → capped at 8 000 ms thereafter.
|
||||
private static readonly TimeSpan[] BackoffSteps =
|
||||
{
|
||||
TimeSpan.FromMilliseconds(250),
|
||||
TimeSpan.FromMilliseconds(500),
|
||||
TimeSpan.FromSeconds(1),
|
||||
TimeSpan.FromSeconds(2),
|
||||
TimeSpan.FromSeconds(4),
|
||||
TimeSpan.FromSeconds(8),
|
||||
};
|
||||
|
||||
/// <summary>
|
||||
/// Maximum consecutive failures before the server gives up and lets the process exit
|
||||
/// so the supervisor (NSSM / SCM) can restart the sidecar cleanly.
|
||||
/// </summary>
|
||||
private const int MaxConsecutiveFailures = 20;
|
||||
|
||||
/// <summary>
|
||||
/// Runs the server continuously, handling one connection at a time. When a connection
|
||||
/// ends (clean or error), accepts the next.
|
||||
/// ends (clean or error), waits with exponential backoff before accepting the next.
|
||||
/// If <see cref="MaxConsecutiveFailures"/> consecutive failures occur the method
|
||||
/// throws so the supervisor can restart the sidecar.
|
||||
/// </summary>
|
||||
public async Task RunAsync(IFrameHandler handler, CancellationToken ct)
|
||||
{
|
||||
var consecutiveFailures = 0;
|
||||
|
||||
while (!ct.IsCancellationRequested)
|
||||
{
|
||||
try { await RunOneConnectionAsync(handler, ct).ConfigureAwait(false); }
|
||||
try
|
||||
{
|
||||
await RunOneConnectionAsync(handler, ct).ConfigureAwait(false);
|
||||
consecutiveFailures = 0; // a clean connection (even a short-lived one) resets the counter
|
||||
}
|
||||
catch (OperationCanceledException) { break; }
|
||||
catch (Exception ex) { _logger.Error(ex, "Sidecar IPC connection loop error — accepting next"); }
|
||||
catch (Exception ex)
|
||||
{
|
||||
consecutiveFailures++;
|
||||
|
||||
if (consecutiveFailures >= MaxConsecutiveFailures)
|
||||
{
|
||||
_logger.Fatal(ex,
|
||||
"Sidecar IPC connection loop failed {Count} consecutive times — giving up so supervisor can restart",
|
||||
consecutiveFailures);
|
||||
throw;
|
||||
}
|
||||
|
||||
var delay = BackoffSteps[Math.Min(consecutiveFailures - 1, BackoffSteps.Length - 1)];
|
||||
_logger.Error(ex,
|
||||
"Sidecar IPC connection loop error (consecutive failure {Count}/{Max}) — retrying in {Delay}",
|
||||
consecutiveFailures, MaxConsecutiveFailures, delay);
|
||||
|
||||
try { await Task.Delay(delay, ct).ConfigureAwait(false); }
|
||||
catch (OperationCanceledException) { break; }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user