Merge branch 'worktree-agent-ad34cad856c59bbc1' into feat/scripted-alarm-shelve-routing
This commit is contained in:
@@ -7,7 +7,7 @@
|
|||||||
| Review date | 2026-05-22 |
|
| Review date | 2026-05-22 |
|
||||||
| Commit reviewed | `76d35d1` |
|
| Commit reviewed | `76d35d1` |
|
||||||
| Status | Reviewed |
|
| Status | Reviewed |
|
||||||
| Open findings | 11 |
|
| Open findings | 7 |
|
||||||
|
|
||||||
## Checklist coverage
|
## Checklist coverage
|
||||||
|
|
||||||
@@ -63,7 +63,7 @@ the reconnect path can re-open with `ReadOnly = false`) or at minimum as
|
|||||||
| Severity | Medium |
|
| Severity | Medium |
|
||||||
| Category | Correctness and logic bugs |
|
| Category | Correctness and logic bugs |
|
||||||
| Location | `Ipc/HistorianFrameHandler.cs:162`, `:181` |
|
| Location | `Ipc/HistorianFrameHandler.cs:162`, `:181` |
|
||||||
| Status | Open |
|
| Status | Resolved |
|
||||||
|
|
||||||
**Description:** `HandleWriteAlarmEventsAsync` dereferences `req.Events.Length`
|
**Description:** `HandleWriteAlarmEventsAsync` dereferences `req.Events.Length`
|
||||||
in both the `_alarmWriter is null` branch (line 162) and the catch block (line
|
in both the `_alarmWriter is null` branch (line 162) and the catch block (line
|
||||||
@@ -79,7 +79,7 @@ already null-guards `events`; the frame handler does not.
|
|||||||
immediately after deserialization (or guard each `.Length` access), consistent
|
immediately after deserialization (or guard each `.Length` access), consistent
|
||||||
with the null-tolerance the writer already has.
|
with the null-tolerance the writer already has.
|
||||||
|
|
||||||
**Resolution:** _(open)_
|
**Resolution:** Resolved 2026-05-22 — normalise `req.Events` to `Array.Empty<AlarmHistorianEventDto>()` immediately after deserialization so all subsequent `.Length` accesses are safe against null frames.
|
||||||
|
|
||||||
### Driver.Historian.Wonderware-003
|
### Driver.Historian.Wonderware-003
|
||||||
|
|
||||||
@@ -88,7 +88,7 @@ with the null-tolerance the writer already has.
|
|||||||
| Severity | Medium |
|
| Severity | Medium |
|
||||||
| Category | Correctness and logic bugs |
|
| Category | Correctness and logic bugs |
|
||||||
| Location | `Backend/HistorianDataSource.cs:320-323`, `:457-460` |
|
| Location | `Backend/HistorianDataSource.cs:320-323`, `:457-460` |
|
||||||
| Status | Open |
|
| Status | Resolved |
|
||||||
|
|
||||||
**Description:** Raw and at-time reads decide whether a sample is a string or a
|
**Description:** Raw and at-time reads decide whether a sample is a string or a
|
||||||
numeric with `if (!string.IsNullOrEmpty(result.StringValue) && result.Value == 0)`.
|
numeric with `if (!string.IsNullOrEmpty(result.StringValue) && result.Value == 0)`.
|
||||||
@@ -106,7 +106,7 @@ field rather than from `Value == 0`. If the type field is genuinely unavailable
|
|||||||
the bound SDK version, document the limitation explicitly and prefer numeric for
|
the bound SDK version, document the limitation explicitly and prefer numeric for
|
||||||
analog/integer tags.
|
analog/integer tags.
|
||||||
|
|
||||||
**Resolution:** _(open)_
|
**Resolution:** Resolved 2026-05-22 — extracted the heuristic into a `SelectValue` helper with a detailed XML doc comment explaining the SDK limitation (`HistoryQueryResult` has no data type field in the bound `aahClientManaged` version); the existing `Value == 0` discriminator is preserved as the best available heuristic with the known edge-case documented.
|
||||||
|
|
||||||
### Driver.Historian.Wonderware-004
|
### Driver.Historian.Wonderware-004
|
||||||
|
|
||||||
@@ -161,7 +161,7 @@ lock), so the snapshot is internally consistent.
|
|||||||
| Severity | Medium |
|
| Severity | Medium |
|
||||||
| Category | Error handling and resilience |
|
| Category | Error handling and resilience |
|
||||||
| Location | `Ipc/PipeServer.cs:120-128` |
|
| Location | `Ipc/PipeServer.cs:120-128` |
|
||||||
| Status | Open |
|
| Status | Resolved |
|
||||||
|
|
||||||
**Description:** `RunAsync` re-accepts connections in a `while` loop. If
|
**Description:** `RunAsync` re-accepts connections in a `while` loop. If
|
||||||
`RunOneConnectionAsync` throws synchronously and immediately on every iteration
|
`RunOneConnectionAsync` throws synchronously and immediately on every iteration
|
||||||
@@ -175,7 +175,7 @@ seconds) before re-accepting after a caught exception, and consider a
|
|||||||
consecutive-failure threshold that escalates to a fatal exit so the supervisor can
|
consecutive-failure threshold that escalates to a fatal exit so the supervisor can
|
||||||
restart the sidecar cleanly.
|
restart the sidecar cleanly.
|
||||||
|
|
||||||
**Resolution:** _(open)_
|
**Resolution:** Resolved 2026-05-22 — added exponential backoff (250 ms → 8 s, six steps) after each connection-loop failure and a `MaxConsecutiveFailures=20` threshold that re-throws so the SCM/NSSM supervisor can restart the sidecar cleanly.
|
||||||
|
|
||||||
### Driver.Historian.Wonderware-007
|
### Driver.Historian.Wonderware-007
|
||||||
|
|
||||||
@@ -235,7 +235,7 @@ treat an SDK error as an empty history.
|
|||||||
| Severity | Medium |
|
| Severity | Medium |
|
||||||
| Category | Performance and resource management |
|
| Category | Performance and resource management |
|
||||||
| Location | `Backend/HistorianDataSource.cs:382-395`, `Ipc/Contracts.cs:85-99` |
|
| Location | `Backend/HistorianDataSource.cs:382-395`, `Ipc/Contracts.cs:85-99` |
|
||||||
| Status | Open |
|
| Status | Resolved |
|
||||||
|
|
||||||
**Description:** `ReadAggregateAsync` drains `query.MoveNext` into `results` with
|
**Description:** `ReadAggregateAsync` drains `query.MoveNext` into `results` with
|
||||||
no upper bound, unlike `ReadRawAsync`, which honours `maxValues` /
|
no upper bound, unlike `ReadRawAsync`, which honours `maxValues` /
|
||||||
@@ -252,7 +252,7 @@ sidecar holds the whole result set in memory.
|
|||||||
`ReadProcessedRequest`. Reject or truncate result sets that would exceed the frame
|
`ReadProcessedRequest`. Reject or truncate result sets that would exceed the frame
|
||||||
cap with an explicit error reply rather than letting `WriteAsync` throw.
|
cap with an explicit error reply rather than letting `WriteAsync` throw.
|
||||||
|
|
||||||
**Resolution:** _(open)_
|
**Resolution:** Resolved 2026-05-22 — applied `_config.MaxValuesPerRead` as a bucket cap in `ReadAggregateAsync` mirroring the raw-read path; truncation logs a Warning with the limit and a hint to widen `IntervalMs` or reduce the time range.
|
||||||
|
|
||||||
### Driver.Historian.Wonderware-010
|
### Driver.Historian.Wonderware-010
|
||||||
|
|
||||||
|
|||||||
@@ -316,15 +316,9 @@ namespace ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Backend
|
|||||||
var result = query.QueryResult;
|
var result = query.QueryResult;
|
||||||
var timestamp = DateTime.SpecifyKind(result.StartDateTime, DateTimeKind.Utc);
|
var timestamp = DateTime.SpecifyKind(result.StartDateTime, DateTimeKind.Utc);
|
||||||
|
|
||||||
object? value;
|
|
||||||
if (!string.IsNullOrEmpty(result.StringValue) && result.Value == 0)
|
|
||||||
value = result.StringValue;
|
|
||||||
else
|
|
||||||
value = result.Value;
|
|
||||||
|
|
||||||
results.Add(new HistorianSample
|
results.Add(new HistorianSample
|
||||||
{
|
{
|
||||||
Value = value,
|
Value = SelectValue(result),
|
||||||
TimestampUtc = timestamp,
|
TimestampUtc = timestamp,
|
||||||
Quality = (byte)(result.OpcQuality & 0xFF),
|
Quality = (byte)(result.OpcQuality & 0xFF),
|
||||||
});
|
});
|
||||||
@@ -379,6 +373,12 @@ namespace ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Backend
|
|||||||
return Task.FromResult(results);
|
return Task.FromResult(results);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Apply the same bucket cap as the raw-read path so a wide time range with a
|
||||||
|
// small IntervalMs cannot produce an unbounded result set that would overflow
|
||||||
|
// the 16 MiB FrameWriter frame cap and lose the entire reply.
|
||||||
|
var bucketLimit = _config.MaxValuesPerRead;
|
||||||
|
var bucketCount = 0;
|
||||||
|
|
||||||
while (query.MoveNext(out error))
|
while (query.MoveNext(out error))
|
||||||
{
|
{
|
||||||
ct.ThrowIfCancellationRequested();
|
ct.ThrowIfCancellationRequested();
|
||||||
@@ -392,6 +392,15 @@ namespace ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Backend
|
|||||||
Value = value,
|
Value = value,
|
||||||
TimestampUtc = timestamp,
|
TimestampUtc = timestamp,
|
||||||
});
|
});
|
||||||
|
|
||||||
|
bucketCount++;
|
||||||
|
if (bucketLimit > 0 && bucketCount >= bucketLimit)
|
||||||
|
{
|
||||||
|
Log.Warning(
|
||||||
|
"HistoryRead aggregate ({Aggregate}): {Tag} truncated at {Limit} buckets — widen IntervalMs or reduce time range",
|
||||||
|
aggregateColumn, tagName, bucketLimit);
|
||||||
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
query.EndQuery(out _);
|
query.EndQuery(out _);
|
||||||
@@ -453,15 +462,9 @@ namespace ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Backend
|
|||||||
if (query.MoveNext(out error))
|
if (query.MoveNext(out error))
|
||||||
{
|
{
|
||||||
var result = query.QueryResult;
|
var result = query.QueryResult;
|
||||||
object? value;
|
|
||||||
if (!string.IsNullOrEmpty(result.StringValue) && result.Value == 0)
|
|
||||||
value = result.StringValue;
|
|
||||||
else
|
|
||||||
value = result.Value;
|
|
||||||
|
|
||||||
results.Add(new HistorianSample
|
results.Add(new HistorianSample
|
||||||
{
|
{
|
||||||
Value = value,
|
Value = SelectValue(result),
|
||||||
TimestampUtc = DateTime.SpecifyKind(timestamp, DateTimeKind.Utc),
|
TimestampUtc = DateTime.SpecifyKind(timestamp, DateTimeKind.Utc),
|
||||||
Quality = (byte)(result.OpcQuality & 0xFF),
|
Quality = (byte)(result.OpcQuality & 0xFF),
|
||||||
});
|
});
|
||||||
@@ -574,6 +577,29 @@ namespace ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Backend
|
|||||||
#pragma warning restore CS0618
|
#pragma warning restore CS0618
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// Selects the typed value from a <see cref="HistoryQueryResult"/> row.
|
||||||
|
/// <para>
|
||||||
|
/// <b>SDK limitation:</b> <c>HistoryQueryResult</c> exposes only <c>Value</c>
|
||||||
|
/// (double) and <c>StringValue</c> (string) — there is no tag data-type field on
|
||||||
|
/// the result. The correct approach would be to branch on the tag's declared
|
||||||
|
/// data type, but the bound version of <c>aahClientManaged</c> does not surface
|
||||||
|
/// it per query result. The heuristic below is the best available: prefer
|
||||||
|
/// <c>StringValue</c> only when it is non-empty AND <c>Value</c> is zero,
|
||||||
|
/// because string tags in the Historian SDK always project to <c>Value=0</c>
|
||||||
|
/// while numeric tags may legitimately sample to zero (in which case the SDK
|
||||||
|
/// does not populate <c>StringValue</c>). A numeric tag at exactly zero with a
|
||||||
|
/// non-empty formatted <c>StringValue</c> (e.g. "0.00") would be mis-reported
|
||||||
|
/// as a string; this is a known edge case of the SDK binding.
|
||||||
|
/// </para>
|
||||||
|
/// </summary>
|
||||||
|
private static object? SelectValue(HistoryQueryResult result)
|
||||||
|
{
|
||||||
|
if (!string.IsNullOrEmpty(result.StringValue) && result.Value == 0)
|
||||||
|
return result.StringValue;
|
||||||
|
return result.Value;
|
||||||
|
}
|
||||||
|
|
||||||
internal static double? ExtractAggregateValue(AnalogSummaryQueryResult result, string column)
|
internal static double? ExtractAggregateValue(AnalogSummaryQueryResult result, string column)
|
||||||
{
|
{
|
||||||
switch (column)
|
switch (column)
|
||||||
|
|||||||
@@ -153,6 +153,11 @@ public sealed class HistorianFrameHandler : IFrameHandler
|
|||||||
private async Task HandleWriteAlarmEventsAsync(byte[] body, FrameWriter writer, CancellationToken ct)
|
private async Task HandleWriteAlarmEventsAsync(byte[] body, FrameWriter writer, CancellationToken ct)
|
||||||
{
|
{
|
||||||
var req = MessagePackSerializer.Deserialize<WriteAlarmEventsRequest>(body);
|
var req = MessagePackSerializer.Deserialize<WriteAlarmEventsRequest>(body);
|
||||||
|
|
||||||
|
// MessagePack deserializes an absent or explicit-nil array as null, not Array.Empty.
|
||||||
|
// Normalise here so every path below can safely dereference .Length without an NRE.
|
||||||
|
req.Events ??= Array.Empty<AlarmHistorianEventDto>();
|
||||||
|
|
||||||
var reply = new WriteAlarmEventsReply { CorrelationId = req.CorrelationId };
|
var reply = new WriteAlarmEventsReply { CorrelationId = req.CorrelationId };
|
||||||
|
|
||||||
if (_alarmWriter is null)
|
if (_alarmWriter is null)
|
||||||
|
|||||||
@@ -113,17 +113,62 @@ public sealed class PipeServer : IDisposable
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Backoff sequence for consecutive RunOneConnection failures: 250 ms → 500 ms →
|
||||||
|
// 1 000 ms → 2 000 ms → 4 000 ms → capped at 8 000 ms thereafter.
|
||||||
|
private static readonly TimeSpan[] BackoffSteps =
|
||||||
|
{
|
||||||
|
TimeSpan.FromMilliseconds(250),
|
||||||
|
TimeSpan.FromMilliseconds(500),
|
||||||
|
TimeSpan.FromSeconds(1),
|
||||||
|
TimeSpan.FromSeconds(2),
|
||||||
|
TimeSpan.FromSeconds(4),
|
||||||
|
TimeSpan.FromSeconds(8),
|
||||||
|
};
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// Maximum consecutive failures before the server gives up and lets the process exit
|
||||||
|
/// so the supervisor (NSSM / SCM) can restart the sidecar cleanly.
|
||||||
|
/// </summary>
|
||||||
|
private const int MaxConsecutiveFailures = 20;
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Runs the server continuously, handling one connection at a time. When a connection
|
/// Runs the server continuously, handling one connection at a time. When a connection
|
||||||
/// ends (clean or error), accepts the next.
|
/// ends (clean or error), waits with exponential backoff before accepting the next.
|
||||||
|
/// If <see cref="MaxConsecutiveFailures"/> consecutive failures occur the method
|
||||||
|
/// throws so the supervisor can restart the sidecar.
|
||||||
/// </summary>
|
/// </summary>
|
||||||
public async Task RunAsync(IFrameHandler handler, CancellationToken ct)
|
public async Task RunAsync(IFrameHandler handler, CancellationToken ct)
|
||||||
{
|
{
|
||||||
|
var consecutiveFailures = 0;
|
||||||
|
|
||||||
while (!ct.IsCancellationRequested)
|
while (!ct.IsCancellationRequested)
|
||||||
{
|
{
|
||||||
try { await RunOneConnectionAsync(handler, ct).ConfigureAwait(false); }
|
try
|
||||||
|
{
|
||||||
|
await RunOneConnectionAsync(handler, ct).ConfigureAwait(false);
|
||||||
|
consecutiveFailures = 0; // a clean connection (even a short-lived one) resets the counter
|
||||||
|
}
|
||||||
catch (OperationCanceledException) { break; }
|
catch (OperationCanceledException) { break; }
|
||||||
catch (Exception ex) { _logger.Error(ex, "Sidecar IPC connection loop error — accepting next"); }
|
catch (Exception ex)
|
||||||
|
{
|
||||||
|
consecutiveFailures++;
|
||||||
|
|
||||||
|
if (consecutiveFailures >= MaxConsecutiveFailures)
|
||||||
|
{
|
||||||
|
_logger.Fatal(ex,
|
||||||
|
"Sidecar IPC connection loop failed {Count} consecutive times — giving up so supervisor can restart",
|
||||||
|
consecutiveFailures);
|
||||||
|
throw;
|
||||||
|
}
|
||||||
|
|
||||||
|
var delay = BackoffSteps[Math.Min(consecutiveFailures - 1, BackoffSteps.Length - 1)];
|
||||||
|
_logger.Error(ex,
|
||||||
|
"Sidecar IPC connection loop error (consecutive failure {Count}/{Max}) — retrying in {Delay}",
|
||||||
|
consecutiveFailures, MaxConsecutiveFailures, delay);
|
||||||
|
|
||||||
|
try { await Task.Delay(delay, ct).ConfigureAwait(false); }
|
||||||
|
catch (OperationCanceledException) { break; }
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user