fix(historian): address code review on Raw HistoryRead paging

C1 (critical): a boundary tie cluster larger than NumValuesPerNode could
silently truncate a resumed read to GoodNoData, permanently dropping the
un-emitted ties — the (timestamp, skip) cursor cannot advance past a single
timestamp the fixed-(start,end,cap) backend keeps re-returning. Now detected
and failed LOUDLY per node with BadHistoryOperationUnsupported + a log naming
the tag/timestamp/cap; documented in Historian.md with the larger-cap remedy.
Regression test Raw_tie_cluster_larger_than_page_fails_loudly_not_silently.

I3: build HistoryData before Save() so a projection failure can never orphan a
stored continuation cursor.

N1 (YAGNI): drop the never-produced HistoryReadKind enum + Processed-only
Aggregate/IntervalTicks fields from HistoryContinuationState — only Raw pages.

N3: ComputeResumeCursor guards its documented non-empty precondition.

I1: document InMemoryHistoryContinuationStore's eventual-consistency (test double).

Build clean, 182/182 OpcUaServer tests pass.
This commit is contained in:
Joseph Doherty
2026-06-15 05:15:07 -04:00
parent 94c3ca60fc
commit bea0b482d4
6 changed files with 101 additions and 36 deletions
@@ -105,7 +105,11 @@ internal sealed class InMemoryHistoryContinuationStore(int capacity = 100) : IHi
{
private readonly object _gate = new();
private readonly Dictionary<Guid, HistoryContinuationState> _states = new();
// Insertion order, so we can evict the OLDEST when over capacity (matches the SDK store).
// Insertion order, so we can evict the OLDEST when over capacity (matches the SDK store). Eventually
// consistent: a Guid taken/released stays in _order until the eviction loop reaches it and finds it
// already gone from _states (a harmless no-op dequeue). _states is the source of truth for liveness;
// _order only ever over-approximates it, so eviction never drops a LIVE entry early. Fine for a
// bounded test double — production uses the SDK's own per-session store, not this class.
private readonly Queue<Guid> _order = new();
private readonly int _capacity = capacity < 1 ? 1 : capacity;
@@ -3,25 +3,12 @@ using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
namespace ZB.MOM.WW.OtOpcUa.OpcUaServer;
/// <summary>
/// The kind of variable-history read a continuation point resumes. Only the two count-capped,
/// time-range arms page server-side (see <see cref="HistoryPaging"/>); AtTime is single-shot
/// (no client count cap, so there is never a "full page" signal to page on) and never produces a
/// continuation point, so it has no entry here.
/// </summary>
internal enum HistoryReadKind
{
/// <summary>HistoryRead-Raw — resumes via <see cref="IHistorianDataSource.ReadRawAsync"/>.</summary>
Raw,
/// <summary>HistoryRead-Processed — resumes via <see cref="IHistorianDataSource.ReadProcessedAsync"/>.</summary>
Processed,
}
/// <summary>
/// The server-side resume state stored behind an opaque continuation point for a single
/// paged variable-history read. Captures exactly enough to continue the SAME logical read from
/// where the previous page stopped: the read kind + tagname, the original (inclusive) end of the
/// window, the next start of the window, and — for Processed — the aggregate + interval.
/// The server-side resume state stored behind an opaque continuation point for a single paged
/// HistoryRead-Raw read. Captures exactly enough to continue the SAME logical read from where the
/// previous page stopped: the tagname, the original (inclusive) end of the window, the next start of
/// the window, and the tie-safe boundary skip. (Only Raw pages server-side — Processed and AtTime
/// carry no client count cap, so they are single-shot and never produce a continuation point; see
/// <see cref="HistoryPaging"/>.)
/// <para>
/// The boundary fields (<see cref="NextStartUtc"/> + <see cref="BoundarySkipCount"/>) encode a
/// tie-safe resume cursor: the next page reads from <see cref="NextStartUtc"/> INCLUSIVE and
@@ -34,7 +21,6 @@ internal enum HistoryReadKind
/// cheap value that unit tests can drive directly.
/// </para>
/// </summary>
/// <param name="Kind">Which variable-history arm this state resumes.</param>
/// <param name="Tagname">The resolved historian tagname (NOT the NodeId) to read from.</param>
/// <param name="NextStartUtc">
/// Inclusive lower bound for the next page — the boundary timestamp the previous page stopped on.
@@ -45,17 +31,12 @@ internal enum HistoryReadKind
/// prior pages and must be dropped from the head of the next page (tie de-dup).
/// </param>
/// <param name="NumValuesPerNode">The client's per-page cap; re-applied to every resumed page.</param>
/// <param name="Aggregate">The aggregate for a Processed read; ignored for Raw.</param>
/// <param name="IntervalTicks">The Processed bucketing interval in ticks; ignored for Raw.</param>
internal sealed record HistoryContinuationState(
HistoryReadKind Kind,
string Tagname,
DateTime NextStartUtc,
DateTime EndUtc,
int BoundarySkipCount,
uint NumValuesPerNode,
HistoryAggregateType Aggregate,
long IntervalTicks);
uint NumValuesPerNode);
/// <summary>
/// Pure server-side continuation-point paging decisions for the count-capped variable-history arms
@@ -111,6 +92,11 @@ internal static class HistoryPaging
out DateTime nextStartUtc,
out int boundarySkipCount)
{
// Enforce the documented precondition at the API boundary rather than relying on the caller's guard
// (the only caller only pages a full, non-empty page, but this is a public static helper).
if (page.Count == 0)
throw new ArgumentOutOfRangeException(nameof(page), "ComputeResumeCursor requires a non-empty page.");
// The boundary is the last returned sample's SourceTimestamp. A sample whose SourceTimestamp is
// null (Bad/unset) cannot anchor a time cursor; fall back to MinValue so the next read covers the
// whole remaining window rather than silently dropping data — duplicates are then de-duped by the
@@ -1751,28 +1751,53 @@ public sealed class OtOpcUaNodeManager : CustomNodeManager2
.ReadRawAsync(tagname, startUtc, endUtc, numValuesPerNode, CancellationToken.None)
.GetAwaiter().GetResult();
var backendFull = HistoryPaging.IsFullPage(sourceResult.Samples.Count, numValuesPerNode);
// On a resume read, drop the boundary ties already returned on the prior page.
var samples = inboundCp is { Length: > 0 }
? HistoryPaging.TrimBoundaryDuplicates(sourceResult.Samples, startUtc, boundarySkip)
: sourceResult.Samples;
// Degenerate tie cluster: a resume read returned a FULL backend page that the boundary-tie trim
// emptied entirely. That can only happen when more than NumValuesPerNode samples share the resume
// boundary timestamp — a tie cluster larger than the page cap. The fixed-(start,end,cap) backend
// can only ever return the first `cap` of those ties, so a (timestamp, skip) cursor can never
// advance past the cluster. Fail LOUDLY for this node rather than silently truncate to GoodNoData
// (which would permanently drop the un-emitted ties). The operator's remedy is a larger
// NumValuesPerNode; see docs/Historian.md "Paging limitation".
if (inboundCp is { Length: > 0 } && backendFull && samples.Count == 0)
{
#pragma warning disable CS0618 // Type or member is obsolete
Utils.LogError(
"OtOpcUaNodeManager: HistoryReadRaw paging stalled — tie cluster at {0:O} for tag '{1}' " +
"exceeds NumValuesPerNode={2}; cannot page past it. Increase NumValuesPerNode.",
startUtc, tagname, numValuesPerNode);
#pragma warning restore CS0618
errors[handle.Index] = StatusCodes.BadHistoryOperationUnsupported;
results[handle.Index] = new SdkHistoryReadResult { StatusCode = StatusCodes.BadHistoryOperationUnsupported };
return;
}
// The "full page" test is against the RAW backend count (before trimming): the backend honoured
// the cap, so a full backend page ⇒ there may be more even if we trimmed some boundary ties.
var historyData = ToHistoryDataFromSamples(samples);
byte[]? outboundCp = null;
if (HistoryPaging.IsFullPage(sourceResult.Samples.Count, numValuesPerNode) && samples.Count > 0)
if (backendFull && samples.Count > 0)
{
HistoryPaging.ComputeResumeCursor(samples, out var nextStart, out var skip);
var nextState = new HistoryContinuationState(
HistoryReadKind.Raw, tagname, nextStart, endUtc, skip, numValuesPerNode,
Aggregate: default, IntervalTicks: 0);
tagname, nextStart, endUtc, skip, numValuesPerNode);
// Save may return null (no session on this request) ⇒ degrade to single-shot for this node.
// Built AFTER historyData so a failure projecting samples can never orphan a stored cursor.
outboundCp = _historyContinuationStore.Save(session, nextState);
}
var historyData = ToHistoryDataFromSamples(samples);
results[handle.Index] = new SdkHistoryReadResult
{
// No samples ⇒ GoodNoData (the node is historized, the window just held no data).
// No samples ⇒ GoodNoData (the node is historized, the window just held no data). With the
// degenerate-cluster guard above, a resumed empty page now only means the window/cluster is
// genuinely drained — never silent data loss.
StatusCode = samples.Count == 0 ? StatusCodes.GoodNoData : StatusCodes.Good,
HistoryData = new ExtensionObject(historyData),
ContinuationPoint = outboundCp,