feat(historian): emit PermanentFail for poison alarm events via additive PerEventStatus sidecar IPC field

This commit is contained in:
Joseph Doherty
2026-06-18 12:30:14 -04:00
parent f320f323ae
commit feddc2b80e
6 changed files with 438 additions and 21 deletions
@@ -225,4 +225,8 @@ public sealed class WriteAlarmEventsReply
/// <summary>Per-event success flag, parallel to <see cref="WriteAlarmEventsRequest.Events"/>.</summary>
[Key(3)] public bool[] PerEventOk { get; set; } = Array.Empty<bool>();
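/// <summary>Per-event status parallel to the request's Events: 0=Ack, 1=Retry, 2=Permanent.
/// Empty ⇒ an older sidecar that only sent <see cref="PerEventOk"/>; the client falls back to it.
/// The client also falls back to <see cref="PerEventOk"/> when this array's length differs from
/// the number of events in the batch it sent, and treats any value other than 0 or 2 as Retry.</summary>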
[Key(4)] public byte[] PerEventStatus { get; set; } = Array.Empty<byte>();
}
@@ -298,21 +298,29 @@ public sealed class WonderwareHistorianClient : IHistorianDataSource, IAlarmHist
/// </summary>
/// <remarks>
/// <para>
/// <b>PermanentFail limitation (finding 002):</b> this writer never returns
/// <see cref="HistorianWriteOutcome.PermanentFail"/>. The sidecar wire contract
/// (<see cref="WriteAlarmEventsReply.PerEventOk"/>) carries only a per-event
/// boolean (succeeded / did-not-succeed) and provides no unrecoverable vs.
/// transient distinction. A poison event that the historian SDK can never persist
/// (e.g. a permanently malformed row) will therefore retry indefinitely inside the
/// store-and-forward drain worker rather than being moved to the dead-letter table.
/// Extending the protocol to add a per-event status enum (Ack / Retry / Permanent)
/// requires a coordinated additive change to the .NET 4.8 sidecar and is tracked as
/// a follow-up. Until then, the drain worker's own retry-count limit is the
/// backstop against an infinite loop.
/// <b>Per-event status:</b> when the sidecar populates the additive
/// <see cref="WriteAlarmEventsReply.PerEventStatus"/> wire field (0=Ack, 1=Retry,
/// 2=Permanent), each slot maps to <see cref="HistorianWriteOutcome.Ack"/> /
/// <see cref="HistorianWriteOutcome.RetryPlease"/> / <see cref="HistorianWriteOutcome.PermanentFail"/>;
/// any unrecognised value is treated as <see cref="HistorianWriteOutcome.RetryPlease"/>.
/// The sidecar emits <c>Permanent</c> for structurally-malformed (poison) events,
/// so the store-and-forward drain worker dead-letters them immediately instead of
/// looping to the retry cap. An older sidecar that sends only the legacy
/// <see cref="WriteAlarmEventsReply.PerEventOk"/> boolean is handled by the
/// fallback path below (true→Ack, false→RetryPlease) for rolling-deploy back-compat.
/// </para>
/// <para>
/// Transport or deserialization failures return <see cref="HistorianWriteOutcome.RetryPlease"/>
/// for every event in the batch; the drain worker's backoff controls recovery.
/// <b>Documented boundary:</b> only <i>structurally</i>-malformed events surface as
/// <see cref="HistorianWriteOutcome.PermanentFail"/>. A structurally-valid event that
/// the AAH historian SDK rejects for a deeper, semantic reason still maps to
/// <see cref="HistorianWriteOutcome.RetryPlease"/> (→ retry cap), because the sidecar's
/// writer returns only a transient/persisted boolean for events it actually attempts.
/// Surfacing richer SDK-semantic permanent rejections requires the infra-gated
/// <c>AahClientManagedAlarmEventWriter</c> to report a status code rather than a bool.
/// </para>
/// <para>
/// Transport or deserialization failures, and any whole-call failure
/// (<c>Success=false</c>), return <see cref="HistorianWriteOutcome.RetryPlease"/> for
/// every event in the batch; the drain worker's backoff controls recovery.
/// </para>
/// </remarks>
/// <param name="batch">The batch of alarm historian events to write.</param>
@@ -347,9 +355,26 @@ public sealed class WonderwareHistorianClient : IHistorianDataSource, IAlarmHist
return fail;
}
// Per-event status: PerEventOk[i] = true → Ack; false → RetryPlease.
// NOTE: PermanentFail is never emitted — see <remarks> for the wire-contract
// limitation and why poison events currently retry rather than dead-letter.
// Prefer the granular per-event status when the sidecar provides it (new wire
// field) AND its length matches the batch; otherwise fall back to the legacy
// PerEventOk bool (older sidecars, or a status array of the wrong length). The
// sidecar emits status 2 (Permanent) for structurally-malformed poison events so
// they dead-letter immediately rather than retrying to the cap.
if (reply.PerEventStatus is { Length: > 0 } status && status.Length == batch.Count)
{
var statusOutcomes = new HistorianWriteOutcome[batch.Count];
for (var i = 0; i < batch.Count; i++)
statusOutcomes[i] = status[i] switch
{
0 => HistorianWriteOutcome.Ack,
2 => HistorianWriteOutcome.PermanentFail,
_ => HistorianWriteOutcome.RetryPlease, // 1 or unknown
};
return statusOutcomes;
}
// Legacy fallback: PerEventOk[i] = true → Ack; false → RetryPlease. An older
// sidecar without PerEventStatus can never signal PermanentFail through this
// path, so a poison event retries to the drain worker's cap.
var outcomes = new HistorianWriteOutcome[batch.Count];
for (var i = 0; i < batch.Count; i++)
{
@@ -361,7 +386,8 @@ public sealed class WonderwareHistorianClient : IHistorianDataSource, IAlarmHist
catch
{
// Transport / deserialization failure — every event is retry-please. The drain
// worker's backoff handles recovery. PermanentFail is never emitted (see <remarks>).
// worker's backoff handles recovery. PermanentFail is only emitted from the
// success path's PerEventStatus mapping, never from a transport failure.
var fail = new HistorianWriteOutcome[batch.Count];
Array.Fill(fail, HistorianWriteOutcome.RetryPlease);
return fail;