feat(historian): emit PermanentFail for poison alarm events via additive PerEventStatus sidecar IPC field
This commit is contained in:
@@ -225,4 +225,8 @@ public sealed class WriteAlarmEventsReply
|
||||
|
||||
/// <summary>Per-event success flag, parallel to <see cref="WriteAlarmEventsRequest.Events"/>.</summary>
|
||||
[Key(3)] public bool[] PerEventOk { get; set; } = Array.Empty<bool>();
|
||||
|
||||
/// <summary>Per-event status parallel to the request's Events: 0=Ack, 1=Retry, 2=Permanent.
|
||||
/// Empty ⇒ an older sidecar that only sent <see cref="PerEventOk"/>; the client falls back to it.</summary>
|
||||
[Key(4)] public byte[] PerEventStatus { get; set; } = Array.Empty<byte>();
|
||||
}
|
||||
|
||||
+43
-17
@@ -298,21 +298,29 @@ public sealed class WonderwareHistorianClient : IHistorianDataSource, IAlarmHist
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// <para>
|
||||
/// <b>PermanentFail limitation (finding 002):</b> this writer never returns
|
||||
/// <see cref="HistorianWriteOutcome.PermanentFail"/>. The sidecar wire contract
|
||||
/// (<see cref="WriteAlarmEventsReply.PerEventOk"/>) carries only a per-event
|
||||
/// boolean (succeeded / did-not-succeed) and provides no unrecoverable vs.
|
||||
/// transient distinction. A poison event that the historian SDK can never persist
|
||||
/// (e.g. a permanently malformed row) will therefore retry indefinitely inside the
|
||||
/// store-and-forward drain worker rather than being moved to the dead-letter table.
|
||||
/// Extending the protocol to add a per-event status enum (Ack / Retry / Permanent)
|
||||
/// requires a coordinated additive change to the .NET 4.8 sidecar and is tracked as
|
||||
/// a follow-up. Until then, the drain worker's own retry-count limit is the
|
||||
/// backstop against an infinite loop.
|
||||
/// <b>Per-event status:</b> when the sidecar populates the additive
|
||||
/// <see cref="WriteAlarmEventsReply.PerEventStatus"/> wire field (0=Ack, 1=Retry,
|
||||
/// 2=Permanent), each slot maps directly to <see cref="HistorianWriteOutcome.Ack"/> /
|
||||
/// <see cref="HistorianWriteOutcome.RetryPlease"/> / <see cref="HistorianWriteOutcome.PermanentFail"/>.
|
||||
/// The sidecar emits <c>Permanent</c> for structurally-malformed (poison) events,
|
||||
/// so the store-and-forward drain worker dead-letters them immediately instead of
|
||||
/// looping to the retry cap. An older sidecar that sends only the legacy
|
||||
/// <see cref="WriteAlarmEventsReply.PerEventOk"/> boolean is handled by the
|
||||
/// fallback path below (true→Ack, false→RetryPlease) for rolling-deploy back-compat.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// Transport or deserialization failures return <see cref="HistorianWriteOutcome.RetryPlease"/>
|
||||
/// for every event in the batch; the drain worker's backoff controls recovery.
|
||||
/// <b>Documented boundary:</b> only <i>structurally</i>-malformed events surface as
|
||||
/// <see cref="HistorianWriteOutcome.PermanentFail"/>. A structurally-valid event that
|
||||
/// the AAH historian SDK rejects for a deeper, semantic reason still maps to
|
||||
/// <see cref="HistorianWriteOutcome.RetryPlease"/> (→ retry cap), because the sidecar's
|
||||
/// writer returns only a transient/persisted boolean for events it actually attempts.
|
||||
/// Surfacing richer SDK-semantic permanent rejections requires the infra-gated
|
||||
/// <c>AahClientManagedAlarmEventWriter</c> to report a status code rather than a bool.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// Transport or deserialization failures, and any whole-call failure
|
||||
/// (<c>Success=false</c>), return <see cref="HistorianWriteOutcome.RetryPlease"/> for
|
||||
/// every event in the batch; the drain worker's backoff controls recovery.
|
||||
/// </para>
|
||||
/// </remarks>
|
||||
/// <param name="batch">The batch of alarm historian events to write.</param>
|
||||
@@ -347,9 +355,26 @@ public sealed class WonderwareHistorianClient : IHistorianDataSource, IAlarmHist
|
||||
return fail;
|
||||
}
|
||||
|
||||
// Per-event status: PerEventOk[i] = true → Ack; false → RetryPlease.
|
||||
// NOTE: PermanentFail is never emitted — see <remarks> for the wire-contract
|
||||
// limitation and why poison events currently retry rather than dead-letter.
|
||||
// Prefer the granular per-event status when the sidecar provides it (new wire
|
||||
// field); fall back to the legacy PerEventOk bool for older sidecars. The sidecar
|
||||
// emits status 2 (Permanent) for structurally-malformed poison events so they
|
||||
// dead-letter immediately rather than retrying to the cap.
|
||||
if (reply.PerEventStatus is { Length: > 0 } status && status.Length == batch.Count)
|
||||
{
|
||||
var statusOutcomes = new HistorianWriteOutcome[batch.Count];
|
||||
for (var i = 0; i < batch.Count; i++)
|
||||
statusOutcomes[i] = status[i] switch
|
||||
{
|
||||
0 => HistorianWriteOutcome.Ack,
|
||||
2 => HistorianWriteOutcome.PermanentFail,
|
||||
_ => HistorianWriteOutcome.RetryPlease, // 1 or unknown
|
||||
};
|
||||
return statusOutcomes;
|
||||
}
|
||||
|
||||
// Legacy fallback: PerEventOk[i] = true → Ack; false → RetryPlease. An older
|
||||
// sidecar without PerEventStatus can never signal PermanentFail through this
|
||||
// path, so a poison event retries to the drain worker's cap.
|
||||
var outcomes = new HistorianWriteOutcome[batch.Count];
|
||||
for (var i = 0; i < batch.Count; i++)
|
||||
{
|
||||
@@ -361,7 +386,8 @@ public sealed class WonderwareHistorianClient : IHistorianDataSource, IAlarmHist
|
||||
catch
|
||||
{
|
||||
// Transport / deserialization failure — every event is retry-please. The drain
|
||||
// worker's backoff handles recovery. PermanentFail is never emitted (see <remarks>).
|
||||
// worker's backoff handles recovery. PermanentFail is only emitted from the
|
||||
// success path's PerEventStatus mapping, never from a transport failure.
|
||||
var fail = new HistorianWriteOutcome[batch.Count];
|
||||
Array.Fill(fail, HistorianWriteOutcome.RetryPlease);
|
||||
return fail;
|
||||
|
||||
@@ -263,4 +263,8 @@ public sealed class WriteAlarmEventsReply
|
||||
|
||||
/// <summary>Per-event success flag, parallel to <see cref="WriteAlarmEventsRequest.Events"/>.</summary>
|
||||
[Key(3)] public bool[] PerEventOk { get; set; } = Array.Empty<bool>();
|
||||
|
||||
/// <summary>Per-event status parallel to the request's Events: 0=Ack, 1=Retry, 2=Permanent.
|
||||
/// Empty ⇒ an older sidecar that only sent <see cref="PerEventOk"/>; the client falls back to it.</summary>
|
||||
[Key(4)] public byte[] PerEventStatus { get; set; } = Array.Empty<byte>();
|
||||
}
|
||||
|
||||
+72
-4
@@ -16,6 +16,11 @@ namespace ZB.MOM.WW.OtOpcUa.Driver.Historian.Wonderware.Ipc;
|
||||
/// </summary>
|
||||
public sealed class HistorianFrameHandler : IFrameHandler
|
||||
{
|
||||
// WriteAlarmEventsReply.PerEventStatus byte semantics: 0=Ack, 1=Retry, 2=Permanent.
|
||||
private const byte StatusAck = 0;
|
||||
private const byte StatusRetry = 1;
|
||||
private const byte StatusPermanent = 2;
|
||||
|
||||
private readonly IHistorianDataSource _historian;
|
||||
private readonly IAlarmEventWriter? _alarmWriter;
|
||||
private readonly ILogger _logger;
|
||||
@@ -174,18 +179,51 @@ public sealed class HistorianFrameHandler : IFrameHandler
|
||||
reply.Success = false;
|
||||
reply.Error = "Sidecar not configured with an alarm-event writer.";
|
||||
reply.PerEventOk = new bool[req.Events.Length];
|
||||
reply.PerEventStatus = AllStatus(req.Events.Length, StatusRetry);
|
||||
await writer.WriteAsync(MessageKind.WriteAlarmEventsReply, reply, ct).ConfigureAwait(false);
|
||||
return;
|
||||
}
|
||||
|
||||
try
|
||||
{
|
||||
var perEvent = await _alarmWriter.WriteAsync(req.Events, ct).ConfigureAwait(false);
|
||||
reply.PerEventOk = perEvent;
|
||||
// Classify each event before touching the writer: structurally-malformed
|
||||
// (poison) events can never be persisted, so mark them Permanent and exclude
|
||||
// them from the writer batch. Only the well-formed remainder is handed to the
|
||||
// writer, whose bool[] result is mapped back onto the original indices.
|
||||
var status = new byte[req.Events.Length];
|
||||
var writable = new List<AlarmHistorianEventDto>(req.Events.Length);
|
||||
var originalIndex = new List<int>(req.Events.Length);
|
||||
for (var i = 0; i < req.Events.Length; i++)
|
||||
{
|
||||
if (IsStructurallyMalformed(req.Events[i]))
|
||||
{
|
||||
status[i] = StatusPermanent;
|
||||
}
|
||||
else
|
||||
{
|
||||
originalIndex.Add(i);
|
||||
writable.Add(req.Events[i]);
|
||||
}
|
||||
}
|
||||
|
||||
// Aligned 1:1 to `writable`; empty when every event was poison (writer skipped).
|
||||
var perEvent = writable.Count == 0
|
||||
? Array.Empty<bool>()
|
||||
: await _alarmWriter.WriteAsync(writable.ToArray(), ct).ConfigureAwait(false);
|
||||
|
||||
for (var i = 0; i < originalIndex.Count; i++)
|
||||
{
|
||||
var ok = i < perEvent.Length && perEvent[i];
|
||||
status[originalIndex[i]] = ok ? StatusAck : StatusRetry;
|
||||
}
|
||||
|
||||
reply.PerEventStatus = status;
|
||||
reply.PerEventOk = StatusToOk(status);
|
||||
reply.Success = true;
|
||||
// Whole-batch Success stays true even when some events failed — per-event
|
||||
// PerEventOk slots carry the granular result; the SQLite drain worker treats
|
||||
// false slots as retry-please candidates.
|
||||
// PerEventStatus slots carry the granular result (Ack / Retry / Permanent);
|
||||
// the SQLite drain worker acks 0, retries 1, and dead-letters 2. PerEventOk
|
||||
// is kept populated for rolling-deploy back-compat with an older client.
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
@@ -193,11 +231,41 @@ public sealed class HistorianFrameHandler : IFrameHandler
|
||||
reply.Success = false;
|
||||
reply.Error = ex.Message;
|
||||
reply.PerEventOk = new bool[req.Events.Length];
|
||||
reply.PerEventStatus = AllStatus(req.Events.Length, StatusRetry);
|
||||
}
|
||||
|
||||
await writer.WriteAsync(MessageKind.WriteAlarmEventsReply, reply, ct).ConfigureAwait(false);
|
||||
}
|
||||
|
||||
/// <summary>
/// Decides whether an alarm event is poison, i.e. structurally malformed in a way that
/// no amount of retrying can repair. Callers mark such events Permanent so the
/// store-and-forward sink dead-letters them at once rather than retrying to the cap.
/// </summary>
/// <remarks>
/// An event is treated as malformed when any of the following holds:
/// it is <c>null</c>; its source name is null, empty, or whitespace; its alarm type is
/// null, empty, or whitespace; or its event timestamp (UTC ticks) is zero or negative.
/// </remarks>
/// <param name="e">The alarm event to inspect.</param>
/// <returns><c>true</c> if the event violates a structural invariant; <c>false</c> if it is well-formed.</returns>
internal static bool IsStructurallyMalformed(AlarmHistorianEventDto e)
{
    // A missing event has nothing to persist.
    if (e is null)
    {
        return true;
    }

    // Both identifying strings must carry real content.
    if (string.IsNullOrWhiteSpace(e.SourceName))
    {
        return true;
    }

    if (string.IsNullOrWhiteSpace(e.AlarmType))
    {
        return true;
    }

    // Only a strictly positive tick count is accepted as a timestamp.
    return e.EventTimeUtcTicks <= 0;
}
|
||||
|
||||
/// <summary>
/// Builds a new status array of the requested length with every slot set to the same value.
/// </summary>
/// <param name="length">Number of slots in the returned array.</param>
/// <param name="value">Status byte written into each slot.</param>
/// <returns>A freshly allocated array of <paramref name="length"/> bytes, all equal to <paramref name="value"/>.</returns>
private static byte[] AllStatus(int length, byte value)
{
    var filled = new byte[length];

    // Walk backwards from the last slot; the fill order has no effect on the result.
    var slot = filled.Length;
    while (slot > 0)
    {
        slot--;
        filled[slot] = value;
    }

    return filled;
}
|
||||
|
||||
/// <summary>
/// Projects a per-event status array onto the legacy per-event boolean array:
/// a slot is <c>true</c> only where the status equals <c>StatusAck</c>; every other
/// status value maps to <c>false</c>.
/// </summary>
/// <param name="status">Per-event status bytes.</param>
/// <returns>A new array of the same length as <paramref name="status"/>, aligned slot-for-slot.</returns>
private static bool[] StatusToOk(byte[] status)
{
    var flags = new bool[status.Length];
    var slot = 0;

    foreach (var code in status)
    {
        flags[slot] = code == StatusAck;
        slot++;
    }

    return flags;
}
|
||||
|
||||
private static HistorianSampleDto[] ToWire(List<HistorianSample> samples)
|
||||
{
|
||||
var dtos = new HistorianSampleDto[samples.Count];
|
||||
|
||||
Reference in New Issue
Block a user