fix(debug-stream): stream-first lifecycle with replay/dedup (#26, M2.18)

Re-architect DebugStreamBridgeActor from snapshot-first to stream-first so no
attribute/alarm event occurring during the snapshot-build + network-transit
window is lost (#26).

Lifecycle change:
- PreStart now opens the gRPC subscription FIRST (alongside sending the
  SubscribeDebugViewRequest), so live events start flowing immediately.
- Phase model via a single _snapshotDelivered flag (mutated only on the actor
  thread). While buffering (snapshot not yet delivered), AttributeValueChanged/
  AlarmStateChanged are appended to an ordered _preSnapshotBuffer instead of
  being delivered. After snapshot+flush, the same handlers pass through directly.
- On DebugViewSnapshot: deliver snapshot, then flush the buffer in arrival order
  with per-entity dedup, then set _snapshotDelivered=true (pass-through).

Dedup rule (exactly-once):
- Identity: attributes by (InstanceUniqueName, AttributePath, AttributeName);
  alarms by (InstanceUniqueName, AlarmName, SourceReference) so native
  per-condition alarms are not conflated. Keys joined with a NUL delimiter
  (declared as an escaped char constant; no raw NUL in source) so distinct
  identities never collide on a space within a name.
- Boundary: a buffered event whose timestamp is <= the snapshot's timestamp for
  the same entity is already reflected -> DROP; strictly-newer (>) -> DELIVER;
  entity absent from the snapshot -> DELIVER (genuine gap-window event).

Preserved paths:
- M2.11 InstanceNotFound: with stream-first the gRPC stream is already open, so
  the not-found path now tears it down (CleanupGrpc) + clears the buffer, does
  NOT enter pass-through, delivers the not-found snapshot, and stops cleanly.
- Reconnect (ReconnectGrpcStream -> OpenGrpcStream) does not touch the phase
  flag: a mid-session reconnect resumes pass-through; a reconnect during the
  buffering phase stays buffering until the snapshot arrives.
- Communication-008 retry/stability/stop/terminate + ReceiveTimeout orphan net
  unchanged. Duplicate/late snapshot after delivery is ignored defensively.

Tests: 10 new M2.18 tests (stream-first ordering, gap-window buffering, dedup
drop/deliver for attrs + alarms, ordering, pass-through, InstanceNotFound
teardown, reconnect-during-buffering, reconnect-after-snapshot) + revised the
M2.11 not-found test to assert stream teardown. Full DebugStreamBridgeActor
class green: 23/23.
This commit is contained in:
Joseph Doherty
2026-06-16 07:33:26 -04:00
parent c1043569f6
commit d8519cb464
3 changed files with 595 additions and 24 deletions
@@ -61,16 +61,22 @@ public class DebugStreamBridgeActorTests : TestKit
}
[Fact]
public void On_InstanceNotFound_Snapshot_Forwards_To_OnEvent_Does_Not_Open_Stream_And_Terminates()
public void On_InstanceNotFound_Snapshot_Forwards_To_OnEvent_Tears_Down_Stream_And_Terminates()
{
// M2.11: when the site reports InstanceNotFound=true the bridge actor must
// M2.11 (revised for M2.18 stream-first): the gRPC subscription is now opened
// up-front in PreStart, so when the site reports InstanceNotFound=true the
// bridge actor must
// (a) forward the not-found snapshot to _onEvent so DebugStreamService's TCS
// resolves and the caller can inspect the flag,
// (b) NOT open a gRPC stream (SubscribeCalls must remain empty), and
// (b) tear DOWN the already-opened gRPC stream (Unsubscribe the just-opened
// correlation) rather than enter pass-through, and
// (c) stop itself cleanly.
var ctx = CreateBridgeActor();
ctx.CommProbe.ExpectMsg<SiteEnvelope>(); // initial subscribe envelope
// Stream-first: the gRPC subscription is opened before the snapshot arrives.
AwaitCondition(() => ctx.MockGrpcClient.SubscribeCalls.Count == 1, TimeSpan.FromSeconds(3));
var notFoundSnapshot = new DebugViewSnapshot(
InstanceName,
new List<AttributeValueChanged>(),
@@ -90,12 +96,12 @@ public class DebugStreamBridgeActorTests : TestKit
Assert.True(received.InstanceNotFound);
}
// (b) no gRPC stream opened
ExpectTerminated(ctx.BridgeActor, TimeSpan.FromSeconds(3));
Assert.Empty(ctx.MockGrpcClient.SubscribeCalls);
// (b) the just-opened gRPC stream is torn down (not left running / no pass-through)
AwaitCondition(() => ctx.MockGrpcClient.UnsubscribedCorrelationIds.Contains("corr-1"),
TimeSpan.FromSeconds(3));
// (c) actor terminates cleanly
// ExpectTerminated above already verified termination
ExpectTerminated(ctx.BridgeActor, TimeSpan.FromSeconds(3));
}
[Fact]
@@ -386,6 +392,369 @@ public class DebugStreamBridgeActorTests : TestKit
Assert.Equal("corr-1", factory.ClientFor(GrpcNodeB).SubscribeCalls[0].CorrelationId);
}
// ---------------------------------------------------------------------
// M2.18 (#26) — stream-first + replay/dedup
// ---------------------------------------------------------------------
[Fact]
public void PreStart_Opens_GrpcStream_Before_Snapshot_Arrives()
{
    // M2.18 (stream-first): the gRPC subscription has to be opened by PreStart,
    // ahead of snapshot delivery, so that live events are already flowing while
    // the snapshot is being built and is in transit. The previous lifecycle
    // opened the stream only once the snapshot had arrived, which lost events
    // raised during that gap.
    var ctx = CreateBridgeActor();
    var openTimeout = TimeSpan.FromSeconds(3);

    // Consume the initial subscribe envelope.
    ctx.CommProbe.ExpectMsg<SiteEnvelope>();

    // No snapshot has been sent at this point, yet the stream must already exist.
    AwaitCondition(() => ctx.MockGrpcClient.SubscribeCalls.Count == 1, openTimeout);

    var subscribeCall = ctx.MockGrpcClient.SubscribeCalls[0];
    Assert.Equal("corr-1", subscribeCall.CorrelationId);
    Assert.Equal(InstanceName, subscribeCall.InstanceUniqueName);

    // The actor is buffering, not delivering, so _onEvent must not have fired.
    lock (ctx.ReceivedEvents)
    {
        Assert.Empty(ctx.ReceivedEvents);
    }
}
[Fact]
public void GapWindow_Event_Buffered_Before_Snapshot_Is_Delivered_Exactly_Once_After_Snapshot()
{
    // M2.18: an event that shows up inside the snapshot window (before the
    // snapshot has been delivered) is held in the buffer and then flushed
    // exactly once, after the snapshot.
    var ctx = CreateBridgeActor();
    var timeout = TimeSpan.FromSeconds(3);

    // Reads the number of events handed to _onEvent under the list's lock.
    int DeliveredCount()
    {
        lock (ctx.ReceivedEvents)
        {
            return ctx.ReceivedEvents.Count;
        }
    }

    ctx.CommProbe.ExpectMsg<SiteEnvelope>();
    AwaitCondition(() => ctx.MockGrpcClient.SubscribeCalls.Count == 1, timeout);

    // The live event precedes the snapshot and its entity ("Pressure") is absent
    // from the snapshot, making it a genuine gap-window event that must survive.
    var pressureChange = new AttributeValueChanged(
        InstanceName, "IO", "Pressure", 99.9, "Good", DateTimeOffset.UtcNow);
    ctx.MockGrpcClient.SubscribeCalls[0].OnEvent(pressureChange);

    // Still buffering: nothing has reached _onEvent.
    lock (ctx.ReceivedEvents)
    {
        Assert.Empty(ctx.ReceivedEvents);
    }

    var emptySnapshot = new DebugViewSnapshot(
        InstanceName,
        new List<AttributeValueChanged>(),
        new List<AlarmStateChanged>(),
        DateTimeOffset.UtcNow);
    ctx.BridgeActor.Tell(emptySnapshot);

    // Expected delivery: the snapshot first, then the buffered event, once each.
    AwaitCondition(() => DeliveredCount() == 2, timeout);

    lock (ctx.ReceivedEvents)
    {
        Assert.IsType<DebugViewSnapshot>(ctx.ReceivedEvents[0]);
        var replayed = Assert.IsType<AttributeValueChanged>(ctx.ReceivedEvents[1]);
        Assert.Equal("Pressure", replayed.AttributeName);
    }
}
[Fact]
public void Buffered_Event_Already_Reflected_In_Snapshot_Is_Dropped()
{
    // M2.18 dedup: when the snapshot carries the same entity with an equal or
    // newer timestamp (buffered.Timestamp <= snapshot.Timestamp), the buffered
    // event is already reflected in the snapshot and must be DROPPED.
    var ctx = CreateBridgeActor();
    var timeout = TimeSpan.FromSeconds(3);

    // Reads the number of events handed to _onEvent under the list's lock.
    int DeliveredCount()
    {
        lock (ctx.ReceivedEvents)
        {
            return ctx.ReceivedEvents.Count;
        }
    }

    ctx.CommProbe.ExpectMsg<SiteEnvelope>();
    AwaitCondition(() => ctx.MockGrpcClient.SubscribeCalls.Count == 1, timeout);

    var sharedTimestamp = DateTimeOffset.UtcNow;

    // "Temp" is buffered at the shared timestamp ...
    var bufferedTemp = new AttributeValueChanged(
        InstanceName, "IO", "Temp", 42.5, "Good", sharedTimestamp);
    ctx.MockGrpcClient.SubscribeCalls[0].OnEvent(bufferedTemp);

    // ... and the snapshot holds "Temp" at that SAME timestamp, so the buffered
    // copy is a duplicate.
    var snapshotTemp = new AttributeValueChanged(
        InstanceName, "IO", "Temp", 42.5, "Good", sharedTimestamp);
    var snapshot = new DebugViewSnapshot(
        InstanceName,
        new List<AttributeValueChanged> { snapshotTemp },
        new List<AlarmStateChanged>(),
        sharedTimestamp);
    ctx.BridgeActor.Tell(snapshot);

    // The snapshot alone is delivered; the buffered duplicate is discarded.
    AwaitCondition(() => DeliveredCount() == 1, timeout);

    // Short pause so a wrongly-delivered duplicate would have time to show up.
    Thread.Sleep(200);

    lock (ctx.ReceivedEvents)
    {
        Assert.Single(ctx.ReceivedEvents);
        Assert.IsType<DebugViewSnapshot>(ctx.ReceivedEvents[0]);
    }
}
[Fact]
public void Buffered_Event_Strictly_Newer_Than_Snapshot_Entity_Is_Delivered()
{
    // M2.18 dedup: a buffered event whose timestamp is strictly greater than the
    // snapshot's entry for the same entity (buffered.Timestamp > snapshot.Timestamp)
    // is not a duplicate and must be DELIVERED once the snapshot has gone out.
    var ctx = CreateBridgeActor();
    var timeout = TimeSpan.FromSeconds(3);

    // Reads the number of events handed to _onEvent under the list's lock.
    int DeliveredCount()
    {
        lock (ctx.ReceivedEvents)
        {
            return ctx.ReceivedEvents.Count;
        }
    }

    ctx.CommProbe.ExpectMsg<SiteEnvelope>();
    AwaitCondition(() => ctx.MockGrpcClient.SubscribeCalls.Count == 1, timeout);

    var snapshotStamp = DateTimeOffset.UtcNow;
    var laterStamp = snapshotStamp.AddMilliseconds(1);

    // Buffered "Temp" carries a timestamp strictly NEWER than the snapshot's "Temp".
    var bufferedTemp = new AttributeValueChanged(
        InstanceName, "IO", "Temp", 50.0, "Good", laterStamp);
    ctx.MockGrpcClient.SubscribeCalls[0].OnEvent(bufferedTemp);

    var snapshotTemp = new AttributeValueChanged(
        InstanceName, "IO", "Temp", 42.5, "Good", snapshotStamp);
    var snapshot = new DebugViewSnapshot(
        InstanceName,
        new List<AttributeValueChanged> { snapshotTemp },
        new List<AlarmStateChanged>(),
        snapshotStamp);
    ctx.BridgeActor.Tell(snapshot);

    // Expected delivery: the snapshot, followed by the strictly-newer buffered event.
    AwaitCondition(() => DeliveredCount() == 2, timeout);

    lock (ctx.ReceivedEvents)
    {
        Assert.IsType<DebugViewSnapshot>(ctx.ReceivedEvents[0]);
        var replayed = Assert.IsType<AttributeValueChanged>(ctx.ReceivedEvents[1]);
        Assert.Equal(50.0, replayed.Value);
        Assert.Equal(laterStamp, replayed.Timestamp);
    }
}
[Fact]
public void Buffered_Alarm_Dedup_Uses_AlarmIdentity_And_Timestamp()
{
    // M2.18 alarm dedup: identity is (instance, alarm name, source reference).
    // A buffered alarm that is older than or equal to the snapshot's alarm of the
    // same identity is dropped; one that is strictly newer is delivered.
    var ctx = CreateBridgeActor();
    var timeout = TimeSpan.FromSeconds(3);
    var active = ZB.MOM.WW.ScadaBridge.Commons.Types.Enums.AlarmState.Active;
    var normal = ZB.MOM.WW.ScadaBridge.Commons.Types.Enums.AlarmState.Normal;

    // Reads the number of events handed to _onEvent under the list's lock.
    int DeliveredCount()
    {
        lock (ctx.ReceivedEvents)
        {
            return ctx.ReceivedEvents.Count;
        }
    }

    ctx.CommProbe.ExpectMsg<SiteEnvelope>();
    AwaitCondition(() => ctx.MockGrpcClient.SubscribeCalls.Count == 1, timeout);

    var snapshotStamp = DateTimeOffset.UtcNow;

    // Buffered alarms: "PumpFault" at the snapshot timestamp (a duplicate) and
    // "Overheat" one millisecond later (newer, so it must be delivered).
    var bufferedPumpFault = new AlarmStateChanged(
        InstanceName, "PumpFault", active, 500, snapshotStamp);
    var bufferedOverheat = new AlarmStateChanged(
        InstanceName, "Overheat", active, 700, snapshotStamp.AddMilliseconds(1));

    var subscription = ctx.MockGrpcClient.SubscribeCalls[0];
    subscription.OnEvent(bufferedPumpFault);
    subscription.OnEvent(bufferedOverheat);

    // The snapshot lists BOTH alarms at the snapshot timestamp.
    var snapshotPumpFault = new AlarmStateChanged(
        InstanceName, "PumpFault", active, 500, snapshotStamp);
    var snapshotOverheat = new AlarmStateChanged(
        InstanceName, "Overheat", normal, 0, snapshotStamp);
    var snapshot = new DebugViewSnapshot(
        InstanceName,
        new List<AttributeValueChanged>(),
        new List<AlarmStateChanged> { snapshotPumpFault, snapshotOverheat },
        snapshotStamp);
    ctx.BridgeActor.Tell(snapshot);

    // Expected delivery: the snapshot plus only the strictly-newer "Overheat"
    // alarm; "PumpFault" is dropped.
    AwaitCondition(() => DeliveredCount() == 2, timeout);

    // Short pause so a wrongly-delivered duplicate would have time to show up.
    Thread.Sleep(200);

    lock (ctx.ReceivedEvents)
    {
        Assert.Equal(2, ctx.ReceivedEvents.Count);
        Assert.IsType<DebugViewSnapshot>(ctx.ReceivedEvents[0]);
        var replayed = Assert.IsType<AlarmStateChanged>(ctx.ReceivedEvents[1]);
        Assert.Equal("Overheat", replayed.AlarmName);
        Assert.Equal(700, replayed.Priority);
    }
}
[Fact]
public void Buffered_Events_Flushed_In_Arrival_Order()
{
    // M2.18: several buffered events keep their arrival order when flushed. None
    // of them is a duplicate because the snapshot contains none of their entities.
    var ctx = CreateBridgeActor();
    var timeout = TimeSpan.FromSeconds(3);

    // Reads the number of events handed to _onEvent under the list's lock.
    int DeliveredCount()
    {
        lock (ctx.ReceivedEvents)
        {
            return ctx.ReceivedEvents.Count;
        }
    }

    ctx.CommProbe.ExpectMsg<SiteEnvelope>();
    AwaitCondition(() => ctx.MockGrpcClient.SubscribeCalls.Count == 1, timeout);

    var stamp = DateTimeOffset.UtcNow;
    var subscription = ctx.MockGrpcClient.SubscribeCalls[0];

    // Arrival order: attribute "A", alarm "AlarmX", attribute "B".
    subscription.OnEvent(new AttributeValueChanged(InstanceName, "IO", "A", 1, "Good", stamp));
    subscription.OnEvent(new AlarmStateChanged(
        InstanceName,
        "AlarmX",
        ZB.MOM.WW.ScadaBridge.Commons.Types.Enums.AlarmState.Active,
        100,
        stamp));
    subscription.OnEvent(new AttributeValueChanged(InstanceName, "IO", "B", 2, "Good", stamp));

    var emptySnapshot = new DebugViewSnapshot(
        InstanceName,
        new List<AttributeValueChanged>(),
        new List<AlarmStateChanged>(),
        stamp);
    ctx.BridgeActor.Tell(emptySnapshot);

    // Snapshot plus the three buffered events.
    AwaitCondition(() => DeliveredCount() == 4, timeout);

    lock (ctx.ReceivedEvents)
    {
        Assert.IsType<DebugViewSnapshot>(ctx.ReceivedEvents[0]);

        var first = Assert.IsType<AttributeValueChanged>(ctx.ReceivedEvents[1]);
        Assert.Equal("A", first.AttributeName);

        var second = Assert.IsType<AlarmStateChanged>(ctx.ReceivedEvents[2]);
        Assert.Equal("AlarmX", second.AlarmName);

        var third = Assert.IsType<AttributeValueChanged>(ctx.ReceivedEvents[3]);
        Assert.Equal("B", third.AttributeName);
    }
}
[Fact]
public void PassThrough_After_Flush_Delivers_Subsequent_Events_Immediately()
{
    // M2.18: once the snapshot has been delivered and the buffer flushed, the
    // actor is in pass-through: later events go straight to _onEvent with no
    // buffering and no duplication.
    var ctx = CreateBridgeActor();
    var timeout = TimeSpan.FromSeconds(3);

    // Reads the number of events handed to _onEvent under the list's lock.
    int DeliveredCount()
    {
        lock (ctx.ReceivedEvents)
        {
            return ctx.ReceivedEvents.Count;
        }
    }

    ctx.CommProbe.ExpectMsg<SiteEnvelope>();
    AwaitCondition(() => ctx.MockGrpcClient.SubscribeCalls.Count == 1, timeout);

    var emptySnapshot = new DebugViewSnapshot(
        InstanceName,
        new List<AttributeValueChanged>(),
        new List<AlarmStateChanged>(),
        DateTimeOffset.UtcNow);
    ctx.BridgeActor.Tell(emptySnapshot);

    AwaitCondition(() => DeliveredCount() == 1, timeout);

    // An event raised after the snapshot must be delivered right away, exactly once.
    var liveChange = new AttributeValueChanged(
        InstanceName, "IO", "Temp", 42.5, "Good", DateTimeOffset.UtcNow);
    ctx.MockGrpcClient.SubscribeCalls[0].OnEvent(liveChange);

    AwaitCondition(() => DeliveredCount() == 2, timeout);

    lock (ctx.ReceivedEvents)
    {
        Assert.IsType<AttributeValueChanged>(ctx.ReceivedEvents[1]);
    }
}
[Fact]
public void InstanceNotFound_After_StreamFirst_Tears_Down_Stream_And_Does_Not_PassThrough()
{
    // M2.18 + M2.11: with stream-first the gRPC subscription is already open by
    // the time an InstanceNotFound snapshot arrives. The bridge has to tear that
    // stream down (Unsubscribe the just-opened correlation), deliver the
    // not-found snapshot, stay out of pass-through, and stop cleanly.
    var ctx = CreateBridgeActor();
    var timeout = TimeSpan.FromSeconds(3);

    ctx.CommProbe.ExpectMsg<SiteEnvelope>();

    // Stream-first: the subscription exists before any snapshot is sent.
    AwaitCondition(() => ctx.MockGrpcClient.SubscribeCalls.Count == 1, timeout);

    var missingInstanceSnapshot = new DebugViewSnapshot(
        InstanceName,
        new List<AttributeValueChanged>(),
        new List<AlarmStateChanged>(),
        DateTimeOffset.UtcNow,
        InstanceNotFound: true);

    Watch(ctx.BridgeActor);
    ctx.BridgeActor.Tell(missingInstanceSnapshot);

    // The not-found snapshot is forwarded to _onEvent.
    AwaitCondition(
        () =>
        {
            lock (ctx.ReceivedEvents)
            {
                return ctx.ReceivedEvents.Count == 1;
            }
        },
        timeout);

    lock (ctx.ReceivedEvents)
    {
        var forwarded = Assert.IsType<DebugViewSnapshot>(ctx.ReceivedEvents[0]);
        Assert.True(forwarded.InstanceNotFound);
    }

    // The subscription opened up-front is unsubscribed.
    AwaitCondition(
        () => ctx.MockGrpcClient.UnsubscribedCorrelationIds.Contains("corr-1"),
        timeout);

    // The actor stops cleanly.
    ExpectTerminated(ctx.BridgeActor, timeout);

    // No pass-through: an event raised after the stop never reaches _onEvent.
    var lateChange = new AttributeValueChanged(
        InstanceName, "IO", "Temp", 1, "Good", DateTimeOffset.UtcNow);
    ctx.MockGrpcClient.SubscribeCalls[0].OnEvent(lateChange);
    Thread.Sleep(200);

    lock (ctx.ReceivedEvents)
    {
        Assert.Single(ctx.ReceivedEvents);
    }
}
[Fact]
public void Reconnect_During_Buffering_Phase_Keeps_Buffering_Until_Snapshot()
{
    // M2.18: a gRPC error and reconnect that happen BEFORE the snapshot arrives
    // leave the actor in the buffering phase. Events on the new stream are still
    // buffered and are flushed when the snapshot finally arrives.
    var ctx = CreateBridgeActor();
    var timeout = TimeSpan.FromSeconds(3);
    var reconnectTimeout = TimeSpan.FromSeconds(5);

    ctx.CommProbe.ExpectMsg<SiteEnvelope>();
    AwaitCondition(() => ctx.MockGrpcClient.SubscribeCalls.Count == 1, timeout);

    // Fail the first stream before any snapshot; a second subscription follows.
    ctx.MockGrpcClient.SubscribeCalls[0].OnError(new Exception("pre-snapshot blip"));
    AwaitCondition(() => ctx.MockGrpcClient.SubscribeCalls.Count == 2, reconnectTimeout);

    // An event on the reconnected stream is buffered: no snapshot delivered yet.
    var lateChange = new AttributeValueChanged(
        InstanceName, "IO", "Late", 7, "Good", DateTimeOffset.UtcNow);
    ctx.MockGrpcClient.SubscribeCalls[1].OnEvent(lateChange);

    lock (ctx.ReceivedEvents)
    {
        Assert.Empty(ctx.ReceivedEvents);
    }

    var emptySnapshot = new DebugViewSnapshot(
        InstanceName,
        new List<AttributeValueChanged>(),
        new List<AlarmStateChanged>(),
        DateTimeOffset.UtcNow);
    ctx.BridgeActor.Tell(emptySnapshot);

    // Expected delivery: the snapshot, then the event buffered across the reconnect.
    AwaitCondition(
        () =>
        {
            lock (ctx.ReceivedEvents)
            {
                return ctx.ReceivedEvents.Count == 2;
            }
        },
        timeout);

    lock (ctx.ReceivedEvents)
    {
        Assert.IsType<DebugViewSnapshot>(ctx.ReceivedEvents[0]);
        var replayed = Assert.IsType<AttributeValueChanged>(ctx.ReceivedEvents[1]);
        Assert.Equal("Late", replayed.AttributeName);
    }
}
[Fact]
public void Reconnect_After_Snapshot_Resumes_PassThrough_Not_Buffering()
{
    // M2.18: a reconnect in the middle of a session (the snapshot has already
    // been delivered) resumes pass-through. The snapshot is a one-time step, so
    // events on the reconnected stream are delivered immediately, not re-buffered.
    var ctx = CreateBridgeActor();
    var timeout = TimeSpan.FromSeconds(3);
    var reconnectTimeout = TimeSpan.FromSeconds(5);

    // Reads the number of events handed to _onEvent under the list's lock.
    int DeliveredCount()
    {
        lock (ctx.ReceivedEvents)
        {
            return ctx.ReceivedEvents.Count;
        }
    }

    ctx.CommProbe.ExpectMsg<SiteEnvelope>();
    AwaitCondition(() => ctx.MockGrpcClient.SubscribeCalls.Count == 1, timeout);

    var emptySnapshot = new DebugViewSnapshot(
        InstanceName,
        new List<AttributeValueChanged>(),
        new List<AlarmStateChanged>(),
        DateTimeOffset.UtcNow);
    ctx.BridgeActor.Tell(emptySnapshot);

    AwaitCondition(() => DeliveredCount() == 1, timeout);

    // Fail the first stream mid-session; a second subscription follows.
    ctx.MockGrpcClient.SubscribeCalls[0].OnError(new Exception("mid-session blip"));
    AwaitCondition(() => ctx.MockGrpcClient.SubscribeCalls.Count == 2, reconnectTimeout);

    // An event on the reconnected stream is delivered immediately (pass-through).
    var liveChange = new AttributeValueChanged(
        InstanceName, "IO", "Temp", 9, "Good", DateTimeOffset.UtcNow);
    ctx.MockGrpcClient.SubscribeCalls[1].OnEvent(liveChange);

    AwaitCondition(() => DeliveredCount() == 2, timeout);

    lock (ctx.ReceivedEvents)
    {
        var delivered = Assert.IsType<AttributeValueChanged>(ctx.ReceivedEvents[1]);
        Assert.Equal("Temp", delivered.AttributeName);
    }
}
[Fact]
public void RetryCount_RecoveredOnlyAfterStreamStaysStableForStabilityWindow()
{