fix(data-connection-layer): resolve DataConnectionLayer-002/003/004/005 — Resume supervision, concurrent dicts, subscribe-failure classification, write timeout

This commit is contained in:
Joseph Doherty
2026-05-16 19:40:40 -04:00
parent d7630d80fe
commit fccd3274d3
7 changed files with 350 additions and 25 deletions
@@ -125,8 +125,20 @@ public class DataConnectionManagerActor : ReceiveActor
}
/// <summary>
/// OneForOneStrategy with Restart for connection actors — a failed connection
/// should restart and attempt reconnection.
/// OneForOneStrategy with Resume for connection actors.
///
/// DataConnectionLayer-002: a DataConnectionActor is a long-lived, stateful
/// coordinator — its in-memory subscription registry (_subscriptionsByInstance,
/// _subscriptionIds, _subscribers) is the only record of which Instance Actors
/// subscribed to which tags, and there is no durable store to rebuild it from.
/// Restart would create a fresh instance and silently discard that registry,
/// breaking the design doc's "transparent re-subscribe" guarantee (WP-10):
/// subscribers would never be re-subscribed and would sit at stale quality with
/// no error. Resume keeps the actor instance and its state intact, so a transient
/// exception in a message handler does not lose subscription state. The actor's
/// own Become/Stash reconnect state machine already recovers connection-level
/// faults, so it does not need a restart to re-establish the connection.
/// This matches the ScadaLink convention of Resume for coordinator actors.
/// </summary>
protected override SupervisorStrategy SupervisorStrategy()
{
@@ -135,8 +147,8 @@ public class DataConnectionManagerActor : ReceiveActor
withinTimeRange: TimeSpan.FromMinutes(1),
decider: Decider.From(ex =>
{
_log.Warning(ex, "DataConnectionActor threw exception, restarting");
return Directive.Restart;
_log.Warning(ex, "DataConnectionActor threw exception, resuming (subscription state preserved)");
return Directive.Resume;
}));
}
}