feat(otopcua): driver-instance post-connect bounded re-discovery

This commit is contained in:
Joseph Doherty
2026-06-26 07:40:24 -04:00
parent bb21db0a8e
commit 51634cca38
2 changed files with 198 additions and 3 deletions
@@ -98,6 +98,13 @@ public sealed class DriverInstanceActor : ReceiveActor, IWithTimers
/// subscription that un-gates an <see cref="IAlarmSource"/> driver's feed. Handled async so the
/// <see cref="IAlarmSource.SubscribeAlarmsAsync"/> call is bounded + off the synchronous handlers.</summary>
private sealed record SubscribeAlarms;
/// <summary>Published to the parent (DriverHostActor) after each post-connect discovery pass so it can
/// graft the driver's discovered FixedTree nodes under the equipment. Empty/duplicate sets are fine —
/// the parent dedups and injection is idempotent.</summary>
/// <param name="DriverInstanceId">Identifier of the driver instance whose discovery pass produced this set.</param>
/// <param name="Nodes">Nodes captured during the pass; empty when the pass found nothing or the
/// driver's discovery call threw.</param>
public sealed record DiscoveredNodesReady(string DriverInstanceId, IReadOnlyList<DiscoveredNode> Nodes);
/// <summary>Internal self-tick driving bounded post-connect re-discovery (the driver's FixedTree may
/// still be populating for a short while after connect, so a single pass is not enough).</summary>
/// <param name="Generation">Init generation captured when the loop was started; a tick whose generation
/// no longer matches the actor's current one is dropped as stale.</param>
/// <param name="Attempt">Zero-based index of the discovery pass this tick triggers.</param>
/// <param name="PreviousCount">Node count observed by the previous pass, or <c>-1</c> on the first pass;
/// compared against the new count to detect a stabilised set.</param>
private sealed record RediscoverTick(int Generation, int Attempt, int PreviousCount);
public sealed class RetryConnect
{
public static readonly RetryConnect Instance = new();
@@ -112,6 +119,14 @@ public sealed class DriverInstanceActor : ReceiveActor, IWithTimers
private readonly string _clusterId;
private readonly IDriverHealthPublisher _healthPublisher;
private readonly TimeSpan _reconnectInterval;
/// <summary>Interval between bounded post-connect re-discovery passes. Production default 2s; tests
/// inject a tiny value so the loop runs without real-time waits.</summary>
private readonly TimeSpan _rediscoverInterval;
/// <summary>Cap on the number of post-connect re-discovery passes — a backstop so a never-stabilising
/// (or perpetually-empty) discovered set cannot spin the loop forever. Production default 15.</summary>
private readonly int _rediscoverMaxAttempts;
private readonly ILoggingAdapter _log = Context.GetLogger();
private string? _currentConfigJson;
@@ -167,18 +182,24 @@ public sealed class DriverInstanceActor : ReceiveActor, IWithTimers
/// stub paths don't need to provide one.</param>
/// <param name="clusterId">Optional cluster identifier forwarded in <see cref="DriverHealthChanged"/> messages;
/// defaults to an empty string when not provided (e.g. in unit tests).</param>
/// <param name="rediscoverInterval">Optional interval between post-connect re-discovery passes; defaults to 2 seconds.</param>
/// <param name="rediscoverMaxAttempts">Optional cap on re-discovery passes; defaults to 15.</param>
public static Props Props(
IDriver driver,
TimeSpan? reconnectInterval = null,
bool startStubbed = false,
IDriverHealthPublisher? healthPublisher = null,
string? clusterId = null) =>
string? clusterId = null,
TimeSpan? rediscoverInterval = null,
int rediscoverMaxAttempts = 15) =>
Akka.Actor.Props.Create(() => new DriverInstanceActor(
driver,
reconnectInterval ?? DefaultReconnectInterval,
startStubbed,
healthPublisher ?? NullDriverHealthPublisher.Instance,
clusterId ?? string.Empty));
clusterId ?? string.Empty,
rediscoverInterval,
rediscoverMaxAttempts));
/// <summary>
/// Returns true when the driver should boot in DEV-STUB mode based on host platform and
@@ -210,18 +231,24 @@ public sealed class DriverInstanceActor : ReceiveActor, IWithTimers
/// <param name="startStubbed">If true, start in stub mode for testing or unavailable platforms.</param>
/// <param name="healthPublisher">Sink for health-change notifications; must not be null.</param>
/// <param name="clusterId">Cluster identifier forwarded in health snapshots.</param>
/// <param name="rediscoverInterval">Interval between post-connect re-discovery passes; defaults to 2 seconds.</param>
/// <param name="rediscoverMaxAttempts">Cap on the number of re-discovery passes; defaults to 15.</param>
public DriverInstanceActor(
IDriver driver,
TimeSpan reconnectInterval,
bool startStubbed = false,
IDriverHealthPublisher? healthPublisher = null,
string? clusterId = null)
string? clusterId = null,
TimeSpan? rediscoverInterval = null,
int rediscoverMaxAttempts = 15)
{
_driver = driver;
_driverInstanceId = driver.DriverInstanceId;
_clusterId = clusterId ?? string.Empty;
_healthPublisher = healthPublisher ?? NullDriverHealthPublisher.Instance;
_reconnectInterval = reconnectInterval;
_rediscoverInterval = rediscoverInterval ?? TimeSpan.FromSeconds(2);
_rediscoverMaxAttempts = rediscoverMaxAttempts;
OtOpcUaTelemetry.DriverInstanceLifecycle.Add(1,
new KeyValuePair<string, object?>("event", startStubbed ? "spawn_stub" : "spawn"),
new KeyValuePair<string, object?>("driver_type", driver.DriverType));
@@ -284,6 +311,7 @@ public sealed class DriverInstanceActor : ReceiveActor, IWithTimers
ResubscribeDesired();
AttachAlarmSource();
SubscribeDesiredAlarms();
StartDiscovery();
});
Receive<InitializeFailed>(msg =>
{
@@ -321,6 +349,7 @@ public sealed class DriverInstanceActor : ReceiveActor, IWithTimers
{
_log.Warning("DriverInstance {Id}: disconnect observed ({Reason}); reconnecting",
_driverInstanceId, msg.Reason);
Timers.Cancel("rediscover");
DetachSubscription();
RecordFault();
Become(Reconnecting);
@@ -329,10 +358,12 @@ public sealed class DriverInstanceActor : ReceiveActor, IWithTimers
Receive<ForceReconnect>(_ =>
{
_log.Info("DriverInstance {Id}: ForceReconnect requested by admin; re-entering Reconnecting", _driverInstanceId);
Timers.Cancel("rediscover");
DetachSubscription();
Become(Reconnecting);
PublishHealthSnapshot();
});
ReceiveAsync<RediscoverTick>(HandleRediscoverAsync);
ReceiveAsync<WriteAttribute>(HandleWriteAsync);
ReceiveAsync<RouteAlarmAck>(HandleAcknowledgeAsync);
ReceiveAsync<Subscribe>(HandleSubscribeAsync);
@@ -677,6 +708,59 @@ public sealed class DriverInstanceActor : ReceiveActor, IWithTimers
}
}
/// <summary>Begins the bounded post-connect re-discovery loop when the actor enters <c>Connected</c>.
/// Only drivers implementing <see cref="ITagDiscovery"/> take part; for any other driver this does
/// nothing. The loop is started by self-sending the first <see cref="RediscoverTick"/>, stamped with the
/// current init generation so that <see cref="HandleRediscoverAsync"/> can discard it if a reconnect
/// has superseded it by the time it is processed.</summary>
private void StartDiscovery()
{
    if (_driver is ITagDiscovery)
    {
        // First pass: attempt index 0, and -1 as the "no previous pass" sentinel for the node count.
        var firstTick = new RediscoverTick(Generation: _initGeneration, Attempt: 0, PreviousCount: -1);
        Self.Tell(firstTick);
    }
}
/// <summary>Runs one post-connect discovery pass: captures the driver's streamed FixedTree via a
/// <see cref="CapturingAddressSpaceBuilder"/> and ships the result to the parent as
/// <see cref="DiscoveredNodesReady"/> (empty/duplicate sets are fine — the parent dedups and injection
/// is idempotent). Retries on the <see cref="_rediscoverInterval"/> until the non-empty discovered set
/// has STABILISED (same count two passes running) or the <see cref="_rediscoverMaxAttempts"/> cap is hit,
/// whichever comes first; keeps retrying while empty because a FOCAS-style FixedTree cache may still be
/// populating. The generation guard (checked before and again after the await) drops a tick from a
/// superseded (re)connect so a stale loop cannot resurrect or double-ship.
/// <para>A pass whose discovery call throws is logged at debug level and treated as an empty result, so
/// it is still published and still counts towards the attempt cap.</para>
/// <para>Limitation: this assumes a driver's discovered set only GROWS toward a stable size (true for
/// FOCAS — its FixedTree appears once, and on the wonder deploy the driver-config <c>_options.Tags</c> is
/// empty so the set is 0 until the cache populates). A driver that emits an initial non-empty set and
/// later grows could stop early on a transient repeat; acceptable for current scope.</para></summary>
/// <param name="tick">The self-sent tick carrying the loop's generation, the zero-based attempt index and
/// the node count seen by the previous pass (<c>-1</c> on the first pass).</param>
/// <returns>A task that completes once the pass has been published and the next tick (if any) scheduled.</returns>
private async Task HandleRediscoverAsync(RediscoverTick tick)
{
    if (tick.Generation != _initGeneration) return; // stale (a reconnect happened)
    if (_driver is not ITagDiscovery discovery) return;

    // Resolve the parent while the actor context is definitely current, so publishing the result
    // never depends on where the continuation after the await ends up running.
    var parent = Context.Parent;

    IReadOnlyList<DiscoveredNode> nodes;
    try
    {
        var builder = new CapturingAddressSpaceBuilder();
        // Deliberately NO ConfigureAwait(false): the continuation must resume inside the actor's
        // context because it touches actor state (_initGeneration) and the actor's timer scheduler.
        // Resuming on an arbitrary thread-pool thread would run that code outside the actor.
        // NOTE(review): the pass itself is not time-bounded (CancellationToken.None) — a discovery
        // call that never completes would keep this async handler pending; confirm drivers bound it.
        await discovery.DiscoverAsync(builder, CancellationToken.None);
        nodes = builder.Nodes;
    }
    catch (Exception ex)
    {
        // Best-effort: a failed pass is reported as an empty set and retried on the next tick.
        _log.Debug(ex, "DriverInstance {Id}: discovery pass {Attempt} failed; will retry", _driverInstanceId, tick.Attempt);
        nodes = Array.Empty<DiscoveredNode>();
    }

    if (tick.Generation != _initGeneration) return; // re-check after the await (state may have changed)

    parent.Tell(new DiscoveredNodesReady(_driverInstanceId, nodes));

    // Stop when the non-empty discovered set has stabilised, or the attempt cap is hit. Keep retrying
    // while empty (FixedTree cache may still be populating). PreviousCount=-1 on the first pass.
    var stableNonEmpty = nodes.Count > 0 && nodes.Count == tick.PreviousCount;
    if (tick.Attempt + 1 < _rediscoverMaxAttempts && !stableNonEmpty)
        Timers.StartSingleTimer("rediscover", new RediscoverTick(tick.Generation, tick.Attempt + 1, nodes.Count), _rediscoverInterval);
    else
        _log.Debug("DriverInstance {Id}: discovery settled after {Attempt} pass(es), {Count} node(s)", _driverInstanceId, tick.Attempt + 1, nodes.Count);
}
/// <summary>Records the host's desired subscription set without touching the live subscription.
/// The set is (re)applied by <see cref="ResubscribeDesired"/> on the next <c>Connected</c> entry.</summary>
private void StoreDesiredSubscriptions(SetDesiredSubscriptions msg)