fix(driver-galaxy): resolve High code-review findings (Driver.Galaxy-002, Driver.Galaxy-008)
Driver.Galaxy-002 — DataTypeMap.Map had no Int64 arm though MxValueDecoder/MxValueEncoder both fully support Int64. Galaxy attributes with the Int64 mx_data_type code fell through to the String default, creating a String address-space node while runtime reads decoded a boxed long. Added `6 => DriverDataType.Int64`, extending the contiguous 0..5 scheme so the type map agrees with the decoder/encoder on all seven Galaxy data types. Driver.Galaxy-008 — after a stream fault the EventPump's StreamEvents consumer loop exited and its channel completed; EventPump.Start() is a no-op on a completed-but-non-null loop, so a replayed subscription had no consumer and ReplayAsync never re-registered the post-reconnect item handles. ReplayAsync now recreates the EventPump (RestartEventPumpForReplay) and rebinds the SubscriptionRegistry per subscription with the fresh item handles returned by the post-reconnect SubscribeBulkAsync, via new SubscriptionRegistry.SnapshotEntries and Rebind APIs. Regression tests: DataTypeMapTests (every code incl. Int64), SubscriptionRegistryTests (Rebind/SnapshotEntries), EventPumpStreamFaultTests (faulted pump dead, fresh pump resumes dispatch). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -257,22 +257,106 @@ public sealed class GalaxyDriver
|
||||
}
|
||||
|
||||
/// <summary>
/// Replay callback. Walks every active subscription, re-issues SubscribeBulk for
/// its tag list, and <see cref="SubscriptionRegistry.Rebind">rebinds</see> the
/// registry with the fresh item handles the gateway returned — the pre-reconnect
/// handles are dead once the session reopened. The faulted <see cref="EventPump"/>
/// is recreated first so the replayed subscriptions have a live StreamEvents
/// consumer; without that restart the replayed tags are subscribed on the gw but
/// never reach <c>OnDataChange</c> (Driver.Galaxy-008). PR 6.x can swap this for
/// the gw's batched <c>ReplaySubscriptionsCommand</c> once it ships.
/// </summary>
/// <remarks>
/// A tag that is missing from the SubscribeBulk result, or whose result is not
/// successful, is still rebound — with item handle <c>0</c> — so the subscription
/// keeps its full tag list. Such tags are counted and reported in a warning.
/// </remarks>
/// <param name="cancellationToken">Passed to every SubscribeBulk call.</param>
private async Task ReplayAsync(CancellationToken cancellationToken)
{
    if (_subscriber is null) return;

    var entries = _subscriptions.SnapshotEntries();
    // NOTE(review): returning here skips the pump restart below. If a pump can exist
    // (and have faulted) while the registry is empty, it stays dead — confirm against
    // EnsureEventPumpStarted / the unsubscribe path.
    if (entries.Count == 0) return;

    // The stream-fault that triggered this recovery left the old pump's RunAsync loop
    // exited and its channel completed; EventPump.Start() is a no-op on a non-null but
    // completed loop. Recreate the pump so the replayed subscriptions have a consumer.
    RestartEventPumpForReplay();

    var tagCount = 0;
    var failedCount = 0;
    foreach (var (subscriptionId, oldBindings) in entries)
    {
        // One SubscribeBulk entry per distinct tag address (case-insensitive).
        var refs = oldBindings
            .Select(b => b.FullReference)
            .Distinct(StringComparer.OrdinalIgnoreCase)
            .ToArray();
        if (refs.Length == 0) continue;

        var results = await _subscriber
            .SubscribeBulkAsync(refs, _options.MxAccess.PublishingIntervalMs, cancellationToken)
            .ConfigureAwait(false);

        var byAddress = BuildResultIndex(results);
        var newBindings = new List<TagBinding>(refs.Length);
        foreach (var fullRef in refs)
        {
            if (byAddress.TryGetValue(fullRef, out var match) && match.WasSuccessful)
            {
                newBindings.Add(new TagBinding(fullRef, match.ItemHandle));
            }
            else
            {
                // No usable result for this tag: keep it in the subscription with handle 0.
                failedCount++;
                newBindings.Add(new TagBinding(fullRef, 0));
            }
        }

        // Rebind so the EventPump fan-out reverse map points at the post-reconnect
        // handles; otherwise events on the new handles miss every subscription.
        _subscriptions.Rebind(subscriptionId, newBindings);
        tagCount += refs.Length;
    }

    if (failedCount > 0)
    {
        _logger.LogWarning(
            "GalaxyDriver {InstanceId} replay — {FailedCount} of {TagCount} tags were not re-subscribed by the gateway and are bound to item handle 0",
            _driverInstanceId, failedCount, tagCount);
    }

    _logger.LogInformation(
        "GalaxyDriver {InstanceId} replay completed — {SubCount} subscriptions, {TagCount} tags re-subscribed",
        _driverInstanceId, entries.Count, tagCount);
}
|
||||
|
||||
/// <summary>
/// Builds a lookup from tag address to its SubscribeBulk result, keyed with
/// <see cref="StringComparer.OrdinalIgnoreCase"/>, so callers can correlate each
/// requested reference with a dictionary lookup instead of scanning the list.
/// Results with a null or empty address are left out; when two results share an
/// address the later one replaces the earlier one.
/// </summary>
/// <param name="results">The result list returned by a SubscribeBulk call.</param>
/// <returns>A case-insensitive address-to-result dictionary.</returns>
private static Dictionary<string, SubscribeResult> BuildResultIndex(IReadOnlyList<SubscribeResult> results)
{
    var byAddress = new Dictionary<string, SubscribeResult>(results.Count, StringComparer.OrdinalIgnoreCase);

    for (var i = 0; i < results.Count; i++)
    {
        var entry = results[i];
        if (string.IsNullOrEmpty(entry.TagAddress)) continue; // nothing to key on

        // Indexer assignment (not Add) so a repeated address overwrites rather than throws.
        byAddress[entry.TagAddress] = entry;
    }

    return byAddress;
}
|
||||
|
||||
/// <summary>
/// Replaces the faulted <see cref="EventPump"/> on the reconnect replay path: the
/// current pump is detached from the driver and disposed, then
/// <c>EnsureEventPumpStarted</c> is called to bring up a fresh one. Returns
/// immediately when no pump exists (none was ever started — no active subscriptions).
/// </summary>
private void RestartEventPumpForReplay()
{
    EventPump? faulted;
    lock (_pumpLock)
    {
        // Take the current pump out of the field while holding the lock.
        faulted = _eventPump;
        _eventPump = null;
    }

    if (faulted is null) return; // pump never started — nothing to restart

    // Unhook + dispose the faulted pump outside the lock so a slow shutdown doesn't
    // block a concurrent EnsureEventPumpStarted; the old loop already exited on fault.
    faulted.OnDataChange -= OnPumpDataChange;
    try
    {
        // Blocks the calling thread until the asynchronous dispose has finished.
        faulted.DisposeAsync().AsTask().GetAwaiter().GetResult();
    }
    catch (Exception ex)
    {
        // Best effort: a failed dispose must not stop the replay from continuing.
        _logger.LogWarning(ex,
            "GalaxyDriver {InstanceId} faulted EventPump dispose failed during replay — continuing.",
            _driverInstanceId);
    }

    // EnsureEventPumpStarted creates + starts a fresh pump under the lock. It is
    // skipped when the driver is being disposed — Dispose already tore the pump down
    // and a fresh one here would leak past the driver's lifetime.
    if (!_disposed)
    {
        EnsureEventPumpStarted();
    }
}
|
||||
|
||||
private void OnSupervisorStateChanged(object? sender, StateTransition transition)
|
||||
|
||||
Reference in New Issue
Block a user