The sibling mxaccessgw repo (clients/dotnet/) restored a proper client library + contracts under the new ZB.MOM.WW.MxGateway namespace, so the binary-vendoring stopgap from PR Driver.Galaxy-016 can be unwound via plan #1 of libs/README.md.

- csproj: replace <Reference HintPath="libs\MxGateway.*.dll"> with a ProjectReference into ..\..\..\..\mxaccessgw\clients\dotnet\ZB.MOM.WW.MxGateway.Client\. The five backfill PackageReference shims (Google.Protobuf, Grpc.Core.Api, Grpc.Net.Client, Polly.Core, Microsoft.Extensions.Logging.Abstractions) are now transitive again.
- Source: 'using MxGateway.X' -> 'using ZB.MOM.WW.MxGateway.X' across 19 driver files + 14 test files. No fully-qualified MxGateway.* usages in code, so no behavioural changes — purely a using-prefix flip.
- libs/: deleted MxGateway.Client.dll, MxGateway.Contracts.dll, README.md (orphan after the unwind).

Verified: dotnet build clean (Release), all 245 Driver.Galaxy unit tests pass, OtOpcUa service running with the new client DLL loaded (opc.tcp://localhost:4840/OtOpcUa, no FileNotFound/TypeLoad/MissingMethod in startup logs).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1277 lines
61 KiB
C#
1277 lines
61 KiB
C#
using Microsoft.Extensions.Logging;
|
||
using Microsoft.Extensions.Logging.Abstractions;
|
||
using ZB.MOM.WW.MxGateway.Client;
|
||
using ZB.MOM.WW.MxGateway.Contracts.Proto;
|
||
using ZB.MOM.WW.OtOpcUa.Core.Abstractions;
|
||
using ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Browse;
|
||
using ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Config;
|
||
using ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Health;
|
||
using ZB.MOM.WW.OtOpcUa.Driver.Galaxy.Runtime;
|
||
|
||
namespace ZB.MOM.WW.OtOpcUa.Driver.Galaxy;
|
||
|
||
/// <summary>
|
||
/// In-process .NET 10 Galaxy driver — the only Galaxy backend since PR 7.2 retired
|
||
/// the legacy <c>Galaxy.Host</c> / <c>Galaxy.Proxy</c> / <c>Galaxy.Shared</c>
|
||
/// projects and the <c>OtOpcUaGalaxyHost</c> Windows service. Implements the full
|
||
/// capability surface: <see cref="ITagDiscovery"/>, <see cref="IReadable"/>,
|
||
/// <see cref="IWritable"/>, <see cref="ISubscribable"/>, <see cref="IRediscoverable"/>,
|
||
/// <see cref="IHostConnectivityProbe"/>, and <see cref="IAlarmSource"/>. Galaxy
|
||
/// access flows through the in-process driver over gRPC to the separately
|
||
/// installed <c>mxaccessgw</c> gateway (sibling repo), which owns the MXAccess
|
||
/// COM apartment server-side.
|
||
/// </summary>
|
||
/// <remarks>
|
||
/// <para>
|
||
/// Registered as a Tier A in-process driver alongside Modbus / S7 / etc. via
|
||
/// <see cref="GalaxyDriverFactoryExtensions"/> under driver-type name
|
||
/// "GalaxyMxGateway".
|
||
/// </para>
|
||
/// <para>
|
||
/// Tests inject capability seams (<see cref="IGalaxyHierarchySource"/>,
|
||
/// <see cref="IGalaxyDataReader"/>, <see cref="IGalaxyDataWriter"/>,
|
||
/// <see cref="IGalaxySubscriber"/>, <see cref="IGalaxyAlarmAcknowledger"/>,
|
||
/// <see cref="IGalaxyAlarmFeed"/>) via the internal ctor so capability flow
|
||
/// can be exercised without a real gw round-trip.
|
||
/// </para>
|
||
/// </remarks>
|
||
public sealed class GalaxyDriver
|
||
: IDriver, ITagDiscovery, IReadable, IWritable, ISubscribable, IRediscoverable, IHostConnectivityProbe, IAlarmSource, IDisposable, IAsyncDisposable
|
||
{
|
||
private readonly string _driverInstanceId;
|
||
private readonly GalaxyDriverOptions _options;
|
||
private readonly ILogger<GalaxyDriver> _logger;
|
||
|
||
// PR 4.1 — IGalaxyHierarchySource is the test seam for browse. When null, the driver
|
||
// lazily builds a GatewayGalaxyHierarchySource around a GalaxyRepositoryClient on
|
||
// first DiscoverAsync. Tests inject a fake source via the internal ctor to exercise
|
||
// GalaxyDiscoverer's translation logic without a real gRPC channel.
|
||
private IGalaxyHierarchySource? _hierarchySource;
|
||
private GalaxyRepositoryClient? _ownedRepositoryClient;
|
||
|
||
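// PR 4.2 — IGalaxyDataReader is the test seam for IReadable. PR 4.4 supplies the
// production implementation that wraps GalaxyMxSession's SubscribeBulk + StreamEvents
// pump; until then ReadAsync throws NotSupportedException when the reader is null
// (legacy-host backend handles reads in production via DriverNodeManager's
// capability-routing).
// NOTE(review): the class summary says the legacy Galaxy.Host backend was retired in
// PR 7.2, so the "until then" / legacy-host wording above looks stale — confirm
// against ReadAsync and update this comment.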
|
||
private IGalaxyDataReader? _dataReader;
|
||
|
||
// PR 4.3 — IGalaxyDataWriter is the test seam for IWritable. Production wraps
|
||
// GalaxyMxSession via GatewayGalaxyDataWriter (Write / WriteSecured routing). The
|
||
// per-tag SecurityClassification map is populated during ITagDiscovery and consumed
|
||
// here at write time.
|
||
private IGalaxyDataWriter? _dataWriter;
|
||
private readonly System.Collections.Concurrent.ConcurrentDictionary<string, SecurityClassification>
|
||
_securityByFullRef = new(StringComparer.OrdinalIgnoreCase);
|
||
|
||
// PR 4.4 — subscription lifecycle. The pump consumes the gw event stream and fans
|
||
// out OnDataChange events to every registered driver subscription via the registry's
|
||
// reverse map. The subscriber is the test seam — production uses
|
||
// GatewayGalaxySubscriber over a connected GalaxyMxSession.
|
||
private IGalaxySubscriber? _subscriber;
|
||
private readonly SubscriptionRegistry _subscriptions = new();
|
||
private EventPump? _eventPump;
|
||
private readonly Lock _pumpLock = new();
|
||
|
||
// IAlarmSource implementation. Production-side acks route through
|
||
// GatewayGalaxyAlarmAcknowledger which calls the session-less
|
||
// MxGatewayClient.AcknowledgeAlarmAsync RPC; alarm transitions arrive on the
|
||
// gateway's session-less StreamAlarms feed via GatewayGalaxyAlarmFeed. Tests inject
|
||
// IGalaxyAlarmAcknowledger + IGalaxyAlarmFeed via the internal ctor to exercise the
|
||
// wiring without a running gateway. This driver bridges the feed's OnAlarmTransition
|
||
// onto IAlarmSource.OnAlarmEvent.
|
||
private IGalaxyAlarmAcknowledger? _alarmAcknowledger;
|
||
private IGalaxyAlarmFeed? _alarmFeed;
|
||
private readonly Lock _alarmHandlersLock = new();
|
||
private readonly Lock _alarmFeedLock = new();
|
||
private bool _alarmFeedWired;
|
||
// List preserves insertion order so OnAlarmFeedTransition always picks the
|
||
// earliest-registered handle — a deterministic choice that doesn't vary as
|
||
// handles are added/removed (Driver.Galaxy-006 fix: HashSet.First() is unstable).
|
||
private readonly List<GalaxyAlarmSubscriptionHandle> _alarmSubscriptions = new();
|
||
|
||
// PR 4.W — production runtime owned by InitializeAsync. The driver builds these
|
||
// when it opens a real gw session; tests bypass them by injecting seams via the
|
||
// internal ctor.
|
||
private GalaxyMxSession? _ownedMxSession;
|
||
private MxGatewayClient? _ownedMxClient;
|
||
|
||
// PR 4.5 — reconnect supervisor. Reflects in DriverState.Degraded while not Healthy.
|
||
private ReconnectSupervisor? _supervisor;
|
||
|
||
// PR 4.6 — IRediscoverable plumbing.
|
||
private DeployWatcher? _deployWatcher;
|
||
|
||
// PR 4.7 — IHostConnectivityProbe plumbing. The aggregator owns the merged
|
||
// transport+per-platform view; the forwarder is fed from the supervisor on
|
||
// transport state transitions; the probe watcher subscribes ScanState attributes
|
||
// for every discovered platform and pushes value changes to the aggregator.
|
||
private readonly HostStatusAggregator _hostStatuses = new();
|
||
private HostConnectivityForwarder? _transportForwarder;
|
||
private PerPlatformProbeWatcher? _probeWatcher;
|
||
|
||
private DriverHealth _health = new(DriverState.Unknown, null, null);
|
||
private bool _disposed;
|
||
|
||
/// <summary>
|
||
/// Server-pushed data-change notification. Fires from the
|
||
/// <see cref="EventPump"/>'s background loop; handlers should be cheap (or queue
|
||
/// onto another thread) to avoid blocking the gw event stream.
|
||
/// </summary>
|
||
public event EventHandler<DataChangeEventArgs>? OnDataChange;
|
||
|
||
/// <summary>Fires when the gateway signals a deploy-time change (PR 4.6 DeployWatcher).</summary>
|
||
public event EventHandler<RediscoveryEventArgs>? OnRediscoveryNeeded;
|
||
|
||
/// <summary>Fires when a host transitions Running ↔ Stopped (PR 4.7 HostStatusAggregator).</summary>
|
||
public event EventHandler<HostStatusChangedEventArgs>? OnHostStatusChanged;
|
||
|
||
/// <inheritdoc />
|
||
public event EventHandler<AlarmEventArgs>? OnAlarmEvent;
|
||
|
||
/// <summary>
/// Production constructor. Forwards to the seam-injecting constructor with every
/// capability seam left <see langword="null"/>.
/// </summary>
/// <param name="driverInstanceId">Driver instance id; rejected when null, empty, or whitespace.</param>
/// <param name="options">Driver options; rejected when null.</param>
/// <param name="logger">Optional logger; a no-op logger is substituted when omitted.</param>
public GalaxyDriver(
    string driverInstanceId,
    GalaxyDriverOptions options,
    ILogger<GalaxyDriver>? logger = null)
    : this(
        driverInstanceId,
        options,
        hierarchySource: null,
        dataReader: null,
        dataWriter: null,
        subscriber: null,
        alarmAcknowledger: null,
        alarmFeed: null,
        logger: logger)
{
}
|
||
|
||
/// <summary>
/// Test-visible constructor. Accepts optional capability seams so that
/// <see cref="DiscoverAsync"/>, <see cref="ReadAsync"/>, <see cref="WriteAsync"/>, and
/// <see cref="SubscribeAsync"/> can run against canned data instead of real gRPC
/// channels. Any seam left <see langword="null"/> is stored as-is.
/// </summary>
/// <exception cref="ArgumentException">
/// <paramref name="driverInstanceId"/> is null, empty, or whitespace.
/// </exception>
/// <exception cref="ArgumentNullException"><paramref name="options"/> is null.</exception>
internal GalaxyDriver(
    string driverInstanceId,
    GalaxyDriverOptions options,
    IGalaxyHierarchySource? hierarchySource,
    IGalaxyDataReader? dataReader = null,
    IGalaxyDataWriter? dataWriter = null,
    IGalaxySubscriber? subscriber = null,
    IGalaxyAlarmAcknowledger? alarmAcknowledger = null,
    IGalaxyAlarmFeed? alarmFeed = null,
    ILogger<GalaxyDriver>? logger = null)
{
    // Validate in the same order as before: instance id first, then options.
    if (string.IsNullOrWhiteSpace(driverInstanceId))
    {
        throw new ArgumentException("Driver instance id required.", nameof(driverInstanceId));
    }

    ArgumentNullException.ThrowIfNull(options);

    _driverInstanceId = driverInstanceId;
    _options = options;
    _logger = logger ?? NullLogger<GalaxyDriver>.Instance;

    _hierarchySource = hierarchySource;
    _dataReader = dataReader;
    _dataWriter = dataWriter;
    _subscriber = subscriber;
    _alarmAcknowledger = alarmAcknowledger;
    _alarmFeed = alarmFeed;

    // Re-raise the aggregator's host transitions as this driver's
    // IHostConnectivityProbe event, with the driver as the sender.
    _hostStatuses.OnHostStatusChanged += (_, statusChange) => OnHostStatusChanged?.Invoke(this, statusChange);
}
|
||
|
||
/// <inheritdoc />
public string DriverInstanceId
{
    get { return _driverInstanceId; }
}
|
||
|
||
/// <inheritdoc />
public string DriverType
{
    get { return GalaxyDriverFactoryExtensions.DriverTypeName; }
}
|
||
|
||
/// <summary>The options instance this driver was constructed with (test-visible).</summary>
internal GalaxyDriverOptions Options
{
    get { return _options; }
}
|
||
|
||
/// <summary>
/// Test-visible entry into <see cref="ReplayAsync"/>. In production the supervisor's
/// <see cref="ReconnectSupervisor.ReportTransportFailure"/> drives the replay on a
/// background task; tests call this directly so the
/// <see cref="GalaxyReconnectOptions.ReplayOnSessionLost"/> branch can be asserted
/// deterministically (Driver.Galaxy-013).
/// </summary>
/// <param name="cancellationToken">Forwarded unchanged to <see cref="ReplayAsync"/>.</param>
/// <returns>The task returned by <see cref="ReplayAsync"/>.</returns>
internal Task InvokeReplayForTestAsync(CancellationToken cancellationToken)
{
    return ReplayAsync(cancellationToken);
}
|
||
|
||
/// <inheritdoc />
/// <remarks>
/// Builds the production gateway runtime only when no subscriber, data-writer, or
/// hierarchy seam was injected through the internal constructor; otherwise the
/// injected seams are left untouched. <paramref name="driverConfigJson"/> is not read
/// by this method — the options captured at construction are used.
/// </remarks>
public async Task InitializeAsync(string driverConfigJson, CancellationToken cancellationToken)
{
    ObjectDisposedException.ThrowIf(_disposed, this);

    // Any one of these three seams being present means a test wired the driver, so
    // the real gateway client + session must not be built.
    var seamsInjected = _subscriber is not null
        || _dataWriter is not null
        || _hierarchySource is not null;

    if (seamsInjected)
    {
        _logger.LogDebug(
            "GalaxyDriver {InstanceId} initializing with pre-injected seams — production runtime build skipped",
            _driverInstanceId);
    }
    else
    {
        await BuildProductionRuntimeAsync(cancellationToken).ConfigureAwait(false);
    }

    StartDeployWatcher();

    _logger.LogInformation(
        "GalaxyDriver {InstanceId} initialized — endpoint={Endpoint} clientName={ClientName}",
        _driverInstanceId,
        _options.Gateway.Endpoint,
        _options.MxAccess.ClientName);

    _health = new DriverHealth(DriverState.Healthy, DateTime.UtcNow, null);
}
|
||
|
||
/// <summary>
/// Builds the production gateway client, connects the MX session, and wires the
/// per-capability runtime components from <c>_options</c>: traced subscriber and
/// writer, reconnect supervisor (with <see cref="ReopenAsync"/> /
/// <see cref="ReplayAsync"/> callbacks), transport forwarder, per-platform probe
/// watcher, and — unless already injected — the alarm acknowledger and alarm feed.
/// </summary>
/// <param name="cancellationToken">Cancels the initial session connect.</param>
private async Task BuildProductionRuntimeAsync(CancellationToken cancellationToken)
{
    var gatewayClientOptions = BuildClientOptions(_options.Gateway);
    var mxAccess = _options.MxAccess;
    var reconnect = _options.Reconnect;

    // Both owned fields are assigned before the connect is awaited, exactly as
    // before, so they are populated even if ConnectAsync throws.
    MxGatewayClient mxClient = MxGatewayClient.Create(gatewayClientOptions);
    _ownedMxClient = mxClient;

    GalaxyMxSession mxSession = new GalaxyMxSession(mxAccess, _logger);
    _ownedMxSession = mxSession;
    await mxSession.ConnectAsync(gatewayClientOptions, cancellationToken).ConfigureAwait(false);

    // PR 6.1 — the gw-facing seams are wrapped in tracing decorators so every
    // Subscribe / Unsubscribe / Write / StreamEvents call emits a span on the
    // "ZB.MOM.WW.OtOpcUa.Driver.Galaxy" ActivitySource. The host process's tracing
    // listener consumes these without the driver depending on OpenTelemetry packages.
    var gatewaySubscriber = new GatewayGalaxySubscriber(mxSession);
    _subscriber = new TracedGalaxySubscriber(gatewaySubscriber, mxAccess.ClientName);

    var gatewayWriter = new GatewayGalaxyDataWriter(mxSession, mxAccess.WriteUserId, _logger);
    _dataWriter = new TracedGalaxyDataWriter(gatewayWriter, mxAccess.ClientName);

    var backoff = new ReconnectOptions(
        InitialBackoffOverride: TimeSpan.FromMilliseconds(reconnect.InitialBackoffMs),
        MaxBackoffOverride: TimeSpan.FromMilliseconds(reconnect.MaxBackoffMs));
    _supervisor = new ReconnectSupervisor(
        reopen: ReopenAsync,
        replay: ReplayAsync,
        options: backoff,
        logger: _logger);

    _transportForwarder = new HostConnectivityForwarder(mxAccess.ClientName, _hostStatuses, _logger);
    // Initial state — the session connect above just succeeded.
    _transportForwarder.SetTransport(HostState.Running);

    _supervisor.StateChanged += OnSupervisorStateChanged;

    _probeWatcher = new PerPlatformProbeWatcher(
        _subscriber,
        _hostStatuses,
        _logger,
        bufferedUpdateIntervalMs: mxAccess.PublishingIntervalMs);

    // Alarm acknowledger + feed are session-less — the gateway serves alarms from an
    // always-on central monitor — so they hang off the owned MxGatewayClient rather
    // than the worker session. Pre-injected instances are kept.
    if (_alarmAcknowledger is null)
    {
        _alarmAcknowledger = new GatewayGalaxyAlarmAcknowledger(mxClient, _logger);
    }

    if (_alarmFeed is null)
    {
        _alarmFeed = new GatewayGalaxyAlarmFeed(mxClient.StreamAlarmsAsync, _logger, mxAccess.ClientName);
    }
}
|
||
|
||
/// <summary>
/// Reopen callback for <see cref="ReconnectSupervisor"/>: calls <c>ConnectAsync</c> on
/// the owned gw session with freshly built client options. Does nothing when the
/// driver owns no session (seams were injected).
/// </summary>
/// <param name="cancellationToken">Cancels the connect attempt.</param>
private async Task ReopenAsync(CancellationToken cancellationToken)
{
    if (_ownedMxSession is { } session)
    {
        await session
            .ConnectAsync(BuildClientOptions(_options.Gateway), cancellationToken)
            .ConfigureAwait(false);
    }
}
|
||
|
||
/// <summary>
/// Replay callback. Recreates the faulted <see cref="EventPump"/>, then walks every
/// active subscription, re-issues SubscribeBulk for its tag list, and
/// <see cref="SubscriptionRegistry.Rebind">rebinds</see> the registry with the fresh
/// item handles the gateway returned — the pre-reconnect handles are dead once the
/// session reopened. The pump is recreated first so the replayed subscriptions have a
/// live StreamEvents consumer; without that restart the replayed tags are subscribed
/// on the gw but never reach <c>OnDataChange</c> (Driver.Galaxy-008). PR 6.x can swap
/// this for the gw's batched <c>ReplaySubscriptionsCommand</c> once it ships.
/// </summary>
/// <remarks>
/// The pump restart runs even when no subscriptions are tracked. Previously the
/// empty-registry early return came first, which left a faulted pump in place so a
/// later subscribe had no live consumer.
/// </remarks>
/// <param name="cancellationToken">Cancels the per-subscription SubscribeBulk calls.</param>
private async Task ReplayAsync(CancellationToken cancellationToken)
{
    if (_subscriber is null) return;

    // The stream-fault that triggered this recovery left the old pump's RunAsync loop
    // exited and its channel completed; EventPump.Start() is a no-op on a non-null but
    // completed loop. Recreate the pump unconditionally — before looking at the
    // registry — so both replayed and future subscriptions have a consumer. This is a
    // no-op when no pump was ever started.
    RestartEventPumpForReplay();

    var entries = _subscriptions.SnapshotEntries();
    if (entries.Count == 0) return;

    // Driver.Galaxy-013: honor ReplayOnSessionLost. When operators opt out (false)
    // we skip the per-tag SubscribeBulk fan-out — they're delegating to the
    // gateway's session-level ReplaySubscriptions or accept post-reconnect tag loss.
    if (!_options.Reconnect.ReplayOnSessionLost)
    {
        _logger.LogInformation(
            "GalaxyDriver {InstanceId} reconnect replay skipped — ReplayOnSessionLost=false ({SubCount} subscriptions tracked)",
            _driverInstanceId, entries.Count);
        return;
    }

    var tagCount = 0;
    foreach (var (subscriptionId, oldBindings) in entries)
    {
        var refs = oldBindings
            .Select(b => b.FullReference)
            .Distinct(StringComparer.OrdinalIgnoreCase)
            .ToArray();
        if (refs.Length == 0) continue;

        var results = await _subscriber
            .SubscribeBulkAsync(refs, _options.MxAccess.PublishingIntervalMs, cancellationToken)
            .ConfigureAwait(false);

        var byAddress = BuildResultIndex(results);
        var newBindings = new List<TagBinding>(refs.Length);
        foreach (var fullRef in refs)
        {
            // A missing or failed result keeps the reference tracked with handle 0.
            var itemHandle = byAddress.TryGetValue(fullRef, out var match) && match.WasSuccessful
                ? match.ItemHandle
                : 0;
            newBindings.Add(new TagBinding(fullRef, itemHandle));
        }

        // Rebind so the EventPump fan-out reverse map points at the post-reconnect
        // handles; otherwise events on the new handles miss every subscription.
        _subscriptions.Rebind(subscriptionId, newBindings);
        tagCount += refs.Length;
    }

    _logger.LogInformation(
        "GalaxyDriver {InstanceId} replay completed — {SubCount} subscriptions, {TagCount} tags re-subscribed",
        _driverInstanceId, entries.Count, tagCount);
}
|
||
|
||
/// <summary>
/// Builds a case-insensitive (OrdinalIgnoreCase) lookup from tag address to
/// SubscribeBulk result, so the subscribe / replay correlation loops can do a keyed
/// lookup per reference instead of a linear scan. Results with a null or empty
/// address are skipped; on a duplicate address the later result wins (the gw
/// shouldn't emit one).
/// </summary>
/// <param name="results">SubscribeBulk results to index.</param>
/// <returns>A dictionary keyed by tag address.</returns>
private static Dictionary<string, SubscribeResult> BuildResultIndex(IReadOnlyList<SubscribeResult> results)
{
    var byAddress = new Dictionary<string, SubscribeResult>(results.Count, StringComparer.OrdinalIgnoreCase);

    for (var i = 0; i < results.Count; i++)
    {
        var entry = results[i];
        if (string.IsNullOrEmpty(entry.TagAddress))
        {
            continue;
        }

        byAddress[entry.TagAddress] = entry;
    }

    return byAddress;
}
|
||
|
||
/// <summary>
/// Detaches and disposes the current (faulted) <see cref="EventPump"/>, then asks
/// <c>EnsureEventPumpStarted</c> for a fresh one. Invoked from the reconnect replay
/// path. Does nothing when no pump exists, and skips the recreate when the driver
/// has been disposed.
/// </summary>
private void RestartEventPumpForReplay()
{
    EventPump? faultedPump;
    lock (_pumpLock)
    {
        faultedPump = _eventPump;
        if (faultedPump is not null)
        {
            _eventPump = null;
        }
    }

    if (faultedPump is null)
    {
        // Pump never started — nothing to restart.
        return;
    }

    // Detach + dispose outside the lock so a slow shutdown doesn't block a concurrent
    // EnsureEventPumpStarted; the old loop already exited on fault.
    faultedPump.OnDataChange -= OnPumpDataChange;
    try
    {
        faultedPump.DisposeAsync().AsTask().GetAwaiter().GetResult();
    }
    catch (Exception ex)
    {
        _logger.LogWarning(ex,
            "GalaxyDriver {InstanceId} faulted EventPump dispose failed during replay — continuing.",
            _driverInstanceId);
    }

    // EnsureEventPumpStarted creates + starts a fresh pump under the lock. When the
    // driver is being disposed the recreate is skipped — a fresh pump here would
    // outlive the driver.
    if (!_disposed)
    {
        EnsureEventPumpStarted();
    }
}
|
||
|
||
/// <summary>
/// Mirrors a reconnect-supervisor state transition into the cached driver health and,
/// when a transport forwarder exists, into the host transport state.
/// </summary>
/// <param name="sender">Event source; unused.</param>
/// <param name="transition">The transition; only <c>Next</c> and <c>Cause</c> are read.</param>
private void OnSupervisorStateChanged(object? sender, StateTransition transition)
{
    var isHealthy = transition.Next == ReconnectSupervisor.State.Healthy;

    if (isHealthy)
    {
        _health = new DriverHealth(DriverState.Healthy, DateTime.UtcNow, null);
    }
    else
    {
        // Any non-Healthy state is Degraded; keep the last successful read time.
        _health = new DriverHealth(DriverState.Degraded, _health.LastSuccessfulRead, transition.Cause);
    }

    _transportForwarder?.SetTransport(isHealthy ? HostState.Running : HostState.Stopped);
}
|
||
|
||
/// <summary>
/// Resolves <c>Gateway.ApiKeySecretRef</c> to the actual API-key string. Four
/// forms supported, evaluated in order (prefixes are matched case-insensitively):
/// <list type="number">
///   <item><c>env:NAME</c> — reads <c>Environment.GetEnvironmentVariable(NAME)</c>.
///     Throws when the variable is unset or empty, so a misconfigured deployment
///     fails fast rather than silently sending an empty key.</item>
///   <item><c>file:PATH</c> — reads the text of <c>PATH</c> and trims whitespace.
///     Throws when the file is missing or empty. Lets operators stash the key in an
///     ACL'd file outside the repo (the same pattern as the legacy
///     <c>.local/galaxy-host-secret.txt</c>).</item>
///   <item><c>dev:KEY</c> — explicit cleartext literal. The <c>dev:</c> prefix is a
///     deliberate opt-in signal (dev box, parity rig) so the resolver doesn't emit a
///     warning; production should never use this arm. Throws when nothing follows
///     the prefix.</item>
///   <item>Anything else — used as the literal API key for back-compat with configs
///     that pre-date this resolver. When a logger is supplied the resolver emits a
///     warning so an operator who accidentally committed a cleartext key sees it
///     (Driver.Galaxy-010).</item>
/// </list>
/// A future PR can swap any of these arms for a DPAPI-backed lookup without
/// changing the call site.
/// </summary>
/// <param name="secretRef">The configured secret reference; must not be null or empty.</param>
/// <returns>The resolved, non-empty API key.</returns>
/// <exception cref="ArgumentException"><paramref name="secretRef"/> is null or empty.</exception>
/// <exception cref="InvalidOperationException">A prefixed reference resolves to no key.</exception>
internal static string ResolveApiKey(string secretRef) => ResolveApiKey(secretRef, logger: null);

/// <summary>
/// Logger-aware overload. Emits a <see cref="LogLevel.Warning"/> if the secret ref
/// falls through to the back-compat literal arm (an unprefixed cleartext API key in
/// <c>DriverConfig</c> JSON). The <c>dev:</c> prefix is the explicit opt-in path that
/// doesn't warn.
/// </summary>
/// <param name="secretRef">The configured secret reference; must not be null or empty.</param>
/// <param name="logger">Optional logger for the literal-arm warning.</param>
/// <returns>The resolved, non-empty API key.</returns>
/// <exception cref="ArgumentException"><paramref name="secretRef"/> is null or empty.</exception>
/// <exception cref="InvalidOperationException">
/// The <c>env:</c> variable is unset or empty, the <c>file:</c> path is missing or its
/// trimmed content is empty, or a <c>dev:</c> prefix carries no key.
/// </exception>
internal static string ResolveApiKey(string secretRef, ILogger? logger)
{
    ArgumentException.ThrowIfNullOrEmpty(secretRef);

    if (secretRef.StartsWith("env:", StringComparison.OrdinalIgnoreCase))
    {
        var name = secretRef[4..];
        var value = Environment.GetEnvironmentVariable(name);
        return !string.IsNullOrEmpty(value)
            ? value
            : throw new InvalidOperationException(
                $"Galaxy.Gateway.ApiKeySecretRef='{secretRef}' resolves to env var '{name}', but it is unset.");
    }

    if (secretRef.StartsWith("file:", StringComparison.OrdinalIgnoreCase))
    {
        var path = secretRef[5..];
        if (!File.Exists(path))
        {
            throw new InvalidOperationException(
                $"Galaxy.Gateway.ApiKeySecretRef='{secretRef}' points at '{path}', which doesn't exist.");
        }

        var contents = File.ReadAllText(path).Trim();
        return !string.IsNullOrEmpty(contents)
            ? contents
            : throw new InvalidOperationException(
                $"Galaxy.Gateway.ApiKeySecretRef='{secretRef}' file '{path}' is empty.");
    }

    if (secretRef.StartsWith("dev:", StringComparison.OrdinalIgnoreCase))
    {
        // Explicit dev opt-in — no warning, the operator deliberately chose a
        // cleartext literal (dev box, parity rig). A bare "dev:" would otherwise
        // yield an empty key, so it is rejected the same way the env/file arms
        // reject an empty result.
        var literal = secretRef[4..];
        return !string.IsNullOrEmpty(literal)
            ? literal
            : throw new InvalidOperationException(
                $"Galaxy.Gateway.ApiKeySecretRef='{secretRef}' uses the dev: prefix but no key follows it.");
    }

    // Back-compat literal arm. An unprefixed string is treated as the literal API
    // key — but emit a warning so an operator who accidentally committed a cleartext
    // key into DriverConfig sees it at startup. Use the dev: prefix to suppress this
    // warning when the literal is intentional.
    logger?.LogWarning(
        "Galaxy.Gateway.ApiKeySecretRef is being treated as a literal cleartext API key. " +
        "Prefer env:NAME, file:PATH, or the explicit dev:KEY prefix for dev rigs — " +
        "a literal key in DriverConfig JSON is stored in cleartext in the central config DB.");
    return secretRef;
}
|
||
|
||
/// <summary>
/// Translates the driver's gateway options into <see cref="MxGatewayClientOptions"/>.
/// The endpoint is parsed as an absolute URI, the API key is resolved through
/// <see cref="ResolveApiKey(string, ILogger?)"/>, and <c>StreamTimeout</c> is left
/// <see langword="null"/> unless <c>StreamTimeoutSeconds</c> is positive.
/// </summary>
/// <param name="gw">Gateway section of the driver options.</param>
/// <returns>A new client-options instance.</returns>
private MxGatewayClientOptions BuildClientOptions(GalaxyGatewayOptions gw)
{
    return new MxGatewayClientOptions
    {
        Endpoint = new Uri(gw.Endpoint, UriKind.Absolute),

        // Driver.Galaxy-010: the logger is passed so the literal-arm cleartext
        // fallback surfaces a startup warning rather than silently shipping the key.
        ApiKey = ResolveApiKey(gw.ApiKeySecretRef, _logger),

        UseTls = gw.UseTls,
        CaCertificatePath = gw.CaCertificatePath,
        ConnectTimeout = TimeSpan.FromSeconds(gw.ConnectTimeoutSeconds),
        DefaultCallTimeout = TimeSpan.FromSeconds(gw.DefaultCallTimeoutSeconds),
        StreamTimeout = gw.StreamTimeoutSeconds > 0
            ? TimeSpan.FromSeconds(gw.StreamTimeoutSeconds)
            : null,
    };
}
|
||
|
||
/// <summary>
/// Starts the deploy watcher that forwards gateway deploy-time changes to
/// <see cref="OnRediscoveryNeeded"/>. Does nothing when
/// <c>Repository.WatchDeployEvents</c> is off, when neither a repository client nor a
/// hierarchy source exists yet, or when a watcher has already been started.
/// </summary>
private void StartDeployWatcher()
{
    if (!_options.Repository.WatchDeployEvents) return;

    // NOTE(review): on the production path both fields are still null when
    // InitializeAsync calls this (BuildProductionRuntimeAsync sets neither), so this
    // guard returns before the "build the client here" line below can run. Confirm
    // whether that is intended or the watcher is expected to start from elsewhere.
    if (_ownedRepositoryClient is null && _hierarchySource is null) return;

    // A repeat call used to overwrite _deployWatcher without stopping the previous
    // instance, leaking it and forwarding each rediscovery signal twice. Keep the
    // first watcher.
    if (_deployWatcher is not null) return;

    // Reuse the lazily-built repository client (DiscoverAsync constructs it on demand).
    // If discovery hasn't run yet, build the client here so the watcher has a target.
    // Driver.Galaxy-009 fix: guard with ??= so if BuildDefaultHierarchySource later runs
    // it reuses this client rather than overwriting the field and leaking the first instance.
    _ownedRepositoryClient ??= GalaxyRepositoryClient.Create(
        BuildClientOptions(_options.Gateway));

    var source = new GatewayGalaxyDeployWatchSource(_ownedRepositoryClient);
    _deployWatcher = new DeployWatcher(source, _logger);
    _deployWatcher.OnRediscoveryNeeded += (_, args) => OnRediscoveryNeeded?.Invoke(this, args);

    // StartAsync schedules the background loop and returns Task.CompletedTask immediately.
    // It throws InvalidOperationException synchronously if called twice (programming error).
    // Driver.Galaxy-009 fix: don't discard the return value — observe any synchronous throw.
    var startTask = _deployWatcher.StartAsync(CancellationToken.None);

    // The task is already completed (StartAsync is synchronous); surface any synchronous fault.
    if (startTask.IsFaulted) startTask.GetAwaiter().GetResult();
}
|
||
|
||
/// <inheritdoc />
/// <remarks>
///   <para>
///     In-place config reapply. The driver does not support hot-swapping
///     <see cref="GalaxyDriverOptions"/> at runtime — changing the gateway endpoint,
///     MxAccess client name, or reconnect policy requires tearing down the gw session,
///     supervisor, event pump, and address space, which the host stack does via a
///     DriverInstance restart. This method therefore accepts only a blank config or
///     one equivalent to the live options, and refreshes health. A config that cannot
///     be parsed, or that differs from the live options, throws
///     <see cref="NotSupportedException"/> so the caller knows the change wasn't
///     applied (Driver.Galaxy-013: previously <c>driverConfigJson</c> was silently
///     ignored).
///   </para>
/// </remarks>
public Task ReinitializeAsync(string driverConfigJson, CancellationToken cancellationToken)
{
    ObjectDisposedException.ThrowIf(_disposed, this);

    if (string.IsNullOrWhiteSpace(driverConfigJson))
    {
        // Nothing to compare — just refresh health.
        _health = new DriverHealth(DriverState.Healthy, DateTime.UtcNow, null);
        return Task.CompletedTask;
    }

    // Materialise the incoming config through the factory's parse pipeline so a
    // missing-required-field error carries the same diagnostic text as the factory
    // path, then compare it against the live options.
    GalaxyDriverOptions incoming;
    try
    {
        var probe = GalaxyDriverFactoryExtensions.CreateInstance(_driverInstanceId, driverConfigJson);
        incoming = probe.Options;

        // The probe instance never started a runtime — disposing is cheap.
        probe.Dispose();
    }
    catch (Exception ex) when (!(ex is NotSupportedException || ex is ObjectDisposedException))
    {
        throw new NotSupportedException(
            $"GalaxyDriver.ReinitializeAsync could not parse the incoming DriverConfig JSON for '{_driverInstanceId}': {ex.Message}",
            ex);
    }

    // Refuse any change rather than pretending to apply it.
    if (!OptionsAreEquivalent(_options, incoming))
    {
        throw new NotSupportedException(
            "GalaxyDriver.ReinitializeAsync does not support hot-swapping driver options at runtime " +
            "(gateway endpoint, MxAccess client name, reconnect policy, etc.). Restart the DriverInstance " +
            "through the host stack to apply a config change.");
    }

    _health = new DriverHealth(DriverState.Healthy, DateTime.UtcNow, null);
    return Task.CompletedTask;
}
|
||
|
||
/// <summary>
/// Decides whether two <see cref="GalaxyDriverOptions"/> are equivalent for runtime
/// purposes by comparing them with <c>==</c>. The original author's note is that
/// records get value-equality from the language, so a direct equality check suffices.
/// </summary>
/// <param name="a">The live options.</param>
/// <param name="b">The incoming options.</param>
/// <returns><see langword="true"/> when <c>a == b</c>.</returns>
private static bool OptionsAreEquivalent(GalaxyDriverOptions a, GalaxyDriverOptions b)
{
    return a == b;
}
|
||
|
||
/// <inheritdoc />
/// <remarks>
/// Logs the shutdown and resets health to <see cref="DriverState.Unknown"/>, keeping
/// the last successful read time. A no-op once the driver is disposed.
/// </remarks>
public Task ShutdownAsync(CancellationToken cancellationToken)
{
    if (!_disposed)
    {
        _logger.LogInformation("GalaxyDriver {InstanceId} shutting down", _driverInstanceId);
        _health = new DriverHealth(DriverState.Unknown, _health.LastSuccessfulRead, null);
    }

    return Task.CompletedTask;
}
|
||
|
||
/// <inheritdoc />
/// <remarks>
/// While the reconnect supervisor reports itself degraded, a Degraded health carrying
/// the supervisor's last error is returned instead of the cached value.
/// </remarks>
public DriverHealth GetHealth()
{
    // The cached _health reflects the last successful operation; ongoing recovery
    // takes precedence and surfaces as Degraded.
    if (_supervisor is { IsDegraded: true } supervisor)
    {
        return new DriverHealth(DriverState.Degraded, _health.LastSuccessfulRead, supervisor.LastError);
    }

    return _health;
}
|
||
|
||
// ===== IHostConnectivityProbe (PR 4.7 wire-up) =====
|
||
|
||
/// <inheritdoc />
public IReadOnlyList<HostConnectivityStatus> GetHostStatuses()
{
    return _hostStatuses.Snapshot();
}
|
||
|
||
/// <inheritdoc />
/// <remarks>
/// Estimated footprint: 64 bytes × tracked item handles (one gw subscription entry
/// per bound tag) + 256 bytes × tracked driver subscriptions (registry overhead per
/// OPC UA monitored item). Returns 0 when nothing is tracked. The constants are
/// conservative — a 50k-tag set comes out at ~3 MB, enough to register with the
/// server's cache-flush heuristic. Driver.Galaxy-011: the stale "PR 4.4 sets this"
/// comment is removed; PR 4.4 shipped the SubscriptionRegistry but never wired it
/// here.
/// </remarks>
public long GetMemoryFootprint()
{
    const long PerItemHandleBytes = 64L;    // TagBinding + reverse-map entry
    const long PerSubscriptionBytes = 256L; // SubscriptionEntry overhead

    long itemHandleBytes = _subscriptions.TrackedItemHandleCount * PerItemHandleBytes;
    long subscriptionBytes = _subscriptions.TrackedSubscriptionCount * PerSubscriptionBytes;
    return itemHandleBytes + subscriptionBytes;
}
|
||
|
||
/// <inheritdoc />
/// <remarks>This implementation performs no work and completes synchronously.</remarks>
public Task FlushOptionalCachesAsync(CancellationToken cancellationToken)
{
    return Task.CompletedTask;
}
|
||
|
||
// ===== ITagDiscovery (PR 4.1) =====
|
||
|
||
/// <inheritdoc />
/// <remarks>
/// Runs a <see cref="GalaxyDiscoverer"/> over the hierarchy source (built lazily on first
/// use) through a wrapper that records each variable's security classification. When a
/// probe watcher is wired, the hierarchy is fetched again afterwards and the tag names of
/// objects whose template chain contains <c>$WinPlatform</c> or <c>$AppEngine</c> are
/// handed to the watcher.
/// </remarks>
/// <exception cref="ObjectDisposedException">The driver has been disposed.</exception>
/// <exception cref="ArgumentNullException"><paramref name="builder"/> is null.</exception>
public async Task DiscoverAsync(IAddressSpaceBuilder builder, CancellationToken cancellationToken)
{
    ObjectDisposedException.ThrowIf(_disposed, this);
    ArgumentNullException.ThrowIfNull(builder);

    // Wrap the caller's builder so every Variable() call also lands in _securityByFullRef;
    // ResolveSecurity reads that map later.
    var capturingBuilder = new SecurityCapturingBuilder(builder, _securityByFullRef);
    var hierarchySource = _hierarchySource ??= BuildDefaultHierarchySource();

    await new GalaxyDiscoverer(hierarchySource)
        .DiscoverAsync(capturingBuilder, cancellationToken)
        .ConfigureAwait(false);

    var probeWatcher = _probeWatcher;
    if (probeWatcher is null)
    {
        return;
    }

    // Refresh the probe watcher's platform membership so newly discovered
    // $WinPlatform / $AppEngine objects are picked up.
    static bool IsPlatformTemplate(string? templateName) =>
        string.Equals(templateName, "$WinPlatform", StringComparison.OrdinalIgnoreCase)
        || string.Equals(templateName, "$AppEngine", StringComparison.OrdinalIgnoreCase);

    var hierarchy = await hierarchySource.GetHierarchyAsync(cancellationToken).ConfigureAwait(false);
    var platformTagNames = hierarchy
        .Where(obj => obj.TemplateChain.Any(template => IsPlatformTemplate(template)))
        .Select(obj => obj.TagName)
        .Where(tagName => !string.IsNullOrEmpty(tagName));

    await probeWatcher.SyncPlatformsAsync(platformTagNames, cancellationToken).ConfigureAwait(false);
}
|
||
|
||
/// <summary>
/// Looks up the security classification captured for <paramref name="fullReference"/>
/// during discovery; references with no captured entry resolve to
/// <see cref="SecurityClassification.FreeAccess"/>.
/// </summary>
private SecurityClassification ResolveSecurity(string fullReference)
{
    if (_securityByFullRef.TryGetValue(fullReference, out var captured))
    {
        return captured;
    }

    return SecurityClassification.FreeAccess;
}
|
||
|
||
// ===== IReadable =====
|
||
|
||
/// <inheritdoc />
/// <remarks>
/// An empty request returns an empty list without touching any backend. Otherwise the
/// call is served by the injected data reader when one is present, else by the
/// subscribe-once path when a subscriber is wired.
/// </remarks>
/// <exception cref="ObjectDisposedException">The driver has been disposed.</exception>
/// <exception cref="ArgumentNullException"><paramref name="fullReferences"/> is null.</exception>
/// <exception cref="NotSupportedException">Neither a data reader nor a subscriber is available.</exception>
public Task<IReadOnlyList<DataValueSnapshot>> ReadAsync(
    IReadOnlyList<string> fullReferences, CancellationToken cancellationToken)
{
    ObjectDisposedException.ThrowIf(_disposed, this);
    ArgumentNullException.ThrowIfNull(fullReferences);

    if (fullReferences.Count == 0)
    {
        return Task.FromResult<IReadOnlyList<DataValueSnapshot>>([]);
    }

    // Test-only path — tests inject a canned reader via the internal ctor.
    if (_dataReader is { } injectedReader)
    {
        return injectedReader.ReadAsync(fullReferences, cancellationToken);
    }

    if (_subscriber is not null)
    {
        return ReadViaSubscribeOnceAsync(fullReferences, cancellationToken);
    }

    throw new NotSupportedException(
        "GalaxyDriver.ReadAsync requires a connected GalaxyMxSession (production runtime not built). " +
        "Either inject a test seam via the internal ctor or call InitializeAsync against a real gateway.");
}
|
||
|
||
/// <summary>
/// Production read path. MxAccess has no one-shot Read RPC — every value comes
/// through the event stream. We synthesise a Read by:
/// <list type="number">
///   <item>Subscribing the requested tags through the existing
///     <see cref="SubscriptionRegistry"/> + <see cref="EventPump"/>.</item>
///   <item>Waiting for the first <c>OnDataChange</c> per reference (the gateway
///     pushes the current value as the initial event after a SubscribeBulk).</item>
///   <item>Unsubscribing.</item>
/// </list>
/// Tags the gw rejects at SubscribeBulk time complete immediately with a Bad
/// (<c>0x80000000</c>) snapshot; tags that never publish before the caller's
/// cancellation token fires complete with BadTimeout (<c>0x800A0000</c>). Either way
/// the caller sees one snapshot per requested reference, in input order.
/// </summary>
/// <param name="fullReferences">References to read; duplicates (case-insensitive) share one result.</param>
/// <param name="cancellationToken">
/// Bounds the wait. Cancellation during the wait does not throw — still-pending entries
/// are filled with BadTimeout. Cancellation surfaced by SubscribeBulk itself propagates.
/// </param>
/// <returns>One snapshot per entry of <paramref name="fullReferences"/>, in input order.</returns>
private async Task<IReadOnlyList<DataValueSnapshot>> ReadViaSubscribeOnceAsync(
    IReadOnlyList<string> fullReferences, CancellationToken cancellationToken)
{
    // OPC UA status codes. BadTimeout is 0x800A0000; the value previously used here
    // (0x800B0000) is BadServiceUnsupported and misreported timeouts to clients.
    const uint StatusBad = 0x80000000u;
    const uint StatusBadTimeout = 0x800A0000u;

    static DataValueSnapshot BadSnapshot(uint statusCode) => new DataValueSnapshot(
        Value: null,
        StatusCode: statusCode,
        SourceTimestampUtc: null,
        ServerTimestampUtc: DateTime.UtcNow);

    var pump = EnsureEventPumpStarted();
    var subscriptionId = _subscriptions.NextSubscriptionId();

    // Pre-allocate one TaskCompletionSource per distinct reference so the OnDataChange
    // handler can complete them out-of-order as events arrive. The dictionary is fully
    // built before the handler is attached and is only read afterwards.
    var pendingByRef = new Dictionary<string, TaskCompletionSource<DataValueSnapshot>>(
        StringComparer.OrdinalIgnoreCase);
    foreach (var fullRef in fullReferences.Distinct(StringComparer.OrdinalIgnoreCase))
    {
        pendingByRef[fullRef] = new TaskCompletionSource<DataValueSnapshot>(
            TaskCreationOptions.RunContinuationsAsynchronously);
    }

    EventHandler<DataChangeEventArgs> handler = (_, args) =>
    {
        // Filter to OUR subscription — the pump's OnDataChange fans out across all
        // subscriptions on the driver, and we don't want a parallel ISubscribable
        // caller's events to leak into our read.
        if (args.SubscriptionHandle is GalaxySubscriptionHandle gsh
            && gsh.SubscriptionId == subscriptionId
            && pendingByRef.TryGetValue(args.FullReference, out var tcs))
        {
            tcs.TrySetResult(args.Snapshot);
        }
    };

    var bindings = new List<TagBinding>(fullReferences.Count);

    // Attached BEFORE SubscribeBulk so the handler is in place for the first event.
    // NOTE(review): the registry binding is only registered after SubscribeBulk returns;
    // if EventPump drops events for item handles it cannot map yet, the initial value
    // could be missed and the tag would wait until cancellation — confirm against EventPump.
    pump.OnDataChange += handler;
    try
    {
        IReadOnlyList<SubscribeResult> results = await _subscriber!
            .SubscribeBulkAsync(fullReferences, _options.MxAccess.PublishingIntervalMs, cancellationToken)
            .ConfigureAwait(false);

        // Index the SubscribeBulk results once and correlate per reference via the
        // index instead of scanning the result list for every element.
        var resultIndex = BuildResultIndex(results);
        for (var i = 0; i < fullReferences.Count; i++)
        {
            var fullRef = fullReferences[i];
            var itemHandle = resultIndex.TryGetValue(fullRef, out var match) && match is { WasSuccessful: true }
                ? match.ItemHandle
                : 0;
            bindings.Add(new TagBinding(fullRef, itemHandle));

            // Tags the gw rejected up front — complete with Bad status now so the
            // wait below doesn't sit on them until cancellation.
            if (itemHandle <= 0 && pendingByRef.TryGetValue(fullRef, out var rejectedTcs))
            {
                rejectedTcs.TrySetResult(BadSnapshot(StatusBad));
            }
        }

        // Register bindings so the pump knows to dispatch events for these handles.
        _subscriptions.Register(subscriptionId, bindings);

        // Wait for every pending TCS to complete or the caller's CT to fire. When the
        // CT fires before all values arrive, fill the still-pending entries with a
        // BadTimeout snapshot rather than throwing — Read semantics let callers see
        // partial results. TrySetResult leaves already-completed entries untouched.
        using var registration = cancellationToken.Register(() =>
        {
            foreach (var tcs in pendingByRef.Values)
            {
                tcs.TrySetResult(BadSnapshot(StatusBadTimeout));
            }
        });

        var snapshots = new DataValueSnapshot[fullReferences.Count];
        for (var i = 0; i < fullReferences.Count; i++)
        {
            snapshots[i] = await pendingByRef[fullReferences[i]].Task.ConfigureAwait(false);
        }
        return snapshots;
    }
    finally
    {
        // Single cleanup path: it also runs when SubscribeBulk, the result correlation
        // or the registry registration throws, so neither the pump handler nor
        // already-advised gw handles are leaked.
        pump.OnDataChange -= handler;

        // Remove returns null for an id that was never registered (UnsubscribeAsync
        // relies on the same behaviour), so this is safe on the early-failure path.
        _subscriptions.Remove(subscriptionId);

        var liveHandles = bindings.Where(b => b.ItemHandle > 0).Select(b => b.ItemHandle).ToArray();
        if (liveHandles.Length > 0)
        {
            try
            {
                // CancellationToken.None: the caller's token may already be cancelled,
                // and the gw handles still need releasing.
                await _subscriber!.UnsubscribeBulkAsync(liveHandles, CancellationToken.None)
                    .ConfigureAwait(false);
            }
            catch (Exception ex)
            {
                // Not fatal — the registry is already cleared, so a straggling event
                // from the gw has no binding to fan out to.
                _logger.LogWarning(ex,
                    "GalaxyDriver.ReadViaSubscribeOnceAsync UnsubscribeBulk failed for {Count} handle(s) — registry already cleared.",
                    liveHandles.Length);
            }
        }
    }
}
|
||
|
||
// ===== IWritable (PR 4.3) =====
|
||
|
||
/// <inheritdoc />
/// <remarks>
/// An empty request returns an empty list without touching the writer. Otherwise the
/// batch is forwarded to the wired data writer together with <c>ResolveSecurity</c>,
/// which supplies the per-tag security classification captured during discovery.
/// </remarks>
/// <exception cref="ObjectDisposedException">The driver has been disposed.</exception>
/// <exception cref="ArgumentNullException"><paramref name="writes"/> is null.</exception>
/// <exception cref="NotSupportedException">No data writer is wired.</exception>
public Task<IReadOnlyList<WriteResult>> WriteAsync(
    IReadOnlyList<WriteRequest> writes, CancellationToken cancellationToken)
{
    ObjectDisposedException.ThrowIf(_disposed, this);
    ArgumentNullException.ThrowIfNull(writes);
    if (writes.Count == 0) return Task.FromResult<IReadOnlyList<WriteResult>>([]);

    if (_dataWriter is null)
    {
        // Mirror the IReadable fallback. The message no longer points at the
        // legacy-host backend: that backend has been retired, so the guidance was
        // unactionable.
        throw new NotSupportedException(
            "GalaxyDriver.WriteAsync requires GatewayGalaxyDataWriter wired against a connected GalaxyMxSession. " +
            "Either inject a test seam via the internal ctor or call InitializeAsync against a real gateway.");
    }

    return _dataWriter.WriteAsync(writes, ResolveSecurity, cancellationToken);
}
|
||
|
||
// ===== ISubscribable (PR 4.4) =====
|
||
|
||
/// <inheritdoc />
/// <remarks>
/// Starts the shared <see cref="EventPump"/> if needed, advises the references through
/// the subscriber in one bulk call and registers the resulting bindings under a fresh
/// subscription id. References the gateway rejects (or returns no result for) are kept
/// in the binding list with item handle 0 and logged as warnings; they do not fail the
/// call. An empty reference list registers an empty subscription.
/// </remarks>
/// <exception cref="ObjectDisposedException">The driver has been disposed.</exception>
/// <exception cref="ArgumentNullException"><paramref name="fullReferences"/> is null.</exception>
/// <exception cref="NotSupportedException">No subscriber is wired.</exception>
public async Task<ISubscriptionHandle> SubscribeAsync(
    IReadOnlyList<string> fullReferences, TimeSpan publishingInterval, CancellationToken cancellationToken)
{
    ObjectDisposedException.ThrowIf(_disposed, this);
    ArgumentNullException.ThrowIfNull(fullReferences);

    if (_subscriber is null)
    {
        // The message no longer points at the legacy-host backend: that backend has
        // been retired, so the guidance was unactionable. Wording matches ReadAsync.
        throw new NotSupportedException(
            "GalaxyDriver.SubscribeAsync requires a connected GalaxyMxSession + GatewayGalaxySubscriber. " +
            "Either inject a test seam via the internal ctor or call InitializeAsync against a real gateway.");
    }

    // The pump is held in a driver field once started, so the return value is not
    // needed here — the call is made purely for its start-on-first-use effect.
    EnsureEventPumpStarted();
    var subscriptionId = _subscriptions.NextSubscriptionId();

    if (fullReferences.Count == 0)
    {
        // Empty subscriptions register but never bind anything — keeps Unsubscribe
        // symmetric for callers that conditionally add tags later.
        _subscriptions.Register(subscriptionId, []);
        return new GalaxySubscriptionHandle(subscriptionId);
    }

    // When the caller doesn't set a publishing interval (TimeSpan.Zero or negative),
    // fall back to the configured MxAccess.PublishingIntervalMs. The clamp keeps the
    // double→int conversion well-defined for very large intervals instead of relying
    // on the runtime's overflow behaviour.
    var requested = (int)Math.Clamp(publishingInterval.TotalMilliseconds, 0d, int.MaxValue);
    var bufferedIntervalMs = requested > 0 ? requested : _options.MxAccess.PublishingIntervalMs;
    var results = await _subscriber
        .SubscribeBulkAsync(fullReferences, bufferedIntervalMs, cancellationToken)
        .ConfigureAwait(false);

    // Build the binding list in input order. Failed entries (gw rejected the tag) are
    // recorded with a non-positive ItemHandle. Results are indexed once and correlated
    // per reference via the index rather than scanning the result list each time.
    var resultIndex = BuildResultIndex(results);
    var bindings = new List<TagBinding>(fullReferences.Count);
    for (var i = 0; i < fullReferences.Count; i++)
    {
        var fullRef = fullReferences[i];
        var hasMatch = resultIndex.TryGetValue(fullRef, out var match);
        var itemHandle = hasMatch && match is { WasSuccessful: true } ? match.ItemHandle : 0;
        bindings.Add(new TagBinding(fullRef, itemHandle));
        if (match is null || !match.WasSuccessful)
        {
            _logger.LogWarning(
                "Galaxy subscribe for {FullRef} failed: {Error}",
                fullRef, match?.ErrorMessage ?? "<no result returned>");
        }
    }

    _subscriptions.Register(subscriptionId, bindings);
    return new GalaxySubscriptionHandle(subscriptionId);
}
|
||
|
||
/// <inheritdoc />
/// <remarks>
/// Removes the subscription from the registry first, then asks the subscriber to
/// release the item handles that were successfully bound (handle &gt; 0). A failure of
/// that gateway call is logged and swallowed. Handles that are unknown to the registry
/// are ignored.
/// </remarks>
/// <exception cref="ObjectDisposedException">The driver has been disposed.</exception>
/// <exception cref="ArgumentNullException"><paramref name="handle"/> is null.</exception>
/// <exception cref="ArgumentException"><paramref name="handle"/> is not a <c>GalaxySubscriptionHandle</c>.</exception>
public async Task UnsubscribeAsync(ISubscriptionHandle handle, CancellationToken cancellationToken)
{
    ObjectDisposedException.ThrowIf(_disposed, this);
    ArgumentNullException.ThrowIfNull(handle);

    if (handle is not GalaxySubscriptionHandle galaxyHandle)
    {
        throw new ArgumentException(
            $"Subscription handle was not issued by this driver (expected GalaxySubscriptionHandle, got {handle.GetType().Name}).",
            nameof(handle));
    }

    var removedBindings = _subscriptions.Remove(galaxyHandle.SubscriptionId);
    if (removedBindings is null)
    {
        // Already removed or never registered.
        return;
    }

    var liveItemHandles = removedBindings
        .Where(binding => binding.ItemHandle > 0)
        .Select(binding => binding.ItemHandle)
        .ToArray();

    if (liveItemHandles.Length == 0 || _subscriber is null)
    {
        return;
    }

    try
    {
        await _subscriber.UnsubscribeBulkAsync(liveItemHandles, cancellationToken).ConfigureAwait(false);
    }
    catch (Exception ex)
    {
        // Best effort: the driver-side registry entry is gone either way.
        _logger.LogWarning(ex,
            "Galaxy UnsubscribeBulk failed for subscription {SubscriptionId} — registry already cleared on driver side.",
            galaxyHandle.SubscriptionId);
    }
}
|
||
|
||
/// <summary>
/// Returns the driver's single <see cref="EventPump"/>, creating and starting it under
/// <c>_pumpLock</c> on first use. Every subscription shares this pump; per-subscription
/// fan-out goes through the <see cref="SubscriptionRegistry"/> that is passed to the pump.
/// </summary>
/// <returns>The already-running pump, or the one just created and started.</returns>
private EventPump EnsureEventPumpStarted()
{
    lock (_pumpLock)
    {
        if (_eventPump is null)
        {
            var created = new EventPump(
                _subscriber!, _subscriptions, _logger,
                channelCapacity: _options.MxAccess.EventPumpChannelCapacity,
                clientName: _options.MxAccess.ClientName,
                onStreamFault: OnEventPumpStreamFault);

            // Publish to the field before wiring/starting, matching the order the
            // pump has always been initialised in.
            _eventPump = created;
            created.OnDataChange += OnPumpDataChange;
            created.Start();
        }

        return _eventPump;
    }
}
|
||
|
||
/// <summary>
/// Callback handed to the <see cref="EventPump"/> as its <c>onStreamFault</c> hook.
/// Passes the fault on to the <see cref="ReconnectSupervisor"/> via
/// <c>ReportTransportFailure</c>; when no supervisor is wired the fault is only logged.
/// </summary>
/// <param name="cause">The exception that faulted the event stream.</param>
private void OnEventPumpStreamFault(Exception cause)
{
    if (_supervisor is not { } supervisor)
    {
        // No production runtime (skeleton / injected-seam path) — nothing to drive.
        _logger.LogWarning(cause,
            "GalaxyDriver {InstanceId} event stream faulted but no reconnect supervisor is wired.",
            _driverInstanceId);
        return;
    }

    try
    {
        supervisor.ReportTransportFailure(cause);
    }
    catch (ObjectDisposedException)
    {
        // The supervisor was disposed underneath us — treat the fault as shutdown
        // noise and ignore it.
    }
}
|
||
|
||
// ===== IAlarmSource =====
|
||
|
||
/// <summary>
/// Start the gateway alarm feed (idempotent) and wire its transitions onto this
/// driver's <see cref="OnAlarmEvent"/> bridge. Runs under <c>_alarmFeedLock</c>; once the
/// feed has been wired successfully, later calls return immediately.
/// </summary>
/// <exception cref="InvalidOperationException">No alarm feed is wired.</exception>
private void EnsureAlarmFeedStarted()
{
    lock (_alarmFeedLock)
    {
        var feed = _alarmFeed;
        if (feed is null)
        {
            throw new InvalidOperationException(
                "GalaxyDriver alarm feed is not wired. InitializeAsync must run (or a feed " +
                "seam must be injected via the internal ctor) before subscribing to alarms.");
        }
        if (_alarmFeedWired) return;

        feed.OnAlarmTransition += OnAlarmFeedTransition;
        try
        {
            feed.Start();
        }
        catch
        {
            // Start failed, so _alarmFeedWired stays false and the next call retries.
            // Detach first — otherwise the retry would attach the handler a second
            // time and every transition would be delivered twice.
            feed.OnAlarmTransition -= OnAlarmFeedTransition;
            throw;
        }
        _alarmFeedWired = true;
    }
}
|
||
|
||
/// <inheritdoc />
/// <remarks>
/// Ensures the alarm feed is running, then records a new handle with a random GUID id.
/// <paramref name="sourceNodeIds"/> is only null-checked here — it is not used to filter
/// what the handle receives. Completes synchronously.
/// </remarks>
/// <exception cref="ObjectDisposedException">The driver has been disposed.</exception>
/// <exception cref="ArgumentNullException"><paramref name="sourceNodeIds"/> is null.</exception>
public Task<IAlarmSubscriptionHandle> SubscribeAlarmsAsync(
    IReadOnlyList<string> sourceNodeIds, CancellationToken cancellationToken)
{
    ObjectDisposedException.ThrowIf(_disposed, this);
    ArgumentNullException.ThrowIfNull(sourceNodeIds);

    // Alarm subscriptions are not multiplexed per source node today: transitions from
    // the gateway feed are raised once through OnAlarmEvent and the server filters by
    // source node. The handle exists so the server can Unsubscribe symmetrically.
    EnsureAlarmFeedStarted();

    var handleId = Guid.NewGuid().ToString("N");
    var subscription = new GalaxyAlarmSubscriptionHandle(handleId);

    lock (_alarmHandlersLock)
    {
        _alarmSubscriptions.Add(subscription);
    }

    return Task.FromResult<IAlarmSubscriptionHandle>(subscription);
}
|
||
|
||
/// <inheritdoc />
/// <remarks>
/// Drops the handle from the driver's alarm-subscription list under
/// <c>_alarmHandlersLock</c>. The alarm feed itself is left running. Completes
/// synchronously.
/// </remarks>
/// <exception cref="ObjectDisposedException">The driver has been disposed.</exception>
/// <exception cref="ArgumentNullException"><paramref name="handle"/> is null.</exception>
/// <exception cref="ArgumentException"><paramref name="handle"/> is not a <c>GalaxyAlarmSubscriptionHandle</c>.</exception>
public Task UnsubscribeAlarmsAsync(IAlarmSubscriptionHandle handle, CancellationToken cancellationToken)
{
    ObjectDisposedException.ThrowIf(_disposed, this);
    ArgumentNullException.ThrowIfNull(handle);

    if (handle is GalaxyAlarmSubscriptionHandle alarmHandle)
    {
        lock (_alarmHandlersLock)
        {
            _alarmSubscriptions.Remove(alarmHandle);
        }

        return Task.CompletedTask;
    }

    throw new ArgumentException(
        $"Subscription handle was not issued by this driver (expected GalaxyAlarmSubscriptionHandle, got {handle.GetType().Name}).",
        nameof(handle));
}
|
||
|
||
/// <inheritdoc />
/// <remarks>
/// Acknowledgements are sent to the wired acknowledger one at a time, in list order,
/// each awaited before the next is issued. The first failure propagates and stops the
/// remainder of the batch. An empty list returns without touching the acknowledger.
/// </remarks>
/// <exception cref="ObjectDisposedException">The driver has been disposed.</exception>
/// <exception cref="ArgumentNullException"><paramref name="acknowledgements"/> is null.</exception>
/// <exception cref="NotSupportedException">No alarm acknowledger is wired.</exception>
public async Task AcknowledgeAsync(
    IReadOnlyList<AlarmAcknowledgeRequest> acknowledgements, CancellationToken cancellationToken)
{
    ObjectDisposedException.ThrowIf(_disposed, this);
    ArgumentNullException.ThrowIfNull(acknowledgements);

    if (acknowledgements.Count == 0)
    {
        return;
    }

    var acknowledger = _alarmAcknowledger
        ?? throw new NotSupportedException(
            "GalaxyDriver.AcknowledgeAsync requires GatewayGalaxyAlarmAcknowledger wired against a connected " +
            "GalaxyMxSession (PR B.2). InitializeAsync must run before alarm acknowledgements can flow.");

    // Issued strictly in series: each call carries a single alarm reference, and
    // sequential dispatch keeps operator-comment ordering deterministic.
    foreach (var request in acknowledgements)
    {
        // ConditionId carries the alarm full reference for the Galaxy driver;
        // SourceNodeId is only the fallback when no ConditionId was supplied.
        var alarmFullReference = string.IsNullOrEmpty(request.ConditionId)
            ? request.SourceNodeId
            : request.ConditionId;

        await acknowledger.AcknowledgeAsync(
            alarmFullReference,
            request.Comment ?? string.Empty,
            operatorUser: string.Empty, // server-side ACL fills this from the OPC UA session
            cancellationToken).ConfigureAwait(false);
    }
}
|
||
|
||
/// <summary>
/// Handler for the alarm feed's <c>OnAlarmTransition</c> event. Converts the
/// <see cref="GalaxyAlarmTransition"/> into <see cref="AlarmEventArgs"/> and raises
/// <see cref="OnAlarmEvent"/> — but only while at least one alarm subscription handle
/// is registered; otherwise the transition is dropped. Exceptions thrown by
/// <see cref="OnAlarmEvent"/> subscribers are logged and swallowed.
/// </summary>
private void OnAlarmFeedTransition(object? sender, GalaxyAlarmTransition transition)
{
    // The event is attributed to the handle at index 0 — the earliest-added one still
    // present. The server routes by SourceNodeId rather than by handle, so which handle
    // is attached does not change who sees the transition; index 0 simply keeps the
    // choice deterministic.
    GalaxyAlarmSubscriptionHandle? owner = null;
    lock (_alarmHandlersLock)
    {
        if (_alarmSubscriptions.Count > 0)
        {
            owner = _alarmSubscriptions[0];
        }
    }

    if (owner is null)
    {
        return;
    }

    // Empty optional strings are surfaced as null rather than "".
    var operatorComment = string.IsNullOrEmpty(transition.OperatorComment) ? null : transition.OperatorComment;
    var alarmCategory = string.IsNullOrEmpty(transition.Category) ? null : transition.Category;

    var eventArgs = new AlarmEventArgs(
        SubscriptionHandle: owner,
        SourceNodeId: transition.SourceObjectReference,
        ConditionId: transition.AlarmFullReference,
        AlarmType: transition.AlarmTypeName,
        Message: transition.Description,
        Severity: transition.SeverityBucket,
        SourceTimestampUtc: transition.TransitionTimestampUtc,
        OperatorComment: operatorComment,
        OriginalRaiseTimestampUtc: transition.OriginalRaiseTimestampUtc,
        AlarmCategory: alarmCategory);

    try
    {
        OnAlarmEvent?.Invoke(this, eventArgs);
    }
    catch (Exception ex)
    {
        _logger.LogWarning(ex,
            "GalaxyDriver OnAlarmEvent handler threw for {AlarmRef} — continuing.",
            transition.AlarmFullReference);
    }
}
|
||
|
||
/// <summary>
/// Handler attached to the <see cref="EventPump"/>'s <c>OnDataChange</c>. Re-raises
/// every event on the driver's public <see cref="OnDataChange"/> and, when a probe
/// watcher is wired and the reference ends with
/// <see cref="PerPlatformProbeWatcher.ProbeSuffix"/> (case-insensitive), also passes the
/// value to the watcher.
/// </summary>
private void OnPumpDataChange(object? sender, DataChangeEventArgs args)
{
    OnDataChange?.Invoke(this, args);

    if (_probeWatcher is not { } probeWatcher)
    {
        return;
    }

    var isProbeReference = args.FullReference.EndsWith(
        PerPlatformProbeWatcher.ProbeSuffix, StringComparison.OrdinalIgnoreCase);
    if (!isProbeReference)
    {
        return;
    }

    // The watcher wants a raw quality byte; derive it from the snapshot's status code
    // through the shared StatusCodeMap helper so the mapping stays in one place.
    var snapshot = args.Snapshot;
    var qualityByte = StatusCodeMap.ToQualityCategoryByte(snapshot.StatusCode);
    probeWatcher.OnProbeValueChanged(args.FullReference, snapshot.Value, qualityByte);
}
|
||
|
||
/// <summary>
/// Builds the default <see cref="IGalaxyHierarchySource"/>: a
/// <see cref="GatewayGalaxyHierarchySource"/> over the driver-owned
/// <see cref="GalaxyRepositoryClient"/>, wrapped in a
/// <see cref="TracedGalaxyHierarchySource"/> tagged with the configured client name.
/// The repository client is created from <c>_options.Gateway</c> only if the field is
/// still empty, so a client created earlier elsewhere is reused rather than replaced.
/// The owned client is released in <see cref="DisposeAsync"/>.
/// </summary>
private IGalaxyHierarchySource BuildDefaultHierarchySource()
{
    if (_ownedRepositoryClient is null)
    {
        _ownedRepositoryClient = GalaxyRepositoryClient.Create(BuildClientOptions(_options.Gateway));
    }

    var gatewaySource = new GatewayGalaxyHierarchySource(_ownedRepositoryClient);
    return new TracedGalaxyHierarchySource(gatewaySource, _options.MxAccess.ClientName);
}
|
||
|
||
/// <summary>
/// Asynchronous disposal; prefer <c>await using</c> over <c>using</c> so shutdown of
/// the async sub-components is awaited instead of blocked on. Second and later calls
/// return immediately. Each sub-component is disposed in its own try/catch — a failure
/// is logged as a warning and the remaining components are still disposed.
/// </summary>
public async ValueTask DisposeAsync()
{
    if (_disposed)
    {
        return;
    }

    _disposed = true;

    // Components with a synchronous Dispose go first.
    try
    {
        _deployWatcher?.Dispose();
    }
    catch (Exception ex)
    {
        _logger.LogWarning(ex, "DeployWatcher dispose failed");
    }

    try
    {
        _supervisor?.Dispose();
    }
    catch (Exception ex)
    {
        _logger.LogWarning(ex, "ReconnectSupervisor dispose failed");
    }

    try
    {
        _probeWatcher?.Dispose();
    }
    catch (Exception ex)
    {
        _logger.LogWarning(ex, "ProbeWatcher dispose failed");
    }

    try
    {
        _transportForwarder?.Dispose();
    }
    catch (Exception ex)
    {
        _logger.LogWarning(ex, "Transport forwarder dispose failed");
    }

    // Components with DisposeAsync are awaited one after another. The pump and the
    // alarm feed are detached from their fields under their own locks first.
    EventPump? detachedPump;
    lock (_pumpLock)
    {
        detachedPump = _eventPump;
        _eventPump = null;
    }

    if (detachedPump is not null)
    {
        try
        {
            await detachedPump.DisposeAsync().ConfigureAwait(false);
        }
        catch (Exception ex)
        {
            _logger.LogWarning(ex, "EventPump dispose failed");
        }
    }

    IGalaxyAlarmFeed? detachedFeed;
    lock (_alarmFeedLock)
    {
        detachedFeed = _alarmFeed;
        _alarmFeed = null;
    }

    if (detachedFeed is not null)
    {
        try
        {
            await detachedFeed.DisposeAsync().ConfigureAwait(false);
        }
        catch (Exception ex)
        {
            _logger.LogWarning(ex, "Alarm feed dispose failed");
        }
    }

    if (_ownedMxSession is { } mxSession)
    {
        try
        {
            await mxSession.DisposeAsync().ConfigureAwait(false);
        }
        catch (Exception ex)
        {
            _logger.LogWarning(ex, "MxSession dispose failed");
        }

        _ownedMxSession = null;
    }

    if (_ownedMxClient is { } mxClient)
    {
        try
        {
            await mxClient.DisposeAsync().ConfigureAwait(false);
        }
        catch (Exception ex)
        {
            _logger.LogWarning(ex, "MxClient dispose failed");
        }

        _ownedMxClient = null;
    }

    if (_ownedRepositoryClient is { } repositoryClient)
    {
        try
        {
            await repositoryClient.DisposeAsync().ConfigureAwait(false);
        }
        catch (Exception ex)
        {
            _logger.LogWarning(ex, "RepositoryClient dispose failed");
        }

        _ownedRepositoryClient = null;
    }

    _hierarchySource = null;
}
|
||
|
||
/// <summary>
/// Synchronous disposal for callers that cannot <c>await</c> (plain <c>using</c>
/// statements). Runs <see cref="DisposeAsync"/> and blocks the calling thread until it
/// has finished; use <see cref="DisposeAsync"/> directly wherever awaiting is possible.
/// </summary>
public void Dispose()
{
    DisposeAsync().AsTask().GetAwaiter().GetResult();
}
|
||
|
||
/// <summary>
/// Decorator over an <see cref="IAddressSpaceBuilder"/> that, for every variable added,
/// stores the variable's <see cref="DriverAttributeInfo.SecurityClass"/> in a shared map
/// keyed by <see cref="DriverAttributeInfo.FullName"/> and then forwards the call to the
/// wrapped builder. <see cref="DiscoverAsync"/> uses it to collect per-tag
/// classifications without the discoverer needing to know about driver state.
/// </summary>
private sealed class SecurityCapturingBuilder : IAddressSpaceBuilder
{
    private readonly IAddressSpaceBuilder _inner;
    private readonly System.Collections.Concurrent.ConcurrentDictionary<string, SecurityClassification> _map;

    public SecurityCapturingBuilder(
        IAddressSpaceBuilder inner,
        System.Collections.Concurrent.ConcurrentDictionary<string, SecurityClassification> map)
    {
        _inner = inner;
        _map = map;
    }

    /// <summary>Creates the folder on the wrapped builder and wraps the result so nested variables are captured into the same map.</summary>
    public IAddressSpaceBuilder Folder(string browseName, string displayName)
    {
        var innerFolder = _inner.Folder(browseName, displayName);
        return new SecurityCapturingBuilder(innerFolder, _map);
    }

    /// <summary>Records the variable's security class (overwriting any previous entry for the same full name), then delegates.</summary>
    public IVariableHandle Variable(string browseName, string displayName, DriverAttributeInfo attributeInfo)
    {
        _map[attributeInfo.FullName] = attributeInfo.SecurityClass;
        return _inner.Variable(browseName, displayName, attributeInfo);
    }

    /// <summary>Pure pass-through to the wrapped builder.</summary>
    public void AddProperty(string browseName, DriverDataType dataType, object? value)
    {
        _inner.AddProperty(browseName, dataType, value);
    }
}
|
||
}
|